From 095675711a89d836f4d0f10978ed5759b93fe76f Mon Sep 17 00:00:00 2001 From: Jesse Wolfe Date: Mon, 22 Nov 2010 15:17:51 -0800 Subject: Fix #5261 Don't escape Unicode characters in PSON This patch removes the escaping of valid UTF-8 sequences as "\uXXXX". This code was unreliable, as it relied on Iconv's ability to convert those codepoints between UTF-8 and UTF-16, but some versions of Iconv barf on some valid codepoints. Invalid UTF-8 sequences are still passed through unchanged. We believe that this is fine; if you are concerned about compliance with the JSON standard, what we are doing is equivalent to: * interpreting binary files as Latin-1 encoded character sequences * JSON-encoding those characters according to RFC 4627 * outputting the JSON as Latin-1 This allows all raw binary files to be transmitted losslessly. Paired-With: Paul Berry --- lib/puppet/external/pson/pure/generator.rb | 23 +---------------------- spec/unit/util/pson_spec.rb | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/lib/puppet/external/pson/pure/generator.rb b/lib/puppet/external/pson/pure/generator.rb index 4180be57d..89a0c62e0 100644 --- a/lib/puppet/external/pson/pure/generator.rb +++ b/lib/puppet/external/pson/pure/generator.rb @@ -44,34 +44,13 @@ module PSON string << '' # XXX workaround: avoid buffer sharing string.force_encoding(Encoding::ASCII_8BIT) string.gsub!(/["\\\x0-\x1f]/) { MAP[$MATCH] } - string.gsub!(/( - (?: - [\xc2-\xdf][\x80-\xbf] | - [\xe0-\xef][\x80-\xbf]{2} | - [\xf0-\xf4][\x80-\xbf]{3} - )+ | - [\x80-\xc1\xf5-\xff] # invalid - )/nx) { |c| - c.size == 1 and raise GeneratorError, "invalid utf8 byte: '#{c}'" - s = PSON::UTF8toUTF16.iconv(c).unpack('H*')[0] - s.gsub!(/.{4}/n, '\\\\u\&') - } - string.force_encoding(Encoding::UTF_8) string rescue Iconv::Failure => e raise GeneratorError, "Caught #{e.class}: #{e}" end else def utf8_to_pson(string) # :nodoc: - string. - gsub(/["\\\x0-\x1f]/n) { MAP[$MATCH] }. 
- gsub(/((?: - [\xc2-\xdf][\x80-\xbf] | - [\xe0-\xef][\x80-\xbf]{2} | - [\xf0-\xf4][\x80-\xbf]{3} - )+)/nx) { |c| - PSON::UTF8toUTF16.iconv(c).unpack('H*')[0].gsub(/.{4}/n, '\\\\u\&') - } + string.gsub(/["\\\x0-\x1f]/n) { MAP[$MATCH] } end end module_function :utf8_to_pson diff --git a/spec/unit/util/pson_spec.rb b/spec/unit/util/pson_spec.rb index d02d28517..474ddafa4 100755 --- a/spec/unit/util/pson_spec.rb +++ b/spec/unit/util/pson_spec.rb @@ -35,4 +35,19 @@ describe Puppet::Util::Pson do bin_string = (1..20000).collect { |i| ((17*i+13*i*i) % 255).chr }.join PSON.parse(%Q{{ "type": "foo", "data": #{bin_string.to_pson} }})["data"].should == bin_string end + + it "should be able to handle UTF8 that isn't a real unicode character" do + s = ["\355\274\267"] + PSON.parse( [s].to_pson ).should == [s] + end + + it "should be able to handle UTF8 for \\xFF" do + s = ["\xc3\xbf"] + PSON.parse( [s].to_pson ).should == [s] + end + + it "should be able to handle invalid UTF8 bytes" do + s = ["\xc3\xc3"] + PSON.parse( [s].to_pson ).should == [s] + end end -- cgit