diff options
| author | ser <ser@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2004-07-19 01:43:31 +0000 |
|---|---|---|
| committer | ser <ser@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2004-07-19 01:43:31 +0000 |
| commit | 5f3ee100b1ae2def7eb374d8de700f8f3eaa68b2 (patch) | |
| tree | 354fa51d0d75bc57eeaeb536dd01e2bda7c1921d /lib/rexml/source.rb | |
| parent | 3d391146d116d48402a878cc321ef3d4806ff84f (diff) | |
| download | ruby-5f3ee100b1ae2def7eb374d8de700f8f3eaa68b2.tar.gz ruby-5f3ee100b1ae2def7eb374d8de700f8f3eaa68b2.tar.xz ruby-5f3ee100b1ae2def7eb374d8de700f8f3eaa68b2.zip | |
r1025 | ser | 2004-07-18 08:18:36 -0400 (Sun, 18 Jul 2004) | 2 lines
@@ Fixed a CDATA pretty-printing bug. (#39) @@
r1026 | ser | 2004-07-18 09:03:02 -0400 (Sun, 18 Jul 2004) | 4 lines
@@ Fixed a buffering bug in Source.rb that affected the SAX parser @@
This bug was related to how REXML determines the encoding of a file, and
evinced itself by hanging on input when using the SAX parser.
r1028 | ser | 2004-07-18 09:06:18 -0400 (Sun, 18 Jul 2004) | 3 lines
* Minor pretty printing fix.
git-svn-id: http://svn.ruby-lang.org/repos/ruby/branches/ruby_1_8@6677 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'lib/rexml/source.rb')
| -rw-r--r-- | lib/rexml/source.rb | 20 |
1 files changed, 15 insertions, 5 deletions
```diff
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index ce10d03a6..725166616 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -116,11 +116,21 @@ module REXML
     def initialize(arg, block_size=500)
       @er_source = @source = arg
       @to_utf = false
-      # FIXME
-      # This is broken. If the user puts in enough carriage returns, this can fail
-      # to calculate the correct encoding.
-      super @source.read( 100 )
-      @line_break = encode( '>' )
+      # Determining the encoding is a deceptively difficult issue to resolve.
+      # First, we check the first two bytes for UTF-16. Then we
+      # assume that the encoding is at least ASCII enough for the '>', and
+      # we read until we get one of those. This gives us the XML declaration,
+      # if there is one. If there isn't one, the file MUST be UTF-8, as per
+      # the XML spec. If there is one, we can determine the encoding from
+      # it.
+      str = @source.read( 2 )
+      if (str[0] == 254 && str[1] == 255) || (str[0] == 255 && str[1] == 254)
+        @encoding = check_encoding( str )
+        @line_break = encode( '>' )
+      else
+        @line_break = '>'
+      end
+      super str+@source.readline( @line_break )
     end
 
     def scan(pattern, cons=false)
```
