r1025 | ser | 2004-07-18 08:18:36 -0400 (Sun, 18 Jul 2004) | 2 lines

@@ Fixed a CDATA pretty-printing bug. (#39) @@

r1026 | ser | 2004-07-18 09:03:02 -0400 (Sun, 18 Jul 2004) | 4 lines

@@ Fixed a buffering bug in Source.rb that affected the SAX parser @@

This bug was related to how REXML determines the encoding of a file, and
manifested itself by hanging on input when using the SAX parser.

r1028 | ser | 2004-07-18 09:06:18 -0400 (Sun, 18 Jul 2004) | 3 lines

* Minor pretty printing fix.

git-svn-id: http://svn.ruby-lang.org/repos/ruby/branches/ruby_1_8@6677 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
author: ser <ser@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2004-07-19 01:43:31 +0000
committer: ser <ser@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2004-07-19 01:43:31 +0000
commit: 5f3ee100b1ae2def7eb374d8de700f8f3eaa68b2 (patch)
tree: 354fa51d0d75bc57eeaeb536dd01e2bda7c1921d /lib/rexml/source.rb
parent: 3d391146d116d48402a878cc321ef3d4806ff84f (diff)
download: ruby-5f3ee100b1ae2def7eb374d8de700f8f3eaa68b2.tar.gz
ruby-5f3ee100b1ae2def7eb374d8de700f8f3eaa68b2.tar.xz
ruby-5f3ee100b1ae2def7eb374d8de700f8f3eaa68b2.zip
1 files changed, 15 insertions, 5 deletions
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index ce10d03a6..725166616 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -116,11 +116,21 @@ module REXML
 		def initialize(arg, block_size=500)
 			@er_source = @source = arg
 			@to_utf = false
-      # FIXME
-      # This is broken.  If the user puts in enough carriage returns, this can fail
-      # to calculate the correct encoding.
-      super @source.read( 100 )
-			@line_break = encode( '>' )
+      # Determining the encoding is a deceptively difficult issue to resolve.
+      # First, we check the first two bytes for UTF-16.  Then we
+      # assume that the encoding is at least ASCII enough for the '>', and
+      # we read until we get one of those.  This gives us the XML declaration,
+      # if there is one.  If there isn't one, the file MUST be UTF-8, as per
+      # the XML spec.  If there is one, we can determine the encoding from
+      # it.
+      str = @source.read( 2 )
+      if (str[0] == 254 && str[1] == 255) || (str[0] == 255 && str[1] == 254)
+        @encoding = check_encoding( str )
+        @line_break = encode( '>' )
+      else
+        @line_break = '>'
+      end
+      super str+@source.readline( @line_break )
 		end
 
 		def scan(pattern, cons=false)
author	ser <ser@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>	2004-07-19 01:43:31 +0000
committer	ser <ser@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>	2004-07-19 01:43:31 +0000
commit	5f3ee100b1ae2def7eb374d8de700f8f3eaa68b2 (patch)
tree	354fa51d0d75bc57eeaeb536dd01e2bda7c1921d /lib/rexml/source.rb
parent	3d391146d116d48402a878cc321ef3d4806ff84f (diff)
download	ruby-5f3ee100b1ae2def7eb374d8de700f8f3eaa68b2.tar.gz ruby-5f3ee100b1ae2def7eb374d8de700f8f3eaa68b2.tar.xz ruby-5f3ee100b1ae2def7eb374d8de700f8f3eaa68b2.zip