require 'rexml/child'
require 'rexml/source'
require 'rexml/xmltokens'

module REXML
  # A single DTD entity declaration: an internal general or parameter entity
  # (name + literal value) or an external one (SYSTEM / PUBLIC identifier,
  # optionally followed by an NDATA notation name).
  class Entity < Child
    include XMLTokens

    # Regular-expression fragments for the XML 1.0 entity-declaration grammar.
    # NAME and REFERENCE are not defined in this file; they are presumably
    # supplied by the included XMLTokens module.
    PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
    SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))}
    PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')}
    EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))"
    NDATADECL = "\\s+NDATA\\s+#{NAME}"
    PEREFERENCE = "%#{NAME};"
    ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
    PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})"
    ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
    # PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
    PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
    # GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
    GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
    # The alternation is wrapped in its own group so that the leading \s*
    # applies to both the general and the parameter entity form.
    ENTITYDECL = /\s*(?:(?:#{GEDECL})|(?:#{PEDECL}))/um
    PEREFERENCE_RE = /#{PEREFERENCE}/um

    attr_reader :name, :external, :ref, :ndata, :pubid

    # Create a new entity.
    #
    # Simple entities are built from a name and a value:
    #
    #  e = Entity.new( 'amp', '&' )
    #
    # Alternatively +stream+ may be an Array describing a declaration. The
    # positions read from it are:
    #
    #  stream[1]   entity name
    #  stream[2]   'SYSTEM', 'PUBLIC', or the literal value
    #  stream[3]   system ref (SYSTEM) or public id (PUBLIC)
    #  stream[4]   NDATA name (SYSTEM, only when the array has five
    #              elements) or system ref (PUBLIC)
    #  stream[-1]  a trailing '%' marks a parameter entity; it is popped
    #              from the array, i.e. the caller's array is mutated
    #
    # +WARNING+: the entity state is not validated here, so it is easy to
    # build a non-conformant Entity.
    #
    # stream::    entity name (String) or declaration Array (see above)
    # value::     literal value; used only when +stream+ is not an Array
    # parent::    parent node, handed to Child#initialize
    # reference:: true for a parameter entity; used only when +stream+ is
    #             not an Array
    def initialize(stream, value = nil, parent = nil, reference = false)
      super(parent)
      @ndata = @pubid = @value = @external = nil
      if stream.kind_of? Array
        @name = stream[1]
        if stream[-1] == '%'
          @reference = true
          stream.pop
        else
          @reference = false
        end
        # Compare against the exact keywords: an unanchored regexp match would
        # misread an internal value such as "OPERATING SYSTEM" as an
        # external identifier.
        if %w[SYSTEM PUBLIC].include?(stream[2])
          @external = stream[2]
          if @external == 'SYSTEM'
            @ref = stream[3]
            @ndata = stream[4] if stream.size == 5
          else
            @pubid = stream[3]
            @ref = stream[4]
          end
        else
          @value = stream[2]
        end
      else
        @reference = reference
        @external = nil
        @name = stream
        @value = value
      end
    end

    # Evaluates whether the given string matches an entity definition,
    # returning true if so, and false otherwise. The declaration must start
    # at the beginning of the string (leading whitespace is allowed).
    def self.matches?(string)
      (ENTITYDECL =~ string) == 0
    end

    # Evaluates to the unnormalized value of this entity; that is, replacing
    # all entities -- both %ent; and &ent; entities. This differs from
    # +value()+ in that +value+ only replaces %ent; entities.
    # Returns nil when the entity has no value.
    def unnormalized
      v = value()
      return nil if v.nil?
      @unnormalized = Text::unnormalize(v, parent)
      @unnormalized
    end

    # Returns the value of this entity unprocessed -- raw. This is the
    # normalized value; that is, with all %ent; and &ent; entities intact.
    def normalized
      @value
    end

    # Write out an entity definition (assuming the Entity object itself is
    # valid) by appending it to +out+ with <<.
    #
    # out::    any object responding to << (String, IO, ...)
    # indent:: accepted for interface compatibility; not used
    #
    # Returns +out+.
    def write(out, indent = -1)
      out << '<!ENTITY '
      out << '% ' if @reference
      out << @name << ' '
      if @external
        out << @external << ' '
        out << quote_literal(@pubid) << ' ' if @pubid
        out << quote_literal(@ref)
        out << ' NDATA ' << @ndata if @ndata
      else
        out << quote_literal(@value)
      end
      out << '>'
    end

    # Returns this entity as a string. See write().
    def to_s
      rv = ''.dup
      write rv
      rv
    end

    # Returns the value of this entity. At the moment, only internal entities
    # are processed. If the value contains internal references (IE,
    # %blah;), those are replaced with their values. IE, if the doctype
    # contains:
    #
    #  <!ENTITY % foo "bar">
    #  <!ENTITY yada "nanoo %foo; nanoo">
    #
    # then:
    #  doctype.entity('yada').value   #-> "nanoo bar nanoo"
    #
    # References are resolved through <tt>@parent.entity(name)</tt>. Without
    # a parent, or for references the parent resolves to nil, the reference
    # text is left untouched. Returns nil when the entity has no value.
    def value
      return nil unless @value

      rv = @value.clone
      return rv unless @parent

      @value.scan(PEREFERENCE_RE).each do |entity_reference|
        # scan yields one array of captures per match; the name is capture 0.
        # Interpolating the array itself would insert its inspect form.
        name = entity_reference[0]
        entity_value = @parent.entity(name)
        next if entity_value.nil?
        # A literal pattern with a block replacement avoids treating name
        # characters or backslashes in the value as regexp syntax or escapes.
        rv.gsub!("%#{name};") { entity_value }
      end
      rv
    end

    private

    # Wraps +text+ in double quotes, or in single quotes when the text itself
    # contains a double quote.
    def quote_literal(text)
      quote = text.include?('"') ? "'" : '"'
      "#{quote}#{text}#{quote}"
    end
  end

  # This is a set of entity constants -- the ones defined in the XML
  # specification. These are +gt+, +lt+, +amp+, +quot+ and +apos+.
  module EntityConst
    # +>+
    GT = Entity.new( 'gt', '>' )
    # +<+
    LT = Entity.new( 'lt', '<' )
    # +&+
    AMP = Entity.new( 'amp', '&' )
    # +"+
    QUOT = Entity.new( 'quot', '"' )
    # +'+
    APOS = Entity.new( 'apos', "'" )
  end
end