1 files changed, 391 insertions, 282 deletions
diff --git a/lib/puppet/parser/lexer.rb b/lib/puppet/parser/lexer.rb
index 086d82c09..51026ea1b 100644
--- a/lib/puppet/parser/lexer.rb
+++ b/lib/puppet/parser/lexer.rb
@@ -1,4 +1,3 @@
-
 # the scanner/lexer
 
 require 'strscan'
@@ -7,326 +6,436 @@ require 'puppet'
 
 module Puppet
     class LexError < RuntimeError; end
-    module Parser
-        #---------------------------------------------------------------
-        class Lexer
-            attr_reader :line, :last, :file
-
-            attr_accessor :indefine
-
-                #%r{\w+} => :WORD,
-            @@tokens = {
-                %r{#.*} => :COMMENT,
-                %r{\[} => :LBRACK,
-                %r{\]} => :RBRACK,
-                %r{\{} => :LBRACE,
-                %r{\}} => :RBRACE,
-                %r{\(} => :LPAREN,
-                %r{\)} => :RPAREN,
-                %r{\"} => :DQUOTE,
-                %r{\n} => :RETURN,
-                %r{\'} => :SQUOTE,
-                %r{=} => :EQUALS,
-                %r{==} => :ISEQUAL,
-                %r{>=} => :GREATEREQUAL,
-                %r{>} => :GREATERTHAN,
-                %r{<} => :LESSTHAN,
-                %r{<=} => :LESSEQUAL,
-                %r{!=} => :NOTEQUAL,
-                %r{!} => :NOT,
-                %r{,} => :COMMA,
-                %r{\.} => :DOT,
-                %r{:} => :COLON,
-                %r{@} => :AT,
-                %r{<<\|} => :LLCOLLECT,
-                %r{\|>>} => :RRCOLLECT,
-                %r{<\|} => :LCOLLECT,
-                %r{\|>} => :RCOLLECT,
-                %r{;} => :SEMIC,
-                %r{\?} => :QMARK,
-                %r{\\} => :BACKSLASH,
-                %r{=>} => :FARROW,
-                %r{\+>} => :PARROW,
-                %r{[a-z][-\w]*} => :NAME,
-                %r{([a-z][-\w]*::)+[a-z][-\w]*} => :CLASSNAME,
-                %r{((::){0,1}[A-Z][-\w]*)+} => :CLASSREF,
-                %r{[0-9]+} => :NUMBER,
-                %r{\$(\w*::)*\w+} => :VARIABLE
-            }
-
-            @@pairs = {
-                "{" => "}",
-                "(" => ")",
-                "[" => "]",
-                "<|" => "|>",
-                "<<|" => "|>>"
-            }
-
-            @@reverse_pairs = @@pairs.inject({}) { |hash, pair| hash[pair[1]] = pair[0]; hash }
-
-            @@keywords = {
-                "case" => :CASE,
-                "class" => :CLASS,
-                "default" => :DEFAULT,
-                "define" => :DEFINE,
-                "false" => :BOOLEAN,
-                "import" => :IMPORT,
-                "if" => :IF,
-                "elsif" => :ELSIF,
-                "else" => :ELSE,
-                "inherits" => :INHERITS,
-                "node" => :NODE,
-                "true" => :BOOLEAN,
-                "and"  => :AND,
-                "or"   => :OR,
-                "undef"   => :UNDEF
-            }
-
-            def clear
-                initvars
-            end
+end
 
-            def expected
-                if @expected.empty?
-                    nil
-                else
-                    token = @expected[-1]
-                    @@tokens.each do |value, name|
-                        if token == name
-                            return value
-                        end
-                    end
-                    return token
-                end
-            end
+module Puppet::Parser; end
 
-            # scan the whole file
-            # basically just used for testing
-            def fullscan
-                array = []
-
-                self.scan { |token,str|
-                    # Ignore any definition nesting problems
-                    @indefine = false
-                    #Puppet.debug("got token '%s' => '%s'" % [token,str])
-                    if token.nil?
-                        return array
-                    else
-                        array.push([token,str])
-                    end
-                }
-                return array
+class Puppet::Parser::Lexer
+    attr_reader :last, :file
+
+    attr_accessor :line, :indefine
+
+    # Our base token class.
+    class Token
+        attr_accessor :regex, :name, :string, :skip, :incr_line, :skip_text
+
+        def initialize(regex, name)
+            if regex.is_a?(String)
+                @name, @string = name, regex
+                @regex = Regexp.new(Regexp.escape(@string))
+            else
+                @name, @regex = name, regex
             end
+        end
 
-            # this is probably pretty damned inefficient...
-            # it'd be nice not to have to load the whole file first...
-            def file=(file)
-                @file = file
-                @line = 1
-                File.open(file) { |of|
-                    str = ""
-                    of.each { |line| str += line }
-                    @scanner = StringScanner.new(str)
-                }
+        def skip?
+            self.skip
+        end
+
+        def to_s
+            if self.string
+                @string
+            else
+                @name.to_s
             end
+        end
+    end
 
-            def indefine?
-                if defined? @indefine
-                    @indefine
-                else
-                    false
-                end
+    # Maintain a list of tokens.
+    class TokenList
+        attr_reader :regex_tokens, :string_tokens
+
+        def [](name)
+            @tokens[name]
+        end
+
+        # Create a new token.
+        def add_token(name, regex, options = {}, &block)
+            token = Token.new(regex, name)
+            raise(ArgumentError, "Token %s already exists" % name) if @tokens.include?(name)
+            @tokens[token.name] = token
+            if token.string
+                @string_tokens << token
+                @tokens_by_string[token.string] = token
+            else
+                @regex_tokens << token
             end
 
-            def initialize
-                initvars()
+            options.each do |name, option|
+                token.send(name.to_s + "=", option)
             end
 
-            def initvars
-                @line = 1
-                @last = ""
-                @lasttoken = nil
-                @scanner = nil
-                @file = nil
-                # AAARRGGGG! okay, regexes in ruby are bloody annoying
-                # no one else has "\n" =~ /\s/
-                @skip = %r{[ \t]+}
+            token.meta_def(:convert, &block) if block_given?
+
+            token
+        end
 
-                @namestack = []
-                @indefine = false
+        def initialize
+            @tokens = {}
+            @regex_tokens = []
+            @string_tokens = []
+            @tokens_by_string = {}
+        end
 
-                @expected = []
-            end
+        # Look up a token by its value, rather than name.
+        def lookup(string)
+            @tokens_by_string[string]
+        end
 
-            # Go up one in the namespace.
-            def namepop
-                @namestack.pop
+        # Define more tokens.
+        def add_tokens(hash)
+            hash.each do |regex, name|
+                add_token(name, regex)
             end
+        end
 
-            # Collect the current namespace.
-            def namespace
-                @namestack.join("::")
+        # Sort our tokens by length, so we know once we match, we're done.
+        # This helps us avoid the O(n^2) nature of token matching.
+        def sort_tokens
+            @string_tokens.sort! { |a, b| b.string.length <=> a.string.length }
+        end
+    end
+
+    TOKENS = TokenList.new
+    TOKENS.add_tokens(
+        '[' => :LBRACK,
+        ']' => :RBRACK,
+        '{' => :LBRACE,
+        '}' => :RBRACE,
+        '(' => :LPAREN,
+        ')' => :RPAREN,
+        '=' => :EQUALS,
+        '==' => :ISEQUAL,
+        '>=' => :GREATEREQUAL,
+        '>' => :GREATERTHAN,
+        '<' => :LESSTHAN,
+        '<=' => :LESSEQUAL,
+        '!=' => :NOTEQUAL,
+        '!' => :NOT,
+        ',' => :COMMA,
+        '.' => :DOT,
+        ':' => :COLON,
+        '@' => :AT,
+        '<<|' => :LLCOLLECT,
+        '|>>' => :RRCOLLECT,
+        '<|' => :LCOLLECT,
+        '|>' => :RCOLLECT,
+        ';' => :SEMIC,
+        '?' => :QMARK,
+        '\\' => :BACKSLASH,
+        '=>' => :FARROW,
+        '+>' => :PARROW,
+        %r{([a-z][-\w]*::)+[a-z][-\w]*} => :CLASSNAME,
+        %r{((::){0,1}[A-Z][-\w]*)+} => :CLASSREF
+    )
+
+    TOKENS.add_tokens "Whatever" => :DQTEXT, "Nomatter" => :SQTEXT, "alsonomatter" => :BOOLEAN
+
+    # Bare words.  A keyword is swapped for its own token; "true"/"false"
+    # become a :BOOLEAN token whose value is a real Ruby boolean.
+    TOKENS.add_token :NAME, %r{[a-z][-\w]*} do |lexer, value|
+        string_token = self
+        if keyword = KEYWORDS.lookup(value)
+            string_token = keyword
+            if [:TRUE, :FALSE].include?(keyword.name)
+                string_token, value = TOKENS[:BOOLEAN], (value == "true")
             end
+        end
+        [string_token, value]
+    end
+
+    TOKENS.add_token :NUMBER, %r{[0-9]+} do |lexer, value|
+        [TOKENS[:NAME], value]
+    end
+
+    TOKENS.add_token :COMMENT, %r{#.*}, :skip => true
+
+    TOKENS.add_token :RETURN, "\n", :skip => true, :incr_line => true, :skip_text => true
+
+    TOKENS.add_token :SQUOTE, "'" do |lexer, value|
+        value = lexer.slurpstring(value)
+        [TOKENS[:SQTEXT], value]
+    end
+
+    TOKENS.add_token :DQUOTE, '"' do |lexer, value|
+        value = lexer.slurpstring(value)
+        [TOKENS[:DQTEXT], value]
+    end
+
+    TOKENS.add_token :VARIABLE, %r{\$(\w*::)*\w+} do |lexer, value|
+        value = value.sub(/^\$/, '')
+        [self, value]
+    end
+
+    TOKENS.sort_tokens
+
+    @@pairs = {
+        "{" => "}",
+        "(" => ")",
+        "[" => "]",
+        "<|" => "|>",
+        "<<|" => "|>>"
+    }
+
+    KEYWORDS = TokenList.new
+
+    KEYWORDS.add_tokens(
+        "case" => :CASE,
+        "class" => :CLASS,
+        "default" => :DEFAULT,
+        "define" => :DEFINE,
+        "import" => :IMPORT,
+        "if" => :IF,
+        "elsif" => :ELSIF,
+        "else" => :ELSE,
+        "inherits" => :INHERITS,
+        "node" => :NODE,
+        "and"  => :AND,
+        "or"   => :OR,
+        "undef"   => :UNDEF,
+        "false" => :FALSE,
+        "true" => :TRUE
+    )
+
+    def clear
+        initvars
+    end
+
+    # The token for the innermost delimiter still waiting to be closed, or nil
+    # when none is open.  Raises if TOKENS has no entry for that string.
+    def expected
+        return nil if @expected.empty?
+        closer = @expected[-1]
+        TOKENS.lookup(closer) or raise("Could not find expected token %s" % closer)
+    end
+
+    # Run the lexer over everything that is loaded and return the yielded
+    # [token, value] pairs, including the closing [false, false] marker.
+    # Basically just used for testing.
+    def fullscan
+        tokens = []
+        scan do |token, str|
+            # Ignore any definition nesting problems
+            @indefine = false
+            tokens << [token, str]
+        end
+        tokens
+    end
+
+    # Point the lexer at +file+: remember the path, restart line counting
+    # at 1, and load the whole file into a new StringScanner.
+    #
+    # File.read replaces the old per-line `str += line` loop, which rebuilt
+    # the accumulated string for every line.  The file is still read in full
+    # up front; a missing or unreadable path raises from File.read.
+    def file=(file)
+        @file = file
+        @line = 1
+        @scanner = StringScanner.new(File.read(file))
+    end
 
-            # This value might have :: in it, but we don't care -- it'll be
-            # handled normally when joining, and when popping we want to pop
-            # this full value, however long the namespace is.
-            def namestack(value)
-                @namestack << value
+    def find_string_token
+        matched_token = value = nil
+
+        # We know our longest string token is three chars, so try each size in turn
+        # until we either match or run out of chars.  This way our worst-case is three
+        # tries, where it is otherwise the number of string chars we have.  Also,
+        # the lookups are optimized hash lookups, instead of regex scans.
+        [3, 2, 1].each do |i|
+            str = @scanner.peek(i)
+            if matched_token = TOKENS.lookup(str)
+                value = @scanner.scan(matched_token.regex)
+                break
             end
+        end
+
+        return matched_token, value
+    end
 
-            def rest
-                @scanner.rest
+    # Find the next token that matches a regex.  We look for these first.
+    def find_regex_token
+        @regex += 1
+        matched_token = nil
+        value = ""
+        length = 0
+
+        # I tried optimizing based on the first char, but it had
+        # a slightly negative affect and was a good bit more complicated.
+        TOKENS.regex_tokens.each do |token|
+            next unless match_length = @scanner.match?(token.regex) 
+            
+            # We've found a longer match
+            if match_length > length
+                value = @scanner.scan(token.regex) 
+                length = value.length
+                matched_token = token
             end
+        end
 
-            # this is the heart of the lexer
-            def scan
-                #Puppet.debug("entering scan")
-                if @scanner.nil?
-                    raise TypeError.new("Invalid or empty string")
-                end
+        return matched_token, value
+    end
 
-                @scanner.skip(@skip)
-                until @scanner.eos? do
-                    yielded = false
-                    sendbreak = false # gah, this is a nasty hack
-                    stoken = nil
-                    sregex = nil
-                    value = ""
-
-                    # first find out which type of token we've got
-                    @@tokens.each { |regex,token|
-                        # we're just checking, which doesn't advance the scan
-                        # pointer
-                        tmp = @scanner.check(regex)
-                        if tmp.nil?
-                            #puppet.debug("did not match %s to '%s'" %
-                            #    [regex,@scanner.rest])
-                            next
-                        end
-
-                        # find the longest match
-                        if tmp.length > value.length
-                            value = tmp 
-                            stoken = token
-                            sregex = regex
-                        else
-                            # we've already got a longer match
-                            next
-                        end
-                    }
-
-                    # error out if we didn't match anything at all
-                    if stoken.nil?
-                        nword = nil
-                        if @scanner.rest =~ /^(\S+)/
-                            nword = $1
-                        elsif@scanner.rest =~ /^(\s+)/
-                            nword = $1
-                        else
-                            nword = @scanner.rest
-                        end
-                        raise "Could not match '%s'" % nword
-                    end
+    # Find the next token, returning the string and the token.
+    def find_token
+        @find += 1
+        matched_token, value = find_regex_token
 
-                    value = @scanner.scan(sregex)
+        unless matched_token
+            matched_token, value = find_string_token
+        end
 
-                    if value == ""
-                        raise "Didn't match regex on token %s" % stoken
-                    end
+        return matched_token, value
+    end
 
-                    # token-specific operations
-                    # if this gets much more complicated, it should
-                    # be moved up to where the tokens themselves are defined
-                    # which will get me about 75% of the way to a lexer generator
-                    ptoken = stoken
-                    case stoken
-                    when :NAME then
-                        wtoken = stoken
-                        # we're looking for keywords here
-                        if @@keywords.include?(value)
-                            wtoken = @@keywords[value]
-                            #Puppet.debug("token '%s'" % wtoken)
-                            if wtoken == :BOOLEAN
-                                value = eval(value)
-                            end
-                        end
-                        ptoken = wtoken
-                    when :NUMBER then
-                        ptoken = :NAME
-                    when :COMMENT then
-                        # just throw comments away
-                        next
-                    when :RETURN then
-                        @line += 1
-                        @scanner.skip(@skip)
-                        next
-                    when :SQUOTE then
-                        #Puppet.debug("searching '%s' after '%s'" % [self.rest,value])
-                        value = self.slurpstring(value)
-                        ptoken = :SQTEXT
-                        #Puppet.debug("got string '%s' => '%s'" % [:DQTEXT,value])
-                    when :DQUOTE then
-                        value = self.slurpstring(value)
-                        ptoken = :DQTEXT
-                    when :VARIABLE then
-                        value = value.sub(/^\$/, '')
-                    end
+    # Whether a definition is currently open.  Returns the stored value of
+    # @indefine (false, or the name recorded after a `define` keyword), and
+    # plain false when the variable has never been assigned.
+    def indefine?
+        return false unless defined?(@indefine)
+        @indefine
+    end
 
-                    if match = @@pairs[value] and ptoken != :DQUOTE and ptoken != :SQUOTE
-                        @expected << match
-                    elsif exp = @expected[-1] and exp == value and ptoken != :DQUOTE and ptoken != :SQUOTE
-                        @expected.pop
-                    end
+    def initialize
+        @find = 0
+        @regex = 0
+        initvars()
+    end
 
-                    yield [ptoken, value]
+    # Reset all per-scan state; called by initialize and clear.
+    def initvars
+        @line = 1
+        @last = ""    # backs attr_reader :last; start empty rather than unset
+        @previous_token = nil
+        @scanner = nil
+        @file = nil
+        # Only spaces/tabs: \s would also eat "\n", which is a token of its own.
+        @skip = %r{[ \t]+}
+        @namestack = []
+        @indefine = false
+        @expected = []
+    end
 
-                    if @lasttoken == :CLASS
-                        namestack(value)
-                    end
+    # Make any necessary changes to the token and/or value.
+    def munge_token(token, value)
+        @line += 1 if token.incr_line
 
-                    if @lasttoken == :DEFINE
-                        if indefine?
-                            msg = "Cannot nest definition %s inside %s" % [value, @indefine]
-                            self.indefine = false
-                            raise Puppet::ParseError, msg
-                        end
+        skip() if token.skip_text
 
-                        @indefine = value
-                    end
+        return if token.skip
 
-                    @last = value
-                    @lasttoken = ptoken
+        token, value = token.convert(self, value) if token.respond_to?(:convert)
 
-                    @scanner.skip(@skip)
-                end
-                @scanner = nil
-                yield [false,false]
-            end
+        return unless token
+
+        return token, value
+    end
+
+    # Go up one in the namespace.
+    def namepop
+        @namestack.pop
+    end
 
-            # we've encountered an opening quote...
-            # slurp in the rest of the string and return it
-            def slurpstring(quote)
-                # we search for the next quote that isn't preceded by a
-                # backslash; the caret is there to match empty strings
-                str = @scanner.scan_until(/([^\\]|^)#{quote}/)
-                if str.nil?
-                    raise Puppet::LexError.new("Unclosed quote after '%s' in '%s'" %
-                        [self.last,self.rest])
+    # Collect the current namespace.
+    def namespace
+        @namestack.join("::")
+    end
+
+    # This value might have :: in it, but we don't care -- it'll be
+    # handled normally when joining, and when popping we want to pop
+    # this full value, however long the namespace is.
+    def namestack(value)
+        @namestack << value
+    end
+
+    def rest
+        @scanner.rest
+    end
+
+    # The heart of the lexer: yields a [token name, value] pair per token, then
+    # [false, false].  Raises Puppet::LexError unless a string/file was loaded.
+    def scan
+        raise Puppet::LexError.new("Invalid or empty string") unless @scanner
+
+        # Skip any initial whitespace.
+        skip()
+
+        until @scanner.eos? do
+            matched_token, value = find_token
+
+            # error out if we didn't match anything at all
+            if matched_token.nil?
+                # Try to pull a 'word' out of the remaining string.
+                if @scanner.rest =~ /^(\S+)/
+                    nword = $1
+                elsif @scanner.rest =~ /^(\s+)/
+                    nword = $1
                 else
-                    str.sub!(/#{quote}\Z/,"")
-                    str.gsub!(/\\#{quote}/,quote)
+                    nword = @scanner.rest
                 end
+                raise "Could not match '%s'" % nword
+            end
 
-                return str
+            final_token, value = munge_token(matched_token, value)
+
+            next unless final_token
+
+            if match = @@pairs[value] and final_token.name != :DQUOTE and final_token.name != :SQUOTE
+                @expected << match
+            elsif exp = @expected[-1] and exp == value and final_token.name != :DQUOTE and final_token.name != :SQUOTE
+                @expected.pop
             end
 
-            # just parse a string, not a whole file
-            def string=(string)
-                @scanner = StringScanner.new(string)
+            yield [final_token.name, value]
+
+            if @previous_token
+                namestack(value) if @previous_token.name == :CLASS
+
+                if @previous_token.name == :DEFINE
+                    if indefine?
+                        msg = "Cannot nest definition %s inside %s" % [value, @indefine]
+                        self.indefine = false
+                        raise Puppet::ParseError, msg
+                    end
+
+                    @indefine = value
+                end
             end
+
+            # Record the value for #last (slurpstring's "Unclosed quote" error reads it).
+            @last = value
+            @previous_token = final_token
+            skip()
         end
-        #---------------------------------------------------------------
+        @scanner = nil
+
+        # This indicates that we're done parsing.
+        yield [false,false]
+    end
+
+    # Skip any skipchars in our remaining string.
+    def skip
+        @scanner.skip(@skip)
     end
-end
 
+    # Called once an opening quote has been consumed: read up to the
+    # matching close quote and return the text between them.
+    def slurpstring(quote)
+        # The terminator is the next quote not preceded by a backslash;
+        # the caret alternative lets an empty string match.
+        text = @scanner.scan_until(/([^\\]|^)#{quote}/)
+        unless text
+            raise Puppet::LexError.new("Unclosed quote after '%s' in '%s'" %
+                [self.last,self.rest])
+        end
+
+        # Drop the closing quote, then unescape any embedded quotes.
+        text.sub!(/#{quote}\Z/,"")
+        text.gsub!(/\\#{quote}/,quote)
+        text
+    end
+
+    # Lex +string+ directly rather than a file: wrap it in a new StringScanner.
+    def string=(string)
+        @scanner = StringScanner.new(string)
+    end
+end