diff options
Diffstat (limited to 'lib/puppet/parser/lexer.rb')
-rw-r--r-- | lib/puppet/parser/lexer.rb | 673 |
1 files changed, 391 insertions, 282 deletions
diff --git a/lib/puppet/parser/lexer.rb b/lib/puppet/parser/lexer.rb index 086d82c09..51026ea1b 100644 --- a/lib/puppet/parser/lexer.rb +++ b/lib/puppet/parser/lexer.rb @@ -1,4 +1,3 @@ - # the scanner/lexer require 'strscan' @@ -7,326 +6,436 @@ require 'puppet' module Puppet class LexError < RuntimeError; end - module Parser - #--------------------------------------------------------------- - class Lexer - attr_reader :line, :last, :file - - attr_accessor :indefine - - #%r{\w+} => :WORD, - @@tokens = { - %r{#.*} => :COMMENT, - %r{\[} => :LBRACK, - %r{\]} => :RBRACK, - %r{\{} => :LBRACE, - %r{\}} => :RBRACE, - %r{\(} => :LPAREN, - %r{\)} => :RPAREN, - %r{\"} => :DQUOTE, - %r{\n} => :RETURN, - %r{\'} => :SQUOTE, - %r{=} => :EQUALS, - %r{==} => :ISEQUAL, - %r{>=} => :GREATEREQUAL, - %r{>} => :GREATERTHAN, - %r{<} => :LESSTHAN, - %r{<=} => :LESSEQUAL, - %r{!=} => :NOTEQUAL, - %r{!} => :NOT, - %r{,} => :COMMA, - %r{\.} => :DOT, - %r{:} => :COLON, - %r{@} => :AT, - %r{<<\|} => :LLCOLLECT, - %r{\|>>} => :RRCOLLECT, - %r{<\|} => :LCOLLECT, - %r{\|>} => :RCOLLECT, - %r{;} => :SEMIC, - %r{\?} => :QMARK, - %r{\\} => :BACKSLASH, - %r{=>} => :FARROW, - %r{\+>} => :PARROW, - %r{[a-z][-\w]*} => :NAME, - %r{([a-z][-\w]*::)+[a-z][-\w]*} => :CLASSNAME, - %r{((::){0,1}[A-Z][-\w]*)+} => :CLASSREF, - %r{[0-9]+} => :NUMBER, - %r{\$(\w*::)*\w+} => :VARIABLE - } - - @@pairs = { - "{" => "}", - "(" => ")", - "[" => "]", - "<|" => "|>", - "<<|" => "|>>" - } - - @@reverse_pairs = @@pairs.inject({}) { |hash, pair| hash[pair[1]] = pair[0]; hash } - - @@keywords = { - "case" => :CASE, - "class" => :CLASS, - "default" => :DEFAULT, - "define" => :DEFINE, - "false" => :BOOLEAN, - "import" => :IMPORT, - "if" => :IF, - "elsif" => :ELSIF, - "else" => :ELSE, - "inherits" => :INHERITS, - "node" => :NODE, - "true" => :BOOLEAN, - "and" => :AND, - "or" => :OR, - "undef" => :UNDEF - } - - def clear - initvars - end +end - def expected - if @expected.empty? - nil - else - token = @expected[-1] - @@tokens.each do |value, name| - if token == name - return value - end - end - return token - end - end +module Puppet::Parser; end - # scan the whole file - # basically just used for testing - def fullscan - array = [] - - self.scan { |token,str| - # Ignore any definition nesting problems - @indefine = false - #Puppet.debug("got token '%s' => '%s'" % [token,str]) - if token.nil? - return array - else - array.push([token,str]) - end - } - return array +class Puppet::Parser::Lexer + attr_reader :last, :file + + attr_accessor :line, :indefine + + # Our base token class. + class Token + attr_accessor :regex, :name, :string, :skip, :incr_line, :skip_text + + def initialize(regex, name) + if regex.is_a?(String) + @name, @string = name, regex + @regex = Regexp.new(Regexp.escape(@string)) + else + @name, @regex = name, regex end + end - # this is probably pretty damned inefficient... - # it'd be nice not to have to load the whole file first... - def file=(file) - @file = file - @line = 1 - File.open(file) { |of| - str = "" - of.each { |line| str += line } - @scanner = StringScanner.new(str) - } + def skip? + self.skip + end + + def to_s + if self.string + @string + else + @name.to_s end + end + end - def indefine? - if defined? @indefine - @indefine - else - false - end + # Maintain a list of tokens. + class TokenList + attr_reader :regex_tokens, :string_tokens + + def [](name) + @tokens[name] + end + + # Create a new token. + def add_token(name, regex, options = {}, &block) + token = Token.new(regex, name) + raise(ArgumentError, "Token %s already exists" % name) if @tokens.include?(name) + @tokens[token.name] = token + if token.string + @string_tokens << token + @tokens_by_string[token.string] = token + else + @regex_tokens << token end - def initialize - initvars() + options.each do |name, option| + token.send(name.to_s + "=", option) end - def initvars - @line = 1 - @last = "" - @lasttoken = nil - @scanner = nil - @file = nil - # AAARRGGGG! okay, regexes in ruby are bloody annoying - # no one else has "\n" =~ /\s/ - @skip = %r{[ \t]+} + token.meta_def(:convert, &block) if block_given? + + token + end - @namestack = [] - @indefine = false + def initialize + @tokens = {} + @regex_tokens = [] + @string_tokens = [] + @tokens_by_string = {} + end - @expected = [] - end + # Look up a token by its value, rather than name. + def lookup(string) + @tokens_by_string[string] + end - # Go up one in the namespace. - def namepop - @namestack.pop + # Define more tokens. + def add_tokens(hash) + hash.each do |regex, name| + add_token(name, regex) end + end - # Collect the current namespace. - def namespace - @namestack.join("::") + # Sort our tokens by length, so we know once we match, we're done. + # This helps us avoid the O(n^2) nature of token matching. + def sort_tokens + @string_tokens.sort! { |a, b| b.string.length <=> a.string.length } + end + end + + TOKENS = TokenList.new + TOKENS.add_tokens( + '[' => :LBRACK, + ']' => :RBRACK, + '{' => :LBRACE, + '}' => :RBRACE, + '(' => :LPAREN, + ')' => :RPAREN, + '=' => :EQUALS, + '==' => :ISEQUAL, + '>=' => :GREATEREQUAL, + '>' => :GREATERTHAN, + '<' => :LESSTHAN, + '<=' => :LESSEQUAL, + '!=' => :NOTEQUAL, + '!' => :NOT, + ',' => :COMMA, + '.' => :DOT, + ':' => :COLON, + '@' => :AT, + '<<|' => :LLCOLLECT, + '|>>' => :RRCOLLECT, + '<|' => :LCOLLECT, + '|>' => :RCOLLECT, + ';' => :SEMIC, + '?' => :QMARK, + '\\' => :BACKSLASH, + '=>' => :FARROW, + '+>' => :PARROW, + %r{([a-z][-\w]*::)+[a-z][-\w]*} => :CLASSNAME, + %r{((::){0,1}[A-Z][-\w]*)+} => :CLASSREF + ) + + TOKENS.add_tokens "Whatever" => :DQTEXT, "Nomatter" => :SQTEXT, "alsonomatter" => :BOOLEAN + + TOKENS.add_token :NAME, %r{[a-z][-\w]*} do |lexer, value| + string_token = self + # we're looking for keywords here + if tmp = KEYWORDS.lookup(value) + string_token = tmp + if [:TRUE, :FALSE].include?(string_token.name) + value = eval(value) + string_token = TOKENS[:BOOLEAN] end + end + [string_token, value] + end + + TOKENS.add_token :NUMBER, %r{[0-9]+} do |lexer, value| + [TOKENS[:NAME], value] + end + + TOKENS.add_token :COMMENT, %r{#.*}, :skip => true + + TOKENS.add_token :RETURN, "\n", :skip => true, :incr_line => true, :skip_text => true + + TOKENS.add_token :SQUOTE, "'" do |lexer, value| + value = lexer.slurpstring(value) + [TOKENS[:SQTEXT], value] + end + + TOKENS.add_token :DQUOTE, '"' do |lexer, value| + value = lexer.slurpstring(value) + [TOKENS[:DQTEXT], value] + end + + TOKENS.add_token :VARIABLE, %r{\$(\w*::)*\w+} do |lexer, value| + value = value.sub(/^\$/, '') + [self, value] + end + + TOKENS.sort_tokens + + @@pairs = { + "{" => "}", + "(" => ")", + "[" => "]", + "<|" => "|>", + "<<|" => "|>>" + } + + KEYWORDS = TokenList.new + + KEYWORDS.add_tokens( + "case" => :CASE, + "class" => :CLASS, + "default" => :DEFAULT, + "define" => :DEFINE, + "import" => :IMPORT, + "if" => :IF, + "elsif" => :ELSIF, + "else" => :ELSE, + "inherits" => :INHERITS, + "node" => :NODE, + "and" => :AND, + "or" => :OR, + "undef" => :UNDEF, + "false" => :FALSE, + "true" => :TRUE + ) + + def clear + initvars + end + + def expected + return nil if @expected.empty? + name = @expected[-1] + raise "Could not find expected token %s" % name unless token = TOKENS.lookup(name) + + return token + end + + # scan the whole file + # basically just used for testing + def fullscan + array = [] + + self.scan { |token, str| + # Ignore any definition nesting problems + @indefine = false + array.push([token,str]) + } + return array + end + + # this is probably pretty damned inefficient... + # it'd be nice not to have to load the whole file first... + def file=(file) + @file = file + @line = 1 + File.open(file) { |of| + str = "" + of.each { |line| str += line } + @scanner = StringScanner.new(str) + } + end - # This value might have :: in it, but we don't care -- it'll be - # handled normally when joining, and when popping we want to pop - # this full value, however long the namespace is. - def namestack(value) - @namestack << value + def find_string_token + matched_token = value = nil + + # We know our longest string token is three chars, so try each size in turn + # until we either match or run out of chars. This way our worst-case is three + # tries, where it is otherwise the number of string chars we have. Also, + # the lookups are optimized hash lookups, instead of regex scans. + [3, 2, 1].each do |i| + str = @scanner.peek(i) + if matched_token = TOKENS.lookup(str) + value = @scanner.scan(matched_token.regex) + break end + end + + return matched_token, value + end - def rest - @scanner.rest + # Find the next token that matches a regex. We look for these first. + def find_regex_token + @regex += 1 + matched_token = nil + value = "" + length = 0 + + # I tried optimizing based on the first char, but it had + # a slightly negative affect and was a good bit more complicated. + TOKENS.regex_tokens.each do |token| + next unless match_length = @scanner.match?(token.regex) + + # We've found a longer match + if match_length > length + value = @scanner.scan(token.regex) + length = value.length + matched_token = token end + end - # this is the heart of the lexer - def scan - #Puppet.debug("entering scan") - if @scanner.nil? - raise TypeError.new("Invalid or empty string") - end + return matched_token, value + end - @scanner.skip(@skip) - until @scanner.eos? do - yielded = false - sendbreak = false # gah, this is a nasty hack - stoken = nil - sregex = nil - value = "" - - # first find out which type of token we've got - @@tokens.each { |regex,token| - # we're just checking, which doesn't advance the scan - # pointer - tmp = @scanner.check(regex) - if tmp.nil? - #puppet.debug("did not match %s to '%s'" % - # [regex,@scanner.rest]) - next - end - - # find the longest match - if tmp.length > value.length - value = tmp - stoken = token - sregex = regex - else - # we've already got a longer match - next - end - } - - # error out if we didn't match anything at all - if stoken.nil? - nword = nil - if @scanner.rest =~ /^(\S+)/ - nword = $1 - elsif@scanner.rest =~ /^(\s+)/ - nword = $1 - else - nword = @scanner.rest - end - raise "Could not match '%s'" % nword - end + # Find the next token, returning the string and the token. + def find_token + @find += 1 + matched_token, value = find_regex_token - value = @scanner.scan(sregex) + unless matched_token + matched_token, value = find_string_token + end - if value == "" - raise "Didn't match regex on token %s" % stoken - end + return matched_token, value + end - # token-specific operations - # if this gets much more complicated, it should - # be moved up to where the tokens themselves are defined - # which will get me about 75% of the way to a lexer generator - ptoken = stoken - case stoken - when :NAME then - wtoken = stoken - # we're looking for keywords here - if @@keywords.include?(value) - wtoken = @@keywords[value] - #Puppet.debug("token '%s'" % wtoken) - if wtoken == :BOOLEAN - value = eval(value) - end - end - ptoken = wtoken - when :NUMBER then - ptoken = :NAME - when :COMMENT then - # just throw comments away - next - when :RETURN then - @line += 1 - @scanner.skip(@skip) - next - when :SQUOTE then - #Puppet.debug("searching '%s' after '%s'" % [self.rest,value]) - value = self.slurpstring(value) - ptoken = :SQTEXT - #Puppet.debug("got string '%s' => '%s'" % [:DQTEXT,value]) - when :DQUOTE then - value = self.slurpstring(value) - ptoken = :DQTEXT - when :VARIABLE then - value = value.sub(/^\$/, '') - end + def indefine? + if defined? @indefine + @indefine + else + false + end + end - if match = @@pairs[value] and ptoken != :DQUOTE and ptoken != :SQUOTE - @expected << match - elsif exp = @expected[-1] and exp == value and ptoken != :DQUOTE and ptoken != :SQUOTE - @expected.pop - end + def initialize + @find = 0 + @regex = 0 + initvars() + end - yield [ptoken, value] + def initvars + @line = 1 + @previous_token = nil + @scanner = nil + @file = nil + # AAARRGGGG! okay, regexes in ruby are bloody annoying + # no one else has "\n" =~ /\s/ + @skip = %r{[ \t]+} + + @namestack = [] + @indefine = false + @expected = [] + end - if @lasttoken == :CLASS - namestack(value) - end + # Make any necessary changes to the token and/or value. + def munge_token(token, value) + @line += 1 if token.incr_line - if @lasttoken == :DEFINE - if indefine? - msg = "Cannot nest definition %s inside %s" % [value, @indefine] - self.indefine = false - raise Puppet::ParseError, msg - end + skip() if token.skip_text - @indefine = value - end + return if token.skip - @last = value - @lasttoken = ptoken + token, value = token.convert(self, value) if token.respond_to?(:convert) - @scanner.skip(@skip) - end - @scanner = nil - yield [false,false] - end + return unless token + + return token, value + end + + # Go up one in the namespace. + def namepop + @namestack.pop + end - # we've encountered an opening quote... - # slurp in the rest of the string and return it - def slurpstring(quote) - # we search for the next quote that isn't preceded by a - # backslash; the caret is there to match empty strings - str = @scanner.scan_until(/([^\\]|^)#{quote}/) - if str.nil? - raise Puppet::LexError.new("Unclosed quote after '%s' in '%s'" % - [self.last,self.rest]) + # Collect the current namespace. + def namespace + @namestack.join("::") + end + + # This value might have :: in it, but we don't care -- it'll be + # handled normally when joining, and when popping we want to pop + # this full value, however long the namespace is. + def namestack(value) + @namestack << value + end + + def rest + @scanner.rest + end + + # this is the heart of the lexer + def scan + #Puppet.debug("entering scan") + raise Puppet::LexError.new("Invalid or empty string") unless @scanner + + # Skip any initial whitespace. + skip() + + until @scanner.eos? do + yielded = false + matched_token, value = find_token + + # error out if we didn't match anything at all + if matched_token.nil? + nword = nil + # Try to pull a 'word' out of the remaining string. + if @scanner.rest =~ /^(\S+)/ + nword = $1 + elsif @scanner.rest =~ /^(\s+)/ + nword = $1 else - str.sub!(/#{quote}\Z/,"") - str.gsub!(/\\#{quote}/,quote) + nword = @scanner.rest end + raise "Could not match '%s'" % nword + end - return str + final_token, value = munge_token(matched_token, value) + + next unless final_token + + if match = @@pairs[value] and final_token.name != :DQUOTE and final_token.name != :SQUOTE + @expected << match + elsif exp = @expected[-1] and exp == value and final_token.name != :DQUOTE and final_token.name != :SQUOTE + @expected.pop end - # just parse a string, not a whole file - def string=(string) - @scanner = StringScanner.new(string) + yield [final_token.name, value] + + if @previous_token + namestack(value) if @previous_token.name == :CLASS + + if @previous_token.name == :DEFINE + if indefine? + msg = "Cannot nest definition %s inside %s" % [value, @indefine] + self.indefine = false + raise Puppet::ParseError, msg + end + + @indefine = value + end end + + @previous_token = final_token + skip() end - #--------------------------------------------------------------- + @scanner = nil + + # This indicates that we're done parsing. + yield [false,false] + end + + # Skip any skipchars in our remaining string. + def skip + @scanner.skip(@skip) end -end + # we've encountered an opening quote... + # slurp in the rest of the string and return it + def slurpstring(quote) + # we search for the next quote that isn't preceded by a + # backslash; the caret is there to match empty strings + str = @scanner.scan_until(/([^\\]|^)#{quote}/) + if str.nil? + raise Puppet::LexError.new("Unclosed quote after '%s' in '%s'" % + [self.last,self.rest]) + else + str.sub!(/#{quote}\Z/,"") + str.gsub!(/\\#{quote}/,quote) + end + + return str + end + + # just parse a string, not a whole file + def string=(string) + @scanner = StringScanner.new(string) + end +end |