diff options
Diffstat (limited to 'rnc_tokenize.py')
-rw-r--r-- | rnc_tokenize.py | 118 |
1 files changed, 118 insertions, 0 deletions
#!/usr/bin/env python

# Define the tokenizer for RELAX NG compact syntax
# This file released to the Public Domain by David Mertz
#
# NOTE: every `t_*` function below carries its regular expression as its
# docstring (that string is what the `lex` module reads), so the rules are
# documented with `#` comments only -- never add a prose docstring to a
# `t_*` function.
# NOTE(review): rule order is presumably significant to `lex` (earlier
# function rules are expected to win over later ones, e.g. t_DEFINE before
# t_ID); keep the definitions in this order unless confirmed otherwise.
import lex

# Names of every token type the lexer may emit.
tokens = tuple('''
  ELEM ATTR EMPTY TEXT KEYWORD LITERAL ANNOTATION COMMENT
  BEG_PAREN END_PAREN BEG_BODY END_BODY EQUAL NAME CHOICE SEQ
  INTERLEAVE ANY SOME MAYBE WHITESPACE TODO DATATAG PATTERN
  DEFAULT_NS NS DATATYPES NS_ANNOTATION START DEFINE
  '''.split())

# Maps reserved words to the token type t_ID assigns them.  Keywords mapped
# to 'TODO' are recognised but have no dedicated token type.
reserved = {
    'element'    : 'ELEM',
    'attribute'  : 'ATTR',
    'empty'      : 'EMPTY',
    'text'       : 'TEXT',
    'div'        : 'TODO',
    'external'   : 'TODO',
    'grammar'    : 'TODO',
    'include'    : 'TODO',
    'inherit'    : 'TODO',
    'list'       : 'TODO',
    'mixed'      : 'TODO',
    'notAllowed' : 'TODO',
    'parent'     : 'TODO',
    'string'     : 'TODO',
    'token'      : 'TODO',
}


# `start = <pattern>` line; the token value is everything after the first
# '='.  Splitting only once keeps patterns that themselves contain '='
# (e.g. `start = xsd:string { pattern = "x" }`) intact.
def t_START(t):
    r"(?im)^start\s*=\s*.*$"
    t.value = t.value.split('=', 1)[1].strip()
    return t


# `name =` at the start of a line; the token value is the defined name.
def t_DEFINE(t):
    r"(?im)^[\w-]+\s*="
    t.value = t.value.split('=')[0].strip()
    return t


# `## text` line; the token value drops the leading "## ".
def t_ANNOTATION(t):
    r"(?im)^\#\# .*$"
    t.value = t.value[3:]
    return t


# `# text` line; the token value drops the leading "# ".
def t_COMMENT(t):
    r"(?im)^\# .*$"
    t.value = t.value[2:]
    return t


# `default namespace = <uri>`; the token value is the text after the first
# '=' (split once so a URI containing '=' is not truncated).
def t_DEFAULT_NS(t):
    r"(?im)default\s+namespace\s*=\s*.*$"
    t.value = t.value.split('=', 1)[1].strip()
    return t


# `datatypes xsd = <uri>`; the token value is the text after the first '='
# (split once so a URI containing '=' is not truncated).
def t_DATATYPES(t):
    r"(?im)datatypes\s+xsd\s*=\s*.*$"
    t.value = t.value.split('=', 1)[1].strip()
    return t


# `xsd:<type>`; the token value is the type name after the colon.
def t_DATATAG(t):
    r"xsd:\w+"
    t.value = t.value.split(':')[1]
    return t


# `{ pattern = "<regex>" }`; the token value is the text between the quotes.
# The closing brace is removed, the remainder is split on the FIRST '='
# only, and the surrounding quotes are stripped.  Splitting on every '='
# would mangle a quoted pattern that contains '=' (e.g. "a=b").
def t_PATTERN(t):
    r'{\s*pattern\s*=\s*".*"\s*}'
    t.value = t.value[:-1].split('=', 1)[1].strip()[1:-1]
    return t


# `namespace <rest of line>`; the token value is everything after the
# keyword and the whitespace that follows it.
def t_NS(t):
    r"(?im)^namespace\s+.*$"
    t.value = t.value.split(None, 1)[1]
    return t


# Any identifier-like word.  The match is always retyped: to the reserved
# word's token type when it is a key of `reserved`, otherwise to NAME.
def t_ID(t):
    r"[\w:_-]+"
    t.type = reserved.get(t.value, 'NAME')    # Check for reserved words
    return t


# Double-quoted literal (non-greedy); the token value drops the quotes.
def t_LITERAL(t):
    r'".+?"'
    t.value = t.value[1:-1]
    return t


# Simple punctuation/operator rules given as plain regex strings.
t_BEG_PAREN = r"\("
t_END_PAREN = r"\)"
t_BEG_BODY  = r"{"
t_END_BODY  = r"}"
t_EQUAL     = r"="
t_CHOICE    = r"[|]"
t_SEQ       = r","
t_INTERLEAVE= r"&"
t_ANY       = r"[*]"
t_SOME      = r"[+]"
t_MAYBE     = r"[?]"
t_WHITESPACE= r"\s+"
# Characters to skip between tokens.  The script entry point below deletes
# this name before lexing, presumably so whitespace is then reported through
# the WHITESPACE rule instead of being skipped.
t_ignore    = " \t\n\r"


# Error hook: skip one character of unrecognised input and carry on.
def t_error(t):
    t.skip(1)


def token_list(rnc):
    """Tokenize the string `rnc` and return its tokens as a list.

    Builds the lexer with `lex.lex()`, feeds it `rnc`, and collects every
    token returned by `lex.token()` until it returns None.
    """
    lex.lex()
    lex.input(rnc)
    ts = []
    while True:
        t = lex.token()
        if t is None:
            break
        ts.append(t)
    return ts


if __name__=='__main__':
    import sys
    del t_ignore
    # Use a separate name so the module-level `tokens` tuple is not rebound.
    toks = token_list(sys.stdin.read())
    # Parenthesised single-argument print works on both Python 2 and 3.
    print('\n'.join(map(repr, toks)))