Diffstat (limited to 'rnc_tokenize.py')
 rnc_tokenize.py | 118
 1 file changed, 118 insertions(+), 0 deletions(-)
diff --git a/rnc_tokenize.py b/rnc_tokenize.py
new file mode 100644
index 0000000..20cfc34
--- /dev/null
+++ b/rnc_tokenize.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python
+
+# Define the tokenizer for RELAX NG compact syntax.
+# This file is released to the public domain by David Mertz.
+import lex
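+# (with the standalone PLY 1.x distribution, lex.py is importable directly
+# as above; newer PLY releases package the same module as `from ply import lex`)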
+tokens = tuple('''
+ ELEM ATTR EMPTY TEXT KEYWORD LITERAL ANNOTATION COMMENT
+ BEG_PAREN END_PAREN BEG_BODY END_BODY EQUAL NAME CHOICE SEQ
+ INTERLEAVE ANY SOME MAYBE WHITESPACE TODO DATATAG PATTERN
+ DEFAULT_NS NS DATATYPES NS_ANNOTATION START DEFINE
+ '''.split())
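+# PLY requires this module-level `tokens` sequence: it must name every
+# token type the lexer can emit, including types assigned indirectly
+# through the `reserved` table below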
+
+reserved = {
+ 'element' : 'ELEM',
+ 'attribute' : 'ATTR',
+ 'empty' : 'EMPTY',
+ 'text' : 'TEXT',
+ 'div' : 'TODO',
+ 'external' : 'TODO',
+ 'grammar' : 'TODO',
+ 'include' : 'TODO',
+ 'inherit' : 'TODO',
+ 'list' : 'TODO',
+ 'mixed' : 'TODO',
+ 'notAllowed' : 'TODO',
+ 'parent' : 'TODO',
+ 'string' : 'TODO',
+ 'token' : 'TODO',
+}
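+# identifiers matched by t_NAME below are looked up in this table so that
+# RELAX NG keywords get their own token types; entries mapped to TODO are
+# recognized but not yet handled specially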
+
+def t_START(t):
+ r"(?im)^start\s*=\s*.*$"
+ t.value = t.value.split('=')[1].strip()
+ return t
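+# e.g. the line `start = addressBook` yields START with value "addressBook"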
+
+def t_DEFINE(t):
+ r"(?im)^[\w-]+\s*="
+ t.value = t.value.split('=')[0].strip()
+ return t
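+# e.g. `foo = element foo { text }` yields DEFINE with value "foo",
+# followed by ELEM, NAME, BEG_BODY, TEXT, END_BODY from the rules below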
+
+def t_ANNOTATION(t):
+ r"(?im)^\#\# .*$"
+ t.value = t.value[3:]
+ return t
+
+def t_COMMENT(t):
+ r"(?im)^\# .*$"
+ t.value = t.value[2:]
+ return t
+
+def t_DEFAULT_NS(t):
+ r"(?im)default\s+namespace\s*=\s*.*$"
+ t.value = t.value.split('=')[1].strip()
+ return t
+
+def t_DATATYPES(t):
+ r"(?im)datatypes\s+xsd\s*=\s*.*$"
+ t.value = t.value.split('=')[1].strip()
+ return t
+
+def t_DATATAG(t):
+ r"xsd:\w+"
+ t.value = t.value.split(':')[1]
+ return t
+
+def t_PATTERN(t):
+ r'{\s*pattern\s*=\s*".*"\s*}'
+ t.value = t.value[:-1].split('=')[1].strip()[1:-1]
+ return t
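+# e.g. `{ pattern = "[0-9]+" }` yields PATTERN with value [0-9]+
+# (the braces, keyword, and surrounding quotes are stripped)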
+
+def t_NS(t):
+ r"(?im)^namespace\s+.*$"
+ t.value = t.value.split(None,1)[1]
+ return t
+
+def t_NAME(t):  # rule renamed from t_ID: 'ID' is not a declared token
+    r"[\w:_-]+"
+    t.type = reserved.get(t.value,'NAME') # Check for reserved words
+    return t
+
+def t_LITERAL(t):
+ r'".+?"'
+ t.value = t.value[1:-1]
+ return t
+
+t_BEG_PAREN = r"\("
+t_END_PAREN = r"\)"
+t_BEG_BODY = r"{"
+t_END_BODY = r"}"
+t_EQUAL = r"="
+t_CHOICE = r"[|]"
+t_SEQ = r","
+t_INTERLEAVE= r"&"
+t_ANY = r"[*]"
+t_SOME = r"[+]"
+t_MAYBE = r"[?]"
+t_WHITESPACE= r"\s+"
+t_ignore = " \t\n\r"
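+# note: characters listed in t_ignore are skipped before any rule is
+# tried, so t_WHITESPACE only ever fires once t_ignore is removed, as
+# the __main__ block below does for its token dump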
+
+def t_error(t):
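+    # skip a single illegal character and resume scanning; newer PLY
+    # releases spell this t.lexer.skip(1)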
+ t.skip(1)
+
+def token_list(rnc):
+ lex.lex()
+ lex.input(rnc)
+ ts = []
+ while 1:
+ t = lex.token()
+ if t is None:
+ break
+ ts.append(t)
+ return ts
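+# minimal usage sketch: e.g. token_list(open('schema.rnc').read()) returns
+# a list of LexToken objects whose .type/.value can drive a parser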
+
+if __name__=='__main__':
+ import sys
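+    # drop the ignore set so runs of whitespace surface as WHITESPACE tokens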
+ del t_ignore
+    toks = token_list(sys.stdin.read()) # avoid rebinding PLY's `tokens` tuple
+    print '\n'.join(map(repr, toks))