From b5a09481faa2eda03b568839ed724970bc8a1adc Mon Sep 17 00:00:00 2001
From: Rainer Gerhards <rgerhards@adiscon.com>
Date: Tue, 19 Feb 2008 16:16:09 +0000
Subject: implemented initial tokenizer (stage work for expr parser)

---
 ctok.c | 283 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 282 insertions(+), 1 deletion(-)

(limited to 'ctok.c')

diff --git a/ctok.c b/ctok.c
index 917b3175..b6301ff6 100644
--- a/ctok.c
+++ b/ctok.c
@@ -30,6 +30,8 @@
 
 #include "config.h"
 #include <stdlib.h>
+#include <ctype.h>
+#include <strings.h>
 #include <assert.h>
 
 #include "rsyslog.h"
@@ -49,7 +51,7 @@ ENDobjConstruct(ctok)
 /* ConstructionFinalizer
  * rgerhards, 2008-01-09
  */
-rsRetVal ctokConstructFinalize(ctok_t *pThis)
+rsRetVal ctokConstructFinalize(ctok_t __attribute__((unused)) *pThis)
 {
 	DEFiRet;
 	RETiRet;
@@ -62,6 +64,285 @@ CODESTARTobjDestruct(ctok)
 	/* ... then free resources */
 ENDobjDestruct(ctok)
 
+
+/* Push the most recently read character back onto the input "stream".
+ * Only a single character of pushback is supported, and at least one
+ * character must have been read from the stream before calling this.
+ * The "stream" is currently an in-memory string, so pushback simply steps
+ * the read pointer back by one (which is why c itself is unused). With real
+ * streams, the strm object is meant to take over. -- rgerhards, 2008-02-19
+ */
+static rsRetVal
+ctokUngetCharFromStream(ctok_t *pThis, uchar __attribute__((unused)) c)
+{
+	DEFiRet;
+	ISOBJ_TYPE_assert(pThis, ctok);
+
+	pThis->pp -= 1; /* rewind by exactly one position */
+
+	RETiRet;
+}
+
+
+/* Fetch the next character from the input "stream" (for now an in-memory
+ * string) into *pc and advance the read position. At the end of the string
+ * nothing is consumed and RS_RET_EOS is returned. -- rgerhards, 2008-02-19
+ */
+static rsRetVal
+ctokGetCharFromStream(ctok_t *pThis, uchar *pc)
+{
+	DEFiRet;
+	ISOBJ_TYPE_assert(pThis, ctok);
+	ASSERT(pc != NULL);
+
+	if(*pThis->pp != '\0') {
+		*pc = *pThis->pp++;
+	} else {
+		/* the terminating NUL marks end-of-stream; stay on it */
+		ABORT_FINALIZE(RS_RET_EOS);
+	}
+
+finalize_it:
+	RETiRet;
+}
+
+
+/* Advance the input "stream" past leading whitespace so that the next read
+ * yields the first non-whitespace character. If the stream ends first, the
+ * status of the failed read (RS_RET_EOS) is returned. -- rgerhards, 2008-02-19
+ */
+static rsRetVal
+ctokSkipWhitespaceFromStream(ctok_t *pThis)
+{
+	DEFiRet;
+	uchar ch;
+
+	ISOBJ_TYPE_assert(pThis, ctok);
+
+	do {
+		CHKiRet(ctokGetCharFromStream(pThis, &ch));
+	} while(isspace(ch)); /* ch ends up as the first non-whitespace char */
+
+	/* ... which was read one step too far, so hand it back */
+	CHKiRet(ctokUngetCharFromStream(pThis, ch));
+
+dbgprintf("skipped whitepsace, stream now '%s'\n", pThis->pp);
+finalize_it:
+	RETiRet;
+}
+
+
+/* Get the next word from the input "stream" (currently just an in-memory
+ * string). A word is anything between whitespace. It is stored in pWordBuf
+ * and always NUL-terminated, also on error. A word that does not fit into
+ * lenWordBuf bytes (terminator included) is cut off there; the rest stays
+ * in the stream. A buffer of 128 bytes or more should always be by far
+ * sufficient. End of stream yields RS_RET_EOS. -- rgerhards, 2008-02-19
+ */
+static rsRetVal
+ctokGetWordFromStream(ctok_t *pThis, uchar *pWordBuf, size_t lenWordBuf)
+{
+	DEFiRet;
+	uchar c;
+
+	ISOBJ_TYPE_assert(pThis, ctok);
+	ASSERT(pWordBuf != NULL);
+	ASSERT(lenWordBuf > 0);
+
+	CHKiRet(ctokSkipWhitespaceFromStream(pThis));
+	for( ; lenWordBuf > 1 ; --lenWordBuf) { /* one byte stays reserved for the NUL */
+		CHKiRet(ctokGetCharFromStream(pThis, &c)); /* only read if it can be stored */
+		if(isspace(c)) {
+			break;
+		}
+		*pWordBuf++ = c; /* store AND advance, else the word always ends up empty */
+	}
+dbgprintf("end ctokGetWorkFromStream, stream now '%s'\n", pThis->pp);
+finalize_it:
+	*pWordBuf = '\0'; /* done down here so that error exits terminate the buffer, too */
+	RETiRet;
+}
+
+
+#if 0
+/* Disabled, word-based variant of ctokGetNextToken(): reads one whitespace-
+ * delimited word and maps it case-insensitively to a token (ctok_INVALID if
+ * unknown). End of stream is communicated via iRet. -- rgerhards, 2008-02-19
+ */
+rsRetVal
+ctokGetNextToken(ctok_t *pThis, ctok_token_t *pToken)
+{
+	DEFiRet;
+	uchar pszWord[128];
+
+	ISOBJ_TYPE_assert(pThis, ctok);
+	ASSERT(pToken != NULL);
+
+	CHKiRet(ctokGetWordFromStream(pThis, pszWord, sizeof(pszWord)/sizeof(uchar)));
+	/* now recognize words; strcasecmp() returns 0 on a match */
+	if(strcasecmp((char*)pszWord, "or") == 0) {
+		*pToken = ctok_OR;
+	} else if(strcasecmp((char*)pszWord, "and") == 0) {
+		*pToken = ctok_AND;
+	} else if(strcasecmp((char*)pszWord, "+") == 0) {
+		*pToken = ctok_PLUS;
+	} else if(strcasecmp((char*)pszWord, "-") == 0) {
+		*pToken = ctok_MINUS;
+	} else if(strcasecmp((char*)pszWord, "*") == 0) {
+		*pToken = ctok_TIMES;
+	} else if(strcasecmp((char*)pszWord, "/") == 0) {
+		*pToken = ctok_DIV;
+	} else if(strcasecmp((char*)pszWord, "%") == 0) {
+		*pToken = ctok_MOD;
+	} else if(strcasecmp((char*)pszWord, "not") == 0) {
+		*pToken = ctok_NOT;
+	} else if(strcasecmp((char*)pszWord, "(") == 0) {
+		*pToken = ctok_LPAREN;
+	} else if(strcasecmp((char*)pszWord, ")") == 0) {
+		*pToken = ctok_RPAREN;
+	} else if(strcasecmp((char*)pszWord, ",") == 0) {
+		*pToken = ctok_COMMA;
+	} else if(strcasecmp((char*)pszWord, "$") == 0) {
+		*pToken = ctok_DOLLAR;
+	} else if(strcasecmp((char*)pszWord, "'") == 0) {
+		*pToken = ctok_QUOTE;
+	} else if(strcasecmp((char*)pszWord, "\"") == 0) {
+		*pToken = ctok_DBL_QUOTE;
+	} else if(strcasecmp((char*)pszWord, "==") == 0) {
+		*pToken = ctok_CMP_EQ;
+	} else if(strcasecmp((char*)pszWord, "!=") == 0) {
+		*pToken = ctok_CMP_NEQ;
+	} else if(strcasecmp((char*)pszWord, "<>") == 0) { /* an alias for the non-C folks... */
+		*pToken = ctok_CMP_NEQ;
+	} else if(strcasecmp((char*)pszWord, "<") == 0) {
+		*pToken = ctok_CMP_LT;
+	} else if(strcasecmp((char*)pszWord, ">") == 0) {
+		*pToken = ctok_CMP_GT;
+	} else if(strcasecmp((char*)pszWord, "<=") == 0) {
+		*pToken = ctok_CMP_LTEQ;
+	} else if(strcasecmp((char*)pszWord, ">=") == 0) {
+		*pToken = ctok_CMP_GTEQ;
+	} else {
+		*pToken = ctok_INVALID; /* never hand back (or log) an unset token */
+	}
+
+RUNLOG_VAR("%d", *pToken);
+finalize_it:
+	RETiRet;
+}
+#endif
+
+
+/* Get the next token from the input stream. Whitespace in front of the token
+ * is skipped, then one token is recognized and stored in *pToken. Input that
+ * does not form a known token yields ctok_INVALID. End of stream is
+ * communicated via iRet.
+ *
+ * The operators "<" and ">" may or may not be followed by a second operator
+ * character ("<=", "<>", ">="), so one character of lookahead is read for
+ * them. If it does not extend the operator it is pushed back, so that it is
+ * not lost for the next token. If the stream ends directly behind "<" or ">",
+ * the operator is returned as usual and end of stream is reported by the
+ * next call.
+ *
+ * rgerhards, 2008-02-19
+ */
+rsRetVal
+ctokGetNextToken(ctok_t *pThis, ctok_token_t *pToken)
+{
+	DEFiRet;
+	rsRetVal iRetPeek; /* result of a lookahead read */
+	uchar c;
+
+	ISOBJ_TYPE_assert(pThis, ctok);
+	ASSERT(pToken != NULL);
+
+	CHKiRet(ctokSkipWhitespaceFromStream(pThis));
+
+	CHKiRet(ctokGetCharFromStream(pThis, &c)); /* read a character */
+	switch(c) {
+		/* keywords */
+		case 'o': /* or */
+			CHKiRet(ctokGetCharFromStream(pThis, &c)); /* read a character */
+			*pToken = (c == 'r')? ctok_OR : ctok_INVALID;
+			break;
+		case 'a': /* and */
+			CHKiRet(ctokGetCharFromStream(pThis, &c)); /* read a character */
+			if(c == 'n') {
+				CHKiRet(ctokGetCharFromStream(pThis, &c)); /* read a character */
+				*pToken = (c == 'd')? ctok_AND : ctok_INVALID;
+			} else {
+				*pToken = ctok_INVALID;
+			}
+			break;
+		case 'n': /* not */
+			CHKiRet(ctokGetCharFromStream(pThis, &c)); /* read a character */
+			if(c == 'o') {
+				CHKiRet(ctokGetCharFromStream(pThis, &c)); /* read a character */
+				*pToken = (c == 't')? ctok_NOT : ctok_INVALID;
+			} else {
+				*pToken = ctok_INVALID;
+			}
+			break;
+		/* comparison operators */
+		case '=': /* == */
+			CHKiRet(ctokGetCharFromStream(pThis, &c)); /* read a character */
+			*pToken = (c == '=')? ctok_CMP_EQ : ctok_INVALID;
+			break;
+		case '!': /* != */
+			CHKiRet(ctokGetCharFromStream(pThis, &c)); /* read a character */
+			*pToken = (c == '=')? ctok_CMP_NEQ : ctok_INVALID;
+			break;
+		case '<': /* <, <=, <> */
+			*pToken = ctok_CMP_LT; /* unless extended below */
+			iRetPeek = ctokGetCharFromStream(pThis, &c);
+			if(iRetPeek != RS_RET_EOS) { /* at EOS, "<" stands alone */
+				CHKiRet(iRetPeek);
+				if(c == '=') {
+					*pToken = ctok_CMP_LTEQ;
+				} else if(c == '>') {
+					*pToken = ctok_CMP_NEQ;
+				} else { /* start of the next token - push it back */
+					CHKiRet(ctokUngetCharFromStream(pThis, c));
+				}
+			}
+			break;
+		case '>': /* >, >= */
+			*pToken = ctok_CMP_GT; /* unless extended below */
+			iRetPeek = ctokGetCharFromStream(pThis, &c);
+			if(iRetPeek != RS_RET_EOS) { /* at EOS, ">" stands alone */
+				CHKiRet(iRetPeek);
+				if(c == '=') {
+					*pToken = ctok_CMP_GTEQ;
+				} else { /* start of the next token - push it back */
+					CHKiRet(ctokUngetCharFromStream(pThis, c));
+				}
+			}
+			break;
+		/* single-character tokens */
+		case '+': *pToken = ctok_PLUS; break;
+		case '-': *pToken = ctok_MINUS; break;
+		case '*': *pToken = ctok_TIMES; break;
+		case '/': *pToken = ctok_DIV; break;
+		case '%': *pToken = ctok_MOD; break;
+		case '(': *pToken = ctok_LPAREN; break;
+		case ')': *pToken = ctok_RPAREN; break;
+		case ',': *pToken = ctok_COMMA; break;
+		case '$': *pToken = ctok_DOLLAR; break;
+		case '\'': *pToken = ctok_QUOTE; break;
+		case '"': *pToken = ctok_DBL_QUOTE; break;
+		default:
+			*pToken = ctok_INVALID;
+			break;
+	}
+
+RUNLOG_VAR("%d", *pToken);
+
+finalize_it:
+	RETiRet;
+}
+
+
 /* property set methods */
 /* simple ones first */
 DEFpropSetMeth(ctok, pp, uchar*)
-- 
cgit