From b5a09481faa2eda03b568839ed724970bc8a1adc Mon Sep 17 00:00:00 2001
From: Rainer Gerhards
Date: Tue, 19 Feb 2008 16:16:09 +0000
Subject: implemented initial tokenizer (stage work for expr parser)

---
 ctok.c | 293 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 292 insertions(+), 1 deletion(-)

(limited to 'ctok.c')

diff --git a/ctok.c b/ctok.c
index 917b3175..b6301ff6 100644
--- a/ctok.c
+++ b/ctok.c
@@ -30,6 +30,8 @@
 
 #include "config.h"
 #include
+#include
+#include
 #include
 
 #include "rsyslog.h"
@@ -49,7 +51,7 @@ ENDobjConstruct(ctok)
 /* ConstructionFinalizer
  * rgerhards, 2008-01-09
  */
-rsRetVal ctokConstructFinalize(ctok_t *pThis)
+rsRetVal ctokConstructFinalize(ctok_t __attribute__((unused)) *pThis)
 {
 	DEFiRet;
 	RETiRet;
@@ -62,6 +64,295 @@ CODESTARTobjDestruct(ctok)
 	/* ... then free resources */
 ENDobjDestruct(ctok)
 
+
+/* unget character from input stream. At most one character can be ungotten.
+ * This function is only permitted to be called after at least one character
+ * has been read from the stream. Right now, we handle the situation simply by
+ * moving the string "stream" pointer one position backwards. If we work with
+ * real streams (some time), the strm object will handle the functionality
+ * itself. -- rgerhards, 2008-02-19
+ */
+static rsRetVal
+ctokUngetCharFromStream(ctok_t *pThis, uchar __attribute__((unused)) c)
+{
+	DEFiRet;
+
+	ISOBJ_TYPE_assert(pThis, ctok);
+	--pThis->pp;
+
+	RETiRet;
+}
+
+
+/* get the next character from the input "stream" (currently just an in-memory
+ * string...). At the terminating NUL nothing is consumed, *pc is left
+ * untouched and RS_RET_EOS is signalled. -- rgerhards, 2008-02-19
+ */
+static rsRetVal
+ctokGetCharFromStream(ctok_t *pThis, uchar *pc)
+{
+	DEFiRet;
+
+	ISOBJ_TYPE_assert(pThis, ctok);
+	ASSERT(pc != NULL);
+
+	if(*pThis->pp == '\0') {
+		ABORT_FINALIZE(RS_RET_EOS);
+	} else {
+		*pc = *pThis->pp;
+		++pThis->pp;
+	}
+
+finalize_it:
+	RETiRet;
+}
+
+
+/* skip whitespace in the input "stream". Afterwards the stream is positioned
+ * on the first non-whitespace character. If the end of the string is hit
+ * while skipping, that is communicated via iRet.
+ * rgerhards, 2008-02-19
+ */
+static rsRetVal
+ctokSkipWhitespaceFromStream(ctok_t *pThis)
+{
+	DEFiRet;
+	uchar c;
+
+	ISOBJ_TYPE_assert(pThis, ctok);
+
+	CHKiRet(ctokGetCharFromStream(pThis, &c));
+	while(isspace(c)) {
+		CHKiRet(ctokGetCharFromStream(pThis, &c));
+	}
+
+	/* we must unget the one non-whitespace we found */
+	CHKiRet(ctokUngetCharFromStream(pThis, c));
+
+dbgprintf("skipped whitepsace, stream now '%s'\n", pThis->pp);
+finalize_it:
+	RETiRet;
+}
+
+
+/* get the next word from the input "stream" (currently just an in-memory
+ * string...). A word is anything between whitespace. If the word is longer
+ * than the provided memory buffer, parsing terminates when buffer length
+ * has been reached. A buffer of 128 bytes or more should always be by
+ * far sufficient. -- rgerhards, 2008-02-19
+ */
+static rsRetVal
+ctokGetWordFromStream(ctok_t *pThis, uchar *pWordBuf, size_t lenWordBuf)
+{
+	DEFiRet;
+	uchar c;
+
+	ISOBJ_TYPE_assert(pThis, ctok);
+	ASSERT(pWordBuf != NULL);
+	ASSERT(lenWordBuf > 0);
+
+	CHKiRet(ctokSkipWhitespaceFromStream(pThis));
+
+	CHKiRet(ctokGetCharFromStream(pThis, &c));
+	while(!isspace(c) && lenWordBuf > 1) {
+		*pWordBuf++ = c; /* advance, else each char overwrites the first one */
+		--lenWordBuf;
+		CHKiRet(ctokGetCharFromStream(pThis, &c));
+	}
+	*pWordBuf = '\0'; /* there is always space for this - see while() */
+
+dbgprintf("end ctokGetWorkFromStream, stream now '%s'\n", pThis->pp);
+finalize_it:
+	RETiRet;
+}
+
+
+#if 0
+/* Get the next token from the input stream. This parses the next token and
+ * ignores any whitespace in between. End of stream is communicated via iRet.
+ * rgerhards, 2008-02-19
+ */
+rsRetVal
+ctokGetNextToken(ctok_t *pThis, ctok_token_t *pToken)
+{
+	DEFiRet;
+	uchar pszWord[128];
+
+	ISOBJ_TYPE_assert(pThis, ctok);
+	ASSERT(pToken != NULL);
+
+	CHKiRet(ctokGetWordFromStream(pThis, pszWord, sizeof(pszWord)/sizeof(uchar)));
+
+	/* now recognize words (strcasecmp() returns 0 on a match) */
+	if(strcasecmp((char*)pszWord, "or") == 0) {
+		*pToken = ctok_OR;
+	} else if(strcasecmp((char*)pszWord, "and") == 0) {
+		*pToken = ctok_AND;
+	} else if(strcasecmp((char*)pszWord, "+") == 0) {
+		*pToken = ctok_PLUS;
+	} else if(strcasecmp((char*)pszWord, "-") == 0) {
+		*pToken = ctok_MINUS;
+	} else if(strcasecmp((char*)pszWord, "*") == 0) {
+		*pToken = ctok_TIMES;
+	} else if(strcasecmp((char*)pszWord, "/") == 0) {
+		*pToken = ctok_DIV;
+	} else if(strcasecmp((char*)pszWord, "%") == 0) {
+		*pToken = ctok_MOD;
+	} else if(strcasecmp((char*)pszWord, "not") == 0) {
+		*pToken = ctok_NOT;
+	} else if(strcasecmp((char*)pszWord, "(") == 0) {
+		*pToken = ctok_LPAREN;
+	} else if(strcasecmp((char*)pszWord, ")") == 0) {
+		*pToken = ctok_RPAREN;
+	} else if(strcasecmp((char*)pszWord, ",") == 0) {
+		*pToken = ctok_COMMA;
+	} else if(strcasecmp((char*)pszWord, "$") == 0) {
+		*pToken = ctok_DOLLAR;
+	} else if(strcasecmp((char*)pszWord, "'") == 0) {
+		*pToken = ctok_QUOTE;
+	} else if(strcasecmp((char*)pszWord, "\"") == 0) {
+		*pToken = ctok_DBL_QUOTE;
+	} else if(strcasecmp((char*)pszWord, "==") == 0) {
+		*pToken = ctok_CMP_EQ;
+	} else if(strcasecmp((char*)pszWord, "!=") == 0) {
+		*pToken = ctok_CMP_NEQ;
+	} else if(strcasecmp((char*)pszWord, "<>") == 0) { /* an alias for the non-C folks... */
+		*pToken = ctok_CMP_NEQ;
+	} else if(strcasecmp((char*)pszWord, "<") == 0) {
+		*pToken = ctok_CMP_LT;
+	} else if(strcasecmp((char*)pszWord, ">") == 0) {
+		*pToken = ctok_CMP_GT;
+	} else if(strcasecmp((char*)pszWord, "<=") == 0) {
+		*pToken = ctok_CMP_LTEQ;
+	} else if(strcasecmp((char*)pszWord, ">=") == 0) {
+		*pToken = ctok_CMP_GTEQ;
+	} else {
+		*pToken = ctok_INVALID;
+	}
+
+RUNLOG_VAR("%d", *pToken);
+
+finalize_it:
+	RETiRet;
+}
+#endif
+
+
+/* Get the next token from the input stream. This parses the next token and
+ * ignores any whitespace in between. End of stream is communicated via iRet.
+ * Input that matches no known token is reported as ctok_INVALID.
+ * rgerhards, 2008-02-19
+ */
+rsRetVal
+ctokGetNextToken(ctok_t *pThis, ctok_token_t *pToken)
+{
+	DEFiRet;
+	uchar c;
+
+	ISOBJ_TYPE_assert(pThis, ctok);
+	ASSERT(pToken != NULL);
+
+	CHKiRet(ctokSkipWhitespaceFromStream(pThis));
+
+	CHKiRet(ctokGetCharFromStream(pThis, &c)); /* read a character */
+	switch(c) {
+		case 'o':/* or */
+			CHKiRet(ctokGetCharFromStream(pThis, &c)); /* read a character */
+			*pToken = (c == 'r')? ctok_OR : ctok_INVALID;
+			break;
+		case 'a': /* and */
+			CHKiRet(ctokGetCharFromStream(pThis, &c)); /* read a character */
+			if(c == 'n') {
+				CHKiRet(ctokGetCharFromStream(pThis, &c)); /* read a character */
+				*pToken = (c == 'd')? ctok_AND : ctok_INVALID;
+			} else {
+				*pToken = ctok_INVALID;
+			}
+			break;
+		case 'n': /* not */
+			CHKiRet(ctokGetCharFromStream(pThis, &c)); /* read a character */
+			if(c == 'o') {
+				CHKiRet(ctokGetCharFromStream(pThis, &c)); /* read a character */
+				*pToken = (c == 't')? ctok_NOT : ctok_INVALID;
+			} else {
+				*pToken = ctok_INVALID;
+			}
+			break;
+		case '=': /* == */
+			CHKiRet(ctokGetCharFromStream(pThis, &c)); /* read a character */
+			*pToken = (c == '=')? ctok_CMP_EQ : ctok_INVALID;
+			break;
+		case '!': /* != */
+			CHKiRet(ctokGetCharFromStream(pThis, &c)); /* read a character */
+			*pToken = (c == '=')? ctok_CMP_NEQ : ctok_INVALID;
+			break;
+		case '<': /* <, <=, <> */
+			CHKiRet(ctokGetCharFromStream(pThis, &c)); /* read a character */
+			if(c == '=') {
+				*pToken = ctok_CMP_LTEQ;
+			} else if(c == '>') {
+				*pToken = ctok_CMP_NEQ;
+			} else {
+				/* lookahead is not part of this token - push it back */
+				CHKiRet(ctokUngetCharFromStream(pThis, c));
+				*pToken = ctok_CMP_LT;
+			}
+			break;
+		case '>': /* >, >= */
+			CHKiRet(ctokGetCharFromStream(pThis, &c)); /* read a character */
+			if(c == '=') {
+				*pToken = ctok_CMP_GTEQ;
+			} else {
+				/* lookahead is not part of this token - push it back */
+				CHKiRet(ctokUngetCharFromStream(pThis, c));
+				*pToken = ctok_CMP_GT;
+			}
+			break;
+		case '+':
+			*pToken = ctok_PLUS;
+			break;
+		case '-':
+			*pToken = ctok_MINUS;
+			break;
+		case '*':
+			*pToken = ctok_TIMES;
+			break;
+		case '/':
+			*pToken = ctok_DIV;
+			break;
+		case '%':
+			*pToken = ctok_MOD;
+			break;
+		case '(':
+			*pToken = ctok_LPAREN;
+			break;
+		case ')':
+			*pToken = ctok_RPAREN;
+			break;
+		case ',':
+			*pToken = ctok_COMMA;
+			break;
+		case '$':
+			*pToken = ctok_DOLLAR;
+			break;
+		case '\'':
+			*pToken = ctok_QUOTE;
+			break;
+		case '"':
+			*pToken = ctok_DBL_QUOTE;
+			break;
+		default:
+			*pToken = ctok_INVALID;
+			break;
+	}
+
+RUNLOG_VAR("%d", *pToken);
+
+finalize_it:
+	RETiRet;
+}
+
+
 /* property set methods */
 /* simple ones first */
 DEFpropSetMeth(ctok, pp, uchar*)
-- 
cgit