From 7d78b3bdfd357dd921797ce983eb96532c56a7f6 Mon Sep 17 00:00:00 2001
From: Rainer Gerhards <rgerhards@adiscon.com>
Date: Tue, 3 Nov 2009 10:41:47 +0100
Subject: restructured parser part of rsyslog

now cleaner and hopefully usuable as a basis for loadable parser
modules. I also cleaned up/consolidated some of the internal
message generation functionality in rsyslogd.
---
 runtime/parser.c | 382 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 382 insertions(+)

(limited to 'runtime')

diff --git a/runtime/parser.c b/runtime/parser.c
index e29f3b0b..89e59f87 100644
--- a/runtime/parser.c
+++ b/runtime/parser.c
@@ -37,7 +37,10 @@
 #include "dirty.h"
 #include "msg.h"
 #include "obj.h"
+#include "datetime.h"
 #include "errmsg.h"
+#include "unicode-helper.h"
+#include "dirty.h"
 
 /* some defines */
 #define DEFUPRI		(LOG_USER|LOG_NOTICE)
@@ -46,6 +49,7 @@
 DEFobjStaticHelpers
 DEFobjCurrIf(glbl)
 DEFobjCurrIf(errmsg)
+DEFobjCurrIf(datetime)
 
 /* static data */
 
@@ -60,11 +64,389 @@ rsRetVal parserClassInit(void)
 	CHKiRet(objGetObjInterface(&obj)); /* this provides the root pointer for all other queries */
 	CHKiRet(objUse(glbl, CORE_COMPONENT));
 	CHKiRet(objUse(errmsg, CORE_COMPONENT));
+	CHKiRet(objUse(datetime, CORE_COMPONENT));
 // TODO: free components! see action.c
 finalize_it:
 	RETiRet;
 }
 
+/***************************RFC 5425 PARSER ******************************************************/
+
+
+/* Helper to parseRFCSyslogMsg. This function parses a field up to
+ * (and including) the SP character after it. The field contents is
+ * returned in a caller-provided buffer. The parsepointer is advanced
+ * to after the terminating SP. The caller must ensure that the 
+ * provided buffer is large enough to hold the to be extracted value.
+ * Returns 0 if everything is fine or 1 if either the field is not
+ * SP-terminated or any other error occurs. -- rger, 2005-11-24
+ * The function now receives the size of the string and makes sure
+ * that it does not process more than that. The *pLenStr counter is
+ * updated on exit. -- rgerhards, 2009-09-23
+ */
+static int parseRFCField(uchar **pp2parse, uchar *pResult, int *pLenStr)
+{
+	uchar *p2parse;
+	int iRet = 0;
+
+	assert(pp2parse != NULL);
+	assert(*pp2parse != NULL);
+	assert(pResult != NULL);
+
+	p2parse = *pp2parse;
+
+	/* this is the actual parsing loop */
+	while(*pLenStr > 0  && *p2parse != ' ') {
+		*pResult++ = *p2parse++;
+		--(*pLenStr);
+	}
+
+	if(*pLenStr > 0 && *p2parse == ' ') {
+		++p2parse; /* eat SP, but only if not at end of string */
+		--(*pLenStr);
+	} else {
+		iRet = 1; /* there MUST be an SP! */
+	}
+	*pResult = '\0';
+
+	/* set the new parse pointer */
+	*pp2parse = p2parse;
+	return 0;
+}
+
+
+/* Helper to parseRFCSyslogMsg. This function parses the structured
+ * data field of a message. It does NOT parse inside structured data,
+ * just gets the field as whole. Parsing the single entities is left
+ * to other functions. The parsepointer is advanced
+ * to after the terminating SP. The caller must ensure that the 
+ * provided buffer is large enough to hold the to be extracted value.
+ * Returns 0 if everything is fine or 1 if either the field is not
+ * SP-terminated or any other error occurs. -- rger, 2005-11-24
+ * The function now receives the size of the string and makes sure
+ * that it does not process more than that. The *pLenStr counter is
+ * updated on exit. -- rgerhards, 2009-09-23
+ */
+static int parseRFCStructuredData(uchar **pp2parse, uchar *pResult, int *pLenStr)
+{
+	uchar *p2parse;
+	int bCont = 1;
+	int iRet = 0;
+	int lenStr;
+
+	assert(pp2parse != NULL);
+	assert(*pp2parse != NULL);
+	assert(pResult != NULL);
+
+	p2parse = *pp2parse;
+	lenStr = *pLenStr;
+
+	/* this is the actual parsing loop
+	 * Remeber: structured data starts with [ and includes any characters
+	 * until the first ] followed by a SP. There may be spaces inside
+	 * structured data. There may also be \] inside the structured data, which
+	 * do NOT terminate an element.
+	 */
+	if(lenStr == 0 || *p2parse != '[')
+		return 1; /* this is NOT structured data! */
+
+	if(*p2parse == '-') { /* empty structured data? */
+		*pResult++ = '-';
+		++p2parse;
+		--lenStr;
+	} else {
+		while(bCont) {
+			if(lenStr < 2) {
+				/* we now need to check if we have only structured data */
+				if(lenStr > 0 && *p2parse == ']') {
+					*pResult++ = *p2parse;
+					p2parse++;
+					lenStr--;
+					bCont = 0;
+				} else {
+					iRet = 1; /* this is not valid! */
+					bCont = 0;
+				}
+			} else if(*p2parse == '\\' && *(p2parse+1) == ']') {
+				/* this is escaped, need to copy both */
+				*pResult++ = *p2parse++;
+				*pResult++ = *p2parse++;
+				lenStr -= 2;
+			} else if(*p2parse == ']' && *(p2parse+1) == ' ') {
+				/* found end, just need to copy the ] and eat the SP */
+				*pResult++ = *p2parse;
+				p2parse += 2;
+				lenStr -= 2;
+				bCont = 0;
+			} else {
+				*pResult++ = *p2parse++;
+				--lenStr;
+			}
+		}
+	}
+
+	if(lenStr > 0 && *p2parse == ' ') {
+		++p2parse; /* eat SP, but only if not at end of string */
+		--lenStr;
+	} else {
+		iRet = 1; /* there MUST be an SP! */
+	}
+	*pResult = '\0';
+
+	/* set the new parse pointer */
+	*pp2parse = p2parse;
+	*pLenStr = lenStr;
+	return 0;
+}
+
+/* parse a RFC5424-formatted syslog message. This function returns
+ * 0 if processing of the message shall continue and 1 if something
+ * went wrong and this messe should be ignored. This function has been
+ * implemented in the effort to support syslog-protocol. Please note that
+ * the name (parse *RFC*) stems from the hope that syslog-protocol will
+ * some time become an RFC. Do not confuse this with informational
+ * RFC 3164 (which is legacy syslog).
+ *
+ * currently supported format:
+ *
+ * <PRI>VERSION SP TIMESTAMP SP HOSTNAME SP APP-NAME SP PROCID SP MSGID SP [SD-ID]s SP MSG
+ *
+ * <PRI> is already stripped when this function is entered. VERSION already
+ * has been confirmed to be "1", but has NOT been stripped from the message.
+ *
+ * rger, 2005-11-24
+ */
+static int parseRFCSyslogMsg(msg_t *pMsg, int flags)
+{
+	uchar *p2parse;
+	uchar *pBuf;
+	int lenMsg;
+	int bContParse = 1;
+
+	BEGINfunc
+	assert(pMsg != NULL);
+	assert(pMsg->pszRawMsg != NULL);
+	p2parse = pMsg->pszRawMsg + pMsg->offAfterPRI; /* point to start of text, after PRI */
+	lenMsg = pMsg->iLenRawMsg - pMsg->offAfterPRI;
+
+	/* do a sanity check on the version and eat it (the caller checked this already) */
+	assert(p2parse[0] == '1' && p2parse[1] == ' ');
+	p2parse += 2;
+	lenMsg -= 2;
+
+	/* Now get us some memory we can use as a work buffer while parsing.
+	 * We simply allocated a buffer sufficiently large to hold all of the
+	 * message, so we can not run into any troubles. I think this is
+	 * more wise then to use individual buffers.
+	 */
+	if((pBuf = MALLOC(sizeof(uchar) * (lenMsg + 1))) == NULL)
+		return 1;
+		
+	/* IMPORTANT NOTE:
+	 * Validation is not actually done below nor are any errors handled. I have
+	 * NOT included this for the current proof of concept. However, it is strongly
+	 * advisable to add it when this code actually goes into production.
+	 * rgerhards, 2005-11-24
+	 */
+
+	/* TIMESTAMP */
+	if(datetime.ParseTIMESTAMP3339(&(pMsg->tTIMESTAMP),  &p2parse, &lenMsg) == RS_RET_OK) {
+		if(flags & IGNDATE) {
+			/* we need to ignore the msg data, so simply copy over reception date */
+			memcpy(&pMsg->tTIMESTAMP, &pMsg->tRcvdAt, sizeof(struct syslogTime));
+		}
+	} else {
+		DBGPRINTF("no TIMESTAMP detected!\n");
+		bContParse = 0;
+	}
+
+	/* HOSTNAME */
+	if(bContParse) {
+		parseRFCField(&p2parse, pBuf, &lenMsg);
+		MsgSetHOSTNAME(pMsg, pBuf, ustrlen(pBuf));
+	}
+
+	/* APP-NAME */
+	if(bContParse) {
+		parseRFCField(&p2parse, pBuf, &lenMsg);
+		MsgSetAPPNAME(pMsg, (char*)pBuf);
+	}
+
+	/* PROCID */
+	if(bContParse) {
+		parseRFCField(&p2parse, pBuf, &lenMsg);
+		MsgSetPROCID(pMsg, (char*)pBuf);
+	}
+
+	/* MSGID */
+	if(bContParse) {
+		parseRFCField(&p2parse, pBuf, &lenMsg);
+		MsgSetMSGID(pMsg, (char*)pBuf);
+	}
+
+	/* STRUCTURED-DATA */
+	if(bContParse) {
+		parseRFCStructuredData(&p2parse, pBuf, &lenMsg);
+		MsgSetStructuredData(pMsg, (char*)pBuf);
+	}
+
+	/* MSG */
+	MsgSetMSGoffs(pMsg, p2parse - pMsg->pszRawMsg);
+
+	free(pBuf);
+	ENDfunc
+	return 0; /* all ok */
+}
+
+
+/*********************** END RFC5425 PARSER ******************************************/
+
+/***************************RFC 5425 PARSER ******************************************************/
+
+
+/* parse a legay-formatted syslog message. This function returns
+ * 0 if processing of the message shall continue and 1 if something
+ * went wrong and this messe should be ignored. This function has been
+ * implemented in the effort to support syslog-protocol.
+ * rger, 2005-11-24
+ * As of 2006-01-10, I am removing the logic to continue parsing only
+ * when a valid TIMESTAMP is detected. Validity of other fields already
+ * is ignored. This is due to the fact that the parser has grown smarter
+ * and is now more able to understand different dialects of the syslog
+ * message format. I do not expect any bad side effects of this change,
+ * but I thought I log it in this comment.
+ * rgerhards, 2006-01-10
+ */
+static int parseLegacySyslogMsg(msg_t *pMsg, int flags)
+{
+	uchar *p2parse;
+	int lenMsg;
+	int bTAGCharDetected;
+	int i;	/* general index for parsing */
+	uchar bufParseTAG[CONF_TAG_MAXSIZE];
+	uchar bufParseHOSTNAME[CONF_TAG_HOSTNAME];
+	BEGINfunc
+
+	assert(pMsg != NULL);
+	assert(pMsg->pszRawMsg != NULL);
+	lenMsg = pMsg->iLenRawMsg - (pMsg->offAfterPRI + 1);
+	p2parse = pMsg->pszRawMsg + pMsg->offAfterPRI; /* point to start of text, after PRI */
+
+	/* Check to see if msg contains a timestamp. We start by assuming
+	 * that the message timestamp is the time of reception (which we 
+	 * generated ourselfs and then try to actually find one inside the
+	 * message. There we go from high-to low precison and are done
+	 * when we find a matching one. -- rgerhards, 2008-09-16
+	 */
+	if(datetime.ParseTIMESTAMP3339(&(pMsg->tTIMESTAMP), &p2parse, &lenMsg) == RS_RET_OK) {
+		/* we are done - parse pointer is moved by ParseTIMESTAMP3339 */;
+	} else if(datetime.ParseTIMESTAMP3164(&(pMsg->tTIMESTAMP), &p2parse, &lenMsg) == RS_RET_OK) {
+		/* we are done - parse pointer is moved by ParseTIMESTAMP3164 */;
+	} else if(*p2parse == ' ' && lenMsg > 1) { /* try to see if it is slighly malformed - HP procurve seems to do that sometimes */
+		++p2parse;	/* move over space */
+		--lenMsg;
+		if(datetime.ParseTIMESTAMP3164(&(pMsg->tTIMESTAMP), &p2parse, &lenMsg) == RS_RET_OK) {
+			/* indeed, we got it! */
+			/* we are done - parse pointer is moved by ParseTIMESTAMP3164 */;
+		} else {/* parse pointer needs to be restored, as we moved it off-by-one
+			 * for this try.
+			 */
+			--p2parse;
+			++lenMsg;
+		}
+	}
+
+	if(flags & IGNDATE) {
+		/* we need to ignore the msg data, so simply copy over reception date */
+		memcpy(&pMsg->tTIMESTAMP, &pMsg->tRcvdAt, sizeof(struct syslogTime));
+	}
+
+	/* rgerhards, 2006-03-13: next, we parse the hostname and tag. But we 
+	 * do this only when the user has not forbidden this. I now introduce some
+	 * code that allows a user to configure rsyslogd to treat the rest of the
+	 * message as MSG part completely. In this case, the hostname will be the
+	 * machine that we received the message from and the tag will be empty. This
+	 * is meant to be an interim solution, but for now it is in the code.
+	 */
+	if(bParseHOSTNAMEandTAG && !(flags & INTERNAL_MSG)) {
+		/* parse HOSTNAME - but only if this is network-received!
+		 * rger, 2005-11-14: we still have a problem with BSD messages. These messages
+		 * do NOT include a host name. In most cases, this leads to the TAG to be treated
+		 * as hostname and the first word of the message as the TAG. Clearly, this is not
+		 * of advantage ;) I think I have now found a way to handle this situation: there
+		 * are certain characters which are frequently used in TAG (e.g. ':'), which are
+		 * *invalid* in host names. So while parsing the hostname, I check for these characters.
+		 * If I find them, I set a simple flag but continue. After parsing, I check the flag.
+		 * If it was set, then we most probably do not have a hostname but a TAG. Thus, I change
+		 * the fields. I think this logic shall work with any type of syslog message.
+		 * rgerhards, 2009-06-23: and I now have extended this logic to every character
+		 * that is not a valid hostname.
+		 */
+		bTAGCharDetected = 0;
+		if(lenMsg > 0 && flags & PARSE_HOSTNAME) {
+			i = 0;
+			while(i < lenMsg && (isalnum(p2parse[i]) || p2parse[i] == '.' || p2parse[i] == '.'
+				|| p2parse[i] == '_' || p2parse[i] == '-') && i < CONF_TAG_MAXSIZE) {
+				bufParseHOSTNAME[i] = p2parse[i];
+				++i;
+			}
+
+			if(i > 0 && p2parse[i] == ' ' && isalnum(p2parse[i-1])) {
+				/* we got a hostname! */
+				p2parse += i + 1; /* "eat" it (including SP delimiter) */
+				lenMsg -= i + 1;
+				bufParseHOSTNAME[i] = '\0';
+				MsgSetHOSTNAME(pMsg, bufParseHOSTNAME, i);
+			}
+		}
+
+		/* now parse TAG - that should be present in message from all sources.
+		 * This code is somewhat not compliant with RFC 3164. As of 3164,
+		 * the TAG field is ended by any non-alphanumeric character. In
+		 * practice, however, the TAG often contains dashes and other things,
+		 * which would end the TAG. So it is not desirable. As such, we only
+		 * accept colon and SP to be terminators. Even there is a slight difference:
+		 * a colon is PART of the TAG, while a SP is NOT part of the tag
+		 * (it is CONTENT). Starting 2008-04-04, we have removed the 32 character
+		 * size limit (from RFC3164) on the tag. This had bad effects on existing
+		 * envrionments, as sysklogd didn't obey it either (probably another bug
+		 * in RFC3164...). We now receive the full size, but will modify the
+		 * outputs so that only 32 characters max are used by default.
+		 */
+		i = 0;
+		while(lenMsg > 0 && *p2parse != ':' && *p2parse != ' ' && i < CONF_TAG_MAXSIZE) {
+			bufParseTAG[i++] = *p2parse++;
+			--lenMsg;
+		}
+		if(lenMsg > 0 && *p2parse == ':') {
+			++p2parse; 
+			--lenMsg;
+			bufParseTAG[i++] = ':';
+		}
+
+		/* no TAG can only be detected if the message immediatly ends, in which case an empty TAG
+		 * is considered OK. So we do not need to check for empty TAG. -- rgerhards, 2009-06-23
+		 */
+		bufParseTAG[i] = '\0';	/* terminate string */
+		MsgSetTAG(pMsg, bufParseTAG, i);
+	} else {/* we enter this code area when the user has instructed rsyslog NOT
+		 * to parse HOSTNAME and TAG - rgerhards, 2006-03-13
+		 */
+		if(!(flags & INTERNAL_MSG)) {
+			DBGPRINTF("HOSTNAME and TAG not parsed by user configuraton.\n");
+		}
+	}
+
+	/* The rest is the actual MSG */
+	MsgSetMSGoffs(pMsg, p2parse - pMsg->pszRawMsg);
+
+	ENDfunc
+	return 0; /* all ok */
+}
+
+
+/***************************END RFC 5425 PARSER ******************************************************/
+
 
 /* uncompress a received message if it is compressed.
  * pMsg->pszRawMsg buffer is updated.
-- 
cgit