From eddaca33a81206aab7c6627e5c91d22232445adf Mon Sep 17 00:00:00 2001
From: Rainer Gerhards <rgerhards@adiscon.com>
Date: Wed, 4 Jun 2008 11:11:52 +0200
Subject: enhanced property replacer to support multiple regex matches

---
 ChangeLog                  |  1 +
 doc/property_replacer.html | 11 +++++++++--
 runtime/msg.c              | 33 ++++++++++++++++++++++++++++-----
 template.c                 | 26 ++++++++++++++++++++++----
 template.h                 |  1 +
 5 files changed, 61 insertions(+), 11 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 0c7a4109..df941331 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -13,6 +13,7 @@ Version 3.19.5 (rgerhards), 2008-05-30
 - implemented in property replacer: if a regular expression does not match,
   it can now either return "**NO MATCH** (default, as before), a blank
   property or the full original property text
+- enhanced property replacer to support multiple regex matches
 ---------------------------------------------------------------------------
 Version 3.19.4 (rgerhards), 2008-05-27
 - implemented x509/certvalid gtls auth mode
diff --git a/doc/property_replacer.html b/doc/property_replacer.html
index b6eaae0f..86d28274 100644
--- a/doc/property_replacer.html
+++ b/doc/property_replacer.html
@@ -207,13 +207,18 @@ sequence with a regular expression is: "%msg:R:.*Sev:. \(.*\)
 \[.*--end%"</p>
 <p>It is possible to specify some parametes after the "R". These are
 comma-separated. They are:
-<p>R,&lt;regexp-type&gt;,&lt;submatch&gt;,&lt;nomatch&gt;
+<p>R,&lt;regexp-type&gt;,&lt;submatch&gt;,&lt;nomatch&gt;,&lt;match-number&gt;
 <p>regexp-type is either "BRE" for Posix basic regular expressions or
 "ERE" for extended ones. The string must be given in upper case. The
 default is "BRE" to be consistent with earlier versions of rsyslog that
 did not support ERE. The submatch identifies the submatch to be used
 with the result. A single digit is supported. Match 0 is the full match,
-while 1 to 9 are the acutal submatches.
+while 1 to 9 are the actual submatches. The match-number identifies which match to
+use, if the expression occurs more than once inside the string. Please note
+that the first match is number 0, the second 1 and so on. Up to 10 matches
+(up to number 9) are supported. Please note that it would be more
+natural to have the match-number in front of submatch, but this would break 
+backward-compatibility. So the match-number must be specified after "nomatch".
 <p>nomatch is either "DFLT", "BLANK" or "FIELD" (all upper case!). It tells
 what to use if no match is found. With "DFLT", the strig "**NO MATCH**" is
 used. This was the only supported value up to rsyslog 3.19.5. With "BLANK"
@@ -224,6 +229,8 @@ to be useful.
 submatch from the message string and replaces the expression with
 the full field if no match is found:
 <p>%msg:R,ERE,1,FIELD:for (vlan[0-9]*):--end%
+<p>and this takes the first submatch of the second match of said expression:
+<p>%msg:R,ERE,1,FIELD,1:for (vlan[0-9]*):--end%
 <p><b>Also, extraction can be done based on so-called
 "fields"</b>. To do so, place a "F" into FromChar. A field in its
 current definition is anything that is delimited by a delimiter
diff --git a/runtime/msg.c b/runtime/msg.c
index a90416ff..f195d3bd 100644
--- a/runtime/msg.c
+++ b/runtime/msg.c
@@ -1602,6 +1602,7 @@ char *MsgGetProp(msg_t *pMsg, struct templateEntry *pTpe,
 	char *pBufStart;
 	char *pBuf;
 	int iLen;
+	short iOffs;
 
 #ifdef	FEATURE_REGEXP
 	/* Variables necessary for regular expression matching */
@@ -1842,7 +1843,29 @@ char *MsgGetProp(msg_t *pMsg, struct templateEntry *pTpe,
 			dbgprintf("string to match for regex is: %s\n", pRes);
 
 			if(objUse(regexp, LM_REGEXP_FILENAME) == RS_RET_OK) {
-				if (0 != regexp.regexec(&pTpe->data.field.re, pRes, nmatch, pmatch, 0)) {
+				short iTry = 0;
+				uchar bFound = 0;
+				iOffs = 0;
+				/* first see if we find a match, iterating through the series of
+				 * potential matches over the string.
+				 */
+				while(!bFound) {
+					if(regexp.regexec(&pTpe->data.field.re, pRes + iOffs, nmatch, pmatch, 0) == 0) {
+						if(pmatch[0].rm_so == -1) {
+							dbgprintf("oops ... start offset of successful regexec is -1\n");
+							break;
+						}
+						if(iTry == pTpe->data.field.iMatchToUse) {
+							bFound = 1;
+						} else {
+							iOffs += pmatch[0].rm_eo;
+							++iTry;
+						}
+					} else {
+						break;
+					}
+				}
+				if(!bFound) {
 					/* we got no match! */
 					if(pTpe->data.field.nomatchAction != TPL_REGEX_NOMATCH_USE_WHOLE_FIELD) {
 						if (*pbMustBeFreed == 1) {
@@ -1857,7 +1880,7 @@ char *MsgGetProp(msg_t *pMsg, struct templateEntry *pTpe,
 				} else {
 					/* Match- but did it match the one we wanted? */
 					/* we got no match! */
-					if(pmatch[pTpe->data.field.iMatchToUse].rm_so == -1) {
+					if(pmatch[pTpe->data.field.iSubMatchToUse].rm_so == -1) {
 						if(pTpe->data.field.nomatchAction != TPL_REGEX_NOMATCH_USE_WHOLE_FIELD) {
 							if (*pbMustBeFreed == 1) {
 								free(pRes);
@@ -1873,8 +1896,8 @@ char *MsgGetProp(msg_t *pMsg, struct templateEntry *pTpe,
 					int iLenBuf;
 					char *pB;
 
-					iLenBuf = pmatch[pTpe->data.field.iMatchToUse].rm_eo
-						  - pmatch[pTpe->data.field.iMatchToUse].rm_so;
+					iLenBuf = pmatch[pTpe->data.field.iSubMatchToUse].rm_eo
+						  - pmatch[pTpe->data.field.iSubMatchToUse].rm_so;
 					pB = (char *) malloc((iLenBuf + 1) * sizeof(char));
 
 					if (pB == NULL) {
@@ -1885,7 +1908,7 @@ char *MsgGetProp(msg_t *pMsg, struct templateEntry *pTpe,
 					}
 
 					/* Lets copy the matched substring to the buffer */
-					memcpy(pB, pRes + pmatch[pTpe->data.field.iMatchToUse].rm_so, iLenBuf);
+					memcpy(pB, pRes + iOffs +  pmatch[pTpe->data.field.iSubMatchToUse].rm_so, iLenBuf);
 					pB[iLenBuf] = '\0';/* terminate string, did not happen before */
 
 					if (*pbMustBeFreed == 1)
diff --git a/template.c b/template.c
index 2b336ba9..e85be4be 100644
--- a/template.c
+++ b/template.c
@@ -534,14 +534,14 @@ static int do_Parameter(unsigned char **pp, struct template *pTpl)
 			}
 
 			/* now check for submatch ID */
-			pTpe->data.field.iMatchToUse = 0;
+			pTpe->data.field.iSubMatchToUse = 0;
 			if(*p == ',') {
 				/* in this case a number follows, which indicates which match
 				 * shall be used. This must be a single digit.
 				 */
 				++p; /* eat ',' */
 				if(isdigit((int) *p)) {
-					pTpe->data.field.iMatchToUse = *p - '0';
+					pTpe->data.field.iSubMatchToUse = *p - '0';
 					++p; /* eat digit */
 				}
 			}
@@ -561,12 +561,30 @@ static int do_Parameter(unsigned char **pp, struct template *pTpl)
 				   && (p[5] == ',' || p[5] == ':')) {
 					pTpe->data.field.nomatchAction = TPL_REGEX_NOMATCH_USE_WHOLE_FIELD;
 					p += 5; /* eat indicator sequence */
+				} else if(p[0] == ',') { /* empty, use default */
+					pTpe->data.field.nomatchAction = TPL_REGEX_NOMATCH_USE_DFLTSTR;
+					 /* do NOT advance p here: the nomatch field is empty, so there is
+					  * nothing to eat - the comma we see already belongs to the next field.
+					  */
 				} else {
 					errmsg.LogError(NO_ERRCODE, "error: invalid regular expression type, rest of line %s",
 				               (char*) p);
 				}
 			}
 
+			/* now check for match ID */
+			pTpe->data.field.iMatchToUse = 0;
+			if(*p == ',') {
+				/* in this case a number follows, which indicates which match
+				 * shall be used. This must be a single digit.
+				 */
+				++p; /* eat ',' */
+				if(isdigit((int) *p)) {
+					pTpe->data.field.iMatchToUse = *p - '0';
+					++p; /* eat digit */
+				}
+			}
+
 			if(*p != ':') {
 				/* There is something more than an R , this is invalid ! */
 				/* Complain on extra characters */
@@ -574,8 +592,8 @@ static int do_Parameter(unsigned char **pp, struct template *pTpl)
 				    (char*) *pp);
 			} else {
 				pTpe->data.field.has_regex = 1;
-				dbgprintf("we have a regexp and use match #%d\n",
-					  pTpe->data.field.iMatchToUse);
+				dbgprintf("we have a regexp and use match #%d, submatch #%d\n",
+					  pTpe->data.field.iMatchToUse, pTpe->data.field.iSubMatchToUse);
 			}
 		} else {
 			/* now we fall through the "regular" FromPos code */
diff --git a/template.h b/template.h
index dff06583..baf33d4e 100644
--- a/template.h
+++ b/template.h
@@ -69,6 +69,7 @@ struct templateEntry {
 			regex_t re;	/* APR: this is the regular expression */
 			short has_regex;
 			short iMatchToUse;/* which match should be obtained (10 max) */
+			short iSubMatchToUse;/* which submatch should be obtained (10 max) */
 			enum {
 				TPL_REGEX_BRE = 0, /* posix BRE */
 				TPL_REGEX_ERE = 1  /* posix ERE */
-- 
cgit