From 99f18190a1f911224d45ca61706ae3fbc9ad7a80 Mon Sep 17 00:00:00 2001
From: Rainer Gerhards <rgerhards@adiscon.com>
Date: Thu, 29 May 2008 12:48:15 +0200
Subject: enhanced property replacer's regex to support submatches

- enabled Posix ERE expressions inside the property replacer
  (previously BRE was permitted only)
- provided ability to specify that a regular expression submatch shall
  be used inside the property replacer
---
 ChangeLog                  |  4 ++++
 doc/property_replacer.html | 15 +++++++++++++--
 runtime/msg.c              | 28 +++++++++++++++++++++-------
 template.c                 | 39 +++++++++++++++++++++++++++++++++++----
 template.h                 |  8 +++++++-
 5 files changed, 80 insertions(+), 14 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 3aec9670..cbb150ad 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,9 @@
 ---------------------------------------------------------------------------
 Version 3.19.5 (rgerhards), 2008-05-??
+- enabled Posix ERE expressions inside the property replacer
+  (previously BRE was permitted only)
+- provided ability to specify that a regular expression submatch shall
+  be used inside the property replacer
 ---------------------------------------------------------------------------
 Version 3.19.4 (rgerhards), 2008-05-27
 - implemented x509/certvalid gtls auth mode
diff --git a/doc/property_replacer.html b/doc/property_replacer.html
index 4fa7ee4a..992bf8e0 100644
--- a/doc/property_replacer.html
+++ b/doc/property_replacer.html
@@ -204,8 +204,19 @@ not become part of it. If you are using regular expressions, the
 property replacer will return the part of the property text that
 matches the regular expression. An example for a property replacer
 sequence with a regular expression is: "%msg:R:.*Sev:. \(.*\)
-\[.*--end%"<br>
-</p>
+\[.*--end%"</p>
+<p>It is possible to specify some parameters after the "R". These are
+comma-separated. They are:
+<p>R,&lt;regexp-type&gt;,&lt;submatch&gt;
+<p>regexp-type is either "BRE" for Posix basic regular expressions or
+"ERE" for extended ones. The string must be given in upper case. The
+default is "BRE" to be consistent with earlier versions of rsyslog that
+did not support ERE. The submatch identifies the submatch to be used
+with the result. A single digit is supported. Match 0 is the full match,
+while 1 to 9 are the actual submatches.
+<p>The following is a sample of an ERE expression that takes the first
+submatch from the message string:
+<p>%msg:R,ERE,1:for (vlan[0-9]*):--end%
 <p><b>Also, extraction can be done based on so-called
 "fields"</b>. To do so, place a "F" into FromChar. A field in its
 current definition is anything that is delimited by a delimiter
diff --git a/runtime/msg.c b/runtime/msg.c
index b421c88f..2798b7be 100644
--- a/runtime/msg.c
+++ b/runtime/msg.c
@@ -1605,8 +1605,8 @@ char *MsgGetProp(msg_t *pMsg, struct templateEntry *pTpe,
 
 #ifdef	FEATURE_REGEXP
 	/* Variables necessary for regular expression matching */
-	size_t nmatch = 1;
-	regmatch_t pmatch[1];
+	size_t nmatch = 10;
+	regmatch_t pmatch[10];
 #endif
 
 	assert(pMsg != NULL);
@@ -1839,7 +1839,7 @@ char *MsgGetProp(msg_t *pMsg, struct templateEntry *pTpe,
 				/* Could not compile regex before! */
 				return "**NO MATCH** **BAD REGULAR EXPRESSION**";
 
-			dbgprintf("debug: String to match for regex is: %s\n", pRes);
+			dbgprintf("string to match for regex is: %s\n", pRes);
 
 			if(objUse(regexp, LM_REGEXP_FILENAME) == RS_RET_OK) {
 				if (0 != regexp.regexec(&pTpe->data.field.re, pRes, nmatch, pmatch, 0)) {
@@ -1850,12 +1850,26 @@ char *MsgGetProp(msg_t *pMsg, struct templateEntry *pTpe,
 					}
 					return "**NO MATCH**";
 				} else {
-					/* Match! */
-					/* I need to malloc pB */
+{int i; for(i = 0 ; i < 10 ; ++i) {
+dbgprintf("rqtd regex match (nmatch %d) # %d, idx %d: so %d, eo %d\n", nmatch, pTpe->data.field.iMatchToUse, i,
+pmatch[i].rm_so,
+pmatch[i].rm_eo);
+}}
+					/* Match- but did it match the one we wanted? */
+					/* we got no match! */
+					if(pmatch[pTpe->data.field.iMatchToUse].rm_so == -1) {
+						if (*pbMustBeFreed == 1) {
+							free(pRes);
+							*pbMustBeFreed = 0;
+						}
+						return "**NO MATCH**";
+					}
+					/* OK, we have a usable match - we now need to malloc pB */
 					int iLenBuf;
 					char *pB;
 
-					iLenBuf = pmatch[0].rm_eo - pmatch[0].rm_so;
+					iLenBuf = pmatch[pTpe->data.field.iMatchToUse].rm_eo
+						  - pmatch[pTpe->data.field.iMatchToUse].rm_so;
 					pB = (char *) malloc((iLenBuf + 1) * sizeof(char));
 
 					if (pB == NULL) {
@@ -1866,7 +1880,7 @@ char *MsgGetProp(msg_t *pMsg, struct templateEntry *pTpe,
 					}
 
 					/* Lets copy the matched substring to the buffer */
-					memcpy(pB, pRes + pmatch[0].rm_so, iLenBuf);
+					memcpy(pB, pRes + pmatch[pTpe->data.field.iMatchToUse].rm_so, iLenBuf);
 					pB[iLenBuf] = '\0';/* terminate string, did not happen before */
 
 					if (*pbMustBeFreed == 1)
diff --git a/template.c b/template.c
index e5021f35..bccc6516 100644
--- a/template.c
+++ b/template.c
@@ -514,17 +514,47 @@ static int do_Parameter(unsigned char **pp, struct template *pTpl)
 	if(*p == ':') {
 		++p; /* eat ':' */
 #ifdef FEATURE_REGEXP
-		if (*p == 'R') {
+		if(*p == 'R') {
 			/* APR: R found! regex alarm ! :) */
 			++p;	/* eat ':' */
 
-			if (*p != ':') {
+			/* first comes the regex type */
+			if(*p == ',') {
+				++p; /* eat ',' */
+				if(*p == 'B' && *(p+1) == 'R' && *(p+2) == 'E' && *(p+3) == ',') {
+					pTpe->data.field.typeRegex = TPL_REGEX_BRE;
+					p += 3; /* eat indicator sequence */
+				} else if(*p == 'E' && *(p+1) == 'R' && *(p+2) == 'E' && *(p+3) == ',') {
+					pTpe->data.field.typeRegex = TPL_REGEX_ERE;
+					p += 3; /* eat indicator sequence */
+				} else {
+					errmsg.LogError(NO_ERRCODE, "error: invalid regular expression type, rest of line %s",
+				               (char*) p);
+				}
+			}
+
+			/* now check for submatch ID */
+			pTpe->data.field.iMatchToUse = 0;
+			if(*p == ',') {
+				/* in this case a number follows, which indicates which match
+				 * shall be used. This must be a single digit.
+				 */
+				++p; /* eat ',' */
+				if(isdigit((int) *p)) {
+					pTpe->data.field.iMatchToUse = *p - '0';
+					++p; /* eat digit */
+				}
+			}
+
+			if(*p != ':') {
 				/* There is something more than an R , this is invalid ! */
 				/* Complain on extra characters */
 				errmsg.LogError(NO_ERRCODE, "error: invalid character in frompos after \"R\", property: '%%%s'",
 				    (char*) *pp);
 			} else {
 				pTpe->data.field.has_regex = 1;
+				dbgprintf("we have a regexp and use match #%d\n",
+					  pTpe->data.field.iMatchToUse);
 			}
 		} else {
 			/* now we fall through the "regular" FromPos code */
@@ -620,8 +650,9 @@ static int do_Parameter(unsigned char **pp, struct template *pTpl)
 				/* Now i compile the regex */
 				/* Remember that the re is an attribute of the Template entry */
 				if((iRetLocal = objUse(regexp, LM_REGEXP_FILENAME)) == RS_RET_OK) {
-dbgprintf("compile data.field.re ptr: %p (pTpe %p)\n", (&(pTpe->data.field.re)), pTpe);
-					if(regexp.regcomp(&(pTpe->data.field.re), (char*) regex_char, 0) != 0) {
+					int iOptions;
+					iOptions = (pTpe->data.field.typeRegex == TPL_REGEX_ERE) ? REG_EXTENDED : 0;
+					if(regexp.regcomp(&(pTpe->data.field.re), (char*) regex_char, iOptions) != 0) {
 						dbgprintf("error: can not compile regex: '%s'\n", regex_char);
 						pTpe->data.field.has_regex = 2;
 					}
diff --git a/template.h b/template.h
index 5b0bcdb4..daeeb5fd 100644
--- a/template.h
+++ b/template.h
@@ -67,7 +67,13 @@ struct templateEntry {
 			unsigned iToPos;	/* up to that one... */
 #ifdef FEATURE_REGEXP
 			regex_t re;	/* APR: this is the regular expression */
-			unsigned has_regex;
+			short has_regex;
+			short iMatchToUse;/* which match should be obtained (10 max) */
+			enum {
+				TPL_REGEX_BRE = 0, /* posix BRE */
+				TPL_REGEX_ERE = 1  /* posix ERE */
+			} typeRegex;
+			
 #endif
 			unsigned has_fields; /* support for field-counting: field to extract */
 			unsigned char field_delim; /* support for field-counting: field delemiter char */
-- 
cgit v1.2.3