2 files changed, 103 insertions, 18 deletions
diff --git a/doc/mmutf8fix.html b/doc/mmutf8fix.html
index c75e71bc..1a98f660 100644
--- a/doc/mmutf8fix.html
+++ b/doc/mmutf8fix.html
@@ -17,17 +17,12 @@ in non-UTF character sets, e.g. ISO 8859. As syslog does not have a way
 to convey the character set information, these sequences are not properly
 handled. While they are typically uncritical with plain text files, they can
 cause big headache with database sources as well as systems like ElasticSearch.
-<p>The module is an experiement at "fixing" such encoding problems. It
-begun as a very simple replacer of non-control characters, and actually breaks
-some UTF-8 encoding right now. If the module turns out to be useful, it
-should be enhanced to support modes that really detect invalid UTF8. In the longer term
+<p>The module supports different "fixing" modes. The current
+implementation always replaces each invalid byte with a single US-ASCII
+character. Additional replacement modes will probably be added in the future,
+depending on user demand.  In the longer term
 it could also be evolved into an any-charset-to-UTF8 converter. But
 first let's see if it really gets into widespread enough use.
-<p>What it currently does is simply replace all US-ASCII control characters
-(characters ouside the range of 32 to 126) by a configured replacement
-character. For forward compatibility, this will remain the default mode
-in the future. However, as said above, more useful modes will be added
-based on user feedback and demand.
 
 <p><b>Proper Usage</b>:</p>
 <p>Some notes are due for proper use of this module. This is a message modification
@@ -50,8 +45,22 @@ ruleset.
 <p>&nbsp;</p>
 <p><b>Action Confguration Parameters</b>:</p>
 <ul>
+<li><b>mode</b> - <b>utf8</b>/controlcharacters<br>
+This sets the basic detection mode.
+<br>In <b>utf8</b> mode (the default), proper
+UTF-8 encoding is checked and bytes which are not proper UTF-8 sequences
+are acted on. If a proper multi-byte start sequence byte is detected but
+any of the following bytes is invalid, the whole sequence is replaced by
+the replacement method. This mode is most useful with non-US-ASCII character
+sets, which validly include multibyte sequences. Note that in this mode
+control characters are NOT being replaced, because they are valid UTF-8.
+<br>In <b>controlcharacters</b> mode, all bytes which do not represent a
+printable US-ASCII character (codes 32 to 126) are replaced. Note that this
+also mangles valid UTF-8 multi-byte sequences, as these are (deliberately) outside
+of that character range.
 <li><b>replacementChar</b> - default " " (space), a single character<br>
-This is the character that invalid sequences are replaced by.
+This is the character that invalid sequences are replaced by. Currently, it
+MUST be a <b>printable</b> US-ASCII character.
 </ul>
 
 <p><b>Caveats/Known Bugs:</b>
diff --git a/plugins/mmutf8fix/mmutf8fix.c b/plugins/mmutf8fix/mmutf8fix.c
index 7ffa3ac5..7a5d1d03 100644
--- a/plugins/mmutf8fix/mmutf8fix.c
+++ b/plugins/mmutf8fix/mmutf8fix.c
@@ -50,10 +50,14 @@ MODULE_CNFNAME("mmutf8fix")
 DEFobjCurrIf(errmsg);
 DEF_OMOD_STATIC_DATA
 
-/* config variables */
+/* define operation modes we have */
+#define MODE_CC 0	 /* replace every byte outside printable US-ASCII (32..126) */
+#define MODE_UTF8 1	 /* replace bytes that do not form proper UTF-8 sequences */
 
+/* config variables */
 typedef struct _instanceData {
 	uchar replChar;
+	uint8_t mode;		/* operation mode: MODE_CC or MODE_UTF8 */
 } instanceData;
 
 struct modConfData_s {
@@ -66,6 +70,7 @@ static modConfData_t *runModConf = NULL;/* modConf ptr to use for the current ex
 /* tables for interfacing with the v6 config system */
 /* action (instance) parameters */
 static struct cnfparamdescr actpdescr[] = {
+	{ "mode", eCmdHdlrGetWord, 0 },	/* "utf8" (default) or "controlcharacters" */
 	{ "replacementchar", eCmdHdlrGetChar, 0 }
 };
 static struct cnfparamblk actpblk =
@@ -116,6 +121,7 @@ ENDfreeInstance
 static inline void
 setInstParamDefaults(instanceData *pData)
 {
+	pData->mode = MODE_UTF8;	/* UTF-8 checking unless "mode" is configured */
 	pData->replChar = ' ';
 }
 
@@ -136,7 +142,21 @@ CODESTARTnewActInst
 	for(i = 0 ; i < actpblk.nParams ; ++i) {
 		if(!pvals[i].bUsed)
 			continue;
-		if(!strcmp(actpblk.descr[i].name, "replacementchar")) {
+		if(!strcmp(actpblk.descr[i].name, "mode")) {
+			if(!es_strbufcmp(pvals[i].val.d.estr, (uchar*)"utf8",
+					 sizeof("utf8")-1)) {
+				pData->mode = MODE_UTF8;
+			} else if(!es_strbufcmp(pvals[i].val.d.estr, (uchar*)"controlcharacters",
+					 sizeof("controlcharacters")-1)) {
+				pData->mode = MODE_CC;
+			} else {
+				char *cstr = es_str2cstr(pvals[i].val.d.estr, NULL);
+				errmsg.LogError(0, RS_RET_INVLD_MODE,
+					"mmutf8fix: invalid mode '%s' - ignored",
+					cstr);
+				free(cstr);
+			}
+		} else if(!strcmp(actpblk.descr[i].name, "replacementchar")) {
 			pData->replChar = es_getBufAddr(pvals[i].val.d.estr)[0];
 		} else {
 			dbgprintf("mmutf8fix: program error, non-handled "
@@ -159,19 +179,76 @@ CODESTARTtryResume
 ENDtryResume
 
 
+/* CC mode: overwrite each byte outside 32..126 with pData->replChar */
+static inline void
+doCC(instanceData *pData, uchar *msg, int lenMsg)
+{
+	int left;	/* bytes not yet inspected */
+
+	for(left = lenMsg ; left > 0 ; --left, ++msg) {
+		if(*msg < 32 || *msg > 126)
+			*msg = pData->replChar;
+	}
+}
+
+/* Replace, in place, every byte of msg[0..lenMsg-1] that is not part of a
+ * structurally valid UTF-8 sequence by pData->replChar. Only lead and
+ * continuation bit patterns are checked, not the encoded code point value.
+ */
+static inline void
+doUTF8(instanceData *pData, uchar *msg, int lenMsg)
+{
+	uchar c;
+	int8_t bytesLeft = 0;	/* continuation bytes still expected */
+	int strtIdx = 0;	/* index of the lead byte of the open sequence */
+	int i, j;
+
+	for(i = 0 ; i < lenMsg ; ++i) {
+		c = msg[i];
+		if(bytesLeft) {
+			if((c & 0xc0) == 0x80) {
+				--bytesLeft;
+				continue;
+			}
+			/* Sequence broken: replace only the bytes consumed so far.
+			 * c is not one of them and is classified afresh below.
+			 */
+			for(j = strtIdx ; j < i ; ++j)
+				msg[j] = pData->replChar;
+			bytesLeft = 0;
+		}
+		if((c & 0x80) == 0) {
+			; /* 1-byte sequence, US-ASCII: nothing to do */
+		} else if((c & 0xe0) == 0xc0) {	/* 2-byte sequence */
+			strtIdx = i;
+			bytesLeft = 1;
+		} else if((c & 0xf0) == 0xe0) {	/* 3-byte sequence */
+			strtIdx = i;
+			bytesLeft = 2;
+		} else if((c & 0xf8) == 0xf0) {	/* 4-byte sequence */
+			strtIdx = i;
+			bytesLeft = 3;
+		} else { /* stray continuation byte, or 5/6-byte lead (RFC3629) */
+			msg[i] = pData->replChar;
+		}
+	}
+	/* a sequence cut off by the end of the message is invalid as well */
+	for(j = strtIdx ; bytesLeft && j < lenMsg ; ++j)
+		msg[j] = pData->replChar;
+}
+
 BEGINdoAction
 	msg_t *pMsg;
 	uchar *msg;
 	int lenMsg;
-	int i;
 CODESTARTdoAction
 	pMsg = (msg_t*) ppString[0];
 	lenMsg = getMSGLen(pMsg);
 	msg = getMSG(pMsg);
-	for(i = 0 ; i < lenMsg ; ++i) {
-		if(msg[i] < 32 || msg[i] > 126) {
-			msg[i] = pData->replChar;
-		}
+	if(pData->mode == MODE_CC) {
+		doCC(pData, msg, lenMsg);
+	} else {	/* MODE_UTF8, and any mode value other than MODE_CC */
+		doUTF8(pData, msg, lenMsg);
 	}
 ENDdoAction
 
@@ -203,7 +280,6 @@ CODEqueryEtryPt_STD_CONF2_QUERIES
 ENDqueryEtryPt
 
 
-
 BEGINmodInit()
 CODESTARTmodInit
 	*ipIFVersProvided = CURR_MOD_IF_VERSION; /* we only support the current interface specification */