mmutf8fix: stricter UTF-8 checking

author: Rainer Gerhards <rgerhards@adiscon.com> 2013-09-20 17:24:17 +0200
committer: Rainer Gerhards <rgerhards@adiscon.com> 2013-09-20 17:24:17 +0200
commit: 666d301e148df2c150404c4fc73f63c6da933815 (patch)
tree: ab78f1dc555cfce5bfc503faf0439ff0c64b5d3b
parent: f06eb68bc6e623c64cca6bb50155ff67bcf545c6 (diff)
download: rsyslog-666d301e148df2c150404c4fc73f63c6da933815.tar.gz
rsyslog-666d301e148df2c150404c4fc73f63c6da933815.tar.bz2
rsyslog-666d301e148df2c150404c4fc73f63c6da933815.zip
2 files changed, 34 insertions, 11 deletions
diff --git a/doc/mmutf8fix.html b/doc/mmutf8fix.html
index e8020685..6275c17e 100644
--- a/doc/mmutf8fix.html
+++ b/doc/mmutf8fix.html
@@ -45,9 +45,9 @@ ruleset.
 <p>&nbsp;</p>
 <p><b>Action Confguration Parameters</b>:</p>
 <ul>
-<li><b>mode</b> - <b>utf8</b>/controlcharacters<br>
+<li><b>mode</b> - <b>utf-8</b>/controlcharacters<br>
 This sets the basic detection mode.
-<br>In <b>utf8</b> mode (the default), proper
+<br>In <b>utf-8</b> mode (the default), proper
 UTF-8 encoding is checked and bytes which are not proper UTF-8 sequences
 are acted on. If a proper multi-byte start sequence byte is detected but
 any of the following bytes is invalid, the whole sequence is replaced by
@@ -66,7 +66,7 @@ MUST be a <b>printable</b> US-ASCII character.
 
 <p><b>Caveats/Known Bugs:</b>
 <ul>
-<li><b>only IPv4</b> is supported
+<li>overlong UTF-8 encodings are currently not detected in utf-8 mode.
 </ul>
 
 <p><b>Samples:</b></p>
diff --git a/plugins/mmutf8fix/mmutf8fix.c b/plugins/mmutf8fix/mmutf8fix.c
index 7a5d1d03..41d98653 100644
--- a/plugins/mmutf8fix/mmutf8fix.c
+++ b/plugins/mmutf8fix/mmutf8fix.c
@@ -143,8 +143,8 @@ CODESTARTnewActInst
 		if(!pvals[i].bUsed)
 			continue;
 		if(!strcmp(actpblk.descr[i].name, "mode")) {
-			if(!es_strbufcmp(pvals[i].val.d.estr, (uchar*)"utf8",
-					 sizeof("utf8")-1)) {
+			if(!es_strbufcmp(pvals[i].val.d.estr, (uchar*)"utf-8",
+					 sizeof("utf-8")-1)) {
 				pData->mode = MODE_UTF8;
 			} else if(!es_strbufcmp(pvals[i].val.d.estr, (uchar*)"controlcharacters",
 					 sizeof("controlcharacters")-1)) {
@@ -191,28 +191,48 @@ doCC(instanceData *pData, uchar *msg, int lenMsg)
 	}
 }
 
+/* fix an invalid multibyte sequence: overwrite msg[strtIdx] up to, but not including, index strtIdx+seqLen (clamped to lenMsg) with pData->replChar; the exclusive end index actually used is handed back via *endIdx */
+static inline void
+fixInvldMBSeq(instanceData *pData, uchar *msg, int lenMsg, int strtIdx, int *endIdx, int8_t seqLen)
+{
+	int i;
+
+	*endIdx = strtIdx + seqLen; /* NOTE(review): doUTF8 sets seqLen to the count of continuation bytes, so the sequence's last byte lies outside this range - confirm this is intended */
+	if(*endIdx > lenMsg) /* sequence would run past the message: stop at the message end */
+		*endIdx = lenMsg;
+	for(i = strtIdx ; i < *endIdx ; ++i)
+		msg[i] = pData->replChar; /* replace in place; message length is unchanged */
+}
+
 static inline void
 doUTF8(instanceData *pData, uchar *msg, int lenMsg)
 {
 	uchar c;
 	int8_t seqLen, bytesLeft = 0;
+	uint32_t codepoint;
 	int strtIdx, endIdx;
-	int i, j;
+	int i;
 
 	for(i = 0 ; i < lenMsg ; ++i) {
 		c = msg[i];
 		if(bytesLeft) {
 			if((c & 0xc0) != 0x80) {
 				/* sequence invalid, invalidate all bytes */
-				endIdx = strtIdx + seqLen;
-				if(endIdx > lenMsg)
-					endIdx = lenMsg;
-				for(j = strtIdx ; j < endIdx ; ++j)
-					msg[j] = pData->replChar;
+				fixInvldMBSeq(pData, msg, lenMsg, strtIdx, &endIdx,
+				              seqLen);
 				i = endIdx - 1;
 				bytesLeft = 0;
 			} else {
+				codepoint = (codepoint << 6) | (c & 0x3f);
 				--bytesLeft;
+				if(bytesLeft == 0) {
+					/* too-large codepoint? */
+					if(codepoint > 0x10FFFF) {
+						fixInvldMBSeq(pData, msg, lenMsg,
+							      strtIdx, &endIdx,
+							      seqLen);
+					}
+				}
 			}
 		} else {
 			if((c & 0x80) == 0) {
@@ -222,14 +242,17 @@ doUTF8(instanceData *pData, uchar *msg, int lenMsg)
 				/* 2-byte sequence */
 				strtIdx = i;
 				seqLen = bytesLeft = 1;
+				codepoint = c & 0x1f;
 			} else if((c & 0xf0) == 0xe0) {
 				/* 3-byte sequence */
 				strtIdx = i;
 				seqLen = bytesLeft = 2;
+				codepoint = c & 0x0f;
 			} else if((c & 0xf8) == 0xf0) {
 				/* 4-byte sequence */
 				strtIdx = i;
 				seqLen = bytesLeft = 3;
+				codepoint = c & 0x07;
 			} else {   /* invalid (5&6 byte forbidden by RFC3629) */
 				msg[i] = pData->replChar;
 			}
author	Rainer Gerhards <rgerhards@adiscon.com>	2013-09-20 17:24:17 +0200
committer	Rainer Gerhards <rgerhards@adiscon.com>	2013-09-20 17:24:17 +0200
commit	666d301e148df2c150404c4fc73f63c6da933815 (patch)
tree	ab78f1dc555cfce5bfc503faf0439ff0c64b5d3b
parent	f06eb68bc6e623c64cca6bb50155ff67bcf545c6 (diff)
download	rsyslog-666d301e148df2c150404c4fc73f63c6da933815.tar.gz rsyslog-666d301e148df2c150404c4fc73f63c6da933815.tar.bz2 rsyslog-666d301e148df2c150404c4fc73f63c6da933815.zip