summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Rainer Gerhards <rgerhards@adiscon.com> 2013-09-20 17:24:17 +0200
committer Rainer Gerhards <rgerhards@adiscon.com> 2013-09-20 17:24:17 +0200
commit 666d301e148df2c150404c4fc73f63c6da933815 (patch)
tree ab78f1dc555cfce5bfc503faf0439ff0c64b5d3b
parent f06eb68bc6e623c64cca6bb50155ff67bcf545c6 (diff)
download rsyslog-666d301e148df2c150404c4fc73f63c6da933815.tar.gz
rsyslog-666d301e148df2c150404c4fc73f63c6da933815.tar.bz2
rsyslog-666d301e148df2c150404c4fc73f63c6da933815.zip
mmutf8fix: stricter UTF-8 checking
-rw-r--r-- doc/mmutf8fix.html 6
-rw-r--r-- plugins/mmutf8fix/mmutf8fix.c 39
2 files changed, 34 insertions, 11 deletions
diff --git a/doc/mmutf8fix.html b/doc/mmutf8fix.html
index e8020685..6275c17e 100644
--- a/doc/mmutf8fix.html
+++ b/doc/mmutf8fix.html
@@ -45,9 +45,9 @@ ruleset.
<p>&nbsp;</p>
<p><b>Action Confguration Parameters</b>:</p>
<ul>
-<li><b>mode</b> - <b>utf8</b>/controlcharacters<br>
+<li><b>mode</b> - <b>utf-8</b>/controlcharacters<br>
This sets the basic detection mode.
-<br>In <b>utf8</b> mode (the default), proper
+<br>In <b>utf-8</b> mode (the default), proper
UTF-8 encoding is checked and bytes which are not proper UTF-8 sequences
are acted on. If a proper multi-byte start sequence byte is detected but
any of the following bytes is invalid, the whole sequence is replaced by
@@ -66,7 +66,7 @@ MUST be a <b>printable</b> US-ASCII character.
<p><b>Caveats/Known Bugs:</b>
<ul>
-<li><b>only IPv4</b> is supported
+<li>overlong UTF-8 encodings are currently not detected in utf-8 mode.
</ul>
<p><b>Samples:</b></p>
diff --git a/plugins/mmutf8fix/mmutf8fix.c b/plugins/mmutf8fix/mmutf8fix.c
index 7a5d1d03..41d98653 100644
--- a/plugins/mmutf8fix/mmutf8fix.c
+++ b/plugins/mmutf8fix/mmutf8fix.c
@@ -143,8 +143,8 @@ CODESTARTnewActInst
if(!pvals[i].bUsed)
continue;
if(!strcmp(actpblk.descr[i].name, "mode")) {
- if(!es_strbufcmp(pvals[i].val.d.estr, (uchar*)"utf8",
- sizeof("utf8")-1)) {
+ if(!es_strbufcmp(pvals[i].val.d.estr, (uchar*)"utf-8",
+ sizeof("utf-8")-1)) {
pData->mode = MODE_UTF8;
} else if(!es_strbufcmp(pvals[i].val.d.estr, (uchar*)"controlcharacters",
sizeof("controlcharacters")-1)) {
@@ -191,28 +191,48 @@ doCC(instanceData *pData, uchar *msg, int lenMsg)
}
}
+/* fix an invalid multibyte sequence
+ * Overwrites the bytes msg[strtIdx] up to, but not including,
+ * msg[strtIdx + seqLen] with pData->replChar. The end position is
+ * limited to lenMsg, so nothing at or beyond index lenMsg is written.
+ * The (possibly limited) end position is handed back to the caller
+ * via *endIdx; the function itself returns nothing.
+ * Note that seqLen is used purely as a byte count relative to strtIdx;
+ * what it means in terms of the UTF-8 sequence is up to the caller.
+ */
+static inline void
+fixInvldMBSeq(instanceData *pData, uchar *msg, int lenMsg, int strtIdx, int *endIdx, int8_t seqLen)
+{
+ int i;
+
+ *endIdx = strtIdx + seqLen; /* exclusive end of the range to overwrite */
+ if(*endIdx > lenMsg) /* range would run past the message end: clamp it */
+ *endIdx = lenMsg;
+ for(i = strtIdx ; i < *endIdx ; ++i)
+ msg[i] = pData->replChar; /* replace byte in place */
+}
+
static inline void
doUTF8(instanceData *pData, uchar *msg, int lenMsg)
{
uchar c;
int8_t seqLen, bytesLeft = 0;
+ uint32_t codepoint;
int strtIdx, endIdx;
- int i, j;
+ int i;
for(i = 0 ; i < lenMsg ; ++i) {
c = msg[i];
if(bytesLeft) {
if((c & 0xc0) != 0x80) {
/* sequence invalid, invalidate all bytes */
- endIdx = strtIdx + seqLen;
- if(endIdx > lenMsg)
- endIdx = lenMsg;
- for(j = strtIdx ; j < endIdx ; ++j)
- msg[j] = pData->replChar;
+ fixInvldMBSeq(pData, msg, lenMsg, strtIdx, &endIdx,
+ seqLen);
i = endIdx - 1;
bytesLeft = 0;
} else {
+ codepoint = (codepoint << 6) | (c & 0x3f);
--bytesLeft;
+ if(bytesLeft == 0) {
+ /* too-large codepoint? */
+ if(codepoint > 0x10FFFF) {
+ fixInvldMBSeq(pData, msg, lenMsg,
+ strtIdx, &endIdx,
+ seqLen);
+ }
+ }
}
} else {
if((c & 0x80) == 0) {
@@ -222,14 +242,17 @@ doUTF8(instanceData *pData, uchar *msg, int lenMsg)
/* 2-byte sequence */
strtIdx = i;
seqLen = bytesLeft = 1;
+ codepoint = c & 0x1f;
} else if((c & 0xf0) == 0xe0) {
/* 3-byte sequence */
strtIdx = i;
seqLen = bytesLeft = 2;
+ codepoint = c & 0x0f;
} else if((c & 0xf8) == 0xf0) {
/* 4-byte sequence */
strtIdx = i;
seqLen = bytesLeft = 3;
+ codepoint = c & 0x07;
} else { /* invalid (5&6 byte forbidden by RFC3629) */
msg[i] = pData->replChar;
}