diff options
author | Rainer Gerhards <rgerhards@adiscon.com> | 2013-09-20 17:24:17 +0200 |
---|---|---|
committer | Rainer Gerhards <rgerhards@adiscon.com> | 2013-09-20 17:24:17 +0200 |
commit | 666d301e148df2c150404c4fc73f63c6da933815 (patch) | |
tree | ab78f1dc555cfce5bfc503faf0439ff0c64b5d3b | |
parent | f06eb68bc6e623c64cca6bb50155ff67bcf545c6 (diff) | |
download | rsyslog-666d301e148df2c150404c4fc73f63c6da933815.tar.gz rsyslog-666d301e148df2c150404c4fc73f63c6da933815.tar.bz2 rsyslog-666d301e148df2c150404c4fc73f63c6da933815.zip |
mmutf8fix: stricter UTF-8 checking
-rw-r--r-- | doc/mmutf8fix.html | 6 | ||||
-rw-r--r-- | plugins/mmutf8fix/mmutf8fix.c | 39 |
2 files changed, 34 insertions, 11 deletions
diff --git a/doc/mmutf8fix.html b/doc/mmutf8fix.html index e8020685..6275c17e 100644 --- a/doc/mmutf8fix.html +++ b/doc/mmutf8fix.html @@ -45,9 +45,9 @@ ruleset. <p> </p> <p><b>Action Confguration Parameters</b>:</p> <ul> -<li><b>mode</b> - <b>utf8</b>/controlcharacters<br> +<li><b>mode</b> - <b>utf-8</b>/controlcharacters<br> This sets the basic detection mode. -<br>In <b>utf8</b> mode (the default), proper +<br>In <b>utf-8</b> mode (the default), proper UTF-8 encoding is checked and bytes which are not proper UTF-8 sequences are acted on. If a proper multi-byte start sequence byte is detected but any of the following bytes is invalid, the whole sequence is replaced by @@ -66,7 +66,7 @@ MUST be a <b>printable</b> US-ASCII character. <p><b>Caveats/Known Bugs:</b> <ul> -<li><b>only IPv4</b> is supported +<li>overlong UTF-8 encodings are currently not detected in utf-8 mode. </ul> <p><b>Samples:</b></p> diff --git a/plugins/mmutf8fix/mmutf8fix.c b/plugins/mmutf8fix/mmutf8fix.c index 7a5d1d03..41d98653 100644 --- a/plugins/mmutf8fix/mmutf8fix.c +++ b/plugins/mmutf8fix/mmutf8fix.c @@ -143,8 +143,8 @@ CODESTARTnewActInst if(!pvals[i].bUsed) continue; if(!strcmp(actpblk.descr[i].name, "mode")) { - if(!es_strbufcmp(pvals[i].val.d.estr, (uchar*)"utf8", - sizeof("utf8")-1)) { + if(!es_strbufcmp(pvals[i].val.d.estr, (uchar*)"utf-8", + sizeof("utf-8")-1)) { pData->mode = MODE_UTF8; } else if(!es_strbufcmp(pvals[i].val.d.estr, (uchar*)"controlcharacters", sizeof("controlcharacters")-1)) { @@ -191,28 +191,48 @@ doCC(instanceData *pData, uchar *msg, int lenMsg) } } +/* fix an invalid multibyte sequence */ +static inline void +fixInvldMBSeq(instanceData *pData, uchar *msg, int lenMsg, int strtIdx, int *endIdx, int8_t seqLen) +{ + int i; + + *endIdx = strtIdx + seqLen; + if(*endIdx > lenMsg) + *endIdx = lenMsg; + for(i = strtIdx ; i < *endIdx ; ++i) + msg[i] = pData->replChar; +} + static inline void doUTF8(instanceData *pData, uchar *msg, int lenMsg) { uchar c; int8_t seqLen, bytesLeft = 0; + uint32_t codepoint; int strtIdx, endIdx; - int i, j; + int i; for(i = 0 ; i < lenMsg ; ++i) { c = msg[i]; if(bytesLeft) { if((c & 0xc0) != 0x80) { /* sequence invalid, invalidate all bytes */ - endIdx = strtIdx + seqLen; - if(endIdx > lenMsg) - endIdx = lenMsg; - for(j = strtIdx ; j < endIdx ; ++j) - msg[j] = pData->replChar; + fixInvldMBSeq(pData, msg, lenMsg, strtIdx, &endIdx, + seqLen); i = endIdx - 1; bytesLeft = 0; } else { + codepoint = (codepoint << 6) | (c & 0x3f); --bytesLeft; + if(bytesLeft == 0) { + /* too-large codepoint? */ + if(codepoint > 0x10FFFF) { + fixInvldMBSeq(pData, msg, lenMsg, + strtIdx, &endIdx, + seqLen); + } + } } } else { if((c & 0x80) == 0) { @@ -222,14 +242,17 @@ doUTF8(instanceData *pData, uchar *msg, int lenMsg) /* 2-byte sequence */ strtIdx = i; seqLen = bytesLeft = 1; + codepoint = c & 0x1f; } else if((c & 0xf0) == 0xe0) { /* 3-byte sequence */ strtIdx = i; seqLen = bytesLeft = 2; + codepoint = c & 0x0f; } else if((c & 0xf8) == 0xf0) { /* 4-byte sequence */ strtIdx = i; seqLen = bytesLeft = 3; + codepoint = c & 0x07; } else { /* invalid (5&6 byte forbidden by RFC3629) */ msg[i] = pData->replChar; } |