diff options
author | Rainer Gerhards <rgerhards@adiscon.com> | 2013-09-20 17:24:17 +0200 |
---|---|---|
committer | Rainer Gerhards <rgerhards@adiscon.com> | 2013-09-20 17:24:17 +0200 |
commit | 666d301e148df2c150404c4fc73f63c6da933815 (patch) | |
tree | ab78f1dc555cfce5bfc503faf0439ff0c64b5d3b /plugins/mmutf8fix/mmutf8fix.c | |
parent | f06eb68bc6e623c64cca6bb50155ff67bcf545c6 (diff) | |
download | rsyslog-666d301e148df2c150404c4fc73f63c6da933815.tar.gz rsyslog-666d301e148df2c150404c4fc73f63c6da933815.tar.bz2 rsyslog-666d301e148df2c150404c4fc73f63c6da933815.zip |
mmutf8fix: stricter UTF-8 checking
Diffstat (limited to 'plugins/mmutf8fix/mmutf8fix.c')
-rw-r--r-- | plugins/mmutf8fix/mmutf8fix.c | 39 |
1 files changed, 31 insertions, 8 deletions
diff --git a/plugins/mmutf8fix/mmutf8fix.c b/plugins/mmutf8fix/mmutf8fix.c index 7a5d1d03..41d98653 100644 --- a/plugins/mmutf8fix/mmutf8fix.c +++ b/plugins/mmutf8fix/mmutf8fix.c @@ -143,8 +143,8 @@ CODESTARTnewActInst if(!pvals[i].bUsed) continue; if(!strcmp(actpblk.descr[i].name, "mode")) { - if(!es_strbufcmp(pvals[i].val.d.estr, (uchar*)"utf8", - sizeof("utf8")-1)) { + if(!es_strbufcmp(pvals[i].val.d.estr, (uchar*)"utf-8", + sizeof("utf-8")-1)) { pData->mode = MODE_UTF8; } else if(!es_strbufcmp(pvals[i].val.d.estr, (uchar*)"controlcharacters", sizeof("controlcharacters")-1)) { @@ -191,28 +191,48 @@ doCC(instanceData *pData, uchar *msg, int lenMsg) } } +/* fix an invalid multibyte sequence */ +static inline void +fixInvldMBSeq(instanceData *pData, uchar *msg, int lenMsg, int strtIdx, int *endIdx, int8_t seqLen) +{ + int i; + + *endIdx = strtIdx + seqLen; + if(*endIdx > lenMsg) + *endIdx = lenMsg; + for(i = strtIdx ; i < *endIdx ; ++i) + msg[i] = pData->replChar; +} + static inline void doUTF8(instanceData *pData, uchar *msg, int lenMsg) { uchar c; int8_t seqLen, bytesLeft = 0; + uint32_t codepoint; int strtIdx, endIdx; - int i, j; + int i; for(i = 0 ; i < lenMsg ; ++i) { c = msg[i]; if(bytesLeft) { if((c & 0xc0) != 0x80) { /* sequence invalid, invalidate all bytes */ - endIdx = strtIdx + seqLen; - if(endIdx > lenMsg) - endIdx = lenMsg; - for(j = strtIdx ; j < endIdx ; ++j) - msg[j] = pData->replChar; + fixInvldMBSeq(pData, msg, lenMsg, strtIdx, &endIdx, + seqLen); i = endIdx - 1; bytesLeft = 0; } else { + codepoint = (codepoint << 6) | (c & 0x3f); --bytesLeft; + if(bytesLeft == 0) { + /* too-large codepoint? */ + if(codepoint > 0x10FFFF) { + fixInvldMBSeq(pData, msg, lenMsg, + strtIdx, &endIdx, + seqLen); + } + } } } else { if((c & 0x80) == 0) { @@ -222,14 +242,17 @@ doUTF8(instanceData *pData, uchar *msg, int lenMsg) /* 2-byte sequence */ strtIdx = i; seqLen = bytesLeft = 1; + codepoint = c & 0x1f; } else if((c & 0xf0) == 0xe0) { /* 3-byte sequence */ strtIdx = i; seqLen = bytesLeft = 2; + codepoint = c & 0x0f; } else if((c & 0xf8) == 0xf0) { /* 4-byte sequence */ strtIdx = i; seqLen = bytesLeft = 3; + codepoint = c & 0x07; } else { /* invalid (5&6 byte forbidden by RFC3629) */ msg[i] = pData->replChar; } |