From c752742366d6ab0e69067243500a76ee7e9f16ae Mon Sep 17 00:00:00 2001 From: Kaz Kylheku Date: Mon, 17 Dec 2018 09:47:01 -0800 Subject: UTF-8: fix incorrect decoding of four-byte sequences. utf8.c (utf8_decode): The wch_min value is set incorrectly for the four-byte case due to an extra zero; it should be only 0x10000. Code points encoded as four UTF-8 bytes start at this value. The consequence of this error is that UTF-8-encoded characters in this range are treated as invalid bytes after being decoded due to failing the range test. --- utf8.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utf8.c b/utf8.c index 3ddc74a5..eaef3864 100644 --- a/utf8.c +++ b/utf8.c @@ -324,7 +324,7 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx) if (ch < 0xF5) { ud->state = utf8_more3; ud->wch = (ch & 0x7); - ud->wch_min = 0x100000; + ud->wch_min = 0x10000; break; } /* fallthrough */ -- cgit v1.2.3