From 2db8b0497c7cc13b44210fb06b74d45fefccefc3 Mon Sep 17 00:00:00 2001 From: Kaz Kylheku Date: Tue, 20 Apr 2021 07:45:30 -0700 Subject: utf8: decode: reduce strictness of full unicode check. * utf8.c (utf8_from_buf, utf8_decode): On 16-bit wchar_t, we don't have to throw on every value in the range 0xF0-0xFF. Only the values 0xF0 through 0xF4 are potential UTF-8 bytes; so we only need to error out on those. 0xF5 through 0xFF are invalid bytes, which we can map into the 0xDCNN range. --- utf8.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/utf8.c b/utf8.c index c23eefce..0d484f4f 100644 --- a/utf8.c +++ b/utf8.c @@ -84,16 +84,16 @@ size_t utf8_from_buf(wchar_t *wdst, const unsigned char *src, size_t nbytes) wch_min = 0x800; break; case 0xF: -#ifdef FULL_UNICODE if (ch < 0xF5) { +#ifdef FULL_UNICODE state = utf8_more3; wch = (ch & 0x7); wch_min = 0x10000; break; - } #else conversion_error(); #endif + } /* fallthrough */ default: if (wdst) @@ -317,16 +317,16 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx) ud->wch_min = 0x800; break; case 0xF: -#ifdef FULL_UNICODE if (ch < 0xF5) { +#ifdef FULL_UNICODE ud->state = utf8_more3; ud->wch = (ch & 0x7); ud->wch_min = 0x10000; break; - } #else conversion_error(); #endif + } /* fallthrough */ default: ud->back = ud->tail; -- cgit v1.2.3