diff options
author | Kaz Kylheku <kaz@kylheku.com> | 2021-04-20 07:45:30 -0700 |
---|---|---|
committer | Kaz Kylheku <kaz@kylheku.com> | 2021-04-20 07:45:30 -0700 |
commit | 222adfcb6c232f4b91260e9253ae70af74274371 (patch) | |
tree | 224f5617e347fa39b371eb7716567187b34b8480 | |
parent | 48808c951895bcc48ddb0fc3f406993d69620470 (diff) | |
download | txr-222adfcb6c232f4b91260e9253ae70af74274371.tar.gz txr-222adfcb6c232f4b91260e9253ae70af74274371.tar.bz2 txr-222adfcb6c232f4b91260e9253ae70af74274371.zip |
utf8: decode: reduce strictness of full unicode check.
* utf8.c (utf8_from_buf, utf8_decode): On 16 bit wchar_t, we
don't have to throw on every value in the range 0xF0-0xFF.
Only the values 0xF0 through 0xF4 are potential UTF-8 bytes;
so we only need to error out on those. 0xF5 through 0xFF
are invalid bytes, which we can map into the 0xDCNN range.
-rw-r--r-- | utf8.c | 8 |
1 files changed, 4 insertions, 4 deletions
@@ -84,16 +84,16 @@ size_t utf8_from_buf(wchar_t *wdst, const unsigned char *src, size_t nbytes) wch_min = 0x800; break; case 0xF: -#ifdef FULL_UNICODE if (ch < 0xF5) { +#ifdef FULL_UNICODE state = utf8_more3; wch = (ch & 0x7); wch_min = 0x10000; break; - } #else conversion_error(); #endif + } /* fallthrough */ default: if (wdst) @@ -317,16 +317,16 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx) ud->wch_min = 0x800; break; case 0xF: -#ifdef FULL_UNICODE if (ch < 0xF5) { +#ifdef FULL_UNICODE ud->state = utf8_more3; ud->wch = (ch & 0x7); ud->wch_min = 0x10000; break; - } #else conversion_error(); #endif + } /* fallthrough */ default: ud->back = ud->tail; |