From 2db8b0497c7cc13b44210fb06b74d45fefccefc3 Mon Sep 17 00:00:00 2001
From: Kaz Kylheku <kaz@kylheku.com>
Date: Tue, 20 Apr 2021 07:45:30 -0700
Subject: utf8: decode: reduce strictness of full unicode check.

* utf8.c (utf8_from_buf, utf8_decode): On 16-bit wchar_t, we
don't have to throw on every value in the range 0xF0-0xFF.
Only the values 0xF0 through 0xF4 are potential UTF-8 bytes;
so we only need to error out on those. 0xF5 through 0xFF
are invalid bytes, which we can map into the 0xDCNN range.
---
 utf8.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/utf8.c b/utf8.c
index c23eefce..0d484f4f 100644
--- a/utf8.c
+++ b/utf8.c
@@ -84,16 +84,16 @@ size_t utf8_from_buf(wchar_t *wdst, const unsigned char *src, size_t nbytes)
         wch_min = 0x800;
         break;
       case 0xF:
-#ifdef FULL_UNICODE
         if (ch < 0xF5) {
+#ifdef FULL_UNICODE
           state = utf8_more3;
           wch = (ch & 0x7);
           wch_min = 0x10000;
           break;
-        }
 #else
         conversion_error();
 #endif
+        }
         /* fallthrough */
       default:
         if (wdst)
@@ -317,16 +317,16 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx)
         ud->wch_min = 0x800;
         break;
       case 0xF:
-#ifdef FULL_UNICODE
         if (ch < 0xF5) {
+#ifdef FULL_UNICODE
           ud->state = utf8_more3;
           ud->wch = (ch & 0x7);
           ud->wch_min = 0x10000;
           break;
-        }
 #else
         conversion_error();
 #endif
+        }
         /* fallthrough */
       default:
         ud->back = ud->tail;
-- 
cgit v1.2.3