From 8faa55b1efed781a3af9e2da72262e531d1f9156 Mon Sep 17 00:00:00 2001
From: Kaz Kylheku <kaz@kylheku.com>
Date: Wed, 7 Apr 2021 07:09:15 -0700
Subject: utf8: fix backtracking bugs in buffer decoder.

* utf8.c (utf8_from_buf): Fix incorrect backtracking logic
for handling bad UTF-8 bytes. Firstly, we are not backtracking
to the correct byte. Because src is incremented at the top of
the loop, the backtrack pointer must be set to src - 1 to
point to the possibly bad byte. Secondly, when we backtrack,
we are neglecting to rewind nbytes! Thus after
backtracking, we will not scan the entire input. Let's avoid
using nbytes, and guard the loop based on whether we hit the
end of the buffer; then we don't have any nbytes state to
backtrack.

* tests/017/ffi-misc.tl: New test case converting a three-byte
UTF-8 encoding of U+DC01: an invalid character in the
surrogate range. We test that the buffer decoder turns this
into three characters, exactly like the stream decoder.
Another test case for invalid bytes following a valid
sequence start.
---
 tests/017/ffi-misc.tl | 7 +++++++
 utf8.c                | 8 +++++---
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/tests/017/ffi-misc.tl b/tests/017/ffi-misc.tl
index 1578cd2c..db510737 100644
--- a/tests/017/ffi-misc.tl
+++ b/tests/017/ffi-misc.tl
@@ -9,3 +9,10 @@
 (test (ffi-put "\x1234@@@" zar) #b'e188b440404000')
 
 (test (ffi-get (ffi-put "\x1234@@@" zar) zar) "\x1234@@@")
+
+(unless (meq (os-symbol) :cygwin :cygnal)
+  (test (ffi-get #b'EDB08100' (ffi (zarray char)))
+       "\xDCED\xDCB0\xDC81")
+
+  (test (ffi-get #b'ED7F7FEDFF00' (ffi (zarray char)))
+       "\xDCED\x7F\x7F\xDCED\xDCFF"))
diff --git a/utf8.c b/utf8.c
index 8cb81749..c23eefce 100644
--- a/utf8.c
+++ b/utf8.c
@@ -54,10 +54,10 @@ size_t utf8_from_buf(wchar_t *wdst, const unsigned char *src, size_t nbytes)
 {
   size_t nchar = 1;
   enum utf8_state state = utf8_init;
-  const unsigned char *backtrack = 0;
+  const unsigned char *backtrack = 0, *end = src + nbytes;
   wchar_t wch = 0, wch_min = 0;
 
-  while (nbytes-- > 0) {
+  while (src < end) {
     int ch = *src++;
 
     switch (state) {
@@ -101,7 +101,7 @@ size_t utf8_from_buf(wchar_t *wdst, const unsigned char *src, size_t nbytes)
         nchar++;
         break;
       }
-      backtrack = src;
+      backtrack = src - 1;
       break;
     case utf8_more1:
     case utf8_more2:
@@ -118,6 +118,7 @@ size_t utf8_from_buf(wchar_t *wdst, const unsigned char *src, size_t nbytes)
             src = backtrack;
             if (wdst)
               *wdst++ = 0xDC00 | *src;
+            src++;
           } else {
             if (wdst)
               *wdst++ = wch;
@@ -128,6 +129,7 @@ size_t utf8_from_buf(wchar_t *wdst, const unsigned char *src, size_t nbytes)
         src = backtrack;
         if (wdst)
           *wdst++ = 0xDC00 | *src;
+        src++;
         nchar++;
         state = utf8_init;
       }
-- 
cgit v1.2.3