From 4fc01f0f2522de6b44282956ec909e6874b86b3a Mon Sep 17 00:00:00 2001
From: Kaz Kylheku <kaz@kylheku.com>
Date: Tue, 5 Feb 2019 08:16:34 -0800
Subject: parser: security: UTF-8 and NUL handling in literals.

A null byte in regex and string literals is being processed as
a #\nul instead of correctly turning into #\pnul. Bad UTF-8 is
not being rejected.

* parser.l (REGCHAR, LITCHAR): Use utf8_from_buf to
properly convert yytext using its true length, rather than
utf8_from which assumes a null-terminated string. Thus
null bytes (including the case of a yytext being single NUL)
are handled properly. Check that the result is exactly one
character (null-terminated buffer, two characters wide).

* utf8.c (utf8_from): Unused function removed.

* utf8.h (utf8_from): Declaration removed.
---
 parser.l | 20 ++++++++++++++------
 utf8.c   |  6 ------
 utf8.h   |  1 -
 3 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/parser.l b/parser.l
index da2f8116..7a9d8d8b 100644
--- a/parser.l
+++ b/parser.l
@@ -903,9 +903,13 @@ UONLY   {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
 }
 
 <REGEX,SREGEX>{UANYN}  {
-  wchar_t buf[8];
-  utf8_from(buf, yytext);
-  yylval->chr = buf[0];
+  wchar_t wchr[8];
+  if (utf8_from_buf(wchr, coerce(unsigned char *, yytext), yyleng) != 2) {
+    yyerrprepf(yyg, lit("non-UTF-8 byte in regex: '\\x~02x'"),
+               num(convert(unsigned char, yytext[0])), nao);
+    return ERRTOK;
+  }
+  yylval->chr = wchr[0];
   return REGCHAR;
 }
 
@@ -1057,9 +1061,13 @@ UONLY   {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
 }
 
 <STRLIT,CHRLIT,QSILIT,WLIT,QWLIT>{UANYN} {
-  wchar_t buf[8];
-  utf8_from(buf, yytext);
-  yylval->chr = buf[0];
+  wchar_t wchr[8];
+  if (utf8_from_buf(wchr, coerce(unsigned char *, yytext), yyleng) != 2) {
+    yyerrprepf(yyg, lit("non-UTF-8 byte in literal: '\\x~02x'"),
+               num(convert(unsigned char, yytext[0])), nao);
+    return ERRTOK;
+  }
+  yylval->chr = wchr[0];
   return LITCHAR;
 }
 
diff --git a/utf8.c b/utf8.c
index 620c6bdb..c5f9f3dc 100644
--- a/utf8.c
+++ b/utf8.c
@@ -139,12 +139,6 @@ size_t utf8_from_buf(wchar_t *wdst, const unsigned char *src, size_t nbytes)
   return nchar;
 }
 
-size_t utf8_from(wchar_t *wdst, const char *src)
-{
-  size_t nbytes = strlen(src);
-  return utf8_from_buf(wdst, coerce(const unsigned char *, src), nbytes);
-}
-
 size_t utf8_to_buf(unsigned char *dst, const wchar_t *wsrc, int null_term)
 {
   size_t nbyte = 0;
diff --git a/utf8.h b/utf8.h
index a60fe944..a007ac68 100644
--- a/utf8.h
+++ b/utf8.h
@@ -26,7 +26,6 @@
  */
 
 size_t utf8_from_buf(wchar_t *, const unsigned char *, size_t nbytes);
-size_t utf8_from(wchar_t *, const char *);
 size_t utf8_to_buf(unsigned char *dst, const wchar_t *wsrc, int null_term);
 size_t utf8_to(char *, const wchar_t *);
 wchar_t *utf8_dup_from(const char *);
-- 
cgit v1.2.3