From 4fc01f0f2522de6b44282956ec909e6874b86b3a Mon Sep 17 00:00:00 2001 From: Kaz Kylheku Date: Tue, 5 Feb 2019 08:16:34 -0800 Subject: parser: security: UTF-8 and NUL handling in literals. A null byte in regex and string literals is being processed as a #\nul instead of correctly turning into #\pnul. Bad UTF-8 is not being rejected. * parser.l (REGCHAR, LITCHAR): Use utf8_from_buf to properly convert yytext using its true length, rather than utf8_from which assumes a null-terminated string. Thus null bytes (including the case of a yytext being a single NUL) are handled properly. Check that the result is exactly one character (null-terminated buffer, two characters wide). * utf8.c (utf8_from): Unused function removed. * utf8.h (utf8_from): Declaration removed. --- parser.l | 20 ++++++++++++++------ utf8.c | 6 ------ utf8.h | 1 - 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/parser.l b/parser.l index da2f8116..7a9d8d8b 100644 --- a/parser.l +++ b/parser.l @@ -903,9 +903,13 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} } {UANYN} { - wchar_t buf[8]; - utf8_from(buf, yytext); - yylval->chr = buf[0]; + wchar_t wchr[8]; + if (utf8_from_buf(wchr, coerce(unsigned char *, yytext), yyleng) != 2) { + yyerrprepf(yyg, lit("non-UTF-8 byte in regex: '\\x~02x'"), + num(convert(unsigned char, yytext[0])), nao); + return ERRTOK; + } + yylval->chr = wchr[0]; return REGCHAR; } @@ -1057,9 +1061,13 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} } {UANYN} { - wchar_t buf[8]; - utf8_from(buf, yytext); - yylval->chr = buf[0]; + wchar_t wchr[8]; + if (utf8_from_buf(wchr, coerce(unsigned char *, yytext), yyleng) != 2) { + yyerrprepf(yyg, lit("non-UTF-8 byte in literal: '\\x~02x'"), + num(convert(unsigned char, yytext[0])), nao); + return ERRTOK; + } + yylval->chr = wchr[0]; return LITCHAR; } diff --git a/utf8.c b/utf8.c index 620c6bdb..c5f9f3dc 100644 --- a/utf8.c +++ b/utf8.c @@ -139,12 +139,6 @@ size_t utf8_from_buf(wchar_t *wdst, const unsigned char *src, size_t nbytes) return
nchar; } -size_t utf8_from(wchar_t *wdst, const char *src) -{ - size_t nbytes = strlen(src); - return utf8_from_buf(wdst, coerce(const unsigned char *, src), nbytes); -} - size_t utf8_to_buf(unsigned char *dst, const wchar_t *wsrc, int null_term) { size_t nbyte = 0; diff --git a/utf8.h b/utf8.h index a60fe944..a007ac68 100644 --- a/utf8.h +++ b/utf8.h @@ -26,7 +26,6 @@ */ size_t utf8_from_buf(wchar_t *, const unsigned char *, size_t nbytes); -size_t utf8_from(wchar_t *, const char *); size_t utf8_to_buf(unsigned char *dst, const wchar_t *wsrc, int null_term); size_t utf8_to(char *, const wchar_t *); wchar_t *utf8_dup_from(const char *); -- cgit v1.2.3