From 4edb6eac41259a43a547772c4249ba2d6cc68106 Mon Sep 17 00:00:00 2001 From: Kaz Kylheku Date: Thu, 3 Jun 2021 06:29:10 -0700 Subject: json: improve escaping for script tags. * lib.c (out_json_str): Strengthen the test for escaping the forward slash. It has to occur in the sequence in the string, and encode them. * tests/010/json.tl: Cover this area with some tests. * txr.1: Documented. --- lib.c | 13 ++++++++++++- tests/010/json.tl | 14 ++++++++++++-- txr.1 | 45 ++++++++++++++++++++++++++++++++++----------- 3 files changed, 58 insertions(+), 14 deletions(-) diff --git a/lib.c b/lib.c index 38837e43..d3ae1425 100644 --- a/lib.c +++ b/lib.c @@ -12600,8 +12600,19 @@ static void out_json_str(val str, val out) break; case '<': put_char(chr(ch), out); - if (*cstr == '/') + if (wcsncmp(cstr, L"/script", 7) == 0) { put_char(chr('\\'), out); + } else if (wcsncmp(cstr, L"!--", 3) == 0) { + put_string(lit("\\u0021"), out); + cstr++; + } + break; + case '-': + put_char(chr(ch), out); + if (wcsncmp(cstr, L"->", 2) == 0) { + put_string(lit("\\u002D"), out); + cstr++; + } break; case 0xDC00: put_string(lit("\\u0000"), out); diff --git a/tests/010/json.tl b/tests/010/json.tl index 843b7b58..29ee2833 100644 --- a/tests/010/json.tl +++ b/tests/010/json.tl @@ -94,8 +94,18 @@ (get-json "false") nil (get-json "null") null) -(test - (tojson #(1.0 "abc" t)) "[1,\"abc\",true]") +(mtest + (tojson #(1.0 "abc" t)) "[1,\"abc\",true]" + (tojson "") "\"-\\u002D>\"" + (tojson "a-->b") "\"a-\\u002D>b\"" + (tojson "->") "\"->\"" + (tojson " +occurs in a string, then in the JSON representation, the sequence is +rendered as +.codn -\eu002D> . +Instances of +.code - +(hyphen) in other situations are not encoded. Rationale: safe +embedding in HTML +.code script +tags. +.IP 6. The code point U+DC00 (\*(TX's pseudo-null character) is translated into the .code "\eu0000" escape syntax. -.IP 5. +.IP 7. The code points U+DC01 through U+DCFF are send to the stream as-is. 
If the stream performs UTF-8 encoding, these characters turn into individual bytes in the range 0 to 255. -.IP 6. +.IP 8. Control characters in the range U+0001 to U+001F other than the ones subject to rule 1 above are rendered as .code \eu @@ -72229,7 +72252,7 @@ the range U+D800 to U+DBFF, U+DD00 to U+DFFF, and the code points U+FFFE and U+FFFF are also encoded as .code \eu escape sequences. -.IP 7. +.IP 9. A character outside of the BMP (Basic Multilingual Plane) in the range U+10000 to U+10FFFF is encoded as a pair of consecutive .code \eu -- cgit v1.2.3