From 3b64319b10196425401d4d71f7ee1273e3bffe32 Mon Sep 17 00:00:00 2001 From: Kaz Kylheku Date: Sat, 15 Feb 2014 00:19:15 -0800 Subject: A trivial change in the UTF-8 decoder allows TXR to handle null bytes in text. * utf8.h (UTF8_ADMIT_NUL): New preprocessor symbol. (utf8_decoder): New member, flags. * utf8.c (utf8_decoder_init): Initialize flags to 0. (utf8_decode): If a null byte is encountered in the input, then convert it to 0xDC00, rather than keeping it as zero, unless flags contains UTF8_ADMIT_NUL. * txr.1: Document handling of null bytes. --- ChangeLog | 15 +++++++++++++++ txr.1 | 9 ++++++++- utf8.c | 3 +++ utf8.h | 3 +++ 4 files changed, 29 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 00fff529..92e3d13e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,18 @@ +2014-02-15 Kaz Kylheku + + A trivial change in the UTF-8 decoder allows TXR to handle null bytes + in text. + + * utf8.h (UTF8_ADMIT_NUL): New preprocessor symbol. + (utf8_decoder): New member, flags. + + * utf8.c (utf8_decoder_init): Initialize flags to 0. + (utf8_decode): If a null byte is encountered in the input, + then convert it to 0xDC00, rather than keeping it as zero, + unless flags contains UTF8_ADMIT_NUL. + + * txr.1: Document handling of null bytes. + 2014-02-14 Kaz Kylheku * hash.c (hash_update): Avoid double cdr. diff --git a/txr.1 b/txr.1 index dc692dd2..d69b8645 100644 --- a/txr.1 +++ b/txr.1 @@ -478,7 +478,7 @@ does not split the line into two; it's embedded into the line and thus cannot match anything. However, @\en may be useful in the @(cat) directive and in @(output). -.SS International Characters +.SS Character Handling and International Characters .B TXR represents text internally using wide characters, which are used to represent @@ -519,6 +519,13 @@ mapping it to the Unicode character range U+DC00 through U+DCFF. The decoding resumes afresh at the following byte, expecting that byte to be the start of a UTF-8 code. +Furthermore, because TXR internally uses a null-terminated character +representation of strings which easily interoperates with C language +interfaces, when a null character is read from a stream, TXR converts it to +the code U+DC00. On output, this code converts back to a null byte, +as explained in the previous paragraph. By means of this representational +trick, TXR can handle textual data containing null bytes. + .SS Regular Expression Directives In place of a piece of text (see section Text above), a regular expression diff --git a/utf8.c b/utf8.c index 26e5795d..e3ef3e7a 100644 --- a/utf8.c +++ b/utf8.c @@ -260,6 +260,7 @@ int utf8_encode(wchar_t wch, int (*put)(int ch, mem_t *ctx), mem_t *ctx) void utf8_decoder_init(utf8_decoder_t *ud) { ud->state = utf8_init; + ud->flags = 0; ud->wch = 0; ud->head = ud->tail = ud->back = 0; } @@ -295,6 +296,8 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx) case 0x0: case 0x1: case 0x2: case 0x3: case 0x4: case 0x5: case 0x6: case 0x7: ud->back = ud->tail; + if (ch == 0 && (ud->flags & UTF8_ADMIT_NUL) == 0) + return 0xDC00; return ch; case 0xC: case 0xD: ud->state = utf8_more1; diff --git a/utf8.h b/utf8.h index c4915488..67dee69a 100644 --- a/utf8.h +++ b/utf8.h @@ -35,8 +35,11 @@ unsigned char *utf8_dup_to_uc(const wchar_t *); enum utf8_state { utf8_init, utf8_more1, utf8_more2, utf8_more3 }; +#define UTF8_ADMIT_NUL 1 + typedef struct utf8_decoder { enum utf8_state state; + int flags; wchar_t wch, wch_min; int head, tail, back; int buf[8]; -- cgit v1.2.3