From 905b074cea7303553777e169529efc8aeccdc35a Mon Sep 17 00:00:00 2001 From: Kaz Kylheku Date: Thu, 2 Feb 2012 21:36:24 -0800 Subject: * utf8.c (utf8_to_uc, utf8_encode): Do not encode surrogate code points (U+DC00 to U+DCFF) as multi-byte UTF-8 sequences. We use that range for invalid bytes on input, so on output the best thing to do is to reproduce the original bytes. E.g., the code U+DCA0 will produce the byte A0. --- ChangeLog | 8 ++++++++ utf8.c | 26 ++++++++++++++++++-------- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/ChangeLog b/ChangeLog index 608d20e8..5c7ecd02 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2012-02-02 Kaz Kylheku + + * utf8.c (utf8_to_uc, utf8_encode): Do not encode surrogate code + points (U+DC00 to U+DCFF) as multi-byte UTF8 sequences. We use + that range for invalid bytes on input, so on output the best thing + to do is to reproduce the original bytes. E.g the code U+DCA0 + will produce the byte A0. + 2012-02-02 Kaz Kylheku * txr.1: UTF-8 handling clarified. 
diff --git a/utf8.c b/utf8.c index 1ca8f7b5..fcc4dc98 100644 --- a/utf8.c +++ b/utf8.c @@ -152,11 +152,17 @@ size_t utf8_to_uc(unsigned char *dst, const wchar_t *wsrc) *dst++ = 0x80 | (wch & 0x3F); } } else if (wch < 0x10000) { - nbyte += 3; - if (dst) { - *dst++ = 0xE0 | (wch >> 12); - *dst++ = 0x80 | ((wch >> 6) & 0x3F); - *dst++ = 0x80 | (wch & 0x3F); + if ((wch & 0xFF00) == 0xDC00) { + nbyte += 1; + if (dst) + *dst++ = (wch & 0xff); + } else { + nbyte += 3; + if (dst) { + *dst++ = 0xE0 | (wch >> 12); + *dst++ = 0x80 | ((wch >> 6) & 0x3F); + *dst++ = 0x80 | (wch & 0x3F); + } } } else if (wch < 0x110000) { nbyte += 4; @@ -219,9 +225,13 @@ int utf8_encode(wchar_t wch, int (*put)(int ch, mem_t *ctx), mem_t *ctx) return put(0xC0 | (wch >> 6), ctx) && put(0x80 | (wch & 0x3F), ctx); } else if (wch < 0x10000) { - return put(0xE0 | (wch >> 12), ctx) && - put(0x80 | ((wch >> 6) & 0x3F), ctx) && - put(0x80 | (wch & 0x3F), ctx); + if ((wch & 0xFF00) == 0xDC00) { + return put(wch & 0xFF, ctx); + } else { + return put(0xE0 | (wch >> 12), ctx) && + put(0x80 | ((wch >> 6) & 0x3F), ctx) && + put(0x80 | (wch & 0x3F), ctx); + } } else if (wch < 0x110000) { return put(0xF0 | (wch >> 18), ctx) && put(0x80 | ((wch >> 12) & 0x3F), ctx) && -- cgit v1.2.3