From 4b65f190450f70bd5819bb5c18e3370d75ffebde Mon Sep 17 00:00:00 2001 From: Corinna Vinschen Date: Wed, 6 Feb 2008 18:24:50 +0000 Subject: * fhandler.h (fhandler_console::trunc_buf): Add to use as cache for truncated multibyte characters on input. (fhandler_console::write_replacement_char): Declare new method. * fhandler_console.cc (CONVERT_LIMIT): Raise to 64K. (fhandler_console::fhandler_console): Initialize trunc_buf. (ERR): Define as independent value again. (fhandler_console::write_replacement_char): New method to print replacement chars. (fhandler_console::write_normal): Add handling for truncated multibyte sequences. Call next_char instead of pathetic CharNextExA function. Don't change src, rather just work with found later on. * miscfuncs.cc (is_cp_multibyte): Move here from strfuncs.cc. Don't call Windows function, restrict to well-known ANSI/OEM codepages and UTF-8. (next_char): Call CharNextExA only for doublebyte codepages. Implement for UTF-8 here. * strfuncs.cc (is_cp_multibyte): Move to miscfuncs.cc. * winsup.h (next_char): Declare. * include/limits.h (MB_LEN_MAX): Set to maximum value of MB_CUR_MAX as defined by newlib for now. --- winsup/cygwin/miscfuncs.cc | 115 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 114 insertions(+), 1 deletion(-) (limited to 'winsup/cygwin/miscfuncs.cc') diff --git a/winsup/cygwin/miscfuncs.cc b/winsup/cygwin/miscfuncs.cc index 0ec0b4873..4edfbab94 100644 --- a/winsup/cygwin/miscfuncs.cc +++ b/winsup/cygwin/miscfuncs.cc @@ -17,7 +17,8 @@ details. */ #include #include #include -#include +#include +#include #include #include "cygthread.h" #include "cygtls.h" @@ -192,6 +193,118 @@ cygwin_strupr (char *string) return string; } +/* FIXME? We only support standard ANSI/OEM codepages according to + http://www.microsoft.com/globaldev/reference/cphome.mspx as well + as UTF-8 and codepage 1361, which is also mentioned as valid + doublebyte codepage in MSDN man pages (e.g. IsDBCSLeadByteEx). + Everything else will be hosed. */ + +bool +is_cp_multibyte (UINT cp) +{ + switch (cp) + { + case 932: + case 936: + case 949: + case 950: + case 1361: + case 65001: + return true; + } + return false; +} + +/* OMYGOD! CharNextExA is not UTF-8 aware! It only works fine with + double byte charsets. So we have to do it ourselves for UTF-8. + + While being at it, we do more. If a double-byte or multibyte + sequence is trucated due to an early end, we need a way to recognize + it. The reason is that multiple buffered write statements might + accidentally stop and start in the middle of a single character byte + sequence. If we have to interpret the byte sequences (as in + fhandler_console, we would print wrong output in these cases. + + So we have four possible return values here: + + ret = end if str >= end + ret = NULL if we encounter an invalid byte sequence + ret = str if we encounter the start byte of a truncated byte sequence + ret = str + n if we encounter a vaild byte sequence +*/ + +const unsigned char * +next_char (UINT cp, const unsigned char *str, const unsigned char *end) +{ + const unsigned char *ret; + + if (str >= end) + return end; + + switch (cp) + { + case 932: + case 936: + case 949: + case 950: + case 1361: + if (*str <= 0x7f) + ret = str + 1; + else if (str == end - 1 && IsDBCSLeadByteEx (cp, *str)) + ret = str; + else + ret = (const unsigned char *) CharNextExA (cp, (const CHAR *) str, 0); + break; + case CP_UTF8: + switch (str[0] >> 4) + { + case 0x0 ... 0x7: /* One byte character. */ + ret = str + 1; + break; + case 0x8 ... 0xb: /* Followup byte. Invalid as first byte. */ + ret = NULL; + break; + case 0xc ... 0xd: /* Two byte character. */ + /* Check followup bytes for validity. */ + if (str >= end - 1) + ret = str; + else if (str[1] <= 0xbf) + ret = str + 2; + else + ret = NULL; + break; + case 0xe: /* Three byte character. */ + if (str >= end - 2) + ret = str; + else if ((str[1] & 0xc0) == 0x80 && (str[2] & 0xc0) == 0x80 + && (str[0] != 0xe0 || str[1] >= 0xa0) + && (str[0] != 0xed || str[1] <= 0x9f)) + ret = str + 3; + else + ret = NULL; + break; + case 0xf: /* Four byte character. */ + if (str[0] >= 0xf8) + ret = NULL; + else if (str >= end - 3) + ret = str; + else if ((str[1] & 0xc0) == 0x80 && (str[2] & 0xc0) == 0x80 + && (str[3] & 0xc0) == 0x80 + && (str[0] == 0xf0 || str[1] >= 0x90) + && (str[0] == 0xf4 || str[1] <= 0x8f)) + ret = str + 4; + else + ret = NULL; + break; + } + break; + default: + ret = str + 1; + break; + } + return ret; +} + int __stdcall check_invalid_virtual_addr (const void *s, unsigned sz) { -- cgit v1.2.3