diff options
author | Arnold D. Robbins <arnold@skeeve.com> | 2015-08-25 19:51:54 +0300 |
---|---|---|
committer | Arnold D. Robbins <arnold@skeeve.com> | 2015-08-25 19:51:54 +0300 |
commit | 278fe876bb18938803ac1c36b028adb8cef6fe84 (patch) | |
tree | 289302aa264df2025b48f07fbffca7b70bd01f83 /node.c | |
parent | 96cc85ac9ba06ab6b9edface5e4c34392a07a98d (diff) | |
download | egawk-278fe876bb18938803ac1c36b028adb8cef6fe84.tar.gz egawk-278fe876bb18938803ac1c36b028adb8cef6fe84.tar.bz2 egawk-278fe876bb18938803ac1c36b028adb8cef6fe84.zip |
Improve handling of invalid data in UTF locales.
Diffstat (limited to 'node.c')
-rw-r--r-- | node.c | 38 |
1 file changed, 27 insertions, 11 deletions
@@ -717,22 +717,37 @@ str2wstr(NODE *n, size_t **ptr)
         case (size_t) -2:
         case (size_t) -1:
             /*
-             * Just skip the bad byte and keep going, so that
-             * we get a more-or-less full string, instead of
-             * stopping early. This is particularly important
-             * for match() where we need to build the indices.
-             */
-            sp++;
-            src_count--;
-            /*
              * mbrtowc(3) says the state of mbs becomes undefined
              * after a bad character, so reset it.
              */
             memset(& mbs, 0, sizeof(mbs));
-            /* And warn the user something's wrong */
-            if (do_lint && ! warned) {
+
+            /* Warn the user something's wrong */
+            if (! warned) {
                 warned = true;
-                lintwarn(_("Invalid multibyte data detected. There may be a mismatch between your data and your locale."));
+                warning(_("Invalid multibyte data detected. There may be a mismatch between your data and your locale."));
+            }
+
+            /*
+             * 8/2015: If we're using UTF, then instead of just
+             * skipping the character, plug in the Unicode
+             * replacement character. In most cases this gives
+             * us "better" results, in that character counts
+             * and string lengths tend to make more sense.
+             *
+             * Otherwise, just skip the bad byte and keep going,
+             * so that we get a more-or-less full string, instead of
+             * stopping early. This is particularly important
+             * for match() where we need to build the indices.
+             */
+            if (using_utf8()) {
+                count = 1;
+                wc = 0xFFFD; /* unicode replacement character */
+                goto set_wc;
+            } else {
+                /* skip it and keep going */
+                sp++;
+                src_count--;
             }
             break;
 
@@ -740,6 +755,7 @@ str2wstr(NODE *n, size_t **ptr)
             count = 1;
             /* fall through */
         default:
+        set_wc:
             *wsp++ = wc;
             src_count -= count;
             while (count--) {