about | summary | refs | log | tree | commit | diff | stats
path: root/node.c
diff options
context:
space:
mode:
author: Arnold D. Robbins <arnold@skeeve.com> 2015-08-25 19:51:54 +0300
committer: Arnold D. Robbins <arnold@skeeve.com> 2015-08-25 19:51:54 +0300
commit: 278fe876bb18938803ac1c36b028adb8cef6fe84 (patch)
tree: 289302aa264df2025b48f07fbffca7b70bd01f83 /node.c
parent: 96cc85ac9ba06ab6b9edface5e4c34392a07a98d (diff)
download: egawk-278fe876bb18938803ac1c36b028adb8cef6fe84.tar.gz
egawk-278fe876bb18938803ac1c36b028adb8cef6fe84.tar.bz2
egawk-278fe876bb18938803ac1c36b028adb8cef6fe84.zip
Improve handling of invalid data in UTF locales.
Diffstat (limited to 'node.c')
-rw-r--r-- node.c 38
1 files changed, 27 insertions, 11 deletions
diff --git a/node.c b/node.c
index 1741a13b..de771147 100644
--- a/node.c
+++ b/node.c
@@ -717,22 +717,37 @@ str2wstr(NODE *n, size_t **ptr)
case (size_t) -2:
case (size_t) -1:
/*
- * Just skip the bad byte and keep going, so that
- * we get a more-or-less full string, instead of
- * stopping early. This is particularly important
- * for match() where we need to build the indices.
- */
- sp++;
- src_count--;
- /*
* mbrtowc(3) says the state of mbs becomes undefined
* after a bad character, so reset it.
*/
memset(& mbs, 0, sizeof(mbs));
- /* And warn the user something's wrong */
- if (do_lint && ! warned) {
+
+ /* Warn the user something's wrong */
+ if (! warned) {
warned = true;
- lintwarn(_("Invalid multibyte data detected. There may be a mismatch between your data and your locale."));
+ warning(_("Invalid multibyte data detected. There may be a mismatch between your data and your locale."));
+ }
+
+ /*
+ * 8/2015: If we're using UTF, then instead of just
+ * skipping the character, plug in the Unicode
+ * replacement character. In most cases this gives
+ * us "better" results, in that character counts
+ * and string lengths tend to make more sense.
+ *
+ * Otherwise, just skip the bad byte and keep going,
+ * so that we get a more-or-less full string, instead of
+ * stopping early. This is particularly important
+ * for match() where we need to build the indices.
+ */
+ if (using_utf8()) {
+ count = 1;
+ wc = 0xFFFD; /* unicode replacement character */
+ goto set_wc;
+ } else {
+ /* skip it and keep going */
+ sp++;
+ src_count--;
}
break;
@@ -740,6 +755,7 @@ str2wstr(NODE *n, size_t **ptr)
count = 1;
/* fall through */
default:
+ set_wc:
*wsp++ = wc;
src_count -= count;
while (count--) {