Improve handling of invalid data in UTF-8 locales.

author: Arnold D. Robbins <arnold@skeeve.com> 2015-08-25 19:51:54 +0300
committer: Arnold D. Robbins <arnold@skeeve.com> 2015-08-25 19:51:54 +0300
commit: 278fe876bb18938803ac1c36b028adb8cef6fe84 (patch)
tree: 289302aa264df2025b48f07fbffca7b70bd01f83 /node.c
parent: 96cc85ac9ba06ab6b9edface5e4c34392a07a98d (diff)
download: egawk-278fe876bb18938803ac1c36b028adb8cef6fe84.tar.gz
egawk-278fe876bb18938803ac1c36b028adb8cef6fe84.tar.bz2
egawk-278fe876bb18938803ac1c36b028adb8cef6fe84.zip
1 file changed, 27 insertions, 11 deletions
diff --git a/node.c b/node.c
index 1741a13b..de771147 100644
--- a/node.c
+++ b/node.c
@@ -717,22 +717,37 @@ str2wstr(NODE *n, size_t **ptr)
 		case (size_t) -2:
 		case (size_t) -1:
 			/*
-			 * Just skip the bad byte and keep going, so that
-			 * we get a more-or-less full string, instead of
-			 * stopping early. This is particularly important
-			 * for match() where we need to build the indices.
-			 */
-			sp++;
-			src_count--;
-			/*
 			 * mbrtowc(3) says the state of mbs becomes undefined
 			 * after a bad character, so reset it.
 			 */
 			memset(& mbs, 0, sizeof(mbs));
-			/* And warn the user something's wrong */
-			if (do_lint && ! warned) {
+
+			/* Warn the user something's wrong */
+			if (! warned) {
 				warned = true;
-				lintwarn(_("Invalid multibyte data detected. There may be a mismatch between your data and your locale."));
+				warning(_("Invalid multibyte data detected. There may be a mismatch between your data and your locale."));
+			}
+
+			/*
+			 * 8/2015: If we're using UTF, then instead of just
+			 * skipping the character, plug in the Unicode
+			 * replacement character. In most cases this gives
+			 * us "better" results, in that character counts
+			 * and string lengths tend to make more sense.
+			 *
+			 * Otherwise, just skip the bad byte and keep going,
+			 * so that we get a more-or-less full string, instead of
+			 * stopping early. This is particularly important
+			 * for match() where we need to build the indices.
+			 */
+			if (using_utf8()) {
+				count = 1;
+				wc = 0xFFFD;	/* unicode replacement character */
+				goto set_wc;
+			} else {
+				/* skip it and keep going */
+				sp++;
+				src_count--;
 			}
 			break;
 
@@ -740,6 +755,7 @@ str2wstr(NODE *n, size_t **ptr)
 			count = 1;
 			/* fall through */
 		default:
+		set_wc:
 			*wsp++ = wc;
 			src_count -= count;
 			while (count--)  {
author	Arnold D. Robbins <arnold@skeeve.com>	2015-08-25 19:51:54 +0300
committer	Arnold D. Robbins <arnold@skeeve.com>	2015-08-25 19:51:54 +0300
commit	278fe876bb18938803ac1c36b028adb8cef6fe84 (patch)
tree	289302aa264df2025b48f07fbffca7b70bd01f83 /node.c
parent	96cc85ac9ba06ab6b9edface5e4c34392a07a98d (diff)
download	egawk-278fe876bb18938803ac1c36b028adb8cef6fe84.tar.gz egawk-278fe876bb18938803ac1c36b028adb8cef6fe84.tar.bz2 egawk-278fe876bb18938803ac1c36b028adb8cef6fe84.zip