5 files changed, 169 insertions, 98 deletions
diff --git a/ChangeLog b/ChangeLog
index 57d55fb6..c86de65e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2014-03-30         Arnold D. Robbins     <arnold@skeeve.com>
+
+	* dfa.c: Sync with GNU grep.
+
 2014-03-28         Arnold D. Robbins     <arnold@skeeve.com>
 
 	* configure.ac: Remove duplicate AC_HEADER_TIME and rearrange
diff --git a/dfa.c b/dfa.c
index 0fc68cff..378305df 100644
--- a/dfa.c
+++ b/dfa.c
@@ -43,7 +43,11 @@
 #include "missing_d/gawkbool.h"
 #endif /* HAVE_STDBOOL_H */
 
-/* Gawk doesn't use Gnulib, so don't assume static_assert is present.  */
+/* Gawk doesn't use Gnulib, so don't assume that setlocale and
+   static_assert are present.  */
+#ifndef LC_ALL
+# define setlocale(category, locale) NULL
+#endif
 #ifndef static_assert
 # define static_assert(cond, diagnostic) \
     extern int (*foo (void)) [!!sizeof (struct { int foo: (cond) ? 8 : -1; })]
@@ -408,6 +412,14 @@ struct dfa
   size_t nmultibyte_prop;
   int *multibyte_prop;
 
+#if MBS_SUPPORT
+  /* A table indexed by byte values that contains the corresponding wide
+     character (if any) for that byte.  WEOF means the byte is the
+     leading byte of a multibyte character.  Invalid and null bytes are
+     mapped to themselves.  */
+  wint_t mbrtowc_cache[NOTCHAR];
+#endif
+
   /* Array of the bracket expression in the DFA.  */
   struct mb_char_classes *mbcsets;
   size_t nmbcsets;
@@ -510,6 +522,64 @@ static void regexp (void);
     }								\
   while (false)
 
+static void             /* Fill D->mbrtowc_cache, one entry per byte value.  */
+dfambcache (struct dfa *d)
+{
+#if MBS_SUPPORT
+  int i;
+  for (i = CHAR_MIN; i <= CHAR_MAX; ++i)        /* Every value of a char.  */
+    {
+      char c = i;
+      unsigned char uc = i;       /* The same byte, used as table index.  */
+      mbstate_t s = { 0 };        /* Fresh conversion state for each byte.  */
+      wchar_t wc;
+      wint_t wi;
+      switch (mbrtowc (&wc, &c, 1, &s))
+        {
+        default: wi = wc; break;                /* Converted: cache WC.  */
+        case (size_t) -2: wi = WEOF; break;     /* Incomplete character.  */
+        case (size_t) -1: wi = uc; break;       /* Invalid: byte itself.  */
+        }
+      d->mbrtowc_cache[uc] = wi;
+    }
+#endif
+}
+
+#if MBS_SUPPORT
+/* Given the dfa D, store into *PWC the result of converting the
+   leading bytes of the multibyte buffer S of length N bytes, updating
+   the conversion state in *MBS.  On conversion error, convert just a
+   single byte as-is.  Return the number of bytes converted.
+
+   This differs from mbrtowc (PWC, S, N, MBS) as follows:
+
+   * Extra arg D, containing an mbrtowc_cache for speed.
+   * N must be at least 1.
+   * S[N - 1] must be a sentinel byte.
+   * Shift encodings are not supported.
+   * The return value is always in the range 1..N.
+   * *MBS is always valid afterwards.
+   * *PWC is always set to something.  */
+static size_t
+mbs_to_wchar (struct dfa *d, wchar_t *pwc, char const *s, size_t n,
+              mbstate_t *mbs)
+{
+  unsigned char uc = s[0];
+  wint_t wc = d->mbrtowc_cache[uc];     /* Per-byte result from dfambcache.  */
+
+  if (wc == WEOF)       /* Leading byte of a multibyte character.  */
+    {
+      size_t nbytes = mbrtowc (pwc, s, n, mbs);
+      if (0 < nbytes && nbytes < (size_t) -2)   /* Not 0, -1 or -2.  */
+        return nbytes;
+      memset (mbs, 0, sizeof *mbs);     /* Keep *MBS valid after failure.  */
+      wc = uc;                          /* Convert the byte as-is.  */
+    }
+
+  *pwc = wc;
+  return 1;             /* Cached and fallback results are one byte long.  */
+}
+#endif
 
 #ifdef DEBUG
 
@@ -820,13 +890,10 @@ using_simple_locale (void)
       static int unibyte_c = -1;
       if (unibyte_c < 0)
         {
-#ifdef LC_ALL
-          char *locale = setlocale (LC_ALL, NULL);
-          unibyte_c = (locale && (STREQ (locale, "C")
-                                  || STREQ (locale, "POSIX")));
-#else
-          unibyte_c = 1;
-#endif
+          char const *locale = setlocale (LC_ALL, NULL);
+          unibyte_c = (!locale
+                       || STREQ (locale, "C")
+                       || STREQ (locale, "POSIX"));
         }
       return unibyte_c;
     }
@@ -848,7 +915,7 @@ static int minrep, maxrep;      /* Repeat counts for {m,n}.  */
 static int cur_mb_len = 1;      /* Length of the multibyte representation of
                                    wctok.  */
 /* These variables are used only if (MB_CUR_MAX > 1).  */
-static mbstate_t mbs;           /* Mbstate for mbrlen.  */
+static mbstate_t mbs;           /* mbstate for mbrtowc.  */
 static wchar_t wctok;           /* Wide character representation of the current
                                    multibyte character.  */
 static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec.
@@ -885,32 +952,18 @@ static unsigned char const *buf_end;    /* reference to end in dfaexec.  */
     else					\
       {						\
         wchar_t _wc;				\
-        cur_mb_len = mbrtowc (&_wc, lexptr, lexleft, &mbs); \
-        if (cur_mb_len <= 0)			\
-          {					\
-            cur_mb_len = 1;			\
-            --lexleft;				\
-            (wc) = (c) = to_uchar (*lexptr++);  \
-          }					\
-        else					\
-          {					\
-            lexptr += cur_mb_len;		\
-            lexleft -= cur_mb_len;		\
-            (wc) = _wc;				\
-            (c) = wctob (wc);			\
-          }					\
+        size_t nbytes = mbs_to_wchar (dfa, &_wc, lexptr, lexleft, &mbs); \
+        cur_mb_len = nbytes;			\
+        (wc) = _wc;				\
+        (c) = nbytes == 1 ? to_uchar (*lexptr) : EOF;    \
+        lexptr += nbytes;			\
+        lexleft -= nbytes;			\
       }						\
   } while (0)
 
-# define FETCH(c, eoferr)			\
-  do {						\
-    wint_t wc;					\
-    FETCH_WC (c, wc, eoferr);			\
-  } while (0)
-
 #else
 /* Note that characters become unsigned here.  */
-# define FETCH(c, eoferr)	      \
+# define FETCH_WC(c, unused, eoferr)  \
   do {				      \
     if (! lexleft)		      \
       {				      \
@@ -923,8 +976,6 @@ static unsigned char const *buf_end;    /* reference to end in dfaexec.  */
     --lexleft;			      \
   } while (0)
 
-# define FETCH_WC(c, unused, eoferr) FETCH (c, eoferr)
-
 #endif /* MBS_SUPPORT */
 
 #ifndef MIN
@@ -1302,14 +1353,9 @@ lex (void)
      "if (backslash) ...".  */
   for (i = 0; i < 2; ++i)
     {
-      if (MB_CUR_MAX > 1)
-        {
-          FETCH_WC (c, wctok, NULL);
-          if ((int) c == EOF)
-            goto normal_char;
-        }
-      else
-        FETCH (c, NULL);
+      FETCH_WC (c, wctok, NULL);
+      if (c == (unsigned int) EOF)
+        goto normal_char;
 
       switch (c)
         {
@@ -1726,16 +1772,19 @@ static void
 addtok_wc (wint_t wc)
 {
   unsigned char buf[MB_LEN_MAX];
-  mbstate_t s;
+  mbstate_t s = { 0 };
   int i;
-  memset (&s, 0, sizeof s);
-  cur_mb_len = wcrtomb ((char *) buf, wc, &s);
+  size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
 
-  /* This is merely stop-gap.  When cur_mb_len is 0 or negative,
-     buf[0] is undefined, yet skipping the addtok_mb call altogether
-     can result in heap corruption.  */
-  if (cur_mb_len <= 0)
-    buf[0] = 0;
+  if (stored_bytes != (size_t) -1)
+    cur_mb_len = stored_bytes;
+  else
+    {
+      /* This is merely a stop-gap.  buf[0] is undefined, yet skipping
+         the addtok_mb call altogether can corrupt the heap.  */
+      cur_mb_len = 1;
+      buf[0] = 0;
+    }
 
   addtok_mb (buf[0], cur_mb_len == 1 ? 3 : 1);
   for (i = 1; i < cur_mb_len; i++)
@@ -3356,43 +3405,26 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp)
 /* Initialize mblen_buf and inputwcs with data from the next line.  */
 
 static void
-prepare_wc_buf (const char *begin, const char *end)
+prepare_wc_buf (struct dfa *d, const char *begin, const char *end)
 {
 #if MBS_SUPPORT
   unsigned char eol = eolbyte;
-  size_t remain_bytes, i;
+  size_t i;
+  size_t ilim = end - begin + 1;
 
   buf_begin = (unsigned char *) begin;
 
-  remain_bytes = 0;
-  for (i = 0; i < end - begin + 1; i++)
+  for (i = 0; i < ilim; i++)
     {
-      if (remain_bytes == 0)
-        {
-          remain_bytes
-            = mbrtowc (inputwcs + i, begin + i, end - begin - i + 1, &mbs);
-          if (remain_bytes < 1
-              || remain_bytes == (size_t) -1
-              || remain_bytes == (size_t) -2
-              || (remain_bytes == 1 && inputwcs[i] == (wchar_t) begin[i]))
-            {
-              remain_bytes = 0;
-              inputwcs[i] = (wchar_t) begin[i];
-              mblen_buf[i] = 0;
-              if (begin[i] == eol)
-                break;
-            }
-          else
-            {
-              mblen_buf[i] = remain_bytes;
-              remain_bytes--;
-            }
-        }
-      else
+      size_t nbytes = mbs_to_wchar (d, inputwcs + i, begin + i, ilim - i, &mbs);
+      mblen_buf[i] = nbytes - (nbytes == 1);
+      if (begin[i] == eol)
+        break;
+      while (--nbytes != 0)
         {
-          mblen_buf[i] = remain_bytes;
+          i++;
+          mblen_buf[i] = nbytes;
           inputwcs[i] = 0;
-          remain_bytes--;
         }
     }
 
@@ -3439,7 +3471,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
       MALLOC (mblen_buf, end - begin + 2);
       MALLOC (inputwcs, end - begin + 2);
       memset (&mbs, 0, sizeof (mbstate_t));
-      prepare_wc_buf ((const char *) p, end);
+      prepare_wc_buf (d, (const char *) p, end);
     }
 
   for (;;)
@@ -3529,7 +3561,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
             ++*count;
 
           if (d->mb_cur_max > 1)
-            prepare_wc_buf ((const char *) p, end);
+            prepare_wc_buf (d, (const char *) p, end);
         }
 
       /* Check if we've run off the end of the buffer.  */
@@ -3648,6 +3680,7 @@ void
 dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
 {
   dfainit (d);
+  dfambcache (d);
   dfaparse (s, len, d);
   dfamust (d);
   dfaoptimize (d);
diff --git a/doc/ChangeLog b/doc/ChangeLog
index 4ef12ca6..d2763325 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,3 +1,7 @@
+2014-03-30         Arnold D. Robbins     <arnold@skeeve.com>
+
+	* gawktexi.in: Cleanups to docbook, finish math stuff.
+
 2014-03-28         Arnold D. Robbins     <arnold@skeeve.com>
 
 	* gawktexi.in: Minor cleanups to the indexing.
diff --git a/doc/gawk.texi b/doc/gawk.texi
index 2b666d35..47dee34c 100644
--- a/doc/gawk.texi
+++ b/doc/gawk.texi
@@ -169,6 +169,9 @@
 @ignore
 Some comments on the layout for TeX.
 1. Use at least texinfo.tex 2014-01-30.15
+2. When using @docbook, if the last line is part of a paragraph, end
+it with a space and @c so that the lines won't run together. This is a
+quirk of the language / makeinfo, and isn't going to change.
 @end ignore
 
 @c merge the function and variable indexes into the concept index
@@ -1061,7 +1064,7 @@ $\sim\! Cn^2$
 @end ifnotdocbook
 @end ifnottex
 @docbook
-<emphasis>&sim; Cn<superscript>2</superscript></emphasis>&thinsp;
+<emphasis>&sim; Cn<superscript>2</superscript></emphasis> @c
 @end docbook
 performance, while
 theory predicted
@@ -1074,7 +1077,7 @@ $\sim\! Cn\log n$
 @end ifnotdocbook
 @end ifnottex
 @docbook
-<emphasis>&sim; Cn log n</emphasis>&thinsp;
+<emphasis>&sim; Cn log n</emphasis> @c
 @end docbook
 behavior. A few minutes poring
 over the @file{awkprof.out} profile pinpointed the problem to
@@ -17311,7 +17314,7 @@ All known POSIX-compliant systems support timestamps from 0 through
 @end ifnotdocbook
 @end ifnottex
 @docbook
-2<superscript>31</superscript> &minus; 1,&thinsp;
+2<superscript>31</superscript> &minus; 1, @c
 @end docbook
 which is sufficient to represent times through
 2038-01-19 03:14:07 UTC.  Many systems support a wider range of timestamps,
@@ -28801,7 +28804,7 @@ then the answer is
 @end ifnotdocbook
 @end ifnottex
 @docbook
-2<superscript>53</superscript>.&thinsp;
+2<superscript>53</superscript>. @c
 @end docbook
 The next representable number is the even number
 @iftex
@@ -28813,7 +28816,7 @@ The next representable number is the even number
 @end ifnotdocbook
 @end ifnottex
 @docbook
-2<superscript>53</superscript> &plus; 2,
+2<superscript>53</superscript> &plus; 2, @c
 @end docbook
 meaning it is unlikely that you will be able to make
 @command{gawk} print
@@ -28826,7 +28829,7 @@ meaning it is unlikely that you will be able to make
 @end ifnotdocbook
 @end ifnottex
 @docbook
-2<superscript>53</superscript> &plus; 1&thinsp;
+2<superscript>53</superscript> &plus; 1 @c
 @end docbook
 in integer format.
 The range of integers exactly representable by a 64-bit double
@@ -28840,7 +28843,7 @@ is
 @end ifnotdocbook
 @end ifnottex
 @docbook
-[&minus;2<superscript>53</superscript>, 2<superscript>53</superscript>].&thinsp;
+[&minus;2<superscript>53</superscript>, 2<superscript>53</superscript>]. @c
 @end docbook
 If you ever see an integer outside this range in @command{awk}
 using 64-bit doubles, you have reason to be very suspicious about
@@ -29070,7 +29073,7 @@ number is then
 @end ifnotdocbook
 @end ifnottex
 @docbook
-<emphasis>s &cdot; 2<superscript>e</superscript></emphasis>.&thinsp;
+<emphasis>s &sdot; 2<superscript>e</superscript></emphasis>. @c
 @end docbook
 The first bit of a non-zero binary significand
 is always one, so the significand in an IEEE-754 format only includes the
@@ -29319,7 +29322,7 @@ numbers are not implemented.}
 @end ifnotdocbook
 @end ifnottex
 @docbook
-(<emphasis>emax</emphasis> = 2<superscript>30</superscript> &minus; 1, <emphasis>emin</emphasis> = &minus;<emphasis>emax</emphasis>)&thinsp;
+(<emphasis>emax</emphasis> = 2<superscript>30</superscript> &minus; 1, <emphasis>emin</emphasis> = &minus;<emphasis>emax</emphasis>) @c
 @end docbook
 for all floating-point contexts.
 There is no explicit mechanism to adjust the exponent range.
@@ -29398,7 +29401,7 @@ formula:
 @end ifnottex
 @docbook
 <para>
-<emphasis>prec</emphasis> = 3.322 &cdot; <emphasis>dps</emphasis>
+<emphasis>prec</emphasis> = 3.322 &sdot; <emphasis>dps</emphasis> @c
 </para>
 @end docbook
 
@@ -29636,8 +29639,13 @@ For example, the following computes
 @math{5^{4^{3^{2}}}},
 @end iftex
 @ifnottex
+@ifnotdocbook
 5^4^3^2,
+@end ifnotdocbook
 @end ifnottex
+@docbook
+5<superscript>4<superscript>3<superscript>2</superscript></superscript></superscript>, @c
+@end docbook
 the result of which is beyond the
 limits of ordinary @command{gawk} numbers:
 
@@ -29659,9 +29667,16 @@ floating-point values instead, the precision needed for correct output
 would be @math{3.322 @cdot 183231},
 @end iftex
 @ifnottex
+@ifnotdocbook
 @samp{prec = 3.322 * dps}),
 would be 3.322 x 183231,
+@end ifnotdocbook
 @end ifnottex
+@docbook
+<emphasis>prec</emphasis> = 3.322 &sdot; <emphasis>dps</emphasis>),
+would be
+<emphasis>prec</emphasis> = 3.322 &sdot; 183231, @c
+@end docbook
 or 608693.
 
 The result from an arithmetic operation with an integer and a floating-point value
diff --git a/doc/gawktexi.in b/doc/gawktexi.in
index 66df3784..446c13d5 100644
--- a/doc/gawktexi.in
+++ b/doc/gawktexi.in
@@ -164,6 +164,9 @@
 @ignore
 Some comments on the layout for TeX.
 1. Use at least texinfo.tex 2014-01-30.15
+2. When using @docbook, if the last line is part of a paragraph, end
+it with a space and @c so that the lines won't run together. This is a
+quirk of the language / makeinfo, and isn't going to change.
 @end ignore
 
 @c merge the function and variable indexes into the concept index
@@ -1056,7 +1059,7 @@ $\sim\! Cn^2$
 @end ifnotdocbook
 @end ifnottex
 @docbook
-<emphasis>&sim; Cn<superscript>2</superscript></emphasis>&thinsp;
+<emphasis>&sim; Cn<superscript>2</superscript></emphasis> @c
 @end docbook
 performance, while
 theory predicted
@@ -1069,7 +1072,7 @@ $\sim\! Cn\log n$
 @end ifnotdocbook
 @end ifnottex
 @docbook
-<emphasis>&sim; Cn log n</emphasis>&thinsp;
+<emphasis>&sim; Cn log n</emphasis> @c
 @end docbook
 behavior. A few minutes poring
 over the @file{awkprof.out} profile pinpointed the problem to
@@ -16481,7 +16484,7 @@ All known POSIX-compliant systems support timestamps from 0 through
 @end ifnotdocbook
 @end ifnottex
 @docbook
-2<superscript>31</superscript> &minus; 1,&thinsp;
+2<superscript>31</superscript> &minus; 1, @c
 @end docbook
 which is sufficient to represent times through
 2038-01-19 03:14:07 UTC.  Many systems support a wider range of timestamps,
@@ -27942,7 +27945,7 @@ then the answer is
 @end ifnotdocbook
 @end ifnottex
 @docbook
-2<superscript>53</superscript>.&thinsp;
+2<superscript>53</superscript>. @c
 @end docbook
 The next representable number is the even number
 @iftex
@@ -27954,7 +27957,7 @@ The next representable number is the even number
 @end ifnotdocbook
 @end ifnottex
 @docbook
-2<superscript>53</superscript> &plus; 2,
+2<superscript>53</superscript> &plus; 2, @c
 @end docbook
 meaning it is unlikely that you will be able to make
 @command{gawk} print
@@ -27967,7 +27970,7 @@ meaning it is unlikely that you will be able to make
 @end ifnotdocbook
 @end ifnottex
 @docbook
-2<superscript>53</superscript> &plus; 1&thinsp;
+2<superscript>53</superscript> &plus; 1 @c
 @end docbook
 in integer format.
 The range of integers exactly representable by a 64-bit double
@@ -27981,7 +27984,7 @@ is
 @end ifnotdocbook
 @end ifnottex
 @docbook
-[&minus;2<superscript>53</superscript>, 2<superscript>53</superscript>].&thinsp;
+[&minus;2<superscript>53</superscript>, 2<superscript>53</superscript>]. @c
 @end docbook
 If you ever see an integer outside this range in @command{awk}
 using 64-bit doubles, you have reason to be very suspicious about
@@ -28211,7 +28214,7 @@ number is then
 @end ifnotdocbook
 @end ifnottex
 @docbook
-<emphasis>s &cdot; 2<superscript>e</superscript></emphasis>.&thinsp;
+<emphasis>s &sdot; 2<superscript>e</superscript></emphasis>. @c
 @end docbook
 The first bit of a non-zero binary significand
 is always one, so the significand in an IEEE-754 format only includes the
@@ -28460,7 +28463,7 @@ numbers are not implemented.}
 @end ifnotdocbook
 @end ifnottex
 @docbook
-(<emphasis>emax</emphasis> = 2<superscript>30</superscript> &minus; 1, <emphasis>emin</emphasis> = &minus;<emphasis>emax</emphasis>)&thinsp;
+(<emphasis>emax</emphasis> = 2<superscript>30</superscript> &minus; 1, <emphasis>emin</emphasis> = &minus;<emphasis>emax</emphasis>) @c
 @end docbook
 for all floating-point contexts.
 There is no explicit mechanism to adjust the exponent range.
@@ -28539,7 +28542,7 @@ formula:
 @end ifnottex
 @docbook
 <para>
-<emphasis>prec</emphasis> = 3.322 &cdot; <emphasis>dps</emphasis>
+<emphasis>prec</emphasis> = 3.322 &sdot; <emphasis>dps</emphasis> @c
 </para>
 @end docbook
 
@@ -28777,8 +28780,13 @@ For example, the following computes
 @math{5^{4^{3^{2}}}},
 @end iftex
 @ifnottex
+@ifnotdocbook
 5^4^3^2,
+@end ifnotdocbook
 @end ifnottex
+@docbook
+5<superscript>4<superscript>3<superscript>2</superscript></superscript></superscript>, @c
+@end docbook
 the result of which is beyond the
 limits of ordinary @command{gawk} numbers:
 
@@ -28800,9 +28808,16 @@ floating-point values instead, the precision needed for correct output
 would be @math{3.322 @cdot 183231},
 @end iftex
 @ifnottex
+@ifnotdocbook
 @samp{prec = 3.322 * dps}),
 would be 3.322 x 183231,
+@end ifnotdocbook
 @end ifnottex
+@docbook
+<emphasis>prec</emphasis> = 3.322 &sdot; <emphasis>dps</emphasis>),
+would be
+<emphasis>prec</emphasis> = 3.322 &sdot; 183231, @c
+@end docbook
 or 608693.
 
 The result from an arithmetic operation with an integer and a floating-point value