Merge multithreaded dfa into gawk.

author: Arnold D. Robbins <arnold@skeeve.com> 2016-09-01 20:46:12 +0300
committer: Arnold D. Robbins <arnold@skeeve.com> 2016-09-01 20:46:12 +0300
commit: b02f580f06996bd88f741f9c7330aff79216a169 (patch)
tree: 7bf557815e612b7bc1f9ea9cbc2dfc66781e175e /dfa.c
parent: af43bad53b2f05ba0d4403a59433f587a1e32b22 (diff)
download: egawk-b02f580f06996bd88f741f9c7330aff79216a169.tar.gz
egawk-b02f580f06996bd88f741f9c7330aff79216a169.tar.bz2
egawk-b02f580f06996bd88f741f9c7330aff79216a169.zip
1 files changed, 118 insertions, 198 deletions
diff --git a/dfa.c b/dfa.c
index 85cb46ad..fad03e4f 100644
--- a/dfa.c
+++ b/dfa.c
@@ -69,6 +69,8 @@
 
 #include "dfa.h"
 
+#include "localeinfo.h"
+
 #ifdef GAWK
 static int
 is_blank (int c)
@@ -445,14 +447,9 @@ struct dfa
   size_t nregexps;              /* Count of parallel regexps being built
                                    with dfaparse.  */
   bool fast;			/* The DFA is fast.  */
-  bool multibyte;		/* MB_CUR_MAX > 1.  */
   token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales.  */
   mbstate_t mbs;		/* Multibyte conversion state.  */
 
-  /* dfaexec implementation.  */
-  char *(*dfaexec) (struct dfa *, char const *, char *,
-                    bool, size_t *, bool *);
-
   /* The following are valid only if MB_CUR_MAX > 1.  */
 
   /* The value of multibyte_prop[i] is defined by following rule.
@@ -538,6 +535,21 @@ struct dfa
   state_num **mb_trans;      /* Transition tables for states with ANYCHAR.  */
   state_num mb_trcount;         /* Number of transition tables for states with
                                    ANYCHAR that have actually been built.  */
+
+  /* Information derived from the locale.  This is at the end so that
+     a quick memset need not clear it specially.  */
+
+  /* dfaexec implementation.  */
+  char *(*dfaexec) (struct dfa *, char const *, char *,
+                    bool, size_t *, bool *);
+
+  /* The locale is simple, like the C locale.  These locales can be
+     processed more efficiently, e.g., the relationship between lower-
+     and upper-case letters is 1-1.  */
+  bool simple_locale;
+
+  /* Other cached information derived from the locale.  */
+  struct localeinfo localeinfo;
 };
 
 /* Some macros for user access to dfa internals.  */
@@ -551,13 +563,8 @@ struct dfa
 
 static void regexp (struct dfa *dfa);
 
-/* A table indexed by byte values that contains the corresponding wide
-   character (if any) for that byte.  WEOF means the byte is not a
-   valid single-byte character.  */
-static wint_t mbrtowc_cache[NOTCHAR];
-
 /* Store into *PWC the result of converting the leading bytes of the
-   multibyte buffer S of length N bytes, using the mbrtowc_cache in *D
+   multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
    and updating the conversion state in *D.  On conversion error,
    convert just a single byte, to WEOF.  Return the number of bytes
    converted.
@@ -566,7 +573,7 @@ static wint_t mbrtowc_cache[NOTCHAR];
 
    * PWC points to wint_t, not to wchar_t.
    * The last arg is a dfa *D instead of merely a multibyte conversion
-     state D->mbs.  D also contains an mbrtowc_cache for speed.
+     state D->mbs.
    * N must be at least 1.
    * S[N - 1] must be a sentinel byte.
    * Shift encodings are not supported.
@@ -577,7 +584,7 @@ static size_t
 mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
 {
   unsigned char uc = s[0];
-  wint_t wc = mbrtowc_cache[uc];
+  wint_t wc = d->localeinfo.sbctowc[uc];
 
   if (wc == WEOF)
     {
@@ -754,7 +761,7 @@ maybe_realloc (void *ptr, size_t nitems, size_t *nalloc, size_t itemsize)
 
 /* In DFA D, find the index of charclass S, or allocate a new one.  */
 static size_t
-dfa_charclass_index (struct dfa *d, charclass const s)
+charclass_index (struct dfa *d, charclass const s)
 {
   size_t i;
 
@@ -769,9 +776,9 @@ dfa_charclass_index (struct dfa *d, charclass const s)
 }
 
 static bool
-unibyte_word_constituent (unsigned char c)
+unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
 {
-  return mbrtowc_cache[c] != WEOF && (isalnum (c) || (c) == '_');
+  return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
 }
 
 static int
@@ -779,68 +786,11 @@ char_context (struct dfa const *dfa, unsigned char c)
 {
   if (c == dfa->syntax.eolbyte)
     return CTX_NEWLINE;
-  if (unibyte_word_constituent (c))
+  if (unibyte_word_constituent (dfa, c))
     return CTX_LETTER;
   return CTX_NONE;
 }
 
-/* UTF-8 encoding allows some optimizations that we can't otherwise
-   assume in a multibyte encoding.  */
-static bool using_utf8;
-
-bool
-dfa_using_utf8 (void)
-{
-  return using_utf8;
-}
-
-static void
-init_mbrtowc_cache (void)
-{
-  int i;
-  for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
-    {
-      char c = i;
-      unsigned char uc = i;
-      mbstate_t s = { 0 };
-      wchar_t wc;
-      mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF;
-    }
-}
-
-/* Entry point to set syntax options.  */
-void
-dfasyntax (struct dfa *dfa, reg_syntax_t bits, bool fold, unsigned char eol)
-{
-  int i;
-  dfa->syntax.syntax_bits_set = true;
-  dfa->syntax.syntax_bits = bits;
-  dfa->syntax.case_fold = fold;
-  dfa->syntax.eolbyte = eol;
-
-  for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
-    {
-      unsigned char uc = i;
-
-      /* Use mbrtowc_cache to calculate sbit.  */
-      dfa->syntax.sbit[uc] = char_context (dfa, uc);
-      switch (dfa->syntax.sbit[uc])
-        {
-        case CTX_LETTER:
-          setbit (uc, dfa->syntax.letters);
-          break;
-        case CTX_NEWLINE:
-          setbit (uc, dfa->syntax.newline);
-          break;
-        }
-
-      /* POSIX requires that the five bytes in "\n\r./" (including the
-         terminating NUL) cannot occur inside a multibyte character.  */
-      dfa->syntax.never_trail[uc] = (using_utf8 ? (uc & 0xc0) != 0x80
-                                     : strchr ("\n\r./", uc) != NULL);
-    }
-}
-
 /* Set a bit in the charclass for the given wchar_t.  Do nothing if WC
    is represented by a multi-byte sequence.  Even for MB_CUR_MAX == 1,
    this may happen when folding case in weird Turkish locales where
@@ -869,30 +819,10 @@ setbit_case_fold_c (int b, charclass c)
       setbit (i, c);
 }
 
-static void check_utf8 (void)
-{
-  wchar_t wc;
-  mbstate_t mbs = { 0 };
-  using_utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
-}
-
-static bool unibyte_c;
-
-static void check_unibyte_c (void)
-{
-  char const *locale = setlocale (LC_ALL, NULL);
-  unibyte_c = (!locale
-               || STREQ (locale, "C")
-               || STREQ (locale, "POSIX"));
-}
-
-/* The current locale is known to be a unibyte locale
-   without multicharacter collating sequences and where range
-   comparisons simply use the native encoding.  These locales can be
-   processed more efficiently.  */
+/* Return true if the locale compatible with the C locale.  */
 
 static bool
-using_simple_locale (struct dfa const *dfa)
+using_simple_locale (bool multibyte)
 {
   /* The native character set is known to be compatible with
      the C locale.  The following test isn't perfect, but it's good
@@ -910,7 +840,15 @@ using_simple_locale (struct dfa const *dfa)
      && '}' == 125 && '~' == 126)
   };
 
-  return (native_c_charset & !dfa->multibyte) | unibyte_c;
+  if (native_c_charset && !multibyte)
+    return true;
+  else
+    {
+      /* Treat C and POSIX locales as being compatible.  Also, treat
+         errors as compatible, as these are invariably from stubs.  */
+      char const *loc = setlocale (LC_ALL, NULL);
+      return !loc || STREQ (loc, "C") || STREQ (loc, "POSIX");
+    }
 }
 
 /* Fetch the next lexical input character.  Set C (of type int) to the
@@ -946,53 +884,6 @@ using_simple_locale (struct dfa const *dfa)
 # define MIN(a,b) ((a) < (b) ? (a) : (b))
 #endif
 
-/* The set of wchar_t values C such that there's a useful locale
-   somewhere where C != towupper (C) && C != towlower (towupper (C)).
-   For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
-   towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
-   towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).  */
-static short const lonesome_lower[] =
-  {
-    0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
-    0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
-
-    /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
-       counterpart in locales predating Unicode 4.0.0 (April 2003).  */
-    0x03F2,
-
-    0x03F5, 0x1E9B, 0x1FBE,
-  };
-
-/* Maximum number of characters that can be the case-folded
-   counterparts of a single character, not counting the character
-   itself.  This is 1 for towupper, 1 for towlower, and 1 for each
-   entry in LONESOME_LOWER.  */
-enum
-{ CASE_FOLDED_BUFSIZE = 2 + sizeof lonesome_lower / sizeof *lonesome_lower };
-
-/* Find the characters equal to C after case-folding, other than C
-   itself, and store them into FOLDED.  Return the number of characters
-   stored.  */
-static unsigned int
-case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
-{
-  unsigned int i;
-  unsigned int n = 0;
-  wint_t uc = towupper (c);
-  wint_t lc = towlower (uc);
-  if (uc != c)
-    folded[n++] = uc;
-  if (lc != uc && lc != c && towupper (lc) == uc)
-    folded[n++] = lc;
-  for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
-    {
-      wint_t li = lonesome_lower[i];
-      if (li != lc && li != uc && li != c && towupper (li) == uc)
-        folded[n++] = li;
-    }
-  return n;
-}
-
 typedef int predicate (int);
 
 /* The following list maps the names of the Posix named character classes
@@ -1061,7 +952,7 @@ parse_bracket_exp (struct dfa *dfa)
   size_t chars_al;
 
   chars_al = 0;
-  if (dfa->multibyte)
+  if (dfa->localeinfo.multibyte)
     {
       dfa->mbcsets = maybe_realloc (dfa->mbcsets, dfa->nmbcsets,
                                     &dfa->mbcsets_alloc,
@@ -1084,7 +975,7 @@ parse_bracket_exp (struct dfa *dfa)
     {
       FETCH_WC (dfa, c, wc, _("unbalanced ["));
       invert = true;
-      known_bracket_exp = using_simple_locale (dfa);
+      known_bracket_exp = dfa->simple_locale;
     }
   else
     invert = false;
@@ -1139,7 +1030,7 @@ parse_bracket_exp (struct dfa *dfa)
                   if (!pred)
                     dfaerror (_("invalid character class"));
 
-                  if (dfa->multibyte && !pred->single_byte_only)
+                  if (dfa->localeinfo.multibyte && !pred->single_byte_only)
                     known_bracket_exp = false;
                   else
                     for (c2 = 0; c2 < NOTCHAR; ++c2)
@@ -1199,9 +1090,9 @@ parse_bracket_exp (struct dfa *dfa)
               /* Treat [x-y] as a range if x != y.  */
               if (wc != wc2 || wc == WEOF)
                 {
-                  if (dfa->multibyte)
+                  if (dfa->localeinfo.multibyte)
                     known_bracket_exp = false;
-                  else if (using_simple_locale (dfa))
+                  else if (dfa->simple_locale)
                     {
                       int ci;
                       for (ci = c; ci <= c2; ci++)
@@ -1228,7 +1119,7 @@ parse_bracket_exp (struct dfa *dfa)
 
       colon_warning_state |= (c == ':') ? 2 : 4;
 
-      if (!dfa->multibyte)
+      if (!dfa->localeinfo.multibyte)
         {
           if (dfa->syntax.case_fold)
             setbit_case_fold_c (c, ccl);
@@ -1265,22 +1156,22 @@ parse_bracket_exp (struct dfa *dfa)
   if (! known_bracket_exp)
     return BACKREF;
 
-  if (dfa->multibyte)
+  if (dfa->localeinfo.multibyte)
     {
       work_mbc->invert = invert;
-      work_mbc->cset = emptyset (ccl) ? -1 : dfa_charclass_index (dfa, ccl);
+      work_mbc->cset = emptyset (ccl) ? -1 : charclass_index (dfa, ccl);
       return MBCSET;
     }
 
   if (invert)
     {
-      assert (!dfa->multibyte);
+      assert (!dfa->localeinfo.multibyte);
       notset (ccl);
       if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
         clrbit ('\n', ccl);
     }
 
-  return CSET + dfa_charclass_index (dfa, ccl);
+  return CSET + charclass_index (dfa, ccl);
 }
 
 struct lexptr
@@ -1535,7 +1426,7 @@ lex (struct dfa *dfa)
         case '.':
           if (backslash)
             goto normal_char;
-          if (dfa->multibyte)
+          if (dfa->localeinfo.multibyte)
             {
               /* In multibyte environment period must match with a single
                  character not a byte.  So we use ANYCHAR.  */
@@ -1549,13 +1440,13 @@ lex (struct dfa *dfa)
           if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
             clrbit ('\0', ccl);
           dfa->lex.laststart = false;
-          return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+          return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
 
         case 's':
         case 'S':
           if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
             goto normal_char;
-          if (!dfa->multibyte)
+          if (!dfa->localeinfo.multibyte)
             {
               zeroset (ccl);
               for (c2 = 0; c2 < NOTCHAR; ++c2)
@@ -1564,7 +1455,7 @@ lex (struct dfa *dfa)
               if (c == 'S')
                 notset (ccl);
               dfa->lex.laststart = false;
-              return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+              return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
             }
 
           /* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1588,16 +1479,16 @@ lex (struct dfa *dfa)
           if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
             goto normal_char;
 
-          if (!dfa->multibyte)
+          if (!dfa->localeinfo.multibyte)
             {
               zeroset (ccl);
               for (c2 = 0; c2 < NOTCHAR; ++c2)
-                if (unibyte_word_constituent (c2))
+                if (unibyte_word_constituent (dfa, c2))
                   setbit (c2, ccl);
               if (c == 'W')
                 notset (ccl);
               dfa->lex.laststart = false;
-              return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+              return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
             }
 
           /* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1627,14 +1518,14 @@ lex (struct dfa *dfa)
           dfa->lex.laststart = false;
           /* For multibyte character sets, folding is done in atom.  Always
              return WCHAR.  */
-          if (dfa->multibyte)
+          if (dfa->localeinfo.multibyte)
             return dfa->lex.lasttok = WCHAR;
 
           if (dfa->syntax.case_fold && isalpha (c))
             {
               zeroset (ccl);
               setbit_case_fold_c (c, ccl);
-              return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+              return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
             }
 
           return dfa->lex.lasttok = c;
@@ -1654,11 +1545,11 @@ addtok_mb (struct dfa *dfa, token t, int mbprop)
     {
       dfa->tokens = x2nrealloc (dfa->tokens, &dfa->talloc,
                                 sizeof *dfa->tokens);
-      if (dfa->multibyte)
+      if (dfa->localeinfo.multibyte)
         dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
                                          sizeof *dfa->multibyte_prop);
     }
-  if (dfa->multibyte)
+  if (dfa->localeinfo.multibyte)
     dfa->multibyte_prop[dfa->tindex] = mbprop;
   dfa->tokens[dfa->tindex++] = t;
 
@@ -1695,7 +1586,7 @@ static void addtok_wc (struct dfa *dfa, wint_t wc);
 static void
 addtok (struct dfa *dfa, token t)
 {
-  if (dfa->multibyte && t == MBCSET)
+  if (dfa->localeinfo.multibyte && t == MBCSET)
     {
       bool need_or = false;
       struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1];
@@ -1794,7 +1685,7 @@ add_utf8_anychar (struct dfa *dfa)
             if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
               clrbit ('\0', c);
           }
-        dfa->utf8_anychar_classes[i] = CSET + dfa_charclass_index (dfa, c);
+        dfa->utf8_anychar_classes[i] = CSET + charclass_index (dfa, c);
       }
 
   /* A valid UTF-8 character is
@@ -1878,7 +1769,7 @@ atom (struct dfa *dfa)
 
       dfa->parse.tok = lex (dfa);
     }
-  else if (dfa->parse.tok == ANYCHAR && using_utf8)
+  else if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
     {
       /* For UTF-8 expand the period to a series of CSETs that define a valid
          UTF-8 character.  This avoids using the slow multibyte path.  I'm
@@ -1939,7 +1830,7 @@ copytoks (struct dfa *dfa, size_t tindex, size_t ntokens)
 {
   size_t i;
 
-  if (dfa->multibyte)
+  if (dfa->localeinfo.multibyte)
     for (i = 0; i < ntokens; ++i)
       addtok_mb (dfa, dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]);
   else
@@ -2025,7 +1916,7 @@ dfaparse (char const *s, size_t len, struct dfa *d)
   d->lex.lasttok = END;
   d->lex.laststart = true;
   d->lex.parens = 0;
-  if (d->multibyte)
+  if (d->localeinfo.multibyte)
     {
       d->lex.cur_mb_len = 0;
       memset (&d->mbs, 0, sizeof d->mbs);
@@ -2214,7 +2105,7 @@ state_index (struct dfa *d, position_set const *s, int context)
         }
       else if (d->tokens[s->elems[j].index] == BACKREF)
         constraint = NO_CONSTRAINT;
-      if (d->multibyte && d->tokens[s->elems[j].index] == ANYCHAR)
+      if (d->localeinfo.multibyte && d->tokens[s->elems[j].index] == ANYCHAR)
         {
           int acceptable
             = ((SUCCEEDS_IN_CONTEXT (c, context, CTX_NEWLINE)
@@ -2691,7 +2582,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         setbit (d->tokens[pos.index], matches);
       else if (d->tokens[pos.index] >= CSET)
         copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
-      else if (d->multibyte && d->tokens[pos.index] == ANYCHAR)
+      else if (d->localeinfo.multibyte && d->tokens[pos.index] == ANYCHAR)
         {
           /* ANYCHAR must match a single character, so put it to
              D->states[s].mbps which contains the positions which can
@@ -2837,7 +2728,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         state_letter = state;
 
       for (i = 0; i < NOTCHAR; ++i)
-        trans[i] = unibyte_word_constituent (i) ? state_letter : state;
+        trans[i] = unibyte_word_constituent (d, i) ? state_letter : state;
       trans[d->syntax.eolbyte] = state_newline;
     }
   else
@@ -2854,7 +2745,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)
           insert (d->follows[grps[i].elems[j]].elems[k], &follows);
 
-      if (d->multibyte)
+      if (d->localeinfo.multibyte)
         {
           /* If a token in follows.elems is not 1st byte of a multibyte
              character, or the states of follows must accept the bytes
@@ -2887,7 +2778,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
 
       /* If we are building a searching matcher, throw in the positions
          of state 0 as well.  */
-      if (d->searchflag && (!d->multibyte || !next_isnt_1st_byte))
+      if (d->searchflag && (!d->localeinfo.multibyte || !next_isnt_1st_byte))
         {
           merge (&d->states[0].elems, &follows, &tmp);
           copy (&tmp, &follows);
@@ -2943,7 +2834,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
 
               if (c == d->syntax.eolbyte)
                 trans[c] = state_newline;
-              else if (unibyte_word_constituent (c))
+              else if (unibyte_word_constituent (d, c))
                 trans[c] = state_letter;
               else if (c < NOTCHAR)
                 trans[c] = state;
@@ -2984,7 +2875,7 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state)
       d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
       d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
       d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
-      if (d->multibyte)
+      if (d->localeinfo.multibyte)
         {
           realtrans = d->mb_trans ? d->mb_trans - 1 : NULL;
           realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
@@ -2996,7 +2887,7 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state)
         {
           d->trans[oldalloc] = NULL;
           d->fails[oldalloc] = NULL;
-          if (d->multibyte)
+          if (d->localeinfo.multibyte)
             d->mb_trans[oldalloc] = NULL;
         }
     }
@@ -3030,7 +2921,7 @@ build_state (state_num s, struct dfa *d)
         }
       d->trcount = d->min_trcount;
 
-      if (d->multibyte)
+      if (d->localeinfo.multibyte)
         {
           for (i = d->min_trcount; i < d->tralloc; i++)
             {
@@ -3481,7 +3372,7 @@ dfaexec_noop (struct dfa *d, char const *begin, char *end,
   return (char *) begin;
 }
 
-/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->multibyte),
+/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
    but faster and set *BACKREF if the DFA code does not support this
    regexp usage.  */
 
@@ -3539,7 +3430,7 @@ dfa_supported (struct dfa const *d)
         case ENDWORD:
         case LIMWORD:
         case NOTLIMWORD:
-          if (!d->multibyte)
+          if (!d->localeinfo.multibyte)
             continue;
           /* fallthrough */
 
@@ -3557,7 +3448,7 @@ dfaoptimize (struct dfa *d)
   size_t i;
   bool have_backref = false;
 
-  if (!using_utf8)
+  if (!d->localeinfo.using_utf8)
     return;
 
   for (i = 0; i < d->tindex; ++i)
@@ -3587,7 +3478,7 @@ dfaoptimize (struct dfa *d)
     }
 
   free_mbdata (d);
-  d->multibyte = false;
+  d->localeinfo.multibyte = false;
   d->dfaexec = dfaexec_sb;
   d->fast = true;
 }
@@ -3602,7 +3493,7 @@ dfassbuild (struct dfa *d)
   struct dfa *sup = dfaalloc ();
 
   *sup = *d;
-  sup->multibyte = false;
+  sup->localeinfo.multibyte = false;
   sup->dfaexec = dfaexec_sb;
   sup->multibyte_prop = NULL;
   sup->mbcsets = NULL;
@@ -3635,7 +3526,7 @@ dfassbuild (struct dfa *d)
         case BACKREF:
           zeroset (ccl);
           notset (ccl);
-          sup->tokens[j++] = CSET + dfa_charclass_index (sup, ccl);
+          sup->tokens[j++] = CSET + charclass_index (sup, ccl);
           sup->tokens[j++] = STAR;
           if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
               || d->tokens[i + 1] == PLUS)
@@ -3646,7 +3537,7 @@ dfassbuild (struct dfa *d)
         case ENDWORD:
         case LIMWORD:
         case NOTLIMWORD:
-          if (d->multibyte)
+          if (d->localeinfo.multibyte)
             {
               /* These constraints aren't supported in a multibyte locale.
                  Ignore them in the superset DFA.  */
@@ -3663,7 +3554,7 @@ dfassbuild (struct dfa *d)
     }
   sup->tindex = j;
 
-  if (have_nchar && (have_achar || d->multibyte))
+  if (have_nchar && (have_achar || d->localeinfo.multibyte))
     d->superset = sup;
   else
     {
@@ -3705,7 +3596,7 @@ dfafree (struct dfa *d)
   free (d->charclasses);
   free (d->tokens);
 
-  if (d->multibyte)
+  if (d->localeinfo.multibyte)
     free_mbdata (d);
 
   for (i = 0; i < d->sindex; ++i)
@@ -4227,20 +4118,49 @@ dfamustfree (struct dfamust *dm)
 struct dfa *
 dfaalloc (void)
 {
-  struct dfa *d = xzalloc (sizeof *d);
-  d->multibyte = MB_CUR_MAX > 1;
-  d->dfaexec = d->multibyte ? dfaexec_mb : dfaexec_sb;
-  d->fast = !d->multibyte;
-  d->lex.cur_mb_len = 1;
-  return d;
+  return xmalloc (sizeof (struct dfa));
 }
 
+/* Initialize DFA.  */
 void
-dfa_init (void)
+dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
+           reg_syntax_t bits, bool fold, unsigned char eol)
 {
-  check_utf8 ();
-  check_unibyte_c ();
-  init_mbrtowc_cache ();
+  int i;
+  memset (dfa, 0, offsetof (struct dfa, dfaexec));
+  dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
+  dfa->simple_locale = using_simple_locale (linfo->multibyte);
+  dfa->localeinfo = *linfo;
+
+  dfa->fast = !dfa->localeinfo.multibyte;
+
+  dfa->lex.cur_mb_len = 1;
+  dfa->syntax.syntax_bits_set = true;
+  dfa->syntax.syntax_bits = bits;
+  dfa->syntax.case_fold = fold;
+  dfa->syntax.eolbyte = eol;
+
+  for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
+    {
+      unsigned char uc = i;
+
+      dfa->syntax.sbit[uc] = char_context (dfa, uc);
+      switch (dfa->syntax.sbit[uc])
+        {
+        case CTX_LETTER:
+          setbit (uc, dfa->syntax.letters);
+          break;
+        case CTX_NEWLINE:
+          setbit (uc, dfa->syntax.newline);
+          break;
+        }
+
+      /* POSIX requires that the five bytes in "\n\r./" (including the
+         terminating NUL) cannot occur inside a multibyte character.  */
+      dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
+                                     ? (uc & 0xc0) != 0x80
+                                     : strchr ("\n\r./", uc) != NULL);
+    }
 }
 
 /* vim:set shiftwidth=2: */
author	Arnold D. Robbins <arnold@skeeve.com>	2016-09-01 20:46:12 +0300
committer	Arnold D. Robbins <arnold@skeeve.com>	2016-09-01 20:46:12 +0300
commit	b02f580f06996bd88f741f9c7330aff79216a169 (patch)
tree	7bf557815e612b7bc1f9ea9cbc2dfc66781e175e /dfa.c
parent	af43bad53b2f05ba0d4403a59433f587a1e32b22 (diff)
download	egawk-b02f580f06996bd88f741f9c7330aff79216a169.tar.gz egawk-b02f580f06996bd88f741f9c7330aff79216a169.tar.bz2 egawk-b02f580f06996bd88f741f9c7330aff79216a169.zip