aboutsummaryrefslogtreecommitdiffstats
path: root/dfa.c
diff options
context:
space:
mode:
authorArnold D. Robbins <arnold@skeeve.com>2016-09-01 20:46:12 +0300
committerArnold D. Robbins <arnold@skeeve.com>2016-09-01 20:46:12 +0300
commitb02f580f06996bd88f741f9c7330aff79216a169 (patch)
tree7bf557815e612b7bc1f9ea9cbc2dfc66781e175e /dfa.c
parentaf43bad53b2f05ba0d4403a59433f587a1e32b22 (diff)
downloadegawk-b02f580f06996bd88f741f9c7330aff79216a169.tar.gz
egawk-b02f580f06996bd88f741f9c7330aff79216a169.tar.bz2
egawk-b02f580f06996bd88f741f9c7330aff79216a169.zip
Merge multithreaded dfa into gawk.
Diffstat (limited to 'dfa.c')
-rw-r--r--dfa.c316
1 files changed, 118 insertions, 198 deletions
diff --git a/dfa.c b/dfa.c
index 85cb46ad..fad03e4f 100644
--- a/dfa.c
+++ b/dfa.c
@@ -69,6 +69,8 @@
#include "dfa.h"
+#include "localeinfo.h"
+
#ifdef GAWK
static int
is_blank (int c)
@@ -445,14 +447,9 @@ struct dfa
size_t nregexps; /* Count of parallel regexps being built
with dfaparse. */
bool fast; /* The DFA is fast. */
- bool multibyte; /* MB_CUR_MAX > 1. */
token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
mbstate_t mbs; /* Multibyte conversion state. */
- /* dfaexec implementation. */
- char *(*dfaexec) (struct dfa *, char const *, char *,
- bool, size_t *, bool *);
-
/* The following are valid only if MB_CUR_MAX > 1. */
/* The value of multibyte_prop[i] is defined by following rule.
@@ -538,6 +535,21 @@ struct dfa
state_num **mb_trans; /* Transition tables for states with ANYCHAR. */
state_num mb_trcount; /* Number of transition tables for states with
ANYCHAR that have actually been built. */
+
+ /* Information derived from the locale. This is at the end so that
+ a quick memset need not clear it specially. */
+
+ /* dfaexec implementation. */
+ char *(*dfaexec) (struct dfa *, char const *, char *,
+ bool, size_t *, bool *);
+
+ /* The locale is simple, like the C locale. These locales can be
+ processed more efficiently, e.g., the relationship between lower-
+ and upper-case letters is 1-1. */
+ bool simple_locale;
+
+ /* Other cached information derived from the locale. */
+ struct localeinfo localeinfo;
};
/* Some macros for user access to dfa internals. */
@@ -551,13 +563,8 @@ struct dfa
static void regexp (struct dfa *dfa);
-/* A table indexed by byte values that contains the corresponding wide
- character (if any) for that byte. WEOF means the byte is not a
- valid single-byte character. */
-static wint_t mbrtowc_cache[NOTCHAR];
-
/* Store into *PWC the result of converting the leading bytes of the
- multibyte buffer S of length N bytes, using the mbrtowc_cache in *D
+ multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
and updating the conversion state in *D. On conversion error,
convert just a single byte, to WEOF. Return the number of bytes
converted.
@@ -566,7 +573,7 @@ static wint_t mbrtowc_cache[NOTCHAR];
* PWC points to wint_t, not to wchar_t.
* The last arg is a dfa *D instead of merely a multibyte conversion
- state D->mbs. D also contains an mbrtowc_cache for speed.
+ state D->mbs.
* N must be at least 1.
* S[N - 1] must be a sentinel byte.
* Shift encodings are not supported.
@@ -577,7 +584,7 @@ static size_t
mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
{
unsigned char uc = s[0];
- wint_t wc = mbrtowc_cache[uc];
+ wint_t wc = d->localeinfo.sbctowc[uc];
if (wc == WEOF)
{
@@ -754,7 +761,7 @@ maybe_realloc (void *ptr, size_t nitems, size_t *nalloc, size_t itemsize)
/* In DFA D, find the index of charclass S, or allocate a new one. */
static size_t
-dfa_charclass_index (struct dfa *d, charclass const s)
+charclass_index (struct dfa *d, charclass const s)
{
size_t i;
@@ -769,9 +776,9 @@ dfa_charclass_index (struct dfa *d, charclass const s)
}
static bool
-unibyte_word_constituent (unsigned char c)
+unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
{
- return mbrtowc_cache[c] != WEOF && (isalnum (c) || (c) == '_');
+ return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
}
static int
@@ -779,68 +786,11 @@ char_context (struct dfa const *dfa, unsigned char c)
{
if (c == dfa->syntax.eolbyte)
return CTX_NEWLINE;
- if (unibyte_word_constituent (c))
+ if (unibyte_word_constituent (dfa, c))
return CTX_LETTER;
return CTX_NONE;
}
-/* UTF-8 encoding allows some optimizations that we can't otherwise
- assume in a multibyte encoding. */
-static bool using_utf8;
-
-bool
-dfa_using_utf8 (void)
-{
- return using_utf8;
-}
-
-static void
-init_mbrtowc_cache (void)
-{
- int i;
- for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
- {
- char c = i;
- unsigned char uc = i;
- mbstate_t s = { 0 };
- wchar_t wc;
- mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF;
- }
-}
-
-/* Entry point to set syntax options. */
-void
-dfasyntax (struct dfa *dfa, reg_syntax_t bits, bool fold, unsigned char eol)
-{
- int i;
- dfa->syntax.syntax_bits_set = true;
- dfa->syntax.syntax_bits = bits;
- dfa->syntax.case_fold = fold;
- dfa->syntax.eolbyte = eol;
-
- for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
- {
- unsigned char uc = i;
-
- /* Use mbrtowc_cache to calculate sbit. */
- dfa->syntax.sbit[uc] = char_context (dfa, uc);
- switch (dfa->syntax.sbit[uc])
- {
- case CTX_LETTER:
- setbit (uc, dfa->syntax.letters);
- break;
- case CTX_NEWLINE:
- setbit (uc, dfa->syntax.newline);
- break;
- }
-
- /* POSIX requires that the five bytes in "\n\r./" (including the
- terminating NUL) cannot occur inside a multibyte character. */
- dfa->syntax.never_trail[uc] = (using_utf8 ? (uc & 0xc0) != 0x80
- : strchr ("\n\r./", uc) != NULL);
- }
-}
-
/* Set a bit in the charclass for the given wchar_t. Do nothing if WC
is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
this may happen when folding case in weird Turkish locales where
@@ -869,30 +819,10 @@ setbit_case_fold_c (int b, charclass c)
setbit (i, c);
}
-static void check_utf8 (void)
-{
- wchar_t wc;
- mbstate_t mbs = { 0 };
- using_utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
-}
-
-static bool unibyte_c;
-
-static void check_unibyte_c (void)
-{
- char const *locale = setlocale (LC_ALL, NULL);
- unibyte_c = (!locale
- || STREQ (locale, "C")
- || STREQ (locale, "POSIX"));
-}
-
-/* The current locale is known to be a unibyte locale
- without multicharacter collating sequences and where range
- comparisons simply use the native encoding. These locales can be
- processed more efficiently. */
+/* Return true if the locale compatible with the C locale. */
static bool
-using_simple_locale (struct dfa const *dfa)
+using_simple_locale (bool multibyte)
{
/* The native character set is known to be compatible with
the C locale. The following test isn't perfect, but it's good
@@ -910,7 +840,15 @@ using_simple_locale (struct dfa const *dfa)
&& '}' == 125 && '~' == 126)
};
- return (native_c_charset & !dfa->multibyte) | unibyte_c;
+ if (native_c_charset && !multibyte)
+ return true;
+ else
+ {
+ /* Treat C and POSIX locales as being compatible. Also, treat
+ errors as compatible, as these are invariably from stubs. */
+ char const *loc = setlocale (LC_ALL, NULL);
+ return !loc || STREQ (loc, "C") || STREQ (loc, "POSIX");
+ }
}
/* Fetch the next lexical input character. Set C (of type int) to the
@@ -946,53 +884,6 @@ using_simple_locale (struct dfa const *dfa)
# define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif
-/* The set of wchar_t values C such that there's a useful locale
- somewhere where C != towupper (C) && C != towlower (towupper (C)).
- For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
- towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
- towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */
-static short const lonesome_lower[] =
- {
- 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
- 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
-
- /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
- counterpart in locales predating Unicode 4.0.0 (April 2003). */
- 0x03F2,
-
- 0x03F5, 0x1E9B, 0x1FBE,
- };
-
-/* Maximum number of characters that can be the case-folded
- counterparts of a single character, not counting the character
- itself. This is 1 for towupper, 1 for towlower, and 1 for each
- entry in LONESOME_LOWER. */
-enum
-{ CASE_FOLDED_BUFSIZE = 2 + sizeof lonesome_lower / sizeof *lonesome_lower };
-
-/* Find the characters equal to C after case-folding, other than C
- itself, and store them into FOLDED. Return the number of characters
- stored. */
-static unsigned int
-case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
-{
- unsigned int i;
- unsigned int n = 0;
- wint_t uc = towupper (c);
- wint_t lc = towlower (uc);
- if (uc != c)
- folded[n++] = uc;
- if (lc != uc && lc != c && towupper (lc) == uc)
- folded[n++] = lc;
- for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
- {
- wint_t li = lonesome_lower[i];
- if (li != lc && li != uc && li != c && towupper (li) == uc)
- folded[n++] = li;
- }
- return n;
-}
-
typedef int predicate (int);
/* The following list maps the names of the Posix named character classes
@@ -1061,7 +952,7 @@ parse_bracket_exp (struct dfa *dfa)
size_t chars_al;
chars_al = 0;
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
{
dfa->mbcsets = maybe_realloc (dfa->mbcsets, dfa->nmbcsets,
&dfa->mbcsets_alloc,
@@ -1084,7 +975,7 @@ parse_bracket_exp (struct dfa *dfa)
{
FETCH_WC (dfa, c, wc, _("unbalanced ["));
invert = true;
- known_bracket_exp = using_simple_locale (dfa);
+ known_bracket_exp = dfa->simple_locale;
}
else
invert = false;
@@ -1139,7 +1030,7 @@ parse_bracket_exp (struct dfa *dfa)
if (!pred)
dfaerror (_("invalid character class"));
- if (dfa->multibyte && !pred->single_byte_only)
+ if (dfa->localeinfo.multibyte && !pred->single_byte_only)
known_bracket_exp = false;
else
for (c2 = 0; c2 < NOTCHAR; ++c2)
@@ -1199,9 +1090,9 @@ parse_bracket_exp (struct dfa *dfa)
/* Treat [x-y] as a range if x != y. */
if (wc != wc2 || wc == WEOF)
{
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
known_bracket_exp = false;
- else if (using_simple_locale (dfa))
+ else if (dfa->simple_locale)
{
int ci;
for (ci = c; ci <= c2; ci++)
@@ -1228,7 +1119,7 @@ parse_bracket_exp (struct dfa *dfa)
colon_warning_state |= (c == ':') ? 2 : 4;
- if (!dfa->multibyte)
+ if (!dfa->localeinfo.multibyte)
{
if (dfa->syntax.case_fold)
setbit_case_fold_c (c, ccl);
@@ -1265,22 +1156,22 @@ parse_bracket_exp (struct dfa *dfa)
if (! known_bracket_exp)
return BACKREF;
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
{
work_mbc->invert = invert;
- work_mbc->cset = emptyset (ccl) ? -1 : dfa_charclass_index (dfa, ccl);
+ work_mbc->cset = emptyset (ccl) ? -1 : charclass_index (dfa, ccl);
return MBCSET;
}
if (invert)
{
- assert (!dfa->multibyte);
+ assert (!dfa->localeinfo.multibyte);
notset (ccl);
if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
clrbit ('\n', ccl);
}
- return CSET + dfa_charclass_index (dfa, ccl);
+ return CSET + charclass_index (dfa, ccl);
}
struct lexptr
@@ -1535,7 +1426,7 @@ lex (struct dfa *dfa)
case '.':
if (backslash)
goto normal_char;
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
{
/* In multibyte environment period must match with a single
character not a byte. So we use ANYCHAR. */
@@ -1549,13 +1440,13 @@ lex (struct dfa *dfa)
if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
clrbit ('\0', ccl);
dfa->lex.laststart = false;
- return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+ return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
case 's':
case 'S':
if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
goto normal_char;
- if (!dfa->multibyte)
+ if (!dfa->localeinfo.multibyte)
{
zeroset (ccl);
for (c2 = 0; c2 < NOTCHAR; ++c2)
@@ -1564,7 +1455,7 @@ lex (struct dfa *dfa)
if (c == 'S')
notset (ccl);
dfa->lex.laststart = false;
- return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+ return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
}
/* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1588,16 +1479,16 @@ lex (struct dfa *dfa)
if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
goto normal_char;
- if (!dfa->multibyte)
+ if (!dfa->localeinfo.multibyte)
{
zeroset (ccl);
for (c2 = 0; c2 < NOTCHAR; ++c2)
- if (unibyte_word_constituent (c2))
+ if (unibyte_word_constituent (dfa, c2))
setbit (c2, ccl);
if (c == 'W')
notset (ccl);
dfa->lex.laststart = false;
- return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+ return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
}
/* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1627,14 +1518,14 @@ lex (struct dfa *dfa)
dfa->lex.laststart = false;
/* For multibyte character sets, folding is done in atom. Always
return WCHAR. */
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
return dfa->lex.lasttok = WCHAR;
if (dfa->syntax.case_fold && isalpha (c))
{
zeroset (ccl);
setbit_case_fold_c (c, ccl);
- return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl);
+ return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
}
return dfa->lex.lasttok = c;
@@ -1654,11 +1545,11 @@ addtok_mb (struct dfa *dfa, token t, int mbprop)
{
dfa->tokens = x2nrealloc (dfa->tokens, &dfa->talloc,
sizeof *dfa->tokens);
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
sizeof *dfa->multibyte_prop);
}
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
dfa->multibyte_prop[dfa->tindex] = mbprop;
dfa->tokens[dfa->tindex++] = t;
@@ -1695,7 +1586,7 @@ static void addtok_wc (struct dfa *dfa, wint_t wc);
static void
addtok (struct dfa *dfa, token t)
{
- if (dfa->multibyte && t == MBCSET)
+ if (dfa->localeinfo.multibyte && t == MBCSET)
{
bool need_or = false;
struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1];
@@ -1794,7 +1685,7 @@ add_utf8_anychar (struct dfa *dfa)
if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
clrbit ('\0', c);
}
- dfa->utf8_anychar_classes[i] = CSET + dfa_charclass_index (dfa, c);
+ dfa->utf8_anychar_classes[i] = CSET + charclass_index (dfa, c);
}
/* A valid UTF-8 character is
@@ -1878,7 +1769,7 @@ atom (struct dfa *dfa)
dfa->parse.tok = lex (dfa);
}
- else if (dfa->parse.tok == ANYCHAR && using_utf8)
+ else if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
{
/* For UTF-8 expand the period to a series of CSETs that define a valid
UTF-8 character. This avoids using the slow multibyte path. I'm
@@ -1939,7 +1830,7 @@ copytoks (struct dfa *dfa, size_t tindex, size_t ntokens)
{
size_t i;
- if (dfa->multibyte)
+ if (dfa->localeinfo.multibyte)
for (i = 0; i < ntokens; ++i)
addtok_mb (dfa, dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]);
else
@@ -2025,7 +1916,7 @@ dfaparse (char const *s, size_t len, struct dfa *d)
d->lex.lasttok = END;
d->lex.laststart = true;
d->lex.parens = 0;
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
{
d->lex.cur_mb_len = 0;
memset (&d->mbs, 0, sizeof d->mbs);
@@ -2214,7 +2105,7 @@ state_index (struct dfa *d, position_set const *s, int context)
}
else if (d->tokens[s->elems[j].index] == BACKREF)
constraint = NO_CONSTRAINT;
- if (d->multibyte && d->tokens[s->elems[j].index] == ANYCHAR)
+ if (d->localeinfo.multibyte && d->tokens[s->elems[j].index] == ANYCHAR)
{
int acceptable
= ((SUCCEEDS_IN_CONTEXT (c, context, CTX_NEWLINE)
@@ -2691,7 +2582,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
setbit (d->tokens[pos.index], matches);
else if (d->tokens[pos.index] >= CSET)
copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
- else if (d->multibyte && d->tokens[pos.index] == ANYCHAR)
+ else if (d->localeinfo.multibyte && d->tokens[pos.index] == ANYCHAR)
{
/* ANYCHAR must match a single character, so put it to
D->states[s].mbps which contains the positions which can
@@ -2837,7 +2728,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
state_letter = state;
for (i = 0; i < NOTCHAR; ++i)
- trans[i] = unibyte_word_constituent (i) ? state_letter : state;
+ trans[i] = unibyte_word_constituent (d, i) ? state_letter : state;
trans[d->syntax.eolbyte] = state_newline;
}
else
@@ -2854,7 +2745,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k)
insert (d->follows[grps[i].elems[j]].elems[k], &follows);
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
{
/* If a token in follows.elems is not 1st byte of a multibyte
character, or the states of follows must accept the bytes
@@ -2887,7 +2778,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
/* If we are building a searching matcher, throw in the positions
of state 0 as well. */
- if (d->searchflag && (!d->multibyte || !next_isnt_1st_byte))
+ if (d->searchflag && (!d->localeinfo.multibyte || !next_isnt_1st_byte))
{
merge (&d->states[0].elems, &follows, &tmp);
copy (&tmp, &follows);
@@ -2943,7 +2834,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
if (c == d->syntax.eolbyte)
trans[c] = state_newline;
- else if (unibyte_word_constituent (c))
+ else if (unibyte_word_constituent (d, c))
trans[c] = state_letter;
else if (c < NOTCHAR)
trans[c] = state;
@@ -2984,7 +2875,7 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state)
d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
{
realtrans = d->mb_trans ? d->mb_trans - 1 : NULL;
realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
@@ -2996,7 +2887,7 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state)
{
d->trans[oldalloc] = NULL;
d->fails[oldalloc] = NULL;
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
d->mb_trans[oldalloc] = NULL;
}
}
@@ -3030,7 +2921,7 @@ build_state (state_num s, struct dfa *d)
}
d->trcount = d->min_trcount;
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
{
for (i = d->min_trcount; i < d->tralloc; i++)
{
@@ -3481,7 +3372,7 @@ dfaexec_noop (struct dfa *d, char const *begin, char *end,
return (char *) begin;
}
-/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->multibyte),
+/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
but faster and set *BACKREF if the DFA code does not support this
regexp usage. */
@@ -3539,7 +3430,7 @@ dfa_supported (struct dfa const *d)
case ENDWORD:
case LIMWORD:
case NOTLIMWORD:
- if (!d->multibyte)
+ if (!d->localeinfo.multibyte)
continue;
/* fallthrough */
@@ -3557,7 +3448,7 @@ dfaoptimize (struct dfa *d)
size_t i;
bool have_backref = false;
- if (!using_utf8)
+ if (!d->localeinfo.using_utf8)
return;
for (i = 0; i < d->tindex; ++i)
@@ -3587,7 +3478,7 @@ dfaoptimize (struct dfa *d)
}
free_mbdata (d);
- d->multibyte = false;
+ d->localeinfo.multibyte = false;
d->dfaexec = dfaexec_sb;
d->fast = true;
}
@@ -3602,7 +3493,7 @@ dfassbuild (struct dfa *d)
struct dfa *sup = dfaalloc ();
*sup = *d;
- sup->multibyte = false;
+ sup->localeinfo.multibyte = false;
sup->dfaexec = dfaexec_sb;
sup->multibyte_prop = NULL;
sup->mbcsets = NULL;
@@ -3635,7 +3526,7 @@ dfassbuild (struct dfa *d)
case BACKREF:
zeroset (ccl);
notset (ccl);
- sup->tokens[j++] = CSET + dfa_charclass_index (sup, ccl);
+ sup->tokens[j++] = CSET + charclass_index (sup, ccl);
sup->tokens[j++] = STAR;
if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
|| d->tokens[i + 1] == PLUS)
@@ -3646,7 +3537,7 @@ dfassbuild (struct dfa *d)
case ENDWORD:
case LIMWORD:
case NOTLIMWORD:
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
{
/* These constraints aren't supported in a multibyte locale.
Ignore them in the superset DFA. */
@@ -3663,7 +3554,7 @@ dfassbuild (struct dfa *d)
}
sup->tindex = j;
- if (have_nchar && (have_achar || d->multibyte))
+ if (have_nchar && (have_achar || d->localeinfo.multibyte))
d->superset = sup;
else
{
@@ -3705,7 +3596,7 @@ dfafree (struct dfa *d)
free (d->charclasses);
free (d->tokens);
- if (d->multibyte)
+ if (d->localeinfo.multibyte)
free_mbdata (d);
for (i = 0; i < d->sindex; ++i)
@@ -4227,20 +4118,49 @@ dfamustfree (struct dfamust *dm)
struct dfa *
dfaalloc (void)
{
- struct dfa *d = xzalloc (sizeof *d);
- d->multibyte = MB_CUR_MAX > 1;
- d->dfaexec = d->multibyte ? dfaexec_mb : dfaexec_sb;
- d->fast = !d->multibyte;
- d->lex.cur_mb_len = 1;
- return d;
+ return xmalloc (sizeof (struct dfa));
}
+/* Initialize DFA. */
void
-dfa_init (void)
+dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
+ reg_syntax_t bits, bool fold, unsigned char eol)
{
- check_utf8 ();
- check_unibyte_c ();
- init_mbrtowc_cache ();
+ int i;
+ memset (dfa, 0, offsetof (struct dfa, dfaexec));
+ dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
+ dfa->simple_locale = using_simple_locale (linfo->multibyte);
+ dfa->localeinfo = *linfo;
+
+ dfa->fast = !dfa->localeinfo.multibyte;
+
+ dfa->lex.cur_mb_len = 1;
+ dfa->syntax.syntax_bits_set = true;
+ dfa->syntax.syntax_bits = bits;
+ dfa->syntax.case_fold = fold;
+ dfa->syntax.eolbyte = eol;
+
+ for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
+ {
+ unsigned char uc = i;
+
+ dfa->syntax.sbit[uc] = char_context (dfa, uc);
+ switch (dfa->syntax.sbit[uc])
+ {
+ case CTX_LETTER:
+ setbit (uc, dfa->syntax.letters);
+ break;
+ case CTX_NEWLINE:
+ setbit (uc, dfa->syntax.newline);
+ break;
+ }
+
+ /* POSIX requires that the five bytes in "\n\r./" (including the
+ terminating NUL) cannot occur inside a multibyte character. */
+ dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
+ ? (uc & 0xc0) != 0x80
+ : strchr ("\n\r./", uc) != NULL);
+ }
}
/* vim:set shiftwidth=2: */