diff options
author | Arnold D. Robbins <arnold@skeeve.com> | 2016-09-01 20:46:12 +0300 |
---|---|---|
committer | Arnold D. Robbins <arnold@skeeve.com> | 2016-09-01 20:46:12 +0300 |
commit | b02f580f06996bd88f741f9c7330aff79216a169 (patch) | |
tree | 7bf557815e612b7bc1f9ea9cbc2dfc66781e175e /dfa.c | |
parent | af43bad53b2f05ba0d4403a59433f587a1e32b22 (diff) | |
download | egawk-b02f580f06996bd88f741f9c7330aff79216a169.tar.gz egawk-b02f580f06996bd88f741f9c7330aff79216a169.tar.bz2 egawk-b02f580f06996bd88f741f9c7330aff79216a169.zip |
Merge multithreaded dfa into gawk.
Diffstat (limited to 'dfa.c')
-rw-r--r-- | dfa.c | 316 |
1 files changed, 118 insertions, 198 deletions
@@ -69,6 +69,8 @@ #include "dfa.h" +#include "localeinfo.h" + #ifdef GAWK static int is_blank (int c) @@ -445,14 +447,9 @@ struct dfa size_t nregexps; /* Count of parallel regexps being built with dfaparse. */ bool fast; /* The DFA is fast. */ - bool multibyte; /* MB_CUR_MAX > 1. */ token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */ mbstate_t mbs; /* Multibyte conversion state. */ - /* dfaexec implementation. */ - char *(*dfaexec) (struct dfa *, char const *, char *, - bool, size_t *, bool *); - /* The following are valid only if MB_CUR_MAX > 1. */ /* The value of multibyte_prop[i] is defined by following rule. @@ -538,6 +535,21 @@ struct dfa state_num **mb_trans; /* Transition tables for states with ANYCHAR. */ state_num mb_trcount; /* Number of transition tables for states with ANYCHAR that have actually been built. */ + + /* Information derived from the locale. This is at the end so that + a quick memset need not clear it specially. */ + + /* dfaexec implementation. */ + char *(*dfaexec) (struct dfa *, char const *, char *, + bool, size_t *, bool *); + + /* The locale is simple, like the C locale. These locales can be + processed more efficiently, e.g., the relationship between lower- + and upper-case letters is 1-1. */ + bool simple_locale; + + /* Other cached information derived from the locale. */ + struct localeinfo localeinfo; }; /* Some macros for user access to dfa internals. */ @@ -551,13 +563,8 @@ struct dfa static void regexp (struct dfa *dfa); -/* A table indexed by byte values that contains the corresponding wide - character (if any) for that byte. WEOF means the byte is not a - valid single-byte character. */ -static wint_t mbrtowc_cache[NOTCHAR]; - /* Store into *PWC the result of converting the leading bytes of the - multibyte buffer S of length N bytes, using the mbrtowc_cache in *D + multibyte buffer S of length N bytes, using D->localeinfo.sbctowc and updating the conversion state in *D. On conversion error, convert just a single byte, to WEOF. Return the number of bytes converted. @@ -566,7 +573,7 @@ static wint_t mbrtowc_cache[NOTCHAR]; * PWC points to wint_t, not to wchar_t. * The last arg is a dfa *D instead of merely a multibyte conversion - state D->mbs. D also contains an mbrtowc_cache for speed. + state D->mbs. * N must be at least 1. * S[N - 1] must be a sentinel byte. * Shift encodings are not supported. @@ -577,7 +584,7 @@ static size_t mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d) { unsigned char uc = s[0]; - wint_t wc = mbrtowc_cache[uc]; + wint_t wc = d->localeinfo.sbctowc[uc]; if (wc == WEOF) { @@ -754,7 +761,7 @@ maybe_realloc (void *ptr, size_t nitems, size_t *nalloc, size_t itemsize) /* In DFA D, find the index of charclass S, or allocate a new one. */ static size_t -dfa_charclass_index (struct dfa *d, charclass const s) +charclass_index (struct dfa *d, charclass const s) { size_t i; @@ -769,9 +776,9 @@ dfa_charclass_index (struct dfa *d, charclass const s) } static bool -unibyte_word_constituent (unsigned char c) +unibyte_word_constituent (struct dfa const *dfa, unsigned char c) { - return mbrtowc_cache[c] != WEOF && (isalnum (c) || (c) == '_'); + return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_'); } static int @@ -779,68 +786,11 @@ char_context (struct dfa const *dfa, unsigned char c) { if (c == dfa->syntax.eolbyte) return CTX_NEWLINE; - if (unibyte_word_constituent (c)) + if (unibyte_word_constituent (dfa, c)) return CTX_LETTER; return CTX_NONE; } -/* UTF-8 encoding allows some optimizations that we can't otherwise - assume in a multibyte encoding. */ -static bool using_utf8; - -bool -dfa_using_utf8 (void) -{ - return using_utf8; -} - -static void -init_mbrtowc_cache (void) -{ - int i; - for (i = CHAR_MIN; i <= CHAR_MAX; ++i) - { - char c = i; - unsigned char uc = i; - mbstate_t s = { 0 }; - wchar_t wc; - mbrtowc_cache[uc] = mbrtowc (&wc, &c, 1, &s) <= 1 ? wc : WEOF; - } -} - -/* Entry point to set syntax options. */ -void -dfasyntax (struct dfa *dfa, reg_syntax_t bits, bool fold, unsigned char eol) -{ - int i; - dfa->syntax.syntax_bits_set = true; - dfa->syntax.syntax_bits = bits; - dfa->syntax.case_fold = fold; - dfa->syntax.eolbyte = eol; - - for (i = CHAR_MIN; i <= CHAR_MAX; ++i) - { - unsigned char uc = i; - - /* Use mbrtowc_cache to calculate sbit. */ - dfa->syntax.sbit[uc] = char_context (dfa, uc); - switch (dfa->syntax.sbit[uc]) - { - case CTX_LETTER: - setbit (uc, dfa->syntax.letters); - break; - case CTX_NEWLINE: - setbit (uc, dfa->syntax.newline); - break; - } - - /* POSIX requires that the five bytes in "\n\r./" (including the - terminating NUL) cannot occur inside a multibyte character. */ - dfa->syntax.never_trail[uc] = (using_utf8 ? (uc & 0xc0) != 0x80 - : strchr ("\n\r./", uc) != NULL); - } -} - /* Set a bit in the charclass for the given wchar_t. Do nothing if WC is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1, this may happen when folding case in weird Turkish locales where @@ -869,30 +819,10 @@ setbit_case_fold_c (int b, charclass c) setbit (i, c); } -static void check_utf8 (void) -{ - wchar_t wc; - mbstate_t mbs = { 0 }; - using_utf8 = mbrtowc (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100; -} - -static bool unibyte_c; - -static void check_unibyte_c (void) -{ - char const *locale = setlocale (LC_ALL, NULL); - unibyte_c = (!locale - || STREQ (locale, "C") - || STREQ (locale, "POSIX")); -} - -/* The current locale is known to be a unibyte locale - without multicharacter collating sequences and where range - comparisons simply use the native encoding. These locales can be - processed more efficiently. */ +/* Return true if the locale compatible with the C locale. */ static bool -using_simple_locale (struct dfa const *dfa) +using_simple_locale (bool multibyte) { /* The native character set is known to be compatible with the C locale. The following test isn't perfect, but it's good @@ -910,7 +840,15 @@ using_simple_locale (struct dfa const *dfa) && '}' == 125 && '~' == 126) }; - return (native_c_charset & !dfa->multibyte) | unibyte_c; + if (native_c_charset && !multibyte) + return true; + else + { + /* Treat C and POSIX locales as being compatible. Also, treat + errors as compatible, as these are invariably from stubs. */ + char const *loc = setlocale (LC_ALL, NULL); + return !loc || STREQ (loc, "C") || STREQ (loc, "POSIX"); + } } /* Fetch the next lexical input character. Set C (of type int) to the @@ -946,53 +884,6 @@ using_simple_locale (struct dfa const *dfa) # define MIN(a,b) ((a) < (b) ? (a) : (b)) #endif -/* The set of wchar_t values C such that there's a useful locale - somewhere where C != towupper (C) && C != towlower (towupper (C)). - For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because - towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and - towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU). */ -static short const lonesome_lower[] = - { - 0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345, - 0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1, - - /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase - counterpart in locales predating Unicode 4.0.0 (April 2003). */ - 0x03F2, - - 0x03F5, 0x1E9B, 0x1FBE, - }; - -/* Maximum number of characters that can be the case-folded - counterparts of a single character, not counting the character - itself. This is 1 for towupper, 1 for towlower, and 1 for each - entry in LONESOME_LOWER. */ -enum -{ CASE_FOLDED_BUFSIZE = 2 + sizeof lonesome_lower / sizeof *lonesome_lower }; - -/* Find the characters equal to C after case-folding, other than C - itself, and store them into FOLDED. Return the number of characters - stored. */ -static unsigned int -case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE]) -{ - unsigned int i; - unsigned int n = 0; - wint_t uc = towupper (c); - wint_t lc = towlower (uc); - if (uc != c) - folded[n++] = uc; - if (lc != uc && lc != c && towupper (lc) == uc) - folded[n++] = lc; - for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++) - { - wint_t li = lonesome_lower[i]; - if (li != lc && li != uc && li != c && towupper (li) == uc) - folded[n++] = li; - } - return n; -} - typedef int predicate (int); /* The following list maps the names of the Posix named character classes @@ -1061,7 +952,7 @@ parse_bracket_exp (struct dfa *dfa) size_t chars_al; chars_al = 0; - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) { dfa->mbcsets = maybe_realloc (dfa->mbcsets, dfa->nmbcsets, &dfa->mbcsets_alloc, @@ -1084,7 +975,7 @@ parse_bracket_exp (struct dfa *dfa) { FETCH_WC (dfa, c, wc, _("unbalanced [")); invert = true; - known_bracket_exp = using_simple_locale (dfa); + known_bracket_exp = dfa->simple_locale; } else invert = false; @@ -1139,7 +1030,7 @@ parse_bracket_exp (struct dfa *dfa) if (!pred) dfaerror (_("invalid character class")); - if (dfa->multibyte && !pred->single_byte_only) + if (dfa->localeinfo.multibyte && !pred->single_byte_only) known_bracket_exp = false; else for (c2 = 0; c2 < NOTCHAR; ++c2) @@ -1199,9 +1090,9 @@ parse_bracket_exp (struct dfa *dfa) /* Treat [x-y] as a range if x != y. */ if (wc != wc2 || wc == WEOF) { - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) known_bracket_exp = false; - else if (using_simple_locale (dfa)) + else if (dfa->simple_locale) { int ci; for (ci = c; ci <= c2; ci++) @@ -1228,7 +1119,7 @@ parse_bracket_exp (struct dfa *dfa) colon_warning_state |= (c == ':') ? 2 : 4; - if (!dfa->multibyte) + if (!dfa->localeinfo.multibyte) { if (dfa->syntax.case_fold) setbit_case_fold_c (c, ccl); @@ -1265,22 +1156,22 @@ parse_bracket_exp (struct dfa *dfa) if (! known_bracket_exp) return BACKREF; - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) { work_mbc->invert = invert; - work_mbc->cset = emptyset (ccl) ? -1 : dfa_charclass_index (dfa, ccl); + work_mbc->cset = emptyset (ccl) ? -1 : charclass_index (dfa, ccl); return MBCSET; } if (invert) { - assert (!dfa->multibyte); + assert (!dfa->localeinfo.multibyte); notset (ccl); if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE) clrbit ('\n', ccl); } - return CSET + dfa_charclass_index (dfa, ccl); + return CSET + charclass_index (dfa, ccl); } struct lexptr @@ -1535,7 +1426,7 @@ lex (struct dfa *dfa) case '.': if (backslash) goto normal_char; - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) { /* In multibyte environment period must match with a single character not a byte. So we use ANYCHAR. */ @@ -1549,13 +1440,13 @@ lex (struct dfa *dfa) if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL) clrbit ('\0', ccl); dfa->lex.laststart = false; - return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl); + return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl); case 's': case 'S': if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) goto normal_char; - if (!dfa->multibyte) + if (!dfa->localeinfo.multibyte) { zeroset (ccl); for (c2 = 0; c2 < NOTCHAR; ++c2) @@ -1564,7 +1455,7 @@ lex (struct dfa *dfa) if (c == 'S') notset (ccl); dfa->lex.laststart = false; - return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl); + return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl); } /* FIXME: see if optimizing this, as is done with ANYCHAR and @@ -1588,16 +1479,16 @@ lex (struct dfa *dfa) if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)) goto normal_char; - if (!dfa->multibyte) + if (!dfa->localeinfo.multibyte) { zeroset (ccl); for (c2 = 0; c2 < NOTCHAR; ++c2) - if (unibyte_word_constituent (c2)) + if (unibyte_word_constituent (dfa, c2)) setbit (c2, ccl); if (c == 'W') notset (ccl); dfa->lex.laststart = false; - return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl); + return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl); } /* FIXME: see if optimizing this, as is done with ANYCHAR and @@ -1627,14 +1518,14 @@ lex (struct dfa *dfa) dfa->lex.laststart = false; /* For multibyte character sets, folding is done in atom. Always return WCHAR. */ - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) return dfa->lex.lasttok = WCHAR; if (dfa->syntax.case_fold && isalpha (c)) { zeroset (ccl); setbit_case_fold_c (c, ccl); - return dfa->lex.lasttok = CSET + dfa_charclass_index (dfa, ccl); + return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl); } return dfa->lex.lasttok = c; @@ -1654,11 +1545,11 @@ addtok_mb (struct dfa *dfa, token t, int mbprop) { dfa->tokens = x2nrealloc (dfa->tokens, &dfa->talloc, sizeof *dfa->tokens); - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc, sizeof *dfa->multibyte_prop); } - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) dfa->multibyte_prop[dfa->tindex] = mbprop; dfa->tokens[dfa->tindex++] = t; @@ -1695,7 +1586,7 @@ static void addtok_wc (struct dfa *dfa, wint_t wc); static void addtok (struct dfa *dfa, token t) { - if (dfa->multibyte && t == MBCSET) + if (dfa->localeinfo.multibyte && t == MBCSET) { bool need_or = false; struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1]; @@ -1794,7 +1685,7 @@ add_utf8_anychar (struct dfa *dfa) if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL) clrbit ('\0', c); } - dfa->utf8_anychar_classes[i] = CSET + dfa_charclass_index (dfa, c); + dfa->utf8_anychar_classes[i] = CSET + charclass_index (dfa, c); } /* A valid UTF-8 character is @@ -1878,7 +1769,7 @@ atom (struct dfa *dfa) dfa->parse.tok = lex (dfa); } - else if (dfa->parse.tok == ANYCHAR && using_utf8) + else if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8) { /* For UTF-8 expand the period to a series of CSETs that define a valid UTF-8 character. This avoids using the slow multibyte path. I'm @@ -1939,7 +1830,7 @@ copytoks (struct dfa *dfa, size_t tindex, size_t ntokens) { size_t i; - if (dfa->multibyte) + if (dfa->localeinfo.multibyte) for (i = 0; i < ntokens; ++i) addtok_mb (dfa, dfa->tokens[tindex + i], dfa->multibyte_prop[tindex + i]); else @@ -2025,7 +1916,7 @@ dfaparse (char const *s, size_t len, struct dfa *d) d->lex.lasttok = END; d->lex.laststart = true; d->lex.parens = 0; - if (d->multibyte) + if (d->localeinfo.multibyte) { d->lex.cur_mb_len = 0; memset (&d->mbs, 0, sizeof d->mbs); @@ -2214,7 +2105,7 @@ state_index (struct dfa *d, position_set const *s, int context) } else if (d->tokens[s->elems[j].index] == BACKREF) constraint = NO_CONSTRAINT; - if (d->multibyte && d->tokens[s->elems[j].index] == ANYCHAR) + if (d->localeinfo.multibyte && d->tokens[s->elems[j].index] == ANYCHAR) { int acceptable = ((SUCCEEDS_IN_CONTEXT (c, context, CTX_NEWLINE) @@ -2691,7 +2582,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) setbit (d->tokens[pos.index], matches); else if (d->tokens[pos.index] >= CSET) copyset (d->charclasses[d->tokens[pos.index] - CSET], matches); - else if (d->multibyte && d->tokens[pos.index] == ANYCHAR) + else if (d->localeinfo.multibyte && d->tokens[pos.index] == ANYCHAR) { /* ANYCHAR must match a single character, so put it to D->states[s].mbps which contains the positions which can @@ -2837,7 +2728,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) state_letter = state; for (i = 0; i < NOTCHAR; ++i) - trans[i] = unibyte_word_constituent (i) ? state_letter : state; + trans[i] = unibyte_word_constituent (d, i) ? state_letter : state; trans[d->syntax.eolbyte] = state_newline; } else @@ -2854,7 +2745,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) for (k = 0; k < d->follows[grps[i].elems[j]].nelem; ++k) insert (d->follows[grps[i].elems[j]].elems[k], &follows); - if (d->multibyte) + if (d->localeinfo.multibyte) { /* If a token in follows.elems is not 1st byte of a multibyte character, or the states of follows must accept the bytes @@ -2887,7 +2778,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) /* If we are building a searching matcher, throw in the positions of state 0 as well. */ - if (d->searchflag && (!d->multibyte || !next_isnt_1st_byte)) + if (d->searchflag && (!d->localeinfo.multibyte || !next_isnt_1st_byte)) { merge (&d->states[0].elems, &follows, &tmp); copy (&tmp, &follows); @@ -2943,7 +2834,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) if (c == d->syntax.eolbyte) trans[c] = state_newline; - else if (unibyte_word_constituent (c)) + else if (unibyte_word_constituent (d, c)) trans[c] = state_letter; else if (c < NOTCHAR) trans[c] = state; @@ -2984,7 +2875,7 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state) d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails); d->success = xnrealloc (d->success, newalloc, sizeof *d->success); d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines); - if (d->multibyte) + if (d->localeinfo.multibyte) { realtrans = d->mb_trans ? d->mb_trans - 1 : NULL; realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans); @@ -2996,7 +2887,7 @@ realloc_trans_if_necessary (struct dfa *d, state_num new_state) { d->trans[oldalloc] = NULL; d->fails[oldalloc] = NULL; - if (d->multibyte) + if (d->localeinfo.multibyte) d->mb_trans[oldalloc] = NULL; } } @@ -3030,7 +2921,7 @@ build_state (state_num s, struct dfa *d) } d->trcount = d->min_trcount; - if (d->multibyte) + if (d->localeinfo.multibyte) { for (i = d->min_trcount; i < d->tralloc; i++) { @@ -3481,7 +3372,7 @@ dfaexec_noop (struct dfa *d, char const *begin, char *end, return (char *) begin; } -/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->multibyte), +/* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte), but faster and set *BACKREF if the DFA code does not support this regexp usage. */ @@ -3539,7 +3430,7 @@ dfa_supported (struct dfa const *d) case ENDWORD: case LIMWORD: case NOTLIMWORD: - if (!d->multibyte) + if (!d->localeinfo.multibyte) continue; /* fallthrough */ @@ -3557,7 +3448,7 @@ dfaoptimize (struct dfa *d) size_t i; bool have_backref = false; - if (!using_utf8) + if (!d->localeinfo.using_utf8) return; for (i = 0; i < d->tindex; ++i) @@ -3587,7 +3478,7 @@ dfaoptimize (struct dfa *d) } free_mbdata (d); - d->multibyte = false; + d->localeinfo.multibyte = false; d->dfaexec = dfaexec_sb; d->fast = true; } @@ -3602,7 +3493,7 @@ dfassbuild (struct dfa *d) struct dfa *sup = dfaalloc (); *sup = *d; - sup->multibyte = false; + sup->localeinfo.multibyte = false; sup->dfaexec = dfaexec_sb; sup->multibyte_prop = NULL; sup->mbcsets = NULL; @@ -3635,7 +3526,7 @@ dfassbuild (struct dfa *d) case BACKREF: zeroset (ccl); notset (ccl); - sup->tokens[j++] = CSET + dfa_charclass_index (sup, ccl); + sup->tokens[j++] = CSET + charclass_index (sup, ccl); sup->tokens[j++] = STAR; if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR || d->tokens[i + 1] == PLUS) @@ -3646,7 +3537,7 @@ dfassbuild (struct dfa *d) case ENDWORD: case LIMWORD: case NOTLIMWORD: - if (d->multibyte) + if (d->localeinfo.multibyte) { /* These constraints aren't supported in a multibyte locale. Ignore them in the superset DFA. */ @@ -3663,7 +3554,7 @@ dfassbuild (struct dfa *d) } sup->tindex = j; - if (have_nchar && (have_achar || d->multibyte)) + if (have_nchar && (have_achar || d->localeinfo.multibyte)) d->superset = sup; else { @@ -3705,7 +3596,7 @@ dfafree (struct dfa *d) free (d->charclasses); free (d->tokens); - if (d->multibyte) + if (d->localeinfo.multibyte) free_mbdata (d); for (i = 0; i < d->sindex; ++i) @@ -4227,20 +4118,49 @@ dfamustfree (struct dfamust *dm) struct dfa * dfaalloc (void) { - struct dfa *d = xzalloc (sizeof *d); - d->multibyte = MB_CUR_MAX > 1; - d->dfaexec = d->multibyte ? dfaexec_mb : dfaexec_sb; - d->fast = !d->multibyte; - d->lex.cur_mb_len = 1; - return d; + return xmalloc (sizeof (struct dfa)); } +/* Initialize DFA. */ void -dfa_init (void) +dfasyntax (struct dfa *dfa, struct localeinfo const *linfo, + reg_syntax_t bits, bool fold, unsigned char eol) { - check_utf8 (); - check_unibyte_c (); - init_mbrtowc_cache (); + int i; + memset (dfa, 0, offsetof (struct dfa, dfaexec)); + dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb; + dfa->simple_locale = using_simple_locale (linfo->multibyte); + dfa->localeinfo = *linfo; + + dfa->fast = !dfa->localeinfo.multibyte; + + dfa->lex.cur_mb_len = 1; + dfa->syntax.syntax_bits_set = true; + dfa->syntax.syntax_bits = bits; + dfa->syntax.case_fold = fold; + dfa->syntax.eolbyte = eol; + + for (i = CHAR_MIN; i <= CHAR_MAX; ++i) + { + unsigned char uc = i; + + dfa->syntax.sbit[uc] = char_context (dfa, uc); + switch (dfa->syntax.sbit[uc]) + { + case CTX_LETTER: + setbit (uc, dfa->syntax.letters); + break; + case CTX_NEWLINE: + setbit (uc, dfa->syntax.newline); + break; + } + + /* POSIX requires that the five bytes in "\n\r./" (including the + terminating NUL) cannot occur inside a multibyte character. */ + dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8 + ? (uc & 0xc0) != 0x80 + : strchr ("\n\r./", uc) != NULL); + } } /* vim:set shiftwidth=2: */ |