diff options
author | Arnold D. Robbins <arnold@skeeve.com> | 2015-08-02 20:41:39 +0300 |
---|---|---|
committer | Arnold D. Robbins <arnold@skeeve.com> | 2015-08-02 20:41:39 +0300 |
commit | a90865d0db6d60847272d46020a14aad9901f304 (patch) | |
tree | 10d821fddcfe87ca7ec5f3786fe769234b13f00d /dfa.c | |
parent | b64f39e810edd1c9e1884a7a410bfc278ed31eda (diff) | |
parent | c5137ae530c49765049adb53777c79ebb7607ebe (diff) | |
download | egawk-a90865d0db6d60847272d46020a14aad9901f304.tar.gz egawk-a90865d0db6d60847272d46020a14aad9901f304.tar.bz2 egawk-a90865d0db6d60847272d46020a14aad9901f304.zip |
Merge branch 'gawk-4.1-stable'
Diffstat (limited to 'dfa.c')
-rw-r--r-- | dfa.c | 103 |
1 files changed, 62 insertions, 41 deletions
@@ -309,8 +309,6 @@ typedef struct size_t hash; /* Hash of the positions of this state. */ position_set elems; /* Positions this state could match. */ unsigned char context; /* Context from previous state. */ - bool has_backref; /* This state matches a \<digit>. */ - bool has_mbcset; /* This state matches a MBCSET. */ unsigned short constraint; /* Constraint for this state to accept. */ token first_end; /* Token value of the first END in elems. */ position_set mbps; /* Positions which can match multibyte @@ -2195,8 +2193,6 @@ state_index (struct dfa *d, position_set const *s, int context) alloc_position_set (&d->states[i].elems, s->nelem); copy (s, &d->states[i].elems); d->states[i].context = context; - d->states[i].has_backref = false; - d->states[i].has_mbcset = false; d->states[i].constraint = 0; d->states[i].first_end = 0; d->states[i].mbps.nelem = 0; @@ -2212,10 +2208,7 @@ state_index (struct dfa *d, position_set const *s, int context) d->states[i].first_end = d->tokens[s->elems[j].index]; } else if (d->tokens[s->elems[j].index] == BACKREF) - { - d->states[i].constraint = NO_CONSTRAINT; - d->states[i].has_backref = true; - } + d->states[i].constraint = NO_CONSTRAINT; ++d->sindex; @@ -2674,9 +2667,6 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) if (d->tokens[pos.index] == MBCSET || d->tokens[pos.index] == ANYCHAR) { - /* MB_CUR_MAX > 1 */ - if (d->tokens[pos.index] == MBCSET) - d->states[s].has_mbcset = true; /* ANYCHAR and MBCSET must match with a single character, so we must put it to d->states[s].mbps, which contains the positions which can match with a single character not a byte. */ @@ -3388,15 +3378,18 @@ skip_remains_mb (struct dfa *d, unsigned char const *p, When ALLOW_NL is nonzero, newlines may appear in the matching string. If COUNT is non-NULL, increment *COUNT once for each newline processed. Finally, if BACKREF is non-NULL set *BACKREF to indicate whether we - encountered a back-reference (1) or not (0). The caller may use this - to decide whether to fall back on a backtracking matcher. - - If MULTIBYTE, the input consists of multibyte characters and/or - encoding-error bytes. Otherwise, the input consists of single-byte - characters. */ + encountered a DFA-unfriendly construct. The caller may use this to + decide whether to fall back on a matcher like regex. If MULTIBYTE, + the input consists of multibyte characters and/or encoding-error bytes. + Otherwise, the input consists of single-byte characters. + Here is the list of features that make this DFA matcher punt: + - [M-N]-range-in-MB-locale: regex is up to 25% faster on [a-z] + - back-reference: (.)\1 + - word-delimiter-in-MB-locale: \<, \>, \b + */ static inline char * -dfaexec_main (struct dfa *d, char const *begin, char *end, - int allow_nl, size_t *count, int *backref, bool multibyte) +dfaexec_main (struct dfa *d, char const *begin, char *end, int allow_nl, + size_t *count, bool multibyte) { state_num s, s1; /* Current state. */ unsigned char const *p, *mbp; /* Current input character. */ @@ -3486,16 +3479,6 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, Use a macro to avoid the risk that they diverge. */ #define State_transition() \ do { \ - /* Falling back to the glibc matcher in this case gives \ - better performance (up to 25% better on [a-z], for \ - example) and enables support for collating symbols and \ - equivalence classes. */ \ - if (d->states[s].has_mbcset && backref) \ - { \ - *backref = 1; \ - goto done; \ - } \ - \ /* Can match with a multibyte character (and multi-character \ collating element). Transition table might be updated. */ \ s = transit_state (d, s, &p, (unsigned char *) end); \ @@ -3569,11 +3552,7 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, if (d->fails[s]) { if (d->success[s] & sbit[*p]) - { - if (backref) - *backref = d->states[s].has_backref; - goto done; - } + goto done; s1 = s; if (multibyte) @@ -3603,14 +3582,24 @@ static char * dfaexec_mb (struct dfa *d, char const *begin, char *end, int allow_nl, size_t *count, int *backref) { - return dfaexec_main (d, begin, end, allow_nl, count, backref, true); + return dfaexec_main (d, begin, end, allow_nl, count, true); } static char * dfaexec_sb (struct dfa *d, char const *begin, char *end, int allow_nl, size_t *count, int *backref) { - return dfaexec_main (d, begin, end, allow_nl, count, backref, false); + return dfaexec_main (d, begin, end, allow_nl, count, false); +} + +/* Always set *BACKREF and return BEGIN. Use this wrapper for + any regexp that uses a construct not supported by this code. */ +static char * +dfaexec_noop (struct dfa *d, char const *begin, char *end, + int allow_nl, size_t *count, int *backref) +{ + *backref = 1; + return (char *) begin; } /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, BACKREF, D->multibyte), @@ -3676,6 +3665,31 @@ dfainit (struct dfa *d) d->fast = !d->multibyte; } +/* Return true if every construct in D is supported by this DFA matcher. */ +static bool _GL_ATTRIBUTE_PURE +dfa_supported (struct dfa const *d) +{ + size_t i; + for (i = 0; i < d->tindex; i++) + { + switch (d->tokens[i]) + { + case BEGWORD: + case ENDWORD: + case LIMWORD: + case NOTLIMWORD: + if (!d->multibyte) + continue; + /* fallthrough */ + + case BACKREF: + case MBCSET: + return false; + } + } + return true; +} + static void dfaoptimize (struct dfa *d) { @@ -3773,10 +3787,8 @@ dfassbuild (struct dfa *d) if (d->multibyte) { /* These constraints aren't supported in a multibyte locale. - Ignore them in the superset DFA, and treat them as - backreferences in the main DFA. */ + Ignore them in the superset DFA. */ sup->tokens[j++] = EMPTY; - d->tokens[i] = BACKREF; break; } default: @@ -3806,8 +3818,17 @@ dfacomp (char const *s, size_t len, struct dfa *d, int searchflag) dfambcache (d); dfaparse (s, len, d); dfassbuild (d); - dfaoptimize (d); - dfaanalyze (d, searchflag); + + if (dfa_supported (d)) + { + dfaoptimize (d); + dfaanalyze (d, searchflag); + } + else + { + d->dfaexec = dfaexec_noop; + } + if (d->superset) { d->fast = true; |