diff options
author | Arnold D. Robbins <arnold@skeeve.com> | 2016-08-25 21:40:11 +0300 |
---|---|---|
committer | Arnold D. Robbins <arnold@skeeve.com> | 2016-08-25 21:40:11 +0300 |
commit | 7453c813457583197fcf0fe1c7d2301d6013bfea (patch) | |
tree | ecaa73572d5f486bf64c1fdf000d7b3944277a82 | |
parent | d60bf1935df309eea0bcc87ec542030a5b022f35 (diff) | |
download | egawk-7453c813457583197fcf0fe1c7d2301d6013bfea.tar.gz egawk-7453c813457583197fcf0fe1c7d2301d6013bfea.tar.bz2 egawk-7453c813457583197fcf0fe1c7d2301d6013bfea.zip |
Remove avoid_dfa. Simplify dfa usage and rearrange callers in re.c.
-rw-r--r-- | ChangeLog | 15 | ||||
-rw-r--r-- | awk.h | 11 | ||||
-rw-r--r-- | interpret.h | 18 | ||||
-rw-r--r-- | re.c | 55 |
4 files changed, 35 insertions, 64 deletions
```diff
@@ -1,3 +1,18 @@
+2016-08-25 Norihiro Tanaka <noritnk@kcn.ne.jp>
+
+	* awk.h (struct Regexp): Remove dfa. Now dfareg instead of it. All
+	referers changed.
+	* re.c (research): Arrange caller of dfaexec and research.
+	* (avoid_dfa): Removed. All callers changed.
+	* awk.h (avoid_dfa): Removed.
+
+	Other changes by Arnold Robbins:
+
+	* awk.h (struct Regexp): Change various boolean members to bool.
+	(RE_NO_FLAGS): New #define.
+	* interpret.h: Use RE_NO_FLAGS instead of zero.
+	* re.c (research): Prettify the logic a little bit.
+
 2016-08-25 Arnold D. Robbins <arnold@skeeve.com>
 
 	* dfa.c: Sync with grep.
@@ -206,11 +206,10 @@ typedef struct Regexp {
 	struct re_pattern_buffer pat;
 	struct re_registers regs;
 	struct dfa *dfareg;
-	short dfa;
-	short has_anchor; /* speed up of avoid_dfa kludge, temporary */
-	short non_empty; /* for use in fpat_parse_field */
-	short has_meta; /* re has meta chars so (probably) isn't simple string */
-	short maybe_long; /* re has meta chars that can match long text */
+	bool has_anchor; /* re has anchors which dfa avoids */
+	bool non_empty; /* for use in fpat_parse_field */
+	bool has_meta; /* re has meta chars so (probably) isn't simple string */
+	bool maybe_long; /* re has meta chars that can match long text */
 } Regexp;
 #define RESTART(rp,s) (rp)->regs.start[0]
 #define REEND(rp,s) (rp)->regs.end[0]
@@ -219,6 +218,7 @@ typedef struct Regexp {
 #define NUMSUBPATS(rp,s) (rp)->regs.num_regs
 
 /* regexp matching flags: */
+#define RE_NO_FLAGS 0 /* empty flags */
 #define RE_NEED_START 1 /* need to know start/end of match */
 #define RE_NO_BOL 2 /* not allowed to match ^ in regexp */
 
@@ -1650,7 +1650,6 @@ extern void reg_error(const char *s);
 extern Regexp *re_update(NODE *t);
 extern void resyntax(int syntax);
 extern void resetup(void);
-extern int avoid_dfa(NODE *re, char *str, size_t len);
 extern int reisstring(const char *text, size_t len, Regexp *re, const char *buf);
 extern int get_numbase(const char *str, bool use_locale);
 
diff --git a/interpret.h b/interpret.h
index 6b832c16..4b8dc472 100644
--- a/interpret.h
+++ b/interpret.h
@@ -832,8 +832,7 @@ mod:
 				t2 = TOP_SCALAR(); /* switch expression */
 				t2 = force_string(t2);
 				rp = re_update(m);
-				di = (research(rp, t2->stptr, 0, t2->stlen,
-							avoid_dfa(m, t2->stptr, t2->stlen)) >= 0);
+				di = (research(rp, t2->stptr, 0, t2->stlen, RE_NO_FLAGS) >= 0);
 			} else {
 				t1 = POP_SCALAR(); /* case value */
 				t2 = TOP_SCALAR(); /* switch expression */
@@ -996,20 +995,7 @@ arrayfor:
 			t1 = *get_field(0, (Func_ptr *) 0);
 match_re:
 			rp = re_update(m);
-			/*
-			 * Any place where research() is called with a last parameter of
-			 * zero, we need to use the avoid_dfa test. This appears here and
-			 * in the code for Op_K_case.
-			 *
-			 * A new or improved dfa that distinguishes beginning/end of
-			 * string from beginning/end of line will allow us to get rid of
-			 * this hack.
-			 *
-			 * The avoid_dfa() function is in re.c; it is not very smart.
-			 */
-
-			di = research(rp, t1->stptr, 0, t1->stlen,
-								avoid_dfa(m, t1->stptr, t1->stlen));
+			di = research(rp, t1->stptr, 0, t1->stlen, RE_NO_FLAGS);
 			di = (di == -1) ^ (op != Op_nomatch);
 			if (op != Op_match_rec) {
 				decr_sp();
@@ -170,7 +170,6 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
 
 	emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
 	memset((char *) rp, 0, sizeof(*rp));
-	rp->dfareg = NULL;
 	rp->pat.allocated = 0; /* regex will allocate the buffer */
 	emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
 
@@ -223,12 +222,11 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
 	/* gack. this must be done *after* re_compile_pattern */
 	rp->pat.newline_anchor = false; /* don't get \n in middle of string */
 	if (dfa && ! no_dfa) {
-		rp->dfa = true;
 		rp->dfareg = dfaalloc();
 		dfasyntax(rp->dfareg, dfa_syn, ignorecase, '\n');
 		dfacomp(buf, len, rp->dfareg, true);
 	} else
-		rp->dfa = false;
+		rp->dfareg = NULL;
 	rp->has_anchor = has_anchor;
 
 	/* Additional flags that help with RS as regexp. */
@@ -278,26 +276,25 @@ research(Regexp *rp, char *str, int start,
 	 * starts in the middle of a string, so don't bother trying it
 	 * in that case.
 	 */
-	if (rp->dfa && ! no_bol && start == 0) {
-		char save;
-		size_t count = 0;
+	if (rp->dfareg != NULL && ! no_bol && start == 0) {
 		struct dfa *superset = dfasuperset(rp->dfareg);
-		/*
-		 * dfa likes to stick a '\n' right after the matched
-		 * text. So we just save and restore the character.
-		 */
-		save = str[start+len];
 		if (superset)
 			ret = dfaexec(superset, str+start, str+start+len,
 							true, NULL, NULL);
-		if (ret)
+
+		if (ret && ((! need_start && ! rp->has_anchor)
+				|| (! superset && dfaisfast(rp->dfareg))))
 			ret = dfaexec(rp->dfareg, str+start, str+start+len,
-						true, &count, &try_backref);
-		str[start+len] = save;
+						true, NULL, &try_backref);
 	}
 
 	if (ret) {
-		if (need_start || rp->dfa == false || try_backref) {
+		if ( rp->dfareg == NULL
+		    || start != 0
+		    || no_bol
+		    || need_start
+		    || rp->has_anchor
+		    || try_backref) {
 			/*
 			 * Passing NULL as last arg speeds up search for cases
 			 * where we don't need the start/end info.
@@ -326,7 +323,7 @@ refree(Regexp *rp)
 		free(rp->regs.start);
 	if (rp->regs.end)
 		free(rp->regs.end);
-	if (rp->dfa) {
+	if (rp->dfareg != NULL) {
 		dfafree(rp->dfareg);
 		free(rp->dfareg);
 	}
@@ -425,32 +422,6 @@ resetup()
 	dfa_init();
 }
 
-/* avoid_dfa --- return true if we should not use the DFA matcher */
-
-int
-avoid_dfa(NODE *re, char *str, size_t len)
-{
-	char *end;
-
-	/*
-	 * f = @/.../
-	 * if ("foo" ~ f) ...
-	 *
-	 * This creates a Node_dynregex with NULL re_reg.
-	 */
-	if (re->re_reg == NULL)
-		return false;
-
-	if (! re->re_reg->has_anchor)
-		return false;
-
-	for (end = str + len; str < end; str++)
-		if (*str == '\n')
-			return true;
-
-	return false;
-}
-
 /* reisstring --- return true if the RE match is a simple string match */
 
 int
```