aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorArnold D. Robbins <arnold@skeeve.com>2016-08-25 21:40:11 +0300
committerArnold D. Robbins <arnold@skeeve.com>2016-08-25 21:40:11 +0300
commit7453c813457583197fcf0fe1c7d2301d6013bfea (patch)
treeecaa73572d5f486bf64c1fdf000d7b3944277a82
parentd60bf1935df309eea0bcc87ec542030a5b022f35 (diff)
downloadegawk-7453c813457583197fcf0fe1c7d2301d6013bfea.tar.gz
egawk-7453c813457583197fcf0fe1c7d2301d6013bfea.tar.bz2
egawk-7453c813457583197fcf0fe1c7d2301d6013bfea.zip
Remove avoid_dfa. Simplify dfa usage and rearrange callers in re.c.
-rw-r--r--ChangeLog15
-rw-r--r--awk.h11
-rw-r--r--interpret.h18
-rw-r--r--re.c55
4 files changed, 35 insertions, 64 deletions
diff --git a/ChangeLog b/ChangeLog
index 7f607883..3e87c20d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,18 @@
+2016-08-25 Norihiro Tanaka <noritnk@kcn.ne.jp>
+
+ * awk.h (struct Regexp): Remove dfa. Now dfareg instead of it. All
+ referers changed.
+ * re.c (research): Arrange caller of dfaexec and research.
+ * (avoid_dfa): Removed. All callers changed.
+ * awk.h (avoid_dfa): Removed.
+
+ Other changes by Arnold Robbins:
+
+ * awk.h (struct Regexp): Change various boolean members to bool.
+ (RE_NO_FLAGS): New #define.
+ * interpret.h: Use RE_NO_FLAGS instead of zero.
+ * re.c (research): Prettify the logic a little bit.
+
2016-08-25 Arnold D. Robbins <arnold@skeeve.com>
* dfa.c: Sync with grep.
diff --git a/awk.h b/awk.h
index 5587cbc3..d8b5b8d4 100644
--- a/awk.h
+++ b/awk.h
@@ -206,11 +206,10 @@ typedef struct Regexp {
struct re_pattern_buffer pat;
struct re_registers regs;
struct dfa *dfareg;
- short dfa;
- short has_anchor; /* speed up of avoid_dfa kludge, temporary */
- short non_empty; /* for use in fpat_parse_field */
- short has_meta; /* re has meta chars so (probably) isn't simple string */
- short maybe_long; /* re has meta chars that can match long text */
+ bool has_anchor; /* re has anchors which dfa avoids */
+ bool non_empty; /* for use in fpat_parse_field */
+ bool has_meta; /* re has meta chars so (probably) isn't simple string */
+ bool maybe_long; /* re has meta chars that can match long text */
} Regexp;
#define RESTART(rp,s) (rp)->regs.start[0]
#define REEND(rp,s) (rp)->regs.end[0]
@@ -219,6 +218,7 @@ typedef struct Regexp {
#define NUMSUBPATS(rp,s) (rp)->regs.num_regs
/* regexp matching flags: */
+#define RE_NO_FLAGS 0 /* empty flags */
#define RE_NEED_START 1 /* need to know start/end of match */
#define RE_NO_BOL 2 /* not allowed to match ^ in regexp */
@@ -1650,7 +1650,6 @@ extern void reg_error(const char *s);
extern Regexp *re_update(NODE *t);
extern void resyntax(int syntax);
extern void resetup(void);
-extern int avoid_dfa(NODE *re, char *str, size_t len);
extern int reisstring(const char *text, size_t len, Regexp *re, const char *buf);
extern int get_numbase(const char *str, bool use_locale);
diff --git a/interpret.h b/interpret.h
index 6b832c16..4b8dc472 100644
--- a/interpret.h
+++ b/interpret.h
@@ -832,8 +832,7 @@ mod:
t2 = TOP_SCALAR(); /* switch expression */
t2 = force_string(t2);
rp = re_update(m);
- di = (research(rp, t2->stptr, 0, t2->stlen,
- avoid_dfa(m, t2->stptr, t2->stlen)) >= 0);
+ di = (research(rp, t2->stptr, 0, t2->stlen, RE_NO_FLAGS) >= 0);
} else {
t1 = POP_SCALAR(); /* case value */
t2 = TOP_SCALAR(); /* switch expression */
@@ -996,20 +995,7 @@ arrayfor:
t1 = *get_field(0, (Func_ptr *) 0);
match_re:
rp = re_update(m);
- /*
- * Any place where research() is called with a last parameter of
- * zero, we need to use the avoid_dfa test. This appears here and
- * in the code for Op_K_case.
- *
- * A new or improved dfa that distinguishes beginning/end of
- * string from beginning/end of line will allow us to get rid of
- * this hack.
- *
- * The avoid_dfa() function is in re.c; it is not very smart.
- */
-
- di = research(rp, t1->stptr, 0, t1->stlen,
- avoid_dfa(m, t1->stptr, t1->stlen));
+ di = research(rp, t1->stptr, 0, t1->stlen, RE_NO_FLAGS);
di = (di == -1) ^ (op != Op_nomatch);
if (op != Op_match_rec) {
decr_sp();
diff --git a/re.c b/re.c
index c7899694..b11a6984 100644
--- a/re.c
+++ b/re.c
@@ -170,7 +170,6 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
memset((char *) rp, 0, sizeof(*rp));
- rp->dfareg = NULL;
rp->pat.allocated = 0; /* regex will allocate the buffer */
emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
@@ -223,12 +222,11 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
/* gack. this must be done *after* re_compile_pattern */
rp->pat.newline_anchor = false; /* don't get \n in middle of string */
if (dfa && ! no_dfa) {
- rp->dfa = true;
rp->dfareg = dfaalloc();
dfasyntax(rp->dfareg, dfa_syn, ignorecase, '\n');
dfacomp(buf, len, rp->dfareg, true);
} else
- rp->dfa = false;
+ rp->dfareg = NULL;
rp->has_anchor = has_anchor;
/* Additional flags that help with RS as regexp. */
@@ -278,26 +276,25 @@ research(Regexp *rp, char *str, int start,
* starts in the middle of a string, so don't bother trying it
* in that case.
*/
- if (rp->dfa && ! no_bol && start == 0) {
- char save;
- size_t count = 0;
+ if (rp->dfareg != NULL && ! no_bol && start == 0) {
struct dfa *superset = dfasuperset(rp->dfareg);
- /*
- * dfa likes to stick a '\n' right after the matched
- * text. So we just save and restore the character.
- */
- save = str[start+len];
if (superset)
ret = dfaexec(superset, str+start, str+start+len,
true, NULL, NULL);
- if (ret)
+
+ if (ret && ((! need_start && ! rp->has_anchor)
+ || (! superset && dfaisfast(rp->dfareg))))
ret = dfaexec(rp->dfareg, str+start, str+start+len,
- true, &count, &try_backref);
- str[start+len] = save;
+ true, NULL, &try_backref);
}
if (ret) {
- if (need_start || rp->dfa == false || try_backref) {
+ if ( rp->dfareg == NULL
+ || start != 0
+ || no_bol
+ || need_start
+ || rp->has_anchor
+ || try_backref) {
/*
* Passing NULL as last arg speeds up search for cases
* where we don't need the start/end info.
@@ -326,7 +323,7 @@ refree(Regexp *rp)
free(rp->regs.start);
if (rp->regs.end)
free(rp->regs.end);
- if (rp->dfa) {
+ if (rp->dfareg != NULL) {
dfafree(rp->dfareg);
free(rp->dfareg);
}
@@ -425,32 +422,6 @@ resetup()
dfa_init();
}
-/* avoid_dfa --- return true if we should not use the DFA matcher */
-
-int
-avoid_dfa(NODE *re, char *str, size_t len)
-{
- char *end;
-
- /*
- * f = @/.../
- * if ("foo" ~ f) ...
- *
- * This creates a Node_dynregex with NULL re_reg.
- */
- if (re->re_reg == NULL)
- return false;
-
- if (! re->re_reg->has_anchor)
- return false;
-
- for (end = str + len; str < end; str++)
- if (*str == '\n')
- return true;
-
- return false;
-}
-
/* reisstring --- return true if the RE match is a simple string match */
int