Remove avoid_dfa. Simplify dfa usage and rearrange callers in re.c.

author: Arnold D. Robbins <arnold@skeeve.com> 2016-08-25 21:40:11 +0300
committer: Arnold D. Robbins <arnold@skeeve.com> 2016-08-25 21:40:11 +0300
commit: 7453c813457583197fcf0fe1c7d2301d6013bfea (patch)
tree: ecaa73572d5f486bf64c1fdf000d7b3944277a82
parent: d60bf1935df309eea0bcc87ec542030a5b022f35 (diff)
download: egawk-7453c813457583197fcf0fe1c7d2301d6013bfea.tar.gz
egawk-7453c813457583197fcf0fe1c7d2301d6013bfea.tar.bz2
egawk-7453c813457583197fcf0fe1c7d2301d6013bfea.zip
4 files changed, 35 insertions, 64 deletions
diff --git a/ChangeLog b/ChangeLog
index 7f607883..3e87c20d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,18 @@
+2016-08-25         Norihiro Tanaka      <noritnk@kcn.ne.jp>
+
+	* awk.h (struct Regexp): Remove dfa member; dfareg is now used
+	instead.  All references changed.
+	* re.c (research): Arrange caller of dfaexec and research.
+	(avoid_dfa): Removed.  All callers changed.
+	* awk.h (avoid_dfa): Removed.
+
+	Other changes by Arnold Robbins:
+
+	* awk.h (struct Regexp): Change various boolean members to bool.
+	(RE_NO_FLAGS): New #define.
+	* interpret.h: Use RE_NO_FLAGS instead of zero.
+	* re.c (research): Prettify the logic a little bit.
+
 2016-08-25         Arnold D. Robbins     <arnold@skeeve.com>
 
 	* dfa.c: Sync with grep.
diff --git a/awk.h b/awk.h
index 5587cbc3..d8b5b8d4 100644
--- a/awk.h
+++ b/awk.h
@@ -206,11 +206,10 @@ typedef struct Regexp {
 	struct re_pattern_buffer pat;
 	struct re_registers regs;
 	struct dfa *dfareg;
-	short dfa;
-	short has_anchor;	/* speed up of avoid_dfa kludge, temporary */
-	short non_empty;	/* for use in fpat_parse_field */
-	short has_meta;		/* re has meta chars so (probably) isn't simple string */
-	short maybe_long;	/* re has meta chars that can match long text */
+	bool has_anchor;	/* re has anchors which dfa avoids */
+	bool non_empty;		/* for use in fpat_parse_field */
+	bool has_meta;		/* re has meta chars so (probably) isn't simple string */
+	bool maybe_long;	/* re has meta chars that can match long text */
 } Regexp;
 #define	RESTART(rp,s)	(rp)->regs.start[0]
 #define	REEND(rp,s)	(rp)->regs.end[0]
@@ -219,6 +218,7 @@ typedef struct Regexp {
 #define	NUMSUBPATS(rp,s)	(rp)->regs.num_regs
 
 /* regexp matching flags: */
+#define RE_NO_FLAGS	0	/* empty flags */
 #define RE_NEED_START	1	/* need to know start/end of match */
 #define RE_NO_BOL	2	/* not allowed to match ^ in regexp */
 
@@ -1650,7 +1650,6 @@ extern void reg_error(const char *s);
 extern Regexp *re_update(NODE *t);
 extern void resyntax(int syntax);
 extern void resetup(void);
-extern int avoid_dfa(NODE *re, char *str, size_t len);
 extern int reisstring(const char *text, size_t len, Regexp *re, const char *buf);
 extern int get_numbase(const char *str, bool use_locale);
 
diff --git a/interpret.h b/interpret.h
index 6b832c16..4b8dc472 100644
--- a/interpret.h
+++ b/interpret.h
@@ -832,8 +832,7 @@ mod:
 				t2 = TOP_SCALAR();	/* switch expression */
 				t2 = force_string(t2);
 				rp = re_update(m);
-				di = (research(rp, t2->stptr, 0, t2->stlen,
-							avoid_dfa(m, t2->stptr, t2->stlen)) >= 0);
+				di = (research(rp, t2->stptr, 0, t2->stlen, RE_NO_FLAGS) >= 0);
 			} else {
 				t1 = POP_SCALAR();	/* case value */
 				t2 = TOP_SCALAR();	/* switch expression */
@@ -996,20 +995,7 @@ arrayfor:
 			t1 = *get_field(0, (Func_ptr *) 0);
 match_re:
 			rp = re_update(m);
-			/*
-			 * Any place where research() is called with a last parameter of
-			 * zero, we need to use the avoid_dfa test. This appears here and
-			 * in the code for Op_K_case.
-			 *
-			 * A new or improved dfa that distinguishes beginning/end of
-			 * string from beginning/end of line will allow us to get rid of
-			 * this hack.
-			 *
-			 * The avoid_dfa() function is in re.c; it is not very smart.
-			 */
-
-			di = research(rp, t1->stptr, 0, t1->stlen,
-								avoid_dfa(m, t1->stptr, t1->stlen));
+			di = research(rp, t1->stptr, 0, t1->stlen, RE_NO_FLAGS);
 			di = (di == -1) ^ (op != Op_nomatch);
 			if (op != Op_match_rec) {
 				decr_sp();
diff --git a/re.c b/re.c
index c7899694..b11a6984 100644
--- a/re.c
+++ b/re.c
@@ -170,7 +170,6 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
 
 	emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
 	memset((char *) rp, 0, sizeof(*rp));
-	rp->dfareg = NULL;
 	rp->pat.allocated = 0;	/* regex will allocate the buffer */
 	emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
 
@@ -223,12 +222,11 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
 	/* gack. this must be done *after* re_compile_pattern */
 	rp->pat.newline_anchor = false; /* don't get \n in middle of string */
 	if (dfa && ! no_dfa) {
-		rp->dfa = true;
 		rp->dfareg = dfaalloc();
 		dfasyntax(rp->dfareg, dfa_syn, ignorecase, '\n');
 		dfacomp(buf, len, rp->dfareg, true);
 	} else
-		rp->dfa = false;
+		rp->dfareg = NULL;
 	rp->has_anchor = has_anchor;
 
 	/* Additional flags that help with RS as regexp. */
@@ -278,26 +276,25 @@ research(Regexp *rp, char *str, int start,
 	 * starts in the middle of a string, so don't bother trying it
 	 * in that case.
 	 */
-	if (rp->dfa && ! no_bol && start == 0) {
-		char save;
-		size_t count = 0;
+	if (rp->dfareg != NULL && ! no_bol && start == 0) {
 		struct dfa *superset = dfasuperset(rp->dfareg);
-		/*
-		 * dfa likes to stick a '\n' right after the matched
-		 * text.  So we just save and restore the character.
-		 */
-		save = str[start+len];
 		if (superset)
 			ret = dfaexec(superset, str+start, str+start+len,
 							true, NULL, NULL);
-		if (ret)
+
+		if (ret && ((! need_start && ! rp->has_anchor)
+				|| (! superset && dfaisfast(rp->dfareg))))
 			ret = dfaexec(rp->dfareg, str+start, str+start+len,
-						true, &count, &try_backref);
-		str[start+len] = save;
+						true, NULL, &try_backref);
 	}
 
 	if (ret) {
-		if (need_start || rp->dfa == false || try_backref) {
+		if (   rp->dfareg == NULL
+			|| start != 0
+			|| no_bol
+			|| need_start
+			|| rp->has_anchor
+			|| try_backref) {
 			/*
 			 * Passing NULL as last arg speeds up search for cases
 			 * where we don't need the start/end info.
@@ -326,7 +323,7 @@ refree(Regexp *rp)
 		free(rp->regs.start);
 	if (rp->regs.end)
 		free(rp->regs.end);
-	if (rp->dfa) {
+	if (rp->dfareg != NULL) {
 		dfafree(rp->dfareg);
 		free(rp->dfareg);
 	}
@@ -425,32 +422,6 @@ resetup()
 	dfa_init();
 }
 
-/* avoid_dfa --- return true if we should not use the DFA matcher */
-
-int
-avoid_dfa(NODE *re, char *str, size_t len)
-{
-	char *end;
-
-	/*
-	 * f = @/.../
-	 * if ("foo" ~ f) ...
-	 *
-	 * This creates a Node_dynregex with NULL re_reg.
-	 */
-	if (re->re_reg == NULL)
-		return false;
-
-	if (! re->re_reg->has_anchor)
-		return false;
-
-	for (end = str + len; str < end; str++)
-		if (*str == '\n')
-			return true;
-
-	return false;
-}
-
 /* reisstring --- return true if the RE match is a simple string match */
 
 int
author	Arnold D. Robbins <arnold@skeeve.com>	2016-08-25 21:40:11 +0300
committer	Arnold D. Robbins <arnold@skeeve.com>	2016-08-25 21:40:11 +0300
commit	7453c813457583197fcf0fe1c7d2301d6013bfea (patch)
tree	ecaa73572d5f486bf64c1fdf000d7b3944277a82
parent	d60bf1935df309eea0bcc87ec542030a5b022f35 (diff)
download	egawk-7453c813457583197fcf0fe1c7d2301d6013bfea.tar.gz egawk-7453c813457583197fcf0fe1c7d2301d6013bfea.tar.bz2 egawk-7453c813457583197fcf0fe1c7d2301d6013bfea.zip