From 3775eb7b3afe49d6b39e5a907747c23de8b5f42e Mon Sep 17 00:00:00 2001 From: Kaz Kylheku Date: Sat, 14 Sep 2024 20:50:03 -0700 Subject: read-until-match: fix regression. Commit 9aa751c8a4f845ef2d2bba091c81ffeded941afd broke things. This fix affects the functions read-until-match, scan-until-match and count-until-match, which share an implementation. * regex.c (scan_until_common): In the REGM_MATCH_DONE and REGM_MATCH cases, we must push the character onto the local stack, before doing the match = stack assignment. Otherwise, it's possible that the stack is empty and so no match is recorded. The REGM_FAIL case will then behave as if no match was found, consuming a character and continuing. * txr.1: Codify an existing behavior: only non-empty matches for the regex are considered by read-until-match. * tests/015/regex.tl: New file. I am amazed to discover that we don't seem to have a test suite for regexes at all. Putting the tests here which confirm this fix and provide coverage for some edge cases in read-until-match. 
--- regex.c | 2 ++ tests/015/regex.tl | 18 ++++++++++++++++++ txr.1 | 3 +-- 3 files changed, 21 insertions(+), 2 deletions(-) create mode 100644 tests/015/regex.tl diff --git a/regex.c b/regex.c index aaddec64..9437881e 100644 --- a/regex.c +++ b/regex.c @@ -3234,9 +3234,11 @@ static val scan_until_common(val self, val regex, val stream_in, regex_machine_reset(&regm); continue; case REGM_MATCH_DONE: + push(ch, &stack); match = stack; goto out_match; case REGM_MATCH: + push(ch, &stack); match = stack; continue; case REGM_INCOMPLETE: diff --git a/tests/015/regex.tl b/tests/015/regex.tl new file mode 100644 index 00000000..68058eea --- /dev/null +++ b/tests/015/regex.tl @@ -0,0 +1,18 @@ +(load "../common") + +(defun rum (str regex : include-match) + (with-in-string-stream (s str) + (list (read-until-match regex s include-match) + (read-until-match regex s include-match)))) + +(mtest + (rum "a-b" #/-/) ("a" "b") + (rum "a-b" #/-/ t) ("a-" "b") + (rum "a----b" #/-+/) ("a" "b") + (rum "a----b" #/-+/ t) ("a----" "b") + (rum "a----b" #/-*/) ("a" "b") + (rum "a----b" #/-*/ t) ("a----" "b") + (rum "abc" #/-/) ("abc" nil) + (rum "abc" #/-/ t) ("abc" nil) + (rum "a___b___#c" #/_+#/) ("a___b" "c") + (rum "a___b___#c" #/_+#/ t) ("a___b___#" "c")) diff --git a/txr.1 b/txr.1 index 277673c7..128fcebf 100644 --- a/txr.1 +++ b/txr.1 @@ -56676,10 +56676,9 @@ matched by is included in the returned string. It defaults to .codn nil . -The accumulation of characters is terminated by a match on +The accumulation of characters is terminated by a non-empty match on .metn regex , the end of the stream, or an error. - This means that characters are read from the stream and accumulated while the stream has more characters available, and while its prefix does not match .metn regex . -- cgit v1.2.3