From 4fbc51dadaeb3d25887cec5bf824b8992a960b02 Mon Sep 17 00:00:00 2001 From: Kaz Kylheku Date: Mon, 3 Oct 2016 06:47:29 -0700 Subject: search-regex improvement: negative start and more. * regex.c (search_regex): Handle negative starting positions according to the convention elsewhere and fail excessively negative ones. Consistently fail on starting positions exceeding the length of the string. Handle zero length matches by reporting them against the start position or position one past the last character, based on the value of from-end. * txr.1: search-regex documentation updated. --- regex.c | 92 +++++++++++++++++++++++++++++++++++++---------------------------- txr.1 | 66 ++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 110 insertions(+), 48 deletions(-) diff --git a/regex.c b/regex.c index 80923521..ed2fc2d5 100644 --- a/regex.c +++ b/regex.c @@ -2394,62 +2394,74 @@ static regm_result_t regex_machine_feed(regex_machine_t *regm, wchar_t ch) val search_regex(val haystack, val needle_regex, val start, val from_end) { + val slen = nil; start = default_arg(start, zero); from_end = default_bool_arg(from_end); - if (length_str_lt(haystack, start)) { - return nil; + if (minusp(start)) { + slen = length_str(haystack); + start = plus(start, slen); + if (minusp(start)) + start = zero; + } + + if (from_end) { + cnum i; + cnum s = c_num(start); + const wchar_t *h = c_str(haystack); + + slen = (slen ? 
slen : length_str(haystack)); + + if (regex_run(needle_regex, L"") >= 0) + return cons(slen, zero); + + for (i = c_num(slen) - 1; i >= s; i--) { + cnum span = regex_run(needle_regex, h + i); + if (span >= 0) + return cons(num(i), num(span)); + } + + gc_hint(haystack); } else { - if (from_end) { - cnum i; - cnum s = c_num(start); - const wchar_t *h = c_str(haystack); - - for (i = c_num(length_str(haystack)) - 1; i >= s; i--) { - cnum span = regex_run(needle_regex, h + i); - if (span >= 0) - return cons(num(i), num(span)); - } + regex_machine_t regm; + val i, pos = start, retval; + regm_result_t last_res = REGM_INCOMPLETE; - gc_hint(haystack); - } else { - regex_machine_t regm; - val i, pos = start, retval; - regm_result_t last_res = REGM_INCOMPLETE; + if (length_str_lt(haystack, pos)) + return nil; - regex_machine_init(&regm, needle_regex); + regex_machine_init(&regm, needle_regex); again: - for (i = pos; length_str_gt(haystack, i); i = plus(i, one)) { - last_res = regex_machine_feed(&regm, c_chr(chr_str(haystack, i))); + for (i = pos; length_str_gt(haystack, i); i = plus(i, one)) { + last_res = regex_machine_feed(&regm, c_chr(chr_str(haystack, i))); + if (last_res == REGM_FAIL) { + last_res = regex_machine_feed(&regm, 0); if (last_res == REGM_FAIL) { - last_res = regex_machine_feed(&regm, 0); - if (last_res == REGM_FAIL) { - regex_machine_reset(&regm); - pos = plus(pos, one); - goto again; - } - break; + regex_machine_reset(&regm); + pos = plus(pos, one); + goto again; } + break; } + } - last_res = regex_machine_feed(&regm, 0); + last_res = regex_machine_feed(&regm, 0); - switch (last_res) { - case REGM_INCOMPLETE: - case REGM_MATCH: - retval = cons(pos, num(regex_machine_match_span(&regm))); - regex_machine_cleanup(&regm); - return retval; - case REGM_FAIL: - regex_machine_cleanup(&regm); - return nil; - } + switch (last_res) { + case REGM_INCOMPLETE: + case REGM_MATCH: + retval = cons(pos, num(regex_machine_match_span(&regm))); + regex_machine_cleanup(&regm); + return retval; + case REGM_FAIL: + 
regex_machine_cleanup(&regm); + return nil; } - - return nil; } + + return nil; } val range_regex(val haystack, val needle_regex, val start, diff --git a/txr.1 b/txr.1 index 37c5a4a8..ba023aa2 100644 --- a/txr.1 +++ b/txr.1 @@ -32162,6 +32162,7 @@ at position .meta start for a match for .metn regex . + If .meta start is omitted, the search starts at position 0. If @@ -32169,9 +32170,37 @@ is omitted, the search starts at position 0. If is specified and has a .cod2 non- nil value, the search -proceeds in reverse, from the last position in the string, toward +proceeds in reverse, from the position just beyond the last character of +.metn string , +toward .metn start . -This function returns + +if +.meta start +exceeds the length of the string, then +.code search-regex +returns +.codn nil . + +If +.meta start +is negative then it indicates positions from the end of the string, +such that -1 is the last character, -2 the second last and so forth. +If the value is so negative that it refers beyond the start of +the string, then the starting position is deemed to be zero. + +If +.meta start +is equal to the length of +.metn string , +and thus refers to the position one character past its +length, then a match occurs at that position if +.meta regex +admits such a match. + +The +.code search-regex +function returns .code nil if no match is found, otherwise it returns a cons, whose @@ -32180,6 +32209,18 @@ indicates the position of the match, and whose .code cdr indicates the length of the match. +If +.meta regex +is capable of matching empty strings, and no other kind of match +is found within +.metn string , +then search regex reports a zero length match. If +.meta from-end +is false, then this match is reported at +.metn start , +otherwise it is reported at the position one character beyond +the end of the string. 
+ The .code range-regex function is similar to @@ -32733,13 +32774,22 @@ argument defaults to the length of so that the end position coincides with the end of the string. -A value in either parameter which is excessively -negative or positive, such that it indexes before -the start of the string or exceeds its length -results in a failed match and consequently -.code nil +With one exception, a value in either parameter which is excessively negative +or positive, such that it indexes before the start of the string or exceeds its +length results in a failed match and consequently +.codn nil +being returned. The exception is that the +.code rr +function permits a negative +.meta position +value which refers before the start of the string; this is effectively +treated as zero. + +The +.meta from-end +argument defaults to +.codn nil . -being returned. The .code r^$ function tests whether the entire portion of -- cgit v1.2.3