diff options
-rw-r--r-- | ChangeLog | 15 | ||||
-rw-r--r-- | eval.c | 2 | ||||
-rw-r--r-- | lib.c | 23 | ||||
-rw-r--r-- | lib.h | 1 | ||||
-rw-r--r-- | regex.c | 13 | ||||
-rw-r--r-- | regex.h | 1 | ||||
-rw-r--r-- | txr.1 | 30 |
7 files changed, 81 insertions, 4 deletions
@@ -1,5 +1,20 @@ 2014-06-26 Kaz Kylheku <kaz@kylheku.com> + * eval.c (eval_init): register range_regex and tok_where + as intrinsics. + + * lib.c (tok_where): New function. + + * lib.h (tok_where): Declared. + + * regex.c (range_regex): New function. + + * regex.h (range_regex): Declared. + + * txr.1: Documented tok-where and range-regex. + +2014-06-26 Kaz Kylheku <kaz@kylheku.com> + * lib.c (search): Bugfix in type mismatch error message: trying to print cons function pointer as value. @@ -3424,6 +3424,7 @@ void eval_init(void) reg_fun(intern(lit("regex-compile"), user_package), func_n2o(regex_compile, 1)); reg_fun(intern(lit("regexp"), user_package), func_n1(regexp)); reg_fun(intern(lit("search-regex"), user_package), func_n4o(search_regex, 2)); + reg_fun(intern(lit("range-regex"), user_package), func_n4o(range_regex, 2)); reg_fun(intern(lit("match-regex"), user_package), func_n3o(match_regex, 2)); reg_fun(intern(lit("match-regex-right"), user_package), func_n3o(match_regex_right, 2)); @@ -3532,6 +3533,7 @@ void eval_init(void) reg_fun(intern(lit("split-str"), user_package), func_n2(split_str)); reg_fun(intern(lit("split-str-set"), user_package), func_n2(split_str_set)); reg_fun(intern(lit("tok-str"), user_package), func_n3o(tok_str, 1)); + reg_fun(intern(lit("tok-where"), user_package), func_n2(tok_where)); reg_fun(intern(lit("list-str"), user_package), func_n1(list_str)); reg_fun(intern(lit("trim-str"), user_package), func_n1(trim_str)); reg_fun(intern(lit("cmp-str"), user_package), func_n2(cmp_str)); @@ -2562,6 +2562,29 @@ val tok_str(val str, val tok_regex, val keep_sep) return out; } +val tok_where(val str, val tok_regex) +{ + list_collect_decl (out, iter); + val pos = zero; + + for (;;) { + val range = range_regex(str, tok_regex, pos, nil); + cons_bind (match_start, match_end, range); + + if (!match_start) + break; + + iter = list_collect(iter, range); + + pos = match_end; + + if (numeq(match_end, match_start)) + pos = plus(pos, one); + } + + return out; +} + val list_str(val str) { const wchar_t *cstr = c_str(str); @@ -577,6 +577,7 @@ val cat_str(val list, val sep); val split_str(val str, val sep); val split_str_set(val str, val set); val tok_str(val str, val tok_regex, val keep_sep); +val tok_where(val str, val tok_regex); val list_str(val str); val trim_str(val str); val cmp_str(val astr, val bstr); @@ -1845,6 +1845,19 @@ again: } } +val range_regex(val haystack, val needle_regex, val start, + val from_end) +{ + val result = search_regex(haystack, needle_regex, start, from_end); + + if (result) { + cons_bind (pos, len, result); + rplacd(result, plus(pos, len)); + } + + return result; +} + val match_regex(val str, val reg, val pos) { regex_machine_t regm; @@ -30,6 +30,7 @@ extern val cspace_k, cdigit_k, cword_char_k; val regex_compile(val regex, val error_stream); val regexp(val); val search_regex(val haystack, val needle_regex, val start_num, val from_end); +val range_regex(val haystack, val needle_regex, val start_num, val from_end); val match_regex(val str, val regex, val pos); val match_regex_right(val str, val regex, val end); val regsub(val regex, val repl, val str); @@ -9302,12 +9302,13 @@ be separate gaps which come between empty strings. This operation is nondestructive: <string> is not modified in any way. -.SS Function tok-str +.SS Functions tok-str and tok-where .TP Syntax: (tok-str <string> <regex> [<keep-between>]) + (tok-where <string> <regex>) .TP Description: @@ -9329,7 +9330,20 @@ of tok-str changes in the following way. The pieces of <string> which are skipped by the search for tokens are included in the output. If no token is found in <string>, then a list of one element is returned, containing <string>. Generally, if N tokens are found, then the returned list consists of 2N + 1 -elements. The first element of the list is the (possibly empty) substring which had to be skipped to find the first token. Then the token follows. The next element is the next skipped substring and so on. The last element is the substring of <string> between the last token and the end. +elements. The first element of the list is the (possibly empty) substring which +had to be skipped to find the first token. Then the token follows. The next +element is the next skipped substring and so on. The last element is the +substring of <string> between the last token and the end. + +The tok-where function works similarly to tok-str, but instead of returning +the extracted tokens themselves, it returns a list of the character position +ranges within <string> where matches for <regex> occur. The ranges +are pairs of numbers, represented as cons cells, where the first number +of the pair gives the starting character position, and the second number +is one position past the end of the match. If a match is empty, then the +two numbers are equal. + +The tok-where function does not support the <keep-between> parameter. .SS Function list-str @@ -11233,12 +11247,13 @@ and error. .SH REGULAR EXPRESSION LIBRARY -.SS Function search-regex +.SS Functions search-regex and range-regex .TP Syntax: (search-regex <haystack-string> <needle-regex> [ <start> [<from-end>] ]) + (range-regex <haystack-string> <needle-regex> [ <start> [<from-end>] ]) .TP Description @@ -11249,7 +11264,14 @@ the search starts at position 0. If <from-end> is specified, the search proceeds in reverse, from the last position in the string, toward <start>. This function returns nil if no match is found, otherwise it returns a cons pair, whose car indicates the position of the match, and whose -cdr indicates the length of the match. +cdr indicates the length of the match. + +The range-regex function is similar to search-regex, except when +a match is found, it returns a position range, rather than a position +and length. A cons pair is returned whose car indicates the position +of the match, and whose cdr indicates the position one element past the +last character of the match. If the match is empty, the two integers +are equal. .SS Function match-regex |