From d5e21633e094e4f0a71939c65637d55aa7f9c536 Mon Sep 17 00:00:00 2001 From: Kaz Kylheku Date: Thu, 26 Jun 2014 07:27:46 -0700 Subject: * eval.c (eval_init): register range_regex and tok_where as intrinsics. * lib.c (tok_where): New function. * lib.h (tok_where): Declared. * regex.c (range_regex): New function. * regex.h (range_regex): Declared. * txr.1: Documented tok-where and range-regex. --- ChangeLog | 15 +++++++++++++++ eval.c | 2 ++ lib.c | 23 +++++++++++++++++++++++ lib.h | 1 + regex.c | 13 +++++++++++++ regex.h | 1 + txr.1 | 30 ++++++++++++++++++++++++++---- 7 files changed, 81 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index 269a9161..b5916346 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,18 @@ +2014-06-26 Kaz Kylheku + + * eval.c (eval_init): register range_regex and tok_where + as intrinsics. + + * lib.c (tok_where): New function. + + * lib.h (tok_where): Declared. + + * regex.c (range_regex): New function. + + * regex.h (range_regex): Declared. + + * txr.1: Documented tok-where and range-regex. 
+ 2014-06-26 Kaz Kylheku * lib.c (search): Bugfix in type mismatch error message: diff --git a/eval.c b/eval.c index f62f87c9..f3a9dd19 100644 --- a/eval.c +++ b/eval.c @@ -3424,6 +3424,7 @@ void eval_init(void) reg_fun(intern(lit("regex-compile"), user_package), func_n2o(regex_compile, 1)); reg_fun(intern(lit("regexp"), user_package), func_n1(regexp)); reg_fun(intern(lit("search-regex"), user_package), func_n4o(search_regex, 2)); + reg_fun(intern(lit("range-regex"), user_package), func_n4o(range_regex, 2)); reg_fun(intern(lit("match-regex"), user_package), func_n3o(match_regex, 2)); reg_fun(intern(lit("match-regex-right"), user_package), func_n3o(match_regex_right, 2)); @@ -3532,6 +3533,7 @@ void eval_init(void) reg_fun(intern(lit("split-str"), user_package), func_n2(split_str)); reg_fun(intern(lit("split-str-set"), user_package), func_n2(split_str_set)); reg_fun(intern(lit("tok-str"), user_package), func_n3o(tok_str, 1)); + reg_fun(intern(lit("tok-where"), user_package), func_n2(tok_where)); reg_fun(intern(lit("list-str"), user_package), func_n1(list_str)); reg_fun(intern(lit("trim-str"), user_package), func_n1(trim_str)); reg_fun(intern(lit("cmp-str"), user_package), func_n2(cmp_str)); diff --git a/lib.c b/lib.c index 197a89fb..3846d885 100644 --- a/lib.c +++ b/lib.c @@ -2562,6 +2562,29 @@ val tok_str(val str, val tok_regex, val keep_sep) return out; } +val tok_where(val str, val tok_regex) +{ + list_collect_decl (out, iter); + val pos = zero; + + for (;;) { + val range = range_regex(str, tok_regex, pos, nil); + cons_bind (match_start, match_end, range); + + if (!match_start) + break; + + iter = list_collect(iter, range); + + pos = match_end; + + if (numeq(match_end, match_start)) + pos = plus(pos, one); + } + + return out; +} + val list_str(val str) { const wchar_t *cstr = c_str(str); diff --git a/lib.h b/lib.h index 0ea13787..448447b4 100644 --- a/lib.h +++ b/lib.h @@ -577,6 +577,7 @@ val cat_str(val list, val sep); val split_str(val str, val sep); val 
split_str_set(val str, val set); val tok_str(val str, val tok_regex, val keep_sep); +val tok_where(val str, val tok_regex); val list_str(val str); val trim_str(val str); val cmp_str(val astr, val bstr); diff --git a/regex.c b/regex.c index 1d67d9de..af53021a 100644 --- a/regex.c +++ b/regex.c @@ -1845,6 +1845,19 @@ again: } } +val range_regex(val haystack, val needle_regex, val start, + val from_end) +{ + val result = search_regex(haystack, needle_regex, start, from_end); + + if (result) { + cons_bind (pos, len, result); + rplacd(result, plus(pos, len)); + } + + return result; +} + val match_regex(val str, val reg, val pos) { regex_machine_t regm; diff --git a/regex.h b/regex.h index 1c531cff..eafbc8cf 100644 --- a/regex.h +++ b/regex.h @@ -30,6 +30,7 @@ extern val cspace_k, cdigit_k, cword_char_k; val regex_compile(val regex, val error_stream); val regexp(val); val search_regex(val haystack, val needle_regex, val start_num, val from_end); +val range_regex(val haystack, val needle_regex, val start_num, val from_end); val match_regex(val str, val regex, val pos); val match_regex_right(val str, val regex, val end); val regsub(val regex, val repl, val str); diff --git a/txr.1 b/txr.1 index 279db6e6..c90771f0 100644 --- a/txr.1 +++ b/txr.1 @@ -9302,12 +9302,13 @@ be separate gaps which come between empty strings. This operation is nondestructive: <string> is not modified in any way. -.SS Function tok-str +.SS Functions tok-str and tok-where .TP Syntax: (tok-str <string> <regex> [<keep-between>]) + (tok-where <string> <regex>) .TP Description: @@ -9329,7 +9330,20 @@ of tok-str changes in the following way. The pieces of <string> which are skipped by the search for tokens are included in the output. If no token is found in <string>, then a list of one element is returned, containing <string>. Generally, if N tokens are found, then the returned list consists of 2N + 1 -elements. The first element of the list is the (possibly empty) substring which had to be skipped to find the first token. Then the token follows.
The next element is the next skipped substring and so on. The last element is the substring of <string> between the last token and the end. +elements. The first element of the list is the (possibly empty) substring which +had to be skipped to find the first token. Then the token follows. The next +element is the next skipped substring and so on. The last element is the +substring of <string> between the last token and the end. + +The tok-where function works similarly to tok-str, but instead of returning +the extracted tokens themselves, it returns a list of the character position +ranges within <string> where matches for <regex> occur. The ranges +are pairs of numbers, represented as cons cells, where the first number +of the pair gives the starting character position, and the second number +is one position past the end of the match. If a match is empty, then the +two numbers are equal. + +The tok-where function does not support the <keep-between> parameter. .SS Function list-str @@ -11233,12 +11247,13 @@ and error. .SH REGULAR EXPRESSION LIBRARY -.SS Function search-regex +.SS Functions search-regex and range-regex .TP Syntax: (search-regex <haystack-string> <needle-regex> [ <start> [<from-end>] ]) + (range-regex <haystack-string> <needle-regex> [ <start> [<from-end>] ]) .TP Description @@ -11249,7 +11264,14 @@ the search starts at position 0. If <from-end> is specified, the search proceeds in reverse, from the last position in the string, toward <start>. This function returns nil if no match is found, otherwise it returns a cons pair, whose car indicates the position of the match, and whose -cdr indicates the length of the match. +cdr indicates the length of the match. + +The range-regex function is similar to search-regex, except when +a match is found, it returns a position range, rather than a position +and length. A cons pair is returned whose car indicates the position +of the match, and whose cdr indicates the position one element past the +last character of the match. If the match is empty, the two integers +are equal. .SS Function match-regex -- cgit v1.2.3