From 7c6391bb10adc88d156ec88148184bc3eb8681ce Mon Sep 17 00:00:00 2001 From: Kaz Kylheku Date: Tue, 19 Jan 2010 15:16:28 -0800 Subject: More regex grammar work. --- ChangeLog | 11 +++++++++++ parser.h | 1 + parser.y | 3 ++- txr.1 | 7 +++---- 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/ChangeLog b/ChangeLog index 93a450de..a8247e8b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,14 @@ +2010-01-19 Kaz Kylheku + + * parser.y (regex): Getting rid of empty '/' '/' production + again. + (regexpr): Re-introducing empty production; this time using + %prec LOW trick to give this interpretation the lowest + possible precedence. Thus expressions like /&/ work again. + (regbranch): New production to allow R1~R2 to be valid. + + * txr.1: Documented. + 2010-01-19 Kaz Kylheku * parser.l (grammar): The ^ character is no longer considered diff --git a/parser.h b/parser.h index e4f712b9..3a7fb720 100644 --- a/parser.h +++ b/parser.h @@ -36,3 +36,4 @@ void yyerror(const char *s); void yyerrorf(val s, ...); void yybadtoken(int tok, val context); void end_of_regex(void); +int yylex(void); diff --git a/parser.y b/parser.y index cb9d320f..b2745c2f 100644 --- a/parser.y +++ b/parser.y @@ -451,7 +451,6 @@ expr : IDENT { $$ = intern(string_own($1), nil); } ; regex : '/' regexpr '/' { $$ = $2; end_of_regex(); } - | '/' '/' { $$ = nil; end_of_regex(); } | '/' error { $$ = nil; yybadtoken(yychar, lit("regex")); end_of_regex(); } @@ -463,10 +462,12 @@ regexpr : regbranch { $$ = if3(cdr($1), | regexpr '|' regexpr { $$ = list(or_s, $1, $3, nao); } | regexpr '&' regexpr { $$ = list(and_s, $1, $3, nao); } | '~' regexpr { $$ = list(compl_s, $2, nao); } + | /* empty */ %prec LOW { $$ = nil; } ; regbranch : regterm %prec LOW { $$ = cons($1, nil); } | regterm regbranch { $$ = cons($1, $2); } + | regterm '~' regexpr { $$ = list($1, list(compl_s, $3, nao), nao); } ; regterm : regterm '*' { $$ = list(zeroplus_s, $1, nao); } diff --git a/txr.1 b/txr.1 index 64403966..729107be 100644 
--- a/txr.1 +++ b/txr.1 @@ -695,7 +695,7 @@ string, then R1%R2 is equivalent to R1*. .IP ~R match the complement of the following expression R; i.e. match those texts that R does not match. This operator is called complement, -or logical not. +or logical not. The form R1~R2 is permitted and means R1(~R2). .IP R1R2 Two consecutive regular expressions denote catenation: the left expression must match, and then the right. @@ -735,9 +735,8 @@ means ab((c*)%(d*ef)). The left argument of % is c*, but the right is the entire expression d*ef. The unary complement operator has the next lower precedence, so -that ~A* means the ~(A*): "match the all text that is not matched by zero -or more repetitions of A", not "match zero or more times the text -not matched by A". +that ~AB means ~(AB), not (~A)B. AB~CD means (AB)~(CD) where +the (CD) is complemented, and catenated to (AB). Catenation is on the next lower precedence rung, so that AB? means A(B?), or "match A, and then optionally B", not "match A and B, as one optional -- cgit v1.2.3