From 5961f0de80abce4645ec2f022b2346e24b6479ed Mon Sep 17 00:00:00 2001 From: Kaz Kylheku Date: Tue, 13 Mar 2012 12:57:21 -0700 Subject: Implementing URL filtering. * eval.c (eval_init): New intrinsic functions: url-encode, url-decode. * filter.c (tourl_k, fromurl_k): New keyword variables. (is_url_reserved, digit_value): New static functions. (url_encode, url_decode): New functions. (filter_init): Intialize new keyword variables and register new :tourl and :fromurl filters. * filter.h (tourl_k, fromurl_k, url_encode, url_decode): Declared. * txr.1: Updated. * txr.vim: Likewise. --- ChangeLog | 18 +++++++++++++++ eval.c | 4 ++++ filter.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ filter.h | 5 ++++ txr.1 | 21 +++++++++++++++++ txr.vim | 6 +++-- 6 files changed, 131 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index 8298ec3e..807896fb 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,21 @@ +2012-03-13 Kaz Kylheku + + Implementing URL filtering. + + * eval.c (eval_init): New intrinsic functions: url-encode, url-decode. + + * filter.c (tourl_k, fromurl_k): New keyword variables. + (is_url_reserved, digit_value): New static functions. + (url_encode, url_decode): New functions. + (filter_init): Intialize new keyword variables and register + new :tourl and :fromurl filters. + + * filter.h (tourl_k, fromurl_k, url_encode, url_decode): Declared. + + * txr.1: Updated. + + * txr.vim: Likewise. + 2012-03-13 Kaz Kylheku * stream.c (string_out_byte_flush): Bugfix. Do not loop inside this diff --git a/eval.c b/eval.c index c527d9de..156e58e2 100644 --- a/eval.c +++ b/eval.c @@ -43,6 +43,7 @@ #include "debug.h" #include "match.h" #include "rand.h" +#include "filter.h" #include "eval.h" typedef val (*opfun_t)(val, val); @@ -2354,6 +2355,9 @@ void eval_init(void) reg_fun(intern(lit("match-fun"), user_package), func_n4(match_fun)); + reg_fun(intern(lit("url-encode"), user_package), func_n1(url_encode)); + reg_fun(intern(lit("url-decode"), user_package), func_n1(url_decode)); + eval_error_s = intern(lit("eval-error"), user_package); uw_register_subtype(eval_error_s, error_s); } diff --git a/filter.c b/filter.c index 9dd07e11..f66420ba 100644 --- a/filter.c +++ b/filter.c @@ -26,8 +26,11 @@ #include #include +#include #include #include +#include +#include #include "config.h" #include "lib.h" #include "hash.h" @@ -35,10 +38,12 @@ #include "match.h" #include "filter.h" #include "gc.h" +#include "stream.h" val filters; val filter_k, lfilt_k, rfilt_k, to_html_k, from_html_k; val upcase_k, downcase_k, fun_k; +val tourl_k, fromurl_k; static val make_trie(void) { @@ -573,6 +578,75 @@ static val html_numeric_handler(val ch) return func_f1(cons(ch, nil), html_dec_continue); } +static int is_url_reserved(int ch) +{ + return (ch <= 0x20 || ch >= 0x7F || strchr(":/?#[]@!$&'()*+,;=%", ch) != 0); +} + +val url_encode(val str) +{ + val in_byte = make_string_byte_input_stream(str); + val out = make_string_output_stream(); + val ch; + + while ((ch = get_byte(in_byte)) != nil) { + int c = c_num(ch); + + if (is_url_reserved(c)) + format(out, lit("%~1X~1X"), num_fast(c >> 4), num_fast(c & 0xf), nao); + else + put_char(chr_num(ch), out); + } + + return get_string_from_stream(out); +} + +static int digit_value(int digit) +{ + if (digit >= '0' && digit <= '9') + return digit - '0'; + if (digit >= 'A' && digit <= 'F') + return digit - 'A' + 10; + if (digit >= 'a' && digit <= 'f') + return digit - 'a' + 10; + internal_error("bad digit"); +} + +val url_decode(val str) +{ + val in = make_string_input_stream(str); + val out = make_string_output_stream(); + + for (;;) { + val ch = get_char(in); + + if (ch == chr('%')) { + val ch2 = get_char(in); + val ch3 = get_char(in); + + if (ch2 && ch3 && chr_isxdigit(ch2) && chr_isxdigit(ch3)) { + int byte = digit_value(c_num(ch2)) << 4 | digit_value(c_num(ch3)); + put_byte(num_fast(byte), out); + } else { + put_char(ch, out); + if (!ch2) + break; + put_char(ch2, out); + if (!ch3) + break; + put_char(ch3, out); + } + continue; + } + if (!ch) + break; + + put_char(ch, out); + } + + return get_string_from_stream(out); +} + void filter_init(void) { protect(&filters, (val *) 0); @@ -586,6 +660,9 @@ void filter_init(void) upcase_k = intern(lit("upcase"), keyword_package); downcase_k = intern(lit("downcase"), keyword_package); fun_k = intern(lit("fun"), keyword_package); + tourl_k = intern(lit("tourl"), keyword_package); + fromurl_k = intern(lit("fromurl"), keyword_package); + sethash(filters, to_html_k, build_filter(to_html_table, t)); { val trie = build_filter(from_html_table, nil); @@ -595,4 +672,6 @@ void filter_init(void) } sethash(filters, upcase_k, func_n1(upcase_str)); sethash(filters, downcase_k, func_n1(downcase_str)); + sethash(filters, tourl_k, func_n1(url_encode)); + sethash(filters, fromurl_k, func_n1(url_decode)); } diff --git a/filter.h b/filter.h index a3dcab2f..1a084e66 100644 --- a/filter.h +++ b/filter.h @@ -27,6 +27,7 @@ extern val filters; extern val filter_k, lfilt_k, rfilt_k, to_html_k, from_html_k; extern val upcase_k, downcase_k, fun_k; +extern val tourl_k, fromurl_k; val trie_lookup_begin(val trie); val trie_value_at(val node); @@ -36,4 +37,8 @@ val filter_string(val trie, val str); val filter_equal(val lfilt, val rfilt, val left, val right); val register_filter(val sym, val table); +val url_encode(val str); +val url_decode(val str); + void filter_init(void); + diff --git a/txr.1 b/txr.1 index fbc61d4d..e5e63478 100644 --- a/txr.1 +++ b/txr.1 @@ -3638,6 +3638,25 @@ Convert the 26 lower case letters of the English alphabet to upper case. .IP :downcase Convert the 26 upper case letters of the English alphabet to lower case. +.IP :fromurl +Decode URL-encoded (a.k.a. percent-encoded) text. Character triplets consisting +of the % character followed by a pair of hexadecimal digits (case insensitive) +are are converted to bytes having the value represented by the hexadecimal +digits (most significant nybble first). Sequences of one or more such bytes are +treated as UTF-8 data and decoded to characters. + +.IP :tourl +Convert to URL encoding according to RFC 3986. The text is first converted +to UTF-8 bytes. The bytes are then converted back to text as follows. +Bytes in the range 0 to 32, and 127 to 255 (note: including the ASCII DEL), +bytes whose values correspond to ASCII characters which are listed by RFC 3986 +as being in the "reserved set", and the byte value corresponding to the +ASCII % character are encoded as a three-character sequence consisting +of the % character followed by two hexadecimal digits derived from the +byte value (most significant nybble first, upper case). All other bytes +are converted directly to characters of the same value without any such +encoding. + Example: to escape HTML characters in all variable substitutions occuring in an output clause, specify :filter :to_html in the directive: @@ -6754,6 +6773,8 @@ Certain object types have a custom equal function. .SS Function match-fun +.SS Functions url-encode and url-decode + .SH APPENDIX A: NOTES ON EXOTIC REGULAR EXPRESSIONS Users familiar with regular expressions may not be familiar with the complement diff --git a/txr.vim b/txr.vim index f59de0fa..edecc5aa 100644 --- a/txr.vim +++ b/txr.vim @@ -45,7 +45,9 @@ syn keyword txl_keyword contained remq remql remqual syn keyword txl_keyword contained all none eq eql equal + - * abs trunc mod syn keyword txl_keyword contained expt exptmod sqrt gcd fixnump bignump syn keyword txl_keyword contained numberp zerop evenp oddp > -syn keyword txl_keyword contained < >= <= max min search-regex match-regex regsub + +.SS Functions url-encode and url-decode + syn keyword txl_keyword contained make-hash hash gethash sethash pushhash remhash syn keyword txl_keyword contained hash-count get-hash-userdata set-hash-userdata hashp maphash syn keyword txl_keyword contained hash-eql hash-equal eval *stdout* *stdin* *stddebug* @@ -83,7 +85,7 @@ syn keyword txl_keyword contained make-random-state random-state-p syn keyword txl_keyword contained random-fixnum random rand syn keyword txl_keyword contained range range* generate repeat force -syn keyword txl_keyword contained throw throwf error match-fun +syn keyword txl_keyword contained throw throwf error match-fun url-encode url-decode syn match txr_error "@[\t ]*[*]\?[\t ]*." syn match txr_nested_error "[^\t `]\+" contained -- cgit v1.2.3