From aa4420347f132039a3e37d6996d1e31096fc10de Mon Sep 17 00:00:00 2001 From: Kaz Kylheku Date: Thu, 12 Nov 2009 16:34:27 -0800 Subject: Documenting extended characters in man page. Cleaned up some more issues related to extended characters. --- ChangeLog | 22 ++++++++++++++++++++++ parser.l | 15 +++++++++++++++ stream.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++------------- stream.h | 2 +- txr.1 | 22 ++++++++++++++++++++++ 5 files changed, 107 insertions(+), 14 deletions(-) diff --git a/ChangeLog b/ChangeLog index 4fbbf5bb..82ee1edf 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,25 @@ +2009-11-12 Kaz Kylheku + + Documenting extended characters in man page. + Cleaned up some more issues related to extended characters. + + * parser.l (grammar): Added error actions for invalid UTF-8 bytes. + + * stream.c (BROKEN_POPEN_GETWC): New macro. Enables workaround + for a glibc bug, whereby getwc blows up when applied to a FILE * + stream returned from a popen call. + (struct strm_ops): put_char function takes wchar_t. + (common_vformat): Use wchar_t rather than int. + (stdio_put_string): fputws returns -1, not EOF. + (stdio_put_char, put_cchar): Character argument changed to wchar_t. + Output done with putwc used instead of putc. + (snarf_line, stdio_get_char): Use getwc to read from the stream. + (pipe_close, make_pipe_stream): Implement workaround for glibc bug. + + * stream.h (put_cchar): Declaration updated. + + * txr.1: Added notes about international characters. + 2009-11-12 Kaz Kylheku Regular expression module updated to do unicode character sets. diff --git a/parser.l b/parser.l index 5919f929..b15f5ad1 100644 --- a/parser.l +++ b/parser.l @@ -399,6 +399,11 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} yyerrorf("bad character in directive: '%s'", yytext); } +. 
{ + yyerrorf("non-UTF-8 byte in directive: '\\x%02x'", + (unsigned char) yytext[0]); + } + [/] { yy_pop_state(); if (yy_top_state() == INITIAL @@ -452,6 +457,11 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} return REGCHAR; } +. { + yyerrorf("non-UTF-8 byte in regex: '\\x%02x'", + (unsigned char) yytext[0]); + } + ({UONLY}|[^@\n])+ { yylval.lexeme = utf8_dup_from(yytext); return TEXT; @@ -536,4 +546,9 @@ UONLY {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U} return LITCHAR; } +. { + yyerrorf("non-UTF-8 byte in literal: '\\x%02x'", + (unsigned char) yytext[0]); + } + %% diff --git a/stream.c b/stream.c index 91ee2e85..0c1050f1 100644 --- a/stream.c +++ b/stream.c @@ -24,6 +24,12 @@ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. */ +/* + * Enable code to work around getwc crash in glibc, + * which happens on FILE * handles from popen. + */ +#define BROKEN_POPEN_GETWC + #include #include #include @@ -33,6 +39,7 @@ #include #include #include +#include #include "lib.h" #include "gc.h" #include "unwind.h" @@ -44,7 +51,7 @@ obj_t *std_input, *std_output, *std_error; struct strm_ops { struct cobj_ops cobj_ops; obj_t *(*put_string)(obj_t *, const wchar_t *); - obj_t *(*put_char)(obj_t *, int); + obj_t *(*put_char)(obj_t *, wchar_t); obj_t *(*get_line)(obj_t *); obj_t *(*get_char)(obj_t *); obj_t *(*vcformat)(obj_t *, const char *fmt, va_list vl); @@ -64,7 +71,7 @@ static void common_destroy(obj_t *obj) obj_t *common_vformat(obj_t *stream, const wchar_t *fmt, va_list vl) { - int ch; + wchar_t ch; for (; (ch = *fmt) != 0; fmt++) { obj_t *obj; @@ -105,6 +112,9 @@ obj_t *common_vformat(obj_t *stream, const wchar_t *fmt, va_list vl) struct stdio_handle { FILE *f; +#ifdef BROKEN_POPEN_GETWC + FILE *f_orig_pipe; +#endif obj_t *descr; }; @@ -152,13 +162,14 @@ static obj_t *stdio_maybe_write_error(obj_t *stream) static obj_t *stdio_put_string(obj_t *stream, const wchar_t *s) { struct stdio_handle *h = (struct stdio_handle *) stream->co.handle; - return (h->f && fputws(s, h->f) != 
EOF) ? t : stdio_maybe_write_error(stream); + return (h->f && fputws(s, h->f) != -1) ? t : stdio_maybe_write_error(stream); } -static obj_t *stdio_put_char(obj_t *stream, int ch) +static obj_t *stdio_put_char(obj_t *stream, wchar_t ch) { struct stdio_handle *h = (struct stdio_handle *) stream->co.handle; - return (h->f && putc(ch, h->f) != EOF) ? t : stdio_maybe_write_error(stream); + return (h->f && putwc(ch, h->f) != WEOF) + ? t : stdio_maybe_write_error(stream); } static wchar_t *snarf_line(FILE *in) @@ -169,9 +180,9 @@ static wchar_t *snarf_line(FILE *in) wchar_t *buf = 0; for (;;) { - int ch = getc(in); + wint_t ch = getwc(in); - if (ch == EOF && buf == 0) + if (ch == WEOF && buf == 0) break; if (fill >= size) { @@ -180,7 +191,7 @@ static wchar_t *snarf_line(FILE *in) size = newsize; } - if (ch == '\n' || ch == EOF) { + if (ch == '\n' || ch == WEOF) { buf[fill++] = 0; break; } @@ -210,8 +221,8 @@ obj_t *stdio_get_char(obj_t *stream) { struct stdio_handle *h = (struct stdio_handle *) stream->co.handle; if (h->f) { - int ch = getc(h->f); - return (ch != EOF) ? chr(ch) : stdio_maybe_read_error(stream); + wint_t ch = getwc(h->f); + return (ch != WEOF) ? 
chr(ch) : stdio_maybe_read_error(stream); } return nil; } @@ -262,9 +273,13 @@ static obj_t *pipe_close(obj_t *stream, obj_t *throw_on_error) struct stdio_handle *h = (struct stdio_handle *) stream->co.handle; if (h->f != 0) { +#ifdef BROKEN_POPEN_GETWC + int status = (fclose(h->f), pclose(h->f_orig_pipe)); + h->f = h->f_orig_pipe = 0; +#else int status = pclose(h->f); - h->f = 0; +#endif if (status != 0 && throw_on_error) { if (status < 0) { @@ -403,7 +418,7 @@ static obj_t *string_out_put_string(obj_t *stream, const wchar_t *s) } } -static obj_t *string_out_put_char(obj_t *stream, int ch) +static obj_t *string_out_put_char(obj_t *stream, wchar_t ch) { wchar_t mini[2]; mini[0] = ch; @@ -539,8 +554,27 @@ obj_t *make_stdio_stream(FILE *f, obj_t *descr, obj_t *input, obj_t *output) obj_t *make_pipe_stream(FILE *f, obj_t *descr, obj_t *input, obj_t *output) { struct stdio_handle *h = (struct stdio_handle *) chk_malloc(sizeof *h); +#ifdef BROKEN_POPEN_GETWC + int dup_fd = dup(fileno(f)); + FILE *dup_f = (dup_fd != -1) ? fdopen(dup_fd, output ? 
"w" : "r") : 0; + + if (dup_fd == -1 || dup_f == 0) { + int error = errno; + if (dup_f != 0) + fclose(dup_f); + else if (dup_fd != -1) + close(dup_fd); + free(h); + uw_throwf(process_error, L"unable to create pipe ~a: ~a/~s", descr, + num(error), string_utf8(strerror(error)), nao); + } + + h->f_orig_pipe = f; + h->f = dup_f; +#else h->f = f; +#endif h->descr = descr; return cobj((void *) h, stream_t, &pipe_ops.cobj_ops); } @@ -712,7 +746,7 @@ obj_t *put_char(obj_t *stream, obj_t *ch) } } -obj_t *put_cchar(obj_t *stream, int ch) +obj_t *put_cchar(obj_t *stream, wchar_t ch) { type_check (stream, COBJ); type_assert (stream->co.cls == stream_t, (L"~a is not a stream", stream)); diff --git a/stream.h b/stream.h index 78f83f93..13b428c0 100644 --- a/stream.h +++ b/stream.h @@ -43,6 +43,6 @@ obj_t *put_string(obj_t *stream, obj_t *string); obj_t *put_line(obj_t *stream, obj_t *string); obj_t *put_cstring(obj_t *stream, const wchar_t *); obj_t *put_char(obj_t *stream, obj_t *ch); -obj_t *put_cchar(obj_t *stream, int ch); +obj_t *put_cchar(obj_t *stream, wchar_t ch); void stream_init(void); diff --git a/txr.1 b/txr.1 index 19ffeb30..e62b30e1 100644 --- a/txr.1 +++ b/txr.1 @@ -396,6 +396,28 @@ does not split the line into two; it's embedded into the line and thus cannot match anything. However, @\en may be useful in the @(cat) directive and in @(output). +.SS International Characters + +.B txr +represents text internally using wide characters, which are used to represent +Unicode code points. The query language, as well as all data sources, are +assumed to be in the UTF-8 encoding. In the query language, extended +characters can be used directly in comments, literal text, string literals, +quasiliterals and regular expressions. Extended characters can also be +expressed indirectly using hexadecimal or octal escapes. 
+On some platforms, wide characters may be restricted to 16 bits, so that +.B txr +can only work with characters in the BMP (Basic Multilingual Plane) +subset of Unicode. + +If +.B txr +encounters invalid bytes in the UTF-8 input, what happens depends on the +context in which this occurs. Invalid bytes in a query are reported as errors. +Invalid bytes in data are currently treated in an unspecified way. In +the future, invalid bytes in data will be mapped to the Unicode codes +U+DC00 through U+DCFF. + .SS Variables Much of the query syntax consists of arbitrary text, which matches file data -- cgit v1.2.3