diff options
author | Kaz Kylheku <kaz@kylheku.com> | 2025-05-22 20:56:35 -0700 |
---|---|---|
committer | Kaz Kylheku <kaz@kylheku.com> | 2025-05-22 20:56:35 -0700 |
commit | ccaacb51f45b21da7f8229dcc27d20a02c551e77 (patch) | |
tree | 39c1f7f13afc270bf1c0daade0b6be2f6d078b76 | |
parent | b4d5f05822b3e45442a027121fe5bf66c64c432b (diff) | |
download | txr-ccaacb51f45b21da7f8229dcc27d20a02c551e77.tar.gz txr-ccaacb51f45b21da7f8229dcc27d20a02c551e77.tar.bz2 txr-ccaacb51f45b21da7f8229dcc27d20a02c551e77.zip |
streams: improve pushback semantics of stdio streams.
* utf8.[ch] (utf8_getc, utf8_ungetc): New functions
which allow the push-back buffer of the decoder to
be accessed. We can use the decoder's push-back
buffer to implement a stream's byte push-back,
so that the behavior is then consistent: invalid
bytes pushed back by the decoder are treated
uniformly with bytes pushed back using unget-char.
* stream.c (stdio_switch): Bugfix: reset the UTF8
decoder when changing direction. Without this, it
is possible that pushed back bytes in the decoder's
buffer will be read, even though write operations
moved the position. Thus stdio_switch is now defined
as a function regardless of whether CONFIG_STDIO_STRICT
is in effect.
(stdio_get_byte): If there are pushed back characters
present, throw an error. Otherwise, try to get
a byte from the UTF8 buffer's pushback first via
utf8_getc. If that produces something, just return
it. Otherwise fall back on reading from the stdio
stream.
(stdio_unget_byte): If there are pushed back characters
present, throw an error. Otherwise push back the
byte using utf8_ungetc. If that reports no
space, throw an error.
(stdio_fill_buf): Take bytes from the push-back buffer
in the UTF8 decoder first, then fread the rest
from the stdio stream, if necessary.
-rw-r--r-- | stream.c | 70 | ||||
-rw-r--r-- | tests/018/streams.tl | 61 | ||||
-rw-r--r-- | txr.1 | 43 | ||||
-rw-r--r-- | utf8.c | 21 | ||||
-rw-r--r-- | utf8.h | 2 |
5 files changed, 169 insertions, 28 deletions
@@ -511,9 +511,7 @@ val make_null_stream(void) return cobj(coerce(mem_t *, n), stream_cls, &null_ops.cobj_ops); } -#if CONFIG_STDIO_STRICT enum stdio_op { stdio_none, stdio_read, stdio_write }; -#endif struct stdio_handle { struct strm_base a; @@ -530,9 +528,7 @@ struct stdio_handle { unsigned is_rotated : 8; /* used by tail */ unsigned is_real_time : 8; unsigned is_byte_oriented : 8; -#if CONFIG_STDIO_STRICT enum stdio_op last_op; -#endif #if HAVE_SOCKETS val family; val type; @@ -658,10 +654,10 @@ static int se_fflush(FILE *f) return ret; } -#if CONFIG_STDIO_STRICT static void stdio_switch(struct stdio_handle *h, enum stdio_op op) { if (h->last_op != op) { +#if CONFIG_STDIO_STRICT if (h->f) { switch (h->last_op) { case stdio_read: @@ -674,13 +670,12 @@ static void stdio_switch(struct stdio_handle *h, enum stdio_op op) break; } } - +#endif + if (h->last_op != stdio_none) + utf8_decoder_init(&h->ud); h->last_op = op; } } -#else -#define stdio_switch(X, Y) ((void) 0) -#endif static int stdio_put_char_callback(int ch, mem_t *f) { @@ -915,13 +910,24 @@ static val stdio_get_byte(val stream) { struct stdio_handle *h = coerce(struct stdio_handle *, stream->co.handle); - stdio_switch(h, stdio_read); + if (h->unget_c) { + uw_throwf(file_error_s, + lit("get-byte: ~s: pushed-back characters prevent byte reads"), + stream, nao); + } else { + int ch = utf8_getc(&h->ud); + + if (ch != EOF) + return num_fast(ch); - if (h->f) { - int ch = se_getc(h->f); - return (ch != EOF) ? num(ch) : stdio_maybe_read_error(stream); + stdio_switch(h, stdio_read); + + if (h->f) { + int ch = se_getc(h->f); + return (ch != EOF) ? 
num(ch) : stdio_maybe_read_error(stream); + } + return stdio_maybe_read_error(stream); } - return stdio_maybe_read_error(stream); } static val stdio_unget_char(val stream, val ch) @@ -934,11 +940,18 @@ static val stdio_unget_char(val stream, val ch) static val stdio_unget_byte(val stream, int byte) { struct stdio_handle *h = coerce(struct stdio_handle *, stream->co.handle); - - errno = 0; - return h->f != 0 && ungetc(byte, coerce(FILE *, h->f)) != EOF - ? num_fast(byte) - : stdio_maybe_error(stream, lit("writing")); + if (h->unget_c) { + uw_throwf(file_error_s, + lit("unget-byte: ~s: previously pushed chars are in the way"), + stream, nao); + } else { + int uch = utf8_ungetc(&h->ud, byte); + return (uch == EOF) + ? uw_throwf(file_error_s, + lit("unget-byte: ~s: out of space pushing ~s"), + stream, num_fast(byte), nao) + : num_fast(byte); + } } static ucnum stdio_put_buf(val stream, mem_t *ptr, ucnum len, ucnum pos) @@ -963,17 +976,24 @@ static ucnum stdio_fill_buf(val stream, mem_t *ptr, ucnum len, ucnum pos) { val self = lit("fill-buf"); struct stdio_handle *h = coerce(struct stdio_handle *, stream->co.handle); + int ch; if (convert(size_t, len) != len || len > INT_PTR_MAX) uw_throwf(error_s, lit("~a: buffer too large"), self, nao); if (pos >= len) return len; - errno = 0; - if (h->f != 0) { - cnum nread = fread(ptr + pos, 1, len - pos, h->f); - if (nread > 0) - return pos + nread; + + while (pos < len && (ch = utf8_getc(&h->ud)) != EOF) + ptr[pos++] = ch; + + if (pos < len) { + errno = 0; + if (h->f != 0) { + cnum nread = fread(ptr + pos, 1, len - pos, h->f); + if (nread > 0) + return pos + nread; + } + stdio_maybe_read_error(stream); } - stdio_maybe_read_error(stream); return pos; } diff --git a/tests/018/streams.tl b/tests/018/streams.tl new file mode 100644 index 00000000..12157e48 --- /dev/null +++ b/tests/018/streams.tl @@ -0,0 +1,61 @@ +(load "../common") + +(push-after-load + (each ((file '#"test-file")) + (remove-path file))) + +(file-put-buf "test-file" 
#b'e38182e38182e38182') + +(with-stream (s (open-file "test-file")) + (mtest + (true s) t + (get-byte s) #xe3 + (get-byte s) #x81 + (unget-byte #x81 s) #x81 + (unget-byte #xe3 s) #xe3 + (get-char s) #\あ)) + +(with-stream (s (open-file "test-file")) + (mtest + (true s) t + (get-byte s) #xe3 + (get-char s) #\xdc81 + (get-byte s) #x82)) + +(file-put-buf "test-file" #b'e38122e38182e38182') + +(with-stream (s (open-file "test-file")) + (let ((b (make-buf 256))) + (mtest + (true s) t + (get-char s) #\xdce3 + (get-byte s) #x81 + (fill-buf-adjust b 0 s) 7 + b #b'22e38182e38182'))) + +(with-stream (s (open-file "test-file")) + (mtest + (true s) t + (unget-char #\a s) #\a + (get-byte s) :error + (unget-byte 42 s) :error + (get-char s) #\a + (unget-byte 42 s) 42 + (get-byte s) 42 + (get-byte s) #xe3)) + +(with-stream (s (open-file "test-file")) + (mtest + (true s) t + (unget-char #\a s) #\a + (unget-byte 42 s) :error + (get-char s) #\a + (unget-byte 42 s) 42 + (get-byte s) 42 + (get-byte s) #xe3)) + +(mtest + (unget-byte #x82) #x82 + (unget-byte #x81) #x81 + (unget-byte #xe3) #xe3 + (get-char) #\x3042) @@ -67441,9 +67441,10 @@ The number of characters that may be pushed back by .code unget-char is not limited. -Pushing both a byte and a character, in either order, is also unsupported. -Pushing a byte and then reading a character, or pushing a character and -reading a byte, are unsupported mixtures of operations. +Streams may also not support pushing back mixtures of bytes +and characters, and reading a character when pushed-back bytes +are present, or reading a byte when pushed-back characters are +present. If the stream is binary, then pushing back a byte decrements its position, except if the position is already zero. At that point, the position becomes @@ -67453,6 +67454,42 @@ The behavior of pushing back immediately after a .code seek-stream positioning operation is unspecified. 
+The position reported by +.code seek-stream +when it is invoked with a +.meta whence +argument value of +.code :from-current +is not required to take into account pushed-back bytes or characters. + +Uncompressed file streams, tail streams, pipe streams and socket streams make +the following guarantees: +.RS +.IP 1. +The stream has room for 7 (seven) pushed-back bytes, which are +shared with the UTF-8 decoder. When, during the most recent +character-reading operation, the decoder has encountered invalid +bytes, it may leave up to three bytes in the push-back buffer. This +worst case occurs when a four-byte sequence is read whose last +byte is invalid. In this situation, the application has space for +four bytes of pushback. A character-reading +operation which returns a code point not lying in the +range U+DC00 to U+DCFF deposits no bytes into the pushback buffer. +.IP 2. +Characters may be pushed back even when one or more pushed back +bytes are present. Character pushback is unlimited. +.IP 3. +Bytes may not be pushed back when one or more characters have been +pushed back. +.IP 4. +Character-reading operations such as +.code get-char +first consume the pushed back characters. When those +are exhausted, then pushed back bytes are consumed and decoded +as UTF-8. After pushed back bytes are exhausted, bytes are +read from the buffered stream in the usual way. 
+.RE + .coNP Functions @, put-string @, put-line @ put-char and @ put-byte .synb .mets (put-string < string <> [ stream ]) @@ -375,6 +375,27 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx) } } +int utf8_getc(utf8_decoder_t *ud) +{ + if (ud->tail == ud->head) { + return EOF; + } else { + int ch = ud->buf[ud->back]; + ud->tail = ud->back = (ud->tail + 1) % 8; + return ch; + } +} + +int utf8_ungetc(utf8_decoder_t *ud, int ch) +{ + unsigned ntail = (ud->tail + 7) % 8; + if (ntail == ud->head) + return EOF; + ud->back = ud->tail = ntail; + ud->buf[ud->tail] = ch; + return ch; +} + FILE *w_fopen(const wchar_t *wname, const wchar_t *wmode) { char *name = utf8_dup_to(wname); @@ -50,6 +50,8 @@ typedef struct utf8_decoder { int utf8_encode(wchar_t, int (*put)(int ch, mem_t *ctx), mem_t *ctx); void utf8_decoder_init(utf8_decoder_t *); wint_t utf8_decode(utf8_decoder_t *,int (*get)(mem_t *ctx), mem_t *ctx); +int utf8_getc(utf8_decoder_t *); +int utf8_ungetc(utf8_decoder_t *, int ch); FILE *w_fopen(const wchar_t *, const wchar_t *); FILE *w_freopen(const wchar_t *, const wchar_t *, FILE *); |