diff options
-rw-r--r-- | stream.c | 70 | ||||
-rw-r--r-- | tests/018/streams.tl | 61 | ||||
-rw-r--r-- | txr.1 | 43 | ||||
-rw-r--r-- | utf8.c | 21 | ||||
-rw-r--r-- | utf8.h | 2 |
5 files changed, 169 insertions, 28 deletions
@@ -511,9 +511,7 @@ val make_null_stream(void) return cobj(coerce(mem_t *, n), stream_cls, &null_ops.cobj_ops); } -#if CONFIG_STDIO_STRICT enum stdio_op { stdio_none, stdio_read, stdio_write }; -#endif struct stdio_handle { struct strm_base a; @@ -530,9 +528,7 @@ struct stdio_handle { unsigned is_rotated : 8; /* used by tail */ unsigned is_real_time : 8; unsigned is_byte_oriented : 8; -#if CONFIG_STDIO_STRICT enum stdio_op last_op; -#endif #if HAVE_SOCKETS val family; val type; @@ -658,10 +654,10 @@ static int se_fflush(FILE *f) return ret; } -#if CONFIG_STDIO_STRICT static void stdio_switch(struct stdio_handle *h, enum stdio_op op) { if (h->last_op != op) { +#if CONFIG_STDIO_STRICT if (h->f) { switch (h->last_op) { case stdio_read: @@ -674,13 +670,12 @@ static void stdio_switch(struct stdio_handle *h, enum stdio_op op) break; } } - +#endif + if (h->last_op != stdio_none) + utf8_decoder_init(&h->ud); h->last_op = op; } } -#else -#define stdio_switch(X, Y) ((void) 0) -#endif static int stdio_put_char_callback(int ch, mem_t *f) { @@ -915,13 +910,24 @@ static val stdio_get_byte(val stream) { struct stdio_handle *h = coerce(struct stdio_handle *, stream->co.handle); - stdio_switch(h, stdio_read); + if (h->unget_c) { + uw_throwf(file_error_s, + lit("get-byte: ~s: pushed-back characters prevent byte reads"), + stream, nao); + } else { + int ch = utf8_getc(&h->ud); + + if (ch != EOF) + return num_fast(ch); - if (h->f) { - int ch = se_getc(h->f); - return (ch != EOF) ? num(ch) : stdio_maybe_read_error(stream); + stdio_switch(h, stdio_read); + + if (h->f) { + int ch = se_getc(h->f); + return (ch != EOF) ? 
num(ch) : stdio_maybe_read_error(stream); + } + return stdio_maybe_read_error(stream); } - return stdio_maybe_read_error(stream); } static val stdio_unget_char(val stream, val ch) @@ -934,11 +940,18 @@ static val stdio_unget_char(val stream, val ch) static val stdio_unget_byte(val stream, int byte) { struct stdio_handle *h = coerce(struct stdio_handle *, stream->co.handle); - - errno = 0; - return h->f != 0 && ungetc(byte, coerce(FILE *, h->f)) != EOF - ? num_fast(byte) - : stdio_maybe_error(stream, lit("writing")); + if (h->unget_c) { + uw_throwf(file_error_s, + lit("unget-byte: ~s: previously pushed chars are in the way"), + stream, nao); + } else { + int uch = utf8_ungetc(&h->ud, byte); + return (uch == EOF) + ? uw_throwf(file_error_s, + lit("unget-byte: ~s: out of space pushing ~s"), + stream, num_fast(byte), nao) + : num_fast(byte); + } } static ucnum stdio_put_buf(val stream, mem_t *ptr, ucnum len, ucnum pos) @@ -963,17 +976,24 @@ static ucnum stdio_fill_buf(val stream, mem_t *ptr, ucnum len, ucnum pos) { val self = lit("fill-buf"); struct stdio_handle *h = coerce(struct stdio_handle *, stream->co.handle); + int ch; if (convert(size_t, len) != len || len > INT_PTR_MAX) uw_throwf(error_s, lit("~a: buffer too large"), self, nao); if (pos >= len) return len; - errno = 0; - if (h->f != 0) { - cnum nread = fread(ptr + pos, 1, len - pos, h->f); - if (nread > 0) - return pos + nread; + + while (pos < len && (ch = utf8_getc(&h->ud)) != EOF) + ptr[pos++] = ch; + + if (pos < len) { + errno = 0; + if (h->f != 0) { + cnum nread = fread(ptr + pos, 1, len - pos, h->f); + if (nread > 0) + return pos + nread; + } + stdio_maybe_read_error(stream); } - stdio_maybe_read_error(stream); return pos; } diff --git a/tests/018/streams.tl b/tests/018/streams.tl new file mode 100644 index 00000000..12157e48 --- /dev/null +++ b/tests/018/streams.tl @@ -0,0 +1,61 @@ +(load "../common") + +(push-after-load + (each ((file '#"test-file")) + (remove-path file))) + +(file-put-buf "test-file" 
#b'e38182e38182e38182') + +(with-stream (s (open-file "test-file")) + (mtest + (true s) t + (get-byte s) #xe3 + (get-byte s) #x81 + (unget-byte #x81 s) #x81 + (unget-byte #xe3 s) #xe3 + (get-char s) #\あ)) + +(with-stream (s (open-file "test-file")) + (mtest + (true s) t + (get-byte s) #xe3 + (get-char s) #\xdc81 + (get-byte s) #x82)) + +(file-put-buf "test-file" #b'e38122e38182e38182') + +(with-stream (s (open-file "test-file")) + (let ((b (make-buf 256))) + (mtest + (true s) t + (get-char s) #\xdce3 + (get-byte s) #x81 + (fill-buf-adjust b 0 s) 7 + b #b'22e38182e38182'))) + +(with-stream (s (open-file "test-file")) + (mtest + (true s) t + (unget-char #\a s) #\a + (get-byte s) :error + (unget-byte 42 s) :error + (get-char s) #\a + (unget-byte 42 s) 42 + (get-byte s) 42 + (get-byte s) #xe3)) + +(with-stream (s (open-file "test-file")) + (mtest + (true s) t + (unget-char #\a s) #\a + (unget-byte 42 s) :error + (get-char s) #\a + (unget-byte 42 s) 42 + (get-byte s) 42 + (get-byte s) #xe3)) + +(mtest + (unget-byte #x82) #x82 + (unget-byte #x81) #x81 + (unget-byte #xe3) #xe3 + (get-char) #\x3042) @@ -67441,9 +67441,10 @@ The number of characters that may be pushed back by .code unget-char is not limited. -Pushing both a byte and a character, in either order, is also unsupported. -Pushing a byte and then reading a character, or pushing a character and -reading a byte, are unsupported mixtures of operations. +Streams may also not support pushing back mixtures of bytes +and characters, and reading a character when pushed-back bytes +are present, or reading a byte when pushed-back characters are +present. If the stream is binary, then pushing back a byte decrements its position, except if the position is already zero. At that point, the position becomes @@ -67453,6 +67454,42 @@ The behavior of pushing back immediately after a .code seek-stream positioning operation is unspecified. 
+The position reported by +.code seek-stream +when it is invoked with a +.meta whence +argument value of +.code :from-current +is not required to take into account pushed-back bytes or characters. + +Uncompressed file streams, tail streams, pipe streams and socket streams make +the following guarantees: +.RS +.IP 1. +The stream has room for 7 (seven) pushed-back bytes, which are +shared with the UTF-8 decoder. When, during the most recent +character-reading operation, the decoder has encountered invalid +bytes, it may leave up to three bytes in the pushback buffer. This +worst case occurs when a four-byte sequence is read whose last +byte is invalid. In this situation, the application has space for +four bytes of pushback. A character-reading +operation which returns a code point not lying in the +range U+DC00 to U+DCFF deposits no bytes into the pushback buffer. +.IP 2. +Characters may be pushed back even when one or more pushed-back +bytes are present. Character pushback is unlimited. +.IP 3. +Bytes may not be pushed back when one or more characters have been +pushed back. +.IP 4. +Character-reading operations such as +.code get-char +first consume the pushed-back characters. When those +are exhausted, then pushed-back bytes are consumed and decoded +as UTF-8. After pushed-back bytes are exhausted, bytes are +read from the buffered stream in the usual way. 
+.RE + .coNP Functions @, put-string @, put-line @ put-char and @ put-byte .synb .mets (put-string < string <> [ stream ]) @@ -375,6 +375,27 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx) } } +int utf8_getc(utf8_decoder_t *ud) +{ + if (ud->tail == ud->head) { + return EOF; + } else { + int ch = ud->buf[ud->back]; + ud->tail = ud->back = (ud->tail + 1) % 8; + return ch; + } +} + +int utf8_ungetc(utf8_decoder_t *ud, int ch) +{ + unsigned ntail = (ud->tail + 7) % 8; + if (ntail == ud->head) + return EOF; + ud->back = ud->tail = ntail; + ud->buf[ud->tail] = ch; + return ch; +} + FILE *w_fopen(const wchar_t *wname, const wchar_t *wmode) { char *name = utf8_dup_to(wname); @@ -50,6 +50,8 @@ typedef struct utf8_decoder { int utf8_encode(wchar_t, int (*put)(int ch, mem_t *ctx), mem_t *ctx); void utf8_decoder_init(utf8_decoder_t *); wint_t utf8_decode(utf8_decoder_t *,int (*get)(mem_t *ctx), mem_t *ctx); +int utf8_getc(utf8_decoder_t *); +int utf8_ungetc(utf8_decoder_t *, int ch); FILE *w_fopen(const wchar_t *, const wchar_t *); FILE *w_freopen(const wchar_t *, const wchar_t *, FILE *); |