summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Kaz Kylheku <kaz@kylheku.com> 2025-05-22 20:56:35 -0700
committer Kaz Kylheku <kaz@kylheku.com> 2025-05-22 20:56:35 -0700
commit ccaacb51f45b21da7f8229dcc27d20a02c551e77 (patch)
tree 39c1f7f13afc270bf1c0daade0b6be2f6d078b76
parent b4d5f05822b3e45442a027121fe5bf66c64c432b (diff)
download txr-ccaacb51f45b21da7f8229dcc27d20a02c551e77.tar.gz
txr-ccaacb51f45b21da7f8229dcc27d20a02c551e77.tar.bz2
txr-ccaacb51f45b21da7f8229dcc27d20a02c551e77.zip
streams: improve pushback semantics of stdio streams.
* utf8.[ch] (utf8_getc, utf8_ungetc): New functions which allow the push-back buffer of the decoder to be accessed. We can use the decoder's push-back buffer to implement a stream's byte push-back, so that the behavior is then consistent: invalid bytes pushed back by the decoder are treated uniformly with bytes pushed back using unget-char. * stream.c (stdio_switch): Bugfix: reset the UTF8 decoder when changing direction. Without this, it is possible that pushed back bytes in the decoder's buffer will be read, even though write operations moved the position. Thus stdio_switch is now defined as a function regardless of whether CONFIG_STDIO_STRICT is in effect. (stdio_get_byte): If there are pushed back characters present, throw an error. Otherwise, try to get a byte from the UTF8 buffer's pushback first via utf8_getc. If that produces something, just return it. Otherwise fall back on reading from the stdio stream. (stdio_unget_byte): If there are pushed back characters present, throw an error. Otherwise push back the character using utf8_ungetc. If that reports no space, throw an error. (stdio_fill_buf): Take bytes from the push-back buffer in the UTF8 decoder first, then fread the rest from the stdio stream, if necessary.
-rw-r--r-- stream.c 70
-rw-r--r-- tests/018/streams.tl 61
-rw-r--r-- txr.1 43
-rw-r--r-- utf8.c 21
-rw-r--r-- utf8.h 2
5 files changed, 169 insertions, 28 deletions
diff --git a/stream.c b/stream.c
index 5e11e67d..9f358a3c 100644
--- a/stream.c
+++ b/stream.c
@@ -511,9 +511,7 @@ val make_null_stream(void)
return cobj(coerce(mem_t *, n), stream_cls, &null_ops.cobj_ops);
}
-#if CONFIG_STDIO_STRICT
enum stdio_op { stdio_none, stdio_read, stdio_write };
-#endif
struct stdio_handle {
struct strm_base a;
@@ -530,9 +528,7 @@ struct stdio_handle {
unsigned is_rotated : 8; /* used by tail */
unsigned is_real_time : 8;
unsigned is_byte_oriented : 8;
-#if CONFIG_STDIO_STRICT
enum stdio_op last_op;
-#endif
#if HAVE_SOCKETS
val family;
val type;
@@ -658,10 +654,10 @@ static int se_fflush(FILE *f)
return ret;
}
-#if CONFIG_STDIO_STRICT
static void stdio_switch(struct stdio_handle *h, enum stdio_op op)
{
if (h->last_op != op) {
+#if CONFIG_STDIO_STRICT
if (h->f) {
switch (h->last_op) {
case stdio_read:
@@ -674,13 +670,12 @@ static void stdio_switch(struct stdio_handle *h, enum stdio_op op)
break;
}
}
-
+#endif
+ if (h->last_op != stdio_none)
+ utf8_decoder_init(&h->ud);
h->last_op = op;
}
}
-#else
-#define stdio_switch(X, Y) ((void) 0)
-#endif
static int stdio_put_char_callback(int ch, mem_t *f)
{
@@ -915,13 +910,24 @@ static val stdio_get_byte(val stream)
{
struct stdio_handle *h = coerce(struct stdio_handle *, stream->co.handle);
- stdio_switch(h, stdio_read);
+ if (h->unget_c) {
+ uw_throwf(file_error_s,
+ lit("get-byte: ~s: pushed-back characters prevent byte reads"),
+ stream, nao);
+ } else {
+ int ch = utf8_getc(&h->ud);
+
+ if (ch != EOF)
+ return num_fast(ch);
- if (h->f) {
- int ch = se_getc(h->f);
- return (ch != EOF) ? num(ch) : stdio_maybe_read_error(stream);
+ stdio_switch(h, stdio_read);
+
+ if (h->f) {
+ int ch = se_getc(h->f);
+ return (ch != EOF) ? num(ch) : stdio_maybe_read_error(stream);
+ }
+ return stdio_maybe_read_error(stream);
}
- return stdio_maybe_read_error(stream);
}
static val stdio_unget_char(val stream, val ch)
@@ -934,11 +940,18 @@ static val stdio_unget_char(val stream, val ch)
static val stdio_unget_byte(val stream, int byte)
{
struct stdio_handle *h = coerce(struct stdio_handle *, stream->co.handle);
-
- errno = 0;
- return h->f != 0 && ungetc(byte, coerce(FILE *, h->f)) != EOF
- ? num_fast(byte)
- : stdio_maybe_error(stream, lit("writing"));
+ if (h->unget_c) {
+ uw_throwf(file_error_s,
+ lit("unget-byte: ~s: previously pushed chars are in the way"),
+ stream, nao);
+ } else {
+ int uch = utf8_ungetc(&h->ud, byte);
+ return (uch == EOF)
+ ? uw_throwf(file_error_s,
+ lit("unget-byte: ~s: out of space pushing ~s"),
+ stream, num_fast(byte), nao)
+ : num_fast(byte);
+ }
}
static ucnum stdio_put_buf(val stream, mem_t *ptr, ucnum len, ucnum pos)
@@ -963,17 +976,24 @@ static ucnum stdio_fill_buf(val stream, mem_t *ptr, ucnum len, ucnum pos)
{
val self = lit("fill-buf");
struct stdio_handle *h = coerce(struct stdio_handle *, stream->co.handle);
+ int ch;
if (convert(size_t, len) != len || len > INT_PTR_MAX)
uw_throwf(error_s, lit("~a: buffer too large"), self, nao);
if (pos >= len)
return len;
- errno = 0;
- if (h->f != 0) {
- cnum nread = fread(ptr + pos, 1, len - pos, h->f);
- if (nread > 0)
- return pos + nread;
+
+ while (pos < len && (ch = utf8_getc(&h->ud)) != EOF)
+ ptr[pos++] = ch;
+
+ if (pos < len) {
+ errno = 0;
+ if (h->f != 0) {
+ cnum nread = fread(ptr + pos, 1, len - pos, h->f);
+ if (nread > 0)
+ return pos + nread;
+ }
+ stdio_maybe_read_error(stream);
}
- stdio_maybe_read_error(stream);
return pos;
}
diff --git a/tests/018/streams.tl b/tests/018/streams.tl
new file mode 100644
index 00000000..12157e48
--- /dev/null
+++ b/tests/018/streams.tl
@@ -0,0 +1,61 @@
+(load "../common")
+
+(push-after-load
+ (each ((file '#"test-file"))
+ (remove-path file)))
+
+(file-put-buf "test-file" #b'e38182e38182e38182')
+
+(with-stream (s (open-file "test-file"))
+ (mtest
+ (true s) t
+ (get-byte s) #xe3
+ (get-byte s) #x81
+ (unget-byte #x81 s) #x81
+ (unget-byte #xe3 s) #xe3
+ (get-char s) #\あ))
+
+(with-stream (s (open-file "test-file"))
+ (mtest
+ (true s) t
+ (get-byte s) #xe3
+ (get-char s) #\xdc81
+ (get-byte s) #x82))
+
+(file-put-buf "test-file" #b'e38122e38182e38182')
+
+(with-stream (s (open-file "test-file"))
+ (let ((b (make-buf 256)))
+ (mtest
+ (true s) t
+ (get-char s) #\xdce3
+ (get-byte s) #x81
+ (fill-buf-adjust b 0 s) 7
+ b #b'22e38182e38182')))
+
+(with-stream (s (open-file "test-file"))
+ (mtest
+ (true s) t
+ (unget-char #\a s) #\a
+ (get-byte s) :error
+ (unget-byte 42 s) :error
+ (get-char s) #\a
+ (unget-byte 42 s) 42
+ (get-byte s) 42
+ (get-byte s) #xe3))
+
+(with-stream (s (open-file "test-file"))
+ (mtest
+ (true s) t
+ (unget-char #\a s) #\a
+ (unget-byte 42 s) :error
+ (get-char s) #\a
+ (unget-byte 42 s) 42
+ (get-byte s) 42
+ (get-byte s) #xe3))
+
+(mtest
+ (unget-byte #x82) #x82
+ (unget-byte #x81) #x81
+ (unget-byte #xe3) #xe3
+ (get-char) #\x3042)
diff --git a/txr.1 b/txr.1
index 48247fce..e1cab5a0 100644
--- a/txr.1
+++ b/txr.1
@@ -67441,9 +67441,10 @@ The number of characters that may be pushed back by
.code unget-char
is not limited.
-Pushing both a byte and a character, in either order, is also unsupported.
-Pushing a byte and then reading a character, or pushing a character and
-reading a byte, are unsupported mixtures of operations.
+Streams may also not support pushing back mixtures of bytes
+and characters, and reading a character when pushed-back bytes
+are present, or reading a byte when pushed-back characters are
+present.
If the stream is binary, then pushing back a byte decrements its position,
except if the position is already zero. At that point, the position becomes
@@ -67453,6 +67454,42 @@ The behavior of pushing back immediately after a
.code seek-stream
positioning operation is unspecified.
+The position reported by
+.code seek-stream
+when it is invoked with a
+.meta whence
+argument value of
+.code :from-current
+is not required to take into account pushed-back bytes or characters.
+
+Uncompressed file streams, tail streams, pipe streams and socket streams make
+the following guarantees:
+.RS
+.IP 1.
+The stream has room for 7 (seven) pushed-back bytes, which are
+shared with the UTF-8 decoder. When, during the most recent
+character-reading operation, the decoder has encountered invalid
+bytes, it may leave up to three bytes in the push-back buffer. This
+worst case occurs when a four-byte sequence is read whose last
+byte is invalid. In this situation, the application has space for
+four bytes of pushback. A character-reading
+operation which returns a code point not lying in the
+range U+DC00 to U+DCFF deposits no bytes into the pushback buffer.
+.IP 2.
+Characters may be pushed back even when one or more pushed back
+bytes are present. Character pushback is unlimited.
+.IP 3.
+Bytes may not be pushed back when one or more characters have been
+pushed back.
+.IP 4.
+Character-reading operations such as
+.code get-char
+first consume the pushed back characters. When those
+are exhausted, then pushed back bytes are consumed and decoded
+as UTF-8. After pushed back bytes are exhausted, bytes are
+read from the buffered stream in the usual way.
+.RE
+
.coNP Functions @, put-string @, put-line @ put-char and @ put-byte
.synb
.mets (put-string < string <> [ stream ])
diff --git a/utf8.c b/utf8.c
index fe0f9d89..0bdb70a8 100644
--- a/utf8.c
+++ b/utf8.c
@@ -375,6 +375,27 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx)
}
}
+int utf8_getc(utf8_decoder_t *ud)
+{
+ if (ud->tail == ud->head) {
+ return EOF;
+ } else {
+ int ch = ud->buf[ud->back];
+ ud->tail = ud->back = (ud->tail + 1) % 8;
+ return ch;
+ }
+}
+
+int utf8_ungetc(utf8_decoder_t *ud, int ch)
+{
+ unsigned ntail = (ud->tail + 7) % 8;
+ if (ntail == ud->head)
+ return EOF;
+ ud->back = ud->tail = ntail;
+ ud->buf[ud->tail] = ch;
+ return ch;
+}
+
FILE *w_fopen(const wchar_t *wname, const wchar_t *wmode)
{
char *name = utf8_dup_to(wname);
diff --git a/utf8.h b/utf8.h
index 3300cfbd..f861bd39 100644
--- a/utf8.h
+++ b/utf8.h
@@ -50,6 +50,8 @@ typedef struct utf8_decoder {
int utf8_encode(wchar_t, int (*put)(int ch, mem_t *ctx), mem_t *ctx);
void utf8_decoder_init(utf8_decoder_t *);
wint_t utf8_decode(utf8_decoder_t *,int (*get)(mem_t *ctx), mem_t *ctx);
+int utf8_getc(utf8_decoder_t *);
+int utf8_ungetc(utf8_decoder_t *, int ch);
FILE *w_fopen(const wchar_t *, const wchar_t *);
FILE *w_freopen(const wchar_t *, const wchar_t *, FILE *);