summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- stream.c 70
-rw-r--r-- tests/018/streams.tl 61
-rw-r--r-- txr.1 43
-rw-r--r-- utf8.c 21
-rw-r--r-- utf8.h 2
5 files changed, 169 insertions, 28 deletions
diff --git a/stream.c b/stream.c
index 5e11e67d..9f358a3c 100644
--- a/stream.c
+++ b/stream.c
@@ -511,9 +511,7 @@ val make_null_stream(void)
return cobj(coerce(mem_t *, n), stream_cls, &null_ops.cobj_ops);
}
-#if CONFIG_STDIO_STRICT
enum stdio_op { stdio_none, stdio_read, stdio_write };
-#endif
struct stdio_handle {
struct strm_base a;
@@ -530,9 +528,7 @@ struct stdio_handle {
unsigned is_rotated : 8; /* used by tail */
unsigned is_real_time : 8;
unsigned is_byte_oriented : 8;
-#if CONFIG_STDIO_STRICT
enum stdio_op last_op;
-#endif
#if HAVE_SOCKETS
val family;
val type;
@@ -658,10 +654,10 @@ static int se_fflush(FILE *f)
return ret;
}
-#if CONFIG_STDIO_STRICT
static void stdio_switch(struct stdio_handle *h, enum stdio_op op)
{
if (h->last_op != op) {
+#if CONFIG_STDIO_STRICT
if (h->f) {
switch (h->last_op) {
case stdio_read:
@@ -674,13 +670,12 @@ static void stdio_switch(struct stdio_handle *h, enum stdio_op op)
break;
}
}
-
+#endif
+ if (h->last_op != stdio_none)
+ utf8_decoder_init(&h->ud);
h->last_op = op;
}
}
-#else
-#define stdio_switch(X, Y) ((void) 0)
-#endif
static int stdio_put_char_callback(int ch, mem_t *f)
{
@@ -915,13 +910,24 @@ static val stdio_get_byte(val stream)
{
struct stdio_handle *h = coerce(struct stdio_handle *, stream->co.handle);
- stdio_switch(h, stdio_read);
+ if (h->unget_c) {
+ uw_throwf(file_error_s,
+ lit("get-byte: ~s: pushed-back characters prevent byte reads"),
+ stream, nao);
+ } else {
+ int ch = utf8_getc(&h->ud);
+
+ if (ch != EOF)
+ return num_fast(ch);
- if (h->f) {
- int ch = se_getc(h->f);
- return (ch != EOF) ? num(ch) : stdio_maybe_read_error(stream);
+ stdio_switch(h, stdio_read);
+
+ if (h->f) {
+ int ch = se_getc(h->f);
+ return (ch != EOF) ? num(ch) : stdio_maybe_read_error(stream);
+ }
+ return stdio_maybe_read_error(stream);
}
- return stdio_maybe_read_error(stream);
}
static val stdio_unget_char(val stream, val ch)
@@ -934,11 +940,18 @@ static val stdio_unget_char(val stream, val ch)
static val stdio_unget_byte(val stream, int byte)
{
struct stdio_handle *h = coerce(struct stdio_handle *, stream->co.handle);
-
- errno = 0;
- return h->f != 0 && ungetc(byte, coerce(FILE *, h->f)) != EOF
- ? num_fast(byte)
- : stdio_maybe_error(stream, lit("writing"));
+ if (h->unget_c) {
+ uw_throwf(file_error_s,
+ lit("unget-byte: ~s: previously pushed chars are in the way"),
+ stream, nao);
+ } else {
+ int uch = utf8_ungetc(&h->ud, byte);
+ return (uch == EOF)
+ ? uw_throwf(file_error_s,
+ lit("unget-byte: ~s: out of space pushing ~s"),
+ stream, num_fast(byte), nao)
+ : num_fast(byte);
+ }
}
static ucnum stdio_put_buf(val stream, mem_t *ptr, ucnum len, ucnum pos)
@@ -963,17 +976,24 @@ static ucnum stdio_fill_buf(val stream, mem_t *ptr, ucnum len, ucnum pos)
{
val self = lit("fill-buf");
struct stdio_handle *h = coerce(struct stdio_handle *, stream->co.handle);
+ int ch;
if (convert(size_t, len) != len || len > INT_PTR_MAX)
uw_throwf(error_s, lit("~a: buffer too large"), self, nao);
if (pos >= len)
return len;
- errno = 0;
- if (h->f != 0) {
- cnum nread = fread(ptr + pos, 1, len - pos, h->f);
- if (nread > 0)
- return pos + nread;
+
+ while (pos < len && (ch = utf8_getc(&h->ud)) != EOF)
+ ptr[pos++] = ch;
+
+ if (pos < len) {
+ errno = 0;
+ if (h->f != 0) {
+ cnum nread = fread(ptr + pos, 1, len - pos, h->f);
+ if (nread > 0)
+ return pos + nread;
+ }
+ stdio_maybe_read_error(stream);
}
- stdio_maybe_read_error(stream);
return pos;
}
diff --git a/tests/018/streams.tl b/tests/018/streams.tl
new file mode 100644
index 00000000..12157e48
--- /dev/null
+++ b/tests/018/streams.tl
@@ -0,0 +1,61 @@
+(load "../common")
+
+(push-after-load
+ (each ((file '#"test-file"))
+ (remove-path file)))
+
+(file-put-buf "test-file" #b'e38182e38182e38182')
+
+(with-stream (s (open-file "test-file"))
+ (mtest
+ (true s) t
+ (get-byte s) #xe3
+ (get-byte s) #x81
+ (unget-byte #x81 s) #x81
+ (unget-byte #xe3 s) #xe3
+ (get-char s) #\あ))
+
+(with-stream (s (open-file "test-file"))
+ (mtest
+ (true s) t
+ (get-byte s) #xe3
+ (get-char s) #\xdc81
+ (get-byte s) #x82))
+
+(file-put-buf "test-file" #b'e38122e38182e38182')
+
+(with-stream (s (open-file "test-file"))
+ (let ((b (make-buf 256)))
+ (mtest
+ (true s) t
+ (get-char s) #\xdce3
+ (get-byte s) #x81
+ (fill-buf-adjust b 0 s) 7
+ b #b'22e38182e38182')))
+
+(with-stream (s (open-file "test-file"))
+ (mtest
+ (true s) t
+ (unget-char #\a s) #\a
+ (get-byte s) :error
+ (unget-byte 42 s) :error
+ (get-char s) #\a
+ (unget-byte 42 s) 42
+ (get-byte s) 42
+ (get-byte s) #xe3))
+
+(with-stream (s (open-file "test-file"))
+ (mtest
+ (true s) t
+ (unget-char #\a s) #\a
+ (unget-byte 42 s) :error
+ (get-char s) #\a
+ (unget-byte 42 s) 42
+ (get-byte s) 42
+ (get-byte s) #xe3))
+
+(mtest
+ (unget-byte #x82) #x82
+ (unget-byte #x81) #x81
+ (unget-byte #xe3) #xe3
+ (get-char) #\x3042)
diff --git a/txr.1 b/txr.1
index 48247fce..e1cab5a0 100644
--- a/txr.1
+++ b/txr.1
@@ -67441,9 +67441,10 @@ The number of characters that may be pushed back by
.code unget-char
is not limited.
-Pushing both a byte and a character, in either order, is also unsupported.
-Pushing a byte and then reading a character, or pushing a character and
-reading a byte, are unsupported mixtures of operations.
+Streams also might not support pushing back mixtures of bytes
+and characters, reading a character when pushed-back bytes
+are present, or reading a byte when pushed-back characters are
+present.
If the stream is binary, then pushing back a byte decrements its position,
except if the position is already zero. At that point, the position becomes
@@ -67453,6 +67454,42 @@ The behavior of pushing back immediately after a
.code seek-stream
positioning operation is unspecified.
+The position reported by
+.code seek-stream
+when it is invoked with a
+.meta whence
+argument value of
+.code :from-current
+is not required to take into account pushed-back bytes or characters.
+
+Uncompressed file streams, tail streams, pipe streams and socket streams make
+the following guarantees:
+.RS
+.IP 1.
+The stream has room for 7 (seven) pushed-back bytes, which are
+shared with the UTF-8 decoder. When, during the most recent
+character-reading operation, the decoder has encountered invalid
+bytes, it may leave up to three bytes in the push-back buffer. This
+worst case occurs when a four-byte sequence is read whose last
+byte is invalid. In this situation, the application has space for
+four bytes of pushback. A character-reading
+operation which returns a code point not lying in the
+range U+DC00 to U+DCFF deposits no bytes into the pushback buffer.
+.IP 2.
+Characters may be pushed back even when one or more pushed-back
+bytes are present. Character pushback is unlimited.
+.IP 3.
+Bytes may not be pushed back when one or more characters have been
+pushed back.
+.IP 4.
+Character-reading operations such as
+.code get-char
+first consume the pushed-back characters. When those
+are exhausted, then pushed-back bytes are consumed and decoded
+as UTF-8. After pushed-back bytes are exhausted, bytes are
+read from the buffered stream in the usual way.
+.RE
+
.coNP Functions @, put-string @, put-line @ put-char and @ put-byte
.synb
.mets (put-string < string <> [ stream ])
diff --git a/utf8.c b/utf8.c
index fe0f9d89..0bdb70a8 100644
--- a/utf8.c
+++ b/utf8.c
@@ -375,6 +375,27 @@ wint_t utf8_decode(utf8_decoder_t *ud, int (*get)(mem_t *ctx), mem_t *ctx)
}
}
+int utf8_getc(utf8_decoder_t *ud) /* Remove and return one byte held in the decoder's buffer, or EOF when it holds none. */
+{
+ if (ud->tail == ud->head) { /* tail has caught up with head: nothing is buffered */
+ return EOF;
+ } else {
+ int ch = ud->buf[ud->back]; /* NOTE(review): reads at back rather than tail; presumably the two are equal on entry -- confirm against utf8_decode */
+ ud->tail = ud->back = (ud->tail + 1) % 8; /* move both indices one slot forward, wrapping modulo 8 */
+ return ch;
+ }
+}
+
+int utf8_ungetc(utf8_decoder_t *ud, int ch) /* Push ch back into the decoder's buffer; returns ch, or EOF when there is no room. */
+{
+ unsigned ntail = (ud->tail + 7) % 8; /* index one slot behind tail, wrapping modulo 8 */
+ if (ntail == ud->head) /* stepping back would land on head, so refuse the push */
+ return EOF;
+ ud->back = ud->tail = ntail; /* back is moved together with tail */
+ ud->buf[ud->tail] = ch; /* store the value at the new tail slot */
+ return ch;
+}
+
FILE *w_fopen(const wchar_t *wname, const wchar_t *wmode)
{
char *name = utf8_dup_to(wname);
diff --git a/utf8.h b/utf8.h
index 3300cfbd..f861bd39 100644
--- a/utf8.h
+++ b/utf8.h
@@ -50,6 +50,8 @@ typedef struct utf8_decoder {
int utf8_encode(wchar_t, int (*put)(int ch, mem_t *ctx), mem_t *ctx);
void utf8_decoder_init(utf8_decoder_t *);
wint_t utf8_decode(utf8_decoder_t *,int (*get)(mem_t *ctx), mem_t *ctx);
+int utf8_getc(utf8_decoder_t *);
+int utf8_ungetc(utf8_decoder_t *, int ch);
FILE *w_fopen(const wchar_t *, const wchar_t *);
FILE *w_freopen(const wchar_t *, const wchar_t *, FILE *);