diff options
author | Kaz Kylheku <kaz@kylheku.com> | 2025-05-22 22:43:36 -0700 |
---|---|---|
committer | Kaz Kylheku <kaz@kylheku.com> | 2025-05-22 22:43:36 -0700 |
commit | a619222ee1eb25d48516f6cfcb848a17eab052fb (patch) | |
tree | e4aa8423e8d8b0359192f493334aba540dc3c5a2 | |
parent | ccaacb51f45b21da7f8229dcc27d20a02c551e77 (diff) | |
download | txr-a619222ee1eb25d48516f6cfcb848a17eab052fb.tar.gz txr-a619222ee1eb25d48516f6cfcb848a17eab052fb.tar.bz2 txr-a619222ee1eb25d48516f6cfcb848a17eab052fb.zip |
streams: get-char for string byte input streams.
String byte input streams extended to provide
character input (get-char), and any mixture of
unget-byte and unget-char. Also fill-buf
is supported.
* stream.c (struct byte_input): New utf8_decoder_t
member ud.
(struct byte_input_ungetch): New struct type.
(byte_in_get_char_callback, byte_in_get_char,
byte_in_unget_char_callback, byte_in_unget_char):
New functions.
(byte_in_ops): Wire in byte_in_get_char,
byte_in_unget_char and byte_in_unget_byte.
Also generic_fill_buf.
(make_string_byte_input_stream): Initialize the
UTF-8 decoder.
* tests/018/streams.tl: New tests.
* txr.1: Documented.
-rw-r--r-- | stream.c | 61 | ||||
-rw-r--r-- | tests/018/streams.tl | 13 | ||||
-rw-r--r-- | txr.1 | 41 |
3 files changed, 109 insertions, 6 deletions
@@ -2081,6 +2081,12 @@ struct byte_input { unsigned char *buf; size_t size; size_t index; + utf8_decoder_t ud; +}; + +struct byte_input_ungetch { + unsigned char buf[8]; + unsigned char *ptr; }; static void byte_in_stream_destroy(val stream) @@ -2092,6 +2098,53 @@ static void byte_in_stream_destroy(val stream) free(bi); } +static int byte_in_get_char_callback(mem_t *ctx) +{ + struct byte_input *bi = coerce(struct byte_input *, ctx); + return (bi->index < bi->size) ? bi->buf[bi->index++] : EOF; +} + +static val byte_in_get_char(val stream) +{ + struct byte_input *bi = coerce(struct byte_input *, stream->co.handle); + wint_t wch = utf8_decode(&bi->ud, byte_in_get_char_callback, + coerce(mem_t *, bi)); + int ch; + + while ((ch = utf8_getc(&bi->ud)) != EOF) + if (bi->index > 0) + bi->buf[--bi->index] = ch; + + return (wch != WEOF) ? chr(wch) : nil; +} + +static int byte_in_unget_char_callback(int ch, mem_t *ctx) +{ + struct byte_input_ungetch *bu = coerce(struct byte_input_ungetch *, ctx); + return (bu->ptr > bu->buf) ? 
*--bu->ptr = ch, 1 : 0; +} + +static val byte_in_unget_char(val stream, val ch) +{ + struct byte_input *bi = coerce(struct byte_input *, stream->co.handle); + struct byte_input_ungetch bu; + unsigned char *bend = bu.buf + sizeof bu.buf; + + bu.ptr = bend; + + (void) utf8_encode(c_chr(ch), byte_in_unget_char_callback, coerce(mem_t *, &bu)); + + if (convert(size_t, bend - bu.ptr) > bi->index) + uw_throwf(file_error_s, + lit("unget-char: cannot push past beginning of byte stream"), + nao); + + while (bu.ptr < bend) + bi->buf[--bi->index] = *bu.ptr++; + + return ch; +} + static val byte_in_get_byte(val stream) { struct byte_input *bi = coerce(struct byte_input *, stream->co.handle); @@ -2133,10 +2186,13 @@ static struct strm_ops byte_in_ops = cobj_eq_hash_op, 0), wli("byte-input-stream"), - 0, 0, 0, 0, 0, + 0, 0, 0, 0, + byte_in_get_char, byte_in_get_byte, + byte_in_unget_char, + byte_in_unget_byte, 0, - byte_in_unget_byte, 0, 0, + generic_fill_buf, 0, 0, 0, 0, 0, 0, byte_in_get_error, byte_in_get_error_str, @@ -2153,6 +2209,7 @@ val make_string_byte_input_stream(val string) strm_base_init(&bi->a); bi->buf = utf8_dup_to_buf(wstring, &bi->size, 0); bi->index = 0; + utf8_decoder_init(&bi->ud); return cobj(coerce(mem_t *, bi), stream_cls, &byte_in_ops.cobj_ops); } } diff --git a/tests/018/streams.tl b/tests/018/streams.tl index 12157e48..1350131f 100644 --- a/tests/018/streams.tl +++ b/tests/018/streams.tl @@ -59,3 +59,16 @@ (unget-byte #x81) #x81 (unget-byte #xe3) #xe3 (get-char) #\x3042) + +(with-in-string-byte-stream (s "ABCD") + (mtest + (unget-byte 3 s) :error + (get-char s) #\A + (get-byte s) 66 + (get-char s) #\C + (unget-char #\x3042 s) #\x3042 + (get-char s) #\x3042 + (unget-char #\x3042 s) #\x3042 + (get-byte s) #xe3 + (get-char s) #\xdc81 + (unget-char #\x3042 s) :error)) @@ -66882,11 +66882,20 @@ Output operations and byte operations are not supported. .desc The .code make-string-byte-input-stream -function produces an input stream object. 
Byte read operations on -this stream object read successive byte values obtained by encoding +function produces an input stream object. The +.code get-byte +operation on this stream type reads successive byte values obtained by +encoding .meta string -into UTF-8. Character read operations are not supported, and neither -are output operations. +into UTF-8. The +.code get-char +operation is also supported, via decoding UTF-8 characters +from the byte sequence. Both +.code unget-byte +and +.code unget-char +are supported. +Output and positioning operations are not supported. .coNP Function @ make-strlist-input-stream .synb @@ -67490,6 +67499,30 @@ as UTF-8. After pushed back bytes are exhausted, bytes are read from the buffered stream in the usual way. .RE +String byte input streams, created by +.codn make-string-byte-input-stream , +make the following guarantees: +.RS +.IP 1. +Only as many bytes may be pushed back as have been read +from the stream. +.IP 2. +When a character is pushed back, it is converted to a UTF-8 +byte sequence. The byte sequence is then pushed back, in +reverse order, so that a subsequent +.code get-char +operator can recover the same character via UTF-8 decoding. +If there isn't room to push back all of the bytes, +an exception of type +.code file-error +is thrown. +.IP 3. +Since push-back is byte-based, rather than a separate mechanism +for characters and bytes, as in the case of file streams, +mixtures of getting and ungetting bytes and characters +are supported. +.RE + .coNP Functions @, put-string @, put-line @ put-char and @ put-byte .synb .mets (put-string < string <> [ stream ]) |