diff options
| author | gingerBill <bill@gingerbill.org> | 2021-06-25 14:31:35 +0100 |
|---|---|---|
| committer | gingerBill <bill@gingerbill.org> | 2021-06-25 14:31:35 +0100 |
| commit | f1cf724bd43996260a1b665e914f0bf48ffed9a8 (patch) | |
| tree | 0bdfe4c16cc993ac8972bc199081c61e21a29b24 /core/bufio | |
| parent | fc809d3fad41b89b2a4bfa0045b58cb29989174a (diff) | |
Add `bufio.Scanner`
Diffstat (limited to 'core/bufio')
| -rw-r--r-- | core/bufio/scanner.odin | 340 |
1 files changed, 340 insertions, 0 deletions
diff --git a/core/bufio/scanner.odin b/core/bufio/scanner.odin new file mode 100644 index 000000000..1dd13f19a --- /dev/null +++ b/core/bufio/scanner.odin @@ -0,0 +1,340 @@ +package bufio + +import "core:bytes" +import "core:io" +import "core:mem" +import "core:unicode/utf8" +import "intrinsics" + +// Extra errors returns by scanning procedures +Scanner_Extra_Error :: enum i32 { + Negative_Advance, + Advanced_Too_Far, + Bad_Read_Count, + Too_Long, + Too_Short, +} + +Scanner_Error :: union { + io.Error, + Scanner_Extra_Error, +} + +// Split_Proc is the signature of the split procedure used to tokenize the input. +Split_Proc :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte, err: Scanner_Error, final_token: bool); + +Scanner :: struct { + r: io.Reader, + split: Split_Proc, + + buf: [dynamic]byte, + max_token_size: int, + start: int, + end: int, + token: []byte, + + _err: Scanner_Error, + max_consecutive_empty_reads: int, + successive_empty_token_count: int, + scan_called: bool, + done: bool, +} + +DEFAULT_MAX_SCAN_TOKEN_SIZE :: 1<<16; + +@(private) +_INIT_BUF_SIZE :: 4096; + +scanner_init :: proc(s: ^Scanner, r: io.Reader, buf_allocator := context.allocator) -> ^Scanner { + s.r = r; + s.split = scan_lines; + s.max_token_size = DEFAULT_MAX_SCAN_TOKEN_SIZE; + s.buf.allocator = buf_allocator; + return s; +} +scanner_init_with_buffer :: proc(s: ^Scanner, r: io.Reader, buf: []byte) -> ^Scanner { + s.r = r; + s.split = scan_lines; + s.max_token_size = DEFAULT_MAX_SCAN_TOKEN_SIZE; + s.buf = mem.buffer_from_slice(buf); + resize(&s.buf, cap(s.buf)); + return s; +} +scanner_destroy :: proc(s: ^Scanner) { + delete(s.buf); +} + + +// Returns the first non-EOF error that was encounted by the scanner +scanner_error :: proc(s: ^Scanner) -> Scanner_Error { + switch s._err { + case .EOF, .None: + return nil; + } + return s._err; +} + +// Returns the most recent token created by scanner_scan. +// The underlying array may point to data that may be overwritten +// by another call to scanner_scan. +// Treat the returned value as if it is immutable. +scanner_bytes :: proc(s: ^Scanner) -> []byte { + return s.token; +} + +// Returns the most recent token created by scanner_scan. +// The underlying array may point to data that may be overwritten +// by another call to scanner_scan. +// Treat the returned value as if it is immutable. +scanner_text :: proc(s: ^Scanner) -> string { + return string(s.token); +} + +// scanner_scan advances the scanner +scanner_scan :: proc(s: ^Scanner) -> bool { + set_err :: proc(s: ^Scanner, err: Scanner_Error) { + err := err; + if err == .None { + err = nil; + } + switch s._err { + case nil, .EOF: + s._err = err; + } + } + + if s.done { + return false; + } + s.scan_called = true; + + for { + // Check if a token is possible with what is available + // Allow the split procedure to recover if it fails + if s.start < s.end || s._err != nil { + advance, token, err, final_token := s.split(s.buf[s.start:s.end], s._err != nil); + if final_token { + s.token = token; + s.done = true; + return true; + } + if err != nil { + set_err(s, err); + return false; + } + + // Do advance + if advance < 0 { + set_err(s, .Negative_Advance); + return false; + } + if advance > s.end-s.start { + set_err(s, .Advanced_Too_Far); + return false; + } + s.start += advance; + + s.token = token; + if s.token != nil { + if s._err == nil || advance > 0 { + s.successive_empty_token_count = 0; + } else { + s.successive_empty_token_count += 1; + + if s.max_consecutive_empty_reads <= 0 { + s.max_consecutive_empty_reads = DEFAULT_MAX_CONSECUTIVE_EMPTY_READS; + } + if s.successive_empty_token_count > s.max_consecutive_empty_reads { + set_err(s, .No_Progress); + return false; + } + } + return true; + } + } + + // If an error is hit, no token can be created + if s._err != nil { + s.start = 0; + s.end = 0; + return false; + } + + // More data must be required to be read + if s.start > 0 && (s.end == len(s.buf) || s.start > len(s.buf)/2) { + copy(s.buf[:], s.buf[s.start:s.end]); + s.end -= s.start; + s.start = 0; + } + + could_be_too_short := false; + + // Resize the buffer if full + if s.end == len(s.buf) { + if s.max_token_size <= 0 { + s.max_token_size = DEFAULT_MAX_SCAN_TOKEN_SIZE; + } + if len(s.buf) >= s.max_token_size { + set_err(s, .Too_Long); + return false; + } + // overflow check + new_size := _INIT_BUF_SIZE; + if len(s.buf) > 0 { + overflowed: bool; + if new_size, overflowed = intrinsics.overflow_mul(len(s.buf), 2); overflowed { + set_err(s, .Too_Long); + return false; + } + } + + old_size := len(s.buf); + new_size = min(new_size, s.max_token_size); + resize(&s.buf, new_size); + s.end -= s.start; + s.start = 0; + + could_be_too_short = old_size >= len(s.buf); + + } + + // Read data into the buffer + loop := 0; + for { + n, err := io.read(s.r, s.buf[s.end:len(s.buf)]); + if n < 0 || len(s.buf)-s.end < n { + set_err(s, .Bad_Read_Count); + break; + } + s.end += n; + if err != nil { + set_err(s, err); + break; + } + if n > 0 { + s.successive_empty_token_count = 0; + break; + } + loop += 1; + + if s.max_consecutive_empty_reads <= 0 { + s.max_consecutive_empty_reads = DEFAULT_MAX_CONSECUTIVE_EMPTY_READS; + } + if loop > s.max_consecutive_empty_reads { + if could_be_too_short { + set_err(s, .Too_Short); + } else { + set_err(s, .No_Progress); + } + break; + } + } + } +} + +scan_bytes :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte, err: Scanner_Error, final_token: bool) { + if at_eof && len(data) == 0 { + return; + } + return 1, data[0:1], nil, false; +} + +scan_runes :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte, err: Scanner_Error, final_token: bool) { + if at_eof && len(data) == 0 { + return; + } + + if data[0] < utf8.RUNE_SELF { + advance = 1; + token = data[0:1]; + return; + } + + _, width := utf8.decode_rune(data); + if width > 1 { + advance = width; + token = data[0:width]; + return; + } + + if !at_eof && !utf8.full_rune(data) { + return; + } + + @thread_local ERROR_RUNE := []byte{0xef, 0xbf, 0xbd}; + + advance = 1; + token = ERROR_RUNE; + return; +} + +scan_words :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte, err: Scanner_Error, final_token: bool) { + is_space :: proc "contextless" (r: rune) -> bool { + switch r { + // lower ones + case ' ', '\t', '\n', '\v', '\f', '\r': + return true; + case '\u0085', '\u00a0': + return true; + // higher ones + case '\u2000' ..= '\u200a': + return true; + case '\u1680', '\u2028', '\u2029', '\u202f', '\u205f', '\u3000': + return true; + } + return false; + } + + // skip spaces at the beginning + start := 0; + for width := 0; start < len(data); start += width { + r: rune; + r, width = utf8.decode_rune(data[start:]); + if !is_space(r) { + break; + } + } + + for width, i := 0, start; i < len(data); i += width { + r: rune; + r, width = utf8.decode_rune(data[i:]); + if is_space(r) { + advance = i+width; + token = data[start:i]; + return; + } + } + + if at_eof && len(data) > start { + advance = len(data); + token = data[start:]; + return; + } + + advance = start; + return; +} + +scan_lines :: proc(data: []byte, at_eof: bool) -> (advance: int, token: []byte, err: Scanner_Error, final_token: bool) { + trim_carriage_return :: proc "contextless" (data: []byte) -> []byte { + if len(data) > 0 && data[len(data)-1] == '\r' { + return data[0:len(data)-1]; + } + return data; + } + + if at_eof && len(data) == 0 { + return; + } + if i := bytes.index_byte(data, '\n'); i >= 0 { + advance = i+1; + token = trim_carriage_return(data[0:i]); + return; + } + + if at_eof { + advance = len(data); + token = trim_carriage_return(data); + } + return; +} |