diff options
| author | Jeroen van Rijn <Kelimion@users.noreply.github.com> | 2022-05-13 14:19:04 +0200 |
|---|---|---|
| committer | Jeroen van Rijn <Kelimion@users.noreply.github.com> | 2022-05-13 14:19:04 +0200 |
| commit | 7bc21c6691266426824d7a6033c6b8768a93ce70 (patch) | |
| tree | 0eb2cfb9ce985581cf736e56655b31a57cf3b7fd /core/encoding/csv | |
| parent | dd56c85e5508793f30e0fb24692e36e951957103 (diff) | |
Allow CSV/TSV reader to read multi-line fields.
Diffstat (limited to 'core/encoding/csv')
| -rw-r--r-- | core/encoding/csv/reader.odin | 86 |
1 file changed, 65 insertions, 21 deletions
```diff
diff --git a/core/encoding/csv/reader.odin b/core/encoding/csv/reader.odin
index aecb73d7b..f8f1d4051 100644
--- a/core/encoding/csv/reader.odin
+++ b/core/encoding/csv/reader.odin
@@ -34,6 +34,10 @@ Reader :: struct {
 	// If lazy_quotes is true, a quote may appear in an unquoted field and a non-doubled quote may appear in a quoted field
 	lazy_quotes: bool,
 
+	// multiline_fields, when set to true, will treat a field starting with a " as a multiline string
+	// therefore, instead of reading until the next \n, it'll read until the next "
+	multiline_fields: bool,
+
 	// reuse_record controls whether calls to 'read' may return a slice using the backing buffer
 	// for performance
 	// By default, each call to 'read' returns a newly allocated slice
@@ -194,32 +198,72 @@ is_valid_delim :: proc(r: rune) -> bool {
 @private
 _read_record :: proc(r: ^Reader, dst: ^[dynamic]string, allocator := context.allocator) -> ([]string, Error) {
 	read_line :: proc(r: ^Reader) -> ([]byte, io.Error) {
-		line, err := bufio.reader_read_slice(&r.r, '\n')
-		if err == .Buffer_Full {
-			clear(&r.raw_buffer)
-			append(&r.raw_buffer, ..line)
-			for err == .Buffer_Full {
-				line, err = bufio.reader_read_slice(&r.r, '\n')
+		if !r.multiline_fields {
+			line, err := bufio.reader_read_slice(&r.r, '\n')
+			if err == .Buffer_Full {
+				clear(&r.raw_buffer)
 				append(&r.raw_buffer, ..line)
+				for err == .Buffer_Full {
+					line, err = bufio.reader_read_slice(&r.r, '\n')
+					append(&r.raw_buffer, ..line)
+				}
+				line = r.raw_buffer[:]
 			}
-			line = r.raw_buffer[:]
-		}
-		if len(line) > 0 && err == .EOF {
-			err = nil
-			if line[len(line)-1] == '\r' {
-				line = line[:len(line)-1]
+			if len(line) > 0 && err == .EOF {
+				err = nil
+				if line[len(line)-1] == '\r' {
+					line = line[:len(line)-1]
+				}
 			}
-		}
-		r.line_count += 1
+			r.line_count += 1
 
-		// normalize \r\n to \n
-		n := len(line)
-		for n >= 2 && string(line[n-2:]) == "\r\n" {
-			line[n-2] = '\n'
-			line = line[:n-1]
-		}
+			// normalize \r\n to \n
+			n := len(line)
+			for n >= 2 && string(line[n-2:]) == "\r\n" {
+				line[n-2] = '\n'
+				line = line[:n-1]
+			}
+			return line, err
+
+		} else {
+			// Reading a "line" that can possibly contain multiline fields.
+			// Unfortunately, this means we need to read a character at a time.
 
-		return line, err
+			err: io.Error
+			cur: rune
+			is_quoted: bool
+
+			field_length := 0
+
+			clear(&r.raw_buffer)
+
+			read_loop: for err == .None {
+				cur, _, err = bufio.reader_read_rune(&r.r)
+
+				if err != .None { break read_loop }
+
+				switch cur {
+				case '"':
+					is_quoted = field_length == 0
+					field_length += 1
+
+				case '\n', '\r':
+					if !is_quoted { break read_loop }
+
+				case r.comma:
+					field_length = 0
+
+				case:
+					field_length += 1
+				}
+
+				rune_buf, rune_len := utf8.encode_rune(cur)
+				append(&r.raw_buffer, ..rune_buf[:rune_len])
+			}
+
+			return r.raw_buffer[:], err
+		}
+		unreachable()
 	}
 
 	length_newline :: proc(b: []byte) -> int {
```