Allow CSV/TSV reader to read multi-line fields.

author: Jeroen van Rijn <Kelimion@users.noreply.github.com> 2022-05-13 14:19:04 +0200
committer: Jeroen van Rijn <Kelimion@users.noreply.github.com> 2022-05-13 14:19:04 +0200
commit: 7bc21c6691266426824d7a6033c6b8768a93ce70 (patch)
tree: 0eb2cfb9ce985581cf736e56655b31a57cf3b7fd /core/encoding/csv
parent: dd56c85e5508793f30e0fb24692e36e951957103 (diff)
1 files changed, 65 insertions, 21 deletions
diff --git a/core/encoding/csv/reader.odin b/core/encoding/csv/reader.odin
index aecb73d7b..f8f1d4051 100644
--- a/core/encoding/csv/reader.odin
+++ b/core/encoding/csv/reader.odin
@@ -34,6 +34,10 @@ Reader :: struct {
 	// If lazy_quotes is true, a quote may appear in an unquoted field and a non-doubled quote may appear in a quoted field
 	lazy_quotes: bool,
 
+	// multiline_fields, when set to true, allows a field that starts with a " to span multiple lines:
+	// a line break inside such a field is kept as part of the field instead of ending the record
+	multiline_fields: bool,
+
 	// reuse_record controls whether calls to 'read' may return a slice using the backing buffer
 	// for performance
 	// By default, each call to 'read' returns a newly allocated slice
@@ -194,32 +198,72 @@ is_valid_delim :: proc(r: rune) -> bool {
 @private
 _read_record :: proc(r: ^Reader, dst: ^[dynamic]string, allocator := context.allocator) -> ([]string, Error) {
 	read_line :: proc(r: ^Reader) -> ([]byte, io.Error) {
-		line, err := bufio.reader_read_slice(&r.r, '\n')
-		if err == .Buffer_Full {
-			clear(&r.raw_buffer)
-			append(&r.raw_buffer, ..line)
-			for err == .Buffer_Full {
-				line, err = bufio.reader_read_slice(&r.r, '\n')
+		if !r.multiline_fields { // single-line mode: one record per physical line, read up to the next '\n'
+			line, err := bufio.reader_read_slice(&r.r, '\n')
+			if err == .Buffer_Full { // reader reported .Buffer_Full: stitch the partial slices together in raw_buffer
+				clear(&r.raw_buffer)
 				append(&r.raw_buffer, ..line)
+				for err == .Buffer_Full {
+					line, err = bufio.reader_read_slice(&r.r, '\n')
+					append(&r.raw_buffer, ..line)
+				}
+				line = r.raw_buffer[:]
 			}
-			line = r.raw_buffer[:]
-		}
-		if len(line) > 0 && err == .EOF {
-			err = nil
-			if line[len(line)-1] == '\r' {
-				line = line[:len(line)-1]
+			if len(line) > 0 && err == .EOF { // non-empty final line at EOF: clear the error and drop a trailing '\r'
+				err = nil
+				if line[len(line)-1] == '\r' {
+					line = line[:len(line)-1]
+				}
 			}
-		}
-		r.line_count += 1
+			r.line_count += 1

-		// normalize \r\n to \n
-		n := len(line)
-		for n >= 2 && string(line[n-2:]) == "\r\n" {
-			line[n-2] = '\n'
-			line = line[:n-1]
-		}
+			// normalize \r\n to \n
+			n := len(line)
+			for n >= 2 && string(line[n-2:]) == "\r\n" {
+				line[n-2] = '\n'
+				line = line[:n-1]
+			}
+			return line, err
+
+		} else {
+			// Multi-line mode: a field that starts with a " may contain line breaks, so the record is
+			// assembled one rune at a time into raw_buffer instead of being sliced at the next '\n'.

-		return line, err
+			err:       io.Error
+			cur:       rune
+			is_quoted: bool // true while inside a field whose first rune was a "
+
+			field_length := 0 // runes counted in the current field; reset to 0 at every r.comma
+
+			clear(&r.raw_buffer)
+
+			read_loop: for err == .None {
+				cur, _, err = bufio.reader_read_rune(&r.r)
+
+				if err != .None { break read_loop } // EOF or read error: stop, err is returned unchanged
+
+				switch cur {
+				case '"':
+					is_quoted = field_length == 0 // only a quote that is the field's first rune sets the flag; any later quote clears it (NOTE(review): including a doubled "" — confirm)
+					field_length += 1
+
+				case '\n', '\r':
+					if !is_quoted { break read_loop } // unquoted line break ends the record; the terminator is consumed but not appended
+
+				case r.comma:
+					field_length = 0 // NOTE(review): also resets for a comma inside a quoted field — confirm intended
+
+				case:
+					field_length += 1
+				}
+
+				rune_buf, rune_len := utf8.encode_rune(cur)
+				append(&r.raw_buffer, ..rune_buf[:rune_len])
+			}
+
+			return r.raw_buffer[:], err // NOTE(review): unlike the single-line branch, r.line_count is not updated and .EOF is not cleared for a non-empty final record — confirm intended
+		}
+		unreachable()
 	}
 
 	length_newline :: proc(b: []byte) -> int {
author	Jeroen van Rijn <Kelimion@users.noreply.github.com>	2022-05-13 14:19:04 +0200
committer	Jeroen van Rijn <Kelimion@users.noreply.github.com>	2022-05-13 14:19:04 +0200
commit	7bc21c6691266426824d7a6033c6b8768a93ce70 (patch)
tree	0eb2cfb9ce985581cf736e56655b31a57cf3b7fd /core/encoding/csv
parent	dd56c85e5508793f30e0fb24692e36e951957103 (diff)