diff options
| author | gingerBill <bill@gingerbill.org> | 2021-01-08 23:24:35 +0000 |
|---|---|---|
| committer | gingerBill <bill@gingerbill.org> | 2021-01-08 23:24:35 +0000 |
| commit | da380d6fc41bb36de1eb8d820e15715e986710ba (patch) | |
| tree | bb114979581848e7a070df0bbca8fcb396227f2d /core/encoding/csv/reader.odin | |
| parent | bf183b2c2c619335da86b7ad7170a55492e802a9 (diff) | |
Add encoding/csv `Reader`
Diffstat (limited to 'core/encoding/csv/reader.odin')
| -rw-r--r-- | core/encoding/csv/reader.odin | 406 |
1 files changed, 406 insertions, 0 deletions
// package csv reads and writes comma-separated values (CSV) files.
// This package supports the format described in RFC 4180 <https://tools.ietf.org/html/rfc4180.html>
package csv

import "core:bufio"
import "core:bytes"
import "core:io"
import "core:strings"
import "core:unicode/utf8"

// Reader is a data structure used for reading records from a CSV-encoded file
//
// The associated procedures for Reader expect its input to conform to RFC 4180.
Reader :: struct {
	// comma is the field delimiter
	// reader_init will set it to be ','
	// A "comma" must be a valid rune, and it must not be \r, \n, or the Unicode replacement character (0xfffd)
	comma: rune,

	// comment, if not 0, is the comment character
	// Lines beginning with the comment character without a preceding whitespace are ignored
	comment: rune,

	// fields_per_record is the number of expected fields per record
	// if fields_per_record is >0, 'read' requires each record to have that field count
	// if fields_per_record is 0, 'read' sets it to the field count in the first record
	// if fields_per_record is <0, no check is made and records may have a variable field count
	fields_per_record: int,

	// If trim_leading_space is true, leading whitespace in a field is ignored
	// This is done even if the field delimiter (comma), is whitespace
	trim_leading_space: bool,

	// If lazy_quotes is true, a quote may appear in an unquoted field and a non-doubled quote may appear in a quoted field
	lazy_quotes: bool,

	// reuse_record controls whether calls to 'read' may return a slice using the backing buffer
	// for performance
	// By default, each call to 'read' returns a newly allocated slice
	reuse_record: bool,

	// reuse_record_buffer controls whether calls to 'read' clone the strings of each field or uses
	// the data stored in record buffer for performance
	// By default, each call to 'read' clones the strings of each field
	reuse_record_buffer: bool,


	// internal buffers
	r: bufio.Reader,
	line_count: int, // current line being read in the CSV file
	raw_buffer: [dynamic]byte,    // scratch space for lines longer than the bufio buffer
	record_buffer: [dynamic]byte, // concatenated bytes of every field in the current record
	field_indices: [dynamic]int,  // end offsets into record_buffer, one per field
	last_record: [dynamic]string, // backing storage reused when reuse_record is true
	sr: strings.Reader, // used by reader_init_with_string
}


// Parser_Error_Kind enumerates the classes of parse failure read can report
Parser_Error_Kind :: enum {
	Bare_Quote,
	Quote,
	Field_Count,
	Invalid_Delim,
}

// parser_error_kind_string maps each Parser_Error_Kind to a human-readable message
parser_error_kind_string := [Parser_Error_Kind]string{
	.Bare_Quote = "bare \" in non-quoted field",
	.Quote = "extra or missing \" in quoted field",
	.Field_Count = "wrong field count",
	.Invalid_Delim = "invalid delimiter",
};

// Parser_Error describes a parse failure, including where in the input it occurred
Parser_Error :: struct {
	kind: Parser_Error_Kind,
	start_line: int, // line on which the failing record began
	line: int,       // line on which the failure was detected
	column: int,     // rune (not byte) offset of the failure within the line
	expected, got: int, // used by .Field_Count
}

// Error is either a parse error or an underlying io error; nil on success
Error :: union {
	Parser_Error,
	io.Error,
}

DEFAULT_RECORD_BUFFER_CAPACITY :: 256;

// reader_init initializes a new Reader from r
reader_init :: proc(reader: ^Reader, r: io.Reader, buffer_allocator := context.allocator) {
	reader.comma = ',';

	// all internal buffers are tied to buffer_allocator; reader_destroy frees them
	context.allocator = buffer_allocator;
	reserve(&reader.record_buffer, DEFAULT_RECORD_BUFFER_CAPACITY);
	reserve(&reader.raw_buffer, 0);
	reserve(&reader.field_indices, 0);
	reserve(&reader.last_record, 0);
	bufio.reader_init(&reader.r, r);
}


// reader_init_with_string initializes a new Reader from s
reader_init_with_string :: proc(reader: ^Reader, s: string, buffer_allocator := context.allocator) {
	// reader.sr must outlive the Reader's use, which is why it lives in the struct
	strings.reader_init(&reader.sr, s);
	r, _ := io.to_reader(strings.reader_to_stream(&reader.sr));
	reader_init(reader, r, buffer_allocator);
}

// reader_destroy destroys a Reader
reader_destroy :: proc(r: ^Reader) {
	delete(r.raw_buffer);
	delete(r.record_buffer);
	delete(r.field_indices);
	delete(r.last_record);
	bufio.reader_destroy(&r.r);
}

// read reads a single record (a slice of fields) from r
//
// All \r\n sequences are normalized to \n, including multi-line field
read :: proc(r: ^Reader, allocator := context.allocator) -> (record: []string, err: Error) {
	if r.reuse_record {
		// keep the field slices in r.last_record so the next call can reuse the storage
		record, err = _read_record(r, &r.last_record, allocator);
		resize(&r.last_record, len(record));
		copy(r.last_record[:], record);
	} else {
		record, err = _read_record(r, nil, allocator);
	}
	return;
}

// is_io_error checks whether an Error is a specific io.Error kind
is_io_error :: proc(err: Error, io_err: io.Error) -> bool {
	if v, ok := err.(io.Error); ok {
		return v == io_err;
	}
	return false;
}


// read_all reads all the remaining records from r.
// Each record is a slice of fields.
// read_all is defined to read until an EOF, and does not treat EOF as an error
read_all :: proc(r: ^Reader, allocator := context.allocator) -> ([][]string, Error) {
	context.allocator = allocator;
	records: [dynamic][]string;
	for {
		record, rerr := _read_record(r, nil, allocator);
		if is_io_error(rerr, .EOF) {
			// EOF terminates the loop normally; records read so far are returned
			return records[:], nil;
		}
		if rerr != nil {
			return nil, rerr;
		}
		append(&records, record);
	}
}

// read_from_string reads a single record (a slice of fields) from the provided input.
// n reports how many bytes of input were consumed.
read_from_string :: proc(input: string, record_allocator := context.allocator, buffer_allocator := context.allocator) -> (record: []string, n: int, err: Error) {
	ir: strings.Reader;
	strings.reader_init(&ir, input);
	input_reader, _ := io.to_reader(strings.reader_to_stream(&ir));

	r: Reader;
	reader_init(&r, input_reader, buffer_allocator);
	defer reader_destroy(&r);
	record, err = read(&r, record_allocator);
	n = int(r.r.r); // bufio read position = bytes consumed from input
	return;
}


// read_all_from_string reads all the remaining records from the provided input.
read_all_from_string :: proc(input: string, records_allocator := context.allocator, buffer_allocator := context.allocator) -> ([][]string, Error) {
	ir: strings.Reader;
	strings.reader_init(&ir, input);
	input_reader, _ := io.to_reader(strings.reader_to_stream(&ir));

	r: Reader;
	reader_init(&r, input_reader, buffer_allocator);
	defer reader_destroy(&r);
	return read_all(&r, records_allocator);
}


// _read_record parses one record from r into a slice of field strings.
// If dst is non-nil its backing storage is reused for the result; otherwise a
// fresh slice is allocated from 'allocator'.
@private
_read_record :: proc(r: ^Reader, dst: ^[dynamic]string, allocator := context.allocator) -> ([]string, Error) {
	// read_line returns the next raw line (including any trailing '\n'),
	// spilling into r.raw_buffer when the line exceeds the bufio buffer
	read_line :: proc(r: ^Reader) -> ([]byte, io.Error) {
		line, err := bufio.reader_read_slice(&r.r, '\n');
		if err == .Buffer_Full {
			// line is longer than the bufio buffer: accumulate chunks in raw_buffer
			clear(&r.raw_buffer);
			append(&r.raw_buffer, ..line);
			for err == .Buffer_Full {
				line, err = bufio.reader_read_slice(&r.r, '\n');
				append(&r.raw_buffer, ..line);
			}
			line = r.raw_buffer[:];
		}
		if len(line) > 0 && err == .EOF {
			// treat a final, unterminated line as valid data rather than an error
			err = nil;
			if line[len(line)-1] == '\r' {
				line = line[:len(line)-1];
			}
		}
		r.line_count += 1;

		// normalize \r\n to \n
		n := len(line);
		for n >= 2 && string(line[n-2:]) == "\r\n" {
			line[n-2] = '\n';
			line = line[:n-1];
		}

		return line, err;
	}

	// is_valid_delim reports whether r may be used as a comma/comment rune
	is_valid_delim :: proc(r: rune) -> bool {
		switch r {
		case 0, '"', '\r', '\n', utf8.RUNE_ERROR:
			return false;
		}
		return utf8.valid_rune(r);
	}

	// length_newline is 1 if b ends in '\n', otherwise 0
	length_newline :: proc(b: []byte) -> int {
		if len(b) > 0 && b[len(b)-1] == '\n' {
			return 1;
		}
		return 0;
	}

	// next_rune decodes the first rune of b (RUNE_ERROR on empty/invalid input)
	next_rune :: proc(b: []byte) -> rune {
		r, _ := utf8.decode_rune(b);
		return r;
	}

	// validate the configured delimiters before touching the input
	if r.comma == r.comment ||
	   !is_valid_delim(r.comma) ||
	   (r.comment != 0 && !is_valid_delim(r.comment)) {
		err := Parser_Error{
			kind = .Invalid_Delim,
			line = r.line_count,
		};
		return nil, err;
	}

	// skip comment lines and blank lines until a record line (or read error)
	line, full_line: []byte;
	err_read: io.Error;
	for err_read == nil {
		line, err_read = read_line(r);
		if r.comment != 0 && next_rune(line) == r.comment {
			line = nil;
			continue;
		}
		if err_read == nil && len(line) == length_newline(line) {
			line = nil; // blank line: not a record
			continue;
		}
		full_line = line;
		break;
	}

	if is_io_error(err_read, .EOF) {
		return nil, err_read;
	}

	err: Error;
	quote_len :: len(`"`);
	comma_len := utf8.rune_size(r.comma);
	record_line := r.line_count; // remembered for error reporting (records may span lines)
	clear(&r.record_buffer);
	clear(&r.field_indices);

	// consume fields one at a time; quoted fields may pull in additional lines
	parse_field: for {
		if r.trim_leading_space {
			line = bytes.trim_left_space(line);
		}
		if len(line) == 0 || line[0] != '"' {
			// unquoted field: runs to the next comma or end of line
			i := bytes.index_rune(line, r.comma);
			field := line;
			if i >= 0 {
				field = field[:i];
			} else {
				field = field[:len(field) - length_newline(field)];
			}

			if !r.lazy_quotes {
				// a '"' inside an unquoted field is an error unless lazy_quotes is set
				if j := bytes.index_byte(field, '"'); j >= 0 {
					column := utf8.rune_count(full_line[:len(full_line) - len(line[j:])]);
					err = Parser_Error{
						kind = .Bare_Quote,
						start_line = record_line,
						line = r.line_count,
						column = column,
					};
					break parse_field;
				}
			}
			append(&r.record_buffer, ..field);
			append(&r.field_indices, len(r.record_buffer));
			if i >= 0 {
				line = line[i+comma_len:];
				continue parse_field;
			}
			break parse_field;

		} else {
			// quoted field: scan for the closing quote, handling "" escapes
			line = line[quote_len:];
			for {
				i := bytes.index_byte(line, '"');
				switch {
				case i >= 0:
					// copy everything up to the quote, then decide what the quote means
					append(&r.record_buffer, ..line[:i]);
					line = line[i+quote_len:];
					switch ch := next_rune(line); {
					case ch == '"': // append quote
						append(&r.record_buffer, '"');
						line = line[quote_len:];
					case ch == r.comma: // end of field
						line = line[comma_len:];
						append(&r.field_indices, len(r.record_buffer));
						continue parse_field;
					case length_newline(line) == len(line): // end of line
						append(&r.field_indices, len(r.record_buffer));
						break parse_field;
					case r.lazy_quotes: // bare quote
						append(&r.record_buffer, '"');
					case: // invalid non-escaped quote
						column := utf8.rune_count(full_line[:len(full_line) - len(line) - quote_len]);
						err = Parser_Error{
							kind = .Quote,
							start_line = record_line,
							line = r.line_count,
							column = column,
						};
						break parse_field;
					}

				case len(line) > 0:
					// no closing quote on this line: the field continues on the next line
					append(&r.record_buffer, ..line);
					if err_read != nil {
						break parse_field;
					}
					line, err_read = read_line(r);
					if is_io_error(err_read, .EOF) {
						err_read = nil;
					}
					full_line = line;

				case:
					// ran out of input inside a quoted field
					if !r.lazy_quotes && err_read == nil {
						column := utf8.rune_count(full_line);
						err = Parser_Error{
							kind = .Quote,
							start_line = record_line,
							line = r.line_count,
							column = column,
						};
						break parse_field;
					}
					append(&r.field_indices, len(r.record_buffer));
					break parse_field;
				}
			}
		}
	}

	if err == nil && err_read != nil {
		err = err_read;
	}

	// slice record_buffer into field strings using the recorded end indices
	context.allocator = allocator;
	dst := dst;
	str := string(r.record_buffer[:]);
	if dst == nil {
		// use local variable
		dst = &([dynamic]string){};
	}
	clear(dst);
	resize(dst, len(r.field_indices));
	pre_idx: int;
	for idx, i in r.field_indices {
		field := str[pre_idx:idx];
		if !r.reuse_record_buffer {
			// clone so the returned strings survive the next record's parse
			field = strings.clone(field);
		}
		dst[i] = field;
		pre_idx = idx;
	}

	// enforce (or learn) the expected field count per record
	if r.fields_per_record > 0 {
		if len(dst) != r.fields_per_record && err == nil {
			err = Parser_Error{
				kind = .Field_Count,
				start_line = record_line,
				line = r.line_count,
				expected = r.fields_per_record,
				got = len(dst),
			};
		}
	} else if r.fields_per_record == 0 {
		r.fields_per_record = len(dst);
	}
	return dst[:], err;

}