diff options
| author | gingerBill <bill@gingerbill.org> | 2019-01-06 20:37:12 +0000 |
|---|---|---|
| committer | gingerBill <bill@gingerbill.org> | 2019-01-06 20:37:12 +0000 |
| commit | c5def602241aba33a4c111edc22577d9490c8ef2 | (patch) |
| tree | 829b9fd3882da7d9f2b90cbe853fd97da2a2508d | /core/encoding |
| parent | 6e6a05382318658bbab8040f6c4ae66bc9f1044c | (diff) |
Begin work on `package json`
Diffstat (limited to 'core/encoding')
| -rw-r--r-- | core/encoding/json/parser.odin | 323 |
| -rw-r--r-- | core/encoding/json/tokenizer.odin | 322 |
| -rw-r--r-- | core/encoding/json/types.odin | 70 |
3 files changed, 715 insertions, 0 deletions
diff --git a/core/encoding/json/parser.odin b/core/encoding/json/parser.odin new file mode 100644 index 000000000..2b66a1558 --- /dev/null +++ b/core/encoding/json/parser.odin @@ -0,0 +1,323 @@ +package json + +import "core:mem" +import "core:unicode/utf8" +import "core:strconv" +import "core:strings" + +Parser :: struct { + tok: Tokenizer, + curr_token: Token, + allocator: mem.Allocator, +} + +make_parser :: proc(data: string, allocator := context.allocator) -> Parser { + p: Parser; + p.tok = make_tokenizer(data); + p.allocator = allocator; + assert(p.allocator.procedure != nil); + advance_token(&p); + return p; +} + +parse :: proc(data: string, allocator := context.allocator) -> (Value, Error) { + p := make_parser(data, allocator); + return parse_object(&p); +} + +advance_token :: proc(p: ^Parser) -> (Token, Error) { + err: Error; + prev := p.curr_token; + p.curr_token, err = get_token(&p.tok); + return prev, err; +} + + +allow_token :: proc(p: ^Parser, kind: Kind) -> bool { + if p.curr_token.kind == kind { + advance_token(p); + return true; + } + return false; +} + +expect_token :: proc(p: ^Parser, kind: Kind) -> Error { + prev := p.curr_token; + advance_token(p); + if prev.kind == kind { + return Error.None; + } + return Error.Unexpected_Token; +} + + + +parse_value :: proc(p: ^Parser) -> (value: Value, err: Error) { + value.pos = p.curr_token.pos; + token := p.curr_token; + switch token.kind { + case Kind.Null: + value.value = Null{}; + advance_token(p); + return; + case Kind.False: + value.value = Boolean(false); + advance_token(p); + return; + case Kind.True: + value.value = Boolean(true); + advance_token(p); + return; + + case Kind.Integer: + value.value = Integer(strconv.parse_i64(token.text)); + advance_token(p); + return; + case Kind.Float: + value.value = Float(strconv.parse_f64(token.text)); + advance_token(p); + return; + case Kind.String: + value.value = String(unquote_string(token, p.allocator)); + advance_token(p); + return; + + case 
Kind.Open_Brace: + return parse_object(p); + + case Kind.Open_Bracket: + return parse_array(p); + } + + err = Error.Unexpected_Token; + advance_token(p); + return; +} + +parse_array :: proc(p: ^Parser) -> (value: Value, err: Error) { + value.pos = p.curr_token.pos; + if err = expect_token(p, Kind.Open_Bracket); err != Error.None { + return; + } + + array: Array; + array.allocator = p.allocator; + defer if err != Error.None { + for elem in array { + destroy_value(elem); + } + delete(array); + } + + for p.curr_token.kind != Kind.Close_Bracket { + elem, elem_err := parse_value(p); + if elem_err != Error.None { + err = elem_err; + return; + } + append(&array, elem); + + // Disallow trailing commas for the time being + if allow_token(p, Kind.Comma) { + continue; + } else { + break; + } + } + + if err = expect_token(p, Kind.Close_Bracket); err != Error.None { + return; + } + + value.value = array; + return; +} + + +parse_object :: proc(p: ^Parser) -> (value: Value, err: Error) { + value.pos = p.curr_token.pos; + if err = expect_token(p, Kind.Open_Brace); err != Error.None { + value.pos = p.curr_token.pos; + return; + } + + obj: Object; + obj.allocator = p.allocator; + defer if err != Error.None { + for key, elem in obj { + delete(key); + destroy_value(elem); + } + delete(obj); + } + + for p.curr_token.kind != Kind.Close_Brace { + tok := p.curr_token; + if tok_err := expect_token(p, Kind.String); tok_err != Error.None { + err = Error.Expected_String_For_Object_Key; + value.pos = p.curr_token.pos; + return; + } + key := unquote_string(tok, p.allocator); + + if colon_err := expect_token(p, Kind.Colon); colon_err != Error.None { + err = Error.Expected_Colon_After_Key; + value.pos = p.curr_token.pos; + return; + } + + elem, elem_err := parse_value(p); + if elem_err != Error.None { + err = elem_err; + value.pos = p.curr_token.pos; + return; + } + + if key in obj { + err = Error.Duplicate_Object_Key; + value.pos = p.curr_token.pos; + delete(key); + return; + } + + obj[key] = 
elem; + + // Disallow trailing commas for the time being + if allow_token(p, Kind.Comma) { + continue; + } else { + break; + } + } + + if err = expect_token(p, Kind.Close_Brace); err != Error.None { + value.pos = p.curr_token.pos; + return; + } + + value.value = obj; + return; +} + + +// IMPORTANT NOTE(bill): unquote_string assumes a mostly valid string +unquote_string :: proc(token: Token, allocator := context.allocator) -> string { + get_u4_rune :: proc(s: string) -> rune { + if len(s) < 6 || s[0] != '\\' || s[1] != 'u' { + return -1; + } + + r: rune; + for c in s[2:6] { + x: rune; + switch c { + case '0'..'9': x = c - '0'; + case 'a'..'f': x = c - 'a' + 10; + case 'A'..'F': x = c - 'A' + 10; + case: return -1; + } + r = r*16 + x; + } + return r; + } + + if token.kind != Kind.String { + return ""; + } + s := token.text; + if len(s) <= 2 { + return ""; + } + s = s[1:len(s)-1]; + + i := 0; + for i < len(s) { + c := s[i]; + if c == '\\' || c == '"' || c < ' ' { + break; + } + if c < utf8.RUNE_SELF { + i += 1; + continue; + } + r, w := utf8.decode_rune_in_string(s); + if r == utf8.RUNE_ERROR && w == 1 { + break; + } + i += w; + } + if i == len(s) { + return strings.new_string(s, allocator); + } + + b := make([]byte, len(s) + 2*utf8.UTF_MAX, allocator); + w := copy(b, cast([]byte)s[0:i]); + loop: for i < len(s) { + c := s[i]; + switch { + case c == '\\': + i += 1; + if i >= len(s) { + break loop; + } + switch s[i] { + case: break loop; + case '"', '\'', '\\', '/': + b[w] = s[i]; + i += 1; + w += 1; + + case 'b': + b[w] = '\b'; + i += 1; + w += 1; + case 'f': + b[w] = '\f'; + i += 1; + w += 1; + case 'r': + b[w] = '\r'; + i += 1; + w += 1; + case 't': + b[w] = '\t'; + i += 1; + w += 1; + case 'n': + b[w] = '\n'; + i += 1; + w += 1; + case 'u': + i -= 1; // Include the \u in the check for sanity sake + r := get_u4_rune(s[i:]); + if r < 0 { + break loop; + } + i += 6; + + buf, buf_width := utf8.encode_rune(r); + copy(b[w:], buf[:buf_width]); + w += buf_width; + } + + 
case c == '"', c < ' ': + break loop; + + case c < utf8.RUNE_SELF: + b[w] = c; + i += 1; + w += 1; + + case: + r, width := utf8.decode_rune_in_string(s[i:]); + i += width; + + buf, buf_width := utf8.encode_rune(r); + assert(buf_width <= width); + copy(b[w:], buf[:buf_width]); + w += buf_width; + } + } + + return string(b[:w]); +} diff --git a/core/encoding/json/tokenizer.odin b/core/encoding/json/tokenizer.odin new file mode 100644 index 000000000..dfa20a6a7 --- /dev/null +++ b/core/encoding/json/tokenizer.odin @@ -0,0 +1,322 @@ +package json + +import "core:unicode/utf8" + +Token :: struct { + using pos: Pos, + kind: Kind, + text: string, +} + +Kind :: enum { + Invalid, + + Null, + False, + True, + + Ident, + + Integer, + Float, + String, + + Colon, + Comma, + + Open_Brace, + Close_Brace, + + Open_Bracket, + Close_Bracket, +} + +Tokenizer :: struct { + using pos: Pos, + data: string, + r: rune, // current rune + w: int, // current rune width in bytes + curr_line_offset: int, +} + + + +make_tokenizer :: proc(data: string) -> Tokenizer { + t := Tokenizer{pos = {line=1}, data = data}; + next_rune(&t); + return t; +} + +next_rune :: proc(t: ^Tokenizer) -> rune #no_bounds_check { + if t.offset >= len(t.data) { + return utf8.RUNE_EOF; + } + t.offset += t.w; + t.r, t.w = utf8.decode_rune_in_string(t.data[t.offset:]); + t.pos.column = t.offset - t.curr_line_offset; + return t.r; +} + + +get_token :: proc(t: ^Tokenizer) -> (token: Token, err: Error) { + skip_digits :: proc(t: ^Tokenizer) { + for t.offset < len(t.data) { + next_rune(t); + if '0' <= t.r && t.r <= '9' { + // Okay + } else { + return; + } + } + } + + scan_espace :: proc(t: ^Tokenizer) -> bool { + switch t.r { + case '"', '\'', '\\', '/', 'b', 'n', 'r', 't', 'f': + next_rune(t); + return true; + case 'u': + // Expect 4 hexadecimal digits + for i := 0; i < 4; i += 1 { + r := next_rune(t); + switch r { + case '0'..'9', 'a'..'f', 'A'..'F': + // Okay + case: + return false; + } + } + case: + // Ignore the next rune 
regardless + next_rune(t); + } + return false; + } + + skip_whitespace :: proc(t: ^Tokenizer) -> rune { + loop: for t.offset < len(t.data) { + switch t.r { + case ' ', '\t', '\v', '\f', '\r': + next_rune(t); + case '\n': + t.line += 1; + t.curr_line_offset = t.offset; + t.pos.column = 1; + next_rune(t); + case: + break loop; + } + } + return t.r; + } + + skip_whitespace(t); + + token.pos = t.pos; + token.kind = Kind.Invalid; + + curr_rune := t.r; + next_rune(t); + + switch curr_rune { + case utf8.RUNE_ERROR: + err = Error.Illegal_Character; + case utf8.RUNE_EOF, '\x00': + err = Error.EOF; + + case 'A'..'Z', 'a'..'z', '_': + token.kind = Kind.Ident; + + for t.offset < len(t.data) { + switch next_rune(t) { + case 'A'..'Z', 'a'..'z', '0'..'9', '_': + continue; + } + + break; + } + + switch str := t.data[token.offset:t.offset]; str { + case "null": token.kind = Kind.Null; + case "false": token.kind = Kind.False; + case "true": token.kind = Kind.True; + } + + case '-': + switch t.r { + case '0'..'9': + // Okay + case: + // Illegal use of +/- + err = Error.Illegal_Character; + break; + } + fallthrough; + + case '0'..'9': + token.kind = Kind.Integer; + + skip_digits(t); + if t.r == '.' 
{ + token.kind = Kind.Float; + next_rune(t); + skip_digits(t); + } + if t.r == 'e' || t.r == 'E' { + switch r := next_rune(t); r { + case '+', '-': + next_rune(t); + } + skip_digits(t); + } + + str := t.data[token.offset:t.offset]; + if !is_valid_number(str) { + err = Error.Invalid_Number; + } + + + case '"': + token.kind = Kind.String; + quote := curr_rune; + for t.offset < len(t.data) { + r := t.r; + if r == '\n' || r < 0 { + err = Error.String_Not_Terminated; + break; + } + next_rune(t); + if r == quote { + break; + } + if r == '\\' { + scan_espace(t); + } + } + + if !is_valid_string_literal(t.data[token.offset : t.offset]) { + err = Error.Invalid_String; + } + + case ',': token.kind = Kind.Comma; + case ':': token.kind = Kind.Colon; + case '{': token.kind = Kind.Open_Brace; + case '}': token.kind = Kind.Close_Brace; + case '[': token.kind = Kind.Open_Bracket; + case ']': token.kind = Kind.Close_Bracket; + + case: err = Error.Illegal_Character; + } + + token.text = t.data[token.offset : t.offset]; + + return; +} + + + +is_valid_number :: proc(s: string) -> bool { + if s == "" { + return false; + } + + if s[0] == '-' { + s = s[1:]; + if s == "" { + return false; + } + } + + switch s[0] { + case '0': + s = s[1:]; + case '1'..'9': + s = s[1:]; + for len(s) > 0 && '0' <= s[0] && s[0] <= '9' do s = s[1:]; + case: + return false; + } + + + if len(s) >= 2 && s[0] == '.' 
&& '0' <= s[1] && s[1] <= '9' { + s = s[2:]; + for len(s) > 0 && '0' <= s[0] && s[0] <= '9' do s = s[1:]; + } + + if len(s) >= 2 && (s[0] == 'e' || s[0] == 'E') { + s = s[1:]; + switch s[0] { + case '+', '-': + s = s[1:]; + if s == "" { + return false; + } + } + for len(s) > 0 && '0' <= s[0] && s[0] <= '9' do s = s[1:]; + } + + // The string should be empty now to be valid + return s == ""; +} + +is_valid_string_literal :: proc(s: string) -> bool { + if len(s) < 2 || s[0] != '"' || s[len(s)-1] != '"' { + return false; + } + s = s[1 : len(s)-1]; + + i := 0; + for i < len(s) { + c := s[i]; + switch { + case c == '\\': + i += 1; + if i >= len(s) { + return false; + } + switch s[i] { + case '"', '\'', '\\', '/', 'b', 'n', 'r', 't', 'f': + i += 1; + case 'u': + if i >= len(s) { + return false; + } + hex := s[i+1:]; + if len(hex) < 4 { + return false; + } + hex = hex[:4]; + i += 5; + + for j := 0; j < 4; j += 1 { + c := hex[j]; + switch c { + case '0'..'9', 'a'..'z', 'A'..'Z': + // Okay + case: + return false; + } + } + + case: return false; + } + + case c == '"', c < ' ': + return false; + + case c < utf8.RUNE_SELF: + i += 1; + + case: + r, width := utf8.decode_rune_in_string(s[i:]); + if r == utf8.RUNE_ERROR && width == 1 { + return false; + } + i += width; + } + } + if i == len(s) { + return true; + } + return true; +} diff --git a/core/encoding/json/types.odin b/core/encoding/json/types.odin new file mode 100644 index 000000000..d8a10b801 --- /dev/null +++ b/core/encoding/json/types.odin @@ -0,0 +1,70 @@ +package json + +import "core:strconv" + +Null :: distinct rawptr; +Integer :: i64; +Float :: f64; +Boolean :: bool; +String :: string; +Array :: distinct [dynamic]Value; +Object :: distinct map[string]Value; + +Value :: struct { + pos: Pos, + value: union { + Null, + Integer, + Float, + Boolean, + String, + Array, + Object, + } +} + +Pos :: struct { + offset: int, + line: int, + column: int, +} + + +Error :: enum { + None, + + EOF, // Not necessarily an error + + // 
Tokenizing Errors + Illegal_Character, + Invalid_Number, + String_Not_Terminated, + Invalid_String, + + + // Parsing Errors + Unexpected_Token, + Expected_String_For_Object_Key, + Duplicate_Object_Key, + Expected_Colon_After_Key, +} + + + + +destroy_value :: proc(value: Value) { + switch v in value.value { + case Object: + for key, elem in v { + delete(key); + destroy_value(elem); + } + delete(v); + case Array: + for elem in v do destroy_value(elem); + delete(v); + case String: + delete(v); + } +} + |