package json; JSON5 support

author: gingerBill <bill@gingerbill.org> 2019-01-06 21:48:13 +0000
committer: gingerBill <bill@gingerbill.org> 2019-01-06 21:48:13 +0000
commit: d1b9f3ac74df5533f1857a26831419aeb560fd2f (patch)
tree: ce7b77ec1fbd83516e0a49d4f4ccc82422f6ab87 /core/encoding
parent: d732a5158761578d14e69daf3a94fad9f0a8c23c (diff)
4 files changed, 319 insertions, 48 deletions
diff --git a/core/encoding/json/parser.odin b/core/encoding/json/parser.odin
index 3ef8cee93..2c7d79465 100644
--- a/core/encoding/json/parser.odin
+++ b/core/encoding/json/parser.odin
@@ -7,20 +7,27 @@ import "core:strconv"
 Parser :: struct {
 	tok:        Tokenizer,
 	curr_token: Token,
+	spec:       Specification, // grammar this parser accepts; assigned in make_parser
 	allocator:  mem.Allocator,
 }
 
-make_parser :: proc(data: string, allocator := context.allocator) -> Parser {
+make_parser :: proc(data: string, spec := Specification.JSON, allocator := context.allocator) -> Parser { // Build a Parser over `data`; `spec` selects the accepted grammar and defaults to Specification.JSON.
 	p: Parser;
-	p.tok = make_tokenizer(data);
+	p.tok = make_tokenizer(data, spec); // the tokenizer is created with the same spec as the parser
+	p.spec = spec; // kept on the parser so parse/validate procedures can branch on it
 	p.allocator = allocator;
 	assert(p.allocator.procedure != nil);
 	advance_token(&p);
 	return p;
 }
 
-parse :: proc(data: string, allocator := context.allocator) -> (Value, Error) {
-	p := make_parser(data, allocator);
+parse :: proc(data: string, spec := Specification.JSON, allocator := context.allocator) -> (Value, Error) { // Parse `data` under `spec` (default Specification.JSON) and return the resulting Value and Error.
+	context.allocator = allocator; // callees that default to context.allocator now receive `allocator`
+	p := make_parser(data, spec, allocator);
+
+	if p.spec == Specification.JSON5 { // JSON5 input is parsed with parse_value; any other spec goes through parse_object below
+		return parse_value(&p);
+	}
 	return parse_object(&p);
 }
 
@@ -77,7 +84,7 @@ parse_value :: proc(p: ^Parser) -> (value: Value, err: Error) {
 		advance_token(p);
 		return;
 	case Kind.String:
-		value.value = String(unquote_string(token, p.allocator));
+		value.value = String(unquote_string(token, p.spec, p.allocator));
 		advance_token(p);
 		return;
 
@@ -132,6 +139,34 @@ parse_array :: proc(p: ^Parser) -> (value: Value, err: Error) {
 	return;
 }
 
+clone_string :: proc(s: string, allocator: mem.Allocator) -> string { // Copy `s` into a new buffer obtained from `allocator` and return the copy.
+	n := len(s);
+	b := make([]byte, n+1, allocator); // one byte more than the string contents
+	copy(b, cast([]byte)s);
+	b[n] = 0; // the extra byte is set to zero, directly after the copied contents
+	return string(b[:n]); // the returned string has length n, so it excludes the trailing zero byte
+}
+
+parse_object_key :: proc(p: ^Parser) -> (key: string, err: Error) { // Consume the current token as an object key and return its text in memory from p.allocator.
+	tok := p.curr_token; // captured before expect_token is called, since the text is read from `tok` afterwards
+	if p.spec == Specification.JSON5 { // JSON5 accepts either a string or an identifier as a key
+		if tok.kind == Kind.String {
+			expect_token(p, Kind.String); // result ignored; the kind was checked on the line above
+			key = unquote_string(tok, p.spec, p.allocator);
+			return;
+		} else if tok.kind == Kind.Ident {
+			expect_token(p, Kind.Ident); // result ignored; the kind was checked on the line above
+			key = clone_string(tok.text, p.allocator); // identifier text is copied as-is, with no unquoting
+			return;
+		}
+	}
+	if tok_err := expect_token(p, Kind.String); tok_err != Error.None { // reached for non-JSON5 specs, and for JSON5 tokens that are neither String nor Ident
+		err = Error.Expected_String_For_Object_Key; // any expect_token failure is reported as this single error
+		return;
+	}
+	key = unquote_string(tok, p.spec, p.allocator);
+	return;
+}
 
 parse_object :: proc(p: ^Parser) -> (value: Value, err: Error) {
 	value.pos = p.curr_token.pos;
@@ -144,20 +179,20 @@ parse_object :: proc(p: ^Parser) -> (value: Value, err: Error) {
 	obj.allocator = p.allocator;
 	defer if err != Error.None {
 		for key, elem in obj {
-			delete(key);
+			delete(key, p.allocator);
 			destroy_value(elem);
 		}
 		delete(obj);
 	}
 
 	for p.curr_token.kind != Kind.Close_Brace {
-		tok := p.curr_token;
-		if tok_err := expect_token(p, Kind.String); tok_err != Error.None {
-			err = Error.Expected_String_For_Object_Key;
+		key: string;
+		key, err = parse_object_key(p);
+		if err != Error.None {
+			delete(key, p.allocator);
 			value.pos = p.curr_token.pos;
 			return;
 		}
-		key := unquote_string(tok, p.allocator);
 
 		if colon_err := expect_token(p, Kind.Colon); colon_err != Error.None {
 			err = Error.Expected_Colon_After_Key;
@@ -175,17 +210,24 @@ parse_object :: proc(p: ^Parser) -> (value: Value, err: Error) {
 		if key in obj {
 			err = Error.Duplicate_Object_Key;
 			value.pos = p.curr_token.pos;
-			delete(key);
+			delete(key, p.allocator);
 			return;
 		}
 
 		obj[key] = elem;
 
-		// Disallow trailing commas for the time being
-		if allow_token(p, Kind.Comma) {
-			continue;
+		if p.spec == Specification.JSON5 {
+			// Allow trailing commas
+			if allow_token(p, Kind.Comma) {
+				continue;
+			}
 		} else {
-			break;
+			// Disallow trailing commas
+			if allow_token(p, Kind.Comma) {
+				continue;
+			} else {
+				break;
+			}
 		}
 	}
 
@@ -200,7 +242,25 @@ parse_object :: proc(p: ^Parser) -> (value: Value, err: Error) {
 
 
 // IMPORTANT NOTE(bill): unquote_string assumes a mostly valid string
-unquote_string :: proc(token: Token, allocator := context.allocator) -> string {
+unquote_string :: proc(token: Token, spec: Specification, allocator := context.allocator) -> string {
+	get_u2_rune :: proc(s: string) -> rune { // Decode a `\xHH` escape at the start of `s`; returns -1 when it is malformed.
+		if len(s) < 4 || s[0] != '\\' || s[1] != 'x' { // needs a backslash, 'x' and two more bytes
+			return -1;
+		}
+
+		r: rune;
+		for c in s[2:4] { // the two characters following `\x`
+			x: rune;
+			switch c {
+			case '0'..'9': x = c - '0';
+			case 'a'..'f': x = c - 'a' + 10;
+			case 'A'..'F': x = c - 'A' + 10;
+			case: return -1; // not a hexadecimal digit
+			}
+			r = r*16 + x; // accumulate the value in base 16
+		}
+		return r;
+	}
 	get_u4_rune :: proc(s: string) -> rune {
 		if len(s) < 6 || s[0] != '\\' || s[1] != 'u' {
 			return -1;
@@ -227,12 +287,17 @@ unquote_string :: proc(token: Token, allocator := context.allocator) -> string {
 	if len(s) <= 2 {
 		return "";
 	}
+	quote := s[0];
+	if s[0] != s[len(s)-1] {
+		// Invalid string
+		return "";
+	}
 	s = s[1:len(s)-1];
 
 	i := 0;
 	for i < len(s) {
 		c := s[i];
-		if c == '\\' || c == '"' || c < ' ' {
+		if c == '\\' || c == quote || c < ' ' {
 			break;
 		}
 		if c < utf8.RUNE_SELF {
@@ -246,9 +311,7 @@ unquote_string :: proc(token: Token, allocator := context.allocator) -> string {
 		i += w;
 	}
 	if i == len(s) {
-		b := make([]byte, len(s), allocator);
-		copy(b, cast([]byte)s);
-		return string(b);
+		return clone_string(s, allocator);
 	}
 
 	b := make([]byte, len(s) + 2*utf8.UTF_MAX, allocator);
@@ -299,9 +362,43 @@ unquote_string :: proc(token: Token, allocator := context.allocator) -> string {
 				buf, buf_width := utf8.encode_rune(r);
 				copy(b[w:], buf[:buf_width]);
 				w += buf_width;
+
+
+			case '0':
+				if spec == Specification.JSON5 {
+					b[w] = '\x00';
+					i += 1;
+					w += 1;
+				} else {
+					break loop;
+				}
+			case 'v':
+				if spec == Specification.JSON5 {
+					b[w] = '\v';
+					i += 1;
+					w += 1;
+				} else {
+					break loop;
+				}
+
+			case 'x':
+				if spec == Specification.JSON5 {
+					i -= 1; // Include the \x in the check for sanity sake
+					r := get_u2_rune(s[i:]);
+					if r < 0 {
+						break loop;
+					}
+					i += 4;
+
+					buf, buf_width := utf8.encode_rune(r);
+					copy(b[w:], buf[:buf_width]);
+					w += buf_width;
+				} else {
+					break loop;
+				}
 			}
 
-		case c == '"', c < ' ':
+		case c == quote, c < ' ':
 			break loop;
 
 		case c < utf8.RUNE_SELF:
diff --git a/core/encoding/json/tokenizer.odin b/core/encoding/json/tokenizer.odin
index dfa20a6a7..3cada4b45 100644
--- a/core/encoding/json/tokenizer.odin
+++ b/core/encoding/json/tokenizer.odin
@@ -15,6 +15,9 @@ Kind :: enum {
 	False,
 	True,
 
+	Infinity,
+	NaN,
+
 	Ident,
 
 	Integer,
@@ -37,13 +40,17 @@ Tokenizer :: struct {
 	r: rune, // current rune
 	w: int,  // current rune width in bytes
 	curr_line_offset: int,
+	spec: Specification,
 }
 
 
 
-make_tokenizer :: proc(data: string) -> Tokenizer {
-	t := Tokenizer{pos = {line=1}, data = data};
+make_tokenizer :: proc(data: string, spec := Specification.JSON) -> Tokenizer { // Create a Tokenizer over `data`; `spec` defaults to Specification.JSON.
+	t := Tokenizer{pos = {line=1}, data = data, spec = spec};
 	next_rune(&t);
+	if t.r == utf8.RUNE_BOM { // if the first rune read is a byte order mark, advance past it
+		next_rune(&t);
+	}
 	return t;
 }
 
@@ -69,6 +76,17 @@ get_token :: proc(t: ^Tokenizer) -> (token: Token, err: Error) {
 			}
 		}
 	}
+	skip_hex_digits :: proc(t: ^Tokenizer) { // Advance the tokenizer until t.r is not a hexadecimal digit or the loop bound on t.offset is reached.
+		for t.offset < len(t.data) {
+			next_rune(t); // advances before testing, so the rune that is current on entry is not itself checked
+			switch t.r {
+			case '0'..'9', 'a'..'f', 'A'..'F':
+				// Okay
+			case:
+				return;
+			}
+		}
+	}
 
 	scan_espace :: proc(t: ^Tokenizer) -> bool {
 		switch t.r {
@@ -104,12 +122,39 @@ get_token :: proc(t: ^Tokenizer) -> (token: Token, err: Error) {
 				t.pos.column = 1;
 				next_rune(t);
 			case:
+				if t.spec == Specification.JSON5 {
+					switch t.r {
+					case 0x2028, 0x2029, 0xFEFF:
+						next_rune(t);
+						continue loop;
+					}
+				}
 				break loop;
 			}
 		}
 		return t.r;
 	}
 
+	skip_to_next_line :: proc(t: ^Tokenizer) { // Advance until next_rune has returned '\n' or the loop bound on t.offset is reached.
+		for t.offset < len(t.data) {
+			r := next_rune(t);
+			if r == '\n' { // stop as soon as the newline has been read
+				return;
+			}
+		}
+	}
+
+	skip_alphanum :: proc(t: ^Tokenizer) { // Advance while next_rune keeps returning one of A-Z, a-z, 0-9 or '_'.
+		for t.offset < len(t.data) {
+			switch next_rune(t) {
+			case 'A'..'Z', 'a'..'z', '0'..'9', '_':
+				continue; // still inside the run; read the next rune
+			}
+
+			return; // first rune outside the accepted set ends the scan
+		}
+	}
+
 	skip_whitespace(t);
 
 	token.pos = t.pos;
@@ -118,7 +163,7 @@ get_token :: proc(t: ^Tokenizer) -> (token: Token, err: Error) {
 	curr_rune := t.r;
 	next_rune(t);
 
-	switch curr_rune {
+	block: switch curr_rune {
 	case utf8.RUNE_ERROR:
 		err = Error.Illegal_Character;
 	case utf8.RUNE_EOF, '\x00':
@@ -127,21 +172,26 @@ get_token :: proc(t: ^Tokenizer) -> (token: Token, err: Error) {
 	case 'A'..'Z', 'a'..'z', '_':
 		token.kind = Kind.Ident;
 
-		for t.offset < len(t.data) {
-			switch next_rune(t) {
-			case 'A'..'Z', 'a'..'z', '0'..'9', '_':
-				continue;
-			}
-
-			break;
-		}
+		skip_alphanum(t);
 
 		switch str := t.data[token.offset:t.offset]; str {
 		case "null":  token.kind = Kind.Null;
 		case "false": token.kind = Kind.False;
 		case "true":  token.kind = Kind.True;
+		case:
+			if t.spec == Specification.JSON5 do switch str {
+			case "Infinity": token.kind = Kind.Infinity;
+			case "NaN":      token.kind = Kind.NaN;
+			}
 		}
 
+	case '+':
+		err = Error.Illegal_Character;
+		if t.spec != Specification.JSON5 {
+			break;
+		}
+		fallthrough;
+
 	case '-':
 		switch t.r {
 		case '0'..'9':
@@ -149,12 +199,46 @@ get_token :: proc(t: ^Tokenizer) -> (token: Token, err: Error) {
 		case:
 			// Illegal use of +/-
 			err = Error.Illegal_Character;
-			break;
+
+			if t.spec == Specification.JSON5 {
+				if t.r == 'I' || t.r == 'N' {
+					skip_alphanum(t);
+				}
+				switch t.data[token.offset:t.offset] {
+				case "-Infinity": token.kind = Kind.Infinity;
+				case "-NaN":      token.kind = Kind.NaN;
+				}
+			}
+			break block;
 		}
 		fallthrough;
 
+	case '.':
+		err = Error.Illegal_Character;
+		if t.spec == Specification.JSON5 { // Allow leading decimal point
+			skip_digits(t);
+			if t.r == 'e' || t.r == 'E' {
+				switch r := next_rune(t); r {
+				case '+', '-':
+					next_rune(t);
+				}
+				skip_digits(t);
+			}
+			str := t.data[token.offset:t.offset];
+			if !is_valid_number(str, t.spec) {
+				err = Error.Invalid_Number;
+			}
+		}
+
 	case '0'..'9':
 		token.kind = Kind.Integer;
+		if t.spec == Specification.JSON5 { // Hexadecimal Numbers
+			if curr_rune == '0' && (t.r == 'x' || t.r == 'X') {
+				next_rune(t);
+				skip_hex_digits(t);
+				break;
+			}
+		}
 
 		skip_digits(t);
 		if t.r == '.' {
@@ -171,11 +255,17 @@ get_token :: proc(t: ^Tokenizer) -> (token: Token, err: Error) {
 		}
 
 		str := t.data[token.offset:t.offset];
-		if !is_valid_number(str) {
+		if !is_valid_number(str, t.spec) {
 			err = Error.Invalid_Number;
 		}
 
 
+	case '\'':
+		err = Error.Illegal_Character;
+		if t.spec != Specification.JSON5 {
+			break;
+		}
+		fallthrough;
 	case '"':
 		token.kind = Kind.String;
 		quote := curr_rune;
@@ -194,10 +284,11 @@ get_token :: proc(t: ^Tokenizer) -> (token: Token, err: Error) {
 			}
 		}
 
-		if !is_valid_string_literal(t.data[token.offset : t.offset]) {
+		if !is_valid_string_literal(t.data[token.offset : t.offset], t.spec) {
 			err = Error.Invalid_String;
 		}
 
+
 	case ',': token.kind = Kind.Comma;
 	case ':': token.kind = Kind.Colon;
 	case '{': token.kind = Kind.Open_Brace;
@@ -205,6 +296,30 @@ get_token :: proc(t: ^Tokenizer) -> (token: Token, err: Error) {
 	case '[': token.kind = Kind.Open_Bracket;
 	case ']': token.kind = Kind.Close_Bracket;
 
+	case '/':
+		err = Error.Illegal_Character;
+		if t.spec == Specification.JSON5 {
+			switch t.r {
+			case '/':
+				// Single-line comments
+				skip_to_next_line(t);
+				return get_token(t);
+			case '*':
+				// Non-nested multi-line comments
+				for t.offset < len(t.data) {
+					next_rune(t);
+					if t.r == '*' {
+						next_rune(t);
+						if t.r == '/' {
+							next_rune(t);
+							return get_token(t);
+						}
+					}
+				}
+				err = Error.EOF;
+			}
+		}
+
 	case: err = Error.Illegal_Character;
 	}
 
@@ -215,7 +330,7 @@ get_token :: proc(t: ^Tokenizer) -> (token: Token, err: Error) {
 
 
 
-is_valid_number :: proc(s: string) -> bool {
+is_valid_number :: proc(s: string, spec: Specification) -> bool {
 	if s == "" {
 		return false;
 	}
@@ -225,6 +340,13 @@ is_valid_number :: proc(s: string) -> bool {
 		if s == "" {
 			return false;
 		}
+	} else if spec == Specification.JSON5 {
+		if s[0] == '+' { // Allow positive sign
+			s = s[1:];
+			if s == "" {
+				return false;
+			}
+		}
 	}
 
 	switch s[0] {
@@ -233,10 +355,21 @@ is_valid_number :: proc(s: string) -> bool {
 	case '1'..'9':
 		s = s[1:];
 		for len(s) > 0 && '0' <= s[0] && s[0] <= '9' do s = s[1:];
+	case '.':
+		if spec == Specification.JSON5 { // Allow leading decimal point
+			s = s[1:];
+		} else {
+			return false;
+		}
 	case:
 		return false;
 	}
 
+	if spec == Specification.JSON5 {
+		if len(s) == 1 && s[0] == '.' { // Allow trailing decimal point
+			return true;
+		}
+	}
 
 	if len(s) >= 2 && s[0] == '.' && '0' <= s[1] && s[1] <= '9' {
 		s = s[2:];
@@ -259,10 +392,23 @@ is_valid_number :: proc(s: string) -> bool {
 	return s == "";
 }
 
-is_valid_string_literal :: proc(s: string) -> bool {
-	if len(s) < 2 || s[0] != '"' || s[len(s)-1] != '"' {
+is_valid_string_literal :: proc(s: string, spec: Specification) -> bool {
+	if len(s) < 2 {
+		return false;
+	}
+	quote := s[0];
+	if s[0] != s[len(s)-1] {
 		return false;
 	}
+	if s[0] != '"' || s[len(s)-1] != '"' {
+		if spec == Specification.JSON5 {
+			if s[0] != '\'' || s[len(s)-1] != '\'' {
+				return false;
+			}
+		} else {
+			return false;
+		}
+	}
 	s = s[1 : len(s)-1];
 
 	i := 0;
@@ -301,7 +447,7 @@ is_valid_string_literal :: proc(s: string) -> bool {
 			case: return false;
 			}
 
-		case c == '"', c < ' ':
+		case c == quote, c < ' ':
 			return false;
 
 		case c < utf8.RUNE_SELF:
diff --git a/core/encoding/json/types.odin b/core/encoding/json/types.odin
index d8a10b801..f10136ad0 100644
--- a/core/encoding/json/types.odin
+++ b/core/encoding/json/types.odin
@@ -2,6 +2,11 @@ package json
 
 import "core:strconv"
 
+Specification :: enum { // Selects the grammar that the tokenizer, parser and validator accept.
+	JSON, // default value of the `spec` parameters in this package
+	JSON5, // turns on the code paths guarded by `spec == Specification.JSON5`
+}
+
 Null    :: distinct rawptr;
 Integer :: i64;
 Float   :: f64;
diff --git a/core/encoding/json/validator.odin b/core/encoding/json/validator.odin
index ac4e62d6b..aa49364ec 100644
--- a/core/encoding/json/validator.odin
+++ b/core/encoding/json/validator.odin
@@ -3,19 +3,35 @@ package json
 import "core:mem"
 
 // NOTE(bill): is_valid will not check for duplicate keys
-is_valid :: proc(data: string) -> bool {
-	p := make_parser(data, mem.nil_allocator());
+is_valid :: proc(data: string, spec := Specification.JSON) -> bool { // Report whether `data` validates under `spec` (default Specification.JSON).
+	p := make_parser(data, spec, mem.nil_allocator()); // the parser is created with mem.nil_allocator()
+	if p.spec == Specification.JSON5 { // JSON5 input is checked with validate_value; any other spec goes through validate_object below
+		return validate_value(&p);
+	}
 	return validate_object(&p);
 }
 
+validate_object_key :: proc(p: ^Parser) -> bool { // Consume the current token as an object key; returns true when it is acceptable for p.spec.
+	tok := p.curr_token;
+	if p.spec == Specification.JSON5 { // JSON5 accepts either a string or an identifier as a key
+		if tok.kind == Kind.String {
+			expect_token(p, Kind.String); // result ignored; the kind was checked on the line above
+			return true;
+		} else if tok.kind == Kind.Ident {
+			expect_token(p, Kind.Ident); // result ignored; the kind was checked on the line above
+			return true;
+		}
+	}
+	err := expect_token(p, Kind.String); // reached for non-JSON5 specs, and for JSON5 tokens that are neither String nor Ident
+	return err == Error.None;
+}
 validate_object :: proc(p: ^Parser) -> bool {
 	if err := expect_token(p, Kind.Open_Brace); err != Error.None {
 		return false;
 	}
 
 	for p.curr_token.kind != Kind.Close_Brace {
-		tok := p.curr_token;
-		if tok_err := expect_token(p, Kind.String); tok_err != Error.None {
+		if !validate_object_key(p) {
 			return false;
 		}
 		if colon_err := expect_token(p, Kind.Colon); colon_err != Error.None {
@@ -26,11 +42,18 @@ validate_object :: proc(p: ^Parser) -> bool {
 			return false;
 		}
 
-		// Disallow trailing commas for the time being
-		if allow_token(p, Kind.Comma) {
-			continue;
+		if p.spec == Specification.JSON5 {
+			// Allow trailing commas
+			if allow_token(p, Kind.Comma) {
+				continue;
+			}
 		} else {
-			break;
+			// Disallow trailing commas
+			if allow_token(p, Kind.Comma) {
+				continue;
+			} else {
+				break;
+			}
 		}
 	}
 
@@ -85,7 +108,7 @@ validate_value :: proc(p: ^Parser) -> bool {
 		return true;
 	case Kind.String:
 		advance_token(p);
-		return is_valid_string_literal(token.text);
+		return is_valid_string_literal(token.text, p.spec);
 
 	case Kind.Open_Brace:
 		return validate_object(p);
author	gingerBill <bill@gingerbill.org>	2019-01-06 21:48:13 +0000
committer	gingerBill <bill@gingerbill.org>	2019-01-06 21:48:13 +0000
commit	d1b9f3ac74df5533f1857a26831419aeb560fd2f (patch)
tree	ce7b77ec1fbd83516e0a49d4f4ccc82422f6ab87 /core/encoding
parent	d732a5158761578d14e69daf3a94fad9f0a8c23c (diff)