aboutsummaryrefslogtreecommitdiff
path: root/core/encoding
diff options
context:
space:
mode:
authorJeroen van Rijn <Kelimion@users.noreply.github.com>2021-11-30 23:01:22 +0100
committerJeroen van Rijn <Kelimion@users.noreply.github.com>2021-12-05 02:52:22 +0100
commitb5c828fe4ee3f0942b2eda1dc5753e4ad6d38ea9 (patch)
treeffbd45adb60e3de951dc2948801d5a57b21dd2c9 /core/encoding
parent6ce5608003e630bc0de1c591fd4cbea3fe59e1d3 (diff)
[xml] Initial implementation of `core:encoding/xml`.
A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816). Features: - Supports enough of the XML 1.0/1.1 spec to handle the 99.9% of XML documents in common current usage. - Simple to understand and use. Small. Caveats: - We do NOT support HTML in this package, as that may or may not be valid XML. If it works, great. If it doesn't, that's not considered a bug. - We do NOT support UTF-16. If you have a UTF-16 XML file, please convert it to UTF-8 first. Also, our condolences. - <!ELEMENT and <!ATTLIST are not supported, and will be either ignored or return an error depending on the parser options. TODO: - Optional CDATA unboxing. - Optional `&gt;`, `&#32;`, `&#x20;` and other escape substitution in tag bodies. - Test suite MAYBE: - XML writer? - Serialize/deserialize Odin types?
Diffstat (limited to 'core/encoding')
-rw-r--r--core/encoding/xml/debug_print.odin73
-rw-r--r--core/encoding/xml/example/xml_example.odin55
-rw-r--r--core/encoding/xml/tokenizer.odin339
-rw-r--r--core/encoding/xml/xml_reader.odin651
4 files changed, 1118 insertions, 0 deletions
diff --git a/core/encoding/xml/debug_print.odin b/core/encoding/xml/debug_print.odin
new file mode 100644
index 000000000..0b7ffa822
--- /dev/null
+++ b/core/encoding/xml/debug_print.odin
@@ -0,0 +1,73 @@
+package xml
+/*
+ An XML 1.0 / 1.1 parser
+
+ Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
+ Made available under Odin's BSD-3 license.
+
+ A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
+
+ List of contributors:
+ Jeroen van Rijn: Initial implementation.
+*/
+import "core:fmt"
+
+/*
+	Debug helper: prints a `Document`'s prolog, encoding, DOCTYPE and element tree to stdout.
+
+	Inputs:
+	- doc: The parsed document. Must not be `nil`.
+*/
+print :: proc(doc: ^Document) {
+	assert(doc != nil)
+
+	using fmt
+	println("[XML Prolog]")
+
+	// Prolog attributes, e.g. `version` and `encoding` from `<?xml ... ?>`.
+	for attr in doc.prolog {
+		printf("\t%v: %v\n", attr.key, attr.val)
+	}
+
+	printf("[Encoding] %v\n", doc.encoding)
+	printf("[DOCTYPE] %v\n", doc.doctype.ident)
+
+	// Anything after the DOCTYPE identifier that was scanned but not interpreted.
+	if len(doc.doctype.rest) > 0 {
+		printf("\t%v\n", doc.doctype.rest)
+	}
+
+	if doc.root != nil {
+		println(" --- ")
+		print_element(0, doc.root)
+		println(" --- ")
+	}
+}
+
+/*
+	Recursively pretty-prints `element` and its children, indented by `indent` tab stops.
+	Safe to call with a `nil` element.
+*/
+print_element :: proc(indent: int, element: ^Element) {
+	if element == nil { return }
+	using fmt
+
+	// Prints `indent` tabs; nesting deeper than the 40 tabs in the literal is clamped.
+	tab :: proc(indent: int) {
+		tabs := "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
+
+		i := max(0, min(indent, len(tabs)))
+		printf("%v", tabs[:i])
+	}
+
+	tab(indent)
+
+	if element.kind == .Element {
+		printf("<%v>\n", element.ident)
+		if len(element.value) > 0 {
+			tab(indent + 1)
+			printf("[Value] %v\n", element.value)
+		}
+
+		for attr in element.attribs {
+			tab(indent + 1)
+			printf("[Attr] %v: %v\n", attr.key, attr.val)
+		}
+
+		// Recurse into children one indent level deeper.
+		for child in element.children {
+			print_element(indent + 1, child)
+		}
+	} else if element.kind == .Comment {
+		printf("[COMMENT] %v\n", element.value)
+	}
+} \ No newline at end of file
diff --git a/core/encoding/xml/example/xml_example.odin b/core/encoding/xml/example/xml_example.odin
new file mode 100644
index 000000000..24a277de6
--- /dev/null
+++ b/core/encoding/xml/example/xml_example.odin
@@ -0,0 +1,55 @@
+package xml_example
+
+import "core:encoding/xml"
+import "core:mem"
+import "core:fmt"
+
+/*
+	Deliberately empty error handler — presumably to keep the example's output clean.
+	Swap in `xml.default_error_handler` to see parse diagnostics.
+*/
+Error_Handler :: proc(pos: xml.Pos, fmt: string, args: ..any) {
+
+}
+
+FILENAME :: "../../../../tests/core/assets/xml/nl_NL-xliff-1.0.xliff"
+DOC :: #load(FILENAME)
+
+OPTIONS :: xml.Options{
+ flags = {
+ .Ignore_Unsupported, .Intern_Comments,
+ },
+ expected_doctype = "",
+}
+
+/*
+	Parses the embedded example document and prints the result.
+	NOTE: `xml.print` runs before the error check, so whatever was parsed is
+	printed even when parsing failed.
+*/
+_main :: proc() {
+	using fmt
+
+	println("--- DOCUMENT TO PARSE ---")
+	println(string(DOC))
+	println("--- /DOCUMENT TO PARSE ---\n")
+
+	doc, err := xml.parse(DOC, OPTIONS, FILENAME, Error_Handler)
+	defer xml.destroy(doc)
+
+	xml.print(doc)
+
+	if err != .None {
+		printf("Parse error: %v\n", err)
+	} else {
+		println("DONE!")
+	}
+}
+
+/*
+	Wraps `_main` in a tracking allocator and reports any allocations that leaked.
+*/
+main :: proc() {
+	using fmt
+
+	track: mem.Tracking_Allocator
+	mem.tracking_allocator_init(&track, context.allocator)
+	context.allocator = mem.tracking_allocator(&track)
+
+	_main()
+
+	// Anything still in the allocation map after `_main` returns was leaked.
+	if len(track.allocation_map) > 0 {
+		println()
+		for _, v in track.allocation_map {
+			printf("%v Leaked %v bytes.\n", v.location, v.size)
+		}
+	}
+} \ No newline at end of file
diff --git a/core/encoding/xml/tokenizer.odin b/core/encoding/xml/tokenizer.odin
new file mode 100644
index 000000000..a63dca5bd
--- /dev/null
+++ b/core/encoding/xml/tokenizer.odin
@@ -0,0 +1,339 @@
+package xml
+
+import "core:fmt"
+import "core:unicode"
+import "core:unicode/utf8"
+
+Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any)
+
+Token :: struct {
+ kind: Token_Kind,
+ text: string,
+ pos: Pos,
+}
+
+Pos :: struct {
+ file: string,
+ offset: int, // starting at 0
+ line: int, // starting at 1
+ column: int, // starting at 1
+}
+
+Token_Kind :: enum {
+ Invalid,
+
+ Ident,
+ Literal,
+ Rune,
+ String,
+
+ Double_Quote, // "
+ Single_Quote, // '
+ Colon, // :
+
+ Eq, // =
+ Lt, // <
+ Gt, // >
+ Exclaim, // !
+ Question, // ?
+ Hash, // #
+ Slash, // /
+ Dash, // -
+
+ Open_Bracket, // [
+ Close_Bracket, // ]
+
+ EOF,
+}
+
+CDATA_START :: "<![CDATA["
+CDATA_END :: "]]>"
+
+Tokenizer :: struct {
+ // Immutable data
+ path: string,
+ src: string,
+ err: Error_Handler,
+
+ // Tokenizing state
+ ch: rune,
+ offset: int,
+ read_offset: int,
+ line_offset: int,
+ line_count: int,
+
+ // Mutable data
+ error_count: int,
+}
+
+/*
+	Prepares `t` to tokenize `src`. `path` is only used in error positions.
+	Primes `t.ch` with the first rune and skips a leading byte order mark.
+*/
+init :: proc(t: ^Tokenizer, src: string, path: string, err: Error_Handler = default_error_handler) {
+	t.src = src
+	t.err = err
+	t.ch = ' '
+	t.offset = 0
+	t.read_offset = 0
+	t.line_offset = 0
+	t.line_count = len(src) > 0 ? 1 : 0
+	t.error_count = 0
+	t.path = path
+
+	// Load the first rune so scanners always have a current `t.ch`.
+	advance_rune(t)
+	if t.ch == utf8.RUNE_BOM {
+		// A BOM is only legal (and silently skipped) at the very start.
+		advance_rune(t)
+	}
+}
+
+@(private)
+offset_to_pos :: proc(t: ^Tokenizer, offset: int) -> Pos {
+	/*
+		Translates a byte offset into a file/line/column `Pos`, using the
+		line bookkeeping maintained by `advance_rune`.
+	*/
+	return Pos{
+		file   = t.path,
+		offset = offset,
+		line   = t.line_count,
+		column = offset - t.line_offset + 1,
+	}
+}
+
+/*
+	Default error handler: prints `file(line:column) message` to stderr.
+*/
+default_error_handler :: proc(pos: Pos, msg: string, args: ..any) {
+	fmt.eprintf("%s(%d:%d) ", pos.file, pos.line, pos.column)
+	fmt.eprintf(msg, ..args)
+	fmt.eprintf("\n")
+}
+
+/*
+	Reports an error at `offset` through the installed handler (if any)
+	and increments `t.error_count`.
+*/
+error :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
+	pos := offset_to_pos(t, offset)
+	if t.err != nil {
+		t.err(pos, msg, ..args)
+	}
+	t.error_count += 1
+}
+
+/*
+	Advances the tokenizer by one rune, maintaining line/offset bookkeeping.
+	At end of input, `ch` is set to the sentinel -1 so scanners can detect EOF.
+*/
+advance_rune :: proc(using t: ^Tokenizer) {
+	if read_offset < len(src) {
+		offset = read_offset
+		// We just stepped past a newline: a new line starts at this offset.
+		if ch == '\n' {
+			line_offset = offset
+			line_count += 1
+		}
+		r, w := rune(src[read_offset]), 1
+		switch {
+		case r == 0:
+			error(t, t.offset, "illegal character NUL")
+		case r >= utf8.RUNE_SELF:
+			// Multi-byte sequence: decode it properly.
+			r, w = utf8.decode_rune_in_string(src[read_offset:])
+			if r == utf8.RUNE_ERROR && w == 1 {
+				error(t, t.offset, "illegal UTF-8 encoding")
+			} else if r == utf8.RUNE_BOM && offset > 0 {
+				// A byte order mark is only legal as the very first rune.
+				error(t, t.offset, "illegal byte order mark")
+			}
+		}
+		read_offset += w
+		ch = r
+	} else {
+		offset = len(src)
+		if ch == '\n' {
+			line_offset = offset
+			line_count += 1
+		}
+		// EOF sentinel.
+		ch = -1
+	}
+}
+
+// Returns the byte `offset` bytes past the current read position, or 0 when out of range.
+peek_byte :: proc(t: ^Tokenizer, offset := 0) -> byte {
+	idx := t.read_offset + offset
+	if idx >= len(t.src) {
+		return 0
+	}
+	return t.src[idx]
+}
+
+// Consumes any run of space, tab, carriage return and newline characters.
+skip_whitespace :: proc(t: ^Tokenizer) {
+	loop: for {
+		switch t.ch {
+		case ' ', '\t', '\r', '\n':
+			advance_rune(t)
+		case:
+			break loop
+		}
+	}
+}
+
+// Reports whether `r` may start an identifier: `_`, an ASCII letter, or any Unicode letter.
+is_letter :: proc(r: rune) -> bool {
+	if r < utf8.RUNE_SELF {
+		// Fast path for ASCII.
+		switch r {
+		case '_', 'A'..='Z', 'a'..='z':
+			return true
+		}
+	}
+	// Non-ASCII (or ASCII non-letter, for which this is false anyway).
+	return unicode.is_letter(r)
+}
+
+/*
+	Reports whether `r` may appear inside an identifier.
+	Allows `_`, `-` and `:` (namespace separator) in addition to letters and digits.
+*/
+is_valid_identifier_rune :: proc(r: rune) -> bool {
+	if r < utf8.RUNE_SELF {
+		// Fast path for ASCII. Note: use `..=` consistently; the bare `..`
+		// range operator used here previously is deprecated/removed in Odin.
+		switch r {
+		case '_', '-', ':': return true
+		case 'A'..='Z', 'a'..='z': return true
+		case '0'..='9': return true
+		}
+		// Any other ASCII rune is invalid; no need to consult the Unicode tables.
+		return false
+	}
+
+	return unicode.is_digit(r) || unicode.is_letter(r)
+}
+
+/*
+	Scans an identifier such as `foo` or `ns:ident` and returns it as a slice of `t.src`.
+	At most one `:` is consumed; a second `:` stops the scan and is left unconsumed.
+*/
+scan_identifier :: proc(t: ^Tokenizer) -> string {
+	offset := t.offset
+	namespaced := false
+
+	for is_valid_identifier_rune(t.ch) {
+		advance_rune(t)
+		if t.ch == ':' {
+			/*
+				A namespaced attr can have at most two parts, `namespace:ident`.
+			*/
+			if namespaced {
+				break
+			}
+			namespaced = true
+		}
+	}
+	return string(t.src[offset : t.offset])
+}
+
+/*
+	Scans text starting at `offset` until `close` (default `<`) is seen outside a
+	CDATA section. Returns a slice of `t.src`; nothing is copied or unescaped.
+
+	- `consume_close` also consumes the closing rune (used for quoted attribute values).
+	- A newline outside CDATA terminates the scan with `.Invalid_Tag_Value`.
+
+	NOTE(review): the CDATA boundary checks use `read_offset + len(marker) < len(src)`;
+	a CDATA marker flush against end of input looks like it would be missed — confirm.
+*/
+scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close := false) -> (value: string, err: Error) {
+	err = .None
+	in_cdata := false
+
+	loop: for {
+		ch := t.ch
+
+		switch ch {
+		case -1:
+			error(t, t.offset, "[scan_string] Premature end of file.\n")
+			return "", .Premature_EOF
+
+		case '<':
+			/*
+				Might be the start of a CDATA tag.
+			*/
+			if t.read_offset + len(CDATA_START) < len(t.src) {
+				if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
+					in_cdata = true
+				}
+			}
+
+		case ']':
+			/*
+				Might be the end of a CDATA tag.
+			*/
+			if t.read_offset + len(CDATA_END) < len(t.src) {
+				if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
+					in_cdata = false
+				}
+			}
+
+		case '\n':
+			if !in_cdata {
+				// Report the partial value scanned so far, then the error itself.
+				error(t, offset, string(t.src[offset : t.offset]))
+				error(t, offset, "[scan_string] Not terminated\n")
+				err = .Invalid_Tag_Value
+				break loop
+			}
+		}
+
+		if ch == close && !in_cdata {
+			/*
+				If it's not a CDATA tag, it's the end of this body.
+			*/
+			break loop
+		}
+
+		advance_rune(t)
+	}
+
+	lit := string(t.src[offset : t.offset])
+	if consume_close {
+		advance_rune(t)
+	}
+
+	/*
+		TODO: Handle decoding escape characters and unboxing CDATA.
+	*/
+
+	return lit, err
+}
+
+/*
+	Returns the next token without consuming it, by snapshotting and
+	restoring the tokenizer state around a `scan`.
+*/
+peek :: proc(t: ^Tokenizer) -> (token: Token) {
+	saved := t^
+	defer t^ = saved
+	return scan(t)
+}
+
+/*
+	Scans and returns the next token, skipping leading whitespace.
+	`Token.text` is a slice of the source for most token kinds.
+*/
+scan :: proc(t: ^Tokenizer) -> Token {
+	skip_whitespace(t)
+
+	offset := t.offset
+
+	kind: Token_Kind
+	err: Error
+	lit: string
+	pos := offset_to_pos(t, offset)
+
+	switch ch := t.ch; true {
+	case is_letter(ch):
+		lit = scan_identifier(t)
+		kind = .Ident
+
+	case:
+		advance_rune(t)
+		switch ch {
+		case -1:
+			kind = .EOF
+
+		case '<': kind = .Lt
+		case '>': kind = .Gt
+		case '!': kind = .Exclaim
+		case '?': kind = .Question
+		case '=': kind = .Eq
+		case '#': kind = .Hash
+		case '/': kind = .Slash
+		case '-': kind = .Dash
+		case ':': kind = .Colon
+
+		case '"', '\'':
+			// Quoted string: scan up to and including the matching quote.
+			lit, err = scan_string(t, t.offset, ch, true)
+			if err == .None {
+				kind = .String
+			} else {
+				kind = .Invalid
+			}
+
+		case '\n':
+			lit = "\n"
+
+		case '\\':
+			// NOTE(review): this backslash/newline rule looks inherited from a
+			// programming-language tokenizer; its relevance to XML is unclear — confirm.
+			token := scan(t)
+			if token.pos.line == pos.line {
+				error(t, token.pos.offset, "expected a newline after \\")
+			}
+			return token
+
+		case:
+			if ch != utf8.RUNE_BOM {
+				// error(t, t.offset, "illegal character '%r': %d", ch, ch)
+			}
+			kind = .Invalid
+		}
+	}
+
+	// Fall back to the consumed source slice when no literal was set explicitly.
+	if lit == "" {
+		lit = string(t.src[offset : t.offset])
+	}
+	return Token{kind, lit, pos}
+} \ No newline at end of file
diff --git a/core/encoding/xml/xml_reader.odin b/core/encoding/xml/xml_reader.odin
new file mode 100644
index 000000000..526be5856
--- /dev/null
+++ b/core/encoding/xml/xml_reader.odin
@@ -0,0 +1,651 @@
+package xml
+/*
+ An XML 1.0 / 1.1 parser
+
+ Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
+ Made available under Odin's BSD-3 license.
+
+ A from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
+
+ Features:
+ - Supports enough of the XML 1.0/1.1 spec to handle the 99.9% of XML documents in common current usage.
+ - Simple to understand and use. Small.
+
+ Caveats:
+ - We do NOT support HTML in this package, as that may or may not be valid XML.
+ If it works, great. If it doesn't, that's not considered a bug.
+
+ - We do NOT support UTF-16. If you have a UTF-16 XML file, please convert it to UTF-8 first. Also, our condolences.
+	- <!ELEMENT and <!ATTLIST are not supported, and will be either ignored or return an error depending on the parser options.
+
+ TODO:
+ - Optional CDATA unboxing.
+ - Optional `&gt;`, `&#32;`, `&#x20;` and other escape substitution in tag bodies.
+
+ MAYBE:
+ - XML writer?
+ - Serialize/deserialize Odin types?
+
+ List of contributors:
+ Jeroen van Rijn: Initial implementation.
+*/
+
+import "core:strings"
+import "core:mem"
+import "core:os"
+
+DEFAULT_Options :: Options{
+ flags = {
+ .Ignore_Unsupported,
+ },
+ expected_doctype = "",
+}
+
+Option_Flag :: enum {
+ /*
+ Document MUST start with `<?xml` prolog.
+ */
+ Must_Have_Prolog,
+
+ /*
+ Document MUST have a `<!DOCTYPE`.
+ */
+ Must_Have_DocType,
+
+ /*
+ By default we skip comments. Use this option to intern a comment on a parented Element.
+ */
+ Intern_Comments,
+
+ /*
+ How to handle unsupported parts of the specification, like <! other than <!DOCTYPE and <![CDATA[
+ */
+ Error_on_Unsupported,
+ Ignore_Unsupported,
+
+ /*
+ By default CDATA tags are passed-through as-is.
+ This option unwraps them when encountered.
+ */
+ Unbox_CDATA,
+
+ /*
+ By default SGML entities like `&gt;`, `&#32;` and `&#x20;` are passed-through as-is.
+ This option decodes them when encountered.
+ */
+ Decode_SGML_Entities,
+}
+
+Document :: struct {
+ root: ^Element,
+ prolog: Attributes,
+ encoding: Encoding,
+
+ doctype: struct {
+ /*
+ We only scan the <!DOCTYPE IDENT part and skip the rest.
+ */
+ ident: string,
+ rest: string,
+ },
+
+ /*
+ Internal
+ */
+ tokenizer: ^Tokenizer,
+ allocator: mem.Allocator,
+ intern: strings.Intern,
+}
+
+Element :: struct {
+ ident: string,
+ value: string,
+ attribs: Attributes,
+
+ kind: enum {
+ Element = 0,
+ Comment,
+ },
+
+ parent: ^Element,
+ children: [dynamic]^Element,
+}
+
+Attr :: struct {
+ key: string,
+ val: string,
+}
+
+Attributes :: [dynamic]Attr
+
+Options :: struct {
+ flags: Option_Flags,
+ expected_doctype: string,
+}
+Option_Flags :: bit_set[Option_Flag]
+
+Encoding :: enum {
+ Unknown,
+
+ UTF_8,
+ ISO_8859_1,
+
+ /*
+ Aliases
+ */
+ LATIN_1 = ISO_8859_1,
+}
+
+// Error values returned by the tokenizer and parser.
+Error :: enum {
+	/*
+		General return values.
+	*/
+	None = 0,
+	General_Error,
+	Unexpected_Token,
+	Invalid_Token,
+
+	/*
+		Couldn't find, open or read file.
+	*/
+	File_Error,
+
+	/*
+		File too short.
+	*/
+	Premature_EOF,
+
+	/*
+		XML-specific errors.
+	*/
+	No_Prolog,
+	Invalid_Prolog,
+	Too_Many_Prologs,
+
+	No_DocType,
+	Too_Many_DocTypes,
+	// NOTE(review): "Proceed" is a typo for "Precede". Renaming would be a
+	// breaking API change, so it is only flagged here.
+	DocType_Must_Proceed_Elements,
+
+	/*
+		If a DOCTYPE is present _or_ the caller
+		asked for a specific DOCTYPE and the DOCTYPE
+		and root tag don't match, we return `.Invalid_DocType`.
+	*/
+	Invalid_DocType,
+
+	Invalid_Tag_Value,
+	Mismatched_Closing_Tag,
+
+	Unclosed_Comment,
+	Comment_Before_Root_Element,
+	Invalid_Sequence_In_Comment,
+
+	Unsupported_Version,
+	Unsupported_Encoding,
+
+	/*
+		<!FOO are usually skipped.
+	*/
+	Unhandled_Bang,
+
+	Duplicate_Attribute,
+	Conflicting_Options,
+
+	/*
+		Unhandled TODO:
+	*/
+	Unhandled_CDATA_Unboxing,
+	Unhandled_SGML_Entity_Decoding,
+}
+
+/*
+ Implementation starts here.
+*/
+/*
+	Parses the XML document held in `data` into a `^Document`.
+
+	Inputs:
+	- data:          The document text.
+	- options:       Parser options, validated before use. See `Option_Flag`.
+	- path:          Only used to prefix error messages.
+	- error_handler: Receives tokenizer and parser diagnostics.
+	- allocator:     Used for the document, its elements and the intern pool.
+
+	All strings stored on the returned document are interned (copied), so the
+	caller may free `data` once this proc returns.
+	Call `destroy` on the result, even when `err != .None`.
+*/
+parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
+	context.allocator = allocator
+
+	opts := validate_options(options) or_return
+
+	t := &Tokenizer{}
+	init(t, string(data), path, error_handler)
+
+	doc = new(Document)
+	doc.allocator = allocator
+	doc.tokenizer = t
+
+	strings.intern_init(&doc.intern, allocator, allocator)
+
+	err = .Unexpected_Token
+	element, parent: ^Element
+
+	/*
+		If a DOCTYPE is present, the root tag has to match.
+		If an expected DOCTYPE is given in options (i.e. it's non-empty), the DOCTYPE (if present) and root tag have to match.
+	*/
+	expected_doctype := options.expected_doctype
+
+	loop: for {
+		tok := scan(t)
+		#partial switch tok.kind {
+
+		case .Lt:
+			open := scan(t)
+			#partial switch open.kind {
+
+			case .Question:
+				/*
+					<?xml
+				*/
+				next := scan(t)
+				#partial switch next.kind {
+				case .Ident:
+					if len(next.text) == 3 && strings.to_lower(next.text, context.temp_allocator) == "xml" {
+						/*
+							A document may have at most one prolog.
+						*/
+						if len(doc.prolog) > 0 {
+							return doc, .Too_Many_Prologs
+						}
+						parse_prolog(doc) or_return
+					} else {
+						/*
+							Report the identifier we actually saw, not the `<` token.
+						*/
+						error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", next.text)
+						return
+					}
+				case:
+					error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", next.text)
+					return
+				}
+
+			case .Exclaim:
+				/*
+					<!
+				*/
+				next := scan(t)
+				#partial switch next.kind {
+				case .Ident:
+					switch next.text {
+					case "DOCTYPE":
+						if len(doc.doctype.ident) > 0 {
+							return doc, .Too_Many_DocTypes
+						}
+						if doc.root != nil {
+							return doc, .DocType_Must_Proceed_Elements
+						}
+						parse_doctype(doc) or_return
+
+						if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident {
+							error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident)
+							return doc, .Invalid_DocType
+						}
+						expected_doctype = doc.doctype.ident
+
+					case:
+						if .Error_on_Unsupported in opts.flags {
+							error(t, t.offset, "Unhandled: <!%v\n", next.text)
+							err = .Unhandled_Bang
+							return
+						}
+						skip_element(t) or_return
+					}
+
+				case .Dash:
+					/*
+						Comment: <!-- -->.
+						The grammar does not allow a comment to end in --->
+					*/
+					if doc.root == nil {
+						return doc, .Comment_Before_Root_Element
+					}
+
+					expect(t, .Dash)
+					offset := t.offset
+
+					for {
+						advance_rune(t)
+						ch := t.ch
+
+						/*
+							A comment ends when we see -->, preceded by a character that's not a dash.
+							"For compatibility, the string "--" (double-hyphen) must not occur within comments."
+
+							See: https://www.w3.org/TR/2006/REC-xml11-20060816/#dt-comment
+
+							Thanks to the length (4) of the comment start, we also have enough lookback,
+							and the peek at the next byte asserts that there's at least one more character
+							that's a `>`.
+						*/
+						if ch < 0 {
+							error(t, offset, "[parse] Comment was not terminated\n")
+							return doc, .Unclosed_Comment
+						}
+
+						if string(t.src[t.offset - 1:][:2]) == "--" {
+							if peek_byte(t) == '>' {
+								break
+							} else {
+								error(t, t.offset - 1, "Invalid -- sequence in comment.\n")
+								return doc, .Invalid_Sequence_In_Comment
+							}
+						}
+					}
+
+					if .Intern_Comments in opts.flags {
+						/*
+							After the root element has closed, `element` is nil;
+							parent trailing comments to the root instead of crashing.
+						*/
+						comment_parent := element if element != nil else doc.root
+
+						el := new(Element)
+						el.parent = comment_parent
+						el.kind = .Comment
+						el.value = strings.intern_get(&doc.intern, string(t.src[offset : t.offset - 1]))
+						append(&comment_parent.children, el)
+					}
+
+					expect(t, .Dash)
+					expect(t, .Gt)
+
+				case:
+					error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next)
+					return
+				}
+
+			case .Ident:
+				/*
+					e.g. <odin - Start of new element.
+				*/
+				if doc.root != nil && parent == nil {
+					/*
+						The root element has already closed; XML allows exactly one root,
+						so a second top-level element is an error rather than a crash.
+					*/
+					error(t, t.offset, "Invalid second root element: %v\n", open.text)
+					return doc, .Invalid_Token
+				}
+
+				element = new(Element)
+
+				if doc.root == nil {
+					/*
+						First element.
+					*/
+					doc.root = element
+					parent = element
+				} else {
+					append(&parent.children, element)
+				}
+
+				element.parent = parent
+				element.ident = strings.intern_get(&doc.intern, open.text)
+
+				parse_attributes(doc, &element.attribs) or_return
+
+				/*
+					If a DOCTYPE is present _or_ the caller
+					asked for a specific DOCTYPE and the DOCTYPE
+					and root tag don't match, we return .Invalid_DocType.
+				*/
+				if element == doc.root {
+					if len(expected_doctype) > 0 && expected_doctype != open.text {
+						error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text)
+						return doc, .Invalid_DocType
+					}
+				}
+
+				/*
+					One of these should follow:
+					- `>`, which means we've just opened this tag and expect a later element to close it.
+					- `/>`, which means this is an 'empty' or self-closing tag.
+				*/
+				end_token := scan(t)
+
+				#partial switch end_token.kind {
+				case .Gt:
+					/*
+						We're now the new parent.
+					*/
+					parent = element
+
+				case .Slash:
+					/*
+						Empty tag?
+					*/
+					expect(t, .Gt) or_return
+
+				case:
+					error(t, t.offset, "Expected close tag, got: %#v\n", end_token)
+					return
+				}
+
+			case .Slash:
+				/*
+					Close tag.
+				*/
+				ident := expect(t, .Ident) or_return
+				_ = expect(t, .Gt) or_return
+
+				/*
+					Guard against a close tag with no matching open tag (`element == nil`).
+				*/
+				if element == nil || element.ident != ident.text {
+					error(t, t.offset, "Mismatched Closing Tag: %v\n", ident.text)
+					return doc, .Mismatched_Closing_Tag
+				}
+				parent = element.parent
+				element = parent
+
+			case:
+				error(t, t.offset, "Invalid Token after <: %#v\n", open)
+				return
+			}
+
+		case .EOF:
+			break loop
+
+		case:
+			/*
+				This should be a tag's body text.
+			*/
+			if element == nil {
+				/*
+					Loose text outside of any element.
+				*/
+				error(t, t.offset, "Invalid text outside of an element: %v\n", tok.text)
+				return doc, .Invalid_Token
+			}
+
+			/*
+				Intern the body so the returned document doesn't alias `data`;
+				`parse_from_file` frees `data` as soon as parsing finishes.
+			*/
+			body_text := scan_string(t, tok.pos.offset) or_return
+			element.value = strings.intern_get(&doc.intern, body_text)
+		}
+	}
+
+	if .Must_Have_Prolog in opts.flags && len(doc.prolog) == 0 {
+		return doc, .No_Prolog
+	}
+
+	if .Must_Have_DocType in opts.flags && len(doc.doctype.ident) == 0 {
+		return doc, .No_DocType
+	}
+
+	return doc, .None
+}
+
+/*
+	Reads `filename` and parses its contents. See `parse_from_slice`.
+
+	NOTE(review): `data` is freed when this proc returns, so the returned document
+	must not retain slices into it. Verify that `parse_from_slice` interns every
+	string it stores (element body values included); otherwise this is a
+	use-after-free.
+*/
+parse_from_file :: proc(filename: string, options := DEFAULT_Options, error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
+	context.allocator = allocator
+
+	data, data_ok := os.read_entire_file(filename)
+	defer delete(data)
+
+	if !data_ok { return {}, .File_Error }
+
+	return parse_from_slice(data, options, filename, error_handler, allocator)
+}
+
+parse :: proc { parse_from_file, parse_from_slice }
+
+/*
+	Releases an element, its attribute and child arrays, and (recursively)
+	every descendant. Safe to call with `nil`.
+*/
+free_element :: proc(element: ^Element) {
+	if element == nil { return }
+
+	/*
+		NOTE: Recursive. Could be rewritten to maintain an explicit
+		list of pointers to free instead.
+	*/
+	for child in element.children {
+		free_element(child)
+	}
+
+	delete(element.attribs)
+	delete(element.children)
+	free(element)
+}
+
+/*
+	Frees the document: its element tree, interned strings, prolog attributes,
+	and the document itself. Safe to call with `nil`.
+*/
+destroy :: proc(doc: ^Document) {
+	if doc == nil { return }
+
+	free_element(doc.root)
+	strings.intern_destroy(&doc.intern)
+
+	delete(doc.prolog)
+	free(doc)
+}
+
+/*
+ Helpers.
+*/
+
+/*
+	Checks an `Options` value for conflicting or not-yet-implemented flags.
+	Returns the options unchanged on success.
+*/
+validate_options :: proc(options: Options) -> (validated: Options, err: Error) {
+	flags := options.flags
+
+	if .Error_on_Unsupported in flags && .Ignore_Unsupported in flags {
+		return options, .Conflicting_Options
+	}
+	// These two are documented TODOs; reject them until implemented.
+	if .Unbox_CDATA in flags {
+		return options, .Unhandled_CDATA_Unboxing
+	}
+	if .Decode_SGML_Entities in flags {
+		return options, .Unhandled_SGML_Entity_Decoding
+	}
+
+	return options, .None
+}
+
+/*
+	Scans the next token and verifies it is of the given `kind`.
+	Reports and returns `.Unexpected_Token` otherwise.
+*/
+expect :: proc(t: ^Tokenizer, kind: Token_Kind) -> (tok: Token, err: Error) {
+	tok = scan(t)
+	if tok.kind != kind {
+		error(t, t.offset, "Expected \"%v\", got \"%v\".", kind, tok.kind)
+		return tok, .Unexpected_Token
+	}
+	return tok, .None
+}
+
+/*
+	Parses one `key="value"` attribute pair; both strings are interned on the document.
+	`offset` is the starting offset of the key, used for duplicate reporting.
+*/
+parse_attribute :: proc(doc: ^Document) -> (attr: Attr, offset: int, err: Error) {
+	assert(doc != nil)
+	context.allocator = doc.allocator
+	t := doc.tokenizer
+
+	key := expect(t, .Ident) or_return
+	// The tokenizer has consumed the key; back-compute where it started.
+	offset = t.offset - len(key.text)
+
+	_ = expect(t, .Eq) or_return
+	value := expect(t, .String) or_return
+
+	attr.key = strings.intern_get(&doc.intern, key.text)
+	attr.val = strings.intern_get(&doc.intern, value.text)
+
+	err = .None
+	return
+}
+
+/*
+	Returns `.Duplicate_Attribute` (after reporting at `offset`) if `attr.key`
+	already occurs in `attribs`.
+*/
+check_duplicate_attributes :: proc(t: ^Tokenizer, attribs: Attributes, attr: Attr, offset: int) -> (err: Error) {
+	for existing in attribs {
+		if existing.key != attr.key { continue }
+		error(t, offset, "Duplicate attribute: %v\n", attr.key)
+		return .Duplicate_Attribute
+	}
+	return .None
+}
+
+/*
+	Parses zero or more attributes into `attribs`, rejecting duplicate keys.
+	Trailing whitespace is skipped so the caller sees the next significant token.
+*/
+parse_attributes :: proc(doc: ^Document, attribs: ^Attributes) -> (err: Error) {
+	assert(doc != nil)
+	context.allocator = doc.allocator
+	t := doc.tokenizer
+
+	for peek(t).kind == .Ident {
+		attr, offset := parse_attribute(doc) or_return
+		check_duplicate_attributes(t, attribs^, attr, offset) or_return
+		append(attribs, attr)
+	}
+	skip_whitespace(t)
+	return .None
+}
+
+/*
+	Parses the attributes of an `<?xml ... ?>` prolog into `doc.prolog` and
+	applies `version` / `encoding` when recognized. The tokenizer must be
+	positioned just after the `xml` identifier. Consumes the closing `?>`.
+*/
+parse_prolog :: proc(doc: ^Document) -> (err: Error) {
+	assert(doc != nil)
+	context.allocator = doc.allocator
+	t := doc.tokenizer
+
+	offset := t.offset
+	parse_attributes(doc, &doc.prolog) or_return
+
+	for attr in doc.prolog {
+		switch attr.key {
+		case "version":
+			switch attr.val {
+			case "1.0", "1.1":
+			case:
+				// Unknown version: warn, but keep parsing.
+				error(t, offset, "[parse_prolog] Warning: Unhandled XML version: %v\n", attr.val)
+			}
+
+		case "encoding":
+			switch strings.to_lower(attr.val, context.temp_allocator) {
+			case "utf-8", "utf8":
+				doc.encoding = .UTF_8
+
+			case "latin-1", "latin1", "iso-8859-1":
+				doc.encoding = .LATIN_1
+
+			case:
+				/*
+					Unrecognized encoding, assume UTF-8.
+				*/
+				error(t, offset, "[parse_prolog] Warning: Unrecognized encoding: %v\n", attr.val)
+			}
+
+		case:
+			// Ignored.
+		}
+	}
+
+	_ = expect(t, .Question) or_return
+	_ = expect(t, .Gt) or_return
+
+	return .None
+}
+
+/*
+	Skips a bracketed construct by counting `<` and `>` tokens, starting with one
+	open bracket already seen. Used for unsupported `<!...>` constructs and for
+	the remainder of a DOCTYPE. Quoted strings are consumed as single tokens by
+	`scan`, so brackets inside them don't affect the count.
+*/
+skip_element :: proc(t: ^Tokenizer) -> (err: Error) {
+	close := 1
+
+	loop: for {
+		tok := scan(t)
+		#partial switch tok.kind {
+		case .EOF:
+			error(t, t.offset, "[skip_element] Premature EOF\n")
+			return .Premature_EOF
+
+		case .Lt:
+			close += 1
+
+		case .Gt:
+			close -= 1
+			if close == 0 {
+				break loop
+			}
+
+		case:
+
+		}
+	}
+	return .None
+}
+
+/*
+	Parses `<!DOCTYPE ident ...>`. Only the identifier is interpreted; the
+	remainder is stored verbatim (interned) in `doc.doctype.rest`.
+*/
+parse_doctype :: proc(doc: ^Document) -> (err: Error) {
+	/*
+		<!DOCTYPE greeting SYSTEM "hello.dtd">
+
+		<!DOCTYPE greeting [
+			<!ELEMENT greeting (#PCDATA)>
+		]>
+	*/
+	assert(doc != nil)
+	context.allocator = doc.allocator
+	t := doc.tokenizer
+
+	tok := expect(t, .Ident) or_return
+	doc.doctype.ident = strings.intern_get(&doc.intern, tok.text)
+
+	skip_whitespace(t)
+	offset := t.offset
+	skip_element(t) or_return
+
+	/*
+		-1 because the current offset is that of the closing tag, so the rest of the DOCTYPE tag ends just before it.
+	*/
+	doc.doctype.rest = strings.intern_get(&doc.intern, string(t.src[offset : t.offset - 1]))
+	return .None
+} \ No newline at end of file