1 files changed, 713 insertions, 0 deletions
diff --git a/core/encoding/xml/xml_reader.odin b/core/encoding/xml/xml_reader.odin
new file mode 100644
index 000000000..b77ae97b3
--- /dev/null
+++ b/core/encoding/xml/xml_reader.odin
@@ -0,0 +1,713 @@
+/*
+	An XML 1.0 / 1.1 parser
+
+	Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
+	Made available under Odin's BSD-3 license.
+
+	A from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
+
+	Features:
+		- Supports enough of the XML 1.0/1.1 spec to handle the 99.9% of XML documents in common current usage.
+		- Simple to understand and use. Small.
+
+	Caveats:
+		- We do NOT support HTML in this package, as that may or may not be valid XML.
+		  If it works, great. If it doesn't, that's not considered a bug.
+
+		- We do NOT support UTF-16. If you have a UTF-16 XML file, please convert it to UTF-8 first. Also, our condolences.
+		- <[!ELEMENT and <[!ATTLIST are not supported, and will be either ignored or return an error depending on the parser options.
+
+	MAYBE:
+	- XML writer?
+	- Serialize/deserialize Odin types?
+
+	List of contributors:
+		Jeroen van Rijn: Initial implementation.
+*/
+package xml
+// An XML 1.0 / 1.1 parser
+
+import "core:bytes"
+import "core:encoding/entity"
+import "core:intrinsics"
+import "core:mem"
+import "core:os"
+import "core:strings"
+
+likely :: intrinsics.expect
+
+DEFAULT_OPTIONS :: Options{
+	flags = {.Ignore_Unsupported},
+	expected_doctype = "",
+}
+
+Option_Flag :: enum {
+	/*
+		If the caller says that input may be modified, we can perform in-situ parsing.
+		If this flag isn't provided, the XML parser first duplicates the input so that it can.
+	*/
+	Input_May_Be_Modified,
+
+	/*
+		Document MUST start with `<?xml` prologue.
+	*/
+	Must_Have_Prolog,
+
+	/*
+		Document MUST have a `<!DOCTYPE`.
+	*/
+	Must_Have_DocType,
+
+	/*
+		By default we skip comments. Use this option to intern a comment on a parented Element.
+	*/
+	Intern_Comments,
+
+	/*
+		How to handle unsupported parts of the specification, like <! other than <!DOCTYPE and <![CDATA[
+	*/
+	Error_on_Unsupported,
+	Ignore_Unsupported,
+
+	/*
+		By default CDATA tags are passed-through as-is.
+		This option unwraps them when encountered.
+	*/
+	Unbox_CDATA,
+
+	/*
+		By default SGML entities like `&gt;`, `&#32;` and `&#x20;` are passed-through as-is.
+		This option decodes them when encountered.
+	*/
+	Decode_SGML_Entities,
+
+	/*
+		If a tag body has a comment, it will be stripped unless this option is given.
+	*/
+	Keep_Tag_Body_Comments,
+}
+Option_Flags :: bit_set[Option_Flag; u16]
+
+Document :: struct {
+	elements:      [dynamic]Element,
+	element_count: Element_ID,
+
+	prologue: Attributes,
+	encoding: Encoding,
+
+	doctype: struct {
+		/*
+			We only scan the <!DOCTYPE IDENT part and skip the rest.
+		*/
+		ident:   string,
+		rest:    string,
+	},
+
+	/*
+		If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live.
+		Otherwise they'll be in the element tree.
+	*/
+	comments: [dynamic]string,
+
+	/*
+		Internal
+	*/
+	tokenizer: ^Tokenizer,
+	allocator: mem.Allocator,
+
+	/*
+		Input. Either the original buffer, or a copy if `.Input_May_Be_Modified` isn't specified.
+	*/
+	input:           []u8,
+	strings_to_free: [dynamic]string,
+}
+
+Element :: struct {
+	ident:   string,
+	value:   string,
+	attribs: Attributes,
+
+	kind: enum {
+		Element = 0,
+		Comment,
+	},
+
+	parent:   Element_ID,
+	children: [dynamic]Element_ID,
+}
+
+Attribute :: struct {
+	key: string,
+	val: string,
+}
+
+Attributes :: [dynamic]Attribute
+
+Options :: struct {
+	flags:            Option_Flags,
+	expected_doctype: string,
+}
+
+Encoding :: enum {
+	Unknown,
+
+	UTF_8,
+	ISO_8859_1,
+
+	/*
+		Aliases
+	*/
+	LATIN_1 = ISO_8859_1,
+}
+
+Error :: enum {
+	/*
+		General return values.
+	*/
+	None = 0,
+	General_Error,
+	Unexpected_Token,
+	Invalid_Token,
+
+	/*
+		Couldn't find, open or read file.
+	*/
+	File_Error,
+
+	/*
+		File too short.
+	*/
+	Premature_EOF,
+
+	/*
+		XML-specific errors.
+	*/
+	No_Prolog,
+	Invalid_Prolog,
+	Too_Many_Prologs,
+
+	No_DocType,
+	Too_Many_DocTypes,
+	DocType_Must_Preceed_Elements,
+
+	/*
+		If a DOCTYPE is present _or_ the caller
+		asked for a specific DOCTYPE and the DOCTYPE
+		and root tag don't match, we return `.Invalid_DocType`.
+	*/
+	Invalid_DocType,
+
+	Invalid_Tag_Value,
+	Mismatched_Closing_Tag,
+
+	Unclosed_Comment,
+	Comment_Before_Root_Element,
+	Invalid_Sequence_In_Comment,
+
+	Unsupported_Version,
+	Unsupported_Encoding,
+
+	/*
+		<!FOO are usually skipped.
+	*/
+	Unhandled_Bang,
+
+	Duplicate_Attribute,
+	Conflicting_Options,
+}
+
+/*
+	Implementation starts here.
+*/
+parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
+	data := data
+	context.allocator = allocator
+
+	opts := validate_options(options) or_return
+
+	/*
+		If `.Input_May_Be_Modified` is not specified, we duplicate the input so that we can modify it in-place.
+	*/
+	if .Input_May_Be_Modified not_in opts.flags {
+		data = bytes.clone(data)
+	}
+
+	t := &Tokenizer{}
+	init(t, string(data), path, error_handler)
+
+	doc = new(Document)
+	doc.allocator = allocator
+	doc.tokenizer = t
+	doc.input     = data
+
+	doc.elements = make([dynamic]Element, 1024, 1024, allocator)
+
+	// strings.intern_init(&doc.intern, allocator, allocator)
+
+	err =            .Unexpected_Token
+	element, parent: Element_ID
+
+	tag_is_open   := false
+	first_element := true
+	open: Token
+
+	/*
+		If a DOCTYPE is present, the root tag has to match.
+		If an expected DOCTYPE is given in options (i.e. it's non-empty), the DOCTYPE (if present) and root tag have to match.
+	*/
+	expected_doctype := options.expected_doctype
+
+	loop: for {
+		skip_whitespace(t)
+		// NOTE(Jeroen): This is faster as a switch.
+		switch t.ch {
+		case '<':
+			/*
+				Consume peeked `<`
+			*/
+			advance_rune(t)
+
+			open = scan(t)
+			// NOTE(Jeroen): We're not using a switch because this if-else chain ordered by likelihood is 2.5% faster at -o:size and -o:speed.
+			if likely(open.kind, Token_Kind.Ident) == .Ident {
+				/*
+					e.g. <odin - Start of new element.
+				*/
+				element = new_element(doc)
+				tag_is_open = true
+
+				if first_element {
+					/*
+						First element.
+					*/
+					parent   = element
+					first_element = false
+				} else {
+					append(&doc.elements[parent].children, element)
+				}
+
+				doc.elements[element].parent = parent
+				doc.elements[element].ident  = open.text
+
+				parse_attributes(doc, &doc.elements[element].attribs) or_return
+
+				/*
+					If a DOCTYPE is present _or_ the caller
+					asked for a specific DOCTYPE and the DOCTYPE
+					and root tag don't match, we return .Invalid_Root_Tag.
+				*/
+				if element == 0 { // Root tag?
+					if len(expected_doctype) > 0 && expected_doctype != open.text {
+						error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text)
+						return doc, .Invalid_DocType
+					}
+				}
+
+				/*
+					One of these should follow:
+					- `>`,  which means we've just opened this tag and expect a later element to close it.
+					- `/>`, which means this is an 'empty' or self-closing tag.
+				*/
+				end_token := scan(t)
+				#partial switch end_token.kind {
+				case .Gt:
+					/*
+						We're now the new parent.
+					*/
+					parent = element
+
+				case .Slash:
+					/*
+						Empty tag. Close it.
+					*/
+					expect(t, .Gt) or_return
+					parent      = doc.elements[element].parent
+					element     = parent
+					tag_is_open = false
+
+				case:
+					error(t, t.offset, "Expected close tag, got: %#v\n", end_token)
+					return
+				}
+
+			} else if open.kind == .Slash {
+				/*
+					Close tag.
+				*/
+				ident := expect(t, .Ident) or_return
+				_      = expect(t, .Gt)    or_return
+
+				if doc.elements[element].ident != ident.text {
+					error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", doc.elements[element].ident, ident.text)
+					return doc, .Mismatched_Closing_Tag
+				}
+				parent      = doc.elements[element].parent
+				element     = parent
+				tag_is_open = false
+
+			} else if open.kind == .Exclaim {
+				/*
+					<!
+				*/
+				next := scan(t)
+				#partial switch next.kind {
+				case .Ident:
+					switch next.text {
+					case "DOCTYPE":
+						if len(doc.doctype.ident) > 0 {
+							return doc, .Too_Many_DocTypes
+						}
+						if doc.element_count > 0 {
+							return doc, .DocType_Must_Preceed_Elements
+						}
+						parse_doctype(doc) or_return
+
+						if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident {
+							error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident)
+							return doc, .Invalid_DocType
+						}
+						expected_doctype = doc.doctype.ident
+
+					case:
+						if .Error_on_Unsupported in opts.flags {
+							error(t, t.offset, "Unhandled: <!%v\n", next.text)
+							return doc, .Unhandled_Bang
+						}
+						skip_element(t) or_return
+					}
+
+				case .Dash:
+					/*
+						Comment: <!-- -->.
+						The grammar does not allow a comment to end in --->
+					*/
+					expect(t, .Dash)
+					comment := scan_comment(t) or_return
+
+					if .Intern_Comments in opts.flags {
+						if len(doc.elements) == 0 {
+							append(&doc.comments, comment)
+						} else {
+							el := new_element(doc)
+							doc.elements[el].parent = element
+							doc.elements[el].kind   = .Comment
+							doc.elements[el].value  = comment
+							append(&doc.elements[element].children, el)
+						}
+					}
+
+				case:
+					error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next)
+					return
+				}
+
+			} else if open.kind == .Question {
+				/*
+					<?xml
+				*/
+				next := scan(t)
+				#partial switch next.kind {
+				case .Ident:
+					if len(next.text) == 3 && strings.to_lower(next.text, context.temp_allocator) == "xml" {
+						parse_prologue(doc) or_return
+					} else if len(doc.prologue) > 0 {
+						/*
+							We've already seen a prologue.
+						*/
+						return doc, .Too_Many_Prologs
+					} else {
+						/*
+							Could be `<?xml-stylesheet`, etc. Ignore it.
+						*/
+						skip_element(t) or_return
+					}
+				case:
+					error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", next.text)
+					return
+				}
+
+			} else {
+				error(t, t.offset, "Invalid Token after <: %#v\n", open)
+				return
+			}
+
+		case -1:
+			/*
+				End of file.
+			*/
+			if tag_is_open {
+				return doc, .Premature_EOF
+			}
+			break loop
+
+		case:
+			/*
+				This should be a tag's body text.
+			*/
+			body_text        := scan_string(t, t.offset) or_return
+			needs_processing := .Unbox_CDATA          in opts.flags
+			needs_processing |= .Decode_SGML_Entities in opts.flags
+
+			if !needs_processing {
+				doc.elements[element].value = body_text
+				continue
+			}
+
+			decode_opts := entity.XML_Decode_Options{}
+			if .Keep_Tag_Body_Comments not_in opts.flags {
+				decode_opts += { .Comment_Strip }
+			}
+
+			if .Decode_SGML_Entities not_in opts.flags {
+				decode_opts += { .No_Entity_Decode }
+			}
+
+			if .Unbox_CDATA in opts.flags {
+				decode_opts += { .Unbox_CDATA }
+				if .Decode_SGML_Entities in opts.flags {
+					decode_opts += { .Decode_CDATA }
+				}
+			}
+
+			decoded, decode_err := entity.decode_xml(body_text, decode_opts)
+			if decode_err == .None {
+				doc.elements[element].value = decoded
+				append(&doc.strings_to_free, decoded)
+			} else {
+				doc.elements[element].value = body_text
+			}
+		}
+	}
+
+	if .Must_Have_Prolog in opts.flags && len(doc.prologue) == 0 {
+		return doc, .No_Prolog
+	}
+
+	if .Must_Have_DocType in opts.flags && len(doc.doctype.ident) == 0 {
+		return doc, .No_DocType
+	}
+
+	resize(&doc.elements, int(doc.element_count))
+	return doc, .None
+}
+
+parse_string :: proc(data: string, options := DEFAULT_OPTIONS, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
+	_data := transmute([]u8)data
+
+	return parse_bytes(_data, options, path, error_handler, allocator)
+}
+
+parse :: proc { parse_string, parse_bytes }
+
+// Load an XML file
+load_from_file :: proc(filename: string, options := DEFAULT_OPTIONS, error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
+	context.allocator = allocator
+	options := options
+
+	data, data_ok := os.read_entire_file(filename)
+	if !data_ok { return {}, .File_Error }
+
+	options.flags += { .Input_May_Be_Modified }
+
+	return parse_bytes(data, options, filename, error_handler, allocator)
+}
+
+destroy :: proc(doc: ^Document) {
+	if doc == nil { return }
+
+	for el in doc.elements {
+		delete(el.attribs)
+		delete(el.children)
+	}
+	delete(doc.elements)
+
+	delete(doc.prologue)
+	delete(doc.comments)
+	delete(doc.input)
+
+	for s in doc.strings_to_free {
+		delete(s)
+	}
+	delete(doc.strings_to_free)
+
+	free(doc)
+}
+
+/*
+	Helpers.
+*/
+
+validate_options :: proc(options: Options) -> (validated: Options, err: Error) {
+	validated = options
+
+	if .Error_on_Unsupported in validated.flags && .Ignore_Unsupported in validated.flags {
+		return options, .Conflicting_Options
+	}
+	return validated, .None
+}
+
+expect :: proc(t: ^Tokenizer, kind: Token_Kind) -> (tok: Token, err: Error) {
+	tok = scan(t)
+	if tok.kind == kind { return tok, .None }
+
+	error(t, t.offset, "Expected \"%v\", got \"%v\".", kind, tok.kind)
+	return tok, .Unexpected_Token
+}
+
+parse_attribute :: proc(doc: ^Document) -> (attr: Attribute, offset: int, err: Error) {
+	assert(doc != nil)
+	context.allocator = doc.allocator
+	t := doc.tokenizer
+
+	key    := expect(t, .Ident)  or_return
+	offset  = t.offset - len(key.text)
+
+	_       = expect(t, .Eq)     or_return
+	value  := expect(t, .String) or_return
+
+	attr.key = key.text
+	attr.val = value.text
+
+	err = .None
+	return
+}
+
+check_duplicate_attributes :: proc(t: ^Tokenizer, attribs: Attributes, attr: Attribute, offset: int) -> (err: Error) {
+	for a in attribs {
+		if attr.key == a.key {
+			error(t, offset, "Duplicate attribute: %v\n", attr.key)
+			return .Duplicate_Attribute
+		}
+	}
+	return .None	
+}
+
+parse_attributes :: proc(doc: ^Document, attribs: ^Attributes) -> (err: Error) {
+	assert(doc != nil)
+	context.allocator = doc.allocator
+	t := doc.tokenizer
+
+	for peek(t).kind == .Ident {
+		attr, offset := parse_attribute(doc)                  or_return
+		check_duplicate_attributes(t, attribs^, attr, offset) or_return
+		append(attribs, attr)
+	}
+	skip_whitespace(t)
+	return .None
+}
+
+parse_prologue :: proc(doc: ^Document) -> (err: Error) {
+	assert(doc != nil)
+	context.allocator = doc.allocator
+	t := doc.tokenizer
+
+	offset := t.offset
+	parse_attributes(doc, &doc.prologue) or_return
+
+	for attr in doc.prologue {
+		switch attr.key {
+		case "version":
+			switch attr.val {
+			case "1.0", "1.1":
+			case:
+				error(t, offset, "[parse_prologue] Warning: Unhandled XML version: %v\n", attr.val)
+			}
+
+		case "encoding":
+			switch strings.to_lower(attr.val, context.temp_allocator) {
+			case "utf-8", "utf8":
+				doc.encoding = .UTF_8
+
+			case "latin-1", "latin1", "iso-8859-1":
+				doc.encoding = .LATIN_1
+
+			case:
+				/*
+					Unrecognized encoding, assume UTF-8.
+				*/
+				error(t, offset, "[parse_prologue] Warning: Unrecognized encoding: %v\n", attr.val)
+			}
+
+		case:
+			// Ignored.
+		}
+	}
+
+	_ = expect(t, .Question) or_return
+	_ = expect(t, .Gt)       or_return
+
+	return .None
+}
+
+skip_element :: proc(t: ^Tokenizer) -> (err: Error) {
+	close := 1
+
+	loop: for {
+		tok := scan(t)
+		#partial switch tok.kind {
+		case .EOF:
+			error(t, t.offset, "[skip_element] Premature EOF\n")
+			return .Premature_EOF
+
+		case .Lt:
+			close += 1
+
+		case .Gt:
+			close -= 1
+			if close == 0 {
+				break loop
+			}
+
+		case:
+
+		}
+	}
+	return .None
+}
+
+parse_doctype :: proc(doc: ^Document) -> (err: Error) {
+	/*
+		<!DOCTYPE greeting SYSTEM "hello.dtd">
+
+		<!DOCTYPE greeting [
+			<!ELEMENT greeting (#PCDATA)>
+		]>
+	*/
+	assert(doc != nil)
+	context.allocator = doc.allocator
+	t := doc.tokenizer
+
+	tok := expect(t, .Ident) or_return
+	doc.doctype.ident = tok.text
+
+	skip_whitespace(t)
+	offset := t.offset
+	skip_element(t) or_return
+
+	/*
+		-1 because the current offset is that of the closing tag, so the rest of the DOCTYPE tag ends just before it.
+	*/
+	doc.doctype.rest = string(t.src[offset : t.offset - 1])
+	return .None
+}
+
+Element_ID :: u32
+
+new_element :: proc(doc: ^Document) -> (id: Element_ID) {
+	element_space := len(doc.elements)
+
+	// Need to resize
+	if int(doc.element_count) + 1 > element_space {
+		if element_space < 65536 {
+			element_space *= 2
+		} else {
+			element_space += 65536
+		}
+		resize(&doc.elements, element_space)
+	}
+
+	cur := doc.element_count
+	doc.element_count += 1
+
+	return cur
+}
+\ No newline at end of file