[xml] Speedup.

author: Jeroen van Rijn <Kelimion@users.noreply.github.com> 2022-04-28 15:29:00 +0200
committer: Jeroen van Rijn <Kelimion@users.noreply.github.com> 2022-04-28 15:29:00 +0200
commit: 80878264b63cd8476def629526b294b8e129791a (patch)
tree: d50ef7531aa490cad474f3e805a3f199455a12b4 /core/encoding
parent: 6df21d6a9f08deb4dab96fb17f3540ebfbc8b8fe (diff)
5 files changed, 236 insertions, 174 deletions
diff --git a/core/encoding/xml/debug_print.odin b/core/encoding/xml/debug_print.odin
index e6a8c9433..7c20ac123 100644
--- a/core/encoding/xml/debug_print.odin
+++ b/core/encoding/xml/debug_print.odin
@@ -1,8 +1,7 @@
-package xml
 /*
 	An XML 1.0 / 1.1 parser
 
-	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
+	Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
 
 	A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
@@ -10,6 +9,8 @@ package xml
 	List of contributors:
 		Jeroen van Rijn: Initial implementation.
 */
+package xml
+
 import "core:io"
 import "core:fmt"
 
@@ -40,17 +41,16 @@ print :: proc(writer: io.Writer, doc: ^Document) -> (written: int, err: io.Error
 		written += wprintf(writer, "[Pre-root comment]  %v\n", comment)
 	}
 
-	if doc.root != nil {
+	if len(doc.elements) > 0 {
+	 	wprintln(writer, " --- ")
+	 	print_element(writer, doc, 0)
 	 	wprintln(writer, " --- ")
-	 	print_element(writer, doc.root)
-	 	wprintln(writer, " --- ")		
 	 }
 
 	return written, .None
 }
 
-print_element :: proc(writer: io.Writer, element: ^Element, indent := 0) -> (written: int, err: io.Error) {
-	if element == nil { return }
+print_element :: proc(writer: io.Writer, doc: ^Document, element_id: Element_ID, indent := 0) -> (written: int, err: io.Error) {
 	using fmt
 
 	tab :: proc(writer: io.Writer, indent: int) {
@@ -61,6 +61,8 @@ print_element :: proc(writer: io.Writer, element: ^Element, indent := 0) -> (wri
 
 	tab(writer, indent)
 
+	element := doc.elements[element_id]
+
 	if element.kind == .Element {
 		wprintf(writer, "<%v>\n", element.ident)
 		if len(element.value) > 0 {
@@ -74,7 +76,7 @@ print_element :: proc(writer: io.Writer, element: ^Element, indent := 0) -> (wri
 		}
 
 		for child in element.children {
-			print_element(writer, child, indent + 1)
+			print_element(writer, doc, child, indent + 1)
 		}
 	} else if element.kind == .Comment {
 		wprintf(writer, "[COMMENT] %v\n", element.value)
diff --git a/core/encoding/xml/example/xml_example.odin b/core/encoding/xml/example/xml_example.odin
index daa3c5dab..cadfcfb43 100644
--- a/core/encoding/xml/example/xml_example.odin
+++ b/core/encoding/xml/example/xml_example.odin
@@ -1,52 +1,85 @@
 package xml_example
 
 import "core:encoding/xml"
-import "core:os"
 import "core:mem"
 import "core:fmt"
 import "core:time"
 import "core:strings"
 import "core:hash"
 
+N :: 1
+
 example :: proc() {
 	using fmt
 
-	doc: ^xml.Document
-	err: xml.Error
+	docs:  [N]^xml.Document
+	errs:  [N]xml.Error
+	times: [N]time.Duration
+
+	defer for round in 0..<N {
+		xml.destroy(docs[round])
+	}
 
 	DOC :: #load("../../../../tests/core/assets/XML/unicode.xml")
+	input := DOC
+
+	for round in 0..<N {
+		start := time.tick_now()
 
-	parse_duration: time.Duration
-	{
-		time.SCOPED_TICK_DURATION(&parse_duration)
-		doc, err = xml.parse(DOC, xml.Options{flags={.Ignore_Unsupported}})
+		docs[round], errs[round] = xml.parse(input, xml.Options{
+			flags={.Ignore_Unsupported},
+			expected_doctype = "",
+		})
+
+		end   := time.tick_now()
+		times[round] = time.tick_diff(start, end)
 	}
-	defer xml.destroy(doc)
 
-	ms := time.duration_milliseconds(parse_duration)
-	speed := (f64(1000.0) / ms) * f64(len(DOC)) / 1_024.0 / 1_024.0
-	fmt.printf("Parse time: %v bytes in %.2f ms (%.2f MiB/s).\n", len(DOC), ms, speed)
+	fastest := time.Duration(max(i64))
+	slowest := time.Duration(0)
+	total   := time.Duration(0)
+
+	for round in 0..<N {
+		fastest = min(fastest, times[round])
+		slowest = max(slowest, times[round])
+		total  += times[round]
+	}
 
-	if err != .None {
-		printf("Load/Parse error: %v\n", err)
-		if err == .File_Error {
+	fastest_ms := time.duration_milliseconds(fastest)
+	slowest_ms := time.duration_milliseconds(slowest)
+	average_ms := time.duration_milliseconds(time.Duration(f64(total) / f64(N)))
+
+	fastest_speed := (f64(1000.0) / fastest_ms) * f64(len(DOC)) / 1_024.0 / 1_024.0
+	slowest_speed := (f64(1000.0) / slowest_ms) * f64(len(DOC)) / 1_024.0 / 1_024.0
+	average_speed := (f64(1000.0) / average_ms) * f64(len(DOC)) / 1_024.0 / 1_024.0
+
+	fmt.printf("N = %v\n", N)
+	fmt.printf("[Fastest]: %v bytes in %.2f ms (%.2f MiB/s).\n", len(input), fastest_ms, fastest_speed)
+	fmt.printf("[Slowest]: %v bytes in %.2f ms (%.2f MiB/s).\n", len(input), slowest_ms, slowest_speed)
+	fmt.printf("[Average]: %v bytes in %.2f ms (%.2f MiB/s).\n", len(input), average_ms, average_speed)
+
+	if errs[0] != .None {
+		printf("Load/Parse error: %v\n", errs[0])
+		if errs[0] == .File_Error {
 			println("\"unicode.xml\" not found. Did you run \"tests\\download_assets.py\"?")
 		}
-		os.exit(1)
+		return
 	}
 
-	println("\"unicode.xml\" loaded and parsed.")
-
-	charlist, charlist_ok := xml.find_child_by_ident(doc.root, "charlist")
+	charlist, charlist_ok := xml.find_child_by_ident(docs[0], 0, "charlist")
 	if !charlist_ok {
-		eprintln("Could not locate top-level `<charlist>` tag.")
-		os.exit(1)
+	 	eprintln("Could not locate top-level `<charlist>` tag.")
+	 	return
 	}
 
-	printf("Found `<charlist>` with %v children.\n", len(charlist.children))
+	printf("Found `<charlist>` with %v children, %v elements total\n", len(docs[0].elements[charlist].children), docs[0].element_count)
 
-	crc32 := doc_hash(doc)
+	crc32 := doc_hash(docs[0])
 	printf("[%v] CRC32: 0x%08x\n", "🎉" if crc32 == 0xcaa042b9 else "🤬", crc32)
+
+	for round in 0..<N {
+		defer xml.destroy(docs[round])
+	}
 }
 
 doc_hash :: proc(doc: ^xml.Document, print := false) -> (crc32: u32) {
diff --git a/core/encoding/xml/helpers.odin b/core/encoding/xml/helpers.odin
index 14597ddbd..48f058334 100644
--- a/core/encoding/xml/helpers.odin
+++ b/core/encoding/xml/helpers.odin
@@ -1,22 +1,20 @@
-package xml
 /*
 	An XML 1.0 / 1.1 parser
 
-	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
+	Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
 
 	This file contains helper functions.
 */
+package xml
 
-
-/*
-	Find `tag`'s nth child with a given ident.
-*/
-find_child_by_ident :: proc(tag: ^Element, ident: string, nth := 0) -> (res: ^Element, found: bool) {
-	if tag == nil                                 { return nil, false }
+// Find parent's nth child with a given ident.
+find_child_by_ident :: proc(doc: ^Document, parent_id: Element_ID, ident: string, nth := 0) -> (res: Element_ID, found: bool) {
+	tag := doc.elements[parent_id]
 
 	count := 0
-	for child in tag.children {
+	for child_id in tag.children {
+		child := doc.elements[child_id]
 		/*
 			Skip commments. They have no name.
 		*/
@@ -26,18 +24,16 @@ find_child_by_ident :: proc(tag: ^Element, ident: string, nth := 0) -> (res: ^El
 			If the ident matches and it's the nth such child, return it.
 		*/
 		if child.ident == ident {
-			if count == nth                       { return child, true }
+			if count == nth                       { return child_id, true }
 			count += 1
 		}
 	}
-	return nil, false
+	return 0, false
 }
 
-/*
-	Find an attribute by key.
-*/
-find_attribute_val_by_key :: proc(tag: ^Element, key: string) -> (val: string, found: bool) {
-	if tag == nil            { return "", false }
+// Find an attribute by key.
+find_attribute_val_by_key :: proc(doc: ^Document, parent_id: Element_ID, key: string) -> (val: string, found: bool) {
+	tag := doc.elements[parent_id]
 
 	for attr in tag.attribs {
 		/*
diff --git a/core/encoding/xml/tokenizer.odin b/core/encoding/xml/tokenizer.odin
index 2da3b7683..c3fece76e 100644
--- a/core/encoding/xml/tokenizer.odin
+++ b/core/encoding/xml/tokenizer.odin
@@ -1,3 +1,14 @@
+/*
+	An XML 1.0 / 1.1 parser
+
+	Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
+	Made available under Odin's BSD-3 license.
+
+	A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
+
+	List of contributors:
+		Jeroen van Rijn: Initial implementation.
+*/
 package xml
 
 import "core:fmt"
diff --git a/core/encoding/xml/xml_reader.odin b/core/encoding/xml/xml_reader.odin
index 0315b0e05..636dd0ae4 100644
--- a/core/encoding/xml/xml_reader.odin
+++ b/core/encoding/xml/xml_reader.odin
@@ -1,8 +1,7 @@
-package xml
 /*
 	An XML 1.0 / 1.1 parser
 
-	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
+	Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
 	Made available under Odin's BSD-3 license.
 
 	A from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
@@ -25,12 +24,17 @@ package xml
 	List of contributors:
 		Jeroen van Rijn: Initial implementation.
 */
+package xml
+// An XML 1.0 / 1.1 parser
 
 import "core:bytes"
-import "core:strings"
 import "core:encoding/entity"
+import "core:intrinsics"
 import "core:mem"
 import "core:os"
+import "core:strings"
+
+likely :: intrinsics.expect
 
 DEFAULT_Options :: Options{
 	flags            = {
@@ -88,7 +92,9 @@ Option_Flag :: enum {
 Option_Flags :: bit_set[Option_Flag; u16]
 
 Document :: struct {
-	root:     ^Element,
+	elements:      [dynamic]Element,
+	element_count: Element_ID,
+
 	prolog:   Attributes,
 	encoding: Encoding,
 
@@ -129,8 +135,8 @@ Element :: struct {
 		Comment,
 	},
 
-	parent:   ^Element,
-	children: [dynamic]^Element,
+	parent:   Element_ID,
+	children: [dynamic]Element_ID,
 }
 
 Attr :: struct {
@@ -185,7 +191,7 @@ Error :: enum {
 
 	No_DocType,
 	Too_Many_DocTypes,
-	DocType_Must_Proceed_Elements,
+	DocType_Must_Preceed_Elements,
 
 	/*
 		If a DOCTYPE is present _or_ the caller
@@ -237,12 +243,16 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 	doc.tokenizer = t
 	doc.input     = data
 
+	doc.elements = make([dynamic]Element, 1024, 1024, allocator)
+
 	// strings.intern_init(&doc.intern, allocator, allocator)
 
 	err =            .Unexpected_Token
-	element, parent: ^Element
+	element, parent: Element_ID
 
-	tag_is_open := false
+	tag_is_open   := false
+	first_element := true
+	open: Token
 
 	/*
 		If a DOCTYPE is present, the root tag has to match.
@@ -252,6 +262,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 
 	loop: for {
 		skip_whitespace(t)
+		// NOTE(Jeroen): This is faster as a switch.
 		switch t.ch {
 		case '<':
 			/*
@@ -259,118 +270,36 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 			*/
 			advance_rune(t)
 
-			open := scan(t)
-			#partial switch open.kind {
-
-			case .Question:
-				/*
-					<?xml
-				*/
-				next := scan(t)
-				#partial switch next.kind {
-				case .Ident:
-					if len(next.text) == 3 && strings.to_lower(next.text, context.temp_allocator) == "xml" {
-						parse_prolog(doc) or_return
-					} else if len(doc.prolog) > 0 {
-						/*
-							We've already seen a prolog.
-						*/
-						return doc, .Too_Many_Prologs
-					} else {
-						/*
-							Could be `<?xml-stylesheet`, etc. Ignore it.
-						*/
-						skip_element(t) or_return
-					}
-				case:
-					error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", next.text)
-					return
-				}
-
-			case .Exclaim:
-				/*
-					<!
-				*/
-				next := scan(t)
-				#partial switch next.kind {
-				case .Ident:
-					switch next.text {
-					case "DOCTYPE":
-						if len(doc.doctype.ident) > 0 {
-							return doc, .Too_Many_DocTypes
-						}
-						if doc.root != nil {
-							return doc, .DocType_Must_Proceed_Elements
-						}
-						parse_doctype(doc) or_return
-
-						if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident {
-							error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident)
-							return doc, .Invalid_DocType
-						}
-						expected_doctype = doc.doctype.ident
-
-					case:
-						if .Error_on_Unsupported in opts.flags {
-							error(t, t.offset, "Unhandled: <!%v\n", next.text)
-							return doc, .Unhandled_Bang
-						}
-						skip_element(t) or_return
-					}
-
-				case .Dash:
-					/*
-						Comment: <!-- -->.
-						The grammar does not allow a comment to end in --->
-					*/
-					expect(t, .Dash)
-					comment := scan_comment(t) or_return
-
-					if .Intern_Comments in opts.flags {
-						if doc.root == nil {
-							append(&doc.comments, comment)
-						} else {
-							el := new(Element)
-							el.parent = element
-							el.kind   = .Comment
-							el.value  = comment
-							append(&element.children, el)
-						}
-					}
-
-				case:
-					error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next)
-					return
-				}
-
-			case .Ident:
+			open = scan(t)
+			// NOTE(Jeroen): We're not using a switch because this if-else chain ordered by likelihood is 2.5% faster at -o:size and -o:speed.
+			if likely(open.kind, Token_Kind.Ident) == .Ident {
 				/*
 					e.g. <odin - Start of new element.
 				*/
-				element = new(Element)
+				element = new_element(doc)
 				tag_is_open = true
 
-				if doc.root == nil {
+				if first_element {
 					/*
 						First element.
 					*/
-					doc.root = element
 					parent   = element
+					first_element = false
 				} else {
-					append(&parent.children, element)
+					append(&doc.elements[parent].children, element)
 				}
 
-				element.parent = parent
-				element.ident  = open.text
+				doc.elements[element].parent = parent
+				doc.elements[element].ident  = open.text
 
-				parse_attributes(doc, &element.attribs) or_return
+				parse_attributes(doc, &doc.elements[element].attribs) or_return
 
 				/*
 					If a DOCTYPE is present _or_ the caller
 					asked for a specific DOCTYPE and the DOCTYPE
 					and root tag don't match, we return .Invalid_Root_Tag.
 				*/
-				if element == doc.root {
+				if element == 0 { // Root tag?
 					if len(expected_doctype) > 0 && expected_doctype != open.text {
 						error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text)
 						return doc, .Invalid_DocType
@@ -395,7 +324,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 						Empty tag. Close it.
 					*/
 					expect(t, .Gt) or_return
-					parent      = element.parent
+					parent      = doc.elements[element].parent
 					element     = parent
 					tag_is_open = false
 
@@ -404,22 +333,103 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 					return
 				}
 
-			case .Slash:
+			} else if open.kind == .Slash {
 				/*
 					Close tag.
 				*/
 				ident := expect(t, .Ident) or_return
 				_      = expect(t, .Gt)    or_return
 
-				if element.ident != ident.text {
-					error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", element.ident, ident.text)
+				if doc.elements[element].ident != ident.text {
+					error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", doc.elements[element].ident, ident.text)
 					return doc, .Mismatched_Closing_Tag
 				}
-				parent      = element.parent
+				parent      = doc.elements[element].parent
 				element     = parent
 				tag_is_open = false
 
-			case:
+			} else if open.kind == .Exclaim {
+				/*
+					<!
+				*/
+				next := scan(t)
+				#partial switch next.kind {
+				case .Ident:
+					switch next.text {
+					case "DOCTYPE":
+						if len(doc.doctype.ident) > 0 {
+							return doc, .Too_Many_DocTypes
+						}
+						if doc.element_count > 0 {
+							return doc, .DocType_Must_Preceed_Elements
+						}
+						parse_doctype(doc) or_return
+
+						if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident {
+							error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident)
+							return doc, .Invalid_DocType
+						}
+						expected_doctype = doc.doctype.ident
+
+					case:
+						if .Error_on_Unsupported in opts.flags {
+							error(t, t.offset, "Unhandled: <!%v\n", next.text)
+							return doc, .Unhandled_Bang
+						}
+						skip_element(t) or_return
+					}
+
+				case .Dash:
+					/*
+						Comment: <!-- -->.
+						The grammar does not allow a comment to end in --->
+					*/
+					expect(t, .Dash)
+					comment := scan_comment(t) or_return
+
+					if .Intern_Comments in opts.flags {
+						if len(doc.elements) == 0 {
+							append(&doc.comments, comment)
+						} else {
+							el := new_element(doc)
+							doc.elements[el].parent = element
+							doc.elements[el].kind   = .Comment
+							doc.elements[el].value  = comment
+							append(&doc.elements[element].children, el)
+						}
+					}
+
+				case:
+					error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next)
+					return
+				}
+
+			} else if open.kind == .Question {
+				/*
+					<?xml
+				*/
+				next := scan(t)
+				#partial switch next.kind {
+				case .Ident:
+					if len(next.text) == 3 && strings.to_lower(next.text, context.temp_allocator) == "xml" {
+						parse_prolog(doc) or_return
+					} else if len(doc.prolog) > 0 {
+						/*
+							We've already seen a prolog.
+						*/
+						return doc, .Too_Many_Prologs
+					} else {
+						/*
+							Could be `<?xml-stylesheet`, etc. Ignore it.
+						*/
+						skip_element(t) or_return
+					}
+				case:
+					error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", next.text)
+					return
+				}
+
+			} else {
 				error(t, t.offset, "Invalid Token after <: %#v\n", open)
 				return
 			}
@@ -442,7 +452,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 			needs_processing |= .Decode_SGML_Entities in opts.flags
 
 			if !needs_processing {
-				element.value = body_text
+				doc.elements[element].value = body_text
 				continue
 			}
 
@@ -464,10 +474,10 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 
 			decoded, decode_err := entity.decode_xml(body_text, decode_opts)
 			if decode_err == .None {
-				element.value = decoded
+				doc.elements[element].value = decoded
 				append(&doc.strings_to_free, decoded)
 			} else {
-				element.value = body_text
+				doc.elements[element].value = body_text
 			}
 		}
 	}
@@ -480,6 +490,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
 		return doc, .No_DocType
 	}
 
+	resize(&doc.elements, int(doc.element_count))
 	return doc, .None
 }
 
@@ -497,26 +508,14 @@ parse_from_file :: proc(filename: string, options := DEFAULT_Options, error_hand
 
 parse :: proc { parse_from_file, parse_from_slice }
 
-free_element :: proc(element: ^Element) {
-	if element == nil { return }
-
-	for child in element.children {
-		/*
-			NOTE: Recursive.
-
-			Could be rewritten so it adds them to a list of pointers to free.
-		*/
-		free_element(child)
-	}
-	delete(element.attribs)
-	delete(element.children)
-	free(element)
-}
-
 destroy :: proc(doc: ^Document) {
 	if doc == nil { return }
 
-	free_element(doc.root)
+	for el in doc.elements {
+		delete(el.attribs)
+		delete(el.children)
+	}
+	delete(doc.elements)
 
 	delete(doc.prolog)
 	delete(doc.comments)
@@ -686,4 +685,25 @@ parse_doctype :: proc(doc: ^Document) -> (err: Error) {
 	*/
 	doc.doctype.rest = string(t.src[offset : t.offset - 1])
 	return .None
+}
+
+Element_ID :: u32
+
+new_element :: proc(doc: ^Document) -> (id: Element_ID) {
+	element_space := len(doc.elements)
+
+	// Need to resize
+	if int(doc.element_count) + 1 > element_space {
+		if element_space < 65536 {
+			element_space *= 2
+		} else {
+			element_space += 65536
+		}
+		resize(&doc.elements, element_space)
+	}
+
+	cur := doc.element_count
+	doc.element_count += 1
+
+	return cur
 }
 \ No newline at end of file
author	Jeroen van Rijn <Kelimion@users.noreply.github.com>	2022-04-28 15:29:00 +0200
committer	Jeroen van Rijn <Kelimion@users.noreply.github.com>	2022-04-28 15:29:00 +0200
commit	80878264b63cd8476def629526b294b8e129791a (patch)
tree	d50ef7531aa490cad474f3e805a3f199455a12b4 /core/encoding
parent	6df21d6a9f08deb4dab96fb17f3540ebfbc8b8fe (diff)