aboutsummaryrefslogtreecommitdiff
path: root/core/encoding
diff options
context:
space:
mode:
authorJeroen van Rijn <Kelimion@users.noreply.github.com>2022-04-28 15:29:00 +0200
committerJeroen van Rijn <Kelimion@users.noreply.github.com>2022-04-28 15:29:00 +0200
commit80878264b63cd8476def629526b294b8e129791a (patch)
treed50ef7531aa490cad474f3e805a3f199455a12b4 /core/encoding
parent6df21d6a9f08deb4dab96fb17f3540ebfbc8b8fe (diff)
[xml] Speedup.
Diffstat (limited to 'core/encoding')
-rw-r--r--core/encoding/xml/debug_print.odin18
-rw-r--r--core/encoding/xml/example/xml_example.odin77
-rw-r--r--core/encoding/xml/helpers.odin28
-rw-r--r--core/encoding/xml/tokenizer.odin11
-rw-r--r--core/encoding/xml/xml_reader.odin276
5 files changed, 236 insertions, 174 deletions
diff --git a/core/encoding/xml/debug_print.odin b/core/encoding/xml/debug_print.odin
index e6a8c9433..7c20ac123 100644
--- a/core/encoding/xml/debug_print.odin
+++ b/core/encoding/xml/debug_print.odin
@@ -1,8 +1,7 @@
-package xml
/*
An XML 1.0 / 1.1 parser
- Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
+ Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
Made available under Odin's BSD-3 license.
A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
@@ -10,6 +9,8 @@ package xml
List of contributors:
Jeroen van Rijn: Initial implementation.
*/
+package xml
+
import "core:io"
import "core:fmt"
@@ -40,17 +41,16 @@ print :: proc(writer: io.Writer, doc: ^Document) -> (written: int, err: io.Error
written += wprintf(writer, "[Pre-root comment] %v\n", comment)
}
- if doc.root != nil {
+ if len(doc.elements) > 0 {
+ wprintln(writer, " --- ")
+ print_element(writer, doc, 0)
wprintln(writer, " --- ")
- print_element(writer, doc.root)
- wprintln(writer, " --- ")
}
return written, .None
}
-print_element :: proc(writer: io.Writer, element: ^Element, indent := 0) -> (written: int, err: io.Error) {
- if element == nil { return }
+print_element :: proc(writer: io.Writer, doc: ^Document, element_id: Element_ID, indent := 0) -> (written: int, err: io.Error) {
using fmt
tab :: proc(writer: io.Writer, indent: int) {
@@ -61,6 +61,8 @@ print_element :: proc(writer: io.Writer, element: ^Element, indent := 0) -> (wri
tab(writer, indent)
+ element := doc.elements[element_id]
+
if element.kind == .Element {
wprintf(writer, "<%v>\n", element.ident)
if len(element.value) > 0 {
@@ -74,7 +76,7 @@ print_element :: proc(writer: io.Writer, element: ^Element, indent := 0) -> (wri
}
for child in element.children {
- print_element(writer, child, indent + 1)
+ print_element(writer, doc, child, indent + 1)
}
} else if element.kind == .Comment {
wprintf(writer, "[COMMENT] %v\n", element.value)
diff --git a/core/encoding/xml/example/xml_example.odin b/core/encoding/xml/example/xml_example.odin
index daa3c5dab..cadfcfb43 100644
--- a/core/encoding/xml/example/xml_example.odin
+++ b/core/encoding/xml/example/xml_example.odin
@@ -1,52 +1,85 @@
package xml_example
import "core:encoding/xml"
-import "core:os"
import "core:mem"
import "core:fmt"
import "core:time"
import "core:strings"
import "core:hash"
+N :: 1
+
example :: proc() {
using fmt
- doc: ^xml.Document
- err: xml.Error
+ docs: [N]^xml.Document
+ errs: [N]xml.Error
+ times: [N]time.Duration
+
+ defer for round in 0..<N {
+ xml.destroy(docs[round])
+ }
DOC :: #load("../../../../tests/core/assets/XML/unicode.xml")
+ input := DOC
+
+ for round in 0..<N {
+ start := time.tick_now()
- parse_duration: time.Duration
- {
- time.SCOPED_TICK_DURATION(&parse_duration)
- doc, err = xml.parse(DOC, xml.Options{flags={.Ignore_Unsupported}})
+ docs[round], errs[round] = xml.parse(input, xml.Options{
+ flags={.Ignore_Unsupported},
+ expected_doctype = "",
+ })
+
+ end := time.tick_now()
+ times[round] = time.tick_diff(start, end)
}
- defer xml.destroy(doc)
- ms := time.duration_milliseconds(parse_duration)
- speed := (f64(1000.0) / ms) * f64(len(DOC)) / 1_024.0 / 1_024.0
- fmt.printf("Parse time: %v bytes in %.2f ms (%.2f MiB/s).\n", len(DOC), ms, speed)
+ fastest := time.Duration(max(i64))
+ slowest := time.Duration(0)
+ total := time.Duration(0)
+
+ for round in 0..<N {
+ fastest = min(fastest, times[round])
+ slowest = max(slowest, times[round])
+ total += times[round]
+ }
- if err != .None {
- printf("Load/Parse error: %v\n", err)
- if err == .File_Error {
+ fastest_ms := time.duration_milliseconds(fastest)
+ slowest_ms := time.duration_milliseconds(slowest)
+ average_ms := time.duration_milliseconds(time.Duration(f64(total) / f64(N)))
+
+ fastest_speed := (f64(1000.0) / fastest_ms) * f64(len(DOC)) / 1_024.0 / 1_024.0
+ slowest_speed := (f64(1000.0) / slowest_ms) * f64(len(DOC)) / 1_024.0 / 1_024.0
+ average_speed := (f64(1000.0) / average_ms) * f64(len(DOC)) / 1_024.0 / 1_024.0
+
+ fmt.printf("N = %v\n", N)
+ fmt.printf("[Fastest]: %v bytes in %.2f ms (%.2f MiB/s).\n", len(input), fastest_ms, fastest_speed)
+ fmt.printf("[Slowest]: %v bytes in %.2f ms (%.2f MiB/s).\n", len(input), slowest_ms, slowest_speed)
+ fmt.printf("[Average]: %v bytes in %.2f ms (%.2f MiB/s).\n", len(input), average_ms, average_speed)
+
+ if errs[0] != .None {
+ printf("Load/Parse error: %v\n", errs[0])
+ if errs[0] == .File_Error {
println("\"unicode.xml\" not found. Did you run \"tests\\download_assets.py\"?")
}
- os.exit(1)
+ return
}
- println("\"unicode.xml\" loaded and parsed.")
-
- charlist, charlist_ok := xml.find_child_by_ident(doc.root, "charlist")
+ charlist, charlist_ok := xml.find_child_by_ident(docs[0], 0, "charlist")
if !charlist_ok {
- eprintln("Could not locate top-level `<charlist>` tag.")
- os.exit(1)
+ eprintln("Could not locate top-level `<charlist>` tag.")
+ return
}
- printf("Found `<charlist>` with %v children.\n", len(charlist.children))
+ printf("Found `<charlist>` with %v children, %v elements total\n", len(docs[0].elements[charlist].children), docs[0].element_count)
- crc32 := doc_hash(doc)
+ crc32 := doc_hash(docs[0])
printf("[%v] CRC32: 0x%08x\n", "🎉" if crc32 == 0xcaa042b9 else "🤬", crc32)
+
+ for round in 0..<N {
+ defer xml.destroy(docs[round])
+ }
}
doc_hash :: proc(doc: ^xml.Document, print := false) -> (crc32: u32) {
diff --git a/core/encoding/xml/helpers.odin b/core/encoding/xml/helpers.odin
index 14597ddbd..48f058334 100644
--- a/core/encoding/xml/helpers.odin
+++ b/core/encoding/xml/helpers.odin
@@ -1,22 +1,20 @@
-package xml
/*
An XML 1.0 / 1.1 parser
- Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
+ Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
Made available under Odin's BSD-3 license.
This file contains helper functions.
*/
+package xml
-
-/*
- Find `tag`'s nth child with a given ident.
-*/
-find_child_by_ident :: proc(tag: ^Element, ident: string, nth := 0) -> (res: ^Element, found: bool) {
- if tag == nil { return nil, false }
+// Find parent's nth child with a given ident.
+find_child_by_ident :: proc(doc: ^Document, parent_id: Element_ID, ident: string, nth := 0) -> (res: Element_ID, found: bool) {
+ tag := doc.elements[parent_id]
count := 0
- for child in tag.children {
+ for child_id in tag.children {
+ child := doc.elements[child_id]
/*
Skip commments. They have no name.
*/
@@ -26,18 +24,16 @@ find_child_by_ident :: proc(tag: ^Element, ident: string, nth := 0) -> (res: ^El
If the ident matches and it's the nth such child, return it.
*/
if child.ident == ident {
- if count == nth { return child, true }
+ if count == nth { return child_id, true }
count += 1
}
}
- return nil, false
+ return 0, false
}
-/*
- Find an attribute by key.
-*/
-find_attribute_val_by_key :: proc(tag: ^Element, key: string) -> (val: string, found: bool) {
- if tag == nil { return "", false }
+// Find an attribute by key.
+find_attribute_val_by_key :: proc(doc: ^Document, parent_id: Element_ID, key: string) -> (val: string, found: bool) {
+ tag := doc.elements[parent_id]
for attr in tag.attribs {
/*
diff --git a/core/encoding/xml/tokenizer.odin b/core/encoding/xml/tokenizer.odin
index 2da3b7683..c3fece76e 100644
--- a/core/encoding/xml/tokenizer.odin
+++ b/core/encoding/xml/tokenizer.odin
@@ -1,3 +1,14 @@
+/*
+ An XML 1.0 / 1.1 parser
+
+ Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
+ Made available under Odin's BSD-3 license.
+
+ A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
+
+ List of contributors:
+ Jeroen van Rijn: Initial implementation.
+*/
package xml
import "core:fmt"
diff --git a/core/encoding/xml/xml_reader.odin b/core/encoding/xml/xml_reader.odin
index 0315b0e05..636dd0ae4 100644
--- a/core/encoding/xml/xml_reader.odin
+++ b/core/encoding/xml/xml_reader.odin
@@ -1,8 +1,7 @@
-package xml
/*
An XML 1.0 / 1.1 parser
- Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
+ Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
Made available under Odin's BSD-3 license.
A from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
@@ -25,12 +24,17 @@ package xml
List of contributors:
Jeroen van Rijn: Initial implementation.
*/
+package xml
+// An XML 1.0 / 1.1 parser
import "core:bytes"
-import "core:strings"
import "core:encoding/entity"
+import "core:intrinsics"
import "core:mem"
import "core:os"
+import "core:strings"
+
+likely :: intrinsics.expect
DEFAULT_Options :: Options{
flags = {
@@ -88,7 +92,9 @@ Option_Flag :: enum {
Option_Flags :: bit_set[Option_Flag; u16]
Document :: struct {
- root: ^Element,
+ elements: [dynamic]Element,
+ element_count: Element_ID,
+
prolog: Attributes,
encoding: Encoding,
@@ -129,8 +135,8 @@ Element :: struct {
Comment,
},
- parent: ^Element,
- children: [dynamic]^Element,
+ parent: Element_ID,
+ children: [dynamic]Element_ID,
}
Attr :: struct {
@@ -185,7 +191,7 @@ Error :: enum {
No_DocType,
Too_Many_DocTypes,
- DocType_Must_Proceed_Elements,
+ DocType_Must_Preceed_Elements,
/*
If a DOCTYPE is present _or_ the caller
@@ -237,12 +243,16 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
doc.tokenizer = t
doc.input = data
+ doc.elements = make([dynamic]Element, 1024, 1024, allocator)
+
// strings.intern_init(&doc.intern, allocator, allocator)
err = .Unexpected_Token
- element, parent: ^Element
+ element, parent: Element_ID
- tag_is_open := false
+ tag_is_open := false
+ first_element := true
+ open: Token
/*
If a DOCTYPE is present, the root tag has to match.
@@ -252,6 +262,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
loop: for {
skip_whitespace(t)
+ // NOTE(Jeroen): This is faster as a switch.
switch t.ch {
case '<':
/*
@@ -259,118 +270,36 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
*/
advance_rune(t)
- open := scan(t)
- #partial switch open.kind {
-
- case .Question:
- /*
- <?xml
- */
- next := scan(t)
- #partial switch next.kind {
- case .Ident:
- if len(next.text) == 3 && strings.to_lower(next.text, context.temp_allocator) == "xml" {
- parse_prolog(doc) or_return
- } else if len(doc.prolog) > 0 {
- /*
- We've already seen a prolog.
- */
- return doc, .Too_Many_Prologs
- } else {
- /*
- Could be `<?xml-stylesheet`, etc. Ignore it.
- */
- skip_element(t) or_return
- }
- case:
- error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", next.text)
- return
- }
-
- case .Exclaim:
- /*
- <!
- */
- next := scan(t)
- #partial switch next.kind {
- case .Ident:
- switch next.text {
- case "DOCTYPE":
- if len(doc.doctype.ident) > 0 {
- return doc, .Too_Many_DocTypes
- }
- if doc.root != nil {
- return doc, .DocType_Must_Proceed_Elements
- }
- parse_doctype(doc) or_return
-
- if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident {
- error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident)
- return doc, .Invalid_DocType
- }
- expected_doctype = doc.doctype.ident
-
- case:
- if .Error_on_Unsupported in opts.flags {
- error(t, t.offset, "Unhandled: <!%v\n", next.text)
- return doc, .Unhandled_Bang
- }
- skip_element(t) or_return
- }
-
- case .Dash:
- /*
- Comment: <!-- -->.
- The grammar does not allow a comment to end in --->
- */
- expect(t, .Dash)
- comment := scan_comment(t) or_return
-
- if .Intern_Comments in opts.flags {
- if doc.root == nil {
- append(&doc.comments, comment)
- } else {
- el := new(Element)
- el.parent = element
- el.kind = .Comment
- el.value = comment
- append(&element.children, el)
- }
- }
-
- case:
- error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next)
- return
- }
-
- case .Ident:
+ open = scan(t)
+ // NOTE(Jeroen): We're not using a switch because this if-else chain ordered by likelihood is 2.5% faster at -o:size and -o:speed.
+ if likely(open.kind, Token_Kind.Ident) == .Ident {
/*
e.g. <odin - Start of new element.
*/
- element = new(Element)
+ element = new_element(doc)
tag_is_open = true
- if doc.root == nil {
+ if first_element {
/*
First element.
*/
- doc.root = element
parent = element
+ first_element = false
} else {
- append(&parent.children, element)
+ append(&doc.elements[parent].children, element)
}
- element.parent = parent
- element.ident = open.text
+ doc.elements[element].parent = parent
+ doc.elements[element].ident = open.text
- parse_attributes(doc, &element.attribs) or_return
+ parse_attributes(doc, &doc.elements[element].attribs) or_return
/*
If a DOCTYPE is present _or_ the caller
asked for a specific DOCTYPE and the DOCTYPE
and root tag don't match, we return .Invalid_Root_Tag.
*/
- if element == doc.root {
+ if element == 0 { // Root tag?
if len(expected_doctype) > 0 && expected_doctype != open.text {
error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text)
return doc, .Invalid_DocType
@@ -395,7 +324,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
Empty tag. Close it.
*/
expect(t, .Gt) or_return
- parent = element.parent
+ parent = doc.elements[element].parent
element = parent
tag_is_open = false
@@ -404,22 +333,103 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
return
}
- case .Slash:
+ } else if open.kind == .Slash {
/*
Close tag.
*/
ident := expect(t, .Ident) or_return
_ = expect(t, .Gt) or_return
- if element.ident != ident.text {
- error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", element.ident, ident.text)
+ if doc.elements[element].ident != ident.text {
+ error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", doc.elements[element].ident, ident.text)
return doc, .Mismatched_Closing_Tag
}
- parent = element.parent
+ parent = doc.elements[element].parent
element = parent
tag_is_open = false
- case:
+ } else if open.kind == .Exclaim {
+ /*
+ <!
+ */
+ next := scan(t)
+ #partial switch next.kind {
+ case .Ident:
+ switch next.text {
+ case "DOCTYPE":
+ if len(doc.doctype.ident) > 0 {
+ return doc, .Too_Many_DocTypes
+ }
+ if doc.element_count > 0 {
+ return doc, .DocType_Must_Preceed_Elements
+ }
+ parse_doctype(doc) or_return
+
+ if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident {
+ error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident)
+ return doc, .Invalid_DocType
+ }
+ expected_doctype = doc.doctype.ident
+
+ case:
+ if .Error_on_Unsupported in opts.flags {
+ error(t, t.offset, "Unhandled: <!%v\n", next.text)
+ return doc, .Unhandled_Bang
+ }
+ skip_element(t) or_return
+ }
+
+ case .Dash:
+ /*
+ Comment: <!-- -->.
+ The grammar does not allow a comment to end in --->
+ */
+ expect(t, .Dash)
+ comment := scan_comment(t) or_return
+
+ if .Intern_Comments in opts.flags {
+ if len(doc.elements) == 0 {
+ append(&doc.comments, comment)
+ } else {
+ el := new_element(doc)
+ doc.elements[el].parent = element
+ doc.elements[el].kind = .Comment
+ doc.elements[el].value = comment
+ append(&doc.elements[element].children, el)
+ }
+ }
+
+ case:
+ error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next)
+ return
+ }
+
+ } else if open.kind == .Question {
+ /*
+ <?xml
+ */
+ next := scan(t)
+ #partial switch next.kind {
+ case .Ident:
+ if len(next.text) == 3 && strings.to_lower(next.text, context.temp_allocator) == "xml" {
+ parse_prolog(doc) or_return
+ } else if len(doc.prolog) > 0 {
+ /*
+ We've already seen a prolog.
+ */
+ return doc, .Too_Many_Prologs
+ } else {
+ /*
+ Could be `<?xml-stylesheet`, etc. Ignore it.
+ */
+ skip_element(t) or_return
+ }
+ case:
+ error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", next.text)
+ return
+ }
+
+ } else {
error(t, t.offset, "Invalid Token after <: %#v\n", open)
return
}
@@ -442,7 +452,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
needs_processing |= .Decode_SGML_Entities in opts.flags
if !needs_processing {
- element.value = body_text
+ doc.elements[element].value = body_text
continue
}
@@ -464,10 +474,10 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
decoded, decode_err := entity.decode_xml(body_text, decode_opts)
if decode_err == .None {
- element.value = decoded
+ doc.elements[element].value = decoded
append(&doc.strings_to_free, decoded)
} else {
- element.value = body_text
+ doc.elements[element].value = body_text
}
}
}
@@ -480,6 +490,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
return doc, .No_DocType
}
+ resize(&doc.elements, int(doc.element_count))
return doc, .None
}
@@ -497,26 +508,14 @@ parse_from_file :: proc(filename: string, options := DEFAULT_Options, error_hand
parse :: proc { parse_from_file, parse_from_slice }
-free_element :: proc(element: ^Element) {
- if element == nil { return }
-
- for child in element.children {
- /*
- NOTE: Recursive.
-
- Could be rewritten so it adds them to a list of pointers to free.
- */
- free_element(child)
- }
- delete(element.attribs)
- delete(element.children)
- free(element)
-}
-
destroy :: proc(doc: ^Document) {
if doc == nil { return }
- free_element(doc.root)
+ for el in doc.elements {
+ delete(el.attribs)
+ delete(el.children)
+ }
+ delete(doc.elements)
delete(doc.prolog)
delete(doc.comments)
@@ -686,4 +685,25 @@ parse_doctype :: proc(doc: ^Document) -> (err: Error) {
*/
doc.doctype.rest = string(t.src[offset : t.offset - 1])
return .None
+}
+
+Element_ID :: u32
+
+new_element :: proc(doc: ^Document) -> (id: Element_ID) {
+ element_space := len(doc.elements)
+
+ // Need to resize
+ if int(doc.element_count) + 1 > element_space {
+ if element_space < 65536 {
+ element_space *= 2
+ } else {
+ element_space += 65536
+ }
+ resize(&doc.elements, element_space)
+ }
+
+ cur := doc.element_count
+ doc.element_count += 1
+
+ return cur
} \ No newline at end of file