diff options
| author | Jeroen van Rijn <Kelimion@users.noreply.github.com> | 2022-04-28 15:29:00 +0200 |
|---|---|---|
| committer | Jeroen van Rijn <Kelimion@users.noreply.github.com> | 2022-04-28 15:29:00 +0200 |
| commit | 80878264b63cd8476def629526b294b8e129791a (patch) | |
| tree | d50ef7531aa490cad474f3e805a3f199455a12b4 | |
| parent | 6df21d6a9f08deb4dab96fb17f3540ebfbc8b8fe (diff) | |
[xml] Speedup.
| -rw-r--r-- | core/encoding/xml/debug_print.odin | 18 | ||||
| -rw-r--r-- | core/encoding/xml/example/xml_example.odin | 77 | ||||
| -rw-r--r-- | core/encoding/xml/helpers.odin | 28 | ||||
| -rw-r--r-- | core/encoding/xml/tokenizer.odin | 11 | ||||
| -rw-r--r-- | core/encoding/xml/xml_reader.odin | 276 | ||||
| -rw-r--r-- | tests/core/encoding/xml/test_core_xml.odin | 17 |
6 files changed, 245 insertions, 182 deletions
diff --git a/core/encoding/xml/debug_print.odin b/core/encoding/xml/debug_print.odin index e6a8c9433..7c20ac123 100644 --- a/core/encoding/xml/debug_print.odin +++ b/core/encoding/xml/debug_print.odin @@ -1,8 +1,7 @@ -package xml /* An XML 1.0 / 1.1 parser - Copyright 2021 Jeroen van Rijn <nom@duclavier.com>. + Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>. Made available under Odin's BSD-3 license. A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816). @@ -10,6 +9,8 @@ package xml List of contributors: Jeroen van Rijn: Initial implementation. */ +package xml + import "core:io" import "core:fmt" @@ -40,17 +41,16 @@ print :: proc(writer: io.Writer, doc: ^Document) -> (written: int, err: io.Error written += wprintf(writer, "[Pre-root comment] %v\n", comment) } - if doc.root != nil { + if len(doc.elements) > 0 { + wprintln(writer, " --- ") + print_element(writer, doc, 0) wprintln(writer, " --- ") - print_element(writer, doc.root) - wprintln(writer, " --- ") } return written, .None } -print_element :: proc(writer: io.Writer, element: ^Element, indent := 0) -> (written: int, err: io.Error) { - if element == nil { return } +print_element :: proc(writer: io.Writer, doc: ^Document, element_id: Element_ID, indent := 0) -> (written: int, err: io.Error) { using fmt tab :: proc(writer: io.Writer, indent: int) { @@ -61,6 +61,8 @@ print_element :: proc(writer: io.Writer, element: ^Element, indent := 0) -> (wri tab(writer, indent) + element := doc.elements[element_id] + if element.kind == .Element { wprintf(writer, "<%v>\n", element.ident) if len(element.value) > 0 { @@ -74,7 +76,7 @@ print_element :: proc(writer: io.Writer, element: ^Element, indent := 0) -> (wri } for child in element.children { - print_element(writer, child, indent + 1) + print_element(writer, doc, child, indent + 1) } } else if element.kind == .Comment { wprintf(writer, "[COMMENT] %v\n", element.value) diff --git a/core/encoding/xml/example/xml_example.odin b/core/encoding/xml/example/xml_example.odin index daa3c5dab..cadfcfb43 100644 --- a/core/encoding/xml/example/xml_example.odin +++ b/core/encoding/xml/example/xml_example.odin @@ -1,52 +1,85 @@ package xml_example import "core:encoding/xml" -import "core:os" import "core:mem" import "core:fmt" import "core:time" import "core:strings" import "core:hash" +N :: 1 + example :: proc() { using fmt - doc: ^xml.Document - err: xml.Error + docs: [N]^xml.Document + errs: [N]xml.Error + times: [N]time.Duration + + defer for round in 0..<N { + xml.destroy(docs[round]) + } DOC :: #load("../../../../tests/core/assets/XML/unicode.xml") + input := DOC + + for round in 0..<N { + start := time.tick_now() - parse_duration: time.Duration - { - time.SCOPED_TICK_DURATION(&parse_duration) - doc, err = xml.parse(DOC, xml.Options{flags={.Ignore_Unsupported}}) + docs[round], errs[round] = xml.parse(input, xml.Options{ + flags={.Ignore_Unsupported}, + expected_doctype = "", + }) + + end := time.tick_now() + times[round] = time.tick_diff(start, end) } - defer xml.destroy(doc) - ms := time.duration_milliseconds(parse_duration) - speed := (f64(1000.0) / ms) * f64(len(DOC)) / 1_024.0 / 1_024.0 - fmt.printf("Parse time: %v bytes in %.2f ms (%.2f MiB/s).\n", len(DOC), ms, speed) + fastest := time.Duration(max(i64)) + slowest := time.Duration(0) + total := time.Duration(0) + + for round in 0..<N { + fastest = min(fastest, times[round]) + slowest = max(slowest, times[round]) + total += times[round] + } - if err != .None { - printf("Load/Parse error: %v\n", err) - if err == .File_Error { + fastest_ms := time.duration_milliseconds(fastest) + slowest_ms := time.duration_milliseconds(slowest) + average_ms := time.duration_milliseconds(time.Duration(f64(total) / f64(N))) + + fastest_speed := (f64(1000.0) / fastest_ms) * f64(len(DOC)) / 1_024.0 / 1_024.0 + slowest_speed := (f64(1000.0) / slowest_ms) * f64(len(DOC)) / 1_024.0 / 1_024.0 + average_speed := (f64(1000.0) / average_ms) * f64(len(DOC)) / 1_024.0 / 1_024.0 + + fmt.printf("N = %v\n", N) + fmt.printf("[Fastest]: %v bytes in %.2f ms (%.2f MiB/s).\n", len(input), fastest_ms, fastest_speed) + fmt.printf("[Slowest]: %v bytes in %.2f ms (%.2f MiB/s).\n", len(input), slowest_ms, slowest_speed) + fmt.printf("[Average]: %v bytes in %.2f ms (%.2f MiB/s).\n", len(input), average_ms, average_speed) + + if errs[0] != .None { + printf("Load/Parse error: %v\n", errs[0]) + if errs[0] == .File_Error { println("\"unicode.xml\" not found. Did you run \"tests\\download_assets.py\"?") } - os.exit(1) + return } - println("\"unicode.xml\" loaded and parsed.") - - charlist, charlist_ok := xml.find_child_by_ident(doc.root, "charlist") + charlist, charlist_ok := xml.find_child_by_ident(docs[0], 0, "charlist") if !charlist_ok { - eprintln("Could not locate top-level `<charlist>` tag.") - os.exit(1) + eprintln("Could not locate top-level `<charlist>` tag.") + return } - printf("Found `<charlist>` with %v children.\n", len(charlist.children)) + printf("Found `<charlist>` with %v children, %v elements total\n", len(docs[0].elements[charlist].children), docs[0].element_count) - crc32 := doc_hash(doc) + crc32 := doc_hash(docs[0]) printf("[%v] CRC32: 0x%08x\n", "🎉" if crc32 == 0xcaa042b9 else "🤬", crc32) + + for round in 0..<N { + defer xml.destroy(docs[round]) + } } doc_hash :: proc(doc: ^xml.Document, print := false) -> (crc32: u32) { diff --git a/core/encoding/xml/helpers.odin b/core/encoding/xml/helpers.odin index 14597ddbd..48f058334 100644 --- a/core/encoding/xml/helpers.odin +++ b/core/encoding/xml/helpers.odin @@ -1,22 +1,20 @@ -package xml /* An XML 1.0 / 1.1 parser - Copyright 2021 Jeroen van Rijn <nom@duclavier.com>. + Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>. Made available under Odin's BSD-3 license. This file contains helper functions. */ +package xml - -/* - Find `tag`'s nth child with a given ident. -*/ -find_child_by_ident :: proc(tag: ^Element, ident: string, nth := 0) -> (res: ^Element, found: bool) { - if tag == nil { return nil, false } +// Find parent's nth child with a given ident. +find_child_by_ident :: proc(doc: ^Document, parent_id: Element_ID, ident: string, nth := 0) -> (res: Element_ID, found: bool) { + tag := doc.elements[parent_id] count := 0 - for child in tag.children { + for child_id in tag.children { + child := doc.elements[child_id] /* Skip commments. They have no name. */ @@ -26,18 +24,16 @@ find_child_by_ident :: proc(tag: ^Element, ident: string, nth := 0) -> (res: ^El If the ident matches and it's the nth such child, return it. */ if child.ident == ident { - if count == nth { return child, true } + if count == nth { return child_id, true } count += 1 } } - return nil, false + return 0, false } -/* - Find an attribute by key. -*/ -find_attribute_val_by_key :: proc(tag: ^Element, key: string) -> (val: string, found: bool) { - if tag == nil { return "", false } +// Find an attribute by key. +find_attribute_val_by_key :: proc(doc: ^Document, parent_id: Element_ID, key: string) -> (val: string, found: bool) { + tag := doc.elements[parent_id] for attr in tag.attribs { /* diff --git a/core/encoding/xml/tokenizer.odin b/core/encoding/xml/tokenizer.odin index 2da3b7683..c3fece76e 100644 --- a/core/encoding/xml/tokenizer.odin +++ b/core/encoding/xml/tokenizer.odin @@ -1,3 +1,14 @@ +/* + An XML 1.0 / 1.1 parser + + Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>. + Made available under Odin's BSD-3 license. + + A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816). + + List of contributors: + Jeroen van Rijn: Initial implementation. +*/ package xml import "core:fmt" diff --git a/core/encoding/xml/xml_reader.odin b/core/encoding/xml/xml_reader.odin index 0315b0e05..636dd0ae4 100644 --- a/core/encoding/xml/xml_reader.odin +++ b/core/encoding/xml/xml_reader.odin @@ -1,8 +1,7 @@ -package xml /* An XML 1.0 / 1.1 parser - Copyright 2021 Jeroen van Rijn <nom@duclavier.com>. + Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>. Made available under Odin's BSD-3 license. A from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816). @@ -25,12 +24,17 @@ package xml List of contributors: Jeroen van Rijn: Initial implementation. */ +package xml +// An XML 1.0 / 1.1 parser import "core:bytes" -import "core:strings" import "core:encoding/entity" +import "core:intrinsics" import "core:mem" import "core:os" +import "core:strings" + +likely :: intrinsics.expect DEFAULT_Options :: Options{ flags = { @@ -88,7 +92,9 @@ Option_Flag :: enum { Option_Flags :: bit_set[Option_Flag; u16] Document :: struct { - root: ^Element, + elements: [dynamic]Element, + element_count: Element_ID, + prolog: Attributes, encoding: Encoding, @@ -129,8 +135,8 @@ Element :: struct { Comment, }, - parent: ^Element, - children: [dynamic]^Element, + parent: Element_ID, + children: [dynamic]Element_ID, } Attr :: struct { @@ -185,7 +191,7 @@ Error :: enum { No_DocType, Too_Many_DocTypes, - DocType_Must_Proceed_Elements, + DocType_Must_Preceed_Elements, /* If a DOCTYPE is present _or_ the caller @@ -237,12 +243,16 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err doc.tokenizer = t doc.input = data + doc.elements = make([dynamic]Element, 1024, 1024, allocator) + // strings.intern_init(&doc.intern, allocator, allocator) err = .Unexpected_Token - element, parent: ^Element + element, parent: Element_ID - tag_is_open := false + tag_is_open := false + first_element := true + open: Token /* If a DOCTYPE is present, the root tag has to match. @@ -252,6 +262,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err loop: for { skip_whitespace(t) + // NOTE(Jeroen): This is faster as a switch. switch t.ch { case '<': /* @@ -259,118 +270,36 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err */ advance_rune(t) - open := scan(t) - #partial switch open.kind { - - case .Question: - /* - <?xml - */ - next := scan(t) - #partial switch next.kind { - case .Ident: - if len(next.text) == 3 && strings.to_lower(next.text, context.temp_allocator) == "xml" { - parse_prolog(doc) or_return - } else if len(doc.prolog) > 0 { - /* - We've already seen a prolog. - */ - return doc, .Too_Many_Prologs - } else { - /* - Could be `<?xml-stylesheet`, etc. Ignore it. - */ - skip_element(t) or_return - } - case: - error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", next.text) - return - } - - case .Exclaim: - /* - <! - */ - next := scan(t) - #partial switch next.kind { - case .Ident: - switch next.text { - case "DOCTYPE": - if len(doc.doctype.ident) > 0 { - return doc, .Too_Many_DocTypes - } - if doc.root != nil { - return doc, .DocType_Must_Proceed_Elements - } - parse_doctype(doc) or_return - - if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident { - error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident) - return doc, .Invalid_DocType - } - expected_doctype = doc.doctype.ident - - case: - if .Error_on_Unsupported in opts.flags { - error(t, t.offset, "Unhandled: <!%v\n", next.text) - return doc, .Unhandled_Bang - } - skip_element(t) or_return - } - - case .Dash: - /* - Comment: <!-- -->. - The grammar does not allow a comment to end in ---> - */ - expect(t, .Dash) - comment := scan_comment(t) or_return - - if .Intern_Comments in opts.flags { - if doc.root == nil { - append(&doc.comments, comment) - } else { - el := new(Element) - el.parent = element - el.kind = .Comment - el.value = comment - append(&element.children, el) - } - } - - case: - error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next) - return - } - - case .Ident: + open = scan(t) + // NOTE(Jeroen): We're not using a switch because this if-else chain ordered by likelihood is 2.5% faster at -o:size and -o:speed. + if likely(open.kind, Token_Kind.Ident) == .Ident { /* e.g. <odin - Start of new element. */ - element = new(Element) + element = new_element(doc) tag_is_open = true - if doc.root == nil { + if first_element { /* First element. */ - doc.root = element parent = element + first_element = false } else { - append(&parent.children, element) + append(&doc.elements[parent].children, element) } - element.parent = parent - element.ident = open.text + doc.elements[element].parent = parent + doc.elements[element].ident = open.text - parse_attributes(doc, &element.attribs) or_return + parse_attributes(doc, &doc.elements[element].attribs) or_return /* If a DOCTYPE is present _or_ the caller asked for a specific DOCTYPE and the DOCTYPE and root tag don't match, we return .Invalid_Root_Tag. */ - if element == doc.root { + if element == 0 { // Root tag? if len(expected_doctype) > 0 && expected_doctype != open.text { error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text) return doc, .Invalid_DocType @@ -395,7 +324,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err Empty tag. Close it. */ expect(t, .Gt) or_return - parent = element.parent + parent = doc.elements[element].parent element = parent tag_is_open = false @@ -404,22 +333,103 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err return } - case .Slash: + } else if open.kind == .Slash { /* Close tag. */ ident := expect(t, .Ident) or_return _ = expect(t, .Gt) or_return - if element.ident != ident.text { - error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", element.ident, ident.text) + if doc.elements[element].ident != ident.text { + error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", doc.elements[element].ident, ident.text) return doc, .Mismatched_Closing_Tag } - parent = element.parent + parent = doc.elements[element].parent element = parent tag_is_open = false - case: + } else if open.kind == .Exclaim { + /* + <! + */ + next := scan(t) + #partial switch next.kind { + case .Ident: + switch next.text { + case "DOCTYPE": + if len(doc.doctype.ident) > 0 { + return doc, .Too_Many_DocTypes + } + if doc.element_count > 0 { + return doc, .DocType_Must_Preceed_Elements + } + parse_doctype(doc) or_return + + if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident { + error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident) + return doc, .Invalid_DocType + } + expected_doctype = doc.doctype.ident + + case: + if .Error_on_Unsupported in opts.flags { + error(t, t.offset, "Unhandled: <!%v\n", next.text) + return doc, .Unhandled_Bang + } + skip_element(t) or_return + } + + case .Dash: + /* + Comment: <!-- -->. + The grammar does not allow a comment to end in ---> + */ + expect(t, .Dash) + comment := scan_comment(t) or_return + + if .Intern_Comments in opts.flags { + if len(doc.elements) == 0 { + append(&doc.comments, comment) + } else { + el := new_element(doc) + doc.elements[el].parent = element + doc.elements[el].kind = .Comment + doc.elements[el].value = comment + append(&doc.elements[element].children, el) + } + } + + case: + error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next) + return + } + + } else if open.kind == .Question { + /* + <?xml + */ + next := scan(t) + #partial switch next.kind { + case .Ident: + if len(next.text) == 3 && strings.to_lower(next.text, context.temp_allocator) == "xml" { + parse_prolog(doc) or_return + } else if len(doc.prolog) > 0 { + /* + We've already seen a prolog. + */ + return doc, .Too_Many_Prologs + } else { + /* + Could be `<?xml-stylesheet`, etc. Ignore it. + */ + skip_element(t) or_return + } + case: + error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", next.text) + return + } + + } else { error(t, t.offset, "Invalid Token after <: %#v\n", open) return } @@ -442,7 +452,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err needs_processing |= .Decode_SGML_Entities in opts.flags if !needs_processing { - element.value = body_text + doc.elements[element].value = body_text continue } @@ -464,10 +474,10 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err decoded, decode_err := entity.decode_xml(body_text, decode_opts) if decode_err == .None { - element.value = decoded + doc.elements[element].value = decoded append(&doc.strings_to_free, decoded) } else { - element.value = body_text + doc.elements[element].value = body_text } } } @@ -480,6 +490,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err return doc, .No_DocType } + resize(&doc.elements, int(doc.element_count)) return doc, .None } @@ -497,26 +508,14 @@ parse_from_file :: proc(filename: string, options := DEFAULT_Options, error_hand parse :: proc { parse_from_file, parse_from_slice } -free_element :: proc(element: ^Element) { - if element == nil { return } - - for child in element.children { - /* - NOTE: Recursive. - - Could be rewritten so it adds them to a list of pointers to free. - */ - free_element(child) - } - delete(element.attribs) - delete(element.children) - free(element) -} - destroy :: proc(doc: ^Document) { if doc == nil { return } - free_element(doc.root) + for el in doc.elements { + delete(el.attribs) + delete(el.children) + } + delete(doc.elements) delete(doc.prolog) delete(doc.comments) @@ -686,4 +685,25 @@ parse_doctype :: proc(doc: ^Document) -> (err: Error) { */ doc.doctype.rest = string(t.src[offset : t.offset - 1]) return .None +} + +Element_ID :: u32 + +new_element :: proc(doc: ^Document) -> (id: Element_ID) { + element_space := len(doc.elements) + + // Need to resize + if int(doc.element_count) + 1 > element_space { + if element_space < 65536 { + element_space *= 2 + } else { + element_space += 65536 + } + resize(&doc.elements, element_space) + } + + cur := doc.element_count + doc.element_count += 1 + + return cur }
\ No newline at end of file diff --git a/tests/core/encoding/xml/test_core_xml.odin b/tests/core/encoding/xml/test_core_xml.odin index 7669afe97..82386b2bb 100644 --- a/tests/core/encoding/xml/test_core_xml.odin +++ b/tests/core/encoding/xml/test_core_xml.odin @@ -224,7 +224,7 @@ doc_to_string :: proc(doc: ^xml.Document) -> (result: string) { written += wprintf(writer, "[DOCTYPE] %v\n", doc.doctype.ident) if len(doc.doctype.rest) > 0 { - wprintf(writer, "\t%v\n", doc.doctype.rest) + wprintf(writer, "\t%v\n", doc.doctype.rest) } } @@ -232,17 +232,16 @@ doc_to_string :: proc(doc: ^xml.Document) -> (result: string) { written += wprintf(writer, "[Pre-root comment] %v\n", comment) } - if doc.root != nil { - wprintln(writer, " --- ") - print_element(writer, doc.root) - wprintln(writer, " --- ") + if doc.element_count > 0 { + wprintln(writer, " --- ") + print_element(writer, doc, 0) + wprintln(writer, " --- ") } return written, .None } - print_element :: proc(writer: io.Writer, element: ^xml.Element, indent := 0) -> (written: int, err: io.Error) { - if element == nil { return } + print_element :: proc(writer: io.Writer, doc: ^xml.Document, element_id: xml.Element_ID, indent := 0) -> (written: int, err: io.Error) { using fmt tab :: proc(writer: io.Writer, indent: int) { @@ -253,6 +252,8 @@ doc_to_string :: proc(doc: ^xml.Document) -> (result: string) { tab(writer, indent) + element := doc.elements[element_id] + if element.kind == .Element { wprintf(writer, "<%v>\n", element.ident) if len(element.value) > 0 { @@ -266,7 +267,7 @@ doc_to_string :: proc(doc: ^xml.Document) -> (result: string) { } for child in element.children { - print_element(writer, child, indent + 1) + print_element(writer, doc, child, indent + 1) } } else if element.kind == .Comment { wprintf(writer, "[COMMENT] %v\n", element.value) |