From 80878264b63cd8476def629526b294b8e129791a Mon Sep 17 00:00:00 2001 From: Jeroen van Rijn Date: Thu, 28 Apr 2022 15:29:00 +0200 Subject: [xml] Speedup. --- core/encoding/xml/xml_reader.odin | 276 ++++++++++++++++++++------------------ 1 file changed, 148 insertions(+), 128 deletions(-) (limited to 'core/encoding/xml/xml_reader.odin') diff --git a/core/encoding/xml/xml_reader.odin b/core/encoding/xml/xml_reader.odin index 0315b0e05..636dd0ae4 100644 --- a/core/encoding/xml/xml_reader.odin +++ b/core/encoding/xml/xml_reader.odin @@ -1,8 +1,7 @@ -package xml /* An XML 1.0 / 1.1 parser - Copyright 2021 Jeroen van Rijn . + Copyright 2021-2022 Jeroen van Rijn . Made available under Odin's BSD-3 license. A from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816). @@ -25,12 +24,17 @@ package xml List of contributors: Jeroen van Rijn: Initial implementation. */ +package xml +// An XML 1.0 / 1.1 parser import "core:bytes" -import "core:strings" import "core:encoding/entity" +import "core:intrinsics" import "core:mem" import "core:os" +import "core:strings" + +likely :: intrinsics.expect DEFAULT_Options :: Options{ flags = { @@ -88,7 +92,9 @@ Option_Flag :: enum { Option_Flags :: bit_set[Option_Flag; u16] Document :: struct { - root: ^Element, + elements: [dynamic]Element, + element_count: Element_ID, + prolog: Attributes, encoding: Encoding, @@ -129,8 +135,8 @@ Element :: struct { Comment, }, - parent: ^Element, - children: [dynamic]^Element, + parent: Element_ID, + children: [dynamic]Element_ID, } Attr :: struct { @@ -185,7 +191,7 @@ Error :: enum { No_DocType, Too_Many_DocTypes, - DocType_Must_Proceed_Elements, + DocType_Must_Preceed_Elements, /* If a DOCTYPE is present _or_ the caller @@ -237,12 +243,16 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err doc.tokenizer = t doc.input = data + doc.elements = make([dynamic]Element, 1024, 1024, allocator) + // strings.intern_init(&doc.intern, allocator, allocator) err = .Unexpected_Token - element, parent: ^Element + element, parent: Element_ID - tag_is_open := false + tag_is_open := false + first_element := true + open: Token /* If a DOCTYPE is present, the root tag has to match. @@ -252,6 +262,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err loop: for { skip_whitespace(t) + // NOTE(Jeroen): This is faster as a switch. switch t.ch { case '<': /* @@ -259,118 +270,36 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err */ advance_rune(t) - open := scan(t) - #partial switch open.kind { - - case .Question: - /* - 0 { - /* - We've already seen a prolog. - */ - return doc, .Too_Many_Prologs - } else { - /* - Could be ` 0 { - return doc, .Too_Many_DocTypes - } - if doc.root != nil { - return doc, .DocType_Must_Proceed_Elements - } - parse_doctype(doc) or_return - - if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident { - error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident) - return doc, .Invalid_DocType - } - expected_doctype = doc.doctype.ident - - case: - if .Error_on_Unsupported in opts.flags { - error(t, t.offset, "Unhandled: . - The grammar does not allow a comment to end in ---> - */ - expect(t, .Dash) - comment := scan_comment(t) or_return - - if .Intern_Comments in opts.flags { - if doc.root == nil { - append(&doc.comments, comment) - } else { - el := new(Element) - el.parent = element - el.kind = .Comment - el.value = comment - append(&element.children, el) - } - } - - case: - error(t, t.offset, "Invalid Token after 0 && expected_doctype != open.text { error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text) return doc, .Invalid_DocType @@ -395,7 +324,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err Empty tag. Close it. */ expect(t, .Gt) or_return - parent = element.parent + parent = doc.elements[element].parent element = parent tag_is_open = false @@ -404,22 +333,103 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err return } - case .Slash: + } else if open.kind == .Slash { /* Close tag. */ ident := expect(t, .Ident) or_return _ = expect(t, .Gt) or_return - if element.ident != ident.text { - error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", element.ident, ident.text) + if doc.elements[element].ident != ident.text { + error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", doc.elements[element].ident, ident.text) return doc, .Mismatched_Closing_Tag } - parent = element.parent + parent = doc.elements[element].parent element = parent tag_is_open = false - case: + } else if open.kind == .Exclaim { + /* + 0 { + return doc, .Too_Many_DocTypes + } + if doc.element_count > 0 { + return doc, .DocType_Must_Preceed_Elements + } + parse_doctype(doc) or_return + + if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident { + error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident) + return doc, .Invalid_DocType + } + expected_doctype = doc.doctype.ident + + case: + if .Error_on_Unsupported in opts.flags { + error(t, t.offset, "Unhandled: . + The grammar does not allow a comment to end in ---> + */ + expect(t, .Dash) + comment := scan_comment(t) or_return + + if .Intern_Comments in opts.flags { + if len(doc.elements) == 0 { + append(&doc.comments, comment) + } else { + el := new_element(doc) + doc.elements[el].parent = element + doc.elements[el].kind = .Comment + doc.elements[el].value = comment + append(&doc.elements[element].children, el) + } + } + + case: + error(t, t.offset, "Invalid Token after 0 { + /* + We've already seen a prolog. + */ + return doc, .Too_Many_Prologs + } else { + /* + Could be ` (err: Error) { */ doc.doctype.rest = string(t.src[offset : t.offset - 1]) return .None +} + +Element_ID :: u32 + +new_element :: proc(doc: ^Document) -> (id: Element_ID) { + element_space := len(doc.elements) + + // Need to resize + if int(doc.element_count) + 1 > element_space { + if element_space < 65536 { + element_space *= 2 + } else { + element_space += 65536 + } + resize(&doc.elements, element_space) + } + + cur := doc.element_count + doc.element_count += 1 + + return cur } \ No newline at end of file -- cgit v1.2.3