diff options
Diffstat (limited to 'core/encoding/xml/xml_reader.odin')
| -rw-r--r-- | core/encoding/xml/xml_reader.odin | 276 |
1 files changed, 148 insertions, 128 deletions
diff --git a/core/encoding/xml/xml_reader.odin b/core/encoding/xml/xml_reader.odin index 0315b0e05..636dd0ae4 100644 --- a/core/encoding/xml/xml_reader.odin +++ b/core/encoding/xml/xml_reader.odin @@ -1,8 +1,7 @@ -package xml /* An XML 1.0 / 1.1 parser - Copyright 2021 Jeroen van Rijn <nom@duclavier.com>. + Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>. Made available under Odin's BSD-3 license. A from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816). @@ -25,12 +24,17 @@ package xml List of contributors: Jeroen van Rijn: Initial implementation. */ +package xml +// An XML 1.0 / 1.1 parser import "core:bytes" -import "core:strings" import "core:encoding/entity" +import "core:intrinsics" import "core:mem" import "core:os" +import "core:strings" + +likely :: intrinsics.expect DEFAULT_Options :: Options{ flags = { @@ -88,7 +92,9 @@ Option_Flag :: enum { Option_Flags :: bit_set[Option_Flag; u16] Document :: struct { - root: ^Element, + elements: [dynamic]Element, + element_count: Element_ID, + prolog: Attributes, encoding: Encoding, @@ -129,8 +135,8 @@ Element :: struct { Comment, }, - parent: ^Element, - children: [dynamic]^Element, + parent: Element_ID, + children: [dynamic]Element_ID, } Attr :: struct { @@ -185,7 +191,7 @@ Error :: enum { No_DocType, Too_Many_DocTypes, - DocType_Must_Proceed_Elements, + DocType_Must_Preceed_Elements, /* If a DOCTYPE is present _or_ the caller @@ -237,12 +243,16 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err doc.tokenizer = t doc.input = data + doc.elements = make([dynamic]Element, 1024, 1024, allocator) + // strings.intern_init(&doc.intern, allocator, allocator) err = .Unexpected_Token - element, parent: ^Element + element, parent: Element_ID - tag_is_open := false + tag_is_open := false + first_element := true + open: Token /* If a DOCTYPE is present, the root tag has to match. @@ -252,6 +262,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err loop: for { skip_whitespace(t) + // NOTE(Jeroen): This is faster as a switch. switch t.ch { case '<': /* @@ -259,118 +270,36 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err */ advance_rune(t) - open := scan(t) - #partial switch open.kind { - - case .Question: - /* - <?xml - */ - next := scan(t) - #partial switch next.kind { - case .Ident: - if len(next.text) == 3 && strings.to_lower(next.text, context.temp_allocator) == "xml" { - parse_prolog(doc) or_return - } else if len(doc.prolog) > 0 { - /* - We've already seen a prolog. - */ - return doc, .Too_Many_Prologs - } else { - /* - Could be `<?xml-stylesheet`, etc. Ignore it. - */ - skip_element(t) or_return - } - case: - error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", next.text) - return - } - - case .Exclaim: - /* - <! - */ - next := scan(t) - #partial switch next.kind { - case .Ident: - switch next.text { - case "DOCTYPE": - if len(doc.doctype.ident) > 0 { - return doc, .Too_Many_DocTypes - } - if doc.root != nil { - return doc, .DocType_Must_Proceed_Elements - } - parse_doctype(doc) or_return - - if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident { - error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident) - return doc, .Invalid_DocType - } - expected_doctype = doc.doctype.ident - - case: - if .Error_on_Unsupported in opts.flags { - error(t, t.offset, "Unhandled: <!%v\n", next.text) - return doc, .Unhandled_Bang - } - skip_element(t) or_return - } - - case .Dash: - /* - Comment: <!-- -->. - The grammar does not allow a comment to end in ---> - */ - expect(t, .Dash) - comment := scan_comment(t) or_return - - if .Intern_Comments in opts.flags { - if doc.root == nil { - append(&doc.comments, comment) - } else { - el := new(Element) - el.parent = element - el.kind = .Comment - el.value = comment - append(&element.children, el) - } - } - - case: - error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next) - return - } - - case .Ident: + open = scan(t) + // NOTE(Jeroen): We're not using a switch because this if-else chain ordered by likelihood is 2.5% faster at -o:size and -o:speed. + if likely(open.kind, Token_Kind.Ident) == .Ident { /* e.g. <odin - Start of new element. */ - element = new(Element) + element = new_element(doc) tag_is_open = true - if doc.root == nil { + if first_element { /* First element. */ - doc.root = element parent = element + first_element = false } else { - append(&parent.children, element) + append(&doc.elements[parent].children, element) } - element.parent = parent - element.ident = open.text + doc.elements[element].parent = parent + doc.elements[element].ident = open.text - parse_attributes(doc, &element.attribs) or_return + parse_attributes(doc, &doc.elements[element].attribs) or_return /* If a DOCTYPE is present _or_ the caller asked for a specific DOCTYPE and the DOCTYPE and root tag don't match, we return .Invalid_Root_Tag. */ - if element == doc.root { + if element == 0 { // Root tag? if len(expected_doctype) > 0 && expected_doctype != open.text { error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text) return doc, .Invalid_DocType @@ -395,7 +324,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err Empty tag. Close it. */ expect(t, .Gt) or_return - parent = element.parent + parent = doc.elements[element].parent element = parent tag_is_open = false @@ -404,22 +333,103 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err return } - case .Slash: + } else if open.kind == .Slash { /* Close tag. */ ident := expect(t, .Ident) or_return _ = expect(t, .Gt) or_return - if element.ident != ident.text { - error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", element.ident, ident.text) + if doc.elements[element].ident != ident.text { + error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", doc.elements[element].ident, ident.text) return doc, .Mismatched_Closing_Tag } - parent = element.parent + parent = doc.elements[element].parent element = parent tag_is_open = false - case: + } else if open.kind == .Exclaim { + /* + <! + */ + next := scan(t) + #partial switch next.kind { + case .Ident: + switch next.text { + case "DOCTYPE": + if len(doc.doctype.ident) > 0 { + return doc, .Too_Many_DocTypes + } + if doc.element_count > 0 { + return doc, .DocType_Must_Preceed_Elements + } + parse_doctype(doc) or_return + + if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident { + error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident) + return doc, .Invalid_DocType + } + expected_doctype = doc.doctype.ident + + case: + if .Error_on_Unsupported in opts.flags { + error(t, t.offset, "Unhandled: <!%v\n", next.text) + return doc, .Unhandled_Bang + } + skip_element(t) or_return + } + + case .Dash: + /* + Comment: <!-- -->. + The grammar does not allow a comment to end in ---> + */ + expect(t, .Dash) + comment := scan_comment(t) or_return + + if .Intern_Comments in opts.flags { + if len(doc.elements) == 0 { + append(&doc.comments, comment) + } else { + el := new_element(doc) + doc.elements[el].parent = element + doc.elements[el].kind = .Comment + doc.elements[el].value = comment + append(&doc.elements[element].children, el) + } + } + + case: + error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next) + return + } + + } else if open.kind == .Question { + /* + <?xml + */ + next := scan(t) + #partial switch next.kind { + case .Ident: + if len(next.text) == 3 && strings.to_lower(next.text, context.temp_allocator) == "xml" { + parse_prolog(doc) or_return + } else if len(doc.prolog) > 0 { + /* + We've already seen a prolog. + */ + return doc, .Too_Many_Prologs + } else { + /* + Could be `<?xml-stylesheet`, etc. Ignore it. + */ + skip_element(t) or_return + } + case: + error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", next.text) + return + } + + } else { error(t, t.offset, "Invalid Token after <: %#v\n", open) return } @@ -442,7 +452,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err needs_processing |= .Decode_SGML_Entities in opts.flags if !needs_processing { - element.value = body_text + doc.elements[element].value = body_text continue } @@ -464,10 +474,10 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err decoded, decode_err := entity.decode_xml(body_text, decode_opts) if decode_err == .None { - element.value = decoded + doc.elements[element].value = decoded append(&doc.strings_to_free, decoded) } else { - element.value = body_text + doc.elements[element].value = body_text } } } @@ -480,6 +490,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err return doc, .No_DocType } + resize(&doc.elements, int(doc.element_count)) return doc, .None } @@ -497,26 +508,14 @@ parse_from_file :: proc(filename: string, options := DEFAULT_Options, error_hand parse :: proc { parse_from_file, parse_from_slice } -free_element :: proc(element: ^Element) { - if element == nil { return } - - for child in element.children { - /* - NOTE: Recursive. - - Could be rewritten so it adds them to a list of pointers to free. - */ - free_element(child) - } - delete(element.attribs) - delete(element.children) - free(element) -} - destroy :: proc(doc: ^Document) { if doc == nil { return } - free_element(doc.root) + for el in doc.elements { + delete(el.attribs) + delete(el.children) + } + delete(doc.elements) delete(doc.prolog) delete(doc.comments) @@ -686,4 +685,25 @@ parse_doctype :: proc(doc: ^Document) -> (err: Error) { */ doc.doctype.rest = string(t.src[offset : t.offset - 1]) return .None +} + +Element_ID :: u32 + +new_element :: proc(doc: ^Document) -> (id: Element_ID) { + element_space := len(doc.elements) + + // Need to resize + if int(doc.element_count) + 1 > element_space { + if element_space < 65536 { + element_space *= 2 + } else { + element_space += 65536 + } + resize(&doc.elements, element_space) + } + + cur := doc.element_count + doc.element_count += 1 + + return cur }
\ No newline at end of file |