aboutsummaryrefslogtreecommitdiff
path: root/core/encoding/xml/xml_reader.odin
diff options
context:
space:
mode:
Diffstat (limited to 'core/encoding/xml/xml_reader.odin')
-rw-r--r--core/encoding/xml/xml_reader.odin276
1 files changed, 148 insertions, 128 deletions
diff --git a/core/encoding/xml/xml_reader.odin b/core/encoding/xml/xml_reader.odin
index 0315b0e05..636dd0ae4 100644
--- a/core/encoding/xml/xml_reader.odin
+++ b/core/encoding/xml/xml_reader.odin
@@ -1,8 +1,7 @@
-package xml
/*
An XML 1.0 / 1.1 parser
- Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
+ Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
Made available under Odin's BSD-3 license.
A from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
@@ -25,12 +24,17 @@ package xml
List of contributors:
Jeroen van Rijn: Initial implementation.
*/
+package xml
+// An XML 1.0 / 1.1 parser
import "core:bytes"
-import "core:strings"
import "core:encoding/entity"
+import "core:intrinsics"
import "core:mem"
import "core:os"
+import "core:strings"
+
+likely :: intrinsics.expect
DEFAULT_Options :: Options{
flags = {
@@ -88,7 +92,9 @@ Option_Flag :: enum {
Option_Flags :: bit_set[Option_Flag; u16]
Document :: struct {
- root: ^Element,
+ elements: [dynamic]Element,
+ element_count: Element_ID,
+
prolog: Attributes,
encoding: Encoding,
@@ -129,8 +135,8 @@ Element :: struct {
Comment,
},
- parent: ^Element,
- children: [dynamic]^Element,
+ parent: Element_ID,
+ children: [dynamic]Element_ID,
}
Attr :: struct {
@@ -185,7 +191,7 @@ Error :: enum {
No_DocType,
Too_Many_DocTypes,
- DocType_Must_Proceed_Elements,
+ DocType_Must_Preceed_Elements,
/*
If a DOCTYPE is present _or_ the caller
@@ -237,12 +243,16 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
doc.tokenizer = t
doc.input = data
+ doc.elements = make([dynamic]Element, 1024, 1024, allocator)
+
// strings.intern_init(&doc.intern, allocator, allocator)
err = .Unexpected_Token
- element, parent: ^Element
+ element, parent: Element_ID
- tag_is_open := false
+ tag_is_open := false
+ first_element := true
+ open: Token
/*
If a DOCTYPE is present, the root tag has to match.
@@ -252,6 +262,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
loop: for {
skip_whitespace(t)
+ // NOTE(Jeroen): This is faster as a switch.
switch t.ch {
case '<':
/*
@@ -259,118 +270,36 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
*/
advance_rune(t)
- open := scan(t)
- #partial switch open.kind {
-
- case .Question:
- /*
- <?xml
- */
- next := scan(t)
- #partial switch next.kind {
- case .Ident:
- if len(next.text) == 3 && strings.to_lower(next.text, context.temp_allocator) == "xml" {
- parse_prolog(doc) or_return
- } else if len(doc.prolog) > 0 {
- /*
- We've already seen a prolog.
- */
- return doc, .Too_Many_Prologs
- } else {
- /*
- Could be `<?xml-stylesheet`, etc. Ignore it.
- */
- skip_element(t) or_return
- }
- case:
- error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", next.text)
- return
- }
-
- case .Exclaim:
- /*
- <!
- */
- next := scan(t)
- #partial switch next.kind {
- case .Ident:
- switch next.text {
- case "DOCTYPE":
- if len(doc.doctype.ident) > 0 {
- return doc, .Too_Many_DocTypes
- }
- if doc.root != nil {
- return doc, .DocType_Must_Proceed_Elements
- }
- parse_doctype(doc) or_return
-
- if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident {
- error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident)
- return doc, .Invalid_DocType
- }
- expected_doctype = doc.doctype.ident
-
- case:
- if .Error_on_Unsupported in opts.flags {
- error(t, t.offset, "Unhandled: <!%v\n", next.text)
- return doc, .Unhandled_Bang
- }
- skip_element(t) or_return
- }
-
- case .Dash:
- /*
- Comment: <!-- -->.
- The grammar does not allow a comment to end in --->
- */
- expect(t, .Dash)
- comment := scan_comment(t) or_return
-
- if .Intern_Comments in opts.flags {
- if doc.root == nil {
- append(&doc.comments, comment)
- } else {
- el := new(Element)
- el.parent = element
- el.kind = .Comment
- el.value = comment
- append(&element.children, el)
- }
- }
-
- case:
- error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next)
- return
- }
-
- case .Ident:
+ open = scan(t)
+ // NOTE(Jeroen): We're not using a switch because this if-else chain ordered by likelihood is 2.5% faster at -o:size and -o:speed.
+ if likely(open.kind, Token_Kind.Ident) == .Ident {
/*
e.g. <odin - Start of new element.
*/
- element = new(Element)
+ element = new_element(doc)
tag_is_open = true
- if doc.root == nil {
+ if first_element {
/*
First element.
*/
- doc.root = element
parent = element
+ first_element = false
} else {
- append(&parent.children, element)
+ append(&doc.elements[parent].children, element)
}
- element.parent = parent
- element.ident = open.text
+ doc.elements[element].parent = parent
+ doc.elements[element].ident = open.text
- parse_attributes(doc, &element.attribs) or_return
+ parse_attributes(doc, &doc.elements[element].attribs) or_return
/*
If a DOCTYPE is present _or_ the caller
asked for a specific DOCTYPE and the DOCTYPE
and root tag don't match, we return .Invalid_Root_Tag.
*/
- if element == doc.root {
+ if element == 0 { // Root tag?
if len(expected_doctype) > 0 && expected_doctype != open.text {
error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text)
return doc, .Invalid_DocType
@@ -395,7 +324,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
Empty tag. Close it.
*/
expect(t, .Gt) or_return
- parent = element.parent
+ parent = doc.elements[element].parent
element = parent
tag_is_open = false
@@ -404,22 +333,103 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
return
}
- case .Slash:
+ } else if open.kind == .Slash {
/*
Close tag.
*/
ident := expect(t, .Ident) or_return
_ = expect(t, .Gt) or_return
- if element.ident != ident.text {
- error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", element.ident, ident.text)
+ if doc.elements[element].ident != ident.text {
+ error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", doc.elements[element].ident, ident.text)
return doc, .Mismatched_Closing_Tag
}
- parent = element.parent
+ parent = doc.elements[element].parent
element = parent
tag_is_open = false
- case:
+ } else if open.kind == .Exclaim {
+ /*
+ <!
+ */
+ next := scan(t)
+ #partial switch next.kind {
+ case .Ident:
+ switch next.text {
+ case "DOCTYPE":
+ if len(doc.doctype.ident) > 0 {
+ return doc, .Too_Many_DocTypes
+ }
+ if doc.element_count > 0 {
+ return doc, .DocType_Must_Preceed_Elements
+ }
+ parse_doctype(doc) or_return
+
+ if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident {
+ error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident)
+ return doc, .Invalid_DocType
+ }
+ expected_doctype = doc.doctype.ident
+
+ case:
+ if .Error_on_Unsupported in opts.flags {
+ error(t, t.offset, "Unhandled: <!%v\n", next.text)
+ return doc, .Unhandled_Bang
+ }
+ skip_element(t) or_return
+ }
+
+ case .Dash:
+ /*
+ Comment: <!-- -->.
+ The grammar does not allow a comment to end in --->
+ */
+ expect(t, .Dash)
+ comment := scan_comment(t) or_return
+
+ if .Intern_Comments in opts.flags {
+ if len(doc.elements) == 0 {
+ append(&doc.comments, comment)
+ } else {
+ el := new_element(doc)
+ doc.elements[el].parent = element
+ doc.elements[el].kind = .Comment
+ doc.elements[el].value = comment
+ append(&doc.elements[element].children, el)
+ }
+ }
+
+ case:
+ error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next)
+ return
+ }
+
+ } else if open.kind == .Question {
+ /*
+ <?xml
+ */
+ next := scan(t)
+ #partial switch next.kind {
+ case .Ident:
+ if len(next.text) == 3 && strings.to_lower(next.text, context.temp_allocator) == "xml" {
+ parse_prolog(doc) or_return
+ } else if len(doc.prolog) > 0 {
+ /*
+ We've already seen a prolog.
+ */
+ return doc, .Too_Many_Prologs
+ } else {
+ /*
+ Could be `<?xml-stylesheet`, etc. Ignore it.
+ */
+ skip_element(t) or_return
+ }
+ case:
+ error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", next.text)
+ return
+ }
+
+ } else {
error(t, t.offset, "Invalid Token after <: %#v\n", open)
return
}
@@ -442,7 +452,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
needs_processing |= .Decode_SGML_Entities in opts.flags
if !needs_processing {
- element.value = body_text
+ doc.elements[element].value = body_text
continue
}
@@ -464,10 +474,10 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
decoded, decode_err := entity.decode_xml(body_text, decode_opts)
if decode_err == .None {
- element.value = decoded
+ doc.elements[element].value = decoded
append(&doc.strings_to_free, decoded)
} else {
- element.value = body_text
+ doc.elements[element].value = body_text
}
}
}
@@ -480,6 +490,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
return doc, .No_DocType
}
+ resize(&doc.elements, int(doc.element_count))
return doc, .None
}
@@ -497,26 +508,14 @@ parse_from_file :: proc(filename: string, options := DEFAULT_Options, error_hand
parse :: proc { parse_from_file, parse_from_slice }
-free_element :: proc(element: ^Element) {
- if element == nil { return }
-
- for child in element.children {
- /*
- NOTE: Recursive.
-
- Could be rewritten so it adds them to a list of pointers to free.
- */
- free_element(child)
- }
- delete(element.attribs)
- delete(element.children)
- free(element)
-}
-
destroy :: proc(doc: ^Document) {
if doc == nil { return }
- free_element(doc.root)
+ for el in doc.elements {
+ delete(el.attribs)
+ delete(el.children)
+ }
+ delete(doc.elements)
delete(doc.prolog)
delete(doc.comments)
@@ -686,4 +685,25 @@ parse_doctype :: proc(doc: ^Document) -> (err: Error) {
*/
doc.doctype.rest = string(t.src[offset : t.offset - 1])
return .None
+}
+
+// Index of an `Element` within `Document.elements`; used for the `parent` and `children` links instead of `^Element`.
+Element_ID :: u32
+
+/*
+	Reserves the next free slot in `doc.elements` and returns its index.
+
+	Slots are handed out sequentially from `doc.element_count`. When the array has
+	no unused slot left it is grown first: doubled while it holds fewer than 65536
+	entries, then extended in steps of 65536.
+
+	Inputs:
+	- doc: The document whose element array receives the new slot.
+
+	Returns:
+	- id: Index of the newly reserved element within `doc.elements`.
+*/
+new_element :: proc(doc: ^Document) -> (id: Element_ID) {
+	element_space := len(doc.elements)
+
+	// Need to resize
+	if int(doc.element_count) + 1 > element_space {
+		if element_space == 0 {
+			// Doubling an empty array would leave it empty, and the ID returned below would be out of bounds.
+			element_space = 1
+		} else if element_space < 65536 {
+			element_space *= 2
+		} else {
+			element_space += 65536
+		}
+		resize(&doc.elements, element_space)
+	}
+
+	cur := doc.element_count
+	doc.element_count += 1
+
+	return cur
} \ No newline at end of file