aboutsummaryrefslogtreecommitdiff
path: root/core/encoding/xml/xml_reader.odin
diff options
context:
space:
mode:
authorCourtney Strachan <courtney.strachan@gmail.com>2025-10-06 02:41:44 +0100
committerGitHub <noreply@github.com>2025-10-06 02:41:44 +0100
commit6de2d6e8ca687c989bbb7806e5cbe8d791e425bf (patch)
tree03a2e0a84c7c1530215f8e3f59a7f643b39b3677 /core/encoding/xml/xml_reader.odin
parentdbbe96ae5c343f0e803de6ee508207a62571534f (diff)
parent0f97382fa3e46da80705c00dfe02f3deb9562e4f (diff)
Merge branch 'odin-lang:master' into master
Diffstat (limited to 'core/encoding/xml/xml_reader.odin')
-rw-r--r--core/encoding/xml/xml_reader.odin106
1 files changed, 61 insertions, 45 deletions
diff --git a/core/encoding/xml/xml_reader.odin b/core/encoding/xml/xml_reader.odin
index b8c8b13a4..707d2b3f3 100644
--- a/core/encoding/xml/xml_reader.odin
+++ b/core/encoding/xml/xml_reader.odin
@@ -56,7 +56,7 @@ Option_Flag :: enum {
Option_Flags :: bit_set[Option_Flag; u16]
Document :: struct {
- elements: [dynamic]Element,
+ elements: [dynamic]Element `fmt:"v,element_count"`,
element_count: Element_ID,
prologue: Attributes,
@@ -70,15 +70,15 @@ Document :: struct {
// If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live.
// Otherwise they'll be in the element tree.
- comments: [dynamic]string,
+ comments: [dynamic]string `fmt:"-"`,
// Internal
- tokenizer: ^Tokenizer,
- allocator: mem.Allocator,
+ tokenizer: ^Tokenizer `fmt:"-"`,
+ allocator: mem.Allocator `fmt:"-"`,
// Input. Either the original buffer, or a copy if `.Input_May_Be_Modified` isn't specified.
- input: []u8,
- strings_to_free: [dynamic]string,
+ input: []u8 `fmt:"-"`,
+ strings_to_free: [dynamic]string `fmt:"-"`,
}
Element :: struct {
@@ -175,7 +175,7 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
data = bytes.clone(data)
}
- t := &Tokenizer{}
+ t := new(Tokenizer)
init(t, string(data), path, error_handler)
doc = new(Document)
@@ -195,7 +195,6 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
loop: for {
skip_whitespace(t)
- // NOTE(Jeroen): This is faster as a switch.
switch t.ch {
case '<':
// Consume peeked `<`
@@ -306,9 +305,17 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
}
}
+ case .Open_Bracket:
+ // This could be a CDATA tag part of a tag's body. Unread the `<![`
+ t.offset -= 3
+
+ // Instead of calling `parse_body` here, we could also `continue loop`
+ // and fall through to the `case:` at the bottom of the outer loop.
+ // This makes the intent clearer.
+ parse_body(doc, element, opts) or_return
+
case:
- error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next)
- return
+ error(t, t.offset, "Unexpected Token after <!: %#v", next)
}
} else if open.kind == .Question {
@@ -341,38 +348,7 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
case:
// This should be a tag's body text.
- body_text := scan_string(t, t.offset) or_return
- needs_processing := .Unbox_CDATA in opts.flags
- needs_processing |= .Decode_SGML_Entities in opts.flags
-
- if !needs_processing {
- append(&doc.elements[element].value, body_text)
- continue
- }
-
- decode_opts := entity.XML_Decode_Options{}
- if .Keep_Tag_Body_Comments not_in opts.flags {
- decode_opts += { .Comment_Strip }
- }
-
- if .Decode_SGML_Entities not_in opts.flags {
- decode_opts += { .No_Entity_Decode }
- }
-
- if .Unbox_CDATA in opts.flags {
- decode_opts += { .Unbox_CDATA }
- if .Decode_SGML_Entities in opts.flags {
- decode_opts += { .Decode_CDATA }
- }
- }
-
- decoded, decode_err := entity.decode_xml(body_text, decode_opts)
- if decode_err == .None {
- append(&doc.elements[element].value, decoded)
- append(&doc.strings_to_free, decoded)
- } else {
- append(&doc.elements[element].value, body_text)
- }
+ parse_body(doc, element, opts) or_return
}
}
@@ -427,6 +403,7 @@ destroy :: proc(doc: ^Document) {
}
delete(doc.strings_to_free)
+ free(doc.tokenizer)
free(doc)
}
@@ -457,8 +434,6 @@ parse_attribute :: proc(doc: ^Document) -> (attr: Attribute, offset: int, err: E
t := doc.tokenizer
key := expect(t, .Ident) or_return
- offset = t.offset - len(key.text)
-
_ = expect(t, .Eq) or_return
value := expect(t, .String, multiline_string=true) or_return
@@ -591,6 +566,47 @@ parse_doctype :: proc(doc: ^Document) -> (err: Error) {
return .None
}
+parse_body :: proc(doc: ^Document, element: Element_ID, opts: Options) -> (err: Error) {
+ assert(doc != nil)
+ context.allocator = doc.allocator
+ t := doc.tokenizer
+
+ body_text := scan_string(t, t.offset) or_return
+ needs_processing := .Unbox_CDATA in opts.flags
+ needs_processing |= .Decode_SGML_Entities in opts.flags
+
+ if !needs_processing {
+ append(&doc.elements[element].value, body_text)
+ return
+ }
+
+ decode_opts := entity.XML_Decode_Options{}
+ if .Keep_Tag_Body_Comments not_in opts.flags {
+ decode_opts += { .Comment_Strip }
+ }
+
+ if .Decode_SGML_Entities not_in opts.flags {
+ decode_opts += { .No_Entity_Decode }
+ }
+
+ if .Unbox_CDATA in opts.flags {
+ decode_opts += { .Unbox_CDATA }
+ if .Decode_SGML_Entities in opts.flags {
+ decode_opts += { .Decode_CDATA }
+ }
+ }
+
+ decoded, decode_err := entity.decode_xml(body_text, decode_opts)
+ if decode_err == .None {
+ append(&doc.elements[element].value, decoded)
+ append(&doc.strings_to_free, decoded)
+ } else {
+ append(&doc.elements[element].value, body_text)
+ }
+
+ return
+}
+
Element_ID :: u32
new_element :: proc(doc: ^Document) -> (id: Element_ID) {
@@ -609,4 +625,4 @@ new_element :: proc(doc: ^Document) -> (id: Element_ID) {
cur := doc.element_count
doc.element_count += 1
return cur
-}
+} \ No newline at end of file