diff options
| author | Raph <raphfl.dev@gmail.com> | 2025-06-20 16:50:00 -0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-06-20 16:50:00 -0100 |
| commit | a7e89e1324f64346b201aea8ac6205e0bc85eb21 (patch) | |
| tree | 612abe74fa630e7cddad4d37ca5a04e18ff81471 /core/encoding/xml/xml_reader.odin | |
| parent | 0b5be6ad6a3c40ced071c89bb066dfd326b72943 (diff) | |
| parent | d9e08bc5d8a1292e3eccdb325bde4d180ebb4749 (diff) | |
Merge branch 'master' into tiocgwinsz_time
Diffstat (limited to 'core/encoding/xml/xml_reader.odin')
| -rw-r--r-- | core/encoding/xml/xml_reader.odin | 106 |
1 file changed, 61 insertions, 45 deletions
diff --git a/core/encoding/xml/xml_reader.odin b/core/encoding/xml/xml_reader.odin index b8c8b13a4..707d2b3f3 100644 --- a/core/encoding/xml/xml_reader.odin +++ b/core/encoding/xml/xml_reader.odin @@ -56,7 +56,7 @@ Option_Flag :: enum { Option_Flags :: bit_set[Option_Flag; u16] Document :: struct { - elements: [dynamic]Element, + elements: [dynamic]Element `fmt:"v,element_count"`, element_count: Element_ID, prologue: Attributes, @@ -70,15 +70,15 @@ Document :: struct { // If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live. // Otherwise they'll be in the element tree. - comments: [dynamic]string, + comments: [dynamic]string `fmt:"-"`, // Internal - tokenizer: ^Tokenizer, - allocator: mem.Allocator, + tokenizer: ^Tokenizer `fmt:"-"`, + allocator: mem.Allocator `fmt:"-"`, // Input. Either the original buffer, or a copy if `.Input_May_Be_Modified` isn't specified. - input: []u8, - strings_to_free: [dynamic]string, + input: []u8 `fmt:"-"`, + strings_to_free: [dynamic]string `fmt:"-"`, } Element :: struct { @@ -175,7 +175,7 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha data = bytes.clone(data) } - t := &Tokenizer{} + t := new(Tokenizer) init(t, string(data), path, error_handler) doc = new(Document) @@ -195,7 +195,6 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha loop: for { skip_whitespace(t) - // NOTE(Jeroen): This is faster as a switch. switch t.ch { case '<': // Consume peeked `<` @@ -306,9 +305,17 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha } } + case .Open_Bracket: + // This could be a CDATA tag part of a tag's body. Unread the `<![` + t.offset -= 3 + + // Instead of calling `parse_body` here, we could also `continue loop` + // and fall through to the `case:` at the bottom of the outer loop. + // This makes the intent clearer. 
+ parse_body(doc, element, opts) or_return + case: - error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next) - return + error(t, t.offset, "Unexpected Token after <!: %#v", next) } } else if open.kind == .Question { @@ -341,38 +348,7 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha case: // This should be a tag's body text. - body_text := scan_string(t, t.offset) or_return - needs_processing := .Unbox_CDATA in opts.flags - needs_processing |= .Decode_SGML_Entities in opts.flags - - if !needs_processing { - append(&doc.elements[element].value, body_text) - continue - } - - decode_opts := entity.XML_Decode_Options{} - if .Keep_Tag_Body_Comments not_in opts.flags { - decode_opts += { .Comment_Strip } - } - - if .Decode_SGML_Entities not_in opts.flags { - decode_opts += { .No_Entity_Decode } - } - - if .Unbox_CDATA in opts.flags { - decode_opts += { .Unbox_CDATA } - if .Decode_SGML_Entities in opts.flags { - decode_opts += { .Decode_CDATA } - } - } - - decoded, decode_err := entity.decode_xml(body_text, decode_opts) - if decode_err == .None { - append(&doc.elements[element].value, decoded) - append(&doc.strings_to_free, decoded) - } else { - append(&doc.elements[element].value, body_text) - } + parse_body(doc, element, opts) or_return } } @@ -427,6 +403,7 @@ destroy :: proc(doc: ^Document) { } delete(doc.strings_to_free) + free(doc.tokenizer) free(doc) } @@ -457,8 +434,6 @@ parse_attribute :: proc(doc: ^Document) -> (attr: Attribute, offset: int, err: E t := doc.tokenizer key := expect(t, .Ident) or_return - offset = t.offset - len(key.text) - _ = expect(t, .Eq) or_return value := expect(t, .String, multiline_string=true) or_return @@ -591,6 +566,47 @@ parse_doctype :: proc(doc: ^Document) -> (err: Error) { return .None } +parse_body :: proc(doc: ^Document, element: Element_ID, opts: Options) -> (err: Error) { + assert(doc != nil) + context.allocator = doc.allocator + t := doc.tokenizer + + body_text := 
scan_string(t, t.offset) or_return + needs_processing := .Unbox_CDATA in opts.flags + needs_processing |= .Decode_SGML_Entities in opts.flags + + if !needs_processing { + append(&doc.elements[element].value, body_text) + return + } + + decode_opts := entity.XML_Decode_Options{} + if .Keep_Tag_Body_Comments not_in opts.flags { + decode_opts += { .Comment_Strip } + } + + if .Decode_SGML_Entities not_in opts.flags { + decode_opts += { .No_Entity_Decode } + } + + if .Unbox_CDATA in opts.flags { + decode_opts += { .Unbox_CDATA } + if .Decode_SGML_Entities in opts.flags { + decode_opts += { .Decode_CDATA } + } + } + + decoded, decode_err := entity.decode_xml(body_text, decode_opts) + if decode_err == .None { + append(&doc.elements[element].value, decoded) + append(&doc.strings_to_free, decoded) + } else { + append(&doc.elements[element].value, body_text) + } + + return +} + Element_ID :: u32 new_element :: proc(doc: ^Document) -> (id: Element_ID) { @@ -609,4 +625,4 @@ new_element :: proc(doc: ^Document) -> (id: Element_ID) { cur := doc.element_count doc.element_count += 1 return cur -} +}
\ No newline at end of file