diff options
| author | Jeroen van Rijn <Kelimion@users.noreply.github.com> | 2025-04-19 20:25:44 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-04-19 20:25:44 +0200 |
| commit | 062a3c2fae3712c60af00798a0815509a732790b (patch) | |
| tree | 9658915735206e27b5a0437b8ec7af5809e38662 /core | |
| parent | bc86b503922781091ec3ae54c722bd8ff33c7205 (diff) | |
Fix parsing of CDATA tags (#5059)
Fixes #5054
Diffstat (limited to 'core')
| -rw-r--r-- | core/encoding/entity/entity.odin | 54 | ||||
| -rw-r--r-- | core/encoding/xml/tokenizer.odin | 38 | ||||
| -rw-r--r-- | core/encoding/xml/xml_reader.odin | 99 |
3 files changed, 99 insertions, 92 deletions
diff --git a/core/encoding/entity/entity.odin b/core/encoding/entity/entity.odin index d2f1d46b2..cb8fa8611 100644 --- a/core/encoding/entity/entity.odin +++ b/core/encoding/entity/entity.odin @@ -108,7 +108,7 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator := it couldn't have been part of an XML tag body to be decoded here. Keep in mind that we could already *be* inside a CDATA tag. - If so, write `>` as a literal and continue. + If so, write `<` as a literal and continue. */ if in_data { write_rune(&builder, '<') @@ -119,11 +119,9 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator := case ']': // If we're unboxing _and_ decoding CDATA, we'll have to check for the end tag. if in_data { - if t.read_offset + len(CDATA_END) < len(t.src) { - if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END { - in_data = false - t.read_offset += len(CDATA_END) - 1 - } + if strings.has_prefix(t.src[t.offset:], CDATA_END) { + in_data = false + t.read_offset += len(CDATA_END) - 1 } continue } else { @@ -297,40 +295,40 @@ _handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: X assert(t != nil && t.r == '<') if t.read_offset + len(CDATA_START) >= len(t.src) { return false, .None } - if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START { - t.read_offset += len(CDATA_START) - 1 - + s := string(t.src[t.offset:]) + if strings.has_prefix(s, CDATA_START) { if .Unbox_CDATA in options && .Decode_CDATA in options { // We're unboxing _and_ decoding CDATA + t.read_offset += len(CDATA_START) - 1 return true, .None } - // CDATA is passed through. - offset := t.offset - - // Scan until end of CDATA. + // CDATA is passed through. Scan until end of CDATA. + start_offset := t.offset + t.read_offset += len(CDATA_START) for { - advance(t) or_return - if t.r < 0 { return true, .CDATA_Not_Terminated } - - if t.read_offset + len(CDATA_END) < len(t.src) { - if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END { - t.read_offset += len(CDATA_END) - 1 + advance(t) + if t.r < 0 { + // error(t, offset, "[scan_string] CDATA was not terminated\n") + return true, .CDATA_Not_Terminated + } - cdata := string(t.src[offset : t.read_offset]) - - if .Unbox_CDATA in options { - cdata = cdata[len(CDATA_START):] - cdata = cdata[:len(cdata) - len(CDATA_END)] - } + // Scan until the end of a CDATA tag. + if s = string(t.src[t.read_offset:]); strings.has_prefix(s, CDATA_END) { + t.read_offset += len(CDATA_END) + cdata := string(t.src[start_offset:t.read_offset]) - write_string(builder, cdata) - return false, .None + if .Unbox_CDATA in options { + cdata = cdata[len(CDATA_START):] + cdata = cdata[:len(cdata) - len(CDATA_END)] } + write_string(builder, cdata) + return false, .None } } - } else if string(t.src[t.offset:][:len(COMMENT_START)]) == COMMENT_START { + + } else if strings.has_prefix(s, COMMENT_START) { t.read_offset += len(COMMENT_START) // Comment is passed through by default. offset := t.offset diff --git a/core/encoding/xml/tokenizer.odin b/core/encoding/xml/tokenizer.odin index a2bbaf28e..3ef9a6388 100644 --- a/core/encoding/xml/tokenizer.odin +++ b/core/encoding/xml/tokenizer.odin @@ -16,6 +16,7 @@ package encoding_xml import "core:fmt" import "core:unicode" import "core:unicode/utf8" +import "core:strings" Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any) @@ -121,7 +122,7 @@ default_error_handler :: proc(pos: Pos, msg: string, args: ..any) { error :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) { pos := offset_to_pos(t, offset) if t.err != nil { - t.err(pos, msg, ..args) + t.err(pos=pos, fmt=msg, args=args) } t.error_count += 1 } @@ -268,32 +269,27 @@ scan_comment :: proc(t: ^Tokenizer) -> (comment: string, err: Error) { // Skip CDATA skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) { - if t.read_offset + len(CDATA_START) >= len(t.src) { - // Can't be the start of a CDATA tag. + if s := string(t.src[t.offset:]); !strings.has_prefix(s, CDATA_START) { return .None } - if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START { - t.read_offset += len(CDATA_START) - offset := t.offset + t.read_offset += len(CDATA_START) + offset := t.offset - cdata_scan: for { - advance_rune(t) - if t.ch < 0 { - error(t, offset, "[scan_string] CDATA was not terminated\n") - return .Premature_EOF - } + cdata_scan: for { + advance_rune(t) + if t.ch < 0 { + error(t, offset, "[scan_string] CDATA was not terminated\n") + return .Premature_EOF + } - // Scan until the end of a CDATA tag. - if t.read_offset + len(CDATA_END) < len(t.src) { - if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END { - t.read_offset += len(CDATA_END) - break cdata_scan - } - } + // Scan until the end of a CDATA tag. + if s := string(t.src[t.read_offset:]); strings.has_prefix(s, CDATA_END) { + t.read_offset += len(CDATA_END) + break cdata_scan } } - return + return .None } @(optimization_mode="favor_size") @@ -393,6 +389,8 @@ scan :: proc(t: ^Tokenizer, multiline_string := false) -> Token { case '/': kind = .Slash case '-': kind = .Dash case ':': kind = .Colon + case '[': kind = .Open_Bracket + case ']': kind = .Close_Bracket case '"', '\'': kind = .Invalid diff --git a/core/encoding/xml/xml_reader.odin b/core/encoding/xml/xml_reader.odin index b8c8b13a4..60744357c 100644 --- a/core/encoding/xml/xml_reader.odin +++ b/core/encoding/xml/xml_reader.odin @@ -56,7 +56,7 @@ Option_Flag :: enum { Option_Flags :: bit_set[Option_Flag; u16] Document :: struct { - elements: [dynamic]Element, + elements: [dynamic]Element `fmt:"v,element_count"`, element_count: Element_ID, prologue: Attributes, @@ -70,15 +70,15 @@ Document :: struct { // If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live. // Otherwise they'll be in the element tree. - comments: [dynamic]string, + comments: [dynamic]string `fmt:"-"`, // Internal - tokenizer: ^Tokenizer, - allocator: mem.Allocator, + tokenizer: ^Tokenizer `fmt:"-"`, + allocator: mem.Allocator `fmt:"-"`, // Input. Either the original buffer, or a copy if `.Input_May_Be_Modified` isn't specified. - input: []u8, - strings_to_free: [dynamic]string, + input: []u8 `fmt:"-"`, + strings_to_free: [dynamic]string `fmt:"-"`, } Element :: struct { @@ -195,7 +195,6 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha loop: for { skip_whitespace(t) - // NOTE(Jeroen): This is faster as a switch. switch t.ch { case '<': // Consume peeked `<` @@ -306,9 +305,13 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha } } + case .Open_Bracket: + // This could be a CDATA tag part of a tag's body. Unread the `<![` + t.offset -= 3 + parse_body(doc, element, opts) or_return + case: - error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next) - return + error(t, t.offset, "Unexpected Token after <!: %#v", next) } } else if open.kind == .Question { @@ -341,38 +344,7 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha case: // This should be a tag's body text. - body_text := scan_string(t, t.offset) or_return - needs_processing := .Unbox_CDATA in opts.flags - needs_processing |= .Decode_SGML_Entities in opts.flags - - if !needs_processing { - append(&doc.elements[element].value, body_text) - continue - } - - decode_opts := entity.XML_Decode_Options{} - if .Keep_Tag_Body_Comments not_in opts.flags { - decode_opts += { .Comment_Strip } - } - - if .Decode_SGML_Entities not_in opts.flags { - decode_opts += { .No_Entity_Decode } - } - - if .Unbox_CDATA in opts.flags { - decode_opts += { .Unbox_CDATA } - if .Decode_SGML_Entities in opts.flags { - decode_opts += { .Decode_CDATA } - } - } - - decoded, decode_err := entity.decode_xml(body_text, decode_opts) - if decode_err == .None { - append(&doc.elements[element].value, decoded) - append(&doc.strings_to_free, decoded) - } else { - append(&doc.elements[element].value, body_text) - } + parse_body(doc, element, opts) or_return } } @@ -457,8 +429,6 @@ parse_attribute :: proc(doc: ^Document) -> (attr: Attribute, offset: int, err: E t := doc.tokenizer key := expect(t, .Ident) or_return - offset = t.offset - len(key.text) - _ = expect(t, .Eq) or_return value := expect(t, .String, multiline_string=true) or_return @@ -591,6 +561,47 @@ parse_doctype :: proc(doc: ^Document) -> (err: Error) { return .None } +parse_body :: proc(doc: ^Document, element: Element_ID, opts: Options) -> (err: Error) { + assert(doc != nil) + context.allocator = doc.allocator + t := doc.tokenizer + + body_text := scan_string(t, t.offset) or_return + needs_processing := .Unbox_CDATA in opts.flags + needs_processing |= .Decode_SGML_Entities in opts.flags + + if !needs_processing { + append(&doc.elements[element].value, body_text) + return + } + + decode_opts := entity.XML_Decode_Options{} + if .Keep_Tag_Body_Comments not_in opts.flags { + decode_opts += { .Comment_Strip } + } + + if .Decode_SGML_Entities not_in opts.flags { + decode_opts += { .No_Entity_Decode } + } + + if .Unbox_CDATA in opts.flags { + decode_opts += { .Unbox_CDATA } + if .Decode_SGML_Entities in opts.flags { + decode_opts += { .Decode_CDATA } + } + } + + decoded, decode_err := entity.decode_xml(body_text, decode_opts) + if decode_err == .None { + append(&doc.elements[element].value, decoded) + append(&doc.strings_to_free, decoded) + } else { + append(&doc.elements[element].value, body_text) + } + + return +} + Element_ID :: u32 new_element :: proc(doc: ^Document) -> (id: Element_ID) { @@ -609,4 +620,4 @@ new_element :: proc(doc: ^Document) -> (id: Element_ID) { cur := doc.element_count doc.element_count += 1 return cur -} +}
\ No newline at end of file |