about | summary | refs | log | tree | commit | diff
path: root/core
diff options
context:
space:
mode:
author: Jeroen van Rijn <Kelimion@users.noreply.github.com> 2025-04-19 20:25:44 +0200
committer: GitHub <noreply@github.com> 2025-04-19 20:25:44 +0200
commit: 062a3c2fae3712c60af00798a0815509a732790b (patch)
tree: 9658915735206e27b5a0437b8ec7af5809e38662 /core
parent: bc86b503922781091ec3ae54c722bd8ff33c7205 (diff)
Fix parsing of CDATA tags (#5059)
Fixes #5054
Diffstat (limited to 'core')
-rw-r--r-- core/encoding/entity/entity.odin 54
-rw-r--r-- core/encoding/xml/tokenizer.odin 38
-rw-r--r-- core/encoding/xml/xml_reader.odin 99
3 files changed, 99 insertions, 92 deletions
diff --git a/core/encoding/entity/entity.odin b/core/encoding/entity/entity.odin
index d2f1d46b2..cb8fa8611 100644
--- a/core/encoding/entity/entity.odin
+++ b/core/encoding/entity/entity.odin
@@ -108,7 +108,7 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
it couldn't have been part of an XML tag body to be decoded here.
Keep in mind that we could already *be* inside a CDATA tag.
- If so, write `>` as a literal and continue.
+ If so, write `<` as a literal and continue.
*/
if in_data {
write_rune(&builder, '<')
@@ -119,11 +119,9 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
case ']':
// If we're unboxing _and_ decoding CDATA, we'll have to check for the end tag.
if in_data {
- if t.read_offset + len(CDATA_END) < len(t.src) {
- if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
- in_data = false
- t.read_offset += len(CDATA_END) - 1
- }
+ if strings.has_prefix(t.src[t.offset:], CDATA_END) {
+ in_data = false
+ t.read_offset += len(CDATA_END) - 1
}
continue
} else {
@@ -297,40 +295,40 @@ _handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: X
assert(t != nil && t.r == '<')
if t.read_offset + len(CDATA_START) >= len(t.src) { return false, .None }
- if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
- t.read_offset += len(CDATA_START) - 1
-
+ s := string(t.src[t.offset:])
+ if strings.has_prefix(s, CDATA_START) {
if .Unbox_CDATA in options && .Decode_CDATA in options {
// We're unboxing _and_ decoding CDATA
+ t.read_offset += len(CDATA_START) - 1
return true, .None
}
- // CDATA is passed through.
- offset := t.offset
-
- // Scan until end of CDATA.
+ // CDATA is passed through. Scan until end of CDATA.
+ start_offset := t.offset
+ t.read_offset += len(CDATA_START)
for {
- advance(t) or_return
- if t.r < 0 { return true, .CDATA_Not_Terminated }
-
- if t.read_offset + len(CDATA_END) < len(t.src) {
- if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
- t.read_offset += len(CDATA_END) - 1
+ advance(t)
+ if t.r < 0 {
+ // error(t, offset, "[scan_string] CDATA was not terminated\n")
+ return true, .CDATA_Not_Terminated
+ }
- cdata := string(t.src[offset : t.read_offset])
-
- if .Unbox_CDATA in options {
- cdata = cdata[len(CDATA_START):]
- cdata = cdata[:len(cdata) - len(CDATA_END)]
- }
+ // Scan until the end of a CDATA tag.
+ if s = string(t.src[t.read_offset:]); strings.has_prefix(s, CDATA_END) {
+ t.read_offset += len(CDATA_END)
+ cdata := string(t.src[start_offset:t.read_offset])
- write_string(builder, cdata)
- return false, .None
+ if .Unbox_CDATA in options {
+ cdata = cdata[len(CDATA_START):]
+ cdata = cdata[:len(cdata) - len(CDATA_END)]
}
+ write_string(builder, cdata)
+ return false, .None
}
}
- } else if string(t.src[t.offset:][:len(COMMENT_START)]) == COMMENT_START {
+
+ } else if strings.has_prefix(s, COMMENT_START) {
t.read_offset += len(COMMENT_START)
// Comment is passed through by default.
offset := t.offset
diff --git a/core/encoding/xml/tokenizer.odin b/core/encoding/xml/tokenizer.odin
index a2bbaf28e..3ef9a6388 100644
--- a/core/encoding/xml/tokenizer.odin
+++ b/core/encoding/xml/tokenizer.odin
@@ -16,6 +16,7 @@ package encoding_xml
import "core:fmt"
import "core:unicode"
import "core:unicode/utf8"
+import "core:strings"
Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any)
@@ -121,7 +122,7 @@ default_error_handler :: proc(pos: Pos, msg: string, args: ..any) {
error :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
pos := offset_to_pos(t, offset)
if t.err != nil {
- t.err(pos, msg, ..args)
+ t.err(pos=pos, fmt=msg, args=args)
}
t.error_count += 1
}
@@ -268,32 +269,27 @@ scan_comment :: proc(t: ^Tokenizer) -> (comment: string, err: Error) {
// Skip CDATA
skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
- if t.read_offset + len(CDATA_START) >= len(t.src) {
- // Can't be the start of a CDATA tag.
+ if s := string(t.src[t.offset:]); !strings.has_prefix(s, CDATA_START) {
return .None
}
- if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
- t.read_offset += len(CDATA_START)
- offset := t.offset
+ t.read_offset += len(CDATA_START)
+ offset := t.offset
- cdata_scan: for {
- advance_rune(t)
- if t.ch < 0 {
- error(t, offset, "[scan_string] CDATA was not terminated\n")
- return .Premature_EOF
- }
+ cdata_scan: for {
+ advance_rune(t)
+ if t.ch < 0 {
+ error(t, offset, "[scan_string] CDATA was not terminated\n")
+ return .Premature_EOF
+ }
- // Scan until the end of a CDATA tag.
- if t.read_offset + len(CDATA_END) < len(t.src) {
- if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
- t.read_offset += len(CDATA_END)
- break cdata_scan
- }
- }
+ // Scan until the end of a CDATA tag.
+ if s := string(t.src[t.read_offset:]); strings.has_prefix(s, CDATA_END) {
+ t.read_offset += len(CDATA_END)
+ break cdata_scan
}
}
- return
+ return .None
}
@(optimization_mode="favor_size")
@@ -393,6 +389,8 @@ scan :: proc(t: ^Tokenizer, multiline_string := false) -> Token {
case '/': kind = .Slash
case '-': kind = .Dash
case ':': kind = .Colon
+ case '[': kind = .Open_Bracket
+ case ']': kind = .Close_Bracket
case '"', '\'':
kind = .Invalid
diff --git a/core/encoding/xml/xml_reader.odin b/core/encoding/xml/xml_reader.odin
index b8c8b13a4..60744357c 100644
--- a/core/encoding/xml/xml_reader.odin
+++ b/core/encoding/xml/xml_reader.odin
@@ -56,7 +56,7 @@ Option_Flag :: enum {
Option_Flags :: bit_set[Option_Flag; u16]
Document :: struct {
- elements: [dynamic]Element,
+ elements: [dynamic]Element `fmt:"v,element_count"`,
element_count: Element_ID,
prologue: Attributes,
@@ -70,15 +70,15 @@ Document :: struct {
// If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live.
// Otherwise they'll be in the element tree.
- comments: [dynamic]string,
+ comments: [dynamic]string `fmt:"-"`,
// Internal
- tokenizer: ^Tokenizer,
- allocator: mem.Allocator,
+ tokenizer: ^Tokenizer `fmt:"-"`,
+ allocator: mem.Allocator `fmt:"-"`,
// Input. Either the original buffer, or a copy if `.Input_May_Be_Modified` isn't specified.
- input: []u8,
- strings_to_free: [dynamic]string,
+ input: []u8 `fmt:"-"`,
+ strings_to_free: [dynamic]string `fmt:"-"`,
}
Element :: struct {
@@ -195,7 +195,6 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
loop: for {
skip_whitespace(t)
- // NOTE(Jeroen): This is faster as a switch.
switch t.ch {
case '<':
// Consume peeked `<`
@@ -306,9 +305,13 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
}
}
+ case .Open_Bracket:
+ // This could be a CDATA tag part of a tag's body. Unread the `<![`
+ t.offset -= 3
+ parse_body(doc, element, opts) or_return
+
case:
- error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next)
- return
+ error(t, t.offset, "Unexpected Token after <!: %#v", next)
}
} else if open.kind == .Question {
@@ -341,38 +344,7 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
case:
// This should be a tag's body text.
- body_text := scan_string(t, t.offset) or_return
- needs_processing := .Unbox_CDATA in opts.flags
- needs_processing |= .Decode_SGML_Entities in opts.flags
-
- if !needs_processing {
- append(&doc.elements[element].value, body_text)
- continue
- }
-
- decode_opts := entity.XML_Decode_Options{}
- if .Keep_Tag_Body_Comments not_in opts.flags {
- decode_opts += { .Comment_Strip }
- }
-
- if .Decode_SGML_Entities not_in opts.flags {
- decode_opts += { .No_Entity_Decode }
- }
-
- if .Unbox_CDATA in opts.flags {
- decode_opts += { .Unbox_CDATA }
- if .Decode_SGML_Entities in opts.flags {
- decode_opts += { .Decode_CDATA }
- }
- }
-
- decoded, decode_err := entity.decode_xml(body_text, decode_opts)
- if decode_err == .None {
- append(&doc.elements[element].value, decoded)
- append(&doc.strings_to_free, decoded)
- } else {
- append(&doc.elements[element].value, body_text)
- }
+ parse_body(doc, element, opts) or_return
}
}
@@ -457,8 +429,6 @@ parse_attribute :: proc(doc: ^Document) -> (attr: Attribute, offset: int, err: E
t := doc.tokenizer
key := expect(t, .Ident) or_return
- offset = t.offset - len(key.text)
-
_ = expect(t, .Eq) or_return
value := expect(t, .String, multiline_string=true) or_return
@@ -591,6 +561,47 @@ parse_doctype :: proc(doc: ^Document) -> (err: Error) {
return .None
}
+parse_body :: proc(doc: ^Document, element: Element_ID, opts: Options) -> (err: Error) {
+ assert(doc != nil)
+ context.allocator = doc.allocator
+ t := doc.tokenizer
+
+ body_text := scan_string(t, t.offset) or_return
+ needs_processing := .Unbox_CDATA in opts.flags
+ needs_processing |= .Decode_SGML_Entities in opts.flags
+
+ if !needs_processing {
+ append(&doc.elements[element].value, body_text)
+ return
+ }
+
+ decode_opts := entity.XML_Decode_Options{}
+ if .Keep_Tag_Body_Comments not_in opts.flags {
+ decode_opts += { .Comment_Strip }
+ }
+
+ if .Decode_SGML_Entities not_in opts.flags {
+ decode_opts += { .No_Entity_Decode }
+ }
+
+ if .Unbox_CDATA in opts.flags {
+ decode_opts += { .Unbox_CDATA }
+ if .Decode_SGML_Entities in opts.flags {
+ decode_opts += { .Decode_CDATA }
+ }
+ }
+
+ decoded, decode_err := entity.decode_xml(body_text, decode_opts)
+ if decode_err == .None {
+ append(&doc.elements[element].value, decoded)
+ append(&doc.strings_to_free, decoded)
+ } else {
+ append(&doc.elements[element].value, body_text)
+ }
+
+ return
+}
+
Element_ID :: u32
new_element :: proc(doc: ^Document) -> (id: Element_ID) {
@@ -609,4 +620,4 @@ new_element :: proc(doc: ^Document) -> (id: Element_ID) {
cur := doc.element_count
doc.element_count += 1
return cur
-}
+}
\ No newline at end of file