Fix parsing of CDATA tags (#5059)

Fixes #5054
author: Jeroen van Rijn <Kelimion@users.noreply.github.com> 2025-04-19 20:25:44 +0200
committer: GitHub <noreply@github.com> 2025-04-19 20:25:44 +0200
commit: 062a3c2fae3712c60af00798a0815509a732790b (patch)
tree: 9658915735206e27b5a0437b8ec7af5809e38662 /core
parent: bc86b503922781091ec3ae54c722bd8ff33c7205 (diff)
3 files changed, 99 insertions, 92 deletions
diff --git a/core/encoding/entity/entity.odin b/core/encoding/entity/entity.odin
index d2f1d46b2..cb8fa8611 100644
--- a/core/encoding/entity/entity.odin
+++ b/core/encoding/entity/entity.odin
@@ -108,7 +108,7 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
 				it couldn't have been part of an XML tag body to be decoded here.
 
 				Keep in mind that we could already *be* inside a CDATA tag.
-				If so, write `>` as a literal and continue.
+				If so, write `<` as a literal and continue.
 			*/
 			if in_data {
 				write_rune(&builder, '<')
@@ -119,11 +119,9 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
 		case ']':
 			// If we're unboxing _and_ decoding CDATA, we'll have to check for the end tag.
 			if in_data {
-				if t.read_offset + len(CDATA_END) < len(t.src) {
-					if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
-						in_data = false
-						t.read_offset += len(CDATA_END) - 1
-					}
+				if strings.has_prefix(t.src[t.offset:], CDATA_END) {
+					in_data = false
+					t.read_offset += len(CDATA_END) - 1
 				}
 				continue
 			} else {
@@ -297,40 +295,40 @@ _handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: X
 	assert(t != nil && t.r == '<')
 	if t.read_offset + len(CDATA_START) >= len(t.src) { return false, .None }
 
-	if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
-		t.read_offset += len(CDATA_START) - 1
-
+	s := string(t.src[t.offset:])
+	if strings.has_prefix(s, CDATA_START) {
 		if .Unbox_CDATA in options && .Decode_CDATA in options {
 			// We're unboxing _and_ decoding CDATA
+			t.read_offset += len(CDATA_START) - 1
 			return true, .None
 		}
 
-		// CDATA is passed through.
-		offset := t.offset
-
-		// Scan until end of CDATA.
+		// CDATA is passed through. Scan until end of CDATA.
+		start_offset  := t.offset
+		t.read_offset += len(CDATA_START)
 		for {
-			advance(t) or_return
-			if t.r < 0 { return true, .CDATA_Not_Terminated }
-
-			if t.read_offset + len(CDATA_END) < len(t.src) {
-				if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
-					t.read_offset += len(CDATA_END) - 1
+			advance(t)
+			if t.r < 0 {
+				// Unterminated CDATA. Unlike the XML tokenizer's `skip_cdata`, no error handler is called here.
+				return true, .CDATA_Not_Terminated
+			}
 
-					cdata := string(t.src[offset : t.read_offset])
-	
-					if .Unbox_CDATA in options {
-						cdata = cdata[len(CDATA_START):]
-						cdata = cdata[:len(cdata) - len(CDATA_END)]
-					}
+			// Stop once the CDATA end tag starts at the current read offset.
+			if s = string(t.src[t.read_offset:]); strings.has_prefix(s, CDATA_END) {
+				t.read_offset += len(CDATA_END)
+				cdata := string(t.src[start_offset:t.read_offset])
 
-					write_string(builder, cdata)
-					return false, .None
+				if .Unbox_CDATA in options {
+					cdata = cdata[len(CDATA_START):]
+					cdata = cdata[:len(cdata) - len(CDATA_END)]
 				}
+				write_string(builder, cdata)
+				return false, .None
 			}
 		}
 
-	} else if string(t.src[t.offset:][:len(COMMENT_START)]) == COMMENT_START {
+
+	} else if strings.has_prefix(s, COMMENT_START) {
 		t.read_offset += len(COMMENT_START)
 		// Comment is passed through by default.
 		offset := t.offset
diff --git a/core/encoding/xml/tokenizer.odin b/core/encoding/xml/tokenizer.odin
index a2bbaf28e..3ef9a6388 100644
--- a/core/encoding/xml/tokenizer.odin
+++ b/core/encoding/xml/tokenizer.odin
@@ -16,6 +16,7 @@ package encoding_xml
 import "core:fmt"
 import "core:unicode"
 import "core:unicode/utf8"
+import "core:strings"
 
 Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any)
 
@@ -121,7 +122,7 @@ default_error_handler :: proc(pos: Pos, msg: string, args: ..any) {
 error :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
 	pos := offset_to_pos(t, offset)
 	if t.err != nil {
-		t.err(pos, msg, ..args)
+		t.err(pos=pos, fmt=msg, args=args)
 	}
 	t.error_count += 1
 }
@@ -268,32 +269,27 @@ scan_comment :: proc(t: ^Tokenizer) -> (comment: string, err: Error) {
 
 // Skip CDATA
 skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
-	if t.read_offset + len(CDATA_START) >= len(t.src) {
-		// Can't be the start of a CDATA tag.
+	if s := string(t.src[t.offset:]); !strings.has_prefix(s, CDATA_START) {
 		return .None
 	}
 
-	if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
-		t.read_offset += len(CDATA_START)
-		offset := t.offset
+	t.read_offset += len(CDATA_START)
+	offset := t.offset
 
-		cdata_scan: for {
-			advance_rune(t)
-			if t.ch < 0 {
-				error(t, offset, "[scan_string] CDATA was not terminated\n")
-				return .Premature_EOF
-			}
+	cdata_scan: for {
+		advance_rune(t)
+		if t.ch < 0 {
+			error(t, offset, "[scan_string] CDATA was not terminated\n")
+			return .Premature_EOF
+		}
 
-			// Scan until the end of a CDATA tag.
-			if t.read_offset + len(CDATA_END) < len(t.src) {
-				if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
-					t.read_offset += len(CDATA_END)
-					break cdata_scan
-				}
-			}
+		// Scan until the end of a CDATA tag.
+		if s := string(t.src[t.read_offset:]); strings.has_prefix(s, CDATA_END) {
+			t.read_offset += len(CDATA_END)
+			break cdata_scan
 		}
 	}
-	return
+	return .None
 }
 
 @(optimization_mode="favor_size")
@@ -393,6 +389,8 @@ scan :: proc(t: ^Tokenizer, multiline_string := false) -> Token {
 		case '/': kind = .Slash
 		case '-': kind = .Dash
 		case ':': kind = .Colon
+		case '[': kind = .Open_Bracket
+		case ']': kind = .Close_Bracket
 
 		case '"', '\'':
 			kind = .Invalid
diff --git a/core/encoding/xml/xml_reader.odin b/core/encoding/xml/xml_reader.odin
index b8c8b13a4..60744357c 100644
--- a/core/encoding/xml/xml_reader.odin
+++ b/core/encoding/xml/xml_reader.odin
@@ -56,7 +56,7 @@ Option_Flag :: enum {
 Option_Flags :: bit_set[Option_Flag; u16]
 
 Document :: struct {
-	elements:      [dynamic]Element,
+	elements:      [dynamic]Element `fmt:"v,element_count"`,
 	element_count: Element_ID,
 
 	prologue: Attributes,
@@ -70,15 +70,15 @@ Document :: struct {
 
 	// If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live.
 	// Otherwise they'll be in the element tree.
-	comments: [dynamic]string,
+	comments: [dynamic]string        `fmt:"-"`,
 
 	// Internal
-	tokenizer: ^Tokenizer,
-	allocator: mem.Allocator,
+	tokenizer: ^Tokenizer            `fmt:"-"`,
+	allocator: mem.Allocator         `fmt:"-"`,
 
 	// Input. Either the original buffer, or a copy if `.Input_May_Be_Modified` isn't specified.
-	input:           []u8,
-	strings_to_free: [dynamic]string,
+	input:           []u8            `fmt:"-"`,
+	strings_to_free: [dynamic]string `fmt:"-"`,
 }
 
 Element :: struct {
@@ -195,7 +195,6 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
 
 	loop: for {
 		skip_whitespace(t)
-		// NOTE(Jeroen): This is faster as a switch.
 		switch t.ch {
 		case '<':
 			// Consume peeked `<`
@@ -306,9 +305,13 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
 						}
 					}
 
+				case .Open_Bracket:
+					// This could be a CDATA tag that is part of a tag's body. Unread the `<![` so the body scan starts at it.
+					t.offset -= 3
+					parse_body(doc, element, opts) or_return
+
 				case:
-					error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next)
-					return
+					error(t, t.offset, "Unexpected Token after <!: %#v", next)
 				}
 
 			} else if open.kind == .Question {
@@ -341,38 +344,7 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
 
 		case:
 			// This should be a tag's body text.
-			body_text        := scan_string(t, t.offset) or_return
-			needs_processing := .Unbox_CDATA          in opts.flags
-			needs_processing |= .Decode_SGML_Entities in opts.flags
-
-			if !needs_processing {
-				append(&doc.elements[element].value, body_text)
-				continue
-			}
-
-			decode_opts := entity.XML_Decode_Options{}
-			if .Keep_Tag_Body_Comments not_in opts.flags {
-				decode_opts += { .Comment_Strip }
-			}
-
-			if .Decode_SGML_Entities not_in opts.flags {
-				decode_opts += { .No_Entity_Decode }
-			}
-
-			if .Unbox_CDATA in opts.flags {
-				decode_opts += { .Unbox_CDATA }
-				if .Decode_SGML_Entities in opts.flags {
-					decode_opts += { .Decode_CDATA }
-				}
-			}
-
-			decoded, decode_err := entity.decode_xml(body_text, decode_opts)
-			if decode_err == .None {
-				append(&doc.elements[element].value, decoded)
-				append(&doc.strings_to_free, decoded)
-			} else {
-				append(&doc.elements[element].value, body_text)
-			}
+			parse_body(doc, element, opts) or_return
 		}
 	}
 
@@ -457,8 +429,6 @@ parse_attribute :: proc(doc: ^Document) -> (attr: Attribute, offset: int, err: E
 	t := doc.tokenizer
 
 	key    := expect(t, .Ident)  or_return
-	offset  = t.offset - len(key.text)
-
 	_       = expect(t, .Eq)     or_return
 	value  := expect(t, .String, multiline_string=true) or_return
 
@@ -591,6 +561,47 @@ parse_doctype :: proc(doc: ^Document) -> (err: Error) {
 	return .None
 }
 
+parse_body :: proc(doc: ^Document, element: Element_ID, opts: Options) -> (err: Error) { // Scans body text and appends it to `element`'s values, decoding it first if `opts.flags` asks for that.
+	assert(doc != nil)
+	context.allocator = doc.allocator // Presumably what `entity.decode_xml`'s defaulted `allocator` parameter picks up -- TODO confirm.
+	t := doc.tokenizer
+	// Scan the raw body text starting at the tokenizer's current offset; a scan error is returned to the caller.
+	body_text        := scan_string(t, t.offset) or_return
+	needs_processing := .Unbox_CDATA          in opts.flags
+	needs_processing |= .Decode_SGML_Entities in opts.flags
+	// Neither flag is set: store the scanned text unchanged.
+	if !needs_processing {
+		append(&doc.elements[element].value, body_text)
+		return
+	}
+	// Translate the reader's option flags into the entity decoder's options.
+	decode_opts := entity.XML_Decode_Options{}
+	if .Keep_Tag_Body_Comments not_in opts.flags {
+		decode_opts += { .Comment_Strip }
+	}
+	// Tell the decoder not to decode entities when that was not requested.
+	if .Decode_SGML_Entities not_in opts.flags {
+		decode_opts += { .No_Entity_Decode }
+	}
+	// `.Decode_CDATA` is only added when CDATA is being unboxed *and* entities are being decoded.
+	if .Unbox_CDATA in opts.flags {
+		decode_opts += { .Unbox_CDATA }
+		if .Decode_SGML_Entities in opts.flags {
+			decode_opts += { .Decode_CDATA }
+		}
+	}
+	// On success, store the decoded string and record it in `strings_to_free`; on failure, fall back to the raw text.
+	decoded, decode_err := entity.decode_xml(body_text, decode_opts)
+	if decode_err == .None {
+		append(&doc.elements[element].value, decoded)
+		append(&doc.strings_to_free, decoded)
+	} else {
+		append(&doc.elements[element].value, body_text)
+	}
+	// The named result `err` is never assigned on this path, so its zero value is returned.
+	return
+}
+
 Element_ID :: u32
 
 new_element :: proc(doc: ^Document) -> (id: Element_ID) {
@@ -609,4 +620,4 @@ new_element :: proc(doc: ^Document) -> (id: Element_ID) {
 	cur := doc.element_count
 	doc.element_count += 1
 	return cur
-}
+}
+\ No newline at end of file
author	Jeroen van Rijn <Kelimion@users.noreply.github.com>	2025-04-19 20:25:44 +0200
committer	GitHub <noreply@github.com>	2025-04-19 20:25:44 +0200
commit	062a3c2fae3712c60af00798a0815509a732790b (patch)
tree	9658915735206e27b5a0437b8ec7af5809e38662 /core
parent	bc86b503922781091ec3ae54c722bd8ff33c7205 (diff)