author    Jeroen van Rijn <Kelimion@users.noreply.github.com>	2021-12-01 22:01:19 +0100
committer Jeroen van Rijn <Kelimion@users.noreply.github.com>	2021-12-05 02:52:23 +0100
commit    23baf56c8784901f67970760db5025c9c9f03b67 (patch)
tree      0e33a131a9fb514c3fcc40287c1fe3386c70b74e /core/encoding/xml/tokenizer.odin
parent    beff90e1d12391e63cd1119023f8565eda97593e (diff)
[xml] Improve CDATA + comment handling in tag body.
Diffstat (limited to 'core/encoding/xml/tokenizer.odin')
-rw-r--r--	core/encoding/xml/tokenizer.odin	117
1 file changed, 94 insertions, 23 deletions
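
For context, here is a small, hypothetical fragment of the kind this change targets: a tag body that mixes character data, a comment, and a CDATA section (the element name is made up for illustration).

	<description>
		Some text, <!-- an inline comment --> more text,
		and <![CDATA[ literal <markup> & entities ]]> at the end.
	</description>

Previously, scan_string tracked CDATA with an in_cdata flag and did not treat comments specially. With this change, both constructs are consumed by dedicated paths (skip_cdata and scan_comment) before the body scan continues.
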
diff --git a/core/encoding/xml/tokenizer.odin b/core/encoding/xml/tokenizer.odin
index 9247d2531..95024518d 100644
--- a/core/encoding/xml/tokenizer.odin
+++ b/core/encoding/xml/tokenizer.odin
@@ -46,8 +46,11 @@ Token_Kind :: enum {
EOF,
}
-CDATA_START :: "<![CDATA["
-CDATA_END   :: "]]>"
+CDATA_START   :: "<![CDATA["
+CDATA_END     :: "]]>"
+
+COMMENT_START :: "<!--"
+COMMENT_END   :: "-->"
Tokenizer :: struct {
// Immutable data
@@ -214,10 +217,83 @@ scan_identifier :: proc(t: ^Tokenizer) -> string {
return string(t.src[offset : t.offset])
}
+/*
+ A comment ends when we see -->, preceded by a character that's not a dash.
+ "For compatibility, the string "--" (double-hyphen) must not occur within comments."
+
+ See: https://www.w3.org/TR/2006/REC-xml11-20060816/#dt-comment
+
+ Thanks to the length (4) of the comment start, we also have enough lookback
+ for the `--` check, and peeking at the next byte verifies that there is at
+ least one more character and that it is a `>`.
+*/
+scan_comment :: proc(t: ^Tokenizer) -> (comment: string, err: Error) {
+ offset := t.offset
+
+ for {
+ advance_rune(t)
+ ch := t.ch
+
+ if ch < 0 {
+ error(t, offset, "[parse] Comment was not terminated\n")
+ return "", .Unclosed_Comment
+ }
+
+ if string(t.src[t.offset - 1:][:2]) == "--" {
+ if peek_byte(t) == '>' {
+ break
+ } else {
+ error(t, t.offset - 1, "Invalid -- sequence in comment.\n")
+ return "", .Invalid_Sequence_In_Comment
+ }
+ }
+ }
+
+ expect(t, .Dash)
+ expect(t, .Gt)
+
+ return string(t.src[offset : t.offset - 1]), .None
+}
+
+/*
+ Skip CDATA
+*/
+skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
+ if t.read_offset + len(CDATA_START) >= len(t.src) {
+ /*
+ Can't be the start of a CDATA tag.
+ */
+ return .None
+ }
+
+ if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
+ t.read_offset += len(CDATA_START)
+ offset := t.offset
+
+ cdata_scan: for {
+ advance_rune(t)
+ if t.ch < 0 {
+ error(t, offset, "[scan_string] CDATA was not terminated\n")
+ return .Premature_EOF
+ }
+
+ /*
+ Scan until the end of a CDATA tag.
+ */
+ if t.read_offset + len(CDATA_END) < len(t.src) {
+ if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
+ t.read_offset += len(CDATA_END)
+ break cdata_scan
+ }
+ }
+ }
+ }
+ return
+}
+
@(optimization_mode="speed")
scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close := false, multiline := true) -> (value: string, err: Error) {
err = .None
- in_cdata := false
loop: for {
ch := t.ch
@@ -228,27 +304,23 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close
return "", .Premature_EOF
case '<':
- /*
- Might be the start of a CDATA tag.
- */
- if t.read_offset + len(CDATA_START) < len(t.src) {
- if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
- in_cdata = true
- }
- }
-
- case ']':
- /*
- Might be the end of a CDATA tag.
- */
- if t.read_offset + len(CDATA_END) < len(t.src) {
- if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
- in_cdata = false
+ if peek_byte(t) == '!' {
+ if peek_byte(t, 1) == '[' {
+ /*
+ Might be the start of a CDATA tag.
+ */
+ skip_cdata(t) or_return
+ } else if peek_byte(t, 1) == '-' && peek_byte(t, 2) == '-' {
+ /*
+ Comment start. Eat comment.
+ */
+ t.read_offset += 3
+ _ = scan_comment(t) or_return
}
}
case '\n':
- if !(multiline || in_cdata) {
+ if !multiline {
error(t, offset, string(t.src[offset : t.offset]))
error(t, offset, "[scan_string] Not terminated\n")
err = .Invalid_Tag_Value
@@ -256,13 +328,12 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close
}
}
- if ch == close && !in_cdata {
+ if t.ch == close {
/*
- If it's not a CDATA tag, it's the end of this body.
+ If it's not a CDATA or comment, it's the end of this body.
*/
break loop
}
-
advance_rune(t)
}