diff options
| author | Jeroen van Rijn <Kelimion@users.noreply.github.com> | 2021-12-01 22:01:19 +0100 |
|---|---|---|
| committer | Jeroen van Rijn <Kelimion@users.noreply.github.com> | 2021-12-05 02:52:23 +0100 |
| commit | 23baf56c8784901f67970760db5025c9c9f03b67 (patch) | |
| tree | 0e33a131a9fb514c3fcc40287c1fe3386c70b74e /core/encoding/xml/tokenizer.odin | |
| parent | beff90e1d12391e63cd1119023f8565eda97593e (diff) | |
[xml] Improve CDATA + comment handling in tag body.
Diffstat (limited to 'core/encoding/xml/tokenizer.odin')
| -rw-r--r-- | core/encoding/xml/tokenizer.odin | 117 |
1 files changed, 94 insertions, 23 deletions
diff --git a/core/encoding/xml/tokenizer.odin b/core/encoding/xml/tokenizer.odin index 9247d2531..95024518d 100644 --- a/core/encoding/xml/tokenizer.odin +++ b/core/encoding/xml/tokenizer.odin @@ -46,8 +46,11 @@ Token_Kind :: enum { EOF, } -CDATA_START :: "<![CDATA[" -CDATA_END :: "]]>" +CDATA_START :: "<![CDATA[" +CDATA_END :: "]]>" + +COMMENT_START :: "<!--" +COMMENT_END :: "-->" Tokenizer :: struct { // Immutable data @@ -214,10 +217,83 @@ scan_identifier :: proc(t: ^Tokenizer) -> string { return string(t.src[offset : t.offset]) } +/* + A comment ends when we see -->, preceded by a character that's not a dash. + "For compatibility, the string "--" (double-hyphen) must not occur within comments." + + See: https://www.w3.org/TR/2006/REC-xml11-20060816/#dt-comment + + Thanks to the length (4) of the comment start, we also have enough lookback, + and the peek at the next byte asserts that there's at least one more character + that's a `>`. +*/ +scan_comment :: proc(t: ^Tokenizer) -> (comment: string, err: Error) { + offset := t.offset + + for { + advance_rune(t) + ch := t.ch + + if ch < 0 { + error(t, offset, "[parse] Comment was not terminated\n") + return "", .Unclosed_Comment + } + + if string(t.src[t.offset - 1:][:2]) == "--" { + if peek_byte(t) == '>' { + break + } else { + error(t, t.offset - 1, "Invalid -- sequence in comment.\n") + return "", .Invalid_Sequence_In_Comment + } + } + } + + expect(t, .Dash) + expect(t, .Gt) + + return string(t.src[offset : t.offset - 1]), .None +} + +/* + Skip CDATA +*/ +skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) { + if t.read_offset + len(CDATA_START) >= len(t.src) { + /* + Can't be the start of a CDATA tag. + */ + return .None + } + + if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START { + t.read_offset += len(CDATA_START) + offset := t.offset + + cdata_scan: for { + advance_rune(t) + if t.ch < 0 { + error(t, offset, "[scan_string] CDATA was not terminated\n") + return .Premature_EOF + } + + /* + Scan until the end of a CDATA tag. + */ + if t.read_offset + len(CDATA_END) < len(t.src) { + if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END { + t.read_offset += len(CDATA_END) + break cdata_scan + } + } + } + } + return +} + @(optimization_mode="speed") scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close := false, multiline := true) -> (value: string, err: Error) { err = .None - in_cdata := false loop: for { ch := t.ch @@ -228,27 +304,23 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close return "", .Premature_EOF case '<': - /* - Might be the start of a CDATA tag. - */ - if t.read_offset + len(CDATA_START) < len(t.src) { - if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START { - in_cdata = true - } - } - - case ']': - /* - Might be the end of a CDATA tag. - */ - if t.read_offset + len(CDATA_END) < len(t.src) { - if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END { - in_cdata = false + if peek_byte(t) == '!' { + if peek_byte(t, 1) == '[' { + /* + Might be the start of a CDATA tag. + */ + skip_cdata(t) or_return + } else if peek_byte(t, 1) == '-' && peek_byte(t, 2) == '-' { + /* + Comment start. Eat comment. + */ + t.read_offset += 3 + _ = scan_comment(t) or_return } } case '\n': - if !(multiline || in_cdata) { + if !multiline { error(t, offset, string(t.src[offset : t.offset])) error(t, offset, "[scan_string] Not terminated\n") err = .Invalid_Tag_Value @@ -256,13 +328,12 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close } } - if ch == close && !in_cdata { + if t.ch == close { /* - If it's not a CDATA tag, it's the end of this body. + If it's not a CDATA or comment, it's the end of this body. */ break loop } - advance_rune(t) } |