aboutsummaryrefslogtreecommitdiff
path: root/core
diff options
context:
space:
mode:
author: Jeroen van Rijn <Kelimion@users.noreply.github.com> 2021-12-01 15:30:36 +0100
committer: Jeroen van Rijn <Kelimion@users.noreply.github.com> 2021-12-05 02:52:22 +0100
commit: ec63d0bbd21aa3d3f33cd762bd656ea8eb0af4a6 (patch)
tree: 0c113ca28319dbef82c6341989a3717205f32e79 /core
parent: 32eab04d662b0c1128e64a4b91fb81f5f2be5a95 (diff)
[xml] Robustness improvement.
Can now parse https://www.w3.org/2003/entities/2007xml/unicode.xml no problem.
Diffstat (limited to 'core')
-rw-r--r-- core/encoding/xml/debug_print.odin | 4
-rw-r--r-- core/encoding/xml/xml_reader.odin | 75
2 files changed, 55 insertions, 24 deletions
diff --git a/core/encoding/xml/debug_print.odin b/core/encoding/xml/debug_print.odin
index 65b71e30b..e6a8c9433 100644
--- a/core/encoding/xml/debug_print.odin
+++ b/core/encoding/xml/debug_print.odin
@@ -36,6 +36,10 @@ print :: proc(writer: io.Writer, doc: ^Document) -> (written: int, err: io.Error
}
}
+ for comment in doc.comments {
+ written += wprintf(writer, "[Pre-root comment] %v\n", comment)
+ }
+
if doc.root != nil {
wprintln(writer, " --- ")
print_element(writer, doc.root)
diff --git a/core/encoding/xml/xml_reader.odin b/core/encoding/xml/xml_reader.odin
index 34f6e65d0..b2226e6b9 100644
--- a/core/encoding/xml/xml_reader.odin
+++ b/core/encoding/xml/xml_reader.odin
@@ -86,11 +86,17 @@ Document :: struct {
/*
We only scan the <!DOCTYPE IDENT part and skip the rest.
*/
- ident: string,
- rest: string,
+ ident: string,
+ rest: string,
},
/*
+ If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live.
+ Otherwise they'll be in the element tree.
+ */
+ comments: [dynamic]string,
+
+ /*
Internal
*/
tokenizer: ^Tokenizer,
@@ -218,6 +224,8 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
err = .Unexpected_Token
element, parent: ^Element
+ tag_is_open := false
+
/*
If a DOCTYPE is present, the root tag has to match.
If an expected DOCTYPE is given in options (i.e. it's non-empty), the DOCTYPE (if present) and root tag have to match.
@@ -225,10 +233,14 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
expected_doctype := options.expected_doctype
loop: for {
- tok := scan(t)
- #partial switch tok.kind {
+ skip_whitespace(t)
+ switch t.ch {
+ case '<':
+ /*
+ Consume peeked `<`
+ */
+ tok := scan(t)
- case .Lt:
open := scan(t)
#partial switch open.kind {
@@ -247,8 +259,10 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
*/
return doc, .Too_Many_Prologs
} else {
- error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", tok.text)
- return
+ /*
+ Could be `<?xml-stylesheet`, etc. Ignore it.
+ */
+ skip_element(t) or_return
}
case:
error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", tok.text)
@@ -292,10 +306,6 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
Comment: <!-- -->.
The grammar does not allow a comment to end in --->
*/
- if doc.root == nil {
- return doc, .Comment_Before_Root_Element
- }
-
expect(t, .Dash)
offset := t.offset
@@ -329,12 +339,17 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
}
if .Intern_Comments in opts.flags {
- el := new(Element)
-
- el.parent = element
- el.kind = .Comment
- el.value = strings.intern_get(&doc.intern, string(t.src[offset : t.offset - 1]))
- append(&element.children, el)
+ comment := strings.intern_get(&doc.intern, string(t.src[offset : t.offset - 1]))
+
+ if doc.root == nil {
+ append(&doc.comments, comment)
+ } else {
+ el := new(Element)
+ el.parent = element
+ el.kind = .Comment
+ el.value = comment
+ append(&element.children, el)
+ }
}
expect(t, .Dash)
@@ -350,6 +365,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
e.g. <odin - Start of new element.
*/
element = new(Element)
+ tag_is_open = true
if doc.root == nil {
/*
@@ -384,7 +400,6 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
- `/>`, which means this is an 'empty' or self-closing tag.
*/
end_token := scan(t)
-
#partial switch end_token.kind {
case .Gt:
/*
@@ -394,9 +409,12 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
case .Slash:
/*
- Empty tag?
+ Empty tag. Close it.
*/
expect(t, .Gt) or_return
+ parent = element.parent
+ element = parent
+ tag_is_open = false
case:
error(t, t.offset, "Expected close tag, got: %#v\n", end_token)
@@ -411,25 +429,33 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
_ = expect(t, .Gt) or_return
if element.ident != ident.text {
- error(t, t.offset, "Mismatched Closing Tag: %v\n", ident.text)
+ error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", element.ident, ident.text)
return doc, .Mismatched_Closing_Tag
}
- parent = element.parent
- element = parent
+ parent = element.parent
+ element = parent
+ tag_is_open = false
case:
error(t, t.offset, "Invalid Token after <: %#v\n", open)
return
}
- case .EOF:
+ case -1:
+ /*
+ End of file.
+ */
+ if tag_is_open {
+ return doc, .Premature_EOF
+ }
break loop
case:
/*
This should be a tag's body text.
*/
- element.value = scan_string(t, tok.pos.offset) or_return
+ body_text := scan_string(t, t.offset) or_return
+ element.value = strings.intern_get(&doc.intern, body_text)
}
}
@@ -480,6 +506,7 @@ destroy :: proc(doc: ^Document) {
strings.intern_destroy(&doc.intern)
delete(doc.prolog)
+ delete(doc.comments)
free(doc)
}