about | summary | refs | log | tree | commit | diff
path: root/core/encoding
diff options
context:
space:
mode:
author Jeroen van Rijn <Kelimion@users.noreply.github.com> 2021-12-02 21:07:40 +0100
committer Jeroen van Rijn <Kelimion@users.noreply.github.com> 2021-12-05 02:52:23 +0100
commit 3d72e80ccf0f382f03a1c9407c4728862c5bca91 (patch)
tree d5ed66c7b2d2a8fe697eeccb35a0884977143f32 /core/encoding
parent 2dd67dba89732b89adb0199bc0a99de4cbc34e8f (diff)
[xml] Implement optional unboxing of CDATA and decoding of tag values.
Diffstat (limited to 'core/encoding')
-rw-r--r-- core/encoding/entity/entity.odin 39
-rw-r--r-- core/encoding/entity/example/entity_example.odin 67
-rw-r--r-- core/encoding/entity/example/test.html 2
-rw-r--r-- core/encoding/xml/xml_reader.odin 41
4 files changed, 56 insertions, 93 deletions
diff --git a/core/encoding/entity/entity.odin b/core/encoding/entity/entity.odin
index e40896819..8742446e6 100644
--- a/core/encoding/entity/entity.odin
+++ b/core/encoding/entity/entity.odin
@@ -61,15 +61,21 @@ COMMENT_END :: "-->"
*/
XML_Decode_Option :: enum u8 {
/*
+ Do not decode & entities. It decodes by default.
+ If given, overrides `Decode_CDATA`.
+ */
+ No_Entity_Decode,
+
+ /*
CDATA is unboxed.
*/
- CDATA_Unbox,
+ Unbox_CDATA,
/*
Unboxed CDATA is decoded as well.
- Ignored if `.CDATA_Unbox` is not given.
+ Ignored if `.Unbox_CDATA` is not given.
*/
- CDATA_Decode,
+ Decode_CDATA,
/*
Comments are stripped.
@@ -129,7 +135,7 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
}
case:
- if in_data && .CDATA_Decode not_in options {
+ if in_data && .Decode_CDATA not_in options {
/*
Unboxed, but undecoded.
*/
@@ -145,17 +151,20 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
*/
write_string(&builder, entity)
} else {
- if decoded, ok := xml_decode_entity(entity); ok {
- write_rune(&builder, decoded)
- } else {
- /*
- Decode failed. Pass through original.
- */
- write_string(&builder, "&")
- write_string(&builder, entity)
- write_string(&builder, ";")
+
+ if .No_Entity_Decode not_in options {
+ if decoded, ok := xml_decode_entity(entity); ok {
+ write_rune(&builder, decoded)
+ continue
+ }
}
+ /*
+ Literal passthrough because the decode failed or we want entities not decoded.
+ */
+ write_string(&builder, "&")
+ write_string(&builder, entity)
+ write_string(&builder, ";")
}
} else {
write_rune(&builder, t.r)
@@ -290,7 +299,7 @@ _handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: X
if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
t.read_offset += len(CDATA_START) - 1
- if .CDATA_Unbox in options && .CDATA_Decode in options {
+ if .Unbox_CDATA in options && .Decode_CDATA in options {
/*
We're unboxing _and_ decoding CDATA
*/
@@ -315,7 +324,7 @@ _handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: X
cdata := string(t.src[offset : t.read_offset])
- if .CDATA_Unbox in options {
+ if .Unbox_CDATA in options {
cdata = cdata[len(CDATA_START):]
cdata = cdata[:len(cdata) - len(CDATA_END)]
}
diff --git a/core/encoding/entity/example/entity_example.odin b/core/encoding/entity/example/entity_example.odin
index 8758d9ad9..161a44827 100644
--- a/core/encoding/entity/example/entity_example.odin
+++ b/core/encoding/entity/example/entity_example.odin
@@ -1,19 +1,11 @@
package unicode_entity_example
import "core:encoding/xml"
-import "core:encoding/entity"
import "core:strings"
import "core:mem"
import "core:fmt"
import "core:time"
-OPTIONS :: xml.Options{
- flags = {
- .Ignore_Unsupported, .Intern_Comments,
- },
- expected_doctype = "",
-}
-
doc_print :: proc(doc: ^xml.Document) {
buf: strings.Builder
defer strings.destroy_builder(&buf)
@@ -29,6 +21,13 @@ _entities :: proc() {
DOC :: #load("../../../../tests/core/assets/XML/unicode.xml")
+ OPTIONS :: xml.Options{
+ flags = {
+ .Ignore_Unsupported, .Intern_Comments,
+ },
+ expected_doctype = "",
+ }
+
parse_duration: time.Duration
{
@@ -50,57 +49,11 @@ _entities :: proc() {
_main :: proc() {
using fmt
- doc, err := xml.parse(#load("test.html"))
+ options := xml.Options{ flags = { .Ignore_Unsupported, .Intern_Comments, .Unbox_CDATA, .Decode_SGML_Entities }}
+ doc, _ := xml.parse(#load("test.html"), options)
+
defer xml.destroy(doc)
doc_print(doc)
-
- if false {
- val := doc.root.children[1].children[2].value
-
- println()
- replaced, ok := entity.decode_xml(val)
- defer delete(replaced)
-
- printf("Before: '%v', Err: %v\n", val, err)
- printf("Passthrough: '%v'\nOK: %v\n", replaced, ok)
- println()
- }
-
- if false {
- val := doc.root.children[1].children[2].value
-
- println()
- replaced, ok := entity.decode_xml(val, { .CDATA_Unbox })
- defer delete(replaced)
-
- printf("Before: '%v', Err: %v\n", val, err)
- printf("CDATA_Unbox: '%v'\nOK: %v\n", replaced, ok)
- println()
- }
-
- if true {
- val := doc.root.children[1].children[2].value
-
- println()
- replaced, ok := entity.decode_xml(val, { .CDATA_Unbox, .CDATA_Decode })
- defer delete(replaced)
-
- printf("Before: '%v', Err: %v\n", val, err)
- printf("CDATA_Decode: '%v'\nOK: %v\n", replaced, ok)
- println()
- }
-
- if true {
- val := doc.root.children[1].children[1].value
-
- println()
- replaced, ok := entity.decode_xml(val, { .Comment_Strip })
- defer delete(replaced)
-
- printf("Before: '%v', Err: %v\n", val, err)
- printf("Comment_Strip: '%v'\nOK: %v\n", replaced, ok)
- println()
- }
}
main :: proc() {
diff --git a/core/encoding/entity/example/test.html b/core/encoding/entity/example/test.html
index 60e32bf03..62a0bb35a 100644
--- a/core/encoding/entity/example/test.html
+++ b/core/encoding/entity/example/test.html
@@ -16,9 +16,11 @@
<div id="test_cdata_in_comment" foo="">
Foozle]!&#32;&copy;&#x20;<!-- <![CDATA[&#32;&reg;&#x20;]]> -->42&;1234&
</div>
+ <!-- EXPECTED: Foozle]! © 42&;1234& -->
<div id="test_cdata_unwrap_and_passthrough">
Foozle]!&#32;&copy;&#x20;<![CDATA[BOX&#32;&reg;&#x20;/BOX]]>42&;1234&
</div>
+ <!-- EXPECTED: Foozle]! © BOX ® /BOX42&;1234& -->
<div>
&verbar; &vert; &VerticalLine; &fjlig; &grave; &bsol; &reg; &rhov; &CounterClockwiseContourIntegral;
</div>
diff --git a/core/encoding/xml/xml_reader.odin b/core/encoding/xml/xml_reader.odin
index 146c278cb..6f49b8e08 100644
--- a/core/encoding/xml/xml_reader.odin
+++ b/core/encoding/xml/xml_reader.odin
@@ -18,10 +18,6 @@ package xml
- We do NOT support UTF-16. If you have a UTF-16 XML file, please convert it to UTF-8 first. Also, our condolences.
- <[!ELEMENT and <[!ATTLIST are not supported, and will be either ignored or return an error depending on the parser options.
- TODO:
- - Optional CDATA unboxing.
- - Optional `&gt;`, `&#32;`, `&#x20;` and other escape substitution in tag bodies.
-
MAYBE:
- XML writer?
- Serialize/deserialize Odin types?
@@ -31,6 +27,7 @@ package xml
*/
import "core:strings"
+import "core:encoding/entity"
import "core:mem"
import "core:os"
@@ -196,12 +193,6 @@ Error :: enum {
Duplicate_Attribute,
Conflicting_Options,
-
- /*
- Unhandled TODO:
- */
- Unhandled_CDATA_Unboxing,
- Unhandled_SGML_Entity_Decoding,
}
/*
@@ -422,8 +413,25 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
/*
This should be a tag's body text.
*/
- body_text := scan_string(t, t.offset) or_return
- element.value = strings.intern_get(&doc.intern, body_text)
+ body_text := scan_string(t, t.offset) or_return
+
+ decode_opts := entity.XML_Decode_Options{ .Comment_Strip }
+
+ if .Decode_SGML_Entities not_in opts.flags {
+ decode_opts += { .No_Entity_Decode }
+ }
+ if .Unbox_CDATA in opts.flags {
+ decode_opts += { .Unbox_CDATA, .Decode_CDATA }
+ }
+
+ decoded, decode_err := entity.decode_xml(body_text, decode_opts)
+ defer delete(decoded)
+
+ if decode_err == .None {
+ element.value = strings.intern_get(&doc.intern, decoded)
+ } else {
+ element.value = strings.intern_get(&doc.intern, body_text)
+ }
}
}
@@ -488,15 +496,6 @@ validate_options :: proc(options: Options) -> (validated: Options, err: Error) {
if .Error_on_Unsupported in validated.flags && .Ignore_Unsupported in validated.flags {
return options, .Conflicting_Options
}
-
- if .Unbox_CDATA in validated.flags {
- return options, .Unhandled_CDATA_Unboxing
- }
-
- if .Decode_SGML_Entities in validated.flags {
- return options, .Unhandled_SGML_Entity_Decoding
- }
-
return validated, .None
}