From b5c828fe4ee3f0942b2eda1dc5753e4ad6d38ea9 Mon Sep 17 00:00:00 2001 From: Jeroen van Rijn Date: Tue, 30 Nov 2021 23:01:22 +0100 Subject: [xml] Initial implementation of `core:encoding/xml`. A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816). Features: - Supports enough of the XML 1.0/1.1 spec to handle the 99.9% of XML documents in common current usage. - Simple to understand and use. Small. Caveats: - We do NOT support HTML in this package, as that may or may not be valid XML. If it works, great. If it doesn't, that's not considered a bug. - We do NOT support UTF-16. If you have a UTF-16 XML file, please convert it to UTF-8 first. Also, our condolences. - <[!ELEMENT and <[!ATTLIST are not supported, and will be either ignored or return an error depending on the parser options. TODO: - Optional CDATA unboxing. - Optional `>`, ` `, ` ` and other escape substitution in tag bodies. - Test suite MAYBE: - XML writer? - Serialize/deserialize Odin types? --- core/encoding/xml/debug_print.odin | 73 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 core/encoding/xml/debug_print.odin (limited to 'core/encoding/xml/debug_print.odin') diff --git a/core/encoding/xml/debug_print.odin b/core/encoding/xml/debug_print.odin new file mode 100644 index 000000000..0b7ffa822 --- /dev/null +++ b/core/encoding/xml/debug_print.odin @@ -0,0 +1,73 @@ +package xml +/* + An XML 1.0 / 1.1 parser + + Copyright 2021 Jeroen van Rijn . + Made available under Odin's BSD-3 license. + + A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816). + + List of contributors: + Jeroen van Rijn: Initial implementation. +*/ +import "core:fmt" + +/* + Just for debug purposes. +*/ +print :: proc(doc: ^Document) { + assert(doc != nil) + + using fmt + println("[XML Prolog]") + + for attr in doc.prolog { + printf("\t%v: %v\n", attr.key, attr.val) + } + + printf("[Encoding] %v\n", doc.encoding) + printf("[DOCTYPE] %v\n", doc.doctype.ident) + + if len(doc.doctype.rest) > 0 { + printf("\t%v\n", doc.doctype.rest) + } + + if doc.root != nil { + println(" --- ") + print_element(0, doc.root) + println(" --- ") + } +} + +print_element :: proc(indent: int, element: ^Element) { + if element == nil { return } + using fmt + + tab :: proc(indent: int) { + tabs := "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t" + + i := max(0, min(indent, len(tabs))) + printf("%v", tabs[:i]) + } + + tab(indent) + + if element.kind == .Element { + printf("<%v>\n", element.ident) + if len(element.value) > 0 { + tab(indent + 1) + printf("[Value] %v\n", element.value) + } + + for attr in element.attribs { + tab(indent + 1) + printf("[Attr] %v: %v\n", attr.key, attr.val) + } + + for child in element.children { + print_element(indent + 1, child) + } + } else if element.kind == .Comment { + printf("[COMMENT] %v\n", element.value) + } +} \ No newline at end of file -- cgit v1.2.3 From 46a4927acad674b3265969bd5bde591b480d0c73 Mon Sep 17 00:00:00 2001 From: Jeroen van Rijn Date: Wed, 1 Dec 2021 00:32:35 +0100 Subject: [xml] Use `io.Writer` for `xml.print(doc)`. --- core/encoding/xml/debug_print.odin | 51 ++++++++++++++++-------------- core/encoding/xml/example/xml_example.odin | 8 ++++- core/encoding/xml/xml_reader.odin | 2 +- 3 files changed, 36 insertions(+), 25 deletions(-) (limited to 'core/encoding/xml/debug_print.odin') diff --git a/core/encoding/xml/debug_print.odin b/core/encoding/xml/debug_print.odin index 0b7ffa822..be1175cbc 100644 --- a/core/encoding/xml/debug_print.odin +++ b/core/encoding/xml/debug_print.odin @@ -10,64 +10,69 @@ package xml List of contributors: Jeroen van Rijn: Initial implementation. */ +import "core:io" import "core:fmt" /* Just for debug purposes. */ -print :: proc(doc: ^Document) { - assert(doc != nil) - +print :: proc(writer: io.Writer, doc: ^Document) -> (written: int, err: io.Error) { + if doc == nil { return } using fmt - println("[XML Prolog]") + + written += wprintf(writer, "[XML Prolog]\n") for attr in doc.prolog { - printf("\t%v: %v\n", attr.key, attr.val) + written += wprintf(writer, "\t%v: %v\n", attr.key, attr.val) } - printf("[Encoding] %v\n", doc.encoding) - printf("[DOCTYPE] %v\n", doc.doctype.ident) + written += wprintf(writer, "[Encoding] %v\n", doc.encoding) + written += wprintf(writer, "[DOCTYPE] %v\n", doc.doctype.ident) if len(doc.doctype.rest) > 0 { - printf("\t%v\n", doc.doctype.rest) + wprintf(writer, "\t%v\n", doc.doctype.rest) } if doc.root != nil { - println(" --- ") - print_element(0, doc.root) - println(" --- ") - } + wprintln(writer, " --- ") + print_element(writer, doc.root) + wprintln(writer, " --- ") + } + + return written, .None } -print_element :: proc(indent: int, element: ^Element) { +print_element :: proc(writer: io.Writer, element: ^Element, indent := 0) -> (written: int, err: io.Error) { if element == nil { return } using fmt - tab :: proc(indent: int) { + tab :: proc(writer: io.Writer, indent: int) { tabs := "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t" i := max(0, min(indent, len(tabs))) - printf("%v", tabs[:i]) + wprintf(writer, "%v", tabs[:i]) } - tab(indent) + tab(writer, indent) if element.kind == .Element { - printf("<%v>\n", element.ident) + wprintf(writer, "<%v>\n", element.ident) if len(element.value) > 0 { - tab(indent + 1) - printf("[Value] %v\n", element.value) + tab(writer, indent + 1) + wprintf(writer, "[Value] %v\n", element.value) } for attr in element.attribs { - tab(indent + 1) - printf("[Attr] %v: %v\n", attr.key, attr.val) + tab(writer, indent + 1) + wprintf(writer, "[Attr] %v: %v\n", attr.key, attr.val) } for child in element.children { - print_element(indent + 1, child) + print_element(writer, child, indent + 1) } } else if element.kind == .Comment { - printf("[COMMENT] %v\n", element.value) + wprintf(writer, "[COMMENT] %v\n", element.value) } + + return written, .None } \ No newline at end of file diff --git a/core/encoding/xml/example/xml_example.odin b/core/encoding/xml/example/xml_example.odin index 24a277de6..82938c223 100644 --- a/core/encoding/xml/example/xml_example.odin +++ b/core/encoding/xml/example/xml_example.odin @@ -2,6 +2,7 @@ package xml_example import "core:encoding/xml" import "core:mem" +import "core:strings" import "core:fmt" Error_Handler :: proc(pos: xml.Pos, fmt: string, args: ..any) { @@ -28,7 +29,12 @@ _main :: proc() { doc, err := xml.parse(DOC, OPTIONS, FILENAME, Error_Handler) defer xml.destroy(doc) - xml.print(doc) + buf: strings.Builder + defer strings.destroy_builder(&buf) + w := strings.to_writer(&buf) + + xml.print(w, doc) + println(strings.to_string(buf)) if err != .None { printf("Parse error: %v\n", err) diff --git a/core/encoding/xml/xml_reader.odin b/core/encoding/xml/xml_reader.odin index 526be5856..34f6e65d0 100644 --- a/core/encoding/xml/xml_reader.odin +++ b/core/encoding/xml/xml_reader.odin @@ -75,6 +75,7 @@ Option_Flag :: enum { */ Decode_SGML_Entities, } +Option_Flags :: bit_set[Option_Flag; u8] Document :: struct { root: ^Element, @@ -122,7 +123,6 @@ Options :: struct { flags: Option_Flags, expected_doctype: string, } -Option_Flags :: bit_set[Option_Flag] Encoding :: enum { Unknown, -- cgit v1.2.3 From 682783a2aabad34e838493bb1e4c2437fd13058a Mon Sep 17 00:00:00 2001 From: Jeroen van Rijn Date: Wed, 1 Dec 2021 00:43:22 +0100 Subject: [xml] Tab indentation in debug printer. --- core/encoding/xml/debug_print.odin | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'core/encoding/xml/debug_print.odin') diff --git a/core/encoding/xml/debug_print.odin b/core/encoding/xml/debug_print.odin index be1175cbc..c4d6875cc 100644 --- a/core/encoding/xml/debug_print.odin +++ b/core/encoding/xml/debug_print.odin @@ -47,10 +47,9 @@ print_element :: proc(writer: io.Writer, element: ^Element, indent := 0) -> (wri using fmt tab :: proc(writer: io.Writer, indent: int) { - tabs := "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t" - - i := max(0, min(indent, len(tabs))) - wprintf(writer, "%v", tabs[:i]) + for _ in 0..=indent { + wprintf(writer, "\t") + } } tab(writer, indent) -- cgit v1.2.3 From 32eab04d662b0c1128e64a4b91fb81f5f2be5a95 Mon Sep 17 00:00:00 2001 From: Jeroen van Rijn Date: Wed, 1 Dec 2021 03:15:44 +0100 Subject: [xml] Allow multi-line bodies w/o CDATA. Strip trailing whitespace. --- core/encoding/xml/debug_print.odin | 9 ++++++--- core/encoding/xml/tokenizer.odin | 21 ++++++++++++++++++--- 2 files changed, 24 insertions(+), 6 deletions(-) (limited to 'core/encoding/xml/debug_print.odin') diff --git a/core/encoding/xml/debug_print.odin b/core/encoding/xml/debug_print.odin index c4d6875cc..65b71e30b 100644 --- a/core/encoding/xml/debug_print.odin +++ b/core/encoding/xml/debug_print.odin @@ -27,10 +27,13 @@ print :: proc(writer: io.Writer, doc: ^Document) -> (written: int, err: io.Error } written += wprintf(writer, "[Encoding] %v\n", doc.encoding) - written += wprintf(writer, "[DOCTYPE] %v\n", doc.doctype.ident) - if len(doc.doctype.rest) > 0 { - wprintf(writer, "\t%v\n", doc.doctype.rest) + if len(doc.doctype.ident) > 0 { + written += wprintf(writer, "[DOCTYPE] %v\n", doc.doctype.ident) + + if len(doc.doctype.rest) > 0 { + wprintf(writer, "\t%v\n", doc.doctype.rest) + } } if doc.root != nil { diff --git a/core/encoding/xml/tokenizer.odin b/core/encoding/xml/tokenizer.odin index 3dcffb0d6..e453552b8 100644 --- a/core/encoding/xml/tokenizer.odin +++ b/core/encoding/xml/tokenizer.odin @@ -205,7 +205,7 @@ scan_identifier :: proc(t: ^Tokenizer) -> string { return string(t.src[offset : t.offset]) } -scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close := false) -> (value: string, err: Error) { +scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close := false, multiline := true) -> (value: string, err: Error) { err = .None in_cdata := false @@ -238,7 +238,7 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close } case '\n': - if !in_cdata { + if !(multiline || in_cdata) { error(t, offset, string(t.src[offset : t.offset])) error(t, offset, "[scan_string] Not terminated\n") err = .Invalid_Tag_Value @@ -256,7 +256,22 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close advance_rune(t) } + /* + Strip trailing whitespace. + */ lit := string(t.src[offset : t.offset]) + + end := len(lit) + eat: for ; end > 0; end -= 1 { + ch := lit[end - 1] + switch ch { + case ' ', '\t', '\r', '\n': + case: + break eat + } + } + lit = lit[:end] + if consume_close { advance_rune(t) } @@ -307,7 +322,7 @@ scan :: proc(t: ^Tokenizer) -> Token { case ':': kind = .Colon case '"', '\'': - lit, err = scan_string(t, t.offset, ch, true) + lit, err = scan_string(t, t.offset, ch, true, false) if err == .None { kind = .String } else { -- cgit v1.2.3 From ec63d0bbd21aa3d3f33cd762bd656ea8eb0af4a6 Mon Sep 17 00:00:00 2001 From: Jeroen van Rijn Date: Wed, 1 Dec 2021 15:30:36 +0100 Subject: [xml] Robustness improvement. Can now parse https://www.w3.org/2003/entities/2007xml/unicode.xml no problem. --- core/encoding/xml/debug_print.odin | 4 ++ core/encoding/xml/xml_reader.odin | 75 ++++++++++++++++++++++++++------------ 2 files changed, 55 insertions(+), 24 deletions(-) (limited to 'core/encoding/xml/debug_print.odin') diff --git a/core/encoding/xml/debug_print.odin b/core/encoding/xml/debug_print.odin index 65b71e30b..e6a8c9433 100644 --- a/core/encoding/xml/debug_print.odin +++ b/core/encoding/xml/debug_print.odin @@ -36,6 +36,10 @@ print :: proc(writer: io.Writer, doc: ^Document) -> (written: int, err: io.Error } } + for comment in doc.comments { + written += wprintf(writer, "[Pre-root comment] %v\n", comment) + } + if doc.root != nil { wprintln(writer, " --- ") print_element(writer, doc.root) diff --git a/core/encoding/xml/xml_reader.odin b/core/encoding/xml/xml_reader.odin index 34f6e65d0..b2226e6b9 100644 --- a/core/encoding/xml/xml_reader.odin +++ b/core/encoding/xml/xml_reader.odin @@ -86,10 +86,16 @@ Document :: struct { /* We only scan the . The grammar does not allow a comment to end in ---> */ - if doc.root == nil { - return doc, .Comment_Before_Root_Element - } - expect(t, .Dash) offset := t.offset @@ -329,12 +339,17 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err } if .Intern_Comments in opts.flags { - el := new(Element) - - el.parent = element - el.kind = .Comment - el.value = strings.intern_get(&doc.intern, string(t.src[offset : t.offset - 1])) - append(&element.children, el) + comment := strings.intern_get(&doc.intern, string(t.src[offset : t.offset - 1])) + + if doc.root == nil { + append(&doc.comments, comment) + } else { + el := new(Element) + el.parent = element + el.kind = .Comment + el.value = comment + append(&element.children, el) + } } expect(t, .Dash) @@ -350,6 +365,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err e.g. `, which means this is an 'empty' or self-closing tag. */ end_token := scan(t) - #partial switch end_token.kind { case .Gt: /* @@ -394,9 +409,12 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err case .Slash: /* - Empty tag? + Empty tag. Close it. */ expect(t, .Gt) or_return + parent = element.parent + element = parent + tag_is_open = false case: error(t, t.offset, "Expected close tag, got: %#v\n", end_token) @@ -411,25 +429,33 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err _ = expect(t, .Gt) or_return if element.ident != ident.text { - error(t, t.offset, "Mismatched Closing Tag: %v\n", ident.text) + error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", element.ident, ident.text) return doc, .Mismatched_Closing_Tag } - parent = element.parent - element = parent + parent = element.parent + element = parent + tag_is_open = false case: error(t, t.offset, "Invalid Token after <: %#v\n", open) return } - case .EOF: + case -1: + /* + End of file. + */ + if tag_is_open { + return doc, .Premature_EOF + } break loop case: /* This should be a tag's body text. */ - element.value = scan_string(t, tok.pos.offset) or_return + body_text := scan_string(t, t.offset) or_return + element.value = strings.intern_get(&doc.intern, body_text) } } @@ -480,6 +506,7 @@ destroy :: proc(doc: ^Document) { strings.intern_destroy(&doc.intern) delete(doc.prolog) + delete(doc.comments) free(doc) } -- cgit v1.2.3 From 80878264b63cd8476def629526b294b8e129791a Mon Sep 17 00:00:00 2001 From: Jeroen van Rijn Date: Thu, 28 Apr 2022 15:29:00 +0200 Subject: [xml] Speedup. --- core/encoding/xml/debug_print.odin | 18 +- core/encoding/xml/example/xml_example.odin | 77 +++++--- core/encoding/xml/helpers.odin | 28 ++- core/encoding/xml/tokenizer.odin | 11 ++ core/encoding/xml/xml_reader.odin | 276 ++++++++++++++++------------- tests/core/encoding/xml/test_core_xml.odin | 17 +- 6 files changed, 245 insertions(+), 182 deletions(-) (limited to 'core/encoding/xml/debug_print.odin') diff --git a/core/encoding/xml/debug_print.odin b/core/encoding/xml/debug_print.odin index e6a8c9433..7c20ac123 100644 --- a/core/encoding/xml/debug_print.odin +++ b/core/encoding/xml/debug_print.odin @@ -1,8 +1,7 @@ -package xml /* An XML 1.0 / 1.1 parser - Copyright 2021 Jeroen van Rijn . + Copyright 2021-2022 Jeroen van Rijn . Made available under Odin's BSD-3 license. A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816). @@ -10,6 +9,8 @@ package xml List of contributors: Jeroen van Rijn: Initial implementation. */ +package xml + import "core:io" import "core:fmt" @@ -40,17 +41,16 @@ print :: proc(writer: io.Writer, doc: ^Document) -> (written: int, err: io.Error written += wprintf(writer, "[Pre-root comment] %v\n", comment) } - if doc.root != nil { + if len(doc.elements) > 0 { + wprintln(writer, " --- ") + print_element(writer, doc, 0) wprintln(writer, " --- ") - print_element(writer, doc.root) - wprintln(writer, " --- ") } return written, .None } -print_element :: proc(writer: io.Writer, element: ^Element, indent := 0) -> (written: int, err: io.Error) { - if element == nil { return } +print_element :: proc(writer: io.Writer, doc: ^Document, element_id: Element_ID, indent := 0) -> (written: int, err: io.Error) { using fmt tab :: proc(writer: io.Writer, indent: int) { @@ -61,6 +61,8 @@ print_element :: proc(writer: io.Writer, element: ^Element, indent := 0) -> (wri tab(writer, indent) + element := doc.elements[element_id] + if element.kind == .Element { wprintf(writer, "<%v>\n", element.ident) if len(element.value) > 0 { @@ -74,7 +76,7 @@ print_element :: proc(writer: io.Writer, element: ^Element, indent := 0) -> (wri } for child in element.children { - print_element(writer, child, indent + 1) + print_element(writer, doc, child, indent + 1) } } else if element.kind == .Comment { wprintf(writer, "[COMMENT] %v\n", element.value) diff --git a/core/encoding/xml/example/xml_example.odin b/core/encoding/xml/example/xml_example.odin index daa3c5dab..cadfcfb43 100644 --- a/core/encoding/xml/example/xml_example.odin +++ b/core/encoding/xml/example/xml_example.odin @@ -1,52 +1,85 @@ package xml_example import "core:encoding/xml" -import "core:os" import "core:mem" import "core:fmt" import "core:time" import "core:strings" import "core:hash" +N :: 1 + example :: proc() { using fmt - doc: ^xml.Document - err: xml.Error + docs: [N]^xml.Document + errs: [N]xml.Error + times: [N]time.Duration + + defer for round in 0..` tag.") - os.exit(1) + eprintln("Could not locate top-level `` tag.") + return } - printf("Found `` with %v children.\n", len(charlist.children)) + printf("Found `` with %v children, %v elements total\n", len(docs[0].elements[charlist].children), docs[0].element_count) - crc32 := doc_hash(doc) + crc32 := doc_hash(docs[0]) printf("[%v] CRC32: 0x%08x\n", "🎉" if crc32 == 0xcaa042b9 else "🤬", crc32) + + for round in 0.. (crc32: u32) { diff --git a/core/encoding/xml/helpers.odin b/core/encoding/xml/helpers.odin index 14597ddbd..48f058334 100644 --- a/core/encoding/xml/helpers.odin +++ b/core/encoding/xml/helpers.odin @@ -1,22 +1,20 @@ -package xml /* An XML 1.0 / 1.1 parser - Copyright 2021 Jeroen van Rijn . + Copyright 2021-2022 Jeroen van Rijn . Made available under Odin's BSD-3 license. This file contains helper functions. */ +package xml - -/* - Find `tag`'s nth child with a given ident. -*/ -find_child_by_ident :: proc(tag: ^Element, ident: string, nth := 0) -> (res: ^Element, found: bool) { - if tag == nil { return nil, false } +// Find parent's nth child with a given ident. +find_child_by_ident :: proc(doc: ^Document, parent_id: Element_ID, ident: string, nth := 0) -> (res: Element_ID, found: bool) { + tag := doc.elements[parent_id] count := 0 - for child in tag.children { + for child_id in tag.children { + child := doc.elements[child_id] /* Skip commments. They have no name. */ @@ -26,18 +24,16 @@ find_child_by_ident :: proc(tag: ^Element, ident: string, nth := 0) -> (res: ^El If the ident matches and it's the nth such child, return it. */ if child.ident == ident { - if count == nth { return child, true } + if count == nth { return child_id, true } count += 1 } } - return nil, false + return 0, false } -/* - Find an attribute by key. -*/ -find_attribute_val_by_key :: proc(tag: ^Element, key: string) -> (val: string, found: bool) { - if tag == nil { return "", false } +// Find an attribute by key. +find_attribute_val_by_key :: proc(doc: ^Document, parent_id: Element_ID, key: string) -> (val: string, found: bool) { + tag := doc.elements[parent_id] for attr in tag.attribs { /* diff --git a/core/encoding/xml/tokenizer.odin b/core/encoding/xml/tokenizer.odin index 2da3b7683..c3fece76e 100644 --- a/core/encoding/xml/tokenizer.odin +++ b/core/encoding/xml/tokenizer.odin @@ -1,3 +1,14 @@ +/* + An XML 1.0 / 1.1 parser + + Copyright 2021-2022 Jeroen van Rijn . + Made available under Odin's BSD-3 license. + + A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816). + + List of contributors: + Jeroen van Rijn: Initial implementation. +*/ package xml import "core:fmt" diff --git a/core/encoding/xml/xml_reader.odin b/core/encoding/xml/xml_reader.odin index 0315b0e05..636dd0ae4 100644 --- a/core/encoding/xml/xml_reader.odin +++ b/core/encoding/xml/xml_reader.odin @@ -1,8 +1,7 @@ -package xml /* An XML 1.0 / 1.1 parser - Copyright 2021 Jeroen van Rijn . + Copyright 2021-2022 Jeroen van Rijn . Made available under Odin's BSD-3 license. A from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816). @@ -25,12 +24,17 @@ package xml List of contributors: Jeroen van Rijn: Initial implementation. */ +package xml +// An XML 1.0 / 1.1 parser import "core:bytes" -import "core:strings" import "core:encoding/entity" +import "core:intrinsics" import "core:mem" import "core:os" +import "core:strings" + +likely :: intrinsics.expect DEFAULT_Options :: Options{ flags = { @@ -88,7 +92,9 @@ Option_Flag :: enum { Option_Flags :: bit_set[Option_Flag; u16] Document :: struct { - root: ^Element, + elements: [dynamic]Element, + element_count: Element_ID, + prolog: Attributes, encoding: Encoding, @@ -129,8 +135,8 @@ Element :: struct { Comment, }, - parent: ^Element, - children: [dynamic]^Element, + parent: Element_ID, + children: [dynamic]Element_ID, } Attr :: struct { @@ -185,7 +191,7 @@ Error :: enum { No_DocType, Too_Many_DocTypes, - DocType_Must_Proceed_Elements, + DocType_Must_Preceed_Elements, /* If a DOCTYPE is present _or_ the caller @@ -237,12 +243,16 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err doc.tokenizer = t doc.input = data + doc.elements = make([dynamic]Element, 1024, 1024, allocator) + // strings.intern_init(&doc.intern, allocator, allocator) err = .Unexpected_Token - element, parent: ^Element + element, parent: Element_ID - tag_is_open := false + tag_is_open := false + first_element := true + open: Token /* If a DOCTYPE is present, the root tag has to match. @@ -252,6 +262,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err loop: for { skip_whitespace(t) + // NOTE(Jeroen): This is faster as a switch. switch t.ch { case '<': /* @@ -259,118 +270,36 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err */ advance_rune(t) - open := scan(t) - #partial switch open.kind { - - case .Question: - /* - 0 { - /* - We've already seen a prolog. - */ - return doc, .Too_Many_Prologs - } else { - /* - Could be ` 0 { - return doc, .Too_Many_DocTypes - } - if doc.root != nil { - return doc, .DocType_Must_Proceed_Elements - } - parse_doctype(doc) or_return - - if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident { - error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident) - return doc, .Invalid_DocType - } - expected_doctype = doc.doctype.ident - - case: - if .Error_on_Unsupported in opts.flags { - error(t, t.offset, "Unhandled: . - The grammar does not allow a comment to end in ---> - */ - expect(t, .Dash) - comment := scan_comment(t) or_return - - if .Intern_Comments in opts.flags { - if doc.root == nil { - append(&doc.comments, comment) - } else { - el := new(Element) - el.parent = element - el.kind = .Comment - el.value = comment - append(&element.children, el) - } - } - - case: - error(t, t.offset, "Invalid Token after 0 && expected_doctype != open.text { error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text) return doc, .Invalid_DocType @@ -395,7 +324,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err Empty tag. Close it. */ expect(t, .Gt) or_return - parent = element.parent + parent = doc.elements[element].parent element = parent tag_is_open = false @@ -404,22 +333,103 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err return } - case .Slash: + } else if open.kind == .Slash { /* Close tag. */ ident := expect(t, .Ident) or_return _ = expect(t, .Gt) or_return - if element.ident != ident.text { - error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", element.ident, ident.text) + if doc.elements[element].ident != ident.text { + error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", doc.elements[element].ident, ident.text) return doc, .Mismatched_Closing_Tag } - parent = element.parent + parent = doc.elements[element].parent element = parent tag_is_open = false - case: + } else if open.kind == .Exclaim { + /* + 0 { + return doc, .Too_Many_DocTypes + } + if doc.element_count > 0 { + return doc, .DocType_Must_Preceed_Elements + } + parse_doctype(doc) or_return + + if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident { + error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident) + return doc, .Invalid_DocType + } + expected_doctype = doc.doctype.ident + + case: + if .Error_on_Unsupported in opts.flags { + error(t, t.offset, "Unhandled: . + The grammar does not allow a comment to end in ---> + */ + expect(t, .Dash) + comment := scan_comment(t) or_return + + if .Intern_Comments in opts.flags { + if len(doc.elements) == 0 { + append(&doc.comments, comment) + } else { + el := new_element(doc) + doc.elements[el].parent = element + doc.elements[el].kind = .Comment + doc.elements[el].value = comment + append(&doc.elements[element].children, el) + } + } + + case: + error(t, t.offset, "Invalid Token after 0 { + /* + We've already seen a prolog. + */ + return doc, .Too_Many_Prologs + } else { + /* + Could be ` (err: Error) { */ doc.doctype.rest = string(t.src[offset : t.offset - 1]) return .None +} + +Element_ID :: u32 + +new_element :: proc(doc: ^Document) -> (id: Element_ID) { + element_space := len(doc.elements) + + // Need to resize + if int(doc.element_count) + 1 > element_space { + if element_space < 65536 { + element_space *= 2 + } else { + element_space += 65536 + } + resize(&doc.elements, element_space) + } + + cur := doc.element_count + doc.element_count += 1 + + return cur } \ No newline at end of file diff --git a/tests/core/encoding/xml/test_core_xml.odin b/tests/core/encoding/xml/test_core_xml.odin index 7669afe97..82386b2bb 100644 --- a/tests/core/encoding/xml/test_core_xml.odin +++ b/tests/core/encoding/xml/test_core_xml.odin @@ -224,7 +224,7 @@ doc_to_string :: proc(doc: ^xml.Document) -> (result: string) { written += wprintf(writer, "[DOCTYPE] %v\n", doc.doctype.ident) if len(doc.doctype.rest) > 0 { - wprintf(writer, "\t%v\n", doc.doctype.rest) + wprintf(writer, "\t%v\n", doc.doctype.rest) } } @@ -232,17 +232,16 @@ doc_to_string :: proc(doc: ^xml.Document) -> (result: string) { written += wprintf(writer, "[Pre-root comment] %v\n", comment) } - if doc.root != nil { - wprintln(writer, " --- ") - print_element(writer, doc.root) - wprintln(writer, " --- ") + if doc.element_count > 0 { + wprintln(writer, " --- ") + print_element(writer, doc, 0) + wprintln(writer, " --- ") } return written, .None } - print_element :: proc(writer: io.Writer, element: ^xml.Element, indent := 0) -> (written: int, err: io.Error) { - if element == nil { return } + print_element :: proc(writer: io.Writer, doc: ^xml.Document, element_id: xml.Element_ID, indent := 0) -> (written: int, err: io.Error) { using fmt tab :: proc(writer: io.Writer, indent: int) { @@ -253,6 +252,8 @@ doc_to_string :: proc(doc: ^xml.Document) -> (result: string) { tab(writer, indent) + element := doc.elements[element_id] + if element.kind == .Element { wprintf(writer, "<%v>\n", element.ident) if len(element.value) > 0 { @@ -266,7 +267,7 @@ doc_to_string :: proc(doc: ^xml.Document) -> (result: string) { } for child in element.children { - print_element(writer, child, indent + 1) + print_element(writer, doc, child, indent + 1) } } else if element.kind == .Comment { wprintf(writer, "[COMMENT] %v\n", element.value) -- cgit v1.2.3 From d224679619e4b8b41c62d3cf1909ea05a39f569e Mon Sep 17 00:00:00 2001 From: gingerBill Date: Thu, 12 May 2022 15:57:03 +0100 Subject: Minor name changes within `core:encoding/xml` for consistency --- core/encoding/xml/debug_print.odin | 2 +- core/encoding/xml/example/xml_example.odin | 2 +- core/encoding/xml/xml_reader.odin | 50 ++++++++++++++---------------- 3 files changed, 26 insertions(+), 28 deletions(-) (limited to 'core/encoding/xml/debug_print.odin') diff --git a/core/encoding/xml/debug_print.odin b/core/encoding/xml/debug_print.odin index 7c20ac123..e9a1cb160 100644 --- a/core/encoding/xml/debug_print.odin +++ b/core/encoding/xml/debug_print.odin @@ -23,7 +23,7 @@ print :: proc(writer: io.Writer, doc: ^Document) -> (written: int, err: io.Error written += wprintf(writer, "[XML Prolog]\n") - for attr in doc.prolog { + for attr in doc.prologue { written += wprintf(writer, "\t%v: %v\n", attr.key, attr.val) } diff --git a/core/encoding/xml/example/xml_example.odin b/core/encoding/xml/example/xml_example.odin index cadfcfb43..f7e74840e 100644 --- a/core/encoding/xml/example/xml_example.odin +++ b/core/encoding/xml/example/xml_example.odin @@ -35,7 +35,7 @@ example :: proc() { times[round] = time.tick_diff(start, end) } - fastest := time.Duration(max(i64)) + fastest := max(time.Duration) slowest := time.Duration(0) total := time.Duration(0) diff --git a/core/encoding/xml/xml_reader.odin b/core/encoding/xml/xml_reader.odin index 151d44e2a..b77ae97b3 100644 --- a/core/encoding/xml/xml_reader.odin +++ b/core/encoding/xml/xml_reader.odin @@ -36,10 +36,8 @@ import "core:strings" likely :: intrinsics.expect -DEFAULT_Options :: Options{ - flags = { - .Ignore_Unsupported, - }, +DEFAULT_OPTIONS :: Options{ + flags = {.Ignore_Unsupported}, expected_doctype = "", } @@ -51,7 +49,7 @@ Option_Flag :: enum { Input_May_Be_Modified, /* - Document MUST start with ` (doc: ^Document, err: Error) { +parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) { data := data context.allocator = allocator @@ -411,10 +409,10 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err #partial switch next.kind { case .Ident: if len(next.text) == 3 && strings.to_lower(next.text, context.temp_allocator) == "xml" { - parse_prolog(doc) or_return - } else if len(doc.prolog) > 0 { + parse_prologue(doc) or_return + } else if len(doc.prologue) > 0 { /* - We've already seen a prolog. + We've already seen a prologue. */ return doc, .Too_Many_Prologs } else { @@ -481,7 +479,7 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err } } - if .Must_Have_Prolog in opts.flags && len(doc.prolog) == 0 { + if .Must_Have_Prolog in opts.flags && len(doc.prologue) == 0 { return doc, .No_Prolog } @@ -493,16 +491,16 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err return doc, .None } -parse_from_string :: proc(data: string, options := DEFAULT_Options, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) { +parse_string :: proc(data: string, options := DEFAULT_OPTIONS, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) { _data := transmute([]u8)data - return parse_from_slice(_data, options, path, error_handler, allocator) + return parse_bytes(_data, options, path, error_handler, allocator) } -parse :: proc { parse_from_string, parse_from_slice } +parse :: proc { parse_string, parse_bytes } // Load an XML file -load_from_file :: proc(filename: string, options := DEFAULT_Options, error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) { +load_from_file :: proc(filename: string, options := DEFAULT_OPTIONS, error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) { context.allocator = allocator options := options @@ -511,7 +509,7 @@ load_from_file :: proc(filename: string, options := DEFAULT_Options, error_handl options.flags += { .Input_May_Be_Modified } - return parse_from_slice(data, options, filename, error_handler, allocator) + return parse_bytes(data, options, filename, error_handler, allocator) } destroy :: proc(doc: ^Document) { @@ -523,7 +521,7 @@ destroy :: proc(doc: ^Document) { } delete(doc.elements) - delete(doc.prolog) + delete(doc.prologue) delete(doc.comments) delete(doc.input) @@ -556,7 +554,7 @@ expect :: proc(t: ^Tokenizer, kind: Token_Kind) -> (tok: Token, err: Error) { return tok, .Unexpected_Token } -parse_attribute :: proc(doc: ^Document) -> (attr: Attr, offset: int, err: Error) { +parse_attribute :: proc(doc: ^Document) -> (attr: Attribute, offset: int, err: Error) { assert(doc != nil) context.allocator = doc.allocator t := doc.tokenizer @@ -574,7 +572,7 @@ parse_attribute :: proc(doc: ^Document) -> (attr: Attr, offset: int, err: Error) return } -check_duplicate_attributes :: proc(t: ^Tokenizer, attribs: Attributes, attr: Attr, offset: int) -> (err: Error) { +check_duplicate_attributes :: proc(t: ^Tokenizer, attribs: Attributes, attr: Attribute, offset: int) -> (err: Error) { for a in attribs { if attr.key == a.key { error(t, offset, "Duplicate attribute: %v\n", attr.key) @@ -598,21 +596,21 @@ parse_attributes :: proc(doc: ^Document, attribs: ^Attributes) -> (err: Error) { return .None } -parse_prolog :: proc(doc: ^Document) -> (err: Error) { +parse_prologue :: proc(doc: ^Document) -> (err: Error) { assert(doc != nil) context.allocator = doc.allocator t := doc.tokenizer offset := t.offset - parse_attributes(doc, &doc.prolog) or_return + parse_attributes(doc, &doc.prologue) or_return - for attr in doc.prolog { + for attr in doc.prologue { switch attr.key { case "version": switch attr.val { case "1.0", "1.1": case: - error(t, offset, "[parse_prolog] Warning: Unhandled XML version: %v\n", attr.val) + error(t, offset, "[parse_prologue] Warning: Unhandled XML version: %v\n", attr.val) } case "encoding": @@ -627,7 +625,7 @@ parse_prolog :: proc(doc: ^Document) -> (err: Error) { /* Unrecognized encoding, assume UTF-8. */ - error(t, offset, "[parse_prolog] Warning: Unrecognized encoding: %v\n", attr.val) + error(t, offset, "[parse_prologue] Warning: Unrecognized encoding: %v\n", attr.val) } case: -- cgit v1.2.3