diff options
Diffstat (limited to 'core/encoding/xml')
| -rw-r--r-- | core/encoding/xml/debug_print.odin | 6 | ||||
| -rw-r--r-- | core/encoding/xml/example/xml_example.odin | 4 | ||||
| -rw-r--r-- | core/encoding/xml/helpers.odin | 6 | ||||
| -rw-r--r-- | core/encoding/xml/tokenizer.odin | 6 | ||||
| -rw-r--r-- | core/encoding/xml/xml_reader.odin | 219 |
5 files changed, 85 insertions, 156 deletions
diff --git a/core/encoding/xml/debug_print.odin b/core/encoding/xml/debug_print.odin index b97617a8a..2607bec23 100644 --- a/core/encoding/xml/debug_print.odin +++ b/core/encoding/xml/debug_print.odin @@ -1,3 +1,5 @@ +package xml + /* An XML 1.0 / 1.1 parser @@ -9,7 +11,7 @@ List of contributors: Jeroen van Rijn: Initial implementation. */ -package xml + import "core:io" import "core:fmt" @@ -81,4 +83,4 @@ print_element :: proc(writer: io.Writer, doc: ^Document, element_id: Element_ID, } return written, .None -}
\ No newline at end of file +} diff --git a/core/encoding/xml/example/xml_example.odin b/core/encoding/xml/example/xml_example.odin index aebb8d0ea..9648143db 100644 --- a/core/encoding/xml/example/xml_example.odin +++ b/core/encoding/xml/example/xml_example.odin @@ -20,7 +20,7 @@ example :: proc() { xml.destroy(docs[round]) } - DOC :: #load("../../../../tests/core/assets/XML/unicode.xml") + DOC :: #load("../../../../tests/core/assets/XML/utf8.xml") input := DOC for round in 0..<N { @@ -109,4 +109,4 @@ main :: proc() { } } println("Done and cleaned up!") -}
\ No newline at end of file +} diff --git a/core/encoding/xml/helpers.odin b/core/encoding/xml/helpers.odin index e0b5ecc32..42a5258b3 100644 --- a/core/encoding/xml/helpers.odin +++ b/core/encoding/xml/helpers.odin @@ -1,3 +1,5 @@ +package xml + /* An XML 1.0 / 1.1 parser @@ -6,7 +8,7 @@ This file contains helper functions. */ -package xml + // Find parent's nth child with a given ident. find_child_by_ident :: proc(doc: ^Document, parent_id: Element_ID, ident: string, nth := 0) -> (res: Element_ID, found: bool) { @@ -47,4 +49,4 @@ find_attribute_val_by_key :: proc(doc: ^Document, parent_id: Element_ID, key: st if attr.key == key { return attr.val, true } } return "", false -}
\ No newline at end of file +} diff --git a/core/encoding/xml/tokenizer.odin b/core/encoding/xml/tokenizer.odin index cd055475c..a223a75d6 100644 --- a/core/encoding/xml/tokenizer.odin +++ b/core/encoding/xml/tokenizer.odin @@ -1,3 +1,5 @@ +package xml + /* An XML 1.0 / 1.1 parser @@ -9,7 +11,7 @@ List of contributors: Jeroen van Rijn: Initial implementation. */ -package xml + import "core:fmt" import "core:unicode" @@ -433,4 +435,4 @@ scan :: proc(t: ^Tokenizer) -> Token { lit = string(t.src[offset : t.offset]) } return Token{kind, lit, pos} -}
\ No newline at end of file +} diff --git a/core/encoding/xml/xml_reader.odin b/core/encoding/xml/xml_reader.odin index f4f8a4b05..562d519d5 100644 --- a/core/encoding/xml/xml_reader.odin +++ b/core/encoding/xml/xml_reader.odin @@ -1,28 +1,28 @@ /* - An XML 1.0 / 1.1 parser + XML 1.0 / 1.1 parser - Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>. - Made available under Odin's BSD-3 license. + 2021-2022 Jeroen van Rijn <nom@duclavier.com>. + available under Odin's BSD-3 license. - A from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816). + from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816). - Features: - - Supports enough of the XML 1.0/1.1 spec to handle the 99.9% of XML documents in common current usage. - - Simple to understand and use. Small. +Features: +- Supports enough of the XML 1.0/1.1 spec to handle the 99.9% of XML documents in common current usage. +- Simple to understand and use. Small. - Caveats: - - We do NOT support HTML in this package, as that may or may not be valid XML. - If it works, great. If it doesn't, that's not considered a bug. +Caveats: +- We do NOT support HTML in this package, as that may or may not be valid XML. + If it works, great. If it doesn't, that's not considered a bug. - - We do NOT support UTF-16. If you have a UTF-16 XML file, please convert it to UTF-8 first. Also, our condolences. - - <[!ELEMENT and <[!ATTLIST are not supported, and will be either ignored or return an error depending on the parser options. +- We do NOT support UTF-16. If you have a UTF-16 XML file, please convert it to UTF-8 first. Also, our condolences. +- <[!ELEMENT and <[!ATTLIST are not supported, and will be either ignored or return an error depending on the parser options. - MAYBE: - - XML writer? - - Serialize/deserialize Odin types? +MAYBE: +- XML writer? +- Serialize/deserialize Odin types? - List of contributors: - Jeroen van Rijn: Initial implementation. +List of contributors: +- Jeroen van Rijn: Initial implementation. */ package xml // An XML 1.0 / 1.1 parser @@ -43,48 +43,32 @@ DEFAULT_OPTIONS :: Options{ } Option_Flag :: enum { - /* - If the caller says that input may be modified, we can perform in-situ parsing. - If this flag isn't provided, the XML parser first duplicates the input so that it can. - */ + // If the caller says that input may be modified, we can perform in-situ parsing. + // If this flag isn't provided, the XML parser first duplicates the input so that it can. Input_May_Be_Modified, - /* - Document MUST start with `<?xml` prologue. - */ + // Document MUST start with `<?xml` prologue. Must_Have_Prolog, - /* - Document MUST have a `<!DOCTYPE`. - */ + // Document MUST have a `<!DOCTYPE`. Must_Have_DocType, - /* - By default we skip comments. Use this option to intern a comment on a parented Element. - */ + // By default we skip comments. Use this option to intern a comment on a parented Element. Intern_Comments, - /* - How to handle unsupported parts of the specification, like <! other than <!DOCTYPE and <![CDATA[ - */ + // How to handle unsupported parts of the specification, like <! other than <!DOCTYPE and <![CDATA[ Error_on_Unsupported, Ignore_Unsupported, - /* - By default CDATA tags are passed-through as-is. - This option unwraps them when encountered. - */ + // By default CDATA tags are passed-through as-is. + // This option unwraps them when encountered. Unbox_CDATA, - /* - By default SGML entities like `>`, ` ` and ` ` are passed-through as-is. - This option decodes them when encountered. - */ + // By default SGML entities like `>`, ` ` and ` ` are passed-through as-is. + // This option decodes them when encountered. Decode_SGML_Entities, - /* - If a tag body has a comment, it will be stripped unless this option is given. - */ + // If a tag body has a comment, it will be stripped unless this option is given. Keep_Tag_Body_Comments, } Option_Flags :: bit_set[Option_Flag; u16] @@ -97,28 +81,20 @@ Document :: struct { encoding: Encoding, doctype: struct { - /* - We only scan the <!DOCTYPE IDENT part and skip the rest. - */ + // We only scan the <!DOCTYPE IDENT part and skip the rest. ident: string, rest: string, }, - /* - If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live. - Otherwise they'll be in the element tree. - */ + // If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live. + // Otherwise they'll be in the element tree. comments: [dynamic]string, - /* - Internal - */ + // Internal tokenizer: ^Tokenizer, allocator: mem.Allocator, - /* - Input. Either the original buffer, or a copy if `.Input_May_Be_Modified` isn't specified. - */ + // Input. Either the original buffer, or a copy if `.Input_May_Be_Modified` isn't specified. input: []u8, strings_to_free: [dynamic]string, } @@ -158,34 +134,24 @@ Encoding :: enum { UTF_8, ISO_8859_1, - /* - Aliases - */ + // Aliases LATIN_1 = ISO_8859_1, } Error :: enum { - /* - General return values. - */ + // General return values. None = 0, General_Error, Unexpected_Token, Invalid_Token, - /* - Couldn't find, open or read file. - */ + // Couldn't find, open or read file. File_Error, - /* - File too short. - */ + // File too short. Premature_EOF, - /* - XML-specific errors. - */ + // XML-specific errors. No_Prolog, Invalid_Prolog, Too_Many_Prologs, @@ -194,11 +160,9 @@ Error :: enum { Too_Many_DocTypes, DocType_Must_Preceed_Elements, - /* - If a DOCTYPE is present _or_ the caller - asked for a specific DOCTYPE and the DOCTYPE - and root tag don't match, we return `.Invalid_DocType`. - */ + // If a DOCTYPE is present _or_ the caller + // asked for a specific DOCTYPE and the DOCTYPE + // and root tag don't match, we return `.Invalid_DocType`. Invalid_DocType, Invalid_Tag_Value, @@ -211,27 +175,20 @@ Error :: enum { Unsupported_Version, Unsupported_Encoding, - /* - <!FOO are usually skipped. - */ + // <!FOO are usually skipped. Unhandled_Bang, Duplicate_Attribute, Conflicting_Options, } -/* - Implementation starts here. -*/ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) { data := data context.allocator = allocator opts := validate_options(options) or_return - /* - If `.Input_May_Be_Modified` is not specified, we duplicate the input so that we can modify it in-place. - */ + // If `.Input_May_Be_Modified` is not specified, we duplicate the input so that we can modify it in-place. if .Input_May_Be_Modified not_in opts.flags { data = bytes.clone(data) } @@ -252,10 +209,8 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha element, parent: Element_ID open: Token - /* - If a DOCTYPE is present, the root tag has to match. - If an expected DOCTYPE is given in options (i.e. it's non-empty), the DOCTYPE (if present) and root tag have to match. - */ + // If a DOCTYPE is present, the root tag has to match. + // If an expected DOCTYPE is given in options (i.e. it's non-empty), the DOCTYPE (if present) and root tag have to match. expected_doctype := options.expected_doctype loop: for { @@ -263,17 +218,13 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha // NOTE(Jeroen): This is faster as a switch. switch t.ch { case '<': - /* - Consume peeked `<` - */ + // Consume peeked `<` advance_rune(t) open = scan(t) // NOTE(Jeroen): We're not using a switch because this if-else chain ordered by likelihood is 2.5% faster at -o:size and -o:speed. if likely(open.kind, Token_Kind.Ident) == .Ident { - /* - e.g. <odin - Start of new element. - */ + // e.g. <odin - Start of new element. element = new_element(doc) if element == 0 { // First Element parent = element @@ -286,11 +237,9 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha parse_attributes(doc, &doc.elements[element].attribs) or_return - /* - If a DOCTYPE is present _or_ the caller - asked for a specific DOCTYPE and the DOCTYPE - and root tag don't match, we return .Invalid_Root_Tag. - */ + // If a DOCTYPE is present _or_ the caller + // asked for a specific DOCTYPE and the DOCTYPE + // and root tag don't match, we return .Invalid_Root_Tag. if element == 0 { // Root tag? if len(expected_doctype) > 0 && expected_doctype != open.text { error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text) @@ -298,23 +247,17 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha } } - /* - One of these should follow: - - `>`, which means we've just opened this tag and expect a later element to close it. - - `/>`, which means this is an 'empty' or self-closing tag. - */ + // One of these should follow: + // - `>`, which means we've just opened this tag and expect a later element to close it. + // - `/>`, which means this is an 'empty' or self-closing tag. end_token := scan(t) #partial switch end_token.kind { case .Gt: - /* - We're now the new parent. - */ + // We're now the new parent. parent = element case .Slash: - /* - Empty tag. Close it. - */ + // Empty tag. Close it. expect(t, .Gt) or_return parent = doc.elements[element].parent element = parent @@ -325,9 +268,7 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha } } else if open.kind == .Slash { - /* - Close tag. - */ + // Close tag. ident := expect(t, .Ident) or_return _ = expect(t, .Gt) or_return @@ -339,9 +280,7 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha element = parent } else if open.kind == .Exclaim { - /* - <! - */ + // <! next := scan(t) #partial switch next.kind { case .Ident: @@ -370,10 +309,8 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha } case .Dash: - /* - Comment: <!-- -->. - The grammar does not allow a comment to end in ---> - */ + // Comment: <!-- -->. + // The grammar does not allow a comment to end in ---> expect(t, .Dash) comment := scan_comment(t) or_return @@ -395,23 +332,17 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha } } else if open.kind == .Question { - /* - <?xml - */ + // <?xml next := scan(t) #partial switch next.kind { case .Ident: if len(next.text) == 3 && strings.equal_fold(next.text, "xml") { parse_prologue(doc) or_return } else if len(doc.prologue) > 0 { - /* - We've already seen a prologue. - */ + // We've already seen a prologue. return doc, .Too_Many_Prologs } else { - /* - Could be `<?xml-stylesheet`, etc. Ignore it. - */ + // Could be `<?xml-stylesheet`, etc. Ignore it. skip_element(t) or_return } case: @@ -425,15 +356,11 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha } case -1: - /* - End of file. - */ + // End of file. break loop case: - /* - This should be a tag's body text. - */ + // This should be a tag's body text. body_text := scan_string(t, t.offset) or_return needs_processing := .Unbox_CDATA in opts.flags needs_processing |= .Decode_SGML_Entities in opts.flags @@ -613,9 +540,7 @@ parse_prologue :: proc(doc: ^Document) -> (err: Error) { doc.encoding = .LATIN_1 case: - /* - Unrecognized encoding, assume UTF-8. - */ + // Unrecognized encoding, assume UTF-8. error(t, offset, "[parse_prologue] Warning: Unrecognized encoding: %v\n", attr.val) } @@ -658,11 +583,11 @@ skip_element :: proc(t: ^Tokenizer) -> (err: Error) { parse_doctype :: proc(doc: ^Document) -> (err: Error) { /* - <!DOCTYPE greeting SYSTEM "hello.dtd"> + <!DOCTYPE greeting SYSTEM "hello.dtd"> - <!DOCTYPE greeting [ - <!ELEMENT greeting (#PCDATA)> - ]> + <!DOCTYPE greeting [ + <!ELEMENT greeting (#PCDATA)> + ]> */ assert(doc != nil) context.allocator = doc.allocator @@ -675,9 +600,7 @@ parse_doctype :: proc(doc: ^Document) -> (err: Error) { offset := t.offset skip_element(t) or_return - /* - -1 because the current offset is that of the closing tag, so the rest of the DOCTYPE tag ends just before it. - */ + // -1 because the current offset is that of the closing tag, so the rest of the DOCTYPE tag ends just before it. doc.doctype.rest = string(t.src[offset : t.offset - 1]) return .None } @@ -700,4 +623,4 @@ new_element :: proc(doc: ^Document) -> (id: Element_ID) { cur := doc.element_count doc.element_count += 1 return cur -}
\ No newline at end of file +} |