diff options
| author | Jeroen van Rijn <Kelimion@users.noreply.github.com> | 2024-06-12 12:52:48 +0200 |
|---|---|---|
| committer | Jeroen van Rijn <Kelimion@users.noreply.github.com> | 2024-06-12 12:52:48 +0200 |
| commit | ebadff555d70d5ef8faf91785696e5c5ea3605f8 (patch) | |
| tree | d50d29bf5e53ca855bc5a2109e814673363c6ce3 /core/encoding/entity | |
| parent | e87c5bca58e00b8edaf24a25627b36351e92dc16 (diff) | |
Update XML reader to normalize whitespace, part 1.
Diffstat (limited to 'core/encoding/entity')
| -rw-r--r-- | core/encoding/entity/entity.odin | 109 |
1 files changed, 40 insertions, 69 deletions
diff --git a/core/encoding/entity/entity.odin b/core/encoding/entity/entity.odin index cee6230ef..280be9377 100644 --- a/core/encoding/entity/entity.odin +++ b/core/encoding/entity/entity.odin @@ -56,38 +56,27 @@ CDATA_END :: "]]>" COMMENT_START :: "<!--" COMMENT_END :: "-->" -/* - Default: CDATA and comments are passed through unchanged. -*/ +// Default: CDATA and comments are passed through unchanged. XML_Decode_Option :: enum u8 { - /* - Do not decode & entities. It decodes by default. - If given, overrides `Decode_CDATA`. - */ + // Do not decode & entities. It decodes by default. If given, overrides `Decode_CDATA`. No_Entity_Decode, - /* - CDATA is unboxed. - */ + // CDATA is unboxed. Unbox_CDATA, - /* - Unboxed CDATA is decoded as well. - Ignored if `.Unbox_CDATA` is not given. - */ + // Unboxed CDATA is decoded as well. Ignored if `.Unbox_CDATA` is not given. Decode_CDATA, - /* - Comments are stripped. - */ + // Comments are stripped. Comment_Strip, + + // Normalize whitespace + Normalize_Whitespace, } XML_Decode_Options :: bit_set[XML_Decode_Option; u8] -/* - Decode a string that may include SGML/XML/HTML entities. - The caller has to free the result. -*/ +// Decode a string that may include SGML/XML/HTML entities. +// The caller has to free the result. decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator := context.allocator) -> (decoded: string, err: Error) { context.allocator = allocator @@ -100,14 +89,14 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator := t := Tokenizer{src=input} in_data := false + prev: rune + loop: for { advance(&t) or_return if t.r < 0 { break loop } - /* - Below here we're never inside a CDATA tag. - At most we'll see the start of one, but that doesn't affect the logic. - */ + // Below here we're never inside a CDATA tag. At most we'll see the start of one, + // but that doesn't affect the logic. switch t.r { case '<': /* @@ -126,9 +115,7 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator := in_data = _handle_xml_special(&t, &builder, options) or_return case ']': - /* - If we're unboxing _and_ decoding CDATA, we'll have to check for the end tag. - */ + // If we're unboxing _and_ decoding CDATA, we'll have to check for the end tag. if in_data { if t.read_offset + len(CDATA_END) < len(t.src) { if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END { @@ -143,22 +130,16 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator := case: if in_data && .Decode_CDATA not_in options { - /* - Unboxed, but undecoded. - */ + // Unboxed, but undecoded. write_rune(&builder, t.r) continue } if t.r == '&' { if entity, entity_err := _extract_xml_entity(&t); entity_err != .None { - /* - We read to the end of the string without closing the entity. - Pass through as-is. - */ + // We read to the end of the string without closing the entity. Pass through as-is. write_string(&builder, entity) } else { - if .No_Entity_Decode not_in options { if decoded, ok := xml_decode_entity(entity); ok { write_rune(&builder, decoded) @@ -166,19 +147,27 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator := } } - /* - Literal passthrough because the decode failed or we want entities not decoded. - */ + // Literal passthrough because the decode failed or we want entities not decoded. write_string(&builder, "&") write_string(&builder, entity) write_string(&builder, ";") } } else { - write_rune(&builder, t.r) + // https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-line-ends + switch t.r { + case '\n', 0x85, 0x2028: + write_rune(&builder, '\n') + case '\r': // Do nothing until next character + case: + if prev == '\r' { // Turn a single carriage return into a \n + write_rune(&builder, '\n') + } + write_rune(&builder, t.r) + } + prev = t.r } } } - return strings.clone(strings.to_string(builder), allocator), err } @@ -253,24 +242,18 @@ xml_decode_entity :: proc(entity: string) -> (decoded: rune, ok: bool) { return rune(val), true case: - /* - Named entity. - */ + // Named entity. return named_xml_entity_to_rune(entity) } } -/* - Private XML helper to extract `&<stuff>;` entity. -*/ +// Private XML helper to extract `&<stuff>;` entity. @(private="file") _extract_xml_entity :: proc(t: ^Tokenizer) -> (entity: string, err: Error) { assert(t != nil && t.r == '&') - /* - All of these would be in the ASCII range. - Even if one is not, it doesn't matter. All characters we need to compare to extract are. - */ + // All of these would be in the ASCII range. + // Even if one is not, it doesn't matter. All characters we need to compare to extract are. length := len(t.src) found := false @@ -292,9 +275,7 @@ _extract_xml_entity :: proc(t: ^Tokenizer) -> (entity: string, err: Error) { return string(t.src[t.offset : t.read_offset]), .Invalid_Entity_Encoding } -/* - Private XML helper for CDATA and comments. -*/ +// Private XML helper for CDATA and comments. @(private="file") _handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: XML_Decode_Options) -> (in_data: bool, err: Error) { assert(t != nil && t.r == '<') @@ -304,20 +285,14 @@ _handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: X t.read_offset += len(CDATA_START) - 1 if .Unbox_CDATA in options && .Decode_CDATA in options { - /* - We're unboxing _and_ decoding CDATA - */ + // We're unboxing _and_ decoding CDATA return true, .None } - /* - CDATA is passed through. - */ + // CDATA is passed through. offset := t.offset - /* - Scan until end of CDATA. - */ + // Scan until end of CDATA. for { advance(t) or_return if t.r < 0 { return true, .CDATA_Not_Terminated } @@ -341,14 +316,10 @@ _handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: X } else if string(t.src[t.offset:][:len(COMMENT_START)]) == COMMENT_START { t.read_offset += len(COMMENT_START) - /* - Comment is passed through by default. - */ + // Comment is passed through by default. offset := t.offset - /* - Scan until end of Comment. - */ + // Scan until end of Comment. for { advance(t) or_return if t.r < 0 { return true, .Comment_Not_Terminated } |