diff options
| author | gingerBill <gingerBill@users.noreply.github.com> | 2026-01-18 11:45:40 +0000 |
|---|---|---|
| committer | gingerBill <gingerBill@users.noreply.github.com> | 2026-01-18 11:45:40 +0000 |
| commit | 23198f79179075f05a965db7951613ef6bb3f9fd (patch) | |
| tree | cff43a0bd767f2d81e21fbfc1903f4045e7fcc4f /core/encoding/entity/entity.odin | |
| parent | 227e7920a84ca6d899edcabcdd7fb04dde447f97 (diff) | |
Move html escaping calls and fix existing generator
Diffstat (limited to 'core/encoding/entity/entity.odin')
| -rw-r--r-- | core/encoding/entity/entity.odin | 277 |
1 files changed, 262 insertions, 15 deletions
diff --git a/core/encoding/entity/entity.odin b/core/encoding/entity/entity.odin
index e112eedf2..0d455ef88 100644
--- a/core/encoding/entity/entity.odin
+++ b/core/encoding/entity/entity.odin
@@ -21,6 +21,7 @@ package encoding_unicode_entity
 		Jeroen van Rijn: Initial implementation.
 */
 
+import "base:runtime"
 import "core:unicode/utf8"
 import "core:unicode"
 import "core:strings"
@@ -141,8 +142,10 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
 				write_string(&builder, entity)
 			} else {
 				if .No_Entity_Decode not_in options {
-					if decoded, ok := xml_decode_entity(entity); ok {
-						write_rune(&builder, decoded)
+					if decoded, count, ok := xml_decode_entity(entity); ok {
+						for i in 0..<count {
+							write_rune(&builder, decoded[i])
+						}
 						continue
 					}
 				}
@@ -212,17 +215,16 @@ advance :: proc(t: ^Tokenizer) -> (err: Error) {
 	}
 }
 
-xml_decode_entity :: proc(entity: string) -> (decoded: rune, ok: bool) {
+xml_decode_entity :: proc(entity: string) -> (decoded: [2]rune, rune_count: int, ok: bool) {
 	entity := entity
-	if len(entity) == 0 { return -1, false }
+	if len(entity) == 0 { return }
 
-	switch entity[0] {
-	case '#':
+	if entity[0] == '#' {
 		base := 10
 		val  := 0
 		entity = entity[1:]
 
-		if len(entity) == 0 { return -1, false }
+		if len(entity) == 0 { return }
 
 		if entity[0] == 'x' || entity[0] == 'X' {
 			base = 16
@@ -237,30 +239,290 @@ xml_decode_entity :: proc(entity: string) -> (decoded: rune, ok: bool) {
 				val += int(r - '0')
 
 			case 'a'..='f':
-				if base == 10 { return -1, false }
+				if base == 10 { return }
 				val *= base
 				val += int(r - 'a' + 10)
 
 			case 'A'..='F':
-				if base == 10 { return -1, false }
+				if base == 10 { return }
 				val *= base
 				val += int(r - 'A' + 10)
 
 			case:
-				return -1, false
+				return
 			}
 
-			if val > MAX_RUNE_CODEPOINT { return -1, false }
+			if val > MAX_RUNE_CODEPOINT { return }
 			entity = entity[1:]
 		}
-		return rune(val), true
+		return rune(val), 1, true
+	}
+	// Named entity.
+	return named_xml_entity_to_rune(entity)
+}
+
+
+// escape_html escapes special characters like '&' to become '&amp;'.
+// It escapes only 5 different characters: & ' < > and ".
+@(require_results)
+escape_html :: proc(s: string, allocator := context.allocator, loc := #caller_location) -> (output: string, was_allocation: bool) {
+	/*
+	& -> &amp;
+	' -> &#39; // &#39; is shorter than &apos; (NOTE: &apos; was not available until HTML 5)
+	< -> &lt;
+	> -> &gt;
+	" -> &#34; // &#34; is shorter than &quot;
+	*/
+
+	b := transmute([]byte)s
+
+	extra_bytes_needed := 0
+
+	for c in b {
+		switch c {
+		case '&':  extra_bytes_needed += 4
+		case '\'': extra_bytes_needed += 4
+		case '<':  extra_bytes_needed += 3
+		case '>':  extra_bytes_needed += 3
+		case '"':  extra_bytes_needed += 4
+		}
+	}
+
+	if extra_bytes_needed == 0 {
+		return s, false
+	}
+
+	t, err := make([]byte, len(s) + extra_bytes_needed, allocator, loc)
+	if err != nil {
+		return
+	}
+	was_allocation = true
+
+	w := 0
+	for c in b {
+		s := ""
+		switch c {
+		case '&':  s = "&amp;"
+		case '\'': s = "&#39;"
+		case '<':  s = "&lt;"
+		case '>':  s = "&gt;"
+		case '"':  s = "&#34;"
+		}
+		if s != "" {
+			copy(t[w:], s)
+			w += len(s)
+		} else {
+			t[w] = c
+			w += 1
+		}
+	}
+	output = string(t[0:w])
+	return
+}
+
+
+// unescape_html replaces the named, decimal (`&#60;`) and hexadecimal (`&#x3c;`) character
+// references in `s` with their UTF-8 text; anything that cannot be decoded is kept unchanged.
+// `s` itself is returned, with `was_allocation == false`, when it contains no '&'.
+@(require_results)
+unescape_html :: proc(s: string, allocator := context.allocator, loc := #caller_location) -> (output: string, was_allocation: bool, err: runtime.Allocator_Error) {
+	@(require_results)
+	do_append :: proc(s: string, amp_idx: int, buf: ^[dynamic]byte) -> (n: int) {
+		s, amp_idx := s, amp_idx
+
+		n += len(s[:amp_idx])
+		if buf != nil { append(buf, s[:amp_idx]) }
+		s = s[amp_idx:]
+		for len(s) > 0 {
+			b, w, j := unescape_entity(s)
+			if w > 0 {
+				n += w
+				if buf != nil { append(buf, ..b[:w]) }
+			} else {
+				// Nothing was decoded: keep the scanned text verbatim, and always
+				// consume at least the '&' so a lone or trailing '&' cannot stall the loop.
+				j = max(j, 1)
+				n += j
+				if buf != nil { append(buf, s[:j]) }
+			}
+
+			s = s[j:]
+
+			amp_idx = strings.index_byte(s, '&')
+			if amp_idx < 0 {
+				n += len(s)
+				if buf != nil { append(buf, s) }
+				break
+			}
+			n += amp_idx
+			if buf != nil { append(buf, s[:amp_idx]) }
+			s = s[amp_idx:]
+		}
+
+		return
+	}
 
-	case:
-		// Named entity.
-		return named_xml_entity_to_rune(entity)
+	s := s
+	amp_idx := strings.index_byte(s, '&')
+	if amp_idx < 0 {
+		return s, false, nil
 	}
+
+	// NOTE(bill): this does a two pass in order to minimize the allocations required
+	bytes_required := do_append(s, amp_idx, nil)
+
+	buf := make([dynamic]byte, 0, bytes_required, allocator, loc) or_return
+	was_allocation = true
+
+	_ = do_append(s, amp_idx, &buf)
+
+	assert(len(buf) == cap(buf))
+	output = string(buf[:])
+
+	return
 }
 
+// unescape_entity decodes the single character reference at the start of `s`.
+// It returns the decoded UTF-8 bytes in `b[:w]` and the number of bytes of `s` that were
+// scanned in `j`; `w == 0` means nothing was decoded.
+@(require_results)
+unescape_entity :: proc(s: string) -> (b: [8]byte, w: int, j: int) {
+	s := s
+	if len(s) < 2 {
+		return
+	}
+	if s[0] != '&' {
+		return
+	}
+	j = 1
+
+	if s[j] == '#' { // scan numbers
+		j += 1
+		if len(s) <= 3 { // remove `&#.`
+			return
+		}
+		c := s[j]
+		hex := false
+		if c == 'x' || c == 'X' {
+			hex = true
+			j += 1
+		}
+
+		x := rune(0)
+		scan_number: for j < len(s) {
+			c = s[j]
+			j += 1
+			if hex {
+				switch c {
+				case '0'..='9': x = 16*x + rune(c) - '0';      continue scan_number
+				case 'a'..='f': x = 16*x + rune(c) - 'a' + 10; continue scan_number
+				case 'A'..='F': x = 16*x + rune(c) - 'A' + 10; continue scan_number
+				}
+			} else {
+				switch c {
+				case '0'..='9': x = 10*x + rune(c) - '0'; continue scan_number
+				}
+			}
+
+			// Keep the ';' to check for cases which require it and cases which might not
+			if c != ';' {
+				j -= 1
+			}
+			break scan_number
+		}
+
+
+		if j <= 3 { // no replacement characters found
+			return
+		}
+
+		@(static, rodata)
+		windows_1252_replacement_table := [0xa0 - 0x80]rune{ // Windows-1252 -> UTF-8
+			'\u20ac', '\u0081', '\u201a', '\u0192',
+			'\u201e', '\u2026', '\u2020', '\u2021',
+			'\u02c6', '\u2030', '\u0160', '\u2039',
+			'\u0152', '\u008d', '\u017d', '\u008f',
+			'\u0090', '\u2018', '\u2019', '\u201c',
+			'\u201d', '\u2022', '\u2013', '\u2014',
+			'\u02dc', '\u2122', '\u0161', '\u203a',
+			'\u0153', '\u009d', '\u017e', '\u0178',
+		}
+
+		switch x {
+		case 0x80..<0xa0:
+			x = windows_1252_replacement_table[x-0x80]
+		case 0, 0xd800..=0xdfff:
+			x = utf8.RUNE_ERROR
+		case:
+			if x > 0x10ffff {
+				x = utf8.RUNE_ERROR
+			}
+
+		}
+
+		b1, w1 := utf8.encode_rune(x)
+		w += copy(b[:], b1[:w1])
+		return
+	}
+
+	// Lookup by entity names
+
+	scan_ident: for j < len(s) { // scan over letters and digits
+		c := s[j]
+		j += 1
+
+		switch c {
+		case 'a'..='z', 'A'..='Z', '0'..='9':
+			continue scan_ident
+		}
+		// Keep the ';' to check for cases which require it and cases which might not
+		if c != ';' {
+			j -= 1
+		}
+		break scan_ident
+	}
+
+	entity_name := s[1:j]
+	if len(entity_name) == 0 {
+		return
+	}
+
+	if entity_name[len(entity_name)-1] == ';' {
+		entity_name = entity_name[:len(entity_name)-1]
+	}
+
+	if r2, _, ok := named_xml_entity_to_rune(entity_name); ok {
+		b1, w1 := utf8.encode_rune(r2[0])
+		w += copy(b[w:], b1[:w1])
+		if r2[1] != 0 {
+			b2, w2 := utf8.encode_rune(r2[1])
+			w += copy(b[w:], b2[:w2])
+		}
+		return
+	}
+
+	// The longest entities that do not end with a semicolon are <=6 bytes long
+	LONGEST_ENTITY_WITHOUT_SEMICOLON :: 6
+
+	n := min(len(entity_name)-1, LONGEST_ENTITY_WITHOUT_SEMICOLON)
+	for i := n; i > 1; i -= 1 {
+		if r2, _, ok := named_xml_entity_to_rune(entity_name[:i]); ok {
+			// Consume only '&' plus the matched prefix so the characters after it are kept as text.
+			j = 1 + i
+			b1, w1 := utf8.encode_rune(r2[0])
+			w += copy(b[w:], b1[:w1])
+			if r2[1] != 0 {
+				b2, w2 := utf8.encode_rune(r2[1])
+				w += copy(b[w:], b2[:w2])
+			}
+			return
+		}
+	}
+
+	return
+}
+
+
 // Private XML helper to extract `&<stuff>;` entity.
 @(private="file")
 _extract_xml_entity :: proc(t: ^Tokenizer) -> (entity: string, err: Error) {