diff options
| author | Jeroen van Rijn <Kelimion@users.noreply.github.com> | 2021-12-02 20:12:12 +0100 |
|---|---|---|
| committer | Jeroen van Rijn <Kelimion@users.noreply.github.com> | 2021-12-05 02:52:23 +0100 |
| commit | 2dd67dba89732b89adb0199bc0a99de4cbc34e8f (patch) | |
| tree | d5ba3341bdfa31758d59590b0c62d6f3aa8a3cad /core/encoding/entity/entity.odin | |
| parent | 580721440657a9fe5334b6bf095fb70b584fa4f6 (diff) | |
[core:encoding/entity] Add new package to decode &<entity>; entities.
Includes generator to generate a lookup for named entities.
Diffstat (limited to 'core/encoding/entity/entity.odin')
| -rw-r--r-- | core/encoding/entity/entity.odin | 358 |
1 file changed, 358 insertions, 0 deletions
diff --git a/core/encoding/entity/entity.odin b/core/encoding/entity/entity.odin new file mode 100644 index 000000000..e40896819 --- /dev/null +++ b/core/encoding/entity/entity.odin @@ -0,0 +1,358 @@ +package unicode_entity +/* + A unicode entity encoder/decoder + + Copyright 2021 Jeroen van Rijn <nom@duclavier.com>. + Made available under Odin's BSD-3 license. + + This code has several procedures to map unicode runes to/from different textual encodings. + - SGML/XML/HTML entity + -- &#<decimal>; + -- &#x<hexadecimal>; + -- &<entity name>; (If the lookup tables are compiled in). + Reference: https://www.w3.org/2003/entities/2007xml/unicode.xml + + - URL encode / decode %hex entity + Reference: https://datatracker.ietf.org/doc/html/rfc3986/#section-2.1 + + List of contributors: + Jeroen van Rijn: Initial implementation. +*/ + +import "core:unicode/utf8" +import "core:unicode" +import "core:strings" + +MAX_RUNE_CODEPOINT :: int(unicode.MAX_RUNE) + +write_rune :: strings.write_rune_builder +write_string :: strings.write_string_builder + +Error :: enum u8 { + None = 0, + Tokenizer_Is_Nil, + + Illegal_NUL_Character, + Illegal_UTF_Encoding, + Illegal_BOM, + + CDATA_Not_Terminated, + Comment_Not_Terminated, + Invalid_Entity_Encoding, +} + +Tokenizer :: struct { + r: rune, + w: int, + + src: string, + offset: int, + read_offset: int, +} + +CDATA_START :: "<![CDATA[" +CDATA_END :: "]]>" + +COMMENT_START :: "<!--" +COMMENT_END :: "-->" + +/* + Default: CDATA and comments are passed through unchanged. +*/ +XML_Decode_Option :: enum u8 { + /* + CDATA is unboxed. + */ + CDATA_Unbox, + + /* + Unboxed CDATA is decoded as well. + Ignored if `.CDATA_Unbox` is not given. + */ + CDATA_Decode, + + /* + Comments are stripped. + */ + Comment_Strip, +} +XML_Decode_Options :: bit_set[XML_Decode_Option; u8] + +/* + Decode a string that may include SGML/XML/HTML entities. + The caller has to free the result. 
+*/ +decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator := context.allocator) -> (decoded: string, err: Error) { + context.allocator = allocator + + l := len(input) + if l == 0 { return "", .None } + + builder := strings.make_builder() + defer strings.destroy_builder(&builder) + + t := Tokenizer{src=input} + in_data := false + + loop: for { + advance(&t) or_return + if t.r < 0 { break loop } + + /* + Below here we're never inside a CDATA tag. + At most we'll see the start of one, but that doesn't affect the logic. + */ + switch t.r { + case '<': + /* + Might be the start of a CDATA tag or comment. + + We don't need to check if we need to write a `<`, because if it isn't CDATA or a comment, + it couldn't have been part of an XML tag body to be decoded here. + */ + in_data = _handle_xml_special(&t, &builder, options) or_return + + case ']': + /* + If we're unboxing _and_ decoding CDATA, we'll have to check for the end tag. + */ + if in_data { + if t.read_offset + len(CDATA_END) < len(t.src) { + if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END { + in_data = false + t.read_offset += len(CDATA_END) - 1 + } + } + continue + } else { + write_rune(&builder, ']') + } + + case: + if in_data && .CDATA_Decode not_in options { + /* + Unboxed, but undecoded. + */ + write_rune(&builder, t.r) + continue + } + + if t.r == '&' { + if entity, entity_err := _extract_xml_entity(&t); entity_err != .None { + /* + We read to the end of the string without closing the entity. + Pass through as-is. + */ + write_string(&builder, entity) + } else { + if decoded, ok := xml_decode_entity(entity); ok { + write_rune(&builder, decoded) + } else { + /* + Decode failed. Pass through original. 
+ */ + write_string(&builder, "&") + write_string(&builder, entity) + write_string(&builder, ";") + } + + } + } else { + write_rune(&builder, t.r) + } + } + } + + return strings.clone(strings.to_string(builder), allocator), err +} + +advance :: proc(t: ^Tokenizer) -> (err: Error) { + if t == nil { return .Tokenizer_Is_Nil } + using t + + #no_bounds_check { + if read_offset < len(src) { + offset = read_offset + r, w = rune(src[read_offset]), 1 + switch { + case r == 0: + return .Illegal_NUL_Character + case r >= utf8.RUNE_SELF: + r, w = utf8.decode_rune_in_string(src[read_offset:]) + if r == utf8.RUNE_ERROR && w == 1 { + return .Illegal_UTF_Encoding + } else if r == utf8.RUNE_BOM && offset > 0 { + return .Illegal_BOM + } + } + read_offset += w + return .None + } else { + offset = len(src) + r = -1 + return + } + } +} + +xml_decode_entity :: proc(entity: string) -> (decoded: rune, ok: bool) { + entity := entity + if len(entity) == 0 { return -1, false } + + switch entity[0] { + case '#': + base := 10 + val := 0 + entity = entity[1:] + + if len(entity) == 0 { return -1, false } + + if entity[0] == 'x' || entity[0] == 'X' { + base = 16 + entity = entity[1:] + } + + for len(entity) > 0 { + r := entity[0] + switch r { + case '0'..'9': + val *= base + val += int(r - '0') + + case 'a'..'f': + if base == 10 { return -1, false } + val *= base + val += int(r - 'a' + 10) + + case 'A'..'F': + if base == 10 { return -1, false } + val *= base + val += int(r - 'A' + 10) + + case: + return -1, false + } + + if val > MAX_RUNE_CODEPOINT { return -1, false } + entity = entity[1:] + } + return rune(val), true + + case: + /* + Named entity. + */ + return named_xml_entity_to_rune(entity) + } +} + +/* + Private XML helper to extract `&<stuff>;` entity. +*/ +@(private="file") +_extract_xml_entity :: proc(t: ^Tokenizer) -> (entity: string, err: Error) { + assert(t != nil && t.r == '&') + + /* + All of these would be in the ASCII range. + Even if one is not, it doesn't matter. 
All characters we need to compare to extract are. + */ + using t + + length := len(t.src) + found := false + + #no_bounds_check { + for read_offset < length { + if src[read_offset] == ';' { + found = true + read_offset += 1 + break + } + read_offset += 1 + } + } + + if found { + return string(src[offset + 1 : read_offset - 1]), .None + } + return string(src[offset : read_offset]), .Invalid_Entity_Encoding +} + +/* + Private XML helper for CDATA and comments. +*/ +@(private="file") +_handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: XML_Decode_Options) -> (in_data: bool, err: Error) { + assert(t != nil && t.r == '<') + if t.read_offset + len(CDATA_START) >= len(t.src) { return false, .None } + + if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START { + t.read_offset += len(CDATA_START) - 1 + + if .CDATA_Unbox in options && .CDATA_Decode in options { + /* + We're unboxing _and_ decoding CDATA + */ + return true, .None + } + + /* + CDATA is passed through. + */ + offset := t.offset + + /* + Scan until end of CDATA. + */ + for { + advance(t) or_return + if t.r < 0 { return true, .CDATA_Not_Terminated } + + if t.read_offset + len(CDATA_END) < len(t.src) { + if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END { + t.read_offset += len(CDATA_END) - 1 + + cdata := string(t.src[offset : t.read_offset]) + + if .CDATA_Unbox in options { + cdata = cdata[len(CDATA_START):] + cdata = cdata[:len(cdata) - len(CDATA_END)] + } + + write_string(builder, cdata) + return false, .None + } + } + } + + } else if string(t.src[t.offset:][:len(COMMENT_START)]) == COMMENT_START { + t.read_offset += len(COMMENT_START) + /* + Comment is passed through by default. + */ + offset := t.offset + + /* + Scan until end of Comment. 
+ */ + for { + advance(t) or_return + if t.r < 0 { return true, .Comment_Not_Terminated } + + if t.read_offset + len(COMMENT_END) < len(t.src) { + if string(t.src[t.offset:][:len(COMMENT_END)]) == COMMENT_END { + t.read_offset += len(COMMENT_END) - 1 + + if .Comment_Strip not_in options { + comment := string(t.src[offset : t.read_offset]) + write_string(builder, comment) + } + return false, .None + } + } + } + + } + return false, .None +}
\ No newline at end of file |