1 files changed, 643 insertions, 10 deletions
diff --git a/code/demo.odin b/code/demo.odin
index 323754a23..0be0acffa 100644
--- a/code/demo.odin
+++ b/code/demo.odin
@@ -1,18 +1,651 @@
 #import "fmt.odin";
+#import "os.odin";
+#import "strconv.odin";
+#import "utf8.odin";
+
+Error :: enum { // Result status returned by the parse procedures.
+	NONE, // Success; the only value defined so far.
+}
+
+Style_Type :: enum { // Kind of text styling stored in the Style variant of Node.
+	ITALIC,
+	BOLD,
+	STRIKE,
+}
+
+Node :: union { // One element of the parsed document; the leading fields are shared by all variants.
+	children:       [dynamic]Node, // Child nodes, e.g. the inlines under a Multiple_Inline root.
+	content:        []byte,        // Raw bytes the node was built from (line, code or text).
+	inline_content: ^Node,         // Inline tree; process_inlines fills it for Header and Paragraph.
+	line_number:    int,           // Line counter value (starting at 1) when parse appended the block.
+
+	// Block Variants
+	Header{level: int},            // level: number of '#' in the opening sequence (1-6).
+	Document{},
+	Paragraph{},
+	Quote{},                       // Created by parse but not emitted into Parser.nodes yet.
+	Code_Block{language: string},  // language: text after the opening ``` fence, "" if absent.
+	Horizontal_Rule{},
+
+	// Inline Variants
+	Multiple_Inline{},             // Root container returned by parse_inlines.
+	String_Inline{},               // Literal text held in content.
+	Soft_Line_Break{},
+	Hard_Line_Break{},
+	Code_Span{},                   // Backtick-delimited code held in content.
+	Style{
+		type: Style_Type,
+	}
+}
+
+
+Parser :: struct { // State for one block-level parse of a document.
+	data:  []byte,         // Complete input text to parse.
+	nodes: [dynamic]Node,  // Resulting block nodes, appended by parse.
+}
+
+// parse converts raw markdown bytes into a slice of top-level block nodes.
+parse :: proc(data: []byte) -> ([]Node, Error) {
+	parser := Parser{data = data};
+
+	// The ^Parser overload does the real work and fills parser.nodes.
+	status := parse(^parser);
+	if status == Error.NONE {
+		return parser.nodes[..], Error.NONE;
+	}
+	// On failure no nodes are handed back, only the status.
+	return nil, status;
+}
+
+// parse splits p.data into lines, classifies each line as a block and appends
+// the supported blocks, with inlines parsed, to p.nodes. Always returns NONE.
+parse :: proc(p: ^Parser) -> Error {
+	is_blank :: proc(line: []byte) -> bool {
+		line = trim_whitespace(line);
+		return len(line) == 0;
+	}
+
+	// A rule is 3+ of the same '-', '_' or '*', starting within 3 columns.
+	is_horizontal_rule :: proc(line: []byte) -> bool {
+		char: byte;
+		count := 0;
+		for c, i in line {
+			if c != ' ' && c != '\n' {
+				if c != '-' && c != '_' && c != '*' {
+					return false;
+				}
+				if char == 0 {
+					if i >= 4 {
+						return false;
+					}
+					char = c;
+					count = 1;
+				} else if c == char {
+					count++;
+				} else {
+					return false;
+				}
+			}
+		}
+		return count >= 3;
+	}
+
+	nodes := make([dynamic]Node);
+
+	line_number: int = 0;
+	in_code_block := false;
+	code_language := "";
+	code_block_start := 0;
+
+	pos := 0;
+	for pos < len(p.data) {
+		line_start := pos;
+		line_end := pos;
+		// The bounds check covers a final line without a terminating newline.
+		for line_end < len(p.data) && p.data[line_end] != '\n' {
+			line_end++;
+		}
+		line := p.data[pos..line_end];
+		pos = line_end+1;
+		line_number++;
+
+		// From here on line is a normalized copy that always ends in '\n'.
+		line = tabs_to_spaces_and_append_newline(line);
+		skip := in_code_block; // Lines inside a code block are not classified.
+
+		node: Node;
+		if len(line) > 3 && cast(string)line[..3] == "```" {
+			if !in_code_block {
+				// Content begins on the next line, after the info string.
+				code_block_start = pos;
+				in_code_block = true;
+				code_language = "";
+				rest := trim_whitespace(line[3..]);
+				if len(rest) > 0 {
+					code_language = cast(string)rest;
+				}
+			} else {
+				// Exclude the newline before the closing fence; clamp for empty blocks.
+				content_end := line_start-1;
+				if content_end < code_block_start { content_end = code_block_start; }
+				code := p.data[code_block_start..content_end];
+				node = Node.Code_Block{content = code, language = code_language};
+				in_code_block = false;
+			}
+			skip = true;
+		}
+
+		indent_char := line[indentation(line)];
+		if skip {
+
+		} else if indent_char == '>' {
+			node = Node.Quote{content = line};
+		} else if is_horizontal_rule(line) {
+			// Tested before list items so '*' rules are not taken for a list.
+			node = Node.Horizontal_Rule{};
+		} else if indent_char == '*' {
+			// List items are recognized but produce no node yet.
+		} else if level, content := parse_header(line); level > 0 {
+			node = Node.Header{content = content, level = level};
+		} else if !is_blank(line) {
+			node = Node.Paragraph{content = line};
+		}
+
+		if node != nil {
+			node.line_number = line_number;
+			append(nodes, node);
+		}
+	}
+
+	// Quote nodes are collected above but not forwarded to the result yet.
+	for _, i in nodes {
+		using Node;
+		match n in nodes[i] {
+		case Paragraph, Horizontal_Rule, Header, Code_Block:
+			append(p.nodes, nodes[i]);
+		case Quote:
+		}
+	}
+
+	for _, i in p.nodes {
+		process_inlines(^p.nodes[i]);
+	}
+	return Error.NONE;
+}
+
+// process_inlines parses the raw content of Header and Paragraph nodes into
+// an inline tree, then recurses into every child of the node.
+process_inlines :: proc(node: ^Node) {
+	using Node;
+	match block in node {
+	case Paragraph:
+		trimmed := trim_right_space(block.content);
+		block.inline_content = parse_inlines(trimmed);
+	case Header:
+		block.inline_content = parse_inlines(block.content);
+	}
+	for i := 0; i < len(node.children); i++ { process_inlines(^node.children[i]); }
+}
+
+Inline_Parser :: struct { // State used by parse_inlines while scanning one block's content.
+	data:         []byte, // Content being scanned.
+	pos:          int,    // Index of the next byte to examine.
+	string_start: int,    // Start of the plain text not yet emitted as a String_Inline.
+	root:         ^Node,  // Multiple_Inline node that collects the produced inline nodes.
+}
+
+// parse_inlines scans data for line breaks, code spans, backslash escapes and
+// entities and returns a Multiple_Inline node whose children hold the result.
+parse_inlines :: proc(data: []byte) -> ^Node {
+	reset_string :: proc(p: ^Inline_Parser) {
+		p.string_start = p.pos;
+	}
+	finalize_string :: proc(p: ^Inline_Parser) {
+		if p.string_start >= p.pos {
+			return;
+		}
+		// Emit the pending plain text, minus trailing whitespace, as a String_Inline.
+		str := p.data[p.string_start..p.pos];
+		append(p.root.children, Node.String_Inline{content = trim_right_whitespace(str)});
+	}
+
+	p := Inline_Parser{
+		data = data,
+		root = new(Node),
+	};
+	p.root^ = Node.Multiple_Inline{};
+
+	using Node;
+
+	for p.pos < len(p.data) {
+		node: Node;
+		match p.data[p.pos] {
+		default: p.pos++;
+
+		case '\n':
+			hard_break := false;
+			new_line_pos := p.pos;
+
+			if p.pos >= 2 && p.data[p.pos-1] == ' ' && p.data[p.pos-2] == ' ' {
+				hard_break = true;
+				p.pos -= 2;
+			}
+
+			if p.pos >= 1 && p.data[p.pos-1] == '\\' {
+				hard_break = true;
+				p.pos--;
+			}
+			// Spaces directly before the break do not belong to the preceding text.
+			for p.pos > 0 && p.data[p.pos-1] == ' ' {
+				p.pos--;
+			}
+			finalize_string(^p);
+
+			if hard_break {
+				node = Hard_Line_Break{};
+			} else {
+				node = Soft_Line_Break{};
+			}
+
+			p.pos = new_line_pos + 1;
+			// Leading spaces of the following line are skipped as well.
+			for p.pos < len(p.data) && p.data[p.pos] == ' ' {
+				p.pos++;
+			}
+			reset_string(^p);
+
+		case '`':
+			// "A backtick string is a string of one or more backtick
+			// characters (`) that is neither preceded nor followed by a
+			// backtick."
+			backtick_count: int;
+			for p.pos+backtick_count < len(p.data) && p.data[p.pos+backtick_count] == '`' {
+				backtick_count++;
+			}
+			closing := char_string_index(p.data, '`', p.pos+backtick_count, backtick_count);
+			if closing == -1 { // No closing run of equal length: keep the backticks as text.
+				p.pos += backtick_count;
+				break;
+			}
+
+			finalize_string(^p);
+			p.pos += backtick_count;
+
+			content := p.data[p.pos..closing];
+			content = collapse_space(trim_whitespace(content));
+
+			node = Code_Span{content = content};
+
+			p.pos = closing + backtick_count;
+			reset_string(^p);
+
+		case '\\':
+			// "Backslashes before other characters are treated as literal backslashes."
+			if p.pos+1 >= len(p.data) || !is_ascii_punc(p.data[p.pos+1]) {
+				p.pos++;
+				break;
+			}
+			// "Any ASCII punctuation character may be backslash-escaped."
+			finalize_string(^p);
+			p.pos++;
+			node = String_Inline{content = p.data[p.pos..p.pos+1]};
+			p.pos++;
+			reset_string(^p);
+
+		case '&':
+			// "[A]ll valid HTML entities in any context are recognized as such
+			// and converted into unicode characters before they are stored in
+			// the AST."
+			semicolon := -1;
+			for c, i in p.data[p.pos+1..] {
+				if c == ';' {
+					semicolon = i;
+					break;
+				}
+			}
+
+			if semicolon < 0 {
+				p.pos++;
+				break;
+			}
+
+			semicolon += p.pos+1;
+			entity := cast(string)p.data[p.pos+1..semicolon];
+
+			codepoints := make([dynamic]byte, 0, 6);
+			// Named entities are not decoded here; they are copied through verbatim.
+			if len(entity) > 0 {
+				if entity[0] != '#' {
+					append(codepoints, '&');
+					append(codepoints, ..cast([]byte)entity);
+					append(codepoints, ';');
+				} else if len(entity) > 1 {
+					// "Decimal entities consist of &# + a string of 1–8 arabic
+					// digits + ;. Again, these entities need to be recognised and
+					// transformed into their corresponding UTF8 codepoints. Invalid
+					// Unicode codepoints will be written as the “unknown
+					// codepoint” character (0xFFFD)."
+					base := 10;
+					digits := entity[1..];
+					if entity[1] == 'x' || entity[1] == 'X' {
+						// "Hexadecimal entities consist of &# + either X or x + a
+						// string of 1-8 hexadecimal digits + ;."
+						base = 16;
+						// Only the hexadecimal form has a marker to skip after '#'.
+						digits = entity[2..];
+					}
+					codepoint := strconv.parse_uint(digits, base);
+					encoded, width := utf8.encode_rune(cast(rune)codepoint);
+					append(codepoints, ..encoded[..width]);
+				}
+			}
+
+			if len(codepoints) == 0 { // "&;" or "&#;": not an entity, keep the '&' as text.
+				p.pos++;
+				break;
+			}
+
+			finalize_string(^p);
+			node = String_Inline{content = codepoints[..]};
+			p.pos = semicolon+1;
+			reset_string(^p);
+		}
+
+		if node != nil {
+			append(p.root.children, node);
+		}
+	}
+
+	finalize_string(^p);
+
+	return p.root;
+}
+
+// is_ascii_punc reports whether char is one of the 32 ASCII punctuation
+// characters, i.e. a printable ASCII byte that is not alphanumeric or space.
+is_ascii_punc :: proc(char: byte) -> bool {
+	// The punctuation characters form four contiguous ranges in ASCII.
+	if char >= '!' && char <= '/' {
+		return true;
+	}
+	if char >= ':' && char <= '@' {
+		return true;
+	}
+	return (char >= '[' && char <= '`') || (char >= '{' && char <= '~');
+}
+
+// char_string_index searches data from start for a run of exactly length
+// consecutive char bytes that is not followed by another char.
+// It returns the index where that run begins, or -1 when there is none.
+char_string_index :: proc(data: []byte, char: byte, start, length: int) -> int {
+	run := 0;
+	for i := start; i < len(data); i++ {
+		if data[i] != char {
+			run = 0;
+			continue;
+		}
+		run++;
+		at_run_end := i+1 >= len(data) || data[i+1] != char;
+		if run == length && at_run_end { return i+1 - run; }
+	}
+	return -1;
+}
+
+// collapse_space returns a copy of data in which every run of spaces and
+// newlines is replaced by one space.
+collapse_space :: proc(data: []byte) -> []byte {
+	result := make([]byte, 0, len(data));
+	in_run := false;
+	for b in data {
+		is_gap := b == ' ' || b == '\n';
+		if !is_gap {
+			append(result, b);
+		} else if !in_run {
+			// Only the first byte of a run emits the replacement space.
+			append(result, ' ');
+		}
+		in_run = is_gap;
+	}
+	return result;
+}
+
+
+// parse_header recognizes an ATX header in a newline-terminated line. It
+// returns the level (1-6) and the header text with the closing sequence
+// and surrounding spaces removed, or -1 and nil if line is not a header.
+parse_header :: proc(line: []byte) -> (int, []byte) {
+	// "The opening # character may be indented 0-3 spaces."
+	indent := indentation(line);
+	if indent > 3 {
+		return -1, nil;
+	}
+	line = line[indent..];
+
+	// "The header level is equal to the number of # characters in the opening sequence."
+	level := 0;
+	for c, i in line {
+		if c != '#' {
+			level = i;
+			break;
+		}
+	}
+	if level < 1 || level > 6 {
+		return -1, nil;
+	}
+	line = line[level..];
+	// "The opening sequence of # characters cannot be followed directly by a
+	// nonspace character."
+	if line[0] != ' ' && line[0] != '\n' {
+		return -1, nil;
+	}
+	// Drop the newline so trailing spaces are trimmed, not read as a hard break.
+	if line[len(line)-1] == '\n' { line = line[..len(line)-1]; }
+	// "The optional closing sequence of #s [...] may be followed by spaces
+	// only."
+	trailer_start := len(line);
+	for trailer_start > 0 && line[trailer_start-1] == ' ' {
+		trailer_start--;
+	}
+	for trailer_start > 0 && line[trailer_start-1] == '#' {
+		trailer_start--;
+	}
+	// "The optional closing sequence of #s must be preceded by a space [...]."
+	// Note that (if the header is empty) this may be the same space as after
+	// the opening sequence.
+	if trailer_start > 0 && line[trailer_start-1] == ' ' {
+		line = line[..trailer_start];
+	}
+	// "The raw contents of the header are stripped of leading and trailing
+	// spaces before being parsed as inline content."
+	return level, trim_space(line);
+}
+
+// indentation returns the index of the first byte of line that is not a
+// space. The line must contain such a byte (normally its newline).
+indentation :: proc(line: []byte) -> int {
+	for i := 0; i < len(line); i++ {
+		if line[i] != ' ' { return i; }
+	}
+	panic("indentation() expects line to end in newline character");
+	return 0;
+}
+
+
+TAB_STOP :: 4; // Tab stops are every TAB_STOP columns when tabs are expanded to spaces.
+
+// tabs_to_spaces_and_append_newline returns a copy of line in which each tab
+// is expanded to spaces up to the next TAB_STOP column, the characters \r, \v
+// and \f become single spaces, and a '\n' is appended at the end.
+tabs_to_spaces_and_append_newline :: proc(line: []byte) -> []byte {
+	tabs := 0;
+	for b in line {
+		if b == '\t' { tabs++; }
+	}
+
+	// A tab grows to at most TAB_STOP bytes; the +1 is for the newline.
+	capacity := len(line) + tabs*(TAB_STOP-1) + 1;
+	result := make([]byte, 0, capacity);
+
+	column := 0; // counted in runes, not bytes
+	for r in cast(string)line {
+		match r {
+		case '\t':
+			pad := TAB_STOP - column%TAB_STOP;
+			for i := 0; i < pad; i++ { append(result, ' '); }
+			column += pad;
+			continue;
+		case '\r', '\v', '\f':
+			append(result, ' ');
+		default:
+			encoded, width := utf8.encode_rune(r);
+			append(result, ..encoded[0..width]);
+		}
+		column++;
+	}
+	append(result, '\n');
+	return result;
+}
+
+// trim_right_whitespace returns data without trailing ' ', \t, \v, \f, \r and \n bytes.
+trim_right_whitespace :: proc(data: []byte) -> []byte {
+	end := len(data);
+	for end > 0 {
+		match data[end-1] {
+		case ' ', '\t', '\v', '\f', '\r', '\n':
+			end--;
+			continue;
+		}
+		break;
+	}
+	return data[..end];
+}
+
+
+// trim_right_space returns data with all trailing ' ' bytes removed; other
+// whitespace such as '\n' is left in place.
+trim_right_space :: proc(data: []byte) -> []byte {
+	// Walk back from the end until a non-space byte (or the start) is hit.
+	end := len(data);
+	for end > 0 && data[end-1] == ' ' {
+		end--;
+	}
+
+	return data[..end];
+}
+
+// trim_whitespace strips trailing whitespace via trim_right_whitespace, then
+// drops leading space, tab, vertical tab, form feed and carriage return
+// bytes. A leading '\n' is not removed.
+trim_whitespace :: proc(data: []byte) -> []byte {
+	rest := trim_right_whitespace(data);
+	start := 0;
+	for start < len(rest) {
+		b := rest[start];
+		if b != ' ' && b != '\t' && b != '\v' && b != '\f' && b != '\r' { break; }
+		start++;
+	}
+	return rest[start..];
+}
+
+// trim_space returns data with leading and trailing ' ' bytes removed.
+// Only the space character is stripped; tabs, newlines and other whitespace
+// are left untouched.
+trim_space :: proc(data: []byte) -> []byte {
+	// Find the first byte that is not a space.
+	start := 0;
+	for start < len(data) && data[start] == ' ' {
+		start++;
+	}
+
+	// Find the end of the last non-space byte. Stopping at start keeps the
+	// bounds ordered when data consists of spaces only.
+	end := len(data);
+	for end > start && data[end-1] == ' ' {
+		end--;
+	}
+
+	return data[start..end];
+}
+
+escape_map := map[byte]string{ // HTML replacement text for bytes that write_espaced must escape.
+	'"' = "&quot;",
+	'&' = "&amp;",
+	'<' = "&lt;",
+	'>' = "&gt;",
+};
+
 
 main :: proc() {
-	immutable program := "+ + * - /";
-	accumulator := 0;
+	data, ok := os.read_entire_file("W:/Odin/misc/markdown_test.md"); // Hard-coded test document.
+	if !ok {
+		fmt.println("Failure to load file");
+		return;
+	}
+	// Parse the whole document first, then print every top-level block as HTML.
+	nodes, err := parse(data);
+	if err != Error.NONE {
+		fmt.println("Failure to parse file");
+		return;
+	}
+	// write_espaced prints data, replacing every byte that has an escape_map entry.
+	write_espaced :: proc(data: []byte) {
+		start: int;
+		for c, i in data {
+			if escaped, ok := escape_map[c]; ok {
+				fmt.print(cast(string)data[start..i]);
+				fmt.print(escaped);
+				start = i+1;
+			}
+		}
+		fmt.print(cast(string)data[start..]);
+	}
 
-	for token in program {
-		match token {
-		case '+': accumulator += 1;
-		case '-': accumulator -= 1;
-		case '*': accumulator *= 2;
-		case '/': accumulator /= 2;
-		default: // Ignore everything else
+	print_inline_as_html :: proc(node: ^Node) {
+		using Node;
+		match n in node {
+		case Multiple_Inline:
+			for _, i in n.children {
+				print_inline_as_html(^n.children[i]);
+			}
+		case String_Inline:
+			write_espaced(n.content);
+		case Soft_Line_Break:
+			// Soft breaks are not written to the output.
+		case Hard_Line_Break:
+			fmt.println("<br>");
+		case Code_Span:
+			fmt.print("<code>");
+			write_espaced(n.content);
+			fmt.print("</code>");
 		}
 	}
 
-	fmt.printf("The program \"%s\" calculates the value %d\n", program, accumulator);
+	print_node_as_html :: proc(node: ^Node) {
+		using Node;
+		match n in node {
+		case Header:
+			fmt.printf("<h%d>", n.level);
+			print_inline_as_html(n.inline_content);
+			fmt.printf("</h%d>\n", n.level);
+		case Paragraph:
+			fmt.print("<p>");
+			print_inline_as_html(n.inline_content);
+			fmt.println("</p>");
+		case Horizontal_Rule:
+			fmt.println("<hr>");
+		case Code_Block: // Language and code come from the document, so both are escaped.
+			if n.language != "" {
+				fmt.print("<pre><code class=\"language-");
+				write_espaced(cast([]byte)n.language);
+				fmt.print("\">");
+			} else {
+				fmt.print("<pre><code>");
+			}
+			write_espaced(n.content);
+			fmt.println("</code></pre>");
+		}
+	}
+	for _, i in nodes {
+		print_node_as_html(^nodes[i]);
+	}
 }