diff options
Diffstat (limited to 'code')
| -rw-r--r-- | code/demo.odin | 653 |
1 files changed, 643 insertions, 10 deletions
diff --git a/code/demo.odin b/code/demo.odin index 323754a23..0be0acffa 100644 --- a/code/demo.odin +++ b/code/demo.odin @@ -1,18 +1,651 @@ #import "fmt.odin"; +#import "os.odin"; +#import "strconv.odin"; +#import "utf8.odin"; + +Error :: enum { + NONE, +} + +Style_Type :: enum { + ITALIC, + BOLD, + STRIKE, +} + +Node :: union { + children: [dynamic]Node, + content: []byte, + inline_content: ^Node, + line_number: int, + + // Block Variants + Header{level: int}, + Document{}, + Paragraph{}, + Quote{}, + Code_Block{language: string}, + Horizontal_Rule{}, + + // Inline Variants + Multiple_Inline{}, + String_Inline{}, + Soft_Line_Break{}, + Hard_Line_Break{}, + Code_Span{}, + Style{ + type: Style_Type, + } +} + + +Parser :: struct { + data: []byte, + nodes: [dynamic]Node, +} + +parse :: proc(data: []byte) -> ([]Node, Error) { + p := Parser{ + data = data, + }; + err := parse(^p); + + if err != Error.NONE { + return nil, err; + } + + return p.nodes[..], Error.NONE; +} + +parse :: proc(p: ^Parser) -> Error { + is_blank :: proc(line: []byte) -> bool { + line = trim_whitespace(line); + return len(line) == 0; + } + + is_horizontal_rule :: proc(line: []byte) -> bool { + char: byte; + count := 0; + for c, i in line { + if c != ' ' && c != '\n' { + if c != '-' && c != '_' && c != '*' { + return false; + } + if char == 0 { + if i >= 4 { + return false; + } + char = c; + count = 1; + } else if c == char { + count++; + } else { + return false; + } + } + } + + + return count >= 3; + } + + nodes := make([dynamic]Node); + + line_number: int = 0; + prev_was_blank := false; + in_code_block := false; + code_language := ""; + code_block_start := 0; + + pos := 0; + end := len(p.data); + for pos < len(p.data) { + line_start := pos; + line_end := pos; + for p.data[line_end] != '\n' { + line_end++; + } + line := p.data[pos..line_end]; + pos = line_end+1; + line_number++; + + line = tabs_to_spaces_and_append_newline(line); + str := cast(string)line; + + skip := in_code_block; + + node: Node; + if len(line) > 3 && cast(string)line[..3] == "```" { + if !in_code_block { + code_block_start = line_start+3; + in_code_block = true; + code_language = ""; + rest := trim_whitespace(line[3..]); + if len(rest) > 0 { + code_language = cast(string)rest; + } + } else { + end := line_start-1; + str := p.data[code_block_start..end]; + node = Node.Code_Block{content = str, language = code_language}; + in_code_block = false; + } + skip = true; + } + + indent_char := line[indentation(line)]; + if skip { + + } else if indent_char == '>' { + node = Node.Quote{content = line}; + } else if indent_char == '*' { + // fmt.println("List Item"); + } else if level, content := parse_header(line); level > 0 { + node = Node.Header{content = content, level = level}; + } else if is_horizontal_rule(line) { + node = Node.Horizontal_Rule{}; + } else if !is_blank(line) { + node = Node.Paragraph{content = line}; + } + + if node != nil { + node.line_number = line_number; + append(nodes, node); + } + } + + + for _, i in nodes { + using Node; + match n in nodes[i] { + case Paragraph, Horizontal_Rule, Header, Code_Block: + append(p.nodes, nodes[i]); + case Quote: + // fmt.println("Quote"); + } + } + + for _, i in p.nodes { + process_inlines(^p.nodes[i]); + } + + + return Error.NONE; +} + +process_inlines :: proc(node: ^Node) { + using Node; + match n in node { + case Header: + n.inline_content = parse_inlines(n.content); + case Paragraph: + n.inline_content = parse_inlines(trim_right_space(n.content)); + } + + for _, i in node.children { + process_inlines(^node.children[i]); + } +} + +Inline_Parser :: struct { + data: []byte, + pos: int, + string_start: int, + root: ^Node, +} + +parse_inlines :: proc(data: []byte) -> ^Node { + reset_string :: proc(p: ^Inline_Parser) { + p.string_start = p.pos; + } + finalize_string :: proc(p: ^Inline_Parser) { + if p.string_start >= p.pos { + return; + } + + + str := p.data[p.string_start..p.pos]; + append(p.root.children, Node.String_Inline{content = trim_right_whitespace(str)}); + } + + p := Inline_Parser{ + data = data, + root = new(Node), + }; + p.root^ = Node.Multiple_Inline{}; + + using Node; + + for p.pos < len(p.data) { + node: Node; + match p.data[p.pos] { + default: p.pos++; + + case '\n': + hard_break := false; + new_line_pos := p.pos; + + if p.pos >= 2 && p.data[p.pos-1] == ' ' && p.data[p.pos-2] == ' ' { + hard_break = true; + p.pos -= 2; + } + + if p.pos >= 1 && p.data[p.pos-1] == '\\' { + hard_break = true; + p.pos--; + } + + + for p.pos > 0 && p.data[p.pos-1] == ' ' { + p.pos--; + } + finalize_string(^p); + + if hard_break { + node = Hard_Line_Break{}; + } else { + node = Soft_Line_Break{}; + } + + p.pos = new_line_pos + 1; + + for p.pos < len(p.data) && p.data[p.pos] == ' ' { + p.pos++; + } + reset_string(^p); + + case '`': + // "A backtick string is a string of one or more backtick + // characters (`) that is neither preceded nor followed by a + // backtick." + backtick_count: int; + for p.pos+backtick_count < len(p.data) && p.data[p.pos+backtick_count] == '`' { + backtick_count++; + } + closing := char_string_index(p.data, '`', p.pos+backtick_count, backtick_count); + if closing == -1 { + p.pos += backtick_count; + break; + } + + finalize_string(^p); + p.pos += backtick_count; + + content := p.data[p.pos..closing]; + content = collapse_space(trim_whitespace(content)); + + node = Code_Span{content = content}; + + p.pos = closing + backtick_count; + reset_string(^p); + + case '\\': + // "Backslashes before other characters are treated as literal backslashes." + if p.pos+1 >= len(p.data) || !is_ascii_punc(p.data[p.pos+1]) { + p.pos++; + break; + } + // "Any ASCII punctuation character may be backslash-escaped." + finalize_string(^p); + p.pos++; + node = String_Inline{content = p.data[p.pos..p.pos+1]}; + p.pos++; + reset_string(^p); + + case '&': + // "[A]ll valid HTML entities in any context are recognized as such + // and converted into unicode characters before they are stored in + // the AST." + semicolon := -1; + for c, i in p.data[p.pos+1..] { + if c == ';' { + semicolon = i; + break; + } + } + + if semicolon < 0 { + p.pos++; + break; + } + + semicolon += p.pos+1; + entity := cast(string)p.data[p.pos+1..semicolon]; + + codepoints := make([dynamic]byte, 0, 6); + + if len(entity) > 0 { + if entity[0] != '#' { + append(codepoints, '&'); + append(codepoints, ..cast([]byte)entity); + append(codepoints, ';'); + } else { + if len(entity) > 1 { + base := 10; + if entity[1] == 'x' || entity[1] == 'X' { + // "Hexadecimal entities consist of &# + either X or x + a + // string of 1-8 hexadecimal digits + ;." + base = 16; + } else { + // "Decimal entities consist of &# + a string of 1–8 arabic + // digits + ;. Again, these entities need to be recognised and + // tranformed into their corresponding UTF8 codepoints. Invalid + // Unicode codepoints will be written as the “unknown + // codepoint” character (0xFFFD)." + } + codepoint := strconv.parse_uint(entity[2..], base); + data, len := utf8.encode_rune(cast(rune)codepoint); + append(codepoints, ..data[..len]); + } + } + } + + if len(codepoints) == 0 { + p.pos++; + break; + } + + finalize_string(^p); + node = String_Inline{content = codepoints[..]}; + p.pos = semicolon+1; + reset_string(^p); + } + + if node != nil { + append(p.root.children, node); + } + } + + finalize_string(^p); + + return p.root; +} + +is_ascii_punc :: proc(char: byte) -> bool { + match char { + case '!', '"', '#', '$', '%', + '&', '\'', '(', ')', + '*', '+', ',', '-', + '.', '/', ':', ';', + '<', '=', '>', '?', '@', '[', '\\', ']', + '^', '_', '`', '{', '|', '}', '~': + return true; + } + return false; +} + +char_string_index :: proc(data: []byte, char: byte, start, length: int) -> int { + count: int; + for i in start..len(data) { + if data[i] == char { + count++; + if count == length { + if i+1 >= len(data) || data[i+1] != char { + return i+1 - count; + } + } + } else { + count = 0; + } + } + return -1; +} + +collapse_space :: proc(data: []byte) -> []byte { + out := make([]byte, 0, len(data)); + prev_was_space := false; + for c in data { + if c == ' ' || c == '\n' { + if !prev_was_space { + append(out, ' '); + prev_was_space = true; + } + } else { + append(out, c); + prev_was_space = false; + } + } + + return out; +} + + +parse_header :: proc(line: []byte) -> (int, []byte) { + // "The opening # character may be indented 0-3 spaces." + indent := indentation(line); + if indent > 3 { + return -1, nil; + } + line = line[indent..]; + + // "The header level is equal to the number of # characters in the opening sequence." + level := 0; + for c, i in line { + if c != '#' { + level = i; + break; + } + } + + if level < 1 || level > 6 { + return -1, nil; + } + line = line[level..]; + // "The opening sequence of # characters cannot be followed directly by a + // nonspace character." + if line[0] != ' ' && line[0] != '\n' { + return -1, nil; + } + // "The optional closing sequence of #s [...] may be followed by spaces + // only." + + trailer_start := len(line) - 1; + for trailer_start > 0 && line[trailer_start-1] == ' ' { + trailer_start--; + } + for trailer_start > 0 && line[trailer_start-1] == '#' { + trailer_start--; + } + // "The optional closing sequence of #s must be preceded by a space [...]." + // Note that (if the header is empty) this may be the same space as after + // the opening sequence. + if trailer_start > 0 && line[trailer_start-1] == ' ' { + line = line[..trailer_start]; + } + + // "The raw contents of the header are stripped of leading and trailing + // spaces before being parsed as inline content." + line = trim_space(line); + return level, line; + +} + +indentation :: proc(line: []byte) -> int { + for c, i in line { + if c != ' ' { + return i; + } + } + panic("indentation() expects line to end in newline character"); + return 0; +} + + +TAB_STOP :: 4; + +tabs_to_spaces_and_append_newline :: proc(line: []byte) -> []byte { + tab_count: int; + for c in line { + if c == '\t' { + tab_count++; + } + } + + out := make([]byte, 0, len(line) + tab_count*(TAB_STOP-1) + 1); + + rune_count: int; + for r in cast(string)line { + if r == '\t' { + spaces_count := TAB_STOP - rune_count%TAB_STOP; + for i in 0..spaces_count { + append(out, ' '); + } + rune_count += spaces_count; + } else { + match r { + case '\r', '\v', '\f': + append(out, ' '); + default: + c, l := utf8.encode_rune(r); + append(out, ..c[0..l]); + } + rune_count++; + } + } + append(out, '\n'); + return out; +} + +trim_right_whitespace :: proc(data: []byte) -> []byte { + c := data; + for i := len(c)-1; i >= 0; i-- { + match c[i] { + case ' ', '\t', '\v', '\f', '\r', '\n': + c = c[..i]; + continue; + } + break; + } + + return c; +} + + +trim_right_space :: proc(data: []byte) -> []byte { + c := data; + for i := len(c)-1; i >= 0; i-- { + if c[i] != ' ' { + break; + } + c = c[..i]; + } + + return c; +} + +trim_whitespace :: proc(data: []byte) -> []byte { + data = trim_right_whitespace(data); + index := 0; + for c in data { + match c { + case ' ', '\t', '\v', '\f', '\r': + index++; + continue; + } + break; + } + return data[index..]; +} + +trim_space :: proc(data: []byte) -> []byte { + index := 0; + for c in data { + if c != ' ' { + break; + } + index++; + } + data = data[index..]; + + for i := len(data)-1; i >= 0; i-- { + if data[i] != ' ' { + break; + } + data = data[..i]; + } + + return data; +} + +escape_map := map[byte]string{ + '"' = """, + '&' = "&", + '<' = "<", + '>' = ">", +}; + main :: proc() { - immutable program := "+ + * - /"; - accumulator := 0; + data, ok := os.read_entire_file("W:/Odin/misc/markdown_test.md"); + if !ok { + fmt.println("Failure to load file"); + return; + } + + nodes, err := parse(data); + if err != Error.NONE { + fmt.println("Failure to parse file"); + return; + } + + write_espaced :: proc(data: []byte) { + start: int; + for c, i in data { + if escaped, ok := escape_map[c]; ok { + fmt.print(cast(string)data[start..i]); + fmt.print(escaped); + start = i+1; + } + } + fmt.print(cast(string)data[start..]); + } - for token in program { - match token { - case '+': accumulator += 1; - case '-': accumulator -= 1; - case '*': accumulator *= 2; - case '/': accumulator /= 2; - default: // Ignore everything else + print_inline_as_html :: proc(node: ^Node) { + using Node; + match n in node { + case Multiple_Inline: + for _, i in n.children { + print_inline_as_html(^n.children[i]); + } + case String_Inline: + write_espaced(n.content); + case Soft_Line_Break: + // fmt.println(); + case Hard_Line_Break: + fmt.println("<br>"); + case Code_Span: + fmt.print("<code>"); + write_espaced(n.content); + fmt.print("</code>"); } } - fmt.printf("The program \"%s\" calculates the value %d\n", program, accumulator); + print_node_as_html :: proc(node: ^Node) { + using Node; + match n in node { + case Header: + fmt.printf("<h%d>", n.level); + print_inline_as_html(n.inline_content); + fmt.printf("</h%d>\n", n.level); + case Paragraph: + fmt.print("<p>"); + print_inline_as_html(n.inline_content); + fmt.println("</p>"); + case Horizontal_Rule: + fmt.println("<hr>"); + case Code_Block: + if n.language != "" { + fmt.printf("<pre><code class=\"language-%s\">", n.language); + } else { + fmt.print("<pre><code>"); + } + fmt.print(cast(string)n.content); + fmt.println("</code></pre>"); + case Quote: + } + } + + for _, i in nodes { + print_node_as_html(^nodes[i]); + } } |