package c_frontend_tokenizer

import "core:fmt"
import "core:os"
import "core:strings"
import "core:unicode/utf8"

Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any)

Tokenizer :: struct {
	// Immutable data
	path: string,
	src:  []byte,

	// Tokenizing state
	ch:          rune,
	offset:      int,
	read_offset: int,
	line_offset: int,
	line_count:  int,

	// Extra information for tokens
	at_bol:    bool,
	has_space: bool,

	// Mutable data
	err:           Error_Handler,
	warn:          Error_Handler,
	error_count:   int,
	warning_count: int,
}

init_defaults :: proc(t: ^Tokenizer, err: Error_Handler = default_error_handler, warn: Error_Handler = default_warn_handler) {
	t.err = err
	t.warn = warn
}

@(private)
offset_to_pos :: proc(t: ^Tokenizer, offset: int) -> (pos: Pos) {
	pos.file = t.path
	pos.offset = offset
	pos.line = t.line_count
	pos.column = offset - t.line_offset + 1
	return
}

default_error_handler :: proc(pos: Pos, msg: string, args: ..any) {
	fmt.eprintf("%s(%d:%d) ", pos.file, pos.line, pos.column)
	fmt.eprintf(msg, ..args)
	fmt.eprintf("\n")
}

default_warn_handler :: proc(pos: Pos, msg: string, args: ..any) {
	fmt.eprintf("%s(%d:%d) warning: ", pos.file, pos.line, pos.column)
	fmt.eprintf(msg, ..args)
	fmt.eprintf("\n")
}

error_offset :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
	pos := offset_to_pos(t, offset)
	if t.err != nil {
		t.err(pos, msg, ..args)
	}
	t.error_count += 1
}

warn_offset :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
	pos := offset_to_pos(t, offset)
	if t.warn != nil {
		t.warn(pos, msg, ..args)
	}
	t.warning_count += 1
}

error :: proc(t: ^Tokenizer, tok: ^Token, msg: string, args: ..any) {
	pos := tok.pos
	if t.err != nil {
		t.err(pos, msg, ..args)
	}
	t.error_count += 1
}

warn :: proc(t: ^Tokenizer, tok: ^Token, msg: string, args: ..any) {
	pos := tok.pos
	if t.warn != nil {
		t.warn(pos, msg, ..args)
	}
	t.warning_count += 1
}

advance_rune :: proc(t: ^Tokenizer) {
	if t.read_offset < len(t.src) {
		t.offset = t.read_offset
		if t.ch == '\n' {
			t.at_bol = true
			t.line_offset = t.offset
			t.line_count += 1
		}
		r, w := rune(t.src[t.read_offset]), 1
		switch {
		case r == 0:
			error_offset(t, t.offset, "illegal character NUL")
		case r >= utf8.RUNE_SELF:
			r, w = utf8.decode_rune(t.src[t.read_offset:])
			if r == utf8.RUNE_ERROR && w == 1 {
				error_offset(t, t.offset, "illegal UTF-8 encoding")
			} else if r == utf8.RUNE_BOM && t.offset > 0 {
				error_offset(t, t.offset, "illegal byte order mark")
			}
		}
		t.read_offset += w
		t.ch = r
	} else {
		t.offset = len(t.src)
		if t.ch == '\n' {
			t.at_bol = true
			t.line_offset = t.offset
			t.line_count += 1
		}
		t.ch = -1 // EOF sentinel
	}
}

advance_rune_n :: proc(t: ^Tokenizer, n: int) {
	for _ in 0..<n {
		advance_rune(t)
	}
}
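// NOTE: is_ident0 and is_ident1 are referenced by scan_identifier and scan
// below but are not defined in this listing. The definitions here are a
// minimal sketch, assuming C-style identifiers: letters, '_', and '$' may
// begin an identifier, and digits may appear after the first character.
is_ident0 :: proc(r: rune) -> bool {
	switch r {
	case 'a'..='z', 'A'..='Z', '_', '$':
		return true
	}
	// Assumption: permit non-ASCII runes, as compilers with extended
	// identifier support do.
	return r >= utf8.RUNE_SELF
}

is_ident1 :: proc(r: rune) -> bool {
	return is_ident0(r) || is_digit(r)
}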
is_digit :: proc(r: rune) -> bool {
	return '0' <= r && r <= '9'
}

skip_whitespace :: proc(t: ^Tokenizer) {
	for {
		switch t.ch {
		case ' ', '\t', '\r', '\v', '\f', '\n':
			t.has_space = true
			advance_rune(t)
		case:
			return
		}
	}
}

scan_comment :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1
	next := -1

	general: {
		if t.ch == '/' { // line comments
			advance_rune(t)
			for t.ch != '\n' && t.ch >= 0 {
				advance_rune(t)
			}

			next = t.offset
			if t.ch == '\n' {
				next += 1
			}
			break general
		}

		/* style comment */
		advance_rune(t)
		for t.ch >= 0 {
			ch := t.ch
			advance_rune(t)
			if ch == '*' && t.ch == '/' {
				advance_rune(t)
				next = t.offset
				break general
			}
		}

		error_offset(t, offset, "comment not terminated")
	}

	lit := t.src[offset : t.offset]

	// NOTE(bill): Strip CR for line comments
	for len(lit) > 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
		lit = lit[:len(lit)-1]
	}

	return string(lit)
}

scan_identifier :: proc(t: ^Tokenizer) -> string {
	offset := t.offset
	for is_ident1(t.ch) {
		advance_rune(t)
	}
	return string(t.src[offset : t.offset])
}

scan_string :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1 // the opening '"' has already been consumed

	for {
		ch := t.ch
		if ch == '\n' || ch < 0 {
			error_offset(t, offset, "string literal was not terminated")
			break
		}
		advance_rune(t)
		if ch == '"' {
			break
		}
		if ch == '\\' {
			scan_escape(t)
		}
	}

	return string(t.src[offset : t.offset])
}

digit_val :: proc(r: rune) -> int {
	switch r {
	case '0'..='9':
		return int(r-'0')
	case 'A'..='F':
		return int(r-'A' + 10)
	case 'a'..='f':
		return int(r-'a' + 10)
	}
	return 16 // out of range for any base, so `digit_val(r) < base` fails for non-digits
}

scan_escape :: proc(t: ^Tokenizer) -> bool {
	offset := t.offset

	esc := t.ch
	n: int
	base, max: u32
	switch esc {
	case 'a', 'b', 'e', 'f', 'n', 't', 'v', 'r', '\\', '\'', '"':
		advance_rune(t)
		return true
	case '0'..='7': // octal escape
		for digit_val(t.ch) < 8 {
			advance_rune(t)
		}
		return true
	case 'x': // hexadecimal escape
		advance_rune(t)
		for digit_val(t.ch) < 16 {
			advance_rune(t)
		}
		return true
	case 'u':
		advance_rune(t)
		n, base, max = 4, 16, utf8.MAX_RUNE
	case 'U':
		advance_rune(t)
		n, base, max = 8, 16, utf8.MAX_RUNE
	case:
		if t.ch < 0 {
			error_offset(t, offset, "escape sequence was not terminated")
		} else {
			break
		}
		return false
	}

	x: u32
	main_loop: for n > 0 {
		d := u32(digit_val(t.ch))
		if d >= base {
			if t.ch == '"' || t.ch == '\'' {
				break main_loop
			}
			if t.ch < 0 {
				error_offset(t, t.offset, "escape sequence was not terminated")
			} else {
				error_offset(t, t.offset, "illegal character '%r' : %d in escape sequence", t.ch, t.ch)
			}
			return false
		}

		x = x*base + d
		advance_rune(t)
		n -= 1
	}

	if x > max || 0xd800 <= x && x < 0xe000 { // reject surrogate halves U+D800..U+DFFF
		error_offset(t, offset, "escape sequence is an invalid Unicode code point")
		return false
	}

	return true
}

scan_rune :: proc(t: ^Tokenizer) -> string {
	offset := t.offset-1 // the opening '\'' has already been consumed
	valid := true
	n := 0
	for {
		ch := t.ch
		if ch == '\n' || ch < 0 {
			if valid {
				error_offset(t, offset, "rune literal not terminated")
				valid = false
			}
			break
		}
		advance_rune(t)
		if ch == '\'' {
			break
		}
		n += 1
		if ch == '\\' {
			if !scan_escape(t) {
				valid = false
			}
		}
	}

	if valid && n != 1 {
		error_offset(t, offset, "illegal rune literal")
	}

	return string(t.src[offset : t.offset])
}
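// scan_number accepts C-style numeric literals; for example:
//
//	123       decimal integer
//	0b1010    binary integer ("0b" followed by no binary digits is an error)
//	0x1fp3    hexadecimal literal with a 'p' exponent
//	1.5e-3    decimal float with an exponent
//	.5        leading-dot float (entered via scan with seen_decimal_point = true)
//
// A '.' immediately followed by a second '.' makes the fraction scanner exit
// early, leaving ".." and "..." to be tokenized as punctuation.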
scan_number :: proc(t: ^Tokenizer, seen_decimal_point: bool) -> (Token_Kind, string) {
	scan_mantissa :: proc(t: ^Tokenizer, base: int) {
		for digit_val(t.ch) < base {
			advance_rune(t)
		}
	}
	scan_exponent :: proc(t: ^Tokenizer) {
		if t.ch == 'e' || t.ch == 'E' || t.ch == 'p' || t.ch == 'P' {
			advance_rune(t)
			if t.ch == '-' || t.ch == '+' {
				advance_rune(t)
			}
			if digit_val(t.ch) < 10 {
				scan_mantissa(t, 10)
			} else {
				error_offset(t, t.offset, "illegal floating-point exponent")
			}
		}
	}
	scan_fraction :: proc(t: ^Tokenizer) -> (early_exit: bool) {
		if t.ch == '.' && peek(t) == '.' {
			return true
		}
		if t.ch == '.' {
			advance_rune(t)
			scan_mantissa(t, 10)
		}
		return false
	}

	check_end := true
	offset := t.offset
	seen_point := seen_decimal_point

	if seen_point {
		offset -= 1
		scan_mantissa(t, 10)
		scan_exponent(t)
	} else {
		if t.ch == '0' {
			int_base :: proc(t: ^Tokenizer, base: int, msg: string) {
				prev := t.offset
				advance_rune(t)
				scan_mantissa(t, base)
				if t.offset - prev <= 1 {
					error_offset(t, t.offset, msg)
				}
			}

			advance_rune(t)
			switch t.ch {
			case 'b', 'B':
				int_base(t, 2, "illegal binary integer")
			case 'x', 'X':
				int_base(t, 16, "illegal hexadecimal integer")
			case:
				seen_point = false
				scan_mantissa(t, 10)
				if t.ch == '.' {
					seen_point = true
					if scan_fraction(t) {
						check_end = false
					}
				}
				if check_end {
					scan_exponent(t)
					check_end = false
				}
			}
		}
	}

	if check_end {
		scan_mantissa(t, 10)
		if !scan_fraction(t) {
			scan_exponent(t)
		}
	}

	return .Number, string(t.src[offset : t.offset])
}

scan_punct :: proc(t: ^Tokenizer, ch: rune) -> (kind: Token_Kind) {
	kind = .Punct
	switch ch {
	case:
		kind = .Invalid

	case '<', '>':
		if t.ch == ch {
			advance_rune(t)
		}
		if t.ch == '=' {
			advance_rune(t)
		}
	case '!', '+', '-', '*', '/', '%', '^', '=':
		if t.ch == '=' {
			advance_rune(t)
		}
	case '#':
		if t.ch == '#' {
			advance_rune(t)
		}
	case '&':
		if t.ch == '=' || t.ch == '&' {
			advance_rune(t)
		}
	case '|':
		if t.ch == '=' || t.ch == '|' {
			advance_rune(t)
		}
	case '(', ')', '[', ']', '{', '}':
		// okay
	case '~', ',', ':', ';', '?':
		// okay
	case '`':
		// okay
	case '.':
		if t.ch == '.' && peek(t) == '.' {
			advance_rune(t)
			advance_rune(t) // consume last '.'
		}
	}
	return
}

peek :: proc(t: ^Tokenizer) -> byte {
	if t.read_offset < len(t.src) {
		return t.src[t.read_offset]
	}
	return 0
}

peek_str :: proc(t: ^Tokenizer, str: string) -> bool {
	if t.read_offset < len(t.src) {
		return strings.has_prefix(string(t.src[t.offset:]), str)
	}
	return false
}

scan_literal_prefix :: proc(t: ^Tokenizer, str: string, prefix: ^string) -> bool {
	if peek_str(t, str) {
		offset := t.offset
		for _ in str {
			advance_rune(t)
		}
		prefix^ = string(t.src[offset:][:len(str)-1])
		return true
	}
	return false
}

allow_next_to_be_newline :: proc(t: ^Tokenizer) -> bool {
	if t.ch == '\n' {
		advance_rune(t)
		return true
	} else if t.ch == '\r' && peek(t) == '\n' {
		// allow for MS-DOS style line endings
		advance_rune(t) // \r
		advance_rune(t) // \n
		return true
	}
	return false
}
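// scan returns the next token, restarting itself after comments and after
// backslash-newline line continuations. Each token records whether it is the
// first on its line (at_bol) and whether whitespace precedes it (has_space);
// a C preprocessor needs both, e.g. to require '#' directives to start a
// line and to distinguish "F(x)" from "F (x)" in macro definitions.
// For example, scanning "a // c\n b" yields 'a' with at_bol = true and
// has_space = false, then 'b' with at_bol = true and has_space = true.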
scan :: proc(t: ^Tokenizer, f: ^File) -> ^Token {
	skip_whitespace(t)

	offset := t.offset

	kind: Token_Kind
	lit: string
	prefix: string

	switch ch := t.ch; {
	case scan_literal_prefix(t, `u8"`, &prefix):
		kind = .String
		lit = scan_string(t)
	case scan_literal_prefix(t, `u"`, &prefix):
		kind = .String
		lit = scan_string(t)
	case scan_literal_prefix(t, `L"`, &prefix):
		kind = .String
		lit = scan_string(t)
	case scan_literal_prefix(t, `U"`, &prefix):
		kind = .String
		lit = scan_string(t)
	case scan_literal_prefix(t, `u'`, &prefix):
		kind = .Char
		lit = scan_rune(t)
	case scan_literal_prefix(t, `L'`, &prefix):
		kind = .Char
		lit = scan_rune(t)
	case scan_literal_prefix(t, `U'`, &prefix):
		kind = .Char
		lit = scan_rune(t)
	case is_ident0(ch):
		lit = scan_identifier(t)
		kind = .Ident
	case '0' <= ch && ch <= '9':
		kind, lit = scan_number(t, false)
	case:
		advance_rune(t)
		switch ch {
		case -1:
			kind = .EOF
		case '\\':
			kind = .Punct
			if allow_next_to_be_newline(t) {
				// backslash-newline: splice the lines and rescan
				t.at_bol = true
				t.has_space = false
				return scan(t, f)
			}
		case '.':
			if is_digit(t.ch) {
				kind, lit = scan_number(t, true)
			} else {
				kind = scan_punct(t, ch)
			}
		case '"':
			kind = .String
			lit = scan_string(t)
		case '\'':
			kind = .Char
			lit = scan_rune(t)
		case '/':
			if t.ch == '/' || t.ch == '*' {
				kind = .Comment
				lit = scan_comment(t)
				t.has_space = true
				break
			}
			fallthrough
		case:
			kind = scan_punct(t, ch)
			if kind == .Invalid && ch != utf8.RUNE_BOM {
				error_offset(t, t.offset, "illegal character '%r': %d", ch, ch)
			}
		}
	}

	if lit == "" {
		lit = string(t.src[offset : t.offset])
	}

	if kind == .Comment {
		return scan(t, f)
	}

	tok := new(Token)
	tok.kind = kind
	tok.lit = lit
	tok.pos = offset_to_pos(t, offset)
	tok.file = f
	tok.prefix = prefix
	tok.at_bol = t.at_bol
	tok.has_space = t.has_space
	t.at_bol, t.has_space = false, false
	return tok
}

tokenize :: proc(t: ^Tokenizer, f: ^File) -> ^Token {
	setup_tokenizer: {
		t.src = f.src
		t.ch = ' '
		t.offset = 0
		t.read_offset = 0
		t.line_offset = 0
		t.line_count = len(t.src) > 0 ? 1 : 0
		t.error_count = 0
		t.path = f.name

		advance_rune(t)
		if t.ch == utf8.RUNE_BOM {
			advance_rune(t) // ignore a BOM at the beginning of the file
		}
	}

	t.at_bol = true
	t.has_space = false

	head: Token
	curr := &head
	for {
		tok := scan(t, f)
		if tok == nil {
			break
		}
		curr.next = tok
		curr = curr.next
		if tok.kind == .EOF {
			break
		}
	}

	return head.next
}

add_new_file :: proc(t: ^Tokenizer, name: string, src: []byte, id: int) -> ^File {
	file := new(File)
	file.id = id
	file.src = src
	file.name = name
	file.display_name = name
	return file
}

tokenize_file :: proc(t: ^Tokenizer, path: string, id: int, loc := #caller_location) -> ^Token {
	src, ok := os.read_entire_file(path)
	if !ok {
		return nil
	}
	return tokenize(t, add_new_file(t, path, src, id))
}

inline_tokenize :: proc(t: ^Tokenizer, tok: ^Token, src: []byte) -> ^Token {
	file := new(File)
	file.src = src
	if tok.file != nil {
		file.id = tok.file.id
		file.name = tok.file.name
		file.display_name = tok.file.name
	}
	return tokenize(t, file)
}
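// Example usage: a minimal sketch, assuming the Token and File definitions
// used throughout this package (kind, lit, next, and so on). The file name
// and C source below are illustrative only.
example_usage :: proc() {
	t: Tokenizer
	init_defaults(&t)

	src := "int x = 42;"
	file := add_new_file(&t, "<example>", transmute([]byte)src, 0)
	for tok := tokenize(&t, file); tok != nil && tok.kind != .EOF; tok = tok.next {
		fmt.printf("%v %q\n", tok.kind, tok.lit)
	}
}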