author    gingerBill <ginger.bill.22@gmail.com> 2016-07-07 21:47:33 +0100
committer gingerBill <ginger.bill.22@gmail.com> 2016-07-07 21:47:33 +0100
commit    7430008fd7088339821923bdff533369b7967904 (patch)
tree      207c8e9b67f938c38982d1c9a2630e2ff7018700 /src/tokenizer.cpp
Initial Commit
Original Git Repo "went bad" so I had to restart.
Diffstat (limited to 'src/tokenizer.cpp')
-rw-r--r--  src/tokenizer.cpp  760
1 file changed, 760 insertions(+), 0 deletions(-)
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
new file mode 100644
index 000000000..38a6f54fe
--- /dev/null
+++ b/src/tokenizer.cpp
@@ -0,0 +1,760 @@
+// TODO(bill): Unicode support
+b32 rune_is_letter(Rune r) {
+ if ((r < 0x80 && gb_char_is_alpha(cast(char)r)) || r == '_') {
+ return true;
+ }
+ return false;
+}
+
+b32 rune_is_digit(Rune r) {
+ if (r < 0x80 && gb_is_between(r, '0', '9'))
+ return true;
+ return false;
+}
+
+b32 rune_is_whitespace(Rune r) {
+ switch (r) {
+ case ' ':
+ case '\t':
+ case '\n':
+ case '\r':
+ case '\f':
+ case '\v':
+ return true;
+ }
+ return false;
+}
+
+typedef enum TokenKind TokenKind;
+enum TokenKind {
+ Token_Invalid,
+ Token_EOF,
+
+Token__LiteralBegin,
+ Token_Identifier,
+ Token_Integer,
+ Token_Float,
+ Token_Rune,
+ Token_String,
+Token__LiteralEnd,
+
+Token__OperatorBegin,
+ Token_Eq, // =
+
+ Token_Not, // ! (Unary Boolean)
+ Token_Hash, // #
+ Token_At, // @ // TODO(bill): Remove
+ Token_Pointer, // ^
+
+ Token_Add, // +
+ Token_Sub, // -
+ Token_Mul, // *
+ Token_Quo, // /
+ Token_Mod, // %
+
+ Token_AddEq, // +=
+ Token_SubEq, // -=
+ Token_MulEq, // *=
+ Token_QuoEq, // /=
+ Token_ModEq, // %=
+
+ Token_And, // &
+ Token_Or, // |
+ Token_Xor, // ~
+ Token_AndNot, // &~
+
+ Token_AndEq, // &=
+ Token_OrEq, // |=
+ Token_XorEq, // ~=
+ Token_AndNotEq, // &~=
+
+ Token_Increment, // ++
+ Token_Decrement, // --
+ Token_ArrowRight, // ->
+ Token_ArrowLeft, // <-
+
+ Token_CmpAnd, // &&
+ Token_CmpOr, // ||
+Token__ComparisonBegin,
+ Token_CmpEq, // ==
+ Token_Lt, // <
+ Token_Gt, // >
+ Token_NotEq, // !=
+ Token_LtEq, // <=
+ Token_GtEq, // >=
+Token__ComparisonEnd,
+ Token_CmpAndEq, // &&=
+ Token_CmpOrEq, // ||=
+
+ Token_OpenParen, // (
+ Token_CloseParen, // )
+ Token_OpenBracket, // [
+ Token_CloseBracket, // ]
+ Token_OpenBrace, // {
+ Token_CloseBrace, // }
+
+ Token_Colon, // :
+ Token_Semicolon, // ;
+ Token_Period, // .
+ Token_Comma, // ,
+ Token_Ellipsis, // ...
+Token__OperatorEnd,
+
+Token__KeywordBegin,
+ Token_type,
+ Token_proc,
+
+ Token_match, // TODO(bill): switch vs match?
+ Token_break,
+ Token_continue,
+ Token_fallthrough,
+ Token_case,
+ Token_default,
+
+ Token_if,
+ Token_else,
+ Token_for,
+ Token_defer,
+ Token_return,
+ Token_import,
+ Token_cast,
+
+ Token_struct,
+ Token_union,
+ Token_enum,
+
+ Token_inline,
+ Token_no_inline,
+Token__KeywordEnd,
+
+ Token_Count,
+};
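+// NOTE: The Token__XxxBegin/Token__XxxEnd sentinels bracket each category so
+// a membership test reduces to a single integer range check (see
+// token_is_literal and friends further down).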
+
+char const *TOKEN_STRINGS[] = {
+ "Invalid",
+ "EOF",
+"_LiteralBegin",
+ "Identifier",
+ "Integer",
+ "Float",
+ "Rune",
+ "String",
+"_LiteralEnd",
+"_OperatorBegin",
+ "=",
+ "!",
+ "#",
+ "@",
+ "^",
+ "+",
+ "-",
+ "*",
+ "/",
+ "%",
+ "+=",
+ "-=",
+ "*=",
+ "/=",
+ "%=",
+ "&",
+ "|",
+ "~",
+ "&~",
+ "&=",
+ "|=",
+ "~=",
+ "&~=",
+ "++",
+ "--",
+ "->",
+ "<-",
+ "&&",
+ "||",
+"_ComparisonBegin",
+ "==",
+ "<",
+ ">",
+ "!=",
+ "<=",
+ ">=",
+"_ComparisonEnd",
+ "&&=",
+ "||=",
+ "(",
+ ")",
+ "[",
+ "]",
+ "{",
+ "}",
+ ":",
+ ";",
+ ".",
+ ",",
+ "...",
+"_OperatorEnd",
+"_KeywordBegin",
+ "type",
+ "proc",
+ "switch",
+ "break",
+ "continue",
+ "fallthrough",
+ "case",
+ "default",
+ "if",
+ "else",
+ "for",
+ "defer",
+ "return",
+ "import",
+ "cast",
+ "struct",
+ "union",
+ "enum",
+ "inline",
+ "no_inline",
+ "import",
+"_KeywordEnd",
+};
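+
+// The string table must stay index-aligned with TokenKind; a compile-time
+// count check catches drift when tokens are added. A minimal sketch, assuming
+// gb.h's gb_count_of and GB_STATIC_ASSERT:
+GB_STATIC_ASSERT(gb_count_of(TOKEN_STRINGS) == Token_Count);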
+
+
+// NOTE(bill): Text is UTF-8, hence u8 rather than char
+typedef struct Token Token;
+struct Token {
+ TokenKind kind;
+ String string;
+ isize line, column;
+};
+
+
+
+char const *token_kind_to_string(TokenKind kind) {
+ return TOKEN_STRINGS[kind];
+}
+
+i32 token_precedence(Token t) {
+ switch (t.kind) {
+ case Token_CmpOr: return 1;
+ case Token_CmpAnd: return 2;
+
+ case Token_CmpEq:
+ case Token_NotEq:
+ case Token_Lt:
+ case Token_Gt:
+ case Token_LtEq:
+ case Token_GtEq:
+ return 3;
+
+ case Token_Add:
+ case Token_Sub:
+ case Token_Or:
+ case Token_Xor:
+ return 4;
+
+ case Token_Mul:
+ case Token_Quo:
+ case Token_Mod:
+ case Token_And:
+ case Token_AndNot:
+ return 5;
+ }
+
+ return 0;
+}
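+
+// token_precedence is meant to drive a precedence-climbing expression parser:
+// parse a primary expression, then keep consuming operators whose precedence
+// is at least the current minimum. A sketch in comments (Parser, AstNode,
+// parse_primary_expr, next_token and make_binary_node are hypothetical, not
+// part of this file):
+//
+// AstNode *parse_binary_expr(Parser *p, i32 min_prec) {
+//     AstNode *left = parse_primary_expr(p);
+//     for (;;) {
+//         i32 prec = token_precedence(p->curr_token);
+//         if (prec == 0 || prec < min_prec)
+//             break; // Not a binary operator, or it binds too loosely
+//         Token op = p->curr_token;
+//         next_token(p);
+//         left = make_binary_node(op, left, parse_binary_expr(p, prec+1));
+//     }
+//     return left;
+// }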
+
+
+gb_inline b32 token_is_literal(Token t) {
+ return gb_is_between(t.kind, Token__LiteralBegin+1, Token__LiteralEnd-1);
+}
+gb_inline b32 token_is_operator(Token t) {
+ return gb_is_between(t.kind, Token__OperatorBegin+1, Token__OperatorEnd-1);
+}
+gb_inline b32 token_is_keyword(Token t) {
+ return gb_is_between(t.kind, Token__KeywordBegin+1, Token__KeywordEnd-1);
+}
+gb_inline b32 token_is_comparison(Token t) {
+ return gb_is_between(t.kind, Token__ComparisonBegin+1, Token__ComparisonEnd-1);
+}
+
+gb_inline void print_token(Token t) { gb_printf("%.*s\n", LIT(t.string)); }
+
+typedef struct Tokenizer Tokenizer;
+struct Tokenizer {
+ char *fullpath;
+ u8 *start;
+ u8 *end;
+
+ Rune curr_rune; // Current rune
+ u8 * curr; // Position of curr_rune
+ u8 * read_curr; // Next position to read from
+ u8 * line; // Start of the current line
+ isize line_count;
+};
+
+
+#define tokenizer_error(t, msg, ...) tokenizer_error_(t, __FUNCTION__, msg, ##__VA_ARGS__)
+void tokenizer_error_(Tokenizer *t, char *function, char *msg, ...) {
+ va_list va;
+ isize column = t->read_curr - t->line+1;
+ if (column < 1)
+ column = 1;
+
+ gb_printf_err("%s()\n", function);
+ gb_printf_err("%s(%td:%td) ", t->fullpath, t->line_count, column);
+
+ va_start(va, msg);
+ gb_printf_err_va(msg, va);
+ va_end(va);
+
+ gb_printf_err("\n");
+
+ gb_exit(1);
+}
+
+void advance_to_next_rune(Tokenizer *t) {
+ if (t->read_curr < t->end) {
+ Rune rune;
+ isize width = 1;
+
+ t->curr = t->read_curr;
+ if (t->curr_rune == '\n') {
+ t->line = t->curr;
+ t->line_count++;
+ }
+ rune = *t->read_curr;
+ if (rune == 0) {
+ tokenizer_error(t, "Illegal character NUL");
+ } else if (rune >= 0x80) { // not ASCII
+ width = gb_utf8_decode(t->read_curr, t->end-t->read_curr, &rune);
+ if (rune == GB_RUNE_INVALID && width == 1)
+ tokenizer_error(t, "Illegal UTF-8 encoding");
+ else if (rune == GB_RUNE_BOM && t->curr-t->start > 0)
+ tokenizer_error(t, "Illegal byte order mark");
+ }
+ t->read_curr += width;
+ t->curr_rune = rune;
+ } else {
+ t->curr = t->end;
+ if (t->curr_rune == '\n') {
+ t->line = t->curr;
+ t->line_count++;
+ }
+ t->curr_rune = GB_RUNE_EOF;
+ }
+}
+
+b32 init_tokenizer(Tokenizer *t, char *filename) {
+ gbFileContents fc = gb_file_read_contents(gb_heap_allocator(), true, filename);
+ gb_zero_item(t);
+ if (fc.data) {
+ t->start = cast(u8 *)fc.data;
+ t->line = t->read_curr = t->curr = t->start;
+ t->end = t->start + fc.size;
+
+ t->fullpath = gb_path_get_full_name(gb_heap_allocator(), filename);
+
+ t->line_count = 1;
+
+ advance_to_next_rune(t);
+ if (t->curr_rune == GB_RUNE_BOM)
+ advance_to_next_rune(t); // Ignore BOM at file beginning
+ return true;
+ }
+ return false;
+}
+
+gb_inline void destroy_tokenizer(Tokenizer *t) {
+ gb_free(gb_heap_allocator(), t->start);
+}
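+
+// A typical driver loop, sketched in comments (tokenizer_get_token is defined
+// at the bottom of this file; "main.odin" is just a placeholder path):
+//
+// Tokenizer t = {};
+// if (init_tokenizer(&t, "main.odin")) {
+//     for (;;) {
+//         Token token = tokenizer_get_token(&t);
+//         print_token(token);
+//         if (token.kind == Token_EOF)
+//             break;
+//     }
+//     destroy_tokenizer(&t);
+// }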
+
+void tokenizer_skip_whitespace(Tokenizer *t) {
+ for (;;) {
+ if (rune_is_whitespace(t->curr_rune)) {
+ advance_to_next_rune(t);
+ } else if (t->curr_rune == '/') {
+ if (t->read_curr[0] == '/') { // Line comment //
+ while (t->curr_rune != '\n' && t->curr_rune != GB_RUNE_EOF)
+ advance_to_next_rune(t);
+ } else if (t->read_curr[0] == '*') { // (Nested) Block comment /**/
+ isize comment_scope = 1;
+ for (;;) {
+ advance_to_next_rune(t);
+ if (t->curr_rune == GB_RUNE_EOF)
+ break; // NOTE: unterminated block comment at end of file
+ if (t->curr_rune == '/') {
+ advance_to_next_rune(t);
+ if (t->curr_rune == '*') {
+ advance_to_next_rune(t);
+ comment_scope++;
+ }
+ }
+ if (t->curr_rune == '*') {
+ advance_to_next_rune(t);
+ if (t->curr_rune == '/') {
+ advance_to_next_rune(t);
+ comment_scope--;
+ }
+ }
+ if (comment_scope == 0)
+ break;
+ }
+ } else {
+ break;
+ }
+ } else {
+ break;
+ }
+ }
+}
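+
+// Block comments nest by counting scopes rather than stopping at the first
+// "*/": scanning "/* a /* b */ c */" takes comment_scope 1 -> 2 -> 1 -> 0,
+// so the whole span is skipped as a single comment.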
+
+gb_inline i32 digit_value(Rune r) {
+ if (gb_char_is_digit(cast(char)r))
+ return r - '0';
+ if (gb_is_between(cast(char)r, 'a', 'f'))
+ return r - 'a' + 10;
+ if (gb_is_between(cast(char)r, 'A', 'F'))
+ return r - 'A' + 10;
+ return 16; // NOTE(bill): Larger than highest possible
+}
+
+gb_inline void scan_mantissa(Tokenizer *t, i32 base) {
+ // TODO(bill): Allow for underscores in numbers as a number separator
+ // TODO(bill): Is this a good idea?
+ // while (digit_value(t->curr_rune) < base || t->curr_rune == '_')
+ while (digit_value(t->curr_rune) < base)
+ advance_to_next_rune(t);
+}
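+
+// Because digit_value returns 16 for non-digits, the loop above stops at the
+// first rune that is not a valid digit in the given base: "1f" in base 16
+// consumes both runes, while "19" in base 8 stops at the '9'.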
+
+
+Token scan_number_to_token(Tokenizer *t, b32 seen_decimal_point) {
+ Token token = {};
+ u8 *start_curr = t->curr;
+ token.kind = Token_Integer;
+ token.string = make_string(start_curr, 1);
+ token.line = t->line_count;
+ token.column = t->curr-t->line+1;
+
+ if (seen_decimal_point) {
+ start_curr--; // The '.' was already consumed by the caller
+ token.string.text = start_curr; // Widen the token to include it
+ token.kind = Token_Float;
+ scan_mantissa(t, 10);
+ goto exponent;
+ }
+
+ if (t->curr_rune == '0') {
+ u8 *prev = t->curr;
+ advance_to_next_rune(t);
+ if (t->curr_rune == 'b') { // Binary
+ advance_to_next_rune(t);
+ scan_mantissa(t, 2);
+ if (t->curr - prev <= 2)
+ token.kind = Token_Invalid;
+ } else if (t->curr_rune == 'o') { // Octal
+ advance_to_next_rune(t);
+ scan_mantissa(t, 8);
+ if (t->curr - prev <= 2)
+ token.kind = Token_Invalid;
+ } else if (t->curr_rune == 'd') { // Decimal
+ advance_to_next_rune(t);
+ scan_mantissa(t, 10);
+ if (t->curr - prev <= 2)
+ token.kind = Token_Invalid;
+ } else if (t->curr_rune == 'x') { // Hexadecimal
+ advance_to_next_rune(t);
+ scan_mantissa(t, 16);
+ if (t->curr - prev <= 2)
+ token.kind = Token_Invalid;
+ } else {
+ seen_decimal_point = false;
+ scan_mantissa(t, 10);
+
+ if (t->curr_rune == '.' || t->curr_rune == 'e' || t->curr_rune == 'E') {
+ seen_decimal_point = true;
+ goto fraction;
+ }
+ }
+ goto end;
+ }
+
+ scan_mantissa(t, 10);
+
+fraction:
+ if (t->curr_rune == '.') {
+ token.kind = Token_Float;
+ advance_to_next_rune(t);
+ scan_mantissa(t, 10);
+ }
+
+exponent:
+ if (t->curr_rune == 'e' || t->curr_rune == 'E') {
+ token.kind = Token_Float;
+ advance_to_next_rune(t);
+ if (t->curr_rune == '-' || t->curr_rune == '+')
+ advance_to_next_rune(t);
+ scan_mantissa(t, 10);
+ }
+
+end:
+ token.string.len = t->curr - token.string.text;
+ return token;
+}
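+
+// Literal forms the scanner above accepts, for reference:
+// 42, 0b1010, 0o777, 0d99, 0x7fff -> Token_Integer
+// 1.5, .25, 2e10, 6.02E+23, 1e-9 -> Token_Float
+// A bare prefix with no digits, e.g. "0x" (t->curr - prev <= 2), is marked
+// Token_Invalid.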
+
+// Quote == " for string literals and ' for rune literals
+b32 scan_escape(Tokenizer *t, Rune quote) {
+ isize len = 0;
+ u32 base = 0, max = 0, x = 0;
+
+ Rune r = t->curr_rune;
+ if (r == 'a' ||
+ r == 'b' ||
+ r == 'f' ||
+ r == 'n' ||
+ r == 'r' ||
+ r == 't' ||
+ r == 'v' ||
+ r == '\\' ||
+ r == quote) {
+ advance_to_next_rune(t);
+ return true;
+ } else if (gb_is_between(r, '0', '7')) {
+ len = 3; base = 8; max = 255;
+ } else if (r == 'x') {
+ advance_to_next_rune(t);
+ len = 2; base = 16; max = 255;
+ } else if (r == 'u') {
+ advance_to_next_rune(t);
+ len = 4; base = 16; max = GB_RUNE_MAX;
+ } else if (r == 'U') {
+ advance_to_next_rune(t);
+ len = 8; base = 16; max = GB_RUNE_MAX;
+ } else {
+ if (t->curr_rune < 0)
+ tokenizer_error(t, "Escape sequence was not terminated");
+ else
+ tokenizer_error(t, "Unknown escape sequence");
+ return false;
+ }
+
+ while (len-- > 0) {
+ u32 d = cast(u32)digit_value(t->curr_rune);
+ if (d >= base) {
+ if (t->curr_rune < 0)
+ tokenizer_error(t, "Escape sequence was not terminated");
+ else
+ tokenizer_error(t, "Illegal character %d in escape sequence", t->curr_rune);
+ return false;
+ }
+
+ x = x*base + d;
+ advance_to_next_rune(t);
+ }
+
+ if (x > max) { // NOTE: e.g. "\777" is 511, above the octal maximum of 255
+ tokenizer_error(t, "Escape sequence is an invalid value");
+ return false;
+ }
+
+ return true;
+}
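+
+// Escapes validated above: single characters (\n, \t, \\, the quote itself),
+// octal \ooo (3 digits, max 255), \x (2 hex digits), \u (4 hex digits) and
+// \U (8 hex digits). The tokenizer only validates the sequence; the raw text
+// stays in the token's string.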
+
+gb_inline TokenKind token_type_variant2(Tokenizer *t, TokenKind a, TokenKind b) {
+ if (t->curr_rune == '=') {
+ advance_to_next_rune(t);
+ return b;
+ }
+ return a;
+}
+
+
+gb_inline TokenKind token_type_variant3(Tokenizer *t, TokenKind a, TokenKind b, Rune ch_c, TokenKind c) {
+ if (t->curr_rune == '=') {
+ advance_to_next_rune(t);
+ return b;
+ }
+ if (t->curr_rune == ch_c) {
+ advance_to_next_rune(t);
+ return c;
+ }
+ return a;
+}
+
+gb_inline TokenKind token_type_variant4(Tokenizer *t, TokenKind a, TokenKind b, Rune ch_c, TokenKind c, Rune ch_d, TokenKind d) {
+ if (t->curr_rune == '=') {
+ advance_to_next_rune(t);
+ return b;
+ } else if (t->curr_rune == ch_c) {
+ advance_to_next_rune(t);
+ return c;
+ } else if (t->curr_rune == ch_d) {
+ advance_to_next_rune(t);
+ return d;
+ }
+ return a;
+}
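+
+// Example: after '-' is consumed, token_type_variant4 resolves one rune of
+// lookahead: "-" -> Token_Sub, "-=" -> Token_SubEq, "--" -> Token_Decrement,
+// "->" -> Token_ArrowRight; the second rune is consumed only on a match.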
+
+Token tokenizer_get_token(Tokenizer *t) {
+ Token token = {};
+ Rune curr_rune;
+
+ tokenizer_skip_whitespace(t);
+ token.string = make_string(t->curr, 1);
+ token.line = t->line_count;
+ token.column = t->curr - t->line + 1;
+
+ curr_rune = t->curr_rune;
+ if (rune_is_letter(curr_rune)) {
+ token.kind = Token_Identifier;
+ while (rune_is_letter(t->curr_rune) || rune_is_digit(t->curr_rune))
+ advance_to_next_rune(t);
+
+ token.string.len = t->curr - token.string.text;
+
+ // NOTE(bill): All keywords are longer than 1 character, so 1-rune identifiers can skip the keyword check
+ if (token.string.len > 1) {
+ #define KWB if (0) {}
+ #define KWT(keyword, token_type) else if ((gb_size_of(keyword)-1) == token.string.len && gb_strncmp((char *)token.string.text, keyword, token.string.len) == 0) token.kind = token_type
+ #define KWE else {}
+
+ KWB
+ KWT("type", Token_type);
+ KWT("proc", Token_proc);
+ KWT("match", Token_match);
+ KWT("break", Token_break);
+ KWT("continue", Token_continue);
+ KWT("fallthrough", Token_fallthrough);
+ KWT("case", Token_case);
+ KWT("default", Token_default);
+ KWT("if", Token_if);
+ KWT("else", Token_else);
+ KWT("for", Token_for);
+ KWT("defer", Token_defer);
+ KWT("return", Token_return);
+ KWT("import", Token_import);
+ KWT("cast", Token_cast);
+ KWT("struct", Token_struct);
+ KWT("union", Token_union);
+ KWT("enum", Token_enum);
+ KWT("inline", Token_inline);
+ KWT("no_inline", Token_no_inline);
+ KWE
+
+ #undef KWB
+ #undef KWT
+ #undef KWE
+ }
+
+ } else if (gb_is_between(curr_rune, '0', '9')) {
+ token = scan_number_to_token(t, false);
+ } else {
+ advance_to_next_rune(t);
+ switch (curr_rune) {
+ case GB_RUNE_EOF:
+ token.kind = Token_EOF;
+ break;
+ case '"': // String Literal
+ token.kind = Token_String;
+ for (;;) {
+ Rune r = t->curr_rune;
+ if (r == '\n' || r < 0) {
+ tokenizer_error(t, "String literal not terminated");
+ break;
+ }
+ advance_to_next_rune(t);
+ if (r == '"')
+ break;
+ if (r == '\\')
+ scan_escape(t, '"');
+ }
+ break;
+
+ case '\'': { // Rune Literal
+ b32 valid = true;
+ isize len = 0;
+ token.kind = Token_Rune;
+ for (;;) {
+ Rune r = t->curr_rune;
+ if (r == '\n' || r < 0) {
+ if (valid)
+ tokenizer_error(t, "Rune literal not terminated");
+ break;
+ }
+ advance_to_next_rune(t);
+ if (r == '\'')
+ break;
+ len++;
+ if (r == '\\') {
+ if (!scan_escape(t, '\''))
+ valid = false;
+ }
+ }
+
+ if (valid && len != 1)
+ tokenizer_error(t, "Illegal rune literal");
+ } break;
+
+ case '.':
+ token.kind = Token_Period; // Default
+ if (gb_is_between(t->curr_rune, '0', '9')) { // Might be a number
+ token = scan_number_to_token(t, true);
+ } else if (t->curr_rune == '.') { // Could be an ellipsis
+ advance_to_next_rune(t);
+ if (t->curr_rune == '.') {
+ advance_to_next_rune(t);
+ token.kind = Token_Ellipsis;
+ }
+ }
+ break;
+
+ case '#': token.kind = Token_Hash; break;
+ case '@': token.kind = Token_At; break;
+ case '^': token.kind = Token_Pointer; break;
+ case ';': token.kind = Token_Semicolon; break;
+ case ',': token.kind = Token_Comma; break;
+ case '(': token.kind = Token_OpenParen; break;
+ case ')': token.kind = Token_CloseParen; break;
+ case '[': token.kind = Token_OpenBracket; break;
+ case ']': token.kind = Token_CloseBracket; break;
+ case '{': token.kind = Token_OpenBrace; break;
+ case '}': token.kind = Token_CloseBrace; break;
+ case ':': token.kind = Token_Colon; break;
+
+ case '*': token.kind = token_type_variant2(t, Token_Mul, Token_MulEq); break;
+ case '/': token.kind = token_type_variant2(t, Token_Quo, Token_QuoEq); break;
+ case '%': token.kind = token_type_variant2(t, Token_Mod, Token_ModEq); break;
+ case '=': token.kind = token_type_variant2(t, Token_Eq, Token_CmpEq); break;
+ case '~': token.kind = token_type_variant2(t, Token_Xor, Token_XorEq); break;
+ case '!': token.kind = token_type_variant2(t, Token_Not, Token_NotEq); break;
+ case '>': token.kind = token_type_variant2(t, Token_Gt, Token_GtEq); break;
+ case '<': token.kind = token_type_variant3(t, Token_Lt, Token_LtEq, '-', Token_ArrowLeft); break;
+ case '+': token.kind = token_type_variant3(t, Token_Add, Token_AddEq, '+', Token_Increment); break;
+ case '-': token.kind = token_type_variant4(t, Token_Sub, Token_SubEq, '-', Token_Decrement, '>', Token_ArrowRight); break;
+
+ case '&':
+ token.kind = Token_And;
+ if (t->curr_rune == '~') {
+ advance_to_next_rune(t);
+ token.kind = token_type_variant2(t, Token_AndNot, Token_AndNotEq);
+ } else {
+ token.kind = token_type_variant3(t, Token_And, Token_AndEq, '&', Token_CmpAnd);
+ if (token.kind == Token_CmpAnd && t->curr_rune == '=') {
+ token.kind = Token_CmpAndEq;
+ advance_to_next_rune(t);
+ }
+ }
+ break;
+
+ case '|':
+ token.kind = token_type_variant3(t, Token_Or, Token_OrEq, '|', Token_CmpOr);
+ if (token.kind == Token_CmpOr && t->curr_rune == '=') {
+ token.kind = Token_CmpOrEq;
+ advance_to_next_rune(t);
+ }
+ break;
+
+ default:
+ if (curr_rune != GB_RUNE_BOM)
+ tokenizer_error(t, "Illegal character: %c (%d) ", cast(char)curr_rune, curr_rune);
+ token.kind = Token_Invalid;
+ break;
+ }
+ }
+
+ token.string.len = t->curr - token.string.text;
+ return token;
+}