diff options
| author | gingerBill <bill@gingerbill.org> | 2021-08-01 22:41:00 +0100 |
|---|---|---|
| committer | gingerBill <bill@gingerbill.org> | 2021-08-01 22:41:00 +0100 |
| commit | b1a8357f509d0dfa2d496746831dfb1008ab94a7 (patch) | |
| tree | e217862aed1f59014d1181c01819a2ed931c2230 /src/tokenizer.cpp | |
| parent | 700624119beb5ebf6b9ec4e16ece2ba455c7cae1 (diff) | |
Clean up a bit of the tokenizer code so that the semicolon insertion rules are in one place
Diffstat (limited to 'src/tokenizer.cpp')
| -rw-r--r-- | src/tokenizer.cpp | 269 |
1 file changed, 142 insertions, 127 deletions
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 7bfcb496b..a073abc37 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -152,7 +152,6 @@ gb_global bool keyword_indices[16] = {}; gb_inline u32 keyword_hash(u8 const *text, isize len) { return fnv32a(text, len); - // return murmur3_32(text, len, 0x6f64696e); } void add_keyword_hash_entry(String const &s, TokenKind kind) { max_keyword_size = gb_max(max_keyword_size, s.len); @@ -164,7 +163,7 @@ void add_keyword_hash_entry(String const &s, TokenKind kind) { // NOTE(bill): This is a bit of an empirical hack in order to speed things up u32 index = hash & KEYWORD_HASH_TABLE_MASK; KeywordHashEntry *entry = &keyword_hash_table[index]; - GB_ASSERT_MSG(entry->kind == Token_Invalid, "Keyword hash table initialtion collision: %.*s %.*s %08x %08x", LIT(s), LIT(token_strings[entry->kind]), hash, entry->hash); + GB_ASSERT_MSG(entry->kind == Token_Invalid, "Keyword hash table initialtion collision: %.*s %.*s 0x%08x 0x%08x", LIT(s), LIT(token_strings[entry->kind]), hash, entry->hash); entry->hash = hash; entry->kind = kind; entry->text = s; @@ -1095,23 +1094,36 @@ bool scan_escape(Tokenizer *t) { void tokenizer_get_token(Tokenizer *t, Token *token, int repeat=0) { + // Skip whitespace - for (;;) { - switch (t->curr_rune) { - case '\n': - if (t->insert_semicolon) { + if (t->flags & TokenizerFlag_InsertSemicolon && t->insert_semicolon) { + for (;;) { + switch (t->curr_rune) { + case '\n': break; + case ' ': + case '\t': + case '\r': + advance_to_next_rune(t); + continue; } - /*fallthrough*/ - case ' ': - case '\t': - case '\r': - advance_to_next_rune(t); - continue; + break; + } + } else { + for (;;) { + switch (t->curr_rune) { + case '\n': // only on t->insert_semicolon + case ' ': + case '\t': + case '\r': + advance_to_next_rune(t); + continue; + } + break; } - break; } + token->kind = Token_Invalid; token->string.text = t->curr; token->string.len = 1; @@ -1122,8 +1134,6 @@ void tokenizer_get_token(Tokenizer *t, Token 
*token, int repeat=0) { TokenPos current_pos = token->pos; - bool insert_semicolon = false; - Rune curr_rune = t->curr_rune; if (rune_is_letter(curr_rune)) { token->kind = Token_Ident; @@ -1148,28 +1158,15 @@ void tokenizer_get_token(Tokenizer *t, Token *token, int repeat=0) { } } - switch (token->kind) { - case Token_Ident: - case Token_context: - case Token_typeid: // Dunno? - case Token_break: - case Token_continue: - case Token_fallthrough: - case Token_return: - insert_semicolon = true; - break; + goto semicolon_check; + } else { + switch (curr_rune) { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + scan_number_to_token(t, token, false); + goto semicolon_check; } - - if (t->flags & TokenizerFlag_InsertSemicolon) { - t->insert_semicolon = insert_semicolon; - } - return; - - } else if (gb_is_between(curr_rune, '0', '9')) { - insert_semicolon = true; - scan_number_to_token(t, token, false); - } else { advance_to_next_rune(t); switch (curr_rune) { case GB_RUNE_EOF: @@ -1201,8 +1198,6 @@ void tokenizer_get_token(Tokenizer *t, Token *token, int repeat=0) { case '\'': // Rune Literal { - insert_semicolon = true; - token->kind = Token_Rune; Rune quote = curr_rune; bool valid = true; @@ -1239,18 +1234,12 @@ void tokenizer_get_token(Tokenizer *t, Token *token, int repeat=0) { tokenizer_err(t, "Invalid rune literal"); } - if (t->flags & TokenizerFlag_InsertSemicolon) { - t->insert_semicolon = insert_semicolon; - } - - return; + goto semicolon_check; } break; case '`': // Raw String Literal case '"': // String Literal { - insert_semicolon = true; - bool has_carriage_return = false; i32 success; Rune quote = curr_rune; @@ -1296,15 +1285,13 @@ void tokenizer_get_token(Tokenizer *t, Token *token, int repeat=0) { tokenizer_err(t, "Invalid string literal"); } - if (t->flags & TokenizerFlag_InsertSemicolon) { - t->insert_semicolon = insert_semicolon; - } - - return; + goto semicolon_check; } break; case '.': - if 
(t->curr_rune == '.') { + token->kind = Token_Period; + switch (t->curr_rune) { + case '.': advance_to_next_rune(t); token->kind = Token_Ellipsis; if (t->curr_rune == '<') { @@ -1314,54 +1301,41 @@ void tokenizer_get_token(Tokenizer *t, Token *token, int repeat=0) { advance_to_next_rune(t); token->kind = Token_RangeFull; } - } else if ('0' <= t->curr_rune && t->curr_rune <= '9') { + break; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': scan_number_to_token(t, token, true); - } else { - token->kind = Token_Period; + break; } break; - - case '@': token->kind = Token_At; break; - case '$': token->kind = Token_Dollar; break; - case '?': - insert_semicolon = true; - token->kind = Token_Question; - break; - case '^': - insert_semicolon = true; - token->kind = Token_Pointer; - break; - case ';': token->kind = Token_Semicolon; break; - case ',': token->kind = Token_Comma; break; - case ':': token->kind = Token_Colon; break; - case '(': token->kind = Token_OpenParen; break; - case ')': - insert_semicolon = true; - token->kind = Token_CloseParen; - break; + case '@': token->kind = Token_At; break; + case '$': token->kind = Token_Dollar; break; + case '?': token->kind = Token_Question; break; + case '^': token->kind = Token_Pointer; break; + case ';': token->kind = Token_Semicolon; break; + case ',': token->kind = Token_Comma; break; + case ':': token->kind = Token_Colon; break; + case '(': token->kind = Token_OpenParen; break; + case ')': token->kind = Token_CloseParen; break; case '[': token->kind = Token_OpenBracket; break; - case ']': - insert_semicolon = true; - token->kind = Token_CloseBracket; - break; - case '{': token->kind = Token_OpenBrace; break; - case '}': - insert_semicolon = true; - token->kind = Token_CloseBrace; - break; - + case ']': token->kind = Token_CloseBracket; break; + case '{': token->kind = Token_OpenBrace; break; + case '}': token->kind = Token_CloseBrace; break; case '%': token->kind = 
Token_Mod; - if (t->curr_rune == '=') { + switch (t->curr_rune) { + case '=': advance_to_next_rune(t); token->kind = Token_ModEq; - } else if (t->curr_rune == '%') { + break; + case '%': token->kind = Token_ModMod; advance_to_next_rune(t); if (t->curr_rune == '=') { token->kind = Token_ModModEq; advance_to_next_rune(t); } + break; } break; @@ -1395,63 +1369,61 @@ void tokenizer_get_token(Tokenizer *t, Token *token, int repeat=0) { break; case '+': token->kind = Token_Add; - if (t->curr_rune == '=') { + switch (t->curr_rune) { + case '=': advance_to_next_rune(t); token->kind = Token_AddEq; - } else if (t->curr_rune == '+') { + break; + case '+': advance_to_next_rune(t); token->kind = Token_Increment; - insert_semicolon = true; + break; } break; case '-': token->kind = Token_Sub; - if (t->curr_rune == '=') { + switch (t->curr_rune) { + case '=': advance_to_next_rune(t); token->kind = Token_SubEq; - } else if (t->curr_rune == '-') { - insert_semicolon = true; + break; + case '-': advance_to_next_rune(t); token->kind = Token_Decrement; if (t->curr_rune == '-') { advance_to_next_rune(t); token->kind = Token_Undef; } - } else if (t->curr_rune == '>') { + break; + case '>': advance_to_next_rune(t); token->kind = Token_ArrowRight; + break; } break; - case '#': + token->kind = Token_Hash; if (t->curr_rune == '!') { - insert_semicolon = t->insert_semicolon; token->kind = Token_Comment; while (t->curr_rune != '\n' && t->curr_rune != GB_RUNE_EOF) { advance_to_next_rune(t); } - } else { - token->kind = Token_Hash; } break; - - - case '/': { + case '/': token->kind = Token_Quo; - if (t->curr_rune == '/') { - insert_semicolon = t->insert_semicolon; + switch (t->curr_rune) { + case '/': token->kind = Token_Comment; - while (t->curr_rune != '\n' && t->curr_rune != GB_RUNE_EOF) { advance_to_next_rune(t); } - } else if (t->curr_rune == '*') { + break; + case '*': token->kind = Token_Comment; - - isize comment_scope = 1; advance_to_next_rune(t); - while (comment_scope > 0) { + for 
(isize comment_scope = 1; comment_scope > 0; /**/) { if (t->curr_rune == GB_RUNE_EOF) { break; } else if (t->curr_rune == '/') { @@ -1470,97 +1442,140 @@ void tokenizer_get_token(Tokenizer *t, Token *token, int repeat=0) { advance_to_next_rune(t); } } - } else if (t->curr_rune == '=') { + break; + case '=': advance_to_next_rune(t); token->kind = Token_QuoEq; + break; } - } break; - + break; case '<': token->kind = Token_Lt; - if (t->curr_rune == '=') { + switch (t->curr_rune) { + case '=': token->kind = Token_LtEq; advance_to_next_rune(t); - } else if (t->curr_rune == '<') { + break; + case '<': token->kind = Token_Shl; advance_to_next_rune(t); if (t->curr_rune == '=') { token->kind = Token_ShlEq; advance_to_next_rune(t); } + break; } break; - case '>': token->kind = Token_Gt; - if (t->curr_rune == '=') { + switch (t->curr_rune) { + case '=': token->kind = Token_GtEq; advance_to_next_rune(t); - } else if (t->curr_rune == '>') { + break; + case '>': token->kind = Token_Shr; advance_to_next_rune(t); if (t->curr_rune == '=') { token->kind = Token_ShrEq; advance_to_next_rune(t); } + break; } break; - case '&': token->kind = Token_And; - if (t->curr_rune == '~') { + switch (t->curr_rune) { + case '~': token->kind = Token_AndNot; advance_to_next_rune(t); if (t->curr_rune == '=') { token->kind = Token_AndNotEq; advance_to_next_rune(t); } - } else if (t->curr_rune == '=') { + break; + case '=': token->kind = Token_AndEq; advance_to_next_rune(t); - } else if (t->curr_rune == '&') { + break; + case '&': token->kind = Token_CmpAnd; advance_to_next_rune(t); if (t->curr_rune == '=') { token->kind = Token_CmpAndEq; advance_to_next_rune(t); } + break; } break; - case '|': token->kind = Token_Or; - if (t->curr_rune == '=') { + switch (t->curr_rune) { + case '=': token->kind = Token_OrEq; advance_to_next_rune(t); - } else if (t->curr_rune == '|') { + break; + case '|': token->kind = Token_CmpOr; advance_to_next_rune(t); if (t->curr_rune == '=') { token->kind = Token_CmpOrEq; 
advance_to_next_rune(t); } + break; } break; - default: + token->kind = Token_Invalid; if (curr_rune != GB_RUNE_BOM) { u8 str[4] = {}; int len = cast(int)gb_utf8_encode_rune(str, curr_rune); tokenizer_err(t, "Illegal character: %.*s (%d) ", len, str, curr_rune); } - insert_semicolon = t->insert_semicolon; // Preserve insert_semicolon info - token->kind = Token_Invalid; break; } } - if (t->flags & TokenizerFlag_InsertSemicolon) { - t->insert_semicolon = insert_semicolon; - } - token->string.len = t->curr - token->string.text; +semicolon_check:; + if (t->flags & TokenizerFlag_InsertSemicolon) { + switch (token->kind) { + case Token_Invalid: + case Token_Comment: + // Preserve insert_semicolon info + break; + case Token_Ident: + case Token_context: + case Token_typeid: + case Token_break: + case Token_continue: + case Token_fallthrough: + case Token_return: + /*fallthrough*/ + case Token_Integer: + case Token_Float: + case Token_Imag: + case Token_Rune: + case Token_String: + case Token_Undef: + /*fallthrough*/ + case Token_Question: + case Token_Pointer: + case Token_CloseParen: + case Token_CloseBracket: + case Token_CloseBrace: + /*fallthrough*/ + case Token_Increment: + case Token_Decrement: + /*fallthrough*/ + t->insert_semicolon = true; + break; + default: + t->insert_semicolon = false; + break; + } + } return; } |