diff options
| author | gingerBill <bill@gingerbill.org> | 2020-05-27 12:54:11 +0100 |
|---|---|---|
| committer | gingerBill <bill@gingerbill.org> | 2020-05-27 12:54:11 +0100 |
| commit | 876820789e9dedaa6198c4cd145702485e3bd21c (patch) | |
| tree | bcfb69df5b3eefdad8587d609e4139bc35b36a3f /src/unicode.cpp | |
| parent | 4e21a4d46a90c56edd45f5d5c46b375742738a17 (diff) | |
Add `rune_is_letter_or_digit` for tokenizer
Diffstat (limited to 'src/unicode.cpp')
| -rw-r--r-- | src/unicode.cpp | 23 |
1 files changed, 23 insertions, 0 deletions
```diff
diff --git a/src/unicode.cpp b/src/unicode.cpp
index b988155f7..83aa8deef 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -32,6 +32,29 @@ bool rune_is_digit(Rune r) {
 	return utf8proc_category(r) == UTF8PROC_CATEGORY_ND;
 }
 
+bool rune_is_letter_or_digit(Rune r) {
+	if (r < 0x80) {
+		if (r == '_') {
+			return true;
+		}
+		if (((cast(u32)r | 0x20) - 0x61) < 26) {
+			return true;
+		}
+		return (cast(u32)r - '0') < 10;
+	}
+	switch (utf8proc_category(r)) {
+	case UTF8PROC_CATEGORY_LU:
+	case UTF8PROC_CATEGORY_LL:
+	case UTF8PROC_CATEGORY_LT:
+	case UTF8PROC_CATEGORY_LM:
+	case UTF8PROC_CATEGORY_LO:
+		return true;
+	case UTF8PROC_CATEGORY_ND:
+		return true;
+	}
+	return false;
+}
+
 bool rune_is_whitespace(Rune r) {
 	switch (r) {
 	case ' ':
```