diff options
| author | gingerBill <gingerBill@users.noreply.github.com> | 2025-08-06 16:09:18 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-08-06 16:09:18 +0100 |
| commit | 09a1e170bc92a0ea48a8ee67599c2936e924fe4d (patch) | |
| tree | 92b44b34a1f2f0c4a8c96a49ab61bb5177432ed7 /src/string.cpp | |
| parent | ec7509430369eb5d57a081507792dc03b1c05bab (diff) | |
| parent | af3184adc96cef59fff986ea6400caa6dbdb56ae (diff) | |
Merge pull request #5530 from odin-lang/bill/utf16-strings
UTF-16 string types: `string16` & `cstring16`
Diffstat (limited to 'src/string.cpp')
| -rw-r--r-- | src/string.cpp | 181 |
1 files changed, 162 insertions, 19 deletions
diff --git a/src/string.cpp b/src/string.cpp index ae8d066b1..2087a5fee 100644 --- a/src/string.cpp +++ b/src/string.cpp @@ -26,15 +26,14 @@ struct String_Iterator { // NOTE(bill): String16 is only used for Windows due to its file directories struct String16 { - wchar_t *text; - isize len; - wchar_t const &operator[](isize i) const { + u16 * text; + isize len; + u16 const &operator[](isize i) const { GB_ASSERT_MSG(0 <= i && i < len, "[%td]", i); return text[i]; } }; - gb_internal gb_inline String make_string(u8 const *text, isize len) { String s; s.text = cast(u8 *)text; @@ -45,19 +44,19 @@ gb_internal gb_inline String make_string(u8 const *text, isize len) { return s; } - -gb_internal gb_inline String16 make_string16(wchar_t const *text, isize len) { +gb_internal gb_inline String16 make_string16(u16 const *text, isize len) { String16 s; - s.text = cast(wchar_t *)text; + s.text = cast(u16 *)text; s.len = len; return s; } -gb_internal isize string16_len(wchar_t const *s) { + +gb_internal isize string16_len(u16 const *s) { if (s == nullptr) { return 0; } - wchar_t const *p = s; + u16 const *p = s; while (*p) { p++; } @@ -69,7 +68,7 @@ gb_internal gb_inline String make_string_c(char const *text) { return make_string(cast(u8 *)cast(void *)text, gb_strlen(text)); } -gb_internal gb_inline String16 make_string16_c(wchar_t const *text) { +gb_internal gb_inline String16 make_string16_c(u16 const *text) { return make_string16(text, string16_len(text)); } @@ -80,6 +79,13 @@ gb_internal String substring(String const &s, isize lo, isize hi) { return make_string(s.text+lo, hi-lo); } +gb_internal String16 substring(String16 const &s, isize lo, isize hi) { + isize max = s.len; + GB_ASSERT_MSG(lo <= hi && hi <= max, "%td..%td..%td", lo, hi, max); + + return make_string16(s.text+lo, hi-lo); +} + gb_internal char *alloc_cstring(gbAllocator a, String s) { char *c_str = gb_alloc_array(a, char, s.len+1); @@ -145,6 +151,27 @@ gb_internal int string_compare(String const &a, String const &b) { return res; } + +gb_internal int string16_compare(String16 const &a, String16 const &b) { + if (a.text == b.text) { + return cast(int)(a.len - b.len); + } + if (a.text == nullptr) { + return -1; + } + if (b.text == nullptr) { + return +1; + } + + uintptr n = gb_min(a.len, b.len); + int res = memcmp(a.text, b.text, n*gb_size_of(u16)); + if (res == 0) { + res = cast(int)(a.len - b.len); + } + return res; +} + + gb_internal isize string_index_byte(String const &s, u8 x) { for (isize i = 0; i < s.len; i++) { if (s.text[i] == x) { @@ -182,6 +209,26 @@ template <isize N> gb_internal bool operator >= (String const &a, char const (&b template <> bool operator == (String const &a, char const (&b)[1]) { return a.len == 0; } template <> bool operator != (String const &a, char const (&b)[1]) { return a.len != 0; } + +gb_internal gb_inline bool str_eq(String16 const &a, String16 const &b) { + if (a.len != b.len) return false; + if (a.len == 0) return true; + return memcmp(a.text, b.text, a.len) == 0; +} +gb_internal gb_inline bool str_ne(String16 const &a, String16 const &b) { return !str_eq(a, b); } +gb_internal gb_inline bool str_lt(String16 const &a, String16 const &b) { return string16_compare(a, b) < 0; } +gb_internal gb_inline bool str_gt(String16 const &a, String16 const &b) { return string16_compare(a, b) > 0; } +gb_internal gb_inline bool str_le(String16 const &a, String16 const &b) { return string16_compare(a, b) <= 0; } +gb_internal gb_inline bool str_ge(String16 const &a, String16 const &b) { return string16_compare(a, b) >= 0; } + +gb_internal gb_inline bool operator == (String16 const &a, String16 const &b) { return str_eq(a, b); } +gb_internal gb_inline bool operator != (String16 const &a, String16 const &b) { return str_ne(a, b); } +gb_internal gb_inline bool operator < (String16 const &a, String16 const &b) { return str_lt(a, b); } +gb_internal gb_inline bool operator > (String16 const &a, String16 const &b) { return str_gt(a, b); } +gb_internal gb_inline bool operator <= (String16 const &a, String16 const &b) { return str_le(a, b); } +gb_internal gb_inline bool operator >= (String16 const &a, String16 const &b) { return str_ge(a, b); } + + gb_internal gb_inline bool string_starts_with(String const &s, String const &prefix) { if (prefix.len > s.len) { return false; @@ -611,10 +658,9 @@ gb_internal String normalize_path(gbAllocator a, String const &path, String cons -// TODO(bill): Make this non-windows specific gb_internal String16 string_to_string16(gbAllocator a, String s) { int len, len1; - wchar_t *text; + u16 *text; if (s.len < 1) { return make_string16(nullptr, 0); @@ -625,15 +671,14 @@ gb_internal String16 string_to_string16(gbAllocator a, String s) { return make_string16(nullptr, 0); } - text = gb_alloc_array(a, wchar_t, len+1); + text = gb_alloc_array(a, u16, len+1); - len1 = convert_multibyte_to_widechar(cast(char *)s.text, cast(int)s.len, text, cast(int)len); + len1 = convert_multibyte_to_widechar(cast(char *)s.text, cast(int)s.len, cast(wchar_t *)text, cast(int)len); if (len1 == 0) { gb_free(a, text); return make_string16(nullptr, 0); } text[len] = 0; - return make_string16(text, len); } @@ -646,7 +691,7 @@ gb_internal String string16_to_string(gbAllocator a, String16 s) { return make_string(nullptr, 0); } - len = convert_widechar_to_multibyte(s.text, cast(int)s.len, nullptr, 0); + len = convert_widechar_to_multibyte(cast(wchar_t *)s.text, cast(int)s.len, nullptr, 0); if (len == 0) { return make_string(nullptr, 0); } @@ -654,7 +699,7 @@ gb_internal String string16_to_string(gbAllocator a, String16 s) { text = gb_alloc_array(a, u8, len+1); - len1 = convert_widechar_to_multibyte(s.text, cast(int)s.len, cast(char *)text, cast(int)len); + len1 = convert_widechar_to_multibyte(cast(wchar_t *)s.text, cast(int)s.len, cast(char *)text, cast(int)len); if (len1 == 0) { gb_free(a, text); return make_string(nullptr, 0); @@ -674,9 +719,9 @@ gb_internal String temporary_directory(gbAllocator allocator) { return String{0}; } DWORD len = gb_max(MAX_PATH, n); - wchar_t *b = gb_alloc_array(heap_allocator(), wchar_t, len+1); + u16 *b = gb_alloc_array(heap_allocator(), u16, len+1); defer (gb_free(heap_allocator(), b)); - n = GetTempPathW(len, b); + n = GetTempPathW(len, cast(wchar_t *)b); if (n == 3 && b[1] == ':' && b[2] == '\\') { } else if (n > 0 && b[n-1] == '\\') { @@ -791,6 +836,104 @@ gb_internal String quote_to_ascii(gbAllocator a, String str, u8 quote='"') { return res; } +gb_internal Rune decode_surrogate_pair(u16 r1, u16 r2) { + static Rune const _surr1 = 0xd800; + static Rune const _surr2 = 0xdc00; + static Rune const _surr3 = 0xe000; + static Rune const _surr_self = 0x10000; + + if (_surr1 <= r1 && r1 < _surr2 && _surr2 <= r2 && r2 < _surr3) { + return (((r1-_surr1)<<10) | (r2 - _surr2)) + _surr_self; + } + return GB_RUNE_INVALID; +} + +gb_internal String quote_to_ascii(gbAllocator a, String16 str, u8 quote='"') { + static Rune const _surr1 = 0xd800; + static Rune const _surr2 = 0xdc00; + static Rune const _surr3 = 0xe000; + static Rune const _surr_self = 0x10000; + + u16 *s = cast(u16 *)str.text; + isize n = str.len; + auto buf = array_make<u8>(a, 0, n*2); + array_add(&buf, quote); + for (isize width = 0; n > 0; s += width, n -= width) { + Rune r = cast(Rune)s[0]; + width = 1; + if (r < _surr1 || _surr3 <= r) { + r = cast(Rune)r; + } else if (_surr1 <= r && r < _surr2) { + if (n>1) { + r = decode_surrogate_pair(s[0], s[1]); + if (r != GB_RUNE_INVALID) { + width = 2; + } + } else { + r = GB_RUNE_INVALID; + } + } + if (width == 1 && r == GB_RUNE_INVALID) { + array_add(&buf, cast(u8)'\\'); + array_add(&buf, cast(u8)'x'); + array_add(&buf, cast(u8)lower_hex[s[0]>>4]); + array_add(&buf, cast(u8)lower_hex[s[0]&0xf]); + continue; + } + + if (r == quote || r == '\\') { + array_add(&buf, cast(u8)'\\'); + array_add(&buf, u8(r)); + continue; + } + if (r < 0x80 && is_printable(r)) { + array_add(&buf, u8(r)); + continue; + } + switch (r) { + case '\a': + case '\b': + case '\f': + case '\n': + case '\r': + case '\t': + case '\v': + default: + if (r < ' ') { + u8 b = cast(u8)r; + array_add(&buf, cast(u8)'\\'); + array_add(&buf, cast(u8)'x'); + array_add(&buf, cast(u8)lower_hex[b>>4]); + array_add(&buf, cast(u8)lower_hex[b&0xf]); + } + if (r > GB_RUNE_MAX) { + r = 0XFFFD; + } + if (r < 0x10000) { + array_add(&buf, cast(u8)'\\'); + array_add(&buf, cast(u8)'u'); + for (isize i = 12; i >= 0; i -= 4) { + array_add(&buf, cast(u8)lower_hex[(r>>i)&0xf]); + } + } else { + array_add(&buf, cast(u8)'\\'); + array_add(&buf, cast(u8)'U'); + for (isize i = 28; i >= 0; i -= 4) { + array_add(&buf, cast(u8)lower_hex[(r>>i)&0xf]); + } + } + } + } + + + + array_add(&buf, quote); + String res = {}; + res.text = buf.data; + res.len = buf.count; + return res; +} + |