aboutsummaryrefslogtreecommitdiff
path: root/src/string.cpp
diff options
context:
space:
mode:
authorgingerBill <gingerBill@users.noreply.github.com>2025-08-06 16:09:18 +0100
committerGitHub <noreply@github.com>2025-08-06 16:09:18 +0100
commit09a1e170bc92a0ea48a8ee67599c2936e924fe4d (patch)
tree92b44b34a1f2f0c4a8c96a49ab61bb5177432ed7 /src/string.cpp
parentec7509430369eb5d57a081507792dc03b1c05bab (diff)
parentaf3184adc96cef59fff986ea6400caa6dbdb56ae (diff)
Merge pull request #5530 from odin-lang/bill/utf16-strings
UTF-16 string types: `string16` & `cstring16`
Diffstat (limited to 'src/string.cpp')
-rw-r--r--src/string.cpp181
1 files changed, 162 insertions, 19 deletions
diff --git a/src/string.cpp b/src/string.cpp
index ae8d066b1..2087a5fee 100644
--- a/src/string.cpp
+++ b/src/string.cpp
@@ -26,15 +26,14 @@ struct String_Iterator {
// NOTE(bill): String16 is only used for Windows due to its file directories
struct String16 {
- wchar_t *text;
- isize len;
- wchar_t const &operator[](isize i) const {
+ u16 * text;
+ isize len;
+ u16 const &operator[](isize i) const {
GB_ASSERT_MSG(0 <= i && i < len, "[%td]", i);
return text[i];
}
};
-
gb_internal gb_inline String make_string(u8 const *text, isize len) {
String s;
s.text = cast(u8 *)text;
@@ -45,19 +44,19 @@ gb_internal gb_inline String make_string(u8 const *text, isize len) {
return s;
}
-
-gb_internal gb_inline String16 make_string16(wchar_t const *text, isize len) {
+gb_internal gb_inline String16 make_string16(u16 const *text, isize len) {
String16 s;
- s.text = cast(wchar_t *)text;
+ s.text = cast(u16 *)text;
s.len = len;
return s;
}
-gb_internal isize string16_len(wchar_t const *s) {
+
+gb_internal isize string16_len(u16 const *s) {
if (s == nullptr) {
return 0;
}
- wchar_t const *p = s;
+ u16 const *p = s;
while (*p) {
p++;
}
@@ -69,7 +68,7 @@ gb_internal gb_inline String make_string_c(char const *text) {
return make_string(cast(u8 *)cast(void *)text, gb_strlen(text));
}
-gb_internal gb_inline String16 make_string16_c(wchar_t const *text) {
+gb_internal gb_inline String16 make_string16_c(u16 const *text) {
return make_string16(text, string16_len(text));
}
@@ -80,6 +79,13 @@ gb_internal String substring(String const &s, isize lo, isize hi) {
return make_string(s.text+lo, hi-lo);
}
+gb_internal String16 substring(String16 const &s, isize lo, isize hi) {
+ isize max = s.len;
+ GB_ASSERT_MSG(lo <= hi && hi <= max, "%td..%td..%td", lo, hi, max);
+
+ return make_string16(s.text+lo, hi-lo);
+}
+
gb_internal char *alloc_cstring(gbAllocator a, String s) {
char *c_str = gb_alloc_array(a, char, s.len+1);
@@ -145,6 +151,27 @@ gb_internal int string_compare(String const &a, String const &b) {
return res;
}
+
+gb_internal int string16_compare(String16 const &a, String16 const &b) {
+ if (a.text == b.text) {
+ return cast(int)(a.len - b.len);
+ }
+ if (a.text == nullptr) {
+ return -1;
+ }
+ if (b.text == nullptr) {
+ return +1;
+ }
+
+ uintptr n = gb_min(a.len, b.len);
+ int res = memcmp(a.text, b.text, n*gb_size_of(u16));
+ if (res == 0) {
+ res = cast(int)(a.len - b.len);
+ }
+ return res;
+}
+
+
gb_internal isize string_index_byte(String const &s, u8 x) {
for (isize i = 0; i < s.len; i++) {
if (s.text[i] == x) {
@@ -182,6 +209,26 @@ template <isize N> gb_internal bool operator >= (String const &a, char const (&b
template <> bool operator == (String const &a, char const (&b)[1]) { return a.len == 0; }
template <> bool operator != (String const &a, char const (&b)[1]) { return a.len != 0; }
+
+gb_internal gb_inline bool str_eq(String16 const &a, String16 const &b) {
+ if (a.len != b.len) return false;
+ if (a.len == 0) return true;
+ return memcmp(a.text, b.text, a.len) == 0;
+}
+gb_internal gb_inline bool str_ne(String16 const &a, String16 const &b) { return !str_eq(a, b); }
+gb_internal gb_inline bool str_lt(String16 const &a, String16 const &b) { return string16_compare(a, b) < 0; }
+gb_internal gb_inline bool str_gt(String16 const &a, String16 const &b) { return string16_compare(a, b) > 0; }
+gb_internal gb_inline bool str_le(String16 const &a, String16 const &b) { return string16_compare(a, b) <= 0; }
+gb_internal gb_inline bool str_ge(String16 const &a, String16 const &b) { return string16_compare(a, b) >= 0; }
+
+gb_internal gb_inline bool operator == (String16 const &a, String16 const &b) { return str_eq(a, b); }
+gb_internal gb_inline bool operator != (String16 const &a, String16 const &b) { return str_ne(a, b); }
+gb_internal gb_inline bool operator < (String16 const &a, String16 const &b) { return str_lt(a, b); }
+gb_internal gb_inline bool operator > (String16 const &a, String16 const &b) { return str_gt(a, b); }
+gb_internal gb_inline bool operator <= (String16 const &a, String16 const &b) { return str_le(a, b); }
+gb_internal gb_inline bool operator >= (String16 const &a, String16 const &b) { return str_ge(a, b); }
+
+
gb_internal gb_inline bool string_starts_with(String const &s, String const &prefix) {
if (prefix.len > s.len) {
return false;
@@ -611,10 +658,9 @@ gb_internal String normalize_path(gbAllocator a, String const &path, String cons
-// TODO(bill): Make this non-windows specific
gb_internal String16 string_to_string16(gbAllocator a, String s) {
int len, len1;
- wchar_t *text;
+ u16 *text;
if (s.len < 1) {
return make_string16(nullptr, 0);
@@ -625,15 +671,14 @@ gb_internal String16 string_to_string16(gbAllocator a, String s) {
return make_string16(nullptr, 0);
}
- text = gb_alloc_array(a, wchar_t, len+1);
+ text = gb_alloc_array(a, u16, len+1);
- len1 = convert_multibyte_to_widechar(cast(char *)s.text, cast(int)s.len, text, cast(int)len);
+ len1 = convert_multibyte_to_widechar(cast(char *)s.text, cast(int)s.len, cast(wchar_t *)text, cast(int)len);
if (len1 == 0) {
gb_free(a, text);
return make_string16(nullptr, 0);
}
text[len] = 0;
-
return make_string16(text, len);
}
@@ -646,7 +691,7 @@ gb_internal String string16_to_string(gbAllocator a, String16 s) {
return make_string(nullptr, 0);
}
- len = convert_widechar_to_multibyte(s.text, cast(int)s.len, nullptr, 0);
+ len = convert_widechar_to_multibyte(cast(wchar_t *)s.text, cast(int)s.len, nullptr, 0);
if (len == 0) {
return make_string(nullptr, 0);
}
@@ -654,7 +699,7 @@ gb_internal String string16_to_string(gbAllocator a, String16 s) {
text = gb_alloc_array(a, u8, len+1);
- len1 = convert_widechar_to_multibyte(s.text, cast(int)s.len, cast(char *)text, cast(int)len);
+ len1 = convert_widechar_to_multibyte(cast(wchar_t *)s.text, cast(int)s.len, cast(char *)text, cast(int)len);
if (len1 == 0) {
gb_free(a, text);
return make_string(nullptr, 0);
@@ -674,9 +719,9 @@ gb_internal String temporary_directory(gbAllocator allocator) {
return String{0};
}
DWORD len = gb_max(MAX_PATH, n);
- wchar_t *b = gb_alloc_array(heap_allocator(), wchar_t, len+1);
+ u16 *b = gb_alloc_array(heap_allocator(), u16, len+1);
defer (gb_free(heap_allocator(), b));
- n = GetTempPathW(len, b);
+ n = GetTempPathW(len, cast(wchar_t *)b);
if (n == 3 && b[1] == ':' && b[2] == '\\') {
} else if (n > 0 && b[n-1] == '\\') {
@@ -791,6 +836,104 @@ gb_internal String quote_to_ascii(gbAllocator a, String str, u8 quote='"') {
return res;
}
+gb_internal Rune decode_surrogate_pair(u16 r1, u16 r2) {
+ static Rune const _surr1 = 0xd800;
+ static Rune const _surr2 = 0xdc00;
+ static Rune const _surr3 = 0xe000;
+ static Rune const _surr_self = 0x10000;
+
+ if (_surr1 <= r1 && r1 < _surr2 && _surr2 <= r2 && r2 < _surr3) {
+ return (((r1-_surr1)<<10) | (r2 - _surr2)) + _surr_self;
+ }
+ return GB_RUNE_INVALID;
+}
+
+gb_internal String quote_to_ascii(gbAllocator a, String16 str, u8 quote='"') {
+ static Rune const _surr1 = 0xd800;
+ static Rune const _surr2 = 0xdc00;
+ static Rune const _surr3 = 0xe000;
+ static Rune const _surr_self = 0x10000;
+
+ u16 *s = cast(u16 *)str.text;
+ isize n = str.len;
+ auto buf = array_make<u8>(a, 0, n*2);
+ array_add(&buf, quote);
+ for (isize width = 0; n > 0; s += width, n -= width) {
+ Rune r = cast(Rune)s[0];
+ width = 1;
+ if (r < _surr1 || _surr3 <= r) {
+ r = cast(Rune)r;
+ } else if (_surr1 <= r && r < _surr2) {
+ if (n>1) {
+ r = decode_surrogate_pair(s[0], s[1]);
+ if (r != GB_RUNE_INVALID) {
+ width = 2;
+ }
+ } else {
+ r = GB_RUNE_INVALID;
+ }
+ }
+ if (width == 1 && r == GB_RUNE_INVALID) {
+ array_add(&buf, cast(u8)'\\');
+ array_add(&buf, cast(u8)'x');
+ array_add(&buf, cast(u8)lower_hex[s[0]>>4]);
+ array_add(&buf, cast(u8)lower_hex[s[0]&0xf]);
+ continue;
+ }
+
+ if (r == quote || r == '\\') {
+ array_add(&buf, cast(u8)'\\');
+ array_add(&buf, u8(r));
+ continue;
+ }
+ if (r < 0x80 && is_printable(r)) {
+ array_add(&buf, u8(r));
+ continue;
+ }
+ switch (r) {
+ case '\a':
+ case '\b':
+ case '\f':
+ case '\n':
+ case '\r':
+ case '\t':
+ case '\v':
+ default:
+ if (r < ' ') {
+ u8 b = cast(u8)r;
+ array_add(&buf, cast(u8)'\\');
+ array_add(&buf, cast(u8)'x');
+ array_add(&buf, cast(u8)lower_hex[b>>4]);
+ array_add(&buf, cast(u8)lower_hex[b&0xf]);
+ }
+ if (r > GB_RUNE_MAX) {
+ r = 0XFFFD;
+ }
+ if (r < 0x10000) {
+ array_add(&buf, cast(u8)'\\');
+ array_add(&buf, cast(u8)'u');
+ for (isize i = 12; i >= 0; i -= 4) {
+ array_add(&buf, cast(u8)lower_hex[(r>>i)&0xf]);
+ }
+ } else {
+ array_add(&buf, cast(u8)'\\');
+ array_add(&buf, cast(u8)'U');
+ for (isize i = 28; i >= 0; i -= 4) {
+ array_add(&buf, cast(u8)lower_hex[(r>>i)&0xf]);
+ }
+ }
+ }
+ }
+
+
+
+ array_add(&buf, quote);
+ String res = {};
+ res.text = buf.data;
+ res.len = buf.count;
+ return res;
+}
+