From 2561427dd396a69cd49eb02c0814c4e8e8b3a08f Mon Sep 17 00:00:00 2001 From: gingerBill Date: Sat, 2 Aug 2025 11:00:15 +0100 Subject: Add `string16` and `cstring16` (UTF-16 based strings) --- src/string.cpp | 172 +++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 155 insertions(+), 17 deletions(-) (limited to 'src/string.cpp') diff --git a/src/string.cpp b/src/string.cpp index ae8d066b1..8405938f4 100644 --- a/src/string.cpp +++ b/src/string.cpp @@ -26,15 +26,14 @@ struct String_Iterator { // NOTE(bill): String16 is only used for Windows due to its file directories struct String16 { - wchar_t *text; - isize len; - wchar_t const &operator[](isize i) const { + u16 * text; + isize len; + u16 const &operator[](isize i) const { GB_ASSERT_MSG(0 <= i && i < len, "[%td]", i); return text[i]; } }; - gb_internal gb_inline String make_string(u8 const *text, isize len) { String s; s.text = cast(u8 *)text; @@ -45,19 +44,19 @@ gb_internal gb_inline String make_string(u8 const *text, isize len) { return s; } - -gb_internal gb_inline String16 make_string16(wchar_t const *text, isize len) { +gb_internal gb_inline String16 make_string16(u16 const *text, isize len) { String16 s; - s.text = cast(wchar_t *)text; + s.text = cast(u16 *)text; s.len = len; return s; } -gb_internal isize string16_len(wchar_t const *s) { + +gb_internal isize string16_len(u16 const *s) { if (s == nullptr) { return 0; } - wchar_t const *p = s; + u16 const *p = s; while (*p) { p++; } @@ -69,7 +68,7 @@ gb_internal gb_inline String make_string_c(char const *text) { return make_string(cast(u8 *)cast(void *)text, gb_strlen(text)); } -gb_internal gb_inline String16 make_string16_c(wchar_t const *text) { +gb_internal gb_inline String16 make_string16_c(u16 const *text) { return make_string16(text, string16_len(text)); } @@ -145,6 +144,27 @@ gb_internal int string_compare(String const &a, String const &b) { return res; } + +gb_internal int string16_compare(String16 const &a, String16 const &b) { + if (a.text == b.text) { + return cast(int)(a.len - b.len); + } + if (a.text == nullptr) { + return -1; + } + if (b.text == nullptr) { + return +1; + } + + uintptr n = gb_min(a.len, b.len); + int res = memcmp(a.text, b.text, n*gb_size_of(u16)); + if (res == 0) { + res = cast(int)(a.len - b.len); + } + return res; +} + + gb_internal isize string_index_byte(String const &s, u8 x) { for (isize i = 0; i < s.len; i++) { if (s.text[i] == x) { @@ -182,6 +202,26 @@ template gb_internal bool operator >= (String const &a, char const (&b template <> bool operator == (String const &a, char const (&b)[1]) { return a.len == 0; } template <> bool operator != (String const &a, char const (&b)[1]) { return a.len != 0; } + +gb_internal gb_inline bool str_eq(String16 const &a, String16 const &b) { + if (a.len != b.len) return false; + if (a.len == 0) return true; + return memcmp(a.text, b.text, a.len) == 0; +} +gb_internal gb_inline bool str_ne(String16 const &a, String16 const &b) { return !str_eq(a, b); } +gb_internal gb_inline bool str_lt(String16 const &a, String16 const &b) { return string16_compare(a, b) < 0; } +gb_internal gb_inline bool str_gt(String16 const &a, String16 const &b) { return string16_compare(a, b) > 0; } +gb_internal gb_inline bool str_le(String16 const &a, String16 const &b) { return string16_compare(a, b) <= 0; } +gb_internal gb_inline bool str_ge(String16 const &a, String16 const &b) { return string16_compare(a, b) >= 0; } + +gb_internal gb_inline bool operator == (String16 const &a, String16 const &b) { return str_eq(a, b); } +gb_internal gb_inline bool operator != (String16 const &a, String16 const &b) { return str_ne(a, b); } +gb_internal gb_inline bool operator < (String16 const &a, String16 const &b) { return str_lt(a, b); } +gb_internal gb_inline bool operator > (String16 const &a, String16 const &b) { return str_gt(a, b); } +gb_internal gb_inline bool operator <= (String16 const &a, String16 const &b) { return str_le(a, b); } +gb_internal gb_inline bool operator >= (String16 const &a, String16 const &b) { return str_ge(a, b); } + + gb_internal gb_inline bool string_starts_with(String const &s, String const &prefix) { if (prefix.len > s.len) { return false; @@ -614,7 +654,7 @@ gb_internal String normalize_path(gbAllocator a, String const &path, String cons // TODO(bill): Make this non-windows specific gb_internal String16 string_to_string16(gbAllocator a, String s) { int len, len1; - wchar_t *text; + u16 *text; if (s.len < 1) { return make_string16(nullptr, 0); @@ -625,9 +665,9 @@ gb_internal String16 string_to_string16(gbAllocator a, String s) { return make_string16(nullptr, 0); } - text = gb_alloc_array(a, wchar_t, len+1); + text = gb_alloc_array(a, u16, len+1); - len1 = convert_multibyte_to_widechar(cast(char *)s.text, cast(int)s.len, text, cast(int)len); + len1 = convert_multibyte_to_widechar(cast(char *)s.text, cast(int)s.len, cast(wchar_t *)text, cast(int)len); if (len1 == 0) { gb_free(a, text); return make_string16(nullptr, 0); @@ -646,7 +686,7 @@ gb_internal String string16_to_string(gbAllocator a, String16 s) { return make_string(nullptr, 0); } - len = convert_widechar_to_multibyte(s.text, cast(int)s.len, nullptr, 0); + len = convert_widechar_to_multibyte(cast(wchar_t *)s.text, cast(int)s.len, nullptr, 0); if (len == 0) { return make_string(nullptr, 0); } @@ -654,7 +694,7 @@ gb_internal String string16_to_string(gbAllocator a, String16 s) { text = gb_alloc_array(a, u8, len+1); - len1 = convert_widechar_to_multibyte(s.text, cast(int)s.len, cast(char *)text, cast(int)len); + len1 = convert_widechar_to_multibyte(cast(wchar_t *)s.text, cast(int)s.len, cast(char *)text, cast(int)len); if (len1 == 0) { gb_free(a, text); return make_string(nullptr, 0); @@ -674,9 +714,9 @@ gb_internal String temporary_directory(gbAllocator allocator) { return String{0}; } DWORD len = gb_max(MAX_PATH, n); - wchar_t *b = gb_alloc_array(heap_allocator(), wchar_t, len+1); + u16 *b = gb_alloc_array(heap_allocator(), u16, len+1); defer (gb_free(heap_allocator(), b)); - n = GetTempPathW(len, b); + n = GetTempPathW(len, cast(wchar_t *)b); if (n == 3 && b[1] == ':' && b[2] == '\\') { } else if (n > 0 && b[n-1] == '\\') { @@ -791,6 +831,104 @@ gb_internal String quote_to_ascii(gbAllocator a, String str, u8 quote='"') { return res; } +gb_internal Rune decode_surrogate_pair(u16 r1, u16 r2) { + static Rune const _surr1 = 0xd800; + static Rune const _surr2 = 0xdc00; + static Rune const _surr3 = 0xe000; + static Rune const _surr_self = 0x10000; + + if (_surr1 <= r1 && r1 < _surr2 && _surr2 <= r2 && r2 < _surr3) { + return (((r1-_surr1)<<10) | (r2 - _surr2)) + _surr_self; + } + return GB_RUNE_INVALID; +} + +gb_internal String quote_to_ascii(gbAllocator a, String16 str, u8 quote='"') { + static Rune const _surr1 = 0xd800; + static Rune const _surr2 = 0xdc00; + static Rune const _surr3 = 0xe000; + static Rune const _surr_self = 0x10000; + + u16 *s = cast(u16 *)str.text; + isize n = str.len; + auto buf = array_make(a, 0, n*2); + array_add(&buf, quote); + for (isize width = 0; n > 0; s += width, n -= width) { + Rune r = cast(Rune)s[0]; + width = 1; + if (r < _surr1 || _surr3 <= r) { + r = cast(Rune)r; + } else if (_surr1 <= r && r < _surr2) { + if (n>1) { + r = decode_surrogate_pair(s[0], s[1]); + if (r != GB_RUNE_INVALID) { + width = 2; + } + } else { + r = GB_RUNE_INVALID; + } + } + if (width == 1 && r == GB_RUNE_INVALID) { + array_add(&buf, cast(u8)'\\'); + array_add(&buf, cast(u8)'x'); + array_add(&buf, cast(u8)lower_hex[s[0]>>4]); + array_add(&buf, cast(u8)lower_hex[s[0]&0xf]); + continue; + } + + if (r == quote || r == '\\') { + array_add(&buf, cast(u8)'\\'); + array_add(&buf, u8(r)); + continue; + } + if (r < 0x80 && is_printable(r)) { + array_add(&buf, u8(r)); + continue; + } + switch (r) { + case '\a': + case '\b': + case '\f': + case '\n': + case '\r': + case '\t': + case '\v': + default: + if (r < ' ') { + u8 b = cast(u8)r; + array_add(&buf, cast(u8)'\\'); + array_add(&buf, cast(u8)'x'); + array_add(&buf, cast(u8)lower_hex[b>>4]); + array_add(&buf, cast(u8)lower_hex[b&0xf]); + } + if (r > GB_RUNE_MAX) { + r = 0XFFFD; + } + if (r < 0x10000) { + array_add(&buf, cast(u8)'\\'); + array_add(&buf, cast(u8)'u'); + for (isize i = 12; i >= 0; i -= 4) { + array_add(&buf, cast(u8)lower_hex[(r>>i)&0xf]); + } + } else { + array_add(&buf, cast(u8)'\\'); + array_add(&buf, cast(u8)'U'); + for (isize i = 28; i >= 0; i -= 4) { + array_add(&buf, cast(u8)lower_hex[(r>>i)&0xf]); + } + } + } + } + + + + array_add(&buf, quote); + String res = {}; + res.text = buf.data; + res.len = buf.count; + return res; +} + -- cgit v1.2.3 From ae02d3d02d2eb5132fa7c6573ed7db20d7e18f3e Mon Sep 17 00:00:00 2001 From: gingerBill Date: Sat, 2 Aug 2025 11:55:16 +0100 Subject: Begin supporting `string16` across the core library --- base/intrinsics/intrinsics.odin | 1 + base/runtime/print.odin | 6 +++ core/encoding/cbor/tags.odin | 2 +- core/encoding/cbor/unmarshal.odin | 2 + core/encoding/json/marshal.odin | 8 ++-- core/encoding/json/unmarshal.odin | 4 +- core/flags/internal_rtti.odin | 2 + core/fmt/fmt.odin | 16 ++++---- core/io/io.odin | 4 +- core/reflect/types.odin | 8 ++-- src/check_builtin.cpp | 2 + src/check_expr.cpp | 42 +++++++++++++++++---- src/checker_builtin_procs.hpp | 2 + src/llvm_backend.cpp | 6 +++ src/llvm_backend_const.cpp | 77 +++++++++++++++++++++++++++++++++++++-- src/llvm_backend_debug.cpp | 14 +++++++ src/llvm_backend_expr.cpp | 3 +- src/llvm_backend_general.cpp | 37 +++++++++++++++++++ src/llvm_backend_utility.cpp | 19 +++++++++- src/string.cpp | 7 ++++ 20 files changed, 230 insertions(+), 32 deletions(-) (limited to 'src/string.cpp') diff --git a/base/intrinsics/intrinsics.odin b/base/intrinsics/intrinsics.odin index be75739fe..d45d24f48 100644 --- a/base/intrinsics/intrinsics.odin +++ b/base/intrinsics/intrinsics.odin @@ -141,6 +141,7 @@ type_is_quaternion :: proc($T: typeid) -> bool --- type_is_string :: proc($T: typeid) -> bool --- type_is_typeid :: proc($T: typeid) -> bool --- type_is_any :: proc($T: typeid) -> bool --- +type_is_string16 :: proc($T: typeid) -> bool --- type_is_endian_platform :: proc($T: typeid) -> bool --- type_is_endian_little :: proc($T: typeid) -> bool --- diff --git a/base/runtime/print.odin b/base/runtime/print.odin index 145f002d1..85ed49445 100644 --- a/base/runtime/print.odin +++ b/base/runtime/print.odin @@ -293,7 +293,13 @@ print_type :: #force_no_inline proc "contextless" (ti: ^Type_Info) { print_string("quaternion") print_u64(u64(8*ti.size)) case Type_Info_String: + if info.is_cstring { + print_byte('c') + } print_string("string") + if info.is_utf16 { + print_string("16") + } case Type_Info_Boolean: switch ti.id { case bool: print_string("bool") diff --git a/core/encoding/cbor/tags.odin b/core/encoding/cbor/tags.odin index 17420af46..e0e69cbf5 100644 --- a/core/encoding/cbor/tags.odin +++ b/core/encoding/cbor/tags.odin @@ -298,7 +298,7 @@ tag_base64_unmarshal :: proc(_: ^Tag_Implementation, d: Decoder, _: Tag_Number, #partial switch t in ti.variant { case reflect.Type_Info_String: - + assert(!t.is_utf16) if t.is_cstring { length := base64.decoded_len(bytes) builder := strings.builder_make(0, length+1) diff --git a/core/encoding/cbor/unmarshal.odin b/core/encoding/cbor/unmarshal.odin index 365ac5d6f..2840429f5 100644 --- a/core/encoding/cbor/unmarshal.odin +++ b/core/encoding/cbor/unmarshal.odin @@ -335,6 +335,8 @@ _unmarshal_value :: proc(d: Decoder, v: any, hdr: Header, allocator := context.a _unmarshal_bytes :: proc(d: Decoder, v: any, ti: ^reflect.Type_Info, hdr: Header, add: Add, allocator := context.allocator, loc := #caller_location) -> (err: Unmarshal_Error) { #partial switch t in ti.variant { case reflect.Type_Info_String: + assert(!t.is_utf16) + bytes := err_conv(_decode_bytes(d, add, allocator=allocator, loc=loc)) or_return if t.is_cstring { diff --git a/core/encoding/json/marshal.odin b/core/encoding/json/marshal.odin index ebb9a639c..cdb00a354 100644 --- a/core/encoding/json/marshal.odin +++ b/core/encoding/json/marshal.odin @@ -353,10 +353,10 @@ marshal_to_writer :: proc(w: io.Writer, v: any, opt: ^Marshal_Options) -> (err: #partial switch info in ti.variant { case runtime.Type_Info_String: switch x in v { - case string: - return x == "" - case cstring: - return x == nil || x == "" + case string: return x == "" + case cstring: return x == nil || x == "" + case string16: return x == "" + case cstring16: return x == nil || x == "" } case runtime.Type_Info_Any: return v.(any) == nil diff --git a/core/encoding/json/unmarshal.odin b/core/encoding/json/unmarshal.odin index b9ed1476f..51e7e3b81 100644 --- a/core/encoding/json/unmarshal.odin +++ b/core/encoding/json/unmarshal.odin @@ -570,7 +570,9 @@ unmarshal_object :: proc(p: ^Parser, v: any, end_token: Token_Kind) -> (err: Unm key_ptr: rawptr #partial switch tk in t.key.variant { - case runtime.Type_Info_String: + case runtime.Type_Info_String: + assert(!tk.is_utf16) + key_ptr = rawptr(&key) key_cstr: cstring if reflect.is_cstring(t.key) { diff --git a/core/flags/internal_rtti.odin b/core/flags/internal_rtti.odin index 1c559ca55..58224cc87 100644 --- a/core/flags/internal_rtti.odin +++ b/core/flags/internal_rtti.odin @@ -127,6 +127,8 @@ parse_and_set_pointer_by_base_type :: proc(ptr: rawptr, str: string, type_info: } case runtime.Type_Info_String: + assert(!specific_type_info.is_utf16) + if specific_type_info.is_cstring { cstr_ptr := (^cstring)(ptr) if cstr_ptr != nil { diff --git a/core/fmt/fmt.odin b/core/fmt/fmt.odin index 7fe6287d4..9c245de94 100644 --- a/core/fmt/fmt.odin +++ b/core/fmt/fmt.odin @@ -2346,14 +2346,14 @@ fmt_array :: proc(fi: ^Info, data: rawptr, n: int, elem_size: int, elem: ^reflec } switch reflect.type_info_base(elem).id { - case byte: fmt_string(fi, string(([^]byte)(data)[:n]), verb); return - case u16: print_utf16(fi, ([^]u16)(data)[:n]); return - case u16le: print_utf16(fi, ([^]u16le)(data)[:n]); return - case u16be: print_utf16(fi, ([^]u16be)(data)[:n]); return - case u32: print_utf32(fi, ([^]u32)(data)[:n]); return - case u32le: print_utf32(fi, ([^]u32le)(data)[:n]); return - case u32be: print_utf32(fi, ([^]u32be)(data)[:n]); return - case rune: print_utf32(fi, ([^]rune)(data)[:n]); return + case byte: fmt_string(fi, string (([^]byte)(data)[:n]), verb); return + case u16: fmt_string16(fi, string16(([^]u16) (data)[:n]), verb); return + case u16le: print_utf16(fi, ([^]u16le)(data)[:n]); return + case u16be: print_utf16(fi, ([^]u16be)(data)[:n]); return + case u32: print_utf32(fi, ([^]u32)(data)[:n]); return + case u32le: print_utf32(fi, ([^]u32le)(data)[:n]); return + case u32be: print_utf32(fi, ([^]u32be)(data)[:n]); return + case rune: print_utf32(fi, ([^]rune)(data)[:n]); return } } if verb == 'p' { diff --git a/core/io/io.odin b/core/io/io.odin index 5431519bf..c4eb6a073 100644 --- a/core/io/io.odin +++ b/core/io/io.odin @@ -319,7 +319,6 @@ write_string :: proc(s: Writer, str: string, n_written: ^int = nil) -> (n: int, write_string16 :: proc(s: Writer, str: string16, n_written: ^int = nil) -> (n: int, err: Error) { for i := 0; i < len(str); i += 1 { r := rune(utf16.REPLACEMENT_CHAR) - switch c := str[i]; { case c < utf16._surr1, utf16._surr3 <= c: r = rune(c) @@ -329,7 +328,8 @@ write_string16 :: proc(s: Writer, str: string16, n_written: ^int = nil) -> (n: i i += 1 } - w, err := write_rune(s, r, n_written) + w: int + w, err = write_rune(s, r, n_written) n += w if err != nil { return diff --git a/core/reflect/types.odin b/core/reflect/types.odin index 511c5c9bd..2351408cc 100644 --- a/core/reflect/types.odin +++ b/core/reflect/types.odin @@ -511,9 +511,11 @@ write_type_writer :: #force_no_inline proc(w: io.Writer, ti: ^Type_Info, n_writt io.write_i64(w, i64(8*ti.size), 10, &n) or_return case Type_Info_String: if info.is_cstring { - io.write_string(w, "cstring", &n) or_return - } else { - io.write_string(w, "string", &n) or_return + io.write_byte(w, 'c', &n) or_return + } + io.write_string(w, "string", &n) or_return + if info.is_utf16 { + io.write_string(w, "16", &n) or_return } case Type_Info_Boolean: switch ti.id { diff --git a/src/check_builtin.cpp b/src/check_builtin.cpp index d36cf4520..4abace637 100644 --- a/src/check_builtin.cpp +++ b/src/check_builtin.cpp @@ -19,6 +19,7 @@ gb_global BuiltinTypeIsProc *builtin_type_is_procs[BuiltinProc__type_simple_bool is_type_complex, is_type_quaternion, is_type_string, + is_type_string16, is_type_typeid, is_type_any, is_type_endian_platform, @@ -6139,6 +6140,7 @@ gb_internal bool check_builtin_procedure(CheckerContext *c, Operand *operand, As case BuiltinProc_type_is_complex: case BuiltinProc_type_is_quaternion: case BuiltinProc_type_is_string: + case BuiltinProc_type_is_string16: case BuiltinProc_type_is_typeid: case BuiltinProc_type_is_any: case BuiltinProc_type_is_endian_platform: diff --git a/src/check_expr.cpp b/src/check_expr.cpp index 57073e22f..8d2e4d637 100644 --- a/src/check_expr.cpp +++ b/src/check_expr.cpp @@ -2106,6 +2106,9 @@ gb_internal bool check_representable_as_constant(CheckerContext *c, ExactValue i } else if (is_type_boolean(type)) { return in_value.kind == ExactValue_Bool; } else if (is_type_string(type)) { + if (in_value.kind == ExactValue_String16) { + return is_type_string16(type) || is_type_cstring16(type); + } return in_value.kind == ExactValue_String; } else if (is_type_integer(type) || is_type_rune(type)) { if (in_value.kind == ExactValue_Bool) { @@ -2320,6 +2323,9 @@ gb_internal bool check_representable_as_constant(CheckerContext *c, ExactValue i if (in_value.kind == ExactValue_String) { return false; } + if (in_value.kind == ExactValue_String16) { + return false; + } if (out_value) *out_value = in_value; } else if (is_type_bit_set(type)) { if (in_value.kind == ExactValue_Integer) { @@ -4654,6 +4660,13 @@ gb_internal void convert_to_typed(CheckerContext *c, Operand *operand, Type *tar break; } } + } else if (operand->value.kind == ExactValue_String16) { + String16 s = operand->value.value_string16; + if (is_type_u16_array(t)) { + if (s.len == t->Array.count) { + break; + } + } } operand->mode = Addressing_Invalid; convert_untyped_error(c, operand, target_type); @@ -4983,6 +4996,12 @@ gb_internal ExactValue get_constant_field_single(CheckerContext *c, ExactValue v if (success_) *success_ = true; if (finish_) *finish_ = true; return exact_value_u64(val); + } else if (value.kind == ExactValue_String16) { + GB_ASSERT(0 <= index && index < value.value_string.len); + u16 val = value.value_string16[index]; + if (success_) *success_ = true; + if (finish_) *finish_ = true; + return exact_value_u64(val); } if (value.kind != ExactValue_Compound) { if (success_) *success_ = true; @@ -11124,15 +11143,21 @@ gb_internal ExprKind check_slice_expr(CheckerContext *c, Operand *o, Ast *node, o->expr = node; return kind; } - - String s = {}; - if (o->value.kind == ExactValue_String) { - s = o->value.value_string; - } - o->mode = Addressing_Constant; o->type = t; - o->value = exact_value_string(substring(s, cast(isize)indices[0], cast(isize)indices[1])); + + if (o->value.kind == ExactValue_String16) { + String16 s = o->value.value_string16; + + o->value = exact_value_string16(substring(s, cast(isize)indices[0], cast(isize)indices[1])); + } else { + String s = {}; + if (o->value.kind == ExactValue_String) { + s = o->value.value_string; + } + + o->value = exact_value_string(substring(s, cast(isize)indices[0], cast(isize)indices[1])); + } } return kind; } @@ -11221,6 +11246,7 @@ gb_internal ExprKind check_expr_base_internal(CheckerContext *c, Operand *o, Ast Type *t = t_invalid; switch (node->tav.value.kind) { case ExactValue_String: t = t_untyped_string; break; + case ExactValue_String16: t = t_string16; break; // TODO(bill): determine this correctly case ExactValue_Float: t = t_untyped_float; break; case ExactValue_Complex: t = t_untyped_complex; break; case ExactValue_Quaternion: t = t_untyped_quaternion; break; @@ -11657,6 +11683,8 @@ gb_internal bool is_exact_value_zero(ExactValue const &v) { return !v.value_bool; case ExactValue_String: return v.value_string.len == 0; + case ExactValue_String16: + return v.value_string16.len == 0; case ExactValue_Integer: return big_int_is_zero(&v.value_integer); case ExactValue_Float: diff --git a/src/checker_builtin_procs.hpp b/src/checker_builtin_procs.hpp index 8e135ab10..bff887d9e 100644 --- a/src/checker_builtin_procs.hpp +++ b/src/checker_builtin_procs.hpp @@ -250,6 +250,7 @@ BuiltinProc__type_simple_boolean_begin, BuiltinProc_type_is_complex, BuiltinProc_type_is_quaternion, BuiltinProc_type_is_string, + BuiltinProc_type_is_string16, BuiltinProc_type_is_typeid, BuiltinProc_type_is_any, @@ -607,6 +608,7 @@ gb_global BuiltinProc builtin_procs[BuiltinProc_COUNT] = { {STR_LIT("type_is_complex"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, {STR_LIT("type_is_quaternion"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, {STR_LIT("type_is_string"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, + {STR_LIT("type_is_string16"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, {STR_LIT("type_is_typeid"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, {STR_LIT("type_is_any"), 1, false, Expr_Expr, BuiltinProcPkg_intrinsics}, diff --git a/src/llvm_backend.cpp b/src/llvm_backend.cpp index 13a1d8cf3..f37415cc1 100644 --- a/src/llvm_backend.cpp +++ b/src/llvm_backend.cpp @@ -1264,7 +1264,13 @@ String lb_get_objc_type_encoding(Type *t, isize pointer_depth = 0) { case Basic_string: return build_context.metrics.int_size == 4 ? str_lit("{string=*i}") : str_lit("{string=*q}"); + case Basic_string16: + return build_context.metrics.int_size == 4 ? str_lit("{string16=*i}") : str_lit("{string16=*q}"); + case Basic_cstring: return str_lit("*"); + case Basic_cstring16: return str_lit("*"); + + case Basic_any: return str_lit("{any=^v^v}"); // rawptr + ^Type_Info case Basic_typeid: diff --git a/src/llvm_backend_const.cpp b/src/llvm_backend_const.cpp index c3112934e..8c05ed4a2 100644 --- a/src/llvm_backend_const.cpp +++ b/src/llvm_backend_const.cpp @@ -122,6 +122,25 @@ gb_internal lbValue lb_const_ptr_cast(lbModule *m, lbValue value, Type *t) { gb_internal LLVMValueRef llvm_const_string_internal(lbModule *m, Type *t, LLVMValueRef data, LLVMValueRef len) { + GB_ASSERT(!is_type_string16(t)); + if (build_context.metrics.ptr_size < build_context.metrics.int_size) { + LLVMValueRef values[3] = { + data, + LLVMConstNull(lb_type(m, t_i32)), + len, + }; + return llvm_const_named_struct_internal(lb_type(m, t), values, 3); + } else { + LLVMValueRef values[2] = { + data, + len, + }; + return llvm_const_named_struct_internal(lb_type(m, t), values, 2); + } +} + +gb_internal LLVMValueRef llvm_const_string16_internal(lbModule *m, Type *t, LLVMValueRef data, LLVMValueRef len) { + GB_ASSERT(is_type_string16(t)); if (build_context.metrics.ptr_size < build_context.metrics.int_size) { LLVMValueRef values[3] = { data, @@ -238,6 +257,10 @@ gb_internal lbValue lb_const_string(lbModule *m, String const &value) { return lb_const_value(m, t_string, exact_value_string(value)); } +gb_internal lbValue lb_const_string(lbModule *m, String16 const &value) { + return lb_const_value(m, t_string16, exact_value_string16(value)); +} + gb_internal lbValue lb_const_bool(lbModule *m, Type *type, bool value) { lbValue res = {}; @@ -569,7 +592,11 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, lb GB_ASSERT(is_type_slice(type)); res.value = lb_find_or_add_entity_string_byte_slice_with_type(m, value.value_string, original_type).value; return res; - } else { + } else if (value.kind == ExactValue_String16) { + GB_ASSERT(is_type_slice(type)); + GB_PANIC("TODO(bill): UTF-16 String"); + return res; + }else { ast_node(cl, CompoundLit, value.value_compound); isize count = cl->elems.count; @@ -751,15 +778,23 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, lb { bool custom_link_section = cc.link_section.len > 0; - LLVMValueRef ptr = lb_find_or_add_entity_string_ptr(m, value.value_string, custom_link_section); + LLVMValueRef ptr = nullptr; lbValue res = {}; res.type = default_type(original_type); + if (is_type_string16(res.type) || is_type_cstring16(res.type)) { + TEMPORARY_ALLOCATOR_GUARD(); + String16 s16 = string_to_string16(temporary_allocator(), value.value_string); + ptr = lb_find_or_add_entity_string16_ptr(m, s16, custom_link_section); + } else { + ptr = lb_find_or_add_entity_string_ptr(m, value.value_string, custom_link_section); + } + if (custom_link_section) { LLVMSetSection(ptr, alloc_cstring(permanent_allocator(), cc.link_section)); } - if (is_type_cstring(res.type)) { + if (is_type_cstring(res.type) || is_type_cstring16(res.type)) { res.value = ptr; } else { if (value.value_string.len == 0) { @@ -768,12 +803,46 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, lb LLVMValueRef str_len = LLVMConstInt(lb_type(m, t_int), value.value_string.len, true); GB_ASSERT(is_type_string(original_type)); - res.value = llvm_const_string_internal(m, original_type, ptr, str_len); + if (is_type_string16(res.type)) { + res.value = llvm_const_string16_internal(m, original_type, ptr, str_len); + } else { + res.value = llvm_const_string_internal(m, original_type, ptr, str_len); + } + } + + return res; + } + + case ExactValue_String16: + { + GB_ASSERT(is_type_string16(res.type) || is_type_cstring16(res.type)); + + bool custom_link_section = cc.link_section.len > 0; + + LLVMValueRef ptr = lb_find_or_add_entity_string16_ptr(m, value.value_string16, custom_link_section); + lbValue res = {}; + res.type = default_type(original_type); + + if (custom_link_section) { + LLVMSetSection(ptr, alloc_cstring(permanent_allocator(), cc.link_section)); + } + + if (is_type_cstring16(res.type)) { + res.value = ptr; + } else { + if (value.value_string16.len == 0) { + ptr = LLVMConstNull(lb_type(m, t_u8_ptr)); + } + LLVMValueRef str_len = LLVMConstInt(lb_type(m, t_int), value.value_string16.len, true); + GB_ASSERT(is_type_string(original_type)); + + res.value = llvm_const_string16_internal(m, original_type, ptr, str_len); } return res; } + case ExactValue_Integer: if (is_type_pointer(type) || is_type_multi_pointer(type) || is_type_proc(type)) { LLVMTypeRef t = lb_type(m, original_type); diff --git a/src/llvm_backend_debug.cpp b/src/llvm_backend_debug.cpp index 024c5564e..182920fc7 100644 --- a/src/llvm_backend_debug.cpp +++ b/src/llvm_backend_debug.cpp @@ -802,6 +802,20 @@ gb_internal LLVMMetadataRef lb_debug_type_internal(lbModule *m, Type *type) { LLVMMetadataRef char_type = lb_debug_type_basic_type(m, str_lit("char"), 8, LLVMDWARFTypeEncoding_Unsigned); return LLVMDIBuilderCreatePointerType(m->debug_builder, char_type, ptr_bits, ptr_bits, 0, "cstring", 7); } + + case Basic_string16: + { + LLVMMetadataRef elements[2] = {}; + elements[0] = lb_debug_struct_field(m, str_lit("data"), t_u16_ptr, 0); + elements[1] = lb_debug_struct_field(m, str_lit("len"), t_int, int_bits); + return lb_debug_basic_struct(m, str_lit("string16"), 2*int_bits, int_bits, elements, gb_count_of(elements)); + } + case Basic_cstring16: + { + LLVMMetadataRef char_type = lb_debug_type_basic_type(m, str_lit("wchar_t"), 16, LLVMDWARFTypeEncoding_Unsigned); + return LLVMDIBuilderCreatePointerType(m->debug_builder, char_type, ptr_bits, ptr_bits, 0, "cstring16", 7); + } + case Basic_any: { LLVMMetadataRef elements[2] = {}; diff --git a/src/llvm_backend_expr.cpp b/src/llvm_backend_expr.cpp index fbf0dea11..3463b6083 100644 --- a/src/llvm_backend_expr.cpp +++ b/src/llvm_backend_expr.cpp @@ -4354,12 +4354,13 @@ gb_internal lbAddr lb_build_addr_index_expr(lbProcedure *p, Ast *expr) { } - case Type_Basic: { // Basic_string + case Type_Basic: { // Basic_string/Basic_string16 lbValue str; lbValue elem; lbValue len; lbValue index; + str = lb_build_expr(p, ie->expr); if (deref) { str = lb_emit_load(p, str); diff --git a/src/llvm_backend_general.cpp b/src/llvm_backend_general.cpp index d9771a75b..9ef1c23c0 100644 --- a/src/llvm_backend_general.cpp +++ b/src/llvm_backend_general.cpp @@ -2715,6 +2715,43 @@ gb_internal LLVMValueRef lb_find_or_add_entity_string_ptr(lbModule *m, String co } } +gb_internal LLVMValueRef lb_find_or_add_entity_string16_ptr(lbModule *m, String16 const &str, bool custom_link_section) { + // TODO(bill): caching for UTF-16 strings + + LLVMValueRef indices[2] = {llvm_zero(m), llvm_zero(m)}; + + LLVMValueRef data = nullptr; + { + LLVMTypeRef llvm_u16 = LLVMInt16TypeInContext(m->ctx); + + TEMPORARY_ALLOCATOR_GUARD(); + + LLVMValueRef *values = gb_alloc_array(temporary_allocator(), LLVMValueRef, str.len+1); + + for (isize i = 0; i < str.len; i++) { + values[i] = LLVMConstInt(llvm_u16, str.text[i], false); + } + values[str.len] = LLVMConstInt(llvm_u16, 0, false); + + data = LLVMConstArray(llvm_u16, values, cast(unsigned)(str.len+1)); + } + + + u32 id = m->global_array_index.fetch_add(1); + gbString name = gb_string_make(temporary_allocator(), "csbs$"); + name = gb_string_appendc(name, m->module_name); + name = gb_string_append_fmt(name, "$%x", id); + + LLVMTypeRef type = LLVMTypeOf(data); + LLVMValueRef global_data = LLVMAddGlobal(m->mod, type, name); + LLVMSetInitializer(global_data, data); + lb_make_global_private_const(global_data); + LLVMSetAlignment(global_data, 1); + + LLVMValueRef ptr = LLVMConstInBoundsGEP2(type, global_data, indices, 2); + return ptr; +} + gb_internal lbValue lb_find_or_add_entity_string(lbModule *m, String const &str, bool custom_link_section) { LLVMValueRef ptr = nullptr; if (str.len != 0) { diff --git a/src/llvm_backend_utility.cpp b/src/llvm_backend_utility.cpp index d4117b7ff..ea1bae4e9 100644 --- a/src/llvm_backend_utility.cpp +++ b/src/llvm_backend_utility.cpp @@ -6,6 +6,7 @@ gb_internal bool lb_is_type_aggregate(Type *t) { case Type_Basic: switch (t->Basic.kind) { case Basic_string: + case Basic_string16: case Basic_any: return true; @@ -981,7 +982,8 @@ gb_internal i32 lb_convert_struct_index(lbModule *m, Type *t, i32 index) { } else if (build_context.ptr_size != build_context.int_size) { switch (t->kind) { case Type_Basic: - if (t->Basic.kind != Basic_string) { + if (t->Basic.kind != Basic_string && + t->Basic.kind != Basic_string16) { break; } /*fallthrough*/ @@ -1160,6 +1162,11 @@ gb_internal lbValue lb_emit_struct_ep(lbProcedure *p, lbValue s, i32 index) { case 0: result_type = alloc_type_pointer(t->Slice.elem); break; case 1: result_type = t_int; break; } + } else if (is_type_string16(t)) { + switch (index) { + case 0: result_type = t_u16_ptr; break; + case 1: result_type = t_int; break; + } } else if (is_type_string(t)) { switch (index) { case 0: result_type = t_u8_ptr; break; @@ -1273,6 +1280,12 @@ gb_internal lbValue lb_emit_struct_ev(lbProcedure *p, lbValue s, i32 index) { switch (t->kind) { case Type_Basic: switch (t->Basic.kind) { + case Basic_string16: + switch (index) { + case 0: result_type = t_u16_ptr; break; + case 1: result_type = t_int; break; + } + break; case Basic_string: switch (index) { case 0: result_type = t_u8_ptr; break; @@ -1440,6 +1453,10 @@ gb_internal lbValue lb_emit_deep_field_gep(lbProcedure *p, lbValue e, Selection e = lb_emit_struct_ep(p, e, index); break; + case Basic_string16: + e = lb_emit_struct_ep(p, e, index); + break; + default: GB_PANIC("un-gep-able type %s", type_to_string(type)); break; diff --git a/src/string.cpp b/src/string.cpp index 8405938f4..8cc0e93f3 100644 --- a/src/string.cpp +++ b/src/string.cpp @@ -79,6 +79,13 @@ gb_internal String substring(String const &s, isize lo, isize hi) { return make_string(s.text+lo, hi-lo); } +gb_internal String16 substring(String16 const &s, isize lo, isize hi) { + isize max = s.len; + GB_ASSERT_MSG(lo <= hi && hi <= max, "%td..%td..%td", lo, hi, max); + + return make_string16(s.text+lo, hi-lo); +} + gb_internal char *alloc_cstring(gbAllocator a, String s) { char *c_str = gb_alloc_array(a, char, s.len+1); -- cgit v1.2.3 From dca9bf0b0c8a3ad2ac6854c901d99868d3a296d5 Mon Sep 17 00:00:00 2001 From: gingerBill Date: Sat, 2 Aug 2025 13:11:34 +0100 Subject: Fix string16 literal length set in LLVM --- src/llvm_backend_const.cpp | 13 ++++++++++--- src/llvm_backend_general.cpp | 4 ++-- src/string.cpp | 2 -- 3 files changed, 12 insertions(+), 7 deletions(-) (limited to 'src/string.cpp') diff --git a/src/llvm_backend_const.cpp b/src/llvm_backend_const.cpp index cba0000cd..e64be49f2 100644 --- a/src/llvm_backend_const.cpp +++ b/src/llvm_backend_const.cpp @@ -782,9 +782,12 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, lb lbValue res = {}; res.type = default_type(original_type); + isize len = value.value_string.len; + if (is_type_string16(res.type) || is_type_cstring16(res.type)) { TEMPORARY_ALLOCATOR_GUARD(); String16 s16 = string_to_string16(temporary_allocator(), value.value_string); + len = s16.len; ptr = lb_find_or_add_entity_string16_ptr(m, s16, custom_link_section); } else { ptr = lb_find_or_add_entity_string_ptr(m, value.value_string, custom_link_section); @@ -797,10 +800,14 @@ gb_internal lbValue lb_const_value(lbModule *m, Type *type, ExactValue value, lb if (is_type_cstring(res.type) || is_type_cstring16(res.type)) { res.value = ptr; } else { - if (value.value_string.len == 0) { - ptr = LLVMConstNull(lb_type(m, t_u8_ptr)); + if (len == 0) { + if (is_type_string16(res.type)) { + ptr = LLVMConstNull(lb_type(m, t_u16_ptr)); + } else { + ptr = LLVMConstNull(lb_type(m, t_u8_ptr)); + } } - LLVMValueRef str_len = LLVMConstInt(lb_type(m, t_int), value.value_string.len, true); + LLVMValueRef str_len = LLVMConstInt(lb_type(m, t_int), len, true); GB_ASSERT(is_type_string(original_type)); if (is_type_string16(res.type)) { diff --git a/src/llvm_backend_general.cpp b/src/llvm_backend_general.cpp index 064d0ef39..d84b8302b 100644 --- a/src/llvm_backend_general.cpp +++ b/src/llvm_backend_general.cpp @@ -2758,7 +2758,7 @@ gb_internal LLVMValueRef lb_find_or_add_entity_string16_ptr(lbModule *m, String1 LLVMValueRef global_data = LLVMAddGlobal(m->mod, type, name); LLVMSetInitializer(global_data, data); lb_make_global_private_const(global_data); - LLVMSetAlignment(global_data, 1); + LLVMSetAlignment(global_data, 2); LLVMValueRef ptr = LLVMConstInBoundsGEP2(type, global_data, indices, 2); if (!custom_link_section) { @@ -2855,7 +2855,7 @@ gb_internal lbValue lb_find_or_add_entity_string16_slice_with_type(lbModule *m, LLVMValueRef global_data = LLVMAddGlobal(m->mod, type, name); LLVMSetInitializer(global_data, data); lb_make_global_private_const(global_data); - LLVMSetAlignment(global_data, 1); + LLVMSetAlignment(global_data, 2); i64 data_len = str.len; LLVMValueRef ptr = nullptr; diff --git a/src/string.cpp b/src/string.cpp index 8cc0e93f3..2087a5fee 100644 --- a/src/string.cpp +++ b/src/string.cpp @@ -658,7 +658,6 @@ gb_internal String normalize_path(gbAllocator a, String const &path, String cons -// TODO(bill): Make this non-windows specific gb_internal String16 string_to_string16(gbAllocator a, String s) { int len, len1; u16 *text; @@ -680,7 +679,6 @@ gb_internal String16 string_to_string16(gbAllocator a, String s) { return make_string16(nullptr, 0); } text[len] = 0; - return make_string16(text, len); } -- cgit v1.2.3