| | | |
|---|---|---|
| author | VladPavliuk <pavliuk.vlad@gmail.com> | 2024-07-14 18:22:20 +0300 |
| committer | VladPavliuk <pavliuk.vlad@gmail.com> | 2024-07-14 18:22:20 +0300 |
| commit | 3f8712edb03390c1eed4dced27f7c2707cf14ecb (patch) | |
| tree | a186834d911e19418836bf2ca3f52f334c11267a /src | |
| parent | 79e2f63182581547dcdb7593397d1c3e280a5670 (diff) | |
| parent | e7d37607ef9ce54a80d83230150874b71d628d6d (diff) | |
Merge branch 'master' into json-add-int-key-map-support
Diffstat (limited to 'src')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | src/build_settings.cpp | 6 |
| -rw-r--r-- | src/check_decl.cpp | 9 |
| -rw-r--r-- | src/check_expr.cpp | 31 |
| -rw-r--r-- | src/check_type.cpp | 37 |
| -rw-r--r-- | src/checker.cpp | 3 |
| -rw-r--r-- | src/checker.hpp | 9 |
| -rw-r--r-- | src/entity.cpp | 2 |
| -rw-r--r-- | src/llvm_abi.cpp | 14 |
| -rw-r--r-- | src/llvm_backend.cpp | 1 |
| -rw-r--r-- | src/llvm_backend.hpp | 9 |
| -rw-r--r-- | src/llvm_backend_proc.cpp | 56 |
| -rw-r--r-- | src/main.cpp | 12 |
| -rw-r--r-- | src/parser.cpp | 1 |
| -rw-r--r-- | src/parser.hpp | 9 |
| -rw-r--r-- | src/thread_pool.cpp | 129 |
| -rw-r--r-- | src/threading.cpp | 39 |
| -rw-r--r-- | src/types.cpp | 3 |
17 files changed, 300 insertions, 70 deletions
diff --git a/src/build_settings.cpp b/src/build_settings.cpp
index 4d3e20a7a..32640d732 100644
--- a/src/build_settings.cpp
+++ b/src/build_settings.cpp
@@ -1649,7 +1649,11 @@ gb_internal void init_build_context(TargetMetrics *cross_target, Subtarget subta
 	if (!bc->custom_optimization_level) {
 		// NOTE(bill): when building with `-debug` but not specifying an optimization level
 		// default to `-o:none` to improve the debug symbol generation by default
-		bc->optimization_level = -1; // -o:none
+		if (bc->ODIN_DEBUG) {
+			bc->optimization_level = -1; // -o:none
+		} else {
+			bc->optimization_level = 0; // -o:minimal
+		}
 	}
 
 	bc->optimization_level = gb_clamp(bc->optimization_level, -1, 3);
diff --git a/src/check_decl.cpp b/src/check_decl.cpp
index 7d81d102d..6828774e4 100644
--- a/src/check_decl.cpp
+++ b/src/check_decl.cpp
@@ -1869,5 +1869,14 @@ gb_internal bool check_proc_body(CheckerContext *ctx_, Token token, DeclInfo *de
 	add_deps_from_child_to_parent(decl);
 
+	for (VariadicReuseData const &vr : decl->variadic_reuses) {
+		GB_ASSERT(vr.slice_type->kind == Type_Slice);
+		Type *elem = vr.slice_type->Slice.elem;
+		i64 size = type_size_of(elem);
+		i64 align = type_align_of(elem);
+		decl->variadic_reuse_max_bytes = gb_max(decl->variadic_reuse_max_bytes, size*vr.max_count);
+		decl->variadic_reuse_max_align = gb_max(decl->variadic_reuse_max_align, align);
+	}
+
 	return true;
 }
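The check_decl.cpp hunk above folds the per-slice-type counts recorded during call checking into a single per-procedure requirement: the backing array must hold the largest `element_size * max_count` over all variadic slice types, at the largest element alignment. A standalone sketch of that arithmetic follows; the element sizes and counts are assumptions chosen for illustration, not values taken from the compiler.

```cpp
// Illustration of the bookkeeping above: max_bytes is the largest
// (element_size * max_count) over all variadic slice types used in a
// procedure body, and max_align is the largest element alignment.
// The concrete sizes below are assumed for the sake of the example.
#include <algorithm>
#include <cstdint>
#include <cstdio>

struct VariadicReuse { int64_t elem_size, elem_align, max_count; };

int main() {
    VariadicReuse reuses[] = {
        {8,  8, 3},   // e.g. ..int  with at most 3 variadic args at any call site
        {16, 8, 2},   // e.g. ..string with at most 2 variadic args
    };
    int64_t max_bytes = 0, max_align = 1;
    for (const auto &r : reuses) {
        max_bytes = std::max(max_bytes, r.elem_size * r.max_count);
        max_align = std::max(max_align, r.elem_align);
    }
    std::printf("backing array: %lld bytes, align %lld\n",
                (long long)max_bytes, (long long)max_align);   // 32 bytes, align 8
}
```

With those assumed inputs the shared buffer would be 32 bytes at 8-byte alignment; the backend later bumps the alignment to at least 16 (see the llvm_backend_proc.cpp hunk further down).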
diff --git a/src/check_expr.cpp b/src/check_expr.cpp
index 12acca0cb..82f64738f 100644
--- a/src/check_expr.cpp
+++ b/src/check_expr.cpp
@@ -6033,6 +6033,22 @@ gb_internal CallArgumentError check_call_arguments_internal(CheckerContext *c, A
 			Entity *vt = pt->params->Tuple.variables[pt->variadic_index];
 			o.type = vt->type;
+
+			// NOTE(bill, 2024-07-14): minimize the stack usage for variadic parameters with the backing array
+			if (c->decl) {
+				bool found = false;
+				for (auto &vr : c->decl->variadic_reuses) {
+					if (are_types_identical(vt->type, vr.slice_type)) {
+						vr.max_count = gb_max(vr.max_count, variadic_operands.count);
+						found = true;
+						break;
+					}
+				}
+				if (!found) {
+					array_add(&c->decl->variadic_reuses, VariadicReuseData{vt->type, variadic_operands.count});
+				}
+			}
+
 		} else {
 			dummy_argument_count += 1;
 			o.type = t_untyped_nil;
@@ -7888,12 +7904,15 @@ gb_internal ExprKind check_call_expr(CheckerContext *c, Operand *operand, Ast *c
 		// NOTE: Due to restrictions in LLVM you can not inline calls with a superset of features.
 		if (is_call_inlined) {
-			GB_ASSERT(c->curr_proc_decl);
-			GB_ASSERT(c->curr_proc_decl->entity);
-			GB_ASSERT(c->curr_proc_decl->entity->type->kind == Type_Proc);
-			String scope_features = c->curr_proc_decl->entity->type->Proc.enable_target_feature;
-			if (!check_target_feature_is_superset_of(scope_features, pt->Proc.enable_target_feature, &invalid)) {
-				error(call, "Inlined procedure enables target feature '%.*s', this requires the calling procedure to at least enable the same feature", LIT(invalid));
+			if (c->curr_proc_decl == nullptr) {
+				error(call, "Calling a '#force_inline' procedure that enables target features is not allowed at file scope");
+			} else {
+				GB_ASSERT(c->curr_proc_decl->entity);
+				GB_ASSERT(c->curr_proc_decl->entity->type->kind == Type_Proc);
+				String scope_features = c->curr_proc_decl->entity->type->Proc.enable_target_feature;
+				if (!check_target_feature_is_superset_of(scope_features, pt->Proc.enable_target_feature, &invalid)) {
+					error(call, "Inlined procedure enables target feature '%.*s', this requires the calling procedure to at least enable the same feature", LIT(invalid));
+				}
 			}
 		}
 	}
diff --git a/src/check_type.cpp b/src/check_type.cpp
index dd8559114..fea937e4e 100644
--- a/src/check_type.cpp
+++ b/src/check_type.cpp
@@ -1953,6 +1953,10 @@ gb_internal Type *check_get_params(CheckerContext *ctx, Scope *scope, Ast *_para
 				error(name, "'#by_ptr' can only be applied to variable fields");
 				p->flags &= ~FieldFlag_by_ptr;
 			}
+			if (p->flags&FieldFlag_no_capture) {
+				error(name, "'#no_capture' can only be applied to variable fields");
+				p->flags &= ~FieldFlag_no_capture;
+			}
 
 			param = alloc_entity_type_name(scope, name->Ident.token, type, EntityState_Resolved);
 			param->TypeName.is_type_alias = true;
@@ -2054,6 +2058,28 @@ gb_internal Type *check_get_params(CheckerContext *ctx, Scope *scope, Ast *_para
 					p->flags &= ~FieldFlag_by_ptr; // Remove the flag
 				}
 			}
+			if (p->flags&FieldFlag_no_capture) {
+				if (is_variadic && variadic_index == variables.count) {
+					if (p->flags & FieldFlag_c_vararg) {
+						error(name, "'#no_capture' cannot be applied to a #c_vararg parameter");
+						p->flags &= ~FieldFlag_no_capture;
+					} else {
+						error(name, "'#no_capture' is already implied on all variadic parameter");
+					}
+				} else if (is_type_polymorphic(type)) {
+					// ignore
+				} else {
+					if (is_type_internally_pointer_like(type)) {
+						error(name, "'#no_capture' is currently reserved for future use");
+					} else {
+						ERROR_BLOCK();
+						error(name, "'#no_capture' can only be applied to pointer-like types");
+						error_line("\t'#no_capture' does not currently do anything useful\n");
+						p->flags &= ~FieldFlag_no_capture;
+					}
+				}
+			}
+
 
 			if (is_poly_name) {
 				if (p->flags&FieldFlag_no_alias) {
@@ -2072,6 +2098,11 @@ gb_internal Type *check_get_params(CheckerContext *ctx, Scope *scope, Ast *_para
 					error(name, "'#by_ptr' can only be applied to variable fields");
 					p->flags &= ~FieldFlag_by_ptr;
 				}
+				if (p->flags&FieldFlag_no_capture) {
+					error(name, "'#no_capture' can only be applied to variable fields");
+					p->flags &= ~FieldFlag_no_capture;
+				}
+
 				if (!is_type_polymorphic(type) && check_constant_parameter_value(type, params[i])) {
 					// failed
@@ -2091,6 +2122,8 @@ gb_internal Type *check_get_params(CheckerContext *ctx, Scope *scope, Ast *_para
 				param->flags |= EntityFlag_Ellipsis;
 				if (is_c_vararg) {
 					param->flags |= EntityFlag_CVarArg;
+				} else {
+					param->flags |= EntityFlag_NoCapture;
 				}
 			}
 
@@ -2115,6 +2148,10 @@ gb_internal Type *check_get_params(CheckerContext *ctx, Scope *scope, Ast *_para
 			if (p->flags&FieldFlag_by_ptr) {
 				param->flags |= EntityFlag_ByPtr;
 			}
+			if (p->flags&FieldFlag_no_capture) {
+				param->flags |= EntityFlag_NoCapture;
+			}
+
 			param->state = EntityState_Resolved; // NOTE(bill): This should have be resolved whilst determining it
 
 			add_entity(ctx, scope, name, param);
diff --git a/src/checker.cpp b/src/checker.cpp
index 8756cce1a..336440d32 100644
--- a/src/checker.cpp
+++ b/src/checker.cpp
@@ -184,6 +184,9 @@ gb_internal void init_decl_info(DeclInfo *d, Scope *scope, DeclInfo *parent) {
 	ptr_set_init(&d->deps, 0);
 	ptr_set_init(&d->type_info_deps, 0);
 	d->labels.allocator = heap_allocator();
+	d->variadic_reuses.allocator = heap_allocator();
+	d->variadic_reuse_max_bytes = 0;
+	d->variadic_reuse_max_align = 1;
 }
 
 gb_internal DeclInfo *make_decl_info(Scope *scope, DeclInfo *parent) {
diff --git a/src/checker.hpp b/src/checker.hpp
index 781737140..d76e4c7d0 100644
--- a/src/checker.hpp
+++ b/src/checker.hpp
@@ -181,6 +181,11 @@ char const *ProcCheckedState_strings[ProcCheckedState_COUNT] {
 	"Checked",
 };
 
+struct VariadicReuseData {
+	Type *slice_type; // ..elem_type
+	i64 max_count;
+};
+
 // DeclInfo is used to store information of certain declarations to allow for "any order" usage
 struct DeclInfo {
 	DeclInfo * parent; // NOTE(bill): only used for procedure literals at the moment
@@ -219,6 +224,10 @@ struct DeclInfo {
 
 	Array<BlockLabel> labels;
 
+	Array<VariadicReuseData> variadic_reuses;
+	i64 variadic_reuse_max_bytes;
+	i64 variadic_reuse_max_align;
+
 	// NOTE(bill): this is to prevent a race condition since these procedure literals can be created anywhere at any time
 	struct lbModule *code_gen_module;
 };
diff --git a/src/entity.cpp b/src/entity.cpp
index 41d84e0f7..db6ffdd52 100644
--- a/src/entity.cpp
+++ b/src/entity.cpp
@@ -45,7 +45,7 @@ enum EntityFlag : u64 {
 	EntityFlag_Value = 1ull<<11,
 	EntityFlag_BitFieldField = 1ull<<12,
-
+	EntityFlag_NoCapture = 1ull<<13, // #no_capture
 	EntityFlag_PolyConst = 1ull<<15,
 	EntityFlag_NotExported = 1ull<<16,
diff --git a/src/llvm_abi.cpp b/src/llvm_abi.cpp
index b2e485d01..c21cd0a46 100644
--- a/src/llvm_abi.cpp
+++ b/src/llvm_abi.cpp
@@ -15,6 +15,7 @@ struct lbArgType {
 	LLVMAttributeRef align_attribute; // Optional
 	i64 byval_alignment;
 	bool is_byval;
+	bool no_capture;
 };
 
@@ -159,6 +160,11 @@ gb_internal void lb_add_function_type_attributes(LLVMValueRef fn, lbFunctionType
 			LLVMAddAttributeAtIndex(fn, arg_index+1, arg->align_attribute);
 		}
 
+		if (arg->no_capture) {
+			LLVMAddAttributeAtIndex(fn, arg_index+1, nocapture_attr);
+		}
+
+
 		if (ft->multiple_return_original_type) {
 			if (ft->original_arg_count <= i) {
 				LLVMAddAttributeAtIndex(fn, arg_index+1, noalias_attr);
@@ -645,10 +651,10 @@ namespace lbAbiAmd64SysV {
 		if (is_mem_cls(cls, attribute_kind)) {
 			LLVMAttributeRef attribute = nullptr;
 			if (attribute_kind == Amd64TypeAttribute_ByVal) {
-				// if (!is_calling_convention_odin(calling_convention)) {
-					return lb_arg_type_indirect_byval(c, type);
-				// }
-				// attribute = nullptr;
+				if (is_calling_convention_odin(calling_convention)) {
+					return lb_arg_type_indirect(type, attribute);
+				}
+				return lb_arg_type_indirect_byval(c, type);
 			} else if (attribute_kind == Amd64TypeAttribute_StructRect) {
 				attribute = lb_create_enum_attribute_with_type(c, "sret", type);
 			}
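For context on `nocapture_attr` in the llvm_abi.cpp hunk: attaching a parameter attribute through the LLVM-C API uses a 1-based index, with index 0 reserved for the return value, which is why the code passes `arg_index+1`. A minimal sketch, assuming only standard LLVM-C calls rather than the backend's own helpers:

```cpp
// A minimal sketch (not compiler code): attach the `nocapture` attribute to
// parameter `param_index` (0-based) of `fn` via the LLVM-C API.
#include <llvm-c/Core.h>

static void add_nocapture(LLVMContextRef ctx, LLVMValueRef fn, unsigned param_index) {
    unsigned kind = LLVMGetEnumAttributeKindForName("nocapture", 9);
    LLVMAttributeRef attr = LLVMCreateEnumAttribute(ctx, kind, 0);
    // Attribute index 0 is the return value, so parameters start at 1,
    // matching the `arg_index+1` used in the diff above.
    LLVMAddAttributeAtIndex(fn, param_index + 1, attr);
}
```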
diff --git a/src/llvm_backend.cpp b/src/llvm_backend.cpp
index 52661dfa7..ae46186ed 100644
--- a/src/llvm_backend.cpp
+++ b/src/llvm_backend.cpp
@@ -1570,6 +1570,7 @@ gb_internal WORKER_TASK_PROC(lb_llvm_module_pass_worker_proc) {
 	switch (build_context.optimization_level) {
 	case -1:
+		array_add(&passes, "function(annotation-remarks)");
 		break;
 	case 0:
 		array_add(&passes, "always-inline");
diff --git a/src/llvm_backend.hpp b/src/llvm_backend.hpp
index 005358734..deb05528f 100644
--- a/src/llvm_backend.hpp
+++ b/src/llvm_backend.hpp
@@ -296,6 +296,11 @@ enum lbProcedureFlag : u32 {
 	lbProcedureFlag_DebugAllocaCopy = 1<<1,
 };
 
+struct lbVariadicReuseSlices {
+	Type *slice_type;
+	lbAddr slice_addr;
+};
+
 struct lbProcedure {
 	u32 flags;
 	u16 state_flags;
@@ -336,8 +341,10 @@ struct lbProcedure {
 	bool in_multi_assignment;
 
 	Array<LLVMValueRef> raw_input_parameters;
-	LLVMValueRef temp_callee_return_struct_memory;
 
+	Array<lbVariadicReuseSlices> variadic_reuses;
+	lbAddr variadic_reuse_base_array_ptr;
+	LLVMValueRef temp_callee_return_struct_memory;
 	Ast *curr_stmt;
 
 	Array<Scope *> scope_stack;
diff --git a/src/llvm_backend_proc.cpp b/src/llvm_backend_proc.cpp
index 610c34de2..5270d6c30 100644
--- a/src/llvm_backend_proc.cpp
+++ b/src/llvm_backend_proc.cpp
@@ -253,6 +253,11 @@ gb_internal lbProcedure *lb_create_procedure(lbModule *m, Entity *entity, bool i
 			if (e->flags&EntityFlag_NoAlias) {
 				lb_add_proc_attribute_at_index(p, offset+parameter_index, "noalias");
 			}
+			if (e->flags&EntityFlag_NoCapture) {
+				if (is_type_internally_pointer_like(e->type)) {
+					lb_add_proc_attribute_at_index(p, offset+parameter_index, "nocapture");
+				}
+			}
 			parameter_index += 1;
 		}
 	}
@@ -517,6 +522,7 @@ gb_internal void lb_begin_procedure_body(lbProcedure *p) {
 	lb_start_block(p, p->entry_block);
 
 	map_init(&p->direct_parameters);
+	p->variadic_reuses.allocator = heap_allocator();
 
 	GB_ASSERT(p->type != nullptr);
 
@@ -3450,17 +3456,59 @@ gb_internal lbValue lb_build_call_expr_internal(lbProcedure *p, Ast *expr) {
 			}
 			isize slice_len = var_args.count;
 			if (slice_len > 0) {
-				lbAddr slice = lb_add_local_generated(p, slice_type, true);
-				lbAddr base_array = lb_add_local_generated(p, alloc_type_array(elem_type, slice_len), true);
+				lbAddr slice = {};
+
+				for (auto const &vr : p->variadic_reuses) {
+					if (are_types_identical(vr.slice_type, slice_type)) {
+						slice = vr.slice_addr;
+						break;
+					}
+				}
+
+				DeclInfo *d = decl_info_of_entity(p->entity);
+				if (d != nullptr && slice.addr.value == nullptr) {
+					for (auto const &vr : d->variadic_reuses) {
+						if (are_types_identical(vr.slice_type, slice_type)) {
+							#if LLVM_VERSION_MAJOR >= 13
+							// NOTE(bill): No point wasting even more memory, just reuse this stack variable too
+							if (p->variadic_reuses.count > 0) {
+								slice = p->variadic_reuses[0].slice_addr;
+							} else {
+								slice = lb_add_local_generated(p, slice_type, true);
+							}
+							// NOTE(bill): Change the underlying type to match the specific type
+							slice.addr.type = alloc_type_pointer(slice_type);
+							#else
+							slice = lb_add_local_generated(p, slice_type, true);
+							#endif
+							array_add(&p->variadic_reuses, lbVariadicReuseSlices{slice_type, slice});
+							break;
+						}
+					}
+				}
+
+				lbValue base_array_ptr = p->variadic_reuse_base_array_ptr.addr;
+				if (d != nullptr && base_array_ptr.value == nullptr) {
+					i64 max_bytes = d->variadic_reuse_max_bytes;
+					i64 max_align = gb_max(d->variadic_reuse_max_align, 16);
+					p->variadic_reuse_base_array_ptr = lb_add_local_generated(p, alloc_type_array(t_u8, max_bytes), true);
+					lb_try_update_alignment(p->variadic_reuse_base_array_ptr.addr, cast(unsigned)max_align);
+					base_array_ptr = p->variadic_reuse_base_array_ptr.addr;
+				}
+
+				GB_ASSERT(base_array_ptr.value != nullptr);
+				GB_ASSERT(slice.addr.value != nullptr);
+
+				base_array_ptr = lb_emit_conv(p, base_array_ptr, alloc_type_pointer(alloc_type_array(elem_type, slice_len)));
 
 				for (isize i = 0; i < var_args.count; i++) {
-					lbValue addr = lb_emit_array_epi(p, base_array.addr, cast(i32)i);
+					lbValue addr = lb_emit_array_epi(p, base_array_ptr, cast(i32)i);
 					lbValue var_arg = var_args[i];
 					var_arg = lb_emit_conv(p, var_arg, elem_type);
 					lb_emit_store(p, addr, var_arg);
 				}
-				lbValue base_elem = lb_emit_array_epi(p, base_array.addr, 0);
+				lbValue base_elem = lb_emit_array_epi(p, base_array_ptr, 0);
 				lbValue len = lb_const_int(p->module, t_int, slice_len);
 				lb_fill_slice(p, slice, base_elem, len);
diff --git a/src/main.cpp b/src/main.cpp
index e6a0aecf0..388184be9 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -399,6 +399,8 @@ enum BuildFlagKind {
 	BuildFlag_Sanitize,
 
+	BuildFlag_FastBuild,
+
 	#if defined(GB_SYSTEM_WINDOWS)
 	BuildFlag_IgnoreVsSearch,
 	BuildFlag_ResourceFile,
@@ -605,6 +607,9 @@ gb_internal bool parse_build_flags(Array<String> args) {
 	add_flag(&build_flags, BuildFlag_Sanitize, str_lit("sanitize"), BuildFlagParam_String, Command__does_build, true);
 
+	add_flag(&build_flags, BuildFlag_FastBuild, str_lit("fast-build"), BuildFlagParam_None, Command__does_build);
+
+
 	#if defined(GB_SYSTEM_WINDOWS)
 	add_flag(&build_flags, BuildFlag_IgnoreVsSearch, str_lit("ignore-vs-search"), BuildFlagParam_None, Command__does_build);
 	add_flag(&build_flags, BuildFlag_ResourceFile, str_lit("resource"), BuildFlagParam_String, Command__does_build);
@@ -1441,6 +1446,13 @@ gb_internal bool parse_build_flags(Array<String> args) {
 			}
 			break;
+
+		case BuildFlag_FastBuild:
+			build_context.custom_optimization_level = true;
+			build_context.optimization_level = -1;
+			build_context.use_separate_modules = true;
+			break;
+
 		#if defined(GB_SYSTEM_WINDOWS)
 		case BuildFlag_IgnoreVsSearch: {
 			GB_ASSERT(value.kind == ExactValue_Invalid);
diff --git a/src/parser.cpp b/src/parser.cpp
index 9ce3d563d..a6a146cfd 100644
--- a/src/parser.cpp
+++ b/src/parser.cpp
@@ -4014,6 +4014,7 @@ struct ParseFieldPrefixMapping {
 gb_global ParseFieldPrefixMapping const parse_field_prefix_mappings[] = {
 	{str_lit("using"), Token_using, FieldFlag_using},
 	{str_lit("no_alias"), Token_Hash, FieldFlag_no_alias},
+	{str_lit("no_capture"), Token_Hash, FieldFlag_no_capture},
 	{str_lit("c_vararg"), Token_Hash, FieldFlag_c_vararg},
 	{str_lit("const"), Token_Hash, FieldFlag_const},
 	{str_lit("any_int"), Token_Hash, FieldFlag_any_int},
diff --git a/src/parser.hpp b/src/parser.hpp
index 86b3393af..451cdf53d 100644
--- a/src/parser.hpp
+++ b/src/parser.hpp
@@ -331,8 +331,10 @@ enum FieldFlag : u32 {
 	FieldFlag_by_ptr = 1<<8,
 	FieldFlag_no_broadcast = 1<<9, // disallow array programming
 
+	FieldFlag_no_capture = 1<<11,
+
 	// Internal use by the parser only
-	FieldFlag_Tags = 1<<10,
+	FieldFlag_Tags = 1<<15,
 	FieldFlag_Results = 1<<16,
 
@@ -340,7 +342,10 @@ enum FieldFlag : u32 {
 	FieldFlag_Invalid = 1u<<31,
 
 	// Parameter List Restrictions
-	FieldFlag_Signature = FieldFlag_ellipsis|FieldFlag_using|FieldFlag_no_alias|FieldFlag_c_vararg|FieldFlag_const|FieldFlag_any_int|FieldFlag_by_ptr|FieldFlag_no_broadcast,
+	FieldFlag_Signature = FieldFlag_ellipsis|FieldFlag_using|FieldFlag_no_alias|FieldFlag_c_vararg|
+	                      FieldFlag_const|FieldFlag_any_int|FieldFlag_by_ptr|FieldFlag_no_broadcast|
+	                      FieldFlag_no_capture,
+
 	FieldFlag_Struct = FieldFlag_using|FieldFlag_subtype|FieldFlag_Tags,
 };
diff --git a/src/thread_pool.cpp b/src/thread_pool.cpp
index 5dbbe37c4..62cca6de6 100644
--- a/src/thread_pool.cpp
+++ b/src/thread_pool.cpp
@@ -10,13 +10,18 @@ gb_internal void thread_pool_destroy(ThreadPool *pool);
 gb_internal bool thread_pool_add_task(ThreadPool *pool, WorkerTaskProc *proc, void *data);
 gb_internal void thread_pool_wait(ThreadPool *pool);
 
+enum GrabState {
+	Grab_Success = 0,
+	Grab_Empty   = 1,
+	Grab_Failed  = 2,
+};
+
 struct ThreadPool {
-	gbAllocator threads_allocator;
-	Slice<Thread> threads;
+	gbAllocator   threads_allocator;
+	Slice<Thread> threads;
 	std::atomic<bool> running;
 
 	Futex tasks_available;
-
 	Futex tasks_left;
 };
@@ -46,7 +51,7 @@ gb_internal void thread_pool_destroy(ThreadPool *pool) {
 	for_array_off(i, 1, pool->threads) {
 		Thread *t = &pool->threads[i];
-		pool->tasks_available.fetch_add(1, std::memory_order_relaxed);
+		pool->tasks_available.fetch_add(1, std::memory_order_acquire);
 		futex_broadcast(&pool->tasks_available);
 		thread_join_and_destroy(t);
 	}
@@ -54,51 +59,86 @@ gb_internal void thread_pool_destroy(ThreadPool *pool) {
 	gb_free(pool->threads_allocator, pool->threads.data);
 }
 
-void thread_pool_queue_push(Thread *thread, WorkerTask task) {
-	u64 capture;
-	u64 new_capture;
-	do {
-		capture = thread->head_and_tail.load();
-
-		u64 mask = thread->capacity - 1;
-		u64 head = (capture >> 32) & mask;
-		u64 tail = ((u32)capture) & mask;
+TaskRingBuffer *task_ring_grow(TaskRingBuffer *ring, isize bottom, isize top) {
+	TaskRingBuffer *new_ring = task_ring_init(ring->size * 2);
+	for (isize i = top; i < bottom; i++) {
+		new_ring->buffer[i % new_ring->size] = ring->buffer[i % ring->size];
+	}
+	return new_ring;
+}
 
-		u64 new_head = (head + 1) & mask;
-		GB_ASSERT_MSG(new_head != tail, "Thread Queue Full!");
+void thread_pool_queue_push(Thread *thread, WorkerTask task) {
+	isize bot = thread->queue.bottom.load(std::memory_order_relaxed);
+	isize top = thread->queue.top.load(std::memory_order_acquire);
+	TaskRingBuffer *cur_ring = thread->queue.ring.load(std::memory_order_relaxed);
+
+	isize size = bot - top;
+	if (size > (cur_ring->size - 1)) {
+		// Queue is full
+		thread->queue.ring = task_ring_grow(thread->queue.ring, bot, top);
+		cur_ring = thread->queue.ring.load(std::memory_order_relaxed);
+	}
 
-		// This *must* be done in here, to avoid a potential race condition where we no longer own the slot by the time we're assigning
-		thread->queue[head] = task;
-		new_capture = (new_head << 32) | tail;
-	} while (!thread->head_and_tail.compare_exchange_weak(capture, new_capture));
+	cur_ring->buffer[bot % cur_ring->size] = task;
+	std::atomic_thread_fence(std::memory_order_release);
+	thread->queue.bottom.store(bot + 1, std::memory_order_relaxed);
 
 	thread->pool->tasks_left.fetch_add(1, std::memory_order_release);
 	thread->pool->tasks_available.fetch_add(1, std::memory_order_relaxed);
 	futex_broadcast(&thread->pool->tasks_available);
 }
 
-bool thread_pool_queue_pop(Thread *thread, WorkerTask *task) {
-	u64 capture;
-	u64 new_capture;
-	do {
-		capture = thread->head_and_tail.load(std::memory_order_acquire);
-
-		u64 mask = thread->capacity - 1;
-		u64 head = (capture >> 32) & mask;
-		u64 tail = ((u32)capture) & mask;
+GrabState thread_pool_queue_take(Thread *thread, WorkerTask *task) {
+	isize bot = thread->queue.bottom.load(std::memory_order_relaxed) - 1;
+	TaskRingBuffer *cur_ring = thread->queue.ring.load(std::memory_order_relaxed);
+	thread->queue.bottom.store(bot, std::memory_order_relaxed);
+	std::atomic_thread_fence(std::memory_order_seq_cst);
+
+	isize top = thread->queue.top.load(std::memory_order_relaxed);
+	if (top <= bot) {
+		// Queue is not empty
+		*task = cur_ring->buffer[bot % cur_ring->size];
+		if (top == bot) {
+			// Only one entry left in queue
+			if (!thread->queue.top.compare_exchange_strong(top, top + 1, std::memory_order_seq_cst, std::memory_order_relaxed)) {
+				// Race failed
+				thread->queue.bottom.store(bot + 1, std::memory_order_relaxed);
+				return Grab_Empty;
+			}
 
-		u64 new_tail = (tail + 1) & mask;
-		if (tail == head) {
-			return false;
+			thread->queue.bottom.store(bot + 1, std::memory_order_relaxed);
+			return Grab_Success;
 		}
 
-		// Making a copy of the task before we increment the tail, avoiding the same potential race condition as above
-		*task = thread->queue[tail];
-
-		new_capture = (head << 32) | new_tail;
-	} while (!thread->head_and_tail.compare_exchange_weak(capture, new_capture, std::memory_order_release));
+		// We got a task without hitting a race
+		return Grab_Success;
+	} else {
+		// Queue is empty
+		thread->queue.bottom.store(bot + 1, std::memory_order_relaxed);
+		return Grab_Empty;
+	}
+}
 
-	return true;
+GrabState thread_pool_queue_steal(Thread *thread, WorkerTask *task) {
+	isize top = thread->queue.top.load(std::memory_order_acquire);
+	std::atomic_thread_fence(std::memory_order_seq_cst);
+	isize bot = thread->queue.bottom.load(std::memory_order_acquire);
+
+	GrabState ret = Grab_Empty;
+	if (top < bot) {
+		// Queue is not empty
+		TaskRingBuffer *cur_ring = thread->queue.ring.load(std::memory_order_consume);
+		*task = cur_ring->buffer[top % cur_ring->size];
+
+		if (!thread->queue.top.compare_exchange_strong(top, top + 1, std::memory_order_seq_cst, std::memory_order_relaxed)) {
+			// Race failed
+			ret = Grab_Failed;
+		} else {
+			ret = Grab_Success;
+		}
+	}
+	return ret;
 }
 
 gb_internal bool thread_pool_add_task(ThreadPool *pool, WorkerTaskProc *proc, void *data) {
@@ -115,12 +155,11 @@ gb_internal void thread_pool_wait(ThreadPool *pool) {
 	while (pool->tasks_left.load(std::memory_order_acquire)) {
 		// if we've got tasks on our queue, run them
-		while (thread_pool_queue_pop(current_thread, &task)) {
+		while (!thread_pool_queue_take(current_thread, &task)) {
 			task.do_work(task.data);
 			pool->tasks_left.fetch_sub(1, std::memory_order_release);
 		}
 
-		// is this mem-barriered enough?
 		// This *must* be executed in this order, so the futex wakes immediately
 		// if rem_tasks has changed since we checked last, otherwise the program
@@ -145,7 +184,7 @@ gb_internal THREAD_PROC(thread_pool_thread_proc) {
 		usize finished_tasks = 0;
 		i32 state;
 
-		while (thread_pool_queue_pop(current_thread, &task)) {
+		while (!thread_pool_queue_take(current_thread, &task)) {
 			task.do_work(task.data);
 			pool->tasks_left.fetch_sub(1, std::memory_order_release);
 
@@ -167,7 +206,12 @@ gb_internal THREAD_PROC(thread_pool_thread_proc) {
 			Thread *thread = &pool->threads.data[idx];
 			WorkerTask task;
-			if (thread_pool_queue_pop(thread, &task)) {
+
+			GrabState ret = thread_pool_queue_steal(thread, &task);
+			switch (ret) {
+			case Grab_Empty:
+				continue;
+			case Grab_Success:
 				task.do_work(task.data);
 				pool->tasks_left.fetch_sub(1, std::memory_order_release);
 
@@ -175,6 +219,8 @@ gb_internal THREAD_PROC(thread_pool_thread_proc) {
 					futex_signal(&pool->tasks_left);
 				}
 
+				/*fallthrough*/
+			case Grab_Failed:
 				goto main_loop_continue;
 			}
 		}
@@ -182,6 +228,7 @@ gb_internal THREAD_PROC(thread_pool_thread_proc) {
 
 		// if we've done all our work, and there's nothing to steal, go to sleep
 		state = pool->tasks_available.load(std::memory_order_acquire);
+		if (!pool->running) { break; }
 		futex_wait(&pool->tasks_available, state);
 
 		main_loop_continue:;
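The rewritten queue above splits the old single `head_and_tail` word into separate `top` and `bottom` indices over a growable ring, which is the shape of a Chase-Lev style work-stealing deque: the owning thread pushes and takes at the bottom, other workers steal from the top, and only the last remaining item is contested through a CAS on `top`. A condensed sketch of that protocol, assuming a fixed-size ring and a simplified task type rather than the pool's actual structures:

```cpp
// A minimal sketch of the owner/thief protocol the new queue appears to follow
// (a Chase-Lev style deque); fixed-size ring, no growth, illustrative only.
#include <atomic>
#include <cstddef>

struct Task { void (*fn)(void *); void *data; };

struct Deque {
    static constexpr ptrdiff_t SIZE = 1 << 10;   // power of two
    std::atomic<ptrdiff_t> top{0}, bottom{0};
    Task ring[SIZE];

    // Owner only: push at the bottom.
    void push(Task t) {
        ptrdiff_t b = bottom.load(std::memory_order_relaxed);
        ring[b % SIZE] = t;
        std::atomic_thread_fence(std::memory_order_release);
        bottom.store(b + 1, std::memory_order_relaxed);
    }

    // Owner only: take from the bottom; races with thieves only on the last item.
    bool take(Task *out) {
        ptrdiff_t b = bottom.load(std::memory_order_relaxed) - 1;
        bottom.store(b, std::memory_order_relaxed);
        std::atomic_thread_fence(std::memory_order_seq_cst);
        ptrdiff_t t = top.load(std::memory_order_relaxed);
        if (t > b) {                       // empty: restore bottom and give up
            bottom.store(b + 1, std::memory_order_relaxed);
            return false;
        }
        *out = ring[b % SIZE];
        if (t == b) {                      // last item: settle the race via CAS on top
            bool won = top.compare_exchange_strong(t, t + 1, std::memory_order_seq_cst);
            bottom.store(b + 1, std::memory_order_relaxed);
            return won;
        }
        return true;
    }

    // Any other thread: steal from the top.
    bool steal(Task *out) {
        ptrdiff_t t = top.load(std::memory_order_acquire);
        std::atomic_thread_fence(std::memory_order_seq_cst);
        ptrdiff_t b = bottom.load(std::memory_order_acquire);
        if (t >= b) return false;          // empty
        *out = ring[t % SIZE];
        return top.compare_exchange_strong(t, t + 1, std::memory_order_seq_cst);
    }
};
```

The `Grab_Failed` case in the diff corresponds to the steal-side CAS losing its race, which the worker loop treats differently from an empty queue.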
diff --git a/src/threading.cpp b/src/threading.cpp
index 717dcb874..ff0fdfcde 100644
--- a/src/threading.cpp
+++ b/src/threading.cpp
@@ -46,6 +46,18 @@ typedef struct WorkerTask {
 	void *data;
 } WorkerTask;
 
+typedef struct TaskRingBuffer {
+	std::atomic<isize> size;
+	std::atomic<WorkerTask *> buffer;
+} TaskRingBuffer;
+
+typedef struct TaskQueue {
+	std::atomic<isize> top;
+	std::atomic<isize> bottom;
+
+	std::atomic<TaskRingBuffer *> ring;
+} TaskQueue;
+
 struct Thread {
 #if defined(GB_SYSTEM_WINDOWS)
 	void *win32_handle;
@@ -54,12 +66,9 @@ struct Thread {
 #endif
 
 	isize idx;
+	isize stack_size;
 
-	WorkerTask *queue;
-	size_t capacity;
-	std::atomic<uint64_t> head_and_tail;
-
-	isize stack_size;
+	struct TaskQueue queue;
 
 	struct ThreadPool *pool;
 };
@@ -551,6 +560,18 @@ gb_internal void *internal_thread_proc(void *arg) {
 }
 #endif
 
+TaskRingBuffer *task_ring_init(isize size) {
+	TaskRingBuffer *ring = gb_alloc_item(heap_allocator(), TaskRingBuffer);
+	ring->size = size;
+	ring->buffer = gb_alloc_array(heap_allocator(), WorkerTask, ring->size);
+	return ring;
+}
+
+void thread_queue_destroy(TaskQueue *q) {
+	gb_free(heap_allocator(), (*q->ring).buffer);
+	gb_free(heap_allocator(), q->ring);
+}
+
 gb_internal void thread_init(ThreadPool *pool, Thread *t, isize idx) {
 	gb_zero_item(t);
 #if defined(GB_SYSTEM_WINDOWS)
@@ -559,14 +580,12 @@ gb_internal void thread_init(ThreadPool *pool, Thread *t, isize idx) {
 	t->posix_handle = 0;
 #endif
 
-	t->capacity = 1 << 14; // must be a power of 2
-	t->queue = gb_alloc_array(heap_allocator(), WorkerTask, t->capacity);
-	t->head_and_tail = 0;
+	// Size must be a power of 2
+	t->queue.ring = task_ring_init(1 << 14);
 	t->pool = pool;
 	t->idx = idx;
 }
 
-
 gb_internal void thread_init_and_start(ThreadPool *pool, Thread *t, isize idx) {
 	thread_init(pool, t, idx);
 	isize stack_size = 0;
@@ -598,7 +617,7 @@ gb_internal void thread_join_and_destroy(Thread *t) {
 	t->posix_handle = 0;
 #endif
 
-	gb_free(heap_allocator(), t->queue);
+	thread_queue_destroy(&t->queue);
 }
 
 gb_internal void thread_set_name(Thread *t, char const *name) {
diff --git a/src/types.cpp b/src/types.cpp
index c3a5fb539..92b187cdb 100644
--- a/src/types.cpp
+++ b/src/types.cpp
@@ -2923,11 +2923,14 @@ gb_internal Type *c_vararg_promote_type(Type *type) {
 	if (core->kind == Type_Basic) {
 		switch (core->Basic.kind) {
+		case Basic_f16:
 		case Basic_f32:
 		case Basic_UntypedFloat:
 			return t_f64;
+		case Basic_f16le:
 		case Basic_f32le:
 			return t_f64le;
+		case Basic_f16be:
 		case Basic_f32be:
 			return t_f64be;
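The types.cpp hunk extends C-vararg promotion to the half-precision floats: C's default argument promotions already require a `float` passed through `...` to arrive as `double`, and this change makes the `f16`, `f16le`, and `f16be` cases widen the same way the `f32` variants already did. A small C++ reminder of the underlying promotion rule (ordinary variadic behaviour, not compiler code):

```cpp
// Default argument promotion: a float passed through `...` is widened to double,
// so the callee must read it back with va_arg(args, double).
#include <cstdarg>
#include <cstdio>

double first_vararg(int count, ...) {
    va_list args;
    va_start(args, count);
    double v = va_arg(args, double); // float arguments arrive promoted to double
    va_end(args);
    return v;
}

int main() {
    float x = 1.5f;
    std::printf("%f\n", first_vararg(1, x)); // x is promoted at the call site
    return 0;
}
```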