Diffstat (limited to 'src')
-rw-r--r--   src/build_settings.cpp        6
-rw-r--r--   src/check_decl.cpp            9
-rw-r--r--   src/check_expr.cpp           31
-rw-r--r--   src/check_type.cpp           37
-rw-r--r--   src/checker.cpp               3
-rw-r--r--   src/checker.hpp               9
-rw-r--r--   src/entity.cpp                2
-rw-r--r--   src/llvm_abi.cpp             14
-rw-r--r--   src/llvm_backend.cpp          1
-rw-r--r--   src/llvm_backend.hpp          9
-rw-r--r--   src/llvm_backend_proc.cpp    56
-rw-r--r--   src/main.cpp                 12
-rw-r--r--   src/parser.cpp                1
-rw-r--r--   src/parser.hpp                9
-rw-r--r--   src/thread_pool.cpp         129
-rw-r--r--   src/threading.cpp            39
-rw-r--r--   src/types.cpp                 3
17 files changed, 300 insertions, 70 deletions
diff --git a/src/build_settings.cpp b/src/build_settings.cpp
index 4d3e20a7a..32640d732 100644
--- a/src/build_settings.cpp
+++ b/src/build_settings.cpp
@@ -1649,7 +1649,11 @@ gb_internal void init_build_context(TargetMetrics *cross_target, Subtarget subta
if (!bc->custom_optimization_level) {
// NOTE(bill): when building with `-debug` but not specifying an optimization level
// default to `-o:none` to improve the debug symbol generation by default
- bc->optimization_level = -1; // -o:none
+ if (bc->ODIN_DEBUG) {
+ bc->optimization_level = -1; // -o:none
+ } else {
+ bc->optimization_level = 0; // -o:minimal
+ }
}
bc->optimization_level = gb_clamp(bc->optimization_level, -1, 3);
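
The hunk above changes the implicit default: building with -debug and no explicit -o: level now selects -o:none, while every other build without an explicit level falls back to -o:minimal. A minimal standalone sketch of that defaulting rule, using illustrative names rather than the compiler's BuildContext fields:

    // Sketch only: custom_level/requested/odin_debug stand in for the
    // BuildContext fields used in the hunk above.
    static int default_optimization_level(bool custom_level, int requested, bool odin_debug) {
        if (!custom_level) {
            // -debug with no explicit -o: means -o:none (-1); otherwise -o:minimal (0)
            requested = odin_debug ? -1 : 0;
        }
        // mirrors gb_clamp(bc->optimization_level, -1, 3)
        if (requested < -1) requested = -1;
        if (requested >  3) requested =  3;
        return requested;
    }
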
diff --git a/src/check_decl.cpp b/src/check_decl.cpp
index 7d81d102d..6828774e4 100644
--- a/src/check_decl.cpp
+++ b/src/check_decl.cpp
@@ -1869,5 +1869,14 @@ gb_internal bool check_proc_body(CheckerContext *ctx_, Token token, DeclInfo *de
add_deps_from_child_to_parent(decl);
+ for (VariadicReuseData const &vr : decl->variadic_reuses) {
+ GB_ASSERT(vr.slice_type->kind == Type_Slice);
+ Type *elem = vr.slice_type->Slice.elem;
+ i64 size = type_size_of(elem);
+ i64 align = type_align_of(elem);
+ decl->variadic_reuse_max_bytes = gb_max(decl->variadic_reuse_max_bytes, size*vr.max_count);
+ decl->variadic_reuse_max_align = gb_max(decl->variadic_reuse_max_align, align);
+ }
+
return true;
}
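
For every slice type used as a variadic parameter inside the body, the loop above folds the largest argument count recorded by the checker into one per-procedure budget: the byte requirement per type is size_of(elem) * max_count, and the procedure keeps the maximum byte count and alignment over all such types. A small aggregation sketch with illustrative types:

    // e.g. one call site passing at most 8 eight-byte elements and another passing
    // at most 3 sixteen-byte elements needs max(8*8, 3*16) = 64 bytes of backing storage.
    struct ReuseEntry { long long elem_size, elem_align, max_count; };

    static void fold_reuse(long long &max_bytes, long long &max_align, ReuseEntry const &vr) {
        long long bytes = vr.elem_size * vr.max_count;
        if (bytes > max_bytes)         max_bytes = bytes;
        if (vr.elem_align > max_align) max_align = vr.elem_align;
    }
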
diff --git a/src/check_expr.cpp b/src/check_expr.cpp
index 12acca0cb..82f64738f 100644
--- a/src/check_expr.cpp
+++ b/src/check_expr.cpp
@@ -6033,6 +6033,22 @@ gb_internal CallArgumentError check_call_arguments_internal(CheckerContext *c, A
Entity *vt = pt->params->Tuple.variables[pt->variadic_index];
o.type = vt->type;
+
+ // NOTE(bill, 2024-07-14): minimize the stack usage for variadic parameters with the backing array
+ if (c->decl) {
+ bool found = false;
+ for (auto &vr : c->decl->variadic_reuses) {
+ if (are_types_identical(vt->type, vr.slice_type)) {
+ vr.max_count = gb_max(vr.max_count, variadic_operands.count);
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ array_add(&c->decl->variadic_reuses, VariadicReuseData{vt->type, variadic_operands.count});
+ }
+ }
+
} else {
dummy_argument_count += 1;
o.type = t_untyped_nil;
@@ -7888,12 +7904,15 @@ gb_internal ExprKind check_call_expr(CheckerContext *c, Operand *operand, Ast *c
// NOTE: Due to restrictions in LLVM you can not inline calls with a superset of features.
if (is_call_inlined) {
- GB_ASSERT(c->curr_proc_decl);
- GB_ASSERT(c->curr_proc_decl->entity);
- GB_ASSERT(c->curr_proc_decl->entity->type->kind == Type_Proc);
- String scope_features = c->curr_proc_decl->entity->type->Proc.enable_target_feature;
- if (!check_target_feature_is_superset_of(scope_features, pt->Proc.enable_target_feature, &invalid)) {
- error(call, "Inlined procedure enables target feature '%.*s', this requires the calling procedure to at least enable the same feature", LIT(invalid));
+ if (c->curr_proc_decl == nullptr) {
+ error(call, "Calling a '#force_inline' procedure that enables target features is not allowed at file scope");
+ } else {
+ GB_ASSERT(c->curr_proc_decl->entity);
+ GB_ASSERT(c->curr_proc_decl->entity->type->kind == Type_Proc);
+ String scope_features = c->curr_proc_decl->entity->type->Proc.enable_target_feature;
+ if (!check_target_feature_is_superset_of(scope_features, pt->Proc.enable_target_feature, &invalid)) {
+ error(call, "Inlined procedure enables target feature '%.*s', this requires the calling procedure to at least enable the same feature", LIT(invalid));
+ }
}
}
}
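
The first hunk records, per procedure declaration, the largest number of variadic operands seen for each variadic slice type, which the backend later uses to size a shared backing array. The second hunk replaces an assertion with a proper diagnostic when a '#force_inline' call that enables target features occurs outside any procedure; the rule being enforced is that the caller's enabled feature set must be a superset of the inlined callee's. A self-contained sketch of such a superset check over comma-separated feature strings (illustrative only, not check_target_feature_is_superset_of itself):

    #include <sstream>
    #include <string>
    #include <unordered_set>

    // Returns true when every feature enabled by the callee is also enabled by the caller.
    static bool features_are_superset(std::string const &caller, std::string const &callee,
                                      std::string *first_missing) {
        std::unordered_set<std::string> have;
        std::stringstream cs(caller);
        for (std::string f; std::getline(cs, f, ','); ) have.insert(f);
        std::stringstream ns(callee);
        for (std::string f; std::getline(ns, f, ','); ) {
            if (!f.empty() && have.find(f) == have.end()) {
                if (first_missing) *first_missing = f; // reported in the error above
                return false;
            }
        }
        return true;
    }
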
diff --git a/src/check_type.cpp b/src/check_type.cpp
index dd8559114..fea937e4e 100644
--- a/src/check_type.cpp
+++ b/src/check_type.cpp
@@ -1953,6 +1953,10 @@ gb_internal Type *check_get_params(CheckerContext *ctx, Scope *scope, Ast *_para
error(name, "'#by_ptr' can only be applied to variable fields");
p->flags &= ~FieldFlag_by_ptr;
}
+ if (p->flags&FieldFlag_no_capture) {
+ error(name, "'#no_capture' can only be applied to variable fields");
+ p->flags &= ~FieldFlag_no_capture;
+ }
param = alloc_entity_type_name(scope, name->Ident.token, type, EntityState_Resolved);
param->TypeName.is_type_alias = true;
@@ -2054,6 +2058,28 @@ gb_internal Type *check_get_params(CheckerContext *ctx, Scope *scope, Ast *_para
p->flags &= ~FieldFlag_by_ptr; // Remove the flag
}
}
+ if (p->flags&FieldFlag_no_capture) {
+ if (is_variadic && variadic_index == variables.count) {
+ if (p->flags & FieldFlag_c_vararg) {
+ error(name, "'#no_capture' cannot be applied to a #c_vararg parameter");
+ p->flags &= ~FieldFlag_no_capture;
+ } else {
+ error(name, "'#no_capture' is already implied on all variadic parameters");
+ }
+ } else if (is_type_polymorphic(type)) {
+ // ignore
+ } else {
+ if (is_type_internally_pointer_like(type)) {
+ error(name, "'#no_capture' is currently reserved for future use");
+ } else {
+ ERROR_BLOCK();
+ error(name, "'#no_capture' can only be applied to pointer-like types");
+ error_line("\t'#no_capture' does not currently do anything useful\n");
+ p->flags &= ~FieldFlag_no_capture;
+ }
+ }
+ }
+
if (is_poly_name) {
if (p->flags&FieldFlag_no_alias) {
@@ -2072,6 +2098,11 @@ gb_internal Type *check_get_params(CheckerContext *ctx, Scope *scope, Ast *_para
error(name, "'#by_ptr' can only be applied to variable fields");
p->flags &= ~FieldFlag_by_ptr;
}
+ if (p->flags&FieldFlag_no_capture) {
+ error(name, "'#no_capture' can only be applied to variable fields");
+ p->flags &= ~FieldFlag_no_capture;
+ }
+
if (!is_type_polymorphic(type) && check_constant_parameter_value(type, params[i])) {
// failed
@@ -2091,6 +2122,8 @@ gb_internal Type *check_get_params(CheckerContext *ctx, Scope *scope, Ast *_para
param->flags |= EntityFlag_Ellipsis;
if (is_c_vararg) {
param->flags |= EntityFlag_CVarArg;
+ } else {
+ param->flags |= EntityFlag_NoCapture;
}
}
@@ -2115,6 +2148,10 @@ gb_internal Type *check_get_params(CheckerContext *ctx, Scope *scope, Ast *_para
if (p->flags&FieldFlag_by_ptr) {
param->flags |= EntityFlag_ByPtr;
}
+ if (p->flags&FieldFlag_no_capture) {
+ param->flags |= EntityFlag_NoCapture;
+ }
+
param->state = EntityState_Resolved; // NOTE(bill): This should have been resolved whilst determining it
add_entity(ctx, scope, name, param);
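
Taken together, these hunks give '#no_capture' the following behaviour: it is rejected on type and constant fields, rejected on '#c_vararg' parameters, reported as redundant on ordinary variadic parameters (which now get EntityFlag_NoCapture implicitly), ignored on polymorphic types until instantiation, and otherwise accepted only on pointer-like types, where it is still reported as reserved for future use. A condensed decision sketch with illustrative names:

    enum class NoCaptureVerdict { Implied, Ignored, ReservedPointerLike, Rejected };

    // Mirrors the branch structure above; not the checker's real types or flags.
    static NoCaptureVerdict classify_no_capture(bool is_variadic, bool is_c_vararg,
                                                bool is_polymorphic, bool is_pointer_like) {
        if (is_variadic)     return is_c_vararg ? NoCaptureVerdict::Rejected   // error on #c_vararg
                                                : NoCaptureVerdict::Implied;   // error: already implied
        if (is_polymorphic)  return NoCaptureVerdict::Ignored;                 // decided after instantiation
        if (is_pointer_like) return NoCaptureVerdict::ReservedPointerLike;     // error: reserved for future use
        return NoCaptureVerdict::Rejected;                                     // error: pointer-like types only
    }
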
diff --git a/src/checker.cpp b/src/checker.cpp
index 8756cce1a..336440d32 100644
--- a/src/checker.cpp
+++ b/src/checker.cpp
@@ -184,6 +184,9 @@ gb_internal void init_decl_info(DeclInfo *d, Scope *scope, DeclInfo *parent) {
ptr_set_init(&d->deps, 0);
ptr_set_init(&d->type_info_deps, 0);
d->labels.allocator = heap_allocator();
+ d->variadic_reuses.allocator = heap_allocator();
+ d->variadic_reuse_max_bytes = 0;
+ d->variadic_reuse_max_align = 1;
}
gb_internal DeclInfo *make_decl_info(Scope *scope, DeclInfo *parent) {
diff --git a/src/checker.hpp b/src/checker.hpp
index 781737140..d76e4c7d0 100644
--- a/src/checker.hpp
+++ b/src/checker.hpp
@@ -181,6 +181,11 @@ char const *ProcCheckedState_strings[ProcCheckedState_COUNT] {
"Checked",
};
+struct VariadicReuseData {
+ Type *slice_type; // ..elem_type
+ i64 max_count;
+};
+
// DeclInfo is used to store information of certain declarations to allow for "any order" usage
struct DeclInfo {
DeclInfo * parent; // NOTE(bill): only used for procedure literals at the moment
@@ -219,6 +224,10 @@ struct DeclInfo {
Array<BlockLabel> labels;
+ Array<VariadicReuseData> variadic_reuses;
+ i64 variadic_reuse_max_bytes;
+ i64 variadic_reuse_max_align;
+
// NOTE(bill): this is to prevent a race condition since these procedure literals can be created anywhere at any time
struct lbModule *code_gen_module;
};
diff --git a/src/entity.cpp b/src/entity.cpp
index 41d84e0f7..db6ffdd52 100644
--- a/src/entity.cpp
+++ b/src/entity.cpp
@@ -45,7 +45,7 @@ enum EntityFlag : u64 {
EntityFlag_Value = 1ull<<11,
EntityFlag_BitFieldField = 1ull<<12,
-
+ EntityFlag_NoCapture = 1ull<<13, // #no_capture
EntityFlag_PolyConst = 1ull<<15,
EntityFlag_NotExported = 1ull<<16,
diff --git a/src/llvm_abi.cpp b/src/llvm_abi.cpp
index b2e485d01..c21cd0a46 100644
--- a/src/llvm_abi.cpp
+++ b/src/llvm_abi.cpp
@@ -15,6 +15,7 @@ struct lbArgType {
LLVMAttributeRef align_attribute; // Optional
i64 byval_alignment;
bool is_byval;
+ bool no_capture;
};
@@ -159,6 +160,11 @@ gb_internal void lb_add_function_type_attributes(LLVMValueRef fn, lbFunctionType
LLVMAddAttributeAtIndex(fn, arg_index+1, arg->align_attribute);
}
+ if (arg->no_capture) {
+ LLVMAddAttributeAtIndex(fn, arg_index+1, nocapture_attr);
+ }
+
+
if (ft->multiple_return_original_type) {
if (ft->original_arg_count <= i) {
LLVMAddAttributeAtIndex(fn, arg_index+1, noalias_attr);
@@ -645,10 +651,10 @@ namespace lbAbiAmd64SysV {
if (is_mem_cls(cls, attribute_kind)) {
LLVMAttributeRef attribute = nullptr;
if (attribute_kind == Amd64TypeAttribute_ByVal) {
- // if (!is_calling_convention_odin(calling_convention)) {
- return lb_arg_type_indirect_byval(c, type);
- // }
- // attribute = nullptr;
+ if (is_calling_convention_odin(calling_convention)) {
+ return lb_arg_type_indirect(type, attribute);
+ }
+ return lb_arg_type_indirect_byval(c, type);
} else if (attribute_kind == Amd64TypeAttribute_StructRect) {
attribute = lb_create_enum_attribute_with_type(c, "sret", type);
}
diff --git a/src/llvm_backend.cpp b/src/llvm_backend.cpp
index 52661dfa7..ae46186ed 100644
--- a/src/llvm_backend.cpp
+++ b/src/llvm_backend.cpp
@@ -1570,6 +1570,7 @@ gb_internal WORKER_TASK_PROC(lb_llvm_module_pass_worker_proc) {
switch (build_context.optimization_level) {
case -1:
+ array_add(&passes, "function(annotation-remarks)");
break;
case 0:
array_add(&passes, "always-inline");
diff --git a/src/llvm_backend.hpp b/src/llvm_backend.hpp
index 005358734..deb05528f 100644
--- a/src/llvm_backend.hpp
+++ b/src/llvm_backend.hpp
@@ -296,6 +296,11 @@ enum lbProcedureFlag : u32 {
lbProcedureFlag_DebugAllocaCopy = 1<<1,
};
+struct lbVariadicReuseSlices {
+ Type *slice_type;
+ lbAddr slice_addr;
+};
+
struct lbProcedure {
u32 flags;
u16 state_flags;
@@ -336,8 +341,10 @@ struct lbProcedure {
bool in_multi_assignment;
Array<LLVMValueRef> raw_input_parameters;
- LLVMValueRef temp_callee_return_struct_memory;
+ Array<lbVariadicReuseSlices> variadic_reuses;
+ lbAddr variadic_reuse_base_array_ptr;
+ LLVMValueRef temp_callee_return_struct_memory;
Ast *curr_stmt;
Array<Scope *> scope_stack;
diff --git a/src/llvm_backend_proc.cpp b/src/llvm_backend_proc.cpp
index 610c34de2..5270d6c30 100644
--- a/src/llvm_backend_proc.cpp
+++ b/src/llvm_backend_proc.cpp
@@ -253,6 +253,11 @@ gb_internal lbProcedure *lb_create_procedure(lbModule *m, Entity *entity, bool i
if (e->flags&EntityFlag_NoAlias) {
lb_add_proc_attribute_at_index(p, offset+parameter_index, "noalias");
}
+ if (e->flags&EntityFlag_NoCapture) {
+ if (is_type_internally_pointer_like(e->type)) {
+ lb_add_proc_attribute_at_index(p, offset+parameter_index, "nocapture");
+ }
+ }
parameter_index += 1;
}
}
@@ -517,6 +522,7 @@ gb_internal void lb_begin_procedure_body(lbProcedure *p) {
lb_start_block(p, p->entry_block);
map_init(&p->direct_parameters);
+ p->variadic_reuses.allocator = heap_allocator();
GB_ASSERT(p->type != nullptr);
@@ -3450,17 +3456,59 @@ gb_internal lbValue lb_build_call_expr_internal(lbProcedure *p, Ast *expr) {
}
isize slice_len = var_args.count;
if (slice_len > 0) {
- lbAddr slice = lb_add_local_generated(p, slice_type, true);
- lbAddr base_array = lb_add_local_generated(p, alloc_type_array(elem_type, slice_len), true);
+ lbAddr slice = {};
+
+ for (auto const &vr : p->variadic_reuses) {
+ if (are_types_identical(vr.slice_type, slice_type)) {
+ slice = vr.slice_addr;
+ break;
+ }
+ }
+
+ DeclInfo *d = decl_info_of_entity(p->entity);
+ if (d != nullptr && slice.addr.value == nullptr) {
+ for (auto const &vr : d->variadic_reuses) {
+ if (are_types_identical(vr.slice_type, slice_type)) {
+ #if LLVM_VERSION_MAJOR >= 13
+ // NOTE(bill): No point wasting even more memory, just reuse this stack variable too
+ if (p->variadic_reuses.count > 0) {
+ slice = p->variadic_reuses[0].slice_addr;
+ } else {
+ slice = lb_add_local_generated(p, slice_type, true);
+ }
+ // NOTE(bill): Change the underlying type to match the specific type
+ slice.addr.type = alloc_type_pointer(slice_type);
+ #else
+ slice = lb_add_local_generated(p, slice_type, true);
+ #endif
+ array_add(&p->variadic_reuses, lbVariadicReuseSlices{slice_type, slice});
+ break;
+ }
+ }
+ }
+
+ lbValue base_array_ptr = p->variadic_reuse_base_array_ptr.addr;
+ if (d != nullptr && base_array_ptr.value == nullptr) {
+ i64 max_bytes = d->variadic_reuse_max_bytes;
+ i64 max_align = gb_max(d->variadic_reuse_max_align, 16);
+ p->variadic_reuse_base_array_ptr = lb_add_local_generated(p, alloc_type_array(t_u8, max_bytes), true);
+ lb_try_update_alignment(p->variadic_reuse_base_array_ptr.addr, cast(unsigned)max_align);
+ base_array_ptr = p->variadic_reuse_base_array_ptr.addr;
+ }
+
+ GB_ASSERT(base_array_ptr.value != nullptr);
+ GB_ASSERT(slice.addr.value != nullptr);
+
+ base_array_ptr = lb_emit_conv(p, base_array_ptr, alloc_type_pointer(alloc_type_array(elem_type, slice_len)));
for (isize i = 0; i < var_args.count; i++) {
- lbValue addr = lb_emit_array_epi(p, base_array.addr, cast(i32)i);
+ lbValue addr = lb_emit_array_epi(p, base_array_ptr, cast(i32)i);
lbValue var_arg = var_args[i];
var_arg = lb_emit_conv(p, var_arg, elem_type);
lb_emit_store(p, addr, var_arg);
}
- lbValue base_elem = lb_emit_array_epi(p, base_array.addr, 0);
+ lbValue base_elem = lb_emit_array_epi(p, base_array_ptr, 0);
lbValue len = lb_const_int(p->module, t_int, slice_len);
lb_fill_slice(p, slice, base_elem, len);
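
Using the sizes computed by the checker, a procedure now materialises at most one slice header per distinct variadic slice type and a single max-sized, at-least-16-byte-aligned u8 backing array; each variadic call site re-casts that array to [N]elem for its own argument count instead of allocating a fresh local array. A standalone sketch of the reuse shape (the 64-byte capacity and the template are illustrative, not the backend's code):

    #include <cstddef>
    #include <cstring>

    struct SharedVarargBuffer {
        alignas(16) unsigned char bytes[64]; // plays the role of the variadic_reuse_max_bytes array
    };

    // Each call site copies its arguments into the shared buffer and builds its slice
    // over it, so multiple call sites in one procedure reuse the same stack space.
    template <typename Elem, size_t N>
    static Elem *fill_varargs(SharedVarargBuffer &buf, Elem const (&args)[N]) {
        static_assert(sizeof(Elem) * N <= sizeof(buf.bytes), "buffer covers the largest call site");
        Elem *slot = reinterpret_cast<Elem *>(buf.bytes);
        std::memcpy(slot, args, sizeof(Elem) * N);
        return slot; // slice data pointer; the slice length is N
    }
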
diff --git a/src/main.cpp b/src/main.cpp
index e6a0aecf0..388184be9 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -399,6 +399,8 @@ enum BuildFlagKind {
BuildFlag_Sanitize,
+ BuildFlag_FastBuild,
+
#if defined(GB_SYSTEM_WINDOWS)
BuildFlag_IgnoreVsSearch,
BuildFlag_ResourceFile,
@@ -605,6 +607,9 @@ gb_internal bool parse_build_flags(Array<String> args) {
add_flag(&build_flags, BuildFlag_Sanitize, str_lit("sanitize"), BuildFlagParam_String, Command__does_build, true);
+ add_flag(&build_flags, BuildFlag_FastBuild, str_lit("fast-build"), BuildFlagParam_None, Command__does_build);
+
+
#if defined(GB_SYSTEM_WINDOWS)
add_flag(&build_flags, BuildFlag_IgnoreVsSearch, str_lit("ignore-vs-search"), BuildFlagParam_None, Command__does_build);
add_flag(&build_flags, BuildFlag_ResourceFile, str_lit("resource"), BuildFlagParam_String, Command__does_build);
@@ -1441,6 +1446,13 @@ gb_internal bool parse_build_flags(Array<String> args) {
}
break;
+
+ case BuildFlag_FastBuild:
+ build_context.custom_optimization_level = true;
+ build_context.optimization_level = -1;
+ build_context.use_separate_modules = true;
+ break;
+
#if defined(GB_SYSTEM_WINDOWS)
case BuildFlag_IgnoreVsSearch: {
GB_ASSERT(value.kind == ExactValue_Invalid);
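
The new -fast-build flag is shorthand for the fastest compile configuration: it forces the optimization level to -1 (-o:none) and enables use_separate_modules (splitting code generation across multiple LLVM modules). A hedged usage example (how it interacts with an explicitly passed -o: flag is not shown in these hunks):

    odin build . -fast-build
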
diff --git a/src/parser.cpp b/src/parser.cpp
index 9ce3d563d..a6a146cfd 100644
--- a/src/parser.cpp
+++ b/src/parser.cpp
@@ -4014,6 +4014,7 @@ struct ParseFieldPrefixMapping {
gb_global ParseFieldPrefixMapping const parse_field_prefix_mappings[] = {
{str_lit("using"), Token_using, FieldFlag_using},
{str_lit("no_alias"), Token_Hash, FieldFlag_no_alias},
+ {str_lit("no_capture"), Token_Hash, FieldFlag_no_capture},
{str_lit("c_vararg"), Token_Hash, FieldFlag_c_vararg},
{str_lit("const"), Token_Hash, FieldFlag_const},
{str_lit("any_int"), Token_Hash, FieldFlag_any_int},
diff --git a/src/parser.hpp b/src/parser.hpp
index 86b3393af..451cdf53d 100644
--- a/src/parser.hpp
+++ b/src/parser.hpp
@@ -331,8 +331,10 @@ enum FieldFlag : u32 {
FieldFlag_by_ptr = 1<<8,
FieldFlag_no_broadcast = 1<<9, // disallow array programming
+ FieldFlag_no_capture = 1<<11,
+
// Internal use by the parser only
- FieldFlag_Tags = 1<<10,
+ FieldFlag_Tags = 1<<15,
FieldFlag_Results = 1<<16,
@@ -340,7 +342,10 @@ enum FieldFlag : u32 {
FieldFlag_Invalid = 1u<<31,
// Parameter List Restrictions
- FieldFlag_Signature = FieldFlag_ellipsis|FieldFlag_using|FieldFlag_no_alias|FieldFlag_c_vararg|FieldFlag_const|FieldFlag_any_int|FieldFlag_by_ptr|FieldFlag_no_broadcast,
+ FieldFlag_Signature = FieldFlag_ellipsis|FieldFlag_using|FieldFlag_no_alias|FieldFlag_c_vararg|
+ FieldFlag_const|FieldFlag_any_int|FieldFlag_by_ptr|FieldFlag_no_broadcast|
+ FieldFlag_no_capture,
+
FieldFlag_Struct = FieldFlag_using|FieldFlag_subtype|FieldFlag_Tags,
};
diff --git a/src/thread_pool.cpp b/src/thread_pool.cpp
index 5dbbe37c4..62cca6de6 100644
--- a/src/thread_pool.cpp
+++ b/src/thread_pool.cpp
@@ -10,13 +10,18 @@ gb_internal void thread_pool_destroy(ThreadPool *pool);
gb_internal bool thread_pool_add_task(ThreadPool *pool, WorkerTaskProc *proc, void *data);
gb_internal void thread_pool_wait(ThreadPool *pool);
+enum GrabState {
+ Grab_Success = 0,
+ Grab_Empty = 1,
+ Grab_Failed = 2,
+};
+
struct ThreadPool {
- gbAllocator threads_allocator;
- Slice<Thread> threads;
+ gbAllocator threads_allocator;
+ Slice<Thread> threads;
std::atomic<bool> running;
Futex tasks_available;
-
Futex tasks_left;
};
@@ -46,7 +51,7 @@ gb_internal void thread_pool_destroy(ThreadPool *pool) {
for_array_off(i, 1, pool->threads) {
Thread *t = &pool->threads[i];
- pool->tasks_available.fetch_add(1, std::memory_order_relaxed);
+ pool->tasks_available.fetch_add(1, std::memory_order_acquire);
futex_broadcast(&pool->tasks_available);
thread_join_and_destroy(t);
}
@@ -54,51 +59,86 @@ gb_internal void thread_pool_destroy(ThreadPool *pool) {
gb_free(pool->threads_allocator, pool->threads.data);
}
-void thread_pool_queue_push(Thread *thread, WorkerTask task) {
- u64 capture;
- u64 new_capture;
- do {
- capture = thread->head_and_tail.load();
-
- u64 mask = thread->capacity - 1;
- u64 head = (capture >> 32) & mask;
- u64 tail = ((u32)capture) & mask;
+TaskRingBuffer *task_ring_grow(TaskRingBuffer *ring, isize bottom, isize top) {
+ TaskRingBuffer *new_ring = task_ring_init(ring->size * 2);
+ for (isize i = top; i < bottom; i++) {
+ new_ring->buffer[i % new_ring->size] = ring->buffer[i % ring->size];
+ }
+ return new_ring;
+}
- u64 new_head = (head + 1) & mask;
- GB_ASSERT_MSG(new_head != tail, "Thread Queue Full!");
+void thread_pool_queue_push(Thread *thread, WorkerTask task) {
+ isize bot = thread->queue.bottom.load(std::memory_order_relaxed);
+ isize top = thread->queue.top.load(std::memory_order_acquire);
+ TaskRingBuffer *cur_ring = thread->queue.ring.load(std::memory_order_relaxed);
+
+ isize size = bot - top;
+ if (size > (cur_ring->size - 1)) {
+ // Queue is full
+ thread->queue.ring = task_ring_grow(thread->queue.ring, bot, top);
+ cur_ring = thread->queue.ring.load(std::memory_order_relaxed);
+ }
- // This *must* be done in here, to avoid a potential race condition where we no longer own the slot by the time we're assigning
- thread->queue[head] = task;
- new_capture = (new_head << 32) | tail;
- } while (!thread->head_and_tail.compare_exchange_weak(capture, new_capture));
+ cur_ring->buffer[bot % cur_ring->size] = task;
+ std::atomic_thread_fence(std::memory_order_release);
+ thread->queue.bottom.store(bot + 1, std::memory_order_relaxed);
thread->pool->tasks_left.fetch_add(1, std::memory_order_release);
thread->pool->tasks_available.fetch_add(1, std::memory_order_relaxed);
futex_broadcast(&thread->pool->tasks_available);
}
-bool thread_pool_queue_pop(Thread *thread, WorkerTask *task) {
- u64 capture;
- u64 new_capture;
- do {
- capture = thread->head_and_tail.load(std::memory_order_acquire);
-
- u64 mask = thread->capacity - 1;
- u64 head = (capture >> 32) & mask;
- u64 tail = ((u32)capture) & mask;
+GrabState thread_pool_queue_take(Thread *thread, WorkerTask *task) {
+ isize bot = thread->queue.bottom.load(std::memory_order_relaxed) - 1;
+ TaskRingBuffer *cur_ring = thread->queue.ring.load(std::memory_order_relaxed);
+ thread->queue.bottom.store(bot, std::memory_order_relaxed);
+ std::atomic_thread_fence(std::memory_order_seq_cst);
+
+ isize top = thread->queue.top.load(std::memory_order_relaxed);
+ if (top <= bot) {
+
+ // Queue is not empty
+ *task = cur_ring->buffer[bot % cur_ring->size];
+ if (top == bot) {
+ // Only one entry left in queue
+ if (!thread->queue.top.compare_exchange_strong(top, top + 1, std::memory_order_seq_cst, std::memory_order_relaxed)) {
+ // Race failed
+ thread->queue.bottom.store(bot + 1, std::memory_order_relaxed);
+ return Grab_Empty;
+ }
- u64 new_tail = (tail + 1) & mask;
- if (tail == head) {
- return false;
+ thread->queue.bottom.store(bot + 1, std::memory_order_relaxed);
+ return Grab_Success;
}
- // Making a copy of the task before we increment the tail, avoiding the same potential race condition as above
- *task = thread->queue[tail];
-
- new_capture = (head << 32) | new_tail;
- } while (!thread->head_and_tail.compare_exchange_weak(capture, new_capture, std::memory_order_release));
+ // We got a task without hitting a race
+ return Grab_Success;
+ } else {
+ // Queue is empty
+ thread->queue.bottom.store(bot + 1, std::memory_order_relaxed);
+ return Grab_Empty;
+ }
+}
- return true;
+GrabState thread_pool_queue_steal(Thread *thread, WorkerTask *task) {
+ isize top = thread->queue.top.load(std::memory_order_acquire);
+ std::atomic_thread_fence(std::memory_order_seq_cst);
+ isize bot = thread->queue.bottom.load(std::memory_order_acquire);
+
+ GrabState ret = Grab_Empty;
+ if (top < bot) {
+ // Queue is not empty
+ TaskRingBuffer *cur_ring = thread->queue.ring.load(std::memory_order_consume);
+ *task = cur_ring->buffer[top % cur_ring->size];
+
+ if (!thread->queue.top.compare_exchange_strong(top, top + 1, std::memory_order_seq_cst, std::memory_order_relaxed)) {
+ // Race failed
+ ret = Grab_Failed;
+ } else {
+ ret = Grab_Success;
+ }
+ }
+ return ret;
}
gb_internal bool thread_pool_add_task(ThreadPool *pool, WorkerTaskProc *proc, void *data) {
@@ -115,12 +155,11 @@ gb_internal void thread_pool_wait(ThreadPool *pool) {
while (pool->tasks_left.load(std::memory_order_acquire)) {
// if we've got tasks on our queue, run them
- while (thread_pool_queue_pop(current_thread, &task)) {
+ while (!thread_pool_queue_take(current_thread, &task)) {
task.do_work(task.data);
pool->tasks_left.fetch_sub(1, std::memory_order_release);
}
-
// is this mem-barriered enough?
// This *must* be executed in this order, so the futex wakes immediately
// if rem_tasks has changed since we checked last, otherwise the program
@@ -145,7 +184,7 @@ gb_internal THREAD_PROC(thread_pool_thread_proc) {
usize finished_tasks = 0;
i32 state;
- while (thread_pool_queue_pop(current_thread, &task)) {
+ while (!thread_pool_queue_take(current_thread, &task)) {
task.do_work(task.data);
pool->tasks_left.fetch_sub(1, std::memory_order_release);
@@ -167,7 +206,12 @@ gb_internal THREAD_PROC(thread_pool_thread_proc) {
Thread *thread = &pool->threads.data[idx];
WorkerTask task;
- if (thread_pool_queue_pop(thread, &task)) {
+
+ GrabState ret = thread_pool_queue_steal(thread, &task);
+ switch (ret) {
+ case Grab_Empty:
+ continue;
+ case Grab_Success:
task.do_work(task.data);
pool->tasks_left.fetch_sub(1, std::memory_order_release);
@@ -175,6 +219,8 @@ gb_internal THREAD_PROC(thread_pool_thread_proc) {
futex_signal(&pool->tasks_left);
}
+ /*fallthrough*/
+ case Grab_Failed:
goto main_loop_continue;
}
}
@@ -182,6 +228,7 @@ gb_internal THREAD_PROC(thread_pool_thread_proc) {
// if we've done all our work, and there's nothing to steal, go to sleep
state = pool->tasks_available.load(std::memory_order_acquire);
+ if (!pool->running) { break; }
futex_wait(&pool->tasks_available, state);
main_loop_continue:;
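
The fixed-capacity queue guarded by a packed head/tail CAS is replaced with a growable single-owner, multi-thief deque: the owning thread pushes and takes at the bottom, other workers steal from the top with a compare-exchange, and GrabState distinguishes an empty victim (Grab_Empty, move on to the next thread) from a lost steal race (Grab_Failed). A deliberately simplified, self-contained sketch of the same discipline (fixed capacity and default seq_cst orderings for clarity; the real TaskQueue uses weaker orderings with explicit fences plus a growable ring):

    #include <atomic>
    #include <cstddef>

    enum Grab { GrabSuccess, GrabEmpty, GrabFailed };

    template <typename T, size_t N>  // fixed capacity; the real ring grows on demand
    struct MiniDeque {
        std::atomic<long> top{0};
        std::atomic<long> bottom{0};
        T buffer[N];

        void push(T v) {             // owner thread only; keep fewer than N items outstanding
            long b = bottom.load();
            buffer[b % N] = v;
            bottom.store(b + 1);
        }
        Grab take(T *out) {          // owner thread only; pops from the bottom
            long b = bottom.load() - 1;
            bottom.store(b);
            long t = top.load();
            if (t > b) { bottom.store(b + 1); return GrabEmpty; }
            *out = buffer[b % N];
            if (t == b) {            // last element: race against concurrent stealers
                Grab r = top.compare_exchange_strong(t, t + 1) ? GrabSuccess : GrabEmpty;
                bottom.store(b + 1);
                return r;
            }
            return GrabSuccess;
        }
        Grab steal(T *out) {         // any other thread; pops from the top
            long t = top.load();
            long b = bottom.load();
            if (t >= b) return GrabEmpty;
            *out = buffer[t % N];
            return top.compare_exchange_strong(t, t + 1) ? GrabSuccess : GrabFailed;
        }
    };
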
diff --git a/src/threading.cpp b/src/threading.cpp
index 717dcb874..ff0fdfcde 100644
--- a/src/threading.cpp
+++ b/src/threading.cpp
@@ -46,6 +46,18 @@ typedef struct WorkerTask {
void *data;
} WorkerTask;
+typedef struct TaskRingBuffer {
+ std::atomic<isize> size;
+ std::atomic<WorkerTask *> buffer;
+} TaskRingBuffer;
+
+typedef struct TaskQueue {
+ std::atomic<isize> top;
+ std::atomic<isize> bottom;
+
+ std::atomic<TaskRingBuffer *> ring;
+} TaskQueue;
+
struct Thread {
#if defined(GB_SYSTEM_WINDOWS)
void *win32_handle;
@@ -54,12 +66,9 @@ struct Thread {
#endif
isize idx;
+ isize stack_size;
- WorkerTask *queue;
- size_t capacity;
- std::atomic<uint64_t> head_and_tail;
-
- isize stack_size;
+ struct TaskQueue queue;
struct ThreadPool *pool;
};
@@ -551,6 +560,18 @@ gb_internal void *internal_thread_proc(void *arg) {
}
#endif
+TaskRingBuffer *task_ring_init(isize size) {
+ TaskRingBuffer *ring = gb_alloc_item(heap_allocator(), TaskRingBuffer);
+ ring->size = size;
+ ring->buffer = gb_alloc_array(heap_allocator(), WorkerTask, ring->size);
+ return ring;
+}
+
+void thread_queue_destroy(TaskQueue *q) {
+ gb_free(heap_allocator(), (*q->ring).buffer);
+ gb_free(heap_allocator(), q->ring);
+}
+
gb_internal void thread_init(ThreadPool *pool, Thread *t, isize idx) {
gb_zero_item(t);
#if defined(GB_SYSTEM_WINDOWS)
@@ -559,14 +580,12 @@ gb_internal void thread_init(ThreadPool *pool, Thread *t, isize idx) {
t->posix_handle = 0;
#endif
- t->capacity = 1 << 14; // must be a power of 2
- t->queue = gb_alloc_array(heap_allocator(), WorkerTask, t->capacity);
- t->head_and_tail = 0;
+ // Size must be a power of 2
+ t->queue.ring = task_ring_init(1 << 14);
t->pool = pool;
t->idx = idx;
}
-
gb_internal void thread_init_and_start(ThreadPool *pool, Thread *t, isize idx) {
thread_init(pool, t, idx);
isize stack_size = 0;
@@ -598,7 +617,7 @@ gb_internal void thread_join_and_destroy(Thread *t) {
t->posix_handle = 0;
#endif
- gb_free(heap_allocator(), t->queue);
+ thread_queue_destroy(&t->queue);
}
gb_internal void thread_set_name(Thread *t, char const *name) {
diff --git a/src/types.cpp b/src/types.cpp
index c3a5fb539..92b187cdb 100644
--- a/src/types.cpp
+++ b/src/types.cpp
@@ -2923,11 +2923,14 @@ gb_internal Type *c_vararg_promote_type(Type *type) {
if (core->kind == Type_Basic) {
switch (core->Basic.kind) {
+ case Basic_f16:
case Basic_f32:
case Basic_UntypedFloat:
return t_f64;
+ case Basic_f16le:
case Basic_f32le:
return t_f64le;
+ case Basic_f16be:
case Basic_f32be:
return t_f64be;
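
This extends C vararg promotion to the half-precision types: f16, f16le and f16be now promote to the matching f64 variant, in line with C's default argument promotions, which already turn float into double when it is passed through a "..." parameter list. The C-side rule, for reference:

    #include <cstdarg>
    #include <cstdio>

    // Variadic C ABI: the caller promotes float arguments to double,
    // so the callee must read them back as double.
    static void print_all(int count, ...) {
        va_list args;
        va_start(args, count);
        for (int i = 0; i < count; i++) {
            double v = va_arg(args, double);
            std::printf("%f\n", v);
        }
        va_end(args);
    }
    // print_all(2, 1.5f, 2.5f); // both float arguments arrive as double
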