From bb551ededd63794e96314303558679931ed41beb Mon Sep 17 00:00:00 2001
From: Andre Weissflog
Date: Sat, 24 Jan 2026 18:00:33 +0100
Subject: sokol_gfx.h vk: fix catastrophic performance of uniform updates on Intel Windows driver. (#1424)

---
 CHANGELOG.md |  10 ++++++
 sokol_gfx.h  | 101 ++++++++++++++++++++++++++++++++++++-----------------------
 2 files changed, 71 insertions(+), 40 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1698b5d0..9f5c9f1d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,16 @@
 - sokol_app.h/sokol_gfx.h vk: object debug labels are now working in the experimental Vulkan
   backend (via `VK_EXT_debug_utils`). Debug labels are only set in debug mode (e.g. `SOKOL_DEBUG` is defined).
   PR: https://github.com/floooh/sokol/pull/1422
+- sokol_gfx.h vk: Fix catastrophic performance of uniform updates on the Intel Vulkan
+  driver for Windows. This was caused by letting the descriptor-buffer-extension
+  function `vkGetDescriptorEXT()` write the descriptor data directly into the descriptor
+  buffer, which completely destroys performance on the Intel Windows driver (interestingly,
+  this is fast on the same machine in Linux). The fix is now to write the descriptor data
+  into an intermediate sysmem chunk and then do a single memcpy() of the entire descriptor
+  set into the descriptor buffer. As a bonus, there are also no more redundant `vkGetDescriptorEXT()`
+  calls when multiple uniform blocks are used in a shader. Overall performance in the `drawcallperf-sapp`
+  sample is still behind GL and D3D11, but at least it's roughly in the same ballpark now.
+  PR: https://github.com/floooh/sokol/pull/1424
 
 ### 19-Jan-2026
 
diff --git a/sokol_gfx.h b/sokol_gfx.h
index 8a5d6495..93659030 100644
--- a/sokol_gfx.h
+++ b/sokol_gfx.h
@@ -4466,6 +4466,7 @@ typedef struct sg_stats {
     _SG_LOGITEM_XMACRO(VULKAN_STORAGEIMAGE_SPIRV_SET1_BINDING_OUT_OF_RANGE, "vulkan: storage image 'spirv_set1_binding_n' is out of range (must be 0..127)") \
     _SG_LOGITEM_XMACRO(VULKAN_SAMPLER_SPIRV_SET1_BINDING_OUT_OF_RANGE, "vulkan: sampler 'spirv_set1_binding_n' is out of range (must be 0..127)") \
     _SG_LOGITEM_XMACRO(VULKAN_CREATE_DESCRIPTOR_SET_LAYOUT_FAILED, "vulkan: vkCreateDescriptorSetLayout() failed!") \
+    _SG_LOGITEM_XMACRO(VULKAN_SHADER_UNIFORM_DESCRIPTOR_SET_SIZE_VS_CACHE_SIZE, "vulkan: shader uniform descriptor set is too big for the descriptor set cache (please write a Github issue)") \
     _SG_LOGITEM_XMACRO(VULKAN_CREATE_PIPELINE_LAYOUT_FAILED, "vulkan: vkCreatePipelineLayout() failed!") \
     _SG_LOGITEM_XMACRO(VULKAN_CREATE_GRAPHICS_PIPELINE_FAILED, "vulkan: vkCreateGraphicsPipelines() failed!") \
     _SG_LOGITEM_XMACRO(VULKAN_CREATE_COMPUTE_PIPELINE_FAILED, "vulkan: vkCreateComputePipelines() failed!") \
@@ -7093,11 +7094,6 @@ typedef struct {
     } slots[SG_NUM_INFLIGHT_FRAMES];
 } _sg_vk_shared_buffer_t;
 
-typedef struct {
-    VkDescriptorAddressInfoEXT addr_info;
-    VkDescriptorGetInfoEXT get_info;
-} _sg_vk_uniform_bindinfo_t;
-
 typedef struct {
     bool valid;
     VkInstance instance;
@@ -7145,9 +7141,14 @@ typedef struct {
         _sg_vk_shared_buffer_t stream;
     } stage;
     // uniform update system
-    bool uniforms_dirty;
-    _sg_vk_shared_buffer_t uniform;
-    _sg_vk_uniform_bindinfo_t uniform_bindinfos[SG_MAX_UNIFORMBLOCK_BINDSLOTS];
+    struct {
+        bool dirty;
+        _sg_vk_shared_buffer_t dbuf;    // descriptor buffer
+        VkDescriptorAddressInfoEXT addr_info[SG_MAX_UNIFORMBLOCK_BINDSLOTS];
+        VkDescriptorGetInfoEXT get_info[SG_MAX_UNIFORMBLOCK_BINDSLOTS];
+        size_t dset_cache_size;
+        uint8_t* dset_cache;
+    } uniforms;
     // resource binding system (using descriptor buffers)
     _sg_vk_shared_buffer_t bind;
     // hazard tracking system for buffers and images
@@ -19760,44 +19761,53 @@ _SOKOL_PRIVATE void _sg_vk_staging_stream_image_data(_sg_image_t* img, const sg_
 // uniform data system
 _SOKOL_PRIVATE void _sg_vk_uniform_init(void) {
     SOKOL_ASSERT(_sg.desc.uniform_buffer_size > 0);
-    _sg_vk_shared_buffer_init(&_sg.vk.uniform,
+    SOKOL_ASSERT(0 == _sg.vk.uniforms.dset_cache);
+
+    _sg_vk_shared_buffer_init(&_sg.vk.uniforms.dbuf,
         (uint32_t)_sg.desc.uniform_buffer_size,
         (uint32_t)_sg.vk.dev_props.properties.limits.minUniformBufferOffsetAlignment,
         _SG_VK_MEMTYPE_UNIFORMS,
         "shared-uniform-buffer");
+
     for (size_t i = 0; i < SG_MAX_UNIFORMBLOCK_BINDSLOTS; i++) {
-        _sg_vk_uniform_bindinfo_t* ubi = &_sg.vk.uniform_bindinfos[i];
-        ubi->addr_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_ADDRESS_INFO_EXT;
-        ubi->get_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_GET_INFO_EXT;
-        ubi->get_info.type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
-        ubi->get_info.data.pUniformBuffer = &ubi->addr_info;
+        _sg.vk.uniforms.addr_info[i].sType = VK_STRUCTURE_TYPE_DESCRIPTOR_ADDRESS_INFO_EXT;
+        _sg.vk.uniforms.get_info[i].sType = VK_STRUCTURE_TYPE_DESCRIPTOR_GET_INFO_EXT;
+        _sg.vk.uniforms.get_info[i].type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
+        _sg.vk.uniforms.get_info[i].data.pUniformBuffer = &_sg.vk.uniforms.addr_info[i];
     }
+
+    // NOTE: we assume here that the max alignment for uniform buffer
+    // descriptors in the descriptor buffer is the same as the assumed max
+    // descriptor size (e.g. 256 bytes)
+    _sg.vk.uniforms.dset_cache_size = SG_MAX_UNIFORMBLOCK_BINDSLOTS * _SG_VK_MAX_DESCRIPTOR_DATA_SIZE;
+    _sg.vk.uniforms.dset_cache = (uint8_t*)_sg_malloc_clear(_sg.vk.uniforms.dset_cache_size);
 }
 
 _SOKOL_PRIVATE void _sg_vk_uniform_discard(void) {
-    _sg_vk_shared_buffer_discard(&_sg.vk.uniform);
+    SOKOL_ASSERT(_sg.vk.uniforms.dset_cache);
+    _sg_free(_sg.vk.uniforms.dset_cache); _sg.vk.uniforms.dset_cache = 0;
+    _sg_vk_shared_buffer_discard(&_sg.vk.uniforms.dbuf);
 }
 
 // called from _sg_vk_acquire_frame_command_buffer()
 _SOKOL_PRIVATE void _sg_vk_uniform_after_acquire(void) {
-    _sg_vk_shared_buffer_after_acquire(&_sg.vk.uniform);
+    _sg_vk_shared_buffer_after_acquire(&_sg.vk.uniforms.dbuf);
     // reset uniform tracking data
     for (size_t i = 0; i < SG_MAX_UNIFORMBLOCK_BINDSLOTS; i++) {
-        _sg_vk_uniform_bindinfo_t* ubi = &_sg.vk.uniform_bindinfos[i];
-        ubi->addr_info.address = 0;
-        ubi->addr_info.range = 0;
+        _sg.vk.uniforms.addr_info[i].address = 0;
+        _sg.vk.uniforms.addr_info[i].range = 0;
     }
 }
 
 // called from _sg_vk_submit_frame_command_buffer()
 _SOKOL_PRIVATE void _sg_vk_uniform_before_submit(void) {
-    _sg_vk_shared_buffer_before_submit(&_sg.vk.uniform);
+    _sg_vk_shared_buffer_before_submit(&_sg.vk.uniforms.dbuf);
 }
 
 // called form _sg_vk_apply_uniforms, returns offset of data snippet into uniform buffer
 _SOKOL_PRIVATE uint32_t _sg_vk_uniform_copy(const sg_range* data) {
     SOKOL_ASSERT(data && data->ptr && (data->size > 0));
-    return (uint32_t)_sg_vk_shared_buffer_memcpy(&_sg.vk.uniform, data->ptr, (uint32_t)data->size);
+    return (uint32_t)_sg_vk_shared_buffer_memcpy(&_sg.vk.uniforms.dbuf, data->ptr, (uint32_t)data->size);
 }
 
 // resource binding system
@@ -19896,8 +19906,8 @@ _SOKOL_PRIVATE bool _sg_vk_bind_view_smp_descriptor_set(VkCommandBuffer cmd_buf,
 
 _SOKOL_PRIVATE bool _sg_vk_bind_uniform_descriptor_set(VkCommandBuffer cmd_buf) {
     SOKOL_ASSERT(cmd_buf);
-    SOKOL_ASSERT(_sg.vk.uniforms_dirty);
-    _sg.vk.uniforms_dirty = false;
+    SOKOL_ASSERT(_sg.vk.uniforms.dirty);
+    _sg.vk.uniforms.dirty = false;
     const _sg_pipeline_t* pip = _sg_pipeline_ref_ptr(&_sg.cur_pip);
     const _sg_shader_t* shd = _sg_shader_ref_ptr(&pip->cmn.shader);
 
@@ -19911,15 +19921,8 @@ _SOKOL_PRIVATE bool _sg_vk_bind_uniform_descriptor_set(VkCommandBuffer cmd_buf)
     uint8_t* dbuf_ptr = _sg_vk_shared_buffer_ptr(&_sg.vk.bind, dbuf_offset);
 
     // update descriptor buffer
-    for (size_t i = 0; i < SG_MAX_UNIFORMBLOCK_BINDSLOTS; i++) {
-        if (shd->cmn.uniform_blocks[i].stage == SG_SHADERSTAGE_NONE) {
-            continue;
-        }
-        _sg.vk.ext.get_descriptor(_sg.vk.dev,
-            &_sg.vk.uniform_bindinfos[i].get_info,
-            _sg.vk.descriptor_buffer_props.uniformBufferDescriptorSize,
-            dbuf_ptr + shd->vk.ub_dset_offsets[i]);
-    }
+    SOKOL_ASSERT(shd->vk.ub_dset_size <= _sg.vk.uniforms.dset_cache_size);
+    memcpy(dbuf_ptr, _sg.vk.uniforms.dset_cache, shd->vk.ub_dset_size);
 
     // record the descriptor buffer offset
     const VkPipelineBindPoint vk_bind_point = _sg.cur_pass.is_compute
@@ -21016,6 +21019,10 @@ _SOKOL_PRIVATE sg_resource_state _sg_vk_create_shader(_sg_shader_t* shd, const s
 
     // store uniform descriptor set size and descriptor offsets
     _sg.vk.ext.get_descriptor_set_layout_size(_sg.vk.dev, shd->vk.ub_dsl, &shd->vk.ub_dset_size);
+    if (shd->vk.ub_dset_size > _sg.vk.uniforms.dset_cache_size) {
+        _SG_ERROR(VULKAN_SHADER_UNIFORM_DESCRIPTOR_SET_SIZE_VS_CACHE_SIZE);
+        return SG_RESOURCESTATE_FAILED;
+    }
     for (size_t i = 0; i < SG_MAX_UNIFORMBLOCK_BINDSLOTS; i++) {
         if (shd->cmn.uniform_blocks[i].stage == SG_SHADERSTAGE_NONE) {
             continue;
@@ -21627,7 +21634,7 @@ _SOKOL_PRIVATE void _sg_vk_apply_pipeline(_sg_pipeline_t* pip) {
     SOKOL_ASSERT(pip);
     SOKOL_ASSERT(pip->vk.pip);
     SOKOL_ASSERT(_sg.vk.frame.cmd_buf);
-    _sg.vk.uniforms_dirty = false;
+    _sg.vk.uniforms.dirty = false;
     VkPipelineBindPoint bindpoint = pip->cmn.is_compute ?
         VK_PIPELINE_BIND_POINT_COMPUTE :
         VK_PIPELINE_BIND_POINT_GRAPHICS;
@@ -21671,26 +21678,40 @@ _SOKOL_PRIVATE bool _sg_vk_apply_bindings(_sg_bindings_ptrs_t* bnd) {
 }
 
 _SOKOL_PRIVATE void _sg_vk_apply_uniforms(int ub_slot, const sg_range* data) {
-    SOKOL_ASSERT(_sg.vk.uniform.cur_dev_addr);
+    SOKOL_ASSERT(_sg.vk.uniforms.dbuf.cur_dev_addr);
     SOKOL_ASSERT(data && data->ptr && (data->size > 0));
     SOKOL_ASSERT((ub_slot >= 0) && (ub_slot < SG_MAX_UNIFORMBLOCK_BINDSLOTS));
+    const _sg_pipeline_t* pip = _sg_pipeline_ref_ptr(&_sg.cur_pip);
+    const _sg_shader_t* shd = _sg_shader_ref_ptr(&pip->cmn.shader);
+    SOKOL_ASSERT(data->size == shd->cmn.uniform_blocks[ub_slot].size);
 
     // copy data into uniform buffer and keep track of uniform bind infos
     const VkDeviceSize ubuf_offset = _sg_vk_uniform_copy(data);
-    if (_sg.vk.uniform.overflown) {
+    if (_sg.vk.uniforms.dbuf.overflown) {
         _SG_ERROR(VULKAN_UNIFORM_BUFFER_OVERFLOW);
         _sg.next_draw_valid = false;
         return;
     }
-    _sg.vk.uniform_bindinfos[ub_slot].addr_info.range = data->size;
-    _sg.vk.uniform_bindinfos[ub_slot].addr_info.address = _sg.vk.uniform.cur_dev_addr + ubuf_offset;
-    _sg.vk.uniforms_dirty = true;
+    _sg.vk.uniforms.addr_info[ub_slot].range = data->size;
+    _sg.vk.uniforms.addr_info[ub_slot].address = _sg.vk.uniforms.dbuf.cur_dev_addr + ubuf_offset;
+
+    // copy uniform buffer descriptor data into intermediate sysmem buffer
+    // NOTE: letting vkGetDescriptorEXT write directly into the descriptor
+    // buffer has catastrophic performance on some Vulkan drivers, notably
+    // Intel's Windows driver
+    const size_t dsize = _sg.vk.descriptor_buffer_props.uniformBufferDescriptorSize;
+    SOKOL_ASSERT((shd->vk.ub_dset_offsets[ub_slot] + dsize) <= _sg.vk.uniforms.dset_cache_size);
+    uint8_t* dst_ptr = _sg.vk.uniforms.dset_cache + shd->vk.ub_dset_offsets[ub_slot];
+    _sg.vk.ext.get_descriptor(_sg.vk.dev, &_sg.vk.uniforms.get_info[ub_slot], dsize, dst_ptr);
+
+    // set uniforms dirty, applying the descriptor buffer offset happens in draw/dispatch
+    _sg.vk.uniforms.dirty = true;
 }
 
 _SOKOL_PRIVATE void _sg_vk_draw(int base_element, int num_elements, int num_instances, int base_vertex, int base_instance) {
     SOKOL_ASSERT(_sg.vk.frame.cmd_buf);
     VkCommandBuffer cmd_buf = _sg.vk.frame.cmd_buf;
-    if (_sg.vk.uniforms_dirty) {
+    if (_sg.vk.uniforms.dirty) {
        if (!_sg_vk_bind_uniform_descriptor_set(cmd_buf)) {
            return;
        }
@@ -21714,7 +21735,7 @@ _SOKOL_PRIVATE void _sg_vk_draw(int base_element, int num_elements, int num_inst
 _SOKOL_PRIVATE void _sg_vk_dispatch(int num_groups_x, int num_groups_y, int num_groups_z) {
     SOKOL_ASSERT(_sg.vk.frame.cmd_buf);
     VkCommandBuffer cmd_buf = _sg.vk.frame.cmd_buf;
-    if (_sg.vk.uniforms_dirty) {
+    if (_sg.vk.uniforms.dirty) {
        if (!_sg_vk_bind_uniform_descriptor_set(cmd_buf)) {
            return;
        }
-- 
cgit v1.2.3
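
For reference, the caching pattern used by this commit can be summarized as a small standalone C sketch. This is only an illustration under assumptions: the helper names (ub_descriptor_cache_t, ub_cache_init, ub_cache_write, ub_cache_flush) and the constants MAX_UB_SLOTS / MAX_DESCRIPTOR_SIZE are hypothetical stand-ins for the sokol_gfx.h internals (SG_MAX_UNIFORMBLOCK_BINDSLOTS, _SG_VK_MAX_DESCRIPTOR_DATA_SIZE); only the VK_EXT_descriptor_buffer types and the vkGetDescriptorEXT usage mirror the patch. It assumes the extension is enabled and the function pointer was loaded via vkGetDeviceProcAddr().

// Minimal sketch (hypothetical names) of the descriptor-set caching pattern.
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <vulkan/vulkan.h>

#define MAX_UB_SLOTS (8)            // stand-in for SG_MAX_UNIFORMBLOCK_BINDSLOTS
#define MAX_DESCRIPTOR_SIZE (256)   // stand-in for _SG_VK_MAX_DESCRIPTOR_DATA_SIZE

typedef struct {
    VkDescriptorAddressInfoEXT addr_info[MAX_UB_SLOTS];
    VkDescriptorGetInfoEXT get_info[MAX_UB_SLOTS];
    uint8_t dset_cache[MAX_UB_SLOTS * MAX_DESCRIPTOR_SIZE];  // sysmem staging area
} ub_descriptor_cache_t;

// one-time setup of the per-slot VkDescriptorGetInfoEXT structs
static void ub_cache_init(ub_descriptor_cache_t* cache) {
    memset(cache, 0, sizeof(*cache));
    for (int i = 0; i < MAX_UB_SLOTS; i++) {
        cache->addr_info[i].sType = VK_STRUCTURE_TYPE_DESCRIPTOR_ADDRESS_INFO_EXT;
        cache->get_info[i].sType = VK_STRUCTURE_TYPE_DESCRIPTOR_GET_INFO_EXT;
        cache->get_info[i].type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
        cache->get_info[i].data.pUniformBuffer = &cache->addr_info[i];
    }
}

// per uniform update: write the descriptor into the sysmem cache instead of
// directly into the mapped descriptor buffer (direct vkGetDescriptorEXT writes
// were catastrophically slow on Intel's Windows driver)
static void ub_cache_write(ub_descriptor_cache_t* cache,
                           VkDevice dev,
                           PFN_vkGetDescriptorEXT get_descriptor, // loaded extension entry point
                           int ub_slot,
                           VkDeviceAddress ubuf_addr,             // device address of the uniform data
                           VkDeviceSize ubuf_range,
                           size_t descriptor_size,                // uniformBufferDescriptorSize
                           VkDeviceSize dset_offset)              // from vkGetDescriptorSetLayoutBindingOffsetEXT
{
    cache->addr_info[ub_slot].address = ubuf_addr;
    cache->addr_info[ub_slot].range = ubuf_range;
    assert((dset_offset + descriptor_size) <= sizeof(cache->dset_cache));
    get_descriptor(dev, &cache->get_info[ub_slot], descriptor_size, cache->dset_cache + dset_offset);
}

// before the next draw/dispatch: one memcpy of the entire descriptor set
// into the mapped descriptor buffer
static void ub_cache_flush(const ub_descriptor_cache_t* cache,
                           uint8_t* mapped_descriptor_buffer,     // mapped ptr at the set's offset
                           size_t dset_size)                      // from vkGetDescriptorSetLayoutSizeEXT
{
    assert(dset_size <= sizeof(cache->dset_cache));
    memcpy(mapped_descriptor_buffer, cache->dset_cache, dset_size);
}

In terms of the patch, ub_cache_write roughly corresponds to the new code in _sg_vk_apply_uniforms() (one vkGetDescriptorEXT call per updated uniform block, targeting sysmem), and ub_cache_flush corresponds to the single memcpy() in _sg_vk_bind_uniform_descriptor_set() that is deferred until the next draw or dispatch.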