From 7e15e72639e6f2b5d636072e0866cb63a4fbd44c Mon Sep 17 00:00:00 2001
From: Andre Weissflog
Date: Sat, 30 Sep 2023 17:38:40 +0200
Subject: [PATCH] sokol_gfx.h wgpu: implement bindgroups cache

---
 sokol_gfx.h | 188 +++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 150 insertions(+), 38 deletions(-)

diff --git a/sokol_gfx.h b/sokol_gfx.h
index 0bdefcd5c..adf860bd8 100644
--- a/sokol_gfx.h
+++ b/sokol_gfx.h
@@ -2904,7 +2904,9 @@ typedef struct sg_pass_info {
     _SG_LOGITEM_XMACRO(METAL_FRAGMENT_SHADER_ENTRY_NOT_FOUND, "fragment shader entry not found (metal)") \
     _SG_LOGITEM_XMACRO(METAL_CREATE_RPS_FAILED, "failed to create render pipeline state (metal)") \
     _SG_LOGITEM_XMACRO(METAL_CREATE_RPS_OUTPUT, "") \
-    _SG_LOGITEM_XMACRO(WGPU_BINDGROUPS_POOL_EXHAUSTED, "wgpu BindGroups pool exhausted (increase sg_desc.bindgroups_cache_size)") \
+    _SG_LOGITEM_XMACRO(WGPU_BINDGROUPS_POOL_EXHAUSTED, "bindgroups pool exhausted (increase sg_desc.wgpu_bindgroups_cache_size) (wgpu)") \
+    _SG_LOGITEM_XMACRO(WGPU_BINDGROUPSCACHE_SIZE_GREATER_ONE, "sg_desc.wgpu_bindgroups_cache_size must be > 1 (wgpu)") \
+    _SG_LOGITEM_XMACRO(WGPU_BINDGROUPSCACHE_SIZE_POW2, "sg_desc.wgpu_bindgroups_cache_size must be a power of 2 (wgpu)") \
     _SG_LOGITEM_XMACRO(WGPU_CREATEBINDGROUP_FAILED, "wgpuDeviceCreateBindGroup failed") \
     _SG_LOGITEM_XMACRO(WGPU_CREATE_BUFFER_FAILED, "wgpuDeviceCreateBuffer() failed") \
     _SG_LOGITEM_XMACRO(WGPU_CREATE_TEXTURE_FAILED, "wgpuDeviceCreateTexture() failed") \
@@ -4070,6 +4072,7 @@ typedef struct {
 #define _sg_max(a,b) (((a)>(b))?(a):(b))
 #define _sg_clamp(v,v0,v1) (((v)<(v0))?(v0):(((v)>(v1))?(v1):(v)))
 #define _sg_fequal(val,cmp,delta) ((((val)-(cmp))> -(delta))&&(((val)-(cmp))<(delta)))
+#define _sg_ispow2(val) (((val)&((val)-1))==0)
 
 _SOKOL_PRIVATE void* _sg_malloc_clear(size_t size);
 _SOKOL_PRIVATE void _sg_free(void* ptr);
@@ -4935,18 +4938,18 @@ typedef struct {
 typedef struct {
     uint64_t hash;
     uint32_t items[_SG_WGPU_BINDGROUPSCACHE_NUM_ITEMS];
-} _sg_wgpu_bindgroupscache_key_t;
+} _sg_wgpu_bindgroups_cache_key_t;
 
 typedef struct {
-    size_t num;             // must be 2^n
-    uint64_t index_mask;    // mask to turn hash into valid index
+    uint32_t num;           // must be 2^n
+    uint32_t index_mask;    // mask to turn hash into valid index
     _sg_wgpu_bindgroup_handle_t* items;
 } _sg_wgpu_bindgroups_cache_t;
 
 typedef struct {
     _sg_slot_t slot;
     WGPUBindGroup bindgroup;
-    _sg_wgpu_bindgroupscache_key_t key;
+    _sg_wgpu_bindgroups_cache_key_t key;
 } _sg_wgpu_bindgroup_t;
 
 typedef struct {
@@ -4963,7 +4966,6 @@ typedef struct {
         sg_buffer buffer;
         int offset;
     } ib;
-    _sg_wgpu_bindgroup_handle_t bindgroup;
 } _sg_wgpu_bindings_cache_t;
 
 // the WGPU backend state
@@ -4990,7 +4992,7 @@ typedef struct {
     sg_pipeline cur_pipeline_id;
    _sg_wgpu_uniform_buffer_t uniform;
     _sg_wgpu_bindings_cache_t bindings_cache;
-    _sg_wgpu_bindgroups_cache_t bingroups_cache;
+    _sg_wgpu_bindgroups_cache_t bindgroups_cache;
     _sg_wgpu_bindgroups_pool_t bindgroups_pool;
 } _sg_wgpu_backend_t;
 #endif
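
The data-structure hunks above set up a direct-mapped cache: _sg_wgpu_bindgroups_cache_t is a power-of-2 sized table of bindgroup handles indexed by the low bits of a key hash, while _sg_wgpu_bindgroups_pool_t owns the backing _sg_wgpu_bindgroup_t objects. Note also that the per-pass _sg_wgpu_bindings_cache_t no longer carries a bindgroup handle, since bindgroup reuse is now the job of the new cache. A minimal sketch of the direct-mapped lookup idea (simplified, hypothetical names, not the sokol_gfx.h API):

    // num must be a power of 2 so that (hash & (num - 1)) picks a slot
    // without a modulo; a full key compare is still needed afterwards,
    // because different keys can share the same low hash bits.
    typedef struct { uint32_t id; } handle_t;
    typedef struct { uint32_t num; uint32_t index_mask; handle_t* items; } cache_t;

    static handle_t cache_get(const cache_t* c, uint64_t hash) {
        return c->items[(uint32_t)(hash & c->index_mask)];
    }

The implementation hunks follow:
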
@@ -12547,29 +12549,32 @@ _SOKOL_PRIVATE void _sg_wgpu_uniform_buffer_on_commit(void) {
 _SOKOL_PRIVATE void _sg_wgpu_bindgroups_pool_init(const sg_desc* desc) {
     SOKOL_ASSERT((desc->wgpu_bindgroups_cache_size > 0) && (desc->wgpu_bindgroups_cache_size < _SG_MAX_POOL_SIZE));
-    SOKOL_ASSERT(0 == _sg.wgpu.bindgroups_pool.bindgroups);
+    _sg_wgpu_bindgroups_pool_t* p = &_sg.wgpu.bindgroups_pool;
+    SOKOL_ASSERT(0 == p->bindgroups);
     const int pool_size = desc->wgpu_bindgroups_cache_size;
-    _sg_init_pool(&_sg.wgpu.bindgroups_pool.pool, pool_size);
+    _sg_init_pool(&p->pool, pool_size);
     size_t pool_byte_size = sizeof(_sg_wgpu_bindgroup_t) * (size_t)pool_size;
-    _sg.wgpu.bindgroups_pool.bindgroups = _sg_malloc_clear(pool_byte_size);
+    p->bindgroups = _sg_malloc_clear(pool_byte_size);
 }
 
-_SOKOL_PRIVATE void _sg_wgpu_bindgroups_pool_discard(_sg_wgpu_bindgroups_pool_t* p) {
-    SOKOL_ASSERT(p && p->bindgroups);
+_SOKOL_PRIVATE void _sg_wgpu_bindgroups_pool_discard(void) {
+    _sg_wgpu_bindgroups_pool_t* p = &_sg.wgpu.bindgroups_pool;
+    SOKOL_ASSERT(p->bindgroups);
     _sg_free(p->bindgroups);
     p->bindgroups = 0;
     _sg_discard_pool(&p->pool);
 }
 
-_SOKOL_PRIVATE _sg_wgpu_bindgroup_t* _sg_wgpu_bindgroup_at(_sg_wgpu_bindgroups_pool_t* p, uint32_t bg_id) {
-    SOKOL_ASSERT(p && (SG_INVALID_ID != bg_id));
+_SOKOL_PRIVATE _sg_wgpu_bindgroup_t* _sg_wgpu_bindgroup_at(uint32_t bg_id) {
+    SOKOL_ASSERT(SG_INVALID_ID != bg_id);
+    _sg_wgpu_bindgroups_pool_t* p = &_sg.wgpu.bindgroups_pool;
     int slot_index = _sg_slot_index(bg_id);
     SOKOL_ASSERT((slot_index > _SG_INVALID_SLOT_INDEX) && (slot_index < p->pool.size));
     return &p->bindgroups[slot_index];
 }
 
-_SOKOL_PRIVATE _sg_wgpu_bindgroup_t* _sg_wgpu_lookup_bindgroup(_sg_wgpu_bindgroups_pool_t* p, uint32_t bg_id) {
+_SOKOL_PRIVATE _sg_wgpu_bindgroup_t* _sg_wgpu_lookup_bindgroup(uint32_t bg_id) {
     if (SG_INVALID_ID != bg_id) {
-        _sg_wgpu_bindgroup_t* bg = _sg_wgpu_bindgroup_at(p, bg_id);
+        _sg_wgpu_bindgroup_t* bg = _sg_wgpu_bindgroup_at(bg_id);
         if (bg->slot.id == bg_id) {
             return bg;
         }
@@ -12577,7 +12582,8 @@ _SOKOL_PRIVATE _sg_wgpu_bindgroup_t* _sg_wgpu_lookup_bindgroup(_sg_wgpu_bindgrou
     return 0;
 }
 
-_SOKOL_PRIVATE _sg_wgpu_bindgroup_handle_t _sg_wgpu_alloc_bindgroup(_sg_wgpu_bindgroups_pool_t* p) {
+_SOKOL_PRIVATE _sg_wgpu_bindgroup_handle_t _sg_wgpu_alloc_bindgroup(void) {
+    _sg_wgpu_bindgroups_pool_t* p = &_sg.wgpu.bindgroups_pool;
     _sg_wgpu_bindgroup_handle_t res;
     int slot_index = _sg_pool_alloc_index(&p->pool);
     if (_SG_INVALID_SLOT_INDEX != slot_index) {
@@ -12589,8 +12595,9 @@ _SOKOL_PRIVATE _sg_wgpu_bindgroup_handle_t _sg_wgpu_alloc_bindgroup(_sg_wgpu_bin
     return res;
 }
 
-_SOKOL_PRIVATE void _sg_wgpu_dealloc_bindgroup(_sg_wgpu_bindgroups_pool_t* p, _sg_wgpu_bindgroup_t* bg) {
+_SOKOL_PRIVATE void _sg_wgpu_dealloc_bindgroup(_sg_wgpu_bindgroup_t* bg) {
     SOKOL_ASSERT(bg && (bg->slot.state == SG_RESOURCESTATE_ALLOC) && (bg->slot.id != SG_INVALID_ID));
+    _sg_wgpu_bindgroups_pool_t* p = &_sg.wgpu.bindgroups_pool;
     _sg_pool_free_index(&p->pool, _sg_slot_index(bg->slot.id));
     _sg_reset_slot(&bg->slot);
 }
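
The placeholder hash below returned 0 for every key, which would have made all binding combinations collide in cache slot 0; it is replaced with MurmurHash64B taken from the linked smhasher reference. One detail worth noting: the only caller hashes a uint32_t id array, so the input is always 4-byte aligned and a multiple of 4 bytes long, and the byte-wise tail switch is never reached for this use case. A hypothetical call for illustration (ids made up):

    uint32_t items[4] = { 0x00010001, 0x00020001, 0x00030001, 0 }; // pip, img, smp ids
    uint64_t h = _sg_wgpu_hash(items, (int)sizeof(items), 0x1234567887654321);
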
@@ -12603,13 +12610,45 @@ _SOKOL_PRIVATE void _sg_wgpu_reset_bindgroup_to_alloc_state(_sg_wgpu_bindgroup_t
     bg->slot.state = SG_RESOURCESTATE_ALLOC;
 }
 
-_SOKOL_PRIVATE uint64_t _sg_wgpu_hash(void* ptr, size_t num_bytes) {
-    // FIXME!
-    (void)ptr; (void)num_bytes;
-    return 0;
+// MurmurHash64B (see: https://github.com/aappleby/smhasher/blob/61a0530f28277f2e850bfc39600ce61d02b518de/src/MurmurHash2.cpp#L142)
+_SOKOL_PRIVATE uint64_t _sg_wgpu_hash(const void* key, int len, uint64_t seed) {
+    const uint32_t m = 0x5bd1e995;
+    const int r = 24;
+    uint32_t h1 = (uint32_t)seed ^ (uint32_t)len;
+    uint32_t h2 = (uint32_t)(seed >> 32);
+    const uint32_t * data = (const uint32_t *)key;
+    while (len >= 8) {
+        uint32_t k1 = *data++;
+        k1 *= m; k1 ^= k1 >> r; k1 *= m;
+        h1 *= m; h1 ^= k1;
+        len -= 4;
+        uint32_t k2 = *data++;
+        k2 *= m; k2 ^= k2 >> r; k2 *= m;
+        h2 *= m; h2 ^= k2;
+        len -= 4;
+    }
+    if (len >= 4) {
+        uint32_t k1 = *data++;
+        k1 *= m; k1 ^= k1 >> r; k1 *= m;
+        h1 *= m; h1 ^= k1;
+        len -= 4;
+    }
+    switch(len) {
+        case 3: h2 ^= (uint32_t)(((unsigned char*)data)[2] << 16); // fallthrough
+        case 2: h2 ^= (uint32_t)(((unsigned char*)data)[1] << 8); // fallthrough
+        case 1: h2 ^= ((unsigned char*)data)[0];
+            h2 *= m;
+    };
+    h1 ^= h2 >> 18; h1 *= m;
+    h2 ^= h1 >> 22; h2 *= m;
+    h1 ^= h2 >> 17; h1 *= m;
+    h2 ^= h1 >> 19; h2 *= m;
+    uint64_t h = h1;
+    h = (h << 32) | h2;
+    return h;
 }
 
-_SOKOL_PRIVATE void _sg_wgpu_init_bindgroupscache_key(_sg_wgpu_bindgroupscache_key_t* key, const _sg_bindings_t* bnd) {
+_SOKOL_PRIVATE void _sg_wgpu_init_bindgroups_cache_key(_sg_wgpu_bindgroups_cache_key_t* key, const _sg_bindings_t* bnd) {
     SOKOL_ASSERT(bnd);
     SOKOL_ASSERT(bnd->pip);
     SOKOL_ASSERT(bnd->num_vs_imgs <= SG_MAX_SHADERSTAGE_IMAGES);
@@ -12640,10 +12679,10 @@ _SOKOL_PRIVATE void _sg_wgpu_init_bindgroupscache_k
         SOKOL_ASSERT(bnd->fs_smps[i]);
         key->items[fs_smps_offset + i] = bnd->fs_smps[i]->slot.id;
     }
-    key->hash = _sg_wgpu_hash(&key->items, sizeof(key->items));
+    key->hash = _sg_wgpu_hash(&key->items, (int)sizeof(key->items), 0x1234567887654321);
 }
 
-_SOKOL_PRIVATE bool _sg_wgpu_compare_bindgroupscache_key(_sg_wgpu_bindgroupscache_key_t* k0, _sg_wgpu_bindgroupscache_key_t* k1) {
+_SOKOL_PRIVATE bool _sg_wgpu_compare_bindgroups_cache_key(_sg_wgpu_bindgroups_cache_key_t* k0, _sg_wgpu_bindgroups_cache_key_t* k1) {
     SOKOL_ASSERT(k0 && k1);
     if (k0->hash != k1->hash) {
         return false;
@@ -12658,11 +12697,11 @@ _SOKOL_PRIVATE _sg_wgpu_bindgroup_t* _sg_wgpu_create_bindgroup(_sg_bindings_t* b
     SOKOL_ASSERT(_sg.wgpu.dev);
     SOKOL_ASSERT(bnd->pip);
     SOKOL_ASSERT(bnd->pip->shader && (bnd->pip->cmn.shader_id.id == bnd->pip->shader->slot.id));
-    _sg_wgpu_bindgroup_handle_t bg_id = _sg_wgpu_alloc_bindgroup(&_sg.wgpu.bindgroups_pool);
+    _sg_wgpu_bindgroup_handle_t bg_id = _sg_wgpu_alloc_bindgroup();
     if (bg_id.id == SG_INVALID_ID) {
         return 0;
     }
-    _sg_wgpu_bindgroup_t* bg = _sg_wgpu_bindgroup_at(&_sg.wgpu.bindgroups_pool, bg_id.id);
+    _sg_wgpu_bindgroup_t* bg = _sg_wgpu_bindgroup_at(bg_id.id);
     SOKOL_ASSERT(bg && (bg->slot.state == SG_RESOURCESTATE_ALLOC));
 
     // create wgpu bindgroup object
@@ -12703,7 +12742,7 @@ _SOKOL_PRIVATE _sg_wgpu_bindgroup_t* _sg_wgpu_create_bindgroup(_sg_bindings_t* b
         return bg;
     }
 
-    _sg_wgpu_init_bindgroupscache_key(&bg->key, bnd);
+    _sg_wgpu_init_bindgroups_cache_key(&bg->key, bnd);
 
     bg->slot.state = SG_RESOURCESTATE_VALID;
     return bg;
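
The cache key packs the pipeline's slot id plus the slot ids of all bound images and samplers into key->items, and stores the 64-bit hash next to it. _sg_wgpu_compare_bindgroups_cache_key() treats hash equality as necessary but not sufficient: the full id array must match too, which is what makes collisions detectable at all. Roughly (a sketch; the item-array comparison itself is elided from the hunk above):

    bool equal = (k0->hash == k1->hash)
              && (0 == memcmp(k0->items, k1->items, sizeof(k0->items)));
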
@@ -12720,12 +12759,13 @@ _SOKOL_PRIVATE void _sg_wgpu_discard_bindgroup(_sg_wgpu_bindgroup_t* bg) {
         SOKOL_ASSERT(bg->slot.state == SG_RESOURCESTATE_ALLOC);
     }
     if (bg->slot.state == SG_RESOURCESTATE_ALLOC) {
-        _sg_wgpu_dealloc_bindgroup(&_sg.wgpu.bindgroups_pool, bg);
+        _sg_wgpu_dealloc_bindgroup(bg);
         SOKOL_ASSERT(bg->slot.state == SG_RESOURCESTATE_INITIAL);
     }
 }
 
-_SOKOL_PRIVATE void _sg_wgpu_discard_all_bindgroups(_sg_wgpu_bindgroups_pool_t* p) {
+_SOKOL_PRIVATE void _sg_wgpu_discard_all_bindgroups(void) {
+    _sg_wgpu_bindgroups_pool_t* p = &_sg.wgpu.bindgroups_pool;
     for (int i = 0; i < p->pool.size; i++) {
         sg_resource_state state = p->bindgroups[i].slot.state;
         if ((state == SG_RESOURCESTATE_VALID) || (state == SG_RESOURCESTATE_FAILED)) {
@@ -12734,6 +12774,47 @@ _SOKOL_PRIVATE void _sg_wgpu_discard_all_bindgroups(_sg_wgpu_bindgroups_pool_t*
     }
 }
 
+_SOKOL_PRIVATE void _sg_wgpu_bindgroups_cache_init(const sg_desc* desc) {
+    SOKOL_ASSERT(desc);
+    SOKOL_ASSERT(_sg.wgpu.bindgroups_cache.num == 0);
+    SOKOL_ASSERT(_sg.wgpu.bindgroups_cache.index_mask == 0);
+    SOKOL_ASSERT(_sg.wgpu.bindgroups_cache.items == 0);
+    const int num = desc->wgpu_bindgroups_cache_size;
+    if (num <= 1) {
+        _SG_PANIC(WGPU_BINDGROUPSCACHE_SIZE_GREATER_ONE);
+    }
+    if (!_sg_ispow2(num)) {
+        _SG_PANIC(WGPU_BINDGROUPSCACHE_SIZE_POW2);
+    }
+    _sg.wgpu.bindgroups_cache.num = (uint32_t)desc->wgpu_bindgroups_cache_size;
+    _sg.wgpu.bindgroups_cache.index_mask = _sg.wgpu.bindgroups_cache.num - 1;
+    size_t size_in_bytes = sizeof(_sg_wgpu_bindgroup_handle_t) * (size_t)num;
+    _sg.wgpu.bindgroups_cache.items = _sg_malloc_clear(size_in_bytes);
+}
+
+_SOKOL_PRIVATE void _sg_wgpu_bindgroups_cache_discard(void) {
+    if (_sg.wgpu.bindgroups_cache.items) {
+        _sg_free(_sg.wgpu.bindgroups_cache.items);
+        _sg.wgpu.bindgroups_cache.items = 0;
+    }
+    _sg.wgpu.bindgroups_cache.num = 0;
+    _sg.wgpu.bindgroups_cache.index_mask = 0;
+}
+
+_SOKOL_PRIVATE void _sg_wgpu_bindgroups_cache_set(uint64_t hash, uint32_t bg_id) {
+    uint32_t index = (uint32_t)(hash & _sg.wgpu.bindgroups_cache.index_mask);
+    SOKOL_ASSERT(index < _sg.wgpu.bindgroups_cache.num);
+    SOKOL_ASSERT(_sg.wgpu.bindgroups_cache.items);
+    _sg.wgpu.bindgroups_cache.items[index].id = bg_id;
+}
+
+_SOKOL_PRIVATE uint32_t _sg_wgpu_bindgroups_cache_get(uint64_t hash) {
+    uint32_t index = (uint32_t)(hash & _sg.wgpu.bindgroups_cache.index_mask);
+    SOKOL_ASSERT(index < _sg.wgpu.bindgroups_cache.num);
+    SOKOL_ASSERT(_sg.wgpu.bindgroups_cache.items);
+    return _sg.wgpu.bindgroups_cache.items[index].id;
+}
+
 _SOKOL_PRIVATE void _sg_wgpu_bindcache_clear(void) {
     memset(&_sg.wgpu.bindings_cache, 0, sizeof(_sg.wgpu.bindings_cache));
 }
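
With _sg_wgpu_bindgroups_cache_set()/_get() in place, _sg_wgpu_apply_bindgroup() below becomes the cache's only consumer. The flow it implements per bindings update:

    // key   = slot ids of pipeline + images + samplers, plus 64-bit hash
    // entry = cache_get(key.hash)                    (direct-mapped, O(1))
    // hit   : entry valid and full key matches -> reuse cached WGPUBindGroup
    // stale : entry valid but key differs (collision) -> discard the cached
    //         bindgroup, clear the slot, then fall through to "miss"
    // miss  : create a new bindgroup, store its handle in the cache slot

A collision evicts the previous entry rather than chaining, so two binding sets that map to the same slot and alternate every draw will keep recreating bindgroups; a larger sg_desc.wgpu_bindgroups_cache_size makes that correspondingly less likely.
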
@@ -12762,16 +12843,45 @@ _SOKOL_PRIVATE void _sg_wgpu_bindcache_ib_update(const _sg_buffer_t* ib, int ib_
 _SOKOL_PRIVATE bool _sg_wgpu_apply_bindgroup(_sg_bindings_t* bnd) {
     if ((bnd->num_vs_imgs + bnd->num_vs_smps + bnd->num_fs_imgs + bnd->num_fs_smps) > 0) {
-        // FIXME: use bindgroups cache!
-        _sg_wgpu_bindgroup_t* bg = _sg_wgpu_create_bindgroup(bnd);
-        if (bg) {
-            if (bg->slot.state == SG_RESOURCESTATE_VALID) {
+        if (!_sg.desc.wgpu_disable_bindgroups_cache) {
+            _sg_wgpu_bindgroup_t* bg = 0;
+            _sg_wgpu_bindgroups_cache_key_t key;
+            _sg_wgpu_init_bindgroups_cache_key(&key, bnd);
+            uint32_t bg_id = _sg_wgpu_bindgroups_cache_get(key.hash);
+            if (bg_id != SG_INVALID_ID) {
+                // potential cache hit
+                bg = _sg_wgpu_lookup_bindgroup(bg_id);
+                SOKOL_ASSERT(bg && (bg->slot.state == SG_RESOURCESTATE_VALID));
+                if (!_sg_wgpu_compare_bindgroups_cache_key(&key, &bg->key)) {
+                    // cache collision, need to delete cached bindgroup
+                    _sg_wgpu_discard_bindgroup(bg);
+                    _sg_wgpu_bindgroups_cache_set(key.hash, SG_INVALID_ID);
+                    bg = 0;
+                }
+            }
+            if (bg == 0) {
+                // either no cache entry yet, or cache collision, create new bindgroup and store in cache
+                bg = _sg_wgpu_create_bindgroup(bnd);
+                if (bg) { _sg_wgpu_bindgroups_cache_set(key.hash, bg->slot.id); }
+            }
+            if (bg && bg->slot.state == SG_RESOURCESTATE_VALID) {
                 SOKOL_ASSERT(bg->bindgroup);
                 wgpuRenderPassEncoderSetBindGroup(_sg.wgpu.pass_enc, _SG_WGPU_IMAGE_SAMPLER_BINDGROUP_INDEX, bg->bindgroup, 0, 0);
+            } else {
+                return false;
             }
-            _sg_wgpu_discard_bindgroup(bg);
         } else {
-            return false;
+            // bindgroups cache disabled, create and destroy bindgroup on the fly (expensive!)
+            _sg_wgpu_bindgroup_t* bg = _sg_wgpu_create_bindgroup(bnd);
+            if (bg) {
+                if (bg->slot.state == SG_RESOURCESTATE_VALID) {
+                    SOKOL_ASSERT(bg->bindgroup);
+                    wgpuRenderPassEncoderSetBindGroup(_sg.wgpu.pass_enc, _SG_WGPU_IMAGE_SAMPLER_BINDGROUP_INDEX, bg->bindgroup, 0, 0);
+                }
+                _sg_wgpu_discard_bindgroup(bg);
+            } else {
+                return false;
+            }
+        }
     } else {
         wgpuRenderPassEncoderSetBindGroup(_sg.wgpu.pass_enc, _SG_WGPU_IMAGE_SAMPLER_BINDGROUP_INDEX, _sg.wgpu.empty_bind_group, 0, 0);
     }
@@ -12803,6 +12913,7 @@ _SOKOL_PRIVATE void _sg_wgpu_setup_backend(const sg_desc* desc) {
     _sg_wgpu_init_caps();
     _sg_wgpu_uniform_buffer_init(desc);
     _sg_wgpu_bindgroups_pool_init(desc);
+    _sg_wgpu_bindgroups_cache_init(desc);
 
     // create an empty bind group for shader stages without bound images
     WGPUBindGroupLayoutDescriptor bgl_desc;
@@ -12827,8 +12938,9 @@ _SOKOL_PRIVATE void _sg_wgpu_discard_backend(void) {
     SOKOL_ASSERT(_sg.wgpu.valid);
     SOKOL_ASSERT(_sg.wgpu.cmd_enc);
     _sg.wgpu.valid = false;
-    _sg_wgpu_discard_all_bindgroups(&_sg.wgpu.bindgroups_pool);
-    _sg_wgpu_bindgroups_pool_discard(&_sg.wgpu.bindgroups_pool);
+    _sg_wgpu_discard_all_bindgroups();
+    _sg_wgpu_bindgroups_cache_discard();
+    _sg_wgpu_bindgroups_pool_discard();
     _sg_wgpu_uniform_buffer_discard();
     wgpuBindGroupRelease(_sg.wgpu.empty_bind_group); _sg.wgpu.empty_bind_group = 0;
     wgpuCommandEncoderRelease(_sg.wgpu.cmd_enc); _sg.wgpu.cmd_enc = 0;
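
From the application side the cache is configured once at setup time; the new validations require the size to be greater than 1 and a power of 2. A minimal sketch (most sg_desc fields omitted; presumably a default kicks in when the field is left at zero, following the usual sg_desc convention):

    sg_setup(&(sg_desc){
        .wgpu_bindgroups_cache_size = 1024,        // must be > 1 and a power of 2
        //.wgpu_disable_bindgroups_cache = true,   // opt-out: create/discard per draw (slow)
        // ...swapchain/context fields omitted
    });
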