ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

Commit: 8a0b3f2f8e4b99c5e0c28dfcaaeda0b61c077596
Parent: c0a61b78d170bfec84aa9052ddf4e2010ba79afb
Author: Randy Palamar
Date:   Tue, 10 Jun 2025 11:42:09 -0600

core/lib: coalesce buffer uploads that haven't made it to the GPU

If a caller overwrites a buffer that hasn't been uploaded to the
GPU yet, we don't need to make a new work item. The library code is
already holding the lock, so we know that the beamformer can't upload
the buffer until we release it; therefore we overwrite the data and
only push the work item if the dirty region flag is not set.

Diffstat:
Mbeamformer.c | 9++++++---
Mbeamformer.h | 1-
Mbeamformer_work_queue.h | 11+++++------
Mintrinsics.c | 8++++++--
Mui.c | 2+-
5 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/beamformer.c b/beamformer.c @@ -561,7 +561,7 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co } buffer = cs->raw_data_ssbo; } break; - default: INVALID_CODE_PATH; break; + InvalidDefaultCase; } if (tex_1d) { @@ -573,6 +573,8 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co glNamedBufferSubData(buffer, 0, uc->size, (u8 *)sm + uc->shared_memory_offset); } + + atomic_and_u32(&sm->dirty_regions, ~(sm->dirty_regions & 1 << (work->lock - 1))); ctx->os.shared_memory_region_unlock(&ctx->shared_memory, sm->locks, (i32)work->lock); } break; case BW_COMPUTE_INDIRECT:{ @@ -588,9 +590,10 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co } atomic_store_u32(&ctx->starting_compute, 0); - if (cs->shared_ubo_dirty) { + i32 mask = 1 << (BeamformerSharedMemoryLockKind_Parameters - 1); + if (sm->dirty_regions & mask) { glNamedBufferSubData(cs->shared_ubo, 0, sizeof(sm->parameters), &sm->parameters); - cs->shared_ubo_dirty = 0; + atomic_and_u32(&sm->dirty_regions, ~mask); } atomic_store_u32(&cs->processing_compute, 1); diff --git a/beamformer.h b/beamformer.h @@ -87,7 +87,6 @@ typedef struct { u32 raw_data_ssbo; u32 shared_ubo; - b32 shared_ubo_dirty; u32 channel_mapping_texture; u32 sparse_elements_texture; diff --git a/beamformer_work_queue.h b/beamformer_work_queue.h @@ -1,13 +1,8 @@ /* See LICENSE for license details. */ -/* TODO(rnp): - * [ ]: coalesce uploads if they are overwriting exist data - * - use a flag field and only submit a new work if the corresponding flag is clear - */ - #ifndef _BEAMFORMER_WORK_QUEUE_H_ #define _BEAMFORMER_WORK_QUEUE_H_ -#define BEAMFORMER_SHARED_MEMORY_VERSION (3UL) +#define BEAMFORMER_SHARED_MEMORY_VERSION (4UL) typedef struct BeamformComputeFrame BeamformComputeFrame; typedef struct ShaderReloadContext ShaderReloadContext; @@ -97,6 +92,10 @@ typedef align_as(64) struct { * the lock without leaving userspace. 
also this struct needs a bunch of padding */ i32 locks[BeamformerSharedMemoryLockKind_Count]; + /* NOTE(rnp): used to coalesce uploads when they are not yet uploaded to the GPU */ + u32 dirty_regions; + static_assert(BeamformerSharedMemoryLockKind_Count <= 32, "only 32 lock regions supported"); + /* NOTE(rnp): interleaved transmit angle, focal depth pairs */ align_as(64) v2 focal_vectors[256]; diff --git a/intrinsics.c b/intrinsics.c @@ -29,11 +29,13 @@ #define atomic_store_u32(ptr, n) *((volatile u32 *)(ptr)) = (n) #define atomic_load_u64(ptr) *((volatile u64 *)(ptr)) #define atomic_load_u32(ptr) *((volatile u32 *)(ptr)) - #define atomic_and_u64(ptr, n) _InterlockedAnd64((volatile u64 *)(ptr), (n)) #define atomic_add_u64(ptr, n) _InterlockedExchangeAdd64((volatile u64 *)(ptr), (n)) #define atomic_add_u32(ptr, n) _InterlockedExchangeAdd((volatile u32 *)(ptr), (n)) + #define atomic_and_u64(ptr, n) _InterlockedAnd64((volatile u64 *)(ptr), (n)) + #define atomic_and_u32(ptr, n) _InterlockedAnd((volatile u32 *)(ptr), (n)) #define atomic_cas_u64(ptr, cptr, n) (_InterlockedCompareExchange64((volatile u64 *)(ptr), *(cptr), (n)) == *(cptr)) #define atomic_cas_u32(ptr, cptr, n) (_InterlockedCompareExchange((volatile u32 *)(ptr), *(cptr), (n)) == *(cptr)) + #define atomic_or_u32(ptr, n) _InterlockedOr((volatile u32 *)(ptr), (n)) #define sqrt_f32(a) sqrtf(a) #define atan2_f32(y, x) atan2f(y, x) @@ -53,10 +55,12 @@ #define atomic_store_u32(ptr, n) __atomic_store_n(ptr, n, __ATOMIC_RELEASE) #define atomic_load_u64(ptr) __atomic_load_n(ptr, __ATOMIC_ACQUIRE) - #define atomic_and_u64(ptr, n) __atomic_and_fetch(ptr, n, __ATOMIC_RELEASE) #define atomic_add_u64(ptr, n) __atomic_fetch_add(ptr, n, __ATOMIC_ACQ_REL) + #define atomic_and_u64(ptr, n) __atomic_and_fetch(ptr, n, __ATOMIC_RELEASE) #define atomic_cas_u64(ptr, cptr, n) __atomic_compare_exchange_n(ptr, cptr, n, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) + #define atomic_or_u32(ptr, n) __atomic_or_fetch(ptr, n, __ATOMIC_RELEASE) 
#define atomic_add_u32 atomic_add_u64 + #define atomic_and_u32 atomic_and_u64 #define atomic_cas_u32 atomic_cas_u64 #define atomic_load_u32 atomic_load_u64 diff --git a/ui.c b/ui.c @@ -2847,7 +2847,7 @@ draw_ui(BeamformerCtx *ctx, BeamformerInput *input, BeamformFrame *frame_to_draw if (ctx->os.shared_memory_region_lock(&ctx->shared_memory, sm->locks, lock, 0)) { mem_copy(&sm->parameters_ui, &ui->params, sizeof(ui->params)); ui->flush_params = 0; - ctx->csctx.shared_ubo_dirty = 1; + atomic_or_u32(&sm->dirty_regions, (1 << (lock - 1))); b32 dispatch = ctx->os.shared_memory_region_lock(&ctx->shared_memory, sm->locks, BeamformerSharedMemoryLockKind_DispatchCompute, 0);