ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git

Commit: 59c5ac37beff2ea3a82a0beefff08c5cd2f52d33
Parent: fc1489f3d8c138185d24551f188e96429681d002
Author: Randy Palamar
Date:   Sat,  4 Oct 2025 11:12:39 -0600

core: use a dirty programs flag bit for reloading compute shaders

there should be a way of tying this into the UI so that only the
required shader is reloaded instead of the whole pipeline, but that
requires further refactoring.
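
The pattern this commit moves to is a per-plan bitmask: callers flag a shader
slot by setting its bit, and the compute thread later swaps the whole mask to
zero and rebuilds only the flagged slots. Below is a minimal, self-contained
sketch of that idea; the types and the reload_shader() stub are hypothetical
stand-ins rather than the project's real API, and the atomics use GCC/clang
builtins as intrinsics.c does on that toolchain.

/* sketch of the dirty-programs bitmask pattern (illustrative only) */
#include <stdint.h>
#include <stdio.h>

typedef struct {
	uint32_t dirty_programs; /* bit N set => shader slot N needs a rebuild */
	uint32_t shader_count;
} ComputePlan;

/* hypothetical stand-in for load_compute_shader() */
static void reload_shader(ComputePlan *cp, uint32_t slot)
{
	(void)cp;
	printf("reloading shader slot %u\n", slot);
}

/* producer side: flag a slot instead of reloading immediately */
static void mark_program_dirty(ComputePlan *cp, uint32_t slot)
{
	__atomic_or_fetch(&cp->dirty_programs, 1u << slot, __ATOMIC_RELEASE);
}

/* consumer side: atomically take ownership of all pending bits, then
 * walk the set bits and reload only those slots */
static void drain_dirty_programs(ComputePlan *cp)
{
	uint32_t dirty = __atomic_exchange_n(&cp->dirty_programs, 0, __ATOMIC_ACQ_REL);
	while (dirty) {
		uint32_t slot = (uint32_t)__builtin_ctz(dirty); /* lowest set bit */
		reload_shader(cp, slot);
		dirty &= dirty - 1;                             /* clear that bit */
	}
}

int main(void)
{
	ComputePlan cp = {.shader_count = 4};
	mark_program_dirty(&cp, 1);
	mark_program_dirty(&cp, 3);
	drain_dirty_programs(&cp); /* reloads slots 1 and 3 only */
	return 0;
}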

Diffstat:
M beamformer.c               | 29 ++++++++++++++---------------
M beamformer.h               |  3 +++
M beamformer_shared_memory.c |  7 -------
M intrinsics.c               | 22 +++++++++++++---------
4 files changed, 30 insertions(+), 31 deletions(-)

diff --git a/beamformer.c b/beamformer.c
@@ -5,10 +5,6 @@
  * in the shader
  * - this will also remove the need for the channel mapping in the decode shader
  * [ ]: refactor: ui: reload only shader which is affected by the interaction
- * [x]: refactor: fancier hot reloading for JIT shaders
- *      - loop over all active blocks
- *      - loop over shader sets per block
- *      - when match found reload it
  * [ ]: BeamformWorkQueue -> BeamformerWorkQueue
  * [ ]: need to keep track of gpu memory in some way
  *      - want to be able to store more than 16 2D frames but limit 3D frames
@@ -306,9 +302,8 @@ function b32
 fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, BeamformerViewPlaneTag plane,
                         u32 parameter_block, b32 indirect)
 {
-	b32 result = 0;
-	if (work) {
-		result = 1;
+	b32 result = work != 0;
+	if (result) {
 		u32 frame_id    = atomic_add_u32(&ctx->next_render_frame_index, 1);
 		u32 frame_index = frame_id % countof(ctx->beamform_frames);
 		work->kind = indirect? BeamformerWorkKind_ComputeIndirect : BeamformerWorkKind_Compute;
@@ -805,11 +800,7 @@ function void
 beamformer_commit_parameter_block(BeamformerCtx *ctx, BeamformerComputePlan *cp, u32 block, Arena arena)
 {
 	BeamformerParameterBlock *pb = beamformer_parameter_block_lock(&ctx->shared_memory, block, -1);
-	for (u32 region = ctz_u32(pb->dirty_regions);
-	     region != 32;
-	     region = ctz_u32(pb->dirty_regions))
-	{
-		mark_parameter_block_region_clean(ctx->shared_memory.region, block, region);
+	for EachBit(pb->dirty_regions, region) {
 		switch (region) {
 		case BeamformerParameterBlockRegion_ComputePipeline:
 		case BeamformerParameterBlockRegion_Parameters:
@@ -822,7 +813,7 @@ beamformer_commit_parameter_block(BeamformerCtx *ctx, BeamformerComputePlan *cp,
 			pb->dirty_regions &= ~mask;

 			for (u32 shader_slot = 0; shader_slot < cp->pipeline.shader_count; shader_slot++)
-				load_compute_shader(ctx, cp, shader_slot, arena);
+				cp->dirty_programs |= 1 << shader_slot;

 #define X(k, t, v) glNamedBufferSubData(cp->ubos[BeamformerComputeUBOKind_##k], \
 		                               0, sizeof(t), &cp->v ## _ubo_data);
@@ -1129,12 +1120,13 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena, iptr gl_c
 				for (u32 slot = 0; slot < cp->pipeline.shader_count; slot++) {
 					i32 shader_index = beamformer_shader_reloadable_index_by_shader[cp->pipeline.shaders[slot]];
 					if (beamformer_reloadable_shader_kinds[shader_index] == work->reload_shader)
-						load_compute_shader(ctx, cp, slot, *arena);
+						cp->dirty_programs |= 1 << slot;
 				}
 			}

 			if (ctx->latest_frame && !sm->live_imaging_parameters.active) {
-				fill_frame_compute_work(ctx, work, ctx->latest_frame->view_plane_tag, 0, 0);
+				fill_frame_compute_work(ctx, work, ctx->latest_frame->view_plane_tag,
+				                        ctx->latest_frame->parameter_block, 0);
 				can_commit = 0;
 			}
 		}break;
@@ -1199,6 +1191,13 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena, iptr gl_c
 			post_sync_barrier(&ctx->shared_memory, work->lock, sm->locks);

+			u32 dirty_programs = atomic_swap_u32(&cp->dirty_programs, 0);
+			static_assert(ISPOWEROF2(BeamformerMaxComputeShaderStages),
+			              "max compute shader stages must be power of 2");
+			assert((dirty_programs & ~((u32)BeamformerMaxComputeShaderStages - 1)) == 0);
+			for EachBit(dirty_programs, slot)
+				load_compute_shader(ctx, cp, (u32)slot, *arena);
+
 			atomic_store_u32(&cs->processing_compute, 1);
 			start_renderdoc_capture(gl_context);

diff --git a/beamformer.h b/beamformer.h
@@ -158,6 +158,8 @@ struct BeamformerComputePlan {
 	uv3 decode_dispatch;
 	uv3 demod_dispatch;

+	u32 dirty_programs;
+
 	u32 rf_size;
 	i32 hadamard_order;
 	b32 iq_pipeline;
@@ -278,6 +280,7 @@ struct BeamformerFrame {
 	GLenum gl_kind;
 	u32 id;
 	u32 compound_count;
+	u32 parameter_block;
 	BeamformerAcquisitionKind acquisition_kind;
 	BeamformerViewPlaneTag    view_plane_tag;
diff --git a/beamformer_shared_memory.c b/beamformer_shared_memory.c
@@ -279,13 +279,6 @@ mark_parameter_block_region_dirty(BeamformerSharedMemory *sm, u32 block, Beamfor
 }

 function void
-mark_parameter_block_region_clean(BeamformerSharedMemory *sm, u32 block, BeamformerParameterBlockRegions region)
-{
-	BeamformerParameterBlock *pb = beamformer_parameter_block(sm, block);
-	atomic_and_u32(&pb->dirty_regions, ~(1u << region));
-}
-
-function void
 post_sync_barrier(SharedMemoryRegion *sm, BeamformerSharedMemoryLockKind lock, i32 *locks)
 {
 	/* NOTE(rnp): debug: here it is not a bug to release the lock if it
diff --git a/intrinsics.c b/intrinsics.c
@@ -28,17 +28,19 @@
 	#define memory_write_barrier()       _WriteBarrier()

-	#define atomic_store_u64(ptr, n)     *((volatile u64 *)(ptr)) = (n)
-	#define atomic_store_u32(ptr, n)     *((volatile u32 *)(ptr)) = (n)
-	#define atomic_load_u64(ptr)         *((volatile u64 *)(ptr))
-	#define atomic_load_u32(ptr)         *((volatile u32 *)(ptr))
-	#define atomic_add_u64(ptr, n)       _InterlockedExchangeAdd64((volatile u64 *)(ptr), (n))
 	#define atomic_add_u32(ptr, n)       _InterlockedExchangeAdd((volatile u32 *)(ptr), (n))
-	#define atomic_and_u64(ptr, n)       _InterlockedAnd64((volatile u64 *)(ptr), (n))
+	#define atomic_add_u64(ptr, n)       _InterlockedExchangeAdd64((volatile u64 *)(ptr), (n))
 	#define atomic_and_u32(ptr, n)       _InterlockedAnd((volatile u32 *)(ptr), (n))
-	#define atomic_cas_u64(ptr, cptr, n) (_InterlockedCompareExchange64((volatile u64 *)(ptr), *(cptr), (n)) == *(cptr))
+	#define atomic_and_u64(ptr, n)       _InterlockedAnd64((volatile u64 *)(ptr), (n))
 	#define atomic_cas_u32(ptr, cptr, n) (_InterlockedCompareExchange((volatile u32 *)(ptr), *(cptr), (n)) == *(cptr))
+	#define atomic_cas_u64(ptr, cptr, n) (_InterlockedCompareExchange64((volatile u64 *)(ptr), *(cptr), (n)) == *(cptr))
+	#define atomic_load_u32(ptr)         *((volatile u32 *)(ptr))
+	#define atomic_load_u64(ptr)         *((volatile u64 *)(ptr))
 	#define atomic_or_u32(ptr, n)        _InterlockedOr((volatile u32 *)(ptr), (n))
+	#define atomic_store_u32(ptr, n)     *((volatile u32 *)(ptr)) = (n)
+	#define atomic_store_u64(ptr, n)     *((volatile u64 *)(ptr)) = (n)
+	#define atomic_swap_u32(ptr, n)      _InterlockedExchange((volatile u32 *)(ptr), n)
+	#define atomic_swap_u64(ptr, n)      _InterlockedExchange64((volatile u32 *)(ptr), n)

 	#define atan2_f32(y, x)              atan2f(y, x)
 	#define cos_f32(a)                   cosf(a)
@@ -65,17 +67,19 @@
 	#define memory_write_barrier()   asm volatile ("" ::: "memory")

-	#define atomic_store_u64(ptr, n) __atomic_store_n(ptr, n, __ATOMIC_RELEASE)
-	#define atomic_load_u64(ptr)     __atomic_load_n(ptr, __ATOMIC_ACQUIRE)
 	#define atomic_add_u64(ptr, n)   __atomic_fetch_add(ptr, n, __ATOMIC_ACQ_REL)
 	#define atomic_and_u64(ptr, n)   __atomic_and_fetch(ptr, n, __ATOMIC_RELEASE)
 	#define atomic_cas_u64(ptr, cptr, n) __atomic_compare_exchange_n(ptr, cptr, n, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+	#define atomic_load_u64(ptr)     __atomic_load_n(ptr, __ATOMIC_ACQUIRE)
 	#define atomic_or_u32(ptr, n)    __atomic_or_fetch(ptr, n, __ATOMIC_RELEASE)
+	#define atomic_store_u64(ptr, n) __atomic_store_n(ptr, n, __ATOMIC_RELEASE)
+	#define atomic_swap_u64(ptr, n)  __atomic_exchange_n(ptr, n, __ATOMIC_RELEASE)

 	#define atomic_add_u32           atomic_add_u64
 	#define atomic_and_u32           atomic_and_u64
 	#define atomic_cas_u32           atomic_cas_u64
 	#define atomic_load_u32          atomic_load_u64
 	#define atomic_store_u32         atomic_store_u64
+	#define atomic_swap_u32          atomic_swap_u64

 	#define atan2_f32(y, x)          __builtin_atan2f(y, x)
 	#define cos_f32(a)               __builtin_cosf(a)
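
The diff drains the mask with "for EachBit(mask, it)", whose definition is not
part of this commit. A plausible ctz-based formulation of such a macro, written
here purely as an assumption for illustration and not the repository's actual
macro, would be:

/* hypothetical EachBit: walks the set bits of a copy of `mask`, binding the
 * index of the current lowest set bit to `it` on each iteration */
#include <stdint.h>
#include <stdio.h>

typedef uint32_t u32;

#define EachBit(mask, it) \
	(u32 _m_##it = (mask), it = 0; _m_##it && ((it = (u32)__builtin_ctz(_m_##it)), 1); _m_##it &= _m_##it - 1)

int main(void)
{
	u32 dirty = (1u << 0) | (1u << 2) | (1u << 5);
	for EachBit(dirty, slot)
		printf("slot %u is dirty\n", slot); /* prints 0, 2, 5 */
	return 0;
}

With a macro along these lines, "for EachBit(dirty_programs, slot)
load_compute_shader(ctx, cp, slot, *arena)" visits each flagged slot exactly
once, lowest bit first, which is the behavior the new hunks in complete_queue
and beamformer_commit_parameter_block rely on.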