ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git

Commit: 59c5ac37beff2ea3a82a0beefff08c5cd2f52d33
Parent: fc1489f3d8c138185d24551f188e96429681d002
Author: Randy Palamar
Date:   Sat,  4 Oct 2025 11:12:39 -0600

core: use a dirty programs flag bit for reloading compute shaders

there should be a way of tying this into the UI so that only the
required shader is reloaded instead of the whole pipeline, but that
requires further refactoring.
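
The pattern this commit moves to is a per-plan bitmask: callers flag a shader
slot by setting its bit, and the compute thread later swaps the whole mask to
zero and rebuilds only the flagged slots. Below is a minimal, self-contained
sketch of that idea; the types and the reload_shader() stub are hypothetical
stand-ins rather than the project's real API, and the atomics use GCC/clang
builtins as intrinsics.c does on that toolchain.

/* sketch of the dirty-programs bitmask pattern (illustrative only) */
#include <stdint.h>
#include <stdio.h>

typedef struct {
	uint32_t dirty_programs; /* bit N set => shader slot N needs a rebuild */
	uint32_t shader_count;
} ComputePlan;

/* hypothetical stand-in for load_compute_shader() */
static void reload_shader(ComputePlan *cp, uint32_t slot)
{
	(void)cp;
	printf("reloading shader slot %u\n", slot);
}

/* producer side: flag a slot instead of reloading immediately */
static void mark_program_dirty(ComputePlan *cp, uint32_t slot)
{
	__atomic_or_fetch(&cp->dirty_programs, 1u << slot, __ATOMIC_RELEASE);
}

/* consumer side: atomically take ownership of all pending bits, then
 * walk the set bits and reload only those slots */
static void drain_dirty_programs(ComputePlan *cp)
{
	uint32_t dirty = __atomic_exchange_n(&cp->dirty_programs, 0, __ATOMIC_ACQ_REL);
	while (dirty) {
		uint32_t slot = (uint32_t)__builtin_ctz(dirty); /* lowest set bit */
		reload_shader(cp, slot);
		dirty &= dirty - 1;                             /* clear that bit */
	}
}

int main(void)
{
	ComputePlan cp = {.shader_count = 4};
	mark_program_dirty(&cp, 1);
	mark_program_dirty(&cp, 3);
	drain_dirty_programs(&cp); /* reloads slots 1 and 3 only */
	return 0;
}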

Diffstat:
M beamformer.c               | 29 ++++++++++++++---------------
M beamformer.h               |  3 +++
M beamformer_shared_memory.c |  7 -------
M intrinsics.c               | 22 +++++++++++++---------
4 files changed, 30 insertions(+), 31 deletions(-)

diff --git a/beamformer.c b/beamformer.c
@@ -5,10 +5,6 @@
  * in the shader
  * - this will also remove the need for the channel mapping in the decode shader
  * [ ]: refactor: ui: reload only shader which is affected by the interaction
- * [x]: refactor: fancier hot reloading for JIT shaders
- *      - loop over all active blocks
- *      - loop over shader sets per block
- *      - when match found reload it
  * [ ]: BeamformWorkQueue -> BeamformerWorkQueue
  * [ ]: need to keep track of gpu memory in some way
  *      - want to be able to store more than 16 2D frames but limit 3D frames
@@ -306,9 +302,8 @@ function b32
 fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, BeamformerViewPlaneTag plane,
                         u32 parameter_block, b32 indirect)
 {
-	b32 result = 0;
-	if (work) {
-		result = 1;
+	b32 result = work != 0;
+	if (result) {
 		u32 frame_id    = atomic_add_u32(&ctx->next_render_frame_index, 1);
 		u32 frame_index = frame_id % countof(ctx->beamform_frames);
 		work->kind = indirect? BeamformerWorkKind_ComputeIndirect : BeamformerWorkKind_Compute;
@@ -805,11 +800,7 @@ function void
 beamformer_commit_parameter_block(BeamformerCtx *ctx, BeamformerComputePlan *cp, u32 block, Arena arena)
 {
 	BeamformerParameterBlock *pb = beamformer_parameter_block_lock(&ctx->shared_memory, block, -1);
-	for (u32 region = ctz_u32(pb->dirty_regions);
-	     region != 32;
-	     region = ctz_u32(pb->dirty_regions))
-	{
-		mark_parameter_block_region_clean(ctx->shared_memory.region, block, region);
+	for EachBit(pb->dirty_regions, region) {
 		switch (region) {
 		case BeamformerParameterBlockRegion_ComputePipeline:
 		case BeamformerParameterBlockRegion_Parameters:
@@ -822,7 +813,7 @@ beamformer_commit_parameter_block(BeamformerCtx *ctx, BeamformerComputePlan *cp,
 			pb->dirty_regions &= ~mask;

 			for (u32 shader_slot = 0; shader_slot < cp->pipeline.shader_count; shader_slot++)
-				load_compute_shader(ctx, cp, shader_slot, arena);
+				cp->dirty_programs |= 1 << shader_slot;

 #define X(k, t, v) glNamedBufferSubData(cp->ubos[BeamformerComputeUBOKind_##k], \
 		                               0, sizeof(t), &cp->v ## _ubo_data);
@@ -1129,12 +1120,13 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena, iptr gl_c
 				for (u32 slot = 0; slot < cp->pipeline.shader_count; slot++) {
 					i32 shader_index = beamformer_shader_reloadable_index_by_shader[cp->pipeline.shaders[slot]];
 					if (beamformer_reloadable_shader_kinds[shader_index] == work->reload_shader)
-						load_compute_shader(ctx, cp, slot, *arena);
+						cp->dirty_programs |= 1 << slot;
 				}
 			}

 			if (ctx->latest_frame && !sm->live_imaging_parameters.active) {
-				fill_frame_compute_work(ctx, work, ctx->latest_frame->view_plane_tag, 0, 0);
+				fill_frame_compute_work(ctx, work, ctx->latest_frame->view_plane_tag,
+				                        ctx->latest_frame->parameter_block, 0);
 				can_commit = 0;
 			}
 		}break;
@@ -1199,6 +1191,13 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena, iptr gl_c
 			post_sync_barrier(&ctx->shared_memory, work->lock, sm->locks);

+			u32 dirty_programs = atomic_swap_u32(&cp->dirty_programs, 0);
+			static_assert(ISPOWEROF2(BeamformerMaxComputeShaderStages),
+			              "max compute shader stages must be power of 2");
+			assert((dirty_programs & ~((u32)BeamformerMaxComputeShaderStages - 1)) == 0);
+			for EachBit(dirty_programs, slot)
+				load_compute_shader(ctx, cp, (u32)slot, *arena);
+
 			atomic_store_u32(&cs->processing_compute, 1);
 			start_renderdoc_capture(gl_context);

diff --git a/beamformer.h b/beamformer.h
@@ -158,6 +158,8 @@ struct BeamformerComputePlan {
 	uv3 decode_dispatch;
 	uv3 demod_dispatch;

+	u32 dirty_programs;
+
 	u32 rf_size;
 	i32 hadamard_order;
 	b32 iq_pipeline;
@@ -278,6 +280,7 @@ struct BeamformerFrame {
 	GLenum gl_kind;
 	u32 id;
 	u32 compound_count;
+	u32 parameter_block;
 	BeamformerAcquisitionKind acquisition_kind;
 	BeamformerViewPlaneTag    view_plane_tag;
diff --git a/beamformer_shared_memory.c b/beamformer_shared_memory.c
@@ -279,13 +279,6 @@ mark_parameter_block_region_dirty(BeamformerSharedMemory *sm, u32 block, Beamfor
 }

 function void
-mark_parameter_block_region_clean(BeamformerSharedMemory *sm, u32 block, BeamformerParameterBlockRegions region)
-{
-	BeamformerParameterBlock *pb = beamformer_parameter_block(sm, block);
-	atomic_and_u32(&pb->dirty_regions, ~(1u << region));
-}
-
-function void
 post_sync_barrier(SharedMemoryRegion *sm, BeamformerSharedMemoryLockKind lock, i32 *locks)
 {
 	/* NOTE(rnp): debug: here it is not a bug to release the lock if it
diff --git a/intrinsics.c b/intrinsics.c
@@ -28,17 +28,19 @@
 	#define memory_write_barrier()       _WriteBarrier()

-	#define atomic_store_u64(ptr, n)     *((volatile u64 *)(ptr)) = (n)
-	#define atomic_store_u32(ptr, n)     *((volatile u32 *)(ptr)) = (n)
-	#define atomic_load_u64(ptr)         *((volatile u64 *)(ptr))
-	#define atomic_load_u32(ptr)         *((volatile u32 *)(ptr))
-	#define atomic_add_u64(ptr, n)       _InterlockedExchangeAdd64((volatile u64 *)(ptr), (n))
 	#define atomic_add_u32(ptr, n)       _InterlockedExchangeAdd((volatile u32 *)(ptr), (n))
-	#define atomic_and_u64(ptr, n)       _InterlockedAnd64((volatile u64 *)(ptr), (n))
+	#define atomic_add_u64(ptr, n)       _InterlockedExchangeAdd64((volatile u64 *)(ptr), (n))
 	#define atomic_and_u32(ptr, n)       _InterlockedAnd((volatile u32 *)(ptr), (n))
-	#define atomic_cas_u64(ptr, cptr, n) (_InterlockedCompareExchange64((volatile u64 *)(ptr), *(cptr), (n)) == *(cptr))
+	#define atomic_and_u64(ptr, n)       _InterlockedAnd64((volatile u64 *)(ptr), (n))
 	#define atomic_cas_u32(ptr, cptr, n) (_InterlockedCompareExchange((volatile u32 *)(ptr), *(cptr), (n)) == *(cptr))
+	#define atomic_cas_u64(ptr, cptr, n) (_InterlockedCompareExchange64((volatile u64 *)(ptr), *(cptr), (n)) == *(cptr))
+	#define atomic_load_u32(ptr)         *((volatile u32 *)(ptr))
+	#define atomic_load_u64(ptr)         *((volatile u64 *)(ptr))
 	#define atomic_or_u32(ptr, n)        _InterlockedOr((volatile u32 *)(ptr), (n))
+	#define atomic_store_u32(ptr, n)     *((volatile u32 *)(ptr)) = (n)
+	#define atomic_store_u64(ptr, n)     *((volatile u64 *)(ptr)) = (n)
+	#define atomic_swap_u32(ptr, n)      _InterlockedExchange((volatile u32 *)(ptr), n)
+	#define atomic_swap_u64(ptr, n)      _InterlockedExchange64((volatile u32 *)(ptr), n)

 	#define atan2_f32(y, x)              atan2f(y, x)
 	#define cos_f32(a)                   cosf(a)
@@ -65,17 +67,19 @@
 	#define memory_write_barrier()   asm volatile ("" ::: "memory")

-	#define atomic_store_u64(ptr, n) __atomic_store_n(ptr, n, __ATOMIC_RELEASE)
-	#define atomic_load_u64(ptr)     __atomic_load_n(ptr, __ATOMIC_ACQUIRE)
 	#define atomic_add_u64(ptr, n)   __atomic_fetch_add(ptr, n, __ATOMIC_ACQ_REL)
 	#define atomic_and_u64(ptr, n)   __atomic_and_fetch(ptr, n, __ATOMIC_RELEASE)
 	#define atomic_cas_u64(ptr, cptr, n) __atomic_compare_exchange_n(ptr, cptr, n, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+	#define atomic_load_u64(ptr)     __atomic_load_n(ptr, __ATOMIC_ACQUIRE)
 	#define atomic_or_u32(ptr, n)    __atomic_or_fetch(ptr, n, __ATOMIC_RELEASE)
+	#define atomic_store_u64(ptr, n) __atomic_store_n(ptr, n, __ATOMIC_RELEASE)
+	#define atomic_swap_u64(ptr, n)  __atomic_exchange_n(ptr, n, __ATOMIC_RELEASE)

 	#define atomic_add_u32           atomic_add_u64
 	#define atomic_and_u32           atomic_and_u64
 	#define atomic_cas_u32           atomic_cas_u64
 	#define atomic_load_u32          atomic_load_u64
 	#define atomic_store_u32         atomic_store_u64
+	#define atomic_swap_u32          atomic_swap_u64

 	#define atan2_f32(y, x)          __builtin_atan2f(y, x)
 	#define cos_f32(a)               __builtin_cosf(a)
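
The diff drains the mask with "for EachBit(mask, it)", whose definition is not
part of this commit. A plausible ctz-based formulation of such a macro, written
here purely as an assumption for illustration and not the repository's actual
macro, would be:

/* hypothetical EachBit: walks the set bits of a copy of `mask`, binding the
 * index of the current lowest set bit to `it` on each iteration */
#include <stdint.h>
#include <stdio.h>

typedef uint32_t u32;

#define EachBit(mask, it) \
	(u32 _m_##it = (mask), it = 0; _m_##it && ((it = (u32)__builtin_ctz(_m_##it)), 1); _m_##it &= _m_##it - 1)

int main(void)
{
	u32 dirty = (1u << 0) | (1u << 2) | (1u << 5);
	for EachBit(dirty, slot)
		printf("slot %u is dirty\n", slot); /* prints 0, 2, 5 */
	return 0;
}

With a macro along these lines, "for EachBit(dirty_programs, slot)
load_compute_shader(ctx, cp, slot, *arena)" visits each flagged slot exactly
once, lowest bit first, which is the behavior the new hunks in complete_queue
and beamformer_commit_parameter_block rely on.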