Commit: 59c5ac37beff2ea3a82a0beefff08c5cd2f52d33
Parent: fc1489f3d8c138185d24551f188e96429681d002
Author: Randy Palamar
Date: Sat, 4 Oct 2025 11:12:39 -0600
core: use a dirty programs flag bit for reloading compute shaders
there should be a way of tying this into the UI so that only the
affected shader is reloaded instead of the whole pipeline, but that
requires further refactoring.
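
in outline the new flow is (a sketch assembled from the hunks below,
not verbatim code from the tree):

	/* anywhere a reload is requested: mark the stage dirty */
	cp->dirty_programs |= 1 << slot;

	/* compute thread, before dispatch: drain and reload in one place */
	u32 dirty_programs = atomic_swap_u32(&cp->dirty_programs, 0);
	for EachBit(dirty_programs, slot)
		load_compute_shader(ctx, cp, (u32)slot, *arena);
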
Diffstat:
4 files changed, 30 insertions(+), 31 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -5,10 +5,6 @@
* in the shader
* - this will also remove the need for the channel mapping in the decode shader
* [ ]: refactor: ui: reload only shader which is affected by the interaction
- * [x]: refactor: fancier hot reloading for JIT shaders
- * - loop over all active blocks
- * - loop over shader sets per block
- * - when match found reload it
* [ ]: BeamformWorkQueue -> BeamformerWorkQueue
* [ ]: need to keep track of gpu memory in some way
* - want to be able to store more than 16 2D frames but limit 3D frames
@@ -306,9 +302,8 @@ function b32
fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, BeamformerViewPlaneTag plane,
u32 parameter_block, b32 indirect)
{
- b32 result = 0;
- if (work) {
- result = 1;
+ b32 result = work != 0;
+ if (result) {
u32 frame_id = atomic_add_u32(&ctx->next_render_frame_index, 1);
u32 frame_index = frame_id % countof(ctx->beamform_frames);
work->kind = indirect? BeamformerWorkKind_ComputeIndirect : BeamformerWorkKind_Compute;
@@ -805,11 +800,7 @@ function void
beamformer_commit_parameter_block(BeamformerCtx *ctx, BeamformerComputePlan *cp, u32 block, Arena arena)
{
BeamformerParameterBlock *pb = beamformer_parameter_block_lock(&ctx->shared_memory, block, -1);
- for (u32 region = ctz_u32(pb->dirty_regions);
- region != 32;
- region = ctz_u32(pb->dirty_regions))
- {
- mark_parameter_block_region_clean(ctx->shared_memory.region, block, region);
+ for EachBit(pb->dirty_regions, region) {
switch (region) {
case BeamformerParameterBlockRegion_ComputePipeline:
case BeamformerParameterBlockRegion_Parameters:
@@ -822,7 +813,7 @@ beamformer_commit_parameter_block(BeamformerCtx *ctx, BeamformerComputePlan *cp,
pb->dirty_regions &= ~mask;
for (u32 shader_slot = 0; shader_slot < cp->pipeline.shader_count; shader_slot++)
- load_compute_shader(ctx, cp, shader_slot, arena);
+ cp->dirty_programs |= 1 << shader_slot;
#define X(k, t, v) glNamedBufferSubData(cp->ubos[BeamformerComputeUBOKind_##k], \
0, sizeof(t), &cp->v ## _ubo_data);
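
EachBit is defined elsewhere in the tree; one plausible definition,
reconstructed from the ctz_u32 loop it replaces (assuming ctz_u32
returns 32 for a zero word, as the old loop condition implies):

	#define EachBit(word, it) (u32 it##_rest = (word), it = ctz_u32(it##_rest); \
	                           it != 32;                                        \
	                           it##_rest &= it##_rest - 1, it = ctz_u32(it##_rest))

The real macro may differ, e.g. by clearing bits as it visits them.
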
@@ -1129,12 +1120,13 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena, iptr gl_c
for (u32 slot = 0; slot < cp->pipeline.shader_count; slot++) {
i32 shader_index = beamformer_shader_reloadable_index_by_shader[cp->pipeline.shaders[slot]];
if (beamformer_reloadable_shader_kinds[shader_index] == work->reload_shader)
- load_compute_shader(ctx, cp, slot, *arena);
+ cp->dirty_programs |= 1 << slot;
}
}
if (ctx->latest_frame && !sm->live_imaging_parameters.active) {
- fill_frame_compute_work(ctx, work, ctx->latest_frame->view_plane_tag, 0, 0);
+ fill_frame_compute_work(ctx, work, ctx->latest_frame->view_plane_tag,
+ ctx->latest_frame->parameter_block, 0);
can_commit = 0;
}
}break;
@@ -1199,6 +1191,13 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena, iptr gl_c
post_sync_barrier(&ctx->shared_memory, work->lock, sm->locks);
+ u32 dirty_programs = atomic_swap_u32(&cp->dirty_programs, 0);
+ static_assert(ISPOWEROF2(BeamformerMaxComputeShaderStages),
+ "max compute shader stages must be power of 2");
+ assert((dirty_programs & ~(((u32)1 << BeamformerMaxComputeShaderStages) - 1)) == 0);
+ for EachBit(dirty_programs, slot)
+ load_compute_shader(ctx, cp, (u32)slot, *arena);
+
atomic_store_u32(&cs->processing_compute, 1);
start_renderdoc_capture(gl_context);
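
The drain uses atomic_swap_u32 rather than a load followed by a store;
a sketch of the hazard that avoids (illustrative fragment, not code
from the tree):

	/* racy: another thread can set a bit between the load and the
	 * store, and that reload request is silently dropped */
	u32 dirty = cp->dirty_programs;
	cp->dirty_programs = 0;

	/* atomic: the read and the clear happen as one step, so every bit
	 * set before the swap is observed exactly once */
	u32 dirty_programs = atomic_swap_u32(&cp->dirty_programs, 0);
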
diff --git a/beamformer.h b/beamformer.h
@@ -158,6 +158,8 @@ struct BeamformerComputePlan {
uv3 decode_dispatch;
uv3 demod_dispatch;
+ u32 dirty_programs;
+
u32 rf_size;
i32 hadamard_order;
b32 iq_pipeline;
@@ -278,6 +280,7 @@ struct BeamformerFrame {
GLenum gl_kind;
u32 id;
u32 compound_count;
+ u32 parameter_block;
BeamformerAcquisitionKind acquisition_kind;
BeamformerViewPlaneTag view_plane_tag;
diff --git a/beamformer_shared_memory.c b/beamformer_shared_memory.c
@@ -279,13 +279,6 @@ mark_parameter_block_region_dirty(BeamformerSharedMemory *sm, u32 block, Beamfor
}
function void
-mark_parameter_block_region_clean(BeamformerSharedMemory *sm, u32 block, BeamformerParameterBlockRegions region)
-{
- BeamformerParameterBlock *pb = beamformer_parameter_block(sm, block);
- atomic_and_u32(&pb->dirty_regions, ~(1u << region));
-}
-
-function void
post_sync_barrier(SharedMemoryRegion *sm, BeamformerSharedMemoryLockKind lock, i32 *locks)
{
/* NOTE(rnp): debug: here it is not a bug to release the lock if it
diff --git a/intrinsics.c b/intrinsics.c
@@ -28,17 +28,19 @@
#define memory_write_barrier() _WriteBarrier()
- #define atomic_store_u64(ptr, n) *((volatile u64 *)(ptr)) = (n)
- #define atomic_store_u32(ptr, n) *((volatile u32 *)(ptr)) = (n)
- #define atomic_load_u64(ptr) *((volatile u64 *)(ptr))
- #define atomic_load_u32(ptr) *((volatile u32 *)(ptr))
- #define atomic_add_u64(ptr, n) _InterlockedExchangeAdd64((volatile u64 *)(ptr), (n))
#define atomic_add_u32(ptr, n) _InterlockedExchangeAdd((volatile u32 *)(ptr), (n))
- #define atomic_and_u64(ptr, n) _InterlockedAnd64((volatile u64 *)(ptr), (n))
+ #define atomic_add_u64(ptr, n) _InterlockedExchangeAdd64((volatile u64 *)(ptr), (n))
#define atomic_and_u32(ptr, n) _InterlockedAnd((volatile u32 *)(ptr), (n))
- #define atomic_cas_u64(ptr, cptr, n) (_InterlockedCompareExchange64((volatile u64 *)(ptr), *(cptr), (n)) == *(cptr))
+ #define atomic_and_u64(ptr, n) _InterlockedAnd64((volatile u64 *)(ptr), (n))
#define atomic_cas_u32(ptr, cptr, n) (_InterlockedCompareExchange((volatile u32 *)(ptr), *(cptr), (n)) == *(cptr))
+ #define atomic_cas_u64(ptr, cptr, n) (_InterlockedCompareExchange64((volatile u64 *)(ptr), *(cptr), (n)) == *(cptr))
+ #define atomic_load_u32(ptr) *((volatile u32 *)(ptr))
+ #define atomic_load_u64(ptr) *((volatile u64 *)(ptr))
#define atomic_or_u32(ptr, n) _InterlockedOr((volatile u32 *)(ptr), (n))
+ #define atomic_store_u32(ptr, n) *((volatile u32 *)(ptr)) = (n)
+ #define atomic_store_u64(ptr, n) *((volatile u64 *)(ptr)) = (n)
+ #define atomic_swap_u32(ptr, n) _InterlockedExchange((volatile u32 *)(ptr), (n))
+ #define atomic_swap_u64(ptr, n) _InterlockedExchange64((volatile u64 *)(ptr), (n))
#define atan2_f32(y, x) atan2f(y, x)
#define cos_f32(a) cosf(a)
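
Both _InterlockedExchange and the __atomic_exchange_n used below return
the value the target held before the store, which is what the
dirty-program drain relies on. An illustrative fragment (made-up values,
not code from the tree):

	u32 flags = 0;
	atomic_or_u32(&flags, 1u << 3);        /* set bit 3                   */
	u32 prev = atomic_swap_u32(&flags, 0); /* prev == 1u << 3, flags == 0 */
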
@@ -65,17 +67,19 @@
#define memory_write_barrier() asm volatile ("" ::: "memory")
- #define atomic_store_u64(ptr, n) __atomic_store_n(ptr, n, __ATOMIC_RELEASE)
- #define atomic_load_u64(ptr) __atomic_load_n(ptr, __ATOMIC_ACQUIRE)
#define atomic_add_u64(ptr, n) __atomic_fetch_add(ptr, n, __ATOMIC_ACQ_REL)
#define atomic_and_u64(ptr, n) __atomic_and_fetch(ptr, n, __ATOMIC_RELEASE)
#define atomic_cas_u64(ptr, cptr, n) __atomic_compare_exchange_n(ptr, cptr, n, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
+ #define atomic_load_u64(ptr) __atomic_load_n(ptr, __ATOMIC_ACQUIRE)
#define atomic_or_u32(ptr, n) __atomic_or_fetch(ptr, n, __ATOMIC_RELEASE)
+ #define atomic_store_u64(ptr, n) __atomic_store_n(ptr, n, __ATOMIC_RELEASE)
+ #define atomic_swap_u64(ptr, n) __atomic_exchange_n(ptr, n, __ATOMIC_RELEASE)
#define atomic_add_u32 atomic_add_u64
#define atomic_and_u32 atomic_and_u64
#define atomic_cas_u32 atomic_cas_u64
#define atomic_load_u32 atomic_load_u64
#define atomic_store_u32 atomic_store_u64
+ #define atomic_swap_u32 atomic_swap_u64
#define atan2_f32(y, x) __builtin_atan2f(y, x)
#define cos_f32(a) __builtin_cosf(a)
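
The u32 aliases work because the __atomic builtins are type generic:
the u64-named macros expand correctly for u32 objects too. For example
(illustrative, not code from the tree):

	u32 word = 0;
	atomic_swap_u32(&word, 5u); /* __atomic_exchange_n(&word, 5u, __ATOMIC_RELEASE) */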