ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git

Commit: c22719e6abf0f40c01b8456662df96365920a23a
Parent: 41cafc3e0f80428ed532f589f507f66193e4ccb8
Author: Randy Palamar
Date:   Thu, 23 Oct 2025 18:04:49 -0600

core: fix race condition in rf upload function

While the code prior to this commit contained a condition to
prevent the two helper threads from sleeping during live imaging,
the upload rate was still limited to 60 FPS. This was because the
upload thread was stuck in its loop until the main thread allowed
it to exit. Fixing this exposed a race in the library upload code:
the upload thread could preempt the library and take the scratch
space lock before the library had written its data.
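
For illustration, one plausible interleaving of that race (thread
names match the code; the exact timing is an assumption):

	library thread                     upload thread
	--------------                     -------------
	lock(UploadRF)
	                                   sees locks[UploadRF] != 0
	                                   lock(ScratchSpace)
	                                   uploads stale scratch contents
	lock(ScratchSpace) succeeds late;
	new data misses this upload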

To solve this I use scratch_rf_size as a separate synchronization
point. If the value is 0 the upload thread will not try to lock
the scratch region. If it is any other value the upload thread
atomically swaps it to zero and takes the lock (the library cannot
take the lock again until the upload thread releases the UploadRF
lock).
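
The shape of that handshake, as a minimal standalone sketch built
on the GCC/Clang builtins the project's atomic macros wrap
(publish/claim are illustrative names, not functions from this
codebase):

	#include <stdint.h>

	static uint32_t scratch_rf_size; /* library writes, upload thread reads */

	/* library side: called after the data is copied into the scratch
	 * region; publishing the size last means a nonzero value implies
	 * the copy is already complete */
	void publish(uint32_t data_size)
	{
		__atomic_store_n(&scratch_rf_size, data_size, __ATOMIC_SEQ_CST);
	}

	/* upload side: atomically claim the pending size; a result of 0
	 * means nothing was published and the scratch lock is never taken */
	uint32_t claim(void)
	{
		return __atomic_exchange_n(&scratch_rf_size, 0, __ATOMIC_SEQ_CST);
	}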

Furthermore I made a pass through all the places where values are
modified across threads and ensured the accesses are atomic; this
is the only way to guarantee that a new value becomes visible to
other threads. All atomic operations for GCC/Clang were switched
to SEQ_CST; while some of them could technically use weaker
orderings, I do not want to think about that until they become a
performance issue (which they probably never will).
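
As a standalone illustration of the visibility point (not code
from this repository): a plain flag write may legally be kept in a
register or reordered, while the SEQ_CST builtins force the store
out to memory and give all threads one agreed order of operations.

	#include <stdint.h>

	static uint32_t flag;

	/* plain accesses: the compiler may cache `flag` in a register, so
	 * a thread spinning on it can miss the update entirely */
	void signal_plain(void)  { flag = 1; }

	/* SEQ_CST atomics: the store is forced to memory and all threads
	 * observe atomic operations in a single total order */
	void signal_atomic(void) { __atomic_store_n(&flag, 1, __ATOMIC_SEQ_CST); }
	int  poll_atomic(void)   { return (int)__atomic_load_n(&flag, __ATOMIC_SEQ_CST); }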

Diffstat:
M beamformer.c                 | 30 +++++++++++++++---------------
M beamformer_shared_memory.c   |  2 +-
M helpers/ogl_beamformer_lib.c |  4 ++--
M intrinsics.c                 | 12 ++++++------
M os_linux.c                   |  6 ++----
M static.c                     | 13 +++++++------
6 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/beamformer.c b/beamformer.c
@@ -1205,15 +1205,15 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena, iptr gl_c
 	BeamformerRFBuffer *rf = &cs->rf_buffer;
 	u32 slot = rf->compute_index % countof(rf->compute_syncs);
-	/* NOTE(rnp): compute indirect is used when uploading data. in this case the thread
-	 * must wait on an upload fence. if the fence doesn't yet exist the thread must wait */
-	if (work->kind == BeamformerWorkKind_ComputeIndirect)
+	if (work->kind == BeamformerWorkKind_ComputeIndirect) {
+		/* NOTE(rnp): compute indirect is used when uploading data. if compute thread
+		 * preempts upload it must wait for the fence to exist. then it must tell the
+		 * GPU to wait for upload to complete before it can start compute */
 		spin_wait(!atomic_load_u64(rf->upload_syncs + slot));
-	if (rf->upload_syncs[slot]) {
-		rf->compute_index++;
 		glWaitSync(rf->upload_syncs[slot], 0, GL_TIMEOUT_IGNORED);
 		glDeleteSync(rf->upload_syncs[slot]);
+		rf->compute_index++;
 	} else {
 		slot = (rf->compute_index - 1) % countof(rf->compute_syncs);
 	}
@@ -1225,9 +1225,8 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena, iptr gl_c
 		glEndQuery(GL_TIME_ELAPSED);
 		if (work->kind == BeamformerWorkKind_ComputeIndirect) {
-			rf->compute_syncs[slot] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
-			rf->upload_syncs[slot]  = 0;
-			memory_write_barrier();
+			atomic_store_u64(rf->compute_syncs + slot, glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0));
+			atomic_store_u64(rf->upload_syncs + slot, 0);
 		}
 	}
@@ -1365,11 +1364,13 @@ DEBUG_EXPORT BEAMFORMER_RF_UPLOAD_FN(beamformer_rf_upload)
 	BeamformerSharedMemoryLockKind scratch_lock = BeamformerSharedMemoryLockKind_ScratchSpace;
 	BeamformerSharedMemoryLockKind upload_lock  = BeamformerSharedMemoryLockKind_UploadRF;
-	if (sm->locks[upload_lock] &&
+	u32 scratch_rf_size;
+	if (atomic_load_u32(sm->locks + upload_lock) &&
+	    (scratch_rf_size = atomic_swap_u32(&sm->scratch_rf_size, 0)) &&
 	    os_shared_memory_region_lock(ctx->shared_memory, sm->locks, (i32)scratch_lock, (u32)-1))
 	{
 		BeamformerRFBuffer *rf = ctx->rf_buffer;
-		rf->active_rf_size = (u32)round_up_to(sm->scratch_rf_size, 64);
+		rf->active_rf_size = (u32)round_up_to(scratch_rf_size, 64);
 		if (rf->size < rf->active_rf_size)
 			beamformer_rf_buffer_allocate(rf, rf->active_rf_size, arena);
@@ -1380,7 +1381,7 @@ DEBUG_EXPORT BEAMFORMER_RF_UPLOAD_FN(beamformer_rf_upload)
 		 * through this path. therefore it is safe to spin until it gets processed */
 		spin_wait(atomic_load_u64(rf->upload_syncs + slot));
-		if (rf->compute_syncs[slot]) {
+		if (atomic_load_u64(rf->compute_syncs + slot)) {
 			GLenum sync_result = glClientWaitSync(rf->compute_syncs[slot], 0, 1000000000);
 			if (sync_result == GL_TIMEOUT_EXPIRED || sync_result == GL_WAIT_FAILED) {
 				// TODO(rnp): what do?
@@ -1401,9 +1402,8 @@ DEBUG_EXPORT BEAMFORMER_RF_UPLOAD_FN(beamformer_rf_upload)
 		glFlushMappedNamedBufferRange(rf->ssbo, 0, (i32)rf->active_rf_size);
 		glUnmapNamedBuffer(rf->ssbo);
-		rf->upload_syncs[slot]  = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
-		rf->compute_syncs[slot] = 0;
-		memory_write_barrier();
+		atomic_store_u64(rf->upload_syncs + slot, glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0));
+		atomic_store_u64(rf->compute_syncs + slot, 0);
 		os_wake_waiters(ctx->compute_worker_sync);
@@ -1434,7 +1434,7 @@ DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
 	}
 	BeamformerSharedMemory *sm = ctx->shared_memory.region;
-	if (sm->locks[BeamformerSharedMemoryLockKind_UploadRF] != 0)
+	if (atomic_load_u32(sm->locks + BeamformerSharedMemoryLockKind_UploadRF))
 		os_wake_waiters(&ctx->os.upload_worker.sync_variable);
 	BeamformerFrame *frame = ctx->latest_frame;
diff --git a/beamformer_shared_memory.c b/beamformer_shared_memory.c
@@ -1,5 +1,5 @@
 /* See LICENSE for license details. */
-#define BEAMFORMER_SHARED_MEMORY_VERSION (16UL)
+#define BEAMFORMER_SHARED_MEMORY_VERSION (17UL)
 typedef struct BeamformerFrame BeamformerFrame;
diff --git a/helpers/ogl_beamformer_lib.c b/helpers/ogl_beamformer_lib.c
@@ -430,9 +430,9 @@ beamformer_push_data_base(void *data, u32 data_size, i32 timeout_ms)
 	if (lib_try_lock(BeamformerSharedMemoryLockKind_UploadRF, timeout_ms)) {
 		if (lib_try_lock(BeamformerSharedMemoryLockKind_ScratchSpace, 0)) {
 			mem_copy(scratch.beg, data, data_size);
-			/* TODO(rnp): need a better way to communicate this */
-			g_beamformer_library_context.bp->scratch_rf_size = data_size;
 			lib_release_lock(BeamformerSharedMemoryLockKind_ScratchSpace);
+			/* TODO(rnp): need a better way to communicate this */
+			atomic_store_u32(&g_beamformer_library_context.bp->scratch_rf_size, data_size);
 			result = 1;
 		}
 	}
diff --git a/intrinsics.c b/intrinsics.c
@@ -67,13 +67,13 @@
   #define memory_write_barrier()       asm volatile ("" ::: "memory")
-  #define atomic_add_u64(ptr, n)       __atomic_fetch_add(ptr, n, __ATOMIC_ACQ_REL)
-  #define atomic_and_u64(ptr, n)       __atomic_and_fetch(ptr, n, __ATOMIC_RELEASE)
+  #define atomic_add_u64(ptr, n)       __atomic_fetch_add(ptr, n, __ATOMIC_SEQ_CST)
+  #define atomic_and_u64(ptr, n)       __atomic_and_fetch(ptr, n, __ATOMIC_SEQ_CST)
   #define atomic_cas_u64(ptr, cptr, n) __atomic_compare_exchange_n(ptr, cptr, n, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
-  #define atomic_load_u64(ptr)         __atomic_load_n(ptr, __ATOMIC_ACQUIRE)
-  #define atomic_or_u32(ptr, n)        __atomic_or_fetch(ptr, n, __ATOMIC_RELEASE)
-  #define atomic_store_u64(ptr, n)     __atomic_store_n(ptr, n, __ATOMIC_RELEASE)
-  #define atomic_swap_u64(ptr, n)      __atomic_exchange_n(ptr, n, __ATOMIC_RELEASE)
+  #define atomic_load_u64(ptr)         __atomic_load_n(ptr, __ATOMIC_SEQ_CST)
+  #define atomic_or_u32(ptr, n)        __atomic_or_fetch(ptr, n, __ATOMIC_SEQ_CST)
+  #define atomic_store_u64(ptr, n)     __atomic_store_n(ptr, n, __ATOMIC_SEQ_CST)
+  #define atomic_swap_u64(ptr, n)      __atomic_exchange_n(ptr, n, __ATOMIC_SEQ_CST)
   #define atomic_add_u32               atomic_add_u64
   #define atomic_and_u32               atomic_and_u64
   #define atomic_cas_u32               atomic_cas_u64
diff --git a/os_linux.c b/os_linux.c
@@ -278,11 +278,9 @@ function OS_SHARED_MEMORY_LOCK_REGION_FN(os_shared_memory_region_lock)
 	b32 result = 0;
 	for (;;) {
 		i32 current = 0;
-		if (atomic_cas_u32(locks + lock_index, &current, 1)) {
+		if (atomic_cas_u32(locks + lock_index, &current, 1))
 			result = 1;
-			break;
-		}
-		if (!timeout_ms || !os_wait_on_value(locks + lock_index, current, timeout_ms))
+		if (result || !timeout_ms || !os_wait_on_value(locks + lock_index, current, timeout_ms))
 			break;
 	}
 	return result;
diff --git a/static.c b/static.c
@@ -264,14 +264,15 @@ worker_thread_sleep(GLWorkerThreadContext *ctx, BeamformerSharedMemory *sm)
 {
 	for (;;) {
 		i32 expected = 0;
-		if (atomic_cas_u32(&ctx->sync_variable, &expected, 1))
+		if (atomic_cas_u32(&ctx->sync_variable, &expected, 1) ||
+		    atomic_load_u32(&sm->live_imaging_parameters.active))
+		{
 			break;
-
-		if (!atomic_load_u32(&sm->live_imaging_parameters.active)) {
-			atomic_store_u32(&ctx->asleep, 1);
-			os_wait_on_value(&ctx->sync_variable, 1, (u32)-1);
-			atomic_store_u32(&ctx->asleep, 0);
 		}
+
+		atomic_store_u32(&ctx->asleep, 1);
+		os_wait_on_value(&ctx->sync_variable, 1, (u32)-1);
+		atomic_store_u32(&ctx->asleep, 0);
 	}
 }