ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

Commit: c268a6bc01452e94e82a881f1650d52ab0805dba
Parent: 0269e2ebd19872b3d2507bb0849519ec019d6a2c
Author: Randy Palamar
Date:   Thu, 13 Nov 2025 20:43:03 -0700

core: return to single thread for upload timeline

based on what we currently want to do with the data in flight,
L3 cache contention will prevent any gains from using multiple
threads.

The rest of the thread code remains, as it will be used elsewhere.

Diffstat:
Mbeamformer.c | 108++++++++++++++++++++++++++-----------------------------------------------------
Mbeamformer_shared_memory.c | 4++--
Mhelpers/ogl_beamformer_lib.c | 4++--
Mstatic.c | 66+++++-------------------------------------------------------------
4 files changed, 44 insertions(+), 138 deletions(-)

diff --git a/beamformer.c b/beamformer.c @@ -1414,93 +1414,55 @@ beamformer_rf_buffer_allocate(BeamformerRFBuffer *rf, u32 rf_size, b32 nvidia) DEBUG_EXPORT BEAMFORMER_RF_UPLOAD_FN(beamformer_rf_upload) { - struct load_context { - uptr buffer; - void *data; - u32 offset; - u32 channel_count; - u32 channel_stride_bytes; - b32 nvidia; - } load_context_store = {0}; - struct load_context *lctx = 0; - - BeamformerSharedMemory *sm = 0; + BeamformerSharedMemory *sm = ctx->shared_memory->region; BeamformerSharedMemoryLockKind scratch_lock = BeamformerSharedMemoryLockKind_ScratchSpace; BeamformerSharedMemoryLockKind upload_lock = BeamformerSharedMemoryLockKind_UploadRF; - u32 insertion_slot = 0; - if (lane_index() == 0) { - sm = ctx->shared_memory->region; - lctx = &load_context_store; - u32 scratch_rf_size; + u64 rf_block_rf_size; + if (atomic_load_u32(sm->locks + upload_lock) && + (rf_block_rf_size = atomic_swap_u64(&sm->rf_block_rf_size, 0)) && + os_shared_memory_region_lock(ctx->shared_memory, sm->locks, (i32)scratch_lock, (u32)-1)) + { + BeamformerRFBuffer *rf = ctx->rf_buffer; + BeamformerParameterBlock *b = beamformer_parameter_block(sm, (u32)(rf_block_rf_size >> 32ULL)); + BeamformerParameters *bp = &b->parameters; + BeamformerDataKind data_kind = b->pipeline.data_kind; - if (atomic_load_u32(sm->locks + upload_lock) && - (scratch_rf_size = atomic_swap_u32(&sm->rf_meta.size, 0)) && - os_shared_memory_region_lock(ctx->shared_memory, sm->locks, (i32)scratch_lock, (u32)-1)) - { - lctx->nvidia = ctx->gl->vendor_id == GLVendor_NVIDIA; - BeamformerRFBuffer *rf = ctx->rf_buffer; - rf->active_rf_size = (u32)round_up_to(scratch_rf_size, 64); - if (rf->size < rf->active_rf_size) - beamformer_rf_buffer_allocate(rf, rf->active_rf_size, lctx->nvidia); - - insertion_slot = rf->insertion_index++ % countof(rf->compute_syncs); - - /* NOTE(rnp): if the rest of the code is functioning then the first - * time the compute thread processes an upload it must have gone - * through this 
path. therefore it is safe to spin until it gets processed */ - spin_wait(atomic_load_u64(rf->upload_syncs + insertion_slot)); - - if (atomic_load_u64(rf->compute_syncs + insertion_slot)) { - GLenum sync_result = glClientWaitSync(rf->compute_syncs[insertion_slot], 0, 1000000000); - if (sync_result == GL_TIMEOUT_EXPIRED || sync_result == GL_WAIT_FAILED) { - // TODO(rnp): what do? - } - glDeleteSync(rf->compute_syncs[insertion_slot]); - } + b32 nvidia = ctx->gl->vendor_id == GLVendor_NVIDIA; - lctx->offset = insertion_slot * rf->active_rf_size; - lctx->buffer = lctx->nvidia? rf->ssbo : (uptr)rf->buffer; - lctx->data = beamformer_shared_memory_scratch_arena(sm).beg; + rf->active_rf_size = (u32)round_up_to(rf_block_rf_size & 0xFFFFFFFFULL, 64); + if (rf->size < rf->active_rf_size) + beamformer_rf_buffer_allocate(rf, rf->active_rf_size, nvidia); - BeamformerParameterBlock *b = beamformer_parameter_block(sm, atomic_load_u32(&sm->rf_meta.block)); - BeamformerParameters *bp = &b->parameters; - BeamformerDataKind data_kind = b->pipeline.data_kind; + u32 slot = rf->insertion_index++ % countof(rf->compute_syncs); - u32 size = bp->acquisition_count * bp->sample_count * beamformer_data_kind_byte_size[data_kind]; - lctx->channel_count = bp->channel_count; - lctx->channel_stride_bytes = size; - } - } - lane_sync_u64(&lctx, 0); - - if (lctx->buffer) { - RangeU64 range = lane_range(lctx->channel_count); - if (lctx->nvidia) { - i64 offset = (i64)(lctx->offset + range.start * lctx->channel_stride_bytes); - i32 size = (i32)(lctx->channel_stride_bytes * (range.stop - range.start)); - u8 *in = lctx->data + range.start * lctx->channel_stride_bytes; - glNamedBufferSubData((u32)lctx->buffer, offset, size, in); - } else { - for (u64 channel = range.start; channel < range.stop; channel++) { - u8 *in = lctx->data + channel * lctx->channel_stride_bytes; - u8 *out = (u8 *)lctx->buffer + lctx->offset + channel * lctx->channel_stride_bytes; - mem_copy(out, in, lctx->channel_stride_bytes); + /* 
NOTE(rnp): if the rest of the code is functioning then the first + * time the compute thread processes an upload it must have gone + * through this path. therefore it is safe to spin until it gets processed */ + spin_wait(atomic_load_u64(rf->upload_syncs + slot)); + + if (atomic_load_u64(rf->compute_syncs + slot)) { + GLenum sync_result = glClientWaitSync(rf->compute_syncs[slot], 0, 1000000000); + if (sync_result == GL_TIMEOUT_EXPIRED || sync_result == GL_WAIT_FAILED) { + // TODO(rnp): what do? } + glDeleteSync(rf->compute_syncs[slot]); } - } - lane_sync(); - if (lctx->buffer && lane_index() == 0) { + u32 size = bp->channel_count * bp->acquisition_count * bp->sample_count * beamformer_data_kind_byte_size[data_kind]; + u8 *data = beamformer_shared_memory_scratch_arena(sm).beg; + + if (nvidia) glNamedBufferSubData(rf->ssbo, slot * rf->active_rf_size, (i32)size, data); + else mem_copy(rf->buffer + slot * rf->active_rf_size, data, size); + os_shared_memory_region_unlock(ctx->shared_memory, sm->locks, (i32)scratch_lock); post_sync_barrier(ctx->shared_memory, upload_lock, sm->locks); - BeamformerRFBuffer *rf = ctx->rf_buffer; - if (!lctx->nvidia) - glFlushMappedNamedBufferRange(rf->ssbo, insertion_slot * rf->active_rf_size, (i32)rf->active_rf_size); + if (!nvidia) + glFlushMappedNamedBufferRange(rf->ssbo, slot * rf->active_rf_size, (i32)rf->active_rf_size); - atomic_store_u64(rf->upload_syncs + insertion_slot, glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0)); - atomic_store_u64(rf->compute_syncs + insertion_slot, 0); + atomic_store_u64(rf->upload_syncs + slot, glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0)); + atomic_store_u64(rf->compute_syncs + slot, 0); os_wake_waiters(ctx->compute_worker_sync); diff --git a/beamformer_shared_memory.c b/beamformer_shared_memory.c @@ -1,5 +1,5 @@ /* See LICENSE for license details. 
*/ -#define BEAMFORMER_SHARED_MEMORY_VERSION (22UL) +#define BEAMFORMER_SHARED_MEMORY_VERSION (23UL) typedef struct BeamformerFrame BeamformerFrame; @@ -154,7 +154,7 @@ typedef struct { u32 reserved_parameter_blocks; /* TODO(rnp): this is really sucky. we need a better way to communicate this */ - alignas(8) union { struct {u32 block, size;}; u64 U64; } rf_meta; + u64 rf_block_rf_size; BeamformerLiveImagingParameters live_imaging_parameters; BeamformerLiveImagingDirtyFlags live_imaging_dirty_flags; diff --git a/helpers/ogl_beamformer_lib.c b/helpers/ogl_beamformer_lib.c @@ -438,8 +438,8 @@ beamformer_push_data_base(void *data, u32 data_size, i32 timeout_ms, u32 block) lib_release_lock(BeamformerSharedMemoryLockKind_ScratchSpace); /* TODO(rnp): need a better way to communicate this */ - typeof(g_beamformer_library_context.bp->rf_meta) meta = {.block = block, .size = size}; - atomic_store_u64(&g_beamformer_library_context.bp->rf_meta.U64, meta.U64); + u64 rf_block_rf_size = (u64)block << 32ULL | (u64)size; + atomic_store_u64(&g_beamformer_library_context.bp->rf_block_rf_size, rf_block_rf_size); result = 1; } } diff --git a/static.c b/static.c @@ -309,32 +309,6 @@ function OS_THREAD_ENTRY_POINT_FN(compute_worker_thread_entry_point) function OS_THREAD_ENTRY_POINT_FN(beamformer_upload_entry_point) { - GLWorkerThreadContext *gl_thread_context = 0; - BeamformerUploadThreadContext *up = 0; - { - ThreadContext *ctx = (ThreadContext *)_ctx; - lane_context(ctx); - - if (lane_index() == 0) { - gl_thread_context = (GLWorkerThreadContext *)ctx->lane_context.broadcast_memory[0]; - up = (BeamformerUploadThreadContext *)gl_thread_context->user_context; - } - } - - for (;;) { - if (lane_index() == 0) - worker_thread_sleep(gl_thread_context, up->shared_memory->region); - - lane_sync(); - - beamformer_rf_upload(up); - } - - unreachable(); -} - -function OS_THREAD_ENTRY_POINT_FN(upload_worker_thread_entry_point) -{ GLWorkerThreadContext *ctx = (GLWorkerThreadContext *)_ctx; 
glfwMakeContextCurrent(ctx->window_handle); ctx->gl_context = os_get_native_gl_context(ctx->window_handle); @@ -344,41 +318,11 @@ function OS_THREAD_ENTRY_POINT_FN(upload_worker_thread_entry_point) /* NOTE(rnp): start this here so we don't have to worry about it being started or not */ glQueryCounter(up->rf_buffer->data_timestamp_query, GL_TIMESTAMP); - u64 lane_broadcast_value = 0; - - ThreadContext *threads; - { - u32 main_threads = 3 - 1; - u32 async_threads_count = os_get_system_info()->logical_processor_count; - u32 main_threads_clamped = MIN(async_threads_count, main_threads); - async_threads_count -= main_threads_clamped; - - /* NOTE(rnp): always memory bound right now so more threads don't help anything */ - async_threads_count = 1; - - //Barrier barrier = os_barrier_alloc(async_threads_count); - Barrier barrier = {0}; - threads = push_array(&ctx->arena, ThreadContext, (iz)async_threads_count); - - for (u64 index = 0; index < async_threads_count; index++) { - Stream name = stream_from_buffer(threads[index].name, countof(threads[index].name)); - stream_append_s8(&name, s8("[upload_")); - stream_append_u64(&name, index); - stream_append_s8(&name, s8("]")); - threads[index].lane_context.index = index; - threads[index].lane_context.count = async_threads_count; - threads[index].lane_context.barrier = barrier; - threads[index].lane_context.broadcast_memory = &lane_broadcast_value; - if (index != 0) { - iptr thread = os_create_thread((iptr)(threads + index), beamformer_upload_entry_point); - os_set_thread_name(thread, stream_to_s8(&name)); - } - } + for (;;) { + worker_thread_sleep(ctx, up->shared_memory->region); + beamformer_rf_upload(up); } - threads[0].lane_context.broadcast_memory[0] = (u64)ctx; - beamformer_upload_entry_point((iptr)threads); - unreachable(); return 0; @@ -461,8 +405,8 @@ setup_beamformer(Arena *memory, BeamformerCtx **o_ctx, BeamformerInput **o_input upctx->compute_worker_sync = &ctx->compute_worker.sync_variable; upctx->gl = &ctx->gl; 
upload->window_handle = glfwCreateWindow(1, 1, "", 0, raylib_window_handle); - upload->handle = os_create_thread((iptr)upload, upload_worker_thread_entry_point); - os_set_thread_name(worker->handle, s8("[upload_0]")); + upload->handle = os_create_thread((iptr)upload, beamformer_upload_entry_point); + os_set_thread_name(worker->handle, s8("[upload]")); glfwMakeContextCurrent(raylib_window_handle);