Commit: 727b3a7ca14a3db09f2932015b3aa76813a646cf
Parent: b3b5d18cd54220d06653c169004923ed7a343949
Author: Randy Palamar
Date: Tue, 29 Jul 2025 21:58:05 -0600
core: asynchronous RF upload thread
This significantly improves performance by overlapping data
uploads with compute. I'm not 100% sure about my AMD card, but
most modern NVIDIA cards have an asynchronous upload queue that
can run at the same time as the compute queue is being processed.

Apparently the only reliable way in OpenGL to get the driver to
use that upload queue is to have a separate context that only
ever issues memory-related commands, never drawing commands.

With careful use of fences and a persistently mapped, oversized
SSBO we can keep the compute thread processing one frame while
the next RF frame is being uploaded.

This increases the RF upload throughput on my weak desktop GPU
past what we used to get on the RTX 4090 before we cleaned up
the MATLAB side. This should effectively remove all
synchronization stalls, but it still needs to be profiled.
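In miniature, the pattern is the following (a hedged sketch, not
the committed code: slot, rf_ssbo, rf_size, and rf_data are
placeholders, and the actual handoff of the sync objects between
threads is in beamformer.c below):

    /* upload context: only ever issues memory commands */
    u8 *dst = glMapNamedBufferRange(rf_ssbo, slot * rf_size, rf_size,
                                    GL_MAP_WRITE_BIT|GL_MAP_FLUSH_EXPLICIT_BIT|GL_MAP_UNSYNCHRONIZED_BIT);
    mem_copy(dst, rf_data, rf_size);
    glFlushMappedNamedBufferRange(rf_ssbo, 0, rf_size); /* offset is relative to the mapped range */
    glUnmapNamedBuffer(rf_ssbo);
    GLsync upload_done = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);

    /* compute context: queue a GPU-side wait; the CPU never blocks here */
    glWaitSync(upload_done, 0, GL_TIMEOUT_IGNORED);
    glDeleteSync(upload_done);
    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, rf_ssbo, slot * rf_size, rf_size);
    /* ... dispatch the first compute stage ... */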
Diffstat:
11 files changed, 337 insertions(+), 135 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -5,15 +5,8 @@
* - this will also flip the current hack to support demodulate after decode to
* being a hack to support CudaHilbert after decode
* [ ]: filter sampling frequency should be a filter creation parameter
- * [ ]: reinvestigate ring buffer raw_data_ssbo
- * - to minimize latency the main thread should manage the subbuffer upload so that the
- * compute thread can just keep computing. This way we can keep the copmute thread busy
- * with work while we image.
- * - In particular we will potentially need multiple GPUComputeContexts so that we
- * can overwrite one while the other is in use.
- * - make use of glFenceSync to guard buffer uploads
+ * [ ]: measure performance of doing channel mapping in a separate shader
* [ ]: BeamformWorkQueue -> BeamformerWorkQueue
- * [ ]: bug: re-beamform on shader reload
* [ ]: need to keep track of gpu memory in some way
* - want to be able to store more than 16 2D frames but limit 3D frames
* - maybe keep track of how much gpu memory is committed for beamformed images
@@ -164,12 +157,6 @@ alloc_shader_storage(BeamformerCtx *ctx, u32 rf_raw_size, Arena a)
glDeleteBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
glCreateBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
- u32 storage_flags = GL_DYNAMIC_STORAGE_BIT;
- glDeleteBuffers(1, &cs->raw_data_ssbo);
- glCreateBuffers(1, &cs->raw_data_ssbo);
- glNamedBufferStorage(cs->raw_data_ssbo, rf_raw_size, 0, storage_flags);
- LABEL_GL_OBJECT(GL_BUFFER, cs->raw_data_ssbo, s8("Raw_RF_SSBO"));
-
uz rf_decoded_size = 2 * sizeof(f32) * cs->dec_data_dim.x * cs->dec_data_dim.y * cs->dec_data_dim.z;
Stream label = arena_stream(a);
stream_append_s8(&label, s8("Decoded_RF_SSBO_"));
@@ -182,7 +169,8 @@ alloc_shader_storage(BeamformerCtx *ctx, u32 rf_raw_size, Arena a)
}
/* NOTE(rnp): these are stubs when CUDA isn't supported */
- cs->cuda_lib.register_buffers(cs->rf_data_ssbos, countof(cs->rf_data_ssbos), cs->raw_data_ssbo);
+ /* TODO(rnp): cuda should know that there is more than one raw rf ssbo */
+ cs->cuda_lib.register_buffers(cs->rf_data_ssbos, countof(cs->rf_data_ssbos), cs->rf_buffer.ssbo);
cs->cuda_lib.init(bp->rf_raw_dim, bp->dec_data_dim);
i32 order = (i32)cs->dec_data_dim.z;
@@ -205,14 +193,14 @@ push_compute_timing_info(ComputeTimingTable *t, ComputeTimingInfo info)
}
function b32
-fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, BeamformerViewPlaneTag plane)
+fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, BeamformerViewPlaneTag plane, b32 indirect)
{
b32 result = 0;
if (work) {
result = 1;
u32 frame_id = atomic_add_u32(&ctx->next_render_frame_index, 1);
u32 frame_index = frame_id % countof(ctx->beamform_frames);
- work->kind = BeamformerWorkKind_Compute;
+ work->kind = indirect? BeamformerWorkKind_ComputeIndirect : BeamformerWorkKind_Compute;
work->lock = BeamformerSharedMemoryLockKind_DispatchCompute;
work->frame = ctx->beamform_frames + frame_index;
work->frame->ready_to_present = 0;
@@ -534,7 +522,6 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformerFrame *frame,
glBindImageTexture(0, csctx->hadamard_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I);
if (shader == cp->shaders[0]) {
- glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[input_ssbo_idx]);
glBindImageTexture(1, csctx->channel_mapping_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I);
glProgramUniform1ui(program, DECODE_FIRST_PASS_UNIFORM_LOC, 1);
@@ -566,10 +553,10 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformerFrame *frame,
case BeamformerShaderKind_DemodulateFloat:
{
BeamformerDemodulateUBO *ubo = &cp->demod_ubo_data;
- u32 input = ubo->map_channels ? csctx->raw_data_ssbo : csctx->rf_data_ssbos[input_ssbo_idx];
glBindBufferBase(GL_UNIFORM_BUFFER, 0, cp->ubos[BeamformerComputeUBOKind_Demodulate]);
- glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, input);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
+ if (!ubo->map_channels)
+ glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
glBindImageTexture(0, csctx->filters[sp->filter_slot].texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R32F);
if (ubo->map_channels)
@@ -877,9 +864,8 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co
default:{}break;
}
-
- if (success && ctx->latest_frame) {
- fill_frame_compute_work(ctx, work, ctx->latest_frame->view_plane_tag);
+ if (success && ctx->latest_frame && !sm->live_imaging_parameters.active) {
+ fill_frame_compute_work(ctx, work, ctx->latest_frame->view_plane_tag, 0);
can_commit = 0;
}
}break;
@@ -944,21 +930,6 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co
tex_format = GL_RED_INTEGER;
tex_element_count = countof(sm->sparse_elements);
}break;
- case BeamformerUploadKind_RFData:{
- if (cs->rf_raw_size != uc->size ||
- !uv4_equal(cs->dec_data_dim, uv4_from_u32_array(bp->dec_data_dim)))
- {
- alloc_shader_storage(ctx, uc->size, arena);
- }
- buffer = cs->raw_data_ssbo;
-
- ComputeTimingInfo info = {0};
- info.kind = ComputeTimingInfoKind_RF_Data;
- /* TODO(rnp): this could stall. what should we do about it? */
- glGetQueryObjectui64v(cs->rf_data_timestamp_query, GL_QUERY_RESULT, &info.timer_count);
- glQueryCounter(cs->rf_data_timestamp_query, GL_TIMESTAMP);
- push_compute_timing_info(ctx->compute_timing_table, info);
- }break;
InvalidDefaultCase;
}
@@ -972,12 +943,11 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co
(u8 *)sm + uc->shared_memory_offset);
}
- atomic_and_u32(&sm->dirty_regions, ~(sm->dirty_regions & 1 << (work->lock - 1)));
+ mark_shared_memory_region_clean(sm, (i32)work->lock);
os_shared_memory_region_unlock(&ctx->shared_memory, sm->locks, (i32)work->lock);
}break;
case BeamformerWorkKind_ComputeIndirect:{
- fill_frame_compute_work(ctx, work, work->compute_indirect_plane);
- DEBUG_DECL(work->kind = BeamformerWorkKind_ComputeIndirect;)
+ fill_frame_compute_work(ctx, work, work->compute_indirect_plane, 1);
} /* FALLTHROUGH */
case BeamformerWorkKind_Compute:{
DEBUG_DECL(glClearNamedBufferData(cs->rf_data_ssbos[0], GL_RG32F, GL_RG, GL_FLOAT, 0);)
@@ -991,6 +961,12 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co
u32 mask = (1 << (BeamformerSharedMemoryLockKind_Parameters - 1)) |
(1 << (BeamformerSharedMemoryLockKind_ComputePipeline - 1));
if (sm->dirty_regions & mask) {
+ if (cs->rf_raw_size != cs->rf_buffer.rf_size ||
+ !uv4_equal(cs->dec_data_dim, uv4_from_u32_array(bp->dec_data_dim)))
+ {
+ alloc_shader_storage(ctx, cs->rf_buffer.rf_size, arena);
+ }
+
plan_compute_pipeline(&ctx->shared_memory, cp, cs->filters);
atomic_store_u32(&ctx->ui_read_params, ctx->beamform_work_queue != q);
atomic_and_u32(&sm->dirty_regions, ~mask);
@@ -1023,8 +999,40 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co
frame->das_shader_kind = bp->das_shader_id;
frame->compound_count = bp->dec_data_dim[2];
+	/* NOTE(rnp): the first stage requires direct access to the raw data buffer so we break
+	 * it out into a separate step. This way the data can get released as soon as possible */
+ if (cp->shader_count > 0) {
+ BeamformerRFBuffer *rf = &cs->rf_buffer;
+ u32 slot = rf->compute_index % countof(rf->compute_syncs);
+
+		/* NOTE(rnp): compute indirect is used when uploading data. in this case the thread
+		 * must wait on an upload fence; if the fence doesn't exist yet, spin until the
+		 * upload thread publishes it */
+ if (work->kind == BeamformerWorkKind_ComputeIndirect)
+ spin_wait(!atomic_load_u64(rf->upload_syncs + slot));
+
+ if (rf->upload_syncs[slot]) {
+ rf->compute_index++;
+ glWaitSync(rf->upload_syncs[slot], 0, GL_TIMEOUT_IGNORED);
+ glDeleteSync(rf->upload_syncs[slot]);
+ } else {
+ slot = (rf->compute_index - 1) % countof(rf->compute_syncs);
+ }
+
+ glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, rf->ssbo, slot * rf->rf_size, rf->rf_size);
+
+ glBeginQuery(GL_TIME_ELAPSED, cs->shader_timer_ids[0]);
+ do_compute_shader(ctx, arena, frame, cp->shaders[0], cp->shader_parameters + 0);
+ glEndQuery(GL_TIME_ELAPSED);
+
+ if (work->kind == BeamformerWorkKind_ComputeIndirect) {
+ rf->compute_syncs[slot] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+ rf->upload_syncs[slot] = 0;
+ memory_write_barrier();
+ }
+ }
+
b32 did_sum_shader = 0;
- for (i32 i = 0; i < cp->shader_count; i++) {
+ for (i32 i = 1; i < cp->shader_count; i++) {
did_sum_shader |= cp->shaders[i] == BeamformerShaderKind_Sum;
glBeginQuery(GL_TIME_ELAPSED, cs->shader_timer_ids[i]);
do_compute_shader(ctx, arena, frame, cp->shaders[i], cp->shader_parameters + i);
@@ -1158,10 +1166,6 @@ DEBUG_EXPORT BEAMFORMER_COMPUTE_SETUP_FN(beamformer_compute_setup)
LABEL_GL_OBJECT(GL_TEXTURE, cs->sparse_elements_texture, s8("Sparse_Elements"));
glCreateQueries(GL_TIME_ELAPSED, countof(cs->shader_timer_ids), cs->shader_timer_ids);
- glCreateQueries(GL_TIMESTAMP, 1, &cs->rf_data_timestamp_query);
-
- /* NOTE(rnp): start this here so we don't have to worry about it being started or not */
- glQueryCounter(cs->rf_data_timestamp_query, GL_TIMESTAMP);
}
DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute)
@@ -1172,6 +1176,75 @@ DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute)
complete_queue(ctx, ctx->beamform_work_queue, arena, gl_context);
}
+function void
+beamformer_rf_buffer_allocate(BeamformerRFBuffer *rf, u32 rf_size, Arena arena)
+{
+ glUnmapNamedBuffer(rf->ssbo);
+ glDeleteBuffers(1, &rf->ssbo);
+ glCreateBuffers(1, &rf->ssbo);
+
+ rf_size = (u32)round_up_to((iz)rf_size, 64);
+ glNamedBufferStorage(rf->ssbo, countof(rf->compute_syncs) * rf_size, 0,
+ GL_DYNAMIC_STORAGE_BIT|GL_MAP_WRITE_BIT);
+ LABEL_GL_OBJECT(GL_BUFFER, rf->ssbo, s8("Raw_RF_SSBO"));
+ rf->rf_size = rf_size;
+}
+
+DEBUG_EXPORT BEAMFORMER_RF_UPLOAD_FN(beamformer_rf_upload)
+{
+ BeamformerSharedMemory *sm = ctx->shared_memory->region;
+
+ BeamformerSharedMemoryLockKind scratch_lock = BeamformerSharedMemoryLockKind_ScratchSpace;
+ BeamformerSharedMemoryLockKind upload_lock = BeamformerSharedMemoryLockKind_UploadRF;
+ if (sm->locks[upload_lock] &&
+ os_shared_memory_region_lock(ctx->shared_memory, sm->locks, (i32)scratch_lock, (u32)-1))
+ {
+ BeamformerRFBuffer *rf = ctx->rf_buffer;
+ if (rf->rf_size < sm->scratch_rf_size)
+ beamformer_rf_buffer_allocate(rf, sm->scratch_rf_size, arena);
+
+ u32 slot = rf->insertion_index++ % countof(rf->compute_syncs);
+
+		/* NOTE(rnp): if the rest of the code is functioning then the first
+		 * time the compute thread processes an upload it must have gone
+		 * through this path. therefore it is safe to spin here until the
+		 * compute thread consumes the pending upload */
+ spin_wait(atomic_load_u64(rf->upload_syncs + slot));
+
+ if (rf->compute_syncs[slot]) {
+ GLenum sync_result = glClientWaitSync(rf->compute_syncs[slot], 0, 1000000000);
+ if (sync_result == GL_TIMEOUT_EXPIRED || sync_result == GL_WAIT_FAILED) {
+ // TODO(rnp): what do?
+ }
+ glDeleteSync(rf->compute_syncs[slot]);
+ }
+
+		/* NOTE(rnp): NVIDIA's drivers really don't play nice with persistent mapping,
+		 * at least when it is as big as this one wants to be. mapping and unmapping the
+		 * desired range each time doesn't seem to introduce any performance hit */
+ u32 access = GL_MAP_WRITE_BIT|GL_MAP_FLUSH_EXPLICIT_BIT|GL_MAP_UNSYNCHRONIZED_BIT;
+ u8 *buffer = glMapNamedBufferRange(rf->ssbo, slot * rf->rf_size, (i32)rf->rf_size, access);
+
+ mem_copy(buffer, (u8 *)sm + BEAMFORMER_SCRATCH_OFF, rf->rf_size);
+ mark_shared_memory_region_clean(sm, (i32)scratch_lock);
+ os_shared_memory_region_unlock(ctx->shared_memory, sm->locks, (i32)scratch_lock);
+ post_sync_barrier(ctx->shared_memory, upload_lock, sm->locks);
+
+ glFlushMappedNamedBufferRange(rf->ssbo, 0, (i32)rf->rf_size);
+ glUnmapNamedBuffer(rf->ssbo);
+
+ rf->upload_syncs[slot] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+ rf->compute_syncs[slot] = 0;
+ memory_write_barrier();
+
+ os_wake_waiters(ctx->compute_worker_sync);
+
+ ComputeTimingInfo info = {.kind = ComputeTimingInfoKind_RF_Data};
+ glGetQueryObjectui64v(rf->data_timestamp_query, GL_QUERY_RESULT, &info.timer_count);
+ glQueryCounter(rf->data_timestamp_query, GL_TIMESTAMP);
+ push_compute_timing_info(ctx->compute_timing_table, info);
+ }
+}
+
#include "ui.c"
DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
@@ -1192,16 +1265,8 @@ DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
}
BeamformerSharedMemory *sm = ctx->shared_memory.region;
- if (sm->locks[BeamformerSharedMemoryLockKind_DispatchCompute] && ctx->os.compute_worker.asleep) {
- if (sm->start_compute_from_main) {
- BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue);
- BeamformerViewPlaneTag tag = ctx->latest_frame ? ctx->latest_frame->view_plane_tag : 0;
- if (fill_frame_compute_work(ctx, work, tag))
- beamform_work_queue_push_commit(ctx->beamform_work_queue);
- atomic_store_u32(&sm->start_compute_from_main, 0);
- }
- os_wake_waiters(&ctx->os.compute_worker.sync_variable);
- }
+ if (sm->locks[BeamformerSharedMemoryLockKind_UploadRF] != 0)
+ os_wake_waiters(&ctx->os.upload_worker.sync_variable);
BeamformerFrame *frame = ctx->latest_frame;
BeamformerViewPlaneTag tag = frame? frame->view_plane_tag : 0;
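For reference, the per-slot handshake between the two threads
(a summary sketch; names mirror BeamformerRFBuffer, with
MAX_RAW_DATA_FRAMES_IN_FLIGHT = 3):

    /* upload thread (beamformer_rf_upload)     compute thread (complete_queue)
     *   slot = insertion_index++ % 3             slot = compute_index % 3
     *   spin while upload_syncs[slot] != 0       spin while upload_syncs[slot] == 0 (indirect only)
     *   glClientWaitSync(compute_syncs[slot])    glWaitSync(upload_syncs[slot]) -- GPU-side wait
     *   write the slot, then publish:            run stage 0 on the slot, then publish:
     *     upload_syncs[slot]  = glFenceSync()      compute_syncs[slot] = glFenceSync()
     *     compute_syncs[slot] = 0                  upload_syncs[slot]  = 0
     *
     * a non-zero fence doubles as the ownership flag for its slot, so each
     * side writes its own fence before clearing the other's (hence the
     * memory_write_barrier() calls) */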
diff --git a/beamformer.h b/beamformer.h
@@ -174,19 +174,33 @@ typedef struct {
#undef X
} BeamformerComputePipeline;
+#define MAX_RAW_DATA_FRAMES_IN_FLIGHT 3
+typedef struct {
+ GLsync upload_syncs[MAX_RAW_DATA_FRAMES_IN_FLIGHT];
+ GLsync compute_syncs[MAX_RAW_DATA_FRAMES_IN_FLIGHT];
+
+ u32 ssbo;
+ u32 rf_size;
+
+ u32 data_timestamp_query;
+
+ u32 insertion_index;
+ u32 compute_index;
+} BeamformerRFBuffer;
+
typedef struct {
u32 programs[BeamformerShaderKind_ComputeCount];
BeamformerComputePipeline compute_pipeline;
BeamformerFilter filters[BEAMFORMER_FILTER_SLOTS];
+ BeamformerRFBuffer rf_buffer;
+
/* NOTE: Decoded data is only relevant in the context of a single frame. We use two
* buffers so that they can be swapped when chaining multiple compute stages */
u32 rf_data_ssbos[2];
u32 last_output_ssbo_index;
- u32 raw_data_ssbo;
-
u32 channel_mapping_texture;
u32 sparse_elements_texture;
u32 focal_vectors_texture;
@@ -198,8 +212,6 @@ typedef struct {
f32 processing_progress;
b32 processing_compute;
- u32 rf_data_timestamp_query;
-
u32 shader_timer_ids[MAX_COMPUTE_SHADER_STAGES];
BeamformerRenderModel unit_cube_model;
@@ -247,6 +259,13 @@ typedef struct {
ComputeTimingInfo buffer[4096];
} ComputeTimingTable;
+typedef struct {
+ BeamformerRFBuffer *rf_buffer;
+ SharedMemoryRegion *shared_memory;
+ ComputeTimingTable *compute_timing_table;
+ i32 *compute_worker_sync;
+} BeamformerUploadThreadContext;
+
struct BeamformerFrame {
u32 texture;
b32 ready_to_present;
@@ -271,6 +290,7 @@ struct BeamformerFrame {
#define GL_PARAMETERS \
X(MAJOR_VERSION, version_major, "") \
X(MINOR_VERSION, version_minor, "") \
+ X(MIN_MAP_BUFFER_ALIGNMENT, min_map_buffer_alignment, "") \
X(TEXTURE_BUFFER_OFFSET_ALIGNMENT, texture_buffer_offset_alignment, "") \
X(MAX_TEXTURE_BUFFER_SIZE, max_texture_buffer_size, "") \
X(MAX_TEXTURE_SIZE, max_2d_texture_dim, "") \
@@ -344,6 +364,9 @@ typedef BEAMFORMER_COMPUTE_SETUP_FN(beamformer_compute_setup_fn);
#define BEAMFORMER_COMPLETE_COMPUTE_FN(name) void name(iptr user_context, Arena arena, iptr gl_context)
typedef BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute_fn);
+#define BEAMFORMER_RF_UPLOAD_FN(name) void name(BeamformerUploadThreadContext *ctx, Arena arena)
+typedef BEAMFORMER_RF_UPLOAD_FN(beamformer_rf_upload_fn);
+
#define BEAMFORMER_RELOAD_SHADER_FN(name) b32 name(OS *os, BeamformerCtx *ctx, \
ShaderReloadContext *src, Arena arena, s8 shader_name)
typedef BEAMFORMER_RELOAD_SHADER_FN(beamformer_reload_shader_fn);
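The backing store for BeamformerRFBuffer is a single allocation
covering every slot. As a worked example with a hypothetical
frame of 2048 samples x 256 channels x 128 acquisitions of i16
data:

    rf_size   = 2048 * 256 * 128 * sizeof(i16);           /* 128 MB, already 64-byte aligned */
    ssbo_size = MAX_RAW_DATA_FRAMES_IN_FLIGHT * rf_size;  /* 384 MB */

(beamformer_rf_buffer_allocate in beamformer.c rounds rf_size up
to a 64-byte multiple before multiplying.)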
diff --git a/beamformer_work_queue.c b/beamformer_work_queue.c
@@ -52,6 +52,25 @@ DEBUG_EXPORT BEAMFORM_WORK_QUEUE_PUSH_COMMIT_FN(beamform_work_queue_push_commit)
}
function void
+mark_shared_memory_region_dirty(BeamformerSharedMemory *sm, i32 index)
+{
+ atomic_or_u32(&sm->dirty_regions, (1 << (index - 1)));
+}
+
+function void
+mark_shared_memory_region_clean(BeamformerSharedMemory *sm, i32 index)
+{
+ atomic_and_u32(&sm->dirty_regions, ~(1 << (index - 1)));
+}
+
+function b32
+is_shared_memory_region_dirty(BeamformerSharedMemory *sm, i32 index)
+{
+ b32 result = (atomic_load_u32(&sm->dirty_regions) & (1 << (index - 1))) != 0;
+ return result;
+}
+
+function void
post_sync_barrier(SharedMemoryRegion *sm, BeamformerSharedMemoryLockKind lock, i32 *locks)
{
/* NOTE(rnp): debug: here it is not a bug to release the lock if it
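Typical use of these helpers, as a hedged sketch (the writer side
mirrors locked_region_upload in the library; the reader side
mirrors the parameter handling in complete_queue):

    /* writer: while holding the region's lock */
    mem_copy(region, data, size);
    mark_shared_memory_region_dirty(sm, (i32)lock);

    /* reader: test, consume, then clear */
    if (is_shared_memory_region_dirty(sm, (i32)lock)) {
        /* e.g. re-plan the compute pipeline or reallocate storage */
        mark_shared_memory_region_clean(sm, (i32)lock);
    }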
diff --git a/beamformer_work_queue.h b/beamformer_work_queue.h
@@ -2,7 +2,7 @@
#ifndef _BEAMFORMER_WORK_QUEUE_H_
#define _BEAMFORMER_WORK_QUEUE_H_
-#define BEAMFORMER_SHARED_MEMORY_VERSION (10UL)
+#define BEAMFORMER_SHARED_MEMORY_VERSION (11UL)
typedef struct BeamformerFrame BeamformerFrame;
typedef struct ShaderReloadContext ShaderReloadContext;
@@ -19,7 +19,6 @@ typedef enum {
typedef enum {
BeamformerUploadKind_ChannelMapping,
BeamformerUploadKind_FocalVectors,
- BeamformerUploadKind_RFData,
BeamformerUploadKind_SparseElements,
} BeamformerUploadKind;
@@ -64,6 +63,7 @@ typedef union {
X(Parameters) \
X(ScratchSpace) \
X(SparseElements) \
+ X(UploadRF) \
X(ExportSync) \
X(DispatchCompute)
@@ -73,6 +73,8 @@ typedef enum {BEAMFORMER_SHARED_MEMORY_LOCKS BeamformerSharedMemoryLockKind_Coun
/* NOTE: discriminated union based on type */
typedef struct {
+ BeamformerWorkKind kind;
+ BeamformerSharedMemoryLockKind lock;
union {
BeamformerFrame *frame;
BeamformerCreateFilterContext create_filter_context;
@@ -82,8 +84,6 @@ typedef struct {
ShaderReloadContext *shader_reload_context;
void *generic;
};
- BeamformerSharedMemoryLockKind lock;
- BeamformerWorkKind kind;
} BeamformWork;
typedef struct {
@@ -147,11 +147,8 @@ typedef struct {
i32 shader_count;
BeamformerDataKind data_kind;
- /* TODO(rnp): hack: we need a different way of dispatching work for export */
- b32 start_compute_from_main;
-
- /* TODO(rnp): this shouldn't be needed */
- b32 export_next_frame;
+ /* TODO(rnp): this is really sucky. we need a better way to communicate this */
+ u32 scratch_rf_size;
BeamformerLiveImagingParameters live_imaging_parameters;
BeamformerLiveImagingDirtyFlags live_imaging_dirty_flags;
diff --git a/helpers/ogl_beamformer_lib.c b/helpers/ogl_beamformer_lib.c
@@ -245,19 +245,46 @@ beamformer_create_kaiser_low_pass_filter(f32 beta, f32 cutoff_frequency, i16 len
return result;
}
+function b32
+beamformer_flush_commands(i32 timeout_ms)
+{
+ b32 result = lib_try_lock(BeamformerSharedMemoryLockKind_DispatchCompute, timeout_ms);
+ return result;
+}
+
+function b32
+beamformer_compute_indirect(BeamformerViewPlaneTag tag)
+{
+ b32 result = check_shared_memory();
+ if (result) {
+ result = tag < BeamformerViewPlaneTag_Count;
+ if (result) {
+ BeamformWork *work = try_push_work_queue();
+ result = work != 0;
+ if (result) {
+ work->kind = BeamformerWorkKind_ComputeIndirect;
+ work->compute_indirect_plane = tag;
+ beamform_work_queue_push_commit(&g_beamformer_library_context.bp->external_work_queue);
+ beamformer_flush_commands(0);
+ }
+ } else {
+ g_beamformer_library_context.last_error = BF_LIB_ERR_KIND_INVALID_IMAGE_PLANE;
+ }
+ }
+ return result;
+}
+
b32
-beamformer_start_compute(i32 timeout_ms)
+beamformer_start_compute(void)
{
- u32 lock = BeamformerSharedMemoryLockKind_DispatchCompute;
- b32 result = check_shared_memory() && lib_try_lock(lock, timeout_ms);
+ b32 result = beamformer_compute_indirect(0);
return result;
}
b32
beamformer_wait_for_compute_dispatch(i32 timeout_ms)
{
- u32 lock = BeamformerSharedMemoryLockKind_DispatchCompute;
- b32 result = check_shared_memory() && lib_try_lock(lock, timeout_ms);
+ b32 result = beamformer_flush_commands(timeout_ms);
/* NOTE(rnp): if you are calling this function you are probably about
* to start some other work and it might be better to not do this... */
if (result) lib_release_lock(BeamformerSharedMemoryLockKind_DispatchCompute);
@@ -270,10 +297,9 @@ locked_region_upload(void *region, void *data, u32 size, BeamformerSharedMemoryL
{
b32 result = lib_try_lock(lock, timeout_ms);
if (result) {
- if (dirty)
- *dirty = atomic_load_u32(&g_beamformer_library_context.bp->dirty_regions) & (1 << (lock - 1));
+ if (dirty) *dirty = is_shared_memory_region_dirty(g_beamformer_library_context.bp, (i32)lock);
mem_copy(region, data, size);
- atomic_or_u32(&g_beamformer_library_context.bp->dirty_regions, (1 << (lock - 1)));
+ mark_shared_memory_region_dirty(g_beamformer_library_context.bp, (i32)lock);
lib_release_lock(lock);
}
return result;
@@ -324,15 +350,17 @@ BEAMFORMER_UPLOAD_FNS
#undef X
function b32
-beamformer_push_data_base(void *data, u32 data_size, i32 timeout_ms, b32 start_from_main)
+beamformer_push_data_base(void *data, u32 data_size, i32 timeout_ms)
{
b32 result = 0;
if (data_size <= BEAMFORMER_MAX_RF_DATA_SIZE) {
- result = beamformer_upload_buffer(data, data_size, BEAMFORMER_SCRATCH_OFF,
- BeamformerUploadKind_RFData,
- BeamformerSharedMemoryLockKind_ScratchSpace, timeout_ms);
- if (result && start_from_main)
- atomic_store_u32(&g_beamformer_library_context.bp->start_compute_from_main, 1);
+ if (lib_try_lock(BeamformerSharedMemoryLockKind_UploadRF, timeout_ms)) {
+ result = locked_region_upload((u8 *)g_beamformer_library_context.bp + BEAMFORMER_SCRATCH_OFF,
+ data, data_size, BeamformerSharedMemoryLockKind_ScratchSpace,
+ 0, 0);
+ /* TODO(rnp): need a better way to communicate this */
+ if (result) g_beamformer_library_context.bp->scratch_rf_size = data_size;
+ }
} else {
g_beamformer_library_context.last_error = BF_LIB_ERR_KIND_BUFFER_OVERFLOW;
}
@@ -342,27 +370,14 @@ beamformer_push_data_base(void *data, u32 data_size, i32 timeout_ms, b32 start_f
b32
beamformer_push_data(void *data, u32 data_size)
{
- return beamformer_push_data_base(data, data_size, g_beamformer_library_context.timeout_ms, 1);
+ return beamformer_push_data_base(data, data_size, g_beamformer_library_context.timeout_ms);
}
b32
beamformer_push_data_with_compute(void *data, u32 data_size, u32 image_plane_tag)
{
- b32 result = beamformer_push_data_base(data, data_size, g_beamformer_library_context.timeout_ms, 0);
- if (result) {
- result = image_plane_tag < BeamformerViewPlaneTag_Count;
- if (result) {
- BeamformWork *work = try_push_work_queue();
- if (work) {
- work->kind = BeamformerWorkKind_ComputeIndirect;
- work->compute_indirect_plane = image_plane_tag;
- beamform_work_queue_push_commit(&g_beamformer_library_context.bp->external_work_queue);
- result = beamformer_start_compute(0);
- }
- } else {
- g_beamformer_library_context.last_error = BF_LIB_ERR_KIND_INVALID_IMAGE_PLANE;
- }
- }
+ b32 result = beamformer_push_data_base(data, data_size, g_beamformer_library_context.timeout_ms);
+ if (result) result = beamformer_compute_indirect(image_plane_tag);
return result;
}
@@ -448,8 +463,7 @@ beamform_data_synchronized(void *data, u32 data_size, i32 output_points[3], f32
g_beamformer_library_context.bp->parameters.output_points[2] = output_points[2];
uz output_size = (u32)output_points[0] * (u32)output_points[1] * (u32)output_points[2] * sizeof(f32) * 2;
- if (output_size <= BEAMFORMER_SCRATCH_SIZE && beamformer_push_data_with_compute(data, data_size, 0))
- {
+ if (output_size <= BEAMFORMER_SCRATCH_SIZE && beamformer_push_data_with_compute(data, data_size, 0)) {
BeamformerExportContext export;
export.kind = BeamformerExportKind_BeamformedData;
export.size = (u32)output_size;
@@ -457,7 +471,7 @@ beamform_data_synchronized(void *data, u32 data_size, i32 output_points[3], f32
/* NOTE(rnp): if this fails it just means that the work from push_data hasn't
* started yet. This is here to catch the other case where the work started
* and finished before we finished queuing the export work item */
- beamformer_start_compute(0);
+ beamformer_flush_commands(0);
result = beamformer_read_output(out_data, output_size, timeout_ms);
}
@@ -477,7 +491,7 @@ beamformer_compute_timings(BeamformerComputeStatsTable *output, i32 timeout_ms)
BeamformerExportContext export;
export.kind = BeamformerExportKind_Stats;
export.size = sizeof(*output);
- if (beamformer_export_buffer(export) && beamformer_start_compute(0))
+ if (beamformer_export_buffer(export) && beamformer_flush_commands(0))
result = beamformer_read_output(output, sizeof(*output), timeout_ms);
}
return result;
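With these changes a client drives live imaging roughly like this
(a sketch based on tests/throughput.c; rf_frames, frame_count, and
data_size are placeholders):

    beamformer_set_global_timeout(1000);
    for (u32 i = 0; i < frame_count; i++) {
        if (!beamformer_push_data_with_compute(rf_frames[i], data_size, BeamformerViewPlaneTag_XZ)) {
            printf("lib error: %s\n", beamformer_get_last_error_string());
            break;
        }
    }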
diff --git a/helpers/ogl_beamformer_lib_base.h b/helpers/ogl_beamformer_lib_base.h
@@ -51,8 +51,8 @@ LIB_FN uint32_t beamform_data_synchronized(void *data, uint32_t data_size, int32
/* NOTE: downloads the last 32 frames worth of compute timings into output */
LIB_FN uint32_t beamformer_compute_timings(BeamformerComputeStatsTable *output, int32_t timeout_ms);
-/* NOTE: tells the beamformer to start beamforming and waits until it starts or for timeout_ms */
-LIB_FN uint32_t beamformer_start_compute(int32_t timeout_ms);
+/* NOTE: tells the beamformer to start beamforming */
+LIB_FN uint32_t beamformer_start_compute(void);
/* NOTE: waits for previously queued beamform to start or for timeout_ms */
LIB_FN uint32_t beamformer_wait_for_compute_dispatch(int32_t timeout_ms);
diff --git a/opengl.h b/opengl.h
@@ -11,6 +11,9 @@
#include <GL/gl.h>
/* NOTE: do not add extra 0s to these, even at the start -> garbage compilers will complain */
+#define GL_MAP_WRITE_BIT 0x0002
+#define GL_MAP_FLUSH_EXPLICIT_BIT 0x0010
+#define GL_MAP_UNSYNCHRONIZED_BIT 0x0020
#define GL_DYNAMIC_STORAGE_BIT 0x0100
#define GL_SHADER_IMAGE_ACCESS_BARRIER_BIT 0x00000020
#define GL_TEXTURE_UPDATE_BARRIER_BIT 0x00000100
@@ -30,7 +33,6 @@
#define GL_RG32F 0x8230
#define GL_R8I 0x8231
#define GL_R16I 0x8233
-#define GL_DEBUG_OUTPUT_SYNCHRONOUS 0x8242
#define GL_MAX_COMPUTE_SHARED_MEMORY_SIZE 0x8262
#define GL_BUFFER 0x82E0
#define GL_PROGRAM 0x82E2
@@ -55,16 +57,24 @@
#define GL_RENDERBUFFER 0x8D41
#define GL_RED_INTEGER 0x8D94
#define GL_TIMESTAMP 0x8E28
+#define GL_MIN_MAP_BUFFER_ALIGNMENT 0x90BC
#define GL_SHADER_STORAGE_BUFFER 0x90D2
#define GL_MAX_SHADER_STORAGE_BLOCK_SIZE 0x90DE
#define GL_MAX_SERVER_WAIT_TIMEOUT 0x9111
+#define GL_SYNC_GPU_COMMANDS_COMPLETE 0x9117
+#define GL_TIMEOUT_EXPIRED 0x911B
+#define GL_WAIT_FAILED 0x911D
#define GL_TEXTURE_BUFFER_OFFSET_ALIGNMENT 0x919F
#define GL_COMPUTE_SHADER 0x91B9
+#define GL_DEBUG_OUTPUT 0x92E0
+
+#define GL_TIMEOUT_IGNORED 0xFFFFFFFFFFFFFFFFull
typedef char GLchar;
typedef ptrdiff_t GLsizeiptr;
typedef ptrdiff_t GLintptr;
typedef uint64_t GLuint64;
+typedef struct __GLsync *GLsync;
/* X(name, ret, params) */
#define OGLProcedureList \
@@ -80,6 +90,7 @@ typedef uint64_t GLuint64;
X(glClearNamedBufferData, void, (GLuint buffer, GLenum internalformat, GLenum format, GLenum type, const void *data)) \
X(glClearNamedFramebufferfv, void, (GLuint framebuffer, GLenum buffer, GLint drawbuffer, const GLfloat *value)) \
X(glClearTexImage, void, (GLuint texture, GLint level, GLenum format, GLenum type, const void *data)) \
+ X(glClientWaitSync, GLenum, (GLsync sync, GLbitfield flags, GLuint64 timeout)) \
X(glCompileShader, void, (GLuint shader)) \
X(glCopyImageSubData, void, (GLuint srcName, GLenum srcTarget, GLint srcLevel, GLint srcX, GLint srcY, GLint srcZ, GLuint dstName, GLenum dstTarget, GLint dstLevel, GLint dstX, GLint dstY, GLint dstZ, GLsizei srcWidth, GLsizei srcHeight, GLsizei srcDepth)) \
X(glCreateBuffers, void, (GLsizei n, GLuint *buffers)) \
@@ -94,9 +105,12 @@ typedef uint64_t GLuint64;
X(glDeleteBuffers, void, (GLsizei n, const GLuint *buffers)) \
X(glDeleteProgram, void, (GLuint program)) \
X(glDeleteShader, void, (GLuint shader)) \
+ X(glDeleteSync, void, (GLsync sync)) \
X(glDispatchCompute, void, (GLuint num_groups_x, GLuint num_groups_y, GLuint num_groups_z)) \
X(glEndQuery, void, (GLenum target)) \
X(glEnableVertexArrayAttrib, void, (GLuint vao, GLuint index)) \
+ X(glFenceSync, GLsync, (GLenum condition, GLbitfield flags)) \
+ X(glFlushMappedNamedBufferRange, void, (GLuint buffer, GLintptr offset, GLsizei length)) \
X(glGenerateTextureMipmap, void, (GLuint texture)) \
X(glGetProgramInfoLog, void, (GLuint program, GLsizei maxLength, GLsizei *length, GLchar *infoLog)) \
X(glGetProgramiv, void, (GLuint program, GLenum pname, GLint *params)) \
@@ -105,6 +119,7 @@ typedef uint64_t GLuint64;
X(glGetShaderiv, void, (GLuint shader, GLenum pname, GLint *params)) \
X(glGetTextureImage, void, (GLuint texture, GLint level, GLenum format, GLenum type, GLsizei bufSize, void *pixels)) \
X(glLinkProgram, void, (GLuint program)) \
+ X(glMapNamedBufferRange, void *, (GLuint buffer, GLintptr offset, GLsizei length, GLbitfield access)) \
X(glMemoryBarrier, void, (GLbitfield barriers)) \
X(glNamedBufferData, void, (GLuint buffer, GLsizeiptr size, const void *data, GLenum usage)) \
X(glNamedBufferStorage, void, (GLuint buffer, GLsizeiptr size, const void *data, GLbitfield flags)) \
@@ -129,11 +144,13 @@ typedef uint64_t GLuint64;
X(glTextureSubImage1D, void, (GLuint texture, GLint level, GLint xoff, GLsizei width, GLenum format, GLenum type, const void *pix)) \
X(glTextureSubImage2D, void, (GLuint texture, GLint level, GLint xoff, GLint yoff, GLsizei width, GLsizei height, GLenum format, GLenum type, const void *pix)) \
X(glTextureSubImage3D, void, (GLuint texture, GLint level, GLint xoff, GLint yoff, GLint zoff, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, const void *pix)) \
+ X(glUnmapNamedBuffer, GLboolean, (GLuint buffer)) \
X(glUseProgram, void, (GLuint program)) \
X(glVertexArrayAttribBinding, void, (GLuint vao, GLuint attribindex, GLuint bindingindex)) \
X(glVertexArrayAttribFormat, void, (GLuint vao, GLuint attribindex, GLint size, GLenum type, GLboolean normalized, GLuint relativeoffset)) \
X(glVertexArrayElementBuffer, void, (GLuint vao, GLuint buffer)) \
- X(glVertexArrayVertexBuffer, void, (GLuint vao, GLuint bindingindex, GLuint buffer, GLintptr offset, GLsizei stride))
+ X(glVertexArrayVertexBuffer, void, (GLuint vao, GLuint bindingindex, GLuint buffer, GLintptr offset, GLsizei stride)) \
+ X(glWaitSync, void, (GLsync sync, GLbitfield flags, GLuint64 timeout))
#define X(name, ret, params) typedef ret name##_fn params;
OGLProcedureList
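The X-macro list is expanded once for the typedefs (above) and,
presumably elsewhere in the loader, again for the function
pointers themselves, along these lines (a sketch;
os_gl_proc_address is a placeholder for the platform's
glXGetProcAddress/wglGetProcAddress wrapper):

    #define X(name, ret, params) global name##_fn *name;
    OGLProcedureList
    #undef X

    /* after a context is current: */
    #define X(name, ret, params) name = (name##_fn *)os_gl_proc_address(#name);
    OGLProcedureList
    #undef X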
diff --git a/static.c b/static.c
@@ -10,10 +10,11 @@ global void *debug_lib;
#define DEBUG_ENTRY_POINTS \
X(beamformer_debug_ui_deinit) \
- X(beamformer_frame_step) \
X(beamformer_complete_compute) \
X(beamformer_compute_setup) \
+ X(beamformer_frame_step) \
X(beamformer_reload_shader) \
+ X(beamformer_rf_upload) \
X(beamform_work_queue_push) \
X(beamform_work_queue_push_commit)
@@ -28,7 +29,8 @@ function FILE_WATCH_CALLBACK_FN(debug_reload)
/* NOTE(rnp): spin until compute thread finishes its work (we will probably
* never reload while compute is in progress but just in case). */
- while (!atomic_load_u32(&os->compute_worker.asleep));
+ spin_wait(!atomic_load_u32(&os->compute_worker.asleep));
+ spin_wait(!atomic_load_u32(&os->upload_worker.asleep));
os_unload_library(debug_lib);
debug_lib = os_load_library(OS_DEBUG_LIB_NAME, OS_DEBUG_LIB_TEMP_NAME, &err);
@@ -250,6 +252,20 @@ void glfwWindowHint(i32, i32);
iptr glfwCreateWindow(i32, i32, char *, iptr, iptr);
void glfwMakeContextCurrent(iptr);
+function void
+worker_thread_sleep(GLWorkerThreadContext *ctx)
+{
+ for (;;) {
+ i32 expected = 0;
+ if (atomic_cas_u32(&ctx->sync_variable, &expected, 1))
+ break;
+
+ atomic_store_u32(&ctx->asleep, 1);
+ os_wait_on_value(&ctx->sync_variable, 1, (u32)-1);
+ atomic_store_u32(&ctx->asleep, 0);
+ }
+}
+
function OS_THREAD_ENTRY_POINT_FN(compute_worker_thread_entry_point)
{
GLWorkerThreadContext *ctx = (GLWorkerThreadContext *)_ctx;
@@ -260,15 +276,7 @@ function OS_THREAD_ENTRY_POINT_FN(compute_worker_thread_entry_point)
beamformer_compute_setup(ctx->user_context);
for (;;) {
- for (;;) {
- i32 expected = 0;
- if (atomic_cas_u32(&ctx->sync_variable, &expected, 1))
- break;
-
- atomic_store_u32(&ctx->asleep, 1);
- os_wait_on_value(&ctx->sync_variable, 1, (u32)-1);
- atomic_store_u32(&ctx->asleep, 0);
- }
+ worker_thread_sleep(ctx);
asan_poison_region(ctx->arena.beg, ctx->arena.end - ctx->arena.beg);
beamformer_complete_compute(ctx->user_context, ctx->arena, ctx->gl_context);
}
@@ -278,10 +286,33 @@ function OS_THREAD_ENTRY_POINT_FN(compute_worker_thread_entry_point)
return 0;
}
+function OS_THREAD_ENTRY_POINT_FN(upload_worker_thread_entry_point)
+{
+ GLWorkerThreadContext *ctx = (GLWorkerThreadContext *)_ctx;
+ glfwMakeContextCurrent(ctx->window_handle);
+ ctx->gl_context = os_get_native_gl_context(ctx->window_handle);
+
+ BeamformerUploadThreadContext *up = (typeof(up))ctx->user_context;
+ glCreateQueries(GL_TIMESTAMP, 1, &up->rf_buffer->data_timestamp_query);
+ /* NOTE(rnp): start this here so we don't have to worry about it being started or not */
+ glQueryCounter(up->rf_buffer->data_timestamp_query, GL_TIMESTAMP);
+
+ for (;;) {
+ worker_thread_sleep(ctx);
+ asan_poison_region(ctx->arena.beg, ctx->arena.end - ctx->arena.beg);
+ beamformer_rf_upload(up, ctx->arena);
+ }
+
+ unreachable();
+
+ return 0;
+}
+
function void
setup_beamformer(Arena *memory, BeamformerCtx **o_ctx, BeamformerInput **o_input)
{
- Arena compute_arena = sub_arena(memory, MB(2), KB(4));
+ Arena compute_arena = sub_arena(memory, MB(2), KB(4));
+ Arena upload_arena = sub_arena(memory, KB(64), KB(4));
Stream error = stream_alloc(memory, MB(1));
Arena ui_arena = sub_arena(memory, MB(2), KB(4));
@@ -296,6 +327,8 @@ setup_beamformer(Arena *memory, BeamformerCtx **o_ctx, BeamformerInput **o_input
os_init(&ctx->os, memory);
ctx->os.compute_worker.arena = compute_arena;
ctx->os.compute_worker.asleep = 1;
+ ctx->os.upload_worker.arena = upload_arena;
+ ctx->os.upload_worker.asleep = 1;
debug_init(&ctx->os, (iptr)input, memory);
@@ -334,16 +367,28 @@ setup_beamformer(Arena *memory, BeamformerCtx **o_ctx, BeamformerInput **o_input
sm->shaders[1] = BeamformerShaderKind_DAS;
sm->shader_count = 2;
+ ComputeShaderCtx *cs = &ctx->csctx;
+
GLWorkerThreadContext *worker = &ctx->os.compute_worker;
/* TODO(rnp): we should lock this down after we have something working */
worker->user_context = (iptr)ctx;
- worker->window_handle = glfwCreateWindow(320, 240, "", 0, raylib_window_handle);
+ worker->window_handle = glfwCreateWindow(1, 1, "", 0, raylib_window_handle);
worker->handle = os_create_thread(*memory, (iptr)worker, s8("[compute]"),
compute_worker_thread_entry_point);
+ GLWorkerThreadContext *upload = &ctx->os.upload_worker;
+ BeamformerUploadThreadContext *upctx = push_struct(memory, typeof(*upctx));
+ upload->user_context = (iptr)upctx;
+ upctx->rf_buffer = &cs->rf_buffer;
+ upctx->shared_memory = &ctx->shared_memory;
+ upctx->compute_timing_table = ctx->compute_timing_table;
+ upctx->compute_worker_sync = &ctx->os.compute_worker.sync_variable;
+ upload->window_handle = glfwCreateWindow(1, 1, "", 0, raylib_window_handle);
+ upload->handle = os_create_thread(*memory, (iptr)upload, s8("[upload]"),
+ upload_worker_thread_entry_point);
+
glfwMakeContextCurrent(raylib_window_handle);
- ComputeShaderCtx *cs = &ctx->csctx;
if (ctx->gl.vendor_id == GL_VENDOR_NVIDIA
&& load_cuda_lib(&ctx->os, s8(OS_CUDA_LIB_NAME), (iptr)&cs->cuda_lib, *memory))
{
@@ -360,7 +405,7 @@ setup_beamformer(Arena *memory, BeamformerCtx **o_ctx, BeamformerInput **o_input
gl_debug_ctx->os_error_handle = ctx->os.error_handle;
glDebugMessageCallback(gl_debug_logger, gl_debug_ctx);
#ifdef _DEBUG
- glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS);
+ glEnable(GL_DEBUG_OUTPUT);
#endif
read_only local_persist s8 compute_headers[BeamformerShaderKind_ComputeCount] = {
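For context, worker_thread_sleep's counterpart, os_wake_waiters,
lives in the OS layer. The protocol as derived from the code above
(a summary, not the committed implementation):

    /* sync_variable: 0 = work posted, 1 = consumed/idle
     *   waker:  atomically store 0, then futex-wake anyone parked on the value
     *   worker: CAS 0 -> 1 claims the wakeup and runs one unit of work;
     *           on CAS failure it parks in os_wait_on_value(&sync_variable, 1, ...)
     *           until the waker's store changes the value */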
diff --git a/tests/throughput.c b/tests/throughput.c
@@ -290,10 +290,8 @@ decompress_data_at_work_index(Stream *path_base, u32 index)
function b32
send_frame(i16 *restrict i16_data, BeamformerParameters *restrict bp)
{
- b32 result = 0;
u32 data_size = bp->rf_raw_dim[0] * bp->rf_raw_dim[1] * sizeof(i16);
- if (beamformer_wait_for_compute_dispatch(10000))
- result = beamformer_push_data_with_compute(i16_data, data_size, BeamformerViewPlaneTag_XZ);
+ b32 result = beamformer_push_data_with_compute(i16_data, data_size, BeamformerViewPlaneTag_XZ);
if (!result && !g_should_exit) printf("lib error: %s\n", beamformer_get_last_error_string());
return result;
@@ -361,10 +359,15 @@ execute_study(s8 study, Arena arena, Stream path, Options *options)
beamformer_push_pipeline(shader_stages, shader_stage_count, BeamformerDataKind_Int16);
+ beamformer_set_global_timeout(1000);
+
stream_reset(&path, path_work_index);
i16 *data = decompress_data_at_work_index(&path, options->frame_number);
if (options->loop) {
+ BeamformerLiveImagingParameters lip = {.active = 1};
+ beamformer_set_live_parameters(&lip);
+
u32 frame = 0;
f32 times[32] = {0};
f32 data_size = (f32)(bp.rf_raw_dim[0] * bp.rf_raw_dim[1] * sizeof(*data));
@@ -383,10 +386,16 @@ execute_study(s8 study, Arena arena, Stream path, Options *options)
delta * 1e3, sum * 1e3, data_size / (sum * (GB(1))));
}
- times[frame & 31] = delta;
+ times[frame % countof(times)] = delta;
frame++;
}
+ i32 flag = beamformer_live_parameters_get_dirty_flag();
+ if (flag != -1 && (1 << flag) == BeamformerLiveImagingDirtyFlags_StopImaging)
+ break;
}
+
+ lip.active = 0;
+ beamformer_set_live_parameters(&lip);
} else {
send_frame(data, &bp);
}
diff --git a/ui.c b/ui.c
@@ -868,7 +868,7 @@ function void
resize_frame_view(BeamformerFrameView *view, iv2 dim, b32 depth)
{
glDeleteTextures(countof(view->textures), view->textures);
- glCreateTextures(GL_TEXTURE_2D, countof(view->textures) - !!depth, view->textures);
+ glCreateTextures(GL_TEXTURE_2D, depth ? countof(view->textures) : countof(view->textures) - 1, view->textures);
view->texture_dim = dim;
view->texture_mipmaps = (i32)ctz_u32((u32)MAX(dim.x, dim.y)) + 1;
@@ -2728,10 +2728,11 @@ draw_compute_stats_view(BeamformerUI *ui, Arena arena, Variable *view, Rect r, v
InvalidDefaultCase;
}
+ u32 rf_size = ui->beamformer_context->csctx.rf_buffer.rf_size;
push_table_time_row_with_fps(table, &arena, s8("Compute Total:"), compute_time_sum);
push_table_time_row_with_fps(table, &arena, s8("RF Upload Delta:"), stats->rf_time_delta_average);
- push_table_memory_size_row(table, &arena, s8("Input RF Size:"), ui->beamformer_context->csctx.rf_raw_size);
- if (ui->beamformer_context->csctx.rf_raw_size != cp->rf_size)
+ push_table_memory_size_row(table, &arena, s8("Input RF Size:"), rf_size);
+ if (rf_size != cp->rf_size)
push_table_memory_size_row(table, &arena, s8("DAS RF Size:"), cp->rf_size);
result = v2_add(result, table_extent(table, arena, text_spec.font));
@@ -3865,7 +3866,7 @@ draw_ui(BeamformerCtx *ctx, BeamformerInput *input, BeamformerFrame *frame_to_dr
/* TODO(rnp): there should be a better way of detecting this */
if (ctx->ui_read_params) {
- mem_copy(&ui->params, &sm->parameters.output_min_coordinate, sizeof(ui->params));
+ mem_copy(&ui->params, &sm->parameters_ui, sizeof(ui->params));
ui->flush_params = 0;
ctx->ui_read_params = 0;
}
@@ -3876,16 +3877,24 @@ draw_ui(BeamformerCtx *ctx, BeamformerInput *input, BeamformerFrame *frame_to_dr
ui_interact(ui, input, window_rect);
if (ui->flush_params) {
- i32 lock = BeamformerSharedMemoryLockKind_Parameters;
validate_ui_parameters(&ui->params);
+ i32 lock = BeamformerSharedMemoryLockKind_Parameters;
if (ctx->latest_frame && os_shared_memory_region_lock(&ctx->shared_memory, sm->locks, lock, 0)) {
mem_copy(&sm->parameters_ui, &ui->params, sizeof(ui->params));
ui->flush_params = 0;
- atomic_or_u32(&sm->dirty_regions, (1 << (lock - 1)));
- b32 dispatch = os_shared_memory_region_lock(&ctx->shared_memory, sm->locks,
- BeamformerSharedMemoryLockKind_DispatchCompute, 0);
- sm->start_compute_from_main |= dispatch & ctx->latest_frame->ready_to_present;
+ mark_shared_memory_region_dirty(sm, lock);
os_shared_memory_region_unlock(&ctx->shared_memory, sm->locks, lock);
+
+ BeamformerSharedMemoryLockKind dispatch_lock = BeamformerSharedMemoryLockKind_DispatchCompute;
+ if (!sm->live_imaging_parameters.active &&
+ os_shared_memory_region_lock(&ctx->shared_memory, sm->locks, (i32)dispatch_lock, 0))
+ {
+ BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue);
+ BeamformerViewPlaneTag tag = frame_to_draw ? frame_to_draw->view_plane_tag : 0;
+ if (fill_frame_compute_work(ctx, work, tag, 0))
+ beamform_work_queue_push_commit(ctx->beamform_work_queue);
+ }
+ os_wake_waiters(&ctx->os.compute_worker.sync_variable);
}
}
diff --git a/util.h b/util.h
@@ -87,6 +87,8 @@
#define EachEnumValue(type, it) (type it = (type)0; it < type##_Count; it = (type)(it + 1))
#define EachNonZeroEnumValue(type, it) (type it = (type)1; it < type##_Count; it = (type)(it + 1))
+#define spin_wait(c) while ((c))
+
/* NOTE(rnp): no guarantees about actually getting an element */
#define SLLPop(list) list; list = list ? list->next : 0
#define SLLPush(v, list) do { \
@@ -344,7 +346,9 @@ struct OS {
FileWatchContext file_watch_context;
iptr context;
iptr error_handle;
+
GLWorkerThreadContext compute_worker;
+ GLWorkerThreadContext upload_worker;
DEBUG_DECL(renderdoc_start_frame_capture_fn *start_frame_capture;)
DEBUG_DECL(renderdoc_end_frame_capture_fn *end_frame_capture;)