ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git

Commit: c0a61b78d170bfec84aa9052ddf4e2010ba79afb
Parent: 8e1243fc6309f0793b5c8d814ba79f6dffda8386
Author: Randy Palamar
Date:   Tue, 10 Jun 2025 10:54:28 -0600

core/lib: rework library syncing and locking

this removes the trivial ways of deadlocking the library while
also greatly improving throughput. for a particular dataset on my
computer the beamformer would previously achieve ~1.3 GB/s; now it
is consistently ~2.1 GB/s

this also transitions w32 to using native w32 semaphores so that
lock timeouts actually work on w32
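
[editor's note] The mechanism behind the throughput and deadlock claims: each
sync variable in shared memory is now a real lock taken with a userspace
compare-and-swap, falling back to a futex wait on Linux, always with a timeout
so a crashed or slow peer cannot wedge the library. A minimal sketch of the
Linux acquire path, using standard C11 atomics in place of the repo's atomic_*
wrappers; wait_on_value() is a hypothetical stand-in for the futex-wait wrapper
(os_wait_on_value in os_linux.c):

    #include <stdatomic.h>

    /* assumed: futex(FUTEX_WAIT) wrapper; returns 0 on timeout */
    int wait_on_value(_Atomic int *addr, int old, int timeout_ms);

    /* try to take the lock; returns 1 on success, 0 on timeout.
     * with timeout_ms == 0 this degenerates to a single try-lock. */
    int shm_lock(_Atomic int *lock, int timeout_ms)
    {
        for (;;) {
            int expected = 0;
            if (atomic_compare_exchange_strong(lock, &expected, 1))
                return 1;
            /* expected now holds the observed value; sleep until it changes */
            if (!timeout_ms || !wait_on_value(lock, expected, timeout_ms))
                return 0;
        }
    }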

Diffstat:
M beamformer.c                      | 140 ++++++++++++++++++++++++++++++++++++++++++-------------------------------------
M beamformer.h                      |   5 +++--
M beamformer_parameters.h           |   2 --
M beamformer_work_queue.c           |  16 ----------------
M beamformer_work_queue.h           |  51 +++++++++++++++++++++++++++++++++++----------------
M build.c                           |   1 -
M helpers/ogl_beamformer_lib.c      | 173 ++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------
M helpers/ogl_beamformer_lib_base.h |   4 +++-
M os_linux.c                        |  72 ++++++++++++++++++++++++++++++++++++++++++++++++------------------------
M os_win32.c                        |  79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
M static.c                          |  35 ++++++++++++++---------------------
M ui.c                              |  32 ++++++++++++++++----------------
M util.c                            |   9 +++++++++
M util.h                            |  30 +++++++++++++++++++++---------
14 files changed, 391 insertions(+), 258 deletions(-)
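
[editor's note] Worth knowing before reading the diff: the futex-style
WaitOnAddress path never worked across processes on w32 (the old library
stubbed it out to a spin wait; see the removed os_wait_on_value_stub below), so
the locks are now backed by named semaphores, one per lock kind, named
"<shm_name>_lock_<index>" so the beamformer and any client open the same kernel
objects. A sketch of the acquire path under that assumption; the i32 lock word
is only mirrored so userspace can peek at lock state without a syscall:

    #include <windows.h>

    /* sketch: the named semaphore provides cross-process mutual exclusion and
     * kernel timeouts; timeout_ms == INFINITE blocks forever like the futex path */
    int w32_shm_lock(HANDLE semaphore, volatile LONG *lock_word, DWORD timeout_ms)
    {
        int acquired = WaitForSingleObject(semaphore, timeout_ms) == WAIT_OBJECT_0;
        if (acquired)
            InterlockedExchange(lock_word, 1); /* mirror state for cheap peeking */
        return acquired;
    }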

diff --git a/beamformer.c b/beamformer.c
@@ -1,6 +1,5 @@
 /* See LICENSE for license details. */
 /* TODO(rnp):
- * [ ]: refactor: BeamformGPUComputeContext
  * [ ]: refactor: compute shader timers should be generated based on the pipeline stage limit
  * [ ]: reinvestigate ring buffer raw_data_ssbo
  *      - to minimize latency the main thread should manage the subbuffer upload so that the
@@ -10,6 +9,14 @@
  *        can overwrite one while the other is in use.
  *      - make use of glFenceSync to guard buffer uploads
  * [ ]: BeamformWorkQueue -> BeamformerWorkQueue
+ * [ ]: bug: re-beamform on shader reload
+ * [ ]: need to keep track of gpu memory in some way
+ *      - want to be able to store more than 16 2D frames but limit 3D frames
+ *      - maybe keep track of how much gpu memory is committed for beamformed images
+ *        and use that to determine when to loop back over existing textures
+ *      - to do this maybe use a circular linked list instead of a flat array
+ *      - then have a way of querying how many frames are available for a specific point count
+ * [ ]: bug: reinit cuda on hot-reload
  */

 #include "beamformer.h"
@@ -111,7 +118,7 @@ function void
 alloc_shader_storage(BeamformerCtx *ctx, u32 rf_raw_size, Arena a)
 {
 	ComputeShaderCtx *cs = &ctx->csctx;
-	BeamformerParameters *bp = &ctx->shared_memory->parameters;
+	BeamformerParameters *bp = &((BeamformerSharedMemory *)ctx->shared_memory.region)->parameters;

 	cs->dec_data_dim = uv4_from_u32_array(bp->dec_data_dim);
 	cs->rf_raw_size  = rf_raw_size;
@@ -161,6 +168,7 @@ fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, ImagePlaneTag pl
 	u32 frame_id    = atomic_add_u32(&ctx->next_render_frame_index, 1);
 	u32 frame_index = frame_id % countof(ctx->beamform_frames);
 	work->type      = BW_COMPUTE;
+	work->lock      = BeamformerSharedMemoryLockKind_DispatchCompute;
 	work->frame     = ctx->beamform_frames + frame_index;
 	work->frame->ready_to_present = 0;
 	work->frame->frame.id         = frame_id;
@@ -270,7 +278,8 @@ compute_cursor_finished(struct compute_cursor *cursor)
 function void
 do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, ShaderKind shader)
 {
-	ComputeShaderCtx *csctx = &ctx->csctx;
+	ComputeShaderCtx       *csctx = &ctx->csctx;
+	BeamformerSharedMemory *sm    = ctx->shared_memory.region;

 	glUseProgram(csctx->programs[shader]);
@@ -364,10 +373,10 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame,
 		aframe->frame.id = ctx->averaged_frame_index;
 		/* TODO(rnp): hack we need a better way of specifying which frames to sum;
 		 * this is fine for rolling averaging but what if we want to do something else */
-		ASSERT(frame >= ctx->beamform_frames);
-		ASSERT(frame < ctx->beamform_frames + ARRAY_COUNT(ctx->beamform_frames));
+		assert(frame >= ctx->beamform_frames);
+		assert(frame < ctx->beamform_frames + countof(ctx->beamform_frames));
 		u32 base_index = (u32)(frame - ctx->beamform_frames);
-		u32 to_average = ctx->shared_memory->parameters.output_points[3];
+		u32 to_average = sm->parameters.output_points[3];
 		u32 frame_count = 0;
 		u32 *in_textures = push_array(&arena, u32, MAX_BEAMFORMED_SAVED_FRAMES);
 		ComputeFrameIterator cfi = compute_frame_iterator(ctx, 1 + base_index - to_average,
@@ -482,11 +491,11 @@ reload_compute_shader(BeamformerCtx *ctx, ShaderReloadContext *src, s8 name_extr
 function void
-complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_context, iz barrier_offset)
+complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_context)
 {
 	ComputeShaderCtx *cs = &ctx->csctx;
-	BeamformerParameters *bp = &ctx->shared_memory->parameters;
-	BeamformerSharedMemory *sm = ctx->shared_memory;
+	BeamformerSharedMemory *sm = ctx->shared_memory.region;
+	BeamformerParameters *bp = &sm->parameters;

 	BeamformWork *work = beamform_work_queue_pop(q);
 	while (work) {
@@ -517,7 +526,7 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co
 			}
 		} break;
 		case BW_UPLOAD_BUFFER: {
-			assert(!atomic_load_u32((i32 *)(barrier_offset + work->completion_barrier)));
+			ctx->os.shared_memory_region_lock(&ctx->shared_memory, sm->locks, (i32)work->lock, -1);
 			BeamformerUploadContext *uc = &work->upload_context;
 			u32 tex_type, tex_format, tex_element_count, tex_1d = 0, buffer = 0;
 			switch (uc->kind) {
@@ -541,7 +550,7 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co
 				tex_element_count = ARRAY_COUNT(sm->sparse_elements);
 			} break;
 			case BU_KIND_PARAMETERS: {
-				ctx->ui_read_params = barrier_offset != 0;
+				ctx->ui_read_params = ctx->beamform_work_queue != q;
 				buffer = cs->shared_ubo;
 			} break;
 			case BU_KIND_RF_DATA: {
@@ -564,8 +573,26 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co
 				glNamedBufferSubData(buffer, 0, uc->size, (u8 *)sm + uc->shared_memory_offset);
 			}
+			ctx->os.shared_memory_region_unlock(&ctx->shared_memory, sm->locks, (i32)work->lock);
 		} break;
-		case BW_COMPUTE: {
+		case BW_COMPUTE_INDIRECT:{
+			fill_frame_compute_work(ctx, work, work->compute_indirect_plane);
+			DEBUG_DECL(work->type = BW_COMPUTE_INDIRECT;)
+		} /* FALLTHROUGH */
+		case BW_COMPUTE:{
+			/* NOTE(rnp): debug: here it is not a bug to release the lock if it
+			 * isn't held but elswhere it is */
+			DEBUG_DECL(if (sm->locks[work->lock])) {
+				ctx->os.shared_memory_region_unlock(&ctx->shared_memory,
+				                                    sm->locks, work->lock);
+			}
+			atomic_store_u32(&ctx->starting_compute, 0);
+
+			if (cs->shared_ubo_dirty) {
+				glNamedBufferSubData(cs->shared_ubo, 0, sizeof(sm->parameters), &sm->parameters);
+				cs->shared_ubo_dirty = 0;
+			}
+
 			atomic_store_u32(&cs->processing_compute, 1);
 			start_renderdoc_capture(gl_context);
@@ -640,14 +667,10 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co
 				INVALID_CODE_PATH;
 			}
 		} break;
-		default: INVALID_CODE_PATH; break;
+		InvalidDefaultCase;
 		}

 		if (can_commit) {
-			if (work->completion_barrier) {
-				i32 *value = (i32 *)(barrier_offset + work->completion_barrier);
-				ctx->os.wake_waiters(value);
-			}
 			beamform_work_queue_pop_commit(q);
 			work = beamform_work_queue_pop(q);
 		}
@@ -657,7 +680,7 @@
 DEBUG_EXPORT BEAMFORMER_COMPUTE_SETUP_FN(beamformer_compute_setup)
 {
 	BeamformerCtx *ctx = (BeamformerCtx *)user_context;
-	BeamformerSharedMemory *sm = ctx->shared_memory;
+	BeamformerSharedMemory *sm = ctx->shared_memory.region;
 	ComputeShaderCtx *cs = &ctx->csctx;

 	glCreateBuffers(1, &cs->shared_ubo);
@@ -678,9 +701,10 @@ DEBUG_EXPORT BEAMFORMER_COMPUTE_SETUP_FN(beamformer_compute_setup)
 DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute)
 {
-	BeamformerCtx *ctx = (BeamformerCtx *)user_context;
-	complete_queue(ctx, &ctx->shared_memory->external_work_queue, arena, gl_context, (iz)ctx->shared_memory);
-	complete_queue(ctx, ctx->beamform_work_queue, arena, gl_context, 0);
+	BeamformerCtx *ctx = (BeamformerCtx *)user_context;
+	BeamformerSharedMemory *sm = ctx->shared_memory.region;
+	complete_queue(ctx, &sm->external_work_queue, arena, gl_context);
+	complete_queue(ctx, ctx->beamform_work_queue, arena, gl_context);
 }

 #include "ui.c"
@@ -700,49 +724,40 @@ DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
 		DEBUG_DECL(end_frame_capture = ctx->os.end_frame_capture);
 	}

-	BeamformerParameters *bp = &ctx->shared_memory->parameters;
-	if (ctx->shared_memory->dispatch_compute_sync) {
-		ImagePlaneTag current_plane = ctx->shared_memory->current_image_plane;
-		atomic_store_u32(&ctx->shared_memory->dispatch_compute_sync, 0);
-		BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue);
-		if (work) {
-			if (fill_frame_compute_work(ctx, work, current_plane))
+	BeamformerSharedMemory *sm = ctx->shared_memory.region;
+	BeamformerParameters *bp = &sm->parameters;
+	if (sm->locks[BeamformerSharedMemoryLockKind_DispatchCompute] && !ctx->starting_compute) {
+		if (sm->start_compute_from_main) {
+			BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue);
+			ImagePlaneTag tag = ctx->beamform_frames[ctx->display_frame_index].image_plane_tag;
+			if (fill_frame_compute_work(ctx, work, tag)) {
 				beamform_work_queue_push_commit(ctx->beamform_work_queue);
-
-			if (ctx->shared_memory->export_next_frame) {
-				BeamformWork *export = beamform_work_queue_push(ctx->beamform_work_queue);
-				if (export) {
-					/* TODO: we don't really want the beamformer opening/closing files */
-					iptr f = ctx->os.open_for_write(ctx->os.export_pipe_name);
-					export->type = BW_SAVE_FRAME;
-					export->output_frame_ctx.file_handle = f;
-					if (bp->output_points[3] > 1) {
-						u32 a_index = !(ctx->averaged_frame_index %
-						                ARRAY_COUNT(ctx->averaged_frames));
-						BeamformComputeFrame *aframe = ctx->averaged_frames + a_index;
-						export->output_frame_ctx.frame = aframe;
-					} else {
-						export->output_frame_ctx.frame = work->frame;
+				if (sm->export_next_frame) {
+					BeamformWork *export = beamform_work_queue_push(ctx->beamform_work_queue);
+					if (export) {
+						/* TODO: we don't really want the beamformer opening/closing files */
+						iptr f = ctx->os.open_for_write(ctx->os.export_pipe_name);
+						export->type = BW_SAVE_FRAME;
+						export->output_frame_ctx.file_handle = f;
+						if (bp->output_points[3] > 1) {
+							static_assert(countof(ctx->averaged_frames) == 2,
+							              "fix this, we assume average frame ping pong buffer");
+							u32 a_index = !(ctx->averaged_frame_index %
+							                countof(ctx->averaged_frames));
+							BeamformComputeFrame *aframe = ctx->averaged_frames + a_index;
+							export->output_frame_ctx.frame = aframe;
+						} else {
+							export->output_frame_ctx.frame = work->frame;
+						}
+						beamform_work_queue_push_commit(ctx->beamform_work_queue);
 					}
-					beamform_work_queue_push_commit(ctx->beamform_work_queue);
+					sm->export_next_frame = 0;
 				}
-				ctx->shared_memory->export_next_frame = 0;
-			}
-
-			ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
-		}
-	}
-
-	if (ctx->start_compute) {
-		if (ctx->beamform_frames[ctx->display_frame_index].ready_to_present) {
-			BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue);
-			ImagePlaneTag plane = ctx->beamform_frames[ctx->display_frame_index].image_plane_tag;
-			if (fill_frame_compute_work(ctx, work, plane)) {
-				beamform_work_queue_push_commit(ctx->beamform_work_queue);
-				ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
-				ctx->start_compute = 0;
 			}
+			atomic_store_u32(&sm->start_compute_from_main, 0);
 		}
+		atomic_store_u32(&ctx->starting_compute, 1);
+		ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
 	}

 	ComputeFrameIterator cfi = compute_frame_iterator(ctx, ctx->display_frame_index,
@@ -754,14 +769,9 @@ DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
 		}
 	}

-	if (ctx->start_compute) {
-		ctx->start_compute = 0;
-		ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
-	}
-
 	BeamformComputeFrame *frame_to_draw;
 	if (bp->output_points[3] > 1) {
-		u32 a_index = !(ctx->averaged_frame_index % ARRAY_COUNT(ctx->averaged_frames));
+		u32 a_index = !(ctx->averaged_frame_index % countof(ctx->averaged_frames));
 		frame_to_draw = ctx->averaged_frames + a_index;
 	} else {
 		frame_to_draw = ctx->beamform_frames + ctx->display_frame_index;
diff --git a/beamformer.h b/beamformer.h
@@ -87,6 +87,7 @@ typedef struct {
 	u32 raw_data_ssbo;

 	u32 shared_ubo;
+	b32 shared_ubo_dirty;

 	u32 channel_mapping_texture;
 	u32 sparse_elements_texture;
@@ -171,7 +172,7 @@ typedef struct {
 	GLParams gl;

 	uv2 window_size;
-	b32 start_compute;
+	b32 starting_compute;
 	b32 should_exit;

 	Arena ui_backing_store;
@@ -201,7 +202,7 @@ typedef struct {

 	BeamformWorkQueue *beamform_work_queue;

-	BeamformerSharedMemory *shared_memory;
+	SharedMemoryRegion shared_memory;
 } BeamformerCtx;

 struct ShaderReloadContext {
diff --git a/beamformer_parameters.h b/beamformer_parameters.h
@@ -7,8 +7,6 @@
  * programatically would be nice. */

-#define BEAMFORMER_PARAMETERS_VERSION (2UL)
-
 /* X(enumarant, number, shader file name, needs header, pretty name) */
 #define COMPUTE_SHADERS \
 	X(CudaDecode, 0, "", 0, "CUDA Decode") \
diff --git a/beamformer_work_queue.c b/beamformer_work_queue.c
@@ -50,19 +50,3 @@ DEBUG_EXPORT BEAMFORM_WORK_QUEUE_PUSH_COMMIT_FN(beamform_work_queue_push_commit)
 {
 	atomic_add_u64(&q->queue, 1);
 }
-
-function b32
-try_wait_sync(i32 *sync, i32 timeout_ms, os_wait_on_value_fn *os_wait_on_value)
-{
-	b32 result = 0;
-	for (;;) {
-		i32 current = atomic_load_u32(sync);
-		if (current && atomic_cas_u32(sync, &current, 0)) {
-			result = 1;
-			break;
-		}
-		if (!timeout_ms || !os_wait_on_value(sync, 0, timeout_ms))
-			break;
-	}
-	return result;
-}
diff --git a/beamformer_work_queue.h b/beamformer_work_queue.h
@@ -1,12 +1,20 @@
 /* See LICENSE for license details. */
+/* TODO(rnp):
+ * [ ]: coalesce uploads if they are overwriting exist data
+ *      - use a flag field and only submit a new work if the corresponding flag is clear
+ */
+
 #ifndef _BEAMFORMER_WORK_QUEUE_H_
 #define _BEAMFORMER_WORK_QUEUE_H_

+#define BEAMFORMER_SHARED_MEMORY_VERSION (3UL)
+
 typedef struct BeamformComputeFrame BeamformComputeFrame;
 typedef struct ShaderReloadContext ShaderReloadContext;

 typedef enum {
 	BW_COMPUTE,
+	BW_COMPUTE_INDIRECT,
 	BW_RELOAD_SHADER,
 	BW_SAVE_FRAME,
 	BW_SEND_FRAME,
@@ -33,6 +41,21 @@ typedef struct {
 	iptr file_handle;
 } BeamformOutputFrameContext;

+#define BEAMFORMER_SHARED_MEMORY_LOCKS \
+	X(None)            \
+	X(Parameters)      \
+	X(ParametersHead)  \
+	X(ParametersUI)    \
+	X(FocalVectors)    \
+	X(ChannelMapping)  \
+	X(SparseElements)  \
+	X(RawData)         \
+	X(DispatchCompute)
+
+#define X(name) BeamformerSharedMemoryLockKind_##name,
+typedef enum {BEAMFORMER_SHARED_MEMORY_LOCKS BeamformerSharedMemoryLockKind_Count} BeamformerSharedMemoryLockKind;
+#undef X
+
 /* NOTE: discriminated union based on type */
 typedef struct {
 	union {
@@ -40,11 +63,10 @@ typedef struct {
 		BeamformerUploadContext upload_context;
 		BeamformOutputFrameContext output_frame_ctx;
 		ShaderReloadContext *shader_reload_context;
+		ImagePlaneTag compute_indirect_plane;
 		void *generic;
 	};
-	/* NOTE(rnp): mostly for __external__ processes to sync on. when passed from external
-	 * process this should be an offset from base of shared_memory */
-	iptr completion_barrier;
+	BeamformerSharedMemoryLockKind lock;
 	BeamformWorkType type;
 } BeamformWork;

@@ -68,9 +90,15 @@ typedef BEAMFORM_WORK_QUEUE_PUSH_COMMIT_FN(beamform_work_queue_push_commit_fn);
                                       - (uintptr_t)(sizeof(BeamformerSharedMemory) & 4095ULL))
 #define BEAMFORMER_MAX_RF_DATA_SIZE (BEAMFORMER_SHARED_MEMORY_SIZE - BEAMFORMER_RF_DATA_OFF)

-typedef struct {
+typedef align_as(64) struct {
+	u32 version;
+
+	/* NOTE(rnp): not used for locking on w32 but we can use these to peek at the status of
+	 * the lock without leaving userspace. also this struct needs a bunch of padding */
+	i32 locks[BeamformerSharedMemoryLockKind_Count];
+
 	/* NOTE(rnp): interleaved transmit angle, focal depth pairs */
-	_Alignas(64) v2 focal_vectors[256];
+	align_as(64) v2 focal_vectors[256];

 	i16 channel_mapping[256];
 	i16 sparse_elements[256];
@@ -87,21 +115,12 @@ typedef struct {
 	ComputeShaderKind compute_stages[MAX_COMPUTE_SHADER_STAGES];
 	u32 compute_stages_count;

-	i32 parameters_sync;
-	i32 parameters_head_sync;
-	i32 parameters_ui_sync;
-	i32 focal_vectors_sync;
-	i32 channel_mapping_sync;
-	i32 sparse_elements_sync;
-	i32 raw_data_sync;
-
-	i32 dispatch_compute_sync;
-	ImagePlaneTag current_image_plane;
+	/* TODO(rnp): hack: we need a different way of dispatching work for export */
+	b32 start_compute_from_main;

 	/* TODO(rnp): this shouldn't be needed */
 	b32 export_next_frame;

-	u32 version;
 	BeamformWorkQueue external_work_queue;
 } BeamformerSharedMemory;
diff --git a/build.c b/build.c
@@ -272,7 +272,6 @@ W32(b32) CreateProcessA(u8 *, u8 *, iptr, iptr, b32, u32, iptr, u8 *, iptr, iptr
 W32(b32) GetExitCodeProcess(iptr handle, u32 *);
 W32(b32) GetFileTime(iptr, iptr, iptr, iptr);
 W32(b32) MoveFileExA(c8 *, c8 *, u32);
-W32(u32) WaitForSingleObject(iptr, u32);

 function void
 os_make_directory(char *name)
diff --git a/helpers/ogl_beamformer_lib.c b/helpers/ogl_beamformer_lib.c
@@ -8,6 +8,7 @@

 #define PIPE_RETRY_PERIOD_MS (100ULL)

+global SharedMemoryRegion g_shared_memory;
 global BeamformerSharedMemory *g_bp;
 global BeamformerLibErrorKind g_lib_last_error;

@@ -72,15 +73,14 @@ os_wait_read_pipe(Pipe p, void *buf, iz read_size, u32 timeout_ms)
 	return total_read == read_size;
 }

-static BeamformerSharedMemory *
+function SharedMemoryRegion
 os_open_shared_memory_area(char *name)
 {
-	BeamformerSharedMemory *result = 0;
+	SharedMemoryRegion result = {0};
 	i32 fd = shm_open(name, O_RDWR, S_IRUSR|S_IWUSR);
 	if (fd > 0) {
 		void *new = mmap(0, BEAMFORMER_SHARED_MEMORY_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
-		if (new != MAP_FAILED)
-			result = new;
+		if (new != MAP_FAILED) result.region = new;
 		close(fd);
 	}
 	return result;
@@ -88,15 +88,6 @@ os_open_shared_memory_area(char *name)

 #elif OS_WINDOWS

-/* TODO(rnp): temporary workaround */
-function OS_WAIT_ON_VALUE_FN(os_wait_on_value_stub)
-{
-	/* TODO(rnp): this doesn't work across processes on win32 (return 1 to cause a spin wait) */
-	return 1;
-	return WaitOnAddress(value, &current, sizeof(*value), timeout_ms);
-}
-#define os_wait_on_value os_wait_on_value_stub
-
 static Pipe
 os_open_read_pipe(char *name)
 {
@@ -145,16 +136,35 @@ os_wait_read_pipe(Pipe p, void *buf, iz read_size, u32 timeout_ms)
 	return total_read == read_size;
 }

-function BeamformerSharedMemory *
+function SharedMemoryRegion
 os_open_shared_memory_area(char *name)
 {
-	BeamformerSharedMemory *result = 0;
+	SharedMemoryRegion result = {0};
 	iptr h = OpenFileMappingA(FILE_MAP_ALL_ACCESS, 0, name);
 	if (h != INVALID_FILE) {
-		result = MapViewOfFile(h, FILE_MAP_ALL_ACCESS, 0, 0, BEAMFORMER_SHARED_MEMORY_SIZE);
+		void *new = MapViewOfFile(h, FILE_MAP_ALL_ACCESS, 0, 0,
+		                          os_round_up_to_page_size(BEAMFORMER_SHARED_MEMORY_SIZE));
+		if (new) {
+			u8 buffer[1024];
+			Stream sb = {.data = buffer, .cap = 1024};
+			stream_append_s8s(&sb, c_str_to_s8(name), s8("_lock_"));
+			local_persist iptr semaphores[BeamformerSharedMemoryLockKind_Count];
+			local_persist w32_shared_memory_context ctx = {.semaphores = semaphores};
+			b32 all_semaphores = 1;
+			for (i32 i = 0; i < countof(semaphores); i++) {
+				Stream lb = sb;
+				stream_append_i64(&lb, i);
+				stream_append_byte(&lb, 0);
+				semaphores[i] = CreateSemaphoreA(0, 1, 1, (c8 *)lb.data);
+				all_semaphores &= semaphores[i] != INVALID_FILE;
+			}
+			if (all_semaphores) {
+				result.region     = new;
+				result.os_context = (iptr)&ctx;
+			}
+		}
 		CloseHandle(h);
 	}
-
 	return result;
 }
@@ -164,16 +174,19 @@ function b32
 check_shared_memory(void)
 {
 	b32 result = 1;
-	if (!g_bp) {
-		g_bp = os_open_shared_memory_area(OS_SHARED_MEMORY_NAME);
-		if (!g_bp) {
+	if (!g_shared_memory.region) {
+		g_shared_memory = os_open_shared_memory_area(OS_SHARED_MEMORY_NAME);
+		if (!g_shared_memory.region) {
 			result = 0;
 			g_lib_last_error = BF_LIB_ERR_KIND_SHARED_MEMORY;
 		}
-	} else if (g_bp->version != BEAMFORMER_PARAMETERS_VERSION) {
+	} else if (((BeamformerSharedMemory *)g_shared_memory.region)->version
+	           != BEAMFORMER_SHARED_MEMORY_VERSION)
+	{
 		g_lib_last_error = BF_LIB_ERR_KIND_VERSION_MISMATCH;
 		result = 0;
 	}
+	if (result) g_bp = g_shared_memory.region;
 	return result;
 }

@@ -186,17 +199,23 @@ try_push_work_queue(void)
 }

 function b32
-lib_try_wait_sync(i32 *sync, i32 timeout_ms, os_wait_on_value_fn *os_wait_on_value)
+lib_try_lock(BeamformerSharedMemoryLockKind lock, i32 timeout_ms)
 {
-	b32 result = try_wait_sync(sync, timeout_ms, os_wait_on_value);
+	b32 result = os_shared_memory_region_lock(&g_shared_memory, g_bp->locks, (i32)lock, timeout_ms);
 	if (!result) g_lib_last_error = BF_LIB_ERR_KIND_SYNC_VARIABLE;
 	return result;
 }

+function void
+lib_release_lock(BeamformerSharedMemoryLockKind lock)
+{
+	os_shared_memory_region_unlock(&g_shared_memory, g_bp->locks, (i32)lock);
+}
+
 u32
 beamformer_get_api_version(void)
 {
-	return BEAMFORMER_PARAMETERS_VERSION;
+	return BEAMFORMER_SHARED_MEMORY_VERSION;
 }

 const char *
@@ -245,64 +264,60 @@ set_beamformer_pipeline(i32 *stages, i32 stages_count)
 }

 b32
-beamformer_start_compute(u32 image_plane_tag)
+beamformer_start_compute(i32 timeout_ms)
 {
 	b32 result = 0;
-	if (image_plane_tag < IPT_LAST) {
-		if (check_shared_memory()) {
-			if (atomic_load_u32(&g_bp->dispatch_compute_sync) == 0) {
-				g_bp->current_image_plane = image_plane_tag;
-				atomic_store_u32(&g_bp->dispatch_compute_sync, 1);
+	if (check_shared_memory()) {
+		if (lib_try_lock(BeamformerSharedMemoryLockKind_DispatchCompute, 0)) {
+			if (lib_try_lock(BeamformerSharedMemoryLockKind_DispatchCompute, timeout_ms)) {
+				lib_release_lock(BeamformerSharedMemoryLockKind_DispatchCompute);
 				result = 1;
-			} else {
-				g_lib_last_error = BF_LIB_ERR_KIND_SYNC_VARIABLE;
 			}
 		}
-	} else {
-		g_lib_last_error = BF_LIB_ERR_KIND_INVALID_IMAGE_PLANE;
 	}
 	return result;
 }

 function b32
-beamformer_upload_buffer(void *data, u32 size, i32 store_offset, i32 sync_offset,
+beamformer_upload_buffer(void *data, u32 size, i32 store_offset, BeamformerSharedMemoryLockKind lock,
                          BeamformerUploadKind kind, i32 timeout_ms)
 {
 	b32 result = 0;
 	if (check_shared_memory()) {
 		BeamformWork *work = try_push_work_queue();
-		result = work && lib_try_wait_sync((i32 *)((u8 *)g_bp + sync_offset), timeout_ms, os_wait_on_value);
+		result = work && lib_try_lock(lock, timeout_ms);
 		if (result) {
 			BeamformerUploadContext *uc = &work->upload_context;
 			uc->shared_memory_offset = store_offset;
 			uc->size = size;
 			uc->kind = kind;
 			work->type = BW_UPLOAD_BUFFER;
-			work->completion_barrier = sync_offset;
+			work->lock = lock;
 			mem_copy((u8 *)g_bp + store_offset, data, size);
 			beamform_work_queue_push_commit(&g_bp->external_work_queue);
+			lib_release_lock(lock);
 		}
 	}
 	return result;
 }

 #define BEAMFORMER_UPLOAD_FNS \
-	X(channel_mapping, i16, 1, CHANNEL_MAPPING) \
-	X(sparse_elements, i16, 1, SPARSE_ELEMENTS) \
-	X(focal_vectors,   f32, 2, FOCAL_VECTORS)
+	X(channel_mapping, i16, 1, ChannelMapping, CHANNEL_MAPPING) \
+	X(sparse_elements, i16, 1, SparseElements, SPARSE_ELEMENTS) \
+	X(focal_vectors,   f32, 2, FocalVectors,   FOCAL_VECTORS)

-#define X(name, dtype, elements, command) \
+#define X(name, dtype, elements, lock_name, command) \
 b32 beamformer_push_##name (dtype *data, u32 count, i32 timeout_ms) { \
-	b32 result = 0; \
-	if (count <= countof(g_bp->name)) { \
-		result = beamformer_upload_buffer(data, count * elements * sizeof(dtype), \
-		                                  offsetof(BeamformerSharedMemory, name), \
-		                                  offsetof(BeamformerSharedMemory, name##_sync), \
-		                                  BU_KIND_##command, timeout_ms); \
-	} else { \
-		g_lib_last_error = BF_LIB_ERR_KIND_BUFFER_OVERFLOW; \
-	} \
-	return result; \
+	b32 result = 0; \
+	if (count <= countof(g_bp->name)) { \
+		result = beamformer_upload_buffer(data, count * elements * sizeof(dtype), \
+		                                  offsetof(BeamformerSharedMemory, name), \
+		                                  BeamformerSharedMemoryLockKind_##lock_name, \
+		                                  BU_KIND_##command, timeout_ms); \
+	} else { \
+		g_lib_last_error = BF_LIB_ERR_KIND_BUFFER_OVERFLOW; \
+	} \
+	return result; \
 }
 BEAMFORMER_UPLOAD_FNS
 #undef X
@@ -312,19 +327,20 @@ beamformer_push_parameters(BeamformerParameters *bp, i32 timeout_ms)
 {
 	b32 result = beamformer_upload_buffer(bp, sizeof(*bp),
 	                                      offsetof(BeamformerSharedMemory, parameters),
-	                                      offsetof(BeamformerSharedMemory, parameters_sync),
+	                                      BeamformerSharedMemoryLockKind_Parameters,
 	                                      BU_KIND_PARAMETERS, timeout_ms);
 	return result;
 }

-b32
-beamformer_push_data(void *data, u32 data_size, i32 timeout_ms)
+function b32
+beamformer_push_data_base(void *data, u32 data_size, i32 timeout_ms, b32 start_from_main)
 {
 	b32 result = 0;
 	if (data_size <= BEAMFORMER_MAX_RF_DATA_SIZE) {
 		result = beamformer_upload_buffer(data, data_size, BEAMFORMER_RF_DATA_OFF,
-		                                  offsetof(BeamformerSharedMemory, raw_data_sync),
+		                                  BeamformerSharedMemoryLockKind_RawData,
 		                                  BU_KIND_RF_DATA, timeout_ms);
+		if (result && start_from_main) atomic_store_u32(&g_bp->start_compute_from_main, 1);
 	} else {
 		g_lib_last_error = BF_LIB_ERR_KIND_BUFFER_OVERFLOW;
 	}
@@ -332,21 +348,49 @@ beamformer_push_data(void *data, u32 data_size, i32 timeout_ms)
 }

 b32
+beamformer_push_data(void *data, u32 data_size, i32 timeout_ms)
+{
+	return beamformer_push_data_base(data, data_size, timeout_ms, 1);
+}
+
+b32
+beamformer_push_data_with_compute(void *data, u32 data_size, u32 image_plane_tag, i32 timeout_ms)
+{
+	b32 result = beamformer_push_data_base(data, data_size, timeout_ms, 0);
+	if (result) {
+		result = image_plane_tag < IPT_LAST;
+		if (result) {
+			BeamformWork *work = try_push_work_queue();
+			result = work != 0;
+			if (result) {
+				work->type = BW_COMPUTE_INDIRECT;
+				work->compute_indirect_plane = image_plane_tag;
+				beamform_work_queue_push_commit(&g_bp->external_work_queue);
+			}
+		} else {
+			g_lib_last_error = BF_LIB_ERR_KIND_INVALID_IMAGE_PLANE;
+		}
+	}
+	return result;
+}
+
+b32
 beamformer_push_parameters_ui(BeamformerUIParameters *bp, i32 timeout_ms)
 {
 	b32 result = 0;
 	if (check_shared_memory()) {
 		BeamformWork *work = try_push_work_queue();
-		result = work && lib_try_wait_sync(&g_bp->parameters_ui_sync, timeout_ms, os_wait_on_value);
+		result = work && lib_try_lock(BeamformerSharedMemoryLockKind_ParametersUI, timeout_ms);
 		if (result) {
 			BeamformerUploadContext *uc = &work->upload_context;
 			uc->shared_memory_offset = offsetof(BeamformerSharedMemory, parameters);
 			uc->size = sizeof(g_bp->parameters);
 			uc->kind = BU_KIND_PARAMETERS;
 			work->type = BW_UPLOAD_BUFFER;
-			work->completion_barrier = offsetof(BeamformerSharedMemory, parameters_ui_sync);
+			work->lock = BeamformerSharedMemoryLockKind_ParametersUI;
 			mem_copy(&g_bp->parameters_ui, bp, sizeof(*bp));
 			beamform_work_queue_push_commit(&g_bp->external_work_queue);
+			lib_release_lock(BeamformerSharedMemoryLockKind_ParametersUI);
 		}
 	}
 	return result;
@@ -358,16 +402,17 @@ beamformer_push_parameters_head(BeamformerParametersHead *bp, i32 timeout_ms)
 	b32 result = 0;
 	if (check_shared_memory()) {
 		BeamformWork *work = try_push_work_queue();
-		result = work && lib_try_wait_sync(&g_bp->parameters_head_sync, timeout_ms, os_wait_on_value);
+		result = work && lib_try_lock(BeamformerSharedMemoryLockKind_ParametersHead, timeout_ms);
 		if (result) {
 			BeamformerUploadContext *uc = &work->upload_context;
 			uc->shared_memory_offset = offsetof(BeamformerSharedMemory, parameters);
 			uc->size = sizeof(g_bp->parameters);
 			uc->kind = BU_KIND_PARAMETERS;
 			work->type = BW_UPLOAD_BUFFER;
-			work->completion_barrier = offsetof(BeamformerSharedMemory, parameters_head_sync);
+			work->lock = BeamformerSharedMemoryLockKind_ParametersHead;
 			mem_copy(&g_bp->parameters_head, bp, sizeof(*bp));
 			beamform_work_queue_push_commit(&g_bp->external_work_queue);
+			lib_release_lock(BeamformerSharedMemoryLockKind_ParametersHead);
 		}
 	}
 	return result;
@@ -393,14 +438,8 @@ b32
 send_data(void *data, u32 data_size)
 {
 	b32 result = 0;
-	if (beamformer_push_data(data, data_size, 0)) {
-		result = beamformer_start_compute(0);
-		if (result) {
-			/* TODO(rnp): should we just set timeout on acquiring the lock instead of this? */
-			try_wait_sync(&g_bp->raw_data_sync, -1, os_wait_on_value);
-			atomic_store_u32(&g_bp->raw_data_sync, 1);
-		}
-	}
+	if (beamformer_push_data(data, data_size, 0))
+		result = beamformer_start_compute(-1);
 	return result;
 }
diff --git a/helpers/ogl_beamformer_lib_base.h b/helpers/ogl_beamformer_lib_base.h
@@ -38,10 +38,12 @@ LIB_FN uint32_t send_data(void *data, uint32_t data_size);
 LIB_FN uint32_t beamform_data_synchronized(void *data, uint32_t data_size, uint32_t output_points[3],
                                            float *out_data, int32_t timeout_ms);

-LIB_FN uint32_t beamformer_start_compute(uint32_t image_plane_tag);
+/* NOTE: tells the beamformer to start beamforming and waits until it starts or for timeout_ms */
+LIB_FN uint32_t beamformer_start_compute(int32_t timeout_ms);

 /* NOTE: these functions only queue an upload; you must flush (old data functions or start_compute) */
 LIB_FN uint32_t beamformer_push_data(void *data, uint32_t size, int32_t timeout_ms);
+LIB_FN uint32_t beamformer_push_data_with_compute(void *data, uint32_t size, uint32_t image_plane_tag, int32_t timeout_ms);
 LIB_FN uint32_t beamformer_push_channel_mapping(int16_t *mapping, uint32_t count, int32_t timeout_ms);
 LIB_FN uint32_t beamformer_push_sparse_elements(int16_t *elements, uint32_t count, int32_t timeout_ms);
 LIB_FN uint32_t beamformer_push_focal_vectors(float *vectors, uint32_t count, int32_t timeout_ms);
diff --git a/os_linux.c b/os_linux.c
@@ -83,24 +83,25 @@ os_get_timer_counter(void)
 	return result;
 }

+function iz
+os_round_up_to_page_size(iz value)
+{
+	iz result = round_up_to(value, sysconf(_SC_PAGESIZE));
+	return result;
+}
+
 function OS_ALLOC_ARENA_FN(os_alloc_arena)
 {
-	Arena result;
-	iz pagesize = sysconf(_SC_PAGESIZE);
-	if (capacity % pagesize != 0)
-		capacity += pagesize - capacity % pagesize;
-
-	iz oldsize = old.end - old.beg;
-	if (oldsize > capacity)
-		return old;
-
-	if (old.beg)
-		munmap(old.beg, oldsize);
-
-	result.beg = mmap(0, capacity, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
-	if (result.beg == MAP_FAILED)
-		os_fatal(s8("os_alloc_arena: couldn't allocate memory\n"));
-	result.end = result.beg + capacity;
+	Arena result = old;
+	capacity = os_round_up_to_page_size(capacity);
+	iz old_size = old.end - old.beg;
+	if (old_size < capacity) {
+		if (old.beg) munmap(old.beg, old_size);
+		result.beg = mmap(0, capacity, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+		if (result.beg == MAP_FAILED)
+			os_fatal(s8("os_alloc_arena: couldn't allocate memory\n"));
+		result.end = result.beg + capacity;
+	}
 	return result;
 }

@@ -163,15 +164,15 @@ function OS_READ_FILE_FN(os_read_file)
 	return total_read;
 }

-function void *
-os_create_shared_memory_area(char *name, iz cap)
+function SharedMemoryRegion
+os_create_shared_memory_area(Arena *arena, char *name, i32 lock_count, iz requested_capacity)
 {
-	void *result = 0;
+	iz capacity = os_round_up_to_page_size(requested_capacity);
+	SharedMemoryRegion result = {0};
 	i32 fd = shm_open(name, O_CREAT|O_RDWR, S_IRUSR|S_IWUSR);
-	if (fd > 0 && ftruncate(fd, cap) != -1) {
-		void *new = mmap(NULL, cap, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
-		if (new != MAP_FAILED)
-			result = new;
+	if (fd > 0 && ftruncate(fd, capacity) != -1) {
+		void *new = mmap(0, capacity, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+		if (new != MAP_FAILED) result.region = new;
 	}
 	if (fd > 0) close(fd);
 	return result;
@@ -294,7 +295,30 @@ function OS_WAIT_ON_VALUE_FN(os_wait_on_value)
 function OS_WAKE_WAITERS_FN(os_wake_waiters)
 {
 	if (sync) {
-		atomic_store_u32(sync, 1);
+		atomic_store_u32(sync, 0);
 		syscall(SYS_futex, sync, FUTEX_WAKE, I32_MAX, 0, 0, 0);
 	}
 }
+
+function OS_SHARED_MEMORY_LOCK_REGION_FN(os_shared_memory_region_lock)
+{
+	b32 result = 0;
+	for (;;) {
+		i32 current = atomic_load_u32(locks + lock_index);
+		if (current == 0 && atomic_cas_u32(locks + lock_index, &current, 1)) {
+			result = 1;
+			break;
+		}
+		if (!timeout_ms || !os_wait_on_value(locks + lock_index, current, timeout_ms))
+			break;
+	}
+	return result;
+}
+
+function OS_SHARED_MEMORY_UNLOCK_REGION_FN(os_shared_memory_region_unlock)
+{
+	i32 *lock = locks + lock_index;
+	assert(atomic_load_u32(lock));
+	atomic_store_u32(lock, 0);
+	os_wake_waiters(lock);
+}
diff --git a/os_win32.c b/os_win32.c
@@ -83,12 +83,17 @@ typedef struct {
 	iptr context;
 } w32_io_completion_event;

+typedef struct {
+	iptr *semaphores;
+} w32_shared_memory_context;
+
 #define W32(r) __declspec(dllimport) r __stdcall
 W32(b32)  CloseHandle(iptr);
 W32(b32)  CopyFileA(c8 *, c8 *, b32);
 W32(iptr) CreateFileA(c8 *, u32, u32, void *, u32, u32, void *);
 W32(iptr) CreateFileMappingA(iptr, void *, u32, u32, u32, c8 *);
 W32(iptr) CreateIoCompletionPort(iptr, iptr, uptr, u32);
+W32(iptr) CreateSemaphoreA(iptr, i32, i32, c8 *);
 W32(iptr) CreateThread(iptr, uz, iptr, iptr, u32, u32 *);
 W32(b32)  DeleteFileA(c8 *);
 W32(void) ExitProcess(i32);
@@ -107,8 +112,9 @@ W32(b32)  QueryPerformanceCounter(u64 *);
 W32(b32)  QueryPerformanceFrequency(u64 *);
 W32(b32)  ReadDirectoryChangesW(iptr, u8 *, u32, b32, u32, u32 *, void *, void *);
 W32(b32)  ReadFile(iptr, u8 *, i32, i32 *, void *);
-W32(b32)  ReleaseSemaphore(iptr, i64, i64 *);
+W32(b32)  ReleaseSemaphore(iptr, i32, i32 *);
 W32(i32)  SetThreadDescription(iptr, u16 *);
+W32(u32)  WaitForSingleObject(iptr, u32);
 W32(b32)  WaitOnAddress(void *, void *, uz, u32);
 W32(i32)  WakeByAddressAll(void *);
 W32(iptr) wglGetProcAddress(c8 *);
@@ -168,10 +174,9 @@ os_get_timer_counter(void)
 	return result;
 }

-function OS_ALLOC_ARENA_FN(os_alloc_arena)
+function iz
+os_round_up_to_page_size(iz value)
 {
-	Arena result = old;
-
 	struct {
 		u16 architecture;
 		u16 _pad1;
@@ -185,17 +190,18 @@ function OS_ALLOC_ARENA_FN(os_alloc_arena)
 		u16 processor_level;
 		u16 processor_revision;
 	} info;
 	GetSystemInfo(&info);
+	iz result = round_up_to(value, info.page_size);
+	return result;
+}

-	if (capacity % info.page_size != 0)
-		capacity += (info.page_size - capacity % info.page_size);
-
+function OS_ALLOC_ARENA_FN(os_alloc_arena)
+{
+	Arena result = old;
+	capacity = os_round_up_to_page_size(capacity);
 	iz old_size = old.end - old.beg;
 	if (old_size < capacity) {
-		if (old.beg)
-			VirtualFree(old.beg, old_size, MEM_RELEASE);
-
+		if (old.beg) VirtualFree(old.beg, old_size, MEM_RELEASE);
 		result.beg = VirtualAlloc(0, capacity, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
 		if (!result.beg)
 			os_fatal(s8("os_alloc_arena: couldn't allocate memory\n"));
@@ -268,13 +274,34 @@ os_file_exists(char *path)
 	return result;
 }

-function void *
-os_create_shared_memory_area(char *name, iz cap)
+function SharedMemoryRegion
+os_create_shared_memory_area(Arena *arena, char *name, i32 lock_count, iz requested_capacity)
 {
-	void *result = 0;
-	iptr h = CreateFileMappingA(-1, 0, PAGE_READWRITE, 0, cap, name);
-	if (h != INVALID_FILE)
-		result = MapViewOfFile(h, FILE_MAP_ALL_ACCESS, 0, 0, cap);
+	iz capacity = os_round_up_to_page_size(requested_capacity);
+	SharedMemoryRegion result = {0};
+	iptr h = CreateFileMappingA(-1, 0, PAGE_READWRITE, 0, capacity, name);
+	if (h != INVALID_FILE) {
+		void *new = MapViewOfFile(h, FILE_MAP_ALL_ACCESS, 0, 0, capacity);
+		if (new) {
+			w32_shared_memory_context *ctx = push_struct(arena, typeof(*ctx));
+			ctx->semaphores = push_array(arena, typeof(*ctx->semaphores), lock_count);
+			result.os_context = (iptr)ctx;
+			result.region     = new;
+
+			Stream sb = arena_stream(*arena);
+			stream_append_s8s(&sb, c_str_to_s8(name), s8("_lock_"));
+			for (i32 i = 0; i < lock_count; i++) {
+				Stream lb = sb;
+				stream_append_i64(&lb, i);
+				stream_append_byte(&lb, 0);
+				ctx->semaphores[i] = CreateSemaphoreA(0, 1, 1, (c8 *)lb.data);
+				if (ctx->semaphores[i] == INVALID_FILE) {
+					os_fatal(s8("os_create_shared_memory_area: "
+					            "failed to create semaphore\n"));
+				}
+			}
+		}
+	}
 	return result;
 }

@@ -380,7 +407,23 @@ function OS_WAIT_ON_VALUE_FN(os_wait_on_value)
 function OS_WAKE_WAITERS_FN(os_wake_waiters)
 {
 	if (sync) {
-		atomic_add_u32(sync, 1);
+		atomic_store_u32(sync, 0);
 		WakeByAddressAll(sync);
 	}
 }
+
+function OS_SHARED_MEMORY_LOCK_REGION_FN(os_shared_memory_region_lock)
+{
+	w32_shared_memory_context *ctx = (typeof(ctx))sm->os_context;
+	b32 result = !WaitForSingleObject(ctx->semaphores[lock_index], timeout_ms);
+	if (result) atomic_store_u32(locks + lock_index, 1);
+	return result;
+}
+
+function OS_SHARED_MEMORY_UNLOCK_REGION_FN(os_shared_memory_region_unlock)
+{
+	w32_shared_memory_context *ctx = (typeof(ctx))sm->os_context;
+	assert(atomic_load_u32(locks + lock_index));
+	os_wake_waiters(locks + lock_index);
+	ReleaseSemaphore(ctx->semaphores[lock_index], 1, 0);
+}
diff --git a/static.c b/static.c
@@ -212,7 +212,6 @@ function FILE_WATCH_CALLBACK_FN(load_cuda_lib)
 	return result;
 }

-
 #define GLFW_VISIBLE 0x00020004
 void glfwWindowHint(i32, i32);
 iptr glfwCreateWindow(i32, i32, char *, iptr, iptr);
@@ -229,12 +228,12 @@ function OS_THREAD_ENTRY_POINT_FN(compute_worker_thread_entry_point)

 	for (;;) {
 		for (;;) {
-			i32 expected = 1;
-			if (atomic_cas_u32(&ctx->sync_variable, &expected, 0))
+			i32 expected = 0;
+			if (atomic_cas_u32(&ctx->sync_variable, &expected, 1))
 				break;

 			ctx->asleep = 1;
-			os_wait_on_value(&ctx->sync_variable, 0, -1);
+			os_wait_on_value(&ctx->sync_variable, 1, -1);
 			ctx->asleep = 0;
 		}
 		beamformer_complete_compute(ctx->user_context, ctx->arena, ctx->gl_context);
@@ -280,25 +279,19 @@ setup_beamformer(BeamformerCtx *ctx, BeamformerInput *input, Arena *memory)

 	ctx->beamform_work_queue = push_struct(memory, BeamformWorkQueue);

-	ctx->shared_memory = os_create_shared_memory_area(OS_SHARED_MEMORY_NAME, BEAMFORMER_SHARED_MEMORY_SIZE);
-	if (!ctx->shared_memory)
-		os_fatal(s8("Get more ram lol\n"));
-	mem_clear(ctx->shared_memory, 0, sizeof(*ctx->shared_memory));
-
-	ctx->shared_memory->version = BEAMFORMER_PARAMETERS_VERSION;
-	/* TODO(rnp): refactor - this is annoying */
-	ctx->shared_memory->parameters_sync      = 1;
-	ctx->shared_memory->parameters_head_sync = 1;
-	ctx->shared_memory->parameters_ui_sync   = 1;
-	ctx->shared_memory->raw_data_sync        = 1;
-	ctx->shared_memory->channel_mapping_sync = 1;
-	ctx->shared_memory->sparse_elements_sync = 1;
-	ctx->shared_memory->focal_vectors_sync   = 1;
+	ctx->shared_memory = os_create_shared_memory_area(memory, OS_SHARED_MEMORY_NAME,
+	                                                  BeamformerSharedMemoryLockKind_Count,
+	                                                  BEAMFORMER_SHARED_MEMORY_SIZE);
+	BeamformerSharedMemory *sm = ctx->shared_memory.region;
+	if (!sm) os_fatal(s8("Get more ram lol\n"));
+	mem_clear(sm, 0, sizeof(*sm));
+
+	sm->version = BEAMFORMER_SHARED_MEMORY_VERSION;

 	/* NOTE: default compute shader pipeline */
-	ctx->shared_memory->compute_stages[0]    = ComputeShaderKind_Decode;
-	ctx->shared_memory->compute_stages[1]    = ComputeShaderKind_DASCompute;
-	ctx->shared_memory->compute_stages_count = 2;
+	sm->compute_stages[0]    = ComputeShaderKind_Decode;
+	sm->compute_stages[1]    = ComputeShaderKind_DASCompute;
+	sm->compute_stages_count = 2;

 	if (ctx->gl.vendor_id == GL_VENDOR_NVIDIA
 	    && load_cuda_lib(&ctx->os, s8(OS_CUDA_LIB_NAME), (iptr)&ctx->cuda_lib, *memory))
diff --git a/ui.c b/ui.c
@@ -1965,18 +1965,18 @@ draw_compute_stats_view(BeamformerCtx *ctx, Arena arena, ComputeShaderStats *sta
 	read_only local_persist s8 labels[ComputeShaderKind_Count] = {COMPUTE_SHADERS};
 	#undef X

+	BeamformerSharedMemory *sm = ctx->shared_memory.region;
 	BeamformerUI *ui = ctx->ui;
 	f32 compute_time_sum = 0;
-	u32 stages = ctx->shared_memory->compute_stages_count;
+	u32 stages = sm->compute_stages_count;
 	TextSpec text_spec = {.font = &ui->font, .colour = FG_COLOUR, .flags = TF_LIMITED};

 	Table *table = table_new(&arena, stages + 1, 3, (TextAlignment []){TA_LEFT, TA_LEFT, TA_LEFT});
 	for (u32 i = 0; i < stages; i++) {
 		TableCell *cells = table_push_row(table, &arena, TRK_CELLS)->data;

 		Stream sb = arena_stream(arena);
-		u32 index = ctx->shared_memory->compute_stages[i];
+		ShaderKind index = (ShaderKind)sm->compute_stages[i];
 		compute_time_sum += stats->times[index];
 		stream_append_f64_e(&sb, stats->times[index]);
@@ -2824,6 +2824,7 @@ draw_ui(BeamformerCtx *ctx, BeamformerInput *input, BeamformFrame *frame_to_draw
         ComputeShaderStats *latest_compute_stats)
 {
 	BeamformerUI *ui = ctx->ui;
+	BeamformerSharedMemory *sm = ctx->shared_memory.region;

 	ui->latest_plane[IPT_LAST]    = frame_to_draw;
 	ui->latest_plane[frame_plane] = frame_to_draw;
@@ -2831,7 +2832,7 @@ draw_ui(BeamformerCtx *ctx, BeamformerInput *input, BeamformFrame *frame_to_draw

 	/* TODO(rnp): there should be a better way of detecting this */
 	if (ctx->ui_read_params) {
-		mem_copy(&ui->params, &ctx->shared_memory->parameters.output_min_coordinate, sizeof(ui->params));
+		mem_copy(&ui->params, &sm->parameters.output_min_coordinate, sizeof(ui->params));
 		ui->flush_params = 0;
 		ctx->ui_read_params = 0;
 	}
@@ -2841,19 +2842,18 @@ draw_ui(BeamformerCtx *ctx, BeamformerInput *input, BeamformFrame *frame_to_draw
 	ui_interact(ui, input, ctx->window_size);

 	if (ui->flush_params) {
+		i32 lock = BeamformerSharedMemoryLockKind_Parameters;
 		validate_ui_parameters(&ui->params);
-		BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue);
-		if (work && try_wait_sync(&ctx->shared_memory->parameters_sync, 0, ctx->os.wait_on_value)) {
-			BeamformerUploadContext *uc = &work->upload_context;
-			uc->shared_memory_offset = offsetof(BeamformerSharedMemory, parameters);
-			uc->size = sizeof(ctx->shared_memory->parameters);
-			uc->kind = BU_KIND_PARAMETERS;
-			work->type = BW_UPLOAD_BUFFER;
-			work->completion_barrier = (iptr)&ctx->shared_memory->parameters_sync;
-			mem_copy(&ctx->shared_memory->parameters_ui, &ui->params, sizeof(ui->params));
-			beamform_work_queue_push_commit(ctx->beamform_work_queue);
-			ui->flush_params = 0;
-			ctx->start_compute = 1;
+		if (ctx->os.shared_memory_region_lock(&ctx->shared_memory, sm->locks, lock, 0)) {
+			mem_copy(&sm->parameters_ui, &ui->params, sizeof(ui->params));
+			ui->flush_params = 0;
+			ctx->csctx.shared_ubo_dirty = 1;
+			b32 dispatch = ctx->os.shared_memory_region_lock(&ctx->shared_memory, sm->locks,
+			                                                 BeamformerSharedMemoryLockKind_DispatchCompute,
+			                                                 0);
+			sm->start_compute_from_main |= dispatch &
+			                               ctx->beamform_frames[ctx->display_frame_index].ready_to_present;
+			ctx->os.shared_memory_region_unlock(&ctx->shared_memory, sm->locks, lock);
 		}
 	}
diff --git a/util.c b/util.c
@@ -508,6 +508,15 @@ round_down_power_of_2(u32 a)
 	return result;
 }

+function iz
+round_up_to(iz value, iz multiple)
+{
+	iz result = value;
+	if (value % multiple != 0)
+		result += multiple - value % multiple;
+	return result;
+}
+
 function b32
 uv2_equal(uv2 a, uv2 b)
 {
diff --git a/util.h b/util.h
@@ -254,6 +254,11 @@ typedef struct {
 	iptr handle;
 } FileWatchContext;

+typedef struct {
+	void *region;
+	iptr  os_context;
+} SharedMemoryRegion;
+
 #define OS_ALLOC_ARENA_FN(name) Arena name(Arena old, iz capacity)
 typedef OS_ALLOC_ARENA_FN(os_alloc_arena_fn);

@@ -291,16 +296,23 @@ typedef OS_WRITE_FILE_FN(os_write_file_fn);
 #define OS_THREAD_ENTRY_POINT_FN(name) iptr name(iptr _ctx)
 typedef OS_THREAD_ENTRY_POINT_FN(os_thread_entry_point_fn);

+#define OS_SHARED_MEMORY_LOCK_REGION_FN(name) b32 name(SharedMemoryRegion *sm, i32 *locks, i32 lock_index, i32 timeout_ms)
+typedef OS_SHARED_MEMORY_LOCK_REGION_FN(os_shared_memory_region_lock_fn);
+
+#define OS_SHARED_MEMORY_UNLOCK_REGION_FN(name) void name(SharedMemoryRegion *sm, i32 *locks, i32 lock_index)
+typedef OS_SHARED_MEMORY_UNLOCK_REGION_FN(os_shared_memory_region_unlock_fn);
+
 #define OS_FNS \
-	X(add_file_watch)  \
-	X(alloc_arena)     \
-	X(close)           \
-	X(open_for_write)  \
-	X(read_file)       \
-	X(read_whole_file) \
-	X(wait_on_value)   \
-	X(wake_waiters)    \
-	X(write_new_file)  \
+	X(add_file_watch)              \
+	X(alloc_arena)                 \
+	X(close)                       \
+	X(open_for_write)              \
+	X(read_file)                   \
+	X(read_whole_file)             \
+	X(shared_memory_region_lock)   \
+	X(shared_memory_region_unlock) \
+	X(wake_waiters)                \
+	X(write_new_file)              \
 	X(write_file)

 #define RENDERDOC_GET_API_FN(name) b32 name(u32 version, void **out_api)
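
[editor's note] For downstream users, the visible change is the library entry
points in helpers/ogl_beamformer_lib_base.h: beamformer_start_compute() now
takes a timeout instead of an image plane tag, and
beamformer_push_data_with_compute() bundles the RF upload with a compute
dispatch. A usage sketch (the buffer, sizes, and 100 ms timeouts are
placeholder values; error handling elided):

    #include <stdint.h>
    #include "ogl_beamformer_lib_base.h"

    void push_one_frame(void *rf_data, uint32_t rf_size, uint32_t plane_tag)
    {
        /* upload RF data and queue an indirect compute dispatch for this plane */
        if (beamformer_push_data_with_compute(rf_data, rf_size, plane_tag, 100))
            return;

        /* fallback: push the data alone, then explicitly start compute,
         * waiting up to 100 ms for the beamformer to pick it up */
        if (beamformer_push_data(rf_data, rf_size, 100))
            beamformer_start_compute(100);
    }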