ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git

Commit: 9130418de20f2674c3152ae6ef4bafaba43b6a00
Parent: 358a6695b6f36e4ae9425bf8ce1dcf07e9c8df9a
Author: Randy Palamar
Date:   Sun, 22 Jun 2025 12:21:30 -0600

core/lib: use shared memory for export, add compute stats export

This touches many files because deleting the export pipe means
that a large portion of other trash can be removed. It also
greatly shrinks the platform layer surface area.
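
With the pipe gone, a client gets beamformed data and the new compute timings back through the shared-memory scratch region via the library. Below is a minimal client-side sketch using the entry points this commit leaves declared in helpers/ogl_beamformer_lib_base.h; the acquisition size, output grid, one-second timeouts, and include paths are illustrative, and it assumes a running beamformer whose imaging parameters were configured beforehand.

/* sketch only: drive the new shared-memory export path from a client.
 * Sizes, timeouts, and include paths below are placeholders. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include "beamformer_parameters.h"   /* shader kinds + BeamformerComputeStatsTable (path illustrative) */
#include "ogl_beamformer_lib_base.h" /* LIB_FN entry points (path illustrative) */

int
main(void)
{
	/* example pipeline: decode then DAS, mirroring the default set in setup_beamformer() */
	int32_t stages[] = {BeamformerShaderKind_Decode, BeamformerShaderKind_DASCompute};
	set_beamformer_pipeline(stages, 2);

	/* placeholder acquisition: 256 channels x 2048 samples of 16-bit RF data */
	uint32_t rf_size = 256u * 2048u * sizeof(int16_t);
	void    *rf_data = calloc(1, rf_size);

	uint32_t output_points[3] = {512, 1, 1024};
	/* beamformed output is 2 floats (complex) per point */
	float *out_data = malloc((size_t)output_points[0] * output_points[1] *
	                         output_points[2] * 2 * sizeof(float));

	/* upload, beamform, and copy the result back out of the scratch region;
	 * this single call replaces the old export-pipe round trip */
	if (!beamform_data_synchronized(rf_data, rf_size, output_points, out_data, 1000))
		fprintf(stderr, "beamform_data_synchronized failed\n");

	/* new in this commit: download the last 32 frames worth of per-shader timings */
	BeamformerComputeStatsTable stats;
	if (beamformer_compute_timings(&stats, 1000)) {
		float sum = 0;
		for (int i = 0; i < 32; i++)
			sum += stats.times[i][BeamformerShaderKind_DASCompute];
		printf("average DAS compute time: %f [s]\n", sum / 32);
	}

	free(out_data);
	free(rf_data);
	return 0;
}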

Diffstat:
M beamformer.c | 249 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------------------------------------------------
M beamformer.h | 25 +++++--------------------
M beamformer_parameters.h | 16 +++++++++++++---
M beamformer_work_queue.c | 11 +++++++++++
M beamformer_work_queue.h | 62 ++++++++++++++++++++++++++++++++++----------------------------
M build.c | 2 +-
M helpers/ogl_beamformer_lib.c | 197 +++++++++++++++++++++++++++++--------------------------------------------------
M helpers/ogl_beamformer_lib_base.h | 10 ++++++----
M main_linux.c | 3 +--
M main_w32.c | 3 +--
M os_linux.c | 41 ++++++-----------------------------------
M os_win32.c | 37 ++++++-------------------------------
M static.c | 13 +++++++------
M tests/throughput.c | 9 +++++----
M ui.c | 11 +++++------
M util.h | 26 ++++----------------------
16 files changed, 291 insertions(+), 424 deletions(-)
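
Exports now land in the shared-memory scratch region, whose start is padded out to a 4 KiB boundary past the shared-memory header; the BEAMFORMER_SCRATCH_OFF macro in beamformer_work_queue.h (renamed from BEAMFORMER_RF_DATA_OFF in this commit) carries that computation. A standalone sketch of the same arithmetic, using a made-up header size, just to show the alignment behaviour:

#include <stdint.h>
#include <stdio.h>

/* stand-in for sizeof(BeamformerSharedMemory); the real value comes from the struct */
#define HEADER_SIZE (70000ULL)

/* same shape as BEAMFORMER_SCRATCH_OFF: pad the header out to a 4096-byte boundary
 * (when the header size is already aligned this still skips one extra page) */
#define SCRATCH_OFF (HEADER_SIZE + 4096ULL - (uintptr_t)(HEADER_SIZE & 4095ULL))

int
main(void)
{
	printf("header %llu -> scratch offset %llu (4096-aligned: %d)\n",
	       (unsigned long long)HEADER_SIZE, (unsigned long long)SCRATCH_OFF,
	       SCRATCH_OFF % 4096 == 0);
	return 0;
}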

diff --git a/beamformer.c b/beamformer.c @@ -160,6 +160,20 @@ push_compute_timing_info(ComputeTimingTable *t, ComputeTimingInfo info) t->buffer[index] = info; } +function BeamformComputeFrame * +beamformer_get_newest_frame(BeamformerCtx *ctx, b32 average_frame) +{ + BeamformComputeFrame *result = 0; + if (average_frame) { + u32 a_index = !(ctx->averaged_frame_index % countof(ctx->averaged_frames)); + result = ctx->averaged_frames + a_index; + } else { + u32 index = (ctx->next_render_frame_index - 1) % countof(ctx->beamform_frames); + result = ctx->beamform_frames + index; + } + return result; +} + function b32 fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, ImagePlaneTag plane) { @@ -168,7 +182,7 @@ fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, ImagePlaneTag pl result = 1; u32 frame_id = atomic_add_u32(&ctx->next_render_frame_index, 1); u32 frame_index = frame_id % countof(ctx->beamform_frames); - work->type = BW_COMPUTE; + work->kind = BeamformerWorkKind_Compute; work->lock = BeamformerSharedMemoryLockKind_DispatchCompute; work->frame = ctx->beamform_frames + frame_index; work->frame->ready_to_present = 0; @@ -179,19 +193,6 @@ fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, ImagePlaneTag pl } function void -export_frame(BeamformerCtx *ctx, iptr handle, BeamformFrame *frame) -{ - uv3 dim = frame->dim; - iz out_size = dim.x * dim.y * dim.z * 2 * sizeof(f32); - ctx->export_buffer = ctx->os.alloc_arena(ctx->export_buffer, out_size); - glGetTextureImage(frame->texture, 0, GL_RG, GL_FLOAT, out_size, ctx->export_buffer.beg); - s8 raw = {.len = out_size, .data = ctx->export_buffer.beg}; - if (!ctx->os.write_file(handle, raw)) - ctx->os.write_file(ctx->os.error_handle, s8("failed to export frame\n")); - ctx->os.close(handle); -} - -function void do_sum_shader(ComputeShaderCtx *cs, u32 *in_textures, u32 in_texture_count, f32 in_scale, u32 out_texture, uv3 out_data_dim) { @@ -200,7 +201,7 @@ do_sum_shader(ComputeShaderCtx *cs, u32 *in_textures, u32 in_texture_count, f32 glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT); glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F); - glProgramUniform1f(cs->programs[ShaderKind_Sum], CS_SUM_PRESCALE_UNIFORM_LOC, in_scale); + glProgramUniform1f(cs->programs[BeamformerShaderKind_Sum], CS_SUM_PRESCALE_UNIFORM_LOC, in_scale); for (u32 i = 0; i < in_texture_count; i++) { glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F); glDispatchCompute(ORONE(out_data_dim.x / 32), @@ -277,7 +278,7 @@ compute_cursor_finished(struct compute_cursor *cursor) } function void -do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, ShaderKind shader) +do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, BeamformerShaderKind shader) { ComputeShaderCtx *csctx = &ctx->csctx; BeamformerSharedMemory *sm = ctx->shared_memory.region; @@ -288,9 +289,9 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, u32 input_ssbo_idx = csctx->last_output_ssbo_index; switch (shader) { - case ShaderKind_Decode: - case ShaderKind_DecodeFloat: - case ShaderKind_DecodeFloatComplex:{ + case BeamformerShaderKind_Decode: + case BeamformerShaderKind_DecodeFloat: + case BeamformerShaderKind_DecodeFloatComplex:{ glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]); glBindImageTexture(0, csctx->hadamard_texture, 0, GL_FALSE, 0, GL_READ_ONLY, 
GL_R8I); @@ -300,15 +301,15 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, ceil_f32((f32)csctx->dec_data_dim.z / DECODE_LOCAL_SIZE_Z)); csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index; }break; - case ShaderKind_CudaDecode:{ + case BeamformerShaderKind_CudaDecode:{ ctx->cuda_lib.decode(0, output_ssbo_idx, 0); csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index; }break; - case ShaderKind_CudaHilbert: + case BeamformerShaderKind_CudaHilbert: ctx->cuda_lib.hilbert(input_ssbo_idx, output_ssbo_idx); csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index; break; - case ShaderKind_Demodulate:{ + case BeamformerShaderKind_Demodulate:{ glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]); glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32), @@ -316,7 +317,7 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, ORONE(csctx->dec_data_dim.z)); csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index; }break; - case ShaderKind_MinMax:{ + case BeamformerShaderKind_MinMax:{ u32 texture = frame->frame.texture; for (u32 i = 1; i < frame->frame.mips; i++) { glBindImageTexture(0, texture, i - 1, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F); @@ -330,7 +331,7 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); } }break; - case ShaderKind_DASCompute:{ + case BeamformerShaderKind_DASCompute:{ glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]); glBindImageTexture(0, frame->frame.texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F); glBindImageTexture(1, csctx->sparse_elements_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I); @@ -367,7 +368,7 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, #endif glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT|GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); }break; - case ShaderKind_Sum:{ + case BeamformerShaderKind_Sum:{ u32 aframe_index = ctx->averaged_frame_index % ARRAY_COUNT(ctx->averaged_frames); BeamformComputeFrame *aframe = ctx->averaged_frames + aframe_index; aframe->ready_to_present = 0; @@ -405,7 +406,7 @@ shader_text_with_header(ShaderReloadContext *ctx, OS *os, Arena *arena) stream_append_s8s(&sb, s8("#version 460 core\n\n"), ctx->header); switch (ctx->kind) { - case ShaderKind_DASCompute:{ + case BeamformerShaderKind_DASCompute:{ #define X(type, id, pretty, fixed_tx) "#define DAS_ID_" #type " " #id "\n" stream_append_s8(&sb, s8("" "layout(local_size_x = " str(DAS_LOCAL_SIZE_X) ", " @@ -417,14 +418,14 @@ shader_text_with_header(ShaderReloadContext *ctx, OS *os, Arena *arena) )); #undef X }break; - case ShaderKind_DecodeFloat: - case ShaderKind_DecodeFloatComplex:{ - if (ctx->kind == ShaderKind_DecodeFloat) + case BeamformerShaderKind_DecodeFloat: + case BeamformerShaderKind_DecodeFloatComplex:{ + if (ctx->kind == BeamformerShaderKind_DecodeFloat) stream_append_s8(&sb, s8("#define INPUT_DATA_TYPE_FLOAT\n\n")); else stream_append_s8(&sb, s8("#define INPUT_DATA_TYPE_FLOAT_COMPLEX\n\n")); } /* FALLTHROUGH */ - case ShaderKind_Decode:{ + case BeamformerShaderKind_Decode:{ #define X(type, id, pretty) "#define DECODE_MODE_" #type " " #id "\n" stream_append_s8(&sb, s8("" "layout(local_size_x = " str(DECODE_LOCAL_SIZE_X) ", " @@ -434,11 +435,11 @@ shader_text_with_header(ShaderReloadContext *ctx, OS *os, Arena *arena) )); #undef X 
}break; - case ShaderKind_MinMax:{ + case BeamformerShaderKind_MinMax:{ stream_append_s8(&sb, s8("layout(location = " str(CS_MIN_MAX_MIPS_LEVEL_UNIFORM_LOC) ") uniform int u_mip_map;\n\n")); }break; - case ShaderKind_Sum:{ + case BeamformerShaderKind_Sum:{ stream_append_s8(&sb, s8("layout(location = " str(CS_SUM_PRESCALE_UNIFORM_LOC) ") uniform float u_sum_prescale = 1.0;\n\n")); }break; @@ -477,7 +478,7 @@ DEBUG_EXPORT BEAMFORMER_RELOAD_SHADER_FN(beamformer_reload_shader) if (new_program) { glDeleteProgram(*src->shader); *src->shader = new_program; - if (src->kind == ShaderKind_Render2D) ctx->frame_view_render_context.updated = 1; + if (src->kind == BeamformerShaderKind_Render2D) ctx->frame_view_render_context.updated = 1; } return new_program != 0; } @@ -506,32 +507,62 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co BeamformWork *work = beamform_work_queue_pop(q); while (work) { b32 can_commit = 1; - switch (work->type) { - case BW_RELOAD_SHADER: { + switch (work->kind) { + case BeamformerWorkKind_ReloadShader:{ ShaderReloadContext *src = work->shader_reload_context; b32 success = reload_compute_shader(ctx, src, s8(""), arena); - if (src->kind == ShaderKind_Decode) { + if (src->kind == BeamformerShaderKind_Decode) { /* TODO(rnp): think of a better way of doing this */ - src->kind = ShaderKind_DecodeFloatComplex; - src->shader = cs->programs + ShaderKind_DecodeFloatComplex; + src->kind = BeamformerShaderKind_DecodeFloatComplex; + src->shader = cs->programs + BeamformerShaderKind_DecodeFloatComplex; success &= reload_compute_shader(ctx, src, s8(" (F32C)"), arena); - src->kind = ShaderKind_DecodeFloat; - src->shader = cs->programs + ShaderKind_DecodeFloat; + src->kind = BeamformerShaderKind_DecodeFloat; + src->shader = cs->programs + BeamformerShaderKind_DecodeFloat; success &= reload_compute_shader(ctx, src, s8(" (F32)"), arena); - src->kind = ShaderKind_Decode; - src->shader = cs->programs + ShaderKind_Decode; + src->kind = BeamformerShaderKind_Decode; + src->shader = cs->programs + BeamformerShaderKind_Decode; } - if (success) { + if (success && ctx->csctx.raw_data_ssbo) { /* TODO(rnp): this check seems off */ - if (ctx->csctx.raw_data_ssbo) { - can_commit = 0; - ImagePlaneTag plane = ctx->beamform_frames[ctx->display_frame_index].image_plane_tag; - fill_frame_compute_work(ctx, work, plane); + can_commit = 0; + BeamformComputeFrame *frame = beamformer_get_newest_frame(ctx, bp->output_points[3] > 1); + fill_frame_compute_work(ctx, work, frame->image_plane_tag); + } + }break; + case BeamformerWorkKind_ExportBuffer:{ + /* TODO(rnp): better way of handling DispatchCompute barrier */ + post_sync_barrier(&ctx->shared_memory, BeamformerSharedMemoryLockKind_DispatchCompute, + sm->locks, ctx->os.shared_memory_region_unlock); + ctx->os.shared_memory_region_lock(&ctx->shared_memory, sm->locks, (i32)work->lock, -1); + BeamformerExportContext *ec = &work->export_context; + switch (ec->kind) { + case BeamformerExportKind_BeamformedData:{ + BeamformComputeFrame *frame = beamformer_get_newest_frame(ctx, bp->output_points[3] > 1); + assert(frame->ready_to_present); + u32 texture = frame->frame.texture; + uv3 dim = frame->frame.dim; + iz out_size = dim.x * dim.y * dim.z * 2 * sizeof(f32); + if (out_size <= ec->size) { + glGetTextureImage(texture, 0, GL_RG, GL_FLOAT, out_size, + (u8 *)sm + BEAMFORMER_SCRATCH_OFF); } + }break; + case BeamformerExportKind_Stats:{ + ComputeTimingTable *table = ctx->compute_timing_table; + /* NOTE(rnp): do a little spin to let this finish 
updating */ + while (table->write_index != atomic_load_u32(&table->read_index)); + ComputeShaderStats *stats = ctx->compute_shader_stats; + if (sizeof(stats->table) <= ec->size) + mem_copy((u8 *)sm + BEAMFORMER_SCRATCH_OFF, &stats->table, sizeof(stats->table)); + }break; + InvalidDefaultCase; } - } break; - case BW_UPLOAD_BUFFER: { + ctx->os.shared_memory_region_unlock(&ctx->shared_memory, sm->locks, (i32)work->lock); + post_sync_barrier(&ctx->shared_memory, BeamformerSharedMemoryLockKind_ExportSync, sm->locks, + ctx->os.shared_memory_region_unlock); + }break; + case BeamformerWorkKind_UploadBuffer:{ ctx->os.shared_memory_region_lock(&ctx->shared_memory, sm->locks, (i32)work->lock, -1); BeamformerUploadContext *uc = &work->upload_context; u32 tex_type, tex_format, tex_element_count, tex_1d = 0, buffer = 0; @@ -589,18 +620,14 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co atomic_and_u32(&sm->dirty_regions, ~(sm->dirty_regions & 1 << (work->lock - 1))); ctx->os.shared_memory_region_unlock(&ctx->shared_memory, sm->locks, (i32)work->lock); - } break; - case BW_COMPUTE_INDIRECT:{ + }break; + case BeamformerWorkKind_ComputeIndirect:{ fill_frame_compute_work(ctx, work, work->compute_indirect_plane); - DEBUG_DECL(work->type = BW_COMPUTE_INDIRECT;) + DEBUG_DECL(work->kind = BeamformerWorkKind_ComputeIndirect;) } /* FALLTHROUGH */ - case BW_COMPUTE:{ - /* NOTE(rnp): debug: here it is not a bug to release the lock if it - * isn't held but elswhere it is */ - DEBUG_DECL(if (sm->locks[work->lock])) { - ctx->os.shared_memory_region_unlock(&ctx->shared_memory, - sm->locks, work->lock); - } + case BeamformerWorkKind_Compute:{ + post_sync_barrier(&ctx->shared_memory, work->lock, sm->locks, + ctx->os.shared_memory_region_unlock); push_compute_timing_info(ctx->compute_timing_table, (ComputeTimingInfo){.kind = ComputeTimingInfoKind_ComputeFrameBegin}); @@ -628,7 +655,6 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co } } - frame->in_flight = 1; frame->frame.min_coordinate = v4_from_f32_array(bp->output_min_coordinate); frame->frame.max_coordinate = v4_from_f32_array(bp->output_max_coordinate); frame->frame.das_shader_kind = bp->das_shader_id; @@ -636,11 +662,11 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co b32 did_sum_shader = 0; u32 stage_count = sm->compute_stages_count; - ComputeShaderKind *stages = sm->compute_stages; + BeamformerShaderKind *stages = sm->compute_stages; for (u32 i = 0; i < stage_count; i++) { - did_sum_shader |= stages[i] == ComputeShaderKind_Sum; + did_sum_shader |= stages[i] == BeamformerShaderKind_Sum; glBeginQuery(GL_TIME_ELAPSED, cs->shader_timer_ids[i]); - do_compute_shader(ctx, arena, frame, (ShaderKind)stages[i]); + do_compute_shader(ctx, arena, frame, stages[i]); glEndQuery(GL_TIME_ELAPSED); } /* NOTE(rnp): block until work completes so that we can record timings */ @@ -650,7 +676,7 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co for (u32 i = 0; i < stage_count; i++) { ComputeTimingInfo info = {0}; info.kind = ComputeTimingInfoKind_Shader; - info.shader = (ShaderKind)stages[i]; + info.shader = stages[i]; glGetQueryObjectui64v(cs->shader_timer_ids[i], GL_QUERY_RESULT, &info.timer_count); push_compute_timing_info(ctx->compute_timing_table, info); } @@ -668,16 +694,7 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co (ComputeTimingInfo){.kind = ComputeTimingInfoKind_ComputeFrameEnd}); 
end_renderdoc_capture(gl_context); - } break; - case BW_SAVE_FRAME: { - BeamformComputeFrame *frame = work->output_frame_ctx.frame; - if (frame->ready_to_present) { - export_frame(ctx, work->output_frame_ctx.file_handle, &frame->frame); - } else { - /* TODO(rnp): should we handle this? */ - INVALID_CODE_PATH; - } - } break; + }break; InvalidDefaultCase; } @@ -695,55 +712,57 @@ coalesce_timing_table(ComputeTimingTable *t, ComputeShaderStats *stats) * info item. this could result in garbage entries but they shouldn't really matter */ u32 target = atomic_load_u32(&t->write_index); - u32 stats_index = (stats->latest_frame_index + 1) % countof(stats->times); + u32 stats_index = (stats->latest_frame_index + 1) % countof(stats->table.times); - static_assert(ShaderKind_Count + 1 <= 32, "timing coalescence bitfield test"); + static_assert(BeamformerShaderKind_Count + 1 <= 32, "timing coalescence bitfield test"); u32 seen_info_test = 0; while (t->read_index != target) { - ComputeTimingInfo info = t->buffer[(t->read_index++) % countof(t->buffer)]; + ComputeTimingInfo info = t->buffer[t->read_index % countof(t->buffer)]; switch (info.kind) { case ComputeTimingInfoKind_ComputeFrameBegin:{ assert(t->compute_frame_active == 0); t->compute_frame_active = 1; /* NOTE(rnp): allow multiple instances of same shader to accumulate */ - mem_clear(stats->times[stats_index], 0, sizeof(stats->times[stats_index])); + mem_clear(stats->table.times[stats_index], 0, sizeof(stats->table.times[stats_index])); }break; case ComputeTimingInfoKind_ComputeFrameEnd:{ assert(t->compute_frame_active == 1); t->compute_frame_active = 0; stats->latest_frame_index = stats_index; - stats_index = (stats_index + 1) % countof(stats->times); + stats_index = (stats_index + 1) % countof(stats->table.times); }break; case ComputeTimingInfoKind_Shader:{ - stats->times[stats_index][info.shader] += (f32)info.timer_count / 1.0e9; + stats->table.times[stats_index][info.shader] += (f32)info.timer_count / 1.0e9; seen_info_test |= (1 << info.shader); }break; case ComputeTimingInfoKind_RF_Data:{ - stats->latest_rf_index = (stats->latest_rf_index + 1) % countof(stats->rf_time_deltas); + stats->latest_rf_index = (stats->latest_rf_index + 1) % countof(stats->table.rf_time_deltas); f32 delta = (f32)(info.timer_count - stats->last_rf_timer_count) / 1.0e9; - stats->rf_time_deltas[stats->latest_rf_index] = delta; + stats->table.rf_time_deltas[stats->latest_rf_index] = delta; stats->last_rf_timer_count = info.timer_count; - seen_info_test |= (1 << ShaderKind_Count); + seen_info_test |= (1 << BeamformerShaderKind_Count); }break; } + /* NOTE(rnp): do this at the end so that stats table is always in a consistent state */ + atomic_add_u32(&t->read_index, 1); } if (seen_info_test) { - for EachEnumValue(ShaderKind, shader) { + for EachEnumValue(BeamformerShaderKind, shader) { if (seen_info_test & (1 << shader)) { f32 sum = 0; - for EachElement(stats->times, i) - sum += stats->times[i][shader]; - stats->average_times[shader] = sum / countof(stats->times); + for EachElement(stats->table.times, i) + sum += stats->table.times[i][shader]; + stats->average_times[shader] = sum / countof(stats->table.times); } } - if (seen_info_test & (1 << ShaderKind_Count)) { + if (seen_info_test & (1 << BeamformerShaderKind_Count)) { f32 sum = 0; - for EachElement(stats->rf_time_deltas, i) - sum += stats->rf_time_deltas[i]; - stats->rf_time_delta_average = sum / countof(stats->rf_time_deltas); + for EachElement(stats->table.rf_time_deltas, i) + sum += stats->table.rf_time_deltas[i]; + 
stats->rf_time_delta_average = sum / countof(stats->table.rf_time_deltas); } } } @@ -805,58 +824,20 @@ DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step) BeamformerSharedMemory *sm = ctx->shared_memory.region; BeamformerParameters *bp = &sm->parameters; + b32 averaging = bp->output_points[3] > 1; if (sm->locks[BeamformerSharedMemoryLockKind_DispatchCompute] && ctx->os.compute_worker.asleep) { if (sm->start_compute_from_main) { BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue); - ImagePlaneTag tag = ctx->beamform_frames[ctx->display_frame_index].image_plane_tag; - if (fill_frame_compute_work(ctx, work, tag)) { + ImagePlaneTag tag = beamformer_get_newest_frame(ctx, averaging)->image_plane_tag; + if (fill_frame_compute_work(ctx, work, tag)) beamform_work_queue_push_commit(ctx->beamform_work_queue); - if (sm->export_next_frame) { - BeamformWork *export = beamform_work_queue_push(ctx->beamform_work_queue); - if (export) { - /* TODO: we don't really want the beamformer opening/closing files */ - iptr f = ctx->os.open_for_write(ctx->os.export_pipe_name); - export->type = BW_SAVE_FRAME; - export->output_frame_ctx.file_handle = f; - if (bp->output_points[3] > 1) { - static_assert(countof(ctx->averaged_frames) == 2, - "fix this, we assume average frame ping pong buffer"); - u32 a_index = !(ctx->averaged_frame_index % - countof(ctx->averaged_frames)); - BeamformComputeFrame *aframe = ctx->averaged_frames + a_index; - export->output_frame_ctx.frame = aframe; - } else { - export->output_frame_ctx.frame = work->frame; - } - beamform_work_queue_push_commit(ctx->beamform_work_queue); - } - sm->export_next_frame = 0; - } - } atomic_store_u32(&sm->start_compute_from_main, 0); } ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable); } - ComputeFrameIterator cfi = compute_frame_iterator(ctx, ctx->display_frame_index, - ctx->next_render_frame_index - ctx->display_frame_index); - for (BeamformComputeFrame *frame = frame_next(&cfi); frame; frame = frame_next(&cfi)) { - if (frame->in_flight && frame->ready_to_present) { - frame->in_flight = 0; - ctx->display_frame_index = frame - cfi.frames; - } - } - - BeamformComputeFrame *frame_to_draw; - if (bp->output_points[3] > 1) { - u32 a_index = !(ctx->averaged_frame_index % countof(ctx->averaged_frames)); - frame_to_draw = ctx->averaged_frames + a_index; - } else { - frame_to_draw = ctx->beamform_frames + ctx->display_frame_index; - } - - draw_ui(ctx, input, frame_to_draw->ready_to_present? &frame_to_draw->frame : 0, - frame_to_draw->image_plane_tag); + BeamformComputeFrame *frame = beamformer_get_newest_frame(ctx, averaging); + draw_ui(ctx, input, frame->ready_to_present? &frame->frame : 0, frame->image_plane_tag); ctx->frame_view_render_context.updated = 0; diff --git a/beamformer.h b/beamformer.h @@ -77,7 +77,7 @@ typedef struct { #define CS_SUM_PRESCALE_UNIFORM_LOC 1 typedef struct { - u32 programs[ComputeShaderKind_Count]; + u32 programs[BeamformerShaderKind_ComputeCount]; /* NOTE: Decoded data is only relevant in the context of a single frame. We use two * buffers so that they can be swapped when chaining multiple compute stages */ @@ -110,22 +110,11 @@ typedef enum { DASShaderKind_Count } DASShaderKind; -typedef enum { - #define X(e, n, s, h, pn) ShaderKind_##e = n, - COMPUTE_SHADERS - #undef X - ShaderKind_Render2D, - ShaderKind_Count -} ShaderKind; - typedef struct { - /* NOTE(rnp): this wants to be iterated on both dimensions. it depends entirely on which - * visualization method you want to use. 
the coalescing function wants both directions */ - f32 times[32][ShaderKind_Count]; - f32 average_times[ShaderKind_Count]; + BeamformerComputeStatsTable table; + f32 average_times[BeamformerShaderKind_Count]; u64 last_rf_timer_count; - f32 rf_time_deltas[32]; f32 rf_time_delta_average; u32 latest_frame_index; @@ -144,7 +133,7 @@ typedef struct { u64 timer_count; ComputeTimingInfoKind kind; union { - ShaderKind shader; + BeamformerShaderKind shader; }; } ComputeTimingInfo; @@ -175,7 +164,6 @@ typedef struct BeamformFrame { struct BeamformComputeFrame { BeamformFrame frame; ImagePlaneTag image_plane_tag; - b32 in_flight; b32 ready_to_present; }; @@ -210,7 +198,6 @@ typedef struct { BeamformComputeFrame beamform_frames[MAX_BEAMFORMED_SAVED_FRAMES]; u32 next_render_frame_index; - u32 display_frame_index; /* NOTE: this will only be used when we are averaging */ u32 averaged_frame_index; @@ -222,8 +209,6 @@ typedef struct { * destroying itself on hot-reload */ FrameViewRenderContext frame_view_render_context; - Arena export_buffer; - CudaLib cuda_lib; OS os; Stream error_stream; @@ -244,7 +229,7 @@ struct ShaderReloadContext { u32 *shader; ShaderReloadContext *link; GLenum gl_type; - ShaderKind kind; + BeamformerShaderKind kind; }; #define BEAMFORMER_FRAME_STEP_FN(name) void name(BeamformerCtx *ctx, Arena *arena, \ diff --git a/beamformer_parameters.h b/beamformer_parameters.h @@ -20,11 +20,21 @@ X(Sum, 8, "sum", 0, "Sum") typedef enum { - #define X(e, n, s, h, pn) ComputeShaderKind_##e = n, + #define X(e, n, s, h, pn) BeamformerShaderKind_##e = n, COMPUTE_SHADERS #undef X - ComputeShaderKind_Count -} ComputeShaderKind; + BeamformerShaderKind_Render2D, + BeamformerShaderKind_Count, + + BeamformerShaderKind_ComputeCount = BeamformerShaderKind_Render2D, +} BeamformerShaderKind; + +typedef struct { + /* NOTE(rnp): this wants to be iterated on both dimensions. it depends entirely on which + * visualization method you want to use. 
the coalescing function wants both directions */ + float times[32][BeamformerShaderKind_Count]; + float rf_time_deltas[32]; +} BeamformerComputeStatsTable; /* X(type, id, pretty name) */ #define DECODE_TYPES \ diff --git a/beamformer_work_queue.c b/beamformer_work_queue.c @@ -50,3 +50,14 @@ DEBUG_EXPORT BEAMFORM_WORK_QUEUE_PUSH_COMMIT_FN(beamform_work_queue_push_commit) { atomic_add_u64(&q->queue, 1); } + +function void +post_sync_barrier(SharedMemoryRegion *sm, BeamformerSharedMemoryLockKind lock, i32 *locks, + os_shared_memory_region_unlock_fn *os_shared_memory_region_unlock) +{ + /* NOTE(rnp): debug: here it is not a bug to release the lock if it + * isn't held but elswhere it is */ + DEBUG_DECL(if (locks[lock])) { + os_shared_memory_region_unlock(sm, locks, lock); + } +} diff --git a/beamformer_work_queue.h b/beamformer_work_queue.h @@ -2,19 +2,19 @@ #ifndef _BEAMFORMER_WORK_QUEUE_H_ #define _BEAMFORMER_WORK_QUEUE_H_ -#define BEAMFORMER_SHARED_MEMORY_VERSION (6UL) +#define BEAMFORMER_SHARED_MEMORY_VERSION (7UL) typedef struct BeamformComputeFrame BeamformComputeFrame; typedef struct ShaderReloadContext ShaderReloadContext; typedef enum { - BW_COMPUTE, - BW_COMPUTE_INDIRECT, - BW_RELOAD_SHADER, - BW_SAVE_FRAME, - BW_SEND_FRAME, - BW_UPLOAD_BUFFER, -} BeamformWorkType; + BeamformerWorkKind_Compute, + BeamformerWorkKind_ComputeIndirect, + BeamformerWorkKind_ReloadShader, + BeamformerWorkKind_SendFrame, + BeamformerWorkKind_ExportBuffer, + BeamformerWorkKind_UploadBuffer, +} BeamformerWorkKind; typedef enum { BU_KIND_CHANNEL_MAPPING, @@ -26,23 +26,29 @@ typedef enum { } BeamformerUploadKind; typedef struct { + BeamformerUploadKind kind; i32 size; i32 shared_memory_offset; - BeamformerUploadKind kind; } BeamformerUploadContext; +typedef enum { + BeamformerExportKind_BeamformedData, + BeamformerExportKind_Stats, +} BeamformerExportKind; + typedef struct { - BeamformComputeFrame *frame; - iptr file_handle; -} BeamformOutputFrameContext; + BeamformerExportKind kind; + i32 size; +} BeamformerExportContext; #define BEAMFORMER_SHARED_MEMORY_LOCKS \ X(None) \ - X(Parameters) \ - X(FocalVectors) \ X(ChannelMapping) \ + X(FocalVectors) \ + X(Parameters) \ + X(ScratchSpace) \ X(SparseElements) \ - X(RawData) \ + X(ExportSync) \ X(DispatchCompute) #define X(name) BeamformerSharedMemoryLockKind_##name, @@ -52,16 +58,15 @@ typedef enum {BEAMFORMER_SHARED_MEMORY_LOCKS BeamformerSharedMemoryLockKind_Coun /* NOTE: discriminated union based on type */ typedef struct { union { - BeamformComputeFrame *frame; - BeamformerUploadContext upload_context; - BeamformOutputFrameContext output_frame_ctx; - ShaderReloadContext *shader_reload_context; - ImagePlaneTag compute_indirect_plane; - void *generic; + BeamformComputeFrame *frame; + BeamformerUploadContext upload_context; + BeamformerExportContext export_context; + ShaderReloadContext *shader_reload_context; + ImagePlaneTag compute_indirect_plane; + void *generic; }; BeamformerSharedMemoryLockKind lock; - - BeamformWorkType type; + BeamformerWorkKind kind; } BeamformWork; typedef struct { @@ -79,11 +84,12 @@ typedef BEAMFORM_WORK_QUEUE_PUSH_FN(beamform_work_queue_push_fn); typedef BEAMFORM_WORK_QUEUE_PUSH_COMMIT_FN(beamform_work_queue_push_commit_fn); #define BEAMFORMER_SHARED_MEMORY_SIZE (GB(2)) -#define BEAMFORMER_RF_DATA_OFF (sizeof(BeamformerSharedMemory) + 4096ULL \ +#define BEAMFORMER_SCRATCH_OFF (sizeof(BeamformerSharedMemory) + 4096ULL \ - (uintptr_t)(sizeof(BeamformerSharedMemory) & 4095ULL)) -#define BEAMFORMER_MAX_RF_DATA_SIZE 
(BEAMFORMER_SHARED_MEMORY_SIZE - BEAMFORMER_RF_DATA_OFF) +#define BEAMFORMER_SCRATCH_SIZE (BEAMFORMER_SHARED_MEMORY_SIZE - BEAMFORMER_SCRATCH_OFF) +#define BEAMFORMER_MAX_RF_DATA_SIZE (BEAMFORMER_SCRATCH_SIZE) -typedef align_as(64) struct { +typedef struct { u32 version; /* NOTE(rnp): causes future library calls to fail. @@ -113,8 +119,8 @@ typedef align_as(64) struct { }; }; - ComputeShaderKind compute_stages[MAX_COMPUTE_SHADER_STAGES]; - u32 compute_stages_count; + BeamformerShaderKind compute_stages[MAX_COMPUTE_SHADER_STAGES]; + u32 compute_stages_count; /* TODO(rnp): hack: we need a different way of dispatching work for export */ b32 start_compute_from_main; diff --git a/build.c b/build.c @@ -698,7 +698,7 @@ main(i32 argc, char *argv[]) u64 start_time = os_get_timer_counter(); b32 result = 1; - Arena arena = os_alloc_arena((Arena){0}, MB(8)); + Arena arena = os_alloc_arena(MB(8)); check_rebuild_self(arena, argc, argv); Options options = parse_options(argc, argv); diff --git a/helpers/ogl_beamformer_lib.c b/helpers/ogl_beamformer_lib.c @@ -6,8 +6,6 @@ #include "ogl_beamformer_lib_base.h" #include "../beamformer_work_queue.c" -#define PIPE_RETRY_PERIOD_MS (100ULL) - global SharedMemoryRegion g_shared_memory; global BeamformerSharedMemory *g_bp; global BeamformerLibErrorKind g_lib_last_error; @@ -17,20 +15,7 @@ global BeamformerLibErrorKind g_lib_last_error; #elif OS_WINDOWS #include "../os_win32.c" -#define PIPE_TYPE_BYTE 0x00 -#define PIPE_ACCESS_INBOUND 0x01 - -#define PIPE_WAIT 0x00 -#define PIPE_NOWAIT 0x01 - -#define ERROR_NO_DATA 232L -#define ERROR_PIPE_NOT_CONNECTED 233L -#define ERROR_PIPE_LISTENING 536L - -W32(iptr) CreateNamedPipeA(c8 *, u32, u32, u32, u32, u32, u32, void *); -W32(b32) DisconnectNamedPipe(iptr); W32(iptr) OpenFileMappingA(u32, b32, c8 *); -W32(void) Sleep(u32); #else #error Unsupported Platform @@ -38,41 +23,6 @@ W32(void) Sleep(u32); #if OS_LINUX -function Pipe -os_open_read_pipe(char *name) -{ - mkfifo(name, 0660); - return (Pipe){.file = open(name, O_RDONLY|O_NONBLOCK), .name = name}; -} - -static void -os_disconnect_pipe(Pipe p) -{ -} - -static void -os_close_pipe(iptr *file, char *name) -{ - if (file) close(*file); - if (name) unlink(name); - *file = INVALID_FILE; -} - -static b32 -os_wait_read_pipe(Pipe p, void *buf, iz read_size, u32 timeout_ms) -{ - struct pollfd pfd = {.fd = p.file, .events = POLLIN}; - iz total_read = 0; - if (poll(&pfd, 1, timeout_ms) > 0) { - iz r; - do { - r = read(p.file, (u8 *)buf + total_read, read_size - total_read); - if (r > 0) total_read += r; - } while (r != 0); - } - return total_read == read_size; -} - function SharedMemoryRegion os_open_shared_memory_area(char *name) { @@ -88,54 +38,6 @@ os_open_shared_memory_area(char *name) #elif OS_WINDOWS -static Pipe -os_open_read_pipe(char *name) -{ - iptr file = CreateNamedPipeA(name, PIPE_ACCESS_INBOUND, PIPE_TYPE_BYTE|PIPE_NOWAIT, 1, - 0, 1024UL * 1024UL, 0, 0); - return (Pipe){.file = file, .name = name}; -} - -static void -os_disconnect_pipe(Pipe p) -{ - DisconnectNamedPipe(p.file); -} - -static void -os_close_pipe(iptr *file, char *name) -{ - if (file) CloseHandle(*file); - *file = INVALID_FILE; -} - -static b32 -os_wait_read_pipe(Pipe p, void *buf, iz read_size, u32 timeout_ms) -{ - iz elapsed_ms = 0, total_read = 0; - while (elapsed_ms <= timeout_ms && read_size != total_read) { - u8 data; - i32 read; - b32 result = ReadFile(p.file, &data, 0, &read, 0); - if (!result) { - i32 error = GetLastError(); - if (error != ERROR_NO_DATA && - error != ERROR_PIPE_LISTENING && - error 
!= ERROR_PIPE_NOT_CONNECTED) - { - /* NOTE: pipe is in a bad state; we will never read anything */ - break; - } - Sleep(PIPE_RETRY_PERIOD_MS); - elapsed_ms += PIPE_RETRY_PERIOD_MS; - } else { - ReadFile(p.file, (u8 *)buf + total_read, read_size - total_read, &read, 0); - total_read += read; - } - } - return total_read == read_size; -} - function SharedMemoryRegion os_open_shared_memory_area(char *name) { @@ -216,6 +118,18 @@ lib_release_lock(BeamformerSharedMemoryLockKind lock) os_shared_memory_region_unlock(&g_shared_memory, g_bp->locks, (i32)lock); } +function b32 +try_wait_sync(BeamformerSharedMemoryLockKind lock, i32 timeout_ms) +{ + b32 result = 0; + if (lib_try_lock(lock, 0) && lib_try_lock(lock, timeout_ms)) { + /* TODO(rnp): non-critical race condition */ + lib_release_lock(lock); + result = 1; + } + return result; +} + u32 beamformer_get_api_version(void) { @@ -251,7 +165,7 @@ set_beamformer_pipeline(i32 *stages, i32 stages_count) if (check_shared_memory()) { g_bp->compute_stages_count = 0; for (i32 i = 0; i < stages_count; i++) { - if (BETWEEN(stages[i], 0, ComputeShaderKind_Count)) { + if (BETWEEN(stages[i], 0, BeamformerShaderKind_ComputeCount)) { g_bp->compute_stages[g_bp->compute_stages_count++] = stages[i]; } } @@ -270,16 +184,8 @@ set_beamformer_pipeline(i32 *stages, i32 stages_count) b32 beamformer_start_compute(i32 timeout_ms) { - b32 result = 0; - if (check_shared_memory()) { - if (lib_try_lock(BeamformerSharedMemoryLockKind_DispatchCompute, 0)) { - if (lib_try_lock(BeamformerSharedMemoryLockKind_DispatchCompute, timeout_ms)) { - /* TODO(rnp): non-critical race condition */ - lib_release_lock(BeamformerSharedMemoryLockKind_DispatchCompute); - result = 1; - } - } - } + b32 result = check_shared_memory() && + try_wait_sync(BeamformerSharedMemoryLockKind_DispatchCompute, timeout_ms); return result; } @@ -293,7 +199,7 @@ beamformer_upload_buffer(void *data, u32 size, i32 store_offset, BeamformerUploa result = work && lib_try_lock(lock, timeout_ms); if (result) { work->upload_context = upload_context; - work->type = BW_UPLOAD_BUFFER; + work->kind = BeamformerWorkKind_UploadBuffer; work->lock = lock; mem_copy((u8 *)g_bp + store_offset, data, size); if ((atomic_load_u32(&g_bp->dirty_regions) & (1 << (lock - 1))) == 0) { @@ -335,11 +241,11 @@ beamformer_push_data_base(void *data, u32 data_size, i32 timeout_ms, b32 start_f b32 result = 0; if (data_size <= BEAMFORMER_MAX_RF_DATA_SIZE) { BeamformerUploadContext uc = {0}; - uc.shared_memory_offset = BEAMFORMER_RF_DATA_OFF; + uc.shared_memory_offset = BEAMFORMER_SCRATCH_OFF; uc.size = data_size; uc.kind = BU_KIND_RF_DATA; result = beamformer_upload_buffer(data, data_size, uc.shared_memory_offset, uc, - BeamformerSharedMemoryLockKind_RawData, timeout_ms); + BeamformerSharedMemoryLockKind_ScratchSpace, timeout_ms); if (result && start_from_main) atomic_store_u32(&g_bp->start_compute_from_main, 1); } else { g_lib_last_error = BF_LIB_ERR_KIND_BUFFER_OVERFLOW; @@ -363,7 +269,7 @@ beamformer_push_data_with_compute(void *data, u32 data_size, u32 image_plane_tag BeamformWork *work = try_push_work_queue(); result = work != 0; if (result) { - work->type = BW_COMPUTE_INDIRECT; + work->kind = BeamformerWorkKind_ComputeIndirect; work->compute_indirect_plane = image_plane_tag; beamform_work_queue_push_commit(&g_bp->external_work_queue); } @@ -438,6 +344,20 @@ send_data(void *data, u32 data_size) return result; } +function b32 +beamformer_export_buffer(BeamformerExportContext export_context) +{ + BeamformWork *work = try_push_work_queue(); + b32 
result = work != 0; + if (result) { + work->export_context = export_context; + work->kind = BeamformerWorkKind_ExportBuffer; + work->lock = BeamformerSharedMemoryLockKind_ScratchSpace; + beamform_work_queue_push_commit(&g_bp->external_work_queue); + } + return result; +} + b32 beamform_data_synchronized(void *data, u32 data_size, u32 output_points[3], f32 *out_data, i32 timeout_ms) { @@ -450,21 +370,46 @@ beamform_data_synchronized(void *data, u32 data_size, u32 output_points[3], f32 g_bp->parameters.output_points[0] = output_points[0]; g_bp->parameters.output_points[1] = output_points[1]; g_bp->parameters.output_points[2] = output_points[2]; - g_bp->export_next_frame = 1; - - Pipe export_pipe = os_open_read_pipe(OS_EXPORT_PIPE_NAME); - if (export_pipe.file != INVALID_FILE) { - if (send_data(data, data_size)) { - iz output_size = output_points[0] * output_points[1] * - output_points[2] * sizeof(f32) * 2; - result = os_wait_read_pipe(export_pipe, out_data, output_size, timeout_ms); - if (!result) g_lib_last_error = BF_LIB_ERR_KIND_READ_EXPORT_PIPE; - } - os_disconnect_pipe(export_pipe); - os_close_pipe(&export_pipe.file, export_pipe.name); + iz output_size = output_points[0] * output_points[1] * output_points[2] * sizeof(f32) * 2; + if (output_size <= BEAMFORMER_SCRATCH_SIZE && + beamformer_push_data_with_compute(data, data_size, 0, 0)) + { + BeamformerExportContext export; + export.kind = BeamformerExportKind_BeamformedData; + export.size = output_size; + if (beamformer_export_buffer(export) && + lib_try_lock(BeamformerSharedMemoryLockKind_DispatchCompute, 0)) + { + if (try_wait_sync(BeamformerSharedMemoryLockKind_ExportSync, timeout_ms)) { + mem_copy(out_data, (u8 *)g_bp + BEAMFORMER_SCRATCH_OFF, output_size); + result = 1; + } + } } else { - g_lib_last_error = BF_LIB_ERR_KIND_OPEN_EXPORT_PIPE; + g_lib_last_error = BF_LIB_ERR_KIND_EXPORT_SPACE_OVERFLOW; + } + } + return result; +} + +b32 +beamformer_compute_timings(BeamformerComputeStatsTable *output, i32 timeout_ms) +{ + b32 result = 0; + if (check_shared_memory()) { + static_assert(sizeof(*output) <= BEAMFORMER_SCRATCH_SIZE, "timing table size exceeds scratch space"); + BeamformerExportContext export; + export.kind = BeamformerExportKind_Stats; + export.size = sizeof(*output); + + if (beamformer_export_buffer(export) && + lib_try_lock(BeamformerSharedMemoryLockKind_DispatchCompute, 0)) + { + if (try_wait_sync(BeamformerSharedMemoryLockKind_ExportSync, timeout_ms)) { + mem_copy(output, (u8 *)g_bp + BEAMFORMER_SCRATCH_OFF, sizeof(*output)); + result = 1; + } } } return result; diff --git a/helpers/ogl_beamformer_lib_base.h b/helpers/ogl_beamformer_lib_base.h @@ -16,10 +16,9 @@ X(INVALID_IMAGE_PLANE, 5, "invalid image plane") \ X(BUFFER_OVERFLOW, 6, "passed buffer size exceeds available space") \ X(WORK_QUEUE_FULL, 7, "work queue full") \ - X(OPEN_EXPORT_PIPE, 8, "failed to open export pipe") \ - X(READ_EXPORT_PIPE, 9, "failed to read full export data from pipe") \ - X(SHARED_MEMORY, 10, "failed to open shared memory region") \ - X(SYNC_VARIABLE, 11, "failed to acquire lock within timeout period") + X(EXPORT_SPACE_OVERFLOW, 8, "not enough space for data export") \ + X(SHARED_MEMORY, 9, "failed to open shared memory region") \ + X(SYNC_VARIABLE, 10, "failed to acquire lock within timeout period") #define X(type, num, string) BF_LIB_ERR_KIND_ ##type = num, typedef enum {BEAMFORMER_LIB_ERRORS} BeamformerLibErrorKind; @@ -41,6 +40,9 @@ LIB_FN uint32_t send_data(void *data, uint32_t data_size); LIB_FN uint32_t beamform_data_synchronized(void 
*data, uint32_t data_size, uint32_t output_points[3], float *out_data, int32_t timeout_ms); +/* NOTE: downloads the last 32 frames worth of compute timings into output */ +LIB_FN uint32_t beamformer_compute_timings(BeamformerComputeStatsTable *output, int32_t timeout_ms); + /* NOTE: tells the beamformer to start beamforming and waits until it starts or for timeout_ms */ LIB_FN uint32_t beamformer_start_compute(int32_t timeout_ms); diff --git a/main_linux.c b/main_linux.c @@ -75,7 +75,7 @@ main(void) { BeamformerCtx ctx = {0}; BeamformerInput input = {.executable_reloaded = 1}; - Arena temp_memory = os_alloc_arena((Arena){0}, MB(16)); + Arena temp_memory = os_alloc_arena(MB(16)); ctx.error_stream = stream_alloc(&temp_memory, MB(1)); ctx.ui_backing_store = sub_arena(&temp_memory, MB(2), KB(4)); @@ -88,7 +88,6 @@ main(void) ctx.os.file_watch_context.handle = inotify_init1(IN_NONBLOCK|IN_CLOEXEC); ctx.os.compute_worker.asleep = 1; ctx.os.error_handle = STDERR_FILENO; - ctx.os.export_pipe_name = OS_EXPORT_PIPE_NAME; setup_beamformer(&ctx, &input, &temp_memory); os_wake_waiters(&ctx.os.compute_worker.sync_variable); diff --git a/main_w32.c b/main_w32.c @@ -104,7 +104,7 @@ main(void) { BeamformerCtx ctx = {0}; BeamformerInput input = {.executable_reloaded = 1}; - Arena temp_memory = os_alloc_arena((Arena){0}, MB(16)); + Arena temp_memory = os_alloc_arena(MB(16)); ctx.error_stream = stream_alloc(&temp_memory, MB(1)); ctx.ui_backing_store = sub_arena(&temp_memory, MB(2), KB(4)); @@ -121,7 +121,6 @@ main(void) ctx.os.context = (iptr)&w32_ctx; ctx.os.compute_worker.asleep = 1; ctx.os.error_handle = GetStdHandle(STD_ERROR_HANDLE); - ctx.os.export_pipe_name = OS_EXPORT_PIPE_NAME; setup_beamformer(&ctx, &input, &temp_memory); os_wake_waiters(&ctx.os.compute_worker.sync_variable); diff --git a/os_linux.c b/os_linux.c @@ -4,7 +4,6 @@ * be provided by any platform the beamformer is ported to. 
*/ #define OS_SHARED_MEMORY_NAME "/ogl_beamformer_shared_memory" -#define OS_EXPORT_PIPE_NAME "/tmp/beamformer_output_pipe" #define OS_PATH_SEPARATOR_CHAR '/' #define OS_PATH_SEPARATOR "/" @@ -92,29 +91,12 @@ os_round_up_to_page_size(iz value) function OS_ALLOC_ARENA_FN(os_alloc_arena) { - Arena result = old; - capacity = os_round_up_to_page_size(capacity); - iz old_size = old.end - old.beg; - if (old_size < capacity) { - if (old.beg) munmap(old.beg, old_size); - result.beg = mmap(0, capacity, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); - if (result.beg == MAP_FAILED) - os_fatal(s8("os_alloc_arena: couldn't allocate memory\n")); - result.end = result.beg + capacity; - } - return result; -} - -function OS_CLOSE_FN(os_close) -{ - close(file); -} - -function OS_OPEN_FOR_WRITE_FN(os_open_for_write) -{ - iptr result = open(fname, O_WRONLY|O_TRUNC); - if (result == -1) - result = INVALID_FILE; + Arena result = {0}; + capacity = os_round_up_to_page_size(capacity); + result.beg = mmap(0, capacity, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + if (result.beg == MAP_FAILED) + os_fatal(s8("os_alloc_arena: couldn't allocate memory\n")); + result.end = result.beg + capacity; return result; } @@ -153,17 +135,6 @@ os_file_exists(char *path) return result; } -function OS_READ_FILE_FN(os_read_file) -{ - iz r = 0, total_read = 0; - do { - if (r != -1) - total_read += r; - r = read(file, buf + total_read, size - total_read); - } while (r); - return total_read; -} - function SharedMemoryRegion os_create_shared_memory_area(Arena *arena, char *name, i32 lock_count, iz requested_capacity) { diff --git a/os_win32.c b/os_win32.c @@ -1,7 +1,6 @@ /* See LICENSE for license details. */ #define OS_SHARED_MEMORY_NAME "Local\\ogl_beamformer_parameters" -#define OS_EXPORT_PIPE_NAME "\\\\.\\pipe\\beamformer_output_pipe" #define OS_PATH_SEPARATOR_CHAR '\\' #define OS_PATH_SEPARATOR "\\" @@ -15,7 +14,6 @@ #define PAGE_READWRITE 0x04 #define MEM_COMMIT 0x1000 #define MEM_RESERVE 0x2000 -#define MEM_RELEASE 0x8000 #define GENERIC_WRITE 0x40000000 #define GENERIC_READ 0x80000000 @@ -120,7 +118,6 @@ W32(i32) WakeByAddressAll(void *); W32(iptr) wglGetProcAddress(c8 *); W32(b32) WriteFile(iptr, u8 *, i32, i32 *, void *); W32(void *) VirtualAlloc(u8 *, iz, u32, u32); -W32(b32) VirtualFree(u8 *, iz, u32); #ifdef _DEBUG function void * @@ -197,27 +194,12 @@ os_round_up_to_page_size(iz value) function OS_ALLOC_ARENA_FN(os_alloc_arena) { - Arena result = old; - capacity = os_round_up_to_page_size(capacity); - iz old_size = old.end - old.beg; - if (old_size < capacity) { - if (old.beg) VirtualFree(old.beg, old_size, MEM_RELEASE); - result.beg = VirtualAlloc(0, capacity, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE); - if (!result.beg) - os_fatal(s8("os_alloc_arena: couldn't allocate memory\n")); - result.end = result.beg + capacity; - } - return result; -} - -function OS_CLOSE_FN(os_close) -{ - CloseHandle(file); -} - -function OS_OPEN_FOR_WRITE_FN(os_open_for_write) -{ - iptr result = CreateFileA(fname, GENERIC_WRITE, 0, 0, OPEN_EXISTING, 0, 0); + Arena result = {0}; + capacity = os_round_up_to_page_size(capacity); + result.beg = VirtualAlloc(0, capacity, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE); + if (!result.beg) + os_fatal(s8("os_alloc_arena: couldn't allocate memory\n")); + result.end = result.beg + capacity; return result; } @@ -243,13 +225,6 @@ function OS_READ_WHOLE_FILE_FN(os_read_whole_file) return result; } -function OS_READ_FILE_FN(os_read_file) -{ - i32 total_read = 0; - ReadFile(file, buf, size, 
&total_read, 0); - return total_read; -} - function OS_WRITE_NEW_FILE_FN(os_write_new_file) { b32 result = 0; diff --git a/static.c b/static.c @@ -180,7 +180,7 @@ function FILE_WATCH_CALLBACK_FN(reload_shader_indirect) BeamformerCtx *ctx = src->beamformer_context; BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue); if (work) { - work->type = BW_RELOAD_SHADER; + work->kind = BeamformerWorkKind_ReloadShader, work->shader_reload_context = src; beamform_work_queue_push_commit(ctx->beamform_work_queue); os_wake_waiters(&os->compute_worker.sync_variable); @@ -291,8 +291,8 @@ setup_beamformer(BeamformerCtx *ctx, BeamformerInput *input, Arena *memory) sm->version = BEAMFORMER_SHARED_MEMORY_VERSION; /* NOTE: default compute shader pipeline */ - sm->compute_stages[0] = ComputeShaderKind_Decode; - sm->compute_stages[1] = ComputeShaderKind_DASCompute; + sm->compute_stages[0] = BeamformerShaderKind_Decode; + sm->compute_stages[1] = BeamformerShaderKind_DASCompute; sm->compute_stages_count = 2; if (ctx->gl.vendor_id == GL_VENDOR_NVIDIA @@ -325,15 +325,16 @@ setup_beamformer(BeamformerCtx *ctx, BeamformerInput *input, Arena *memory) ); #undef X + ComputeShaderCtx *cs = &ctx->csctx; #define X(e, sn, f, nh, pretty_name) do if (s8(f).len > 0) { \ ShaderReloadContext *src = push_struct(memory, typeof(*src)); \ src->beamformer_context = ctx; \ if (nh) src->header = compute_parameters_header; \ src->path = s8(static_path_join("shaders", f ".glsl")); \ src->name = src->path; \ - src->shader = ctx->csctx.programs + ShaderKind_##e; \ + src->shader = cs->programs + BeamformerShaderKind_##e; \ src->gl_type = GL_COMPUTE_SHADER; \ - src->kind = ShaderKind_##e; \ + src->kind = BeamformerShaderKind_##e; \ src->link = src; \ os_add_file_watch(&ctx->os, memory, src->path, reload_shader_indirect, (iptr)src); \ reload_shader_indirect(&ctx->os, src->path, (iptr)src, *memory); \ @@ -372,7 +373,7 @@ setup_beamformer(BeamformerCtx *ctx, BeamformerInput *input, Arena *memory) render_2d->path = s8(static_path_join("shaders", "render_2d.frag.glsl")); render_2d->name = s8("shaders/render_2d.glsl"); render_2d->gl_type = GL_FRAGMENT_SHADER; - render_2d->kind = ShaderKind_Render2D; + render_2d->kind = BeamformerShaderKind_Render2D; render_2d->shader = &fvr->shader; render_2d->header = s8("" "layout(location = 0) in vec2 texture_coordinate;\n" diff --git a/tests/throughput.c b/tests/throughput.c @@ -2,6 +2,7 @@ /* TODO(rnp): * [ ]: for finer grained evaluation of throughput latency just queue a data upload * without replacing the data. 
+ * [ ]: bug: we aren't inserting rf data between each frame */ #define LIB_FN function @@ -371,9 +372,9 @@ execute_study(s8 study, Arena arena, Stream path, Options *options) i32 shader_stages[16]; i32 shader_stage_count = 0; - if (options->cuda) shader_stages[shader_stage_count++] = ComputeShaderKind_CudaDecode; - else shader_stages[shader_stage_count++] = ComputeShaderKind_Decode; - shader_stages[shader_stage_count++] = ComputeShaderKind_DASCompute; + if (options->cuda) shader_stages[shader_stage_count++] = BeamformerShaderKind_CudaDecode; + else shader_stages[shader_stage_count++] = BeamformerShaderKind_Decode; + shader_stages[shader_stage_count++] = BeamformerShaderKind_DASCompute; set_beamformer_pipeline(shader_stages, shader_stage_count); @@ -428,7 +429,7 @@ main(i32 argc, char *argv[]) signal(SIGINT, sigint); - Arena arena = os_alloc_arena((Arena){0}, KB(8)); + Arena arena = os_alloc_arena(KB(8)); Stream path = stream_alloc(&arena, KB(4)); stream_append_s8(&path, c_str_to_s8(options.remaining[0])); stream_ensure_termination(&path, OS_PATH_SEPARATOR_CHAR); diff --git a/ui.c b/ui.c @@ -2032,8 +2032,8 @@ draw_compute_progress_bar(BeamformerUI *ui, Arena arena, ComputeProgressBar *sta function v2 draw_compute_stats_view(BeamformerCtx *ctx, Arena arena, Rect r) { - #define X(e, n, s, h, pn) [ComputeShaderKind_##e] = s8_comp(pn ":"), - read_only local_persist s8 labels[ComputeShaderKind_Count] = {COMPUTE_SHADERS}; + #define X(e, n, s, h, pn) [BeamformerShaderKind_##e] = s8_comp(pn ":"), + read_only local_persist s8 labels[BeamformerShaderKind_ComputeCount] = {COMPUTE_SHADERS}; #undef X BeamformerSharedMemory *sm = ctx->shared_memory.region; @@ -2043,14 +2043,14 @@ draw_compute_stats_view(BeamformerCtx *ctx, Arena arena, Rect r) u32 stages = sm->compute_stages_count; TextSpec text_spec = {.font = &ui->font, .colour = FG_COLOUR, .flags = TF_LIMITED}; - static_assert(ShaderKind_Count <= 32, "shader kind bitfield test"); + static_assert(countof(labels) <= 32, "shader kind bitfield test"); u32 seen_shaders = 0; Table *table = table_new(&arena, stages + 2, TextAlignment_Left, TextAlignment_Left, TextAlignment_Left); for (u32 i = 0; i < stages; i++) { TableCell *cells = table_push_row(table, &arena, TRK_CELLS)->data; Stream sb = arena_stream(arena); - ShaderKind index = (ShaderKind)sm->compute_stages[i]; + BeamformerShaderKind index = sm->compute_stages[i]; if ((seen_shaders & (1 << index)) == 0) { compute_time_sum += stats->average_times[index]; stream_append_f64_e(&sb, stats->average_times[index]); @@ -3019,8 +3019,7 @@ draw_ui(BeamformerCtx *ctx, BeamformerInput *input, BeamformFrame *frame_to_draw b32 dispatch = ctx->os.shared_memory_region_lock(&ctx->shared_memory, sm->locks, BeamformerSharedMemoryLockKind_DispatchCompute, 0); - sm->start_compute_from_main |= dispatch & - ctx->beamform_frames[ctx->display_frame_index].ready_to_present; + sm->start_compute_from_main |= dispatch & beamformer_get_newest_frame(ctx, 0)->ready_to_present; ctx->os.shared_memory_region_unlock(&ctx->shared_memory, sm->locks, lock); } } diff --git a/util.h b/util.h @@ -43,6 +43,7 @@ #define INVALID_CODE_PATH ASSERT(0) #define INVALID_DEFAULT_CASE default: ASSERT(0); break +#define InvalidCodePath assert(0) #define InvalidDefaultCase default: assert(0); break #define arg_list(type, ...) 
(type []){__VA_ARGS__}, sizeof((type []){__VA_ARGS__}) / sizeof(type) @@ -207,18 +208,14 @@ typedef union { .size = {.x = -F32_INFINITY, .y = -F32_INFINITY}} typedef struct { - iptr file; - char *name; -} Pipe; -#define INVALID_FILE (-1) - -typedef struct { u8 *data; u32 widx; u32 cap; b32 errors; } Stream; +#define INVALID_FILE (-1) + typedef struct OS OS; typedef struct { @@ -263,7 +260,7 @@ typedef struct { iptr os_context; } SharedMemoryRegion; -#define OS_ALLOC_ARENA_FN(name) Arena name(Arena old, iz capacity) +#define OS_ALLOC_ARENA_FN(name) Arena name(iz capacity) typedef OS_ALLOC_ARENA_FN(os_alloc_arena_fn); #define OS_ADD_FILE_WATCH_FN(name) void name(OS *os, Arena *a, s8 path, \ @@ -273,18 +270,9 @@ typedef OS_ADD_FILE_WATCH_FN(os_add_file_watch_fn); #define OS_WAKE_WORKER_FN(name) void name(GLWorkerThreadContext *ctx) typedef OS_WAKE_WORKER_FN(os_wake_worker_fn); -#define OS_CLOSE_FN(name) void name(iptr file) -typedef OS_CLOSE_FN(os_close_fn); - -#define OS_OPEN_FOR_WRITE_FN(name) iptr name(c8 *fname) -typedef OS_OPEN_FOR_WRITE_FN(os_open_for_write_fn); - #define OS_READ_WHOLE_FILE_FN(name) s8 name(Arena *arena, char *file) typedef OS_READ_WHOLE_FILE_FN(os_read_whole_file_fn); -#define OS_READ_FILE_FN(name) iz name(iptr file, void *buf, iz size) -typedef OS_READ_FILE_FN(os_read_file_fn); - #define OS_WAIT_ON_VALUE_FN(name) b32 name(i32 *value, i32 current, u32 timeout_ms) typedef OS_WAIT_ON_VALUE_FN(os_wait_on_value_fn); @@ -308,10 +296,6 @@ typedef OS_SHARED_MEMORY_UNLOCK_REGION_FN(os_shared_memory_region_unlock_fn); #define OS_FNS \ X(add_file_watch) \ - X(alloc_arena) \ - X(close) \ - X(open_for_write) \ - X(read_file) \ X(read_whole_file) \ X(shared_memory_region_lock) \ X(shared_memory_region_unlock) \ @@ -342,8 +326,6 @@ struct OS { iptr error_handle; GLWorkerThreadContext compute_worker; - char *export_pipe_name; - DEBUG_DECL(renderdoc_start_frame_capture_fn *start_frame_capture;) DEBUG_DECL(renderdoc_end_frame_capture_fn *end_frame_capture;) };