ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git

Commit: 67bd1deee3a5fa0454f0ac8d6361492c2fa9daea
Parent: 59f09bb2efea630c3fea3d4a8d246d6dcec92b72
Author: Randy Palamar
Date:   Fri,  8 Nov 2024 14:58:30 -0700

restructure beamform submissions into work queue

This moves the program towards a state where we can export
arbitrary beamformed frames.
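
In outline, beamform submissions now flow through a singly linked FIFO with an
intrusive free list, and compute-type work is rate limited to one item per
drawn frame. A condensed sketch of the new types and the pop path, distilled
from the diff below (u32/b32 and the BW_* work types come from the project
headers; the per-type payload union is elided):

	typedef struct BeamformWork {
		struct BeamformWork *next;
		/* union of per-type payloads (compute_ctx, ssbo_copy_ctx, ...) elided */
		u32 type;
	} BeamformWork;

	typedef struct {
		BeamformWork *first;     /* FIFO head */
		BeamformWork *last;      /* FIFO tail */
		BeamformWork *next_free; /* free list of retired work items */
		b32 did_compute_this_frame;
	} BeamformWorkQueue;

	static BeamformWork *
	beamform_work_queue_pop(BeamformWorkQueue *q)
	{
		BeamformWork *result = q->first;
		if (result) {
			q->first = result->next;
			if (result == q->last)
				q->last = 0;
			switch (result->type) {
			case BW_FULL_COMPUTE:
			case BW_RECOMPUTE:
			case BW_PARTIAL_COMPUTE:
				/* NOTE: only one compute is allowed per frame */
				if (q->did_compute_this_frame) result = 0;
				else                           q->did_compute_this_frame = 1;
				break;
			}
		}
		return result;
	}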

Diffstat:
Mbeamformer.c | 603++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
Mbeamformer.h | 93+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
Mbeamformer_parameters.h | 18++++++++++++------
Mmain_generic.c | 2+-
Mstatic.c | 28+++++++++-------------------
Mui.c | 26+++++++++++++++-----------
Mutil.c | 4++++
Mutil.h | 20++++++++++++++++++++
8 files changed, 486 insertions(+), 308 deletions(-)
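
One convention introduced below is worth noting before reading the diff: a
BeamformFrame carries one texture per transducer array plus one extra to sum
into, so the final output always lives in textures[dim.w - 1], and when frame
averaging is requested (output_points.w > 1) display switches to the dedicated
averaged_frame. The texture selection in the render pass reduces to roughly
the following (paraphrased from the new code in beamformer.c, not a verbatim
excerpt):

	u32 out_texture = 0;
	if (bp->output_points.w > 1) {
		/* frame averaging enabled: show the running average */
		out_texture = ctx->averaged_frame.textures[0];
	} else {
		BeamformFrame *f = ctx->beamform_frames + ctx->displayed_frame_index;
		/* dim.w == 0 means nothing has been beamformed yet */
		if (f->dim.w)
			out_texture = f->textures[f->dim.w - 1];
	}
	glBindTextureUnit(0, out_texture);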

diff --git a/beamformer.c b/beamformer.c @@ -3,8 +3,6 @@ static f32 dt_for_frame; -#include "ui.c" - static size decoded_data_size(ComputeShaderCtx *cs) { @@ -14,63 +12,57 @@ decoded_data_size(ComputeShaderCtx *cs) } static void -alloc_output_image(BeamformerCtx *ctx, Arena a) +alloc_beamform_frame(GLParams *gp, BeamformFrame *out, uv4 out_dim, u32 frame_index, s8 name) { - BeamformerParameters *bp = &ctx->params->raw; - ComputeShaderCtx *cs = &ctx->csctx; + glDeleteTextures(out->dim.w, out->textures); - u32 max_3d_dim = ctx->gl.max_3d_texture_dim; - ctx->out_data_dim.x = CLAMP(round_down_power_of_2(ORONE(bp->output_points.x)), 1, max_3d_dim); - ctx->out_data_dim.y = CLAMP(round_down_power_of_2(ORONE(bp->output_points.y)), 1, max_3d_dim); - ctx->out_data_dim.z = CLAMP(round_down_power_of_2(ORONE(bp->output_points.z)), 1, max_3d_dim); - ctx->out_data_dim.w = CLAMP(bp->output_points.w, 1, ARRAY_COUNT(cs->sum_textures)); - bp->output_points = ctx->out_data_dim; + out->dim.x = CLAMP(round_down_power_of_2(ORONE(out_dim.x)), 1, gp->max_3d_texture_dim); + out->dim.y = CLAMP(round_down_power_of_2(ORONE(out_dim.y)), 1, gp->max_3d_texture_dim); + out->dim.z = CLAMP(round_down_power_of_2(ORONE(out_dim.z)), 1, gp->max_3d_texture_dim); + out->dim.w = CLAMP(out_dim.w, 0, MAX_MULTI_XDC_COUNT); /* NOTE: allocate storage for beamformed output data; * this is shared between compute and fragment shaders */ - uv4 odim = ctx->out_data_dim; - u32 max_dim = MAX(odim.x, MAX(odim.y, odim.z)); - ctx->out_texture_mips = _tzcnt_u32(max_dim) + 1; - - glActiveTexture(GL_TEXTURE0); - glDeleteTextures(1, &ctx->out_texture); - glGenTextures(1, &ctx->out_texture); - glBindTexture(GL_TEXTURE_3D, ctx->out_texture); - glTexStorage3D(GL_TEXTURE_3D, ctx->out_texture_mips, GL_RG32F, odim.x, odim.y, odim.z); - LABEL_GL_OBJECT(GL_TEXTURE, ctx->out_texture, s8("Beamformed_Data_Texture")); - - Stream label = stream_alloc(&a, 256); - stream_append_s8(&label, s8("Sum_Texture_")); + u32 max_dim = MAX(out->dim.x, MAX(out->dim.y, out->dim.z)); + out->mips = _tzcnt_u32(max_dim) + 1; + + u8 buf[256]; + Stream label = {.data = buf, .cap = ARRAY_COUNT(buf)}; + stream_append_s8(&label, name); + stream_append_byte(&label, '['); + stream_append_u64(&label, frame_index); + stream_append_s8(&label, s8("][")); u32 sidx = label.widx; - glDeleteTextures(ARRAY_COUNT(cs->sum_textures), cs->sum_textures); - if (odim.w > 1) { - glGenTextures(odim.w, cs->sum_textures); - for (u32 i = 0; i < odim.w; i++) { - glBindTexture(GL_TEXTURE_3D, cs->sum_textures[i]); - glTexStorage3D(GL_TEXTURE_3D, ctx->out_texture_mips, GL_RG32F, odim.x, odim.y, odim.z); - stream_append_u64(&label, i); - s8 slabel = stream_to_s8(&label); - LABEL_GL_OBJECT(GL_TEXTURE, cs->sum_textures[i], slabel); - label.widx = sidx; - } - } - bp->xdc_count = CLAMP(bp->xdc_count, 1, ARRAY_COUNT(cs->array_textures)); - glDeleteTextures(ARRAY_COUNT(cs->array_textures), cs->array_textures); - glGenTextures(bp->xdc_count, cs->array_textures); - for (u32 i = 0; i < bp->xdc_count; i++) { - glBindTexture(GL_TEXTURE_3D, cs->array_textures[i]); - glTexStorage3D(GL_TEXTURE_3D, ctx->out_texture_mips, GL_RG32F, odim.x, odim.y, odim.z); + glCreateTextures(GL_TEXTURE_3D, out->dim.w, out->textures); + for (u32 i = 0; i < out->dim.w; i++) { + glTextureStorage3D(out->textures[i], out->mips, GL_RG32F, + out->dim.x, out->dim.y, out->dim.z); + stream_append_u64(&label, i); + stream_append_byte(&label, ']'); + LABEL_GL_OBJECT(GL_TEXTURE, out->textures[i], stream_to_s8(&label)); + label.widx = sidx; } +} - 
UnloadRenderTexture(ctx->fsctx.output); - /* TODO: select odim.x vs odim.y */ - ctx->fsctx.output = LoadRenderTexture(odim.x, odim.z); - LABEL_GL_OBJECT(GL_FRAMEBUFFER, ctx->fsctx.output.id, s8("Rendered_View")); - GenTextureMipmaps(&ctx->fsctx.output.texture); - //SetTextureFilter(ctx->fsctx.output.texture, TEXTURE_FILTER_ANISOTROPIC_8X); - //SetTextureFilter(ctx->fsctx.output.texture, TEXTURE_FILTER_TRILINEAR); - SetTextureFilter(ctx->fsctx.output.texture, TEXTURE_FILTER_BILINEAR); +static void +alloc_output_image(BeamformerCtx *ctx, uv4 output_dim) +{ + uv4 try_dim = {.xyz = output_dim.xyz}; + if (!uv4_equal(try_dim, ctx->averaged_frame.dim)) { + alloc_beamform_frame(&ctx->gl, &ctx->averaged_frame, try_dim, 0, + s8("Beamformed_Averaged_Data")); + uv4 odim = ctx->averaged_frame.dim; + + UnloadRenderTexture(ctx->fsctx.output); + /* TODO: select odim.x vs odim.y */ + ctx->fsctx.output = LoadRenderTexture(odim.x, odim.z); + LABEL_GL_OBJECT(GL_FRAMEBUFFER, ctx->fsctx.output.id, s8("Rendered_View")); + GenTextureMipmaps(&ctx->fsctx.output.texture); + //SetTextureFilter(ctx->fsctx.output.texture, TEXTURE_FILTER_ANISOTROPIC_8X); + //SetTextureFilter(ctx->fsctx.output.texture, TEXTURE_FILTER_TRILINEAR); + SetTextureFilter(ctx->fsctx.output.texture, TEXTURE_FILTER_BILINEAR); + } } static void @@ -145,6 +137,95 @@ alloc_shader_storage(BeamformerCtx *ctx, Arena a) LABEL_GL_OBJECT(GL_BUFFER, cs->hadamard_ssbo, s8("Hadamard_SSBO")); } +static BeamformWork * +beamform_work_queue_pop(BeamformWorkQueue *q) +{ + BeamformWork *result = q->first; + if (result) { + q->first = result->next; + if (result == q->last) { + ASSERT(result->next == 0); + q->last = 0; + } + + switch (result->type) { + case BW_FULL_COMPUTE: + case BW_RECOMPUTE: + case BW_PARTIAL_COMPUTE: + /* NOTE: only one compute is allowed per frame */ + if (q->did_compute_this_frame) + result = 0; + else + q->did_compute_this_frame = 1; + break; + } + } + return result; +} + +static BeamformWork * +beamform_work_queue_push(BeamformerCtx *ctx, Arena *a, enum beamform_work work_type) +{ + BeamformWorkQueue *q = &ctx->beamform_work_queue; + ComputeShaderCtx *cs = &ctx->csctx; + + BeamformWork *result = q->next_free; + if (result) q->next_free = result->next; + else result = alloc(a, typeof(*result), 1); + + if (result) { + result->type = work_type; + result->next = 0; + + switch (work_type) { + case BW_FULL_COMPUTE: + /* TODO: limiting to make sure we don't have too many of these in the queue */ + cs->raw_data_index++; + if (cs->raw_data_index >= ARRAY_COUNT(cs->raw_data_fences)) + cs->raw_data_index = 0; + /* FALLTHROUGH */ + case BW_RECOMPUTE: { + i32 raw_index = cs->raw_data_index; + result->compute_ctx.raw_data_ssbo_index = raw_index; + /* NOTE: if this times out it means the command queue is more than 3 + * frames behind. 
In that case we need to re-evaluate the buffer size */ + if (cs->raw_data_fences[raw_index]) { + i32 result = glClientWaitSync(cs->raw_data_fences[raw_index], 0, + 10000); + if (result == GL_TIMEOUT_EXPIRED) { + //ASSERT(0); + } + glDeleteSync(cs->raw_data_fences[raw_index]); + cs->raw_data_fences[raw_index] = NULL; + } + ctx->displayed_frame_index++; + if (ctx->displayed_frame_index >= ARRAY_COUNT(ctx->beamform_frames)) + ctx->displayed_frame_index = 0; + result->compute_ctx.frame = ctx->beamform_frames + ctx->displayed_frame_index; + result->compute_ctx.first_pass = 1; + + uv4 try_dim = ctx->params->raw.output_points; + try_dim.w = ctx->params->raw.xdc_count; + if (!uv4_equal(result->compute_ctx.frame->dim, try_dim)) { + alloc_beamform_frame(&ctx->gl, result->compute_ctx.frame, try_dim, + ctx->displayed_frame_index, + s8("Beamformed_Data")); + } + } break; + case BW_PARTIAL_COMPUTE: + case BW_SAVE_FRAME: + case BW_SEND_FRAME: + case BW_SSBO_COPY: + break; + } + + if (q->last) q->last = q->last->next = result; + else q->last = q->first = result; + } + + return result; +} + static m3 v3_to_xdc_space(v3 direction, v3 origin, v3 corner1, v3 corner2) { @@ -173,65 +254,84 @@ f32_4_to_v4(f32 *in) return result; } +static void +do_sum_shader(ComputeShaderCtx *cs, u32 *in_textures, u32 in_texture_count, f32 in_scale, + u32 out_texture, uv4 out_data_dim) +{ + /* NOTE: zero output before summing */ + glClearTexImage(out_texture, 0, GL_RED, GL_FLOAT, 0); + + glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F); + glUniform1f(cs->sum_prescale_id, in_scale); + for (u32 i = 0; i < in_texture_count; i++) { + glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F); + glDispatchCompute(ORONE(out_data_dim.x / 32), + ORONE(out_data_dim.y), + ORONE(out_data_dim.z / 32)); + glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); + } +} + +static void +do_beamform_shader(ComputeShaderCtx *cs, BeamformerParameters *bp, BeamformFrame *frame, + u32 rf_ssbo, iv3 compute_dim_offset, i32 compute_pass) +{ + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, rf_ssbo); + glUniform3iv(cs->volume_export_dim_offset_id, 1, compute_dim_offset.E); + glUniform1i(cs->volume_export_pass_id, compute_pass); + + for (u32 i = 0; i < frame->dim.w; i++) { + u32 texture = frame->textures[i]; + m3 xdc_transform = v3_to_xdc_space((v3){.z = 1}, + f32_4_to_v4(bp->xdc_origin + (4 * i)).xyz, + f32_4_to_v4(bp->xdc_corner1 + (4 * i)).xyz, + f32_4_to_v4(bp->xdc_corner2 + (4 * i)).xyz); + glBindImageTexture(0, texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F); + glUniform1i(cs->xdc_index_id, i); + glUniformMatrix3fv(cs->xdc_transform_id, 1, GL_FALSE, xdc_transform.E); + glDispatchCompute(ORONE(frame->dim.x / 32), frame->dim.y, + ORONE(frame->dim.z / 32)); + } +} + static b32 -do_volume_computation_step(BeamformerCtx *ctx, enum compute_shaders shader) +do_partial_compute_step(BeamformerCtx *ctx, BeamformFrame *frame) { - ComputeShaderCtx *cs = &ctx->csctx; - ExportCtx *e = &ctx->export_ctx; + ComputeShaderCtx *cs = &ctx->csctx; + PartialComputeCtx *pc = &ctx->partial_compute_ctx; b32 done = 0; /* NOTE: we start this elsewhere on the first dispatch so that we can include * times such as decoding/demodulation/etc. 
*/ - if (!(e->state & ES_TIMER_ACTIVE)) { - glQueryCounter(e->timer_ids[0], GL_TIMESTAMP); - e->state |= ES_TIMER_ACTIVE; + if (!(pc->state & PCS_TIMER_ACTIVE)) { + glQueryCounter(pc->timer_ids[0], GL_TIMESTAMP); + pc->state |= PCS_TIMER_ACTIVE; } - glUseProgram(cs->programs[shader]); + glUseProgram(cs->programs[pc->shader]); glBindBufferBase(GL_UNIFORM_BUFFER, 0, cs->shared_ubo); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, e->rf_data_ssbo); - - glBindImageTexture(0, e->volume_texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_R32F); - glUniform1i(cs->volume_export_pass_id, 1); /* NOTE: We must tile this otherwise GL will kill us for taking too long */ /* TODO: this could be based on multiple dimensions */ - u32 dispatch_count = e->volume_dim.z / 32; - uv4 dim_offset = {.z = !!dispatch_count * 32 * e->dispatch_index++}; - glUniform3iv(cs->volume_export_dim_offset_id, 1, (i32 *)dim_offset.E); - glDispatchCompute(ORONE(e->volume_dim.x / 32), e->volume_dim.y, 1); - if (e->dispatch_index >= dispatch_count) { - e->dispatch_index = 0; - e->state &= ~ES_COMPUTING; - done = 1; + i32 dispatch_count = frame->dim.z / 32; + iv3 dim_offset = {.z = !!dispatch_count * 32 * pc->dispatch_index++}; + do_beamform_shader(cs, &ctx->params->raw, frame, pc->rf_data_ssbo, dim_offset, 1); + + if (pc->dispatch_index >= dispatch_count) { + pc->dispatch_index = 0; + pc->state &= ~PCS_COMPUTING; + done = 1; } - glQueryCounter(e->timer_ids[1], GL_TIMESTAMP); + glQueryCounter(pc->timer_ids[1], GL_TIMESTAMP); return done; } static void -do_sum_shader(ComputeShaderCtx *cs, u32 *in_textures, u32 in_texture_count, f32 in_scale, - u32 out_texture, uv4 out_data_dim) -{ - /* NOTE: zero output before summing */ - glClearTexImage(out_texture, 0, GL_RED, GL_FLOAT, 0); - - glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F); - glUniform1f(cs->sum_prescale_id, in_scale); - for (u32 i = 0; i < in_texture_count; i++) { - glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F); - glDispatchCompute(ORONE(out_data_dim.x / 32), - ORONE(out_data_dim.y), - ORONE(out_data_dim.z / 32)); - glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); - } -} - -static void -do_compute_shader(BeamformerCtx *ctx, enum compute_shaders shader) +do_compute_shader(BeamformerCtx *ctx, BeamformFrame *frame, u32 raw_data_index, + enum compute_shaders shader) { ComputeShaderCtx *csctx = &ctx->csctx; uv2 rf_raw_dim = ctx->params->raw.rf_raw_dim; @@ -244,22 +344,23 @@ do_compute_shader(BeamformerCtx *ctx, enum compute_shaders shader) u32 output_ssbo_idx = !csctx->last_output_ssbo_index; u32 input_ssbo_idx = csctx->last_output_ssbo_index; + switch (shader) { case CS_HADAMARD: glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo, - csctx->raw_data_index * rf_raw_size, rf_raw_size); + raw_data_index * rf_raw_size, rf_raw_size); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, csctx->hadamard_ssbo); glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32), ORONE(csctx->dec_data_dim.y / 32), ORONE(csctx->dec_data_dim.z)); - csctx->raw_data_fences[csctx->raw_data_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + csctx->raw_data_fences[raw_data_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index; break; case CS_CUDA_DECODE: - ctx->cuda_lib.cuda_decode(csctx->raw_data_index * rf_raw_size, output_ssbo_idx); - csctx->raw_data_fences[csctx->raw_data_index] = 
glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + ctx->cuda_lib.cuda_decode(raw_data_index * rf_raw_size, output_ssbo_idx); + csctx->raw_data_fences[raw_data_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index; break; case CS_CUDA_HILBERT: @@ -275,76 +376,42 @@ do_compute_shader(BeamformerCtx *ctx, enum compute_shaders shader) csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index; break; case CS_MIN_MAX: { - u32 texture = ctx->out_texture; - for (u32 i = 1; i < ctx->out_texture_mips; i++) { + u32 texture = frame->textures[frame->dim.w - 1]; + for (u32 i = 1; i < frame->mips; i++) { glBindImageTexture(0, texture, i - 1, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F); glBindImageTexture(1, texture, i - 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F); glUniform1i(csctx->mips_level_id, i); - u32 width = ctx->out_data_dim.x >> i; - u32 height = ctx->out_data_dim.y >> i; - u32 depth = ctx->out_data_dim.z >> i; + u32 width = frame->dim.x >> i; + u32 height = frame->dim.y >> i; + u32 depth = frame->dim.z >> i; glDispatchCompute(ORONE(width / 32), ORONE(height), ORONE(depth / 32)); glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); } } break; case CS_HERCULES: case CS_UFORCES: { - if (ctx->export_ctx.state & ES_START) { - /* NOTE: on the first frame of compute make a copy of the rf data */ - size rf_size = decoded_data_size(csctx); - ctx->export_ctx.state &= ~ES_START; - ctx->export_ctx.state |= ES_COMPUTING; - glCopyNamedBufferSubData(csctx->rf_data_ssbos[input_ssbo_idx], - ctx->export_ctx.rf_data_ssbo, 0, 0, rf_size); - } - - BeamformerParameters *bp = &ctx->params->raw; - - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]); - glUniform3iv(csctx->volume_export_dim_offset_id, 1, (i32 []){0, 0, 0}); - glUniform1i(csctx->volume_export_pass_id, 0); - - for (u32 i = 0; i < bp->xdc_count; i++) { - u32 texture; - if (bp->xdc_count == 1) { - if (ctx->out_data_dim.w > 1) { - texture = csctx->sum_textures[csctx->sum_texture_index]; - } else { - texture = ctx->out_texture; - } - } else { - texture = csctx->array_textures[i]; - } - - m3 xdc_transform = v3_to_xdc_space((v3){.z = 1}, - f32_4_to_v4(bp->xdc_origin + (4 * i)).xyz, - f32_4_to_v4(bp->xdc_corner1 + (4 * i)).xyz, - f32_4_to_v4(bp->xdc_corner2 + (4 * i)).xyz); - glBindImageTexture(0, texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F); - glUniform1i(csctx->xdc_index_id, i); - glUniformMatrix3fv(csctx->xdc_transform_id, 1, GL_FALSE, xdc_transform.E); - glDispatchCompute(ORONE(ctx->out_data_dim.x / 32), - ctx->out_data_dim.y, - ORONE(ctx->out_data_dim.z / 32)); - } - if (bp->xdc_count > 1) { + u32 rf_ssbo = csctx->rf_data_ssbos[input_ssbo_idx]; + do_beamform_shader(csctx, &ctx->params->raw, frame, rf_ssbo, (iv3){0}, 0); + if (frame->dim.w > 1) { glUseProgram(csctx->programs[CS_SUM]); glBindBufferBase(GL_UNIFORM_BUFFER, 0, csctx->shared_ubo); - u32 out; - if (ctx->out_data_dim.w > 1) out = csctx->sum_textures[csctx->sum_texture_index]; - else out = ctx->out_texture; - do_sum_shader(csctx, csctx->array_textures, bp->xdc_count, - 1 / (f32)bp->xdc_count, out, ctx->out_data_dim); + u32 input_texture_count = frame->dim.w - 1; + do_sum_shader(csctx, frame->textures, input_texture_count, + 1 / (f32)input_texture_count, frame->textures[frame->dim.w - 1], + frame->dim); } } break; case CS_SUM: { - u32 frame_count = ctx->out_data_dim.w; - if (frame_count > 1) { - do_sum_shader(csctx, csctx->sum_textures, frame_count, 1 / (f32)frame_count, - ctx->out_texture, 
ctx->out_data_dim); - csctx->sum_texture_index = (csctx->sum_texture_index + 1) % frame_count; + u32 frame_count = ctx->params->raw.output_points.w; + u32 in_textures[MAX_BEAMFORMED_SAVED_FRAMES]; + for (u32 i = 0; i < frame_count; i++) { + u32 idx = (ctx->displayed_frame_index - i) % ARRAY_COUNT(ctx->beamform_frames); + BeamformFrame *frame = ctx->beamform_frames + idx; + in_textures[i] = frame->textures[frame->dim.w - 1]; } + do_sum_shader(csctx, in_textures, frame_count, 1 / (f32)frame_count, + ctx->averaged_frame.textures[0], ctx->averaged_frame.dim); } break; default: ASSERT(0); } @@ -353,16 +420,101 @@ do_compute_shader(BeamformerCtx *ctx, enum compute_shaders shader) } static void -check_compute_timers(ComputeShaderCtx *cs, ExportCtx *e, BeamformerParametersFull *bp) +do_beamform_work(BeamformerCtx *ctx, Arena *a) +{ + BeamformerParameters *bp = &ctx->params->raw; + BeamformWorkQueue *q = &ctx->beamform_work_queue; + BeamformWork *work = beamform_work_queue_pop(q); + ComputeShaderCtx *cs = &ctx->csctx; + + while (work) { + switch (work->type) { + case BW_PARTIAL_COMPUTE: { + BeamformFrame *frame = work->compute_ctx.frame; + + if (work->compute_ctx.first_pass) { + if (ctx->params->upload) { + glNamedBufferSubData(cs->shared_ubo, 0, sizeof(*bp), bp); + ctx->params->upload = 0; + } + + /* TODO: maybe we should have some concept of compute shader + * groups, then we could define a group that does the decoding + * and filtering and apply that group directly here. For now + * we will do this dumb thing */ + u32 stage_count = ctx->params->compute_stages_count; + enum compute_shaders *stages = ctx->params->compute_stages; + for (u32 i = 0; i < stage_count; i++) { + if (stages[i] == CS_UFORCES || stages[i] == CS_HERCULES) { + /* TODO: this is not a proper solution if we have + * more beamforming shaders */ + ctx->partial_compute_ctx.shader = stages[i]; + break; + } + do_compute_shader(ctx, frame, + work->compute_ctx.raw_data_ssbo_index, + stages[i]); + } + u32 output_ssbo = ctx->partial_compute_ctx.rf_data_ssbo; + u32 input_ssbo = cs->last_output_ssbo_index; + size rf_size = decoded_data_size(cs); + glCopyNamedBufferSubData(cs->rf_data_ssbos[input_ssbo], + output_ssbo, 0, 0, rf_size); + } + + b32 done = do_partial_compute_step(ctx, frame); + if (!done) { + BeamformWork *new; + /* NOTE: this push must not fail */ + new = beamform_work_queue_push(ctx, a, BW_PARTIAL_COMPUTE); + new->compute_ctx.first_pass = 0; + } + } break; + case BW_FULL_COMPUTE: + case BW_RECOMPUTE: { + BeamformFrame *frame = work->compute_ctx.frame; + + if (work->compute_ctx.first_pass) { + if (ctx->params->upload) { + glNamedBufferSubData(cs->shared_ubo, 0, sizeof(*bp), bp); + ctx->params->upload = 0; + } + } + + u32 stage_count = ctx->params->compute_stages_count; + enum compute_shaders *stages = ctx->params->compute_stages; + for (u32 i = 0; i < stage_count; i++) + do_compute_shader(ctx, frame, work->compute_ctx.raw_data_ssbo_index, + stages[i]); + ctx->flags |= GEN_MIPMAPS; + } break; + } + + + work->next = q->next_free; + q->next_free = work; + work = beamform_work_queue_pop(q); + } + + if (q->did_compute_this_frame) { + u32 tidx = ctx->csctx.timer_index; + glDeleteSync(ctx->csctx.timer_fences[tidx]); + ctx->csctx.timer_fences[tidx] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + ctx->csctx.timer_index = (tidx + 1) % ARRAY_COUNT(ctx->csctx.timer_fences); + } +} + +static void +check_compute_timers(ComputeShaderCtx *cs, PartialComputeCtx *pc, BeamformerParametersFull *bp) { /* NOTE: volume generation running timer */ 
- if (e->state & ES_TIMER_ACTIVE) { + if (pc->state & PCS_TIMER_ACTIVE) { u64 start_ns = 0, end_ns = 0; - glGetQueryObjectui64v(e->timer_ids[0], GL_QUERY_RESULT, &start_ns); - glGetQueryObjectui64v(e->timer_ids[1], GL_QUERY_RESULT, &end_ns); + glGetQueryObjectui64v(pc->timer_ids[0], GL_QUERY_RESULT, &start_ns); + glGetQueryObjectui64v(pc->timer_ids[1], GL_QUERY_RESULT, &end_ns); u64 elapsed_ns = end_ns - start_ns; - e->runtime += (f32)elapsed_ns * 1e-9; - e->state &= ~ES_TIMER_ACTIVE; + pc->runtime += (f32)elapsed_ns * 1e-9; + pc->state &= ~PCS_TIMER_ACTIVE; } /* NOTE: main timers for display portion of the program */ @@ -384,8 +536,10 @@ check_compute_timers(ComputeShaderCtx *cs, ExportCtx *e, BeamformerParametersFul } } +#include "ui.c" + DEBUG_EXPORT void -do_beamformer(BeamformerCtx *ctx, Arena arena) +do_beamformer(BeamformerCtx *ctx, Arena *arena) { dt_for_frame = GetFrameTime(); @@ -395,108 +549,53 @@ do_beamformer(BeamformerCtx *ctx, Arena arena) } /* NOTE: Store the compute time for the last frame. */ - check_compute_timers(&ctx->csctx, &ctx->export_ctx, ctx->params); + check_compute_timers(&ctx->csctx, &ctx->partial_compute_ctx, ctx->params); BeamformerParameters *bp = &ctx->params->raw; /* NOTE: Check for and Load RF Data into GPU */ if (ctx->platform.poll_pipe(ctx->data_pipe)) { - ComputeShaderCtx *cs = &ctx->csctx; - if (!uv4_equal(cs->dec_data_dim, bp->dec_data_dim)) - alloc_shader_storage(ctx, arena); - - if (!uv4_equal(ctx->out_data_dim, bp->output_points)) - alloc_output_image(ctx, arena); - - cs->raw_data_index = (cs->raw_data_index + 1) % ARRAY_COUNT(cs->raw_data_fences); - i32 raw_index = ctx->csctx.raw_data_index; - /* NOTE: if this times out it means the command queue is more than 3 frames behind. - * In that case we need to re-evaluate the buffer size */ - if (ctx->csctx.raw_data_fences[raw_index]) { - i32 result = glClientWaitSync(cs->raw_data_fences[raw_index], 0, 10000); - if (result == GL_TIMEOUT_EXPIRED) { - //ASSERT(0); + BeamformWork *work = beamform_work_queue_push(ctx, arena, BW_FULL_COMPUTE); + /* NOTE: we can only read in the new data if we get back a work item. 
+ * otherwise we have too many frames in flight and should wait until the + * next frame to try again */ + if (work) { + ComputeShaderCtx *cs = &ctx->csctx; + if (!uv4_equal(cs->dec_data_dim, bp->dec_data_dim)) { + alloc_shader_storage(ctx, *arena); + /* TODO: we may need to invalidate all queue items here */ } - glDeleteSync(cs->raw_data_fences[raw_index]); - cs->raw_data_fences[raw_index] = NULL; - } - uv2 rf_raw_dim = cs->rf_raw_dim; - size rf_raw_size = rf_raw_dim.x * rf_raw_dim.y * sizeof(i16); - - void *rf_data_buf = cs->raw_data_arena.beg + raw_index * rf_raw_size; - size rlen = ctx->platform.read_pipe(ctx->data_pipe, rf_data_buf, rf_raw_size); - if (rlen != rf_raw_size) { - ctx->partial_transfer_count++; - } else { - ctx->flags |= DO_COMPUTE; - switch (ctx->gl.vendor_id) { - case GL_VENDOR_INTEL: - /* TODO: intel complains about this buffer being busy even with - * MAP_UNSYNCHRONIZED_BIT */ - case GL_VENDOR_AMD: - break; - case GL_VENDOR_NVIDIA: - glNamedBufferSubData(cs->raw_data_ssbo, raw_index * rf_raw_size, - rf_raw_size, rf_data_buf); + u32 raw_index = work->compute_ctx.raw_data_ssbo_index; + uv2 rf_raw_dim = cs->rf_raw_dim; + size rf_raw_size = rf_raw_dim.x * rf_raw_dim.y * sizeof(i16); + void *rf_data_buf = cs->raw_data_arena.beg + raw_index * rf_raw_size; + + alloc_output_image(ctx, bp->output_points); + + size rlen = ctx->platform.read_pipe(ctx->data_pipe, rf_data_buf, rf_raw_size); + if (rlen != rf_raw_size) { + stream_append_s8(&ctx->error_stream, s8("Partial Read Occurred: ")); + stream_append_i64(&ctx->error_stream, rlen); + stream_append_byte(&ctx->error_stream, '/'); + stream_append_i64(&ctx->error_stream, rf_raw_size); + stream_append_s8(&ctx->error_stream, s8("\n\0")); + TraceLog(LOG_WARNING, (c8 *)stream_to_s8(&ctx->error_stream).data); + ctx->error_stream.widx = 0; + } else { + switch (ctx->gl.vendor_id) { + case GL_VENDOR_INTEL: + case GL_VENDOR_AMD: + break; + case GL_VENDOR_NVIDIA: + glNamedBufferSubData(cs->raw_data_ssbo, raw_index * rlen, + rlen, rf_data_buf); + } } } } - /* NOTE: we are starting a volume computation on this frame so make some space */ - if (ctx->export_ctx.state & ES_START) { - ExportCtx *e = &ctx->export_ctx; - e->runtime = 0; - uv4 edim = e->volume_dim; - - /* NOTE: get a timestamp here which will include decoding/demodulating/etc. 
*/ - glQueryCounter(e->timer_ids[0], GL_TIMESTAMP); - e->state |= ES_TIMER_ACTIVE; - - glDeleteTextures(1, &e->volume_texture); - glCreateTextures(GL_TEXTURE_3D, 1, &e->volume_texture); - glTextureStorage3D(e->volume_texture, 1, GL_R32F, edim.x, edim.y, edim.z); - LABEL_GL_OBJECT(GL_TEXTURE, e->volume_texture, s8("Beamformed_Volume")); - - glDeleteBuffers(1, &e->rf_data_ssbo); - glCreateBuffers(1, &e->rf_data_ssbo); - glNamedBufferStorage(e->rf_data_ssbo, decoded_data_size(&ctx->csctx), 0, 0); - LABEL_GL_OBJECT(GL_BUFFER, e->rf_data_ssbo, s8("Volume_RF_SSBO")); - } - - if (ctx->flags & DO_COMPUTE || ctx->export_ctx.state & ES_START) { - if (ctx->params->upload && !(ctx->export_ctx.state & ES_COMPUTING)) { - glNamedBufferSubData(ctx->csctx.shared_ubo, 0, sizeof(*bp), bp); - ctx->params->upload = 0; - } - - u32 stages = ctx->params->compute_stages_count; - for (u32 i = 0; i < stages; i++) { - do_compute_shader(ctx, ctx->params->compute_stages[i]); - } - ctx->flags &= ~DO_COMPUTE; - ctx->flags |= GEN_MIPMAPS; - - u32 tidx = ctx->csctx.timer_index; - glDeleteSync(ctx->csctx.timer_fences[tidx]); - ctx->csctx.timer_fences[tidx] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); - ctx->csctx.timer_index = (tidx + 1) % ARRAY_COUNT(ctx->csctx.timer_fences); - } - - if (ctx->export_ctx.state & ES_COMPUTING) { - /* TODO: this could probably be adapted to do FORCES as well */ - b32 done = do_volume_computation_step(ctx, CS_HERCULES); - if (done) { - ExportCtx *e = &ctx->export_ctx; - uv4 dim = e->volume_dim; - size volume_out_size = dim.x * dim.y * dim.z * sizeof(f32); - e->volume_buf = ctx->platform.alloc_arena(e->volume_buf, volume_out_size); - glGetTextureImage(e->volume_texture, 0, GL_RED, GL_FLOAT, volume_out_size, - e->volume_buf.beg); - s8 raw = {.len = volume_out_size, .data = e->volume_buf.beg}; - if (!ctx->platform.write_new_file("raw_volume.bin", raw)) - TraceLog(LOG_WARNING, "failed to write output volume\n"); - } - } + ctx->beamform_work_queue.did_compute_this_frame = 0; + do_beamform_work(ctx, arena); /* NOTE: draw output image texture using render fragment shader */ BeginTextureMode(ctx->fsctx.output); @@ -504,7 +603,15 @@ do_beamformer(BeamformerCtx *ctx, Arena arena) BeginShaderMode(ctx->fsctx.shader); FragmentShaderCtx *fs = &ctx->fsctx; glUseProgram(fs->shader.id); - glBindTextureUnit(0, ctx->out_texture); + u32 out_texture = 0; + if (bp->output_points.w > 1) { + out_texture = ctx->averaged_frame.textures[0]; + } else { + BeamformFrame *f = ctx->beamform_frames + ctx->displayed_frame_index; + /* NOTE: verify we have actually beamformed something yet */ + if (f->dim.w) out_texture = f->textures[f->dim.w - 1]; + } + glBindTextureUnit(0, out_texture); glUniform1f(fs->db_cutoff_id, fs->db); glUniform1f(fs->threshold_id, fs->threshold); DrawTexture(fs->output.texture, 0, 0, WHITE); @@ -520,7 +627,7 @@ do_beamformer(BeamformerCtx *ctx, Arena arena) ctx->flags &= ~GEN_MIPMAPS; } - draw_ui(ctx, arena); + draw_ui(ctx, *arena); if (IsKeyPressed(KEY_R)) ctx->flags |= RELOAD_SHADERS; diff --git a/beamformer.h b/beamformer.h @@ -29,8 +29,7 @@ enum program_flags { SHOULD_EXIT = 1 << 0, RELOAD_SHADERS = 1 << 1, - GEN_MIPMAPS = 1 << 29, - DO_COMPUTE = 1 << 30, + GEN_MIPMAPS = 1 << 30, }; enum gl_vendor_ids { @@ -128,14 +127,6 @@ typedef struct { GLsync timer_fences[MAX_FRAMES_IN_FLIGHT]; f32 last_frame_time[CS_LAST]; - /* NOTE: circular buffer of textures for averaging. 
- * Only allocated up to configured frame average count */ - u32 sum_textures[16]; - u32 sum_texture_index; - - /* NOTE: array output textures. Only allocated up to configured array count */ - u32 array_textures[4]; - /* NOTE: the raw_data_ssbo is allocated at 3x the required size to allow for tiled * transfers when the GPU is running behind the CPU. It is not mapped on NVIDIA because * their drivers _will_ store the buffer in the system memory. This doesn't happen @@ -174,22 +165,32 @@ typedef struct { f32 threshold; } FragmentShaderCtx; -enum export_state { - ES_START = (1 << 0), - ES_COMPUTING = (1 << 1), - ES_TIMER_ACTIVE = (1 << 2), +enum { + PCS_COMPUTING, + PCS_TIMER_ACTIVE, }; typedef struct { - Arena volume_buf; + /* NOTE: we always have one extra texture to sum into; thus the final output data + * is always found in textures[dim.w - 1] */ + u32 textures[MAX_MULTI_XDC_COUNT + 1]; + uv4 dim; + u32 mips; +} BeamformFrame; + +typedef struct { + /* TODO: possibly both of these should be stored elsewhere */ + Arena export_buf; uv4 volume_dim; + + BeamformFrame frame; u32 timer_ids[2]; f32 runtime; - u32 volume_texture; u32 rf_data_ssbo; - u32 state; + u32 shader; u32 dispatch_index; -} ExportCtx; + u32 state; +} PartialComputeCtx; typedef struct { enum gl_vendor_ids vendor_id; @@ -201,6 +202,49 @@ typedef struct { i32 max_ubo_size; } GLParams; +enum beamform_work { + BW_FULL_COMPUTE, + BW_RECOMPUTE, + BW_PARTIAL_COMPUTE, + BW_SAVE_FRAME, + BW_SEND_FRAME, + BW_SSBO_COPY, +}; + +typedef struct { + u32 source_ssbo; + u32 dest_ssbo; +} BeamformSSBOCopy; + +typedef struct { + BeamformFrame *frame; + u32 raw_data_ssbo_index; + b32 first_pass; +} BeamformCompute; + +typedef struct { + BeamformFrame *frame; + iptr output_handle; +} BeamformOutputFrame; + +/* NOTE: discriminated union based on type */ +typedef struct BeamformWork { + struct BeamformWork *next; + union { + BeamformSSBOCopy ssbo_copy_ctx; + BeamformCompute compute_ctx; + BeamformOutputFrame output_frame_ctx; + }; + u32 type; +} BeamformWork; + +typedef struct { + BeamformWork *first; + BeamformWork *last; + BeamformWork *next_free; + b32 did_compute_this_frame; +} BeamformWorkQueue; + typedef struct BeamformerCtx { GLParams gl; @@ -213,13 +257,14 @@ typedef struct BeamformerCtx { InputState is; - uv4 out_data_dim; - u32 out_texture; - u32 out_texture_mips; + BeamformFrame beamform_frames[MAX_BEAMFORMED_SAVED_FRAMES]; + u32 displayed_frame_index; + /* NOTE: this will only be used when we are averaging */ + BeamformFrame averaged_frame; ComputeShaderCtx csctx; FragmentShaderCtx fsctx; - ExportCtx export_ctx; + PartialComputeCtx partial_compute_ctx; Pipe data_pipe; u32 partial_transfer_count; @@ -228,9 +273,11 @@ typedef struct BeamformerCtx { Platform platform; Stream error_stream; + BeamformWorkQueue beamform_work_queue; + BeamformerParametersFull *params; } BeamformerCtx; -#define LABEL_GL_OBJECT(type, id, s) glObjectLabel(type, id, (s).len, (c8 *)(s).data) +#define LABEL_GL_OBJECT(type, id, s) {s8 _s = (s); glObjectLabel(type, id, _s.len, (c8 *)_s.data);} #endif /*_BEAMFORMER_H_ */ diff --git a/beamformer_parameters.h b/beamformer_parameters.h @@ -11,14 +11,16 @@ enum compute_shaders { CS_LAST }; +#define MAX_BEAMFORMED_SAVED_FRAMES 16 +#define MAX_MULTI_XDC_COUNT 4 /* NOTE: This struct follows the OpenGL std140 layout. 
DO NOT modify unless you have * read and understood the rules, particulary with regards to _member alignment_ */ typedef struct { u16 channel_mapping[512]; /* Transducer Channel to Verasonics Channel */ u32 uforces_channels[128]; /* Channels used for virtual UFORCES elements */ - f32 xdc_origin[16]; /* [m] (4 v4s) Corner of transducer being treated as origin */ - f32 xdc_corner1[16]; /* [m] (4 v4s) Corner of transducer along first axis (arbitrary) */ - f32 xdc_corner2[16]; /* [m] (4 v4s) Corner of transducer along second axis (arbitrary) */ + f32 xdc_origin[4 * MAX_MULTI_XDC_COUNT]; /* [m] Corner of transducer being treated as origin */ + f32 xdc_corner1[4 * MAX_MULTI_XDC_COUNT]; /* [m] Corner of transducer along first axis */ + f32 xdc_corner2[4 * MAX_MULTI_XDC_COUNT]; /* [m] Corner of transducer along second axis */ uv4 dec_data_dim; /* Samples * Channels * Acquisitions; last element ignored */ uv4 output_points; /* Width * Height * Depth * (Frame Average Count) */ v4 output_min_coordinate; /* [m] Back-Top-Left corner of output region (w ignored) */ @@ -36,15 +38,19 @@ typedef struct { f32 _pad[1]; } BeamformerParameters; +/* NOTE: garbage to get the prepocessor to properly stringize the value of a macro */ +#define str_(x) #x +#define str(x) str_(x) + #define COMPUTE_SHADER_HEADER "\ #version 460 core\n\ \n\ layout(std140, binding = 0) uniform parameters {\n\ uvec4 channel_mapping[64]; /* Transducer Channel to Verasonics Channel */\n\ uvec4 uforces_channels[32]; /* Channels used for virtual UFORCES elements */\n\ - vec4 xdc_origin[4]; /* [m] Corner of transducer being treated as origin */\n\ - vec4 xdc_corner1[4]; /* [m] Corner of transducer along first axis (arbitrary) */\n\ - vec4 xdc_corner2[4]; /* [m] Corner of transducer along second axis (arbitrary) */\n\ + vec4 xdc_origin[" str(MAX_MULTI_XDC_COUNT) "]; /* [m] Corner of transducer being treated as origin */\n\ + vec4 xdc_corner1[" str(MAX_MULTI_XDC_COUNT) "]; /* [m] Corner of transducer along first axis (arbitrary) */\n\ + vec4 xdc_corner2[" str(MAX_MULTI_XDC_COUNT) "]; /* [m] Corner of transducer along second axis (arbitrary) */\n\ uvec4 dec_data_dim; /* Samples * Channels * Acquisitions; last element ignored */\n\ uvec4 output_points; /* Width * Height * Depth * (Frame Average Count) */\n\ vec4 output_min_coord; /* [m] Top left corner of output region */\n\ diff --git a/main_generic.c b/main_generic.c @@ -44,7 +44,7 @@ main(void) setup_beamformer(&ctx, temp_memory); while(!(ctx.flags & SHOULD_EXIT)) { - do_program_step(&ctx, temp_memory); + do_program_step(&ctx, &temp_memory); } /* NOTE: make sure this will get cleaned up after external diff --git a/static.c b/static.c @@ -20,7 +20,8 @@ static struct { #else static void *debug_lib; -typedef void do_beamformer_fn(BeamformerCtx *, Arena); +/* TODO: move this to a header */ +typedef void do_beamformer_fn(BeamformerCtx *, Arena *); static do_beamformer_fn *do_beamformer; static void @@ -173,13 +174,6 @@ compile_shader(Arena a, u32 type, s8 shader) } static void -init_fragment_shader_ctx(FragmentShaderCtx *ctx, uv4 out_data_dim) -{ - ctx->db = -50.0f; - ctx->threshold = 40.0f; -} - -static void reload_shaders(BeamformerCtx *ctx, Arena a) { ComputeShaderCtx *csctx = &ctx->csctx; @@ -211,7 +205,6 @@ reload_shaders(BeamformerCtx *ctx, Arena a) glDeleteProgram(csctx->programs[i]); csctx->programs[i] = rlLoadComputeShaderProgram(shader_id); LABEL_GL_OBJECT(GL_PROGRAM, csctx->programs[i], compute_shaders[i].label); - ctx->flags |= DO_COMPUTE; } glDeleteShader(shader_id); @@ -262,8 +255,7 
@@ setup_beamformer(BeamformerCtx *ctx, Arena temp_memory) { ctx->window_size = (uv2){.w = 1280, .h = 840}; - ctx->out_data_dim = (uv4){.x = 1, .y = 1, .z = 1}; - ctx->export_ctx.volume_dim = (uv4){.x = 1, .y = 1, .z = 1}; + ctx->partial_compute_ctx.volume_dim = (uv4){.x = 1, .y = 1, .z = 1}; SetConfigFlags(FLAG_VSYNC_HINT); InitWindow(ctx->window_size.w, ctx->window_size.h, "OGL Beamformer"); @@ -280,7 +272,8 @@ setup_beamformer(BeamformerCtx *ctx, Arena temp_memory) ctx->font = LoadFontEx("assets/IBMPlexSans-Bold.ttf", 28, 0, 0); ctx->small_font = LoadFontEx("assets/IBMPlexSans-Bold.ttf", 22, 0, 0); - init_fragment_shader_ctx(&ctx->fsctx, ctx->out_data_dim); + ctx->fsctx.db = -50.0f; + ctx->fsctx.threshold = 40.0f; ctx->data_pipe = os_open_named_pipe(OS_PIPE_NAME); ctx->params = os_open_shared_memory_area(OS_SMEM_NAME, sizeof(ctx->params)); @@ -288,7 +281,6 @@ setup_beamformer(BeamformerCtx *ctx, Arena temp_memory) ASSERT(ctx->data_pipe.file != INVALID_FILE); ASSERT(ctx->params); - ctx->params->raw.output_points = ctx->out_data_dim; /* NOTE: default compute shader pipeline */ ctx->params->compute_stages[0] = CS_HADAMARD; ctx->params->compute_stages[1] = CS_DEMOD; @@ -311,15 +303,13 @@ setup_beamformer(BeamformerCtx *ctx, Arena temp_memory) LABEL_GL_OBJECT(GL_BUFFER, ctx->csctx.shared_ubo, s8("Beamformer_Parameters")); glGenQueries(ARRAY_COUNT(ctx->csctx.timer_fences) * CS_LAST, (u32 *)ctx->csctx.timer_ids); - glGenQueries(ARRAY_COUNT(ctx->export_ctx.timer_ids), ctx->export_ctx.timer_ids); + glGenQueries(ARRAY_COUNT(ctx->partial_compute_ctx.timer_ids), ctx->partial_compute_ctx.timer_ids); - /* NOTE: do not DO_COMPUTE on first frame */ reload_shaders(ctx, temp_memory); - ctx->flags &= ~DO_COMPUTE; } static void -do_program_step(BeamformerCtx *ctx, Arena temp_memory) +do_program_step(BeamformerCtx *ctx, Arena *memory) { do_debug(&ctx->error_stream); if (ctx->gl.vendor_id == GL_VENDOR_NVIDIA) @@ -327,8 +317,8 @@ do_program_step(BeamformerCtx *ctx, Arena temp_memory) if (ctx->flags & RELOAD_SHADERS) { ctx->flags &= ~RELOAD_SHADERS; - reload_shaders(ctx, temp_memory); + reload_shaders(ctx, *memory); } - do_beamformer(ctx, temp_memory); + do_beamformer(ctx, memory); } diff --git a/ui.c b/ui.c @@ -2,10 +2,13 @@ static void ui_start_compute(BeamformerCtx *ctx) { - ctx->flags |= DO_COMPUTE; - if (ctx->params->raw.output_points.w > 1) { - for (u32 i = 0; i < ctx->params->raw.output_points.w; i++) - glClearTexImage(ctx->csctx.sum_textures[i], 0, GL_RED, GL_FLOAT, 0); + /* NOTE: we do not allow ui to start a work if no work was previously completed */ + Arena a = {0}; + beamform_work_queue_push(ctx, &a, BW_RECOMPUTE); + for (u32 i = 0; i < ARRAY_COUNT(ctx->beamform_frames); i++) { + BeamformFrame *frame = ctx->beamform_frames + i; + if (frame->dim.w && frame->textures[frame->dim.w - 1]) + glClearTexImage(frame->textures[frame->dim.w - 1], 0, GL_RED, GL_FLOAT, 0); } ctx->params->upload = 1; } @@ -560,19 +563,21 @@ draw_settings_ui(BeamformerCtx *ctx, Arena arena, Rect r, v2 mouse) draw_r.pos.y += 2 * LISTING_LINE_PAD; draw_r.size.y -= 2 * LISTING_LINE_PAD; - bmv = (BPModifiableValue){&ctx->export_ctx.volume_dim.x, bmv_store_power_of_two, + #if 0 + /* TODO: work this into the work queue */ + bmv = (BPModifiableValue){&ctx->partial_compute_ctx.volume_dim.x, bmv_store_power_of_two, .ilimits = (iv2){.x = 1, .y = ctx->gl.max_3d_texture_dim}, MV_INT, 1, 1}; draw_r = do_text_input_listing(s8("Export Dimension X:"), s8(""), bmv, ctx, arena, draw_r, mouse, hover_t + idx++); - bmv = 
(BPModifiableValue){&ctx->export_ctx.volume_dim.y, bmv_store_power_of_two, + bmv = (BPModifiableValue){&ctx->partial_compute_ctx.volume_dim.y, bmv_store_power_of_two, .ilimits = (iv2){.x = 1, .y = ctx->gl.max_3d_texture_dim}, MV_INT, 1, 1}; draw_r = do_text_input_listing(s8("Export Dimension Y:"), s8(""), bmv, ctx, arena, draw_r, mouse, hover_t + idx++); - bmv = (BPModifiableValue){&ctx->export_ctx.volume_dim.z, bmv_store_power_of_two, + bmv = (BPModifiableValue){&ctx->partial_compute_ctx.volume_dim.z, bmv_store_power_of_two, .ilimits = (iv2){.x = 1, .y = ctx->gl.max_3d_texture_dim}, MV_INT, 1, 1}; draw_r = do_text_input_listing(s8("Export Dimension Z:"), s8(""), bmv, ctx, arena, @@ -582,11 +587,10 @@ draw_settings_ui(BeamformerCtx *ctx, Arena arena, Rect r, v2 mouse) btn_r.size.h = ctx->font.baseSize * 1.3; btn_r.size.w *= 0.6; if (do_text_button(ctx, s8("Dump Raw Volume"), btn_r, mouse, hover_t + idx++)) { - if (!ctx->export_ctx.state) { - ctx->export_ctx.state = ES_START; - ctx->flags |= DO_COMPUTE; + if (!ctx->partial_compute_ctx.state) { } } + #endif /* NOTE: if C compilers didn't suck this would be a static assert */ ASSERT(idx <= ARRAY_COUNT(hover_t)); @@ -631,7 +635,7 @@ draw_debug_overlay(BeamformerCtx *ctx, Arena arena, Rect r) } static s8 totals[2] = {s8("Compute Total:"), s8("Volume Total:")}; - f32 times[2] = {compute_time_sum, ctx->export_ctx.runtime}; + f32 times[2] = {compute_time_sum, ctx->partial_compute_ctx.runtime}; for (u32 i = 0; i < ARRAY_COUNT(totals); i++) { pos.y -= measure_text(ctx->font, totals[i]).y; draw_text(ctx->font, totals[i], pos, 0, colour_from_normalized(FG_COLOUR)); diff --git a/util.c b/util.c @@ -47,6 +47,10 @@ mem_move(u8 *src, u8 *dest, size n) static void * alloc_(Arena *a, size len, size align, size count) { + /* NOTE: special case 0 arena */ + if (a->beg == 0) + return 0; + size padding = -(uintptr_t)a->beg & (align - 1); size available = a->end - a->beg - padding; if (available < 0 || count > available / len) diff --git a/util.h b/util.h @@ -11,6 +11,10 @@ #define asm __asm__ #endif +#ifndef typeof +#define typeof __typeof__ +#endif + #ifndef unreachable #ifdef _MSC_VER #define unreachable() __assume(0) @@ -62,6 +66,7 @@ typedef ptrdiff_t size; typedef ptrdiff_t iptr; typedef struct { u8 *beg, *end; } Arena; +typedef struct { Arena *arena; u8 *old_beg; } TempArena; typedef struct { size len; u8 *data; } s8; #define s8(s) (s8){.len = ARRAY_COUNT(s) - 1, .data = (u8 *)s} @@ -79,13 +84,28 @@ typedef union { } iv2; typedef union { + struct { i32 x, y, z; }; + struct { i32 w, h, d; }; + iv2 xy; + i32 E[3]; +} iv3; + +typedef union { struct { u32 x, y; }; struct { u32 w, h; }; u32 E[2]; } uv2; typedef union { + struct { u32 x, y, z; }; + struct { u32 w, h, d; }; + uv2 xy; + u32 E[3]; +} uv3; + +typedef union { struct { u32 x, y, z, w; }; + uv3 xyz; u32 E[4]; } uv4;
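
Taken together, the per-frame flow in do_beamformer() now reduces to roughly
the following (heavily condensed sketch of the new code above; parameter
upload, error reporting, and the draw/UI passes are elided):

	BeamformerParameters *bp = &ctx->params->raw;

	if (ctx->platform.poll_pipe(ctx->data_pipe)) {
		/* new RF data is only read if a work item is available; otherwise
		 * too many frames are already in flight and we retry next frame */
		BeamformWork *work = beamform_work_queue_push(ctx, arena, BW_FULL_COMPUTE);
		if (work) {
			alloc_output_image(ctx, bp->output_points);
			/* read_pipe() fills the raw data slot selected by
			 * work->compute_ctx.raw_data_ssbo_index, then the data is
			 * uploaded to the raw data SSBO */
		}
	}

	/* drain the queue; the pop path allows at most one compute item per frame */
	ctx->beamform_work_queue.did_compute_this_frame = 0;
	do_beamform_work(ctx, arena);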