ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git

beamformer.c (26848B)


/* See LICENSE for license details. */
#include "beamformer.h"

static f32 dt_for_frame;
static f32 cycle_t;

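/* NOTE: decoded samples are complex: 2 f32s (I/Q) per sample, matching
 * the RG32F textures the beamformed output is written to */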
static size
decoded_data_size(ComputeShaderCtx *cs)
{
	uv4  dim    = cs->dec_data_dim;
	size result = 2 * sizeof(f32) * dim.x * dim.y * dim.z;
	return result;
}

static uv4
make_valid_test_dim(uv4 in)
{
	uv4 result;
	result.x = MAX(in.x, 1);
	result.y = MAX(in.y, 1);
	result.z = MAX(in.z, 1);
	result.w = 1;
	return result;
}

static BeamformFrameIterator
beamform_frame_iterator(BeamformerCtx *ctx)
{
	BeamformFrameIterator result;
	result.frames        = ctx->beamform_frames;
	result.offset        = ctx->displayed_frame_index;
	result.capacity      = ARRAY_COUNT(ctx->beamform_frames);
	result.cursor        = 0;
	result.needed_frames = ORONE(ctx->params->raw.output_points.w);
	return result;
}

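/* NOTE: walks the frame ring buffer backwards starting from the currently
 * displayed frame; the unsigned wrap in (offset - cursor) % capacity
 * assumes the backing array length is a power of 2 */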
static BeamformFrame *
frame_next(BeamformFrameIterator *bfi)
{
	BeamformFrame *result = 0;
	if (bfi->cursor != bfi->needed_frames) {
		u32 index = (bfi->offset - bfi->cursor++) % bfi->capacity;
		result    = bfi->frames + index;
	}
	return result;
}

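/* NOTE: (re)allocates one 3D RG32F texture per transducer (out_dim.w);
 * x/y/z are rounded down to powers of 2, presumably so that the mip chain
 * used by the min/max reduction divides evenly at every level */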
static void
alloc_beamform_frame(GLParams *gp, BeamformFrame *out, uv4 out_dim, u32 frame_index, s8 name)
{
	glDeleteTextures(out->dim.w, out->textures);

	out->dim.x = CLAMP(round_down_power_of_2(ORONE(out_dim.x)), 1, gp->max_3d_texture_dim);
	out->dim.y = CLAMP(round_down_power_of_2(ORONE(out_dim.y)), 1, gp->max_3d_texture_dim);
	out->dim.z = CLAMP(round_down_power_of_2(ORONE(out_dim.z)), 1, gp->max_3d_texture_dim);
	out->dim.w = CLAMP(out_dim.w, 0, MAX_MULTI_XDC_COUNT);

	/* NOTE: allocate storage for beamformed output data;
	 * this is shared between compute and fragment shaders */
	u32 max_dim = MAX(out->dim.x, MAX(out->dim.y, out->dim.z));
	out->mips   = ctz_u32(max_dim) + 1;

	u8 buf[256];
	Stream label = {.data = buf, .cap = ARRAY_COUNT(buf)};
	stream_append_s8(&label, name);
	stream_append_byte(&label, '[');
	stream_append_u64(&label, frame_index);
	stream_append_s8(&label, s8("]["));
	u32 sidx = label.widx;

	glCreateTextures(GL_TEXTURE_3D, out->dim.w, out->textures);
	for (u32 i = 0; i < out->dim.w; i++) {
		glTextureStorage3D(out->textures[i], out->mips, GL_RG32F,
		                   out->dim.x, out->dim.y, out->dim.z);
		stream_append_u64(&label, i);
		stream_append_byte(&label, ']');
		LABEL_GL_OBJECT(GL_TEXTURE, out->textures[i], stream_to_s8(&label));
		label.widx = sidx;
	}
}

static void
alloc_output_image(BeamformerCtx *ctx, uv4 output_dim)
{
	uv4 try_dim = make_valid_test_dim(output_dim);
	if (!uv4_equal(try_dim, ctx->averaged_frame.dim)) {
		alloc_beamform_frame(&ctx->gl, &ctx->averaged_frame, try_dim, 0,
		                     s8("Beamformed_Averaged_Data"));
		uv4 odim = ctx->averaged_frame.dim;

		UnloadRenderTexture(ctx->fsctx.output);
		/* TODO: select odim.x vs odim.y */
		ctx->fsctx.output = LoadRenderTexture(odim.x, odim.z);
		LABEL_GL_OBJECT(GL_FRAMEBUFFER, ctx->fsctx.output.id, s8("Rendered_View"));
		GenTextureMipmaps(&ctx->fsctx.output.texture);
		//SetTextureFilter(ctx->fsctx.output.texture, TEXTURE_FILTER_ANISOTROPIC_8X);
		//SetTextureFilter(ctx->fsctx.output.texture, TEXTURE_FILTER_TRILINEAR);
		SetTextureFilter(ctx->fsctx.output.texture, TEXTURE_FILTER_BILINEAR);
	}
}

static void
alloc_shader_storage(BeamformerCtx *ctx, Arena a)
{
	ComputeShaderCtx *cs     = &ctx->csctx;
	BeamformerParameters *bp = &ctx->params->raw;
	uv4 dec_data_dim         = bp->dec_data_dim;
	uv2 rf_raw_dim           = bp->rf_raw_dim;
	ctx->csctx.dec_data_dim  = dec_data_dim;
	ctx->csctx.rf_raw_dim    = rf_raw_dim;
	size rf_raw_size         = rf_raw_dim.x * rf_raw_dim.y * sizeof(i16);
	size rf_decoded_size     = decoded_data_size(cs);

	glDeleteBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
	glCreateBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);

	i32 storage_flags = GL_DYNAMIC_STORAGE_BIT;
	switch (ctx->gl.vendor_id) {
	case GL_VENDOR_AMD:
	case GL_VENDOR_ARM:
	case GL_VENDOR_INTEL:
		if (cs->raw_data_ssbo)
			glUnmapNamedBuffer(cs->raw_data_ssbo);
		storage_flags |= GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT;
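		/* FALLTHROUGH */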
	case GL_VENDOR_NVIDIA:
		/* NOTE: register_cuda_buffers will handle the updated ssbo */
		break;
	}

	size full_rf_buf_size = ARRAY_COUNT(cs->raw_data_fences) * rf_raw_size;
	glDeleteBuffers(1, &cs->raw_data_ssbo);
	glCreateBuffers(1, &cs->raw_data_ssbo);
	glNamedBufferStorage(cs->raw_data_ssbo, full_rf_buf_size, 0, storage_flags);
	LABEL_GL_OBJECT(GL_BUFFER, cs->raw_data_ssbo, s8("Raw_Data_SSBO"));

	Stream label = stream_alloc(&a, 256);
	stream_append_s8(&label, s8("RF_SSBO_"));
	u32 s_widx  = label.widx;
	for (u32 i = 0; i < ARRAY_COUNT(cs->rf_data_ssbos); i++) {
		glNamedBufferStorage(cs->rf_data_ssbos[i], rf_decoded_size, 0, 0);
		stream_append_u64(&label, i);
		s8 rf_label = stream_to_s8(&label);
		LABEL_GL_OBJECT(GL_BUFFER, cs->rf_data_ssbos[i], rf_label);
		label.widx = s_widx;
	}

	i32 map_flags = GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT|GL_MAP_UNSYNCHRONIZED_BIT;
	switch (ctx->gl.vendor_id) {
	case GL_VENDOR_AMD:
	case GL_VENDOR_ARM:
	case GL_VENDOR_INTEL:
		cs->raw_data_arena.beg = glMapNamedBufferRange(cs->raw_data_ssbo, 0,
		                                               full_rf_buf_size, map_flags);
		break;
	case GL_VENDOR_NVIDIA:
		cs->raw_data_arena = ctx->platform.alloc_arena(cs->raw_data_arena, full_rf_buf_size);
		ctx->cuda_lib.register_cuda_buffers(cs->rf_data_ssbos, ARRAY_COUNT(cs->rf_data_ssbos),
		                                    cs->raw_data_ssbo);
		ctx->cuda_lib.init_cuda_configuration(bp->rf_raw_dim.E, bp->dec_data_dim.E,
		                                      bp->channel_mapping);
		break;
	}

	/* NOTE: store hadamard in GPU once; it won't change for a particular imaging session */
	cs->hadamard_dim       = (uv2){.x = dec_data_dim.z, .y = dec_data_dim.z};
	size hadamard_elements = dec_data_dim.z * dec_data_dim.z;
	i32  *hadamard         = alloc(&a, i32, hadamard_elements);
	i32  *tmp              = alloc(&a, i32, hadamard_elements);
	fill_hadamard_transpose(hadamard, tmp, dec_data_dim.z);
	glDeleteBuffers(1, &cs->hadamard_ssbo);
	glCreateBuffers(1, &cs->hadamard_ssbo);
	glNamedBufferStorage(cs->hadamard_ssbo, hadamard_elements * sizeof(i32), hadamard, 0);
	LABEL_GL_OBJECT(GL_BUFFER, cs->hadamard_ssbo, s8("Hadamard_SSBO"));
}

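/* NOTE: singly linked FIFO backed by a free list; compute items are
 * rationed to at most one per frame via did_compute_this_frame */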
static BeamformWork *
beamform_work_queue_pop(BeamformWorkQueue *q)
{
	BeamformWork *result = q->first;
	if (result) {
		switch (result->type) {
		case BW_FULL_COMPUTE:
		case BW_RECOMPUTE:
		case BW_PARTIAL_COMPUTE:
			/* NOTE: only one compute is allowed per frame */
			if (q->did_compute_this_frame) {
				result = 0;
			} else {
				q->compute_in_flight--;
				q->did_compute_this_frame = 1;
				ASSERT(q->compute_in_flight >= 0);
			}
			break;
		}
	}
	/* NOTE: only do this once we have determined if we are doing the work */
	if (result) {
		q->first = result->next;
		if (result == q->last) {
			ASSERT(result->next == 0);
			q->last = 0;
		}
	}

	return result;
}

static BeamformWork *
beamform_work_queue_push(BeamformerCtx *ctx, Arena *a, enum beamform_work work_type)
{
	/* TODO: we should have a sub arena specifically for this purpose */

	BeamformWorkQueue *q = &ctx->beamform_work_queue;
	ComputeShaderCtx *cs = &ctx->csctx;

	BeamformWork *result = q->next_free;
	if (result) q->next_free = result->next;
	else        result = alloc(a, typeof(*result), 1);

	if (result) {
		result->type = work_type;
		result->next = 0;

		switch (work_type) {
		case BW_FULL_COMPUTE:
			if (q->compute_in_flight >= ARRAY_COUNT(cs->raw_data_fences)) {
				result->next = q->next_free;
				q->next_free = result;
				result       = 0;
				break;
			}
			cs->raw_data_index++;
			if (cs->raw_data_index >= ARRAY_COUNT(cs->raw_data_fences))
				cs->raw_data_index = 0;
			/* FALLTHROUGH */
		case BW_RECOMPUTE: {
			i32 raw_index = cs->raw_data_index;
			result->compute_ctx.raw_data_ssbo_index = raw_index;
			/* NOTE: the wait timeout is in nanoseconds; if it expires it means
			 * the command queue is more than 3 frames behind. In that case we
			 * need to re-evaluate the buffer size */
			if (cs->raw_data_fences[raw_index]) {
				i32 sync_status = glClientWaitSync(cs->raw_data_fences[raw_index], 0,
				                                   10000);
				if (sync_status == GL_TIMEOUT_EXPIRED) {
					//ASSERT(0);
				}
				glDeleteSync(cs->raw_data_fences[raw_index]);
				cs->raw_data_fences[raw_index] = NULL;
			}
			ctx->displayed_frame_index++;
			if (ctx->displayed_frame_index >= ARRAY_COUNT(ctx->beamform_frames))
				ctx->displayed_frame_index = 0;
			result->compute_ctx.frame = ctx->beamform_frames + ctx->displayed_frame_index;
			result->compute_ctx.first_pass = 1;

			BeamformFrameIterator bfi = beamform_frame_iterator(ctx);
			for (BeamformFrame *frame = frame_next(&bfi); frame; frame = frame_next(&bfi)) {
				uv4 try_dim = ctx->params->raw.output_points;
				try_dim.w   = ctx->params->raw.xdc_count;
				if (!uv4_equal(frame->dim, try_dim)) {
					u32 index = (bfi.offset - bfi.cursor) % bfi.capacity;
					alloc_beamform_frame(&ctx->gl, frame, try_dim, index,
					                     s8("Beamformed_Data"));
				}
			}
		} /* FALLTHROUGH */
		case BW_PARTIAL_COMPUTE:
			q->compute_in_flight++;
			/* FALLTHROUGH */
		case BW_SAVE_FRAME:
		case BW_SEND_FRAME:
		case BW_SSBO_COPY:
			break;
		}

		if (result) {
			if (q->last) q->last = q->last->next = result;
			else         q->last = q->first      = result;
		}
	}

	return result;
}

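/* NOTE: builds a column major matrix (OpenGL convention) which maps a
 * world space point into the local space of the transducer (XDC) plane
 * spanned by origin/corner1/corner2 */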
static m4
v3_to_xdc_space(v3 direction, v3 origin, v3 corner1, v3 corner2)
{
	v3 edge1      = sub_v3(corner1, origin);
	v3 edge2      = sub_v3(corner2, origin);
	v3 xdc_normal = cross(edge1, edge2);
	if (xdc_normal.z < 0)
		xdc_normal = cross(edge2, edge1);
	ASSERT(xdc_normal.z >= 0);

	v3 e1 = normalize_v3(sub_v3(direction, xdc_normal));
	v3 e2 = {.y = 1};
	v3 e3 = normalize_v3(cross(e2, e1));
	v4 e4 = {.x = -origin.x, .y = -origin.y, .z = -origin.z, .w = 1};

	m4 result = {
		.c[0] = (v4){.x = e3.x, .y = e2.x, .z = e1.x, .w = 0},
		.c[1] = (v4){.x = e3.y, .y = e2.y, .z = e1.y, .w = 0},
		.c[2] = (v4){.x = e3.z, .y = e2.z, .z = e1.z, .w = 0},
		.c[3] = e4,
	};

	return result;
}

static v4
f32_4_to_v4(f32 *in)
{
	v4 result;
	store_f32x4(load_f32x4(in), result.E);
	return result;
}

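/* NOTE: synchronously reads back the last texture in the frame (the
 * summed result when multiple transducers are in use) and hands it to
 * the platform layer for writing */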
static void
export_frame(BeamformerCtx *ctx, iptr handle, BeamformFrame *frame)
{
	uv3 dim            = frame->dim.xyz;
	size out_size      = dim.x * dim.y * dim.z * 2 * sizeof(f32);
	ctx->export_buffer = ctx->platform.alloc_arena(ctx->export_buffer, out_size);
	u32 texture        = frame->textures[frame->dim.w - 1];
	glGetTextureImage(texture, 0, GL_RG, GL_FLOAT, out_size, ctx->export_buffer.beg);
	s8 raw = {.len = out_size, .data = ctx->export_buffer.beg};
	if (!ctx->platform.write_file(handle, raw))
		TraceLog(LOG_WARNING, "failed to export frame\n");
	ctx->platform.close(handle);
}

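/* NOTE: accumulates in_textures into out_texture, prescaling each input
 * by in_scale (1/N for an average); the dispatch sizes assume a local
 * workgroup size of 32x1x32 in the sum shader */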
static void
do_sum_shader(ComputeShaderCtx *cs, u32 *in_textures, u32 in_texture_count, f32 in_scale,
              u32 out_texture, uv4 out_data_dim)
{
	/* NOTE: zero output before summing */
	glClearTexImage(out_texture, 0, GL_RED, GL_FLOAT, 0);

	glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F);
	glUniform1f(cs->sum_prescale_id, in_scale);
	for (u32 i = 0; i < in_texture_count; i++) {
		glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F);
		glDispatchCompute(ORONE(out_data_dim.x / 32),
		                  ORONE(out_data_dim.y),
		                  ORONE(out_data_dim.z / 32));
		glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
	}
}

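/* NOTE: dispatches the beamforming (DAS) shader once per transducer,
 * binding that transducer's output texture and its world->XDC transform */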
static void
do_beamform_shader(ComputeShaderCtx *cs, BeamformerParameters *bp, BeamformFrame *frame,
                   u32 rf_ssbo, iv3 dispatch_dim, iv3 compute_dim_offset, i32 compute_pass)
{
	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, rf_ssbo);
	glUniform3iv(cs->volume_export_dim_offset_id, 1, compute_dim_offset.E);
	glUniform1i(cs->volume_export_pass_id, compute_pass);

	for (u32 i = 0; i < frame->dim.w; i++) {
		u32 texture = frame->textures[i];
		m4 xdc_transform = v3_to_xdc_space((v3){.z = 1},
		                                   f32_4_to_v4(bp->xdc_origin  + (4 * i)).xyz,
		                                   f32_4_to_v4(bp->xdc_corner1 + (4 * i)).xyz,
		                                   f32_4_to_v4(bp->xdc_corner2 + (4 * i)).xyz);
		glBindImageTexture(0, texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
		glUniform1i(cs->xdc_index_id, i);
		glUniformMatrix4fv(cs->xdc_transform_id, 1, GL_FALSE, xdc_transform.E);
		glDispatchCompute(ORONE(dispatch_dim.x / 32),
		                  ORONE(dispatch_dim.y),
		                  ORONE(dispatch_dim.z / 32));
	}
}

static b32
do_partial_compute_step(BeamformerCtx *ctx, BeamformFrame *frame)
{
	ComputeShaderCtx  *cs = &ctx->csctx;
	PartialComputeCtx *pc = &ctx->partial_compute_ctx;

	b32 done = 0;

	/* NOTE: we start this elsewhere on the first dispatch so that we can include
	 * times such as decoding/demodulation/etc. */
	if (!pc->timer_active) {
		glQueryCounter(pc->timer_ids[0], GL_TIMESTAMP);
		pc->timer_active = 1;
	}

	glBeginQuery(GL_TIME_ELAPSED, cs->timer_ids[cs->timer_index][pc->shader]);
	cs->timer_active[cs->timer_index][pc->shader] = 1;

	glUseProgram(cs->programs[pc->shader]);

	/* NOTE: We must tile this otherwise GL will kill us for taking too long */
	/* TODO: this could be based on multiple dimensions */
	i32 dispatch_count = frame->dim.z / 32;
	iv3 dim_offset     = {.z = !!dispatch_count * 32 * pc->dispatch_index++};
	iv3 dispatch_dim   = {.x = frame->dim.x, .y = frame->dim.y, .z = 1};
	do_beamform_shader(cs, &ctx->params->raw, frame, pc->rf_data_ssbo, dispatch_dim, dim_offset, 1);

	if (pc->dispatch_index >= dispatch_count) {
		pc->dispatch_index = 0;
		done               = 1;
	}

	glQueryCounter(pc->timer_ids[1], GL_TIMESTAMP);

	glEndQuery(GL_TIME_ELAPSED);

	return done;
}

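/* NOTE: compute stages ping-pong between the two rf_data_ssbos: each
 * pass reads from last_output_ssbo_index, writes its complement, and
 * then flips the index for the next stage */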
static void
do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformFrame *frame, u32 raw_data_index,
                  enum compute_shaders shader)
{
	ComputeShaderCtx *csctx = &ctx->csctx;
	uv2  rf_raw_dim         = ctx->params->raw.rf_raw_dim;
	size rf_raw_size        = rf_raw_dim.x * rf_raw_dim.y * sizeof(i16);

	glBeginQuery(GL_TIME_ELAPSED, csctx->timer_ids[csctx->timer_index][shader]);
	csctx->timer_active[csctx->timer_index][shader] = 1;

	glUseProgram(csctx->programs[shader]);

	u32 output_ssbo_idx = !csctx->last_output_ssbo_index;
	u32 input_ssbo_idx  = csctx->last_output_ssbo_index;

	switch (shader) {
	case CS_HADAMARD:
		glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo,
		                  raw_data_index * rf_raw_size, rf_raw_size);

		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, csctx->hadamard_ssbo);
		glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32),
		                  ORONE(csctx->dec_data_dim.y / 32),
		                  ORONE(csctx->dec_data_dim.z));
		csctx->raw_data_fences[raw_data_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_CUDA_DECODE:
		ctx->cuda_lib.cuda_decode(raw_data_index * rf_raw_size, output_ssbo_idx,
		                          ctx->params->raw.channel_offset);
		csctx->raw_data_fences[raw_data_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_CUDA_HILBERT:
		ctx->cuda_lib.cuda_hilbert(input_ssbo_idx, output_ssbo_idx);
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_DEMOD:
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
		glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32),
		                  ORONE(csctx->dec_data_dim.y / 32),
		                  ORONE(csctx->dec_data_dim.z));
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_MIN_MAX: {
		u32 texture = frame->textures[frame->dim.w - 1];
		for (u32 i = 1; i < frame->mips; i++) {
			glBindImageTexture(0, texture, i - 1, GL_TRUE, 0, GL_READ_ONLY,  GL_RG32F);
			glBindImageTexture(1, texture, i - 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
			glUniform1i(csctx->mips_level_id, i);

			u32 width  = frame->dim.x >> i;
			u32 height = frame->dim.y >> i;
			u32 depth  = frame->dim.z >> i;
			glDispatchCompute(ORONE(width / 32), ORONE(height), ORONE(depth / 32));
			glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
		}
	} break;
	case CS_DAS: {
		u32 rf_ssbo      = csctx->rf_data_ssbos[input_ssbo_idx];
		iv3 dispatch_dim = {.x = frame->dim.x, .y = frame->dim.y, .z = frame->dim.z};
		do_beamform_shader(csctx, &ctx->params->raw, frame, rf_ssbo, dispatch_dim, (iv3){0}, 0);
		if (frame->dim.w > 1) {
			glUseProgram(csctx->programs[CS_SUM]);
			u32 input_texture_count = frame->dim.w - 1;
			do_sum_shader(csctx, frame->textures, input_texture_count,
			              1 / (f32)input_texture_count, frame->textures[frame->dim.w - 1],
			              frame->dim);
		}
	} break;
	case CS_SUM: {
		u32 frame_count  = 0;
		u32 *in_textures = alloc(&arena, u32, MAX_BEAMFORMED_SAVED_FRAMES);
		BeamformFrameIterator bfi = beamform_frame_iterator(ctx);
		for (BeamformFrame *frame = frame_next(&bfi); frame; frame = frame_next(&bfi)) {
			ASSERT(frame->dim.w);
			in_textures[frame_count++] = frame->textures[frame->dim.w - 1];
		}
		do_sum_shader(csctx, in_textures, frame_count, 1 / (f32)frame_count,
		              ctx->averaged_frame.textures[0], ctx->averaged_frame.dim);
	} break;
	default: ASSERT(0);
	}

	glEndQuery(GL_TIME_ELAPSED);
}

static void
do_beamform_work(BeamformerCtx *ctx, Arena *a)
{
	BeamformerParameters *bp = &ctx->params->raw;
	BeamformWorkQueue *q     = &ctx->beamform_work_queue;
	BeamformWork *work       = beamform_work_queue_pop(q);
	ComputeShaderCtx *cs     = &ctx->csctx;

	while (work) {
		switch (work->type) {
		case BW_PARTIAL_COMPUTE: {
			BeamformFrame *frame = work->compute_ctx.frame;

			if (work->compute_ctx.first_pass) {
				if (ctx->params->upload) {
					glNamedBufferSubData(cs->shared_ubo, 0, sizeof(*bp), bp);
					ctx->params->upload = 0;
				}

				PartialComputeCtx *pc = &ctx->partial_compute_ctx;
				pc->runtime      = 0;
				pc->timer_active = 1;
				glQueryCounter(pc->timer_ids[0], GL_TIMESTAMP);
				glDeleteBuffers(1, &pc->rf_data_ssbo);
				glCreateBuffers(1, &pc->rf_data_ssbo);
				glNamedBufferStorage(pc->rf_data_ssbo, decoded_data_size(cs), 0, 0);
				LABEL_GL_OBJECT(GL_BUFFER, pc->rf_data_ssbo, s8("Volume_RF_SSBO"));

				/* TODO: maybe we should have some concept of compute shader
				 * groups, then we could define a group that does the decoding
				 * and filtering and apply that group directly here. For now
				 * we will do this dumb thing */
				u32 stage_count = ctx->params->compute_stages_count;
				enum compute_shaders *stages = ctx->params->compute_stages;
				for (u32 i = 0; i < stage_count; i++) {
					if (stages[i] == CS_DAS) {
						ctx->partial_compute_ctx.shader = stages[i];
						break;
					}
					do_compute_shader(ctx, *a, frame,
					                  work->compute_ctx.raw_data_ssbo_index,
					                  stages[i]);
				}
				u32 output_ssbo = pc->rf_data_ssbo;
				u32 input_ssbo  = cs->rf_data_ssbos[cs->last_output_ssbo_index];
				size rf_size    = decoded_data_size(cs);
				glCopyNamedBufferSubData(input_ssbo, output_ssbo, 0, 0, rf_size);
			}

			b32 done = do_partial_compute_step(ctx, frame);
			if (!done) {
				BeamformWork *new;
				/* NOTE: this push must not fail */
				new = beamform_work_queue_push(ctx, a, BW_PARTIAL_COMPUTE);
				new->compute_ctx.first_pass    = 0;
				new->compute_ctx.frame         = frame;
				new->compute_ctx.export_handle = work->compute_ctx.export_handle;
			} else if (work->compute_ctx.export_handle != INVALID_FILE) {
				export_frame(ctx, work->compute_ctx.export_handle, frame);
				work->compute_ctx.export_handle = INVALID_FILE;
				/* NOTE: do not waste a bunch of GPU space holding onto the volume
				 * texture if it was just for export */
				glDeleteTextures(frame->dim.w, frame->textures);
				mem_clear(frame, 0, sizeof(*frame));
			}
		} break;
		case BW_FULL_COMPUTE:
		case BW_RECOMPUTE: {
			BeamformFrame *frame = work->compute_ctx.frame;

			if (ctx->params->upload) {
				glNamedBufferSubData(cs->shared_ubo, 0, sizeof(*bp), bp);
				ctx->params->upload = 0;
			}

			u32 stage_count = ctx->params->compute_stages_count;
			enum compute_shaders *stages = ctx->params->compute_stages;
			for (u32 i = 0; i < stage_count; i++)
				do_compute_shader(ctx, *a, frame, work->compute_ctx.raw_data_ssbo_index,
				                  stages[i]);

			if (work->compute_ctx.export_handle != INVALID_FILE) {
				export_frame(ctx, work->compute_ctx.export_handle, frame);
				work->compute_ctx.export_handle = INVALID_FILE;
			}

			ctx->flags |= GEN_MIPMAPS;
		} break;
		}

		work->next   = q->next_free;
		q->next_free = work;
		work = beamform_work_queue_pop(q);
	}

	if (q->did_compute_this_frame) {
		u32 tidx = ctx->csctx.timer_index;
		glDeleteSync(ctx->csctx.timer_fences[tidx]);
		ctx->csctx.timer_fences[tidx] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
		ctx->csctx.timer_index = (tidx + 1) % ARRAY_COUNT(ctx->csctx.timer_fences);
	}
}

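/* NOTE: non-blocking: bails out if the fence for the previous timer slot
 * hasn't signalled yet; the unsigned wrap in (timer_index - 1) % count
 * assumes the fence array length is a power of 2 */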
static void
check_compute_timers(ComputeShaderCtx *cs, PartialComputeCtx *pc, BeamformerParametersFull *bp)
{
	/* NOTE: volume generation running timer */
	if (pc->timer_active) {
		u64 start_ns = 0, end_ns = 0;
		glGetQueryObjectui64v(pc->timer_ids[0], GL_QUERY_RESULT, &start_ns);
		glGetQueryObjectui64v(pc->timer_ids[1], GL_QUERY_RESULT, &end_ns);
		u64 elapsed_ns    = end_ns - start_ns;
		pc->runtime      += (f32)elapsed_ns * 1e-9;
		pc->timer_active  = 0;
	}

	/* NOTE: main timers for display portion of the program */
	u32 last_idx = (cs->timer_index - 1) % ARRAY_COUNT(cs->timer_fences);
	if (!cs->timer_fences[last_idx])
		return;

	i32 status = glClientWaitSync(cs->timer_fences[last_idx], 0, 0);
	if (status == GL_TIMEOUT_EXPIRED || status == GL_WAIT_FAILED)
		return;
	glDeleteSync(cs->timer_fences[last_idx]);
	cs->timer_fences[last_idx] = NULL;

	for (u32 i = 0; i < bp->compute_stages_count; i++) {
		u64 ns = 0;
		i32 idx = bp->compute_stages[i];
		if (cs->timer_active[last_idx][idx]) {
			glGetQueryObjectui64v(cs->timer_ids[last_idx][idx], GL_QUERY_RESULT, &ns);
			cs->timer_active[last_idx][idx] = 0;
		}
		cs->last_frame_time[idx] = (f32)ns / 1e9;
	}
}

#include "ui.c"

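/* NOTE: per frame: poll the compute timers, ingest any new RF data from
 * the pipe, drain the beamform work queue, then draw the newest frame
 * through the fragment shader into the output render texture */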
DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
{
	dt_for_frame = GetFrameTime();

	cycle_t += dt_for_frame;
	if (cycle_t > 1) cycle_t -= 1;
	glProgramUniform1f(ctx->csctx.programs[CS_DAS], ctx->csctx.cycle_t_id, cycle_t);

	if (IsWindowResized()) {
		ctx->window_size.h = GetScreenHeight();
		ctx->window_size.w = GetScreenWidth();
	}

	if (input->executable_reloaded) {
		ui_init(ctx, ctx->ui_backing_store);
	}

	/* NOTE: store the compute time for the last frame */
	check_compute_timers(&ctx->csctx, &ctx->partial_compute_ctx, ctx->params);

	BeamformerParameters *bp = &ctx->params->raw;
	/* NOTE: check for new RF data and load it into the GPU */
	if (input->pipe_data_available) {
		BeamformWork *work = beamform_work_queue_push(ctx, arena, BW_FULL_COMPUTE);
		/* NOTE: we can only read in the new data if we get back a work item;
		 * otherwise we have too many frames in flight and should wait until the
		 * next frame to try again */
		if (work) {
			ComputeShaderCtx *cs = &ctx->csctx;
			if (!uv4_equal(cs->dec_data_dim, bp->dec_data_dim)) {
				alloc_shader_storage(ctx, *arena);
				/* TODO: we may need to invalidate all queue items here */
			}

			if (ctx->params->export_next_frame) {
				/* TODO: we don't really want the beamformer opening/closing files */
				iptr f = ctx->platform.open_for_write(ctx->params->export_pipe_name);
				work->compute_ctx.export_handle = f;
				ctx->params->export_next_frame  = 0;
			} else {
				work->compute_ctx.export_handle = INVALID_FILE;
			}

			b32 output_3d = bp->output_points.x > 1 && bp->output_points.y > 1 &&
			                bp->output_points.z > 1;

			if (output_3d) {
				work->type = BW_PARTIAL_COMPUTE;
				BeamformFrame *frame = &ctx->partial_compute_ctx.frame;
				uv4 out_dim = ctx->params->raw.output_points;
				out_dim.w   = ctx->params->raw.xdc_count;
				alloc_beamform_frame(&ctx->gl, frame, out_dim, 0, s8("Beamformed_Volume"));
				work->compute_ctx.frame = frame;
			}

			u32  raw_index    = work->compute_ctx.raw_data_ssbo_index;
			uv2  rf_raw_dim   = cs->rf_raw_dim;
			size rf_raw_size  = rf_raw_dim.x * rf_raw_dim.y * sizeof(i16);
			void *rf_data_buf = cs->raw_data_arena.beg + raw_index * rf_raw_size;

			alloc_output_image(ctx, bp->output_points);

			size rlen = ctx->platform.read_pipe(input->pipe_handle, rf_data_buf, rf_raw_size);
			if (rlen != rf_raw_size) {
				stream_append_s8(&ctx->error_stream, s8("Partial Read Occurred: "));
				stream_append_i64(&ctx->error_stream, rlen);
				stream_append_byte(&ctx->error_stream, '/');
				stream_append_i64(&ctx->error_stream, rf_raw_size);
				stream_append_s8(&ctx->error_stream, s8("\n\0"));
				TraceLog(LOG_WARNING, "%s", (c8 *)stream_to_s8(&ctx->error_stream).data);
				ctx->error_stream.widx = 0;
			} else {
				switch (ctx->gl.vendor_id) {
				case GL_VENDOR_AMD:
				case GL_VENDOR_ARM:
				case GL_VENDOR_INTEL:
					break;
				case GL_VENDOR_NVIDIA:
					glNamedBufferSubData(cs->raw_data_ssbo, raw_index * rlen,
					                     rlen, rf_data_buf);
				}
			}
		}
	}

	ctx->beamform_work_queue.did_compute_this_frame = 0;
	do_beamform_work(ctx, arena);

	/* NOTE: draw output image texture using render fragment shader */
	BeginTextureMode(ctx->fsctx.output);
		ClearBackground(PINK);
		BeginShaderMode(ctx->fsctx.shader);
			FragmentShaderCtx *fs = &ctx->fsctx;
			glUseProgram(fs->shader.id);
			u32 out_texture = 0;
			if (bp->output_points.w > 1) {
				out_texture = ctx->averaged_frame.textures[0];
			} else {
				BeamformFrame *f = ctx->beamform_frames + ctx->displayed_frame_index;
				/* NOTE: verify we have actually beamformed something yet */
				if (f->dim.w) out_texture = f->textures[f->dim.w - 1];
			}
			glBindTextureUnit(0, out_texture);
			glUniform1f(fs->db_cutoff_id, fs->db);
			glUniform1f(fs->threshold_id, fs->threshold);
			DrawTexture(fs->output.texture, 0, 0, WHITE);
		EndShaderMode();
	EndTextureMode();

	/* NOTE: regenerate mipmaps only when the output has actually changed */
	if (ctx->flags & GEN_MIPMAPS) {
		/* NOTE: shut up raylib's reporting on mipmap gen */
		SetTraceLogLevel(LOG_NONE);
		GenTextureMipmaps(&ctx->fsctx.output.texture);
		SetTraceLogLevel(LOG_INFO);
		ctx->flags &= ~GEN_MIPMAPS;
	}

	draw_ui(ctx, input);

	if (IsKeyPressed(KEY_R)) {
		ctx->flags |= RELOAD_SHADERS;
		if (ui_can_start_compute(ctx))
			ui_start_compute(ctx);
	}
	if (WindowShouldClose())
		ctx->flags |= SHOULD_EXIT;
}