ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

beamformer.c (58553B)


      1 /* See LICENSE for license details. */
      2 /* TODO(rnp):
      3  * [ ]: filter shader specializations need to be generated per sample mode
      3  *      - performance was measured with a switch on sampling mode and the performance gained
      5  *        is 80% worse than just having a baked in sampling mode
      6  *      - should also include channel mapping just in case
      7  * [ ]: make decode output real values for real inputs and complex values for complex inputs
      8  *      - this means that das should have a RF version and an IQ version
      9  *      - this will also flip the current hack to support demodulate after decode to
     10  *        being a hack to support CudaHilbert after decode
     11  * [ ]: measure performance of doing channel mapping in a separate shader
     12  * [ ]: BeamformWorkQueue -> BeamformerWorkQueue
     13  * [ ]: need to keep track of gpu memory in some way
     14  *      - want to be able to store more than 16 2D frames but limit 3D frames
     15  *      - maybe keep track of how much gpu memory is committed for beamformed images
     16  *        and use that to determine when to loop back over existing textures
     17  *      - to do this maybe use a circular linked list instead of a flat array
     18  *      - then have a way of querying how many frames are available for a specific point count
     19  * [ ]: bug: reinit cuda on hot-reload
     20  */
     21 
     22 #include "beamformer.h"
     23 
     24 global f32 dt_for_frame;
     25 global u32 cycle_t;
     26 
     27 #ifndef _DEBUG
     28 #define start_renderdoc_capture(...)
     29 #define end_renderdoc_capture(...)
     30 #else
     31 global renderdoc_start_frame_capture_fn *start_frame_capture;
     32 global renderdoc_end_frame_capture_fn   *end_frame_capture;
     33 #define start_renderdoc_capture(gl) if (start_frame_capture) start_frame_capture(gl, 0)
     34 #define end_renderdoc_capture(gl)   if (end_frame_capture)   end_frame_capture(gl, 0)
     35 #endif
     36 
/* Iterator over the ring buffer of beamformed output frames; yields up to
 * needed_frames frames starting at slot `offset` (see compute_frame_iterator()
 * and frame_next()). */
typedef struct {
	BeamformerFrame *frames;  /* backing ring buffer */
	u32 capacity;             /* total slots in frames */
	u32 offset;               /* starting slot of the iteration */
	u32 cursor;               /* number of frames yielded so far */
	u32 needed_frames;        /* iteration ends after this many frames */
} ComputeFrameIterator;
     44 
     45 function void
     46 beamformer_compute_plan_release(BeamformerComputeContext *cc, u32 block)
     47 {
     48 	assert(block < countof(cc->compute_plans));
     49 	BeamformerComputePlan *cp = cc->compute_plans[block];
     50 	if (cp) {
     51 		glDeleteBuffers(countof(cp->ubos), cp->ubos);
     52 		glDeleteTextures(countof(cp->textures), cp->textures);
     53 		for (u32 i = 0; i < countof(cp->filters); i++)
     54 			glDeleteTextures(1, &cp->filters[i].texture);
     55 		cc->compute_plans[block] = 0;
     56 		SLLPushFreelist(cp, cc->compute_plan_freelist);
     57 	}
     58 }
     59 
/* Get (or lazily create) the compute plan for parameter block `block`.
 * On first use, the plan's UBOs and 1D lookup textures are allocated and
 * labelled for GL debugging. `arena` is only consumed when a fresh plan
 * struct must be pushed (no freelist entry available). */
function BeamformerComputePlan *
beamformer_compute_plan_for_block(BeamformerComputeContext *cc, u32 block, Arena *arena)
{
	assert(block < countof(cc->compute_plans));
	BeamformerComputePlan *result = cc->compute_plans[block];
	if (!result) {
		/* reuse a previously released plan when possible */
		result = SLLPopFreelist(cc->compute_plan_freelist);
		if (result) zero_struct(result);
		else        result = push_struct(arena, BeamformerComputePlan);
		cc->compute_plans[block] = result;

		glCreateBuffers(countof(result->ubos), result->ubos);

		/* allocate each UBO and label it "TypeName[block]" */
		Stream label = arena_stream(*arena);
		#define X(k, t, ...) \
			glNamedBufferStorage(result->ubos[BeamformerComputeUBOKind_##k], sizeof(t), \
			                     0, GL_DYNAMIC_STORAGE_BIT); \
			stream_append_s8(&label, s8(#t "[")); \
			stream_append_u64(&label, block);     \
			stream_append_byte(&label, ']');      \
			glObjectLabel(GL_BUFFER, result->ubos[BeamformerComputeUBOKind_##k], \
			              label.widx, (c8 *)label.data); \
			label.widx = 0;
		BEAMFORMER_COMPUTE_UBO_LIST
		#undef X

		/* GL internal formats for each texture kind, in list order */
		#define X(_k, t, ...) t,
		GLenum gl_kind[] = {BEAMFORMER_COMPUTE_TEXTURE_LIST};
		#undef X
		read_only local_persist s8 tex_prefix[] = {
			#define X(k, ...) s8_comp(#k "["),
			BEAMFORMER_COMPUTE_TEXTURE_LIST
			#undef X
		};
		/* NOTE(review): Count - 1 skips the final texture kind — presumably
		 * Hadamard, which is (re)allocated as a 2D texture in
		 * update_hadamard_texture(); confirm against the texture list */
		glCreateTextures(GL_TEXTURE_1D, BeamformerComputeTextureKind_Count - 1, result->textures);
		for (u32 i = 0; i < BeamformerComputeTextureKind_Count - 1; i++) {
			/* TODO(rnp): this could be predicated on channel count for this compute plan */
			glTextureStorage1D(result->textures[i], 1, gl_kind[i], BeamformerMaxChannelCount);
			stream_append_s8(&label, tex_prefix[i]);
			stream_append_u64(&label, block);
			stream_append_byte(&label, ']');
			glObjectLabel(GL_TEXTURE, result->textures[i], label.widx, (c8 *)label.data);
			label.widx = 0;
		}
	}
	return result;
}
    107 
/* (Re)generate the filter described by `kind`/`fp` for slot `slot` of
 * parameter block `block`: computes the taps on the CPU, records the
 * filter's group delay in f->time_delay, and uploads the taps to a freshly
 * allocated 1D texture labelled "KindFilter[block][slot]".
 * `arena` is by-value scratch; tap storage does not outlive this call. */
function void
beamformer_filter_update(BeamformerFilter *f, BeamformerFilterKind kind,
                         BeamformerFilterParameters fp, u32 block, u32 slot, Arena arena)
{
	#define X(k, ...) s8_comp(#k "Filter"),
	read_only local_persist s8 filter_kinds[] = {BEAMFORMER_FILTER_KIND_LIST(,)};
	#undef X

	/* build debug label "KindFilter[block][slot]" */
	Stream sb = arena_stream(arena);
	stream_append_s8s(&sb, filter_kinds[kind % countof(filter_kinds)], s8("["));
	stream_append_u64(&sb, block);
	stream_append_s8(&sb, s8("]["));
	stream_append_u64(&sb, slot);
	stream_append_byte(&sb, ']');
	s8 label = arena_stream_commit(&arena, &sb);

	void *filter = 0;
	switch (kind) {
	case BeamformerFilterKind_Kaiser:{
		/* TODO(rnp): this should also support complex */
		/* TODO(rnp): implement this as an IFIR filter instead to reduce computation */
		filter = kaiser_low_pass_filter(&arena, fp.Kaiser.cutoff_frequency, fp.sampling_frequency,
		                                fp.Kaiser.beta, (i32)fp.Kaiser.length);
		f->length     = (i32)fp.Kaiser.length;
		/* linear-phase FIR: group delay is half the filter length in samples */
		f->time_delay = (f32)f->length / 2.0f / fp.sampling_frequency;
	}break;
	case BeamformerFilterKind_MatchedChirp:{
		typeof(fp.MatchedChirp) *mc = &fp.MatchedChirp;
		f32 fs    = fp.sampling_frequency;
		f->length = (i32)(mc->duration * fs);
		/* delay is estimated from the first moment of the chirp's envelope */
		if (fp.complex) {
			filter = baseband_chirp(&arena, mc->min_frequency, mc->max_frequency, fs, f->length, 1, 0.5f);
			f->time_delay = complex_filter_first_moment(filter, f->length, fs);
		} else {
			filter = rf_chirp(&arena, mc->min_frequency, mc->max_frequency, fs, f->length, 1);
			f->time_delay = real_filter_first_moment(filter, f->length, fs);
		}
	}break;
	InvalidDefaultCase;
	}

	f->kind       = kind;
	f->parameters = fp;

	/* upload taps: RG32F (interleaved complex) or R32F (real) */
	glDeleteTextures(1, &f->texture);
	glCreateTextures(GL_TEXTURE_1D, 1, &f->texture);
	glTextureStorage1D(f->texture, 1, fp.complex? GL_RG32F : GL_R32F, f->length);
	glTextureSubImage1D(f->texture, 0, 0, f->length, fp.complex? GL_RG : GL_RED, GL_FLOAT, filter);
	glObjectLabel(GL_TEXTURE, f->texture, (i32)label.len, (c8 *)label.data);
}
    158 
    159 function ComputeFrameIterator
    160 compute_frame_iterator(BeamformerCtx *ctx, u32 start_index, u32 needed_frames)
    161 {
    162 	start_index = start_index % ARRAY_COUNT(ctx->beamform_frames);
    163 
    164 	ComputeFrameIterator result;
    165 	result.frames        = ctx->beamform_frames;
    166 	result.offset        = start_index;
    167 	result.capacity      = ARRAY_COUNT(ctx->beamform_frames);
    168 	result.cursor        = 0;
    169 	result.needed_frames = needed_frames;
    170 	return result;
    171 }
    172 
    173 function BeamformerFrame *
    174 frame_next(ComputeFrameIterator *bfi)
    175 {
    176 	BeamformerFrame *result = 0;
    177 	if (bfi->cursor != bfi->needed_frames) {
    178 		u32 index = (bfi->offset + bfi->cursor++) % bfi->capacity;
    179 		result    = bfi->frames + index;
    180 	}
    181 	return result;
    182 }
    183 
    184 function void
    185 alloc_beamform_frame(GLParams *gp, BeamformerFrame *out, iv3 out_dim, s8 name, Arena arena)
    186 {
    187 	out->dim.x = MAX(1, out_dim.x);
    188 	out->dim.y = MAX(1, out_dim.y);
    189 	out->dim.z = MAX(1, out_dim.z);
    190 
    191 	if (gp) {
    192 		out->dim.x = MIN(out->dim.x, gp->max_3d_texture_dim);
    193 		out->dim.y = MIN(out->dim.y, gp->max_3d_texture_dim);
    194 		out->dim.z = MIN(out->dim.z, gp->max_3d_texture_dim);
    195 	}
    196 
    197 	/* NOTE: allocate storage for beamformed output data;
    198 	 * this is shared between compute and fragment shaders */
    199 	u32 max_dim = (u32)MAX(out->dim.x, MAX(out->dim.y, out->dim.z));
    200 	out->mips   = (i32)ctz_u32(round_up_power_of_2(max_dim)) + 1;
    201 
    202 	Stream label = arena_stream(arena);
    203 	stream_append_s8(&label, name);
    204 	stream_append_byte(&label, '[');
    205 	stream_append_hex_u64(&label, out->id);
    206 	stream_append_byte(&label, ']');
    207 
    208 	glDeleteTextures(1, &out->texture);
    209 	glCreateTextures(GL_TEXTURE_3D, 1, &out->texture);
    210 	glTextureStorage3D(out->texture, out->mips, GL_RG32F, out->dim.x, out->dim.y, out->dim.z);
    211 
    212 	glTextureParameteri(out->texture, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
    213 	glTextureParameteri(out->texture, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
    214 
    215 	LABEL_GL_OBJECT(GL_TEXTURE, out->texture, stream_to_s8(&label));
    216 }
    217 
    218 function void
    219 update_hadamard_texture(BeamformerComputePlan *cp, i32 order, Arena arena)
    220 {
    221 	i32 *hadamard = make_hadamard_transpose(&arena, order);
    222 	if (hadamard) {
    223 		cp->hadamard_order = order;
    224 		u32 *texture = cp->textures + BeamformerComputeTextureKind_Hadamard;
    225 		glDeleteTextures(1, texture);
    226 		glCreateTextures(GL_TEXTURE_2D, 1, texture);
    227 		glTextureStorage2D(*texture, 1, GL_R8I, order, order);
    228 		glTextureSubImage2D(*texture, 0, 0, 0,  order, order, GL_RED_INTEGER, GL_INT, hadamard);
    229 
    230 		Stream label = arena_stream(arena);
    231 		stream_append_s8(&label, s8("Hadamard"));
    232 		stream_append_i64(&label, order);
    233 		LABEL_GL_OBJECT(GL_TEXTURE, *texture, stream_to_s8(&label));
    234 	}
    235 }
    236 
/* (Re)allocate the pair of ping-pong SSBOs used as intermediate storage by
 * the pre-DAS compute stages, then (re)register them with the CUDA library.
 * Called whenever the required decoded data size grows. */
function void
alloc_shader_storage(BeamformerCtx *ctx, u32 decoded_data_size, Arena arena)
{
	BeamformerComputeContext *cc = &ctx->compute_context;
	glDeleteBuffers(countof(cc->ping_pong_ssbos), cc->ping_pong_ssbos);
	glCreateBuffers(countof(cc->ping_pong_ssbos), cc->ping_pong_ssbos);

	cc->ping_pong_ssbo_size = decoded_data_size;

	/* label each buffer "PingPongSSBO[i]"; reuse the common prefix */
	Stream label = arena_stream(arena);
	stream_append_s8(&label, s8("PingPongSSBO["));
	i32 s_widx = label.widx;
	for (i32 i = 0; i < countof(cc->ping_pong_ssbos); i++) {
		glNamedBufferStorage(cc->ping_pong_ssbos[i], (iz)decoded_data_size, 0, 0);
		stream_append_i64(&label, i);
		stream_append_byte(&label, ']');
		LABEL_GL_OBJECT(GL_BUFFER, cc->ping_pong_ssbos[i], stream_to_s8(&label));
		stream_reset(&label, s_widx);
	}

	/* TODO(rnp): (25.08.04) cuda lib is heavily broken atm. First there are multiple RF
	 * buffers and cuda decode shouldn't assume that the data is coming from the rf_buffer
	 * ssbo. Second each parameter block may need a different hadamard matrix so ideally
	 * decode should just take the texture as a parameter. Third, none of these dimensions
	 * need to be pre-known by the library unless its allocating GPU memory which it shouldn't
	 * need to do. For now grab out of parameter block 0 but it is not correct */
	BeamformerParameterBlock *pb = beamformer_parameter_block(ctx->shared_memory.region, 0);
	/* NOTE(rnp): these are stubs when CUDA isn't supported */
	cuda_register_buffers(cc->ping_pong_ssbos, countof(cc->ping_pong_ssbos), cc->rf_buffer.ssbo);
	u32 decoded_data_dimension[3] = {pb->parameters.sample_count, pb->parameters.channel_count, pb->parameters.acquisition_count};
	cuda_init(pb->parameters.raw_data_dimensions, decoded_data_dimension);
}
    269 
    270 function void
    271 push_compute_timing_info(ComputeTimingTable *t, ComputeTimingInfo info)
    272 {
    273 	u32 index = atomic_add_u32(&t->write_index, 1) % countof(t->buffer);
    274 	t->buffer[index] = info;
    275 }
    276 
    277 function b32
    278 fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, BeamformerViewPlaneTag plane,
    279                         u32 parameter_block, b32 indirect)
    280 {
    281 	b32 result = 0;
    282 	if (work) {
    283 		result = 1;
    284 		u32 frame_id    = atomic_add_u32(&ctx->next_render_frame_index, 1);
    285 		u32 frame_index = frame_id % countof(ctx->beamform_frames);
    286 		work->kind      = indirect? BeamformerWorkKind_ComputeIndirect : BeamformerWorkKind_Compute;
    287 		work->lock      = BeamformerSharedMemoryLockKind_DispatchCompute;
    288 		work->compute_context.parameter_block = parameter_block;
    289 		work->compute_context.frame = ctx->beamform_frames + frame_index;
    290 		work->compute_context.frame->ready_to_present = 0;
    291 		work->compute_context.frame->view_plane_tag   = plane;
    292 		work->compute_context.frame->id               = frame_id;
    293 	}
    294 	return result;
    295 }
    296 
/* Accumulate in_texture_count input textures into out_texture with the Sum
 * compute shader, pre-scaling each input by in_scale (used for frame
 * averaging). out_data_dim is the output volume size in voxels. */
function void
do_sum_shader(BeamformerComputeContext *cc, u32 *in_textures, u32 in_texture_count, f32 in_scale,
              u32 out_texture, iv3 out_data_dim)
{
	/* NOTE: zero output before summing */
	glClearTexImage(out_texture, 0, GL_RED, GL_FLOAT, 0);
	glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT);

	/* layered = GL_TRUE so the whole 3D texture is bound, not one slice */
	glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F);
	glProgramUniform1f(cc->programs[BeamformerShaderKind_Sum], SUM_PRESCALE_UNIFORM_LOC, in_scale);
	for (u32 i = 0; i < in_texture_count; i++) {
		glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F);
		glDispatchCompute(ORONE((u32)out_data_dim.x / 32u),
		                  ORONE((u32)out_data_dim.y),
		                  ORONE((u32)out_data_dim.z / 32u));
		/* barrier between dispatches: each pass read-modify-writes the
		 * accumulator image, so image accesses must be ordered */
		glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
	}
}
    315 
/* Tracks progress when a large DAS output volume is split into multiple
 * bounded compute dispatches (see start_compute_cursor()/step_compute_cursor()). */
struct compute_cursor {
	iv3 cursor;              /* current step index along each axis */
	uv3 dispatch;            /* workgroup counts used per dispatch */
	iv3 target;              /* steps required per axis to cover the volume */
	u32 points_per_dispatch; /* voxels covered by one dispatch */
	u32 completed_points;    /* voxels dispatched so far */
	u32 total_points;        /* total voxels in the output volume */
};
    324 
    325 function struct compute_cursor
    326 start_compute_cursor(iv3 dim, u32 max_points)
    327 {
    328 	struct compute_cursor result = {0};
    329 	u32 invocations_per_dispatch = DAS_LOCAL_SIZE_X * DAS_LOCAL_SIZE_Y * DAS_LOCAL_SIZE_Z;
    330 
    331 	result.dispatch.y = MIN(max_points / invocations_per_dispatch, (u32)ceil_f32((f32)dim.y / DAS_LOCAL_SIZE_Y));
    332 
    333 	u32 remaining     = max_points / result.dispatch.y;
    334 	result.dispatch.x = MIN(remaining / invocations_per_dispatch, (u32)ceil_f32((f32)dim.x / DAS_LOCAL_SIZE_X));
    335 	result.dispatch.z = MIN(remaining / (invocations_per_dispatch * result.dispatch.x),
    336 	                        (u32)ceil_f32((f32)dim.z / DAS_LOCAL_SIZE_Z));
    337 
    338 	result.target.x = MAX(dim.x / (i32)result.dispatch.x / DAS_LOCAL_SIZE_X, 1);
    339 	result.target.y = MAX(dim.y / (i32)result.dispatch.y / DAS_LOCAL_SIZE_Y, 1);
    340 	result.target.z = MAX(dim.z / (i32)result.dispatch.z / DAS_LOCAL_SIZE_Z, 1);
    341 
    342 	result.points_per_dispatch = 1;
    343 	result.points_per_dispatch *= result.dispatch.x * DAS_LOCAL_SIZE_X;
    344 	result.points_per_dispatch *= result.dispatch.y * DAS_LOCAL_SIZE_Y;
    345 	result.points_per_dispatch *= result.dispatch.z * DAS_LOCAL_SIZE_Z;
    346 
    347 	result.total_points = (u32)(dim.x * dim.y * dim.z);
    348 
    349 	return result;
    350 }
    351 
    352 function iv3
    353 step_compute_cursor(struct compute_cursor *cursor)
    354 {
    355 	cursor->cursor.x += 1;
    356 	if (cursor->cursor.x >= cursor->target.x) {
    357 		cursor->cursor.x  = 0;
    358 		cursor->cursor.y += 1;
    359 		if (cursor->cursor.y >= cursor->target.y) {
    360 			cursor->cursor.y  = 0;
    361 			cursor->cursor.z += 1;
    362 		}
    363 	}
    364 
    365 	cursor->completed_points += cursor->points_per_dispatch;
    366 
    367 	iv3 result = cursor->cursor;
    368 	result.x *= (i32)cursor->dispatch.x * DAS_LOCAL_SIZE_X;
    369 	result.y *= (i32)cursor->dispatch.y * DAS_LOCAL_SIZE_Y;
    370 	result.z *= (i32)cursor->dispatch.z * DAS_LOCAL_SIZE_Z;
    371 
    372 	return result;
    373 }
    374 
    375 function b32
    376 compute_cursor_finished(struct compute_cursor *cursor)
    377 {
    378 	b32 result = cursor->completed_points >= cursor->total_points;
    379 	return result;
    380 }
    381 
/* Build the matrix mapping integer voxel indices to world-space coordinates
 * for the DAS shader. Composition: result = R * T2 * S * T1, where T1
 * centers the voxel grid on the origin, S scales voxels to meters, T2 moves
 * to the center of the requested output region and R optionally rotates
 * the imaging plane. */
function m4
das_voxel_transform_matrix(BeamformerParameters *bp)
{
	v3 min = v3_from_f32_array(bp->output_min_coordinate);
	v3 max = v3_from_f32_array(bp->output_max_coordinate);
	v3 extent = v3_abs(v3_sub(max, min));
	v3 points = {{(f32)bp->output_points[0], (f32)bp->output_points[1], (f32)bp->output_points[2]}};

	/* T1: shift voxel indices so the grid is centered on the origin */
	m4 T1 = m4_translation(v3_scale(v3_sub(points, (v3){{1.0f, 1.0f, 1.0f}}), -0.5f));
	/* T2: translate to the center of the output region */
	m4 T2 = m4_translation(v3_add(min, v3_scale(extent, 0.5f)));
	/* S: voxel index -> meters */
	m4 S  = m4_scale(v3_div(extent, points));

	m4 R;
	switch (bp->das_shader_id) {
	case DASShaderKind_FORCES:
	case DASShaderKind_UFORCES:
	case DASShaderKind_Flash:
	{
		/* planar schemes: collapse the y (elevation) axis */
		R = m4_identity();
		S.c[1].E[1]  = 0;
		T2.c[3].E[1] = 0;
	}break;
	case DASShaderKind_HERCULES:
	case DASShaderKind_UHERCULES:
	case DASShaderKind_RCA_TPW:
	case DASShaderKind_RCA_VLS:
	{
		R = m4_rotation_about_z(bp->beamform_plane ? 0.0f : 0.25f);
		/* 2D slice of a 3D scheme: position it at the requested off-axis offset */
		if (!(points.x > 1 && points.y > 1 && points.z > 1))
			T2.c[3].E[1] = bp->off_axis_pos;
	}break;
	default:{ R = m4_identity(); }break;
	}
	m4 result = m4_mul(R, m4_mul(T2, m4_mul(S, T1)));
	return result;
}
    418 
    419 function void
    420 das_ubo_from_beamformer_parameters(BeamformerDASUBO *du, BeamformerParameters *bp)
    421 {
    422 	du->voxel_transform = das_voxel_transform_matrix(bp);
    423 	mem_copy(du->xdc_transform.E,     bp->xdc_transform,     sizeof(du->xdc_transform));
    424 	mem_copy(du->xdc_element_pitch.E, bp->xdc_element_pitch, sizeof(du->xdc_element_pitch));
    425 	du->sampling_frequency     = bp->sampling_frequency;
    426 	du->demodulation_frequency = bp->demodulation_frequency;
    427 	du->speed_of_sound         = bp->speed_of_sound;
    428 	du->time_offset            = bp->time_offset;
    429 	du->f_number               = bp->f_number;
    430 	du->shader_kind            = bp->das_shader_id;
    431 	du->sample_count           = bp->sample_count;
    432 	du->channel_count          = bp->channel_count;
    433 	du->acquisition_count      = bp->acquisition_count;
    434 
    435 	du->shader_flags = 0;
    436 	if (bp->interpolate)         du->shader_flags |= DASShaderFlags_Interpolate;
    437 	if (bp->coherency_weighting) du->shader_flags |= DASShaderFlags_CoherencyWeighting;
    438 	if (bp->transmit_mode == BeamformerTransmitMode_Columns)
    439 		du->shader_flags |= DASShaderFlags_TxColumns;
    440 	if (bp->receive_mode == BeamformerReceiveMode_Columns)
    441 		du->shader_flags |= DASShaderFlags_RxColumns;
    442 }
    443 
/* Translate a user parameter block into a concrete compute pipeline for
 * this plan: selects shader specializations based on data kind and stage
 * ordering, fills the decode/demodulate/filter/DAS UBO data, and computes
 * the dispatch sizes for each stage. Called whenever the Parameters or
 * ComputePipeline region of the parameter block is dirty. */
function void
plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
{
	BeamformerDASUBO *bp = &cp->das_ubo_data;

	das_ubo_from_beamformer_parameters(bp, &pb->parameters);

	b32 decode_first = pb->pipeline.shaders[0] == BeamformerShaderKind_Decode;
	b32 run_cuda_hilbert = 0;
	b32 demodulate       = 0;

	/* scan the requested pipeline for stages that affect planning */
	for (u32 i = 0; i < pb->pipeline.shader_count; i++) {
		switch (pb->pipeline.shaders[i]) {
		case BeamformerShaderKind_CudaHilbert:{ run_cuda_hilbert = 1; }break;
		case BeamformerShaderKind_Demodulate:{  demodulate = 1;       }break;
		default:{}break;
		}
	}

	/* demodulation already produces complex data; hilbert would be redundant */
	if (demodulate) run_cuda_hilbert = 0;

	/* rebuild the committed pipeline, specializing each stage's shader */
	BeamformerDataKind data_kind = pb->pipeline.data_kind;
	cp->pipeline.shader_count = 0;
	for (u32 i = 0; i < pb->pipeline.shader_count; i++) {
		BeamformerShaderParameters *sp = pb->pipeline.parameters + i;
		u32 shader = pb->pipeline.shaders[i];
		b32 commit = 0;

		switch (shader) {
		case BeamformerShaderKind_CudaHilbert:{ commit = run_cuda_hilbert; }break;
		case BeamformerShaderKind_Decode:{
			/* pick the decode specialization matching the input sample format */
			BeamformerShaderKind decode_table[] = {
				[BeamformerDataKind_Int16]          = BeamformerShaderKind_Decode,
				[BeamformerDataKind_Int16Complex]   = BeamformerShaderKind_DecodeInt16Complex,
				[BeamformerDataKind_Float32]        = BeamformerShaderKind_DecodeFloat,
				[BeamformerDataKind_Float32Complex] = BeamformerShaderKind_DecodeFloatComplex,
			};
			if (decode_first && demodulate) {
				/* TODO(rnp): for now we assume that if we are demodulating the data is int16 */
				shader = BeamformerShaderKind_DecodeInt16ToFloat;
			} else if (decode_first) {
				shader = decode_table[CLAMP(data_kind, 0, countof(decode_table) - 1)];
			} else {
				/* decoding after demodulation: input is always complex */
				if (data_kind == BeamformerDataKind_Int16)
					shader = BeamformerShaderKind_DecodeInt16Complex;
				else
					shader = BeamformerShaderKind_DecodeFloatComplex;
			}
			commit = 1;
		}break;
		case BeamformerShaderKind_Demodulate:{
			/* specialize on input layout and real/complex filter taps;
			 * the filter's group delay shifts the DAS time offset */
			BeamformerFilter *f = cp->filters + sp->filter_slot;
			if (decode_first || (!decode_first && data_kind == BeamformerDataKind_Float32)) {
				if (f->parameters.complex) shader = BeamformerShaderKind_DemodulateFloatCF;
				else                       shader = BeamformerShaderKind_DemodulateFloat;
			} else if (f->parameters.complex) {
				shader = BeamformerShaderKind_DemodulateCF;
			}
			bp->time_offset += f->time_delay;
			commit = 1;
		}break;
		case BeamformerShaderKind_Filter:{
			BeamformerFilter *f = cp->filters + sp->filter_slot;
			if (f->parameters.complex) shader = BeamformerShaderKind_FilterCF;
			bp->time_offset += f->time_delay;
			commit = 1;
		}break;
		case BeamformerShaderKind_DAS:{
			/* fast path is valid when coherency weighting is off */
			if ((bp->shader_flags & DASShaderFlags_CoherencyWeighting) == 0)
				shader = BeamformerShaderKind_DASFast;
			commit = 1;
		}break;
		default:{ commit = 1; }break;
		}

		if (commit) {
			u32 index = cp->pipeline.shader_count++;
			cp->pipeline.shaders[index]    = shader;
			cp->pipeline.parameters[index] = *sp;
		}
	}
	cp->pipeline.data_kind = data_kind;

	/* strides of the decoded (DAS input) data layout: sample-major,
	 * transmits within a channel, channels outermost */
	u32 das_sample_stride   = 1;
	u32 das_transmit_stride = bp->sample_count;
	u32 das_channel_stride  = bp->acquisition_count * bp->sample_count;

	/* demodulation halves the sample count (RF->IQ) and decimates */
	u32 decimation_rate = MAX(pb->parameters.decimation_rate, 1);
	if (demodulate) {
		das_channel_stride  /= (2 * decimation_rate);
		das_transmit_stride /= (2 * decimation_rate);
	}

	u32 input_sample_stride   = 1;
	u32 input_transmit_stride = bp->sample_count;
	u32 input_channel_stride  = pb->parameters.raw_data_dimensions[0];

	BeamformerDecodeUBO *dp = &cp->decode_ubo_data;
	dp->decode_mode    = pb->parameters.decode;
	dp->transmit_count = bp->acquisition_count;

	/* when decode runs first it reads raw layout; otherwise it reads the
	 * demodulator's output layout */
	dp->input_sample_stride    = decode_first? input_sample_stride   : bp->acquisition_count;
	dp->input_channel_stride   = decode_first? input_channel_stride  : das_channel_stride;
	dp->input_transmit_stride  = decode_first? input_transmit_stride : 1;
	dp->output_sample_stride   = das_sample_stride;
	dp->output_channel_stride  = das_channel_stride;
	dp->output_transmit_stride = das_transmit_stride;
	if (decode_first) {
		dp->output_channel_stride  *= decimation_rate;
		dp->output_transmit_stride *= decimation_rate;
	}

	if (!demodulate) bp->demodulation_frequency = 0;

	cp->decode_dispatch.x = (u32)ceil_f32((f32)bp->sample_count      / DECODE_LOCAL_SIZE_X);
	cp->decode_dispatch.y = (u32)ceil_f32((f32)bp->channel_count     / DECODE_LOCAL_SIZE_Y);
	cp->decode_dispatch.z = (u32)ceil_f32((f32)bp->acquisition_count / DECODE_LOCAL_SIZE_Z);

	/* NOTE(rnp): decode 2 samples per dispatch when data is i16 */
	if (decode_first && data_kind == BeamformerDataKind_Int16)
		cp->decode_dispatch.x = (u32)ceil_f32((f32)cp->decode_dispatch.x / 2);

	/* NOTE(rnp): when we are demodulating we pretend that the sampler was alternating
	 * between sampling the I portion and the Q portion of an IQ signal. Therefore there
	 * is an implicit decimation factor of 2 which must always be included. All code here
	 * assumes that the signal was sampled in such a way that supports this operation.
	 * To recover IQ[n] from the sampled data (RF[n]) we do the following:
	 *   I[n]  = RF[n]
	 *   Q[n]  = RF[n + 1]
	 *   IQ[n] = I[n] - j*Q[n]
	 */
	if (demodulate) {
		BeamformerFilterUBO *mp    = &cp->demod_ubo_data;
		mp->demodulation_frequency = bp->demodulation_frequency;
		mp->sampling_frequency     = bp->sampling_frequency / 2;
		mp->decimation_rate        = decimation_rate;

		/* demodulating raw data: fold channel mapping into this stage */
		if (!decode_first) mp->shader_flags |= FilterShaderFlags_MapChannels;

		/* effective rate/count after IQ conversion and decimation */
		bp->sampling_frequency /= 2 * (f32)mp->decimation_rate;
		bp->sample_count       /= 2 * mp->decimation_rate;

		if (decode_first) {
			mp->input_channel_stride  = dp->output_channel_stride;
			mp->input_sample_stride   = dp->output_sample_stride;
			mp->input_transmit_stride = dp->output_transmit_stride;

			mp->output_channel_stride  = das_channel_stride;
			mp->output_sample_stride   = das_sample_stride;
			mp->output_transmit_stride = das_transmit_stride;
		} else {
			mp->input_channel_stride  = input_channel_stride  / 2;
			mp->input_sample_stride   = input_sample_stride;
			mp->input_transmit_stride = input_transmit_stride / 2;

			/* NOTE(rnp): output optimized layout for decoding */
			mp->output_channel_stride  = dp->input_channel_stride;
			mp->output_sample_stride   = dp->input_sample_stride;
			mp->output_transmit_stride = dp->input_transmit_stride;

			cp->decode_dispatch.x = (u32)ceil_f32((f32)bp->sample_count / DECODE_LOCAL_SIZE_X);
		}
	}

	/* TODO(rnp): filter may need a different dispatch layout */
	cp->demod_dispatch.x = (u32)ceil_f32((f32)bp->sample_count      / FILTER_LOCAL_SIZE_X);
	cp->demod_dispatch.y = (u32)ceil_f32((f32)bp->channel_count     / FILTER_LOCAL_SIZE_Y);
	cp->demod_dispatch.z = (u32)ceil_f32((f32)bp->acquisition_count / FILTER_LOCAL_SIZE_Z);

	/* TODO(rnp): if IQ (* 8) else (* 4) */
	cp->rf_size = bp->sample_count * bp->channel_count * bp->acquisition_count * 8;

	/* TODO(rnp): UBO per filter stage */
	BeamformerFilterUBO *flt = &cp->filter_ubo_data;
	flt->demodulation_frequency = bp->demodulation_frequency;
	flt->sampling_frequency     = bp->sampling_frequency;
	flt->decimation_rate        = 1;
	flt->shader_flags           = pb->parameters.sampling_mode & FilterShaderFlags_SamplingModeMask;
	flt->output_channel_stride  = bp->sample_count * bp->acquisition_count;
	flt->output_sample_stride   = 1;
	flt->output_transmit_stride = bp->sample_count;
	flt->input_channel_stride   = bp->sample_count * bp->acquisition_count;
	flt->input_sample_stride    = 1;
	flt->input_transmit_stride  = bp->sample_count;
}
    629 
/* Apply every dirty region of shared-memory parameter block `block` to its
 * GPU-side compute plan: replan the pipeline, upload UBOs, grow shared
 * storage and the averaged-frame textures as needed, and refresh the 1D
 * lookup textures. The parameter block lock is held for the duration. */
function void
beamformer_commit_parameter_block(BeamformerCtx *ctx, BeamformerComputePlan *cp, u32 block, Arena arena)
{
	BeamformerParameterBlock *pb = beamformer_parameter_block_lock(&ctx->shared_memory, block, -1);
	/* iterate set dirty bits; ctz_u32 returns 32 once none remain */
	for (u32 region = ctz_u32(pb->dirty_regions);
	     region != 32;
	     region = ctz_u32(pb->dirty_regions))
	{
		mark_parameter_block_region_clean(ctx->shared_memory.region, block, region);
		switch (region) {
		case BeamformerParameterBlockRegion_ComputePipeline:
		case BeamformerParameterBlockRegion_Parameters:
		{
			plan_compute_pipeline(cp, pb);

			/* NOTE(rnp): these are both handled by plan_compute_pipeline() */
			u32 mask = 1 << BeamformerParameterBlockRegion_ComputePipeline |
			           1 << BeamformerParameterBlockRegion_Parameters;
			pb->dirty_regions &= ~mask;

			/* upload every UBO the plan just recomputed */
			#define X(k, t, v) glNamedBufferSubData(cp->ubos[BeamformerComputeUBOKind_##k], \
			                                        0, sizeof(t), &cp->v ## _ubo_data);
			BEAMFORMER_COMPUTE_UBO_LIST
			#undef X

			/* grow the shared ping-pong SSBOs when the decoded (complex f32)
			 * data no longer fits */
			u32 samples      = pb->parameters.sample_count;
			u32 channels     = pb->parameters.channel_count;
			u32 acquisitions = pb->parameters.acquisition_count;
			u32 decoded_data_size = (u32)(2 * sizeof(f32) * samples * channels * acquisitions);
			if (ctx->compute_context.ping_pong_ssbo_size < decoded_data_size)
				alloc_shader_storage(ctx, decoded_data_size, arena);

			/* Hadamard matrix order tracks the acquisition count */
			if (cp->hadamard_order != (i32)cp->das_ubo_data.acquisition_count)
				update_hadamard_texture(cp, (i32)cp->das_ubo_data.acquisition_count, arena);

			cp->min_coordinate = v3_from_f32_array(pb->parameters.output_min_coordinate);
			cp->max_coordinate = v3_from_f32_array(pb->parameters.output_max_coordinate);

			cp->output_points.E[0] = MAX(pb->parameters.output_points[0], 1);
			cp->output_points.E[1] = MAX(pb->parameters.output_points[1], 1);
			cp->output_points.E[2] = MAX(pb->parameters.output_points[2], 1);
			/* 4th component carries the frame-averaging count */
			cp->average_frames     = pb->parameters.output_points[3];

			if (cp->average_frames > 1 && !iv3_equal(cp->output_points, ctx->averaged_frames[0].dim)) {
				alloc_beamform_frame(&ctx->gl, ctx->averaged_frames + 0, cp->output_points, s8("Averaged Frame"), arena);
				alloc_beamform_frame(&ctx->gl, ctx->averaged_frames + 1, cp->output_points, s8("Averaged Frame"), arena);
			}
		}break;
		case BeamformerParameterBlockRegion_ChannelMapping:
		case BeamformerParameterBlockRegion_FocalVectors:
		case BeamformerParameterBlockRegion_SparseElements:
		{
			/* these regions are plain arrays mirrored into 1D textures */
			BeamformerComputeTextureKind texture_kind = 0;
			u32 texture_type = 0, texture_format = 0;
			/* TODO(rnp): this whole thing could be a table */
			switch (region) {
			case BeamformerParameterBlockRegion_ChannelMapping:{
				texture_kind   = BeamformerComputeTextureKind_ChannelMapping;
				texture_type   = GL_SHORT;
				texture_format = GL_RED_INTEGER;
				/* TODO(rnp): cuda lib */
				cuda_set_channel_mapping(pb->channel_mapping);
			}break;
			case BeamformerParameterBlockRegion_FocalVectors:{
				texture_kind   = BeamformerComputeTextureKind_FocalVectors;
				texture_type   = GL_FLOAT;
				texture_format = GL_RG;
			}break;
			case BeamformerParameterBlockRegion_SparseElements:{
				texture_kind   = BeamformerComputeTextureKind_SparseElements;
				texture_type   = GL_SHORT;
				texture_format = GL_RED_INTEGER;
			}break;
			InvalidDefaultCase;
			}
			glTextureSubImage1D(cp->textures[texture_kind], 0, 0, BeamformerMaxChannelCount,
			                    texture_format, texture_type,
			                    (u8 *)pb + BeamformerParameterBlockRegionOffsets[region]);
		}break;
		}
	}
	beamformer_parameter_block_unlock(&ctx->shared_memory, block);
}
    713 
/* Dispatches a single stage of the compute pipeline for `frame`.
 *
 * ctx    - global beamformer context (owns programs, ping-pong SSBOs, timers)
 * cp     - compute plan holding UBOs, lookup textures, and dispatch dimensions
 * frame  - output frame; only written by the DAS/MinMax/Sum stages
 * shader - which stage to run; selects the GL program and the binding setup
 * sp     - per-stage parameters (currently only the filter slot is read here)
 * arena  - scratch memory for transient allocations (Sum stage texture list)
 *
 * Data flows through two ping-pong SSBOs: each stage that produces output
 * reads from cc->last_output_ssbo_index and writes to the other buffer,
 * then flips last_output_ssbo_index for the next stage. */
function void
do_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, BeamformerFrame *frame,
                  BeamformerShaderKind shader, BeamformerShaderParameters *sp, Arena arena)
{
	BeamformerComputeContext *cc = &ctx->compute_context;

	u32 program = cc->programs[shader];
	glUseProgram(program);

	u32 output_ssbo_idx = !cc->last_output_ssbo_index;
	u32 input_ssbo_idx  = cc->last_output_ssbo_index;

	switch (shader) {
	case BeamformerShaderKind_Decode:
	case BeamformerShaderKind_DecodeInt16Complex:
	case BeamformerShaderKind_DecodeFloat:
	case BeamformerShaderKind_DecodeFloatComplex:
	case BeamformerShaderKind_DecodeInt16ToFloat:
	{
		glBindBufferBase(GL_UNIFORM_BUFFER, 0, cp->ubos[BeamformerComputeUBOKind_Decode]);
		glBindImageTexture(0, cp->textures[BeamformerComputeTextureKind_Hadamard], 0, 0, 0, GL_READ_ONLY, GL_R8I);

		/* NOTE: when decode is the first pipeline stage the raw RF data (bound at
		 * SSBO binding 1 by the caller) must first be channel-remapped into a
		 * ping-pong buffer; that extra pass writes to binding 2 with
		 * u_first_pass = true before the main decode pass below runs */
		if (shader == cp->pipeline.shaders[0]) {
			glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, cc->ping_pong_ssbos[input_ssbo_idx]);
			glBindImageTexture(1, cp->textures[BeamformerComputeTextureKind_ChannelMapping], 0, 0, 0, GL_READ_ONLY, GL_R16I);
			glProgramUniform1ui(program, DECODE_FIRST_PASS_UNIFORM_LOC, 1);

			glDispatchCompute(cp->decode_dispatch.x, cp->decode_dispatch.y, cp->decode_dispatch.z);
			glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
		}

		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, cc->ping_pong_ssbos[input_ssbo_idx]);
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, cc->ping_pong_ssbos[output_ssbo_idx]);

		glProgramUniform1ui(program, DECODE_FIRST_PASS_UNIFORM_LOC, 0);

		glDispatchCompute(cp->decode_dispatch.x, cp->decode_dispatch.y, cp->decode_dispatch.z);
		glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);

		cc->last_output_ssbo_index = !cc->last_output_ssbo_index;
	}break;
	case BeamformerShaderKind_CudaDecode:{
		/* NOTE: decoding delegated to the CUDA library; only the output slot is passed */
		cuda_decode(0, output_ssbo_idx, 0);
		cc->last_output_ssbo_index = !cc->last_output_ssbo_index;
	}break;
	case BeamformerShaderKind_CudaHilbert:{
		cuda_hilbert(input_ssbo_idx, output_ssbo_idx);
		cc->last_output_ssbo_index = !cc->last_output_ssbo_index;
	}break;
	case BeamformerShaderKind_Filter:
	case BeamformerShaderKind_FilterCF:
	case BeamformerShaderKind_Demodulate:
	case BeamformerShaderKind_DemodulateCF:
	case BeamformerShaderKind_DemodulateFloat:
	case BeamformerShaderKind_DemodulateFloatCF:
	{
		/* NOTE(review): only the plain Filter kind selects the filter UBO; FilterCF
		 * falls through to the demodulate UBO and UBO index below — confirm that is
		 * intended and not an oversight */
		BeamformerFilterUBO *ubo = &cp->demod_ubo_data;
		if (shader == BeamformerShaderKind_Filter)
			ubo = &cp->filter_ubo_data;

		u32 index = shader == BeamformerShaderKind_Filter ? BeamformerComputeUBOKind_Filter
		                                                  : BeamformerComputeUBOKind_Demodulate;
		glBindBufferBase(GL_UNIFORM_BUFFER,        0, cp->ubos[index]);
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, cc->ping_pong_ssbos[output_ssbo_idx]);
		/* NOTE: when channel mapping happens in this stage the input is the raw RF
		 * SSBO already bound at binding 1 by the caller; otherwise read ping-pong */
		if ((ubo->shader_flags & FilterShaderFlags_MapChannels) == 0)
			glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, cc->ping_pong_ssbos[input_ssbo_idx]);

		GLenum kind = cp->filters[sp->filter_slot].parameters.complex? GL_RG32F : GL_R32F;
		glBindImageTexture(0, cp->filters[sp->filter_slot].texture, 0, 0, 0, GL_READ_ONLY, kind);
		if (ubo->shader_flags & FilterShaderFlags_MapChannels)
			glBindImageTexture(1, cp->textures[BeamformerComputeTextureKind_ChannelMapping], 0, 0, 0, GL_READ_ONLY, GL_R16I);

		glDispatchCompute(cp->demod_dispatch.x, cp->demod_dispatch.y, cp->demod_dispatch.z);
		glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);

		cc->last_output_ssbo_index = !cc->last_output_ssbo_index;
	}break;
	case BeamformerShaderKind_MinMax:{
		/* NOTE: builds a min/max mip pyramid on the frame texture, one level at a
		 * time, reading mip i-1 and writing mip i */
		for (i32 i = 1; i < frame->mips; i++) {
			glBindImageTexture(0, frame->texture, i - 1, GL_TRUE, 0, GL_READ_ONLY,  GL_RG32F);
			glBindImageTexture(1, frame->texture, i - 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
			glProgramUniform1i(cc->programs[shader], MIN_MAX_MIPS_LEVEL_UNIFORM_LOC, i);

			u32 width  = (u32)frame->dim.x >> i;
			u32 height = (u32)frame->dim.y >> i;
			u32 depth  = (u32)frame->dim.z >> i;
			glDispatchCompute(ORONE(width / 32), ORONE(height), ORONE(depth / 32));
			glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
		}
	}break;
	case BeamformerShaderKind_DAS:
	case BeamformerShaderKind_DASFast:
	{
		BeamformerDASUBO *ubo = &cp->das_ubo_data;
		/* NOTE: the fast path accumulates into the frame texture across multiple
		 * dispatches, so it must be cleared and bound read-write up front */
		if (shader == BeamformerShaderKind_DASFast) {
			glClearTexImage(frame->texture, 0, GL_RED, GL_FLOAT, 0);
			glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT);
			glBindImageTexture(0, frame->texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F);
		} else {
			glBindImageTexture(0, frame->texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
		}

		glBindBufferBase(GL_UNIFORM_BUFFER, 0, cp->ubos[BeamformerComputeUBOKind_DAS]);
		glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, cc->ping_pong_ssbos[input_ssbo_idx], 0, cp->rf_size);
		glBindImageTexture(1, cp->textures[BeamformerComputeTextureKind_SparseElements], 0, 0, 0, GL_READ_ONLY, GL_R16I);
		glBindImageTexture(2, cp->textures[BeamformerComputeTextureKind_FocalVectors],   0, 0, 0, GL_READ_ONLY, GL_RG32F);

		glProgramUniform1ui(program, DAS_CYCLE_T_UNIFORM_LOC, cycle_t++);

		if (shader == BeamformerShaderKind_DASFast) {
			i32 loop_end;
			if (ubo->shader_kind == DASShaderKind_RCA_VLS ||
			    ubo->shader_kind == DASShaderKind_RCA_TPW)
			{
				/* NOTE(rnp): to avoid repeatedly sampling the whole focal vectors
				 * texture we loop over transmits for VLS/TPW */
				loop_end = (i32)ubo->acquisition_count;
			} else {
				loop_end = (i32)ubo->channel_count;
			}
			f32 percent_per_step = 1.0f / (f32)loop_end;
			cc->processing_progress = -percent_per_step;
			for (i32 index = 0; index < loop_end; index++) {
				cc->processing_progress += percent_per_step;
				/* IMPORTANT(rnp): prevents OS from coalescing and killing our shader */
				glFinish();
				glProgramUniform1i(program, DAS_FAST_CHANNEL_UNIFORM_LOC, index);
				glDispatchCompute((u32)ceil_f32((f32)frame->dim.x / DAS_FAST_LOCAL_SIZE_X),
				                  (u32)ceil_f32((f32)frame->dim.y / DAS_FAST_LOCAL_SIZE_Y),
				                  (u32)ceil_f32((f32)frame->dim.z / DAS_FAST_LOCAL_SIZE_Z));
				glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
			}
		} else {
			#if 1
			/* TODO(rnp): compute max_points_per_dispatch based on something like a
			 * transmit_count * channel_count product */
			u32 max_points_per_dispatch = KB(64);
			struct compute_cursor cursor = start_compute_cursor(frame->dim, max_points_per_dispatch);
			f32 percent_per_step = (f32)cursor.points_per_dispatch / (f32)cursor.total_points;
			cc->processing_progress = -percent_per_step;
			for (iv3 offset = {0};
			     !compute_cursor_finished(&cursor);
			     offset = step_compute_cursor(&cursor))
			{
				cc->processing_progress += percent_per_step;
				/* IMPORTANT(rnp): prevents OS from coalescing and killing our shader */
				glFinish();
				glProgramUniform3iv(program, DAS_VOXEL_OFFSET_UNIFORM_LOC, 1, offset.E);
				glDispatchCompute(cursor.dispatch.x, cursor.dispatch.y, cursor.dispatch.z);
			}
			#else
			/* NOTE(rnp): use this for testing tiling code. The performance of the above path
			 * should be the same as this path if everything is working correctly */
			iv3 compute_dim_offset = {0};
			glProgramUniform3iv(program, DAS_VOXEL_OFFSET_UNIFORM_LOC, 1, compute_dim_offset.E);
			glDispatchCompute((u32)ceil_f32((f32)dim.x / DAS_LOCAL_SIZE_X),
			                  (u32)ceil_f32((f32)dim.y / DAS_LOCAL_SIZE_Y),
			                  (u32)ceil_f32((f32)dim.z / DAS_LOCAL_SIZE_Z));
			#endif
		}
		glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT|GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
	}break;
	case BeamformerShaderKind_Sum:{
		/* NOTE: rolling average — sums the last `average_frames` beamformed frames
		 * (ending at `frame`) into the next averaged-frame slot with a 1/N prescale */
		u32 aframe_index = ctx->averaged_frame_index % ARRAY_COUNT(ctx->averaged_frames);
		BeamformerFrame *aframe = ctx->averaged_frames + aframe_index;
		aframe->id              = ctx->averaged_frame_index;
		atomic_store_u32(&aframe->ready_to_present, 0);
		/* TODO(rnp): hack we need a better way of specifying which frames to sum;
		 * this is fine for rolling averaging but what if we want to do something else */
		assert(frame >= ctx->beamform_frames);
		assert(frame < ctx->beamform_frames + countof(ctx->beamform_frames));
		u32 base_index   = (u32)(frame - ctx->beamform_frames);
		u32 to_average   = (u32)cp->average_frames;
		u32 frame_count  = 0;
		u32 *in_textures = push_array(&arena, u32, BeamformerMaxSavedFrames);
		ComputeFrameIterator cfi = compute_frame_iterator(ctx, 1 + base_index - to_average, to_average);
		for (BeamformerFrame *it = frame_next(&cfi); it; it = frame_next(&cfi))
			in_textures[frame_count++] = it->texture;

		assert(to_average == frame_count);

		do_sum_shader(cc, in_textures, frame_count, 1 / (f32)frame_count, aframe->texture, aframe->dim);
		aframe->min_coordinate  = frame->min_coordinate;
		aframe->max_coordinate  = frame->max_coordinate;
		aframe->compound_count  = frame->compound_count;
		aframe->das_shader_kind = frame->das_shader_kind;
	}break;
	InvalidDefaultCase;
	}
}
    904 
/* Builds the full GLSL source for one shader stage: a generated preamble
 * (version pragma, per-kind #defines, local-size layout, uniform declarations)
 * followed by the on-disk shader body read from ctx->path.
 *
 * The X-macro blocks below work by redefining X to expand each list entry
 * into a string literal; the list macro (e.g. FILTER_SHADER_FLAGS_LIST) then
 * expands to a sequence of adjacent literals which the compiler concatenates
 * into the single string passed to s8().
 *
 * Returns an s8 spanning both the generated header and the file contents;
 * the two are contiguous because the file is read into the same arena
 * immediately after the header is committed (asserted below). */
function s8
shader_text_with_header(ShaderReloadContext *ctx, OS *os, Arena *arena)
{
	Stream sb = arena_stream(*arena);
	stream_append_s8s(&sb, s8("#version 460 core\n\n"), ctx->header);

	switch (ctx->kind) {
	case BeamformerShaderKind_Filter:
	case BeamformerShaderKind_FilterCF:
	case BeamformerShaderKind_Demodulate:
	case BeamformerShaderKind_DemodulateCF:
	case BeamformerShaderKind_DemodulateFloat:
	case BeamformerShaderKind_DemodulateFloatCF:
	{
		stream_append_s8(&sb, s8(""
		"layout(local_size_x = " str(FILTER_LOCAL_SIZE_X) ", "
		       "local_size_y = " str(FILTER_LOCAL_SIZE_Y) ", "
		       "local_size_z = " str(FILTER_LOCAL_SIZE_Z) ") in;\n\n"
		));

		/* NOTE: the CF variants filter with complex coefficients */
		switch (ctx->kind) {
		case BeamformerShaderKind_FilterCF:
		case BeamformerShaderKind_DemodulateCF:
		case BeamformerShaderKind_DemodulateFloatCF:
		{
			stream_append_s8(&sb, s8("#define COMPLEX_FILTER 1\n"));
		}break;
		default:{
			stream_append_s8(&sb, s8("#define COMPLEX_FILTER 0\n"));
		}break;
		}

		/* NOTE: these variants consume f32 samples instead of i16 */
		switch (ctx->kind) {
		case BeamformerShaderKind_Filter:
		case BeamformerShaderKind_FilterCF:
		case BeamformerShaderKind_DemodulateFloat:
		case BeamformerShaderKind_DemodulateFloatCF:
		{
			stream_append_s8(&sb, s8("#define INPUT_DATA_TYPE_FLOAT\n"));
		}break;
		default:{}break;
		}

		switch (ctx->kind) {
		case BeamformerShaderKind_Demodulate:
		case BeamformerShaderKind_DemodulateCF:
		case BeamformerShaderKind_DemodulateFloat:
		case BeamformerShaderKind_DemodulateFloatCF:
		{
			stream_append_s8(&sb, s8("#define DEMODULATE\n"));
		}break;
		default:{}break;
		}

		stream_append_byte(&sb, '\n');
		#define X(k, v, ...) "#define ShaderFlags_" #k " " #v "\n"
		stream_append_s8(&sb, s8(FILTER_SHADER_FLAGS_LIST));
		#undef X

		stream_append_byte(&sb, '\n');
		#define X(k, v, ...) "#define SamplingMode_" #k " " #v "\n"
		stream_append_s8(&sb, s8(SAMPLING_MODES_LIST));
		#undef X
		stream_append_byte(&sb, '\n');
	}break;
	case BeamformerShaderKind_DAS:
	case BeamformerShaderKind_DASFast:
	{
		/* NOTE: the two DAS flavors differ in workgroup size and in which
		 * uniform drives iteration (voxel offset vs. channel index) */
		if (ctx->kind == BeamformerShaderKind_DAS) {
			stream_append_s8(&sb, s8(""
			"layout(local_size_x = " str(DAS_LOCAL_SIZE_X) ", "
			       "local_size_y = " str(DAS_LOCAL_SIZE_Y) ", "
			       "local_size_z = " str(DAS_LOCAL_SIZE_Z) ") in;\n\n"
			"#define DAS_FAST 0\n\n"
			"layout(location = " str(DAS_VOXEL_OFFSET_UNIFORM_LOC) ") uniform ivec3 u_voxel_offset;\n"
			));
		} else {
			stream_append_s8(&sb, s8(""
			"layout(local_size_x = " str(DAS_FAST_LOCAL_SIZE_X) ", "
			       "local_size_y = " str(DAS_FAST_LOCAL_SIZE_Y) ", "
			       "local_size_z = " str(DAS_FAST_LOCAL_SIZE_Z) ") in;\n\n"
			"#define DAS_FAST 1\n\n"
			"layout(location = " str(DAS_FAST_CHANNEL_UNIFORM_LOC) ") uniform int   u_channel;\n"
			));
		}
		stream_append_s8(&sb, s8(""
		"layout(location = " str(DAS_CYCLE_T_UNIFORM_LOC) ") uniform uint  u_cycle_t;\n\n"));

		#define X(k, v, ...) "#define ShaderFlags_" #k " " #v "\n"
		stream_append_s8(&sb, s8(DAS_SHADER_FLAGS_LIST));
		#undef X

		stream_append_byte(&sb, '\n');

		#define X(k, id, ...) "#define ShaderKind_" #k " " #id "\n"
		stream_append_s8(&sb, s8(DAS_SHADER_KIND_LIST));
		#undef X
	}break;
	case BeamformerShaderKind_Decode:
	case BeamformerShaderKind_DecodeFloat:
	case BeamformerShaderKind_DecodeFloatComplex:
	case BeamformerShaderKind_DecodeInt16Complex:
	case BeamformerShaderKind_DecodeInt16ToFloat:
	{
		/* NOTE: plain Decode gets no entry here and thus an empty (zero) s8,
		 * which appends nothing */
		s8 define_table[] = {
			[BeamformerShaderKind_DecodeFloatComplex] = s8("#define INPUT_DATA_TYPE_FLOAT_COMPLEX\n\n"),
			[BeamformerShaderKind_DecodeFloat]        = s8("#define INPUT_DATA_TYPE_FLOAT\n\n"),
			[BeamformerShaderKind_DecodeInt16Complex] = s8("#define INPUT_DATA_TYPE_INT16_COMPLEX\n\n"),
			[BeamformerShaderKind_DecodeInt16ToFloat] = s8("#define OUTPUT_DATA_TYPE_FLOAT\n\n"),
		};
		#define X(type, id, pretty) "#define DECODE_MODE_" #type " " #id "\n"
		stream_append_s8s(&sb, define_table[ctx->kind], s8(""
		"layout(local_size_x = " str(DECODE_LOCAL_SIZE_X) ", "
		       "local_size_y = " str(DECODE_LOCAL_SIZE_Y) ", "
		       "local_size_z = " str(DECODE_LOCAL_SIZE_Z) ") in;\n\n"
		"layout(location = " str(DECODE_FIRST_PASS_UNIFORM_LOC) ") uniform bool u_first_pass;\n\n"
		DECODE_TYPES
		));
		#undef X
	}break;
	case BeamformerShaderKind_MinMax:{
		stream_append_s8(&sb, s8("layout(location = " str(MIN_MAX_MIPS_LEVEL_UNIFORM_LOC)
		                         ") uniform int u_mip_map;\n\n"));
	}break;
	case BeamformerShaderKind_Sum:{
		stream_append_s8(&sb, s8("layout(location = " str(SUM_PRESCALE_UNIFORM_LOC)
		                         ") uniform float u_sum_prescale = 1.0;\n\n"));
	}break;
	default:{}break;
	}
	/* NOTE: reset GLSL line numbering so compiler errors reference the file,
	 * not the generated preamble */
	stream_append_s8(&sb, s8("\n#line 1\n"));

	s8 result = arena_stream_commit(arena, &sb);
	if (ctx->path.len) {
		s8 file = os_read_whole_file(arena, (c8 *)ctx->path.data);
		/* NOTE: the file must land directly after the committed header in the
		 * arena for the combined span below to be valid */
		assert(file.data == result.data + result.len);
		result.len += file.len;
	}

	return result;
}
   1046 
   1047 DEBUG_EXPORT BEAMFORMER_RELOAD_SHADER_FN(beamformer_reload_shader)
   1048 {
   1049 	i32 shader_count = 1;
   1050 	ShaderReloadContext *link = src->link;
   1051 	while (link != src) { shader_count++; link = link->link; }
   1052 
   1053 	s8  *shader_texts = push_array(&arena, s8,  shader_count);
   1054 	u32 *shader_types = push_array(&arena, u32, shader_count);
   1055 
   1056 	i32 index = 0;
   1057 	do {
   1058 		shader_texts[index] = shader_text_with_header(link, os, &arena);
   1059 		shader_types[index] = link->gl_type;
   1060 		index++;
   1061 		link = link->link;
   1062 	} while (link != src);
   1063 
   1064 	glDeleteProgram(*src->shader);
   1065 	*src->shader = load_shader(&ctx->os, arena, shader_texts, shader_types, shader_count, shader_name);
   1066 	if (src->kind == BeamformerShaderKind_Render3D) ctx->frame_view_render_context.updated = 1;
   1067 
   1068 	return 1;
   1069 }
   1070 
   1071 function b32
   1072 reload_compute_shader(BeamformerCtx *ctx, ShaderReloadContext *src, s8 name_extra, Arena arena)
   1073 {
   1074 	Stream sb  = arena_stream(arena);
   1075 	stream_append_s8s(&sb, src->name, name_extra);
   1076 	s8  name   = arena_stream_commit(&arena, &sb);
   1077 	b32 result = beamformer_reload_shader(&ctx->os, ctx, src, arena, name);
   1078 	return result;
   1079 }
   1080 
/* Drains work queue `q`, executing each item on the GL thread.
 *
 * Work kinds handled: shader reloads (including all derivative variants),
 * export of beamformed data / timing stats into shared memory, filter
 * creation, and compute (direct or indirect via uploaded RF data).
 *
 * A work item is only committed (removed) when can_commit stays set; a
 * ReloadShader item that triggers a re-beamform is converted in place into
 * a compute item and re-popped on the next loop iteration. */
function void
complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena, iptr gl_context)
{
	BeamformerComputeContext *cs = &ctx->compute_context;
	BeamformerSharedMemory   *sm = ctx->shared_memory.region;

	BeamformWork *work = beamform_work_queue_pop(q);
	while (work) {
		b32 can_commit = 1;
		switch (work->kind) {
		case BeamformerWorkKind_ReloadShader:{
			ShaderReloadContext *src = work->shader_reload_context;
			b32 success = reload_compute_shader(ctx, src, s8(""), *arena);
			/* TODO(rnp): think of a better way of doing this */
			/* NOTE: some contexts own a family of shader variants; temporarily
			 * retarget src at each derivative, reload it, then restore src */
			switch (src->kind) {
			case BeamformerShaderKind_DAS:{
				src->kind   = BeamformerShaderKind_DASFast;
				src->shader = cs->programs + src->kind;
				success &= reload_compute_shader(ctx, src, s8(" (Fast)"), *arena);

				src->kind   = BeamformerShaderKind_DAS;
				src->shader = cs->programs + src->kind;
			}break;
			case BeamformerShaderKind_Decode:{
				read_only local_persist struct { BeamformerShaderKind kind; s8 suffix; } derivatives[] = {
				#define X(k, __1, __2, suffix, ...) {BeamformerShaderKind_## k, s8_comp(suffix)},
				DECODE_SHADER_VARIATIONS
				#undef X
				};
				for EachElement(derivatives, it) {
					src->kind   = derivatives[it].kind;
					src->shader = cs->programs + src->kind;
					success &= reload_compute_shader(ctx, src, derivatives[it].suffix, *arena);
				}
				src->kind   = BeamformerShaderKind_Decode;
				src->shader = cs->programs + src->kind;
			}break;
			case BeamformerShaderKind_Filter:{
				read_only local_persist struct { BeamformerShaderKind kind; s8 suffix; } derivatives[] = {
					{BeamformerShaderKind_Demodulate, s8_comp(" (Demodulate)")},
					#define X(k, __1, __2, suffix, ...) {BeamformerShaderKind_## k, s8_comp(suffix)},
					FILTER_SHADER_VARIATIONS
					#undef X
				};
				for EachElement(derivatives, it) {
					src->kind   = derivatives[it].kind;
					src->shader = cs->programs + src->kind;
					success &= reload_compute_shader(ctx, src, derivatives[it].suffix, *arena);
				}
				src->kind   = BeamformerShaderKind_Filter;
				src->shader = cs->programs + src->kind;
			}break;
			default:{}break;
			}

			/* NOTE: when not live-imaging, re-beamform the latest frame with the
			 * new shaders by recycling this work item as compute work */
			if (success && ctx->latest_frame && !sm->live_imaging_parameters.active) {
				fill_frame_compute_work(ctx, work, ctx->latest_frame->view_plane_tag, 0, 0);
				can_commit = 0;
			}
		}break;
		case BeamformerWorkKind_ExportBuffer:{
			/* TODO(rnp): better way of handling DispatchCompute barrier */
			post_sync_barrier(&ctx->shared_memory, BeamformerSharedMemoryLockKind_DispatchCompute, sm->locks);
			os_shared_memory_region_lock(&ctx->shared_memory, sm->locks, (i32)work->lock, (u32)-1);
			BeamformerExportContext *ec = &work->export_context;
			switch (ec->kind) {
			case BeamformerExportKind_BeamformedData:{
				BeamformerFrame *frame = ctx->latest_frame;
				if (frame) {
					assert(frame->ready_to_present);
					u32 texture  = frame->texture;
					iv3 dim      = frame->dim;
					/* NOTE: RG32F texture -> 2 floats per voxel */
					u32 out_size = (u32)dim.x * (u32)dim.y * (u32)dim.z * 2 * sizeof(f32);
					if (out_size <= ec->size) {
						glGetTextureImage(texture, 0, GL_RG, GL_FLOAT, (i32)out_size,
						                  beamformer_shared_memory_scratch_arena(sm).beg);
					}
				}
			}break;
			case BeamformerExportKind_Stats:{
				ComputeTimingTable *table = ctx->compute_timing_table;
				/* NOTE(rnp): do a little spin to let this finish updating */
				while (table->write_index != atomic_load_u32(&table->read_index));
				ComputeShaderStats *stats = ctx->compute_shader_stats;
				if (sizeof(stats->table) <= ec->size)
					mem_copy(beamformer_shared_memory_scratch_arena(sm).beg, &stats->table, sizeof(stats->table));
			}break;
			InvalidDefaultCase;
			}
			os_shared_memory_region_unlock(&ctx->shared_memory, sm->locks, (i32)work->lock);
			post_sync_barrier(&ctx->shared_memory, BeamformerSharedMemoryLockKind_ExportSync, sm->locks);
		}break;
		case BeamformerWorkKind_CreateFilter:{
			/* TODO(rnp): this should probably get deleted and moved to lazy loading */
			BeamformerCreateFilterContext *fctx = &work->create_filter_context;
			u32 block = fctx->parameter_block;
			u32 slot  = fctx->filter_slot;
			BeamformerComputePlan *cp = beamformer_compute_plan_for_block(cs, block, arena);
			beamformer_filter_update(cp->filters + slot, fctx->kind, fctx->parameters, block, slot, *arena);
		}break;
		case BeamformerWorkKind_ComputeIndirect:{
			fill_frame_compute_work(ctx, work, work->compute_indirect_context.view_plane,
			                        work->compute_indirect_context.parameter_block, 1);
		} /* FALLTHROUGH */
		case BeamformerWorkKind_Compute:{
			/* NOTE: in debug builds clear the ping-pong buffers so stale data
			 * from a previous frame can't mask pipeline bugs */
			DEBUG_DECL(glClearNamedBufferData(cs->ping_pong_ssbos[0], GL_RG32F, GL_RG, GL_FLOAT, 0);)
			DEBUG_DECL(glClearNamedBufferData(cs->ping_pong_ssbos[1], GL_RG32F, GL_RG, GL_FLOAT, 0);)
			DEBUG_DECL(glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);)

			push_compute_timing_info(ctx->compute_timing_table,
			                         (ComputeTimingInfo){.kind = ComputeTimingInfoKind_ComputeFrameBegin});

			BeamformerComputePlan *cp = beamformer_compute_plan_for_block(cs, work->compute_context.parameter_block, arena);
			if (beamformer_parameter_block_dirty(sm, work->compute_context.parameter_block)) {
				u32 block = work->compute_context.parameter_block;
				beamformer_commit_parameter_block(ctx, cp, block, *arena);
				/* NOTE(review): this store replaces the whole dirty-block bitfield
				 * with this single block's bit rather than OR-ing it in; confirm
				 * only one block can be pending at a time */
				atomic_store_u32(&ctx->ui_dirty_parameter_blocks, (u32)(ctx->beamform_work_queue != q) << block);
			}

			post_sync_barrier(&ctx->shared_memory, work->lock, sm->locks);

			atomic_store_u32(&cs->processing_compute, 1);
			start_renderdoc_capture(gl_context);

			BeamformerFrame *frame = work->compute_context.frame;
			if (!iv3_equal(cp->output_points, frame->dim))
				alloc_beamform_frame(&ctx->gl, frame, cp->output_points, s8("Beamformed_Data"), *arena);

			frame->min_coordinate  = cp->min_coordinate;
			frame->max_coordinate  = cp->max_coordinate;
			frame->das_shader_kind = cp->das_ubo_data.shader_kind;
			frame->compound_count  = cp->das_ubo_data.acquisition_count;

			BeamformerComputeContext  *cc       = &ctx->compute_context;
			BeamformerComputePipeline *pipeline = &cp->pipeline;
			/* NOTE(rnp): first stage requires access to raw data buffer directly so we break
			 * it out into a separate step. This way data can get released as soon as possible */
			if (pipeline->shader_count > 0) {
				BeamformerRFBuffer *rf = &cs->rf_buffer;
				u32 slot = rf->compute_index % countof(rf->compute_syncs);

				/* NOTE(rnp): compute indirect is used when uploading data. in this case the thread
				 * must wait on an upload fence. if the fence doesn't yet exist the thread must wait */
				if (work->kind == BeamformerWorkKind_ComputeIndirect)
					spin_wait(!atomic_load_u64(rf->upload_syncs + slot));

				if (rf->upload_syncs[slot]) {
					rf->compute_index++;
					glWaitSync(rf->upload_syncs[slot], 0, GL_TIMEOUT_IGNORED);
					glDeleteSync(rf->upload_syncs[slot]);
				} else {
					/* NOTE: no fresh upload; reuse the most recently consumed slot */
					slot = (rf->compute_index - 1) % countof(rf->compute_syncs);
				}

				glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, rf->ssbo, slot * rf->size, rf->size);

				glBeginQuery(GL_TIME_ELAPSED, cc->shader_timer_ids[0]);
				do_compute_shader(ctx, cp, frame, pipeline->shaders[0], pipeline->parameters + 0, *arena);
				glEndQuery(GL_TIME_ELAPSED);

				/* NOTE: signal the uploader that this slot has been consumed */
				if (work->kind == BeamformerWorkKind_ComputeIndirect) {
					rf->compute_syncs[slot] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
					rf->upload_syncs[slot]  = 0;
					memory_write_barrier();
				}
			}

			b32 did_sum_shader = 0;
			for (u32 i = 1; i < pipeline->shader_count; i++) {
				did_sum_shader |= pipeline->shaders[i] == BeamformerShaderKind_Sum;
				glBeginQuery(GL_TIME_ELAPSED, cc->shader_timer_ids[i]);
				do_compute_shader(ctx, cp, frame, pipeline->shaders[i], pipeline->parameters + i, *arena);
				glEndQuery(GL_TIME_ELAPSED);
			}

			/* NOTE(rnp): the first of these blocks until work completes */
			for (u32 i = 0; i < pipeline->shader_count; i++) {
				ComputeTimingInfo info = {0};
				info.kind   = ComputeTimingInfoKind_Shader;
				info.shader = pipeline->shaders[i];
				glGetQueryObjectui64v(cc->shader_timer_ids[i], GL_QUERY_RESULT, &info.timer_count);
				push_compute_timing_info(ctx->compute_timing_table, info);
			}
			cs->processing_progress = 1;

			/* NOTE: publish either the averaged frame or the raw frame as latest */
			frame->ready_to_present = 1;
			if (did_sum_shader) {
				u32 aframe_index = (ctx->averaged_frame_index % countof(ctx->averaged_frames));
				ctx->averaged_frames[aframe_index].view_plane_tag  = frame->view_plane_tag;
				ctx->averaged_frames[aframe_index].ready_to_present = 1;
				atomic_add_u32(&ctx->averaged_frame_index, 1);
				atomic_store_u64((u64 *)&ctx->latest_frame, (u64)(ctx->averaged_frames + aframe_index));
			} else {
				atomic_store_u64((u64 *)&ctx->latest_frame, (u64)frame);
			}
			cs->processing_compute  = 0;

			push_compute_timing_info(ctx->compute_timing_table,
			                         (ComputeTimingInfo){.kind = ComputeTimingInfoKind_ComputeFrameEnd});

			end_renderdoc_capture(gl_context);
		}break;
		InvalidDefaultCase;
		}

		if (can_commit) {
			beamform_work_queue_pop_commit(q);
			work = beamform_work_queue_pop(q);
		}
	}
}
   1292 
   1293 function void
   1294 coalesce_timing_table(ComputeTimingTable *t, ComputeShaderStats *stats)
   1295 {
   1296 	/* TODO(rnp): we do not currently do anything to handle the potential for a half written
   1297 	 * info item. this could result in garbage entries but they shouldn't really matter */
   1298 
   1299 	u32 target = atomic_load_u32(&t->write_index);
   1300 	u32 stats_index = (stats->latest_frame_index + 1) % countof(stats->table.times);
   1301 
   1302 	static_assert(BeamformerShaderKind_Count + 1 <= 32, "timing coalescence bitfield test");
   1303 	u32 seen_info_test = 0;
   1304 
   1305 	while (t->read_index != target) {
   1306 		ComputeTimingInfo info = t->buffer[t->read_index % countof(t->buffer)];
   1307 		switch (info.kind) {
   1308 		case ComputeTimingInfoKind_ComputeFrameBegin:{
   1309 			assert(t->compute_frame_active == 0);
   1310 			t->compute_frame_active = 1;
   1311 			/* NOTE(rnp): allow multiple instances of same shader to accumulate */
   1312 			mem_clear(stats->table.times[stats_index], 0, sizeof(stats->table.times[stats_index]));
   1313 		}break;
   1314 		case ComputeTimingInfoKind_ComputeFrameEnd:{
   1315 			assert(t->compute_frame_active == 1);
   1316 			t->compute_frame_active = 0;
   1317 			stats->latest_frame_index = stats_index;
   1318 			stats_index = (stats_index + 1) % countof(stats->table.times);
   1319 		}break;
   1320 		case ComputeTimingInfoKind_Shader:{
   1321 			stats->table.times[stats_index][info.shader] += (f32)info.timer_count / 1.0e9f;
   1322 			seen_info_test |= (1u << info.shader);
   1323 		}break;
   1324 		case ComputeTimingInfoKind_RF_Data:{
   1325 			stats->latest_rf_index = (stats->latest_rf_index + 1) % countof(stats->table.rf_time_deltas);
   1326 			f32 delta = (f32)(info.timer_count - stats->last_rf_timer_count) / 1.0e9f;
   1327 			stats->table.rf_time_deltas[stats->latest_rf_index] = delta;
   1328 			stats->last_rf_timer_count = info.timer_count;
   1329 			seen_info_test |= (1 << BeamformerShaderKind_Count);
   1330 		}break;
   1331 		}
   1332 		/* NOTE(rnp): do this at the end so that stats table is always in a consistent state */
   1333 		atomic_add_u32(&t->read_index, 1);
   1334 	}
   1335 
   1336 	if (seen_info_test) {
   1337 		for EachEnumValue(BeamformerShaderKind, shader) {
   1338 			if (seen_info_test & (1 << shader)) {
   1339 				f32 sum = 0;
   1340 				for EachElement(stats->table.times, i)
   1341 					sum += stats->table.times[i][shader];
   1342 				stats->average_times[shader] = sum / countof(stats->table.times);
   1343 			}
   1344 		}
   1345 
   1346 		if (seen_info_test & (1 << BeamformerShaderKind_Count)) {
   1347 			f32 sum = 0;
   1348 			for EachElement(stats->table.rf_time_deltas, i)
   1349 				sum += stats->table.rf_time_deltas[i];
   1350 			stats->rf_time_delta_average = sum / countof(stats->table.rf_time_deltas);
   1351 		}
   1352 	}
   1353 }
   1354 
   1355 DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute)
   1356 {
   1357 	BeamformerCtx *ctx         = (BeamformerCtx *)user_context;
   1358 	BeamformerSharedMemory *sm = ctx->shared_memory.region;
   1359 	complete_queue(ctx, &sm->external_work_queue, arena, gl_context);
   1360 	complete_queue(ctx, ctx->beamform_work_queue, arena, gl_context);
   1361 }
   1362 
   1363 function void
   1364 beamformer_rf_buffer_allocate(BeamformerRFBuffer *rf, u32 rf_size, Arena arena)
   1365 {
   1366 	glUnmapNamedBuffer(rf->ssbo);
   1367 	glDeleteBuffers(1, &rf->ssbo);
   1368 	glCreateBuffers(1, &rf->ssbo);
   1369 
   1370 	rf_size = (u32)round_up_to((iz)rf_size, 64);
   1371 	glNamedBufferStorage(rf->ssbo, countof(rf->compute_syncs) * rf_size, 0,
   1372 	                     GL_DYNAMIC_STORAGE_BIT|GL_MAP_WRITE_BIT);
   1373 	LABEL_GL_OBJECT(GL_BUFFER, rf->ssbo, s8("Raw_RF_SSBO"));
   1374 	rf->size = rf_size;
   1375 }
   1376 
DEBUG_EXPORT BEAMFORMER_RF_UPLOAD_FN(beamformer_rf_upload)
{
	/* Copies client-produced RF data from the shared memory scratch region into
	 * one slot of the raw RF SSBO and signals the compute thread. Runs only when
	 * a client is holding the upload lock and the scratch space lock can be
	 * acquired; slots are used round-robin with a GL fence pair per slot. */
	BeamformerSharedMemory *sm = ctx->shared_memory->region;

	BeamformerSharedMemoryLockKind scratch_lock = BeamformerSharedMemoryLockKind_ScratchSpace;
	BeamformerSharedMemoryLockKind upload_lock  = BeamformerSharedMemoryLockKind_UploadRF;
	if (sm->locks[upload_lock] &&
	    os_shared_memory_region_lock(ctx->shared_memory, sm->locks, (i32)scratch_lock, (u32)-1))
	{
		BeamformerRFBuffer *rf = ctx->rf_buffer;
		/* reallocate the SSBO when the client wants to upload more than it can hold */
		if (rf->size < sm->scratch_rf_size)
			beamformer_rf_buffer_allocate(rf, sm->scratch_rf_size, arena);

		u32 slot = rf->insertion_index++ % countof(rf->compute_syncs);

		/* NOTE(rnp): if the rest of the code is functioning then the first
		 * time the compute thread processes an upload it must have gone
		 * through this path. therefore it is safe to spin until it gets processed */
		spin_wait(atomic_load_u64(rf->upload_syncs + slot));

		if (rf->compute_syncs[slot]) {
			/* wait (up to 1s) for the compute side to finish reading this slot */
			GLenum sync_result = glClientWaitSync(rf->compute_syncs[slot], 0, 1000000000);
			if (sync_result == GL_TIMEOUT_EXPIRED || sync_result == GL_WAIT_FAILED) {
				// TODO(rnp): what do?
			}
			glDeleteSync(rf->compute_syncs[slot]);
		}

		/* NOTE(rnp): nVidia's drivers really don't play nice with persistent mapping,
		 * at least when it is as big as this one wants to be. mapping and unmapping the
		 * desired range each time doesn't seem to introduce any performance hit */
		u32 access = GL_MAP_WRITE_BIT|GL_MAP_FLUSH_EXPLICIT_BIT|GL_MAP_UNSYNCHRONIZED_BIT;
		u8 *buffer = glMapNamedBufferRange(rf->ssbo, slot * rf->size, (i32)rf->size, access);

		/* NOTE(review): copies rf->size bytes (rounded up to 64) out of the scratch
		 * arena even when sm->scratch_rf_size is smaller — assumes the scratch
		 * region is at least rf->size bytes; confirm against the allocator */
		mem_copy(buffer, beamformer_shared_memory_scratch_arena(sm).beg, rf->size);
		os_shared_memory_region_unlock(ctx->shared_memory, sm->locks, (i32)scratch_lock);
		post_sync_barrier(ctx->shared_memory, upload_lock, sm->locks);

		/* flush offset is relative to the start of the mapped range, not the buffer */
		glFlushMappedNamedBufferRange(rf->ssbo, 0, (i32)rf->size);
		glUnmapNamedBuffer(rf->ssbo);

		/* publish the upload fence and clear the compute fence before waking
		 * the compute thread; the write barrier orders these stores */
		rf->upload_syncs[slot]  = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
		rf->compute_syncs[slot] = 0;
		memory_write_barrier();

		os_wake_waiters(ctx->compute_worker_sync);

		/* read back the previous upload's GPU timestamp and issue a new one so
		 * the timing table can track deltas between consecutive RF uploads */
		ComputeTimingInfo info = {.kind = ComputeTimingInfoKind_RF_Data};
		glGetQueryObjectui64v(rf->data_timestamp_query, GL_QUERY_RESULT, &info.timer_count);
		glQueryCounter(rf->data_timestamp_query, GL_TIMESTAMP);
		push_compute_timing_info(ctx->compute_timing_table, info);
	}
}
   1430 
   1431 #include "ui.c"
   1432 
   1433 DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
   1434 {
   1435 	dt_for_frame = input->dt;
   1436 
   1437 	if (IsWindowResized()) {
   1438 		ctx->window_size.h = GetScreenHeight();
   1439 		ctx->window_size.w = GetScreenWidth();
   1440 	}
   1441 
   1442 	coalesce_timing_table(ctx->compute_timing_table, ctx->compute_shader_stats);
   1443 
   1444 	if (input->executable_reloaded) {
   1445 		ui_init(ctx, ctx->ui_backing_store);
   1446 		DEBUG_DECL(start_frame_capture = ctx->os.start_frame_capture);
   1447 		DEBUG_DECL(end_frame_capture   = ctx->os.end_frame_capture);
   1448 	}
   1449 
   1450 	BeamformerSharedMemory *sm = ctx->shared_memory.region;
   1451 	if (sm->locks[BeamformerSharedMemoryLockKind_UploadRF] != 0)
   1452 		os_wake_waiters(&ctx->os.upload_worker.sync_variable);
   1453 
   1454 	BeamformerFrame        *frame = ctx->latest_frame;
   1455 	BeamformerViewPlaneTag  tag   = frame? frame->view_plane_tag : 0;
   1456 	draw_ui(ctx, input, frame, tag);
   1457 
   1458 	ctx->frame_view_render_context.updated = 0;
   1459 
   1460 	if (WindowShouldClose())
   1461 		ctx->should_exit = 1;
   1462 }
   1463 
   1464 /* NOTE(rnp): functions defined in these shouldn't be visible to the whole program */
   1465 #if _DEBUG
   1466   #if OS_LINUX
   1467     #include "os_linux.c"
   1468   #elif OS_WINDOWS
   1469     #include "os_win32.c"
   1470   #endif
   1471 #endif