ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git

beamformer.c


/* See LICENSE for license details. */
/* TODO(rnp):
 * [ ]: measure performance of doing channel mapping in a separate shader
 * [ ]: BeamformWorkQueue -> BeamformerWorkQueue
 * [ ]: need to keep track of gpu memory in some way
 *      - want to be able to store more than 16 2D frames but limit 3D frames
 *      - maybe keep track of how much gpu memory is committed for beamformed images
 *        and use that to determine when to loop back over existing textures
 *      - to do this maybe use a circular linked list instead of a flat array
 *      - then have a way of querying how many frames are available for a specific point count
 * [ ]: bug: reinit cuda on hot-reload
 */

#include "beamformer.h"

global f32 dt_for_frame;

#define FILTER_LOCAL_SIZE_X 64
#define FILTER_LOCAL_SIZE_Y  1
#define FILTER_LOCAL_SIZE_Z  1

#define DECODE_LOCAL_SIZE_X  4
#define DECODE_LOCAL_SIZE_Y  1
#define DECODE_LOCAL_SIZE_Z 16

#define DECODE_FIRST_PASS_UNIFORM_LOC 1

#define DAS_LOCAL_SIZE_X  16
#define DAS_LOCAL_SIZE_Y   1
#define DAS_LOCAL_SIZE_Z  16

#define DAS_VOXEL_OFFSET_UNIFORM_LOC  2
#define DAS_CYCLE_T_UNIFORM_LOC       3
#define DAS_FAST_CHANNEL_UNIFORM_LOC  4

#define MIN_MAX_MIPS_LEVEL_UNIFORM_LOC 1
#define SUM_PRESCALE_UNIFORM_LOC       1

#ifndef _DEBUG
#define start_renderdoc_capture(...)
#define end_renderdoc_capture(...)
#else
global renderdoc_start_frame_capture_fn *start_frame_capture;
global renderdoc_end_frame_capture_fn   *end_frame_capture;
#define start_renderdoc_capture(gl) if (start_frame_capture) start_frame_capture(gl, 0)
#define end_renderdoc_capture(gl)   if (end_frame_capture)   end_frame_capture(gl, 0)
#endif

typedef struct {
	BeamformerFrame *frames;
	u32 capacity;
	u32 offset;
	u32 cursor;
	u32 needed_frames;
} ComputeFrameIterator;

function void
beamformer_compute_plan_release(BeamformerComputeContext *cc, u32 block)
{
	assert(block < countof(cc->compute_plans));
	BeamformerComputePlan *cp = cc->compute_plans[block];
	if (cp) {
		glDeleteBuffers(countof(cp->ubos), cp->ubos);
		glDeleteTextures(countof(cp->textures), cp->textures);
		for (u32 i = 0; i < countof(cp->filters); i++)
			glDeleteTextures(1, &cp->filters[i].texture);
		cc->compute_plans[block] = 0;
		SLLPushFreelist(cp, cc->compute_plan_freelist);
	}
}

function BeamformerComputePlan *
beamformer_compute_plan_for_block(BeamformerComputeContext *cc, u32 block, Arena *arena)
{
	assert(block < countof(cc->compute_plans));
	BeamformerComputePlan *result = cc->compute_plans[block];
	if (!result) {
		result = SLLPopFreelist(cc->compute_plan_freelist);
		if (result) zero_struct(result);
		else        result = push_struct(arena, BeamformerComputePlan);
		cc->compute_plans[block] = result;

		glCreateBuffers(countof(result->ubos), result->ubos);

		Stream label = arena_stream(*arena);
		#define X(k, t, ...) \
			glNamedBufferStorage(result->ubos[BeamformerComputeUBOKind_##k], sizeof(t), \
			                     0, GL_DYNAMIC_STORAGE_BIT); \
			stream_append_s8(&label, s8(#t "[")); \
			stream_append_u64(&label, block);     \
			stream_append_byte(&label, ']');      \
			glObjectLabel(GL_BUFFER, result->ubos[BeamformerComputeUBOKind_##k], \
			              label.widx, (c8 *)label.data); \
			label.widx = 0;
		BEAMFORMER_COMPUTE_UBO_LIST
		#undef X

		#define X(_k, t, ...) t,
		GLenum gl_kind[] = {BEAMFORMER_COMPUTE_TEXTURE_LIST};
		#undef X
		read_only local_persist s8 tex_prefix[] = {
			#define X(k, ...) s8_comp(#k "["),
			BEAMFORMER_COMPUTE_TEXTURE_LIST
			#undef X
		};
		glCreateTextures(GL_TEXTURE_1D, BeamformerComputeTextureKind_Count - 1, result->textures);
		for (u32 i = 0; i < BeamformerComputeTextureKind_Count - 1; i++) {
			/* TODO(rnp): this could be predicated on channel count for this compute plan */
			glTextureStorage1D(result->textures[i], 1, gl_kind[i], BeamformerMaxChannelCount);
			stream_append_s8(&label, tex_prefix[i]);
			stream_append_u64(&label, block);
			stream_append_byte(&label, ']');
			glObjectLabel(GL_TEXTURE, result->textures[i], label.widx, (c8 *)label.data);
			label.widx = 0;
		}
	}
	return result;
}
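
/* NOTE: a minimal sketch of the X-macro pattern used above. Assuming the list is
 * defined along these lines (entry names are illustrative, not the real ones):
 *
 *   #define BEAMFORMER_COMPUTE_UBO_LIST \
 *           X(DAS,    BeamformerDASUBO)    \
 *           X(Decode, BeamformerDecodeUBO)
 *
 * each expansion of X(k, t) allocates one immutable UBO of sizeof(t) bytes with
 * glNamedBufferStorage() and labels it "BeamformerDASUBO[block]" and so on, so
 * every buffer is identifiable in tools like RenderDoc. */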

function void
beamformer_filter_update(BeamformerFilter *f, BeamformerFilterKind kind,
                         BeamformerFilterParameters fp, u32 block, u32 slot, Arena arena)
{
	#define X(k, ...) s8_comp(#k "Filter"),
	read_only local_persist s8 filter_kinds[] = {BEAMFORMER_FILTER_KIND_LIST(,)};
	#undef X

	Stream sb = arena_stream(arena);
	stream_append_s8s(&sb, filter_kinds[kind % countof(filter_kinds)], s8("["));
	stream_append_u64(&sb, block);
	stream_append_s8(&sb, s8("]["));
	stream_append_u64(&sb, slot);
	stream_append_byte(&sb, ']');
	s8 label = arena_stream_commit(&arena, &sb);

	void *filter = 0;
	switch (kind) {
	case BeamformerFilterKind_Kaiser:{
		/* TODO(rnp): this should also support complex */
		/* TODO(rnp): implement this as an IFIR filter instead to reduce computation */
		filter = kaiser_low_pass_filter(&arena, fp.Kaiser.cutoff_frequency, fp.sampling_frequency,
		                                fp.Kaiser.beta, (i32)fp.Kaiser.length);
		f->length     = (i32)fp.Kaiser.length;
		f->time_delay = (f32)f->length / 2.0f / fp.sampling_frequency;
	}break;
	case BeamformerFilterKind_MatchedChirp:{
		typeof(fp.MatchedChirp) *mc = &fp.MatchedChirp;
		f32 fs    = fp.sampling_frequency;
		f->length = (i32)(mc->duration * fs);
		if (fp.complex) {
			filter = baseband_chirp(&arena, mc->min_frequency, mc->max_frequency, fs, f->length, 1, 0.5f);
			f->time_delay = complex_filter_first_moment(filter, f->length, fs);
		} else {
			filter = rf_chirp(&arena, mc->min_frequency, mc->max_frequency, fs, f->length, 1);
			f->time_delay = real_filter_first_moment(filter, f->length, fs);
		}
	}break;
	InvalidDefaultCase;
	}

	f->kind       = kind;
	f->parameters = fp;

	glDeleteTextures(1, &f->texture);
	glCreateTextures(GL_TEXTURE_1D, 1, &f->texture);
	glTextureStorage1D(f->texture, 1, fp.complex? GL_RG32F : GL_R32F, f->length);
	glTextureSubImage1D(f->texture, 0, 0, f->length, fp.complex? GL_RG : GL_RED, GL_FLOAT, filter);
	glObjectLabel(GL_TEXTURE, f->texture, (i32)label.len, (c8 *)label.data);
}
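
/* NOTE: worked example of the group-delay bookkeeping above (illustrative numbers):
 * a 64 tap Kaiser filter at fp.sampling_frequency = 40 MHz gives
 * time_delay = 64 / 2 / 40e6 = 0.8 us. plan_compute_pipeline() later folds this
 * into the DAS time_offset so that filtering doesn't shift the image axially. */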

function ComputeFrameIterator
compute_frame_iterator(BeamformerCtx *ctx, u32 start_index, u32 needed_frames)
{
	start_index = start_index % ARRAY_COUNT(ctx->beamform_frames);

	ComputeFrameIterator result;
	result.frames        = ctx->beamform_frames;
	result.offset        = start_index;
	result.capacity      = ARRAY_COUNT(ctx->beamform_frames);
	result.cursor        = 0;
	result.needed_frames = needed_frames;
	return result;
}

function BeamformerFrame *
frame_next(ComputeFrameIterator *bfi)
{
	BeamformerFrame *result = 0;
	if (bfi->cursor != bfi->needed_frames) {
		u32 index = (bfi->offset + bfi->cursor++) % bfi->capacity;
		result    = bfi->frames + index;
	}
	return result;
}
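
/* NOTE: the iterator above walks beamform_frames as a ring buffer. For example,
 * with a capacity of 4, offset = 3, and needed_frames = 3, frame_next() yields
 * the frames at indices 3, 0, 1 and then returns 0 to end the loop. */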

function b32
beamformer_frame_compatible(BeamformerFrame *f, iv3 dim, GLenum gl_kind)
{
	b32 result = gl_kind == f->gl_kind && iv3_equal(dim, f->dim);
	return result;
}

function void
alloc_beamform_frame(GLParams *gp, BeamformerFrame *out, iv3 out_dim, GLenum gl_kind, s8 name, Arena arena)
{
	out->dim.x = MAX(1, out_dim.x);
	out->dim.y = MAX(1, out_dim.y);
	out->dim.z = MAX(1, out_dim.z);

	if (gp) {
		out->dim.x = MIN(out->dim.x, gp->max_3d_texture_dim);
		out->dim.y = MIN(out->dim.y, gp->max_3d_texture_dim);
		out->dim.z = MIN(out->dim.z, gp->max_3d_texture_dim);
	}

	/* NOTE: allocate storage for beamformed output data;
	 * this is shared between compute and fragment shaders */
	u32 max_dim = (u32)MAX(out->dim.x, MAX(out->dim.y, out->dim.z));
	out->mips   = (i32)ctz_u32(round_up_power_of_2(max_dim)) + 1;

	out->gl_kind = gl_kind;

	Stream label = arena_stream(arena);
	stream_append_s8(&label, name);
	stream_append_byte(&label, '[');
	stream_append_hex_u64(&label, out->id);
	stream_append_byte(&label, ']');

	glDeleteTextures(1, &out->texture);
	glCreateTextures(GL_TEXTURE_3D, 1, &out->texture);
	glTextureStorage3D(out->texture, out->mips, gl_kind, out->dim.x, out->dim.y, out->dim.z);

	glTextureParameteri(out->texture, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
	glTextureParameteri(out->texture, GL_TEXTURE_MAG_FILTER, GL_NEAREST);

	LABEL_GL_OBJECT(GL_TEXTURE, out->texture, stream_to_s8(&label));
}
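
/* NOTE: worked example of the mip count above: for out->dim = {512, 1, 1024},
 * max_dim = 1024, round_up_power_of_2(1024) = 1024 and ctz_u32(1024) = 10, so
 * mips = 11: exactly the chain 1024 -> 512 -> ... -> 1 consumed by the MinMax
 * shader in do_compute_shader(). */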

function void
update_hadamard_texture(BeamformerComputePlan *cp, i32 order, Arena arena)
{
	i32 *hadamard = make_hadamard_transpose(&arena, order);
	if (hadamard) {
		cp->hadamard_order = order;
		u32 *texture = cp->textures + BeamformerComputeTextureKind_Hadamard;
		glDeleteTextures(1, texture);
		glCreateTextures(GL_TEXTURE_2D, 1, texture);
		glTextureStorage2D(*texture, 1, GL_R8I, order, order);
		glTextureSubImage2D(*texture, 0, 0, 0,  order, order, GL_RED_INTEGER, GL_INT, hadamard);

		Stream label = arena_stream(arena);
		stream_append_s8(&label, s8("Hadamard"));
		stream_append_i64(&label, order);
		LABEL_GL_OBJECT(GL_TEXTURE, *texture, stream_to_s8(&label));
	}
}
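
/* NOTE: example of the data uploaded above, assuming make_hadamard_transpose()
 * uses the usual Sylvester construction; for order = 4 (i32 values, stored into
 * a GL_R8I texture):
 *
 *    1  1  1  1
 *    1 -1  1 -1
 *    1  1 -1 -1
 *    1 -1 -1  1
 *
 * The decode shader multiplies along the acquisition axis by this matrix to
 * undo Hadamard-encoded transmits. */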

function void
alloc_shader_storage(BeamformerCtx *ctx, u32 decoded_data_size, Arena arena)
{
	BeamformerComputeContext *cc = &ctx->compute_context;
	glDeleteBuffers(countof(cc->ping_pong_ssbos), cc->ping_pong_ssbos);
	glCreateBuffers(countof(cc->ping_pong_ssbos), cc->ping_pong_ssbos);

	cc->ping_pong_ssbo_size = decoded_data_size;

	Stream label = arena_stream(arena);
	stream_append_s8(&label, s8("PingPongSSBO["));
	i32 s_widx = label.widx;
	for (i32 i = 0; i < countof(cc->ping_pong_ssbos); i++) {
		glNamedBufferStorage(cc->ping_pong_ssbos[i], (iz)decoded_data_size, 0, 0);
		stream_append_i64(&label, i);
		stream_append_byte(&label, ']');
		LABEL_GL_OBJECT(GL_BUFFER, cc->ping_pong_ssbos[i], stream_to_s8(&label));
		stream_reset(&label, s_widx);
	}

	/* TODO(rnp): (25.08.04) cuda lib is heavily broken atm. First, there are multiple RF
	 * buffers and cuda decode shouldn't assume that the data is coming from the rf_buffer
	 * ssbo. Second, each parameter block may need a different hadamard matrix so ideally
	 * decode should just take the texture as a parameter. Third, none of these dimensions
	 * need to be pre-known by the library unless it's allocating GPU memory, which it
	 * shouldn't need to do. For now we grab them out of parameter block 0, but this is not correct */
	BeamformerParameterBlock *pb = beamformer_parameter_block(ctx->shared_memory.region, 0);
	/* NOTE(rnp): these are stubs when CUDA isn't supported */
	cuda_register_buffers(cc->ping_pong_ssbos, countof(cc->ping_pong_ssbos), cc->rf_buffer.ssbo);
	u32 decoded_data_dimension[3] = {pb->parameters.sample_count, pb->parameters.channel_count, pb->parameters.acquisition_count};
	cuda_init(pb->parameters.raw_data_dimensions, decoded_data_dimension);
}
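
/* NOTE: the two ping_pong_ssbos implement a classic ping-pong: every compute
 * stage reads one buffer and writes the other, then flips an index. A minimal
 * sketch of the invariant maintained by do_compute_shader():
 *
 *   u32 out = !cc->last_output_ssbo_index;   // stage output
 *   u32 in  =  cc->last_output_ssbo_index;   // stage input
 *   // ... bind in/out, dispatch ...
 *   cc->last_output_ssbo_index = !cc->last_output_ssbo_index;
 */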

function void
push_compute_timing_info(ComputeTimingTable *t, ComputeTimingInfo info)
{
	u32 index = atomic_add_u32(&t->write_index, 1) % countof(t->buffer);
	t->buffer[index] = info;
}

function b32
fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, BeamformerViewPlaneTag plane,
                        u32 parameter_block, b32 indirect)
{
	b32 result = 0;
	if (work) {
		result = 1;
		u32 frame_id    = atomic_add_u32(&ctx->next_render_frame_index, 1);
		u32 frame_index = frame_id % countof(ctx->beamform_frames);
		work->kind      = indirect? BeamformerWorkKind_ComputeIndirect : BeamformerWorkKind_Compute;
		work->lock      = BeamformerSharedMemoryLockKind_DispatchCompute;
		work->compute_context.parameter_block = parameter_block;
		work->compute_context.frame = ctx->beamform_frames + frame_index;
		work->compute_context.frame->ready_to_present = 0;
		work->compute_context.frame->view_plane_tag   = plane;
		work->compute_context.frame->id               = frame_id;
	}
	return result;
}

function void
do_sum_shader(BeamformerComputeContext *cc, u32 *in_textures, u32 in_texture_count, f32 in_scale,
              u32 out_texture, iv3 out_data_dim)
{
	/* NOTE: zero output before summing */
	glClearTexImage(out_texture, 0, GL_RED, GL_FLOAT, 0);
	glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT);

	glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F);
	glProgramUniform1f(cc->programs[BeamformerShaderKind_Sum], SUM_PRESCALE_UNIFORM_LOC, in_scale);
	for (u32 i = 0; i < in_texture_count; i++) {
		glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F);
		glDispatchCompute(ORONE((u32)out_data_dim.x / 32u),
		                  ORONE((u32)out_data_dim.y),
		                  ORONE((u32)out_data_dim.z / 32u));
		glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
	}
}
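
/* NOTE: in_scale is applied by the Sum shader before accumulation, so summing N
 * frames with in_scale = 1/N produces their mean; the Sum case in
 * do_compute_shader() passes 1 / (f32)frame_count, e.g. 0.125f when averaging
 * 8 frames. */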

struct compute_cursor {
	iv3 cursor;
	uv3 dispatch;
	iv3 target;
	u32 points_per_dispatch;
	u32 completed_points;
	u32 total_points;
};

function struct compute_cursor
start_compute_cursor(iv3 dim, u32 max_points)
{
	struct compute_cursor result = {0};
	u32 invocations_per_dispatch = DAS_LOCAL_SIZE_X * DAS_LOCAL_SIZE_Y * DAS_LOCAL_SIZE_Z;

	result.dispatch.y = MIN(max_points / invocations_per_dispatch, (u32)ceil_f32((f32)dim.y / DAS_LOCAL_SIZE_Y));

	u32 remaining     = max_points / result.dispatch.y;
	result.dispatch.x = MIN(remaining / invocations_per_dispatch, (u32)ceil_f32((f32)dim.x / DAS_LOCAL_SIZE_X));
	result.dispatch.z = MIN(remaining / (invocations_per_dispatch * result.dispatch.x),
	                        (u32)ceil_f32((f32)dim.z / DAS_LOCAL_SIZE_Z));

	result.target.x = MAX(dim.x / (i32)result.dispatch.x / DAS_LOCAL_SIZE_X, 1);
	result.target.y = MAX(dim.y / (i32)result.dispatch.y / DAS_LOCAL_SIZE_Y, 1);
	result.target.z = MAX(dim.z / (i32)result.dispatch.z / DAS_LOCAL_SIZE_Z, 1);

	result.points_per_dispatch = 1;
	result.points_per_dispatch *= result.dispatch.x * DAS_LOCAL_SIZE_X;
	result.points_per_dispatch *= result.dispatch.y * DAS_LOCAL_SIZE_Y;
	result.points_per_dispatch *= result.dispatch.z * DAS_LOCAL_SIZE_Z;

	result.total_points = (u32)(dim.x * dim.y * dim.z);

	return result;
}

function iv3
step_compute_cursor(struct compute_cursor *cursor)
{
	cursor->cursor.x += 1;
	if (cursor->cursor.x >= cursor->target.x) {
		cursor->cursor.x  = 0;
		cursor->cursor.y += 1;
		if (cursor->cursor.y >= cursor->target.y) {
			cursor->cursor.y  = 0;
			cursor->cursor.z += 1;
		}
	}

	cursor->completed_points += cursor->points_per_dispatch;

	iv3 result = cursor->cursor;
	result.x *= (i32)cursor->dispatch.x * DAS_LOCAL_SIZE_X;
	result.y *= (i32)cursor->dispatch.y * DAS_LOCAL_SIZE_Y;
	result.z *= (i32)cursor->dispatch.z * DAS_LOCAL_SIZE_Z;

	return result;
}

function b32
compute_cursor_finished(struct compute_cursor *cursor)
{
	b32 result = cursor->completed_points >= cursor->total_points;
	return result;
}
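
/* NOTE: worked example of the cursor math above for dim = {256, 1, 256} and
 * max_points = 65536: invocations_per_dispatch = 16 * 1 * 16 = 256, giving
 * dispatch = {16, 1, 16} and points_per_dispatch = (16*16) * 1 * (16*16) = 65536,
 * which equals total_points, so the whole volume is covered in a single step.
 * Larger volumes are walked x-fastest in tiles of dispatch * DAS_LOCAL_SIZE
 * voxels, with the per-tile offset fed to DAS_VOXEL_OFFSET_UNIFORM_LOC. */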

function m4
das_voxel_transform_matrix(BeamformerParameters *bp)
{
	v3 min = v3_from_f32_array(bp->output_min_coordinate);
	v3 max = v3_from_f32_array(bp->output_max_coordinate);
	v3 extent = v3_abs(v3_sub(max, min));
	v3 points = {{(f32)bp->output_points[0], (f32)bp->output_points[1], (f32)bp->output_points[2]}};

	m4 T1 = m4_translation(v3_scale(v3_sub(points, (v3){{1.0f, 1.0f, 1.0f}}), -0.5f));
	m4 T2 = m4_translation(v3_add(min, v3_scale(extent, 0.5f)));
	m4 S  = m4_scale(v3_div(extent, points));

	m4 R;
	switch (bp->das_shader_id) {
	case BeamformerDASKind_FORCES:
	case BeamformerDASKind_UFORCES:
	case BeamformerDASKind_Flash:
	{
		R = m4_identity();
		S.c[1].E[1]  = 0;
		T2.c[3].E[1] = 0;
	}break;
	case BeamformerDASKind_HERCULES:
	case BeamformerDASKind_UHERCULES:
	case BeamformerDASKind_RCA_TPW:
	case BeamformerDASKind_RCA_VLS:
	{
		R = m4_rotation_about_z(bp->beamform_plane ? 0.0f : 0.25f);
		if (!(points.x > 1 && points.y > 1 && points.z > 1))
			T2.c[3].E[1] = bp->off_axis_pos;
	}break;
	default:{ R = m4_identity(); }break;
	}
	m4 result = m4_mul(R, m4_mul(T2, m4_mul(S, T1)));
	return result;
}
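
/* NOTE: the composite above maps voxel indices to world space as R*(T2*(S*T1)):
 * T1 recenters indices about the volume midpoint ((points - 1)/2), S scales
 * index steps to meters (extent/points), T2 translates to the center of the
 * requested region, and R optionally rotates the imaging plane for the RCA
 * kinds. Worked example for one axis: with min.x = -20e-3, max.x = 20e-3, and
 * points.x = 401, voxel 0 maps to (0 - 200) * (40e-3 / 401) + 0 = -19.95e-3 m. */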

function void
das_ubo_from_beamformer_parameters(BeamformerDASUBO *du, BeamformerParameters *bp)
{
	du->voxel_transform = das_voxel_transform_matrix(bp);
	mem_copy(du->xdc_transform.E,     bp->xdc_transform,     sizeof(du->xdc_transform));
	mem_copy(du->xdc_element_pitch.E, bp->xdc_element_pitch, sizeof(du->xdc_element_pitch));
	du->sampling_frequency     = bp->sampling_frequency;
	du->demodulation_frequency = bp->demodulation_frequency;
	du->speed_of_sound         = bp->speed_of_sound;
	du->time_offset            = bp->time_offset;
	du->f_number               = bp->f_number;
	du->shader_kind            = bp->das_shader_id;
	du->sample_count           = bp->sample_count;
	du->channel_count          = bp->channel_count;
	du->acquisition_count      = bp->acquisition_count;

	du->shader_flags = 0;
	if (bp->coherency_weighting) du->shader_flags |= BeamformerShaderDASFlags_CoherencyWeighting;
	if (bp->transmit_mode == BeamformerRCAOrientation_Columns)
		du->shader_flags |= BeamformerShaderDASFlags_TxColumns;
	if (bp->receive_mode == BeamformerRCAOrientation_Columns)
		du->shader_flags |= BeamformerShaderDASFlags_RxColumns;
}

function void
plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
{
	BeamformerDASUBO *bp = &cp->das_ubo_data;

	das_ubo_from_beamformer_parameters(bp, &pb->parameters);

	b32 decode_first = pb->pipeline.shaders[0] == BeamformerShaderKind_Decode;
	b32 run_cuda_hilbert = 0;
	b32 demodulate       = 0;

	for (u32 i = 0; i < pb->pipeline.shader_count; i++) {
		switch (pb->pipeline.shaders[i]) {
		case BeamformerShaderKind_CudaHilbert:{ run_cuda_hilbert = 1; }break;
		case BeamformerShaderKind_Demodulate:{  demodulate = 1;       }break;
		default:{}break;
		}
	}

	if (demodulate) run_cuda_hilbert = 0;

	if (demodulate || run_cuda_hilbert) cp->iq_pipeline = 1;

	BeamformerDataKind data_kind = pb->pipeline.data_kind;
	cp->pipeline.shader_count = 0;
	for (u32 i = 0; i < pb->pipeline.shader_count; i++) {
		BeamformerShaderParameters *sp = pb->pipeline.parameters + i;
		u32 shader = pb->pipeline.shaders[i];
		b32 commit = 0;

		iz match = 0;
		switch (shader) {
		case BeamformerShaderKind_CudaHilbert:{ commit = run_cuda_hilbert; }break;
		case BeamformerShaderKind_Decode:{
			/* TODO(rnp): rework decode first and demodulate after */
			BeamformerDataKind decode_data_kind = data_kind;
			if (!decode_first) {
				if (data_kind == BeamformerDataKind_Int16) {
					decode_data_kind = BeamformerDataKind_Int16Complex;
				} else {
					decode_data_kind = BeamformerDataKind_Float32Complex;
				}
			}
			i32 local_flags = 0;
			if (run_cuda_hilbert) local_flags |= BeamformerShaderDecodeFlags_DilateOutput;
			match = beamformer_shader_decode_match(decode_data_kind, local_flags);
			commit = 1;
		}break;
		case BeamformerShaderKind_Demodulate:{
			BeamformerFilter *f = cp->filters + sp->filter_slot;
			i32 local_flags = BeamformerShaderFilterFlags_Demodulate;
			if (f->parameters.complex) local_flags |= BeamformerShaderFilterFlags_ComplexFilter;
			if (!decode_first)         local_flags |= BeamformerShaderFilterFlags_MapChannels;

			BeamformerDataKind filter_data_kind = data_kind;
			if (decode_first)
				filter_data_kind = BeamformerDataKind_Float32;

			match = beamformer_shader_demodulate_match(filter_data_kind, pb->parameters.sampling_mode, local_flags);

			bp->time_offset += f->time_delay;
			commit = 1;
		}break;
		case BeamformerShaderKind_Filter:{
			BeamformerFilter *f = cp->filters + sp->filter_slot;
			i32 local_flags = 0;
			if (f->parameters.complex) local_flags |= BeamformerShaderFilterFlags_ComplexFilter;

			BeamformerDataKind filter_data_kind = data_kind;
			if (decode_first)
				filter_data_kind = BeamformerDataKind_Float32;

			match = beamformer_shader_filter_match(filter_data_kind, local_flags);
			bp->time_offset += f->time_delay;
			commit = 1;
		}break;
		case BeamformerShaderKind_DAS:{
			BeamformerDataKind das_data_kind = BeamformerDataKind_Float32;
			if (demodulate || run_cuda_hilbert)
				das_data_kind = BeamformerDataKind_Float32Complex;

			i32 local_flags = 0;
			if ((bp->shader_flags & BeamformerShaderDASFlags_CoherencyWeighting) == 0)
				local_flags |= BeamformerShaderDASFlags_Fast;
			if (bp->shader_kind == BeamformerDASKind_UFORCES || bp->shader_kind == BeamformerDASKind_UHERCULES)
				local_flags |= BeamformerShaderDASFlags_Sparse;
			if (pb->parameters.interpolate)
				local_flags |= BeamformerShaderDASFlags_Interpolate;

			match = beamformer_shader_das_match(das_data_kind, local_flags);
			commit = 1;
		}break;
		default:{
			match  = beamformer_shader_descriptors[shader].first_match_vector_index;
			commit = 1;
		}break;
		}

		if (commit) {
			u32 index = cp->pipeline.shader_count++;
			cp->pipeline.shaders[index]         = shader;
			cp->pipeline.program_indices[index] = (u32)match;
			cp->pipeline.parameters[index]      = *sp;
		}
	}
	cp->pipeline.data_kind = data_kind;

	u32 das_sample_stride   = 1;
	u32 das_transmit_stride = bp->sample_count;
	u32 das_channel_stride  = bp->acquisition_count * bp->sample_count;

	u32 decimation_rate = MAX(pb->parameters.decimation_rate, 1);
	if (demodulate) {
		das_channel_stride  /= (2 * decimation_rate);
		das_transmit_stride /= (2 * decimation_rate);
	}

	u32 input_sample_stride   = 1;
	u32 input_transmit_stride = bp->sample_count;
	u32 input_channel_stride  = pb->parameters.raw_data_dimensions[0];

	BeamformerDecodeUBO *dp = &cp->decode_ubo_data;
	dp->decode_mode    = pb->parameters.decode;
	dp->transmit_count = bp->acquisition_count;

	dp->input_sample_stride    = decode_first? input_sample_stride   : bp->acquisition_count;
	dp->input_channel_stride   = decode_first? input_channel_stride  : das_channel_stride;
	dp->input_transmit_stride  = decode_first? input_transmit_stride : 1;
	dp->output_sample_stride   = das_sample_stride;
	dp->output_channel_stride  = das_channel_stride;
	dp->output_transmit_stride = das_transmit_stride;
	if (decode_first) {
		dp->output_channel_stride  *= decimation_rate;
		dp->output_transmit_stride *= decimation_rate;
	}

	cp->decode_dispatch.x = (u32)ceil_f32((f32)bp->sample_count      / DECODE_LOCAL_SIZE_X);
	cp->decode_dispatch.y = (u32)ceil_f32((f32)bp->channel_count     / DECODE_LOCAL_SIZE_Y);
	cp->decode_dispatch.z = (u32)ceil_f32((f32)bp->acquisition_count / DECODE_LOCAL_SIZE_Z);

	/* NOTE(rnp): decode 2 samples per dispatch when data is i16 */
	if (decode_first && data_kind == BeamformerDataKind_Int16)
		cp->decode_dispatch.x = (u32)ceil_f32((f32)cp->decode_dispatch.x / 2);

	/* NOTE(rnp): when we are demodulating we pretend that the sampler was alternating
	 * between sampling the I portion and the Q portion of an IQ signal. Therefore there
	 * is an implicit decimation factor of 2 which must always be included. All code here
	 * assumes that the signal was sampled in such a way that supports this operation.
	 * To recover IQ[n] from the sampled data (RF[n]) we do the following:
	 *   I[n]  = RF[n]
	 *   Q[n]  = RF[n + 1]
	 *   IQ[n] = I[n] - j*Q[n]
	 */
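	/* NOTE: a short numeric example of the implicit factor of 2 (illustrative
	 * values): RF sampled at 40 MHz with decimation_rate = 2 becomes IQ data at
	 * 40 / (2 * 2) = 10 MHz, and sample_count shrinks by the same factor of 4;
	 * this is exactly the bp->sampling_frequency and bp->sample_count update
	 * performed below. */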
	if (demodulate) {
		BeamformerFilterUBO *mp    = &cp->demod_ubo_data;
		mp->demodulation_frequency = bp->demodulation_frequency;
		mp->sampling_frequency     = bp->sampling_frequency / 2;
		mp->decimation_rate        = decimation_rate;

		bp->sampling_frequency /= 2 * (f32)mp->decimation_rate;
		bp->sample_count       /= 2 * mp->decimation_rate;

		if (decode_first) {
			mp->input_channel_stride  = dp->output_channel_stride;
			mp->input_sample_stride   = dp->output_sample_stride;
			mp->input_transmit_stride = dp->output_transmit_stride;

			mp->output_channel_stride  = das_channel_stride;
			mp->output_sample_stride   = das_sample_stride;
			mp->output_transmit_stride = das_transmit_stride;
		} else {
			mp->input_channel_stride  = input_channel_stride  / 2;
			mp->input_sample_stride   = input_sample_stride;
			mp->input_transmit_stride = input_transmit_stride / 2;

			/* NOTE(rnp): output optimized layout for decoding */
			mp->output_channel_stride  = dp->input_channel_stride;
			mp->output_sample_stride   = dp->input_sample_stride;
			mp->output_transmit_stride = dp->input_transmit_stride;

			cp->decode_dispatch.x = (u32)ceil_f32((f32)bp->sample_count / DECODE_LOCAL_SIZE_X);
		}
	}

	/* TODO(rnp): filter may need a different dispatch layout */
	cp->demod_dispatch.x = (u32)ceil_f32((f32)bp->sample_count      / FILTER_LOCAL_SIZE_X);
	cp->demod_dispatch.y = (u32)ceil_f32((f32)bp->channel_count     / FILTER_LOCAL_SIZE_Y);
	cp->demod_dispatch.z = (u32)ceil_f32((f32)bp->acquisition_count / FILTER_LOCAL_SIZE_Z);

	cp->rf_size = bp->sample_count * bp->channel_count * bp->acquisition_count;
	if (demodulate || run_cuda_hilbert) cp->rf_size *= 8;
	else                                cp->rf_size *= 4;

	/* TODO(rnp): UBO per filter stage */
	BeamformerFilterUBO *flt = &cp->filter_ubo_data;
	flt->demodulation_frequency = bp->demodulation_frequency;
	flt->sampling_frequency     = bp->sampling_frequency;
	flt->decimation_rate        = 1;
	flt->output_channel_stride  = bp->sample_count * bp->acquisition_count;
	flt->output_sample_stride   = 1;
	flt->output_transmit_stride = bp->sample_count;
	flt->input_channel_stride   = bp->sample_count * bp->acquisition_count;
	flt->input_sample_stride    = 1;
	flt->input_transmit_stride  = bp->sample_count;
}
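
/* NOTE: worked stride example for plan_compute_pipeline() with illustrative
 * dimensions (sample_count = 4096, acquisition_count = 32, decode-first, no
 * demodulation): das_sample_stride = 1, das_transmit_stride = 4096, and
 * das_channel_stride = 32 * 4096 = 131072, i.e. decoded data is laid out
 * [channel][transmit][sample] with samples contiguous, which is the layout the
 * DAS shader indexes with these three strides. */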

function void
beamformer_commit_parameter_block(BeamformerCtx *ctx, BeamformerComputePlan *cp, u32 block, Arena arena)
{
	BeamformerParameterBlock *pb = beamformer_parameter_block_lock(&ctx->shared_memory, block, -1);
	for (u32 region = ctz_u32(pb->dirty_regions);
	     region != 32;
	     region = ctz_u32(pb->dirty_regions))
	{
		mark_parameter_block_region_clean(ctx->shared_memory.region, block, region);
		switch (region) {
		case BeamformerParameterBlockRegion_ComputePipeline:
		case BeamformerParameterBlockRegion_Parameters:
		{
			plan_compute_pipeline(cp, pb);

			/* NOTE(rnp): these are both handled by plan_compute_pipeline() */
			u32 mask = 1 << BeamformerParameterBlockRegion_ComputePipeline |
			           1 << BeamformerParameterBlockRegion_Parameters;
			pb->dirty_regions &= ~mask;

			#define X(k, t, v) glNamedBufferSubData(cp->ubos[BeamformerComputeUBOKind_##k], \
			                                        0, sizeof(t), &cp->v ## _ubo_data);
			BEAMFORMER_COMPUTE_UBO_LIST
			#undef X

			u32 decoded_data_size = cp->rf_size;
			if (ctx->compute_context.ping_pong_ssbo_size < decoded_data_size)
				alloc_shader_storage(ctx, decoded_data_size, arena);

			if (cp->hadamard_order != (i32)cp->das_ubo_data.acquisition_count)
				update_hadamard_texture(cp, (i32)cp->das_ubo_data.acquisition_count, arena);

			cp->min_coordinate = v3_from_f32_array(pb->parameters.output_min_coordinate);
			cp->max_coordinate = v3_from_f32_array(pb->parameters.output_max_coordinate);

			cp->output_points.E[0] = MAX(pb->parameters.output_points[0], 1);
			cp->output_points.E[1] = MAX(pb->parameters.output_points[1], 1);
			cp->output_points.E[2] = MAX(pb->parameters.output_points[2], 1);
			cp->average_frames     = pb->parameters.output_points[3];

			GLenum gl_kind = cp->iq_pipeline ? GL_RG32F : GL_R32F;
			if (cp->average_frames > 1 && !beamformer_frame_compatible(ctx->averaged_frames + 0, cp->output_points, gl_kind)) {
				alloc_beamform_frame(&ctx->gl, ctx->averaged_frames + 0, cp->output_points, gl_kind, s8("Averaged Frame"), arena);
				alloc_beamform_frame(&ctx->gl, ctx->averaged_frames + 1, cp->output_points, gl_kind, s8("Averaged Frame"), arena);
			}
		}break;
		case BeamformerParameterBlockRegion_ChannelMapping:
		case BeamformerParameterBlockRegion_FocalVectors:
		case BeamformerParameterBlockRegion_SparseElements:
		{
			BeamformerComputeTextureKind texture_kind = 0;
			u32 texture_type = 0, texture_format = 0;
			/* TODO(rnp): this whole thing could be a table */
			switch (region) {
			case BeamformerParameterBlockRegion_ChannelMapping:{
				texture_kind   = BeamformerComputeTextureKind_ChannelMapping;
				texture_type   = GL_SHORT;
				texture_format = GL_RED_INTEGER;
				/* TODO(rnp): cuda lib */
				cuda_set_channel_mapping(pb->channel_mapping);
			}break;
			case BeamformerParameterBlockRegion_FocalVectors:{
				texture_kind   = BeamformerComputeTextureKind_FocalVectors;
				texture_type   = GL_FLOAT;
				texture_format = GL_RG;
			}break;
			case BeamformerParameterBlockRegion_SparseElements:{
				texture_kind   = BeamformerComputeTextureKind_SparseElements;
				texture_type   = GL_SHORT;
				texture_format = GL_RED_INTEGER;
			}break;
			InvalidDefaultCase;
			}
			glTextureSubImage1D(cp->textures[texture_kind], 0, 0, BeamformerMaxChannelCount,
			                    texture_format, texture_type,
			                    (u8 *)pb + BeamformerParameterBlockRegionOffsets[region]);
		}break;
		}
	}
	beamformer_parameter_block_unlock(&ctx->shared_memory, block);
}
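
/* NOTE: the dirty_regions loop above is the standard count-trailing-zeros
 * bitmask walk. A minimal sketch of the idiom, assuming ctz_u32(0) == 32:
 *
 *   for (u32 bit = ctz_u32(mask); bit != 32; bit = ctz_u32(mask)) {
 *       mask &= ~(1u << bit);  // clear first, as
 *                              // mark_parameter_block_region_clean() does
 *       handle(bit);           // hypothetical per-region handler
 *   }
 */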

function void
do_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, BeamformerFrame *frame,
                  BeamformerShaderKind shader, u32 program_index, BeamformerShaderParameters *sp, Arena arena)
{
	BeamformerComputeContext *cc = &ctx->compute_context;

	i32 *match_vector = beamformer_shader_match_vectors[program_index];
	BeamformerShaderDescriptor *shader_descriptor = beamformer_shader_descriptors + shader;

	u32 program = cc->programs[program_index];
	glUseProgram(program);

	u32 output_ssbo_idx = !cc->last_output_ssbo_index;
	u32 input_ssbo_idx  = cc->last_output_ssbo_index;

	switch (shader) {
	case BeamformerShaderKind_Decode:{
		glBindBufferBase(GL_UNIFORM_BUFFER, 0, cp->ubos[BeamformerComputeUBOKind_Decode]);
		glBindImageTexture(0, cp->textures[BeamformerComputeTextureKind_Hadamard], 0, 0, 0, GL_READ_ONLY, GL_R8I);

		if (shader == cp->pipeline.shaders[0]) {
			glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, cc->ping_pong_ssbos[input_ssbo_idx]);
			glBindImageTexture(1, cp->textures[BeamformerComputeTextureKind_ChannelMapping], 0, 0, 0, GL_READ_ONLY, GL_R16I);
			glProgramUniform1ui(program, DECODE_FIRST_PASS_UNIFORM_LOC, 1);

			glDispatchCompute(cp->decode_dispatch.x, cp->decode_dispatch.y, cp->decode_dispatch.z);
			glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
		}

		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, cc->ping_pong_ssbos[input_ssbo_idx]);
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, cc->ping_pong_ssbos[output_ssbo_idx]);

		glProgramUniform1ui(program, DECODE_FIRST_PASS_UNIFORM_LOC, 0);

		glDispatchCompute(cp->decode_dispatch.x, cp->decode_dispatch.y, cp->decode_dispatch.z);
		glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);

		cc->last_output_ssbo_index = !cc->last_output_ssbo_index;
	}break;
	case BeamformerShaderKind_CudaDecode:{
		cuda_decode(0, output_ssbo_idx, 0);
		cc->last_output_ssbo_index = !cc->last_output_ssbo_index;
	}break;
	case BeamformerShaderKind_CudaHilbert:{
		cuda_hilbert(input_ssbo_idx, output_ssbo_idx);
		cc->last_output_ssbo_index = !cc->last_output_ssbo_index;
	}break;
	case BeamformerShaderKind_Filter:
	case BeamformerShaderKind_Demodulate:
	{
		i32 local_flags  = match_vector[shader_descriptor->match_vector_length];
		b32 map_channels = (local_flags & BeamformerShaderFilterFlags_MapChannels) != 0;

		u32 index = shader == BeamformerShaderKind_Filter ? BeamformerComputeUBOKind_Filter
		                                                  : BeamformerComputeUBOKind_Demodulate;
		glBindBufferBase(GL_UNIFORM_BUFFER,        0, cp->ubos[index]);
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, cc->ping_pong_ssbos[output_ssbo_idx]);

		if (!map_channels)
			glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, cc->ping_pong_ssbos[input_ssbo_idx]);

		GLenum kind = cp->filters[sp->filter_slot].parameters.complex? GL_RG32F : GL_R32F;
		glBindImageTexture(0, cp->filters[sp->filter_slot].texture, 0, 0, 0, GL_READ_ONLY, kind);

		if (map_channels)
			glBindImageTexture(1, cp->textures[BeamformerComputeTextureKind_ChannelMapping], 0, 0, 0, GL_READ_ONLY, GL_R16I);

		glDispatchCompute(cp->demod_dispatch.x, cp->demod_dispatch.y, cp->demod_dispatch.z);
		glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);

		cc->last_output_ssbo_index = !cc->last_output_ssbo_index;
	}break;
	case BeamformerShaderKind_MinMax:{
		for (i32 i = 1; i < frame->mips; i++) {
			glBindImageTexture(0, frame->texture, i - 1, GL_TRUE, 0, GL_READ_ONLY,  GL_RG32F);
			glBindImageTexture(1, frame->texture, i - 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
			glProgramUniform1i(cc->programs[shader], MIN_MAX_MIPS_LEVEL_UNIFORM_LOC, i);

			u32 width  = (u32)frame->dim.x >> i;
			u32 height = (u32)frame->dim.y >> i;
			u32 depth  = (u32)frame->dim.z >> i;
			glDispatchCompute(ORONE(width / 32), ORONE(height), ORONE(depth / 32));
			glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
		}
	}break;
	case BeamformerShaderKind_DAS:{
		local_persist u32 das_cycle_t = 0;

		BeamformerDASUBO *ubo = &cp->das_ubo_data;

		i32 local_flags = match_vector[shader_descriptor->match_vector_length];
		b32 fast        = (local_flags & BeamformerShaderDASFlags_Fast)   != 0;
		b32 sparse      = (local_flags & BeamformerShaderDASFlags_Sparse) != 0;

		if (fast) {
			glClearTexImage(frame->texture, 0, GL_RED, GL_FLOAT, 0);
			glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT);
			glBindImageTexture(0, frame->texture, 0, GL_TRUE, 0, GL_READ_WRITE, cp->iq_pipeline ? GL_RG32F : GL_R32F);
		} else {
			glBindImageTexture(0, frame->texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, cp->iq_pipeline ? GL_RG32F : GL_R32F);
		}

		u32 sparse_texture = cp->textures[BeamformerComputeTextureKind_SparseElements];
		if (!sparse) sparse_texture = 0;

		glBindBufferBase(GL_UNIFORM_BUFFER, 0, cp->ubos[BeamformerComputeUBOKind_DAS]);
		glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, cc->ping_pong_ssbos[input_ssbo_idx], 0, cp->rf_size);
		glBindImageTexture(1, sparse_texture, 0, 0, 0, GL_READ_ONLY, GL_R16I);
		glBindImageTexture(2, cp->textures[BeamformerComputeTextureKind_FocalVectors], 0, 0, 0, GL_READ_ONLY, GL_RG32F);

		glProgramUniform1ui(program, DAS_CYCLE_T_UNIFORM_LOC, das_cycle_t++);

		if (fast) {
			i32 loop_end;
			if (ubo->shader_kind == BeamformerDASKind_RCA_VLS ||
			    ubo->shader_kind == BeamformerDASKind_RCA_TPW)
			{
				/* NOTE(rnp): to avoid repeatedly sampling the whole focal vectors
				 * texture we loop over transmits for VLS/TPW */
				loop_end = (i32)ubo->acquisition_count;
			} else {
				loop_end = (i32)ubo->channel_count;
			}
			f32 percent_per_step = 1.0f / (f32)loop_end;
			cc->processing_progress = -percent_per_step;
			for (i32 index = 0; index < loop_end; index++) {
				cc->processing_progress += percent_per_step;
				/* IMPORTANT(rnp): prevents OS from coalescing and killing our shader */
				glFinish();
				glProgramUniform1i(program, DAS_FAST_CHANNEL_UNIFORM_LOC, index);
				glDispatchCompute((u32)ceil_f32((f32)frame->dim.x / DAS_LOCAL_SIZE_X),
				                  (u32)ceil_f32((f32)frame->dim.y / DAS_LOCAL_SIZE_Y),
				                  (u32)ceil_f32((f32)frame->dim.z / DAS_LOCAL_SIZE_Z));
				glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
			}
		} else {
			#if 1
			/* TODO(rnp): compute max_points_per_dispatch based on something like a
			 * transmit_count * channel_count product */
			u32 max_points_per_dispatch = KB(64);
			struct compute_cursor cursor = start_compute_cursor(frame->dim, max_points_per_dispatch);
			f32 percent_per_step = (f32)cursor.points_per_dispatch / (f32)cursor.total_points;
			cc->processing_progress = -percent_per_step;
			for (iv3 offset = {0};
			     !compute_cursor_finished(&cursor);
			     offset = step_compute_cursor(&cursor))
			{
				cc->processing_progress += percent_per_step;
				/* IMPORTANT(rnp): prevents OS from coalescing and killing our shader */
				glFinish();
				glProgramUniform3iv(program, DAS_VOXEL_OFFSET_UNIFORM_LOC, 1, offset.E);
				glDispatchCompute(cursor.dispatch.x, cursor.dispatch.y, cursor.dispatch.z);
			}
			#else
			/* NOTE(rnp): use this for testing tiling code. The performance of the above path
			 * should be the same as this path if everything is working correctly */
			iv3 compute_dim_offset = {0};
			glProgramUniform3iv(program, DAS_VOXEL_OFFSET_UNIFORM_LOC, 1, compute_dim_offset.E);
			glDispatchCompute((u32)ceil_f32((f32)frame->dim.x / DAS_LOCAL_SIZE_X),
			                  (u32)ceil_f32((f32)frame->dim.y / DAS_LOCAL_SIZE_Y),
			                  (u32)ceil_f32((f32)frame->dim.z / DAS_LOCAL_SIZE_Z));
			#endif
		}
		glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT|GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
	}break;
	case BeamformerShaderKind_Sum:{
		u32 aframe_index = ctx->averaged_frame_index % ARRAY_COUNT(ctx->averaged_frames);
		BeamformerFrame *aframe = ctx->averaged_frames + aframe_index;
		aframe->id              = ctx->averaged_frame_index;
		atomic_store_u32(&aframe->ready_to_present, 0);
		/* TODO(rnp): hack we need a better way of specifying which frames to sum;
		 * this is fine for rolling averaging but what if we want to do something else */
		assert(frame >= ctx->beamform_frames);
		assert(frame < ctx->beamform_frames + countof(ctx->beamform_frames));
		u32 base_index   = (u32)(frame - ctx->beamform_frames);
		u32 to_average   = (u32)cp->average_frames;
		u32 frame_count  = 0;
		u32 *in_textures = push_array(&arena, u32, BeamformerMaxSavedFrames);
		ComputeFrameIterator cfi = compute_frame_iterator(ctx, 1 + base_index - to_average, to_average);
		for (BeamformerFrame *it = frame_next(&cfi); it; it = frame_next(&cfi))
			in_textures[frame_count++] = it->texture;

		assert(to_average == frame_count);

		do_sum_shader(cc, in_textures, frame_count, 1 / (f32)frame_count, aframe->texture, aframe->dim);
		aframe->min_coordinate  = frame->min_coordinate;
		aframe->max_coordinate  = frame->max_coordinate;
		aframe->compound_count  = frame->compound_count;
		aframe->das_kind        = frame->das_kind;
	}break;
	InvalidDefaultCase;
	}
}

function void
stream_push_shader_header(Stream *s, ShaderReloadContext *ctx)
{
	BeamformerReloadableShaderInfo *rsi = beamformer_reloadable_shader_infos + ctx->reloadable_info_index;

	stream_append_s8s(s, s8("#version 460 core\n\n"), ctx->header);

	switch (rsi->kind) {
	case BeamformerShaderKind_Filter:{
		stream_append_s8(s, s8(""
		"layout(local_size_x = " str(FILTER_LOCAL_SIZE_X) ", "
		       "local_size_y = " str(FILTER_LOCAL_SIZE_Y) ", "
		       "local_size_z = " str(FILTER_LOCAL_SIZE_Z) ") in;\n\n"
		));
	}break;
	case BeamformerShaderKind_DAS:{
		stream_append_s8(s, s8(""
		"layout(local_size_x = " str(DAS_LOCAL_SIZE_X) ", "
		       "local_size_y = " str(DAS_LOCAL_SIZE_Y) ", "
		       "local_size_z = " str(DAS_LOCAL_SIZE_Z) ") in;\n\n"
		"layout(location = " str(DAS_VOXEL_OFFSET_UNIFORM_LOC) ") uniform ivec3 u_voxel_offset;\n"
		"layout(location = " str(DAS_CYCLE_T_UNIFORM_LOC)      ") uniform uint  u_cycle_t;\n"
		"layout(location = " str(DAS_FAST_CHANNEL_UNIFORM_LOC) ") uniform int   u_channel;\n\n"
		));

		#define X(k, id, ...) "#define ShaderKind_" #k " " #id "\n"
		stream_append_s8s(s, s8(DAS_SHADER_KIND_LIST), s8("\n"));
		#undef X
	}break;
	case BeamformerShaderKind_Decode:{
		stream_append_s8s(s, s8(""
		"layout(local_size_x = " str(DECODE_LOCAL_SIZE_X) ", "
		       "local_size_y = " str(DECODE_LOCAL_SIZE_Y) ", "
		       "local_size_z = " str(DECODE_LOCAL_SIZE_Z) ") in;\n\n"
		"layout(location = " str(DECODE_FIRST_PASS_UNIFORM_LOC) ") uniform bool u_first_pass;\n\n"
		));
	}break;
	case BeamformerShaderKind_MinMax:{
		stream_append_s8(s, s8("layout(location = " str(MIN_MAX_MIPS_LEVEL_UNIFORM_LOC)
		                       ") uniform int u_mip_map;\n\n"));
	}break;
	case BeamformerShaderKind_Sum:{
		stream_append_s8(s, s8("layout(location = " str(SUM_PRESCALE_UNIFORM_LOC)
		                       ") uniform float u_sum_prescale = 1.0;\n\n"));
	}break;
	default:{}break;
	}
}
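
/* NOTE: example of the preamble the function above emits for the Decode shader,
 * reconstructed from the format strings and macro values (ctx->header omitted):
 *
 *   #version 460 core
 *
 *   layout(local_size_x = 4, local_size_y = 1, local_size_z = 16) in;
 *
 *   layout(location = 1) uniform bool u_first_pass;
 *
 * shader_text_with_header() below then appends "#line 1" so GLSL compile errors
 * report line numbers from the on-disk shader source. */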

function s8
shader_text_with_header(ShaderReloadContext *ctx, s8 filepath, Arena *arena)
{
	Stream sb = arena_stream(*arena);
	stream_push_shader_header(&sb, ctx);
	stream_append_s8(&sb, s8("\n#line 1\n"));

	s8 result = arena_stream_commit(arena, &sb);
	if (filepath.len > 0) {
		s8 file = os_read_whole_file(arena, (c8 *)filepath.data);
		assert(file.data == result.data + result.len);
		result.len += file.len;
	}

	return result;
}

/* NOTE(rnp): currently this function only handles rendering shaders;
 * see reload_compute_shader() for compute shaders */
DEBUG_EXPORT BEAMFORMER_RELOAD_SHADER_FN(beamformer_reload_shader)
{
	BeamformerCtx *ctx = src->beamformer_context;

	i32 shader_count = 1;
	ShaderReloadContext *link = src->link;
	while (link != src) { shader_count++; link = link->link; }

	s8  *shader_texts = push_array(&arena, s8,  shader_count);
	u32 *shader_types = push_array(&arena, u32, shader_count);

	i32 index = 0;
	do {
		s8 filepath = {0};
		if (link->reloadable_info_index >= 0) filepath = path;
		shader_texts[index] = shader_text_with_header(link, filepath, &arena);
		shader_types[index] = link->gl_type;
		index++;
		link = link->link;
	} while (link != src);

	BeamformerReloadableShaderInfo *rsi = beamformer_reloadable_shader_infos + src->reloadable_info_index;
	assert(rsi->kind == BeamformerShaderKind_Render3D);

	u32 *shader = &ctx->frame_view_render_context.shader;
	glDeleteProgram(*shader);
	*shader = load_shader(&ctx->os, arena, shader_texts, shader_types, shader_count, shader_name);
	ctx->frame_view_render_context.updated = 1;

	return 1;
}

function void
reload_compute_shader(BeamformerCtx *ctx, ShaderReloadContext *src, Arena arena)
{
	BeamformerComputeContext       *cc  = &ctx->compute_context;
	BeamformerReloadableShaderInfo *rsi = beamformer_reloadable_shader_infos + src->reloadable_info_index;
	BeamformerShaderDescriptor     *sd  = beamformer_shader_descriptors + rsi->kind;

	Stream status = stream_alloc(&arena, 128);
	u32 completed     = 0;
	u32 total_shaders = (u32)(sd->one_past_last_match_vector_index - sd->first_match_vector_index);
	for (i32 i = 0; i < rsi->sub_shader_descriptor_index_count; i++) {
		BeamformerShaderDescriptor *ssd  = beamformer_shader_descriptors + rsi->sub_shader_descriptor_indices[i];
		total_shaders += (u32)(ssd->one_past_last_match_vector_index - ssd->first_match_vector_index);
	}

	s8 path = push_s8_from_parts(&arena, ctx->os.path_separator, s8("shaders"),
	                             beamformer_reloadable_shader_files[src->reloadable_info_index]);
	s8 file_text  = os_read_whole_file(&arena, (c8 *)path.data);
	Stream shader = arena_stream(arena);

	stream_push_shader_header(&shader, src);

	stream_append_s8(&shader, beamformer_shader_local_header_strings[src->reloadable_info_index]);

	i32 save_point = shader.widx;
	for (i32 sub_index = -1; sub_index < rsi->sub_shader_descriptor_index_count; sub_index++) {
		shader.widx = save_point;

		if (sub_index != -1)
			sd = beamformer_shader_descriptors + rsi->sub_shader_descriptor_indices[sub_index];

		i32 *hvector = beamformer_shader_header_vectors[sd - beamformer_shader_descriptors];
		for (i32 index = 0; index < sd->header_vector_length; index++)
			stream_append_s8s(&shader, beamformer_shader_global_header_strings[hvector[index]], s8("\n"));

		i32 instance_save_point = shader.widx;
		arena_commit(&arena, instance_save_point);
		TempArena arena_save = begin_temp_arena(&arena);

		for (i32 instance = sd->first_match_vector_index;
		     instance < sd->one_past_last_match_vector_index;
		     instance++)
		{
			shader.widx = instance_save_point;
			end_temp_arena(arena_save);

			i32 *match_vector = beamformer_shader_match_vectors[instance];
			for (i32 index = 0; index < sd->match_vector_length; index++) {
				stream_append_s8s(&shader, s8("#define "), beamformer_shader_descriptor_header_strings[hvector[index]], s8(" ("));
				stream_append_i64(&shader, match_vector[index]);
				stream_append_s8(&shader, s8(")\n"));
			}

			if (sd->has_local_flags) {
				stream_append_s8(&shader, s8("#define ShaderFlags (0x"));
				stream_append_hex_u64(&shader, (u64)match_vector[sd->match_vector_length]);
				stream_append_s8(&shader, s8(")\n"));
			}

			stream_append_s8s(&shader, s8("\n#line 1\n"), file_text);

			arena_commit(&arena, shader.widx - instance_save_point);

			s8 shader_text = stream_to_s8(&shader);
			/* TODO(rnp): instance name */
			s8 shader_name = beamformer_shader_names[rsi->kind];
			glDeleteProgram(cc->programs[instance]);
			cc->programs[instance] = load_shader(&ctx->os, arena, &shader_text, &src->gl_type, 1, shader_name);

			status.widx = 0;
			stream_append_s8s(&status, s8("\r\x1b[2Kloaded shader "), shader_name, s8(": ["));
			stream_append_u64(&status, ++completed);
			stream_append_s8s(&status, s8("/"));
			stream_append_u64(&status, total_shaders);
			stream_append_s8s(&status, s8("]"));
			os_write_file(ctx->os.error_handle, stream_to_s8(&status));
		}
	}
	os_write_file(ctx->os.error_handle, s8("\n"));
}
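
/* NOTE: a sketch of the per-instance preamble generated by the loop above; for
 * one Decode permutation the emitted text looks roughly like (the define names
 * are illustrative stand-ins for the header string tables):
 *
 *   #define InputDataKind (2)
 *   #define ShaderFlags (0x1)
 *
 *   #line 1
 *   ...decode.glsl contents...
 *
 * so a single GLSL source file is compiled once per entry in its match-vector
 * range, yielding one specialized program per data-kind/flag combination. */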
   1126 
   1127 function void
   1128 complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena, iptr gl_context)
   1129 {
   1130 	BeamformerComputeContext *cs = &ctx->compute_context;
   1131 	BeamformerSharedMemory   *sm = ctx->shared_memory.region;
   1132 
   1133 	BeamformWork *work = beamform_work_queue_pop(q);
   1134 	while (work) {
   1135 		b32 can_commit = 1;
   1136 		switch (work->kind) {
   1137 		case BeamformerWorkKind_ReloadShader:{
   1138 			reload_compute_shader(ctx, work->shader_reload_context, *arena);
   1139 			if (ctx->latest_frame && !sm->live_imaging_parameters.active) {
   1140 				fill_frame_compute_work(ctx, work, ctx->latest_frame->view_plane_tag, 0, 0);
   1141 				can_commit = 0;
   1142 			}
   1143 		}break;
   1144 		case BeamformerWorkKind_ExportBuffer:{
   1145 			/* TODO(rnp): better way of handling DispatchCompute barrier */
   1146 			post_sync_barrier(&ctx->shared_memory, BeamformerSharedMemoryLockKind_DispatchCompute, sm->locks);
   1147 			os_shared_memory_region_lock(&ctx->shared_memory, sm->locks, (i32)work->lock, (u32)-1);
   1148 			BeamformerExportContext *ec = &work->export_context;
   1149 			switch (ec->kind) {
   1150 			case BeamformerExportKind_BeamformedData:{
   1151 				BeamformerFrame *frame = ctx->latest_frame;
   1152 				if (frame) {
   1153 					assert(frame->ready_to_present);
   1154 					u32 texture  = frame->texture;
   1155 					iv3 dim      = frame->dim;
   1156 					u32 out_size = (u32)dim.x * (u32)dim.y * (u32)dim.z * 2 * sizeof(f32);
   1157 					if (out_size <= ec->size) {
   1158 						glGetTextureImage(texture, 0, GL_RG, GL_FLOAT, (i32)out_size,
   1159 						                  beamformer_shared_memory_scratch_arena(sm).beg);
   1160 					}
   1161 				}
   1162 			}break;
   1163 			case BeamformerExportKind_Stats:{
   1164 				ComputeTimingTable *table = ctx->compute_timing_table;
   1165 				/* NOTE(rnp): do a little spin to let this finish updating */
   1166 				while (table->write_index != atomic_load_u32(&table->read_index));
   1167 				ComputeShaderStats *stats = ctx->compute_shader_stats;
   1168 				if (sizeof(stats->table) <= ec->size)
   1169 					mem_copy(beamformer_shared_memory_scratch_arena(sm).beg, &stats->table, sizeof(stats->table));
   1170 			}break;
   1171 			InvalidDefaultCase;
   1172 			}
   1173 			os_shared_memory_region_unlock(&ctx->shared_memory, sm->locks, (i32)work->lock);
   1174 			post_sync_barrier(&ctx->shared_memory, BeamformerSharedMemoryLockKind_ExportSync, sm->locks);
   1175 		}break;
   1176 		case BeamformerWorkKind_CreateFilter:{
   1177 			/* TODO(rnp): this should probably get deleted and moved to lazy loading */
   1178 			BeamformerCreateFilterContext *fctx = &work->create_filter_context;
   1179 			u32 block = fctx->parameter_block;
   1180 			u32 slot  = fctx->filter_slot;
   1181 			BeamformerComputePlan *cp = beamformer_compute_plan_for_block(cs, block, arena);
   1182 			beamformer_filter_update(cp->filters + slot, fctx->kind, fctx->parameters, block, slot, *arena);
   1183 		}break;
   1184 		case BeamformerWorkKind_ComputeIndirect:{
   1185 			fill_frame_compute_work(ctx, work, work->compute_indirect_context.view_plane,
   1186 			                        work->compute_indirect_context.parameter_block, 1);
   1187 		} /* FALLTHROUGH */
   1188 		case BeamformerWorkKind_Compute:{
   1189 			DEBUG_DECL(glClearNamedBufferData(cs->ping_pong_ssbos[0], GL_RG32F, GL_RG, GL_FLOAT, 0);)
   1190 			DEBUG_DECL(glClearNamedBufferData(cs->ping_pong_ssbos[1], GL_RG32F, GL_RG, GL_FLOAT, 0);)
   1191 			DEBUG_DECL(glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);)
   1192 
   1193 			push_compute_timing_info(ctx->compute_timing_table,
   1194 			                         (ComputeTimingInfo){.kind = ComputeTimingInfoKind_ComputeFrameBegin});
   1195 
   1196 			BeamformerComputePlan *cp = beamformer_compute_plan_for_block(cs, work->compute_context.parameter_block, arena);
   1197 			if (beamformer_parameter_block_dirty(sm, work->compute_context.parameter_block)) {
   1198 				u32 block = work->compute_context.parameter_block;
   1199 				beamformer_commit_parameter_block(ctx, cp, block, *arena);
   1200 				atomic_store_u32(&ctx->ui_dirty_parameter_blocks, (u32)(ctx->beamform_work_queue != q) << block);
   1201 			}
   1202 
   1203 			post_sync_barrier(&ctx->shared_memory, work->lock, sm->locks);
   1204 
   1205 			atomic_store_u32(&cs->processing_compute, 1);
   1206 			start_renderdoc_capture(gl_context);
   1207 
   1208 			BeamformerFrame *frame = work->compute_context.frame;
   1209 
   1210 			GLenum gl_kind = cp->iq_pipeline ? GL_RG32F : GL_R32F;
   1211 			if (!beamformer_frame_compatible(frame, cp->output_points, gl_kind))
   1212 				alloc_beamform_frame(&ctx->gl, frame, cp->output_points, gl_kind, s8("Beamformed_Data"), *arena);
   1213 
   1214 			frame->min_coordinate  = cp->min_coordinate;
   1215 			frame->max_coordinate  = cp->max_coordinate;
   1216 			frame->das_kind        = cp->das_ubo_data.shader_kind;
   1217 			frame->compound_count  = cp->das_ubo_data.acquisition_count;
   1218 
   1219 			BeamformerComputeContext  *cc       = &ctx->compute_context;
   1220 			BeamformerComputePipeline *pipeline = &cp->pipeline;
   1221 			/* NOTE(rnp): first stage requires direct access to the raw data buffer so we break
   1222 			 * it out into a separate step. this way the data can be released as soon as possible */
   1223 			if (pipeline->shader_count > 0) {
   1224 				BeamformerRFBuffer *rf = &cs->rf_buffer;
   1225 				u32 slot = rf->compute_index % countof(rf->compute_syncs);
   1226 
   1227 			/* NOTE(rnp): compute indirect is used when uploading data. in this case the thread
   1228 			 * must wait on the upload fence; if the fence doesn't exist yet it must spin until it does */
   1229 				if (work->kind == BeamformerWorkKind_ComputeIndirect)
   1230 					spin_wait(!atomic_load_u64(rf->upload_syncs + slot));
   1231 
   1232 				if (rf->upload_syncs[slot]) {
   1233 					rf->compute_index++;
   1234 					glWaitSync(rf->upload_syncs[slot], 0, GL_TIMEOUT_IGNORED);
   1235 					glDeleteSync(rf->upload_syncs[slot]);
   1236 				} else {
   1237 					slot = (rf->compute_index - 1) % countof(rf->compute_syncs);
   1238 				}
   1239 
   1240 				glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, rf->ssbo, slot * rf->active_rf_size, rf->active_rf_size);
   1241 
   1242 				glBeginQuery(GL_TIME_ELAPSED, cc->shader_timer_ids[0]);
   1243 				do_compute_shader(ctx, cp, frame, pipeline->shaders[0], pipeline->program_indices[0],
   1244 				                  pipeline->parameters + 0, *arena);
   1245 				glEndQuery(GL_TIME_ELAPSED);
   1246 
   1247 				if (work->kind == BeamformerWorkKind_ComputeIndirect) {
   1248 					rf->compute_syncs[slot] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
   1249 					rf->upload_syncs[slot]  = 0;
   1250 					memory_write_barrier();
   1251 				}
   1252 			}
   1253 
   1254 			b32 did_sum_shader = 0;
   1255 			for (u32 i = 1; i < pipeline->shader_count; i++) {
   1256 				did_sum_shader |= pipeline->shaders[i] == BeamformerShaderKind_Sum;
   1257 				glBeginQuery(GL_TIME_ELAPSED, cc->shader_timer_ids[i]);
   1258 				do_compute_shader(ctx, cp, frame, pipeline->shaders[i], pipeline->program_indices[i],
   1259 				                  pipeline->parameters + i, *arena);
   1260 				glEndQuery(GL_TIME_ELAPSED);
   1261 			}
   1262 
   1263 			/* NOTE(rnp): the first of these queries blocks until the GPU work completes */
   1264 			for (u32 i = 0; i < pipeline->shader_count; i++) {
   1265 				ComputeTimingInfo info = {0};
   1266 				info.kind   = ComputeTimingInfoKind_Shader;
   1267 				info.shader = pipeline->shaders[i];
   1268 				glGetQueryObjectui64v(cc->shader_timer_ids[i], GL_QUERY_RESULT, &info.timer_count);
   1269 				push_compute_timing_info(ctx->compute_timing_table, info);
   1270 			}
   1271 			cs->processing_progress = 1;
   1272 
   1273 			frame->ready_to_present = 1;
   1274 			if (did_sum_shader) {
   1275 				u32 aframe_index = ((ctx->averaged_frame_index++) % countof(ctx->averaged_frames));
   1276 				ctx->averaged_frames[aframe_index].view_plane_tag  = frame->view_plane_tag;
   1277 				ctx->averaged_frames[aframe_index].ready_to_present = 1;
   1278 				atomic_store_u64((u64 *)&ctx->latest_frame, (u64)(ctx->averaged_frames + aframe_index));
   1279 			} else {
   1280 				atomic_store_u64((u64 *)&ctx->latest_frame, (u64)frame);
   1281 			}
   1282 			cs->processing_compute  = 0;
   1283 
   1284 			push_compute_timing_info(ctx->compute_timing_table,
   1285 			                         (ComputeTimingInfo){.kind = ComputeTimingInfoKind_ComputeFrameEnd});
   1286 
   1287 			end_renderdoc_capture(gl_context);
   1288 		}break;
   1289 		InvalidDefaultCase;
   1290 		}
   1291 
   1292 		if (can_commit) {
   1293 			beamform_work_queue_pop_commit(q);
   1294 			work = beamform_work_queue_pop(q);
   1295 		}
   1296 	}
   1297 }
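
/* NOTE: illustrative only -- complete_queue() above relies on a pop/commit split:
 * beamform_work_queue_pop() peeks at the oldest entry without releasing its slot and
 * beamform_work_queue_pop_commit() retires it, so clearing can_commit (as the
 * ReloadShader case does) recycles the same slot with rewritten contents. below is a
 * minimal single-consumer sketch of that shape; it is not the real queue API. */
#if 0
typedef struct {
	int items[8];
	u32 write_index;
	u32 read_index;
} ExampleWorkQueue;

function int *
example_work_queue_pop(ExampleWorkQueue *q)
{
	/* peek at the oldest entry; the slot stays reserved until commit */
	int *result = 0;
	if (atomic_load_u32(&q->write_index) != q->read_index)
		result = q->items + (q->read_index % countof(q->items));
	return result;
}

function void
example_work_queue_pop_commit(ExampleWorkQueue *q)
{
	/* release the slot back to the producer */
	atomic_add_u32(&q->read_index, 1);
}
#endif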
   1298 
   1299 function void
   1300 coalesce_timing_table(ComputeTimingTable *t, ComputeShaderStats *stats)
   1301 {
   1302 	/* TODO(rnp): we do not currently do anything to handle the potential for a half-written
   1303 	 * info item. this could result in garbage entries but they shouldn't really matter */
   1304 
   1305 	u32 target = atomic_load_u32(&t->write_index);
   1306 	u32 stats_index = (stats->latest_frame_index + 1) % countof(stats->table.times);
   1307 
   1308 	static_assert(BeamformerShaderKind_Count + 1 <= 32, "timing coalescence bitfield test");
   1309 	u32 seen_info_test = 0;
   1310 
   1311 	while (t->read_index != target) {
   1312 		ComputeTimingInfo info = t->buffer[t->read_index % countof(t->buffer)];
   1313 		switch (info.kind) {
   1314 		case ComputeTimingInfoKind_ComputeFrameBegin:{
   1315 			assert(t->compute_frame_active == 0);
   1316 			t->compute_frame_active = 1;
   1317 			/* NOTE(rnp): allow multiple instances of the same shader to accumulate */
   1318 			mem_clear(stats->table.times[stats_index], 0, sizeof(stats->table.times[stats_index]));
   1319 		}break;
   1320 		case ComputeTimingInfoKind_ComputeFrameEnd:{
   1321 			assert(t->compute_frame_active == 1);
   1322 			t->compute_frame_active = 0;
   1323 			stats->latest_frame_index = stats_index;
   1324 			stats_index = (stats_index + 1) % countof(stats->table.times);
   1325 		}break;
   1326 		case ComputeTimingInfoKind_Shader:{
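			/* GL timer queries report nanoseconds; accumulate in seconds */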
   1327 			stats->table.times[stats_index][info.shader] += (f32)info.timer_count / 1.0e9f;
   1328 			seen_info_test |= (1u << info.shader);
   1329 		}break;
   1330 		case ComputeTimingInfoKind_RF_Data:{
   1331 			stats->latest_rf_index = (stats->latest_rf_index + 1) % countof(stats->table.rf_time_deltas);
   1332 			f32 delta = (f32)(info.timer_count - stats->last_rf_timer_count) / 1.0e9f;
   1333 			stats->table.rf_time_deltas[stats->latest_rf_index] = delta;
   1334 			stats->last_rf_timer_count = info.timer_count;
   1335 			seen_info_test |= (1u << BeamformerShaderKind_Count);
   1336 		}break;
   1337 		}
   1338 		/* NOTE(rnp): do this at the end so that the stats table is always in a consistent state */
   1339 		atomic_add_u32(&t->read_index, 1);
   1340 	}
   1341 
   1342 	if (seen_info_test) {
   1343 		for EachEnumValue(BeamformerShaderKind, shader) {
   1344 			if (seen_info_test & (1u << shader)) {
   1345 				f32 sum = 0;
   1346 				for EachElement(stats->table.times, i)
   1347 					sum += stats->table.times[i][shader];
   1348 				stats->average_times[shader] = sum / countof(stats->table.times);
   1349 			}
   1350 		}
   1351 
   1352 		if (seen_info_test & (1u << BeamformerShaderKind_Count)) {
   1353 			f32 sum = 0;
   1354 			for EachElement(stats->table.rf_time_deltas, i)
   1355 				sum += stats->table.rf_time_deltas[i];
   1356 			stats->rf_time_delta_average = sum / countof(stats->table.rf_time_deltas);
   1357 		}
   1358 	}
   1359 }
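
/* NOTE: illustrative only -- the averages above are plain means recomputed over the
 * whole circular history each pass rather than maintained incrementally; with a
 * fixed, small history this costs little and avoids drift. a standalone sketch of
 * the same computation (EXAMPLE_HISTORY_COUNT is made up for illustration): */
#if 0
#define EXAMPLE_HISTORY_COUNT 32
function f32
example_ring_mean(f32 samples[EXAMPLE_HISTORY_COUNT])
{
	f32 sum = 0;
	for (u32 i = 0; i < EXAMPLE_HISTORY_COUNT; i++)
		sum += samples[i];
	return sum / (f32)EXAMPLE_HISTORY_COUNT;
}
#endif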
   1360 
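/* NOTE: drains both work queues: the external queue in shared memory first, then the internal one */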
   1361 DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute)
   1362 {
   1363 	BeamformerCtx *ctx         = (BeamformerCtx *)user_context;
   1364 	BeamformerSharedMemory *sm = ctx->shared_memory.region;
   1365 	complete_queue(ctx, &sm->external_work_queue, arena, gl_context);
   1366 	complete_queue(ctx, ctx->beamform_work_queue, arena, gl_context);
   1367 }
   1368 
   1369 function void
   1370 beamformer_rf_buffer_allocate(BeamformerRFBuffer *rf, u32 rf_size, Arena arena)
   1371 {
   1372 	assert((rf_size % 64) == 0);
   1373 	glUnmapNamedBuffer(rf->ssbo);
   1374 	glDeleteBuffers(1, &rf->ssbo);
   1375 	glCreateBuffers(1, &rf->ssbo);
   1376 
   1377 	glNamedBufferStorage(rf->ssbo, countof(rf->compute_syncs) * rf_size, 0,
   1378 	                     GL_DYNAMIC_STORAGE_BIT|GL_MAP_WRITE_BIT);
   1379 	LABEL_GL_OBJECT(GL_BUFFER, rf->ssbo, s8("Raw_RF_SSBO"));
   1380 	rf->size = rf_size;
   1381 }
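
/* NOTE: illustrative only -- the 64 byte granularity asserted above matches the
 * round_up_to(..., 64) call in beamformer_rf_upload(). for a power-of-two alignment
 * that rounding is typically implemented as below; the real helper may differ. */
#if 0
function u32
example_round_up_to(u32 value, u32 alignment /* must be a power of two */)
{
	return (value + alignment - 1) & ~(alignment - 1);
}
#endif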
   1382 
   1383 DEBUG_EXPORT BEAMFORMER_RF_UPLOAD_FN(beamformer_rf_upload)
   1384 {
   1385 	BeamformerSharedMemory *sm = ctx->shared_memory->region;
   1386 
   1387 	BeamformerSharedMemoryLockKind scratch_lock = BeamformerSharedMemoryLockKind_ScratchSpace;
   1388 	BeamformerSharedMemoryLockKind upload_lock  = BeamformerSharedMemoryLockKind_UploadRF;
   1389 	if (sm->locks[upload_lock] &&
   1390 	    os_shared_memory_region_lock(ctx->shared_memory, sm->locks, (i32)scratch_lock, (u32)-1))
   1391 	{
   1392 		BeamformerRFBuffer *rf = ctx->rf_buffer;
   1393 		rf->active_rf_size = (u32)round_up_to(sm->scratch_rf_size, 64);
   1394 		if (rf->size < rf->active_rf_size)
   1395 			beamformer_rf_buffer_allocate(rf, rf->active_rf_size, arena);
   1396 
   1397 		u32 slot = rf->insertion_index++ % countof(rf->compute_syncs);
   1398 
   1399 		/* NOTE(rnp): if the rest of the code is functioning then the first
   1400 		 * time the compute thread processes an upload it must have gone through
   1401 		 * this path. therefore it is safe to spin until the pending upload in this slot is consumed */
   1402 		spin_wait(atomic_load_u64(rf->upload_syncs + slot));
   1403 
   1404 		if (rf->compute_syncs[slot]) {
   1405 			GLenum sync_result = glClientWaitSync(rf->compute_syncs[slot], 0, 1000000000);
   1406 			if (sync_result == GL_TIMEOUT_EXPIRED || sync_result == GL_WAIT_FAILED) {
   1407 				/* TODO(rnp): what should we do here? */
   1408 			}
   1409 			glDeleteSync(rf->compute_syncs[slot]);
   1410 		}
   1411 
   1412 		/* NOTE(rnp): nVidia's drivers really don't play nice with persistent mapping,
   1413 		 * at least when it is as big as this one wants to be. mapping and unmapping the
   1414 		 * desired range each time doesn't seem to introduce any performance hit */
   1415 		u32 access = GL_MAP_WRITE_BIT|GL_MAP_FLUSH_EXPLICIT_BIT|GL_MAP_UNSYNCHRONIZED_BIT;
   1416 		u8 *buffer = glMapNamedBufferRange(rf->ssbo, slot * rf->active_rf_size, (i32)rf->active_rf_size, access);
   1417 
   1418 		mem_copy(buffer, beamformer_shared_memory_scratch_arena(sm).beg, rf->active_rf_size);
   1419 		os_shared_memory_region_unlock(ctx->shared_memory, sm->locks, (i32)scratch_lock);
   1420 		post_sync_barrier(ctx->shared_memory, upload_lock, sm->locks);
   1421 
   1422 		glFlushMappedNamedBufferRange(rf->ssbo, 0, (i32)rf->active_rf_size);
   1423 		glUnmapNamedBuffer(rf->ssbo);
   1424 
   1425 		rf->upload_syncs[slot]  = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
   1426 		rf->compute_syncs[slot] = 0;
   1427 		memory_write_barrier();
   1428 
   1429 		os_wake_waiters(ctx->compute_worker_sync);
   1430 
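		/* read back the previous upload's timestamp, then queue a fresh one for this upload */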
   1431 		ComputeTimingInfo info = {.kind = ComputeTimingInfoKind_RF_Data};
   1432 		glGetQueryObjectui64v(rf->data_timestamp_query, GL_QUERY_RESULT, &info.timer_count);
   1433 		glQueryCounter(rf->data_timestamp_query, GL_TIMESTAMP);
   1434 		push_compute_timing_info(ctx->compute_timing_table, info);
   1435 	}
   1436 }
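
/* NOTE: illustrative only -- the upload path above and the compute path in
 * complete_queue() form a per-slot fence handshake: the uploader consumes
 * compute_syncs[slot] before writing and publishes upload_syncs[slot] after, while
 * the compute thread does the mirror image. the write barrier orders the fence
 * stores so the other thread never sees a fence before the data behind it. a
 * compressed sketch of the uploader half, with mapping and error handling elided: */
#if 0
function void
example_publish_upload(GLsync *upload_syncs, GLsync *compute_syncs, u32 slot)
{
	/* 1: wait for the consumer to retire this slot's previous contents */
	if (compute_syncs[slot]) {
		glClientWaitSync(compute_syncs[slot], 0, 1000000000);
		glDeleteSync(compute_syncs[slot]);
	}

	/* 2: ... map, fill, and flush the slot's buffer range here ... */

	/* 3: publish the new upload for the compute thread */
	upload_syncs[slot]  = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
	compute_syncs[slot] = 0;
	memory_write_barrier();
}
#endif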
   1437 
   1438 #include "ui.c"
   1439 
   1440 DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
   1441 {
   1442 	dt_for_frame = input->dt;
   1443 
   1444 	if (IsWindowResized()) {
   1445 		ctx->window_size.h = GetScreenHeight();
   1446 		ctx->window_size.w = GetScreenWidth();
   1447 	}
   1448 
   1449 	coalesce_timing_table(ctx->compute_timing_table, ctx->compute_shader_stats);
   1450 
   1451 	if (input->executable_reloaded) {
   1452 		ui_init(ctx, ctx->ui_backing_store);
   1453 		DEBUG_DECL(start_frame_capture = ctx->os.start_frame_capture);
   1454 		DEBUG_DECL(end_frame_capture   = ctx->os.end_frame_capture);
   1455 	}
   1456 
   1457 	BeamformerSharedMemory *sm = ctx->shared_memory.region;
   1458 	if (sm->locks[BeamformerSharedMemoryLockKind_UploadRF] != 0)
   1459 		os_wake_waiters(&ctx->os.upload_worker.sync_variable);
   1460 
   1461 	BeamformerFrame        *frame = ctx->latest_frame;
   1462 	BeamformerViewPlaneTag  tag   = frame ? frame->view_plane_tag : 0;
   1463 	draw_ui(ctx, input, frame, tag);
   1464 
   1465 	ctx->frame_view_render_context.updated = 0;
   1466 
   1467 	if (WindowShouldClose())
   1468 		ctx->should_exit = 1;
   1469 }
   1470 
   1471 /* NOTE(rnp): functions defined in these files shouldn't be visible to the rest of the program */
   1472 #if _DEBUG
   1473   #if OS_LINUX
   1474     #include "os_linux.c"
   1475   #elif OS_WINDOWS
   1476     #include "os_win32.c"
   1477   #endif
   1478 #endif