beamformer.c - ogl_beamforming - Ultrasound Beamforming Implemented with OpenGL

beamformer.c (33809B)
      1 /* See LICENSE for license details. */
      2 /* TODO(rnp):
      3  * [ ]: reinvestigate ring buffer raw_data_ssbo
      4  *      - to minimize latency the main thread should manage the subbuffer upload so that the
      5  *        compute thread can just keep computing. This way we can keep the compute thread busy
      6  *        with work while we image.
      7  *      - In particular we will potentially need multiple GPUComputeContexts so that we
      8  *        can overwrite one while the other is in use.
      9  *      - make use of glFenceSync to guard buffer uploads
     10  * [ ]: BeamformWorkQueue -> BeamformerWorkQueue
     11  * [ ]: bug: re-beamform on shader reload
     12  * [ ]: need to keep track of gpu memory in some way
     13  *      - want to be able to store more than 16 2D frames but limit 3D frames
     14  *      - maybe keep track of how much gpu memory is committed for beamformed images
     15  *        and use that to determine when to loop back over existing textures
     16  *      - to do this maybe use a circular linked list instead of a flat array
     17  *      - then have a way of querying how many frames are available for a specific point count
     18  * [ ]: bug: reinit cuda on hot-reload
     19  */
     20 
     21 #include "beamformer.h"
     22 #include "beamformer_work_queue.c"
     23 
     24 global f32 dt_for_frame;
     25 global u32 cycle_t;
     26 
     27 #ifndef _DEBUG
     28 #define start_renderdoc_capture(...)
     29 #define end_renderdoc_capture(...)
     30 #else
     31 global renderdoc_start_frame_capture_fn *start_frame_capture;
     32 global renderdoc_end_frame_capture_fn   *end_frame_capture;
     33 #define start_renderdoc_capture(gl) if (start_frame_capture) start_frame_capture(gl, 0)
     34 #define end_renderdoc_capture(gl)   if (end_frame_capture)   end_frame_capture(gl, 0)
     35 #endif
     36 
     37 typedef struct {
     38 	BeamformerComputeFrame *frames;
     39 	u32 capacity;
     40 	u32 offset;
     41 	u32 cursor;
     42 	u32 needed_frames;
     43 } ComputeFrameIterator;
     44 
     45 function uv3
     46 make_valid_test_dim(u32 in[3])
     47 {
     48 	uv3 result;
     49 	result.E[0] = MAX(in[0], 1);
     50 	result.E[1] = MAX(in[1], 1);
     51 	result.E[2] = MAX(in[2], 1);
     52 	return result;
     53 }
     54 
     55 function ComputeFrameIterator
     56 compute_frame_iterator(BeamformerCtx *ctx, u32 start_index, u32 needed_frames)
     57 {
     58 	start_index = start_index % ARRAY_COUNT(ctx->beamform_frames);
     59 
     60 	ComputeFrameIterator result;
     61 	result.frames        = ctx->beamform_frames;
     62 	result.offset        = start_index;
     63 	result.capacity      = ARRAY_COUNT(ctx->beamform_frames);
     64 	result.cursor        = 0;
     65 	result.needed_frames = needed_frames;
     66 	return result;
     67 }
     68 
     69 function BeamformerComputeFrame *
     70 frame_next(ComputeFrameIterator *bfi)
     71 {
     72 	BeamformerComputeFrame *result = 0;
     73 	if (bfi->cursor != bfi->needed_frames) {
     74 		u32 index = (bfi->offset + bfi->cursor++) % bfi->capacity;
     75 		result    = bfi->frames + index;
     76 	}
     77 	return result;
     78 }
     79 
     80 function void
     81 alloc_beamform_frame(GLParams *gp, BeamformerFrame *out, uv3 out_dim, s8 name, Arena arena)
     82 {
     83 	out->dim.x = MAX(1, out_dim.x);
     84 	out->dim.y = MAX(1, out_dim.y);
     85 	out->dim.z = MAX(1, out_dim.z);
     86 
     87 	if (gp) {
     88 		out->dim.x = MIN(out->dim.x, gp->max_3d_texture_dim);
     89 		out->dim.y = MIN(out->dim.y, gp->max_3d_texture_dim);
     90 		out->dim.z = MIN(out->dim.z, gp->max_3d_texture_dim);
     91 	}
     92 
     93 	/* NOTE: allocate storage for beamformed output data;
     94 	 * this is shared between compute and fragment shaders */
     95 	u32 max_dim = MAX(out->dim.x, MAX(out->dim.y, out->dim.z));
     96 	out->mips   = ctz_u32(round_up_power_of_2(max_dim)) + 1;
     97 
     98 	Stream label = arena_stream(arena);
     99 	stream_append_s8(&label, name);
    100 	stream_append_byte(&label, '[');
    101 	stream_append_hex_u64(&label, out->id);
    102 	stream_append_byte(&label, ']');
    103 
    104 	glDeleteTextures(1, &out->texture);
    105 	glCreateTextures(GL_TEXTURE_3D, 1, &out->texture);
    106 	glTextureStorage3D(out->texture, out->mips, GL_RG32F, out->dim.x, out->dim.y, out->dim.z);
    107 
    108 	glTextureParameteri(out->texture, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
    109 	glTextureParameteri(out->texture, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
    110 
    111 	LABEL_GL_OBJECT(GL_TEXTURE, out->texture, stream_to_s8(&label));
    112 }
    113 
    114 function void
    115 alloc_shader_storage(BeamformerCtx *ctx, u32 rf_raw_size, Arena a)
    116 {
    117 	ComputeShaderCtx     *cs = &ctx->csctx;
    118 	BeamformerParameters *bp = &((BeamformerSharedMemory *)ctx->shared_memory.region)->parameters;
    119 
    120 	cs->dec_data_dim = uv4_from_u32_array(bp->dec_data_dim);
    121 	cs->rf_raw_size  = rf_raw_size;
    122 
    123 	glDeleteBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
    124 	glCreateBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
    125 
    126 	i32 storage_flags = GL_DYNAMIC_STORAGE_BIT;
    127 	glDeleteBuffers(1, &cs->raw_data_ssbo);
    128 	glCreateBuffers(1, &cs->raw_data_ssbo);
    129 	glNamedBufferStorage(cs->raw_data_ssbo, 2 * rf_raw_size, 0, storage_flags);
    130 	LABEL_GL_OBJECT(GL_BUFFER, cs->raw_data_ssbo, s8("Raw_RF_SSBO"));
    131 
    132 	iz rf_decoded_size = 2 * sizeof(f32) * cs->dec_data_dim.x * cs->dec_data_dim.y * cs->dec_data_dim.z;
    133 	Stream label = arena_stream(a);
    134 	stream_append_s8(&label, s8("Decoded_RF_SSBO_"));
    135 	u32 s_widx = label.widx;
    136 	for (u32 i = 0; i < ARRAY_COUNT(cs->rf_data_ssbos); i++) {
    137 		glNamedBufferStorage(cs->rf_data_ssbos[i], rf_decoded_size, 0, 0);
    138 		stream_append_u64(&label, i);
    139 		LABEL_GL_OBJECT(GL_BUFFER, cs->rf_data_ssbos[i], stream_to_s8(&label));
    140 		stream_reset(&label, s_widx);
    141 	}
    142 
    143 	/* NOTE(rnp): these are stubs when CUDA isn't supported */
    144 	ctx->cuda_lib.register_buffers(cs->rf_data_ssbos, countof(cs->rf_data_ssbos), cs->raw_data_ssbo);
    145 	ctx->cuda_lib.init(bp->rf_raw_dim, bp->dec_data_dim);
    146 
    147 	u32  order    = cs->dec_data_dim.z;
    148 	i32 *hadamard = make_hadamard_transpose(&a, order);
    149 	if (hadamard) {
    150 		glDeleteTextures(1, &cs->hadamard_texture);
    151 		glCreateTextures(GL_TEXTURE_2D, 1, &cs->hadamard_texture);
    152 		glTextureStorage2D(cs->hadamard_texture, 1, GL_R8I, order, order);
    153 		glTextureSubImage2D(cs->hadamard_texture, 0, 0, 0,  order, order, GL_RED_INTEGER,
    154 		                    GL_INT, hadamard);
    155 		LABEL_GL_OBJECT(GL_TEXTURE, cs->hadamard_texture, s8("Hadamard_Matrix"));
    156 	}
    157 }
    158 
    159 function void
    160 push_compute_timing_info(ComputeTimingTable *t, ComputeTimingInfo info)
    161 {
    162 	u32 index = atomic_add_u32(&t->write_index, 1) % countof(t->buffer);
    163 	t->buffer[index] = info;
    164 }
    165 
    166 function b32
    167 fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, BeamformerViewPlaneTag plane)
    168 {
    169 	b32 result = 0;
    170 	if (work) {
    171 		result = 1;
    172 		u32 frame_id    = atomic_add_u32(&ctx->next_render_frame_index, 1);
    173 		u32 frame_index = frame_id % countof(ctx->beamform_frames);
    174 		work->kind      = BeamformerWorkKind_Compute;
    175 		work->lock      = BeamformerSharedMemoryLockKind_DispatchCompute;
    176 		work->frame     = ctx->beamform_frames + frame_index;
    177 		work->frame->ready_to_present = 0;
    178 		work->frame->view_plane_tag   = plane;
    179 		work->frame->frame.id         = frame_id;
    180 	}
    181 	return result;
    182 }
    183 
    184 function void
    185 do_sum_shader(ComputeShaderCtx *cs, u32 *in_textures, u32 in_texture_count, f32 in_scale,
    186               u32 out_texture, uv3 out_data_dim)
    187 {
    188 	/* NOTE: zero output before summing */
    189 	glClearTexImage(out_texture, 0, GL_RED, GL_FLOAT, 0);
    190 	glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT);
    191 
    192 	glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F);
    193 	glProgramUniform1f(cs->programs[BeamformerShaderKind_Sum], SUM_PRESCALE_UNIFORM_LOC, in_scale);
    194 	for (u32 i = 0; i < in_texture_count; i++) {
    195 		glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F);
    196 		glDispatchCompute(ORONE(out_data_dim.x / 32),
    197 		                  ORONE(out_data_dim.y),
    198 		                  ORONE(out_data_dim.z / 32));
    199 		glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
    200 	}
    201 }
    202 
    203 struct compute_cursor {
    204 	iv3 cursor;
    205 	iv3 dispatch;
    206 	iv3 target;
    207 	u32 points_per_dispatch;
    208 	u32 completed_points;
    209 	u32 total_points;
    210 };
    211 
    212 function struct compute_cursor
    213 start_compute_cursor(uv3 dim, u32 max_points)
    214 {
    215 	struct compute_cursor result = {0};
    216 	u32 invocations_per_dispatch = DAS_LOCAL_SIZE_X * DAS_LOCAL_SIZE_Y * DAS_LOCAL_SIZE_Z;
    217 
    218 	result.dispatch.y = MIN(max_points / invocations_per_dispatch, ceil_f32((f32)dim.y / DAS_LOCAL_SIZE_Y));
    219 
    220 	u32 remaining     = max_points / result.dispatch.y;
    221 	result.dispatch.x = MIN(remaining / invocations_per_dispatch, ceil_f32((f32)dim.x / DAS_LOCAL_SIZE_X));
    222 	result.dispatch.z = MIN(remaining / (invocations_per_dispatch * result.dispatch.x),
    223 	                        ceil_f32((f32)dim.z / DAS_LOCAL_SIZE_Z));
    224 
    225 	result.target.x = MAX(dim.x / result.dispatch.x / DAS_LOCAL_SIZE_X, 1);
    226 	result.target.y = MAX(dim.y / result.dispatch.y / DAS_LOCAL_SIZE_Y, 1);
    227 	result.target.z = MAX(dim.z / result.dispatch.z / DAS_LOCAL_SIZE_Z, 1);
    228 
    229 	result.points_per_dispatch = 1;
    230 	result.points_per_dispatch *= result.dispatch.x * DAS_LOCAL_SIZE_X;
    231 	result.points_per_dispatch *= result.dispatch.y * DAS_LOCAL_SIZE_Y;
    232 	result.points_per_dispatch *= result.dispatch.z * DAS_LOCAL_SIZE_Z;
    233 
    234 	result.total_points = dim.x * dim.y * dim.z;
    235 
    236 	return result;
    237 }
    238 
    239 function iv3
    240 step_compute_cursor(struct compute_cursor *cursor)
    241 {
    242 	cursor->cursor.x += 1;
    243 	if (cursor->cursor.x >= cursor->target.x) {
    244 		cursor->cursor.x  = 0;
    245 		cursor->cursor.y += 1;
    246 		if (cursor->cursor.y >= cursor->target.y) {
    247 			cursor->cursor.y  = 0;
    248 			cursor->cursor.z += 1;
    249 		}
    250 	}
    251 
    252 	cursor->completed_points += cursor->points_per_dispatch;
    253 
    254 	iv3 result = cursor->cursor;
    255 	result.x *= cursor->dispatch.x * DAS_LOCAL_SIZE_X;
    256 	result.y *= cursor->dispatch.y * DAS_LOCAL_SIZE_Y;
    257 	result.z *= cursor->dispatch.z * DAS_LOCAL_SIZE_Z;
    258 
    259 	return result;
    260 }
    261 
    262 function b32
    263 compute_cursor_finished(struct compute_cursor *cursor)
    264 {
    265 	b32 result = cursor->completed_points >= cursor->total_points;
    266 	return result;
    267 }
    268 
    269 function void
    270 do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformerComputeFrame *frame, BeamformerShaderKind shader)
    271 {
    272 	ComputeShaderCtx *csctx    = &ctx->csctx;
    273 	BeamformerSharedMemory *sm = ctx->shared_memory.region;
    274 
    275 	glUseProgram(csctx->programs[shader]);
    276 
    277 	u32 output_ssbo_idx = !csctx->last_output_ssbo_index;
    278 	u32 input_ssbo_idx  = csctx->last_output_ssbo_index;
    279 
    280 	switch (shader) {
    281 	case BeamformerShaderKind_Decode:
    282 	case BeamformerShaderKind_DecodeFloat:
    283 	case BeamformerShaderKind_DecodeFloatComplex:{
    284 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, csctx->rf_data_ssbos[output_ssbo_idx]);
    285 		glBindImageTexture(0, csctx->hadamard_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I);
    286 		glBindImageTexture(1, csctx->channel_mapping_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I);
    287 
    288 		/* NOTE(rnp): decode 2 samples per dispatch when data is i16 */
    289 		i32 local_size_x = DECODE_LOCAL_SIZE_X;
    290 		if (shader == BeamformerShaderKind_Decode)
    291 			local_size_x *= 2;
    292 
    293 		iz raw_size = csctx->rf_raw_size;
    294 		glProgramUniform1ui(csctx->programs[shader], DECODE_FIRST_PASS_UNIFORM_LOC, 1);
    295 		glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo, 0,        raw_size);
    296 		glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 2, csctx->raw_data_ssbo, raw_size, raw_size);
    297 		glDispatchCompute(ceil_f32((f32)csctx->dec_data_dim.x / local_size_x),
    298 		                  ceil_f32((f32)csctx->dec_data_dim.y / DECODE_LOCAL_SIZE_Y),
    299 		                  ceil_f32((f32)csctx->dec_data_dim.z / DECODE_LOCAL_SIZE_Z));
    300 
    301 		glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
    302 
    303 		glProgramUniform1ui(csctx->programs[shader], DECODE_FIRST_PASS_UNIFORM_LOC, 0);
    304 		glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo, raw_size, raw_size);
    305 		glDispatchCompute(ceil_f32((f32)csctx->dec_data_dim.x / local_size_x),
    306 		                  ceil_f32((f32)csctx->dec_data_dim.y / DECODE_LOCAL_SIZE_Y),
    307 		                  ceil_f32((f32)csctx->dec_data_dim.z / DECODE_LOCAL_SIZE_Z));
    308 
    309 		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
    310 	}break;
    311 	case BeamformerShaderKind_CudaDecode:{
    312 		ctx->cuda_lib.decode(0, output_ssbo_idx, 0);
    313 		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
    314 	}break;
    315 	case BeamformerShaderKind_CudaHilbert:
    316 		ctx->cuda_lib.hilbert(input_ssbo_idx, output_ssbo_idx);
    317 		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
    318 		break;
    319 	case BeamformerShaderKind_Demodulate:{
    320 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
    321 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
    322 		glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32),
    323 		                  ORONE(csctx->dec_data_dim.y / 32),
    324 		                  ORONE(csctx->dec_data_dim.z));
    325 		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
    326 	}break;
    327 	case BeamformerShaderKind_MinMax:{
    328 		u32 texture = frame->frame.texture;
    329 		for (u32 i = 1; i < frame->frame.mips; i++) {
    330 			glBindImageTexture(0, texture, i - 1, GL_TRUE, 0, GL_READ_ONLY,  GL_RG32F);
    331 			glBindImageTexture(1, texture, i - 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
    332 			glProgramUniform1i(csctx->programs[shader], MIN_MAX_MIPS_LEVEL_UNIFORM_LOC, i);
    333 
    334 			u32 width  = frame->frame.dim.x >> i;
    335 			u32 height = frame->frame.dim.y >> i;
    336 			u32 depth  = frame->frame.dim.z >> i;
    337 			glDispatchCompute(ORONE(width / 32), ORONE(height), ORONE(depth / 32));
    338 			glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
    339 		}
    340 	}break;
    341 	case BeamformerShaderKind_DASCompute:{
    342 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
    343 		glBindImageTexture(0, frame->frame.texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
    344 		glBindImageTexture(1, csctx->sparse_elements_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I);
    345 		glBindImageTexture(2, csctx->focal_vectors_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RG32F);
    346 
    347 		glProgramUniform1ui(csctx->programs[shader], DAS_CYCLE_T_UNIFORM_LOC, cycle_t++);
    348 
    349 		#if 1
    350 		/* TODO(rnp): compute max_points_per_dispatch based on something like a
    351 		 * transmit_count * channel_count product */
    352 		u32 max_points_per_dispatch = KB(64);
    353 		struct compute_cursor cursor = start_compute_cursor(frame->frame.dim, max_points_per_dispatch);
    354 		f32 percent_per_step = (f32)cursor.points_per_dispatch / (f32)cursor.total_points;
    355 		csctx->processing_progress = -percent_per_step;
    356 		for (iv3 offset = {0};
    357 		     !compute_cursor_finished(&cursor);
    358 		     offset = step_compute_cursor(&cursor))
    359 		{
    360 			csctx->processing_progress += percent_per_step;
    361 			/* IMPORTANT(rnp): prevents OS from coalescing and killing our shader */
    362 			glFinish();
    363 			glProgramUniform3iv(csctx->programs[shader], DAS_VOXEL_OFFSET_UNIFORM_LOC,
    364 			                    1, offset.E);
    365 			glDispatchCompute(cursor.dispatch.x, cursor.dispatch.y, cursor.dispatch.z);
    366 		}
    367 		#else
    368 		/* NOTE(rnp): use this for testing tiling code. The performance of the above path
    369 		 * should be the same as this path if everything is working correctly */
    370 		iv3 compute_dim_offset = {0};
    371 		glUniform3iv(csctx->voxel_offset_id, 1, compute_dim_offset.E);
    372 		glDispatchCompute(ORONE(frame->frame.dim.x / 32),
    373 		                  ORONE(frame->frame.dim.y),
    374 		                  ORONE(frame->frame.dim.z / 32));
    375 		#endif
    376 		glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT|GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
    377 	}break;
    378 	case BeamformerShaderKind_Sum:{
    379 		u32 aframe_index = ctx->averaged_frame_index % ARRAY_COUNT(ctx->averaged_frames);
    380 		BeamformerComputeFrame *aframe = ctx->averaged_frames + aframe_index;
    381 		aframe->ready_to_present       = 0;
    382 		aframe->frame.id               = ctx->averaged_frame_index;
    383 		/* TODO(rnp): hack we need a better way of specifying which frames to sum;
    384 		 * this is fine for rolling averaging but what if we want to do something else */
    385 		assert(frame >= ctx->beamform_frames);
    386 		assert(frame < ctx->beamform_frames + countof(ctx->beamform_frames));
    387 		u32 base_index   = (u32)(frame - ctx->beamform_frames);
    388 		u32 to_average   = sm->parameters.output_points[3];
    389 		u32 frame_count  = 0;
    390 		u32 *in_textures = push_array(&arena, u32, MAX_BEAMFORMED_SAVED_FRAMES);
    391 		ComputeFrameIterator cfi = compute_frame_iterator(ctx, 1 + base_index - to_average,
    392 		                                                  to_average);
    393 		for (BeamformerComputeFrame *it = frame_next(&cfi); it; it = frame_next(&cfi))
    394 			in_textures[frame_count++] = it->frame.texture;
    395 
    396 		assert(to_average == frame_count);
    397 
    398 		do_sum_shader(csctx, in_textures, frame_count, 1 / (f32)frame_count,
    399 		              aframe->frame.texture, aframe->frame.dim);
    400 		aframe->frame.min_coordinate  = frame->frame.min_coordinate;
    401 		aframe->frame.max_coordinate  = frame->frame.max_coordinate;
    402 		aframe->frame.compound_count  = frame->frame.compound_count;
    403 		aframe->frame.das_shader_kind = frame->frame.das_shader_kind;
    404 	}break;
    405 	InvalidDefaultCase;
    406 	}
    407 }
    408 
    409 function s8
    410 shader_text_with_header(ShaderReloadContext *ctx, OS *os, Arena *arena)
    411 {
    412 	Stream sb = arena_stream(*arena);
    413 	stream_append_s8s(&sb, s8("#version 460 core\n\n"), ctx->header);
    414 
    415 	switch (ctx->kind) {
    416 	case BeamformerShaderKind_DASCompute:{
    417 		#define X(type, id, pretty, fixed_tx) "#define DAS_ID_" #type " " #id "\n"
    418 		stream_append_s8(&sb, s8(""
    419 		"layout(local_size_x = " str(DAS_LOCAL_SIZE_X) ", "
    420 		       "local_size_y = " str(DAS_LOCAL_SIZE_Y) ", "
    421 		       "local_size_z = " str(DAS_LOCAL_SIZE_Z) ") in;\n\n"
    422 		"layout(location = " str(DAS_VOXEL_OFFSET_UNIFORM_LOC) ") uniform ivec3 u_voxel_offset;\n"
    423 		"layout(location = " str(DAS_CYCLE_T_UNIFORM_LOC)      ") uniform uint  u_cycle_t;\n\n"
    424 		DAS_TYPES
    425 		));
    426 		#undef X
    427 	}break;
    428 	case BeamformerShaderKind_DecodeFloat:
    429 	case BeamformerShaderKind_DecodeFloatComplex:{
    430 		if (ctx->kind == BeamformerShaderKind_DecodeFloat)
    431 			stream_append_s8(&sb, s8("#define INPUT_DATA_TYPE_FLOAT\n\n"));
    432 		else
    433 			stream_append_s8(&sb, s8("#define INPUT_DATA_TYPE_FLOAT_COMPLEX\n\n"));
    434 	} /* FALLTHROUGH */
    435 	case BeamformerShaderKind_Decode:{
    436 		#define X(type, id, pretty) "#define DECODE_MODE_" #type " " #id "\n"
    437 		stream_append_s8(&sb, s8(""
    438 		"layout(local_size_x = " str(DECODE_LOCAL_SIZE_X) ", "
    439 		       "local_size_y = " str(DECODE_LOCAL_SIZE_Y) ", "
    440 		       "local_size_z = " str(DECODE_LOCAL_SIZE_Z) ") in;\n\n"
    441 		"layout(location = " str(DECODE_FIRST_PASS_UNIFORM_LOC) ") uniform bool u_first_pass;\n\n"
    442 		DECODE_TYPES
    443 		));
    444 		#undef X
    445 	}break;
    446 	case BeamformerShaderKind_MinMax:{
    447 		stream_append_s8(&sb, s8("layout(location = " str(MIN_MAX_MIPS_LEVEL_UNIFORM_LOC)
    448 		                         ") uniform int u_mip_map;\n\n"));
    449 	}break;
    450 	case BeamformerShaderKind_Sum:{
    451 		stream_append_s8(&sb, s8("layout(location = " str(SUM_PRESCALE_UNIFORM_LOC)
    452 		                         ") uniform float u_sum_prescale = 1.0;\n\n"));
    453 	}break;
    454 	default:{}break;
    455 	}
    456 	stream_append_s8(&sb, s8("\n#line 1\n"));
    457 
    458 	s8 result = arena_stream_commit(arena, &sb);
    459 	if (ctx->path.len) {
    460 		s8 file = os->read_whole_file(arena, (c8 *)ctx->path.data);
    461 		assert(file.data == result.data + result.len);
    462 		result.len += file.len;
    463 	}
    464 
    465 	return result;
    466 }
    467 
    468 DEBUG_EXPORT BEAMFORMER_RELOAD_SHADER_FN(beamformer_reload_shader)
    469 {
    470 	i32 shader_count = 1;
    471 	ShaderReloadContext *link = src->link;
    472 	while (link != src) { shader_count++; link = link->link; }
    473 
    474 	s8  *shader_texts = push_array(&arena, s8,  shader_count);
    475 	u32 *shader_types = push_array(&arena, u32, shader_count);
    476 
    477 	i32 index = 0;
    478 	do {
    479 		shader_texts[index] = shader_text_with_header(link, &ctx->os, &arena);
    480 		shader_types[index] = link->gl_type;
    481 		index++;
    482 		link = link->link;
    483 	} while (link != src);
    484 
    485 	u32 new_program = load_shader(&ctx->os, arena, shader_texts, shader_types, shader_count, shader_name);
    486 	if (new_program) {
    487 		glDeleteProgram(*src->shader);
    488 		*src->shader = new_program;
    489 		if (src->kind == BeamformerShaderKind_Render2D) ctx->frame_view_render_context.updated = 1;
    490 	}
    491 	return new_program != 0;
    492 }
    493 
    494 function b32
    495 reload_compute_shader(BeamformerCtx *ctx, ShaderReloadContext *src, s8 name_extra, Arena arena)
    496 {
    497 	Stream sb  = arena_stream(arena);
    498 	stream_append_s8s(&sb, src->name, name_extra);
    499 	s8  name   = arena_stream_commit(&arena, &sb);
    500 	b32 result = beamformer_reload_shader(ctx, src, arena, name);
    501 	if (result) {
    502 		glUseProgram(*src->shader);
    503 		glBindBufferBase(GL_UNIFORM_BUFFER, 0, ctx->csctx.shared_ubo);
    504 	}
    505 	return result;
    506 }
    507 
    508 function void
    509 complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_context)
    510 {
    511 	ComputeShaderCtx       *cs = &ctx->csctx;
    512 	BeamformerSharedMemory *sm = ctx->shared_memory.region;
    513 	BeamformerParameters   *bp = &sm->parameters;
    514 
    515 	BeamformWork *work = beamform_work_queue_pop(q);
    516 	while (work) {
    517 		b32 can_commit = 1;
    518 		switch (work->kind) {
    519 		case BeamformerWorkKind_ReloadShader:{
    520 			ShaderReloadContext *src = work->shader_reload_context;
    521 			b32 success = reload_compute_shader(ctx, src, s8(""), arena);
    522 			if (src->kind == BeamformerShaderKind_Decode) {
    523 				/* TODO(rnp): think of a better way of doing this */
    524 				src->kind   = BeamformerShaderKind_DecodeFloatComplex;
    525 				src->shader = cs->programs + BeamformerShaderKind_DecodeFloatComplex;
    526 				success &= reload_compute_shader(ctx, src, s8(" (F32C)"), arena);
    527 				src->kind   = BeamformerShaderKind_DecodeFloat;
    528 				src->shader = cs->programs + BeamformerShaderKind_DecodeFloat;
    529 				success &= reload_compute_shader(ctx, src, s8(" (F32)"),  arena);
    530 				src->kind   = BeamformerShaderKind_Decode;
    531 				src->shader = cs->programs + BeamformerShaderKind_Decode;
    532 			}
    533 
    534 			if (success && ctx->csctx.raw_data_ssbo) {
    535 				/* TODO(rnp): this check seems off */
    536 				can_commit = 0;
    537 				fill_frame_compute_work(ctx, work, ctx->latest_frame->view_plane_tag);
    538 			}
    539 		}break;
    540 		case BeamformerWorkKind_ExportBuffer:{
    541 			/* TODO(rnp): better way of handling DispatchCompute barrier */
    542 			post_sync_barrier(&ctx->shared_memory, BeamformerSharedMemoryLockKind_DispatchCompute,
    543 			                  sm->locks, ctx->os.shared_memory_region_unlock);
    544 			ctx->os.shared_memory_region_lock(&ctx->shared_memory, sm->locks, (i32)work->lock, -1);
    545 			BeamformerExportContext *ec = &work->export_context;
    546 			switch (ec->kind) {
    547 			case BeamformerExportKind_BeamformedData:{
    548 				BeamformerComputeFrame *frame = ctx->latest_frame;
    549 				assert(frame->ready_to_present);
    550 				u32 texture  = frame->frame.texture;
    551 				uv3 dim      = frame->frame.dim;
    552 				iz  out_size = dim.x * dim.y * dim.z * 2 * sizeof(f32);
    553 				if (out_size <= ec->size) {
    554 					glGetTextureImage(texture, 0, GL_RG, GL_FLOAT, out_size,
    555 					                  (u8 *)sm + BEAMFORMER_SCRATCH_OFF);
    556 				}
    557 			}break;
    558 			case BeamformerExportKind_Stats:{
    559 				ComputeTimingTable *table = ctx->compute_timing_table;
    560 				/* NOTE(rnp): do a little spin to let this finish updating */
    561 				while (table->write_index != atomic_load_u32(&table->read_index));
    562 				ComputeShaderStats *stats = ctx->compute_shader_stats;
    563 				if (sizeof(stats->table) <= ec->size)
    564 					mem_copy((u8 *)sm + BEAMFORMER_SCRATCH_OFF, &stats->table, sizeof(stats->table));
    565 			}break;
    566 			InvalidDefaultCase;
    567 			}
    568 			ctx->os.shared_memory_region_unlock(&ctx->shared_memory, sm->locks, (i32)work->lock);
    569 			post_sync_barrier(&ctx->shared_memory, BeamformerSharedMemoryLockKind_ExportSync, sm->locks,
    570 			                  ctx->os.shared_memory_region_unlock);
    571 		}break;
    572 		case BeamformerWorkKind_UploadBuffer:{
    573 			ctx->os.shared_memory_region_lock(&ctx->shared_memory, sm->locks, (i32)work->lock, -1);
    574 			BeamformerUploadContext *uc = &work->upload_context;
    575 			u32 tex_type, tex_format, tex_element_count, tex_1d = 0, buffer = 0;
    576 			switch (uc->kind) {
    577 			case BU_KIND_CHANNEL_MAPPING: {
    578 				tex_1d            = cs->channel_mapping_texture;
    579 				tex_type          = GL_SHORT;
    580 				tex_format        = GL_RED_INTEGER;
    581 				tex_element_count = ARRAY_COUNT(sm->channel_mapping);
    582 				ctx->cuda_lib.set_channel_mapping(sm->channel_mapping);
    583 			} break;
    584 			case BU_KIND_FOCAL_VECTORS: {
    585 				tex_1d            = cs->focal_vectors_texture;
    586 				tex_type          = GL_FLOAT;
    587 				tex_format        = GL_RG;
    588 				tex_element_count = ARRAY_COUNT(sm->focal_vectors);
    589 			} break;
    590 			case BU_KIND_SPARSE_ELEMENTS: {
    591 				tex_1d            = cs->sparse_elements_texture;
    592 				tex_type          = GL_SHORT;
    593 				tex_format        = GL_RED_INTEGER;
    594 				tex_element_count = ARRAY_COUNT(sm->sparse_elements);
    595 			} break;
    596 			case BU_KIND_PARAMETERS: {
    597 				ctx->ui_read_params = ctx->beamform_work_queue != q;
    598 				buffer = cs->shared_ubo;
    599 			} break;
    600 			case BU_KIND_RF_DATA: {
    601 				if (cs->rf_raw_size != uc->size ||
    602 				    !uv4_equal(cs->dec_data_dim, uv4_from_u32_array(bp->dec_data_dim)))
    603 				{
    604 					alloc_shader_storage(ctx, uc->size, arena);
    605 				}
    606 				buffer = cs->raw_data_ssbo;
    607 
    608 				ComputeTimingInfo info = {0};
    609 				info.kind = ComputeTimingInfoKind_RF_Data;
    610 				/* TODO(rnp): this could stall. what should we do about it? */
    611 				glGetQueryObjectui64v(cs->rf_data_timestamp_query, GL_QUERY_RESULT, &info.timer_count);
    612 				glQueryCounter(cs->rf_data_timestamp_query, GL_TIMESTAMP);
    613 				push_compute_timing_info(ctx->compute_timing_table, info);
    614 			}break;
    615 			InvalidDefaultCase;
    616 			}
    617 
    618 			if (tex_1d) {
    619 				glTextureSubImage1D(tex_1d, 0, 0, tex_element_count, tex_format,
    620 				                    tex_type, (u8 *)sm + uc->shared_memory_offset);
    621 			}
    622 
    623 			if (buffer) {
    624 				glNamedBufferSubData(buffer, 0, uc->size,
    625 				                     (u8 *)sm + uc->shared_memory_offset);
    626 			}
    627 
    628 			atomic_and_u32(&sm->dirty_regions, ~(sm->dirty_regions & 1 << (work->lock - 1)));
    629 			ctx->os.shared_memory_region_unlock(&ctx->shared_memory, sm->locks, (i32)work->lock);
    630 		}break;
    631 		case BeamformerWorkKind_ComputeIndirect:{
    632 			fill_frame_compute_work(ctx, work, work->compute_indirect_plane);
    633 			DEBUG_DECL(work->kind = BeamformerWorkKind_ComputeIndirect;)
    634 		} /* FALLTHROUGH */
    635 		case BeamformerWorkKind_Compute:{
    636 			post_sync_barrier(&ctx->shared_memory, work->lock, sm->locks,
    637 			                  ctx->os.shared_memory_region_unlock);
    638 
    639 			push_compute_timing_info(ctx->compute_timing_table,
    640 			                         (ComputeTimingInfo){.kind = ComputeTimingInfoKind_ComputeFrameBegin});
    641 
    642 			i32 mask = 1 << (BeamformerSharedMemoryLockKind_Parameters - 1);
    643 			if (sm->dirty_regions & mask) {
    644 				glNamedBufferSubData(cs->shared_ubo, 0, sizeof(sm->parameters), &sm->parameters);
    645 				atomic_and_u32(&sm->dirty_regions, ~mask);
    646 			}
    647 
    648 			atomic_store_u32(&cs->processing_compute, 1);
    649 			start_renderdoc_capture(gl_context);
    650 
    651 			BeamformerComputeFrame *frame = work->frame;
    652 			uv3 try_dim = make_valid_test_dim(bp->output_points);
    653 			if (!uv3_equal(try_dim, frame->frame.dim))
    654 				alloc_beamform_frame(&ctx->gl, &frame->frame, try_dim, s8("Beamformed_Data"), arena);
    655 
    656 			if (bp->output_points[3] > 1) {
    657 				if (!uv3_equal(try_dim, ctx->averaged_frames[0].frame.dim)) {
    658 					alloc_beamform_frame(&ctx->gl, &ctx->averaged_frames[0].frame,
    659 					                     try_dim, s8("Averaged Frame"), arena);
    660 					alloc_beamform_frame(&ctx->gl, &ctx->averaged_frames[1].frame,
    661 					                     try_dim, s8("Averaged Frame"), arena);
    662 				}
    663 			}
    664 
    665 			frame->frame.min_coordinate  = v4_from_f32_array(bp->output_min_coordinate);
    666 			frame->frame.max_coordinate  = v4_from_f32_array(bp->output_max_coordinate);
    667 			frame->frame.das_shader_kind = bp->das_shader_id;
    668 			frame->frame.compound_count  = bp->dec_data_dim[2];
    669 
    670 			b32 did_sum_shader = 0;
    671 			u32 stage_count    = sm->compute_stages_count;
    672 			BeamformerShaderKind *stages = sm->compute_stages;
    673 			for (u32 i = 0; i < stage_count; i++) {
    674 				did_sum_shader |= stages[i] == BeamformerShaderKind_Sum;
    675 				glBeginQuery(GL_TIME_ELAPSED, cs->shader_timer_ids[i]);
    676 				do_compute_shader(ctx, arena, frame, stages[i]);
    677 				glEndQuery(GL_TIME_ELAPSED);
    678 			}
    679 
    680 			/* NOTE(rnp): the first of these blocks until work completes */
    681 			for (u32 i = 0; i < stage_count; i++) {
    682 				ComputeTimingInfo info = {0};
    683 				info.kind   = ComputeTimingInfoKind_Shader;
    684 				info.shader = stages[i];
    685 				glGetQueryObjectui64v(cs->shader_timer_ids[i], GL_QUERY_RESULT, &info.timer_count);
    686 				push_compute_timing_info(ctx->compute_timing_table, info);
    687 			}
    688 			cs->processing_progress = 1;
    689 
    690 			frame->ready_to_present = 1;
    691 			if (did_sum_shader) {
    692 				u32 aframe_index = (ctx->averaged_frame_index % countof(ctx->averaged_frames));
    693 				ctx->averaged_frames[aframe_index].view_plane_tag  = frame->view_plane_tag;
    694 				ctx->averaged_frames[aframe_index].ready_to_present = 1;
    695 				atomic_add_u32(&ctx->averaged_frame_index, 1);
    696 				atomic_store_u64((u64 *)&ctx->latest_frame, (u64)(ctx->averaged_frames + aframe_index));
    697 			} else {
    698 				atomic_store_u64((u64 *)&ctx->latest_frame, (u64)frame);
    699 			}
    700 			cs->processing_compute  = 0;
    701 
    702 			push_compute_timing_info(ctx->compute_timing_table,
    703 			                         (ComputeTimingInfo){.kind = ComputeTimingInfoKind_ComputeFrameEnd});
    704 
    705 			end_renderdoc_capture(gl_context);
    706 		}break;
    707 		InvalidDefaultCase;
    708 		}
    709 
    710 		if (can_commit) {
    711 			beamform_work_queue_pop_commit(q);
    712 			work = beamform_work_queue_pop(q);
    713 		}
    714 	}
    715 }
    716 
    717 function void
    718 coalesce_timing_table(ComputeTimingTable *t, ComputeShaderStats *stats)
    719 {
    720 	/* TODO(rnp): we do not currently do anything to handle the potential for a half written
    721 	 * info item. this could result in garbage entries but they shouldn't really matter */
    722 
    723 	u32 target = atomic_load_u32(&t->write_index);
    724 	u32 stats_index = (stats->latest_frame_index + 1) % countof(stats->table.times);
    725 
    726 	static_assert(BeamformerShaderKind_Count + 1 <= 32, "timing coalescence bitfield test");
    727 	u32 seen_info_test = 0;
    728 
    729 	while (t->read_index != target) {
    730 		ComputeTimingInfo info = t->buffer[t->read_index % countof(t->buffer)];
    731 		switch (info.kind) {
    732 		case ComputeTimingInfoKind_ComputeFrameBegin:{
    733 			assert(t->compute_frame_active == 0);
    734 			t->compute_frame_active = 1;
    735 			/* NOTE(rnp): allow multiple instances of same shader to accumulate */
    736 			mem_clear(stats->table.times[stats_index], 0, sizeof(stats->table.times[stats_index]));
    737 		}break;
    738 		case ComputeTimingInfoKind_ComputeFrameEnd:{
    739 			assert(t->compute_frame_active == 1);
    740 			t->compute_frame_active = 0;
    741 			stats->latest_frame_index = stats_index;
    742 			stats_index = (stats_index + 1) % countof(stats->table.times);
    743 		}break;
    744 		case ComputeTimingInfoKind_Shader:{
    745 			stats->table.times[stats_index][info.shader] += (f32)info.timer_count / 1.0e9;
    746 			seen_info_test |= (1 << info.shader);
    747 		}break;
    748 		case ComputeTimingInfoKind_RF_Data:{
    749 			stats->latest_rf_index = (stats->latest_rf_index + 1) % countof(stats->table.rf_time_deltas);
    750 			f32 delta = (f32)(info.timer_count - stats->last_rf_timer_count) / 1.0e9;
    751 			stats->table.rf_time_deltas[stats->latest_rf_index] = delta;
    752 			stats->last_rf_timer_count = info.timer_count;
    753 			seen_info_test |= (1 << BeamformerShaderKind_Count);
    754 		}break;
    755 		}
    756 		/* NOTE(rnp): do this at the end so that stats table is always in a consistent state */
    757 		atomic_add_u32(&t->read_index, 1);
    758 	}
    759 
    760 	if (seen_info_test) {
    761 		for EachEnumValue(BeamformerShaderKind, shader) {
    762 			if (seen_info_test & (1 << shader)) {
    763 				f32 sum = 0;
    764 				for EachElement(stats->table.times, i)
    765 					sum += stats->table.times[i][shader];
    766 				stats->average_times[shader] = sum / countof(stats->table.times);
    767 			}
    768 		}
    769 
    770 		if (seen_info_test & (1 << BeamformerShaderKind_Count)) {
    771 			f32 sum = 0;
    772 			for EachElement(stats->table.rf_time_deltas, i)
    773 				sum += stats->table.rf_time_deltas[i];
    774 			stats->rf_time_delta_average = sum / countof(stats->table.rf_time_deltas);
    775 		}
    776 	}
    777 }
    778 
    779 DEBUG_EXPORT BEAMFORMER_COMPUTE_SETUP_FN(beamformer_compute_setup)
    780 {
    781 	BeamformerCtx          *ctx = (BeamformerCtx *)user_context;
    782 	BeamformerSharedMemory *sm  = ctx->shared_memory.region;
    783 	ComputeShaderCtx       *cs  = &ctx->csctx;
    784 
    785 	glCreateBuffers(1, &cs->shared_ubo);
    786 	glNamedBufferStorage(cs->shared_ubo, sizeof(sm->parameters), 0, GL_DYNAMIC_STORAGE_BIT);
    787 
    788 	glCreateTextures(GL_TEXTURE_1D, 1, &cs->channel_mapping_texture);
    789 	glCreateTextures(GL_TEXTURE_1D, 1, &cs->sparse_elements_texture);
    790 	glCreateTextures(GL_TEXTURE_1D, 1, &cs->focal_vectors_texture);
    791 	glTextureStorage1D(cs->channel_mapping_texture, 1, GL_R16I,  ARRAY_COUNT(sm->channel_mapping));
    792 	glTextureStorage1D(cs->sparse_elements_texture, 1, GL_R16I,  ARRAY_COUNT(sm->sparse_elements));
    793 	glTextureStorage1D(cs->focal_vectors_texture,   1, GL_RG32F, ARRAY_COUNT(sm->focal_vectors));
    794 
    795 	LABEL_GL_OBJECT(GL_TEXTURE, cs->channel_mapping_texture, s8("Channel_Mapping"));
    796 	LABEL_GL_OBJECT(GL_TEXTURE, cs->focal_vectors_texture,   s8("Focal_Vectors"));
    797 	LABEL_GL_OBJECT(GL_TEXTURE, cs->sparse_elements_texture, s8("Sparse_Elements"));
    798 	LABEL_GL_OBJECT(GL_BUFFER,  cs->shared_ubo,              s8("Beamformer_Parameters"));
    799 
    800 	glCreateQueries(GL_TIME_ELAPSED, countof(cs->shader_timer_ids), cs->shader_timer_ids);
    801 	glCreateQueries(GL_TIMESTAMP, 1, &cs->rf_data_timestamp_query);
    802 
    803 	/* NOTE(rnp): start this here so we don't have to worry about it being started or not */
    804 	glQueryCounter(cs->rf_data_timestamp_query, GL_TIMESTAMP);
    805 }
    806 
    807 DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute)
    808 {
    809 	BeamformerCtx *ctx         = (BeamformerCtx *)user_context;
    810 	BeamformerSharedMemory *sm = ctx->shared_memory.region;
    811 	complete_queue(ctx, &sm->external_work_queue, arena, gl_context);
    812 	complete_queue(ctx, ctx->beamform_work_queue, arena, gl_context);
    813 }
    814 
    815 #include "ui.c"
    816 
    817 DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
    818 {
    819 	dt_for_frame = input->dt;
    820 
    821 	if (IsWindowResized()) {
    822 		ctx->window_size.h = GetScreenHeight();
    823 		ctx->window_size.w = GetScreenWidth();
    824 	}
    825 
    826 	coalesce_timing_table(ctx->compute_timing_table, ctx->compute_shader_stats);
    827 
    828 	if (input->executable_reloaded) {
    829 		ui_init(ctx, ctx->ui_backing_store);
    830 		DEBUG_DECL(start_frame_capture = ctx->os.start_frame_capture);
    831 		DEBUG_DECL(end_frame_capture   = ctx->os.end_frame_capture);
    832 	}
    833 
    834 	BeamformerSharedMemory *sm = ctx->shared_memory.region;
    835 	if (sm->locks[BeamformerSharedMemoryLockKind_DispatchCompute] && ctx->os.compute_worker.asleep) {
    836 		if (sm->start_compute_from_main) {
    837 			BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue);
    838 			BeamformerViewPlaneTag tag = ctx->latest_frame->view_plane_tag;
    839 			if (fill_frame_compute_work(ctx, work, tag))
    840 				beamform_work_queue_push_commit(ctx->beamform_work_queue);
    841 			atomic_store_u32(&sm->start_compute_from_main, 0);
    842 		}
    843 		ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
    844 	}
    845 
    846 	draw_ui(ctx, input, ctx->latest_frame->ready_to_present ? &ctx->latest_frame->frame : 0,
    847 	        ctx->latest_frame->view_plane_tag);
    848 
    849 	ctx->frame_view_render_context.updated = 0;
    850 
    851 	if (WindowShouldClose())
    852 		ctx->should_exit = 1;
    853 }