ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git

beamformer.c (48071B)


      1 /* See LICENSE for license details. */
      2 /* TODO(rnp):
      3  * [ ]: make decode output real values for real inputs and complex values for complex inputs
      4  *      - this means that das should have a RF version and an IQ version
      5  *      - this will also flip the current hack to support demodulate after decode to
      6  *        being a hack to support CudaHilbert after decode
      7  * [ ]: filter sampling frequency should be a filter creation parameter
      8  * [ ]: reinvestigate ring buffer raw_data_ssbo
      9  *      - to minimize latency the main thread should manage the subbuffer upload so that the
     10  *        compute thread can just keep computing. This way we can keep the compute thread busy
     11  *        with work while we image.
     12  *      - In particular we will potentially need multiple GPUComputeContexts so that we
     13  *        can overwrite one while the other is in use.
     14  *      - make use of glFenceSync to guard buffer uploads
     15  * [ ]: BeamformWorkQueue -> BeamformerWorkQueue
     16  * [ ]: bug: re-beamform on shader reload
     17  * [ ]: need to keep track of gpu memory in some way
     18  *      - want to be able to store more than 16 2D frames but limit 3D frames
     19  *      - maybe keep track of how much gpu memory is committed for beamformed images
     20  *        and use that to determine when to loop back over existing textures
     21  *      - to do this maybe use a circular linked list instead of a flat array
     22  *      - then have a way of querying how many frames are available for a specific point count
     23  * [ ]: bug: reinit cuda on hot-reload
     24  */
     25 
     26 #include "beamformer.h"
     27 #include "beamformer_work_queue.c"
     28 
     29 global f32 dt_for_frame;
     30 global u32 cycle_t;
     31 
     32 #ifndef _DEBUG
     33 #define start_renderdoc_capture(...)
     34 #define end_renderdoc_capture(...)
     35 #else
     36 global renderdoc_start_frame_capture_fn *start_frame_capture;
     37 global renderdoc_end_frame_capture_fn   *end_frame_capture;
     38 #define start_renderdoc_capture(gl) if (start_frame_capture) start_frame_capture(gl, 0)
     39 #define end_renderdoc_capture(gl)   if (end_frame_capture)   end_frame_capture(gl, 0)
     40 #endif
     41 
     42 typedef struct {
     43 	BeamformerFrame *frames;
     44 	u32 capacity;
     45 	u32 offset;
     46 	u32 cursor;
     47 	u32 needed_frames;
     48 } ComputeFrameIterator;
     49 
     50 function void
     51 beamformer_filter_update(BeamformerFilter *f, BeamformerCreateFilterContext *cfc,
     52                          f32 sampling_frequency, Arena arena)
     53 {
     54 	glDeleteTextures(1, &f->texture);
     55 	glCreateTextures(GL_TEXTURE_1D, 1, &f->texture);
     56 	glTextureStorage1D(f->texture, 1, GL_R32F, cfc->length);
     57 
     58 	f32 *filter = 0;
     59 	switch (cfc->kind) {
     60 	case BeamformerFilterKind_Kaiser:{
     61 		filter = kaiser_low_pass_filter(&arena, cfc->cutoff_frequency, sampling_frequency,
     62 		                                cfc->beta, cfc->length);
     63 	}break;
     64 	InvalidDefaultCase;
     65 	}
     66 
     67 	f->kind   = cfc->kind;
     68 	f->length = cfc->length;
     69 	f->sampling_frequency = sampling_frequency;
     70 	glTextureSubImage1D(f->texture, 0, 0, f->length, GL_RED, GL_FLOAT, filter);
     71 }
     72 
     73 function f32
     74 beamformer_filter_time_offset(BeamformerFilter *f)
     75 {
     76 	f32 result = 0;
     77 	switch (f->kind) {
     78 	case BeamformerFilterKind_Kaiser:{
     79 		result = (f32)f->length / 2.0f / f->sampling_frequency;
     80 	}break;
     81 	InvalidDefaultCase;
     82 	}
     83 	return result;
     84 }
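The offset returned here is the filter's delay in seconds, length / 2 / sampling_frequency. A minimal worked case (values chosen purely for illustration; BeamformerFilter fields not shown are assumed to stay zeroed):

	BeamformerFilter f = {.kind = BeamformerFilterKind_Kaiser, .length = 64, .sampling_frequency = 40e6f};
	f32 delay = beamformer_filter_time_offset(&f); /* 64 / 2 / 40e6 = 0.8 us */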
     85 
     86 function iv3
     87 make_valid_test_dim(i32 in[3])
     88 {
     89 	iv3 result;
     90 	result.E[0] = MAX(in[0], 1);
     91 	result.E[1] = MAX(in[1], 1);
     92 	result.E[2] = MAX(in[2], 1);
     93 	return result;
     94 }
     95 
     96 function ComputeFrameIterator
     97 compute_frame_iterator(BeamformerCtx *ctx, u32 start_index, u32 needed_frames)
     98 {
     99 	start_index = start_index % ARRAY_COUNT(ctx->beamform_frames);
    100 
    101 	ComputeFrameIterator result;
    102 	result.frames        = ctx->beamform_frames;
    103 	result.offset        = start_index;
    104 	result.capacity      = ARRAY_COUNT(ctx->beamform_frames);
    105 	result.cursor        = 0;
    106 	result.needed_frames = needed_frames;
    107 	return result;
    108 }
    109 
    110 function BeamformerFrame *
    111 frame_next(ComputeFrameIterator *bfi)
    112 {
    113 	BeamformerFrame *result = 0;
    114 	if (bfi->cursor != bfi->needed_frames) {
    115 		u32 index = (bfi->offset + bfi->cursor++) % bfi->capacity;
    116 		result    = bfi->frames + index;
    117 	}
    118 	return result;
    119 }
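Together these give a bounded walk over the frame ring buffer; a usage sketch mirroring the Sum shader path further down (base_index and consume() are placeholders):

	ComputeFrameIterator cfi = compute_frame_iterator(ctx, 1 + base_index - 4, 4);
	for (BeamformerFrame *it = frame_next(&cfi); it; it = frame_next(&cfi))
		consume(it->texture); /* visits the 4 frames ending at base_index, wrapping as needed */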
    120 
    121 function void
    122 alloc_beamform_frame(GLParams *gp, BeamformerFrame *out, iv3 out_dim, s8 name, Arena arena)
    123 {
    124 	out->dim.x = MAX(1, out_dim.x);
    125 	out->dim.y = MAX(1, out_dim.y);
    126 	out->dim.z = MAX(1, out_dim.z);
    127 
    128 	if (gp) {
    129 		out->dim.x = MIN(out->dim.x, gp->max_3d_texture_dim);
    130 		out->dim.y = MIN(out->dim.y, gp->max_3d_texture_dim);
    131 		out->dim.z = MIN(out->dim.z, gp->max_3d_texture_dim);
    132 	}
    133 
    134 	/* NOTE: allocate storage for beamformed output data;
    135 	 * this is shared between compute and fragment shaders */
    136 	u32 max_dim = (u32)MAX(out->dim.x, MAX(out->dim.y, out->dim.z));
    137 	out->mips   = (i32)ctz_u32(round_up_power_of_2(max_dim)) + 1;
    138 
    139 	Stream label = arena_stream(arena);
    140 	stream_append_s8(&label, name);
    141 	stream_append_byte(&label, '[');
    142 	stream_append_hex_u64(&label, out->id);
    143 	stream_append_byte(&label, ']');
    144 
    145 	glDeleteTextures(1, &out->texture);
    146 	glCreateTextures(GL_TEXTURE_3D, 1, &out->texture);
    147 	glTextureStorage3D(out->texture, out->mips, GL_RG32F, out->dim.x, out->dim.y, out->dim.z);
    148 
    149 	glTextureParameteri(out->texture, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
    150 	glTextureParameteri(out->texture, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
    151 
    152 	LABEL_GL_OBJECT(GL_TEXTURE, out->texture, stream_to_s8(&label));
    153 }
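The mip count comes from the largest output axis; a worked case for a 512 x 1 x 512 volume:

	/* max_dim = 512, round_up_power_of_2(512) = 512, ctz_u32(512) = 9,
	 * so mips = 9 + 1 = 10: the full chain 512 -> 256 -> ... -> 1 */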
    154 
    155 function void
    156 alloc_shader_storage(BeamformerCtx *ctx, u32 rf_raw_size, Arena a)
    157 {
    158 	ComputeShaderCtx     *cs = &ctx->csctx;
    159 	BeamformerParameters *bp = &((BeamformerSharedMemory *)ctx->shared_memory.region)->parameters;
    160 
    161 	cs->dec_data_dim = uv4_from_u32_array(bp->dec_data_dim);
    162 	cs->rf_raw_size  = rf_raw_size;
    163 
    164 	glDeleteBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
    165 	glCreateBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
    166 
    167 	u32 storage_flags = GL_DYNAMIC_STORAGE_BIT;
    168 	glDeleteBuffers(1, &cs->raw_data_ssbo);
    169 	glCreateBuffers(1, &cs->raw_data_ssbo);
    170 	glNamedBufferStorage(cs->raw_data_ssbo, rf_raw_size, 0, storage_flags);
    171 	LABEL_GL_OBJECT(GL_BUFFER, cs->raw_data_ssbo, s8("Raw_RF_SSBO"));
    172 
    173 	uz rf_decoded_size = 2 * sizeof(f32) * cs->dec_data_dim.x * cs->dec_data_dim.y * cs->dec_data_dim.z;
    174 	Stream label = arena_stream(a);
    175 	stream_append_s8(&label, s8("Decoded_RF_SSBO_"));
    176 	i32 s_widx = label.widx;
    177 	for (i32 i = 0; i < countof(cs->rf_data_ssbos); i++) {
    178 		glNamedBufferStorage(cs->rf_data_ssbos[i], (iz)rf_decoded_size, 0, 0);
    179 		stream_append_i64(&label, i);
    180 		LABEL_GL_OBJECT(GL_BUFFER, cs->rf_data_ssbos[i], stream_to_s8(&label));
    181 		stream_reset(&label, s_widx);
    182 	}
    183 
    184 	/* NOTE(rnp): these are stubs when CUDA isn't supported */
    185 	cs->cuda_lib.register_buffers(cs->rf_data_ssbos, countof(cs->rf_data_ssbos), cs->raw_data_ssbo);
    186 	cs->cuda_lib.init(bp->rf_raw_dim, bp->dec_data_dim);
    187 
    188 	i32  order    = (i32)cs->dec_data_dim.z;
    189 	i32 *hadamard = make_hadamard_transpose(&a, order);
    190 	if (hadamard) {
    191 		glDeleteTextures(1, &cs->hadamard_texture);
    192 		glCreateTextures(GL_TEXTURE_2D, 1, &cs->hadamard_texture);
    193 		glTextureStorage2D(cs->hadamard_texture, 1, GL_R8I, order, order);
    194 		glTextureSubImage2D(cs->hadamard_texture, 0, 0, 0,  order, order, GL_RED_INTEGER,
    195 		                    GL_INT, hadamard);
    196 		LABEL_GL_OBJECT(GL_TEXTURE, cs->hadamard_texture, s8("Hadamard_Matrix"));
    197 	}
    198 }
    199 
    200 function void
    201 push_compute_timing_info(ComputeTimingTable *t, ComputeTimingInfo info)
    202 {
    203 	u32 index = atomic_add_u32(&t->write_index, 1) % countof(t->buffer);
    204 	t->buffer[index] = info;
    205 }
    206 
    207 function b32
    208 fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, BeamformerViewPlaneTag plane)
    209 {
    210 	b32 result = 0;
    211 	if (work) {
    212 		result = 1;
    213 		u32 frame_id    = atomic_add_u32(&ctx->next_render_frame_index, 1);
    214 		u32 frame_index = frame_id % countof(ctx->beamform_frames);
    215 		work->kind      = BeamformerWorkKind_Compute;
    216 		work->lock      = BeamformerSharedMemoryLockKind_DispatchCompute;
    217 		work->frame     = ctx->beamform_frames + frame_index;
    218 		work->frame->ready_to_present = 0;
    219 		work->frame->view_plane_tag   = plane;
    220 		work->frame->id               = frame_id;
    221 	}
    222 	return result;
    223 }
    224 
    225 function void
    226 do_sum_shader(ComputeShaderCtx *cs, u32 *in_textures, u32 in_texture_count, f32 in_scale,
    227               u32 out_texture, iv3 out_data_dim)
    228 {
    229 	/* NOTE: zero output before summing */
    230 	glClearTexImage(out_texture, 0, GL_RED, GL_FLOAT, 0);
    231 	glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT);
    232 
    233 	glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F);
    234 	glProgramUniform1f(cs->programs[BeamformerShaderKind_Sum], SUM_PRESCALE_UNIFORM_LOC, in_scale);
    235 	for (u32 i = 0; i < in_texture_count; i++) {
    236 		glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F);
    237 		glDispatchCompute(ORONE((u32)out_data_dim.x / 32u),
    238 		                  ORONE((u32)out_data_dim.y),
    239 		                  ORONE((u32)out_data_dim.z / 32u));
    240 		glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
    241 	}
    242 }
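Each input texture is accumulated into out_texture after being scaled by in_scale, so a rolling average of n frames passes 1/n, exactly as the Sum shader case below does (textures, n, and averaged are placeholders):

	do_sum_shader(cs, textures, n, 1.0f / (f32)n, averaged->texture, averaged->dim);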
    243 
    244 struct compute_cursor {
    245 	iv3 cursor;
    246 	uv3 dispatch;
    247 	iv3 target;
    248 	u32 points_per_dispatch;
    249 	u32 completed_points;
    250 	u32 total_points;
    251 };
    252 
    253 function struct compute_cursor
    254 start_compute_cursor(iv3 dim, u32 max_points)
    255 {
    256 	struct compute_cursor result = {0};
    257 	u32 invocations_per_dispatch = DAS_LOCAL_SIZE_X * DAS_LOCAL_SIZE_Y * DAS_LOCAL_SIZE_Z;
    258 
    259 	result.dispatch.y = MIN(max_points / invocations_per_dispatch, (u32)ceil_f32((f32)dim.y / DAS_LOCAL_SIZE_Y));
    260 
    261 	u32 remaining     = max_points / result.dispatch.y;
    262 	result.dispatch.x = MIN(remaining / invocations_per_dispatch, (u32)ceil_f32((f32)dim.x / DAS_LOCAL_SIZE_X));
    263 	result.dispatch.z = MIN(remaining / (invocations_per_dispatch * result.dispatch.x),
    264 	                        (u32)ceil_f32((f32)dim.z / DAS_LOCAL_SIZE_Z));
    265 
    266 	result.target.x = MAX(dim.x / (i32)result.dispatch.x / DAS_LOCAL_SIZE_X, 1);
    267 	result.target.y = MAX(dim.y / (i32)result.dispatch.y / DAS_LOCAL_SIZE_Y, 1);
    268 	result.target.z = MAX(dim.z / (i32)result.dispatch.z / DAS_LOCAL_SIZE_Z, 1);
    269 
    270 	result.points_per_dispatch = 1;
    271 	result.points_per_dispatch *= result.dispatch.x * DAS_LOCAL_SIZE_X;
    272 	result.points_per_dispatch *= result.dispatch.y * DAS_LOCAL_SIZE_Y;
    273 	result.points_per_dispatch *= result.dispatch.z * DAS_LOCAL_SIZE_Z;
    274 
    275 	result.total_points = (u32)(dim.x * dim.y * dim.z);
    276 
    277 	return result;
    278 }
    279 
    280 function iv3
    281 step_compute_cursor(struct compute_cursor *cursor)
    282 {
    283 	cursor->cursor.x += 1;
    284 	if (cursor->cursor.x >= cursor->target.x) {
    285 		cursor->cursor.x  = 0;
    286 		cursor->cursor.y += 1;
    287 		if (cursor->cursor.y >= cursor->target.y) {
    288 			cursor->cursor.y  = 0;
    289 			cursor->cursor.z += 1;
    290 		}
    291 	}
    292 
    293 	cursor->completed_points += cursor->points_per_dispatch;
    294 
    295 	iv3 result = cursor->cursor;
    296 	result.x *= (i32)cursor->dispatch.x * DAS_LOCAL_SIZE_X;
    297 	result.y *= (i32)cursor->dispatch.y * DAS_LOCAL_SIZE_Y;
    298 	result.z *= (i32)cursor->dispatch.z * DAS_LOCAL_SIZE_Z;
    299 
    300 	return result;
    301 }
    302 
    303 function b32
    304 compute_cursor_finished(struct compute_cursor *cursor)
    305 {
    306 	b32 result = cursor->completed_points >= cursor->total_points;
    307 	return result;
    308 }
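A worked tiling example, assuming purely for illustration that DAS_LOCAL_SIZE_X/Y/Z are 32/1/32 and using the 64K point budget from the DAS path below:

	/* dim = {512, 1, 512}, max_points = 65536, invocations_per_dispatch = 32 * 1 * 32 = 1024
	 *   dispatch = {16, 1, 4}  -> 16*32 x 1 x 4*32 = 65536 voxels per glDispatchCompute
	 *   target   = { 1, 1, 4}  -> the cursor emits z offsets 0, 128, 256, 384
	 * covering the 512 * 1 * 512 = 262144 voxel volume in 4 dispatches */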
    309 
    310 function void
    311 plan_compute_pipeline(SharedMemoryRegion *os_sm, BeamformerComputePipeline *cp, BeamformerFilter *filters)
    312 {
    313 	BeamformerSharedMemory *sm = os_sm->region;
    314 	BeamformerParameters   *bp = &cp->das_ubo_data;
    315 
    316 	i32 compute_lock = BeamformerSharedMemoryLockKind_ComputePipeline;
    317 	i32 params_lock  = BeamformerSharedMemoryLockKind_Parameters;
    318 	os_shared_memory_region_lock(os_sm, sm->locks, compute_lock, (u32)-1);
    319 
    320 	b32 decode_first = sm->shaders[0] == BeamformerShaderKind_Decode;
    321 	b32 cuda_hilbert = 0;
    322 	b32 demodulate   = 0;
    323 
    324 	for (i32 i = 0; i < sm->shader_count; i++) {
    325 		switch (sm->shaders[i]) {
    326 		case BeamformerShaderKind_CudaHilbert:{ cuda_hilbert = 1; }break;
    327 		case BeamformerShaderKind_Demodulate:{  demodulate = 1;   }break;
    328 		default:{}break;
    329 		}
    330 	}
    331 
    332 	if (demodulate) cuda_hilbert = 0;
    333 
    334 	os_shared_memory_region_lock(os_sm, sm->locks, params_lock, (u32)-1);
    335 	mem_copy(bp, &sm->parameters, sizeof(*bp));
    336 	os_shared_memory_region_unlock(os_sm, sm->locks, params_lock);
    337 
    338 	BeamformerDataKind data_kind = sm->data_kind;
    339 	cp->shader_count = 0;
    340 	for (i32 i = 0; i < sm->shader_count; i++) {
    341 		BeamformerShaderParameters *sp = sm->shader_parameters + i;
    342 		u32 shader = sm->shaders[i];
    343 		b32 commit = 0;
    344 
    345 		switch (shader) {
    346 		case BeamformerShaderKind_CudaHilbert:{ commit = cuda_hilbert; }break;
    347 		case BeamformerShaderKind_Decode:{
    348 			BeamformerShaderKind decode_table[] = {
    349 				[BeamformerDataKind_Int16]          = BeamformerShaderKind_Decode,
    350 				[BeamformerDataKind_Int16Complex]   = BeamformerShaderKind_DecodeInt16Complex,
    351 				[BeamformerDataKind_Float32]        = BeamformerShaderKind_DecodeFloat,
    352 				[BeamformerDataKind_Float32Complex] = BeamformerShaderKind_DecodeFloatComplex,
    353 			};
    354 			if (decode_first && demodulate) {
    355 				/* TODO(rnp): for now we assume that if we are demodulating the data is int16 */
    356 				shader = BeamformerShaderKind_DecodeInt16ToFloat;
    357 			} else if (decode_first) {
    358 				shader = decode_table[CLAMP(data_kind, 0, countof(decode_table) - 1)];
    359 			} else {
    360 				if (data_kind == BeamformerDataKind_Int16)
    361 					shader = BeamformerShaderKind_DecodeInt16Complex;
    362 				else
    363 					shader = BeamformerShaderKind_DecodeFloatComplex;
    364 			}
    365 			commit = 1;
    366 		}break;
    367 		case BeamformerShaderKind_Demodulate:{
    368 			if (decode_first || (!decode_first && data_kind == BeamformerDataKind_Float32))
    369 				shader = BeamformerShaderKind_DemodulateFloat;
    370 			bp->time_offset += beamformer_filter_time_offset(filters + sp->filter_slot);
    371 			commit = 1;
    372 		}break;
    373 		case BeamformerShaderKind_DAS:{
    374 			if (!bp->coherency_weighting)
    375 				shader = BeamformerShaderKind_DASFast;
    376 			commit = 1;
    377 		}break;
    378 		default:{ commit = 1; }break;
    379 		}
    380 
    381 		if (commit) {
    382 			i32 index = cp->shader_count++;
    383 			cp->shaders[index] = shader;
    384 			cp->shader_parameters[index] = *sp;
    385 		}
    386 	}
    387 	os_shared_memory_region_unlock(os_sm, sm->locks, compute_lock);
    388 
    389 	u32 time_compression = 1;
    390 	if (demodulate) time_compression = 2;
    391 
    392 	if (!demodulate) bp->center_frequency = 0;
    393 	bp->decimation_rate = MAX(bp->decimation_rate, 1);
    394 
    395 	cp->decode_dispatch.x = (u32)ceil_f32((f32)bp->dec_data_dim[0] / DECODE_LOCAL_SIZE_X);
    396 	cp->decode_dispatch.y = (u32)ceil_f32((f32)bp->dec_data_dim[1] / DECODE_LOCAL_SIZE_Y);
    397 	cp->decode_dispatch.z = (u32)ceil_f32((f32)bp->dec_data_dim[2] / DECODE_LOCAL_SIZE_Z);
    398 
    399 	/* NOTE(rnp): decode 2 samples per dispatch when data is i16 */
    400 	if (decode_first && cp->data_kind == BeamformerDataKind_Int16)
    401 		cp->decode_dispatch.x = (u32)ceil_f32((f32)cp->decode_dispatch.x / 2);
    402 
    403 	BeamformerDecodeUBO *dp = &cp->decode_ubo_data;
    404 	dp->decode_mode    = bp->decode;
    405 	dp->transmit_count = bp->dec_data_dim[2];
    406 
    407 	if (decode_first) {
    408 		dp->input_channel_stride   = bp->rf_raw_dim[0];
    409 		dp->input_sample_stride    = 1;
    410 		dp->input_transmit_stride  = bp->dec_data_dim[0];
    411 
    412 		dp->output_channel_stride  = bp->dec_data_dim[0] * bp->dec_data_dim[2] / time_compression;
    413 		dp->output_sample_stride   = 1;
    414 		dp->output_transmit_stride = bp->dec_data_dim[0] / time_compression;
    415 	} else {
    416 		dp->input_channel_stride   = bp->dec_data_dim[0] * bp->dec_data_dim[2] /
    417 		                             bp->decimation_rate / time_compression;
    418 		dp->input_sample_stride    = bp->dec_data_dim[2];
    419 		dp->input_transmit_stride  = 1;
    420 
    421 		dp->output_channel_stride  = dp->input_channel_stride;
    422 		dp->output_sample_stride   = 1;
    423 		dp->output_transmit_stride = bp->dec_data_dim[0] / bp->decimation_rate / time_compression;
    424 	}
    425 
    426 	/* NOTE(rnp): when we are demodulating we pretend that the sampler was alternating
    427 	 * between sampling the I portion and the Q portion of an IQ signal. Therefore there
    428 	 * is an implicit decimation factor of 2 which must always be included. All code here
    429 	 * assumes that the signal was sampled in such a way that supports this operation.
    430 	 * To recover IQ[n] from the sampled data (RF[n]) we do the following:
    431 	 *   I[n]  = RF[n]
    432 	 *   Q[n]  = RF[n + 1]
    433 	 *   IQ[n] = I[n] - j*Q[n]
    434 	 */
    435 	if (demodulate) {
    436 		BeamformerDemodulateUBO *mp = &cp->demod_ubo_data;
    437 		mp->demodulation_frequency = bp->center_frequency;
    438 		mp->sampling_frequency     = bp->sampling_frequency / (f32)time_compression;
    439 		mp->decimation_rate        = bp->decimation_rate;
    440 		mp->map_channels           = !decode_first;
    441 
    442 		if (decode_first) {
    443 			mp->input_channel_stride  = dp->output_channel_stride;
    444 			mp->input_sample_stride   = dp->output_sample_stride;
    445 			mp->input_transmit_stride = dp->output_transmit_stride;
    446 
    447 			mp->output_channel_stride  = bp->dec_data_dim[0] * bp->dec_data_dim[2] /
    448 			                             mp->decimation_rate / time_compression;
    449 			mp->output_sample_stride   = 1;
    450 			mp->output_transmit_stride = bp->dec_data_dim[0] /  mp->decimation_rate / time_compression;
    451 		} else {
    452 			mp->input_channel_stride  = bp->rf_raw_dim[0] / time_compression;
    453 			mp->input_sample_stride   = 1;
    454 			mp->input_transmit_stride = bp->dec_data_dim[0] / time_compression;
    455 
    456 			/* NOTE(rnp): output optimized layout for decoding */
    457 			mp->output_channel_stride  = dp->input_channel_stride;
    458 			mp->output_sample_stride   = dp->input_sample_stride;
    459 			mp->output_transmit_stride = dp->input_transmit_stride;
    460 
    461 			u32 time_samples = bp->dec_data_dim[0] / mp->decimation_rate / time_compression;
    462 			cp->decode_dispatch.x = (u32)ceil_f32((f32)time_samples / DECODE_LOCAL_SIZE_X);
    463 		}
    464 
    465 		f32 local_size_x = DEMOD_LOCAL_SIZE_X * (f32)time_compression * (f32)mp->decimation_rate;
    466 		cp->demod_dispatch.x = (u32)ceil_f32((f32)bp->dec_data_dim[0] / local_size_x);
    467 		cp->demod_dispatch.y = (u32)ceil_f32((f32)bp->dec_data_dim[1] / DEMOD_LOCAL_SIZE_Y);
    468 		cp->demod_dispatch.z = (u32)ceil_f32((f32)bp->dec_data_dim[2] / DEMOD_LOCAL_SIZE_Z);
    469 
    470 		bp->sampling_frequency /= (f32)mp->decimation_rate * (f32)time_compression;
    471 		bp->dec_data_dim[0]    /= mp->decimation_rate * time_compression;
    472 	}
    473 	/* TODO(rnp): if IQ (* 8) else (* 4) */
    474 	cp->rf_size = bp->dec_data_dim[0] * bp->dec_data_dim[1] * bp->dec_data_dim[2] * 8;
    475 }
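A host-side sketch of the pairing described in the NOTE above, reading consecutive even/odd samples as I and Q (illustration only: rf, iq, and sample_count are placeholders, and the real conversion, filtering, and decimation happen in the demodulate shader):

	for (u32 n = 0; n < sample_count / 2; n++) {
		iq[2 * n + 0] =  rf[2 * n + 0]; /*  I[n] = RF[n]                      */
		iq[2 * n + 1] = -rf[2 * n + 1]; /* -Q[n], since IQ[n] = I[n] - j*Q[n] */
	}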
    476 
    477 function m4
    478 das_voxel_transform_matrix(BeamformerParameters *bp)
    479 {
    480 	v3 min = v4_from_f32_array(bp->output_min_coordinate).xyz;
    481 	v3 max = v4_from_f32_array(bp->output_max_coordinate).xyz;
    482 	v3 extent = v3_abs(v3_sub(max, min));
    483 	v3 points = {{(f32)bp->output_points[0], (f32)bp->output_points[1], (f32)bp->output_points[2]}};
    484 
    485 	m4 T1 = m4_translation(v3_scale(v3_sub(points, (v3){{1.0f, 1.0f, 1.0f}}), -0.5f));
    486 	m4 T2 = m4_translation(v3_add(min, v3_scale(extent, 0.5f)));
    487 	m4 S  = m4_scale(v3_div(extent, points));
    488 
    489 	m4 R;
    490 	switch (bp->das_shader_id) {
    491 	case DASShaderKind_FORCES:
    492 	case DASShaderKind_UFORCES:
    493 	case DASShaderKind_FLASH:
    494 	{
    495 		R = m4_identity();
    496 		S.c[1].E[1]  = 0;
    497 		T2.c[3].E[1] = 0;
    498 	}break;
    499 	case DASShaderKind_HERCULES:
    500 	case DASShaderKind_UHERCULES:
    501 	case DASShaderKind_RCA_TPW:
    502 	case DASShaderKind_RCA_VLS:
    503 	{
    504 		R = m4_rotation_about_z(bp->beamform_plane ? 0.0f : 0.25f);
    505 		if (!(points.x > 1 && points.y > 1 && points.z > 1))
    506 			T2.c[3].E[1] = bp->off_axis_pos;
    507 	}break;
    508 	default:{ R = m4_identity(); }break;
    509 	}
    510 	m4 result = m4_mul(R, m4_mul(T2, m4_mul(S, T1)));
    511 	return result;
    512 }
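Reading the composition right to left, a voxel index v in [0, points - 1] per axis lands at the center of its voxel (the FORCES/UFORCES/FLASH cases additionally collapse the y axis):

	/* world = R * (min + extent/2 + (extent / points) * (v - (points - 1)/2))
	 * so v = 0 maps to min + extent/(2*points) and v = points-1 to max - extent/(2*points) */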
    513 
    514 function void
    515 do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformerFrame *frame,
    516                   BeamformerShaderKind shader, BeamformerShaderParameters *sp)
    517 {
    518 	ComputeShaderCtx          *csctx = &ctx->csctx;
    519 	BeamformerComputePipeline *cp    = &csctx->compute_pipeline;
    520 
    521 	u32 program = csctx->programs[shader];
    522 	glUseProgram(program);
    523 
    524 	u32 output_ssbo_idx = !csctx->last_output_ssbo_index;
    525 	u32 input_ssbo_idx  = csctx->last_output_ssbo_index;
    526 
    527 	switch (shader) {
    528 	case BeamformerShaderKind_Decode:
    529 	case BeamformerShaderKind_DecodeInt16Complex:
    530 	case BeamformerShaderKind_DecodeFloat:
    531 	case BeamformerShaderKind_DecodeFloatComplex:
    532 	case BeamformerShaderKind_DecodeInt16ToFloat:
    533 	{
    534 		glBindBufferBase(GL_UNIFORM_BUFFER, 0, cp->ubos[BeamformerComputeUBOKind_Decode]);
    535 		glBindImageTexture(0, csctx->hadamard_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I);
    536 
    537 		if (shader == cp->shaders[0]) {
    538 			glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo);
    539 			glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[input_ssbo_idx]);
    540 			glBindImageTexture(1, csctx->channel_mapping_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I);
    541 			glProgramUniform1ui(program, DECODE_FIRST_PASS_UNIFORM_LOC, 1);
    542 
    543 			glDispatchCompute(cp->decode_dispatch.x, cp->decode_dispatch.y, cp->decode_dispatch.z);
    544 			glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
    545 		}
    546 
    547 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
    548 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, csctx->rf_data_ssbos[output_ssbo_idx]);
    549 
    550 		glProgramUniform1ui(program, DECODE_FIRST_PASS_UNIFORM_LOC, 0);
    551 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, csctx->rf_data_ssbos[output_ssbo_idx]);
    552 
    553 		glDispatchCompute(cp->decode_dispatch.x, cp->decode_dispatch.y, cp->decode_dispatch.z);
    554 		glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
    555 
    556 		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
    557 	}break;
    558 	case BeamformerShaderKind_CudaDecode:{
    559 		csctx->cuda_lib.decode(0, output_ssbo_idx, 0);
    560 		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
    561 	}break;
    562 	case BeamformerShaderKind_CudaHilbert:{
    563 		csctx->cuda_lib.hilbert(input_ssbo_idx, output_ssbo_idx);
    564 		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
    565 	}break;
    566 	case BeamformerShaderKind_Demodulate:
    567 	case BeamformerShaderKind_DemodulateFloat:
    568 	{
    569 		BeamformerDemodulateUBO *ubo = &cp->demod_ubo_data;
    570 		u32 input = ubo->map_channels ? csctx->raw_data_ssbo : csctx->rf_data_ssbos[input_ssbo_idx];
    571 		glBindBufferBase(GL_UNIFORM_BUFFER,        0, cp->ubos[BeamformerComputeUBOKind_Demodulate]);
    572 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, input);
    573 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
    574 
    575 		glBindImageTexture(0, csctx->filters[sp->filter_slot].texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R32F);
    576 		if (ubo->map_channels)
    577 			glBindImageTexture(1, csctx->channel_mapping_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I);
    578 
    579 		glDispatchCompute(cp->demod_dispatch.x, cp->demod_dispatch.y, cp->demod_dispatch.z);
    580 		glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
    581 
    582 		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
    583 	}break;
    584 	case BeamformerShaderKind_MinMax:{
    585 		for (i32 i = 1; i < frame->mips; i++) {
    586 			glBindImageTexture(0, frame->texture, i - 1, GL_TRUE, 0, GL_READ_ONLY,  GL_RG32F);
    587 			glBindImageTexture(1, frame->texture, i - 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
    588 			glProgramUniform1i(csctx->programs[shader], MIN_MAX_MIPS_LEVEL_UNIFORM_LOC, i);
    589 
    590 			u32 width  = (u32)frame->dim.x >> i;
    591 			u32 height = (u32)frame->dim.y >> i;
    592 			u32 depth  = (u32)frame->dim.z >> i;
    593 			glDispatchCompute(ORONE(width / 32), ORONE(height), ORONE(depth / 32));
    594 			glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
    595 		}
    596 	}break;
    597 	case BeamformerShaderKind_DAS:
    598 	case BeamformerShaderKind_DASFast:
    599 	{
    600 		BeamformerParameters *ubo = &cp->das_ubo_data;
    601 		if (shader == BeamformerShaderKind_DASFast) {
    602 			glClearTexImage(frame->texture, 0, GL_RED, GL_FLOAT, 0);
    603 			glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT);
    604 			glBindImageTexture(0, frame->texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F);
    605 		} else {
    606 			glBindImageTexture(0, frame->texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
    607 		}
    608 
    609 		glBindBufferBase(GL_UNIFORM_BUFFER, 0, cp->ubos[BeamformerComputeUBOKind_DAS]);
    610 		glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx], 0, cp->rf_size);
    611 		glBindImageTexture(1, csctx->sparse_elements_texture, 0, GL_FALSE, 0, GL_READ_ONLY,  GL_R16I);
    612 		glBindImageTexture(2, csctx->focal_vectors_texture,   0, GL_FALSE, 0, GL_READ_ONLY,  GL_RG32F);
    613 
    614 		m4 voxel_transform = das_voxel_transform_matrix(ubo);
    615 		glProgramUniform1ui(program, DAS_CYCLE_T_UNIFORM_LOC, cycle_t++);
    616 		glProgramUniformMatrix4fv(program, DAS_VOXEL_MATRIX_LOC, 1, 0, voxel_transform.E);
    617 
    618 		if (shader == BeamformerShaderKind_DASFast) {
    619 			i32 loop_end;
    620 			if (ubo->das_shader_id == DASShaderKind_RCA_VLS ||
    621 			    ubo->das_shader_id == DASShaderKind_RCA_TPW)
    622 			{
    623 				/* NOTE(rnp): to avoid repeatedly sampling the whole focal vectors
    624 				 * texture we loop over transmits for VLS/TPW */
    625 				loop_end = (i32)ubo->dec_data_dim[2];
    626 			} else {
    627 				loop_end = (i32)ubo->dec_data_dim[1];
    628 			}
    629 			f32 percent_per_step = 1.0f / (f32)loop_end;
    630 			csctx->processing_progress = -percent_per_step;
    631 			for (i32 index = 0; index < loop_end; index++) {
    632 				csctx->processing_progress += percent_per_step;
    633 				/* IMPORTANT(rnp): prevents OS from coalescing and killing our shader */
    634 				glFinish();
    635 				glProgramUniform1i(program, DAS_FAST_CHANNEL_UNIFORM_LOC, index);
    636 				glDispatchCompute((u32)ceil_f32((f32)frame->dim.x / DAS_FAST_LOCAL_SIZE_X),
    637 				                  (u32)ceil_f32((f32)frame->dim.y / DAS_FAST_LOCAL_SIZE_Y),
    638 				                  (u32)ceil_f32((f32)frame->dim.z / DAS_FAST_LOCAL_SIZE_Z));
    639 				glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
    640 			}
    641 		} else {
    642 			#if 1
    643 			/* TODO(rnp): compute max_points_per_dispatch based on something like a
    644 			 * transmit_count * channel_count product */
    645 			u32 max_points_per_dispatch = KB(64);
    646 			struct compute_cursor cursor = start_compute_cursor(frame->dim, max_points_per_dispatch);
    647 			f32 percent_per_step = (f32)cursor.points_per_dispatch / (f32)cursor.total_points;
    648 			csctx->processing_progress = -percent_per_step;
    649 			for (iv3 offset = {0};
    650 			     !compute_cursor_finished(&cursor);
    651 			     offset = step_compute_cursor(&cursor))
    652 			{
    653 				csctx->processing_progress += percent_per_step;
    654 				/* IMPORTANT(rnp): prevents OS from coalescing and killing our shader */
    655 				glFinish();
    656 				glProgramUniform3iv(program, DAS_VOXEL_OFFSET_UNIFORM_LOC, 1, offset.E);
    657 				glDispatchCompute(cursor.dispatch.x, cursor.dispatch.y, cursor.dispatch.z);
    658 			}
    659 			#else
    660 			/* NOTE(rnp): use this for testing tiling code. The performance of the above path
    661 			 * should be the same as this path if everything is working correctly */
    662 			iv3 compute_dim_offset = {0};
    663 			glProgramUniform3iv(program, DAS_VOXEL_OFFSET_UNIFORM_LOC, 1, compute_dim_offset.E);
     664 			glDispatchCompute((u32)ceil_f32((f32)frame->dim.x / DAS_LOCAL_SIZE_X),
     665 			                  (u32)ceil_f32((f32)frame->dim.y / DAS_LOCAL_SIZE_Y),
     666 			                  (u32)ceil_f32((f32)frame->dim.z / DAS_LOCAL_SIZE_Z));
    667 			#endif
    668 		}
    669 		glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT|GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
    670 	}break;
    671 	case BeamformerShaderKind_Sum:{
    672 		u32 aframe_index = ctx->averaged_frame_index % ARRAY_COUNT(ctx->averaged_frames);
    673 		BeamformerFrame *aframe = ctx->averaged_frames + aframe_index;
    674 		aframe->id              = ctx->averaged_frame_index;
    675 		atomic_store_u32(&aframe->ready_to_present, 0);
     676 		/* TODO(rnp): hack: we need a better way of specifying which frames to sum;
    677 		 * this is fine for rolling averaging but what if we want to do something else */
    678 		assert(frame >= ctx->beamform_frames);
    679 		assert(frame < ctx->beamform_frames + countof(ctx->beamform_frames));
    680 		u32 base_index   = (u32)(frame - ctx->beamform_frames);
    681 		u32 to_average   = (u32)cp->das_ubo_data.output_points[3];
    682 		u32 frame_count  = 0;
    683 		u32 *in_textures = push_array(&arena, u32, MAX_BEAMFORMED_SAVED_FRAMES);
    684 		ComputeFrameIterator cfi = compute_frame_iterator(ctx, 1 + base_index - to_average, to_average);
    685 		for (BeamformerFrame *it = frame_next(&cfi); it; it = frame_next(&cfi))
    686 			in_textures[frame_count++] = it->texture;
    687 
    688 		assert(to_average == frame_count);
    689 
    690 		do_sum_shader(csctx, in_textures, frame_count, 1 / (f32)frame_count, aframe->texture, aframe->dim);
    691 		aframe->min_coordinate  = frame->min_coordinate;
    692 		aframe->max_coordinate  = frame->max_coordinate;
    693 		aframe->compound_count  = frame->compound_count;
    694 		aframe->das_shader_kind = frame->das_shader_kind;
    695 	}break;
    696 	InvalidDefaultCase;
    697 	}
    698 }
    699 
    700 function s8
    701 shader_text_with_header(ShaderReloadContext *ctx, OS *os, Arena *arena)
    702 {
    703 	Stream sb = arena_stream(*arena);
    704 	stream_append_s8s(&sb, s8("#version 460 core\n\n"), ctx->header);
    705 
    706 	switch (ctx->kind) {
    707 	case BeamformerShaderKind_Demodulate:
    708 	case BeamformerShaderKind_DemodulateFloat:
    709 	{
     710 		stream_append_s8(&sb, s8(""
     711 		"layout(local_size_x = " str(DEMOD_LOCAL_SIZE_X) ", "
     712 		       "local_size_y = " str(DEMOD_LOCAL_SIZE_Y) ", "
     713 		       "local_size_z = " str(DEMOD_LOCAL_SIZE_Z) ") in;\n\n"
     714 		));
     715 		if (ctx->kind == BeamformerShaderKind_DemodulateFloat)
     716 			stream_append_s8(&sb, s8("#define INPUT_DATA_TYPE_FLOAT\n\n"));
    717 	}break;
    718 	case BeamformerShaderKind_DAS:
    719 	case BeamformerShaderKind_DASFast:
    720 	{
    721 		if (ctx->kind == BeamformerShaderKind_DAS) {
    722 			stream_append_s8(&sb, s8(""
    723 			"layout(local_size_x = " str(DAS_LOCAL_SIZE_X) ", "
    724 			       "local_size_y = " str(DAS_LOCAL_SIZE_Y) ", "
    725 			       "local_size_z = " str(DAS_LOCAL_SIZE_Z) ") in;\n\n"
    726 			"#define DAS_FAST 0\n\n"
    727 			"layout(location = " str(DAS_VOXEL_OFFSET_UNIFORM_LOC) ") uniform ivec3 u_voxel_offset;\n"
    728 			));
    729 		} else {
    730 			stream_append_s8(&sb, s8(""
    731 			"layout(local_size_x = " str(DAS_FAST_LOCAL_SIZE_X) ", "
    732 			       "local_size_y = " str(DAS_FAST_LOCAL_SIZE_Y) ", "
    733 			       "local_size_z = " str(DAS_FAST_LOCAL_SIZE_Z) ") in;\n\n"
    734 			"#define DAS_FAST 1\n\n"
    735 			"layout(location = " str(DAS_FAST_CHANNEL_UNIFORM_LOC) ") uniform int   u_channel;\n"
    736 			));
    737 		}
    738 		#define X(type, id, pretty, fixed_tx) "#define DAS_ID_" #type " " #id "\n"
    739 		stream_append_s8(&sb, s8(""
    740 		"layout(location = " str(DAS_VOXEL_MATRIX_LOC)    ") uniform mat4  u_voxel_transform;\n"
    741 		"layout(location = " str(DAS_CYCLE_T_UNIFORM_LOC) ") uniform uint  u_cycle_t;\n\n"
    742 		DAS_TYPES
    743 		));
    744 		#undef X
    745 	}break;
    746 	case BeamformerShaderKind_Decode:
    747 	case BeamformerShaderKind_DecodeFloat:
    748 	case BeamformerShaderKind_DecodeFloatComplex:
    749 	case BeamformerShaderKind_DecodeInt16Complex:
    750 	case BeamformerShaderKind_DecodeInt16ToFloat:
    751 	{
    752 		s8 define_table[] = {
    753 			[BeamformerShaderKind_DecodeFloatComplex] = s8("#define INPUT_DATA_TYPE_FLOAT_COMPLEX\n\n"),
    754 			[BeamformerShaderKind_DecodeFloat]        = s8("#define INPUT_DATA_TYPE_FLOAT\n\n"),
    755 			[BeamformerShaderKind_DecodeInt16Complex] = s8("#define INPUT_DATA_TYPE_INT16_COMPLEX\n\n"),
    756 			[BeamformerShaderKind_DecodeInt16ToFloat] = s8("#define OUTPUT_DATA_TYPE_FLOAT\n\n"),
    757 		};
    758 		#define X(type, id, pretty) "#define DECODE_MODE_" #type " " #id "\n"
    759 		stream_append_s8s(&sb, define_table[ctx->kind], s8(""
    760 		"layout(local_size_x = " str(DECODE_LOCAL_SIZE_X) ", "
    761 		       "local_size_y = " str(DECODE_LOCAL_SIZE_Y) ", "
    762 		       "local_size_z = " str(DECODE_LOCAL_SIZE_Z) ") in;\n\n"
    763 		"layout(location = " str(DECODE_FIRST_PASS_UNIFORM_LOC) ") uniform bool u_first_pass;\n\n"
    764 		DECODE_TYPES
    765 		));
    766 		#undef X
    767 	}break;
    768 	case BeamformerShaderKind_MinMax:{
    769 		stream_append_s8(&sb, s8("layout(location = " str(MIN_MAX_MIPS_LEVEL_UNIFORM_LOC)
    770 		                         ") uniform int u_mip_map;\n\n"));
    771 	}break;
    772 	case BeamformerShaderKind_Sum:{
    773 		stream_append_s8(&sb, s8("layout(location = " str(SUM_PRESCALE_UNIFORM_LOC)
    774 		                         ") uniform float u_sum_prescale = 1.0;\n\n"));
    775 	}break;
    776 	default:{}break;
    777 	}
    778 	stream_append_s8(&sb, s8("\n#line 1\n"));
    779 
    780 	s8 result = arena_stream_commit(arena, &sb);
    781 	if (ctx->path.len) {
    782 		s8 file = os_read_whole_file(arena, (c8 *)ctx->path.data);
    783 		assert(file.data == result.data + result.len);
    784 		result.len += file.len;
    785 	}
    786 
    787 	return result;
    788 }
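For reference, the prelude assembled for the Sum shader looks roughly like the following, assuming ctx->header is empty (the location value is whatever SUM_PRESCALE_UNIFORM_LOC expands to via str()):

	/* #version 460 core
	 *
	 * layout(location = <SUM_PRESCALE_UNIFORM_LOC>) uniform float u_sum_prescale = 1.0;
	 *
	 * #line 1
	 * ...followed by the file at ctx->path, if any... */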
    789 
    790 DEBUG_EXPORT BEAMFORMER_RELOAD_SHADER_FN(beamformer_reload_shader)
    791 {
    792 	i32 shader_count = 1;
    793 	ShaderReloadContext *link = src->link;
    794 	while (link != src) { shader_count++; link = link->link; }
    795 
    796 	s8  *shader_texts = push_array(&arena, s8,  shader_count);
    797 	u32 *shader_types = push_array(&arena, u32, shader_count);
    798 
    799 	i32 index = 0;
    800 	do {
    801 		shader_texts[index] = shader_text_with_header(link, os, &arena);
    802 		shader_types[index] = link->gl_type;
    803 		index++;
    804 		link = link->link;
    805 	} while (link != src);
    806 
    807 	u32 new_program = load_shader(&ctx->os, arena, shader_texts, shader_types, shader_count, shader_name);
    808 	if (new_program) {
    809 		glDeleteProgram(*src->shader);
    810 		*src->shader = new_program;
    811 		if (src->kind == BeamformerShaderKind_Render3D) ctx->frame_view_render_context.updated = 1;
    812 	}
    813 	return new_program != 0;
    814 }
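The reload contexts form a circular singly linked list, which is how shader_count is derived; a hypothetical two-stage setup (the field values and GL enums here are illustrative, not taken from this file):

	ShaderReloadContext vertex = {.gl_type = GL_VERTEX_SHADER};
	ShaderReloadContext pixel  = {.gl_type = GL_FRAGMENT_SHADER};
	vertex.link = &pixel;
	pixel.link  = &vertex; /* walking link from either context back to itself counts 2 stages */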
    815 
    816 function b32
    817 reload_compute_shader(BeamformerCtx *ctx, ShaderReloadContext *src, s8 name_extra, Arena arena)
    818 {
    819 	Stream sb  = arena_stream(arena);
    820 	stream_append_s8s(&sb, src->name, name_extra);
    821 	s8  name   = arena_stream_commit(&arena, &sb);
    822 	b32 result = beamformer_reload_shader(&ctx->os, ctx, src, arena, name);
    823 	return result;
    824 }
    825 
    826 function void
    827 complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_context)
    828 {
    829 	ComputeShaderCtx       *cs = &ctx->csctx;
    830 	BeamformerSharedMemory *sm = ctx->shared_memory.region;
    831 	BeamformerParameters   *bp = &sm->parameters;
    832 
    833 	BeamformWork *work = beamform_work_queue_pop(q);
    834 	while (work) {
    835 		b32 can_commit = 1;
    836 		switch (work->kind) {
    837 		case BeamformerWorkKind_ReloadShader:{
    838 			ShaderReloadContext *src = work->shader_reload_context;
    839 			b32 success = reload_compute_shader(ctx, src, s8(""), arena);
    840 			/* TODO(rnp): think of a better way of doing this */
    841 			switch (src->kind) {
    842 			case BeamformerShaderKind_DAS:{
    843 				src->kind   = BeamformerShaderKind_DASFast;
    844 				src->shader = cs->programs + src->kind;
    845 				success &= reload_compute_shader(ctx, src, s8(" (Fast)"), arena);
    846 
    847 				src->kind   = BeamformerShaderKind_DAS;
    848 				src->shader = cs->programs + src->kind;
    849 			}break;
    850 			case BeamformerShaderKind_Decode:{
    851 				src->kind   = BeamformerShaderKind_DecodeFloatComplex;
    852 				src->shader = cs->programs + src->kind;
    853 				success &= reload_compute_shader(ctx, src, s8(" (F32C)"), arena);
    854 
    855 				src->kind   = BeamformerShaderKind_DecodeFloat;
    856 				src->shader = cs->programs + src->kind;
    857 				success &= reload_compute_shader(ctx, src, s8(" (F32)"),  arena);
    858 
    859 				src->kind   = BeamformerShaderKind_DecodeInt16Complex;
    860 				src->shader = cs->programs + src->kind;
    861 				success &= reload_compute_shader(ctx, src, s8(" (I16C)"),  arena);
    862 
    863 				src->kind   = BeamformerShaderKind_DecodeInt16ToFloat;
    864 				src->shader = cs->programs + src->kind;
    865 				success &= reload_compute_shader(ctx, src, s8(" (I16-F32)"),  arena);
    866 
    867 				src->kind   = BeamformerShaderKind_Decode;
    868 				src->shader = cs->programs + src->kind;
    869 			}break;
    870 			case BeamformerShaderKind_Demodulate:{
    871 				src->kind   = BeamformerShaderKind_DemodulateFloat;
    872 				src->shader = cs->programs + src->kind;
    873 				success &= reload_compute_shader(ctx, src, s8(" (F32)"), arena);
    874 
    875 				src->kind   = BeamformerShaderKind_Demodulate;
    876 				src->shader = cs->programs + src->kind;
    877 			}break;
    878 			default:{}break;
    879 			}
    880 
    881 
    882 			if (success && ctx->latest_frame) {
    883 				fill_frame_compute_work(ctx, work, ctx->latest_frame->view_plane_tag);
    884 				can_commit = 0;
    885 			}
    886 		}break;
    887 		case BeamformerWorkKind_ExportBuffer:{
    888 			/* TODO(rnp): better way of handling DispatchCompute barrier */
    889 			post_sync_barrier(&ctx->shared_memory, BeamformerSharedMemoryLockKind_DispatchCompute, sm->locks);
    890 			os_shared_memory_region_lock(&ctx->shared_memory, sm->locks, (i32)work->lock, (u32)-1);
    891 			BeamformerExportContext *ec = &work->export_context;
    892 			switch (ec->kind) {
    893 			case BeamformerExportKind_BeamformedData:{
    894 				BeamformerFrame *frame = ctx->latest_frame;
    895 				if (frame) {
    896 					assert(frame->ready_to_present);
    897 					u32 texture  = frame->texture;
    898 					iv3 dim      = frame->dim;
    899 					u32 out_size = (u32)dim.x * (u32)dim.y * (u32)dim.z * 2 * sizeof(f32);
    900 					if (out_size <= ec->size) {
    901 						glGetTextureImage(texture, 0, GL_RG, GL_FLOAT, (i32)out_size,
    902 						                  (u8 *)sm + BEAMFORMER_SCRATCH_OFF);
    903 					}
    904 				}
    905 			}break;
    906 			case BeamformerExportKind_Stats:{
    907 				ComputeTimingTable *table = ctx->compute_timing_table;
    908 				/* NOTE(rnp): do a little spin to let this finish updating */
    909 				while (table->write_index != atomic_load_u32(&table->read_index));
    910 				ComputeShaderStats *stats = ctx->compute_shader_stats;
    911 				if (sizeof(stats->table) <= ec->size)
    912 					mem_copy((u8 *)sm + BEAMFORMER_SCRATCH_OFF, &stats->table, sizeof(stats->table));
    913 			}break;
    914 			InvalidDefaultCase;
    915 			}
    916 			os_shared_memory_region_unlock(&ctx->shared_memory, sm->locks, (i32)work->lock);
    917 			post_sync_barrier(&ctx->shared_memory, BeamformerSharedMemoryLockKind_ExportSync, sm->locks);
    918 		}break;
    919 		case BeamformerWorkKind_CreateFilter:{
    920 			BeamformerCreateFilterContext *fctx = &work->create_filter_context;
    921 			beamformer_filter_update(cs->filters + fctx->slot, fctx, sm->parameters.sampling_frequency / 2, arena);
    922 		}break;
    923 		case BeamformerWorkKind_UploadBuffer:{
    924 			os_shared_memory_region_lock(&ctx->shared_memory, sm->locks, (i32)work->lock, (u32)-1);
    925 			BeamformerUploadContext *uc = &work->upload_context;
    926 			u32 tex_type, tex_format, tex_1d = 0, buffer = 0;
    927 			i32 tex_element_count;
    928 			switch (uc->kind) {
    929 			case BeamformerUploadKind_ChannelMapping:{
    930 				tex_1d            = cs->channel_mapping_texture;
    931 				tex_type          = GL_SHORT;
    932 				tex_format        = GL_RED_INTEGER;
    933 				tex_element_count = countof(sm->channel_mapping);
    934 				cs->cuda_lib.set_channel_mapping(sm->channel_mapping);
    935 			}break;
    936 			case BeamformerUploadKind_FocalVectors:{
    937 				tex_1d            = cs->focal_vectors_texture;
    938 				tex_type          = GL_FLOAT;
    939 				tex_format        = GL_RG;
    940 				tex_element_count = countof(sm->focal_vectors);
    941 			}break;
    942 			case BeamformerUploadKind_SparseElements:{
    943 				tex_1d            = cs->sparse_elements_texture;
    944 				tex_type          = GL_SHORT;
    945 				tex_format        = GL_RED_INTEGER;
    946 				tex_element_count = countof(sm->sparse_elements);
    947 			}break;
    948 			case BeamformerUploadKind_RFData:{
    949 				if (cs->rf_raw_size != uc->size ||
    950 				    !uv4_equal(cs->dec_data_dim, uv4_from_u32_array(bp->dec_data_dim)))
    951 				{
    952 					alloc_shader_storage(ctx, uc->size, arena);
    953 				}
    954 				buffer = cs->raw_data_ssbo;
    955 
    956 				ComputeTimingInfo info = {0};
    957 				info.kind = ComputeTimingInfoKind_RF_Data;
    958 				/* TODO(rnp): this could stall. what should we do about it? */
    959 				glGetQueryObjectui64v(cs->rf_data_timestamp_query, GL_QUERY_RESULT, &info.timer_count);
    960 				glQueryCounter(cs->rf_data_timestamp_query, GL_TIMESTAMP);
    961 				push_compute_timing_info(ctx->compute_timing_table, info);
    962 			}break;
    963 			InvalidDefaultCase;
    964 			}
    965 
    966 			if (tex_1d) {
    967 				glTextureSubImage1D(tex_1d, 0, 0, tex_element_count, tex_format,
    968 				                    tex_type, (u8 *)sm + uc->shared_memory_offset);
    969 			}
    970 
    971 			if (buffer) {
    972 				glNamedBufferSubData(buffer, 0, (i32)uc->size,
    973 				                     (u8 *)sm + uc->shared_memory_offset);
    974 			}
    975 
    976 			atomic_and_u32(&sm->dirty_regions, ~(sm->dirty_regions & 1 << (work->lock - 1)));
    977 			os_shared_memory_region_unlock(&ctx->shared_memory, sm->locks, (i32)work->lock);
    978 		}break;
    979 		case BeamformerWorkKind_ComputeIndirect:{
    980 			fill_frame_compute_work(ctx, work, work->compute_indirect_plane);
    981 			DEBUG_DECL(work->kind = BeamformerWorkKind_ComputeIndirect;)
    982 		} /* FALLTHROUGH */
    983 		case BeamformerWorkKind_Compute:{
    984 			DEBUG_DECL(glClearNamedBufferData(cs->rf_data_ssbos[0], GL_RG32F, GL_RG, GL_FLOAT, 0);)
    985 			DEBUG_DECL(glClearNamedBufferData(cs->rf_data_ssbos[1], GL_RG32F, GL_RG, GL_FLOAT, 0);)
    986 			DEBUG_DECL(glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);)
    987 
    988 			push_compute_timing_info(ctx->compute_timing_table,
    989 			                         (ComputeTimingInfo){.kind = ComputeTimingInfoKind_ComputeFrameBegin});
    990 
    991 			BeamformerComputePipeline *cp = &cs->compute_pipeline;
    992 			u32 mask = (1 << (BeamformerSharedMemoryLockKind_Parameters - 1)) |
    993 			           (1 << (BeamformerSharedMemoryLockKind_ComputePipeline - 1));
    994 			if (sm->dirty_regions & mask) {
    995 				plan_compute_pipeline(&ctx->shared_memory, cp, cs->filters);
    996 				atomic_store_u32(&ctx->ui_read_params, ctx->beamform_work_queue != q);
    997 				atomic_and_u32(&sm->dirty_regions, ~mask);
    998 
    999 				#define X(k, t, v) glNamedBufferSubData(cp->ubos[BeamformerComputeUBOKind_##k], \
   1000 				                                        0, sizeof(t), &cp->v ## _ubo_data);
   1001 				BEAMFORMER_COMPUTE_UBO_LIST
   1002 				#undef X
   1003 			}
   1004 
   1005 			post_sync_barrier(&ctx->shared_memory, work->lock, sm->locks);
   1006 
   1007 			atomic_store_u32(&cs->processing_compute, 1);
   1008 			start_renderdoc_capture(gl_context);
   1009 
   1010 			BeamformerFrame *frame = work->frame;
   1011 			iv3 try_dim = make_valid_test_dim(bp->output_points);
   1012 			if (!iv3_equal(try_dim, frame->dim))
   1013 				alloc_beamform_frame(&ctx->gl, frame, try_dim, s8("Beamformed_Data"), arena);
   1014 
   1015 			if (bp->output_points[3] > 1) {
   1016 				if (!iv3_equal(try_dim, ctx->averaged_frames[0].dim)) {
   1017 					alloc_beamform_frame(&ctx->gl, ctx->averaged_frames + 0, try_dim, s8("Averaged Frame"), arena);
   1018 					alloc_beamform_frame(&ctx->gl, ctx->averaged_frames + 1, try_dim, s8("Averaged Frame"), arena);
   1019 				}
   1020 			}
   1021 
   1022 			frame->min_coordinate  = v4_from_f32_array(bp->output_min_coordinate);
   1023 			frame->max_coordinate  = v4_from_f32_array(bp->output_max_coordinate);
   1024 			frame->das_shader_kind = bp->das_shader_id;
   1025 			frame->compound_count  = bp->dec_data_dim[2];
   1026 
   1027 			b32 did_sum_shader = 0;
   1028 			for (i32 i = 0; i < cp->shader_count; i++) {
   1029 				did_sum_shader |= cp->shaders[i] == BeamformerShaderKind_Sum;
   1030 				glBeginQuery(GL_TIME_ELAPSED, cs->shader_timer_ids[i]);
   1031 				do_compute_shader(ctx, arena, frame, cp->shaders[i], cp->shader_parameters + i);
   1032 				glEndQuery(GL_TIME_ELAPSED);
   1033 			}
   1034 
   1035 			/* NOTE(rnp): the first of these blocks until work completes */
   1036 			for (i32 i = 0; i < cp->shader_count; i++) {
   1037 				ComputeTimingInfo info = {0};
   1038 				info.kind   = ComputeTimingInfoKind_Shader;
   1039 				info.shader = cp->shaders[i];
   1040 				glGetQueryObjectui64v(cs->shader_timer_ids[i], GL_QUERY_RESULT, &info.timer_count);
   1041 				push_compute_timing_info(ctx->compute_timing_table, info);
   1042 			}
   1043 			cs->processing_progress = 1;
   1044 
   1045 			frame->ready_to_present = 1;
   1046 			if (did_sum_shader) {
   1047 				u32 aframe_index = (ctx->averaged_frame_index % countof(ctx->averaged_frames));
   1048 				ctx->averaged_frames[aframe_index].view_plane_tag  = frame->view_plane_tag;
   1049 				ctx->averaged_frames[aframe_index].ready_to_present = 1;
   1050 				atomic_add_u32(&ctx->averaged_frame_index, 1);
   1051 				atomic_store_u64((u64 *)&ctx->latest_frame, (u64)(ctx->averaged_frames + aframe_index));
   1052 			} else {
   1053 				atomic_store_u64((u64 *)&ctx->latest_frame, (u64)frame);
   1054 			}
   1055 			cs->processing_compute  = 0;
   1056 
   1057 			push_compute_timing_info(ctx->compute_timing_table,
   1058 			                         (ComputeTimingInfo){.kind = ComputeTimingInfoKind_ComputeFrameEnd});
   1059 
   1060 			end_renderdoc_capture(gl_context);
   1061 		}break;
   1062 		InvalidDefaultCase;
   1063 		}
   1064 
   1065 		if (can_commit) {
   1066 			beamform_work_queue_pop_commit(q);
   1067 			work = beamform_work_queue_pop(q);
   1068 		}
   1069 	}
   1070 }
   1071 
   1072 function void
   1073 coalesce_timing_table(ComputeTimingTable *t, ComputeShaderStats *stats)
   1074 {
   1075 	/* TODO(rnp): we do not currently do anything to handle the potential for a half written
   1076 	 * info item. this could result in garbage entries but they shouldn't really matter */
   1077 
   1078 	u32 target = atomic_load_u32(&t->write_index);
   1079 	u32 stats_index = (stats->latest_frame_index + 1) % countof(stats->table.times);
   1080 
   1081 	static_assert(BeamformerShaderKind_Count + 1 <= 32, "timing coalescence bitfield test");
   1082 	u32 seen_info_test = 0;
   1083 
   1084 	while (t->read_index != target) {
   1085 		ComputeTimingInfo info = t->buffer[t->read_index % countof(t->buffer)];
   1086 		switch (info.kind) {
   1087 		case ComputeTimingInfoKind_ComputeFrameBegin:{
   1088 			assert(t->compute_frame_active == 0);
   1089 			t->compute_frame_active = 1;
   1090 			/* NOTE(rnp): allow multiple instances of same shader to accumulate */
   1091 			mem_clear(stats->table.times[stats_index], 0, sizeof(stats->table.times[stats_index]));
   1092 		}break;
   1093 		case ComputeTimingInfoKind_ComputeFrameEnd:{
   1094 			assert(t->compute_frame_active == 1);
   1095 			t->compute_frame_active = 0;
   1096 			stats->latest_frame_index = stats_index;
   1097 			stats_index = (stats_index + 1) % countof(stats->table.times);
   1098 		}break;
   1099 		case ComputeTimingInfoKind_Shader:{
   1100 			stats->table.times[stats_index][info.shader] += (f32)info.timer_count / 1.0e9f;
   1101 			seen_info_test |= (1u << info.shader);
   1102 		}break;
   1103 		case ComputeTimingInfoKind_RF_Data:{
   1104 			stats->latest_rf_index = (stats->latest_rf_index + 1) % countof(stats->table.rf_time_deltas);
   1105 			f32 delta = (f32)(info.timer_count - stats->last_rf_timer_count) / 1.0e9f;
   1106 			stats->table.rf_time_deltas[stats->latest_rf_index] = delta;
   1107 			stats->last_rf_timer_count = info.timer_count;
   1108 			seen_info_test |= (1 << BeamformerShaderKind_Count);
   1109 		}break;
   1110 		}
   1111 		/* NOTE(rnp): do this at the end so that stats table is always in a consistent state */
   1112 		atomic_add_u32(&t->read_index, 1);
   1113 	}
   1114 
   1115 	if (seen_info_test) {
   1116 		for EachEnumValue(BeamformerShaderKind, shader) {
   1117 			if (seen_info_test & (1 << shader)) {
   1118 				f32 sum = 0;
   1119 				for EachElement(stats->table.times, i)
   1120 					sum += stats->table.times[i][shader];
   1121 				stats->average_times[shader] = sum / countof(stats->table.times);
   1122 			}
   1123 		}
   1124 
   1125 		if (seen_info_test & (1 << BeamformerShaderKind_Count)) {
   1126 			f32 sum = 0;
   1127 			for EachElement(stats->table.rf_time_deltas, i)
   1128 				sum += stats->table.rf_time_deltas[i];
   1129 			stats->rf_time_delta_average = sum / countof(stats->table.rf_time_deltas);
   1130 		}
   1131 	}
   1132 }
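After coalescing, per-shader rolling averages can be read straight out of the stats struct, mirroring the call in beamformer_frame_step below:

	coalesce_timing_table(ctx->compute_timing_table, ctx->compute_shader_stats);
	f32 das_seconds = ctx->compute_shader_stats->average_times[BeamformerShaderKind_DAS];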
   1133 
   1134 DEBUG_EXPORT BEAMFORMER_COMPUTE_SETUP_FN(beamformer_compute_setup)
   1135 {
   1136 	BeamformerCtx             *ctx = (BeamformerCtx *)user_context;
   1137 	BeamformerSharedMemory    *sm  = ctx->shared_memory.region;
   1138 	ComputeShaderCtx          *cs  = &ctx->csctx;
   1139 	BeamformerComputePipeline *cp  = &cs->compute_pipeline;
   1140 
   1141 	glCreateBuffers(countof(cp->ubos), cp->ubos);
   1142 	#define X(k, t, ...) \
   1143 		glNamedBufferStorage(cp->ubos[BeamformerComputeUBOKind_##k], sizeof(t), \
   1144 		                     0, GL_DYNAMIC_STORAGE_BIT); \
   1145 		LABEL_GL_OBJECT(GL_BUFFER, cp->ubos[BeamformerComputeUBOKind_##k], s8(#t));
   1146 
   1147 		BEAMFORMER_COMPUTE_UBO_LIST
   1148 	#undef X
   1149 
   1150 	glCreateTextures(GL_TEXTURE_1D, 1, &cs->channel_mapping_texture);
   1151 	glCreateTextures(GL_TEXTURE_1D, 1, &cs->sparse_elements_texture);
   1152 	glCreateTextures(GL_TEXTURE_1D, 1, &cs->focal_vectors_texture);
   1153 	glTextureStorage1D(cs->channel_mapping_texture, 1, GL_R16I,  ARRAY_COUNT(sm->channel_mapping));
   1154 	glTextureStorage1D(cs->sparse_elements_texture, 1, GL_R16I,  ARRAY_COUNT(sm->sparse_elements));
   1155 	glTextureStorage1D(cs->focal_vectors_texture,   1, GL_RG32F, ARRAY_COUNT(sm->focal_vectors));
   1156 
   1157 	LABEL_GL_OBJECT(GL_TEXTURE, cs->channel_mapping_texture, s8("Channel_Mapping"));
   1158 	LABEL_GL_OBJECT(GL_TEXTURE, cs->focal_vectors_texture,   s8("Focal_Vectors"));
   1159 	LABEL_GL_OBJECT(GL_TEXTURE, cs->sparse_elements_texture, s8("Sparse_Elements"));
   1160 
   1161 	glCreateQueries(GL_TIME_ELAPSED, countof(cs->shader_timer_ids), cs->shader_timer_ids);
   1162 	glCreateQueries(GL_TIMESTAMP, 1, &cs->rf_data_timestamp_query);
   1163 
   1164 	/* NOTE(rnp): start this here so we don't have to worry about it being started or not */
   1165 	glQueryCounter(cs->rf_data_timestamp_query, GL_TIMESTAMP);
   1166 }
   1167 
   1168 DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute)
   1169 {
   1170 	BeamformerCtx *ctx         = (BeamformerCtx *)user_context;
   1171 	BeamformerSharedMemory *sm = ctx->shared_memory.region;
   1172 	complete_queue(ctx, &sm->external_work_queue, arena, gl_context);
   1173 	complete_queue(ctx, ctx->beamform_work_queue, arena, gl_context);
   1174 }
   1175 
   1176 #include "ui.c"
   1177 
   1178 DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
   1179 {
   1180 	dt_for_frame = input->dt;
   1181 
   1182 	if (IsWindowResized()) {
   1183 		ctx->window_size.h = GetScreenHeight();
   1184 		ctx->window_size.w = GetScreenWidth();
   1185 	}
   1186 
   1187 	coalesce_timing_table(ctx->compute_timing_table, ctx->compute_shader_stats);
   1188 
   1189 	if (input->executable_reloaded) {
   1190 		ui_init(ctx, ctx->ui_backing_store);
   1191 		DEBUG_DECL(start_frame_capture = ctx->os.start_frame_capture);
   1192 		DEBUG_DECL(end_frame_capture   = ctx->os.end_frame_capture);
   1193 	}
   1194 
   1195 	BeamformerSharedMemory *sm = ctx->shared_memory.region;
   1196 	if (sm->locks[BeamformerSharedMemoryLockKind_DispatchCompute] && ctx->os.compute_worker.asleep) {
   1197 		if (sm->start_compute_from_main) {
   1198 			BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue);
   1199 			BeamformerViewPlaneTag tag = ctx->latest_frame ? ctx->latest_frame->view_plane_tag : 0;
   1200 			if (fill_frame_compute_work(ctx, work, tag))
   1201 				beamform_work_queue_push_commit(ctx->beamform_work_queue);
   1202 			atomic_store_u32(&sm->start_compute_from_main, 0);
   1203 		}
   1204 		os_wake_waiters(&ctx->os.compute_worker.sync_variable);
   1205 	}
   1206 
   1207 	BeamformerFrame        *frame = ctx->latest_frame;
   1208 	BeamformerViewPlaneTag  tag   = frame? frame->view_plane_tag : 0;
   1209 	draw_ui(ctx, input, frame, tag);
   1210 
   1211 	ctx->frame_view_render_context.updated = 0;
   1212 
   1213 	if (WindowShouldClose())
   1214 		ctx->should_exit = 1;
   1215 }
   1216 
   1217 /* NOTE(rnp): functions defined in these shouldn't be visible to the whole program */
   1218 #if _DEBUG
   1219   #if OS_LINUX
   1220     #include "os_linux.c"
   1221   #elif OS_WINDOWS
   1222     #include "os_win32.c"
   1223   #endif
   1224 #endif