ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

beamformer.c (29153B)


      1 /* See LICENSE for license details. */
      2 /* TODO(rnp):
      3  * [ ]: refactor: BeamformGPUComputeContext
      4  * [ ]: refactor: compute shader timers should be generated based on the pipeline stage limit
      5  * [ ]: reinvestigate ring buffer raw_data_ssbo
      6  *      - to minimize latency the main thread should manage the subbuffer upload so that the
      7  *        compute thread can just keep computing. This way we can keep the compute thread busy
      8  *        with work while we image.
      9  *      - In particular we will potentially need multiple GPUComputeContexts so that we
     10  *        can overwrite one while the other is in use.
     11  *      - make use of glFenceSync to guard buffer uploads
     12  * [ ]: BeamformWorkQueue -> BeamformerWorkQueue
     13  */
     14 
     15 #include "beamformer.h"
     16 #include "beamformer_work_queue.c"
     17 
     18 global f32 dt_for_frame;
     19 global u32 cycle_t;
     20 
     21 #ifndef _DEBUG
     22 #define start_renderdoc_capture(...)
     23 #define end_renderdoc_capture(...)
     24 #else
     25 global renderdoc_start_frame_capture_fn *start_frame_capture;
     26 global renderdoc_end_frame_capture_fn   *end_frame_capture;
     27 #define start_renderdoc_capture(gl) if (start_frame_capture) start_frame_capture(gl, 0)
     28 #define end_renderdoc_capture(gl)   if (end_frame_capture)   end_frame_capture(gl, 0)
     29 #endif
     30 
     31 typedef struct {
     32 	BeamformComputeFrame *frames;
     33 	u32 capacity;
     34 	u32 offset;
     35 	u32 cursor;
     36 	u32 needed_frames;
     37 } ComputeFrameIterator;
     38 
     39 function uv3
     40 make_valid_test_dim(u32 in[3])
     41 {
     42 	uv3 result;
     43 	result.E[0] = MAX(in[0], 1);
     44 	result.E[1] = MAX(in[1], 1);
     45 	result.E[2] = MAX(in[2], 1);
     46 	return result;
     47 }
     48 
     49 function ComputeFrameIterator
     50 compute_frame_iterator(BeamformerCtx *ctx, u32 start_index, u32 needed_frames)
     51 {
     52 	start_index = start_index % ARRAY_COUNT(ctx->beamform_frames);
     53 
     54 	ComputeFrameIterator result;
     55 	result.frames        = ctx->beamform_frames;
     56 	result.offset        = start_index;
     57 	result.capacity      = ARRAY_COUNT(ctx->beamform_frames);
     58 	result.cursor        = 0;
     59 	result.needed_frames = needed_frames;
     60 	return result;
     61 }
     62 
     63 function BeamformComputeFrame *
     64 frame_next(ComputeFrameIterator *bfi)
     65 {
     66 	BeamformComputeFrame *result = 0;
     67 	if (bfi->cursor != bfi->needed_frames) {
     68 		u32 index = (bfi->offset + bfi->cursor++) % bfi->capacity;
     69 		result    = bfi->frames + index;
     70 	}
     71 	return result;
     72 }
     73 
     74 function void
     75 alloc_beamform_frame(GLParams *gp, BeamformFrame *out, ComputeShaderStats *out_stats,
     76                      uv3 out_dim, s8 name, Arena arena)
     77 {
     78 	out->dim.x = MAX(1, round_down_power_of_2(ORONE(out_dim.x)));
     79 	out->dim.y = MAX(1, round_down_power_of_2(ORONE(out_dim.y)));
     80 	out->dim.z = MAX(1, round_down_power_of_2(ORONE(out_dim.z)));
     81 
     82 	if (gp) {
     83 		out->dim.x = MIN(out->dim.x, gp->max_3d_texture_dim);
     84 		out->dim.y = MIN(out->dim.y, gp->max_3d_texture_dim);
     85 		out->dim.z = MIN(out->dim.z, gp->max_3d_texture_dim);
     86 	}
     87 
     88 	/* NOTE: allocate storage for beamformed output data;
     89 	 * this is shared between compute and fragment shaders */
     90 	u32 max_dim = MAX(out->dim.x, MAX(out->dim.y, out->dim.z));
     91 	out->mips   = ctz_u32(max_dim) + 1;
     92 
     93 	Stream label = arena_stream(arena);
     94 	stream_append_s8(&label, name);
     95 	stream_append_byte(&label, '[');
     96 	stream_append_hex_u64(&label, out->id);
     97 	stream_append_byte(&label, ']');
     98 
     99 	glDeleteTextures(1, &out->texture);
    100 	glCreateTextures(GL_TEXTURE_3D, 1, &out->texture);
    101 	glTextureStorage3D(out->texture, out->mips, GL_RG32F, out->dim.x, out->dim.y, out->dim.z);
    102 	LABEL_GL_OBJECT(GL_TEXTURE, out->texture, stream_to_s8(&label));
    103 
    104 	if (out_stats) {
    105 		glDeleteQueries(ARRAY_COUNT(out_stats->timer_ids), out_stats->timer_ids);
    106 		glCreateQueries(GL_TIME_ELAPSED, ARRAY_COUNT(out_stats->timer_ids), out_stats->timer_ids);
    107 	}
    108 }
    109 
    110 function void
    111 alloc_shader_storage(BeamformerCtx *ctx, u32 rf_raw_size, Arena a)
    112 {
    113 	ComputeShaderCtx     *cs = &ctx->csctx;
    114 	BeamformerParameters *bp = &ctx->shared_memory->parameters;
    115 
    116 	cs->dec_data_dim = uv4_from_u32_array(bp->dec_data_dim);
    117 	cs->rf_raw_size  = rf_raw_size;
    118 
    119 	glDeleteBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
    120 	glCreateBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
    121 
    122 	i32 storage_flags = GL_DYNAMIC_STORAGE_BIT;
    123 	glDeleteBuffers(1, &cs->raw_data_ssbo);
    124 	glCreateBuffers(1, &cs->raw_data_ssbo);
    125 	glNamedBufferStorage(cs->raw_data_ssbo, rf_raw_size, 0, storage_flags);
    126 	LABEL_GL_OBJECT(GL_BUFFER, cs->raw_data_ssbo, s8("Raw_RF_SSBO"));
    127 
    128 	iz rf_decoded_size = 2 * sizeof(f32) * cs->dec_data_dim.x * cs->dec_data_dim.y * cs->dec_data_dim.z;
    129 	Stream label = arena_stream(a);
    130 	stream_append_s8(&label, s8("Decoded_RF_SSBO_"));
    131 	u32 s_widx = label.widx;
    132 	for (u32 i = 0; i < ARRAY_COUNT(cs->rf_data_ssbos); i++) {
    133 		glNamedBufferStorage(cs->rf_data_ssbos[i], rf_decoded_size, 0, 0);
    134 		stream_append_u64(&label, i);
    135 		LABEL_GL_OBJECT(GL_BUFFER, cs->rf_data_ssbos[i], stream_to_s8(&label));
    136 		stream_reset(&label, s_widx);
    137 	}
    138 
    139 	/* NOTE(rnp): these are stubs when CUDA isn't supported */
    140 	ctx->cuda_lib.register_buffers(cs->rf_data_ssbos, countof(cs->rf_data_ssbos), cs->raw_data_ssbo);
    141 	ctx->cuda_lib.init(bp->rf_raw_dim, bp->dec_data_dim);
    142 
    143 	u32  order    = cs->dec_data_dim.z;
    144 	i32 *hadamard = make_hadamard_transpose(&a, order);
    145 	if (hadamard) {
    146 		glDeleteTextures(1, &cs->hadamard_texture);
    147 		glCreateTextures(GL_TEXTURE_2D, 1, &cs->hadamard_texture);
    148 		glTextureStorage2D(cs->hadamard_texture, 1, GL_R8I, order, order);
    149 		glTextureSubImage2D(cs->hadamard_texture, 0, 0, 0,  order, order, GL_RED_INTEGER,
    150 		                    GL_INT, hadamard);
    151 		LABEL_GL_OBJECT(GL_TEXTURE, cs->hadamard_texture, s8("Hadamard_Matrix"));
    152 	}
    153 }
    154 
    155 function b32
    156 fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, ImagePlaneTag plane)
    157 {
    158 	b32 result = 0;
    159 	if (work) {
    160 		result = 1;
    161 		u32 frame_id    = atomic_inc_u32(&ctx->next_render_frame_index, 1);
    162 		u32 frame_index = frame_id % ARRAY_COUNT(ctx->beamform_frames);
    163 		work->type      = BW_COMPUTE;
    164 		work->frame     = ctx->beamform_frames + frame_index;
    165 		work->frame->ready_to_present = 0;
    166 		work->frame->frame.id = frame_id;
    167 		work->frame->image_plane_tag = plane;
    168 	}
    169 	return result;
    170 }
    171 
    172 function void
    173 export_frame(BeamformerCtx *ctx, iptr handle, BeamformFrame *frame)
    174 {
    175 	uv3 dim            = frame->dim;
    176 	iz  out_size       = dim.x * dim.y * dim.z * 2 * sizeof(f32);
    177 	ctx->export_buffer = ctx->os.alloc_arena(ctx->export_buffer, out_size);
    178 	glGetTextureImage(frame->texture, 0, GL_RG, GL_FLOAT, out_size, ctx->export_buffer.beg);
    179 	s8 raw = {.len = out_size, .data = ctx->export_buffer.beg};
    180 	if (!ctx->os.write_file(handle, raw))
    181 		ctx->os.write_file(ctx->os.error_handle, s8("failed to export frame\n"));
    182 	ctx->os.close(handle);
    183 }
    184 
    185 function void
    186 do_sum_shader(ComputeShaderCtx *cs, u32 *in_textures, u32 in_texture_count, f32 in_scale,
    187               u32 out_texture, uv3 out_data_dim)
    188 {
    189 	/* NOTE: zero output before summing */
    190 	glClearTexImage(out_texture, 0, GL_RED, GL_FLOAT, 0);
    191 	glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT);
    192 
    193 	glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F);
    194 	glUniform1f(CS_SUM_PRESCALE_UNIFORM_LOC, in_scale);
    195 	for (u32 i = 0; i < in_texture_count; i++) {
    196 		glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F);
    197 		glDispatchCompute(ORONE(out_data_dim.x / 32),
    198 		                  ORONE(out_data_dim.y),
    199 		                  ORONE(out_data_dim.z / 32));
    200 		glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
    201 	}
    202 }
    203 
    204 struct compute_cursor {
    205 	iv3 cursor;
    206 	iv3 dispatch;
    207 	iv3 target;
    208 	u32 points_per_dispatch;
    209 	u32 completed_points;
    210 	u32 total_points;
    211 };
    212 
    213 function struct compute_cursor
    214 start_compute_cursor(uv3 dim, u32 max_points)
    215 {
    216 	struct compute_cursor result = {0};
    217 	u32 invocations_per_dispatch = DAS_LOCAL_SIZE_X * DAS_LOCAL_SIZE_Y * DAS_LOCAL_SIZE_Z;
    218 
    219 	result.dispatch.y = MIN(max_points / invocations_per_dispatch, MAX(dim.y / DAS_LOCAL_SIZE_Y, 1));
    220 
    221 	u32 remaining     = max_points / result.dispatch.y;
    222 	result.dispatch.x = MIN(remaining / invocations_per_dispatch, MAX(dim.x / DAS_LOCAL_SIZE_X, 1));
    223 	result.dispatch.z = MIN(remaining / (invocations_per_dispatch * result.dispatch.x),
    224 	                        MAX(dim.z / DAS_LOCAL_SIZE_Z, 1));
    225 
    226 	result.target.x = MAX(dim.x / result.dispatch.x / DAS_LOCAL_SIZE_X, 1);
    227 	result.target.y = MAX(dim.y / result.dispatch.y / DAS_LOCAL_SIZE_Y, 1);
    228 	result.target.z = MAX(dim.z / result.dispatch.z / DAS_LOCAL_SIZE_Z, 1);
    229 
    230 	result.points_per_dispatch = 1;
    231 	result.points_per_dispatch *= result.dispatch.x * DAS_LOCAL_SIZE_X;
    232 	result.points_per_dispatch *= result.dispatch.y * DAS_LOCAL_SIZE_Y;
    233 	result.points_per_dispatch *= result.dispatch.z * DAS_LOCAL_SIZE_Z;
    234 
    235 	result.total_points = dim.x * dim.y * dim.z;
    236 
    237 	return result;
    238 }
    239 
    240 function iv3
    241 step_compute_cursor(struct compute_cursor *cursor)
    242 {
    243 	cursor->cursor.x += 1;
    244 	if (cursor->cursor.x >= cursor->target.x) {
    245 		cursor->cursor.x  = 0;
    246 		cursor->cursor.y += 1;
    247 		if (cursor->cursor.y >= cursor->target.y) {
    248 			cursor->cursor.y  = 0;
    249 			cursor->cursor.z += 1;
    250 		}
    251 	}
    252 
    253 	cursor->completed_points += cursor->points_per_dispatch;
    254 
    255 	iv3 result = cursor->cursor;
    256 	result.x *= cursor->dispatch.x * DAS_LOCAL_SIZE_X;
    257 	result.y *= cursor->dispatch.y * DAS_LOCAL_SIZE_Y;
    258 	result.z *= cursor->dispatch.z * DAS_LOCAL_SIZE_Z;
    259 
    260 	return result;
    261 }
    262 
    263 function b32
    264 compute_cursor_finished(struct compute_cursor *cursor)
    265 {
    266 	b32 result = cursor->completed_points >= cursor->total_points;
    267 	return result;
    268 }
    269 
    270 function void
    271 do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, ShaderKind shader)
    272 {
    273 	ComputeShaderCtx *csctx = &ctx->csctx;
    274 
    275 	glUseProgram(csctx->programs[shader]);
    276 
    277 	u32 output_ssbo_idx = !csctx->last_output_ssbo_index;
    278 	u32 input_ssbo_idx  = csctx->last_output_ssbo_index;
    279 
    280 	switch (shader) {
    281 	case ShaderKind_Decode:
    282 	case ShaderKind_DecodeFloat:
    283 	case ShaderKind_DecodeFloatComplex:{
    284 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo);
    285 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
    286 		glBindImageTexture(0, csctx->hadamard_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I);
    287 		glBindImageTexture(1, csctx->channel_mapping_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I);
    288 		glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32),
    289 		                  ORONE(csctx->dec_data_dim.y / 32),
    290 		                  ORONE(csctx->dec_data_dim.z));
    291 		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
    292 	}break;
    293 	case ShaderKind_CudaDecode:{
    294 		ctx->cuda_lib.decode(0, output_ssbo_idx, 0);
    295 		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
    296 	}break;
    297 	case ShaderKind_CudaHilbert:
    298 		ctx->cuda_lib.hilbert(input_ssbo_idx, output_ssbo_idx);
    299 		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
    300 		break;
    301 	case ShaderKind_Demodulate:{
    302 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
    303 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
    304 		glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32),
    305 		                  ORONE(csctx->dec_data_dim.y / 32),
    306 		                  ORONE(csctx->dec_data_dim.z));
    307 		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
    308 	}break;
    309 	case ShaderKind_MinMax:{
    310 		u32 texture = frame->frame.texture;
    311 		for (u32 i = 1; i < frame->frame.mips; i++) {
    312 			glBindImageTexture(0, texture, i - 1, GL_TRUE, 0, GL_READ_ONLY,  GL_RG32F);
    313 			glBindImageTexture(1, texture, i - 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
    314 			glUniform1i(CS_MIN_MAX_MIPS_LEVEL_UNIFORM_LOC, i);
    315 
    316 			u32 width  = frame->frame.dim.x >> i;
    317 			u32 height = frame->frame.dim.y >> i;
    318 			u32 depth  = frame->frame.dim.z >> i;
    319 			glDispatchCompute(ORONE(width / 32), ORONE(height), ORONE(depth / 32));
    320 			glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
    321 		}
    322 	}break;
    323 	case ShaderKind_DASCompute:{
    324 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
    325 		glBindImageTexture(0, frame->frame.texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
    326 		glBindImageTexture(1, csctx->sparse_elements_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I);
    327 		glBindImageTexture(2, csctx->focal_vectors_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RG32F);
    328 
    329 		glUniform1ui(DAS_CYCLE_T_UNIFORM_LOC, cycle_t++);
    330 
    331 		#if 1
    332 		/* TODO(rnp): compute max_points_per_dispatch based on something like a
    333 		 * transmit_count * channel_count product */
    334 		u32 max_points_per_dispatch = KB(64);
    335 		struct compute_cursor cursor = start_compute_cursor(frame->frame.dim, max_points_per_dispatch);
    336 		f32 percent_per_step = (f32)cursor.points_per_dispatch / (f32)cursor.total_points;
    337 		csctx->processing_progress = -percent_per_step;
    338 		for (iv3 offset = {0};
    339 		     !compute_cursor_finished(&cursor);
    340 		     offset = step_compute_cursor(&cursor))
    341 		{
    342 			csctx->processing_progress += percent_per_step;
    343 			/* IMPORTANT(rnp): prevents OS from coalescing and killing our shader */
    344 			glFinish();
    345 			glUniform3iv(DAS_VOXEL_OFFSET_UNIFORM_LOC, 1, offset.E);
    346 			glDispatchCompute(cursor.dispatch.x, cursor.dispatch.y, cursor.dispatch.z);
    347 		}
    348 		#else
    349 		/* NOTE(rnp): use this for testing tiling code. The performance of the above path
    350 		 * should be the same as this path if everything is working correctly */
    351 		iv3 compute_dim_offset = {0};
    352 		glUniform3iv(csctx->voxel_offset_id, 1, compute_dim_offset.E);
    353 		glDispatchCompute(ORONE(frame->frame.dim.x / 32),
    354 		                  ORONE(frame->frame.dim.y),
    355 		                  ORONE(frame->frame.dim.z / 32));
    356 		#endif
    357 		glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT|GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
    358 	}break;
    359 	case ShaderKind_Sum:{
    360 		u32 aframe_index = ctx->averaged_frame_index % ARRAY_COUNT(ctx->averaged_frames);
    361 		BeamformComputeFrame *aframe = ctx->averaged_frames + aframe_index;
    362 		aframe->ready_to_present     = 0;
    363 		aframe->frame.id             = ctx->averaged_frame_index;
    364 		/* TODO(rnp): hack we need a better way of specifying which frames to sum;
    365 		 * this is fine for rolling averaging but what if we want to do something else */
    366 		ASSERT(frame >= ctx->beamform_frames);
    367 		ASSERT(frame < ctx->beamform_frames + ARRAY_COUNT(ctx->beamform_frames));
    368 		u32 base_index   = (u32)(frame - ctx->beamform_frames);
    369 		u32 to_average   = ctx->shared_memory->parameters.output_points[3];
    370 		u32 frame_count  = 0;
    371 		u32 *in_textures = push_array(&arena, u32, MAX_BEAMFORMED_SAVED_FRAMES);
    372 		ComputeFrameIterator cfi = compute_frame_iterator(ctx, 1 + base_index - to_average,
    373 		                                                  to_average);
    374 		for (BeamformComputeFrame *it = frame_next(&cfi); it; it = frame_next(&cfi))
    375 			in_textures[frame_count++] = it->frame.texture;
    376 
    377 		ASSERT(to_average == frame_count);
    378 
    379 		do_sum_shader(csctx, in_textures, frame_count, 1 / (f32)frame_count,
    380 		              aframe->frame.texture, aframe->frame.dim);
    381 		aframe->frame.min_coordinate  = frame->frame.min_coordinate;
    382 		aframe->frame.max_coordinate  = frame->frame.max_coordinate;
    383 		aframe->frame.compound_count  = frame->frame.compound_count;
    384 		aframe->frame.das_shader_kind = frame->frame.das_shader_kind;
    385 	}break;
    386 	InvalidDefaultCase;
    387 	}
    388 }
    389 
    390 function s8
    391 shader_text_with_header(ShaderReloadContext *ctx, OS *os, Arena *arena)
    392 {
    393 	Stream sb = arena_stream(*arena);
    394 	stream_append_s8s(&sb, s8("#version 460 core\n\n"), ctx->header);
    395 
    396 	switch (ctx->kind) {
    397 	case ShaderKind_DASCompute:{
    398 		#define X(type, id, pretty, fixed_tx) "#define DAS_ID_" #type " " #id "\n"
    399 		stream_append_s8(&sb, s8(""
    400 		"layout(local_size_x = " str(DAS_LOCAL_SIZE_X) ", "
    401 		       "local_size_y = " str(DAS_LOCAL_SIZE_Y) ", "
    402 		       "local_size_z = " str(DAS_LOCAL_SIZE_Z) ") in;\n\n"
    403 		"layout(location = " str(DAS_VOXEL_OFFSET_UNIFORM_LOC) ") uniform ivec3 u_voxel_offset;\n"
    404 		"layout(location = " str(DAS_CYCLE_T_UNIFORM_LOC)      ") uniform uint  u_cycle_t;\n\n"
    405 		DAS_TYPES
    406 		));
    407 		#undef X
    408 	}break;
    409 	case ShaderKind_DecodeFloat:
    410 	case ShaderKind_DecodeFloatComplex:{
    411 		if (ctx->kind == ShaderKind_DecodeFloat)
    412 			stream_append_s8(&sb, s8("#define INPUT_DATA_TYPE_FLOAT\n\n"));
    413 		else
    414 			stream_append_s8(&sb, s8("#define INPUT_DATA_TYPE_FLOAT_COMPLEX\n\n"));
    415 	} /* FALLTHROUGH */
    416 	case ShaderKind_Decode:{
    417 		#define X(type, id, pretty) stream_append_s8(&sb, s8("#define DECODE_MODE_" #type " " #id "\n"));
    418 		DECODE_TYPES
    419 		#undef X
    420 	}break;
    421 	case ShaderKind_MinMax:{
    422 		stream_append_s8(&sb, s8("layout(location = " str(CS_MIN_MAX_MIPS_LEVEL_UNIFORM_LOC)
    423 		                         ") uniform int u_mip_map;\n\n"));
    424 	}break;
    425 	case ShaderKind_Sum:{
    426 		stream_append_s8(&sb, s8("layout(location = " str(CS_SUM_PRESCALE_UNIFORM_LOC)
    427 		                         ") uniform float u_sum_prescale = 1.0;\n\n"));
    428 	}break;
    429 	default:{}break;
    430 	}
    431 	stream_append_s8(&sb, s8("\n#line 1\n"));
    432 
    433 	s8 result = arena_stream_commit(arena, &sb);
    434 	if (ctx->path.len) {
    435 		s8 file = os->read_whole_file(arena, (c8 *)ctx->path.data);
    436 		assert(file.data == result.data + result.len);
    437 		result.len += file.len;
    438 	}
    439 
    440 	return result;
    441 }
    442 
    443 DEBUG_EXPORT BEAMFORMER_RELOAD_SHADER_FN(beamformer_reload_shader)
    444 {
    445 	i32 shader_count = 1;
    446 	ShaderReloadContext *link = src->link;
    447 	while (link != src) { shader_count++; link = link->link; }
    448 
    449 	s8  *shader_texts = push_array(&arena, s8,  shader_count);
    450 	u32 *shader_types = push_array(&arena, u32, shader_count);
    451 
    452 	i32 index = 0;
    453 	do {
    454 		shader_texts[index] = shader_text_with_header(link, &ctx->os, &arena);
    455 		shader_types[index] = link->gl_type;
    456 		index++;
    457 		link = link->link;
    458 	} while (link != src);
    459 
    460 	u32 new_program = load_shader(&ctx->os, arena, shader_texts, shader_types, shader_count, shader_name);
    461 	if (new_program) {
    462 		glDeleteProgram(*src->shader);
    463 		*src->shader = new_program;
    464 		if (src->kind == ShaderKind_Render2D) ctx->frame_view_render_context.updated = 1;
    465 	}
    466 	return new_program != 0;
    467 }
    468 
    469 function b32
    470 reload_compute_shader(BeamformerCtx *ctx, ShaderReloadContext *src, s8 name_extra, Arena arena)
    471 {
    472 	Stream sb  = arena_stream(arena);
    473 	stream_append_s8s(&sb, src->name, name_extra);
    474 	s8  name   = arena_stream_commit(&arena, &sb);
    475 	b32 result = beamformer_reload_shader(ctx, src, arena, name);
    476 	if (result) {
    477 		glUseProgram(*src->shader);
    478 		glBindBufferBase(GL_UNIFORM_BUFFER, 0, ctx->csctx.shared_ubo);
    479 	}
    480 	return result;
    481 }
    482 
    483 function void
    484 complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_context, iz barrier_offset)
    485 {
    486 	ComputeShaderCtx       *cs = &ctx->csctx;
    487 	BeamformerParameters   *bp = &ctx->shared_memory->parameters;
    488 	BeamformerSharedMemory *sm = ctx->shared_memory;
    489 
    490 	BeamformWork *work = beamform_work_queue_pop(q);
    491 	while (work) {
    492 		b32 can_commit = 1;
    493 		switch (work->type) {
    494 		case BW_RELOAD_SHADER: {
    495 			ShaderReloadContext *src = work->shader_reload_context;
    496 			b32 success = reload_compute_shader(ctx, src, s8(""), arena);
    497 			if (src->kind == ShaderKind_Decode) {
    498 				/* TODO(rnp): think of a better way of doing this */
    499 				src->kind   = ShaderKind_DecodeFloatComplex;
    500 				src->shader = cs->programs + ShaderKind_DecodeFloatComplex;
    501 				success &= reload_compute_shader(ctx, src, s8(" (F32C)"), arena);
    502 				src->kind   = ShaderKind_DecodeFloat;
    503 				src->shader = cs->programs + ShaderKind_DecodeFloat;
    504 				success &= reload_compute_shader(ctx, src, s8(" (F32)"),  arena);
    505 				src->kind   = ShaderKind_Decode;
    506 				src->shader = cs->programs + ShaderKind_Decode;
    507 			}
    508 
    509 			if (success) {
    510 				/* TODO(rnp): this check seems off */
    511 				if (ctx->csctx.raw_data_ssbo) {
    512 					can_commit = 0;
    513 					ImagePlaneTag plane = ctx->beamform_frames[ctx->display_frame_index].image_plane_tag;
    514 					fill_frame_compute_work(ctx, work, plane);
    515 				}
    516 			}
    517 		} break;
    518 		case BW_UPLOAD_BUFFER: {
    519 			ASSERT(!atomic_load((i32 *)(barrier_offset + work->completion_barrier)));
    520 			BeamformerUploadContext *uc = &work->upload_context;
    521 			u32 tex_type, tex_format, tex_element_count, tex_1d = 0, buffer = 0;
    522 			switch (uc->kind) {
    523 			case BU_KIND_CHANNEL_MAPPING: {
    524 				tex_1d            = cs->channel_mapping_texture;
    525 				tex_type          = GL_SHORT;
    526 				tex_format        = GL_RED_INTEGER;
    527 				tex_element_count = ARRAY_COUNT(sm->channel_mapping);
    528 				ctx->cuda_lib.set_channel_mapping(sm->channel_mapping);
    529 			} break;
    530 			case BU_KIND_FOCAL_VECTORS: {
    531 				tex_1d            = cs->focal_vectors_texture;
    532 				tex_type          = GL_FLOAT;
    533 				tex_format        = GL_RG;
    534 				tex_element_count = ARRAY_COUNT(sm->focal_vectors);
    535 			} break;
    536 			case BU_KIND_SPARSE_ELEMENTS: {
    537 				tex_1d            = cs->sparse_elements_texture;
    538 				tex_type          = GL_SHORT;
    539 				tex_format        = GL_RED_INTEGER;
    540 				tex_element_count = ARRAY_COUNT(sm->sparse_elements);
    541 			} break;
    542 			case BU_KIND_PARAMETERS: {
    543 				ctx->ui_read_params = barrier_offset != 0;
    544 				buffer = cs->shared_ubo;
    545 			} break;
    546 			case BU_KIND_RF_DATA: {
    547 				if (cs->rf_raw_size != uc->size ||
    548 				    !uv4_equal(cs->dec_data_dim, uv4_from_u32_array(bp->dec_data_dim)))
    549 				{
    550 					alloc_shader_storage(ctx, uc->size, arena);
    551 				}
    552 				buffer = cs->raw_data_ssbo;
    553 			} break;
    554 			default: INVALID_CODE_PATH; break;
    555 			}
    556 
    557 			if (tex_1d) {
    558 				glTextureSubImage1D(tex_1d, 0, 0, tex_element_count, tex_format,
    559 				                    tex_type, (u8 *)sm + uc->shared_memory_offset);
    560 			}
    561 
    562 			if (buffer) {
    563 				glNamedBufferSubData(buffer, 0, uc->size,
    564 				                     (u8 *)sm + uc->shared_memory_offset);
    565 			}
    566 		} break;
    567 		case BW_COMPUTE: {
    568 			atomic_store(&cs->processing_compute, 1);
    569 			start_renderdoc_capture(gl_context);
    570 
    571 			BeamformComputeFrame *frame = work->frame;
    572 			uv3 try_dim = make_valid_test_dim(bp->output_points);
    573 			if (!uv3_equal(try_dim, frame->frame.dim))
    574 				alloc_beamform_frame(&ctx->gl, &frame->frame, &frame->stats, try_dim,
    575 				                     s8("Beamformed_Data"), arena);
    576 
    577 			if (bp->output_points[3] > 1) {
    578 				if (!uv3_equal(try_dim, ctx->averaged_frames[0].frame.dim)) {
    579 					alloc_beamform_frame(&ctx->gl, &ctx->averaged_frames[0].frame,
    580 					                     &ctx->averaged_frames[0].stats,
    581 					                     try_dim, s8("Averaged Frame"), arena);
    582 					alloc_beamform_frame(&ctx->gl, &ctx->averaged_frames[1].frame,
    583 					                     &ctx->averaged_frames[1].stats,
    584 					                     try_dim, s8("Averaged Frame"), arena);
    585 				}
    586 			}
    587 
    588 			frame->in_flight = 1;
    589 			frame->frame.min_coordinate  = v4_from_f32_array(bp->output_min_coordinate);
    590 			frame->frame.max_coordinate  = v4_from_f32_array(bp->output_max_coordinate);
    591 			frame->frame.das_shader_kind = bp->das_shader_id;
    592 			frame->frame.compound_count  = bp->dec_data_dim[2];
    593 
    594 			b32 did_sum_shader = 0;
    595 			u32 stage_count    = sm->compute_stages_count;
    596 			ComputeShaderKind *stages = sm->compute_stages;
    597 			for (u32 i = 0; i < stage_count; i++) {
    598 				did_sum_shader |= stages[i] == ComputeShaderKind_Sum;
    599 				frame->stats.timer_active[stages[i]] = 1;
    600 				glBeginQuery(GL_TIME_ELAPSED, frame->stats.timer_ids[stages[i]]);
    601 				do_compute_shader(ctx, arena, frame, (ShaderKind)stages[i]);
    602 				glEndQuery(GL_TIME_ELAPSED);
    603 			}
    604 			/* NOTE(rnp): block until work completes so that we can record timings */
    605 			glFinish();
    606 			cs->processing_progress = 1;
    607 
    608 			for (u32 i = 0; i < ARRAY_COUNT(frame->stats.timer_ids); i++) {
    609 				u64 ns = 0;
    610 				if (frame->stats.timer_active[i]) {
    611 					glGetQueryObjectui64v(frame->stats.timer_ids[i],
    612 					                      GL_QUERY_RESULT, &ns);
    613 					frame->stats.timer_active[i] = 0;
    614 				}
    615 				frame->stats.times[i] = (f32)ns / 1e9;
    616 			}
    617 
    618 			if (did_sum_shader) {
    619 				u32 aframe_index = (ctx->averaged_frame_index %
    620 				                    ARRAY_COUNT(ctx->averaged_frames));
    621 				ctx->averaged_frames[aframe_index].image_plane_tag  = frame->image_plane_tag;
    622 				ctx->averaged_frames[aframe_index].ready_to_present = 1;
    623 				/* TODO(rnp): not really sure what to do here */
    624 				mem_copy(&ctx->averaged_frames[aframe_index].stats.times,
    625 				         &frame->stats.times, sizeof(frame->stats.times));
    626 				atomic_inc_u32(&ctx->averaged_frame_index, 1);
    627 			}
    628 			frame->ready_to_present = 1;
    629 			cs->processing_compute  = 0;
    630 
    631 			end_renderdoc_capture(gl_context);
    632 		} break;
    633 		case BW_SAVE_FRAME: {
    634 			BeamformComputeFrame *frame = work->output_frame_ctx.frame;
    635 			if (frame->ready_to_present) {
    636 				export_frame(ctx, work->output_frame_ctx.file_handle, &frame->frame);
    637 			} else {
    638 				/* TODO(rnp): should we handle this? */
    639 				INVALID_CODE_PATH;
    640 			}
    641 		} break;
    642 		default: INVALID_CODE_PATH; break;
    643 		}
    644 
    645 		if (can_commit) {
    646 			if (work->completion_barrier) {
    647 				i32 *value = (i32 *)(barrier_offset + work->completion_barrier);
    648 				ctx->os.wake_waiters(value);
    649 			}
    650 			beamform_work_queue_pop_commit(q);
    651 			work = beamform_work_queue_pop(q);
    652 		}
    653 	}
    654 }
    655 
    656 DEBUG_EXPORT BEAMFORMER_COMPUTE_SETUP_FN(beamformer_compute_setup)
    657 {
    658 	BeamformerCtx          *ctx = (BeamformerCtx *)user_context;
    659 	BeamformerSharedMemory *sm  = ctx->shared_memory;
    660 	ComputeShaderCtx       *cs  = &ctx->csctx;
    661 
    662 	glCreateBuffers(1, &cs->shared_ubo);
    663 	glNamedBufferStorage(cs->shared_ubo, sizeof(sm->parameters), 0, GL_DYNAMIC_STORAGE_BIT);
    664 
    665 	glCreateTextures(GL_TEXTURE_1D, 1, &cs->channel_mapping_texture);
    666 	glCreateTextures(GL_TEXTURE_1D, 1, &cs->sparse_elements_texture);
    667 	glCreateTextures(GL_TEXTURE_1D, 1, &cs->focal_vectors_texture);
    668 	glTextureStorage1D(cs->channel_mapping_texture, 1, GL_R16I,  ARRAY_COUNT(sm->channel_mapping));
    669 	glTextureStorage1D(cs->sparse_elements_texture, 1, GL_R16I,  ARRAY_COUNT(sm->sparse_elements));
    670 	glTextureStorage1D(cs->focal_vectors_texture,   1, GL_RG32F, ARRAY_COUNT(sm->focal_vectors));
    671 
    672 	LABEL_GL_OBJECT(GL_TEXTURE, cs->channel_mapping_texture, s8("Channel_Mapping"));
    673 	LABEL_GL_OBJECT(GL_TEXTURE, cs->focal_vectors_texture,   s8("Focal_Vectors"));
    674 	LABEL_GL_OBJECT(GL_TEXTURE, cs->sparse_elements_texture, s8("Sparse_Elements"));
    675 	LABEL_GL_OBJECT(GL_BUFFER,  cs->shared_ubo,              s8("Beamformer_Parameters"));
    676 }
    677 
    678 DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute)
    679 {
    680 	BeamformerCtx *ctx = (BeamformerCtx *)user_context;
    681 	complete_queue(ctx, &ctx->shared_memory->external_work_queue, arena, gl_context, (iz)ctx->shared_memory);
    682 	complete_queue(ctx, ctx->beamform_work_queue, arena, gl_context, 0);
    683 }
    684 
    685 #include "ui.c"
    686 
    687 DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
    688 {
    689 	dt_for_frame = GetFrameTime();
    690 
    691 	if (IsWindowResized()) {
    692 		ctx->window_size.h = GetScreenHeight();
    693 		ctx->window_size.w = GetScreenWidth();
    694 	}
    695 
    696 	if (input->executable_reloaded) {
    697 		ui_init(ctx, ctx->ui_backing_store);
    698 		DEBUG_DECL(start_frame_capture = ctx->os.start_frame_capture);
    699 		DEBUG_DECL(end_frame_capture   = ctx->os.end_frame_capture);
    700 	}
    701 
    702 	BeamformerParameters *bp = &ctx->shared_memory->parameters;
    703 	if (ctx->shared_memory->dispatch_compute_sync) {
    704 		ImagePlaneTag current_plane = ctx->shared_memory->current_image_plane;
    705 		atomic_store(&ctx->shared_memory->dispatch_compute_sync, 0);
    706 		BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue);
    707 		if (work) {
    708 			if (fill_frame_compute_work(ctx, work, current_plane))
    709 				beamform_work_queue_push_commit(ctx->beamform_work_queue);
    710 
    711 			if (ctx->shared_memory->export_next_frame) {
    712 				BeamformWork *export = beamform_work_queue_push(ctx->beamform_work_queue);
    713 				if (export) {
    714 					/* TODO: we don't really want the beamformer opening/closing files */
    715 					iptr f = ctx->os.open_for_write(ctx->os.export_pipe_name);
    716 					export->type = BW_SAVE_FRAME;
    717 					export->output_frame_ctx.file_handle = f;
    718 					if (bp->output_points[3] > 1) {
    719 						u32 a_index = !(ctx->averaged_frame_index %
    720 						                ARRAY_COUNT(ctx->averaged_frames));
    721 						BeamformComputeFrame *aframe = ctx->averaged_frames + a_index;
    722 						export->output_frame_ctx.frame = aframe;
    723 					} else {
    724 						export->output_frame_ctx.frame = work->frame;
    725 					}
    726 					beamform_work_queue_push_commit(ctx->beamform_work_queue);
    727 				}
    728 				ctx->shared_memory->export_next_frame = 0;
    729 			}
    730 
    731 			ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
    732 		}
    733 	}
    734 
    735 	if (ctx->start_compute) {
    736 		if (ctx->beamform_frames[ctx->display_frame_index].ready_to_present) {
    737 			BeamformWork *work  = beamform_work_queue_push(ctx->beamform_work_queue);
    738 			ImagePlaneTag plane = ctx->beamform_frames[ctx->display_frame_index].image_plane_tag;
    739 			if (fill_frame_compute_work(ctx, work, plane)) {
    740 				beamform_work_queue_push_commit(ctx->beamform_work_queue);
    741 				ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
    742 				ctx->start_compute = 0;
    743 			}
    744 		}
    745 	}
    746 
    747 	ComputeFrameIterator cfi = compute_frame_iterator(ctx, ctx->display_frame_index,
    748 	                                                  ctx->next_render_frame_index - ctx->display_frame_index);
    749 	for (BeamformComputeFrame *frame = frame_next(&cfi); frame; frame = frame_next(&cfi)) {
    750 		if (frame->in_flight && frame->ready_to_present) {
    751 			frame->in_flight         = 0;
    752 			ctx->display_frame_index = frame - cfi.frames;
    753 		}
    754 	}
    755 
    756 	if (ctx->start_compute) {
    757 		ctx->start_compute = 0;
    758 		ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
    759 	}
    760 
    761 	BeamformComputeFrame *frame_to_draw;
    762 	if (bp->output_points[3] > 1) {
    763 		u32 a_index = !(ctx->averaged_frame_index % ARRAY_COUNT(ctx->averaged_frames));
    764 		frame_to_draw = ctx->averaged_frames + a_index;
    765 	} else {
    766 		frame_to_draw = ctx->beamform_frames + ctx->display_frame_index;
    767 	}
    768 
    769 	draw_ui(ctx, input, frame_to_draw->ready_to_present? &frame_to_draw->frame : 0,
    770 	        frame_to_draw->image_plane_tag, &frame_to_draw->stats);
    771 
    772 	ctx->frame_view_render_context.updated = 0;
    773 
    774 	if (WindowShouldClose())
    775 		ctx->should_exit = 1;
    776 }