ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | LICENSE

beamformer.c (28961B)


      1 /* See LICENSE for license details. */
      2 /* TODO(rnp):
      3  * [ ]: refactor: BeamformGPUComputeContext
      4  * [ ]: refactor: compute shader timers should be generated based on the pipeline stage limit
      5  * [ ]: reinvestigate ring buffer raw_data_ssbo
      6  *      - to minimize latency the main thread should manage the subbuffer upload so that the
      7  *        compute thread can just keep computing. This way we can keep the compute thread busy
      8  *        with work while we image.
      9  *      - In particular we will potentially need multiple GPUComputeContexts so that we
     10  *        can overwrite one while the other is in use.
     11  *      - make use of glFenceSync to guard buffer uploads
     12  */
     13 
     14 #include "beamformer.h"
     15 #include "beamformer_work_queue.c"
     16 
     17 global f32 dt_for_frame;
     18 global u32 cycle_t;
     19 
     20 #ifndef _DEBUG
     21 #define start_renderdoc_capture(...)
     22 #define end_renderdoc_capture(...)
     23 #else
     24 static renderdoc_start_frame_capture_fn *start_frame_capture;
     25 static renderdoc_end_frame_capture_fn   *end_frame_capture;
     26 #define start_renderdoc_capture(gl) if (start_frame_capture) start_frame_capture(gl, 0)
     27 #define end_renderdoc_capture(gl)   if (end_frame_capture)   end_frame_capture(gl, 0)
     28 #endif
     29 
     30 typedef struct {
     31 	BeamformComputeFrame *frames;
     32 	u32 capacity;
     33 	u32 offset;
     34 	u32 cursor;
     35 	u32 needed_frames;
     36 } ComputeFrameIterator;
     37 
     38 function uv3
     39 make_valid_test_dim(u32 in[3])
     40 {
     41 	uv3 result;
     42 	result.E[0] = MAX(in[0], 1);
     43 	result.E[1] = MAX(in[1], 1);
     44 	result.E[2] = MAX(in[2], 1);
     45 	return result;
     46 }
     47 
     48 function ComputeFrameIterator
     49 compute_frame_iterator(BeamformerCtx *ctx, u32 start_index, u32 needed_frames)
     50 {
     51 	start_index = start_index % ARRAY_COUNT(ctx->beamform_frames);
     52 
     53 	ComputeFrameIterator result;
     54 	result.frames        = ctx->beamform_frames;
     55 	result.offset        = start_index;
     56 	result.capacity      = ARRAY_COUNT(ctx->beamform_frames);
     57 	result.cursor        = 0;
     58 	result.needed_frames = needed_frames;
     59 	return result;
     60 }
     61 
     62 static BeamformComputeFrame *
     63 frame_next(ComputeFrameIterator *bfi)
     64 {
     65 	BeamformComputeFrame *result = 0;
     66 	if (bfi->cursor != bfi->needed_frames) {
     67 		u32 index = (bfi->offset + bfi->cursor++) % bfi->capacity;
     68 		result    = bfi->frames + index;
     69 	}
     70 	return result;
     71 }
     72 
     73 static void
     74 alloc_beamform_frame(GLParams *gp, BeamformFrame *out, ComputeShaderStats *out_stats,
     75                      uv3 out_dim, s8 name, Arena arena)
     76 {
     77 	out->dim.x = MAX(1, round_down_power_of_2(ORONE(out_dim.x)));
     78 	out->dim.y = MAX(1, round_down_power_of_2(ORONE(out_dim.y)));
     79 	out->dim.z = MAX(1, round_down_power_of_2(ORONE(out_dim.z)));
     80 
     81 	if (gp) {
     82 		out->dim.x = MIN(out->dim.x, gp->max_3d_texture_dim);
     83 		out->dim.y = MIN(out->dim.y, gp->max_3d_texture_dim);
     84 		out->dim.z = MIN(out->dim.z, gp->max_3d_texture_dim);
     85 	}
     86 
     87 	/* NOTE: allocate storage for beamformed output data;
     88 	 * this is shared between compute and fragment shaders */
     89 	u32 max_dim = MAX(out->dim.x, MAX(out->dim.y, out->dim.z));
     90 	out->mips   = ctz_u32(max_dim) + 1;
     91 
     92 	Stream label = arena_stream(arena);
     93 	stream_append_s8(&label, name);
     94 	stream_append_byte(&label, '[');
     95 	stream_append_hex_u64(&label, out->id);
     96 	stream_append_byte(&label, ']');
     97 
     98 	glDeleteTextures(1, &out->texture);
     99 	glCreateTextures(GL_TEXTURE_3D, 1, &out->texture);
    100 	glTextureStorage3D(out->texture, out->mips, GL_RG32F, out->dim.x, out->dim.y, out->dim.z);
    101 	LABEL_GL_OBJECT(GL_TEXTURE, out->texture, stream_to_s8(&label));
    102 
    103 	if (out_stats) {
    104 		glDeleteQueries(ARRAY_COUNT(out_stats->timer_ids), out_stats->timer_ids);
    105 		glCreateQueries(GL_TIME_ELAPSED, ARRAY_COUNT(out_stats->timer_ids), out_stats->timer_ids);
    106 	}
    107 }
    108 
    109 function void
    110 alloc_shader_storage(BeamformerCtx *ctx, u32 rf_raw_size, Arena a)
    111 {
    112 	ComputeShaderCtx     *cs = &ctx->csctx;
    113 	BeamformerParameters *bp = &ctx->shared_memory->parameters;
    114 
    115 	cs->dec_data_dim = uv4_from_u32_array(bp->dec_data_dim);
    116 	cs->rf_raw_size  = rf_raw_size;
    117 
    118 	glDeleteBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
    119 	glCreateBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
    120 
    121 	i32 storage_flags = GL_DYNAMIC_STORAGE_BIT;
    122 	glDeleteBuffers(1, &cs->raw_data_ssbo);
    123 	glCreateBuffers(1, &cs->raw_data_ssbo);
    124 	glNamedBufferStorage(cs->raw_data_ssbo, rf_raw_size, 0, storage_flags);
    125 	LABEL_GL_OBJECT(GL_BUFFER, cs->raw_data_ssbo, s8("Raw_RF_SSBO"));
    126 
    127 	iz rf_decoded_size = 2 * sizeof(f32) * cs->dec_data_dim.x * cs->dec_data_dim.y * cs->dec_data_dim.z;
    128 	Stream label = arena_stream(a);
    129 	stream_append_s8(&label, s8("Decoded_RF_SSBO_"));
    130 	u32 s_widx = label.widx;
    131 	for (u32 i = 0; i < ARRAY_COUNT(cs->rf_data_ssbos); i++) {
    132 		glNamedBufferStorage(cs->rf_data_ssbos[i], rf_decoded_size, 0, 0);
    133 		stream_append_u64(&label, i);
    134 		LABEL_GL_OBJECT(GL_BUFFER, cs->rf_data_ssbos[i], stream_to_s8(&label));
    135 		stream_reset(&label, s_widx);
    136 	}
    137 
    138 	/* NOTE(rnp): these are stubs when CUDA isn't supported */
    139 	ctx->cuda_lib.register_cuda_buffers(cs->rf_data_ssbos, ARRAY_COUNT(cs->rf_data_ssbos),
    140 		                            cs->raw_data_ssbo);
    141 	ctx->cuda_lib.init_cuda_configuration(bp->rf_raw_dim, bp->dec_data_dim,
    142 		                              ctx->shared_memory->channel_mapping);
    143 
    144 	u32  order    = cs->dec_data_dim.z;
    145 	i32 *hadamard = make_hadamard_transpose(&a, order);
    146 	if (hadamard) {
    147 		glDeleteTextures(1, &cs->hadamard_texture);
    148 		glCreateTextures(GL_TEXTURE_2D, 1, &cs->hadamard_texture);
    149 		glTextureStorage2D(cs->hadamard_texture, 1, GL_R8I, order, order);
    150 		glTextureSubImage2D(cs->hadamard_texture, 0, 0, 0,  order, order, GL_RED_INTEGER,
    151 		                    GL_INT, hadamard);
    152 		LABEL_GL_OBJECT(GL_TEXTURE, cs->hadamard_texture, s8("Hadamard_Matrix"));
    153 	}
    154 }
    155 
    156 function b32
    157 fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, ImagePlaneTag plane)
    158 {
    159 	b32 result = 0;
    160 	if (work) {
    161 		result = 1;
    162 		u32 frame_id    = atomic_inc(&ctx->next_render_frame_index, 1);
    163 		u32 frame_index = frame_id % ARRAY_COUNT(ctx->beamform_frames);
    164 		work->type      = BW_COMPUTE;
    165 		work->frame     = ctx->beamform_frames + frame_index;
    166 		work->frame->ready_to_present = 0;
    167 		work->frame->frame.id = frame_id;
    168 		work->frame->image_plane_tag = plane;
    169 	}
    170 	return result;
    171 }
    172 
    173 static void
    174 export_frame(BeamformerCtx *ctx, iptr handle, BeamformFrame *frame)
    175 {
    176 	uv3 dim            = frame->dim;
    177 	iz  out_size       = dim.x * dim.y * dim.z * 2 * sizeof(f32);
    178 	ctx->export_buffer = ctx->os.alloc_arena(ctx->export_buffer, out_size);
    179 	glGetTextureImage(frame->texture, 0, GL_RG, GL_FLOAT, out_size, ctx->export_buffer.beg);
    180 	s8 raw = {.len = out_size, .data = ctx->export_buffer.beg};
    181 	if (!ctx->os.write_file(handle, raw))
    182 		ctx->os.write_file(ctx->os.stderr, s8("failed to export frame\n"));
    183 	ctx->os.close(handle);
    184 }
    185 
    186 static void
    187 do_sum_shader(ComputeShaderCtx *cs, u32 *in_textures, u32 in_texture_count, f32 in_scale,
    188               u32 out_texture, uv3 out_data_dim)
    189 {
    190 	/* NOTE: zero output before summing */
    191 	glClearTexImage(out_texture, 0, GL_RED, GL_FLOAT, 0);
    192 	glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT);
    193 
    194 	glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F);
    195 	glUniform1f(CS_SUM_PRESCALE_UNIFORM_LOC, in_scale);
    196 	for (u32 i = 0; i < in_texture_count; i++) {
    197 		glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F);
    198 		glDispatchCompute(ORONE(out_data_dim.x / 32),
    199 		                  ORONE(out_data_dim.y),
    200 		                  ORONE(out_data_dim.z / 32));
    201 		glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
    202 	}
    203 }
    204 
    205 struct compute_cursor {
    206 	iv3 cursor;
    207 	iv3 dispatch;
    208 	iv3 target;
    209 	u32 points_per_dispatch;
    210 	u32 completed_points;
    211 	u32 total_points;
    212 };
    213 
    214 static struct compute_cursor
    215 start_compute_cursor(uv3 dim, u32 max_points)
    216 {
    217 	struct compute_cursor result = {0};
    218 	u32 invocations_per_dispatch = DAS_LOCAL_SIZE_X * DAS_LOCAL_SIZE_Y * DAS_LOCAL_SIZE_Z;
    219 
    220 	result.dispatch.y = MIN(max_points / invocations_per_dispatch, MAX(dim.y / DAS_LOCAL_SIZE_Y, 1));
    221 
    222 	u32 remaining     = max_points / result.dispatch.y;
    223 	result.dispatch.x = MIN(remaining / invocations_per_dispatch, MAX(dim.x / DAS_LOCAL_SIZE_X, 1));
    224 	result.dispatch.z = MIN(remaining / (invocations_per_dispatch * result.dispatch.x),
    225 	                        MAX(dim.z / DAS_LOCAL_SIZE_Z, 1));
    226 
    227 	result.target.x = MAX(dim.x / result.dispatch.x / DAS_LOCAL_SIZE_X, 1);
    228 	result.target.y = MAX(dim.y / result.dispatch.y / DAS_LOCAL_SIZE_Y, 1);
    229 	result.target.z = MAX(dim.z / result.dispatch.z / DAS_LOCAL_SIZE_Z, 1);
    230 
    231 	result.points_per_dispatch = 1;
    232 	result.points_per_dispatch *= result.dispatch.x * DAS_LOCAL_SIZE_X;
    233 	result.points_per_dispatch *= result.dispatch.y * DAS_LOCAL_SIZE_Y;
    234 	result.points_per_dispatch *= result.dispatch.z * DAS_LOCAL_SIZE_Z;
    235 
    236 	result.total_points = dim.x * dim.y * dim.z;
    237 
    238 	return result;
    239 }
    240 
    241 static iv3
    242 step_compute_cursor(struct compute_cursor *cursor)
    243 {
    244 	cursor->cursor.x += 1;
    245 	if (cursor->cursor.x >= cursor->target.x) {
    246 		cursor->cursor.x  = 0;
    247 		cursor->cursor.y += 1;
    248 		if (cursor->cursor.y >= cursor->target.y) {
    249 			cursor->cursor.y  = 0;
    250 			cursor->cursor.z += 1;
    251 		}
    252 	}
    253 
    254 	cursor->completed_points += cursor->points_per_dispatch;
    255 
    256 	iv3 result = cursor->cursor;
    257 	result.x *= cursor->dispatch.x * DAS_LOCAL_SIZE_X;
    258 	result.y *= cursor->dispatch.y * DAS_LOCAL_SIZE_Y;
    259 	result.z *= cursor->dispatch.z * DAS_LOCAL_SIZE_Z;
    260 
    261 	return result;
    262 }
    263 
    264 static b32
    265 compute_cursor_finished(struct compute_cursor *cursor)
    266 {
    267 	b32 result = cursor->completed_points >= cursor->total_points;
    268 	return result;
    269 }
    270 
    271 static void
    272 do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, ComputeShaderID shader)
    273 {
    274 	ComputeShaderCtx *csctx = &ctx->csctx;
    275 
    276 	glUseProgram(csctx->programs[shader]);
    277 
    278 	u32 output_ssbo_idx = !csctx->last_output_ssbo_index;
    279 	u32 input_ssbo_idx  = csctx->last_output_ssbo_index;
    280 
    281 	switch (shader) {
    282 	case CS_DECODE:
    283 	case CS_DECODE_FLOAT:
    284 	case CS_DECODE_FLOAT_COMPLEX:
    285 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo);
    286 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
    287 		glBindImageTexture(0, csctx->hadamard_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I);
    288 		glBindImageTexture(1, csctx->channel_mapping_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I);
    289 		glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32),
    290 		                  ORONE(csctx->dec_data_dim.y / 32),
    291 		                  ORONE(csctx->dec_data_dim.z));
    292 		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
    293 		break;
    294 	case CS_CUDA_DECODE:
    295 		ctx->cuda_lib.cuda_decode(0, output_ssbo_idx, 0);
    296 		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
    297 		break;
    298 	case CS_CUDA_HILBERT:
    299 		ctx->cuda_lib.cuda_hilbert(input_ssbo_idx, output_ssbo_idx);
    300 		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
    301 		break;
    302 	case CS_DEMOD:
    303 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
    304 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
    305 		glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32),
    306 		                  ORONE(csctx->dec_data_dim.y / 32),
    307 		                  ORONE(csctx->dec_data_dim.z));
    308 		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
    309 		break;
    310 	case CS_MIN_MAX: {
    311 		u32 texture = frame->frame.texture;
    312 		for (u32 i = 1; i < frame->frame.mips; i++) {
    313 			glBindImageTexture(0, texture, i - 1, GL_TRUE, 0, GL_READ_ONLY,  GL_RG32F);
    314 			glBindImageTexture(1, texture, i - 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
    315 			glUniform1i(CS_MIN_MAX_MIPS_LEVEL_UNIFORM_LOC, i);
    316 
    317 			u32 width  = frame->frame.dim.x >> i;
    318 			u32 height = frame->frame.dim.y >> i;
    319 			u32 depth  = frame->frame.dim.z >> i;
    320 			glDispatchCompute(ORONE(width / 32), ORONE(height), ORONE(depth / 32));
    321 			glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
    322 		}
    323 	} break;
    324 	case CS_DAS: {
    325 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
    326 		glBindImageTexture(0, frame->frame.texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
    327 		glBindImageTexture(1, csctx->sparse_elements_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I);
    328 		glBindImageTexture(2, csctx->focal_vectors_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RG32F);
    329 
    330 		glUniform1ui(DAS_CYCLE_T_UNIFORM_LOC, cycle_t++);
    331 
    332 		#if 1
    333 		/* TODO(rnp): compute max_points_per_dispatch based on something like a
    334 		 * transmit_count * channel_count product */
    335 		u32 max_points_per_dispatch = KB(64);
    336 		struct compute_cursor cursor = start_compute_cursor(frame->frame.dim, max_points_per_dispatch);
    337 		f32 percent_per_step = (f32)cursor.points_per_dispatch / (f32)cursor.total_points;
    338 		csctx->processing_progress = -percent_per_step;
    339 		for (iv3 offset = {0};
    340 		     !compute_cursor_finished(&cursor);
    341 		     offset = step_compute_cursor(&cursor))
    342 		{
    343 			csctx->processing_progress += percent_per_step;
    344 			/* IMPORTANT(rnp): prevents OS from coalescing and killing our shader */
    345 			glFinish();
    346 			glUniform3iv(DAS_VOXEL_OFFSET_UNIFORM_LOC, 1, offset.E);
    347 			glDispatchCompute(cursor.dispatch.x, cursor.dispatch.y, cursor.dispatch.z);
    348 		}
    349 		#else
    350 		/* NOTE(rnp): use this for testing tiling code. The performance of the above path
    351 		 * should be the same as this path if everything is working correctly */
    352 		iv3 compute_dim_offset = {0};
    353 		glUniform3iv(csctx->voxel_offset_id, 1, compute_dim_offset.E);
    354 		glDispatchCompute(ORONE(frame->frame.dim.x / 32),
    355 		                  ORONE(frame->frame.dim.y),
    356 		                  ORONE(frame->frame.dim.z / 32));
    357 		#endif
    358 		glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT|GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
    359 	} break;
    360 	case CS_SUM: {
    361 		u32 aframe_index = ctx->averaged_frame_index % ARRAY_COUNT(ctx->averaged_frames);
    362 		BeamformComputeFrame *aframe = ctx->averaged_frames + aframe_index;
    363 		aframe->ready_to_present     = 0;
    364 		aframe->frame.id             = ctx->averaged_frame_index;
    365 		/* TODO(rnp): hack we need a better way of specifying which frames to sum;
    366 		 * this is fine for rolling averaging but what if we want to do something else */
    367 		ASSERT(frame >= ctx->beamform_frames);
    368 		ASSERT(frame < ctx->beamform_frames + ARRAY_COUNT(ctx->beamform_frames));
    369 		u32 base_index   = (u32)(frame - ctx->beamform_frames);
    370 		u32 to_average   = ctx->shared_memory->parameters.output_points[3];
    371 		u32 frame_count  = 0;
    372 		u32 *in_textures = push_array(&arena, u32, MAX_BEAMFORMED_SAVED_FRAMES);
    373 		ComputeFrameIterator cfi = compute_frame_iterator(ctx, 1 + base_index - to_average,
    374 		                                                  to_average);
    375 		for (BeamformComputeFrame *it = frame_next(&cfi); it; it = frame_next(&cfi))
    376 			in_textures[frame_count++] = it->frame.texture;
    377 
    378 		ASSERT(to_average == frame_count);
    379 
    380 		do_sum_shader(csctx, in_textures, frame_count, 1 / (f32)frame_count,
    381 		              aframe->frame.texture, aframe->frame.dim);
    382 		aframe->frame.min_coordinate = frame->frame.min_coordinate;
    383 		aframe->frame.max_coordinate = frame->frame.max_coordinate;
    384 		aframe->frame.compound_count = frame->frame.compound_count;
    385 		aframe->frame.das_shader_id  = frame->frame.das_shader_id;
    386 	} break;
    387 	default: ASSERT(0);
    388 	}
    389 }
    390 
    391 function s8
    392 push_compute_shader_header(Arena *a, b32 parameters, ComputeShaderID shader)
    393 {
    394 	Stream sb = arena_stream(*a);
    395 
    396 	stream_append_s8(&sb, s8("#version 460 core\n\n"));
    397 
    398 	#define X(name, type, size, gltype, glsize, comment) "\t" #gltype " " #name #glsize "; " comment "\n"
    399 	if (parameters) {
    400 		stream_append_s8(&sb, s8("layout(std140, binding = 0) uniform parameters {\n"
    401 		                         BEAMFORMER_PARAMS_HEAD
    402 		                         BEAMFORMER_UI_PARAMS
    403 		                         BEAMFORMER_PARAMS_TAIL
    404 		                         "};\n\n"));
    405 	}
    406 	#undef X
    407 
    408 	switch (shader) {
    409 	case CS_DAS: {
    410 		#define X(type, id, pretty, fixed_tx) "#define DAS_ID_" #type " " #id "\n"
    411 		stream_append_s8(&sb, s8(""
    412 		"layout(local_size_x = " str(DAS_LOCAL_SIZE_X) ", "
    413 		       "local_size_y = " str(DAS_LOCAL_SIZE_Y) ", "
    414 		       "local_size_z = " str(DAS_LOCAL_SIZE_Z) ") in;\n\n"
    415 		"layout(location = " str(DAS_VOXEL_OFFSET_UNIFORM_LOC) ") uniform ivec3 u_voxel_offset;\n"
    416 		"layout(location = " str(DAS_CYCLE_T_UNIFORM_LOC)      ") uniform uint  u_cycle_t;\n\n"
    417 		DAS_TYPES
    418 		));
    419 		#undef X
    420 	} break;
    421 	case CS_DECODE_FLOAT:
    422 	case CS_DECODE_FLOAT_COMPLEX: {
    423 		if (shader == CS_DECODE_FLOAT) stream_append_s8(&sb, s8("#define INPUT_DATA_TYPE_FLOAT\n\n"));
    424 		else                           stream_append_s8(&sb, s8("#define INPUT_DATA_TYPE_FLOAT_COMPLEX\n\n"));
    425 	} /* FALLTHROUGH */
    426 	case CS_DECODE: {
    427 		#define X(type, id, pretty) stream_append_s8(&sb, s8("#define DECODE_MODE_" #type " " #id "\n"));
    428 		DECODE_TYPES
    429 		#undef X
    430 	} break;
    431 	case CS_MIN_MAX: {
    432 		stream_append_s8(&sb, s8("layout(location = " str(CS_MIN_MAX_MIPS_LEVEL_UNIFORM_LOC)
    433 		                         ") uniform int u_mip_map;\n\n"));
    434 	} break;
    435 	case CS_SUM: {
    436 		stream_append_s8(&sb, s8("layout(location = " str(CS_SUM_PRESCALE_UNIFORM_LOC)
    437 		                         ") uniform float u_sum_prescale = 1.0;\n\n"));
    438 	} break;
    439 	default: break;
    440 	}
    441 	stream_append_s8(&sb, s8("\n#line 1\n"));
    442 	return arena_stream_commit(a, &sb);
    443 }
    444 
    445 static b32
    446 reload_compute_shader(BeamformerCtx *ctx, s8 path, s8 extra, ComputeShaderReloadContext *csr, Arena tmp)
    447 {
    448 	ComputeShaderCtx *cs = &ctx->csctx;
    449 	b32 result = 0;
    450 
    451 	/* NOTE: arena works as stack (since everything here is 1 byte aligned) */
    452 	s8 header      = push_compute_shader_header(&tmp, csr->needs_header, csr->shader);
    453 	s8 shader_text = ctx->os.read_whole_file(&tmp, (c8 *)path.data);
    454 	shader_text.data -= header.len;
    455 	shader_text.len  += header.len;
    456 
    457 	if (shader_text.data == header.data) {
    458 		Stream sb = arena_stream(tmp);
    459 		stream_append_s8s(&sb, path, extra);
    460 		s8 info = arena_stream_commit(&tmp, &sb);
    461 		u32 new_program = load_shader(&ctx->os, tmp, 1, s8(""), s8(""), shader_text,
    462 		                              info, csr->label);
    463 		if (new_program) {
    464 			glDeleteProgram(cs->programs[csr->shader]);
    465 			cs->programs[csr->shader] = new_program;
    466 			glUseProgram(cs->programs[csr->shader]);
    467 			glBindBufferBase(GL_UNIFORM_BUFFER, 0, cs->shared_ubo);
    468 		}
    469 	} else {
    470 		Stream sb = arena_stream(tmp);
    471 		stream_append_s8s(&sb, s8("failed to load: "), path, extra, s8("\n"));
    472 		ctx->os.write_file(ctx->os.stderr, stream_to_s8(&sb));
    473 	}
    474 
    475 	return result;
    476 }
    477 
    478 static void
    479 complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_context, iz barrier_offset)
    480 {
    481 	ComputeShaderCtx       *cs = &ctx->csctx;
    482 	BeamformerParameters   *bp = &ctx->shared_memory->parameters;
    483 	BeamformerSharedMemory *sm = ctx->shared_memory;
    484 
    485 	BeamformWork *work = beamform_work_queue_pop(q);
    486 	while (work) {
    487 		b32 can_commit = 1;
    488 		switch (work->type) {
    489 		case BW_RELOAD_SHADER: {
    490 			ComputeShaderReloadContext *csr = work->reload_shader_ctx;
    491 			b32 success = reload_compute_shader(ctx, csr->path, s8(""), csr, arena);
    492 			if (csr->shader == CS_DECODE) {
    493 				/* TODO(rnp): think of a better way of doing this */
    494 				csr->shader = CS_DECODE_FLOAT_COMPLEX;
    495 				success &= reload_compute_shader(ctx, csr->path, s8(" (F32C)"), csr, arena);
    496 				csr->shader = CS_DECODE_FLOAT;
    497 				success &= reload_compute_shader(ctx, csr->path, s8(" (F32)"),  csr, arena);
    498 				csr->shader = CS_DECODE;
    499 			}
    500 
    501 			if (success) {
    502 				/* TODO(rnp): this check seems off */
    503 				if (ctx->csctx.raw_data_ssbo) {
    504 					can_commit = 0;
    505 					ImagePlaneTag plane = ctx->beamform_frames[ctx->display_frame_index].image_plane_tag;
    506 					fill_frame_compute_work(ctx, work, plane);
    507 				}
    508 			}
    509 		} break;
    510 		case BW_UPLOAD_BUFFER: {
    511 			ASSERT(!atomic_load((i32 *)(barrier_offset + work->completion_barrier)));
    512 			BeamformerUploadContext *uc = &work->upload_context;
    513 			u32 tex_type, tex_format, tex_element_count, tex_1d = 0, buffer = 0;
    514 			switch (uc->kind) {
    515 			case BU_KIND_CHANNEL_MAPPING: {
    516 				tex_1d            = cs->channel_mapping_texture;
    517 				tex_type          = GL_SHORT;
    518 				tex_format        = GL_RED_INTEGER;
    519 				tex_element_count = ARRAY_COUNT(sm->channel_mapping);
    520 			} break;
    521 			case BU_KIND_FOCAL_VECTORS: {
    522 				tex_1d            = cs->focal_vectors_texture;
    523 				tex_type          = GL_FLOAT;
    524 				tex_format        = GL_RG;
    525 				tex_element_count = ARRAY_COUNT(sm->focal_vectors);
    526 			} break;
    527 			case BU_KIND_SPARSE_ELEMENTS: {
    528 				tex_1d            = cs->sparse_elements_texture;
    529 				tex_type          = GL_SHORT;
    530 				tex_format        = GL_RED_INTEGER;
    531 				tex_element_count = ARRAY_COUNT(sm->sparse_elements);
    532 			} break;
    533 			case BU_KIND_PARAMETERS: {
    534 				ctx->ui_read_params = barrier_offset != 0;
    535 				buffer = cs->shared_ubo;
    536 			} break;
    537 			case BU_KIND_RF_DATA: {
    538 				if (cs->rf_raw_size != uc->size ||
    539 				    !uv4_equal(cs->dec_data_dim, uv4_from_u32_array(bp->dec_data_dim)))
    540 				{
    541 					alloc_shader_storage(ctx, uc->size, arena);
    542 				}
    543 				buffer = cs->raw_data_ssbo;
    544 			} break;
    545 			default: INVALID_CODE_PATH; break;
    546 			}
    547 
    548 			if (tex_1d) {
    549 				glTextureSubImage1D(tex_1d, 0, 0, tex_element_count, tex_format,
    550 				                    tex_type, (u8 *)sm + uc->shared_memory_offset);
    551 			}
    552 
    553 			if (buffer) {
    554 				glNamedBufferSubData(buffer, 0, uc->size,
    555 				                     (u8 *)sm + uc->shared_memory_offset);
    556 			}
    557 		} break;
    558 		case BW_COMPUTE: {
    559 			atomic_store(&cs->processing_compute, 1);
    560 			start_renderdoc_capture(gl_context);
    561 
    562 			BeamformComputeFrame *frame = work->frame;
    563 			uv3 try_dim = make_valid_test_dim(bp->output_points);
    564 			if (!uv3_equal(try_dim, frame->frame.dim))
    565 				alloc_beamform_frame(&ctx->gl, &frame->frame, &frame->stats, try_dim,
    566 				                     s8("Beamformed_Data"), arena);
    567 
    568 			if (bp->output_points[3] > 1) {
    569 				if (!uv3_equal(try_dim, ctx->averaged_frames[0].frame.dim)) {
    570 					alloc_beamform_frame(&ctx->gl, &ctx->averaged_frames[0].frame,
    571 					                     &ctx->averaged_frames[0].stats,
    572 					                     try_dim, s8("Averaged Frame"), arena);
    573 					alloc_beamform_frame(&ctx->gl, &ctx->averaged_frames[1].frame,
    574 					                     &ctx->averaged_frames[1].stats,
    575 					                     try_dim, s8("Averaged Frame"), arena);
    576 				}
    577 			}
    578 
    579 			frame->in_flight = 1;
    580 			frame->frame.min_coordinate = v4_from_f32_array(bp->output_min_coordinate);
    581 			frame->frame.max_coordinate = v4_from_f32_array(bp->output_max_coordinate);
    582 			frame->frame.das_shader_id  = bp->das_shader_id;
    583 			frame->frame.compound_count = bp->dec_data_dim[2];
    584 
    585 			b32 did_sum_shader = 0;
    586 			u32 stage_count = sm->compute_stages_count;
    587 			ComputeShaderID *stages = sm->compute_stages;
    588 			for (u32 i = 0; i < stage_count; i++) {
    589 				did_sum_shader |= stages[i] == CS_SUM;
    590 				frame->stats.timer_active[stages[i]] = 1;
    591 				glBeginQuery(GL_TIME_ELAPSED, frame->stats.timer_ids[stages[i]]);
    592 				do_compute_shader(ctx, arena, frame, stages[i]);
    593 				glEndQuery(GL_TIME_ELAPSED);
    594 			}
    595 			/* NOTE(rnp): block until work completes so that we can record timings */
    596 			glFinish();
    597 			cs->processing_progress = 1;
    598 
    599 			for (u32 i = 0; i < ARRAY_COUNT(frame->stats.timer_ids); i++) {
    600 				u64 ns = 0;
    601 				if (frame->stats.timer_active[i]) {
    602 					glGetQueryObjectui64v(frame->stats.timer_ids[i],
    603 					                      GL_QUERY_RESULT, &ns);
    604 					frame->stats.timer_active[i] = 0;
    605 				}
    606 				frame->stats.times[i] = (f32)ns / 1e9;
    607 			}
    608 
    609 			if (did_sum_shader) {
    610 				u32 aframe_index = (ctx->averaged_frame_index %
    611 				                    ARRAY_COUNT(ctx->averaged_frames));
    612 				ctx->averaged_frames[aframe_index].image_plane_tag  = frame->image_plane_tag;
    613 				ctx->averaged_frames[aframe_index].ready_to_present = 1;
    614 				/* TODO(rnp): not really sure what to do here */
    615 				mem_copy(&ctx->averaged_frames[aframe_index].stats.times,
    616 				         &frame->stats.times, sizeof(frame->stats.times));
    617 				atomic_inc(&ctx->averaged_frame_index, 1);
    618 			}
    619 			frame->ready_to_present = 1;
    620 			cs->processing_compute  = 0;
    621 
    622 			end_renderdoc_capture(gl_context);
    623 		} break;
    624 		case BW_SAVE_FRAME: {
    625 			BeamformComputeFrame *frame = work->output_frame_ctx.frame;
    626 			if (frame->ready_to_present) {
    627 				export_frame(ctx, work->output_frame_ctx.file_handle, &frame->frame);
    628 			} else {
    629 				/* TODO(rnp): should we handle this? */
    630 				INVALID_CODE_PATH;
    631 			}
    632 		} break;
    633 		default: INVALID_CODE_PATH; break;
    634 		}
    635 
    636 		if (can_commit) {
    637 			if (work->completion_barrier) {
    638 				i32 *value = (i32 *)(barrier_offset + work->completion_barrier);
    639 				ctx->os.wake_waiters(value);
    640 			}
    641 			beamform_work_queue_pop_commit(q);
    642 			work = beamform_work_queue_pop(q);
    643 		}
    644 	}
    645 }
    646 
    647 DEBUG_EXPORT BEAMFORMER_COMPUTE_SETUP_FN(beamformer_compute_setup)
    648 {
    649 	BeamformerCtx          *ctx = (BeamformerCtx *)user_context;
    650 	BeamformerSharedMemory *sm  = ctx->shared_memory;
    651 	ComputeShaderCtx       *cs  = &ctx->csctx;
    652 
    653 	glCreateBuffers(1, &cs->shared_ubo);
    654 	glNamedBufferStorage(cs->shared_ubo, sizeof(sm->parameters), 0, GL_DYNAMIC_STORAGE_BIT);
    655 
    656 	glCreateTextures(GL_TEXTURE_1D, 1, &cs->channel_mapping_texture);
    657 	glCreateTextures(GL_TEXTURE_1D, 1, &cs->sparse_elements_texture);
    658 	glCreateTextures(GL_TEXTURE_1D, 1, &cs->focal_vectors_texture);
    659 	glTextureStorage1D(cs->channel_mapping_texture, 1, GL_R16I,  ARRAY_COUNT(sm->channel_mapping));
    660 	glTextureStorage1D(cs->sparse_elements_texture, 1, GL_R16I,  ARRAY_COUNT(sm->sparse_elements));
    661 	glTextureStorage1D(cs->focal_vectors_texture,   1, GL_RG32F, ARRAY_COUNT(sm->focal_vectors));
    662 
    663 	LABEL_GL_OBJECT(GL_TEXTURE, cs->channel_mapping_texture, s8("Channel_Mapping"));
    664 	LABEL_GL_OBJECT(GL_TEXTURE, cs->focal_vectors_texture,   s8("Focal_Vectors"));
    665 	LABEL_GL_OBJECT(GL_TEXTURE, cs->sparse_elements_texture, s8("Sparse_Elements"));
    666 	LABEL_GL_OBJECT(GL_BUFFER,  cs->shared_ubo,              s8("Beamformer_Parameters"));
    667 }
    668 
    669 DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute)
    670 {
    671 	BeamformerCtx *ctx = (BeamformerCtx *)user_context;
    672 	complete_queue(ctx, &ctx->shared_memory->external_work_queue, arena, gl_context, (iz)ctx->shared_memory);
    673 	complete_queue(ctx, ctx->beamform_work_queue, arena, gl_context, 0);
    674 }
    675 
    676 #include "ui.c"
    677 
    678 DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
    679 {
    680 	dt_for_frame = GetFrameTime();
    681 
    682 	if (IsWindowResized()) {
    683 		ctx->window_size.h = GetScreenHeight();
    684 		ctx->window_size.w = GetScreenWidth();
    685 	}
    686 
    687 	if (input->executable_reloaded) {
    688 		ui_init(ctx, ctx->ui_backing_store);
    689 		DEBUG_DECL(start_frame_capture = ctx->os.start_frame_capture);
    690 		DEBUG_DECL(end_frame_capture   = ctx->os.end_frame_capture);
    691 	}
    692 
    693 	BeamformerParameters *bp = &ctx->shared_memory->parameters;
    694 	if (ctx->shared_memory->dispatch_compute_sync) {
    695 		ImagePlaneTag current_plane = ctx->shared_memory->current_image_plane;
    696 		atomic_store(&ctx->shared_memory->dispatch_compute_sync, 0);
    697 		BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue);
    698 		if (work) {
    699 			if (fill_frame_compute_work(ctx, work, current_plane))
    700 				beamform_work_queue_push_commit(ctx->beamform_work_queue);
    701 
    702 			if (ctx->shared_memory->export_next_frame) {
    703 				BeamformWork *export = beamform_work_queue_push(ctx->beamform_work_queue);
    704 				if (export) {
    705 					/* TODO: we don't really want the beamformer opening/closing files */
    706 					iptr f = ctx->os.open_for_write(ctx->os.export_pipe_name);
    707 					export->type = BW_SAVE_FRAME;
    708 					export->output_frame_ctx.file_handle = f;
    709 					if (bp->output_points[3] > 1) {
    710 						u32 a_index = !(ctx->averaged_frame_index %
    711 						                ARRAY_COUNT(ctx->averaged_frames));
    712 						BeamformComputeFrame *aframe = ctx->averaged_frames + a_index;
    713 						export->output_frame_ctx.frame = aframe;
    714 					} else {
    715 						export->output_frame_ctx.frame = work->frame;
    716 					}
    717 					beamform_work_queue_push_commit(ctx->beamform_work_queue);
    718 				}
    719 				ctx->shared_memory->export_next_frame = 0;
    720 			}
    721 
    722 			ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
    723 		}
    724 	}
    725 
    726 	if (ctx->start_compute) {
    727 		if (ctx->beamform_frames[ctx->display_frame_index].ready_to_present) {
    728 			BeamformWork *work  = beamform_work_queue_push(ctx->beamform_work_queue);
    729 			ImagePlaneTag plane = ctx->beamform_frames[ctx->display_frame_index].image_plane_tag;
    730 			if (fill_frame_compute_work(ctx, work, plane)) {
    731 				beamform_work_queue_push_commit(ctx->beamform_work_queue);
    732 				ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
    733 				ctx->start_compute = 0;
    734 			}
    735 		}
    736 	}
    737 
    738 	ComputeFrameIterator cfi = compute_frame_iterator(ctx, ctx->display_frame_index,
    739 	                                                  ctx->next_render_frame_index - ctx->display_frame_index);
    740 	for (BeamformComputeFrame *frame = frame_next(&cfi); frame; frame = frame_next(&cfi)) {
    741 		if (frame->in_flight && frame->ready_to_present) {
    742 			frame->in_flight         = 0;
    743 			ctx->display_frame_index = frame - cfi.frames;
    744 		}
    745 	}
    746 
    747 	if (ctx->start_compute) {
    748 		ctx->start_compute = 0;
    749 		ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
    750 	}
    751 
    752 	BeamformComputeFrame *frame_to_draw;
    753 	if (bp->output_points[3] > 1) {
    754 		u32 a_index = !(ctx->averaged_frame_index % ARRAY_COUNT(ctx->averaged_frames));
    755 		frame_to_draw = ctx->averaged_frames + a_index;
    756 	} else {
    757 		frame_to_draw = ctx->beamform_frames + ctx->display_frame_index;
    758 	}
    759 
    760 	draw_ui(ctx, input, frame_to_draw->ready_to_present? &frame_to_draw->frame : 0,
    761 	        frame_to_draw->image_plane_tag, &frame_to_draw->stats);
    762 
    763 	ctx->frame_view_render_context.updated = 0;
    764 
    765 	if (WindowShouldClose())
    766 		ctx->should_exit = 1;
    767 }