ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | LICENSE

beamformer.c (29230B)


      1 /* See LICENSE for license details. */
      2 /* TODO(rnp):
      3  * [ ]: refactor: BeamformGPUComputeContext
      4  * [ ]: refactor: compute shader timers should be generated based on the pipeline stage limit
      5  * [ ]: reinvestigate ring buffer raw_data_ssbo
      6  *      - to minimize latency the main thread should manage the subbuffer upload so that the
      7  *        compute thread can just keep computing. This way we can keep the compute thread busy
      8  *        with work while we image.
      9  *      - In particular we will potentially need multiple GPUComputeContexts so that we
     10  *        can overwrite one while the other is in use.
     11  *      - make use of glFenceSync to guard buffer uploads
     12  */
     13 
     14 #include "beamformer.h"
     15 #include "beamformer_work_queue.c"
     16 
     17 global f32 dt_for_frame;
     18 global u32 cycle_t;
     19 
     20 #ifndef _DEBUG
     21 #define start_renderdoc_capture(...)
     22 #define end_renderdoc_capture(...)
     23 #else
     24 static renderdoc_start_frame_capture_fn *start_frame_capture;
     25 static renderdoc_end_frame_capture_fn   *end_frame_capture;
     26 #define start_renderdoc_capture(gl) if (start_frame_capture) start_frame_capture(gl, 0)
     27 #define end_renderdoc_capture(gl)   if (end_frame_capture)   end_frame_capture(gl, 0)
     28 #endif
     29 
     30 typedef struct {
     31 	BeamformComputeFrame *frames;
     32 	u32 capacity;
     33 	u32 offset;
     34 	u32 cursor;
     35 	u32 needed_frames;
     36 } ComputeFrameIterator;
     37 
     38 static uv3
     39 make_valid_test_dim(uv3 in)
     40 {
     41 	uv3 result;
     42 	result.x = MAX(in.x, 1);
     43 	result.y = MAX(in.y, 1);
     44 	result.z = MAX(in.z, 1);
     45 	return result;
     46 }
     47 
     48 static ComputeFrameIterator
     49 compute_frame_iterator(BeamformerCtx *ctx, u32 start_index, u32 needed_frames)
     50 {
     51 	start_index = start_index % ARRAY_COUNT(ctx->beamform_frames);
     52 
     53 	ComputeFrameIterator result;
     54 	result.frames        = ctx->beamform_frames;
     55 	result.offset        = start_index;
     56 	result.capacity      = ARRAY_COUNT(ctx->beamform_frames);
     57 	result.cursor        = 0;
     58 	result.needed_frames = needed_frames;
     59 	return result;
     60 }
     61 
     62 static BeamformComputeFrame *
     63 frame_next(ComputeFrameIterator *bfi)
     64 {
     65 	BeamformComputeFrame *result = 0;
     66 	if (bfi->cursor != bfi->needed_frames) {
     67 		u32 index = (bfi->offset + bfi->cursor++) % bfi->capacity;
     68 		result    = bfi->frames + index;
     69 	}
     70 	return result;
     71 }
     72 
     73 static void
     74 alloc_beamform_frame(GLParams *gp, BeamformFrame *out, ComputeShaderStats *out_stats,
     75                      uv3 out_dim, s8 name, Arena arena)
     76 {
     77 	out->dim.x = MAX(1, round_down_power_of_2(ORONE(out_dim.x)));
     78 	out->dim.y = MAX(1, round_down_power_of_2(ORONE(out_dim.y)));
     79 	out->dim.z = MAX(1, round_down_power_of_2(ORONE(out_dim.z)));
     80 
     81 	if (gp) {
     82 		out->dim.x = MIN(out->dim.x, gp->max_3d_texture_dim);
     83 		out->dim.y = MIN(out->dim.y, gp->max_3d_texture_dim);
     84 		out->dim.z = MIN(out->dim.z, gp->max_3d_texture_dim);
     85 	}
     86 
     87 	/* NOTE: allocate storage for beamformed output data;
     88 	 * this is shared between compute and fragment shaders */
     89 	u32 max_dim = MAX(out->dim.x, MAX(out->dim.y, out->dim.z));
     90 	out->mips   = ctz_u32(max_dim) + 1;
     91 
     92 	Stream label = arena_stream(&arena);
     93 	stream_append_s8(&label, name);
     94 	stream_append_byte(&label, '[');
     95 	stream_append_hex_u64(&label, out->id);
     96 	stream_append_byte(&label, ']');
     97 
     98 	glDeleteTextures(1, &out->texture);
     99 	glCreateTextures(GL_TEXTURE_3D, 1, &out->texture);
    100 	glTextureStorage3D(out->texture, out->mips, GL_RG32F, out->dim.x, out->dim.y, out->dim.z);
    101 	LABEL_GL_OBJECT(GL_TEXTURE, out->texture, stream_to_s8(&label));
    102 
    103 	if (out_stats) {
    104 		glDeleteQueries(ARRAY_COUNT(out_stats->timer_ids), out_stats->timer_ids);
    105 		glCreateQueries(GL_TIME_ELAPSED, ARRAY_COUNT(out_stats->timer_ids), out_stats->timer_ids);
    106 	}
    107 }
    108 
    109 function void
    110 alloc_shader_storage(BeamformerCtx *ctx, u32 rf_raw_size, Arena a)
    111 {
    112 	ComputeShaderCtx     *cs = &ctx->csctx;
    113 	BeamformerParameters *bp = &ctx->shared_memory->parameters;
    114 
    115 	uv4 dec_data_dim = bp->dec_data_dim;
    116 	cs->dec_data_dim = dec_data_dim;
    117 	cs->rf_raw_size  = rf_raw_size;
    118 
    119 	glDeleteBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
    120 	glCreateBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
    121 
    122 	i32 storage_flags = GL_DYNAMIC_STORAGE_BIT;
    123 	glDeleteBuffers(1, &cs->raw_data_ssbo);
    124 	glCreateBuffers(1, &cs->raw_data_ssbo);
    125 	glNamedBufferStorage(cs->raw_data_ssbo, rf_raw_size, 0, storage_flags);
    126 	LABEL_GL_OBJECT(GL_BUFFER, cs->raw_data_ssbo, s8("Raw_RF_SSBO"));
    127 
    128 	iz rf_decoded_size = 2 * sizeof(f32) * cs->dec_data_dim.x * cs->dec_data_dim.y * cs->dec_data_dim.z;
    129 	Stream label = stream_alloc(&a, 256);
    130 	stream_append_s8(&label, s8("Decoded_RF_SSBO_"));
    131 	u32 s_widx = label.widx;
    132 	for (u32 i = 0; i < ARRAY_COUNT(cs->rf_data_ssbos); i++) {
    133 		glNamedBufferStorage(cs->rf_data_ssbos[i], rf_decoded_size, 0, 0);
    134 		stream_append_u64(&label, i);
    135 		s8 rf_label = stream_to_s8(&label);
    136 		LABEL_GL_OBJECT(GL_BUFFER, cs->rf_data_ssbos[i], rf_label);
    137 		stream_reset(&label, s_widx);
    138 	}
    139 
    140 	/* NOTE(rnp): these are stubs when CUDA isn't supported */
    141 	ctx->cuda_lib.register_cuda_buffers(cs->rf_data_ssbos, ARRAY_COUNT(cs->rf_data_ssbos),
    142 		                            cs->raw_data_ssbo);
    143 	ctx->cuda_lib.init_cuda_configuration(bp->rf_raw_dim.E, bp->dec_data_dim.E,
    144 		                              ctx->shared_memory->channel_mapping);
    145 
    146 	/* NOTE: store hadamard in GPU once; it won't change for a particular imaging session */
    147 	iz   hadamard_elements = dec_data_dim.z * dec_data_dim.z;
    148 	i32  *hadamard         = alloc(&a, i32, hadamard_elements);
    149 	i32  *tmp              = alloc(&a, i32, hadamard_elements);
    150 	fill_hadamard_transpose(hadamard, tmp, dec_data_dim.z);
    151 	glDeleteTextures(1, &cs->hadamard_texture);
    152 	glCreateTextures(GL_TEXTURE_2D, 1, &cs->hadamard_texture);
    153 	glTextureStorage2D(cs->hadamard_texture, 1, GL_R8I, dec_data_dim.z, dec_data_dim.z);
    154 	glTextureSubImage2D(cs->hadamard_texture, 0, 0, 0, dec_data_dim.z, dec_data_dim.z,
    155 	                    GL_RED_INTEGER, GL_INT, hadamard);
    156 	LABEL_GL_OBJECT(GL_TEXTURE, cs->hadamard_texture, s8("Hadamard_Matrix"));
    157 }
    158 
    159 static b32
    160 fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, ImagePlaneTag plane)
    161 {
    162 	b32 result = 0;
    163 	if (work) {
    164 		result = 1;
    165 		u32 frame_id    = atomic_inc(&ctx->next_render_frame_index, 1);
    166 		u32 frame_index = frame_id % ARRAY_COUNT(ctx->beamform_frames);
    167 		work->type      = BW_COMPUTE;
    168 		work->frame     = ctx->beamform_frames + frame_index;
    169 		work->frame->ready_to_present = 0;
    170 		work->frame->frame.id = frame_id;
    171 		work->frame->image_plane_tag = plane;
    172 	}
    173 	return result;
    174 }
    175 
    176 static void
    177 export_frame(BeamformerCtx *ctx, iptr handle, BeamformFrame *frame)
    178 {
    179 	uv3 dim            = frame->dim;
    180 	iz  out_size       = dim.x * dim.y * dim.z * 2 * sizeof(f32);
    181 	ctx->export_buffer = ctx->os.alloc_arena(ctx->export_buffer, out_size);
    182 	glGetTextureImage(frame->texture, 0, GL_RG, GL_FLOAT, out_size, ctx->export_buffer.beg);
    183 	s8 raw = {.len = out_size, .data = ctx->export_buffer.beg};
    184 	if (!ctx->os.write_file(handle, raw))
    185 		ctx->os.write_file(ctx->os.stderr, s8("failed to export frame\n"));
    186 	ctx->os.close(handle);
    187 }
    188 
    189 static void
    190 do_sum_shader(ComputeShaderCtx *cs, u32 *in_textures, u32 in_texture_count, f32 in_scale,
    191               u32 out_texture, uv3 out_data_dim)
    192 {
    193 	/* NOTE: zero output before summing */
    194 	glClearTexImage(out_texture, 0, GL_RED, GL_FLOAT, 0);
    195 	glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT);
    196 
    197 	glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F);
    198 	glUniform1f(CS_SUM_PRESCALE_UNIFORM_LOC, in_scale);
    199 	for (u32 i = 0; i < in_texture_count; i++) {
    200 		glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F);
    201 		glDispatchCompute(ORONE(out_data_dim.x / 32),
    202 		                  ORONE(out_data_dim.y),
    203 		                  ORONE(out_data_dim.z / 32));
    204 		glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
    205 	}
    206 }
    207 
    208 struct compute_cursor {
    209 	iv3 cursor;
    210 	iv3 dispatch;
    211 	iv3 target;
    212 	u32 points_per_dispatch;
    213 	u32 completed_points;
    214 	u32 total_points;
    215 };
    216 
    217 static struct compute_cursor
    218 start_compute_cursor(uv3 dim, u32 max_points)
    219 {
    220 	struct compute_cursor result = {0};
    221 	u32 invocations_per_dispatch = DAS_LOCAL_SIZE_X * DAS_LOCAL_SIZE_Y * DAS_LOCAL_SIZE_Z;
    222 
    223 	result.dispatch.y = MIN(max_points / invocations_per_dispatch, MAX(dim.y / DAS_LOCAL_SIZE_Y, 1));
    224 
    225 	u32 remaining     = max_points / result.dispatch.y;
    226 	result.dispatch.x = MIN(remaining / invocations_per_dispatch, MAX(dim.x / DAS_LOCAL_SIZE_X, 1));
    227 	result.dispatch.z = MIN(remaining / (invocations_per_dispatch * result.dispatch.x),
    228 	                        MAX(dim.z / DAS_LOCAL_SIZE_Z, 1));
    229 
    230 	result.target.x = MAX(dim.x / result.dispatch.x / DAS_LOCAL_SIZE_X, 1);
    231 	result.target.y = MAX(dim.y / result.dispatch.y / DAS_LOCAL_SIZE_Y, 1);
    232 	result.target.z = MAX(dim.z / result.dispatch.z / DAS_LOCAL_SIZE_Z, 1);
    233 
    234 	result.points_per_dispatch = 1;
    235 	result.points_per_dispatch *= result.dispatch.x * DAS_LOCAL_SIZE_X;
    236 	result.points_per_dispatch *= result.dispatch.y * DAS_LOCAL_SIZE_Y;
    237 	result.points_per_dispatch *= result.dispatch.z * DAS_LOCAL_SIZE_Z;
    238 
    239 	result.total_points = dim.x * dim.y * dim.z;
    240 
    241 	return result;
    242 }
    243 
    244 static iv3
    245 step_compute_cursor(struct compute_cursor *cursor)
    246 {
    247 	cursor->cursor.x += 1;
    248 	if (cursor->cursor.x >= cursor->target.x) {
    249 		cursor->cursor.x  = 0;
    250 		cursor->cursor.y += 1;
    251 		if (cursor->cursor.y >= cursor->target.y) {
    252 			cursor->cursor.y  = 0;
    253 			cursor->cursor.z += 1;
    254 		}
    255 	}
    256 
    257 	cursor->completed_points += cursor->points_per_dispatch;
    258 
    259 	iv3 result = cursor->cursor;
    260 	result.x *= cursor->dispatch.x * DAS_LOCAL_SIZE_X;
    261 	result.y *= cursor->dispatch.y * DAS_LOCAL_SIZE_Y;
    262 	result.z *= cursor->dispatch.z * DAS_LOCAL_SIZE_Z;
    263 
    264 	return result;
    265 }
    266 
    267 static b32
    268 compute_cursor_finished(struct compute_cursor *cursor)
    269 {
    270 	b32 result = cursor->completed_points >= cursor->total_points;
    271 	return result;
    272 }
    273 
    274 static void
    275 do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, ComputeShaderID shader)
    276 {
    277 	ComputeShaderCtx *csctx = &ctx->csctx;
    278 
    279 	glUseProgram(csctx->programs[shader]);
    280 
    281 	u32 output_ssbo_idx = !csctx->last_output_ssbo_index;
    282 	u32 input_ssbo_idx  = csctx->last_output_ssbo_index;
    283 
    284 	switch (shader) {
    285 	case CS_DECODE:
    286 	case CS_DECODE_FLOAT:
    287 	case CS_DECODE_FLOAT_COMPLEX:
    288 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo);
    289 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
    290 		glBindImageTexture(0, csctx->hadamard_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I);
    291 		glBindImageTexture(1, csctx->channel_mapping_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I);
    292 		glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32),
    293 		                  ORONE(csctx->dec_data_dim.y / 32),
    294 		                  ORONE(csctx->dec_data_dim.z));
    295 		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
    296 		break;
    297 	case CS_CUDA_DECODE:
    298 		ctx->cuda_lib.cuda_decode(0, output_ssbo_idx, 0);
    299 		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
    300 		break;
    301 	case CS_CUDA_HILBERT:
    302 		ctx->cuda_lib.cuda_hilbert(input_ssbo_idx, output_ssbo_idx);
    303 		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
    304 		break;
    305 	case CS_DEMOD:
    306 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
    307 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
    308 		glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32),
    309 		                  ORONE(csctx->dec_data_dim.y / 32),
    310 		                  ORONE(csctx->dec_data_dim.z));
    311 		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
    312 		break;
    313 	case CS_MIN_MAX: {
    314 		u32 texture = frame->frame.texture;
    315 		for (u32 i = 1; i < frame->frame.mips; i++) {
    316 			glBindImageTexture(0, texture, i - 1, GL_TRUE, 0, GL_READ_ONLY,  GL_RG32F);
    317 			glBindImageTexture(1, texture, i - 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
    318 			glUniform1i(CS_MIN_MAX_MIPS_LEVEL_UNIFORM_LOC, i);
    319 
    320 			u32 width  = frame->frame.dim.x >> i;
    321 			u32 height = frame->frame.dim.y >> i;
    322 			u32 depth  = frame->frame.dim.z >> i;
    323 			glDispatchCompute(ORONE(width / 32), ORONE(height), ORONE(depth / 32));
    324 			glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
    325 		}
    326 	} break;
    327 	case CS_DAS: {
    328 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
    329 		glBindImageTexture(0, frame->frame.texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
    330 		glBindImageTexture(1, csctx->sparse_elements_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I);
    331 		glBindImageTexture(2, csctx->focal_vectors_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RG32F);
    332 
    333 		glUniform1ui(DAS_CYCLE_T_UNIFORM_LOC, cycle_t++);
    334 
    335 		#if 1
    336 		/* TODO(rnp): compute max_points_per_dispatch based on something like a
    337 		 * transmit_count * channel_count product */
    338 		u32 max_points_per_dispatch = KB(64);
    339 		struct compute_cursor cursor = start_compute_cursor(frame->frame.dim, max_points_per_dispatch);
    340 		f32 percent_per_step = (f32)cursor.points_per_dispatch / (f32)cursor.total_points;
    341 		csctx->processing_progress = -percent_per_step;
    342 		for (iv3 offset = {0};
    343 		     !compute_cursor_finished(&cursor);
    344 		     offset = step_compute_cursor(&cursor))
    345 		{
    346 			csctx->processing_progress += percent_per_step;
    347 			/* IMPORTANT(rnp): prevents OS from coalescing and killing our shader */
    348 			glFinish();
    349 			glUniform3iv(DAS_VOXEL_OFFSET_UNIFORM_LOC, 1, offset.E);
    350 			glDispatchCompute(cursor.dispatch.x, cursor.dispatch.y, cursor.dispatch.z);
    351 		}
    352 		#else
    353 		/* NOTE(rnp): use this for testing tiling code. The performance of the above path
    354 		 * should be the same as this path if everything is working correctly */
    355 		iv3 compute_dim_offset = {0};
    356 		glUniform3iv(csctx->voxel_offset_id, 1, compute_dim_offset.E);
    357 		glDispatchCompute(ORONE(frame->frame.dim.x / 32),
    358 		                  ORONE(frame->frame.dim.y),
    359 		                  ORONE(frame->frame.dim.z / 32));
    360 		#endif
    361 		glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT|GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
    362 	} break;
    363 	case CS_SUM: {
    364 		u32 aframe_index = ctx->averaged_frame_index % ARRAY_COUNT(ctx->averaged_frames);
    365 		BeamformComputeFrame *aframe = ctx->averaged_frames + aframe_index;
    366 		aframe->ready_to_present     = 0;
    367 		aframe->frame.id             = ctx->averaged_frame_index;
    368 		/* TODO(rnp): hack we need a better way of specifying which frames to sum;
    369 		 * this is fine for rolling averaging but what if we want to do something else */
    370 		ASSERT(frame >= ctx->beamform_frames);
    371 		ASSERT(frame < ctx->beamform_frames + ARRAY_COUNT(ctx->beamform_frames));
    372 		u32 base_index   = (u32)(frame - ctx->beamform_frames);
    373 		u32 to_average   = ctx->shared_memory->parameters.output_points.w;
    374 		u32 frame_count  = 0;
    375 		u32 *in_textures = alloc(&arena, u32, MAX_BEAMFORMED_SAVED_FRAMES);
    376 		ComputeFrameIterator cfi = compute_frame_iterator(ctx, 1 + base_index - to_average,
    377 		                                                  to_average);
    378 		for (BeamformComputeFrame *it = frame_next(&cfi); it; it = frame_next(&cfi))
    379 			in_textures[frame_count++] = it->frame.texture;
    380 
    381 		ASSERT(to_average == frame_count);
    382 
    383 		do_sum_shader(csctx, in_textures, frame_count, 1 / (f32)frame_count,
    384 		              aframe->frame.texture, aframe->frame.dim);
    385 		aframe->frame.min_coordinate = frame->frame.min_coordinate;
    386 		aframe->frame.max_coordinate = frame->frame.max_coordinate;
    387 		aframe->frame.compound_count = frame->frame.compound_count;
    388 		aframe->frame.das_shader_id  = frame->frame.das_shader_id;
    389 	} break;
    390 	default: ASSERT(0);
    391 	}
    392 }
    393 
    394 function s8
    395 push_compute_shader_header(Arena *a, b32 parameters, ComputeShaderID shader)
    396 {
    397 	s8 result = {.data = a->beg};
    398 
    399 	push_s8(a, s8("#version 460 core\n\n"));
    400 
    401 	#define X(name, type, size, gltype, glsize, comment) "\t" #gltype " " #name #glsize "; " comment "\n"
    402 	if (parameters) {
    403 		push_s8(a, s8("layout(std140, binding = 0) uniform parameters {\n"
    404 		              BEAMFORMER_PARAMS_HEAD
    405 		              BEAMFORMER_UI_PARAMS
    406 		              BEAMFORMER_PARAMS_TAIL
    407 		              "};\n\n"));
    408 	}
    409 	#undef X
    410 
    411 	switch (shader) {
    412 	case CS_DAS: {
    413 		push_s8(a, s8("layout("
    414 		              "local_size_x = " str(DAS_LOCAL_SIZE_X) ", "
    415 		              "local_size_y = " str(DAS_LOCAL_SIZE_Y) ", "
    416 		              "local_size_z = " str(DAS_LOCAL_SIZE_Z) ") "
    417 		              "in;\n\n"));
    418 
    419 		push_s8(a, s8("layout(location = " str(DAS_VOXEL_OFFSET_UNIFORM_LOC) ") uniform ivec3 u_voxel_offset;\n"));
    420 		push_s8(a, s8("layout(location = " str(DAS_CYCLE_T_UNIFORM_LOC)      ") uniform uint  u_cycle_t;\n\n"));
    421 		#define X(type, id, pretty, fixed_tx) push_s8(a, s8("#define DAS_ID_" #type " " #id "\n"));
    422 		DAS_TYPES
    423 		#undef X
    424 	} break;
    425 	case CS_DECODE_FLOAT:
    426 	case CS_DECODE_FLOAT_COMPLEX: {
    427 		if (shader == CS_DECODE_FLOAT) push_s8(a, s8("#define INPUT_DATA_TYPE_FLOAT\n\n"));
    428 		else                           push_s8(a, s8("#define INPUT_DATA_TYPE_FLOAT_COMPLEX\n\n"));
    429 	} /* FALLTHROUGH */
    430 	case CS_DECODE: {
    431 		#define X(type, id, pretty) push_s8(a, s8("#define DECODE_MODE_" #type " " #id "\n"));
    432 		DECODE_TYPES
    433 		#undef X
    434 	} break;
    435 	case CS_MIN_MAX: {
    436 		push_s8(a, s8("layout(location = " str(CS_MIN_MAX_MIPS_LEVEL_UNIFORM_LOC)
    437 		              ") uniform int u_mip_map;\n\n"));
    438 	} break;
    439 	case CS_SUM: {
    440 		push_s8(a, s8("layout(location = " str(CS_SUM_PRESCALE_UNIFORM_LOC)
    441 		              ") uniform float u_sum_prescale = 1.0;\n\n"));
    442 	} break;
    443 	default: break;
    444 	}
    445 	s8 end = push_s8(a, s8("\n#line 1\n"));
    446 	result.len = end.data + end.len - result.data;
    447 	return result;
    448 }
    449 
    450 static b32
    451 reload_compute_shader(BeamformerCtx *ctx, s8 path, s8 extra, ComputeShaderReloadContext *csr, Arena tmp)
    452 {
    453 	ComputeShaderCtx *cs = &ctx->csctx;
    454 	b32 result = 0;
    455 
    456 	/* NOTE: arena works as stack (since everything here is 1 byte aligned) */
    457 	s8 header      = push_compute_shader_header(&tmp, csr->needs_header, csr->shader);
    458 	s8 shader_text = ctx->os.read_whole_file(&tmp, (c8 *)path.data);
    459 	shader_text.data -= header.len;
    460 	shader_text.len  += header.len;
    461 
    462 	if (shader_text.data == header.data) {
    463 		s8 info = {.data = tmp.beg};
    464 		push_s8(&tmp, path);
    465 		push_s8(&tmp, extra);
    466 		info.len = tmp.beg - info.data;
    467 		u32 new_program = load_shader(&ctx->os, tmp, 1, (s8){0}, (s8){0}, shader_text,
    468 		                              info, csr->label);
    469 		if (new_program) {
    470 			glDeleteProgram(cs->programs[csr->shader]);
    471 			cs->programs[csr->shader] = new_program;
    472 			glUseProgram(cs->programs[csr->shader]);
    473 			glBindBufferBase(GL_UNIFORM_BUFFER, 0, cs->shared_ubo);
    474 		}
    475 	} else {
    476 		Stream buf = arena_stream(&tmp);
    477 		stream_append_s8(&buf, s8("failed to load: "));
    478 		stream_append_s8(&buf, path);
    479 		stream_append_s8(&buf, extra);
    480 		stream_append_byte(&buf, '\n');
    481 		ctx->os.write_file(ctx->os.stderr, stream_to_s8(&buf));
    482 	}
    483 
    484 	return result;
    485 }
    486 
    487 static void
    488 complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_context, iz barrier_offset)
    489 {
    490 	ComputeShaderCtx       *cs = &ctx->csctx;
    491 	BeamformerParameters   *bp = &ctx->shared_memory->parameters;
    492 	BeamformerSharedMemory *sm = ctx->shared_memory;
    493 
    494 	BeamformWork *work = beamform_work_queue_pop(q);
    495 	while (work) {
    496 		b32 can_commit = 1;
    497 		switch (work->type) {
    498 		case BW_RELOAD_SHADER: {
    499 			ComputeShaderReloadContext *csr = work->reload_shader_ctx;
    500 			b32 success = reload_compute_shader(ctx, csr->path, s8(""), csr, arena);
    501 			if (csr->shader == CS_DECODE) {
    502 				/* TODO(rnp): think of a better way of doing this */
    503 				csr->shader = CS_DECODE_FLOAT_COMPLEX;
    504 				success &= reload_compute_shader(ctx, csr->path, s8(" (F32C)"), csr, arena);
    505 				csr->shader = CS_DECODE_FLOAT;
    506 				success &= reload_compute_shader(ctx, csr->path, s8(" (F32)"),  csr, arena);
    507 				csr->shader = CS_DECODE;
    508 			}
    509 
    510 			if (success) {
    511 				/* TODO(rnp): this check seems off */
    512 				if (ctx->csctx.raw_data_ssbo) {
    513 					can_commit = 0;
    514 					ImagePlaneTag plane = ctx->beamform_frames[ctx->display_frame_index].image_plane_tag;
    515 					fill_frame_compute_work(ctx, work, plane);
    516 				}
    517 			}
    518 		} break;
    519 		case BW_UPLOAD_BUFFER: {
    520 			ASSERT(!atomic_load((i32 *)(barrier_offset + work->completion_barrier)));
    521 			BeamformerUploadContext *uc = &work->upload_context;
    522 			u32 tex_type, tex_format, tex_element_count, tex_1d = 0, buffer = 0;
    523 			switch (uc->kind) {
    524 			case BU_KIND_CHANNEL_MAPPING: {
    525 				tex_1d            = cs->channel_mapping_texture;
    526 				tex_type          = GL_SHORT;
    527 				tex_format        = GL_RED_INTEGER;
    528 				tex_element_count = ARRAY_COUNT(sm->channel_mapping);
    529 			} break;
    530 			case BU_KIND_FOCAL_VECTORS: {
    531 				tex_1d            = cs->focal_vectors_texture;
    532 				tex_type          = GL_FLOAT;
    533 				tex_format        = GL_RG;
    534 				tex_element_count = ARRAY_COUNT(sm->focal_vectors);
    535 			} break;
    536 			case BU_KIND_SPARSE_ELEMENTS: {
    537 				tex_1d            = cs->sparse_elements_texture;
    538 				tex_type          = GL_SHORT;
    539 				tex_format        = GL_RED_INTEGER;
    540 				tex_element_count = ARRAY_COUNT(sm->sparse_elements);
    541 			} break;
    542 			case BU_KIND_PARAMETERS: {
    543 				ctx->ui_read_params = barrier_offset != 0;
    544 				buffer = cs->shared_ubo;
    545 			} break;
    546 			case BU_KIND_RF_DATA: {
    547 				if (cs->rf_raw_size != uc->size ||
    548 				    !uv4_equal(cs->dec_data_dim, bp->dec_data_dim))
    549 				{
    550 					alloc_shader_storage(ctx, uc->size, arena);
    551 				}
    552 				buffer = cs->raw_data_ssbo;
    553 			} break;
    554 			default: INVALID_CODE_PATH; break;
    555 			}
    556 
    557 			if (tex_1d) {
    558 				glTextureSubImage1D(tex_1d, 0, 0, tex_element_count, tex_format,
    559 				                    tex_type, (u8 *)sm + uc->shared_memory_offset);
    560 			}
    561 
    562 			if (buffer) {
    563 				glNamedBufferSubData(buffer, 0, uc->size,
    564 				                     (u8 *)sm + uc->shared_memory_offset);
    565 			}
    566 		} break;
    567 		case BW_COMPUTE: {
    568 			atomic_store(&cs->processing_compute, 1);
    569 			start_renderdoc_capture(gl_context);
    570 
    571 			BeamformComputeFrame *frame = work->frame;
    572 			uv3 try_dim = make_valid_test_dim(bp->output_points.xyz);
    573 			if (!uv3_equal(try_dim, frame->frame.dim))
    574 				alloc_beamform_frame(&ctx->gl, &frame->frame, &frame->stats, try_dim,
    575 				                     s8("Beamformed_Data"), arena);
    576 
    577 			if (bp->output_points.w > 1) {
    578 				if (!uv3_equal(try_dim, ctx->averaged_frames[0].frame.dim)) {
    579 					alloc_beamform_frame(&ctx->gl, &ctx->averaged_frames[0].frame,
    580 					                     &ctx->averaged_frames[0].stats,
    581 					                     try_dim, s8("Averaged Frame"), arena);
    582 					alloc_beamform_frame(&ctx->gl, &ctx->averaged_frames[1].frame,
    583 					                     &ctx->averaged_frames[1].stats,
    584 					                     try_dim, s8("Averaged Frame"), arena);
    585 				}
    586 			}
    587 
    588 			frame->in_flight = 1;
    589 			frame->frame.min_coordinate = bp->output_min_coordinate;
    590 			frame->frame.max_coordinate = bp->output_max_coordinate;
    591 			frame->frame.das_shader_id  = bp->das_shader_id;
    592 			frame->frame.compound_count = bp->dec_data_dim.z;
    593 
    594 			b32 did_sum_shader = 0;
    595 			u32 stage_count = sm->compute_stages_count;
    596 			ComputeShaderID *stages = sm->compute_stages;
    597 			for (u32 i = 0; i < stage_count; i++) {
    598 				did_sum_shader |= stages[i] == CS_SUM;
    599 				frame->stats.timer_active[stages[i]] = 1;
    600 				glBeginQuery(GL_TIME_ELAPSED, frame->stats.timer_ids[stages[i]]);
    601 				do_compute_shader(ctx, arena, frame, stages[i]);
    602 				glEndQuery(GL_TIME_ELAPSED);
    603 			}
    604 			/* NOTE(rnp): block until work completes so that we can record timings */
    605 			glFinish();
    606 			cs->processing_progress = 1;
    607 
    608 			for (u32 i = 0; i < ARRAY_COUNT(frame->stats.timer_ids); i++) {
    609 				u64 ns = 0;
    610 				if (frame->stats.timer_active[i]) {
    611 					glGetQueryObjectui64v(frame->stats.timer_ids[i],
    612 					                      GL_QUERY_RESULT, &ns);
    613 					frame->stats.timer_active[i] = 0;
    614 				}
    615 				frame->stats.times[i] = (f32)ns / 1e9;
    616 			}
    617 
    618 			if (did_sum_shader) {
    619 				u32 aframe_index = (ctx->averaged_frame_index %
    620 				                    ARRAY_COUNT(ctx->averaged_frames));
    621 				ctx->averaged_frames[aframe_index].image_plane_tag  = frame->image_plane_tag;
    622 				ctx->averaged_frames[aframe_index].ready_to_present = 1;
    623 				/* TODO(rnp): not really sure what to do here */
    624 				mem_copy(&ctx->averaged_frames[aframe_index].stats.times,
    625 				         &frame->stats.times, sizeof(frame->stats.times));
    626 				atomic_inc(&ctx->averaged_frame_index, 1);
    627 			}
    628 			frame->ready_to_present = 1;
    629 			cs->processing_compute  = 0;
    630 
    631 			end_renderdoc_capture(gl_context);
    632 		} break;
    633 		case BW_SAVE_FRAME: {
    634 			BeamformComputeFrame *frame = work->output_frame_ctx.frame;
    635 			if (frame->ready_to_present) {
    636 				export_frame(ctx, work->output_frame_ctx.file_handle, &frame->frame);
    637 			} else {
    638 				/* TODO(rnp): should we handle this? */
    639 				INVALID_CODE_PATH;
    640 			}
    641 		} break;
    642 		default: INVALID_CODE_PATH; break;
    643 		}
    644 
    645 		if (can_commit) {
    646 			if (work->completion_barrier) {
    647 				i32 *value = (i32 *)(barrier_offset + work->completion_barrier);
    648 				ctx->os.wake_waiters(value);
    649 			}
    650 			beamform_work_queue_pop_commit(q);
    651 			work = beamform_work_queue_pop(q);
    652 		}
    653 	}
    654 }
    655 
    656 DEBUG_EXPORT BEAMFORMER_COMPUTE_SETUP_FN(beamformer_compute_setup)
    657 {
    658 	BeamformerCtx          *ctx = (BeamformerCtx *)user_context;
    659 	BeamformerSharedMemory *sm  = ctx->shared_memory;
    660 	ComputeShaderCtx       *cs  = &ctx->csctx;
    661 
    662 	glCreateBuffers(1, &cs->shared_ubo);
    663 	glNamedBufferStorage(cs->shared_ubo, sizeof(sm->parameters), 0, GL_DYNAMIC_STORAGE_BIT);
    664 
    665 	glCreateTextures(GL_TEXTURE_1D, 1, &cs->channel_mapping_texture);
    666 	glCreateTextures(GL_TEXTURE_1D, 1, &cs->sparse_elements_texture);
    667 	glCreateTextures(GL_TEXTURE_1D, 1, &cs->focal_vectors_texture);
    668 	glTextureStorage1D(cs->channel_mapping_texture, 1, GL_R16I,  ARRAY_COUNT(sm->channel_mapping));
    669 	glTextureStorage1D(cs->sparse_elements_texture, 1, GL_R16I,  ARRAY_COUNT(sm->sparse_elements));
    670 	glTextureStorage1D(cs->focal_vectors_texture,   1, GL_RG32F, ARRAY_COUNT(sm->focal_vectors));
    671 
    672 	LABEL_GL_OBJECT(GL_TEXTURE, cs->channel_mapping_texture, s8("Channel_Mapping"));
    673 	LABEL_GL_OBJECT(GL_TEXTURE, cs->focal_vectors_texture,   s8("Focal_Vectors"));
    674 	LABEL_GL_OBJECT(GL_TEXTURE, cs->sparse_elements_texture, s8("Sparse_Elements"));
    675 	LABEL_GL_OBJECT(GL_BUFFER,  cs->shared_ubo,              s8("Beamformer_Parameters"));
    676 }
    677 
    678 DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute)
    679 {
    680 	BeamformerCtx *ctx = (BeamformerCtx *)user_context;
    681 	complete_queue(ctx, &ctx->shared_memory->external_work_queue, arena, gl_context, (iz)ctx->shared_memory);
    682 	complete_queue(ctx, ctx->beamform_work_queue, arena, gl_context, 0);
    683 }
    684 
    685 #include "ui.c"
    686 
    687 DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
    688 {
    689 	dt_for_frame = GetFrameTime();
    690 
    691 	if (IsWindowResized()) {
    692 		ctx->window_size.h = GetScreenHeight();
    693 		ctx->window_size.w = GetScreenWidth();
    694 	}
    695 
    696 	if (input->executable_reloaded) {
    697 		ui_init(ctx, ctx->ui_backing_store);
    698 		DEBUG_DECL(start_frame_capture = ctx->os.start_frame_capture);
    699 		DEBUG_DECL(end_frame_capture   = ctx->os.end_frame_capture);
    700 	}
    701 
    702 	BeamformerParameters *bp = &ctx->shared_memory->parameters;
    703 	if (ctx->shared_memory->dispatch_compute_sync) {
    704 		ImagePlaneTag current_plane = ctx->shared_memory->current_image_plane;
    705 		atomic_store(&ctx->shared_memory->dispatch_compute_sync, 0);
    706 		BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue);
    707 		if (work) {
    708 			if (fill_frame_compute_work(ctx, work, current_plane))
    709 				beamform_work_queue_push_commit(ctx->beamform_work_queue);
    710 
    711 			if (ctx->shared_memory->export_next_frame) {
    712 				BeamformWork *export = beamform_work_queue_push(ctx->beamform_work_queue);
    713 				if (export) {
    714 					/* TODO: we don't really want the beamformer opening/closing files */
    715 					iptr f = ctx->os.open_for_write(ctx->shared_memory->export_pipe_name);
    716 					export->type = BW_SAVE_FRAME;
    717 					export->output_frame_ctx.file_handle = f;
    718 					if (bp->output_points.w > 1) {
    719 						u32 a_index = !(ctx->averaged_frame_index %
    720 						                ARRAY_COUNT(ctx->averaged_frames));
    721 						BeamformComputeFrame *aframe = ctx->averaged_frames + a_index;
    722 						export->output_frame_ctx.frame = aframe;
    723 					} else {
    724 						export->output_frame_ctx.frame = work->frame;
    725 					}
    726 					beamform_work_queue_push_commit(ctx->beamform_work_queue);
    727 				}
    728 				ctx->shared_memory->export_next_frame = 0;
    729 			}
    730 
    731 			ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
    732 		}
    733 	}
    734 
    735 	if (ctx->start_compute) {
    736 		if (ctx->beamform_frames[ctx->display_frame_index].ready_to_present) {
    737 			BeamformWork *work  = beamform_work_queue_push(ctx->beamform_work_queue);
    738 			ImagePlaneTag plane = ctx->beamform_frames[ctx->display_frame_index].image_plane_tag;
    739 			if (fill_frame_compute_work(ctx, work, plane)) {
    740 				beamform_work_queue_push_commit(ctx->beamform_work_queue);
    741 				ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
    742 				ctx->start_compute = 0;
    743 			}
    744 		}
    745 	}
    746 
    747 	ComputeFrameIterator cfi = compute_frame_iterator(ctx, ctx->display_frame_index,
    748 	                                                  ctx->next_render_frame_index - ctx->display_frame_index);
    749 	for (BeamformComputeFrame *frame = frame_next(&cfi); frame; frame = frame_next(&cfi)) {
    750 		if (frame->in_flight && frame->ready_to_present) {
    751 			frame->in_flight         = 0;
    752 			ctx->display_frame_index = frame - cfi.frames;
    753 		}
    754 	}
    755 
    756 	if (ctx->start_compute) {
    757 		ctx->start_compute = 0;
    758 		ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
    759 	}
    760 
    761 	BeamformComputeFrame *frame_to_draw;
    762 	if (bp->output_points.w > 1) {
    763 		u32 a_index = !(ctx->averaged_frame_index % ARRAY_COUNT(ctx->averaged_frames));
    764 		frame_to_draw = ctx->averaged_frames + a_index;
    765 	} else {
    766 		frame_to_draw = ctx->beamform_frames + ctx->display_frame_index;
    767 	}
    768 
    769 	draw_ui(ctx, input, frame_to_draw->ready_to_present? &frame_to_draw->frame : 0,
    770 	        frame_to_draw->image_plane_tag, &frame_to_draw->stats);
    771 
    772 	ctx->frame_view_render_context.updated = 0;
    773 
    774 	if (WindowShouldClose())
    775 		ctx->should_exit = 1;
    776 }