ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git

beamformer.c (25411B)


/* See LICENSE for license details. */
#include "beamformer.h"

static f32 dt_for_frame;
static f32 cycle_t;

static size
decoded_data_size(ComputeShaderCtx *cs)
{
	uv4  dim    = cs->dec_data_dim;
	size result = 2 * sizeof(f32) * dim.x * dim.y * dim.z;
	return result;
}
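
/* NOTE(editor): the factor of 2 is the two f32s per decoded sample; the
 * decoded data is stored as interleaved two-component (complex I/Q) pairs,
 * matching the GL_RG32F frame textures and the 2 * sizeof(f32) readback in
 * export_frame() below. */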

static uv3
make_valid_test_dim(uv3 in)
{
	uv3 result;
	result.x = MAX(in.x, 1);
	result.y = MAX(in.y, 1);
	result.z = MAX(in.z, 1);
	return result;
}

static BeamformFrameIterator
beamform_frame_iterator(BeamformerCtx *ctx)
{
	BeamformFrameIterator result;
	result.frames        = ctx->beamform_frames;
	result.offset        = ctx->displayed_frame_index;
	result.capacity      = ARRAY_COUNT(ctx->beamform_frames);
	result.cursor        = 0;
	result.needed_frames = ORONE(ctx->params->raw.output_points.w);
	return result;
}

static BeamformFrame *
frame_next(BeamformFrameIterator *bfi)
{
	BeamformFrame *result = 0;
	if (bfi->cursor != bfi->needed_frames) {
		u32 index = (bfi->offset - bfi->cursor++) % bfi->capacity;
		result    = bfi->frames + index;
	}
	return result;
}
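
/* NOTE(editor): illustrative usage (a sketch, mirroring the CS_SUM case
 * below) - the iterator walks backwards from the most recently displayed
 * frame through the ring buffer:
 *
 *     BeamformFrameIterator bfi = beamform_frame_iterator(ctx);
 *     for (BeamformFrame *f = frame_next(&bfi); f; f = frame_next(&bfi))
 *         consume(f);
 *
 * Since (offset - cursor) is computed in unsigned arithmetic, the modulo
 * wraps correctly only when the capacity divides 2^32, i.e. when
 * ARRAY_COUNT(ctx->beamform_frames) is a power of two. */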

static void
alloc_beamform_frame(GLParams *gp, BeamformFrame *out, uv3 out_dim, u32 frame_index, s8 name)
{
	glDeleteTextures(1, &out->texture);

	out->dim.x = CLAMP(round_down_power_of_2(ORONE(out_dim.x)), 1, gp->max_3d_texture_dim);
	out->dim.y = CLAMP(round_down_power_of_2(ORONE(out_dim.y)), 1, gp->max_3d_texture_dim);
	out->dim.z = CLAMP(round_down_power_of_2(ORONE(out_dim.z)), 1, gp->max_3d_texture_dim);

	/* NOTE: allocate storage for beamformed output data;
	 * this is shared between compute and fragment shaders */
	u32 max_dim = MAX(out->dim.x, MAX(out->dim.y, out->dim.z));
	out->mips   = ctz_u32(max_dim) + 1;

	/* TODO(rnp): arena?? */
	u8 buf[256];
	Stream label = {.data = buf, .cap = ARRAY_COUNT(buf)};
	stream_append_s8(&label, name);
	stream_append_byte(&label, '[');
	stream_append_u64(&label, frame_index);
	stream_append_s8(&label, s8("]"));

	glCreateTextures(GL_TEXTURE_3D, 1, &out->texture);
	glTextureStorage3D(out->texture, out->mips, GL_RG32F, out->dim.x, out->dim.y, out->dim.z);
	LABEL_GL_OBJECT(GL_TEXTURE, out->texture, stream_to_s8(&label));
}
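
/* NOTE(editor): since alloc_beamform_frame() rounds every dimension down to
 * a power of two, max_dim is itself a power of two and ctz_u32(max_dim)
 * equals log2(max_dim), so out->mips counts every mip level down to 1x1x1
 * (e.g. max_dim == 256 gives 9 levels: 256, 128, ..., 1). */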

static void
alloc_output_image(BeamformerCtx *ctx, uv3 output_dim)
{
	uv3 try_dim = make_valid_test_dim(output_dim);
	if (!uv3_equal(try_dim, ctx->averaged_frame.dim)) {
		alloc_beamform_frame(&ctx->gl, &ctx->averaged_frame, try_dim, 0,
		                     s8("Beamformed_Averaged_Data"));
		uv3 odim = ctx->averaged_frame.dim;

		UnloadRenderTexture(ctx->fsctx.output);
		/* TODO: select odim.x vs odim.y */
		ctx->fsctx.output = LoadRenderTexture(odim.x, odim.z);
		LABEL_GL_OBJECT(GL_FRAMEBUFFER, ctx->fsctx.output.id, s8("Rendered_View"));
		GenTextureMipmaps(&ctx->fsctx.output.texture);
		//SetTextureFilter(ctx->fsctx.output.texture, TEXTURE_FILTER_ANISOTROPIC_8X);
		//SetTextureFilter(ctx->fsctx.output.texture, TEXTURE_FILTER_TRILINEAR);
		SetTextureFilter(ctx->fsctx.output.texture, TEXTURE_FILTER_BILINEAR);

		/* NOTE(rnp): work around raylib's janky texture sampling */
		i32 id = ctx->fsctx.output.texture.id;
		glTextureParameteri(id, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_BORDER);
		glTextureParameteri(id, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_BORDER);

		f32 border_color[] = {0, 0, 0, 1};
		glTextureParameterfv(id, GL_TEXTURE_BORDER_COLOR, border_color);
	}
}

static void
alloc_shader_storage(BeamformerCtx *ctx, Arena a)
{
	ComputeShaderCtx *cs     = &ctx->csctx;
	BeamformerParameters *bp = &ctx->params->raw;
	uv4 dec_data_dim         = bp->dec_data_dim;
	uv2 rf_raw_dim           = bp->rf_raw_dim;
	ctx->csctx.dec_data_dim  = dec_data_dim;
	ctx->csctx.rf_raw_dim    = rf_raw_dim;
	size rf_raw_size         = rf_raw_dim.x * rf_raw_dim.y * sizeof(i16);
	size rf_decoded_size     = decoded_data_size(cs);

	glDeleteBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
	glCreateBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);

	i32 storage_flags = GL_DYNAMIC_STORAGE_BIT;
	switch (ctx->gl.vendor_id) {
	case GL_VENDOR_AMD:
	case GL_VENDOR_ARM:
	case GL_VENDOR_INTEL:
		if (cs->raw_data_ssbo)
			glUnmapNamedBuffer(cs->raw_data_ssbo);
		storage_flags |= GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT;
		/* FALLTHROUGH */
	case GL_VENDOR_NVIDIA:
		/* NOTE: register_cuda_buffers will handle the updated ssbo */
		break;
	}
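
	/* NOTE(editor): on AMD/ARM/Intel the raw data SSBO is written through a
	 * persistently mapped pointer (see the glMapNamedBufferRange call below),
	 * so its storage must be created with matching MAP_WRITE|MAP_PERSISTENT
	 * flags; on NVIDIA the buffer is instead filled with glNamedBufferSubData
	 * and registered with CUDA, so plain DYNAMIC_STORAGE suffices. */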

	size full_rf_buf_size = ARRAY_COUNT(cs->raw_data_fences) * rf_raw_size;
	glDeleteBuffers(1, &cs->raw_data_ssbo);
	glCreateBuffers(1, &cs->raw_data_ssbo);
	glNamedBufferStorage(cs->raw_data_ssbo, full_rf_buf_size, 0, storage_flags);
	LABEL_GL_OBJECT(GL_BUFFER, cs->raw_data_ssbo, s8("Raw_Data_SSBO"));

	Stream label = stream_alloc(&a, 256);
	stream_append_s8(&label, s8("RF_SSBO_"));
	u32 s_widx  = label.widx;
	for (u32 i = 0; i < ARRAY_COUNT(cs->rf_data_ssbos); i++) {
		glNamedBufferStorage(cs->rf_data_ssbos[i], rf_decoded_size, 0, 0);
		stream_append_u64(&label, i);
		s8 rf_label = stream_to_s8(&label);
		LABEL_GL_OBJECT(GL_BUFFER, cs->rf_data_ssbos[i], rf_label);
		label.widx = s_widx;
	}

	i32 map_flags = GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT|GL_MAP_UNSYNCHRONIZED_BIT;
	switch (ctx->gl.vendor_id) {
	case GL_VENDOR_AMD:
	case GL_VENDOR_ARM:
	case GL_VENDOR_INTEL:
		cs->raw_data_arena.beg = glMapNamedBufferRange(cs->raw_data_ssbo, 0,
		                                               full_rf_buf_size, map_flags);
		break;
	case GL_VENDOR_NVIDIA:
		cs->raw_data_arena = ctx->platform.alloc_arena(cs->raw_data_arena, full_rf_buf_size);
		ctx->cuda_lib.register_cuda_buffers(cs->rf_data_ssbos, ARRAY_COUNT(cs->rf_data_ssbos),
		                                    cs->raw_data_ssbo);
		ctx->cuda_lib.init_cuda_configuration(bp->rf_raw_dim.E, bp->dec_data_dim.E,
		                                      bp->channel_mapping);
		break;
	}

	/* NOTE: store hadamard in GPU once; it won't change for a particular imaging session */
	size hadamard_elements = dec_data_dim.z * dec_data_dim.z;
	i32  *hadamard         = alloc(&a, i32, hadamard_elements);
	i32  *tmp              = alloc(&a, i32, hadamard_elements);
	fill_hadamard_transpose(hadamard, tmp, dec_data_dim.z);
	glDeleteTextures(1, &cs->hadamard_texture);
	glCreateTextures(GL_TEXTURE_2D, 1, &cs->hadamard_texture);
	glTextureStorage2D(cs->hadamard_texture, 1, GL_R8I, dec_data_dim.z, dec_data_dim.z);
	glTextureSubImage2D(cs->hadamard_texture, 0, 0, 0, dec_data_dim.z, dec_data_dim.z,
	                    GL_RED_INTEGER, GL_INT, hadamard);
	LABEL_GL_OBJECT(GL_TEXTURE, cs->hadamard_texture, s8("Hadamard_Matrix"));
}
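
/* NOTE(editor): fill_hadamard_transpose() is defined elsewhere in this repo;
 * the sketch below only illustrates the classic Sylvester construction it
 * could reduce to when dec_data_dim.z is a power of two (Sylvester matrices
 * are symmetric, so the transpose comes for free):
 *
 *     H(1)  = [1],    H(2n) = | H(n)  H(n) |
 *                             | H(n) -H(n) |
 */
#if 0
static void
sketch_fill_hadamard(i32 *h, u32 n)
{
	/* h is an n x n row-major matrix; grow the top-left k x k block
	 * outward until it covers the whole matrix */
	h[0] = 1;
	for (u32 k = 1; k < n; k *= 2) {
		for (u32 i = 0; i < k; i++) {
			for (u32 j = 0; j < k; j++) {
				i32 v = h[i * n + j];
				h[i * n + (j + k)]       =  v;
				h[(i + k) * n + j]       =  v;
				h[(i + k) * n + (j + k)] = -v;
			}
		}
	}
}
#endif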

static BeamformWork *
beamform_work_queue_pop(BeamformWorkQueue *q)
{
	BeamformWork *result = q->first;
	if (result) {
		switch (result->type) {
		case BW_FULL_COMPUTE:
		case BW_RECOMPUTE:
		case BW_PARTIAL_COMPUTE:
			/* NOTE: only one compute is allowed per frame */
			if (q->did_compute_this_frame) {
				result = 0;
			} else {
				q->compute_in_flight--;
				q->did_compute_this_frame = 1;
				ASSERT(q->compute_in_flight >= 0);
			}
			break;
		}
	}
	/* NOTE: only do this once we have determined if we are doing the work */
	if (result) {
		q->first = result->next;
		if (result == q->last) {
			ASSERT(result->next == 0);
			q->last = 0;
		}
	}

	return result;
}
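
/* NOTE(editor): the pop is deliberately two-phase: first decide whether the
 * head item may run this frame (the queue allows at most one compute
 * dispatch per frame), and only then unlink it. A rejected compute item
 * stays queued for a later frame instead of being dropped. */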

static BeamformWork *
beamform_work_queue_push(BeamformerCtx *ctx, Arena *a, enum beamform_work work_type)
{
	/* TODO: we should have a sub arena specifically for this purpose */

	BeamformWorkQueue *q = &ctx->beamform_work_queue;
	ComputeShaderCtx *cs = &ctx->csctx;

	BeamformWork *result = q->next_free;
	if (result) q->next_free = result->next;
	else        result = alloc(a, typeof(*result), 1);

	if (result) {
		result->type = work_type;
		result->next = 0;

		switch (work_type) {
		case BW_FULL_COMPUTE:
			if (q->compute_in_flight >= ARRAY_COUNT(cs->raw_data_fences)) {
				result->next = q->next_free;
				q->next_free = result;
				result       = 0;
				break;
			}
			cs->raw_data_index++;
			if (cs->raw_data_index >= ARRAY_COUNT(cs->raw_data_fences))
				cs->raw_data_index = 0;
			/* FALLTHROUGH */
		case BW_RECOMPUTE: {
			i32 raw_index = cs->raw_data_index;
			result->compute_ctx.raw_data_ssbo_index = raw_index;
			/* NOTE: if this times out it means the command queue is more than 3
			 * frames behind. In that case we need to re-evaluate the buffer size */
			if (cs->raw_data_fences[raw_index]) {
				i32 sync_status = glClientWaitSync(cs->raw_data_fences[raw_index], 0,
				                                   10000);
				if (sync_status == GL_TIMEOUT_EXPIRED) {
					//ASSERT(0);
				}
				glDeleteSync(cs->raw_data_fences[raw_index]);
				cs->raw_data_fences[raw_index] = NULL;
			}
			ctx->displayed_frame_index++;
			if (ctx->displayed_frame_index >= ARRAY_COUNT(ctx->beamform_frames))
				ctx->displayed_frame_index = 0;
			result->compute_ctx.frame = ctx->beamform_frames + ctx->displayed_frame_index;
			result->compute_ctx.first_pass = 1;

			BeamformFrameIterator bfi = beamform_frame_iterator(ctx);
			for (BeamformFrame *frame = frame_next(&bfi); frame; frame = frame_next(&bfi)) {
				uv3 try_dim = ctx->params->raw.output_points.xyz;
				if (!uv3_equal(frame->dim, try_dim)) {
					/* NOTE: frame_next() already advanced the cursor past this frame */
					u32 index = (bfi.offset - (bfi.cursor - 1)) % bfi.capacity;
					alloc_beamform_frame(&ctx->gl, frame, try_dim, index,
					                     s8("Beamformed_Data"));
				}
			}
		} /* FALLTHROUGH */
		case BW_PARTIAL_COMPUTE:
			q->compute_in_flight++;
			/* FALLTHROUGH */
		case BW_SAVE_FRAME:
		case BW_SEND_FRAME:
		case BW_SSBO_COPY:
			break;
		}

		if (result) {
			if (q->last) q->last = q->last->next = result;
			else         q->last = q->first      = result;
		}
	}

	return result;
}

static void
export_frame(BeamformerCtx *ctx, iptr handle, BeamformFrame *frame)
{
	uv3 dim            = frame->dim;
	size out_size      = dim.x * dim.y * dim.z * 2 * sizeof(f32);
	ctx->export_buffer = ctx->platform.alloc_arena(ctx->export_buffer, out_size);
	glGetTextureImage(frame->texture, 0, GL_RG, GL_FLOAT, out_size, ctx->export_buffer.beg);
	s8 raw = {.len = out_size, .data = ctx->export_buffer.beg};
	if (!ctx->platform.write_file(handle, raw))
		TraceLog(LOG_WARNING, "failed to export frame\n");
	ctx->platform.close(handle);
}
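
/* NOTE(editor): the exported blob is dim.x * dim.y * dim.z interleaved
 * two-component samples (two f32s per voxel, matching the GL_RG/GL_FLOAT
 * readback), with x varying fastest, then y, then z. A consumer could index
 * it as in this sketch (the names below are illustrative, not part of this
 * source): */
#if 0
typedef struct { f32 re, im; } ExportedSample;

static ExportedSample *
exported_sample_at(ExportedSample *samples, uv3 dim, u32 x, u32 y, u32 z)
{
	return samples + (size)z * dim.x * dim.y + (size)y * dim.x + x;
}
#endif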

static void
do_sum_shader(ComputeShaderCtx *cs, u32 *in_textures, u32 in_texture_count, f32 in_scale,
              u32 out_texture, uv3 out_data_dim)
{
	/* NOTE: zero output before summing */
	glClearTexImage(out_texture, 0, GL_RED, GL_FLOAT, 0);

	glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F);
	glUniform1f(cs->sum_prescale_id, in_scale);
	for (u32 i = 0; i < in_texture_count; i++) {
		glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F);
		glDispatchCompute(ORONE(out_data_dim.x / 32),
		                  ORONE(out_data_dim.y),
		                  ORONE(out_data_dim.z / 32));
		glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
	}
}
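
/* NOTE(editor): with in_scale = 1/N, as passed from the CS_SUM case below,
 * the loop computes a mean rather than a plain sum:
 *
 *     out = sum_{i=0}^{N-1} (1/N) * in_i
 *
 * i.e. frame averaging is folded into the prescale instead of a final
 * divide; the barrier between iterations orders each read-modify-write of
 * the accumulator image. */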

static void
do_beamform_shader(ComputeShaderCtx *cs, BeamformerParameters *bp, BeamformFrame *frame,
                   u32 rf_ssbo, iv3 dispatch_dim, iv3 compute_dim_offset, i32 compute_pass)
{
	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, rf_ssbo);
	glUniform3iv(cs->volume_export_dim_offset_id, 1, compute_dim_offset.E);
	glUniform1i(cs->volume_export_pass_id, compute_pass);

	glBindImageTexture(0, frame->texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
	glDispatchCompute(ORONE(dispatch_dim.x / 32),
	                  ORONE(dispatch_dim.y),
	                  ORONE(dispatch_dim.z / 32));
}

static b32
do_partial_compute_step(BeamformerCtx *ctx, BeamformFrame *frame)
{
	ComputeShaderCtx  *cs = &ctx->csctx;
	PartialComputeCtx *pc = &ctx->partial_compute_ctx;

	b32 done = 0;

	/* NOTE: we start this elsewhere on the first dispatch so that we can include
	 * times such as decoding/demodulation/etc. */
	if (!pc->timer_active) {
		glQueryCounter(pc->timer_ids[0], GL_TIMESTAMP);
		pc->timer_active = 1;
	}

	glBeginQuery(GL_TIME_ELAPSED, cs->timer_ids[cs->timer_index][pc->shader]);
	cs->timer_active[cs->timer_index][pc->shader] = 1;

	glUseProgram(cs->programs[pc->shader]);

	/* NOTE: We must tile this otherwise GL will kill us for taking too long */
	/* TODO: this could be based on multiple dimensions */
	i32 dispatch_count = frame->dim.z / 32;
	iv3 dim_offset     = {.z = !!dispatch_count * 32 * pc->dispatch_index++};
	iv3 dispatch_dim   = {.x = frame->dim.x, .y = frame->dim.y, .z = 1};
	do_beamform_shader(cs, &ctx->params->raw, frame, pc->rf_data_ssbo, dispatch_dim, dim_offset, 1);

	if (pc->dispatch_index >= dispatch_count) {
		pc->dispatch_index = 0;
		done               = 1;
	}

	glQueryCounter(pc->timer_ids[1], GL_TIMESTAMP);

	glEndQuery(GL_TIME_ELAPSED);

	return done;
}
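
/* NOTE(editor): each partial step beamforms a single 32-voxel-deep slab
 * (dim_offset.z advances by 32 per call via dispatch_index), so no single
 * dispatch runs long enough to trip the driver's GPU watchdog;
 * do_beamform_work() re-queues the work item until dispatch_index has
 * covered all frame->dim.z / 32 slabs. */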

static void
do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformFrame *frame, u32 raw_data_index,
                  enum compute_shaders shader)
{
	ComputeShaderCtx *csctx = &ctx->csctx;
	uv2  rf_raw_dim         = ctx->params->raw.rf_raw_dim;
	size rf_raw_size        = rf_raw_dim.x * rf_raw_dim.y * sizeof(i16);

	glBeginQuery(GL_TIME_ELAPSED, csctx->timer_ids[csctx->timer_index][shader]);
	csctx->timer_active[csctx->timer_index][shader] = 1;

	glUseProgram(csctx->programs[shader]);

	u32 output_ssbo_idx = !csctx->last_output_ssbo_index;
	u32 input_ssbo_idx  = csctx->last_output_ssbo_index;

	switch (shader) {
	case CS_HADAMARD:
		glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo,
		                  raw_data_index * rf_raw_size, rf_raw_size);

		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
		glBindImageTexture(0, csctx->hadamard_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I);
		glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32),
		                  ORONE(csctx->dec_data_dim.y / 32),
		                  ORONE(csctx->dec_data_dim.z));
		csctx->raw_data_fences[raw_data_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_CUDA_DECODE:
		ctx->cuda_lib.cuda_decode(raw_data_index * rf_raw_size, output_ssbo_idx, 0);
		csctx->raw_data_fences[raw_data_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_CUDA_HILBERT:
		ctx->cuda_lib.cuda_hilbert(input_ssbo_idx, output_ssbo_idx);
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_DEMOD:
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
		glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32),
		                  ORONE(csctx->dec_data_dim.y / 32),
		                  ORONE(csctx->dec_data_dim.z));
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_MIN_MAX: {
		u32 texture = frame->texture;
		for (u32 i = 1; i < frame->mips; i++) {
			glBindImageTexture(0, texture, i - 1, GL_TRUE, 0, GL_READ_ONLY,  GL_RG32F);
			glBindImageTexture(1, texture, i - 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
			glUniform1i(csctx->mips_level_id, i);

			u32 width  = frame->dim.x >> i;
			u32 height = frame->dim.y >> i;
			u32 depth  = frame->dim.z >> i;
			glDispatchCompute(ORONE(width / 32), ORONE(height), ORONE(depth / 32));
			glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
		}
	} break;
	case CS_DAS: {
		u32 rf_ssbo      = csctx->rf_data_ssbos[input_ssbo_idx];
		iv3 dispatch_dim = {.x = frame->dim.x, .y = frame->dim.y, .z = frame->dim.z};
		do_beamform_shader(csctx, &ctx->params->raw, frame, rf_ssbo, dispatch_dim, (iv3){0}, 0);
	} break;
	case CS_SUM: {
		u32 frame_count  = 0;
		u32 *in_textures = alloc(&arena, u32, MAX_BEAMFORMED_SAVED_FRAMES);
		BeamformFrameIterator bfi = beamform_frame_iterator(ctx);
		for (BeamformFrame *f = frame_next(&bfi); f; f = frame_next(&bfi))
			in_textures[frame_count++] = f->texture;
		do_sum_shader(csctx, in_textures, frame_count, 1 / (f32)frame_count,
		              ctx->averaged_frame.texture, ctx->averaged_frame.dim);
	} break;
	default: ASSERT(0);
	}

	glEndQuery(GL_TIME_ELAPSED);
}
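
/* NOTE(editor): the decode/filter stages ping-pong between the two
 * rf_data_ssbos: each stage reads rf_data_ssbos[last_output_ssbo_index],
 * writes the other buffer, then flips the index, so an arbitrary stage list
 * chains without intermediate copies. Stages that only consume the decoded
 * data (CS_MIN_MAX, CS_DAS, CS_SUM) leave the index untouched. */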

static BeamformFrame *
start_beamform_compute_work(BeamformWork *work, ComputeShaderCtx *cs, BeamformerParametersFull *bpf)
{
	BeamformFrame *result = work->compute_ctx.frame;
	if (bpf->upload) {
		glNamedBufferSubData(cs->shared_ubo, 0, sizeof(bpf->raw), &bpf->raw);
		bpf->upload = 0;
	}

	result->min_coordinate = bpf->raw.output_min_coordinate;
	result->max_coordinate = bpf->raw.output_max_coordinate;

	return result;
}

static void
do_beamform_work(BeamformerCtx *ctx, Arena *a)
{
	BeamformWorkQueue *q = &ctx->beamform_work_queue;
	BeamformWork *work   = beamform_work_queue_pop(q);
	ComputeShaderCtx *cs = &ctx->csctx;

	while (work) {
		switch (work->type) {
		case BW_PARTIAL_COMPUTE: {
			BeamformFrame *frame = work->compute_ctx.frame;

			if (work->compute_ctx.first_pass) {
				start_beamform_compute_work(work, cs, ctx->params);

				PartialComputeCtx *pc = &ctx->partial_compute_ctx;
				pc->runtime      = 0;
				pc->timer_active = 1;
				glQueryCounter(pc->timer_ids[0], GL_TIMESTAMP);
				glDeleteBuffers(1, &pc->rf_data_ssbo);
				glCreateBuffers(1, &pc->rf_data_ssbo);
				glNamedBufferStorage(pc->rf_data_ssbo, decoded_data_size(cs), 0, 0);
				LABEL_GL_OBJECT(GL_BUFFER, pc->rf_data_ssbo, s8("Volume_RF_SSBO"));

				/* TODO: maybe we should have some concept of compute shader
				 * groups, then we could define a group that does the decoding
				 * and filtering and apply that group directly here. For now
				 * we will do this dumb thing */
				u32 stage_count = ctx->params->compute_stages_count;
				enum compute_shaders *stages = ctx->params->compute_stages;
				for (u32 i = 0; i < stage_count; i++) {
					if (stages[i] == CS_DAS) {
						ctx->partial_compute_ctx.shader = stages[i];
						break;
					}
					do_compute_shader(ctx, *a, frame,
					                  work->compute_ctx.raw_data_ssbo_index,
					                  stages[i]);
				}
				u32 output_ssbo = pc->rf_data_ssbo;
				u32 input_ssbo  = cs->rf_data_ssbos[cs->last_output_ssbo_index];
				size rf_size    = decoded_data_size(cs);
				glCopyNamedBufferSubData(input_ssbo, output_ssbo, 0, 0, rf_size);
			}

			b32 done = do_partial_compute_step(ctx, frame);
			if (!done) {
				BeamformWork *new;
				/* NOTE: this push must not fail */
				new = beamform_work_queue_push(ctx, a, BW_PARTIAL_COMPUTE);
				new->compute_ctx.first_pass    = 0;
				new->compute_ctx.frame         = frame;
				new->compute_ctx.export_handle = work->compute_ctx.export_handle;
			} else if (work->compute_ctx.export_handle != INVALID_FILE) {
				export_frame(ctx, work->compute_ctx.export_handle, frame);
				work->compute_ctx.export_handle = INVALID_FILE;
				/* NOTE: do not waste a bunch of GPU space holding onto the volume
				 * texture if it was just for export */
				glDeleteTextures(1, &frame->texture);
				mem_clear(frame, 0, sizeof(*frame));
			}
		} break;
		case BW_FULL_COMPUTE:
		case BW_RECOMPUTE: {
			BeamformFrame *frame = start_beamform_compute_work(work, cs, ctx->params);

			u32 stage_count = ctx->params->compute_stages_count;
			enum compute_shaders *stages = ctx->params->compute_stages;
			for (u32 i = 0; i < stage_count; i++)
				do_compute_shader(ctx, *a, frame, work->compute_ctx.raw_data_ssbo_index,
				                  stages[i]);

			if (work->compute_ctx.export_handle != INVALID_FILE) {
				export_frame(ctx, work->compute_ctx.export_handle, frame);
				work->compute_ctx.export_handle = INVALID_FILE;
			}

			ctx->fsctx.gen_mipmaps = 1;
		} break;
		}

		work->next   = q->next_free;
		q->next_free = work;
		work = beamform_work_queue_pop(q);
	}

	if (q->did_compute_this_frame) {
		u32 tidx = ctx->csctx.timer_index;
		glDeleteSync(ctx->csctx.timer_fences[tidx]);
		ctx->csctx.timer_fences[tidx] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
		ctx->csctx.timer_index = (tidx + 1) % ARRAY_COUNT(ctx->csctx.timer_fences);
	}
}

static void
check_compute_timers(ComputeShaderCtx *cs, PartialComputeCtx *pc, BeamformerParametersFull *bp)
{
	/* NOTE: volume generation running timer */
	if (pc->timer_active) {
		u64 start_ns = 0, end_ns = 0;
		glGetQueryObjectui64v(pc->timer_ids[0], GL_QUERY_RESULT, &start_ns);
		glGetQueryObjectui64v(pc->timer_ids[1], GL_QUERY_RESULT, &end_ns);
		u64 elapsed_ns    = end_ns - start_ns;
		pc->runtime      += (f32)elapsed_ns * 1e-9;
		pc->timer_active  = 0;
	}

	/* NOTE: main timers for display portion of the program */
	u32 last_idx = (cs->timer_index - 1) % ARRAY_COUNT(cs->timer_fences);
	if (!cs->timer_fences[last_idx])
		return;

	i32 status = glClientWaitSync(cs->timer_fences[last_idx], 0, 0);
	if (status == GL_TIMEOUT_EXPIRED || status == GL_WAIT_FAILED)
		return;
	glDeleteSync(cs->timer_fences[last_idx]);
	cs->timer_fences[last_idx] = NULL;

	for (u32 i = 0; i < bp->compute_stages_count; i++) {
		u64 ns  = 0;
		i32 idx = bp->compute_stages[i];
		if (cs->timer_active[last_idx][idx]) {
			glGetQueryObjectui64v(cs->timer_ids[last_idx][idx], GL_QUERY_RESULT, &ns);
			cs->timer_active[last_idx][idx] = 0;
		}
		cs->last_frame_time[idx] = (f32)ns / 1e9;
	}
}
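
/* NOTE(editor): (cs->timer_index - 1) underflows to 0xFFFFFFFF when the
 * index is 0, so the modulo selects the previous slot correctly only when
 * ARRAY_COUNT(cs->timer_fences) is a power of two (the same caveat as the
 * frame ring above). Reading results one frame late appears intentional:
 * the zero-timeout glClientWaitSync bails out instead of stalling the CPU
 * when the queries have not yet resolved. */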

#include "ui.c"

DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
{
	dt_for_frame = GetFrameTime();

	cycle_t += dt_for_frame;
	if (cycle_t > 1) cycle_t -= 1;
	glProgramUniform1f(ctx->csctx.programs[CS_DAS], ctx->csctx.cycle_t_id, cycle_t);

	if (IsWindowResized()) {
		ctx->window_size.h = GetScreenHeight();
		ctx->window_size.w = GetScreenWidth();
	}

	if (input->executable_reloaded) {
		ui_init(ctx, ctx->ui_backing_store);
	}

	if (ctx->flags & START_COMPUTE) {
		if (ui_can_start_compute(ctx))
			ui_start_compute(ctx);
		ctx->flags &= ~START_COMPUTE;
	}

	/* NOTE: Store the compute time for the last frame. */
	check_compute_timers(&ctx->csctx, &ctx->partial_compute_ctx, ctx->params);

	BeamformerParameters *bp = &ctx->params->raw;
	/* NOTE: Check for and Load RF Data into GPU */
	if (input->pipe_data_available) {
		BeamformWork *work = beamform_work_queue_push(ctx, arena, BW_FULL_COMPUTE);
		/* NOTE: we can only read in the new data if we get back a work item;
		 * otherwise we have too many frames in flight and should wait until the
		 * next frame to try again */
		if (work) {
			ComputeShaderCtx *cs = &ctx->csctx;
			if (!uv4_equal(cs->dec_data_dim, bp->dec_data_dim)) {
				alloc_shader_storage(ctx, *arena);
				/* TODO: we may need to invalidate all queue items here */
			}

			if (ctx->params->export_next_frame) {
				/* TODO: we don't really want the beamformer opening/closing files */
				iptr f = ctx->platform.open_for_write(ctx->params->export_pipe_name);
				work->compute_ctx.export_handle = f;
				ctx->params->export_next_frame  = 0;
			} else {
				work->compute_ctx.export_handle = INVALID_FILE;
			}

			b32 output_3d = bp->output_points.x > 1 && bp->output_points.y > 1 &&
			                bp->output_points.z > 1;

			if (output_3d) {
				work->type = BW_PARTIAL_COMPUTE;
				BeamformFrame *frame = &ctx->partial_compute_ctx.frame;
				uv3 out_dim = ctx->params->raw.output_points.xyz;
				alloc_beamform_frame(&ctx->gl, frame, out_dim, 0, s8("Beamformed_Volume"));
				work->compute_ctx.frame = frame;
			}

			u32  raw_index    = work->compute_ctx.raw_data_ssbo_index;
			uv2  rf_raw_dim   = cs->rf_raw_dim;
			size rf_raw_size  = rf_raw_dim.x * rf_raw_dim.y * sizeof(i16);
			void *rf_data_buf = cs->raw_data_arena.beg + raw_index * rf_raw_size;

			alloc_output_image(ctx, bp->output_points.xyz);

			size rlen = ctx->platform.read_pipe(input->pipe_handle, rf_data_buf, rf_raw_size);
			if (rlen != rf_raw_size) {
				stream_append_s8(&ctx->error_stream, s8("Partial Read Occurred: "));
				stream_append_i64(&ctx->error_stream, rlen);
				stream_append_byte(&ctx->error_stream, '/');
				stream_append_i64(&ctx->error_stream, rf_raw_size);
				stream_append_s8(&ctx->error_stream, s8("\n\0"));
				TraceLog(LOG_WARNING, "%s", (c8 *)stream_to_s8(&ctx->error_stream).data);
				ctx->error_stream.widx = 0;
			} else {
				switch (ctx->gl.vendor_id) {
				case GL_VENDOR_AMD:
				case GL_VENDOR_ARM:
				case GL_VENDOR_INTEL:
					/* NOTE: data was already written through the persistent map */
					break;
				case GL_VENDOR_NVIDIA:
					glNamedBufferSubData(cs->raw_data_ssbo, raw_index * rlen,
					                     rlen, rf_data_buf);
					break;
				}
			}
		}
	}

	ctx->beamform_work_queue.did_compute_this_frame = 0;
	do_beamform_work(ctx, arena);

	/* NOTE: draw output image texture using render fragment shader */
	BeamformFrame *frame_to_draw = 0;
	BeginTextureMode(ctx->fsctx.output);
		ClearBackground(PINK);
		BeginShaderMode(ctx->fsctx.shader);
			FragmentShaderCtx *fs = &ctx->fsctx;
			glUseProgram(fs->shader.id);
			u32 out_texture = 0;
			if (bp->output_points.w > 1) {
				frame_to_draw = &ctx->averaged_frame;
				out_texture   = ctx->averaged_frame.texture;
			} else {
				frame_to_draw = ctx->beamform_frames + ctx->displayed_frame_index;
				out_texture   = frame_to_draw->texture;
			}
			glBindTextureUnit(0, out_texture);
			glUniform1f(fs->db_cutoff_id, fs->db);
			glUniform1f(fs->threshold_id, fs->threshold);
			DrawTexture(fs->output.texture, 0, 0, WHITE);
		EndShaderMode();
	EndTextureMode();

	/* NOTE: regenerate mipmaps only when the output has actually changed */
	if (ctx->fsctx.gen_mipmaps) {
		/* NOTE: shut up raylib's reporting on mipmap gen */
		SetTraceLogLevel(LOG_NONE);
		GenTextureMipmaps(&ctx->fsctx.output.texture);
		SetTraceLogLevel(LOG_INFO);
		ctx->fsctx.gen_mipmaps = 0;
	}

	draw_ui(ctx, input, frame_to_draw);

	if (WindowShouldClose())
		ctx->flags |= SHOULD_EXIT;
}