ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git

beamformer.c (29288B)


/* See LICENSE for license details. */
#include "beamformer.h"

static f32 dt_for_frame;
static u32 cycle_t;

#ifndef _DEBUG
#define start_renderdoc_capture(...)
#define end_renderdoc_capture(...)
#else
static renderdoc_start_frame_capture_fn *start_frame_capture;
static renderdoc_end_frame_capture_fn   *end_frame_capture;
#define start_renderdoc_capture(gl) if (start_frame_capture) start_frame_capture(gl, 0)
#define end_renderdoc_capture(gl)   if (end_frame_capture)   end_frame_capture(gl, 0)
#endif

static iz
decoded_data_size(ComputeShaderCtx *cs)
{
	uv4 dim    = cs->dec_data_dim;
	iz  result = 2 * sizeof(f32) * dim.x * dim.y * dim.z;
	return result;
}
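
/* A worked example of the size computed above (the dimensions are
 * hypothetical): each decoded sample is stored as two f32s (an I/Q pair),
 * hence the factor of 2 * sizeof(f32). For dec_data_dim = {2048, 128, 16}:
 *
 *     2 * 4 * 2048 * 128 * 16 = 33554432 bytes (32 MiB)
 */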

static uv3
make_valid_test_dim(uv3 in)
{
	uv3 result;
	result.x = MAX(in.x, 1);
	result.y = MAX(in.y, 1);
	result.z = MAX(in.z, 1);
	return result;
}

static BeamformFrameIterator
beamform_frame_iterator(BeamformerCtx *ctx, u32 start_index, u32 needed_frames)
{
	start_index = start_index % ARRAY_COUNT(ctx->beamform_frames);

	BeamformFrameIterator result;
	result.frames        = ctx->beamform_frames;
	result.offset        = start_index;
	result.capacity      = ARRAY_COUNT(ctx->beamform_frames);
	result.cursor        = 0;
	result.needed_frames = needed_frames;
	return result;
}

static BeamformFrame *
frame_next(BeamformFrameIterator *bfi)
{
	BeamformFrame *result = 0;
	if (bfi->cursor != bfi->needed_frames) {
		u32 index = (bfi->offset + bfi->cursor++) % bfi->capacity;
		result    = bfi->frames + index;
	}
	return result;
}
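
/* A minimal sketch of how the iterator pair above is driven (the start index
 * and count here are hypothetical; see the CS_SUM case and
 * beamformer_frame_step below for the real call sites):
 *
 *     BeamformFrameIterator bfi = beamform_frame_iterator(ctx, 5, 3);
 *     for (BeamformFrame *f = frame_next(&bfi); f; f = frame_next(&bfi)) {
 *             // visits ring slots 5, 6, 7 (mod ARRAY_COUNT(ctx->beamform_frames))
 *     }
 */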

static void
alloc_beamform_frame(GLParams *gp, BeamformFrame *out, ComputeShaderStats *out_stats,
                     uv3 out_dim, s8 name, Arena arena)
{
	out->ready_to_present = 0;

	out->dim.x = MAX(1, round_down_power_of_2(ORONE(out_dim.x)));
	out->dim.y = MAX(1, round_down_power_of_2(ORONE(out_dim.y)));
	out->dim.z = MAX(1, round_down_power_of_2(ORONE(out_dim.z)));

	if (gp) {
		out->dim.x = MIN(out->dim.x, gp->max_3d_texture_dim);
		out->dim.y = MIN(out->dim.y, gp->max_3d_texture_dim);
		out->dim.z = MIN(out->dim.z, gp->max_3d_texture_dim);
	}

	/* NOTE: allocate storage for beamformed output data;
	 * this is shared between compute and fragment shaders */
	u32 max_dim = MAX(out->dim.x, MAX(out->dim.y, out->dim.z));
	out->mips   = ctz_u32(max_dim) + 1;

	Stream label = arena_stream(&arena);
	stream_append_s8(&label, name);
	stream_append_byte(&label, '[');
	stream_append_hex_u64(&label, out->id);
	stream_append_byte(&label, ']');

	glDeleteTextures(1, &out->texture);
	glCreateTextures(GL_TEXTURE_3D, 1, &out->texture);
	glTextureStorage3D(out->texture, out->mips, GL_RG32F, out->dim.x, out->dim.y, out->dim.z);
	LABEL_GL_OBJECT(GL_TEXTURE, out->texture, stream_to_s8(&label));

	if (out_stats) {
		glDeleteQueries(ARRAY_COUNT(out_stats->timer_ids), out_stats->timer_ids);
		glCreateQueries(GL_TIME_ELAPSED, ARRAY_COUNT(out_stats->timer_ids), out_stats->timer_ids);
	}
}
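
/* Because each dimension was rounded down to a power of two above, ctz_u32 of
 * the largest dimension is exactly log2 of it, so the mip chain reaches all
 * the way down to 1x1x1. For example, max_dim = 512 gives:
 *
 *     mips = ctz_u32(512) + 1 = 9 + 1 = 10    (512, 256, ..., 2, 1)
 */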

static void
alloc_shader_storage(BeamformerCtx *ctx, Arena a)
{
	ComputeShaderCtx *cs     = &ctx->csctx;
	BeamformerParameters *bp = &ctx->params->raw;

	uv4 dec_data_dim = bp->dec_data_dim;
	u32 rf_raw_size  = ctx->params->raw_data_size;
	cs->dec_data_dim = dec_data_dim;
	cs->rf_raw_size  = rf_raw_size;

	glDeleteBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
	glCreateBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);

	i32 storage_flags = GL_DYNAMIC_STORAGE_BIT;
	switch (ctx->gl.vendor_id) {
	case GL_VENDOR_AMD:
	case GL_VENDOR_ARM:
	case GL_VENDOR_INTEL:
		if (cs->raw_data_ssbo)
			glUnmapNamedBuffer(cs->raw_data_ssbo);
		storage_flags |= GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT;
		/* FALLTHROUGH */
	case GL_VENDOR_NVIDIA:
		/* NOTE: register_cuda_buffers will handle the updated ssbo */
		break;
	}

	glDeleteBuffers(1, &cs->raw_data_ssbo);
	glCreateBuffers(1, &cs->raw_data_ssbo);
	glNamedBufferStorage(cs->raw_data_ssbo, rf_raw_size, 0, storage_flags);
	LABEL_GL_OBJECT(GL_BUFFER, cs->raw_data_ssbo, s8("Raw_RF_SSBO"));

	iz rf_decoded_size = decoded_data_size(cs);
	Stream label = stream_alloc(&a, 256);
	stream_append_s8(&label, s8("Decoded_RF_SSBO_"));
	u32 s_widx = label.widx;
	for (u32 i = 0; i < ARRAY_COUNT(cs->rf_data_ssbos); i++) {
		glNamedBufferStorage(cs->rf_data_ssbos[i], rf_decoded_size, 0, 0);
		stream_append_u64(&label, i);
		s8 rf_label = stream_to_s8(&label);
		LABEL_GL_OBJECT(GL_BUFFER, cs->rf_data_ssbos[i], rf_label);
		stream_reset(&label, s_widx);
	}

	i32 map_flags = GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT|GL_MAP_UNSYNCHRONIZED_BIT;
	switch (ctx->gl.vendor_id) {
	case GL_VENDOR_AMD:
	case GL_VENDOR_ARM:
	case GL_VENDOR_INTEL:
		cs->raw_data_arena.beg = glMapNamedBufferRange(cs->raw_data_ssbo, 0,
		                                               rf_raw_size, map_flags);
		cs->raw_data_arena.end = cs->raw_data_arena.beg + rf_raw_size;
		break;
	case GL_VENDOR_NVIDIA:
		cs->raw_data_arena = ctx->os.alloc_arena(cs->raw_data_arena, rf_raw_size);
		ctx->cuda_lib.register_cuda_buffers(cs->rf_data_ssbos, ARRAY_COUNT(cs->rf_data_ssbos),
		                                    cs->raw_data_ssbo);
		ctx->cuda_lib.init_cuda_configuration(bp->rf_raw_dim.E, bp->dec_data_dim.E,
		                                      bp->channel_mapping);
		break;
	}

	/* NOTE: store hadamard in GPU once; it won't change for a particular imaging session */
	iz   hadamard_elements = dec_data_dim.z * dec_data_dim.z;
	i32  *hadamard         = alloc(&a, i32, hadamard_elements);
	i32  *tmp              = alloc(&a, i32, hadamard_elements);
	fill_hadamard_transpose(hadamard, tmp, dec_data_dim.z);
	glDeleteTextures(1, &cs->hadamard_texture);
	glCreateTextures(GL_TEXTURE_2D, 1, &cs->hadamard_texture);
	glTextureStorage2D(cs->hadamard_texture, 1, GL_R8I, dec_data_dim.z, dec_data_dim.z);
	glTextureSubImage2D(cs->hadamard_texture, 0, 0, 0, dec_data_dim.z, dec_data_dim.z,
	                    GL_RED_INTEGER, GL_INT, hadamard);
	LABEL_GL_OBJECT(GL_TEXTURE, cs->hadamard_texture, s8("Hadamard_Matrix"));
}

static BeamformWork *
beamform_work_queue_pop(BeamformWorkQueue *q)
{
	BeamformWork *result = 0;

	static_assert(ISPOWEROF2(ARRAY_COUNT(q->work_items)), "queue capacity must be a power of 2");
	u64 val  = atomic_load(&q->queue);
	u64 mask = ARRAY_COUNT(q->work_items) - 1;
	u32 widx = val       & mask;
	u32 ridx = val >> 32 & mask;

	if (ridx != widx)
		result = q->work_items + ridx;

	return result;
}

static void
beamform_work_queue_pop_commit(BeamformWorkQueue *q)
{
	atomic_add(&q->queue, 0x100000000ULL);
}

DEBUG_EXPORT BEAMFORM_WORK_QUEUE_PUSH_FN(beamform_work_queue_push)
{
	BeamformWork *result = 0;

	static_assert(ISPOWEROF2(ARRAY_COUNT(q->work_items)), "queue capacity must be a power of 2");
	u64 val  = atomic_load(&q->queue);
	u64 mask = ARRAY_COUNT(q->work_items) - 1;
	u32 widx = val       & mask;
	u32 ridx = val >> 32 & mask;
	u32 next = (widx + 1) & mask;

	if (val & 0x80000000)
		atomic_and(&q->queue, ~0x80000000);

	if (next != ridx) {
		result = q->work_items + widx;
		zero_struct(result);
	}

	return result;
}

DEBUG_EXPORT BEAMFORM_WORK_QUEUE_PUSH_COMMIT_FN(beamform_work_queue_push_commit)
{
	atomic_add(&q->queue, 1);
}
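
/* The entire queue state above lives in one u64 that is only ever touched
 * with atomics, which is what lets a single producer and single consumer
 * share it without a lock:
 *
 *     bits  0..31: write index    (masked down to the power-of-2 capacity)
 *     bits 32..63: read  index    (pop_commit adds 1 << 32)
 *
 * A push is therefore reserve-fill-commit; the item only becomes visible to
 * the consumer once the commit lands. A hypothetical producer:
 *
 *     BeamformWork *w = beamform_work_queue_push(q);
 *     if (w) {
 *             w->type = BW_COMPUTE;               // fill the reserved slot
 *             beamform_work_queue_push_commit(q); // adds 1 to the write half
 *     }
 *
 * The consumer (beamformer_complete_compute below) mirrors this: it pops,
 * processes the item in place, and only then pop_commits. One slot is always
 * left empty so that ridx == widx unambiguously means "queue empty". */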

static b32
fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work)
{
	b32 result = 0;
	if (work) {
		result = 1;
		u32 frame_id      = atomic_inc(&ctx->next_render_frame_index, 1);
		u32 frame_index   = frame_id % ARRAY_COUNT(ctx->beamform_frames);
		work->type        = BW_COMPUTE;
		work->frame.store = ctx->beamform_frames + frame_index;
		work->frame.stats = ctx->beamform_frame_compute_stats + frame_index;
		work->frame.store->ready_to_present = 0;
		work->frame.store->id = frame_id;
	}
	return result;
}

static void
export_frame(BeamformerCtx *ctx, iptr handle, BeamformFrame *frame)
{
	uv3 dim            = frame->dim;
	iz  out_size       = dim.x * dim.y * dim.z * 2 * sizeof(f32);
	ctx->export_buffer = ctx->os.alloc_arena(ctx->export_buffer, out_size);
	glGetTextureImage(frame->texture, 0, GL_RG, GL_FLOAT, out_size, ctx->export_buffer.beg);
	s8 raw = {.len = out_size, .data = ctx->export_buffer.beg};
	if (!ctx->os.write_file(handle, raw))
		ctx->os.write_file(ctx->os.stderr, s8("failed to export frame\n"));
	ctx->os.close(handle);
}

static void
do_sum_shader(ComputeShaderCtx *cs, u32 *in_textures, u32 in_texture_count, f32 in_scale,
              u32 out_texture, uv3 out_data_dim)
{
	/* NOTE: zero output before summing */
	glClearTexImage(out_texture, 0, GL_RED, GL_FLOAT, 0);
	glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT);

	glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F);
	glUniform1f(cs->sum_prescale_id, in_scale);
	for (u32 i = 0; i < in_texture_count; i++) {
		glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F);
		glDispatchCompute(ORONE(out_data_dim.x / 32),
		                  ORONE(out_data_dim.y),
		                  ORONE(out_data_dim.z / 32));
		glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
	}
}
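
/* The sum shader itself is not in this file, but the setup above implies it
 * accumulates out += in_scale * in[i] per pass, so with in_scale = 1/N (what
 * the CS_SUM case below passes) the output ends up holding the mean:
 *
 *     out = (1/N) * (in[0] + in[1] + ... + in[N-1])
 */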

struct compute_cursor {
	iv3 cursor;
	iv3 dispatch;
	iv3 target;
	u32 points_per_dispatch;
	u32 completed_points;
	u32 total_points;
};

static struct compute_cursor
start_compute_cursor(uv3 dim, u32 max_points)
{
	struct compute_cursor result = {0};
	u32 invocations_per_dispatch = DAS_LOCAL_SIZE_X * DAS_LOCAL_SIZE_Y * DAS_LOCAL_SIZE_Z;

	result.dispatch.y = MIN(max_points / invocations_per_dispatch, MAX(dim.y / DAS_LOCAL_SIZE_Y, 1));

	u32 remaining     = max_points / result.dispatch.y;
	result.dispatch.x = MIN(remaining / invocations_per_dispatch, MAX(dim.x / DAS_LOCAL_SIZE_X, 1));
	result.dispatch.z = MIN(remaining / (invocations_per_dispatch * result.dispatch.x),
	                        MAX(dim.z / DAS_LOCAL_SIZE_Z, 1));

	result.target.x = MAX(dim.x / result.dispatch.x / DAS_LOCAL_SIZE_X, 1);
	result.target.y = MAX(dim.y / result.dispatch.y / DAS_LOCAL_SIZE_Y, 1);
	result.target.z = MAX(dim.z / result.dispatch.z / DAS_LOCAL_SIZE_Z, 1);

	result.points_per_dispatch = 1;
	result.points_per_dispatch *= result.dispatch.x * DAS_LOCAL_SIZE_X;
	result.points_per_dispatch *= result.dispatch.y * DAS_LOCAL_SIZE_Y;
	result.points_per_dispatch *= result.dispatch.z * DAS_LOCAL_SIZE_Z;

	result.total_points = dim.x * dim.y * dim.z;

	return result;
}

static iv3
step_compute_cursor(struct compute_cursor *cursor)
{
	cursor->cursor.x += 1;
	if (cursor->cursor.x >= cursor->target.x) {
		cursor->cursor.x  = 0;
		cursor->cursor.y += 1;
		if (cursor->cursor.y >= cursor->target.y) {
			cursor->cursor.y  = 0;
			cursor->cursor.z += 1;
		}
	}

	cursor->completed_points += cursor->points_per_dispatch;

	iv3 result = cursor->cursor;
	result.x *= cursor->dispatch.x * DAS_LOCAL_SIZE_X;
	result.y *= cursor->dispatch.y * DAS_LOCAL_SIZE_Y;
	result.z *= cursor->dispatch.z * DAS_LOCAL_SIZE_Z;

	return result;
}

static b32
compute_cursor_finished(struct compute_cursor *cursor)
{
	b32 result = cursor->completed_points >= cursor->total_points;
	return result;
}
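
/* A walkthrough of the tiling above, assuming hypothetical local sizes of
 * DAS_LOCAL_SIZE_X/Y/Z = 32/1/32, dim = {512, 1, 512}, and max_points = KB(64):
 *
 *     invocations_per_dispatch = 32 * 1 * 32 = 1024
 *     dispatch = {16, 1, 4}  -> one dispatch covers 512 x 1 x 128 voxels
 *     points_per_dispatch    = 65536
 *     target   = {1, 1, 4}   -> 4 dispatches, stepping the z offset by 128
 *
 * so the 256K-voxel volume is swept in four 64K-voxel z-slabs, keeping each
 * dispatch small enough that the driver's watchdog leaves it alone. */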

static void
do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformFrame *frame, ComputeShaderID shader)
{
	ComputeShaderCtx *csctx = &ctx->csctx;

	glUseProgram(csctx->programs[shader]);

	u32 output_ssbo_idx = !csctx->last_output_ssbo_index;
	u32 input_ssbo_idx  = csctx->last_output_ssbo_index;

	switch (shader) {
	case CS_DECODE:
	case CS_DECODE_FLOAT:
	case CS_DECODE_FLOAT_COMPLEX:
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo);
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
		glBindImageTexture(0, csctx->hadamard_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I);
		glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32),
		                  ORONE(csctx->dec_data_dim.y / 32),
		                  ORONE(csctx->dec_data_dim.z));
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_CUDA_DECODE:
		ctx->cuda_lib.cuda_decode(0, output_ssbo_idx, 0);
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_CUDA_HILBERT:
		ctx->cuda_lib.cuda_hilbert(input_ssbo_idx, output_ssbo_idx);
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_DEMOD:
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
		glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32),
		                  ORONE(csctx->dec_data_dim.y / 32),
		                  ORONE(csctx->dec_data_dim.z));
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_MIN_MAX: {
		u32 texture = frame->texture;
		for (u32 i = 1; i < frame->mips; i++) {
			glBindImageTexture(0, texture, i - 1, GL_TRUE, 0, GL_READ_ONLY,  GL_RG32F);
			glBindImageTexture(1, texture, i - 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
			glUniform1i(csctx->mips_level_id, i);

			u32 width  = frame->dim.x >> i;
			u32 height = frame->dim.y >> i;
			u32 depth  = frame->dim.z >> i;
			glDispatchCompute(ORONE(width / 32), ORONE(height), ORONE(depth / 32));
			glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
		}
	} break;
	case CS_DAS: {
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
		glBindImageTexture(0, frame->texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);

		#if 1
		/* TODO(rnp): compute max_points_per_dispatch based on something like a
		 * transmit_count * channel_count product */
		u32 max_points_per_dispatch = KB(64);
		struct compute_cursor cursor = start_compute_cursor(frame->dim, max_points_per_dispatch);
		f32 percent_per_step = (f32)cursor.points_per_dispatch / (f32)cursor.total_points;
		csctx->processing_progress = -percent_per_step;
		for (iv3 offset = {0};
		     !compute_cursor_finished(&cursor);
		     offset = step_compute_cursor(&cursor))
		{
			csctx->processing_progress += percent_per_step;
			/* IMPORTANT(rnp): prevents OS from coalescing and killing our shader */
			glFinish();
			glUniform3iv(csctx->voxel_offset_id, 1, offset.E);
			glDispatchCompute(cursor.dispatch.x, cursor.dispatch.y, cursor.dispatch.z);
		}
		#else
		/* NOTE(rnp): use this for testing tiling code. The performance of the above path
		 * should be the same as this path if everything is working correctly */
		iv3 compute_dim_offset = {0};
		glUniform3iv(csctx->voxel_offset_id, 1, compute_dim_offset.E);
		glDispatchCompute(ORONE(frame->dim.x / 32),
		                  ORONE(frame->dim.y),
		                  ORONE(frame->dim.z / 32));
		#endif
		glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT|GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
	} break;
	case CS_SUM: {
		u32 aframe_index = ctx->averaged_frame_index % ARRAY_COUNT(ctx->averaged_frames);
		BeamformFrame *aframe    = ctx->averaged_frames + aframe_index;
		aframe->ready_to_present = 0;
		/* TODO(rnp): hack; we need a better way of specifying which frames to sum.
		 * this is fine for rolling averaging but not if we want to do something else */
		ASSERT(frame >= ctx->beamform_frames);
		ASSERT(frame < ctx->beamform_frames + ARRAY_COUNT(ctx->beamform_frames));
		u32 base_index   = (u32)(frame - ctx->beamform_frames);
		u32 to_average   = ctx->params->raw.output_points.w;
		u32 frame_count  = 0;
		u32 *in_textures = alloc(&arena, u32, MAX_BEAMFORMED_SAVED_FRAMES);
		BeamformFrameIterator bfi = beamform_frame_iterator(ctx, 1 + base_index - to_average,
		                                                    to_average);
		for (BeamformFrame *it = frame_next(&bfi); it; it = frame_next(&bfi))
			in_textures[frame_count++] = it->texture;

		ASSERT(to_average == frame_count);

		do_sum_shader(csctx, in_textures, frame_count, 1 / (f32)frame_count,
		              aframe->texture, aframe->dim);
		aframe->min_coordinate = frame->min_coordinate;
		aframe->max_coordinate = frame->max_coordinate;
		aframe->compound_count = frame->compound_count;
		aframe->das_shader_id  = frame->das_shader_id;
	} break;
	default: ASSERT(0);
	}
}
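
/* The two rf_data_ssbos ping-pong through the pipeline above: every stage
 * that writes RF data flips last_output_ssbo_index, so the next stage's
 * input_ssbo_idx is always the previous stage's output. Assuming the index
 * starts at 0, a DECODE -> DEMOD -> DAS chain runs:
 *
 *     DECODE: reads raw_data_ssbo,    writes rf_data_ssbos[1]  (index -> 1)
 *     DEMOD:  reads rf_data_ssbos[1], writes rf_data_ssbos[0]  (index -> 0)
 *     DAS:    reads rf_data_ssbos[0], writes the output texture
 */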

static u32
compile_shader(OS *os, Arena a, u32 type, s8 shader, s8 name)
{
	u32 sid = glCreateShader(type);
	glShaderSource(sid, 1, (const char **)&shader.data, (int *)&shader.len);
	glCompileShader(sid);

	i32 res = 0;
	glGetShaderiv(sid, GL_COMPILE_STATUS, &res);

	if (res == GL_FALSE) {
		Stream buf = arena_stream(&a);
		stream_append_s8(&buf, name);
		stream_append_s8(&buf, s8(": failed to compile\n"));

		i32 len = 0, out_len = 0;
		glGetShaderiv(sid, GL_INFO_LOG_LENGTH, &len);
		glGetShaderInfoLog(sid, len, &out_len, (char *)(buf.data + buf.widx));
		stream_commit(&buf, out_len);
		glDeleteShader(sid);
		os->write_file(os->stderr, stream_to_s8(&buf));

		sid = 0;
	}

	return sid;
}

static u32
link_program(OS *os, Arena a, u32 shader_id)
{
	i32 success = 0;
	u32 result  = glCreateProgram();
	glAttachShader(result, shader_id);
	glLinkProgram(result);
	glGetProgramiv(result, GL_LINK_STATUS, &success);
	if (success == GL_FALSE) {
		i32 len    = 0;
		Stream buf = arena_stream(&a);
		stream_append_s8(&buf, s8("shader link error: "));
		glGetProgramInfoLog(result, buf.cap - buf.widx, &len, (c8 *)(buf.data + buf.widx));
		stream_commit(&buf, len);
		stream_append_byte(&buf, '\n');
		os->write_file(os->stderr, stream_to_s8(&buf));
		glDeleteProgram(result);
		result = 0;
	}
	return result;
}
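
/* A minimal sketch of how the two helpers above compose (the path and names
 * are hypothetical; reload_compute_shader below is the real call site):
 *
 *     s8  src     = os->read_whole_file(&arena, "shaders/example.glsl");
 *     u32 sid     = compile_shader(os, arena, GL_COMPUTE_SHADER, src, s8("example"));
 *     u32 program = 0;
 *     if (sid) {
 *             program = link_program(os, arena, sid);
 *             glDeleteShader(sid); // the linked program keeps its own copy
 *     }
 */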

static s8
push_compute_shader_header(Arena *a, ComputeShaderID shader)
{
	s8 result = {.data = a->beg};

	#define X(name, type, size, gltype, glsize, comment) "\t" #gltype " " #name #glsize "; " comment "\n"
	push_s8(a, s8("#version 460 core\n\n"
	              "layout(std140, binding = 0) uniform parameters {\n"
	              BEAMFORMER_PARAMS_HEAD
	              BEAMFORMER_UI_PARAMS
	              BEAMFORMER_PARAMS_TAIL
	              "};\n\n"));
	#undef X

	switch (shader) {
	case CS_DAS: {
		push_s8(a, s8("layout("
		              "local_size_x = " str(DAS_LOCAL_SIZE_X) ", "
		              "local_size_y = " str(DAS_LOCAL_SIZE_Y) ", "
		              "local_size_z = " str(DAS_LOCAL_SIZE_Z) ") "
		              "in;\n\n"));
		#define X(type, id, pretty, fixed_tx) push_s8(a, s8("#define DAS_ID_" #type " " #id "\n"));
		DAS_TYPES
		#undef X
	} break;
	case CS_DECODE_FLOAT:
	case CS_DECODE_FLOAT_COMPLEX: {
		if (shader == CS_DECODE_FLOAT) push_s8(a, s8("#define INPUT_DATA_TYPE_FLOAT\n\n"));
		else                           push_s8(a, s8("#define INPUT_DATA_TYPE_FLOAT_COMPLEX\n\n"));
	} /* FALLTHROUGH */
	case CS_DECODE: {
		#define X(type, id, pretty) push_s8(a, s8("#define DECODE_MODE_" #type " " #id "\n"));
		DECODE_TYPES
		#undef X
	} break;
	default: break;
	}
	s8 end = push_s8(a, s8("\n#line 1\n"));
	result.len = end.data + end.len - result.data;
	return result;
}
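
/* Roughly what the generated preamble looks like for CS_DAS (the uniform
 * block members and the DAS_ID defines come from the X-macro lists and are
 * elided here):
 *
 *     #version 460 core
 *
 *     layout(std140, binding = 0) uniform parameters {
 *             ...
 *     };
 *
 *     layout(local_size_x = ..., local_size_y = ..., local_size_z = ...) in;
 *
 *     #define DAS_ID_... ...
 *
 *     #line 1
 *
 * The trailing #line 1 resets the GLSL compiler's line counter so that error
 * messages point into the on-disk shader source rather than this header. */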

static b32
reload_compute_shader(BeamformerCtx *ctx, s8 path, s8 extra, ComputeShaderReloadContext *csr, Arena tmp)
{
	ComputeShaderCtx *cs = &ctx->csctx;
	b32 result = 0;

	/* NOTE: arena works as stack (since everything here is 1 byte aligned) */
	s8 header = {.data = tmp.beg};
	if (csr->needs_header)
		header = push_compute_shader_header(&tmp, csr->shader);

	s8 shader_text = ctx->os.read_whole_file(&tmp, (c8 *)path.data);
	shader_text.data -= header.len;
	shader_text.len  += header.len;

	if (shader_text.data == header.data) {
		u32 shader_id  = compile_shader(&ctx->os, tmp, GL_COMPUTE_SHADER, shader_text, path);
		if (shader_id) {
			u32 new_program = link_program(&ctx->os, tmp, shader_id);
			if (new_program) {
				Stream buf = arena_stream(&tmp);
				stream_append_s8(&buf, s8("loaded: "));
				stream_append_s8(&buf, path);
				stream_append_s8(&buf, extra);
				stream_append_byte(&buf, '\n');
				ctx->os.write_file(ctx->os.stderr, stream_to_s8(&buf));
				glDeleteProgram(cs->programs[csr->shader]);
				cs->programs[csr->shader] = new_program;
				glUseProgram(cs->programs[csr->shader]);
				glBindBufferBase(GL_UNIFORM_BUFFER, 0, cs->shared_ubo);
				LABEL_GL_OBJECT(GL_PROGRAM, cs->programs[csr->shader], csr->label);
				result = 1;
			}
			glDeleteShader(shader_id);
		}
	} else {
		Stream buf = arena_stream(&tmp);
		stream_append_s8(&buf, s8("failed to load: "));
		stream_append_s8(&buf, path);
		stream_append_s8(&buf, extra);
		stream_append_byte(&buf, '\n');
		ctx->os.write_file(ctx->os.stderr, stream_to_s8(&buf));
	}

	return result;
}

DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute)
{
	BeamformerCtx *ctx   = (BeamformerCtx *)user_context;
	BeamformWorkQueue *q = ctx->beamform_work_queue;
	BeamformWork *work   = beamform_work_queue_pop(q);
	ComputeShaderCtx *cs = &ctx->csctx;

	BeamformerParameters *bp = &ctx->params->raw;

	while (work) {
		b32 can_commit = 1;
		switch (work->type) {
		case BW_RELOAD_SHADER: {
			ComputeShaderReloadContext *csr = work->reload_shader_ctx;
			b32 success = reload_compute_shader(ctx, csr->path, s8(""), csr, arena);
			if (csr->shader == CS_DECODE) {
				/* TODO(rnp): think of a better way of doing this */
				csr->shader = CS_DECODE_FLOAT_COMPLEX;
				success &= reload_compute_shader(ctx, csr->path, s8(" (F32C)"), csr, arena);
				csr->shader = CS_DECODE_FLOAT;
				success &= reload_compute_shader(ctx, csr->path, s8(" (F32)"),  csr, arena);
				csr->shader = CS_DECODE;
			}

			if (success) {
				if (ctx->csctx.raw_data_ssbo) {
					can_commit = 0;
					fill_frame_compute_work(ctx, work);
				}

				/* TODO(rnp): remove this */
				#define X(idx, name) cs->name##_id = glGetUniformLocation(cs->programs[idx], "u_" #name);
				CS_UNIFORMS
				#undef X
			}
		} break;
		case BW_LOAD_RF_DATA: {
			if (cs->rf_raw_size != ctx->params->raw_data_size ||
			    !uv4_equal(cs->dec_data_dim, bp->dec_data_dim))
			{
				alloc_shader_storage(ctx, arena);
			}

			void *rf_data_buf = cs->raw_data_arena.beg;
			iz rlen = ctx->os.read_file(work->file_handle, rf_data_buf, cs->rf_raw_size);
			if (rlen != cs->rf_raw_size) {
				stream_append_s8(&ctx->error_stream, s8("Partial Read Occurred: "));
				stream_append_i64(&ctx->error_stream, rlen);
				stream_append_byte(&ctx->error_stream, '/');
				stream_append_i64(&ctx->error_stream, cs->rf_raw_size);
				stream_append_byte(&ctx->error_stream, '\n');
				ctx->os.write_file(ctx->os.stderr, stream_to_s8(&ctx->error_stream));
				ctx->error_stream.widx = 0;
			} else {
				switch (ctx->gl.vendor_id) {
				case GL_VENDOR_AMD:
				case GL_VENDOR_ARM:
				case GL_VENDOR_INTEL:
					break;
				case GL_VENDOR_NVIDIA:
					glNamedBufferSubData(cs->raw_data_ssbo, 0, rlen, rf_data_buf);
				}
			}
			ctx->ready_for_rf = 1;
		} break;
		case BW_COMPUTE: {
			atomic_store(&cs->processing_compute, 1);
			start_renderdoc_capture(gl_context);

			BeamformerWorkFrame *frame = &work->frame;
			if (ctx->params->upload) {
				glNamedBufferSubData(cs->shared_ubo, 0, sizeof(ctx->params->raw),
				                     &ctx->params->raw);
				ctx->params->upload = 0;
			}

			if (cs->programs[CS_DAS])
				glProgramUniform1ui(cs->programs[CS_DAS], cs->cycle_t_id, cycle_t++);

			uv3 try_dim = make_valid_test_dim(ctx->params->raw.output_points.xyz);
			if (!uv3_equal(try_dim, frame->store->dim))
				alloc_beamform_frame(&ctx->gl, frame->store, frame->stats, try_dim,
				                     s8("Beamformed_Data"), arena);

			if (ctx->params->raw.output_points.w > 1) {
				if (!uv3_equal(try_dim, ctx->averaged_frames[0].dim)) {
					alloc_beamform_frame(&ctx->gl, ctx->averaged_frames + 0,
					                     ctx->averaged_frame_compute_stats + 0,
					                     try_dim, s8("Averaged Frame"), arena);
					alloc_beamform_frame(&ctx->gl, ctx->averaged_frames + 1,
					                     ctx->averaged_frame_compute_stats + 1,
					                     try_dim, s8("Averaged Frame"), arena);
				}
			}

			frame->store->in_flight      = 1;
			frame->store->min_coordinate = ctx->params->raw.output_min_coordinate;
			frame->store->max_coordinate = ctx->params->raw.output_max_coordinate;
			frame->store->das_shader_id  = ctx->params->raw.das_shader_id;
			frame->store->compound_count = ctx->params->raw.dec_data_dim.z;

			b32 did_sum_shader = 0;
			u32 stage_count = ctx->params->compute_stages_count;
			ComputeShaderID *stages = ctx->params->compute_stages;
			for (u32 i = 0; i < stage_count; i++) {
				did_sum_shader |= stages[i] == CS_SUM;
				frame->stats->timer_active[stages[i]] = 1;
				glBeginQuery(GL_TIME_ELAPSED, frame->stats->timer_ids[stages[i]]);
				do_compute_shader(ctx, arena, frame->store, stages[i]);
				glEndQuery(GL_TIME_ELAPSED);
			}
			/* NOTE(rnp): block until work completes so that we can record timings */
			glFinish();
			cs->processing_progress = 1;

			for (u32 i = 0; i < ARRAY_COUNT(frame->stats->timer_ids); i++) {
				u64 ns = 0;
				if (frame->stats->timer_active[i]) {
					glGetQueryObjectui64v(frame->stats->timer_ids[i],
					                      GL_QUERY_RESULT, &ns);
					frame->stats->timer_active[i] = 0;
				}
				frame->stats->times[i] = (f32)ns / 1e9;
			}

			if (did_sum_shader) {
				u32 aframe_index = (ctx->averaged_frame_index %
				                    ARRAY_COUNT(ctx->averaged_frames));
				ctx->averaged_frames[aframe_index].ready_to_present = 1;
				/* TODO(rnp): not really sure what to do here */
				mem_copy(ctx->averaged_frame_compute_stats[aframe_index].times,
				         frame->stats->times, sizeof(frame->stats->times));
				atomic_inc(&ctx->averaged_frame_index, 1);
			}
			frame->store->ready_to_present = 1;
			cs->processing_compute         = 0;

			end_renderdoc_capture(gl_context);
		} break;
		case BW_SAVE_FRAME: {
			BeamformFrame *frame = work->output_frame_ctx.frame.store;
			ASSERT(frame->ready_to_present);
			export_frame(ctx, work->output_frame_ctx.file_handle, frame);
		} break;
		}

		if (can_commit) {
			beamform_work_queue_pop_commit(q);
			work = beamform_work_queue_pop(q);
		}
	}
}

#include "ui.c"

DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
{
	dt_for_frame = GetFrameTime();

	if (IsWindowResized()) {
		ctx->window_size.h = GetScreenHeight();
		ctx->window_size.w = GetScreenWidth();
	}

	if (input->executable_reloaded) {
		ui_init(ctx, ctx->ui_backing_store);
		DEBUG_DECL(start_frame_capture = ctx->os.start_frame_capture);
		DEBUG_DECL(end_frame_capture   = ctx->os.end_frame_capture);
	}

	if (ctx->start_compute && !input->pipe_data_available) {
		if (ctx->beamform_frames[ctx->display_frame_index].ready_to_present) {
			BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue);
			if (fill_frame_compute_work(ctx, work)) {
				beamform_work_queue_push_commit(ctx->beamform_work_queue);
				ctx->os.wake_thread(ctx->os.compute_worker.sync_handle);
				ctx->start_compute = 0;
			}
		}
	}

	BeamformerParameters *bp = &ctx->params->raw;
	if (ctx->ready_for_rf && input->pipe_data_available) {
		BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue);
		if (work) {
			ctx->start_compute = 1;
			ctx->ready_for_rf  = 0;

			work->type        = BW_LOAD_RF_DATA;
			work->file_handle = input->pipe_handle;
			beamform_work_queue_push_commit(ctx->beamform_work_queue);

			BeamformWork *compute = beamform_work_queue_push(ctx->beamform_work_queue);
			if (fill_frame_compute_work(ctx, compute))
				beamform_work_queue_push_commit(ctx->beamform_work_queue);

			if (compute && ctx->params->export_next_frame) {
				BeamformWork *export = beamform_work_queue_push(ctx->beamform_work_queue);
				if (export) {
					/* TODO: we don't really want the beamformer opening/closing files */
					iptr f = ctx->os.open_for_write(ctx->params->export_pipe_name);
					export->type = BW_SAVE_FRAME;
					export->output_frame_ctx.file_handle = f;
					if (ctx->params->raw.output_points.w > 1) {
						u32 a_index = !(ctx->averaged_frame_index %
						                ARRAY_COUNT(ctx->averaged_frames));
						BeamformFrame      *aframe = ctx->averaged_frames + a_index;
						ComputeShaderStats *astats = ctx->averaged_frame_compute_stats + a_index;
						export->output_frame_ctx.frame.store = aframe;
						export->output_frame_ctx.frame.stats = astats;
					} else {
						export->output_frame_ctx.frame = compute->frame;
					}
					beamform_work_queue_push_commit(ctx->beamform_work_queue);
				}
				ctx->params->export_next_frame = 0;
			}

			if (ctx->params->upload) {
				/* TODO(rnp): clean this up */
				ctx->ui_read_params = 1;
			}
		}
	}

	BeamformFrameIterator bfi = beamform_frame_iterator(ctx, ctx->display_frame_index,
	                                                    ctx->next_render_frame_index - ctx->display_frame_index);
	for (BeamformFrame *frame = frame_next(&bfi); frame; frame = frame_next(&bfi)) {
		if (frame->in_flight && frame->ready_to_present) {
			frame->in_flight         = 0;
			ctx->display_frame_index = frame - bfi.frames;
		}
	}

	if (ctx->start_compute) {
		ctx->start_compute = 0;
		ctx->os.wake_thread(ctx->os.compute_worker.sync_handle);
	}

	BeamformFrame      *frame_to_draw;
	ComputeShaderStats *frame_compute_stats;
	if (bp->output_points.w > 1) {
		u32 a_index = !(ctx->averaged_frame_index % ARRAY_COUNT(ctx->averaged_frames));
		frame_to_draw       = ctx->averaged_frames + a_index;
		frame_compute_stats = ctx->averaged_frame_compute_stats + a_index;
	} else {
		frame_to_draw       = ctx->beamform_frames + ctx->display_frame_index;
		frame_compute_stats = ctx->beamform_frame_compute_stats + ctx->display_frame_index;
	}

	draw_ui(ctx, input, frame_to_draw, frame_compute_stats);

	ctx->fsctx.updated = 0;

	if (WindowShouldClose())
		ctx->should_exit = 1;
}