beamformer.c
/* See LICENSE for license details. */
/* TODO(rnp):
 * [ ]: reinvestigate ring buffer raw_data_ssbo
 *      - to minimize latency the main thread should manage the subbuffer upload so that the
 *        compute thread can just keep computing. This way we can keep the compute thread busy
 *        with work while we image.
 *      - In particular we will potentially need multiple GPUComputeContexts so that we
 *        can overwrite one while the other is in use.
 *      - make use of glFenceSync to guard buffer uploads
 * [ ]: BeamformWorkQueue -> BeamformerWorkQueue
 * [ ]: bug: re-beamform on shader reload
 * [ ]: need to keep track of gpu memory in some way
 *      - want to be able to store more than 16 2D frames but limit 3D frames
 *      - maybe keep track of how much gpu memory is committed for beamformed images
 *        and use that to determine when to loop back over existing textures
 *      - to do this maybe use a circular linked list instead of a flat array
 *      - then have a way of querying how many frames are available for a specific point count
 * [ ]: bug: reinit cuda on hot-reload
 */

#include "beamformer.h"
#include "beamformer_work_queue.c"

global f32 dt_for_frame;
global u32 cycle_t;

#ifndef _DEBUG
#define start_renderdoc_capture(...)
#define end_renderdoc_capture(...)
#else
global renderdoc_start_frame_capture_fn *start_frame_capture;
global renderdoc_end_frame_capture_fn   *end_frame_capture;
#define start_renderdoc_capture(gl) if (start_frame_capture) start_frame_capture(gl, 0)
#define end_renderdoc_capture(gl)   if (end_frame_capture)   end_frame_capture(gl, 0)
#endif

typedef struct {
	BeamformerComputeFrame *frames;
	u32 capacity;
	u32 offset;
	u32 cursor;
	u32 needed_frames;
} ComputeFrameIterator;

function uv3
make_valid_test_dim(u32 in[3])
{
	uv3 result;
	result.E[0] = MAX(in[0], 1);
	result.E[1] = MAX(in[1], 1);
	result.E[2] = MAX(in[2], 1);
	return result;
}

function ComputeFrameIterator
compute_frame_iterator(BeamformerCtx *ctx, u32 start_index, u32 needed_frames)
{
	start_index = start_index % ARRAY_COUNT(ctx->beamform_frames);

	ComputeFrameIterator result;
	result.frames        = ctx->beamform_frames;
	result.offset        = start_index;
	result.capacity      = ARRAY_COUNT(ctx->beamform_frames);
	result.cursor        = 0;
	result.needed_frames = needed_frames;
	return result;
}

function BeamformerComputeFrame *
frame_next(ComputeFrameIterator *bfi)
{
	BeamformerComputeFrame *result = 0;
	if (bfi->cursor != bfi->needed_frames) {
		u32 index = (bfi->offset + bfi->cursor++) % bfi->capacity;
		result    = bfi->frames + index;
	}
	return result;
}

function void
alloc_beamform_frame(GLParams *gp, BeamformerFrame *out, uv3 out_dim, s8 name, Arena arena)
{
	out->dim.x = MAX(1, out_dim.x);
	out->dim.y = MAX(1, out_dim.y);
	out->dim.z = MAX(1, out_dim.z);

	if (gp) {
		out->dim.x = MIN(out->dim.x, gp->max_3d_texture_dim);
		out->dim.y = MIN(out->dim.y, gp->max_3d_texture_dim);
		out->dim.z = MIN(out->dim.z, gp->max_3d_texture_dim);
	}

	/* NOTE: allocate storage for beamformed output data;
	 * this is shared between compute and fragment shaders */
	u32 max_dim = MAX(out->dim.x, MAX(out->dim.y, out->dim.z));
	out->mips   = ctz_u32(round_up_power_of_2(max_dim)) + 1;

	Stream label = arena_stream(arena);
	stream_append_s8(&label, name);
	stream_append_byte(&label, '[');
	stream_append_hex_u64(&label, out->id);
	stream_append_byte(&label, ']');

	glDeleteTextures(1, &out->texture);
	glCreateTextures(GL_TEXTURE_3D, 1, &out->texture);
	glTextureStorage3D(out->texture, out->mips, GL_RG32F, out->dim.x, out->dim.y, out->dim.z);

	glTextureParameteri(out->texture, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
	glTextureParameteri(out->texture, GL_TEXTURE_MAG_FILTER, GL_NEAREST);

	LABEL_GL_OBJECT(GL_TEXTURE, out->texture, stream_to_s8(&label));
}

function void
alloc_shader_storage(BeamformerCtx *ctx, u32 rf_raw_size, Arena a)
{
	ComputeShaderCtx *cs = &ctx->csctx;
	BeamformerParameters *bp = &((BeamformerSharedMemory *)ctx->shared_memory.region)->parameters;

	cs->dec_data_dim = uv4_from_u32_array(bp->dec_data_dim);
	cs->rf_raw_size  = rf_raw_size;

	glDeleteBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
	glCreateBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);

	i32 storage_flags = GL_DYNAMIC_STORAGE_BIT;
	glDeleteBuffers(1, &cs->raw_data_ssbo);
	glCreateBuffers(1, &cs->raw_data_ssbo);
	glNamedBufferStorage(cs->raw_data_ssbo, 2 * rf_raw_size, 0, storage_flags);
	LABEL_GL_OBJECT(GL_BUFFER, cs->raw_data_ssbo, s8("Raw_RF_SSBO"));

	iz rf_decoded_size = 2 * sizeof(f32) * cs->dec_data_dim.x * cs->dec_data_dim.y * cs->dec_data_dim.z;
	Stream label = arena_stream(a);
	stream_append_s8(&label, s8("Decoded_RF_SSBO_"));
	u32 s_widx = label.widx;
	for (u32 i = 0; i < ARRAY_COUNT(cs->rf_data_ssbos); i++) {
		glNamedBufferStorage(cs->rf_data_ssbos[i], rf_decoded_size, 0, 0);
		stream_append_u64(&label, i);
		LABEL_GL_OBJECT(GL_BUFFER, cs->rf_data_ssbos[i], stream_to_s8(&label));
		stream_reset(&label, s_widx);
	}

	/* NOTE(rnp): these are stubs when CUDA isn't supported */
	ctx->cuda_lib.register_buffers(cs->rf_data_ssbos, countof(cs->rf_data_ssbos), cs->raw_data_ssbo);
	ctx->cuda_lib.init(bp->rf_raw_dim, bp->dec_data_dim);

	u32  order    = cs->dec_data_dim.z;
	i32 *hadamard = make_hadamard_transpose(&a, order);
	if (hadamard) {
		glDeleteTextures(1, &cs->hadamard_texture);
		glCreateTextures(GL_TEXTURE_2D, 1, &cs->hadamard_texture);
		glTextureStorage2D(cs->hadamard_texture, 1, GL_R8I, order, order);
		glTextureSubImage2D(cs->hadamard_texture, 0, 0, 0, order, order, GL_RED_INTEGER,
		                    GL_INT, hadamard);
		LABEL_GL_OBJECT(GL_TEXTURE, cs->hadamard_texture, s8("Hadamard_Matrix"));
	}
}

function void
push_compute_timing_info(ComputeTimingTable *t, ComputeTimingInfo info)
{
	u32 index = atomic_add_u32(&t->write_index, 1) % countof(t->buffer);
	t->buffer[index] = info;
}

function b32
fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, BeamformerViewPlaneTag plane)
{
	b32 result = 0;
	if (work) {
		result = 1;
		u32 frame_id    = atomic_add_u32(&ctx->next_render_frame_index, 1);
		u32 frame_index = frame_id % countof(ctx->beamform_frames);
		work->kind  = BeamformerWorkKind_Compute;
		work->lock  = BeamformerSharedMemoryLockKind_DispatchCompute;
		work->frame = ctx->beamform_frames + frame_index;
		work->frame->ready_to_present = 0;
		work->frame->view_plane_tag   = plane;
		work->frame->frame.id         = frame_id;
	}
	return result;
}

function void
do_sum_shader(ComputeShaderCtx *cs, u32 *in_textures, u32 in_texture_count, f32 in_scale,
              u32 out_texture, uv3 out_data_dim)
{
	/* NOTE: zero output before summing */
	glClearTexImage(out_texture, 0, GL_RED, GL_FLOAT, 0);
	glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT);

	glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F);
	glProgramUniform1f(cs->programs[BeamformerShaderKind_Sum], SUM_PRESCALE_UNIFORM_LOC, in_scale);
	for (u32 i = 0; i < in_texture_count; i++) {
		glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F);
		glDispatchCompute(ORONE(out_data_dim.x / 32),
		                  ORONE(out_data_dim.y),
		                  ORONE(out_data_dim.z / 32));
		glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
	}
}

struct compute_cursor {
	iv3 cursor;
	iv3 dispatch;
	iv3 target;
	u32 points_per_dispatch;
	u32 completed_points;
	u32 total_points;
};

function struct compute_cursor
start_compute_cursor(uv3 dim, u32 max_points)
{
	struct compute_cursor result = {0};
	u32 invocations_per_dispatch = DAS_LOCAL_SIZE_X * DAS_LOCAL_SIZE_Y * DAS_LOCAL_SIZE_Z;

	result.dispatch.y = MIN(max_points / invocations_per_dispatch, ceil_f32((f32)dim.y / DAS_LOCAL_SIZE_Y));

	u32 remaining     = max_points / result.dispatch.y;
	result.dispatch.x = MIN(remaining / invocations_per_dispatch, ceil_f32((f32)dim.x / DAS_LOCAL_SIZE_X));
	result.dispatch.z = MIN(remaining / (invocations_per_dispatch * result.dispatch.x),
	                        ceil_f32((f32)dim.z / DAS_LOCAL_SIZE_Z));

	result.target.x = MAX(dim.x / result.dispatch.x / DAS_LOCAL_SIZE_X, 1);
	result.target.y = MAX(dim.y / result.dispatch.y / DAS_LOCAL_SIZE_Y, 1);
	result.target.z = MAX(dim.z / result.dispatch.z / DAS_LOCAL_SIZE_Z, 1);

	result.points_per_dispatch = 1;
	result.points_per_dispatch *= result.dispatch.x * DAS_LOCAL_SIZE_X;
	result.points_per_dispatch *= result.dispatch.y * DAS_LOCAL_SIZE_Y;
	result.points_per_dispatch *= result.dispatch.z * DAS_LOCAL_SIZE_Z;

	result.total_points = dim.x * dim.y * dim.z;

	return result;
}

function iv3
step_compute_cursor(struct compute_cursor *cursor)
{
	cursor->cursor.x += 1;
	if (cursor->cursor.x >= cursor->target.x) {
		cursor->cursor.x  = 0;
		cursor->cursor.y += 1;
		if (cursor->cursor.y >= cursor->target.y) {
			cursor->cursor.y  = 0;
			cursor->cursor.z += 1;
		}
	}

	cursor->completed_points += cursor->points_per_dispatch;

	iv3 result = cursor->cursor;
	result.x *= cursor->dispatch.x * DAS_LOCAL_SIZE_X;
	result.y *= cursor->dispatch.y * DAS_LOCAL_SIZE_Y;
	result.z *= cursor->dispatch.z * DAS_LOCAL_SIZE_Z;

	return result;
}

function b32
compute_cursor_finished(struct compute_cursor *cursor)
{
	b32 result = cursor->completed_points >= cursor->total_points;
	return result;
}

function void
do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformerComputeFrame *frame, BeamformerShaderKind shader)
{
	ComputeShaderCtx *csctx = &ctx->csctx;
	BeamformerSharedMemory *sm = ctx->shared_memory.region;

	glUseProgram(csctx->programs[shader]);

	u32 output_ssbo_idx = !csctx->last_output_ssbo_index;
	u32 input_ssbo_idx  = csctx->last_output_ssbo_index;

	switch (shader) {
	case BeamformerShaderKind_Decode:
	case BeamformerShaderKind_DecodeFloat:
	case BeamformerShaderKind_DecodeFloatComplex:{
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, csctx->rf_data_ssbos[output_ssbo_idx]);
		glBindImageTexture(0, csctx->hadamard_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I);
		glBindImageTexture(1, csctx->channel_mapping_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I);
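
		/* Reviewer note (interpretation of the bindings below, not original commentary):
		 * raw_data_ssbo is allocated at 2 * rf_raw_size so the decode shaders can run in
		 * two passes. The first pass (u_first_pass = 1) reads raw samples from the front
		 * half (binding 1) and stages intermediate results in the back half (binding 2);
		 * the second pass reads the staged data back through binding 1 and writes the
		 * decoded output to the rf_data SSBO bound at binding 3 above. */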
		/* NOTE(rnp): decode 2 samples per dispatch when data is i16 */
		i32 local_size_x = DECODE_LOCAL_SIZE_X;
		if (shader == BeamformerShaderKind_Decode)
			local_size_x *= 2;

		iz raw_size = csctx->rf_raw_size;
		glProgramUniform1ui(csctx->programs[shader], DECODE_FIRST_PASS_UNIFORM_LOC, 1);
		glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo, 0, raw_size);
		glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 2, csctx->raw_data_ssbo, raw_size, raw_size);
		glDispatchCompute(ceil_f32((f32)csctx->dec_data_dim.x / local_size_x),
		                  ceil_f32((f32)csctx->dec_data_dim.y / DECODE_LOCAL_SIZE_Y),
		                  ceil_f32((f32)csctx->dec_data_dim.z / DECODE_LOCAL_SIZE_Z));

		glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);

		glProgramUniform1ui(csctx->programs[shader], DECODE_FIRST_PASS_UNIFORM_LOC, 0);
		glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo, raw_size, raw_size);
		glDispatchCompute(ceil_f32((f32)csctx->dec_data_dim.x / local_size_x),
		                  ceil_f32((f32)csctx->dec_data_dim.y / DECODE_LOCAL_SIZE_Y),
		                  ceil_f32((f32)csctx->dec_data_dim.z / DECODE_LOCAL_SIZE_Z));

		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
	}break;
	case BeamformerShaderKind_CudaDecode:{
		ctx->cuda_lib.decode(0, output_ssbo_idx, 0);
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
	}break;
	case BeamformerShaderKind_CudaHilbert:
		ctx->cuda_lib.hilbert(input_ssbo_idx, output_ssbo_idx);
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case BeamformerShaderKind_Demodulate:{
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
		glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32),
		                  ORONE(csctx->dec_data_dim.y / 32),
		                  ORONE(csctx->dec_data_dim.z));
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
	}break;
	case BeamformerShaderKind_MinMax:{
		u32 texture = frame->frame.texture;
		for (u32 i = 1; i < frame->frame.mips; i++) {
			glBindImageTexture(0, texture, i - 1, GL_TRUE, 0, GL_READ_ONLY,  GL_RG32F);
			glBindImageTexture(1, texture, i - 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
			glProgramUniform1i(csctx->programs[shader], MIN_MAX_MIPS_LEVEL_UNIFORM_LOC, i);

			u32 width  = frame->frame.dim.x >> i;
			u32 height = frame->frame.dim.y >> i;
			u32 depth  = frame->frame.dim.z >> i;
			glDispatchCompute(ORONE(width / 32), ORONE(height), ORONE(depth / 32));
			glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
		}
	}break;
	case BeamformerShaderKind_DASCompute:{
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
		glBindImageTexture(0, frame->frame.texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
		glBindImageTexture(1, csctx->sparse_elements_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I);
		glBindImageTexture(2, csctx->focal_vectors_texture,   0, GL_FALSE, 0, GL_READ_ONLY, GL_RG32F);

		glProgramUniform1ui(csctx->programs[shader], DAS_CYCLE_T_UNIFORM_LOC, cycle_t++);

#if 1
		/* TODO(rnp): compute max_points_per_dispatch based on something like a
		 * transmit_count * channel_count product */
		u32 max_points_per_dispatch = KB(64);
		struct compute_cursor cursor = start_compute_cursor(frame->frame.dim, max_points_per_dispatch);
		f32 percent_per_step = (f32)cursor.points_per_dispatch / (f32)cursor.total_points;
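		/* Reviewer note (interpretation of the loop below, not original commentary):
		 * the DAS pass is tiled so each glDispatchCompute covers at most
		 * max_points_per_dispatch voxels; processing_progress tracks the fraction of
		 * total voxels completed for the UI, and the glFinish between tiles keeps the
		 * driver from merging the dispatches into one long-running submission (see the
		 * IMPORTANT note inside the loop). */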
		csctx->processing_progress = -percent_per_step;
		for (iv3 offset = {0};
		     !compute_cursor_finished(&cursor);
		     offset = step_compute_cursor(&cursor))
		{
			csctx->processing_progress += percent_per_step;
			/* IMPORTANT(rnp): prevents OS from coalescing and killing our shader */
			glFinish();
			glProgramUniform3iv(csctx->programs[shader], DAS_VOXEL_OFFSET_UNIFORM_LOC,
			                    1, offset.E);
			glDispatchCompute(cursor.dispatch.x, cursor.dispatch.y, cursor.dispatch.z);
		}
#else
		/* NOTE(rnp): use this for testing tiling code. The performance of the above path
		 * should be the same as this path if everything is working correctly */
		iv3 compute_dim_offset = {0};
		glUniform3iv(csctx->voxel_offset_id, 1, compute_dim_offset.E);
		glDispatchCompute(ORONE(frame->frame.dim.x / 32),
		                  ORONE(frame->frame.dim.y),
		                  ORONE(frame->frame.dim.z / 32));
#endif
		glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT|GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
	}break;
	case BeamformerShaderKind_Sum:{
		u32 aframe_index = ctx->averaged_frame_index % ARRAY_COUNT(ctx->averaged_frames);
		BeamformerComputeFrame *aframe = ctx->averaged_frames + aframe_index;
		aframe->ready_to_present = 0;
		aframe->frame.id = ctx->averaged_frame_index;
		/* TODO(rnp): hack: we need a better way of specifying which frames to sum;
		 * this is fine for rolling averaging but what if we want to do something else */
		assert(frame >= ctx->beamform_frames);
		assert(frame < ctx->beamform_frames + countof(ctx->beamform_frames));
		u32  base_index  = (u32)(frame - ctx->beamform_frames);
		u32  to_average  = sm->parameters.output_points[3];
		u32  frame_count = 0;
		u32 *in_textures = push_array(&arena, u32, MAX_BEAMFORMED_SAVED_FRAMES);
		ComputeFrameIterator cfi = compute_frame_iterator(ctx, 1 + base_index - to_average,
		                                                  to_average);
		for (BeamformerComputeFrame *it = frame_next(&cfi); it; it = frame_next(&cfi))
			in_textures[frame_count++] = it->frame.texture;

		assert(to_average == frame_count);

		do_sum_shader(csctx, in_textures, frame_count, 1 / (f32)frame_count,
		              aframe->frame.texture, aframe->frame.dim);
		aframe->frame.min_coordinate  = frame->frame.min_coordinate;
		aframe->frame.max_coordinate  = frame->frame.max_coordinate;
		aframe->frame.compound_count  = frame->frame.compound_count;
		aframe->frame.das_shader_kind = frame->frame.das_shader_kind;
	}break;
	InvalidDefaultCase;
	}
}

function s8
shader_text_with_header(ShaderReloadContext *ctx, OS *os, Arena *arena)
{
	Stream sb = arena_stream(*arena);
	stream_append_s8s(&sb, s8("#version 460 core\n\n"), ctx->header);

	switch (ctx->kind) {
	case BeamformerShaderKind_DASCompute:{
		#define X(type, id, pretty, fixed_tx) "#define DAS_ID_" #type " " #id "\n"
		stream_append_s8(&sb, s8(""
		"layout(local_size_x = " str(DAS_LOCAL_SIZE_X) ", "
		       "local_size_y = " str(DAS_LOCAL_SIZE_Y) ", "
		       "local_size_z = " str(DAS_LOCAL_SIZE_Z) ") in;\n\n"
		"layout(location = " str(DAS_VOXEL_OFFSET_UNIFORM_LOC) ") uniform ivec3 u_voxel_offset;\n"
		"layout(location = " str(DAS_CYCLE_T_UNIFORM_LOC)      ") uniform uint u_cycle_t;\n\n"
		DAS_TYPES
		));
		#undef X
	}break;
	case BeamformerShaderKind_DecodeFloat:
	case BeamformerShaderKind_DecodeFloatComplex:{
		if (ctx->kind == BeamformerShaderKind_DecodeFloat)
			stream_append_s8(&sb, s8("#define INPUT_DATA_TYPE_FLOAT\n\n"));
		else
			stream_append_s8(&sb, s8("#define INPUT_DATA_TYPE_FLOAT_COMPLEX\n\n"));
	} /* FALLTHROUGH */
	case BeamformerShaderKind_Decode:{
		#define X(type, id, pretty) "#define DECODE_MODE_" #type " " #id "\n"
		stream_append_s8(&sb, s8(""
		"layout(local_size_x = " str(DECODE_LOCAL_SIZE_X) ", "
		       "local_size_y = " str(DECODE_LOCAL_SIZE_Y) ", "
		       "local_size_z = " str(DECODE_LOCAL_SIZE_Z) ") in;\n\n"
		"layout(location = " str(DECODE_FIRST_PASS_UNIFORM_LOC) ") uniform bool u_first_pass;\n\n"
		DECODE_TYPES
		));
		#undef X
	}break;
	case BeamformerShaderKind_MinMax:{
		stream_append_s8(&sb, s8("layout(location = " str(MIN_MAX_MIPS_LEVEL_UNIFORM_LOC)
		                         ") uniform int u_mip_map;\n\n"));
	}break;
	case BeamformerShaderKind_Sum:{
		stream_append_s8(&sb, s8("layout(location = " str(SUM_PRESCALE_UNIFORM_LOC)
		                         ") uniform float u_sum_prescale = 1.0;\n\n"));
	}break;
	default:{}break;
	}
	stream_append_s8(&sb, s8("\n#line 1\n"));

	s8 result = arena_stream_commit(arena, &sb);
	if (ctx->path.len) {
		s8 file = os->read_whole_file(arena, (c8 *)ctx->path.data);
		assert(file.data == result.data + result.len);
		result.len += file.len;
	}

	return result;
}

DEBUG_EXPORT BEAMFORMER_RELOAD_SHADER_FN(beamformer_reload_shader)
{
	i32 shader_count = 1;
	ShaderReloadContext *link = src->link;
	while (link != src) { shader_count++; link = link->link; }

	s8  *shader_texts = push_array(&arena, s8,  shader_count);
	u32 *shader_types = push_array(&arena, u32, shader_count);

	i32 index = 0;
	do {
		shader_texts[index] = shader_text_with_header(link, &ctx->os, &arena);
		shader_types[index] = link->gl_type;
		index++;
		link = link->link;
	} while (link != src);

	u32 new_program = load_shader(&ctx->os, arena, shader_texts, shader_types, shader_count, shader_name);
	if (new_program) {
		glDeleteProgram(*src->shader);
		*src->shader = new_program;
		if (src->kind == BeamformerShaderKind_Render2D) ctx->frame_view_render_context.updated = 1;
	}
	return new_program != 0;
}

function b32
reload_compute_shader(BeamformerCtx *ctx, ShaderReloadContext *src, s8 name_extra, Arena arena)
{
	Stream sb = arena_stream(arena);
	stream_append_s8s(&sb, src->name, name_extra);
	s8 name = arena_stream_commit(&arena, &sb);
	b32 result = beamformer_reload_shader(ctx, src, arena, name);
	if (result) {
		glUseProgram(*src->shader);
		glBindBufferBase(GL_UNIFORM_BUFFER, 0, ctx->csctx.shared_ubo);
	}
	return result;
}

function void
complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_context)
{
	ComputeShaderCtx *cs = &ctx->csctx;
	BeamformerSharedMemory *sm = ctx->shared_memory.region;
	BeamformerParameters *bp = &sm->parameters;

	BeamformWork *work = beamform_work_queue_pop(q);
	while (work) {
		b32 can_commit = 1;
		switch (work->kind) {
		case BeamformerWorkKind_ReloadShader:{
			ShaderReloadContext *src = work->shader_reload_context;
			b32 success = reload_compute_shader(ctx, src, s8(""), arena);
			if (src->kind == BeamformerShaderKind_Decode) {
				/* TODO(rnp): think of a better way of doing this */
				src->kind   = BeamformerShaderKind_DecodeFloatComplex;
				src->shader = cs->programs + BeamformerShaderKind_DecodeFloatComplex;
				success    &= reload_compute_shader(ctx, src, s8(" (F32C)"), arena);
				src->kind   = BeamformerShaderKind_DecodeFloat;
				src->shader = cs->programs + BeamformerShaderKind_DecodeFloat;
				success    &= reload_compute_shader(ctx, src, s8(" (F32)"), arena);
				src->kind   = BeamformerShaderKind_Decode;
				src->shader = cs->programs + BeamformerShaderKind_Decode;
			}

			if (success && ctx->csctx.raw_data_ssbo) {
				/* TODO(rnp): this check seems off */
				can_commit = 0;
				fill_frame_compute_work(ctx, work, ctx->latest_frame->view_plane_tag);
			}
		}break;
		case BeamformerWorkKind_ExportBuffer:{
			/* TODO(rnp): better way of handling DispatchCompute barrier */
			post_sync_barrier(&ctx->shared_memory, BeamformerSharedMemoryLockKind_DispatchCompute,
			                  sm->locks, ctx->os.shared_memory_region_unlock);
			ctx->os.shared_memory_region_lock(&ctx->shared_memory, sm->locks, (i32)work->lock, -1);
			BeamformerExportContext *ec = &work->export_context;
			switch (ec->kind) {
			case BeamformerExportKind_BeamformedData:{
				BeamformerComputeFrame *frame = ctx->latest_frame;
				assert(frame->ready_to_present);
				u32 texture  = frame->frame.texture;
				uv3 dim      = frame->frame.dim;
				iz  out_size = dim.x * dim.y * dim.z * 2 * sizeof(f32);
				if (out_size <= ec->size) {
					glGetTextureImage(texture, 0, GL_RG, GL_FLOAT, out_size,
					                  (u8 *)sm + BEAMFORMER_SCRATCH_OFF);
				}
			}break;
			case BeamformerExportKind_Stats:{
				ComputeTimingTable *table = ctx->compute_timing_table;
				/* NOTE(rnp): do a little spin to let this finish updating */
				while (table->write_index != atomic_load_u32(&table->read_index));
				ComputeShaderStats *stats = ctx->compute_shader_stats;
				if (sizeof(stats->table) <= ec->size)
					mem_copy((u8 *)sm + BEAMFORMER_SCRATCH_OFF, &stats->table, sizeof(stats->table));
			}break;
			InvalidDefaultCase;
			}
			ctx->os.shared_memory_region_unlock(&ctx->shared_memory, sm->locks, (i32)work->lock);
			post_sync_barrier(&ctx->shared_memory, BeamformerSharedMemoryLockKind_ExportSync, sm->locks,
			                  ctx->os.shared_memory_region_unlock);
		}break;
		case BeamformerWorkKind_UploadBuffer:{
			ctx->os.shared_memory_region_lock(&ctx->shared_memory, sm->locks, (i32)work->lock, -1);
			BeamformerUploadContext *uc = &work->upload_context;
			u32 tex_type, tex_format, tex_element_count, tex_1d = 0, buffer = 0;
			switch (uc->kind) {
			case BU_KIND_CHANNEL_MAPPING:{
				tex_1d            = cs->channel_mapping_texture;
				tex_type          = GL_SHORT;
				tex_format        = GL_RED_INTEGER;
				tex_element_count = ARRAY_COUNT(sm->channel_mapping);
				ctx->cuda_lib.set_channel_mapping(sm->channel_mapping);
			}break;
			case BU_KIND_FOCAL_VECTORS:{
				tex_1d            = cs->focal_vectors_texture;
				tex_type          = GL_FLOAT;
				tex_format        = GL_RG;
				tex_element_count = ARRAY_COUNT(sm->focal_vectors);
			}break;
			case BU_KIND_SPARSE_ELEMENTS:{
				tex_1d            = cs->sparse_elements_texture;
				tex_type          = GL_SHORT;
				tex_format        = GL_RED_INTEGER;
				tex_element_count = ARRAY_COUNT(sm->sparse_elements);
			}break;
			case BU_KIND_PARAMETERS:{
				ctx->ui_read_params = ctx->beamform_work_queue != q;
				buffer = cs->shared_ubo;
			}break;
			case BU_KIND_RF_DATA:{
				if (cs->rf_raw_size != uc->size ||
				    !uv4_equal(cs->dec_data_dim, uv4_from_u32_array(bp->dec_data_dim)))
				{
					alloc_shader_storage(ctx, uc->size, arena);
				}
				buffer = cs->raw_data_ssbo;

				ComputeTimingInfo info = {0};
				info.kind = ComputeTimingInfoKind_RF_Data;
				/* TODO(rnp): this could stall. what should we do about it? */
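				/* Reviewer note (interpretation, not original commentary): this reads
				 * back the timestamp recorded at the previous RF upload and immediately
				 * queues a new one; coalesce_timing_table() turns successive timestamps
				 * into the rf_time_deltas used for the upload-rate statistics. The query
				 * is first armed in beamformer_compute_setup() so this read never sees
				 * an unused query object. */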
				glGetQueryObjectui64v(cs->rf_data_timestamp_query, GL_QUERY_RESULT, &info.timer_count);
				glQueryCounter(cs->rf_data_timestamp_query, GL_TIMESTAMP);
				push_compute_timing_info(ctx->compute_timing_table, info);
			}break;
			InvalidDefaultCase;
			}

			if (tex_1d) {
				glTextureSubImage1D(tex_1d, 0, 0, tex_element_count, tex_format,
				                    tex_type, (u8 *)sm + uc->shared_memory_offset);
			}

			if (buffer) {
				glNamedBufferSubData(buffer, 0, uc->size,
				                     (u8 *)sm + uc->shared_memory_offset);
			}

			atomic_and_u32(&sm->dirty_regions, ~(sm->dirty_regions & 1 << (work->lock - 1)));
			ctx->os.shared_memory_region_unlock(&ctx->shared_memory, sm->locks, (i32)work->lock);
		}break;
		case BeamformerWorkKind_ComputeIndirect:{
			fill_frame_compute_work(ctx, work, work->compute_indirect_plane);
			DEBUG_DECL(work->kind = BeamformerWorkKind_ComputeIndirect;)
		} /* FALLTHROUGH */
		case BeamformerWorkKind_Compute:{
			post_sync_barrier(&ctx->shared_memory, work->lock, sm->locks,
			                  ctx->os.shared_memory_region_unlock);

			push_compute_timing_info(ctx->compute_timing_table,
			                         (ComputeTimingInfo){.kind = ComputeTimingInfoKind_ComputeFrameBegin});

			i32 mask = 1 << (BeamformerSharedMemoryLockKind_Parameters - 1);
			if (sm->dirty_regions & mask) {
				glNamedBufferSubData(cs->shared_ubo, 0, sizeof(sm->parameters), &sm->parameters);
				atomic_and_u32(&sm->dirty_regions, ~mask);
			}

			atomic_store_u32(&cs->processing_compute, 1);
			start_renderdoc_capture(gl_context);

			BeamformerComputeFrame *frame = work->frame;
			uv3 try_dim = make_valid_test_dim(bp->output_points);
			if (!uv3_equal(try_dim, frame->frame.dim))
				alloc_beamform_frame(&ctx->gl, &frame->frame, try_dim, s8("Beamformed_Data"), arena);

			if (bp->output_points[3] > 1) {
				if (!uv3_equal(try_dim, ctx->averaged_frames[0].frame.dim)) {
					alloc_beamform_frame(&ctx->gl, &ctx->averaged_frames[0].frame,
					                     try_dim, s8("Averaged Frame"), arena);
					alloc_beamform_frame(&ctx->gl, &ctx->averaged_frames[1].frame,
					                     try_dim, s8("Averaged Frame"), arena);
				}
			}

			frame->frame.min_coordinate  = v4_from_f32_array(bp->output_min_coordinate);
			frame->frame.max_coordinate  = v4_from_f32_array(bp->output_max_coordinate);
			frame->frame.das_shader_kind = bp->das_shader_id;
			frame->frame.compound_count  = bp->dec_data_dim[2];

			b32 did_sum_shader = 0;
			u32 stage_count = sm->compute_stages_count;
			BeamformerShaderKind *stages = sm->compute_stages;
			for (u32 i = 0; i < stage_count; i++) {
				did_sum_shader |= stages[i] == BeamformerShaderKind_Sum;
				glBeginQuery(GL_TIME_ELAPSED, cs->shader_timer_ids[i]);
				do_compute_shader(ctx, arena, frame, stages[i]);
				glEndQuery(GL_TIME_ELAPSED);
			}

			/* NOTE(rnp): the first of these blocks until work completes */
			for (u32 i = 0; i < stage_count; i++) {
				ComputeTimingInfo info = {0};
				info.kind   = ComputeTimingInfoKind_Shader;
				info.shader = stages[i];
				glGetQueryObjectui64v(cs->shader_timer_ids[i], GL_QUERY_RESULT, &info.timer_count);
				push_compute_timing_info(ctx->compute_timing_table, info);
			}
			cs->processing_progress = 1;

			frame->ready_to_present = 1;
			if (did_sum_shader) {
				u32 aframe_index = (ctx->averaged_frame_index % countof(ctx->averaged_frames));
				ctx->averaged_frames[aframe_index].view_plane_tag   = frame->view_plane_tag;
				ctx->averaged_frames[aframe_index].ready_to_present = 1;
				atomic_add_u32(&ctx->averaged_frame_index, 1);
				atomic_store_u64((u64 *)&ctx->latest_frame, (u64)(ctx->averaged_frames + aframe_index));
			} else {
				atomic_store_u64((u64 *)&ctx->latest_frame, (u64)frame);
			}
			cs->processing_compute = 0;

			push_compute_timing_info(ctx->compute_timing_table,
			                         (ComputeTimingInfo){.kind = ComputeTimingInfoKind_ComputeFrameEnd});

			end_renderdoc_capture(gl_context);
		}break;
		InvalidDefaultCase;
		}

		if (can_commit) {
			beamform_work_queue_pop_commit(q);
			work = beamform_work_queue_pop(q);
		}
	}
}

function void
coalesce_timing_table(ComputeTimingTable *t, ComputeShaderStats *stats)
{
	/* TODO(rnp): we do not currently do anything to handle the potential for a half written
	 * info item. this could result in garbage entries but they shouldn't really matter */

	u32 target      = atomic_load_u32(&t->write_index);
	u32 stats_index = (stats->latest_frame_index + 1) % countof(stats->table.times);

	static_assert(BeamformerShaderKind_Count + 1 <= 32, "timing coalescence bitfield test");
	u32 seen_info_test = 0;

	while (t->read_index != target) {
		ComputeTimingInfo info = t->buffer[t->read_index % countof(t->buffer)];
		switch (info.kind) {
		case ComputeTimingInfoKind_ComputeFrameBegin:{
			assert(t->compute_frame_active == 0);
			t->compute_frame_active = 1;
			/* NOTE(rnp): allow multiple instances of same shader to accumulate */
			mem_clear(stats->table.times[stats_index], 0, sizeof(stats->table.times[stats_index]));
		}break;
		case ComputeTimingInfoKind_ComputeFrameEnd:{
			assert(t->compute_frame_active == 1);
			t->compute_frame_active = 0;
			stats->latest_frame_index = stats_index;
			stats_index = (stats_index + 1) % countof(stats->table.times);
		}break;
		case ComputeTimingInfoKind_Shader:{
			stats->table.times[stats_index][info.shader] += (f32)info.timer_count / 1.0e9;
			seen_info_test |= (1 << info.shader);
		}break;
		case ComputeTimingInfoKind_RF_Data:{
			stats->latest_rf_index = (stats->latest_rf_index + 1) % countof(stats->table.rf_time_deltas);
			f32 delta = (f32)(info.timer_count - stats->last_rf_timer_count) / 1.0e9;
			stats->table.rf_time_deltas[stats->latest_rf_index] = delta;
			stats->last_rf_timer_count = info.timer_count;
			seen_info_test |= (1 << BeamformerShaderKind_Count);
		}break;
		}
		/* NOTE(rnp): do this at the end so that stats table is always in a consistent state */
		atomic_add_u32(&t->read_index, 1);
	}

	if (seen_info_test) {
		for EachEnumValue(BeamformerShaderKind, shader) {
			if (seen_info_test & (1 << shader)) {
				f32 sum = 0;
				for EachElement(stats->table.times, i)
					sum += stats->table.times[i][shader];
				stats->average_times[shader] = sum / countof(stats->table.times);
			}
		}

		if (seen_info_test & (1 << BeamformerShaderKind_Count)) {
			f32 sum = 0;
			for EachElement(stats->table.rf_time_deltas, i)
				sum += stats->table.rf_time_deltas[i];
			stats->rf_time_delta_average = sum / countof(stats->table.rf_time_deltas);
		}
	}
}

DEBUG_EXPORT BEAMFORMER_COMPUTE_SETUP_FN(beamformer_compute_setup)
{
	BeamformerCtx *ctx = (BeamformerCtx *)user_context;
	BeamformerSharedMemory *sm = ctx->shared_memory.region;
	ComputeShaderCtx *cs = &ctx->csctx;

	glCreateBuffers(1, &cs->shared_ubo);
	glNamedBufferStorage(cs->shared_ubo, sizeof(sm->parameters), 0, GL_DYNAMIC_STORAGE_BIT);

	glCreateTextures(GL_TEXTURE_1D, 1, &cs->channel_mapping_texture);
	glCreateTextures(GL_TEXTURE_1D, 1, &cs->sparse_elements_texture);
	glCreateTextures(GL_TEXTURE_1D, 1, &cs->focal_vectors_texture);
	glTextureStorage1D(cs->channel_mapping_texture, 1, GL_R16I,  ARRAY_COUNT(sm->channel_mapping));
	glTextureStorage1D(cs->sparse_elements_texture, 1, GL_R16I,  ARRAY_COUNT(sm->sparse_elements));
	glTextureStorage1D(cs->focal_vectors_texture,   1, GL_RG32F, ARRAY_COUNT(sm->focal_vectors));

	LABEL_GL_OBJECT(GL_TEXTURE, cs->channel_mapping_texture, s8("Channel_Mapping"));
	LABEL_GL_OBJECT(GL_TEXTURE, cs->focal_vectors_texture,   s8("Focal_Vectors"));
	LABEL_GL_OBJECT(GL_TEXTURE, cs->sparse_elements_texture, s8("Sparse_Elements"));
	LABEL_GL_OBJECT(GL_BUFFER,  cs->shared_ubo,              s8("Beamformer_Parameters"));

	glCreateQueries(GL_TIME_ELAPSED, countof(cs->shader_timer_ids), cs->shader_timer_ids);
	glCreateQueries(GL_TIMESTAMP, 1, &cs->rf_data_timestamp_query);

	/* NOTE(rnp): start this here so we don't have to worry about it being started or not */
	glQueryCounter(cs->rf_data_timestamp_query, GL_TIMESTAMP);
}

DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute)
{
	BeamformerCtx *ctx = (BeamformerCtx *)user_context;
	BeamformerSharedMemory *sm = ctx->shared_memory.region;
	complete_queue(ctx, &sm->external_work_queue, arena, gl_context);
	complete_queue(ctx, ctx->beamform_work_queue, arena, gl_context);
}

#include "ui.c"

DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
{
	dt_for_frame = input->dt;

	if (IsWindowResized()) {
		ctx->window_size.h = GetScreenHeight();
		ctx->window_size.w = GetScreenWidth();
	}

	coalesce_timing_table(ctx->compute_timing_table, ctx->compute_shader_stats);

	if (input->executable_reloaded) {
		ui_init(ctx, ctx->ui_backing_store);
		DEBUG_DECL(start_frame_capture = ctx->os.start_frame_capture);
		DEBUG_DECL(end_frame_capture   = ctx->os.end_frame_capture);
	}

	BeamformerSharedMemory *sm = ctx->shared_memory.region;
	if (sm->locks[BeamformerSharedMemoryLockKind_DispatchCompute] && ctx->os.compute_worker.asleep) {
		if (sm->start_compute_from_main) {
			BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue);
			BeamformerViewPlaneTag tag = ctx->latest_frame->view_plane_tag;
			if (fill_frame_compute_work(ctx, work, tag))
				beamform_work_queue_push_commit(ctx->beamform_work_queue);
			atomic_store_u32(&sm->start_compute_from_main, 0);
		}
		ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
	}

	draw_ui(ctx, input, ctx->latest_frame->ready_to_present ? &ctx->latest_frame->frame : 0,
	        ctx->latest_frame->view_plane_tag);

	ctx->frame_view_render_context.updated = 0;

	if (WindowShouldClose())
		ctx->should_exit = 1;
}