beamformer.c (29153B)
1 /* See LICENSE for license details. */ 2 /* TODO(rnp): 3 * [ ]: refactor: BeamformGPUComputeContext 4 * [ ]: refactor: compute shader timers should be generated based on the pipeline stage limit 5 * [ ]: reinvestigate ring buffer raw_data_ssbo 6 * - to minimize latency the main thread should manage the subbuffer upload so that the 7 * compute thread can just keep computing. This way we can keep the copmute thread busy 8 * with work while we image. 9 * - In particular we will potentially need multiple GPUComputeContexts so that we 10 * can overwrite one while the other is in use. 11 * - make use of glFenceSync to guard buffer uploads 12 * [ ]: BeamformWorkQueue -> BeamformerWorkQueue 13 */ 14 15 #include "beamformer.h" 16 #include "beamformer_work_queue.c" 17 18 global f32 dt_for_frame; 19 global u32 cycle_t; 20 21 #ifndef _DEBUG 22 #define start_renderdoc_capture(...) 23 #define end_renderdoc_capture(...) 24 #else 25 global renderdoc_start_frame_capture_fn *start_frame_capture; 26 global renderdoc_end_frame_capture_fn *end_frame_capture; 27 #define start_renderdoc_capture(gl) if (start_frame_capture) start_frame_capture(gl, 0) 28 #define end_renderdoc_capture(gl) if (end_frame_capture) end_frame_capture(gl, 0) 29 #endif 30 31 typedef struct { 32 BeamformComputeFrame *frames; 33 u32 capacity; 34 u32 offset; 35 u32 cursor; 36 u32 needed_frames; 37 } ComputeFrameIterator; 38 39 function uv3 40 make_valid_test_dim(u32 in[3]) 41 { 42 uv3 result; 43 result.E[0] = MAX(in[0], 1); 44 result.E[1] = MAX(in[1], 1); 45 result.E[2] = MAX(in[2], 1); 46 return result; 47 } 48 49 function ComputeFrameIterator 50 compute_frame_iterator(BeamformerCtx *ctx, u32 start_index, u32 needed_frames) 51 { 52 start_index = start_index % ARRAY_COUNT(ctx->beamform_frames); 53 54 ComputeFrameIterator result; 55 result.frames = ctx->beamform_frames; 56 result.offset = start_index; 57 result.capacity = ARRAY_COUNT(ctx->beamform_frames); 58 result.cursor = 0; 59 result.needed_frames = needed_frames; 60 return result; 61 } 62 63 function BeamformComputeFrame * 64 frame_next(ComputeFrameIterator *bfi) 65 { 66 BeamformComputeFrame *result = 0; 67 if (bfi->cursor != bfi->needed_frames) { 68 u32 index = (bfi->offset + bfi->cursor++) % bfi->capacity; 69 result = bfi->frames + index; 70 } 71 return result; 72 } 73 74 function void 75 alloc_beamform_frame(GLParams *gp, BeamformFrame *out, ComputeShaderStats *out_stats, 76 uv3 out_dim, s8 name, Arena arena) 77 { 78 out->dim.x = MAX(1, round_down_power_of_2(ORONE(out_dim.x))); 79 out->dim.y = MAX(1, round_down_power_of_2(ORONE(out_dim.y))); 80 out->dim.z = MAX(1, round_down_power_of_2(ORONE(out_dim.z))); 81 82 if (gp) { 83 out->dim.x = MIN(out->dim.x, gp->max_3d_texture_dim); 84 out->dim.y = MIN(out->dim.y, gp->max_3d_texture_dim); 85 out->dim.z = MIN(out->dim.z, gp->max_3d_texture_dim); 86 } 87 88 /* NOTE: allocate storage for beamformed output data; 89 * this is shared between compute and fragment shaders */ 90 u32 max_dim = MAX(out->dim.x, MAX(out->dim.y, out->dim.z)); 91 out->mips = ctz_u32(max_dim) + 1; 92 93 Stream label = arena_stream(arena); 94 stream_append_s8(&label, name); 95 stream_append_byte(&label, '['); 96 stream_append_hex_u64(&label, out->id); 97 stream_append_byte(&label, ']'); 98 99 glDeleteTextures(1, &out->texture); 100 glCreateTextures(GL_TEXTURE_3D, 1, &out->texture); 101 glTextureStorage3D(out->texture, out->mips, GL_RG32F, out->dim.x, out->dim.y, out->dim.z); 102 LABEL_GL_OBJECT(GL_TEXTURE, out->texture, stream_to_s8(&label)); 103 104 if (out_stats) { 105 glDeleteQueries(ARRAY_COUNT(out_stats->timer_ids), out_stats->timer_ids); 106 glCreateQueries(GL_TIME_ELAPSED, ARRAY_COUNT(out_stats->timer_ids), out_stats->timer_ids); 107 } 108 } 109 110 function void 111 alloc_shader_storage(BeamformerCtx *ctx, u32 rf_raw_size, Arena a) 112 { 113 ComputeShaderCtx *cs = &ctx->csctx; 114 BeamformerParameters *bp = &ctx->shared_memory->parameters; 115 116 cs->dec_data_dim = uv4_from_u32_array(bp->dec_data_dim); 117 cs->rf_raw_size = rf_raw_size; 118 119 glDeleteBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos); 120 glCreateBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos); 121 122 i32 storage_flags = GL_DYNAMIC_STORAGE_BIT; 123 glDeleteBuffers(1, &cs->raw_data_ssbo); 124 glCreateBuffers(1, &cs->raw_data_ssbo); 125 glNamedBufferStorage(cs->raw_data_ssbo, rf_raw_size, 0, storage_flags); 126 LABEL_GL_OBJECT(GL_BUFFER, cs->raw_data_ssbo, s8("Raw_RF_SSBO")); 127 128 iz rf_decoded_size = 2 * sizeof(f32) * cs->dec_data_dim.x * cs->dec_data_dim.y * cs->dec_data_dim.z; 129 Stream label = arena_stream(a); 130 stream_append_s8(&label, s8("Decoded_RF_SSBO_")); 131 u32 s_widx = label.widx; 132 for (u32 i = 0; i < ARRAY_COUNT(cs->rf_data_ssbos); i++) { 133 glNamedBufferStorage(cs->rf_data_ssbos[i], rf_decoded_size, 0, 0); 134 stream_append_u64(&label, i); 135 LABEL_GL_OBJECT(GL_BUFFER, cs->rf_data_ssbos[i], stream_to_s8(&label)); 136 stream_reset(&label, s_widx); 137 } 138 139 /* NOTE(rnp): these are stubs when CUDA isn't supported */ 140 ctx->cuda_lib.register_buffers(cs->rf_data_ssbos, countof(cs->rf_data_ssbos), cs->raw_data_ssbo); 141 ctx->cuda_lib.init(bp->rf_raw_dim, bp->dec_data_dim); 142 143 u32 order = cs->dec_data_dim.z; 144 i32 *hadamard = make_hadamard_transpose(&a, order); 145 if (hadamard) { 146 glDeleteTextures(1, &cs->hadamard_texture); 147 glCreateTextures(GL_TEXTURE_2D, 1, &cs->hadamard_texture); 148 glTextureStorage2D(cs->hadamard_texture, 1, GL_R8I, order, order); 149 glTextureSubImage2D(cs->hadamard_texture, 0, 0, 0, order, order, GL_RED_INTEGER, 150 GL_INT, hadamard); 151 LABEL_GL_OBJECT(GL_TEXTURE, cs->hadamard_texture, s8("Hadamard_Matrix")); 152 } 153 } 154 155 function b32 156 fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, ImagePlaneTag plane) 157 { 158 b32 result = 0; 159 if (work) { 160 result = 1; 161 u32 frame_id = atomic_inc_u32(&ctx->next_render_frame_index, 1); 162 u32 frame_index = frame_id % ARRAY_COUNT(ctx->beamform_frames); 163 work->type = BW_COMPUTE; 164 work->frame = ctx->beamform_frames + frame_index; 165 work->frame->ready_to_present = 0; 166 work->frame->frame.id = frame_id; 167 work->frame->image_plane_tag = plane; 168 } 169 return result; 170 } 171 172 function void 173 export_frame(BeamformerCtx *ctx, iptr handle, BeamformFrame *frame) 174 { 175 uv3 dim = frame->dim; 176 iz out_size = dim.x * dim.y * dim.z * 2 * sizeof(f32); 177 ctx->export_buffer = ctx->os.alloc_arena(ctx->export_buffer, out_size); 178 glGetTextureImage(frame->texture, 0, GL_RG, GL_FLOAT, out_size, ctx->export_buffer.beg); 179 s8 raw = {.len = out_size, .data = ctx->export_buffer.beg}; 180 if (!ctx->os.write_file(handle, raw)) 181 ctx->os.write_file(ctx->os.error_handle, s8("failed to export frame\n")); 182 ctx->os.close(handle); 183 } 184 185 function void 186 do_sum_shader(ComputeShaderCtx *cs, u32 *in_textures, u32 in_texture_count, f32 in_scale, 187 u32 out_texture, uv3 out_data_dim) 188 { 189 /* NOTE: zero output before summing */ 190 glClearTexImage(out_texture, 0, GL_RED, GL_FLOAT, 0); 191 glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT); 192 193 glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F); 194 glUniform1f(CS_SUM_PRESCALE_UNIFORM_LOC, in_scale); 195 for (u32 i = 0; i < in_texture_count; i++) { 196 glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F); 197 glDispatchCompute(ORONE(out_data_dim.x / 32), 198 ORONE(out_data_dim.y), 199 ORONE(out_data_dim.z / 32)); 200 glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); 201 } 202 } 203 204 struct compute_cursor { 205 iv3 cursor; 206 iv3 dispatch; 207 iv3 target; 208 u32 points_per_dispatch; 209 u32 completed_points; 210 u32 total_points; 211 }; 212 213 function struct compute_cursor 214 start_compute_cursor(uv3 dim, u32 max_points) 215 { 216 struct compute_cursor result = {0}; 217 u32 invocations_per_dispatch = DAS_LOCAL_SIZE_X * DAS_LOCAL_SIZE_Y * DAS_LOCAL_SIZE_Z; 218 219 result.dispatch.y = MIN(max_points / invocations_per_dispatch, MAX(dim.y / DAS_LOCAL_SIZE_Y, 1)); 220 221 u32 remaining = max_points / result.dispatch.y; 222 result.dispatch.x = MIN(remaining / invocations_per_dispatch, MAX(dim.x / DAS_LOCAL_SIZE_X, 1)); 223 result.dispatch.z = MIN(remaining / (invocations_per_dispatch * result.dispatch.x), 224 MAX(dim.z / DAS_LOCAL_SIZE_Z, 1)); 225 226 result.target.x = MAX(dim.x / result.dispatch.x / DAS_LOCAL_SIZE_X, 1); 227 result.target.y = MAX(dim.y / result.dispatch.y / DAS_LOCAL_SIZE_Y, 1); 228 result.target.z = MAX(dim.z / result.dispatch.z / DAS_LOCAL_SIZE_Z, 1); 229 230 result.points_per_dispatch = 1; 231 result.points_per_dispatch *= result.dispatch.x * DAS_LOCAL_SIZE_X; 232 result.points_per_dispatch *= result.dispatch.y * DAS_LOCAL_SIZE_Y; 233 result.points_per_dispatch *= result.dispatch.z * DAS_LOCAL_SIZE_Z; 234 235 result.total_points = dim.x * dim.y * dim.z; 236 237 return result; 238 } 239 240 function iv3 241 step_compute_cursor(struct compute_cursor *cursor) 242 { 243 cursor->cursor.x += 1; 244 if (cursor->cursor.x >= cursor->target.x) { 245 cursor->cursor.x = 0; 246 cursor->cursor.y += 1; 247 if (cursor->cursor.y >= cursor->target.y) { 248 cursor->cursor.y = 0; 249 cursor->cursor.z += 1; 250 } 251 } 252 253 cursor->completed_points += cursor->points_per_dispatch; 254 255 iv3 result = cursor->cursor; 256 result.x *= cursor->dispatch.x * DAS_LOCAL_SIZE_X; 257 result.y *= cursor->dispatch.y * DAS_LOCAL_SIZE_Y; 258 result.z *= cursor->dispatch.z * DAS_LOCAL_SIZE_Z; 259 260 return result; 261 } 262 263 function b32 264 compute_cursor_finished(struct compute_cursor *cursor) 265 { 266 b32 result = cursor->completed_points >= cursor->total_points; 267 return result; 268 } 269 270 function void 271 do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, ShaderKind shader) 272 { 273 ComputeShaderCtx *csctx = &ctx->csctx; 274 275 glUseProgram(csctx->programs[shader]); 276 277 u32 output_ssbo_idx = !csctx->last_output_ssbo_index; 278 u32 input_ssbo_idx = csctx->last_output_ssbo_index; 279 280 switch (shader) { 281 case ShaderKind_Decode: 282 case ShaderKind_DecodeFloat: 283 case ShaderKind_DecodeFloatComplex:{ 284 glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo); 285 glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]); 286 glBindImageTexture(0, csctx->hadamard_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I); 287 glBindImageTexture(1, csctx->channel_mapping_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I); 288 glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32), 289 ORONE(csctx->dec_data_dim.y / 32), 290 ORONE(csctx->dec_data_dim.z)); 291 csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index; 292 }break; 293 case ShaderKind_CudaDecode:{ 294 ctx->cuda_lib.decode(0, output_ssbo_idx, 0); 295 csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index; 296 }break; 297 case ShaderKind_CudaHilbert: 298 ctx->cuda_lib.hilbert(input_ssbo_idx, output_ssbo_idx); 299 csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index; 300 break; 301 case ShaderKind_Demodulate:{ 302 glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]); 303 glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]); 304 glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32), 305 ORONE(csctx->dec_data_dim.y / 32), 306 ORONE(csctx->dec_data_dim.z)); 307 csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index; 308 }break; 309 case ShaderKind_MinMax:{ 310 u32 texture = frame->frame.texture; 311 for (u32 i = 1; i < frame->frame.mips; i++) { 312 glBindImageTexture(0, texture, i - 1, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F); 313 glBindImageTexture(1, texture, i - 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F); 314 glUniform1i(CS_MIN_MAX_MIPS_LEVEL_UNIFORM_LOC, i); 315 316 u32 width = frame->frame.dim.x >> i; 317 u32 height = frame->frame.dim.y >> i; 318 u32 depth = frame->frame.dim.z >> i; 319 glDispatchCompute(ORONE(width / 32), ORONE(height), ORONE(depth / 32)); 320 glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); 321 } 322 }break; 323 case ShaderKind_DASCompute:{ 324 glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]); 325 glBindImageTexture(0, frame->frame.texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F); 326 glBindImageTexture(1, csctx->sparse_elements_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I); 327 glBindImageTexture(2, csctx->focal_vectors_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RG32F); 328 329 glUniform1ui(DAS_CYCLE_T_UNIFORM_LOC, cycle_t++); 330 331 #if 1 332 /* TODO(rnp): compute max_points_per_dispatch based on something like a 333 * transmit_count * channel_count product */ 334 u32 max_points_per_dispatch = KB(64); 335 struct compute_cursor cursor = start_compute_cursor(frame->frame.dim, max_points_per_dispatch); 336 f32 percent_per_step = (f32)cursor.points_per_dispatch / (f32)cursor.total_points; 337 csctx->processing_progress = -percent_per_step; 338 for (iv3 offset = {0}; 339 !compute_cursor_finished(&cursor); 340 offset = step_compute_cursor(&cursor)) 341 { 342 csctx->processing_progress += percent_per_step; 343 /* IMPORTANT(rnp): prevents OS from coalescing and killing our shader */ 344 glFinish(); 345 glUniform3iv(DAS_VOXEL_OFFSET_UNIFORM_LOC, 1, offset.E); 346 glDispatchCompute(cursor.dispatch.x, cursor.dispatch.y, cursor.dispatch.z); 347 } 348 #else 349 /* NOTE(rnp): use this for testing tiling code. The performance of the above path 350 * should be the same as this path if everything is working correctly */ 351 iv3 compute_dim_offset = {0}; 352 glUniform3iv(csctx->voxel_offset_id, 1, compute_dim_offset.E); 353 glDispatchCompute(ORONE(frame->frame.dim.x / 32), 354 ORONE(frame->frame.dim.y), 355 ORONE(frame->frame.dim.z / 32)); 356 #endif 357 glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT|GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); 358 }break; 359 case ShaderKind_Sum:{ 360 u32 aframe_index = ctx->averaged_frame_index % ARRAY_COUNT(ctx->averaged_frames); 361 BeamformComputeFrame *aframe = ctx->averaged_frames + aframe_index; 362 aframe->ready_to_present = 0; 363 aframe->frame.id = ctx->averaged_frame_index; 364 /* TODO(rnp): hack we need a better way of specifying which frames to sum; 365 * this is fine for rolling averaging but what if we want to do something else */ 366 ASSERT(frame >= ctx->beamform_frames); 367 ASSERT(frame < ctx->beamform_frames + ARRAY_COUNT(ctx->beamform_frames)); 368 u32 base_index = (u32)(frame - ctx->beamform_frames); 369 u32 to_average = ctx->shared_memory->parameters.output_points[3]; 370 u32 frame_count = 0; 371 u32 *in_textures = push_array(&arena, u32, MAX_BEAMFORMED_SAVED_FRAMES); 372 ComputeFrameIterator cfi = compute_frame_iterator(ctx, 1 + base_index - to_average, 373 to_average); 374 for (BeamformComputeFrame *it = frame_next(&cfi); it; it = frame_next(&cfi)) 375 in_textures[frame_count++] = it->frame.texture; 376 377 ASSERT(to_average == frame_count); 378 379 do_sum_shader(csctx, in_textures, frame_count, 1 / (f32)frame_count, 380 aframe->frame.texture, aframe->frame.dim); 381 aframe->frame.min_coordinate = frame->frame.min_coordinate; 382 aframe->frame.max_coordinate = frame->frame.max_coordinate; 383 aframe->frame.compound_count = frame->frame.compound_count; 384 aframe->frame.das_shader_kind = frame->frame.das_shader_kind; 385 }break; 386 InvalidDefaultCase; 387 } 388 } 389 390 function s8 391 shader_text_with_header(ShaderReloadContext *ctx, OS *os, Arena *arena) 392 { 393 Stream sb = arena_stream(*arena); 394 stream_append_s8s(&sb, s8("#version 460 core\n\n"), ctx->header); 395 396 switch (ctx->kind) { 397 case ShaderKind_DASCompute:{ 398 #define X(type, id, pretty, fixed_tx) "#define DAS_ID_" #type " " #id "\n" 399 stream_append_s8(&sb, s8("" 400 "layout(local_size_x = " str(DAS_LOCAL_SIZE_X) ", " 401 "local_size_y = " str(DAS_LOCAL_SIZE_Y) ", " 402 "local_size_z = " str(DAS_LOCAL_SIZE_Z) ") in;\n\n" 403 "layout(location = " str(DAS_VOXEL_OFFSET_UNIFORM_LOC) ") uniform ivec3 u_voxel_offset;\n" 404 "layout(location = " str(DAS_CYCLE_T_UNIFORM_LOC) ") uniform uint u_cycle_t;\n\n" 405 DAS_TYPES 406 )); 407 #undef X 408 }break; 409 case ShaderKind_DecodeFloat: 410 case ShaderKind_DecodeFloatComplex:{ 411 if (ctx->kind == ShaderKind_DecodeFloat) 412 stream_append_s8(&sb, s8("#define INPUT_DATA_TYPE_FLOAT\n\n")); 413 else 414 stream_append_s8(&sb, s8("#define INPUT_DATA_TYPE_FLOAT_COMPLEX\n\n")); 415 } /* FALLTHROUGH */ 416 case ShaderKind_Decode:{ 417 #define X(type, id, pretty) stream_append_s8(&sb, s8("#define DECODE_MODE_" #type " " #id "\n")); 418 DECODE_TYPES 419 #undef X 420 }break; 421 case ShaderKind_MinMax:{ 422 stream_append_s8(&sb, s8("layout(location = " str(CS_MIN_MAX_MIPS_LEVEL_UNIFORM_LOC) 423 ") uniform int u_mip_map;\n\n")); 424 }break; 425 case ShaderKind_Sum:{ 426 stream_append_s8(&sb, s8("layout(location = " str(CS_SUM_PRESCALE_UNIFORM_LOC) 427 ") uniform float u_sum_prescale = 1.0;\n\n")); 428 }break; 429 default:{}break; 430 } 431 stream_append_s8(&sb, s8("\n#line 1\n")); 432 433 s8 result = arena_stream_commit(arena, &sb); 434 if (ctx->path.len) { 435 s8 file = os->read_whole_file(arena, (c8 *)ctx->path.data); 436 assert(file.data == result.data + result.len); 437 result.len += file.len; 438 } 439 440 return result; 441 } 442 443 DEBUG_EXPORT BEAMFORMER_RELOAD_SHADER_FN(beamformer_reload_shader) 444 { 445 i32 shader_count = 1; 446 ShaderReloadContext *link = src->link; 447 while (link != src) { shader_count++; link = link->link; } 448 449 s8 *shader_texts = push_array(&arena, s8, shader_count); 450 u32 *shader_types = push_array(&arena, u32, shader_count); 451 452 i32 index = 0; 453 do { 454 shader_texts[index] = shader_text_with_header(link, &ctx->os, &arena); 455 shader_types[index] = link->gl_type; 456 index++; 457 link = link->link; 458 } while (link != src); 459 460 u32 new_program = load_shader(&ctx->os, arena, shader_texts, shader_types, shader_count, shader_name); 461 if (new_program) { 462 glDeleteProgram(*src->shader); 463 *src->shader = new_program; 464 if (src->kind == ShaderKind_Render2D) ctx->frame_view_render_context.updated = 1; 465 } 466 return new_program != 0; 467 } 468 469 function b32 470 reload_compute_shader(BeamformerCtx *ctx, ShaderReloadContext *src, s8 name_extra, Arena arena) 471 { 472 Stream sb = arena_stream(arena); 473 stream_append_s8s(&sb, src->name, name_extra); 474 s8 name = arena_stream_commit(&arena, &sb); 475 b32 result = beamformer_reload_shader(ctx, src, arena, name); 476 if (result) { 477 glUseProgram(*src->shader); 478 glBindBufferBase(GL_UNIFORM_BUFFER, 0, ctx->csctx.shared_ubo); 479 } 480 return result; 481 } 482 483 function void 484 complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_context, iz barrier_offset) 485 { 486 ComputeShaderCtx *cs = &ctx->csctx; 487 BeamformerParameters *bp = &ctx->shared_memory->parameters; 488 BeamformerSharedMemory *sm = ctx->shared_memory; 489 490 BeamformWork *work = beamform_work_queue_pop(q); 491 while (work) { 492 b32 can_commit = 1; 493 switch (work->type) { 494 case BW_RELOAD_SHADER: { 495 ShaderReloadContext *src = work->shader_reload_context; 496 b32 success = reload_compute_shader(ctx, src, s8(""), arena); 497 if (src->kind == ShaderKind_Decode) { 498 /* TODO(rnp): think of a better way of doing this */ 499 src->kind = ShaderKind_DecodeFloatComplex; 500 src->shader = cs->programs + ShaderKind_DecodeFloatComplex; 501 success &= reload_compute_shader(ctx, src, s8(" (F32C)"), arena); 502 src->kind = ShaderKind_DecodeFloat; 503 src->shader = cs->programs + ShaderKind_DecodeFloat; 504 success &= reload_compute_shader(ctx, src, s8(" (F32)"), arena); 505 src->kind = ShaderKind_Decode; 506 src->shader = cs->programs + ShaderKind_Decode; 507 } 508 509 if (success) { 510 /* TODO(rnp): this check seems off */ 511 if (ctx->csctx.raw_data_ssbo) { 512 can_commit = 0; 513 ImagePlaneTag plane = ctx->beamform_frames[ctx->display_frame_index].image_plane_tag; 514 fill_frame_compute_work(ctx, work, plane); 515 } 516 } 517 } break; 518 case BW_UPLOAD_BUFFER: { 519 ASSERT(!atomic_load((i32 *)(barrier_offset + work->completion_barrier))); 520 BeamformerUploadContext *uc = &work->upload_context; 521 u32 tex_type, tex_format, tex_element_count, tex_1d = 0, buffer = 0; 522 switch (uc->kind) { 523 case BU_KIND_CHANNEL_MAPPING: { 524 tex_1d = cs->channel_mapping_texture; 525 tex_type = GL_SHORT; 526 tex_format = GL_RED_INTEGER; 527 tex_element_count = ARRAY_COUNT(sm->channel_mapping); 528 ctx->cuda_lib.set_channel_mapping(sm->channel_mapping); 529 } break; 530 case BU_KIND_FOCAL_VECTORS: { 531 tex_1d = cs->focal_vectors_texture; 532 tex_type = GL_FLOAT; 533 tex_format = GL_RG; 534 tex_element_count = ARRAY_COUNT(sm->focal_vectors); 535 } break; 536 case BU_KIND_SPARSE_ELEMENTS: { 537 tex_1d = cs->sparse_elements_texture; 538 tex_type = GL_SHORT; 539 tex_format = GL_RED_INTEGER; 540 tex_element_count = ARRAY_COUNT(sm->sparse_elements); 541 } break; 542 case BU_KIND_PARAMETERS: { 543 ctx->ui_read_params = barrier_offset != 0; 544 buffer = cs->shared_ubo; 545 } break; 546 case BU_KIND_RF_DATA: { 547 if (cs->rf_raw_size != uc->size || 548 !uv4_equal(cs->dec_data_dim, uv4_from_u32_array(bp->dec_data_dim))) 549 { 550 alloc_shader_storage(ctx, uc->size, arena); 551 } 552 buffer = cs->raw_data_ssbo; 553 } break; 554 default: INVALID_CODE_PATH; break; 555 } 556 557 if (tex_1d) { 558 glTextureSubImage1D(tex_1d, 0, 0, tex_element_count, tex_format, 559 tex_type, (u8 *)sm + uc->shared_memory_offset); 560 } 561 562 if (buffer) { 563 glNamedBufferSubData(buffer, 0, uc->size, 564 (u8 *)sm + uc->shared_memory_offset); 565 } 566 } break; 567 case BW_COMPUTE: { 568 atomic_store(&cs->processing_compute, 1); 569 start_renderdoc_capture(gl_context); 570 571 BeamformComputeFrame *frame = work->frame; 572 uv3 try_dim = make_valid_test_dim(bp->output_points); 573 if (!uv3_equal(try_dim, frame->frame.dim)) 574 alloc_beamform_frame(&ctx->gl, &frame->frame, &frame->stats, try_dim, 575 s8("Beamformed_Data"), arena); 576 577 if (bp->output_points[3] > 1) { 578 if (!uv3_equal(try_dim, ctx->averaged_frames[0].frame.dim)) { 579 alloc_beamform_frame(&ctx->gl, &ctx->averaged_frames[0].frame, 580 &ctx->averaged_frames[0].stats, 581 try_dim, s8("Averaged Frame"), arena); 582 alloc_beamform_frame(&ctx->gl, &ctx->averaged_frames[1].frame, 583 &ctx->averaged_frames[1].stats, 584 try_dim, s8("Averaged Frame"), arena); 585 } 586 } 587 588 frame->in_flight = 1; 589 frame->frame.min_coordinate = v4_from_f32_array(bp->output_min_coordinate); 590 frame->frame.max_coordinate = v4_from_f32_array(bp->output_max_coordinate); 591 frame->frame.das_shader_kind = bp->das_shader_id; 592 frame->frame.compound_count = bp->dec_data_dim[2]; 593 594 b32 did_sum_shader = 0; 595 u32 stage_count = sm->compute_stages_count; 596 ComputeShaderKind *stages = sm->compute_stages; 597 for (u32 i = 0; i < stage_count; i++) { 598 did_sum_shader |= stages[i] == ComputeShaderKind_Sum; 599 frame->stats.timer_active[stages[i]] = 1; 600 glBeginQuery(GL_TIME_ELAPSED, frame->stats.timer_ids[stages[i]]); 601 do_compute_shader(ctx, arena, frame, (ShaderKind)stages[i]); 602 glEndQuery(GL_TIME_ELAPSED); 603 } 604 /* NOTE(rnp): block until work completes so that we can record timings */ 605 glFinish(); 606 cs->processing_progress = 1; 607 608 for (u32 i = 0; i < ARRAY_COUNT(frame->stats.timer_ids); i++) { 609 u64 ns = 0; 610 if (frame->stats.timer_active[i]) { 611 glGetQueryObjectui64v(frame->stats.timer_ids[i], 612 GL_QUERY_RESULT, &ns); 613 frame->stats.timer_active[i] = 0; 614 } 615 frame->stats.times[i] = (f32)ns / 1e9; 616 } 617 618 if (did_sum_shader) { 619 u32 aframe_index = (ctx->averaged_frame_index % 620 ARRAY_COUNT(ctx->averaged_frames)); 621 ctx->averaged_frames[aframe_index].image_plane_tag = frame->image_plane_tag; 622 ctx->averaged_frames[aframe_index].ready_to_present = 1; 623 /* TODO(rnp): not really sure what to do here */ 624 mem_copy(&ctx->averaged_frames[aframe_index].stats.times, 625 &frame->stats.times, sizeof(frame->stats.times)); 626 atomic_inc_u32(&ctx->averaged_frame_index, 1); 627 } 628 frame->ready_to_present = 1; 629 cs->processing_compute = 0; 630 631 end_renderdoc_capture(gl_context); 632 } break; 633 case BW_SAVE_FRAME: { 634 BeamformComputeFrame *frame = work->output_frame_ctx.frame; 635 if (frame->ready_to_present) { 636 export_frame(ctx, work->output_frame_ctx.file_handle, &frame->frame); 637 } else { 638 /* TODO(rnp): should we handle this? */ 639 INVALID_CODE_PATH; 640 } 641 } break; 642 default: INVALID_CODE_PATH; break; 643 } 644 645 if (can_commit) { 646 if (work->completion_barrier) { 647 i32 *value = (i32 *)(barrier_offset + work->completion_barrier); 648 ctx->os.wake_waiters(value); 649 } 650 beamform_work_queue_pop_commit(q); 651 work = beamform_work_queue_pop(q); 652 } 653 } 654 } 655 656 DEBUG_EXPORT BEAMFORMER_COMPUTE_SETUP_FN(beamformer_compute_setup) 657 { 658 BeamformerCtx *ctx = (BeamformerCtx *)user_context; 659 BeamformerSharedMemory *sm = ctx->shared_memory; 660 ComputeShaderCtx *cs = &ctx->csctx; 661 662 glCreateBuffers(1, &cs->shared_ubo); 663 glNamedBufferStorage(cs->shared_ubo, sizeof(sm->parameters), 0, GL_DYNAMIC_STORAGE_BIT); 664 665 glCreateTextures(GL_TEXTURE_1D, 1, &cs->channel_mapping_texture); 666 glCreateTextures(GL_TEXTURE_1D, 1, &cs->sparse_elements_texture); 667 glCreateTextures(GL_TEXTURE_1D, 1, &cs->focal_vectors_texture); 668 glTextureStorage1D(cs->channel_mapping_texture, 1, GL_R16I, ARRAY_COUNT(sm->channel_mapping)); 669 glTextureStorage1D(cs->sparse_elements_texture, 1, GL_R16I, ARRAY_COUNT(sm->sparse_elements)); 670 glTextureStorage1D(cs->focal_vectors_texture, 1, GL_RG32F, ARRAY_COUNT(sm->focal_vectors)); 671 672 LABEL_GL_OBJECT(GL_TEXTURE, cs->channel_mapping_texture, s8("Channel_Mapping")); 673 LABEL_GL_OBJECT(GL_TEXTURE, cs->focal_vectors_texture, s8("Focal_Vectors")); 674 LABEL_GL_OBJECT(GL_TEXTURE, cs->sparse_elements_texture, s8("Sparse_Elements")); 675 LABEL_GL_OBJECT(GL_BUFFER, cs->shared_ubo, s8("Beamformer_Parameters")); 676 } 677 678 DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute) 679 { 680 BeamformerCtx *ctx = (BeamformerCtx *)user_context; 681 complete_queue(ctx, &ctx->shared_memory->external_work_queue, arena, gl_context, (iz)ctx->shared_memory); 682 complete_queue(ctx, ctx->beamform_work_queue, arena, gl_context, 0); 683 } 684 685 #include "ui.c" 686 687 DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step) 688 { 689 dt_for_frame = GetFrameTime(); 690 691 if (IsWindowResized()) { 692 ctx->window_size.h = GetScreenHeight(); 693 ctx->window_size.w = GetScreenWidth(); 694 } 695 696 if (input->executable_reloaded) { 697 ui_init(ctx, ctx->ui_backing_store); 698 DEBUG_DECL(start_frame_capture = ctx->os.start_frame_capture); 699 DEBUG_DECL(end_frame_capture = ctx->os.end_frame_capture); 700 } 701 702 BeamformerParameters *bp = &ctx->shared_memory->parameters; 703 if (ctx->shared_memory->dispatch_compute_sync) { 704 ImagePlaneTag current_plane = ctx->shared_memory->current_image_plane; 705 atomic_store(&ctx->shared_memory->dispatch_compute_sync, 0); 706 BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue); 707 if (work) { 708 if (fill_frame_compute_work(ctx, work, current_plane)) 709 beamform_work_queue_push_commit(ctx->beamform_work_queue); 710 711 if (ctx->shared_memory->export_next_frame) { 712 BeamformWork *export = beamform_work_queue_push(ctx->beamform_work_queue); 713 if (export) { 714 /* TODO: we don't really want the beamformer opening/closing files */ 715 iptr f = ctx->os.open_for_write(ctx->os.export_pipe_name); 716 export->type = BW_SAVE_FRAME; 717 export->output_frame_ctx.file_handle = f; 718 if (bp->output_points[3] > 1) { 719 u32 a_index = !(ctx->averaged_frame_index % 720 ARRAY_COUNT(ctx->averaged_frames)); 721 BeamformComputeFrame *aframe = ctx->averaged_frames + a_index; 722 export->output_frame_ctx.frame = aframe; 723 } else { 724 export->output_frame_ctx.frame = work->frame; 725 } 726 beamform_work_queue_push_commit(ctx->beamform_work_queue); 727 } 728 ctx->shared_memory->export_next_frame = 0; 729 } 730 731 ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable); 732 } 733 } 734 735 if (ctx->start_compute) { 736 if (ctx->beamform_frames[ctx->display_frame_index].ready_to_present) { 737 BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue); 738 ImagePlaneTag plane = ctx->beamform_frames[ctx->display_frame_index].image_plane_tag; 739 if (fill_frame_compute_work(ctx, work, plane)) { 740 beamform_work_queue_push_commit(ctx->beamform_work_queue); 741 ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable); 742 ctx->start_compute = 0; 743 } 744 } 745 } 746 747 ComputeFrameIterator cfi = compute_frame_iterator(ctx, ctx->display_frame_index, 748 ctx->next_render_frame_index - ctx->display_frame_index); 749 for (BeamformComputeFrame *frame = frame_next(&cfi); frame; frame = frame_next(&cfi)) { 750 if (frame->in_flight && frame->ready_to_present) { 751 frame->in_flight = 0; 752 ctx->display_frame_index = frame - cfi.frames; 753 } 754 } 755 756 if (ctx->start_compute) { 757 ctx->start_compute = 0; 758 ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable); 759 } 760 761 BeamformComputeFrame *frame_to_draw; 762 if (bp->output_points[3] > 1) { 763 u32 a_index = !(ctx->averaged_frame_index % ARRAY_COUNT(ctx->averaged_frames)); 764 frame_to_draw = ctx->averaged_frames + a_index; 765 } else { 766 frame_to_draw = ctx->beamform_frames + ctx->display_frame_index; 767 } 768 769 draw_ui(ctx, input, frame_to_draw->ready_to_present? &frame_to_draw->frame : 0, 770 frame_to_draw->image_plane_tag, &frame_to_draw->stats); 771 772 ctx->frame_view_render_context.updated = 0; 773 774 if (WindowShouldClose()) 775 ctx->should_exit = 1; 776 }