beamformer.c
/* See LICENSE for license details. */
/* TODO(rnp):
 * [ ]: refactor: BeamformGPUComputeContext
 * [ ]: refactor: compute shader timers should be generated based on the pipeline stage limit
 * [ ]: reinvestigate ring buffer raw_data_ssbo
 *      - to minimize latency the main thread should manage the subbuffer upload so that the
 *        compute thread can just keep computing. This way we can keep the compute thread busy
 *        with work while we image.
 *      - In particular we will potentially need multiple GPUComputeContexts so that we
 *        can overwrite one while the other is in use.
 *      - make use of glFenceSync to guard buffer uploads (see the sketch following
 *        alloc_beamform_frame below)
 */

#include "beamformer.h"
#include "beamformer_work_queue.c"

global f32 dt_for_frame;
global u32 cycle_t;

#ifndef _DEBUG
#define start_renderdoc_capture(...)
#define end_renderdoc_capture(...)
#else
static renderdoc_start_frame_capture_fn *start_frame_capture;
static renderdoc_end_frame_capture_fn   *end_frame_capture;
#define start_renderdoc_capture(gl) if (start_frame_capture) start_frame_capture(gl, 0)
#define end_renderdoc_capture(gl)   if (end_frame_capture)   end_frame_capture(gl, 0)
#endif

typedef struct {
	BeamformComputeFrame *frames;
	u32 capacity;
	u32 offset;
	u32 cursor;
	u32 needed_frames;
} ComputeFrameIterator;

function uv3
make_valid_test_dim(u32 in[3])
{
	uv3 result;
	result.E[0] = MAX(in[0], 1);
	result.E[1] = MAX(in[1], 1);
	result.E[2] = MAX(in[2], 1);
	return result;
}

function ComputeFrameIterator
compute_frame_iterator(BeamformerCtx *ctx, u32 start_index, u32 needed_frames)
{
	start_index = start_index % ARRAY_COUNT(ctx->beamform_frames);

	ComputeFrameIterator result;
	result.frames        = ctx->beamform_frames;
	result.offset        = start_index;
	result.capacity      = ARRAY_COUNT(ctx->beamform_frames);
	result.cursor        = 0;
	result.needed_frames = needed_frames;
	return result;
}

static BeamformComputeFrame *
frame_next(ComputeFrameIterator *bfi)
{
	BeamformComputeFrame *result = 0;
	if (bfi->cursor != bfi->needed_frames) {
		u32 index = (bfi->offset + bfi->cursor++) % bfi->capacity;
		result = bfi->frames + index;
	}
	return result;
}

static void
alloc_beamform_frame(GLParams *gp, BeamformFrame *out, ComputeShaderStats *out_stats,
                     uv3 out_dim, s8 name, Arena arena)
{
	out->dim.x = MAX(1, round_down_power_of_2(ORONE(out_dim.x)));
	out->dim.y = MAX(1, round_down_power_of_2(ORONE(out_dim.y)));
	out->dim.z = MAX(1, round_down_power_of_2(ORONE(out_dim.z)));

	if (gp) {
		out->dim.x = MIN(out->dim.x, gp->max_3d_texture_dim);
		out->dim.y = MIN(out->dim.y, gp->max_3d_texture_dim);
		out->dim.z = MIN(out->dim.z, gp->max_3d_texture_dim);
	}

	/* NOTE: allocate storage for beamformed output data;
	 * this is shared between compute and fragment shaders */
	u32 max_dim = MAX(out->dim.x, MAX(out->dim.y, out->dim.z));
	out->mips   = ctz_u32(max_dim) + 1;

	Stream label = arena_stream(arena);
	stream_append_s8(&label, name);
	stream_append_byte(&label, '[');
	stream_append_hex_u64(&label, out->id);
	stream_append_byte(&label, ']');

	glDeleteTextures(1, &out->texture);
	glCreateTextures(GL_TEXTURE_3D, 1, &out->texture);
	glTextureStorage3D(out->texture, out->mips, GL_RG32F, out->dim.x, out->dim.y, out->dim.z);
	LABEL_GL_OBJECT(GL_TEXTURE, out->texture, stream_to_s8(&label));

	if (out_stats) {
		glDeleteQueries(ARRAY_COUNT(out_stats->timer_ids), out_stats->timer_ids);
		glCreateQueries(GL_TIME_ELAPSED, ARRAY_COUNT(out_stats->timer_ids), out_stats->timer_ids);
	}
}
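
/* NOTE(rnp): minimal sketch of the glFenceSync guard envisioned by the TODO at the
 * top of this file. Illustrative only: `upload_fence` is a hypothetical field and
 * the call site would live wherever raw data uploads are issued. The fence is
 * inserted after the commands that read the buffer; the next upload waits on it so
 * the copy cannot race in-flight compute work. */
#if 0
static void
guarded_raw_data_upload(u32 ssbo, GLsync *upload_fence, iz size, void *data)
{
	if (*upload_fence) {
		/* block (flushing if needed) until the GPU has finished reading the
		 * previous contents; a real version would handle GL_TIMEOUT_EXPIRED */
		glClientWaitSync(*upload_fence, GL_SYNC_FLUSH_COMMANDS_BIT, (u64)1e9);
		glDeleteSync(*upload_fence);
		*upload_fence = 0;
	}
	glNamedBufferSubData(ssbo, 0, size, data);
	/* ... dispatch the compute work that consumes ssbo ... */
	*upload_fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
}
#endif
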
function void
alloc_shader_storage(BeamformerCtx *ctx, u32 rf_raw_size, Arena a)
{
	ComputeShaderCtx     *cs = &ctx->csctx;
	BeamformerParameters *bp = &ctx->shared_memory->parameters;

	cs->dec_data_dim = uv4_from_u32_array(bp->dec_data_dim);
	cs->rf_raw_size  = rf_raw_size;

	glDeleteBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
	glCreateBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);

	i32 storage_flags = GL_DYNAMIC_STORAGE_BIT;
	glDeleteBuffers(1, &cs->raw_data_ssbo);
	glCreateBuffers(1, &cs->raw_data_ssbo);
	glNamedBufferStorage(cs->raw_data_ssbo, rf_raw_size, 0, storage_flags);
	LABEL_GL_OBJECT(GL_BUFFER, cs->raw_data_ssbo, s8("Raw_RF_SSBO"));

	iz rf_decoded_size = 2 * sizeof(f32) * cs->dec_data_dim.x * cs->dec_data_dim.y * cs->dec_data_dim.z;
	Stream label = arena_stream(a);
	stream_append_s8(&label, s8("Decoded_RF_SSBO_"));
	u32 s_widx = label.widx;
	for (u32 i = 0; i < ARRAY_COUNT(cs->rf_data_ssbos); i++) {
		glNamedBufferStorage(cs->rf_data_ssbos[i], rf_decoded_size, 0, 0);
		stream_append_u64(&label, i);
		LABEL_GL_OBJECT(GL_BUFFER, cs->rf_data_ssbos[i], stream_to_s8(&label));
		stream_reset(&label, s_widx);
	}

	/* NOTE(rnp): these are stubs when CUDA isn't supported */
	ctx->cuda_lib.register_cuda_buffers(cs->rf_data_ssbos, ARRAY_COUNT(cs->rf_data_ssbos),
	                                    cs->raw_data_ssbo);
	ctx->cuda_lib.init_cuda_configuration(bp->rf_raw_dim, bp->dec_data_dim,
	                                      ctx->shared_memory->channel_mapping);

	u32  order    = cs->dec_data_dim.z;
	i32 *hadamard = make_hadamard_transpose(&a, order);
	if (hadamard) {
		glDeleteTextures(1, &cs->hadamard_texture);
		glCreateTextures(GL_TEXTURE_2D, 1, &cs->hadamard_texture);
		glTextureStorage2D(cs->hadamard_texture, 1, GL_R8I, order, order);
		glTextureSubImage2D(cs->hadamard_texture, 0, 0, 0, order, order, GL_RED_INTEGER,
		                    GL_INT, hadamard);
		LABEL_GL_OBJECT(GL_TEXTURE, cs->hadamard_texture, s8("Hadamard_Matrix"));
	}
}
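
/* NOTE(rnp): illustrative order-2 example of why the texture above holds the
 * *transposed* Hadamard matrix. For a Hadamard matrix H of order N, H * H^T = N * I,
 * so its inverse is H^T / N. With Hadamard-encoded transmits y = H x, decoding
 * recovers x = (1/N) H^T y; e.g. for N = 2:
 *
 *   H = | 1  1 |    y0 = x0 + x1    x0 = (y0 + y1) / 2
 *       | 1 -1 |    y1 = x0 - x1    x1 = (y0 - y1) / 2
 *
 * The decode shader reads H^T out of hadamard_texture; whether and where the 1/N
 * scaling is applied is the shader's concern and is not shown here. */
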
function b32
fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, ImagePlaneTag plane)
{
	b32 result = 0;
	if (work) {
		result = 1;
		u32 frame_id    = atomic_inc(&ctx->next_render_frame_index, 1);
		u32 frame_index = frame_id % ARRAY_COUNT(ctx->beamform_frames);
		work->type  = BW_COMPUTE;
		work->frame = ctx->beamform_frames + frame_index;
		work->frame->ready_to_present = 0;
		work->frame->frame.id         = frame_id;
		work->frame->image_plane_tag  = plane;
	}
	return result;
}

static void
export_frame(BeamformerCtx *ctx, iptr handle, BeamformFrame *frame)
{
	uv3 dim = frame->dim;
	iz  out_size = dim.x * dim.y * dim.z * 2 * sizeof(f32);
	ctx->export_buffer = ctx->os.alloc_arena(ctx->export_buffer, out_size);
	glGetTextureImage(frame->texture, 0, GL_RG, GL_FLOAT, out_size, ctx->export_buffer.beg);
	s8 raw = {.len = out_size, .data = ctx->export_buffer.beg};
	if (!ctx->os.write_file(handle, raw))
		ctx->os.write_file(ctx->os.stderr, s8("failed to export frame\n"));
	ctx->os.close(handle);
}

static void
do_sum_shader(ComputeShaderCtx *cs, u32 *in_textures, u32 in_texture_count, f32 in_scale,
              u32 out_texture, uv3 out_data_dim)
{
	/* NOTE: zero output before summing */
	glClearTexImage(out_texture, 0, GL_RED, GL_FLOAT, 0);
	glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT);

	glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F);
	glUniform1f(CS_SUM_PRESCALE_UNIFORM_LOC, in_scale);
	for (u32 i = 0; i < in_texture_count; i++) {
		glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F);
		glDispatchCompute(ORONE(out_data_dim.x / 32),
		                  ORONE(out_data_dim.y),
		                  ORONE(out_data_dim.z / 32));
		glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
	}
}

struct compute_cursor {
	iv3 cursor;
	iv3 dispatch;
	iv3 target;
	u32 points_per_dispatch;
	u32 completed_points;
	u32 total_points;
};

static struct compute_cursor
start_compute_cursor(uv3 dim, u32 max_points)
{
	struct compute_cursor result = {0};
	u32 invocations_per_dispatch = DAS_LOCAL_SIZE_X * DAS_LOCAL_SIZE_Y * DAS_LOCAL_SIZE_Z;

	result.dispatch.y = MIN(max_points / invocations_per_dispatch, MAX(dim.y / DAS_LOCAL_SIZE_Y, 1));

	u32 remaining     = max_points / result.dispatch.y;
	result.dispatch.x = MIN(remaining / invocations_per_dispatch, MAX(dim.x / DAS_LOCAL_SIZE_X, 1));
	result.dispatch.z = MIN(remaining / (invocations_per_dispatch * result.dispatch.x),
	                        MAX(dim.z / DAS_LOCAL_SIZE_Z, 1));

	result.target.x = MAX(dim.x / result.dispatch.x / DAS_LOCAL_SIZE_X, 1);
	result.target.y = MAX(dim.y / result.dispatch.y / DAS_LOCAL_SIZE_Y, 1);
	result.target.z = MAX(dim.z / result.dispatch.z / DAS_LOCAL_SIZE_Z, 1);

	result.points_per_dispatch = 1;
	result.points_per_dispatch *= result.dispatch.x * DAS_LOCAL_SIZE_X;
	result.points_per_dispatch *= result.dispatch.y * DAS_LOCAL_SIZE_Y;
	result.points_per_dispatch *= result.dispatch.z * DAS_LOCAL_SIZE_Z;

	result.total_points = dim.x * dim.y * dim.z;

	return result;
}

static iv3
step_compute_cursor(struct compute_cursor *cursor)
{
	cursor->cursor.x += 1;
	if (cursor->cursor.x >= cursor->target.x) {
		cursor->cursor.x  = 0;
		cursor->cursor.y += 1;
		if (cursor->cursor.y >= cursor->target.y) {
			cursor->cursor.y  = 0;
			cursor->cursor.z += 1;
		}
	}

	cursor->completed_points += cursor->points_per_dispatch;

	iv3 result = cursor->cursor;
	result.x *= cursor->dispatch.x * DAS_LOCAL_SIZE_X;
	result.y *= cursor->dispatch.y * DAS_LOCAL_SIZE_Y;
	result.z *= cursor->dispatch.z * DAS_LOCAL_SIZE_Z;

	return result;
}

static b32
compute_cursor_finished(struct compute_cursor *cursor)
{
	b32 result = cursor->completed_points >= cursor->total_points;
	return result;
}
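
/* NOTE(rnp): worked example of the cursor math, assuming (illustratively)
 * DAS_LOCAL_SIZE_X/Y/Z = 32/1/32 and max_points = KB(64) = 65536:
 *   invocations_per_dispatch = 32 * 1 * 32 = 1024
 * for dim = {512, 1024, 64}:
 *   dispatch = {1, 64, 1}   (y first: MIN(65536/1024, 1024/1) = 64;
 *                            then x:  MIN(1024/1024,  512/32) = 1;
 *                            then z:  MIN(1024/1024,   64/32) = 1)
 *   target   = {16, 16, 2}  (512/(1*32), 1024/(64*1), 64/(1*32))
 *   points_per_dispatch = 32 * 64 * 32 = 65536
 * so the volume's 512 * 1024 * 64 = 33554432 points complete in
 * 16 * 16 * 2 = 512 dispatches of at most max_points each. */
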
static void
do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, ComputeShaderID shader)
{
	ComputeShaderCtx *csctx = &ctx->csctx;

	glUseProgram(csctx->programs[shader]);

	u32 output_ssbo_idx = !csctx->last_output_ssbo_index;
	u32 input_ssbo_idx  = csctx->last_output_ssbo_index;

	switch (shader) {
	case CS_DECODE:
	case CS_DECODE_FLOAT:
	case CS_DECODE_FLOAT_COMPLEX:
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo);
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
		glBindImageTexture(0, csctx->hadamard_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I);
		glBindImageTexture(1, csctx->channel_mapping_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I);
		glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32),
		                  ORONE(csctx->dec_data_dim.y / 32),
		                  ORONE(csctx->dec_data_dim.z));
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_CUDA_DECODE:
		ctx->cuda_lib.cuda_decode(0, output_ssbo_idx, 0);
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_CUDA_HILBERT:
		ctx->cuda_lib.cuda_hilbert(input_ssbo_idx, output_ssbo_idx);
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_DEMOD:
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
		glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32),
		                  ORONE(csctx->dec_data_dim.y / 32),
		                  ORONE(csctx->dec_data_dim.z));
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_MIN_MAX: {
		u32 texture = frame->frame.texture;
		for (u32 i = 1; i < frame->frame.mips; i++) {
			glBindImageTexture(0, texture, i - 1, GL_TRUE, 0, GL_READ_ONLY,  GL_RG32F);
			glBindImageTexture(1, texture, i - 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
			glUniform1i(CS_MIN_MAX_MIPS_LEVEL_UNIFORM_LOC, i);

			u32 width  = frame->frame.dim.x >> i;
			u32 height = frame->frame.dim.y >> i;
			u32 depth  = frame->frame.dim.z >> i;
			glDispatchCompute(ORONE(width / 32), ORONE(height), ORONE(depth / 32));
			glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
		}
	} break;
	case CS_DAS: {
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
		glBindImageTexture(0, frame->frame.texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
		glBindImageTexture(1, csctx->sparse_elements_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I);
		glBindImageTexture(2, csctx->focal_vectors_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RG32F);

		glUniform1ui(DAS_CYCLE_T_UNIFORM_LOC, cycle_t++);

#if 1
		/* TODO(rnp): compute max_points_per_dispatch based on something like a
		 * transmit_count * channel_count product */
		u32 max_points_per_dispatch = KB(64);
		struct compute_cursor cursor = start_compute_cursor(frame->frame.dim, max_points_per_dispatch);
		f32 percent_per_step = (f32)cursor.points_per_dispatch / (f32)cursor.total_points;
		csctx->processing_progress = -percent_per_step;
		for (iv3 offset = {0};
		     !compute_cursor_finished(&cursor);
		     offset = step_compute_cursor(&cursor))
		{
			csctx->processing_progress += percent_per_step;
			/* IMPORTANT(rnp): prevents OS from coalescing and killing our shader */
			glFinish();
			glUniform3iv(DAS_VOXEL_OFFSET_UNIFORM_LOC, 1, offset.E);
			glDispatchCompute(cursor.dispatch.x, cursor.dispatch.y, cursor.dispatch.z);
		}
#else
		/* NOTE(rnp): use this for testing tiling code. The performance of the above path
		 * should be the same as this path if everything is working correctly */
		iv3 compute_dim_offset = {0};
		glUniform3iv(csctx->voxel_offset_id, 1, compute_dim_offset.E);
		glDispatchCompute(ORONE(frame->frame.dim.x / 32),
		                  ORONE(frame->frame.dim.y),
		                  ORONE(frame->frame.dim.z / 32));
#endif
		glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT|GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
	} break;
	case CS_SUM: {
		u32 aframe_index = ctx->averaged_frame_index % ARRAY_COUNT(ctx->averaged_frames);
		BeamformComputeFrame *aframe = ctx->averaged_frames + aframe_index;
		aframe->ready_to_present = 0;
		aframe->frame.id = ctx->averaged_frame_index;
		/* TODO(rnp): hack: we need a better way of specifying which frames to sum;
		 * this is fine for rolling averaging but what if we want to do something else */
		ASSERT(frame >= ctx->beamform_frames);
		ASSERT(frame < ctx->beamform_frames + ARRAY_COUNT(ctx->beamform_frames));
		u32 base_index   = (u32)(frame - ctx->beamform_frames);
		u32 to_average   = ctx->shared_memory->parameters.output_points[3];
		u32 frame_count  = 0;
		u32 *in_textures = push_array(&arena, u32, MAX_BEAMFORMED_SAVED_FRAMES);
		ComputeFrameIterator cfi = compute_frame_iterator(ctx, 1 + base_index - to_average,
		                                                  to_average);
		for (BeamformComputeFrame *it = frame_next(&cfi); it; it = frame_next(&cfi))
			in_textures[frame_count++] = it->frame.texture;

		ASSERT(to_average == frame_count);

		do_sum_shader(csctx, in_textures, frame_count, 1 / (f32)frame_count,
		              aframe->frame.texture, aframe->frame.dim);
		aframe->frame.min_coordinate = frame->frame.min_coordinate;
		aframe->frame.max_coordinate = frame->frame.max_coordinate;
		aframe->frame.compound_count = frame->frame.compound_count;
		aframe->frame.das_shader_id  = frame->frame.das_shader_id;
	} break;
	default: ASSERT(0);
	}
}

function s8
push_compute_shader_header(Arena *a, b32 parameters, ComputeShaderID shader)
{
	Stream sb = arena_stream(*a);

	stream_append_s8(&sb, s8("#version 460 core\n\n"));

#define X(name, type, size, gltype, glsize, comment) "\t" #gltype " " #name #glsize "; " comment "\n"
	if (parameters) {
		stream_append_s8(&sb, s8("layout(std140, binding = 0) uniform parameters {\n"
		                         BEAMFORMER_PARAMS_HEAD
		                         BEAMFORMER_UI_PARAMS
		                         BEAMFORMER_PARAMS_TAIL
		                         "};\n\n"));
	}
#undef X

	switch (shader) {
	case CS_DAS: {
#define X(type, id, pretty, fixed_tx) "#define DAS_ID_" #type " " #id "\n"
		stream_append_s8(&sb, s8(""
		"layout(local_size_x = " str(DAS_LOCAL_SIZE_X) ", "
		       "local_size_y = " str(DAS_LOCAL_SIZE_Y) ", "
		       "local_size_z = " str(DAS_LOCAL_SIZE_Z) ") in;\n\n"
		"layout(location = " str(DAS_VOXEL_OFFSET_UNIFORM_LOC) ") uniform ivec3 u_voxel_offset;\n"
		"layout(location = " str(DAS_CYCLE_T_UNIFORM_LOC)      ") uniform uint u_cycle_t;\n\n"
		DAS_TYPES
		));
#undef X
	} break;
	case CS_DECODE_FLOAT:
	case CS_DECODE_FLOAT_COMPLEX: {
		if (shader == CS_DECODE_FLOAT) stream_append_s8(&sb, s8("#define INPUT_DATA_TYPE_FLOAT\n\n"));
		else                           stream_append_s8(&sb, s8("#define INPUT_DATA_TYPE_FLOAT_COMPLEX\n\n"));
	} /* FALLTHROUGH */
	case CS_DECODE: {
#define X(type, id, pretty) stream_append_s8(&sb, s8("#define DECODE_MODE_" #type " " #id "\n"));
		DECODE_TYPES
#undef X
	} break;
	case CS_MIN_MAX: {
		stream_append_s8(&sb, s8("layout(location = " str(CS_MIN_MAX_MIPS_LEVEL_UNIFORM_LOC)
		                         ") uniform int u_mip_map;\n\n"));
	} break;
	case CS_SUM: {
		stream_append_s8(&sb, s8("layout(location = " str(CS_SUM_PRESCALE_UNIFORM_LOC)
		                         ") uniform float u_sum_prescale = 1.0;\n\n"));
	} break;
	default: break;
	}
	stream_append_s8(&sb, s8("\n#line 1\n"));
	return arena_stream_commit(a, &sb);
}
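
/* NOTE(rnp): for reference, the header generated above for CS_SUM looks roughly
 * like the following (the uniform location depends on the macro value):
 *
 *     #version 460 core
 *
 *     layout(std140, binding = 0) uniform parameters {
 *             ... members expanded from the BEAMFORMER_PARAMS X-macros ...
 *     };
 *
 *     layout(location = N) uniform float u_sum_prescale = 1.0;
 *
 *     #line 1
 *
 * the trailing "#line 1" resets GLSL's line numbering so that compiler errors
 * point into the on-disk shader source instead of this generated header. */
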
static b32
reload_compute_shader(BeamformerCtx *ctx, s8 path, s8 extra, ComputeShaderReloadContext *csr, Arena tmp)
{
	ComputeShaderCtx *cs = &ctx->csctx;
	b32 result = 0;

	/* NOTE: arena works as stack (since everything here is 1 byte aligned) */
	s8 header      = push_compute_shader_header(&tmp, csr->needs_header, csr->shader);
	s8 shader_text = ctx->os.read_whole_file(&tmp, (c8 *)path.data);
	shader_text.data -= header.len;
	shader_text.len  += header.len;

	if (shader_text.data == header.data) {
		Stream sb = arena_stream(tmp);
		stream_append_s8s(&sb, path, extra);
		s8 info = arena_stream_commit(&tmp, &sb);
		u32 new_program = load_shader(&ctx->os, tmp, 1, s8(""), s8(""), shader_text,
		                              info, csr->label);
		if (new_program) {
			result = 1;
			glDeleteProgram(cs->programs[csr->shader]);
			cs->programs[csr->shader] = new_program;
			glUseProgram(cs->programs[csr->shader]);
			glBindBufferBase(GL_UNIFORM_BUFFER, 0, cs->shared_ubo);
		}
	} else {
		Stream sb = arena_stream(tmp);
		stream_append_s8s(&sb, s8("failed to load: "), path, extra, s8("\n"));
		ctx->os.write_file(ctx->os.stderr, stream_to_s8(&sb));
	}

	return result;
}

static void
complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_context, iz barrier_offset)
{
	ComputeShaderCtx       *cs = &ctx->csctx;
	BeamformerParameters   *bp = &ctx->shared_memory->parameters;
	BeamformerSharedMemory *sm = ctx->shared_memory;

	BeamformWork *work = beamform_work_queue_pop(q);
	while (work) {
		b32 can_commit = 1;
		switch (work->type) {
		case BW_RELOAD_SHADER: {
			ComputeShaderReloadContext *csr = work->reload_shader_ctx;
			b32 success = reload_compute_shader(ctx, csr->path, s8(""), csr, arena);
			if (csr->shader == CS_DECODE) {
				/* TODO(rnp): think of a better way of doing this */
				csr->shader = CS_DECODE_FLOAT_COMPLEX;
				success    &= reload_compute_shader(ctx, csr->path, s8(" (F32C)"), csr, arena);
				csr->shader = CS_DECODE_FLOAT;
				success    &= reload_compute_shader(ctx, csr->path, s8(" (F32)"), csr, arena);
				csr->shader = CS_DECODE;
			}

			if (success) {
				/* TODO(rnp): this check seems off */
				if (ctx->csctx.raw_data_ssbo) {
					can_commit = 0;
					ImagePlaneTag plane = ctx->beamform_frames[ctx->display_frame_index].image_plane_tag;
					fill_frame_compute_work(ctx, work, plane);
				}
			}
		} break;
		case BW_UPLOAD_BUFFER: {
			ASSERT(!atomic_load((i32 *)(barrier_offset + work->completion_barrier)));
			BeamformerUploadContext *uc = &work->upload_context;
			u32 tex_type, tex_format, tex_element_count, tex_1d = 0, buffer = 0;
			switch (uc->kind) {
			case BU_KIND_CHANNEL_MAPPING: {
				tex_1d            = cs->channel_mapping_texture;
				tex_type          = GL_SHORT;
				tex_format        = GL_RED_INTEGER;
				tex_element_count = ARRAY_COUNT(sm->channel_mapping);
			} break;
			case BU_KIND_FOCAL_VECTORS: {
				tex_1d            = cs->focal_vectors_texture;
				tex_type          = GL_FLOAT;
				tex_format        = GL_RG;
				tex_element_count = ARRAY_COUNT(sm->focal_vectors);
			} break;
			case BU_KIND_SPARSE_ELEMENTS: {
				tex_1d            = cs->sparse_elements_texture;
				tex_type          = GL_SHORT;
				tex_format        = GL_RED_INTEGER;
				tex_element_count = ARRAY_COUNT(sm->sparse_elements);
			} break;
			case BU_KIND_PARAMETERS: {
				ctx->ui_read_params = barrier_offset != 0;
				buffer = cs->shared_ubo;
			} break;
			case BU_KIND_RF_DATA: {
				if (cs->rf_raw_size != uc->size ||
				    !uv4_equal(cs->dec_data_dim, uv4_from_u32_array(bp->dec_data_dim)))
				{
					alloc_shader_storage(ctx, uc->size, arena);
				}
				buffer = cs->raw_data_ssbo;
			} break;
			default: INVALID_CODE_PATH; break;
			}

			if (tex_1d) {
				glTextureSubImage1D(tex_1d, 0, 0, tex_element_count, tex_format,
				                    tex_type, (u8 *)sm + uc->shared_memory_offset);
			}

			if (buffer) {
				glNamedBufferSubData(buffer, 0, uc->size,
				                     (u8 *)sm + uc->shared_memory_offset);
			}
		} break;
		case BW_COMPUTE: {
			atomic_store(&cs->processing_compute, 1);
			start_renderdoc_capture(gl_context);

			BeamformComputeFrame *frame = work->frame;
			uv3 try_dim = make_valid_test_dim(bp->output_points);
			if (!uv3_equal(try_dim, frame->frame.dim))
				alloc_beamform_frame(&ctx->gl, &frame->frame, &frame->stats, try_dim,
				                     s8("Beamformed_Data"), arena);

			if (bp->output_points[3] > 1) {
				if (!uv3_equal(try_dim, ctx->averaged_frames[0].frame.dim)) {
					alloc_beamform_frame(&ctx->gl, &ctx->averaged_frames[0].frame,
					                     &ctx->averaged_frames[0].stats,
					                     try_dim, s8("Averaged Frame"), arena);
					alloc_beamform_frame(&ctx->gl, &ctx->averaged_frames[1].frame,
					                     &ctx->averaged_frames[1].stats,
					                     try_dim, s8("Averaged Frame"), arena);
				}
			}

			frame->in_flight = 1;
			frame->frame.min_coordinate = v4_from_f32_array(bp->output_min_coordinate);
			frame->frame.max_coordinate = v4_from_f32_array(bp->output_max_coordinate);
			frame->frame.das_shader_id  = bp->das_shader_id;
			frame->frame.compound_count = bp->dec_data_dim[2];

			b32 did_sum_shader = 0;
			u32 stage_count = sm->compute_stages_count;
			ComputeShaderID *stages = sm->compute_stages;
			for (u32 i = 0; i < stage_count; i++) {
				did_sum_shader |= stages[i] == CS_SUM;
				frame->stats.timer_active[stages[i]] = 1;
				glBeginQuery(GL_TIME_ELAPSED, frame->stats.timer_ids[stages[i]]);
				do_compute_shader(ctx, arena, frame, stages[i]);
				glEndQuery(GL_TIME_ELAPSED);
			}
			/* NOTE(rnp): block until work completes so that we can record timings */
			glFinish();
			cs->processing_progress = 1;

			for (u32 i = 0; i < ARRAY_COUNT(frame->stats.timer_ids); i++) {
				u64 ns = 0;
				if (frame->stats.timer_active[i]) {
					glGetQueryObjectui64v(frame->stats.timer_ids[i],
					                      GL_QUERY_RESULT, &ns);
					frame->stats.timer_active[i] = 0;
				}
				frame->stats.times[i] = (f32)ns / 1e9;
			}

			if (did_sum_shader) {
				u32 aframe_index = (ctx->averaged_frame_index %
				                    ARRAY_COUNT(ctx->averaged_frames));
				ctx->averaged_frames[aframe_index].image_plane_tag  = frame->image_plane_tag;
				ctx->averaged_frames[aframe_index].ready_to_present = 1;
				/* TODO(rnp): not really sure what to do here */
				mem_copy(&ctx->averaged_frames[aframe_index].stats.times,
				         &frame->stats.times, sizeof(frame->stats.times));
				atomic_inc(&ctx->averaged_frame_index, 1);
			}
			frame->ready_to_present = 1;
			cs->processing_compute  = 0;

			end_renderdoc_capture(gl_context);
		} break;
		case BW_SAVE_FRAME: {
			BeamformComputeFrame *frame = work->output_frame_ctx.frame;
			if (frame->ready_to_present) {
				export_frame(ctx, work->output_frame_ctx.file_handle, &frame->frame);
			} else {
				/* TODO(rnp): should we handle this? */
				INVALID_CODE_PATH;
			}
		} break;
		default: INVALID_CODE_PATH; break;
		}

		if (can_commit) {
			if (work->completion_barrier) {
				i32 *value = (i32 *)(barrier_offset + work->completion_barrier);
				ctx->os.wake_waiters(value);
			}
			beamform_work_queue_pop_commit(q);
			work = beamform_work_queue_pop(q);
		}
	}
}

DEBUG_EXPORT BEAMFORMER_COMPUTE_SETUP_FN(beamformer_compute_setup)
{
	BeamformerCtx          *ctx = (BeamformerCtx *)user_context;
	BeamformerSharedMemory *sm  = ctx->shared_memory;
	ComputeShaderCtx       *cs  = &ctx->csctx;

	glCreateBuffers(1, &cs->shared_ubo);
	glNamedBufferStorage(cs->shared_ubo, sizeof(sm->parameters), 0, GL_DYNAMIC_STORAGE_BIT);

	glCreateTextures(GL_TEXTURE_1D, 1, &cs->channel_mapping_texture);
	glCreateTextures(GL_TEXTURE_1D, 1, &cs->sparse_elements_texture);
	glCreateTextures(GL_TEXTURE_1D, 1, &cs->focal_vectors_texture);
	glTextureStorage1D(cs->channel_mapping_texture, 1, GL_R16I,  ARRAY_COUNT(sm->channel_mapping));
	glTextureStorage1D(cs->sparse_elements_texture, 1, GL_R16I,  ARRAY_COUNT(sm->sparse_elements));
	glTextureStorage1D(cs->focal_vectors_texture,   1, GL_RG32F, ARRAY_COUNT(sm->focal_vectors));

	LABEL_GL_OBJECT(GL_TEXTURE, cs->channel_mapping_texture, s8("Channel_Mapping"));
	LABEL_GL_OBJECT(GL_TEXTURE, cs->focal_vectors_texture,   s8("Focal_Vectors"));
	LABEL_GL_OBJECT(GL_TEXTURE, cs->sparse_elements_texture, s8("Sparse_Elements"));
	LABEL_GL_OBJECT(GL_BUFFER,  cs->shared_ubo,              s8("Beamformer_Parameters"));
}

DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute)
{
	BeamformerCtx *ctx = (BeamformerCtx *)user_context;
	complete_queue(ctx, &ctx->shared_memory->external_work_queue, arena, gl_context, (iz)ctx->shared_memory);
	complete_queue(ctx, ctx->beamform_work_queue, arena, gl_context, 0);
}

#include "ui.c"

DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
{
	dt_for_frame = GetFrameTime();

	if (IsWindowResized()) {
		ctx->window_size.h = GetScreenHeight();
		ctx->window_size.w = GetScreenWidth();
	}

	if (input->executable_reloaded) {
		ui_init(ctx, ctx->ui_backing_store);
		DEBUG_DECL(start_frame_capture = ctx->os.start_frame_capture);
		DEBUG_DECL(end_frame_capture   = ctx->os.end_frame_capture);
	}

	BeamformerParameters *bp = &ctx->shared_memory->parameters;
	if (ctx->shared_memory->dispatch_compute_sync) {
		ImagePlaneTag current_plane = ctx->shared_memory->current_image_plane;
		atomic_store(&ctx->shared_memory->dispatch_compute_sync, 0);
		BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue);
		if (work) {
			if (fill_frame_compute_work(ctx, work, current_plane))
				beamform_work_queue_push_commit(ctx->beamform_work_queue);

			if (ctx->shared_memory->export_next_frame) {
				BeamformWork *export = beamform_work_queue_push(ctx->beamform_work_queue);
				if (export) {
					/* TODO: we don't really want the beamformer opening/closing files */
					iptr f = ctx->os.open_for_write(ctx->os.export_pipe_name);
					export->type = BW_SAVE_FRAME;
					export->output_frame_ctx.file_handle = f;
					if (bp->output_points[3] > 1) {
						u32 a_index = !(ctx->averaged_frame_index %
						                ARRAY_COUNT(ctx->averaged_frames));
						BeamformComputeFrame *aframe = ctx->averaged_frames + a_index;
						export->output_frame_ctx.frame = aframe;
					} else {
						export->output_frame_ctx.frame = work->frame;
					}
					beamform_work_queue_push_commit(ctx->beamform_work_queue);
				}
				ctx->shared_memory->export_next_frame = 0;
			}

			ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
		}
	}

	if (ctx->start_compute) {
		if (ctx->beamform_frames[ctx->display_frame_index].ready_to_present) {
			BeamformWork *work  = beamform_work_queue_push(ctx->beamform_work_queue);
			ImagePlaneTag plane = ctx->beamform_frames[ctx->display_frame_index].image_plane_tag;
			if (fill_frame_compute_work(ctx, work, plane)) {
				beamform_work_queue_push_commit(ctx->beamform_work_queue);
				ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
				ctx->start_compute = 0;
			}
		}
	}

	ComputeFrameIterator cfi = compute_frame_iterator(ctx, ctx->display_frame_index,
	                                                  ctx->next_render_frame_index - ctx->display_frame_index);
	for (BeamformComputeFrame *frame = frame_next(&cfi); frame; frame = frame_next(&cfi)) {
		if (frame->in_flight && frame->ready_to_present) {
			frame->in_flight         = 0;
			ctx->display_frame_index = frame - cfi.frames;
		}
	}

	if (ctx->start_compute) {
		ctx->start_compute = 0;
		ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
	}

	BeamformComputeFrame *frame_to_draw;
	if (bp->output_points[3] > 1) {
		u32 a_index = !(ctx->averaged_frame_index % ARRAY_COUNT(ctx->averaged_frames));
		frame_to_draw = ctx->averaged_frames + a_index;
	} else {
		frame_to_draw = ctx->beamform_frames + ctx->display_frame_index;
	}

	draw_ui(ctx, input, frame_to_draw->ready_to_present ? &frame_to_draw->frame : 0,
	        frame_to_draw->image_plane_tag, &frame_to_draw->stats);

	ctx->frame_view_render_context.updated = 0;

	if (WindowShouldClose())
		ctx->should_exit = 1;
}
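
/* NOTE(rnp): illustrative sketch (not part of this translation unit) of the
 * producer side of the dispatch protocol that beamformer_frame_step() polls above:
 * an external writer fills the shared memory, sets current_image_plane, then
 * raises dispatch_compute_sync, which the frame step consumes and clears. How the
 * writer obtains `sm` and wakes the beamformer is up to the client library and is
 * not shown here. */
#if 0
static void
request_compute_dispatch(BeamformerSharedMemory *sm, ImagePlaneTag plane, b32 want_export)
{
	sm->current_image_plane = plane;
	if (want_export) sm->export_next_frame = 1;
	atomic_store(&sm->dispatch_compute_sync, 1);
	/* the beamformer clears dispatch_compute_sync once it has queued the work */
}
#endif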