/* See LICENSE for license details. */
/* TODO(rnp):
 * [ ]: refactor: BeamformGPUComputeContext
 * [ ]: refactor: compute shader timers should be generated based on the pipeline stage limit
 * [ ]: reinvestigate ring buffer raw_data_ssbo
 *      - to minimize latency the main thread should manage the subbuffer upload so that the
 *        compute thread can just keep computing. This way we can keep the compute thread busy
 *        with work while we image.
 *      - In particular we will potentially need multiple GPUComputeContexts so that we
 *        can overwrite one while the other is in use.
 *      - make use of glFenceSync to guard buffer uploads
 */

#include "beamformer.h"
#include "beamformer_work_queue.c"

global f32 dt_for_frame;
global u32 cycle_t;

#ifndef _DEBUG
#define start_renderdoc_capture(...)
#define end_renderdoc_capture(...)
#else
static renderdoc_start_frame_capture_fn *start_frame_capture;
static renderdoc_end_frame_capture_fn   *end_frame_capture;
#define start_renderdoc_capture(gl) if (start_frame_capture) start_frame_capture(gl, 0)
#define end_renderdoc_capture(gl)   if (end_frame_capture)   end_frame_capture(gl, 0)
#endif

typedef struct {
	BeamformComputeFrame *frames;
	u32 capacity;
	u32 offset;
	u32 cursor;
	u32 needed_frames;
} ComputeFrameIterator;

static uv3
make_valid_test_dim(uv3 in)
{
	uv3 result;
	result.x = MAX(in.x, 1);
	result.y = MAX(in.y, 1);
	result.z = MAX(in.z, 1);
	return result;
}

static ComputeFrameIterator
compute_frame_iterator(BeamformerCtx *ctx, u32 start_index, u32 needed_frames)
{
	start_index = start_index % ARRAY_COUNT(ctx->beamform_frames);

	ComputeFrameIterator result;
	result.frames        = ctx->beamform_frames;
	result.offset        = start_index;
	result.capacity      = ARRAY_COUNT(ctx->beamform_frames);
	result.cursor        = 0;
	result.needed_frames = needed_frames;
	return result;
}

static BeamformComputeFrame *
frame_next(ComputeFrameIterator *bfi)
{
	BeamformComputeFrame *result = 0;
	if (bfi->cursor != bfi->needed_frames) {
		u32 index = (bfi->offset + bfi->cursor++) % bfi->capacity;
		result    = bfi->frames + index;
	}
	return result;
}

static void
alloc_beamform_frame(GLParams *gp, BeamformFrame *out, ComputeShaderStats *out_stats,
                     uv3 out_dim, s8 name, Arena arena)
{
	out->dim.x = MAX(1, round_down_power_of_2(ORONE(out_dim.x)));
	out->dim.y = MAX(1, round_down_power_of_2(ORONE(out_dim.y)));
	out->dim.z = MAX(1, round_down_power_of_2(ORONE(out_dim.z)));

	if (gp) {
		out->dim.x = MIN(out->dim.x, gp->max_3d_texture_dim);
		out->dim.y = MIN(out->dim.y, gp->max_3d_texture_dim);
		out->dim.z = MIN(out->dim.z, gp->max_3d_texture_dim);
	}

	/* NOTE: allocate storage for beamformed output data;
	 * this is shared between compute and fragment shaders */
	u32 max_dim = MAX(out->dim.x, MAX(out->dim.y, out->dim.z));
	out->mips   = ctz_u32(max_dim) + 1;

	Stream label = arena_stream(&arena);
	stream_append_s8(&label, name);
	stream_append_byte(&label, '[');
	stream_append_hex_u64(&label, out->id);
	stream_append_byte(&label, ']');

	glDeleteTextures(1, &out->texture);
	glCreateTextures(GL_TEXTURE_3D, 1, &out->texture);
	glTextureStorage3D(out->texture, out->mips, GL_RG32F, out->dim.x, out->dim.y, out->dim.z);
	LABEL_GL_OBJECT(GL_TEXTURE, out->texture, stream_to_s8(&label));

	if (out_stats) {
		glDeleteQueries(ARRAY_COUNT(out_stats->timer_ids), out_stats->timer_ids);
		glCreateQueries(GL_TIME_ELAPSED, ARRAY_COUNT(out_stats->timer_ids), out_stats->timer_ids);
	}
}
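/* NOTE: a minimal sketch of the fence-guarded subbuffer upload mentioned in the
 * TODO block at the top of this file. The ring struct, member names, and helper
 * are hypothetical illustrations, not part of the current context structs. */
#if 0
typedef struct {
	GLsync fences[2]; /* one fence per subbuffer region */
	u32    ssbo;
	iz     region_size;
} RawDataRing;

static void
ring_upload(RawDataRing *ring, u32 region, void *data)
{
	/* wait (bounded) until the GPU is done reading this region */
	if (ring->fences[region]) {
		glClientWaitSync(ring->fences[region], 0, (u64)1e9);
		glDeleteSync(ring->fences[region]);
		ring->fences[region] = 0;
	}
	glNamedBufferSubData(ring->ssbo, region * ring->region_size,
	                     ring->region_size, data);
	/* after dispatching compute work that reads `region`, the compute side
	 * would re-arm the guard with:
	 * ring->fences[region] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); */
}
#endif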
function void
alloc_shader_storage(BeamformerCtx *ctx, u32 rf_raw_size, Arena a)
{
	ComputeShaderCtx     *cs = &ctx->csctx;
	BeamformerParameters *bp = &ctx->shared_memory->parameters;

	uv4 dec_data_dim = bp->dec_data_dim;
	cs->dec_data_dim = dec_data_dim;
	cs->rf_raw_size  = rf_raw_size;

	glDeleteBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
	glCreateBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);

	i32 storage_flags = GL_DYNAMIC_STORAGE_BIT;
	glDeleteBuffers(1, &cs->raw_data_ssbo);
	glCreateBuffers(1, &cs->raw_data_ssbo);
	glNamedBufferStorage(cs->raw_data_ssbo, rf_raw_size, 0, storage_flags);
	LABEL_GL_OBJECT(GL_BUFFER, cs->raw_data_ssbo, s8("Raw_RF_SSBO"));

	iz rf_decoded_size = 2 * sizeof(f32) * cs->dec_data_dim.x * cs->dec_data_dim.y * cs->dec_data_dim.z;
	Stream label = stream_alloc(&a, 256);
	stream_append_s8(&label, s8("Decoded_RF_SSBO_"));
	u32 s_widx = label.widx;
	for (u32 i = 0; i < ARRAY_COUNT(cs->rf_data_ssbos); i++) {
		glNamedBufferStorage(cs->rf_data_ssbos[i], rf_decoded_size, 0, 0);
		stream_append_u64(&label, i);
		s8 rf_label = stream_to_s8(&label);
		LABEL_GL_OBJECT(GL_BUFFER, cs->rf_data_ssbos[i], rf_label);
		stream_reset(&label, s_widx);
	}

	/* NOTE(rnp): these are stubs when CUDA isn't supported */
	ctx->cuda_lib.register_cuda_buffers(cs->rf_data_ssbos, ARRAY_COUNT(cs->rf_data_ssbos),
	                                    cs->raw_data_ssbo);
	ctx->cuda_lib.init_cuda_configuration(bp->rf_raw_dim.E, bp->dec_data_dim.E,
	                                      ctx->shared_memory->channel_mapping);

	/* NOTE: store hadamard in GPU once; it won't change for a particular imaging session */
	iz   hadamard_elements = dec_data_dim.z * dec_data_dim.z;
	i32 *hadamard = alloc(&a, i32, hadamard_elements);
	i32 *tmp      = alloc(&a, i32, hadamard_elements);
	fill_hadamard_transpose(hadamard, tmp, dec_data_dim.z);
	glDeleteTextures(1, &cs->hadamard_texture);
	glCreateTextures(GL_TEXTURE_2D, 1, &cs->hadamard_texture);
	glTextureStorage2D(cs->hadamard_texture, 1, GL_R8I, dec_data_dim.z, dec_data_dim.z);
	glTextureSubImage2D(cs->hadamard_texture, 0, 0, 0, dec_data_dim.z, dec_data_dim.z,
	                    GL_RED_INTEGER, GL_INT, hadamard);
	LABEL_GL_OBJECT(GL_TEXTURE, cs->hadamard_texture, s8("Hadamard_Matrix"));
}

static b32
fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work, ImagePlaneTag plane)
{
	b32 result = 0;
	if (work) {
		result = 1;
		u32 frame_id    = atomic_inc(&ctx->next_render_frame_index, 1);
		u32 frame_index = frame_id % ARRAY_COUNT(ctx->beamform_frames);
		work->type      = BW_COMPUTE;
		work->frame     = ctx->beamform_frames + frame_index;
		work->frame->ready_to_present = 0;
		work->frame->frame.id         = frame_id;
		work->frame->image_plane_tag  = plane;
	}
	return result;
}

static void
export_frame(BeamformerCtx *ctx, iptr handle, BeamformFrame *frame)
{
	uv3 dim            = frame->dim;
	iz  out_size       = dim.x * dim.y * dim.z * 2 * sizeof(f32);
	ctx->export_buffer = ctx->os.alloc_arena(ctx->export_buffer, out_size);
	glGetTextureImage(frame->texture, 0, GL_RG, GL_FLOAT, out_size, ctx->export_buffer.beg);
	s8 raw = {.len = out_size, .data = ctx->export_buffer.beg};
	if (!ctx->os.write_file(handle, raw))
		ctx->os.write_file(ctx->os.stderr, s8("failed to export frame\n"));
	ctx->os.close(handle);
}
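/* NOTE: the exported file is a raw dump of the RG32F volume: two f32s (real,
 * imaginary) per voxel, x varying fastest, then y, then z. A consumer-side
 * reader sketch (hypothetical helper, not part of this codebase), assuming
 * the reader already knows dim: */
#if 0
typedef struct { f32 re, im; } ExportedSample;

static ExportedSample *
exported_voxel(ExportedSample *data, uv3 dim, u32 x, u32 y, u32 z)
{
	/* glGetTextureImage packs texels row-major: x fastest, z slowest */
	return data + (iz)z * dim.x * dim.y + (iz)y * dim.x + x;
}
#endif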
static void
do_sum_shader(ComputeShaderCtx *cs, u32 *in_textures, u32 in_texture_count, f32 in_scale,
              u32 out_texture, uv3 out_data_dim)
{
	/* NOTE: zero output before summing */
	glClearTexImage(out_texture, 0, GL_RED, GL_FLOAT, 0);
	glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT);

	glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F);
	glUniform1f(CS_SUM_PRESCALE_UNIFORM_LOC, in_scale);
	for (u32 i = 0; i < in_texture_count; i++) {
		glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F);
		glDispatchCompute(ORONE(out_data_dim.x / 32),
		                  ORONE(out_data_dim.y),
		                  ORONE(out_data_dim.z / 32));
		glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
	}
}

struct compute_cursor {
	iv3 cursor;
	iv3 dispatch;
	iv3 target;
	u32 points_per_dispatch;
	u32 completed_points;
	u32 total_points;
};

static struct compute_cursor
start_compute_cursor(uv3 dim, u32 max_points)
{
	struct compute_cursor result = {0};
	u32 invocations_per_dispatch = DAS_LOCAL_SIZE_X * DAS_LOCAL_SIZE_Y * DAS_LOCAL_SIZE_Z;

	result.dispatch.y = MIN(max_points / invocations_per_dispatch, MAX(dim.y / DAS_LOCAL_SIZE_Y, 1));

	u32 remaining     = max_points / result.dispatch.y;
	result.dispatch.x = MIN(remaining / invocations_per_dispatch, MAX(dim.x / DAS_LOCAL_SIZE_X, 1));
	result.dispatch.z = MIN(remaining / (invocations_per_dispatch * result.dispatch.x),
	                        MAX(dim.z / DAS_LOCAL_SIZE_Z, 1));

	result.target.x = MAX(dim.x / result.dispatch.x / DAS_LOCAL_SIZE_X, 1);
	result.target.y = MAX(dim.y / result.dispatch.y / DAS_LOCAL_SIZE_Y, 1);
	result.target.z = MAX(dim.z / result.dispatch.z / DAS_LOCAL_SIZE_Z, 1);

	result.points_per_dispatch = 1;
	result.points_per_dispatch *= result.dispatch.x * DAS_LOCAL_SIZE_X;
	result.points_per_dispatch *= result.dispatch.y * DAS_LOCAL_SIZE_Y;
	result.points_per_dispatch *= result.dispatch.z * DAS_LOCAL_SIZE_Z;

	result.total_points = dim.x * dim.y * dim.z;

	return result;
}

static iv3
step_compute_cursor(struct compute_cursor *cursor)
{
	cursor->cursor.x += 1;
	if (cursor->cursor.x >= cursor->target.x) {
		cursor->cursor.x  = 0;
		cursor->cursor.y += 1;
		if (cursor->cursor.y >= cursor->target.y) {
			cursor->cursor.y  = 0;
			cursor->cursor.z += 1;
		}
	}

	cursor->completed_points += cursor->points_per_dispatch;

	iv3 result = cursor->cursor;
	result.x *= cursor->dispatch.x * DAS_LOCAL_SIZE_X;
	result.y *= cursor->dispatch.y * DAS_LOCAL_SIZE_Y;
	result.z *= cursor->dispatch.z * DAS_LOCAL_SIZE_Z;

	return result;
}

static b32
compute_cursor_finished(struct compute_cursor *cursor)
{
	b32 result = cursor->completed_points >= cursor->total_points;
	return result;
}
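/* NOTE: a worked example of the cursor arithmetic, assuming for illustration
 * DAS_LOCAL_SIZE_{X,Y,Z} = {32,1,32}, dim = {512,64,512}, max_points = KB(64):
 *   invocations_per_dispatch = 32*1*32 = 1024
 *   dispatch = {1,64,1}   (y sized first: MIN(65536/1024, 64) = 64, then x and z)
 *   target   = {16,1,16}  -> 256 cursor steps
 *   points_per_dispatch = (1*32) * (64*1) * (1*32) = 65536
 *   total_points        = 512*64*512 = 16777216 = 256 * 65536
 * so the full volume is covered by 256 dispatches of max_points voxels each. */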
static void
do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, ComputeShaderID shader)
{
	ComputeShaderCtx *csctx = &ctx->csctx;

	glUseProgram(csctx->programs[shader]);

	u32 output_ssbo_idx = !csctx->last_output_ssbo_index;
	u32 input_ssbo_idx  = csctx->last_output_ssbo_index;

	switch (shader) {
	case CS_DECODE:
	case CS_DECODE_FLOAT:
	case CS_DECODE_FLOAT_COMPLEX:
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo);
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
		glBindImageTexture(0, csctx->hadamard_texture,        0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I);
		glBindImageTexture(1, csctx->channel_mapping_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I);
		glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32),
		                  ORONE(csctx->dec_data_dim.y / 32),
		                  ORONE(csctx->dec_data_dim.z));
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_CUDA_DECODE:
		ctx->cuda_lib.cuda_decode(0, output_ssbo_idx, 0);
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_CUDA_HILBERT:
		ctx->cuda_lib.cuda_hilbert(input_ssbo_idx, output_ssbo_idx);
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_DEMOD:
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
		glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32),
		                  ORONE(csctx->dec_data_dim.y / 32),
		                  ORONE(csctx->dec_data_dim.z));
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_MIN_MAX: {
		u32 texture = frame->frame.texture;
		for (u32 i = 1; i < frame->frame.mips; i++) {
			glBindImageTexture(0, texture, i - 1, GL_TRUE, 0, GL_READ_ONLY,  GL_RG32F);
			glBindImageTexture(1, texture, i - 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
			glUniform1i(CS_MIN_MAX_MIPS_LEVEL_UNIFORM_LOC, i);

			u32 width  = frame->frame.dim.x >> i;
			u32 height = frame->frame.dim.y >> i;
			u32 depth  = frame->frame.dim.z >> i;
			glDispatchCompute(ORONE(width / 32), ORONE(height), ORONE(depth / 32));
			glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
		}
	} break;
	case CS_DAS: {
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
		glBindImageTexture(0, frame->frame.texture,           0, GL_TRUE,  0, GL_WRITE_ONLY, GL_RG32F);
		glBindImageTexture(1, csctx->sparse_elements_texture, 0, GL_FALSE, 0, GL_READ_ONLY,  GL_R16I);
		glBindImageTexture(2, csctx->focal_vectors_texture,   0, GL_FALSE, 0, GL_READ_ONLY,  GL_RG32F);

		glUniform1ui(DAS_CYCLE_T_UNIFORM_LOC, cycle_t++);

#if 1
		/* TODO(rnp): compute max_points_per_dispatch based on something like a
		 * transmit_count * channel_count product */
		u32 max_points_per_dispatch = KB(64);
		struct compute_cursor cursor = start_compute_cursor(frame->frame.dim, max_points_per_dispatch);
		f32 percent_per_step = (f32)cursor.points_per_dispatch / (f32)cursor.total_points;
		csctx->processing_progress = -percent_per_step;
		for (iv3 offset = {0};
		     !compute_cursor_finished(&cursor);
		     offset = step_compute_cursor(&cursor))
		{
			csctx->processing_progress += percent_per_step;
			/* IMPORTANT(rnp): prevents OS from coalescing and killing our shader */
			glFinish();
			glUniform3iv(DAS_VOXEL_OFFSET_UNIFORM_LOC, 1, offset.E);
			glDispatchCompute(cursor.dispatch.x, cursor.dispatch.y, cursor.dispatch.z);
		}
#else
		/* NOTE(rnp): use this for testing tiling code. The performance of the above path
		 * should be the same as this path if everything is working correctly */
		iv3 compute_dim_offset = {0};
		glUniform3iv(csctx->voxel_offset_id, 1, compute_dim_offset.E);
		glDispatchCompute(ORONE(frame->frame.dim.x / 32),
		                  ORONE(frame->frame.dim.y),
		                  ORONE(frame->frame.dim.z / 32));
#endif
		glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT|GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
	} break;
	case CS_SUM: {
		u32 aframe_index = ctx->averaged_frame_index % ARRAY_COUNT(ctx->averaged_frames);
		BeamformComputeFrame *aframe = ctx->averaged_frames + aframe_index;
		aframe->ready_to_present = 0;
		aframe->frame.id         = ctx->averaged_frame_index;
		/* TODO(rnp): hack: we need a better way of specifying which frames to sum;
		 * this is fine for rolling averaging but what if we want to do something else */
		ASSERT(frame >= ctx->beamform_frames);
		ASSERT(frame < ctx->beamform_frames + ARRAY_COUNT(ctx->beamform_frames));
		u32 base_index   = (u32)(frame - ctx->beamform_frames);
		u32 to_average   = ctx->shared_memory->parameters.output_points.w;
		u32 frame_count  = 0;
		u32 *in_textures = alloc(&arena, u32, MAX_BEAMFORMED_SAVED_FRAMES);
		ComputeFrameIterator cfi = compute_frame_iterator(ctx, 1 + base_index - to_average,
		                                                  to_average);
		for (BeamformComputeFrame *it = frame_next(&cfi); it; it = frame_next(&cfi))
			in_textures[frame_count++] = it->frame.texture;

		ASSERT(to_average == frame_count);

		do_sum_shader(csctx, in_textures, frame_count, 1 / (f32)frame_count,
		              aframe->frame.texture, aframe->frame.dim);
		aframe->frame.min_coordinate = frame->frame.min_coordinate;
		aframe->frame.max_coordinate = frame->frame.max_coordinate;
		aframe->frame.compound_count = frame->frame.compound_count;
		aframe->frame.das_shader_id  = frame->frame.das_shader_id;
	} break;
	default: ASSERT(0);
	}
}
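/* NOTE: the decode/demod stages above ping-pong between the two rf_data_ssbos:
 * each stage writes rf_data_ssbos[!last_output_ssbo_index] and then flips the
 * index, so consecutive stages chain without copies. For example, assuming the
 * index starts at 0:
 *   CS_DECODE: reads raw_data_ssbo,    writes rf_data_ssbos[1] (index -> 1)
 *   CS_DEMOD:  reads rf_data_ssbos[1], writes rf_data_ssbos[0] (index -> 0)
 * CS_DAS then consumes rf_data_ssbos[0] as its input. */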
function s8
push_compute_shader_header(Arena *a, b32 parameters, ComputeShaderID shader)
{
	s8 result = {.data = a->beg};

	push_s8(a, s8("#version 460 core\n\n"));

#define X(name, type, size, gltype, glsize, comment) "\t" #gltype " " #name #glsize "; " comment "\n"
	if (parameters) {
		push_s8(a, s8("layout(std140, binding = 0) uniform parameters {\n"
		              BEAMFORMER_PARAMS_HEAD
		              BEAMFORMER_UI_PARAMS
		              BEAMFORMER_PARAMS_TAIL
		              "};\n\n"));
	}
#undef X

	switch (shader) {
	case CS_DAS: {
		push_s8(a, s8("layout("
		              "local_size_x = " str(DAS_LOCAL_SIZE_X) ", "
		              "local_size_y = " str(DAS_LOCAL_SIZE_Y) ", "
		              "local_size_z = " str(DAS_LOCAL_SIZE_Z) ") "
		              "in;\n\n"));

		push_s8(a, s8("layout(location = " str(DAS_VOXEL_OFFSET_UNIFORM_LOC) ") uniform ivec3 u_voxel_offset;\n"));
		push_s8(a, s8("layout(location = " str(DAS_CYCLE_T_UNIFORM_LOC) ") uniform uint u_cycle_t;\n\n"));
#define X(type, id, pretty, fixed_tx) push_s8(a, s8("#define DAS_ID_" #type " " #id "\n"));
		DAS_TYPES
#undef X
	} break;
	case CS_DECODE_FLOAT:
	case CS_DECODE_FLOAT_COMPLEX: {
		if (shader == CS_DECODE_FLOAT) push_s8(a, s8("#define INPUT_DATA_TYPE_FLOAT\n\n"));
		else                           push_s8(a, s8("#define INPUT_DATA_TYPE_FLOAT_COMPLEX\n\n"));
	} /* FALLTHROUGH */
	case CS_DECODE: {
#define X(type, id, pretty) push_s8(a, s8("#define DECODE_MODE_" #type " " #id "\n"));
		DECODE_TYPES
#undef X
	} break;
	case CS_MIN_MAX: {
		push_s8(a, s8("layout(location = " str(CS_MIN_MAX_MIPS_LEVEL_UNIFORM_LOC)
		              ") uniform int u_mip_map;\n\n"));
	} break;
	case CS_SUM: {
		push_s8(a, s8("layout(location = " str(CS_SUM_PRESCALE_UNIFORM_LOC)
		              ") uniform float u_sum_prescale = 1.0;\n\n"));
	} break;
	default: break;
	}
	s8 end = push_s8(a, s8("\n#line 1\n"));
	result.len = end.data + end.len - result.data;
	return result;
}
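/* NOTE: illustrative output of this function for CS_DAS; the local sizes and
 * uniform locations are whatever beamformer.h defines, the values shown here
 * are made up:
 *
 *   #version 460 core
 *
 *   layout(std140, binding = 0) uniform parameters { ... };
 *
 *   layout(local_size_x = 32, local_size_y = 1, local_size_z = 32) in;
 *
 *   layout(location = 2) uniform ivec3 u_voxel_offset;
 *   layout(location = 3) uniform uint u_cycle_t;
 *
 *   #define DAS_ID_<TYPE> <id>    (one line per DAS_TYPES entry)
 *
 *   #line 1
 *
 * the trailing "#line 1" resets GLSL diagnostics so compiler errors point into
 * the on-disk shader source that gets appended directly after this header. */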
static b32
reload_compute_shader(BeamformerCtx *ctx, s8 path, s8 extra, ComputeShaderReloadContext *csr, Arena tmp)
{
	ComputeShaderCtx *cs = &ctx->csctx;
	b32 result = 0;

	/* NOTE: arena works as stack (since everything here is 1 byte aligned) */
	s8 header      = push_compute_shader_header(&tmp, csr->needs_header, csr->shader);
	s8 shader_text = ctx->os.read_whole_file(&tmp, (c8 *)path.data);
	shader_text.data -= header.len;
	shader_text.len  += header.len;

	if (shader_text.data == header.data) {
		s8 info = {.data = tmp.beg};
		push_s8(&tmp, path);
		push_s8(&tmp, extra);
		info.len = tmp.beg - info.data;
		u32 new_program = load_shader(&ctx->os, tmp, 1, (s8){0}, (s8){0}, shader_text,
		                              info, csr->label);
		if (new_program) {
			result = 1;
			glDeleteProgram(cs->programs[csr->shader]);
			cs->programs[csr->shader] = new_program;
			glUseProgram(cs->programs[csr->shader]);
			glBindBufferBase(GL_UNIFORM_BUFFER, 0, cs->shared_ubo);
		}
	} else {
		Stream buf = arena_stream(&tmp);
		stream_append_s8(&buf, s8("failed to load: "));
		stream_append_s8(&buf, path);
		stream_append_s8(&buf, extra);
		stream_append_byte(&buf, '\n');
		ctx->os.write_file(ctx->os.stderr, stream_to_s8(&buf));
	}

	return result;
}
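/* NOTE: the prepend trick above relies on the arena bump-allocating the header
 * and then the file contents back to back:
 *
 *   tmp.beg -> [ header bytes ][ shader file bytes ]
 *              ^ header.data   ^ shader_text.data (before adjustment)
 *
 * widening shader_text backwards by header.len therefore yields one contiguous
 * s8 containing header + source, and the `shader_text.data == header.data`
 * comparison doubles as the read_whole_file failure check. */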
static void
complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_context, iz barrier_offset)
{
	ComputeShaderCtx       *cs = &ctx->csctx;
	BeamformerParameters   *bp = &ctx->shared_memory->parameters;
	BeamformerSharedMemory *sm = ctx->shared_memory;

	BeamformWork *work = beamform_work_queue_pop(q);
	while (work) {
		b32 can_commit = 1;
		switch (work->type) {
		case BW_RELOAD_SHADER: {
			ComputeShaderReloadContext *csr = work->reload_shader_ctx;
			b32 success = reload_compute_shader(ctx, csr->path, s8(""), csr, arena);
			if (csr->shader == CS_DECODE) {
				/* TODO(rnp): think of a better way of doing this */
				csr->shader = CS_DECODE_FLOAT_COMPLEX;
				success &= reload_compute_shader(ctx, csr->path, s8(" (F32C)"), csr, arena);
				csr->shader = CS_DECODE_FLOAT;
				success &= reload_compute_shader(ctx, csr->path, s8(" (F32)"), csr, arena);
				csr->shader = CS_DECODE;
			}

			if (success) {
				/* TODO(rnp): this check seems off */
				if (ctx->csctx.raw_data_ssbo) {
					can_commit = 0;
					ImagePlaneTag plane = ctx->beamform_frames[ctx->display_frame_index].image_plane_tag;
					fill_frame_compute_work(ctx, work, plane);
				}
			}
		} break;
		case BW_UPLOAD_BUFFER: {
			ASSERT(!atomic_load((i32 *)(barrier_offset + work->completion_barrier)));
			BeamformerUploadContext *uc = &work->upload_context;
			u32 tex_type, tex_format, tex_element_count, tex_1d = 0, buffer = 0;
			switch (uc->kind) {
			case BU_KIND_CHANNEL_MAPPING: {
				tex_1d            = cs->channel_mapping_texture;
				tex_type          = GL_SHORT;
				tex_format        = GL_RED_INTEGER;
				tex_element_count = ARRAY_COUNT(sm->channel_mapping);
			} break;
			case BU_KIND_FOCAL_VECTORS: {
				tex_1d            = cs->focal_vectors_texture;
				tex_type          = GL_FLOAT;
				tex_format        = GL_RG;
				tex_element_count = ARRAY_COUNT(sm->focal_vectors);
			} break;
			case BU_KIND_SPARSE_ELEMENTS: {
				tex_1d            = cs->sparse_elements_texture;
				tex_type          = GL_SHORT;
				tex_format        = GL_RED_INTEGER;
				tex_element_count = ARRAY_COUNT(sm->sparse_elements);
			} break;
			case BU_KIND_PARAMETERS: {
				ctx->ui_read_params = barrier_offset != 0;
				buffer = cs->shared_ubo;
			} break;
			case BU_KIND_RF_DATA: {
				if (cs->rf_raw_size != uc->size ||
				    !uv4_equal(cs->dec_data_dim, bp->dec_data_dim))
				{
					alloc_shader_storage(ctx, uc->size, arena);
				}
				buffer = cs->raw_data_ssbo;
			} break;
			default: INVALID_CODE_PATH; break;
			}

			if (tex_1d) {
				glTextureSubImage1D(tex_1d, 0, 0, tex_element_count, tex_format,
				                    tex_type, (u8 *)sm + uc->shared_memory_offset);
			}

			if (buffer) {
				glNamedBufferSubData(buffer, 0, uc->size,
				                     (u8 *)sm + uc->shared_memory_offset);
			}
		} break;
		case BW_COMPUTE: {
			atomic_store(&cs->processing_compute, 1);
			start_renderdoc_capture(gl_context);

			BeamformComputeFrame *frame = work->frame;
			uv3 try_dim = make_valid_test_dim(bp->output_points.xyz);
			if (!uv3_equal(try_dim, frame->frame.dim))
				alloc_beamform_frame(&ctx->gl, &frame->frame, &frame->stats, try_dim,
				                     s8("Beamformed_Data"), arena);

			if (bp->output_points.w > 1) {
				if (!uv3_equal(try_dim, ctx->averaged_frames[0].frame.dim)) {
					alloc_beamform_frame(&ctx->gl, &ctx->averaged_frames[0].frame,
					                     &ctx->averaged_frames[0].stats,
					                     try_dim, s8("Averaged Frame"), arena);
					alloc_beamform_frame(&ctx->gl, &ctx->averaged_frames[1].frame,
					                     &ctx->averaged_frames[1].stats,
					                     try_dim, s8("Averaged Frame"), arena);
				}
			}

			frame->in_flight = 1;
			frame->frame.min_coordinate = bp->output_min_coordinate;
			frame->frame.max_coordinate = bp->output_max_coordinate;
			frame->frame.das_shader_id  = bp->das_shader_id;
			frame->frame.compound_count = bp->dec_data_dim.z;

			b32 did_sum_shader = 0;
			u32 stage_count = sm->compute_stages_count;
			ComputeShaderID *stages = sm->compute_stages;
			for (u32 i = 0; i < stage_count; i++) {
				did_sum_shader |= stages[i] == CS_SUM;
				frame->stats.timer_active[stages[i]] = 1;
				glBeginQuery(GL_TIME_ELAPSED, frame->stats.timer_ids[stages[i]]);
				do_compute_shader(ctx, arena, frame, stages[i]);
				glEndQuery(GL_TIME_ELAPSED);
			}
			/* NOTE(rnp): block until work completes so that we can record timings */
			glFinish();
			cs->processing_progress = 1;

			for (u32 i = 0; i < ARRAY_COUNT(frame->stats.timer_ids); i++) {
				u64 ns = 0;
				if (frame->stats.timer_active[i]) {
					glGetQueryObjectui64v(frame->stats.timer_ids[i],
					                      GL_QUERY_RESULT, &ns);
					frame->stats.timer_active[i] = 0;
				}
				frame->stats.times[i] = (f32)ns / 1e9;
			}

			if (did_sum_shader) {
				u32 aframe_index = (ctx->averaged_frame_index %
				                    ARRAY_COUNT(ctx->averaged_frames));
				ctx->averaged_frames[aframe_index].image_plane_tag  = frame->image_plane_tag;
				ctx->averaged_frames[aframe_index].ready_to_present = 1;
				/* TODO(rnp): not really sure what to do here */
				mem_copy(&ctx->averaged_frames[aframe_index].stats.times,
				         &frame->stats.times, sizeof(frame->stats.times));
				atomic_inc(&ctx->averaged_frame_index, 1);
			}
			frame->ready_to_present = 1;
			cs->processing_compute  = 0;

			end_renderdoc_capture(gl_context);
		} break;
		case BW_SAVE_FRAME: {
			BeamformComputeFrame *frame = work->output_frame_ctx.frame;
			if (frame->ready_to_present) {
				export_frame(ctx, work->output_frame_ctx.file_handle, &frame->frame);
			} else {
				/* TODO(rnp): should we handle this? */
				INVALID_CODE_PATH;
			}
		} break;
		default: INVALID_CODE_PATH; break;
		}

		if (can_commit) {
			if (work->completion_barrier) {
				i32 *value = (i32 *)(barrier_offset + work->completion_barrier);
				ctx->os.wake_waiters(value);
			}
			beamform_work_queue_pop_commit(q);
			work = beamform_work_queue_pop(q);
		}
	}
}
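/* NOTE: completion_barrier is stored as an offset rather than a pointer so the
 * same work item layout works for both queues: the shared memory queue is
 * processed with barrier_offset == (iz)ctx->shared_memory and offsets relative
 * to that mapping, while the internal queue passes barrier_offset == 0 and
 * absolute addresses. Summing the two recovers a usable i32 * in either case
 * before waking any waiters. */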
DEBUG_EXPORT BEAMFORMER_COMPUTE_SETUP_FN(beamformer_compute_setup)
{
	BeamformerCtx          *ctx = (BeamformerCtx *)user_context;
	BeamformerSharedMemory *sm  = ctx->shared_memory;
	ComputeShaderCtx       *cs  = &ctx->csctx;

	glCreateBuffers(1, &cs->shared_ubo);
	glNamedBufferStorage(cs->shared_ubo, sizeof(sm->parameters), 0, GL_DYNAMIC_STORAGE_BIT);

	glCreateTextures(GL_TEXTURE_1D, 1, &cs->channel_mapping_texture);
	glCreateTextures(GL_TEXTURE_1D, 1, &cs->sparse_elements_texture);
	glCreateTextures(GL_TEXTURE_1D, 1, &cs->focal_vectors_texture);
	glTextureStorage1D(cs->channel_mapping_texture, 1, GL_R16I,  ARRAY_COUNT(sm->channel_mapping));
	glTextureStorage1D(cs->sparse_elements_texture, 1, GL_R16I,  ARRAY_COUNT(sm->sparse_elements));
	glTextureStorage1D(cs->focal_vectors_texture,   1, GL_RG32F, ARRAY_COUNT(sm->focal_vectors));

	LABEL_GL_OBJECT(GL_TEXTURE, cs->channel_mapping_texture, s8("Channel_Mapping"));
	LABEL_GL_OBJECT(GL_TEXTURE, cs->focal_vectors_texture,   s8("Focal_Vectors"));
	LABEL_GL_OBJECT(GL_TEXTURE, cs->sparse_elements_texture, s8("Sparse_Elements"));
	LABEL_GL_OBJECT(GL_BUFFER,  cs->shared_ubo,              s8("Beamformer_Parameters"));
}

DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute)
{
	BeamformerCtx *ctx = (BeamformerCtx *)user_context;
	complete_queue(ctx, &ctx->shared_memory->external_work_queue, arena, gl_context, (iz)ctx->shared_memory);
	complete_queue(ctx, ctx->beamform_work_queue, arena, gl_context, 0);
}
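/* NOTE: a sketch of how a client process might feed the external queue,
 * assuming the shared memory block is already mapped. The helper and the
 * `raw_data_sync` field name are illustrative, not a fixed API: */
#if 0
static void
push_rf_upload(BeamformerSharedMemory *sm, u32 size, iz data_offset)
{
	BeamformWork *work = beamform_work_queue_push(&sm->external_work_queue);
	if (work) {
		work->type = BW_UPLOAD_BUFFER;
		work->upload_context.kind                 = BU_KIND_RF_DATA;
		work->upload_context.size                 = size;
		work->upload_context.shared_memory_offset = data_offset;
		/* offset (relative to sm) of an i32 the client sleeps on until the
		 * compute thread wakes it in complete_queue() */
		work->completion_barrier = offsetof(BeamformerSharedMemory, raw_data_sync);
		beamform_work_queue_push_commit(&sm->external_work_queue);
	}
}
#endif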
#include "ui.c"

DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
{
	dt_for_frame = GetFrameTime();

	if (IsWindowResized()) {
		ctx->window_size.h = GetScreenHeight();
		ctx->window_size.w = GetScreenWidth();
	}

	if (input->executable_reloaded) {
		ui_init(ctx, ctx->ui_backing_store);
		DEBUG_DECL(start_frame_capture = ctx->os.start_frame_capture);
		DEBUG_DECL(end_frame_capture   = ctx->os.end_frame_capture);
	}

	BeamformerParameters *bp = &ctx->shared_memory->parameters;
	if (ctx->shared_memory->dispatch_compute_sync) {
		ImagePlaneTag current_plane = ctx->shared_memory->current_image_plane;
		atomic_store(&ctx->shared_memory->dispatch_compute_sync, 0);
		BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue);
		if (work) {
			if (fill_frame_compute_work(ctx, work, current_plane))
				beamform_work_queue_push_commit(ctx->beamform_work_queue);

			if (ctx->shared_memory->export_next_frame) {
				BeamformWork *export = beamform_work_queue_push(ctx->beamform_work_queue);
				if (export) {
					/* TODO: we don't really want the beamformer opening/closing files */
					iptr f = ctx->os.open_for_write(ctx->shared_memory->export_pipe_name);
					export->type = BW_SAVE_FRAME;
					export->output_frame_ctx.file_handle = f;
					if (bp->output_points.w > 1) {
						u32 a_index = !(ctx->averaged_frame_index %
						                ARRAY_COUNT(ctx->averaged_frames));
						BeamformComputeFrame *aframe = ctx->averaged_frames + a_index;
						export->output_frame_ctx.frame = aframe;
					} else {
						export->output_frame_ctx.frame = work->frame;
					}
					beamform_work_queue_push_commit(ctx->beamform_work_queue);
				}
				ctx->shared_memory->export_next_frame = 0;
			}

			ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
		}
	}

	if (ctx->start_compute) {
		if (ctx->beamform_frames[ctx->display_frame_index].ready_to_present) {
			BeamformWork *work  = beamform_work_queue_push(ctx->beamform_work_queue);
			ImagePlaneTag plane = ctx->beamform_frames[ctx->display_frame_index].image_plane_tag;
			if (fill_frame_compute_work(ctx, work, plane)) {
				beamform_work_queue_push_commit(ctx->beamform_work_queue);
				ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
				ctx->start_compute = 0;
			}
		}
	}

	ComputeFrameIterator cfi = compute_frame_iterator(ctx, ctx->display_frame_index,
	                                                  ctx->next_render_frame_index - ctx->display_frame_index);
	for (BeamformComputeFrame *frame = frame_next(&cfi); frame; frame = frame_next(&cfi)) {
		if (frame->in_flight && frame->ready_to_present) {
			frame->in_flight         = 0;
			ctx->display_frame_index = frame - cfi.frames;
		}
	}

	if (ctx->start_compute) {
		ctx->start_compute = 0;
		ctx->os.wake_waiters(&ctx->os.compute_worker.sync_variable);
	}

	BeamformComputeFrame *frame_to_draw;
	if (bp->output_points.w > 1) {
		u32 a_index   = !(ctx->averaged_frame_index % ARRAY_COUNT(ctx->averaged_frames));
		frame_to_draw = ctx->averaged_frames + a_index;
	} else {
		frame_to_draw = ctx->beamform_frames + ctx->display_frame_index;
	}

	draw_ui(ctx, input, frame_to_draw->ready_to_present ? &frame_to_draw->frame : 0,
	        frame_to_draw->image_plane_tag, &frame_to_draw->stats);

	ctx->frame_view_render_context.updated = 0;

	if (WindowShouldClose())
		ctx->should_exit = 1;
}
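/* NOTE: lifecycle of a beamform frame, as driven by the code above:
 *   fill_frame_compute_work():  ready_to_present = 0, id assigned
 *   complete_queue(BW_COMPUTE): in_flight = 1 ... ready_to_present = 1
 *   beamformer_frame_step():    in_flight && ready_to_present -> frame becomes
 *                               the display frame (in_flight cleared,
 *                               display_frame_index updated)
 * the averaged frames use the same flags but are double buffered: the compute
 * side writes index (averaged_frame_index % 2) while the draw and export sides
 * read the most recently completed buffer, !(averaged_frame_index % 2). */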