beamformer.c (29288B)
/* See LICENSE for license details. */
#include "beamformer.h"

static f32 dt_for_frame;
static u32 cycle_t;

#ifndef _DEBUG
#define start_renderdoc_capture(...)
#define end_renderdoc_capture(...)
#else
static renderdoc_start_frame_capture_fn *start_frame_capture;
static renderdoc_end_frame_capture_fn *end_frame_capture;
#define start_renderdoc_capture(gl) if (start_frame_capture) start_frame_capture(gl, 0)
#define end_renderdoc_capture(gl) if (end_frame_capture) end_frame_capture(gl, 0)
#endif

static iz
decoded_data_size(ComputeShaderCtx *cs)
{
	uv4 dim = cs->dec_data_dim;
	iz result = 2 * sizeof(f32) * dim.x * dim.y * dim.z;
	return result;
}

static uv3
make_valid_test_dim(uv3 in)
{
	uv3 result;
	result.x = MAX(in.x, 1);
	result.y = MAX(in.y, 1);
	result.z = MAX(in.z, 1);
	return result;
}

static BeamformFrameIterator
beamform_frame_iterator(BeamformerCtx *ctx, u32 start_index, u32 needed_frames)
{
	start_index = start_index % ARRAY_COUNT(ctx->beamform_frames);

	BeamformFrameIterator result;
	result.frames        = ctx->beamform_frames;
	result.offset        = start_index;
	result.capacity      = ARRAY_COUNT(ctx->beamform_frames);
	result.cursor        = 0;
	result.needed_frames = needed_frames;
	return result;
}

static BeamformFrame *
frame_next(BeamformFrameIterator *bfi)
{
	BeamformFrame *result = 0;
	if (bfi->cursor != bfi->needed_frames) {
		u32 index = (bfi->offset + bfi->cursor++) % bfi->capacity;
		result    = bfi->frames + index;
	}
	return result;
}

static void
alloc_beamform_frame(GLParams *gp, BeamformFrame *out, ComputeShaderStats *out_stats,
                     uv3 out_dim, s8 name, Arena arena)
{
	out->ready_to_present = 0;

	out->dim.x = MAX(1, round_down_power_of_2(ORONE(out_dim.x)));
	out->dim.y = MAX(1, round_down_power_of_2(ORONE(out_dim.y)));
	out->dim.z = MAX(1, round_down_power_of_2(ORONE(out_dim.z)));

	if (gp) {
		out->dim.x = MIN(out->dim.x, gp->max_3d_texture_dim);
		out->dim.y = MIN(out->dim.y, gp->max_3d_texture_dim);
		out->dim.z = MIN(out->dim.z, gp->max_3d_texture_dim);
	}

	/* NOTE: allocate storage for beamformed output data;
	 * this is shared between compute and fragment shaders */
	u32 max_dim = MAX(out->dim.x, MAX(out->dim.y, out->dim.z));
	out->mips   = ctz_u32(max_dim) + 1;

	Stream label = arena_stream(&arena);
	stream_append_s8(&label, name);
	stream_append_byte(&label, '[');
	stream_append_hex_u64(&label, out->id);
	stream_append_byte(&label, ']');

	glDeleteTextures(1, &out->texture);
	glCreateTextures(GL_TEXTURE_3D, 1, &out->texture);
	glTextureStorage3D(out->texture, out->mips, GL_RG32F, out->dim.x, out->dim.y, out->dim.z);
	LABEL_GL_OBJECT(GL_TEXTURE, out->texture, stream_to_s8(&label));

	if (out_stats) {
		glDeleteQueries(ARRAY_COUNT(out_stats->timer_ids), out_stats->timer_ids);
		glCreateQueries(GL_TIME_ELAPSED, ARRAY_COUNT(out_stats->timer_ids), out_stats->timer_ids);
	}
}

static void
alloc_shader_storage(BeamformerCtx *ctx, Arena a)
{
	ComputeShaderCtx     *cs = &ctx->csctx;
	BeamformerParameters *bp = &ctx->params->raw;

	uv4 dec_data_dim = bp->dec_data_dim;
	u32 rf_raw_size  = ctx->params->raw_data_size;
	cs->dec_data_dim = dec_data_dim;
	cs->rf_raw_size  = rf_raw_size;

	glDeleteBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
	glCreateBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);

	i32 storage_flags = GL_DYNAMIC_STORAGE_BIT;
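	/* NOTE: the raw-data upload path is vendor dependent (see the two vendor switches
	 * below): AMD/ARM/Intel keep the raw SSBO persistently mapped so the reader thread
	 * can write into it directly, while the NVIDIA path stages data in a CPU arena and
	 * registers the SSBOs with CUDA */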
	switch (ctx->gl.vendor_id) {
	case GL_VENDOR_AMD:
	case GL_VENDOR_ARM:
	case GL_VENDOR_INTEL:
		if (cs->raw_data_ssbo)
			glUnmapNamedBuffer(cs->raw_data_ssbo);
		storage_flags |= GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT;
	case GL_VENDOR_NVIDIA:
		/* NOTE: register_cuda_buffers will handle the updated ssbo */
		break;
	}

	glDeleteBuffers(1, &cs->raw_data_ssbo);
	glCreateBuffers(1, &cs->raw_data_ssbo);
	glNamedBufferStorage(cs->raw_data_ssbo, rf_raw_size, 0, storage_flags);
	LABEL_GL_OBJECT(GL_BUFFER, cs->raw_data_ssbo, s8("Raw_RF_SSBO"));

	iz rf_decoded_size = decoded_data_size(cs);
	Stream label = stream_alloc(&a, 256);
	stream_append_s8(&label, s8("Decoded_RF_SSBO_"));
	u32 s_widx = label.widx;
	for (u32 i = 0; i < ARRAY_COUNT(cs->rf_data_ssbos); i++) {
		glNamedBufferStorage(cs->rf_data_ssbos[i], rf_decoded_size, 0, 0);
		stream_append_u64(&label, i);
		s8 rf_label = stream_to_s8(&label);
		LABEL_GL_OBJECT(GL_BUFFER, cs->rf_data_ssbos[i], rf_label);
		stream_reset(&label, s_widx);
	}

	i32 map_flags = GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT|GL_MAP_UNSYNCHRONIZED_BIT;
	switch (ctx->gl.vendor_id) {
	case GL_VENDOR_AMD:
	case GL_VENDOR_ARM:
	case GL_VENDOR_INTEL:
		cs->raw_data_arena.beg = glMapNamedBufferRange(cs->raw_data_ssbo, 0,
		                                               rf_raw_size, map_flags);
		cs->raw_data_arena.end = cs->raw_data_arena.beg + rf_raw_size;
		break;
	case GL_VENDOR_NVIDIA:
		cs->raw_data_arena = ctx->os.alloc_arena(cs->raw_data_arena, rf_raw_size);
		ctx->cuda_lib.register_cuda_buffers(cs->rf_data_ssbos, ARRAY_COUNT(cs->rf_data_ssbos),
		                                    cs->raw_data_ssbo);
		ctx->cuda_lib.init_cuda_configuration(bp->rf_raw_dim.E, bp->dec_data_dim.E,
		                                      bp->channel_mapping);
		break;
	}

	/* NOTE: store hadamard in GPU once; it won't change for a particular imaging session */
	iz hadamard_elements = dec_data_dim.z * dec_data_dim.z;
	i32 *hadamard = alloc(&a, i32, hadamard_elements);
	i32 *tmp      = alloc(&a, i32, hadamard_elements);
	fill_hadamard_transpose(hadamard, tmp, dec_data_dim.z);
	glDeleteTextures(1, &cs->hadamard_texture);
	glCreateTextures(GL_TEXTURE_2D, 1, &cs->hadamard_texture);
	glTextureStorage2D(cs->hadamard_texture, 1, GL_R8I, dec_data_dim.z, dec_data_dim.z);
	glTextureSubImage2D(cs->hadamard_texture, 0, 0, 0, dec_data_dim.z, dec_data_dim.z,
	                    GL_RED_INTEGER, GL_INT, hadamard);
	LABEL_GL_OBJECT(GL_TEXTURE, cs->hadamard_texture, s8("Hadamard_Matrix"));
}

static BeamformWork *
beamform_work_queue_pop(BeamformWorkQueue *q)
{
	BeamformWork *result = 0;

	static_assert(ISPOWEROF2(ARRAY_COUNT(q->work_items)), "queue capacity must be a power of 2");
	u64 val  = atomic_load(&q->queue);
	u64 mask = ARRAY_COUNT(q->work_items) - 1;
	u32 widx = val & mask;
	u32 ridx = val >> 32 & mask;

	if (ridx != widx)
		result = q->work_items + ridx;

	return result;
}

static void
beamform_work_queue_pop_commit(BeamformWorkQueue *q)
{
	atomic_add(&q->queue, 0x100000000ULL);
}

DEBUG_EXPORT BEAMFORM_WORK_QUEUE_PUSH_FN(beamform_work_queue_push)
{
	BeamformWork *result = 0;

	static_assert(ISPOWEROF2(ARRAY_COUNT(q->work_items)), "queue capacity must be a power of 2");
	u64 val  = atomic_load(&q->queue);
	u64 mask = ARRAY_COUNT(q->work_items) - 1;
	u32 widx = val & mask;
	u32 ridx = val >> 32 & mask;
	u32 next = (widx + 1) & mask;

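	/* NOTE: the write index lives in the low 32 bits of q->queue and the read index in
	 * the high 32 bits so both can be advanced with a single atomic add; the top bit of
	 * the write half appears to be used as a flag elsewhere and is cleared before pushing */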
	if (val & 0x80000000)
		atomic_and(&q->queue, ~0x80000000);

	if (next != ridx) {
		result = q->work_items + widx;
		zero_struct(result);
	}

	return result;
}

DEBUG_EXPORT BEAMFORM_WORK_QUEUE_PUSH_COMMIT_FN(beamform_work_queue_push_commit)
{
	atomic_add(&q->queue, 1);
}

static b32
fill_frame_compute_work(BeamformerCtx *ctx, BeamformWork *work)
{
	b32 result = 0;
	if (work) {
		result = 1;
		u32 frame_id    = atomic_inc(&ctx->next_render_frame_index, 1);
		u32 frame_index = frame_id % ARRAY_COUNT(ctx->beamform_frames);
		work->type        = BW_COMPUTE;
		work->frame.store = ctx->beamform_frames + frame_index;
		work->frame.stats = ctx->beamform_frame_compute_stats + frame_index;
		work->frame.store->ready_to_present = 0;
		work->frame.store->id = frame_id;
	}
	return result;
}

static void
export_frame(BeamformerCtx *ctx, iptr handle, BeamformFrame *frame)
{
	uv3 dim = frame->dim;
	iz out_size = dim.x * dim.y * dim.z * 2 * sizeof(f32);
	ctx->export_buffer = ctx->os.alloc_arena(ctx->export_buffer, out_size);
	glGetTextureImage(frame->texture, 0, GL_RG, GL_FLOAT, out_size, ctx->export_buffer.beg);
	s8 raw = {.len = out_size, .data = ctx->export_buffer.beg};
	if (!ctx->os.write_file(handle, raw))
		ctx->os.write_file(ctx->os.stderr, s8("failed to export frame\n"));
	ctx->os.close(handle);
}

static void
do_sum_shader(ComputeShaderCtx *cs, u32 *in_textures, u32 in_texture_count, f32 in_scale,
              u32 out_texture, uv3 out_data_dim)
{
	/* NOTE: zero output before summing */
	glClearTexImage(out_texture, 0, GL_RED, GL_FLOAT, 0);
	glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT);

	glBindImageTexture(0, out_texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F);
	glUniform1f(cs->sum_prescale_id, in_scale);
	for (u32 i = 0; i < in_texture_count; i++) {
		glBindImageTexture(1, in_textures[i], 0, GL_TRUE, 0, GL_READ_ONLY, GL_RG32F);
		glDispatchCompute(ORONE(out_data_dim.x / 32),
		                  ORONE(out_data_dim.y),
		                  ORONE(out_data_dim.z / 32));
		glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
	}
}

struct compute_cursor {
	iv3 cursor;
	iv3 dispatch;
	iv3 target;
	u32 points_per_dispatch;
	u32 completed_points;
	u32 total_points;
};

static struct compute_cursor
start_compute_cursor(uv3 dim, u32 max_points)
{
	struct compute_cursor result = {0};
	u32 invocations_per_dispatch = DAS_LOCAL_SIZE_X * DAS_LOCAL_SIZE_Y * DAS_LOCAL_SIZE_Z;

	result.dispatch.y = MIN(max_points / invocations_per_dispatch, MAX(dim.y / DAS_LOCAL_SIZE_Y, 1));

	u32 remaining     = max_points / result.dispatch.y;
	result.dispatch.x = MIN(remaining / invocations_per_dispatch, MAX(dim.x / DAS_LOCAL_SIZE_X, 1));
	result.dispatch.z = MIN(remaining / (invocations_per_dispatch * result.dispatch.x),
	                        MAX(dim.z / DAS_LOCAL_SIZE_Z, 1));

	result.target.x = MAX(dim.x / result.dispatch.x / DAS_LOCAL_SIZE_X, 1);
	result.target.y = MAX(dim.y / result.dispatch.y / DAS_LOCAL_SIZE_Y, 1);
	result.target.z = MAX(dim.z / result.dispatch.z / DAS_LOCAL_SIZE_Z, 1);

	result.points_per_dispatch = 1;
	result.points_per_dispatch *= result.dispatch.x * DAS_LOCAL_SIZE_X;
	result.points_per_dispatch *= result.dispatch.y * DAS_LOCAL_SIZE_Y;
	result.points_per_dispatch *= result.dispatch.z * DAS_LOCAL_SIZE_Z;

	result.total_points = dim.x * dim.y * dim.z;

	return result;
}

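/* NOTE: the cursor above splits the DAS pass into multiple dispatches, each covering at
 * most max_points output voxels; step_compute_cursor() walks the output grid in x, then
 * y, then z and returns the voxel offset (in elements) for the next dispatch */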
static iv3
step_compute_cursor(struct compute_cursor *cursor)
{
	cursor->cursor.x += 1;
	if (cursor->cursor.x >= cursor->target.x) {
		cursor->cursor.x  = 0;
		cursor->cursor.y += 1;
		if (cursor->cursor.y >= cursor->target.y) {
			cursor->cursor.y  = 0;
			cursor->cursor.z += 1;
		}
	}

	cursor->completed_points += cursor->points_per_dispatch;

	iv3 result = cursor->cursor;
	result.x *= cursor->dispatch.x * DAS_LOCAL_SIZE_X;
	result.y *= cursor->dispatch.y * DAS_LOCAL_SIZE_Y;
	result.z *= cursor->dispatch.z * DAS_LOCAL_SIZE_Z;

	return result;
}

static b32
compute_cursor_finished(struct compute_cursor *cursor)
{
	b32 result = cursor->completed_points >= cursor->total_points;
	return result;
}

static void
do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformFrame *frame, ComputeShaderID shader)
{
	ComputeShaderCtx *csctx = &ctx->csctx;

	glUseProgram(csctx->programs[shader]);

	u32 output_ssbo_idx = !csctx->last_output_ssbo_index;
	u32 input_ssbo_idx  = csctx->last_output_ssbo_index;

	switch (shader) {
	case CS_DECODE:
	case CS_DECODE_FLOAT:
	case CS_DECODE_FLOAT_COMPLEX:
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo);
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
		glBindImageTexture(0, csctx->hadamard_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I);
		glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32),
		                  ORONE(csctx->dec_data_dim.y / 32),
		                  ORONE(csctx->dec_data_dim.z));
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_CUDA_DECODE:
		ctx->cuda_lib.cuda_decode(0, output_ssbo_idx, 0);
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_CUDA_HILBERT:
		ctx->cuda_lib.cuda_hilbert(input_ssbo_idx, output_ssbo_idx);
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_DEMOD:
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
		glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32),
		                  ORONE(csctx->dec_data_dim.y / 32),
		                  ORONE(csctx->dec_data_dim.z));
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_MIN_MAX: {
		u32 texture = frame->texture;
		for (u32 i = 1; i < frame->mips; i++) {
			glBindImageTexture(0, texture, i - 1, GL_TRUE, 0, GL_READ_ONLY,  GL_RG32F);
			glBindImageTexture(1, texture, i - 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
			glUniform1i(csctx->mips_level_id, i);

			u32 width  = frame->dim.x >> i;
			u32 height = frame->dim.y >> i;
			u32 depth  = frame->dim.z >> i;
			glDispatchCompute(ORONE(width / 32), ORONE(height), ORONE(depth / 32));
			glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
		}
	} break;
	case CS_DAS: {
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
		glBindImageTexture(0, frame->texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);

#if 1
		/* TODO(rnp): compute max_points_per_dispatch based on something like a
		 * transmit_count * channel_count product */
		u32 max_points_per_dispatch = KB(64);
		struct compute_cursor cursor = start_compute_cursor(frame->dim, max_points_per_dispatch);
		f32 percent_per_step = (f32)cursor.points_per_dispatch / (f32)cursor.total_points;
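		/* NOTE: progress starts one step below zero so the increment at the top of the
		 * first loop iteration reports 0 before any work has been dispatched */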
		csctx->processing_progress = -percent_per_step;
		for (iv3 offset = {0};
		     !compute_cursor_finished(&cursor);
		     offset = step_compute_cursor(&cursor))
		{
			csctx->processing_progress += percent_per_step;
			/* IMPORTANT(rnp): prevents OS from coalescing and killing our shader */
			glFinish();
			glUniform3iv(csctx->voxel_offset_id, 1, offset.E);
			glDispatchCompute(cursor.dispatch.x, cursor.dispatch.y, cursor.dispatch.z);
		}
#else
		/* NOTE(rnp): use this for testing tiling code. The performance of the above path
		 * should be the same as this path if everything is working correctly */
		iv3 compute_dim_offset = {0};
		glUniform3iv(csctx->voxel_offset_id, 1, compute_dim_offset.E);
		glDispatchCompute(ORONE(frame->dim.x / 32),
		                  ORONE(frame->dim.y),
		                  ORONE(frame->dim.z / 32));
#endif
		glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT|GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
	} break;
	case CS_SUM: {
		u32 aframe_index = ctx->averaged_frame_index % ARRAY_COUNT(ctx->averaged_frames);
		BeamformFrame *aframe = ctx->averaged_frames + aframe_index;
		aframe->ready_to_present = 0;
		/* TODO(rnp): hack we need a better way of specifying which frames to sum;
		 * this is fine for rolling averaging but what if we want to do something else */
		ASSERT(frame >= ctx->beamform_frames);
		ASSERT(frame < ctx->beamform_frames + ARRAY_COUNT(ctx->beamform_frames));
		u32 base_index   = (u32)(frame - ctx->beamform_frames);
		u32 to_average   = ctx->params->raw.output_points.w;
		u32 frame_count  = 0;
		u32 *in_textures = alloc(&arena, u32, MAX_BEAMFORMED_SAVED_FRAMES);
		BeamformFrameIterator bfi = beamform_frame_iterator(ctx, 1 + base_index - to_average,
		                                                    to_average);
		for (BeamformFrame *it = frame_next(&bfi); it; it = frame_next(&bfi))
			in_textures[frame_count++] = it->texture;

		ASSERT(to_average == frame_count);

		do_sum_shader(csctx, in_textures, frame_count, 1 / (f32)frame_count,
		              aframe->texture, aframe->dim);
		aframe->min_coordinate = frame->min_coordinate;
		aframe->max_coordinate = frame->max_coordinate;
		aframe->compound_count = frame->compound_count;
		aframe->das_shader_id  = frame->das_shader_id;
	} break;
	default: ASSERT(0);
	}
}

static u32
compile_shader(OS *os, Arena a, u32 type, s8 shader, s8 name)
{
	u32 sid = glCreateShader(type);
	glShaderSource(sid, 1, (const char **)&shader.data, (int *)&shader.len);
	glCompileShader(sid);

	i32 res = 0;
	glGetShaderiv(sid, GL_COMPILE_STATUS, &res);

	if (res == GL_FALSE) {
		Stream buf = arena_stream(&a);
		stream_append_s8(&buf, name);
		stream_append_s8(&buf, s8(": failed to compile\n"));

		i32 len = 0, out_len = 0;
		glGetShaderiv(sid, GL_INFO_LOG_LENGTH, &len);
		glGetShaderInfoLog(sid, len, &out_len, (char *)(buf.data + buf.widx));
		stream_commit(&buf, out_len);
		glDeleteShader(sid);
		os->write_file(os->stderr, stream_to_s8(&buf));

		sid = 0;
	}

	return sid;
}

static u32
link_program(OS *os, Arena a, u32 shader_id)
{
	i32 success = 0;
	u32 result  = glCreateProgram();
	glAttachShader(result, shader_id);
	glLinkProgram(result);
	glGetProgramiv(result, GL_LINK_STATUS, &success);
	if (success == GL_FALSE) {
		i32 len = 0;
		Stream buf = arena_stream(&a);
		stream_append_s8(&buf, s8("shader link error: "));
		glGetProgramInfoLog(result, buf.cap - buf.widx, &len, (c8 *)(buf.data + buf.widx));
		stream_reset(&buf, len);
		stream_append_byte(&buf, '\n');
		os->write_file(os->stderr, stream_to_s8(&buf));
		glDeleteProgram(result);
		result = 0;
	}
	return result;
}

static s8
push_compute_shader_header(Arena *a, ComputeShaderID shader)
{
	s8 result = {.data = a->beg};

#define X(name, type, size, gltype, glsize, comment) "\t" #gltype " " #name #glsize "; " comment "\n"
	push_s8(a, s8("#version 460 core\n\n"
	              "layout(std140, binding = 0) uniform parameters {\n"
	              BEAMFORMER_PARAMS_HEAD
	              BEAMFORMER_UI_PARAMS
	              BEAMFORMER_PARAMS_TAIL
	              "};\n\n"));
#undef X

	switch (shader) {
	case CS_DAS: {
		push_s8(a, s8("layout("
		              "local_size_x = " str(DAS_LOCAL_SIZE_X) ", "
		              "local_size_y = " str(DAS_LOCAL_SIZE_Y) ", "
		              "local_size_z = " str(DAS_LOCAL_SIZE_Z) ") "
		              "in;\n\n"));
#define X(type, id, pretty, fixed_tx) push_s8(a, s8("#define DAS_ID_" #type " " #id "\n"));
		DAS_TYPES
#undef X
	} break;
	case CS_DECODE_FLOAT:
	case CS_DECODE_FLOAT_COMPLEX: {
		if (shader == CS_DECODE_FLOAT) push_s8(a, s8("#define INPUT_DATA_TYPE_FLOAT\n\n"));
		else                           push_s8(a, s8("#define INPUT_DATA_TYPE_FLOAT_COMPLEX\n\n"));
	} /* FALLTHROUGH */
	case CS_DECODE: {
#define X(type, id, pretty) push_s8(a, s8("#define DECODE_MODE_" #type " " #id "\n"));
		DECODE_TYPES
#undef X
	} break;
	default: break;
	}
	s8 end = push_s8(a, s8("\n#line 1\n"));
	result.len = end.data + end.len - result.data;
	return result;
}

static b32
reload_compute_shader(BeamformerCtx *ctx, s8 path, s8 extra, ComputeShaderReloadContext *csr, Arena tmp)
{
	ComputeShaderCtx *cs = &ctx->csctx;
	b32 result = 0;

	/* NOTE: arena works as stack (since everything here is 1 byte aligned) */
	s8 header = {.data = tmp.beg};
	if (csr->needs_header)
		header = push_compute_shader_header(&tmp, csr->shader);

	s8 shader_text = ctx->os.read_whole_file(&tmp, (c8 *)path.data);
	shader_text.data -= header.len;
	shader_text.len  += header.len;

	if (shader_text.data == header.data) {
		u32 shader_id = compile_shader(&ctx->os, tmp, GL_COMPUTE_SHADER, shader_text, path);
		if (shader_id) {
			u32 new_program = link_program(&ctx->os, tmp, shader_id);
			if (new_program) {
				Stream buf = arena_stream(&tmp);
				stream_append_s8(&buf, s8("loaded: "));
				stream_append_s8(&buf, path);
				stream_append_s8(&buf, extra);
				stream_append_byte(&buf, '\n');
				ctx->os.write_file(ctx->os.stderr, stream_to_s8(&buf));
				glDeleteProgram(cs->programs[csr->shader]);
				cs->programs[csr->shader] = new_program;
				glUseProgram(cs->programs[csr->shader]);
				glBindBufferBase(GL_UNIFORM_BUFFER, 0, cs->shared_ubo);
				LABEL_GL_OBJECT(GL_PROGRAM, cs->programs[csr->shader], csr->label);
				result = 1;
			}
			glDeleteShader(shader_id);
		}
	} else {
		Stream buf = arena_stream(&tmp);
		stream_append_s8(&buf, s8("failed to load: "));
		stream_append_s8(&buf, path);
		stream_append_s8(&buf, extra);
		stream_append_byte(&buf, '\n');
		ctx->os.write_file(ctx->os.stderr, stream_to_s8(&buf));
	}

	return result;
}

DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute)
{
	BeamformerCtx *ctx = (BeamformerCtx *)user_context;
	BeamformWorkQueue *q = ctx->beamform_work_queue;
	BeamformWork *work = beamform_work_queue_pop(q);
	ComputeShaderCtx *cs = &ctx->csctx;

	BeamformerParameters *bp = &ctx->params->raw;

	while (work) {
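		/* NOTE: work that refills its slot in place (e.g. a successful shader reload that
		 * immediately becomes a compute job via fill_frame_compute_work) sets can_commit
		 * to 0 so the item is not popped from the queue this iteration */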
		b32 can_commit = 1;
		switch (work->type) {
		case BW_RELOAD_SHADER: {
			ComputeShaderReloadContext *csr = work->reload_shader_ctx;
			b32 success = reload_compute_shader(ctx, csr->path, s8(""), csr, arena);
			if (csr->shader == CS_DECODE) {
				/* TODO(rnp): think of a better way of doing this */
				csr->shader = CS_DECODE_FLOAT_COMPLEX;
				success &= reload_compute_shader(ctx, csr->path, s8(" (F32C)"), csr, arena);
				csr->shader = CS_DECODE_FLOAT;
				success &= reload_compute_shader(ctx, csr->path, s8(" (F32)"), csr, arena);
				csr->shader = CS_DECODE;
			}

			if (success) {
				if (ctx->csctx.raw_data_ssbo) {
					can_commit = 0;
					fill_frame_compute_work(ctx, work);
				}

				/* TODO(rnp): remove this */
#define X(idx, name) cs->name##_id = glGetUniformLocation(cs->programs[idx], "u_" #name);
				CS_UNIFORMS
#undef X
			}
		} break;
		case BW_LOAD_RF_DATA: {
			if (cs->rf_raw_size != ctx->params->raw_data_size ||
			    !uv4_equal(cs->dec_data_dim, bp->dec_data_dim))
			{
				alloc_shader_storage(ctx, arena);
			}

			void *rf_data_buf = cs->raw_data_arena.beg;
			iz rlen = ctx->os.read_file(work->file_handle, rf_data_buf, cs->rf_raw_size);
			if (rlen != cs->rf_raw_size) {
				stream_append_s8(&ctx->error_stream, s8("Partial Read Occurred: "));
				stream_append_i64(&ctx->error_stream, rlen);
				stream_append_byte(&ctx->error_stream, '/');
				stream_append_i64(&ctx->error_stream, cs->rf_raw_size);
				stream_append_byte(&ctx->error_stream, '\n');
				ctx->os.write_file(ctx->os.stderr, stream_to_s8(&ctx->error_stream));
				ctx->error_stream.widx = 0;
			} else {
				switch (ctx->gl.vendor_id) {
				case GL_VENDOR_AMD:
				case GL_VENDOR_ARM:
				case GL_VENDOR_INTEL:
					break;
				case GL_VENDOR_NVIDIA:
					glNamedBufferSubData(cs->raw_data_ssbo, 0, rlen, rf_data_buf);
				}
			}
			ctx->ready_for_rf = 1;
		} break;
		case BW_COMPUTE: {
			atomic_store(&cs->processing_compute, 1);
			start_renderdoc_capture(gl_context);

			BeamformerWorkFrame *frame = &work->frame;
			if (ctx->params->upload) {
				glNamedBufferSubData(cs->shared_ubo, 0, sizeof(ctx->params->raw),
				                     &ctx->params->raw);
				ctx->params->upload = 0;
			}

			if (cs->programs[CS_DAS])
				glProgramUniform1ui(cs->programs[CS_DAS], cs->cycle_t_id, cycle_t++);

			uv3 try_dim = make_valid_test_dim(ctx->params->raw.output_points.xyz);
			if (!uv3_equal(try_dim, frame->store->dim))
				alloc_beamform_frame(&ctx->gl, frame->store, frame->stats, try_dim,
				                     s8("Beamformed_Data"), arena);

			if (ctx->params->raw.output_points.w > 1) {
				if (!uv3_equal(try_dim, ctx->averaged_frames[0].dim)) {
					alloc_beamform_frame(&ctx->gl, ctx->averaged_frames + 0,
					                     ctx->averaged_frame_compute_stats + 0,
					                     try_dim, s8("Averaged Frame"), arena);
					alloc_beamform_frame(&ctx->gl, ctx->averaged_frames + 1,
					                     ctx->averaged_frame_compute_stats + 1,
					                     try_dim, s8("Averaged Frame"), arena);
				}
			}

			frame->store->in_flight      = 1;
			frame->store->min_coordinate = ctx->params->raw.output_min_coordinate;
			frame->store->max_coordinate = ctx->params->raw.output_max_coordinate;
			frame->store->das_shader_id  = ctx->params->raw.das_shader_id;
			frame->store->compound_count = ctx->params->raw.dec_data_dim.z;

			b32 did_sum_shader = 0;
			u32 stage_count = ctx->params->compute_stages_count;
			ComputeShaderID *stages = ctx->params->compute_stages;
			for (u32 i = 0; i < stage_count; i++) {
				did_sum_shader |= stages[i] == CS_SUM;
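				/* NOTE: each stage runs inside a GL_TIME_ELAPSED query so per-shader
				 * execution times can be read back once the whole frame finishes */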
				frame->stats->timer_active[stages[i]] = 1;
				glBeginQuery(GL_TIME_ELAPSED, frame->stats->timer_ids[stages[i]]);
				do_compute_shader(ctx, arena, frame->store, stages[i]);
				glEndQuery(GL_TIME_ELAPSED);
			}
			/* NOTE(rnp): block until work completes so that we can record timings */
			glFinish();
			cs->processing_progress = 1;

			for (u32 i = 0; i < ARRAY_COUNT(frame->stats->timer_ids); i++) {
				u64 ns = 0;
				if (frame->stats->timer_active[i]) {
					glGetQueryObjectui64v(frame->stats->timer_ids[i],
					                      GL_QUERY_RESULT, &ns);
					frame->stats->timer_active[i] = 0;
				}
				frame->stats->times[i] = (f32)ns / 1e9;
			}

			if (did_sum_shader) {
				u32 aframe_index = (ctx->averaged_frame_index %
				                    ARRAY_COUNT(ctx->averaged_frames));
				ctx->averaged_frames[aframe_index].ready_to_present = 1;
				/* TODO(rnp): not really sure what to do here */
				mem_copy(ctx->averaged_frame_compute_stats[aframe_index].times,
				         frame->stats->times, sizeof(frame->stats->times));
				atomic_inc(&ctx->averaged_frame_index, 1);
			}
			frame->store->ready_to_present = 1;
			cs->processing_compute = 0;

			end_renderdoc_capture(gl_context);
		} break;
		case BW_SAVE_FRAME: {
			BeamformFrame *frame = work->output_frame_ctx.frame.store;
			ASSERT(frame->ready_to_present);
			export_frame(ctx, work->output_frame_ctx.file_handle, frame);
		} break;
		}

		if (can_commit) {
			beamform_work_queue_pop_commit(q);
			work = beamform_work_queue_pop(q);
		}
	}
}

#include "ui.c"

DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
{
	dt_for_frame = GetFrameTime();

	if (IsWindowResized()) {
		ctx->window_size.h = GetScreenHeight();
		ctx->window_size.w = GetScreenWidth();
	}

	if (input->executable_reloaded) {
		ui_init(ctx, ctx->ui_backing_store);
		DEBUG_DECL(start_frame_capture = ctx->os.start_frame_capture);
		DEBUG_DECL(end_frame_capture   = ctx->os.end_frame_capture);
	}

	if (ctx->start_compute && !input->pipe_data_available) {
		if (ctx->beamform_frames[ctx->display_frame_index].ready_to_present) {
			BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue);
			if (fill_frame_compute_work(ctx, work)) {
				beamform_work_queue_push_commit(ctx->beamform_work_queue);
				ctx->os.wake_thread(ctx->os.compute_worker.sync_handle);
				ctx->start_compute = 0;
			}
		}
	}

	BeamformerParameters *bp = &ctx->params->raw;
	if (ctx->ready_for_rf && input->pipe_data_available) {
		BeamformWork *work = beamform_work_queue_push(ctx->beamform_work_queue);
		if (work) {
			ctx->start_compute = 1;
			ctx->ready_for_rf  = 0;

			work->type        = BW_LOAD_RF_DATA;
			work->file_handle = input->pipe_handle;
			beamform_work_queue_push_commit(ctx->beamform_work_queue);

			BeamformWork *compute = beamform_work_queue_push(ctx->beamform_work_queue);
			if (fill_frame_compute_work(ctx, compute))
				beamform_work_queue_push_commit(ctx->beamform_work_queue);

			if (compute && ctx->params->export_next_frame) {
				BeamformWork *export = beamform_work_queue_push(ctx->beamform_work_queue);
				if (export) {
					/* TODO: we don't really want the beamformer opening/closing files */
					iptr f = ctx->os.open_for_write(ctx->params->export_pipe_name);
					export->type = BW_SAVE_FRAME;
					export->output_frame_ctx.file_handle = f;
					if (ctx->params->raw.output_points.w > 1) {
						u32 a_index = !(ctx->averaged_frame_index %
						                ARRAY_COUNT(ctx->averaged_frames));
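						/* NOTE: a_index picks the averaged frame that is not currently
						 * being written, i.e. the most recently completed one */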
						BeamformFrame *aframe = ctx->averaged_frames + a_index;
						ComputeShaderStats *astats = ctx->averaged_frame_compute_stats + a_index;
						export->output_frame_ctx.frame.store = aframe;
						export->output_frame_ctx.frame.stats = astats;
					} else {
						export->output_frame_ctx.frame = compute->frame;
					}
					beamform_work_queue_push_commit(ctx->beamform_work_queue);
				}
				ctx->params->export_next_frame = 0;
			}

			if (ctx->params->upload) {
				/* TODO(rnp): clean this up */
				ctx->ui_read_params = 1;
			}
		}
	}

	BeamformFrameIterator bfi = beamform_frame_iterator(ctx, ctx->display_frame_index,
	                                                    ctx->next_render_frame_index - ctx->display_frame_index);
	for (BeamformFrame *frame = frame_next(&bfi); frame; frame = frame_next(&bfi)) {
		if (frame->in_flight && frame->ready_to_present) {
			frame->in_flight = 0;
			ctx->display_frame_index = frame - bfi.frames;
		}
	}

	if (ctx->start_compute) {
		ctx->start_compute = 0;
		ctx->os.wake_thread(ctx->os.compute_worker.sync_handle);
	}

	BeamformFrame *frame_to_draw;
	ComputeShaderStats *frame_compute_stats;
	if (bp->output_points.w > 1) {
		u32 a_index = !(ctx->averaged_frame_index % ARRAY_COUNT(ctx->averaged_frames));
		frame_to_draw       = ctx->averaged_frames + a_index;
		frame_compute_stats = ctx->averaged_frame_compute_stats + a_index;
	} else {
		frame_to_draw       = ctx->beamform_frames + ctx->display_frame_index;
		frame_compute_stats = ctx->beamform_frame_compute_stats + ctx->display_frame_index;
	}

	draw_ui(ctx, input, frame_to_draw, frame_compute_stats);

	ctx->fsctx.updated = 0;

	if (WindowShouldClose())
		ctx->should_exit = 1;
}