beamformer.c (15828B)
/* See LICENSE for license details. */
#include "beamformer.h"
#include "ui.c"


/* Bytes required for one fully decoded data block: 2 f32 values per sample
 * (presumably real + imaginary components — TODO confirm against shaders)
 * across the full decoded dimensions. */
static size
decoded_data_size(ComputeShaderCtx *cs)
{
	uv4 dim = cs->dec_data_dim;
	size result = 2 * sizeof(f32) * dim.x * dim.y * dim.z;
	return result;
}

/* (Re)allocates the 3D output texture for beamformed data and the raylib
 * render target used to display it. Requested output dimensions are clamped
 * down to powers of two and written back into the shared parameter block so
 * the host side sees the actual sizes used. */
static void
alloc_output_image(BeamformerCtx *ctx)
{
	BeamformerParameters *bp = &ctx->params->raw;
	/* NOTE: round down to powers of two so the mip chain below is exact */
	ctx->out_data_dim.x = round_down_power_of_2(ORONE(bp->output_points.x));
	ctx->out_data_dim.y = round_down_power_of_2(ORONE(bp->output_points.y));
	ctx->out_data_dim.z = round_down_power_of_2(ORONE(bp->output_points.z));
	bp->output_points = ctx->out_data_dim;

	/* NOTE: allocate storage for beamformed output data;
	 * this is shared between compute and fragment shaders */
	uv4 odim = ctx->out_data_dim;
	u32 max_dim = MAX(odim.x, MAX(odim.y, odim.z));
	/* TODO: does this actually matter or is 0 fine? */
	ctx->out_texture_unit = 0;
	/* NOTE: max_dim is a power of two here, so trailing-zero count == log2;
	 * +1 yields the full mip chain depth */
	ctx->out_texture_mips = _tzcnt_u32(max_dim) + 1;
	glActiveTexture(GL_TEXTURE0 + ctx->out_texture_unit);
	glDeleteTextures(1, &ctx->out_texture);
	glGenTextures(1, &ctx->out_texture);
	glBindTexture(GL_TEXTURE_3D, ctx->out_texture);
	glTexStorage3D(GL_TEXTURE_3D, ctx->out_texture_mips, GL_RG32F, odim.x, odim.y, odim.z);

	UnloadRenderTexture(ctx->fsctx.output);
	/* TODO: select odim.x vs odim.y */
	ctx->fsctx.output = LoadRenderTexture(odim.x, odim.z);
	GenTextureMipmaps(&ctx->fsctx.output.texture);
	//SetTextureFilter(ctx->fsctx.output.texture, TEXTURE_FILTER_ANISOTROPIC_8X);
	//SetTextureFilter(ctx->fsctx.output.texture, TEXTURE_FILTER_TRILINEAR);
	SetTextureFilter(ctx->fsctx.output.texture, TEXTURE_FILTER_BILINEAR);
}

/* (Re)allocates all compute-side GPU storage: the multi-frame raw RF ring
 * buffer, the ping-pong decoded-data SSBOs, and the Hadamard decode matrix.
 * Vendor-specific handling: on Intel/AMD the raw buffer is persistently
 * mapped so the CPU writes straight into GL memory; on NVIDIA a CPU-side
 * arena is used and the buffers are registered with the CUDA library.
 * `a` is taken by value, so the Hadamard scratch allocation is transient. */
static void
alloc_shader_storage(BeamformerCtx *ctx, Arena a)
{
	ComputeShaderCtx *cs = &ctx->csctx;
	BeamformerParameters *bp = &ctx->params->raw;
	uv4 dec_data_dim = bp->dec_data_dim;
	uv2 rf_raw_dim = bp->rf_raw_dim;
	ctx->csctx.dec_data_dim = dec_data_dim;
	ctx->csctx.rf_raw_dim = rf_raw_dim;
	size rf_raw_size = rf_raw_dim.x * rf_raw_dim.y * sizeof(i16);
	size rf_decoded_size = decoded_data_size(cs);

	glDeleteBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
	glCreateBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);

	i32 storage_flags = GL_DYNAMIC_STORAGE_BIT;
	switch (ctx->gl_vendor_id) {
	case GL_VENDOR_INTEL:
	case GL_VENDOR_AMD:
		/* NOTE: the old persistent mapping must be released before the
		 * backing buffer is deleted/recreated below */
		if (cs->raw_data_ssbo)
			glUnmapNamedBuffer(cs->raw_data_ssbo);
		storage_flags |= GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT;
		/* fallthrough */
	case GL_VENDOR_NVIDIA:
		/* NOTE: register_cuda_buffers will handle the updated ssbo */
		break;
	}

	/* NOTE: one raw-RF slot per in-flight fence; the buffer is used as a ring */
	size full_rf_buf_size = ARRAY_COUNT(cs->raw_data_fences) * rf_raw_size;
	glDeleteBuffers(1, &cs->raw_data_ssbo);
	glCreateBuffers(1, &cs->raw_data_ssbo);
	glNamedBufferStorage(cs->raw_data_ssbo, full_rf_buf_size, 0, storage_flags);

	for (u32 i = 0; i < ARRAY_COUNT(cs->rf_data_ssbos); i++)
		glNamedBufferStorage(cs->rf_data_ssbos[i], rf_decoded_size, 0, 0);

	i32 map_flags = GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT|GL_MAP_UNSYNCHRONIZED_BIT;
	switch (ctx->gl_vendor_id) {
	case GL_VENDOR_INTEL:
	case GL_VENDOR_AMD:
		/* NOTE: CPU writes land directly in the GL buffer via this mapping */
		cs->raw_data_arena.beg = glMapNamedBufferRange(cs->raw_data_ssbo, 0,
		                                               full_rf_buf_size, map_flags);
		break;
	case GL_VENDOR_NVIDIA:
		/* NOTE: staging arena in system memory; data is uploaded with
		 * glNamedBufferSubData in do_beamformer() */
		cs->raw_data_arena = os_alloc_arena(cs->raw_data_arena, full_rf_buf_size);
		ctx->cuda_lib.register_cuda_buffers(cs->rf_data_ssbos, ARRAY_COUNT(cs->rf_data_ssbos),
		                                    cs->raw_data_ssbo);
		ctx->cuda_lib.init_cuda_configuration(bp->rf_raw_dim.E, bp->dec_data_dim.E,
		                                      bp->channel_mapping, bp->channel_offset > 0);
		break;
	}

	/* NOTE: store hadamard in GPU once; it won't change for a particular imaging session */
	cs->hadamard_dim = (uv2){.x = dec_data_dim.z, .y = dec_data_dim.z};
	size hadamard_elements = dec_data_dim.z * dec_data_dim.z;
	i32 *hadamard = alloc(&a, i32, hadamard_elements);
	fill_hadamard(hadamard, dec_data_dim.z);
	glDeleteBuffers(1, &cs->hadamard_ssbo);
	glCreateBuffers(1, &cs->hadamard_ssbo);
	glNamedBufferStorage(cs->hadamard_ssbo, hadamard_elements * sizeof(i32), hadamard, 0);
}

/* Dispatches one Z-slab of the volume-export compute pass. The work is tiled
 * across frames (one slab per call) to stay under the GPU watchdog limit.
 * Returns 1 when the final slab has been dispatched and clears ES_COMPUTING;
 * returns 0 while more slabs remain. */
static b32
do_volume_computation_step(BeamformerCtx *ctx, enum compute_shaders shader)
{
	ComputeShaderCtx *cs = &ctx->csctx;
	ExportCtx *e = &ctx->export_ctx;

	b32 done = 0;

	/* NOTE: we start this elsewhere on the first dispatch so that we can include
	 * times such as decoding/demodulation/etc. */
	if (!(e->state & ES_TIMER_ACTIVE)) {
		glQueryCounter(e->timer_ids[0], GL_TIMESTAMP);
		e->state |= ES_TIMER_ACTIVE;
	}

	glUseProgram(cs->programs[shader]);
	glBindBufferBase(GL_UNIFORM_BUFFER, 0, cs->shared_ubo);
	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, e->rf_data_ssbo);

	glActiveTexture(GL_TEXTURE0);
	glBindTexture(GL_TEXTURE_3D, e->volume_texture);
	/* NOTE: layered = GL_TRUE so the shader can write any Z slice of the 3D image */
	glBindImageTexture(0, e->volume_texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_R32F);
	glUniform1i(e->volume_texture_id, 0);
	glUniform1i(cs->volume_export_pass_id, 1);

	/* NOTE: We must tile this otherwise GL will kill us for taking too long */
	/* TODO: this could be based on multiple dimensions */
	u32 dispatch_count = e->volume_dim.z / 32;
	/* NOTE: !!dispatch_count zeroes the offset when the volume is < 32 deep
	 * (single-dispatch case) */
	uv4 dim_offset = {.z = !!dispatch_count * 32 * e->dispatch_index++};
	/* NOTE(review): uv4 holds u32s but is uploaded through glUniform3iv as
	 * i32; only the first 3 lanes are read. Fine for values < 2^31 — confirm */
	glUniform3iv(cs->volume_export_dim_offset_id, 1, (i32 *)dim_offset.E);
	glDispatchCompute(ORONE(e->volume_dim.x / 32), e->volume_dim.y, 1);
	if (e->dispatch_index >= dispatch_count) {
		e->dispatch_index = 0;
		e->state &= ~ES_COMPUTING;
		done = 1;
	}

	glQueryCounter(e->timer_ids[1], GL_TIMESTAMP);

	return done;
}

/* Runs a single compute stage. Decode stages (Hadamard/CUDA) read from the
 * raw-RF ring slot and write to one of the two rf_data_ssbos; subsequent
 * stages ping-pong between those two SSBOs via last_output_ssbo_index.
 * Each stage is wrapped in a GL_TIME_ELAPSED query for the per-stage timing
 * read back in check_compute_timers(). */
static void
do_compute_shader(BeamformerCtx *ctx, enum compute_shaders shader)
{
	ComputeShaderCtx *csctx = &ctx->csctx;
	uv2 rf_raw_dim = ctx->params->raw.rf_raw_dim;
	size rf_raw_size = rf_raw_dim.x * rf_raw_dim.y * sizeof(i16);

	glBeginQuery(GL_TIME_ELAPSED, csctx->timer_ids[csctx->timer_index][shader]);

	glUseProgram(csctx->programs[shader]);
	glBindBufferBase(GL_UNIFORM_BUFFER, 0, csctx->shared_ubo);

	u32 output_ssbo_idx = !csctx->last_output_ssbo_index;
	u32 input_ssbo_idx = csctx->last_output_ssbo_index;
	switch (shader) {
	case CS_HADAMARD:
		/* NOTE: bind only the current frame's slot of the raw-RF ring */
		glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo,
		                  csctx->raw_data_index * rf_raw_size, rf_raw_size);

		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, csctx->hadamard_ssbo);
		glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32),
		                  ORONE(csctx->dec_data_dim.y / 32),
		                  ORONE(csctx->dec_data_dim.z));
		/* NOTE: fence so the CPU knows when this ring slot may be reused */
		csctx->raw_data_fences[csctx->raw_data_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_CUDA_DECODE:
		ctx->cuda_lib.cuda_decode(csctx->raw_data_index * rf_raw_size, output_ssbo_idx);
		csctx->raw_data_fences[csctx->raw_data_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_CUDA_HILBERT:
		ctx->cuda_lib.cuda_hilbert(input_ssbo_idx, output_ssbo_idx);
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_DEMOD:
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
		glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32),
		                  ORONE(csctx->dec_data_dim.y / 32),
		                  ORONE(csctx->dec_data_dim.z));
		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
		break;
	case CS_MIN_MAX:
		/* NOTE: walks the mip chain; each iteration reads mip i-1 and
		 * writes mip i, with a barrier so the next level sees the result */
		glBindImageTexture(ctx->out_texture_unit, ctx->out_texture, 0, GL_FALSE, 0,
		                   GL_WRITE_ONLY, GL_RG32F);
		glUniform1i(csctx->out_data_tex_id, ctx->out_texture_unit);
		for (u32 i = 1; i < ctx->out_texture_mips; i++) {
			u32 otu = ctx->out_texture_unit;
			glBindImageTexture(otu + 1, ctx->out_texture, i - 1,
			                   GL_FALSE, 0, GL_READ_ONLY, GL_RG32F);
			glBindImageTexture(otu + 2, ctx->out_texture, i,
			                   GL_FALSE, 0, GL_WRITE_ONLY, GL_RG32F);
			glUniform1i(csctx->out_data_tex_id, otu + 1);
			glUniform1i(csctx->mip_view_tex_id, otu + 2);
			glUniform1i(csctx->mips_level_id, i);

			u32 width = ctx->out_data_dim.x >> i;
			u32 height = ctx->out_data_dim.y >> i;
			u32 depth = ctx->out_data_dim.z >> i;
			glDispatchCompute(ORONE(width / 32), ORONE(height), ORONE(depth / 32));
			glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
		}
		break;
	case CS_HERCULES:
	case CS_UFORCES:
		if (ctx->export_ctx.state & ES_START) {
			/* NOTE: on the first frame of compute make a copy of the rf data */
			size rf_size = decoded_data_size(csctx);
			ctx->export_ctx.state &= ~ES_START;
			ctx->export_ctx.state |= ES_COMPUTING;
			glCopyNamedBufferSubData(csctx->rf_data_ssbos[input_ssbo_idx],
			                         ctx->export_ctx.rf_data_ssbo, 0, 0, rf_size);
		}

		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
		/* NOTE: zero offset / pass 0 distinguishes this live-display pass
		 * from the tiled export pass in do_volume_computation_step() */
		glUniform3iv(csctx->volume_export_dim_offset_id, 1, (i32 []){0, 0, 0});
		glUniform1i(csctx->volume_export_pass_id, 0);
		glActiveTexture(GL_TEXTURE0 + ctx->out_texture_unit);
		glBindTexture(GL_TEXTURE_3D, ctx->out_texture);
		glBindImageTexture(ctx->out_texture_unit, ctx->out_texture, 0, GL_TRUE, 0,
		                   GL_WRITE_ONLY, GL_RG32F);
		glUniform1i(csctx->out_data_tex_id, ctx->out_texture_unit);
		glDispatchCompute(ORONE(ctx->out_data_dim.x / 32),
		                  ctx->out_data_dim.y,
		                  ORONE(ctx->out_data_dim.z / 32));
		break;
	default: ASSERT(0);
	}

	glEndQuery(GL_TIME_ELAPSED);
}

/* Reads back GPU timer results from the previous frame(s): accumulates the
 * volume-export runtime from the two timestamp queries, then, once the fence
 * for the oldest timer slot has signalled, collects per-stage elapsed times
 * into last_frame_time[]. Non-blocking: returns early if results are not
 * ready yet. */
static void
check_compute_timers(ComputeShaderCtx *cs, ExportCtx *e, BeamformerParametersFull *bp)
{
	/* NOTE: volume generation running timer */
	if (e->state & ES_TIMER_ACTIVE) {
		u64 start_ns = 0, end_ns = 0;
		glGetQueryObjectui64v(e->timer_ids[0], GL_QUERY_RESULT, &start_ns);
		glGetQueryObjectui64v(e->timer_ids[1], GL_QUERY_RESULT, &end_ns);
		u64 elapsed_ns = end_ns - start_ns;
		e->runtime += (f32)elapsed_ns * 1e-9;
		e->state &= ~ES_TIMER_ACTIVE;
	}

	/* NOTE: main timers for display portion of the program */
	/* NOTE(review): when timer_index == 0 this relies on unsigned wraparound;
	 * (0u - 1) % N only equals N - 1 if N is a power of two or divides 2^32 —
	 * verify ARRAY_COUNT(timer_fences) satisfies this */
	u32 last_idx = (cs->timer_index - 1) % ARRAY_COUNT(cs->timer_fences);
	if (!cs->timer_fences[last_idx])
		return;

	/* NOTE: timeout of 0 == poll; don't stall the frame waiting on the GPU */
	i32 status = glClientWaitSync(cs->timer_fences[last_idx], 0, 0);
	if (status == GL_TIMEOUT_EXPIRED || status == GL_WAIT_FAILED)
		return;
	glDeleteSync(cs->timer_fences[last_idx]);
	cs->timer_fences[last_idx] = NULL;

	for (u32 i = 0; i < bp->compute_stages_count; i++) {
		u64 ns = 0;
		i32 idx = bp->compute_stages[i];
		glGetQueryObjectui64v(cs->timer_ids[last_idx][idx], GL_QUERY_RESULT, &ns);
		cs->last_frame_time[idx] = (f32)ns / 1e9;
	}
}

/* Per-frame entry point: polls the data pipe for new raw RF data (reallocating
 * GPU storage if the parameter dimensions changed), runs the configured
 * compute-shader pipeline, advances any in-progress volume export (writing
 * "raw_volume.bin" when complete), renders the output texture through the
 * display fragment shader, and draws the UI. */
DEBUG_EXPORT void
do_beamformer(BeamformerCtx *ctx, Arena arena)
{
	ctx->dt = GetFrameTime();

	if (IsWindowResized()) {
		ctx->window_size.h = GetScreenHeight();
		ctx->window_size.w = GetScreenWidth();
	}

	/* NOTE: Store the compute time for the last frame. */
	check_compute_timers(&ctx->csctx, &ctx->export_ctx, ctx->params);

	BeamformerParameters *bp = &ctx->params->raw;
	/* NOTE: Check for and Load RF Data into GPU */
	if (os_poll_pipe(ctx->data_pipe)) {
		ComputeShaderCtx *cs = &ctx->csctx;
		if (!uv4_equal(cs->dec_data_dim, bp->dec_data_dim))
			alloc_shader_storage(ctx, arena);

		if (!uv4_equal(ctx->out_data_dim, bp->output_points))
			alloc_output_image(ctx);

		cs->raw_data_index = (cs->raw_data_index + 1) % ARRAY_COUNT(cs->raw_data_fences);
		i32 raw_index = ctx->csctx.raw_data_index;
		/* NOTE: if this times out it means the command queue is more than 3 frames behind.
		 * In that case we need to re-evaluate the buffer size */
		if (ctx->csctx.raw_data_fences[raw_index]) {
			i32 result = glClientWaitSync(cs->raw_data_fences[raw_index], 0, 10000);
			if (result == GL_TIMEOUT_EXPIRED) {
				//ASSERT(0);
			}
			glDeleteSync(cs->raw_data_fences[raw_index]);
			cs->raw_data_fences[raw_index] = NULL;
		}

		uv2 rf_raw_dim = cs->rf_raw_dim;
		size rf_raw_size = rf_raw_dim.x * rf_raw_dim.y * sizeof(i16);

		/* NOTE: on Intel/AMD this arena is the persistently mapped GL
		 * buffer, so the pipe read writes straight into GPU-visible memory */
		void *rf_data_buf = cs->raw_data_arena.beg + raw_index * rf_raw_size;
		size rlen = os_read_pipe_data(ctx->data_pipe, rf_data_buf, rf_raw_size);
		switch (ctx->gl_vendor_id) {
		case GL_VENDOR_INTEL:
			/* TODO: intel complains about this buffer being busy even with
			 * MAP_UNSYNCHRONIZED_BIT */
			/* fallthrough */
		case GL_VENDOR_AMD:
			break;
		case GL_VENDOR_NVIDIA:
			/* NOTE: NVIDIA path stages in system memory; upload explicitly */
			glNamedBufferSubData(cs->raw_data_ssbo, raw_index * rf_raw_size,
			                     rf_raw_size, rf_data_buf);
		}
		/* NOTE: only kick off compute on a complete transfer */
		if (rlen == rf_raw_size) ctx->flags |= DO_COMPUTE;
		else                     ctx->partial_transfer_count++;
	}

	/* NOTE: we are starting a volume computation on this frame so make some space */
	if (ctx->export_ctx.state & ES_START) {
		ExportCtx *e = &ctx->export_ctx;
		e->runtime = 0;
		uv4 edim = e->volume_dim;

		/* NOTE: get a timestamp here which will include decoding/demodulating/etc. */
		glQueryCounter(e->timer_ids[0], GL_TIMESTAMP);
		e->state |= ES_TIMER_ACTIVE;

		glDeleteTextures(1, &e->volume_texture);
		glCreateTextures(GL_TEXTURE_3D, 1, &e->volume_texture);
		glTextureStorage3D(e->volume_texture, 1, GL_R32F, edim.x, edim.y, edim.z);

		glDeleteBuffers(1, &e->rf_data_ssbo);
		glCreateBuffers(1, &e->rf_data_ssbo);
		glNamedBufferStorage(e->rf_data_ssbo, decoded_data_size(&ctx->csctx), 0, 0);
	}

	if (ctx->flags & DO_COMPUTE || ctx->export_ctx.state & ES_START) {
		/* NOTE: don't swap parameters mid-export; the volume pass must see
		 * the parameters it started with */
		if (ctx->params->upload && !(ctx->export_ctx.state & ES_COMPUTING)) {
			glNamedBufferSubData(ctx->csctx.shared_ubo, 0, sizeof(*bp), bp);
			ctx->params->upload = 0;
		}

		u32 stages = ctx->params->compute_stages_count;
		for (u32 i = 0; i < stages; i++) {
			do_compute_shader(ctx, ctx->params->compute_stages[i]);
		}
		ctx->flags &= ~DO_COMPUTE;
		ctx->flags |= GEN_MIPMAPS;

		/* NOTE: fence this frame's command stream so the timer results can
		 * be collected without stalling (see check_compute_timers) */
		u32 tidx = ctx->csctx.timer_index;
		glDeleteSync(ctx->csctx.timer_fences[tidx]);
		ctx->csctx.timer_fences[tidx] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
		ctx->csctx.timer_index = (tidx + 1) % ARRAY_COUNT(ctx->csctx.timer_fences);
	}

	if (ctx->export_ctx.state & ES_COMPUTING) {
		/* TODO: this could probably be adapted to do FORCES as well */
		b32 done = do_volume_computation_step(ctx, CS_HERCULES);
		if (done) {
			ExportCtx *e = &ctx->export_ctx;
			uv4 dim = e->volume_dim;
			size volume_out_size = dim.x * dim.y * dim.z * sizeof(f32);
			e->volume_buf = os_alloc_arena(e->volume_buf, volume_out_size);
			glGetTextureImage(e->volume_texture, 0, GL_RED, GL_FLOAT, volume_out_size,
			                  e->volume_buf.beg);
			s8 raw = {.len = volume_out_size, .data = e->volume_buf.beg};
			if (!os_write_file("raw_volume.bin", raw))
				TraceLog(LOG_WARNING, "failed to write output volume\n");
		}
	}

	/* NOTE: draw output image texture using render fragment shader */
	BeginTextureMode(ctx->fsctx.output);
		ClearBackground(PINK);
		BeginShaderMode(ctx->fsctx.shader);
			FragmentShaderCtx *fs = &ctx->fsctx;
			glUseProgram(fs->shader.id);
			glActiveTexture(GL_TEXTURE0 + ctx->out_texture_unit);
			glBindTexture(GL_TEXTURE_3D, ctx->out_texture);
			glUniform1i(fs->out_data_tex_id, ctx->out_texture_unit);
			glUniform1f(fs->db_cutoff_id, fs->db);
			DrawTexture(fs->output.texture, 0, 0, WHITE);
		EndShaderMode();
	EndTextureMode();

	/* NOTE: regenerate mipmaps only when the output has actually changed */
	if (ctx->flags & GEN_MIPMAPS) {
		/* NOTE: shut up raylib's reporting on mipmap gen */
		SetTraceLogLevel(LOG_NONE);
		GenTextureMipmaps(&ctx->fsctx.output.texture);
		SetTraceLogLevel(LOG_INFO);
		ctx->flags &= ~GEN_MIPMAPS;
	}

	draw_ui(ctx, arena);

	if (IsKeyPressed(KEY_R))
		ctx->flags |= RELOAD_SHADERS;
}