ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | LICENSE

Commit: f15b9fab81866a6f6b22c95f30d4ad5aceaed1c7
Parent: 65f7c333f3ae2fff6db4f7870403206d368f87af
Author: Randy Palamar
Date:   Thu,  8 Aug 2024 12:51:37 -0600

switch to using unsynchronized buffer access for raw data

This uses one large SSBO that is 3x the size of the raw data, allowing
for up to 3 frames in flight at a time. On my desktop this greatly
improves performance. For FORCES this increases the framerate
to ~9FPS.

Diffstat:
Mbeamformer.c | 66+++++++++++++++++++++++++++++++++++++++++++++---------------------
Mbeamformer.h | 9+++++++--
2 files changed, 52 insertions(+), 23 deletions(-)

diff --git a/beamformer.c b/beamformer.c @@ -53,6 +53,7 @@ upload_filter_coefficients(BeamformerCtx *ctx, Arena a) static void alloc_shader_storage(BeamformerCtx *ctx, Arena a) { + ComputeShaderCtx *cs = &ctx->csctx; BeamformerParameters *bp = &ctx->params->raw; uv4 dec_data_dim = bp->dec_data_dim; uv2 rf_raw_dim = bp->rf_raw_dim; @@ -61,35 +62,38 @@ alloc_shader_storage(BeamformerCtx *ctx, Arena a) ctx->csctx.rf_raw_dim = rf_raw_dim; ctx->csctx.dec_data_dim = dec_data_dim; - glDeleteBuffers(ARRAY_COUNT(ctx->csctx.rf_data_ssbos), ctx->csctx.rf_data_ssbos); - glDeleteBuffers(1, &ctx->csctx.raw_data_ssbo); - glGenBuffers(1, &ctx->csctx.raw_data_ssbo); - glGenBuffers(ARRAY_COUNT(ctx->csctx.rf_data_ssbos), ctx->csctx.rf_data_ssbos); - glBindBuffer(GL_SHADER_STORAGE_BUFFER, ctx->csctx.raw_data_ssbo); - glBufferStorage(GL_SHADER_STORAGE_BUFFER, rf_raw_size, 0, - GL_DYNAMIC_STORAGE_BIT|GL_MAP_WRITE_BIT); + glDeleteBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos); + glGenBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos); - for (u32 i = 0; i < ARRAY_COUNT(ctx->csctx.rf_data_ssbos); i++) { - glBindBuffer(GL_SHADER_STORAGE_BUFFER, ctx->csctx.rf_data_ssbos[i]); + glDeleteBuffers(1, &cs->raw_data_ssbo); + glGenBuffers(1, &cs->raw_data_ssbo); + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, cs->raw_data_ssbo); + glBufferStorage(GL_SHADER_STORAGE_BUFFER, ARRAY_COUNT(cs->raw_data_fences) * rf_raw_size, + 0, GL_DYNAMIC_STORAGE_BIT|GL_MAP_WRITE_BIT); + + for (u32 i = 0; i < ARRAY_COUNT(cs->rf_data_ssbos); i++) { + glBindBuffer(GL_SHADER_STORAGE_BUFFER, cs->rf_data_ssbos[i]); glBufferStorage(GL_SHADER_STORAGE_BUFFER, rf_decoded_size, 0, 0); } /* NOTE: store hadamard in GPU once; it won't change for a particular imaging session */ - ctx->csctx.hadamard_dim = (uv2){.x = dec_data_dim.z, .y = dec_data_dim.z}; - size hadamard_elements = dec_data_dim.z * dec_data_dim.z; - i32 *hadamard = alloc(&a, i32, hadamard_elements); + cs->hadamard_dim = (uv2){.x = dec_data_dim.z, .y = 
dec_data_dim.z}; + size hadamard_elements = dec_data_dim.z * dec_data_dim.z; + i32 *hadamard = alloc(&a, i32, hadamard_elements); fill_hadamard(hadamard, dec_data_dim.z); - rlUnloadShaderBuffer(ctx->csctx.hadamard_ssbo); - ctx->csctx.hadamard_ssbo = rlLoadShaderBuffer(hadamard_elements * sizeof(i32), hadamard, - GL_STATIC_DRAW); - ctx->flags &= ~ALLOC_SSBOS; + rlUnloadShaderBuffer(cs->hadamard_ssbo); + cs->hadamard_ssbo = rlLoadShaderBuffer(hadamard_elements * sizeof(i32), hadamard, GL_STATIC_DRAW); + ctx->flags &= ~ALLOC_SSBOS; } static void do_compute_shader(BeamformerCtx *ctx, enum compute_shaders shader) { ComputeShaderCtx *csctx = &ctx->csctx; + uv2 rf_raw_dim = ctx->params->raw.rf_raw_dim; + size rf_raw_size = rf_raw_dim.x * rf_raw_dim.y * sizeof(i16); glBeginQuery(GL_TIME_ELAPSED, csctx->timer_ids[shader]); @@ -101,13 +105,17 @@ do_compute_shader(BeamformerCtx *ctx, enum compute_shaders shader) u32 input_ssbo_idx = csctx->last_output_ssbo_index; switch (shader) { case CS_HADAMARD: - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo, + csctx->raw_data_index * rf_raw_size, rf_raw_size); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, csctx->hadamard_ssbo); glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32), ORONE(csctx->dec_data_dim.y / 32), ORONE(csctx->dec_data_dim.z)); + csctx->raw_data_fences[csctx->raw_data_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index; + csctx->raw_data_index = (csctx->raw_data_index + 1) % ARRAY_COUNT(csctx->raw_data_fences); break; case CS_LPF: glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]); @@ -531,12 +539,28 @@ do_beamformer(BeamformerCtx *ctx, Arena arena) if (!uv4_equal(ctx->out_data_dim, bp->output_points) || ctx->flags & ALLOC_OUT_TEX) 
alloc_output_image(ctx); - glBindBuffer(GL_SHADER_STORAGE_BUFFER, ctx->csctx.raw_data_ssbo); - void *rf_data_buf = glMapBuffer(GL_SHADER_STORAGE_BUFFER, GL_WRITE_ONLY); - ASSERT(rf_data_buf); + i32 raw_index = ctx->csctx.raw_data_index; + /* NOTE: if this times out it means the command queue is more than 3 frames behind. + * In that case we need to re-evaluate the buffer size */ + i32 result = glClientWaitSync(ctx->csctx.raw_data_fences[raw_index], 0, 10000); + if (result == GL_TIMEOUT_EXPIRED) { + //ASSERT(0); + } + glDeleteSync(ctx->csctx.raw_data_fences[raw_index]); + uv2 rf_raw_dim = ctx->csctx.rf_raw_dim; size rf_raw_size = rf_raw_dim.x * rf_raw_dim.y * sizeof(i16); - size rlen = os_read_pipe_data(ctx->data_pipe, rf_data_buf, rf_raw_size); + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ctx->csctx.raw_data_ssbo); + void *rf_data_buf = glMapBufferRange(GL_SHADER_STORAGE_BUFFER, + raw_index * rf_raw_size, rf_raw_size, + GL_MAP_UNSYNCHRONIZED_BIT | GL_MAP_WRITE_BIT); + if (!rf_data_buf) { + rlCheckErrors(); + ASSERT(0); + } + size rlen = os_read_pipe_data(ctx->data_pipe, rf_data_buf, rf_raw_size); + glUnmapBuffer(GL_SHADER_STORAGE_BUFFER); if (rlen == rf_raw_size) ctx->flags |= DO_COMPUTE; diff --git a/beamformer.h b/beamformer.h @@ -97,8 +97,13 @@ typedef struct { GLsync timer_fence; f32 last_frame_time[CS_LAST]; - /* NOTE: One SSBO for raw data and two for decoded data (swapped for chained stages)*/ - u32 raw_data_ssbo; + /* NOTE: multiple raw data SSBOs for unsynchronized mapping. + * Decoded data is only relavent in the context of a single frame, two are + * used so that they can be swapped when chaining multiple compute stages */ + GLsync raw_data_fences[3]; + u32 raw_data_ssbo; + u32 raw_data_index; + u32 rf_data_ssbos[2]; u32 last_output_ssbo_index; u32 hadamard_ssbo;