Commit: f15b9fab81866a6f6b22c95f30d4ad5aceaed1c7
Parent: 65f7c333f3ae2fff6db4f7870403206d368f87af
Author: Randy Palamar
Date: Thu, 8 Aug 2024 12:51:37 -0600
switch to using unsynchronized buffer access for raw data
This uses a big ssbo that is 3x the size of the raw data allowing
for up to 3 frames in flight at a time. On my desktop this greatly
increases the performance. For FORCES this increases the framerate
to ~9FPS.
Diffstat:
2 files changed, 52 insertions(+), 23 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -53,6 +53,7 @@ upload_filter_coefficients(BeamformerCtx *ctx, Arena a)
static void
alloc_shader_storage(BeamformerCtx *ctx, Arena a)
{
+ ComputeShaderCtx *cs = &ctx->csctx;
BeamformerParameters *bp = &ctx->params->raw;
uv4 dec_data_dim = bp->dec_data_dim;
uv2 rf_raw_dim = bp->rf_raw_dim;
@@ -61,35 +62,38 @@ alloc_shader_storage(BeamformerCtx *ctx, Arena a)
ctx->csctx.rf_raw_dim = rf_raw_dim;
ctx->csctx.dec_data_dim = dec_data_dim;
- glDeleteBuffers(ARRAY_COUNT(ctx->csctx.rf_data_ssbos), ctx->csctx.rf_data_ssbos);
- glDeleteBuffers(1, &ctx->csctx.raw_data_ssbo);
- glGenBuffers(1, &ctx->csctx.raw_data_ssbo);
- glGenBuffers(ARRAY_COUNT(ctx->csctx.rf_data_ssbos), ctx->csctx.rf_data_ssbos);
- glBindBuffer(GL_SHADER_STORAGE_BUFFER, ctx->csctx.raw_data_ssbo);
- glBufferStorage(GL_SHADER_STORAGE_BUFFER, rf_raw_size, 0,
- GL_DYNAMIC_STORAGE_BIT|GL_MAP_WRITE_BIT);
+ glDeleteBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
+ glGenBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
- for (u32 i = 0; i < ARRAY_COUNT(ctx->csctx.rf_data_ssbos); i++) {
- glBindBuffer(GL_SHADER_STORAGE_BUFFER, ctx->csctx.rf_data_ssbos[i]);
+ glDeleteBuffers(1, &cs->raw_data_ssbo);
+ glGenBuffers(1, &cs->raw_data_ssbo);
+
+ glBindBuffer(GL_SHADER_STORAGE_BUFFER, cs->raw_data_ssbo);
+ glBufferStorage(GL_SHADER_STORAGE_BUFFER, ARRAY_COUNT(cs->raw_data_fences) * rf_raw_size,
+ 0, GL_DYNAMIC_STORAGE_BIT|GL_MAP_WRITE_BIT);
+
+ for (u32 i = 0; i < ARRAY_COUNT(cs->rf_data_ssbos); i++) {
+ glBindBuffer(GL_SHADER_STORAGE_BUFFER, cs->rf_data_ssbos[i]);
glBufferStorage(GL_SHADER_STORAGE_BUFFER, rf_decoded_size, 0, 0);
}
/* NOTE: store hadamard in GPU once; it won't change for a particular imaging session */
- ctx->csctx.hadamard_dim = (uv2){.x = dec_data_dim.z, .y = dec_data_dim.z};
- size hadamard_elements = dec_data_dim.z * dec_data_dim.z;
- i32 *hadamard = alloc(&a, i32, hadamard_elements);
+ cs->hadamard_dim = (uv2){.x = dec_data_dim.z, .y = dec_data_dim.z};
+ size hadamard_elements = dec_data_dim.z * dec_data_dim.z;
+ i32 *hadamard = alloc(&a, i32, hadamard_elements);
fill_hadamard(hadamard, dec_data_dim.z);
- rlUnloadShaderBuffer(ctx->csctx.hadamard_ssbo);
- ctx->csctx.hadamard_ssbo = rlLoadShaderBuffer(hadamard_elements * sizeof(i32), hadamard,
- GL_STATIC_DRAW);
- ctx->flags &= ~ALLOC_SSBOS;
+ rlUnloadShaderBuffer(cs->hadamard_ssbo);
+ cs->hadamard_ssbo = rlLoadShaderBuffer(hadamard_elements * sizeof(i32), hadamard, GL_STATIC_DRAW);
+ ctx->flags &= ~ALLOC_SSBOS;
}
static void
do_compute_shader(BeamformerCtx *ctx, enum compute_shaders shader)
{
ComputeShaderCtx *csctx = &ctx->csctx;
+ uv2 rf_raw_dim = ctx->params->raw.rf_raw_dim;
+ size rf_raw_size = rf_raw_dim.x * rf_raw_dim.y * sizeof(i16);
glBeginQuery(GL_TIME_ELAPSED, csctx->timer_ids[shader]);
@@ -101,13 +105,17 @@ do_compute_shader(BeamformerCtx *ctx, enum compute_shaders shader)
u32 input_ssbo_idx = csctx->last_output_ssbo_index;
switch (shader) {
case CS_HADAMARD:
- glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo);
+ glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo,
+ csctx->raw_data_index * rf_raw_size, rf_raw_size);
+
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, csctx->hadamard_ssbo);
glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32),
ORONE(csctx->dec_data_dim.y / 32),
ORONE(csctx->dec_data_dim.z));
+ csctx->raw_data_fences[csctx->raw_data_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
+ csctx->raw_data_index = (csctx->raw_data_index + 1) % ARRAY_COUNT(csctx->raw_data_fences);
break;
case CS_LPF:
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
@@ -531,12 +539,28 @@ do_beamformer(BeamformerCtx *ctx, Arena arena)
if (!uv4_equal(ctx->out_data_dim, bp->output_points) || ctx->flags & ALLOC_OUT_TEX)
alloc_output_image(ctx);
- glBindBuffer(GL_SHADER_STORAGE_BUFFER, ctx->csctx.raw_data_ssbo);
- void *rf_data_buf = glMapBuffer(GL_SHADER_STORAGE_BUFFER, GL_WRITE_ONLY);
- ASSERT(rf_data_buf);
+ i32 raw_index = ctx->csctx.raw_data_index;
+ /* NOTE: if this times out it means the command queue is more than 3 frames behind.
+ * In that case we need to re-evaluate the buffer size */
+ i32 result = glClientWaitSync(ctx->csctx.raw_data_fences[raw_index], 0, 10000);
+ if (result == GL_TIMEOUT_EXPIRED) {
+ //ASSERT(0);
+ }
+ glDeleteSync(ctx->csctx.raw_data_fences[raw_index]);
+
uv2 rf_raw_dim = ctx->csctx.rf_raw_dim;
size rf_raw_size = rf_raw_dim.x * rf_raw_dim.y * sizeof(i16);
- size rlen = os_read_pipe_data(ctx->data_pipe, rf_data_buf, rf_raw_size);
+
+ glBindBuffer(GL_SHADER_STORAGE_BUFFER, ctx->csctx.raw_data_ssbo);
+ void *rf_data_buf = glMapBufferRange(GL_SHADER_STORAGE_BUFFER,
+ raw_index * rf_raw_size, rf_raw_size,
+ GL_MAP_UNSYNCHRONIZED_BIT | GL_MAP_WRITE_BIT);
+ if (!rf_data_buf) {
+ rlCheckErrors();
+ ASSERT(0);
+ }
+ size rlen = os_read_pipe_data(ctx->data_pipe, rf_data_buf, rf_raw_size);
+
glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
if (rlen == rf_raw_size) ctx->flags |= DO_COMPUTE;
diff --git a/beamformer.h b/beamformer.h
@@ -97,8 +97,13 @@ typedef struct {
GLsync timer_fence;
f32 last_frame_time[CS_LAST];
- /* NOTE: One SSBO for raw data and two for decoded data (swapped for chained stages)*/
- u32 raw_data_ssbo;
+ /* NOTE: multiple raw data SSBOs for unsynchronized mapping.
+ * Decoded data is only relavent in the context of a single frame, two are
+ * used so that they can be swapped when chaining multiple compute stages */
+ GLsync raw_data_fences[3];
+ u32 raw_data_ssbo;
+ u32 raw_data_index;
+
u32 rf_data_ssbos[2];
u32 last_output_ssbo_index;
u32 hadamard_ssbo;