ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | LICENSE

Commit: 8741f0be4c6ceab20d7fa32cb71da28a825b9f91
Parent: ea9d5a0270e873fd7ad2e8e06f154671a291ba20
Author: Randy Palamar
Date:   Tue, 13 Aug 2024 10:54:45 -0600

use BufferSubData for uploading rf data on NVIDIA/AMD

This may seem counter-intuitive, and it goes against all of NVIDIA's
presentations and other documentation, but it is the fastest way
to upload the data.

Unfortunately this doesn't work on Intel for some reason, so it
gets a separate path. The separate path code is necessary anyway
because we want to add CUDA support for NVIDIA.

Diffstat:
Mbeamformer.c | 58++++++++++++++++++++++++++++++++++++----------------------
Mbeamformer.h | 17++++++++++++++---
Mmain.c | 12++++++++++++
3 files changed, 62 insertions(+), 25 deletions(-)

diff --git a/beamformer.c b/beamformer.c @@ -69,12 +69,17 @@ alloc_shader_storage(BeamformerCtx *ctx, Arena a) glDeleteBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos); glGenBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos); + i32 storage_flags = GL_DYNAMIC_STORAGE_BIT; + if (ctx->gl_vendor_id == GL_VENDOR_INTEL) + storage_flags |= GL_MAP_WRITE_BIT; glDeleteBuffers(1, &cs->raw_data_ssbo); - glGenBuffers(1, &cs->raw_data_ssbo); + glCreateBuffers(1, &cs->raw_data_ssbo); + glNamedBufferStorage(cs->raw_data_ssbo, ARRAY_COUNT(cs->raw_data_fences) * rf_raw_size, 0, + storage_flags); - glBindBuffer(GL_SHADER_STORAGE_BUFFER, cs->raw_data_ssbo); - glBufferStorage(GL_SHADER_STORAGE_BUFFER, ARRAY_COUNT(cs->raw_data_fences) * rf_raw_size, - 0, GL_MAP_WRITE_BIT); + /* TODO: allow this to grow if the raw data has been resized */ + if (cs->raw_data_arena.beg == 0) + cs->raw_data_arena = os_new_arena(rf_raw_size); for (u32 i = 0; i < ARRAY_COUNT(cs->rf_data_ssbos); i++) { glBindBuffer(GL_SHADER_STORAGE_BUFFER, cs->rf_data_ssbos[i]); @@ -538,7 +543,8 @@ do_beamformer(BeamformerCtx *ctx, Arena arena) BeamformerParameters *bp = &ctx->params->raw; /* NOTE: Check for and Load RF Data into GPU */ if (os_poll_pipe(ctx->data_pipe)) { - if (!uv4_equal(ctx->csctx.dec_data_dim, bp->dec_data_dim) || ctx->flags & ALLOC_SSBOS) + ComputeShaderCtx *cs = &ctx->csctx; + if (!uv4_equal(cs->dec_data_dim, bp->dec_data_dim) || ctx->flags & ALLOC_SSBOS) alloc_shader_storage(ctx, arena); if (!uv4_equal(ctx->out_data_dim, bp->output_points) || ctx->flags & ALLOC_OUT_TEX) alloc_output_image(ctx); @@ -547,31 +553,39 @@ do_beamformer(BeamformerCtx *ctx, Arena arena) /* NOTE: if this times out it means the command queue is more than 3 frames behind. 
* In that case we need to re-evaluate the buffer size */ if (ctx->csctx.raw_data_fences[raw_index]) { - i32 result = glClientWaitSync(ctx->csctx.raw_data_fences[raw_index], 0, 10000); + i32 result = glClientWaitSync(cs->raw_data_fences[raw_index], 0, 10000); if (result == GL_TIMEOUT_EXPIRED) { //ASSERT(0); } - glDeleteSync(ctx->csctx.raw_data_fences[raw_index]); - ctx->csctx.raw_data_fences[raw_index] = NULL; + glDeleteSync(cs->raw_data_fences[raw_index]); + cs->raw_data_fences[raw_index] = NULL; } - uv2 rf_raw_dim = ctx->csctx.rf_raw_dim; + uv2 rf_raw_dim = cs->rf_raw_dim; size rf_raw_size = rf_raw_dim.x * rf_raw_dim.y * sizeof(i16); - glBindBuffer(GL_SHADER_STORAGE_BUFFER, ctx->csctx.raw_data_ssbo); - void *rf_data_buf = glMapBufferRange(GL_SHADER_STORAGE_BUFFER, - raw_index * rf_raw_size, rf_raw_size, - GL_MAP_UNSYNCHRONIZED_BIT | GL_MAP_WRITE_BIT); - if (!rf_data_buf) { - rlCheckErrors(); - ASSERT(0); + if (ctx->gl_vendor_id == GL_VENDOR_INTEL) { + /* TODO: intel complains about this buffer being busy even with + * MAP_UNSYNCHRONIZED_BIT */ + void *rf_data_buf = glMapNamedBufferRange(cs->raw_data_ssbo, + raw_index * rf_raw_size, + rf_raw_size, + GL_MAP_WRITE_BIT); + size rlen = os_read_pipe_data(ctx->data_pipe, rf_data_buf, rf_raw_size); + glUnmapNamedBuffer(cs->raw_data_ssbo); + if (rlen == rf_raw_size) ctx->flags |= DO_COMPUTE; + else ctx->partial_transfer_count++; + } else { + void *rf_data_buf = cs->raw_data_arena.beg + raw_index * rf_raw_size; + size rlen = os_read_pipe_data(ctx->data_pipe, rf_data_buf, rf_raw_size); + if (rlen == rf_raw_size) { + ctx->flags |= DO_COMPUTE; + glNamedBufferSubData(cs->raw_data_ssbo, raw_index * rf_raw_size, + rf_raw_size, rf_data_buf); + } else { + ctx->partial_transfer_count++; + } } - size rlen = os_read_pipe_data(ctx->data_pipe, rf_data_buf, rf_raw_size); - - glUnmapBuffer(GL_SHADER_STORAGE_BUFFER); - - if (rlen == rf_raw_size) ctx->flags |= DO_COMPUTE; - else ctx->partial_transfer_count++; } if (ctx->flags & 
UPLOAD_FILTER) diff --git a/beamformer.h b/beamformer.h @@ -60,6 +60,12 @@ enum program_flags { DO_COMPUTE = 1 << 30, }; +enum gl_vendor_ids { + GL_VENDOR_AMD, + GL_VENDOR_INTEL, + GL_VENDOR_NVIDIA, +}; + typedef struct { char buf[64]; i32 buf_len; @@ -98,13 +104,17 @@ typedef struct { GLsync timer_fence; f32 last_frame_time[CS_LAST]; - /* NOTE: multiple raw data SSBOs for unsynchronized mapping. - * Decoded data is only relavent in the context of a single frame, two are - * used so that they can be swapped when chaining multiple compute stages */ + /* NOTE: the raw_data_ssbo is allocated at 3x the required size to allow for tiled + * transfers when the GPU is running behind the CPU. It is not mapped because NVIDIA's + * drivers _will_ store the buffer in the sytem memory in that case (this doesn't happen + * for Intel or AMD). Instead BufferSubData is used to update the correct subrange */ GLsync raw_data_fences[3]; + Arena raw_data_arena; u32 raw_data_ssbo; u32 raw_data_index; + /* NOTE: Decoded data is only relevant in the context of a single frame. We use two + * buffers so that they can be swapped when chaining multiple compute stages */ u32 rf_data_ssbos[2]; u32 last_output_ssbo_index; u32 hadamard_ssbo; @@ -134,6 +144,7 @@ typedef struct { typedef struct { uv2 window_size; u32 flags; + enum gl_vendor_ids gl_vendor_id; f32 dt; diff --git a/main.c b/main.c @@ -185,6 +185,18 @@ main(void) ctx.params->raw.output_points = ctx.out_data_dim; + /* NOTE: Determine which graphics vendor we are running on */ + { + const u8 *vendor = glGetString(GL_VENDOR); + if (!vendor) + die("Failed to determine GL Vendor\n"); + switch (vendor[0]) { + case 'A': ctx.gl_vendor_id = GL_VENDOR_AMD; break; + case 'I': ctx.gl_vendor_id = GL_VENDOR_INTEL; break; + case 'N': ctx.gl_vendor_id = GL_VENDOR_NVIDIA; break; + default: die("Unknown GL Vendor: %s\n", vendor); break; + } + } /* NOTE: set up OpenGL debug logging */ glDebugMessageCallback(gl_debug_logger, NULL);