ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

Commit: 0269e2ebd19872b3d2507bb0849519ec019d6a2c
Parent: b3c9e90173f7ce8b1134d6449ca222a6d9c43c27
Author: Randy Palamar
Date:   Thu, 13 Nov 2025 09:19:02 -0700

core: use NamedBufferSubData for rf data on NVIDIA

this fixes one part of the weird performance regression

Diffstat:
M beamformer.c | 51 +++++++++++++++++++++++++++++++++------------------
M beamformer.h | 60 ++++++++++++++++++++++++++++++++----------------------------
M opengl.h     |  1 +
M static.c     | 31 ++++++++++++++++---------------
4 files changed, 82 insertions(+), 61 deletions(-)

diff --git a/beamformer.c b/beamformer.c @@ -209,7 +209,7 @@ make_valid_output_points(i32 points[3]) } function void -alloc_beamform_frame(GLParams *gp, BeamformerFrame *out, iv3 out_dim, GLenum gl_kind, s8 name, Arena arena) +alloc_beamform_frame(GLParameters *gp, BeamformerFrame *out, iv3 out_dim, GLenum gl_kind, s8 name, Arena arena) { out->dim = make_valid_output_points(out_dim.E); if (gp) { @@ -1391,14 +1391,23 @@ DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute) } function void -beamformer_rf_buffer_allocate(BeamformerRFBuffer *rf, u32 rf_size) +beamformer_rf_buffer_allocate(BeamformerRFBuffer *rf, u32 rf_size, b32 nvidia) { assert((rf_size % 64) == 0); + if (!nvidia) glUnmapNamedBuffer(rf->ssbo); glDeleteBuffers(1, &rf->ssbo); glCreateBuffers(1, &rf->ssbo); - glNamedBufferStorage(rf->ssbo, countof(rf->compute_syncs) * rf_size, 0, - GL_DYNAMIC_STORAGE_BIT|GL_MAP_WRITE_BIT); + u32 buffer_flags = GL_DYNAMIC_STORAGE_BIT; + if (!nvidia) buffer_flags |= GL_MAP_PERSISTENT_BIT|GL_MAP_WRITE_BIT; + + glNamedBufferStorage(rf->ssbo, countof(rf->compute_syncs) * rf_size, 0, buffer_flags); + + if (!nvidia) { + u32 access = GL_MAP_PERSISTENT_BIT|GL_MAP_WRITE_BIT|GL_MAP_FLUSH_EXPLICIT_BIT|GL_MAP_UNSYNCHRONIZED_BIT; + rf->buffer = glMapNamedBufferRange(rf->ssbo, 0, (GLsizei)(countof(rf->compute_syncs) * rf_size), access); + } + LABEL_GL_OBJECT(GL_BUFFER, rf->ssbo, s8("Raw_RF_SSBO")); rf->size = rf_size; } @@ -1406,10 +1415,12 @@ beamformer_rf_buffer_allocate(BeamformerRFBuffer *rf, u32 rf_size) DEBUG_EXPORT BEAMFORMER_RF_UPLOAD_FN(beamformer_rf_upload) { struct load_context { - u8 *buffer; + uptr buffer; void *data; + u32 offset; u32 channel_count; u32 channel_stride_bytes; + b32 nvidia; } load_context_store = {0}; struct load_context *lctx = 0; @@ -1427,10 +1438,11 @@ DEBUG_EXPORT BEAMFORMER_RF_UPLOAD_FN(beamformer_rf_upload) (scratch_rf_size = atomic_swap_u32(&sm->rf_meta.size, 0)) && os_shared_memory_region_lock(ctx->shared_memory, sm->locks, 
(i32)scratch_lock, (u32)-1)) { + lctx->nvidia = ctx->gl->vendor_id == GLVendor_NVIDIA; BeamformerRFBuffer *rf = ctx->rf_buffer; rf->active_rf_size = (u32)round_up_to(scratch_rf_size, 64); if (rf->size < rf->active_rf_size) - beamformer_rf_buffer_allocate(rf, rf->active_rf_size); + beamformer_rf_buffer_allocate(rf, rf->active_rf_size, lctx->nvidia); insertion_slot = rf->insertion_index++ % countof(rf->compute_syncs); @@ -1447,12 +1459,8 @@ DEBUG_EXPORT BEAMFORMER_RF_UPLOAD_FN(beamformer_rf_upload) glDeleteSync(rf->compute_syncs[insertion_slot]); } - /* NOTE(rnp): nVidia's drivers really don't play nice with persistant mapping, - * at least when it is a big as this one wants to be. mapping and unmapping the - * desired range each time doesn't seem to introduce any performance hit */ - u32 access = GL_MAP_WRITE_BIT|GL_MAP_FLUSH_EXPLICIT_BIT|GL_MAP_UNSYNCHRONIZED_BIT; - lctx->buffer = glMapNamedBufferRange(rf->ssbo, insertion_slot * rf->active_rf_size, - (i32)rf->active_rf_size, access); + lctx->offset = insertion_slot * rf->active_rf_size; + lctx->buffer = lctx->nvidia? 
rf->ssbo : (uptr)rf->buffer; lctx->data = beamformer_shared_memory_scratch_arena(sm).beg; BeamformerParameterBlock *b = beamformer_parameter_block(sm, atomic_load_u32(&sm->rf_meta.block)); @@ -1468,10 +1476,17 @@ DEBUG_EXPORT BEAMFORMER_RF_UPLOAD_FN(beamformer_rf_upload) if (lctx->buffer) { RangeU64 range = lane_range(lctx->channel_count); - for (u64 channel = range.start; channel < range.stop; channel++) { - u8 *out = lctx->buffer + channel * lctx->channel_stride_bytes; - u8 *in = lctx->data + channel * lctx->channel_stride_bytes; - mem_copy(out, in, lctx->channel_stride_bytes); + if (lctx->nvidia) { + i64 offset = (i64)(lctx->offset + range.start * lctx->channel_stride_bytes); + i32 size = (i32)(lctx->channel_stride_bytes * (range.stop - range.start)); + u8 *in = lctx->data + range.start * lctx->channel_stride_bytes; + glNamedBufferSubData((u32)lctx->buffer, offset, size, in); + } else { + for (u64 channel = range.start; channel < range.stop; channel++) { + u8 *in = lctx->data + channel * lctx->channel_stride_bytes; + u8 *out = (u8 *)lctx->buffer + lctx->offset + channel * lctx->channel_stride_bytes; + mem_copy(out, in, lctx->channel_stride_bytes); + } } } lane_sync(); @@ -1481,8 +1496,8 @@ DEBUG_EXPORT BEAMFORMER_RF_UPLOAD_FN(beamformer_rf_upload) post_sync_barrier(ctx->shared_memory, upload_lock, sm->locks); BeamformerRFBuffer *rf = ctx->rf_buffer; - glFlushMappedNamedBufferRange(rf->ssbo, 0, (i32)rf->active_rf_size); - glUnmapNamedBuffer(rf->ssbo); + if (!lctx->nvidia) + glFlushMappedNamedBufferRange(rf->ssbo, insertion_slot * rf->active_rf_size, (i32)rf->active_rf_size); atomic_store_u64(rf->upload_syncs + insertion_slot, glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0)); atomic_store_u64(rf->compute_syncs + insertion_slot, 0); diff --git a/beamformer.h b/beamformer.h @@ -25,13 +25,6 @@ function OS_WRITE_FILE_FN(os_write_file); #include "threads.c" #include "util_gl.c" -enum gl_vendor_ids { - GL_VENDOR_AMD, - GL_VENDOR_ARM, - GL_VENDOR_INTEL, - GL_VENDOR_NVIDIA, 
-}; - typedef struct { v2 mouse; v2 last_mouse; @@ -97,6 +90,33 @@ typedef struct { #include "beamformer_parameters.h" #include "beamformer_shared_memory.c" +typedef enum { + GLVendor_AMD, + GLVendor_ARM, + GLVendor_Intel, + GLVendor_NVIDIA, +} GLVendorID; + +#define GL_PARAMETERS \ + X(MAJOR_VERSION, version_major, "") \ + X(MINOR_VERSION, version_minor, "") \ + X(MIN_MAP_BUFFER_ALIGNMENT, min_map_buffer_alignment, "") \ + X(TEXTURE_BUFFER_OFFSET_ALIGNMENT, texture_buffer_offset_alignment, "") \ + X(MAX_TEXTURE_BUFFER_SIZE, max_texture_buffer_size, "") \ + X(MAX_TEXTURE_SIZE, max_2d_texture_dim, "") \ + X(MAX_3D_TEXTURE_SIZE, max_3d_texture_dim, "") \ + X(MAX_SHADER_STORAGE_BLOCK_SIZE, max_ssbo_size, "") \ + X(MAX_COMPUTE_SHARED_MEMORY_SIZE, max_shared_memory_size, "") \ + X(MAX_UNIFORM_BLOCK_SIZE, max_ubo_size, "") \ + X(MAX_SERVER_WAIT_TIMEOUT, max_server_wait_time, " [ns]") \ + +typedef struct { + GLVendorID vendor_id; + #define X(glname, name, suffix) i32 name; + GL_PARAMETERS + #undef X +} GLParameters; + typedef struct { iptr elements_offset; i32 elements; @@ -197,7 +217,10 @@ typedef struct { GLsync upload_syncs[BeamformerMaxRawDataFramesInFlight]; GLsync compute_syncs[BeamformerMaxRawDataFramesInFlight]; + u8 *buffer; + u32 ssbo; + u32 size; u32 active_rf_size; @@ -267,6 +290,7 @@ typedef struct { SharedMemoryRegion *shared_memory; ComputeTimingTable *compute_timing_table; i32 *compute_worker_sync; + GLParameters *gl; } BeamformerUploadThreadContext; struct BeamformerFrame { @@ -292,28 +316,8 @@ struct BeamformerFrame { BeamformerFrame *next; }; -#define GL_PARAMETERS \ - X(MAJOR_VERSION, version_major, "") \ - X(MINOR_VERSION, version_minor, "") \ - X(MIN_MAP_BUFFER_ALIGNMENT, min_map_buffer_alignment, "") \ - X(TEXTURE_BUFFER_OFFSET_ALIGNMENT, texture_buffer_offset_alignment, "") \ - X(MAX_TEXTURE_BUFFER_SIZE, max_texture_buffer_size, "") \ - X(MAX_TEXTURE_SIZE, max_2d_texture_dim, "") \ - X(MAX_3D_TEXTURE_SIZE, max_3d_texture_dim, "") \ - 
X(MAX_SHADER_STORAGE_BLOCK_SIZE, max_ssbo_size, "") \ - X(MAX_COMPUTE_SHARED_MEMORY_SIZE, max_shared_memory_size, "") \ - X(MAX_UNIFORM_BLOCK_SIZE, max_ubo_size, "") \ - X(MAX_SERVER_WAIT_TIMEOUT, max_server_wait_time, " [ns]") - -typedef struct { - enum gl_vendor_ids vendor_id; - #define X(glname, name, suffix) i32 name; - GL_PARAMETERS - #undef X -} GLParams; - typedef struct { - GLParams gl; + GLParameters gl; iv2 window_size; b32 should_exit; diff --git a/opengl.h b/opengl.h @@ -14,6 +14,7 @@ #define GL_MAP_WRITE_BIT 0x0002 #define GL_MAP_FLUSH_EXPLICIT_BIT 0x0010 #define GL_MAP_UNSYNCHRONIZED_BIT 0x0020 +#define GL_MAP_PERSISTENT_BIT 0x0040 #define GL_DYNAMIC_STORAGE_BIT 0x0100 #define GL_SHADER_IMAGE_ACCESS_BARRIER_BIT 0x00000020 #define GL_TEXTURE_UPDATE_BARRIER_BIT 0x00000100 diff --git a/static.c b/static.c @@ -94,7 +94,7 @@ gl_debug_logger(u32 src, u32 type, u32 id, u32 lvl, i32 len, const char *msg, co } function void -get_gl_params(GLParams *gl, Stream *err) +get_gl_params(GLParameters *gl, Stream *err) { char *vendor = (char *)glGetString(GL_VENDOR); if (!vendor) { @@ -103,13 +103,13 @@ get_gl_params(GLParams *gl, Stream *err) } /* TODO(rnp): str prefix of */ switch (vendor[0]) { - case 'A': gl->vendor_id = GL_VENDOR_AMD; break; - case 'I': gl->vendor_id = GL_VENDOR_INTEL; break; - case 'N': gl->vendor_id = GL_VENDOR_NVIDIA; break; + case 'A': gl->vendor_id = GLVendor_AMD; break; + case 'I': gl->vendor_id = GLVendor_Intel; break; + case 'N': gl->vendor_id = GLVendor_NVIDIA; break; /* NOTE(rnp): freedreno */ - case 'f': gl->vendor_id = GL_VENDOR_ARM; break; + case 'f': gl->vendor_id = GLVendor_ARM; break; /* NOTE(rnp): Microsoft Corporation - weird win32 thing (microsoft is just using mesa for the driver) */ - case 'M': gl->vendor_id = GL_VENDOR_ARM; break; + case 'M': gl->vendor_id = GLVendor_ARM; break; default: stream_append_s8s(err, s8("Unknown GL Vendor: "), c_str_to_s8(vendor), s8("\n")); os_fatal(stream_to_s8(err)); @@ -121,7 +121,7 @@ 
get_gl_params(GLParams *gl, Stream *err) } function void -validate_gl_requirements(GLParams *gl, Arena a) +validate_gl_requirements(GLParameters *gl, Arena a) { Stream s = arena_stream(a); @@ -139,7 +139,7 @@ validate_gl_requirements(GLParams *gl, Arena a) } function void -dump_gl_params(GLParams *gl, Arena a) +dump_gl_params(GLParameters *gl, Arena a) { #ifdef _DEBUG s8 vendor = s8("vendor:"); @@ -153,10 +153,10 @@ dump_gl_params(GLParams *gl, Arena a) stream_append_s8s(&s, s8("---- GL Parameters ----\n"), vendor); stream_pad(&s, ' ', max_width - (i32)vendor.len); switch (gl->vendor_id) { - case GL_VENDOR_AMD: stream_append_s8(&s, s8("AMD\n")); break; - case GL_VENDOR_ARM: stream_append_s8(&s, s8("ARM\n")); break; - case GL_VENDOR_INTEL: stream_append_s8(&s, s8("Intel\n")); break; - case GL_VENDOR_NVIDIA: stream_append_s8(&s, s8("nVidia\n")); break; + case GLVendor_AMD: stream_append_s8(&s, s8("AMD\n")); break; + case GLVendor_ARM: stream_append_s8(&s, s8("ARM\n")); break; + case GLVendor_Intel: stream_append_s8(&s, s8("Intel\n")); break; + case GLVendor_NVIDIA: stream_append_s8(&s, s8("nVidia\n")); break; } #define X(glname, name, suffix) \ @@ -202,11 +202,11 @@ function FILE_WATCH_CALLBACK_FN(load_cuda_library) { local_persist void *cuda_library_handle; - GLParams *gl = (typeof(gl))user_data; + GLParameters *gl = (typeof(gl))user_data; /* TODO(rnp): (25.10.30) registering the rf buffer with CUDA is currently * causing a major performance regression. for now we are disabling its use * altogether. 
it will be reenabled once the issue can be fixed */ - b32 result = 0 && gl->vendor_id == GL_VENDOR_NVIDIA && os_file_exists((c8 *)path.data); + b32 result = 0 && gl->vendor_id == GLVendor_NVIDIA && os_file_exists((c8 *)path.data); if (result) { Stream err = arena_stream(arena); @@ -459,6 +459,7 @@ setup_beamformer(Arena *memory, BeamformerCtx **o_ctx, BeamformerInput **o_input upctx->shared_memory = &ctx->shared_memory; upctx->compute_timing_table = ctx->compute_timing_table; upctx->compute_worker_sync = &ctx->compute_worker.sync_variable; + upctx->gl = &ctx->gl; upload->window_handle = glfwCreateWindow(1, 1, "", 0, raylib_window_handle); upload->handle = os_create_thread((iptr)upload, upload_worker_thread_entry_point); os_set_thread_name(worker->handle, s8("[upload_0]")); @@ -500,7 +501,7 @@ setup_beamformer(Arena *memory, BeamformerCtx **o_ctx, BeamformerInput **o_input LABEL_GL_OBJECT(GL_FRAMEBUFFER, fvr->framebuffers[1], s8("Frame View Resolving Framebuffer")); glCreateRenderbuffers(countof(fvr->renderbuffers), fvr->renderbuffers); - i32 msaa_samples = ctx->gl.vendor_id == GL_VENDOR_ARM? 4 : 8; + i32 msaa_samples = ctx->gl.vendor_id == GLVendor_ARM? 4 : 8; glNamedRenderbufferStorageMultisample(fvr->renderbuffers[0], msaa_samples, GL_RGBA8, FRAME_VIEW_RENDER_TARGET_SIZE); glNamedRenderbufferStorageMultisample(fvr->renderbuffers[1], msaa_samples, GL_DEPTH_COMPONENT24,