Commit: 0269e2ebd19872b3d2507bb0849519ec019d6a2c
Parent: b3c9e90173f7ce8b1134d6449ca222a6d9c43c27
Author: Randy Palamar
Date: Thu, 13 Nov 2025 09:19:02 -0700
core: use NamedBufferSubData for rf data on NVIDIA
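This fixes one part of the weird performance regression: on NVIDIA the rf data is now uploaded with glNamedBufferSubData, while other vendors write through a persistently mapped buffer.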
Diffstat:
| M | beamformer.c | | | 51 | +++++++++++++++++++++++++++++++++------------------ |
| M | beamformer.h | | | 60 | ++++++++++++++++++++++++++++++++---------------------------- |
| M | opengl.h | | | 1 | + |
| M | static.c | | | 31 | ++++++++++++++++--------------- |
4 files changed, 82 insertions(+), 61 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -209,7 +209,7 @@ make_valid_output_points(i32 points[3])
}
function void
-alloc_beamform_frame(GLParams *gp, BeamformerFrame *out, iv3 out_dim, GLenum gl_kind, s8 name, Arena arena)
+alloc_beamform_frame(GLParameters *gp, BeamformerFrame *out, iv3 out_dim, GLenum gl_kind, s8 name, Arena arena)
{
out->dim = make_valid_output_points(out_dim.E);
if (gp) {
@@ -1391,14 +1391,23 @@ DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute)
}
function void
-beamformer_rf_buffer_allocate(BeamformerRFBuffer *rf, u32 rf_size)
+beamformer_rf_buffer_allocate(BeamformerRFBuffer *rf, u32 rf_size, b32 nvidia)
{
assert((rf_size % 64) == 0);
+ if (!nvidia) glUnmapNamedBuffer(rf->ssbo);
glDeleteBuffers(1, &rf->ssbo);
glCreateBuffers(1, &rf->ssbo);
- glNamedBufferStorage(rf->ssbo, countof(rf->compute_syncs) * rf_size, 0,
- GL_DYNAMIC_STORAGE_BIT|GL_MAP_WRITE_BIT);
+ u32 buffer_flags = GL_DYNAMIC_STORAGE_BIT;
+ if (!nvidia) buffer_flags |= GL_MAP_PERSISTENT_BIT|GL_MAP_WRITE_BIT;
+
+ glNamedBufferStorage(rf->ssbo, countof(rf->compute_syncs) * rf_size, 0, buffer_flags);
+
+ if (!nvidia) {
+ u32 access = GL_MAP_PERSISTENT_BIT|GL_MAP_WRITE_BIT|GL_MAP_FLUSH_EXPLICIT_BIT|GL_MAP_UNSYNCHRONIZED_BIT;
+ rf->buffer = glMapNamedBufferRange(rf->ssbo, 0, (GLsizei)(countof(rf->compute_syncs) * rf_size), access);
+ }
+
LABEL_GL_OBJECT(GL_BUFFER, rf->ssbo, s8("Raw_RF_SSBO"));
rf->size = rf_size;
}
@@ -1406,10 +1415,12 @@ beamformer_rf_buffer_allocate(BeamformerRFBuffer *rf, u32 rf_size)
DEBUG_EXPORT BEAMFORMER_RF_UPLOAD_FN(beamformer_rf_upload)
{
struct load_context {
- u8 *buffer;
+ uptr buffer;
void *data;
+ u32 offset;
u32 channel_count;
u32 channel_stride_bytes;
+ b32 nvidia;
} load_context_store = {0};
struct load_context *lctx = 0;
@@ -1427,10 +1438,11 @@ DEBUG_EXPORT BEAMFORMER_RF_UPLOAD_FN(beamformer_rf_upload)
(scratch_rf_size = atomic_swap_u32(&sm->rf_meta.size, 0)) &&
os_shared_memory_region_lock(ctx->shared_memory, sm->locks, (i32)scratch_lock, (u32)-1))
{
+ lctx->nvidia = ctx->gl->vendor_id == GLVendor_NVIDIA;
BeamformerRFBuffer *rf = ctx->rf_buffer;
rf->active_rf_size = (u32)round_up_to(scratch_rf_size, 64);
if (rf->size < rf->active_rf_size)
- beamformer_rf_buffer_allocate(rf, rf->active_rf_size);
+ beamformer_rf_buffer_allocate(rf, rf->active_rf_size, lctx->nvidia);
insertion_slot = rf->insertion_index++ % countof(rf->compute_syncs);
@@ -1447,12 +1459,8 @@ DEBUG_EXPORT BEAMFORMER_RF_UPLOAD_FN(beamformer_rf_upload)
glDeleteSync(rf->compute_syncs[insertion_slot]);
}
- /* NOTE(rnp): nVidia's drivers really don't play nice with persistant mapping,
- * at least when it is a big as this one wants to be. mapping and unmapping the
- * desired range each time doesn't seem to introduce any performance hit */
- u32 access = GL_MAP_WRITE_BIT|GL_MAP_FLUSH_EXPLICIT_BIT|GL_MAP_UNSYNCHRONIZED_BIT;
- lctx->buffer = glMapNamedBufferRange(rf->ssbo, insertion_slot * rf->active_rf_size,
- (i32)rf->active_rf_size, access);
+ lctx->offset = insertion_slot * rf->active_rf_size;
+ lctx->buffer = lctx->nvidia? rf->ssbo : (uptr)rf->buffer;
lctx->data = beamformer_shared_memory_scratch_arena(sm).beg;
BeamformerParameterBlock *b = beamformer_parameter_block(sm, atomic_load_u32(&sm->rf_meta.block));
@@ -1468,10 +1476,17 @@ DEBUG_EXPORT BEAMFORMER_RF_UPLOAD_FN(beamformer_rf_upload)
if (lctx->buffer) {
RangeU64 range = lane_range(lctx->channel_count);
- for (u64 channel = range.start; channel < range.stop; channel++) {
- u8 *out = lctx->buffer + channel * lctx->channel_stride_bytes;
- u8 *in = lctx->data + channel * lctx->channel_stride_bytes;
- mem_copy(out, in, lctx->channel_stride_bytes);
+ if (lctx->nvidia) {
+ i64 offset = (i64)(lctx->offset + range.start * lctx->channel_stride_bytes);
+ i32 size = (i32)(lctx->channel_stride_bytes * (range.stop - range.start));
+ u8 *in = lctx->data + range.start * lctx->channel_stride_bytes;
+ glNamedBufferSubData((u32)lctx->buffer, offset, size, in);
+ } else {
+ for (u64 channel = range.start; channel < range.stop; channel++) {
+ u8 *in = lctx->data + channel * lctx->channel_stride_bytes;
+ u8 *out = (u8 *)lctx->buffer + lctx->offset + channel * lctx->channel_stride_bytes;
+ mem_copy(out, in, lctx->channel_stride_bytes);
+ }
}
}
lane_sync();
@@ -1481,8 +1496,8 @@ DEBUG_EXPORT BEAMFORMER_RF_UPLOAD_FN(beamformer_rf_upload)
post_sync_barrier(ctx->shared_memory, upload_lock, sm->locks);
BeamformerRFBuffer *rf = ctx->rf_buffer;
- glFlushMappedNamedBufferRange(rf->ssbo, 0, (i32)rf->active_rf_size);
- glUnmapNamedBuffer(rf->ssbo);
+ if (!lctx->nvidia)
+ glFlushMappedNamedBufferRange(rf->ssbo, insertion_slot * rf->active_rf_size, (i32)rf->active_rf_size);
atomic_store_u64(rf->upload_syncs + insertion_slot, glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0));
atomic_store_u64(rf->compute_syncs + insertion_slot, 0);
diff --git a/beamformer.h b/beamformer.h
@@ -25,13 +25,6 @@ function OS_WRITE_FILE_FN(os_write_file);
#include "threads.c"
#include "util_gl.c"
-enum gl_vendor_ids {
- GL_VENDOR_AMD,
- GL_VENDOR_ARM,
- GL_VENDOR_INTEL,
- GL_VENDOR_NVIDIA,
-};
-
typedef struct {
v2 mouse;
v2 last_mouse;
@@ -97,6 +90,33 @@ typedef struct {
#include "beamformer_parameters.h"
#include "beamformer_shared_memory.c"
+typedef enum {
+ GLVendor_AMD,
+ GLVendor_ARM,
+ GLVendor_Intel,
+ GLVendor_NVIDIA,
+} GLVendorID;
+
+#define GL_PARAMETERS \
+ X(MAJOR_VERSION, version_major, "") \
+ X(MINOR_VERSION, version_minor, "") \
+ X(MIN_MAP_BUFFER_ALIGNMENT, min_map_buffer_alignment, "") \
+ X(TEXTURE_BUFFER_OFFSET_ALIGNMENT, texture_buffer_offset_alignment, "") \
+ X(MAX_TEXTURE_BUFFER_SIZE, max_texture_buffer_size, "") \
+ X(MAX_TEXTURE_SIZE, max_2d_texture_dim, "") \
+ X(MAX_3D_TEXTURE_SIZE, max_3d_texture_dim, "") \
+ X(MAX_SHADER_STORAGE_BLOCK_SIZE, max_ssbo_size, "") \
+ X(MAX_COMPUTE_SHARED_MEMORY_SIZE, max_shared_memory_size, "") \
+ X(MAX_UNIFORM_BLOCK_SIZE, max_ubo_size, "") \
+ X(MAX_SERVER_WAIT_TIMEOUT, max_server_wait_time, " [ns]") \
+
+typedef struct {
+ GLVendorID vendor_id;
+ #define X(glname, name, suffix) i32 name;
+ GL_PARAMETERS
+ #undef X
+} GLParameters;
+
typedef struct {
iptr elements_offset;
i32 elements;
@@ -197,7 +217,10 @@ typedef struct {
GLsync upload_syncs[BeamformerMaxRawDataFramesInFlight];
GLsync compute_syncs[BeamformerMaxRawDataFramesInFlight];
+ u8 *buffer;
+
u32 ssbo;
+
u32 size;
u32 active_rf_size;
@@ -267,6 +290,7 @@ typedef struct {
SharedMemoryRegion *shared_memory;
ComputeTimingTable *compute_timing_table;
i32 *compute_worker_sync;
+ GLParameters *gl;
} BeamformerUploadThreadContext;
struct BeamformerFrame {
@@ -292,28 +316,8 @@ struct BeamformerFrame {
BeamformerFrame *next;
};
-#define GL_PARAMETERS \
- X(MAJOR_VERSION, version_major, "") \
- X(MINOR_VERSION, version_minor, "") \
- X(MIN_MAP_BUFFER_ALIGNMENT, min_map_buffer_alignment, "") \
- X(TEXTURE_BUFFER_OFFSET_ALIGNMENT, texture_buffer_offset_alignment, "") \
- X(MAX_TEXTURE_BUFFER_SIZE, max_texture_buffer_size, "") \
- X(MAX_TEXTURE_SIZE, max_2d_texture_dim, "") \
- X(MAX_3D_TEXTURE_SIZE, max_3d_texture_dim, "") \
- X(MAX_SHADER_STORAGE_BLOCK_SIZE, max_ssbo_size, "") \
- X(MAX_COMPUTE_SHARED_MEMORY_SIZE, max_shared_memory_size, "") \
- X(MAX_UNIFORM_BLOCK_SIZE, max_ubo_size, "") \
- X(MAX_SERVER_WAIT_TIMEOUT, max_server_wait_time, " [ns]")
-
-typedef struct {
- enum gl_vendor_ids vendor_id;
- #define X(glname, name, suffix) i32 name;
- GL_PARAMETERS
- #undef X
-} GLParams;
-
typedef struct {
- GLParams gl;
+ GLParameters gl;
iv2 window_size;
b32 should_exit;
diff --git a/opengl.h b/opengl.h
@@ -14,6 +14,7 @@
#define GL_MAP_WRITE_BIT 0x0002
#define GL_MAP_FLUSH_EXPLICIT_BIT 0x0010
#define GL_MAP_UNSYNCHRONIZED_BIT 0x0020
+#define GL_MAP_PERSISTENT_BIT 0x0040
#define GL_DYNAMIC_STORAGE_BIT 0x0100
#define GL_SHADER_IMAGE_ACCESS_BARRIER_BIT 0x00000020
#define GL_TEXTURE_UPDATE_BARRIER_BIT 0x00000100
diff --git a/static.c b/static.c
@@ -94,7 +94,7 @@ gl_debug_logger(u32 src, u32 type, u32 id, u32 lvl, i32 len, const char *msg, co
}
function void
-get_gl_params(GLParams *gl, Stream *err)
+get_gl_params(GLParameters *gl, Stream *err)
{
char *vendor = (char *)glGetString(GL_VENDOR);
if (!vendor) {
@@ -103,13 +103,13 @@ get_gl_params(GLParams *gl, Stream *err)
}
/* TODO(rnp): str prefix of */
switch (vendor[0]) {
- case 'A': gl->vendor_id = GL_VENDOR_AMD; break;
- case 'I': gl->vendor_id = GL_VENDOR_INTEL; break;
- case 'N': gl->vendor_id = GL_VENDOR_NVIDIA; break;
+ case 'A': gl->vendor_id = GLVendor_AMD; break;
+ case 'I': gl->vendor_id = GLVendor_Intel; break;
+ case 'N': gl->vendor_id = GLVendor_NVIDIA; break;
/* NOTE(rnp): freedreno */
- case 'f': gl->vendor_id = GL_VENDOR_ARM; break;
+ case 'f': gl->vendor_id = GLVendor_ARM; break;
/* NOTE(rnp): Microsoft Corporation - weird win32 thing (microsoft is just using mesa for the driver) */
- case 'M': gl->vendor_id = GL_VENDOR_ARM; break;
+ case 'M': gl->vendor_id = GLVendor_ARM; break;
default:
stream_append_s8s(err, s8("Unknown GL Vendor: "), c_str_to_s8(vendor), s8("\n"));
os_fatal(stream_to_s8(err));
@@ -121,7 +121,7 @@ get_gl_params(GLParams *gl, Stream *err)
}
function void
-validate_gl_requirements(GLParams *gl, Arena a)
+validate_gl_requirements(GLParameters *gl, Arena a)
{
Stream s = arena_stream(a);
@@ -139,7 +139,7 @@ validate_gl_requirements(GLParams *gl, Arena a)
}
function void
-dump_gl_params(GLParams *gl, Arena a)
+dump_gl_params(GLParameters *gl, Arena a)
{
#ifdef _DEBUG
s8 vendor = s8("vendor:");
@@ -153,10 +153,10 @@ dump_gl_params(GLParams *gl, Arena a)
stream_append_s8s(&s, s8("---- GL Parameters ----\n"), vendor);
stream_pad(&s, ' ', max_width - (i32)vendor.len);
switch (gl->vendor_id) {
- case GL_VENDOR_AMD: stream_append_s8(&s, s8("AMD\n")); break;
- case GL_VENDOR_ARM: stream_append_s8(&s, s8("ARM\n")); break;
- case GL_VENDOR_INTEL: stream_append_s8(&s, s8("Intel\n")); break;
- case GL_VENDOR_NVIDIA: stream_append_s8(&s, s8("nVidia\n")); break;
+ case GLVendor_AMD: stream_append_s8(&s, s8("AMD\n")); break;
+ case GLVendor_ARM: stream_append_s8(&s, s8("ARM\n")); break;
+ case GLVendor_Intel: stream_append_s8(&s, s8("Intel\n")); break;
+ case GLVendor_NVIDIA: stream_append_s8(&s, s8("nVidia\n")); break;
}
#define X(glname, name, suffix) \
@@ -202,11 +202,11 @@ function FILE_WATCH_CALLBACK_FN(load_cuda_library)
{
local_persist void *cuda_library_handle;
- GLParams *gl = (typeof(gl))user_data;
+ GLParameters *gl = (typeof(gl))user_data;
/* TODO(rnp): (25.10.30) registering the rf buffer with CUDA is currently
* causing a major performance regression. for now we are disabling its use
* altogether. it will be reenabled once the issue can be fixed */
- b32 result = 0 && gl->vendor_id == GL_VENDOR_NVIDIA && os_file_exists((c8 *)path.data);
+ b32 result = 0 && gl->vendor_id == GLVendor_NVIDIA && os_file_exists((c8 *)path.data);
if (result) {
Stream err = arena_stream(arena);
@@ -459,6 +459,7 @@ setup_beamformer(Arena *memory, BeamformerCtx **o_ctx, BeamformerInput **o_input
upctx->shared_memory = &ctx->shared_memory;
upctx->compute_timing_table = ctx->compute_timing_table;
upctx->compute_worker_sync = &ctx->compute_worker.sync_variable;
+ upctx->gl = &ctx->gl;
upload->window_handle = glfwCreateWindow(1, 1, "", 0, raylib_window_handle);
upload->handle = os_create_thread((iptr)upload, upload_worker_thread_entry_point);
os_set_thread_name(worker->handle, s8("[upload_0]"));
@@ -500,7 +501,7 @@ setup_beamformer(Arena *memory, BeamformerCtx **o_ctx, BeamformerInput **o_input
LABEL_GL_OBJECT(GL_FRAMEBUFFER, fvr->framebuffers[1], s8("Frame View Resolving Framebuffer"));
glCreateRenderbuffers(countof(fvr->renderbuffers), fvr->renderbuffers);
- i32 msaa_samples = ctx->gl.vendor_id == GL_VENDOR_ARM? 4 : 8;
+ i32 msaa_samples = ctx->gl.vendor_id == GLVendor_ARM? 4 : 8;
glNamedRenderbufferStorageMultisample(fvr->renderbuffers[0], msaa_samples, GL_RGBA8,
FRAME_VIEW_RENDER_TARGET_SIZE);
glNamedRenderbufferStorageMultisample(fvr->renderbuffers[1], msaa_samples, GL_DEPTH_COMPONENT24,