use BufferSubData for uploading rf data on NVIDIA/AMD - ogl_beamforming - Ultrasound Beamforming Implemented with OpenGL

Commit: 8741f0be4c6ceab20d7fa32cb71da28a825b9f91
Parent: ea9d5a0270e873fd7ad2e8e06f154671a291ba20
Author: Randy Palamar
Date:   Tue, 13 Aug 2024 10:54:45 -0600

use BufferSubData for uploading rf data on NVIDIA/AMD

This may seem counter-intuitive, and it goes against all of NVIDIA's
presentations and other documentation, but it is the fastest way
to upload the data.

Unfortunately this doesn't work on Intel for some reason so it
gets a separate path. The separate path code is necessary anyway
because we want to add CUDA support for NVIDIA.

Diffstat:
M beamformer.c  | 58 ++++++++++++++++++++++++++++++++++++----------------------
M beamformer.h  | 17 ++++++++++++++---
M main.c  | 12 ++++++++++++

3 files changed, 62 insertions(+), 25 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -69,12 +69,17 @@ alloc_shader_storage(BeamformerCtx *ctx, Arena a)
 	glDeleteBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
 	glGenBuffers(ARRAY_COUNT(cs->rf_data_ssbos),    cs->rf_data_ssbos);
 
+	i32 storage_flags = GL_DYNAMIC_STORAGE_BIT;
+	if (ctx->gl_vendor_id == GL_VENDOR_INTEL)
+		storage_flags |= GL_MAP_WRITE_BIT;
 	glDeleteBuffers(1, &cs->raw_data_ssbo);
-	glGenBuffers(1,    &cs->raw_data_ssbo);
+	glCreateBuffers(1, &cs->raw_data_ssbo);
+	glNamedBufferStorage(cs->raw_data_ssbo, ARRAY_COUNT(cs->raw_data_fences) * rf_raw_size, 0,
+	                     storage_flags);
 
-	glBindBuffer(GL_SHADER_STORAGE_BUFFER, cs->raw_data_ssbo);
-	glBufferStorage(GL_SHADER_STORAGE_BUFFER, ARRAY_COUNT(cs->raw_data_fences) * rf_raw_size,
-	                0, GL_MAP_WRITE_BIT);
+	/* TODO: allow this to grow if the raw data has been resized */
+	if (cs->raw_data_arena.beg == 0)
+		cs->raw_data_arena = os_new_arena(rf_raw_size);
 
 	for (u32 i = 0; i < ARRAY_COUNT(cs->rf_data_ssbos); i++) {
 		glBindBuffer(GL_SHADER_STORAGE_BUFFER, cs->rf_data_ssbos[i]);
@@ -538,7 +543,8 @@ do_beamformer(BeamformerCtx *ctx, Arena arena)
 	BeamformerParameters *bp = &ctx->params->raw;
 	/* NOTE: Check for and Load RF Data into GPU */
 	if (os_poll_pipe(ctx->data_pipe)) {
-		if (!uv4_equal(ctx->csctx.dec_data_dim, bp->dec_data_dim) || ctx->flags & ALLOC_SSBOS)
+		ComputeShaderCtx *cs = &ctx->csctx;
+		if (!uv4_equal(cs->dec_data_dim, bp->dec_data_dim) || ctx->flags & ALLOC_SSBOS)
 			alloc_shader_storage(ctx, arena);
 		if (!uv4_equal(ctx->out_data_dim, bp->output_points) || ctx->flags & ALLOC_OUT_TEX)
 			alloc_output_image(ctx);
@@ -547,31 +553,39 @@ do_beamformer(BeamformerCtx *ctx, Arena arena)
 		/* NOTE: if this times out it means the command queue is more than 3 frames behind.
 		 * In that case we need to re-evaluate the buffer size */
 		if (ctx->csctx.raw_data_fences[raw_index]) {
-			i32 result = glClientWaitSync(ctx->csctx.raw_data_fences[raw_index], 0, 10000);
+			i32 result = glClientWaitSync(cs->raw_data_fences[raw_index], 0, 10000);
 			if (result == GL_TIMEOUT_EXPIRED) {
 				//ASSERT(0);
 			}
-			glDeleteSync(ctx->csctx.raw_data_fences[raw_index]);
-			ctx->csctx.raw_data_fences[raw_index] = NULL;
+			glDeleteSync(cs->raw_data_fences[raw_index]);
+			cs->raw_data_fences[raw_index] = NULL;
 		}
 
-		uv2  rf_raw_dim   = ctx->csctx.rf_raw_dim;
+		uv2  rf_raw_dim   = cs->rf_raw_dim;
 		size rf_raw_size  = rf_raw_dim.x * rf_raw_dim.y * sizeof(i16);
 
-		glBindBuffer(GL_SHADER_STORAGE_BUFFER, ctx->csctx.raw_data_ssbo);
-		void *rf_data_buf = glMapBufferRange(GL_SHADER_STORAGE_BUFFER,
-		                                     raw_index * rf_raw_size, rf_raw_size,
-		                                     GL_MAP_UNSYNCHRONIZED_BIT | GL_MAP_WRITE_BIT);
-		if (!rf_data_buf) {
-			rlCheckErrors();
-			ASSERT(0);
+		if (ctx->gl_vendor_id == GL_VENDOR_INTEL) {
+			/* TODO: intel complains about this buffer being busy even with
+			 * MAP_UNSYNCHRONIZED_BIT */
+			void *rf_data_buf = glMapNamedBufferRange(cs->raw_data_ssbo,
+			                                          raw_index * rf_raw_size,
+			                                          rf_raw_size,
+			                                          GL_MAP_WRITE_BIT);
+			size rlen = os_read_pipe_data(ctx->data_pipe, rf_data_buf, rf_raw_size);
+			glUnmapNamedBuffer(cs->raw_data_ssbo);
+			if (rlen == rf_raw_size) ctx->flags |= DO_COMPUTE;
+			else                     ctx->partial_transfer_count++;
+		} else {
+			void *rf_data_buf = cs->raw_data_arena.beg + raw_index * rf_raw_size;
+			size rlen = os_read_pipe_data(ctx->data_pipe, rf_data_buf, rf_raw_size);
+			if (rlen == rf_raw_size) {
+				ctx->flags |= DO_COMPUTE;
+				glNamedBufferSubData(cs->raw_data_ssbo, raw_index * rf_raw_size,
+				                     rf_raw_size, rf_data_buf);
+			} else {
+				ctx->partial_transfer_count++;
+			}
 		}
-		size rlen = os_read_pipe_data(ctx->data_pipe, rf_data_buf, rf_raw_size);
-
-		glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
-
-		if (rlen == rf_raw_size) ctx->flags |= DO_COMPUTE;
-		else                     ctx->partial_transfer_count++;
 	}
 
 	if (ctx->flags & UPLOAD_FILTER)
diff --git a/beamformer.h b/beamformer.h
@@ -60,6 +60,12 @@ enum program_flags {
 	DO_COMPUTE     = 1 << 30,
 };
 
+enum gl_vendor_ids {
+	GL_VENDOR_AMD,
+	GL_VENDOR_INTEL,
+	GL_VENDOR_NVIDIA,
+};
+
 typedef struct {
 	char buf[64];
 	i32  buf_len;
@@ -98,13 +104,17 @@ typedef struct {
 	GLsync timer_fence;
 	f32    last_frame_time[CS_LAST];
 
-	/* NOTE: multiple raw data SSBOs for unsynchronized mapping.
-	 * Decoded data is only relavent in the context of a single frame, two are
-	 * used so that they can be swapped when chaining multiple compute stages */
+	/* NOTE: the raw_data_ssbo is allocated at 3x the required size to allow for tiled
+	 * transfers when the GPU is running behind the CPU. It is not mapped because NVIDIA's
+	 * drivers _will_ store the buffer in the system memory in that case (this doesn't happen
+	 * for Intel or AMD). Instead BufferSubData is used to update the correct subrange */
 	GLsync raw_data_fences[3];
+	Arena  raw_data_arena;
 	u32    raw_data_ssbo;
 	u32    raw_data_index;
 
+	/* NOTE: Decoded data is only relevant in the context of a single frame. We use two
+	 * buffers so that they can be swapped when chaining multiple compute stages */
 	u32 rf_data_ssbos[2];
 	u32 last_output_ssbo_index;
 	u32 hadamard_ssbo;
@@ -134,6 +144,7 @@ typedef struct {
 typedef struct {
 	uv2 window_size;
 	u32 flags;
+	enum gl_vendor_ids gl_vendor_id;
 
 	f32 dt;
 
diff --git a/main.c b/main.c
@@ -185,6 +185,18 @@ main(void)
 
 	ctx.params->raw.output_points = ctx.out_data_dim;
 
+	/* NOTE: Determine which graphics vendor we are running on */
+	{
+		const u8 *vendor = glGetString(GL_VENDOR);
+		if (!vendor)
+			die("Failed to determine GL Vendor\n");
+		switch (vendor[0]) {
+		case 'A': ctx.gl_vendor_id = GL_VENDOR_AMD;       break;
+		case 'I': ctx.gl_vendor_id = GL_VENDOR_INTEL;     break;
+		case 'N': ctx.gl_vendor_id = GL_VENDOR_NVIDIA;    break;
+		default:  die("Unknown GL Vendor: %s\n", vendor); break;
+		}
+	}
 
 	/* NOTE: set up OpenGL debug logging */
 	glDebugMessageCallback(gl_debug_logger, NULL);

M	beamformer.c	\|	58	++++++++++++++++++++++++++++++++++++----------------------
M	beamformer.h	\|	17	++++++++++++++---
M	main.c	\|	12	++++++++++++