Commit: 0fad2570f0676f4b9663b8b134025676e01083a7
Parent: ca621f7d047fa723b32e6a172785f837d65fc734
Author: Randy Palamar
Date: Wed, 18 Jun 2025 11:23:24 -0600
shaders/decode: improve gpu cache behaviour
For a single work group we do not want data from more than one
channel. Assuming a worst-case sample count of 4096 samples per
transmit (8KB of i16 data per transmit), we can fit 16 transmits
in a 128KB L1 cache (pretty common even on low-end modern cards).
Therefore we want the work group size in the transmit dimension
to be 16.
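The arithmetic, as a standalone sketch (the main() harness and
variable names are illustrative, not from this codebase; samples
are i16 as in the decode shader):

    #include <stdio.h>

    int main(void)
    {
        unsigned l1_bytes           = 128 * 1024; /* assumed L1 size in bytes */
        unsigned samples            = 4096;       /* worst-case samples per transmit */
        unsigned bytes_per_transmit = samples * 2; /* i16 samples: 4096 * 2 = 8KB */
        /* 128KB / 8KB = 16 transmits fit in L1 at once */
        printf("transmits fitting in L1: %u\n", l1_bytes / bytes_per_transmit);
        return 0;
    }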
In the samples dimension the work group size is a little less
obvious, but we want it to be at least 2 on NVIDIA or 4 on AMD.
AMD typically reports an L0 (per work group) cache which we could
maybe use to make a better decision there, but on my (AMD) card
anything above 4 performed essentially the same. Choosing 4 is
fine for an NVIDIA card too, but only 32 items will actually get
scheduled on the same SM.
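The dispatch counts then come from ceiling division of the
decoded data dimensions by these work group sizes, mirroring the
glDispatchCompute() change below. A sketch with illustrative
dimensions (ceil_div() stands in for the patch's ceil_f32()
calls):

    #include <stdio.h>

    #define DECODE_LOCAL_SIZE_X 4  /* samples   */
    #define DECODE_LOCAL_SIZE_Y 1  /* channels  */
    #define DECODE_LOCAL_SIZE_Z 16 /* transmits */

    /* integer ceiling division */
    static unsigned ceil_div(unsigned n, unsigned d) { return (n + d - 1) / d; }

    int main(void)
    {
        /* illustrative decoded-data dimensions, not from the codebase */
        unsigned samples = 4096, channels = 128, transmits = 64;
        printf("dispatch: %u x %u x %u\n",
               ceil_div(samples,   DECODE_LOCAL_SIZE_X),
               ceil_div(channels,  DECODE_LOCAL_SIZE_Y),
               ceil_div(transmits, DECODE_LOCAL_SIZE_Z));
        return 0;
    }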
This results in a ~24% throughput increase on my system for
FORCES/HERCULES. The decode shader itself is now more than twice
as fast.
Diffstat:
4 files changed, 39 insertions(+), 35 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -224,12 +224,12 @@ start_compute_cursor(uv3 dim, u32 max_points)
struct compute_cursor result = {0};
u32 invocations_per_dispatch = DAS_LOCAL_SIZE_X * DAS_LOCAL_SIZE_Y * DAS_LOCAL_SIZE_Z;
- result.dispatch.y = MIN(max_points / invocations_per_dispatch, MAX(dim.y / DAS_LOCAL_SIZE_Y, 1));
+ result.dispatch.y = MIN(max_points / invocations_per_dispatch, ceil_f32((f32)dim.y / DAS_LOCAL_SIZE_Y));
u32 remaining = max_points / result.dispatch.y;
- result.dispatch.x = MIN(remaining / invocations_per_dispatch, MAX(dim.x / DAS_LOCAL_SIZE_X, 1));
+ result.dispatch.x = MIN(remaining / invocations_per_dispatch, ceil_f32((f32)dim.x / DAS_LOCAL_SIZE_X));
result.dispatch.z = MIN(remaining / (invocations_per_dispatch * result.dispatch.x),
- MAX(dim.z / DAS_LOCAL_SIZE_Z, 1));
+ ceil_f32((f32)dim.z / DAS_LOCAL_SIZE_Z));
result.target.x = MAX(dim.x / result.dispatch.x / DAS_LOCAL_SIZE_X, 1);
result.target.y = MAX(dim.y / result.dispatch.y / DAS_LOCAL_SIZE_Y, 1);
@@ -294,9 +294,9 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame,
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
glBindImageTexture(0, csctx->hadamard_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I);
glBindImageTexture(1, csctx->channel_mapping_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I);
- glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32),
- ORONE(csctx->dec_data_dim.y / 32),
- ORONE(csctx->dec_data_dim.z));
+ glDispatchCompute(ceil_f32((f32)csctx->dec_data_dim.x / DECODE_LOCAL_SIZE_X),
+ ceil_f32((f32)csctx->dec_data_dim.y / DECODE_LOCAL_SIZE_Y),
+ ceil_f32((f32)csctx->dec_data_dim.z / DECODE_LOCAL_SIZE_Z));
csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
}break;
case ShaderKind_CudaDecode:{
@@ -424,8 +424,13 @@ shader_text_with_header(ShaderReloadContext *ctx, OS *os, Arena *arena)
stream_append_s8(&sb, s8("#define INPUT_DATA_TYPE_FLOAT_COMPLEX\n\n"));
} /* FALLTHROUGH */
case ShaderKind_Decode:{
- #define X(type, id, pretty) stream_append_s8(&sb, s8("#define DECODE_MODE_" #type " " #id "\n"));
+ #define X(type, id, pretty) "#define DECODE_MODE_" #type " " #id "\n"
+ stream_append_s8(&sb, s8(""
+ "layout(local_size_x = " str(DECODE_LOCAL_SIZE_X) ", "
+ "local_size_y = " str(DECODE_LOCAL_SIZE_Y) ", "
+ "local_size_z = " str(DECODE_LOCAL_SIZE_Z) ") in;\n\n"
DECODE_TYPES
+ ));
#undef X
}break;
case ShaderKind_MinMax:{
diff --git a/beamformer_parameters.h b/beamformer_parameters.h
@@ -59,6 +59,10 @@ typedef enum {
X(EPIC_UHERCULES, 9, "EPIC-UHERCULES", 0) \
X(FLASH, 10, "Flash", 0)
+#define DECODE_LOCAL_SIZE_X 4
+#define DECODE_LOCAL_SIZE_Y 1
+#define DECODE_LOCAL_SIZE_Z 16
+
#define DAS_LOCAL_SIZE_X 32
#define DAS_LOCAL_SIZE_Y 1
#define DAS_LOCAL_SIZE_Z 32
diff --git a/intrinsics.c b/intrinsics.c
@@ -37,8 +37,9 @@
#define atomic_cas_u32(ptr, cptr, n) (_InterlockedCompareExchange((volatile u32 *)(ptr), *(cptr), (n)) == *(cptr))
#define atomic_or_u32(ptr, n) _InterlockedOr((volatile u32 *)(ptr), (n))
- #define sqrt_f32(a) sqrtf(a)
#define atan2_f32(y, x) atan2f(y, x)
+ #define ceil_f32(a) ceilf(a)
+ #define sqrt_f32(a) sqrtf(a)
#else
#define align_as(n) __attribute__((aligned(n)))
@@ -64,8 +65,9 @@
#define atomic_cas_u32 atomic_cas_u64
#define atomic_load_u32 atomic_load_u64
- #define sqrt_f32(a) __builtin_sqrtf(a)
#define atan2_f32(y, x) __builtin_atan2f(y, x)
+ #define ceil_f32(a) __builtin_ceilf(a)
+ #define sqrt_f32(a) __builtin_sqrtf(a)
#endif
diff --git a/shaders/decode.glsl b/shaders/decode.glsl
@@ -1,5 +1,12 @@
/* See LICENSE for license details. */
-layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
+
+/* NOTE(rnp): invoked with samples x channels x transmits
+ * Each invocation extracts a single time sample from a single channel for all transmits
+ * and does a dot product with the appropriate row of the bound Hadamard matrix
+ * (unless decode_mode == DECODE_MODE_NONE). The result of this dot product is stored in the
+ * output. In bulk this has the effect of computing a matrix multiply of the
+ * sample-transmit plane with the bound Hadamard matrix.
+ */
#if defined(INPUT_DATA_TYPE_FLOAT)
#define INPUT_DATA_TYPE float
@@ -32,6 +39,8 @@ INPUT_DATA_TYPE sample_rf_data(int index, uint lfs)
#if defined(INPUT_DATA_TYPE_FLOAT) || defined(INPUT_DATA_TYPE_FLOAT_COMPLEX)
result = rf_data[index];
#else
+ /* NOTE(rnp): for i16 rf_data we grab 2 samples at a time. We need to shift
+ * arithmetically (maintaining the sign) to get the desired element. */
result = (rf_data[index] << lfs) >> 16;
#endif
return result;
@@ -39,46 +48,30 @@ INPUT_DATA_TYPE sample_rf_data(int index, uint lfs)
void main()
{
- /* NOTE: each invocation takes a time sample and a receive channel.
- * It first maps the column to the correct column in the rf data then
- * does the dot product with the equivalent row of the hadamard matrix.
- * The result is stored to the equivalent row, column index of the output.
- */
int time_sample = int(gl_GlobalInvocationID.x);
int channel = int(gl_GlobalInvocationID.y);
- int acq = int(gl_GlobalInvocationID.z);
+ int transmit = int(gl_GlobalInvocationID.z);
- /* NOTE: offsets for storing the results in the output data */
- uint out_off = dec_data_dim.x * dec_data_dim.y * acq + dec_data_dim.x * channel + time_sample;
+ /* NOTE(rnp): stores output as a 3D matrix with ordering of {samples, channels, transmits} */
+ uint out_off = dec_data_dim.x * dec_data_dim.y * transmit + dec_data_dim.x * channel + time_sample;
int rf_channel = imageLoad(channel_mapping, channel).x;
- /* NOTE: stride is the number of samples between acquistions; off is the
- * index of the first acquisition for this channel and time sample */
- int rf_stride = int(dec_data_dim.x);
- int rf_off = int(rf_raw_dim.x) * rf_channel + time_sample;
-
- /* NOTE: rf_data index and stride considering the data is i16 not i32 */
- int ridx = rf_off / RF_SAMPLES_PER_INDEX;
- int ridx_delta = rf_stride / RF_SAMPLES_PER_INDEX;
+ /* NOTE(rnp): samples input as 2D matrix of {samples * transmits + padding, channels} */
+ int rf_stride = int(dec_data_dim.x) / RF_SAMPLES_PER_INDEX;
+ int rf_offset = (int(rf_raw_dim.x) * rf_channel + time_sample) / RF_SAMPLES_PER_INDEX;
- /* NOTE: rf_data is i16 so each access grabs two time samples at time.
- * We need to shift arithmetically (maintaining the sign) to get the
- * desired element. If the time sample is even we take the upper half
- * and if its odd we take the lower half. */
uint lfs = ((~time_sample) & 1u) * 16;
-
- /* NOTE: Compute N-D dot product */
vec2 result = vec2(0);
switch (decode) {
case DECODE_MODE_NONE: {
- result = RESULT_TYPE_CAST(sample_rf_data(ridx + ridx_delta * acq, lfs));
+ result = RESULT_TYPE_CAST(sample_rf_data(rf_offset + rf_stride * transmit, lfs));
} break;
case DECODE_MODE_HADAMARD: {
INPUT_DATA_TYPE sum = INPUT_DATA_TYPE(0);
for (int i = 0; i < dec_data_dim.z; i++) {
- sum += imageLoad(hadamard, ivec2(i, acq)).x * sample_rf_data(ridx, lfs);
- ridx += ridx_delta;
+ sum += imageLoad(hadamard, ivec2(i, transmit)).x * sample_rf_data(rf_offset, lfs);
+ rf_offset += rf_stride;
}
result = RESULT_TYPE_CAST(sum) / float(dec_data_dim.z);
} break;