Commit: b7655d49b9f84c14c5b7055305091f2771be2fca
Parent: 3939648169851306c32bf37a52f73ac8567c85fb
Author: Randy Palamar
Date: Mon, 3 Nov 2025 11:44:22 -0700
shaders/decode: use register caching for transmit counts < 48
Preloading all needed samples into registers and then calculating
the output for all transmits in a single thread gives better
performance than spreading the processing across multiple threads.
We have to schedule fewer than 64 threads per wave and only have
3/16 waves in flight, but it still gives better performance.

Note that the number of scheduled threads may need to be adjusted
on NVIDIA, since their warps are only 32 threads wide; until we
can fix our test setup this may introduce a small performance hit
on NVIDIA (for these small transmit counts).
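For illustration, a minimal host-side C model of the register-caching
idea follows. The names (make_hadamard, decode_one_thread) and the
Sylvester Hadamard construction are mine for the sketch, not part of
the shader interface:

#include <stdio.h>

#define TRANSMIT_COUNT 8 /* <= 40, i.e. the register-cached path would apply */

/* Sylvester construction; stands in for the shader's hadamard image */
static void make_hadamard(float h[TRANSMIT_COUNT][TRANSMIT_COUNT])
{
	h[0][0] = 1.0f;
	for (int n = 1; n < TRANSMIT_COUNT; n *= 2) {
		for (int r = 0; r < n; r++) {
			for (int c = 0; c < n; c++) {
				h[r][c + n]     =  h[r][c];
				h[r + n][c]     =  h[r][c];
				h[r + n][c + n] = -h[r][c];
			}
		}
	}
}

/* One "thread": cache all TRANSMIT_COUNT samples for a fixed
 * (sample, channel) pair in a local array (registers in the shader),
 * then accumulate every decoded transmit from that cache. */
static void decode_one_thread(const float *rf_in,
                              float h[TRANSMIT_COUNT][TRANSMIT_COUNT],
                              float *out)
{
	float rf[TRANSMIT_COUNT]; /* the register cache */
	for (int j = 0; j < TRANSMIT_COUNT; j++)
		rf[j] = rf_in[j];

	for (int i = 0; i < TRANSMIT_COUNT; i++) {
		float acc = 0.0f;
		for (int j = 0; j < TRANSMIT_COUNT; j++)
			acc += h[j][i] * rf[j];
		out[i] = acc / (float)TRANSMIT_COUNT;
	}
}

int main(void)
{
	float h[TRANSMIT_COUNT][TRANSMIT_COUNT];
	make_hadamard(h);

	/* encode a known vector: rf = H * x (this H is symmetric) */
	float x[TRANSMIT_COUNT]  = {1, 2, 3, 4, 5, 6, 7, 8};
	float rf[TRANSMIT_COUNT] = {0};
	for (int j = 0; j < TRANSMIT_COUNT; j++)
		for (int i = 0; i < TRANSMIT_COUNT; i++)
			rf[j] += h[j][i] * x[i];

	float out[TRANSMIT_COUNT];
	decode_one_thread(rf, h, out);
	for (int i = 0; i < TRANSMIT_COUNT; i++)
		printf("%g ", out[i]); /* prints 1 2 3 4 5 6 7 8 since H*H = N*I */
	printf("\n");
	return 0;
}

Compiled with e.g. cc -O2, this mirrors the #else branch added to
decode.glsl below: load once into rf[], then reuse it for every
output transmit.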
Diffstat:
2 files changed, 32 insertions(+), 12 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -525,16 +525,25 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 		db->output_transmit_stride *= decimation_rate;
 	}

-	if (run_cuda_hilbert)        sd->bake.flags |= BeamformerShaderDecodeFlags_DilateOutput;
-	if (db->transmit_count > 32) sd->bake.flags |= BeamformerShaderDecodeFlags_UseSharedMemory;
+	if (run_cuda_hilbert) sd->bake.flags |= BeamformerShaderDecodeFlags_DilateOutput;

-	db->transmits_processed = db->transmit_count >= 32 ? 2 : 1;
+	if (db->transmit_count > 40) {
+		sd->bake.flags |= BeamformerShaderDecodeFlags_UseSharedMemory;
+		db->transmits_processed = 2;

-	b32 use_16z = db->transmit_count <= 32 || db->transmit_count == 80 ||
-	              db->transmit_count == 96 || db->transmit_count == 160;
-	sd->layout.x = 4;
-	sd->layout.y = 1;
-	sd->layout.z = use_16z? 16 : 32;
+		b32 use_16z = db->transmit_count == 80 || db->transmit_count == 96 || db->transmit_count == 160;
+		sd->layout.x = 4;
+		sd->layout.y = 1;
+		sd->layout.z = use_16z? 16 : 32;
+	} else {
+		db->transmits_processed = db->transmit_count;
+		/* NOTE(rnp): register caching. using more threads will cause the compiler to do
+		 * contortions to avoid spilling registers. using less gives higher performance */
+		/* TODO(rnp): may need to be adjusted to 16 on NVIDIA */
+		sd->layout.x = 32;
+		sd->layout.y = 1;
+		sd->layout.z = 1;
+	}

 	sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x);
 	sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y);
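(As a concrete check of the dispatch arithmetic above, with
hypothetical values: for sample_count = 4096 and channel_count = 64,
the register-cached 32x1x1 layout gives
dispatch.x = ceil(4096 / 32) = 128 and dispatch.y = ceil(64 / 1) = 64,
i.e. 8192 workgroups of 32 threads each.)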
diff --git a/shaders/decode.glsl b/shaders/decode.glsl
@@ -64,7 +64,9 @@ SAMPLE_DATA_TYPE sample_rf_data(uint index)
 	return result;
 }

+#if UseSharedMemory
 shared INPUT_DATA_TYPE rf[gl_WorkGroupSize.x * TransmitCount];
+#endif

 void main()
 {
@@ -114,15 +116,24 @@
 	for (uint i = 0; i < TransmitsProcessed; i++)
 		result[i] = SAMPLE_DATA_TYPE(0);

+	#if UseSharedMemory
 	for (int j = 0; j < TransmitCount; j++) {
-		#if UseSharedMemory
 		SAMPLE_DATA_TYPE s = SAMPLE_TYPE_CAST(rf[gl_LocalInvocationID.x * TransmitCount + j]);
-		#else
-		SAMPLE_DATA_TYPE s = sample_rf_data(rf_offset + j);
-		#endif
 		for (uint i = 0; i < TransmitsProcessed; i++)
 			result[i] += imageLoad(hadamard, ivec2(j, transmit + i)).x * s;
 	}
+	#else
+	INPUT_DATA_TYPE rf[TransmitsProcessed];
+	for (int j = 0; j < TransmitCount; j++)
+		rf[j] = rf_data[rf_offset + j];
+
+	for (int j = 0; j < TransmitCount; j++) {
+		SAMPLE_DATA_TYPE s = SAMPLE_TYPE_CAST(rf[j]);
+		for (uint i = 0; i < TransmitsProcessed; i++) {
+			result[i] += imageLoad(hadamard, ivec2(j, transmit + i)).x * s;
+		}
+	}
+	#endif

 	for (uint i = 0; i < TransmitsProcessed; i++)
 		result[i] /= float(TransmitCount);
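(Both branches compute the same decode. In the register-cached #else
branch the local rf[TransmitsProcessed] array covers every transmit
because plan_compute_pipeline sets transmits_processed = transmit_count
on that path. Written out, with notation mine, each thread computes for
its fixed sample/channel pair:

	result[i] = (1 / TransmitCount) * sum over j of hadamard[j][transmit + i] * s[j],  j = 0 .. TransmitCount-1

The two paths differ only in where s[j] is staged: shared memory for
large transmit counts, registers for small ones.)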