Commit: 31df2285cfeeb900c97ecd3011a75df20ef05ad9
Parent: 35f140977903d900298b976cd5ffc11e3e2ebd68
Author: Randy Palamar
Date: Wed, 5 Nov 2025 12:58:16 -0700
shaders/decode: move shared memory and register caching to separate functions
this was hard to read and reason about. additionally this fixes bad
indexing for rf-only mode.
also added a missed special case for 48 transmits. this brings its
performance in line with the trend followed by the other sizes.
Diffstat:
4 files changed, 129 insertions(+), 78 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -531,31 +531,43 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
if (run_cuda_hilbert) sd->bake.flags |= BeamformerShaderDecodeFlags_DilateOutput;
if (db->decode_mode == BeamformerDecodeMode_None) {
- db->transmits_processed = 1;
+ db->to_process = 1;
sd->layout.x = 64;
sd->layout.y = 1;
sd->layout.z = 1;
+ sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x);
+ sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y);
+ sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z);
} else if (db->transmit_count > 40) {
sd->bake.flags |= BeamformerShaderDecodeFlags_UseSharedMemory;
- db->transmits_processed = 2;
+ db->to_process = 2;
- b32 use_16z = db->transmit_count == 80 || db->transmit_count == 96 || db->transmit_count == 160;
+ if (db->transmit_count == 48)
+ db->to_process = db->transmit_count / 16;
+
+ b32 use_16z = db->transmit_count == 48 || db->transmit_count == 80 ||
+ db->transmit_count == 96 || db->transmit_count == 160;
sd->layout.x = 4;
sd->layout.y = 1;
sd->layout.z = use_16z? 16 : 32;
+
+ sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x);
+ sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y);
+ sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z / (f32)db->to_process);
} else {
- db->transmits_processed = db->transmit_count;
+ db->to_process = 1;
+
/* NOTE(rnp): register caching. using more threads will cause the compiler to do
* contortions to avoid spilling registers. using less gives higher performance */
/* TODO(rnp): may need to be adjusted to 16 on NVIDIA */
sd->layout.x = 32;
sd->layout.y = 1;
sd->layout.z = 1;
- }
- sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x);
- sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y);
- sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z / (f32)db->transmits_processed);
+ sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x);
+ sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y);
+ sd->dispatch.z = 1;
+ }
if (first) sd->dispatch.x *= decimation_rate;
diff --git a/beamformer.meta b/beamformer.meta
@@ -70,8 +70,8 @@
@BakeInt(OutputChannelStride output_channel_stride )
@BakeInt(OutputSampleStride output_sample_stride )
@BakeInt(OutputTransmitStride output_transmit_stride)
+ @BakeInt(ToProcess to_process )
@BakeInt(TransmitCount transmit_count )
- @BakeInt(TransmitsProcessed transmits_processed )
}
}
diff --git a/generated/beamformer.meta.c b/generated/beamformer.meta.c
@@ -99,8 +99,8 @@ typedef struct {
u32 output_channel_stride;
u32 output_sample_stride;
u32 output_transmit_stride;
+ u32 to_process;
u32 transmit_count;
- u32 transmits_processed;
} BeamformerShaderDecodeBakeParameters;
typedef struct {
@@ -296,8 +296,8 @@ read_only global s8 *beamformer_shader_bake_parameter_names[] = {
s8_comp("OutputChannelStride"),
s8_comp("OutputSampleStride"),
s8_comp("OutputTransmitStride"),
+ s8_comp("ToProcess"),
s8_comp("TransmitCount"),
- s8_comp("TransmitsProcessed"),
},
(s8 []){
s8_comp("DecimationRate"),
diff --git a/shaders/decode.glsl b/shaders/decode.glsl
@@ -66,92 +66,131 @@ SAMPLE_DATA_TYPE sample_rf_data(uint index)
#if UseSharedMemory
shared INPUT_DATA_TYPE rf[gl_WorkGroupSize.x * TransmitCount];
+void run_decode_large(void)
+{
+ uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX;
+ uint channel = gl_GlobalInvocationID.y;
+ uint transmit = gl_GlobalInvocationID.z * ToProcess;
+
+ uint thread_count = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z;
+ uint thread_index = gl_LocalInvocationIndex;
+
+ uint samples_per_thread = rf.length() / thread_count;
+ uint leftover_samples = rf.length() % thread_count;
+ uint samples_this_thread = samples_per_thread + uint(thread_index < leftover_samples);
+
+ uint rf_offset = (InputChannelStride * channel / RF_SAMPLES_PER_INDEX +
+ TransmitCount * gl_WorkGroupID.x * gl_WorkGroupSize.x);
+
+ for (uint i = 0; i < samples_this_thread; i++) {
+ uint index = i * thread_count + thread_index;
+ rf[index] = rf_data[rf_offset + index];
+ }
+
+ barrier();
+
+ SAMPLE_DATA_TYPE result[ToProcess];
+ if (time_sample < OutputTransmitStride) {
+ for (uint i = 0; i < ToProcess; i++)
+ result[i] = SAMPLE_DATA_TYPE(0);
+
+ for (int j = 0; j < TransmitCount; j++) {
+ SAMPLE_DATA_TYPE s = SAMPLE_TYPE_CAST(rf[gl_LocalInvocationID.x * TransmitCount + j]);
+ for (uint i = 0; i < ToProcess; i++)
+ result[i] += imageLoad(hadamard, ivec2(j, transmit + i)).x * s;
+ }
+
+ for (uint i = 0; i < ToProcess; i++)
+ result[i] /= float(TransmitCount);
+ }
+
+ /* NOTE(rnp): DO NOT combine with above; compiler shits the bed on TransmitCount == 80
+ * and it kills performance. reinvestigate when we further optimize */
+ if (time_sample < OutputTransmitStride) {
+ uint out_off = OutputChannelStride * channel +
+ OutputTransmitStride * transmit +
+ OutputSampleStride * time_sample;
+
+ for (uint i = 0; i < ToProcess; i++, out_off += OutputTransmitStride)
+ if (TransmitCount % (gl_WorkGroupSize.z * ToProcess) == 0 || transmit + i < TransmitCount)
+ out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result[i];
+ }
+}
#endif
+void run_decode_small(void)
+{
+ uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX;
+ uint channel = gl_GlobalInvocationID.y;
+ uint rf_offset = (InputChannelStride * channel + TransmitCount * time_sample) / RF_SAMPLES_PER_INDEX;
+
+ if (time_sample < OutputTransmitStride) {
+ INPUT_DATA_TYPE rf[TransmitCount];
+ for (int j = 0; j < TransmitCount; j++)
+ rf[j] = rf_data[rf_offset + j];
+
+ SAMPLE_DATA_TYPE result[TransmitCount];
+ for (int j = 0; j < TransmitCount; j++)
+ result[j] = SAMPLE_DATA_TYPE(0);
+
+ for (int i = 0; i < TransmitCount; i++) {
+ SAMPLE_DATA_TYPE s = SAMPLE_TYPE_CAST(rf[i]);
+ for (int j = 0; j < TransmitCount; j++) {
+ result[j] += imageLoad(hadamard, ivec2(i, j)).x * s;
+ }
+ }
+
+ for (int i = 0; i < TransmitCount; i++)
+ result[i] /= float(TransmitCount);
+
+ uint out_off = OutputChannelStride * channel +
+ OutputSampleStride * time_sample;
+ for (int i = 0; i < TransmitCount; i++, out_off += OutputTransmitStride)
+ out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result[i];
+ }
+}
+
void main()
{
uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX;
uint channel = gl_GlobalInvocationID.y;
- uint transmit = gl_GlobalInvocationID.z * TransmitsProcessed;
+ uint transmit = gl_GlobalInvocationID.z * ToProcess;
uint rf_offset = (InputChannelStride * channel + TransmitCount * time_sample) / RF_SAMPLES_PER_INDEX;
if (u_first_pass) {
if (time_sample < InputTransmitStride) {
- uint in_off = InputChannelStride * imageLoad(channel_mapping, int(channel)).x +
- InputTransmitStride * transmit +
- InputSampleStride * time_sample;
- for (uint i = 0; i < TransmitsProcessed; i++, in_off += InputTransmitStride)
- out_rf_data[rf_offset + transmit + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
+ uint in_off = InputChannelStride * imageLoad(channel_mapping, int(channel)).x +
+ InputSampleStride * time_sample;
+ #if DecodeMode == DecodeMode_None || UseSharedMemory
+ in_off += InputTransmitStride * transmit;
+ rf_offset += transmit;
+ for (uint i = 0; i < ToProcess; i++, in_off += InputTransmitStride) {
+ if (transmit + i < TransmitCount)
+ out_rf_data[rf_offset + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
+ }
+ #else
+ for (uint i = 0; i < TransmitCount; i++, in_off += InputTransmitStride)
+ out_rf_data[rf_offset + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
+ #endif
}
} else {
- if (UseSharedMemory == 0 && time_sample >= OutputTransmitStride)
- return;
-
- SAMPLE_DATA_TYPE result[TransmitsProcessed];
switch (DecodeMode) {
case DecodeMode_None:{
- for (uint i = 0; i < TransmitsProcessed; i++)
- if (TransmitCount % (gl_WorkGroupSize.z * TransmitsProcessed) == 0 || transmit + i < TransmitCount)
- result[i] = sample_rf_data(rf_offset + transmit + i);
+ uint out_off = OutputChannelStride * channel +
+ OutputTransmitStride * transmit +
+ OutputSampleStride * time_sample;
+ for (uint i = 0; i < ToProcess; i++, out_off += OutputTransmitStride) {
+ if (TransmitCount % (gl_WorkGroupSize.z * ToProcess) == 0 || transmit + i < TransmitCount)
+ out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = sample_rf_data(rf_offset + transmit + i);
+ }
}break;
case DecodeMode_Hadamard:{
#if UseSharedMemory
- {
- uint thread_count = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z;
- uint thread_index = gl_LocalInvocationIndex;
-
- uint samples_per_thread = rf.length() / thread_count;
- uint leftover_samples = rf.length() % thread_count;
- uint samples_this_thread = samples_per_thread + uint(thread_index < leftover_samples);
-
- uint rf_offset = (InputChannelStride * channel +
- TransmitCount * gl_WorkGroupID.x * gl_WorkGroupSize.x) / RF_SAMPLES_PER_INDEX;
-
- for (uint i = 0; i < samples_this_thread; i++) {
- uint index = i * thread_count + thread_index;
- rf[index] = rf_data[rf_offset + index];
- }
- barrier();
- }
+ run_decode_large();
+ #else
+ run_decode_small();
#endif
-
- if (time_sample < OutputTransmitStride) {
- for (uint i = 0; i < TransmitsProcessed; i++)
- result[i] = SAMPLE_DATA_TYPE(0);
-
- #if UseSharedMemory
- for (int j = 0; j < TransmitCount; j++) {
- SAMPLE_DATA_TYPE s = SAMPLE_TYPE_CAST(rf[gl_LocalInvocationID.x * TransmitCount + j]);
- for (uint i = 0; i < TransmitsProcessed; i++)
- result[i] += imageLoad(hadamard, ivec2(j, transmit + i)).x * s;
- }
- #else
- INPUT_DATA_TYPE rf[TransmitsProcessed];
- for (int j = 0; j < TransmitCount; j++)
- rf[j] = rf_data[rf_offset + j];
-
- for (int j = 0; j < TransmitCount; j++) {
- SAMPLE_DATA_TYPE s = SAMPLE_TYPE_CAST(rf[j]);
- for (uint i = 0; i < TransmitsProcessed; i++) {
- result[i] += imageLoad(hadamard, ivec2(j, transmit + i)).x * s;
- }
- }
- #endif
-
- for (uint i = 0; i < TransmitsProcessed; i++)
- result[i] /= float(TransmitCount);
- }
}break;
}
-
- if (time_sample < OutputTransmitStride) {
- uint out_off = OutputChannelStride * channel +
- OutputTransmitStride * transmit +
- OutputSampleStride * time_sample;
-
- for (uint i = 0; i < TransmitsProcessed; i++, out_off += OutputTransmitStride)
- if (TransmitCount % (gl_WorkGroupSize.z * TransmitsProcessed) == 0 || transmit + i < TransmitCount)
- out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result[i];
- }
}
}