shaders/decode: use LDS for Hadamard > 32 - ogl_beamforming - Ultrasound Beamforming Implemented with OpenGL

Commit: c414a90c8eb461e4fa1bcc7de6ea2882ec26e574
Parent: 1f851e8a4feb66a225d0f8951216ee214e1237a9
Author: Randy Palamar
Date:   Fri, 31 Oct 2025 13:14:14 -0600

shaders/decode: use LDS for Hadamard > 32

The performance improvement varies. Its small for smaller
matrices, > ~30% for 128, and > ~50% for 256

Diffstat:
M beamformer.c  | 6 ++++--
M beamformer.meta  | 1 +
M generated/beamformer.meta.c  | 6 ++++--
M shaders/decode.glsl  | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------

4 files changed, 65 insertions(+), 19 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -527,13 +527,15 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 				db->output_transmit_stride *= decimation_rate;
 			}
 
+			db->transmits_processed = db->transmit_count >= 32 ? 2 : 1;
+
 			sd->layout.x = 4;
 			sd->layout.y = 1;
-			sd->layout.z = 16;
+			sd->layout.z = (db->transmit_count <= 32 || db->transmit_count == 80)? 16 : 32;
 
 			sd->dispatch.x = (u32)ceil_f32((f32)sample_count                     / (f32)sd->layout.x);
 			sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count     / (f32)sd->layout.y);
-			sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z);
+			sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z / (f32)db->transmits_processed);
 
 			if (first) sd->dispatch.x *= decimation_rate;
 
diff --git a/beamformer.meta b/beamformer.meta
@@ -71,6 +71,7 @@
 			@BakeInt(OutputSampleStride   output_sample_stride  )
 			@BakeInt(OutputTransmitStride output_transmit_stride)
 			@BakeInt(TransmitCount        transmit_count        )
+			@BakeInt(TransmitsProcessed   transmits_processed   )
 		}
 	}
 
diff --git a/generated/beamformer.meta.c b/generated/beamformer.meta.c
@@ -99,6 +99,7 @@ typedef struct {
 	u32 output_sample_stride;
 	u32 output_transmit_stride;
 	u32 transmit_count;
+	u32 transmits_processed;
 } BeamformerShaderDecodeBakeParameters;
 
 typedef struct {
@@ -295,6 +296,7 @@ read_only global s8 *beamformer_shader_bake_parameter_names[] = {
 		s8_comp("OutputSampleStride"),
 		s8_comp("OutputTransmitStride"),
 		s8_comp("TransmitCount"),
+		s8_comp("TransmitsProcessed"),
 	},
 	(s8 []){
 		s8_comp("DecimationRate"),
@@ -331,7 +333,7 @@ read_only global s8 *beamformer_shader_bake_parameter_names[] = {
 };
 
 read_only global u8 *beamformer_shader_bake_parameter_is_float[] = {
-	(u8 []){0, 0, 0, 0, 0, 0, 0, 0},
+	(u8 []){0, 0, 0, 0, 0, 0, 0, 0, 0},
 	(u8 []){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1},
 	(u8 []){0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1},
 	0,
@@ -340,7 +342,7 @@ read_only global u8 *beamformer_shader_bake_parameter_is_float[] = {
 };
 
 read_only global i32 beamformer_shader_bake_parameter_counts[] = {
-	8,
+	9,
 	12,
 	13,
 	0,
diff --git a/shaders/decode.glsl b/shaders/decode.glsl
@@ -64,11 +64,13 @@ SAMPLE_DATA_TYPE sample_rf_data(uint index)
 	return result;
 }
 
+shared INPUT_DATA_TYPE rf[gl_WorkGroupSize.x * TransmitCount];
+
 void main()
 {
 	uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX;
 	uint channel     = gl_GlobalInvocationID.y;
-	uint transmit    = gl_GlobalInvocationID.z;
+	uint transmit    = gl_GlobalInvocationID.z * TransmitsProcessed;
 
 	uint rf_offset = (InputChannelStride * channel + TransmitCount * time_sample) / RF_SAMPLES_PER_INDEX;
 	if (u_first_pass) {
@@ -76,27 +78,66 @@ void main()
 			uint in_off = InputChannelStride  * imageLoad(channel_mapping, int(channel)).x +
 			              InputTransmitStride * transmit +
 			              InputSampleStride   * time_sample;
-			out_rf_data[rf_offset + transmit] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
+			for (uint i = 0; i < TransmitsProcessed; i++, in_off += InputTransmitStride)
+				out_rf_data[rf_offset + transmit + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
 		}
 	} else {
+		SAMPLE_DATA_TYPE result[TransmitsProcessed];
+		switch (DecodeMode) {
+		case DecodeMode_None:{
+			for (uint i = 0; i < TransmitsProcessed; i++)
+				if (transmit + i < TransmitCount)
+					result[i] = sample_rf_data(rf_offset + transmit + i);
+		}break;
+		case DecodeMode_Hadamard:{
+			#if TransmitCount > 32
+			{
+				uint thread_count = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z;
+				uint thread_index = gl_LocalInvocationIndex;
+
+				uint samples_per_thread  = rf.length() / thread_count;
+				uint leftover_samples    = rf.length() % thread_count;
+				uint samples_this_thread = samples_per_thread + uint(thread_index < leftover_samples);
+
+				uint rf_offset = (InputChannelStride * channel +
+				                  TransmitCount * gl_WorkGroupID.x * gl_WorkGroupSize.x) / RF_SAMPLES_PER_INDEX;
+
+				for (uint i = 0; i < samples_this_thread; i++) {
+					uint index = i * thread_count + thread_index;
+					rf[index] = rf_data[rf_offset + index];
+				}
+				barrier();
+			}
+			#endif
+
+			if (time_sample < OutputTransmitStride) {
+				for (uint i = 0; i < TransmitsProcessed; i++)
+					result[i] = SAMPLE_DATA_TYPE(0);
+
+				for (int j = 0; j < TransmitCount; j++) {
+					#if TransmitCount > 32
+					SAMPLE_DATA_TYPE s = SAMPLE_TYPE_CAST(rf[gl_LocalInvocationID.x * TransmitCount + j]);
+					#else
+					SAMPLE_DATA_TYPE s = sample_rf_data(rf_offset + j);
+					#endif
+					for (uint i = 0; i < TransmitsProcessed; i++)
+						result[i] += imageLoad(hadamard, ivec2(j, transmit + i)).x * s;
+				}
+
+				for (uint i = 0; i < TransmitsProcessed; i++)
+					result[i] /= float(TransmitCount);
+			}
+		}break;
+		}
+
 		if (time_sample < OutputTransmitStride) {
 			uint out_off = OutputChannelStride  * channel +
 			               OutputTransmitStride * transmit +
 			               OutputSampleStride   * time_sample;
 
-			SAMPLE_DATA_TYPE result = SAMPLE_DATA_TYPE(0);
-			switch (DecodeMode) {
-			case DecodeMode_None:{
-				result = sample_rf_data(rf_offset + transmit);
-			}break;
-			case DecodeMode_Hadamard:{
-				SAMPLE_DATA_TYPE sum = SAMPLE_DATA_TYPE(0);
-				for (int i = 0; i < TransmitCount; i++)
-					sum += imageLoad(hadamard, ivec2(i, transmit)).x * sample_rf_data(rf_offset++);
-				result = sum / float(TransmitCount);
-			}break;
-			}
-			out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result;
+			for (uint i = 0; i < TransmitsProcessed; i++, out_off += OutputTransmitStride)
+				if (transmit + i < TransmitCount)
+					out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result[i];
 		}
 	}
 }

M	beamformer.c	\|	6	++++--
M	beamformer.meta	\|	1	+
M	generated/beamformer.meta.c	\|	6	++++--
M	shaders/decode.glsl	\|	71	++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------