Commit: c414a90c8eb461e4fa1bcc7de6ea2882ec26e574
Parent: 1f851e8a4feb66a225d0f8951216ee214e1237a9
Author: Randy Palamar
Date: Fri, 31 Oct 2025 13:14:14 -0600
shaders/decode: use LDS for Hadamard > 32
The performance improvement varies. Its small for smaller
matrices, > ~30% for 128, and > ~50% for 256
Diffstat:
4 files changed, 65 insertions(+), 19 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -527,13 +527,15 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
db->output_transmit_stride *= decimation_rate;
}
+ db->transmits_processed = db->transmit_count >= 32 ? 2 : 1;
+
sd->layout.x = 4;
sd->layout.y = 1;
- sd->layout.z = 16;
+ sd->layout.z = (db->transmit_count <= 32 || db->transmit_count == 80)? 16 : 32;
sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x);
sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y);
- sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z);
+ sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z / (f32)db->transmits_processed);
if (first) sd->dispatch.x *= decimation_rate;
diff --git a/beamformer.meta b/beamformer.meta
@@ -71,6 +71,7 @@
@BakeInt(OutputSampleStride output_sample_stride )
@BakeInt(OutputTransmitStride output_transmit_stride)
@BakeInt(TransmitCount transmit_count )
+ @BakeInt(TransmitsProcessed transmits_processed )
}
}
diff --git a/generated/beamformer.meta.c b/generated/beamformer.meta.c
@@ -99,6 +99,7 @@ typedef struct {
u32 output_sample_stride;
u32 output_transmit_stride;
u32 transmit_count;
+ u32 transmits_processed;
} BeamformerShaderDecodeBakeParameters;
typedef struct {
@@ -295,6 +296,7 @@ read_only global s8 *beamformer_shader_bake_parameter_names[] = {
s8_comp("OutputSampleStride"),
s8_comp("OutputTransmitStride"),
s8_comp("TransmitCount"),
+ s8_comp("TransmitsProcessed"),
},
(s8 []){
s8_comp("DecimationRate"),
@@ -331,7 +333,7 @@ read_only global s8 *beamformer_shader_bake_parameter_names[] = {
};
read_only global u8 *beamformer_shader_bake_parameter_is_float[] = {
- (u8 []){0, 0, 0, 0, 0, 0, 0, 0},
+ (u8 []){0, 0, 0, 0, 0, 0, 0, 0, 0},
(u8 []){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1},
(u8 []){0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1},
0,
@@ -340,7 +342,7 @@ read_only global u8 *beamformer_shader_bake_parameter_is_float[] = {
};
read_only global i32 beamformer_shader_bake_parameter_counts[] = {
- 8,
+ 9,
12,
13,
0,
diff --git a/shaders/decode.glsl b/shaders/decode.glsl
@@ -64,11 +64,13 @@ SAMPLE_DATA_TYPE sample_rf_data(uint index)
return result;
}
+shared INPUT_DATA_TYPE rf[gl_WorkGroupSize.x * TransmitCount];
+
void main()
{
uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX;
uint channel = gl_GlobalInvocationID.y;
- uint transmit = gl_GlobalInvocationID.z;
+ uint transmit = gl_GlobalInvocationID.z * TransmitsProcessed;
uint rf_offset = (InputChannelStride * channel + TransmitCount * time_sample) / RF_SAMPLES_PER_INDEX;
if (u_first_pass) {
@@ -76,27 +78,66 @@ void main()
uint in_off = InputChannelStride * imageLoad(channel_mapping, int(channel)).x +
InputTransmitStride * transmit +
InputSampleStride * time_sample;
- out_rf_data[rf_offset + transmit] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
+ for (uint i = 0; i < TransmitsProcessed; i++, in_off += InputTransmitStride)
+ out_rf_data[rf_offset + transmit + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
}
} else {
+ SAMPLE_DATA_TYPE result[TransmitsProcessed];
+ switch (DecodeMode) {
+ case DecodeMode_None:{
+ for (uint i = 0; i < TransmitsProcessed; i++)
+ if (transmit + i < TransmitCount)
+ result[i] = sample_rf_data(rf_offset + transmit + i);
+ }break;
+ case DecodeMode_Hadamard:{
+ #if TransmitCount > 32
+ {
+ uint thread_count = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z;
+ uint thread_index = gl_LocalInvocationIndex;
+
+ uint samples_per_thread = rf.length() / thread_count;
+ uint leftover_samples = rf.length() % thread_count;
+ uint samples_this_thread = samples_per_thread + uint(thread_index < leftover_samples);
+
+ uint rf_offset = (InputChannelStride * channel +
+ TransmitCount * gl_WorkGroupID.x * gl_WorkGroupSize.x) / RF_SAMPLES_PER_INDEX;
+
+ for (uint i = 0; i < samples_this_thread; i++) {
+ uint index = i * thread_count + thread_index;
+ rf[index] = rf_data[rf_offset + index];
+ }
+ barrier();
+ }
+ #endif
+
+ if (time_sample < OutputTransmitStride) {
+ for (uint i = 0; i < TransmitsProcessed; i++)
+ result[i] = SAMPLE_DATA_TYPE(0);
+
+ for (int j = 0; j < TransmitCount; j++) {
+ #if TransmitCount > 32
+ SAMPLE_DATA_TYPE s = SAMPLE_TYPE_CAST(rf[gl_LocalInvocationID.x * TransmitCount + j]);
+ #else
+ SAMPLE_DATA_TYPE s = sample_rf_data(rf_offset + j);
+ #endif
+ for (uint i = 0; i < TransmitsProcessed; i++)
+ result[i] += imageLoad(hadamard, ivec2(j, transmit + i)).x * s;
+ }
+
+ for (uint i = 0; i < TransmitsProcessed; i++)
+ result[i] /= float(TransmitCount);
+ }
+ }break;
+ }
+
if (time_sample < OutputTransmitStride) {
uint out_off = OutputChannelStride * channel +
OutputTransmitStride * transmit +
OutputSampleStride * time_sample;
- SAMPLE_DATA_TYPE result = SAMPLE_DATA_TYPE(0);
- switch (DecodeMode) {
- case DecodeMode_None:{
- result = sample_rf_data(rf_offset + transmit);
- }break;
- case DecodeMode_Hadamard:{
- SAMPLE_DATA_TYPE sum = SAMPLE_DATA_TYPE(0);
- for (int i = 0; i < TransmitCount; i++)
- sum += imageLoad(hadamard, ivec2(i, transmit)).x * sample_rf_data(rf_offset++);
- result = sum / float(TransmitCount);
- }break;
- }
- out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result;
+ for (uint i = 0; i < TransmitsProcessed; i++, out_off += OutputTransmitStride)
+ if (transmit + i < TransmitCount)
+ out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result[i];
}
}
}