ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

Commit: c414a90c8eb461e4fa1bcc7de6ea2882ec26e574
Parent: 1f851e8a4feb66a225d0f8951216ee214e1237a9
Author: Randy Palamar
Date:   Fri, 31 Oct 2025 13:14:14 -0600

shaders/decode: use LDS for Hadamard > 32

The performance improvement varies. Its small for smaller
matrices, > ~30% for 128, and > ~50% for 256

Diffstat:
Mbeamformer.c | 6++++--
Mbeamformer.meta | 1+
Mgenerated/beamformer.meta.c | 6++++--
Mshaders/decode.glsl | 71++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
4 files changed, 65 insertions(+), 19 deletions(-)

diff --git a/beamformer.c b/beamformer.c @@ -527,13 +527,15 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb) db->output_transmit_stride *= decimation_rate; } + db->transmits_processed = db->transmit_count >= 32 ? 2 : 1; + sd->layout.x = 4; sd->layout.y = 1; - sd->layout.z = 16; + sd->layout.z = (db->transmit_count <= 32 || db->transmit_count == 80)? 16 : 32; sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x); sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y); - sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z); + sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z / (f32)db->transmits_processed); if (first) sd->dispatch.x *= decimation_rate; diff --git a/beamformer.meta b/beamformer.meta @@ -71,6 +71,7 @@ @BakeInt(OutputSampleStride output_sample_stride ) @BakeInt(OutputTransmitStride output_transmit_stride) @BakeInt(TransmitCount transmit_count ) + @BakeInt(TransmitsProcessed transmits_processed ) } } diff --git a/generated/beamformer.meta.c b/generated/beamformer.meta.c @@ -99,6 +99,7 @@ typedef struct { u32 output_sample_stride; u32 output_transmit_stride; u32 transmit_count; + u32 transmits_processed; } BeamformerShaderDecodeBakeParameters; typedef struct { @@ -295,6 +296,7 @@ read_only global s8 *beamformer_shader_bake_parameter_names[] = { s8_comp("OutputSampleStride"), s8_comp("OutputTransmitStride"), s8_comp("TransmitCount"), + s8_comp("TransmitsProcessed"), }, (s8 []){ s8_comp("DecimationRate"), @@ -331,7 +333,7 @@ read_only global s8 *beamformer_shader_bake_parameter_names[] = { }; read_only global u8 *beamformer_shader_bake_parameter_is_float[] = { - (u8 []){0, 0, 0, 0, 0, 0, 0, 0}, + (u8 []){0, 0, 0, 0, 0, 0, 0, 0, 0}, (u8 []){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1}, (u8 []){0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1}, 0, @@ -340,7 +342,7 @@ read_only global u8 *beamformer_shader_bake_parameter_is_float[] = { }; read_only global i32 beamformer_shader_bake_parameter_counts[] = { - 8, + 9, 12, 13, 0, diff --git a/shaders/decode.glsl b/shaders/decode.glsl @@ -64,11 +64,13 @@ SAMPLE_DATA_TYPE sample_rf_data(uint index) return result; } +shared INPUT_DATA_TYPE rf[gl_WorkGroupSize.x * TransmitCount]; + void main() { uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX; uint channel = gl_GlobalInvocationID.y; - uint transmit = gl_GlobalInvocationID.z; + uint transmit = gl_GlobalInvocationID.z * TransmitsProcessed; uint rf_offset = (InputChannelStride * channel + TransmitCount * time_sample) / RF_SAMPLES_PER_INDEX; if (u_first_pass) { @@ -76,27 +78,66 @@ void main() uint in_off = InputChannelStride * imageLoad(channel_mapping, int(channel)).x + InputTransmitStride * transmit + InputSampleStride * time_sample; - out_rf_data[rf_offset + transmit] = rf_data[in_off / RF_SAMPLES_PER_INDEX]; + for (uint i = 0; i < TransmitsProcessed; i++, in_off += InputTransmitStride) + out_rf_data[rf_offset + transmit + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX]; } } else { + SAMPLE_DATA_TYPE result[TransmitsProcessed]; + switch (DecodeMode) { + case DecodeMode_None:{ + for (uint i = 0; i < TransmitsProcessed; i++) + if (transmit + i < TransmitCount) + result[i] = sample_rf_data(rf_offset + transmit + i); + }break; + case DecodeMode_Hadamard:{ + #if TransmitCount > 32 + { + uint thread_count = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z; + uint thread_index = gl_LocalInvocationIndex; + + uint samples_per_thread = rf.length() / thread_count; + uint leftover_samples = rf.length() % thread_count; + uint samples_this_thread = samples_per_thread + uint(thread_index < leftover_samples); + + uint rf_offset = (InputChannelStride * channel + + TransmitCount * gl_WorkGroupID.x * gl_WorkGroupSize.x) / RF_SAMPLES_PER_INDEX; + + for (uint i = 0; i < samples_this_thread; i++) { + uint index = i * thread_count + thread_index; + rf[index] = rf_data[rf_offset + index]; + } + barrier(); + } + #endif + + if (time_sample < OutputTransmitStride) { + for (uint i = 0; i < TransmitsProcessed; i++) + result[i] = SAMPLE_DATA_TYPE(0); + + for (int j = 0; j < TransmitCount; j++) { + #if TransmitCount > 32 + SAMPLE_DATA_TYPE s = SAMPLE_TYPE_CAST(rf[gl_LocalInvocationID.x * TransmitCount + j]); + #else + SAMPLE_DATA_TYPE s = sample_rf_data(rf_offset + j); + #endif + for (uint i = 0; i < TransmitsProcessed; i++) + result[i] += imageLoad(hadamard, ivec2(j, transmit + i)).x * s; + } + + for (uint i = 0; i < TransmitsProcessed; i++) + result[i] /= float(TransmitCount); + } + }break; + } + if (time_sample < OutputTransmitStride) { uint out_off = OutputChannelStride * channel + OutputTransmitStride * transmit + OutputSampleStride * time_sample; - SAMPLE_DATA_TYPE result = SAMPLE_DATA_TYPE(0); - switch (DecodeMode) { - case DecodeMode_None:{ - result = sample_rf_data(rf_offset + transmit); - }break; - case DecodeMode_Hadamard:{ - SAMPLE_DATA_TYPE sum = SAMPLE_DATA_TYPE(0); - for (int i = 0; i < TransmitCount; i++) - sum += imageLoad(hadamard, ivec2(i, transmit)).x * sample_rf_data(rf_offset++); - result = sum / float(TransmitCount); - }break; - } - out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result; + for (uint i = 0; i < TransmitsProcessed; i++, out_off += OutputTransmitStride) + if (transmit + i < TransmitCount) + out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result[i]; } } }