ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

Commit: 31df2285cfeeb900c97ecd3011a75df20ef05ad9
Parent: 35f140977903d900298b976cd5ffc11e3e2ebd68
Author: Randy Palamar
Date:   Wed,  5 Nov 2025 12:58:16 -0700

shaders/decode: move shared memory and register caching to separate functions

This was hard to read and reason about. Additionally, fixed bad
indexing for RF-only mode.

Also added a missed special case for 48 transmits. This brings its
performance in line with the trend followed by the other sizes.

Diffstat:
Mbeamformer.c | 28++++++++++++++++++++--------
Mbeamformer.meta | 2+-
Mgenerated/beamformer.meta.c | 4++--
Mshaders/decode.glsl | 173++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------
4 files changed, 129 insertions(+), 78 deletions(-)

diff --git a/beamformer.c b/beamformer.c @@ -531,31 +531,43 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb) if (run_cuda_hilbert) sd->bake.flags |= BeamformerShaderDecodeFlags_DilateOutput; if (db->decode_mode == BeamformerDecodeMode_None) { - db->transmits_processed = 1; + db->to_process = 1; sd->layout.x = 64; sd->layout.y = 1; sd->layout.z = 1; + sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x); + sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y); + sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z); } else if (db->transmit_count > 40) { sd->bake.flags |= BeamformerShaderDecodeFlags_UseSharedMemory; - db->transmits_processed = 2; + db->to_process = 2; - b32 use_16z = db->transmit_count == 80 || db->transmit_count == 96 || db->transmit_count == 160; + if (db->transmit_count == 48) + db->to_process = db->transmit_count / 16; + + b32 use_16z = db->transmit_count == 48 || db->transmit_count == 80 || + db->transmit_count == 96 || db->transmit_count == 160; sd->layout.x = 4; sd->layout.y = 1; sd->layout.z = use_16z? 16 : 32; + + sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x); + sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y); + sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z / (f32)db->to_process); } else { - db->transmits_processed = db->transmit_count; + db->to_process = 1; + /* NOTE(rnp): register caching. using more threads will cause the compiler to do * contortions to avoid spilling registers. 
using less gives higher performance */ /* TODO(rnp): may need to be adjusted to 16 on NVIDIA */ sd->layout.x = 32; sd->layout.y = 1; sd->layout.z = 1; - } - sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x); - sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y); - sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z / (f32)db->transmits_processed); + sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x); + sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y); + sd->dispatch.z = 1; + } if (first) sd->dispatch.x *= decimation_rate; diff --git a/beamformer.meta b/beamformer.meta @@ -70,8 +70,8 @@ @BakeInt(OutputChannelStride output_channel_stride ) @BakeInt(OutputSampleStride output_sample_stride ) @BakeInt(OutputTransmitStride output_transmit_stride) + @BakeInt(ToProcess to_process ) @BakeInt(TransmitCount transmit_count ) - @BakeInt(TransmitsProcessed transmits_processed ) } } diff --git a/generated/beamformer.meta.c b/generated/beamformer.meta.c @@ -99,8 +99,8 @@ typedef struct { u32 output_channel_stride; u32 output_sample_stride; u32 output_transmit_stride; + u32 to_process; u32 transmit_count; - u32 transmits_processed; } BeamformerShaderDecodeBakeParameters; typedef struct { @@ -296,8 +296,8 @@ read_only global s8 *beamformer_shader_bake_parameter_names[] = { s8_comp("OutputChannelStride"), s8_comp("OutputSampleStride"), s8_comp("OutputTransmitStride"), + s8_comp("ToProcess"), s8_comp("TransmitCount"), - s8_comp("TransmitsProcessed"), }, (s8 []){ s8_comp("DecimationRate"), diff --git a/shaders/decode.glsl b/shaders/decode.glsl @@ -66,92 +66,131 @@ SAMPLE_DATA_TYPE sample_rf_data(uint index) #if UseSharedMemory shared INPUT_DATA_TYPE rf[gl_WorkGroupSize.x * TransmitCount]; +void run_decode_large(void) +{ + uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX; + uint channel = gl_GlobalInvocationID.y; + uint 
transmit = gl_GlobalInvocationID.z * ToProcess; + + uint thread_count = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z; + uint thread_index = gl_LocalInvocationIndex; + + uint samples_per_thread = rf.length() / thread_count; + uint leftover_samples = rf.length() % thread_count; + uint samples_this_thread = samples_per_thread + uint(thread_index < leftover_samples); + + uint rf_offset = (InputChannelStride * channel / RF_SAMPLES_PER_INDEX + + TransmitCount * gl_WorkGroupID.x * gl_WorkGroupSize.x); + + for (uint i = 0; i < samples_this_thread; i++) { + uint index = i * thread_count + thread_index; + rf[index] = rf_data[rf_offset + index]; + } + + barrier(); + + SAMPLE_DATA_TYPE result[ToProcess]; + if (time_sample < OutputTransmitStride) { + for (uint i = 0; i < ToProcess; i++) + result[i] = SAMPLE_DATA_TYPE(0); + + for (int j = 0; j < TransmitCount; j++) { + SAMPLE_DATA_TYPE s = SAMPLE_TYPE_CAST(rf[gl_LocalInvocationID.x * TransmitCount + j]); + for (uint i = 0; i < ToProcess; i++) + result[i] += imageLoad(hadamard, ivec2(j, transmit + i)).x * s; + } + + for (uint i = 0; i < ToProcess; i++) + result[i] /= float(TransmitCount); + } + + /* NOTE(rnp): DO NOT combine with above; compiler shits the bed on TransmitCount == 80 + * and it kills performance. 
reinvestigate when we further optimize */ + if (time_sample < OutputTransmitStride) { + uint out_off = OutputChannelStride * channel + + OutputTransmitStride * transmit + + OutputSampleStride * time_sample; + + for (uint i = 0; i < ToProcess; i++, out_off += OutputTransmitStride) + if (TransmitCount % (gl_WorkGroupSize.z * ToProcess) == 0 || transmit + i < TransmitCount) + out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result[i]; + } +} #endif +void run_decode_small(void) +{ + uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX; + uint channel = gl_GlobalInvocationID.y; + uint rf_offset = (InputChannelStride * channel + TransmitCount * time_sample) / RF_SAMPLES_PER_INDEX; + + if (time_sample < OutputTransmitStride) { + INPUT_DATA_TYPE rf[TransmitCount]; + for (int j = 0; j < TransmitCount; j++) + rf[j] = rf_data[rf_offset + j]; + + SAMPLE_DATA_TYPE result[TransmitCount]; + for (int j = 0; j < TransmitCount; j++) + result[j] = SAMPLE_DATA_TYPE(0); + + for (int i = 0; i < TransmitCount; i++) { + SAMPLE_DATA_TYPE s = SAMPLE_TYPE_CAST(rf[i]); + for (int j = 0; j < TransmitCount; j++) { + result[j] += imageLoad(hadamard, ivec2(i, j)).x * s; + } + } + + for (int i = 0; i < TransmitCount; i++) + result[i] /= float(TransmitCount); + + uint out_off = OutputChannelStride * channel + + OutputSampleStride * time_sample; + for (int i = 0; i < TransmitCount; i++, out_off += OutputTransmitStride) + out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result[i]; + } +} + void main() { uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX; uint channel = gl_GlobalInvocationID.y; - uint transmit = gl_GlobalInvocationID.z * TransmitsProcessed; + uint transmit = gl_GlobalInvocationID.z * ToProcess; uint rf_offset = (InputChannelStride * channel + TransmitCount * time_sample) / RF_SAMPLES_PER_INDEX; if (u_first_pass) { if (time_sample < InputTransmitStride) { - uint in_off = InputChannelStride * imageLoad(channel_mapping, int(channel)).x + - InputTransmitStride * 
transmit + - InputSampleStride * time_sample; - for (uint i = 0; i < TransmitsProcessed; i++, in_off += InputTransmitStride) - out_rf_data[rf_offset + transmit + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX]; + uint in_off = InputChannelStride * imageLoad(channel_mapping, int(channel)).x + + InputSampleStride * time_sample; + #if DecodeMode == DecodeMode_None || UseSharedMemory + in_off += InputTransmitStride * transmit; + rf_offset += transmit; + for (uint i = 0; i < ToProcess; i++, in_off += InputTransmitStride) { + if (transmit + i < TransmitCount) + out_rf_data[rf_offset + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX]; + } + #else + for (uint i = 0; i < TransmitCount; i++, in_off += InputTransmitStride) + out_rf_data[rf_offset + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX]; + #endif } } else { - if (UseSharedMemory == 0 && time_sample >= OutputTransmitStride) - return; - - SAMPLE_DATA_TYPE result[TransmitsProcessed]; switch (DecodeMode) { case DecodeMode_None:{ - for (uint i = 0; i < TransmitsProcessed; i++) - if (TransmitCount % (gl_WorkGroupSize.z * TransmitsProcessed) == 0 || transmit + i < TransmitCount) - result[i] = sample_rf_data(rf_offset + transmit + i); + uint out_off = OutputChannelStride * channel + + OutputTransmitStride * transmit + + OutputSampleStride * time_sample; + for (uint i = 0; i < ToProcess; i++, out_off += OutputTransmitStride) { + if (TransmitCount % (gl_WorkGroupSize.z * ToProcess) == 0 || transmit + i < TransmitCount) + out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = sample_rf_data(rf_offset + transmit + i); + } }break; case DecodeMode_Hadamard:{ #if UseSharedMemory - { - uint thread_count = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z; - uint thread_index = gl_LocalInvocationIndex; - - uint samples_per_thread = rf.length() / thread_count; - uint leftover_samples = rf.length() % thread_count; - uint samples_this_thread = samples_per_thread + uint(thread_index < leftover_samples); - - uint rf_offset = (InputChannelStride * 
channel + - TransmitCount * gl_WorkGroupID.x * gl_WorkGroupSize.x) / RF_SAMPLES_PER_INDEX; - - for (uint i = 0; i < samples_this_thread; i++) { - uint index = i * thread_count + thread_index; - rf[index] = rf_data[rf_offset + index]; - } - barrier(); - } + run_decode_large(); + #else + run_decode_small(); #endif - - if (time_sample < OutputTransmitStride) { - for (uint i = 0; i < TransmitsProcessed; i++) - result[i] = SAMPLE_DATA_TYPE(0); - - #if UseSharedMemory - for (int j = 0; j < TransmitCount; j++) { - SAMPLE_DATA_TYPE s = SAMPLE_TYPE_CAST(rf[gl_LocalInvocationID.x * TransmitCount + j]); - for (uint i = 0; i < TransmitsProcessed; i++) - result[i] += imageLoad(hadamard, ivec2(j, transmit + i)).x * s; - } - #else - INPUT_DATA_TYPE rf[TransmitsProcessed]; - for (int j = 0; j < TransmitCount; j++) - rf[j] = rf_data[rf_offset + j]; - - for (int j = 0; j < TransmitCount; j++) { - SAMPLE_DATA_TYPE s = SAMPLE_TYPE_CAST(rf[j]); - for (uint i = 0; i < TransmitsProcessed; i++) { - result[i] += imageLoad(hadamard, ivec2(j, transmit + i)).x * s; - } - } - #endif - - for (uint i = 0; i < TransmitsProcessed; i++) - result[i] /= float(TransmitCount); - } }break; } - - if (time_sample < OutputTransmitStride) { - uint out_off = OutputChannelStride * channel + - OutputTransmitStride * transmit + - OutputSampleStride * time_sample; - - for (uint i = 0; i < TransmitsProcessed; i++, out_off += OutputTransmitStride) - if (TransmitCount % (gl_WorkGroupSize.z * TransmitsProcessed) == 0 || transmit + i < TransmitCount) - out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result[i]; - } } }