ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

Commit: 35f140977903d900298b976cd5ffc11e3e2ebd68
Parent: f8cd8d4d57af9141aca4fcb9bc059c2b6dd0e95a
Author: Randy Palamar
Date:   Mon,  3 Nov 2025 22:10:05 -0700

shaders/decode: avoid nasty codegen; make sure None doesn't use LDS path

We have a compile-time value that can be checked to avoid
pointless bounds checks; those checks end up getting unrolled and
prevent the compiler from properly interleaving loads and ALU ops.

Diffstat:
M beamformer.c        | 10 +++++++++-
M shaders/decode.glsl |  7 +++++--
2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/beamformer.c b/beamformer.c @@ -1,5 +1,8 @@ /* See LICENSE for license details. */ /* TODO(rnp): + * [ ]: refactor: DecodeMode_None should use a different mapping and optional conversion shader + * for rf only mode with no filter and demod/filter should gain the OutputFloats flag for iq + * case and rf mode with filter; this can also be used instead of first pass uniform * [ ]: refactor: replace UploadRF with just the scratch_rf_size variable, * use below to spin wait in library * [ ]: utilize umonitor/umwait (intel), monitorx/mwaitx (amd), and wfe/sev (aarch64) @@ -527,7 +530,12 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb) if (run_cuda_hilbert) sd->bake.flags |= BeamformerShaderDecodeFlags_DilateOutput; - if (db->transmit_count > 40) { + if (db->decode_mode == BeamformerDecodeMode_None) { + db->transmits_processed = 1; + sd->layout.x = 64; + sd->layout.y = 1; + sd->layout.z = 1; + } else if (db->transmit_count > 40) { sd->bake.flags |= BeamformerShaderDecodeFlags_UseSharedMemory; db->transmits_processed = 2; diff --git a/shaders/decode.glsl b/shaders/decode.glsl @@ -84,11 +84,14 @@ void main() out_rf_data[rf_offset + transmit + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX]; } } else { + if (UseSharedMemory == 0 && time_sample >= OutputTransmitStride) + return; + SAMPLE_DATA_TYPE result[TransmitsProcessed]; switch (DecodeMode) { case DecodeMode_None:{ for (uint i = 0; i < TransmitsProcessed; i++) - if (transmit + i < TransmitCount) + if (TransmitCount % (gl_WorkGroupSize.z * TransmitsProcessed) == 0 || transmit + i < TransmitCount) result[i] = sample_rf_data(rf_offset + transmit + i); }break; case DecodeMode_Hadamard:{ @@ -147,7 +150,7 @@ void main() OutputSampleStride * time_sample; for (uint i = 0; i < TransmitsProcessed; i++, out_off += OutputTransmitStride) - if (transmit + i < TransmitCount) + if (TransmitCount % (gl_WorkGroupSize.z * TransmitsProcessed) == 0 || transmit + i < TransmitCount) out_data[out_off / 
OUTPUT_SAMPLES_PER_INDEX] = result[i]; } }