Commit: 35f140977903d900298b976cd5ffc11e3e2ebd68
Parent: f8cd8d4d57af9141aca4fcb9bc059c2b6dd0e95a
Author: Randy Palamar
Date: Mon, 3 Nov 2025 22:10:05 -0700
shaders/decode: avoid nasty codegen; make sure None doesn't use LDS path
We have a compile-time value that can be checked to avoid
pointless bounds checks, which end up getting unrolled and prevent
the compiler from properly interleaving loads and ALU ops.
Diffstat:
2 files changed, 14 insertions(+), 3 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -1,5 +1,8 @@
/* See LICENSE for license details. */
/* TODO(rnp):
+ * [ ]: refactor: DecodeMode_None should use a different mapping and optional conversion shader
+ * for rf only mode with no filter and demod/filter should gain the OutputFloats flag for iq
+ * case and rf mode with filter; this can also be used instead of first pass uniform
* [ ]: refactor: replace UploadRF with just the scratch_rf_size variable,
* use below to spin wait in library
* [ ]: utilize umonitor/umwait (intel), monitorx/mwaitx (amd), and wfe/sev (aarch64)
@@ -527,7 +530,12 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
if (run_cuda_hilbert) sd->bake.flags |= BeamformerShaderDecodeFlags_DilateOutput;
- if (db->transmit_count > 40) {
+ if (db->decode_mode == BeamformerDecodeMode_None) {
+ db->transmits_processed = 1;
+ sd->layout.x = 64;
+ sd->layout.y = 1;
+ sd->layout.z = 1;
+ } else if (db->transmit_count > 40) {
sd->bake.flags |= BeamformerShaderDecodeFlags_UseSharedMemory;
db->transmits_processed = 2;
diff --git a/shaders/decode.glsl b/shaders/decode.glsl
@@ -84,11 +84,14 @@ void main()
out_rf_data[rf_offset + transmit + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
}
} else {
+ if (UseSharedMemory == 0 && time_sample >= OutputTransmitStride)
+ return;
+
SAMPLE_DATA_TYPE result[TransmitsProcessed];
switch (DecodeMode) {
case DecodeMode_None:{
for (uint i = 0; i < TransmitsProcessed; i++)
- if (transmit + i < TransmitCount)
+ if (TransmitCount % (gl_WorkGroupSize.z * TransmitsProcessed) == 0 || transmit + i < TransmitCount)
result[i] = sample_rf_data(rf_offset + transmit + i);
}break;
case DecodeMode_Hadamard:{
@@ -147,7 +150,7 @@ void main()
OutputSampleStride * time_sample;
for (uint i = 0; i < TransmitsProcessed; i++, out_off += OutputTransmitStride)
- if (transmit + i < TransmitCount)
+ if (TransmitCount % (gl_WorkGroupSize.z * TransmitsProcessed) == 0 || transmit + i < TransmitCount)
out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result[i];
}
}