shaders/decode: avoid nasty codegen; make sure None doesn't use LDS path - ogl_beamforming - Ultrasound Beamforming Implemented with OpenGL

Commit: 35f140977903d900298b976cd5ffc11e3e2ebd68
Parent: f8cd8d4d57af9141aca4fcb9bc059c2b6dd0e95a
Author: Randy Palamar
Date:   Mon,  3 Nov 2025 22:10:05 -0700

shaders/decode: avoid nasty codegen; make sure None doesn't use LDS path

We have a compile-time value that can be checked to avoid
pointless bounds checks, which end up getting unrolled and prevent
the compiler from properly interleaving loads and ALU ops.

Diffstat:
M beamformer.c  | 10 +++++++++-
M shaders/decode.glsl  | 7 +++++--

2 files changed, 14 insertions(+), 3 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -1,5 +1,8 @@
 /* See LICENSE for license details. */
 /* TODO(rnp):
+ * [ ]: refactor: DecodeMode_None should use a different mapping and optional conversion shader
+ *      for rf only mode with no filter and demod/filter should gain the OutputFloats flag for iq
+ *      case and rf mode with filter; this can also be used instead of first pass uniform
  * [ ]: refactor: replace UploadRF with just the scratch_rf_size variable,
  *      use below to spin wait in library
  * [ ]: utilize umonitor/umwait (intel), monitorx/mwaitx (amd), and wfe/sev (aarch64)
@@ -527,7 +530,12 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 
 			if (run_cuda_hilbert) sd->bake.flags |= BeamformerShaderDecodeFlags_DilateOutput;
 
-			if (db->transmit_count > 40) {
+			if (db->decode_mode == BeamformerDecodeMode_None) {
+				db->transmits_processed = 1;
+				sd->layout.x = 64;
+				sd->layout.y = 1;
+				sd->layout.z = 1;
+			} else if (db->transmit_count > 40) {
 				sd->bake.flags |= BeamformerShaderDecodeFlags_UseSharedMemory;
 				db->transmits_processed = 2;
 
diff --git a/shaders/decode.glsl b/shaders/decode.glsl
@@ -84,11 +84,14 @@ void main()
 				out_rf_data[rf_offset + transmit + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
 		}
 	} else {
+		if (UseSharedMemory == 0 && time_sample >= OutputTransmitStride)
+			return;
+
 		SAMPLE_DATA_TYPE result[TransmitsProcessed];
 		switch (DecodeMode) {
 		case DecodeMode_None:{
 			for (uint i = 0; i < TransmitsProcessed; i++)
-				if (transmit + i < TransmitCount)
+				if (TransmitCount % (gl_WorkGroupSize.z * TransmitsProcessed) == 0 || transmit + i < TransmitCount)
 					result[i] = sample_rf_data(rf_offset + transmit + i);
 		}break;
 		case DecodeMode_Hadamard:{
@@ -147,7 +150,7 @@ void main()
 			               OutputSampleStride   * time_sample;
 
 			for (uint i = 0; i < TransmitsProcessed; i++, out_off += OutputTransmitStride)
-				if (transmit + i < TransmitCount)
+				if (TransmitCount % (gl_WorkGroupSize.z * TransmitsProcessed) == 0 || transmit + i < TransmitCount)
 					out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result[i];
 		}
 	}