shaders/decode: cleanup DecodeMode_None path - ogl_beamforming - Ultrasound Beamforming Implemented with OpenGL

Commit: 42c2b9f27aeb4d5aed5dd3d78cd4ddc7198acbbf
Parent: 3e23d5d371f9e3817889c940926f5429d66c7a7a
Author: Randy Palamar
Date:   Thu,  6 Nov 2025 22:35:29 -0700

shaders/decode: cleanup DecodeMode_None path

The DecodeMode_None path will eventually be removed completely, but for
now it is still used for RF mode channel mapping and conversion to float.

Diffstat:
M beamformer.c  | 36 ++++++++++++++++++------------------
M shaders/decode.glsl  | 71 +++++++++++++++++++++++++++++++++++++++--------------------------------

2 files changed, 57 insertions(+), 50 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -531,10 +531,8 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 			if (run_cuda_hilbert) sd->bake.flags |= BeamformerShaderDecodeFlags_DilateOutput;
 
 			if (db->decode_mode == BeamformerDecodeMode_None) {
-				db->to_process = 1;
-				sd->layout.x = 64;
-				sd->layout.y = 1;
-				sd->layout.z = 1;
+				sd->layout = (uv3){{64, 1, 1}};
+
 				sd->dispatch.x = (u32)ceil_f32((f32)sample_count                     / (f32)sd->layout.x);
 				sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count     / (f32)sd->layout.y);
 				sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z);
@@ -547,9 +545,7 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 
 				b32 use_16z  = db->transmit_count == 48 || db->transmit_count == 80 ||
 				               db->transmit_count == 96 || db->transmit_count == 160;
-				sd->layout.x = 4;
-				sd->layout.y = 1;
-				sd->layout.z = use_16z? 16 : 32;
+				sd->layout = (uv3){{4, 1, use_16z? 16 : 32}};
 
 				sd->dispatch.x = (u32)ceil_f32((f32)sample_count                     / (f32)sd->layout.x);
 				sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count     / (f32)sd->layout.y);
@@ -560,9 +556,7 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 				/* NOTE(rnp): register caching. using more threads will cause the compiler to do
 				 * contortions to avoid spilling registers. using less gives higher performance */
 				/* TODO(rnp): may need to be adjusted to 16 on NVIDIA */
-				sd->layout.x = 32;
-				sd->layout.y = 1;
-				sd->layout.z = 1;
+				sd->layout = (uv3){{32, 1, 1}};
 
 				sd->dispatch.x = (u32)ceil_f32((f32)sample_count                 / (f32)sd->layout.x);
 				sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y);
@@ -645,9 +639,7 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 			}
 
 			/* TODO(rnp): filter may need a different dispatch layout */
-			sd->layout.x   = 128;
-			sd->layout.y   = 1;
-			sd->layout.z   = 1;
+			sd->layout     = (uv3){{128, 1, 1}};
 			sd->dispatch.x = (u32)ceil_f32((f32)sample_count                     / (f32)sd->layout.x);
 			sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count     / (f32)sd->layout.y);
 			sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z);
@@ -923,16 +915,22 @@ do_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, BeamformerFrame
 	case BeamformerShaderKind_Decode:{
 		glBindImageTexture(0, cp->textures[BeamformerComputeTextureKind_Hadamard], 0, 0, 0, GL_READ_ONLY, GL_R32F);
 
+		BeamformerDecodeMode mode = cp->shader_descriptors[shader_slot].bake.Decode.decode_mode;
 		if (shader_slot == 0) {
-			glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, cc->ping_pong_ssbos[input_ssbo_idx]);
 			glBindImageTexture(1, cp->textures[BeamformerComputeTextureKind_ChannelMapping], 0, 0, 0, GL_READ_ONLY, GL_R16I);
-			glProgramUniform1ui(program, DECODE_FIRST_PASS_UNIFORM_LOC, 1);
 
-			glDispatchCompute(dispatch.x, dispatch.y, dispatch.z);
-			glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
+			if (mode != BeamformerDecodeMode_None) {
+				glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, cc->ping_pong_ssbos[input_ssbo_idx]);
+				glProgramUniform1ui(program, DECODE_FIRST_PASS_UNIFORM_LOC, 1);
+
+				glDispatchCompute(dispatch.x, dispatch.y, dispatch.z);
+				glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
+			}
 		}
 
-		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, cc->ping_pong_ssbos[input_ssbo_idx]);
+		if (mode != BeamformerDecodeMode_None)
+			glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, cc->ping_pong_ssbos[input_ssbo_idx]);
+
 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, cc->ping_pong_ssbos[output_ssbo_idx]);
 
 		glProgramUniform1ui(program, DECODE_FIRST_PASS_UNIFORM_LOC, 0);
@@ -1496,6 +1494,8 @@ DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
 	BeamformerSharedMemory *sm = ctx->shared_memory.region;
 	if (atomic_load_u32(sm->locks + BeamformerSharedMemoryLockKind_UploadRF))
 		os_wake_waiters(&ctx->os.upload_worker.sync_variable);
+	if (atomic_load_u32(sm->locks + BeamformerSharedMemoryLockKind_DispatchCompute))
+		os_wake_waiters(&ctx->os.compute_worker.sync_variable);
 
 	BeamformerFrame        *frame = ctx->latest_frame;
 	BeamformerViewPlaneTag  tag   = frame? frame->view_plane_tag : 0;
diff --git a/shaders/decode.glsl b/shaders/decode.glsl
@@ -152,45 +152,52 @@ void run_decode_small(void)
 
 void main()
 {
-	uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX;
-	uint channel     = gl_GlobalInvocationID.y;
-	uint transmit    = gl_GlobalInvocationID.z * ToProcess;
-
-	uint rf_offset = (InputChannelStride * channel + TransmitCount * time_sample) / RF_SAMPLES_PER_INDEX;
-	if (u_first_pass) {
-		if (time_sample < InputTransmitStride) {
-			uint in_off = InputChannelStride * imageLoad(channel_mapping, int(channel)).x +
-			              InputSampleStride  * time_sample;
-			#if DecodeMode == DecodeMode_None || UseSharedMemory
-			in_off    += InputTransmitStride * transmit;
-			rf_offset += transmit;
-			for (uint i = 0; i < ToProcess; i++, in_off += InputTransmitStride) {
-				if (transmit + i < TransmitCount)
-					out_rf_data[rf_offset + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
-			}
-			#else
-			for (uint i = 0; i < TransmitCount; i++, in_off += InputTransmitStride)
-				out_rf_data[rf_offset + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
-			#endif
+	switch (DecodeMode) {
+	case DecodeMode_None:{
+		uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX;
+		uint channel     = gl_GlobalInvocationID.y;
+		uint transmit    = gl_GlobalInvocationID.z;
+
+		if (time_sample < OutputTransmitStride) {
+			uint in_off = (InputChannelStride  * imageLoad(channel_mapping, int(channel)).x +
+			               InputTransmitStride * transmit +
+			               InputSampleStride   * time_sample) / RF_SAMPLES_PER_INDEX;
+
+			uint out_off = (OutputChannelStride  * channel +
+			                OutputTransmitStride * transmit +
+			                OutputSampleStride   * time_sample) / OUTPUT_SAMPLES_PER_INDEX;
+
+			out_data[out_off] = sample_rf_data(in_off);
 		}
-	} else {
-		switch (DecodeMode) {
-		case DecodeMode_None:{
-			uint out_off = OutputChannelStride  * channel +
-			               OutputTransmitStride * transmit +
-			               OutputSampleStride   * time_sample;
-			for (uint i = 0; i < ToProcess; i++, out_off += OutputTransmitStride) {
-				if (TransmitCount % (gl_WorkGroupSize.z * ToProcess) == 0 || transmit + i < TransmitCount)
-					out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = sample_rf_data(rf_offset + transmit + i);
+	}break;
+	case DecodeMode_Hadamard:{
+		if (u_first_pass) {
+			uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX;
+			uint channel     = gl_GlobalInvocationID.y;
+			uint transmit    = gl_GlobalInvocationID.z * ToProcess;
+			if (time_sample < InputTransmitStride) {
+				uint out_off = (InputChannelStride * channel + TransmitCount * time_sample) / RF_SAMPLES_PER_INDEX;
+				uint in_off  = InputChannelStride * imageLoad(channel_mapping, int(channel)).x +
+				               InputSampleStride  * time_sample;
+				#if UseSharedMemory
+					in_off  += InputTransmitStride * transmit;
+					out_off += transmit;
+					for (uint i = 0; i < ToProcess; i++, in_off += InputTransmitStride) {
+						if (transmit + i < TransmitCount)
+							out_rf_data[out_off + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
+					}
+				#else
+					for (uint i = 0; i < TransmitCount; i++, in_off += InputTransmitStride)
+						out_rf_data[out_off + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
+				#endif
 			}
-		}break;
-		case DecodeMode_Hadamard:{
+		} else {
 			#if UseSharedMemory
 				run_decode_large();
 			#else
 				run_decode_small();
 			#endif
-		}break;
 		}
+	}break;
 	}
 }

M	beamformer.c	\|	36	++++++++++++++++++------------------
M	shaders/decode.glsl	\|	71	+++++++++++++++++++++++++++++++++++++++--------------------------------