shaders/decode: gain another 47% performance boost for i16 data - ogl_beamforming - Ultrasound Beamforming Implemented with OpenGL

Commit: ee457ba4b6809bef581c90ac7609882948abe7f9
Parent: 27f33707695d4a52321850c5a9800422a2ea4cd8
Author: Randy Palamar
Date:   Wed, 25 Jun 2025 21:24:29 -0600

shaders/decode: gain another 47% performance boost for i16 data

Just decode 2 samples at once, dummy. Also convert to float
immediately instead of after the dot product.

Diffstat:
M beamformer.c  | 8 +++++++-
M shaders/decode.glsl  | 43 +++++++++++++++++++++++--------------------

2 files changed, 30 insertions(+), 21 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -282,7 +282,13 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, 
 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
 		glBindImageTexture(0, csctx->hadamard_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I);
 		glBindImageTexture(1, csctx->channel_mapping_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I);
-		glDispatchCompute(ceil_f32((f32)csctx->dec_data_dim.x / DECODE_LOCAL_SIZE_X),
+
+		/* NOTE(rnp): decode 2 samples per dispatch when data is i16 */
+		i32 local_size_x = DECODE_LOCAL_SIZE_X;
+		if (shader == BeamformerShaderKind_Decode)
+			local_size_x *= 2;
+
+		glDispatchCompute(ceil_f32((f32)csctx->dec_data_dim.x / local_size_x),
 		                  ceil_f32((f32)csctx->dec_data_dim.y / DECODE_LOCAL_SIZE_Y),
 		                  ceil_f32((f32)csctx->dec_data_dim.z / DECODE_LOCAL_SIZE_Z));
 		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
diff --git a/shaders/decode.glsl b/shaders/decode.glsl
@@ -11,15 +11,22 @@
 #if   defined(INPUT_DATA_TYPE_FLOAT)
 	#define INPUT_DATA_TYPE      float
 	#define RF_SAMPLES_PER_INDEX 1
-	#define RESULT_TYPE_CAST(x)  vec2(x, 0)
+	#define RESULT_TYPE_CAST(x)  vec4((x), 0, 0, 0)
+	#define SAMPLE_DATA_TYPE     float
+	#define SAMPLE_TYPE_CAST(x)  (x)
 #elif defined(INPUT_DATA_TYPE_FLOAT_COMPLEX)
 	#define INPUT_DATA_TYPE      vec2
 	#define RF_SAMPLES_PER_INDEX 1
-	#define RESULT_TYPE_CAST(x)  (x)
+	#define RESULT_TYPE_CAST(x)  vec4((x), 0, 0)
+	#define SAMPLE_DATA_TYPE     vec2
+	#define SAMPLE_TYPE_CAST(x)  (x)
 #else
 	#define INPUT_DATA_TYPE      int
 	#define RF_SAMPLES_PER_INDEX 2
-	#define RESULT_TYPE_CAST(x)  vec2(x, 0)
+	#define RESULT_TYPE_CAST(x)  (x)
+	#define SAMPLE_DATA_TYPE     vec4
+	/* NOTE(rnp): for i16 rf_data we decode 2 samples at once */
+	#define SAMPLE_TYPE_CAST(x)  vec4(((x) << 16) >> 16, 0, (x) >> 16, 0)
 #endif
 
 layout(std430, binding = 1) readonly restrict buffer buffer_1 {
@@ -33,16 +40,9 @@ layout(std430, binding = 2) writeonly restrict buffer buffer_2 {
 layout(r8i,  binding = 0) readonly restrict uniform iimage2D hadamard;
 layout(r16i, binding = 1) readonly restrict uniform iimage1D channel_mapping;
 
-INPUT_DATA_TYPE sample_rf_data(int index, uint lfs)
+SAMPLE_DATA_TYPE sample_rf_data(int index)
 {
-	INPUT_DATA_TYPE result;
-#if defined(INPUT_DATA_TYPE_FLOAT) || defined(INPUT_DATA_TYPE_FLOAT_COMPLEX)
-	result = rf_data[index];
-#else
-	/* NOTE(rnp): for i16 rf_data we grab 2 samples at a time. We need to shift
-	 * arithmetically (maintaining the sign) to get the desired element. */
-	result = (rf_data[index] << lfs) >> 16;
-#endif
+	SAMPLE_DATA_TYPE result = SAMPLE_TYPE_CAST(rf_data[index]);
 	return result;
 }
 
@@ -53,28 +53,31 @@ void main()
 	int transmit    = int(gl_GlobalInvocationID.z);
 
 	/* NOTE(rnp): stores output as a 3D matrix with ordering of {samples, channels, transmits} */
-	uint out_off = dec_data_dim.x * dec_data_dim.y * transmit + dec_data_dim.x * channel + time_sample;
+	uint out_off  = dec_data_dim.x * dec_data_dim.y * transmit + dec_data_dim.x * channel;
+	out_off      += RF_SAMPLES_PER_INDEX * time_sample;
 
 	int rf_channel = imageLoad(channel_mapping, channel).x;
 
 	/* NOTE(rnp): samples input as 2D matrix of {samples * transmits + padding, channels} */
 	int rf_stride = int(dec_data_dim.x) / RF_SAMPLES_PER_INDEX;
-	int rf_offset = (int(rf_raw_dim.x) * rf_channel + time_sample) / RF_SAMPLES_PER_INDEX;
+	int rf_offset = (int(rf_raw_dim.x) * rf_channel) / RF_SAMPLES_PER_INDEX + time_sample;
 
-	uint lfs = ((~time_sample) & 1u) * 16;
-	vec2 result = vec2(0);
+	vec4 result = vec4(0);
 	switch (decode) {
 	case DECODE_MODE_NONE: {
-		result = RESULT_TYPE_CAST(sample_rf_data(rf_offset + rf_stride * transmit, lfs));
+		result = RESULT_TYPE_CAST(sample_rf_data(rf_offset + rf_stride * transmit));
 	} break;
 	case DECODE_MODE_HADAMARD: {
-		INPUT_DATA_TYPE sum = INPUT_DATA_TYPE(0);
+		SAMPLE_DATA_TYPE sum = SAMPLE_DATA_TYPE(0);
 		for (int i = 0; i < dec_data_dim.z; i++) {
-			sum += imageLoad(hadamard, ivec2(i, transmit)).x * sample_rf_data(rf_offset, lfs);
+			sum += imageLoad(hadamard, ivec2(i, transmit)).x * sample_rf_data(rf_offset);
 			rf_offset += rf_stride;
 		}
 		result = RESULT_TYPE_CAST(sum) / float(dec_data_dim.z);
 	} break;
 	}
-	out_data[out_off] = result;
+	out_data[out_off + 0] = result.xy;
+#if RF_SAMPLES_PER_INDEX == 2
+	out_data[out_off + 1] = result.zw;
+#endif
 }

M beamformer.c  | 8 +++++++-
M shaders/decode.glsl  | 43 +++++++++++++++++++++++--------------------