shaders/decode: gain another 10% by doing a first pass data reordering - ogl_beamforming - Ultrasound Beamforming Implemented with OpenGL

Commit: 85a084f7329a4c19f01efe2ad9bbbdf115307079
Parent: ef105d5bc95f4fa9127a4595c71349522b35e6aa
Author: Randy Palamar
Date:   Fri, 27 Jun 2025 07:54:01 -0600

shaders/decode: gain another 10% by doing a first pass data reordering

instead of doing a sparse matrix multiply we can instead reorder
the data first then do a dense matrix multiply.

For now this is the final decode optimization I will do (though
its possible there is a better dispatch order for doing the
shuffle). I also tried packing the hadamard into a bit matrix but
extra integer math was more expensive than the extra memory usage
in this case (it may still be important for readi though).

Diffstat:
M beamformer.c  | 20 ++++++++++++++++----
M beamformer_parameters.h  | 2 ++
M opengl.h  | 2 ++
M shaders/decode.glsl  | 51 +++++++++++++++++++++++++++------------------------

4 files changed, 47 insertions(+), 28 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -1,6 +1,5 @@
 /* See LICENSE for license details. */
 /* TODO(rnp):
- * [ ]: refactor: compute shader timers should be generated based on the pipeline stage limit
  * [ ]: reinvestigate ring buffer raw_data_ssbo
  *      - to minimize latency the main thread should manage the subbuffer upload so that the
  *        compute thread can just keep computing. This way we can keep the copmute thread busy
@@ -123,7 +122,7 @@ alloc_shader_storage(BeamformerCtx *ctx, u32 rf_raw_size, Arena a)
 	i32 storage_flags = GL_DYNAMIC_STORAGE_BIT;
 	glDeleteBuffers(1, &cs->raw_data_ssbo);
 	glCreateBuffers(1, &cs->raw_data_ssbo);
-	glNamedBufferStorage(cs->raw_data_ssbo, rf_raw_size, 0, storage_flags);
+	glNamedBufferStorage(cs->raw_data_ssbo, 2 * rf_raw_size, 0, storage_flags);
 	LABEL_GL_OBJECT(GL_BUFFER, cs->raw_data_ssbo, s8("Raw_RF_SSBO"));
 
 	iz rf_decoded_size = 2 * sizeof(f32) * cs->dec_data_dim.x * cs->dec_data_dim.y * cs->dec_data_dim.z;
@@ -278,8 +277,7 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, 
 	case BeamformerShaderKind_Decode:
 	case BeamformerShaderKind_DecodeFloat:
 	case BeamformerShaderKind_DecodeFloatComplex:{
-		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo);
-		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
+		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, csctx->rf_data_ssbos[output_ssbo_idx]);
 		glBindImageTexture(0, csctx->hadamard_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I);
 		glBindImageTexture(1, csctx->channel_mapping_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I);
 
@@ -288,9 +286,22 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, 
 		if (shader == BeamformerShaderKind_Decode)
 			local_size_x *= 2;
 
+		iz raw_size = csctx->rf_raw_size;
+		glProgramUniform1ui(csctx->programs[shader], DECODE_FIRST_PASS_UNIFORM_LOC, 1);
+		glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo, 0,        raw_size);
+		glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 2, csctx->raw_data_ssbo, raw_size, raw_size);
 		glDispatchCompute(ceil_f32((f32)csctx->dec_data_dim.x / local_size_x),
 		                  ceil_f32((f32)csctx->dec_data_dim.y / DECODE_LOCAL_SIZE_Y),
 		                  ceil_f32((f32)csctx->dec_data_dim.z / DECODE_LOCAL_SIZE_Z));
+
+		glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
+
+		glProgramUniform1ui(csctx->programs[shader], DECODE_FIRST_PASS_UNIFORM_LOC, 0);
+		glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo, raw_size, raw_size);
+		glDispatchCompute(ceil_f32((f32)csctx->dec_data_dim.x / local_size_x),
+		                  ceil_f32((f32)csctx->dec_data_dim.y / DECODE_LOCAL_SIZE_Y),
+		                  ceil_f32((f32)csctx->dec_data_dim.z / DECODE_LOCAL_SIZE_Z));
+
 		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
 	}break;
 	case BeamformerShaderKind_CudaDecode:{
@@ -423,6 +434,7 @@ shader_text_with_header(ShaderReloadContext *ctx, OS *os, Arena *arena)
 		"layout(local_size_x = " str(DECODE_LOCAL_SIZE_X) ", "
 		       "local_size_y = " str(DECODE_LOCAL_SIZE_Y) ", "
 		       "local_size_z = " str(DECODE_LOCAL_SIZE_Z) ") in;\n\n"
+		"layout(location = " str(DECODE_FIRST_PASS_UNIFORM_LOC) ") uniform bool u_first_pass;\n\n"
 		DECODE_TYPES
 		));
 		#undef X
diff --git a/beamformer_parameters.h b/beamformer_parameters.h
@@ -73,6 +73,8 @@ typedef enum {
 #define DECODE_LOCAL_SIZE_Y  1
 #define DECODE_LOCAL_SIZE_Z 16
 
+#define DECODE_FIRST_PASS_UNIFORM_LOC 1
+
 #define DAS_LOCAL_SIZE_X 32
 #define DAS_LOCAL_SIZE_Y  1
 #define DAS_LOCAL_SIZE_Z 32
diff --git a/opengl.h b/opengl.h
@@ -14,6 +14,7 @@
 #define GL_DYNAMIC_STORAGE_BIT             0x0100
 #define GL_SHADER_IMAGE_ACCESS_BARRIER_BIT 0x00000020
 #define GL_TEXTURE_UPDATE_BARRIER_BIT      0x00000100
+#define GL_SHADER_STORAGE_BARRIER_BIT      0x00002000
 
 #define GL_UNSIGNED_INT_8_8_8_8            0x8035
 #define GL_TEXTURE_3D                      0x806F
@@ -67,6 +68,7 @@ typedef uint64_t  GLuint64;
 	X(glAttachShader,                        void,   (GLuint program, GLuint shader)) \
 	X(glBeginQuery,                          void,   (GLenum target, GLuint id)) \
 	X(glBindBufferBase,                      void,   (GLenum target, GLuint index, GLuint buffer)) \
+	X(glBindBufferRange,                     void,   (GLenum target, GLuint index, GLuint buffer, GLintptr offset, GLsizeiptr size)) \
 	X(glBindFramebuffer,                     void,   (GLenum target, GLuint framebuffer)) \
 	X(glBindImageTexture,                    void,   (GLuint unit, GLuint texture, GLint level, GLboolean layered, GLint layer, GLenum access, GLenum format)) \
 	X(glBindTextureUnit,                     void,   (GLuint unit, GLuint texture)) \
diff --git a/shaders/decode.glsl b/shaders/decode.glsl
@@ -34,6 +34,10 @@ layout(std430, binding = 1) readonly restrict buffer buffer_1 {
 };
 
 layout(std430, binding = 2) writeonly restrict buffer buffer_2 {
+	INPUT_DATA_TYPE out_rf_data[];
+};
+
+layout(std430, binding = 3) writeonly restrict buffer buffer_3 {
 	vec2 out_data[];
 };
 
@@ -52,31 +56,30 @@ void main()
 	uint channel     = gl_GlobalInvocationID.y;
 	uint transmit    = gl_GlobalInvocationID.z;
 
-	/* NOTE(rnp): stores output as a 3D matrix with ordering of {samples, channels, transmits} */
-	uint out_off = dec_data_dim.x * dec_data_dim.y * transmit + dec_data_dim.x * channel + time_sample;
-
-	int rf_channel = imageLoad(channel_mapping, int(channel)).x;
+	uint rf_offset = (channel * rf_raw_dim.x + time_sample * dec_data_dim.z) / RF_SAMPLES_PER_INDEX;
+	if (u_first_pass) {
+		int  rf_channel = imageLoad(channel_mapping, int(channel)).x;
+		uint in_off     = rf_channel * rf_raw_dim.x + transmit * dec_data_dim.x + time_sample;
+		out_rf_data[rf_offset + transmit] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
+	} else {
+		/* NOTE(rnp): stores output as a 3D matrix with ordering of {samples, channels, transmits} */
+		uint out_off = dec_data_dim.x * dec_data_dim.y * transmit + dec_data_dim.x * channel + time_sample;
 
-	/* NOTE(rnp): samples input as 2D matrix of {samples * transmits + padding, channels} */
-	uint rf_stride = dec_data_dim.x / RF_SAMPLES_PER_INDEX;
-	uint rf_offset = (rf_raw_dim.x * rf_channel + time_sample) / RF_SAMPLES_PER_INDEX;
-
-	vec4 result = vec4(0);
-	switch (decode) {
-	case DECODE_MODE_NONE: {
-		result = RESULT_TYPE_CAST(sample_rf_data(rf_offset + rf_stride * transmit));
-	} break;
-	case DECODE_MODE_HADAMARD: {
-		SAMPLE_DATA_TYPE sum = SAMPLE_DATA_TYPE(0);
-		for (int i = 0; i < dec_data_dim.z; i++) {
-			sum += imageLoad(hadamard, ivec2(i, transmit)).x * sample_rf_data(rf_offset);
-			rf_offset += rf_stride;
+		vec4 result = vec4(0);
+		switch (decode) {
+		case DECODE_MODE_NONE: {
+			result = RESULT_TYPE_CAST(sample_rf_data(rf_offset + transmit));
+		} break;
+		case DECODE_MODE_HADAMARD: {
+			SAMPLE_DATA_TYPE sum = SAMPLE_DATA_TYPE(0);
+			for (int i = 0; i < dec_data_dim.z; i++)
+				sum += imageLoad(hadamard, ivec2(i, transmit)).x * sample_rf_data(rf_offset++);
+			result = RESULT_TYPE_CAST(sum) / float(dec_data_dim.z);
+		} break;
 		}
-		result = RESULT_TYPE_CAST(sum) / float(dec_data_dim.z);
-	} break;
+		out_data[out_off + 0] = result.xy;
+		#if RF_SAMPLES_PER_INDEX == 2
+		out_data[out_off + 1] = result.zw;
+		#endif
 	}
-	out_data[out_off + 0] = result.xy;
-#if RF_SAMPLES_PER_INDEX == 2
-	out_data[out_off + 1] = result.zw;
-#endif
 }

M	beamformer.c	\|	20	++++++++++++++++----
M	beamformer_parameters.h	\|	2	++
M	opengl.h	\|	2	++
M	shaders/decode.glsl	\|	51	+++++++++++++++++++++++++++------------------------