Commit: 85a084f7329a4c19f01efe2ad9bbbdf115307079
Parent: ef105d5bc95f4fa9127a4595c71349522b35e6aa
Author: Randy Palamar
Date: Fri, 27 Jun 2025 07:54:01 -0600
shaders/decode: gain another 10% by doing a first pass data reordering
instead of doing a sparse matrix multiply we can instead reorder
the data first then do a dense matrix multiply.
For now this is the final decode optimization I will do (though
its possible there is a better dispatch order for doing the
shuffle). I also tried packing the hadamard into a bit matrix but
extra integer math was more expensive than the extra memory usage
in this case (it may still be important for readi though).
Diffstat:
4 files changed, 47 insertions(+), 28 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -1,6 +1,5 @@
/* See LICENSE for license details. */
/* TODO(rnp):
- * [ ]: refactor: compute shader timers should be generated based on the pipeline stage limit
* [ ]: reinvestigate ring buffer raw_data_ssbo
* - to minimize latency the main thread should manage the subbuffer upload so that the
* compute thread can just keep computing. This way we can keep the copmute thread busy
@@ -123,7 +122,7 @@ alloc_shader_storage(BeamformerCtx *ctx, u32 rf_raw_size, Arena a)
i32 storage_flags = GL_DYNAMIC_STORAGE_BIT;
glDeleteBuffers(1, &cs->raw_data_ssbo);
glCreateBuffers(1, &cs->raw_data_ssbo);
- glNamedBufferStorage(cs->raw_data_ssbo, rf_raw_size, 0, storage_flags);
+ glNamedBufferStorage(cs->raw_data_ssbo, 2 * rf_raw_size, 0, storage_flags);
LABEL_GL_OBJECT(GL_BUFFER, cs->raw_data_ssbo, s8("Raw_RF_SSBO"));
iz rf_decoded_size = 2 * sizeof(f32) * cs->dec_data_dim.x * cs->dec_data_dim.y * cs->dec_data_dim.z;
@@ -278,8 +277,7 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame,
case BeamformerShaderKind_Decode:
case BeamformerShaderKind_DecodeFloat:
case BeamformerShaderKind_DecodeFloatComplex:{
- glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo);
- glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
+ glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, csctx->rf_data_ssbos[output_ssbo_idx]);
glBindImageTexture(0, csctx->hadamard_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I);
glBindImageTexture(1, csctx->channel_mapping_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I);
@@ -288,9 +286,22 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame,
if (shader == BeamformerShaderKind_Decode)
local_size_x *= 2;
+ iz raw_size = csctx->rf_raw_size;
+ glProgramUniform1ui(csctx->programs[shader], DECODE_FIRST_PASS_UNIFORM_LOC, 1);
+ glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo, 0, raw_size);
+ glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 2, csctx->raw_data_ssbo, raw_size, raw_size);
glDispatchCompute(ceil_f32((f32)csctx->dec_data_dim.x / local_size_x),
ceil_f32((f32)csctx->dec_data_dim.y / DECODE_LOCAL_SIZE_Y),
ceil_f32((f32)csctx->dec_data_dim.z / DECODE_LOCAL_SIZE_Z));
+
+ glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
+
+ glProgramUniform1ui(csctx->programs[shader], DECODE_FIRST_PASS_UNIFORM_LOC, 0);
+ glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo, raw_size, raw_size);
+ glDispatchCompute(ceil_f32((f32)csctx->dec_data_dim.x / local_size_x),
+ ceil_f32((f32)csctx->dec_data_dim.y / DECODE_LOCAL_SIZE_Y),
+ ceil_f32((f32)csctx->dec_data_dim.z / DECODE_LOCAL_SIZE_Z));
+
csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
}break;
case BeamformerShaderKind_CudaDecode:{
@@ -423,6 +434,7 @@ shader_text_with_header(ShaderReloadContext *ctx, OS *os, Arena *arena)
"layout(local_size_x = " str(DECODE_LOCAL_SIZE_X) ", "
"local_size_y = " str(DECODE_LOCAL_SIZE_Y) ", "
"local_size_z = " str(DECODE_LOCAL_SIZE_Z) ") in;\n\n"
+ "layout(location = " str(DECODE_FIRST_PASS_UNIFORM_LOC) ") uniform bool u_first_pass;\n\n"
DECODE_TYPES
));
#undef X
diff --git a/beamformer_parameters.h b/beamformer_parameters.h
@@ -73,6 +73,8 @@ typedef enum {
#define DECODE_LOCAL_SIZE_Y 1
#define DECODE_LOCAL_SIZE_Z 16
+#define DECODE_FIRST_PASS_UNIFORM_LOC 1
+
#define DAS_LOCAL_SIZE_X 32
#define DAS_LOCAL_SIZE_Y 1
#define DAS_LOCAL_SIZE_Z 32
diff --git a/opengl.h b/opengl.h
@@ -14,6 +14,7 @@
#define GL_DYNAMIC_STORAGE_BIT 0x0100
#define GL_SHADER_IMAGE_ACCESS_BARRIER_BIT 0x00000020
#define GL_TEXTURE_UPDATE_BARRIER_BIT 0x00000100
+#define GL_SHADER_STORAGE_BARRIER_BIT 0x00002000
#define GL_UNSIGNED_INT_8_8_8_8 0x8035
#define GL_TEXTURE_3D 0x806F
@@ -67,6 +68,7 @@ typedef uint64_t GLuint64;
X(glAttachShader, void, (GLuint program, GLuint shader)) \
X(glBeginQuery, void, (GLenum target, GLuint id)) \
X(glBindBufferBase, void, (GLenum target, GLuint index, GLuint buffer)) \
+ X(glBindBufferRange, void, (GLenum target, GLuint index, GLuint buffer, GLintptr offset, GLsizeiptr size)) \
X(glBindFramebuffer, void, (GLenum target, GLuint framebuffer)) \
X(glBindImageTexture, void, (GLuint unit, GLuint texture, GLint level, GLboolean layered, GLint layer, GLenum access, GLenum format)) \
X(glBindTextureUnit, void, (GLuint unit, GLuint texture)) \
diff --git a/shaders/decode.glsl b/shaders/decode.glsl
@@ -34,6 +34,10 @@ layout(std430, binding = 1) readonly restrict buffer buffer_1 {
};
layout(std430, binding = 2) writeonly restrict buffer buffer_2 {
+ INPUT_DATA_TYPE out_rf_data[];
+};
+
+layout(std430, binding = 3) writeonly restrict buffer buffer_3 {
vec2 out_data[];
};
@@ -52,31 +56,30 @@ void main()
uint channel = gl_GlobalInvocationID.y;
uint transmit = gl_GlobalInvocationID.z;
- /* NOTE(rnp): stores output as a 3D matrix with ordering of {samples, channels, transmits} */
- uint out_off = dec_data_dim.x * dec_data_dim.y * transmit + dec_data_dim.x * channel + time_sample;
-
- int rf_channel = imageLoad(channel_mapping, int(channel)).x;
+ uint rf_offset = (channel * rf_raw_dim.x + time_sample * dec_data_dim.z) / RF_SAMPLES_PER_INDEX;
+ if (u_first_pass) {
+ int rf_channel = imageLoad(channel_mapping, int(channel)).x;
+ uint in_off = rf_channel * rf_raw_dim.x + transmit * dec_data_dim.x + time_sample;
+ out_rf_data[rf_offset + transmit] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
+ } else {
+ /* NOTE(rnp): stores output as a 3D matrix with ordering of {samples, channels, transmits} */
+ uint out_off = dec_data_dim.x * dec_data_dim.y * transmit + dec_data_dim.x * channel + time_sample;
- /* NOTE(rnp): samples input as 2D matrix of {samples * transmits + padding, channels} */
- uint rf_stride = dec_data_dim.x / RF_SAMPLES_PER_INDEX;
- uint rf_offset = (rf_raw_dim.x * rf_channel + time_sample) / RF_SAMPLES_PER_INDEX;
-
- vec4 result = vec4(0);
- switch (decode) {
- case DECODE_MODE_NONE: {
- result = RESULT_TYPE_CAST(sample_rf_data(rf_offset + rf_stride * transmit));
- } break;
- case DECODE_MODE_HADAMARD: {
- SAMPLE_DATA_TYPE sum = SAMPLE_DATA_TYPE(0);
- for (int i = 0; i < dec_data_dim.z; i++) {
- sum += imageLoad(hadamard, ivec2(i, transmit)).x * sample_rf_data(rf_offset);
- rf_offset += rf_stride;
+ vec4 result = vec4(0);
+ switch (decode) {
+ case DECODE_MODE_NONE: {
+ result = RESULT_TYPE_CAST(sample_rf_data(rf_offset + transmit));
+ } break;
+ case DECODE_MODE_HADAMARD: {
+ SAMPLE_DATA_TYPE sum = SAMPLE_DATA_TYPE(0);
+ for (int i = 0; i < dec_data_dim.z; i++)
+ sum += imageLoad(hadamard, ivec2(i, transmit)).x * sample_rf_data(rf_offset++);
+ result = RESULT_TYPE_CAST(sum) / float(dec_data_dim.z);
+ } break;
}
- result = RESULT_TYPE_CAST(sum) / float(dec_data_dim.z);
- } break;
+ out_data[out_off + 0] = result.xy;
+ #if RF_SAMPLES_PER_INDEX == 2
+ out_data[out_off + 1] = result.zw;
+ #endif
}
- out_data[out_off + 0] = result.xy;
-#if RF_SAMPLES_PER_INDEX == 2
- out_data[out_off + 1] = result.zw;
-#endif
}