ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

Commit: 85a084f7329a4c19f01efe2ad9bbbdf115307079
Parent: ef105d5bc95f4fa9127a4595c71349522b35e6aa
Author: Randy Palamar
Date:   Fri, 27 Jun 2025 07:54:01 -0600

shaders/decode: gain another 10% by doing a first pass data reordering

instead of doing a sparse matrix multiply we can instead reorder
the data first then do a dense matrix multiply.

For now this is the final decode optimization I will do (though
its possible there is a better dispatch order for doing the
shuffle). I also tried packing the hadamard into a bit matrix but
extra integer math was more expensive than the extra memory usage
in this case (it may still be important for readi though).

Diffstat:
Mbeamformer.c | 20++++++++++++++++----
Mbeamformer_parameters.h | 2++
Mopengl.h | 2++
Mshaders/decode.glsl | 51+++++++++++++++++++++++++++------------------------
4 files changed, 47 insertions(+), 28 deletions(-)

diff --git a/beamformer.c b/beamformer.c @@ -1,6 +1,5 @@ /* See LICENSE for license details. */ /* TODO(rnp): - * [ ]: refactor: compute shader timers should be generated based on the pipeline stage limit * [ ]: reinvestigate ring buffer raw_data_ssbo * - to minimize latency the main thread should manage the subbuffer upload so that the * compute thread can just keep computing. This way we can keep the copmute thread busy @@ -123,7 +122,7 @@ alloc_shader_storage(BeamformerCtx *ctx, u32 rf_raw_size, Arena a) i32 storage_flags = GL_DYNAMIC_STORAGE_BIT; glDeleteBuffers(1, &cs->raw_data_ssbo); glCreateBuffers(1, &cs->raw_data_ssbo); - glNamedBufferStorage(cs->raw_data_ssbo, rf_raw_size, 0, storage_flags); + glNamedBufferStorage(cs->raw_data_ssbo, 2 * rf_raw_size, 0, storage_flags); LABEL_GL_OBJECT(GL_BUFFER, cs->raw_data_ssbo, s8("Raw_RF_SSBO")); iz rf_decoded_size = 2 * sizeof(f32) * cs->dec_data_dim.x * cs->dec_data_dim.y * cs->dec_data_dim.z; @@ -278,8 +277,7 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, case BeamformerShaderKind_Decode: case BeamformerShaderKind_DecodeFloat: case BeamformerShaderKind_DecodeFloatComplex:{ - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, csctx->rf_data_ssbos[output_ssbo_idx]); glBindImageTexture(0, csctx->hadamard_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I); glBindImageTexture(1, csctx->channel_mapping_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I); @@ -288,9 +286,22 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, if (shader == BeamformerShaderKind_Decode) local_size_x *= 2; + iz raw_size = csctx->rf_raw_size; + glProgramUniform1ui(csctx->programs[shader], DECODE_FIRST_PASS_UNIFORM_LOC, 1); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo, 0, raw_size); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 2, csctx->raw_data_ssbo, raw_size, raw_size); glDispatchCompute(ceil_f32((f32)csctx->dec_data_dim.x / local_size_x), ceil_f32((f32)csctx->dec_data_dim.y / DECODE_LOCAL_SIZE_Y), ceil_f32((f32)csctx->dec_data_dim.z / DECODE_LOCAL_SIZE_Z)); + + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + + glProgramUniform1ui(csctx->programs[shader], DECODE_FIRST_PASS_UNIFORM_LOC, 0); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo, raw_size, raw_size); + glDispatchCompute(ceil_f32((f32)csctx->dec_data_dim.x / local_size_x), + ceil_f32((f32)csctx->dec_data_dim.y / DECODE_LOCAL_SIZE_Y), + ceil_f32((f32)csctx->dec_data_dim.z / DECODE_LOCAL_SIZE_Z)); + csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index; }break; case BeamformerShaderKind_CudaDecode:{ @@ -423,6 +434,7 @@ shader_text_with_header(ShaderReloadContext *ctx, OS *os, Arena *arena) "layout(local_size_x = " str(DECODE_LOCAL_SIZE_X) ", " "local_size_y = " str(DECODE_LOCAL_SIZE_Y) ", " "local_size_z = " str(DECODE_LOCAL_SIZE_Z) ") in;\n\n" + "layout(location = " str(DECODE_FIRST_PASS_UNIFORM_LOC) ") uniform bool u_first_pass;\n\n" DECODE_TYPES )); #undef X diff --git a/beamformer_parameters.h b/beamformer_parameters.h @@ -73,6 +73,8 @@ typedef enum { #define DECODE_LOCAL_SIZE_Y 1 #define DECODE_LOCAL_SIZE_Z 16 +#define DECODE_FIRST_PASS_UNIFORM_LOC 1 + #define DAS_LOCAL_SIZE_X 32 #define DAS_LOCAL_SIZE_Y 1 #define DAS_LOCAL_SIZE_Z 32 diff --git a/opengl.h b/opengl.h @@ -14,6 +14,7 @@ #define GL_DYNAMIC_STORAGE_BIT 0x0100 #define GL_SHADER_IMAGE_ACCESS_BARRIER_BIT 0x00000020 #define GL_TEXTURE_UPDATE_BARRIER_BIT 0x00000100 +#define GL_SHADER_STORAGE_BARRIER_BIT 0x00002000 #define GL_UNSIGNED_INT_8_8_8_8 0x8035 #define GL_TEXTURE_3D 0x806F @@ -67,6 +68,7 @@ typedef uint64_t GLuint64; X(glAttachShader, void, (GLuint program, GLuint shader)) \ X(glBeginQuery, void, (GLenum target, GLuint id)) \ X(glBindBufferBase, void, (GLenum target, GLuint index, GLuint buffer)) \ + X(glBindBufferRange, void, (GLenum target, GLuint index, GLuint buffer, GLintptr offset, GLsizeiptr size)) \ X(glBindFramebuffer, void, (GLenum target, GLuint framebuffer)) \ X(glBindImageTexture, void, (GLuint unit, GLuint texture, GLint level, GLboolean layered, GLint layer, GLenum access, GLenum format)) \ X(glBindTextureUnit, void, (GLuint unit, GLuint texture)) \ diff --git a/shaders/decode.glsl b/shaders/decode.glsl @@ -34,6 +34,10 @@ layout(std430, binding = 1) readonly restrict buffer buffer_1 { }; layout(std430, binding = 2) writeonly restrict buffer buffer_2 { + INPUT_DATA_TYPE out_rf_data[]; +}; + +layout(std430, binding = 3) writeonly restrict buffer buffer_3 { vec2 out_data[]; }; @@ -52,31 +56,30 @@ void main() uint channel = gl_GlobalInvocationID.y; uint transmit = gl_GlobalInvocationID.z; - /* NOTE(rnp): stores output as a 3D matrix with ordering of {samples, channels, transmits} */ - uint out_off = dec_data_dim.x * dec_data_dim.y * transmit + dec_data_dim.x * channel + time_sample; - - int rf_channel = imageLoad(channel_mapping, int(channel)).x; + uint rf_offset = (channel * rf_raw_dim.x + time_sample * dec_data_dim.z) / RF_SAMPLES_PER_INDEX; + if (u_first_pass) { + int rf_channel = imageLoad(channel_mapping, int(channel)).x; + uint in_off = rf_channel * rf_raw_dim.x + transmit * dec_data_dim.x + time_sample; + out_rf_data[rf_offset + transmit] = rf_data[in_off / RF_SAMPLES_PER_INDEX]; + } else { + /* NOTE(rnp): stores output as a 3D matrix with ordering of {samples, channels, transmits} */ + uint out_off = dec_data_dim.x * dec_data_dim.y * transmit + dec_data_dim.x * channel + time_sample; - /* NOTE(rnp): samples input as 2D matrix of {samples * transmits + padding, channels} */ - uint rf_stride = dec_data_dim.x / RF_SAMPLES_PER_INDEX; - uint rf_offset = (rf_raw_dim.x * rf_channel + time_sample) / RF_SAMPLES_PER_INDEX; - - vec4 result = vec4(0); - switch (decode) { - case DECODE_MODE_NONE: { - result = RESULT_TYPE_CAST(sample_rf_data(rf_offset + rf_stride * transmit)); - } break; - case DECODE_MODE_HADAMARD: { - SAMPLE_DATA_TYPE sum = SAMPLE_DATA_TYPE(0); - for (int i = 0; i < dec_data_dim.z; i++) { - sum += imageLoad(hadamard, ivec2(i, transmit)).x * sample_rf_data(rf_offset); - rf_offset += rf_stride; + vec4 result = vec4(0); + switch (decode) { + case DECODE_MODE_NONE: { + result = RESULT_TYPE_CAST(sample_rf_data(rf_offset + transmit)); + } break; + case DECODE_MODE_HADAMARD: { + SAMPLE_DATA_TYPE sum = SAMPLE_DATA_TYPE(0); + for (int i = 0; i < dec_data_dim.z; i++) + sum += imageLoad(hadamard, ivec2(i, transmit)).x * sample_rf_data(rf_offset++); + result = RESULT_TYPE_CAST(sum) / float(dec_data_dim.z); + } break; } - result = RESULT_TYPE_CAST(sum) / float(dec_data_dim.z); - } break; + out_data[out_off + 0] = result.xy; + #if RF_SAMPLES_PER_INDEX == 2 + out_data[out_off + 1] = result.zw; + #endif } - out_data[out_off + 0] = result.xy; -#if RF_SAMPLES_PER_INDEX == 2 - out_data[out_off + 1] = result.zw; -#endif }