ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

Commit: ee457ba4b6809bef581c90ac7609882948abe7f9
Parent: 27f33707695d4a52321850c5a9800422a2ea4cd8
Author: Randy Palamar
Date:   Wed, 25 Jun 2025 21:24:29 -0600

shaders/decode: gain another 47% performance boost for i16 data

Just decode 2 samples at once, dummy. Also convert to float
immediately instead of after the dot product.

Diffstat:
M beamformer.c        |  8 +++++++-
M shaders/decode.glsl | 43 +++++++++++++++++++++++--------------------
2 files changed, 30 insertions(+), 21 deletions(-)

diff --git a/beamformer.c b/beamformer.c
@@ -282,7 +282,13 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame,
 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
 		glBindImageTexture(0, csctx->hadamard_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I);
 		glBindImageTexture(1, csctx->channel_mapping_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I);
-		glDispatchCompute(ceil_f32((f32)csctx->dec_data_dim.x / DECODE_LOCAL_SIZE_X),
+
+		/* NOTE(rnp): decode 2 samples per dispatch when data is i16 */
+		i32 local_size_x = DECODE_LOCAL_SIZE_X;
+		if (shader == BeamformerShaderKind_Decode)
+			local_size_x *= 2;
+
+		glDispatchCompute(ceil_f32((f32)csctx->dec_data_dim.x / local_size_x),
 		                  ceil_f32((f32)csctx->dec_data_dim.y / DECODE_LOCAL_SIZE_Y),
 		                  ceil_f32((f32)csctx->dec_data_dim.z / DECODE_LOCAL_SIZE_Z));
 		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
diff --git a/shaders/decode.glsl b/shaders/decode.glsl
@@ -11,15 +11,22 @@
 #if defined(INPUT_DATA_TYPE_FLOAT)
 	#define INPUT_DATA_TYPE float
 	#define RF_SAMPLES_PER_INDEX 1
-	#define RESULT_TYPE_CAST(x) vec2(x, 0)
+	#define RESULT_TYPE_CAST(x) vec4((x), 0, 0, 0)
+	#define SAMPLE_DATA_TYPE float
+	#define SAMPLE_TYPE_CAST(x) (x)
 #elif defined(INPUT_DATA_TYPE_FLOAT_COMPLEX)
 	#define INPUT_DATA_TYPE vec2
 	#define RF_SAMPLES_PER_INDEX 1
-	#define RESULT_TYPE_CAST(x) (x)
+	#define RESULT_TYPE_CAST(x) vec4((x), 0, 0)
+	#define SAMPLE_DATA_TYPE vec2
+	#define SAMPLE_TYPE_CAST(x) (x)
 #else
 	#define INPUT_DATA_TYPE int
 	#define RF_SAMPLES_PER_INDEX 2
-	#define RESULT_TYPE_CAST(x) vec2(x, 0)
+	#define RESULT_TYPE_CAST(x) (x)
+	#define SAMPLE_DATA_TYPE vec4
+	/* NOTE(rnp): for i16 rf_data we decode 2 samples at once */
+	#define SAMPLE_TYPE_CAST(x) vec4(((x) << 16) >> 16, 0, (x) >> 16, 0)
 #endif
 
 layout(std430, binding = 1) readonly restrict buffer buffer_1 {
@@ -33,16 +40,9 @@ layout(std430, binding = 2) writeonly restrict buffer buffer_2 {
 layout(r8i, binding = 0) readonly restrict uniform iimage2D hadamard;
 layout(r16i, binding = 1) readonly restrict uniform iimage1D channel_mapping;
 
-INPUT_DATA_TYPE sample_rf_data(int index, uint lfs)
+SAMPLE_DATA_TYPE sample_rf_data(int index)
 {
-	INPUT_DATA_TYPE result;
-#if defined(INPUT_DATA_TYPE_FLOAT) || defined(INPUT_DATA_TYPE_FLOAT_COMPLEX)
-	result = rf_data[index];
-#else
-	/* NOTE(rnp): for i16 rf_data we grab 2 samples at a time. We need to shift
-	 * arithmetically (maintaining the sign) to get the desired element. */
-	result = (rf_data[index] << lfs) >> 16;
-#endif
+	SAMPLE_DATA_TYPE result = SAMPLE_TYPE_CAST(rf_data[index]);
 	return result;
 }
 
@@ -53,28 +53,31 @@ void main()
 	int transmit = int(gl_GlobalInvocationID.z);
 
 	/* NOTE(rnp): stores output as a 3D matrix with ordering of {samples, channels, transmits} */
-	uint out_off = dec_data_dim.x * dec_data_dim.y * transmit + dec_data_dim.x * channel + time_sample;
+	uint out_off = dec_data_dim.x * dec_data_dim.y * transmit + dec_data_dim.x * channel;
+	out_off += RF_SAMPLES_PER_INDEX * time_sample;
 
 	int rf_channel = imageLoad(channel_mapping, channel).x;
 
 	/* NOTE(rnp): samples input as 2D matrix of {samples * transmits + padding, channels} */
 	int rf_stride = int(dec_data_dim.x) / RF_SAMPLES_PER_INDEX;
-	int rf_offset = (int(rf_raw_dim.x) * rf_channel + time_sample) / RF_SAMPLES_PER_INDEX;
+	int rf_offset = (int(rf_raw_dim.x) * rf_channel) / RF_SAMPLES_PER_INDEX + time_sample;
 
-	uint lfs = ((~time_sample) & 1u) * 16;
-	vec2 result = vec2(0);
+	vec4 result = vec4(0);
 	switch (decode) {
 	case DECODE_MODE_NONE: {
-		result = RESULT_TYPE_CAST(sample_rf_data(rf_offset + rf_stride * transmit, lfs));
+		result = RESULT_TYPE_CAST(sample_rf_data(rf_offset + rf_stride * transmit));
 	} break;
 	case DECODE_MODE_HADAMARD: {
-		INPUT_DATA_TYPE sum = INPUT_DATA_TYPE(0);
+		SAMPLE_DATA_TYPE sum = SAMPLE_DATA_TYPE(0);
 		for (int i = 0; i < dec_data_dim.z; i++) {
-			sum += imageLoad(hadamard, ivec2(i, transmit)).x * sample_rf_data(rf_offset, lfs);
+			sum += imageLoad(hadamard, ivec2(i, transmit)).x * sample_rf_data(rf_offset);
 			rf_offset += rf_stride;
 		}
 		result = RESULT_TYPE_CAST(sum) / float(dec_data_dim.z);
 	} break;
 	}
-	out_data[out_off] = result;
+	out_data[out_off + 0] = result.xy;
+#if RF_SAMPLES_PER_INDEX == 2
+	out_data[out_off + 1] = result.zw;
+#endif
 }