ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

Commit: 2ca5fb9c3a7e4116aab00470d387096068231541
Parent: c181b332e633070145251210905a88a4fa920492
Author: Randy Palamar
Date:   Thu, 25 Sep 2025 10:00:10 -0600

shaders/decode: switch to parameter baking at compile time

This gives a ~49% performance increase (likely because the decode loop bound is now a compile-time constant).

Diffstat:
M beamformer.c | 27 ++++++++++++++-------------
M beamformer.h | 21 +--------------------
M beamformer.meta | 17 ++++++++++++++---
M generated/beamformer.meta.c | 64 ++++++++++++++++++++++++++++++++++++++++------------------------
M shaders/decode.glsl | 24 ++++++++++++------------
5 files changed, 81 insertions(+), 72 deletions(-)

diff --git a/beamformer.c b/beamformer.c @@ -1,10 +1,13 @@ /* See LICENSE for license details. */ /* TODO(rnp): + * [ ]: refactor: split decode into reshape and decode + * - the check for first pass reshaping is the last non constant check + * in the shader + * - this will also remove the need for the channel mapping in the decode shader * [ ]: refactor: fancier hot reloading for JIT shaders * - loop over all active blocks - loop over shader sets per block * - when match found reload it - * [ ]: measure performance of doing channel mapping in a separate shader * [ ]: BeamformWorkQueue -> BeamformerWorkQueue * [ ]: need to keep track of gpu memory in some way * - want to be able to store more than 16 2D frames but limit 3D frames @@ -501,17 +504,19 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb) case BeamformerShaderKind_CudaHilbert:{ commit = run_cuda_hilbert; }break; case BeamformerShaderKind_Decode:{ /* TODO(rnp): rework decode first and demodulate after */ - BeamformerDataKind decode_data_kind = data_kind; + BeamformerShaderDecodeBakeParameters *b = &cp->decode_bake; + b->data_kind = data_kind; if (!decode_first) { if (data_kind == BeamformerDataKind_Int16) { - decode_data_kind = BeamformerDataKind_Int16Complex; + b->data_kind = BeamformerDataKind_Int16Complex; } else { - decode_data_kind = BeamformerDataKind_Float32Complex; + b->data_kind = BeamformerDataKind_Float32Complex; } } - i32 local_flags = 0; - if (run_cuda_hilbert) local_flags |= BeamformerShaderDecodeFlags_DilateOutput; - match = beamformer_shader_decode_match(decode_data_kind, local_flags); + + b->shader_flags = 0; + if (run_cuda_hilbert) b->shader_flags |= BeamformerShaderDecodeFlags_DilateOutput; + commit = 1; }break; case BeamformerShaderKind_Demodulate:{ @@ -583,7 +588,7 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb) u32 input_transmit_stride = cp->das_bake.sample_count; u32 input_channel_stride = 
pb->parameters.raw_data_dimensions[0]; - BeamformerDecodeUBO *dp = &cp->decode_ubo_data; + BeamformerShaderDecodeBakeParameters *dp = &cp->decode_bake; dp->decode_mode = pb->parameters.decode; dp->transmit_count = cp->das_bake.acquisition_count; @@ -727,10 +732,6 @@ load_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, u32 shader_sl BEAMFORMER_DAS_UBO_PARAM_LIST "};\n\n" ), - [BeamformerShaderKind_Decode] = s8_comp("layout(std140, binding = 0) uniform parameters {\n" - BEAMFORMER_DECODE_UBO_PARAM_LIST - "};\n\n" - ), [BeamformerShaderKind_Filter] = s8_comp("layout(std140, binding = 0) uniform parameters {\n" BEAMFORMER_FILTER_UBO_PARAM_LIST "};\n\n" @@ -776,6 +777,7 @@ load_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, u32 shader_sl u32 *parameters = 0; /* TODO(rnp): generate this */ switch (shader) { + case BeamformerShaderKind_Decode:{ parameters = cp->decode_bake.E; }break; case BeamformerShaderKind_Demodulate:{ parameters = cp->demodulate_bake.E; }break; case BeamformerShaderKind_Filter:{ parameters = cp->filter_bake.E; }break; case BeamformerShaderKind_DAS:{ parameters = cp->das_bake.E; }break; @@ -900,7 +902,6 @@ do_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, BeamformerFrame switch (shader) { case BeamformerShaderKind_Decode:{ - glBindBufferBase(GL_UNIFORM_BUFFER, 0, cp->ubos[BeamformerComputeUBOKind_Decode]); glBindImageTexture(0, cp->textures[BeamformerComputeTextureKind_Hadamard], 0, 0, 0, GL_READ_ONLY, GL_R8I); if (shader_slot == 0) { diff --git a/beamformer.h b/beamformer.h @@ -113,17 +113,6 @@ typedef struct { X(sampling_frequency, f32, float) /* X(name, type, gltype) */ -#define BEAMFORMER_DECODE_UBO_PARAM_LIST \ - X(input_channel_stride, u32, uint) \ - X(input_sample_stride, u32, uint) \ - X(input_transmit_stride, u32, uint) \ - X(output_channel_stride, u32, uint) \ - X(output_sample_stride, u32, uint) \ - X(output_transmit_stride, u32, uint) \ - X(transmit_count, u32, uint) \ - X(decode_mode, u32, uint) 
- -/* X(name, type, gltype) */ #define BEAMFORMER_DAS_UBO_PARAM_LIST \ X(voxel_transform, m4, mat4) \ X(xdc_transform, m4, mat4) \ @@ -136,13 +125,6 @@ typedef struct { typedef alignas(16) struct { #define X(name, type, ...) type name; - BEAMFORMER_DECODE_UBO_PARAM_LIST - #undef X -} BeamformerDecodeUBO; -static_assert((sizeof(BeamformerDecodeUBO) & 15) == 0, "UBO size must be a multiple of 16"); - -typedef alignas(16) struct { - #define X(name, type, ...) type name; BEAMFORMER_FILTER_UBO_PARAM_LIST #undef X float _pad[2]; @@ -157,11 +139,9 @@ typedef alignas(16) struct { } BeamformerDASUBO; static_assert((sizeof(BeamformerDASUBO) & 15) == 0, "UBO size must be a multiple of 16"); -/* TODO(rnp): das should remove redundant info and add voxel transform */ /* TODO(rnp): need 1 UBO per filter slot */ #define BEAMFORMER_COMPUTE_UBO_LIST \ X(DAS, BeamformerDASUBO, das) \ - X(Decode, BeamformerDecodeUBO, decode) \ X(Filter, BeamformerFilterUBO, filter) \ X(Demodulate, BeamformerFilterUBO, demod) @@ -217,6 +197,7 @@ struct BeamformerComputePlan { BEAMFORMER_COMPUTE_UBO_LIST #undef X + BeamformerShaderDecodeBakeParameters decode_bake; BeamformerShaderFilterBakeParameters demodulate_bake; BeamformerShaderFilterBakeParameters filter_bake; BeamformerShaderDASBakeParameters das_bake; diff --git a/beamformer.meta b/beamformer.meta @@ -10,12 +10,23 @@ @Shader(decode.glsl) Decode { + @Enumeration(DataKind) @Enumeration(DecodeMode) - @PermuteFlags([DilateOutput]) + @Flags([DilateOutput]) + + @Bake { - @Permute(DataKind [Int16]) + @BakeVariable(DataKind data_kind ) + @BakeVariable(DecodeMode decode_mode ) + @BakeVariable(InputChannelStride input_channel_stride ) + @BakeVariable(InputSampleStride input_sample_stride ) + @BakeVariable(InputTransmitStride input_transmit_stride ) + @BakeVariable(OutputChannelStride output_channel_stride ) + @BakeVariable(OutputSampleStride output_sample_stride ) + @BakeVariable(OutputTransmitStride output_transmit_stride) + @BakeVariable(ShaderFlags 
shader_flags ) + @BakeVariable(TransmitCount transmit_count ) } - @Permute(DataKind [Int16Complex Float32 Float32Complex]) } @Shader(filter.glsl) Filter diff --git a/generated/beamformer.meta.c b/generated/beamformer.meta.c @@ -76,6 +76,22 @@ typedef struct { typedef union { struct { u32 data_kind; + u32 decode_mode; + u32 input_channel_stride; + u32 input_sample_stride; + u32 input_transmit_stride; + u32 output_channel_stride; + u32 output_sample_stride; + u32 output_transmit_stride; + u32 shader_flags; + u32 transmit_count; + }; + u32 E[10]; +} BeamformerShaderDecodeBakeParameters; + +typedef union { + struct { + u32 data_kind; u32 decimation_rate; u32 filter_length; u32 input_channel_stride; @@ -108,11 +124,7 @@ read_only global i32 *beamformer_shader_match_vectors[] = { // CudaHilbert 0, // Decode - (i32 []){BeamformerDataKind_Int16, 0x00}, - (i32 []){BeamformerDataKind_Int16, 0x01}, - (i32 []){BeamformerDataKind_Int16Complex, 0x00}, - (i32 []){BeamformerDataKind_Float32, 0x00}, - (i32 []){BeamformerDataKind_Float32Complex, 0x00}, + 0, // Filter 0, // Demodulate @@ -125,18 +137,18 @@ read_only global i32 *beamformer_shader_match_vectors[] = { // Render3D 0, }; -#define beamformer_match_vectors_count (12) +#define beamformer_match_vectors_count (8) read_only global BeamformerShaderDescriptor beamformer_shader_descriptors[] = { - {0, 1, 0, 0, 0}, - {1, 2, 0, 0, 0}, - {2, 7, 1, 2, 1}, - {7, 8, 0, 2, 0}, - {8, 8, 0, 0, 0}, - {8, 9, 0, 2, 0}, - {9, 10, 0, 0, 0}, - {10, 11, 0, 0, 0}, - {11, 12, 0, 0, 0}, + {0, 1, 0, 0, 0}, + {1, 2, 0, 0, 0}, + {2, 3, 0, 2, 0}, + {3, 4, 0, 2, 0}, + {4, 4, 0, 0, 0}, + {4, 5, 0, 2, 0}, + {5, 6, 0, 0, 0}, + {6, 7, 0, 0, 0}, + {7, 8, 0, 0, 0}, }; read_only global s8 beamformer_shader_names[] = { @@ -251,7 +263,18 @@ read_only global i32 *beamformer_shader_header_vectors[] = { }; read_only global s8 *beamformer_shader_bake_parameter_names[] = { - 0, + (s8 []){ + s8_comp("DataKind"), + s8_comp("DecodeMode"), + s8_comp("InputChannelStride"), 
+ s8_comp("InputSampleStride"), + s8_comp("InputTransmitStride"), + s8_comp("OutputChannelStride"), + s8_comp("OutputSampleStride"), + s8_comp("OutputTransmitStride"), + s8_comp("ShaderFlags"), + s8_comp("TransmitCount"), + }, (s8 []){ s8_comp("DataKind"), s8_comp("DecimationRate"), @@ -279,7 +302,7 @@ read_only global s8 *beamformer_shader_bake_parameter_names[] = { }; read_only global i32 beamformer_shader_bake_parameter_name_counts[] = { - 0, + 10, 11, 6, 0, @@ -309,10 +332,3 @@ beamformer_shader_match(i32 *match_vector, i32 first_index, i32 one_past_last_in return result; } -function iz -beamformer_shader_decode_match(BeamformerDataKind a, i32 flags) -{ - iz result = beamformer_shader_match((i32 []){(i32)a, flags}, 2, 7, 2); - return result; -} - diff --git a/shaders/decode.glsl b/shaders/decode.glsl @@ -70,30 +70,30 @@ void main() uint channel = gl_GlobalInvocationID.y; uint transmit = gl_GlobalInvocationID.z; - uint rf_offset = (input_channel_stride * channel + transmit_count * time_sample) / RF_SAMPLES_PER_INDEX; + uint rf_offset = (InputChannelStride * channel + TransmitCount * time_sample) / RF_SAMPLES_PER_INDEX; if (u_first_pass) { - if (time_sample < input_transmit_stride) { - uint in_off = input_channel_stride * imageLoad(channel_mapping, int(channel)).x + - input_transmit_stride * transmit + - input_sample_stride * time_sample; + if (time_sample < InputTransmitStride) { + uint in_off = InputChannelStride * imageLoad(channel_mapping, int(channel)).x + + InputTransmitStride * transmit + + InputSampleStride * time_sample; out_rf_data[rf_offset + transmit] = rf_data[in_off / RF_SAMPLES_PER_INDEX]; } } else { - if (time_sample < output_transmit_stride) { - uint out_off = output_channel_stride * channel + - output_transmit_stride * transmit + - output_sample_stride * time_sample; + if (time_sample < OutputTransmitStride) { + uint out_off = OutputChannelStride * channel + + OutputTransmitStride * transmit + + OutputSampleStride * time_sample; SAMPLE_DATA_TYPE 
result = SAMPLE_DATA_TYPE(0); - switch (decode_mode) { + switch (DecodeMode) { case DecodeMode_None:{ result = sample_rf_data(rf_offset + transmit); }break; case DecodeMode_Hadamard:{ SAMPLE_DATA_TYPE sum = SAMPLE_DATA_TYPE(0); - for (int i = 0; i < imageSize(hadamard).x; i++) + for (int i = 0; i < TransmitCount; i++) sum += imageLoad(hadamard, ivec2(i, transmit)).x * sample_rf_data(rf_offset++); - result = sum / float(imageSize(hadamard).x); + result = sum / float(TransmitCount); }break; } out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result;