Commit: 2ca5fb9c3a7e4116aab00470d387096068231541
Parent: c181b332e633070145251210905a88a4fa920492
Author: Randy Palamar
Date: Thu, 25 Sep 2025 10:00:10 -0600
shaders/decode: switch to parameter baking at compile time
this gives a ~49% performance increase (likely because the Hadamard decode loop bound is now a compile-time constant instead of an imageSize() query)
Diffstat:
5 files changed, 81 insertions(+), 72 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -1,10 +1,13 @@
/* See LICENSE for license details. */
/* TODO(rnp):
+ * [ ]: refactor: split decode into reshape and decode
+ * - the check for first pass reshaping is the last non constant check
+ * in the shader
+ * - this will also remove the need for the channel mapping in the decode shader
* [ ]: refactor: fancier hot reloading for JIT shaders
* - loop over all active blocks
- loop over shader sets per block
* - when match found reload it
- * [ ]: measure performance of doing channel mapping in a separate shader
* [ ]: BeamformWorkQueue -> BeamformerWorkQueue
* [ ]: need to keep track of gpu memory in some way
* - want to be able to store more than 16 2D frames but limit 3D frames
@@ -501,17 +504,19 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
case BeamformerShaderKind_CudaHilbert:{ commit = run_cuda_hilbert; }break;
case BeamformerShaderKind_Decode:{
/* TODO(rnp): rework decode first and demodulate after */
- BeamformerDataKind decode_data_kind = data_kind;
+ BeamformerShaderDecodeBakeParameters *b = &cp->decode_bake;
+ b->data_kind = data_kind;
if (!decode_first) {
if (data_kind == BeamformerDataKind_Int16) {
- decode_data_kind = BeamformerDataKind_Int16Complex;
+ b->data_kind = BeamformerDataKind_Int16Complex;
} else {
- decode_data_kind = BeamformerDataKind_Float32Complex;
+ b->data_kind = BeamformerDataKind_Float32Complex;
}
}
- i32 local_flags = 0;
- if (run_cuda_hilbert) local_flags |= BeamformerShaderDecodeFlags_DilateOutput;
- match = beamformer_shader_decode_match(decode_data_kind, local_flags);
+
+ b->shader_flags = 0;
+ if (run_cuda_hilbert) b->shader_flags |= BeamformerShaderDecodeFlags_DilateOutput;
+
commit = 1;
}break;
case BeamformerShaderKind_Demodulate:{
@@ -583,7 +588,7 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
u32 input_transmit_stride = cp->das_bake.sample_count;
u32 input_channel_stride = pb->parameters.raw_data_dimensions[0];
- BeamformerDecodeUBO *dp = &cp->decode_ubo_data;
+ BeamformerShaderDecodeBakeParameters *dp = &cp->decode_bake;
dp->decode_mode = pb->parameters.decode;
dp->transmit_count = cp->das_bake.acquisition_count;
@@ -727,10 +732,6 @@ load_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, u32 shader_sl
BEAMFORMER_DAS_UBO_PARAM_LIST
"};\n\n"
),
- [BeamformerShaderKind_Decode] = s8_comp("layout(std140, binding = 0) uniform parameters {\n"
- BEAMFORMER_DECODE_UBO_PARAM_LIST
- "};\n\n"
- ),
[BeamformerShaderKind_Filter] = s8_comp("layout(std140, binding = 0) uniform parameters {\n"
BEAMFORMER_FILTER_UBO_PARAM_LIST
"};\n\n"
@@ -776,6 +777,7 @@ load_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, u32 shader_sl
u32 *parameters = 0;
/* TODO(rnp): generate this */
switch (shader) {
+ case BeamformerShaderKind_Decode:{ parameters = cp->decode_bake.E; }break;
case BeamformerShaderKind_Demodulate:{ parameters = cp->demodulate_bake.E; }break;
case BeamformerShaderKind_Filter:{ parameters = cp->filter_bake.E; }break;
case BeamformerShaderKind_DAS:{ parameters = cp->das_bake.E; }break;
@@ -900,7 +902,6 @@ do_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, BeamformerFrame
switch (shader) {
case BeamformerShaderKind_Decode:{
- glBindBufferBase(GL_UNIFORM_BUFFER, 0, cp->ubos[BeamformerComputeUBOKind_Decode]);
glBindImageTexture(0, cp->textures[BeamformerComputeTextureKind_Hadamard], 0, 0, 0, GL_READ_ONLY, GL_R8I);
if (shader_slot == 0) {
diff --git a/beamformer.h b/beamformer.h
@@ -113,17 +113,6 @@ typedef struct {
X(sampling_frequency, f32, float)
/* X(name, type, gltype) */
-#define BEAMFORMER_DECODE_UBO_PARAM_LIST \
- X(input_channel_stride, u32, uint) \
- X(input_sample_stride, u32, uint) \
- X(input_transmit_stride, u32, uint) \
- X(output_channel_stride, u32, uint) \
- X(output_sample_stride, u32, uint) \
- X(output_transmit_stride, u32, uint) \
- X(transmit_count, u32, uint) \
- X(decode_mode, u32, uint)
-
-/* X(name, type, gltype) */
#define BEAMFORMER_DAS_UBO_PARAM_LIST \
X(voxel_transform, m4, mat4) \
X(xdc_transform, m4, mat4) \
@@ -136,13 +125,6 @@ typedef struct {
typedef alignas(16) struct {
#define X(name, type, ...) type name;
- BEAMFORMER_DECODE_UBO_PARAM_LIST
- #undef X
-} BeamformerDecodeUBO;
-static_assert((sizeof(BeamformerDecodeUBO) & 15) == 0, "UBO size must be a multiple of 16");
-
-typedef alignas(16) struct {
- #define X(name, type, ...) type name;
BEAMFORMER_FILTER_UBO_PARAM_LIST
#undef X
float _pad[2];
@@ -157,11 +139,9 @@ typedef alignas(16) struct {
} BeamformerDASUBO;
static_assert((sizeof(BeamformerDASUBO) & 15) == 0, "UBO size must be a multiple of 16");
-/* TODO(rnp): das should remove redundant info and add voxel transform */
/* TODO(rnp): need 1 UBO per filter slot */
#define BEAMFORMER_COMPUTE_UBO_LIST \
X(DAS, BeamformerDASUBO, das) \
- X(Decode, BeamformerDecodeUBO, decode) \
X(Filter, BeamformerFilterUBO, filter) \
X(Demodulate, BeamformerFilterUBO, demod)
@@ -217,6 +197,7 @@ struct BeamformerComputePlan {
BEAMFORMER_COMPUTE_UBO_LIST
#undef X
+ BeamformerShaderDecodeBakeParameters decode_bake;
BeamformerShaderFilterBakeParameters demodulate_bake;
BeamformerShaderFilterBakeParameters filter_bake;
BeamformerShaderDASBakeParameters das_bake;
diff --git a/beamformer.meta b/beamformer.meta
@@ -10,12 +10,23 @@
@Shader(decode.glsl) Decode
{
+ @Enumeration(DataKind)
@Enumeration(DecodeMode)
- @PermuteFlags([DilateOutput])
+ @Flags([DilateOutput])
+
+ @Bake
{
- @Permute(DataKind [Int16])
+ @BakeVariable(DataKind data_kind )
+ @BakeVariable(DecodeMode decode_mode )
+ @BakeVariable(InputChannelStride input_channel_stride )
+ @BakeVariable(InputSampleStride input_sample_stride )
+ @BakeVariable(InputTransmitStride input_transmit_stride )
+ @BakeVariable(OutputChannelStride output_channel_stride )
+ @BakeVariable(OutputSampleStride output_sample_stride )
+ @BakeVariable(OutputTransmitStride output_transmit_stride)
+ @BakeVariable(ShaderFlags shader_flags )
+ @BakeVariable(TransmitCount transmit_count )
}
- @Permute(DataKind [Int16Complex Float32 Float32Complex])
}
@Shader(filter.glsl) Filter
diff --git a/generated/beamformer.meta.c b/generated/beamformer.meta.c
@@ -76,6 +76,22 @@ typedef struct {
typedef union {
struct {
u32 data_kind;
+ u32 decode_mode;
+ u32 input_channel_stride;
+ u32 input_sample_stride;
+ u32 input_transmit_stride;
+ u32 output_channel_stride;
+ u32 output_sample_stride;
+ u32 output_transmit_stride;
+ u32 shader_flags;
+ u32 transmit_count;
+ };
+ u32 E[10];
+} BeamformerShaderDecodeBakeParameters;
+
+typedef union {
+ struct {
+ u32 data_kind;
u32 decimation_rate;
u32 filter_length;
u32 input_channel_stride;
@@ -108,11 +124,7 @@ read_only global i32 *beamformer_shader_match_vectors[] = {
// CudaHilbert
0,
// Decode
- (i32 []){BeamformerDataKind_Int16, 0x00},
- (i32 []){BeamformerDataKind_Int16, 0x01},
- (i32 []){BeamformerDataKind_Int16Complex, 0x00},
- (i32 []){BeamformerDataKind_Float32, 0x00},
- (i32 []){BeamformerDataKind_Float32Complex, 0x00},
+ 0,
// Filter
0,
// Demodulate
@@ -125,18 +137,18 @@ read_only global i32 *beamformer_shader_match_vectors[] = {
// Render3D
0,
};
-#define beamformer_match_vectors_count (12)
+#define beamformer_match_vectors_count (8)
read_only global BeamformerShaderDescriptor beamformer_shader_descriptors[] = {
- {0, 1, 0, 0, 0},
- {1, 2, 0, 0, 0},
- {2, 7, 1, 2, 1},
- {7, 8, 0, 2, 0},
- {8, 8, 0, 0, 0},
- {8, 9, 0, 2, 0},
- {9, 10, 0, 0, 0},
- {10, 11, 0, 0, 0},
- {11, 12, 0, 0, 0},
+ {0, 1, 0, 0, 0},
+ {1, 2, 0, 0, 0},
+ {2, 3, 0, 2, 0},
+ {3, 4, 0, 2, 0},
+ {4, 4, 0, 0, 0},
+ {4, 5, 0, 2, 0},
+ {5, 6, 0, 0, 0},
+ {6, 7, 0, 0, 0},
+ {7, 8, 0, 0, 0},
};
read_only global s8 beamformer_shader_names[] = {
@@ -251,7 +263,18 @@ read_only global i32 *beamformer_shader_header_vectors[] = {
};
read_only global s8 *beamformer_shader_bake_parameter_names[] = {
- 0,
+ (s8 []){
+ s8_comp("DataKind"),
+ s8_comp("DecodeMode"),
+ s8_comp("InputChannelStride"),
+ s8_comp("InputSampleStride"),
+ s8_comp("InputTransmitStride"),
+ s8_comp("OutputChannelStride"),
+ s8_comp("OutputSampleStride"),
+ s8_comp("OutputTransmitStride"),
+ s8_comp("ShaderFlags"),
+ s8_comp("TransmitCount"),
+ },
(s8 []){
s8_comp("DataKind"),
s8_comp("DecimationRate"),
@@ -279,7 +302,7 @@ read_only global s8 *beamformer_shader_bake_parameter_names[] = {
};
read_only global i32 beamformer_shader_bake_parameter_name_counts[] = {
- 0,
+ 10,
11,
6,
0,
@@ -309,10 +332,3 @@ beamformer_shader_match(i32 *match_vector, i32 first_index, i32 one_past_last_in
return result;
}
-function iz
-beamformer_shader_decode_match(BeamformerDataKind a, i32 flags)
-{
- iz result = beamformer_shader_match((i32 []){(i32)a, flags}, 2, 7, 2);
- return result;
-}
-
diff --git a/shaders/decode.glsl b/shaders/decode.glsl
@@ -70,30 +70,30 @@ void main()
uint channel = gl_GlobalInvocationID.y;
uint transmit = gl_GlobalInvocationID.z;
- uint rf_offset = (input_channel_stride * channel + transmit_count * time_sample) / RF_SAMPLES_PER_INDEX;
+ uint rf_offset = (InputChannelStride * channel + TransmitCount * time_sample) / RF_SAMPLES_PER_INDEX;
if (u_first_pass) {
- if (time_sample < input_transmit_stride) {
- uint in_off = input_channel_stride * imageLoad(channel_mapping, int(channel)).x +
- input_transmit_stride * transmit +
- input_sample_stride * time_sample;
+ if (time_sample < InputTransmitStride) {
+ uint in_off = InputChannelStride * imageLoad(channel_mapping, int(channel)).x +
+ InputTransmitStride * transmit +
+ InputSampleStride * time_sample;
out_rf_data[rf_offset + transmit] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
}
} else {
- if (time_sample < output_transmit_stride) {
- uint out_off = output_channel_stride * channel +
- output_transmit_stride * transmit +
- output_sample_stride * time_sample;
+ if (time_sample < OutputTransmitStride) {
+ uint out_off = OutputChannelStride * channel +
+ OutputTransmitStride * transmit +
+ OutputSampleStride * time_sample;
SAMPLE_DATA_TYPE result = SAMPLE_DATA_TYPE(0);
- switch (decode_mode) {
+ switch (DecodeMode) {
case DecodeMode_None:{
result = sample_rf_data(rf_offset + transmit);
}break;
case DecodeMode_Hadamard:{
SAMPLE_DATA_TYPE sum = SAMPLE_DATA_TYPE(0);
- for (int i = 0; i < imageSize(hadamard).x; i++)
+ for (int i = 0; i < TransmitCount; i++)
sum += imageLoad(hadamard, ivec2(i, transmit)).x * sample_rf_data(rf_offset++);
- result = sum / float(imageSize(hadamard).x);
+ result = sum / float(TransmitCount);
}break;
}
out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result;