Commit: 31df2285cfeeb900c97ecd3011a75df20ef05ad9
Parent: 35f140977903d900298b976cd5ffc11e3e2ebd68
Author: Randy Palamar
Date: Wed, 5 Nov 2025 12:58:16 -0700
shaders/decode: move shared memory and register caching to separate functions
this was hard to read and reason about. additionally this fixes bad
indexing for rf-only mode.
also added a missed special case for 48 transmits. this brings its
performance in line with the trend followed by the other sizes.
Diffstat:
4 files changed, 129 insertions(+), 78 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -531,31 +531,43 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
if (run_cuda_hilbert) sd->bake.flags |= BeamformerShaderDecodeFlags_DilateOutput;
if (db->decode_mode == BeamformerDecodeMode_None) {
- db->transmits_processed = 1;
+ db->to_process = 1;
sd->layout.x = 64;
sd->layout.y = 1;
sd->layout.z = 1;
+ sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x);
+ sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y);
+ sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z);
} else if (db->transmit_count > 40) {
sd->bake.flags |= BeamformerShaderDecodeFlags_UseSharedMemory;
- db->transmits_processed = 2;
+ db->to_process = 2;
- b32 use_16z = db->transmit_count == 80 || db->transmit_count == 96 || db->transmit_count == 160;
+ if (db->transmit_count == 48)
+ db->to_process = db->transmit_count / 16;
+
+ b32 use_16z = db->transmit_count == 48 || db->transmit_count == 80 ||
+ db->transmit_count == 96 || db->transmit_count == 160;
sd->layout.x = 4;
sd->layout.y = 1;
sd->layout.z = use_16z? 16 : 32;
+
+ sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x);
+ sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y);
+ sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z / (f32)db->to_process);
} else {
- db->transmits_processed = db->transmit_count;
+ db->to_process = 1;
+
/* NOTE(rnp): register caching. using more threads will cause the compiler to do
* contortions to avoid spilling registers. using less gives higher performance */
/* TODO(rnp): may need to be adjusted to 16 on NVIDIA */
sd->layout.x = 32;
sd->layout.y = 1;
sd->layout.z = 1;
- }
- sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x);
- sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y);
- sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z / (f32)db->transmits_processed);
+ sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x);
+ sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y);
+ sd->dispatch.z = 1;
+ }
if (first) sd->dispatch.x *= decimation_rate;
diff --git a/beamformer.meta b/beamformer.meta
@@ -70,8 +70,8 @@
@BakeInt(OutputChannelStride output_channel_stride )
@BakeInt(OutputSampleStride output_sample_stride )
@BakeInt(OutputTransmitStride output_transmit_stride)
+ @BakeInt(ToProcess to_process )
@BakeInt(TransmitCount transmit_count )
- @BakeInt(TransmitsProcessed transmits_processed )
}
}
diff --git a/generated/beamformer.meta.c b/generated/beamformer.meta.c
@@ -99,8 +99,8 @@ typedef struct {
u32 output_channel_stride;
u32 output_sample_stride;
u32 output_transmit_stride;
+ u32 to_process;
u32 transmit_count;
- u32 transmits_processed;
} BeamformerShaderDecodeBakeParameters;
typedef struct {
@@ -296,8 +296,8 @@ read_only global s8 *beamformer_shader_bake_parameter_names[] = {
s8_comp("OutputChannelStride"),
s8_comp("OutputSampleStride"),
s8_comp("OutputTransmitStride"),
+ s8_comp("ToProcess"),
s8_comp("TransmitCount"),
- s8_comp("TransmitsProcessed"),
},
(s8 []){
s8_comp("DecimationRate"),
diff --git a/shaders/decode.glsl b/shaders/decode.glsl
@@ -66,92 +66,131 @@ SAMPLE_DATA_TYPE sample_rf_data(uint index)
#if UseSharedMemory
shared INPUT_DATA_TYPE rf[gl_WorkGroupSize.x * TransmitCount];
+void run_decode_large(void)
+{
+ uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX;
+ uint channel = gl_GlobalInvocationID.y;
+ uint transmit = gl_GlobalInvocationID.z * ToProcess;
+
+ uint thread_count = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z;
+ uint thread_index = gl_LocalInvocationIndex;
+
+ uint samples_per_thread = rf.length() / thread_count;
+ uint leftover_samples = rf.length() % thread_count;
+ uint samples_this_thread = samples_per_thread + uint(thread_index < leftover_samples);
+
+ uint rf_offset = (InputChannelStride * channel / RF_SAMPLES_PER_INDEX +
+ TransmitCount * gl_WorkGroupID.x * gl_WorkGroupSize.x);
+
+ for (uint i = 0; i < samples_this_thread; i++) {
+ uint index = i * thread_count + thread_index;
+ rf[index] = rf_data[rf_offset + index];
+ }
+
+ barrier();
+
+ SAMPLE_DATA_TYPE result[ToProcess];
+ if (time_sample < OutputTransmitStride) {
+ for (uint i = 0; i < ToProcess; i++)
+ result[i] = SAMPLE_DATA_TYPE(0);
+
+ for (int j = 0; j < TransmitCount; j++) {
+ SAMPLE_DATA_TYPE s = SAMPLE_TYPE_CAST(rf[gl_LocalInvocationID.x * TransmitCount + j]);
+ for (uint i = 0; i < ToProcess; i++)
+ result[i] += imageLoad(hadamard, ivec2(j, transmit + i)).x * s;
+ }
+
+ for (uint i = 0; i < ToProcess; i++)
+ result[i] /= float(TransmitCount);
+ }
+
+ /* NOTE(rnp): DO NOT combine with above; compiler shits the bed on TransmitCount == 80
+ * and it kills performance. reinvestigate when we further optimize */
+ if (time_sample < OutputTransmitStride) {
+ uint out_off = OutputChannelStride * channel +
+ OutputTransmitStride * transmit +
+ OutputSampleStride * time_sample;
+
+ for (uint i = 0; i < ToProcess; i++, out_off += OutputTransmitStride)
+ if (TransmitCount % (gl_WorkGroupSize.z * ToProcess) == 0 || transmit + i < TransmitCount)
+ out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result[i];
+ }
+}
#endif
+void run_decode_small(void)
+{
+ uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX;
+ uint channel = gl_GlobalInvocationID.y;
+ uint rf_offset = (InputChannelStride * channel + TransmitCount * time_sample) / RF_SAMPLES_PER_INDEX;
+
+ if (time_sample < OutputTransmitStride) {
+ INPUT_DATA_TYPE rf[TransmitCount];
+ for (int j = 0; j < TransmitCount; j++)
+ rf[j] = rf_data[rf_offset + j];
+
+ SAMPLE_DATA_TYPE result[TransmitCount];
+ for (int j = 0; j < TransmitCount; j++)
+ result[j] = SAMPLE_DATA_TYPE(0);
+
+ for (int i = 0; i < TransmitCount; i++) {
+ SAMPLE_DATA_TYPE s = SAMPLE_TYPE_CAST(rf[i]);
+ for (int j = 0; j < TransmitCount; j++) {
+ result[j] += imageLoad(hadamard, ivec2(i, j)).x * s;
+ }
+ }
+
+ for (int i = 0; i < TransmitCount; i++)
+ result[i] /= float(TransmitCount);
+
+ uint out_off = OutputChannelStride * channel +
+ OutputSampleStride * time_sample;
+ for (int i = 0; i < TransmitCount; i++, out_off += OutputTransmitStride)
+ out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result[i];
+ }
+}
+
void main()
{
uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX;
uint channel = gl_GlobalInvocationID.y;
- uint transmit = gl_GlobalInvocationID.z * TransmitsProcessed;
+ uint transmit = gl_GlobalInvocationID.z * ToProcess;
uint rf_offset = (InputChannelStride * channel + TransmitCount * time_sample) / RF_SAMPLES_PER_INDEX;
if (u_first_pass) {
if (time_sample < InputTransmitStride) {
- uint in_off = InputChannelStride * imageLoad(channel_mapping, int(channel)).x +
- InputTransmitStride * transmit +
- InputSampleStride * time_sample;
- for (uint i = 0; i < TransmitsProcessed; i++, in_off += InputTransmitStride)
- out_rf_data[rf_offset + transmit + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
+ uint in_off = InputChannelStride * imageLoad(channel_mapping, int(channel)).x +
+ InputSampleStride * time_sample;
+ #if DecodeMode == DecodeMode_None || UseSharedMemory
+ in_off += InputTransmitStride * transmit;
+ rf_offset += transmit;
+ for (uint i = 0; i < ToProcess; i++, in_off += InputTransmitStride) {
+ if (transmit + i < TransmitCount)
+ out_rf_data[rf_offset + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
+ }
+ #else
+ for (uint i = 0; i < TransmitCount; i++, in_off += InputTransmitStride)
+ out_rf_data[rf_offset + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
+ #endif
}
} else {
- if (UseSharedMemory == 0 && time_sample >= OutputTransmitStride)
- return;
-
- SAMPLE_DATA_TYPE result[TransmitsProcessed];
switch (DecodeMode) {
case DecodeMode_None:{
- for (uint i = 0; i < TransmitsProcessed; i++)
- if (TransmitCount % (gl_WorkGroupSize.z * TransmitsProcessed) == 0 || transmit + i < TransmitCount)
- result[i] = sample_rf_data(rf_offset + transmit + i);
+ uint out_off = OutputChannelStride * channel +
+ OutputTransmitStride * transmit +
+ OutputSampleStride * time_sample;
+ for (uint i = 0; i < ToProcess; i++, out_off += OutputTransmitStride) {
+ if (TransmitCount % (gl_WorkGroupSize.z * ToProcess) == 0 || transmit + i < TransmitCount)
+ out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = sample_rf_data(rf_offset + transmit + i);
+ }
}break;
case DecodeMode_Hadamard:{
#if UseSharedMemory
- {
- uint thread_count = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z;
- uint thread_index = gl_LocalInvocationIndex;
-
- uint samples_per_thread = rf.length() / thread_count;
- uint leftover_samples = rf.length() % thread_count;
- uint samples_this_thread = samples_per_thread + uint(thread_index < leftover_samples);
-
- uint rf_offset = (InputChannelStride * channel +
- TransmitCount * gl_WorkGroupID.x * gl_WorkGroupSize.x) / RF_SAMPLES_PER_INDEX;
-
- for (uint i = 0; i < samples_this_thread; i++) {
- uint index = i * thread_count + thread_index;
- rf[index] = rf_data[rf_offset + index];
- }
- barrier();
- }
+ run_decode_large();
+ #else
+ run_decode_small();
#endif
-
- if (time_sample < OutputTransmitStride) {
- for (uint i = 0; i < TransmitsProcessed; i++)
- result[i] = SAMPLE_DATA_TYPE(0);
-
- #if UseSharedMemory
- for (int j = 0; j < TransmitCount; j++) {
- SAMPLE_DATA_TYPE s = SAMPLE_TYPE_CAST(rf[gl_LocalInvocationID.x * TransmitCount + j]);
- for (uint i = 0; i < TransmitsProcessed; i++)
- result[i] += imageLoad(hadamard, ivec2(j, transmit + i)).x * s;
- }
- #else
- INPUT_DATA_TYPE rf[TransmitsProcessed];
- for (int j = 0; j < TransmitCount; j++)
- rf[j] = rf_data[rf_offset + j];
-
- for (int j = 0; j < TransmitCount; j++) {
- SAMPLE_DATA_TYPE s = SAMPLE_TYPE_CAST(rf[j]);
- for (uint i = 0; i < TransmitsProcessed; i++) {
- result[i] += imageLoad(hadamard, ivec2(j, transmit + i)).x * s;
- }
- }
- #endif
-
- for (uint i = 0; i < TransmitsProcessed; i++)
- result[i] /= float(TransmitCount);
- }
}break;
}
-
- if (time_sample < OutputTransmitStride) {
- uint out_off = OutputChannelStride * channel +
- OutputTransmitStride * transmit +
- OutputSampleStride * time_sample;
-
- for (uint i = 0; i < TransmitsProcessed; i++, out_off += OutputTransmitStride)
- if (TransmitCount % (gl_WorkGroupSize.z * TransmitsProcessed) == 0 || transmit + i < TransmitCount)
- out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = result[i];
- }
}
}