Commit: 42c2b9f27aeb4d5aed5dd3d78cd4ddc7198acbbf
Parent: 3e23d5d371f9e3817889c940926f5429d66c7a7a
Author: Randy Palamar
Date: Thu, 6 Nov 2025 22:35:29 -0700
shaders/decode: cleanup DecodeMode_None path
This path will eventually be removed completely, but for now it is
still used for RF mode channel mapping and conversion to float.
Diffstat:
2 files changed, 57 insertions(+), 50 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -531,10 +531,8 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
if (run_cuda_hilbert) sd->bake.flags |= BeamformerShaderDecodeFlags_DilateOutput;
if (db->decode_mode == BeamformerDecodeMode_None) {
- db->to_process = 1;
- sd->layout.x = 64;
- sd->layout.y = 1;
- sd->layout.z = 1;
+ sd->layout = (uv3){{64, 1, 1}};
+
sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x);
sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y);
sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z);
@@ -547,9 +545,7 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
b32 use_16z = db->transmit_count == 48 || db->transmit_count == 80 ||
db->transmit_count == 96 || db->transmit_count == 160;
- sd->layout.x = 4;
- sd->layout.y = 1;
- sd->layout.z = use_16z? 16 : 32;
+ sd->layout = (uv3){{4, 1, use_16z? 16 : 32}};
sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x);
sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y);
@@ -560,9 +556,7 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
/* NOTE(rnp): register caching. using more threads will cause the compiler to do
* contortions to avoid spilling registers. using less gives higher performance */
/* TODO(rnp): may need to be adjusted to 16 on NVIDIA */
- sd->layout.x = 32;
- sd->layout.y = 1;
- sd->layout.z = 1;
+ sd->layout = (uv3){{32, 1, 1}};
sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x);
sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y);
@@ -645,9 +639,7 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
}
/* TODO(rnp): filter may need a different dispatch layout */
- sd->layout.x = 128;
- sd->layout.y = 1;
- sd->layout.z = 1;
+ sd->layout = (uv3){{128, 1, 1}};
sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x);
sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y);
sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z);
@@ -923,16 +915,22 @@ do_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, BeamformerFrame
case BeamformerShaderKind_Decode:{
glBindImageTexture(0, cp->textures[BeamformerComputeTextureKind_Hadamard], 0, 0, 0, GL_READ_ONLY, GL_R32F);
+ BeamformerDecodeMode mode = cp->shader_descriptors[shader_slot].bake.Decode.decode_mode;
if (shader_slot == 0) {
- glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, cc->ping_pong_ssbos[input_ssbo_idx]);
glBindImageTexture(1, cp->textures[BeamformerComputeTextureKind_ChannelMapping], 0, 0, 0, GL_READ_ONLY, GL_R16I);
- glProgramUniform1ui(program, DECODE_FIRST_PASS_UNIFORM_LOC, 1);
- glDispatchCompute(dispatch.x, dispatch.y, dispatch.z);
- glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
+ if (mode != BeamformerDecodeMode_None) {
+ glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, cc->ping_pong_ssbos[input_ssbo_idx]);
+ glProgramUniform1ui(program, DECODE_FIRST_PASS_UNIFORM_LOC, 1);
+
+ glDispatchCompute(dispatch.x, dispatch.y, dispatch.z);
+ glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
+ }
}
- glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, cc->ping_pong_ssbos[input_ssbo_idx]);
+ if (mode != BeamformerDecodeMode_None)
+ glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, cc->ping_pong_ssbos[input_ssbo_idx]);
+
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, cc->ping_pong_ssbos[output_ssbo_idx]);
glProgramUniform1ui(program, DECODE_FIRST_PASS_UNIFORM_LOC, 0);
@@ -1496,6 +1494,8 @@ DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
BeamformerSharedMemory *sm = ctx->shared_memory.region;
if (atomic_load_u32(sm->locks + BeamformerSharedMemoryLockKind_UploadRF))
os_wake_waiters(&ctx->os.upload_worker.sync_variable);
+ if (atomic_load_u32(sm->locks + BeamformerSharedMemoryLockKind_DispatchCompute))
+ os_wake_waiters(&ctx->os.compute_worker.sync_variable);
BeamformerFrame *frame = ctx->latest_frame;
BeamformerViewPlaneTag tag = frame? frame->view_plane_tag : 0;
diff --git a/shaders/decode.glsl b/shaders/decode.glsl
@@ -152,45 +152,52 @@ void run_decode_small(void)
void main()
{
- uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX;
- uint channel = gl_GlobalInvocationID.y;
- uint transmit = gl_GlobalInvocationID.z * ToProcess;
-
- uint rf_offset = (InputChannelStride * channel + TransmitCount * time_sample) / RF_SAMPLES_PER_INDEX;
- if (u_first_pass) {
- if (time_sample < InputTransmitStride) {
- uint in_off = InputChannelStride * imageLoad(channel_mapping, int(channel)).x +
- InputSampleStride * time_sample;
- #if DecodeMode == DecodeMode_None || UseSharedMemory
- in_off += InputTransmitStride * transmit;
- rf_offset += transmit;
- for (uint i = 0; i < ToProcess; i++, in_off += InputTransmitStride) {
- if (transmit + i < TransmitCount)
- out_rf_data[rf_offset + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
- }
- #else
- for (uint i = 0; i < TransmitCount; i++, in_off += InputTransmitStride)
- out_rf_data[rf_offset + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
- #endif
+ switch (DecodeMode) {
+ case DecodeMode_None:{
+ uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX;
+ uint channel = gl_GlobalInvocationID.y;
+ uint transmit = gl_GlobalInvocationID.z;
+
+ if (time_sample < OutputTransmitStride) {
+ uint in_off = (InputChannelStride * imageLoad(channel_mapping, int(channel)).x +
+ InputTransmitStride * transmit +
+ InputSampleStride * time_sample) / RF_SAMPLES_PER_INDEX;
+
+ uint out_off = (OutputChannelStride * channel +
+ OutputTransmitStride * transmit +
+ OutputSampleStride * time_sample) / OUTPUT_SAMPLES_PER_INDEX;
+
+ out_data[out_off] = sample_rf_data(in_off);
}
- } else {
- switch (DecodeMode) {
- case DecodeMode_None:{
- uint out_off = OutputChannelStride * channel +
- OutputTransmitStride * transmit +
- OutputSampleStride * time_sample;
- for (uint i = 0; i < ToProcess; i++, out_off += OutputTransmitStride) {
- if (TransmitCount % (gl_WorkGroupSize.z * ToProcess) == 0 || transmit + i < TransmitCount)
- out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = sample_rf_data(rf_offset + transmit + i);
+ }break;
+ case DecodeMode_Hadamard:{
+ if (u_first_pass) {
+ uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX;
+ uint channel = gl_GlobalInvocationID.y;
+ uint transmit = gl_GlobalInvocationID.z * ToProcess;
+ if (time_sample < InputTransmitStride) {
+ uint out_off = (InputChannelStride * channel + TransmitCount * time_sample) / RF_SAMPLES_PER_INDEX;
+ uint in_off = InputChannelStride * imageLoad(channel_mapping, int(channel)).x +
+ InputSampleStride * time_sample;
+ #if UseSharedMemory
+ in_off += InputTransmitStride * transmit;
+ out_off += transmit;
+ for (uint i = 0; i < ToProcess; i++, in_off += InputTransmitStride) {
+ if (transmit + i < TransmitCount)
+ out_rf_data[out_off + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
+ }
+ #else
+ for (uint i = 0; i < TransmitCount; i++, in_off += InputTransmitStride)
+ out_rf_data[out_off + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX];
+ #endif
}
- }break;
- case DecodeMode_Hadamard:{
+ } else {
#if UseSharedMemory
run_decode_large();
#else
run_decode_small();
#endif
- }break;
}
+ }break;
}
}