ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

Commit: 42c2b9f27aeb4d5aed5dd3d78cd4ddc7198acbbf
Parent: 3e23d5d371f9e3817889c940926f5429d66c7a7a
Author: Randy Palamar
Date:   Thu,  6 Nov 2025 22:35:29 -0700

shaders/decode: cleanup DecodeMode_None path

this will eventually get completely removed, but for now it is
used for RF mode channel mapping and conversion to float

Diffstat:
M beamformer.c | 36 ++++++++++++++++++------------------
M shaders/decode.glsl | 71 +++++++++++++++++++++++++++++++++++++++--------------------------------
2 files changed, 57 insertions(+), 50 deletions(-)

diff --git a/beamformer.c b/beamformer.c @@ -531,10 +531,8 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb) if (run_cuda_hilbert) sd->bake.flags |= BeamformerShaderDecodeFlags_DilateOutput; if (db->decode_mode == BeamformerDecodeMode_None) { - db->to_process = 1; - sd->layout.x = 64; - sd->layout.y = 1; - sd->layout.z = 1; + sd->layout = (uv3){{64, 1, 1}}; + sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x); sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y); sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z); @@ -547,9 +545,7 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb) b32 use_16z = db->transmit_count == 48 || db->transmit_count == 80 || db->transmit_count == 96 || db->transmit_count == 160; - sd->layout.x = 4; - sd->layout.y = 1; - sd->layout.z = use_16z? 16 : 32; + sd->layout = (uv3){{4, 1, use_16z? 16 : 32}}; sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x); sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y); @@ -560,9 +556,7 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb) /* NOTE(rnp): register caching. using more threads will cause the compiler to do * contortions to avoid spilling registers. 
using less gives higher performance */ /* TODO(rnp): may need to be adjusted to 16 on NVIDIA */ - sd->layout.x = 32; - sd->layout.y = 1; - sd->layout.z = 1; + sd->layout = (uv3){{32, 1, 1}}; sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x); sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y); @@ -645,9 +639,7 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb) } /* TODO(rnp): filter may need a different dispatch layout */ - sd->layout.x = 128; - sd->layout.y = 1; - sd->layout.z = 1; + sd->layout = (uv3){{128, 1, 1}}; sd->dispatch.x = (u32)ceil_f32((f32)sample_count / (f32)sd->layout.x); sd->dispatch.y = (u32)ceil_f32((f32)pb->parameters.channel_count / (f32)sd->layout.y); sd->dispatch.z = (u32)ceil_f32((f32)pb->parameters.acquisition_count / (f32)sd->layout.z); @@ -923,16 +915,22 @@ do_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, BeamformerFrame case BeamformerShaderKind_Decode:{ glBindImageTexture(0, cp->textures[BeamformerComputeTextureKind_Hadamard], 0, 0, 0, GL_READ_ONLY, GL_R32F); + BeamformerDecodeMode mode = cp->shader_descriptors[shader_slot].bake.Decode.decode_mode; if (shader_slot == 0) { - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, cc->ping_pong_ssbos[input_ssbo_idx]); glBindImageTexture(1, cp->textures[BeamformerComputeTextureKind_ChannelMapping], 0, 0, 0, GL_READ_ONLY, GL_R16I); - glProgramUniform1ui(program, DECODE_FIRST_PASS_UNIFORM_LOC, 1); - glDispatchCompute(dispatch.x, dispatch.y, dispatch.z); - glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + if (mode != BeamformerDecodeMode_None) { + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, cc->ping_pong_ssbos[input_ssbo_idx]); + glProgramUniform1ui(program, DECODE_FIRST_PASS_UNIFORM_LOC, 1); + + glDispatchCompute(dispatch.x, dispatch.y, dispatch.z); + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + } } - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, cc->ping_pong_ssbos[input_ssbo_idx]); + if (mode 
!= BeamformerDecodeMode_None) + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, cc->ping_pong_ssbos[input_ssbo_idx]); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, cc->ping_pong_ssbos[output_ssbo_idx]); glProgramUniform1ui(program, DECODE_FIRST_PASS_UNIFORM_LOC, 0); @@ -1496,6 +1494,8 @@ DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step) BeamformerSharedMemory *sm = ctx->shared_memory.region; if (atomic_load_u32(sm->locks + BeamformerSharedMemoryLockKind_UploadRF)) os_wake_waiters(&ctx->os.upload_worker.sync_variable); + if (atomic_load_u32(sm->locks + BeamformerSharedMemoryLockKind_DispatchCompute)) + os_wake_waiters(&ctx->os.compute_worker.sync_variable); BeamformerFrame *frame = ctx->latest_frame; BeamformerViewPlaneTag tag = frame? frame->view_plane_tag : 0; diff --git a/shaders/decode.glsl b/shaders/decode.glsl @@ -152,45 +152,52 @@ void run_decode_small(void) void main() { - uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX; - uint channel = gl_GlobalInvocationID.y; - uint transmit = gl_GlobalInvocationID.z * ToProcess; - - uint rf_offset = (InputChannelStride * channel + TransmitCount * time_sample) / RF_SAMPLES_PER_INDEX; - if (u_first_pass) { - if (time_sample < InputTransmitStride) { - uint in_off = InputChannelStride * imageLoad(channel_mapping, int(channel)).x + - InputSampleStride * time_sample; - #if DecodeMode == DecodeMode_None || UseSharedMemory - in_off += InputTransmitStride * transmit; - rf_offset += transmit; - for (uint i = 0; i < ToProcess; i++, in_off += InputTransmitStride) { - if (transmit + i < TransmitCount) - out_rf_data[rf_offset + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX]; - } - #else - for (uint i = 0; i < TransmitCount; i++, in_off += InputTransmitStride) - out_rf_data[rf_offset + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX]; - #endif + switch (DecodeMode) { + case DecodeMode_None:{ + uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX; + uint channel = gl_GlobalInvocationID.y; + uint 
transmit = gl_GlobalInvocationID.z; + + if (time_sample < OutputTransmitStride) { + uint in_off = (InputChannelStride * imageLoad(channel_mapping, int(channel)).x + + InputTransmitStride * transmit + + InputSampleStride * time_sample) / RF_SAMPLES_PER_INDEX; + + uint out_off = (OutputChannelStride * channel + + OutputTransmitStride * transmit + + OutputSampleStride * time_sample) / OUTPUT_SAMPLES_PER_INDEX; + + out_data[out_off] = sample_rf_data(in_off); } - } else { - switch (DecodeMode) { - case DecodeMode_None:{ - uint out_off = OutputChannelStride * channel + - OutputTransmitStride * transmit + - OutputSampleStride * time_sample; - for (uint i = 0; i < ToProcess; i++, out_off += OutputTransmitStride) { - if (TransmitCount % (gl_WorkGroupSize.z * ToProcess) == 0 || transmit + i < TransmitCount) - out_data[out_off / OUTPUT_SAMPLES_PER_INDEX] = sample_rf_data(rf_offset + transmit + i); + }break; + case DecodeMode_Hadamard:{ + if (u_first_pass) { + uint time_sample = gl_GlobalInvocationID.x * RF_SAMPLES_PER_INDEX; + uint channel = gl_GlobalInvocationID.y; + uint transmit = gl_GlobalInvocationID.z * ToProcess; + if (time_sample < InputTransmitStride) { + uint out_off = (InputChannelStride * channel + TransmitCount * time_sample) / RF_SAMPLES_PER_INDEX; + uint in_off = InputChannelStride * imageLoad(channel_mapping, int(channel)).x + + InputSampleStride * time_sample; + #if UseSharedMemory + in_off += InputTransmitStride * transmit; + out_off += transmit; + for (uint i = 0; i < ToProcess; i++, in_off += InputTransmitStride) { + if (transmit + i < TransmitCount) + out_rf_data[out_off + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX]; + } + #else + for (uint i = 0; i < TransmitCount; i++, in_off += InputTransmitStride) + out_rf_data[out_off + i] = rf_data[in_off / RF_SAMPLES_PER_INDEX]; + #endif } - }break; - case DecodeMode_Hadamard:{ + } else { #if UseSharedMemory run_decode_large(); #else run_decode_small(); #endif - }break; } + }break; } }