ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

Commit: 399d6e0e1f756a103dd3a23dd1f9c3b3e8aae63a
Parent: d5355cf2830265cdfe5ca686f0865a28ebc8a46b
Author: Randy Palamar
Date:   Tue,  9 Sep 2025 12:42:17 -0600

das: adapt to Float32 (RF) input

not currently tested, decode shader needs to be fixed for RF input

Diffstat:
Mbeamformer.c | 40+++++++++++++++++++++++++++-------------
Mbeamformer.h | 2++
Mshaders/das.glsl | 136++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
Mui.c | 2+-
4 files changed, 116 insertions(+), 64 deletions(-)

diff --git a/beamformer.c b/beamformer.c @@ -1,5 +1,6 @@ /* See LICENSE for license details. */ /* TODO(rnp): + * [ ]: bug: shaders with no parameters aren't currently properly selected * [ ]: make decode output real values for real inputs and complex values for complex inputs * - this means that das should have a RF version and an IQ version * - this will also flip the current hack to support demodulate after decode to @@ -177,8 +178,15 @@ frame_next(ComputeFrameIterator *bfi) return result; } +function b32 +beamformer_frame_compatible(BeamformerFrame *f, iv3 dim, GLenum gl_kind) +{ + b32 result = gl_kind == f->gl_kind && iv3_equal(dim, f->dim); + return result; +} + function void -alloc_beamform_frame(GLParams *gp, BeamformerFrame *out, iv3 out_dim, s8 name, Arena arena) +alloc_beamform_frame(GLParams *gp, BeamformerFrame *out, iv3 out_dim, GLenum gl_kind, s8 name, Arena arena) { out->dim.x = MAX(1, out_dim.x); out->dim.y = MAX(1, out_dim.y); @@ -195,6 +203,8 @@ alloc_beamform_frame(GLParams *gp, BeamformerFrame *out, iv3 out_dim, s8 name, A u32 max_dim = (u32)MAX(out->dim.x, MAX(out->dim.y, out->dim.z)); out->mips = (i32)ctz_u32(round_up_power_of_2(max_dim)) + 1; + out->gl_kind = gl_kind; + Stream label = arena_stream(arena); stream_append_s8(&label, name); stream_append_byte(&label, '['); @@ -203,7 +213,7 @@ alloc_beamform_frame(GLParams *gp, BeamformerFrame *out, iv3 out_dim, s8 name, A glDeleteTextures(1, &out->texture); glCreateTextures(GL_TEXTURE_3D, 1, &out->texture); - glTextureStorage3D(out->texture, out->mips, GL_RG32F, out->dim.x, out->dim.y, out->dim.z); + glTextureStorage3D(out->texture, out->mips, gl_kind, out->dim.x, out->dim.y, out->dim.z); glTextureParameteri(out->texture, GL_TEXTURE_MIN_FILTER, GL_NEAREST); glTextureParameteri(out->texture, GL_TEXTURE_MAG_FILTER, GL_NEAREST); @@ -457,6 +467,8 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb) if (demodulate) run_cuda_hilbert = 0; + if (demodulate || run_cuda_hilbert) cp->iq_pipeline = 1; + BeamformerDataKind data_kind = pb->pipeline.data_kind; cp->pipeline.shader_count = 0; for (u32 i = 0; i < pb->pipeline.shader_count; i++) { @@ -565,8 +577,6 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb) dp->output_transmit_stride *= decimation_rate; } - if (!demodulate) bp->demodulation_frequency = 0; - cp->decode_dispatch.x = (u32)ceil_f32((f32)bp->sample_count / DECODE_LOCAL_SIZE_X); cp->decode_dispatch.y = (u32)ceil_f32((f32)bp->channel_count / DECODE_LOCAL_SIZE_Y); cp->decode_dispatch.z = (u32)ceil_f32((f32)bp->acquisition_count / DECODE_LOCAL_SIZE_Z); @@ -620,8 +630,9 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb) cp->demod_dispatch.y = (u32)ceil_f32((f32)bp->channel_count / FILTER_LOCAL_SIZE_Y); cp->demod_dispatch.z = (u32)ceil_f32((f32)bp->acquisition_count / FILTER_LOCAL_SIZE_Z); - /* TODO(rnp): if IQ (* 8) else (* 4) */ - cp->rf_size = bp->sample_count * bp->channel_count * bp->acquisition_count * 8; + cp->rf_size = bp->sample_count * bp->channel_count * bp->acquisition_count; + if (demodulate || run_cuda_hilbert) cp->rf_size *= 8; + else cp->rf_size *= 4; /* TODO(rnp): UBO per filter stage */ BeamformerFilterUBO *flt = &cp->filter_ubo_data; @@ -679,9 +690,10 @@ beamformer_commit_parameter_block(BeamformerCtx *ctx, BeamformerComputePlan *cp, cp->output_points.E[2] = MAX(pb->parameters.output_points[2], 1); cp->average_frames = pb->parameters.output_points[3]; - if (cp->average_frames > 1 && !iv3_equal(cp->output_points, ctx->averaged_frames[0].dim)) { - alloc_beamform_frame(&ctx->gl, ctx->averaged_frames + 0, cp->output_points, s8("Averaged Frame"), arena); - alloc_beamform_frame(&ctx->gl, ctx->averaged_frames + 1, cp->output_points, s8("Averaged Frame"), arena); + GLenum gl_kind = cp->iq_pipeline ? GL_RG32F : GL_R32F; + if (cp->average_frames > 1 && !beamformer_frame_compatible(ctx->averaged_frames + 0, cp->output_points, gl_kind)) { + alloc_beamform_frame(&ctx->gl, ctx->averaged_frames + 0, cp->output_points, gl_kind, s8("Averaged Frame"), arena); + alloc_beamform_frame(&ctx->gl, ctx->averaged_frames + 1, cp->output_points, gl_kind, s8("Averaged Frame"), arena); } }break; case BeamformerParameterBlockRegion_ChannelMapping: @@ -815,9 +827,9 @@ do_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, BeamformerFrame if (fast) { glClearTexImage(frame->texture, 0, GL_RED, GL_FLOAT, 0); glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT); - glBindImageTexture(0, frame->texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F); + glBindImageTexture(0, frame->texture, 0, GL_TRUE, 0, GL_READ_WRITE, cp->iq_pipeline ? GL_RG32F : GL_R32F); } else { - glBindImageTexture(0, frame->texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F); + glBindImageTexture(0, frame->texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, cp->iq_pipeline ? GL_RG32F : GL_R32F); } u32 sparse_texture = cp->textures[BeamformerComputeTextureKind_SparseElements]; @@ -1175,8 +1187,10 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena, iptr gl_c start_renderdoc_capture(gl_context); BeamformerFrame *frame = work->compute_context.frame; - if (!iv3_equal(cp->output_points, frame->dim)) - alloc_beamform_frame(&ctx->gl, frame, cp->output_points, s8("Beamformed_Data"), *arena); + + GLenum gl_kind = cp->iq_pipeline ? GL_RG32F : GL_R32F; + if (!beamformer_frame_compatible(frame, cp->output_points, gl_kind)) + alloc_beamform_frame(&ctx->gl, frame, cp->output_points, gl_kind, s8("Beamformed_Data"), *arena); frame->min_coordinate = cp->min_coordinate; frame->max_coordinate = cp->max_coordinate; diff --git a/beamformer.h b/beamformer.h @@ -204,6 +204,7 @@ struct BeamformerComputePlan { u32 rf_size; i32 hadamard_order; + b32 iq_pipeline; v3 min_coordinate; v3 max_coordinate; @@ -319,6 +320,7 @@ struct BeamformerFrame { v3 max_coordinate; // metadata + GLenum gl_kind; u32 id; u32 compound_count; DASShaderKind das_shader_kind; diff --git a/shaders/das.glsl b/shaders/das.glsl @@ -1,15 +1,41 @@ /* See LICENSE for license details. */ +#if DataKind == DataKind_Float32 + #define SAMPLE_TYPE float + #define TEXTURE_KIND r32f + #define RESULT_TYPE_CAST(a) (a).x + #define OUTPUT_TYPE_CAST(a) vec4((a).x, 0, 0, 0) + #if (ShaderFlags & ShaderFlags_Fast) + #define RESULT_TYPE SAMPLE_TYPE + #else + #define RESULT_TYPE vec2 + #define RESULT_LAST_INDEX 1 + #endif +#elif DataKind == DataKind_Float32Complex + #define SAMPLE_TYPE vec2 + #define TEXTURE_KIND rg32f + #define RESULT_TYPE_CAST(a) (a).xy + #define OUTPUT_TYPE_CAST(a) vec4((a).xy, 0, 0) + #if (ShaderFlags & ShaderFlags_Fast) + #define RESULT_TYPE SAMPLE_TYPE + #else + #define RESULT_TYPE vec3 + #define RESULT_LAST_INDEX 2 + #endif +#else + #error DataKind unsupported for DAS +#endif + layout(std430, binding = 1) readonly restrict buffer buffer_1 { - vec2 rf_data[]; + SAMPLE_TYPE rf_data[]; }; const bool sparse = bool(ShaderFlags & ShaderFlags_Sparse); const bool interpolate = bool(ShaderFlags & ShaderFlags_Interpolate); #if (ShaderFlags & ShaderFlags_Fast) -layout(rg32f, binding = 0) restrict uniform image3D u_out_data_tex; +layout(TEXTURE_KIND, binding = 0) restrict uniform image3D u_out_data_tex; #else -layout(rg32f, binding = 0) writeonly restrict uniform image3D u_out_data_tex; +layout(TEXTURE_KIND, binding = 0) writeonly restrict uniform image3D u_out_data_tex; #endif layout(r16i, binding = 1) readonly restrict uniform iimage1D sparse_elements; @@ -17,20 +43,21 @@ layout(rg32f, binding = 2) readonly restrict uniform image1D focal_vectors; #define C_SPLINE 0.5 +#if DataKind == DataKind_Float32Complex vec2 rotate_iq(vec2 iq, float time) { - vec2 result = iq; - if (demodulation_frequency > 0) { - float arg = radians(360) * demodulation_frequency * time; - mat2 phasor = mat2( cos(arg), sin(arg), - -sin(arg), cos(arg)); - result = phasor * iq; - } + float arg = radians(360) * demodulation_frequency * time; + mat2 phasor = mat2( cos(arg), sin(arg), + -sin(arg), cos(arg)); + vec2 result = phasor * iq; return result; } +#else + #define rotate_iq(a, b) (a) +#endif /* NOTE: See: https://cubic.org/docs/hermite.htm */ -vec2 cubic(int base_index, float index) +SAMPLE_TYPE cubic(int base_index, float index) { const mat4 h = mat4( 2, -3, 0, 1, @@ -40,28 +67,33 @@ vec2 cubic(int base_index, float index) ); float tk, t = modf(index, tk); - vec2 samples[4] = { + SAMPLE_TYPE samples[4] = { rf_data[base_index + int(tk) - 1], rf_data[base_index + int(tk) + 0], rf_data[base_index + int(tk) + 1], rf_data[base_index + int(tk) + 2], }; - vec4 S = vec4(t * t * t, t * t, t, 1); - vec2 P1 = samples[1]; - vec2 P2 = samples[2]; - vec2 T1 = C_SPLINE * (P2 - samples[0]); - vec2 T2 = C_SPLINE * (samples[3] - P1); + vec4 S = vec4(t * t * t, t * t, t, 1); + SAMPLE_TYPE P1 = samples[1]; + SAMPLE_TYPE P2 = samples[2]; + SAMPLE_TYPE T1 = C_SPLINE * (P2 - samples[0]); + SAMPLE_TYPE T2 = C_SPLINE * (samples[3] - P1); +#if DataKind == DataKind_Float32 + vec4 C = vec4(P1.x, P2.x, T1.x, T2.x); + float result = dot(S, h * C); +#elif DataKind == DataKind_Float32Complex mat2x4 C = mat2x4(vec4(P1.x, P2.x, T1.x, T2.x), vec4(P1.y, P2.y, T1.y, T2.y)); vec2 result = S * h * C; +#endif return result; } -vec2 sample_rf(int channel, int transmit, float index) +SAMPLE_TYPE sample_rf(int channel, int transmit, float index) { - vec2 result = vec2(index >= 0.0f) * vec2((int(index) + 1 + int(interpolate)) < sample_count); - int base_index = int(channel * sample_count * acquisition_count + transmit * sample_count); + SAMPLE_TYPE result = SAMPLE_TYPE(index >= 0.0f) * SAMPLE_TYPE((int(index) + 1 + int(interpolate)) < sample_count); + int base_index = int(channel * sample_count * acquisition_count + transmit * sample_count); if (interpolate) result *= cubic(base_index, index); else result *= rf_data[base_index + int(round(index))]; result = rotate_iq(result, index / sampling_frequency); @@ -105,7 +137,7 @@ float cylindrical_wave_transmit_distance(vec3 point, float focal_depth, float tr } #if (ShaderFlags & ShaderFlags_Fast) -vec3 RCA(vec3 world_point) +RESULT_TYPE RCA(vec3 world_point) { bool tx_rows = bool((shader_flags & ShaderFlags_TxColumns) == 0); bool rx_rows = bool((shader_flags & ShaderFlags_RxColumns) == 0); @@ -122,7 +154,7 @@ vec3 RCA(vec3 world_point) transmit_angle, tx_rows); } - vec2 result = vec2(0); + RESULT_TYPE result = RESULT_TYPE(0); for (int channel = 0; channel < channel_count; channel++) { vec2 receive_vector = xdc_world_point - rca_plane_projection(vec3(channel * xdc_element_pitch, 0), rx_rows); float receive_distance = length(receive_vector); @@ -133,16 +165,16 @@ vec3 RCA(vec3 world_point) result += apodization * sample_rf(channel, u_channel, sidx); } } - return vec3(result, 0); + return result; } #else -vec3 RCA(vec3 world_point) +RESULT_TYPE RCA(vec3 world_point) { bool tx_rows = bool((shader_flags & ShaderFlags_TxColumns) == 0); bool rx_rows = bool((shader_flags & ShaderFlags_RxColumns) == 0); vec2 xdc_world_point = rca_plane_projection((xdc_transform * vec4(world_point, 1)).xyz, rx_rows); - vec3 sum = vec3(0); + RESULT_TYPE result = RESULT_TYPE(0); for (int transmit = 0; transmit < acquisition_count; transmit++) { vec2 focal_vector = imageLoad(focal_vectors, transmit).xy; float transmit_angle = radians(focal_vector.x); @@ -162,18 +194,18 @@ vec3 RCA(vec3 world_point) float apodization = apodize(f_number * radians(180) / abs(xdc_world_point.y) * receive_vector.x); if (apodization > 0) { - float sidx = sample_index(transmit_distance + length(receive_vector)); - vec2 value = apodization * sample_rf(rx_channel, transmit, sidx); - sum += vec3(value, length(value)); + float sidx = sample_index(transmit_distance + length(receive_vector)); + SAMPLE_TYPE value = apodization * sample_rf(rx_channel, transmit, sidx); + result += RESULT_TYPE(value, length(value)); } } } - return sum; + return result; } #endif #if (ShaderFlags & ShaderFlags_Fast) -vec3 HERCULES(vec3 world_point) +RESULT_TYPE HERCULES(vec3 world_point) { vec3 xdc_world_point = (xdc_transform * vec4(world_point, 1)).xyz; bool tx_rows = bool((shader_flags & ShaderFlags_TxColumns) == 0); @@ -190,7 +222,7 @@ vec3 HERCULES(vec3 world_point) transmit_angle, tx_rows); } - vec2 result = vec2(0); + RESULT_TYPE result = RESULT_TYPE(0); for (int transmit = int(sparse); transmit < acquisition_count; transmit++) { int tx_channel = sparse ? imageLoad(sparse_elements, transmit - int(sparse)).x : transmit; vec3 element_position; @@ -207,10 +239,10 @@ vec3 HERCULES(vec3 world_point) result += apodization * sample_rf(u_channel, transmit, sidx); } } - return vec3(result, 0); + return result; } #else -vec3 HERCULES(vec3 world_point) +RESULT_TYPE HERCULES(vec3 world_point) { vec3 xdc_world_point = (xdc_transform * vec4(world_point, 1)).xyz; bool tx_rows = bool((shader_flags & ShaderFlags_TxColumns) == 0); @@ -227,7 +259,7 @@ vec3 HERCULES(vec3 world_point) transmit_angle, tx_rows); } - vec3 result = vec3(0); + RESULT_TYPE result = RESULT_TYPE(0); for (int transmit = int(sparse); transmit < acquisition_count; transmit++) { int tx_channel = sparse ? imageLoad(sparse_elements, transmit - int(sparse)).x : transmit; for (int rx_channel = 0; rx_channel < channel_count; rx_channel++) { @@ -241,9 +273,9 @@ vec3 HERCULES(vec3 world_point) /* NOTE: tribal knowledge */ if (transmit == 0) apodization *= inversesqrt(acquisition_count); - float sidx = sample_index(transmit_distance + distance(xdc_world_point, element_position)); - vec2 value = apodization * sample_rf(rx_channel, transmit, sidx); - result += vec3(value, length(value)); + float sidx = sample_index(transmit_distance + distance(xdc_world_point, element_position)); + SAMPLE_TYPE value = apodization * sample_rf(rx_channel, transmit, sidx); + result += RESULT_TYPE(value, length(value)); } } } @@ -252,14 +284,14 @@ vec3 HERCULES(vec3 world_point) #endif #if (ShaderFlags & ShaderFlags_Fast) -vec3 FORCES(vec3 world_point) +RESULT_TYPE FORCES(vec3 world_point) { vec3 xdc_world_point = (xdc_transform * vec4(world_point, 1)).xyz; float receive_distance = distance(xdc_world_point.xz, vec2(u_channel * xdc_element_pitch.x, 0)); float apodization = apodize(f_number * radians(180) / abs(xdc_world_point.z) * (xdc_world_point.x - u_channel * xdc_element_pitch.x)); - vec2 result = vec2(0); + RESULT_TYPE result = RESULT_TYPE(0); if (apodization > 0) { for (int transmit = int(sparse); transmit < acquisition_count; transmit++) { int tx_channel = sparse ? imageLoad(sparse_elements, transmit - int(sparse)).x : transmit; @@ -269,14 +301,14 @@ vec3 FORCES(vec3 world_point) result += apodization * sample_rf(u_channel, transmit, sidx); } } - return vec3(result, 0); + return result; } #else -vec3 FORCES(vec3 world_point) +RESULT_TYPE FORCES(vec3 world_point) { vec3 xdc_world_point = (xdc_transform * vec4(world_point, 1)).xyz; - vec3 result = vec3(0); + RESULT_TYPE result = RESULT_TYPE(0); for (int rx_channel = 0; rx_channel < channel_count; rx_channel++) { float receive_distance = distance(xdc_world_point.xz, vec2(rx_channel * xdc_element_pitch.x, 0)); float apodization = apodize(f_number * radians(180) / abs(xdc_world_point.z) * @@ -286,9 +318,9 @@ vec3 FORCES(vec3 world_point) int tx_channel = sparse ? imageLoad(sparse_elements, transmit - int(sparse)).x : transmit; vec3 transmit_center = vec3(xdc_element_pitch * vec2(tx_channel, floor(channel_count / 2)), 0); - float sidx = sample_index(distance(xdc_world_point, transmit_center) + receive_distance); - vec2 value = apodization * sample_rf(rx_channel, tx_channel, sidx); - result += vec3(value, length(value)); + float sidx = sample_index(distance(xdc_world_point, transmit_center) + receive_distance); + SAMPLE_TYPE value = apodization * sample_rf(rx_channel, transmit, sidx); + result += RESULT_TYPE(value, length(value)); } } } @@ -303,9 +335,9 @@ void main() return; #if (ShaderFlags & ShaderFlags_Fast) - vec3 sum = vec3(imageLoad(u_out_data_tex, out_voxel).xy, 0); + RESULT_TYPE sum = RESULT_TYPE_CAST(imageLoad(u_out_data_tex, out_voxel)); #else - vec3 sum = vec3(0); + RESULT_TYPE sum = RESULT_TYPE(0); out_voxel += u_voxel_offset; #endif @@ -330,9 +362,13 @@ void main() }break; } + #if (ShaderFlags & ShaderFlags_Fast) == 0 /* TODO(rnp): scale such that brightness remains ~constant */ - if (bool(shader_flags & ShaderFlags_CoherencyWeighting)) - sum.xy *= sum.xy / (sum.z + float(sum.z == 0)); + if (bool(shader_flags & ShaderFlags_CoherencyWeighting)) { + float denominator = sum[RESULT_LAST_INDEX] + float(sum[RESULT_LAST_INDEX] == 0); + RESULT_TYPE_CAST(sum) *= RESULT_TYPE_CAST(sum) / denominator; + } + #endif - imageStore(u_out_data_tex, out_voxel, vec4(sum.xy, 0, 0)); + imageStore(u_out_data_tex, out_voxel, OUTPUT_TYPE_CAST(sum)); } diff --git a/ui.c b/ui.c @@ -1469,7 +1469,7 @@ ui_beamformer_frame_view_copy_frame(BeamformerUI *ui, BeamformerFrameView *new, mem_copy(new->frame, old->frame, sizeof(*new->frame)); new->frame->texture = 0; new->frame->next = 0; - alloc_beamform_frame(0, new->frame, old->frame->dim, s8("Frame Copy: "), ui->arena); + alloc_beamform_frame(0, new->frame, old->frame->dim, old->frame->gl_kind, s8("Frame Copy: "), ui->arena); glCopyImageSubData(old->frame->texture, GL_TEXTURE_3D, 0, 0, 0, 0, new->frame->texture, GL_TEXTURE_3D, 0, 0, 0, 0,