ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

Commit: 8bd1e76f64dbcb6774967455319a3fb107dcc418
Parent: f98319eadb856d31bd0b976d53d6a1a7fa6dfd32
Author: Randy Palamar
Date:   Thu,  9 Oct 2025 12:34:27 -0600

shaders/das: avoid cosine evaluation in early out path

This gives a ~2.1% performance boost for HERCULES which must
evaluate it for every element of the probe. Should be similar for
conventional RCA methods but I didn't measure.

Diffstat:
M shaders/das.glsl | 27 ++++++++++++++++-----------
1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/shaders/das.glsl b/shaders/das.glsl
@@ -111,14 +111,17 @@ float sample_index(const float distance)
 
 float apodize(const float arg)
 {
-	/* NOTE: used for constant F# dynamic receive apodization. This is implemented as:
+	/* IMPORTANT: do not move calculation of arg into this function. It will generate a
+	 * conditional move resulting in cos always being evaluated causing a slowdown */
+
+	/* NOTE: constant F# dynamic receive apodization. This is implemented as:
 	 *
 	 *                 /         |x_e - x_i|\
 	 *    a(x, z) = cos(F# * π * ----------- ) ^ 2
 	 *                 \         |z_e - z_i|/
 	 *
 	 * where x,z_e are transducer element positions and x,z_i are image positions. */
-	float a = cos(clamp(abs(arg), 0, 0.25 * radians(360)));
+	float a = cos(radians(180) * arg);
 	return a * a;
 }
 
@@ -184,11 +187,11 @@ RESULT_TYPE RCA(const vec3 world_point)
 	for (int rx_channel = 0; rx_channel < ChannelCount; rx_channel++) {
 		vec3 rx_center = vec3(rx_channel * xdc_element_pitch, 0);
 		vec2 receive_vector = xdc_world_point - rca_plane_projection(rx_center, rx_rows);
-		float apodization = apodize(FNumber * radians(180) / abs(xdc_world_point.y) * receive_vector.x);
+		float a_arg = abs(FNumber * receive_vector.x / abs(xdc_world_point.y));
 
-		if (apodization > 0) {
+		if (a_arg < 0.5f) {
 			float sidx = sample_index(transmit_distance + length(receive_vector));
-			SAMPLE_TYPE value = apodization * sample_rf(rx_channel, acquisition, sidx);
+			SAMPLE_TYPE value = apodize(a_arg) * sample_rf(rx_channel, acquisition, sidx);
 			result += RESULT_STORE(value, length(value));
 		}
 	}
@@ -215,9 +218,10 @@ RESULT_TYPE HERCULES(const vec3 world_point)
 			if (rx_cols) element_position = vec3(rx_channel, tx_channel, 0) * vec3(xdc_element_pitch, 0);
 			else element_position = vec3(tx_channel, rx_channel, 0) * vec3(xdc_element_pitch, 0);
 
-			float apodization = apodize(FNumber * radians(180) / abs(xdc_world_point.z) *
-			                            distance(xdc_world_point.xy, element_position.xy));
-			if (apodization > 0) {
+			float a_arg = abs(FNumber * distance(xdc_world_point.xy, element_position.xy) /
+			                  abs(xdc_world_point.z));
+			if (a_arg < 0.5f) {
+				float apodization = apodize(a_arg);
 				/* NOTE: tribal knowledge */
 				if (transmit == 0) apodization *= inversesqrt(AcquisitionCount);
 
@@ -239,9 +243,10 @@ RESULT_TYPE FORCES(const vec3 world_point)
 	vec3 xdc_world_point = (xdc_transform * vec4(world_point, 1)).xyz;
 	for (int rx_channel = rx_channel_start; rx_channel < rx_channel_end; rx_channel++) {
 		float receive_distance = distance(xdc_world_point.xz, vec2(rx_channel * xdc_element_pitch.x, 0));
-		float apodization = apodize(FNumber * radians(180) / abs(xdc_world_point.z) *
-		                            (xdc_world_point.x - rx_channel * xdc_element_pitch.x));
-		if (apodization > 0) {
+		float a_arg = abs(FNumber * (xdc_world_point.x - rx_channel * xdc_element_pitch.x) /
+		                  abs(xdc_world_point.z));
+		if (a_arg < 0.5f) {
+			float apodization = apodize(a_arg);
 			for (int transmit = Sparse; transmit < AcquisitionCount; transmit++) {
 				int tx_channel = bool(Sparse) ? imageLoad(sparse_elements, transmit - Sparse).x : transmit;
 				vec3 transmit_center = vec3(xdc_element_pitch * vec2(tx_channel, floor(ChannelCount / 2)), 0);