ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

Commit: d23026486d5af255051d3d4c61d157845931917a
Parent: 66ad2b2da60bf90fbf833e27040010be19ed5e69
Author: Randy Palamar
Date:   Fri, 18 Jul 2025 10:51:30 -0600

core: shave 40% off DAS runtime

for FORCES and HERCULES the speed was improved by beamforming
receive channel by receive channel after resetting the order of
the rf data to the order it was originally received in. This order
is more cache friendly. VLS/TPW most likely perform better in the
old shader because focal vectors can be better utilized from
cached texture memory but it wasn't explored in detail.

Other options making use of GPU shared memory all performed way
worse than the existing code.

New shader is currently auto selected if we can determine from the
rest of the parameters that it is supported.

Diffstat:
Mbeamformer.c | 210++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
Mbeamformer.h | 1+
Mbeamformer_parameters.h | 24+++++++++++++++++-------
Mmath.c | 62++++++++++++++++++++++++++++++++++++++++++++------------------
Mopengl.h | 1+
Mshaders/das.glsl | 367+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
Mshaders/decode.glsl | 2+-
Mstatic.c | 4++--
Mtests/throughput.c | 2+-
Mui.c | 4++--
10 files changed, 445 insertions(+), 232 deletions(-)

diff --git a/beamformer.c b/beamformer.c @@ -266,13 +266,58 @@ compute_cursor_finished(struct compute_cursor *cursor) return result; } +function b32 +das_can_use_fast(BeamformerParameters *bp) +{ + b32 result = !bp->coherency_weighting; + return result; +} + +function m4 +das_voxel_transform_matrix(BeamformerParameters *bp) +{ + v3 min = v4_from_f32_array(bp->output_min_coordinate).xyz; + v3 max = v4_from_f32_array(bp->output_max_coordinate).xyz; + v3 extent = v3_abs(v3_sub(max, min)); + v3 points = {{(f32)bp->output_points[0], (f32)bp->output_points[1], (f32)bp->output_points[2]}}; + + m4 T1 = m4_translation(v3_scale(v3_sub(points, (v3){{1.0f, 1.0f, 1.0f}}), -0.5f)); + m4 T2 = m4_translation(v3_add(min, v3_scale(extent, 0.5f))); + m4 S = m4_scale(v3_div(extent, points)); + + m4 R; + switch (bp->das_shader_id) { + case DASShaderKind_FORCES: + case DASShaderKind_UFORCES: + case DASShaderKind_FLASH: + { + R = m4_identity(); + S.c[1].E[1] = 0; + T2.c[3].E[1] = 0; + }break; + case DASShaderKind_HERCULES: + case DASShaderKind_UHERCULES: + case DASShaderKind_RCA_TPW: + case DASShaderKind_RCA_VLS: + { + R = m4_rotation_about_z(bp->beamform_plane ? 0.0f : 0.25f); + if (!(points.x > 1 && points.y > 1 && points.z > 1)) + T2.c[3].E[1] = bp->off_axis_pos; + }break; + default:{ R = m4_identity(); }break; + } + m4 result = m4_mul(R, m4_mul(T2, m4_mul(S, T1))); + return result; +} + function void do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformerComputeFrame *frame, BeamformerShaderKind shader) { ComputeShaderCtx *csctx = &ctx->csctx; BeamformerSharedMemory *sm = ctx->shared_memory.region; - glUseProgram(csctx->programs[shader]); + u32 program = csctx->programs[shader]; + glUseProgram(program); u32 output_ssbo_idx = !csctx->last_output_ssbo_index; u32 input_ssbo_idx = csctx->last_output_ssbo_index; @@ -291,7 +336,7 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformerComputeFrame *frame local_size_x *= 2; iz raw_size = csctx->rf_raw_size; - glProgramUniform1ui(csctx->programs[shader], DECODE_FIRST_PASS_UNIFORM_LOC, 1); + glProgramUniform1ui(program, DECODE_FIRST_PASS_UNIFORM_LOC, 1); glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo, 0, raw_size); glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 2, csctx->raw_data_ssbo, raw_size, raw_size); glDispatchCompute((u32)ceil_f32((f32)csctx->dec_data_dim.x / local_size_x), @@ -300,7 +345,7 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformerComputeFrame *frame glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); - glProgramUniform1ui(csctx->programs[shader], DECODE_FIRST_PASS_UNIFORM_LOC, 0); + glProgramUniform1ui(program, DECODE_FIRST_PASS_UNIFORM_LOC, 0); glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo, raw_size, raw_size); glDispatchCompute((u32)ceil_f32((f32)csctx->dec_data_dim.x / local_size_x), (u32)ceil_f32((f32)csctx->dec_data_dim.y / DECODE_LOCAL_SIZE_Y), @@ -338,42 +383,77 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformerComputeFrame *frame glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); } }break; - case BeamformerShaderKind_DASCompute:{ + case BeamformerShaderKind_DAS: + case BeamformerShaderKind_DASFast: + { + if (shader == BeamformerShaderKind_DASFast) { + glClearTexImage(frame->frame.texture, 0, GL_RED, GL_FLOAT, 0); + glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT); + glBindImageTexture(0, frame->frame.texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F); + } else { + glBindImageTexture(0, frame->frame.texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F); + } + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]); - glBindImageTexture(0, frame->frame.texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F); - glBindImageTexture(1, csctx->sparse_elements_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I); - glBindImageTexture(2, csctx->focal_vectors_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RG32F); - - glProgramUniform1ui(csctx->programs[shader], DAS_CYCLE_T_UNIFORM_LOC, cycle_t++); - - #if 1 - /* TODO(rnp): compute max_points_per_dispatch based on something like a - * transmit_count * channel_count product */ - u32 max_points_per_dispatch = KB(64); - struct compute_cursor cursor = start_compute_cursor(frame->frame.dim, max_points_per_dispatch); - f32 percent_per_step = (f32)cursor.points_per_dispatch / (f32)cursor.total_points; - csctx->processing_progress = -percent_per_step; - for (iv3 offset = {0}; - !compute_cursor_finished(&cursor); - offset = step_compute_cursor(&cursor)) - { - csctx->processing_progress += percent_per_step; - /* IMPORTANT(rnp): prevents OS from coalescing and killing our shader */ - glFinish(); - glProgramUniform3iv(csctx->programs[shader], DAS_VOXEL_OFFSET_UNIFORM_LOC, - 1, offset.E); - glDispatchCompute(cursor.dispatch.x, cursor.dispatch.y, cursor.dispatch.z); + glBindImageTexture(1, csctx->sparse_elements_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I); + glBindImageTexture(2, csctx->focal_vectors_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RG32F); + + m4 voxel_transform = das_voxel_transform_matrix(&sm->parameters); + glProgramUniform1ui(program, DAS_CYCLE_T_UNIFORM_LOC, cycle_t++); + glProgramUniformMatrix4fv(program, DAS_VOXEL_MATRIX_LOC, 1, 0, voxel_transform.E); + + iv3 dim = frame->frame.dim; + if (shader == BeamformerShaderKind_DASFast) { + i32 loop_end; + if (sm->parameters.das_shader_id == DASShaderKind_RCA_VLS || + sm->parameters.das_shader_id == DASShaderKind_RCA_TPW) + { + /* NOTE(rnp): to avoid repeatedly sampling the whole focal vectors + * texture we loop over transmits for VLS/TPW */ + loop_end = (i32)sm->parameters.dec_data_dim[2]; + } else { + loop_end = (i32)sm->parameters.dec_data_dim[1]; + } + f32 percent_per_step = 1.0f / (f32)loop_end; + csctx->processing_progress = -percent_per_step; + for (i32 index = 0; index < loop_end; index++) { + csctx->processing_progress += percent_per_step; + /* IMPORTANT(rnp): prevents OS from coalescing and killing our shader */ + glFinish(); + glProgramUniform1i(program, DAS_FAST_CHANNEL_UNIFORM_LOC, index); + glDispatchCompute((u32)ceil_f32((f32)dim.x / DAS_FAST_LOCAL_SIZE_X), + (u32)ceil_f32((f32)dim.y / DAS_FAST_LOCAL_SIZE_Y), + (u32)ceil_f32((f32)dim.z / DAS_FAST_LOCAL_SIZE_Z)); + glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); + } + } else { + #if 1 + /* TODO(rnp): compute max_points_per_dispatch based on something like a + * transmit_count * channel_count product */ + u32 max_points_per_dispatch = KB(64); + struct compute_cursor cursor = start_compute_cursor(dim, max_points_per_dispatch); + f32 percent_per_step = (f32)cursor.points_per_dispatch / (f32)cursor.total_points; + csctx->processing_progress = -percent_per_step; + for (iv3 offset = {0}; + !compute_cursor_finished(&cursor); + offset = step_compute_cursor(&cursor)) + { + csctx->processing_progress += percent_per_step; + /* IMPORTANT(rnp): prevents OS from coalescing and killing our shader */ + glFinish(); + glProgramUniform3iv(program, DAS_VOXEL_OFFSET_UNIFORM_LOC, 1, offset.E); + glDispatchCompute(cursor.dispatch.x, cursor.dispatch.y, cursor.dispatch.z); + } + #else + /* NOTE(rnp): use this for testing tiling code. The performance of the above path + * should be the same as this path if everything is working correctly */ + iv3 compute_dim_offset = {0}; + glProgramUniform3iv(program, DAS_VOXEL_OFFSET_UNIFORM_LOC, 1, compute_dim_offset.E); + glDispatchCompute((u32)ceil_f32((f32)dim.x / DAS_LOCAL_SIZE_X), + (u32)ceil_f32((f32)dim.y / DAS_LOCAL_SIZE_Y), + (u32)ceil_f32((f32)dim.z / DAS_LOCAL_SIZE_Z)); + #endif } - #else - /* NOTE(rnp): use this for testing tiling code. The performance of the above path - * should be the same as this path if everything is working correctly */ - iv3 compute_dim_offset = {0}; - glProgramUniform3iv(csctx->programs[shader], DAS_VOXEL_OFFSET_UNIFORM_LOC, - 1, compute_dim_offset.E); - glDispatchCompute((u32)ceil_f32((f32)frame->frame.dim.x / DAS_LOCAL_SIZE_X), - (u32)ceil_f32((f32)frame->frame.dim.y / DAS_LOCAL_SIZE_Y), - (u32)ceil_f32((f32)frame->frame.dim.z / DAS_LOCAL_SIZE_Z)); - #endif glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT|GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); }break; case BeamformerShaderKind_Sum:{ @@ -414,14 +494,30 @@ shader_text_with_header(ShaderReloadContext *ctx, OS *os, Arena *arena) stream_append_s8s(&sb, s8("#version 460 core\n\n"), ctx->header); switch (ctx->kind) { - case BeamformerShaderKind_DASCompute:{ + case BeamformerShaderKind_DAS: + case BeamformerShaderKind_DASFast: + { + if (ctx->kind == BeamformerShaderKind_DAS) { + stream_append_s8(&sb, s8("" + "layout(local_size_x = " str(DAS_LOCAL_SIZE_X) ", " + "local_size_y = " str(DAS_LOCAL_SIZE_Y) ", " + "local_size_z = " str(DAS_LOCAL_SIZE_Z) ") in;\n\n" + "#define DAS_FAST 0\n\n" + "layout(location = " str(DAS_VOXEL_OFFSET_UNIFORM_LOC) ") uniform ivec3 u_voxel_offset;\n" + )); + } else { + stream_append_s8(&sb, s8("" + "layout(local_size_x = " str(DAS_FAST_LOCAL_SIZE_X) ", " + "local_size_y = " str(DAS_FAST_LOCAL_SIZE_Y) ", " + "local_size_z = " str(DAS_FAST_LOCAL_SIZE_Z) ") in;\n\n" + "#define DAS_FAST 1\n\n" + "layout(location = " str(DAS_FAST_CHANNEL_UNIFORM_LOC) ") uniform int u_channel;\n" + )); + } #define X(type, id, pretty, fixed_tx) "#define DAS_ID_" #type " " #id "\n" stream_append_s8(&sb, s8("" - "layout(local_size_x = " str(DAS_LOCAL_SIZE_X) ", " - "local_size_y = " str(DAS_LOCAL_SIZE_Y) ", " - "local_size_z = " str(DAS_LOCAL_SIZE_Z) ") in;\n\n" - "layout(location = " str(DAS_VOXEL_OFFSET_UNIFORM_LOC) ") uniform ivec3 u_voxel_offset;\n" - "layout(location = " str(DAS_CYCLE_T_UNIFORM_LOC) ") uniform uint u_cycle_t;\n\n" + "layout(location = " str(DAS_VOXEL_MATRIX_LOC) ") uniform mat4 u_voxel_transform;\n" + "layout(location = " str(DAS_CYCLE_T_UNIFORM_LOC) ") uniform uint u_cycle_t;\n\n" DAS_TYPES )); #undef X @@ -532,6 +628,15 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co src->shader = cs->programs + BeamformerShaderKind_Decode; } + if (src->kind == BeamformerShaderKind_DAS) { + /* TODO(rnp): think of a better way of doing this */ + src->kind = BeamformerShaderKind_DASFast; + src->shader = cs->programs + BeamformerShaderKind_DASFast; + success &= reload_compute_shader(ctx, src, s8(" (Fast)"), arena); + src->kind = BeamformerShaderKind_DAS; + src->shader = cs->programs + BeamformerShaderKind_DAS; + } + if (success && ctx->csctx.raw_data_ssbo) { /* TODO(rnp): this check seems off */ can_commit = 0; @@ -666,9 +771,24 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co frame->frame.das_shader_kind = bp->das_shader_id; frame->frame.compound_count = bp->dec_data_dim[2]; - b32 did_sum_shader = 0; - u32 stage_count = sm->compute_stages_count; + u32 stage_count = sm->compute_stages_count; BeamformerShaderKind *stages = sm->compute_stages; + + for (u32 i = 0; i < stage_count; i++) { + switch (stages[i]) { + case BeamformerShaderKind_DASFast:{ + if (!das_can_use_fast(bp)) + stages[i] = BeamformerShaderKind_DAS; + }break; + case BeamformerShaderKind_DAS:{ + if (das_can_use_fast(bp)) + stages[i] = BeamformerShaderKind_DASFast; + }break; + default:{}break; + } + } + + b32 did_sum_shader = 0; for (u32 i = 0; i < stage_count; i++) { did_sum_shader |= stages[i] == BeamformerShaderKind_Sum; glBeginQuery(GL_TIME_ELAPSED, cs->shader_timer_ids[i]); diff --git a/beamformer.h b/beamformer.h @@ -203,6 +203,7 @@ struct BeamformerComputeFrame { X(MAX_TEXTURE_SIZE, max_2d_texture_dim, "") \ X(MAX_3D_TEXTURE_SIZE, max_3d_texture_dim, "") \ X(MAX_SHADER_STORAGE_BLOCK_SIZE, max_ssbo_size, "") \ + X(MAX_COMPUTE_SHARED_MEMORY_SIZE, max_shared_memory_size, "") \ X(MAX_UNIFORM_BLOCK_SIZE, max_ubo_size, "") \ X(MAX_SERVER_WAIT_TIMEOUT, max_server_wait_time, " [ns]") diff --git a/beamformer_parameters.h b/beamformer_parameters.h @@ -14,7 +14,7 @@ #define COMPUTE_SHADERS \ X(CudaDecode, 0, "", 0, "CUDA Decode") \ X(CudaHilbert, 1, "", 0, "CUDA Hilbert") \ - X(DASCompute, 2, "das", 1, "DAS") \ + X(DAS, 2, "das", 1, "DAS") \ X(Decode, 3, "decode", 1, "Decode") \ X(DecodeFloat, 4, "", 1, "Decode (F32)") \ X(DecodeFloatComplex, 5, "", 1, "Decode (F32C)") \ @@ -22,9 +22,13 @@ X(MinMax, 7, "min_max", 0, "Min/Max") \ X(Sum, 8, "sum", 0, "Sum") +#define COMPUTE_SHADERS_INTERNAL \ + COMPUTE_SHADERS \ + X(DASFast, 9, "", 1, "DAS (Fast)") + typedef enum { #define X(e, n, s, h, pn) BeamformerShaderKind_##e = n, - COMPUTE_SHADERS + COMPUTE_SHADERS_INTERNAL #undef X BeamformerShaderKind_Render3D, BeamformerShaderKind_Count, @@ -78,12 +82,18 @@ typedef enum { #define DECODE_FIRST_PASS_UNIFORM_LOC 1 -#define DAS_LOCAL_SIZE_X 32 -#define DAS_LOCAL_SIZE_Y 1 -#define DAS_LOCAL_SIZE_Z 32 +#define DAS_LOCAL_SIZE_X 16 +#define DAS_LOCAL_SIZE_Y 1 +#define DAS_LOCAL_SIZE_Z 16 + +#define DAS_FAST_LOCAL_SIZE_X 16 +#define DAS_FAST_LOCAL_SIZE_Y 1 +#define DAS_FAST_LOCAL_SIZE_Z 16 -#define DAS_VOXEL_OFFSET_UNIFORM_LOC 2 -#define DAS_CYCLE_T_UNIFORM_LOC 3 +#define DAS_VOXEL_OFFSET_UNIFORM_LOC 2 +#define DAS_CYCLE_T_UNIFORM_LOC 3 +#define DAS_VOXEL_MATRIX_LOC 4 +#define DAS_FAST_CHANNEL_UNIFORM_LOC 5 #define MIN_MAX_MIPS_LEVEL_UNIFORM_LOC 1 #define SUM_PRESCALE_UNIFORM_LOC 1 diff --git a/math.c b/math.c @@ -212,6 +212,16 @@ cross(v3 a, v3 b) } function v3 +v3_abs(v3 a) +{ + v3 result; + result.x = ABS(a.x); + result.y = ABS(a.y); + result.z = ABS(a.z); + return result; +} + +function v3 v3_scale(v3 a, f32 scale) { v3 result; @@ -238,6 +248,16 @@ v3_sub(v3 a, v3 b) return result; } +function v3 +v3_div(v3 a, v3 b) +{ + v3 result; + result.x = a.x / b.x; + result.y = a.y / b.y; + result.z = a.z / b.z; + return result; +} + function f32 v3_dot(v3 a, v3 b) { @@ -359,21 +379,14 @@ m4_row(m4 a, u32 row) return result; } -function v4 -m4_column(m4 a, u32 column) -{ - v4 result = a.c[column]; - return result; -} - function m4 m4_mul(m4 a, m4 b) { m4 result; - for (u32 i = 0; i < countof(result.E); i++) { - u32 base = i / 4; - u32 sub = i % 4; - result.E[i] = v4_dot(m4_row(a, base), m4_column(b, sub)); + for (u32 i = 0; i < 4; i++) { + for (u32 j = 0; j < 4; j++) { + result.c[i].E[j] = v4_dot(m4_row(a, j), b.c[i]); + } } return result; } @@ -414,10 +427,10 @@ function m4 m4_translation(v3 delta) { m4 result; - result.c[0] = (v4){{1, 0, 0, delta.x}}; - result.c[1] = (v4){{0, 1, 0, delta.y}}; - result.c[2] = (v4){{0, 0, 1, delta.z}}; - result.c[3] = (v4){{0, 0, 0, 1}}; + result.c[0] = (v4){{1, 0, 0, 0}}; + result.c[1] = (v4){{0, 1, 0, 0}}; + result.c[2] = (v4){{0, 0, 1, 0}}; + result.c[3] = (v4){{delta.x, delta.y, delta.z, 1}}; return result; } @@ -433,6 +446,19 @@ m4_scale(v3 scale) } function m4 +m4_rotation_about_z(f32 turns) +{ + f32 sa = sin_f32(turns * 2 * PI); + f32 ca = cos_f32(turns * 2 * PI); + m4 result; + result.c[0] = (v4){{ca, -sa, 0, 0}}; + result.c[1] = (v4){{sa, ca, 0, 0}}; + result.c[2] = (v4){{0, 0, 1, 0}}; + result.c[3] = (v4){{0, 0, 0, 1}}; + return result; +} + +function m4 m4_rotation_about_y(f32 turns) { f32 sa = sin_f32(turns * 2 * PI); @@ -448,10 +474,10 @@ m4_rotation_about_y(f32 turns) function m4 y_aligned_volume_transform(v3 extent, v3 translation, f32 rotation_turns) { - m4 S = m4_scale(extent); - m4 R = m4_rotation_about_y(rotation_turns); m4 T = m4_translation(translation); - m4 result = m4_mul(m4_mul(R, S), T); + m4 R = m4_rotation_about_y(rotation_turns); + m4 S = m4_scale(extent); + m4 result = m4_mul(T, m4_mul(R, S)); return result; } diff --git a/opengl.h b/opengl.h @@ -30,6 +30,7 @@ #define GL_R8I 0x8231 #define GL_R16I 0x8233 #define GL_DEBUG_OUTPUT_SYNCHRONOUS 0x8242 +#define GL_MAX_COMPUTE_SHARED_MEMORY_SIZE 0x8262 #define GL_BUFFER 0x82E0 #define GL_PROGRAM 0x82E2 #define GL_MIRRORED_REPEAT 0x8370 diff --git a/shaders/das.glsl b/shaders/das.glsl @@ -3,7 +3,12 @@ layout(std430, binding = 1) readonly restrict buffer buffer_1 { vec2 rf_data[]; }; +#if DAS_FAST +layout(rg32f, binding = 0) restrict uniform image3D u_out_data_tex; +#else layout(rg32f, binding = 0) writeonly restrict uniform image3D u_out_data_tex; +#endif + layout(r16i, binding = 1) readonly restrict uniform iimage1D sparse_elements; layout(rg32f, binding = 2) readonly restrict uniform image1D focal_vectors; @@ -16,7 +21,7 @@ layout(rg32f, binding = 2) readonly restrict uniform image1D focal_vectors; #define TX_MODE_RX_COLS(a) (((a) & 1) != 0) /* NOTE: See: https://cubic.org/docs/hermite.htm */ -vec2 cubic(int ridx, float x) +vec2 cubic(int base_index, float x) { mat4 h = mat4( 2, -3, 0, 1, @@ -29,240 +34,290 @@ vec2 cubic(int ridx, float x) float t = x - floor(x); vec4 S = vec4(t * t * t, t * t, t, 1); - vec2 P1 = rf_data[ridx + xk]; - vec2 P2 = rf_data[ridx + xk + 1]; - vec2 T1 = C_SPLINE * (P2 - rf_data[ridx + xk - 1]); - vec2 T2 = C_SPLINE * (rf_data[ridx + xk + 2] - P1); + vec2 samples[4] = { + rf_data[base_index + xk - 1], + rf_data[base_index + xk + 0], + rf_data[base_index + xk + 1], + rf_data[base_index + xk + 2], + }; + + vec2 P1 = samples[1]; + vec2 P2 = samples[2]; + vec2 T1 = C_SPLINE * (P2 - samples[0]); + vec2 T2 = C_SPLINE * (samples[3] - P1); mat2x4 C = mat2x4(vec4(P1.x, P2.x, T1.x, T2.x), vec4(P1.y, P2.y, T1.y, T2.y)); return S * h * C; } -vec2 sample_rf(int ridx, float t) +vec2 sample_rf(int channel, int transmit, float t) { vec2 result; - if (interpolate) result = cubic(ridx, t); - else result = rf_data[ridx + int(floor(t))]; + int base_index = int(channel * dec_data_dim.x * dec_data_dim.z + transmit * dec_data_dim.x); + if (interpolate) result = cubic(base_index, t); + else result = rf_data[base_index + int(floor(t))]; return result; } -vec3 calc_image_point(vec3 voxel) -{ - vec3 out_data_dim = vec3(imageSize(u_out_data_tex)); - vec4 output_size = abs(output_max_coordinate - output_min_coordinate); - vec3 image_point = output_min_coordinate.xyz + voxel * output_size.xyz / out_data_dim; - - switch (das_shader_id) { - case DAS_ID_FLASH: - case DAS_ID_FORCES: - case DAS_ID_UFORCES: - /* TODO: fix the math so that the image plane can be aritrary */ - image_point.y = 0; - break; - case DAS_ID_HERCULES: - case DAS_ID_RCA_TPW: - case DAS_ID_RCA_VLS: - /* TODO(rnp): this can be removed when we use an abitrary plane transform */ - if (!all(greaterThan(out_data_dim, vec3(1, 1, 1)))) - image_point.y = off_axis_pos; - break; - } - - return image_point; -} - -vec2 apodize(vec2 value, float apodization_arg, float distance) +float sample_index(float distance) { - /* NOTE: apodization value for this transducer element */ - float a = cos(clamp(abs(apodization_arg * distance), 0, 0.25 * radians(360))); - return value * a * a; + float time = distance / speed_of_sound + time_offset; + return time * sampling_frequency; } -vec3 orientation_projection(vec3 point, bool rows) +float apodize(float arg) { - return point * vec3(!rows, rows, 1); + /* NOTE: used for constant F# dynamic receive apodization. This is implemented as: + * + * / |x_e - x_i|\ + * a(x, z) = cos(F# * π * ----------- ) ^ 2 + * \ |z_e - z_i|/ + * + * where x,z_e are transducer element positions and x,z_i are image positions. */ + float a = cos(clamp(abs(arg), 0, 0.25 * radians(360))); + return a * a; } -vec3 world_space_to_rca_space(vec3 image_point, bool rx_rows) +vec2 rca_plane_projection(vec3 point, bool rows) { - return orientation_projection((xdc_transform * vec4(image_point, 1)).xyz, rx_rows); + vec2 result = vec2(point[int(rows)], point[2]); + return result; } -float sample_index(float distance) +float plane_wave_transmit_distance(vec3 point, float transmit_angle, bool tx_rows) { - float time = distance / speed_of_sound + time_offset; - return time * sampling_frequency; + return dot(rca_plane_projection(point, tx_rows), vec2(sin(transmit_angle), cos(transmit_angle))); } -float planewave_transmit_distance(vec3 point, float transmit_angle, bool tx_rows) +float cylindrical_wave_transmit_distance(vec3 point, float focal_depth, float transmit_angle, bool tx_rows) { - return dot(orientation_projection(point, tx_rows), - vec3(sin(transmit_angle), sin(transmit_angle), cos(transmit_angle))); + vec2 f = focal_depth * vec2(sin(transmit_angle), cos(transmit_angle)); + return distance(rca_plane_projection(point, tx_rows), f); } -float cylindricalwave_transmit_distance(vec3 point, float focal_depth, float transmit_angle, bool tx_rows) +#if DAS_FAST +vec3 RCA(vec3 world_point) { - vec3 f = focal_depth * vec3(sin(transmit_angle), sin(transmit_angle), cos(transmit_angle)); - return length(orientation_projection(point - f, tx_rows)); -} + bool tx_rows = !TX_MODE_TX_COLS(transmit_mode); + bool rx_rows = !TX_MODE_RX_COLS(transmit_mode); + vec2 xdc_world_point = rca_plane_projection((xdc_transform * vec4(world_point, 1)).xyz, rx_rows); + vec2 focal_vector = imageLoad(focal_vectors, u_channel).xy; + float transmit_angle = radians(focal_vector.x); + float focal_depth = focal_vector.y; -vec3 RCA(vec3 image_point, vec3 delta, float apodization_arg) -{ - int ridx = 0; - int direction = beamform_plane; - if (direction != TX_ROWS) image_point = image_point.yxz; + float transmit_distance; + if (isinf(focal_depth)) { + transmit_distance = plane_wave_transmit_distance(world_point, transmit_angle, tx_rows); + } else { + transmit_distance = cylindrical_wave_transmit_distance(world_point, focal_depth, + transmit_angle, tx_rows); + } - bool tx_col = TX_MODE_TX_COLS(transmit_mode); - bool rx_col = TX_MODE_RX_COLS(transmit_mode); + vec2 result = vec2(0); + for (int channel = 0; channel < dec_data_dim.y; channel++) { + vec2 receive_vector = xdc_world_point - rca_plane_projection(vec3(channel * xdc_element_pitch, 0), rx_rows); + float receive_distance = length(receive_vector); + float apodization = apodize(f_number * radians(180) / abs(xdc_world_point.y) * receive_vector.x); - vec3 receive_point = world_space_to_rca_space(image_point, !rx_col); - delta = orientation_projection(delta, !rx_col); + float sidx = sample_index(transmit_distance + receive_distance); + vec2 valid = vec2(sidx >= 0) * vec2(sidx < dec_data_dim.x); + result += valid * apodization * sample_rf(channel, u_channel, sidx); + } + return vec3(result, 0); +} +#else +vec3 RCA(vec3 world_point) +{ + bool tx_rows = !TX_MODE_TX_COLS(transmit_mode); + bool rx_rows = !TX_MODE_RX_COLS(transmit_mode); + vec2 xdc_world_point = rca_plane_projection((xdc_transform * vec4(world_point, 1)).xyz, rx_rows); vec3 sum = vec3(0); - /* NOTE: For Each Acquistion in Raw Data */ - // uint i = (dec_data_dim.z - 1) * uint(clamp(u_cycle_t, 0, 1)); { - for (int i = 0; i < dec_data_dim.z; i++) { - float transmit_angle = radians(imageLoad(focal_vectors, i).x); - float focal_depth = imageLoad(focal_vectors, i).y; + for (int transmit = 0; transmit < dec_data_dim.z; transmit++) { + vec2 focal_vector = imageLoad(focal_vectors, transmit).xy; + float transmit_angle = radians(focal_vector.x); + float focal_depth = focal_vector.y; float transmit_distance; if (isinf(focal_depth)) { - transmit_distance = planewave_transmit_distance(image_point, transmit_angle, - !tx_col); + transmit_distance = plane_wave_transmit_distance(world_point, transmit_angle, tx_rows); } else { - transmit_distance = cylindricalwave_transmit_distance(image_point, focal_depth, - transmit_angle, - !tx_col); + transmit_distance = cylindrical_wave_transmit_distance(world_point, focal_depth, + transmit_angle, tx_rows); } - vec3 receive_distance = receive_point; - /* NOTE: For Each Receiver */ - // uint j = (dec_data_dim.z - 1) * uint(clamp(u_cycle_t, 0, 1)); { - for (uint j = 0; j < dec_data_dim.y; j++) { - float sidx = sample_index(transmit_distance + length(receive_distance)); - vec2 valid = vec2(sidx >= 0) * vec2(sidx < dec_data_dim.x); - vec2 samp = valid * apodize(sample_rf(ridx, sidx), apodization_arg, - length(receive_distance.xy)); - sum += vec3(samp, length(samp)); - receive_distance -= delta; - ridx += int(dec_data_dim.x); + for (int rx_channel = 0; rx_channel < dec_data_dim.y; rx_channel++) { + vec3 rx_center = vec3(rx_channel * xdc_element_pitch, 0); + vec2 receive_vector = xdc_world_point - rca_plane_projection(rx_center, rx_rows); + float apodization = apodize(f_number * radians(180) / abs(xdc_world_point.y) * receive_vector.x); + + float sidx = sample_index(transmit_distance + length(receive_vector)); + vec2 valid = vec2(sidx >= 0) * vec2(sidx < dec_data_dim.x); + vec2 value = valid * apodization * sample_rf(rx_channel, transmit, sidx); + sum += vec3(value, length(value)); } } return sum; } +#endif -vec3 HERCULES(vec3 image_point, vec3 delta, float apodization_arg) +#if DAS_FAST +vec3 HERCULES(vec3 world_point) { - int uhercules = int(das_shader_id == DAS_ID_UHERCULES); - int ridx = int(dec_data_dim.y * dec_data_dim.x * uhercules); - int direction = beamform_plane; - if (direction != TX_ROWS) image_point = image_point.yxz; + bool uhercules = das_shader_id == DAS_ID_UHERCULES; + vec3 xdc_world_point = (xdc_transform * vec4(world_point, 1)).xyz; + bool tx_rows = !TX_MODE_TX_COLS(transmit_mode); + bool rx_cols = TX_MODE_RX_COLS(transmit_mode); + vec2 focal_vector = imageLoad(focal_vectors, 0).xy; + float transmit_angle = radians(focal_vector.x); + float focal_depth = focal_vector.y; - bool tx_col = TX_MODE_TX_COLS(transmit_mode); - bool rx_col = TX_MODE_RX_COLS(transmit_mode); - float transmit_angle = radians(imageLoad(focal_vectors, 0).x); - float focal_depth = imageLoad(focal_vectors, 0).y; + float transmit_distance; + if (isinf(focal_depth)) { + transmit_distance = plane_wave_transmit_distance(world_point, transmit_angle, tx_rows); + } else { + transmit_distance = cylindrical_wave_transmit_distance(world_point, focal_depth, + transmit_angle, tx_rows); + } - vec3 receive_point = (xdc_transform * vec4(image_point, 1)).xyz; + vec2 result = vec2(0); + for (int transmit = int(uhercules); transmit < dec_data_dim.z; transmit++) { + int tx_channel = uhercules ? imageLoad(sparse_elements, transmit - int(uhercules)).x : transmit; + vec3 element_position; + if (rx_cols) element_position = vec3(u_channel, tx_channel, 0) * vec3(xdc_element_pitch, 0); + else element_position = vec3(tx_channel, u_channel, 0) * vec3(xdc_element_pitch, 0); + + float apodization = apodize(f_number * radians(180) / abs(xdc_world_point.z) * + distance(xdc_world_point.xy, element_position.xy)); + /* NOTE: tribal knowledge */ + if (transmit == 0) apodization *= inversesqrt(dec_data_dim.z); + + float sidx = sample_index(transmit_distance + distance(xdc_world_point, element_position)); + vec2 valid = vec2(sidx >= 0) * vec2(sidx < dec_data_dim.x); + result += valid * apodization * sample_rf(u_channel, transmit, sidx); + } + return vec3(result, 0); +} +#else +vec3 HERCULES(vec3 world_point) +{ + bool uhercules = das_shader_id == DAS_ID_UHERCULES; + vec3 xdc_world_point = (xdc_transform * vec4(world_point, 1)).xyz; + bool tx_rows = !TX_MODE_TX_COLS(transmit_mode); + bool rx_cols = TX_MODE_RX_COLS(transmit_mode); + vec2 focal_vector = imageLoad(focal_vectors, 0).xy; + float transmit_angle = radians(focal_vector.x); + float focal_depth = focal_vector.y; float transmit_distance; if (isinf(focal_depth)) { - transmit_distance = planewave_transmit_distance(image_point, transmit_angle, - !tx_col); + transmit_distance = plane_wave_transmit_distance(world_point, transmit_angle, tx_rows); } else { - transmit_distance = cylindricalwave_transmit_distance(image_point, focal_depth, - transmit_angle, - !tx_col); + transmit_distance = cylindrical_wave_transmit_distance(world_point, focal_depth, + transmit_angle, tx_rows); } - vec3 sum = vec3(0); - /* NOTE: For Each Acquistion in Raw Data */ - for (int i = uhercules; i < dec_data_dim.z; i++) { - int channel = imageLoad(sparse_elements, i - uhercules).x; - /* NOTE: For Each Virtual Source */ - for (uint j = 0; j < dec_data_dim.y; j++) { + vec3 result = vec3(0); + for (int transmit = int(uhercules); transmit < dec_data_dim.z; transmit++) { + int tx_channel = uhercules ? imageLoad(sparse_elements, transmit - int(uhercules)).x : transmit; + for (int rx_channel = 0; rx_channel < dec_data_dim.y; rx_channel++) { vec3 element_position; - if (rx_col) element_position = vec3(j, channel, 0) * delta; - else element_position = vec3(channel, j, 0) * delta; - vec3 receive_distance = receive_point - element_position; - float sidx = sample_index(transmit_distance + length(receive_distance)); - vec2 valid = vec2(sidx >= 0) * vec2(sidx < dec_data_dim.x); + if (rx_cols) element_position = vec3(rx_channel, tx_channel, 0) * vec3(xdc_element_pitch, 0); + else element_position = vec3(tx_channel, rx_channel, 0) * vec3(xdc_element_pitch, 0); + float apodization = apodize(f_number * radians(180) / abs(xdc_world_point.z) * + distance(xdc_world_point.xy, element_position.xy)); /* NOTE: tribal knowledge */ - if (i == 0) valid *= inversesqrt(dec_data_dim.z); + if (transmit == 0) apodization *= inversesqrt(dec_data_dim.z); - vec2 samp = valid * apodize(sample_rf(ridx, sidx), apodization_arg, - length(receive_distance.xy)); - sum += vec3(samp, length(samp)); - ridx += int(dec_data_dim.x); + float sidx = sample_index(transmit_distance + distance(xdc_world_point, element_position)); + vec2 valid = vec2(sidx >= 0) * vec2(sidx < dec_data_dim.x); + vec2 value = valid * apodization * sample_rf(rx_channel, transmit, sidx); + result += vec3(value, length(value)); } } - return sum; + return result; } +#endif -vec3 uFORCES(vec3 image_point, vec3 delta, float apodization_arg) +#if DAS_FAST +vec3 FORCES(vec3 world_point) { - /* NOTE: skip first acquisition in uforces since its garbage */ - int uforces = int(das_shader_id == DAS_ID_UFORCES); - int ridx = int(dec_data_dim.y * dec_data_dim.x * uforces); - - image_point = (xdc_transform * vec4(image_point, 1)).xyz; - - vec3 sum = vec3(0); - for (int i = uforces; i < dec_data_dim.z; i++) { - int channel = imageLoad(sparse_elements, i - uforces).x; - vec2 recieve_point = image_point.xz; - vec3 element_center = delta * vec3(channel, floor(dec_data_dim.y / 2), 0); - float transmit_dist = distance(image_point, element_center); - - for (uint j = 0; j < dec_data_dim.y; j++) { - float sidx = sample_index(transmit_dist + length(recieve_point)); - vec2 valid = vec2(sidx >= 0) * vec2(sidx < dec_data_dim.x); - vec2 samp = valid * apodize(sample_rf(ridx, sidx), apodization_arg, - recieve_point.x); - sum += vec3(samp, length(samp)); - ridx += int(dec_data_dim.x); - recieve_point.x -= delta.x; + bool uforces = das_shader_id == DAS_ID_UFORCES; + vec3 xdc_world_point = (xdc_transform * vec4(world_point, 1)).xyz; + float receive_distance = distance(xdc_world_point.xz, vec2(u_channel * xdc_element_pitch.x, 0)); + float apodization = apodize(f_number * radians(180) / abs(xdc_world_point.z) * + (xdc_world_point.x - u_channel * xdc_element_pitch.x)); + + vec2 result = vec2(0); + for (int transmit = int(uforces); transmit < dec_data_dim.z; transmit++) { + int tx_channel = uforces ? imageLoad(sparse_elements, transmit - int(uforces)).x : transmit; + vec3 transmit_center = vec3(xdc_element_pitch * vec2(tx_channel, floor(dec_data_dim.y / 2)), 0); + + float sidx = sample_index(distance(xdc_world_point, transmit_center) + receive_distance); + vec2 valid = vec2(sidx >= 0) * vec2(sidx < dec_data_dim.x); + result += valid * apodization * sample_rf(u_channel, transmit, sidx); + } + return vec3(result, 0); +} +#else +vec3 FORCES(vec3 world_point) +{ + bool uforces = das_shader_id == DAS_ID_UFORCES; + vec3 xdc_world_point = (xdc_transform * vec4(world_point, 1)).xyz; + + vec3 result = vec3(0); + for (int rx_channel = 0; rx_channel < dec_data_dim.y; rx_channel++) { + float receive_distance = distance(xdc_world_point.xz, vec2(rx_channel * xdc_element_pitch.x, 0)); + float apodization = apodize(f_number * radians(180) / abs(xdc_world_point.z) * + (xdc_world_point.x - rx_channel * xdc_element_pitch.x)); + for (int transmit = int(uforces); transmit < dec_data_dim.z; transmit++) { + int tx_channel = uforces ? imageLoad(sparse_elements, transmit - int(uforces)).x : transmit; + vec3 transmit_center = vec3(xdc_element_pitch * vec2(tx_channel, floor(dec_data_dim.y / 2)), 0); + + float sidx = sample_index(distance(xdc_world_point, transmit_center) + receive_distance); + vec2 valid = vec2(sidx >= 0) * vec2(sidx < dec_data_dim.x); + vec2 value = valid * apodization * sample_rf(rx_channel, tx_channel, sidx); + result += vec3(value, length(value)); } } - return sum; + return result; } +#endif void main() { - /* NOTE: Convert voxel to physical coordinates */ - ivec3 out_coord = ivec3(gl_GlobalInvocationID) + u_voxel_offset; - vec3 image_point = calc_image_point(vec3(out_coord)); + ivec3 out_voxel = ivec3(gl_GlobalInvocationID); +#if DAS_FAST + vec3 sum = vec3(imageLoad(u_out_data_tex, out_voxel).xy, 0); +#else + vec3 sum = vec3(0); + out_voxel += u_voxel_offset; +#endif - /* NOTE: used for constant F# dynamic receive apodization. This is implemented as: - * - * / |x_e - x_i|\ - * a(x, z) = cos(F# * π * ----------- ) ^ 2 - * \ |z_e - z_i|/ - * - * where x,z_e are transducer element positions and x,z_i are image positions. */ - float apod_arg = f_number * radians(180) / abs(image_point.z); + vec3 world_point = (u_voxel_transform * vec4(out_voxel, 1)).xyz; - /* NOTE: skip over channels corresponding to other arrays */ - vec3 sum; switch (das_shader_id) { case DAS_ID_FORCES: case DAS_ID_UFORCES: - sum = uFORCES(image_point, vec3(xdc_element_pitch, 0), apod_arg); - break; + { + sum += FORCES(world_point); + }break; case DAS_ID_HERCULES: case DAS_ID_UHERCULES: - sum = HERCULES(image_point, vec3(xdc_element_pitch, 0), apod_arg); - break; + { + sum += HERCULES(world_point); + }break; case DAS_ID_FLASH: case DAS_ID_RCA_TPW: case DAS_ID_RCA_VLS: - sum = RCA(image_point, vec3(xdc_element_pitch, 0), apod_arg); - break; + { + sum += RCA(world_point); + }break; } /* TODO(rnp): scale such that brightness remains ~constant */ if (coherency_weighting) sum.xy *= sum.xy / (sum.z + float(sum.z == 0)); - imageStore(u_out_data_tex, out_coord, vec4(sum.xy, 0, 0)); + imageStore(u_out_data_tex, out_voxel, vec4(sum.xy, 0, 0)); } diff --git a/shaders/decode.glsl b/shaders/decode.glsl @@ -63,7 +63,7 @@ void main() out_rf_data[rf_offset + transmit] = rf_data[in_off / RF_SAMPLES_PER_INDEX]; } else { /* NOTE(rnp): stores output as a 3D matrix with ordering of {samples, channels, transmits} */ - uint out_off = dec_data_dim.x * dec_data_dim.y * transmit + dec_data_dim.x * channel + time_sample; + uint out_off = dec_data_dim.x * dec_data_dim.z * channel + dec_data_dim.x * transmit + time_sample; vec4 result = vec4(0); switch (decode) { diff --git a/static.c b/static.c @@ -332,7 +332,7 @@ setup_beamformer(Arena *memory, BeamformerCtx **o_ctx, BeamformerInput **o_input /* NOTE: default compute shader pipeline */ sm->compute_stages[0] = BeamformerShaderKind_Decode; - sm->compute_stages[1] = BeamformerShaderKind_DASCompute; + sm->compute_stages[1] = BeamformerShaderKind_DAS; sm->compute_stages_count = 2; GLWorkerThreadContext *worker = &ctx->os.compute_worker; @@ -387,7 +387,7 @@ setup_beamformer(Arena *memory, BeamformerCtx **o_ctx, BeamformerInput **o_input os_add_file_watch(&ctx->os, memory, src->path, reload_shader_indirect, (iptr)src); \ reload_shader_indirect(&ctx->os, src->path, (iptr)src, *memory); \ } while (0); - COMPUTE_SHADERS + COMPUTE_SHADERS_INTERNAL #undef X os_wake_waiters(&worker->sync_variable); diff --git a/tests/throughput.c b/tests/throughput.c @@ -357,7 +357,7 @@ execute_study(s8 study, Arena arena, Stream path, Options *options) u32 shader_stage_count = 0; if (options->cuda) shader_stages[shader_stage_count++] = BeamformerShaderKind_CudaDecode; else shader_stages[shader_stage_count++] = BeamformerShaderKind_Decode; - shader_stages[shader_stage_count++] = BeamformerShaderKind_DASCompute; + shader_stages[shader_stage_count++] = BeamformerShaderKind_DAS; set_beamformer_pipeline(shader_stages, shader_stage_count); diff --git a/ui.c b/ui.c @@ -2589,7 +2589,7 @@ draw_compute_stats_bar_view(BeamformerUI *ui, Arena arena, ComputeShaderStats *s } #define X(e, n, s, h, pn) [BeamformerShaderKind_##e] = s8_comp(pn ": "), - read_only local_persist s8 labels[BeamformerShaderKind_ComputeCount] = {COMPUTE_SHADERS}; + read_only local_persist s8 labels[BeamformerShaderKind_ComputeCount] = {COMPUTE_SHADERS_INTERNAL}; #undef X v2 result = table_extent(table, arena, ts.font); @@ -2686,7 +2686,7 @@ draw_compute_stats_view(BeamformerUI *ui, Arena arena, Variable *view, Rect r, v switch (csv->kind) { case ComputeStatsViewKind_Average:{ #define X(e, n, s, h, pn) [BeamformerShaderKind_##e] = s8_comp(pn ":"), - read_only local_persist s8 labels[BeamformerShaderKind_ComputeCount] = {COMPUTE_SHADERS}; + read_only local_persist s8 labels[BeamformerShaderKind_ComputeCount] = {COMPUTE_SHADERS_INTERNAL}; #undef X da_reserve(&arena, table, stages); for (u32 i = 0; i < stages; i++) {