Commit: 399d6e0e1f756a103dd3a23dd1f9c3b3e8aae63a
Parent: d5355cf2830265cdfe5ca686f0865a28ebc8a46b
Author: Randy Palamar
Date: Tue, 9 Sep 2025 12:42:17 -0600
das: adapt to Float32 (RF) input
not currently tested, decode shader needs to be fixed for RF input
Diffstat:
4 files changed, 116 insertions(+), 64 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -1,5 +1,6 @@
/* See LICENSE for license details. */
/* TODO(rnp):
+ * [ ]: bug: shaders with no parameters aren't currently properly selected
* [ ]: make decode output real values for real inputs and complex values for complex inputs
* - this means that das should have a RF version and an IQ version
* - this will also flip the current hack to support demodulate after decode to
@@ -177,8 +178,15 @@ frame_next(ComputeFrameIterator *bfi)
return result;
}
+function b32
+beamformer_frame_compatible(BeamformerFrame *f, iv3 dim, GLenum gl_kind)
+{
+ b32 result = gl_kind == f->gl_kind && iv3_equal(dim, f->dim);
+ return result;
+}
+
function void
-alloc_beamform_frame(GLParams *gp, BeamformerFrame *out, iv3 out_dim, s8 name, Arena arena)
+alloc_beamform_frame(GLParams *gp, BeamformerFrame *out, iv3 out_dim, GLenum gl_kind, s8 name, Arena arena)
{
out->dim.x = MAX(1, out_dim.x);
out->dim.y = MAX(1, out_dim.y);
@@ -195,6 +203,8 @@ alloc_beamform_frame(GLParams *gp, BeamformerFrame *out, iv3 out_dim, s8 name, A
u32 max_dim = (u32)MAX(out->dim.x, MAX(out->dim.y, out->dim.z));
out->mips = (i32)ctz_u32(round_up_power_of_2(max_dim)) + 1;
+ out->gl_kind = gl_kind;
+
Stream label = arena_stream(arena);
stream_append_s8(&label, name);
stream_append_byte(&label, '[');
@@ -203,7 +213,7 @@ alloc_beamform_frame(GLParams *gp, BeamformerFrame *out, iv3 out_dim, s8 name, A
glDeleteTextures(1, &out->texture);
glCreateTextures(GL_TEXTURE_3D, 1, &out->texture);
- glTextureStorage3D(out->texture, out->mips, GL_RG32F, out->dim.x, out->dim.y, out->dim.z);
+ glTextureStorage3D(out->texture, out->mips, gl_kind, out->dim.x, out->dim.y, out->dim.z);
glTextureParameteri(out->texture, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTextureParameteri(out->texture, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
@@ -457,6 +467,8 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
if (demodulate) run_cuda_hilbert = 0;
+ if (demodulate || run_cuda_hilbert) cp->iq_pipeline = 1;
+
BeamformerDataKind data_kind = pb->pipeline.data_kind;
cp->pipeline.shader_count = 0;
for (u32 i = 0; i < pb->pipeline.shader_count; i++) {
@@ -565,8 +577,6 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
dp->output_transmit_stride *= decimation_rate;
}
- if (!demodulate) bp->demodulation_frequency = 0;
-
cp->decode_dispatch.x = (u32)ceil_f32((f32)bp->sample_count / DECODE_LOCAL_SIZE_X);
cp->decode_dispatch.y = (u32)ceil_f32((f32)bp->channel_count / DECODE_LOCAL_SIZE_Y);
cp->decode_dispatch.z = (u32)ceil_f32((f32)bp->acquisition_count / DECODE_LOCAL_SIZE_Z);
@@ -620,8 +630,9 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
cp->demod_dispatch.y = (u32)ceil_f32((f32)bp->channel_count / FILTER_LOCAL_SIZE_Y);
cp->demod_dispatch.z = (u32)ceil_f32((f32)bp->acquisition_count / FILTER_LOCAL_SIZE_Z);
- /* TODO(rnp): if IQ (* 8) else (* 4) */
- cp->rf_size = bp->sample_count * bp->channel_count * bp->acquisition_count * 8;
+ cp->rf_size = bp->sample_count * bp->channel_count * bp->acquisition_count;
+ if (demodulate || run_cuda_hilbert) cp->rf_size *= 8;
+ else cp->rf_size *= 4;
/* TODO(rnp): UBO per filter stage */
BeamformerFilterUBO *flt = &cp->filter_ubo_data;
@@ -679,9 +690,10 @@ beamformer_commit_parameter_block(BeamformerCtx *ctx, BeamformerComputePlan *cp,
cp->output_points.E[2] = MAX(pb->parameters.output_points[2], 1);
cp->average_frames = pb->parameters.output_points[3];
- if (cp->average_frames > 1 && !iv3_equal(cp->output_points, ctx->averaged_frames[0].dim)) {
- alloc_beamform_frame(&ctx->gl, ctx->averaged_frames + 0, cp->output_points, s8("Averaged Frame"), arena);
- alloc_beamform_frame(&ctx->gl, ctx->averaged_frames + 1, cp->output_points, s8("Averaged Frame"), arena);
+ GLenum gl_kind = cp->iq_pipeline ? GL_RG32F : GL_R32F;
+ if (cp->average_frames > 1 && !beamformer_frame_compatible(ctx->averaged_frames + 0, cp->output_points, gl_kind)) {
+ alloc_beamform_frame(&ctx->gl, ctx->averaged_frames + 0, cp->output_points, gl_kind, s8("Averaged Frame"), arena);
+ alloc_beamform_frame(&ctx->gl, ctx->averaged_frames + 1, cp->output_points, gl_kind, s8("Averaged Frame"), arena);
}
}break;
case BeamformerParameterBlockRegion_ChannelMapping:
@@ -815,9 +827,9 @@ do_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, BeamformerFrame
if (fast) {
glClearTexImage(frame->texture, 0, GL_RED, GL_FLOAT, 0);
glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT);
- glBindImageTexture(0, frame->texture, 0, GL_TRUE, 0, GL_READ_WRITE, GL_RG32F);
+ glBindImageTexture(0, frame->texture, 0, GL_TRUE, 0, GL_READ_WRITE, cp->iq_pipeline ? GL_RG32F : GL_R32F);
} else {
- glBindImageTexture(0, frame->texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
+ glBindImageTexture(0, frame->texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, cp->iq_pipeline ? GL_RG32F : GL_R32F);
}
u32 sparse_texture = cp->textures[BeamformerComputeTextureKind_SparseElements];
@@ -1175,8 +1187,10 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena, iptr gl_c
start_renderdoc_capture(gl_context);
BeamformerFrame *frame = work->compute_context.frame;
- if (!iv3_equal(cp->output_points, frame->dim))
- alloc_beamform_frame(&ctx->gl, frame, cp->output_points, s8("Beamformed_Data"), *arena);
+
+ GLenum gl_kind = cp->iq_pipeline ? GL_RG32F : GL_R32F;
+ if (!beamformer_frame_compatible(frame, cp->output_points, gl_kind))
+ alloc_beamform_frame(&ctx->gl, frame, cp->output_points, gl_kind, s8("Beamformed_Data"), *arena);
frame->min_coordinate = cp->min_coordinate;
frame->max_coordinate = cp->max_coordinate;
diff --git a/beamformer.h b/beamformer.h
@@ -204,6 +204,7 @@ struct BeamformerComputePlan {
u32 rf_size;
i32 hadamard_order;
+ b32 iq_pipeline;
v3 min_coordinate;
v3 max_coordinate;
@@ -319,6 +320,7 @@ struct BeamformerFrame {
v3 max_coordinate;
// metadata
+ GLenum gl_kind;
u32 id;
u32 compound_count;
DASShaderKind das_shader_kind;
diff --git a/shaders/das.glsl b/shaders/das.glsl
@@ -1,15 +1,41 @@
/* See LICENSE for license details. */
+#if DataKind == DataKind_Float32
+ #define SAMPLE_TYPE float
+ #define TEXTURE_KIND r32f
+ #define RESULT_TYPE_CAST(a) (a).x
+ #define OUTPUT_TYPE_CAST(a) vec4((a).x, 0, 0, 0)
+ #if (ShaderFlags & ShaderFlags_Fast)
+ #define RESULT_TYPE SAMPLE_TYPE
+ #else
+ #define RESULT_TYPE vec2
+ #define RESULT_LAST_INDEX 1
+ #endif
+#elif DataKind == DataKind_Float32Complex
+ #define SAMPLE_TYPE vec2
+ #define TEXTURE_KIND rg32f
+ #define RESULT_TYPE_CAST(a) (a).xy
+ #define OUTPUT_TYPE_CAST(a) vec4((a).xy, 0, 0)
+ #if (ShaderFlags & ShaderFlags_Fast)
+ #define RESULT_TYPE SAMPLE_TYPE
+ #else
+ #define RESULT_TYPE vec3
+ #define RESULT_LAST_INDEX 2
+ #endif
+#else
+ #error DataKind unsupported for DAS
+#endif
+
layout(std430, binding = 1) readonly restrict buffer buffer_1 {
- vec2 rf_data[];
+ SAMPLE_TYPE rf_data[];
};
const bool sparse = bool(ShaderFlags & ShaderFlags_Sparse);
const bool interpolate = bool(ShaderFlags & ShaderFlags_Interpolate);
#if (ShaderFlags & ShaderFlags_Fast)
-layout(rg32f, binding = 0) restrict uniform image3D u_out_data_tex;
+layout(TEXTURE_KIND, binding = 0) restrict uniform image3D u_out_data_tex;
#else
-layout(rg32f, binding = 0) writeonly restrict uniform image3D u_out_data_tex;
+layout(TEXTURE_KIND, binding = 0) writeonly restrict uniform image3D u_out_data_tex;
#endif
layout(r16i, binding = 1) readonly restrict uniform iimage1D sparse_elements;
@@ -17,20 +43,21 @@ layout(rg32f, binding = 2) readonly restrict uniform image1D focal_vectors;
#define C_SPLINE 0.5
+#if DataKind == DataKind_Float32Complex
vec2 rotate_iq(vec2 iq, float time)
{
- vec2 result = iq;
- if (demodulation_frequency > 0) {
- float arg = radians(360) * demodulation_frequency * time;
- mat2 phasor = mat2( cos(arg), sin(arg),
- -sin(arg), cos(arg));
- result = phasor * iq;
- }
+ float arg = radians(360) * demodulation_frequency * time;
+ mat2 phasor = mat2( cos(arg), sin(arg),
+ -sin(arg), cos(arg));
+ vec2 result = phasor * iq;
return result;
}
+#else
+ #define rotate_iq(a, b) (a)
+#endif
/* NOTE: See: https://cubic.org/docs/hermite.htm */
-vec2 cubic(int base_index, float index)
+SAMPLE_TYPE cubic(int base_index, float index)
{
const mat4 h = mat4(
2, -3, 0, 1,
@@ -40,28 +67,33 @@ vec2 cubic(int base_index, float index)
);
float tk, t = modf(index, tk);
- vec2 samples[4] = {
+ SAMPLE_TYPE samples[4] = {
rf_data[base_index + int(tk) - 1],
rf_data[base_index + int(tk) + 0],
rf_data[base_index + int(tk) + 1],
rf_data[base_index + int(tk) + 2],
};
- vec4 S = vec4(t * t * t, t * t, t, 1);
- vec2 P1 = samples[1];
- vec2 P2 = samples[2];
- vec2 T1 = C_SPLINE * (P2 - samples[0]);
- vec2 T2 = C_SPLINE * (samples[3] - P1);
+ vec4 S = vec4(t * t * t, t * t, t, 1);
+ SAMPLE_TYPE P1 = samples[1];
+ SAMPLE_TYPE P2 = samples[2];
+ SAMPLE_TYPE T1 = C_SPLINE * (P2 - samples[0]);
+ SAMPLE_TYPE T2 = C_SPLINE * (samples[3] - P1);
+#if DataKind == DataKind_Float32
+ vec4 C = vec4(P1.x, P2.x, T1.x, T2.x);
+ float result = dot(S, h * C);
+#elif DataKind == DataKind_Float32Complex
mat2x4 C = mat2x4(vec4(P1.x, P2.x, T1.x, T2.x), vec4(P1.y, P2.y, T1.y, T2.y));
vec2 result = S * h * C;
+#endif
return result;
}
-vec2 sample_rf(int channel, int transmit, float index)
+SAMPLE_TYPE sample_rf(int channel, int transmit, float index)
{
- vec2 result = vec2(index >= 0.0f) * vec2((int(index) + 1 + int(interpolate)) < sample_count);
- int base_index = int(channel * sample_count * acquisition_count + transmit * sample_count);
+ SAMPLE_TYPE result = SAMPLE_TYPE(index >= 0.0f) * SAMPLE_TYPE((int(index) + 1 + int(interpolate)) < sample_count);
+ int base_index = int(channel * sample_count * acquisition_count + transmit * sample_count);
if (interpolate) result *= cubic(base_index, index);
else result *= rf_data[base_index + int(round(index))];
result = rotate_iq(result, index / sampling_frequency);
@@ -105,7 +137,7 @@ float cylindrical_wave_transmit_distance(vec3 point, float focal_depth, float tr
}
#if (ShaderFlags & ShaderFlags_Fast)
-vec3 RCA(vec3 world_point)
+RESULT_TYPE RCA(vec3 world_point)
{
bool tx_rows = bool((shader_flags & ShaderFlags_TxColumns) == 0);
bool rx_rows = bool((shader_flags & ShaderFlags_RxColumns) == 0);
@@ -122,7 +154,7 @@ vec3 RCA(vec3 world_point)
transmit_angle, tx_rows);
}
- vec2 result = vec2(0);
+ RESULT_TYPE result = RESULT_TYPE(0);
for (int channel = 0; channel < channel_count; channel++) {
vec2 receive_vector = xdc_world_point - rca_plane_projection(vec3(channel * xdc_element_pitch, 0), rx_rows);
float receive_distance = length(receive_vector);
@@ -133,16 +165,16 @@ vec3 RCA(vec3 world_point)
result += apodization * sample_rf(channel, u_channel, sidx);
}
}
- return vec3(result, 0);
+ return result;
}
#else
-vec3 RCA(vec3 world_point)
+RESULT_TYPE RCA(vec3 world_point)
{
bool tx_rows = bool((shader_flags & ShaderFlags_TxColumns) == 0);
bool rx_rows = bool((shader_flags & ShaderFlags_RxColumns) == 0);
vec2 xdc_world_point = rca_plane_projection((xdc_transform * vec4(world_point, 1)).xyz, rx_rows);
- vec3 sum = vec3(0);
+ RESULT_TYPE result = RESULT_TYPE(0);
for (int transmit = 0; transmit < acquisition_count; transmit++) {
vec2 focal_vector = imageLoad(focal_vectors, transmit).xy;
float transmit_angle = radians(focal_vector.x);
@@ -162,18 +194,18 @@ vec3 RCA(vec3 world_point)
float apodization = apodize(f_number * radians(180) / abs(xdc_world_point.y) * receive_vector.x);
if (apodization > 0) {
- float sidx = sample_index(transmit_distance + length(receive_vector));
- vec2 value = apodization * sample_rf(rx_channel, transmit, sidx);
- sum += vec3(value, length(value));
+ float sidx = sample_index(transmit_distance + length(receive_vector));
+ SAMPLE_TYPE value = apodization * sample_rf(rx_channel, transmit, sidx);
+ result += RESULT_TYPE(value, length(value));
}
}
}
- return sum;
+ return result;
}
#endif
#if (ShaderFlags & ShaderFlags_Fast)
-vec3 HERCULES(vec3 world_point)
+RESULT_TYPE HERCULES(vec3 world_point)
{
vec3 xdc_world_point = (xdc_transform * vec4(world_point, 1)).xyz;
bool tx_rows = bool((shader_flags & ShaderFlags_TxColumns) == 0);
@@ -190,7 +222,7 @@ vec3 HERCULES(vec3 world_point)
transmit_angle, tx_rows);
}
- vec2 result = vec2(0);
+ RESULT_TYPE result = RESULT_TYPE(0);
for (int transmit = int(sparse); transmit < acquisition_count; transmit++) {
int tx_channel = sparse ? imageLoad(sparse_elements, transmit - int(sparse)).x : transmit;
vec3 element_position;
@@ -207,10 +239,10 @@ vec3 HERCULES(vec3 world_point)
result += apodization * sample_rf(u_channel, transmit, sidx);
}
}
- return vec3(result, 0);
+ return result;
}
#else
-vec3 HERCULES(vec3 world_point)
+RESULT_TYPE HERCULES(vec3 world_point)
{
vec3 xdc_world_point = (xdc_transform * vec4(world_point, 1)).xyz;
bool tx_rows = bool((shader_flags & ShaderFlags_TxColumns) == 0);
@@ -227,7 +259,7 @@ vec3 HERCULES(vec3 world_point)
transmit_angle, tx_rows);
}
- vec3 result = vec3(0);
+ RESULT_TYPE result = RESULT_TYPE(0);
for (int transmit = int(sparse); transmit < acquisition_count; transmit++) {
int tx_channel = sparse ? imageLoad(sparse_elements, transmit - int(sparse)).x : transmit;
for (int rx_channel = 0; rx_channel < channel_count; rx_channel++) {
@@ -241,9 +273,9 @@ vec3 HERCULES(vec3 world_point)
/* NOTE: tribal knowledge */
if (transmit == 0) apodization *= inversesqrt(acquisition_count);
- float sidx = sample_index(transmit_distance + distance(xdc_world_point, element_position));
- vec2 value = apodization * sample_rf(rx_channel, transmit, sidx);
- result += vec3(value, length(value));
+ float sidx = sample_index(transmit_distance + distance(xdc_world_point, element_position));
+ SAMPLE_TYPE value = apodization * sample_rf(rx_channel, transmit, sidx);
+ result += RESULT_TYPE(value, length(value));
}
}
}
@@ -252,14 +284,14 @@ vec3 HERCULES(vec3 world_point)
#endif
#if (ShaderFlags & ShaderFlags_Fast)
-vec3 FORCES(vec3 world_point)
+RESULT_TYPE FORCES(vec3 world_point)
{
vec3 xdc_world_point = (xdc_transform * vec4(world_point, 1)).xyz;
float receive_distance = distance(xdc_world_point.xz, vec2(u_channel * xdc_element_pitch.x, 0));
float apodization = apodize(f_number * radians(180) / abs(xdc_world_point.z) *
(xdc_world_point.x - u_channel * xdc_element_pitch.x));
- vec2 result = vec2(0);
+ RESULT_TYPE result = RESULT_TYPE(0);
if (apodization > 0) {
for (int transmit = int(sparse); transmit < acquisition_count; transmit++) {
int tx_channel = sparse ? imageLoad(sparse_elements, transmit - int(sparse)).x : transmit;
@@ -269,14 +301,14 @@ vec3 FORCES(vec3 world_point)
result += apodization * sample_rf(u_channel, transmit, sidx);
}
}
- return vec3(result, 0);
+ return result;
}
#else
-vec3 FORCES(vec3 world_point)
+RESULT_TYPE FORCES(vec3 world_point)
{
vec3 xdc_world_point = (xdc_transform * vec4(world_point, 1)).xyz;
- vec3 result = vec3(0);
+ RESULT_TYPE result = RESULT_TYPE(0);
for (int rx_channel = 0; rx_channel < channel_count; rx_channel++) {
float receive_distance = distance(xdc_world_point.xz, vec2(rx_channel * xdc_element_pitch.x, 0));
float apodization = apodize(f_number * radians(180) / abs(xdc_world_point.z) *
@@ -286,9 +318,9 @@ vec3 FORCES(vec3 world_point)
int tx_channel = sparse ? imageLoad(sparse_elements, transmit - int(sparse)).x : transmit;
vec3 transmit_center = vec3(xdc_element_pitch * vec2(tx_channel, floor(channel_count / 2)), 0);
- float sidx = sample_index(distance(xdc_world_point, transmit_center) + receive_distance);
- vec2 value = apodization * sample_rf(rx_channel, tx_channel, sidx);
- result += vec3(value, length(value));
+ float sidx = sample_index(distance(xdc_world_point, transmit_center) + receive_distance);
+ SAMPLE_TYPE value = apodization * sample_rf(rx_channel, transmit, sidx);
+ result += RESULT_TYPE(value, length(value));
}
}
}
@@ -303,9 +335,9 @@ void main()
return;
#if (ShaderFlags & ShaderFlags_Fast)
- vec3 sum = vec3(imageLoad(u_out_data_tex, out_voxel).xy, 0);
+ RESULT_TYPE sum = RESULT_TYPE_CAST(imageLoad(u_out_data_tex, out_voxel));
#else
- vec3 sum = vec3(0);
+ RESULT_TYPE sum = RESULT_TYPE(0);
out_voxel += u_voxel_offset;
#endif
@@ -330,9 +362,13 @@ void main()
}break;
}
+ #if (ShaderFlags & ShaderFlags_Fast) == 0
/* TODO(rnp): scale such that brightness remains ~constant */
- if (bool(shader_flags & ShaderFlags_CoherencyWeighting))
- sum.xy *= sum.xy / (sum.z + float(sum.z == 0));
+ if (bool(shader_flags & ShaderFlags_CoherencyWeighting)) {
+ float denominator = sum[RESULT_LAST_INDEX] + float(sum[RESULT_LAST_INDEX] == 0);
+ RESULT_TYPE_CAST(sum) *= RESULT_TYPE_CAST(sum) / denominator;
+ }
+ #endif
- imageStore(u_out_data_tex, out_voxel, vec4(sum.xy, 0, 0));
+ imageStore(u_out_data_tex, out_voxel, OUTPUT_TYPE_CAST(sum));
}
diff --git a/ui.c b/ui.c
@@ -1469,7 +1469,7 @@ ui_beamformer_frame_view_copy_frame(BeamformerUI *ui, BeamformerFrameView *new,
mem_copy(new->frame, old->frame, sizeof(*new->frame));
new->frame->texture = 0;
new->frame->next = 0;
- alloc_beamform_frame(0, new->frame, old->frame->dim, s8("Frame Copy: "), ui->arena);
+ alloc_beamform_frame(0, new->frame, old->frame->dim, old->frame->gl_kind, s8("Frame Copy: "), ui->arena);
glCopyImageSubData(old->frame->texture, GL_TEXTURE_3D, 0, 0, 0, 0,
new->frame->texture, GL_TEXTURE_3D, 0, 0, 0, 0,