Commit: f815e8bb2229b323096765e1d00be10d0d0c57e2
Parent: 54effe8d17a212b7594c4c6a6d942286e2ea41e3
Author: Randy Palamar
Date: Tue, 23 Sep 2025 19:35:57 -0600
shader/das: first pass at baking exact (integer) parameters
This gives performance boosts as high as 10% for FORCES. While I'm
typically against const, it seems that including it on function
parameters ensures that actual compile-time constants carry
through correctly without additional register usage.
Diffstat:
| M | beamformer.c | | | 109 | ++++++++++++++++++++++++++++++++++++++++++++----------------------------------- |
| M | beamformer.h | | | 14 | ++++++++------ |
| M | shaders/das.glsl | | | 72 | ++++++++++++++++++++++++++++++++---------------------------------------- |
3 files changed, 101 insertions(+), 94 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -1,13 +1,10 @@
/* See LICENSE for license details. */
/* TODO(rnp):
- * [ ]: do JIT compilation of shaders
- * - a larger subset of parameters can be made into compile time constants
- * - preallocated storage for shaders is minimized
- * - loops over TX and RX count can be unrolled
- * - hot reload can still be trivially supported:
- * - loop over shaders for the current pipeline
- * - check if the base shader matches the shader we are trying to reload
- * - load header and append constants which are stored in the pipeline parameters
+ * [ ]: refactor: fancier hot reloading for JIT shaders
+ * - loop over all active blocks
+ * - loop over shader sets per block
+ * - when match found reload it
+ * [ ]: refactor: move shader bake parameters into meta code
* [ ]: measure performance of doing channel mapping in a separate shader
* [ ]: BeamformWorkQueue -> BeamformerWorkQueue
* [ ]: need to keep track of gpu memory in some way
@@ -445,7 +442,7 @@ das_voxel_transform_matrix(BeamformerParameters *bp)
}
function void
-das_ubo_from_beamformer_parameters(BeamformerDASUBO *du, BeamformerParameters *bp)
+das_ubo_from_beamformer_parameters(BeamformerComputePlan *cp, BeamformerDASUBO *du, BeamformerParameters *bp)
{
du->voxel_transform = das_voxel_transform_matrix(bp);
mem_copy(du->xdc_transform.E, bp->xdc_transform, sizeof(du->xdc_transform));
@@ -455,13 +452,14 @@ das_ubo_from_beamformer_parameters(BeamformerDASUBO *du, BeamformerParameters *b
du->speed_of_sound = bp->speed_of_sound;
du->time_offset = bp->time_offset;
du->f_number = bp->f_number;
- du->shader_kind = bp->das_shader_id;
- du->sample_count = bp->sample_count;
- du->channel_count = bp->channel_count;
- du->acquisition_count = bp->acquisition_count;
- du->shader_flags = 0;
- if (bp->coherency_weighting) du->shader_flags |= BeamformerShaderDASFlags_CoherencyWeighting;
+ cp->das_shader_kind = bp->das_shader_id;
+ cp->das_sample_count = bp->sample_count;
+ cp->das_channel_count = bp->channel_count;
+ cp->das_acquisition_count = bp->acquisition_count;
+
+ cp->das_shader_flags = 0;
+ if (bp->coherency_weighting) cp->das_shader_flags |= BeamformerShaderDASFlags_CoherencyWeighting;
}
function void
@@ -469,7 +467,7 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
{
BeamformerDASUBO *bp = &cp->das_ubo_data;
- das_ubo_from_beamformer_parameters(bp, &pb->parameters);
+ das_ubo_from_beamformer_parameters(cp, bp, &pb->parameters);
b32 decode_first = pb->pipeline.shaders[0] == BeamformerShaderKind_Decode;
b32 run_cuda_hilbert = 0;
@@ -546,9 +544,9 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
das_data_kind = BeamformerDataKind_Float32Complex;
i32 local_flags = 0;
- if ((bp->shader_flags & BeamformerShaderDASFlags_CoherencyWeighting) == 0)
+ if ((cp->das_shader_flags & BeamformerShaderDASFlags_CoherencyWeighting) == 0)
local_flags |= BeamformerShaderDASFlags_Fast;
- if (bp->shader_kind == BeamformerDASKind_UFORCES || bp->shader_kind == BeamformerDASKind_UHERCULES)
+ if (cp->das_shader_kind == BeamformerDASKind_UFORCES || cp->das_shader_kind == BeamformerDASKind_UHERCULES)
local_flags |= BeamformerShaderDASFlags_Sparse;
if (pb->parameters.interpolate)
local_flags |= BeamformerShaderDASFlags_Interpolate;
@@ -572,8 +570,8 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
cp->pipeline.data_kind = data_kind;
u32 das_sample_stride = 1;
- u32 das_transmit_stride = bp->sample_count;
- u32 das_channel_stride = bp->acquisition_count * bp->sample_count;
+ u32 das_transmit_stride = cp->das_sample_count;
+ u32 das_channel_stride = cp->das_acquisition_count * cp->das_sample_count;
u32 decimation_rate = MAX(pb->parameters.decimation_rate, 1);
if (demodulate) {
@@ -582,14 +580,14 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
}
u32 input_sample_stride = 1;
- u32 input_transmit_stride = bp->sample_count;
+ u32 input_transmit_stride = cp->das_sample_count;
u32 input_channel_stride = pb->parameters.raw_data_dimensions[0];
BeamformerDecodeUBO *dp = &cp->decode_ubo_data;
dp->decode_mode = pb->parameters.decode;
- dp->transmit_count = bp->acquisition_count;
+ dp->transmit_count = cp->das_acquisition_count;
- dp->input_sample_stride = decode_first? input_sample_stride : bp->acquisition_count;
+ dp->input_sample_stride = decode_first? input_sample_stride : cp->das_acquisition_count;
dp->input_channel_stride = decode_first? input_channel_stride : das_channel_stride;
dp->input_transmit_stride = decode_first? input_transmit_stride : 1;
dp->output_sample_stride = das_sample_stride;
@@ -600,9 +598,9 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
dp->output_transmit_stride *= decimation_rate;
}
- cp->decode_dispatch.x = (u32)ceil_f32((f32)bp->sample_count / DECODE_LOCAL_SIZE_X);
- cp->decode_dispatch.y = (u32)ceil_f32((f32)bp->channel_count / DECODE_LOCAL_SIZE_Y);
- cp->decode_dispatch.z = (u32)ceil_f32((f32)bp->acquisition_count / DECODE_LOCAL_SIZE_Z);
+ cp->decode_dispatch.x = (u32)ceil_f32((f32)cp->das_sample_count / DECODE_LOCAL_SIZE_X);
+ cp->decode_dispatch.y = (u32)ceil_f32((f32)cp->das_channel_count / DECODE_LOCAL_SIZE_Y);
+ cp->decode_dispatch.z = (u32)ceil_f32((f32)cp->das_acquisition_count / DECODE_LOCAL_SIZE_Z);
/* NOTE(rnp): decode 2 samples per dispatch when data is i16 */
if (decode_first && data_kind == BeamformerDataKind_Int16)
@@ -624,7 +622,7 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
mp->decimation_rate = decimation_rate;
bp->sampling_frequency /= 2 * (f32)mp->decimation_rate;
- bp->sample_count /= 2 * mp->decimation_rate;
+ cp->das_sample_count /= 2 * mp->decimation_rate;
if (decode_first) {
mp->input_channel_stride = dp->output_channel_stride;
@@ -644,16 +642,16 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
mp->output_sample_stride = dp->input_sample_stride;
mp->output_transmit_stride = dp->input_transmit_stride;
- cp->decode_dispatch.x = (u32)ceil_f32((f32)bp->sample_count / DECODE_LOCAL_SIZE_X);
+ cp->decode_dispatch.x = (u32)ceil_f32((f32)cp->das_sample_count / DECODE_LOCAL_SIZE_X);
}
}
/* TODO(rnp): filter may need a different dispatch layout */
- cp->demod_dispatch.x = (u32)ceil_f32((f32)bp->sample_count / FILTER_LOCAL_SIZE_X);
- cp->demod_dispatch.y = (u32)ceil_f32((f32)bp->channel_count / FILTER_LOCAL_SIZE_Y);
- cp->demod_dispatch.z = (u32)ceil_f32((f32)bp->acquisition_count / FILTER_LOCAL_SIZE_Z);
+ cp->demod_dispatch.x = (u32)ceil_f32((f32)cp->das_sample_count / FILTER_LOCAL_SIZE_X);
+ cp->demod_dispatch.y = (u32)ceil_f32((f32)cp->das_channel_count / FILTER_LOCAL_SIZE_Y);
+ cp->demod_dispatch.z = (u32)ceil_f32((f32)cp->das_acquisition_count / FILTER_LOCAL_SIZE_Z);
- cp->rf_size = bp->sample_count * bp->channel_count * bp->acquisition_count;
+ cp->rf_size = cp->das_sample_count * cp->das_channel_count * cp->das_acquisition_count;
if (demodulate || run_cuda_hilbert) cp->rf_size *= 8;
else cp->rf_size *= 4;
@@ -662,12 +660,12 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
flt->demodulation_frequency = bp->demodulation_frequency;
flt->sampling_frequency = bp->sampling_frequency;
flt->decimation_rate = 1;
- flt->output_channel_stride = bp->sample_count * bp->acquisition_count;
+ flt->output_channel_stride = cp->das_sample_count * cp->das_acquisition_count;
flt->output_sample_stride = 1;
- flt->output_transmit_stride = bp->sample_count;
- flt->input_channel_stride = bp->sample_count * bp->acquisition_count;
+ flt->output_transmit_stride = cp->das_sample_count;
+ flt->input_channel_stride = cp->das_sample_count * cp->das_acquisition_count;
flt->input_sample_stride = 1;
- flt->input_transmit_stride = bp->sample_count;
+ flt->input_transmit_stride = cp->das_sample_count;
}
function void
@@ -741,7 +739,6 @@ load_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, u32 shader_sl
BeamformerShaderKind shader = cp->pipeline.shaders[shader_slot];
BeamformerShaderDescriptor *sd = beamformer_shader_descriptors + shader;
-
u32 program = 0;
i32 reloadable_index = beamformer_shader_reloadable_index_by_shader[shader];
if (reloadable_index != -1) {
@@ -767,10 +764,28 @@ load_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, u32 shader_sl
if (sd->has_local_flags) {
stream_append_s8(&shader_stream, s8("#define ShaderFlags (0x"));
- stream_append_hex_u64(&shader_stream, (u64)match_vector[sd->match_vector_length]);
+ i32 flags = match_vector[sd->match_vector_length];
+ if (shader == BeamformerShaderKind_DAS) flags |= cp->das_shader_flags;
+ stream_append_hex_u64(&shader_stream, (u64)flags);
stream_append_s8(&shader_stream, s8(")\n"));
}
+ /* TODO(rnp): generate this */
+ switch (base_shader) {
+ case BeamformerShaderKind_DAS:{
+ stream_append_s8(&shader_stream, s8("#define ShaderKind "));
+ stream_append_u64(&shader_stream, cp->das_shader_kind);
+ stream_append_s8(&shader_stream, s8("\n#define SampleCount "));
+ stream_append_u64(&shader_stream, cp->das_sample_count);
+ stream_append_s8(&shader_stream, s8("\n#define ChannelCount "));
+ stream_append_u64(&shader_stream, cp->das_channel_count);
+ stream_append_s8(&shader_stream, s8("\n#define AcquisitionCount "));
+ stream_append_u64(&shader_stream, cp->das_acquisition_count);
+ stream_append_s8(&shader_stream, s8("\n"));
+ }break;
+ default:{}break;
+ }
+
stream_append_s8(&shader_stream, s8("\n#line 1\n"));
s8 shader_text = arena_stream_commit(&arena, &shader_stream);
@@ -820,8 +835,8 @@ beamformer_commit_parameter_block(BeamformerCtx *ctx, BeamformerComputePlan *cp,
if (ctx->compute_context.ping_pong_ssbo_size < decoded_data_size)
alloc_shader_storage(ctx, decoded_data_size, arena);
- if (cp->hadamard_order != (i32)cp->das_ubo_data.acquisition_count)
- update_hadamard_texture(cp, (i32)cp->das_ubo_data.acquisition_count, arena);
+ if (cp->hadamard_order != (i32)cp->das_acquisition_count)
+ update_hadamard_texture(cp, (i32)cp->das_acquisition_count, arena);
cp->min_coordinate = v3_from_f32_array(pb->parameters.output_min_coordinate);
cp->max_coordinate = v3_from_f32_array(pb->parameters.output_max_coordinate);
@@ -954,8 +969,6 @@ do_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, BeamformerFrame
case BeamformerShaderKind_DAS:{
local_persist u32 das_cycle_t = 0;
- BeamformerDASUBO *ubo = &cp->das_ubo_data;
-
i32 local_flags = match_vector[shader_descriptor->match_vector_length];
b32 fast = (local_flags & BeamformerShaderDASFlags_Fast) != 0;
b32 sparse = (local_flags & BeamformerShaderDASFlags_Sparse) != 0;
@@ -981,14 +994,14 @@ do_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, BeamformerFrame
if (fast) {
i32 loop_end;
- if (ubo->shader_kind == BeamformerDASKind_RCA_VLS ||
- ubo->shader_kind == BeamformerDASKind_RCA_TPW)
+ if (cp->das_shader_kind == BeamformerDASKind_RCA_VLS ||
+ cp->das_shader_kind == BeamformerDASKind_RCA_TPW)
{
/* NOTE(rnp): to avoid repeatedly sampling the whole focal vectors
* texture we loop over transmits for VLS/TPW */
- loop_end = (i32)ubo->acquisition_count;
+ loop_end = (i32)cp->das_acquisition_count;
} else {
- loop_end = (i32)ubo->channel_count;
+ loop_end = (i32)cp->das_channel_count;
}
f32 percent_per_step = 1.0f / (f32)loop_end;
cc->processing_progress = -percent_per_step;
@@ -1204,8 +1217,8 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena, iptr gl_c
frame->min_coordinate = cp->min_coordinate;
frame->max_coordinate = cp->max_coordinate;
- frame->das_kind = cp->das_ubo_data.shader_kind;
- frame->compound_count = cp->das_ubo_data.acquisition_count;
+ frame->das_kind = cp->das_shader_kind;
+ frame->compound_count = cp->das_acquisition_count;
BeamformerComputeContext *cc = &ctx->compute_context;
BeamformerComputePipeline *pipeline = &cp->pipeline;
diff --git a/beamformer.h b/beamformer.h
@@ -139,12 +139,7 @@ typedef struct {
X(demodulation_frequency, f32, float) \
X(speed_of_sound, f32, float) \
X(time_offset, f32, float) \
- X(f_number, f32, float) \
- X(shader_flags, u32, int) \
- X(shader_kind, u32, uint) \
- X(sample_count, u32, uint) \
- X(channel_count, u32, uint) \
- X(acquisition_count, u32, uint)
+ X(f_number, f32, float)
typedef alignas(16) struct {
#define X(name, type, ...) type name;
@@ -165,6 +160,7 @@ typedef alignas(16) struct {
#define X(name, type, ...) type name;
BEAMFORMER_DAS_UBO_PARAM_LIST
#undef X
+ float _pad[1];
} BeamformerDASUBO;
static_assert((sizeof(BeamformerDASUBO) & 15) == 0, "UBO size must be a multiple of 16");
@@ -228,6 +224,12 @@ struct BeamformerComputePlan {
BEAMFORMER_COMPUTE_UBO_LIST
#undef X
+ u32 das_shader_kind;
+ u32 das_sample_count;
+ u32 das_channel_count;
+ u32 das_acquisition_count;
+ i32 das_shader_flags;
+
BeamformerComputePlan *next;
};
diff --git a/shaders/das.glsl b/shaders/das.glsl
@@ -35,9 +35,10 @@ layout(std430, binding = 1) readonly restrict buffer buffer_1 {
#define RESULT_STORE(a, length_a) RESULT_TYPE(a, length_a)
#endif
-const bool fast = bool(ShaderFlags & ShaderFlags_Fast);
-const bool sparse = bool(ShaderFlags & ShaderFlags_Sparse);
-const bool interpolate = bool(ShaderFlags & ShaderFlags_Interpolate);
+const bool fast = bool(ShaderFlags & ShaderFlags_Fast);
+const bool sparse = bool(ShaderFlags & ShaderFlags_Sparse);
+const bool interpolate = bool(ShaderFlags & ShaderFlags_Interpolate);
+const bool coherency_weighting = bool(ShaderFlags & ShaderFlags_CoherencyWeighting);
#if (ShaderFlags & ShaderFlags_Fast)
layout(TEXTURE_KIND, binding = 0) restrict uniform image3D u_out_data_tex;
@@ -55,7 +56,7 @@ layout(r8i, binding = 3) readonly restrict uniform iimage1D transmit_receive_
#define C_SPLINE 0.5
#if DataKind == DataKind_Float32Complex
-vec2 rotate_iq(vec2 iq, float time)
+vec2 rotate_iq(const vec2 iq, const float time)
{
float arg = radians(360) * demodulation_frequency * time;
mat2 phasor = mat2( cos(arg), sin(arg),
@@ -68,7 +69,7 @@ vec2 rotate_iq(vec2 iq, float time)
#endif
/* NOTE: See: https://cubic.org/docs/hermite.htm */
-SAMPLE_TYPE cubic(int base_index, float index)
+SAMPLE_TYPE cubic(const int base_index, const float index)
{
const mat4 h = mat4(
2, -3, 0, 1,
@@ -101,23 +102,23 @@ SAMPLE_TYPE cubic(int base_index, float index)
return result;
}
-SAMPLE_TYPE sample_rf(int channel, int transmit, float index)
+SAMPLE_TYPE sample_rf(const int channel, const int transmit, const float index)
{
- SAMPLE_TYPE result = SAMPLE_TYPE(index >= 0.0f) * SAMPLE_TYPE((int(index) + 1 + int(interpolate)) < sample_count);
- int base_index = int(channel * sample_count * acquisition_count + transmit * sample_count);
+ SAMPLE_TYPE result = SAMPLE_TYPE(index >= 0.0f) * SAMPLE_TYPE((int(index) + 1 + int(interpolate)) < SampleCount);
+ int base_index = int(channel * SampleCount * AcquisitionCount + transmit * SampleCount);
if (interpolate) result *= cubic(base_index, index);
else result *= rf_data[base_index + int(round(index))];
result = rotate_iq(result, index / sampling_frequency);
return result;
}
-float sample_index(float distance)
+float sample_index(const float distance)
{
float time = distance / speed_of_sound + time_offset;
return time * sampling_frequency;
}
-float apodize(float arg)
+float apodize(const float arg)
{
/* NOTE: used for constant F# dynamic receive apodization. This is implemented as:
*
@@ -130,24 +131,25 @@ float apodize(float arg)
return a * a;
}
-vec2 rca_plane_projection(vec3 point, bool rows)
+vec2 rca_plane_projection(const vec3 point, const bool rows)
{
vec2 result = vec2(point[int(rows)], point[2]);
return result;
}
-float plane_wave_transmit_distance(vec3 point, float transmit_angle, bool tx_rows)
+float plane_wave_transmit_distance(const vec3 point, const float transmit_angle, const bool tx_rows)
{
return dot(rca_plane_projection(point, tx_rows), vec2(sin(transmit_angle), cos(transmit_angle)));
}
-float cylindrical_wave_transmit_distance(vec3 point, float focal_depth, float transmit_angle, bool tx_rows)
+float cylindrical_wave_transmit_distance(const vec3 point, const float focal_depth,
+ const float transmit_angle, const bool tx_rows)
{
vec2 f = focal_depth * vec2(sin(transmit_angle), cos(transmit_angle));
return distance(rca_plane_projection(point, tx_rows), f);
}
-float rca_transmit_distance(vec3 world_point, vec2 focal_vector, int transmit_receive_orientation)
+float rca_transmit_distance(const vec3 world_point, const vec2 focal_vector, const int transmit_receive_orientation)
{
bool tx_rows = (transmit_receive_orientation & TX_ORIENTATION_MASK) == 0;
float transmit_angle = radians(focal_vector.x);
@@ -162,8 +164,10 @@ float rca_transmit_distance(vec3 world_point, vec2 focal_vector, int transmit_re
return result;
}
-RESULT_TYPE RCA_acquisition_range(vec3 world_point, int acquisition_start, int acquisition_end)
+RESULT_TYPE RCA(const vec3 world_point)
{
+ const int acquisition_start = fast? u_channel : 0;
+ const int acquisition_end = fast? u_channel + 1 : AcquisitionCount;
RESULT_TYPE result = RESULT_TYPE(0);
for (int acquisition = acquisition_start; acquisition < acquisition_end; acquisition++) {
int transmit_receive_orientation = imageLoad(transmit_receive_orientations, acquisition).x;
@@ -172,7 +176,7 @@ RESULT_TYPE RCA_acquisition_range(vec3 world_point, int acquisition_start, int a
float transmit_distance = rca_transmit_distance(world_point, imageLoad(focal_vectors, acquisition).xy,
transmit_receive_orientation);
- for (int rx_channel = 0; rx_channel < channel_count; rx_channel++) {
+ for (int rx_channel = 0; rx_channel < ChannelCount; rx_channel++) {
vec3 rx_center = vec3(rx_channel * xdc_element_pitch, 0);
vec2 receive_vector = xdc_world_point - rca_plane_projection(rx_center, rx_rows);
float apodization = apodize(f_number * radians(180) / abs(xdc_world_point.y) * receive_vector.x);
@@ -187,14 +191,11 @@ RESULT_TYPE RCA_acquisition_range(vec3 world_point, int acquisition_start, int a
return result;
}
-RESULT_TYPE RCA(vec3 world_point)
+RESULT_TYPE HERCULES(const vec3 world_point)
{
- if (fast) return RESULT_TYPE_CAST(RCA_acquisition_range(world_point, u_channel, u_channel + 1));
- else return RESULT_TYPE_CAST(RCA_acquisition_range(world_point, 0, int(acquisition_count)));
-}
+ const int rx_channel_start = fast? u_channel : 0;
+ const int rx_channel_end = fast? u_channel + 1 : ChannelCount;
-RESULT_TYPE HERCULES_receive_channel_range(vec3 world_point, int rx_channel_start, int rx_channel_end)
-{
int transmit_receive_orientation = imageLoad(transmit_receive_orientations, 0).x;
vec3 xdc_world_point = (xdc_transform * vec4(world_point, 1)).xyz;
bool rx_cols = (transmit_receive_orientation & RX_ORIENTATION_MASK) != 0;
@@ -202,7 +203,7 @@ RESULT_TYPE HERCULES_receive_channel_range(vec3 world_point, int rx_channel_star
transmit_receive_orientation);
RESULT_TYPE result = RESULT_TYPE(0);
- for (int transmit = int(sparse); transmit < acquisition_count; transmit++) {
+ for (int transmit = int(sparse); transmit < AcquisitionCount; transmit++) {
int tx_channel = sparse ? imageLoad(sparse_elements, transmit - int(sparse)).x : transmit;
for (int rx_channel = rx_channel_start; rx_channel < rx_channel_end; rx_channel++) {
vec3 element_position;
@@ -213,7 +214,7 @@ RESULT_TYPE HERCULES_receive_channel_range(vec3 world_point, int rx_channel_star
distance(xdc_world_point.xy, element_position.xy));
if (apodization > 0) {
/* NOTE: tribal knowledge */
- if (transmit == 0) apodization *= inversesqrt(acquisition_count);
+ if (transmit == 0) apodization *= inversesqrt(AcquisitionCount);
float sidx = sample_index(transmit_distance + distance(xdc_world_point, element_position));
SAMPLE_TYPE value = apodization * sample_rf(rx_channel, transmit, sidx);
@@ -224,14 +225,11 @@ RESULT_TYPE HERCULES_receive_channel_range(vec3 world_point, int rx_channel_star
return result;
}
-RESULT_TYPE HERCULES(vec3 world_point)
+RESULT_TYPE FORCES(const vec3 world_point)
{
- if (fast) return RESULT_TYPE_CAST(HERCULES_receive_channel_range(world_point, u_channel, u_channel + 1));
- else return RESULT_TYPE_CAST(HERCULES_receive_channel_range(world_point, 0, int(channel_count)));
-}
+ const int rx_channel_start = fast? u_channel : 0;
+ const int rx_channel_end = fast? u_channel + 1 : ChannelCount;
-RESULT_TYPE FORCES_receive_channel_range(vec3 world_point, int rx_channel_start, int rx_channel_end)
-{
RESULT_TYPE result = RESULT_TYPE(0);
vec3 xdc_world_point = (xdc_transform * vec4(world_point, 1)).xyz;
for (int rx_channel = rx_channel_start; rx_channel < rx_channel_end; rx_channel++) {
@@ -239,9 +237,9 @@ RESULT_TYPE FORCES_receive_channel_range(vec3 world_point, int rx_channel_start,
float apodization = apodize(f_number * radians(180) / abs(xdc_world_point.z) *
(xdc_world_point.x - rx_channel * xdc_element_pitch.x));
if (apodization > 0) {
- for (int transmit = int(sparse); transmit < acquisition_count; transmit++) {
+ for (int transmit = int(sparse); transmit < AcquisitionCount; transmit++) {
int tx_channel = sparse ? imageLoad(sparse_elements, transmit - int(sparse)).x : transmit;
- vec3 transmit_center = vec3(xdc_element_pitch * vec2(tx_channel, floor(channel_count / 2)), 0);
+ vec3 transmit_center = vec3(xdc_element_pitch * vec2(tx_channel, floor(ChannelCount / 2)), 0);
float sidx = sample_index(distance(xdc_world_point, transmit_center) + receive_distance);
SAMPLE_TYPE value = apodization * sample_rf(rx_channel, transmit, sidx);
@@ -252,12 +250,6 @@ RESULT_TYPE FORCES_receive_channel_range(vec3 world_point, int rx_channel_start,
return result;
}
-RESULT_TYPE FORCES(vec3 world_point)
-{
- if (fast) return RESULT_TYPE_CAST(FORCES_receive_channel_range(world_point, u_channel, u_channel + 1));
- else return RESULT_TYPE_CAST(FORCES_receive_channel_range(world_point, 0, int(channel_count)));
-}
-
void main()
{
ivec3 out_voxel = ivec3(gl_GlobalInvocationID);
@@ -273,7 +265,7 @@ void main()
vec3 world_point = (voxel_transform * vec4(out_voxel, 1)).xyz;
- switch (shader_kind) {
+ switch (ShaderKind) {
case ShaderKind_FORCES:
case ShaderKind_UFORCES:
{
@@ -294,7 +286,7 @@ void main()
#if (ShaderFlags & ShaderFlags_Fast) == 0
/* TODO(rnp): scale such that brightness remains ~constant */
- if (bool(shader_flags & ShaderFlags_CoherencyWeighting)) {
+ if (coherency_weighting) {
float denominator = sum[RESULT_LAST_INDEX] + float(sum[RESULT_LAST_INDEX] == 0);
RESULT_TYPE_CAST(sum) *= RESULT_TYPE_CAST(sum) / denominator;
}