shader/das: first pass at baking exact (integer) parameters - ogl_beamforming - Ultrasound Beamforming Implemented with OpenGL

Commit: f815e8bb2229b323096765e1d00be10d0d0c57e2
Parent: 54effe8d17a212b7594c4c6a6d942286e2ea41e3
Author: Randy Palamar
Date:   Tue, 23 Sep 2025 19:35:57 -0600

shader/das: first pass at baking exact (integer) parameters

this gives performance boosts as high as 10% for FORCES. while I'm
typically against const, it seems that including it on function
parameters ensures that actual compile time constants carry
through correctly without additional register usage

Diffstat:
M beamformer.c  | 109 ++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
M beamformer.h  | 14 ++++++++------
M shaders/das.glsl  | 72 ++++++++++++++++++++++++++++++++----------------------------------------

3 files changed, 101 insertions(+), 94 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -1,13 +1,10 @@
 /* See LICENSE for license details. */
 /* TODO(rnp):
- * [ ]: do JIT compilation of shaders
- *      - a larger subset of parameters can be made into compile time constants
- *      - preallocated storage for shaders is minimized
- *      - loops over TX and RX count can be unrolled
- *      - hot reload can still be trivially supported:
- *        - loop over shaders for the current pipeline
- *        - check if the base shader matches the shader we are trying to reload
- *        - load header and append constants which are stored in the pipeline parameters
+ * [ ]: refactor: fancier hot reloading for JIT shaders
+ *      - loop over all active blocks
+          - loop over shader sets per block
+ *      - when match found reload it
+ * [ ]: refactor: move shader bake parameters into meta code
  * [ ]: measure performance of doing channel mapping in a separate shader
  * [ ]: BeamformWorkQueue -> BeamformerWorkQueue
  * [ ]: need to keep track of gpu memory in some way
@@ -445,7 +442,7 @@ das_voxel_transform_matrix(BeamformerParameters *bp)
 }
 
 function void
-das_ubo_from_beamformer_parameters(BeamformerDASUBO *du, BeamformerParameters *bp)
+das_ubo_from_beamformer_parameters(BeamformerComputePlan *cp, BeamformerDASUBO *du, BeamformerParameters *bp)
 {
 	du->voxel_transform = das_voxel_transform_matrix(bp);
 	mem_copy(du->xdc_transform.E,     bp->xdc_transform,     sizeof(du->xdc_transform));
@@ -455,13 +452,14 @@ das_ubo_from_beamformer_parameters(BeamformerDASUBO *du, BeamformerParameters *b
 	du->speed_of_sound         = bp->speed_of_sound;
 	du->time_offset            = bp->time_offset;
 	du->f_number               = bp->f_number;
-	du->shader_kind            = bp->das_shader_id;
-	du->sample_count           = bp->sample_count;
-	du->channel_count          = bp->channel_count;
-	du->acquisition_count      = bp->acquisition_count;
 
-	du->shader_flags = 0;
-	if (bp->coherency_weighting) du->shader_flags |= BeamformerShaderDASFlags_CoherencyWeighting;
+	cp->das_shader_kind        = bp->das_shader_id;
+	cp->das_sample_count       = bp->sample_count;
+	cp->das_channel_count      = bp->channel_count;
+	cp->das_acquisition_count  = bp->acquisition_count;
+
+	cp->das_shader_flags = 0;
+	if (bp->coherency_weighting) cp->das_shader_flags |= BeamformerShaderDASFlags_CoherencyWeighting;
 }
 
 function void
@@ -469,7 +467,7 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 {
 	BeamformerDASUBO *bp = &cp->das_ubo_data;
 
-	das_ubo_from_beamformer_parameters(bp, &pb->parameters);
+	das_ubo_from_beamformer_parameters(cp, bp, &pb->parameters);
 
 	b32 decode_first = pb->pipeline.shaders[0] == BeamformerShaderKind_Decode;
 	b32 run_cuda_hilbert = 0;
@@ -546,9 +544,9 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 				das_data_kind = BeamformerDataKind_Float32Complex;
 
 			i32 local_flags = 0;
-			if ((bp->shader_flags & BeamformerShaderDASFlags_CoherencyWeighting) == 0)
+			if ((cp->das_shader_flags & BeamformerShaderDASFlags_CoherencyWeighting) == 0)
 				local_flags |= BeamformerShaderDASFlags_Fast;
-			if (bp->shader_kind == BeamformerDASKind_UFORCES || bp->shader_kind == BeamformerDASKind_UHERCULES)
+			if (cp->das_shader_kind == BeamformerDASKind_UFORCES || cp->das_shader_kind == BeamformerDASKind_UHERCULES)
 				local_flags |= BeamformerShaderDASFlags_Sparse;
 			if (pb->parameters.interpolate)
 				local_flags |= BeamformerShaderDASFlags_Interpolate;
@@ -572,8 +570,8 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 	cp->pipeline.data_kind = data_kind;
 
 	u32 das_sample_stride   = 1;
-	u32 das_transmit_stride = bp->sample_count;
-	u32 das_channel_stride  = bp->acquisition_count * bp->sample_count;
+	u32 das_transmit_stride = cp->das_sample_count;
+	u32 das_channel_stride  = cp->das_acquisition_count * cp->das_sample_count;
 
 	u32 decimation_rate = MAX(pb->parameters.decimation_rate, 1);
 	if (demodulate) {
@@ -582,14 +580,14 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 	}
 
 	u32 input_sample_stride   = 1;
-	u32 input_transmit_stride = bp->sample_count;
+	u32 input_transmit_stride = cp->das_sample_count;
 	u32 input_channel_stride  = pb->parameters.raw_data_dimensions[0];
 
 	BeamformerDecodeUBO *dp = &cp->decode_ubo_data;
 	dp->decode_mode    = pb->parameters.decode;
-	dp->transmit_count = bp->acquisition_count;
+	dp->transmit_count = cp->das_acquisition_count;
 
-	dp->input_sample_stride    = decode_first? input_sample_stride   : bp->acquisition_count;
+	dp->input_sample_stride    = decode_first? input_sample_stride   : cp->das_acquisition_count;
 	dp->input_channel_stride   = decode_first? input_channel_stride  : das_channel_stride;
 	dp->input_transmit_stride  = decode_first? input_transmit_stride : 1;
 	dp->output_sample_stride   = das_sample_stride;
@@ -600,9 +598,9 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 		dp->output_transmit_stride *= decimation_rate;
 	}
 
-	cp->decode_dispatch.x = (u32)ceil_f32((f32)bp->sample_count      / DECODE_LOCAL_SIZE_X);
-	cp->decode_dispatch.y = (u32)ceil_f32((f32)bp->channel_count     / DECODE_LOCAL_SIZE_Y);
-	cp->decode_dispatch.z = (u32)ceil_f32((f32)bp->acquisition_count / DECODE_LOCAL_SIZE_Z);
+	cp->decode_dispatch.x = (u32)ceil_f32((f32)cp->das_sample_count      / DECODE_LOCAL_SIZE_X);
+	cp->decode_dispatch.y = (u32)ceil_f32((f32)cp->das_channel_count     / DECODE_LOCAL_SIZE_Y);
+	cp->decode_dispatch.z = (u32)ceil_f32((f32)cp->das_acquisition_count / DECODE_LOCAL_SIZE_Z);
 
 	/* NOTE(rnp): decode 2 samples per dispatch when data is i16 */
 	if (decode_first && data_kind == BeamformerDataKind_Int16)
@@ -624,7 +622,7 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 		mp->decimation_rate        = decimation_rate;
 
 		bp->sampling_frequency /= 2 * (f32)mp->decimation_rate;
-		bp->sample_count       /= 2 * mp->decimation_rate;
+		cp->das_sample_count   /= 2 * mp->decimation_rate;
 
 		if (decode_first) {
 			mp->input_channel_stride  = dp->output_channel_stride;
@@ -644,16 +642,16 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 			mp->output_sample_stride   = dp->input_sample_stride;
 			mp->output_transmit_stride = dp->input_transmit_stride;
 
-			cp->decode_dispatch.x = (u32)ceil_f32((f32)bp->sample_count / DECODE_LOCAL_SIZE_X);
+			cp->decode_dispatch.x = (u32)ceil_f32((f32)cp->das_sample_count / DECODE_LOCAL_SIZE_X);
 		}
 	}
 
 	/* TODO(rnp): filter may need a different dispatch layout */
-	cp->demod_dispatch.x = (u32)ceil_f32((f32)bp->sample_count      / FILTER_LOCAL_SIZE_X);
-	cp->demod_dispatch.y = (u32)ceil_f32((f32)bp->channel_count     / FILTER_LOCAL_SIZE_Y);
-	cp->demod_dispatch.z = (u32)ceil_f32((f32)bp->acquisition_count / FILTER_LOCAL_SIZE_Z);
+	cp->demod_dispatch.x = (u32)ceil_f32((f32)cp->das_sample_count      / FILTER_LOCAL_SIZE_X);
+	cp->demod_dispatch.y = (u32)ceil_f32((f32)cp->das_channel_count     / FILTER_LOCAL_SIZE_Y);
+	cp->demod_dispatch.z = (u32)ceil_f32((f32)cp->das_acquisition_count / FILTER_LOCAL_SIZE_Z);
 
-	cp->rf_size = bp->sample_count * bp->channel_count * bp->acquisition_count;
+	cp->rf_size = cp->das_sample_count * cp->das_channel_count * cp->das_acquisition_count;
 	if (demodulate || run_cuda_hilbert) cp->rf_size *= 8;
 	else                                cp->rf_size *= 4;
 
@@ -662,12 +660,12 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
 	flt->demodulation_frequency = bp->demodulation_frequency;
 	flt->sampling_frequency     = bp->sampling_frequency;
 	flt->decimation_rate        = 1;
-	flt->output_channel_stride  = bp->sample_count * bp->acquisition_count;
+	flt->output_channel_stride  = cp->das_sample_count * cp->das_acquisition_count;
 	flt->output_sample_stride   = 1;
-	flt->output_transmit_stride = bp->sample_count;
-	flt->input_channel_stride   = bp->sample_count * bp->acquisition_count;
+	flt->output_transmit_stride = cp->das_sample_count;
+	flt->input_channel_stride   = cp->das_sample_count * cp->das_acquisition_count;
 	flt->input_sample_stride    = 1;
-	flt->input_transmit_stride  = bp->sample_count;
+	flt->input_transmit_stride  = cp->das_sample_count;
 }
 
 function void
@@ -741,7 +739,6 @@ load_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, u32 shader_sl
 	BeamformerShaderKind        shader = cp->pipeline.shaders[shader_slot];
 	BeamformerShaderDescriptor *sd     = beamformer_shader_descriptors + shader;
 
-
 	u32 program          = 0;
 	i32 reloadable_index = beamformer_shader_reloadable_index_by_shader[shader];
 	if (reloadable_index != -1) {
@@ -767,10 +764,28 @@ load_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, u32 shader_sl
 
 		if (sd->has_local_flags) {
 			stream_append_s8(&shader_stream, s8("#define ShaderFlags (0x"));
-			stream_append_hex_u64(&shader_stream, (u64)match_vector[sd->match_vector_length]);
+			i32 flags = match_vector[sd->match_vector_length];
+			if (shader == BeamformerShaderKind_DAS) flags |= cp->das_shader_flags;
+			stream_append_hex_u64(&shader_stream, (u64)flags);
 			stream_append_s8(&shader_stream, s8(")\n"));
 		}
 
+		/* TODO(rnp): generate this */
+		switch (base_shader) {
+		case BeamformerShaderKind_DAS:{
+			stream_append_s8(&shader_stream, s8("#define ShaderKind "));
+			stream_append_u64(&shader_stream, cp->das_shader_kind);
+			stream_append_s8(&shader_stream, s8("\n#define SampleCount "));
+			stream_append_u64(&shader_stream, cp->das_sample_count);
+			stream_append_s8(&shader_stream, s8("\n#define ChannelCount "));
+			stream_append_u64(&shader_stream, cp->das_channel_count);
+			stream_append_s8(&shader_stream, s8("\n#define AcquisitionCount "));
+			stream_append_u64(&shader_stream, cp->das_acquisition_count);
+			stream_append_s8(&shader_stream, s8("\n"));
+		}break;
+		default:{}break;
+		}
+
 		stream_append_s8(&shader_stream, s8("\n#line 1\n"));
 
 		s8 shader_text = arena_stream_commit(&arena, &shader_stream);
@@ -820,8 +835,8 @@ beamformer_commit_parameter_block(BeamformerCtx *ctx, BeamformerComputePlan *cp,
 			if (ctx->compute_context.ping_pong_ssbo_size < decoded_data_size)
 				alloc_shader_storage(ctx, decoded_data_size, arena);
 
-			if (cp->hadamard_order != (i32)cp->das_ubo_data.acquisition_count)
-				update_hadamard_texture(cp, (i32)cp->das_ubo_data.acquisition_count, arena);
+			if (cp->hadamard_order != (i32)cp->das_acquisition_count)
+				update_hadamard_texture(cp, (i32)cp->das_acquisition_count, arena);
 
 			cp->min_coordinate = v3_from_f32_array(pb->parameters.output_min_coordinate);
 			cp->max_coordinate = v3_from_f32_array(pb->parameters.output_max_coordinate);
@@ -954,8 +969,6 @@ do_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, BeamformerFrame
 	case BeamformerShaderKind_DAS:{
 		local_persist u32 das_cycle_t = 0;
 
-		BeamformerDASUBO *ubo = &cp->das_ubo_data;
-
 		i32 local_flags = match_vector[shader_descriptor->match_vector_length];
 		b32 fast        = (local_flags & BeamformerShaderDASFlags_Fast)   != 0;
 		b32 sparse      = (local_flags & BeamformerShaderDASFlags_Sparse) != 0;
@@ -981,14 +994,14 @@ do_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, BeamformerFrame
 
 		if (fast) {
 			i32 loop_end;
-			if (ubo->shader_kind == BeamformerDASKind_RCA_VLS ||
-			    ubo->shader_kind == BeamformerDASKind_RCA_TPW)
+			if (cp->das_shader_kind == BeamformerDASKind_RCA_VLS ||
+			    cp->das_shader_kind == BeamformerDASKind_RCA_TPW)
 			{
 				/* NOTE(rnp): to avoid repeatedly sampling the whole focal vectors
 				 * texture we loop over transmits for VLS/TPW */
-				loop_end = (i32)ubo->acquisition_count;
+				loop_end = (i32)cp->das_acquisition_count;
 			} else {
-				loop_end = (i32)ubo->channel_count;
+				loop_end = (i32)cp->das_channel_count;
 			}
 			f32 percent_per_step = 1.0f / (f32)loop_end;
 			cc->processing_progress = -percent_per_step;
@@ -1204,8 +1217,8 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena *arena, iptr gl_c
 
 			frame->min_coordinate  = cp->min_coordinate;
 			frame->max_coordinate  = cp->max_coordinate;
-			frame->das_kind        = cp->das_ubo_data.shader_kind;
-			frame->compound_count  = cp->das_ubo_data.acquisition_count;
+			frame->das_kind        = cp->das_shader_kind;
+			frame->compound_count  = cp->das_acquisition_count;
 
 			BeamformerComputeContext  *cc       = &ctx->compute_context;
 			BeamformerComputePipeline *pipeline = &cp->pipeline;
diff --git a/beamformer.h b/beamformer.h
@@ -139,12 +139,7 @@ typedef struct {
 	X(demodulation_frequency, f32, float) \
 	X(speed_of_sound,         f32, float) \
 	X(time_offset,            f32, float) \
-	X(f_number,               f32, float) \
-	X(shader_flags,           u32, int)   \
-	X(shader_kind,            u32, uint)  \
-	X(sample_count,           u32, uint)  \
-	X(channel_count,          u32, uint)  \
-	X(acquisition_count,      u32, uint)
+	X(f_number,               f32, float)
 
 typedef alignas(16) struct {
 	#define X(name, type, ...) type name;
@@ -165,6 +160,7 @@ typedef alignas(16) struct {
 	#define X(name, type, ...) type name;
 	BEAMFORMER_DAS_UBO_PARAM_LIST
 	#undef X
+	float _pad[1];
 } BeamformerDASUBO;
 static_assert((sizeof(BeamformerDASUBO) & 15) == 0, "UBO size must be a multiple of 16");
 
@@ -228,6 +224,12 @@ struct BeamformerComputePlan {
 	BEAMFORMER_COMPUTE_UBO_LIST
 	#undef X
 
+	u32 das_shader_kind;
+	u32 das_sample_count;
+	u32 das_channel_count;
+	u32 das_acquisition_count;
+	i32 das_shader_flags;
+
 	BeamformerComputePlan *next;
 };
 
diff --git a/shaders/das.glsl b/shaders/das.glsl
@@ -35,9 +35,10 @@ layout(std430, binding = 1) readonly restrict buffer buffer_1 {
   #define RESULT_STORE(a, length_a) RESULT_TYPE(a, length_a)
 #endif
 
-const bool fast        = bool(ShaderFlags & ShaderFlags_Fast);
-const bool sparse      = bool(ShaderFlags & ShaderFlags_Sparse);
-const bool interpolate = bool(ShaderFlags & ShaderFlags_Interpolate);
+const bool fast                = bool(ShaderFlags & ShaderFlags_Fast);
+const bool sparse              = bool(ShaderFlags & ShaderFlags_Sparse);
+const bool interpolate         = bool(ShaderFlags & ShaderFlags_Interpolate);
+const bool coherency_weighting = bool(ShaderFlags & ShaderFlags_CoherencyWeighting);
 
 #if (ShaderFlags & ShaderFlags_Fast)
 layout(TEXTURE_KIND, binding = 0)           restrict uniform image3D  u_out_data_tex;
@@ -55,7 +56,7 @@ layout(r8i,   binding = 3) readonly  restrict uniform iimage1D transmit_receive_
 #define C_SPLINE 0.5
 
 #if DataKind == DataKind_Float32Complex
-vec2 rotate_iq(vec2 iq, float time)
+vec2 rotate_iq(const vec2 iq, const float time)
 {
 	float arg    = radians(360) * demodulation_frequency * time;
 	mat2  phasor = mat2( cos(arg), sin(arg),
@@ -68,7 +69,7 @@ vec2 rotate_iq(vec2 iq, float time)
 #endif
 
 /* NOTE: See: https://cubic.org/docs/hermite.htm */
-SAMPLE_TYPE cubic(int base_index, float index)
+SAMPLE_TYPE cubic(const int base_index, const float index)
 {
 	const mat4 h = mat4(
 		 2, -3,  0, 1,
@@ -101,23 +102,23 @@ SAMPLE_TYPE cubic(int base_index, float index)
 	return result;
 }
 
-SAMPLE_TYPE sample_rf(int channel, int transmit, float index)
+SAMPLE_TYPE sample_rf(const int channel, const int transmit, const float index)
 {
-	SAMPLE_TYPE result = SAMPLE_TYPE(index >= 0.0f) * SAMPLE_TYPE((int(index) + 1 + int(interpolate)) < sample_count);
-	int base_index = int(channel * sample_count * acquisition_count + transmit * sample_count);
+	SAMPLE_TYPE result = SAMPLE_TYPE(index >= 0.0f) * SAMPLE_TYPE((int(index) + 1 + int(interpolate)) < SampleCount);
+	int base_index = int(channel * SampleCount * AcquisitionCount + transmit * SampleCount);
 	if (interpolate) result *= cubic(base_index, index);
 	else             result *= rf_data[base_index + int(round(index))];
 	result = rotate_iq(result, index / sampling_frequency);
 	return result;
 }
 
-float sample_index(float distance)
+float sample_index(const float distance)
 {
 	float  time = distance / speed_of_sound + time_offset;
 	return time * sampling_frequency;
 }
 
-float apodize(float arg)
+float apodize(const float arg)
 {
 	/* NOTE: used for constant F# dynamic receive apodization. This is implemented as:
 	 *
@@ -130,24 +131,25 @@ float apodize(float arg)
 	return a * a;
 }
 
-vec2 rca_plane_projection(vec3 point, bool rows)
+vec2 rca_plane_projection(const vec3 point, const bool rows)
 {
 	vec2 result = vec2(point[int(rows)], point[2]);
 	return result;
 }
 
-float plane_wave_transmit_distance(vec3 point, float transmit_angle, bool tx_rows)
+float plane_wave_transmit_distance(const vec3 point, const float transmit_angle, const bool tx_rows)
 {
 	return dot(rca_plane_projection(point, tx_rows), vec2(sin(transmit_angle), cos(transmit_angle)));
 }
 
-float cylindrical_wave_transmit_distance(vec3 point, float focal_depth, float transmit_angle, bool tx_rows)
+float cylindrical_wave_transmit_distance(const vec3 point, const float focal_depth,
+                                         const float transmit_angle, const bool tx_rows)
 {
 	vec2 f = focal_depth * vec2(sin(transmit_angle), cos(transmit_angle));
 	return distance(rca_plane_projection(point, tx_rows), f);
 }
 
-float rca_transmit_distance(vec3 world_point, vec2 focal_vector, int transmit_receive_orientation)
+float rca_transmit_distance(const vec3 world_point, const vec2 focal_vector, const int transmit_receive_orientation)
 {
 	bool  tx_rows        = (transmit_receive_orientation & TX_ORIENTATION_MASK) == 0;
 	float transmit_angle = radians(focal_vector.x);
@@ -162,8 +164,10 @@ float rca_transmit_distance(vec3 world_point, vec2 focal_vector, int transmit_re
 	return result;
 }
 
-RESULT_TYPE RCA_acquisition_range(vec3 world_point, int acquisition_start, int acquisition_end)
+RESULT_TYPE RCA(const vec3 world_point)
 {
+	const int acquisition_start = fast? u_channel     : 0;
+	const int acquisition_end   = fast? u_channel + 1 : AcquisitionCount;
 	RESULT_TYPE result = RESULT_TYPE(0);
 	for (int acquisition = acquisition_start; acquisition < acquisition_end; acquisition++) {
 		int   transmit_receive_orientation = imageLoad(transmit_receive_orientations, acquisition).x;
@@ -172,7 +176,7 @@ RESULT_TYPE RCA_acquisition_range(vec3 world_point, int acquisition_start, int a
 		float transmit_distance = rca_transmit_distance(world_point, imageLoad(focal_vectors, acquisition).xy,
 		                                                transmit_receive_orientation);
 
-		for (int rx_channel = 0; rx_channel < channel_count; rx_channel++) {
+		for (int rx_channel = 0; rx_channel < ChannelCount; rx_channel++) {
 			vec3  rx_center      = vec3(rx_channel * xdc_element_pitch, 0);
 			vec2  receive_vector = xdc_world_point - rca_plane_projection(rx_center, rx_rows);
 			float apodization    = apodize(f_number * radians(180) / abs(xdc_world_point.y) * receive_vector.x);
@@ -187,14 +191,11 @@ RESULT_TYPE RCA_acquisition_range(vec3 world_point, int acquisition_start, int a
 	return result;
 }
 
-RESULT_TYPE RCA(vec3 world_point)
+RESULT_TYPE HERCULES(const vec3 world_point)
 {
-	if (fast) return RESULT_TYPE_CAST(RCA_acquisition_range(world_point, u_channel, u_channel + 1));
-	else      return RESULT_TYPE_CAST(RCA_acquisition_range(world_point, 0,         int(acquisition_count)));
-}
+	const int rx_channel_start = fast? u_channel     : 0;
+	const int rx_channel_end   = fast? u_channel + 1 : ChannelCount;
 
-RESULT_TYPE HERCULES_receive_channel_range(vec3 world_point, int rx_channel_start, int rx_channel_end)
-{
 	int   transmit_receive_orientation = imageLoad(transmit_receive_orientations, 0).x;
 	vec3  xdc_world_point   = (xdc_transform * vec4(world_point, 1)).xyz;
 	bool  rx_cols           = (transmit_receive_orientation & RX_ORIENTATION_MASK) != 0;
@@ -202,7 +203,7 @@ RESULT_TYPE HERCULES_receive_channel_range(vec3 world_point, int rx_channel_star
 	                                                transmit_receive_orientation);
 
 	RESULT_TYPE result = RESULT_TYPE(0);
-	for (int transmit = int(sparse); transmit < acquisition_count; transmit++) {
+	for (int transmit = int(sparse); transmit < AcquisitionCount; transmit++) {
 		int tx_channel = sparse ? imageLoad(sparse_elements, transmit - int(sparse)).x : transmit;
 		for (int rx_channel = rx_channel_start; rx_channel < rx_channel_end; rx_channel++) {
 			vec3 element_position;
@@ -213,7 +214,7 @@ RESULT_TYPE HERCULES_receive_channel_range(vec3 world_point, int rx_channel_star
 			                            distance(xdc_world_point.xy, element_position.xy));
 			if (apodization > 0) {
 				/* NOTE: tribal knowledge */
-				if (transmit == 0) apodization *= inversesqrt(acquisition_count);
+				if (transmit == 0) apodization *= inversesqrt(AcquisitionCount);
 
 				float       sidx  = sample_index(transmit_distance + distance(xdc_world_point, element_position));
 				SAMPLE_TYPE value = apodization * sample_rf(rx_channel, transmit, sidx);
@@ -224,14 +225,11 @@ RESULT_TYPE HERCULES_receive_channel_range(vec3 world_point, int rx_channel_star
 	return result;
 }
 
-RESULT_TYPE HERCULES(vec3 world_point)
+RESULT_TYPE FORCES(const vec3 world_point)
 {
-	if (fast) return RESULT_TYPE_CAST(HERCULES_receive_channel_range(world_point, u_channel, u_channel + 1));
-	else      return RESULT_TYPE_CAST(HERCULES_receive_channel_range(world_point, 0,         int(channel_count)));
-}
+	const int rx_channel_start = fast? u_channel     : 0;
+	const int rx_channel_end   = fast? u_channel + 1 : ChannelCount;
 
-RESULT_TYPE FORCES_receive_channel_range(vec3 world_point, int rx_channel_start, int rx_channel_end)
-{
 	RESULT_TYPE result = RESULT_TYPE(0);
 	vec3 xdc_world_point = (xdc_transform * vec4(world_point, 1)).xyz;
 	for (int rx_channel = rx_channel_start; rx_channel < rx_channel_end; rx_channel++) {
@@ -239,9 +237,9 @@ RESULT_TYPE FORCES_receive_channel_range(vec3 world_point, int rx_channel_start,
 		float apodization      = apodize(f_number * radians(180) / abs(xdc_world_point.z) *
 		                                 (xdc_world_point.x - rx_channel * xdc_element_pitch.x));
 		if (apodization > 0) {
-			for (int transmit = int(sparse); transmit < acquisition_count; transmit++) {
+			for (int transmit = int(sparse); transmit < AcquisitionCount; transmit++) {
 				int   tx_channel      = sparse ? imageLoad(sparse_elements, transmit - int(sparse)).x : transmit;
-				vec3  transmit_center = vec3(xdc_element_pitch * vec2(tx_channel, floor(channel_count / 2)), 0);
+				vec3  transmit_center = vec3(xdc_element_pitch * vec2(tx_channel, floor(ChannelCount / 2)), 0);
 
 				float       sidx  = sample_index(distance(xdc_world_point, transmit_center) + receive_distance);
 				SAMPLE_TYPE value = apodization * sample_rf(rx_channel, transmit, sidx);
@@ -252,12 +250,6 @@ RESULT_TYPE FORCES_receive_channel_range(vec3 world_point, int rx_channel_start,
 	return result;
 }
 
-RESULT_TYPE FORCES(vec3 world_point)
-{
-	if (fast) return RESULT_TYPE_CAST(FORCES_receive_channel_range(world_point, u_channel, u_channel + 1));
-	else      return RESULT_TYPE_CAST(FORCES_receive_channel_range(world_point, 0,         int(channel_count)));
-}
-
 void main()
 {
 	ivec3 out_voxel = ivec3(gl_GlobalInvocationID);
@@ -273,7 +265,7 @@ void main()
 
 	vec3 world_point = (voxel_transform * vec4(out_voxel, 1)).xyz;
 
-	switch (shader_kind) {
+	switch (ShaderKind) {
 	case ShaderKind_FORCES:
 	case ShaderKind_UFORCES:
 	{
@@ -294,7 +286,7 @@ void main()
 
 	#if (ShaderFlags & ShaderFlags_Fast) == 0
 	/* TODO(rnp): scale such that brightness remains ~constant */
-	if (bool(shader_flags & ShaderFlags_CoherencyWeighting)) {
+	if (coherency_weighting) {
 		float denominator = sum[RESULT_LAST_INDEX] + float(sum[RESULT_LAST_INDEX] == 0);
 		RESULT_TYPE_CAST(sum) *= RESULT_TYPE_CAST(sum) / denominator;
 	}

M	beamformer.c	\|	109	++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
M	beamformer.h	\|	14	++++++++------
M	shaders/das.glsl	\|	72	++++++++++++++++++++++++++++++++----------------------------------------