combine hercules/uforces shaders into an uber DAS shader - ogl_beamforming - Ultrasound Beamforming Implemented with OpenGL

Commit: 748691e7a7ed87e3a36ea6e0a45f0e42a1a61157
Parent: 10bb1639c2f9563874aeb9702f2473b7dd0bdffa
Author: Randy Palamar
Date:   Thu, 21 Nov 2024 12:52:13 -0700

combine hercules/uforces shaders into an uber DAS shader

this makes it easier to compare methods and share code

Diffstat:
M beamformer.c  | 7 ++-----
M beamformer.h  | 14 +++++++-------
M beamformer_parameters.h  | 19 ++++++++++++++-----
M helpers/ogl_beamformer_lib.c  | 3 +--
A shaders/das.glsl  | 200 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D shaders/hercules.glsl  | 137 -------------------------------------------------------------------------------
D shaders/uforces.glsl  | 116 -------------------------------------------------------------------------------
M static.c  | 10 ++++------
M ui.c  | 3 +--

9 files changed, 229 insertions(+), 280 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -418,8 +418,7 @@ do_compute_shader(BeamformerCtx *ctx, BeamformFrame *frame, u32 raw_data_index,
 			glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
 		}
 	} break;
-	case CS_HERCULES:
-	case CS_UFORCES: {
+	case CS_DAS: {
 		u32 rf_ssbo      = csctx->rf_data_ssbos[input_ssbo_idx];
 		iv3 dispatch_dim = {.x = frame->dim.x, .y = frame->dim.y, .z = frame->dim.z};
 		do_beamform_shader(csctx, &ctx->params->raw, frame, rf_ssbo, dispatch_dim, (iv3){0}, 0);
@@ -483,9 +482,7 @@ do_beamform_work(BeamformerCtx *ctx, Arena *a)
 				u32 stage_count = ctx->params->compute_stages_count;
 				enum compute_shaders *stages = ctx->params->compute_stages;
 				for (u32 i = 0; i < stage_count; i++) {
-					if (stages[i] == CS_UFORCES || stages[i] == CS_HERCULES) {
-						/* TODO: this is not a proper solution if we have
-						 * more beamforming shaders */
+					if (stages[i] == CS_DAS) {
 						ctx->partial_compute_ctx.shader = stages[i];
 						break;
 					}
diff --git a/beamformer.h b/beamformer.h
@@ -125,13 +125,13 @@ typedef struct {
 	c8                   export_pipe_name[1024];
 } BeamformerParametersFull;
 
-#define CS_UNIFORMS                              \
-	X(CS_HERCULES, volume_export_dim_offset) \
-	X(CS_HERCULES, volume_export_pass)       \
-	X(CS_MIN_MAX,  mips_level)               \
-	X(CS_SUM,      sum_prescale)             \
-	X(CS_UFORCES,  xdc_index)                \
-	X(CS_UFORCES,  xdc_transform)
+#define CS_UNIFORMS                             \
+	X(CS_DAS,     volume_export_dim_offset) \
+	X(CS_DAS,     volume_export_pass)       \
+	X(CS_DAS,     xdc_index)                \
+	X(CS_DAS,     xdc_transform)            \
+	X(CS_MIN_MAX, mips_level)               \
+	X(CS_SUM,     sum_prescale)
 
 typedef struct {
 	u32 programs[CS_LAST];
diff --git a/beamformer_parameters.h b/beamformer_parameters.h
@@ -2,15 +2,17 @@
 enum compute_shaders {
 	CS_CUDA_DECODE           = 0,
 	CS_CUDA_HILBERT          = 1,
-	CS_DEMOD                 = 2,
-	CS_HADAMARD              = 3,
-	CS_HERCULES              = 4,
+	CS_DAS                   = 2,
+	CS_DEMOD                 = 3,
+	CS_HADAMARD              = 4,
 	CS_MIN_MAX               = 5,
 	CS_SUM                   = 6,
-	CS_UFORCES               = 7,
 	CS_LAST
 };
 
+#define DAS_ID_UFORCES  0
+#define DAS_ID_HERCULES 1
+
 #define MAX_BEAMFORMED_SAVED_FRAMES 16
 #define MAX_MULTI_XDC_COUNT         4
 /* NOTE: This struct follows the OpenGL std140 layout. DO NOT modify unless you have
@@ -36,6 +38,8 @@ typedef struct {
 	f32 off_axis_pos;           /* [m] Position on screen normal to beamform in 2D HERCULES */
 	i32 beamform_plane;         /* Plane to Beamform in 2D HERCULES */
 	f32 f_number;               /* F# (set to 0 to disable) */
+	u32 das_shader_id;
+	f32 _pad[3];
 } BeamformerParameters;
 
 /* NOTE: garbage to get the prepocessor to properly stringize the value of a macro */
@@ -66,4 +70,9 @@ layout(std140, binding = 0) uniform parameters {\n\
 	float off_axis_pos;           /* [m] Position on screen normal to beamform in 2D HERCULES */\n\
 	int   beamform_plane;         /* Plane to Beamform in 2D HERCULES */\n\
 	float f_number;               /* F# (set to 0 to disable) */\n\
-};\n\n"
+	uint  das_shader_id;\n\
+};\n\
+\n\
+#define DAS_ID_UFORCES  " str(DAS_ID_UFORCES) "\n\
+#define DAS_ID_HERCULES " str(DAS_ID_HERCULES) "\n\
+\n"
diff --git a/helpers/ogl_beamformer_lib.c b/helpers/ogl_beamformer_lib.c
@@ -212,12 +212,11 @@ set_beamformer_pipeline(char *shm_name, i32 *stages, i32 stages_count)
 		switch (stages[i]) {
 		case CS_CUDA_DECODE:
 		case CS_CUDA_HILBERT:
+		case CS_DAS:
 		case CS_DEMOD:
 		case CS_HADAMARD:
-		case CS_HERCULES:
 		case CS_MIN_MAX:
 		case CS_SUM:
-		case CS_UFORCES:
 			g_bp->compute_stages[i] = stages[i];
 			break;
 		default:
diff --git a/shaders/das.glsl b/shaders/das.glsl
@@ -0,0 +1,200 @@
+/* See LICENSE for license details. */
+layout(local_size_x = 32, local_size_y = 1, local_size_z = 32) in;
+
+layout(std430, binding = 1) readonly restrict buffer buffer_1 {
+	vec2 rf_data[];
+};
+
+layout(rg32f, binding = 0) writeonly uniform image3D u_out_data_tex;
+
+layout(location = 2) uniform int   u_volume_export_pass;
+layout(location = 3) uniform ivec3 u_volume_export_dim_offset;
+layout(location = 4) uniform mat4  u_xdc_transform;
+layout(location = 5) uniform int   u_xdc_index;
+
+#define C_SPLINE 0.5
+
+#define TX_ROWS 0
+#define TX_COLS 1
+
+#if 1 /* 1: plain sample lookup; 0: cubic Hermite interpolation (currently disabled) */
+/* NOTE: interpolation is unnecessary if the data has been demodulated and not decimated */
+vec2 cubic(uint ridx, float t)
+{
+	return rf_data[ridx + uint(floor(t))]; /* sample at floor(t) past ridx, no interpolation */
+}
+#else
+/* NOTE: See: https://cubic.org/docs/hermite.htm */
+vec2 cubic(uint ridx, float x)
+{
+	mat4 h = mat4( /* mat4() takes columns: each line below is one column of the basis matrix */
+		 2, -3,  0, 1,
+		-2,  3,  0, 0,
+		 1, -2,  1, 0,
+		 1, -1,  0, 0
+	);
+
+	uint  xk = uint(floor(x));               /* integer sample index */
+	float t  = (x  - float(xk));             /* fractional position between xk and xk + 1 */
+	vec4  S  = vec4(t * t * t, t * t, t, 1); /* powers of t for the cubic polynomial */
+
+	vec2 P1 = rf_data[ridx + xk];                       /* start point of the segment */
+	vec2 P2 = rf_data[ridx + xk + 1];                   /* end point of the segment */
+	vec2 T1 = C_SPLINE * (P2 - rf_data[ridx + xk - 1]); /* tangent at P1 from its two neighbours */
+	vec2 T2 = C_SPLINE * (rf_data[ridx + xk + 2] - P1); /* tangent at P2 from its two neighbours */
+	/* NOTE(review): xk - 1 and xk + 2 are read without bounds handling - confirm before enabling */
+	vec4 C1 = vec4(P1.x, P2.x, T1.x, T2.x); /* control values for the .x component */
+	vec4 C2 = vec4(P1.y, P2.y, T1.y, T2.y); /* control values for the .y component */
+	return vec2(dot(S, h * C1), dot(S, h * C2));
+}
+#endif
+
+/* NOTE: maps an output voxel index to a point in transducer (xdc) space */
+vec3 calc_image_point(vec3 voxel)
+{
+	ivec3 out_data_dim = imageSize(u_out_data_tex);
+	vec4 output_size   = abs(output_max_coord - output_min_coord);
+	vec4 image_point   = vec4(output_min_coord.xyz + voxel * output_size.xyz / out_data_dim, 1);
+
+	switch (das_shader_id) {
+	case DAS_ID_UFORCES:
+		/* TODO: fix the math so that the image plane can be arbitrary */
+		image_point.y = 0;
+		break;
+	case DAS_ID_HERCULES:
+		if (u_volume_export_pass == 0) /* y is only overridden outside of the export pass */
+			image_point.y = off_axis_pos;
+		break;
+	}
+
+	/* NOTE: move the image point into xdc space */
+	image_point = u_xdc_transform * image_point;
+	return image_point.xyz;
+}
+
+/* NOTE: weights value by cos^2(|apodization_arg * distance|), argument clamped to [0, 90 deg] */
+vec2 apodize(vec2 value, float apodization_arg, float distance)
+{
+	float weight = cos(clamp(abs(apodization_arg * distance), 0, 0.25 * radians(360)));
+	return value * weight * weight;
+}
+
+/* NOTE: (distance / speed_of_sound + time_offset) expressed in samples; may be fractional */
+float sample_index(float distance)
+{
+	return (distance / speed_of_sound + time_offset) * sampling_frequency;
+}
+
+vec2 HERCULES(vec3 image_point, vec3 delta, uint starting_offset, float apodization_arg)
+{
+	/* TODO: pass this in (there is a problem in that it depends on the orientation
+	 * of the array relative to the target/subject). */
+	int   transmit_orientation = TX_ROWS;
+	float transmit_dist;
+	if (isinf(focal_depth)) {
+		/* NOTE: plane wave */
+		transmit_dist = image_point.z;
+	} else {
+		/* NOTE: cylindrical diverging wave */
+		if (transmit_orientation == TX_ROWS)
+			transmit_dist = length(vec2(image_point.y, image_point.z - focal_depth));
+		else
+			transmit_dist = length(vec2(image_point.x, image_point.z - focal_depth));
+	}
+
+	uint ridx      = starting_offset; /* running index into rf_data */
+	vec3 rdist     = image_point;     /* vector used for the receive path length */
+	int  direction = beamform_plane * (u_volume_export_pass ^ 1); /* 0 when the export pass is 1 */
+
+	vec2 sum = vec2(0); /* running total of the weighted samples over both loops */
+	/* NOTE: For Each Acquisition in Raw Data */
+	for (uint i = 0; i < dec_data_dim.z; i++) {
+		/* NOTE: For Each Virtual Source */
+		for (uint j = 0; j < dec_data_dim.y; j++) {
+			float sidx = sample_index(transmit_dist + length(rdist));
+			vec2 valid = vec2(sidx < dec_data_dim.x); /* 0 once sidx is not below dec_data_dim.x */
+			/* NOTE: tribal knowledge; this is a problem with the imaging sequence */
+			if (i == 0) valid *= inversesqrt(128);
+
+			sum += apodize(cubic(ridx, sidx), apodization_arg, rdist.x) * valid;
+
+			rdist[direction] -= delta[direction];
+			ridx             += dec_data_dim.x; /* advance by one block of dec_data_dim.x samples */
+		}
+
+		rdist[direction]      = image_point[direction]; /* rewind the inner loop axis */
+		rdist[direction ^ 1] -= delta[direction ^ 1];   /* step the other axis; assumes direction is 0 or 1 */
+	}
+	return sum;
+}
+
+vec2 uFORCES(vec3 image_point, vec3 delta, uint starting_offset, float apodization_arg)
+{
+	/* NOTE: skip first acquisition in uforces since it's garbage */
+	uint uforces = uint(dec_data_dim.y != dec_data_dim.z); /* 1 when the y and z dims differ, else 0 */
+	uint ridx    = starting_offset + dec_data_dim.y * dec_data_dim.x * uforces;
+
+	vec2 sum = vec2(0); /* running total of the weighted samples over both loops */
+	for (uint i = uforces; i < dec_data_dim.z; i++) {
+		uint base_idx = (i - uforces) / 4; /* uforces_channels is read 4 entries per element */
+		uint sub_idx  = (i - uforces) % 4;
+
+		vec3  rdist         = image_point; /* restarts from the image point for every acquisition */
+		vec3  focal_point   = uforces_channels[base_idx][sub_idx] * delta;
+		float transmit_dist = distance(image_point, focal_point);
+
+		for (uint j = 0; j < dec_data_dim.y; j++) {
+			float sidx  = sample_index(transmit_dist + length(rdist));
+			vec2 valid  = vec2(sidx < dec_data_dim.x); /* 0 once sidx is not below dec_data_dim.x */
+			sum        += apodize(cubic(ridx, sidx), apodization_arg, rdist.x) * valid;
+			rdist      -= delta;
+			ridx       += dec_data_dim.x; /* advance by one block of dec_data_dim.x samples */
+		}
+	}
+	return sum;
+}
+
+void main()
+{
+	/* NOTE: add the export offset as hercules.glsl did (presumably 0 outside of export passes) */
+	/* NOTE: Convert voxel to physical coordinates */
+	ivec3 out_coord    = ivec3(gl_GlobalInvocationID) + u_volume_export_dim_offset;
+	vec3  image_point  = calc_image_point(vec3(out_coord));
+
+	/* NOTE: array edge vectors for calculating element step delta */
+	vec3 edge1 = xdc_corner1[u_xdc_index].xyz - xdc_origin[u_xdc_index].xyz;
+	vec3 edge2 = xdc_corner2[u_xdc_index].xyz - xdc_origin[u_xdc_index].xyz;
+
+	/* NOTE: used for constant F# dynamic receive apodization. This is implemented as:
+	 *
+	 *                  /        |x_e - x_i|\
+	 *    a(x, z) = cos(F# * π * ----------- ) ^ 2
+	 *                  \        |z_e - z_i|/
+	 *
+	 * where x,z_e are transducer element positions and x,z_i are image positions. */
+	float apod_arg = f_number * 0.5 * radians(360) / abs(image_point.z);
+
+	/* NOTE: skip over channels corresponding to other arrays */
+	uint starting_offset = u_xdc_index * (dec_data_dim.y / xdc_count) * dec_data_dim.x * dec_data_dim.z;
+
+	/* NOTE: in (u)FORCES we step along line elements */
+	vec3 delta = vec3(0);
+	/* NOTE: zero initialized so that an unknown das_shader_id stores 0 instead of garbage */
+	vec2 sum = vec2(0);
+	switch (das_shader_id) {
+	case DAS_ID_UFORCES:
+		/* TODO: there should be a smarter way of detecting this */
+		if (edge2.x != 0) delta = vec3(edge2.x, 0, 0) / float(dec_data_dim.y);
+		else              delta = vec3(edge1.x, 0, 0) / float(dec_data_dim.y);
+		sum = uFORCES(image_point, delta, starting_offset, apod_arg);
+		break;
+	case DAS_ID_HERCULES:
+		/* TODO: there should be a smarter way of detecting this */
+		if (edge2.x != 0) delta = vec3(edge2.x, edge1.y, 0) / float(dec_data_dim.y);
+		else              delta = vec3(edge1.x, edge2.y, 0) / float(dec_data_dim.y);
+		sum = HERCULES(image_point, delta, starting_offset, apod_arg);
+		break;
+	}
+
+	imageStore(u_out_data_tex, out_coord, vec4(sum.x, sum.y, 0, 0));
+}
diff --git a/shaders/hercules.glsl b/shaders/hercules.glsl
@@ -1,137 +0,0 @@
-/* See LICENSE for license details. */
-layout(local_size_x = 32, local_size_y = 1, local_size_z = 32) in;
-
-layout(std430, binding = 1) readonly restrict buffer buffer_1 {
-	vec2 rf_data[];
-};
-
-layout(rg32f, binding = 0) writeonly uniform image3D u_out_data_tex;
-
-layout(location = 2) uniform int   u_volume_export_pass;
-layout(location = 3) uniform ivec3 u_volume_export_dim_offset;
-layout(location = 4) uniform mat4  u_xdc_transform;
-layout(location = 5) uniform int   u_xdc_index;
-
-#define C_SPLINE 0.5
-
-#define TX_ROWS 0
-#define TX_COLS 1
-
-#if 1
-/* NOTE: interpolation is unnecessary if the data has been demodulated and not decimated */
-vec2 cubic(uint ridx, float t)
-{
-	return rf_data[ridx + uint(floor(t))];
-}
-#else
-/* NOTE: See: https://cubic.org/docs/hermite.htm */
-vec2 cubic(uint ridx, float x)
-{
-	mat4 h = mat4(
-		 2, -3,  0, 1,
-		-2,  3,  0, 0,
-		 1, -2,  1, 0,
-		 1, -1,  0, 0
-	);
-
-	uint  xk = uint(floor(x));
-	float t  = (x  - float(xk));
-	vec4  S  = vec4(t * t * t, t * t, t, 1);
-
-	vec2 P1 = rf_data[ridx + xk];
-	vec2 P2 = rf_data[ridx + xk + 1];
-	vec2 T1 = C_SPLINE * (P2 - rf_data[ridx + xk - 1]);
-	vec2 T2 = C_SPLINE * (rf_data[ridx + xk + 2] - P1);
-
-	vec4 C1 = vec4(P1.x, P2.x, T1.x, T2.x);
-	vec4 C2 = vec4(P1.y, P2.y, T1.y, T2.y);
-	return vec2(dot(S, h * C1), dot(S, h * C2));
-}
-#endif
-
-vec3 calc_image_point(vec3 voxel)
-{
-	ivec3 out_data_dim = imageSize(u_out_data_tex);
-	vec4 output_size   = abs(output_max_coord - output_min_coord);
-	vec4 image_point   = vec4(output_min_coord.xyz + voxel * output_size.xyz / out_data_dim, 1);
-
-	if (u_volume_export_pass == 0)
-		image_point.y = off_axis_pos;
-
-	/* NOTE: move the image point into xdc space */
-	image_point = u_xdc_transform * image_point;
-
-	return image_point.xyz;
-}
-
-void main()
-{
-	vec3  voxel      = vec3(gl_GlobalInvocationID.xyz)  + vec3(u_volume_export_dim_offset);
-	ivec3 out_coord  = ivec3(gl_GlobalInvocationID.xyz) + u_volume_export_dim_offset;
-
-	/* NOTE: Convert voxel to physical coordinates */
-	vec3 edge1       = xdc_corner1[u_xdc_index].xyz - xdc_origin[u_xdc_index].xyz;
-	vec3 edge2       = xdc_corner2[u_xdc_index].xyz - xdc_origin[u_xdc_index].xyz;
-	vec3 image_point = calc_image_point(voxel);
-	vec3 delta;
-	/* TODO: there should be a smarter way of detecting this */
-	if (edge2.x != 0) delta = vec3(edge2.x, edge1.y, 0) / float(dec_data_dim.y);
-	else              delta = vec3(edge1.x, edge2.y, 0) / float(dec_data_dim.y);
-
-	/* NOTE: used for constant F# dynamic receive apodization. This is implemented as:
-	 *
-	 *                  /        |x_e - x_i|\
-	 *    a(x, z) = cos(F# * π * ----------- ) ^ 2
-	 *                  \        |z_e - z_i|/
-	 *
-	 * where x,z_e are transducer element positions and x,z_i are image positions. */
-	float apod_arg = f_number * 0.5 * radians(360) / abs(image_point.z);
-
-	vec2 sum   = vec2(0);
-	vec3 rdist = image_point;
-
-	/* TODO: pass this in (there is a problem in that it depends on the orientation
-	 * of the array relative to the target/subject). */
-	int   transmit_orientation = TX_ROWS;
-	float transmit_dist;
-	if (isinf(focal_depth)) {
-		/* NOTE: plane wave */
-		transmit_dist = image_point.z;
-	} else {
-		/* NOTE: cylindrical diverging wave */
-		if (transmit_orientation == TX_ROWS)
-			transmit_dist = length(vec2(image_point.y, image_point.z - focal_depth));
-		else
-			transmit_dist = length(vec2(image_point.x, image_point.z - focal_depth));
-	}
-
-	/* NOTE: skip over channels corresponding to other arrays */
-	uint ridx      = u_xdc_index * (dec_data_dim.y / xdc_count) * dec_data_dim.x * dec_data_dim.z;
-	int  direction = beamform_plane * (u_volume_export_pass ^ 1);
-	/* NOTE: For Each Acquistion in Raw Data */
-	for (uint i = 0; i < dec_data_dim.z; i++) {
-		/* NOTE: For Each Virtual Source */
-		for (uint j = 0; j < dec_data_dim.y; j++) {
-			float dist = transmit_dist + length(rdist);
-			float time = dist / speed_of_sound + time_offset;
-
-			/* NOTE: apodization value for this transducer element */
-			float a  = cos(clamp(abs(apod_arg * rdist.x), 0, 0.25 * radians(360)));
-			a        = a * a;
-
-			float sidx = time * sampling_frequency;
-			vec2 valid = vec2(sidx < dec_data_dim.x);
-			vec2 p     = cubic(ridx, sidx);
-			/* NOTE: tribal knowledge; this is a problem with the imaging sequence */
-			if (i == 0) p *= inversesqrt(128);
-			sum += p * a;
-
-			rdist[direction] -= delta[direction];
-			ridx             += dec_data_dim.x;
-		}
-
-		rdist[direction]      = image_point[direction];
-		rdist[direction ^ 1] -= delta[direction ^ 1];
-	}
-	imageStore(u_out_data_tex, out_coord, vec4(sum.x, sum.y, 0, 0));
-}
diff --git a/shaders/uforces.glsl b/shaders/uforces.glsl
@@ -1,116 +0,0 @@
-/* See LICENSE for license details. */
-layout(local_size_x = 32, local_size_y = 1, local_size_z = 32) in;
-
-layout(std430, binding = 1) readonly restrict buffer buffer_1 {
-	vec2 rf_data[];
-};
-
-layout(rg32f, binding = 0) writeonly uniform image3D u_out_data_tex;
-
-layout(location = 2) uniform int   u_volume_export_pass;
-layout(location = 3) uniform ivec3 u_volume_export_dim_offset;
-layout(location = 4) uniform mat4  u_xdc_transform;
-layout(location = 5) uniform int   u_xdc_index;
-
-#define C_SPLINE 0.5
-
-#if 1
-/* NOTE: interpolation is unnecessary if the data has been demodulated and not decimated */
-vec2 cubic(uint ridx, float t)
-{
-	return rf_data[ridx + uint(floor(t))];
-}
-#else
-/* NOTE: See: https://cubic.org/docs/hermite.htm */
-vec2 cubic(uint ridx, float x)
-{
-	mat4 h = mat4(
-		 2, -3,  0, 1,
-		-2,  3,  0, 0,
-		 1, -2,  1, 0,
-		 1, -1,  0, 0
-	);
-
-	uint  xk = uint(floor(x));
-	float t  = (x  - float(xk));
-	vec4  S  = vec4(t * t * t, t * t, t, 1);
-
-	vec2 P1 = rf_data[ridx + xk];
-	vec2 P2 = rf_data[ridx + xk + 1];
-	vec2 T1 = C_SPLINE * (P2 - rf_data[ridx + xk - 1]);
-	vec2 T2 = C_SPLINE * (rf_data[ridx + xk + 2] - P1);
-
-	vec4 C1 = vec4(P1.x, P2.x, T1.x, T2.x);
-	vec4 C2 = vec4(P1.y, P2.y, T1.y, T2.y);
-	return vec2(dot(S, h * C1), dot(S, h * C2));
-}
-#endif
-
-vec3 calc_image_point(vec3 voxel)
-{
-	ivec3 out_data_dim = imageSize(u_out_data_tex);
-	vec4 output_size   = abs(output_max_coord - output_min_coord);
-	vec4 image_point   = vec4(output_min_coord.xyz + voxel * output_size.xyz / out_data_dim, 1);
-
-	/* TODO: fix the math so that the image plane can be aritrary */
-	image_point.y = 0;
-
-	/* NOTE: move the image point into xdc space */
-	image_point = u_xdc_transform * image_point;
-	return image_point.xyz;
-}
-
-void main()
-{
-	vec3  voxel        = vec3(gl_GlobalInvocationID);
-	ivec3 out_coord    = ivec3(gl_GlobalInvocationID);
-
-	/* NOTE: Convert voxel to physical coordinates */
-	vec3 edge1         = xdc_corner1[u_xdc_index].xyz - xdc_origin[u_xdc_index].xyz;
-	vec3 edge2         = xdc_corner2[u_xdc_index].xyz - xdc_origin[u_xdc_index].xyz;
-	vec3 image_point   = calc_image_point(voxel);
-	vec3 delta;
-	/* TODO: there should be a smarter way of detecting this */
-	if (edge2.x != 0) delta = vec3(edge2.x, 0, 0) / float(dec_data_dim.y);
-	else              delta = vec3(edge1.x, 0, 0) / float(dec_data_dim.y);
-
-	/* NOTE: used for constant F# dynamic receive apodization. This is implemented as:
-	 *
-	 *                  /        |x_e - x_i|\
-	 *    a(x, z) = cos(F# * π * ----------- ) ^ 2
-	 *                  \        |z_e - z_i|/
-	 *
-	 * where x,z_e are transducer element positions and x,z_i are image positions. */
-	float apod_arg = f_number * 0.5 * radians(360) / abs(image_point.z);
-
-	vec2 sum   = vec2(0);
-	/* NOTE: skip over channels corresponding to other arrays */
-	uint ridx  = u_xdc_index * (dec_data_dim.y / xdc_count) * dec_data_dim.x * dec_data_dim.z;
-	/* NOTE: skip first acquisition in uforces since its garbage */
-	uint uforces = uint(dec_data_dim.y != dec_data_dim.z);
-	ridx += dec_data_dim.y * dec_data_dim.x * uforces;
-	for (uint i = uforces; i < dec_data_dim.z; i++) {
-		uint base_idx = (i - uforces) / 4;
-		uint sub_idx  = (i - uforces) % 4;
-
-		vec3  focal_point   = uforces_channels[base_idx][sub_idx] * delta;
-		float transmit_dist = distance(image_point, focal_point);
-		vec3 rdist = image_point;
-		for (uint j = 0; j < dec_data_dim.y; j++) {
-			float dist = transmit_dist + length(rdist);
-			float time = dist / speed_of_sound + time_offset;
-
-			/* NOTE: apodization value for this transducer element */
-			float a  = cos(clamp(abs(apod_arg * rdist.x), 0, 0.25 * radians(360)));
-			a        = a * a;
-
-			float sidx  = time * sampling_frequency;
-			vec2 valid  = vec2(sidx < dec_data_dim.x);
-			vec2 p      = cubic(ridx, sidx) * valid;
-			sum        += p * a;
-			rdist      -= delta;
-			ridx       += dec_data_dim.x;
-		}
-	}
-	imageStore(u_out_data_tex, out_coord, vec4(sum.x, sum.y, 0, 0));
-}
diff --git a/static.c b/static.c
@@ -5,11 +5,10 @@ static struct {
 	b32 needs_header;
 } compute_shaders[CS_LAST] = {
 	[CS_HADAMARD] = {s8("Hadamard"), s8("shaders/hadamard.glsl"), 1},
-	[CS_HERCULES] = {s8("HERCULES"), s8("shaders/hercules.glsl"), 1},
+	[CS_DAS]      = {s8("DAS"),      s8("shaders/das.glsl"),      1},
 	[CS_DEMOD]    = {s8("Demod"),    s8("shaders/demod.glsl"),    1},
 	[CS_MIN_MAX]  = {s8("Min/Max"),  s8("shaders/min_max.glsl"),  0},
 	[CS_SUM]      = {s8("Sum"),      s8("shaders/sum.glsl"),      0},
-	[CS_UFORCES]  = {s8("UFORCES"),  s8("shaders/uforces.glsl"),  1},
 };
 
 #ifndef _DEBUG
@@ -275,10 +274,9 @@ setup_beamformer(BeamformerCtx *ctx, Arena temp_memory)
 
 	/* NOTE: default compute shader pipeline */
 	ctx->params->compute_stages[0]    = CS_HADAMARD;
-	ctx->params->compute_stages[1]    = CS_DEMOD;
-	ctx->params->compute_stages[2]    = CS_UFORCES;
-	ctx->params->compute_stages[3]    = CS_MIN_MAX;
-	ctx->params->compute_stages_count = 4;
+	ctx->params->compute_stages[1]    = CS_DAS;
+	ctx->params->compute_stages[2]    = CS_MIN_MAX;
+	ctx->params->compute_stages_count = 3;
 
 	/* NOTE: make sure function pointers are valid even if we are not using the cuda lib */
 	validate_cuda_lib(&ctx->cuda_lib);
diff --git a/ui.c b/ui.c
@@ -451,12 +451,11 @@ draw_debug_overlay(BeamformerCtx *ctx, Arena arena, Rect r)
 	static s8 labels[CS_LAST] = {
 		[CS_CUDA_DECODE]  = s8("CUDA Decoding:"),
 		[CS_CUDA_HILBERT] = s8("CUDA Hilbert:"),
+		[CS_DAS]          = s8("DAS:"),
 		[CS_DEMOD]        = s8("Demodulation:"),
 		[CS_HADAMARD]     = s8("Decoding:"),
-		[CS_HERCULES]     = s8("HERCULES:"),
 		[CS_MIN_MAX]      = s8("Min/Max:"),
 		[CS_SUM]          = s8("Sum:"),
-		[CS_UFORCES]      = s8("UFORCES:"),
 	};
 
 	BeamformerUI *ui     = ctx->ui;

M	beamformer.c	\|	7	++-----
M	beamformer.h	\|	14	+++++++-------
M	beamformer_parameters.h	\|	19	++++++++++++++-----
M	helpers/ogl_beamformer_lib.c	\|	3	+--
A	shaders/das.glsl	\|	200	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D	shaders/hercules.glsl	\|	137	-------------------------------------------------------------------------------
D	shaders/uforces.glsl	\|	116	-------------------------------------------------------------------------------
M	static.c	\|	10	++++------
M	ui.c	\|	3	+--