tiled 3D volume generation - ogl_beamforming - Ultrasound Beamforming Implemented with OpenGL

Commit: 1d6065e3eaa161f2d65820ff48e89be3ca0498e4
Parent: 1340abce1b7a40f1f9bcb6aa59552631c5bcecb0
Author: Randy Palamar
Date:   Sat,  7 Sep 2024 17:56:44 -0600

tiled 3D volume generation

In order to not get killed by the GL driver we need to split this
computation up across frames. This also means we need to make a
copy of the starting rf data and keep it around until the
computation is complete.

Diffstat:
M beamformer.c  | 61 +++++++++++++++++++++++++++++++++++++++++++++----------------
M beamformer.h  | 3 +++
M main.c  | 12 +++++++-----
D shaders/2d_hercules.glsl  | 143 -------------------------------------------------------------------------------
A shaders/hercules.glsl  | 145 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

5 files changed, 200 insertions(+), 164 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -2,6 +2,15 @@
 #include "beamformer.h"
 #include "ui.c"
 
+
+static size
+decoded_data_size(ComputeShaderCtx *cs)
+{
+	uv4  dim    = cs->dec_data_dim;
+	size result = 2 * sizeof(f32) * dim.x * dim.y * dim.z;
+	return result;
+}
+
 static void
 alloc_output_image(BeamformerCtx *ctx)
 {
@@ -40,10 +49,10 @@ alloc_shader_storage(BeamformerCtx *ctx, Arena a)
 	BeamformerParameters *bp = &ctx->params->raw;
 	uv4 dec_data_dim         = bp->dec_data_dim;
 	uv2 rf_raw_dim           = bp->rf_raw_dim;
-	size rf_raw_size         = rf_raw_dim.x * rf_raw_dim.y * sizeof(i16);
-	size rf_decoded_size     = dec_data_dim.x * dec_data_dim.y * dec_data_dim.z * sizeof(f32) * 2;
-	ctx->csctx.rf_raw_dim    = rf_raw_dim;
 	ctx->csctx.dec_data_dim  = dec_data_dim;
+	ctx->csctx.rf_raw_dim    = rf_raw_dim;
+	size rf_raw_size         = rf_raw_dim.x * rf_raw_dim.y * sizeof(i16);
+	size rf_decoded_size     = decoded_data_size(cs);
 
 	glDeleteBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
 	glCreateBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
@@ -165,21 +174,37 @@ do_compute_shader(BeamformerCtx *ctx, enum compute_shaders shader)
 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
 
 		/* NOTE: Do a volume computation before doing the normal display path */
-		if (ctx->export_ctx.state & ES_START) {
-			/* TODO: for large data this must be split over multiple compute calls
-			 * otherwise the GL driver will kill you */
+		if (ctx->export_ctx.state & (ES_START|ES_COMPUTING)) {
 			ExportCtx *e = &ctx->export_ctx;
+			/* NOTE: on the first frame of compute make a copy of the rf data */
+			if (e->state & ES_START) {
+				size rf_size = decoded_data_size(csctx);
+				e->state &= ~ES_START;
+				glCopyNamedBufferSubData(csctx->rf_data_ssbos[input_ssbo_idx],
+				                         e->rf_data_ssbo, 0, 0, rf_size);
+			}
+
 			glActiveTexture(GL_TEXTURE0);
 			glBindTexture(GL_TEXTURE_3D, e->volume_texture);
 			glBindImageTexture(0, e->volume_texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_R32F);
 			glUniform1i(e->volume_texture_id, 0);
 			glUniform1i(csctx->volume_export_pass_id, 1);
-			glDispatchCompute(ORONE(e->volume_dim.x / 32),
-			                  e->volume_dim.y,
-			                  ORONE(e->volume_dim.z / 32));
-			ctx->export_ctx.state = ES_DONE;
+
+			e->state |= ES_COMPUTING;
+
+			/* NOTE: We must tile this otherwise GL will kill us for taking too long */
+			/* TODO: this could be based on multiple dimensions */
+			u32 dispatch_count = e->volume_dim.z / 32;
+			uv4 dim_offset = {.z = !!dispatch_count * 32 * e->dispatch_index++};
+			glUniform3iv(csctx->volume_export_dim_offset_id, 1, (i32 *)dim_offset.E);
+			glDispatchCompute(ORONE(e->volume_dim.x / 32), e->volume_dim.y, 1);
+			if (e->dispatch_index >= dispatch_count) {
+				e->dispatch_index = 0;
+				e->state          = ES_DONE;
+			}
 		}
 
+		glUniform3iv(csctx->volume_export_dim_offset_id, 1, (i32 []){0, 0, 0});
 		glUniform1i(csctx->volume_export_pass_id, 0);
 		glActiveTexture(GL_TEXTURE0 + ctx->out_texture_unit);
 		glBindTexture(GL_TEXTURE_3D, ctx->out_texture);
@@ -272,18 +297,22 @@ do_beamformer(BeamformerCtx *ctx, Arena arena)
 		else                     ctx->partial_transfer_count++;
 	}
 
-	if (ctx->flags & DO_COMPUTE) {
+	if (ctx->flags & DO_COMPUTE || ctx->export_ctx.state & ES_COMPUTING) {
 		if (ctx->params->upload) {
 			glNamedBufferSubData(ctx->csctx.shared_ubo, 0, sizeof(*bp), bp);
 			ctx->params->upload = 0;
 		}
 
 		if (ctx->export_ctx.state & ES_START) {
-			uv4 edim = ctx->export_ctx.volume_dim;
-			glDeleteTextures(1, &ctx->export_ctx.volume_texture);
-			glCreateTextures(GL_TEXTURE_3D, 1, &ctx->export_ctx.volume_texture);
-			glTextureStorage3D(ctx->export_ctx.volume_texture, 1, GL_R32F,
-			                   edim.x, edim.y, edim.z);
+			ExportCtx *e = &ctx->export_ctx;
+			uv4 edim = e->volume_dim;
+			glDeleteTextures(1, &e->volume_texture);
+			glCreateTextures(GL_TEXTURE_3D, 1, &e->volume_texture);
+			glTextureStorage3D(e->volume_texture, 1, GL_R32F, edim.x, edim.y, edim.z);
+
+			glDeleteBuffers(1, &e->rf_data_ssbo);
+			glCreateBuffers(1, &e->rf_data_ssbo);
+			glNamedBufferStorage(e->rf_data_ssbo, decoded_data_size(&ctx->csctx), 0, 0);
 		}
 
 		u32 stages = ctx->params->compute_stages_count;
diff --git a/beamformer.h b/beamformer.h
@@ -180,6 +180,7 @@ typedef struct {
 	i32 mip_view_tex_id;
 	i32 mips_level_id;
 	i32 volume_export_pass_id;
+	i32 volume_export_dim_offset_id;
 } ComputeShaderCtx;
 
 typedef struct {
@@ -201,8 +202,10 @@ typedef struct {
 	uv4   volume_dim;
 	u32   volume_texture;
 	i32   volume_texture_id;
+	u32   rf_data_ssbo;
 	u32   output_ssbo;
 	u32   state;
+	u32   dispatch_index;
 } ExportCtx;
 
 typedef struct {
diff --git a/main.c b/main.c
@@ -3,7 +3,7 @@
 
 static char *compute_shader_paths[CS_LAST] = {
 	[CS_HADAMARD] = "shaders/hadamard.glsl",
-	[CS_HERCULES] = "shaders/2d_hercules.glsl",
+	[CS_HERCULES] = "shaders/hercules.glsl",
 	[CS_DEMOD]    = "shaders/demod.glsl",
 	[CS_MIN_MAX]  = "shaders/min_max.glsl",
 	[CS_UFORCES]  = "shaders/uforces.glsl",
@@ -117,10 +117,12 @@ reload_shaders(BeamformerCtx *ctx, Arena a)
 		glDeleteShader(shader_id);
 	}
 
-	ctx->export_ctx.volume_texture_id = glGetUniformLocation(csctx->programs[CS_HERCULES],
-	                                                         "u_out_volume_tex");
-	csctx->volume_export_pass_id      = glGetUniformLocation(csctx->programs[CS_HERCULES],
-	                                                         "u_volume_export_pass");
+	ctx->export_ctx.volume_texture_id  = glGetUniformLocation(csctx->programs[CS_HERCULES],
+	                                                          "u_out_volume_tex");
+	csctx->volume_export_pass_id       = glGetUniformLocation(csctx->programs[CS_HERCULES],
+	                                                          "u_volume_export_pass");
+	csctx->volume_export_dim_offset_id = glGetUniformLocation(csctx->programs[CS_HERCULES],
+	                                                         "u_volume_export_dim_offset");
 
 	csctx->out_data_tex_id = glGetUniformLocation(csctx->programs[CS_UFORCES], "u_out_data_tex");
 	csctx->mip_view_tex_id = glGetUniformLocation(csctx->programs[CS_MIN_MAX], "u_mip_view_tex");
diff --git a/shaders/2d_hercules.glsl b/shaders/2d_hercules.glsl
@@ -1,143 +0,0 @@
-/* See LICENSE for license details. */
-#version 460 core
-layout(local_size_x = 32, local_size_y = 1, local_size_z = 32) in;
-
-layout(std430, binding = 1) readonly restrict buffer buffer_1 {
-	vec2 rf_data[];
-};
-
-layout(std140, binding = 0) uniform parameters {
-	uvec4 channel_mapping[64];    /* Transducer Channel to Verasonics Channel */
-	uvec4 uforces_channels[32];   /* Channels used for virtual UFORCES elements */
-	vec4  lpf_coefficients[16];   /* Low Pass Filter Cofficients */
-	uvec4 dec_data_dim;           /* Samples * Channels * Acquisitions; last element ignored */
-	uvec4 output_points;          /* Width * Height * Depth; last element ignored */
-	vec4  output_min_coord;       /* [m] Top left corner of output region */
-	vec4  output_max_coord;       /* [m] Bottom right corner of output region */
-	uvec2 rf_raw_dim;             /* Raw Data Dimensions */
-	vec2  xdc_min_xy;             /* [m] Min center of transducer elements */
-	vec2  xdc_max_xy;             /* [m] Max center of transducer elements */
-	uint  channel_offset;         /* Offset into channel_mapping: 0 or 128 (rows or columns) */
-	uint  lpf_order;              /* Order of Low Pass Filter */
-	float speed_of_sound;         /* [m/s] */
-	float sampling_frequency;     /* [Hz]  */
-	float center_frequency;       /* [Hz]  */
-	float focal_depth;            /* [m]   */
-	float time_offset;            /* pulse length correction time [s]   */
-	uint  uforces;                /* mode is UFORCES (1) or FORCES (0) */
-};
-
-layout(rg32f, location = 1) uniform writeonly image3D u_out_data_tex;
-layout(r32f,  location = 2) uniform writeonly image3D u_out_volume_tex;
-
-layout(location = 3) uniform int u_volume_export_pass;
-
-#define C_SPLINE 0.5
-
-#if 0
-/* NOTE: interpolation is unnecessary if the data has been demodulated and not decimated */
-vec2 cubic(uint ridx, float t)
-{
-	return rf_data[ridx + uint(floor(t))];
-}
-#else
-/* NOTE: See: https://cubic.org/docs/hermite.htm */
-vec2 cubic(uint ridx, float x)
-{
-	mat4 h = mat4(
-		 2, -3,  0, 1,
-		-2,  3,  0, 0,
-		 1, -2,  1, 0,
-		 1, -1,  0, 0
-	);
-
-	uint  xk = uint(floor(x));
-	float t  = (x  - float(xk));
-	vec4  S  = vec4(t * t * t, t * t, t, 1);
-
-	vec2 P1 = rf_data[ridx + xk];
-	vec2 P2 = rf_data[ridx + xk + 1];
-	vec2 T1 = C_SPLINE * (P2 - rf_data[ridx + xk - 1]);
-	vec2 T2 = C_SPLINE * (rf_data[ridx + xk + 2] - P1);
-
-	vec4 C1 = vec4(P1.x, P2.x, T1.x, T2.x);
-	vec4 C2 = vec4(P1.y, P2.y, T1.y, T2.y);
-	return vec2(dot(S, h * C1), dot(S, h * C2));
-}
-#endif
-
-void main()
-{
-	vec3  voxel        = vec3(gl_GlobalInvocationID.xyz);
-	ivec3 out_coord    = ivec3(gl_GlobalInvocationID.xyz);
-	ivec3 out_data_dim;
-	if (u_volume_export_pass == 0) out_data_dim = imageSize(u_out_data_tex);
-	else                           out_data_dim = imageSize(u_out_volume_tex);
-
-	/* NOTE: Convert pixel to physical coordinates */
-	vec2 xdc_size      = abs(xdc_max_xy - xdc_min_xy);
-	vec4 output_size   = abs(output_max_coord - output_min_coord);
-	vec3 image_point   = vec3(
-		output_min_coord.x + voxel.x * output_size.x / out_data_dim.x,
-		output_min_coord.y + voxel.y * output_size.y / out_data_dim.y,
-		output_min_coord.z + voxel.z * output_size.z / out_data_dim.z
-	);
-
-	/* NOTE: used for constant F# dynamic receive apodization. This is implemented as:
-	 *
-	 *                  /        |x_e - x_i|\
-	 *    a(x, z) = cos(F# * π * ----------- ) ^ 2
-	 *                  \        |z_e - z_i|/
-	 *
-	 * where x,z_e are transducer element positions and x,z_i are image positions. */
-	float f_num    = output_size.z / output_size.x;
-	float apod_arg = f_num * 0.5 * radians(360) / abs(image_point.z);
-
-	/* NOTE: for I-Q data phase correction */
-	float iq_time_scale = (lpf_order > 0)? radians(360) * center_frequency : 0;
-
-	vec3  starting_dist = vec3(image_point.x - xdc_min_xy.x, image_point.y - xdc_min_xy.y, image_point.z);
-	vec3  delta         = vec3(xdc_size.x / float(dec_data_dim.y), xdc_size.y / float(dec_data_dim.y), 0);
-	float dzsign        = sign(image_point.z - focal_depth);
-
-	/* NOTE: offset correcting for both pulse length and low pass filtering */
-	float time_correction = time_offset + (lpf_order + 1) / sampling_frequency;
-
-	vec2 sum   = vec2(0);
-	vec3 rdist = starting_dist;
-
-	int  direction = 1 * (u_volume_export_pass ^ 1);
-	uint ridx      = 0;
-	/* NOTE: For Each Acquistion in Raw Data */
-	for (uint i = 0; i < dec_data_dim.z; i++) {
-		uint base_idx = (i - uforces) / 4;
-		uint sub_idx  = (i - uforces) % 4;
-
-		float transmit_dist = image_point.z;
-
-		/* NOTE: For Each Virtual Source */
-		for (uint j = 0; j < dec_data_dim.y; j++) {
-			float dist = transmit_dist + length(rdist);
-			float time = dist / speed_of_sound + time_correction;
-
-			/* NOTE: apodization value for this transducer element */
-			float a  = cos(clamp(abs(apod_arg * rdist.x), 0, 0.25 * radians(360)));
-			a        = a * a;
-
-			vec2 p   = cubic(ridx, time * sampling_frequency);
-			/* NOTE: tribal knowledge; this is a problem with the imaging sequence */
-			if (i == 0) p *= inversesqrt(128);
-			//p       *= vec2(cos(iq_time_scale * time), sin(iq_time_scale * time));
-			sum     += p;
-
-			rdist[direction] -= delta[direction];
-			ridx             += dec_data_dim.x;
-		}
-
-		rdist[direction]      = starting_dist[direction];
-		rdist[direction ^ 1] -= delta[direction ^ 1];
-	}
-	float val = length(sum);
-	if (u_volume_export_pass == 0) imageStore(u_out_data_tex,   out_coord, vec4(val));
-	else                           imageStore(u_out_volume_tex, out_coord, vec4(val));
-}
diff --git a/shaders/hercules.glsl b/shaders/hercules.glsl
@@ -0,0 +1,145 @@
+/* See LICENSE for license details. */
+#version 460 core
+layout(local_size_x = 32, local_size_y = 1, local_size_z = 32) in;
+
+layout(std430, binding = 1) readonly restrict buffer buffer_1 {
+	vec2 rf_data[];
+};
+
+layout(std140, binding = 0) uniform parameters {
+	uvec4 channel_mapping[64];    /* Transducer Channel to Verasonics Channel */
+	uvec4 uforces_channels[32];   /* Channels used for virtual UFORCES elements */
+	vec4  lpf_coefficients[16];   /* Low Pass Filter Cofficients */
+	uvec4 dec_data_dim;           /* Samples * Channels * Acquisitions; last element ignored */
+	uvec4 output_points;          /* Width * Height * Depth; last element ignored */
+	vec4  output_min_coord;       /* [m] Top left corner of output region */
+	vec4  output_max_coord;       /* [m] Bottom right corner of output region */
+	uvec2 rf_raw_dim;             /* Raw Data Dimensions */
+	vec2  xdc_min_xy;             /* [m] Min center of transducer elements */
+	vec2  xdc_max_xy;             /* [m] Max center of transducer elements */
+	uint  channel_offset;         /* Offset into channel_mapping: 0 or 128 (rows or columns) */
+	uint  lpf_order;              /* Order of Low Pass Filter */
+	float speed_of_sound;         /* [m/s] */
+	float sampling_frequency;     /* [Hz]  */
+	float center_frequency;       /* [Hz]  */
+	float focal_depth;            /* [m]   */
+	float time_offset;            /* pulse length correction time [s]   */
+	uint  uforces;                /* mode is UFORCES (1) or FORCES (0) */
+};
+
+layout(rg32f, location = 1) uniform writeonly image3D u_out_data_tex;
+layout(r32f,  location = 2) uniform writeonly image3D u_out_volume_tex;
+
+layout(location = 3) uniform int   u_volume_export_pass;
+layout(location = 4) uniform ivec3 u_volume_export_dim_offset;
+
+#define C_SPLINE 0.5
+
+#if 0
+/* NOTE: interpolation is unnecessary if the data has been demodulated and not decimated */
+vec2 cubic(uint ridx, float t)
+{
+	return rf_data[ridx + uint(floor(t))];
+}
+#else
+/* NOTE: See: https://cubic.org/docs/hermite.htm */
+vec2 cubic(uint ridx, float x)
+{
+	mat4 h = mat4(
+		 2, -3,  0, 1,
+		-2,  3,  0, 0,
+		 1, -2,  1, 0,
+		 1, -1,  0, 0
+	);
+
+	uint  xk = uint(floor(x));
+	float t  = (x  - float(xk));
+	vec4  S  = vec4(t * t * t, t * t, t, 1);
+
+	vec2 P1 = rf_data[ridx + xk];
+	vec2 P2 = rf_data[ridx + xk + 1];
+	vec2 T1 = C_SPLINE * (P2 - rf_data[ridx + xk - 1]);
+	vec2 T2 = C_SPLINE * (rf_data[ridx + xk + 2] - P1);
+
+	vec4 C1 = vec4(P1.x, P2.x, T1.x, T2.x);
+	vec4 C2 = vec4(P1.y, P2.y, T1.y, T2.y);
+	return vec2(dot(S, h * C1), dot(S, h * C2));
+}
+#endif
+
+void main()
+{
+	vec3  voxel        = vec3(gl_GlobalInvocationID.xyz)  + vec3(u_volume_export_dim_offset);
+	ivec3 out_coord    = ivec3(gl_GlobalInvocationID.xyz) + u_volume_export_dim_offset;
+	ivec3 out_data_dim;
+
+	if (u_volume_export_pass == 0) out_data_dim = imageSize(u_out_data_tex);
+	else                           out_data_dim = imageSize(u_out_volume_tex);
+
+	/* NOTE: Convert pixel to physical coordinates */
+	vec2 xdc_size      = abs(xdc_max_xy - xdc_min_xy);
+	vec4 output_size   = abs(output_max_coord - output_min_coord);
+	vec3 image_point   = vec3(
+		output_min_coord.x + voxel.x * output_size.x / out_data_dim.x,
+		output_min_coord.y + voxel.y * output_size.y / out_data_dim.y,
+		output_min_coord.z + voxel.z * output_size.z / out_data_dim.z
+	);
+
+	/* NOTE: used for constant F# dynamic receive apodization. This is implemented as:
+	 *
+	 *                  /        |x_e - x_i|\
+	 *    a(x, z) = cos(F# * π * ----------- ) ^ 2
+	 *                  \        |z_e - z_i|/
+	 *
+	 * where x,z_e are transducer element positions and x,z_i are image positions. */
+	float f_num    = output_size.z / output_size.x;
+	float apod_arg = f_num * 0.5 * radians(360) / abs(image_point.z);
+
+	/* NOTE: for I-Q data phase correction */
+	float iq_time_scale = (lpf_order > 0)? radians(360) * center_frequency : 0;
+
+	vec3  starting_dist = vec3(image_point.x - xdc_min_xy.x, image_point.y - xdc_min_xy.y, image_point.z);
+	vec3  delta         = vec3(xdc_size.x / float(dec_data_dim.y), xdc_size.y / float(dec_data_dim.y), 0);
+	float dzsign        = sign(image_point.z - focal_depth);
+
+	/* NOTE: offset correcting for both pulse length and low pass filtering */
+	float time_correction = time_offset + (lpf_order + 1) / sampling_frequency;
+
+	vec2 sum   = vec2(0);
+	vec3 rdist = starting_dist;
+
+	int  direction = 1 * (u_volume_export_pass ^ 1);
+	uint ridx      = 0;
+	/* NOTE: For Each Acquistion in Raw Data */
+	for (uint i = 0; i < dec_data_dim.z; i++) {
+		uint base_idx = (i - uforces) / 4;
+		uint sub_idx  = (i - uforces) % 4;
+
+		float transmit_dist = image_point.z;
+
+		/* NOTE: For Each Virtual Source */
+		for (uint j = 0; j < dec_data_dim.y; j++) {
+			float dist = transmit_dist + length(rdist);
+			float time = dist / speed_of_sound + time_correction;
+
+			/* NOTE: apodization value for this transducer element */
+			float a  = cos(clamp(abs(apod_arg * rdist.x), 0, 0.25 * radians(360)));
+			a        = a * a;
+
+			vec2 p   = cubic(ridx, time * sampling_frequency);
+			/* NOTE: tribal knowledge; this is a problem with the imaging sequence */
+			if (i == 0) p *= inversesqrt(128);
+			//p       *= vec2(cos(iq_time_scale * time), sin(iq_time_scale * time));
+			sum     += p;
+
+			rdist[direction] -= delta[direction];
+			ridx             += dec_data_dim.x;
+		}
+
+		rdist[direction]      = starting_dist[direction];
+		rdist[direction ^ 1] -= delta[direction ^ 1];
+	}
+	float val = length(sum);
+	if (u_volume_export_pass == 0) imageStore(u_out_data_tex,   out_coord, vec4(val));
+	else                           imageStore(u_out_volume_tex, out_coord, vec4(val));
+}

M	beamformer.c	\|	61	+++++++++++++++++++++++++++++++++++++++++++++----------------
M	beamformer.h	\|	3	+++
M	main.c	\|	12	+++++++-----
D	shaders/2d_hercules.glsl	\|	143	-------------------------------------------------------------------------------
A	shaders/hercules.glsl	\|	145	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++