fill out cuda lib skeleton - ogl_beamforming - Ultrasound Beamforming Implemented with OpenGL

Commit: 0f04b0f13a888570f1654032a69ff93e26351f22
Parent: 38170ef98620a2f05101d029a5ddd4a3ecb6321f
Author: tkhenry-uofa
Date:   Mon,  5 Aug 2024 19:32:26 -0600

fill out cuda lib skeleton

Diffstat:
M .gitignore  | 1 +
M beamformer.c  | 50 ++++++++++++++++++++++++++++++--------------------
M beamformer.h  | 19 ++++++++++---------
M beamformer_parameters.h  | 2 +-
M main.c  | 1 +
A shaders/2d_hercules.glsl  | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M shaders/hadamard.glsl  | 1 +
M shaders/lpf.glsl  | 1 +
M shaders/uforces.glsl  | 1 +

9 files changed, 172 insertions(+), 30 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 ogl
+*.dll
 *.so
 *.mexa64
 external/lib
diff --git a/beamformer.c b/beamformer.c
@@ -55,7 +55,7 @@ alloc_shader_storage(BeamformerCtx *ctx, Arena a)
 			glUnmapNamedBuffer(cs->raw_data_ssbo);
 		storage_flags |= GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT;
 	case GL_VENDOR_NVIDIA:
-		/* TODO: does the cuda buffer need to be unregistered? */
+		/* NOTE: register_cuda_buffers will handle the updated ssbo */
 		break;
 	}
 
@@ -64,6 +64,9 @@ alloc_shader_storage(BeamformerCtx *ctx, Arena a)
 	glCreateBuffers(1, &cs->raw_data_ssbo);
 	glNamedBufferStorage(cs->raw_data_ssbo, full_rf_buf_size, 0, storage_flags);
 
+	for (u32 i = 0; i < ARRAY_COUNT(cs->rf_data_ssbos); i++)
+		glNamedBufferStorage(cs->rf_data_ssbos[i], rf_decoded_size, 0, 0);
+
 	i32 map_flags = GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT|GL_MAP_UNSYNCHRONIZED_BIT;
 	switch (ctx->gl_vendor_id) {
 	case GL_VENDOR_INTEL:
@@ -73,17 +76,17 @@ alloc_shader_storage(BeamformerCtx *ctx, Arena a)
 		break;
 	case GL_VENDOR_NVIDIA:
 		cs->raw_data_arena = os_alloc_arena(cs->raw_data_arena, full_rf_buf_size);
-		if (g_cuda_lib_functions[CLF_REGISTER_BUFFER]) {
-			cuda_register_buffer *fn = g_cuda_lib_functions[CLF_REGISTER_BUFFER];
-			/* TODO: this needs the correct parameters */
-			fn(NULL, cs->raw_data_ssbo);
+		if (g_cuda_lib_functions[CLF_REGISTER_BUFFERS] && g_cuda_lib_functions[CLF_INIT_CONFIG]) {
+			register_cuda_buffers *fn = g_cuda_lib_functions[CLF_REGISTER_BUFFERS];
+			fn(cs->rf_data_ssbos, ARRAY_COUNT(cs->rf_data_ssbos), cs->raw_data_ssbo);
+
+			init_cuda_configuration *init_fn = g_cuda_lib_functions[CLF_INIT_CONFIG];
+			init_fn(bp->rf_raw_dim.E, bp->dec_data_dim.E, bp->channel_mapping,
+			        bp->channel_offset > 0);
 		}
 		break;
 	}
 
-	for (u32 i = 0; i < ARRAY_COUNT(cs->rf_data_ssbos); i++)
-		glNamedBufferStorage(cs->rf_data_ssbos[i], rf_decoded_size, 0, 0);
-
 	/* NOTE: store hadamard in GPU once; it won't change for a particular imaging session */
 	cs->hadamard_dim       = (uv2){.x = dec_data_dim.z, .y = dec_data_dim.z};
 	size hadamard_elements = dec_data_dim.z * dec_data_dim.z;
@@ -123,13 +126,14 @@ do_compute_shader(BeamformerCtx *ctx, enum compute_shaders shader)
 		                  ORONE(csctx->dec_data_dim.z));
 		csctx->raw_data_fences[csctx->raw_data_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
 		csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
-		csctx->raw_data_index = (csctx->raw_data_index + 1) % ARRAY_COUNT(csctx->raw_data_fences);
 		break;
 	case CS_CUDA_DECODE_AND_DEMOD:
 		if (g_cuda_lib_functions[CLF_DECODE_AND_DEMOD]) {
-			cuda_decode_and_demod *fn = g_cuda_lib_functions[CLF_DECODE_AND_DEMOD];
-			/* TODO: fill in correct params */
-			fn(1, 2, 3);
+			decode_and_hilbert*fn = g_cuda_lib_functions[CLF_DECODE_AND_DEMOD];
+
+			fn(csctx->raw_data_index * rf_raw_size, output_ssbo_idx);
+			csctx->raw_data_fences[csctx->raw_data_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+			csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
 		}
 		break;
 	case CS_LPF:
@@ -354,14 +358,15 @@ draw_settings_ui(BeamformerCtx *ctx, Arena arena, Rect r, v2 mouse)
 	f32 minx = bp->output_min_xz.x + 1e-6, maxx = bp->output_max_xz.x - 1e-6;
 	f32 minz = bp->output_min_xz.y + 1e-6, maxz = bp->output_max_xz.y - 1e-6;
 	struct listing listings[] = {
-		{ "Sampling Rate:",    "[MHz]", &bp->sampling_frequency, {{0,    0}},     1e-6, 0 },
-		{ "Center Frequency:", "[MHz]", &bp->center_frequency,   {{0,    100e6}}, 1e-6, 1 },
-		{ "Speed of Sound:",   "[m/s]", &bp->speed_of_sound,     {{0,    1e6}},   1,    1 },
-		{ "Min X Point:",      "[mm]",  &bp->output_min_xz.x,    {{-1e3, maxx}},  1e3,  1 },
-		{ "Max X Point:",      "[mm]",  &bp->output_max_xz.x,    {{minx, 1e3}},   1e3,  1 },
-		{ "Min Z Point:",      "[mm]",  &bp->output_min_xz.y,    {{0,    maxz}},  1e3,  1 },
-		{ "Max Z Point:",      "[mm]",  &bp->output_max_xz.y,    {{minz, 1e3}},   1e3,  1 },
-		{ "Dynamic Range:",    "[dB]",  &ctx->fsctx.db,          {{-120, 0}},     1,    1 },
+		{ "Sampling Rate:",    "[MHz]", &bp->sampling_frequency, {{0,    0}},        1e-6, 0 },
+		{ "Center Frequency:", "[MHz]", &bp->center_frequency,   {{0,    100e6}},    1e-6, 1 },
+		{ "Speed of Sound:",   "[m/s]", &bp->speed_of_sound,     {{0,    1e6}},      1,    1 },
+		{ "Min X Point:",      "[mm]",  &bp->output_min_xz.x,    {{-1e3, maxx}},     1e3,  1 },
+		{ "Max X Point:",      "[mm]",  &bp->output_max_xz.x,    {{minx, 1e3}},      1e3,  1 },
+		{ "Min Z Point:",      "[mm]",  &bp->output_min_xz.y,    {{0,    maxz}},     1e3,  1 },
+		{ "Max Z Point:",      "[mm]",  &bp->output_max_xz.y,    {{minz, 1e3}},      1e3,  1 },
+		{ "Dynamic Range:",    "[dB]",  &ctx->fsctx.db,          {{-120, 0}},        1,    1 },
+		{ "Y Position:",       "[mm]",  &bp->off_axis_pos,       {{minx*2, maxx*2}}, 1e3,  1 },
 	};
 
 	static f32 hover_t[ARRAY_COUNT(listings)];
@@ -463,6 +468,7 @@ draw_debug_overlay(BeamformerCtx *ctx, Arena arena, Rect r)
 
 	static char *labels[CS_LAST] = {
 		[CS_HADAMARD] = "Decoding:",
+		[CS_CUDA_DECODE_AND_DEMOD] = "Cuda Decoding:",
 		[CS_LPF]      = "LPF:",
 		[CS_MIN_MAX]  = "Min/Max:",
 		[CS_UFORCES]  = "UFORCES:",
@@ -553,9 +559,11 @@ do_beamformer(BeamformerCtx *ctx, Arena arena)
 		ComputeShaderCtx *cs = &ctx->csctx;
 		if (!uv4_equal(cs->dec_data_dim, bp->dec_data_dim) || ctx->flags & ALLOC_SSBOS)
 			alloc_shader_storage(ctx, arena);
+
 		if (!uv4_equal(ctx->out_data_dim, bp->output_points) || ctx->flags & ALLOC_OUT_TEX)
 			alloc_output_image(ctx);
 
+		cs->raw_data_index = (cs->raw_data_index + 1) % ARRAY_COUNT(cs->raw_data_fences);
 		i32 raw_index = ctx->csctx.raw_data_index;
 		/* NOTE: if this times out it means the command queue is more than 3 frames behind.
 		 * In that case we need to re-evaluate the buffer size */
@@ -592,10 +600,12 @@ do_beamformer(BeamformerCtx *ctx, Arena arena)
 			glNamedBufferSubData(ctx->csctx.shared_ubo, 0, sizeof(*bp), bp);
 			ctx->params->upload = 0;
 		}
+
 		do_compute_shader(ctx, CS_HADAMARD);
 		do_compute_shader(ctx, CS_LPF);
 		do_compute_shader(ctx, CS_UFORCES);
 		do_compute_shader(ctx, CS_MIN_MAX);
+
 		ctx->flags &= ~DO_COMPUTE;
 		ctx->flags |= GEN_MIPMAPS;
 
diff --git a/beamformer.h b/beamformer.h
@@ -45,7 +45,7 @@ enum compute_shaders {
 	CS_HADAMARD,
 	/* TODO: Probably this should be split up */
 	CS_CUDA_DECODE_AND_DEMOD,
-//	CS_HERCULES,
+	CS_HERCULES,
 	CS_LPF,
 	CS_MIN_MAX,
 	CS_UFORCES,
@@ -170,21 +170,22 @@ typedef struct {
 } BeamformerCtx;
 
 enum cuda_lib_functions {
-	CLF_REGISTER_BUFFER,
+	CLF_INIT_CONFIG,
+	CLF_REGISTER_BUFFERS,
 	CLF_DECODE_AND_DEMOD,
 	CLF_LAST
 };
 
-/* TODO: fill in cuda lib entry points (function names) */
 static char *cuda_lib_function_names[CLF_LAST] = {
-	[CLF_REGISTER_BUFFER]  = "cuda_register_buffers",
-	[CLF_DECODE_AND_DEMOD] = "cuda_decode_and_demod",
+	[CLF_INIT_CONFIG]      = "init_cuda_configuration",
+	[CLF_REGISTER_BUFFERS] = "register_cuda_buffers",
+	[CLF_DECODE_AND_DEMOD] = "decode_and_hilbert",
 };
 
-/* TODO: fill in portable function type params */
-#define CUDA_LIB_NAME "libcuda.dll"
-typedef void cuda_register_buffer(void *, u32);
-typedef void cuda_decode_and_demod(u32, u32, u32);
+#define CUDA_LIB_NAME "cuda_toolkit.dll"
+typedef void init_cuda_configuration(const u32*, const u32*, const u32*, b32);
+typedef void register_cuda_buffers(u32*, u32, u32);
+typedef void decode_and_hilbert(size_t, u32);
 
 /* NOTE: Array of Function Pointers used for accessing cuda lib functions */
 /* TODO: robustness: replace with function stubs when not found */
diff --git a/beamformer_parameters.h b/beamformer_parameters.h
@@ -21,5 +21,5 @@ typedef struct {
 	f32 focal_depth;            /* [m]   */
 	f32 time_offset;            /* pulse length correction time [s]   */
 	u32 uforces;                /* mode is UFORCES (1) or FORCES (0) */
-	f32 _pad[1];
+	f32 off_axis_pos;           /* Where on the 3rd axis to render the image (Hercules only)*/
 } BeamformerParameters;
diff --git a/main.c b/main.c
@@ -3,6 +3,7 @@
 
 static char *compute_shader_paths[CS_LAST] = {
 	[CS_HADAMARD] = "shaders/hadamard.glsl",
+	[CS_HERCULES] = "shaders/2d_hercules.glsl",
 	[CS_LPF]      = "shaders/lpf.glsl",
 	[CS_MIN_MAX]  = "shaders/min_max.glsl",
 	[CS_UFORCES]  = "shaders/uforces.glsl",
diff --git a/shaders/2d_hercules.glsl b/shaders/2d_hercules.glsl
@@ -0,0 +1,126 @@
+/* See LICENSE for license details. */
+#version 460 core
+layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
+
+layout(std430, binding = 1) readonly restrict buffer buffer_1 {
+	vec2 rf_data[];
+};
+
+layout(std140, binding = 0) uniform parameters{
+	uvec4 channel_mapping[64];    /* Transducer Channel to Verasonics Channel */
+	uvec4 uforces_channels[32];   /* Channels used for virtual UFORCES elements */
+	vec4  lpf_coefficients[16];   /* Low Pass Filter Cofficients */
+	uvec4 dec_data_dim;           /* Samples * Channels * Acquisitions; last element ignored */
+	uvec4 output_points;          /* Width * Height * Depth; last element ignored */
+	uvec2 rf_raw_dim;             /* Raw Data Dimensions */
+	vec2  output_min_xz;          /* [m] Top left corner of output region */
+	vec2  output_max_xz;          /* [m] Bottom right corner of output region */
+	vec2  xdc_min_xy;             /* [m] Min center of transducer elements */
+	vec2  xdc_max_xy;             /* [m] Max center of transducer elements */
+	uint  channel_offset;         /* Offset into channel_mapping: 0 or 128 (rows or columns) */
+	uint  lpf_order;              /* Order of Low Pass Filter */
+	float speed_of_sound;         /* [m/s] */
+	float sampling_frequency;     /* [Hz]  */
+	float center_frequency;       /* [Hz]  */
+	float focal_depth;            /* [m]   */
+	float time_offset;            /* pulse length correction time [s]   */
+	uint  uforces;                /* mode is UFORCES (1) or FORCES (0) */
+	float off_axis_pos;           /* Where on the 3rd axis to render the image (Hercules only) */
+};
+
+layout(rg32f, location = 1) uniform image3D   u_out_data_tex;
+
+#define C_SPLINE 0.5
+
+/* NOTE: See: https://cubic.org/docs/hermite.htm */
+vec2 cubic(uint ridx, float x)
+{
+	mat4 h = mat4(
+		2, -3, 0, 1,
+		-2, 3, 0, 0,
+		1, -2, 1, 0,
+		1, -1, 0, 0
+	);
+
+	uint  xk = uint(floor(x));
+	float t = (x - float(xk));
+	vec4  S = vec4(t * t * t, t * t, t, 1);
+
+	vec2 P1 = rf_data[ridx + xk];
+	vec2 P2 = rf_data[ridx + xk + 1];
+	vec2 T1 = C_SPLINE * (P2 - rf_data[ridx + xk - 1]);
+	vec2 T2 = C_SPLINE * (rf_data[ridx + xk + 2] - P1);
+
+	vec4 C1 = vec4(P1.x, P2.x, T1.x, T2.x);
+	vec4 C2 = vec4(P1.y, P2.y, T1.y, T2.y);
+	return vec2(dot(S, h * C1), dot(S, h * C2));
+}
+
+void main()
+{
+	vec2  pixel = vec2(gl_GlobalInvocationID.xy);
+	ivec3 out_coord = ivec3(gl_GlobalInvocationID.xyz);
+	ivec3 out_data_dim = imageSize(u_out_data_tex);
+
+	/* NOTE: Convert pixel to physical coordinates */
+	vec2 xdc_size = abs(xdc_max_xy - xdc_min_xy);
+	vec2 output_size = abs(output_max_xz - output_min_xz);
+
+	/* TODO: for now assume y-dimension is along transducer center */
+	vec3 image_point = vec3(
+		output_min_xz.x + pixel.x * output_size.x / out_data_dim.x,
+		off_axis_pos,
+		output_min_xz.y + pixel.y * output_size.y / out_data_dim.y
+	);
+
+	/* NOTE: used for constant F# dynamic receive apodization. This is implemented as:
+	 *
+	 *                  /        |x_e - x_i|\
+	 *    a(x, z) = cos(F# * π * ----------- ) ^ 2
+	 *                  \        |z_e - z_i|/
+		 *
+	 * where x,z_e are transducer element positions and x,z_i are image positions. */
+	float apod_arg = 0.5 * radians(360) * output_size.y / output_size.x / abs(image_point.z);
+
+	/* NOTE: for I-Q data phase correction */
+	float iq_time_scale = radians(360) * center_frequency;
+
+	vec3  starting_dist = vec3(image_point.x - xdc_min_xy.x, image_point.y - xdc_min_xy.y, image_point.z);
+	float dx = xdc_size.x / float(dec_data_dim.y);
+	float dy = xdc_size.y / float(dec_data_dim.y);
+	float dzsign = sign(image_point.z - focal_depth);
+
+	vec2 sum = vec2(0);
+	/* NOTE: skip first acquisition in uforces since its garbage */
+	vec3 rdist = starting_dist;
+	uint ridx = dec_data_dim.y * dec_data_dim.x * uforces;
+	for (uint i = uforces; i < dec_data_dim.z; i++) {
+		uint base_idx = (i - uforces) / 4;
+		uint sub_idx = (i - uforces) - base_idx * 4;
+
+		vec3  focal_point = vec3(uforces_channels[base_idx][sub_idx] * dx, 0, focal_depth);
+		float transmit_dist = image_point.z; //+dzsign * distance(image_point, focal_point);
+
+		for (uint j = 0; j < dec_data_dim.y; j++) {
+			float dist = transmit_dist + length(rdist);
+			float time = dist / speed_of_sound + time_offset;
+
+			/* NOTE: apodization value for this transducer element */
+			vec2 lat_rdist = vec2(rdist.x, rdist.y);
+			float a = cos(clamp(abs(apod_arg * rdist.x), 0, 0.25 * radians(360)));
+			a = a * a;
+
+			vec2 p = cubic(ridx, time * sampling_frequency);
+			//p.x *= cos(iq_time_scale * time);
+			//p.y *= sin(iq_time_scale * time);
+			sum += p;
+			rdist.y -= dy;
+			ridx += dec_data_dim.x;
+		}
+
+		rdist.y = starting_dist.y;
+		rdist.x -= dx;
+	}
+	float val = length(sum);
+	imageStore(u_out_data_tex, out_coord, vec4(val, val, 0, 0));
+}
diff --git a/shaders/hadamard.glsl b/shaders/hadamard.glsl
@@ -33,6 +33,7 @@ layout(std140, binding = 0) uniform parameters {
 	float focal_depth;            /* [m]   */
 	float time_offset;            /* pulse length correction time [s]   */
 	uint  uforces;                /* mode is UFORCES (1) or FORCES (0) */
+	float off_axis_pos;           /* Where on the 3rd axis to render the image (Hercules only) */
 };
 
 void main()
diff --git a/shaders/lpf.glsl b/shaders/lpf.glsl
@@ -29,6 +29,7 @@ layout(std140, binding = 0) uniform parameters {
 	float focal_depth;            /* [m]   */
 	float time_offset;            /* pulse length correction time [s]   */
 	uint  uforces;                /* mode is UFORCES (1) or FORCES (0) */
+	float off_axis_pos;           /* Where on the 3rd axis to render the image (Hercules only) */
 };
 
 void main()
diff --git a/shaders/uforces.glsl b/shaders/uforces.glsl
@@ -25,6 +25,7 @@ layout(std140, binding = 0) uniform parameters {
 	float focal_depth;            /* [m]   */
 	float time_offset;            /* pulse length correction time [s]   */
 	uint  uforces;                /* mode is UFORCES (1) or FORCES (0) */
+	float off_axis_pos;           /* Where on the 3rd axis to render the image (Hercules only) */
 };
 
 layout(rg32f, location = 1) uniform image3D   u_out_data_tex;

M	.gitignore	\|	1	+
M	beamformer.c	\|	50	++++++++++++++++++++++++++++++--------------------
M	beamformer.h	\|	19	++++++++++---------
M	beamformer_parameters.h	\|	2	+-
M	main.c	\|	1	+
A	shaders/2d_hercules.glsl	\|	126	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	shaders/hadamard.glsl	\|	1	+
M	shaders/lpf.glsl	\|	1	+
M	shaders/uforces.glsl	\|	1	+