Commit: 0f04b0f13a888570f1654032a69ff93e26351f22
Parent: 38170ef98620a2f05101d029a5ddd4a3ecb6321f
Author: tkhenry-uofa
Date: Mon, 5 Aug 2024 19:32:26 -0600
fill out cuda lib skeleton
Diffstat:
9 files changed, 172 insertions(+), 30 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
ogl
+*.dll
*.so
*.mexa64
external/lib
diff --git a/beamformer.c b/beamformer.c
@@ -55,7 +55,7 @@ alloc_shader_storage(BeamformerCtx *ctx, Arena a)
glUnmapNamedBuffer(cs->raw_data_ssbo);
storage_flags |= GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT;
case GL_VENDOR_NVIDIA:
- /* TODO: does the cuda buffer need to be unregistered? */
+ /* NOTE: register_cuda_buffers will handle the updated ssbo */
break;
}
@@ -64,6 +64,9 @@ alloc_shader_storage(BeamformerCtx *ctx, Arena a)
glCreateBuffers(1, &cs->raw_data_ssbo);
glNamedBufferStorage(cs->raw_data_ssbo, full_rf_buf_size, 0, storage_flags);
+ for (u32 i = 0; i < ARRAY_COUNT(cs->rf_data_ssbos); i++)
+ glNamedBufferStorage(cs->rf_data_ssbos[i], rf_decoded_size, 0, 0);
+
i32 map_flags = GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT|GL_MAP_UNSYNCHRONIZED_BIT;
switch (ctx->gl_vendor_id) {
case GL_VENDOR_INTEL:
@@ -73,17 +76,17 @@ alloc_shader_storage(BeamformerCtx *ctx, Arena a)
break;
case GL_VENDOR_NVIDIA:
cs->raw_data_arena = os_alloc_arena(cs->raw_data_arena, full_rf_buf_size);
- if (g_cuda_lib_functions[CLF_REGISTER_BUFFER]) {
- cuda_register_buffer *fn = g_cuda_lib_functions[CLF_REGISTER_BUFFER];
- /* TODO: this needs the correct parameters */
- fn(NULL, cs->raw_data_ssbo);
+ if (g_cuda_lib_functions[CLF_REGISTER_BUFFERS] && g_cuda_lib_functions[CLF_INIT_CONFIG]) {
+ register_cuda_buffers *fn = g_cuda_lib_functions[CLF_REGISTER_BUFFERS];
+ fn(cs->rf_data_ssbos, ARRAY_COUNT(cs->rf_data_ssbos), cs->raw_data_ssbo);
+
+ init_cuda_configuration *init_fn = g_cuda_lib_functions[CLF_INIT_CONFIG];
+ init_fn(bp->rf_raw_dim.E, bp->dec_data_dim.E, bp->channel_mapping,
+ bp->channel_offset > 0);
}
break;
}
- for (u32 i = 0; i < ARRAY_COUNT(cs->rf_data_ssbos); i++)
- glNamedBufferStorage(cs->rf_data_ssbos[i], rf_decoded_size, 0, 0);
-
/* NOTE: store hadamard in GPU once; it won't change for a particular imaging session */
cs->hadamard_dim = (uv2){.x = dec_data_dim.z, .y = dec_data_dim.z};
size hadamard_elements = dec_data_dim.z * dec_data_dim.z;
@@ -123,13 +126,14 @@ do_compute_shader(BeamformerCtx *ctx, enum compute_shaders shader)
ORONE(csctx->dec_data_dim.z));
csctx->raw_data_fences[csctx->raw_data_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
- csctx->raw_data_index = (csctx->raw_data_index + 1) % ARRAY_COUNT(csctx->raw_data_fences);
break;
case CS_CUDA_DECODE_AND_DEMOD:
if (g_cuda_lib_functions[CLF_DECODE_AND_DEMOD]) {
- cuda_decode_and_demod *fn = g_cuda_lib_functions[CLF_DECODE_AND_DEMOD];
- /* TODO: fill in correct params */
- fn(1, 2, 3);
+ decode_and_hilbert*fn = g_cuda_lib_functions[CLF_DECODE_AND_DEMOD];
+
+ fn(csctx->raw_data_index * rf_raw_size, output_ssbo_idx);
+ csctx->raw_data_fences[csctx->raw_data_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+ csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
}
break;
case CS_LPF:
@@ -354,14 +358,15 @@ draw_settings_ui(BeamformerCtx *ctx, Arena arena, Rect r, v2 mouse)
f32 minx = bp->output_min_xz.x + 1e-6, maxx = bp->output_max_xz.x - 1e-6;
f32 minz = bp->output_min_xz.y + 1e-6, maxz = bp->output_max_xz.y - 1e-6;
struct listing listings[] = {
- { "Sampling Rate:", "[MHz]", &bp->sampling_frequency, {{0, 0}}, 1e-6, 0 },
- { "Center Frequency:", "[MHz]", &bp->center_frequency, {{0, 100e6}}, 1e-6, 1 },
- { "Speed of Sound:", "[m/s]", &bp->speed_of_sound, {{0, 1e6}}, 1, 1 },
- { "Min X Point:", "[mm]", &bp->output_min_xz.x, {{-1e3, maxx}}, 1e3, 1 },
- { "Max X Point:", "[mm]", &bp->output_max_xz.x, {{minx, 1e3}}, 1e3, 1 },
- { "Min Z Point:", "[mm]", &bp->output_min_xz.y, {{0, maxz}}, 1e3, 1 },
- { "Max Z Point:", "[mm]", &bp->output_max_xz.y, {{minz, 1e3}}, 1e3, 1 },
- { "Dynamic Range:", "[dB]", &ctx->fsctx.db, {{-120, 0}}, 1, 1 },
+ { "Sampling Rate:", "[MHz]", &bp->sampling_frequency, {{0, 0}}, 1e-6, 0 },
+ { "Center Frequency:", "[MHz]", &bp->center_frequency, {{0, 100e6}}, 1e-6, 1 },
+ { "Speed of Sound:", "[m/s]", &bp->speed_of_sound, {{0, 1e6}}, 1, 1 },
+ { "Min X Point:", "[mm]", &bp->output_min_xz.x, {{-1e3, maxx}}, 1e3, 1 },
+ { "Max X Point:", "[mm]", &bp->output_max_xz.x, {{minx, 1e3}}, 1e3, 1 },
+ { "Min Z Point:", "[mm]", &bp->output_min_xz.y, {{0, maxz}}, 1e3, 1 },
+ { "Max Z Point:", "[mm]", &bp->output_max_xz.y, {{minz, 1e3}}, 1e3, 1 },
+ { "Dynamic Range:", "[dB]", &ctx->fsctx.db, {{-120, 0}}, 1, 1 },
+ { "Y Position:", "[mm]", &bp->off_axis_pos, {{minx*2, maxx*2}}, 1e3, 1 },
};
static f32 hover_t[ARRAY_COUNT(listings)];
@@ -463,6 +468,7 @@ draw_debug_overlay(BeamformerCtx *ctx, Arena arena, Rect r)
static char *labels[CS_LAST] = {
[CS_HADAMARD] = "Decoding:",
+ [CS_CUDA_DECODE_AND_DEMOD] = "Cuda Decoding:",
[CS_LPF] = "LPF:",
[CS_MIN_MAX] = "Min/Max:",
[CS_UFORCES] = "UFORCES:",
@@ -553,9 +559,11 @@ do_beamformer(BeamformerCtx *ctx, Arena arena)
ComputeShaderCtx *cs = &ctx->csctx;
if (!uv4_equal(cs->dec_data_dim, bp->dec_data_dim) || ctx->flags & ALLOC_SSBOS)
alloc_shader_storage(ctx, arena);
+
if (!uv4_equal(ctx->out_data_dim, bp->output_points) || ctx->flags & ALLOC_OUT_TEX)
alloc_output_image(ctx);
+ cs->raw_data_index = (cs->raw_data_index + 1) % ARRAY_COUNT(cs->raw_data_fences);
i32 raw_index = ctx->csctx.raw_data_index;
/* NOTE: if this times out it means the command queue is more than 3 frames behind.
* In that case we need to re-evaluate the buffer size */
@@ -592,10 +600,12 @@ do_beamformer(BeamformerCtx *ctx, Arena arena)
glNamedBufferSubData(ctx->csctx.shared_ubo, 0, sizeof(*bp), bp);
ctx->params->upload = 0;
}
+
do_compute_shader(ctx, CS_HADAMARD);
do_compute_shader(ctx, CS_LPF);
do_compute_shader(ctx, CS_UFORCES);
do_compute_shader(ctx, CS_MIN_MAX);
+
ctx->flags &= ~DO_COMPUTE;
ctx->flags |= GEN_MIPMAPS;
diff --git a/beamformer.h b/beamformer.h
@@ -45,7 +45,7 @@ enum compute_shaders {
CS_HADAMARD,
/* TODO: Probably this should be split up */
CS_CUDA_DECODE_AND_DEMOD,
-// CS_HERCULES,
+ CS_HERCULES,
CS_LPF,
CS_MIN_MAX,
CS_UFORCES,
@@ -170,21 +170,22 @@ typedef struct {
} BeamformerCtx;
enum cuda_lib_functions {
- CLF_REGISTER_BUFFER,
+ CLF_INIT_CONFIG,
+ CLF_REGISTER_BUFFERS,
CLF_DECODE_AND_DEMOD,
CLF_LAST
};
-/* TODO: fill in cuda lib entry points (function names) */
static char *cuda_lib_function_names[CLF_LAST] = {
- [CLF_REGISTER_BUFFER] = "cuda_register_buffers",
- [CLF_DECODE_AND_DEMOD] = "cuda_decode_and_demod",
+ [CLF_INIT_CONFIG] = "init_cuda_configuration",
+ [CLF_REGISTER_BUFFERS] = "register_cuda_buffers",
+ [CLF_DECODE_AND_DEMOD] = "decode_and_hilbert",
};
-/* TODO: fill in portable function type params */
-#define CUDA_LIB_NAME "libcuda.dll"
-typedef void cuda_register_buffer(void *, u32);
-typedef void cuda_decode_and_demod(u32, u32, u32);
+#define CUDA_LIB_NAME "cuda_toolkit.dll"
+typedef void init_cuda_configuration(const u32*, const u32*, const u32*, b32);
+typedef void register_cuda_buffers(u32*, u32, u32);
+typedef void decode_and_hilbert(size_t, u32);
/* NOTE: Array of Function Pointers used for accessing cuda lib functions */
/* TODO: robustness: replace with function stubs when not found */
diff --git a/beamformer_parameters.h b/beamformer_parameters.h
@@ -21,5 +21,5 @@ typedef struct {
f32 focal_depth; /* [m] */
f32 time_offset; /* pulse length correction time [s] */
u32 uforces; /* mode is UFORCES (1) or FORCES (0) */
- f32 _pad[1];
+ f32 off_axis_pos; /* Where on the 3rd axis to render the image (Hercules only)*/
} BeamformerParameters;
diff --git a/main.c b/main.c
@@ -3,6 +3,7 @@
static char *compute_shader_paths[CS_LAST] = {
[CS_HADAMARD] = "shaders/hadamard.glsl",
+ [CS_HERCULES] = "shaders/2d_hercules.glsl",
[CS_LPF] = "shaders/lpf.glsl",
[CS_MIN_MAX] = "shaders/min_max.glsl",
[CS_UFORCES] = "shaders/uforces.glsl",
diff --git a/shaders/2d_hercules.glsl b/shaders/2d_hercules.glsl
@@ -0,0 +1,126 @@
+/* See LICENSE for license details. */
+#version 460 core
+layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
+
+layout(std430, binding = 1) readonly restrict buffer buffer_1 {
+ vec2 rf_data[];
+};
+
+layout(std140, binding = 0) uniform parameters{
+ uvec4 channel_mapping[64]; /* Transducer Channel to Verasonics Channel */
+ uvec4 uforces_channels[32]; /* Channels used for virtual UFORCES elements */
+ vec4 lpf_coefficients[16]; /* Low Pass Filter Cofficients */
+ uvec4 dec_data_dim; /* Samples * Channels * Acquisitions; last element ignored */
+ uvec4 output_points; /* Width * Height * Depth; last element ignored */
+ uvec2 rf_raw_dim; /* Raw Data Dimensions */
+ vec2 output_min_xz; /* [m] Top left corner of output region */
+ vec2 output_max_xz; /* [m] Bottom right corner of output region */
+ vec2 xdc_min_xy; /* [m] Min center of transducer elements */
+ vec2 xdc_max_xy; /* [m] Max center of transducer elements */
+ uint channel_offset; /* Offset into channel_mapping: 0 or 128 (rows or columns) */
+ uint lpf_order; /* Order of Low Pass Filter */
+ float speed_of_sound; /* [m/s] */
+ float sampling_frequency; /* [Hz] */
+ float center_frequency; /* [Hz] */
+ float focal_depth; /* [m] */
+ float time_offset; /* pulse length correction time [s] */
+ uint uforces; /* mode is UFORCES (1) or FORCES (0) */
+ float off_axis_pos; /* Where on the 3rd axis to render the image (Hercules only) */
+};
+
+layout(rg32f, location = 1) uniform image3D u_out_data_tex;
+
+#define C_SPLINE 0.5
+
+/* NOTE: See: https://cubic.org/docs/hermite.htm */
+vec2 cubic(uint ridx, float x)
+{
+ mat4 h = mat4(
+ 2, -3, 0, 1,
+ -2, 3, 0, 0,
+ 1, -2, 1, 0,
+ 1, -1, 0, 0
+ );
+
+ uint xk = uint(floor(x));
+ float t = (x - float(xk));
+ vec4 S = vec4(t * t * t, t * t, t, 1);
+
+ vec2 P1 = rf_data[ridx + xk];
+ vec2 P2 = rf_data[ridx + xk + 1];
+ vec2 T1 = C_SPLINE * (P2 - rf_data[ridx + xk - 1]);
+ vec2 T2 = C_SPLINE * (rf_data[ridx + xk + 2] - P1);
+
+ vec4 C1 = vec4(P1.x, P2.x, T1.x, T2.x);
+ vec4 C2 = vec4(P1.y, P2.y, T1.y, T2.y);
+ return vec2(dot(S, h * C1), dot(S, h * C2));
+}
+
+void main()
+{
+ vec2 pixel = vec2(gl_GlobalInvocationID.xy);
+ ivec3 out_coord = ivec3(gl_GlobalInvocationID.xyz);
+ ivec3 out_data_dim = imageSize(u_out_data_tex);
+
+ /* NOTE: Convert pixel to physical coordinates */
+ vec2 xdc_size = abs(xdc_max_xy - xdc_min_xy);
+ vec2 output_size = abs(output_max_xz - output_min_xz);
+
+ /* TODO: for now assume y-dimension is along transducer center */
+ vec3 image_point = vec3(
+ output_min_xz.x + pixel.x * output_size.x / out_data_dim.x,
+ off_axis_pos,
+ output_min_xz.y + pixel.y * output_size.y / out_data_dim.y
+ );
+
+ /* NOTE: used for constant F# dynamic receive apodization. This is implemented as:
+ *
+ * / |x_e - x_i|\
+ * a(x, z) = cos(F# * π * ----------- ) ^ 2
+ * \ |z_e - z_i|/
+ *
+ * where x,z_e are transducer element positions and x,z_i are image positions. */
+ float apod_arg = 0.5 * radians(360) * output_size.y / output_size.x / abs(image_point.z);
+
+ /* NOTE: for I-Q data phase correction */
+ float iq_time_scale = radians(360) * center_frequency;
+
+ vec3 starting_dist = vec3(image_point.x - xdc_min_xy.x, image_point.y - xdc_min_xy.y, image_point.z);
+ float dx = xdc_size.x / float(dec_data_dim.y);
+ float dy = xdc_size.y / float(dec_data_dim.y);
+ float dzsign = sign(image_point.z - focal_depth);
+
+ vec2 sum = vec2(0);
+ /* NOTE: skip first acquisition in uforces since its garbage */
+ vec3 rdist = starting_dist;
+ uint ridx = dec_data_dim.y * dec_data_dim.x * uforces;
+ for (uint i = uforces; i < dec_data_dim.z; i++) {
+ uint base_idx = (i - uforces) / 4;
+ uint sub_idx = (i - uforces) - base_idx * 4;
+
+ vec3 focal_point = vec3(uforces_channels[base_idx][sub_idx] * dx, 0, focal_depth);
+ float transmit_dist = image_point.z; //+dzsign * distance(image_point, focal_point);
+
+ for (uint j = 0; j < dec_data_dim.y; j++) {
+ float dist = transmit_dist + length(rdist);
+ float time = dist / speed_of_sound + time_offset;
+
+ /* NOTE: apodization value for this transducer element */
+ vec2 lat_rdist = vec2(rdist.x, rdist.y);
+ float a = cos(clamp(abs(apod_arg * rdist.x), 0, 0.25 * radians(360)));
+ a = a * a;
+
+ vec2 p = cubic(ridx, time * sampling_frequency);
+ //p.x *= cos(iq_time_scale * time);
+ //p.y *= sin(iq_time_scale * time);
+ sum += p;
+ rdist.y -= dy;
+ ridx += dec_data_dim.x;
+ }
+
+ rdist.y = starting_dist.y;
+ rdist.x -= dx;
+ }
+ float val = length(sum);
+ imageStore(u_out_data_tex, out_coord, vec4(val, val, 0, 0));
+}
diff --git a/shaders/hadamard.glsl b/shaders/hadamard.glsl
@@ -33,6 +33,7 @@ layout(std140, binding = 0) uniform parameters {
float focal_depth; /* [m] */
float time_offset; /* pulse length correction time [s] */
uint uforces; /* mode is UFORCES (1) or FORCES (0) */
+ float off_axis_pos; /* Where on the 3rd axis to render the image (Hercules only) */
};
void main()
diff --git a/shaders/lpf.glsl b/shaders/lpf.glsl
@@ -29,6 +29,7 @@ layout(std140, binding = 0) uniform parameters {
float focal_depth; /* [m] */
float time_offset; /* pulse length correction time [s] */
uint uforces; /* mode is UFORCES (1) or FORCES (0) */
+ float off_axis_pos; /* Where on the 3rd axis to render the image (Hercules only) */
};
void main()
diff --git a/shaders/uforces.glsl b/shaders/uforces.glsl
@@ -25,6 +25,7 @@ layout(std140, binding = 0) uniform parameters {
float focal_depth; /* [m] */
float time_offset; /* pulse length correction time [s] */
uint uforces; /* mode is UFORCES (1) or FORCES (0) */
+ float off_axis_pos; /* Where on the 3rd axis to render the image (Hercules only) */
};
layout(rg32f, location = 1) uniform image3D u_out_data_tex;