ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | LICENSE

Commit: 0f04b0f13a888570f1654032a69ff93e26351f22
Parent: 38170ef98620a2f05101d029a5ddd4a3ecb6321f
Author: tkhenry-uofa
Date:   Mon,  5 Aug 2024 19:32:26 -0600

fill out cuda lib skeleton

Diffstat:
M.gitignore | 1+
Mbeamformer.c | 50++++++++++++++++++++++++++++++--------------------
Mbeamformer.h | 19++++++++++---------
Mbeamformer_parameters.h | 2+-
Mmain.c | 1+
Ashaders/2d_hercules.glsl | 126+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mshaders/hadamard.glsl | 1+
Mshaders/lpf.glsl | 1+
Mshaders/uforces.glsl | 1+
9 files changed, 172 insertions(+), 30 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -1,4 +1,5 @@ ogl +*.dll *.so *.mexa64 external/lib diff --git a/beamformer.c b/beamformer.c @@ -55,7 +55,7 @@ alloc_shader_storage(BeamformerCtx *ctx, Arena a) glUnmapNamedBuffer(cs->raw_data_ssbo); storage_flags |= GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT; case GL_VENDOR_NVIDIA: - /* TODO: does the cuda buffer need to be unregistered? */ + /* NOTE: register_cuda_buffers will handle the updated ssbo */ break; } @@ -64,6 +64,9 @@ alloc_shader_storage(BeamformerCtx *ctx, Arena a) glCreateBuffers(1, &cs->raw_data_ssbo); glNamedBufferStorage(cs->raw_data_ssbo, full_rf_buf_size, 0, storage_flags); + for (u32 i = 0; i < ARRAY_COUNT(cs->rf_data_ssbos); i++) + glNamedBufferStorage(cs->rf_data_ssbos[i], rf_decoded_size, 0, 0); + i32 map_flags = GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT|GL_MAP_UNSYNCHRONIZED_BIT; switch (ctx->gl_vendor_id) { case GL_VENDOR_INTEL: @@ -73,17 +76,17 @@ alloc_shader_storage(BeamformerCtx *ctx, Arena a) break; case GL_VENDOR_NVIDIA: cs->raw_data_arena = os_alloc_arena(cs->raw_data_arena, full_rf_buf_size); - if (g_cuda_lib_functions[CLF_REGISTER_BUFFER]) { - cuda_register_buffer *fn = g_cuda_lib_functions[CLF_REGISTER_BUFFER]; - /* TODO: this needs the correct parameters */ - fn(NULL, cs->raw_data_ssbo); + if (g_cuda_lib_functions[CLF_REGISTER_BUFFERS] && g_cuda_lib_functions[CLF_INIT_CONFIG]) { + register_cuda_buffers *fn = g_cuda_lib_functions[CLF_REGISTER_BUFFERS]; + fn(cs->rf_data_ssbos, ARRAY_COUNT(cs->rf_data_ssbos), cs->raw_data_ssbo); + + init_cuda_configuration *init_fn = g_cuda_lib_functions[CLF_INIT_CONFIG]; + init_fn(bp->rf_raw_dim.E, bp->dec_data_dim.E, bp->channel_mapping, + bp->channel_offset > 0); } break; } - for (u32 i = 0; i < ARRAY_COUNT(cs->rf_data_ssbos); i++) - glNamedBufferStorage(cs->rf_data_ssbos[i], rf_decoded_size, 0, 0); - /* NOTE: store hadamard in GPU once; it won't change for a particular imaging session */ cs->hadamard_dim = (uv2){.x = dec_data_dim.z, .y = dec_data_dim.z}; size hadamard_elements = dec_data_dim.z * dec_data_dim.z; @@ -123,13 +126,14 @@ do_compute_shader(BeamformerCtx *ctx, enum compute_shaders shader) ORONE(csctx->dec_data_dim.z)); csctx->raw_data_fences[csctx->raw_data_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index; - csctx->raw_data_index = (csctx->raw_data_index + 1) % ARRAY_COUNT(csctx->raw_data_fences); break; case CS_CUDA_DECODE_AND_DEMOD: if (g_cuda_lib_functions[CLF_DECODE_AND_DEMOD]) { - cuda_decode_and_demod *fn = g_cuda_lib_functions[CLF_DECODE_AND_DEMOD]; - /* TODO: fill in correct params */ - fn(1, 2, 3); + decode_and_hilbert*fn = g_cuda_lib_functions[CLF_DECODE_AND_DEMOD]; + + fn(csctx->raw_data_index * rf_raw_size, output_ssbo_idx); + csctx->raw_data_fences[csctx->raw_data_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index; } break; case CS_LPF: @@ -354,14 +358,15 @@ draw_settings_ui(BeamformerCtx *ctx, Arena arena, Rect r, v2 mouse) f32 minx = bp->output_min_xz.x + 1e-6, maxx = bp->output_max_xz.x - 1e-6; f32 minz = bp->output_min_xz.y + 1e-6, maxz = bp->output_max_xz.y - 1e-6; struct listing listings[] = { - { "Sampling Rate:", "[MHz]", &bp->sampling_frequency, {{0, 0}}, 1e-6, 0 }, - { "Center Frequency:", "[MHz]", &bp->center_frequency, {{0, 100e6}}, 1e-6, 1 }, - { "Speed of Sound:", "[m/s]", &bp->speed_of_sound, {{0, 1e6}}, 1, 1 }, - { "Min X Point:", "[mm]", &bp->output_min_xz.x, {{-1e3, maxx}}, 1e3, 1 }, - { "Max X Point:", "[mm]", &bp->output_max_xz.x, {{minx, 1e3}}, 1e3, 1 }, - { "Min Z Point:", "[mm]", &bp->output_min_xz.y, {{0, maxz}}, 1e3, 1 }, - { "Max Z Point:", "[mm]", &bp->output_max_xz.y, {{minz, 1e3}}, 1e3, 1 }, - { "Dynamic Range:", "[dB]", &ctx->fsctx.db, {{-120, 0}}, 1, 1 }, + { "Sampling Rate:", "[MHz]", &bp->sampling_frequency, {{0, 0}}, 1e-6, 0 }, + { "Center Frequency:", "[MHz]", &bp->center_frequency, {{0, 100e6}}, 1e-6, 1 }, + { "Speed of Sound:", "[m/s]", &bp->speed_of_sound, {{0, 1e6}}, 1, 1 }, + { "Min X Point:", "[mm]", &bp->output_min_xz.x, {{-1e3, maxx}}, 1e3, 1 }, + { "Max X Point:", "[mm]", &bp->output_max_xz.x, {{minx, 1e3}}, 1e3, 1 }, + { "Min Z Point:", "[mm]", &bp->output_min_xz.y, {{0, maxz}}, 1e3, 1 }, + { "Max Z Point:", "[mm]", &bp->output_max_xz.y, {{minz, 1e3}}, 1e3, 1 }, + { "Dynamic Range:", "[dB]", &ctx->fsctx.db, {{-120, 0}}, 1, 1 }, + { "Y Position:", "[mm]", &bp->off_axis_pos, {{minx*2, maxx*2}}, 1e3, 1 }, }; static f32 hover_t[ARRAY_COUNT(listings)]; @@ -463,6 +468,7 @@ draw_debug_overlay(BeamformerCtx *ctx, Arena arena, Rect r) static char *labels[CS_LAST] = { [CS_HADAMARD] = "Decoding:", + [CS_CUDA_DECODE_AND_DEMOD] = "Cuda Decoding:", [CS_LPF] = "LPF:", [CS_MIN_MAX] = "Min/Max:", [CS_UFORCES] = "UFORCES:", @@ -553,9 +559,11 @@ do_beamformer(BeamformerCtx *ctx, Arena arena) ComputeShaderCtx *cs = &ctx->csctx; if (!uv4_equal(cs->dec_data_dim, bp->dec_data_dim) || ctx->flags & ALLOC_SSBOS) alloc_shader_storage(ctx, arena); + if (!uv4_equal(ctx->out_data_dim, bp->output_points) || ctx->flags & ALLOC_OUT_TEX) alloc_output_image(ctx); + cs->raw_data_index = (cs->raw_data_index + 1) % ARRAY_COUNT(cs->raw_data_fences); i32 raw_index = ctx->csctx.raw_data_index; /* NOTE: if this times out it means the command queue is more than 3 frames behind. * In that case we need to re-evaluate the buffer size */ @@ -592,10 +600,12 @@ do_beamformer(BeamformerCtx *ctx, Arena arena) glNamedBufferSubData(ctx->csctx.shared_ubo, 0, sizeof(*bp), bp); ctx->params->upload = 0; } + do_compute_shader(ctx, CS_HADAMARD); do_compute_shader(ctx, CS_LPF); do_compute_shader(ctx, CS_UFORCES); do_compute_shader(ctx, CS_MIN_MAX); + ctx->flags &= ~DO_COMPUTE; ctx->flags |= GEN_MIPMAPS; diff --git a/beamformer.h b/beamformer.h @@ -45,7 +45,7 @@ enum compute_shaders { CS_HADAMARD, /* TODO: Probably this should be split up */ CS_CUDA_DECODE_AND_DEMOD, -// CS_HERCULES, + CS_HERCULES, CS_LPF, CS_MIN_MAX, CS_UFORCES, @@ -170,21 +170,22 @@ typedef struct { } BeamformerCtx; enum cuda_lib_functions { - CLF_REGISTER_BUFFER, + CLF_INIT_CONFIG, + CLF_REGISTER_BUFFERS, CLF_DECODE_AND_DEMOD, CLF_LAST }; -/* TODO: fill in cuda lib entry points (function names) */ static char *cuda_lib_function_names[CLF_LAST] = { - [CLF_REGISTER_BUFFER] = "cuda_register_buffers", - [CLF_DECODE_AND_DEMOD] = "cuda_decode_and_demod", + [CLF_INIT_CONFIG] = "init_cuda_configuration", + [CLF_REGISTER_BUFFERS] = "register_cuda_buffers", + [CLF_DECODE_AND_DEMOD] = "decode_and_hilbert", }; -/* TODO: fill in portable function type params */ -#define CUDA_LIB_NAME "libcuda.dll" -typedef void cuda_register_buffer(void *, u32); -typedef void cuda_decode_and_demod(u32, u32, u32); +#define CUDA_LIB_NAME "cuda_toolkit.dll" +typedef void init_cuda_configuration(const u32*, const u32*, const u32*, b32); +typedef void register_cuda_buffers(u32*, u32, u32); +typedef void decode_and_hilbert(size_t, u32); /* NOTE: Array of Function Pointers used for accessing cuda lib functions */ /* TODO: robustness: replace with function stubs when not found */ diff --git a/beamformer_parameters.h b/beamformer_parameters.h @@ -21,5 +21,5 @@ typedef struct { f32 focal_depth; /* [m] */ f32 time_offset; /* pulse length correction time [s] */ u32 uforces; /* mode is UFORCES (1) or FORCES (0) */ - f32 _pad[1]; + f32 off_axis_pos; /* Where on the 3rd axis to render the image (Hercules only)*/ } BeamformerParameters; diff --git a/main.c b/main.c @@ -3,6 +3,7 @@ static char *compute_shader_paths[CS_LAST] = { [CS_HADAMARD] = "shaders/hadamard.glsl", + [CS_HERCULES] = "shaders/2d_hercules.glsl", [CS_LPF] = "shaders/lpf.glsl", [CS_MIN_MAX] = "shaders/min_max.glsl", [CS_UFORCES] = "shaders/uforces.glsl", diff --git a/shaders/2d_hercules.glsl b/shaders/2d_hercules.glsl @@ -0,0 +1,126 @@ +/* See LICENSE for license details. */ +#version 460 core +layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; + +layout(std430, binding = 1) readonly restrict buffer buffer_1 { + vec2 rf_data[]; +}; + +layout(std140, binding = 0) uniform parameters{ + uvec4 channel_mapping[64]; /* Transducer Channel to Verasonics Channel */ + uvec4 uforces_channels[32]; /* Channels used for virtual UFORCES elements */ + vec4 lpf_coefficients[16]; /* Low Pass Filter Cofficients */ + uvec4 dec_data_dim; /* Samples * Channels * Acquisitions; last element ignored */ + uvec4 output_points; /* Width * Height * Depth; last element ignored */ + uvec2 rf_raw_dim; /* Raw Data Dimensions */ + vec2 output_min_xz; /* [m] Top left corner of output region */ + vec2 output_max_xz; /* [m] Bottom right corner of output region */ + vec2 xdc_min_xy; /* [m] Min center of transducer elements */ + vec2 xdc_max_xy; /* [m] Max center of transducer elements */ + uint channel_offset; /* Offset into channel_mapping: 0 or 128 (rows or columns) */ + uint lpf_order; /* Order of Low Pass Filter */ + float speed_of_sound; /* [m/s] */ + float sampling_frequency; /* [Hz] */ + float center_frequency; /* [Hz] */ + float focal_depth; /* [m] */ + float time_offset; /* pulse length correction time [s] */ + uint uforces; /* mode is UFORCES (1) or FORCES (0) */ + float off_axis_pos; /* Where on the 3rd axis to render the image (Hercules only) */ +}; + +layout(rg32f, location = 1) uniform image3D u_out_data_tex; + +#define C_SPLINE 0.5 + +/* NOTE: See: https://cubic.org/docs/hermite.htm */ +vec2 cubic(uint ridx, float x) +{ + mat4 h = mat4( + 2, -3, 0, 1, + -2, 3, 0, 0, + 1, -2, 1, 0, + 1, -1, 0, 0 + ); + + uint xk = uint(floor(x)); + float t = (x - float(xk)); + vec4 S = vec4(t * t * t, t * t, t, 1); + + vec2 P1 = rf_data[ridx + xk]; + vec2 P2 = rf_data[ridx + xk + 1]; + vec2 T1 = C_SPLINE * (P2 - rf_data[ridx + xk - 1]); + vec2 T2 = C_SPLINE * (rf_data[ridx + xk + 2] - P1); + + vec4 C1 = vec4(P1.x, P2.x, T1.x, T2.x); + vec4 C2 = vec4(P1.y, P2.y, T1.y, T2.y); + return vec2(dot(S, h * C1), dot(S, h * C2)); +} + +void main() +{ + vec2 pixel = vec2(gl_GlobalInvocationID.xy); + ivec3 out_coord = ivec3(gl_GlobalInvocationID.xyz); + ivec3 out_data_dim = imageSize(u_out_data_tex); + + /* NOTE: Convert pixel to physical coordinates */ + vec2 xdc_size = abs(xdc_max_xy - xdc_min_xy); + vec2 output_size = abs(output_max_xz - output_min_xz); + + /* TODO: for now assume y-dimension is along transducer center */ + vec3 image_point = vec3( + output_min_xz.x + pixel.x * output_size.x / out_data_dim.x, + off_axis_pos, + output_min_xz.y + pixel.y * output_size.y / out_data_dim.y + ); + + /* NOTE: used for constant F# dynamic receive apodization. This is implemented as: + * + * / |x_e - x_i|\ + * a(x, z) = cos(F# * π * ----------- ) ^ 2 + * \ |z_e - z_i|/ + * + * where x,z_e are transducer element positions and x,z_i are image positions. */ + float apod_arg = 0.5 * radians(360) * output_size.y / output_size.x / abs(image_point.z); + + /* NOTE: for I-Q data phase correction */ + float iq_time_scale = radians(360) * center_frequency; + + vec3 starting_dist = vec3(image_point.x - xdc_min_xy.x, image_point.y - xdc_min_xy.y, image_point.z); + float dx = xdc_size.x / float(dec_data_dim.y); + float dy = xdc_size.y / float(dec_data_dim.y); + float dzsign = sign(image_point.z - focal_depth); + + vec2 sum = vec2(0); + /* NOTE: skip first acquisition in uforces since its garbage */ + vec3 rdist = starting_dist; + uint ridx = dec_data_dim.y * dec_data_dim.x * uforces; + for (uint i = uforces; i < dec_data_dim.z; i++) { + uint base_idx = (i - uforces) / 4; + uint sub_idx = (i - uforces) - base_idx * 4; + + vec3 focal_point = vec3(uforces_channels[base_idx][sub_idx] * dx, 0, focal_depth); + float transmit_dist = image_point.z; //+dzsign * distance(image_point, focal_point); + + for (uint j = 0; j < dec_data_dim.y; j++) { + float dist = transmit_dist + length(rdist); + float time = dist / speed_of_sound + time_offset; + + /* NOTE: apodization value for this transducer element */ + vec2 lat_rdist = vec2(rdist.x, rdist.y); + float a = cos(clamp(abs(apod_arg * rdist.x), 0, 0.25 * radians(360))); + a = a * a; + + vec2 p = cubic(ridx, time * sampling_frequency); + //p.x *= cos(iq_time_scale * time); + //p.y *= sin(iq_time_scale * time); + sum += p; + rdist.y -= dy; + ridx += dec_data_dim.x; + } + + rdist.y = starting_dist.y; + rdist.x -= dx; + } + float val = length(sum); + imageStore(u_out_data_tex, out_coord, vec4(val, val, 0, 0)); +} diff --git a/shaders/hadamard.glsl b/shaders/hadamard.glsl @@ -33,6 +33,7 @@ layout(std140, binding = 0) uniform parameters { float focal_depth; /* [m] */ float time_offset; /* pulse length correction time [s] */ uint uforces; /* mode is UFORCES (1) or FORCES (0) */ + float off_axis_pos; /* Where on the 3rd axis to render the image (Hercules only) */ }; void main() diff --git a/shaders/lpf.glsl b/shaders/lpf.glsl @@ -29,6 +29,7 @@ layout(std140, binding = 0) uniform parameters { float focal_depth; /* [m] */ float time_offset; /* pulse length correction time [s] */ uint uforces; /* mode is UFORCES (1) or FORCES (0) */ + float off_axis_pos; /* Where on the 3rd axis to render the image (Hercules only) */ }; void main() diff --git a/shaders/uforces.glsl b/shaders/uforces.glsl @@ -25,6 +25,7 @@ layout(std140, binding = 0) uniform parameters { float focal_depth; /* [m] */ float time_offset; /* pulse length correction time [s] */ uint uforces; /* mode is UFORCES (1) or FORCES (0) */ + float off_axis_pos; /* Where on the 3rd axis to render the image (Hercules only) */ }; layout(rg32f, location = 1) uniform image3D u_out_data_tex;