ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | LICENSE

Commit: d7c7ee7a6dc519aa077b51b333911e3d260db7a8
Parent: 239392a90e60209dd824539069615cde6da65e10
Author: Randy Palamar
Date:   Wed, 26 Feb 2025 12:35:32 -0700

core: allow floating point data to be sent to beamformer

closes: #17

Diffstat:
Mbeamformer.c | 28++++++++++++++++++++--------
Mbeamformer.h | 8+-------
Mbeamformer_parameters.h | 21+++++++++++++--------
Mhelpers/ogl_beamformer_lib.c | 52+++++++++++++++++++++++++++++++++++++++++-----------
Mhelpers/ogl_beamformer_lib.h | 9+++++----
Ashaders/decode.glsl | 84+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Dshaders/hadamard.glsl | 68--------------------------------------------------------------------
Mstatic.c | 5++---
8 files changed, 166 insertions(+), 109 deletions(-)

diff --git a/beamformer.c b/beamformer.c @@ -364,7 +364,8 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformFrame *frame, enum co u32 input_ssbo_idx = csctx->last_output_ssbo_index; switch (shader) { - case CS_HADAMARD: + case CS_DECODE: + case CS_DECODE_FLOAT: glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]); glBindImageTexture(0, csctx->hadamard_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I); @@ -508,15 +509,21 @@ reload_compute_shader(BeamformerCtx *ctx, s8 path, ComputeShaderReloadContext *c ComputeShaderCtx *cs = &ctx->csctx; /* NOTE: arena works as stack (since everything here is 1 byte aligned) */ - s8 header_in_arena = {.data = tmp.beg}; - if (csr->needs_header) - header_in_arena = push_s8(&tmp, s8(COMPUTE_SHADER_HEADER)); + s8 header = {.data = tmp.beg}; + if (csr->needs_header) { + header = push_s8(&tmp, s8(COMPUTE_SHADER_HEADER)); + if (csr->shader == CS_DECODE_FLOAT) { + s8 extra = push_s8(&tmp, s8(COMPUTE_FLOAT_DECODE_HEADER)); + ASSERT(extra.data == header.data + header.len); + header.len += extra.len; + } + } - s8 shader_text = ctx->platform.read_whole_file(&tmp, (c8 *)path.data); - shader_text.data -= header_in_arena.len; - shader_text.len += header_in_arena.len; + s8 shader_text = ctx->platform.read_whole_file(&tmp, (c8 *)path.data); + shader_text.data -= header.len; + shader_text.len += header.len; - if (shader_text.data == header_in_arena.data) { + if (shader_text.data == header.data) { u32 shader_id = compile_shader(&ctx->platform, tmp, GL_COMPUTE_SHADER, shader_text, path); if (shader_id) { u32 new_program = link_program(&ctx->platform, tmp, shader_id); @@ -561,6 +568,11 @@ DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute) case BW_RELOAD_SHADER: { ComputeShaderReloadContext *csr = work->reload_shader_ctx; reload_compute_shader(ctx, csr->path, csr, arena); + if (csr->shader == CS_DECODE) { + csr->shader = CS_DECODE_FLOAT; + reload_compute_shader(ctx, csr->path, csr, arena); + csr->shader = CS_DECODE; + } /* TODO(rnp): remove this */ #define X(idx, name) cs->name##_id = glGetUniformLocation(cs->programs[idx], "u_" #name); diff --git a/beamformer.h b/beamformer.h @@ -180,7 +180,7 @@ typedef struct { iptr last_displayed_frame; } BeamformerUI; -#define CS_UNIFORMS \ +#define CS_UNIFORMS \ X(CS_DAS, voxel_offset) \ X(CS_DAS, cycle_t) \ X(CS_MIN_MAX, mips_level) \ @@ -189,12 +189,6 @@ typedef struct { typedef struct { u32 programs[CS_LAST]; - /* NOTE: the raw_data_ssbo is allocated at 3x the required size to allow for tiled - * transfers when the GPU is running behind the CPU. It is not mapped on NVIDIA because - * their drivers _will_ store the buffer in the system memory. This doesn't happen - * for Intel or AMD and mapping the buffer is preferred. In either case incoming data can - * be written to the arena at the appropriate offset for the current raw_data_index. An - * additional BufferSubData is needed on NVIDIA to upload the data. */ /* NOTE: The raw data ssbo is not mapped on NVIDIA because their drivers _will_ store * the buffer in the system memory. This doesn't happen for other vendors and * mapping the buffer is preferred. In either case incoming data can be written to diff --git a/beamformer_parameters.h b/beamformer_parameters.h @@ -1,14 +1,15 @@ /* See LICENSE for license details. */ /* X(enumarant, number, shader file name, needs header, pretty name) */ -#define COMPUTE_SHADERS \ - X(CUDA_DECODE, 0, "", 0, "CUDA Decoding") \ - X(CUDA_HILBERT, 1, "", 0, "CUDA Hilbert") \ - X(DAS, 2, "das", 1, "DAS") \ - X(DEMOD, 3, "demod", 1, "Demodulation") \ - X(HADAMARD, 4, "hadamard", 1, "Decoding") \ - X(MIN_MAX, 5, "min_max", 0, "Min/Max") \ - X(SUM, 6, "sum", 0, "Sum") +#define COMPUTE_SHADERS \ + X(CUDA_DECODE, 0, "", 0, "CUDA Decoding") \ + X(CUDA_HILBERT, 1, "", 0, "CUDA Hilbert") \ + X(DAS, 2, "das", 1, "DAS") \ + X(DECODE, 3, "decode", 1, "Decoding") \ + X(DECODE_FLOAT, 4, "", 1, "Decoding (F32)") \ + X(DEMOD, 5, "demod", 1, "Demodulation") \ + X(MIN_MAX, 6, "min_max", 0, "Min/Max") \ + X(SUM, 7, "sum", 0, "Sum") enum compute_shaders { #define X(e, n, s, h, pn) CS_ ##e = n, @@ -120,6 +121,10 @@ layout(std140, binding = 0) uniform parameters {\n\ \n\ #line 1\n" +#define COMPUTE_FLOAT_DECODE_HEADER "\ +#define INPUT_DATA_TYPE_FLOAT\n\ +#line 1\n" + /* TODO(rnp): bake this into the das shader header */ #define DAS_LOCAL_SIZE_X 32 #define DAS_LOCAL_SIZE_Y 1 diff --git a/helpers/ogl_beamformer_lib.c b/helpers/ogl_beamformer_lib.c @@ -16,7 +16,6 @@ typedef struct { char *name; } Pipe; - typedef struct { size len; u8 *data; } s8; #define s8(s) (s8){.len = ARRAY_COUNT(s) - 1, .data = (u8 *)s} @@ -274,7 +273,7 @@ b32 set_beamformer_pipeline(char *shm_name, i32 *stages, i32 stages_count) { if (stages_count > ARRAY_COUNT(g_bp->compute_stages)) { - error_msg("maximum stage count is %u", ARRAY_COUNT(g_bp->compute_stages)); + error_msg("maximum stage count is %lu", ARRAY_COUNT(g_bp->compute_stages)); return 0; } @@ -287,7 +286,8 @@ set_beamformer_pipeline(char *shm_name, i32 *stages, i32 stages_count) case CS_CUDA_HILBERT: case CS_DAS: case CS_DEMOD: - case CS_HADAMARD: + case CS_DECODE: + case CS_DECODE_FLOAT: case CS_MIN_MAX: case CS_SUM: g_bp->compute_stages[i] = stages[i]; @@ -331,7 +331,7 @@ send_raw_data(char *pipe_name, char *shm_name, void *data, u32 data_size) written = os_write(g_pipe.file, data, data_size); result = written == data_size; if (!result) - warning_msg("failed again, wrote %ld/%ld\ngiving up", + warning_msg("failed again, wrote %ld/%u\ngiving up", written, data_size); } } @@ -367,12 +367,12 @@ set_beamformer_parameters(char *shm_name, BeamformerParameters *new_bp) return 1; } -void -beamform_data_synchronized(char *pipe_name, char *shm_name, i16 *data, uv2 data_dim, - uv4 output_points, f32 *out_data, i32 timeout_ms) +static b32 +beamform_data_synchronized(char *pipe_name, char *shm_name, void *data, uv2 data_dim, + u32 data_size, uv4 output_points, f32 *out_data, i32 timeout_ms) { if (!check_shared_memory(shm_name)) - return; + return 0; if (output_points.x == 0) output_points.x = 1; if (output_points.y == 0) output_points.y = 1; @@ -388,19 +388,19 @@ beamform_data_synchronized(char *pipe_name, char *shm_name, i16 *data, uv2 data_ s8 export_name = s8(OS_EXPORT_PIPE_NAME); if (export_name.len > ARRAY_COUNT(g_bp->export_pipe_name)) { error_msg("export pipe name too long"); - return; + return 0; } Pipe export_pipe = os_open_read_pipe(OS_EXPORT_PIPE_NAME); if (export_pipe.file == INVALID_FILE) { error_msg("failed to open export pipe"); - return; + return 0; } for (u32 i = 0; i < export_name.len; i++) g_bp->export_pipe_name[i] = export_name.data[i]; - b32 result = send_data(pipe_name, shm_name, data, data_dim); + b32 result = send_raw_data(pipe_name, shm_name, data, data_size); if (result) { size output_size = output_points.x * output_points.y * output_points.z * sizeof(f32) * 2; result = os_wait_read_pipe(export_pipe, out_data, output_size, timeout_ms); @@ -411,4 +411,34 @@ beamform_data_synchronized(char *pipe_name, char *shm_name, i16 *data, uv2 data_ os_disconnect_pipe(export_pipe); os_close_pipe(&export_pipe.file, export_pipe.name); os_close_pipe(&g_pipe.file, 0); + + return result; +} + +b32 +beamform_data_synchronized_i16(char *pipe_name, char *shm_name, i16 *data, uv2 data_dim, + uv4 output_points, f32 *out_data, i32 timeout_ms) +{ + b32 result = 0; + u64 data_size = data_dim.x * data_dim.y * sizeof(i16); + if (data_size <= U32_MAX) { + g_bp->raw.rf_raw_dim = data_dim; + result = beamform_data_synchronized(pipe_name, shm_name, data, data_dim, data_size, + output_points, out_data, timeout_ms); + } + return result; +} + +b32 +beamform_data_synchronized_f32(char *pipe_name, char *shm_name, f32 *data, uv2 data_dim, + uv4 output_points, f32 *out_data, i32 timeout_ms) +{ + b32 result = 0; + u64 data_size = data_dim.x * data_dim.y * sizeof(f32); + if (data_size <= U32_MAX) { + g_bp->raw.rf_raw_dim = data_dim; + result = beamform_data_synchronized(pipe_name, shm_name, data, data_dim, data_size, + output_points, out_data, timeout_ms); + } + return result; } diff --git a/helpers/ogl_beamformer_lib.h b/helpers/ogl_beamformer_lib.h @@ -35,7 +35,8 @@ LIB_FN b32 send_data(char *pipe_name, char *shm_name, i16 *data, uv2 data_dim); /* NOTE: sends data and waits for (complex) beamformed data to be returned. * out_data: must be allocated by the caller as 2 f32s per output point. */ -LIB_FN void beamform_data_synchronized(char *pipe_name, char *shm_name, - i16 *data, uv2 data_dim, - uv4 output_points, f32 *out_data, - i32 timeout_ms); +LIB_FN b32 beamform_data_synchronized_i16(char *pipe_name, char *shm_name, i16 *data, uv2 data_dim, + uv4 output_points, f32 *out_data, i32 timeout_ms); + +LIB_FN b32 beamform_data_synchronized_f32(char *pipe_name, char *shm_name, f32 *data, uv2 data_dim, + uv4 output_points, f32 *out_data, i32 timeout_ms); diff --git a/shaders/decode.glsl b/shaders/decode.glsl @@ -0,0 +1,84 @@ +/* See LICENSE for license details. */ +layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; + +#ifdef INPUT_DATA_TYPE_FLOAT +#define INPUT_DATA_TYPE float +#define RF_DATA_STEP_SCALE 1 +#else +#define INPUT_DATA_TYPE int +#define RF_DATA_STEP_SCALE 2 +#endif + +layout(std430, binding = 1) readonly restrict buffer buffer_1 { + INPUT_DATA_TYPE rf_data[]; +}; + +layout(std430, binding = 2) writeonly restrict buffer buffer_2 { + vec2 out_data[]; +}; + +layout(r8i, binding = 0) readonly restrict uniform iimage2D hadamard; + +INPUT_DATA_TYPE sample_rf_data(uint index, uint lfs) +{ + INPUT_DATA_TYPE result; + #ifdef INPUT_DATA_TYPE_FLOAT + result = rf_data[index]; + #else + result = (rf_data[index] << lfs) >> 16; + #endif + return result; +} + +void main() +{ + /* NOTE: each invocation takes a time sample and a receive channel. + * It first maps the column to the correct column in the rf data then + * does the dot product with the equivalent row of the hadamard matrix. + * The result is stored to the equivalent row, column index of the output. + */ + uint time_sample = gl_GlobalInvocationID.x; + uint channel = gl_GlobalInvocationID.y; + uint acq = gl_GlobalInvocationID.z; + + /* NOTE: offsets for storing the results in the output data */ + uint out_off = dec_data_dim.x * dec_data_dim.y * acq + dec_data_dim.x * channel + time_sample; + + /* NOTE: channel mapping is stored as u16s so we must do this to extract the final value */ + uint ch_array_idx = (channel / 8); + uint ch_vec_idx = (channel % 8) / 2; + uint ch_elem_lfs = ((~channel) & 1u) * 16; + uint rf_channel = (channel_mapping[ch_array_idx][ch_vec_idx] << ch_elem_lfs) >> 16; + + /* NOTE: stride is the number of samples between acquistions; off is the + * index of the first acquisition for this channel and time sample */ + uint rf_stride = dec_data_dim.x; + uint rf_off = rf_raw_dim.x * rf_channel + time_sample; + + /* NOTE: rf_data index and stride considering the data is i16 not i32 */ + uint ridx = rf_off / RF_DATA_STEP_SCALE; + uint ridx_delta = rf_stride / RF_DATA_STEP_SCALE; + + /* NOTE: rf_data is i16 so each access grabs two time samples at time. + * We need to shift arithmetically (maintaining the sign) to get the + * desired element. If the time sample is even we take the upper half + * and if its odd we take the lower half. */ + uint lfs = ((~time_sample) & 1u) * 16; + + /* NOTE: Compute N-D dot product */ + float result = 0; + switch (decode) { + case DECODE_MODE_NONE: { + result = sample_rf_data(ridx + ridx_delta * acq, lfs); + } break; + case DECODE_MODE_HADAMARD: { + INPUT_DATA_TYPE sum = 0; + for (int i = 0; i < dec_data_dim.z; i++) { + sum += imageLoad(hadamard, ivec2(i, acq)).x * sample_rf_data(ridx, lfs); + ridx += ridx_delta; + } + result = float(sum) / float(dec_data_dim.z); + } break; + } + out_data[out_off] = vec2(result, 0); +} diff --git a/shaders/hadamard.glsl b/shaders/hadamard.glsl @@ -1,68 +0,0 @@ -/* See LICENSE for license details. */ -layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; - -layout(std430, binding = 1) readonly restrict buffer buffer_1 { - int rf_data[]; -}; - -layout(std430, binding = 2) writeonly restrict buffer buffer_2 { - vec2 out_data[]; -}; - -layout(r8i, binding = 0) readonly restrict uniform iimage2D hadamard; - -void main() -{ - /* NOTE: each invocation takes a time sample and a receive channel. - * It first maps the column to the correct column in the rf data then - * does the dot product with the equivalent row of the hadamard matrix. - * The result is stored to the equivalent row, column index of the output. - */ - uint time_sample = gl_GlobalInvocationID.x; - uint channel = gl_GlobalInvocationID.y; - uint acq = gl_GlobalInvocationID.z; - - /* NOTE: offsets for storing the results in the output data */ - uint out_off = dec_data_dim.x * dec_data_dim.y * acq + dec_data_dim.x * channel + time_sample; - - /* NOTE: channel mapping is stored as u16s so we must do this to extract the final value */ - uint ch_array_idx = (channel / 8); - uint ch_vec_idx = (channel % 8) / 2; - uint ch_elem_lfs = ((~channel) & 1u) * 16; - uint rf_channel = (channel_mapping[ch_array_idx][ch_vec_idx] << ch_elem_lfs) >> 16; - - /* NOTE: stride is the number of samples between acquistions; off is the - * index of the first acquisition for this channel and time sample */ - uint rf_stride = dec_data_dim.x; - uint rf_off = rf_raw_dim.x * rf_channel + time_sample; - - /* NOTE: rf_data index and stride considering the data is i16 not i32 */ - uint ridx = rf_off / 2; - uint ridx_delta = rf_stride / 2; - - /* NOTE: rf_data is i16 so each access grabs two time samples at time. - * We need to shift arithmetically (maintaining the sign) to get the - * desired element. If the time sample is even we take the upper half - * and if its odd we take the lower half. */ - uint lfs = ((~time_sample) & 1u) * 16; - - /* NOTE: Compute N-D dot product */ - int sum = 0; - switch (decode) { - case DECODE_MODE_NONE: { - ridx += ridx_delta * acq; - sum = (rf_data[ridx] << lfs) >> 16; - } break; - case DECODE_MODE_HADAMARD: { - /* NOTE: offset to get the correct column in hadamard matrix */ - uint hoff = dec_data_dim.z * acq; - - for (int i = 0; i < dec_data_dim.z; i++) { - int data = (rf_data[ridx] << lfs) >> 16; - sum += imageLoad(hadamard, ivec2(i, acq)).x * data; - ridx += ridx_delta; - } - } break; - } - out_data[out_off] = vec2(float(sum), 0); -} diff --git a/static.c b/static.c @@ -290,10 +290,9 @@ setup_beamformer(BeamformerCtx *ctx, Arena *memory) ASSERT(ctx->params); /* NOTE: default compute shader pipeline */ - ctx->params->compute_stages[0] = CS_HADAMARD; + ctx->params->compute_stages[0] = CS_DECODE; ctx->params->compute_stages[1] = CS_DAS; - ctx->params->compute_stages[2] = CS_MIN_MAX; - ctx->params->compute_stages_count = 3; + ctx->params->compute_stages_count = 2; if (ctx->gl.vendor_id == GL_VENDOR_NVIDIA && load_cuda_lib(&ctx->platform, s8(OS_CUDA_LIB_NAME), (iptr)&ctx->cuda_lib, *memory))