ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

Commit: c075fc32ff89c97466394030b1075a7156a0de88
Parent: 8e2d399de3ad4d383c731cb6575a2c993a9cdb9e
Author: Randy Palamar
Date:   Sun,  6 Apr 2025 12:26:52 -0600

core/lib: upload channel mapping as a 1D i16 texture

Diffstat:
M beamformer.c | 20 +++++++++++++++++---
M beamformer.h | 4 +++-
M beamformer_parameters.h | 25 +++++++++++++++++++++++--
M beamformer_work_queue.h | 7 +++++--
M helpers/ogl_beamformer_lib.c | 40 ++++++++++++++++++++++++++++++++++------
M helpers/ogl_beamformer_lib.h | 9 +++++++--
M shaders/decode.glsl | 25 +++++++++++--------------
M static.c | 4 +++-
8 files changed, 103 insertions(+), 31 deletions(-)

diff --git a/beamformer.c b/beamformer.c @@ -160,7 +160,7 @@ alloc_shader_storage(BeamformerCtx *ctx, Arena a) ctx->cuda_lib.register_cuda_buffers(cs->rf_data_ssbos, ARRAY_COUNT(cs->rf_data_ssbos), cs->raw_data_ssbo); ctx->cuda_lib.init_cuda_configuration(bp->rf_raw_dim.E, bp->dec_data_dim.E, - bp->channel_mapping); + ctx->shared_memory->channel_mapping); break; } @@ -308,6 +308,7 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]); glBindImageTexture(0, csctx->hadamard_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I); + glBindImageTexture(1, csctx->channel_mapping_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I); glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32), ORONE(csctx->dec_data_dim.y / 32), ORONE(csctx->dec_data_dim.z)); @@ -548,8 +549,9 @@ reload_compute_shader(BeamformerCtx *ctx, s8 path, s8 extra, ComputeShaderReload static void complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_context, iz barrier_offset) { - ComputeShaderCtx *cs = &ctx->csctx; - BeamformerParameters *bp = &ctx->shared_memory->raw; + ComputeShaderCtx *cs = &ctx->csctx; + BeamformerParameters *bp = &ctx->shared_memory->raw; + BeamformerSharedMemory *sm = ctx->shared_memory; BeamformWork *work = beamform_work_queue_pop(q); while (work) { @@ -579,6 +581,18 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co #undef X } } break; + case BW_UPLOAD_CHANNEL_MAPPING: { + ASSERT(!atomic_load(&ctx->shared_memory->channel_mapping_sync)); + if (!cs->channel_mapping_texture) { + glCreateTextures(GL_TEXTURE_1D, 1, &cs->channel_mapping_texture); + glTextureStorage1D(cs->channel_mapping_texture, 1, GL_R16I, + ARRAY_COUNT(sm->channel_mapping)); + LABEL_GL_OBJECT(GL_TEXTURE, cs->channel_mapping_texture, s8("Channel_Mapping")); + } + 
glTextureSubImage1D(cs->channel_mapping_texture, 0, 0, + ARRAY_COUNT(sm->channel_mapping), GL_RED_INTEGER, + GL_SHORT, sm->channel_mapping); + } break; case BW_UPLOAD_RF_DATA: { ASSERT(!atomic_load(&ctx->shared_memory->raw_data_sync)); diff --git a/beamformer.h b/beamformer.h @@ -23,7 +23,7 @@ typedef struct { b32 executable_reloaded; } BeamformerInput; -#define INIT_CUDA_CONFIGURATION_FN(name) void name(u32 *input_dims, u32 *decoded_dims, u16 *channel_mapping) +#define INIT_CUDA_CONFIGURATION_FN(name) void name(u32 *input_dims, u32 *decoded_dims, i16 *channel_mapping) typedef INIT_CUDA_CONFIGURATION_FN(init_cuda_configuration_fn); INIT_CUDA_CONFIGURATION_FN(init_cuda_configuration_stub) {} @@ -86,6 +86,8 @@ typedef struct { u32 shared_ubo; + u32 channel_mapping_texture; + f32 processing_progress; b32 processing_compute; diff --git a/beamformer_parameters.h b/beamformer_parameters.h @@ -54,7 +54,7 @@ typedef enum { X(f_number, f32, , float, , "/* F# (set to 0 to disable) */") \ X(interpolate, b32, , bool, , "/* Perform Cubic Interpolation of RF Samples */") -#define BEAMFORMER_PARAMS_HEAD \ +#define BEAMFORMER_PARAMS_HEAD_V0 \ X(channel_mapping, u16, [256], uvec4, [32], "/* Transducer Channel to Verasonics Channel */") \ X(uforces_channels, u16, [256], uvec4, [32], "/* Channels used for virtual UFORCES elements */") \ X(focal_depths, f32, [256], vec4, [64], "/* [m] Focal Depths for each transmit of a RCA imaging scheme*/") \ @@ -68,12 +68,33 @@ typedef enum { X(das_shader_id, u32, , uint, , "") \ X(time_offset, f32, , float, , "/* pulse length correction time [s] */") +#define BEAMFORMER_PARAMS_HEAD \ + X(uforces_channels, u16, [256], uvec4, [32], "/* Channels used for virtual UFORCES elements */") \ + X(focal_depths, f32, [256], vec4, [64], "/* [m] Focal Depths for each transmit of a RCA imaging scheme*/") \ + X(transmit_angles, f32, [256], vec4, [64], "/* [radians] Transmit Angles for each transmit of a RCA imaging scheme*/") \ + X(xdc_transform, f32, [16] , 
mat4, , "/* IMPORTANT: column major order */") \ + X(dec_data_dim, uv4, , uvec4, , "/* Samples * Channels * Acquisitions; last element ignored */") \ + X(xdc_element_pitch, f32, [2] , vec2, , "/* [m] Transducer Element Pitch {row, col} */") \ + X(rf_raw_dim, uv2, , uvec2, , "/* Raw Data Dimensions */") \ + X(transmit_mode, i32, , int, , "/* Method/Orientation of Transmit */") \ + X(decode, u32, , uint, , "/* Decode or just reshape data */") \ + X(das_shader_id, u32, , uint, , "") \ + X(time_offset, f32, , float, , "/* pulse length correction time [s] */") + #define BEAMFORMER_PARAMS_TAIL \ X(readi_group_id, u32, , uint, , "/* Which readi group this data is from */") \ X(readi_group_size, u32, , uint, , "/* Size of readi transmit group */") #define X(name, type, size, gltype, glsize, comment) type name size; -typedef struct { BEAMFORMER_UI_PARAMS } BeamformerUIParameters; +typedef struct { BEAMFORMER_UI_PARAMS } BeamformerUIParameters; +typedef struct { BEAMFORMER_PARAMS_HEAD_V0 } BeamformerFixedParametersV0; + +typedef struct { + BEAMFORMER_PARAMS_HEAD_V0 + BEAMFORMER_UI_PARAMS + BEAMFORMER_PARAMS_TAIL + f32 _pad[3]; +} BeamformerParametersV0; /* NOTE: This struct follows the OpenGL std140 layout. 
DO NOT modify unless you have * read and understood the rules, particulary with regards to _member alignment_ */ diff --git a/beamformer_work_queue.h b/beamformer_work_queue.h @@ -73,8 +73,11 @@ typedef struct { /* TODO(rnp): probably remove this */ c8 export_pipe_name[256]; - u16 channel_mapping[256]; - u16 sparse_elements[256]; + i32 channel_mapping_sync; + i32 sparse_elements_sync; + + i16 channel_mapping[256]; + i16 sparse_elements[256]; v2 transmit_angles_focal_depths[256]; BeamformWorkQueue external_work_queue; diff --git a/helpers/ogl_beamformer_lib.c b/helpers/ogl_beamformer_lib.c @@ -253,15 +253,43 @@ set_beamformer_pipeline(char *shm_name, i32 *stages, i32 stages_count) } b32 -set_beamformer_parameters(char *shm_name, BeamformerParameters *new_bp) +beamformer_push_channel_mapping(char *shm_name, i16 *mapping, u32 count, i32 timeout_ms) { - if (!check_shared_memory(shm_name)) - return 0; + b32 result = check_shared_memory(shm_name) && count <= ARRAY_COUNT(g_bp->channel_mapping); + if (result) { + BeamformWork *work = beamform_work_queue_push(&g_bp->external_work_queue); + if (work) { + /* TODO(rnp): refactor */ + for (;;) { + i32 current = atomic_load(&g_bp->channel_mapping_sync); + if (current) { + atomic_inc(&g_bp->channel_mapping_sync, -current); + break; + } + os_wait_on_value(&g_bp->channel_mapping_sync, 0, timeout_ms); + } + work->type = BW_UPLOAD_CHANNEL_MAPPING; + work->completion_barrier = offsetof(BeamformerSharedMemory, channel_mapping_sync); + mem_copy(g_bp->channel_mapping, mapping, count * sizeof(*mapping)); - mem_copy(&g_bp->raw, new_bp, sizeof(BeamformerParameters)); - g_bp->upload = 1; + beamform_work_queue_push_commit(&g_bp->external_work_queue); + } + } + return result; +} - return 1; +b32 +set_beamformer_parameters(char *shm_name, BeamformerParametersV0 *new_bp) +{ + b32 result = 0; + result |= beamformer_push_channel_mapping(shm_name, (i16 *)new_bp->channel_mapping, + ARRAY_COUNT(new_bp->channel_mapping), 0); + if (result) { + 
mem_copy(&g_bp->raw, &new_bp->uforces_channels, sizeof(BeamformerParameters)); + g_bp->upload = 1; + } + + return result; } b32 diff --git a/helpers/ogl_beamformer_lib.h b/helpers/ogl_beamformer_lib.h @@ -20,12 +20,17 @@ typedef struct { f32 x, y, z, w; } v4; #define LIB_FN #endif -LIB_FN b32 set_beamformer_parameters(char *shm_name, BeamformerParameters *); -LIB_FN b32 set_beamformer_pipeline(char *shm_name, i32 *stages, i32 stages_count); +/* IMPORTANT: timeout of -1 will block forever */ +LIB_FN b32 set_beamformer_parameters(char *shm_name, BeamformerParametersV0 *); +LIB_FN b32 set_beamformer_pipeline(char *shm_name, i32 *stages, i32 stages_count); LIB_FN b32 send_data(char *pipe_name, char *shm_name, void *data, u32 data_size); /* NOTE: sends data and waits for (complex) beamformed data to be returned. * out_data: must be allocated by the caller as 2 f32s per output point. */ LIB_FN b32 beamform_data_synchronized(char *pipe_name, char *shm_name, void *data, u32 data_size, uv4 output_points, f32 *out_data, i32 timeout_ms); + +/* NOTE: these functions only queue an upload; you must flush (for now via one of the data functions) */ + +LIB_FN b32 beamformer_push_channel_mapping(char *shm_name, i16 *mapping, u32 count, i32 timeout_ms); diff --git a/shaders/decode.glsl b/shaders/decode.glsl @@ -23,9 +23,10 @@ layout(std430, binding = 2) writeonly restrict buffer buffer_2 { vec2 out_data[]; }; -layout(r8i, binding = 0) readonly restrict uniform iimage2D hadamard; +layout(r8i, binding = 0) readonly restrict uniform iimage2D hadamard; +layout(r16i, binding = 1) readonly restrict uniform iimage1D channel_mapping; -INPUT_DATA_TYPE sample_rf_data(uint index, uint lfs) +INPUT_DATA_TYPE sample_rf_data(int index, uint lfs) { INPUT_DATA_TYPE result; #if defined(INPUT_DATA_TYPE_FLOAT) @@ -45,27 +46,23 @@ void main() * does the dot product with the equivalent row of the hadamard matrix. * The result is stored to the equivalent row, column index of the output. 
*/ - uint time_sample = gl_GlobalInvocationID.x; - uint channel = gl_GlobalInvocationID.y; - uint acq = gl_GlobalInvocationID.z; + int time_sample = int(gl_GlobalInvocationID.x); + int channel = int(gl_GlobalInvocationID.y); + int acq = int(gl_GlobalInvocationID.z); /* NOTE: offsets for storing the results in the output data */ uint out_off = dec_data_dim.x * dec_data_dim.y * acq + dec_data_dim.x * channel + time_sample; - /* NOTE: channel mapping is stored as u16s so we must do this to extract the final value */ - uint ch_array_idx = (channel / 8); - uint ch_vec_idx = (channel % 8) / 2; - uint ch_elem_lfs = ((~channel) & 1u) * 16; - uint rf_channel = (channel_mapping[ch_array_idx][ch_vec_idx] << ch_elem_lfs) >> 16; + int rf_channel = imageLoad(channel_mapping, channel).x; /* NOTE: stride is the number of samples between acquistions; off is the * index of the first acquisition for this channel and time sample */ - uint rf_stride = dec_data_dim.x; - uint rf_off = rf_raw_dim.x * rf_channel + time_sample; + int rf_stride = int(dec_data_dim.x); + int rf_off = int(rf_raw_dim.x) * rf_channel + time_sample; /* NOTE: rf_data index and stride considering the data is i16 not i32 */ - uint ridx = rf_off / RF_SAMPLES_PER_INDEX; - uint ridx_delta = rf_stride / RF_SAMPLES_PER_INDEX; + int ridx = rf_off / RF_SAMPLES_PER_INDEX; + int ridx_delta = rf_stride / RF_SAMPLES_PER_INDEX; /* NOTE: rf_data is i16 so each access grabs two time samples at time. 
* We need to shift arithmetically (maintaining the sign) to get the diff --git a/static.c b/static.c @@ -305,7 +305,9 @@ setup_beamformer(BeamformerCtx *ctx, Arena *memory) ctx->shared_memory = os_open_shared_memory_area(OS_SMEM_NAME, BEAMFORMER_SHARED_MEMORY_SIZE); if (!ctx->shared_memory) os_fatal(s8("Get more ram lol\n")); - ctx->shared_memory->raw_data_sync = 1; + /* TODO(rnp): refactor - this is annoying */ + ctx->shared_memory->raw_data_sync = 1; + ctx->shared_memory->channel_mapping_sync = 1; /* NOTE: default compute shader pipeline */ ctx->shared_memory->compute_stages[0] = CS_DECODE;