core/lib: upload channel mapping as a 1D i16 texture - ogl_beamforming - Ultrasound Beamforming Implemented with OpenGL

Commit: c075fc32ff89c97466394030b1075a7156a0de88
Parent: 8e2d399de3ad4d383c731cb6575a2c993a9cdb9e
Author: Randy Palamar
Date:   Sun,  6 Apr 2025 12:26:52 -0600

core/lib: upload channel mapping as a 1D i16 texture

Diffstat:
M beamformer.c  | 20 +++++++++++++++++---
M beamformer.h  | 4 +++-
M beamformer_parameters.h  | 25 +++++++++++++++++++++++--
M beamformer_work_queue.h  | 7 +++++--
M helpers/ogl_beamformer_lib.c  | 40 ++++++++++++++++++++++++++++++++++------
M helpers/ogl_beamformer_lib.h  | 9 +++++++--
M shaders/decode.glsl  | 25 +++++++++++--------------
M static.c  | 4 +++-

8 files changed, 103 insertions(+), 31 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -160,7 +160,7 @@ alloc_shader_storage(BeamformerCtx *ctx, Arena a)
 		ctx->cuda_lib.register_cuda_buffers(cs->rf_data_ssbos, ARRAY_COUNT(cs->rf_data_ssbos),
 		                                    cs->raw_data_ssbo);
 		ctx->cuda_lib.init_cuda_configuration(bp->rf_raw_dim.E, bp->dec_data_dim.E,
-		                                      bp->channel_mapping);
+		                                      ctx->shared_memory->channel_mapping);
 		break;
 	}
 
@@ -308,6 +308,7 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, 
 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo);
 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
 		glBindImageTexture(0, csctx->hadamard_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I);
+		glBindImageTexture(1, csctx->channel_mapping_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I);
 		glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32),
 		                  ORONE(csctx->dec_data_dim.y / 32),
 		                  ORONE(csctx->dec_data_dim.z));
@@ -548,8 +549,9 @@ reload_compute_shader(BeamformerCtx *ctx, s8 path, s8 extra, ComputeShaderReload
 static void
 complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_context, iz barrier_offset)
 {
-	ComputeShaderCtx *cs     = &ctx->csctx;
-	BeamformerParameters *bp = &ctx->shared_memory->raw;
+	ComputeShaderCtx       *cs = &ctx->csctx;
+	BeamformerParameters   *bp = &ctx->shared_memory->raw;
+	BeamformerSharedMemory *sm = ctx->shared_memory;
 
 	BeamformWork *work = beamform_work_queue_pop(q);
 	while (work) {
@@ -579,6 +581,18 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co
 				#undef X
 			}
 		} break;
+		case BW_UPLOAD_CHANNEL_MAPPING: {
+			ASSERT(!atomic_load(&ctx->shared_memory->channel_mapping_sync));
+			if (!cs->channel_mapping_texture) {
+				glCreateTextures(GL_TEXTURE_1D, 1, &cs->channel_mapping_texture);
+				glTextureStorage1D(cs->channel_mapping_texture, 1, GL_R16I,
+				                   ARRAY_COUNT(sm->channel_mapping));
+				LABEL_GL_OBJECT(GL_TEXTURE, cs->channel_mapping_texture, s8("Channel_Mapping"));
+			}
+			glTextureSubImage1D(cs->channel_mapping_texture, 0, 0,
+			                    ARRAY_COUNT(sm->channel_mapping), GL_RED_INTEGER,
+			                    GL_SHORT, sm->channel_mapping);
+		} break;
 		case BW_UPLOAD_RF_DATA: {
 			ASSERT(!atomic_load(&ctx->shared_memory->raw_data_sync));
 
diff --git a/beamformer.h b/beamformer.h
@@ -23,7 +23,7 @@ typedef struct {
 	b32  executable_reloaded;
 } BeamformerInput;
 
-#define INIT_CUDA_CONFIGURATION_FN(name) void name(u32 *input_dims, u32 *decoded_dims, u16 *channel_mapping)
+#define INIT_CUDA_CONFIGURATION_FN(name) void name(u32 *input_dims, u32 *decoded_dims, i16 *channel_mapping)
 typedef INIT_CUDA_CONFIGURATION_FN(init_cuda_configuration_fn);
 INIT_CUDA_CONFIGURATION_FN(init_cuda_configuration_stub) {}
 
@@ -86,6 +86,8 @@ typedef struct {
 
 	u32 shared_ubo;
 
+	u32 channel_mapping_texture;
+
 	f32 processing_progress;
 	b32 processing_compute;
 
diff --git a/beamformer_parameters.h b/beamformer_parameters.h
@@ -54,7 +54,7 @@ typedef enum {
 	X(f_number,              f32, , float, , "/* F# (set to 0 to disable) */")                                     \
 	X(interpolate,           b32, , bool,  , "/* Perform Cubic Interpolation of RF Samples */")
 
-#define BEAMFORMER_PARAMS_HEAD \
+#define BEAMFORMER_PARAMS_HEAD_V0 \
 	X(channel_mapping,   u16, [256], uvec4, [32], "/* Transducer Channel to Verasonics Channel */")                           \
 	X(uforces_channels,  u16, [256], uvec4, [32], "/* Channels used for virtual UFORCES elements */")                         \
 	X(focal_depths,      f32, [256], vec4,  [64], "/* [m] Focal Depths for each transmit of a RCA imaging scheme*/")          \
@@ -68,12 +68,33 @@ typedef enum {
 	X(das_shader_id,     u32,      , uint,      , "")                                                                         \
 	X(time_offset,       f32,      , float,     , "/* pulse length correction time [s] */")
 
+#define BEAMFORMER_PARAMS_HEAD \
+	X(uforces_channels,  u16, [256], uvec4, [32], "/* Channels used for virtual UFORCES elements */")                         \
+	X(focal_depths,      f32, [256], vec4,  [64], "/* [m] Focal Depths for each transmit of a RCA imaging scheme*/")          \
+	X(transmit_angles,   f32, [256], vec4,  [64], "/* [radians] Transmit Angles for each transmit of a RCA imaging scheme*/") \
+	X(xdc_transform,     f32, [16] , mat4,      , "/* IMPORTANT: column major order */")                                      \
+	X(dec_data_dim,      uv4,      , uvec4,     , "/* Samples * Channels * Acquisitions; last element ignored */")            \
+	X(xdc_element_pitch, f32, [2]  , vec2,      , "/* [m] Transducer Element Pitch {row, col} */")                            \
+	X(rf_raw_dim,        uv2,      , uvec2,     , "/* Raw Data Dimensions */")                                                \
+	X(transmit_mode,     i32,      , int,       , "/* Method/Orientation of Transmit */")                                     \
+	X(decode,            u32,      , uint,      , "/* Decode or just reshape data */")                                        \
+	X(das_shader_id,     u32,      , uint,      , "")                                                                         \
+	X(time_offset,       f32,      , float,     , "/* pulse length correction time [s] */")
+
 #define BEAMFORMER_PARAMS_TAIL \
 	X(readi_group_id,   u32, , uint, , "/* Which readi group this data is from */") \
 	X(readi_group_size, u32, , uint, , "/* Size of readi transmit group */")
 
 #define X(name, type, size, gltype, glsize, comment) type name size;
-typedef struct { BEAMFORMER_UI_PARAMS } BeamformerUIParameters;
+typedef struct { BEAMFORMER_UI_PARAMS }      BeamformerUIParameters;
+typedef struct { BEAMFORMER_PARAMS_HEAD_V0 } BeamformerFixedParametersV0;
+
+typedef struct {
+	BEAMFORMER_PARAMS_HEAD_V0
+	BEAMFORMER_UI_PARAMS
+	BEAMFORMER_PARAMS_TAIL
+	f32 _pad[3];
+} BeamformerParametersV0;
 
 /* NOTE: This struct follows the OpenGL std140 layout. DO NOT modify unless you have
  * read and understood the rules, particulary with regards to _member alignment_ */
diff --git a/beamformer_work_queue.h b/beamformer_work_queue.h
@@ -73,8 +73,11 @@ typedef struct {
 	/* TODO(rnp): probably remove this */
 	c8  export_pipe_name[256];
 
-	u16 channel_mapping[256];
-	u16 sparse_elements[256];
+	i32 channel_mapping_sync;
+	i32 sparse_elements_sync;
+
+	i16 channel_mapping[256];
+	i16 sparse_elements[256];
 	v2  transmit_angles_focal_depths[256];
 
 	BeamformWorkQueue external_work_queue;
diff --git a/helpers/ogl_beamformer_lib.c b/helpers/ogl_beamformer_lib.c
@@ -253,15 +253,43 @@ set_beamformer_pipeline(char *shm_name, i32 *stages, i32 stages_count)
 }
 
 b32
-set_beamformer_parameters(char *shm_name, BeamformerParameters *new_bp)
+beamformer_push_channel_mapping(char *shm_name, i16 *mapping, u32 count, i32 timeout_ms)
 {
-	if (!check_shared_memory(shm_name))
-		return 0;
+	b32 result = check_shared_memory(shm_name) && count <= ARRAY_COUNT(g_bp->channel_mapping);
+	if (result) {
+		BeamformWork *work = beamform_work_queue_push(&g_bp->external_work_queue);
+		if (work) {
+			/* TODO(rnp): refactor */
+			for (;;) {
+				i32 current = atomic_load(&g_bp->channel_mapping_sync);
+				if (current) {
+					atomic_inc(&g_bp->channel_mapping_sync, -current);
+					break;
+				}
+				os_wait_on_value(&g_bp->channel_mapping_sync, 0, timeout_ms);
+			}
+			work->type = BW_UPLOAD_CHANNEL_MAPPING;
+			work->completion_barrier = offsetof(BeamformerSharedMemory, channel_mapping_sync);
+			mem_copy(g_bp->channel_mapping, mapping, count * sizeof(*mapping));
 
-	mem_copy(&g_bp->raw, new_bp, sizeof(BeamformerParameters));
-	g_bp->upload = 1;
+			beamform_work_queue_push_commit(&g_bp->external_work_queue);
+		}
+	}
+	return result;
+}
 
-	return 1;
+b32
+set_beamformer_parameters(char *shm_name, BeamformerParametersV0 *new_bp)
+{
+	b32 result = 0;
+	result |= beamformer_push_channel_mapping(shm_name, (i16 *)new_bp->channel_mapping,
+	                                          ARRAY_COUNT(new_bp->channel_mapping), 0);
+	if (result) {
+		mem_copy(&g_bp->raw, &new_bp->uforces_channels, sizeof(BeamformerParameters));
+		g_bp->upload = 1;
+	}
+
+	return result;
 }
 
 b32
diff --git a/helpers/ogl_beamformer_lib.h b/helpers/ogl_beamformer_lib.h
@@ -20,12 +20,17 @@ typedef struct { f32 x, y, z, w; } v4;
 #define LIB_FN
 #endif
 
-LIB_FN b32 set_beamformer_parameters(char *shm_name, BeamformerParameters *);
-LIB_FN b32 set_beamformer_pipeline(char *shm_name, i32 *stages, i32 stages_count);
+/* IMPORTANT: timeout of -1 will block forever */
 
+LIB_FN b32 set_beamformer_parameters(char *shm_name, BeamformerParametersV0 *);
+LIB_FN b32 set_beamformer_pipeline(char *shm_name, i32 *stages, i32 stages_count);
 LIB_FN b32 send_data(char *pipe_name, char *shm_name, void *data, u32 data_size);
 
 /* NOTE: sends data and waits for (complex) beamformed data to be returned.
  * out_data: must be allocated by the caller as 2 f32s per output point. */
 LIB_FN b32 beamform_data_synchronized(char *pipe_name, char *shm_name, void *data, u32 data_size,
                                       uv4 output_points, f32 *out_data, i32 timeout_ms);
+
+/* NOTE: these functions only queue an upload; you must flush (for now via one of the data functions) */
+
+LIB_FN b32 beamformer_push_channel_mapping(char *shm_name, i16 *mapping, u32 count, i32 timeout_ms);
diff --git a/shaders/decode.glsl b/shaders/decode.glsl
@@ -23,9 +23,10 @@ layout(std430, binding = 2) writeonly restrict buffer buffer_2 {
 	vec2 out_data[];
 };
 
-layout(r8i, binding = 0) readonly restrict uniform iimage2D hadamard;
+layout(r8i,  binding = 0) readonly restrict uniform iimage2D hadamard;
+layout(r16i, binding = 1) readonly restrict uniform iimage1D channel_mapping;
 
-INPUT_DATA_TYPE sample_rf_data(uint index, uint lfs)
+INPUT_DATA_TYPE sample_rf_data(int index, uint lfs)
 {
 	INPUT_DATA_TYPE result;
 #if   defined(INPUT_DATA_TYPE_FLOAT)
@@ -45,27 +46,23 @@ void main()
 	 * does the dot product with the equivalent row of the hadamard matrix.
 	 * The result is stored to the equivalent row, column index of the output.
 	 */
-	uint time_sample = gl_GlobalInvocationID.x;
-	uint channel     = gl_GlobalInvocationID.y;
-	uint acq         = gl_GlobalInvocationID.z;
+	int time_sample = int(gl_GlobalInvocationID.x);
+	int channel     = int(gl_GlobalInvocationID.y);
+	int acq         = int(gl_GlobalInvocationID.z);
 
 	/* NOTE: offsets for storing the results in the output data */
 	uint out_off = dec_data_dim.x * dec_data_dim.y * acq + dec_data_dim.x * channel + time_sample;
 
-	/* NOTE: channel mapping is stored as u16s so we must do this to extract the final value */
-	uint ch_array_idx = (channel / 8);
-	uint ch_vec_idx   = (channel % 8) / 2;
-	uint ch_elem_lfs  = ((~channel) & 1u) * 16;
-	uint rf_channel   = (channel_mapping[ch_array_idx][ch_vec_idx] << ch_elem_lfs) >> 16;
+	int rf_channel = imageLoad(channel_mapping, channel).x;
 
 	/* NOTE: stride is the number of samples between acquistions; off is the
 	 * index of the first acquisition for this channel and time sample  */
-	uint rf_stride = dec_data_dim.x;
-	uint rf_off    = rf_raw_dim.x * rf_channel + time_sample;
+	int rf_stride = int(dec_data_dim.x);
+	int rf_off    = int(rf_raw_dim.x) * rf_channel + time_sample;
 
 	/* NOTE: rf_data index and stride considering the data is i16 not i32 */
-	uint ridx       = rf_off    / RF_SAMPLES_PER_INDEX;
-	uint ridx_delta = rf_stride / RF_SAMPLES_PER_INDEX;
+	int ridx       = rf_off    / RF_SAMPLES_PER_INDEX;
+	int ridx_delta = rf_stride / RF_SAMPLES_PER_INDEX;
 
 	/* NOTE: rf_data is i16 so each access grabs two time samples at time.
 	 * We need to shift arithmetically (maintaining the sign) to get the
diff --git a/static.c b/static.c
@@ -305,7 +305,9 @@ setup_beamformer(BeamformerCtx *ctx, Arena *memory)
 	ctx->shared_memory = os_open_shared_memory_area(OS_SMEM_NAME, BEAMFORMER_SHARED_MEMORY_SIZE);
 	if (!ctx->shared_memory)
 		os_fatal(s8("Get more ram lol\n"));
-	ctx->shared_memory->raw_data_sync = 1;
+	/* TODO(rnp): refactor - this is annoying */
+	ctx->shared_memory->raw_data_sync        = 1;
+	ctx->shared_memory->channel_mapping_sync = 1;
 
 	/* NOTE: default compute shader pipeline */
 	ctx->shared_memory->compute_stages[0]    = CS_DECODE;

M beamformer.c  | 20 +++++++++++++++++---
M beamformer.h  | 4 +++-
M beamformer_parameters.h  | 25 +++++++++++++++++++++++--
M beamformer_work_queue.h  | 7 +++++--
M helpers/ogl_beamformer_lib.c  | 40 ++++++++++++++++++++++++++++++++++------
M helpers/ogl_beamformer_lib.h  | 9 +++++++--
M shaders/decode.glsl  | 25 +++++++++++--------------
M static.c  | 4 +++-