core/lib: upload sparse elements as a 1D i16 texture - ogl_beamforming - Ultrasound Beamforming Implemented with OpenGL

Commit: 7e351ca34f89675e39ee52a4d516a87c1bb17e15
Parent: c075fc32ff89c97466394030b1075a7156a0de88
Author: Randy Palamar
Date:   Sun,  6 Apr 2025 12:57:38 -0600

core/lib: upload sparse elements as a 1D i16 texture

no more bit manipulation/integer unpacking

Diffstat:
M beamformer.c  | 13 +++++++++++++
M beamformer.h  | 1 +
M beamformer_parameters.h  | 1 -
M helpers/ogl_beamformer_lib.c  | 50 +++++++++++++++++++++++++++++++++++++++-----------
M helpers/ogl_beamformer_lib.h  | 1 +
M shaders/das.glsl  | 47 ++++++++++++++++++++---------------------------
M static.c  | 1 +

7 files changed, 75 insertions(+), 39 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -347,6 +347,7 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformComputeFrame *frame, 
 	case CS_DAS: {
 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
 		glBindImageTexture(0, frame->frame.texture, 0, GL_TRUE, 0, GL_WRITE_ONLY, GL_RG32F);
+		glBindImageTexture(1, csctx->sparse_elements_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I);
 
 		#if 1
 		/* TODO(rnp): compute max_points_per_dispatch based on something like a
@@ -614,6 +615,18 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co
 				break;
 			}
 		} break;
+		case BW_UPLOAD_SPARSE_ELEMENTS: {
+			ASSERT(!atomic_load(&ctx->shared_memory->sparse_elements_sync));
+			if (!cs->sparse_elements_texture) {
+				glCreateTextures(GL_TEXTURE_1D, 1, &cs->sparse_elements_texture);
+				glTextureStorage1D(cs->sparse_elements_texture, 1, GL_R16I,
+				                   ARRAY_COUNT(sm->sparse_elements));
+				LABEL_GL_OBJECT(GL_TEXTURE, cs->sparse_elements_texture, s8("Sparse_Elements"));
+			}
+			glTextureSubImage1D(cs->sparse_elements_texture, 0, 0,
+			                    ARRAY_COUNT(sm->sparse_elements), GL_RED_INTEGER,
+			                    GL_SHORT, sm->sparse_elements);
+		} break;
 		case BW_COMPUTE: {
 			atomic_store(&cs->processing_compute, 1);
 			start_renderdoc_capture(gl_context);
diff --git a/beamformer.h b/beamformer.h
@@ -87,6 +87,7 @@ typedef struct {
 	u32 shared_ubo;
 
 	u32 channel_mapping_texture;
+	u32 sparse_elements_texture;
 
 	f32 processing_progress;
 	b32 processing_compute;
diff --git a/beamformer_parameters.h b/beamformer_parameters.h
@@ -69,7 +69,6 @@ typedef enum {
 	X(time_offset,       f32,      , float,     , "/* pulse length correction time [s] */")
 
 #define BEAMFORMER_PARAMS_HEAD \
-	X(uforces_channels,  u16, [256], uvec4, [32], "/* Channels used for virtual UFORCES elements */")                         \
 	X(focal_depths,      f32, [256], vec4,  [64], "/* [m] Focal Depths for each transmit of a RCA imaging scheme*/")          \
 	X(transmit_angles,   f32, [256], vec4,  [64], "/* [radians] Transmit Angles for each transmit of a RCA imaging scheme*/") \
 	X(xdc_transform,     f32, [16] , mat4,      , "/* IMPORTANT: column major order */")                                      \
diff --git a/helpers/ogl_beamformer_lib.c b/helpers/ogl_beamformer_lib.c
@@ -211,6 +211,24 @@ os_open_shared_memory_area(char *name)
 #endif
 
 static b32
+try_wait_sync(i32 *sync, i32 timeout_ms)
+{
+	b32 result = 0;
+	for (;;) {
+		i32 current = atomic_load(sync);
+		if (current) {
+			atomic_inc(sync, -current);
+			result = 1;
+			break;
+		} else if (!timeout_ms) {
+			break;
+		}
+		os_wait_on_value(sync, 0, timeout_ms);
+	}
+	return result;
+}
+
+static b32
 check_shared_memory(char *name)
 {
 	if (!g_bp) {
@@ -258,20 +276,28 @@ beamformer_push_channel_mapping(char *shm_name, i16 *mapping, u32 count, i32 tim
 	b32 result = check_shared_memory(shm_name) && count <= ARRAY_COUNT(g_bp->channel_mapping);
 	if (result) {
 		BeamformWork *work = beamform_work_queue_push(&g_bp->external_work_queue);
-		if (work) {
-			/* TODO(rnp): refactor */
-			for (;;) {
-				i32 current = atomic_load(&g_bp->channel_mapping_sync);
-				if (current) {
-					atomic_inc(&g_bp->channel_mapping_sync, -current);
-					break;
-				}
-				os_wait_on_value(&g_bp->channel_mapping_sync, 0, timeout_ms);
-			}
+		result = work && try_wait_sync(&g_bp->channel_mapping_sync, timeout_ms);
+		if (result) {
 			work->type = BW_UPLOAD_CHANNEL_MAPPING;
 			work->completion_barrier = offsetof(BeamformerSharedMemory, channel_mapping_sync);
 			mem_copy(g_bp->channel_mapping, mapping, count * sizeof(*mapping));
+			beamform_work_queue_push_commit(&g_bp->external_work_queue);
+		}
+	}
+	return result;
+}
 
+b32
+beamformer_push_sparse_elements(char *shm_name, i16 *elements, u32 count, i32 timeout_ms)
+{
+	b32 result = check_shared_memory(shm_name) && count <= ARRAY_COUNT(g_bp->sparse_elements);
+	if (result) {
+		BeamformWork *work = beamform_work_queue_push(&g_bp->external_work_queue);
+		result = work && try_wait_sync(&g_bp->sparse_elements_sync, timeout_ms);
+		if (result) {
+			work->type = BW_UPLOAD_SPARSE_ELEMENTS;
+			work->completion_barrier = offsetof(BeamformerSharedMemory, sparse_elements_sync);
+			mem_copy(g_bp->sparse_elements, elements, count * sizeof(*elements));
 			beamform_work_queue_push_commit(&g_bp->external_work_queue);
 		}
 	}
@@ -284,8 +310,10 @@ set_beamformer_parameters(char *shm_name, BeamformerParametersV0 *new_bp)
 	b32 result = 0;
 	result |= beamformer_push_channel_mapping(shm_name, (i16 *)new_bp->channel_mapping,
 	                                          ARRAY_COUNT(new_bp->channel_mapping), 0);
+	result |= beamformer_push_sparse_elements(shm_name, (i16 *)new_bp->uforces_channels,
+	                                          ARRAY_COUNT(new_bp->uforces_channels), 0);
 	if (result) {
-		mem_copy(&g_bp->raw, &new_bp->uforces_channels, sizeof(BeamformerParameters));
+		mem_copy(&g_bp->raw, &new_bp->focal_depths, sizeof(g_bp->raw));
 		g_bp->upload = 1;
 	}
 
diff --git a/helpers/ogl_beamformer_lib.h b/helpers/ogl_beamformer_lib.h
@@ -34,3 +34,4 @@ LIB_FN b32 beamform_data_synchronized(char *pipe_name, char *shm_name, void *dat
 /* NOTE: these functions only queue an upload; you must flush (for now via one of the data functions) */
 
 LIB_FN b32 beamformer_push_channel_mapping(char *shm_name, i16 *mapping, u32 count, i32 timeout_ms);
+LIB_FN b32 beamformer_push_sparse_elements(char *shm_name, i16 *elements, u32 count, i32 timeout_ms);
diff --git a/shaders/das.glsl b/shaders/das.glsl
@@ -3,7 +3,8 @@ layout(std430, binding = 1) readonly restrict buffer buffer_1 {
 	vec2 rf_data[];
 };
 
-layout(rg32f, binding = 0) writeonly uniform image3D u_out_data_tex;
+layout(rg32f, binding = 0) writeonly restrict uniform image3D  u_out_data_tex;
+layout(r16i,  binding = 1) readonly  restrict uniform iimage1D sparse_elements;
 
 layout(location = 2) uniform ivec3 u_voxel_offset;
 layout(location = 3) uniform uint  u_cycle_t;
@@ -17,7 +18,7 @@ layout(location = 3) uniform uint  u_cycle_t;
 #define TX_MODE_RX_COLS(a) (((a) & 1) != 0)
 
 /* NOTE: See: https://cubic.org/docs/hermite.htm */
-vec2 cubic(uint ridx, float x)
+vec2 cubic(int ridx, float x)
 {
 	mat4 h = mat4(
 		 2, -3,  0, 1,
@@ -26,7 +27,7 @@ vec2 cubic(uint ridx, float x)
 		 1, -1,  0, 0
 	);
 
-	uint  xk = uint(floor(x));
+	int   xk = int(floor(x));
 	float t  = (x  - float(xk));
 	vec4  S  = vec4(t * t * t, t * t, t, 1);
 
@@ -40,11 +41,11 @@ vec2 cubic(uint ridx, float x)
 	return vec2(dot(S, h * C1), dot(S, h * C2));
 }
 
-vec2 sample_rf(uint ridx, float t)
+vec2 sample_rf(int ridx, float t)
 {
 	vec2 result;
 	if (interpolate) result = cubic(ridx, t);
-	else             result = rf_data[ridx + uint(floor(t))];
+	else             result = rf_data[ridx + int(floor(t))];
 	return result;
 }
 
@@ -109,8 +110,8 @@ float cylindricalwave_transmit_distance(vec3 point, float focal_depth, float tra
 
 vec2 RCA(vec3 image_point, vec3 delta, float apodization_arg)
 {
-	uint ridx      = 0;
-	int  direction = beamform_plane;
+	int ridx      = 0;
+	int direction = beamform_plane;
 	if (direction != TX_ROWS) image_point = image_point.yxz;
 
 	bool tx_col = TX_MODE_TX_COLS(transmit_mode);
@@ -146,8 +147,8 @@ vec2 RCA(vec3 image_point, vec3 delta, float apodization_arg)
 			float sidx  = sample_index(transmit_distance + length(receive_distance));
 			vec2 valid  = vec2(sidx >= 0) * vec2(sidx < dec_data_dim.x);
 			sum        += apodize(sample_rf(ridx, sidx), apodization_arg, length(receive_distance.xy)) * valid;
-			receive_distance   -= delta;
-			ridx       += dec_data_dim.x;
+			receive_distance -= delta;
+			ridx             += int(dec_data_dim.x);
 		}
 	}
 	return sum;
@@ -155,8 +156,8 @@ vec2 RCA(vec3 image_point, vec3 delta, float apodization_arg)
 
 vec2 HERCULES(vec3 image_point, vec3 delta, float apodization_arg)
 {
-	uint uhercules = uint(das_shader_id == DAS_ID_UHERCULES);
-	uint ridx      = dec_data_dim.y * dec_data_dim.x * uhercules;
+	int uhercules  = int(das_shader_id == DAS_ID_UHERCULES);
+	int ridx       = int(dec_data_dim.y * dec_data_dim.x * uhercules);
 	int  direction = beamform_plane;
 	if (direction != TX_ROWS) image_point = image_point.yxz;
 
@@ -179,12 +180,8 @@ vec2 HERCULES(vec3 image_point, vec3 delta, float apodization_arg)
 
 	vec2 sum = vec2(0);
 	/* NOTE: For Each Acquistion in Raw Data */
-	for (uint i = uhercules; i < dec_data_dim.z; i++) {
-		uint base_idx = ((i - uhercules) / 8);
-		uint sub_idx  = ((i - uhercules) % 8) / 2;
-		uint shift    = (~(i - uhercules) & 1u) * 16u;
-		uint channel  = (uforces_channels[base_idx][sub_idx] << shift) >> 16u;
-
+	for (int i = uhercules; i < dec_data_dim.z; i++) {
+		int channel = imageLoad(sparse_elements, i - uhercules).x;
 		/* NOTE: For Each Virtual Source */
 		for (uint j = 0; j < dec_data_dim.y; j++) {
 			vec3 element_position;
@@ -199,7 +196,7 @@ vec2 HERCULES(vec3 image_point, vec3 delta, float apodization_arg)
 
 			sum  += apodize(sample_rf(ridx, sidx), apodization_arg,
 			                length(receive_distance.xy)) * valid;
-			ridx += dec_data_dim.x;
+			ridx += int(dec_data_dim.x);
 		}
 	}
 	return sum;
@@ -208,8 +205,8 @@ vec2 HERCULES(vec3 image_point, vec3 delta, float apodization_arg)
 vec2 uFORCES(vec3 image_point, vec3 delta, float apodization_arg)
 {
 	/* NOTE: skip first acquisition in uforces since its garbage */
-	uint uforces = uint(das_shader_id == DAS_ID_UFORCES);
-	uint ridx    = dec_data_dim.y * dec_data_dim.x * uforces;
+	int uforces = int(das_shader_id == DAS_ID_UFORCES);
+	int ridx    = int(dec_data_dim.y * dec_data_dim.x * uforces);
 
 	image_point  = (xdc_transform * vec4(image_point, 1)).xyz;
 
@@ -217,12 +214,8 @@ vec2 uFORCES(vec3 image_point, vec3 delta, float apodization_arg)
 	delta.y = 0;
 
 	vec2 sum = vec2(0);
-	for (uint i = uforces; i < dec_data_dim.z; i++) {
-		uint base_idx = ((i - uforces) / 8);
-		uint sub_idx  = ((i - uforces) % 8) / 2;
-		uint shift    = (~(i - uforces) & 1u) * 16u;
-		uint channel  = (uforces_channels[base_idx][sub_idx] << shift) >> 16u;
-
+	for (int i = uforces; i < dec_data_dim.z; i++) {
+		int   channel       = imageLoad(sparse_elements, i - uforces).x;
 		vec2  rdist         = vec2(image_point.x, image_point.z);
 		vec3  focal_point   = channel * delta + focal_point_offset;
 		float transmit_dist = distance(image_point, focal_point);
@@ -232,7 +225,7 @@ vec2 uFORCES(vec3 image_point, vec3 delta, float apodization_arg)
 			vec2 valid  = vec2(sidx >= 0) * vec2(sidx < dec_data_dim.x);
 			sum        += apodize(sample_rf(ridx, sidx), apodization_arg, rdist.x) * valid;
 			rdist.x    -= delta.x;
-			ridx       += dec_data_dim.x;
+			ridx       += int(dec_data_dim.x);
 		}
 	}
 	return sum;
diff --git a/static.c b/static.c
@@ -308,6 +308,7 @@ setup_beamformer(BeamformerCtx *ctx, Arena *memory)
 	/* TODO(rnp): refactor - this is annoying */
 	ctx->shared_memory->raw_data_sync        = 1;
 	ctx->shared_memory->channel_mapping_sync = 1;
+	ctx->shared_memory->sparse_elements_sync = 1;
 
 	/* NOTE: default compute shader pipeline */
 	ctx->shared_memory->compute_stages[0]    = CS_DECODE;

M	beamformer.c	\|	13	+++++++++++++
M	beamformer.h	\|	1	+
M	beamformer_parameters.h	\|	1	-
M	helpers/ogl_beamformer_lib.c	\|	50	+++++++++++++++++++++++++++++++++++++++-----------
M	helpers/ogl_beamformer_lib.h	\|	1	+
M	shaders/das.glsl	\|	47	++++++++++++++++++++---------------------------
M	static.c	\|	1	+