core: allow floating point data to be sent to beamformer - ogl_beamforming - Ultrasound Beamforming Implemented with OpenGL

Commit: d7c7ee7a6dc519aa077b51b333911e3d260db7a8
Parent: 239392a90e60209dd824539069615cde6da65e10
Author: Randy Palamar
Date:   Wed, 26 Feb 2025 12:35:32 -0700

core: allow floating point data to be sent to beamformer

closes: #17

Diffstat:
M beamformer.c  | 28 ++++++++++++++++++++--------
M beamformer.h  | 8 +-------
M beamformer_parameters.h  | 21 +++++++++++++--------
M helpers/ogl_beamformer_lib.c  | 52 +++++++++++++++++++++++++++++++++++++++++-----------
M helpers/ogl_beamformer_lib.h  | 9 +++++----
A shaders/decode.glsl  | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D shaders/hadamard.glsl  | 68 --------------------------------------------------------------------
M static.c  | 5 ++---

8 files changed, 166 insertions(+), 109 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -364,7 +364,8 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformFrame *frame, enum co
 	u32 input_ssbo_idx  = csctx->last_output_ssbo_index;
 
 	switch (shader) {
-	case CS_HADAMARD:
+	case CS_DECODE:
+	case CS_DECODE_FLOAT:
 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo);
 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
 		glBindImageTexture(0, csctx->hadamard_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I);
@@ -508,15 +509,21 @@ reload_compute_shader(BeamformerCtx *ctx, s8 path, ComputeShaderReloadContext *c
 	ComputeShaderCtx *cs = &ctx->csctx;
 
 	/* NOTE: arena works as stack (since everything here is 1 byte aligned) */
-	s8 header_in_arena = {.data = tmp.beg};
-	if (csr->needs_header)
-		header_in_arena = push_s8(&tmp, s8(COMPUTE_SHADER_HEADER));
+	s8 header = {.data = tmp.beg};
+	if (csr->needs_header) {
+		header = push_s8(&tmp, s8(COMPUTE_SHADER_HEADER));
+		if (csr->shader == CS_DECODE_FLOAT) {
+			s8 extra = push_s8(&tmp, s8(COMPUTE_FLOAT_DECODE_HEADER));
+			ASSERT(extra.data == header.data + header.len);
+			header.len += extra.len;
+		}
+	}
 
-	s8 shader_text    = ctx->platform.read_whole_file(&tmp, (c8 *)path.data);
-	shader_text.data -= header_in_arena.len;
-	shader_text.len  += header_in_arena.len;
+	s8 shader_text = ctx->platform.read_whole_file(&tmp, (c8 *)path.data);
+	shader_text.data -= header.len;
+	shader_text.len  += header.len;
 
-	if (shader_text.data == header_in_arena.data) {
+	if (shader_text.data == header.data) {
 		u32 shader_id  = compile_shader(&ctx->platform, tmp, GL_COMPUTE_SHADER, shader_text, path);
 		if (shader_id) {
 			u32 new_program = link_program(&ctx->platform, tmp, shader_id);
@@ -561,6 +568,11 @@ DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute)
 		case BW_RELOAD_SHADER: {
 			ComputeShaderReloadContext *csr = work->reload_shader_ctx;
 			reload_compute_shader(ctx, csr->path, csr, arena);
+			if (csr->shader == CS_DECODE) {
+				csr->shader = CS_DECODE_FLOAT;
+				reload_compute_shader(ctx, csr->path, csr, arena);
+				csr->shader = CS_DECODE;
+			}
 
 			/* TODO(rnp): remove this */
 			#define X(idx, name) cs->name##_id = glGetUniformLocation(cs->programs[idx], "u_" #name);
diff --git a/beamformer.h b/beamformer.h
@@ -180,7 +180,7 @@ typedef struct {
 	iptr                   last_displayed_frame;
 } BeamformerUI;
 
-#define CS_UNIFORMS                 \
+#define CS_UNIFORMS \
 	X(CS_DAS,     voxel_offset) \
 	X(CS_DAS,     cycle_t)      \
 	X(CS_MIN_MAX, mips_level)   \
@@ -189,12 +189,6 @@ typedef struct {
 typedef struct {
 	u32 programs[CS_LAST];
 
-	/* NOTE: the raw_data_ssbo is allocated at 3x the required size to allow for tiled
-	 * transfers when the GPU is running behind the CPU. It is not mapped on NVIDIA because
-	 * their drivers _will_ store the buffer in the system memory. This doesn't happen
-	 * for Intel or AMD and mapping the buffer is preferred. In either case incoming data can
-	 * be written to the arena at the appropriate offset for the current raw_data_index. An
-	 * additional BufferSubData is needed on NVIDIA to upload the data. */
 	/* NOTE: The raw data ssbo is not mapped on NVIDIA because their drivers _will_ store
 	 * the buffer in the system memory. This doesn't happen for other vendors and
 	 * mapping the buffer is preferred. In either case incoming data can be written to
diff --git a/beamformer_parameters.h b/beamformer_parameters.h
@@ -1,14 +1,15 @@
 /* See LICENSE for license details. */
 
 /* X(enumarant, number, shader file name, needs header, pretty name) */
-#define COMPUTE_SHADERS                                    \
-	X(CUDA_DECODE,  0, "",         0, "CUDA Decoding") \
-	X(CUDA_HILBERT, 1, "",         0, "CUDA Hilbert")  \
-	X(DAS,          2, "das",      1, "DAS")           \
-	X(DEMOD,        3, "demod",    1, "Demodulation")  \
-	X(HADAMARD,     4, "hadamard", 1, "Decoding")      \
-	X(MIN_MAX,      5, "min_max",  0, "Min/Max")       \
-	X(SUM,          6, "sum",      0, "Sum")
+#define COMPUTE_SHADERS \
+	X(CUDA_DECODE,  0, "",         0, "CUDA Decoding")  \
+	X(CUDA_HILBERT, 1, "",         0, "CUDA Hilbert")   \
+	X(DAS,          2, "das",      1, "DAS")            \
+	X(DECODE,       3, "decode",   1, "Decoding")       \
+	X(DECODE_FLOAT, 4, "",         1, "Decoding (F32)") \
+	X(DEMOD,        5, "demod",    1, "Demodulation")   \
+	X(MIN_MAX,      6, "min_max",  0, "Min/Max")        \
+	X(SUM,          7, "sum",      0, "Sum")
 
 enum compute_shaders {
 	#define X(e, n, s, h, pn) CS_ ##e = n,
@@ -120,6 +121,10 @@ layout(std140, binding = 0) uniform parameters {\n\
 \n\
 #line 1\n"
 
+#define COMPUTE_FLOAT_DECODE_HEADER "\
+#define INPUT_DATA_TYPE_FLOAT\n\
+#line 1\n"
+
 /* TODO(rnp): bake this into the das shader header */
 #define DAS_LOCAL_SIZE_X 32
 #define DAS_LOCAL_SIZE_Y  1
diff --git a/helpers/ogl_beamformer_lib.c b/helpers/ogl_beamformer_lib.c
@@ -16,7 +16,6 @@ typedef struct {
 	char *name;
 } Pipe;
 
-
 typedef struct { size len; u8 *data; } s8;
 #define s8(s) (s8){.len = ARRAY_COUNT(s) - 1, .data = (u8 *)s}
 
@@ -274,7 +273,7 @@ b32
 set_beamformer_pipeline(char *shm_name, i32 *stages, i32 stages_count)
 {
 	if (stages_count > ARRAY_COUNT(g_bp->compute_stages)) {
-		error_msg("maximum stage count is %u", ARRAY_COUNT(g_bp->compute_stages));
+		error_msg("maximum stage count is %lu", ARRAY_COUNT(g_bp->compute_stages));
 		return 0;
 	}
 
@@ -287,7 +286,8 @@ set_beamformer_pipeline(char *shm_name, i32 *stages, i32 stages_count)
 		case CS_CUDA_HILBERT:
 		case CS_DAS:
 		case CS_DEMOD:
-		case CS_HADAMARD:
+		case CS_DECODE:
+		case CS_DECODE_FLOAT:
 		case CS_MIN_MAX:
 		case CS_SUM:
 			g_bp->compute_stages[i] = stages[i];
@@ -331,7 +331,7 @@ send_raw_data(char *pipe_name, char *shm_name, void *data, u32 data_size)
 				written = os_write(g_pipe.file, data, data_size);
 			result = written == data_size;
 			if (!result)
-				warning_msg("failed again, wrote %ld/%ld\ngiving up",
+				warning_msg("failed again, wrote %ld/%u\ngiving up",
 				            written, data_size);
 		}
 	}
@@ -367,12 +367,12 @@ set_beamformer_parameters(char *shm_name, BeamformerParameters *new_bp)
 	return 1;
 }
 
-void
-beamform_data_synchronized(char *pipe_name, char *shm_name, i16 *data, uv2 data_dim,
-                           uv4 output_points, f32 *out_data, i32 timeout_ms)
+static b32
+beamform_data_synchronized(char *pipe_name, char *shm_name, void *data, uv2 data_dim,
+                           u32 data_size, uv4 output_points, f32 *out_data, i32 timeout_ms)
 {
 	if (!check_shared_memory(shm_name))
-		return;
+		return 0;
 
 	if (output_points.x == 0) output_points.x = 1;
 	if (output_points.y == 0) output_points.y = 1;
@@ -388,19 +388,19 @@ beamform_data_synchronized(char *pipe_name, char *shm_name, i16 *data, uv2 data_
 	s8 export_name = s8(OS_EXPORT_PIPE_NAME);
 	if (export_name.len > ARRAY_COUNT(g_bp->export_pipe_name)) {
 		error_msg("export pipe name too long");
-		return;
+		return 0;
 	}
 
 	Pipe export_pipe = os_open_read_pipe(OS_EXPORT_PIPE_NAME);
 	if (export_pipe.file == INVALID_FILE) {
 		error_msg("failed to open export pipe");
-		return;
+		return 0;
 	}
 
 	for (u32 i = 0; i < export_name.len; i++)
 		g_bp->export_pipe_name[i] = export_name.data[i];
 
-	b32 result = send_data(pipe_name, shm_name, data, data_dim);
+	b32 result = send_raw_data(pipe_name, shm_name, data, data_size);
 	if (result) {
 		size output_size = output_points.x * output_points.y * output_points.z * sizeof(f32) * 2;
 		result = os_wait_read_pipe(export_pipe, out_data, output_size, timeout_ms);
@@ -411,4 +411,34 @@ beamform_data_synchronized(char *pipe_name, char *shm_name, i16 *data, uv2 data_
 	os_disconnect_pipe(export_pipe);
 	os_close_pipe(&export_pipe.file, export_pipe.name);
 	os_close_pipe(&g_pipe.file, 0);
+
+	return result;
+}
+
+b32
+beamform_data_synchronized_i16(char *pipe_name, char *shm_name, i16 *data, uv2 data_dim,
+                               uv4 output_points, f32 *out_data, i32 timeout_ms)
+{
+	b32 result    = 0;
+	u64 data_size = data_dim.x * data_dim.y * sizeof(i16);
+	if (data_size <= U32_MAX) {
+		g_bp->raw.rf_raw_dim = data_dim;
+		result = beamform_data_synchronized(pipe_name, shm_name, data, data_dim, data_size,
+		                                    output_points, out_data, timeout_ms);
+	}
+	return result;
+}
+
+b32
+beamform_data_synchronized_f32(char *pipe_name, char *shm_name, f32 *data, uv2 data_dim,
+                               uv4 output_points, f32 *out_data, i32 timeout_ms)
+{
+	b32 result    = 0;
+	u64 data_size = data_dim.x * data_dim.y * sizeof(f32);
+	if (data_size <= U32_MAX) {
+		g_bp->raw.rf_raw_dim = data_dim;
+		result = beamform_data_synchronized(pipe_name, shm_name, data, data_dim, data_size,
+		                                    output_points, out_data, timeout_ms);
+	}
+	return result;
 }
diff --git a/helpers/ogl_beamformer_lib.h b/helpers/ogl_beamformer_lib.h
@@ -35,7 +35,8 @@ LIB_FN b32 send_data(char *pipe_name, char *shm_name, i16 *data, uv2 data_dim);
 
 /* NOTE: sends data and waits for (complex) beamformed data to be returned.
  * out_data: must be allocated by the caller as 2 f32s per output point. */
-LIB_FN void beamform_data_synchronized(char *pipe_name, char *shm_name,
-                                       i16 *data, uv2 data_dim,
-                                       uv4 output_points, f32 *out_data,
-                                       i32 timeout_ms);
+LIB_FN b32 beamform_data_synchronized_i16(char *pipe_name, char *shm_name, i16 *data, uv2 data_dim,
+                                          uv4 output_points, f32 *out_data, i32 timeout_ms);
+
+LIB_FN b32 beamform_data_synchronized_f32(char *pipe_name, char *shm_name, f32 *data, uv2 data_dim,
+                                          uv4 output_points, f32 *out_data, i32 timeout_ms);
diff --git a/shaders/decode.glsl b/shaders/decode.glsl
@@ -0,0 +1,84 @@
+/* See LICENSE for license details. */
+layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
+
+#ifdef INPUT_DATA_TYPE_FLOAT
+#define INPUT_DATA_TYPE    float
+#define RF_DATA_STEP_SCALE 1
+#else
+#define INPUT_DATA_TYPE    int
+#define RF_DATA_STEP_SCALE 2
+#endif
+
+layout(std430, binding = 1) readonly restrict buffer buffer_1 {
+	INPUT_DATA_TYPE rf_data[];
+};
+
+layout(std430, binding = 2) writeonly restrict buffer buffer_2 {
+	vec2 out_data[];
+};
+
+layout(r8i, binding = 0) readonly restrict uniform iimage2D hadamard;
+
+INPUT_DATA_TYPE sample_rf_data(uint index, uint lfs)
+{
+	INPUT_DATA_TYPE result;
+	#ifdef INPUT_DATA_TYPE_FLOAT
+	result = rf_data[index];
+	#else
+	result = (rf_data[index] << lfs) >> 16;
+	#endif
+	return result;
+}
+
+void main()
+{
+	/* NOTE: each invocation takes a time sample and a receive channel.
+	 * It first maps the column to the correct column in the rf data then
+	 * does the dot product with the equivalent row of the hadamard matrix.
+	 * The result is stored to the equivalent row, column index of the output.
+	 */
+	uint time_sample = gl_GlobalInvocationID.x;
+	uint channel     = gl_GlobalInvocationID.y;
+	uint acq         = gl_GlobalInvocationID.z;
+
+	/* NOTE: offsets for storing the results in the output data */
+	uint out_off = dec_data_dim.x * dec_data_dim.y * acq + dec_data_dim.x * channel + time_sample;
+
+	/* NOTE: channel mapping is stored as u16s so we must do this to extract the final value */
+	uint ch_array_idx = (channel / 8);
+	uint ch_vec_idx   = (channel % 8) / 2;
+	uint ch_elem_lfs  = ((~channel) & 1u) * 16;
+	uint rf_channel   = (channel_mapping[ch_array_idx][ch_vec_idx] << ch_elem_lfs) >> 16;
+
+	/* NOTE: stride is the number of samples between acquistions; off is the
+	 * index of the first acquisition for this channel and time sample  */
+	uint rf_stride = dec_data_dim.x;
+	uint rf_off    = rf_raw_dim.x * rf_channel + time_sample;
+
+	/* NOTE: rf_data index and stride considering the data is i16 not i32 */
+	uint ridx       = rf_off    / RF_DATA_STEP_SCALE;
+	uint ridx_delta = rf_stride / RF_DATA_STEP_SCALE;
+
+	/* NOTE: rf_data is i16 so each access grabs two time samples at time.
+	 * We need to shift arithmetically (maintaining the sign) to get the
+	 * desired element. If the time sample is even we take the upper half
+	 * and if its odd we take the lower half. */
+	uint lfs = ((~time_sample) & 1u) * 16;
+
+	/* NOTE: Compute N-D dot product */
+	float result = 0;
+	switch (decode) {
+	case DECODE_MODE_NONE: {
+		result = sample_rf_data(ridx + ridx_delta * acq, lfs);
+	} break;
+	case DECODE_MODE_HADAMARD: {
+		INPUT_DATA_TYPE sum = 0;
+		for (int i = 0; i < dec_data_dim.z; i++) {
+			sum  += imageLoad(hadamard, ivec2(i, acq)).x * sample_rf_data(ridx, lfs);
+			ridx += ridx_delta;
+		}
+		result = float(sum) / float(dec_data_dim.z);
+	} break;
+	}
+	out_data[out_off] = vec2(result, 0);
+}
diff --git a/shaders/hadamard.glsl b/shaders/hadamard.glsl
@@ -1,68 +0,0 @@
-/* See LICENSE for license details. */
-layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
-
-layout(std430, binding = 1) readonly restrict buffer buffer_1 {
-	int rf_data[];
-};
-
-layout(std430, binding = 2) writeonly restrict buffer buffer_2 {
-	vec2 out_data[];
-};
-
-layout(r8i, binding = 0) readonly restrict uniform iimage2D hadamard;
-
-void main()
-{
-	/* NOTE: each invocation takes a time sample and a receive channel.
-	 * It first maps the column to the correct column in the rf data then
-	 * does the dot product with the equivalent row of the hadamard matrix.
-	 * The result is stored to the equivalent row, column index of the output.
-	 */
-	uint time_sample = gl_GlobalInvocationID.x;
-	uint channel     = gl_GlobalInvocationID.y;
-	uint acq         = gl_GlobalInvocationID.z;
-
-	/* NOTE: offsets for storing the results in the output data */
-	uint out_off = dec_data_dim.x * dec_data_dim.y * acq + dec_data_dim.x * channel + time_sample;
-
-	/* NOTE: channel mapping is stored as u16s so we must do this to extract the final value */
-	uint ch_array_idx = (channel / 8);
-	uint ch_vec_idx   = (channel % 8) / 2;
-	uint ch_elem_lfs  = ((~channel) & 1u) * 16;
-	uint rf_channel   = (channel_mapping[ch_array_idx][ch_vec_idx] << ch_elem_lfs) >> 16;
-
-	/* NOTE: stride is the number of samples between acquistions; off is the
-	 * index of the first acquisition for this channel and time sample  */
-	uint rf_stride = dec_data_dim.x;
-	uint rf_off    = rf_raw_dim.x * rf_channel + time_sample;
-
-	/* NOTE: rf_data index and stride considering the data is i16 not i32 */
-	uint ridx       = rf_off / 2;
-	uint ridx_delta = rf_stride / 2;
-
-	/* NOTE: rf_data is i16 so each access grabs two time samples at time.
-	 * We need to shift arithmetically (maintaining the sign) to get the
-	 * desired element. If the time sample is even we take the upper half
-	 * and if its odd we take the lower half. */
-	uint lfs = ((~time_sample) & 1u) * 16;
-
-	/* NOTE: Compute N-D dot product */
-	int sum = 0;
-	switch (decode) {
-	case DECODE_MODE_NONE: {
-		ridx += ridx_delta * acq;
-		sum   = (rf_data[ridx] << lfs) >> 16;
-	} break;
-	case DECODE_MODE_HADAMARD: {
-		/* NOTE: offset to get the correct column in hadamard matrix */
-		uint hoff = dec_data_dim.z * acq;
-
-		for (int i = 0; i < dec_data_dim.z; i++) {
-			int data = (rf_data[ridx] << lfs) >> 16;
-			sum  += imageLoad(hadamard, ivec2(i, acq)).x * data;
-			ridx += ridx_delta;
-		}
-	} break;
-	}
-	out_data[out_off] = vec2(float(sum), 0);
-}
diff --git a/static.c b/static.c
@@ -290,10 +290,9 @@ setup_beamformer(BeamformerCtx *ctx, Arena *memory)
 	ASSERT(ctx->params);
 
 	/* NOTE: default compute shader pipeline */
-	ctx->params->compute_stages[0]    = CS_HADAMARD;
+	ctx->params->compute_stages[0]    = CS_DECODE;
 	ctx->params->compute_stages[1]    = CS_DAS;
-	ctx->params->compute_stages[2]    = CS_MIN_MAX;
-	ctx->params->compute_stages_count = 3;
+	ctx->params->compute_stages_count = 2;
 
 	if (ctx->gl.vendor_id == GL_VENDOR_NVIDIA
 	    && load_cuda_lib(&ctx->platform, s8(OS_CUDA_LIB_NAME), (iptr)&ctx->cuda_lib, *memory))

M	beamformer.c	\|	28	++++++++++++++++++++--------
M	beamformer.h	\|	8	+-------
M	beamformer_parameters.h	\|	21	+++++++++++++--------
M	helpers/ogl_beamformer_lib.c	\|	52	+++++++++++++++++++++++++++++++++++++++++-----------
M	helpers/ogl_beamformer_lib.h	\|	9	+++++----
A	shaders/decode.glsl	\|	84	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D	shaders/hadamard.glsl	\|	68	--------------------------------------------------------------------
M	static.c	\|	5	++---