Commit: d7c7ee7a6dc519aa077b51b333911e3d260db7a8
Parent: 239392a90e60209dd824539069615cde6da65e10
Author: Randy Palamar
Date: Wed, 26 Feb 2025 12:35:32 -0700
core: allow floating point data to be sent to beamformer
closes: #17
Diffstat:
8 files changed, 166 insertions(+), 109 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -364,7 +364,8 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformFrame *frame, enum co
u32 input_ssbo_idx = csctx->last_output_ssbo_index;
switch (shader) {
- case CS_HADAMARD:
+ case CS_DECODE:
+ case CS_DECODE_FLOAT:
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
glBindImageTexture(0, csctx->hadamard_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I);
@@ -508,15 +509,21 @@ reload_compute_shader(BeamformerCtx *ctx, s8 path, ComputeShaderReloadContext *c
ComputeShaderCtx *cs = &ctx->csctx;
/* NOTE: arena works as stack (since everything here is 1 byte aligned) */
- s8 header_in_arena = {.data = tmp.beg};
- if (csr->needs_header)
- header_in_arena = push_s8(&tmp, s8(COMPUTE_SHADER_HEADER));
+ s8 header = {.data = tmp.beg};
+ if (csr->needs_header) {
+ header = push_s8(&tmp, s8(COMPUTE_SHADER_HEADER));
+ if (csr->shader == CS_DECODE_FLOAT) {
+ s8 extra = push_s8(&tmp, s8(COMPUTE_FLOAT_DECODE_HEADER));
+ ASSERT(extra.data == header.data + header.len);
+ header.len += extra.len;
+ }
+ }
- s8 shader_text = ctx->platform.read_whole_file(&tmp, (c8 *)path.data);
- shader_text.data -= header_in_arena.len;
- shader_text.len += header_in_arena.len;
+ s8 shader_text = ctx->platform.read_whole_file(&tmp, (c8 *)path.data);
+ shader_text.data -= header.len;
+ shader_text.len += header.len;
- if (shader_text.data == header_in_arena.data) {
+ if (shader_text.data == header.data) {
u32 shader_id = compile_shader(&ctx->platform, tmp, GL_COMPUTE_SHADER, shader_text, path);
if (shader_id) {
u32 new_program = link_program(&ctx->platform, tmp, shader_id);
@@ -561,6 +568,11 @@ DEBUG_EXPORT BEAMFORMER_COMPLETE_COMPUTE_FN(beamformer_complete_compute)
case BW_RELOAD_SHADER: {
ComputeShaderReloadContext *csr = work->reload_shader_ctx;
reload_compute_shader(ctx, csr->path, csr, arena);
+ if (csr->shader == CS_DECODE) {
+ csr->shader = CS_DECODE_FLOAT;
+ reload_compute_shader(ctx, csr->path, csr, arena);
+ csr->shader = CS_DECODE;
+ }
/* TODO(rnp): remove this */
#define X(idx, name) cs->name##_id = glGetUniformLocation(cs->programs[idx], "u_" #name);
diff --git a/beamformer.h b/beamformer.h
@@ -180,7 +180,7 @@ typedef struct {
iptr last_displayed_frame;
} BeamformerUI;
-#define CS_UNIFORMS \
+#define CS_UNIFORMS \
X(CS_DAS, voxel_offset) \
X(CS_DAS, cycle_t) \
X(CS_MIN_MAX, mips_level) \
@@ -189,12 +189,6 @@ typedef struct {
typedef struct {
u32 programs[CS_LAST];
- /* NOTE: the raw_data_ssbo is allocated at 3x the required size to allow for tiled
- * transfers when the GPU is running behind the CPU. It is not mapped on NVIDIA because
- * their drivers _will_ store the buffer in the system memory. This doesn't happen
- * for Intel or AMD and mapping the buffer is preferred. In either case incoming data can
- * be written to the arena at the appropriate offset for the current raw_data_index. An
- * additional BufferSubData is needed on NVIDIA to upload the data. */
/* NOTE: The raw data ssbo is not mapped on NVIDIA because their drivers _will_ store
* the buffer in the system memory. This doesn't happen for other vendors and
* mapping the buffer is preferred. In either case incoming data can be written to
diff --git a/beamformer_parameters.h b/beamformer_parameters.h
@@ -1,14 +1,15 @@
/* See LICENSE for license details. */
/* X(enumarant, number, shader file name, needs header, pretty name) */
-#define COMPUTE_SHADERS \
- X(CUDA_DECODE, 0, "", 0, "CUDA Decoding") \
- X(CUDA_HILBERT, 1, "", 0, "CUDA Hilbert") \
- X(DAS, 2, "das", 1, "DAS") \
- X(DEMOD, 3, "demod", 1, "Demodulation") \
- X(HADAMARD, 4, "hadamard", 1, "Decoding") \
- X(MIN_MAX, 5, "min_max", 0, "Min/Max") \
- X(SUM, 6, "sum", 0, "Sum")
+#define COMPUTE_SHADERS \
+ X(CUDA_DECODE, 0, "", 0, "CUDA Decoding") \
+ X(CUDA_HILBERT, 1, "", 0, "CUDA Hilbert") \
+ X(DAS, 2, "das", 1, "DAS") \
+ X(DECODE, 3, "decode", 1, "Decoding") \
+ X(DECODE_FLOAT, 4, "", 1, "Decoding (F32)") \
+ X(DEMOD, 5, "demod", 1, "Demodulation") \
+ X(MIN_MAX, 6, "min_max", 0, "Min/Max") \
+ X(SUM, 7, "sum", 0, "Sum")
enum compute_shaders {
#define X(e, n, s, h, pn) CS_ ##e = n,
@@ -120,6 +121,10 @@ layout(std140, binding = 0) uniform parameters {\n\
\n\
#line 1\n"
+#define COMPUTE_FLOAT_DECODE_HEADER "\
+#define INPUT_DATA_TYPE_FLOAT\n\
+#line 1\n"
+
/* TODO(rnp): bake this into the das shader header */
#define DAS_LOCAL_SIZE_X 32
#define DAS_LOCAL_SIZE_Y 1
diff --git a/helpers/ogl_beamformer_lib.c b/helpers/ogl_beamformer_lib.c
@@ -16,7 +16,6 @@ typedef struct {
char *name;
} Pipe;
-
typedef struct { size len; u8 *data; } s8;
#define s8(s) (s8){.len = ARRAY_COUNT(s) - 1, .data = (u8 *)s}
@@ -274,7 +273,7 @@ b32
set_beamformer_pipeline(char *shm_name, i32 *stages, i32 stages_count)
{
if (stages_count > ARRAY_COUNT(g_bp->compute_stages)) {
- error_msg("maximum stage count is %u", ARRAY_COUNT(g_bp->compute_stages));
+ error_msg("maximum stage count is %lu", ARRAY_COUNT(g_bp->compute_stages));
return 0;
}
@@ -287,7 +286,8 @@ set_beamformer_pipeline(char *shm_name, i32 *stages, i32 stages_count)
case CS_CUDA_HILBERT:
case CS_DAS:
case CS_DEMOD:
- case CS_HADAMARD:
+ case CS_DECODE:
+ case CS_DECODE_FLOAT:
case CS_MIN_MAX:
case CS_SUM:
g_bp->compute_stages[i] = stages[i];
@@ -331,7 +331,7 @@ send_raw_data(char *pipe_name, char *shm_name, void *data, u32 data_size)
written = os_write(g_pipe.file, data, data_size);
result = written == data_size;
if (!result)
- warning_msg("failed again, wrote %ld/%ld\ngiving up",
+ warning_msg("failed again, wrote %ld/%u\ngiving up",
written, data_size);
}
}
@@ -367,12 +367,12 @@ set_beamformer_parameters(char *shm_name, BeamformerParameters *new_bp)
return 1;
}
-void
-beamform_data_synchronized(char *pipe_name, char *shm_name, i16 *data, uv2 data_dim,
- uv4 output_points, f32 *out_data, i32 timeout_ms)
+static b32
+beamform_data_synchronized(char *pipe_name, char *shm_name, void *data, uv2 data_dim,
+ u32 data_size, uv4 output_points, f32 *out_data, i32 timeout_ms)
{
if (!check_shared_memory(shm_name))
- return;
+ return 0;
if (output_points.x == 0) output_points.x = 1;
if (output_points.y == 0) output_points.y = 1;
@@ -388,19 +388,19 @@ beamform_data_synchronized(char *pipe_name, char *shm_name, i16 *data, uv2 data_
s8 export_name = s8(OS_EXPORT_PIPE_NAME);
if (export_name.len > ARRAY_COUNT(g_bp->export_pipe_name)) {
error_msg("export pipe name too long");
- return;
+ return 0;
}
Pipe export_pipe = os_open_read_pipe(OS_EXPORT_PIPE_NAME);
if (export_pipe.file == INVALID_FILE) {
error_msg("failed to open export pipe");
- return;
+ return 0;
}
for (u32 i = 0; i < export_name.len; i++)
g_bp->export_pipe_name[i] = export_name.data[i];
- b32 result = send_data(pipe_name, shm_name, data, data_dim);
+ b32 result = send_raw_data(pipe_name, shm_name, data, data_size);
if (result) {
size output_size = output_points.x * output_points.y * output_points.z * sizeof(f32) * 2;
result = os_wait_read_pipe(export_pipe, out_data, output_size, timeout_ms);
@@ -411,4 +411,34 @@ beamform_data_synchronized(char *pipe_name, char *shm_name, i16 *data, uv2 data_
os_disconnect_pipe(export_pipe);
os_close_pipe(&export_pipe.file, export_pipe.name);
os_close_pipe(&g_pipe.file, 0);
+
+ return result;
+}
+
+b32
+beamform_data_synchronized_i16(char *pipe_name, char *shm_name, i16 *data, uv2 data_dim,
+ uv4 output_points, f32 *out_data, i32 timeout_ms)
+{
+ b32 result = 0;
+ u64 data_size = data_dim.x * data_dim.y * sizeof(i16);
+ if (data_size <= U32_MAX) {
+ g_bp->raw.rf_raw_dim = data_dim;
+ result = beamform_data_synchronized(pipe_name, shm_name, data, data_dim, data_size,
+ output_points, out_data, timeout_ms);
+ }
+ return result;
+}
+
+b32
+beamform_data_synchronized_f32(char *pipe_name, char *shm_name, f32 *data, uv2 data_dim,
+ uv4 output_points, f32 *out_data, i32 timeout_ms)
+{
+ b32 result = 0;
+ u64 data_size = data_dim.x * data_dim.y * sizeof(f32);
+ if (data_size <= U32_MAX) {
+ g_bp->raw.rf_raw_dim = data_dim;
+ result = beamform_data_synchronized(pipe_name, shm_name, data, data_dim, data_size,
+ output_points, out_data, timeout_ms);
+ }
+ return result;
}
diff --git a/helpers/ogl_beamformer_lib.h b/helpers/ogl_beamformer_lib.h
@@ -35,7 +35,8 @@ LIB_FN b32 send_data(char *pipe_name, char *shm_name, i16 *data, uv2 data_dim);
/* NOTE: sends data and waits for (complex) beamformed data to be returned.
* out_data: must be allocated by the caller as 2 f32s per output point. */
-LIB_FN void beamform_data_synchronized(char *pipe_name, char *shm_name,
- i16 *data, uv2 data_dim,
- uv4 output_points, f32 *out_data,
- i32 timeout_ms);
+LIB_FN b32 beamform_data_synchronized_i16(char *pipe_name, char *shm_name, i16 *data, uv2 data_dim,
+ uv4 output_points, f32 *out_data, i32 timeout_ms);
+
+LIB_FN b32 beamform_data_synchronized_f32(char *pipe_name, char *shm_name, f32 *data, uv2 data_dim,
+ uv4 output_points, f32 *out_data, i32 timeout_ms);
diff --git a/shaders/decode.glsl b/shaders/decode.glsl
@@ -0,0 +1,84 @@
+/* See LICENSE for license details. */
+layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
+
+#ifdef INPUT_DATA_TYPE_FLOAT
+#define INPUT_DATA_TYPE float
+#define RF_DATA_STEP_SCALE 1
+#else
+#define INPUT_DATA_TYPE int
+#define RF_DATA_STEP_SCALE 2
+#endif
+
+layout(std430, binding = 1) readonly restrict buffer buffer_1 {
+ INPUT_DATA_TYPE rf_data[];
+};
+
+layout(std430, binding = 2) writeonly restrict buffer buffer_2 {
+ vec2 out_data[];
+};
+
+layout(r8i, binding = 0) readonly restrict uniform iimage2D hadamard;
+
+INPUT_DATA_TYPE sample_rf_data(uint index, uint lfs)
+{
+ INPUT_DATA_TYPE result;
+ #ifdef INPUT_DATA_TYPE_FLOAT
+ result = rf_data[index];
+ #else
+ result = (rf_data[index] << lfs) >> 16;
+ #endif
+ return result;
+}
+
+void main()
+{
+ /* NOTE: each invocation takes a time sample and a receive channel.
+ * It first maps the column to the correct column in the rf data then
+ * does the dot product with the equivalent row of the hadamard matrix.
+ * The result is stored to the equivalent row, column index of the output.
+ */
+ uint time_sample = gl_GlobalInvocationID.x;
+ uint channel = gl_GlobalInvocationID.y;
+ uint acq = gl_GlobalInvocationID.z;
+
+ /* NOTE: offsets for storing the results in the output data */
+ uint out_off = dec_data_dim.x * dec_data_dim.y * acq + dec_data_dim.x * channel + time_sample;
+
+ /* NOTE: channel mapping is stored as u16s so we must do this to extract the final value */
+ uint ch_array_idx = (channel / 8);
+ uint ch_vec_idx = (channel % 8) / 2;
+ uint ch_elem_lfs = ((~channel) & 1u) * 16;
+ uint rf_channel = (channel_mapping[ch_array_idx][ch_vec_idx] << ch_elem_lfs) >> 16;
+
+ /* NOTE: stride is the number of samples between acquistions; off is the
+ * index of the first acquisition for this channel and time sample */
+ uint rf_stride = dec_data_dim.x;
+ uint rf_off = rf_raw_dim.x * rf_channel + time_sample;
+
+ /* NOTE: rf_data index and stride considering the data is i16 not i32 */
+ uint ridx = rf_off / RF_DATA_STEP_SCALE;
+ uint ridx_delta = rf_stride / RF_DATA_STEP_SCALE;
+
+ /* NOTE: rf_data is i16 so each access grabs two time samples at time.
+ * We need to shift arithmetically (maintaining the sign) to get the
+ * desired element. If the time sample is even we take the upper half
+ * and if its odd we take the lower half. */
+ uint lfs = ((~time_sample) & 1u) * 16;
+
+ /* NOTE: Compute N-D dot product */
+ float result = 0;
+ switch (decode) {
+ case DECODE_MODE_NONE: {
+ result = sample_rf_data(ridx + ridx_delta * acq, lfs);
+ } break;
+ case DECODE_MODE_HADAMARD: {
+ INPUT_DATA_TYPE sum = 0;
+ for (int i = 0; i < dec_data_dim.z; i++) {
+ sum += imageLoad(hadamard, ivec2(i, acq)).x * sample_rf_data(ridx, lfs);
+ ridx += ridx_delta;
+ }
+ result = float(sum) / float(dec_data_dim.z);
+ } break;
+ }
+ out_data[out_off] = vec2(result, 0);
+}
diff --git a/shaders/hadamard.glsl b/shaders/hadamard.glsl
@@ -1,68 +0,0 @@
-/* See LICENSE for license details. */
-layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
-
-layout(std430, binding = 1) readonly restrict buffer buffer_1 {
- int rf_data[];
-};
-
-layout(std430, binding = 2) writeonly restrict buffer buffer_2 {
- vec2 out_data[];
-};
-
-layout(r8i, binding = 0) readonly restrict uniform iimage2D hadamard;
-
-void main()
-{
- /* NOTE: each invocation takes a time sample and a receive channel.
- * It first maps the column to the correct column in the rf data then
- * does the dot product with the equivalent row of the hadamard matrix.
- * The result is stored to the equivalent row, column index of the output.
- */
- uint time_sample = gl_GlobalInvocationID.x;
- uint channel = gl_GlobalInvocationID.y;
- uint acq = gl_GlobalInvocationID.z;
-
- /* NOTE: offsets for storing the results in the output data */
- uint out_off = dec_data_dim.x * dec_data_dim.y * acq + dec_data_dim.x * channel + time_sample;
-
- /* NOTE: channel mapping is stored as u16s so we must do this to extract the final value */
- uint ch_array_idx = (channel / 8);
- uint ch_vec_idx = (channel % 8) / 2;
- uint ch_elem_lfs = ((~channel) & 1u) * 16;
- uint rf_channel = (channel_mapping[ch_array_idx][ch_vec_idx] << ch_elem_lfs) >> 16;
-
- /* NOTE: stride is the number of samples between acquistions; off is the
- * index of the first acquisition for this channel and time sample */
- uint rf_stride = dec_data_dim.x;
- uint rf_off = rf_raw_dim.x * rf_channel + time_sample;
-
- /* NOTE: rf_data index and stride considering the data is i16 not i32 */
- uint ridx = rf_off / 2;
- uint ridx_delta = rf_stride / 2;
-
- /* NOTE: rf_data is i16 so each access grabs two time samples at time.
- * We need to shift arithmetically (maintaining the sign) to get the
- * desired element. If the time sample is even we take the upper half
- * and if its odd we take the lower half. */
- uint lfs = ((~time_sample) & 1u) * 16;
-
- /* NOTE: Compute N-D dot product */
- int sum = 0;
- switch (decode) {
- case DECODE_MODE_NONE: {
- ridx += ridx_delta * acq;
- sum = (rf_data[ridx] << lfs) >> 16;
- } break;
- case DECODE_MODE_HADAMARD: {
- /* NOTE: offset to get the correct column in hadamard matrix */
- uint hoff = dec_data_dim.z * acq;
-
- for (int i = 0; i < dec_data_dim.z; i++) {
- int data = (rf_data[ridx] << lfs) >> 16;
- sum += imageLoad(hadamard, ivec2(i, acq)).x * data;
- ridx += ridx_delta;
- }
- } break;
- }
- out_data[out_off] = vec2(float(sum), 0);
-}
diff --git a/static.c b/static.c
@@ -290,10 +290,9 @@ setup_beamformer(BeamformerCtx *ctx, Arena *memory)
ASSERT(ctx->params);
/* NOTE: default compute shader pipeline */
- ctx->params->compute_stages[0] = CS_HADAMARD;
+ ctx->params->compute_stages[0] = CS_DECODE;
ctx->params->compute_stages[1] = CS_DAS;
- ctx->params->compute_stages[2] = CS_MIN_MAX;
- ctx->params->compute_stages_count = 3;
+ ctx->params->compute_stages_count = 2;
if (ctx->gl.vendor_id == GL_VENDOR_NVIDIA
&& load_cuda_lib(&ctx->platform, s8(OS_CUDA_LIB_NAME), (iptr)&ctx->cuda_lib, *memory))