Commit: de5fd539910b244b6906c3bd05451038ea010d8c
Parent: 53c5030fe0613e2c311a827aa8764875be59f639
Author: Randy Palamar
Date: Mon, 20 Apr 2026 20:06:17 -0600
lib: add processing for basic 3 sample contrast method
We have had some success with a contrast method that performs 3
acquisitions while varying the polarity of the receive
apodization. If the 3 samples are A, B, and C, the final sample
for beamforming is recovered by A - B - C. For simplicity, I am
calling this A1S2 (Add 1 Subtract 2).
It is significantly more efficient to perform this reduction on
the CPU since we are already touching all samples to perform
channel mapping. Furthermore, it reduces the amount of data that
must pass through the shared memory region and be uploaded to the
GPU. Since the operation is trivial, any non-zero optimization
level will result in the compiler performing auto-vectorization,
meaning that we don't really need to do any manual optimization. I
wrote the code to jump through a function pointer table since it
is more efficient for the i-cache, but it is probably not
important.
Diffstat:
7 files changed, 136 insertions(+), 20 deletions(-)
diff --git a/beamformer.meta b/beamformer.meta
@@ -8,6 +8,14 @@
[Int16Complex 2 2 1]
[Float32 4 1 0]
[Float32Complex 4 2 1]
+ [Float16 2 1 0]
+ [Float16Complex 2 2 1]
+}
+
+@Table([name samples]) ContrastMode
+{
+ [None 1]
+ [A1S2 3]
}
@Enumeration(EmissionKind [Sine Chirp])
@@ -111,6 +119,7 @@
@Table([name c_type m_type m_size]) ParametersExtra
{
+ [contrast_mode BeamformerContrastMode uint32 1]
[emission_kind BeamformerEmissionKind uint32 1]
[emission_parameters BeamformerEmissionParameters uint8 12]
}
@@ -130,6 +139,7 @@
}
@Expand(AcquisitionKind) @Enumeration(AcquisitionKind `$(name)`)
+@Expand(ContrastMode) @Enumeration(ContrastMode `$(name)`)
@Expand(DataKind) @Enumeration(DataKind `$(name)`)
@Expand(FilterKind) @Enumeration(FilterKind `$(name)`)
@Expand(InterpolationMode) @Enumeration(InterpolationMode `$(name)`)
@@ -200,6 +210,10 @@
@Expand(DataKind) ` $(complex),`
`};`
``
+ `read_only global u8 beamformer_contrast_mode_samples[] = {`
+ @Expand(ContrastMode) ` $(samples),`
+ `};`
+ ``
`read_only global u8 beamformer_acquisition_kind_has_fixed_transmits[] = {`
@Expand(AcquisitionKind) ` $(fixed_transmits),`
`};`
diff --git a/beamformer_shared_memory.c b/beamformer_shared_memory.c
@@ -1,5 +1,5 @@
/* See LICENSE for license details. */
-#define BEAMFORMER_SHARED_MEMORY_VERSION (26UL)
+#define BEAMFORMER_SHARED_MEMORY_VERSION (27UL)
typedef struct BeamformerFrame BeamformerFrame;
diff --git a/generated/beamformer.meta.c b/generated/beamformer.meta.c
@@ -44,10 +44,18 @@ typedef enum {
} BeamformerAcquisitionKind;
typedef enum {
+ BeamformerContrastMode_None = 0,
+ BeamformerContrastMode_A1S2 = 1,
+ BeamformerContrastMode_Count,
+} BeamformerContrastMode;
+
+typedef enum {
BeamformerDataKind_Int16 = 0,
BeamformerDataKind_Int16Complex = 1,
BeamformerDataKind_Float32 = 2,
BeamformerDataKind_Float32Complex = 3,
+ BeamformerDataKind_Float16 = 4,
+ BeamformerDataKind_Float16Complex = 5,
BeamformerDataKind_Count,
} BeamformerDataKind;
@@ -191,6 +199,7 @@ typedef struct {
u32 interpolation_mode;
u32 coherency_weighting;
u32 decimation_rate;
+ BeamformerContrastMode contrast_mode;
BeamformerEmissionKind emission_kind;
BeamformerEmissionParameters emission_parameters;
} BeamformerParameters;
@@ -225,6 +234,7 @@ typedef struct {
} BeamformerUIParameters;
typedef struct {
+ BeamformerContrastMode contrast_mode;
BeamformerEmissionKind emission_kind;
BeamformerEmissionParameters emission_parameters;
} BeamformerParametersExtra;
@@ -253,6 +263,7 @@ typedef struct {
u32 interpolation_mode;
u32 coherency_weighting;
u32 decimation_rate;
+ BeamformerContrastMode contrast_mode;
BeamformerEmissionKind emission_kind;
BeamformerEmissionParameters emission_parameters;
i16 channel_mapping[256];
@@ -289,6 +300,8 @@ read_only global u8 beamformer_data_kind_element_size[] = {
2,
4,
4,
+ 2,
+ 2,
};
read_only global u8 beamformer_data_kind_element_count[] = {
@@ -296,6 +309,8 @@ read_only global u8 beamformer_data_kind_element_count[] = {
2,
1,
2,
+ 1,
+ 2,
};
read_only global u8 beamformer_data_kind_byte_size[] = {
@@ -303,6 +318,8 @@ read_only global u8 beamformer_data_kind_byte_size[] = {
2 * 2,
4 * 1,
4 * 2,
+ 2 * 1,
+ 2 * 2,
};
read_only global b8 beamformer_data_kind_complex[] = {
@@ -310,6 +327,13 @@ read_only global b8 beamformer_data_kind_complex[] = {
1,
0,
1,
+ 0,
+ 1,
+};
+
+read_only global u8 beamformer_contrast_mode_samples[] = {
+ 1,
+ 3,
};
read_only global u8 beamformer_acquisition_kind_has_fixed_transmits[] = {
@@ -413,6 +437,8 @@ read_only global s8 beamformer_shader_global_header_strings[] = {
"#define DataKind_Int16Complex 1\n"
"#define DataKind_Float32 2\n"
"#define DataKind_Float32Complex 3\n"
+ "#define DataKind_Float16 4\n"
+ "#define DataKind_Float16Complex 5\n"
"\n"),
s8_comp(""
"#define DecodeMode_None 0\n"
diff --git a/lib/ogl_beamformer_lib.c b/lib/ogl_beamformer_lib.c
@@ -249,11 +249,24 @@ beamformer_reserve_parameter_blocks(uint32_t count)
}
function b32
+validate_parameters(BeamformerParameters *bp)
+{ /* returns 1 when bp passes both checks below, 0 as soon as lib_error_check rejects one */
+ if (!lib_error_check(Between(bp->contrast_mode, 0, BeamformerContrastMode_Count - 1), InvalidContrastMode))
+ return 0;
+ /* NOTE: contrast_mode was range checked above so it is a valid table index. The limit is divided instead of multiplying all 3 terms since that product can wrap and accept oversized parameters (assumes the counts are at most 32 bits wide) */
+ uint64_t samples_per_acquisition = (uint64_t)bp->sample_count * beamformer_contrast_mode_samples[bp->contrast_mode];
+ if (!lib_error_check(samples_per_acquisition == 0 || bp->acquisition_count <= bp->raw_data_dimensions.x / samples_per_acquisition, DataSizeMismatch))
+ return 0;
+
+ return 1;
+}
+
+function b32
validate_pipeline(i32 *shaders, u32 shader_count, BeamformerDataKind data_kind)
{
- b32 data_kind_test = Between(data_kind, 0, BeamformerDataKind_Count - 1);
- //data_kind != BeamformerDataKind_Float16 &&
- //data_kind != BeamformerDataKind_Float16Complex;
+ b32 data_kind_test = Between(data_kind, 0, BeamformerDataKind_Count - 1) &&
+ data_kind != BeamformerDataKind_Float16 &&
+ data_kind != BeamformerDataKind_Float16Complex;
if (!lib_error_check(data_kind_test, InvalidDataKind))
return 0;
@@ -413,6 +426,31 @@ b32 beamformer_push_##name (dtype *data, u32 count) { \
BEAMFORMER_UPLOAD_FNS
#undef X
+#define BEAMFORMER_REDUCE_A1S2_CONTRAST_FN(name) void name(void *restrict output_v, \
+ void *restrict input_v, \
+ u32 sample_count)
+typedef BEAMFORMER_REDUCE_A1S2_CONTRAST_FN(beamformer_reduce_a1s2_contrast_fn);
+
+#define BEAMFORMER_REDUCE_A1S2_CONTRAST_LIST \
+ X(i16) \
+ X(f32) \
+ X(f16) \
+
+static_assert(BeamformerDataKind_Float16Complex == (BeamformerDataKind_Count - 1), "");
+
+#define X(type, ...) /* one reducer is generated per entry of the list expanded below */ \
+function BEAMFORMER_REDUCE_A1S2_CONTRAST_FN(beamformer_reduce_a1s2_contrast_##type) \
+{ \
+ type *input_a = (type *)input_v + 0 * sample_count; /* A: first sample_count elements of the input */ \
+ type *input_b = (type *)input_v + 1 * sample_count; /* B: the next sample_count elements */ \
+ type *input_c = (type *)input_v + 2 * sample_count; /* C: the sample_count elements after B */ \
+ type *output = (type *)output_v; \
+ for (u32 sample = 0; sample < sample_count; sample++) /* writes sample_count elements of A - B - C */ \
+ output[sample] = input_a[sample] - input_b[sample] - input_c[sample]; \
+}
+BEAMFORMER_REDUCE_A1S2_CONTRAST_LIST
+#undef X
+
function b32
beamformer_push_data_base(void *data, u32 data_size, i32 timeout_ms, u32 block)
{
@@ -421,7 +459,8 @@ beamformer_push_data_base(void *data, u32 data_size, i32 timeout_ms, u32 block)
g_beamformer_library_context.shared_memory_size);
BeamformerParameterBlock *b = beamformer_parameter_block(g_beamformer_library_context.bp, block);
BeamformerParameters *bp = &b->parameters;
- BeamformerDataKind data_kind = b->pipeline.data_kind;
+ BeamformerDataKind data_kind = b->pipeline.data_kind;
+ BeamformerContrastMode contrast_mode = bp->contrast_mode;
u32 size = bp->acquisition_count * bp->sample_count * bp->channel_count * beamformer_data_kind_byte_size[data_kind];
u32 raw_size = bp->raw_data_dimensions.x * bp->raw_data_dimensions.y * beamformer_data_kind_byte_size[data_kind];
@@ -439,9 +478,43 @@ beamformer_push_data_base(void *data, u32 data_size, i32 timeout_ms, u32 block)
u16 data_channel = (u16)b->channel_mapping[channel];
u32 out_off = out_channel_stride * channel;
u32 in_off = in_channel_stride * data_channel;
- /* TODO(rnp): it would be better to do non temporal copy here, but we can't ensure
- * 64 byte boundaries. */
- mem_copy(scratch.beg + out_off, (u8 *)data + in_off, out_channel_stride);
+ switch (contrast_mode) {
+ default:{
+ /* NOTE(rnp): non temporal copy would be better, but we can't ensure
+ * 64 byte boundaries. */
+ memory_copy(scratch.beg + out_off, (u8 *)data + in_off, out_channel_stride);
+ }break;
+
+ case BeamformerContrastMode_A1S2:{
+ read_only local_persist u8 reduce_a1s2_index_map[] = {
+ [BeamformerDataKind_Int16] = 0,
+ [BeamformerDataKind_Int16Complex] = 0,
+ [BeamformerDataKind_Float32] = 1,
+ [BeamformerDataKind_Float32Complex] = 1,
+ [BeamformerDataKind_Float16] = 2,
+ [BeamformerDataKind_Float16Complex] = 2,
+ };
+ static_assert(BeamformerDataKind_Float16Complex == (BeamformerDataKind_Count - 1), "");
+
+ read_only local_persist beamformer_reduce_a1s2_contrast_fn *reduce_a1s2_fn_table[] = {
+ #define X(type, ...) beamformer_reduce_a1s2_contrast_##type,
+ BEAMFORMER_REDUCE_A1S2_CONTRAST_LIST
+ #undef X
+ };
+
+ // TODO(rnp): HACK: for some unknown reason, loading contrast data after loading
+ // non-contrast data causes the dataset to not be stored correctly (it looks
+ // like a mix of the old and new datasets). Putting this here fixes the issue.
+ // Counter-intuitively, this improves throughput on my zen4 test computer;
+ // however, it obviously should not be needed.
+ memory_clear(scratch.beg + out_off, 0, out_channel_stride);
+
+ u32 sample_count = bp->sample_count * beamformer_data_kind_element_count[data_kind];
+ reduce_a1s2_fn_table[reduce_a1s2_index_map[data_kind]](scratch.beg + out_off,
+ (u8 *)data + in_off,
+ sample_count);
+ }break;
+ }
}
lib_release_lock(BeamformerSharedMemoryLockKind_ScratchSpace);
@@ -482,7 +555,7 @@ beamformer_push_data_with_compute(void *data, u32 data_size, u32 image_plane_tag
b32
beamformer_push_parameters_at(BeamformerParameters *bp, u32 block)
{
- b32 result = check_shared_memory();
+ b32 result = check_shared_memory() && validate_parameters(bp);
if (result) {
result = parameter_block_region_upload(bp, sizeof(*bp), block,
BeamformerParameterBlockRegion_Parameters,
diff --git a/lib/ogl_beamformer_lib_base.h b/lib/ogl_beamformer_lib_base.h
@@ -20,12 +20,13 @@
X(InvalidImagePlane, 9, "invalid image plane") \
X(InvalidFilterKind, 10, "invalid filter kind") \
X(InvalidDataKind, 11, "invalid data kind") \
- X(BufferOverflow, 12, "passed buffer size exceeds available space") \
- X(DataSizeMismatch, 13, "data size doesn't match the size specified in parameters") \
- X(WorkQueueFull, 14, "work queue full") \
- X(ExportSpaceOverflow, 15, "not enough space for data export") \
- X(SharedMemory, 16, "failed to open shared memory region") \
- X(SyncVariable, 17, "failed to acquire lock within timeout period") \
+ X(InvalidContrastMode, 12, "invalid contrast mode") \
+ X(BufferOverflow, 13, "passed buffer size exceeds available space") \
+ X(DataSizeMismatch, 14, "data size doesn't match the size specified in parameters") \
+ X(WorkQueueFull, 15, "work queue full") \
+ X(ExportSpaceOverflow, 16, "not enough space for data export") \
+ X(SharedMemory, 17, "failed to open shared memory region") \
+ X(SyncVariable, 18, "failed to acquire lock within timeout period") \
#define X(type, num, string) BeamformerLibErrorKind_##type = num,
typedef enum {BEAMFORMER_LIB_ERRORS} BeamformerLibErrorKind;
diff --git a/tests/throughput.c b/tests/throughput.c
@@ -242,7 +242,7 @@ beamformer_simple_parameters_from_zbp_file(BeamformerSimpleParameters *bp, char
bp->speed_of_sound = header->speed_of_sound;
bp->time_offset = header->time_offset;
- //bp->contrast_mode = header->contrast_mode;
+ bp->contrast_mode = header->contrast_mode;
if (header->channel_mapping_offset != -1) {
mem_copy(bp->channel_mapping, raw.data + header->channel_mapping_offset,
diff --git a/util.c b/util.c
@@ -5,9 +5,10 @@
#pragma GCC diagnostic ignored "-Woverride-init"
#endif
-#define zero_struct(s) mem_clear(s, 0, sizeof(*s))
+#define zero_struct(s) memory_clear(s, 0, sizeof(*s))
+#define mem_clear memory_clear
function void *
-mem_clear(void *restrict p_, u8 c, iz size)
+memory_clear(void *restrict p_, u8 c, iz size)
{
u8 *p = p_;
while (size > 0) p[--size] = c;
@@ -24,8 +25,9 @@ memory_equal(void *restrict left, void *restrict right, uz n)
return result;
}
+#define mem_copy memory_copy
function void
-mem_copy(void *restrict dest, void *restrict src, uz n)
+memory_copy(void *restrict dest, void *restrict src, uz n)
{
u8 *s = src, *d = dest;
#ifdef __AVX512BW__
@@ -171,7 +173,7 @@ arena_alloc_(Arena *a, ArenaAllocateInfo info)
a->beg = start + info.count * info.size;
result = start;
if ((info.flags & ArenaAllocateFlags_NoZero) == 0)
- result = mem_clear(start, 0, info.count * info.size);
+ result = memory_clear(start, 0, info.count * info.size);
}
return result;
}