Commit: b325fe076dbd6c14a0b3bbfeac0478fad8dfcb0c
Parent: 42c2b9f27aeb4d5aed5dd3d78cd4ddc7198acbbf
Author: Randy Palamar
Date: Fri, 7 Nov 2025 14:22:03 -0700
core/lib: always perform channel mapping on the CPU
There are no cases where this would be slower. To perform the
mem_copy the CPU must touch every single byte in the array. By
simply not copying channels that are 0 we get a latency reduction.
This also means that none of the shaders need to care about
channel mapping which simplifies everything.
In the common case where half of the channels are 0 this will
reduce the latency from memory copies by at least half. It will
also reduce the latency from uploading to the GPU by a similar
amount.
NOTE: nothing about the external API needs to change; the only way
the previous code could work was if the channel mapping was
uploaded prior to the data being sent. Therefore the library can
just load it and do the mapping.
Diffstat:
8 files changed, 113 insertions(+), 53 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -516,9 +516,10 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
db->decode_mode = pb->parameters.decode_mode;
db->transmit_count = pb->parameters.acquisition_count;
- db->input_sample_stride = first? 1 : ld->bake.Filter.output_sample_stride;
- db->input_channel_stride = first? pb->parameters.raw_data_dimensions[0] : ld->bake.Filter.output_channel_stride;
- db->input_transmit_stride = first? pb->parameters.sample_count : 1;
+ u32 channel_stride = pb->parameters.acquisition_count * pb->parameters.sample_count;
+ db->input_sample_stride = first? 1 : ld->bake.Filter.output_sample_stride;
+ db->input_channel_stride = first? channel_stride : ld->bake.Filter.output_channel_stride;
+ db->input_transmit_stride = first? pb->parameters.sample_count : 1;
db->output_sample_stride = das_sample_stride;
db->output_channel_stride = das_channel_stride;
@@ -584,7 +585,6 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
fb->filter_length = (u32)f->length;
if (demod) sd->bake.flags |= BeamformerShaderFilterFlags_Demodulate;
if (f->parameters.complex) sd->bake.flags |= BeamformerShaderFilterFlags_ComplexFilter;
- if (first) sd->bake.flags |= BeamformerShaderFilterFlags_MapChannels;
sd->bake.data_kind = data_kind;
if (!first) sd->bake.data_kind = BeamformerDataKind_Float32;
@@ -609,7 +609,7 @@ plan_compute_pipeline(BeamformerComputePlan *cp, BeamformerParameterBlock *pb)
fb->output_transmit_stride = das_transmit_stride;
if (first) {
- fb->input_channel_stride = pb->parameters.raw_data_dimensions[0] / 2;
+ fb->input_channel_stride = pb->parameters.sample_count * pb->parameters.acquisition_count / 2;
fb->input_sample_stride = 1;
fb->input_transmit_stride = pb->parameters.sample_count / 2;
@@ -871,7 +871,7 @@ beamformer_commit_parameter_block(BeamformerCtx *ctx, BeamformerComputePlan *cp,
}break;
case BeamformerParameterBlockRegion_ChannelMapping:{
cuda_set_channel_mapping(pb->channel_mapping);
- } /* FALLTHROUGH */
+ }break;
case BeamformerParameterBlockRegion_FocalVectors:
case BeamformerParameterBlockRegion_SparseElements:
case BeamformerParameterBlockRegion_TransmitReceiveOrientations:
@@ -917,8 +917,6 @@ do_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, BeamformerFrame
BeamformerDecodeMode mode = cp->shader_descriptors[shader_slot].bake.Decode.decode_mode;
if (shader_slot == 0) {
- glBindImageTexture(1, cp->textures[BeamformerComputeTextureKind_ChannelMapping], 0, 0, 0, GL_READ_ONLY, GL_R16I);
-
if (mode != BeamformerDecodeMode_None) {
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, cc->ping_pong_ssbos[input_ssbo_idx]);
glProgramUniform1ui(program, DECODE_FIRST_PASS_UNIFORM_LOC, 1);
@@ -951,16 +949,11 @@ do_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, BeamformerFrame
case BeamformerShaderKind_Filter:
case BeamformerShaderKind_Demodulate:
{
- b32 map_channels = (cp->shader_descriptors[shader_slot].bake.flags & BeamformerShaderFilterFlags_MapChannels) != 0;
-
+ if (shader_slot != 0)
+ glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, cc->ping_pong_ssbos[input_ssbo_idx]);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, cc->ping_pong_ssbos[output_ssbo_idx]);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, cp->filters[sp->filter_slot].ssbo);
- if (!map_channels)
- glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, cc->ping_pong_ssbos[input_ssbo_idx]);
- else
- glBindImageTexture(1, cp->textures[BeamformerComputeTextureKind_ChannelMapping], 0, 0, 0, GL_READ_ONLY, GL_R16I);
-
glDispatchCompute(dispatch.x, dispatch.y, dispatch.z);
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
diff --git a/beamformer.h b/beamformer.h
@@ -132,7 +132,6 @@ typedef enum {BEAMFORMER_COMPUTE_UBO_LIST BeamformerComputeUBOKind_Count} Beamfo
// X(kind, gl_kind, texture_format, pixel_type)
#define BEAMFORMER_COMPUTE_TEXTURE_LIST \
- X(ChannelMapping, GL_R16I, GL_RED_INTEGER, GL_SHORT) \
X(FocalVectors, GL_RG32F, GL_RG, GL_FLOAT) \
X(SparseElements, GL_R16I, GL_RED_INTEGER, GL_SHORT) \
X(TransmitReceiveOrientations, GL_R8I, GL_RED_INTEGER, GL_BYTE)
diff --git a/beamformer.meta b/beamformer.meta
@@ -1,8 +1,15 @@
-@Enumeration(DataKind [Int16 Int16Complex Float32 Float32Complex])
@Enumeration(DecodeMode [None Hadamard])
@Enumeration(RCAOrientation [None Rows Columns])
@Enumeration(SamplingMode [2X 4X])
+@Table([name size elements]) DataKind
+{
+ [Int16 2 1]
+ [Int16Complex 2 2]
+ [Float32 4 1]
+ [Float32Complex 4 2]
+}
+
@Table([name pretty_name fixed_transmits]) AcquisitionKind
{
[FORCES FORCES 1]
@@ -27,10 +34,26 @@
}
@Expand(AcquisitionKind) @Enumeration(AcquisitionKind `$(name)`)
+@Expand(DataKind) @Enumeration(DataKind `$(name)`)
@Expand(InterpolationMode) @Enumeration(InterpolationMode `$(name)`)
@Emit
{
+ `read_only global u8 beamformer_data_kind_element_size[] = {`
+ @Expand(DataKind) ` $(size),`
+ `};`
+ ``
+ `read_only global u8 beamformer_data_kind_element_count[] = {`
+ @Expand(DataKind) ` $(elements),`
+ `};`
+ ``
+ `read_only global u8 beamformer_data_kind_byte_size[] = {`
+ @Expand(DataKind) ` $(size) * $(elements),`
+ `};`
+}
+
+@Emit
+{
`read_only global u8 beamformer_acquisition_kind_has_fixed_transmits[] = {`
@Expand(AcquisitionKind) ` $(fixed_transmits),`
`};`
@@ -78,7 +101,7 @@
@Shader(filter.glsl) Filter
{
@Enumeration(DataKind)
- @Flags([ComplexFilter MapChannels OutputFloats])
+ @Flags([ComplexFilter OutputFloats])
@Bake
{
diff --git a/beamformer_shared_memory.c b/beamformer_shared_memory.c
@@ -1,5 +1,5 @@
/* See LICENSE for license details. */
-#define BEAMFORMER_SHARED_MEMORY_VERSION (17UL)
+#define BEAMFORMER_SHARED_MEMORY_VERSION (18UL)
typedef struct BeamformerFrame BeamformerFrame;
diff --git a/generated/beamformer.meta.c b/generated/beamformer.meta.c
@@ -3,14 +3,6 @@
// GENERATED CODE
typedef enum {
- BeamformerDataKind_Int16 = 0,
- BeamformerDataKind_Int16Complex = 1,
- BeamformerDataKind_Float32 = 2,
- BeamformerDataKind_Float32Complex = 3,
- BeamformerDataKind_Count,
-} BeamformerDataKind;
-
-typedef enum {
BeamformerDecodeMode_None = 0,
BeamformerDecodeMode_Hadamard = 1,
BeamformerDecodeMode_Count,
@@ -46,6 +38,14 @@ typedef enum {
} BeamformerAcquisitionKind;
typedef enum {
+ BeamformerDataKind_Int16 = 0,
+ BeamformerDataKind_Int16Complex = 1,
+ BeamformerDataKind_Float32 = 2,
+ BeamformerDataKind_Float32Complex = 3,
+ BeamformerDataKind_Count,
+} BeamformerDataKind;
+
+typedef enum {
BeamformerInterpolationMode_Nearest = 0,
BeamformerInterpolationMode_Linear = 1,
BeamformerInterpolationMode_Cubic = 2,
@@ -59,9 +59,8 @@ typedef enum {
typedef enum {
BeamformerShaderFilterFlags_ComplexFilter = (1 << 0),
- BeamformerShaderFilterFlags_MapChannels = (1 << 1),
- BeamformerShaderFilterFlags_OutputFloats = (1 << 2),
- BeamformerShaderFilterFlags_Demodulate = (1 << 3),
+ BeamformerShaderFilterFlags_OutputFloats = (1 << 1),
+ BeamformerShaderFilterFlags_Demodulate = (1 << 2),
} BeamformerShaderFilterFlags;
typedef enum {
@@ -200,12 +199,6 @@ read_only global i32 beamformer_reloadable_render_shader_info_indices[] = {
read_only global s8 beamformer_shader_global_header_strings[] = {
s8_comp(""
- "#define DataKind_Int16 0\n"
- "#define DataKind_Int16Complex 1\n"
- "#define DataKind_Float32 2\n"
- "#define DataKind_Float32Complex 3\n"
- "\n"),
- s8_comp(""
"#define DecodeMode_None 0\n"
"#define DecodeMode_Hadamard 1\n"
"\n"),
@@ -233,6 +226,12 @@ read_only global s8 beamformer_shader_global_header_strings[] = {
"#define AcquisitionKind_HERO_PA 11\n"
"\n"),
s8_comp(""
+ "#define DataKind_Int16 0\n"
+ "#define DataKind_Int16Complex 1\n"
+ "#define DataKind_Float32 2\n"
+ "#define DataKind_Float32Complex 3\n"
+ "\n"),
+ s8_comp(""
"#define InterpolationMode_Nearest 0\n"
"#define InterpolationMode_Linear 1\n"
"#define InterpolationMode_Cubic 2\n"
@@ -246,7 +245,6 @@ read_only global s8 *beamformer_shader_flag_strings[] = {
},
(s8 []){
s8_comp("ComplexFilter"),
- s8_comp("MapChannels"),
s8_comp("OutputFloats"),
s8_comp("Demodulate"),
},
@@ -264,7 +262,7 @@ read_only global s8 *beamformer_shader_flag_strings[] = {
read_only global u8 beamformer_shader_flag_strings_count[] = {
2,
- 4,
+ 3,
5,
0,
0,
@@ -272,9 +270,9 @@ read_only global u8 beamformer_shader_flag_strings_count[] = {
};
read_only global i32 *beamformer_shader_header_vectors[] = {
- (i32 []){0, 1},
- (i32 []){0},
- (i32 []){4, 0, 5, 2},
+ (i32 []){4, 0},
+ (i32 []){4},
+ (i32 []){3, 4, 5, 1},
0,
0,
0,
@@ -352,6 +350,27 @@ read_only global i32 beamformer_shader_bake_parameter_counts[] = {
0,
};
+read_only global u8 beamformer_data_kind_element_size[] = {
+ 2,
+ 2,
+ 4,
+ 4,
+};
+
+read_only global u8 beamformer_data_kind_element_count[] = {
+ 1,
+ 2,
+ 1,
+ 2,
+};
+
+read_only global u8 beamformer_data_kind_byte_size[] = {
+ 2 * 1,
+ 2 * 2,
+ 4 * 1,
+ 4 * 2,
+};
+
read_only global u8 beamformer_acquisition_kind_has_fixed_transmits[] = {
1,
0,
diff --git a/helpers/ogl_beamformer_lib.c b/helpers/ogl_beamformer_lib.c
@@ -421,7 +421,7 @@ BEAMFORMER_UPLOAD_FNS
#undef X
function b32
-beamformer_push_data_base(void *data, u32 data_size, i32 timeout_ms)
+beamformer_push_data_base(void *data, u32 data_size, i32 timeout_ms, u32 block)
{
b32 result = 0;
if (check_shared_memory()) {
@@ -429,10 +429,41 @@ beamformer_push_data_base(void *data, u32 data_size, i32 timeout_ms)
if (lib_error_check(data_size <= arena_capacity(&scratch, u8), BF_LIB_ERR_KIND_BUFFER_OVERFLOW)) {
if (lib_try_lock(BeamformerSharedMemoryLockKind_UploadRF, timeout_ms)) {
if (lib_try_lock(BeamformerSharedMemoryLockKind_ScratchSpace, 0)) {
- mem_copy(scratch.beg, data, data_size);
+ BeamformerParameterBlock *b = beamformer_parameter_block(g_beamformer_library_context.bp, block);
+ BeamformerParameters *bp = &b->parameters;
+ BeamformerDataKind data_kind = b->pipeline.data_kind;
+
+ // TODO(rnp): maybe make a mismatched size here an error
+ u32 size = bp->acquisition_count * bp->sample_count * bp->channel_count * beamformer_data_kind_byte_size[data_kind];
+
+ u32 channel_count = bp->channel_count;
+ u32 out_channel_stride = beamformer_data_kind_element_count[data_kind] * bp->sample_count * bp->acquisition_count;
+ u32 in_channel_stride = beamformer_data_kind_element_count[data_kind] * bp->raw_data_dimensions[0];
+
+ for (u32 channel = 0; channel < channel_count; channel++) {
+ u16 data_channel = (u16)b->channel_mapping[channel];
+ u32 out_off = out_channel_stride * channel;
+ u32 in_off = in_channel_stride * data_channel;
+ for (u32 sample = 0; sample < out_channel_stride; sample++, out_off++, in_off++) {
+ switch (data_kind) {
+ case BeamformerDataKind_Int16:
+ case BeamformerDataKind_Int16Complex:
+ {
+ ((i16 *)scratch.beg)[out_off] = ((i16 *)data)[in_off];
+ }break;
+ case BeamformerDataKind_Float32:
+ case BeamformerDataKind_Float32Complex:
+ {
+ ((f32 *)scratch.beg)[out_off] = ((f32 *)data)[in_off];
+ }break;
+ InvalidDefaultCase;
+ }
+ }
+ }
+
lib_release_lock(BeamformerSharedMemoryLockKind_ScratchSpace);
/* TODO(rnp): need a better way to communicate this */
- atomic_store_u32(&g_beamformer_library_context.bp->scratch_rf_size, data_size);
+ atomic_store_u32(&g_beamformer_library_context.bp->scratch_rf_size, size);
result = 1;
}
}
@@ -444,7 +475,7 @@ beamformer_push_data_base(void *data, u32 data_size, i32 timeout_ms)
b32
beamformer_push_data_with_compute(void *data, u32 data_size, u32 image_plane_tag, u32 parameter_slot)
{
- b32 result = beamformer_push_data_base(data, data_size, g_beamformer_library_context.timeout_ms);
+ b32 result = beamformer_push_data_base(data, data_size, g_beamformer_library_context.timeout_ms, parameter_slot);
if (result) result = beamformer_compute_indirect(image_plane_tag, parameter_slot);
return result;
}
diff --git a/shaders/decode.glsl b/shaders/decode.glsl
@@ -56,7 +56,6 @@ layout(std430, binding = 3) writeonly restrict buffer buffer_3 {
};
layout(r32f, binding = 0) readonly restrict uniform image2D hadamard;
-layout(r16i, binding = 1) readonly restrict uniform iimage1D channel_mapping;
SAMPLE_DATA_TYPE sample_rf_data(uint index)
{
@@ -159,7 +158,7 @@ void main()
uint transmit = gl_GlobalInvocationID.z;
if (time_sample < OutputTransmitStride) {
- uint in_off = (InputChannelStride * imageLoad(channel_mapping, int(channel)).x +
+ uint in_off = (InputChannelStride * channel +
InputTransmitStride * transmit +
InputSampleStride * time_sample) / RF_SAMPLES_PER_INDEX;
@@ -177,8 +176,7 @@ void main()
uint transmit = gl_GlobalInvocationID.z * ToProcess;
if (time_sample < InputTransmitStride) {
uint out_off = (InputChannelStride * channel + TransmitCount * time_sample) / RF_SAMPLES_PER_INDEX;
- uint in_off = InputChannelStride * imageLoad(channel_mapping, int(channel)).x +
- InputSampleStride * time_sample;
+ uint in_off = (InputChannelStride * channel + InputSampleStride * time_sample);
#if UseSharedMemory
in_off += InputTransmitStride * transmit;
out_off += transmit;
diff --git a/shaders/filter.glsl b/shaders/filter.glsl
@@ -40,8 +40,6 @@ layout(std430, binding = 3) readonly restrict buffer buffer_3 {
FILTER_TYPE filter_coefficients[FilterLength];
};
-layout(r16i, binding = 1) readonly restrict uniform iimage1D channel_mapping;
-
vec2 complex_mul(vec2 a, vec2 b)
{
mat2 m = mat2(b.x, b.y, -b.y, b.x);
@@ -72,8 +70,7 @@ void main()
uint channel = gl_GlobalInvocationID.y;
uint transmit = gl_GlobalInvocationID.z;
- uint in_channel = bool(MapChannels) ? imageLoad(channel_mapping, int(channel)).x : channel;
- uint in_offset = InputChannelStride * in_channel + InputTransmitStride * transmit;
+ uint in_offset = InputChannelStride * channel + InputTransmitStride * transmit;
uint out_offset = OutputChannelStride * channel +
OutputTransmitStride * transmit +
OutputSampleStride * out_sample;