Commit: ec59851530f81a795e6e3b53556d3f638a2598a9
Parent: 9caebc871d3d048f526864cd9403771257ee181f
Author: Randy Palamar
Date: Wed, 23 Jul 2025 09:10:17 -0600
core: first pass at demod/decimation support
This version has to appear after decoding and is therefore less
optimal than it could be. It also requires alot of setup on the
library user side alot of which should be simplified.
Diffstat:
13 files changed, 355 insertions(+), 84 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -311,7 +311,8 @@ das_voxel_transform_matrix(BeamformerParameters *bp)
}
function void
-do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformerComputeFrame *frame, BeamformerShaderKind shader)
+do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformerComputeFrame *frame,
+ BeamformerShaderKind shader, BeamformerShaderParameters *sp)
{
ComputeShaderCtx *csctx = &ctx->csctx;
BeamformerSharedMemory *sm = ctx->shared_memory.region;
@@ -364,9 +365,15 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformerComputeFrame *frame
case BeamformerShaderKind_Demodulate:{
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, csctx->rf_data_ssbos[output_ssbo_idx]);
- glDispatchCompute(ORONE(csctx->dec_data_dim.x / 32),
- ORONE(csctx->dec_data_dim.y / 32),
- ORONE(csctx->dec_data_dim.z));
+
+ glBindImageTexture(0, csctx->filter_textures[sp->filter_slot], 0, GL_FALSE, 0, GL_READ_ONLY, GL_R32F);
+
+ f32 local_size_x = (f32)(DEMOD_LOCAL_SIZE_X * (f32)sm->parameters.decimation_rate);
+ glDispatchCompute((u32)ceil_f32((f32)csctx->dec_data_dim.x / local_size_x),
+ (u32)ceil_f32((f32)csctx->dec_data_dim.y / DEMOD_LOCAL_SIZE_Y),
+ (u32)ceil_f32((f32)csctx->dec_data_dim.z / DEMOD_LOCAL_SIZE_Z));
+ glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
+
csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index;
}break;
case BeamformerShaderKind_MinMax:{
@@ -494,6 +501,13 @@ shader_text_with_header(ShaderReloadContext *ctx, OS *os, Arena *arena)
stream_append_s8s(&sb, s8("#version 460 core\n\n"), ctx->header);
switch (ctx->kind) {
+ case BeamformerShaderKind_Demodulate:{
+ stream_append_s8(&sb, s8(""
+ "layout(local_size_x = " str(DEMOD_LOCAL_SIZE_X) ", "
+ "local_size_y = " str(DEMOD_LOCAL_SIZE_Y) ", "
+ "local_size_z = " str(DEMOD_LOCAL_SIZE_Z) ") in;\n\n"
+ ));
+ }break;
case BeamformerShaderKind_DAS:
case BeamformerShaderKind_DASFast:
{
@@ -522,14 +536,19 @@ shader_text_with_header(ShaderReloadContext *ctx, OS *os, Arena *arena)
));
#undef X
}break;
+ case BeamformerShaderKind_Decode:
case BeamformerShaderKind_DecodeFloat:
- case BeamformerShaderKind_DecodeFloatComplex:{
- if (ctx->kind == BeamformerShaderKind_DecodeFloat)
+ case BeamformerShaderKind_DecodeFloatComplex:
+ {
+ switch (ctx->kind) {
+ case BeamformerShaderKind_DecodeFloat:{
stream_append_s8(&sb, s8("#define INPUT_DATA_TYPE_FLOAT\n\n"));
- else
+ }break;
+ case BeamformerShaderKind_DecodeFloatComplex:{
stream_append_s8(&sb, s8("#define INPUT_DATA_TYPE_FLOAT_COMPLEX\n\n"));
- } /* FALLTHROUGH */
- case BeamformerShaderKind_Decode:{
+ }break;
+ default:{}break;
+ }
#define X(type, id, pretty) "#define DECODE_MODE_" #type " " #id "\n"
stream_append_s8(&sb, s8(""
"layout(local_size_x = " str(DECODE_LOCAL_SIZE_X) ", "
@@ -619,13 +638,15 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co
if (src->kind == BeamformerShaderKind_Decode) {
/* TODO(rnp): think of a better way of doing this */
src->kind = BeamformerShaderKind_DecodeFloatComplex;
- src->shader = cs->programs + BeamformerShaderKind_DecodeFloatComplex;
+ src->shader = cs->programs + src->kind;
success &= reload_compute_shader(ctx, src, s8(" (F32C)"), arena);
+
src->kind = BeamformerShaderKind_DecodeFloat;
- src->shader = cs->programs + BeamformerShaderKind_DecodeFloat;
+ src->shader = cs->programs + src->kind;
success &= reload_compute_shader(ctx, src, s8(" (F32)"), arena);
+
src->kind = BeamformerShaderKind_Decode;
- src->shader = cs->programs + BeamformerShaderKind_Decode;
+ src->shader = cs->programs + src->kind;
}
if (src->kind == BeamformerShaderKind_DAS) {
@@ -673,36 +694,48 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co
os_shared_memory_region_unlock(&ctx->shared_memory, sm->locks, (i32)work->lock);
post_sync_barrier(&ctx->shared_memory, BeamformerSharedMemoryLockKind_ExportSync, sm->locks);
}break;
+ case BeamformerWorkKind_CreateFilter:{
+ BeamformerCreateFilterContext *fctx = &work->create_filter_context;
+ Arena tmp_arena = arena;
+ glDeleteTextures(1, cs->filter_textures + fctx->slot);
+ glCreateTextures(GL_TEXTURE_1D, 1, cs->filter_textures + fctx->slot);
+
+ u32 texture = cs->filter_textures[fctx->slot];
+ glTextureStorage1D(texture, 1, GL_R32F, fctx->length);
+ f32 *filter = kaiser_low_pass_filter(&tmp_arena, fctx->cutoff_frequency,
+ sm->parameters.sampling_frequency, fctx->beta, fctx->length);
+ glTextureSubImage1D(texture, 0, 0, fctx->length, GL_RED, GL_FLOAT, filter);
+ }break;
case BeamformerWorkKind_UploadBuffer:{
os_shared_memory_region_lock(&ctx->shared_memory, sm->locks, (i32)work->lock, (u32)-1);
BeamformerUploadContext *uc = &work->upload_context;
u32 tex_type, tex_format, tex_1d = 0, buffer = 0;
i32 tex_element_count;
switch (uc->kind) {
- case BU_KIND_CHANNEL_MAPPING: {
+ case BeamformerUploadKind_ChannelMapping:{
tex_1d = cs->channel_mapping_texture;
tex_type = GL_SHORT;
tex_format = GL_RED_INTEGER;
tex_element_count = countof(sm->channel_mapping);
cs->cuda_lib.set_channel_mapping(sm->channel_mapping);
- } break;
- case BU_KIND_FOCAL_VECTORS: {
+ }break;
+ case BeamformerUploadKind_FocalVectors:{
tex_1d = cs->focal_vectors_texture;
tex_type = GL_FLOAT;
tex_format = GL_RG;
tex_element_count = countof(sm->focal_vectors);
- } break;
- case BU_KIND_SPARSE_ELEMENTS: {
+ }break;
+ case BeamformerUploadKind_SparseElements:{
tex_1d = cs->sparse_elements_texture;
tex_type = GL_SHORT;
tex_format = GL_RED_INTEGER;
tex_element_count = countof(sm->sparse_elements);
- } break;
- case BU_KIND_PARAMETERS: {
+ }break;
+ case BeamformerUploadKind_Parameters:{
ctx->ui_read_params = ctx->beamform_work_queue != q;
buffer = cs->shared_ubo;
- } break;
- case BU_KIND_RF_DATA: {
+ }break;
+ case BeamformerUploadKind_RFData:{
if (cs->rf_raw_size != uc->size ||
!uv4_equal(cs->dec_data_dim, uv4_from_u32_array(bp->dec_data_dim)))
{
@@ -772,7 +805,8 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co
frame->frame.compound_count = bp->dec_data_dim[2];
u32 stage_count = sm->compute_stages_count;
- BeamformerShaderKind *stages = sm->compute_stages;
+ BeamformerShaderKind *stages = sm->compute_stages;
+ BeamformerShaderParameters *params = sm->compute_shader_parameters;
for (u32 i = 0; i < stage_count; i++) {
switch (stages[i]) {
@@ -792,7 +826,7 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co
for (u32 i = 0; i < stage_count; i++) {
did_sum_shader |= stages[i] == BeamformerShaderKind_Sum;
glBeginQuery(GL_TIME_ELAPSED, cs->shader_timer_ids[i]);
- do_compute_shader(ctx, arena, frame, stages[i]);
+ do_compute_shader(ctx, arena, frame, stages[i], params + i);
glEndQuery(GL_TIME_ELAPSED);
}
diff --git a/beamformer.h b/beamformer.h
@@ -111,6 +111,7 @@ typedef struct {
u32 raw_data_ssbo;
u32 shared_ubo;
+ u32 filter_textures[BEAMFORMER_FILTER_SLOTS];
u32 channel_mapping_texture;
u32 sparse_elements_texture;
u32 focal_vectors_texture;
diff --git a/beamformer_parameters.h b/beamformer_parameters.h
@@ -15,7 +15,7 @@
X(CudaDecode, 0, "", 0, "CUDA Decode") \
X(CudaHilbert, 1, "", 0, "CUDA Hilbert") \
X(DAS, 2, "das", 1, "DAS") \
- X(Decode, 3, "decode", 1, "Decode") \
+ X(Decode, 3, "decode", 1, "Decode (I16)") \
X(DecodeFloat, 4, "", 1, "Decode (F32)") \
X(DecodeFloatComplex, 5, "", 1, "Decode (F32C)") \
X(Demodulate, 6, "demod", 1, "Demodulate") \
@@ -76,6 +76,10 @@ typedef enum {
X(EPIC_UHERCULES, 9, "EPIC-UHERCULES", 0) \
X(FLASH, 10, "Flash", 0)
+#define DEMOD_LOCAL_SIZE_X 64
+#define DEMOD_LOCAL_SIZE_Y 1
+#define DEMOD_LOCAL_SIZE_Z 1
+
#define DECODE_LOCAL_SIZE_X 4
#define DECODE_LOCAL_SIZE_Y 1
#define DECODE_LOCAL_SIZE_Z 16
@@ -101,6 +105,8 @@ typedef enum {
#define MAX_BEAMFORMED_SAVED_FRAMES 16
#define MAX_COMPUTE_SHADER_STAGES 16
+#define BEAMFORMER_FILTER_SLOTS 4
+
/* TODO(rnp): actually use a substruct but generate a header compatible with MATLAB */
/* X(name, type, size, gltype, glsize, comment) */
#define BEAMFORMER_UI_PARAMS \
@@ -141,6 +147,7 @@ typedef enum {
X(time_offset, float, , float, , "/* pulse length correction time [s] */")
#define BEAMFORMER_PARAMS_TAIL \
+ X(decimation_rate, int32_t, , int, , "/* Number of times to decimate */") \
X(readi_group_id, uint32_t, , uint, , "/* Which readi group this data is from */") \
X(readi_group_size, uint32_t, , uint, , "/* Size of readi transmit group */")
@@ -153,7 +160,7 @@ typedef struct {
BEAMFORMER_PARAMS_HEAD_V0
BEAMFORMER_UI_PARAMS
BEAMFORMER_PARAMS_TAIL
- float _pad[2];
+ float _pad[1];
} BeamformerParametersV0;
/* NOTE: This struct follows the OpenGL std140 layout. DO NOT modify unless you have
@@ -162,7 +169,7 @@ typedef struct {
BEAMFORMER_PARAMS_HEAD
BEAMFORMER_UI_PARAMS
BEAMFORMER_PARAMS_TAIL
- float _pad[2];
+ float _pad[1];
} BeamformerParameters;
#undef X
diff --git a/beamformer_work_queue.h b/beamformer_work_queue.h
@@ -2,7 +2,7 @@
#ifndef _BEAMFORMER_WORK_QUEUE_H_
#define _BEAMFORMER_WORK_QUEUE_H_
-#define BEAMFORMER_SHARED_MEMORY_VERSION (8UL)
+#define BEAMFORMER_SHARED_MEMORY_VERSION (9UL)
typedef struct BeamformerComputeFrame BeamformerComputeFrame;
typedef struct ShaderReloadContext ShaderReloadContext;
@@ -10,6 +10,7 @@ typedef struct ShaderReloadContext ShaderReloadContext;
typedef enum {
BeamformerWorkKind_Compute,
BeamformerWorkKind_ComputeIndirect,
+ BeamformerWorkKind_CreateFilter,
BeamformerWorkKind_ReloadShader,
BeamformerWorkKind_SendFrame,
BeamformerWorkKind_ExportBuffer,
@@ -17,12 +18,11 @@ typedef enum {
} BeamformerWorkKind;
typedef enum {
- BU_KIND_CHANNEL_MAPPING,
- BU_KIND_FOCAL_VECTORS,
- BU_KIND_PARAMETERS,
- BU_KIND_RF_DATA,
- BU_KIND_SPARSE_ELEMENTS,
- BU_KIND_LAST,
+ BeamformerUploadKind_ChannelMapping,
+ BeamformerUploadKind_FocalVectors,
+ BeamformerUploadKind_Parameters,
+ BeamformerUploadKind_RFData,
+ BeamformerUploadKind_SparseElements,
} BeamformerUploadKind;
typedef struct {
@@ -32,6 +32,24 @@ typedef struct {
} BeamformerUploadContext;
typedef enum {
+ BeamformerFilterKind_Kaiser,
+ BeamformerFilterKind_MatchedSine,
+} BeamformerFilterKind;
+
+typedef struct {
+ BeamformerFilterKind kind;
+ union {
+ struct {
+ f32 beta;
+ f32 cutoff_frequency;
+ };
+ f32 xdc_center_frequency;
+ };
+ i16 length;
+ i16 slot;
+} BeamformerCreateFilterContext;
+
+typedef enum {
BeamformerExportKind_BeamformedData,
BeamformerExportKind_Stats,
} BeamformerExportKind;
@@ -41,6 +59,10 @@ typedef struct {
u32 size;
} BeamformerExportContext;
+typedef union {
+ u8 filter_slot;
+} BeamformerShaderParameters;
+
#define BEAMFORMER_SHARED_MEMORY_LOCKS \
X(None) \
X(ChannelMapping) \
@@ -58,12 +80,13 @@ typedef enum {BEAMFORMER_SHARED_MEMORY_LOCKS BeamformerSharedMemoryLockKind_Coun
/* NOTE: discriminated union based on type */
typedef struct {
union {
- BeamformerComputeFrame *frame;
- BeamformerUploadContext upload_context;
- BeamformerExportContext export_context;
- ShaderReloadContext *shader_reload_context;
- BeamformerViewPlaneTag compute_indirect_plane;
- void *generic;
+ BeamformerComputeFrame *frame;
+ BeamformerCreateFilterContext create_filter_context;
+ BeamformerExportContext export_context;
+ BeamformerUploadContext upload_context;
+ BeamformerViewPlaneTag compute_indirect_plane;
+ ShaderReloadContext *shader_reload_context;
+ void *generic;
};
BeamformerSharedMemoryLockKind lock;
BeamformerWorkKind kind;
@@ -123,8 +146,9 @@ typedef struct {
};
};
- BeamformerShaderKind compute_stages[MAX_COMPUTE_SHADER_STAGES];
- u32 compute_stages_count;
+ BeamformerShaderParameters compute_shader_parameters[MAX_COMPUTE_SHADER_STAGES];
+ BeamformerShaderKind compute_stages[MAX_COMPUTE_SHADER_STAGES];
+ u32 compute_stages_count;
/* TODO(rnp): hack: we need a different way of dispatching work for export */
b32 start_compute_from_main;
diff --git a/external/cephes.c b/external/cephes.c
@@ -0,0 +1,115 @@
+/*
+Cephes Math Library Release 2.8: June, 2000
+Copyright 1984, 1987, 2000 by Stephen L. Moshier
+*/
+
+/* NOTE(rnp): modified so that the library can be included from a single translation unit
+ * and so that it doesn't use pre-ANSI C declarations */
+
+function f64
+cephes_chbevl(f64 x, f64 *coefficients, i32 n)
+{
+ f64 *p = coefficients;
+ f64 b0 = *p++, b1 = 0.0, b2;
+
+ for (i32 i = n - 1; i > 0; i--) {
+ b2 = b1;
+ b1 = b0;
+ b0 = x * b1 - b2 + *p++;
+ }
+
+ return 0.5 * (b0 - b2);
+}
+
+function f64
+cephes_i0(f64 x)
+{
+ /* Chebyshev coefficients for exp(-x) I0(x)
+ * in the interval [0,8].
+ *
+ * lim(x->0){ exp(-x) I0(x) } = 1.
+ */
+ read_only local_persist f64 A[] = {
+ -4.41534164647933937950E-18,
+ 3.33079451882223809783E-17,
+ -2.43127984654795469359E-16,
+ 1.71539128555513303061E-15,
+ -1.16853328779934516808E-14,
+ 7.67618549860493561688E-14,
+ -4.85644678311192946090E-13,
+ 2.95505266312963983461E-12,
+ -1.72682629144155570723E-11,
+ 9.67580903537323691224E-11,
+ -5.18979560163526290666E-10,
+ 2.65982372468238665035E-9,
+ -1.30002500998624804212E-8,
+ 6.04699502254191894932E-8,
+ -2.67079385394061173391E-7,
+ 1.11738753912010371815E-6,
+ -4.41673835845875056359E-6,
+ 1.64484480707288970893E-5,
+ -5.75419501008210370398E-5,
+ 1.88502885095841655729E-4,
+ -5.76375574538582365885E-4,
+ 1.63947561694133579842E-3,
+ -4.32430999505057594430E-3,
+ 1.05464603945949983183E-2,
+ -2.37374148058994688156E-2,
+ 4.93052842396707084878E-2,
+ -9.49010970480476444210E-2,
+ 1.71620901522208775349E-1,
+ -3.04682672343198398683E-1,
+ 6.76795274409476084995E-1
+ };
+
+ /* Chebyshev coefficients for exp(-x) sqrt(x) I0(x)
+ * in the inverted interval [8,infinity].
+ *
+ * lim(x->inf){ exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi).
+ */
+ read_only local_persist f64 B[] = {
+ -7.23318048787475395456E-18,
+ -4.83050448594418207126E-18,
+ 4.46562142029675999901E-17,
+ 3.46122286769746109310E-17,
+ -2.82762398051658348494E-16,
+ -3.42548561967721913462E-16,
+ 1.77256013305652638360E-15,
+ 3.81168066935262242075E-15,
+ -9.55484669882830764870E-15,
+ -4.15056934728722208663E-14,
+ 1.54008621752140982691E-14,
+ 3.85277838274214270114E-13,
+ 7.18012445138366623367E-13,
+ -1.79417853150680611778E-12,
+ -1.32158118404477131188E-11,
+ -3.14991652796324136454E-11,
+ 1.18891471078464383424E-11,
+ 4.94060238822496958910E-10,
+ 3.39623202570838634515E-9,
+ 2.26666899049817806459E-8,
+ 2.04891858946906374183E-7,
+ 2.89137052083475648297E-6,
+ 6.88975834691682398426E-5,
+ 3.36911647825569408990E-3,
+ 8.04490411014108831608E-1
+ };
+
+ f64 result;
+ if (x < 0) x = -x;
+ if (x <= 8.0) result = exp_f64(x) * cephes_chbevl(x / 2.0 - 2.0, A, 30);
+ else result = exp_f64(x) * cephes_chbevl(32.0 / x - 2.0, B, 25) / sqrt_f64(x);
+ return result;
+}
+
+#if 0
+function
+f64 cephes_i0e(f64 x)
+{
+ f64 result;
+ if (x < 0) x = -x;
+ if (x <= 8.0) result = cephes_chbevl(x / 2.0 - 2.0, A, 30);
+ else result = cephes_chbevl(32.0 / x - 2.0, B, 25) / sqrt_f64(x);
+ return result;
+}
+#endif
diff --git a/helpers/ogl_beamformer_lib.c b/helpers/ogl_beamformer_lib.c
@@ -169,6 +169,39 @@ set_beamformer_pipeline(i32 *stages, u32 stages_count)
return result;
}
+
+b32
+beamformer_set_pipeline_stage_parameters(i32 stage_index, i32 parameter)
+{
+ b32 result = 0;
+ if (check_shared_memory() && g_bp->compute_stages_count != 0) {
+ stage_index %= (i32)g_bp->compute_stages_count;
+ g_bp->compute_shader_parameters[stage_index].filter_slot = (u8)parameter;
+ }
+ return result;
+}
+
+b32
+beamformer_create_kaiser_low_pass_filter(f32 beta, f32 cutoff_frequency, i16 length, u8 slot)
+{
+ b32 result = 0;
+ if (check_shared_memory()) {
+ BeamformWork *work = try_push_work_queue();
+ result = work != 0;
+ if (result) {
+ BeamformerCreateFilterContext *ctx = &work->create_filter_context;
+ work->kind = BeamformerWorkKind_CreateFilter;
+ ctx->kind = BeamformerFilterKind_Kaiser;
+ ctx->cutoff_frequency = cutoff_frequency;
+ ctx->beta = beta;
+ ctx->length = length;
+ ctx->slot = slot % BEAMFORMER_FILTER_SLOTS;
+ beamform_work_queue_push_commit(&g_bp->external_work_queue);
+ }
+ }
+ return result;
+}
+
b32
beamformer_start_compute(i32 timeout_ms)
{
@@ -212,17 +245,17 @@ beamformer_upload_buffer(void *data, u32 size, i32 store_offset, BeamformerUploa
}
#define BEAMFORMER_UPLOAD_FNS \
- X(channel_mapping, i16, 1, ChannelMapping, CHANNEL_MAPPING) \
- X(sparse_elements, i16, 1, SparseElements, SPARSE_ELEMENTS) \
- X(focal_vectors, f32, 2, FocalVectors, FOCAL_VECTORS)
+ X(channel_mapping, i16, 1, ChannelMapping) \
+ X(sparse_elements, i16, 1, SparseElements) \
+ X(focal_vectors, f32, 2, FocalVectors)
-#define X(name, dtype, elements, lock_name, command) \
+#define X(name, dtype, elements, lock_name) \
b32 beamformer_push_##name (dtype *data, u32 count, i32 timeout_ms) { \
b32 result = 0; \
if (count <= countof(g_bp->name)) { \
BeamformerUploadContext uc = {0}; \
uc.shared_memory_offset = offsetof(BeamformerSharedMemory, name); \
- uc.kind = BU_KIND_##command; \
+ uc.kind = BeamformerUploadKind_##lock_name; \
uc.size = count * elements * sizeof(dtype); \
result = beamformer_upload_buffer(data, uc.size, uc.shared_memory_offset, uc, \
BeamformerSharedMemoryLockKind_##lock_name, timeout_ms); \
@@ -242,7 +275,7 @@ beamformer_push_data_base(void *data, u32 data_size, i32 timeout_ms, b32 start_f
BeamformerUploadContext uc = {0};
uc.shared_memory_offset = BEAMFORMER_SCRATCH_OFF;
uc.size = data_size;
- uc.kind = BU_KIND_RF_DATA;
+ uc.kind = BeamformerUploadKind_RFData;
result = beamformer_upload_buffer(data, data_size, uc.shared_memory_offset, uc,
BeamformerSharedMemoryLockKind_ScratchSpace, timeout_ms);
if (result && start_from_main) atomic_store_u32(&g_bp->start_compute_from_main, 1);
@@ -285,7 +318,7 @@ beamformer_push_parameters(BeamformerParameters *bp, i32 timeout_ms)
BeamformerUploadContext uc = {0};
uc.shared_memory_offset = offsetof(BeamformerSharedMemory, parameters);
uc.size = sizeof(g_bp->parameters);
- uc.kind = BU_KIND_PARAMETERS;
+ uc.kind = BeamformerUploadKind_Parameters;
b32 result = beamformer_upload_buffer(bp, sizeof(*bp),
offsetof(BeamformerSharedMemory, parameters), uc,
BeamformerSharedMemoryLockKind_Parameters, timeout_ms);
@@ -298,7 +331,7 @@ beamformer_push_parameters_ui(BeamformerUIParameters *bp, i32 timeout_ms)
BeamformerUploadContext uc = {0};
uc.shared_memory_offset = offsetof(BeamformerSharedMemory, parameters);
uc.size = sizeof(g_bp->parameters);
- uc.kind = BU_KIND_PARAMETERS;
+ uc.kind = BeamformerUploadKind_Parameters;
b32 result = beamformer_upload_buffer(bp, sizeof(*bp),
offsetof(BeamformerSharedMemory, parameters_ui), uc,
BeamformerSharedMemoryLockKind_Parameters, timeout_ms);
@@ -311,7 +344,7 @@ beamformer_push_parameters_head(BeamformerParametersHead *bp, i32 timeout_ms)
BeamformerUploadContext uc = {0};
uc.shared_memory_offset = offsetof(BeamformerSharedMemory, parameters);
uc.size = sizeof(g_bp->parameters);
- uc.kind = BU_KIND_PARAMETERS;
+ uc.kind = BeamformerUploadKind_Parameters;
b32 result = beamformer_upload_buffer(bp, sizeof(*bp),
offsetof(BeamformerSharedMemory, parameters_head), uc,
BeamformerSharedMemoryLockKind_Parameters, timeout_ms);
diff --git a/helpers/ogl_beamformer_lib_base.h b/helpers/ogl_beamformer_lib_base.h
@@ -60,6 +60,28 @@ LIB_FN uint32_t beamformer_push_parameters(BeamformerParameters *, int32_t timeo
LIB_FN uint32_t beamformer_push_parameters_ui(BeamformerUIParameters *, int32_t timeout_ms);
LIB_FN uint32_t beamformer_push_parameters_head(BeamformerParametersHead *, int32_t timeout_ms);
+////////////////////
+// Filter Creation
+
+/* Kaiser Low-Pass Parameter Selection
+ * see: "Discrete Time Signal Processing" (Oppenheim)
+ * δ: fractional passband ripple
+ * ω_p: highest angular frequency of passband
+ * ω_s: lowest angular frequency of stopband
+ * ω_c: cutoff angular frequency. midpoint of ω_s and ω_p
+ * M: length of filter
+ *
+ * Define: A = -20log10(δ)
+ * β:
+ * β = 0.1102(A - 8.7) if 50 < A
+ * β = 0.5842 * pow(A - 21, 0.4) + 0.07886(A − 21) if 21 <= A <= 50
+ * β = 0 if A < 21
+ * M:
+ * M = (A - 8) / (2.285 (ω_s - ω_p))
+ */
+LIB_FN uint32_t beamformer_create_kaiser_low_pass_filter(float beta, f32 cutoff_frequency,
+ int16_t length, uint8_t slot);
+
//////////////////////////
// Live Imaging Controls
LIB_FN int32_t beamformer_live_parameters_get_dirty_flag(void);
diff --git a/intrinsics.c b/intrinsics.c
@@ -47,6 +47,9 @@
#define ceil_f32(a) ceilf(a)
#define sqrt_f32(a) sqrtf(a)
+ #define exp_f64(a) exp(a)
+ #define sqrt_f64(a) sqrt(a)
+
#else
#define align_as(n) __attribute__((aligned(n)))
#define pack_struct(s) s __attribute__((packed))
@@ -81,6 +84,8 @@
#define ceil_f32(a) __builtin_ceilf(a)
#define sqrt_f32(a) __builtin_sqrtf(a)
+ #define exp_f64(a) __builtin_exp(a)
+ #define sqrt_f64(a) __builtin_sqrt(a)
#endif
#if COMPILER_MSVC
diff --git a/math.c b/math.c
@@ -1,3 +1,5 @@
+#include "external/cephes.c"
+
function void
fill_kronecker_sub_matrix(i32 *out, i32 out_stride, i32 scale, i32 *b, iv2 b_dim)
{
@@ -119,6 +121,26 @@ make_hadamard_transpose(Arena *a, i32 dim)
return result;
}
+/* NOTE(rnp): adapted from "Discrete Time Signal Processing" (Oppenheim) */
+function f32 *
+kaiser_low_pass_filter(Arena *arena, f32 cutoff_frequency, f32 sampling_frequency, f32 beta, i32 length)
+{
+ f32 *result = push_array(arena, f32, length);
+ f32 wc = 2 * PI * cutoff_frequency / sampling_frequency;
+ f32 a = (f32)length / 2.0f;
+ f32 pi_i0_b = PI * (f32)cephes_i0(beta);
+
+ for (i32 n = 0; n < length; n++) {
+ f32 t = (f32)n - a;
+ f32 impulse = !f32_cmp(t, 0) ? sin_f32(wc * t) / t : 1;
+ t = t / a;
+ f32 window = (f32)cephes_i0(beta * sqrt_f32(1 - t * t)) / pi_i0_b;
+ result[n] = impulse * window;
+ }
+
+ return result;
+}
+
function b32
iv2_equal(iv2 a, iv2 b)
{
diff --git a/opengl.h b/opengl.h
@@ -26,6 +26,7 @@
#define GL_MAJOR_VERSION 0x821B
#define GL_MINOR_VERSION 0x821C
#define GL_RG 0x8227
+#define GL_R32F 0x822E
#define GL_RG32F 0x8230
#define GL_R8I 0x8231
#define GL_R16I 0x8233
diff --git a/shaders/das.glsl b/shaders/das.glsl
@@ -20,6 +20,18 @@ layout(rg32f, binding = 2) readonly restrict uniform image1D focal_vectors;
#define TX_MODE_TX_COLS(a) (((a) & 2) != 0)
#define TX_MODE_RX_COLS(a) (((a) & 1) != 0)
+vec2 rotate_iq(vec2 iq, float time)
+{
+ vec2 result = iq;
+ if (center_frequency > 0) {
+ float arg = radians(360) * center_frequency * time;
+ mat2 phasor = mat2( cos(arg), sin(arg),
+ -sin(arg), cos(arg));
+ result = phasor * iq;
+ }
+ return result;
+}
+
/* NOTE: See: https://cubic.org/docs/hermite.htm */
vec2 cubic(int base_index, float x)
{
@@ -47,22 +59,26 @@ vec2 cubic(int base_index, float x)
vec2 T2 = C_SPLINE * (samples[3] - P1);
mat2x4 C = mat2x4(vec4(P1.x, P2.x, T1.x, T2.x), vec4(P1.y, P2.y, T1.y, T2.y));
- return S * h * C;
+ float fs = sampling_frequency / decimation_rate;
+ vec2 result = rotate_iq(S * h * C, x / fs);
+ return result;
}
vec2 sample_rf(int channel, int transmit, float t)
{
vec2 result;
- int base_index = int(channel * dec_data_dim.x * dec_data_dim.z + transmit * dec_data_dim.x);
+ float fs = sampling_frequency / decimation_rate;
+ int base_index = int(channel * dec_data_dim.x * dec_data_dim.z + transmit * dec_data_dim.x);
if (interpolate) result = cubic(base_index, t);
- else result = rf_data[base_index + int(floor(t))];
+ else result = rotate_iq(rf_data[base_index + int(round(t))], t / fs);
return result;
}
float sample_index(float distance)
{
+ float fs = sampling_frequency / decimation_rate;
float time = distance / speed_of_sound + time_offset;
- return time * sampling_frequency;
+ return time * fs;
}
float apodize(float arg)
diff --git a/shaders/demod.glsl b/shaders/demod.glsl
@@ -1,6 +1,4 @@
/* See LICENSE for license details. */
-layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
-
layout(std430, binding = 1) readonly restrict buffer buffer_1 {
vec2 in_data[];
};
@@ -9,34 +7,27 @@ layout(std430, binding = 2) writeonly restrict buffer buffer_2 {
vec2 out_data[];
};
-layout(std430, binding = 3) readonly restrict buffer buffer_3 {
- float filter_coefficients[];
-};
-
-layout(location = 0) uniform uint u_filter_order = 0;
+layout(r32f, binding = 0) readonly restrict uniform image1D filter_coefficients;
void main()
{
- uint time_sample = gl_GlobalInvocationID.x;
- uint channel = gl_GlobalInvocationID.y;
- uint acq = gl_GlobalInvocationID.z;
-
- /* NOTE: offsets for storing the results in the output data */
- uint stride = dec_data_dim.x * dec_data_dim.y;
- uint off = dec_data_dim.x * channel + stride * acq + time_sample;
-
- /* NOTE: for calculating full-band I-Q data; needs to be stepped in loop */
- float arg = radians(360) * center_frequency * time_sample / sampling_frequency;
- float arg_delta = radians(360) * center_frequency / sampling_frequency;
-
- vec2 sum = vec2(0);
- for (int i = 0; i <= u_filter_order; i++) {
- vec2 data;
- /* NOTE: make sure data samples come from the same acquisition */
- if (time_sample >= i) data = in_data[off - i].xx * vec2(cos(arg), sin(arg));
- else data = vec2(0);
- sum += filter_coefficients[i] * data;
- arg -= arg_delta;
+ uint in_sample = gl_GlobalInvocationID.x * decimation_rate;
+ uint out_sample = gl_GlobalInvocationID.x;
+ uint channel = gl_GlobalInvocationID.y;
+ uint transmit = gl_GlobalInvocationID.z;
+
+ uint in_offset = (dec_data_dim.x * dec_data_dim.z * channel + dec_data_dim.x * transmit);
+ uint out_offset = (dec_data_dim.x * dec_data_dim.z * channel + dec_data_dim.x * transmit) + out_sample;
+
+ float arg = radians(360) * center_frequency / sampling_frequency;
+ vec2 result = vec2(0);
+ for (int i = 0; i < imageSize(filter_coefficients).x; i++) {
+ int index = int(in_sample + i);
+ if (index < dec_data_dim.x) {
+ float data = in_data[in_offset + index].x;
+ vec2 iq = sqrt(2.0f) * data * vec2(cos(arg * index), -sin(arg * index));
+ result += imageLoad(filter_coefficients, imageSize(filter_coefficients).x - i - 1).x * iq;
+ }
}
- out_data[off] = sum;
+ out_data[out_offset] = result;
}
diff --git a/tests/throughput.c b/tests/throughput.c
@@ -47,7 +47,7 @@ typedef struct {
f32 center_frequency;
f32 sampling_frequency;
f32 time_offset;
- u32 transmit_mode;
+ i32 transmit_mode;
} zemp_bp_v1;
global b32 g_should_exit;
@@ -203,7 +203,7 @@ fill_beamformer_parameters_from_zemp_bp_v1(zemp_bp_v1 *zbp, BeamformerParameters
mem_copy(out->xdc_element_pitch, zbp->xdc_element_pitch, sizeof(out->xdc_element_pitch));
mem_copy(out->rf_raw_dim, zbp->raw_data_dim, sizeof(out->rf_raw_dim));
- out->transmit_mode = (i32)zbp->transmit_mode;
+ out->transmit_mode = zbp->transmit_mode;
out->decode = zbp->decode_mode;
out->das_shader_id = zbp->beamform_mode;
out->time_offset = zbp->time_offset;