ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

Commit: b6bd87b30496428a650e027307ee11d0d7e6e332
Parent: 6c7f2e251664fa9a7991bf709392838e3d259b54
Author: Randy Palamar
Date:   Fri, 25 Jul 2025 13:30:53 -0600

core: treat data as IQ when demodulating

All the alternate sampling modes work if you just pretend samples
are IQ pairs and rotate the Q sample back by 90 degrees.

Diffstat:
Mbeamformer.c | 165+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
Mbeamformer.h | 3+++
Mbeamformer_parameters.h | 5+++--
Mopengl.h | 1+
Mshaders/decode.glsl | 63++++++++++++++++++++++++++++++++++++++-------------------------
Mshaders/demod.glsl | 66++++++++++++++++++++++++++++++++++++++++--------------------------
6 files changed, 183 insertions(+), 120 deletions(-)

diff --git a/beamformer.c b/beamformer.c @@ -1,5 +1,11 @@ /* See LICENSE for license details. */ /* TODO(rnp): + * [ ]: make decode output real values for real inputs and complex values for complex inputs + * - this means that das should have a RF version and an IQ version + * - this will also flip the current hack to support demodulate after decode to + * being a hack to support CudaHilbert after decode + * [ ]: measure filtering performance when indexing forward on RF data + * [ ]: filter sampling frequency should be a filter creation parameter * [ ]: reinvestigate ring buffer raw_data_ssbo * - to minimize latency the main thread should manage the subbuffer upload so that the * compute thread can just keep computing. This way we can keep the copmute thread busy @@ -71,7 +77,7 @@ beamformer_filter_time_offset(BeamformerFilter *f) f32 result = 0; switch (f->kind) { case BeamformerFilterKind_Kaiser:{ - result = -(f32)f->length / 2.0f / f->sampling_frequency; + result = (f32)f->length / 2.0f / f->sampling_frequency; }break; InvalidDefaultCase; } @@ -313,7 +319,6 @@ plan_compute_pipeline(SharedMemoryRegion *os_sm, BeamformerComputePipeline *cp, os_shared_memory_region_lock(os_sm, sm->locks, compute_lock, (u32)-1); b32 decode_first = sm->shaders[0] == BeamformerShaderKind_Decode; - b32 demod_first = sm->shaders[0] == BeamformerShaderKind_Demodulate; b32 cuda_hilbert = 0; b32 demodulate = 0; @@ -347,7 +352,10 @@ plan_compute_pipeline(SharedMemoryRegion *os_sm, BeamformerComputePipeline *cp, [BeamformerDataKind_Float32] = BeamformerShaderKind_DecodeFloat, [BeamformerDataKind_Float32Complex] = BeamformerShaderKind_DecodeFloatComplex, }; - if (decode_first) { + if (decode_first && demodulate) { + /* TODO(rnp): for now we assume that if we are demodulating the data is int16 */ + shader = BeamformerShaderKind_DecodeInt16ToFloat; + } else if (decode_first) { shader = decode_table[CLAMP(data_kind, 0, countof(decode_table) - 1)]; } else { if (data_kind == BeamformerDataKind_Int16) @@ -358,7 +366,7 @@ plan_compute_pipeline(SharedMemoryRegion *os_sm, BeamformerComputePipeline *cp, commit = 1; }break; case BeamformerShaderKind_Demodulate:{ - if (!demod_first || (demod_first && data_kind == BeamformerDataKind_Float32)) + if (decode_first || (!decode_first && data_kind == BeamformerDataKind_Float32)) shader = BeamformerShaderKind_DemodulateFloat; bp->time_offset += beamformer_filter_time_offset(filters + sp->filter_slot); commit = 1; @@ -379,58 +387,89 @@ plan_compute_pipeline(SharedMemoryRegion *os_sm, BeamformerComputePipeline *cp, } os_shared_memory_region_unlock(os_sm, sm->locks, compute_lock); + u32 time_compression = 1; + if (demodulate) time_compression = 2; + + if (!demodulate) bp->center_frequency = 0; + bp->decimation_rate = MAX(bp->decimation_rate, 1); + + cp->decode_dispatch.x = (u32)ceil_f32((f32)bp->dec_data_dim[0] / DECODE_LOCAL_SIZE_X); + cp->decode_dispatch.y = (u32)ceil_f32((f32)bp->dec_data_dim[1] / DECODE_LOCAL_SIZE_Y); + cp->decode_dispatch.z = (u32)ceil_f32((f32)bp->dec_data_dim[2] / DECODE_LOCAL_SIZE_Z); + + /* NOTE(rnp): decode 2 samples per dispatch when data is i16 */ + if (decode_first && cp->data_kind == BeamformerDataKind_Int16) + cp->decode_dispatch.x = (u32)ceil_f32((f32)cp->decode_dispatch.x / 2); + BeamformerDecodeUBO *dp = &cp->decode_ubo_data; dp->decode_mode = bp->decode; dp->transmit_count = bp->dec_data_dim[2]; - bp->decimation_rate = MAX(bp->decimation_rate, 1); - if (decode_first) { dp->input_channel_stride = bp->rf_raw_dim[0]; dp->input_sample_stride = 1; dp->input_transmit_stride = bp->dec_data_dim[0]; - dp->output_channel_stride = bp->dec_data_dim[0] * bp->dec_data_dim[2]; + dp->output_channel_stride = bp->dec_data_dim[0] * bp->dec_data_dim[2] / time_compression; + dp->output_sample_stride = 1; + dp->output_transmit_stride = bp->dec_data_dim[0] / time_compression; + } else { + dp->input_channel_stride = bp->dec_data_dim[0] * bp->dec_data_dim[2] / + bp->decimation_rate / time_compression; + dp->input_sample_stride = bp->dec_data_dim[2]; + dp->input_transmit_stride = 1; + + dp->output_channel_stride = dp->input_channel_stride; dp->output_sample_stride = 1; - dp->output_transmit_stride = bp->dec_data_dim[0]; + dp->output_transmit_stride = bp->dec_data_dim[0] / bp->decimation_rate / time_compression; } + /* NOTE(rnp): when we are demodulating we pretend that the sampler was alternating + * between sampling the I portion and the Q portion of an IQ signal. Therefore there + * is an implicit decimation factor of 2 which must always be included. All code here + * assumes that the signal was sampled in such a way that supports this operation. + * To recover IQ[n] from the sampled data (RF[n]) we do the following: + * I[n] = RF[n] + * Q[n] = RF[n + 1] + * IQ[n] = I[n] - j*Q[n] + */ if (demodulate) { BeamformerDemodulateUBO *mp = &cp->demod_ubo_data; - mp->sampling_frequency = bp->sampling_frequency; mp->demodulation_frequency = bp->center_frequency; + mp->sampling_frequency = bp->sampling_frequency / (f32)time_compression; mp->decimation_rate = bp->decimation_rate; + mp->map_channels = !decode_first; - bp->sampling_frequency /= (f32)mp->decimation_rate; - bp->dec_data_dim[0] /= mp->decimation_rate; + if (decode_first) { + mp->input_channel_stride = dp->output_channel_stride; + mp->input_sample_stride = dp->output_sample_stride; + mp->input_transmit_stride = dp->output_transmit_stride; - mp->input_sample_stride = 1; - mp->input_transmit_stride = bp->dec_data_dim[0] * mp->decimation_rate; - mp->output_channel_stride = bp->dec_data_dim[0] * bp->dec_data_dim[2]; - - if (demod_first) { - /* NOTE(rnp): output optimized decode layout to skip first pass */ - mp->input_channel_stride = bp->rf_raw_dim[0]; - mp->output_sample_stride = bp->dec_data_dim[2]; - mp->output_transmit_stride = 1; - mp->map_channels = 1; + mp->output_channel_stride = bp->dec_data_dim[0] * bp->dec_data_dim[2] / + mp->decimation_rate / time_compression; + mp->output_sample_stride = 1; + mp->output_transmit_stride = bp->dec_data_dim[0] / mp->decimation_rate / time_compression; + } else { + mp->input_channel_stride = bp->rf_raw_dim[0] / time_compression; + mp->input_sample_stride = 1; + mp->input_transmit_stride = bp->dec_data_dim[0] / time_compression; - dp->input_channel_stride = mp->output_channel_stride; - dp->input_sample_stride = mp->output_sample_stride; - dp->input_transmit_stride = mp->output_transmit_stride; + /* NOTE(rnp): output optimized layout for decoding */ + mp->output_channel_stride = dp->input_channel_stride; + mp->output_sample_stride = dp->input_sample_stride; + mp->output_transmit_stride = dp->input_transmit_stride; - dp->output_channel_stride = bp->dec_data_dim[0] * bp->dec_data_dim[2]; - dp->output_sample_stride = 1; - dp->output_transmit_stride = bp->dec_data_dim[0]; - } else { - mp->input_channel_stride = dp->output_channel_stride; - mp->output_sample_stride = 1; - mp->output_transmit_stride = bp->dec_data_dim[0]; - mp->map_channels = 0; + u32 time_samples = bp->dec_data_dim[0] / mp->decimation_rate / time_compression; + cp->decode_dispatch.x = (u32)ceil_f32((f32)time_samples / DECODE_LOCAL_SIZE_X); } - } else { - bp->center_frequency = 0; - bp->decimation_rate = 1; + + f32 local_size_x = DEMOD_LOCAL_SIZE_X * (f32)time_compression * (f32)mp->decimation_rate; + cp->demod_dispatch.x = (u32)ceil_f32((f32)bp->dec_data_dim[0] / local_size_x); + cp->demod_dispatch.y = (u32)ceil_f32((f32)bp->dec_data_dim[1] / DEMOD_LOCAL_SIZE_Y); + cp->demod_dispatch.z = (u32)ceil_f32((f32)bp->dec_data_dim[2] / DEMOD_LOCAL_SIZE_Z); + + bp->sampling_frequency /= (f32)mp->decimation_rate * (f32)time_compression; + bp->dec_data_dim[0] /= mp->decimation_rate * time_compression; } } @@ -489,40 +528,32 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformerComputeFrame *frame case BeamformerShaderKind_DecodeInt16Complex: case BeamformerShaderKind_DecodeFloat: case BeamformerShaderKind_DecodeFloatComplex: + case BeamformerShaderKind_DecodeInt16ToFloat: { glBindBufferBase(GL_UNIFORM_BUFFER, 0, cp->ubos[BeamformerComputeUBOKind_Decode]); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, csctx->rf_data_ssbos[output_ssbo_idx]); glBindImageTexture(0, csctx->hadamard_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8I); - /* NOTE(rnp): decode 2 samples per dispatch when data is i16 */ - f32 local_size_x = (f32)DECODE_LOCAL_SIZE_X; - if (shader == BeamformerShaderKind_Decode) - local_size_x *= 2; - - uv3 dim = csctx->dec_data_dim.xyz; iz raw_size = csctx->rf_raw_size; if (shader == cp->shaders[0]) { glBindImageTexture(1, csctx->channel_mapping_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I); glProgramUniform1ui(program, DECODE_FIRST_PASS_UNIFORM_LOC, 1); glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo, 0, raw_size); glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 2, csctx->raw_data_ssbo, raw_size, raw_size); - glDispatchCompute((u32)ceil_f32((f32)dim.x / local_size_x), - (u32)ceil_f32((f32)dim.y / DECODE_LOCAL_SIZE_Y), - (u32)ceil_f32((f32)dim.z / DECODE_LOCAL_SIZE_Z)); + glDispatchCompute(cp->decode_dispatch.x, cp->decode_dispatch.y, cp->decode_dispatch.z); glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); } if (shader == cp->shaders[0]) { glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, csctx->raw_data_ssbo, raw_size, raw_size); } else { - dim = uv4_from_u32_array(cp->das_ubo_data.dec_data_dim).xyz; glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, csctx->rf_data_ssbos[input_ssbo_idx]); } glProgramUniform1ui(program, DECODE_FIRST_PASS_UNIFORM_LOC, 0); - glDispatchCompute((u32)ceil_f32((f32)dim.x / local_size_x), - (u32)ceil_f32((f32)dim.y / DECODE_LOCAL_SIZE_Y), - (u32)ceil_f32((f32)dim.z / DECODE_LOCAL_SIZE_Z)); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, csctx->rf_data_ssbos[output_ssbo_idx]); + + glDispatchCompute(cp->decode_dispatch.x, cp->decode_dispatch.y, cp->decode_dispatch.z); glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index; @@ -551,10 +582,7 @@ do_compute_shader(BeamformerCtx *ctx, Arena arena, BeamformerComputeFrame *frame if (ubo->map_channels) glBindImageTexture(1, csctx->channel_mapping_texture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16I); - f32 local_size_x = (f32)(DEMOD_LOCAL_SIZE_X * (f32)ubo->decimation_rate); - glDispatchCompute((u32)ceil_f32((f32)csctx->dec_data_dim.x / local_size_x), - (u32)ceil_f32((f32)csctx->dec_data_dim.y / DEMOD_LOCAL_SIZE_Y), - (u32)ceil_f32((f32)csctx->dec_data_dim.z / DEMOD_LOCAL_SIZE_Z)); + glDispatchCompute(cp->demod_dispatch.x, cp->demod_dispatch.y, cp->demod_dispatch.z); glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); csctx->last_output_ssbo_index = !csctx->last_output_ssbo_index; @@ -726,24 +754,19 @@ shader_text_with_header(ShaderReloadContext *ctx, OS *os, Arena *arena) #undef X }break; case BeamformerShaderKind_Decode: - case BeamformerShaderKind_DecodeInt16Complex: case BeamformerShaderKind_DecodeFloat: case BeamformerShaderKind_DecodeFloatComplex: + case BeamformerShaderKind_DecodeInt16Complex: + case BeamformerShaderKind_DecodeInt16ToFloat: { - switch (ctx->kind) { - case BeamformerShaderKind_DecodeInt16Complex:{ - stream_append_s8(&sb, s8("#define INPUT_DATA_TYPE_INT16_COMPLEX\n\n")); - }break; - case BeamformerShaderKind_DecodeFloat:{ - stream_append_s8(&sb, s8("#define INPUT_DATA_TYPE_FLOAT\n\n")); - }break; - case BeamformerShaderKind_DecodeFloatComplex:{ - stream_append_s8(&sb, s8("#define INPUT_DATA_TYPE_FLOAT_COMPLEX\n\n")); - }break; - default:{}break; - } + s8 define_table[] = { + [BeamformerShaderKind_DecodeFloatComplex] = s8("#define INPUT_DATA_TYPE_FLOAT_COMPLEX\n\n"), + [BeamformerShaderKind_DecodeFloat] = s8("#define INPUT_DATA_TYPE_FLOAT\n\n"), + [BeamformerShaderKind_DecodeInt16Complex] = s8("#define INPUT_DATA_TYPE_INT16_COMPLEX\n\n"), + [BeamformerShaderKind_DecodeInt16ToFloat] = s8("#define OUTPUT_DATA_TYPE_FLOAT\n\n"), + }; #define X(type, id, pretty) "#define DECODE_MODE_" #type " " #id "\n" - stream_append_s8(&sb, s8("" + stream_append_s8s(&sb, define_table[ctx->kind], s8("" "layout(local_size_x = " str(DECODE_LOCAL_SIZE_X) ", " "local_size_y = " str(DECODE_LOCAL_SIZE_Y) ", " "local_size_z = " str(DECODE_LOCAL_SIZE_Z) ") in;\n\n" @@ -847,6 +870,10 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co src->shader = cs->programs + src->kind; success &= reload_compute_shader(ctx, src, s8(" (I16C)"), arena); + src->kind = BeamformerShaderKind_DecodeInt16ToFloat; + src->shader = cs->programs + src->kind; + success &= reload_compute_shader(ctx, src, s8(" (I16-F32)"), arena); + src->kind = BeamformerShaderKind_Decode; src->shader = cs->programs + src->kind; }break; @@ -899,7 +926,7 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co }break; case BeamformerWorkKind_CreateFilter:{ BeamformerCreateFilterContext *fctx = &work->create_filter_context; - beamformer_filter_update(cs->filters + fctx->slot, fctx, sm->parameters.sampling_frequency, arena); + beamformer_filter_update(cs->filters + fctx->slot, fctx, sm->parameters.sampling_frequency / 2, arena); }break; case BeamformerWorkKind_UploadBuffer:{ os_shared_memory_region_lock(&ctx->shared_memory, sm->locks, (i32)work->lock, (u32)-1); @@ -962,6 +989,10 @@ complete_queue(BeamformerCtx *ctx, BeamformWorkQueue *q, Arena arena, iptr gl_co DEBUG_DECL(work->kind = BeamformerWorkKind_ComputeIndirect;) } /* FALLTHROUGH */ case BeamformerWorkKind_Compute:{ + DEBUG_DECL(glClearNamedBufferData(cs->rf_data_ssbos[0], GL_RG32F, GL_RG, GL_FLOAT, 0);) + DEBUG_DECL(glClearNamedBufferData(cs->rf_data_ssbos[1], GL_RG32F, GL_RG, GL_FLOAT, 0);) + DEBUG_DECL(glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);) + push_compute_timing_info(ctx->compute_timing_table, (ComputeTimingInfo){.kind = ComputeTimingInfoKind_ComputeFrameBegin}); diff --git a/beamformer.h b/beamformer.h @@ -162,6 +162,9 @@ typedef struct { i32 shader_count; BeamformerDataKind data_kind; + uv3 decode_dispatch; + uv3 demod_dispatch; + u32 ubos[BeamformerComputeUBOKind_Count]; #define X(k, type, name) type name ##_ubo_data; diff --git a/beamformer_parameters.h b/beamformer_parameters.h @@ -25,8 +25,9 @@ X(DecodeInt16Complex, 7, "", "Decode (I16C)") \ X(DecodeFloat, 8, "", "Decode (F32)") \ X(DecodeFloatComplex, 9, "", "Decode (F32C)") \ - X(DemodulateFloat, 10, "", "Demodulate (F32)") \ - X(DASFast, 11, "", "DAS (Fast)") + X(DecodeInt16ToFloat, 10, "", "Decode (I16-F32)") \ + X(DemodulateFloat, 11, "", "Demodulate (F32)") \ + X(DASFast, 12, "", "DAS (Fast)") typedef enum { #define X(e, n, ...) BeamformerShaderKind_##e = n, diff --git a/opengl.h b/opengl.h @@ -77,6 +77,7 @@ typedef uint64_t GLuint64; X(glBindTextureUnit, void, (GLuint unit, GLuint texture)) \ X(glBindVertexArray, void, (GLuint array)) \ X(glBlitNamedFramebuffer, void, (GLuint sfb, GLuint dfb, GLint sx0, GLint sy0, GLint sx1, GLint sy1, GLint dx0, GLint dy0, GLint dx1, GLint dy1, GLbitfield mask, GLenum filter)) \ + X(glClearNamedBufferData, void, (GLuint buffer, GLenum internalformat, GLenum format, GLenum type, const void *data)) \ X(glClearNamedFramebufferfv, void, (GLuint framebuffer, GLenum buffer, GLint drawbuffer, const GLfloat *value)) \ X(glClearTexImage, void, (GLuint texture, GLint level, GLenum format, GLenum type, const void *data)) \ X(glCompileShader, void, (GLuint shader)) \ diff --git a/shaders/decode.glsl b/shaders/decode.glsl @@ -28,11 +28,15 @@ #define SAMPLE_TYPE_CAST(x) vec2(((x) << 16) >> 16, (x) >> 16) #else #define INPUT_DATA_TYPE int - #define RF_SAMPLES_PER_INDEX 2 #define RESULT_TYPE_CAST(x) (x) - #define SAMPLE_DATA_TYPE vec4 /* NOTE(rnp): for i16 rf_data we decode 2 samples at once */ - #define SAMPLE_TYPE_CAST(x) vec4(((x) << 16) >> 16, 0, (x) >> 16, 0) + #define RF_SAMPLES_PER_INDEX 2 + #define SAMPLE_DATA_TYPE vec4 + #if defined(OUTPUT_DATA_TYPE_FLOAT) + #define SAMPLE_TYPE_CAST(x) vec4(((x) << 16) >> 16, (x) >> 16, 0, 0) + #else + #define SAMPLE_TYPE_CAST(x) vec4(((x) << 16) >> 16, 0, (x) >> 16, 0) + #endif #endif layout(std430, binding = 1) readonly restrict buffer buffer_1 { @@ -64,30 +68,39 @@ void main() uint rf_offset = (input_channel_stride * channel + transmit_count * time_sample) / RF_SAMPLES_PER_INDEX; if (u_first_pass) { - uint in_off = input_channel_stride * imageLoad(channel_mapping, int(channel)).x + - input_transmit_stride * transmit + - input_sample_stride * time_sample; - out_rf_data[rf_offset + transmit] = rf_data[in_off / RF_SAMPLES_PER_INDEX]; + if (time_sample < input_transmit_stride) { + uint in_off = input_channel_stride * imageLoad(channel_mapping, int(channel)).x + + input_transmit_stride * transmit + + input_sample_stride * time_sample; + out_rf_data[rf_offset + transmit] = rf_data[in_off / RF_SAMPLES_PER_INDEX]; + } } else { - uint out_off = output_channel_stride * channel + - output_transmit_stride * transmit + - output_sample_stride * time_sample; + #if defined(OUTPUT_DATA_TYPE_FLOAT) + /* NOTE(rnp): when outputting floats do not dilate the out time sample; + * output should end up densely packed */ + time_sample = gl_GlobalInvocationID.x; + #endif + if (time_sample < output_transmit_stride) { + uint out_off = output_channel_stride * channel + + output_transmit_stride * transmit + + output_sample_stride * time_sample; - vec4 result = vec4(0); - switch (decode_mode) { - case DECODE_MODE_NONE: { - result = RESULT_TYPE_CAST(sample_rf_data(rf_offset + transmit)); - } break; - case DECODE_MODE_HADAMARD: { - SAMPLE_DATA_TYPE sum = SAMPLE_DATA_TYPE(0); - for (int i = 0; i < transmit_count; i++) - sum += imageLoad(hadamard, ivec2(i, transmit)).x * sample_rf_data(rf_offset++); - result = RESULT_TYPE_CAST(sum) / float(transmit_count); - } break; + vec4 result = vec4(0); + switch (decode_mode) { + case DECODE_MODE_NONE: { + result = RESULT_TYPE_CAST(sample_rf_data(rf_offset + transmit)); + } break; + case DECODE_MODE_HADAMARD: { + SAMPLE_DATA_TYPE sum = SAMPLE_DATA_TYPE(0); + for (int i = 0; i < imageSize(hadamard).x; i++) + sum += imageLoad(hadamard, ivec2(i, transmit)).x * sample_rf_data(rf_offset++); + result = RESULT_TYPE_CAST(sum) / float(imageSize(hadamard).x); + } break; + } + out_data[out_off + 0] = result.xy; + #if RF_SAMPLES_PER_INDEX == 2 && !defined(OUTPUT_DATA_TYPE_FLOAT) + out_data[out_off + 1] = result.zw; + #endif } - out_data[out_off + 0] = result.xy; - #if RF_SAMPLES_PER_INDEX == 2 - out_data[out_off + 1] = result.zw; - #endif } } diff --git a/shaders/demod.glsl b/shaders/demod.glsl @@ -1,39 +1,44 @@ /* See LICENSE for license details. */ #if defined(INPUT_DATA_TYPE_FLOAT) - #define INPUT_DATA_TYPE vec2 - #define RF_SAMPLES_PER_INDEX 1 - #define RESULT_TYPE_CAST(x) (x) - #define SAMPLE_TYPE_CAST(v, i) (v).x + #define DATA_TYPE vec2 + #define RESULT_TYPE_CAST(v) (v) + #define SAMPLE_TYPE_CAST(v) (v) #else - #define INPUT_DATA_TYPE uint - #define RF_SAMPLES_PER_INDEX 2 - #define RESULT_TYPE_CAST(v) packSnorm2x16(v) - #define SAMPLE_TYPE_CAST(v, i) unpackSnorm2x16(v)[(~(i)) & 1u] - /* NOTE(rnp): for outputting complex floats */ - //#define RESULT_TYPE_CAST(v) (v) - //#define SAMPLE_TYPE_CAST(v, i) unpackSnorm2x16(v)[(~(i)) & 1u] * 32767.0f + #define DATA_TYPE uint + #define RESULT_TYPE_CAST(v) packSnorm2x16(v) + #define SAMPLE_TYPE_CAST(v) unpackSnorm2x16(v) #endif layout(std430, binding = 1) readonly restrict buffer buffer_1 { - INPUT_DATA_TYPE in_data[]; + DATA_TYPE in_data[]; }; layout(std430, binding = 2) writeonly restrict buffer buffer_2 { - INPUT_DATA_TYPE out_data[]; + DATA_TYPE out_data[]; }; layout(r32f, binding = 0) readonly restrict uniform image1D filter_coefficients; layout(r16i, binding = 1) readonly restrict uniform iimage1D channel_mapping; -float sample_rf(uint index) +vec2 rotate_iq(vec2 iq, uint index) { - float result = SAMPLE_TYPE_CAST(in_data[index], index); + float arg = radians(360) * demodulation_frequency * index / sampling_frequency; + mat2 phasor = mat2( cos(arg), sin(arg), + -sin(arg), cos(arg)); + vec2 result = phasor * iq; + return result; +} + +vec2 sample_rf(uint index) +{ + vec2 result = SAMPLE_TYPE_CAST(in_data[index]); return result; } void main() { uint in_sample = gl_GlobalInvocationID.x * decimation_rate; + uint out_sample = gl_GlobalInvocationID.x; uint channel = gl_GlobalInvocationID.y; uint transmit = gl_GlobalInvocationID.z; @@ -41,17 +46,26 @@ void main() uint in_offset = input_channel_stride * in_channel + input_transmit_stride * transmit; uint out_offset = output_channel_stride * channel + output_transmit_stride * transmit + - output_sample_stride * gl_GlobalInvocationID.x; - - float arg = radians(360) * demodulation_frequency / sampling_frequency; - vec2 result = vec2(0); - for (int i = 0; i < imageSize(filter_coefficients).x; i++) { - uint index = in_sample + i; - if (index < input_transmit_stride) { - float data = sample_rf((in_offset + index) / RF_SAMPLES_PER_INDEX); - vec2 iq = sqrt(2.0f) * data * vec2(cos(arg * index), -sin(arg * index)); - result += imageLoad(filter_coefficients, imageSize(filter_coefficients).x - i - 1).x * iq; + output_sample_stride * out_sample; + + bool in_bounds; + if (map_channels) { + in_bounds = out_sample < output_channel_stride / output_sample_stride; + } else { + in_bounds = out_sample < output_transmit_stride; + } + + if (in_bounds) { + vec2 result = vec2(0); + int index = int(in_sample); + for (int i = 0; + i < imageSize(filter_coefficients).x && index >= 0; + i++, index -= int(input_sample_stride)) + { + vec2 data = sample_rf(in_offset + index) * vec2(1, -1); + vec2 iq = sqrt(2.0f) * rotate_iq(data, index); + result += imageLoad(filter_coefficients, i).x * iq; } + out_data[out_offset] = RESULT_TYPE_CAST(result); } - out_data[out_offset] = RESULT_TYPE_CAST(result); }