ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

Commit: 0210c211e7d92d032c297a66e3b9597ab12dab8b
Parent: 68e5130078be9c42dc509eadbf0bc960c86e5b63
Author: Randy Palamar
Date:   Mon, 20 Apr 2026 10:09:46 -0600

core/decode: store hadamard as f16

This doesn't result in any measurable performance difference in
OpenGL; it is simply pulling back code from the Vulkan branch.

Diffstat:
Mbeamformer_core.c | 12++++++------
Mbeamformer_internal.h | 2+-
Mmath.c | 52+++++++++++++++++++++++++++++++---------------------
Mopengl.h | 1+
Mshaders/decode.glsl | 2+-
Mutil.h | 22++++++++++++----------
6 files changed, 52 insertions(+), 39 deletions(-)

diff --git a/beamformer_core.c b/beamformer_core.c @@ -242,16 +242,16 @@ alloc_beamform_frame(BeamformerFrame *out, iv3 out_dim, GLenum gl_kind, s8 name, } function void -update_hadamard_texture(BeamformerComputePlan *cp, i32 order, Arena arena) +update_hadamard_texture(BeamformerComputePlan *cp, i32 order, b32 row_major, Arena arena) { - f32 *hadamard = make_hadamard_transpose(&arena, order); + f16 *hadamard = make_hadamard_transpose(&arena, order, row_major); if (hadamard) { cp->hadamard_order = order; u32 *texture = cp->textures + BeamformerComputeTextureKind_Hadamard; glDeleteTextures(1, texture); glCreateTextures(GL_TEXTURE_2D, 1, texture); - glTextureStorage2D(*texture, 1, GL_R32F, order, order); - glTextureSubImage2D(*texture, 0, 0, 0, order, order, GL_RED, GL_FLOAT, hadamard); + glTextureStorage2D(*texture, 1, GL_R16F, order, order); + glTextureSubImage2D(*texture, 0, 0, 0, order, order, GL_RED, GL_SHORT, hadamard); Stream label = arena_stream(arena); stream_append_s8(&label, s8("Hadamard")); @@ -788,7 +788,7 @@ beamformer_commit_parameter_block(BeamformerCtx *ctx, BeamformerComputePlan *cp, alloc_shader_storage(ctx, decoded_data_size, arena); if (cp->hadamard_order != (i32)cp->acquisition_count) - update_hadamard_texture(cp, (i32)cp->acquisition_count, arena); + update_hadamard_texture(cp, (i32)cp->acquisition_count, 0, arena); mem_copy(cp->voxel_transform.E, pb->parameters.das_voxel_transform.E, sizeof(cp->voxel_transform)); @@ -842,7 +842,7 @@ do_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, BeamformerFrame uv3 dispatch = cp->shader_descriptors[shader_slot].dispatch; switch (shader) { case BeamformerShaderKind_Decode:{ - glBindImageTexture(0, cp->textures[BeamformerComputeTextureKind_Hadamard], 0, 0, 0, GL_READ_ONLY, GL_R32F); + glBindImageTexture(0, cp->textures[BeamformerComputeTextureKind_Hadamard], 0, 0, 0, GL_READ_ONLY, GL_R16F); BeamformerDecodeMode mode = cp->shader_descriptors[shader_slot].bake.Decode.decode_mode; if (shader_slot 
== 0) { diff --git a/beamformer_internal.h b/beamformer_internal.h @@ -129,7 +129,7 @@ typedef enum {BEAMFORMER_COMPUTE_UBO_LIST BeamformerComputeUBOKind_Count} Beamfo #define BEAMFORMER_COMPUTE_TEXTURE_LIST_FULL \ BEAMFORMER_COMPUTE_TEXTURE_LIST \ - X(Hadamard, GL_R32F) + X(Hadamard, GL_R16F) typedef enum { #define X(k, ...) BeamformerComputeTextureKind_##k, diff --git a/math.c b/math.c @@ -1,13 +1,15 @@ +/* See LICENSE for license details. */ #include "external/cephes.c" function void -fill_kronecker_sub_matrix(f32 *out, i32 out_stride, f32 scale, f32 *b, iv2 b_dim) +fill_kronecker_sub_matrix_f16(f16 *out, i32 out_stride, f16 scale, f16 *b, iv2 b_dim) { - f32x4 vscale = dup_f32x4((f32)scale); for (i32 i = 0; i < b_dim.y; i++) { for (i32 j = 0; j < b_dim.x; j += 4, b += 4) { - f32x4 vb = load_f32x4(b); - store_f32x4(out + j, mul_f32x4(vscale, vb)); + out[j + 0] = scale * b[0]; + out[j + 1] = scale * b[1]; + out[j + 2] = scale * b[2]; + out[j + 3] = scale * b[3]; } out += out_stride; } @@ -15,14 +17,14 @@ fill_kronecker_sub_matrix(f32 *out, i32 out_stride, f32 scale, f32 *b, iv2 b_dim /* NOTE: this won't check for valid space/etc and assumes row major order */ function void -kronecker_product(f32 *out, f32 *a, iv2 a_dim, f32 *b, iv2 b_dim) +kronecker_product_f16(f16 *out, f16 *a, iv2 a_dim, f16 *b, iv2 b_dim) { iv2 out_dim = {{a_dim.x * b_dim.x, a_dim.y * b_dim.y}}; assert(out_dim.y % 4 == 0); for (i32 i = 0; i < a_dim.y; i++) { - f32 *vout = out; + f16 *vout = out; for (i32 j = 0; j < a_dim.x; j++, a++) { - fill_kronecker_sub_matrix(vout, out_dim.y, *a, b, b_dim); + fill_kronecker_sub_matrix_f16(vout, out_dim.y, *a, b, b_dim); vout += b_dim.y; } out += out_dim.y * b_dim.x; @@ -30,10 +32,10 @@ kronecker_product(f32 *out, f32 *a, iv2 a_dim, f32 *b, iv2 b_dim) } /* NOTE/TODO: to support even more hadamard sizes use the Paley construction */ -function f32 * -make_hadamard_transpose(Arena *a, i32 dim) +function f16 * +make_hadamard_transpose(Arena *a, i32 dim, b32 
row_major) { - read_only local_persist f32 hadamard_12_12_transpose[] = { + read_only local_persist f16 hadamard_12_12_transpose[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, -1, @@ -48,7 +50,7 @@ make_hadamard_transpose(Arena *a, i32 dim) 1, -1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, }; - read_only local_persist f32 hadamard_20_20_transpose[] = { + read_only local_persist f16 hadamard_20_20_transpose[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, @@ -71,9 +73,11 @@ make_hadamard_transpose(Arena *a, i32 dim) 1, 1, -1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, }; - f32 *result = 0; - b32 power_of_2 = ISPOWEROF2(dim); + f16 *result = 0; + + i32 order = dim; + b32 power_of_2 = IsPowerOfTwo(dim); b32 multiple_of_12 = dim % 12 == 0; b32 multiple_of_20 = dim % 20 == 0; iz elements = dim * dim; @@ -81,26 +85,26 @@ make_hadamard_transpose(Arena *a, i32 dim) i32 base_dim = 0; if (power_of_2) { base_dim = dim; - } else if (multiple_of_20 && ISPOWEROF2(dim / 20)) { + } else if (multiple_of_20 && IsPowerOfTwo(dim / 20)) { base_dim = 20; dim /= 20; - } else if (multiple_of_12 && ISPOWEROF2(dim / 12)) { + } else if (multiple_of_12 && IsPowerOfTwo(dim / 12)) { base_dim = 12; dim /= 12; } - if (ISPOWEROF2(dim) && base_dim && arena_capacity(a, f32) >= elements * (1 + (dim != base_dim))) { - result = push_array(a, f32, elements); + if (power_of_2 && base_dim && arena_capacity(a, f16) >= elements * (1 + (dim != base_dim))) { + result = push_array(a, f16, elements); Arena tmp = *a; - f32 *m = dim == base_dim ? result : push_array(&tmp, f32, elements); + f16 *m = dim == base_dim ? 
result : push_array(&tmp, f16, elements); #define IND(i, j) ((i) * dim + (j)) m[0] = 1; for (i32 k = 1; k < dim; k *= 2) { for (i32 i = 0; i < k; i++) { for (i32 j = 0; j < k; j++) { - f32 val = m[IND(i, j)]; + f16 val = m[IND(i, j)]; m[IND(i + k, j)] = val; m[IND(i, j + k)] = val; m[IND(i + k, j + k)] = -val; @@ -109,13 +113,19 @@ make_hadamard_transpose(Arena *a, i32 dim) } #undef IND - f32 *m2 = 0; + f16 *m2 = 0; iv2 m2_dim; switch (base_dim) { case 12:{ m2 = hadamard_12_12_transpose; m2_dim = (iv2){{12, 12}}; }break; case 20:{ m2 = hadamard_20_20_transpose; m2_dim = (iv2){{20, 20}}; }break; } - if (m2) kronecker_product(result, m, (iv2){{dim, dim}}, m2, m2_dim); + if (m2) kronecker_product_f16(result, m, (iv2){{dim, dim}}, m2, m2_dim); + } + + if (row_major) { + for (i32 r = 0; r < order; r++) + for (i32 c = 0; c < order; c++) + swap(result[r * order + c], result[c * order + r]); } return result; diff --git a/opengl.h b/opengl.h @@ -30,6 +30,7 @@ #define GL_MAJOR_VERSION 0x821B #define GL_MINOR_VERSION 0x821C #define GL_RG 0x8227 +#define GL_R16F 0x822D #define GL_R32F 0x822E #define GL_RG32F 0x8230 #define GL_R8I 0x8231 diff --git a/shaders/decode.glsl b/shaders/decode.glsl @@ -55,7 +55,7 @@ layout(std430, binding = 3) writeonly restrict buffer buffer_3 { SAMPLE_DATA_TYPE out_data[]; }; -layout(r32f, binding = 0) readonly restrict uniform image2D hadamard; +layout(r16f, binding = 0) readonly restrict uniform image2D hadamard; SAMPLE_DATA_TYPE sample_rf_data(uint index) { diff --git a/util.h b/util.h @@ -24,16 +24,17 @@ typedef __INT8_TYPE__ i8; #endif -typedef char c8; -typedef u8 b8; -typedef u16 b16; -typedef u32 b32; -typedef float f32; -typedef double f64; -typedef i64 iz; -typedef u64 uz; -typedef i64 iptr; -typedef u64 uptr; +typedef char c8; +typedef u8 b8; +typedef u16 b16; +typedef u32 b32; +typedef _Float16 f16; +typedef float f32; +typedef double f64; +typedef i64 iz; +typedef u64 uz; +typedef i64 iptr; +typedef u64 uptr; #ifndef asm #define asm 
__asm__ @@ -126,6 +127,7 @@ typedef u64 uptr; #define Clamp(x, a, b) ((x) < (a) ? (a) : (x) > (b) ? (b) : (x)) #define Min(a, b) ((a) < (b) ? (a) : (b)) #define Max(a, b) ((a) > (b) ? (a) : (b)) +#define IsPowerOfTwo(a) (((a) & ((a) - 1)) == 0) #define ISDIGIT(c) (BETWEEN((c), '0', '9')) #define ISUPPER(c) (((c) & 0x20u) == 0)