Commit: 0210c211e7d92d032c297a66e3b9597ab12dab8b
Parent: 68e5130078be9c42dc509eadbf0bc960c86e5b63
Author: Randy Palamar
Date: Mon, 20 Apr 2026 10:09:46 -0600
core/decode: store hadamard as f16
This doesn't result in any measurable performance diffence in
OpenGL, it is simply pulling back code from the Vulkan branch.
Diffstat:
6 files changed, 52 insertions(+), 39 deletions(-)
diff --git a/beamformer_core.c b/beamformer_core.c
@@ -242,16 +242,16 @@ alloc_beamform_frame(BeamformerFrame *out, iv3 out_dim, GLenum gl_kind, s8 name,
}
function void
-update_hadamard_texture(BeamformerComputePlan *cp, i32 order, Arena arena)
+update_hadamard_texture(BeamformerComputePlan *cp, i32 order, b32 row_major, Arena arena)
{
- f32 *hadamard = make_hadamard_transpose(&arena, order);
+ f16 *hadamard = make_hadamard_transpose(&arena, order, row_major);
if (hadamard) {
cp->hadamard_order = order;
u32 *texture = cp->textures + BeamformerComputeTextureKind_Hadamard;
glDeleteTextures(1, texture);
glCreateTextures(GL_TEXTURE_2D, 1, texture);
- glTextureStorage2D(*texture, 1, GL_R32F, order, order);
- glTextureSubImage2D(*texture, 0, 0, 0, order, order, GL_RED, GL_FLOAT, hadamard);
+ glTextureStorage2D(*texture, 1, GL_R16F, order, order);
+ glTextureSubImage2D(*texture, 0, 0, 0, order, order, GL_RED, GL_SHORT, hadamard);
Stream label = arena_stream(arena);
stream_append_s8(&label, s8("Hadamard"));
@@ -788,7 +788,7 @@ beamformer_commit_parameter_block(BeamformerCtx *ctx, BeamformerComputePlan *cp,
alloc_shader_storage(ctx, decoded_data_size, arena);
if (cp->hadamard_order != (i32)cp->acquisition_count)
- update_hadamard_texture(cp, (i32)cp->acquisition_count, arena);
+ update_hadamard_texture(cp, (i32)cp->acquisition_count, 0, arena);
mem_copy(cp->voxel_transform.E, pb->parameters.das_voxel_transform.E, sizeof(cp->voxel_transform));
@@ -842,7 +842,7 @@ do_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, BeamformerFrame
uv3 dispatch = cp->shader_descriptors[shader_slot].dispatch;
switch (shader) {
case BeamformerShaderKind_Decode:{
- glBindImageTexture(0, cp->textures[BeamformerComputeTextureKind_Hadamard], 0, 0, 0, GL_READ_ONLY, GL_R32F);
+ glBindImageTexture(0, cp->textures[BeamformerComputeTextureKind_Hadamard], 0, 0, 0, GL_READ_ONLY, GL_R16F);
BeamformerDecodeMode mode = cp->shader_descriptors[shader_slot].bake.Decode.decode_mode;
if (shader_slot == 0) {
diff --git a/beamformer_internal.h b/beamformer_internal.h
@@ -129,7 +129,7 @@ typedef enum {BEAMFORMER_COMPUTE_UBO_LIST BeamformerComputeUBOKind_Count} Beamfo
#define BEAMFORMER_COMPUTE_TEXTURE_LIST_FULL \
BEAMFORMER_COMPUTE_TEXTURE_LIST \
- X(Hadamard, GL_R32F)
+ X(Hadamard, GL_R16F)
typedef enum {
#define X(k, ...) BeamformerComputeTextureKind_##k,
diff --git a/math.c b/math.c
@@ -1,13 +1,15 @@
+/* See LICENSE for license details. */
#include "external/cephes.c"
function void
-fill_kronecker_sub_matrix(f32 *out, i32 out_stride, f32 scale, f32 *b, iv2 b_dim)
+fill_kronecker_sub_matrix_f16(f16 *out, i32 out_stride, f16 scale, f16 *b, iv2 b_dim)
{
- f32x4 vscale = dup_f32x4((f32)scale);
for (i32 i = 0; i < b_dim.y; i++) {
for (i32 j = 0; j < b_dim.x; j += 4, b += 4) {
- f32x4 vb = load_f32x4(b);
- store_f32x4(out + j, mul_f32x4(vscale, vb));
+ out[j + 0] = scale * b[0];
+ out[j + 1] = scale * b[1];
+ out[j + 2] = scale * b[2];
+ out[j + 3] = scale * b[3];
}
out += out_stride;
}
@@ -15,14 +17,14 @@ fill_kronecker_sub_matrix(f32 *out, i32 out_stride, f32 scale, f32 *b, iv2 b_dim
/* NOTE: this won't check for valid space/etc and assumes row major order */
function void
-kronecker_product(f32 *out, f32 *a, iv2 a_dim, f32 *b, iv2 b_dim)
+kronecker_product_f16(f16 *out, f16 *a, iv2 a_dim, f16 *b, iv2 b_dim)
{
iv2 out_dim = {{a_dim.x * b_dim.x, a_dim.y * b_dim.y}};
assert(out_dim.y % 4 == 0);
for (i32 i = 0; i < a_dim.y; i++) {
- f32 *vout = out;
+ f16 *vout = out;
for (i32 j = 0; j < a_dim.x; j++, a++) {
- fill_kronecker_sub_matrix(vout, out_dim.y, *a, b, b_dim);
+ fill_kronecker_sub_matrix_f16(vout, out_dim.y, *a, b, b_dim);
vout += b_dim.y;
}
out += out_dim.y * b_dim.x;
@@ -30,10 +32,10 @@ kronecker_product(f32 *out, f32 *a, iv2 a_dim, f32 *b, iv2 b_dim)
}
/* NOTE/TODO: to support even more hadamard sizes use the Paley construction */
-function f32 *
-make_hadamard_transpose(Arena *a, i32 dim)
+function f16 *
+make_hadamard_transpose(Arena *a, i32 dim, b32 row_major)
{
- read_only local_persist f32 hadamard_12_12_transpose[] = {
+ read_only local_persist f16 hadamard_12_12_transpose[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, -1, -1, 1, -1, -1, -1, 1, 1, 1, -1, 1,
1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, -1,
@@ -48,7 +50,7 @@ make_hadamard_transpose(Arena *a, i32 dim)
1, -1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1,
};
- read_only local_persist f32 hadamard_20_20_transpose[] = {
+ read_only local_persist f16 hadamard_20_20_transpose[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, -1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1,
1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1,
@@ -71,9 +73,11 @@ make_hadamard_transpose(Arena *a, i32 dim)
1, 1, -1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1,
};
- f32 *result = 0;
- b32 power_of_2 = ISPOWEROF2(dim);
+ f16 *result = 0;
+
+ i32 order = dim;
+ b32 power_of_2 = IsPowerOfTwo(dim);
b32 multiple_of_12 = dim % 12 == 0;
b32 multiple_of_20 = dim % 20 == 0;
iz elements = dim * dim;
@@ -81,26 +85,26 @@ make_hadamard_transpose(Arena *a, i32 dim)
i32 base_dim = 0;
if (power_of_2) {
base_dim = dim;
- } else if (multiple_of_20 && ISPOWEROF2(dim / 20)) {
+ } else if (multiple_of_20 && IsPowerOfTwo(dim / 20)) {
base_dim = 20;
dim /= 20;
- } else if (multiple_of_12 && ISPOWEROF2(dim / 12)) {
+ } else if (multiple_of_12 && IsPowerOfTwo(dim / 12)) {
base_dim = 12;
dim /= 12;
}
- if (ISPOWEROF2(dim) && base_dim && arena_capacity(a, f32) >= elements * (1 + (dim != base_dim))) {
- result = push_array(a, f32, elements);
+ if (power_of_2 && base_dim && arena_capacity(a, f16) >= elements * (1 + (dim != base_dim))) {
+ result = push_array(a, f16, elements);
Arena tmp = *a;
- f32 *m = dim == base_dim ? result : push_array(&tmp, f32, elements);
+ f16 *m = dim == base_dim ? result : push_array(&tmp, f16, elements);
#define IND(i, j) ((i) * dim + (j))
m[0] = 1;
for (i32 k = 1; k < dim; k *= 2) {
for (i32 i = 0; i < k; i++) {
for (i32 j = 0; j < k; j++) {
- f32 val = m[IND(i, j)];
+ f16 val = m[IND(i, j)];
m[IND(i + k, j)] = val;
m[IND(i, j + k)] = val;
m[IND(i + k, j + k)] = -val;
@@ -109,13 +113,19 @@ make_hadamard_transpose(Arena *a, i32 dim)
}
#undef IND
- f32 *m2 = 0;
+ f16 *m2 = 0;
iv2 m2_dim;
switch (base_dim) {
case 12:{ m2 = hadamard_12_12_transpose; m2_dim = (iv2){{12, 12}}; }break;
case 20:{ m2 = hadamard_20_20_transpose; m2_dim = (iv2){{20, 20}}; }break;
}
- if (m2) kronecker_product(result, m, (iv2){{dim, dim}}, m2, m2_dim);
+ if (m2) kronecker_product_f16(result, m, (iv2){{dim, dim}}, m2, m2_dim);
+ }
+
+ if (row_major) {
+ for (i32 r = 0; r < order; r++)
+ for (i32 c = 0; c < order; c++)
+ swap(result[r * order + c], result[c * order + r]);
}
return result;
diff --git a/opengl.h b/opengl.h
@@ -30,6 +30,7 @@
#define GL_MAJOR_VERSION 0x821B
#define GL_MINOR_VERSION 0x821C
#define GL_RG 0x8227
+#define GL_R16F 0x822D
#define GL_R32F 0x822E
#define GL_RG32F 0x8230
#define GL_R8I 0x8231
diff --git a/shaders/decode.glsl b/shaders/decode.glsl
@@ -55,7 +55,7 @@ layout(std430, binding = 3) writeonly restrict buffer buffer_3 {
SAMPLE_DATA_TYPE out_data[];
};
-layout(r32f, binding = 0) readonly restrict uniform image2D hadamard;
+layout(r16f, binding = 0) readonly restrict uniform image2D hadamard;
SAMPLE_DATA_TYPE sample_rf_data(uint index)
{
diff --git a/util.h b/util.h
@@ -24,16 +24,17 @@
typedef __INT8_TYPE__ i8;
#endif
-typedef char c8;
-typedef u8 b8;
-typedef u16 b16;
-typedef u32 b32;
-typedef float f32;
-typedef double f64;
-typedef i64 iz;
-typedef u64 uz;
-typedef i64 iptr;
-typedef u64 uptr;
+typedef char c8;
+typedef u8 b8;
+typedef u16 b16;
+typedef u32 b32;
+typedef _Float16 f16;
+typedef float f32;
+typedef double f64;
+typedef i64 iz;
+typedef u64 uz;
+typedef i64 iptr;
+typedef u64 uptr;
#ifndef asm
#define asm __asm__
@@ -126,6 +127,7 @@ typedef u64 uptr;
#define Clamp(x, a, b) ((x) < (a) ? (a) : (x) > (b) ? (b) : (x))
#define Min(a, b) ((a) < (b) ? (a) : (b))
#define Max(a, b) ((a) > (b) ? (a) : (b))
+#define IsPowerOfTwo(a) (((a) & ((a) - 1)) == 0)
#define ISDIGIT(c) (BETWEEN((c), '0', '9'))
#define ISUPPER(c) (((c) & 0x20u) == 0)