core/decode: store hadamard as f16 - ogl_beamforming - Ultrasound Beamforming Implemented with OpenGL

Commit: 0210c211e7d92d032c297a66e3b9597ab12dab8b
Parent: 68e5130078be9c42dc509eadbf0bc960c86e5b63
Author: Randy Palamar
Date:   Mon, 20 Apr 2026 10:09:46 -0600

core/decode: store hadamard as f16

This doesn't result in any measurable performance diffence in
OpenGL, it is simply pulling back code from the Vulkan branch.

Diffstat:
M beamformer_core.c  | 12 ++++++------
M beamformer_internal.h  | 2 +-
M math.c  | 52 +++++++++++++++++++++++++++++++---------------------
M opengl.h  | 1 +
M shaders/decode.glsl  | 2 +-
M util.h  | 22 ++++++++++++----------

6 files changed, 52 insertions(+), 39 deletions(-)
diff --git a/beamformer_core.c b/beamformer_core.c
@@ -242,16 +242,16 @@ alloc_beamform_frame(BeamformerFrame *out, iv3 out_dim, GLenum gl_kind, s8 name,
 }
 
 function void
-update_hadamard_texture(BeamformerComputePlan *cp, i32 order, Arena arena)
+update_hadamard_texture(BeamformerComputePlan *cp, i32 order, b32 row_major, Arena arena)
 {
-	f32 *hadamard = make_hadamard_transpose(&arena, order);
+	f16 *hadamard = make_hadamard_transpose(&arena, order, row_major);
 	if (hadamard) {
 		cp->hadamard_order = order;
 		u32 *texture = cp->textures + BeamformerComputeTextureKind_Hadamard;
 		glDeleteTextures(1, texture);
 		glCreateTextures(GL_TEXTURE_2D, 1, texture);
-		glTextureStorage2D(*texture, 1, GL_R32F, order, order);
-		glTextureSubImage2D(*texture, 0, 0, 0, order, order, GL_RED, GL_FLOAT, hadamard);
+		glTextureStorage2D(*texture, 1, GL_R16F, order, order);
+		glTextureSubImage2D(*texture, 0, 0, 0, order, order, GL_RED, GL_SHORT, hadamard);
 
 		Stream label = arena_stream(arena);
 		stream_append_s8(&label, s8("Hadamard"));
@@ -788,7 +788,7 @@ beamformer_commit_parameter_block(BeamformerCtx *ctx, BeamformerComputePlan *cp,
 				alloc_shader_storage(ctx, decoded_data_size, arena);
 
 			if (cp->hadamard_order != (i32)cp->acquisition_count)
-				update_hadamard_texture(cp, (i32)cp->acquisition_count, arena);
+				update_hadamard_texture(cp, (i32)cp->acquisition_count, 0, arena);
 
 			mem_copy(cp->voxel_transform.E,  pb->parameters.das_voxel_transform.E, sizeof(cp->voxel_transform));
 
@@ -842,7 +842,7 @@ do_compute_shader(BeamformerCtx *ctx, BeamformerComputePlan *cp, BeamformerFrame
 	uv3 dispatch = cp->shader_descriptors[shader_slot].dispatch;
 	switch (shader) {
 	case BeamformerShaderKind_Decode:{
-		glBindImageTexture(0, cp->textures[BeamformerComputeTextureKind_Hadamard], 0, 0, 0, GL_READ_ONLY, GL_R32F);
+		glBindImageTexture(0, cp->textures[BeamformerComputeTextureKind_Hadamard], 0, 0, 0, GL_READ_ONLY, GL_R16F);
 
 		BeamformerDecodeMode mode = cp->shader_descriptors[shader_slot].bake.Decode.decode_mode;
 		if (shader_slot == 0) {
diff --git a/beamformer_internal.h b/beamformer_internal.h
@@ -129,7 +129,7 @@ typedef enum {BEAMFORMER_COMPUTE_UBO_LIST BeamformerComputeUBOKind_Count} Beamfo
 
 #define BEAMFORMER_COMPUTE_TEXTURE_LIST_FULL \
 	BEAMFORMER_COMPUTE_TEXTURE_LIST \
-	X(Hadamard,       GL_R32F)
+	X(Hadamard,       GL_R16F)
 
 typedef enum {
 	#define X(k, ...) BeamformerComputeTextureKind_##k,
diff --git a/math.c b/math.c
@@ -1,13 +1,15 @@
+/* See LICENSE for license details. */
 #include "external/cephes.c"
 
 function void
-fill_kronecker_sub_matrix(f32 *out, i32 out_stride, f32 scale, f32 *b, iv2 b_dim)
+fill_kronecker_sub_matrix_f16(f16 *out, i32 out_stride, f16 scale, f16 *b, iv2 b_dim)
 {
-	f32x4 vscale = dup_f32x4((f32)scale);
 	for (i32 i = 0; i < b_dim.y; i++) {
 		for (i32 j = 0; j < b_dim.x; j += 4, b += 4) {
-			f32x4 vb = load_f32x4(b);
-			store_f32x4(out + j, mul_f32x4(vscale, vb));
+			out[j + 0] = scale * b[0];
+			out[j + 1] = scale * b[1];
+			out[j + 2] = scale * b[2];
+			out[j + 3] = scale * b[3];
 		}
 		out += out_stride;
 	}
@@ -15,14 +17,14 @@ fill_kronecker_sub_matrix(f32 *out, i32 out_stride, f32 scale, f32 *b, iv2 b_dim
 
 /* NOTE: this won't check for valid space/etc and assumes row major order */
 function void
-kronecker_product(f32 *out, f32 *a, iv2 a_dim, f32 *b, iv2 b_dim)
+kronecker_product_f16(f16 *out, f16 *a, iv2 a_dim, f16 *b, iv2 b_dim)
 {
 	iv2 out_dim = {{a_dim.x * b_dim.x, a_dim.y * b_dim.y}};
 	assert(out_dim.y % 4 == 0);
 	for (i32 i = 0; i < a_dim.y; i++) {
-		f32 *vout = out;
+		f16 *vout = out;
 		for (i32 j = 0; j < a_dim.x; j++, a++) {
-			fill_kronecker_sub_matrix(vout, out_dim.y, *a, b, b_dim);
+			fill_kronecker_sub_matrix_f16(vout, out_dim.y, *a, b, b_dim);
 			vout += b_dim.y;
 		}
 		out += out_dim.y * b_dim.x;
@@ -30,10 +32,10 @@ kronecker_product(f32 *out, f32 *a, iv2 a_dim, f32 *b, iv2 b_dim)
 }
 
 /* NOTE/TODO: to support even more hadamard sizes use the Paley construction */
-function f32 *
-make_hadamard_transpose(Arena *a, i32 dim)
+function f16 *
+make_hadamard_transpose(Arena *a, i32 dim, b32 row_major)
 {
-	read_only local_persist	f32 hadamard_12_12_transpose[] = {
+	read_only local_persist	f16 hadamard_12_12_transpose[] = {
 		1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
 		1, -1, -1,  1, -1, -1, -1,  1,  1,  1, -1,  1,
 		1,  1, -1, -1,  1, -1, -1, -1,  1,  1,  1, -1,
@@ -48,7 +50,7 @@ make_hadamard_transpose(Arena *a, i32 dim)
 		1, -1,  1, -1, -1, -1,  1,  1,  1, -1,  1, -1,
 	};
 
-	read_only local_persist f32 hadamard_20_20_transpose[] = {
+	read_only local_persist f16 hadamard_20_20_transpose[] = {
 		1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
 		1, -1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1,
 		1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,
@@ -71,9 +73,11 @@ make_hadamard_transpose(Arena *a, i32 dim)
 		1,  1, -1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1, -1,  1,  1,  1,  1, -1, -1,
 	};
 
-	f32 *result = 0;
 
-	b32 power_of_2     = ISPOWEROF2(dim);
+	f16 *result = 0;
+
+	i32 order          = dim;
+	b32 power_of_2     = IsPowerOfTwo(dim);
 	b32 multiple_of_12 = dim % 12 == 0;
 	b32 multiple_of_20 = dim % 20 == 0;
 	iz elements        = dim * dim;
@@ -81,26 +85,26 @@ make_hadamard_transpose(Arena *a, i32 dim)
 	i32 base_dim = 0;
 	if (power_of_2) {
 		base_dim  = dim;
-	} else if (multiple_of_20 && ISPOWEROF2(dim / 20)) {
+	} else if (multiple_of_20 && IsPowerOfTwo(dim / 20)) {
 		base_dim  = 20;
 		dim      /= 20;
-	} else if (multiple_of_12 && ISPOWEROF2(dim / 12)) {
+	} else if (multiple_of_12 && IsPowerOfTwo(dim / 12)) {
 		base_dim  = 12;
 		dim      /= 12;
 	}
 
-	if (ISPOWEROF2(dim) && base_dim && arena_capacity(a, f32) >= elements * (1 + (dim != base_dim))) {
-		result = push_array(a, f32, elements);
+	if (power_of_2 && base_dim && arena_capacity(a, f16) >= elements * (1 + (dim != base_dim))) {
+		result = push_array(a, f16, elements);
 
 		Arena tmp = *a;
-		f32 *m = dim == base_dim ? result : push_array(&tmp, f32, elements);
+		f16 *m = dim == base_dim ? result : push_array(&tmp, f16, elements);
 
 		#define IND(i, j) ((i) * dim + (j))
 		m[0] = 1;
 		for (i32 k = 1; k < dim; k *= 2) {
 			for (i32 i = 0; i < k; i++) {
 				for (i32 j = 0; j < k; j++) {
-					f32 val = m[IND(i, j)];
+					f16 val = m[IND(i, j)];
 					m[IND(i + k, j)]     =  val;
 					m[IND(i, j + k)]     =  val;
 					m[IND(i + k, j + k)] = -val;
@@ -109,13 +113,19 @@ make_hadamard_transpose(Arena *a, i32 dim)
 		}
 		#undef IND
 
-		f32 *m2 = 0;
+		f16 *m2 = 0;
 		iv2 m2_dim;
 		switch (base_dim) {
 		case 12:{ m2 = hadamard_12_12_transpose; m2_dim = (iv2){{12, 12}}; }break;
 		case 20:{ m2 = hadamard_20_20_transpose; m2_dim = (iv2){{20, 20}}; }break;
 		}
-		if (m2) kronecker_product(result, m, (iv2){{dim, dim}}, m2, m2_dim);
+		if (m2) kronecker_product_f16(result, m, (iv2){{dim, dim}}, m2, m2_dim);
+	}
+
+	if (row_major) {
+		for (i32 r = 0; r < order; r++)
+			for (i32 c = 0; c < order; c++)
+				swap(result[r * order + c], result[c * order + r]);
 	}
 
 	return result;
diff --git a/opengl.h b/opengl.h
@@ -30,6 +30,7 @@
 #define GL_MAJOR_VERSION                   0x821B
 #define GL_MINOR_VERSION                   0x821C
 #define GL_RG                              0x8227
+#define GL_R16F                            0x822D
 #define GL_R32F                            0x822E
 #define GL_RG32F                           0x8230
 #define GL_R8I                             0x8231
diff --git a/shaders/decode.glsl b/shaders/decode.glsl
@@ -55,7 +55,7 @@ layout(std430, binding = 3) writeonly restrict buffer buffer_3 {
 	SAMPLE_DATA_TYPE out_data[];
 };
 
-layout(r32f, binding = 0) readonly restrict uniform image2D  hadamard;
+layout(r16f, binding = 0) readonly restrict uniform image2D  hadamard;
 
 SAMPLE_DATA_TYPE sample_rf_data(uint index)
 {
diff --git a/util.h b/util.h
@@ -24,16 +24,17 @@
   typedef __INT8_TYPE__     i8;
 #endif
 
-typedef char   c8;
-typedef u8     b8;
-typedef u16    b16;
-typedef u32    b32;
-typedef float  f32;
-typedef double f64;
-typedef i64    iz;
-typedef u64    uz;
-typedef i64    iptr;
-typedef u64    uptr;
+typedef char     c8;
+typedef u8       b8;
+typedef u16      b16;
+typedef u32      b32;
+typedef _Float16 f16;
+typedef float    f32;
+typedef double   f64;
+typedef i64      iz;
+typedef u64      uz;
+typedef i64      iptr;
+typedef u64      uptr;
 
 #ifndef asm
 #define asm __asm__
@@ -126,6 +127,7 @@ typedef u64    uptr;
 #define Clamp(x, a, b)   ((x) < (a) ? (a) : (x) > (b) ? (b) : (x))
 #define Min(a, b)        ((a) < (b) ? (a) : (b))
 #define Max(a, b)        ((a) > (b) ? (a) : (b))
+#define IsPowerOfTwo(a)  (((a) & ((a) - 1)) == 0)
 
 #define ISDIGIT(c)       (BETWEEN((c), '0', '9'))
 #define ISUPPER(c)       (((c) & 0x20u) == 0)

M	beamformer_core.c	\|	12	++++++------
M	beamformer_internal.h	\|	2	+-
M	math.c	\|	52	+++++++++++++++++++++++++++++++---------------------
M	opengl.h	\|	1	+
M	shaders/decode.glsl	\|	2	+-
M	util.h	\|	22	++++++++++++----------