ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git

Commit: 16c32a812bd167d704d39740aec439f202bcc631
Parent: 15c6348c2447d49fe2e2e3021a269ffad19c451e
Author: Randy Palamar
Date:   Fri, 20 Dec 2024 17:11:19 -0700

port to aarch64

Diffstat:
M beamformer.c | 13 ++++++++-----
M beamformer.h |  1 +
A intrinsics.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
M static.c     | 10 +++++++---
M util.c       | 25 +++++++++++--------------
M util.h       |  6 +++---
6 files changed, 81 insertions(+), 25 deletions(-)

diff --git a/beamformer.c b/beamformer.c
@@ -59,7 +59,7 @@ alloc_beamform_frame(GLParams *gp, BeamformFrame *out, uv4 out_dim, u32 frame_in
 	/* NOTE: allocate storage for beamformed output data;
 	 * this is shared between compute and fragment shaders */
 	u32 max_dim = MAX(out->dim.x, MAX(out->dim.y, out->dim.z));
-	out->mips = _tzcnt_u32(max_dim) + 1;
+	out->mips = ctz_u32(max_dim) + 1;
 
 	u8 buf[256];
 	Stream label = {.data = buf, .cap = ARRAY_COUNT(buf)};
@@ -117,8 +117,9 @@ alloc_shader_storage(BeamformerCtx *ctx, Arena a)
 	i32 storage_flags = GL_DYNAMIC_STORAGE_BIT;
 	switch (ctx->gl.vendor_id) {
-	case GL_VENDOR_INTEL:
 	case GL_VENDOR_AMD:
+	case GL_VENDOR_ARM:
+	case GL_VENDOR_INTEL:
 		if (cs->raw_data_ssbo)
 			glUnmapNamedBuffer(cs->raw_data_ssbo);
 		storage_flags |= GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT;
@@ -146,8 +147,9 @@ alloc_shader_storage(BeamformerCtx *ctx, Arena a)
 	i32 map_flags = GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT|GL_MAP_UNSYNCHRONIZED_BIT;
 	switch (ctx->gl.vendor_id) {
-	case GL_VENDOR_INTEL:
 	case GL_VENDOR_AMD:
+	case GL_VENDOR_ARM:
+	case GL_VENDOR_INTEL:
 		cs->raw_data_arena.beg = glMapNamedBufferRange(cs->raw_data_ssbo, 0,
 		                                               full_rf_buf_size, map_flags);
 		break;
@@ -309,7 +311,7 @@ static v4
 f32_4_to_v4(f32 *in)
 {
 	v4 result;
-	_mm_storeu_ps(result.E, _mm_loadu_ps(in));
+	store_f32x4(load_f32x4(in), result.E);
 	return result;
 }
@@ -711,8 +713,9 @@ DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
 			ctx->error_stream.widx = 0;
 		} else {
 			switch (ctx->gl.vendor_id) {
-			case GL_VENDOR_INTEL:
 			case GL_VENDOR_AMD:
+			case GL_VENDOR_ARM:
+			case GL_VENDOR_INTEL:
 				break;
 			case GL_VENDOR_NVIDIA:
 				glNamedBufferSubData(cs->raw_data_ssbo, raw_index * rlen,
diff --git a/beamformer.h b/beamformer.h
@@ -35,6 +35,7 @@ enum program_flags {
 
 enum gl_vendor_ids {
 	GL_VENDOR_AMD,
+	GL_VENDOR_ARM,
 	GL_VENDOR_INTEL,
 	GL_VENDOR_NVIDIA,
 };
diff --git a/intrinsics.c b/intrinsics.c
@@ -0,0 +1,51 @@
+#define FORCE_INLINE inline __attribute__((always_inline))
+
+/* TODO(rnp): msvc probably won't build this but there are other things preventing that as well */
+#define clz_u32(a) __builtin_clz(a)
+#define ctz_u32(a) __builtin_ctz(a)
+#define sqrt_f32(a) __builtin_sqrtf(a)
+
+#ifdef __ARM_ARCH_ISA_A64
+/* TODO? debuggers just loop here forever and need a manual PC increment (step over) */
+#define debugbreak() asm volatile ("brk 0xf000")
+
+/* NOTE(rnp): we are only doing a handful of f32x4 operations so we will just use NEON and do
+ * the macro renaming thing. If you are implementing a serious wide vector operation you should
+ * use SVE(2) instead. The semantics are different however and the code will be written for an
+ * arbitrary vector bit width. In that case you will also need x86_64 code for determining
+ * the supported vector width (ideally at runtime though that may not be possible).
+ */
+#include <arm_neon.h>
+typedef float32x4_t f32x4;
+typedef int32x4_t   i32x4;
+
+#define cvt_i32x4_f32x4(a)    vcvtq_f32_s32(a)
+#define cvt_f32x4_i32x4(a)    vcvtq_s32_f32(a)
+#define dup_f32x4(f)          vdupq_n_f32(f)
+#define load_f32x4(a)         vld1q_f32(a)
+#define load_i32x4(a)         vld1q_s32(a)
+#define mul_f32x4(a, b)       vmulq_f32(a, b)
+#define set_f32x4(a, b, c, d) vld1q_f32((f32 []){d, c, b, a})
+#define sqrt_f32x4(a)         vsqrtq_f32(a)
+#define store_f32x4(a, o)     vst1q_f32(o, a)
+#define store_i32x4(a, o)     vst1q_s32(o, a)
+
+#elif __x86_64__
+#include <immintrin.h>
+typedef __m128  f32x4;
+typedef __m128i i32x4;
+
+#define cvt_i32x4_f32x4(a)    _mm_cvtepi32_ps(a)
+#define cvt_f32x4_i32x4(a)    _mm_cvtps_epi32(a)
+#define dup_f32x4(f)          _mm_set1_ps(f)
+#define load_f32x4(a)         _mm_loadu_ps(a)
+#define load_i32x4(a)         _mm_loadu_si128((i32x4 *)a)
+#define mul_f32x4(a, b)       _mm_mul_ps(a, b)
+#define set_f32x4(a, b, c, d) _mm_set_ps(a, b, c, d)
+#define sqrt_f32x4(a)         _mm_sqrt_ps(a)
+#define store_f32x4(a, o)     _mm_storeu_ps(o, a)
+#define store_i32x4(a, o)     _mm_storeu_si128((i32x4 *)o, a)
+
+#define debugbreak() asm volatile ("int3; nop")
+
+#endif
diff --git a/static.c b/static.c
@@ -63,9 +63,11 @@ get_gl_params(GLParams *gl, Stream *err)
 		os_fatal(stream_to_s8(err));
 	}
 	switch (vendor[0]) {
-	case 'A': gl->vendor_id = GL_VENDOR_AMD;    break;
-	case 'I': gl->vendor_id = GL_VENDOR_INTEL;  break;
-	case 'N': gl->vendor_id = GL_VENDOR_NVIDIA; break;
+	case 'A': gl->vendor_id = GL_VENDOR_AMD;    break;
+	case 'I': gl->vendor_id = GL_VENDOR_INTEL;  break;
+	case 'N': gl->vendor_id = GL_VENDOR_NVIDIA; break;
+	/* NOTE(rnp): freedreno - might need different handling on win32 but this is fine for now */
+	case 'f': gl->vendor_id = GL_VENDOR_ARM;    break;
 	default:
 		stream_append_s8(err, s8("Unknown GL Vendor: "));
 		stream_append_s8(err, cstr_to_s8(vendor));
@@ -93,6 +95,7 @@ validate_gl_requirements(GLParams *gl)
 	switch (gl->vendor_id) {
 	case GL_VENDOR_AMD:
+	case GL_VENDOR_ARM:
 	case GL_VENDOR_INTEL:
 		if (gl->version_major == 4 && gl->version_minor < 5)
 			invalid = 1;
@@ -116,6 +119,7 @@ dump_gl_params(GLParams *gl, Arena a)
 	stream_append_s8(&s, s8("---- GL Parameters ----\n"));
 	switch (gl->vendor_id) {
 	case GL_VENDOR_AMD:    stream_append_s8(&s, s8("Vendor: AMD\n"));    break;
+	case GL_VENDOR_ARM:    stream_append_s8(&s, s8("Vendor: ARM\n"));    break;
 	case GL_VENDOR_INTEL:  stream_append_s8(&s, s8("Vendor: Intel\n"));  break;
 	case GL_VENDOR_NVIDIA: stream_append_s8(&s, s8("Vendor: nVidia\n")); break;
 	}
diff --git a/util.c b/util.c
@@ -277,7 +277,7 @@ uv4_equal(uv4 a, uv4 b)
 
 static u32
 round_down_power_of_2(u32 a)
 {
-	u32 result = 0x80000000UL >> _lzcnt_u32(a);
+	u32 result = 0x80000000UL >> clz_u32(a);
 	return result;
 }
@@ -342,11 +342,8 @@ mul_v2(v2 a, v2 b)
 static f32
 magnitude_v2(v2 a)
 {
-	v4 result;
-	__m128 av = _mm_set_ps(0, 0, a.x, a.y);
-	av = _mm_mul_ps(av, av);
-	_mm_store_ps(result.E, _mm_sqrt_ps(_mm_hadd_ps(av, av)));
-	return result.x;
+	f32 result = sqrt_f32(a.x * a.x + a.y * a.y);
+	return result;
 }
 
 static f64
@@ -379,13 +376,13 @@ parse_f64(s8 s)
 }
 
 static void
-fill_kronecker_sub_matrix(__m128i *out, i32 out_stride, i32 scale, __m128i *b, uv2 b_dim)
+fill_kronecker_sub_matrix(i32 *out, i32 out_stride, i32 scale, i32 *b, uv2 b_dim)
 {
-	__m128 vscale = _mm_set1_ps(scale);
+	f32x4 vscale = dup_f32x4(scale);
 	for (u32 i = 0; i < b_dim.y; i++) {
-		for (u32 j = 0; j < b_dim.x / 4; j++) {
-			__m128 vb = _mm_cvtepi32_ps(_mm_loadu_si128(b++));
-			_mm_storeu_si128(out + j, _mm_cvtps_epi32(_mm_mul_ps(vscale, vb)));
+		for (u32 j = 0; j < b_dim.x; j += 4, b += 4) {
+			f32x4 vb = cvt_i32x4_f32x4(load_i32x4(b));
+			store_i32x4(cvt_f32x4_i32x4(mul_f32x4(vscale, vb)), out + j);
 		}
 		out += out_stride;
 	}
@@ -398,10 +395,10 @@ kronecker_product(i32 *out, i32 *a, uv2 a_dim, i32 *b, uv2 b_dim)
 	uv2 out_dim = {.x = a_dim.x * b_dim.x, .y = a_dim.y * b_dim.y};
 	ASSERT(out_dim.y % 4 == 0);
 	for (u32 i = 0; i < a_dim.y; i++) {
-		__m128i *vout = (__m128i *)out;
+		i32 *vout = out;
 		for (u32 j = 0; j < a_dim.x; j++, a++) {
-			fill_kronecker_sub_matrix(vout, out_dim.y / 4, *a, (__m128i *)b, b_dim);
-			vout += b_dim.y / 4;
+			fill_kronecker_sub_matrix(vout, out_dim.y, *a, b, b_dim);
+			vout += b_dim.y;
 		}
 		out += out_dim.y * b_dim.x;
 	}
diff --git a/util.h b/util.h
@@ -5,8 +5,6 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <immintrin.h>
-
 #ifndef asm
 #define asm __asm__
 #endif
@@ -23,13 +21,15 @@
 #endif
 #endif
 
+#include "intrinsics.c"
+
 #ifdef _DEBUG
 	#ifdef _WIN32
 	#define DEBUG_EXPORT __declspec(dllexport)
 	#else
 	#define DEBUG_EXPORT
 	#endif
-	#define ASSERT(c) do { if (!(c)) asm("int3; nop"); } while (0);
+	#define ASSERT(c) do { if (!(c)) debugbreak(); } while (0);
 #else
 	#define DEBUG_EXPORT static
 	#define ASSERT(c)
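For readers skimming the diff, here is a small self-contained sketch (not part of the commit) of how the macro layer added in intrinsics.c is meant to be used: the same call site compiles to NEON intrinsics on aarch64 and SSE intrinsics on x86_64, with no special compiler flags needed on either. Only the f32x4/i32x4 typedefs and the macro bodies are taken from the diff above; the scale_i32x4 helper and the main driver are hypothetical.

/* Standalone sketch (hypothetical file, not in the repository) showing the
 * portable f32x4 layer from intrinsics.c. Macro bodies are copied from the
 * diff above; everything else here is illustrative. */
#include <stdint.h>
#include <stdio.h>

typedef float   f32;
typedef int32_t i32;

#ifdef __ARM_ARCH_ISA_A64
#include <arm_neon.h>
typedef float32x4_t f32x4;
typedef int32x4_t   i32x4;
#define dup_f32x4(f)       vdupq_n_f32(f)
#define load_i32x4(a)      vld1q_s32(a)
#define mul_f32x4(a, b)    vmulq_f32(a, b)
#define cvt_i32x4_f32x4(a) vcvtq_f32_s32(a)
#define cvt_f32x4_i32x4(a) vcvtq_s32_f32(a)
#define store_i32x4(a, o)  vst1q_s32(o, a)
#elif __x86_64__
#include <immintrin.h>
typedef __m128  f32x4;
typedef __m128i i32x4;
#define dup_f32x4(f)       _mm_set1_ps(f)
#define load_i32x4(a)      _mm_loadu_si128((i32x4 *)a)
#define mul_f32x4(a, b)    _mm_mul_ps(a, b)
#define cvt_i32x4_f32x4(a) _mm_cvtepi32_ps(a)
#define cvt_f32x4_i32x4(a) _mm_cvtps_epi32(a)
#define store_i32x4(a, o)  _mm_storeu_si128((i32x4 *)o, a)
#endif

/* hypothetical helper: scale four i32s by an integer factor through the
 * float pipeline, the same pattern fill_kronecker_sub_matrix() uses above */
static void
scale_i32x4(i32 *out, i32 *in, i32 scale)
{
	f32x4 vscale = dup_f32x4(scale);
	f32x4 vin    = cvt_i32x4_f32x4(load_i32x4(in));
	store_i32x4(cvt_f32x4_i32x4(mul_f32x4(vscale, vin)), out);
}

int
main(void)
{
	i32 in[4]  = {1, 2, 3, 4};
	i32 out[4] = {0};
	scale_i32x4(out, in, 3);
	printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 3 6 9 12 */
	return 0;
}

One subtlety worth noting from the diff itself: on NEON, set_f32x4(a, b, c, d) is defined with the compound literal reversed ({d, c, b, a}) so that it keeps _mm_set_ps's lane ordering, where d lands in element 0.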