Commit: 16c32a812bd167d704d39740aec439f202bcc631
Parent: 15c6348c2447d49fe2e2e3021a269ffad19c451e
Author: Randy Palamar
Date: Fri, 20 Dec 2024 17:11:19 -0700
port to aarch64
Diffstat:
6 files changed, 81 insertions(+), 25 deletions(-)
diff --git a/beamformer.c b/beamformer.c
@@ -59,7 +59,7 @@ alloc_beamform_frame(GLParams *gp, BeamformFrame *out, uv4 out_dim, u32 frame_in
/* NOTE: allocate storage for beamformed output data;
* this is shared between compute and fragment shaders */
u32 max_dim = MAX(out->dim.x, MAX(out->dim.y, out->dim.z));
- out->mips = _tzcnt_u32(max_dim) + 1;
+ out->mips = ctz_u32(max_dim) + 1;
u8 buf[256];
Stream label = {.data = buf, .cap = ARRAY_COUNT(buf)};
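As context for the ctz_u32() swap above: for a power-of-two max_dim, the trailing-zero count equals log2(max_dim), so the mip count comes out the same as with the old _tzcnt_u32(). A minimal sketch with an illustrative value; note that __builtin_ctz(), and therefore ctz_u32(), is undefined for an input of 0, so max_dim is assumed to be nonzero.

	u32 max_dim = 512;                  /* power-of-two texture dimension    */
	u32 mips    = ctz_u32(max_dim) + 1; /* ctz_u32(512) == 9, so mips == 10, */
	                                    /* i.e. levels 512, 256, ..., 1      */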
@@ -117,8 +117,9 @@ alloc_shader_storage(BeamformerCtx *ctx, Arena a)
i32 storage_flags = GL_DYNAMIC_STORAGE_BIT;
switch (ctx->gl.vendor_id) {
- case GL_VENDOR_INTEL:
case GL_VENDOR_AMD:
+ case GL_VENDOR_ARM:
+ case GL_VENDOR_INTEL:
if (cs->raw_data_ssbo)
glUnmapNamedBuffer(cs->raw_data_ssbo);
storage_flags |= GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT;
@@ -146,8 +147,9 @@ alloc_shader_storage(BeamformerCtx *ctx, Arena a)
i32 map_flags = GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT|GL_MAP_UNSYNCHRONIZED_BIT;
switch (ctx->gl.vendor_id) {
- case GL_VENDOR_INTEL:
case GL_VENDOR_AMD:
+ case GL_VENDOR_ARM:
+ case GL_VENDOR_INTEL:
cs->raw_data_arena.beg = glMapNamedBufferRange(cs->raw_data_ssbo, 0,
full_rf_buf_size, map_flags);
break;
@@ -309,7 +311,7 @@ static v4
f32_4_to_v4(f32 *in)
{
v4 result;
- _mm_storeu_ps(result.E, _mm_loadu_ps(in));
+ store_f32x4(load_f32x4(in), result.E);
return result;
}
@@ -711,8 +713,9 @@ DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
ctx->error_stream.widx = 0;
} else {
switch (ctx->gl.vendor_id) {
- case GL_VENDOR_INTEL:
case GL_VENDOR_AMD:
+ case GL_VENDOR_ARM:
+ case GL_VENDOR_INTEL:
break;
case GL_VENDOR_NVIDIA:
glNamedBufferSubData(cs->raw_data_ssbo, raw_index * rlen,
diff --git a/beamformer.h b/beamformer.h
@@ -35,6 +35,7 @@ enum program_flags {
enum gl_vendor_ids {
GL_VENDOR_AMD,
+ GL_VENDOR_ARM,
GL_VENDOR_INTEL,
GL_VENDOR_NVIDIA,
};
diff --git a/intrinsics.c b/intrinsics.c
@@ -0,0 +1,51 @@
+#define FORCE_INLINE inline __attribute__((always_inline))
+
+/* TODO(rnp): MSVC probably won't build this, but there are other things preventing that as well */
+#define clz_u32(a) __builtin_clz(a)
+#define ctz_u32(a) __builtin_ctz(a)
+#define sqrt_f32(a) __builtin_sqrtf(a)
+
+#ifdef __ARM_ARCH_ISA_A64
+/* TODO? debuggers loop on this instruction forever and need a manual PC increment (step over) to continue */
+#define debugbreak() asm volatile ("brk 0xf000")
+
+/* NOTE(rnp): we only need a handful of f32x4 operations, so we just wrap NEON behind
+ * renamed macros. If you are implementing a serious wide-vector operation you should use
+ * SVE(2) instead; its semantics are different, and the code would be written for an
+ * arbitrary vector bit width. In that case you will also need x86_64 code for determining
+ * the supported vector width (ideally at runtime, though that may not be possible).
+ */
+#include <arm_neon.h>
+typedef float32x4_t f32x4;
+typedef int32x4_t i32x4;
+
+#define cvt_i32x4_f32x4(a) vcvtq_f32_s32(a)
+#define cvt_f32x4_i32x4(a) vcvtq_s32_f32(a)
+#define dup_f32x4(f) vdupq_n_f32(f)
+#define load_f32x4(a) vld1q_f32(a)
+#define load_i32x4(a) vld1q_s32(a)
+#define mul_f32x4(a, b) vmulq_f32(a, b)
+#define set_f32x4(a, b, c, d) vld1q_f32((f32 []){d, c, b, a})
+#define sqrt_f32x4(a) vsqrtq_f32(a)
+#define store_f32x4(a, o) vst1q_f32(o, a)
+#define store_i32x4(a, o) vst1q_s32(o, a)
+
+#elif __x86_64__
+#include <immintrin.h>
+typedef __m128 f32x4;
+typedef __m128i i32x4;
+
+#define cvt_i32x4_f32x4(a) _mm_cvtepi32_ps(a)
+#define cvt_f32x4_i32x4(a) _mm_cvtps_epi32(a)
+#define dup_f32x4(f) _mm_set1_ps(f)
+#define load_f32x4(a) _mm_loadu_ps(a)
+#define load_i32x4(a) _mm_loadu_si128((i32x4 *)a)
+#define mul_f32x4(a, b) _mm_mul_ps(a, b)
+#define set_f32x4(a, b, c, d) _mm_set_ps(a, b, c, d)
+#define sqrt_f32x4(a) _mm_sqrt_ps(a)
+#define store_f32x4(a, o) _mm_storeu_ps(o, a)
+#define store_i32x4(a, o) _mm_storeu_si128((i32x4 *)o, a)
+
+#define debugbreak() asm volatile ("int3; nop")
+
+#endif
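To show how the wrappers above are meant to be consumed by architecture-neutral code, here is a minimal sketch; scale_f32x4_buffer is a hypothetical helper, not part of this commit, f32/u32 are the project typedefs from util.h, and count is assumed to be a multiple of 4.

	/* hypothetical: scale count f32 values by a constant through the portable */
	/* wrappers; expands to NEON on aarch64 and to SSE intrinsics on x86_64    */
	static void
	scale_f32x4_buffer(f32 *out, f32 *in, u32 count, f32 scale)
	{
		f32x4 vscale = dup_f32x4(scale);
		for (u32 i = 0; i < count; i += 4)
			store_f32x4(mul_f32x4(vscale, load_f32x4(in + i)), out + i);
	}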
diff --git a/static.c b/static.c
@@ -63,9 +63,11 @@ get_gl_params(GLParams *gl, Stream *err)
os_fatal(stream_to_s8(err));
}
switch (vendor[0]) {
- case 'A': gl->vendor_id = GL_VENDOR_AMD; break;
- case 'I': gl->vendor_id = GL_VENDOR_INTEL; break;
- case 'N': gl->vendor_id = GL_VENDOR_NVIDIA; break;
+ case 'A': gl->vendor_id = GL_VENDOR_AMD; break;
+ case 'I': gl->vendor_id = GL_VENDOR_INTEL; break;
+ case 'N': gl->vendor_id = GL_VENDOR_NVIDIA; break;
+ /* NOTE(rnp): freedreno - might need different handling on win32 but this is fine for now */
+ case 'f': gl->vendor_id = GL_VENDOR_ARM; break;
default:
stream_append_s8(err, s8("Unknown GL Vendor: "));
stream_append_s8(err, cstr_to_s8(vendor));
@@ -93,6 +95,7 @@ validate_gl_requirements(GLParams *gl)
switch (gl->vendor_id) {
case GL_VENDOR_AMD:
+ case GL_VENDOR_ARM:
case GL_VENDOR_INTEL:
if (gl->version_major == 4 && gl->version_minor < 5)
invalid = 1;
@@ -116,6 +119,7 @@ dump_gl_params(GLParams *gl, Arena a)
stream_append_s8(&s, s8("---- GL Parameters ----\n"));
switch (gl->vendor_id) {
case GL_VENDOR_AMD: stream_append_s8(&s, s8("Vendor: AMD\n")); break;
+ case GL_VENDOR_ARM: stream_append_s8(&s, s8("Vendor: ARM\n")); break;
case GL_VENDOR_INTEL: stream_append_s8(&s, s8("Vendor: Intel\n")); break;
case GL_VENDOR_NVIDIA: stream_append_s8(&s, s8("Vendor: nVidia\n")); break;
}
diff --git a/util.c b/util.c
@@ -277,7 +277,7 @@ uv4_equal(uv4 a, uv4 b)
static u32
round_down_power_of_2(u32 a)
{
- u32 result = 0x80000000UL >> _lzcnt_u32(a);
+ u32 result = 0x80000000UL >> clz_u32(a);
return result;
}
@@ -342,11 +342,8 @@ mul_v2(v2 a, v2 b)
static f32
magnitude_v2(v2 a)
{
- v4 result;
- __m128 av = _mm_set_ps(0, 0, a.x, a.y);
- av = _mm_mul_ps(av, av);
- _mm_store_ps(result.E, _mm_sqrt_ps(_mm_hadd_ps(av, av)));
- return result.x;
+ f32 result = sqrt_f32(a.x * a.x + a.y * a.y);
+ return result;
}
static f64
@@ -379,13 +376,13 @@ parse_f64(s8 s)
}
static void
-fill_kronecker_sub_matrix(__m128i *out, i32 out_stride, i32 scale, __m128i *b, uv2 b_dim)
+fill_kronecker_sub_matrix(i32 *out, i32 out_stride, i32 scale, i32 *b, uv2 b_dim)
{
- __m128 vscale = _mm_set1_ps(scale);
+ f32x4 vscale = dup_f32x4(scale);
for (u32 i = 0; i < b_dim.y; i++) {
- for (u32 j = 0; j < b_dim.x / 4; j++) {
- __m128 vb = _mm_cvtepi32_ps(_mm_loadu_si128(b++));
- _mm_storeu_si128(out + j, _mm_cvtps_epi32(_mm_mul_ps(vscale, vb)));
+ for (u32 j = 0; j < b_dim.x; j += 4, b += 4) {
+ f32x4 vb = cvt_i32x4_f32x4(load_i32x4(b));
+ store_i32x4(cvt_f32x4_i32x4(mul_f32x4(vscale, vb)), out + j);
}
out += out_stride;
}
@@ -398,10 +395,10 @@ kronecker_product(i32 *out, i32 *a, uv2 a_dim, i32 *b, uv2 b_dim)
uv2 out_dim = {.x = a_dim.x * b_dim.x, .y = a_dim.y * b_dim.y};
ASSERT(out_dim.y % 4 == 0);
for (u32 i = 0; i < a_dim.y; i++) {
- __m128i *vout = (__m128i *)out;
+ i32 *vout = out;
for (u32 j = 0; j < a_dim.x; j++, a++) {
- fill_kronecker_sub_matrix(vout, out_dim.y / 4, *a, (__m128i *)b, b_dim);
- vout += b_dim.y / 4;
+ fill_kronecker_sub_matrix(vout, out_dim.y, *a, b, b_dim);
+ vout += b_dim.y;
}
out += out_dim.y * b_dim.x;
}
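The stride changes in fill_kronecker_sub_matrix and kronecker_product are a unit change rather than a behaviour change: out, b, and the strides are now counted in i32 elements instead of 16-byte vectors, which is why every offset loses its /4. A hedged sanity check of that equivalence, assuming the i32x4 typedef from intrinsics.c:

	/* i32x4 is int32x4_t on aarch64 and __m128i on x86_64, both 16 bytes wide, */
	/* so advancing an i32x4 * by n/4 reaches the same byte as advancing an     */
	/* i32 * by n (with n % 4 == 0, which kronecker_product already asserts)    */
	_Static_assert(sizeof(i32x4) == 4 * sizeof(i32), "i32x4 holds four i32 lanes");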
diff --git a/util.h b/util.h
@@ -5,8 +5,6 @@
#include <stddef.h>
#include <stdint.h>
-#include <immintrin.h>
-
#ifndef asm
#define asm __asm__
#endif
@@ -23,13 +21,15 @@
#endif
#endif
+#include "intrinsics.c"
+
#ifdef _DEBUG
#ifdef _WIN32
#define DEBUG_EXPORT __declspec(dllexport)
#else
#define DEBUG_EXPORT
#endif
- #define ASSERT(c) do { if (!(c)) asm("int3; nop"); } while (0);
+ #define ASSERT(c) do { if (!(c)) debugbreak(); } while (0);
#else
#define DEBUG_EXPORT static
#define ASSERT(c)
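Finally, a brief usage sketch for the reworked ASSERT; validate_counts is a hypothetical function, and the macro is only active in _DEBUG builds. A failed assertion now expands to the portable debugbreak(), which executes brk 0xf000 on aarch64 and int3 on x86_64, so the trap fires on both targets.

	/* hypothetical: trips the per-arch breakpoint when the condition fails */
	static void
	validate_counts(u32 count)
	{
		ASSERT(count % 4 == 0); /* brk 0xf000 on aarch64, int3 on x86_64 */
	}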