Commit: 62254bdb37e6ee31b8a704c2f762a92a9abfefb5
Parent: 9a65ec87ee0efffb0b2dfe6fb99b17ecf1973563
Author: Randy Palamar
Date: Tue, 21 Oct 2025 08:43:49 -0600
core: more efficient spin wait
Both x86 and aarch64 have instructions which are meant to be placed
inside a spin wait loop; they prevent the memory order violations
which can trash performance, and the CPU wastes less energy while
spinning.
For more details see here:
https://www.felixcloutier.com/x86/pause
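
In isolation the pattern looks roughly like this (a minimal
sketch; wait_for and ready are illustrative names, not part of
the patch):

    #include <stdatomic.h>

    #if defined(__x86_64__)
    #include <immintrin.h>
    #define cpu_yield() _mm_pause()            /* x86 PAUSE hint */
    #elif defined(__aarch64__)
    #define cpu_yield() asm volatile ("yield") /* aarch64 YIELD hint */
    #else
    #define cpu_yield()
    #endif

    /* spin until another thread publishes ready, hinting the CPU
     * on every iteration */
    static void wait_for(atomic_bool *ready)
    {
        while (!atomic_load_explicit(ready, memory_order_acquire))
            cpu_yield();
    }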
Diffstat:
3 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/beamformer.c b/beamformer.c
@@ -1,5 +1,7 @@
/* See LICENSE for license details. */
/* TODO(rnp):
+ * [ ]: utilize umonitor/umwait (intel), monitorx/mwaitx (amd), and wfe/sev (aarch64)
+ * for power efficient low latency waiting
* [ ]: refactor: split decode into reshape and decode
* - the check for first pass reshaping is the last non constant check
* in the shader
diff --git a/intrinsics.c b/intrinsics.c
@@ -184,6 +184,8 @@ typedef uint32x4_t u32x4;
#define store_i32x4(o, a) vst1q_s32(o, a)
#define sub_f32x4(a, b) vsubq_f32(a, b)
+#define cpu_yield() asm volatile ("yield")
+
#elif ARCH_X64
#include <immintrin.h>
typedef __m128 f32x4;
@@ -207,4 +209,6 @@ typedef __m128i u32x4;
#define store_i32x4(o, a) _mm_storeu_si128((i32x4 *)o, a)
#define sub_f32x4(a, b) _mm_sub_ps(a, b)
+#define cpu_yield() _mm_pause()
+
#endif
diff --git a/util.h b/util.h
@@ -97,7 +97,7 @@
#define EachEnumValue(type, it) (type it = (type)0; it < type##_Count; it = (type)(it + 1))
#define EachNonZeroEnumValue(type, it) (type it = (type)1; it < type##_Count; it = (type)(it + 1))
-#define spin_wait(c) while ((c))
+#define spin_wait(c) while ((c)) cpu_yield()
/* NOTE(rnp): no guarantees about actually getting an element */
#define SLLPop(list) list; list = list ? list->next : 0
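
As a usage sketch (hypothetical; flag is an illustrative name,
not from the patch), a caller busy-waiting on a shared flag

    spin_wait(atomic_load(&flag) == 0);

now expands to while (atomic_load(&flag) == 0) cpu_yield(), so
each iteration of the wait executes PAUSE/YIELD instead of
re-issuing the load back to back.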