Commit: 62254bdb37e6ee31b8a704c2f762a92a9abfefb5
Parent: 9a65ec87ee0efffb0b2dfe6fb99b17ecf1973563
Author: Randy Palamar
Date: Tue, 21 Oct 2025 08:43:49 -0600
core: more efficient spin wait
Both x86 and aarch64 have instructions which are meant to be placed
inside a spin wait loop; they prevent the memory order violations
which can trash performance, and the CPU wastes less energy while
spinning.
For more details see here:
https://www.felixcloutier.com/x86/pause
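
In isolation the pattern looks roughly like this (a minimal
sketch; wait_for and ready are illustrative names, not part of
the patch):

    #include <stdatomic.h>

    #if defined(__x86_64__)
    #include <immintrin.h>
    #define cpu_yield() _mm_pause()            /* x86 PAUSE hint */
    #elif defined(__aarch64__)
    #define cpu_yield() asm volatile ("yield") /* aarch64 YIELD hint */
    #else
    #define cpu_yield()
    #endif

    /* spin until another thread publishes ready, hinting the CPU
     * on every iteration */
    static void wait_for(atomic_bool *ready)
    {
        while (!atomic_load_explicit(ready, memory_order_acquire))
            cpu_yield();
    }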
Diffstat:
3 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/beamformer.c b/beamformer.c
@@ -1,5 +1,7 @@
/* See LICENSE for license details. */
/* TODO(rnp):
+ * [ ]: utilize umonitor/umwait (intel), monitorx/mwaitx (amd), and wfe/sev (aarch64)
+ * for power efficient low latency waiting
* [ ]: refactor: split decode into reshape and decode
* - the check for first pass reshaping is the last non constant check
* in the shader
diff --git a/intrinsics.c b/intrinsics.c
@@ -184,6 +184,8 @@ typedef uint32x4_t u32x4;
#define store_i32x4(o, a) vst1q_s32(o, a)
#define sub_f32x4(a, b) vsubq_f32(a, b)
+#define cpu_yield() asm volatile ("yield")
+
#elif ARCH_X64
#include <immintrin.h>
typedef __m128 f32x4;
@@ -207,4 +209,6 @@ typedef __m128i u32x4;
#define store_i32x4(o, a) _mm_storeu_si128((i32x4 *)o, a)
#define sub_f32x4(a, b) _mm_sub_ps(a, b)
+#define cpu_yield() _mm_pause()
+
#endif
diff --git a/util.h b/util.h
@@ -97,7 +97,7 @@
#define EachEnumValue(type, it) (type it = (type)0; it < type##_Count; it = (type)(it + 1))
#define EachNonZeroEnumValue(type, it) (type it = (type)1; it < type##_Count; it = (type)(it + 1))
-#define spin_wait(c) while ((c))
+#define spin_wait(c) while ((c)) cpu_yield()
/* NOTE(rnp): no guarantees about actually getting an element */
#define SLLPop(list) list; list = list ? list->next : 0
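
As a usage sketch (hypothetical; flag is an illustrative name,
not from the patch), a caller busy-waiting on a shared flag

    spin_wait(atomic_load(&flag) == 0);

now expands to while (atomic_load(&flag) == 0) cpu_yield(), so
each iteration of the wait executes PAUSE/YIELD instead of
re-issuing the load back to back.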