ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

Commit: 62254bdb37e6ee31b8a704c2f762a92a9abfefb5
Parent: 9a65ec87ee0efffb0b2dfe6fb99b17ecf1973563
Author: Randy Palamar
Date:   Tue, 21 Oct 2025 08:43:49 -0600

core: more efficient spin wait

Both x86 and aarch64 have instructions which are meant to be put
inside of a spin wait loop to prevent memory order violations
which can trash performance. Furthermore, the CPU will waste less
energy while doing the spin.

For more details see here:
https://www.felixcloutier.com/x86/pause

Diffstat:
Mbeamformer.c | 2++
Mintrinsics.c | 4++++
Mutil.h | 2+-
3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/beamformer.c b/beamformer.c @@ -1,5 +1,7 @@ /* See LICENSE for license details. */ /* TODO(rnp): + * [ ]: utilize umonitor/umwait (intel), monitorx/mwaitx (amd), and wfe/sev (aarch64) + * for power efficient low latency waiting * [ ]: refactor: split decode into reshape and decode * - the check for first pass reshaping is the last non constant check * in the shader diff --git a/intrinsics.c b/intrinsics.c @@ -184,6 +184,8 @@ typedef uint32x4_t u32x4; #define store_i32x4(o, a) vst1q_s32(o, a) #define sub_f32x4(a, b) vsubq_f32(a, b) +#define cpu_yield() asm volatile ("yield") + #elif ARCH_X64 #include <immintrin.h> typedef __m128 f32x4; @@ -207,4 +209,6 @@ typedef __m128i u32x4; #define store_i32x4(o, a) _mm_storeu_si128((i32x4 *)o, a) #define sub_f32x4(a, b) _mm_sub_ps(a, b) +#define cpu_yield() _mm_pause() + #endif diff --git a/util.h b/util.h @@ -97,7 +97,7 @@ #define EachEnumValue(type, it) (type it = (type)0; it < type##_Count; it = (type)(it + 1)) #define EachNonZeroEnumValue(type, it) (type it = (type)1; it < type##_Count; it = (type)(it + 1)) -#define spin_wait(c) while ((c)) +#define spin_wait(c) while ((c)) cpu_yield() /* NOTE(rnp): no guarantees about actually getting an element */ #define SLLPop(list) list; list = list ? list->next : 0