Commit: e68fa72369dab981f71b974f346514cd11fb5eff
Parent: 612c98af0e59cf66e7501c48eb2114fb55342282
Author: Randy Palamar
Date: Tue, 6 Jan 2026 13:41:42 -0700
util: add arm inline asm non-temporal memory copy
since my laptop doesn't support SVE I can't use the actually well
designed simd instructions so I am stuck with only using NEON,
which suffers from the same fixed width limitations as AVX.
Unfortunately not all instructions have intrinsics in clang/gcc.
The ldnp/stnp instructions load/store a pair of neon registers
(total 32 bytes) at a time with a non-temporal hint. I suppose
this doesn't really align with rest of the intrinsics which can
only return 1 result so there is no way to use these without
resorting to inline assembly. LLVM will actually emit the normal
caching variants itself but I want the non-caching versions.
Diffstat:
1 file changed, 15 insertions(+), 0 deletions(-)
diff --git a/util.c b/util.c
@@ -51,6 +51,21 @@ memory_copy_non_temporal(void *restrict dest, void *restrict src, uz n)
for (; n >= 32; n -= 32, s += 32, d += 32)
_mm256_stream_si256((__m256i *)d, _mm256_stream_load_si256((__m256i *)s));
}
+ #elif ARCH_ARM64 && !COMPILER_MSVC
+ {
+ asm volatile (
+ "cbz %2, 2f\n"
+ "1: ldnp q0, q1, [%1]\n"
+ "subs %2, %2, #32\n"
+ "add %1, %1, #32\n"
+ "stnp q0, q1, [%0]\n"
+ "add %0, %0, #32\n"
+ "b.ne 1b\n"
+ "2:"
+ : "+r"(d), "+r"(s), "+r"(n)
+ :: "memory", "v0", "v1"
+ );
+ }
#else
mem_copy(d, s, n);
#endif