intrinsics.c (4571B)
/* See LICENSE for license details. */
#include "compiler.h"

#if COMPILER_CLANG || COMPILER_GCC
#define force_inline inline __attribute__((always_inline))
#elif COMPILER_MSVC
#define force_inline __forceinline
#endif

#if COMPILER_MSVC || (COMPILER_CLANG && OS_WINDOWS)
#pragma section(".rdata$", read)
#define read_only __declspec(allocate(".rdata$"))
#elif COMPILER_CLANG
#define read_only __attribute__((section(".rodata")))
#elif COMPILER_GCC
/* TODO(rnp): how do we do this with gcc, putting it in rodata causes warnings and writing to
 * it doesn't cause a fault */
#define read_only
#endif

#if COMPILER_MSVC
#define align_as(n)    __declspec(align(n))
#define pack_struct(s) __pragma(pack(push, 1)) s __pragma(pack(pop))
#define no_return      __declspec(noreturn)

#define debugbreak()  __debugbreak()
#define unreachable() __assume(0)

#define atomic_store(ptr, n)     __atomic_store_n(ptr, n, __ATOMIC_RELEASE)
#define atomic_load(ptr)         __atomic_load_n(ptr, __ATOMIC_ACQUIRE)
#define atomic_swap(ptr, n)      __atomic_exchange_n(ptr, n, __ATOMIC_RELEASE)
#define atomic_and(ptr, n)       __atomic_and_fetch(ptr, n, __ATOMIC_RELEASE)
#define atomic_add(ptr, n)       __atomic_add_fetch(ptr, n, __ATOMIC_RELEASE)
#define atomic_inc_u32(ptr, n)   _InterlockedAdd((volatile u32 *)ptr, n)
#define atomic_cas(ptr, cptr, n) __atomic_compare_exchange_n(ptr, cptr, n, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)

#else
#define align_as(n)    __attribute__((aligned(n)))
#define pack_struct(s) s __attribute__((packed))
#define no_return      __attribute__((noreturn))

#if ARCH_ARM64
/* TODO? debuggers just loop here forever and need a manual PC increment (step over) */
#define debugbreak() asm volatile ("brk 0xf000")
#else
#define debugbreak() asm volatile ("int3; nop")
#endif
#define unreachable() __builtin_unreachable()

#define atomic_store(ptr, n)     __atomic_store_n(ptr, n, __ATOMIC_RELEASE)
#define atomic_load(ptr)         __atomic_load_n(ptr, __ATOMIC_ACQUIRE)
#define atomic_swap(ptr, n)      __atomic_exchange_n(ptr, n, __ATOMIC_RELEASE)
#define atomic_and(ptr, n)       __atomic_and_fetch(ptr, n, __ATOMIC_RELEASE)
#define atomic_add(ptr, n)       __atomic_add_fetch(ptr, n, __ATOMIC_RELEASE)
#define atomic_inc_u32(ptr, n)   __atomic_fetch_add(ptr, n, __ATOMIC_ACQ_REL)
#define atomic_cas(ptr, cptr, n) __atomic_compare_exchange_n(ptr, cptr, n, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)

#endif

/* TODO(rnp): msvc probably won't build this but there are other things preventing that as well */
#define sqrt_f32(a)     __builtin_sqrtf(a)
#define atan2_f32(y, x) __builtin_atan2f(y, x)

function force_inline u32
clz_u32(u32 a)
{
	u32 result = 32;
	if (a) result = __builtin_clz(a);
	return result;
}

function force_inline u32
ctz_u32(u32 a)
{
	u32 result = 32;
	if (a) result = __builtin_ctz(a);
	return result;
}

#if ARCH_ARM64
/* NOTE(rnp): we are only doing a handful of f32x4 operations so we will just use NEON and do
 * the macro renaming thing. If you are implementing a serious wide vector operation you should
 * use SVE(2) instead. The semantics are different however and the code will be written for an
 * arbitrary vector bit width. In that case you will also need x86_64 code for determining
 * the supported vector width (ideally at runtime though that may not be possible).
 */
#include <arm_neon.h>
typedef float32x4_t f32x4;
typedef int32x4_t   i32x4;

#define cvt_i32x4_f32x4(a)    vcvtq_f32_s32(a)
#define cvt_f32x4_i32x4(a)    vcvtq_s32_f32(a)
#define dup_f32x4(f)          vdupq_n_f32(f)
#define load_f32x4(a)         vld1q_f32(a)
#define load_i32x4(a)         vld1q_s32(a)
#define mul_f32x4(a, b)       vmulq_f32(a, b)
#define set_f32x4(a, b, c, d) vld1q_f32((f32 []){d, c, b, a})
#define sqrt_f32x4(a)         vsqrtq_f32(a)
#define store_f32x4(a, o)     vst1q_f32(o, a)
#define store_i32x4(a, o)     vst1q_s32(o, a)

#elif ARCH_X64
#include <immintrin.h>
typedef __m128  f32x4;
typedef __m128i i32x4;

#define cvt_i32x4_f32x4(a)    _mm_cvtepi32_ps(a)
#define cvt_f32x4_i32x4(a)    _mm_cvtps_epi32(a)
#define dup_f32x4(f)          _mm_set1_ps(f)
#define load_f32x4(a)         _mm_loadu_ps(a)
#define load_i32x4(a)         _mm_loadu_si128((i32x4 *)a)
#define mul_f32x4(a, b)       _mm_mul_ps(a, b)
#define set_f32x4(a, b, c, d) _mm_set_ps(a, b, c, d)
#define sqrt_f32x4(a)         _mm_sqrt_ps(a)
#define store_f32x4(a, o)     _mm_storeu_ps(o, a)
#define store_i32x4(a, o)     _mm_storeu_si128((i32x4 *)o, a)

#endif
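
/* NOTE: illustrative usage sketch, not part of intrinsics.c. It assumes the f32/u32/i32
 * typedefs and the `function` macro come from "compiler.h", as the code above does. The
 * first helper shows how the wide-register wrappers compose identically on NEON and SSE;
 * the second shows the usual set-bit iteration pattern built on ctz_u32(). */
#if ARCH_ARM64 || ARCH_X64
/* convert 4 integer samples to float, apply a gain, and write the result to out */
function void
example_scale_samples_f32x4(i32 *samples, f32 gain, f32 *out)
{
	f32x4 v = cvt_i32x4_f32x4(load_i32x4(samples));
	store_f32x4(mul_f32x4(v, dup_f32x4(gain)), out);
}
#endif

/* call visit() for every set bit in mask, lowest index first */
function void
example_visit_set_bits(u32 mask, void (*visit)(u32 bit_index))
{
	while (mask) {
		visit(ctz_u32(mask));
		mask &= mask - 1; /* clear the lowest set bit */
	}
}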