intrinsics.c
/* See LICENSE for license details. */
#include "compiler.h"

#if COMPILER_CLANG || COMPILER_GCC
#define force_inline inline __attribute__((always_inline))
#elif COMPILER_MSVC
#define force_inline __forceinline
#endif

#if COMPILER_MSVC || (COMPILER_CLANG && OS_WINDOWS)
#pragma section(".rdata$", read)
#define read_only __declspec(allocate(".rdata$"))
#elif COMPILER_CLANG
#define read_only __attribute__((section(".rodata")))
#elif COMPILER_GCC
/* TODO(rnp): how do we do this with gcc, putting it in rodata causes warnings and writing to
 * it doesn't cause a fault */
#define read_only
#endif

#if COMPILER_MSVC
#define alignas(n)     __declspec(align(n))
#define pack_struct(s) __pragma(pack(push, 1)) s __pragma(pack(pop))
#define no_return      __declspec(noreturn)

#define likely(x)   (x)
#define unlikely(x) (x)

#define assume(x)     __assume(x)
#define debugbreak()  __debugbreak()
#define unreachable() __assume(0)

#if ARCH_ARM64
#define cpu_yield()   __yield()
#define store_fence() __dmb(0x0A) // 0x0A: ishst
#endif

#define atomic_add_u32(ptr, n)       _InterlockedExchangeAdd((volatile u32 *)(ptr), (n))
#define atomic_add_u64(ptr, n)       _InterlockedExchangeAdd64((volatile u64 *)(ptr), (n))
#define atomic_and_u32(ptr, n)       _InterlockedAnd((volatile u32 *)(ptr), (n))
#define atomic_and_u64(ptr, n)       _InterlockedAnd64((volatile u64 *)(ptr), (n))
/* NOTE: _InterlockedCompareExchange(dst, exchange, comparand) - the new value goes in the
 * second argument so these match the __atomic_compare_exchange_n() wrappers below */
#define atomic_cas_u32(ptr, cptr, n) (_InterlockedCompareExchange((volatile u32 *)(ptr), (n), *(cptr)) == *(cptr))
#define atomic_cas_u64(ptr, cptr, n) (_InterlockedCompareExchange64((volatile u64 *)(ptr), (n), *(cptr)) == *(cptr))
#define atomic_load_u32(ptr)         *((volatile u32 *)(ptr))
#define atomic_load_u64(ptr)         *((volatile u64 *)(ptr))
#define atomic_or_u32(ptr, n)        _InterlockedOr((volatile u32 *)(ptr), (n))
#define atomic_store_u32(ptr, n)     *((volatile u32 *)(ptr)) = (u32)(n)
#define atomic_store_u64(ptr, n)     *((volatile u64 *)(ptr)) = (u64)(n)
#define atomic_swap_u32(ptr, n)      _InterlockedExchange((volatile u32 *)(ptr), n)
#define atomic_swap_u64(ptr, n)      _InterlockedExchange64((volatile u64 *)(ptr), n)

#define atan2_f32(y, x) atan2f(y, x)
#define cos_f32(a)      cosf(a)
#define sin_f32(a)      sinf(a)
#define tan_f32(a)      tanf(a)
#define ceil_f32(a)     ceilf(a)
#define sqrt_f32(a)     sqrtf(a)

#define exp_f64(a)  exp(a)
#define sqrt_f64(a) sqrt(a)

#else
#define alignas(n)     __attribute__((aligned(n)))
#define pack_struct(s) s __attribute__((packed))
#define no_return      __attribute__((noreturn))

#define likely(x)   (__builtin_expect(!!(x), 1))
#define unlikely(x) (__builtin_expect(!!(x), 0))

#if COMPILER_CLANG
#define assume(x) __builtin_assume(x)
#else
#define assume(x) __attribute__((assume(x)))
#endif
#define unreachable() __builtin_unreachable()
#if ARCH_ARM64
/* TODO? debuggers just loop here forever and need a manual PC increment (step over) */
#define debugbreak()  asm volatile ("brk 0xf000")
#define cpu_yield()   asm volatile ("yield")
#define store_fence() asm volatile ("dmb ishst" ::: "memory")
#else
#define debugbreak() asm volatile ("int3; nop")
#endif

#define atomic_add_u64(ptr, n)       __atomic_fetch_add(ptr, n, __ATOMIC_SEQ_CST)
#define atomic_and_u64(ptr, n)       __atomic_and_fetch(ptr, n, __ATOMIC_SEQ_CST)
#define atomic_cas_u64(ptr, cptr, n) __atomic_compare_exchange_n(ptr, cptr, n, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
#define atomic_load_u64(ptr)         __atomic_load_n(ptr, __ATOMIC_SEQ_CST)
#define atomic_or_u32(ptr, n)        __atomic_or_fetch(ptr, n, __ATOMIC_SEQ_CST)
#define atomic_store_u64(ptr, n)     __atomic_store_n(ptr, n, __ATOMIC_SEQ_CST)
#define atomic_swap_u64(ptr, n)      __atomic_exchange_n(ptr, n, __ATOMIC_SEQ_CST)
#define atomic_add_u32   atomic_add_u64
#define atomic_and_u32   atomic_and_u64
#define atomic_cas_u32   atomic_cas_u64
#define atomic_load_u32  atomic_load_u64
#define atomic_store_u32 atomic_store_u64
#define atomic_swap_u32  atomic_swap_u64

#define atan2_f32(y, x) __builtin_atan2f(y, x)
#define cos_f32(a)      __builtin_cosf(a)
#define sin_f32(a)      __builtin_sinf(a)
#define tan_f32(a)      __builtin_tanf(a)
#define ceil_f32(a)     __builtin_ceilf(a)
#define sqrt_f32(a)     __builtin_sqrtf(a)

#define exp_f64(a)  __builtin_exp(a)
#define sqrt_f64(a) __builtin_sqrt(a)

#define popcount_u64(a) (u64)__builtin_popcountll(a)
#endif
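
/* NOTE: illustrative sketch, not part of the original file: two hypothetical helpers
 * (example_next_id, example_store_max_u32) showing how the atomic wrappers above compose
 * identically on the MSVC and GCC/Clang paths; u32/u64 and the function macro are assumed
 * to come from compiler.h as in the rest of this file. */
function u64
example_next_id(u64 *counter)
{
	/* both paths are fetch-then-add, so this returns the pre-increment value */
	return atomic_add_u64(counter, 1);
}

function void
example_store_max_u32(u32 *slot, u32 value)
{
	/* CAS loop: keep the largest value ever written into *slot; reload on failure
	 * because only the __atomic_* path updates the expected value through cptr */
	u32 current = atomic_load_u32(slot);
	while (current < value && !atomic_cas_u32(slot, &current, value))
		current = atomic_load_u32(slot);
}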

#if COMPILER_MSVC

function force_inline u32
clz_u32(u32 a)
{
	unsigned long index;
	u32 result = 32;
	if (a) {
		_BitScanReverse(&index, a);
		/* _BitScanReverse() yields the index of the highest set bit;
		 * convert it to a leading zero count to match __builtin_clz() */
		result = 31 - index;
	}
	return result;
}

function force_inline u32
ctz_u32(u32 a)
{
	unsigned long index;
	u32 result = 32;
	if (a) {
		_BitScanForward(&index, a);
		result = index;
	}
	return result;
}

function force_inline u64
ctz_u64(u64 a)
{
	/* the _BitScan* intrinsics always write a 32-bit unsigned long index */
	unsigned long index;
	u64 result = 64;
	if (a) {
		_BitScanForward64(&index, a);
		result = index;
	}
	return result;
}

#else /* !COMPILER_MSVC */

function force_inline u32
clz_u32(u32 a)
{
	u32 result = 32;
	if (a) result = (u32)__builtin_clz(a);
	return result;
}

function force_inline u32
ctz_u32(u32 a)
{
	u32 result = 32;
	if (a) result = (u32)__builtin_ctz(a);
	return result;
}

function force_inline u64
ctz_u64(u64 a)
{
	u64 result = 64;
	if (a) result = (u64)__builtin_ctzll(a);
	return result;
}

#endif
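
/* NOTE: illustrative sketch, not part of the original file: hypothetical uses of the
 * bit scan helpers above (the example_* names are not from the original source). */
function u32
example_round_up_pow2(u32 a)
{
	/* smallest power of two >= a; relies on clz_u32(); valid for a <= (1u << 31) */
	u32 result = 1;
	if (a > 1) result = 1u << (32 - clz_u32(a - 1));
	return result;
}

function void
example_for_each_set_bit(u64 mask, void (*callback)(u32 bit, void *ctx), void *ctx)
{
	/* visit set bit indices from lowest to highest, clearing the lowest set bit each pass */
	while (mask) {
		callback((u32)ctz_u64(mask), ctx);
		mask &= mask - 1;
	}
}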

#if ARCH_ARM64
/* NOTE(rnp): we are only doing a handful of f32x4 operations so we will just use NEON and do
 * the macro renaming thing. If you are implementing a serious wide vector operation you should
 * use SVE(2) instead. The semantics are different however and the code will be written for an
 * arbitrary vector bit width. In that case you will also need x86_64 code for determining
 * the supported vector width (ideally at runtime though that may not be possible).
 */
#include <arm_neon.h>
typedef float32x4_t f32x4;
typedef int32x4_t   i32x4;
typedef uint32x4_t  u32x4;

#define add_f32x4(a, b)       vaddq_f32(a, b)
#define cvt_i32x4_f32x4(a)    vcvtq_f32_s32(a)
#define cvt_f32x4_i32x4(a)    vcvtq_s32_f32(a)
#define div_f32x4(a, b)       vdivq_f32(a, b)
#define dup_f32x4(f)          vdupq_n_f32(f)
#define floor_f32x4(a)        vrndmq_f32(a)
#define load_f32x4(a)         vld1q_f32(a)
#define load_i32x4(a)         vld1q_s32(a)
#define max_f32x4(a, b)       vmaxq_f32(a, b)
#define min_f32x4(a, b)       vminq_f32(a, b)
#define mul_f32x4(a, b)       vmulq_f32(a, b)
#define set_f32x4(a, b, c, d) vld1q_f32((f32 []){d, c, b, a})
#define sqrt_f32x4(a)         vsqrtq_f32(a)
#define store_f32x4(o, a)     vst1q_f32(o, a)
#define store_i32x4(o, a)     vst1q_s32(o, a)
#define sub_f32x4(a, b)       vsubq_f32(a, b)

#elif ARCH_X64
#include <immintrin.h>
typedef __m128  f32x4;
typedef __m128i i32x4;
typedef __m128i u32x4;

#define add_f32x4(a, b)       _mm_add_ps(a, b)
#define cvt_i32x4_f32x4(a)    _mm_cvtepi32_ps(a)
#define cvt_f32x4_i32x4(a)    _mm_cvtps_epi32(a)
#define div_f32x4(a, b)       _mm_div_ps(a, b)
#define dup_f32x4(f)          _mm_set1_ps(f)
#define floor_f32x4(a)        _mm_floor_ps(a)
#define load_f32x4(a)         _mm_loadu_ps(a)
#define load_i32x4(a)         _mm_loadu_si128((i32x4 *)a)
#define max_f32x4(a, b)       _mm_max_ps(a, b)
#define min_f32x4(a, b)       _mm_min_ps(a, b)
#define mul_f32x4(a, b)       _mm_mul_ps(a, b)
#define set_f32x4(a, b, c, d) _mm_set_ps(a, b, c, d)
#define sqrt_f32x4(a)         _mm_sqrt_ps(a)
#define store_f32x4(o, a)     _mm_storeu_ps(o, a)
#define store_i32x4(o, a)     _mm_storeu_si128((i32x4 *)o, a)
#define sub_f32x4(a, b)       _mm_sub_ps(a, b)

#define cpu_yield   _mm_pause
#define store_fence _mm_sfence

#endif
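
/* NOTE: illustrative sketch, not part of the original file: one way the f32x4 wrappers are
 * used so the same loop serves both the NEON and SSE paths. The example_ name, the guard,
 * and the requirement that count be a multiple of 4 are assumptions, not project API. */
#if ARCH_ARM64 || ARCH_X64
function void
example_scale_clamp_f32(f32 *out, f32 *in, f32 scale, f32 limit, u32 count)
{
	f32x4 vscale = dup_f32x4(scale);
	f32x4 vlimit = dup_f32x4(limit);
	for (u32 i = 0; i < count; i += 4) {
		/* load 4 floats, scale them, clamp to limit, store back out */
		f32x4 v = mul_f32x4(load_f32x4(in + i), vscale);
		store_f32x4(out + i, min_f32x4(v, vlimit));
	}
}
#endif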