intrinsics.c (8072B)
/* See LICENSE for license details. */
#include "compiler.h"

#if COMPILER_CLANG || COMPILER_GCC
#define force_inline inline __attribute__((always_inline))
#elif COMPILER_MSVC
#define force_inline __forceinline
#endif

#if COMPILER_MSVC || (COMPILER_CLANG && OS_WINDOWS)
#pragma section(".rdata$", read)
#define read_only __declspec(allocate(".rdata$"))
#elif COMPILER_CLANG
#define read_only __attribute__((section(".rodata")))
#elif COMPILER_GCC
/* TODO(rnp): how do we do this with gcc, putting it in rodata causes warnings and writing to
 * it doesn't cause a fault */
#define read_only
#endif

#if COMPILER_MSVC
#define alignas(n)     __declspec(align(n))
#define pack_struct(s) __pragma(pack(push, 1)) s __pragma(pack(pop))
#define no_return      __declspec(noreturn)

#define likely(x)   (x)
#define unlikely(x) (x)

#define assume(x)     __assume(x)
#define debugbreak()  __debugbreak()
#define unreachable() __assume(0)

#if ARCH_ARM64
#define cpu_yield()   __yield()
#define store_fence() __dmb(0x0A) // 0x0A: ishst
#endif

#define atomic_add_u32(ptr, n)       _InterlockedExchangeAdd((volatile u32 *)(ptr), (n))
#define atomic_add_u64(ptr, n)       _InterlockedExchangeAdd64((volatile u64 *)(ptr), (n))
#define atomic_and_u32(ptr, n)       _InterlockedAnd((volatile u32 *)(ptr), (n))
#define atomic_and_u64(ptr, n)       _InterlockedAnd64((volatile u64 *)(ptr), (n))
/* NOTE: _InterlockedCompareExchange*(dest, exchange, comparand); cas succeeds when *ptr matched *cptr */
#define atomic_cas_u32(ptr, cptr, n) (_InterlockedCompareExchange((volatile u32 *)(ptr), (n), *(cptr)) == *(cptr))
#define atomic_cas_u64(ptr, cptr, n) (_InterlockedCompareExchange64((volatile u64 *)(ptr), (n), *(cptr)) == *(cptr))
#define atomic_load_u32(ptr)         *((volatile u32 *)(ptr))
#define atomic_load_u64(ptr)         *((volatile u64 *)(ptr))
#define atomic_or_u32(ptr, n)        _InterlockedOr((volatile u32 *)(ptr), (n))
#define atomic_store_u32(ptr, n)     *((volatile u32 *)(ptr)) = (u32)(n)
#define atomic_store_u64(ptr, n)     *((volatile u64 *)(ptr)) = (u64)(n)
#define atomic_swap_u32(ptr, n)      _InterlockedExchange((volatile u32 *)(ptr), n)
#define atomic_swap_u64(ptr, n)      _InterlockedExchange64((volatile u64 *)(ptr), n)

#define atan2_f32(y, x) atan2f(y, x)
#define cos_f32(a)      cosf(a)
#define sin_f32(a)      sinf(a)
#define tan_f32(a)      tanf(a)
#define ceil_f32(a)     ceilf(a)
#define sqrt_f32(a)     sqrtf(a)

#define exp_f64(a)      exp(a)
#define sqrt_f64(a)     sqrt(a)

#else
#define alignas(n)     __attribute__((aligned(n)))
#define pack_struct(s) s __attribute__((packed))
#define no_return      __attribute__((noreturn))

#define likely(x)   (__builtin_expect(!!(x), 1))
#define unlikely(x) (__builtin_expect(!!(x), 0))

#if COMPILER_CLANG
#define assume(x) __builtin_assume(x)
#else
#if defined(__has_attribute)
#if __has_attribute(assume)
#define assume(x) __attribute__((assume(x)))
#endif
#endif
#endif
#if !defined(assume)
#define assume(x) if (!(x)) unreachable()
#endif
#define unreachable() __builtin_unreachable()
#if ARCH_ARM64
/* TODO? debuggers just loop here forever and need a manual PC increment (step over) */
#define debugbreak()  asm volatile ("brk 0xf000")
#define cpu_yield()   asm volatile ("yield")
#define store_fence() asm volatile ("dmb ishst" ::: "memory")
#else
#define debugbreak() asm volatile ("int3; nop")
#endif

#define atomic_add_u64(ptr, n)       __atomic_fetch_add(ptr, n, __ATOMIC_SEQ_CST)
#define atomic_and_u64(ptr, n)       __atomic_and_fetch(ptr, n, __ATOMIC_SEQ_CST)
#define atomic_cas_u64(ptr, cptr, n) __atomic_compare_exchange_n(ptr, cptr, n, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
#define atomic_load_u64(ptr)         __atomic_load_n(ptr, __ATOMIC_SEQ_CST)
#define atomic_or_u32(ptr, n)        __atomic_or_fetch(ptr, n, __ATOMIC_SEQ_CST)
#define atomic_store_u64(ptr, n)     __atomic_store_n(ptr, n, __ATOMIC_SEQ_CST)
#define atomic_swap_u64(ptr, n)      __atomic_exchange_n(ptr, n, __ATOMIC_SEQ_CST)
#define atomic_add_u32               atomic_add_u64
#define atomic_and_u32               atomic_and_u64
#define atomic_cas_u32               atomic_cas_u64
#define atomic_load_u32              atomic_load_u64
#define atomic_store_u32             atomic_store_u64
#define atomic_swap_u32              atomic_swap_u64

#define atan2_f32(y, x) __builtin_atan2f(y, x)
#define cos_f32(a)      __builtin_cosf(a)
#define sin_f32(a)      __builtin_sinf(a)
#define tan_f32(a)      __builtin_tanf(a)
#define ceil_f32(a)     __builtin_ceilf(a)
#define sqrt_f32(a)     __builtin_sqrtf(a)

#define exp_f64(a)      __builtin_exp(a)
#define sqrt_f64(a)     __builtin_sqrt(a)

#define popcount_u64(a) (u64)__builtin_popcountll(a)
#endif

#if COMPILER_MSVC

function force_inline u32
clz_u32(u32 a)
{
    u32 result = 32;
    unsigned long index; /* the _BitScan* intrinsics take an unsigned long index */
    if (a) {
        _BitScanReverse(&index, a);
        /* _BitScanReverse reports the MSB position; convert it to a leading zero count */
        result = 31 - index;
    }
    return result;
}

function force_inline u32
ctz_u32(u32 a)
{
    u32 result = 32;
    unsigned long index;
    if (a) {
        _BitScanForward(&index, a);
        result = index;
    }
    return result;
}

function force_inline u64
ctz_u64(u64 a)
{
    u64 result = 64;
    unsigned long index;
    if (a) {
        _BitScanForward64(&index, a);
        result = index;
    }
    return result;
}

#else /* !COMPILER_MSVC */

function force_inline u32
clz_u32(u32 a)
{
    u32 result = 32;
    if (a) result = (u32)__builtin_clz(a);
    return result;
}

function force_inline u32
ctz_u32(u32 a)
{
    u32 result = 32;
    if (a) result = (u32)__builtin_ctz(a);
    return result;
}

function force_inline u64
ctz_u64(u64 a)
{
    u64 result = 64;
    if (a) result = (u64)__builtin_ctzll(a);
    return result;
}

#endif

#if ARCH_ARM64
/* NOTE(rnp): we are only doing a handful of f32x4 operations so we will just use NEON and do
 * the macro renaming thing. If you are implementing a serious wide vector operation you should
 * use SVE(2) instead. The semantics are different however and the code will be written for an
 * arbitrary vector bit width. In that case you will also need x86_64 code for determining
 * the supported vector width (ideally at runtime though that may not be possible).
 */
#include <arm_neon.h>
typedef float32x4_t f32x4;
typedef int32x4_t   i32x4;
typedef uint32x4_t  u32x4;

#define add_f32x4(a, b)       vaddq_f32(a, b)
#define cvt_i32x4_f32x4(a)    vcvtq_f32_s32(a)
#define cvt_f32x4_i32x4(a)    vcvtq_s32_f32(a)
#define div_f32x4(a, b)       vdivq_f32(a, b)
#define dup_f32x4(f)          vdupq_n_f32(f)
#define floor_f32x4(a)        vrndmq_f32(a)
#define load_f32x4(a)         vld1q_f32(a)
#define load_i32x4(a)         vld1q_s32(a)
#define max_f32x4(a, b)       vmaxq_f32(a, b)
#define min_f32x4(a, b)       vminq_f32(a, b)
#define mul_f32x4(a, b)       vmulq_f32(a, b)
#define set_f32x4(a, b, c, d) vld1q_f32((f32 []){d, c, b, a})
#define sqrt_f32x4(a)         vsqrtq_f32(a)
#define store_f32x4(o, a)     vst1q_f32(o, a)
#define store_i32x4(o, a)     vst1q_s32(o, a)
#define sub_f32x4(a, b)       vsubq_f32(a, b)

#elif ARCH_X64
#include <immintrin.h>
typedef __m128  f32x4;
typedef __m128i i32x4;
typedef __m128i u32x4;

#define add_f32x4(a, b)       _mm_add_ps(a, b)
#define cvt_i32x4_f32x4(a)    _mm_cvtepi32_ps(a)
#define cvt_f32x4_i32x4(a)    _mm_cvtps_epi32(a)
#define div_f32x4(a, b)       _mm_div_ps(a, b)
#define dup_f32x4(f)          _mm_set1_ps(f)
#define floor_f32x4(a)        _mm_floor_ps(a)
#define load_f32x4(a)         _mm_loadu_ps(a)
#define load_i32x4(a)         _mm_loadu_si128((i32x4 *)a)
#define max_f32x4(a, b)       _mm_max_ps(a, b)
#define min_f32x4(a, b)       _mm_min_ps(a, b)
#define mul_f32x4(a, b)       _mm_mul_ps(a, b)
#define set_f32x4(a, b, c, d) _mm_set_ps(a, b, c, d)
#define sqrt_f32x4(a)         _mm_sqrt_ps(a)
#define store_f32x4(o, a)     _mm_storeu_ps(o, a)
#define store_i32x4(o, a)     _mm_storeu_si128((i32x4 *)o, a)
#define sub_f32x4(a, b)       _mm_sub_ps(a, b)

#define cpu_yield   _mm_pause
#define store_fence _mm_sfence

#endif
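
The f32x4 names above resolve to NEON intrinsics on ARM64 and SSE intrinsics on x86_64, so the same wide code compiles on either target. A minimal usage sketch, not part of intrinsics.c, assuming the f32/u64 typedefs and the `function` macro come from compiler.h and that count is a multiple of 4:

/* Illustrative only: scale an array of floats four lanes at a time using the
 * f32x4 wrappers; the name scale_f32x4 is hypothetical. */
function void
scale_f32x4(f32 *data, u64 count, f32 scale)
{
    f32x4 k = dup_f32x4(scale); /* broadcast the scale factor into all four lanes */
    for (u64 i = 0; i < count; i += 4) {
        f32x4 v = mul_f32x4(load_f32x4(data + i), k);
        store_f32x4(data + i, v);
    }
}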