intrinsics.c (8173B)
1 /* See LICENSE for license details. */ 2 #include "compiler.h" 3 4 #if COMPILER_CLANG || COMPILER_GCC 5 #define force_inline inline __attribute__((always_inline)) 6 #elif COMPILER_MSVC 7 #define force_inline __forceinline 8 #endif 9 10 #if COMPILER_MSVC || (COMPILER_CLANG && OS_WINDOWS) 11 #pragma section(".rdata$", read) 12 #define read_only __declspec(allocate(".rdata$")) 13 #elif COMPILER_CLANG 14 #define read_only __attribute__((section(".rodata"))) 15 #elif COMPILER_GCC 16 /* TODO(rnp): how do we do this with gcc, putting it in rodata causes warnings and writing to 17 * it doesn't cause a fault */ 18 #define read_only 19 #endif 20 21 #if COMPILER_MSVC 22 #define alignas(n) __declspec(align(n)) 23 #define pack_struct(s) __pragma(pack(push, 1)) s __pragma(pack(pop)) 24 #define no_return __declspec(noreturn) 25 26 #define likely(x) (x) 27 #define unlikely(x) (x) 28 29 #define print_format(f, va) 30 31 #define assume(x) __assume(x) 32 #define debugbreak() __debugbreak() 33 #define unreachable() __assume(0) 34 35 #if ARCH_ARM64 36 #define cpu_yield() __yield() 37 #define store_fence() __dmb(0x0A) // 0x0A: ishst 38 #endif 39 40 #define atomic_add_u32(ptr, n) _InterlockedExchangeAdd((volatile u32 *)(ptr), (n)) 41 #define atomic_add_u64(ptr, n) _InterlockedExchangeAdd64((volatile u64 *)(ptr), (n)) 42 #define atomic_and_u32(ptr, n) _InterlockedAnd((volatile u32 *)(ptr), (n)) 43 #define atomic_and_u64(ptr, n) _InterlockedAnd64((volatile u64 *)(ptr), (n)) 44 #define atomic_cas_u32(ptr, cptr, n) (_InterlockedCompareExchange((volatile u32 *)(ptr), *(cptr), (n)) == *(cptr)) 45 #define atomic_cas_u64(ptr, cptr, n) (_InterlockedCompareExchange64((volatile u64 *)(ptr), *(cptr), (n)) == *(cptr)) 46 #define atomic_load_u32(ptr) *((volatile u32 *)(ptr)) 47 #define atomic_load_u64(ptr) *((volatile u64 *)(ptr)) 48 #define atomic_or_u32(ptr, n) _InterlockedOr((volatile u32 *)(ptr), (n)) 49 #define atomic_store_u32(ptr, n) *((volatile u32 *)(ptr)) = (u32)(n) 50 #define atomic_store_u64(ptr, n) *((volatile u64 *)(ptr)) = (u64)(n) 51 #define atomic_swap_u32(ptr, n) _InterlockedExchange((volatile u32 *)(ptr), n) 52 #define atomic_swap_u64(ptr, n) _InterlockedExchange64((volatile u64 *)(ptr), n) 53 54 #define atan2_f32(y, x) atan2f(y, x) 55 #define cos_f32(a) cosf(a) 56 #define sin_f32(a) sinf(a) 57 #define tan_f32(a) tanf(a) 58 #define ceil_f32(a) ceilf(a) 59 #define sqrt_f32(a) sqrtf(a) 60 61 #define exp_f64(a) exp(a) 62 #define sqrt_f64(a) sqrt(a) 63 64 #else 65 #define alignas(n) __attribute__((aligned(n))) 66 #define pack_struct(s) s __attribute__((packed)) 67 #define no_return __attribute__((noreturn)) 68 69 #define likely(x) (__builtin_expect(!!(x), 1)) 70 #define unlikely(x) (__builtin_expect(!!(x), 0)) 71 72 #define print_format(f, va) __attribute__((format(printf, f, va))) 73 74 #if COMPILER_CLANG 75 #define assume(x) __builtin_assume(x) 76 #else 77 #if defined(__has_attribute) 78 #if __has_attribute(assume) 79 #define assume(x) __attribute__((assume(x))) 80 #endif 81 #endif 82 #endif 83 #if !defined(assume) 84 #define assume(x) if (!(x)) unreachable() 85 #endif 86 #define unreachable() __builtin_unreachable() 87 #if ARCH_ARM64 88 /* TODO? debuggers just loop here forever and need a manual PC increment (step over) */ 89 #define debugbreak() asm volatile ("brk 0xf000") 90 #define cpu_yield() asm volatile ("yield") 91 #define store_fence() asm volatile ("dmb ishst" ::: "memory") 92 #else 93 #define debugbreak() asm volatile ("int3; nop") 94 #endif 95 96 #define atomic_add_u64(ptr, n) __atomic_fetch_add(ptr, n, __ATOMIC_SEQ_CST) 97 #define atomic_and_u64(ptr, n) __atomic_and_fetch(ptr, n, __ATOMIC_SEQ_CST) 98 #define atomic_cas_u64(ptr, cptr, n) __atomic_compare_exchange_n(ptr, cptr, n, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) 99 #define atomic_load_u64(ptr) __atomic_load_n(ptr, __ATOMIC_SEQ_CST) 100 #define atomic_or_u32(ptr, n) __atomic_or_fetch(ptr, n, __ATOMIC_SEQ_CST) 101 #define atomic_store_u64(ptr, n) __atomic_store_n(ptr, n, __ATOMIC_SEQ_CST) 102 #define atomic_swap_u64(ptr, n) __atomic_exchange_n(ptr, n, __ATOMIC_SEQ_CST) 103 #define atomic_add_u32 atomic_add_u64 104 #define atomic_and_u32 atomic_and_u64 105 #define atomic_cas_u32 atomic_cas_u64 106 #define atomic_load_u32 atomic_load_u64 107 #define atomic_store_u32 atomic_store_u64 108 #define atomic_swap_u32 atomic_swap_u64 109 110 #define atan2_f32(y, x) __builtin_atan2f(y, x) 111 #define cos_f32(a) __builtin_cosf(a) 112 #define sin_f32(a) __builtin_sinf(a) 113 #define tan_f32(a) __builtin_tanf(a) 114 #define ceil_f32(a) __builtin_ceilf(a) 115 #define sqrt_f32(a) __builtin_sqrtf(a) 116 117 #define exp_f64(a) __builtin_exp(a) 118 #define sqrt_f64(a) __builtin_sqrt(a) 119 120 #define popcount_u64(a) (u64)__builtin_popcountll(a) 121 #endif 122 123 #if COMPILER_MSVC 124 125 function force_inline u32 126 clz_u32(u32 a) 127 { 128 u32 result = 32, index; 129 if (a) { 130 _BitScanReverse(&index, a); 131 result = index; 132 } 133 return result; 134 } 135 136 function force_inline u32 137 ctz_u32(u32 a) 138 { 139 u32 result = 32, index; 140 if (a) { 141 _BitScanForward(&index, a); 142 result = index; 143 } 144 return result; 145 } 146 147 function force_inline u64 148 ctz_u64(u64 a) 149 { 150 u64 result = 64, index; 151 if (a) { 152 _BitScanForward64(&index, a); 153 result = index; 154 } 155 return result; 156 } 157 158 #else /* !COMPILER_MSVC */ 159 160 function force_inline u32 161 clz_u32(u32 a) 162 { 163 u32 result = 32; 164 if (a) result = (u32)__builtin_clz(a); 165 return result; 166 } 167 168 function force_inline u32 169 ctz_u32(u32 a) 170 { 171 u32 result = 32; 172 if (a) result = (u32)__builtin_ctz(a); 173 return result; 174 } 175 176 function force_inline u64 177 ctz_u64(u64 a) 178 { 179 u64 result = 64; 180 if (a) result = (u64)__builtin_ctzll(a); 181 return result; 182 } 183 184 #endif 185 186 #if ARCH_ARM64 187 /* NOTE(rnp): we are only doing a handful of f32x4 operations so we will just use NEON and do 188 * the macro renaming thing. If you are implementing a serious wide vector operation you should 189 * use SVE(2) instead. The semantics are different however and the code will be written for an 190 * arbitrary vector bit width. In that case you will also need x86_64 code for determining 191 * the supported vector width (ideally at runtime though that may not be possible). 192 */ 193 #include <arm_neon.h> 194 typedef float32x4_t f32x4; 195 typedef int32x4_t i32x4; 196 typedef uint32x4_t u32x4; 197 198 #define add_f32x4(a, b) vaddq_f32(a, b) 199 #define cvt_i32x4_f32x4(a) vcvtq_f32_s32(a) 200 #define cvt_f32x4_i32x4(a) vcvtq_s32_f32(a) 201 #define div_f32x4(a, b) vdivq_f32(a, b) 202 #define dup_f32x4(f) vdupq_n_f32(f) 203 #define floor_f32x4(a) vrndmq_f32(a) 204 #define load_f32x4(a) vld1q_f32(a) 205 #define load_i32x4(a) vld1q_s32(a) 206 #define max_f32x4(a, b) vmaxq_f32(a, b) 207 #define min_f32x4(a, b) vminq_f32(a, b) 208 #define mul_f32x4(a, b) vmulq_f32(a, b) 209 #define set_f32x4(a, b, c, d) vld1q_f32((f32 []){d, c, b, a}) 210 #define sqrt_f32x4(a) vsqrtq_f32(a) 211 #define store_f32x4(o, a) vst1q_f32(o, a) 212 #define store_i32x4(o, a) vst1q_s32(o, a) 213 #define sub_f32x4(a, b) vsubq_f32(a, b) 214 215 #elif ARCH_X64 216 #include <immintrin.h> 217 typedef __m128 f32x4; 218 typedef __m128i i32x4; 219 typedef __m128i u32x4; 220 221 #define add_f32x4(a, b) _mm_add_ps(a, b) 222 #define cvt_i32x4_f32x4(a) _mm_cvtepi32_ps(a) 223 #define cvt_f32x4_i32x4(a) _mm_cvtps_epi32(a) 224 #define div_f32x4(a, b) _mm_div_ps(a, b) 225 #define dup_f32x4(f) _mm_set1_ps(f) 226 #define floor_f32x4(a) _mm_floor_ps(a) 227 #define load_f32x4(a) _mm_loadu_ps(a) 228 #define load_i32x4(a) _mm_loadu_si128((i32x4 *)a) 229 #define max_f32x4(a, b) _mm_max_ps(a, b) 230 #define min_f32x4(a, b) _mm_min_ps(a, b) 231 #define mul_f32x4(a, b) _mm_mul_ps(a, b) 232 #define set_f32x4(a, b, c, d) _mm_set_ps(a, b, c, d) 233 #define sqrt_f32x4(a) _mm_sqrt_ps(a) 234 #define store_f32x4(o, a) _mm_storeu_ps(o, a) 235 #define store_i32x4(o, a) _mm_storeu_si128((i32x4 *)o, a) 236 #define sub_f32x4(a, b) _mm_sub_ps(a, b) 237 238 #define cpu_yield _mm_pause 239 #define store_fence _mm_sfence 240 241 #endif