intrinsics.c (7452B)
/* See LICENSE for license details. */
#include "compiler.h"

#if COMPILER_CLANG || COMPILER_GCC
#define force_inline inline __attribute__((always_inline))
#elif COMPILER_MSVC
#define force_inline __forceinline
#endif

#if COMPILER_MSVC || (COMPILER_CLANG && OS_WINDOWS)
#pragma section(".rdata$", read)
#define read_only __declspec(allocate(".rdata$"))
#elif COMPILER_CLANG
#define read_only __attribute__((section(".rodata")))
#elif COMPILER_GCC
/* TODO(rnp): how do we do this with gcc, putting it in rodata causes warnings and writing to
 * it doesn't cause a fault */
#define read_only
#endif

#if COMPILER_MSVC
#define alignas(n)     __declspec(align(n))
#define pack_struct(s) __pragma(pack(push, 1)) s __pragma(pack(pop))
#define no_return      __declspec(noreturn)

#define debugbreak()  __debugbreak()
#define unreachable() __assume(0)

#if ARCH_ARM64
#define cpu_yield() __yield()
#endif

#define memory_write_barrier() _WriteBarrier()

#define atomic_add_u32(ptr, n)       _InterlockedExchangeAdd((volatile u32 *)(ptr), (n))
#define atomic_add_u64(ptr, n)       _InterlockedExchangeAdd64((volatile u64 *)(ptr), (n))
#define atomic_and_u32(ptr, n)       _InterlockedAnd((volatile u32 *)(ptr), (n))
#define atomic_and_u64(ptr, n)       _InterlockedAnd64((volatile u64 *)(ptr), (n))
#define atomic_cas_u32(ptr, cptr, n) (_InterlockedCompareExchange((volatile u32 *)(ptr), (n), *(cptr)) == *(cptr))
#define atomic_cas_u64(ptr, cptr, n) (_InterlockedCompareExchange64((volatile u64 *)(ptr), (n), *(cptr)) == *(cptr))
#define atomic_load_u32(ptr)         *((volatile u32 *)(ptr))
#define atomic_load_u64(ptr)         *((volatile u64 *)(ptr))
#define atomic_or_u32(ptr, n)        _InterlockedOr((volatile u32 *)(ptr), (n))
#define atomic_store_u32(ptr, n)     *((volatile u32 *)(ptr)) = (u32)(n)
#define atomic_store_u64(ptr, n)     *((volatile u64 *)(ptr)) = (u64)(n)
#define atomic_swap_u32(ptr, n)      _InterlockedExchange((volatile u32 *)(ptr), (n))
#define atomic_swap_u64(ptr, n)      _InterlockedExchange64((volatile u64 *)(ptr), (n))

#define atan2_f32(y, x) atan2f(y, x)
#define cos_f32(a)      cosf(a)
#define sin_f32(a)      sinf(a)
#define tan_f32(a)      tanf(a)
#define ceil_f32(a)     ceilf(a)
#define sqrt_f32(a)     sqrtf(a)

#define exp_f64(a)  exp(a)
#define sqrt_f64(a) sqrt(a)

#else
#define alignas(n)     __attribute__((aligned(n)))
#define pack_struct(s) s __attribute__((packed))
#define no_return      __attribute__((noreturn))

#if ARCH_ARM64
/* TODO? debuggers just loop here forever and need a manual PC increment (step over) */
#define debugbreak() asm volatile ("brk 0xf000")
#define cpu_yield()  asm volatile ("yield")
#else
#define debugbreak() asm volatile ("int3; nop")
#endif
#define unreachable() __builtin_unreachable()

#define memory_write_barrier() asm volatile ("" ::: "memory")

#define atomic_add_u64(ptr, n)       __atomic_fetch_add(ptr, n, __ATOMIC_SEQ_CST)
#define atomic_and_u64(ptr, n)       __atomic_and_fetch(ptr, n, __ATOMIC_SEQ_CST)
#define atomic_cas_u64(ptr, cptr, n) __atomic_compare_exchange_n(ptr, cptr, n, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
#define atomic_load_u64(ptr)         __atomic_load_n(ptr, __ATOMIC_SEQ_CST)
#define atomic_or_u32(ptr, n)        __atomic_or_fetch(ptr, n, __ATOMIC_SEQ_CST)
#define atomic_store_u64(ptr, n)     __atomic_store_n(ptr, n, __ATOMIC_SEQ_CST)
#define atomic_swap_u64(ptr, n)      __atomic_exchange_n(ptr, n, __ATOMIC_SEQ_CST)
/* NOTE: the __atomic builtins are type generic so the u32 forms can alias the u64 ones */
#define atomic_add_u32   atomic_add_u64
#define atomic_and_u32   atomic_and_u64
#define atomic_cas_u32   atomic_cas_u64
#define atomic_load_u32  atomic_load_u64
#define atomic_store_u32 atomic_store_u64
#define atomic_swap_u32  atomic_swap_u64

#define atan2_f32(y, x) __builtin_atan2f(y, x)
#define cos_f32(a)      __builtin_cosf(a)
#define sin_f32(a)      __builtin_sinf(a)
#define tan_f32(a)      __builtin_tanf(a)
#define ceil_f32(a)     __builtin_ceilf(a)
#define sqrt_f32(a)     __builtin_sqrtf(a)

#define exp_f64(a)  __builtin_exp(a)
#define sqrt_f64(a) __builtin_sqrt(a)
#endif

#if COMPILER_MSVC

function force_inline u32
clz_u32(u32 a)
{
	u32 result = 32, index;
	if (a) {
		_BitScanReverse(&index, a);
		result = 31 - index;
	}
	return result;
}

function force_inline u32
ctz_u32(u32 a)
{
	u32 result = 32, index;
	if (a) {
		_BitScanForward(&index, a);
		result = index;
	}
	return result;
}

function force_inline u64
ctz_u64(u64 a)
{
	u64 result = 64, index;
	if (a) {
		_BitScanForward64(&index, a);
		result = index;
	}
	return result;
}

#else /* !COMPILER_MSVC */

function force_inline u32
clz_u32(u32 a)
{
	u32 result = 32;
	if (a) result = (u32)__builtin_clz(a);
	return result;
}

function force_inline u32
ctz_u32(u32 a)
{
	u32 result = 32;
	if (a) result = (u32)__builtin_ctz(a);
	return result;
}

function force_inline u64
ctz_u64(u64 a)
{
	u64 result = 64;
	if (a) result = (u64)__builtin_ctzll(a);
	return result;
}

#endif

#if ARCH_ARM64
/* NOTE(rnp): we are only doing a handful of f32x4 operations so we will just use NEON and do
 * the macro renaming thing. If you are implementing a serious wide vector operation you should
 * use SVE(2) instead. The semantics are different however and the code will be written for an
 * arbitrary vector bit width. In that case you will also need x86_64 code for determining
 * the supported vector width (ideally at runtime though that may not be possible).
 */
#include <arm_neon.h>
typedef float32x4_t f32x4;
typedef int32x4_t   i32x4;
typedef uint32x4_t  u32x4;

#define add_f32x4(a, b)       vaddq_f32(a, b)
#define cvt_i32x4_f32x4(a)    vcvtq_f32_s32(a)
#define cvt_f32x4_i32x4(a)    vcvtq_s32_f32(a)
#define div_f32x4(a, b)       vdivq_f32(a, b)
#define dup_f32x4(f)          vdupq_n_f32(f)
#define floor_f32x4(a)        vrndmq_f32(a)
#define load_f32x4(a)         vld1q_f32(a)
#define load_i32x4(a)         vld1q_s32(a)
#define max_f32x4(a, b)       vmaxq_f32(a, b)
#define min_f32x4(a, b)       vminq_f32(a, b)
#define mul_f32x4(a, b)       vmulq_f32(a, b)
#define set_f32x4(a, b, c, d) vld1q_f32((f32 []){d, c, b, a})
#define sqrt_f32x4(a)         vsqrtq_f32(a)
#define store_f32x4(o, a)     vst1q_f32(o, a)
#define store_i32x4(o, a)     vst1q_s32(o, a)
#define sub_f32x4(a, b)       vsubq_f32(a, b)

#elif ARCH_X64
#include <immintrin.h>
typedef __m128  f32x4;
typedef __m128i i32x4;
typedef __m128i u32x4;

#define add_f32x4(a, b)       _mm_add_ps(a, b)
#define cvt_i32x4_f32x4(a)    _mm_cvtepi32_ps(a)
#define cvt_f32x4_i32x4(a)    _mm_cvtps_epi32(a)
#define div_f32x4(a, b)       _mm_div_ps(a, b)
#define dup_f32x4(f)          _mm_set1_ps(f)
#define floor_f32x4(a)        _mm_floor_ps(a)
#define load_f32x4(a)         _mm_loadu_ps(a)
#define load_i32x4(a)         _mm_loadu_si128((i32x4 *)a)
#define max_f32x4(a, b)       _mm_max_ps(a, b)
#define min_f32x4(a, b)       _mm_min_ps(a, b)
#define mul_f32x4(a, b)       _mm_mul_ps(a, b)
#define set_f32x4(a, b, c, d) _mm_set_ps(a, b, c, d)
#define sqrt_f32x4(a)         _mm_sqrt_ps(a)
#define store_f32x4(o, a)     _mm_storeu_ps(o, a)
#define store_i32x4(o, a)     _mm_storeu_si128((i32x4 *)o, a)
#define sub_f32x4(a, b)       _mm_sub_ps(a, b)

#define cpu_yield() _mm_pause()

#endif
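
Usage sketch (not part of intrinsics.c): the wrappers above are selected at compile time, so call
sites read the same on every supported compiler and architecture. The two helpers below are a
minimal, hypothetical illustration, assuming the f32/u32 typedefs and the `function` storage-class
macro come from compiler.h as the code above implies.

/* hypothetical helper: scale four samples by a gain and clamp them to [lo, hi] in place */
function void
scale_clamp4_f32(f32 *samples, f32 gain, f32 lo, f32 hi)
{
	f32x4 v = load_f32x4(samples);                             /* unaligned load of 4 floats */
	v = mul_f32x4(v, dup_f32x4(gain));                         /* broadcast gain, multiply   */
	v = min_f32x4(max_f32x4(v, dup_f32x4(lo)), dup_f32x4(hi)); /* clamp to [lo, hi]          */
	store_f32x4(samples, v);                                   /* write the results back     */
}

/* hypothetical helper: spin until *flag transitions 0 -> 1, yielding the core between attempts */
function void
spin_acquire_u32(u32 *flag)
{
	for (;;) {
		u32 expected = 0;
		if (atomic_cas_u32(flag, &expected, 1)) break;
		cpu_yield();
	}
}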