intrinsics.c (7288B)
/* See LICENSE for license details. */
#include "compiler.h"

#if COMPILER_CLANG || COMPILER_GCC
#define force_inline inline __attribute__((always_inline))
#elif COMPILER_MSVC
#define force_inline __forceinline
#endif

#if COMPILER_MSVC || (COMPILER_CLANG && OS_WINDOWS)
#pragma section(".rdata$", read)
#define read_only __declspec(allocate(".rdata$"))
#elif COMPILER_CLANG
#define read_only __attribute__((section(".rodata")))
#elif COMPILER_GCC
/* TODO(rnp): how do we do this with gcc, putting it in rodata causes warnings and writing to
 * it doesn't cause a fault */
#define read_only
#endif

#if COMPILER_MSVC
#define alignas(n)     __declspec(align(n))
#define pack_struct(s) __pragma(pack(push, 1)) s __pragma(pack(pop))
#define no_return      __declspec(noreturn)

#define debugbreak()  __debugbreak()
#define unreachable() __assume(0)

#define memory_write_barrier() _WriteBarrier()

#define atomic_add_u32(ptr, n)       _InterlockedExchangeAdd((volatile u32 *)(ptr), (n))
#define atomic_add_u64(ptr, n)       _InterlockedExchangeAdd64((volatile u64 *)(ptr), (n))
#define atomic_and_u32(ptr, n)       _InterlockedAnd((volatile u32 *)(ptr), (n))
#define atomic_and_u64(ptr, n)       _InterlockedAnd64((volatile u64 *)(ptr), (n))
#define atomic_cas_u32(ptr, cptr, n) (_InterlockedCompareExchange((volatile u32 *)(ptr), (n), *(cptr)) == *(cptr))
#define atomic_cas_u64(ptr, cptr, n) (_InterlockedCompareExchange64((volatile u64 *)(ptr), (n), *(cptr)) == *(cptr))
#define atomic_load_u32(ptr)         *((volatile u32 *)(ptr))
#define atomic_load_u64(ptr)         *((volatile u64 *)(ptr))
#define atomic_or_u32(ptr, n)        _InterlockedOr((volatile u32 *)(ptr), (n))
#define atomic_store_u32(ptr, n)     *((volatile u32 *)(ptr)) = (n)
#define atomic_store_u64(ptr, n)     *((volatile u64 *)(ptr)) = (n)
#define atomic_swap_u32(ptr, n)      _InterlockedExchange((volatile u32 *)(ptr), (n))
#define atomic_swap_u64(ptr, n)      _InterlockedExchange64((volatile u64 *)(ptr), (n))

#define atan2_f32(y, x) atan2f(y, x)
#define cos_f32(a)      cosf(a)
#define sin_f32(a)      sinf(a)
#define tan_f32(a)      tanf(a)
#define ceil_f32(a)     ceilf(a)
#define sqrt_f32(a)     sqrtf(a)

#define exp_f64(a)  exp(a)
#define sqrt_f64(a) sqrt(a)

#else
#define alignas(n)     __attribute__((aligned(n)))
#define pack_struct(s) s __attribute__((packed))
#define no_return      __attribute__((noreturn))

#if ARCH_ARM64
/* TODO? debuggers just loop here forever and need a manual PC increment (step over) */
#define debugbreak() asm volatile ("brk 0xf000")
#else
#define debugbreak() asm volatile ("int3; nop")
#endif
#define unreachable() __builtin_unreachable()

#define memory_write_barrier() asm volatile ("" ::: "memory")

#define atomic_add_u64(ptr, n)       __atomic_fetch_add(ptr, n, __ATOMIC_ACQ_REL)
#define atomic_and_u64(ptr, n)       __atomic_and_fetch(ptr, n, __ATOMIC_RELEASE)
#define atomic_cas_u64(ptr, cptr, n) __atomic_compare_exchange_n(ptr, cptr, n, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
#define atomic_load_u64(ptr)         __atomic_load_n(ptr, __ATOMIC_ACQUIRE)
#define atomic_or_u32(ptr, n)        __atomic_or_fetch(ptr, n, __ATOMIC_RELEASE)
#define atomic_store_u64(ptr, n)     __atomic_store_n(ptr, n, __ATOMIC_RELEASE)
#define atomic_swap_u64(ptr, n)      __atomic_exchange_n(ptr, n, __ATOMIC_RELEASE)
#define atomic_add_u32               atomic_add_u64
#define atomic_and_u32               atomic_and_u64
#define atomic_cas_u32               atomic_cas_u64
#define atomic_load_u32              atomic_load_u64
#define atomic_store_u32             atomic_store_u64
#define atomic_swap_u32              atomic_swap_u64

#define atan2_f32(y, x) __builtin_atan2f(y, x)
#define cos_f32(a)      __builtin_cosf(a)
#define sin_f32(a)      __builtin_sinf(a)
#define tan_f32(a)      __builtin_tanf(a)
#define ceil_f32(a)     __builtin_ceilf(a)
#define sqrt_f32(a)     __builtin_sqrtf(a)

#define exp_f64(a)  __builtin_exp(a)
#define sqrt_f64(a) __builtin_sqrt(a)
#endif
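
/* Example (illustrative sketch only, excluded from the build): the atomic macros above can
 * back a minimal spin lock. This assumes the u32 type and the `function` keyword macro come
 * from compiler.h; the example_* names are hypothetical and not used elsewhere in this file. */
#if 0
function void
example_lock_acquire(u32 *lock)
{
    u32 expected = 0;
    /* retry until *lock transitions from 0 to 1; reset expected because the
     * compare-exchange may overwrite it with the value it observed */
    while (!atomic_cas_u32(lock, &expected, 1))
        expected = 0;
}

function void
example_lock_release(u32 *lock)
{
    atomic_store_u32(lock, 0);
}
#endif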

#if COMPILER_MSVC

function force_inline u32
clz_u32(u32 a)
{
    u32 result = 32;
    unsigned long index;
    if (a) {
        _BitScanReverse(&index, a);
        /* _BitScanReverse yields the bit index of the highest set bit; convert it to a
         * leading-zero count so the result matches the builtin version below */
        result = 31 - index;
    }
    return result;
}

function force_inline u32
ctz_u32(u32 a)
{
    u32 result = 32;
    unsigned long index;
    if (a) {
        _BitScanForward(&index, a);
        result = index;
    }
    return result;
}

function force_inline u64
ctz_u64(u64 a)
{
    u64 result = 64;
    unsigned long index;
    if (a) {
        _BitScanForward64(&index, a);
        result = index;
    }
    return result;
}

#else /* !COMPILER_MSVC */

function force_inline u32
clz_u32(u32 a)
{
    u32 result = 32;
    if (a) result = (u32)__builtin_clz(a);
    return result;
}

function force_inline u32
ctz_u32(u32 a)
{
    u32 result = 32;
    if (a) result = (u32)__builtin_ctz(a);
    return result;
}

function force_inline u64
ctz_u64(u64 a)
{
    u64 result = 64;
    if (a) result = (u64)__builtin_ctzll(a);
    return result;
}

#endif
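
/* Example (illustrative sketch only, excluded from the build): ctz_u32 gives the index of the
 * lowest set bit, which makes iterating a bitmask straightforward. The u32 type and `function`
 * keyword macro are assumed to come from compiler.h; example_for_each_set_bit is hypothetical. */
#if 0
function void
example_for_each_set_bit(u32 mask, void (*callback)(u32 bit_index))
{
    while (mask) {
        callback(ctz_u32(mask));
        mask &= mask - 1; /* clear the lowest set bit */
    }
}
#endif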

#if ARCH_ARM64
/* NOTE(rnp): we are only doing a handful of f32x4 operations so we will just use NEON and do
 * the macro renaming thing. If you are implementing a serious wide vector operation you should
 * use SVE(2) instead. The semantics are different however and the code will be written for an
 * arbitrary vector bit width. In that case you will also need x86_64 code for determining
 * the supported vector width (ideally at runtime though that may not be possible).
 */
#include <arm_neon.h>
typedef float32x4_t f32x4;
typedef int32x4_t   i32x4;
typedef uint32x4_t  u32x4;

#define add_f32x4(a, b)       vaddq_f32(a, b)
#define cvt_i32x4_f32x4(a)    vcvtq_f32_s32(a)
#define cvt_f32x4_i32x4(a)    vcvtq_s32_f32(a)
#define div_f32x4(a, b)       vdivq_f32(a, b)
#define dup_f32x4(f)          vdupq_n_f32(f)
#define floor_f32x4(a)        vrndmq_f32(a)
#define load_f32x4(a)         vld1q_f32(a)
#define load_i32x4(a)         vld1q_s32(a)
#define max_f32x4(a, b)       vmaxq_f32(a, b)
#define min_f32x4(a, b)       vminq_f32(a, b)
#define mul_f32x4(a, b)       vmulq_f32(a, b)
#define set_f32x4(a, b, c, d) vld1q_f32((f32 []){d, c, b, a})
#define sqrt_f32x4(a)         vsqrtq_f32(a)
#define store_f32x4(o, a)     vst1q_f32(o, a)
#define store_i32x4(o, a)     vst1q_s32(o, a)
#define sub_f32x4(a, b)       vsubq_f32(a, b)

#elif ARCH_X64
#include <immintrin.h>
typedef __m128  f32x4;
typedef __m128i i32x4;
typedef __m128i u32x4;

#define add_f32x4(a, b)       _mm_add_ps(a, b)
#define cvt_i32x4_f32x4(a)    _mm_cvtepi32_ps(a)
#define cvt_f32x4_i32x4(a)    _mm_cvtps_epi32(a)
#define div_f32x4(a, b)       _mm_div_ps(a, b)
#define dup_f32x4(f)          _mm_set1_ps(f)
#define floor_f32x4(a)        _mm_floor_ps(a)
#define load_f32x4(a)         _mm_loadu_ps(a)
#define load_i32x4(a)         _mm_loadu_si128((i32x4 *)a)
#define max_f32x4(a, b)       _mm_max_ps(a, b)
#define min_f32x4(a, b)       _mm_min_ps(a, b)
#define mul_f32x4(a, b)       _mm_mul_ps(a, b)
#define set_f32x4(a, b, c, d) _mm_set_ps(a, b, c, d)
#define sqrt_f32x4(a)         _mm_sqrt_ps(a)
#define store_f32x4(o, a)     _mm_storeu_ps(o, a)
#define store_i32x4(o, a)     _mm_storeu_si128((i32x4 *)o, a)
#define sub_f32x4(a, b)       _mm_sub_ps(a, b)

#endif
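
/* Example (illustrative sketch only, excluded from the build): the f32x4 wrappers are intended
 * to be used identically on both architectures. This hypothetical helper computes
 * out[i] = a[i] * scale + b[i], assuming count is a multiple of 4, that f32/u32 come from
 * compiler.h, and that the build targets ARM64 or x86_64 so f32x4 exists. */
#if 0
function void
example_scale_add_f32x4(f32 *out, f32 *a, f32 *b, f32 scale, u32 count)
{
    f32x4 vscale = dup_f32x4(scale);
    for (u32 i = 0; i < count; i += 4) {
        f32x4 va = load_f32x4(a + i);
        f32x4 vb = load_f32x4(b + i);
        store_f32x4(out + i, add_f32x4(mul_f32x4(va, vscale), vb));
    }
}
#endif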