intrinsics.c (6646B)
/* See LICENSE for license details. */
#include "compiler.h"

#if COMPILER_CLANG || COMPILER_GCC
#define force_inline inline __attribute__((always_inline))
#elif COMPILER_MSVC
#define force_inline __forceinline
#endif

#if COMPILER_MSVC || (COMPILER_CLANG && OS_WINDOWS)
#pragma section(".rdata$", read)
#define read_only __declspec(allocate(".rdata$"))
#elif COMPILER_CLANG
#define read_only __attribute__((section(".rodata")))
#elif COMPILER_GCC
/* TODO(rnp): how do we do this with gcc, putting it in rodata causes warnings and writing to
 * it doesn't cause a fault */
#define read_only
#endif

#if COMPILER_MSVC
#define alignas(n) __declspec(align(n))
#define pack_struct(s) __pragma(pack(push, 1)) s __pragma(pack(pop))
#define no_return __declspec(noreturn)

#define debugbreak() __debugbreak()
#define unreachable() __assume(0)

#define memory_write_barrier() _WriteBarrier()

#define atomic_store_u64(ptr, n) *((volatile u64 *)(ptr)) = (n)
#define atomic_store_u32(ptr, n) *((volatile u32 *)(ptr)) = (n)
#define atomic_load_u64(ptr) *((volatile u64 *)(ptr))
#define atomic_load_u32(ptr) *((volatile u32 *)(ptr))
#define atomic_add_u64(ptr, n) _InterlockedExchangeAdd64((volatile u64 *)(ptr), (n))
#define atomic_add_u32(ptr, n) _InterlockedExchangeAdd((volatile u32 *)(ptr), (n))
#define atomic_and_u64(ptr, n) _InterlockedAnd64((volatile u64 *)(ptr), (n))
#define atomic_and_u32(ptr, n) _InterlockedAnd((volatile u32 *)(ptr), (n))
#define atomic_cas_u64(ptr, cptr, n) (_InterlockedCompareExchange64((volatile u64 *)(ptr), (n), *(cptr)) == *(cptr))
#define atomic_cas_u32(ptr, cptr, n) (_InterlockedCompareExchange((volatile u32 *)(ptr), (n), *(cptr)) == *(cptr))
#define atomic_or_u32(ptr, n) _InterlockedOr((volatile u32 *)(ptr), (n))

#define atan2_f32(y, x) atan2f(y, x)
#define cos_f32(a) cosf(a)
#define sin_f32(a) sinf(a)
#define tan_f32(a) tanf(a)
#define ceil_f32(a) ceilf(a)
#define sqrt_f32(a) sqrtf(a)

#define exp_f64(a) exp(a)
#define sqrt_f64(a) sqrt(a)

#else
#define alignas(n) __attribute__((aligned(n)))
#define pack_struct(s) s __attribute__((packed))
#define no_return __attribute__((noreturn))

#if ARCH_ARM64
/* TODO? debuggers just loop here forever and need a manual PC increment (step over) */
#define debugbreak() asm volatile ("brk 0xf000")
#else
#define debugbreak() asm volatile ("int3; nop")
#endif
#define unreachable() __builtin_unreachable()

#define memory_write_barrier() asm volatile ("" ::: "memory")

#define atomic_store_u64(ptr, n) __atomic_store_n(ptr, n, __ATOMIC_RELEASE)
#define atomic_load_u64(ptr) __atomic_load_n(ptr, __ATOMIC_ACQUIRE)
#define atomic_add_u64(ptr, n) __atomic_fetch_add(ptr, n, __ATOMIC_ACQ_REL)
#define atomic_and_u64(ptr, n) __atomic_and_fetch(ptr, n, __ATOMIC_RELEASE)
#define atomic_cas_u64(ptr, cptr, n) __atomic_compare_exchange_n(ptr, cptr, n, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
#define atomic_or_u32(ptr, n) __atomic_or_fetch(ptr, n, __ATOMIC_RELEASE)
#define atomic_add_u32 atomic_add_u64
#define atomic_and_u32 atomic_and_u64
#define atomic_cas_u32 atomic_cas_u64
#define atomic_load_u32 atomic_load_u64
#define atomic_store_u32 atomic_store_u64

#define atan2_f32(y, x) __builtin_atan2f(y, x)
#define cos_f32(a) __builtin_cosf(a)
#define sin_f32(a) __builtin_sinf(a)
#define tan_f32(a) __builtin_tanf(a)
#define ceil_f32(a) __builtin_ceilf(a)
#define sqrt_f32(a) __builtin_sqrtf(a)

#define exp_f64(a) __builtin_exp(a)
#define sqrt_f64(a) __builtin_sqrt(a)
#endif
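
/* NOTE: illustrative sketch, not part of the original file; it only shows the calling
 * convention of the atomic wrappers above (the CAS macros take a pointer to the expected
 * value and evaluate to nonzero on success). `function` and `u32` are assumed to come from
 * compiler.h; the example names are hypothetical. */
function void
example_spin_lock(u32 *lock)
{
	/* spin until the lock word is swapped from 0 (free) to 1 (held) */
	for (;;) {
		u32 expected = 0;
		if (atomic_cas_u32(lock, &expected, 1))
			break;
	}
}

function void
example_spin_unlock(u32 *lock)
{
	/* store 0 to mark the lock free again */
	atomic_store_u32(lock, 0);
}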

#if COMPILER_MSVC

function force_inline u32
clz_u32(u32 a)
{
	u32 result = 32, index;
	if (a) {
		_BitScanReverse(&index, a);
		result = 31 - index;
	}
	return result;
}

function force_inline u32
ctz_u32(u32 a)
{
	u32 result = 32, index;
	if (a) {
		_BitScanForward(&index, a);
		result = index;
	}
	return result;
}

#else /* !COMPILER_MSVC */

function force_inline u32
clz_u32(u32 a)
{
	u32 result = 32;
	if (a) result = (u32)__builtin_clz(a);
	return result;
}

function force_inline u32
ctz_u32(u32 a)
{
	u32 result = 32;
	if (a) result = (u32)__builtin_ctz(a);
	return result;
}

#endif
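
/* NOTE: illustrative sketch, not part of the original file; it shows a typical use of
 * clz_u32 (both helpers return 32 when the input is 0). `function` and `u32` are assumed
 * to come from compiler.h; the function name is hypothetical. */
function u32
example_round_up_power_of_2(u32 a)
{
	/* smallest power of two >= a, assuming 0 < a <= 0x80000000 so the shift stays in range */
	u32 result = 1;
	if (a > 1) result = 1u << (32 - clz_u32(a - 1));
	return result;
}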

#if ARCH_ARM64
/* NOTE(rnp): we are only doing a handful of f32x4 operations so we will just use NEON and do
 * the macro renaming thing. If you are implementing a serious wide vector operation you should
 * use SVE(2) instead. The semantics are different however and the code will be written for an
 * arbitrary vector bit width. In that case you will also need x86_64 code for determining
 * the supported vector width (ideally at runtime though that may not be possible).
 */
#include <arm_neon.h>
typedef float32x4_t f32x4;
typedef int32x4_t   i32x4;

#define add_f32x4(a, b)       vaddq_f32(a, b)
#define cvt_i32x4_f32x4(a)    vcvtq_f32_s32(a)
#define cvt_f32x4_i32x4(a)    vcvtq_s32_f32(a)
#define div_f32x4(a, b)       vdivq_f32(a, b)
#define dup_f32x4(f)          vdupq_n_f32(f)
#define floor_f32x4(a)        vrndmq_f32(a)
#define load_f32x4(a)         vld1q_f32(a)
#define load_i32x4(a)         vld1q_s32(a)
#define max_f32x4(a, b)       vmaxq_f32(a, b)
#define min_f32x4(a, b)       vminq_f32(a, b)
#define mul_f32x4(a, b)       vmulq_f32(a, b)
#define set_f32x4(a, b, c, d) vld1q_f32((f32 []){d, c, b, a})
#define sqrt_f32x4(a)         vsqrtq_f32(a)
#define store_f32x4(o, a)     vst1q_f32(o, a)
#define store_i32x4(o, a)     vst1q_s32(o, a)
#define sub_f32x4(a, b)       vsubq_f32(a, b)

#elif ARCH_X64
#include <immintrin.h>
typedef __m128  f32x4;
typedef __m128i i32x4;

#define add_f32x4(a, b)       _mm_add_ps(a, b)
#define cvt_i32x4_f32x4(a)    _mm_cvtepi32_ps(a)
#define cvt_f32x4_i32x4(a)    _mm_cvtps_epi32(a)
#define div_f32x4(a, b)       _mm_div_ps(a, b)
#define dup_f32x4(f)          _mm_set1_ps(f)
#define floor_f32x4(a)        _mm_floor_ps(a)
#define load_f32x4(a)         _mm_loadu_ps(a)
#define load_i32x4(a)         _mm_loadu_si128((i32x4 *)a)
#define max_f32x4(a, b)       _mm_max_ps(a, b)
#define min_f32x4(a, b)       _mm_min_ps(a, b)
#define mul_f32x4(a, b)       _mm_mul_ps(a, b)
#define set_f32x4(a, b, c, d) _mm_set_ps(a, b, c, d)
#define sqrt_f32x4(a)         _mm_sqrt_ps(a)
#define store_f32x4(o, a)     _mm_storeu_ps(o, a)
#define store_i32x4(o, a)     _mm_storeu_si128((i32x4 *)o, a)
#define sub_f32x4(a, b)       _mm_sub_ps(a, b)

#endif
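
/* NOTE: illustrative sketch, not part of the original file; it exercises the f32x4 wrappers
 * above on a single 4-float chunk. `function` and `f32` are assumed to come from compiler.h;
 * the function name is hypothetical. */
function void
example_lerp_f32x4(f32 *out, f32 *a, f32 *b, f32 t)
{
	/* out = a + (b - a) * t, four lanes at a time */
	f32x4 va = load_f32x4(a);
	f32x4 vb = load_f32x4(b);
	f32x4 vt = dup_f32x4(t);
	store_f32x4(out, add_f32x4(va, mul_f32x4(sub_f32x4(vb, va), vt)));
}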