intrinsics.c (6491B)
/* See LICENSE for license details. */
#include "compiler.h"

#if COMPILER_CLANG || COMPILER_GCC
#define force_inline inline __attribute__((always_inline))
#elif COMPILER_MSVC
#define force_inline __forceinline
#endif

#if COMPILER_MSVC || (COMPILER_CLANG && OS_WINDOWS)
#pragma section(".rdata$", read)
#define read_only __declspec(allocate(".rdata$"))
#elif COMPILER_CLANG
#define read_only __attribute__((section(".rodata")))
#elif COMPILER_GCC
/* TODO(rnp): how do we do this with gcc, putting it in rodata causes warnings and writing to
 * it doesn't cause a fault */
#define read_only
#endif

#if COMPILER_MSVC
#define align_as(n) __declspec(align(n))
#define pack_struct(s) __pragma(pack(push, 1)) s __pragma(pack(pop))
#define no_return __declspec(noreturn)

#define debugbreak() __debugbreak()
#define unreachable() __assume(0)

#define memory_write_barrier() _WriteBarrier()

#define atomic_store_u64(ptr, n) *((volatile u64 *)(ptr)) = (n)
#define atomic_store_u32(ptr, n) *((volatile u32 *)(ptr)) = (n)
#define atomic_load_u64(ptr) *((volatile u64 *)(ptr))
#define atomic_load_u32(ptr) *((volatile u32 *)(ptr))
#define atomic_add_u64(ptr, n) _InterlockedExchangeAdd64((volatile u64 *)(ptr), (n))
#define atomic_add_u32(ptr, n) _InterlockedExchangeAdd((volatile u32 *)(ptr), (n))
#define atomic_and_u64(ptr, n) _InterlockedAnd64((volatile u64 *)(ptr), (n))
#define atomic_and_u32(ptr, n) _InterlockedAnd((volatile u32 *)(ptr), (n))
/* NOTE: _InterlockedCompareExchange*() arguments are (Destination, Exchange, Comparand) */
#define atomic_cas_u64(ptr, cptr, n) (_InterlockedCompareExchange64((volatile u64 *)(ptr), (n), *(cptr)) == *(cptr))
#define atomic_cas_u32(ptr, cptr, n) (_InterlockedCompareExchange((volatile u32 *)(ptr), (n), *(cptr)) == *(cptr))
#define atomic_or_u32(ptr, n) _InterlockedOr((volatile u32 *)(ptr), (n))

#define atan2_f32(y, x) atan2f(y, x)
#define cos_f32(a) cosf(a)
#define sin_f32(a) sinf(a)
#define tan_f32(a) tanf(a)
#define ceil_f32(a) ceilf(a)
#define sqrt_f32(a) sqrtf(a)

#else
#define align_as(n) __attribute__((aligned(n)))
#define pack_struct(s) s __attribute__((packed))
#define no_return __attribute__((noreturn))

#if ARCH_ARM64
/* TODO? debuggers just loop here forever and need a manual PC increment (step over) */
#define debugbreak() asm volatile ("brk 0xf000")
#else
#define debugbreak() asm volatile ("int3; nop")
#endif
#define unreachable() __builtin_unreachable()

#define memory_write_barrier() asm volatile ("" ::: "memory")

#define atomic_store_u64(ptr, n) __atomic_store_n(ptr, n, __ATOMIC_RELEASE)
#define atomic_load_u64(ptr) __atomic_load_n(ptr, __ATOMIC_ACQUIRE)
#define atomic_add_u64(ptr, n) __atomic_fetch_add(ptr, n, __ATOMIC_ACQ_REL)
#define atomic_and_u64(ptr, n) __atomic_and_fetch(ptr, n, __ATOMIC_RELEASE)
#define atomic_cas_u64(ptr, cptr, n) __atomic_compare_exchange_n(ptr, cptr, n, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
#define atomic_or_u32(ptr, n) __atomic_or_fetch(ptr, n, __ATOMIC_RELEASE)
#define atomic_add_u32 atomic_add_u64
#define atomic_and_u32 atomic_and_u64
#define atomic_cas_u32 atomic_cas_u64
#define atomic_load_u32 atomic_load_u64
#define atomic_store_u32 atomic_store_u64

#define atan2_f32(y, x) __builtin_atan2f(y, x)
#define cos_f32(a) __builtin_cosf(a)
#define sin_f32(a) __builtin_sinf(a)
#define tan_f32(a) __builtin_tanf(a)
#define ceil_f32(a) __builtin_ceilf(a)
#define sqrt_f32(a) __builtin_sqrtf(a)

#endif
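
/* NOTE: usage sketch, not part of the original file: atomic_cas_u64() returns non-zero when the
 * exchange succeeded. The gcc/clang path refreshes *cptr with the observed value on failure
 * while the MSVC wrapper leaves it untouched, so a portable retry loop should reload explicitly
 * (counter is a hypothetical u64, with u64 assumed to come from compiler.h):
 *
 *   u64 expected;
 *   do {
 *       expected = atomic_load_u64(&counter);
 *   } while (!atomic_cas_u64(&counter, &expected, expected + 1));
 */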

#if COMPILER_MSVC

function force_inline u32
clz_u32(u32 a)
{
	u32 result = 32, index;
	if (a) {
		_BitScanReverse(&index, a);
		/* NOTE: index is the MSB position; convert so this matches __builtin_clz() below */
		result = 31 - index;
	}
	return result;
}

function force_inline u32
ctz_u32(u32 a)
{
	u32 result = 32, index;
	if (a) {
		_BitScanForward(&index, a);
		result = index;
	}
	return result;
}

#else /* !COMPILER_MSVC */

function force_inline u32
clz_u32(u32 a)
{
	u32 result = 32;
	if (a) result = (u32)__builtin_clz(a);
	return result;
}

function force_inline u32
ctz_u32(u32 a)
{
	u32 result = 32;
	if (a) result = (u32)__builtin_ctz(a);
	return result;
}

#endif
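
/* NOTE: usage sketch, not part of the original file: unlike the raw compiler builtins (which
 * are undefined for a zero argument) both helpers above return 32 when a == 0, so expressions
 * such as this hypothetical bit-width computation are safe for any input:
 *
 *   u32 width = 32 - clz_u32(x); // 0 when x == 0
 */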

#if ARCH_ARM64
/* NOTE(rnp): we are only doing a handful of f32x4 operations so we will just use NEON and do
 * the macro renaming thing. If you are implementing a serious wide vector operation you should
 * use SVE(2) instead. The semantics are different however and the code will be written for an
 * arbitrary vector bit width. In that case you will also need x86_64 code for determining
 * the supported vector width (ideally at runtime though that may not be possible).
 */
#include <arm_neon.h>
typedef float32x4_t f32x4;
typedef int32x4_t i32x4;

#define add_f32x4(a, b) vaddq_f32(a, b)
#define cvt_i32x4_f32x4(a) vcvtq_f32_s32(a)
#define cvt_f32x4_i32x4(a) vcvtq_s32_f32(a)
#define div_f32x4(a, b) vdivq_f32(a, b)
#define dup_f32x4(f) vdupq_n_f32(f)
#define floor_f32x4(a) vrndmq_f32(a)
#define load_f32x4(a) vld1q_f32(a)
#define load_i32x4(a) vld1q_s32(a)
#define max_f32x4(a, b) vmaxq_f32(a, b)
#define min_f32x4(a, b) vminq_f32(a, b)
#define mul_f32x4(a, b) vmulq_f32(a, b)
#define set_f32x4(a, b, c, d) vld1q_f32((f32 []){d, c, b, a})
#define sqrt_f32x4(a) vsqrtq_f32(a)
#define store_f32x4(o, a) vst1q_f32(o, a)
#define store_i32x4(o, a) vst1q_s32(o, a)
#define sub_f32x4(a, b) vsubq_f32(a, b)

#elif ARCH_X64
#include <immintrin.h>
typedef __m128 f32x4;
typedef __m128i i32x4;

#define add_f32x4(a, b) _mm_add_ps(a, b)
#define cvt_i32x4_f32x4(a) _mm_cvtepi32_ps(a)
#define cvt_f32x4_i32x4(a) _mm_cvtps_epi32(a)
#define div_f32x4(a, b) _mm_div_ps(a, b)
#define dup_f32x4(f) _mm_set1_ps(f)
#define floor_f32x4(a) _mm_floor_ps(a)
#define load_f32x4(a) _mm_loadu_ps(a)
#define load_i32x4(a) _mm_loadu_si128((i32x4 *)(a))
#define max_f32x4(a, b) _mm_max_ps(a, b)
#define min_f32x4(a, b) _mm_min_ps(a, b)
#define mul_f32x4(a, b) _mm_mul_ps(a, b)
#define set_f32x4(a, b, c, d) _mm_set_ps(a, b, c, d)
#define sqrt_f32x4(a) _mm_sqrt_ps(a)
#define store_f32x4(o, a) _mm_storeu_ps(o, a)
#define store_i32x4(o, a) _mm_storeu_si128((i32x4 *)(o), a)
#define sub_f32x4(a, b) _mm_sub_ps(a, b)

#endif
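
/* NOTE: usage sketch, not part of the original file: the macros above expose the same 4-wide
 * f32 interface on NEON and SSE (lane 0 of set_f32x4(a, b, c, d) is d on both paths). A
 * hypothetical element-wise multiply-add over arrays whose length is a multiple of 4 might
 * look like this (f32/u32 assumed to come from compiler.h):
 *
 *   for (u32 i = 0; i < count; i += 4) {
 *       f32x4 x = load_f32x4(xs + i);
 *       f32x4 y = load_f32x4(ys + i);
 *       store_f32x4(out + i, add_f32x4(mul_f32x4(x, y), dup_f32x4(bias)));
 *   }
 */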