intrinsics.c
/* See LICENSE for license details. */
#include "compiler.h"

#if COMPILER_CLANG || COMPILER_GCC
#define force_inline inline __attribute__((always_inline))
#elif COMPILER_MSVC
#define force_inline __forceinline
#endif

#if COMPILER_MSVC || (COMPILER_CLANG && OS_WINDOWS)
#pragma section(".rdata$", read)
#define read_only __declspec(allocate(".rdata$"))
#elif COMPILER_CLANG
#define read_only __attribute__((section(".rodata")))
#elif COMPILER_GCC
/* TODO(rnp): how do we do this with gcc? putting it in rodata causes warnings and writing to
 * it doesn't cause a fault */
#define read_only
#endif

#if COMPILER_MSVC
#define align_as(n)    __declspec(align(n))
#define pack_struct(s) __pragma(pack(push, 1)) s __pragma(pack(pop))
#define no_return      __declspec(noreturn)

#define debugbreak()  __debugbreak()
#define unreachable() __assume(0)

#define atomic_store_u64(ptr, n)     *((volatile u64 *)(ptr)) = (n)
#define atomic_store_u32(ptr, n)     *((volatile u32 *)(ptr)) = (n)
#define atomic_load_u64(ptr)         *((volatile u64 *)(ptr))
#define atomic_load_u32(ptr)         *((volatile u32 *)(ptr))
#define atomic_add_u64(ptr, n)       _InterlockedExchangeAdd64((volatile u64 *)(ptr), (n))
#define atomic_add_u32(ptr, n)       _InterlockedExchangeAdd((volatile u32 *)(ptr), (n))
#define atomic_and_u64(ptr, n)       _InterlockedAnd64((volatile u64 *)(ptr), (n))
#define atomic_and_u32(ptr, n)       _InterlockedAnd((volatile u32 *)(ptr), (n))
/* NOTE: _InterlockedCompareExchange* takes (destination, exchange, comparand); the expression
 * is nonzero when (n) was written, matching __atomic_compare_exchange_n below */
#define atomic_cas_u64(ptr, cptr, n) (_InterlockedCompareExchange64((volatile u64 *)(ptr), (n), *(cptr)) == *(cptr))
#define atomic_cas_u32(ptr, cptr, n) (_InterlockedCompareExchange((volatile u32 *)(ptr), (n), *(cptr)) == *(cptr))
#define atomic_or_u32(ptr, n)        _InterlockedOr((volatile u32 *)(ptr), (n))

#define atan2_f32(y, x) atan2f(y, x)
#define ceil_f32(a)     ceilf(a)
#define sqrt_f32(a)     sqrtf(a)

#else
#define align_as(n)    __attribute__((aligned(n)))
#define pack_struct(s) s __attribute__((packed))
#define no_return      __attribute__((noreturn))

#if ARCH_ARM64
/* TODO? debuggers just loop here forever and need a manual PC increment (step over) */
#define debugbreak() asm volatile ("brk 0xf000")
#else
#define debugbreak() asm volatile ("int3; nop")
#endif
#define unreachable() __builtin_unreachable()

#define atomic_store_u64(ptr, n)     __atomic_store_n(ptr, n, __ATOMIC_RELEASE)
#define atomic_load_u64(ptr)         __atomic_load_n(ptr, __ATOMIC_ACQUIRE)
#define atomic_add_u64(ptr, n)       __atomic_fetch_add(ptr, n, __ATOMIC_ACQ_REL)
#define atomic_and_u64(ptr, n)       __atomic_and_fetch(ptr, n, __ATOMIC_RELEASE)
#define atomic_cas_u64(ptr, cptr, n) __atomic_compare_exchange_n(ptr, cptr, n, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
#define atomic_or_u32(ptr, n)        __atomic_or_fetch(ptr, n, __ATOMIC_RELEASE)
#define atomic_add_u32   atomic_add_u64
#define atomic_and_u32   atomic_and_u64
#define atomic_cas_u32   atomic_cas_u64
#define atomic_load_u32  atomic_load_u64
#define atomic_store_u32 atomic_store_u64

#define atan2_f32(y, x) __builtin_atan2f(y, x)
#define ceil_f32(a)     __builtin_ceilf(a)
#define sqrt_f32(a)     __builtin_sqrtf(a)

#endif
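/* NOTE: illustrative sketch, not part of the original file. Shows how the portable atomic
 * macros above are meant to be used: atomic_cas_* returns nonzero when (n) was written and
 * (cptr) points at the expected value. Assumes compiler.h provides the u32 typedef and the
 * `function` (static) macro. Compiled out with #if 0 so the translation unit is unchanged. */
#if 0
function void
spin_lock_example(u32 *lock)
{
    for (;;) {
        u32 expected = 0;
        /* try to swap 0 -> 1; succeeds only if nobody else holds the lock */
        if (atomic_cas_u32(lock, &expected, 1))
            break;
    }
}

function void
spin_unlock_example(u32 *lock)
{
    /* store 0 to release the lock (release ordering on the GCC/Clang path) */
    atomic_store_u32(lock, 0);
}
#endif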
#if COMPILER_MSVC

function force_inline u32
clz_u32(u32 a)
{
    u32 result = 32, index;
    if (a) {
        /* NOTE: _BitScanReverse gives the index of the highest set bit; convert it to a
         * leading zero count so the result matches __builtin_clz below */
        _BitScanReverse(&index, a);
        result = 31 - index;
    }
    return result;
}

function force_inline u32
ctz_u32(u32 a)
{
    u32 result = 32, index;
    if (a) {
        _BitScanForward(&index, a);
        result = index;
    }
    return result;
}

#else /* !COMPILER_MSVC */

function force_inline u32
clz_u32(u32 a)
{
    u32 result = 32;
    if (a) result = __builtin_clz(a);
    return result;
}

function force_inline u32
ctz_u32(u32 a)
{
    u32 result = 32;
    if (a) result = __builtin_ctz(a);
    return result;
}

#endif

#if ARCH_ARM64
/* NOTE(rnp): we are only doing a handful of f32x4 operations so we will just use NEON and do
 * the macro renaming thing. If you are implementing a serious wide vector operation you should
 * use SVE(2) instead. The semantics are different however and the code will be written for an
 * arbitrary vector bit width. In that case you will also need x86_64 code for determining
 * the supported vector width (ideally at runtime though that may not be possible).
 */
#include <arm_neon.h>
typedef float32x4_t f32x4;
typedef int32x4_t   i32x4;

#define cvt_i32x4_f32x4(a)    vcvtq_f32_s32(a)
#define cvt_f32x4_i32x4(a)    vcvtq_s32_f32(a)
#define dup_f32x4(f)          vdupq_n_f32(f)
#define load_f32x4(a)         vld1q_f32(a)
#define load_i32x4(a)         vld1q_s32(a)
#define mul_f32x4(a, b)       vmulq_f32(a, b)
#define set_f32x4(a, b, c, d) vld1q_f32((f32 []){d, c, b, a})
#define sqrt_f32x4(a)         vsqrtq_f32(a)
#define store_f32x4(a, o)     vst1q_f32(o, a)
#define store_i32x4(a, o)     vst1q_s32(o, a)

#elif ARCH_X64
#include <immintrin.h>
typedef __m128  f32x4;
typedef __m128i i32x4;

#define cvt_i32x4_f32x4(a)    _mm_cvtepi32_ps(a)
#define cvt_f32x4_i32x4(a)    _mm_cvtps_epi32(a)
#define dup_f32x4(f)          _mm_set1_ps(f)
#define load_f32x4(a)         _mm_loadu_ps(a)
#define load_i32x4(a)         _mm_loadu_si128((i32x4 *)a)
#define mul_f32x4(a, b)       _mm_mul_ps(a, b)
#define set_f32x4(a, b, c, d) _mm_set_ps(a, b, c, d)
#define sqrt_f32x4(a)         _mm_sqrt_ps(a)
#define store_f32x4(a, o)     _mm_storeu_ps(o, a)
#define store_i32x4(a, o)     _mm_storeu_si128((i32x4 *)o, a)

#endif
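/* NOTE: illustrative sketch, not part of the original file. Shows how the f32x4 macros above
 * compose: the same call site compiles to NEON on ARM64 and SSE on x86_64. Assumes one of
 * ARCH_ARM64/ARCH_X64 is set and that compiler.h provides the f32 typedef and the `function`
 * (static) macro. Compiled out with #if 0 so the translation unit is unchanged. */
#if 0
function void
scale_and_sqrt4_example(f32 *dst, f32 *src, f32 scale)
{
    /* load 4 contiguous floats, multiply by a broadcast scalar, take the element-wise
     * square root, and store the 4 results */
    f32x4 v = load_f32x4(src);
    v = mul_f32x4(v, dup_f32x4(scale));
    v = sqrt_f32x4(v);
    store_f32x4(v, dst);
}
#endif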