ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | LICENSE

intrinsics.c (2888B)


/* GCC/Clang-only always-inline shorthand; expands to a compiler attribute
 * (an MSVC port would need a __forceinline variant). */
#define FORCE_INLINE inline __attribute__((always_inline))

/* TODO(rnp): msvc probably won't build this but there are other things preventing that as well */
/* Thin wrappers over compiler builtins so call sites don't need <math.h>. */
#define sqrt_f32(a)     __builtin_sqrtf(a)
#define atan2_f32(y, x) __builtin_atan2f(y, x)

/* Wrappers over the GCC/Clang __atomic builtins.
 * NOTE(review): orderings are deliberately weaker than SEQ_CST in most cases
 * (RELEASE stores / ACQUIRE loads) -- confirm against the usage sites before
 * tightening or loosening any of them.
 * Naming asymmetry is real and presumably intentional:
 *   atomic_add -> __atomic_add_fetch, returns the NEW value;
 *   atomic_inc -> __atomic_fetch_add, returns the OLD value. */
#define atomic_store(ptr, n)     __atomic_store_n(ptr,    n, __ATOMIC_RELEASE)
#define atomic_load(ptr)         __atomic_load_n(ptr,        __ATOMIC_ACQUIRE)
#define atomic_swap(ptr, n)      __atomic_exchange_n(ptr, n, __ATOMIC_RELEASE)
#define atomic_and(ptr, n)       __atomic_and_fetch(ptr,  n, __ATOMIC_RELEASE)
#define atomic_add(ptr, n)       __atomic_add_fetch(ptr,  n, __ATOMIC_RELEASE)
#define atomic_inc(ptr, n)       __atomic_fetch_add(ptr,  n, __ATOMIC_ACQ_REL)
/* CAS: on failure *cptr is updated with the observed value; returns bool. */
#define atomic_cas(ptr, cptr, n) __atomic_compare_exchange_n(ptr, cptr, n, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)

     15 static FORCE_INLINE u32
     16 clz_u32(u32 a)
     17 {
     18 	u32 result = 32;
     19 	if (a) result = __builtin_clz(a);
     20 	return result;
     21 }
     22 
     23 static FORCE_INLINE u32
     24 ctz_u32(u32 a)
     25 {
     26 	u32 result = 32;
     27 	if (a) result = __builtin_ctz(a);
     28 	return result;
     29 }
     30 
#ifdef __ARM_ARCH_ISA_A64
/* TODO? debuggers just loop here forever and need a manual PC increment (step over) */
/* AArch64 software breakpoint. */
#define debugbreak() asm volatile ("brk 0xf000")

/* NOTE(rnp): we are only doing a handful of f32x4 operations so we will just use NEON and do
 * the macro renaming thing. If you are implementing a serious wide vector operation you should
 * use SVE(2) instead. The semantics are different however and the code will be written for an
 * arbitrary vector bit width. In that case you will also need x86_64 code for determining
 * the supported vector width (ideally at runtime though that may not be possible).
 */
#include <arm_neon.h>
typedef float32x4_t f32x4;
typedef int32x4_t   i32x4;

/* NEON spellings of the 4-wide f32/i32 helpers; the names mirror the x86_64
 * branch so the rest of the code stays ISA-agnostic. */
#define cvt_i32x4_f32x4(a)    vcvtq_f32_s32(a)
#define cvt_f32x4_i32x4(a)    vcvtq_s32_f32(a)
#define dup_f32x4(f)          vdupq_n_f32(f)
#define load_f32x4(a)         vld1q_f32(a)
#define load_i32x4(a)         vld1q_s32(a)
#define mul_f32x4(a, b)       vmulq_f32(a, b)
/* Arguments are reversed in the compound literal on purpose: vld1q_f32 loads
 * in memory order, so {d, c, b, a} reproduces the _mm_set_ps(a, b, c, d) lane
 * layout (a lands in the highest lane). */
#define set_f32x4(a, b, c, d) vld1q_f32((f32 []){d, c, b, a})
#define sqrt_f32x4(a)         vsqrtq_f32(a)
#define store_f32x4(a, o)     vst1q_f32(o, a)
#define store_i32x4(a, o)     vst1q_s32(o, a)
     55 
     56 #elif __x86_64__
     57 #include <immintrin.h>
     58 typedef __m128  f32x4;
     59 typedef __m128i i32x4;
     60 
     61 #define cvt_i32x4_f32x4(a)    _mm_cvtepi32_ps(a)
     62 #define cvt_f32x4_i32x4(a)    _mm_cvtps_epi32(a)
     63 #define dup_f32x4(f)          _mm_set1_ps(f)
     64 #define load_f32x4(a)         _mm_loadu_ps(a)
     65 #define load_i32x4(a)         _mm_loadu_si128((i32x4 *)a)
     66 #define mul_f32x4(a, b)       _mm_mul_ps(a, b)
     67 #define set_f32x4(a, b, c, d) _mm_set_ps(a, b, c, d)
     68 #define sqrt_f32x4(a)         _mm_sqrt_ps(a)
     69 #define store_f32x4(a, o)     _mm_storeu_ps(o, a)
     70 #define store_i32x4(a, o)     _mm_storeu_si128((i32x4 *)o, a)
     71 
     72 #define debugbreak() asm volatile ("int3; nop")
     73 
     74 #endif