ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git

Commit: 16c32a812bd167d704d39740aec439f202bcc631
Parent: 15c6348c2447d49fe2e2e3021a269ffad19c451e
Author: Randy Palamar
Date:   Fri, 20 Dec 2024 17:11:19 -0700

port to aarch64

Diffstat:
M beamformer.c | 13 ++++++++-----
M beamformer.h |  1 +
A intrinsics.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
M static.c     | 10 +++++++---
M util.c       | 25 +++++++++++--------------
M util.h       |  6 +++---
6 files changed, 81 insertions(+), 25 deletions(-)

diff --git a/beamformer.c b/beamformer.c
@@ -59,7 +59,7 @@ alloc_beamform_frame(GLParams *gp, BeamformFrame *out, uv4 out_dim, u32 frame_in
 	/* NOTE: allocate storage for beamformed output data;
 	 * this is shared between compute and fragment shaders */
 	u32 max_dim = MAX(out->dim.x, MAX(out->dim.y, out->dim.z));
-	out->mips = _tzcnt_u32(max_dim) + 1;
+	out->mips = ctz_u32(max_dim) + 1;
 
 	u8 buf[256];
 	Stream label = {.data = buf, .cap = ARRAY_COUNT(buf)};
@@ -117,8 +117,9 @@ alloc_shader_storage(BeamformerCtx *ctx, Arena a)
 	i32 storage_flags = GL_DYNAMIC_STORAGE_BIT;
 	switch (ctx->gl.vendor_id) {
-	case GL_VENDOR_INTEL:
 	case GL_VENDOR_AMD:
+	case GL_VENDOR_ARM:
+	case GL_VENDOR_INTEL:
 		if (cs->raw_data_ssbo)
 			glUnmapNamedBuffer(cs->raw_data_ssbo);
 		storage_flags |= GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT;
@@ -146,8 +147,9 @@ alloc_shader_storage(BeamformerCtx *ctx, Arena a)
 	i32 map_flags = GL_MAP_WRITE_BIT|GL_MAP_PERSISTENT_BIT|GL_MAP_UNSYNCHRONIZED_BIT;
 	switch (ctx->gl.vendor_id) {
-	case GL_VENDOR_INTEL:
 	case GL_VENDOR_AMD:
+	case GL_VENDOR_ARM:
+	case GL_VENDOR_INTEL:
 		cs->raw_data_arena.beg = glMapNamedBufferRange(cs->raw_data_ssbo, 0,
 		                                               full_rf_buf_size, map_flags);
 		break;
@@ -309,7 +311,7 @@ static v4
 f32_4_to_v4(f32 *in)
 {
 	v4 result;
-	_mm_storeu_ps(result.E, _mm_loadu_ps(in));
+	store_f32x4(load_f32x4(in), result.E);
 	return result;
 }
@@ -711,8 +713,9 @@ DEBUG_EXPORT BEAMFORMER_FRAME_STEP_FN(beamformer_frame_step)
 			ctx->error_stream.widx = 0;
 		} else {
 			switch (ctx->gl.vendor_id) {
-			case GL_VENDOR_INTEL:
 			case GL_VENDOR_AMD:
+			case GL_VENDOR_ARM:
+			case GL_VENDOR_INTEL:
 				break;
 			case GL_VENDOR_NVIDIA:
 				glNamedBufferSubData(cs->raw_data_ssbo, raw_index * rlen,
diff --git a/beamformer.h b/beamformer.h
@@ -35,6 +35,7 @@ enum program_flags {
 
 enum gl_vendor_ids {
 	GL_VENDOR_AMD,
+	GL_VENDOR_ARM,
 	GL_VENDOR_INTEL,
 	GL_VENDOR_NVIDIA,
 };
diff --git a/intrinsics.c b/intrinsics.c
@@ -0,0 +1,51 @@
+#define FORCE_INLINE inline __attribute__((always_inline))
+
+/* TODO(rnp): msvc probably won't build this but there are other things preventing that as well */
+#define clz_u32(a) __builtin_clz(a)
+#define ctz_u32(a) __builtin_ctz(a)
+#define sqrt_f32(a) __builtin_sqrtf(a)
+
+#ifdef __ARM_ARCH_ISA_A64
+/* TODO? debuggers just loop here forever and need a manual PC increment (step over) */
+#define debugbreak() asm volatile ("brk 0xf000")
+
+/* NOTE(rnp): we are only doing a handful of f32x4 operations so we will just use NEON and do
+ * the macro renaming thing. If you are implementing a serious wide vector operation you should
+ * use SVE(2) instead. The semantics are different however and the code will be written for an
+ * arbitrary vector bit width. In that case you will also need x86_64 code for determining
+ * the supported vector width (ideally at runtime though that may not be possible).
+ */
+#include <arm_neon.h>
+typedef float32x4_t f32x4;
+typedef int32x4_t   i32x4;
+
+#define cvt_i32x4_f32x4(a)    vcvtq_f32_s32(a)
+#define cvt_f32x4_i32x4(a)    vcvtq_s32_f32(a)
+#define dup_f32x4(f)          vdupq_n_f32(f)
+#define load_f32x4(a)         vld1q_f32(a)
+#define load_i32x4(a)         vld1q_s32(a)
+#define mul_f32x4(a, b)       vmulq_f32(a, b)
+#define set_f32x4(a, b, c, d) vld1q_f32((f32 []){d, c, b, a})
+#define sqrt_f32x4(a)         vsqrtq_f32(a)
+#define store_f32x4(a, o)     vst1q_f32(o, a)
+#define store_i32x4(a, o)     vst1q_s32(o, a)
+
+#elif __x86_64__
+#include <immintrin.h>
+typedef __m128  f32x4;
+typedef __m128i i32x4;
+
+#define cvt_i32x4_f32x4(a)    _mm_cvtepi32_ps(a)
+#define cvt_f32x4_i32x4(a)    _mm_cvtps_epi32(a)
+#define dup_f32x4(f)          _mm_set1_ps(f)
+#define load_f32x4(a)         _mm_loadu_ps(a)
+#define load_i32x4(a)         _mm_loadu_si128((i32x4 *)a)
+#define mul_f32x4(a, b)       _mm_mul_ps(a, b)
+#define set_f32x4(a, b, c, d) _mm_set_ps(a, b, c, d)
+#define sqrt_f32x4(a)         _mm_sqrt_ps(a)
+#define store_f32x4(a, o)     _mm_storeu_ps(o, a)
+#define store_i32x4(a, o)     _mm_storeu_si128((i32x4 *)o, a)
+
+#define debugbreak() asm volatile ("int3; nop")
+
+#endif
diff --git a/static.c b/static.c
@@ -63,9 +63,11 @@ get_gl_params(GLParams *gl, Stream *err)
 		os_fatal(stream_to_s8(err));
 	}
 	switch (vendor[0]) {
-	case 'A': gl->vendor_id = GL_VENDOR_AMD;    break;
-	case 'I': gl->vendor_id = GL_VENDOR_INTEL;  break;
-	case 'N': gl->vendor_id = GL_VENDOR_NVIDIA; break;
+	case 'A': gl->vendor_id = GL_VENDOR_AMD;    break;
+	case 'I': gl->vendor_id = GL_VENDOR_INTEL;  break;
+	case 'N': gl->vendor_id = GL_VENDOR_NVIDIA; break;
+	/* NOTE(rnp): freedreno - might need different handling on win32 but this is fine for now */
+	case 'f': gl->vendor_id = GL_VENDOR_ARM;    break;
 	default:
 		stream_append_s8(err, s8("Unknown GL Vendor: "));
 		stream_append_s8(err, cstr_to_s8(vendor));
@@ -93,6 +95,7 @@ validate_gl_requirements(GLParams *gl)
 	switch (gl->vendor_id) {
 	case GL_VENDOR_AMD:
+	case GL_VENDOR_ARM:
 	case GL_VENDOR_INTEL:
 		if (gl->version_major == 4 && gl->version_minor < 5)
 			invalid = 1;
@@ -116,6 +119,7 @@ dump_gl_params(GLParams *gl, Arena a)
 	stream_append_s8(&s, s8("---- GL Parameters ----\n"));
 	switch (gl->vendor_id) {
 	case GL_VENDOR_AMD:    stream_append_s8(&s, s8("Vendor: AMD\n"));    break;
+	case GL_VENDOR_ARM:    stream_append_s8(&s, s8("Vendor: ARM\n"));    break;
 	case GL_VENDOR_INTEL:  stream_append_s8(&s, s8("Vendor: Intel\n"));  break;
 	case GL_VENDOR_NVIDIA: stream_append_s8(&s, s8("Vendor: nVidia\n")); break;
 	}
diff --git a/util.c b/util.c
@@ -277,7 +277,7 @@ uv4_equal(uv4 a, uv4 b)
 
 static u32
 round_down_power_of_2(u32 a)
 {
-	u32 result = 0x80000000UL >> _lzcnt_u32(a);
+	u32 result = 0x80000000UL >> clz_u32(a);
 	return result;
 }
@@ -342,11 +342,8 @@ mul_v2(v2 a, v2 b)
 static f32
 magnitude_v2(v2 a)
 {
-	v4 result;
-	__m128 av = _mm_set_ps(0, 0, a.x, a.y);
-	av = _mm_mul_ps(av, av);
-	_mm_store_ps(result.E, _mm_sqrt_ps(_mm_hadd_ps(av, av)));
-	return result.x;
+	f32 result = sqrt_f32(a.x * a.x + a.y * a.y);
+	return result;
 }
 
 static f64
@@ -379,13 +376,13 @@ parse_f64(s8 s)
 }
 
 static void
-fill_kronecker_sub_matrix(__m128i *out, i32 out_stride, i32 scale, __m128i *b, uv2 b_dim)
+fill_kronecker_sub_matrix(i32 *out, i32 out_stride, i32 scale, i32 *b, uv2 b_dim)
 {
-	__m128 vscale = _mm_set1_ps(scale);
+	f32x4 vscale = dup_f32x4(scale);
 	for (u32 i = 0; i < b_dim.y; i++) {
-		for (u32 j = 0; j < b_dim.x / 4; j++) {
-			__m128 vb = _mm_cvtepi32_ps(_mm_loadu_si128(b++));
-			_mm_storeu_si128(out + j, _mm_cvtps_epi32(_mm_mul_ps(vscale, vb)));
+		for (u32 j = 0; j < b_dim.x; j += 4, b += 4) {
+			f32x4 vb = cvt_i32x4_f32x4(load_i32x4(b));
+			store_i32x4(cvt_f32x4_i32x4(mul_f32x4(vscale, vb)), out + j);
 		}
 		out += out_stride;
 	}
@@ -398,10 +395,10 @@ kronecker_product(i32 *out, i32 *a, uv2 a_dim, i32 *b, uv2 b_dim)
 	uv2 out_dim = {.x = a_dim.x * b_dim.x, .y = a_dim.y * b_dim.y};
 	ASSERT(out_dim.y % 4 == 0);
 	for (u32 i = 0; i < a_dim.y; i++) {
-		__m128i *vout = (__m128i *)out;
+		i32 *vout = out;
 		for (u32 j = 0; j < a_dim.x; j++, a++) {
-			fill_kronecker_sub_matrix(vout, out_dim.y / 4, *a, (__m128i *)b, b_dim);
-			vout += b_dim.y / 4;
+			fill_kronecker_sub_matrix(vout, out_dim.y, *a, b, b_dim);
+			vout += b_dim.y;
 		}
 		out += out_dim.y * b_dim.x;
 	}
diff --git a/util.h b/util.h
@@ -5,8 +5,6 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include <immintrin.h>
-
 #ifndef asm
 #define asm __asm__
 #endif
@@ -23,13 +21,15 @@
 #endif
 #endif
 
+#include "intrinsics.c"
+
 #ifdef _DEBUG
 	#ifdef _WIN32
 	#define DEBUG_EXPORT __declspec(dllexport)
 	#else
 	#define DEBUG_EXPORT
 	#endif
-	#define ASSERT(c) do { if (!(c)) asm("int3; nop"); } while (0);
+	#define ASSERT(c) do { if (!(c)) debugbreak(); } while (0);
 #else
 	#define DEBUG_EXPORT static
 	#define ASSERT(c)
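For readers skimming the diff, here is a small self-contained sketch (not part of the commit) of how the macro layer added in intrinsics.c is meant to be used: the same call site compiles to NEON intrinsics on aarch64 and SSE intrinsics on x86_64, with no special compiler flags needed on either. Only the f32x4/i32x4 typedefs and the macro bodies are taken from the diff above; the scale_i32x4 helper and the main driver are hypothetical.

/* Standalone sketch (hypothetical file, not in the repository) showing the
 * portable f32x4 layer from intrinsics.c. Macro bodies are copied from the
 * diff above; everything else here is illustrative. */
#include <stdint.h>
#include <stdio.h>

typedef float   f32;
typedef int32_t i32;

#ifdef __ARM_ARCH_ISA_A64
#include <arm_neon.h>
typedef float32x4_t f32x4;
typedef int32x4_t   i32x4;
#define dup_f32x4(f)       vdupq_n_f32(f)
#define load_i32x4(a)      vld1q_s32(a)
#define mul_f32x4(a, b)    vmulq_f32(a, b)
#define cvt_i32x4_f32x4(a) vcvtq_f32_s32(a)
#define cvt_f32x4_i32x4(a) vcvtq_s32_f32(a)
#define store_i32x4(a, o)  vst1q_s32(o, a)
#elif __x86_64__
#include <immintrin.h>
typedef __m128  f32x4;
typedef __m128i i32x4;
#define dup_f32x4(f)       _mm_set1_ps(f)
#define load_i32x4(a)      _mm_loadu_si128((i32x4 *)a)
#define mul_f32x4(a, b)    _mm_mul_ps(a, b)
#define cvt_i32x4_f32x4(a) _mm_cvtepi32_ps(a)
#define cvt_f32x4_i32x4(a) _mm_cvtps_epi32(a)
#define store_i32x4(a, o)  _mm_storeu_si128((i32x4 *)o, a)
#endif

/* hypothetical helper: scale four i32s by an integer factor through the
 * float pipeline, the same pattern fill_kronecker_sub_matrix() uses above */
static void
scale_i32x4(i32 *out, i32 *in, i32 scale)
{
	f32x4 vscale = dup_f32x4(scale);
	f32x4 vin    = cvt_i32x4_f32x4(load_i32x4(in));
	store_i32x4(cvt_f32x4_i32x4(mul_f32x4(vscale, vin)), out);
}

int
main(void)
{
	i32 in[4]  = {1, 2, 3, 4};
	i32 out[4] = {0};
	scale_i32x4(out, in, 3);
	printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 3 6 9 12 */
	return 0;
}

One subtlety worth noting from the diff itself: on NEON, set_f32x4(a, b, c, d) is defined with the compound literal reversed ({d, c, b, a}) so that it keeps _mm_set_ps's lane ordering, where d lands in element 0.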