ogl_beamforming

Ultrasound Beamforming Implemented with OpenGL
git clone anongit@rnpnr.xyz:ogl_beamforming.git
Log | Files | Refs | Feed | Submodules | README | LICENSE

Commit: bc05b689c633829751fdd6ba20cfc6fa62de1828
Parent: bce389f0d683d84cf92632e9e2f314505b928d8d
Author: Randy Palamar
Date:   Tue, 29 Oct 2024 12:28:07 -0600

make kronecker product use SIMD

no need to make the CPU do useless work when we know that the
output dimension has to be a multiple of 4.

Diffstat:
Mutil.c | 22+++++++++++++++-------
1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/util.c b/util.c @@ -273,11 +273,16 @@ parse_f64(s8 s) } static void -fill_kronecker_sub_matrix(i32 *out, i32 out_stride, i32 scale, i32 *b, uv2 b_dim) +fill_kronecker_sub_matrix(__m128i *out, i32 out_stride, i32 scale, __m128i *b, uv2 b_dim) { - for (u32 i = 0; i < b_dim.y; i++) - for (u32 j = 0; j < b_dim.x; j++) - out[i * out_stride + j] = scale * b[i * b_dim.x + j]; + __m128 vscale = _mm_set1_ps(scale); + for (u32 i = 0; i < b_dim.y; i++) { + for (u32 j = 0; j < b_dim.x / 4; j++) { + __m128 vb = _mm_cvtepi32_ps(_mm_loadu_si128(b++)); + _mm_storeu_si128(out + j, _mm_cvtps_epi32(_mm_mul_ps(vscale, vb))); + } + out += out_stride; + } } /* NOTE: this won't check for valid space/etc and assumes row major order */ @@ -285,11 +290,14 @@ static void kronecker_product(i32 *out, i32 *a, uv2 a_dim, i32 *b, uv2 b_dim) { uv2 out_dim = {.x = a_dim.x * b_dim.x, .y = a_dim.y * b_dim.y}; + ASSERT(out_dim.y % 4 == 0); for (u32 i = 0; i < a_dim.y; i++) { - for (u32 j = 0; j < a_dim.x; j++) { - fill_kronecker_sub_matrix(out + j * b_dim.y + i * out_dim.y * b_dim.x, - out_dim.y, a[i * a_dim.x + j], b, b_dim); + __m128i *vout = (__m128i *)out; + for (u32 j = 0; j < a_dim.x; j++, a++) { + fill_kronecker_sub_matrix(vout, out_dim.y / 4, *a, (__m128i *)b, b_dim); + vout += b_dim.y / 4; } + out += out_dim.y * b_dim.x; } }