make kronecker product use SIMD - ogl_beamforming - Ultrasound Beamforming Implemented with OpenGL

Commit: bc05b689c633829751fdd6ba20cfc6fa62de1828
Parent: bce389f0d683d84cf92632e9e2f314505b928d8d
Author: Randy Palamar
Date:   Tue, 29 Oct 2024 12:28:07 -0600

make kronecker product use SIMD

no need to make the CPU do useless work when we know that the
output dimension has to be a multiple of 4.

Diffstat:
M util.c  | 22 +++++++++++++++-------

1 file changed, 15 insertions(+), 7 deletions(-)
diff --git a/util.c b/util.c
@@ -273,11 +273,16 @@ parse_f64(s8 s)
 }
 
 static void
-fill_kronecker_sub_matrix(i32 *out, i32 out_stride, i32 scale, i32 *b, uv2 b_dim)
+fill_kronecker_sub_matrix(__m128i *out, i32 out_stride, i32 scale, __m128i *b, uv2 b_dim)
 {
-	for (u32 i = 0; i < b_dim.y; i++)
-		for (u32 j = 0; j < b_dim.x; j++)
-			out[i * out_stride + j] = scale * b[i * b_dim.x + j];
+	__m128 vscale = _mm_set1_ps(scale);
+	for (u32 i = 0; i < b_dim.y; i++) {
+		for (u32 j = 0; j < b_dim.x / 4; j++) {
+			__m128 vb = _mm_cvtepi32_ps(_mm_loadu_si128(b++));
+			_mm_storeu_si128(out + j, _mm_cvtps_epi32(_mm_mul_ps(vscale, vb)));
+		}
+		out += out_stride;
+	}
 }
 
 /* NOTE: this won't check for valid space/etc and assumes row major order */
@@ -285,11 +290,14 @@ static void
 kronecker_product(i32 *out, i32 *a, uv2 a_dim, i32 *b, uv2 b_dim)
 {
 	uv2 out_dim = {.x = a_dim.x * b_dim.x, .y = a_dim.y * b_dim.y};
+	ASSERT(out_dim.y % 4 == 0);
 	for (u32 i = 0; i < a_dim.y; i++) {
-		for (u32 j = 0; j < a_dim.x; j++) {
-			fill_kronecker_sub_matrix(out + j * b_dim.y + i * out_dim.y * b_dim.x,
-			                          out_dim.y, a[i * a_dim.x + j], b, b_dim);
+		__m128i *vout = (__m128i *)out;
+		for (u32 j = 0; j < a_dim.x; j++, a++) {
+			fill_kronecker_sub_matrix(vout, out_dim.y / 4, *a, (__m128i *)b, b_dim);
+			vout += b_dim.y / 4;
 		}
+		out += out_dim.y * b_dim.x;
 	}
 }