Commit: 431c5823b49684e7536b83acaeb8d4745d623a59
Parent: 66327e47816e2bf8e068e18700feb88900a8bc26
Author: Randy Palamar
Date: Thu, 20 Jun 2024 05:42:05 -0600
convert rgb_to_hsv to SIMD
This works out to 31 instructions; pretty good but there may be
ways of eliminating a few instructions.
Diffstat:
M | colourpicker.c | | | 65 | +++++++++++++++++++++++++++++++++++++++++++++++++++++------------ |
1 file changed, 53 insertions(+), 12 deletions(-)
diff --git a/colourpicker.c b/colourpicker.c
@@ -86,8 +86,48 @@ cut_rect_right(Rect r, f32 fraction)
static v4
rgb_to_hsv(v4 rgb)
{
- Vector3 hsv = ColorToHSV(ColorFromNormalized(rgb.rv));
- return (v4){ .x = hsv.x / 360, .y = hsv.y, .z = hsv.z, .w = rgb.a };
+ /* Converts the four floats in rgb.E (R, G, B, A) to (H, S, V, A).
+ * The three per-channel hue candidates are evaluated in parallel, one
+ * per lane, and the one belonging to the max channel is kept.
+ * H is divided by 6 before being returned, matching the hsv.x / 360
+ * normalisation of the scalar code this replaces.
+ * NOTE(review): assumes R, G, B are non-negative - confirm with callers */
+ __m128 rgba = _mm_loadu_ps(rgb.E);
+ /* NOTE: rotate the colour lanes twice so that lanes [0..2] of the three
+ * vectors each hold a different channel; lane [3] stays A in all of them */
+ __m128 gbra = _mm_shuffle_ps(rgba, rgba, _MM_SHUFFLE(3, 0, 2, 1));
+ __m128 brga = _mm_shuffle_ps(gbra, gbra, _MM_SHUFFLE(3, 0, 2, 1));
+
+ /* NOTE: lanes [0..2] hold max/min of (R, G, B); lane [3] holds A so
+ * C (the chroma) is 0 there */
+ __m128 Max = _mm_max_ps(rgba, _mm_max_ps(gbra, brga));
+ __m128 Min = _mm_min_ps(rgba, _mm_min_ps(gbra, brga));
+ __m128 C = _mm_sub_ps(Max, Min);
+
+ /* NOTE: hue candidates in sextants:
+ * [0] = (G - B) / C + 0 (valid when R is max)
+ * [1] = (B - R) / C + 2 (valid when G is max)
+ * [2] = (R - G) / C + 4 (valid when B is max) */
+ __m128 t = _mm_div_ps(_mm_sub_ps(gbra, brga), C);
+
+ _Alignas(16) f32 aval[4] = { 0, 2, 4, 0 };
+ t = _mm_add_ps(t, _mm_load_ps(aval));
+
+ /* NOTE: |G - B| <= C so element [0] lies in [-1, 1] and never exceeds
+ * 6.0; the floor based modulo below only wraps negative values up by 6 */
+ /* NOTE: 1e9 ensures that the remainder after floor is 0.
+ * This limits the fmodf to apply only to element [0] */
+ _Alignas(16) f32 div[4] = { 6, 1e9, 1e9, 1e9 };
+ __m128 six = _mm_set1_ps(6);
+ __m128 rem = _mm_floor_ps(_mm_div_ps(t, _mm_load_ps(div)));
+ t = _mm_sub_ps(t, _mm_mul_ps(rem, six));
+
+ __m128 zero = _mm_set1_ps(0);
+ __m128 maxmask = _mm_cmpeq_ps(rgba, Max);
+
+ /* NOTE: keep only the candidates whose channel equals the max */
+ __m128 H = _mm_div_ps(_mm_blendv_ps(zero, t, maxmask), six);
+ __m128 S = _mm_div_ps(C, Max);
+
+ /* NOTE: Make sure H & S are 0 instead of NaN when V == 0.
+ * H must also be zeroed when C == 0 but V != 0 (greys): t is 0/0 = NaN
+ * there and every colour lane of maxmask is set, so without the extra
+ * mask the NaN would be returned as the hue */
+ __m128 zeromask = _mm_cmpeq_ps(zero, Max);
+ __m128 greymask = _mm_or_ps(zeromask, _mm_cmpeq_ps(zero, C));
+ H = _mm_blendv_ps(H, zero, greymask);
+ S = _mm_blendv_ps(S, zero, zeromask);
+
+ /* NOTE: broadcast each colour lane of H and OR them together; lanes that
+ * were not the max are 0 and lanes that tie for max carry the same value
+ * so the OR of the bit patterns yields the selected hue */
+ __m128 H0 = _mm_shuffle_ps(H, H, _MM_SHUFFLE(3, 0, 0, 0));
+ __m128 H1 = _mm_shuffle_ps(H, H, _MM_SHUFFLE(3, 1, 1, 1));
+ __m128 H2 = _mm_shuffle_ps(H, H, _MM_SHUFFLE(3, 2, 2, 2));
+ H = _mm_or_ps(H0, _mm_or_ps(H1, H2));
+
+ /* NOTE: keep only element [0] from H vector; Max contains V & A */
+ __m128 hva = _mm_blend_ps(Max, H, 0x01);
+ v4 res;
+ /* NOTE: element [1] is taken from S giving (H, S, V, A) */
+ _mm_storeu_ps(res.E, _mm_blend_ps(hva, S, 0x02));
+ return res;
}
static v4
@@ -98,11 +138,11 @@ hsv_to_rgb(v4 hsv)
* (R, G, B) = (f(n = 5), f(n = 3), f(n = 1))
*/
_Alignas(16) f32 nval[4] = {5.0f, 3.0f, 1.0f, 0.0f};
- __m128 n = _mm_load_ps(nval);
- __m128 H = _mm_set1_ps(hsv.x);
- __m128 S = _mm_set1_ps(hsv.y);
- __m128 V = _mm_set1_ps(hsv.z);
- __m128 six = _mm_set1_ps(6);
+ __m128 n = _mm_load_ps(nval);
+ __m128 H = _mm_set1_ps(hsv.x);
+ __m128 S = _mm_set1_ps(hsv.y);
+ __m128 V = _mm_set1_ps(hsv.z);
+ __m128 six = _mm_set1_ps(6);
__m128 t = _mm_add_ps(n, _mm_mul_ps(six, H));
__m128 rem = _mm_floor_ps(_mm_div_ps(t, six));
@@ -133,6 +173,7 @@ fill_hsv_image(Image img, v4 hsv)
s.y = 0;
v.z = 0;
+ f32 inc = 1.0 / img.width;
for (u32 i = 0; i < img.width; i++) {
Color hrgb = ColorFromNormalized(hsv_to_rgb(h).rv);
Color srgb = ColorFromNormalized(hsv_to_rgb(s).rv);
@@ -144,9 +185,9 @@ fill_hsv_image(Image img, v4 hsv)
hbot.x += 1.0;
sbot.x += 1.0;
vbot.x += 1.0;
- h.x += 1.0 / img.width;
- s.y += 1.0 / img.width;
- v.z += 1.0 / img.width;
+ h.x += inc;
+ s.y += inc;
+ v.z += inc;
}
}
@@ -401,8 +442,8 @@ do_colour_stack(ColourPickerCtx *ctx, Rect sa, f32 dt)
r.pos.x += sa.size.w * 0.15;
r.pos.y += sa.size.h * 0.06;
- f32 stack_off_target = -sa.size.h * 0.16;
- f32 stack_off_delta = -stack_off_target * 5 * dt;
+ f32 stack_off_target = -sa.size.h * 0.16;
+ f32 stack_off_delta = -stack_off_target * 5 * dt;
ColourStackState *css = &ctx->colour_stack;
b32 fade_stack = css->fade_param != 1.0f;