Improve efficiency of operator*(fquatSIMD, fquatSIMD) in SSE4 mode.

Now requires only 3 shuffles, 4 muls and 4 dpps (down from 8 muls).
2024-11-27 02:34:35 +00:00 · 2013-04-24 07:58:41 +10:00 · 2013-04-24 07:58:41 +10:00 · 197b6c96d8
commit 197b6c96d8
parent d23da666c5
1 changed file with 29 additions and 28 deletions
--- a/glm/gtx/simd_quat.inl
+++ b/glm/gtx/simd_quat.inl
@@ -122,7 +122,7 @@ GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const &

    // SSE4 STATS:
    //    3 shuffle
-    //    8 mul
+    //    4 mul
    //    4 dpps

    __m128 mul0 = _mm_mul_ps(q1.Data, q2.Data);
@@ -130,18 +130,18 @@ GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const &
    __m128 mul2 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(1, 0, 3, 2)));
    __m128 mul3 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(2, 3, 0, 1)));

+#   if((GLM_ARCH & GLM_ARCH_SSE4))
+    __m128 add0 = _mm_dp_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f), 0xff);
+    __m128 add1 = _mm_dp_ps(mul1, _mm_set_ps(1.0f, -1.0f,  1.0f,  1.0f), 0xff);
+    __m128 add2 = _mm_dp_ps(mul2, _mm_set_ps(1.0f,  1.0f,  1.0f, -1.0f), 0xff);
+    __m128 add3 = _mm_dp_ps(mul3, _mm_set_ps(1.0f,  1.0f, -1.0f,  1.0f), 0xff);
+#   else
               mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
               mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, -1.0f,  1.0f,  1.0f));
               mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f,  1.0f,  1.0f, -1.0f));
               mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f,  1.0f, -1.0f,  1.0f));

-    
-#   if((GLM_ARCH & GLM_ARCH_SSE4))
-    __m128 add0 = _mm_dp_ps(mul0, _mm_set1_ps(1.0f), 0xff);
-    __m128 add1 = _mm_dp_ps(mul1, _mm_set1_ps(1.0f), 0xff);
-    __m128 add2 = _mm_dp_ps(mul2, _mm_set1_ps(1.0f), 0xff);
-    __m128 add3 = _mm_dp_ps(mul3, _mm_set1_ps(1.0f), 0xff);
-#   elif((GLM_ARCH & GLM_ARCH_SSE3))
+#       if((GLM_ARCH & GLM_ARCH_SSE3))
        __m128 add0 = _mm_hadd_ps(mul0, mul0);
               add0 = _mm_hadd_ps(add0, add0);
        __m128 add1 = _mm_hadd_ps(mul1, mul1);
@@ -159,6 +159,7 @@ GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const &
               add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));
        __m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3));
               add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
+#       endif
 #endif