Remove the SSE3 implementation in operator*(fquatSIMD, fquatSIMD).

The SSE2 version now runs faster than the SSE3 version.
2024-11-14 06:04:34 +00:00 · 2013-04-24 08:51:17 +10:00 · 2013-04-24 08:51:17 +10:00 · 13837e1079
commit 13837e1079
parent 197b6c96d8
1 changed file with 14 additions and 29 deletions
--- a/glm/gtx/simd_quat.inl
+++ b/glm/gtx/simd_quat.inl
@@ -115,11 +115,6 @@ GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const &
    //    8  mul
    //    8  add
    // SSE3 STATS:
    //    3 shuffle
    //    8 mul
    //    8 add
    // SSE4 STATS:
    //    3 shuffle
    //    4 mul
@@ -137,31 +132,21 @@ GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const &
    __m128 add3 = _mm_dp_ps(mul3, _mm_set_ps(1.0f,  1.0f, -1.0f,  1.0f), 0xff);
 #   else
           mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
               mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, -1.0f,  1.0f,  1.0f));
               mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f,  1.0f,  1.0f, -1.0f));
               mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f,  1.0f, -1.0f,  1.0f));
 #       if((GLM_ARCH & GLM_ARCH_SSE3))
        __m128 add0 = _mm_hadd_ps(mul0, mul0);
               add0 = _mm_hadd_ps(add0, add0);
        __m128 add1 = _mm_hadd_ps(mul1, mul1);
               add1 = _mm_hadd_ps(add1, add1);
        __m128 add2 = _mm_hadd_ps(mul2, mul2);
               add2 = _mm_hadd_ps(add2, add2);
        __m128 add3 = _mm_hadd_ps(mul3, mul3);
               add3 = _mm_hadd_ps(add3, add3);
 #       else
    __m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0));
           add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));
           mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, -1.0f,  1.0f,  1.0f));
    __m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1));
           add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));
           mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f,  1.0f,  1.0f, -1.0f));
    __m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2));
           add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));
           mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f,  1.0f, -1.0f,  1.0f));
    __m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3));
           add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
 #endif
 #endif
    // I had tried something clever here using shuffles to produce the final result, but it turns out that using