From 197b6c96d8a068fcd1002b0731def6bcbb2af5fc Mon Sep 17 00:00:00 2001
From: Dave Reid
Date: Wed, 24 Apr 2013 07:58:41 +1000
Subject: [PATCH] Improve efficiency of operator*(fquatSIMD, fquatSIMD) in
 SSE4 mode.

Now only requires 3 shuffle, 4 mul and 4 dpps.
---
 glm/gtx/simd_quat.inl | 57 ++++++++++++++++++++++--------------------
 1 file changed, 29 insertions(+), 28 deletions(-)

diff --git a/glm/gtx/simd_quat.inl b/glm/gtx/simd_quat.inl
index f9df7a94..25848a57 100644
--- a/glm/gtx/simd_quat.inl
+++ b/glm/gtx/simd_quat.inl
@@ -122,7 +122,7 @@ GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const &
 
     // SSE4 STATS:
     //    3 shuffle
-    //    8 mul
+    //    4 mul
     //    4 dpps
 
     __m128 mul0 = _mm_mul_ps(q1.Data, q2.Data);
@@ -130,35 +130,36 @@ GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const &
     __m128 mul2 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(1, 0, 3, 2)));
     __m128 mul3 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(2, 3, 0, 1)));
 
-    mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
-    mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f));
-    mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f));
-    mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f));
-
 #   if((GLM_ARCH & GLM_ARCH_SSE4))
-    __m128 add0 = _mm_dp_ps(mul0, _mm_set1_ps(1.0f), 0xff);
-    __m128 add1 = _mm_dp_ps(mul1, _mm_set1_ps(1.0f), 0xff);
-    __m128 add2 = _mm_dp_ps(mul2, _mm_set1_ps(1.0f), 0xff);
-    __m128 add3 = _mm_dp_ps(mul3, _mm_set1_ps(1.0f), 0xff);
-#   elif((GLM_ARCH & GLM_ARCH_SSE3))
-    __m128 add0 = _mm_hadd_ps(mul0, mul0);
-           add0 = _mm_hadd_ps(add0, add0);
-    __m128 add1 = _mm_hadd_ps(mul1, mul1);
-           add1 = _mm_hadd_ps(add1, add1);
-    __m128 add2 = _mm_hadd_ps(mul2, mul2);
-           add2 = _mm_hadd_ps(add2, add2);
-    __m128 add3 = _mm_hadd_ps(mul3, mul3);
-           add3 = _mm_hadd_ps(add3, add3);
+    __m128 add0 = _mm_dp_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f), 0xff);
+    __m128 add1 = _mm_dp_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f), 0xff);
+    __m128 add2 = _mm_dp_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f), 0xff);
+    __m128 add3 = _mm_dp_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f), 0xff);
 #   else
-    __m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0));
-           add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));
-    __m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1));
-           add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));
-    __m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2));
-           add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));
-    __m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3));
-           add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
+    mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
+    mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f));
+    mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f));
+    mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f));
+
+#   if((GLM_ARCH & GLM_ARCH_SSE3))
+    __m128 add0 = _mm_hadd_ps(mul0, mul0);
+           add0 = _mm_hadd_ps(add0, add0);
+    __m128 add1 = _mm_hadd_ps(mul1, mul1);
+           add1 = _mm_hadd_ps(add1, add1);
+    __m128 add2 = _mm_hadd_ps(mul2, mul2);
+           add2 = _mm_hadd_ps(add2, add2);
+    __m128 add3 = _mm_hadd_ps(mul3, mul3);
+           add3 = _mm_hadd_ps(add3, add3);
+#   else
+    __m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0));
+           add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));
+    __m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1));
+           add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));
+    __m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2));
+           add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));
+    __m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3));
+           add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
+#   endif
 #   endif
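
Note (editor): the SSE4 saving rests on the identity dot(x * s, (1,1,1,1)) == dot(x, s),
so the per-lane sign flip can be folded into the second operand of _mm_dp_ps and each of
the four dot products drops its separate _mm_mul_ps. Below is a minimal standalone sketch
of that equivalence, not GLM code; the names x, signs, old_way and new_way are
illustrative, and it assumes a compiler with SSE4.1 enabled (e.g. -msse4.1).

    #include <smmintrin.h> // SSE4.1 intrinsics: _mm_dp_ps
    #include <cstdio>

    int main()
    {
        // Same sign pattern the patch uses for add0 (w term positive).
        __m128 x     = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);   // lanes: x=1, y=2, z=3, w=4
        __m128 signs = _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f);

        // Old SSE4 path: flip signs first, then dot against all ones (mul + dpps).
        __m128 old_way = _mm_dp_ps(_mm_mul_ps(x, signs), _mm_set1_ps(1.0f), 0xff);

        // New SSE4 path: fold the sign vector into the dot product itself (dpps only).
        __m128 new_way = _mm_dp_ps(x, signs, 0xff);

        // Both print -2.0: -1 - 2 - 3 + 4.
        std::printf("old = %f, new = %f\n", _mm_cvtss_f32(old_way), _mm_cvtss_f32(new_way));
        return 0;
    }

The SSE3 and plain-SSE fallback paths have no dpps instruction to absorb the sign vector,
which is why the patch moves the four sign-flip _mm_mul_ps calls into the #else branch
rather than deleting them.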