From 7f2a5b89b30d143014bc0363b99dc3d942457ae7 Mon Sep 17 00:00:00 2001 From: Laurent Caumont Date: Fri, 22 Dec 2023 17:24:37 +0100 Subject: [PATCH] Simd improvement - Add simd aligned_vec3 (and sse aligned_dvec3 - 2 x xmm) - Fast packed_vec3 <=> aligned_vec3 and packed_vec4 <=> aligned_vec4 conversion - Fast aligned_vec3 <=> aligned_vec4 conversion - Optimized aligned_mat x aligned_mat and aligned_mat x aligned_vec - Inverse aligned_mat3 simd version (actually slower than ssid on my computer even it has 30% less instruction ?) --- glm/detail/_swizzle.hpp | 201 ++--- glm/detail/_vectorize.hpp | 68 ++ glm/detail/compute_vector_decl.hpp | 190 +++++ glm/detail/func_common.inl | 164 +++- glm/detail/func_common_simd.inl | 382 ++++++++++ glm/detail/func_geometric.inl | 20 +- glm/detail/func_geometric_simd.inl | 28 +- glm/detail/func_matrix.inl | 75 +- glm/detail/func_matrix_simd.inl | 11 + glm/detail/qualifier.hpp | 68 +- glm/detail/type_mat3x3.inl | 114 ++- glm/detail/type_mat4x4.inl | 105 ++- glm/detail/type_vec2.hpp | 4 + glm/detail/type_vec3.hpp | 11 + glm/detail/type_vec3.inl | 565 ++++++-------- glm/detail/type_vec4.hpp | 10 +- glm/detail/type_vec4.inl | 306 ++++---- glm/detail/type_vec4_simd.inl | 788 -------------------- glm/detail/type_vec_simd.inl | 1032 ++++++++++++++++++++++++++ glm/ext/vector_relational.inl | 2 +- glm/gtc/random.inl | 6 +- glm/gtx/common.inl | 2 +- glm/gtx/vec_swizzle.hpp | 15 - glm/simd/common.h | 13 +- glm/simd/matrix.h | 12 + glm/simd/neon.h | 28 +- test/CMakeLists.txt | 126 ++++ test/bug/bug_ms_vec_static.cpp | 4 +- test/core/core_force_pure.cpp | 2 +- test/core/core_type_aligned.cpp | 3 + test/gtc/gtc_random.cpp | 2 + test/gtc/gtc_type_aligned.cpp | 304 ++++++++ test/perf/perf_matrix_mul.cpp | 74 +- test/perf/perf_matrix_mul_vector.cpp | 19 +- test/perf/perf_vector_mul_matrix.cpp | 86 ++- 35 files changed, 3296 insertions(+), 1544 deletions(-) create mode 100644 glm/detail/compute_vector_decl.hpp delete mode 100644 glm/detail/type_vec4_simd.inl create mode 100644 glm/detail/type_vec_simd.inl diff --git a/glm/detail/_swizzle.hpp b/glm/detail/_swizzle.hpp index e1f57ccb..0678982c 100644 --- a/glm/detail/_swizzle.hpp +++ b/glm/detail/_swizzle.hpp @@ -17,25 +17,30 @@ namespace detail char _buffer[1]; }; - template + template struct _swizzle_base1 : public _swizzle_base0 { }; - template - struct _swizzle_base1<2, T, Q, E0,E1,-1,-2, Aligned> : public _swizzle_base0 + template + struct _swizzle_base1 : public _swizzle_base0 + { + }; + + template + struct _swizzle_base1<2, T, Q, E0,E1,-1,-2, false> : public _swizzle_base0 { GLM_FUNC_QUALIFIER vec<2, T, Q> operator ()() const { return vec<2, T, Q>(this->elem(E0), this->elem(E1)); } }; - template - struct _swizzle_base1<3, T, Q, E0,E1,E2,-1, Aligned> : public _swizzle_base0 + template + struct _swizzle_base1<3, T, Q, E0,E1,E2,3, false> : public _swizzle_base0 { GLM_FUNC_QUALIFIER vec<3, T, Q> operator ()() const { return vec<3, T, Q>(this->elem(E0), this->elem(E1), this->elem(E2)); } }; - template - struct _swizzle_base1<4, T, Q, E0,E1,E2,E3, Aligned> : public _swizzle_base0 + template + struct _swizzle_base1<4, T, Q, E0,E1,E2,E3, false> : public _swizzle_base0 { GLM_FUNC_QUALIFIER vec<4, T, Q> operator ()() const { return vec<4, T, Q>(this->elem(E0), this->elem(E1), this->elem(E2), this->elem(E3)); } }; @@ -350,33 +355,33 @@ namespace glm struct { detail::_swizzle<2,T, Q, 2,2,-1,-2> E2 ## E2; }; #define GLM_SWIZZLE3_3_MEMBERS(T, Q ,E0,E1,E2) \ - struct { detail::_swizzle<3, T, Q, 0,0,0,-1> E0 ## E0 ## E0; }; \ - struct { detail::_swizzle<3, T, Q, 0,0,1,-1> E0 ## E0 ## E1; }; \ - struct { detail::_swizzle<3, T, Q, 0,0,2,-1> E0 ## E0 ## E2; }; \ - struct { detail::_swizzle<3, T, Q, 0,1,0,-1> E0 ## E1 ## E0; }; \ - struct { detail::_swizzle<3, T, Q, 0,1,1,-1> E0 ## E1 ## E1; }; \ - struct { detail::_swizzle<3, T, Q, 0,1,2,-1> E0 ## E1 ## E2; }; \ - struct { detail::_swizzle<3, T, Q, 0,2,0,-1> E0 ## E2 ## E0; }; \ - struct { detail::_swizzle<3, T, Q, 0,2,1,-1> E0 ## E2 ## E1; }; \ - struct { detail::_swizzle<3, T, Q, 0,2,2,-1> E0 ## E2 ## E2; }; \ - struct { detail::_swizzle<3, T, Q, 1,0,0,-1> E1 ## E0 ## E0; }; \ - struct { detail::_swizzle<3, T, Q, 1,0,1,-1> E1 ## E0 ## E1; }; \ - struct { detail::_swizzle<3, T, Q, 1,0,2,-1> E1 ## E0 ## E2; }; \ - struct { detail::_swizzle<3, T, Q, 1,1,0,-1> E1 ## E1 ## E0; }; \ - struct { detail::_swizzle<3, T, Q, 1,1,1,-1> E1 ## E1 ## E1; }; \ - struct { detail::_swizzle<3, T, Q, 1,1,2,-1> E1 ## E1 ## E2; }; \ - struct { detail::_swizzle<3, T, Q, 1,2,0,-1> E1 ## E2 ## E0; }; \ - struct { detail::_swizzle<3, T, Q, 1,2,1,-1> E1 ## E2 ## E1; }; \ - struct { detail::_swizzle<3, T, Q, 1,2,2,-1> E1 ## E2 ## E2; }; \ - struct { detail::_swizzle<3, T, Q, 2,0,0,-1> E2 ## E0 ## E0; }; \ - struct { detail::_swizzle<3, T, Q, 2,0,1,-1> E2 ## E0 ## E1; }; \ - struct { detail::_swizzle<3, T, Q, 2,0,2,-1> E2 ## E0 ## E2; }; \ - struct { detail::_swizzle<3, T, Q, 2,1,0,-1> E2 ## E1 ## E0; }; \ - struct { detail::_swizzle<3, T, Q, 2,1,1,-1> E2 ## E1 ## E1; }; \ - struct { detail::_swizzle<3, T, Q, 2,1,2,-1> E2 ## E1 ## E2; }; \ - struct { detail::_swizzle<3, T, Q, 2,2,0,-1> E2 ## E2 ## E0; }; \ - struct { detail::_swizzle<3, T, Q, 2,2,1,-1> E2 ## E2 ## E1; }; \ - struct { detail::_swizzle<3, T, Q, 2,2,2,-1> E2 ## E2 ## E2; }; + struct { detail::_swizzle<3, T, Q, 0,0,0,3> E0 ## E0 ## E0; }; \ + struct { detail::_swizzle<3, T, Q, 0,0,1,3> E0 ## E0 ## E1; }; \ + struct { detail::_swizzle<3, T, Q, 0,0,2,3> E0 ## E0 ## E2; }; \ + struct { detail::_swizzle<3, T, Q, 0,1,0,3> E0 ## E1 ## E0; }; \ + struct { detail::_swizzle<3, T, Q, 0,1,1,3> E0 ## E1 ## E1; }; \ + struct { detail::_swizzle<3, T, Q, 0,1,2,3> E0 ## E1 ## E2; }; \ + struct { detail::_swizzle<3, T, Q, 0,2,0,3> E0 ## E2 ## E0; }; \ + struct { detail::_swizzle<3, T, Q, 0,2,1,3> E0 ## E2 ## E1; }; \ + struct { detail::_swizzle<3, T, Q, 0,2,2,3> E0 ## E2 ## E2; }; \ + struct { detail::_swizzle<3, T, Q, 1,0,0,3> E1 ## E0 ## E0; }; \ + struct { detail::_swizzle<3, T, Q, 1,0,1,3> E1 ## E0 ## E1; }; \ + struct { detail::_swizzle<3, T, Q, 1,0,2,3> E1 ## E0 ## E2; }; \ + struct { detail::_swizzle<3, T, Q, 1,1,0,3> E1 ## E1 ## E0; }; \ + struct { detail::_swizzle<3, T, Q, 1,1,1,3> E1 ## E1 ## E1; }; \ + struct { detail::_swizzle<3, T, Q, 1,1,2,3> E1 ## E1 ## E2; }; \ + struct { detail::_swizzle<3, T, Q, 1,2,0,3> E1 ## E2 ## E0; }; \ + struct { detail::_swizzle<3, T, Q, 1,2,1,3> E1 ## E2 ## E1; }; \ + struct { detail::_swizzle<3, T, Q, 1,2,2,3> E1 ## E2 ## E2; }; \ + struct { detail::_swizzle<3, T, Q, 2,0,0,3> E2 ## E0 ## E0; }; \ + struct { detail::_swizzle<3, T, Q, 2,0,1,3> E2 ## E0 ## E1; }; \ + struct { detail::_swizzle<3, T, Q, 2,0,2,3> E2 ## E0 ## E2; }; \ + struct { detail::_swizzle<3, T, Q, 2,1,0,3> E2 ## E1 ## E0; }; \ + struct { detail::_swizzle<3, T, Q, 2,1,1,3> E2 ## E1 ## E1; }; \ + struct { detail::_swizzle<3, T, Q, 2,1,2,3> E2 ## E1 ## E2; }; \ + struct { detail::_swizzle<3, T, Q, 2,2,0,3> E2 ## E2 ## E0; }; \ + struct { detail::_swizzle<3, T, Q, 2,2,1,3> E2 ## E2 ## E1; }; \ + struct { detail::_swizzle<3, T, Q, 2,2,2,3> E2 ## E2 ## E2; }; #define GLM_SWIZZLE3_4_MEMBERS(T, Q, E0,E1,E2) \ struct { detail::_swizzle<4,T, Q, 0,0,0,0> E0 ## E0 ## E0 ## E0; }; \ @@ -480,70 +485,70 @@ namespace glm struct { detail::_swizzle<2,T, Q, 3,3,-1,-2> E3 ## E3; }; #define GLM_SWIZZLE4_3_MEMBERS(T, Q, E0,E1,E2,E3) \ - struct { detail::_swizzle<3, T, Q, 0,0,0,-1> E0 ## E0 ## E0; }; \ - struct { detail::_swizzle<3, T, Q, 0,0,1,-1> E0 ## E0 ## E1; }; \ - struct { detail::_swizzle<3, T, Q, 0,0,2,-1> E0 ## E0 ## E2; }; \ - struct { detail::_swizzle<3, T, Q, 0,0,3,-1> E0 ## E0 ## E3; }; \ - struct { detail::_swizzle<3, T, Q, 0,1,0,-1> E0 ## E1 ## E0; }; \ - struct { detail::_swizzle<3, T, Q, 0,1,1,-1> E0 ## E1 ## E1; }; \ - struct { detail::_swizzle<3, T, Q, 0,1,2,-1> E0 ## E1 ## E2; }; \ - struct { detail::_swizzle<3, T, Q, 0,1,3,-1> E0 ## E1 ## E3; }; \ - struct { detail::_swizzle<3, T, Q, 0,2,0,-1> E0 ## E2 ## E0; }; \ - struct { detail::_swizzle<3, T, Q, 0,2,1,-1> E0 ## E2 ## E1; }; \ - struct { detail::_swizzle<3, T, Q, 0,2,2,-1> E0 ## E2 ## E2; }; \ - struct { detail::_swizzle<3, T, Q, 0,2,3,-1> E0 ## E2 ## E3; }; \ - struct { detail::_swizzle<3, T, Q, 0,3,0,-1> E0 ## E3 ## E0; }; \ - struct { detail::_swizzle<3, T, Q, 0,3,1,-1> E0 ## E3 ## E1; }; \ - struct { detail::_swizzle<3, T, Q, 0,3,2,-1> E0 ## E3 ## E2; }; \ - struct { detail::_swizzle<3, T, Q, 0,3,3,-1> E0 ## E3 ## E3; }; \ - struct { detail::_swizzle<3, T, Q, 1,0,0,-1> E1 ## E0 ## E0; }; \ - struct { detail::_swizzle<3, T, Q, 1,0,1,-1> E1 ## E0 ## E1; }; \ - struct { detail::_swizzle<3, T, Q, 1,0,2,-1> E1 ## E0 ## E2; }; \ - struct { detail::_swizzle<3, T, Q, 1,0,3,-1> E1 ## E0 ## E3; }; \ - struct { detail::_swizzle<3, T, Q, 1,1,0,-1> E1 ## E1 ## E0; }; \ - struct { detail::_swizzle<3, T, Q, 1,1,1,-1> E1 ## E1 ## E1; }; \ - struct { detail::_swizzle<3, T, Q, 1,1,2,-1> E1 ## E1 ## E2; }; \ - struct { detail::_swizzle<3, T, Q, 1,1,3,-1> E1 ## E1 ## E3; }; \ - struct { detail::_swizzle<3, T, Q, 1,2,0,-1> E1 ## E2 ## E0; }; \ - struct { detail::_swizzle<3, T, Q, 1,2,1,-1> E1 ## E2 ## E1; }; \ - struct { detail::_swizzle<3, T, Q, 1,2,2,-1> E1 ## E2 ## E2; }; \ - struct { detail::_swizzle<3, T, Q, 1,2,3,-1> E1 ## E2 ## E3; }; \ - struct { detail::_swizzle<3, T, Q, 1,3,0,-1> E1 ## E3 ## E0; }; \ - struct { detail::_swizzle<3, T, Q, 1,3,1,-1> E1 ## E3 ## E1; }; \ - struct { detail::_swizzle<3, T, Q, 1,3,2,-1> E1 ## E3 ## E2; }; \ - struct { detail::_swizzle<3, T, Q, 1,3,3,-1> E1 ## E3 ## E3; }; \ - struct { detail::_swizzle<3, T, Q, 2,0,0,-1> E2 ## E0 ## E0; }; \ - struct { detail::_swizzle<3, T, Q, 2,0,1,-1> E2 ## E0 ## E1; }; \ - struct { detail::_swizzle<3, T, Q, 2,0,2,-1> E2 ## E0 ## E2; }; \ - struct { detail::_swizzle<3, T, Q, 2,0,3,-1> E2 ## E0 ## E3; }; \ - struct { detail::_swizzle<3, T, Q, 2,1,0,-1> E2 ## E1 ## E0; }; \ - struct { detail::_swizzle<3, T, Q, 2,1,1,-1> E2 ## E1 ## E1; }; \ - struct { detail::_swizzle<3, T, Q, 2,1,2,-1> E2 ## E1 ## E2; }; \ - struct { detail::_swizzle<3, T, Q, 2,1,3,-1> E2 ## E1 ## E3; }; \ - struct { detail::_swizzle<3, T, Q, 2,2,0,-1> E2 ## E2 ## E0; }; \ - struct { detail::_swizzle<3, T, Q, 2,2,1,-1> E2 ## E2 ## E1; }; \ - struct { detail::_swizzle<3, T, Q, 2,2,2,-1> E2 ## E2 ## E2; }; \ - struct { detail::_swizzle<3, T, Q, 2,2,3,-1> E2 ## E2 ## E3; }; \ - struct { detail::_swizzle<3, T, Q, 2,3,0,-1> E2 ## E3 ## E0; }; \ - struct { detail::_swizzle<3, T, Q, 2,3,1,-1> E2 ## E3 ## E1; }; \ - struct { detail::_swizzle<3, T, Q, 2,3,2,-1> E2 ## E3 ## E2; }; \ - struct { detail::_swizzle<3, T, Q, 2,3,3,-1> E2 ## E3 ## E3; }; \ - struct { detail::_swizzle<3, T, Q, 3,0,0,-1> E3 ## E0 ## E0; }; \ - struct { detail::_swizzle<3, T, Q, 3,0,1,-1> E3 ## E0 ## E1; }; \ - struct { detail::_swizzle<3, T, Q, 3,0,2,-1> E3 ## E0 ## E2; }; \ - struct { detail::_swizzle<3, T, Q, 3,0,3,-1> E3 ## E0 ## E3; }; \ - struct { detail::_swizzle<3, T, Q, 3,1,0,-1> E3 ## E1 ## E0; }; \ - struct { detail::_swizzle<3, T, Q, 3,1,1,-1> E3 ## E1 ## E1; }; \ - struct { detail::_swizzle<3, T, Q, 3,1,2,-1> E3 ## E1 ## E2; }; \ - struct { detail::_swizzle<3, T, Q, 3,1,3,-1> E3 ## E1 ## E3; }; \ - struct { detail::_swizzle<3, T, Q, 3,2,0,-1> E3 ## E2 ## E0; }; \ - struct { detail::_swizzle<3, T, Q, 3,2,1,-1> E3 ## E2 ## E1; }; \ - struct { detail::_swizzle<3, T, Q, 3,2,2,-1> E3 ## E2 ## E2; }; \ - struct { detail::_swizzle<3, T, Q, 3,2,3,-1> E3 ## E2 ## E3; }; \ - struct { detail::_swizzle<3, T, Q, 3,3,0,-1> E3 ## E3 ## E0; }; \ - struct { detail::_swizzle<3, T, Q, 3,3,1,-1> E3 ## E3 ## E1; }; \ - struct { detail::_swizzle<3, T, Q, 3,3,2,-1> E3 ## E3 ## E2; }; \ - struct { detail::_swizzle<3, T, Q, 3,3,3,-1> E3 ## E3 ## E3; }; + struct { detail::_swizzle<3, T, Q, 0,0,0,3> E0 ## E0 ## E0; }; \ + struct { detail::_swizzle<3, T, Q, 0,0,1,3> E0 ## E0 ## E1; }; \ + struct { detail::_swizzle<3, T, Q, 0,0,2,3> E0 ## E0 ## E2; }; \ + struct { detail::_swizzle<3, T, Q, 0,0,3,3> E0 ## E0 ## E3; }; \ + struct { detail::_swizzle<3, T, Q, 0,1,0,3> E0 ## E1 ## E0; }; \ + struct { detail::_swizzle<3, T, Q, 0,1,1,3> E0 ## E1 ## E1; }; \ + struct { detail::_swizzle<3, T, Q, 0,1,2,3> E0 ## E1 ## E2; }; \ + struct { detail::_swizzle<3, T, Q, 0,1,3,3> E0 ## E1 ## E3; }; \ + struct { detail::_swizzle<3, T, Q, 0,2,0,3> E0 ## E2 ## E0; }; \ + struct { detail::_swizzle<3, T, Q, 0,2,1,3> E0 ## E2 ## E1; }; \ + struct { detail::_swizzle<3, T, Q, 0,2,2,3> E0 ## E2 ## E2; }; \ + struct { detail::_swizzle<3, T, Q, 0,2,3,3> E0 ## E2 ## E3; }; \ + struct { detail::_swizzle<3, T, Q, 0,3,0,3> E0 ## E3 ## E0; }; \ + struct { detail::_swizzle<3, T, Q, 0,3,1,3> E0 ## E3 ## E1; }; \ + struct { detail::_swizzle<3, T, Q, 0,3,2,3> E0 ## E3 ## E2; }; \ + struct { detail::_swizzle<3, T, Q, 0,3,3,3> E0 ## E3 ## E3; }; \ + struct { detail::_swizzle<3, T, Q, 1,0,0,3> E1 ## E0 ## E0; }; \ + struct { detail::_swizzle<3, T, Q, 1,0,1,3> E1 ## E0 ## E1; }; \ + struct { detail::_swizzle<3, T, Q, 1,0,2,3> E1 ## E0 ## E2; }; \ + struct { detail::_swizzle<3, T, Q, 1,0,3,3> E1 ## E0 ## E3; }; \ + struct { detail::_swizzle<3, T, Q, 1,1,0,3> E1 ## E1 ## E0; }; \ + struct { detail::_swizzle<3, T, Q, 1,1,1,3> E1 ## E1 ## E1; }; \ + struct { detail::_swizzle<3, T, Q, 1,1,2,3> E1 ## E1 ## E2; }; \ + struct { detail::_swizzle<3, T, Q, 1,1,3,3> E1 ## E1 ## E3; }; \ + struct { detail::_swizzle<3, T, Q, 1,2,0,3> E1 ## E2 ## E0; }; \ + struct { detail::_swizzle<3, T, Q, 1,2,1,3> E1 ## E2 ## E1; }; \ + struct { detail::_swizzle<3, T, Q, 1,2,2,3> E1 ## E2 ## E2; }; \ + struct { detail::_swizzle<3, T, Q, 1,2,3,3> E1 ## E2 ## E3; }; \ + struct { detail::_swizzle<3, T, Q, 1,3,0,3> E1 ## E3 ## E0; }; \ + struct { detail::_swizzle<3, T, Q, 1,3,1,3> E1 ## E3 ## E1; }; \ + struct { detail::_swizzle<3, T, Q, 1,3,2,3> E1 ## E3 ## E2; }; \ + struct { detail::_swizzle<3, T, Q, 1,3,3,3> E1 ## E3 ## E3; }; \ + struct { detail::_swizzle<3, T, Q, 2,0,0,3> E2 ## E0 ## E0; }; \ + struct { detail::_swizzle<3, T, Q, 2,0,1,3> E2 ## E0 ## E1; }; \ + struct { detail::_swizzle<3, T, Q, 2,0,2,3> E2 ## E0 ## E2; }; \ + struct { detail::_swizzle<3, T, Q, 2,0,3,3> E2 ## E0 ## E3; }; \ + struct { detail::_swizzle<3, T, Q, 2,1,0,3> E2 ## E1 ## E0; }; \ + struct { detail::_swizzle<3, T, Q, 2,1,1,3> E2 ## E1 ## E1; }; \ + struct { detail::_swizzle<3, T, Q, 2,1,2,3> E2 ## E1 ## E2; }; \ + struct { detail::_swizzle<3, T, Q, 2,1,3,3> E2 ## E1 ## E3; }; \ + struct { detail::_swizzle<3, T, Q, 2,2,0,3> E2 ## E2 ## E0; }; \ + struct { detail::_swizzle<3, T, Q, 2,2,1,3> E2 ## E2 ## E1; }; \ + struct { detail::_swizzle<3, T, Q, 2,2,2,3> E2 ## E2 ## E2; }; \ + struct { detail::_swizzle<3, T, Q, 2,2,3,3> E2 ## E2 ## E3; }; \ + struct { detail::_swizzle<3, T, Q, 2,3,0,3> E2 ## E3 ## E0; }; \ + struct { detail::_swizzle<3, T, Q, 2,3,1,3> E2 ## E3 ## E1; }; \ + struct { detail::_swizzle<3, T, Q, 2,3,2,3> E2 ## E3 ## E2; }; \ + struct { detail::_swizzle<3, T, Q, 2,3,3,3> E2 ## E3 ## E3; }; \ + struct { detail::_swizzle<3, T, Q, 3,0,0,3> E3 ## E0 ## E0; }; \ + struct { detail::_swizzle<3, T, Q, 3,0,1,3> E3 ## E0 ## E1; }; \ + struct { detail::_swizzle<3, T, Q, 3,0,2,3> E3 ## E0 ## E2; }; \ + struct { detail::_swizzle<3, T, Q, 3,0,3,3> E3 ## E0 ## E3; }; \ + struct { detail::_swizzle<3, T, Q, 3,1,0,3> E3 ## E1 ## E0; }; \ + struct { detail::_swizzle<3, T, Q, 3,1,1,3> E3 ## E1 ## E1; }; \ + struct { detail::_swizzle<3, T, Q, 3,1,2,3> E3 ## E1 ## E2; }; \ + struct { detail::_swizzle<3, T, Q, 3,1,3,3> E3 ## E1 ## E3; }; \ + struct { detail::_swizzle<3, T, Q, 3,2,0,3> E3 ## E2 ## E0; }; \ + struct { detail::_swizzle<3, T, Q, 3,2,1,3> E3 ## E2 ## E1; }; \ + struct { detail::_swizzle<3, T, Q, 3,2,2,3> E3 ## E2 ## E2; }; \ + struct { detail::_swizzle<3, T, Q, 3,2,3,3> E3 ## E2 ## E3; }; \ + struct { detail::_swizzle<3, T, Q, 3,3,0,3> E3 ## E3 ## E0; }; \ + struct { detail::_swizzle<3, T, Q, 3,3,1,3> E3 ## E3 ## E1; }; \ + struct { detail::_swizzle<3, T, Q, 3,3,2,3> E3 ## E3 ## E2; }; \ + struct { detail::_swizzle<3, T, Q, 3,3,3,3> E3 ## E3 ## E3; }; #define GLM_SWIZZLE4_4_MEMBERS(T, Q, E0,E1,E2,E3) \ struct { detail::_swizzle<4, T, Q, 0,0,0,0> E0 ## E0 ## E0 ## E0; }; \ diff --git a/glm/detail/_vectorize.hpp b/glm/detail/_vectorize.hpp index 1fcaec31..807b1b1d 100644 --- a/glm/detail/_vectorize.hpp +++ b/glm/detail/_vectorize.hpp @@ -52,6 +52,12 @@ namespace detail { return vec<1, T, Q>(Func(a.x, b.x)); } + + template + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec<1, T, Q> call(Fct Func, vec<1, T, Q> const& a, vec<1, T, Q> const& b) + { + return vec<1, T, Q>(Func(a.x, b.x)); + } }; template class vec, typename T, qualifier Q> @@ -61,6 +67,12 @@ namespace detail { return vec<2, T, Q>(Func(a.x, b.x), Func(a.y, b.y)); } + + template + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec<2, T, Q> call(Fct Func, vec<2, T, Q> const& a, vec<2, T, Q> const& b) + { + return vec<2, T, Q>(Func(a.x, b.x), Func(a.y, b.y)); + } }; template class vec, typename T, qualifier Q> @@ -70,6 +82,12 @@ namespace detail { return vec<3, T, Q>(Func(a.x, b.x), Func(a.y, b.y), Func(a.z, b.z)); } + + template + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec<3, T, Q> call(Fct Func, vec<3, T, Q> const& a, vec<3, T, Q> const& b) + { + return vec<3, T, Q>(Func(a.x, b.x), Func(a.y, b.y), Func(a.z, b.z)); + } }; template class vec, typename T, qualifier Q> @@ -79,6 +97,12 @@ namespace detail { return vec<4, T, Q>(Func(a.x, b.x), Func(a.y, b.y), Func(a.z, b.z), Func(a.w, b.w)); } + + template + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec<4, T, Q> call(Fct Func, vec<4, T, Q> const& a, vec<4, T, Q> const& b) + { + return vec<4, T, Q>(Func(a.x, b.x), Func(a.y, b.y), Func(a.z, b.z), Func(a.w, b.w)); + } }; template class vec, length_t L, typename T, qualifier Q> @@ -91,6 +115,11 @@ namespace detail { return vec<1, T, Q>(Func(a.x, b)); } + template + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec<1, T, Q> call(Fct Func, vec<1, T, Q> const& a, T b) + { + return vec<1, T, Q>(Func(a.x, b)); + } }; template class vec, typename T, qualifier Q> @@ -100,6 +129,12 @@ namespace detail { return vec<2, T, Q>(Func(a.x, b), Func(a.y, b)); } + + template + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec<2, T, Q> call(Fct Func, vec<2, T, Q> const& a, T b) + { + return vec<2, T, Q>(Func(a.x, b), Func(a.y, b)); + } }; template class vec, typename T, qualifier Q> @@ -109,6 +144,12 @@ namespace detail { return vec<3, T, Q>(Func(a.x, b), Func(a.y, b), Func(a.z, b)); } + + template + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec<3, T, Q> call(Fct Func, vec<3, T, Q> const& a, T b) + { + return vec<3, T, Q>(Func(a.x, b), Func(a.y, b), Func(a.z, b)); + } }; template class vec, typename T, qualifier Q> @@ -118,6 +159,11 @@ namespace detail { return vec<4, T, Q>(Func(a.x, b), Func(a.y, b), Func(a.z, b), Func(a.w, b)); } + template + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec<4, T, Q> call(Fct Func, vec<4, T, Q> const& a, T b) + { + return vec<4, T, Q>(Func(a.x, b), Func(a.y, b), Func(a.z, b), Func(a.w, b)); + } }; template @@ -130,6 +176,12 @@ namespace detail { return vec<1, int, Q>(Func(a.x, b.x)); } + + template + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec<1, int, Q> call(Fct Func, vec<1, T, Q> const& a, vec<1, int, Q> const& b) + { + return vec<1, int, Q>(Func(a.x, b.x)); + } }; template @@ -139,6 +191,11 @@ namespace detail { return vec<2, int, Q>(Func(a.x, b.x), Func(a.y, b.y)); } + template + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec<2, int, Q> call(Fct Func, vec<2, T, Q> const& a, vec<2, int, Q> const& b) + { + return vec<2, int, Q>(Func(a.x, b.x), Func(a.y, b.y)); + } }; template @@ -148,6 +205,11 @@ namespace detail { return vec<3, int, Q>(Func(a.x, b.x), Func(a.y, b.y), Func(a.z, b.z)); } + template + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec<3, int, Q> call(Fct Func, vec<3, T, Q> const& a, vec<3, int, Q> const& b) + { + return vec<3, int, Q>(Func(a.x, b.x), Func(a.y, b.y), Func(a.z, b.z)); + } }; template @@ -157,6 +219,12 @@ namespace detail { return vec<4, int, Q>(Func(a.x, b.x), Func(a.y, b.y), Func(a.z, b.z), Func(a.w, b.w)); } + + template + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec<4, int, Q> call(Fct Func, vec<4, T, Q> const& a, vec<4, int, Q> const& b) + { + return vec<4, int, Q>(Func(a.x, b.x), Func(a.y, b.y), Func(a.z, b.z), Func(a.w, b.w)); + } }; }//namespace detail }//namespace glm diff --git a/glm/detail/compute_vector_decl.hpp b/glm/detail/compute_vector_decl.hpp new file mode 100644 index 00000000..a1b1043a --- /dev/null +++ b/glm/detail/compute_vector_decl.hpp @@ -0,0 +1,190 @@ + +#pragma once +#include +#include "_vectorize.hpp" + +namespace glm { + namespace detail + { + template + struct compute_vec_add {}; + + template + struct compute_vec_sub {}; + + template + struct compute_vec_mul {}; + + template + struct compute_vec_div {}; + + template + struct compute_vec_mod {}; + + template + struct compute_splat {}; + + template + struct compute_vec_and {}; + + template + struct compute_vec_or {}; + + template + struct compute_vec_xor {}; + + template + struct compute_vec_shift_left {}; + + template + struct compute_vec_shift_right {}; + + template + struct compute_vec_equal {}; + + template + struct compute_vec_nequal {}; + + template + struct compute_vec_bitwise_not {}; + + template + struct compute_vec_add + { + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& a, vec const& b) + { + return detail::functor2::call(std::plus(), a, b); + } + }; + + template + struct compute_vec_sub + { + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& a, vec const& b) + { + return detail::functor2::call(std::minus(), a, b); + } + }; + + template + struct compute_vec_mul + { + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& a, vec const& b) + { + return detail::functor2::call(std::multiplies(), a, b); + } + }; + + template + struct compute_vec_div + { + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& a, vec const& b) + { + return detail::functor2::call(std::divides(), a, b); + } + }; + + template + struct compute_vec_mod + { + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& a, vec const& b) + { + return detail::functor2::call(std::modulus(), a, b); + } + }; + + template + struct compute_vec_and + { + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& a, vec const& b) + { + vec v(a); + for (int i = 0; i < L; ++i) + v[i] &= static_cast(b[i]); + return v; + } + }; + + template + struct compute_vec_or + { + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& a, vec const& b) + { + vec v(a); + for (int i = 0; i < L; ++i) + v[i] |= static_cast(b[i]); + return v; + } + }; + + template + struct compute_vec_xor + { + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& a, vec const& b) + { + vec v(a); + for (int i = 0; i < L; ++i) + v[i] ^= static_cast(b[i]); + return v; + } + }; + + template + struct compute_vec_shift_left + { + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& a, vec const& b) + { + vec v(a); + for (int i = 0; i < L; ++i) + v[i] <<= static_cast(b[i]); + return v; + } + }; + + template + struct compute_vec_shift_right + { + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& a, vec const& b) + { + vec v(a); + for (int i = 0; i < L; ++i) + v[i] >>= static_cast(b[i]); + return v; + } + }; + + template + struct compute_vec_equal + { + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static bool call(vec const& v1, vec const& v2) + { + bool b = true; + for (int i = 0; i < L; ++i) + b = b && detail::compute_equal::is_iec559>::call(v1.x, v2.x); + return b; + } + }; + + template + struct compute_vec_nequal + { + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static bool call(vec<4, T, Q> const& v1, vec<4, T, Q> const& v2) + { + return !compute_vec_equal::value, sizeof(T) * 8, detail::is_aligned::value>::call(v1, v2); + } + }; + + template + struct compute_vec_bitwise_not + { + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& a) + { + vec v(a); + for (int i = 0; i < L; ++i) + v[i] = ~v[i]; + return v; + } + }; + + } +} diff --git a/glm/detail/func_common.inl b/glm/detail/func_common.inl index f8584c9e..f0397d4e 100644 --- a/glm/detail/func_common.inl +++ b/glm/detail/func_common.inl @@ -20,6 +20,11 @@ namespace glm return (y < x) ? y : x; } + template + struct TMin { + T operator()(const T& a, const T& b) { return min(a, b); } + }; + // max template GLM_FUNC_QUALIFIER GLM_CONSTEXPR genType max(genType x, genType y) @@ -29,6 +34,11 @@ namespace glm return (x < y) ? y : x; } + template + struct TMax { + T operator()(const T& a, const T& b) { return max(a, b); } + }; + // abs template<> GLM_FUNC_QUALIFIER GLM_CONSTEXPR int abs(int x) @@ -37,6 +47,11 @@ namespace glm return (x ^ y) - y; } + template + struct TAbs { + T operator()(const T& a) { return abs(a); } + }; + // round # if GLM_HAS_CXX11_STL using ::std::round; @@ -50,6 +65,11 @@ namespace glm } # endif + template + struct TRound { + T operator()(const T& a) { return round(a); } + }; + // trunc # if GLM_HAS_CXX11_STL using ::std::trunc; @@ -63,6 +83,16 @@ namespace glm } # endif + template + struct TTrunc { + T operator()(const T& a) { return trunc(a); } + }; + + template + struct TFmod { + T operator()(const T& a, const T& b) { return std::fmod(a, b); } + }; + }//namespace glm namespace glm{ @@ -80,7 +110,7 @@ namespace detail template struct compute_mix_vector { - GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& x, vec const& y, vec const& a) + GLM_FUNC_QUALIFIER static vec call(vec const& x, vec const& y, vec const& a) { GLM_STATIC_ASSERT(std::numeric_limits::is_iec559 || GLM_CONFIG_UNRESTRICTED_FLOAT || GLM_CONFIG_UNRESTRICTED_GENTYPE, "'mix' only accept floating-point inputs for the interpolator a"); @@ -91,7 +121,7 @@ namespace detail template struct compute_mix_vector { - GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& x, vec const& y, vec const& a) + GLM_FUNC_QUALIFIER static vec call(vec const& x, vec const& y, vec const& a) { vec Result(0); for(length_t i = 0; i < x.length(); ++i) @@ -103,7 +133,7 @@ namespace detail template struct compute_mix_scalar { - GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& x, vec const& y, U const& a) + GLM_FUNC_QUALIFIER static vec call(vec const& x, vec const& y, U const& a) { GLM_STATIC_ASSERT(std::numeric_limits::is_iec559 || GLM_CONFIG_UNRESTRICTED_FLOAT || GLM_CONFIG_UNRESTRICTED_GENTYPE, "'mix' only accept floating-point inputs for the interpolator a"); @@ -114,7 +144,7 @@ namespace detail template struct compute_mix_scalar { - GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& x, vec const& y, bool const& a) + GLM_FUNC_QUALIFIER static vec call(vec const& x, vec const& y, bool const& a) { return a ? y : x; } @@ -123,7 +153,7 @@ namespace detail template struct compute_mix { - GLM_FUNC_QUALIFIER GLM_CONSTEXPR static T call(T const& x, T const& y, U const& a) + GLM_FUNC_QUALIFIER static T call(T const& x, T const& y, U const& a) { GLM_STATIC_ASSERT(std::numeric_limits::is_iec559 || GLM_CONFIG_UNRESTRICTED_FLOAT || GLM_CONFIG_UNRESTRICTED_GENTYPE, "'mix' only accept floating-point inputs for the interpolator a"); @@ -134,7 +164,7 @@ namespace detail template struct compute_mix { - GLM_FUNC_QUALIFIER GLM_CONSTEXPR static T call(T const& x, T const& y, bool const& a) + GLM_FUNC_QUALIFIER static T call(T const& x, T const& y, bool const& a) { return a ? y : x; } @@ -143,7 +173,7 @@ namespace detail template struct compute_sign { - GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& x) + GLM_FUNC_QUALIFIER static vec call(vec const& x) { return vec(glm::lessThan(vec(0), x)) - vec(glm::lessThan(x, vec(0))); } @@ -153,7 +183,7 @@ namespace detail template struct compute_sign { - GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& x) + GLM_FUNC_QUALIFIER static vec call(vec const& x) { T const Shift(static_cast(sizeof(T) * 8 - 1)); vec const y(vec::type, Q>(-x) >> typename detail::make_unsigned::type(Shift)); @@ -218,12 +248,21 @@ namespace detail } }; + template + struct compute_fma + { + GLM_FUNC_QUALIFIER static vec call(vec const& a, vec const& b, vec const& c) + { + return a * b + c; + } + }; + template struct compute_min_vector { GLM_FUNC_QUALIFIER static vec call(vec const& x, vec const& y) { - return detail::functor2::call(min, x, y); + return detail::functor2::call(TMin(), x, y); } }; @@ -232,7 +271,7 @@ namespace detail { GLM_FUNC_QUALIFIER static vec call(vec const& x, vec const& y) { - return detail::functor2::call(max, x, y); + return detail::functor2::call(TMax(), x, y); } }; @@ -264,6 +303,56 @@ namespace detail return tmp * tmp * (static_cast(3) - static_cast(2) * tmp); } }; + + template + struct convert_vec3_to_vec4W0 + { + GLM_FUNC_QUALIFIER static vec<4, T, Q> call(vec<3, T, Q> const& a) + { + return vec<4, T, Q>(a.x, a.y, a.z, 0.0f); + } + }; + + template + struct convert_vec3_to_vec4WZ + { + GLM_FUNC_QUALIFIER static vec<4, T, Q> call(vec<3, T, Q> const& a) + { + return vec<4, T, Q>(a.x, a.y, a.z, a.z); + } + }; + + template + struct convert_vec3_to_vec4W1 + { + GLM_FUNC_QUALIFIER static vec<4, T, Q> call(vec<3, T, Q> const& a) + { + return vec<4, T, Q>(a.x, a.y, a.z, 1.0f); + } + }; + + template + struct convert_vec4_to_vec3 + { + GLM_FUNC_QUALIFIER static vec<4, T, Q> call(vec<3, T, Q> const& a) + { + return vec<4, T, Q>(a.x, a.y, a.z, 0.0f); + } + }; + + template + struct convert_splat { + template + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& a) + { + vec v; + for (int i = 0; i < L; ++i) + v[i] = a[c]; + return v; + } + }; + + }//namespace detail template @@ -422,6 +511,61 @@ namespace detail return detail::compute_mod::value>::call(x, y); } + template + GLM_FUNC_QUALIFIER vec fma(vec const& a, vec const& b, vec const& c) + { + return detail::compute_fma::value>::call(a, b, c); + } + + template + GLM_FUNC_QUALIFIER vec<4, T, Q> xyz0(vec<3, T, Q> const& a) + { + return detail::convert_vec3_to_vec4W0::value>::call(a); + } + + template + GLM_FUNC_QUALIFIER vec<4, T, Q> xyz1(vec<3, T, Q> const& a) + { + return detail::convert_vec3_to_vec4W1::value>::call(a); + } + + template + GLM_FUNC_QUALIFIER vec<4, T, Q> xyzz(vec<3, T, Q> const& a) + { + return detail::convert_vec3_to_vec4WZ::value>::call(a); + } + + template + GLM_FUNC_QUALIFIER vec<3, T, Q> xyz(vec<4, T, Q> const& a) + { + return detail::convert_vec4_to_vec3::value>::call(a); + } + + template + GLM_FUNC_QUALIFIER vec splatX(vec const& a) + { + return detail::convert_splat::value>::template call<0>(a); + } + + template + GLM_FUNC_QUALIFIER vec splatY(vec const& a) + { + return detail::convert_splat::value>::template call<1>(a); + } + + template + GLM_FUNC_QUALIFIER vec splatZ(vec const& a) + { + return detail::convert_splat::value>::template call<2>(a); + } + + template + GLM_FUNC_QUALIFIER vec splatW(vec const& a) + { + return detail::convert_splat::value>::template call<3>(a); + } + + // modf template GLM_FUNC_QUALIFIER genType modf(genType x, genType & i) diff --git a/glm/detail/func_common_simd.inl b/glm/detail/func_common_simd.inl index ce0032d3..55c72286 100644 --- a/glm/detail/func_common_simd.inl +++ b/glm/detail/func_common_simd.inl @@ -225,7 +225,389 @@ namespace detail return Result; } }; + + template + struct compute_fma<4, float, Q, true> + { + GLM_FUNC_QUALIFIER static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b, vec<4, float, Q> const& c) + { + vec<4, float, Q> Result; + Result.data = glm_vec4_fma(a.data, b.data, c.data); + return Result; + } + }; + + template + struct compute_fma<3, float, Q, true> + { + GLM_FUNC_QUALIFIER static vec<3, float, Q> call(vec<3, float, Q> const& a, vec<3, float, Q> const& b, vec<3, float, Q> const& c) + { + vec<3, float, Q> Result; + Result.data = glm_vec4_fma(a.data, b.data, c.data); + return Result; + } + }; + + + template + struct compute_fma<4, double, Q, true> + { + GLM_FUNC_QUALIFIER static vec<4, double, Q> call(vec<4, double, Q> const& a, vec<4, double, Q> const& b, vec<4, double, Q> const& c) + { + vec<4, double, Q> Result; +# if (GLM_ARCH & GLM_ARCH_AVX2_BIT) && !(GLM_COMPILER & GLM_COMPILER_CLANG) + Result.data = _mm256_fmadd_pd(a.data, b.data, c.data); +# elif (GLM_ARCH & GLM_ARCH_AVX_BIT) + Result.data = _mm256_add_pd(_mm256_mul_pd(a.data, b.data), c.data); +# else + Result.data.setv(0, _mm_add_pd(_mm_mul_pd(a.data.getv(0), b.data.getv(0)), c.data.getv(0))); + Result.data.setv(1, _mm_add_pd(_mm_mul_pd(a.data.getv(1), b.data.getv(1)), c.data.getv(1))); +# endif + return Result; + } + }; + + // copy vec3 to vec4 and set w to 0 + template + struct convert_vec3_to_vec4W0 + { + GLM_FUNC_QUALIFIER static vec<4, float, Q> call(vec<3, float, Q> const& a) + { + vec<4, float, Q> v; +#if (GLM_ARCH & GLM_ARCH_SSE41_BIT) + v.data = _mm_blend_ps(a.data, _mm_setzero_ps(), 8); +#else + __m128i mask = _mm_set_epi32(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF); + __m128 v0 = _mm_castsi128_ps(_mm_and_si128(_mm_castps_si128(a.data), mask)); + v.data = v0; +#endif + return v; + } + }; + + // copy vec3 to vec4 and set w to 1 + template + struct convert_vec3_to_vec4W1 + { + GLM_FUNC_QUALIFIER static vec<4, float, Q> call(vec<3, float, Q> const& a) + { + vec<4, float, Q> v; +#if (GLM_ARCH & GLM_ARCH_SSE41_BIT) + v.data = _mm_blend_ps(a.data, _mm_set1_ps(1.0f), 8); +#else + __m128 t1 = _mm_shuffle_ps(a.data, a.data, _MM_SHUFFLE(0, 2, 1, 3)); //permute x, w + __m128 t2 = _mm_move_ss(t1, _mm_set_ss(1.0f)); // set x to 1.0f + v.data = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(0, 2, 1, 3)); //permute x, w +#endif + return v; + } + }; + + // copy vec3 to vec4 and set w to vec3.z + template + struct convert_vec3_to_vec4WZ + { + GLM_FUNC_QUALIFIER static vec<4, float, Q> call(vec<3, float, Q> const& a) + { + vec<4, float, Q> v; + v.data = _mm_shuffle_ps(a.data, a.data, _MM_SHUFFLE(2, 2, 1, 0)); + return v; + } + }; + + // copy vec3 to vec4 and set w to 0 + template + struct convert_vec3_to_vec4W0 + { + GLM_FUNC_QUALIFIER static vec<4, double, Q> call(vec<3, double, Q> const& a) + { + vec<4, double, Q> v; +#if (GLM_ARCH & GLM_ARCH_AVX_BIT) + v.data = _mm256_blend_pd(a.data, _mm256_setzero_pd(), 8); +#else + v.data.setv(0, a.data.getv(0)); + glm_dvec2 av2 = a.data.getv(1); + av2 = _mm_shuffle_pd(av2, _mm_setzero_pd(), 2); + v.data.setv(1, av2); +#endif + return v; + } + }; + + // copy vec3 to vec4 and set w to vec3.z + template + struct convert_vec3_to_vec4WZ + { + GLM_FUNC_QUALIFIER static vec<4, double, Q> call(vec<3, double, Q> const& a) + { + vec<4, double, Q> v; +#if (GLM_ARCH & GLM_ARCH_AVX_BIT) + v.data = _mm256_permute_pd(a.data, 2); +#else + v.data.setv(0, a.data.getv(0)); + glm_dvec2 av2 = a.data.getv(1); + __m128d t1 = _mm_shuffle_pd(av2, av2, 0); + v.data.setv(1, t1); +#endif + return v; + } + }; + + // copy vec3 to vec4 and set w to 1 + template + struct convert_vec3_to_vec4W1 + { + GLM_FUNC_QUALIFIER static vec<4, double, Q> call(vec<3, double, Q> const& a) + { + vec<4, double, Q> v; +#if (GLM_ARCH & GLM_ARCH_AVX_BIT) + v.data = _mm256_blend_pd(a.data, _mm256_set1_pd(1.0), 8); +#else + v.data.setv(0, a.data.getv(0)); + glm_dvec2 av2 = a.data.getv(1); + av2 = _mm_shuffle_pd(av2, _mm_set1_pd(1.), 2); + v.data.setv(1, av2); +#endif + return v; + } + }; + + template + struct convert_vec4_to_vec3 { + GLM_FUNC_QUALIFIER static vec<3, float, Q> call(vec<4, float, Q> const& a) + { + vec<3, float, Q> v; + v.data = a.data; + return v; + } + }; + + template + struct convert_vec4_to_vec3 { + GLM_FUNC_QUALIFIER static vec<3, double, Q> call(vec<4, double, Q> const& a) + { + vec<3, double, Q> v; +#if GLM_ARCH & GLM_ARCH_AVX_BIT + v.data = a.data; +#else + v.data.setv(0, a.data.getv(0)); + v.data.setv(1, a.data.getv(1)); +#endif + return v; + } + }; + + + // set all coordinates to same value vec[c] + template + struct convert_splat { + template + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& a) + { + vec Result; + const int s = _MM_SHUFFLE(c, c, c, c); + glm_f32vec4 va = static_cast(a.data); +# if GLM_ARCH & GLM_ARCH_AVX_BIT + Result.data = _mm_permute_ps(va, s); +# else + Result.data = _mm_shuffle_ps(va, va, s); +# endif + return Result; + } + }; + + // set all coordinates to same value vec[c] + template + struct convert_splat { + + template + struct detailSSE + {}; + + template + struct detailSSE + { + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& a) + { + vec Result; + glm_f64vec2 r0 = _mm_shuffle_pd(a.data.getv(0), a.data.getv(0), c | c << 1); + Result.data.setv(0, r0); + Result.data.setv(1, r0); + return Result; + } + }; + + template + struct detailSSE + { + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& a) + { + vec Result; + const unsigned int d = static_cast(c - 2); + glm_f64vec2 r0 = _mm_shuffle_pd(a.data.getv(1), a.data.getv(1), d | d << 1); + Result.data.setv(0, r0); + Result.data.setv(1, r0); + return Result; + } + }; + +#if GLM_ARCH & GLM_ARCH_AVX_BIT + template //note: bool is useless but needed to compil on linux (gcc) + struct detailAVX + {}; + + template + struct detailAVX + { + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& a) + { + vec Result; + __m256d t1 = _mm256_permute2f128_pd(a.data, a.data, 0x0); + Result.data = _mm256_permute_pd(t1, 0); + return Result; + } + }; + + template + struct detailAVX + { + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& a) + { + vec Result; + __m256d t1 = _mm256_permute2f128_pd(a.data, a.data, 0x0); + Result.data = _mm256_permute_pd(t1, 0xf); + return Result; + } + }; + + template + struct detailAVX + { + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& a) + { + vec Result; + __m256d t2 = _mm256_permute2f128_pd(a.data, a.data, 0x11); + Result.data = _mm256_permute_pd(t2, 0x0); + return Result; + } + }; + + template + struct detailAVX + { + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& a) + { + vec Result; + __m256d t2 = _mm256_permute2f128_pd(a.data, a.data, 0x11); + Result.data = _mm256_permute_pd(t2, 0xf); + return Result; + } + }; +#endif //GLM_ARCH & GLM_ARCH_AVX_BIT + + template + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& a) + { + //return compute_splat::call(a); + vec Result; +# if GLM_ARCH & GLM_ARCH_AVX2_BIT + Result.data = _mm256_permute4x64_pd(a.data, _MM_SHUFFLE(c, c, c, c)); +# elif GLM_ARCH & GLM_ARCH_AVX_BIT + Result = detailAVX::call(a); +# else +#if 1 //detail<(c <= 1), c>::call2(a) is equivalent to following code but without if constexpr usage + Result = detailSSE<(c <= 1), c>::call(a); +#else + if constexpr (c <= 1) + { + glm_f64vec2 r0 = _mm_shuffle_pd(a.data.getv(0), a.data.getv(0), c | c << 1); + Result.data.setv(0, r0); + Result.data.setv(1, r0); + } + else + { + const unsigned int d = (unsigned int)(c - 2); + glm_f64vec2 r0 = _mm_shuffle_pd(a.data.getv(1), a.data.getv(1), d | d << 1); + Result.data.setv(0, r0); + Result.data.setv(1, r0); + } +#endif +# endif + return Result; + } + }; + + }//namespace detail }//namespace glm #endif//GLM_ARCH & GLM_ARCH_SSE2_BIT + +#if GLM_ARCH & GLM_ARCH_NEON_BIT +namespace glm { + namespace detail { + + template + struct convert_vec3_to_vec4W0 + { + GLM_FUNC_QUALIFIER static vec<4, float, Q> call(vec<3, float, Q> const& a) + { + vec<4, float, Q> v; + static const uint32x4_t mask = { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; + v.data = vbslq_f32(mask, a.data, vdupq_n_f32(0)); + return v; + } + }; + + template + struct convert_vec4_to_vec3 { + GLM_FUNC_QUALIFIER static vec<3, float, Q> call(vec<4, float, Q> const& a) + { + vec<3, float, Q> v; + v.data = a.data; + return v; + } + }; + + template + struct compute_splat { + template + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call(vec const& a) + {} + + template<> + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call<0>(vec const& a) + { + vec Result; + Result.data = vdupq_lane_f32(vget_low_f32(a.data), 0); + return Result; + } + + template<> + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call<1>(vec const& a) + { + vec Result; + Result.data = vdupq_lane_f32(vget_low_f32(a.data), 1); + return Result; + } + + template<> + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call<2>(vec const& a) + { + vec Result; + Result.data = vdupq_lane_f32(vget_high_f32(a.data), 0); + return Result; + } + + template<> + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec call<3>(vec const& a) + { + vec Result; + Result.data = vdupq_lane_f32(vget_high_f32(a.data), 1); + return Result; + } + }; + +}//namespace detail +}//namespace glm +#endif //GLM_ARCH & GLM_ARCH_NEON_BIT diff --git a/glm/detail/func_geometric.inl b/glm/detail/func_geometric.inl index 404c9905..8f3cc773 100644 --- a/glm/detail/func_geometric.inl +++ b/glm/detail/func_geometric.inl @@ -59,8 +59,13 @@ namespace detail { GLM_FUNC_QUALIFIER GLM_CONSTEXPR static T call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) { - vec<4, T, Q> tmp(a * b); - return (tmp.x + tmp.y) + (tmp.z + tmp.w); + // VS 17.7.4 generates longer assembly (~20 instructions vs 11 instructions) + #if defined(_MSC_VER) + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; + #else + vec<4, T, Q> tmp(a * b); + return (tmp.x + tmp.y) + (tmp.z + tmp.w); + #endif } }; @@ -76,6 +81,17 @@ namespace detail x.z * y.x - y.z * x.x, x.x * y.y - y.x * x.y); } + + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec<4, T, Q> call(vec<4, T, Q> const& x, vec<4, T, Q> const& y) + { + GLM_STATIC_ASSERT(std::numeric_limits::is_iec559, "'cross' accepts only floating-point inputs"); + + return vec<4, T, Q>( + x.y * y.z - y.y * x.z, + x.z * y.x - y.z * x.x, + x.x * y.y - y.x * x.y, + 0.0f); + } }; template diff --git a/glm/detail/func_geometric_simd.inl b/glm/detail/func_geometric_simd.inl index 2076dae0..4c7f56bf 100644 --- a/glm/detail/func_geometric_simd.inl +++ b/glm/detail/func_geometric_simd.inl @@ -35,18 +35,36 @@ namespace detail } }; + template + struct compute_dot, float, true> + { + GLM_FUNC_QUALIFIER static float call(vec<3, float, Q> const& a, vec<3, float, Q> const& b) + { + vec<4, float, Q> aa = xyz0(a); + vec<4, float, Q> bb = xyz0(b); + return _mm_cvtss_f32(glm_vec1_dot(aa.data, bb.data)); + } + }; + template struct compute_cross { GLM_FUNC_QUALIFIER static vec<3, float, Q> call(vec<3, float, Q> const& a, vec<3, float, Q> const& b) { - __m128 const set0 = _mm_set_ps(0.0f, a.z, a.y, a.x); - __m128 const set1 = _mm_set_ps(0.0f, b.z, b.y, b.x); - __m128 const xpd0 = glm_vec4_cross(set0, set1); + vec<4, float, Q> aa = xyzz(a); + vec<4, float, Q> bb = xyzz(b); + __m128 const xpd0 = glm_vec4_cross(aa.data, bb.data); - vec<4, float, Q> Result; + vec<3, float, Q> Result; Result.data = xpd0; - return vec<3, float, Q>(Result); + return Result; + } + + GLM_FUNC_QUALIFIER static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) + { + vec<4, float, Q> Result; + Result.data = glm_vec4_cross(a.data, b.data); + return Result; } }; diff --git a/glm/detail/func_matrix.inl b/glm/detail/func_matrix.inl index 081761f3..9c7f7eb7 100644 --- a/glm/detail/func_matrix.inl +++ b/glm/detail/func_matrix.inl @@ -318,28 +318,69 @@ namespace detail } }; + template + struct inv3x3 {}; + + template + struct inv3x3 + { + GLM_FUNC_QUALIFIER static mat<3, 3, T, Q> call(mat<3, 3, T, Q> const& m) + { + // see: https://www.onlinemathstutor.org/post/3x3_inverses + + vec<4, T, Q> a = xyz0(m[0]); + vec<4, T, Q> b = xyz0(m[1]); + vec<4, T, Q> c = xyz0(m[2]); + + vec<4, T, Q> i0 = compute_cross::call(b, c); + vec<4, T, Q> i1 = compute_cross::call(c, a); + vec<4, T, Q> i2 = compute_cross::call(a, b); + + mat<3, 3, T, Q> Inverse; + Inverse[0] = xyz(i0); + Inverse[1] = xyz(i1); + Inverse[2] = xyz(i2); + Inverse = transpose(Inverse); + + T Determinant = compute_dot, T, true>::call(a, compute_cross::call(b, c)); + vec<3, T, Q> OneOverDeterminant(static_cast(1) / Determinant); + Inverse *= OneOverDeterminant; + return Inverse; + } + }; + + template + struct inv3x3 + { + GLM_FUNC_QUALIFIER static mat<3, 3, T, Q> call(mat<3, 3, T, Q> const& m) + { + T OneOverDeterminant = static_cast(1) / ( + +m[0][0] * (m[1][1] * m[2][2] - m[2][1] * m[1][2]) + - m[1][0] * (m[0][1] * m[2][2] - m[2][1] * m[0][2]) + + m[2][0] * (m[0][1] * m[1][2] - m[1][1] * m[0][2])); + + mat<3, 3, T, Q> Inverse; + Inverse[0][0] = +(m[1][1] * m[2][2] - m[2][1] * m[1][2]); + Inverse[1][0] = -(m[1][0] * m[2][2] - m[2][0] * m[1][2]); + Inverse[2][0] = +(m[1][0] * m[2][1] - m[2][0] * m[1][1]); + Inverse[0][1] = -(m[0][1] * m[2][2] - m[2][1] * m[0][2]); + Inverse[1][1] = +(m[0][0] * m[2][2] - m[2][0] * m[0][2]); + Inverse[2][1] = -(m[0][0] * m[2][1] - m[2][0] * m[0][1]); + Inverse[0][2] = +(m[0][1] * m[1][2] - m[1][1] * m[0][2]); + Inverse[1][2] = -(m[0][0] * m[1][2] - m[1][0] * m[0][2]); + Inverse[2][2] = +(m[0][0] * m[1][1] - m[1][0] * m[0][1]); + + Inverse *= OneOverDeterminant; + return Inverse; + } + }; + template struct compute_inverse<3, 3, T, Q, Aligned> { GLM_FUNC_QUALIFIER static mat<3, 3, T, Q> call(mat<3, 3, T, Q> const& m) { - T OneOverDeterminant = static_cast(1) / ( - + m[0][0] * (m[1][1] * m[2][2] - m[2][1] * m[1][2]) - - m[1][0] * (m[0][1] * m[2][2] - m[2][1] * m[0][2]) - + m[2][0] * (m[0][1] * m[1][2] - m[1][1] * m[0][2])); - - mat<3, 3, T, Q> Inverse; - Inverse[0][0] = + (m[1][1] * m[2][2] - m[2][1] * m[1][2]) * OneOverDeterminant; - Inverse[1][0] = - (m[1][0] * m[2][2] - m[2][0] * m[1][2]) * OneOverDeterminant; - Inverse[2][0] = + (m[1][0] * m[2][1] - m[2][0] * m[1][1]) * OneOverDeterminant; - Inverse[0][1] = - (m[0][1] * m[2][2] - m[2][1] * m[0][2]) * OneOverDeterminant; - Inverse[1][1] = + (m[0][0] * m[2][2] - m[2][0] * m[0][2]) * OneOverDeterminant; - Inverse[2][1] = - (m[0][0] * m[2][1] - m[2][0] * m[0][1]) * OneOverDeterminant; - Inverse[0][2] = + (m[0][1] * m[1][2] - m[1][1] * m[0][2]) * OneOverDeterminant; - Inverse[1][2] = - (m[0][0] * m[1][2] - m[1][0] * m[0][2]) * OneOverDeterminant; - Inverse[2][2] = + (m[0][0] * m[1][1] - m[1][0] * m[0][1]) * OneOverDeterminant; - - return Inverse; + return detail::inv3x3::value>::call(m); } }; diff --git a/glm/detail/func_matrix_simd.inl b/glm/detail/func_matrix_simd.inl index b9bb4615..9e47bf70 100644 --- a/glm/detail/func_matrix_simd.inl +++ b/glm/detail/func_matrix_simd.inl @@ -37,6 +37,17 @@ namespace detail } }; + template + struct compute_transpose<3, 3, float, Q, true> + { + GLM_FUNC_QUALIFIER static mat<3, 3, float, Q> call(mat<3, 3, float, Q> const& m) + { + mat<3, 3, float, Q> Result; + glm_mat3_transpose(&m[0].data, &Result[0].data); + return Result; + } + }; + template struct compute_determinant<4, 4, float, Q, true> { diff --git a/glm/detail/qualifier.hpp b/glm/detail/qualifier.hpp index a6c96cca..cb4e108a 100644 --- a/glm/detail/qualifier.hpp +++ b/glm/detail/qualifier.hpp @@ -126,6 +126,24 @@ namespace detail typedef glm_u32vec4 type; }; + template<> + struct storage<3, float, true> + { + typedef glm_f32vec4 type; + }; + + template<> + struct storage<3, int, true> + { + typedef glm_i32vec4 type; + }; + + template<> + struct storage<3, unsigned int, true> + { + typedef glm_i32vec4 type; + }; + template<> struct storage<2, double, true> { @@ -143,13 +161,38 @@ namespace detail { typedef glm_u64vec2 type; }; -# endif -# if (GLM_ARCH & GLM_ARCH_AVX_BIT) + + + template<> + struct storage<3, detail::uint64, true> + { + typedef glm_u64vec2 type; + }; + template<> struct storage<4, double, true> { +# if (GLM_ARCH & GLM_ARCH_AVX_BIT) typedef glm_f64vec4 type; +# else + struct type + { + glm_f64vec2 data[2]; + GLM_CONSTEXPR glm_f64vec2 getv(int i) const { + return data[i]; + } + GLM_CONSTEXPR void setv(int i, const glm_f64vec2& v) { + data[i] = v; + } + }; +# endif }; + + + template<> + struct storage<3, double, true> : public storage<4, double, true> + {}; + # endif # if (GLM_ARCH & GLM_ARCH_AVX2_BIT) @@ -173,17 +216,38 @@ namespace detail typedef glm_f32vec4 type; }; + template<> + struct storage<3, float, true> : public storage<4, float, true> + {}; + template<> struct storage<4, int, true> { typedef glm_i32vec4 type; }; + template<> + struct storage<3, int, true> : public storage<4, int, true> + {}; + template<> struct storage<4, unsigned int, true> { typedef glm_u32vec4 type; }; + + template<> + struct storage<3, unsigned int, true> : public storage<4, unsigned int, true> + {}; + + template<> + struct storage<3, double, true> + { + typedef struct alignas(4 * sizeof(double)) type { + double data[4]; + } type; + }; + # endif enum genTypeEnum diff --git a/glm/detail/type_mat3x3.inl b/glm/detail/type_mat3x3.inl index a9b633ae..5ccfb229 100644 --- a/glm/detail/type_mat3x3.inl +++ b/glm/detail/type_mat3x3.inl @@ -1,4 +1,5 @@ #include "../matrix.hpp" +#include "../common.hpp" namespace glm { @@ -307,9 +308,10 @@ namespace glm template GLM_FUNC_QUALIFIER GLM_CONSTEXPR mat<3, 3, T, Q> & mat<3, 3, T, Q>::operator*=(U s) { - this->value[0] *= s; - this->value[1] *= s; - this->value[2] *= s; + col_type sv(s); + this->value[0] *= sv; + this->value[1] *= sv; + this->value[2] *= sv; return *this; } @@ -468,54 +470,86 @@ namespace glm GLM_FUNC_QUALIFIER GLM_CONSTEXPR typename mat<3, 3, T, Q>::col_type operator*(mat<3, 3, T, Q> const& m, typename mat<3, 3, T, Q>::row_type const& v) { return typename mat<3, 3, T, Q>::col_type( - m[0][0] * v.x + m[1][0] * v.y + m[2][0] * v.z, - m[0][1] * v.x + m[1][1] * v.y + m[2][1] * v.z, - m[0][2] * v.x + m[1][2] * v.y + m[2][2] * v.z); + m[0] * splatX(v) + m[1] * splatY(v) + m[2] * splatZ(v)); } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR typename mat<3, 3, T, Q>::row_type operator*(typename mat<3, 3, T, Q>::col_type const& v, mat<3, 3, T, Q> const& m) { return typename mat<3, 3, T, Q>::row_type( - m[0][0] * v.x + m[0][1] * v.y + m[0][2] * v.z, - m[1][0] * v.x + m[1][1] * v.y + m[1][2] * v.z, - m[2][0] * v.x + m[2][1] * v.y + m[2][2] * v.z); + dot(m[0], v), + dot(m[1], v), + dot(m[2], v)); + } + + namespace detail + { + template + struct mul3x3 {}; + +#if GLM_CONFIG_SIMD == GLM_ENABLE + template + struct mul3x3 + { + GLM_FUNC_QUALIFIER static mat<3, 3, T, Q> call(mat<3, 3, T, Q> const& m1, mat<3, 3, T, Q> const& m2) + { + typename mat<4, 4, T, Q>::col_type const SrcA0 = xyzz(m1[0]); + typename mat<4, 4, T, Q>::col_type const SrcA1 = xyzz(m1[1]); + typename mat<4, 4, T, Q>::col_type const SrcA2 = xyzz(m1[2]); + + typename mat<4, 4, T, Q>::col_type const SrcB0 = xyzz(m2[0]); + typename mat<4, 4, T, Q>::col_type const SrcB1 = xyzz(m2[1]); + typename mat<4, 4, T, Q>::col_type const SrcB2 = xyzz(m2[2]); + + mat<3, 3, T, Q> Result; + Result[0] = xyz(glm::fma(SrcA2, splatZ(SrcB0), glm::fma(SrcA1, splatY(SrcB0), SrcA0 * splatX(SrcB0)))); + Result[1] = xyz(glm::fma(SrcA2, splatZ(SrcB1), glm::fma(SrcA1, splatY(SrcB1), SrcA0 * splatX(SrcB1)))); + Result[2] = xyz(glm::fma(SrcA2, splatZ(SrcB2), glm::fma(SrcA1, splatY(SrcB2), SrcA0 * splatX(SrcB2)))); + return mat<3, 3, T, Q>(Result); + } + }; +#endif + template + struct mul3x3 + { + GLM_FUNC_QUALIFIER static mat<3, 3, T, Q> call(mat<3, 3, T, Q> const& m1, mat<3, 3, T, Q> const& m2) + { + typename mat<3, 3, T, Q>::col_type const& SrcA0 = m1[0]; + typename mat<3, 3, T, Q>::col_type const& SrcA1 = m1[1]; + typename mat<3, 3, T, Q>::col_type const& SrcA2 = m1[2]; + + typename mat<3, 3, T, Q>::col_type const& SrcB0 = m2[0]; + typename mat<3, 3, T, Q>::col_type const& SrcB1 = m2[1]; + typename mat<3, 3, T, Q>::col_type const& SrcB2 = m2[2]; + + mat<3, 3, T, Q> Result; + // note: the following lines are decomposed to have consistent results between simd and non simd code (prevent rounding error because of operation order) + //Result[0] = SrcA2 * SrcB1.z + SrcA1 * SrcB1.y + SrcA0 * SrcB1.x; + //Result[1] = SrcA2 * SrcB1.z + SrcA1 * SrcB1.y + SrcA0 * SrcB1.x; + //Result[2] = SrcA2 * SrcB2.z + SrcA1 * SrcB2.y + SrcA0 * SrcB2.x; + + typename mat<3, 3, T, Q>::col_type tmp; + tmp = SrcA0 * SrcB0.x; + tmp += SrcA1 * SrcB0.y; + tmp += SrcA2 * SrcB0.z; + Result[0] = tmp; + tmp = SrcA0 * SrcB1.x; + tmp += SrcA1 * SrcB1.y; + tmp += SrcA2 * SrcB1.z; + Result[1] = tmp; + tmp = SrcA0 * SrcB2.x; + tmp += SrcA1 * SrcB2.y; + tmp += SrcA2 * SrcB2.z; + Result[2] = tmp; + return Result; + } + }; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR mat<3, 3, T, Q> operator*(mat<3, 3, T, Q> const& m1, mat<3, 3, T, Q> const& m2) { - T const SrcA00 = m1[0][0]; - T const SrcA01 = m1[0][1]; - T const SrcA02 = m1[0][2]; - T const SrcA10 = m1[1][0]; - T const SrcA11 = m1[1][1]; - T const SrcA12 = m1[1][2]; - T const SrcA20 = m1[2][0]; - T const SrcA21 = m1[2][1]; - T const SrcA22 = m1[2][2]; - - T const SrcB00 = m2[0][0]; - T const SrcB01 = m2[0][1]; - T const SrcB02 = m2[0][2]; - T const SrcB10 = m2[1][0]; - T const SrcB11 = m2[1][1]; - T const SrcB12 = m2[1][2]; - T const SrcB20 = m2[2][0]; - T const SrcB21 = m2[2][1]; - T const SrcB22 = m2[2][2]; - - mat<3, 3, T, Q> Result; - Result[0][0] = SrcA00 * SrcB00 + SrcA10 * SrcB01 + SrcA20 * SrcB02; - Result[0][1] = SrcA01 * SrcB00 + SrcA11 * SrcB01 + SrcA21 * SrcB02; - Result[0][2] = SrcA02 * SrcB00 + SrcA12 * SrcB01 + SrcA22 * SrcB02; - Result[1][0] = SrcA00 * SrcB10 + SrcA10 * SrcB11 + SrcA20 * SrcB12; - Result[1][1] = SrcA01 * SrcB10 + SrcA11 * SrcB11 + SrcA21 * SrcB12; - Result[1][2] = SrcA02 * SrcB10 + SrcA12 * SrcB11 + SrcA22 * SrcB12; - Result[2][0] = SrcA00 * SrcB20 + SrcA10 * SrcB21 + SrcA20 * SrcB22; - Result[2][1] = SrcA01 * SrcB20 + SrcA11 * SrcB21 + SrcA21 * SrcB22; - Result[2][2] = SrcA02 * SrcB20 + SrcA12 * SrcB21 + SrcA22 * SrcB22; - return Result; + return detail::mul3x3::value>::call(m1, m2); } template diff --git a/glm/detail/type_mat4x4.inl b/glm/detail/type_mat4x4.inl index 116731d7..6e328a9e 100644 --- a/glm/detail/type_mat4x4.inl +++ b/glm/detail/type_mat4x4.inl @@ -1,4 +1,5 @@ #include "../matrix.hpp" +#include "../geometric.hpp" namespace glm { @@ -588,10 +589,10 @@ namespace glm ) { return typename mat<4, 4, T, Q>::row_type( - m[0][0] * v[0] + m[0][1] * v[1] + m[0][2] * v[2] + m[0][3] * v[3], - m[1][0] * v[0] + m[1][1] * v[1] + m[1][2] * v[2] + m[1][3] * v[3], - m[2][0] * v[0] + m[2][1] * v[1] + m[2][2] * v[2] + m[2][3] * v[3], - m[3][0] * v[0] + m[3][1] * v[1] + m[3][2] * v[2] + m[3][3] * v[3]); + glm::dot(m[0], v), + glm::dot(m[1], v), + glm::dot(m[2], v), + glm::dot(m[3], v)); } template @@ -626,25 +627,89 @@ namespace glm m1[0][3] * m2[2][0] + m1[1][3] * m2[2][1] + m1[2][3] * m2[2][2] + m1[3][3] * m2[2][3]); } + namespace detail + { + template + struct mul4x4 {}; + + template + struct mul4x4 + { + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static mat<4, 4, T, Q> call(mat<4, 4, T, Q> const& m1, mat<4, 4, T, Q> const& m2) + { + typename mat<4, 4, T, Q>::col_type const SrcA0 = m1[0]; + typename mat<4, 4, T, Q>::col_type const SrcA1 = m1[1]; + typename mat<4, 4, T, Q>::col_type const SrcA2 = m1[2]; + typename mat<4, 4, T, Q>::col_type const SrcA3 = m1[3]; + + typename mat<4, 4, T, Q>::col_type const SrcB0 = m2[0]; + typename mat<4, 4, T, Q>::col_type const SrcB1 = m2[1]; + typename mat<4, 4, T, Q>::col_type const SrcB2 = m2[2]; + typename mat<4, 4, T, Q>::col_type const SrcB3 = m2[3]; + + mat<4, 4, T, Q> Result; + Result[0] = glm::fma(SrcA3, splatW(SrcB0), glm::fma(SrcA2, splatZ(SrcB0), glm::fma(SrcA1, splatY(SrcB0), SrcA0 * splatX(SrcB0)))); + Result[1] = glm::fma(SrcA3, splatW(SrcB1), glm::fma(SrcA2, splatZ(SrcB1), glm::fma(SrcA1, splatY(SrcB1), SrcA0 * splatX(SrcB1)))); + Result[2] = glm::fma(SrcA3, splatW(SrcB2), glm::fma(SrcA2, splatZ(SrcB2), glm::fma(SrcA1, splatY(SrcB2), SrcA0 * splatX(SrcB2)))); + Result[3] = glm::fma(SrcA3, splatW(SrcB3), glm::fma(SrcA2, splatZ(SrcB3), glm::fma(SrcA1, splatY(SrcB3), SrcA0 * splatX(SrcB3)))); + return mat < 4, 4, T, Q > (Result); + } + }; + + template + struct mul4x4 + { + GLM_FUNC_QUALIFIER GLM_CONSTEXPR static mat<4, 4, T, Q> call(mat<4, 4, T, Q> const& m1, mat<4, 4, T, Q> const& m2) + { + typename mat<4, 4, T, Q>::col_type const& SrcA0 = m1[0]; + typename mat<4, 4, T, Q>::col_type const& SrcA1 = m1[1]; + typename mat<4, 4, T, Q>::col_type const& SrcA2 = m1[2]; + typename mat<4, 4, T, Q>::col_type const& SrcA3 = m1[3]; + + typename mat<4, 4, T, Q>::col_type const& SrcB0 = m2[0]; + typename mat<4, 4, T, Q>::col_type const& SrcB1 = m2[1]; + typename mat<4, 4, T, Q>::col_type const& SrcB2 = m2[2]; + typename mat<4, 4, T, Q>::col_type const& SrcB3 = m2[3]; + + mat<4, 4, T, Q> Result; + // note: the following lines are decomposed to have consistent results between simd and non simd code (prevent rounding error because of operation order) + //Result[0] = SrcA3 * SrcB0.w + SrcA2 * SrcB0.z + SrcA1 * SrcB0.y + SrcA0 * SrcB0.x; + //Result[1] = SrcA3 * SrcB1.w + SrcA2 * SrcB1.z + SrcA1 * SrcB1.y + SrcA0 * SrcB1.x; + //Result[2] = SrcA3 * SrcB2.w + SrcA2 * SrcB2.z + SrcA1 * SrcB2.y + SrcA0 * SrcB2.x; + //Result[3] = SrcA3 * SrcB3.w + SrcA2 * SrcB3.z + SrcA1 * SrcB3.y + SrcA0 * SrcB3.x; + + typename mat<4, 4, T, Q>::col_type tmp; + tmp = SrcA0 * SrcB0.x; + tmp += SrcA1 * SrcB0.y; + tmp += SrcA2 * SrcB0.z; + tmp += SrcA3 * SrcB0.w; + Result[0] = tmp; + tmp = SrcA0 * SrcB1.x; + tmp += SrcA1 * SrcB1.y; + tmp += SrcA2 * SrcB1.z; + tmp += SrcA3 * SrcB1.w; + Result[1] = tmp; + tmp = SrcA0 * SrcB2.x; + tmp += SrcA1 * SrcB2.y; + tmp += SrcA2 * SrcB2.z; + tmp += SrcA3 * SrcB2.w; + Result[2] = tmp; + tmp = SrcA0 * SrcB3.x; + tmp += SrcA1 * SrcB3.y; + tmp += SrcA2 * SrcB3.z; + tmp += SrcA3 * SrcB3.w; + Result[3] = tmp; + + return Result; + } + }; + } + + template GLM_FUNC_QUALIFIER GLM_CONSTEXPR mat<4, 4, T, Q> operator*(mat<4, 4, T, Q> const& m1, mat<4, 4, T, Q> const& m2) { - typename mat<4, 4, T, Q>::col_type const SrcA0 = m1[0]; - typename mat<4, 4, T, Q>::col_type const SrcA1 = m1[1]; - typename mat<4, 4, T, Q>::col_type const SrcA2 = m1[2]; - typename mat<4, 4, T, Q>::col_type const SrcA3 = m1[3]; - - typename mat<4, 4, T, Q>::col_type const SrcB0 = m2[0]; - typename mat<4, 4, T, Q>::col_type const SrcB1 = m2[1]; - typename mat<4, 4, T, Q>::col_type const SrcB2 = m2[2]; - typename mat<4, 4, T, Q>::col_type const SrcB3 = m2[3]; - - mat<4, 4, T, Q> Result; - Result[0] = SrcA0 * SrcB0[0] + SrcA1 * SrcB0[1] + SrcA2 * SrcB0[2] + SrcA3 * SrcB0[3]; - Result[1] = SrcA0 * SrcB1[0] + SrcA1 * SrcB1[1] + SrcA2 * SrcB1[2] + SrcA3 * SrcB1[3]; - Result[2] = SrcA0 * SrcB2[0] + SrcA1 * SrcB2[1] + SrcA2 * SrcB2[2] + SrcA3 * SrcB2[3]; - Result[3] = SrcA0 * SrcB3[0] + SrcA1 * SrcB3[1] + SrcA2 * SrcB3[2] + SrcA3 * SrcB3[3]; - return Result; + return detail::mul4x4::value>::call(m1, m2); } template diff --git a/glm/detail/type_vec2.hpp b/glm/detail/type_vec2.hpp index 424868f5..d095a2cc 100644 --- a/glm/detail/type_vec2.hpp +++ b/glm/detail/type_vec2.hpp @@ -21,6 +21,10 @@ namespace glm typedef T value_type; typedef vec<2, T, Q> type; typedef vec<2, bool, Q> bool_type; + enum is_aligned + { + value = false + }; // -- Data -- diff --git a/glm/detail/type_vec3.hpp b/glm/detail/type_vec3.hpp index 67104800..8b221205 100644 --- a/glm/detail/type_vec3.hpp +++ b/glm/detail/type_vec3.hpp @@ -22,6 +22,11 @@ namespace glm typedef vec<3, T, Q> type; typedef vec<3, bool, Q> bool_type; + enum is_aligned + { + value = detail::is_aligned::value + }; + // -- Data -- # if GLM_SILENT_WARNINGS == GLM_ENABLE @@ -252,6 +257,8 @@ namespace glm GLM_FUNC_DECL GLM_CONSTEXPR vec<3, T, Q> & operator>>=(vec<3, U, Q> const& v); }; + + // -- Unary operators -- template @@ -428,6 +435,10 @@ namespace glm template GLM_FUNC_DECL GLM_CONSTEXPR vec<3, bool, Q> operator||(vec<3, bool, Q> const& v1, vec<3, bool, Q> const& v2); + + + + }//namespace glm #ifndef GLM_EXTERNAL_TEMPLATE diff --git a/glm/detail/type_vec3.inl b/glm/detail/type_vec3.inl index 4e44047b..eeae97ad 100644 --- a/glm/detail/type_vec3.inl +++ b/glm/detail/type_vec3.inl @@ -1,6 +1,7 @@ /// @ref core #include "compute_vector_relational.hpp" +#include "compute_vector_decl.hpp" namespace glm { @@ -225,120 +226,84 @@ namespace glm template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator+=(U scalar) { - this->x += static_cast(scalar); - this->y += static_cast(scalar); - this->z += static_cast(scalar); - return *this; + return (*this = detail::compute_vec_add<3, T, Q, detail::is_aligned::value>::call(*this, vec<3, T, Q>(scalar))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator+=(vec<1, U, Q> const& v) { - this->x += static_cast(v.x); - this->y += static_cast(v.x); - this->z += static_cast(v.x); - return *this; + return (*this = detail::compute_vec_add<3, T, Q, detail::is_aligned::value>::call(*this, vec<1, T, Q>(v.x))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator+=(vec<3, U, Q> const& v) { - this->x += static_cast(v.x); - this->y += static_cast(v.y); - this->z += static_cast(v.z); - return *this; + return (*this = detail::compute_vec_add<3, T, Q, detail::is_aligned::value>::call(*this, vec<3, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator-=(U scalar) { - this->x -= static_cast(scalar); - this->y -= static_cast(scalar); - this->z -= static_cast(scalar); - return *this; + return (*this = detail::compute_vec_sub<3, T, Q, detail::is_aligned::value>::call(*this, vec<3, T, Q>(scalar))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator-=(vec<1, U, Q> const& v) { - this->x -= static_cast(v.x); - this->y -= static_cast(v.x); - this->z -= static_cast(v.x); - return *this; + return (*this = detail::compute_vec_sub<3, T, Q, detail::is_aligned::value>::call(*this, vec<1, T, Q>(v.x))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator-=(vec<3, U, Q> const& v) { - this->x -= static_cast(v.x); - this->y -= static_cast(v.y); - this->z -= static_cast(v.z); - return *this; + return (*this = detail::compute_vec_sub<3, T, Q, detail::is_aligned::value>::call(*this, vec<3, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator*=(U scalar) { - this->x *= static_cast(scalar); - this->y *= static_cast(scalar); - this->z *= static_cast(scalar); - return *this; + return (*this = detail::compute_vec_mul<3, T, Q, detail::is_aligned::value>::call(*this, vec<3, T, Q>(static_cast(scalar)))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator*=(vec<1, U, Q> const& v) { - this->x *= static_cast(v.x); - this->y *= static_cast(v.x); - this->z *= static_cast(v.x); - return *this; + return (*this = detail::compute_vec_mul<3, T, Q, detail::is_aligned::value>::call(*this, vec<3, T, Q>(v.x))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator*=(vec<3, U, Q> const& v) { - this->x *= static_cast(v.x); - this->y *= static_cast(v.y); - this->z *= static_cast(v.z); - return *this; + return (*this = detail::compute_vec_mul<3, T, Q, detail::is_aligned::value>::call(*this, vec<3, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator/=(U v) { - this->x /= static_cast(v); - this->y /= static_cast(v); - this->z /= static_cast(v); - return *this; + return (*this = detail::compute_vec_div<3, T, Q, detail::is_aligned::value>::call(*this, vec<3, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator/=(vec<1, U, Q> const& v) { - this->x /= static_cast(v.x); - this->y /= static_cast(v.x); - this->z /= static_cast(v.x); - return *this; + return (*this = detail::compute_vec_div<3, T, Q, detail::is_aligned::value>::call(*this, vec<3, T, Q>(v.x))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator/=(vec<3, U, Q> const& v) { - this->x /= static_cast(v.x); - this->y /= static_cast(v.y); - this->z /= static_cast(v.z); - return *this; + return (*this = detail::compute_vec_div<3, T, Q, detail::is_aligned::value>::call(*this, vec<3, T, Q>(v))); } // -- Increment and decrement operators -- @@ -383,180 +348,127 @@ namespace glm template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator%=(U scalar) { - this->x %= scalar; - this->y %= scalar; - this->z %= scalar; - return *this; + return (*this = detail::compute_vec_mod<3, T, Q, detail::is_aligned::value>::call(*this, vec<3, T, Q>(scalar))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator%=(vec<1, U, Q> const& v) { - this->x %= v.x; - this->y %= v.x; - this->z %= v.x; - return *this; + return (*this = detail::compute_vec_mod<3, T, Q, detail::is_aligned::value>::call(*this, vec<3, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator%=(vec<3, U, Q> const& v) { - this->x %= v.x; - this->y %= v.y; - this->z %= v.z; - return *this; + return (*this = detail::compute_vec_mod<3, T, Q, detail::is_aligned::value>::call(*this, vec<3, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator&=(U scalar) { - this->x &= scalar; - this->y &= scalar; - this->z &= scalar; - return *this; + return (*this = detail::compute_vec_and<3, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<3, T, Q>(scalar))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator&=(vec<1, U, Q> const& v) { - this->x &= v.x; - this->y &= v.x; - this->z &= v.x; - return *this; + return (*this = detail::compute_vec_and<3, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<3, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator&=(vec<3, U, Q> const& v) { - this->x &= v.x; - this->y &= v.y; - this->z &= v.z; - return *this; + return (*this = detail::compute_vec_and<3, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<3, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator|=(U scalar) { - this->x |= scalar; - this->y |= scalar; - this->z |= scalar; - return *this; + return (*this = detail::compute_vec_or<3, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<3, T, Q>(scalar))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator|=(vec<1, U, Q> const& v) { - this->x |= v.x; - this->y |= v.x; - this->z |= v.x; - return *this; + return (*this = detail::compute_vec_or<3, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<3, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator|=(vec<3, U, Q> const& v) { - this->x |= v.x; - this->y |= v.y; - this->z |= v.z; - return *this; + return (*this = detail::compute_vec_or<3, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<3, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator^=(U scalar) { - this->x ^= scalar; - this->y ^= scalar; - this->z ^= scalar; - return *this; + return (*this = detail::compute_vec_xor<3, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<3, T, Q>(scalar))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator^=(vec<1, U, Q> const& v) { - this->x ^= v.x; - this->y ^= v.x; - this->z ^= v.x; - return *this; + return (*this = detail::compute_vec_xor<3, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<3, T, Q>(v.x))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator^=(vec<3, U, Q> const& v) { - this->x ^= v.x; - this->y ^= v.y; - this->z ^= v.z; - return *this; + return (*this = detail::compute_vec_xor<3, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<3, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator<<=(U scalar) { - this->x <<= scalar; - this->y <<= scalar; - this->z <<= scalar; - return *this; + return (*this = detail::compute_vec_shift_left<3, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<3, T, Q>(scalar))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator<<=(vec<1, U, Q> const& v) { - this->x <<= static_cast(v.x); - this->y <<= static_cast(v.x); - this->z <<= static_cast(v.x); - return *this; + return (*this = detail::compute_vec_shift_left<3, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<1, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator<<=(vec<3, U, Q> const& v) { - this->x <<= static_cast(v.x); - this->y <<= static_cast(v.y); - this->z <<= static_cast(v.z); - return *this; + return (*this = detail::compute_vec_shift_left<3, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<3, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator>>=(U scalar) { - this->x >>= static_cast(scalar); - this->y >>= static_cast(scalar); - this->z >>= static_cast(scalar); - return *this; + return (*this = detail::compute_vec_shift_right<3, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<3, T, Q>(scalar))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator>>=(vec<1, U, Q> const& v) { - this->x >>= static_cast(v.x); - this->y >>= static_cast(v.x); - this->z >>= static_cast(v.x); - return *this; + return (*this = detail::compute_vec_shift_right<3, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<3, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> & vec<3, T, Q>::operator>>=(vec<3, U, Q> const& v) { - this->x >>= static_cast(v.x); - this->y >>= static_cast(v.y); - this->z >>= static_cast(v.z); - return *this; + return (*this = detail::compute_vec_shift_right<3, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<3, T, Q>(v))); + } // -- Unary arithmetic operators -- @@ -570,10 +482,7 @@ namespace glm template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator-(vec<3, T, Q> const& v) { - return vec<3, T, Q>( - -v.x, - -v.y, - -v.z); + return vec<3, T, Q>(0) -= v; } // -- Binary arithmetic operators -- @@ -581,181 +490,122 @@ namespace glm template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator+(vec<3, T, Q> const& v, T scalar) { - return vec<3, T, Q>( - v.x + scalar, - v.y + scalar, - v.z + scalar); + return vec<3, T, Q>(v) += scalar; } template - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator+(vec<3, T, Q> const& v, vec<1, T, Q> const& scalar) + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator+(vec<3, T, Q> const& v1, vec<1, T, Q> const& v2) { - return vec<3, T, Q>( - v.x + scalar.x, - v.y + scalar.x, - v.z + scalar.x); + return vec<3, T, Q>(v1) += v2; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator+(T scalar, vec<3, T, Q> const& v) { - return vec<3, T, Q>( - scalar + v.x, - scalar + v.y, - scalar + v.z); + return vec<3, T, Q>(v) += scalar; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator+(vec<1, T, Q> const& scalar, vec<3, T, Q> const& v) { - return vec<3, T, Q>( - scalar.x + v.x, - scalar.x + v.y, - scalar.x + v.z); + return vec<3, T, Q>(scalar) += v; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator+(vec<3, T, Q> const& v1, vec<3, T, Q> const& v2) { - return vec<3, T, Q>( - v1.x + v2.x, - v1.y + v2.y, - v1.z + v2.z); + return vec<3, T, Q>(v1) += v2; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator-(vec<3, T, Q> const& v, T scalar) { - return vec<3, T, Q>( - v.x - scalar, - v.y - scalar, - v.z - scalar); + return vec<3, T, Q>(v) -= scalar; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator-(vec<3, T, Q> const& v, vec<1, T, Q> const& scalar) { - return vec<3, T, Q>( - v.x - scalar.x, - v.y - scalar.x, - v.z - scalar.x); + return vec<3, T, Q>(v) -= scalar.x; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator-(T scalar, vec<3, T, Q> const& v) { - return vec<3, T, Q>( - scalar - v.x, - scalar - v.y, - scalar - v.z); + return vec<3, T, Q>(scalar) -= v; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator-(vec<1, T, Q> const& scalar, vec<3, T, Q> const& v) { - return vec<3, T, Q>( - scalar.x - v.x, - scalar.x - v.y, - scalar.x - v.z); + return vec<3, T, Q>(scalar) -= v; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator-(vec<3, T, Q> const& v1, vec<3, T, Q> const& v2) { - return vec<3, T, Q>( - v1.x - v2.x, - v1.y - v2.y, - v1.z - v2.z); + return vec<3, T, Q>(v1) -= v2; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator*(vec<3, T, Q> const& v, T scalar) { - return vec<3, T, Q>( - v.x * scalar, - v.y * scalar, - v.z * scalar); + return vec<3, T, Q>(v) *= scalar; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator*(vec<3, T, Q> const& v, vec<1, T, Q> const& scalar) { - return vec<3, T, Q>( - v.x * scalar.x, - v.y * scalar.x, - v.z * scalar.x); + return vec<3, T, Q>(v) *= scalar.x; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator*(T scalar, vec<3, T, Q> const& v) { - return vec<3, T, Q>( - scalar * v.x, - scalar * v.y, - scalar * v.z); + return vec<3, T, Q>(v) *= scalar; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator*(vec<1, T, Q> const& scalar, vec<3, T, Q> const& v) { - return vec<3, T, Q>( - scalar.x * v.x, - scalar.x * v.y, - scalar.x * v.z); + return vec<3, T, Q>(v) *= scalar.x; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator*(vec<3, T, Q> const& v1, vec<3, T, Q> const& v2) { - return vec<3, T, Q>( - v1.x * v2.x, - v1.y * v2.y, - v1.z * v2.z); + return vec<3, T, Q>(v1) *= v2; + } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator/(vec<3, T, Q> const& v, T scalar) { - return vec<3, T, Q>( - v.x / scalar, - v.y / scalar, - v.z / scalar); + return vec<3, T, Q>(v) *= 1/scalar; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator/(vec<3, T, Q> const& v, vec<1, T, Q> const& scalar) { - return vec<3, T, Q>( - v.x / scalar.x, - v.y / scalar.x, - v.z / scalar.x); + return vec<3, T, Q>(v) *= 1/scalar.x; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator/(T scalar, vec<3, T, Q> const& v) { - return vec<3, T, Q>( - scalar / v.x, - scalar / v.y, - scalar / v.z); + return vec<3, T, Q>(scalar) /= v; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator/(vec<1, T, Q> const& scalar, vec<3, T, Q> const& v) { - return vec<3, T, Q>( - scalar.x / v.x, - scalar.x / v.y, - scalar.x / v.z); + return vec<3, T, Q>(scalar) /= v; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator/(vec<3, T, Q> const& v1, vec<3, T, Q> const& v2) { - return vec<3, T, Q>( - v1.x / v2.x, - v1.y / v2.y, - v1.z / v2.z); + return vec<3, T, Q>(v1) /= v2; } // -- Binary bit operators -- @@ -763,271 +613,181 @@ namespace glm template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator%(vec<3, T, Q> const& v, T scalar) { - return vec<3, T, Q>( - v.x % scalar, - v.y % scalar, - v.z % scalar); + return vec<3, T, Q>(v) %= scalar; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator%(vec<3, T, Q> const& v, vec<1, T, Q> const& scalar) { - return vec<3, T, Q>( - v.x % scalar.x, - v.y % scalar.x, - v.z % scalar.x); + return vec<3, T, Q>(v) %= scalar.x; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator%(T scalar, vec<3, T, Q> const& v) { - return vec<3, T, Q>( - scalar % v.x, - scalar % v.y, - scalar % v.z); + return vec<3, T, Q>(scalar) %= v; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator%(vec<1, T, Q> const& scalar, vec<3, T, Q> const& v) { - return vec<3, T, Q>( - scalar.x % v.x, - scalar.x % v.y, - scalar.x % v.z); + return vec<3, T, Q>(scalar.x) %= v; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator%(vec<3, T, Q> const& v1, vec<3, T, Q> const& v2) { - return vec<3, T, Q>( - v1.x % v2.x, - v1.y % v2.y, - v1.z % v2.z); + return vec<3, T, Q>(v1) %= v2; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator&(vec<3, T, Q> const& v, T scalar) { - return vec<3, T, Q>( - v.x & scalar, - v.y & scalar, - v.z & scalar); + return vec<3, T, Q>(v) &= scalar; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator&(vec<3, T, Q> const& v, vec<1, T, Q> const& scalar) { - return vec<3, T, Q>( - v.x & scalar.x, - v.y & scalar.x, - v.z & scalar.x); + return vec<3, T, Q>(v) &= scalar.x; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator&(T scalar, vec<3, T, Q> const& v) { - return vec<3, T, Q>( - scalar & v.x, - scalar & v.y, - scalar & v.z); + return vec<3, T, Q>(scalar) &= v; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator&(vec<1, T, Q> const& scalar, vec<3, T, Q> const& v) { - return vec<3, T, Q>( - scalar.x & v.x, - scalar.x & v.y, - scalar.x & v.z); + return vec<3, T, Q>(scalar.x) &= v; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator&(vec<3, T, Q> const& v1, vec<3, T, Q> const& v2) { - return vec<3, T, Q>( - v1.x & v2.x, - v1.y & v2.y, - v1.z & v2.z); + return vec<3, T, Q>(v1) &= v2; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator|(vec<3, T, Q> const& v, T scalar) { - return vec<3, T, Q>( - v.x | scalar, - v.y | scalar, - v.z | scalar); + return vec<3, T, Q>(v) |= scalar; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator|(vec<3, T, Q> const& v, vec<1, T, Q> const& scalar) { - return vec<3, T, Q>( - v.x | scalar.x, - v.y | scalar.x, - v.z | scalar.x); + return vec<3, T, Q>(v) |= scalar.x; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator|(T scalar, vec<3, T, Q> const& v) { - return vec<3, T, Q>( - scalar | v.x, - scalar | v.y, - scalar | v.z); + return vec<3, T, Q>(scalar) |= v; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator|(vec<1, T, Q> const& scalar, vec<3, T, Q> const& v) { - return vec<3, T, Q>( - scalar.x | v.x, - scalar.x | v.y, - scalar.x | v.z); + return vec<3, T, Q>(scalar.x) |= v; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator|(vec<3, T, Q> const& v1, vec<3, T, Q> const& v2) { - return vec<3, T, Q>( - v1.x | v2.x, - v1.y | v2.y, - v1.z | v2.z); + return vec<3, T, Q>(v1) |= v2; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator^(vec<3, T, Q> const& v, T scalar) { - return vec<3, T, Q>( - v.x ^ scalar, - v.y ^ scalar, - v.z ^ scalar); + return vec<3, T, Q>(v) ^= scalar; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator^(vec<3, T, Q> const& v, vec<1, T, Q> const& scalar) { - return vec<3, T, Q>( - v.x ^ scalar.x, - v.y ^ scalar.x, - v.z ^ scalar.x); + return vec<3, T, Q>(v) ^= scalar.x; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator^(T scalar, vec<3, T, Q> const& v) { - return vec<3, T, Q>( - scalar ^ v.x, - scalar ^ v.y, - scalar ^ v.z); + return vec<3, T, Q>(scalar) ^= v; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator^(vec<1, T, Q> const& scalar, vec<3, T, Q> const& v) { - return vec<3, T, Q>( - scalar.x ^ v.x, - scalar.x ^ v.y, - scalar.x ^ v.z); + return vec<3, T, Q>(scalar.x) ^= v; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator^(vec<3, T, Q> const& v1, vec<3, T, Q> const& v2) { - return vec<3, T, Q>( - v1.x ^ v2.x, - v1.y ^ v2.y, - v1.z ^ v2.z); + return vec<3, T, Q>(v1) ^= v2; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator<<(vec<3, T, Q> const& v, T scalar) { - return vec<3, T, Q>( - v.x << scalar, - v.y << scalar, - v.z << scalar); + return vec<3, T, Q>(v) <<= scalar; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator<<(vec<3, T, Q> const& v, vec<1, T, Q> const& scalar) { - return vec<3, T, Q>( - v.x << scalar.x, - v.y << scalar.x, - v.z << scalar.x); + return vec<3, T, Q>(v) <<= scalar.x; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator<<(T scalar, vec<3, T, Q> const& v) { - return vec<3, T, Q>( - scalar << v.x, - scalar << v.y, - scalar << v.z); + return vec<3, T, Q>(scalar) << v; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator<<(vec<1, T, Q> const& scalar, vec<3, T, Q> const& v) { - return vec<3, T, Q>( - scalar.x << v.x, - scalar.x << v.y, - scalar.x << v.z); + return vec<3, T, Q>(scalar.x) << v; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator<<(vec<3, T, Q> const& v1, vec<3, T, Q> const& v2) { - return vec<3, T, Q>( - v1.x << v2.x, - v1.y << v2.y, - v1.z << v2.z); + return vec<3, T, Q>(v1) <<= v2; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator>>(vec<3, T, Q> const& v, T scalar) { - return vec<3, T, Q>( - v.x >> scalar, - v.y >> scalar, - v.z >> scalar); + return vec<3, T, Q>(v) >>= scalar; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator>>(vec<3, T, Q> const& v, vec<1, T, Q> const& scalar) { - return vec<3, T, Q>( - v.x >> scalar.x, - v.y >> scalar.x, - v.z >> scalar.x); + return vec<3, T, Q>(v) >>= scalar.x; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator>>(T scalar, vec<3, T, Q> const& v) { - return vec<3, T, Q>( - scalar >> v.x, - scalar >> v.y, - scalar >> v.z); + return vec<3, T, Q>(scalar) >>= v; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator>>(vec<1, T, Q> const& scalar, vec<3, T, Q> const& v) { - return vec<3, T, Q>( - scalar.x >> v.x, - scalar.x >> v.y, - scalar.x >> v.z); + return vec<3, T, Q>(scalar.x) >>= v; } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, T, Q> operator>>(vec<3, T, Q> const& v1, vec<3, T, Q> const& v2) { - return vec<3, T, Q>( - v1.x >> v2.x, - v1.y >> v2.y, - v1.z >> v2.z); + return vec<3, T, Q>(v1) >>= v2; } template @@ -1068,3 +828,158 @@ namespace glm return vec<3, bool, Q>(v1.x || v2.x, v1.y || v2.y, v1.z || v2.z); } }//namespace glm + + +#if GLM_CONFIG_SIMD == GLM_ENABLE +# include "type_vec_simd.inl" + +namespace glm { + +#if (GLM_ARCH & GLM_ARCH_NEON_BIT) && !GLM_CONFIG_XYZW_ONLY + CTORSL(3, CTOR_FLOAT); + CTORSL(3, CTOR_INT); + CTORSL(3, CTOR_UINT); + CTORSL(3, CTOR_VECF_INT3); + CTORSL(3, CTOR_VECF_UINT3); + CTORSL(3, CTOR_VECF_VECF); + CTORSL(3, CTOR_VECF_VECI); + CTORSL(3, CTOR_VECF_VECU); +#endif + +#if GLM_ARCH & GLM_ARCH_SSE2_BIT + CTORSL(3, CTOR_FLOAT_COPY3); + CTORSL(3, CTOR_DOUBLE_COPY3); + CTORSL(3, CTOR_FLOAT); + CTORSL(3, CTOR_FLOAT3); + CTORSL(3, CTOR_DOUBLE3); + CTORSL(3, CTOR_INT); + CTORSL(3, CTOR_INT3); + CTORSL(3, CTOR_VECF_INT3); + + + template<> + template<> + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, double, aligned_highp>::vec(const vec<3, double, aligned_highp>& v) +#if (GLM_ARCH & GLM_ARCH_AVX_BIT) + :data(v.data) {} +#else + { + data.setv(0, v.data.getv(0)); + data.setv(1, v.data.getv(1)); + } +#endif + + + + + template<> + template<> + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, float, aligned_highp>::vec(const vec<3, float, aligned_highp>& v) : + data(v.data) {} + + template<> + template<> + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, float, aligned_highp>::vec(const vec<3, float, packed_highp>& v) + { + data = _mm_set_ps(v[2], v[2], v[1], v[0]); + } + + template<> + template<> + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, float, packed_highp>::vec(const vec<3, float, aligned_highp>& v) + { + _mm_store_sd(reinterpret_cast(this), _mm_castps_pd(v.data)); + __m128 mz = _mm_shuffle_ps(v.data, v.data, _MM_SHUFFLE(2, 2, 2, 2)); + _mm_store_ss(reinterpret_cast(this)+2, mz); + } + + template<> + template<> + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, double, aligned_highp>::vec(const vec<3, double, packed_highp>& v) + { +#if (GLM_ARCH & GLM_ARCH_AVX_BIT) + data = _mm256_set_pd(v[2], v[2], v[1], v[0]); +#else + data.setv(0, _mm_loadu_pd(reinterpret_cast(&v))); + data.setv(1, _mm_loadu_pd(reinterpret_cast(&v)+2)); +#endif + } + + template<> + template<> + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, double, packed_highp>::vec(const vec<3, double, aligned_highp>& v) + { +#if (GLM_ARCH & GLM_ARCH_AVX_BIT) + __m256d T1 = _mm256_permute_pd(v.data, 1); + _mm_store_sd((reinterpret_cast(this)) + 0, _mm256_castpd256_pd128(v.data)); + _mm_store_sd((reinterpret_cast(this)) + 1, _mm256_castpd256_pd128(T1)); + _mm_store_sd((reinterpret_cast(this)) + 2, _mm256_extractf128_pd(v.data, 1)); +#else + _mm_storeu_pd(reinterpret_cast(this), v.data.getv(0)); + _mm_store_sd((reinterpret_cast(this)) + 2, v.data.getv(1)); +#endif + } + + template<> + template<> + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, int, aligned_highp>::vec(const vec<3, int, aligned_highp>& v) : + data(v.data) + { + } + + template<> + template<> + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, int, aligned_highp>::vec(const vec<3, int, packed_highp>& v) + { + __m128 mx = _mm_load_ss(reinterpret_cast(&v[0])); + __m128 my = _mm_load_ss(reinterpret_cast(&v[1])); + __m128 mz = _mm_load_ss(reinterpret_cast(&v[2])); + __m128 mxy = _mm_unpacklo_ps(mx, my); + data = _mm_castps_si128(_mm_movelh_ps(mxy, mz)); + } + + template<> + template<> + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, int, packed_highp>::vec(const vec<3, int, aligned_highp>& v) + { + _mm_store_sd(reinterpret_cast(this), _mm_castsi128_pd(v.data)); + __m128 mz = _mm_shuffle_ps(_mm_castsi128_ps(v.data), _mm_castsi128_ps(v.data), _MM_SHUFFLE(2, 2, 2, 2)); + _mm_store_ss(reinterpret_cast(this)+2, mz); + } + + template<> + template<> + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, unsigned int, aligned_highp>::vec(const vec<3, unsigned int, aligned_highp>& v) : + data(v.data) + { + } + + template<> + template<> + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, unsigned int, aligned_highp>::vec(const vec<3, unsigned int, packed_highp>& v) + { + __m128 mx = _mm_load_ss(reinterpret_cast(&v[0])); + __m128 my = _mm_load_ss(reinterpret_cast(&v[1])); + __m128 mz = _mm_load_ss(reinterpret_cast(&v[2])); + __m128 mxy = _mm_unpacklo_ps(mx, my); + data = _mm_castps_si128(_mm_movelh_ps(mxy, mz)); + } + + template<> + template<> + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, unsigned int, packed_highp>::vec(const vec<3, unsigned int, aligned_highp>& v) + { + _mm_store_sd(reinterpret_cast(this), _mm_castsi128_pd(v.data)); + __m128 mz = _mm_shuffle_ps(_mm_castsi128_ps(v.data), _mm_castsi128_ps(v.data), _MM_SHUFFLE(2, 2, 2, 2)); + _mm_store_ss(reinterpret_cast(this) + 2, mz); + } + + CTORSL(3, CTOR_DOUBLE); + //CTORSL(3, CTOR_INT64); + +#endif //GLM_ARCH & GLM_ARCH_SSE2_BITt + + +} + +#endif diff --git a/glm/detail/type_vec4.hpp b/glm/detail/type_vec4.hpp index 601256c3..7055d6a7 100644 --- a/glm/detail/type_vec4.hpp +++ b/glm/detail/type_vec4.hpp @@ -21,6 +21,11 @@ namespace glm typedef T value_type; typedef vec<4, T, Q> type; typedef vec<4, bool, Q> bool_type; + + enum is_aligned + { + value = detail::is_aligned::value + }; // -- Data -- @@ -235,13 +240,13 @@ namespace glm } template - GLM_FUNC_DECL GLM_CONSTEXPR vec(detail::_swizzle<3, T, Q, E0, E1, E2, -1> const& v, T const& w) + GLM_FUNC_DECL GLM_CONSTEXPR vec(detail::_swizzle<3, T, Q, E0, E1, E2, 3> const& v, T const& w) { *this = vec<4, T, Q>(v(), w); } template - GLM_FUNC_DECL GLM_CONSTEXPR vec(T const& x, detail::_swizzle<3, T, Q, E0, E1, E2, -1> const& v) + GLM_FUNC_DECL GLM_CONSTEXPR vec(T const& x, detail::_swizzle<3, T, Q, E0, E1, E2, 3> const& v) { *this = vec<4, T, Q>(x, v()); } @@ -325,6 +330,7 @@ namespace glm GLM_FUNC_DECL GLM_CONSTEXPR vec<4, T, Q> & operator>>=(vec<4, U, Q> const& v); }; + // -- Unary operators -- template diff --git a/glm/detail/type_vec4.inl b/glm/detail/type_vec4.inl index d48fae75..c147ca09 100644 --- a/glm/detail/type_vec4.inl +++ b/glm/detail/type_vec4.inl @@ -1,130 +1,12 @@ /// @ref core #include "compute_vector_relational.hpp" +#include "compute_vector_decl.hpp" namespace glm{ namespace detail { - template - struct compute_vec4_add - { - GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) - { - return vec<4, T, Q>(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); - } - }; - template - struct compute_vec4_sub - { - GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) - { - return vec<4, T, Q>(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); - } - }; - - template - struct compute_vec4_mul - { - GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) - { - return vec<4, T, Q>(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); - } - }; - - template - struct compute_vec4_div - { - GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) - { - return vec<4, T, Q>(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); - } - }; - - template - struct compute_vec4_mod - { - GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) - { - return vec<4, T, Q>(a.x % b.x, a.y % b.y, a.z % b.z, a.w % b.w); - } - }; - - template - struct compute_vec4_and - { - GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) - { - return vec<4, T, Q>(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w); - } - }; - - template - struct compute_vec4_or - { - GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) - { - return vec<4, T, Q>(a.x | b.x, a.y | b.y, a.z | b.z, a.w | b.w); - } - }; - - template - struct compute_vec4_xor - { - GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) - { - return vec<4, T, Q>(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); - } - }; - - template - struct compute_vec4_shift_left - { - GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) - { - return vec<4, T, Q>(a.x << b.x, a.y << b.y, a.z << b.z, a.w << b.w); - } - }; - - template - struct compute_vec4_shift_right - { - GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) - { - return vec<4, T, Q>(a.x >> b.x, a.y >> b.y, a.z >> b.z, a.w >> b.w); - } - }; - - template - struct compute_vec4_equal - { - GLM_FUNC_QUALIFIER GLM_CONSTEXPR static bool call(vec<4, T, Q> const& v1, vec<4, T, Q> const& v2) - { - return - detail::compute_equal::is_iec559>::call(v1.x, v2.x) && - detail::compute_equal::is_iec559>::call(v1.y, v2.y) && - detail::compute_equal::is_iec559>::call(v1.z, v2.z) && - detail::compute_equal::is_iec559>::call(v1.w, v2.w); - } - }; - - template - struct compute_vec4_nequal - { - GLM_FUNC_QUALIFIER GLM_CONSTEXPR static bool call(vec<4, T, Q> const& v1, vec<4, T, Q> const& v2) - { - return !compute_vec4_equal::value, sizeof(T) * 8, detail::is_aligned::value>::call(v1, v2); - } - }; - - template - struct compute_vec4_bitwise_not - { - GLM_FUNC_QUALIFIER GLM_CONSTEXPR static vec<4, T, Q> call(vec<4, T, Q> const& v) - { - return vec<4, T, Q>(~v.x, ~v.y, ~v.z, ~v.w); - } - }; }//namespace detail // -- Implicit basic constructors -- @@ -158,7 +40,7 @@ namespace detail : x(scalar), y(scalar), z(scalar), w(scalar) {} - template + template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q>::vec(T _x, T _y, T _z, T _w) : x(_x), y(_y), z(_z), w(_w) {} @@ -473,13 +355,19 @@ namespace detail , w(static_cast(v.w)) {} + // -- Component accesses -- template GLM_FUNC_QUALIFIER GLM_CONSTEXPR T& vec<4, T, Q>::operator[](typename vec<4, T, Q>::length_type i) { +<<<<<<< HEAD GLM_ASSERT_LENGTH(i, this->length()); switch(i) +======= + assert(i >= 0 && i < this->length()); + switch (i) +>>>>>>> 5c1da05b (Simd improvement) { default: case 0: @@ -496,8 +384,13 @@ namespace detail template GLM_FUNC_QUALIFIER GLM_CONSTEXPR T const& vec<4, T, Q>::operator[](typename vec<4, T, Q>::length_type i) const { +<<<<<<< HEAD GLM_ASSERT_LENGTH(i, this->length()); switch(i) +======= + assert(i >= 0 && i < this->length()); + switch (i) +>>>>>>> 5c1da05b (Simd improvement) { default: case 0: @@ -540,84 +433,84 @@ namespace detail template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator+=(U scalar) { - return (*this = detail::compute_vec4_add::value>::call(*this, vec<4, T, Q>(scalar))); + return (*this = detail::compute_vec_add<4, T, Q, detail::is_aligned::value>::call(*this, vec<4, T, Q>(scalar))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator+=(vec<1, U, Q> const& v) { - return (*this = detail::compute_vec4_add::value>::call(*this, vec<4, T, Q>(v.x))); + return (*this = detail::compute_vec_add<4, T, Q, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v.x))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator+=(vec<4, U, Q> const& v) { - return (*this = detail::compute_vec4_add::value>::call(*this, vec<4, T, Q>(v))); + return (*this = detail::compute_vec_add<4, T, Q, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator-=(U scalar) { - return (*this = detail::compute_vec4_sub::value>::call(*this, vec<4, T, Q>(scalar))); + return (*this = detail::compute_vec_sub<4, T, Q, detail::is_aligned::value>::call(*this, vec<4, T, Q>(scalar))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator-=(vec<1, U, Q> const& v) { - return (*this = detail::compute_vec4_sub::value>::call(*this, vec<4, T, Q>(v.x))); + return (*this = detail::compute_vec_sub<4, T, Q, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v.x))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator-=(vec<4, U, Q> const& v) { - return (*this = detail::compute_vec4_sub::value>::call(*this, vec<4, T, Q>(v))); + return (*this = detail::compute_vec_sub<4, T, Q, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator*=(U scalar) { - return (*this = detail::compute_vec4_mul::value>::call(*this, vec<4, T, Q>(scalar))); + return (*this = detail::compute_vec_mul<4,T, Q, detail::is_aligned::value>::call(*this, vec<4, T, Q>(scalar))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator*=(vec<1, U, Q> const& v) { - return (*this = detail::compute_vec4_mul::value>::call(*this, vec<4, T, Q>(v.x))); + return (*this = detail::compute_vec_mul<4,T, Q, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v.x))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator*=(vec<4, U, Q> const& v) { - return (*this = detail::compute_vec4_mul::value>::call(*this, vec<4, T, Q>(v))); + return (*this = detail::compute_vec_mul<4,T, Q, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator/=(U scalar) { - return (*this = detail::compute_vec4_div::value>::call(*this, vec<4, T, Q>(scalar))); + return (*this = detail::compute_vec_div<4, T, Q, detail::is_aligned::value>::call(*this, vec<4, T, Q>(scalar))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator/=(vec<1, U, Q> const& v) { - return (*this = detail::compute_vec4_div::value>::call(*this, vec<4, T, Q>(v.x))); + return (*this = detail::compute_vec_div<4, T, Q, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v.x))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator/=(vec<4, U, Q> const& v) { - return (*this = detail::compute_vec4_div::value>::call(*this, vec<4, T, Q>(v))); + return (*this = detail::compute_vec_div<4, T, Q, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v))); } // -- Increment and decrement operators -- @@ -664,126 +557,126 @@ namespace detail template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator%=(U scalar) { - return (*this = detail::compute_vec4_mod::value>::call(*this, vec<4, T, Q>(scalar))); + return (*this = detail::compute_vec_mod<4, T, Q, detail::is_aligned::value>::call(*this, vec<4, T, Q>(scalar))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator%=(vec<1, U, Q> const& v) { - return (*this = detail::compute_vec4_mod::value>::call(*this, vec<4, T, Q>(v))); + return (*this = detail::compute_vec_mod<3, T, Q, detail::is_aligned::value>::call(*this, vec<3, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator%=(vec<4, U, Q> const& v) { - return (*this = detail::compute_vec4_mod::value>::call(*this, vec<4, T, Q>(v))); + return (*this = detail::compute_vec_mod<4, T, Q, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator&=(U scalar) { - return (*this = detail::compute_vec4_and::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(scalar))); + return (*this = detail::compute_vec_and<4, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(scalar))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator&=(vec<1, U, Q> const& v) { - return (*this = detail::compute_vec4_and::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v))); + return (*this = detail::compute_vec_and<4, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator&=(vec<4, U, Q> const& v) { - return (*this = detail::compute_vec4_and::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v))); + return (*this = detail::compute_vec_and<4, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator|=(U scalar) { - return (*this = detail::compute_vec4_or::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(scalar))); + return (*this = detail::compute_vec_or<4, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(scalar))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator|=(vec<1, U, Q> const& v) { - return (*this = detail::compute_vec4_or::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v))); + return (*this = detail::compute_vec_or<4, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator|=(vec<4, U, Q> const& v) { - return (*this = detail::compute_vec4_or::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v))); + return (*this = detail::compute_vec_or<4, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator^=(U scalar) { - return (*this = detail::compute_vec4_xor::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(scalar))); + return (*this = detail::compute_vec_xor<4, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(scalar))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator^=(vec<1, U, Q> const& v) { - return (*this = detail::compute_vec4_xor::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v))); + return (*this = detail::compute_vec_xor<4, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator^=(vec<4, U, Q> const& v) { - return (*this = detail::compute_vec4_xor::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v))); + return (*this = detail::compute_vec_xor<4, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator<<=(U scalar) { - return (*this = detail::compute_vec4_shift_left::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(scalar))); + return (*this = detail::compute_vec_shift_left<4, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(scalar))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator<<=(vec<1, U, Q> const& v) { - return (*this = detail::compute_vec4_shift_left::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v))); + return (*this = detail::compute_vec_shift_left<4, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator<<=(vec<4, U, Q> const& v) { - return (*this = detail::compute_vec4_shift_left::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v))); + return (*this = detail::compute_vec_shift_left<4, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator>>=(U scalar) { - return (*this = detail::compute_vec4_shift_right::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(scalar))); + return (*this = detail::compute_vec_shift_right<4, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(scalar))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator>>=(vec<1, U, Q> const& v) { - return (*this = detail::compute_vec4_shift_right::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v))); + return (*this = detail::compute_vec_shift_right<4, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v))); } template template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> & vec<4, T, Q>::operator>>=(vec<4, U, Q> const& v) { - return (*this = detail::compute_vec4_shift_right::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v))); + return (*this = detail::compute_vec_shift_right<4, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(*this, vec<4, T, Q>(v))); } // -- Unary constant operators -- @@ -1107,7 +1000,7 @@ namespace detail template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, T, Q> operator~(vec<4, T, Q> const& v) { - return detail::compute_vec4_bitwise_not::value, sizeof(T) * 8, detail::is_aligned::value>::call(v); + return detail::compute_vec_bitwise_not<4, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(v); } // -- Boolean operators -- @@ -1115,13 +1008,13 @@ namespace detail template GLM_FUNC_QUALIFIER GLM_CONSTEXPR bool operator==(vec<4, T, Q> const& v1, vec<4, T, Q> const& v2) { - return detail::compute_vec4_equal::value, sizeof(T) * 8, detail::is_aligned::value>::call(v1, v2); + return detail::compute_vec_equal<4, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(v1, v2); } template GLM_FUNC_QUALIFIER GLM_CONSTEXPR bool operator!=(vec<4, T, Q> const& v1, vec<4, T, Q> const& v2) { - return detail::compute_vec4_nequal::value, sizeof(T) * 8, detail::is_aligned::value>::call(v1, v2); + return detail::compute_vec_nequal<4, T, Q, detail::is_int::value, sizeof(T) * 8, detail::is_aligned::value>::call(v1, v2); } template @@ -1138,5 +1031,110 @@ namespace detail }//namespace glm #if GLM_CONFIG_SIMD == GLM_ENABLE -# include "type_vec4_simd.inl" +# include "type_vec_simd.inl" + +namespace glm { +#if GLM_ARCH & GLM_ARCH_NEON_BIT && !GLM_CONFIG_XYZW_ONLY + CTORSL(4, CTOR_FLOAT); + CTORSL(4, CTOR_INT); + CTORSL(4, CTOR_UINT); + CTORSL(4, CTOR_VECF_INT4); + CTORSL(4, CTOR_VECF_UINT4); + CTORSL(4, CTOR_VECF_VECF); + CTORSL(4, CTOR_VECF_VECI); + CTORSL(4, CTOR_VECF_VECU); + + +#endif// GLM_ARCH & GLM_ARCH_NEON_BIT + +#if GLM_ARCH & GLM_ARCH_SSE2_BIT + CTORSL(4, CTOR_FLOAT); + CTORSL(4, CTOR_DOUBLE); + CTORSL(4, CTOR_FLOAT4); + CTORSL(4, CTOR_DOUBLE4); + CTORSL(4, CTOR_INT); + CTORSL(4, CTOR_INT4); + CTORSL(4, CTOR_VECF_INT4); + + template<> + template<> + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(const vec<4, float, aligned_highp>& v): + data(v.data) + { + } + + template<> + template<> + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(const vec<4, float, packed_highp>& v) + { + data = _mm_loadu_ps(reinterpret_cast(&v)); + } + + template<> + template<> + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, packed_highp>::vec(const vec<4, float, aligned_highp>& v) + { + _mm_storeu_ps(reinterpret_cast(this), v.data); + } + + template<> + template<> + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, aligned_highp>::vec(const vec<4, int, aligned_highp>& v) : + data(v.data) + { + } + + template<> + template<> + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, aligned_highp>::vec(const vec<4, int, packed_highp>& v) + { + data = _mm_loadu_si128(reinterpret_cast(&v)); + } + + template<> + template<> + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, packed_highp>::vec(const vec<4, int, aligned_highp>& v) + { + _mm_storeu_si128(reinterpret_cast<__m128i*>(this), v.data); + } + + template<> + template<> + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, double, aligned_highp>::vec(const vec<4, double, aligned_highp>& v) + { +# if (GLM_ARCH & GLM_ARCH_AVX_BIT) + data = v.data; +#else + data.setv(0, v.data.getv(0)); + data.setv(1, v.data.getv(1)); +#endif + } + + template<> + template<> + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, double, aligned_highp>::vec(const vec<4, double, packed_highp>& v) + { +# if (GLM_ARCH & GLM_ARCH_AVX_BIT) + data = _mm256_loadu_pd(reinterpret_cast(&v)); +#else + data.setv(0, _mm_loadu_pd(reinterpret_cast(&v))); + data.setv(1, _mm_loadu_pd(reinterpret_cast(&v)+2)); +#endif + } + + template<> + template<> + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, double, packed_highp>::vec(const vec<4, double, aligned_highp>& v) + { +# if (GLM_ARCH & GLM_ARCH_AVX_BIT) + _mm256_storeu_pd(reinterpret_cast(this), v.data); +#else + _mm_storeu_pd(reinterpret_cast(this), v.data.getv(0)); + _mm_storeu_pd(reinterpret_cast(this) + 2, v.data.getv(1)); +#endif + } + +#endif//GLM_ARCH & GLM_ARCH_SSE2_BIT +} + #endif diff --git a/glm/detail/type_vec4_simd.inl b/glm/detail/type_vec4_simd.inl deleted file mode 100644 index 816ef45b..00000000 --- a/glm/detail/type_vec4_simd.inl +++ /dev/null @@ -1,788 +0,0 @@ -#if GLM_ARCH & GLM_ARCH_SSE2_BIT - -namespace glm { - namespace detail - { -# if GLM_CONFIG_SWIZZLE == GLM_SWIZZLE_OPERATOR - template - struct _swizzle_base1<4, float, Q, E0, E1, E2, E3, true> : public _swizzle_base0 - { - GLM_FUNC_QUALIFIER vec<4, float, Q> operator ()() const - { - __m128 data = *reinterpret_cast<__m128 const*>(&this->_buffer); - - vec<4, float, Q> Result; -# if GLM_ARCH & GLM_ARCH_AVX_BIT - Result.data = _mm_permute_ps(data, _MM_SHUFFLE(E3, E2, E1, E0)); -# else - Result.data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(E3, E2, E1, E0)); -# endif - return Result; - } - }; - - template - struct _swizzle_base1<4, int, Q, E0, E1, E2, E3, true> : public _swizzle_base0 - { - GLM_FUNC_QUALIFIER vec<4, int, Q> operator ()() const - { - __m128i data = *reinterpret_cast<__m128i const*>(&this->_buffer); - - vec<4, int, Q> Result; - Result.data = _mm_shuffle_epi32(data, _MM_SHUFFLE(E3, E2, E1, E0)); - return Result; - } - }; - - template - struct _swizzle_base1<4, uint, Q, E0, E1, E2, E3, true> : public _swizzle_base0 - { - GLM_FUNC_QUALIFIER vec<4, uint, Q> operator ()() const - { - __m128i data = *reinterpret_cast<__m128i const*>(&this->_buffer); - - vec<4, uint, Q> Result; - Result.data = _mm_shuffle_epi32(data, _MM_SHUFFLE(E3, E2, E1, E0)); - return Result; - } - }; -# endif// GLM_CONFIG_SWIZZLE == GLM_SWIZZLE_OPERATOR - - template - struct compute_vec4_add - { - static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) - { - vec<4, float, Q> Result; - Result.data = _mm_add_ps(a.data, b.data); - return Result; - } - }; - -# if GLM_ARCH & GLM_ARCH_AVX_BIT - template - struct compute_vec4_add - { - static vec<4, double, Q> call(vec<4, double, Q> const& a, vec<4, double, Q> const& b) - { - vec<4, double, Q> Result; - Result.data = _mm256_add_pd(a.data, b.data); - return Result; - } - }; -# endif - - template - struct compute_vec4_sub - { - static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) - { - vec<4, float, Q> Result; - Result.data = _mm_sub_ps(a.data, b.data); - return Result; - } - }; - -# if GLM_ARCH & GLM_ARCH_AVX_BIT - template - struct compute_vec4_sub - { - static vec<4, double, Q> call(vec<4, double, Q> const& a, vec<4, double, Q> const& b) - { - vec<4, double, Q> Result; - Result.data = _mm256_sub_pd(a.data, b.data); - return Result; - } - }; -# endif - - template - struct compute_vec4_mul - { - static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) - { - vec<4, float, Q> Result; - Result.data = _mm_mul_ps(a.data, b.data); - return Result; - } - }; - -# if GLM_ARCH & GLM_ARCH_AVX_BIT - template - struct compute_vec4_mul - { - static vec<4, double, Q> call(vec<4, double, Q> const& a, vec<4, double, Q> const& b) - { - vec<4, double, Q> Result; - Result.data = _mm256_mul_pd(a.data, b.data); - return Result; - } - }; -# endif - - template - struct compute_vec4_div - { - static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) - { - vec<4, float, Q> Result; - Result.data = _mm_div_ps(a.data, b.data); - return Result; - } - }; - -# if GLM_ARCH & GLM_ARCH_AVX_BIT - template - struct compute_vec4_div - { - static vec<4, double, Q> call(vec<4, double, Q> const& a, vec<4, double, Q> const& b) - { - vec<4, double, Q> Result; - Result.data = _mm256_div_pd(a.data, b.data); - return Result; - } - }; -# endif - - template<> - struct compute_vec4_div - { - static vec<4, float, aligned_lowp> call(vec<4, float, aligned_lowp> const& a, vec<4, float, aligned_lowp> const& b) - { - vec<4, float, aligned_lowp> Result; - Result.data = _mm_mul_ps(a.data, _mm_rcp_ps(b.data)); - return Result; - } - }; - - template - struct compute_vec4_and - { - static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) - { - vec<4, T, Q> Result; - Result.data = _mm_and_si128(a.data, b.data); - return Result; - } - }; - -# if GLM_ARCH & GLM_ARCH_AVX2_BIT - template - struct compute_vec4_and - { - static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) - { - vec<4, T, Q> Result; - Result.data = _mm256_and_si256(a.data, b.data); - return Result; - } - }; -# endif - - template - struct compute_vec4_or - { - static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) - { - vec<4, T, Q> Result; - Result.data = _mm_or_si128(a.data, b.data); - return Result; - } - }; - -# if GLM_ARCH & GLM_ARCH_AVX2_BIT - template - struct compute_vec4_or - { - static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) - { - vec<4, T, Q> Result; - Result.data = _mm256_or_si256(a.data, b.data); - return Result; - } - }; -# endif - - template - struct compute_vec4_xor - { - static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) - { - vec<4, T, Q> Result; - Result.data = _mm_xor_si128(a.data, b.data); - return Result; - } - }; - -# if GLM_ARCH & GLM_ARCH_AVX2_BIT - template - struct compute_vec4_xor - { - static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) - { - vec<4, T, Q> Result; - Result.data = _mm256_xor_si256(a.data, b.data); - return Result; - } - }; -# endif - - template - struct compute_vec4_shift_left - { - static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) - { - vec<4, T, Q> Result; - Result.data = _mm_sll_epi32(a.data, b.data); - return Result; - } - }; - -# if GLM_ARCH & GLM_ARCH_AVX2_BIT - template - struct compute_vec4_shift_left - { - static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) - { - vec<4, T, Q> Result; - Result.data = _mm256_sll_epi64(a.data, b.data); - return Result; - } - }; -# endif - - template - struct compute_vec4_shift_right - { - static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) - { - vec<4, T, Q> Result; - Result.data = _mm_srl_epi32(a.data, b.data); - return Result; - } - }; - -# if GLM_ARCH & GLM_ARCH_AVX2_BIT - template - struct compute_vec4_shift_right - { - static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) - { - vec<4, T, Q> Result; - Result.data = _mm256_srl_epi64(a.data, b.data); - return Result; - } - }; -# endif - - template - struct compute_vec4_bitwise_not - { - static vec<4, T, Q> call(vec<4, T, Q> const& v) - { - vec<4, T, Q> Result; - Result.data = _mm_xor_si128(v.data, _mm_set1_epi32(-1)); - return Result; - } - }; - -# if GLM_ARCH & GLM_ARCH_AVX2_BIT - template - struct compute_vec4_bitwise_not - { - static vec<4, T, Q> call(vec<4, T, Q> const& v) - { - vec<4, T, Q> Result; - Result.data = _mm256_xor_si256(v.data, _mm_set1_epi32(-1)); - return Result; - } - }; -# endif - - template - struct compute_vec4_equal - { - static bool call(vec<4, float, Q> const& v1, vec<4, float, Q> const& v2) - { - return _mm_movemask_ps(_mm_cmpneq_ps(v1.data, v2.data)) == 0; - } - }; - -# if GLM_ARCH & GLM_ARCH_SSE41_BIT - template - struct compute_vec4_equal - { - static bool call(vec<4, int, Q> const& v1, vec<4, int, Q> const& v2) - { - //return _mm_movemask_epi8(_mm_cmpeq_epi32(v1.data, v2.data)) != 0; - __m128i neq = _mm_xor_si128(v1.data, v2.data); - return _mm_test_all_zeros(neq, neq) == 0; - } - }; -# endif - - template - struct compute_vec4_nequal - { - static bool call(vec<4, float, Q> const& v1, vec<4, float, Q> const& v2) - { - return _mm_movemask_ps(_mm_cmpneq_ps(v1.data, v2.data)) != 0; - } - }; - -# if GLM_ARCH & GLM_ARCH_SSE41_BIT - template - struct compute_vec4_nequal - { - static bool call(vec<4, int, Q> const& v1, vec<4, int, Q> const& v2) - { - //return _mm_movemask_epi8(_mm_cmpneq_epi32(v1.data, v2.data)) != 0; - __m128i neq = _mm_xor_si128(v1.data, v2.data); - return _mm_test_all_zeros(neq, neq) != 0; - } - }; -# endif - }//namespace detail - - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_lowp>::vec(float _s) : - data(_mm_set1_ps(_s)) - {} - - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_mediump>::vec(float _s) : - data(_mm_set1_ps(_s)) - {} - - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(float _s) : - data(_mm_set1_ps(_s)) - {} - -# if GLM_ARCH & GLM_ARCH_AVX_BIT - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, double, aligned_lowp>::vec(double _s) : - data(_mm256_set1_pd(_s)) - {} - - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, double, aligned_mediump>::vec(double _s) : - data(_mm256_set1_pd(_s)) - {} - - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, double, aligned_highp>::vec(double _s) : - data(_mm256_set1_pd(_s)) - {} -# endif - - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, aligned_lowp>::vec(int _s) : - data(_mm_set1_epi32(_s)) - {} - - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, aligned_mediump>::vec(int _s) : - data(_mm_set1_epi32(_s)) - {} - - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, aligned_highp>::vec(int _s) : - data(_mm_set1_epi32(_s)) - {} - -# if GLM_ARCH & GLM_ARCH_AVX2_BIT - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, detail::int64, aligned_lowp>::vec(detail::int64 _s) : - data(_mm256_set1_epi64x(_s)) - {} - - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, detail::int64, aligned_mediump>::vec(detail::int64 _s) : - data(_mm256_set1_epi64x(_s)) - {} - - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, detail::int64, aligned_highp>::vec(detail::int64 _s) : - data(_mm256_set1_epi64x(_s)) - {} -# endif - - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_lowp>::vec(float _x, float _y, float _z, float _w) : - data(_mm_set_ps(_w, _z, _y, _x)) - {} - - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_mediump>::vec(float _x, float _y, float _z, float _w) : - data(_mm_set_ps(_w, _z, _y, _x)) - {} - - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(float _x, float _y, float _z, float _w) : - data(_mm_set_ps(_w, _z, _y, _x)) - {} - - template<> - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, aligned_lowp>::vec(int _x, int _y, int _z, int _w) : - data(_mm_set_epi32(_w, _z, _y, _x)) - {} - - template<> - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, aligned_mediump>::vec(int _x, int _y, int _z, int _w) : - data(_mm_set_epi32(_w, _z, _y, _x)) - {} - - template<> - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, aligned_highp>::vec(int _x, int _y, int _z, int _w) : - data(_mm_set_epi32(_w, _z, _y, _x)) - {} - - template<> - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_lowp>::vec(int _x, int _y, int _z, int _w) : - data(_mm_cvtepi32_ps(_mm_set_epi32(_w, _z, _y, _x))) - {} - - template<> - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_mediump>::vec(int _x, int _y, int _z, int _w) : - data(_mm_cvtepi32_ps(_mm_set_epi32(_w, _z, _y, _x))) - {} - - template<> - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(int _x, int _y, int _z, int _w) : - data(_mm_cvtepi32_ps(_mm_set_epi32(_w, _z, _y, _x))) - {} -}//namespace glm - -#endif//GLM_ARCH & GLM_ARCH_SSE2_BIT - -#if GLM_ARCH & GLM_ARCH_NEON_BIT -namespace glm { - namespace detail { - - template - struct compute_vec4_add - { - static - vec<4, float, Q> - call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) - { - vec<4, float, Q> Result; - Result.data = vaddq_f32(a.data, b.data); - return Result; - } - }; - - template - struct compute_vec4_add - { - static - vec<4, uint, Q> - call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b) - { - vec<4, uint, Q> Result; - Result.data = vaddq_u32(a.data, b.data); - return Result; - } - }; - - template - struct compute_vec4_add - { - static - vec<4, int, Q> - call(vec<4, int, Q> const& a, vec<4, int, Q> const& b) - { - vec<4, int, Q> Result; - Result.data = vaddq_s32(a.data, b.data); - return Result; - } - }; - - template - struct compute_vec4_sub - { - static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) - { - vec<4, float, Q> Result; - Result.data = vsubq_f32(a.data, b.data); - return Result; - } - }; - - template - struct compute_vec4_sub - { - static vec<4, uint, Q> call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b) - { - vec<4, uint, Q> Result; - Result.data = vsubq_u32(a.data, b.data); - return Result; - } - }; - - template - struct compute_vec4_sub - { - static vec<4, int, Q> call(vec<4, int, Q> const& a, vec<4, int, Q> const& b) - { - vec<4, int, Q> Result; - Result.data = vsubq_s32(a.data, b.data); - return Result; - } - }; - - template - struct compute_vec4_mul - { - static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) - { - vec<4, float, Q> Result; - Result.data = vmulq_f32(a.data, b.data); - return Result; - } - }; - - template - struct compute_vec4_mul - { - static vec<4, uint, Q> call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b) - { - vec<4, uint, Q> Result; - Result.data = vmulq_u32(a.data, b.data); - return Result; - } - }; - - template - struct compute_vec4_mul - { - static vec<4, int, Q> call(vec<4, int, Q> const& a, vec<4, int, Q> const& b) - { - vec<4, int, Q> Result; - Result.data = vmulq_s32(a.data, b.data); - return Result; - } - }; - - template - struct compute_vec4_div - { - static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) - { - vec<4, float, Q> Result; -#if GLM_ARCH & GLM_ARCH_ARMV8_BIT - Result.data = vdivq_f32(a.data, b.data); -#else - /* Arm assembler reference: - * - * The Newton-Raphson iteration: x[n+1] = x[n] * (2 - d * x[n]) - * converges to (1/d) if x0 is the result of VRECPE applied to d. - * - * Note: The precision usually improves with two interactions, but more than two iterations are not helpful. */ - float32x4_t x = vrecpeq_f32(b.data); - x = vmulq_f32(vrecpsq_f32(b.data, x), x); - x = vmulq_f32(vrecpsq_f32(b.data, x), x); - Result.data = vmulq_f32(a.data, x); -#endif - return Result; - } - }; - - template - struct compute_vec4_equal - { - static bool call(vec<4, float, Q> const& v1, vec<4, float, Q> const& v2) - { - uint32x4_t cmp = vceqq_f32(v1.data, v2.data); -#if GLM_ARCH & GLM_ARCH_ARMV8_BIT - cmp = vpminq_u32(cmp, cmp); - cmp = vpminq_u32(cmp, cmp); - uint32_t r = cmp[0]; -#else - uint32x2_t cmpx2 = vpmin_u32(vget_low_u32(cmp), vget_high_u32(cmp)); - cmpx2 = vpmin_u32(cmpx2, cmpx2); - uint32_t r = cmpx2[0]; -#endif - return r == ~0u; - } - }; - - template - struct compute_vec4_equal - { - static bool call(vec<4, uint, Q> const& v1, vec<4, uint, Q> const& v2) - { - uint32x4_t cmp = vceqq_u32(v1.data, v2.data); -#if GLM_ARCH & GLM_ARCH_ARMV8_BIT - cmp = vpminq_u32(cmp, cmp); - cmp = vpminq_u32(cmp, cmp); - uint32_t r = cmp[0]; -#else - uint32x2_t cmpx2 = vpmin_u32(vget_low_u32(cmp), vget_high_u32(cmp)); - cmpx2 = vpmin_u32(cmpx2, cmpx2); - uint32_t r = cmpx2[0]; -#endif - return r == ~0u; - } - }; - - template - struct compute_vec4_equal - { - static bool call(vec<4, int, Q> const& v1, vec<4, int, Q> const& v2) - { - uint32x4_t cmp = vceqq_s32(v1.data, v2.data); -#if GLM_ARCH & GLM_ARCH_ARMV8_BIT - cmp = vpminq_u32(cmp, cmp); - cmp = vpminq_u32(cmp, cmp); - uint32_t r = cmp[0]; -#else - uint32x2_t cmpx2 = vpmin_u32(vget_low_u32(cmp), vget_high_u32(cmp)); - cmpx2 = vpmin_u32(cmpx2, cmpx2); - uint32_t r = cmpx2[0]; -#endif - return r == ~0u; - } - }; - - template - struct compute_vec4_nequal - { - static bool call(vec<4, float, Q> const& v1, vec<4, float, Q> const& v2) - { - return !compute_vec4_equal::call(v1, v2); - } - }; - - template - struct compute_vec4_nequal - { - static bool call(vec<4, uint, Q> const& v1, vec<4, uint, Q> const& v2) - { - return !compute_vec4_equal::call(v1, v2); - } - }; - - template - struct compute_vec4_nequal - { - static bool call(vec<4, int, Q> const& v1, vec<4, int, Q> const& v2) - { - return !compute_vec4_equal::call(v1, v2); - } - }; - - }//namespace detail - -#if !GLM_CONFIG_XYZW_ONLY - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_lowp>::vec(float _s) : - data(vdupq_n_f32(_s)) - {} - - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_mediump>::vec(float _s) : - data(vdupq_n_f32(_s)) - {} - - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(float _s) : - data(vdupq_n_f32(_s)) - {} - - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, aligned_lowp>::vec(int _s) : - data(vdupq_n_s32(_s)) - {} - - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, aligned_mediump>::vec(int _s) : - data(vdupq_n_s32(_s)) - {} - - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, aligned_highp>::vec(int _s) : - data(vdupq_n_s32(_s)) - {} - - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, uint, aligned_lowp>::vec(uint _s) : - data(vdupq_n_u32(_s)) - {} - - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, uint, aligned_mediump>::vec(uint _s) : - data(vdupq_n_u32(_s)) - {} - - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, uint, aligned_highp>::vec(uint _s) : - data(vdupq_n_u32(_s)) - {} - - template<> - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(const vec<4, float, aligned_highp>& rhs) : - data(rhs.data) - {} - - template<> - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(const vec<4, int, aligned_highp>& rhs) : - data(vcvtq_f32_s32(rhs.data)) - {} - - template<> - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(const vec<4, uint, aligned_highp>& rhs) : - data(vcvtq_f32_u32(rhs.data)) - {} - - template<> - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_lowp>::vec(int _x, int _y, int _z, int _w) : - data(vcvtq_f32_s32(vec<4, int, aligned_lowp>(_x, _y, _z, _w).data)) - {} - - template<> - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_mediump>::vec(int _x, int _y, int _z, int _w) : - data(vcvtq_f32_s32(vec<4, int, aligned_mediump>(_x, _y, _z, _w).data)) - {} - - template<> - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(int _x, int _y, int _z, int _w) : - data(vcvtq_f32_s32(vec<4, int, aligned_highp>(_x, _y, _z, _w).data)) - {} - - template<> - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_lowp>::vec(uint _x, uint _y, uint _z, uint _w) : - data(vcvtq_f32_u32(vec<4, uint, aligned_lowp>(_x, _y, _z, _w).data)) - {} - - template<> - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_mediump>::vec(uint _x, uint _y, uint _z, uint _w) : - data(vcvtq_f32_u32(vec<4, uint, aligned_mediump>(_x, _y, _z, _w).data)) - {} - - - template<> - template<> - GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(uint _x, uint _y, uint _z, uint _w) : - data(vcvtq_f32_u32(vec<4, uint, aligned_highp>(_x, _y, _z, _w).data)) - {} - -#endif -}//namespace glm - -#endif diff --git a/glm/detail/type_vec_simd.inl b/glm/detail/type_vec_simd.inl new file mode 100644 index 00000000..566abec3 --- /dev/null +++ b/glm/detail/type_vec_simd.inl @@ -0,0 +1,1032 @@ +#pragma once + +#define CTORSL(L, CTOR)\ +CTOR(L, aligned_lowp)\ +CTOR(L, aligned_mediump)\ +CTOR(L, aligned_highp)\ + +namespace glm { + namespace detail + { + +template +struct compute_vec_and : public compute_vec_and +{}; + +template +struct compute_vec_or: public compute_vec_or +{}; + +template +struct compute_vec_xor : public compute_vec_xor +{}; + +template +struct compute_vec_shift_left : public compute_vec_shift_left +{}; + +template +struct compute_vec_shift_right : public compute_vec_shift_right +{}; + +template +struct compute_vec_bitwise_not:public compute_vec_bitwise_not +{}; + +template +struct compute_vec_equal : public compute_vec_equal +{}; + +template +struct compute_vec_nequal : public compute_vec_nequal +{}; + +template +struct compute_vec_mod : public compute_vec_mod +{}; + + +template +struct compute_vec_add : public compute_vec_add +{}; + +template< length_t L, typename T, qualifier Q> +struct compute_vec_sub : public compute_vec_sub +{}; + +template< length_t L, typename T, qualifier Q> +struct compute_vec_mul : public compute_vec_mul +{}; + +template< length_t L, typename T, qualifier Q> +struct compute_vec_div : public compute_vec_div +{}; + +#if GLM_ARCH & GLM_ARCH_SSE2_BIT + +# if GLM_CONFIG_SWIZZLE == GLM_SWIZZLE_OPERATOR + template + struct _swizzle_base1 : public _swizzle_base0 + { + GLM_FUNC_QUALIFIER vec operator ()() const + { + __m128 data = *reinterpret_cast<__m128 const*>(&this->_buffer); + + vec Result; +# if GLM_ARCH & GLM_ARCH_AVX_BIT + Result.data = _mm_permute_ps(data, _MM_SHUFFLE(E3, E2, E1, E0)); +# else + Result.data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(E3, E2, E1, E0)); +# endif + return Result; + } + }; + + template + struct _swizzle_base1<2, float, Q, E0, E1, E2, E3, true> : public _swizzle_base1<2, float, Q, E0, E1, E2, E3, false> {}; + + template + struct _swizzle_base1<2, int, Q, E0, E1, E2, E3, true> : public _swizzle_base1<2, int, Q, E0, E1, E2, E3, false> {}; + + template + struct _swizzle_base1 : public _swizzle_base0 + { + GLM_FUNC_QUALIFIER vec operator ()() const + { + __m128i data = *reinterpret_cast<__m128i const*>(&this->_buffer); + + vec Result; + Result.data = _mm_shuffle_epi32(data, _MM_SHUFFLE(E3, E2, E1, E0)); + return Result; + } + }; + + template + struct _swizzle_base1 : public _swizzle_base0 + { + GLM_FUNC_QUALIFIER vec operator ()() const + { + __m128i data = *reinterpret_cast<__m128i const*>(&this->_buffer); + + vec Result; + Result.data = _mm_shuffle_epi32(data, _MM_SHUFFLE(E3, E2, E1, E0)); + return Result; + } + }; +# endif// GLM_CONFIG_SWIZZLE == GLM_SWIZZLE_OPERATOR + + + template + struct compute_vec_add + { + GLM_FUNC_QUALIFIER static vec call(vec const& a, vec const& b) + { + vec Result; + Result.data = _mm_add_ps(a.data, b.data); + return Result; + } + }; + + template + struct compute_vec_add + { + GLM_FUNC_QUALIFIER static vec call(vec const& a, vec const& b) + { + vec Result; + Result.data = _mm_add_epi32(a.data, b.data); + return Result; + } + }; + + template + struct compute_vec_add + { + GLM_FUNC_QUALIFIER static vec call(vec const& a, vec const& b) + { + vec Result; +# if (GLM_ARCH & GLM_ARCH_AVX_BIT) + Result.data = _mm256_add_pd(a.data, b.data); +#else + Result.data.setv(0, _mm_add_pd(a.data.getv(0), b.data.getv(0))); + Result.data.setv(1, _mm_add_pd(a.data.getv(1), b.data.getv(1))); +#endif + return Result; + } + }; + + template + struct compute_vec_sub + { + GLM_FUNC_QUALIFIER static vec call(vec const& a, vec const& b) + { + vec Result; + Result.data = _mm_sub_ps(a.data, b.data); + return Result; + } + }; + + template + struct compute_vec_sub + { + GLM_FUNC_QUALIFIER static vec call(vec const& a, vec const& b) + { + vec Result; + Result.data = _mm_sub_epi32(a.data, b.data); + return Result; + } + }; + + template + struct compute_vec_sub + { + GLM_FUNC_QUALIFIER static vec call(vec const& a, vec const& b) + { + vec Result; +#if (GLM_ARCH & GLM_ARCH_AVX_BIT) + Result.data = _mm256_sub_pd(a.data, b.data); +#else + Result.data.setv(0, _mm_sub_pd(a.data.getv(0), b.data.getv(0))); + Result.data.setv(1, _mm_sub_pd(a.data.getv(1), b.data.getv(1))); +#endif + return Result; + } + }; + + template + struct compute_vec_mul + { + GLM_FUNC_QUALIFIER static vec call(vec const& a, vec const& b) + { + vec Result; + Result.data = _mm_mul_ps(a.data, b.data); + return Result; + } + }; + + + template + struct compute_vec_mul + { + GLM_FUNC_QUALIFIER static vec call(vec const& a, vec const& b) + { + vec Result; +#if (GLM_ARCH & GLM_ARCH_AVX_BIT) + Result.data = _mm256_mul_pd(a.data, b.data); +#else + Result.data.setv(0, _mm_mul_pd(a.data.getv(0), b.data.getv(0))); + Result.data.setv(1, _mm_mul_pd(a.data.getv(1), b.data.getv(1))); +#endif + return Result; + } + }; + + template + struct compute_vec_mul + { + GLM_FUNC_QUALIFIER static vec call(vec const& a, vec const& b) + { + vec Result; + glm_i32vec4 ia = a.data; + glm_i32vec4 ib = b.data; +#ifdef __SSE4_1__ // modern CPU - use SSE 4.1 + Result.data = _mm_mullo_epi32(ia, ib); +#else // old CPU - use SSE 2 + __m128i tmp1 = _mm_mul_epu32(ia, ib); /* mul 2,0*/ + __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(ia, 4), _mm_srli_si128(ib, 4)); /* mul 3,1 */ + Result.data = _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0))); /* shuffle results to [63..0] and pack */ +#endif + return Result; + } + }; + + template + struct compute_vec_div + { + GLM_FUNC_QUALIFIER static vec call(vec const& a, vec const& b) + { + vec Result; + Result.data = _mm_div_ps(a.data, b.data); + return Result; + } + }; + + template + struct compute_vec_div + { + GLM_FUNC_QUALIFIER static vec call(vec const& a, vec const& b) + { +#if defined(_MSC_VER) && _MSC_VER >= 1920 //_mm_div_epi32 only defined with VS >= 2019 + vec Result; + Result.data = _mm_div_epi32(a.data, b.data); + return Result; +#else + return compute_vec_div::call(a, b); +#endif + } + }; + + + // note: div on uninitialized w can generate div by 0 exception + template + struct compute_vec_div<3, int, Q, true> + { + + GLM_FUNC_QUALIFIER static vec<3, int, Q> call(vec<3, int, Q> const& a, vec<3, int, Q> const& b) + { +#if defined(_MSC_VER) && _MSC_VER >= 1920 //_mm_div_epi32 only defined with VS >= 2019 + vec<3, int, Q> Result; + glm_i32vec4 bv = b.data; + bv = _mm_shuffle_epi32(bv, _MM_SHUFFLE(0, 2, 1, 0)); + Result.data = _mm_div_epi32(a.data, bv); + return Result; +#else + return compute_vec_div<3, int, Q, false>::call(a, b); +#endif + } + }; + + + template + struct compute_vec_div + { + GLM_FUNC_QUALIFIER static vec call(vec const& a, vec const& b) + { + vec Result; +# if GLM_ARCH & GLM_ARCH_AVX_BIT + Result.data = _mm256_div_pd(a.data, b.data); +# else + Result.data.setv(0, _mm_div_pd(a.data.getv(0), b.data.getv(0))); + Result.data.setv(1, _mm_div_pd(a.data.getv(1), b.data.getv(1))); +# endif + return Result; + } + }; + + template + struct compute_vec_div + { + GLM_FUNC_QUALIFIER static vec call(vec const& a, vec const& b) + { + vec Result; + Result.data = _mm_mul_ps(a.data, _mm_rcp_ps(b.data)); + return Result; + } + }; + + template + struct compute_vec_and + { + GLM_FUNC_QUALIFIER static vec call(vec const& a, vec const& b) + { + vec Result; + Result.data = _mm_and_si128(a.data, b.data); + return Result; + } + }; + + + template + struct compute_vec_and + { + GLM_FUNC_QUALIFIER static vec call(vec const& a, vec const& b) + { + vec Result; +# if GLM_ARCH & GLM_ARCH_AVX2_BIT + Result.data = _mm256_and_si256(a.data, b.data); +# elif GLM_ARCH & GLM_ARCH_AVX_BIT + Result.data = _mm256_and_pd(_mm256_castpd256_pd128(a.data), _mm256_castpd256_pd128(b.data)); +# else + Result.data.setv(0, _mm_and_si128(a.data.getv(0), b.data.getv(0))); + Result.data.setv(1, _mm_and_si128(a.data.getv(1), b.data.getv(1))); +# endif + return Result; + } + }; + + + + template + struct compute_vec_or + { + GLM_FUNC_QUALIFIER static vec call(vec const& a, vec const& b) + { + vec Result; + Result.data = _mm_or_si128(a.data, b.data); + return Result; + } + }; + + + template + struct compute_vec_or + { + GLM_FUNC_QUALIFIER static vec call(vec const& a, vec const& b) + { + vec Result; +# if GLM_ARCH & GLM_ARCH_AVX2_BIT + Result.data = _mm256_or_si256(a.data, b.data); +# else + Result.data.setv(0, _mm_or_si128(a.data.getv(0), b.data.getv(0))); + Result.data.setv(1, _mm_or_si128(a.data.getv(1), b.data.getv(1))); +# endif + return Result; + } + }; + + template + struct compute_vec_xor + { + GLM_FUNC_QUALIFIER static vec call(vec const& a, vec const& b) + { + vec Result; + Result.data = _mm_xor_si128(a.data, b.data); + return Result; + } + }; + + + template + struct compute_vec_xor + { + GLM_FUNC_QUALIFIER static vec call(vec const& a, vec const& b) + { + vec Result; +# if GLM_ARCH & GLM_ARCH_AVX2_BIT + Result.data = _mm256_xor_si256(a.data, b.data); +# else + Result.data.setv(0, _mm_xor_si128(a.data.getv(0), b.data.getv(0))); + Result.data.setv(1, _mm_xor_si128(a.data.getv(1), b.data.getv(1))); +# endif + return Result; + } + }; + + + //template + //struct compute_vec_shift_left<3, T, Q, -1, 32, true> + //{ + // GLM_FUNC_QUALIFIER static vec<3, T, Q> call(vec<3, T, Q> const& a, vec<3, T, Q> const& b) + // { + // vec<3, T, Q> Result; + // __m128 v2 = _mm_castsi128_ps(b.data); + // v2 = _mm_shuffle_ps(v2, v2, _MM_SHUFFLE(0, 0, 0, 0)); // note: shift is done with xmm[3] that correspond vec w that doesn't exist on vect3 + // _mm_set1_epi64x(_w, _z, _y, _x); + // __m128i vr = _mm_sll_epi32(a.data, _mm_castps_si128(v2)); + // Result.data = vr; + // return Result; + // } + //}; + + //template + //struct compute_vec_shift_left + //{ + // GLM_FUNC_QUALIFIER static vec call(vec const& a, vec const& b) + // { + // vec Result; + // Result.data = _mm_sll_epi32(a.data, b.data); + // return Result; + // } + //}; + + +// template +// struct compute_vec_shift_left +// { +// GLM_FUNC_QUALIFIER static vec call(vec const& a, vec const& b) +// { +// vec Result; +//# if GLM_ARCH & GLM_ARCH_AVX2_BIT +// Result.data = _mm256_sll_epi64(a.data, b.data); +//# else +// Result.data.setv(0, _mm_sll_epi64(a.data.getv(0), b.data.getv(0))); +// Result.data.setv(1, _mm_sll_epi64(a.data.getv(1), b.data.getv(1))); +//# endif +// return Result; +// } +// }; + + +// template +// struct compute_vec_shift_right +// { +// GLM_FUNC_QUALIFIER static vec call(vec const& a, vec const& b) +// { +// vec Result; +// Result.data = _mm_srl_epi32(a.data, b.data); +// return Result; +// } +// }; +// +// template +// struct compute_vec_shift_right +// { +// GLM_FUNC_QUALIFIER static vec call(vec const& a, vec const& b) +// { +// vec Result; +//# if GLM_ARCH & GLM_ARCH_AVX2_BIT +// Result.data = _mm256_srl_epi64(a.data, b.data); +//# else +// Result.data.setv(0, _mm_srl_epi64(a.data.getv(0), b.data.getv(0))); +// Result.data.setv(1, _mm_srl_epi64(a.data.getv(1), b.data.getv(1))); +//# endif +// return Result; +// } +// }; + + template + struct compute_vec_bitwise_not + { + GLM_FUNC_QUALIFIER static vec call(vec const& v) + { + vec Result; + Result.data = _mm_xor_si128(v.data, _mm_set1_epi32(-1)); + return Result; + } + }; + + + template + struct compute_vec_bitwise_not + { + GLM_FUNC_QUALIFIER static vec call(vec const& v) + { + vec Result; +# if GLM_ARCH & GLM_ARCH_AVX2_BIT + Result.data = _mm256_xor_si256(v.data, _mm256_set1_epi32(-1)); +# else + Result.data.setv(0, _mm_xor_si128(v.data.getv(0), _mm_set1_epi32(-1))); + Result.data.setv(1, _mm_xor_si128(v.data.getv(1), _mm_set1_epi32(-1))); +# endif + return Result; + } + }; + + + template + struct compute_vec_equal + { + GLM_FUNC_QUALIFIER static bool call(vec const& v1, vec const& v2) + { + return _mm_movemask_ps(_mm_cmpneq_ps(v1.data, v2.data)) == 0; + } + }; + +# if GLM_ARCH & GLM_ARCH_SSE41_BIT + template + struct compute_vec_equal + { + GLM_FUNC_QUALIFIER static bool call(vec const& v1, vec const& v2) + { + //return _mm_movemask_epi8(_mm_cmpeq_epi32(v1.data, v2.data)) != 0; + __m128i neq = _mm_xor_si128(v1.data, v2.data); + return _mm_test_all_zeros(neq, neq) == 0; + } + }; +# endif + + + + template + struct compute_vec_nequal + { + GLM_FUNC_QUALIFIER static bool call(vec const& v1, vec const& v2) + { + return _mm_movemask_ps(_mm_cmpneq_ps(v1.data, v2.data)) != 0; + } + }; + +# if GLM_ARCH & GLM_ARCH_SSE41_BIT + template + struct compute_vec_nequal + { + GLM_FUNC_QUALIFIER static bool call(vec const& v1, vec const& v2) + { + //return _mm_movemask_epi8(_mm_cmpneq_epi32(v1.data, v2.data)) != 0; + __m128i neq = _mm_xor_si128(v1.data, v2.data); + int v = _mm_test_all_zeros(neq, neq); + return v != 1; + } + }; + template + struct compute_vec_nequal + { + GLM_FUNC_QUALIFIER static bool call(vec const& v1, vec const& v2) + { + //return _mm_movemask_epi8(_mm_cmpneq_epi32(v1.data, v2.data)) != 0; + __m128i neq = _mm_xor_si128(v1.data, v2.data); + return _mm_test_all_zeros(neq, neq) != 1; + } + }; +# else + + + template + struct compute_vec_nequal + { + GLM_FUNC_QUALIFIER static bool call(vec const& v1, vec const& v2) + { + return compute_vec_nequal::call(v1, v2); + } + }; + +# endif + + + + +}//namespace detail + + +#define CTOR_FLOAT(L, Q)\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(float _s) :\ + data(_mm_set1_ps(_s))\ + {} + +#if GLM_ARCH & GLM_ARCH_AVX_BIT +# define CTOR_DOUBLE(L, Q)\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(double _s) :\ + data(_mm256_set1_pd(_s)){} + +#define CTOR_DOUBLE4(L, Q)\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(double _x, double _y, double _z, double _w):\ + data(_mm256_set_pd(_w, _z, _y, _x)) {} + +#define CTOR_DOUBLE3(L, Q)\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(double _x, double _y, double _z):\ + data(_mm256_set_pd(_z, _z, _y, _x)) {} + +# define CTOR_INT64(L, Q)\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(detail::int64 _s) :\ + data(_mm256_set1_epi64x(_s)){} + +#define CTOR_DOUBLE_COPY3(L, Q)\ + template<>\ + template\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, double, Q>::vec(vec<3, double, P> const& v) :\ + data(_mm256_setr_pd(v.x, v.y, v.z, v.z)){} + +#else +# define CTOR_DOUBLE(L, Q)\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(double _v) \ + {\ + data.setv(0, _mm_set1_pd(_v)); \ + data.setv(1, _mm_set1_pd(_v)); \ + } + +#define CTOR_DOUBLE4(L, Q)\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(double _x, double _y, double _z, double _w)\ + {\ + data.setv(0, _mm_setr_pd(_x, _y)); \ + data.setv(1, _mm_setr_pd(_z, _w)); \ + } + +#define CTOR_DOUBLE3(L, Q)\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(double _x, double _y, double _z)\ + {\ + data.setv(0, _mm_setr_pd(_x, _y)); \ + data.setv(1, _mm_setr_pd(_z, _z)); \ + } + +# define CTOR_INT64(L, Q)\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(detail::int64 _s) :\ + data(_mm256_set1_epi64x(_s)){} + +#define CTOR_DOUBLE_COPY3(L, Q)\ + template<>\ + template\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, double, Q>::vec(vec<3, double, P> const& v)\ + {\ + data.setv(0, _mm_setr_pd(v.x, v.y));\ + data.setv(1, _mm_setr_pd(v.z, 1.0));\ + } + +#endif //GLM_ARCH & GLM_ARCH_AVX_BIT + +#define CTOR_INT(L, Q)\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(int _s) :\ + data(_mm_set1_epi32(_s))\ + {} + +#define CTOR_FLOAT4(L, Q)\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(float _x, float _y, float _z, float _w) :\ + data(_mm_set_ps(_w, _z, _y, _x))\ + {} + +#define CTOR_FLOAT3(L, Q)\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(float _x, float _y, float _z) :\ + data(_mm_set_ps(_z, _z, _y, _x)){} + + +#define CTOR_INT4(L, Q)\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(int _x, int _y, int _z, int _w) :\ + data(_mm_set_epi32(_w, _z, _y, _x)){} + +#define CTOR_INT3(L, Q)\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(int _x, int _y, int _z) :\ + data(_mm_set_epi32(_z, _z, _y, _x)){} + +#define CTOR_VECF_INT4(L, Q)\ + template<>\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(int _x, int _y, int _z, int _w) :\ + data(_mm_cvtepi32_ps(_mm_set_epi32(_w, _z, _y, _x)))\ + {} + +#define CTOR_VECF_INT3(L, Q)\ + template<>\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(int _x, int _y, int _z) :\ + data(_mm_cvtepi32_ps(_mm_set_epi32(_z, _z, _y, _x)))\ + {} + +#define CTOR_DEFAULT(L, Q)\ + template<>\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec() :\ + data(_mm_setzero_ps())\ + {} + +#define CTOR_FLOAT_COPY3(L, Q)\ + template<>\ + template\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<3, float, Q>::vec(vec<3, float, P> const& v)\ + :data(_mm_set_ps(v.z, v.z, v.y, v.x))\ + {} + + + +}//namespace glm + +#endif//GLM_ARCH & GLM_ARCH_SSE2_BIT + +#if GLM_ARCH & GLM_ARCH_NEON_BIT + +#if GLM_CONFIG_SWIZZLE == GLM_SWIZZLE_OPERATOR +// the functions below needs to be properly implemented, use unoptimized function fro now. + +template +struct _swizzle_base1 : public _swizzle_base1{}; + +template +struct _swizzle_base1 : public _swizzle_base1 {}; + +template +struct _swizzle_base1 : public _swizzle_base1 {}; + +# endif// GLM_CONFIG_SWIZZLE == GLM_SWIZZLE_OPERATOR + + + template + struct compute_vec_add + { + GLM_FUNC_QUALIFIER static + vec + call(vec const& a, vec const& b) + { + vec Result; + Result.data = vaddq_f32(a.data, b.data); + return Result; + } + }; + + template + struct compute_vec_add + { + GLM_FUNC_QUALIFIER static + vec + call(vec const& a, vec const& b) + { + vec Result; + Result.data = vaddq_u32(a.data, b.data); + return Result; + } + }; + + template + struct compute_vec_add + { + static + vec + call(vec const& a, vec const& b) + { + vec Result; + Result.data = vaddq_s32(a.data, b.data); + return Result; + } + }; + + template + struct compute_vec_sub + { + static vec call(vec const& a, vec const& b) + { + vec Result; + Result.data = vsubq_f32(a.data, b.data); + return Result; + } + }; + + template + struct compute_vec_sub + { + static vec call(vec const& a, vec const& b) + { + vec Result; + Result.data = vsubq_u32(a.data, b.data); + return Result; + } + }; + + template + struct compute_vec_sub + { + static vec call(vec const& a, vec const& b) + { + vec Result; + Result.data = vsubq_s32(a.data, b.data); + return Result; + } + }; + + template + struct compute_vec_mul + { + static vec call(vec const& a, vec const& b) + { + vec Result; + Result.data = vmulq_f32(a.data, b.data); + return Result; + } + }; + + template + struct compute_vec_mul + { + static vec call(vec const& a, vec const& b) + { + vec Result; + Result.data = vmulq_u32(a.data, b.data); + return Result; + } + }; + + template + struct compute_vec_mul + { + static vec call(vec const& a, vec const& b) + { + vec Result; + Result.data = vmulq_s32(a.data, b.data); + return Result; + } + }; + + template + struct compute_vec_div + { + static vec call(vec const& a, vec const& b) + { + vec Result; +#if GLM_ARCH & GLM_ARCH_ARMV8_BIT + Result.data = vdivq_f32(a.data, b.data); +#else + /* Arm assembler reference: + * + * The Newton-Raphson iteration: x[n+1] = x[n] * (2 - d * x[n]) + * converges to (1/d) if x0 is the result of VRECPE applied to d. + * + * Note: The precision usually improves with two interactions, but more than two iterations are not helpful. */ + float32x4_t x = vrecpeq_f32(b.data); + x = vmulq_f32(vrecpsq_f32(b.data, x), x); + x = vmulq_f32(vrecpsq_f32(b.data, x), x); + Result.data = vmulq_f32(a.data, x); +#endif + return Result; + } + }; + + template + struct compute_vec_equal + { + static bool call(vec const& v1, vec const& v2) + { + uint32x4_t cmp = vceqq_f32(v1.data, v2.data); +#if GLM_ARCH & GLM_ARCH_ARMV8_BIT + cmp = vpminq_u32(cmp, cmp); + cmp = vpminq_u32(cmp, cmp); + uint32_t r = cmp[0]; +#else + uint32x2_t cmpx2 = vpmin_u32(vget_low_u32(cmp), vget_high_u32(cmp)); + cmpx2 = vpmin_u32(cmpx2, cmpx2); + uint32_t r = cmpx2[0]; +#endif + return r == ~0u; + } + }; + + template + struct compute_vec_equal + { + static bool call(vec const& v1, vec const& v2) + { + uint32x4_t cmp = vceqq_u32(v1.data, v2.data); +#if GLM_ARCH & GLM_ARCH_ARMV8_BIT + cmp = vpminq_u32(cmp, cmp); + cmp = vpminq_u32(cmp, cmp); + uint32_t r = cmp[0]; +#else + uint32x2_t cmpx2 = vpmin_u32(vget_low_u32(cmp), vget_high_u32(cmp)); + cmpx2 = vpmin_u32(cmpx2, cmpx2); + uint32_t r = cmpx2[0]; +#endif + return r == ~0u; + } + }; + + template + struct compute_vec_equal + { + static bool call(vec const& v1, vec const& v2) + { + uint32x4_t cmp = vceqq_s32(v1.data, v2.data); +#if GLM_ARCH & GLM_ARCH_ARMV8_BIT + cmp = vpminq_u32(cmp, cmp); + cmp = vpminq_u32(cmp, cmp); + uint32_t r = cmp[0]; +#else + uint32x2_t cmpx2 = vpmin_u32(vget_low_u32(cmp), vget_high_u32(cmp)); + cmpx2 = vpmin_u32(cmpx2, cmpx2); + uint32_t r = cmpx2[0]; +#endif + return r == ~0u; + } + }; + + template + struct compute_vec_nequal + { + static bool call(vec const& v1, vec const& v2) + { + return !compute_vec_equal::call(v1, v2); + } + }; + + template + struct compute_vec_nequal + { + static bool call(vec const& v1, vec const& v2) + { + return !compute_vec_equal::call(v1, v2); + } + }; + + template + struct compute_vec_nequal + { + static bool call(vec const& v1, vec const& v2) + { + return !compute_vec_equal::call(v1, v2); + } + }; + + +#if !GLM_CONFIG_XYZW_ONLY + +#define CTOR_FLOAT(L, Q)\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(float _s) :\ + data(vdupq_n_f32(_s))\ + {} + +#define CTOR_INT(L, Q)\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(int _s) :\ + data(vdupq_n_s32(_s))\ + {} + +#define CTOR_UINT(L, Q)\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(uint _s) :\ + data(vdupq_n_u32(_s))\ + {} + +#define CTOR_VECF_INT4(L, Q)\ + template<>\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(int _x, int _y, int _z, int _w) :\ + data(vcvtq_f32_s32(vec(_x, _y, _z, _w).data))\ + {} + +#define CTOR_VECF_UINT4(L, Q)\ + template<>\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(uint _x, uint _y, uint _z, uint _w) :\ + data(vcvtq_f32_u32(vec(_x, _y, _z, _w).data))\ + {} + +#define CTOR_VECF_INT3(L, Q)\ + template<>\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(int _x, int _y, int _z) :\ + data(vcvtq_f32_s32(vec(_x, _y, _z).data))\ + {} + +#define CTOR_VECF_UINT4(L, Q)\ + template<>\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(uint _x, uint _y, uint _z, uint _w) :\ + data(vcvtq_f32_u32(vec(_x, _y, _z, _w).data))\ + {} + +#define CTOR_VECF_UINT3(L, Q)\ + template<>\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(uint _x, uint _y, uint _z) :\ + data(vcvtq_f32_u32(vec(_x, _y, _z).data))\ + {} + + +#define CTOR_VECF_VECF(L, Q)\ + template<>\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(const vec& rhs) :\ + data(rhs.data)\ + {} + +#define CTOR_VECF_VECI(L, Q)\ + template<>\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(const vec& rhs) :\ + data(vcvtq_f32_s32(rhs.data))\ + {} + +#define CTOR_VECF_VECU(L, Q)\ + template<>\ + template<>\ + GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec::vec(const vec& rhs) :\ + data(vcvtq_f32_u32(rhs.data))\ + {} + + +#endif + + +}//namespace detail + +}//namespace glm + +#endif diff --git a/glm/ext/vector_relational.inl b/glm/ext/vector_relational.inl index 7a39ab50..8c50b2ff 100644 --- a/glm/ext/vector_relational.inl +++ b/glm/ext/vector_relational.inl @@ -10,7 +10,7 @@ namespace glm { return equal(x, y, vec(Epsilon)); } - + template GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec equal(vec const& x, vec const& y, vec const& Epsilon) { diff --git a/glm/gtc/random.inl b/glm/gtc/random.inl index 249ec9f9..57243681 100644 --- a/glm/gtc/random.inl +++ b/glm/gtc/random.inl @@ -69,7 +69,7 @@ namespace detail { return (vec(compute_rand::call()) << static_cast(8)) | - (vec(compute_rand::call()) << static_cast(0)); + (vec(compute_rand::call())); } }; @@ -80,7 +80,7 @@ namespace detail { return (vec(compute_rand::call()) << static_cast(16)) | - (vec(compute_rand::call()) << static_cast(0)); + (vec(compute_rand::call())); } }; @@ -91,7 +91,7 @@ namespace detail { return (vec(compute_rand::call()) << static_cast(32)) | - (vec(compute_rand::call()) << static_cast(0)); + (vec(compute_rand::call())); } }; diff --git a/glm/gtx/common.inl b/glm/gtx/common.inl index 4575b207..4651e35c 100644 --- a/glm/gtx/common.inl +++ b/glm/gtx/common.inl @@ -12,7 +12,7 @@ namespace detail { GLM_FUNC_QUALIFIER static vec call(vec const& a, vec const& b) { - return detail::functor2::call(std::fmod, a, b); + return detail::functor2::call(TFmod(), a, b); } }; diff --git a/glm/gtx/vec_swizzle.hpp b/glm/gtx/vec_swizzle.hpp index 2dafa645..bce96e8b 100644 --- a/glm/gtx/vec_swizzle.hpp +++ b/glm/gtx/vec_swizzle.hpp @@ -282,11 +282,6 @@ namespace glm { return glm::vec<3, T, Q>(v.x, v.y, v.z); } - template - GLM_FUNC_QUALIFIER glm::vec<3, T, Q> xyz(const glm::vec<4, T, Q> &v) { - return glm::vec<3, T, Q>(v.x, v.y, v.z); - } - // xyw template GLM_FUNC_QUALIFIER glm::vec<3, T, Q> xyw(const glm::vec<4, T, Q> &v) { @@ -1040,16 +1035,6 @@ namespace glm { return glm::vec<4, T, Q>(v.x, v.y, v.z, v.y); } - // xyzz - template - GLM_FUNC_QUALIFIER glm::vec<4, T, Q> xyzz(const glm::vec<3, T, Q> &v) { - return glm::vec<4, T, Q>(v.x, v.y, v.z, v.z); - } - - template - GLM_FUNC_QUALIFIER glm::vec<4, T, Q> xyzz(const glm::vec<4, T, Q> &v) { - return glm::vec<4, T, Q>(v.x, v.y, v.z, v.z); - } // xyzw template diff --git a/glm/simd/common.h b/glm/simd/common.h index 9b017cb4..d6c78f18 100644 --- a/glm/simd/common.h +++ b/glm/simd/common.h @@ -63,7 +63,7 @@ GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec4_swizzle_xyzw(glm_f32vec4 a) GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec1_fma(glm_f32vec4 a, glm_f32vec4 b, glm_f32vec4 c) { -# if (GLM_ARCH & GLM_ARCH_AVX2_BIT) && !(GLM_COMPILER & GLM_COMPILER_CLANG) +# ifdef GLM_FORCE_FMA return _mm_fmadd_ss(a, b, c); # else return _mm_add_ss(_mm_mul_ss(a, b), c); @@ -72,7 +72,16 @@ GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec1_fma(glm_f32vec4 a, glm_f32vec4 b, glm_f3 GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec4_fma(glm_f32vec4 a, glm_f32vec4 b, glm_f32vec4 c) { -# if (GLM_ARCH & GLM_ARCH_AVX2_BIT) && !(GLM_COMPILER & GLM_COMPILER_CLANG) +# ifdef GLM_FORCE_FMA + return _mm_fmadd_ps(a, b, c); +# else + return glm_vec4_add(glm_vec4_mul(a, b), c); +# endif +} + +GLM_FUNC_QUALIFIER glm_f32vec4 glm_vec4d_fma(glm_f32vec4 a, glm_f32vec4 b, glm_f32vec4 c) +{ +# ifdef GLM_FORCE_FMA return _mm_fmadd_ps(a, b, c); # else return glm_vec4_add(glm_vec4_mul(a, b), c); diff --git a/glm/simd/matrix.h b/glm/simd/matrix.h index b6c42ea4..8f9461c8 100644 --- a/glm/simd/matrix.h +++ b/glm/simd/matrix.h @@ -166,6 +166,18 @@ GLM_FUNC_QUALIFIER void glm_mat4_transpose(glm_vec4 const in[4], glm_vec4 out[4] out[3] = _mm_shuffle_ps(tmp2, tmp3, 0xDD); } +GLM_FUNC_QUALIFIER void glm_mat3_transpose(glm_vec4 const in[3], glm_vec4 out[3]) +{ + __m128 tmp0 = _mm_shuffle_ps(in[0], in[1], 0x44); + __m128 tmp2 = _mm_shuffle_ps(in[0], in[1], 0xEE); + __m128 tmp1 = _mm_shuffle_ps(in[2], in[2], 0x44); + __m128 tmp3 = _mm_shuffle_ps(in[2], in[2], 0xEE); + + out[0] = _mm_shuffle_ps(tmp0, tmp1, 0x88); + out[1] = _mm_shuffle_ps(tmp0, tmp1, 0xDD); + out[2] = _mm_shuffle_ps(tmp2, tmp3, 0x88); +} + GLM_FUNC_QUALIFIER glm_vec4 glm_mat4_determinant_highp(glm_vec4 const in[4]) { __m128 Fac0; diff --git a/glm/simd/neon.h b/glm/simd/neon.h index f85947f5..4bb7c2ed 100644 --- a/glm/simd/neon.h +++ b/glm/simd/neon.h @@ -22,7 +22,7 @@ namespace glm { case 3: return vdupq_n_f32(vgetq_lane_f32(vsrc, 3)); #endif } - assert(!"Unreachable code executed!"); + assert(false); //Unreachable code executed! return vdupq_n_f32(0.0f); } @@ -40,7 +40,7 @@ namespace glm { case 3: return vdup_n_f32(vgetq_lane_f32(vsrc, 3)); #endif } - assert(!"Unreachable code executed!"); + assert(false); //Unreachable code executed! return vdup_n_f32(0.0f); } @@ -54,7 +54,8 @@ namespace glm { case 2: return vcopyq_laneq_f32(vdst, 0, vsrc, 2); case 3: return vcopyq_laneq_f32(vdst, 0, vsrc, 3); } - assert(!"Unreachable code executed!"); + assert(false); //Unreachable code executed! + break; case 1: switch(slane) { case 0: return vcopyq_laneq_f32(vdst, 1, vsrc, 0); @@ -62,7 +63,8 @@ namespace glm { case 2: return vcopyq_laneq_f32(vdst, 1, vsrc, 2); case 3: return vcopyq_laneq_f32(vdst, 1, vsrc, 3); } - assert(!"Unreachable code executed!"); + assert(false); //Unreachable code executed! + break; case 2: switch(slane) { case 0: return vcopyq_laneq_f32(vdst, 2, vsrc, 0); @@ -70,7 +72,8 @@ namespace glm { case 2: return vcopyq_laneq_f32(vdst, 2, vsrc, 2); case 3: return vcopyq_laneq_f32(vdst, 2, vsrc, 3); } - assert(!"Unreachable code executed!"); + assert(false); //Unreachable code executed! + break; case 3: switch(slane) { case 0: return vcopyq_laneq_f32(vdst, 3, vsrc, 0); @@ -78,7 +81,8 @@ namespace glm { case 2: return vcopyq_laneq_f32(vdst, 3, vsrc, 2); case 3: return vcopyq_laneq_f32(vdst, 3, vsrc, 3); } - assert(!"Unreachable code executed!"); + assert(false); //Unreachable code executed! + break; } #else @@ -89,7 +93,7 @@ namespace glm { case 2: l = vgetq_lane_f32(vsrc, 2); break; case 3: l = vgetq_lane_f32(vsrc, 3); break; default: - assert(!"Unreachable code executed!"); + assert(false); //Unreachable code executed! } switch(dlane) { case 0: return vsetq_lane_f32(l, vdst, 0); @@ -98,7 +102,7 @@ namespace glm { case 3: return vsetq_lane_f32(l, vdst, 3); } #endif - assert(!"Unreachable code executed!"); + assert(false); //Unreachable code executed! return vdupq_n_f32(0.0f); } @@ -110,9 +114,9 @@ namespace glm { case 2: return vmulq_laneq_f32(v, vlane, 2); break; case 3: return vmulq_laneq_f32(v, vlane, 3); break; default: - assert(!"Unreachable code executed!"); + assert(false); //Unreachable code executed! } - assert(!"Unreachable code executed!"); + assert(false); //Unreachable code executed! return vdupq_n_f32(0.0f); #else return vmulq_f32(v, dupq_lane(vlane, lane)); @@ -141,9 +145,9 @@ namespace glm { FMADD_LANE(acc, v, vlane, 3); return acc; default: - assert(!"Unreachable code executed!"); + assert(false); //Unreachable code executed! } - assert(!"Unreachable code executed!"); + assert(false); //Unreachable code executed! return vdupq_n_f32(0.0f); # undef FMADD_LANE #else diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 4ed1403f..f0153a0f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -2,6 +2,122 @@ option(GLM_QUIET "No CMake Message" OFF) option(GLM_TEST_ENABLE "Build unit tests" ON) option(GLM_PERF_TEST_ENABLE "Build perf tests" OFF) +<<<<<<< HEAD +======= +option(GLM_TEST_ENABLE_SIMD_SSE2 "Enable SSE2 optimizations" OFF) +option(GLM_TEST_ENABLE_SIMD_SSE3 "Enable SSE3 optimizations" OFF) +option(GLM_TEST_ENABLE_SIMD_SSSE3 "Enable SSSE3 optimizations" OFF) +option(GLM_TEST_ENABLE_SIMD_SSE4_1 "Enable SSE 4.1 optimizations" OFF) +option(GLM_TEST_ENABLE_SIMD_SSE4_2 "Enable SSE 4.2 optimizations" OFF) +option(GLM_TEST_ENABLE_SIMD_AVX "Enable AVX optimizations" OFF) +option(GLM_TEST_ENABLE_SIMD_AVX2 "Enable AVX2 optimizations" OFF) +option(GLM_TEST_ENABLE_SIMD_NEON "Enable ARM NEON optimizations" OFF) +option(GLM_TEST_ENABLE_SIMD_FMA "Enable FMA optimizations" OFF) +option(GLM_TEST_FORCE_PURE "Force 'pure' instructions" OFF) + +if(GLM_TEST_FORCE_PURE) + add_definitions(-DGLM_FORCE_PURE) + + if(CMAKE_CXX_COMPILER_ID MATCHES "GNU") + add_compile_options(-mfpmath=387) + endif() + message(STATUS "GLM: No SIMD instruction set") + +elseif(GLM_TEST_ENABLE_SIMD_AVX2) + add_definitions(-DGLM_FORCE_INTRINSICS) + + if((CMAKE_CXX_COMPILER_ID MATCHES "GNU") OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) + add_compile_options(-mavx2) + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel") + add_compile_options(/QxAVX2) + elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") + add_compile_options(/arch:AVX2) + endif() + message(STATUS "GLM: AVX2 instruction set") + +elseif(GLM_TEST_ENABLE_SIMD_AVX) + add_definitions(-DGLM_FORCE_INTRINSICS) + + if((CMAKE_CXX_COMPILER_ID MATCHES "GNU") OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) + add_compile_options(-mavx) + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel") + add_compile_options(/QxAVX) + elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") + add_compile_options(/arch:AVX) + endif() + message(STATUS "GLM: AVX instruction set") + +elseif(GLM_TEST_ENABLE_SIMD_SSE4_2) + add_definitions(-DGLM_FORCE_INTRINSICS) + + if((CMAKE_CXX_COMPILER_ID MATCHES "GNU") OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) + add_compile_options(-msse4.2) + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel") + add_compile_options(/QxSSE4.2) + elseif((CMAKE_CXX_COMPILER_ID MATCHES "MSVC") AND NOT CMAKE_CL_64) + add_compile_options(/arch:SSE2) # VC doesn't support SSE4.2 + endif() + message(STATUS "GLM: SSE4.2 instruction set") + +elseif(GLM_TEST_ENABLE_SIMD_SSE4_1) + add_definitions(-DGLM_FORCE_INTRINSICS) + + if((CMAKE_CXX_COMPILER_ID MATCHES "GNU") OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) + add_compile_options(-msse4.1) + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel") + add_compile_options(/QxSSE4.1) + elseif((CMAKE_CXX_COMPILER_ID MATCHES "MSVC") AND NOT CMAKE_CL_64) + add_compile_options(/arch:SSE2) # VC doesn't support SSE4.1 + endif() + message(STATUS "GLM: SSE4.1 instruction set") + +elseif(GLM_TEST_ENABLE_SIMD_SSSE3) + add_definitions(-DGLM_FORCE_INTRINSICS) + + if((CMAKE_CXX_COMPILER_ID MATCHES "GNU") OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) + add_compile_options(-mssse3) + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel") + add_compile_options(/QxSSSE3) + elseif((CMAKE_CXX_COMPILER_ID MATCHES "MSVC") AND NOT CMAKE_CL_64) + add_compile_options(/arch:SSE2) # VC doesn't support SSSE3 + endif() + message(STATUS "GLM: SSSE3 instruction set") + +elseif(GLM_TEST_ENABLE_SIMD_SSE3) + add_definitions(-DGLM_FORCE_INTRINSICS) + + if((CMAKE_CXX_COMPILER_ID MATCHES "GNU") OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) + add_compile_options(-msse3) + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel") + add_compile_options(/QxSSE3) + elseif((CMAKE_CXX_COMPILER_ID MATCHES "MSVC") AND NOT CMAKE_CL_64) + add_compile_options(/arch:SSE2) # VC doesn't support SSE3 + endif() + message(STATUS "GLM: SSE3 instruction set") + +elseif(GLM_TEST_ENABLE_SIMD_SSE2) + add_definitions(-DGLM_FORCE_INTRINSICS) + + if((CMAKE_CXX_COMPILER_ID MATCHES "GNU") OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) + add_compile_options(-msse2) + elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel") + add_compile_options(/QxSSE2) + elseif((CMAKE_CXX_COMPILER_ID MATCHES "MSVC") AND NOT CMAKE_CL_64) + add_compile_options(/arch:SSE2) + endif() + message(STATUS "GLM: SSE2 instruction set") +elseif(GLM_TEST_ENABLE_SIMD_NEON) + add_definitions(-DGLM_FORCE_NEON) + message(STATUS "GLM: ARM NEON instruction set") +endif() + +if (GLM_TEST_ENABLE_SIMD_FMA) + add_definitions(-DGLM_FORCE_FMA) + if((CMAKE_CXX_COMPILER_ID MATCHES "GNU") OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) + add_compile_options(-mfma) + endif() +endif() +>>>>>>> 5c1da05b (Simd improvement) # Compiler and default options @@ -10,9 +126,19 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") message("GLM: Clang - ${CMAKE_CXX_COMPILER_ID} compiler") endif() + add_definitions(-D_CRT_SECURE_NO_WARNINGS) if(NOT GLM_DISABLE_AUTO_DETECTION) add_compile_options(-Werror -Weverything) endif() +<<<<<<< HEAD +======= + if (NOT APPLE) + add_compile_options(-Wno-unsafe-buffer-usage) #this option makes MacOS compilation fail but prevent error on windows + endif() + + add_compile_options(-Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-c++11-long-long -Wno-padded -Wno-gnu-anonymous-struct -Wno-nested-anon-types) + add_compile_options(-Wno-undefined-reinterpret-cast -Wno-sign-conversion -Wno-unused-variable -Wno-missing-prototypes -Wno-unreachable-code -Wno-missing-variable-declarations -Wno-sign-compare -Wno-global-constructors -Wno-unused-macros -Wno-format-nonliteral -Wno-float-equal) +>>>>>>> 5c1da05b (Simd improvement) elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU") if(NOT GLM_QUIET) diff --git a/test/bug/bug_ms_vec_static.cpp b/test/bug/bug_ms_vec_static.cpp index f4919d66..1645aacb 100644 --- a/test/bug/bug_ms_vec_static.cpp +++ b/test/bug/bug_ms_vec_static.cpp @@ -3,7 +3,7 @@ #if GLM_CONFIG_ANONYMOUS_STRUCT == GLM_ENABLE struct vec2; -struct _swizzle +struct swizzleStruct { char _buffer[1]; }; @@ -27,7 +27,7 @@ struct vec2 union { struct { float x, y; }; - struct { _swizzle xx; }; + struct { swizzleStruct xx; }; }; #if GLM_COMPILER & GLM_COMPILER_CLANG diff --git a/test/core/core_force_pure.cpp b/test/core/core_force_pure.cpp index 03ec7558..bca17b53 100644 --- a/test/core/core_force_pure.cpp +++ b/test/core/core_force_pure.cpp @@ -1,7 +1,7 @@ #ifndef GLM_FORCE_PURE # define GLM_FORCE_PURE #endif//GLM_FORCE_PURE -#define GLM_FORCE_DEFAULT_ALIGNED_GENTYPES +//#define GLM_FORCE_DEFAULT_ALIGNED_GENTYPES #define GLM_FORCE_SWIZZLE #include #include diff --git a/test/core/core_type_aligned.cpp b/test/core/core_type_aligned.cpp index 63421f84..c29593c5 100644 --- a/test/core/core_type_aligned.cpp +++ b/test/core/core_type_aligned.cpp @@ -1,4 +1,7 @@ +#ifndef GLM_FORCE_PURE #define GLM_FORCE_DEFAULT_ALIGNED_GENTYPES +#endif + #include #if GLM_CONFIG_ALIGNED_GENTYPES == GLM_ENABLE diff --git a/test/gtc/gtc_random.cpp b/test/gtc/gtc_random.cpp index 0c834617..734c69d3 100644 --- a/test/gtc/gtc_random.cpp +++ b/test/gtc/gtc_random.cpp @@ -1,4 +1,6 @@ +#ifndef GLM_FORCE_PURE #define GLM_FORCE_DEFAULT_ALIGNED_GENTYPES +#endif #include #include #include diff --git a/test/gtc/gtc_type_aligned.cpp b/test/gtc/gtc_type_aligned.cpp index 7120deec..7074c665 100644 --- a/test/gtc/gtc_type_aligned.cpp +++ b/test/gtc/gtc_type_aligned.cpp @@ -91,6 +91,303 @@ struct my_u8vec4_packed }; GLM_STATIC_ASSERT(sizeof(my_u8vec4_packed) == sizeof(glm::uint32) + sizeof(glm::u8vec4), "glm::u8vec4 packed is not correct"); +static int test_copy_vec4() +{ + int Error = 0; + { + glm::aligned_vec4 const u(1.f, 2.f, 3.f, 4.f); + glm::packed_vec4 const v(u); + Error += v.x == u.x ? 0 : 1; + Error += v.y == u.y ? 0 : 1; + Error += v.z == u.z ? 0 : 1; + Error += v.w == u.w ? 0 : 1; + } + { + glm::packed_vec4 const u(1.f, 2.f, 3.f, 4.f); + glm::aligned_vec4 const v(u); + Error += v.x == u.x ? 0 : 1; + Error += v.y == u.y ? 0 : 1; + Error += v.z == u.z ? 0 : 1; + Error += v.w == u.w ? 0 : 1; + } + + { + glm::aligned_dvec4 const u(1., 2., 3., 4.); + glm::packed_dvec4 const v(u); + Error += v.x == u.x ? 0 : 1; + Error += v.y == u.y ? 0 : 1; + Error += v.z == u.z ? 0 : 1; + Error += v.w == u.w ? 0 : 1; + } + { + glm::packed_dvec4 const u(1.f, 2.f, 3.f, 4.f); + glm::aligned_dvec4 const v(u); + Error += v.x == u.x ? 0 : 1; + Error += v.y == u.y ? 0 : 1; + Error += v.z == u.z ? 0 : 1; + Error += v.w == u.w ? 0 : 1; + } + + { + glm::aligned_ivec4 const u(1, 2, 3, 4); + glm::packed_ivec4 const v(u); + Error += v.x == u.x ? 0 : 1; + Error += v.y == u.y ? 0 : 1; + Error += v.z == u.z ? 0 : 1; + Error += v.w == u.w ? 0 : 1; + } + { + glm::packed_ivec4 const u(1, 2, 3, 4); + glm::aligned_ivec4 const v(u); + Error += v.x == u.x ? 0 : 1; + Error += v.y == u.y ? 0 : 1; + Error += v.z == u.z ? 0 : 1; + Error += v.w == u.w ? 0 : 1; + } + + return Error; +} + +static int test_copy_vec3() +{ + int Error = 0; + { + glm::aligned_vec3 const u(1.f, 2.f, 3.f); + glm::packed_vec3 const v(u); + Error += v.x == u.x ? 0 : 1; + Error += v.y == u.y ? 0 : 1; + Error += v.z == u.z ? 0 : 1; + } + { + glm::packed_vec3 const u(1.f, 2.f, 3.f); + glm::aligned_vec3 const v(u); + Error += v.x == u.x ? 0 : 1; + Error += v.y == u.y ? 0 : 1; + Error += v.z == u.z ? 0 : 1; + } + + { + glm::aligned_dvec3 const u(1., 2., 3.); + glm::packed_dvec3 const v(u); + Error += v.x == u.x ? 0 : 1; + Error += v.y == u.y ? 0 : 1; + Error += v.z == u.z ? 0 : 1; + } + { + glm::packed_dvec3 const u(1.f, 2.f, 3.f); + glm::aligned_dvec3 const v(u); + Error += v.x == u.x ? 0 : 1; + Error += v.y == u.y ? 0 : 1; + Error += v.z == u.z ? 0 : 1; + } + + { + glm::aligned_ivec3 const u(1, 2, 3); + glm::packed_ivec3 const v(u); + Error += v.x == u.x ? 0 : 1; + Error += v.y == u.y ? 0 : 1; + Error += v.z == u.z ? 0 : 1; + } + { + glm::packed_ivec3 const u(1, 2, 3); + glm::aligned_ivec3 const v(u); + Error += v.x == u.x ? 0 : 1; + Error += v.y == u.y ? 0 : 1; + Error += v.z == u.z ? 0 : 1; + } + + return Error; +} + +static int test_splat_vec3() +{ + int Error = 0; + { + glm::aligned_vec3 const u(1.f, 2.f, 3.f); + glm::aligned_vec3 const v(glm::splatX(u)); + Error += v.x == u.x ? 0 : 1; + Error += v.y == u.x ? 0 : 1; + Error += v.z == u.x ? 0 : 1; + } + + { + glm::aligned_vec3 const u(1.f, 2.f, 3.f); + glm::aligned_vec3 const v(glm::splatY(u)); + Error += v.x == u.y ? 0 : 1; + Error += v.y == u.y ? 0 : 1; + Error += v.z == u.y ? 0 : 1; + } + + { + glm::aligned_vec3 const u(1.f, 2.f, 3.f); + glm::aligned_vec3 const v(glm::splatZ(u)); + Error += v.x == u.z ? 0 : 1; + Error += v.y == u.z ? 0 : 1; + Error += v.z == u.z ? 0 : 1; + } + + { + glm::aligned_dvec3 const u(1., 2., 3.); + glm::aligned_dvec3 const v(glm::splatX(u)); + Error += v.x == u.x ? 0 : 1; + Error += v.y == u.x ? 0 : 1; + Error += v.z == u.x ? 0 : 1; + } + + { + glm::aligned_dvec3 const u(1., 2., 3.); + glm::aligned_dvec3 const v(glm::splatY(u)); + Error += v.x == u.y ? 0 : 1; + Error += v.y == u.y ? 0 : 1; + Error += v.z == u.y ? 0 : 1; + } + + { + glm::aligned_dvec3 const u(1., 2., 3.); + glm::aligned_dvec3 const v(glm::splatZ(u)); + Error += v.x == u.z ? 0 : 1; + Error += v.y == u.z ? 0 : 1; + Error += v.z == u.z ? 0 : 1; + } + + return Error; +} + +static int test_splat_vec4() +{ + int Error = 0; + { + glm::aligned_vec4 const u(1.f, 2.f, 3.f, 4.f); + { + glm::aligned_vec4 const v(glm::splatX(u)); + Error += v.x == u.x ? 0 : 1; + Error += v.y == u.x ? 0 : 1; + Error += v.z == u.x ? 0 : 1; + Error += v.w == u.x ? 0 : 1; + } + + { + glm::aligned_vec4 const v(glm::splatY(u)); + Error += v.x == u.y ? 0 : 1; + Error += v.y == u.y ? 0 : 1; + Error += v.z == u.y ? 0 : 1; + Error += v.w == u.y ? 0 : 1; + } + + { + glm::aligned_vec4 const v(glm::splatZ(u)); + Error += v.x == u.z ? 0 : 1; + Error += v.y == u.z ? 0 : 1; + Error += v.z == u.z ? 0 : 1; + Error += v.w == u.z ? 0 : 1; + } + } + { + glm::aligned_dvec4 const u(1.f, 2.f, 3.f, 4.f); + { + glm::aligned_dvec4 const v(glm::splatX(u)); + Error += v.x == u.x ? 0 : 1; + Error += v.y == u.x ? 0 : 1; + Error += v.z == u.x ? 0 : 1; + Error += v.w == u.x ? 0 : 1; + } + + { + glm::aligned_dvec4 const v(glm::splatY(u)); + Error += v.x == u.y ? 0 : 1; + Error += v.y == u.y ? 0 : 1; + Error += v.z == u.y ? 0 : 1; + Error += v.w == u.y ? 0 : 1; + } + + { + glm::aligned_dvec4 const v(glm::splatZ(u)); + Error += v.x == u.z ? 0 : 1; + Error += v.y == u.z ? 0 : 1; + Error += v.z == u.z ? 0 : 1; + Error += v.w == u.z ? 0 : 1; + } + } + return Error; +} + +static int test_copy_vec4_vec3() +{ + int Error = 0; + + { + glm::aligned_vec3 const u(1.f, 2.f, 3.f); + glm::aligned_vec4 const v(glm::xyz0(u)); + Error += v.x == u.x ? 0 : 1; + Error += v.y == u.y ? 0 : 1; + Error += v.z == u.z ? 0 : 1; + Error += v.w == 0.0f ? 0 : 1; + } + + { + glm::aligned_vec3 const u(1.f, 2.f, 3.f); + glm::aligned_vec4 const v(glm::xyz1(u)); + Error += v.x == u.x ? 0 : 1; + Error += v.y == u.y ? 0 : 1; + Error += v.z == u.z ? 0 : 1; + Error += v.w == 1.0f ? 0 : 1; + } + + { + glm::aligned_dvec3 const u(1., 2., 3.); + glm::aligned_dvec4 const v(glm::xyz0(u)); + Error += v.x == u.x ? 0 : 1; + Error += v.y == u.y ? 0 : 1; + Error += v.z == u.z ? 0 : 1; + Error += v.w == 0. ? 0 : 1; + } + + { + glm::aligned_dvec3 const u(1., 2., 3.); + glm::aligned_dvec4 const v(glm::xyz1(u)); + Error += v.x == u.x ? 0 : 1; + Error += v.y == u.y ? 0 : 1; + Error += v.z == u.z ? 0 : 1; + Error += v.w == 1.0 ? 0 : 1; + } + + { + glm::aligned_dvec3 const u(1., 2., 3.); + glm::aligned_dvec4 const v(glm::xyzz(u)); + Error += v.x == u.x ? 0 : 1; + Error += v.y == u.y ? 0 : 1; + Error += v.z == u.z ? 0 : 1; + Error += v.w == u.z ? 0 : 1; + } + + { + glm::aligned_dvec3 const u(1., 2., 3.); + glm::aligned_dvec4 const v(glm::xyzz(u)); + Error += v.x == u.x ? 0 : 1; + Error += v.y == u.y ? 0 : 1; + Error += v.z == u.z ? 0 : 1; + Error += v.w == u.z ? 0 : 1; + } + + + { + glm::aligned_vec4 const u(1.f, 2.f, 3.f, 4.f); + glm::aligned_vec3 const v(glm::xyz(u)); + Error += v.x == u.x ? 0 : 1; + Error += v.y == u.y ? 0 : 1; + Error += v.z == u.z ? 0 : 1; + } + + { + glm::aligned_dvec4 const u(1., 2., 3., 4.); + glm::aligned_dvec3 const v(glm::xyz(u)); + Error += v.x == u.x ? 0 : 1; + Error += v.y == u.y ? 0 : 1; + Error += v.z == u.z ? 0 : 1; + } + return Error; +} + static int test_copy() { int Error = 0; @@ -188,15 +485,22 @@ static int test_aligned_mat4() return Error; } + int main() { int Error = 0; Error += test_ctor(); + Error += test_copy_vec4(); + Error += test_copy_vec3(); + Error += test_splat_vec3(); + Error += test_splat_vec4(); + Error += test_copy_vec4_vec3(); Error += test_copy(); Error += test_aligned_ivec4(); Error += test_aligned_mat4(); + return Error; } diff --git a/test/perf/perf_matrix_mul.cpp b/test/perf/perf_matrix_mul.cpp index 41e1a920..db65ff06 100644 --- a/test/perf/perf_matrix_mul.cpp +++ b/test/perf/perf_matrix_mul.cpp @@ -14,6 +14,31 @@ #include #include + +inline bool +is_aligned(const void* ptr, std::uintptr_t alignment) noexcept { + auto iptr = reinterpret_cast(ptr); + return !(iptr % alignment); +} + +template +static void align_check(matType const& M, std::vector const& I, std::vector& O) +{ + if (matType::col_type::is_aligned::value) + { + if (!is_aligned(&M, 16)) + abort(); + for (std::size_t i = 0, n = I.size(); i < n; ++i) + { + if (!is_aligned(&I[i], 16)) + abort(); + + if (!is_aligned(&O[i], 16)) + abort(); + } + } +} + template static void test_mat_mul_mat(matType const& M, std::vector const& I, std::vector& O) { @@ -32,6 +57,8 @@ static int launch_mat_mul_mat(std::vector& O, matType const& Transform, for(std::size_t i = 0; i < Samples; ++i) I[i] = Scale * static_cast(i); + align_check(Transform, I, O); + std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now(); test_mat_mul_mat(Transform, I, O); std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now(); @@ -65,27 +92,49 @@ static int comp_mat2_mul_mat2(std::size_t Samples) return Error; } +template +bool percent_error(const T1& a, const T2& b, float percentThreshold) +{ + typedef typename T1::value_type value_type; + for (int i = 0; i < a.length(); ++i) + for (int j = 0; j < a[i].length(); ++j) + { + value_type v; + if (a[i][j] != value_type(0)) + v = ((b[i][j] - a[i][j]) / a[i][j]) * value_type(100); + else + v = b[i][j] * value_type(100); + + if (v > value_type(percentThreshold)) + return false; + } + return true; +} + template static int comp_mat3_mul_mat3(std::size_t Samples) { - typedef typename packedMatType::value_type T; - + int Error = 0; - packedMatType const Transform(1, 2, 3, 4, 5, 6, 7, 8, 9); - packedMatType const Scale(0.01, 0.02, 0.03, 0.05, 0.01, 0.02, 0.03, 0.05, 0.01); - std::vector SISD; - std::printf("- SISD: %d us\n", launch_mat_mul_mat(SISD, Transform, Scale, Samples)); + { + packedMatType const Transform(1, 2, 3, 4, 5, 6, 7, 8, 9); + packedMatType const Scale(0.01, 0.02, 0.03, 0.05, 0.01, 0.02, 0.03, 0.05, 0.01); + std::printf("- SISD: %d us\n", launch_mat_mul_mat(SISD, Transform, Scale, Samples)); + } std::vector SIMD; - std::printf("- SIMD: %d us\n", launch_mat_mul_mat(SIMD, Transform, Scale, Samples)); - + { + alignedMatType const Transform(1, 2, 3, 4, 5, 6, 7, 8, 9); + alignedMatType const Scale(0.01, 0.02, 0.03, 0.05, 0.01, 0.02, 0.03, 0.05, 0.01); + std::printf("- SIMD: %d us\n", launch_mat_mul_mat(SIMD, Transform, Scale, Samples)); + } for(std::size_t i = 0; i < Samples; ++i) { packedMatType const A = SISD[i]; packedMatType const B = SIMD[i]; - Error += glm::all(glm::equal(A, B, static_cast(0.001))) ? 0 : 1; + Error += percent_error(A, B, 0.01f) ? 0 : 1; } return Error; @@ -94,7 +143,6 @@ static int comp_mat3_mul_mat3(std::size_t Samples) template static int comp_mat4_mul_mat4(std::size_t Samples) { - typedef typename packedMatType::value_type T; int Error = 0; @@ -111,7 +159,7 @@ static int comp_mat4_mul_mat4(std::size_t Samples) { packedMatType const A = SISD[i]; packedMatType const B = SIMD[i]; - Error += glm::all(glm::equal(A, B, static_cast(0.001))) ? 0 : 1; + Error += percent_error(A, B, 0.01f) ? 0 : 1; } return Error; @@ -125,13 +173,13 @@ int main() std::printf("mat2 * mat2:\n"); Error += comp_mat2_mul_mat2(Samples); - + std::printf("dmat2 * dmat2:\n"); Error += comp_mat2_mul_mat2(Samples); std::printf("mat3 * mat3:\n"); Error += comp_mat3_mul_mat3(Samples); - + std::printf("dmat3 * dmat3:\n"); Error += comp_mat3_mul_mat3(Samples); diff --git a/test/perf/perf_matrix_mul_vector.cpp b/test/perf/perf_matrix_mul_vector.cpp index 89a77703..11117776 100644 --- a/test/perf/perf_matrix_mul_vector.cpp +++ b/test/perf/perf_matrix_mul_vector.cpp @@ -72,14 +72,19 @@ static int comp_mat3_mul_vec3(std::size_t Samples) int Error = 0; - packedMatType const Transform(1, 2, 3, 4, 5, 6, 7, 8, 9); - packedVecType const Scale(0.01, 0.02, 0.05); - std::vector SISD; - std::printf("- SISD: %d us\n", launch_mat_mul_vec(SISD, Transform, Scale, Samples)); + { + packedMatType const Transform(1, 2, 3, 4, 5, 6, 7, 8, 9); + packedVecType const Scale(0.01, 0.02, 0.05); + std::printf("- SISD: %d us\n", launch_mat_mul_vec(SISD, Transform, Scale, Samples)); + } std::vector SIMD; - std::printf("- SIMD: %d us\n", launch_mat_mul_vec(SIMD, Transform, Scale, Samples)); + { + alignedMatType const Transform(1, 2, 3, 4, 5, 6, 7, 8, 9); + alignedVecType const Scale(0.01, 0.02, 0.05); + std::printf("- SIMD: %d us\n", launch_mat_mul_vec(SIMD, Transform, Scale, Samples)); + } for(std::size_t i = 0; i < Samples; ++i) { @@ -125,9 +130,9 @@ int main() std::printf("mat2 * vec2:\n"); Error += comp_mat2_mul_vec2(Samples); - + std::printf("dmat2 * dvec2:\n"); - Error += comp_mat2_mul_vec2(Samples); + Error += comp_mat2_mul_vec2(Samples); std::printf("mat3 * vec3:\n"); Error += comp_mat3_mul_vec3(Samples); diff --git a/test/perf/perf_vector_mul_matrix.cpp b/test/perf/perf_vector_mul_matrix.cpp index 9f59f81c..55384e63 100644 --- a/test/perf/perf_vector_mul_matrix.cpp +++ b/test/perf/perf_vector_mul_matrix.cpp @@ -14,14 +14,30 @@ #include #include -template -static void test_vec_mul_mat(matType const& M, std::vector const& I, std::vector& O) -{ - for (std::size_t i = 0, n = I.size(); i < n; ++i) - O[i] = I[i] * M; -} +template +struct test_vec_mul_mat {}; template +struct test_vec_mul_mat< matType, vecType, false> +{ + void operator()(matType const& M, std::vector const& I, std::vector& O) + { + for (std::size_t i = 0, n = I.size(); i < n; ++i) + O[i] = I[i] * M; + } +}; + +template +struct test_vec_mul_mat< matType, vecType, true> +{ + void operator()(matType const& M, std::vector const& I, std::vector& O) + { + for (std::size_t i = 0, n = I.size(); i < n; ++i) + O[i] = M * I[i]; + } +}; + +template static int launch_vec_mul_mat(std::vector& O, matType const& Transform, vecType const& Scale, std::size_t Samples) { typedef typename matType::value_type T; @@ -29,17 +45,20 @@ static int launch_vec_mul_mat(std::vector& O, matType const& Transform, std::vector I(Samples); O.resize(Samples); + memset(I.data(), 0, I.size() * sizeof(vecType)); + for(std::size_t i = 0; i < Samples; ++i) I[i] = Scale * static_cast(i); std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now(); - test_vec_mul_mat(Transform, I, O); + test_vec_mul_mat fct; + fct(Transform, I, O); std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now(); return static_cast(std::chrono::duration_cast(t2 - t1).count()); } -template +template static int comp_vec2_mul_mat2(std::size_t Samples) { typedef typename packedMatType::value_type T; @@ -50,10 +69,10 @@ static int comp_vec2_mul_mat2(std::size_t Samples) packedVecType const Scale(0.01, 0.02); std::vector SISD; - std::printf("- SISD: %d us\n", launch_vec_mul_mat(SISD, Transform, Scale, Samples)); + std::printf("- SISD: %d us\n", launch_vec_mul_mat(SISD, Transform, Scale, Samples)); std::vector SIMD; - std::printf("- SIMD: %d us\n", launch_vec_mul_mat(SIMD, Transform, Scale, Samples)); + std::printf("- SIMD: %d us\n", launch_vec_mul_mat(SIMD, Transform, Scale, Samples)); for(std::size_t i = 0; i < Samples; ++i) { @@ -65,7 +84,7 @@ static int comp_vec2_mul_mat2(std::size_t Samples) return Error; } -template +template static int comp_vec3_mul_mat3(std::size_t Samples) { typedef typename packedMatType::value_type T; @@ -76,10 +95,10 @@ static int comp_vec3_mul_mat3(std::size_t Samples) packedVecType const Scale(0.01, 0.02, 0.05); std::vector SISD; - std::printf("- SISD: %d us\n", launch_vec_mul_mat(SISD, Transform, Scale, Samples)); + std::printf("- SISD: %d us\n", launch_vec_mul_mat(SISD, Transform, Scale, Samples)); std::vector SIMD; - std::printf("- SIMD: %d us\n", launch_vec_mul_mat(SIMD, Transform, Scale, Samples)); + std::printf("- SIMD: %d us\n", launch_vec_mul_mat(SIMD, Transform, Scale, Samples)); for(std::size_t i = 0; i < Samples; ++i) { @@ -91,7 +110,7 @@ static int comp_vec3_mul_mat3(std::size_t Samples) return Error; } -template +template static int comp_vec4_mul_mat4(std::size_t Samples) { typedef typename packedMatType::value_type T; @@ -102,10 +121,10 @@ static int comp_vec4_mul_mat4(std::size_t Samples) packedVecType const Scale(0.01, 0.02, 0.03, 0.05); std::vector SISD; - std::printf("- SISD: %d us\n", launch_vec_mul_mat(SISD, Transform, Scale, Samples)); + std::printf("- SISD: %d us\n", launch_vec_mul_mat(SISD, Transform, Scale, Samples)); std::vector SIMD; - std::printf("- SIMD: %d us\n", launch_vec_mul_mat(SIMD, Transform, Scale, Samples)); + std::printf("- SIMD: %d us\n", launch_vec_mul_mat(SIMD, Transform, Scale, Samples)); for(std::size_t i = 0; i < Samples; ++i) { @@ -124,22 +143,41 @@ int main() int Error = 0; std::printf("vec2 * mat2:\n"); - Error += comp_vec2_mul_mat2(Samples); - + Error += comp_vec2_mul_mat2(Samples); + std::printf("dvec2 * dmat2:\n"); - Error += comp_vec2_mul_mat2(Samples); + Error += comp_vec2_mul_mat2(Samples); std::printf("vec3 * mat3:\n"); - Error += comp_vec3_mul_mat3(Samples); - + Error += comp_vec3_mul_mat3(Samples); + std::printf("dvec3 * dmat3:\n"); - Error += comp_vec3_mul_mat3(Samples); + Error += comp_vec3_mul_mat3(Samples); std::printf("vec4 * mat4:\n"); - Error += comp_vec4_mul_mat4(Samples); + Error += comp_vec4_mul_mat4(Samples); std::printf("dvec4 * dmat4:\n"); - Error += comp_vec4_mul_mat4(Samples); + Error += comp_vec4_mul_mat4(Samples); + + + std::printf("mat2 * vec2:\n"); + Error += comp_vec2_mul_mat2(Samples); + + std::printf("dmat2 * dvec2 :\n"); + Error += comp_vec2_mul_mat2(Samples); + + std::printf("mat3 * vec3:\n"); + Error += comp_vec3_mul_mat3(Samples); + + std::printf("dmat3 * dvec3 :\n"); + Error += comp_vec3_mul_mat3(Samples); + + std::printf("mat4 * vec4 :\n"); + Error += comp_vec4_mul_mat4(Samples); + + std::printf("dmat4 * dvec4 :\n"); + Error += comp_vec4_mul_mat4(Samples); return Error; }