diff --git a/glm/detail/setup.hpp b/glm/detail/setup.hpp
index 0f8bb75a..b219af9f 100644
--- a/glm/detail/setup.hpp
+++ b/glm/detail/setup.hpp
@@ -70,14 +70,18 @@
 
 #define GLM_ARCH_PURE 0x00000000
 #define GLM_ARCH_X86 0x00000001
-#define GLM_ARCH_SSE2 0x00000002
-#define GLM_ARCH_SSE3 0x00000004
-#define GLM_ARCH_SSE4 0x00000008
-#define GLM_ARCH_AVX 0x00000010
-#define GLM_ARCH_AVX2 0x00000020
-#define GLM_ARCH_AVX512 0x00000040 // Skylake subset
+// Every SIMD mask below is cumulative: its own bit OR'ed with the mask it extends.
+// Test a level with (GLM_ARCH & GLM_ARCH_xxx) == GLM_ARCH_xxx; a bare `&` also matches lower levels.
+#define GLM_ARCH_SSE2 (0x00000002 | GLM_ARCH_X86)
+#define GLM_ARCH_SSE3 (0x00000004 | GLM_ARCH_SSE2)
+#define GLM_ARCH_SSSE3 (0x00000008 | GLM_ARCH_SSE3)
+#define GLM_ARCH_SSE41 (0x00000010 | GLM_ARCH_SSSE3)
+#define GLM_ARCH_SSE42 (0x00000020 | GLM_ARCH_SSE41)
+#define GLM_ARCH_AVX (0x00000040 | GLM_ARCH_SSE42)
+#define GLM_ARCH_AVX2 (0x00000080 | GLM_ARCH_AVX)
+#define GLM_ARCH_AVX512 (0x00000400 | GLM_ARCH_AVX2) // Skylake subset; own bit, not shared with GLM_ARCH_ARM
 #define GLM_ARCH_ARM 0x00000100
-#define GLM_ARCH_NEON 0x00000200
+#define GLM_ARCH_NEON (0x00000200 | GLM_ARCH_ARM)
 #define GLM_ARCH_MIPS 0x00010000
 #define GLM_ARCH_PPC 0x01000000
 
@@ -88,31 +92,41 @@
 #elif defined(GLM_FORCE_PPC)
 # define GLM_ARCH (GLM_ARCH_PPC)
 #elif defined(GLM_FORCE_NEON)
-# define GLM_ARCH (GLM_ARCH_ARM | GLM_ARCH_NEON)
+# define GLM_ARCH (GLM_ARCH_NEON)
 #elif defined(GLM_FORCE_AVX512)
-# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_AVX512 | GLM_ARCH_AVX2 | GLM_ARCH_AVX | GLM_ARCH_SSE4 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
+# define GLM_ARCH (GLM_ARCH_AVX512)
 #elif defined(GLM_FORCE_AVX2)
-# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_AVX2 | GLM_ARCH_AVX | GLM_ARCH_SSE4 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
+# define GLM_ARCH (GLM_ARCH_AVX2)
 #elif defined(GLM_FORCE_AVX)
-# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_AVX | GLM_ARCH_SSE4 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
-#elif defined(GLM_FORCE_SSE4)
-# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_SSE4 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
+# define GLM_ARCH (GLM_ARCH_AVX)
+#elif defined(GLM_FORCE_SSE42)
+# define GLM_ARCH (GLM_ARCH_SSE42)
+#elif defined(GLM_FORCE_SSE41)
+# define GLM_ARCH (GLM_ARCH_SSE41)
+#elif defined(GLM_FORCE_SSSE3)
+# define GLM_ARCH (GLM_ARCH_SSSE3)
 #elif defined(GLM_FORCE_SSE3)
-# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
+# define GLM_ARCH (GLM_ARCH_SSE3)
 #elif defined(GLM_FORCE_SSE2)
-# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_SSE2)
+# define GLM_ARCH (GLM_ARCH_SSE2)
 #elif (GLM_COMPILER & (GLM_COMPILER_LLVM | GLM_COMPILER_GCC)) || ((GLM_COMPILER & GLM_COMPILER_INTEL) && (GLM_PLATFORM & GLM_PLATFORM_LINUX))
 // This is Skylake set of instruction set
 # if defined(__AVX512BW__) && defined(__AVX512F__) && defined(__AVX512CD__) && defined(__AVX512VL__) && defined(__AVX512DQ__)
-# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_AVX512 | GLM_ARCH_AVX2 | GLM_ARCH_AVX | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
+# define GLM_ARCH (GLM_ARCH_AVX512)
 # elif defined(__AVX2__)
-# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_AVX2 | GLM_ARCH_AVX | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
+# define GLM_ARCH (GLM_ARCH_AVX2)
 # elif defined(__AVX__)
-# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_AVX | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
+# define GLM_ARCH (GLM_ARCH_AVX)
+# elif defined(__SSE4_2__)
+# define GLM_ARCH (GLM_ARCH_SSE42)
+# elif defined(__SSE4_1__)
+# define GLM_ARCH (GLM_ARCH_SSE41)
+# elif defined(__SSSE3__)
+# define GLM_ARCH (GLM_ARCH_SSSE3)
 # elif defined(__SSE3__)
-# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
+# define GLM_ARCH (GLM_ARCH_SSE3)
 # elif defined(__SSE2__)
-# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_SSE2)
+# define GLM_ARCH (GLM_ARCH_SSE2)
 # elif defined(__i386__) || defined(__x86_64__)
 # define GLM_ARCH (GLM_ARCH_X86)
 # elif defined(__ARM_NEON)
@@ -130,14 +144,14 @@
 # if defined(_M_ARM)
 # define GLM_ARCH (GLM_ARCH_ARM)
 # elif defined(__AVX2__)
-# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_AVX2 | GLM_ARCH_AVX | GLM_ARCH_SSE4 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
+# define GLM_ARCH (GLM_ARCH_AVX2)
 # elif defined(__AVX__)
-# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_AVX | GLM_ARCH_SSE4 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
+# define GLM_ARCH (GLM_ARCH_AVX)
 # elif defined(_M_X64)
-# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_SSE2)
+# define GLM_ARCH (GLM_ARCH_SSE2)
 # elif defined(_M_IX86_FP)
 # if _M_IX86_FP >= 2
-# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_SSE2)
+# define GLM_ARCH (GLM_ARCH_SSE2)
 # else
 # define GLM_ARCH (GLM_ARCH_PURE)
 # endif
diff --git a/glm/simd/common.h b/glm/simd/common.h
index 621e8576..2b9a823e 100644
--- a/glm/simd/common.h
+++ b/glm/simd/common.h
@@ -12,11 +12,29 @@
 static const __m128 GLM_VAR_USED glm_three = _mm_set_ps1(3.0f);
 static const __m128 GLM_VAR_USED glm_ps_2pow23 = _mm_set_ps1(8388608.0f);
 
+//abs
 GLM_FUNC_QUALIFIER __m128 glm_f32v4_abs(__m128 x)
 {
 	return _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)));
 }
 
+GLM_FUNC_QUALIFIER __m128i glm_i32v4_abs(__m128i x)
+{
+	// Per-lane |x| of four signed 32-bit integers. GLM_ARCH_SSSE3 is a cumulative
+	// mask, so compare against the whole mask: a bare `&` is also non-zero for
+	// targets that only share its lower bits.
+# if (GLM_ARCH & GLM_ARCH_SSSE3) == GLM_ARCH_SSSE3
+	return _mm_sign_epi32(x, x);
+# else
+	// SSE2 fallback: sgn0 is all ones in negative lanes and zero elsewhere,
+	// so (x ^ sgn0) - sgn0 negates exactly the negative lanes.
+	__m128i const sgn0 = _mm_srai_epi32(x, 31);
+	__m128i const inv0 = _mm_xor_si128(x, sgn0);
+	__m128i const sub0 = _mm_sub_epi32(inv0, sgn0);
+	return sub0;
+# endif
+}
+
 //sign
 GLM_FUNC_QUALIFIER __m128 glm_f32v4_sgn(__m128 x)
 {