Clarify support for SSSE3, SSE4.1 and SSE4.2

This commit is contained in:
Christophe Riccio 2016-05-28 11:52:41 +02:00
parent 0ee3a79bfd
commit 71e6b537cc
2 changed files with 49 additions and 24 deletions

View File

@ -70,14 +70,16 @@
// NOTE(review): the right-hand definitions on the lines below expand to bare
// `bit | GLM_ARCH_xxx` expressions with no enclosing parentheses. Because `&`
// binds tighter than `|`, a use such as `GLM_ARCH & GLM_ARCH_SSSE3` parses as
// `(GLM_ARCH & 0x00000008) | GLM_ARCH_SSE3` and is non-zero whatever GLM_ARCH
// is. Consider wrapping every macro body in parentheses.
// NOTE(review): GLM_ARCH_AVX512 and GLM_ARCH_ARM are both given 0x00000100 --
// confirm whether sharing that bit is intended.
#define GLM_ARCH_PURE 0x00000000 #define GLM_ARCH_PURE 0x00000000
#define GLM_ARCH_X86 0x00000001 #define GLM_ARCH_X86 0x00000001
#define GLM_ARCH_SSE2 0x00000002 #define GLM_ARCH_SSE2 0x00000002 | GLM_ARCH_X86
#define GLM_ARCH_SSE3 0x00000004 #define GLM_ARCH_SSE3 0x00000004 | GLM_ARCH_SSE2
#define GLM_ARCH_SSE4 0x00000008 #define GLM_ARCH_SSSE3 0x00000008 | GLM_ARCH_SSE3
#define GLM_ARCH_AVX 0x00000010 #define GLM_ARCH_SSE41 0x00000010 | GLM_ARCH_SSSE3
#define GLM_ARCH_AVX2 0x00000020 #define GLM_ARCH_SSE42 0x00000020 | GLM_ARCH_SSE41
#define GLM_ARCH_AVX512 0x00000040 // Skylake subset #define GLM_ARCH_AVX 0x00000040 | GLM_ARCH_SSE42
#define GLM_ARCH_AVX2 0x00000080 | GLM_ARCH_AVX
#define GLM_ARCH_AVX512 0x00000100 | GLM_ARCH_AVX2 // Skylake subset
#define GLM_ARCH_ARM 0x00000100 #define GLM_ARCH_ARM 0x00000100
#define GLM_ARCH_NEON 0x00000200 #define GLM_ARCH_NEON 0x00000200 | GLM_ARCH_ARM
#define GLM_ARCH_MIPS 0x00010000 #define GLM_ARCH_MIPS 0x00010000
#define GLM_ARCH_PPC 0x01000000 #define GLM_ARCH_PPC 0x01000000
@ -88,31 +90,41 @@
#elif defined(GLM_FORCE_PPC) #elif defined(GLM_FORCE_PPC)
# define GLM_ARCH (GLM_ARCH_PPC) # define GLM_ARCH (GLM_ARCH_PPC)
#elif defined(GLM_FORCE_NEON) #elif defined(GLM_FORCE_NEON)
# define GLM_ARCH (GLM_ARCH_ARM | GLM_ARCH_NEON) # define GLM_ARCH (GLM_ARCH_NEON)
#elif defined(GLM_FORCE_AVX512) #elif defined(GLM_FORCE_AVX512)
# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_AVX512 | GLM_ARCH_AVX2 | GLM_ARCH_AVX | GLM_ARCH_SSE4 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2) # define GLM_ARCH (GLM_ARCH_AVX512)
#elif defined(GLM_FORCE_AVX2) #elif defined(GLM_FORCE_AVX2)
# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_AVX2 | GLM_ARCH_AVX | GLM_ARCH_SSE4 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2) # define GLM_ARCH (GLM_ARCH_AVX2)
#elif defined(GLM_FORCE_AVX) #elif defined(GLM_FORCE_AVX)
# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_AVX | GLM_ARCH_SSE4 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2) # define GLM_ARCH (GLM_ARCH_AVX)
#elif defined(GLM_FORCE_SSE4) #elif defined(GLM_FORCE_SSE42)
# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_SSE4 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2) # define GLM_ARCH (GLM_ARCH_SSE42)
#elif defined(GLM_FORCE_SSE41)
# define GLM_ARCH (GLM_ARCH_SSE41)
#elif defined(GLM_FORCE_SSSE3)
# define GLM_ARCH (GLM_ARCH_SSSE3)
#elif defined(GLM_FORCE_SSE3) #elif defined(GLM_FORCE_SSE3)
# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2) # define GLM_ARCH (GLM_ARCH_SSE3)
#elif defined(GLM_FORCE_SSE2) #elif defined(GLM_FORCE_SSE2)
# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_SSE2) # define GLM_ARCH (GLM_ARCH_SSE2)
#elif (GLM_COMPILER & (GLM_COMPILER_LLVM | GLM_COMPILER_GCC)) || ((GLM_COMPILER & GLM_COMPILER_INTEL) && (GLM_PLATFORM & GLM_PLATFORM_LINUX)) #elif (GLM_COMPILER & (GLM_COMPILER_LLVM | GLM_COMPILER_GCC)) || ((GLM_COMPILER & GLM_COMPILER_INTEL) && (GLM_PLATFORM & GLM_PLATFORM_LINUX))
// This is Skylake set of instruction set // This is Skylake set of instruction set
# if defined(__AVX512BW__) && defined(__AVX512F__) && defined(__AVX512CD__) && defined(__AVX512VL__) && defined(__AVX512DQ__) # if defined(__AVX512BW__) && defined(__AVX512F__) && defined(__AVX512CD__) && defined(__AVX512VL__) && defined(__AVX512DQ__)
# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_AVX512 | GLM_ARCH_AVX2 | GLM_ARCH_AVX | GLM_ARCH_SSE3 | GLM_ARCH_SSE2) # define GLM_ARCH (GLM_ARCH_AVX512)
# elif defined(__AVX2__) # elif defined(__AVX2__)
# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_AVX2 | GLM_ARCH_AVX | GLM_ARCH_SSE3 | GLM_ARCH_SSE2) # define GLM_ARCH (GLM_ARCH_AVX2)
# elif defined(__AVX__) # elif defined(__AVX__)
# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_AVX | GLM_ARCH_SSE3 | GLM_ARCH_SSE2) # define GLM_ARCH (GLM_ARCH_AVX)
# elif defined(__SSE4_2__)
# define GLM_ARCH (GLM_ARCH_SSE42)
# elif defined(__SSE4_1__)
# define GLM_ARCH (GLM_ARCH_SSE41)
# elif defined(__SSSE3__)
# define GLM_ARCH (GLM_ARCH_SSSE3)
# elif defined(__SSE3__) # elif defined(__SSE3__)
# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2) # define GLM_ARCH (GLM_ARCH_SSE3)
# elif defined(__SSE2__) # elif defined(__SSE2__)
# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_SSE2) # define GLM_ARCH (GLM_ARCH_SSE2)
# elif defined(__i386__) || defined(__x86_64__) # elif defined(__i386__) || defined(__x86_64__)
# define GLM_ARCH (GLM_ARCH_X86) # define GLM_ARCH (GLM_ARCH_X86)
# elif defined(__ARM_NEON) # elif defined(__ARM_NEON)
@ -130,14 +142,14 @@
# if defined(_M_ARM) # if defined(_M_ARM)
# define GLM_ARCH (GLM_ARCH_ARM) # define GLM_ARCH (GLM_ARCH_ARM)
# elif defined(__AVX2__) # elif defined(__AVX2__)
# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_AVX2 | GLM_ARCH_AVX | GLM_ARCH_SSE4 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2) # define GLM_ARCH (GLM_ARCH_AVX2)
# elif defined(__AVX__) # elif defined(__AVX__)
# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_AVX | GLM_ARCH_SSE4 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2) # define GLM_ARCH (GLM_ARCH_AVX)
# elif defined(_M_X64) # elif defined(_M_X64)
# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_SSE2) # define GLM_ARCH (GLM_ARCH_SSE2)
# elif defined(_M_IX86_FP) # elif defined(_M_IX86_FP)
# if _M_IX86_FP >= 2 # if _M_IX86_FP >= 2
# define GLM_ARCH (GLM_ARCH_X86 | GLM_ARCH_SSE2) # define GLM_ARCH (GLM_ARCH_SSE2)
# else # else
# define GLM_ARCH (GLM_ARCH_PURE) # define GLM_ARCH (GLM_ARCH_PURE)
# endif # endif

View File

@ -12,11 +12,24 @@ static const __m128 GLM_VAR_USED glm_three = _mm_set_ps1(3.0f);
static const __m128 GLM_VAR_USED glm_ps_2pow23 = _mm_set_ps1(8388608.0f); static const __m128 GLM_VAR_USED glm_ps_2pow23 = _mm_set_ps1(8388608.0f);
// abs: returns x with bit 31 of every 32-bit lane cleared, by AND-ing x with a
// vector whose four lanes are all 0x7FFFFFFF.
GLM_FUNC_QUALIFIER __m128 glm_f32v4_abs(__m128 x) GLM_FUNC_QUALIFIER __m128 glm_f32v4_abs(__m128 x)
{ {
return _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF))); return _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)));
} }
// Per-lane absolute value of four packed signed 32-bit integers.
//
// x: vector of four signed 32-bit lanes.
// Returns a vector whose lanes hold the magnitude of the matching lane of x.
GLM_FUNC_QUALIFIER __m128i glm_i32v4_abs(__m128i x)
{
	// Take the SSSE3 path only when *every* bit of GLM_ARCH_SSSE3 is present in
	// GLM_ARCH. A plain `GLM_ARCH & GLM_ARCH_SSSE3` is not a valid test: the
	// macro body is an unparenthesized `bit | lower-level-bits` expression, so
	// operator precedence makes the result non-zero for any GLM_ARCH, and even
	// when parenthesized the mask overlaps the SSE2/SSE3/x86 bits. Both operands
	// are parenthesized here so the check does not depend on how the macros are
	// written.
#	if ((GLM_ARCH) & (GLM_ARCH_SSSE3)) == (GLM_ARCH_SSSE3)
		// Applying the sign of each lane to itself negates negative lanes and
		// leaves the others as they are.
		return _mm_sign_epi32(x, x);
#	else
		// SSE2 fallback: (x ^ s) - s, where s is the lane's sign bit replicated
		// across the lane by an arithmetic right shift (all ones if negative,
		// zero otherwise).
		__m128i const sign_mask = _mm_srai_epi32(x, 31);
		__m128i const flipped = _mm_xor_si128(x, sign_mask);
		return _mm_sub_epi32(flipped, sign_mask);
#	endif
}
//sign //sign
GLM_FUNC_QUALIFIER __m128 glm_f32v4_sgn(__m128 x) GLM_FUNC_QUALIFIER __m128 glm_f32v4_sgn(__m128 x)
{ {