Fixed SIMD code path selection

This commit is contained in:
Christophe Riccio 2016-05-28 17:54:37 +02:00
parent ab159770b3
commit 9f00ba86cb
15 changed files with 92 additions and 48 deletions

View File

@ -1,7 +1,7 @@
/// @ref core
/// @file glm/detail/func_common_simd.inl
#if GLM_ARCH & GLM_ARCH_SSE2
#if GLM_ARCH & GLM_ARCH_SSE2_FLAG
#include "../simd/common.h"
@ -135,4 +135,4 @@ namespace detail
}//namespace detail
}//namespace glm
#endif//GLM_ARCH & GLM_ARCH_SSE2
#endif//GLM_ARCH & GLM_ARCH_SSE2_FLAG

View File

@ -0,0 +1,9 @@
/// @ref core
/// @file glm/detail/func_exponential_simd.inl
namespace glm
{
	namespace detail
	{
		// Nothing is declared here: func_exponential_simd.inl currently
		// contributes no definitions to glm::detail.
	} // namespace detail
} // namespace glm

View File

@ -1,5 +1,7 @@
#include "../simd/geometric.h"
#if GLM_ARCH & GLM_ARCH_SSE2_FLAG
namespace glm{
namespace detail
{
@ -15,3 +17,4 @@ namespace detail
}//namespace detail
}//namespace glm
#endif//GLM_ARCH & GLM_ARCH_SSE2_FLAG

View File

@ -0,0 +1,9 @@
/// @ref core
/// @file glm/detail/func_packing_simd.inl
namespace glm
{
	namespace detail
	{
		// Nothing is declared here: func_packing_simd.inl currently
		// contributes no definitions to glm::detail.
	} // namespace detail
} // namespace glm

View File

@ -0,0 +1,9 @@
/// @ref core
/// @file glm/detail/func_vector_relational_simd.inl
namespace glm
{
	namespace detail
	{
		// Nothing is declared here: func_vector_relational_simd.inl currently
		// contributes no definitions to glm::detail.
	} // namespace detail
} // namespace glm

View File

@ -68,20 +68,34 @@
// User defines: GLM_FORCE_PURE GLM_FORCE_SSE2 GLM_FORCE_SSE3 GLM_FORCE_AVX GLM_FORCE_AVX2
#define GLM_ARCH_PURE 0x00000000
#define GLM_ARCH_X86 0x00000001
#define GLM_ARCH_SSE2 0x00000002 | GLM_ARCH_X86
#define GLM_ARCH_SSE3 0x00000004 | GLM_ARCH_SSE2
#define GLM_ARCH_SSSE3 0x00000008 | GLM_ARCH_SSE3
#define GLM_ARCH_SSE41 0x00000010 | GLM_ARCH_SSSE3
#define GLM_ARCH_SSE42 0x00000020 | GLM_ARCH_SSE41
#define GLM_ARCH_AVX 0x00000040 | GLM_ARCH_SSE42
#define GLM_ARCH_AVX2 0x00000080 | GLM_ARCH_AVX
#define GLM_ARCH_AVX512 0x00000100 | GLM_ARCH_AVX2 // Skylake subset
#define GLM_ARCH_ARM 0x00000100
#define GLM_ARCH_NEON 0x00000200 | GLM_ARCH_ARM
#define GLM_ARCH_MIPS 0x00010000
#define GLM_ARCH_PPC 0x01000000
// Single-bit feature flags.
// Each flag owns exactly one bit, so `GLM_ARCH & GLM_ARCH_<X>_FLAG` is non-zero
// only when feature <X> is part of the selected architecture mask.
// The ARM and NEON flags sit above the AVX512 flag: sharing 0x00000100 between
// AVX512 and ARM would make an AVX512 build pass the ARM test (and a NEON build
// pass the AVX512 test).
#define GLM_ARCH_X86_FLAG		0x00000001
#define GLM_ARCH_SSE2_FLAG		0x00000002
#define GLM_ARCH_SSE3_FLAG		0x00000004
#define GLM_ARCH_SSSE3_FLAG		0x00000008
#define GLM_ARCH_SSE41_FLAG		0x00000010
#define GLM_ARCH_SSE42_FLAG		0x00000020
#define GLM_ARCH_AVX_FLAG		0x00000040
#define GLM_ARCH_AVX2_FLAG		0x00000080
#define GLM_ARCH_AVX512_FLAG	0x00000100 // Skylake subset
#define GLM_ARCH_ARM_FLAG		0x00000200
#define GLM_ARCH_NEON_FLAG		0x00000400
#define GLM_ARCH_MIPS_FLAG		0x00010000
#define GLM_ARCH_PPC_FLAG		0x01000000

// Cumulative architecture masks.
// Each mask is its own flag OR'ed with the mask of the level it builds on, so a
// higher level also satisfies the flag test of every lower level in its chain.
// The parentheses keep the `|` from mis-associating when a mask is used inside
// a larger expression such as `GLM_ARCH & GLM_ARCH_SSE2`.
#define GLM_ARCH_PURE		(0x00000000)
#define GLM_ARCH_X86		(GLM_ARCH_X86_FLAG)
#define GLM_ARCH_SSE2		(GLM_ARCH_SSE2_FLAG | GLM_ARCH_X86)
#define GLM_ARCH_SSE3		(GLM_ARCH_SSE3_FLAG | GLM_ARCH_SSE2)
#define GLM_ARCH_SSSE3		(GLM_ARCH_SSSE3_FLAG | GLM_ARCH_SSE3)
#define GLM_ARCH_SSE41		(GLM_ARCH_SSE41_FLAG | GLM_ARCH_SSSE3)
#define GLM_ARCH_SSE42		(GLM_ARCH_SSE42_FLAG | GLM_ARCH_SSE41)
#define GLM_ARCH_AVX		(GLM_ARCH_AVX_FLAG | GLM_ARCH_SSE42)
#define GLM_ARCH_AVX2		(GLM_ARCH_AVX2_FLAG | GLM_ARCH_AVX)
#define GLM_ARCH_AVX512		(GLM_ARCH_AVX512_FLAG | GLM_ARCH_AVX2) // Skylake subset
#define GLM_ARCH_ARM		(GLM_ARCH_ARM_FLAG)
#define GLM_ARCH_NEON		(GLM_ARCH_NEON_FLAG | GLM_ARCH_ARM)
#define GLM_ARCH_MIPS		(GLM_ARCH_MIPS_FLAG)
#define GLM_ARCH_PPC		(GLM_ARCH_PPC_FLAG)
#if defined(GLM_FORCE_PURE)
# define GLM_ARCH GLM_ARCH_PURE

View File

@ -49,7 +49,7 @@ namespace detail
};
# endif
# if (GLM_ARCH & GLM_ARCH_AVX)
# if (GLM_ARCH & GLM_ARCH_AVX_FLAG)
template <>
struct simd_data<double>
{
@ -57,7 +57,7 @@ namespace detail
};
# endif
# if (GLM_ARCH & GLM_ARCH_AVX2)
# if (GLM_ARCH & GLM_ARCH_AVX2_FLAG)
template <>
struct simd_data<int64>
{

View File

@ -1,7 +1,7 @@
/// @ref core
/// @file glm/detail/type_tvec4_simd.inl
#if GLM_ARCH & GLM_ARCH_SSE2
#if GLM_ARCH & GLM_ARCH_SSE2_FLAG
namespace glm{
namespace detail
@ -72,7 +72,7 @@ namespace detail
}
};
# if GLM_ARCH & GLM_ARCH_AVX2
# if GLM_ARCH & GLM_ARCH_AVX2_FLAG
template <typename T, precision P>
struct compute_vec4_and<T, P, true, 64>
{
@ -96,7 +96,7 @@ namespace detail
}
};
# if GLM_ARCH & GLM_ARCH_AVX2
# if GLM_ARCH & GLM_ARCH_AVX2_FLAG
template <typename T, precision P>
struct compute_vec4_or<T, P, true, 64>
{
@ -120,7 +120,7 @@ namespace detail
}
};
# if GLM_ARCH & GLM_ARCH_AVX2
# if GLM_ARCH & GLM_ARCH_AVX2_FLAG
template <typename T, precision P>
struct compute_vec4_xor<T, P, true, 64>
{
@ -144,7 +144,7 @@ namespace detail
}
};
# if GLM_ARCH & GLM_ARCH_AVX2
# if GLM_ARCH & GLM_ARCH_AVX2_FLAG
template <typename T, precision P>
struct compute_vec4_shift_left<T, P, true, 64>
{
@ -168,7 +168,7 @@ namespace detail
}
};
# if GLM_ARCH & GLM_ARCH_AVX2
# if GLM_ARCH & GLM_ARCH_AVX2_FLAG
template <typename T, precision P>
struct compute_vec4_shift_right<T, P, true, 64>
{
@ -192,7 +192,7 @@ namespace detail
}
};
# if GLM_ARCH & GLM_ARCH_AVX2
# if GLM_ARCH & GLM_ARCH_AVX2_FLAG
template <typename T, precision P>
struct compute_vec4_bitwise_not<T, P, true, 64>
{
@ -248,7 +248,7 @@ namespace detail
data(_mm_set1_ps(s))
{}
# if GLM_ARCH & GLM_ARCH_AVX
# if GLM_ARCH & GLM_ARCH_AVX_FLAG
template <>
GLM_FUNC_QUALIFIER GLM_CONSTEXPR_SIMD tvec4<double, lowp>::tvec4(double s) :
data(_mm256_set1_pd(s))
@ -280,7 +280,7 @@ namespace detail
data(_mm_set1_epi32(s))
{}
# if GLM_ARCH & GLM_ARCH_AVX2
# if GLM_ARCH & GLM_ARCH_AVX2_FLAG
template <>
GLM_FUNC_QUALIFIER GLM_CONSTEXPR_SIMD tvec4<int64, lowp>::tvec4(int64 s) :
data(_mm256_set1_epi64x(s))
@ -350,4 +350,4 @@ namespace detail
*/
}//namespace glm
#endif//GLM_ARCH & GLM_ARCH_SSE2
#endif//GLM_ARCH & GLM_ARCH_SSE2_FLAG

View File

@ -3,12 +3,12 @@
#pragma once
#if GLM_ARCH & GLM_ARCH_SSE2
#if GLM_ARCH & GLM_ARCH_SSE2_FLAG
//mad
GLM_FUNC_QUALIFIER __m128 glm_f32v1_mad(__m128 a, __m128 b, __m128 c)
{
# if GLM_ARCH & GLM_ARCH_AVX2
# if GLM_ARCH & GLM_ARCH_AVX2_FLAG
return _mm_fmadd_ss(a, b, c);
# else
return _mm_add_ss(_mm_mul_ss(a, b), c);
@ -18,7 +18,7 @@ GLM_FUNC_QUALIFIER __m128 glm_f32v1_mad(__m128 a, __m128 b, __m128 c)
//mad
GLM_FUNC_QUALIFIER __m128 glm_f32v4_mad(__m128 a, __m128 b, __m128 c)
{
# if GLM_ARCH & GLM_ARCH_AVX2
# if GLM_ARCH & GLM_ARCH_AVX2_FLAG
return _mm_fmadd_ps(a, b, c);
# else
return _mm_add_ps(_mm_mul_ps(a, b), c);
@ -33,7 +33,7 @@ GLM_FUNC_QUALIFIER __m128 glm_f32v4_abs(__m128 x)
GLM_FUNC_QUALIFIER __m128i glm_i32v4_abs(__m128i x)
{
# if GLM_ARCH & GLM_ARCH_SSSE3
# if GLM_ARCH & GLM_ARCH_SSSE3_FLAG
return _mm_sign_epi32(x, x);
# else
__m128i const sgn0 = _mm_srai_epi32(x, 31);
@ -202,4 +202,4 @@ GLM_FUNC_QUALIFIER __m128 glm_f32v4_sqrt_wip(__m128 x)
return Mul3;
}
#endif//GLM_ARCH & GLM_ARCH_SSE2
#endif//GLM_ARCH & GLM_ARCH_SSE2_FLAG

View File

@ -5,13 +5,13 @@
#include "common.h"
#if GLM_ARCH & GLM_ARCH_SSE2
#if GLM_ARCH & GLM_ARCH_SSE2_FLAG
GLM_FUNC_QUALIFIER __m128 glm_f32v4_dot(__m128 v1, __m128 v2)
{
# if GLM_ARCH & GLM_ARCH_AVX
# if GLM_ARCH & GLM_ARCH_AVX_FLAG
return _mm_dp_ps(v1, v2, 0xff);
# elif GLM_ARCH & GLM_ARCH_SSE3
# elif GLM_ARCH & GLM_ARCH_SSE3_FLAG
__m128 const Mul0 = _mm_mul_ps(v1, v2);
__m128 const Hadd0 = _mm_hadd_ps(Mul0, Mul0);
__m128 const Hadd1 = _mm_hadd_ps(Hadd0, Hadd0);
@ -28,9 +28,9 @@ GLM_FUNC_QUALIFIER __m128 glm_f32v4_dot(__m128 v1, __m128 v2)
GLM_FUNC_QUALIFIER __m128 glm_f32v1_dot(__m128 v1, __m128 v2)
{
# if GLM_ARCH & GLM_ARCH_AVX
# if GLM_ARCH & GLM_ARCH_AVX_FLAG
return _mm_dp_ps(v1, v2, 0xff);
# elif GLM_ARCH & GLM_ARCH_SSE3
# elif GLM_ARCH & GLM_ARCH_SSE3_FLAG
__m128 const mul0 = _mm_mul_ps(v1, v2);
__m128 const had0 = _mm_hadd_ps(mul0, mul0);
__m128 const had1 = _mm_hadd_ps(had0, had0);

View File

@ -3,7 +3,7 @@
#pragma once
#if GLM_ARCH & GLM_ARCH_SSE2
#if GLM_ARCH & GLM_ARCH_SSE2_FLAG
GLM_FUNC_QUALIFIER __m128i glm_i128_interleave(__m128i x)
{
@ -112,4 +112,4 @@ GLM_FUNC_QUALIFIER __m128i glm_i128_interleave2(__m128i x, __m128i y)
return Reg1;
}
#endif//GLM_ARCH & GLM_ARCH_SSE2
#endif//GLM_ARCH & GLM_ARCH_SSE2_FLAG

View File

@ -5,7 +5,7 @@
#include "geometric.h"
#if GLM_ARCH & GLM_ARCH_SSE2
#if GLM_ARCH & GLM_ARCH_SSE2_FLAG
static const __m128 GLM_VAR_USED _m128_rad_ps = _mm_set_ps1(3.141592653589793238462643383279f / 180.f);
static const __m128 GLM_VAR_USED _m128_deg_ps = _mm_set_ps1(180.f / 3.141592653589793238462643383279f);
@ -1029,4 +1029,4 @@ GLM_FUNC_QUALIFIER void glm_f32m4_outer(__m128 const & c, __m128 const & r, __m1
out[3] = _mm_mul_ps(c, _mm_shuffle_ps(r, r, _MM_SHUFFLE(3, 3, 3, 3)));
}
#endif//GLM_ARCH & GLM_ARCH_SSE2
#endif//GLM_ARCH & GLM_ARCH_SSE2_FLAG

View File

@ -3,6 +3,6 @@
#pragma once
#if GLM_ARCH & GLM_ARCH_SSE2
#if GLM_ARCH & GLM_ARCH_SSE2_FLAG
#endif//GLM_ARCH & GLM_ARCH_SSE2
#endif//GLM_ARCH & GLM_ARCH_SSE2_FLAG

View File

@ -3,7 +3,7 @@
#pragma once
#if GLM_ARCH & GLM_ARCH_SSE2
#if GLM_ARCH & GLM_ARCH_SSE2_FLAG
#endif//GLM_ARCH & GLM_ARCH_SSE2
#endif//GLM_ARCH & GLM_ARCH_SSE2_FLAG

View File

@ -3,6 +3,6 @@
#pragma once
#if GLM_ARCH & GLM_ARCH_SSE2
#if GLM_ARCH & GLM_ARCH_SSE2_FLAG
#endif//GLM_ARCH & GLM_ARCH_SSE2
#endif//GLM_ARCH & GLM_ARCH_SSE2_FLAG