Proof of concept for vec4 SIMD specialization

This commit is contained in:
Christophe Riccio 2013-12-26 11:42:38 +01:00
parent 48fcbd6ec4
commit d9f5e07641
4 changed files with 92 additions and 31 deletions

View File

@ -48,7 +48,7 @@ GLM_FUNC_QUALIFIER __m128 sse_dst_ps(__m128 p0, __m128 p1)
//dot //dot
GLM_FUNC_QUALIFIER __m128 sse_dot_ps(__m128 v1, __m128 v2) GLM_FUNC_QUALIFIER __m128 sse_dot_ps(__m128 v1, __m128 v2)
{ {
# if((GLM_ARCH & GLM_ARCH_SSE4) == GLM_ARCH_SSE4) # if(GLM_ARCH & GLM_ARCH_AVX)
return _mm_dp_ps(v1, v2, 0xff); return _mm_dp_ps(v1, v2, 0xff);
# else # else
__m128 mul0 = _mm_mul_ps(v1, v2); __m128 mul0 = _mm_mul_ps(v1, v2);

View File

@ -520,6 +520,13 @@
((GLM_LANG & GLM_LANG_CXX0X_FLAG) && (GLM_COMPILER & GLM_COMPILER_GCC) && (GLM_COMPILER >= GLM_COMPILER_GCC44)) || \ ((GLM_LANG & GLM_LANG_CXX0X_FLAG) && (GLM_COMPILER & GLM_COMPILER_GCC) && (GLM_COMPILER >= GLM_COMPILER_GCC44)) || \
__has_feature(cxx_generalized_initializers)) __has_feature(cxx_generalized_initializers))
// N2544 Unrestricted unions
// Feature-test expression: non-zero when at least one of the following holds:
//  - GLM_LANG has the C++11 flag set;
//  - GLM_LANG has the CXXMS flag set;
//  - GLM_LANG has the C++0x flag set and the compiler is GCC with a version
//    value of at least GLM_COMPILER_GCC46;
//  - __has_feature(cxx_unrestricted_unions) evaluates to true.
#define GLM_HAS_UNRESTRICTED_UNIONS ( \
(GLM_LANG & GLM_LANG_CXX11_FLAG) || \
(GLM_LANG & GLM_LANG_CXXMS_FLAG) || \
((GLM_LANG & GLM_LANG_CXX0X_FLAG) && (GLM_COMPILER & GLM_COMPILER_GCC) && (GLM_COMPILER >= GLM_COMPILER_GCC46)) || \
__has_feature(cxx_unrestricted_unions))
// OpenMP // OpenMP
#ifdef _OPENMP #ifdef _OPENMP
# if(GLM_COMPILER & GLM_COMPILER_GCC) # if(GLM_COMPILER & GLM_COMPILER_GCC)
@ -545,14 +552,13 @@
///////////////// /////////////////
// Platform // Platform
// User defines: GLM_FORCE_PURE GLM_FORCE_SSE2 GLM_FORCE_AVX // User defines: GLM_FORCE_PURE GLM_FORCE_SSE2 GLM_FORCE_SSE3 GLM_FORCE_AVX GLM_FORCE_AVX2
#define GLM_ARCH_PURE 0x0000 #define GLM_ARCH_PURE 0x0000
#define GLM_ARCH_SSE2 0x0001 #define GLM_ARCH_SSE2 0x0001
#define GLM_ARCH_SSE3 0x0002// | GLM_ARCH_SSE2 #define GLM_ARCH_SSE3 0x0002// | GLM_ARCH_SSE2
#define GLM_ARCH_SSE4 0x0004// | GLM_ARCH_SSE3 | GLM_ARCH_SSE2 #define GLM_ARCH_AVX 0x0004// | GLM_ARCH_SSE3 | GLM_ARCH_SSE2
#define GLM_ARCH_AVX 0x0008// | GLM_ARCH_SSE4 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2 #define GLM_ARCH_AVX2 0x0008// | GLM_ARCH_AVX | GLM_ARCH_SSE3 | GLM_ARCH_SSE2
#define GLM_ARCH_AVX2 0x0010// | GLM_ARCH_AVX | GLM_ARCH_SSE4 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2
#if(defined(GLM_FORCE_PURE)) #if(defined(GLM_FORCE_PURE))
# define GLM_ARCH GLM_ARCH_PURE # define GLM_ARCH GLM_ARCH_PURE
@ -560,12 +566,22 @@
# define GLM_ARCH (GLM_ARCH_AVX2 | GLM_ARCH_AVX | GLM_ARCH_SSE3 | GLM_ARCH_SSE2) # define GLM_ARCH (GLM_ARCH_AVX2 | GLM_ARCH_AVX | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
#elif(defined(GLM_FORCE_AVX)) #elif(defined(GLM_FORCE_AVX))
# define GLM_ARCH (GLM_ARCH_AVX | GLM_ARCH_SSE3 | GLM_ARCH_SSE2) # define GLM_ARCH (GLM_ARCH_AVX | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
#elif(defined(GLM_FORCE_SSE4))
# define GLM_ARCH (GLM_ARCH_SSE4 | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
#elif(defined(GLM_FORCE_SSE3)) #elif(defined(GLM_FORCE_SSE3))
# define GLM_ARCH (GLM_ARCH_SSE3 | GLM_ARCH_SSE2) # define GLM_ARCH (GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
#elif(defined(GLM_FORCE_SSE2)) #elif(defined(GLM_FORCE_SSE2))
# define GLM_ARCH (GLM_ARCH_SSE2) # define GLM_ARCH (GLM_ARCH_SSE2)
#elif((GLM_COMPILER & GLM_COMPILER_CLANG) || (GLM_COMPILER & GLM_COMPILER_GCC))
# if(__AVX2__)
# define GLM_ARCH (GLM_ARCH_AVX2 | GLM_ARCH_AVX | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
# elif(__AVX__)
# define GLM_ARCH (GLM_ARCH_AVX | GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
# elif(__SSE3__)
# define GLM_ARCH (GLM_ARCH_SSE3 | GLM_ARCH_SSE2)
# elif(__SSE2__)
# define GLM_ARCH (GLM_ARCH_SSE2)
# else
# define GLM_ARCH GLM_ARCH_PURE
# endif
#elif((GLM_COMPILER & GLM_COMPILER_VC) && (defined(_M_IX86) || defined(_M_X64))) #elif((GLM_COMPILER & GLM_COMPILER_VC) && (defined(_M_IX86) || defined(_M_X64)))
# if(GLM_PLATFORM == GLM_PLATFORM_WINCE) # if(GLM_PLATFORM == GLM_PLATFORM_WINCE)
# define GLM_ARCH GLM_ARCH_PURE # define GLM_ARCH GLM_ARCH_PURE
@ -596,15 +612,6 @@
# else # else
# define GLM_ARCH GLM_ARCH_PURE # define GLM_ARCH GLM_ARCH_PURE
# endif # endif
#elif((GLM_PLATFORM & GLM_PLATFORM_APPLE) && (GLM_COMPILER & GLM_COMPILER_GCC))
# define GLM_ARCH GLM_ARCH_PURE
#elif(((GLM_COMPILER & GLM_COMPILER_GCC) && (defined(__i386__) || defined(__x86_64__))) || (GLM_COMPILER & GLM_COMPILER_LLVM_GCC))
# define GLM_ARCH (GLM_ARCH_PURE \
| (defined(__AVX2__) ? GLM_ARCH_AVX2 : 0) \
| (defined(__AVX__) ? GLM_ARCH_AVX : 0) \
| (defined(__SSE4__) ? GLM_ARCH_SSE4 : 0) \
| (defined(__SSE3__) ? GLM_ARCH_SSE3 : 0) \
| (defined(__SSE2__) ? GLM_ARCH_SSE2 : 0))
#else #else
# define GLM_ARCH GLM_ARCH_PURE # define GLM_ARCH GLM_ARCH_PURE
#endif #endif
@ -616,7 +623,6 @@
# include <intrin.h> # include <intrin.h>
#endif #endif
//#if(GLM_ARCH != GLM_ARCH_PURE)
#if(GLM_ARCH & GLM_ARCH_AVX2) #if(GLM_ARCH & GLM_ARCH_AVX2)
# include <immintrin.h> # include <immintrin.h>
#endif//GLM_ARCH #endif//GLM_ARCH
@ -639,22 +645,19 @@
inline __m128 _mm_castsi128_ps(__m128i PI) { union { __m128 ps; __m128i pi; } c; c.pi = PI; return c.ps; } inline __m128 _mm_castsi128_ps(__m128i PI) { union { __m128 ps; __m128i pi; } c; c.pi = PI; return c.ps; }
# endif # endif
#endif//GLM_ARCH #endif//GLM_ARCH
//#endif//(GLM_ARCH != GLM_ARCH_PURE)
#if(defined(GLM_MESSAGES) && !defined(GLM_MESSAGE_ARCH_DISPLAYED)) #if(defined(GLM_MESSAGES) && !defined(GLM_MESSAGE_ARCH_DISPLAYED))
# define GLM_MESSAGE_ARCH_DISPLAYED # define GLM_MESSAGE_ARCH_DISPLAYED
# if(GLM_ARCH == GLM_ARCH_PURE) # if(GLM_ARCH == GLM_ARCH_PURE)
# pragma message("GLM: Platform independent") # pragma message("GLM: Platform independent")
# elif(GLM_ARCH & GLM_ARCH_SSE2)
# pragma message("GLM: SSE2 instruction set")
# elif(GLM_ARCH & GLM_ARCH_SSE3)
# pragma message("GLM: SSE3 instruction set")
# elif(GLM_ARCH & GLM_ARCH_SSE4)
# pragma message("GLM: SSE4 instruction set")
# elif(GLM_ARCH & GLM_ARCH_AVX)
# pragma message("GLM: AVX instruction set")
# elif(GLM_ARCH & GLM_ARCH_AVX2) # elif(GLM_ARCH & GLM_ARCH_AVX2)
# pragma message("GLM: AVX2 instruction set") # pragma message("GLM: AVX2 instruction set")
# elif(GLM_ARCH & GLM_ARCH_AVX)
# pragma message("GLM: AVX instruction set")
# elif(GLM_ARCH & GLM_ARCH_SSE3)
# pragma message("GLM: SSE3 instruction set")
# elif(GLM_ARCH & GLM_ARCH_SSE2)
# pragma message("GLM: SSE2 instruction set")
# endif//GLM_ARCH # endif//GLM_ARCH
# pragma message("GLM: #define GLM_FORCE_PURE to avoid using platform specific instruction sets") # pragma message("GLM: #define GLM_FORCE_PURE to avoid using platform specific instruction sets")
#endif//GLM_MESSAGE #endif//GLM_MESSAGE

View File

@ -47,6 +47,28 @@
namespace glm{ namespace glm{
namespace detail namespace detail
{ {
// Type trait mapping a component type T to the storage type used for a
// group of four components. The generic case is a plain array of four T.
template <typename T>
struct simd
{
typedef T type[4];
};
// When the SSE2 bit is set in GLM_ARCH, float storage becomes one __m128.
# if(GLM_ARCH & GLM_ARCH_SSE2)
template <>
struct simd<float>
{
typedef __m128 type;
};
# endif
// When the AVX bit is set in GLM_ARCH, double storage becomes one __m256d.
# if(GLM_ARCH & GLM_ARCH_AVX)
template <>
struct simd<double>
{
typedef __m256d type;
};
# endif
template <typename T, precision P> template <typename T, precision P>
struct tvec4 struct tvec4
{ {
@ -86,10 +108,24 @@ namespace detail
_GLM_SWIZZLE4_4_MEMBERS(T, P, tvec4, s, t, p, q) _GLM_SWIZZLE4_4_MEMBERS(T, P, tvec4, s, t, p, q)
}; };
# else # else
# if(GLM_HAS_UNRESTRICTED_UNIONS)
union
{
typename simd<T>::type data;
struct
{
union { T x, r, s; }; union { T x, r, s; };
union { T y, g, t; }; union { T y, g, t; };
union { T z, b, p; }; union { T z, b, p; };
union { T w, a, q; }; union { T w, a, q; };
};
};
# else
union { T x, r, s; };
union { T y, g, t; };
union { T z, b, p; };
union { T w, a, q; };
# endif
# ifdef GLM_SWIZZLE # ifdef GLM_SWIZZLE
GLM_SWIZZLE_GEN_VEC_FROM_VEC4(T, P, detail::tvec4, detail::tvec2, detail::tvec3, detail::tvec4) GLM_SWIZZLE_GEN_VEC_FROM_VEC4(T, P, detail::tvec4, detail::tvec2, detail::tvec3, detail::tvec4)

View File

@ -13,8 +13,30 @@
#if(GLM_ARCH != GLM_ARCH_PURE) #if(GLM_ARCH != GLM_ARCH_PURE)
// Test-local aggregate of four floats whose storage is shared with one __m128.
struct value
{
	union
	{
		// The four components viewed as a single SSE register value.
		__m128 data;

		// The same storage viewed as named scalars; each component can be
		// reached through three interchangeable names.
		struct
		{
			union { float x, r, s; };
			union { float y, g, t; };
			union { float z, b, p; };
			union { float w, a, q; };
		};
	};

	// Stores the four arguments into x, y, z and w respectively.
	value(float x, float y, float z, float w)
	{
		this->x = x;
		this->y = y;
		this->z = z;
		this->w = w;
	}
};
int main() int main()
{ {
value Value(1.0, 0.5, 0.0, 0.7);
glm::simdVec4 A1(0.0f, 0.1f, 0.2f, 0.3f); glm::simdVec4 A1(0.0f, 0.1f, 0.2f, 0.3f);
glm::simdVec4 B1(0.4f, 0.5f, 0.6f, 0.7f); glm::simdVec4 B1(0.4f, 0.5f, 0.6f, 0.7f);
glm::simdVec4 C1 = A1 + B1; glm::simdVec4 C1 = A1 + B1;