glm/wip/simd/sse_mat4.h
2010-04-14 13:27:44 +01:00

1281 lines
28 KiB
C++
Raw Blame History

#ifdef GLM_SSE_MAT4_H
#define GLM_SSE_MAT4_H
#include <xmmintrin.h>
#include <emmintrin.h>
#include "sse_vec4.h"
namespace glm{
namespace sse{
// Single-lane selection masks: each constant has all 32 bits set in exactly one
// of the four 32-bit lanes and zero in the others.
// _mm_set_epi32 takes its arguments from the highest lane down to the lowest,
// so maskX covers lane 0 (lowest), maskY lane 1, maskZ lane 2 and maskW lane 3.
const __m128i maskX = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF);
const __m128i maskY = _mm_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000);
const __m128i maskZ = _mm_set_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000);
const __m128i maskW = _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000);
// 4x4 single-precision matrix held as four 128-bit SSE vectors.
// The free operator*(const mat4&, const vec4&) in this header combines the
// data[i] vectors as columns (result = sum of data[i] * v[i]).
GLM_ALIGN(16) struct mat4
{
    // Tag type selecting the constructor that leaves the storage untouched.
    enum ENoInit
    {
        NO_INIT
    };

    union
    {
        __m128 data[4];
    };

    // All sixteen elements set to zero.
    mat4();
    // Storage deliberately left uninitialized.
    mat4(ENoInit NoInit);
    // s on the diagonal, zero everywhere else.
    mat4(float s);
    // x, y, z and w are copied into data[0], data[1], data[2] and data[3].
    mat4(const vec4& x, const vec4& y, const vec4& z, const vec4& w);

    mat4& operator+=(const float s);
    mat4& operator+=(const mat4& m);
    mat4& operator*=(const mat4& m);

    // Transposes this matrix in place.
    // Declared without a `mat4::` prefix: qualifying a member's name inside its
    // own class body is not valid standard C++.
    void transpose();
};
// Default constructor: clears all sixteen elements to 0.0f.
__forceinline mat4::mat4()
{
    const __m128 zero = _mm_setzero_ps();

    data[0] = zero;
    data[1] = zero;
    data[2] = zero;
    data[3] = zero;
}
// Tag constructor: performs no work, so data[] is left uninitialized.
// Intended for callers that overwrite every element straight away, as the
// operators in this header do with mat4::NO_INIT. The NoInit argument is unused.
__forceinline mat4::mat4(ENoInit NoInit)
{}
// Scalar constructor: builds s * identity, i.e. s on the main diagonal and
// 0.0f in every other element.
__forceinline mat4::mat4(float s)
{
    // _mm_set_ps lists lanes from highest to lowest, so the diagonal element
    // moves one position to the left on each successive vector.
    data[0] = _mm_set_ps(0.0f, 0.0f, 0.0f, s);
    data[1] = _mm_set_ps(0.0f, 0.0f, s, 0.0f);
    data[2] = _mm_set_ps(0.0f, s, 0.0f, 0.0f);
    data[3] = _mm_set_ps(s, 0.0f, 0.0f, 0.0f);
}
// Builds the matrix from four vectors: 16 bytes are copied from each of x, y, z
// and w into data[0], data[1], data[2] and data[3] respectively.
// Every transfer uses movaps, which faults on addresses that are not 16-byte
// aligned, so this object and all four arguments must be 16-byte aligned.
// NOTE(review): assumes vec4 (declared in sse_vec4.h) starts with four packed
// floats — confirm against that header.
__forceinline mat4::mat4(const vec4& x, const vec4& y, const vec4& z, const vec4& w)
{
void* address = this;
__asm
{
// esi = destination (this matrix); edi = address of the current source vector
mov esi, address
mov edi, x
movaps xmm0, [edi]
movaps [esi+0x00], xmm0
mov edi, y
movaps xmm0, [edi]
movaps [esi+0x10], xmm0
mov edi, z
movaps xmm0, [edi]
movaps [esi+0x20], xmm0
mov edi, w
movaps xmm0, [edi]
movaps [esi+0x30], xmm0
}
}
// Returns a new matrix in which every element is the matching element of m
// plus s.
//
// The sums are written to a local result that is returned; the const argument
// m is only read.
__forceinline mat4 operator+(const mat4& m, float s)
{
    mat4 result(mat4::NO_INIT);
    const __m128 scalar = _mm_set1_ps(s); // s in all four lanes

    result.data[0] = _mm_add_ps(m.data[0], scalar);
    result.data[1] = _mm_add_ps(m.data[1], scalar);
    result.data[2] = _mm_add_ps(m.data[2], scalar);
    result.data[3] = _mm_add_ps(m.data[3], scalar);

    return result;
}
// Scalar-on-the-left addition: forwards to operator+(const mat4&, float) with
// the operands swapped.
__forceinline mat4 operator+(float s, const mat4& m)
{
    return operator+(m, s);
}
// Returns a new matrix in which every element is the matching element of m
// minus s.
//
// The differences are written to a local result that is returned; the const
// argument m is only read.
__forceinline mat4 operator-(const mat4& m, float s)
{
    mat4 result(mat4::NO_INIT);
    const __m128 scalar = _mm_set1_ps(s); // s in all four lanes

    result.data[0] = _mm_sub_ps(m.data[0], scalar);
    result.data[1] = _mm_sub_ps(m.data[1], scalar);
    result.data[2] = _mm_sub_ps(m.data[2], scalar);
    result.data[3] = _mm_sub_ps(m.data[3], scalar);

    return result;
}
// Returns a new matrix in which every element is s minus the matching element
// of m (the scalar is the left-hand operand of each subtraction).
//
// The differences are written to a local result that is returned; the const
// argument m is only read.
__forceinline mat4 operator-(float s, const mat4& m)
{
    mat4 result(mat4::NO_INIT);
    const __m128 scalar = _mm_set1_ps(s); // s in all four lanes

    result.data[0] = _mm_sub_ps(scalar, m.data[0]);
    result.data[1] = _mm_sub_ps(scalar, m.data[1]);
    result.data[2] = _mm_sub_ps(scalar, m.data[2]);
    result.data[3] = _mm_sub_ps(scalar, m.data[3]);

    return result;
}
// Returns a new matrix in which every element is the matching element of m
// multiplied by s.
//
// The products are written to a local result that is returned; the const
// argument m is only read.
__forceinline mat4 operator*(const mat4& m, float s)
{
    mat4 result(mat4::NO_INIT);
    const __m128 scalar = _mm_set1_ps(s); // s in all four lanes

    result.data[0] = _mm_mul_ps(m.data[0], scalar);
    result.data[1] = _mm_mul_ps(m.data[1], scalar);
    result.data[2] = _mm_mul_ps(m.data[2], scalar);
    result.data[3] = _mm_mul_ps(m.data[3], scalar);

    return result;
}
// Scalar-on-the-left scaling: forwards to operator*(const mat4&, float) with
// the operands swapped.
__forceinline mat4 operator*(float s, const mat4& m)
{
    return operator*(m, s);
}
// Returns a new matrix in which every element is the matching element of m
// divided by s. No check is made for s == 0.
//
// The quotients are written to a local result that is returned; the const
// argument m is only read.
__forceinline mat4 operator/(const mat4& m, float s)
{
    mat4 result(mat4::NO_INIT);
    const __m128 scalar = _mm_set1_ps(s); // s in all four lanes

    result.data[0] = _mm_div_ps(m.data[0], scalar);
    result.data[1] = _mm_div_ps(m.data[1], scalar);
    result.data[2] = _mm_div_ps(m.data[2], scalar);
    result.data[3] = _mm_div_ps(m.data[3], scalar);

    return result;
}
// Returns a new matrix in which every element is s divided by the matching
// element of m (the scalar is the dividend). No check is made for zero
// elements in m.
//
// The quotients are written to a local result that is returned; the const
// argument m is only read.
__forceinline mat4 operator/(float s, const mat4& m)
{
    mat4 result(mat4::NO_INIT);
    const __m128 scalar = _mm_set1_ps(s); // s in all four lanes

    result.data[0] = _mm_div_ps(scalar, m.data[0]);
    result.data[1] = _mm_div_ps(scalar, m.data[1]);
    result.data[2] = _mm_div_ps(scalar, m.data[2]);
    result.data[3] = _mm_div_ps(scalar, m.data[3]);

    return result;
}
// Matrix * vector: returns
//   m.data[0]*v[0] + m.data[1]*v[1] + m.data[2]*v[2] + m.data[3]*v[3]
// so each data[i] acts as column i of the matrix.
// Each pshufd broadcasts one lane of v into all four lanes of xmm5; pshufd is an
// SSE2 instruction, hence the "SSE2" marker below.
// All loads and the final store use movaps, which requires 16-byte alignment.
// NOTE(review): assumes vec4 (declared in sse_vec4.h) is four packed floats — confirm.
__forceinline vec4 operator*(const mat4& m, const vec4& v)
{
vec4 result(vec4::NO_INIT);
// SSE2
__asm
{
mov esi, [m]
mov edi, [v]
// xmm4 = v, xmm0..xmm3 = data[0..3]
movaps xmm4, [edi]
movaps xmm0, [esi+0x00]
movaps xmm1, [esi+0x10]
movaps xmm2, [esi+0x20]
movaps xmm3, [esi+0x30]
// scale each column by the matching component of v
pshufd xmm5, xmm4, _MM_SHUFFLE(0, 0, 0, 0)
mulps xmm0, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(1, 1, 1, 1)
mulps xmm1, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(2, 2, 2, 2)
mulps xmm2, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(3, 3, 3, 3)
mulps xmm3, xmm5
// sum the four scaled columns as (c0 + c1) + (c2 + c3)
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movaps result, xmm0
}
return result;
}
// Vector * matrix: lane i of the result is the dot product of v with m.data[i].
// Uses SSE1 instructions only: the four element-wise products are reduced with
// movhlps/movlhps/shufps instead of a horizontal-add instruction.
// Register comments list lanes from highest to lowest.
// All loads and the final store use movaps, which requires 16-byte alignment.
// NOTE(review): assumes vec4 (declared in sse_vec4.h) is four packed floats — confirm.
__forceinline vec4 operator*(const vec4& v, const mat4& m)
{
vec4 result(vec4::NO_INIT);
// SSE1
__asm
{
mov esi, [m]
mov edi, [v]
movaps xmm4, [edi] // xmm4: v[3], v[2], v[1], v[0]
movaps xmm0, [esi+0x00] // xmm0: m[0][3], m[0][2], m[0][1], m[0][0]
movaps xmm1, [esi+0x10] // xmm1: m[1][3], m[1][2], m[1][1], m[1][0]
movaps xmm2, [esi+0x20] // xmm2: m[2][3], m[2][2], m[2][1], m[2][0]
movaps xmm3, [esi+0x30] // xmm3: m[3][3], m[3][2], m[3][1], m[3][0]
mulps xmm0, xmm4 // xmm0: m[0][3]*v[3], m[0][2]*v[2], m[0][1]*v[1], m[0][0]*v[0]
mulps xmm1, xmm4 // xmm1: m[1][3]*v[3], m[1][2]*v[2], m[1][1]*v[1], m[1][0]*v[0]
mulps xmm2, xmm4 // xmm2: m[2][3]*v[3], m[2][2]*v[2], m[2][1]*v[1], m[2][0]*v[0]
mulps xmm3, xmm4 // xmm3: m[3][3]*v[3], m[3][2]*v[2], m[3][1]*v[1], m[3][0]*v[0]
movaps xmm4, xmm0 // xmm4: m[0][3]*v[3], m[0][2]*v[2], m[0][1]*v[1], m[0][0]*v[0]
movhlps xmm0, xmm1 // xmm0: m[0][3]*v[3], m[0][2]*v[2], m[1][3]*v[3], m[1][2]*v[2]
movlhps xmm1, xmm4 // xmm1: m[0][1]*v[1], m[0][0]*v[0], m[1][1]*v[1], m[1][0]*v[0]
// xmm0: two partial sums for row 0 (high half) and two for row 1 (low half)
addps xmm0, xmm1
// same pairing for rows 2 and 3; xmm5 keeps the row-2 products
movaps xmm5, xmm2
movhlps xmm2, xmm3
movlhps xmm3, xmm5
// xmm2: two partial sums for row 2 (high half) and two for row 3 (low half)
addps xmm2, xmm3
movaps xmm1, xmm0
// xmm0 gathers the first partial sum of rows 0..3 in lanes 0..3,
// xmm1 gathers the second; adding them completes the four dot products
shufps xmm0, xmm2, _MM_SHUFFLE(0, 2, 0, 2) //_MM_SHUFFLE(2, 0, 2, 0)
shufps xmm1, xmm2, _MM_SHUFFLE(1, 3, 1, 3) //_MM_SHUFFLE(3, 1, 3, 1)
addps xmm0, xmm1
movaps result, xmm0
}
return result;
}
// 289000
/*
inline mat4 operator*(const mat4& m, const mat4& n)
{
mat4 result;
void* adresse = &result;
// SSE2
__asm
{
mov esi, m
mov edi, n
mov eax, adresse
mov ecx, 0x10
mov edx, 0x04
next:
movaps xmm7, [edi]
movaps xmm0, [esi+0x00]
movaps xmm1, [esi+0x10]
movaps xmm2, [esi+0x20]
movaps xmm3, [esi+0x30]
pshufd xmm4, xmm7, _MM_SHUFFLE(0, 0, 0, 0)
mulps xmm0, xmm4
pshufd xmm4, xmm7, _MM_SHUFFLE(1, 1, 1, 1)
mulps xmm1, xmm4
pshufd xmm4, xmm7, _MM_SHUFFLE(2, 2, 2, 2)
mulps xmm2, xmm4
pshufd xmm4, xmm7, _MM_SHUFFLE(3, 3, 3, 3)
mulps xmm3, xmm4
addps xmm2, xmm3
addps xmm0, xmm1
addps xmm0, xmm2
movaps [eax], xmm0
add edi, ecx
add eax, ecx
dec edx
jnz next
}
return result;
}
*/
// 288000
/*
inline mat4 operator*(const mat4& m, const mat4& n)
{
mat4 result;
void* adresse = &result;
// SSE2
__asm
{
mov esi, m
mov edi, n
mov eax, adresse
mov ecx, 0x10
mov edx, 0x04
next:
movaps xmm7, [edi]
movaps xmm0, [esi+0x00]
pshufd xmm5, xmm4, _MM_SHUFFLE(0, 0, 0, 0)
mulps xmm0, xmm5
movaps xmm1, [esi+0x10]
pshufd xmm5, xmm4, _MM_SHUFFLE(1, 1, 1, 1)
mulps xmm1, xmm5
movaps xmm2, [esi+0x20]
pshufd xmm5, xmm4, _MM_SHUFFLE(2, 2, 2, 2)
mulps xmm2, xmm5
movaps xmm3, [esi+0x30]
pshufd xmm5, xmm4, _MM_SHUFFLE(3, 3, 3, 3)
mulps xmm3, xmm5
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movaps [eax], xmm0
add edi, ecx
add eax, ecx
dec edx
jnz next
}
return result;
}
*/
/*
// 290000
inline mat4 operator*(const mat4& m, const mat4& n)
{
mat4 result;
void* adresse = &result;
// SSE2
__asm
{
mov esi, m
mov edi, n
mov eax, adresse
mov ecx, 0x10
mov edx, 0x04
movaps xmm7, [edi]
next:
pshufd xmm4, xmm7, _MM_SHUFFLE(0, 0, 0, 0)
pshufd xmm5, xmm7, _MM_SHUFFLE(1, 1, 1, 1)
movaps xmm0, [esi+0x00]
mulps xmm0, xmm4
movaps xmm1, [esi+0x10]
mulps xmm1, xmm5
addps xmm0, xmm1
pshufd xmm4, xmm7, _MM_SHUFFLE(2, 2, 2, 2)
pshufd xmm5, xmm7, _MM_SHUFFLE(3, 3, 3, 3)
movaps xmm2, [esi+0x20]
mulps xmm2, xmm4
movaps xmm3, [esi+0x30]
mulps xmm3, xmm5
addps xmm2, xmm3
addps xmm0, xmm2
movaps [eax], xmm0
add edi, ecx
add eax, ecx
dec edx
jnz next
}
return result;
}
*/
/*
// 295000
inline mat4 operator*(const mat4& m, const mat4& n)
{
mat4 result;
void* adresse = &result;
// SSE2
__asm
{
mov esi, m
mov edi, n
mov eax, adresse
mov ecx, 0x10
mov edx, 0x04
movaps xmm7, [edi]
next:
movaps xmm0, [esi+0x00]
movaps xmm1, [esi+0x10]
movaps xmm2, [esi+0x20]
movaps xmm3, [esi+0x30]
pshufd xmm4, xmm7, _MM_SHUFFLE(0, 0, 0, 0)
mulps xmm0, xmm4
pshufd xmm5, xmm7, _MM_SHUFFLE(1, 1, 1, 1)
mulps xmm1, xmm5
addps xmm0, xmm1
pshufd xmm4, xmm7, _MM_SHUFFLE(2, 2, 2, 2)
mulps xmm2, xmm4
pshufd xmm5, xmm7, _MM_SHUFFLE(3, 3, 3, 3)
mulps xmm3, xmm5
addps xmm2, xmm3
addps xmm0, xmm2
movaps [eax], xmm0
add edi, ecx
add eax, ecx
dec edx
jnz next
}
return result;
}
*/
/*
// 290000
inline mat4 operator*(const mat4& m, const mat4& n)
{
mat4 result;
void* adresse = &result;
// SSE2
__asm
{
mov esi, m
mov edi, n
mov eax, adresse
mov ecx, 0x10
mov edx, 0x04
movaps xmm7, [edi]
next:
movaps xmm0, [esi+0x00]
movaps xmm1, [esi+0x10]
movaps xmm2, [esi+0x20]
movaps xmm3, [esi+0x30]
pshufd xmm4, xmm7, _MM_SHUFFLE(0, 0, 0, 0)
pshufd xmm5, xmm7, _MM_SHUFFLE(1, 1, 1, 1)
mulps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
pshufd xmm4, xmm7, _MM_SHUFFLE(2, 2, 2, 2)
pshufd xmm5, xmm7, _MM_SHUFFLE(3, 3, 3, 3)
mulps xmm2, xmm4
mulps xmm3, xmm5
addps xmm2, xmm3
addps xmm0, xmm2
movaps [eax], xmm0
add edi, ecx
add eax, ecx
dec edx
jnz next
}
return result;
}
*/
/*
// 288000
inline mat4 operator*(const mat4& m, const mat4& n)
{
mat4 result;
void* adresse = &result;
// SSE2
__asm
{
mov esi, m
mov edi, n
mov eax, adresse
mov ecx, 0x10
mov edx, 0x04
movaps xmm7, [edi]
next:
movaps xmm0, [esi+0x00]
movaps xmm1, [esi+0x10]
movaps xmm2, [esi+0x20]
movaps xmm3, [esi+0x30]
pshufd xmm4, xmm7, _MM_SHUFFLE(0, 0, 0, 0)
pshufd xmm5, xmm7, _MM_SHUFFLE(1, 1, 1, 1)
mulps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
pshufd xmm4, xmm7, _MM_SHUFFLE(2, 2, 2, 2)
pshufd xmm5, xmm7, _MM_SHUFFLE(3, 3, 3, 3)
mulps xmm2, xmm4
mulps xmm3, xmm5
addps xmm2, xmm3
addps xmm0, xmm2
movaps [eax], xmm0
add edi, ecx
add eax, ecx
dec edx
jnz next
}
return result;
}
*/
/*
// 298000
inline mat4 operator*(const mat4& m, const mat4& n)
{
mat4 result;
void* adresse = &result;
const int int0x10 = 0x10;
// SSE2
__asm
{
mov esi, m
mov edi, n
mov eax, adresse
mov edx, int0x10
// First column
movaps xmm4, [edi]
pshufd xmm5, xmm4, _MM_SHUFFLE(0, 0, 0, 0)
movaps xmm0, [esi+0x00]
mulps xmm0, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(1, 1, 1, 1)
movaps xmm1, [esi+0x10]
mulps xmm1, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(2, 2, 2, 2)
movaps xmm2, [esi+0x20]
mulps xmm2, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(3, 3, 3, 3)
movaps xmm3, [esi+0x30]
mulps xmm3, xmm5
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movaps [eax], xmm0
add edi, edx
add eax, edx
// Second column
movaps xmm4, [edi]
pshufd xmm5, xmm4, _MM_SHUFFLE(0, 0, 0, 0)
movaps xmm0, [esi+0x00]
mulps xmm0, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(1, 1, 1, 1)
movaps xmm1, [esi+0x10]
mulps xmm1, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(2, 2, 2, 2)
movaps xmm2, [esi+0x20]
mulps xmm2, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(3, 3, 3, 3)
movaps xmm3, [esi+0x30]
mulps xmm3, xmm5
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movaps [eax], xmm0
add edi, edx
add eax, edx
// Third column
movaps xmm4, [edi]
pshufd xmm5, xmm4, _MM_SHUFFLE(0, 0, 0, 0)
movaps xmm0, [esi+0x00]
mulps xmm0, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(1, 1, 1, 1)
movaps xmm1, [esi+0x10]
mulps xmm1, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(2, 2, 2, 2)
movaps xmm2, [esi+0x20]
mulps xmm2, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(3, 3, 3, 3)
movaps xmm3, [esi+0x30]
mulps xmm3, xmm5
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movaps [eax], xmm0
add edi, edx
add eax, edx
// Fourth column
movaps xmm4, [edi]
pshufd xmm5, xmm4, _MM_SHUFFLE(0, 0, 0, 0)
movaps xmm0, [esi+0x00]
mulps xmm0, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(1, 1, 1, 1)
movaps xmm1, [esi+0x10]
mulps xmm1, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(2, 2, 2, 2)
movaps xmm2, [esi+0x20]
mulps xmm2, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(3, 3, 3, 3)
movaps xmm3, [esi+0x30]
mulps xmm3, xmm5
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movaps [eax], xmm0
}
return result;
}
*/
// 294000
// Matrix product m * n, computed one result vector at a time:
//   result.data[k] = m.data[0]*n.data[k][0] + m.data[1]*n.data[k][1]
//                  + m.data[2]*n.data[k][2] + m.data[3]*n.data[k][3]
// Each pshufd broadcasts one lane of the current n vector (xmm4) into xmm5;
// pshufd is an SSE2 instruction, hence the "SSE2" marker below.
// The four unrolled sections differ only in which n vector is read and which
// result offset is written; the first one loads all of m before shuffling,
// the others interleave the loads with the shuffles.
// Within each section the terms are summed as (t0 + t1) + (t2 + t3).
// Inside the __asm block `result[0xNN]` is a byte offset from the start of
// result, so the four stores hit data[0], data[1], data[2] and data[3].
// All loads and stores use movaps, which requires 16-byte alignment.
__forceinline mat4 operator*(const mat4& m, const mat4& n)
{
mat4 result(mat4::NO_INIT);
// SSE2
__asm
{
// esi = left operand m, edi = right operand n
mov esi, m
mov edi, n
// First column
movaps xmm4, [edi+0x00]
movaps xmm0, [esi+0x00]
movaps xmm1, [esi+0x10]
movaps xmm2, [esi+0x20]
movaps xmm3, [esi+0x30]
pshufd xmm5, xmm4, _MM_SHUFFLE(0, 0, 0, 0)
mulps xmm0, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(1, 1, 1, 1)
mulps xmm1, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(2, 2, 2, 2)
mulps xmm2, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(3, 3, 3, 3)
mulps xmm3, xmm5
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movaps result[0x00], xmm0
// Second column
movaps xmm4, [edi+0x10]
pshufd xmm5, xmm4, _MM_SHUFFLE(0, 0, 0, 0)
movaps xmm0, [esi+0x00]
mulps xmm0, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(1, 1, 1, 1)
movaps xmm1, [esi+0x10]
mulps xmm1, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(2, 2, 2, 2)
movaps xmm2, [esi+0x20]
mulps xmm2, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(3, 3, 3, 3)
movaps xmm3, [esi+0x30]
mulps xmm3, xmm5
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movaps result[0x10], xmm0
// Third column
movaps xmm4, [edi+0x20]
pshufd xmm5, xmm4, _MM_SHUFFLE(0, 0, 0, 0)
movaps xmm0, [esi+0x00]
mulps xmm0, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(1, 1, 1, 1)
movaps xmm1, [esi+0x10]
mulps xmm1, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(2, 2, 2, 2)
movaps xmm2, [esi+0x20]
mulps xmm2, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(3, 3, 3, 3)
movaps xmm3, [esi+0x30]
mulps xmm3, xmm5
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movaps result[0x20], xmm0
// Fourth column
movaps xmm4, [edi+0x30]
pshufd xmm5, xmm4, _MM_SHUFFLE(0, 0, 0, 0)
movaps xmm0, [esi+0x00]
mulps xmm0, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(1, 1, 1, 1)
movaps xmm1, [esi+0x10]
mulps xmm1, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(2, 2, 2, 2)
movaps xmm2, [esi+0x20]
mulps xmm2, xmm5
pshufd xmm5, xmm4, _MM_SHUFFLE(3, 3, 3, 3)
movaps xmm3, [esi+0x30]
mulps xmm3, xmm5
addps xmm0, xmm1
addps xmm2, xmm3
addps xmm0, xmm2
movaps result[0x30], xmm0
}
return result;
}
// Alternating-sign helpers. _mm_set_ps / _mm_set_epi32 list lanes from highest
// to lowest, so reading lanes 0..3 (lowest first):
//   sign1010 = { -1, +1, -1, +1 }   multiplying by it negates lanes 0 and 2
//   sign0101 = { +1, -1, +1, -1 }   multiplying by it negates lanes 1 and 3
//   mask1010 has only the top (sign) bit set in lanes 0 and 2
//   mask0101 has only the top (sign) bit set in lanes 1 and 3
// XOR-ing packed floats with one of the masks flips the sign of those lanes.
static const __m128 sign1010 = _mm_set_ps( 1.0f,-1.0f, 1.0f,-1.0f);
static const __m128 sign0101 = _mm_set_ps(-1.0f, 1.0f,-1.0f, 1.0f);
static const __m128i mask1010 = _mm_set_epi32(0x00000000, 0x80000000, 0x00000000, 0x80000000);
static const __m128i mask0101 = _mm_set_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000);
// Returns the inverse of m, computed as adjugate(m) / determinant(m).
//
// The matrix is spilled to scalars, every 3x3 minor is evaluated directly, and
// the result is reloaded into SSE registers. Because the inverse of a transpose
// is the transpose of the inverse, the routine gives the right answer whether
// data[i] is read as a row or as a column.
//
// No singularity check is made: when the determinant is zero the reciprocal is
// not finite and neither are the returned elements.
__forceinline mat4 inverse(const mat4& m)
{
    // Unaligned stores/loads are used so the scratch arrays need no alignment.
    float a[4][4];
    for (int i = 0; i < 4; ++i)
    {
        _mm_storeu_ps(a[i], m.data[i]);
    }

    // cof[i][j] = (-1)^(i+j) * determinant of a with row i and column j removed.
    float cof[4][4];
    for (int i = 0; i < 4; ++i)
    {
        // The three row indices left after striking out row i, in ascending
        // order (ascending order is what makes the (-1)^(i+j) sign rule valid).
        const int r0 = (i == 0) ? 1 : 0;
        const int r1 = (i <= 1) ? 2 : 1;
        const int r2 = (i <= 2) ? 3 : 2;

        for (int j = 0; j < 4; ++j)
        {
            // Same selection for the columns.
            const int c0 = (j == 0) ? 1 : 0;
            const int c1 = (j <= 1) ? 2 : 1;
            const int c2 = (j <= 2) ? 3 : 2;

            const float minor =
                  a[r0][c0] * (a[r1][c1] * a[r2][c2] - a[r1][c2] * a[r2][c1])
                - a[r0][c1] * (a[r1][c0] * a[r2][c2] - a[r1][c2] * a[r2][c0])
                + a[r0][c2] * (a[r1][c0] * a[r2][c1] - a[r1][c1] * a[r2][c0]);

            cof[i][j] = ((i + j) & 1) ? -minor : minor;
        }
    }

    // Laplace expansion along the first stored vector.
    const float det = a[0][0] * cof[0][0] + a[0][1] * cof[0][1]
                    + a[0][2] * cof[0][2] + a[0][3] * cof[0][3];
    const float invDet = 1.0f / det;

    // The adjugate is the transpose of the cofactor matrix.
    float inv[4][4];
    for (int i = 0; i < 4; ++i)
    {
        for (int j = 0; j < 4; ++j)
        {
            inv[j][i] = cof[i][j] * invDet;
        }
    }

    mat4 result(mat4::NO_INIT);
    for (int k = 0; k < 4; ++k)
    {
        result.data[k] = _mm_loadu_ps(inv[k]);
    }
    return result;
}
// Returns the transpose of m; m itself is not modified.
// Lane j of result.data[i] is lane i of m.data[j].
__forceinline mat4 transpose(const mat4& m)
{
    // Interleave vectors 0/1 and 2/3. With a = data[0], b = data[1],
    // c = data[2], d = data[3], lanes listed lowest first:
    const __m128 abLow  = _mm_unpacklo_ps(m.data[0], m.data[1]); // a0 b0 a1 b1
    const __m128 abHigh = _mm_unpackhi_ps(m.data[0], m.data[1]); // a2 b2 a3 b3
    const __m128 cdLow  = _mm_unpacklo_ps(m.data[2], m.data[3]); // c0 d0 c1 d1
    const __m128 cdHigh = _mm_unpackhi_ps(m.data[2], m.data[3]); // c2 d2 c3 d3

    // Stitch matching 64-bit halves together.
    mat4 result(mat4::NO_INIT);
    result.data[0] = _mm_movelh_ps(abLow, cdLow);   // a0 b0 c0 d0
    result.data[1] = _mm_movehl_ps(cdLow, abLow);   // a1 b1 c1 d1
    result.data[2] = _mm_movelh_ps(abHigh, cdHigh); // a2 b2 c2 d2
    result.data[3] = _mm_movehl_ps(cdHigh, abHigh); // a3 b3 c3 d3
    return result;
}
// Transposes this matrix in place: afterwards lane j of data[i] holds what was
// lane i of data[j].
//
// All four interleaved intermediates are formed before any element of data[]
// is overwritten, so no source lane is lost part-way through.
__forceinline void mat4::transpose()
{
    // With a = data[0], b = data[1], c = data[2], d = data[3], lanes listed
    // lowest first:
    const __m128 abLow  = _mm_unpacklo_ps(data[0], data[1]); // a0 b0 a1 b1
    const __m128 abHigh = _mm_unpackhi_ps(data[0], data[1]); // a2 b2 a3 b3
    const __m128 cdLow  = _mm_unpacklo_ps(data[2], data[3]); // c0 d0 c1 d1
    const __m128 cdHigh = _mm_unpackhi_ps(data[2], data[3]); // c2 d2 c3 d3

    data[0] = _mm_movelh_ps(abLow, cdLow);   // a0 b0 c0 d0
    data[1] = _mm_movehl_ps(cdLow, abLow);   // a1 b1 c1 d1
    data[2] = _mm_movelh_ps(abHigh, cdHigh); // a2 b2 c2 d2
    data[3] = _mm_movehl_ps(cdHigh, abHigh); // a3 b3 c3 d3
}
/*
inline mat4 transpose(const mat4& m)
{
mat4 result;
__asm
{
mov esi, m
movaps xmm4, [esi+0x00]
movaps xmm5, xmm4
movaps xmm6, [esi+0x10]
unpcklps xmm5, xmm6
unpckhps xmm4, xmm6
movaps xmm0, xmm5
movhlps xmm1, xmm5
movaps xmm2, xmm4
movhlps xmm3, xmm4
movaps xmm4, [esi+0x20]
movaps xmm5, xmm4
movaps xmm6, [esi+0x30]
unpcklps xmm5, xmm6
unpckhps xmm4, xmm6
movlhps xmm0, xmm5
movhlps xmm7, xmm5
movlhps xmm1, xmm7
movlhps xmm2, xmm4
movhlps xmm6, xmm4
movlhps xmm3, xmm6
movaps result[0x00], xmm0
movaps result[0x10], xmm1
movaps result[0x20], xmm2
movaps result[0x30], xmm3
}
return result;
}
*/
}//namespace sse
}//namespace glm
// Declared at global scope, outside glm::sse. Only the prototype appears here;
// the definition is not part of the code visible in this header.
void test_sse_mat4();
#endif//GLM_SSE_MAT4_H