Combine block data directly in AVX registers.

This commit is contained in:
Bartosz Taudul 2019-07-20 14:52:34 +02:00
parent 396c28011e
commit 178dc9eba7

View File

@@ -575,15 +575,9 @@ static tracy_force_inline void ProcessRGB_AVX( const uint8_t* src, char*& dst )
__m256i mm4 = _mm256_or_si256( mm3, mmb );
__m256i mm5 = _mm256_shuffle_epi8( mm4, _mm256_set1_epi32( 0x09080100 ) );
uint32_t minmax0 = _mm256_cvtsi256_si32( mm5 );
uint32_t minmax1 = _mm256_extract_epi32( mm5, 4 );
uint32_t vp0 = _mm256_cvtsi256_si32( p );
uint32_t vp1 = _mm256_extract_epi32( p, 4 );
memcpy( dst, &minmax0, 4 );
memcpy( dst+4, &vp0, 4 );
memcpy( dst+8, &minmax1, 4 );
memcpy( dst+12, &vp1, 4 );
__m256i d0 = _mm256_unpacklo_epi32( mm5, p );
__m256i d1 = _mm256_permute4x64_epi64( d0, _MM_SHUFFLE( 3, 2, 2, 0 ) );
_mm_storeu_si128( (__m128i*)dst, _mm256_castsi256_si128( d1 ) );
dst += 16;
}
#endif