Pack color indices using SSE.

This commit is contained in:
Bartosz Taudul 2019-06-28 21:58:10 +02:00
parent d593e5dfa9
commit 2df1eaaa7e

View File

@ -106,6 +106,24 @@ static uint64_t CheckSolid( const uint8_t* src )
} }
static const uint8_t IndexTable[4] = { 1, 3, 2, 0 }; static const uint8_t IndexTable[4] = { 1, 3, 2, 0 };
// Byte-wide version of IndexTable: a byte is treated as four independent
// 2-bit fields, and entry b holds those four fields each mapped through
// IndexTable ({ 1, 3, 2, 0 }) and written back at the same bit positions.
// For example, entry 0 is 0b01010101 (85) because every field maps 0 -> 1,
// and entry 255 is 0 because every field maps 3 -> 0.
// This lets a caller translate four packed 2-bit indices with one lookup
// instead of four separate IndexTable lookups.
static const uint8_t IndexTableSIMD[256] = {
85, 87, 86, 84, 93, 95, 94, 92, 89, 91, 90, 88, 81, 83, 82, 80,
117, 119, 118, 116, 125, 127, 126, 124, 121, 123, 122, 120, 113, 115, 114, 112,
101, 103, 102, 100, 109, 111, 110, 108, 105, 107, 106, 104, 97, 99, 98, 96,
69, 71, 70, 68, 77, 79, 78, 76, 73, 75, 74, 72, 65, 67, 66, 64,
213, 215, 214, 212, 221, 223, 222, 220, 217, 219, 218, 216, 209, 211, 210, 208,
245, 247, 246, 244, 253, 255, 254, 252, 249, 251, 250, 248, 241, 243, 242, 240,
229, 231, 230, 228, 237, 239, 238, 236, 233, 235, 234, 232, 225, 227, 226, 224,
197, 199, 198, 196, 205, 207, 206, 204, 201, 203, 202, 200, 193, 195, 194, 192,
149, 151, 150, 148, 157, 159, 158, 156, 153, 155, 154, 152, 145, 147, 146, 144,
181, 183, 182, 180, 189, 191, 190, 188, 185, 187, 186, 184, 177, 179, 178, 176,
165, 167, 166, 164, 173, 175, 174, 172, 169, 171, 170, 168, 161, 163, 162, 160,
133, 135, 134, 132, 141, 143, 142, 140, 137, 139, 138, 136, 129, 131, 130, 128,
21, 23, 22, 20, 29, 31, 30, 28, 25, 27, 26, 24, 17, 19, 18, 16,
53, 55, 54, 52, 61, 63, 62, 60, 57, 59, 58, 56, 49, 51, 50, 48,
37, 39, 38, 36, 45, 47, 46, 44, 41, 43, 42, 40, 33, 35, 34, 32,
5, 7, 6, 4, 13, 15, 14, 12, 9, 11, 10, 8, 1, 3, 2, 0
};
static uint64_t ProcessRGB( const uint8_t* src ) static uint64_t ProcessRGB( const uint8_t* src )
{ {
@ -170,24 +188,33 @@ static uint64_t ProcessRGB( const uint8_t* src )
__m128i p0 = _mm_packus_epi16( m0, m1 ); __m128i p0 = _mm_packus_epi16( m0, m1 );
__m128i mask0 = _mm_set1_epi32( 0x00000003 );
__m128i mask1 = _mm_set1_epi32( 0x00000300 );
__m128i mask2 = _mm_set1_epi32( 0x00030000 );
__m128i mask3 = _mm_set1_epi32( 0x03000000 );
__m128i p1 = _mm_and_si128( p0, mask0 );
__m128i p2 = _mm_srai_epi32( _mm_and_si128( p0, mask1 ), 6 );
__m128i p3 = _mm_srai_epi32( _mm_and_si128( p0, mask2 ), 12 );
__m128i p4 = _mm_srai_epi32( _mm_and_si128( p0, mask3 ), 18 );
__m128i p5 = _mm_or_si128( p1, p2 );
__m128i p6 = _mm_or_si128( p3, p4 );
__m128i p7 = _mm_or_si128( p5, p6 );
__m128i p8 = _mm_packus_epi32( p7, p7 );
__m128i p = _mm_packus_epi16( p8, p8 );
uint32_t vmin = _mm_cvtsi128_si32( min ); uint32_t vmin = _mm_cvtsi128_si32( min );
uint32_t vmax = _mm_cvtsi128_si32( max ); uint32_t vmax = _mm_cvtsi128_si32( max );
uint32_t vp = _mm_cvtsi128_si32( p );
uint32_t vp[4];
_mm_store_si128( (__m128i*)vp, p0 );
uint32_t data = 0; uint32_t data = 0;
int k = 0;
for( int i=0; i<4; i++ ) for( int i=0; i<4; i++ )
{ {
uint32_t p = vp[i]; uint8_t idx = IndexTableSIMD[vp & 0xFF];
for( int j=0; j<4; j++ ) vp >>= 8;
{ data |= idx << (i*8);
uint8_t idx = IndexTable[p & 0x3];
p >>= 8;
data |= idx << (k*2);
k++;
}
} }
return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( data ) << 32 ) ); return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( data ) << 32 ) );