From 2df1eaaa7ed38a389deabc3a210edb0f0fb655bf Mon Sep 17 00:00:00 2001 From: Bartosz Taudul Date: Fri, 28 Jun 2019 21:58:10 +0200 Subject: [PATCH] Pack color indices using SSE. --- client/TracyDxt1.cpp | 51 +++++++++++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/client/TracyDxt1.cpp b/client/TracyDxt1.cpp index 97c7f383..839b8856 100644 --- a/client/TracyDxt1.cpp +++ b/client/TracyDxt1.cpp @@ -106,6 +106,24 @@ static uint64_t CheckSolid( const uint8_t* src ) } static const uint8_t IndexTable[4] = { 1, 3, 2, 0 }; +static const uint8_t IndexTableSIMD[256] = { + 85, 87, 86, 84, 93, 95, 94, 92, 89, 91, 90, 88, 81, 83, 82, 80, + 117, 119, 118, 116, 125, 127, 126, 124, 121, 123, 122, 120, 113, 115, 114, 112, + 101, 103, 102, 100, 109, 111, 110, 108, 105, 107, 106, 104, 97, 99, 98, 96, + 69, 71, 70, 68, 77, 79, 78, 76, 73, 75, 74, 72, 65, 67, 66, 64, + 213, 215, 214, 212, 221, 223, 222, 220, 217, 219, 218, 216, 209, 211, 210, 208, + 245, 247, 246, 244, 253, 255, 254, 252, 249, 251, 250, 248, 241, 243, 242, 240, + 229, 231, 230, 228, 237, 239, 238, 236, 233, 235, 234, 232, 225, 227, 226, 224, + 197, 199, 198, 196, 205, 207, 206, 204, 201, 203, 202, 200, 193, 195, 194, 192, + 149, 151, 150, 148, 157, 159, 158, 156, 153, 155, 154, 152, 145, 147, 146, 144, + 181, 183, 182, 180, 189, 191, 190, 188, 185, 187, 186, 184, 177, 179, 178, 176, + 165, 167, 166, 164, 173, 175, 174, 172, 169, 171, 170, 168, 161, 163, 162, 160, + 133, 135, 134, 132, 141, 143, 142, 140, 137, 139, 138, 136, 129, 131, 130, 128, + 21, 23, 22, 20, 29, 31, 30, 28, 25, 27, 26, 24, 17, 19, 18, 16, + 53, 55, 54, 52, 61, 63, 62, 60, 57, 59, 58, 56, 49, 51, 50, 48, + 37, 39, 38, 36, 45, 47, 46, 44, 41, 43, 42, 40, 33, 35, 34, 32, + 5, 7, 6, 4, 13, 15, 14, 12, 9, 11, 10, 8, 1, 3, 2, 0 +}; static uint64_t ProcessRGB( const uint8_t* src ) { @@ -170,24 +188,33 @@ static uint64_t ProcessRGB( const uint8_t* src ) __m128i p0 = _mm_packus_epi16( m0, m1 ); + __m128i mask0 = _mm_set1_epi32( 0x00000003 ); + __m128i mask1 = _mm_set1_epi32( 0x00000300 ); + __m128i mask2 = _mm_set1_epi32( 0x00030000 ); + __m128i mask3 = _mm_set1_epi32( 0x03000000 ); + + __m128i p1 = _mm_and_si128( p0, mask0 ); + __m128i p2 = _mm_srai_epi32( _mm_and_si128( p0, mask1 ), 6 ); + __m128i p3 = _mm_srai_epi32( _mm_and_si128( p0, mask2 ), 12 ); + __m128i p4 = _mm_srai_epi32( _mm_and_si128( p0, mask3 ), 18 ); + + __m128i p5 = _mm_or_si128( p1, p2 ); + __m128i p6 = _mm_or_si128( p3, p4 ); + __m128i p7 = _mm_or_si128( p5, p6 ); + + __m128i p8 = _mm_packus_epi32( p7, p7 ); + __m128i p = _mm_packus_epi16( p8, p8 ); + uint32_t vmin = _mm_cvtsi128_si32( min ); uint32_t vmax = _mm_cvtsi128_si32( max ); - - uint32_t vp[4]; - _mm_store_si128( (__m128i*)vp, p0 ); + uint32_t vp = _mm_cvtsi128_si32( p ); uint32_t data = 0; - int k = 0; for( int i=0; i<4; i++ ) { - uint32_t p = vp[i]; - for( int j=0; j<4; j++ ) - { - uint8_t idx = IndexTable[p & 0x3]; - p >>= 8; - data |= idx << (k*2); - k++; - } + uint8_t idx = IndexTableSIMD[vp & 0xFF]; + vp >>= 8; + data |= idx << (i*8); } return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( data ) << 32 ) );