Integrate CheckSolid into ProcessRGB.

Bartosz Taudul 2019-06-29 02:02:20 +02:00
parent 50ac219e97
commit ab9f036f5e


@@ -36,75 +36,6 @@ static inline uint16_t to565( uint32_t c )
( ( c & 0x0000F8 ) << 8 );
}
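
For reference, `to565` packs a pixel into RGB565; the visible fragment above is the red term of the `uint32_t` overload. A plausible reconstruction of both overloads used in this file, assuming little-endian RGBA with red in the low byte (only the red term is confirmed by the fragment; the rest is inferred):

    #include <stdint.h>

    // Inferred sketch: top 5 bits of red, top 6 of green, top 5 of blue.
    static inline uint16_t to565( uint8_t r, uint8_t g, uint8_t b )
    {
        return ( ( r & 0xF8 ) << 8 ) | ( ( g & 0xFC ) << 3 ) | ( b >> 3 );
    }

    static inline uint16_t to565( uint32_t c )
    {
        return ( ( c & 0xF80000 ) >> 19 ) |  // blue
               ( ( c & 0x00FC00 ) >> 5 ) |   // green
               ( ( c & 0x0000F8 ) << 8 );    // red (the term visible above)
    }
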
- static uint64_t CheckSolid( const uint8_t* src )
- {
- #ifdef __SSE4_1__
- __m128i mask = _mm_set1_epi32( 0xF8FCF8 );
- __m128i d0 = _mm_and_si128( _mm_loadu_si128(((__m128i*)src) + 0), mask );
- __m128i d1 = _mm_and_si128( _mm_loadu_si128(((__m128i*)src) + 1), mask );
- __m128i d2 = _mm_and_si128( _mm_loadu_si128(((__m128i*)src) + 2), mask );
- __m128i d3 = _mm_and_si128( _mm_loadu_si128(((__m128i*)src) + 3), mask );
- __m128i c = _mm_shuffle_epi32(d0, _MM_SHUFFLE(0, 0, 0, 0));
- __m128i c0 = _mm_cmpeq_epi8(d0, c);
- __m128i c1 = _mm_cmpeq_epi8(d1, c);
- __m128i c2 = _mm_cmpeq_epi8(d2, c);
- __m128i c3 = _mm_cmpeq_epi8(d3, c);
- __m128i m0 = _mm_and_si128(c0, c1);
- __m128i m1 = _mm_and_si128(c2, c3);
- __m128i m = _mm_and_si128(m0, m1);
- if (!_mm_testc_si128(m, _mm_set1_epi32(-1)))
- {
- return 0;
- }
- else
- {
- return to565( src[0], src[1], src[2] );
- }
- #elif defined __ARM_NEON
- uint32x4_t mask = vdupq_n_u32( 0xF8FCF8 );
- uint32x4_t d0 = vandq_u32( mask, vld1q_u32( (uint32_t*)src ) );
- uint32x4_t d1 = vandq_u32( mask, vld1q_u32( (uint32_t*)src + 4 ) );
- uint32x4_t d2 = vandq_u32( mask, vld1q_u32( (uint32_t*)src + 8 ) );
- uint32x4_t d3 = vandq_u32( mask, vld1q_u32( (uint32_t*)src + 12 ) );
- uint32x4_t c = vdupq_n_u32( d0[0] );
- uint32x4_t c0 = vceqq_u32( d0, c );
- uint32x4_t c1 = vceqq_u32( d1, c );
- uint32x4_t c2 = vceqq_u32( d2, c );
- uint32x4_t c3 = vceqq_u32( d3, c );
- uint32x4_t m0 = vandq_u32( c0, c1 );
- uint32x4_t m1 = vandq_u32( c2, c3 );
- int64x2_t m = vreinterpretq_s64_u32( vandq_u32( m0, m1 ) );
- if( m[0] != -1 || m[1] != -1 )
- {
- return 0;
- }
- else
- {
- return to565( src[0], src[1], src[2] );
- }
- #else
- const auto ref = to565( src[0], src[1], src[2] );
- src += 4;
- for( int i=1; i<16; i++ )
- {
- if( to565( src[0], src[1], src[2] ) != ref )
- {
- return 0;
- }
- src += 4;
- }
- return uint64_t( ref );
- #endif
- }
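
The removed function works because `0xF8FCF8` keeps exactly the bits that survive 565 quantization, so sixteen masked pixels being identical means all sixteen map to the same RGB565 color. A minimal scalar sketch of that equivalence (the helper name is illustrative, not from the repo):

    // Two little-endian RGBA pixels quantize to the same RGB565 value
    // exactly when their masked forms agree: 0xF8 keeps 5 red bits,
    // 0xFC00 keeps 6 green bits, 0xF80000 keeps 5 blue bits; alpha is
    // ignored.
    static inline bool Same565( uint32_t a, uint32_t b )
    {
        return ( a & 0xF8FCF8 ) == ( b & 0xF8FCF8 );
    }
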
static const uint8_t IndexTable[4] = { 1, 3, 2, 0 };
static const uint8_t IndexTableSIMD[256] = {
85, 87, 86, 84, 93, 95, 94, 92, 89, 91, 90, 88, 81, 83, 82, 80,
@@ -127,15 +58,39 @@ static const uint8_t IndexTableSIMD[256] = {
static uint64_t ProcessRGB( const uint8_t* src )
{
- const auto solid = CheckSolid( src );
- if( solid != 0 ) return solid;
#ifdef __SSE4_1__
+ __m128i px0 = _mm_loadu_si128(((__m128i*)src) + 0);
+ __m128i px1 = _mm_loadu_si128(((__m128i*)src) + 1);
+ __m128i px2 = _mm_loadu_si128(((__m128i*)src) + 2);
+ __m128i px3 = _mm_loadu_si128(((__m128i*)src) + 3);
+ __m128i smask = _mm_set1_epi32( 0xF8FCF8 );
+ __m128i sd0 = _mm_and_si128( px0, smask );
+ __m128i sd1 = _mm_and_si128( px1, smask );
+ __m128i sd2 = _mm_and_si128( px2, smask );
+ __m128i sd3 = _mm_and_si128( px3, smask );
+ __m128i sc = _mm_shuffle_epi32(sd0, _MM_SHUFFLE(0, 0, 0, 0));
+ __m128i sc0 = _mm_cmpeq_epi8(sd0, sc);
+ __m128i sc1 = _mm_cmpeq_epi8(sd1, sc);
+ __m128i sc2 = _mm_cmpeq_epi8(sd2, sc);
+ __m128i sc3 = _mm_cmpeq_epi8(sd3, sc);
+ __m128i sm0 = _mm_and_si128(sc0, sc1);
+ __m128i sm1 = _mm_and_si128(sc2, sc3);
+ __m128i sm = _mm_and_si128(sm0, sm1);
+ if( _mm_testc_si128(sm, _mm_set1_epi32(-1)) )
+ {
+ return to565( src[0], src[1], src[2] );
+ }
__m128i mask = _mm_set1_epi32( 0xFFFFFF );
- __m128i l0 = _mm_and_si128( _mm_loadu_si128(((__m128i*)src) + 0), mask );
- __m128i l1 = _mm_and_si128( _mm_loadu_si128(((__m128i*)src) + 1), mask );
- __m128i l2 = _mm_and_si128( _mm_loadu_si128(((__m128i*)src) + 2), mask );
- __m128i l3 = _mm_and_si128( _mm_loadu_si128(((__m128i*)src) + 3), mask );
+ __m128i l0 = _mm_and_si128( px0, mask );
+ __m128i l1 = _mm_and_si128( px1, mask );
+ __m128i l2 = _mm_and_si128( px2, mask );
+ __m128i l3 = _mm_and_si128( px3, mask );
__m128i min0 = _mm_min_epu8( l0, l1 );
__m128i min1 = _mm_min_epu8( l2, l3 );
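
The hunk below elides the rest of the endpoint search. One common way to finish collapsing `min0`/`min1` into a single per-channel minimum is pairwise mins plus lane shuffles; a sketch under that assumption (not necessarily what the elided code does; the max side is symmetric with `_mm_max_epu8`):

    #include <smmintrin.h>
    #include <stdint.h>

    // Fold four RGBA lanes holding byte-wise row minima down to one
    // pixel whose bytes are the per-channel minima of the whole block.
    static inline uint32_t HorizontalMin( __m128i m )
    {
        m = _mm_min_epu8( m, _mm_shuffle_epi32( m, _MM_SHUFFLE( 1, 0, 3, 2 ) ) );
        m = _mm_min_epu8( m, _mm_shuffle_epi32( m, _MM_SHUFFLE( 2, 3, 0, 1 ) ) );
        return (uint32_t)_mm_cvtsi128_si32( m );
    }

    // e.g.: uint32_t vmin = HorizontalMin( _mm_min_epu8( min0, min1 ) );
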
@@ -219,11 +174,38 @@ static uint64_t ProcessRGB( const uint8_t* src )
return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( data ) << 32 ) );
#elif defined __ARM_NEON
+ uint32x4_t px0 = vld1q_u32( (uint32_t*)src );
+ uint32x4_t px1 = vld1q_u32( (uint32_t*)src + 4 );
+ uint32x4_t px2 = vld1q_u32( (uint32_t*)src + 8 );
+ uint32x4_t px3 = vld1q_u32( (uint32_t*)src + 12 );
+ uint32x4_t smask = vdupq_n_u32( 0xF8FCF8 );
+ uint32x4_t sd0 = vandq_u32( smask, px0 );
+ uint32x4_t sd1 = vandq_u32( smask, px1 );
+ uint32x4_t sd2 = vandq_u32( smask, px2 );
+ uint32x4_t sd3 = vandq_u32( smask, px3 );
+ uint32x4_t sc = vdupq_n_u32( sd0[0] );
+ uint32x4_t sc0 = vceqq_u32( sd0, sc );
+ uint32x4_t sc1 = vceqq_u32( sd1, sc );
+ uint32x4_t sc2 = vceqq_u32( sd2, sc );
+ uint32x4_t sc3 = vceqq_u32( sd3, sc );
+ uint32x4_t sm0 = vandq_u32( sc0, sc1 );
+ uint32x4_t sm1 = vandq_u32( sc2, sc3 );
+ int64x2_t sm = vreinterpretq_s64_u32( vandq_u32( sm0, sm1 ) );
+ if( sm[0] == -1 && sm[1] == -1 )
+ {
+ return to565( src[0], src[1], src[2] );
+ }
uint32x4_t mask = vdupq_n_u32( 0xFFFFFF );
- uint8x16_t l0 = vreinterpretq_u8_u32( vandq_u32( mask, vld1q_u32( (uint32_t*)src ) ) );
- uint8x16_t l1 = vreinterpretq_u8_u32( vandq_u32( mask, vld1q_u32( (uint32_t*)src + 4 ) ) );
- uint8x16_t l2 = vreinterpretq_u8_u32( vandq_u32( mask, vld1q_u32( (uint32_t*)src + 8 ) ) );
- uint8x16_t l3 = vreinterpretq_u8_u32( vandq_u32( mask, vld1q_u32( (uint32_t*)src + 12 ) ) );
+ uint8x16_t l0 = vreinterpretq_u8_u32( vandq_u32( mask, px0 ) );
+ uint8x16_t l1 = vreinterpretq_u8_u32( vandq_u32( mask, px1 ) );
+ uint8x16_t l2 = vreinterpretq_u8_u32( vandq_u32( mask, px2 ) );
+ uint8x16_t l3 = vreinterpretq_u8_u32( vandq_u32( mask, px3 ) );
uint8x16_t min0 = vminq_u8( l0, l1 );
uint8x16_t min1 = vminq_u8( l2, l3 );
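
The NEON hunk below is elided at the same point; the equivalent fold for that path might look like this (illustrative only, with the max side symmetric via `vmax_u8`):

    #include <arm_neon.h>
    #include <stdint.h>

    // Fold four RGBA lanes of byte-wise row minima to one pixel of
    // per-channel minima: halves first, then the two remaining pixels.
    static inline uint32_t HorizontalMin( uint8x16_t v )
    {
        uint8x8_t m = vmin_u8( vget_low_u8( v ), vget_high_u8( v ) );
        m = vmin_u8( m, vext_u8( m, m, 4 ) );
        return vget_lane_u32( vreinterpret_u32_u8( m ), 0 );
    }

    // e.g.: uint32_t vmin = HorizontalMin( vminq_u8( min0, min1 ) );
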
@@ -328,6 +310,21 @@ static uint64_t ProcessRGB( const uint8_t* src )
return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( data ) << 32 ) );
#else
+ const auto ref = to565( src[0], src[1], src[2] );
+ auto stmp = src + 4;
+ for( int i=1; i<16; i++ )
+ {
+ if( to565( stmp[0], stmp[1], stmp[2] ) != ref )
+ {
+ break;
+ }
+ stmp += 4;
+ }
+ if( stmp == src + 64 )
+ {
+ return uint64_t( ref );
+ }
uint8_t min[3] = { src[0], src[1], src[2] };
uint8_t max[3] = { src[0], src[1], src[2] };
auto tmp = src + 4;
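
For context, the value assembled by the returns above (`to565( vmin ) << 16 | to565( vmax ) | data << 32`) is a 64-bit DXT1-style block: two RGB565 endpoints plus 32 bits of 2-bit indices. Each ProcessRGB call therefore consumes one 4x4 RGBA block, 64 bytes; a hypothetical driver loop (names other than ProcessRGB are illustrative, not the repo's API):

    #include <stddef.h>
    #include <stdint.h>

    // Compress a contiguous sequence of 4x4 RGBA blocks; one 64-bit
    // output block per 64 input bytes.
    static void CompressBlocks( const uint8_t* src, uint64_t* dst, size_t blocks )
    {
        for( size_t i=0; i<blocks; i++ )
        {
            *dst++ = ProcessRGB( src );
            src += 4 * 16;
        }
    }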