mirror of
https://github.com/wolfpld/tracy.git
synced 2024-11-10 10:41:50 +00:00
AVX2 version of plot min/max calculation.
Slightly faster (~5%) than the auto-vectorized serial code.
This commit is contained in:
parent
7a878cf4c7
commit
5dff7b5d1e
@ -17,6 +17,14 @@
|
|||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <time.h>
|
#include <time.h>
|
||||||
|
|
||||||
|
#ifdef __AVX2__
|
||||||
|
# ifdef _MSC_VER
|
||||||
|
# include <intrin.h>
|
||||||
|
# else
|
||||||
|
# include <x86intrin.h>
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "../common/TracyMutex.hpp"
|
#include "../common/TracyMutex.hpp"
|
||||||
#include "../common/TracyProtocol.hpp"
|
#include "../common/TracyProtocol.hpp"
|
||||||
#include "../common/TracySystem.hpp"
|
#include "../common/TracySystem.hpp"
|
||||||
@ -3686,11 +3694,42 @@ int View::DrawPlots( int offset, double pxns, const ImVec2& wpos, bool hover, fl
|
|||||||
auto tmp = it;
|
auto tmp = it;
|
||||||
++tmp;
|
++tmp;
|
||||||
const auto sz = end - tmp;
|
const auto sz = end - tmp;
|
||||||
|
#ifdef __AVX2__
|
||||||
|
__m256d vmin = _mm256_set1_pd( min );
|
||||||
|
__m256d vmax = vmin;
|
||||||
|
const auto ssz = sz / 4;
|
||||||
|
for( ptrdiff_t i=0; i<ssz; i++ )
|
||||||
|
{
|
||||||
|
__m256d v0 = _mm256_loadu_pd( (const double*)(tmp+0) );
|
||||||
|
__m256d v1 = _mm256_loadu_pd( (const double*)(tmp+2) );
|
||||||
|
__m256d v = _mm256_unpackhi_pd( v0, v1 );
|
||||||
|
vmin = _mm256_min_pd( vmin, v );
|
||||||
|
vmax = _mm256_max_pd( vmax, v );
|
||||||
|
tmp += 4;
|
||||||
|
}
|
||||||
|
__m256d min0 = _mm256_shuffle_pd( vmin, vmin, 5 );
|
||||||
|
__m256d max0 = _mm256_shuffle_pd( vmax, vmax, 5 );
|
||||||
|
__m256d min1 = _mm256_min_pd( vmin, min0 );
|
||||||
|
__m256d max1 = _mm256_max_pd( vmax, max0 );
|
||||||
|
__m256d min2 = _mm256_permute4x64_pd( min1, _MM_SHUFFLE( 0, 0, 2, 2 ) );
|
||||||
|
__m256d max2 = _mm256_permute4x64_pd( max1, _MM_SHUFFLE( 0, 0, 2, 2 ) );
|
||||||
|
__m256d min3 = _mm256_min_pd( min1, min2 );
|
||||||
|
__m256d max3 = _mm256_max_pd( max1, max2 );
|
||||||
|
min = _mm256_cvtsd_f64( min3 );
|
||||||
|
max = _mm256_cvtsd_f64( max3 );
|
||||||
|
const auto lsz = sz % 4;
|
||||||
|
for( ptrdiff_t i=0; i<lsz; i++ )
|
||||||
|
{
|
||||||
|
min = tmp[i].val < min ? tmp[i].val : min;
|
||||||
|
max = tmp[i].val > max ? tmp[i].val : max;
|
||||||
|
}
|
||||||
|
#else
|
||||||
for( ptrdiff_t i=0; i<sz; i++ )
|
for( ptrdiff_t i=0; i<sz; i++ )
|
||||||
{
|
{
|
||||||
min = tmp[i].val < min ? tmp[i].val : min;
|
min = tmp[i].val < min ? tmp[i].val : min;
|
||||||
max = tmp[i].val > max ? tmp[i].val : max;
|
max = tmp[i].val > max ? tmp[i].val : max;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
if( min == max )
|
if( min == max )
|
||||||
{
|
{
|
||||||
|
Loading…
Reference in New Issue
Block a user