Collect CPU cycles and instruction retirement events.

This commit is contained in:
Bartosz Taudul 2021-05-19 02:20:35 +02:00 committed by Bartosz Taudul
parent 16101571e0
commit 7794443453
No known key found for this signature in database
GPG Key ID: B7FE2008B7575DF3
3 changed files with 168 additions and 60 deletions

View File

@ -678,7 +678,9 @@ static int perf_event_open( struct perf_event_attr* hw_event, pid_t pid, int cpu
// Identifies which perf_event source a ring buffer belongs to, so the
// sampling thread knows how to decode each PERF_RECORD_SAMPLE it reads:
// EventCallstack records carry a callchain (pid/tid, time, frame count,
// frames), while the hardware-counter events carry a fixed ip/tid/time
// layout dispatched as HwSampleCpuCycle / HwSampleInstructionRetired.
enum TraceEventId
{
    EventCallstack,
    EventCpuCycles,
    EventInstructionsRetired
};
static void SetupSampling( int64_t& samplingPeriod ) static void SetupSampling( int64_t& samplingPeriod )
@ -690,22 +692,20 @@ static void SetupSampling( int64_t& samplingPeriod )
samplingPeriod = GetSamplingPeriod(); samplingPeriod = GetSamplingPeriod();
s_numCpus = (int)std::thread::hardware_concurrency(); s_numCpus = (int)std::thread::hardware_concurrency();
s_ring = (RingBuffer<RingBufSize>*)tracy_malloc( sizeof( RingBuffer<RingBufSize> ) * s_numCpus ); s_ring = (RingBuffer<RingBufSize>*)tracy_malloc( sizeof( RingBuffer<RingBufSize> ) * s_numCpus * 3 );
s_numBuffers = 0; s_numBuffers = 0;
// Stack traces
perf_event_attr pe = {}; perf_event_attr pe = {};
pe.type = PERF_TYPE_SOFTWARE; pe.type = PERF_TYPE_SOFTWARE;
pe.size = sizeof( perf_event_attr ); pe.size = sizeof( perf_event_attr );
pe.config = PERF_COUNT_SW_CPU_CLOCK; pe.config = PERF_COUNT_SW_CPU_CLOCK;
pe.sample_freq = GetSamplingFrequency(); pe.sample_freq = GetSamplingFrequency();
pe.sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_CALLCHAIN; pe.sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_CALLCHAIN;
#if LINUX_VERSION_CODE >= KERNEL_VERSION( 4, 8, 0 ) #if LINUX_VERSION_CODE >= KERNEL_VERSION( 4, 8, 0 )
pe.sample_max_stack = 127; pe.sample_max_stack = 127;
#endif #endif
pe.exclude_callchain_kernel = 1; pe.exclude_callchain_kernel = 1;
pe.disabled = 1; pe.disabled = 1;
pe.freq = 1; pe.freq = 1;
#if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) #if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 )
@ -726,6 +726,46 @@ static void SetupSampling( int64_t& samplingPeriod )
s_numBuffers++; s_numBuffers++;
} }
// CPU cycles
pe = {};
pe.type = PERF_TYPE_HARDWARE;
pe.size = sizeof( perf_event_attr );
pe.config = PERF_COUNT_HW_CPU_CYCLES;
pe.sample_freq = 25*1000*1000;
pe.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_TIME;
pe.disabled = 1;
pe.exclude_kernel = 1;
pe.exclude_idle = 1;
pe.precise_ip = 2;
#if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 )
pe.use_clockid = 1;
pe.clockid = CLOCK_MONOTONIC_RAW;
#endif
for( int i=0; i<s_numCpus; i++ )
{
const int fd = perf_event_open( &pe, -1, i, -1, PERF_FLAG_FD_CLOEXEC );
if( fd != -1 )
{
new( s_ring+s_numBuffers ) RingBuffer<RingBufSize>( fd, EventCpuCycles );
s_numBuffers++;
}
}
// Instructions retired
pe.config = PERF_COUNT_HW_INSTRUCTIONS;
for( int i=0; i<s_numCpus; i++ )
{
const int fd = perf_event_open( &pe, -1, i, -1, PERF_FLAG_FD_CLOEXEC );
if( fd != -1 )
{
new( s_ring+s_numBuffers ) RingBuffer<RingBufSize>( fd, EventInstructionsRetired );
s_numBuffers++;
}
}
s_threadSampling = (Thread*)tracy_malloc( sizeof( Thread ) ); s_threadSampling = (Thread*)tracy_malloc( sizeof( Thread ) );
new(s_threadSampling) Thread( [] (void*) { new(s_threadSampling) Thread( [] (void*) {
ThreadExitHandler threadExitHandler; ThreadExitHandler threadExitHandler;
@ -760,77 +800,133 @@ static void SetupSampling( int64_t& samplingPeriod )
s_ring[i].Read( &hdr, 0, sizeof( perf_event_header ) ); s_ring[i].Read( &hdr, 0, sizeof( perf_event_header ) );
if( hdr.type == PERF_RECORD_SAMPLE ) if( hdr.type == PERF_RECORD_SAMPLE )
{ {
uint32_t pid, tid;
uint64_t t0;
uint64_t cnt;
auto offset = sizeof( perf_event_header ); auto offset = sizeof( perf_event_header );
s_ring[i].Read( &pid, offset, sizeof( uint32_t ) ); const auto id = s_ring[i].GetId();
if( pid == currentPid ) if( id == EventCallstack )
{ {
offset += sizeof( uint32_t ); // Layout:
s_ring[i].Read( &tid, offset, sizeof( uint32_t ) ); // u32 pid, tid
offset += sizeof( uint32_t ); // u64 time
s_ring[i].Read( &t0, offset, sizeof( uint64_t ) ); // u64 cnt
offset += sizeof( uint64_t ); // u64 ip[cnt]
s_ring[i].Read( &cnt, offset, sizeof( uint64_t ) );
offset += sizeof( uint64_t );
if( cnt > 0 ) uint32_t pid;
s_ring[i].Read( &pid, offset, sizeof( uint32_t ) );
if( pid == currentPid )
{ {
auto trace = (uint64_t*)tracy_malloc( ( 1 + cnt ) * sizeof( uint64_t ) ); uint32_t tid;
s_ring[i].Read( trace+1, offset, sizeof( uint64_t ) * cnt ); uint64_t t0;
uint64_t cnt;
offset += sizeof( uint32_t );
s_ring[i].Read( &tid, offset, sizeof( uint32_t ) );
offset += sizeof( uint32_t );
s_ring[i].Read( &t0, offset, sizeof( uint64_t ) );
offset += sizeof( uint64_t );
s_ring[i].Read( &cnt, offset, sizeof( uint64_t ) );
offset += sizeof( uint64_t );
if( cnt > 0 )
{
auto trace = (uint64_t*)tracy_malloc( ( 1 + cnt ) * sizeof( uint64_t ) );
s_ring[i].Read( trace+1, offset, sizeof( uint64_t ) * cnt );
#if defined __x86_64__ || defined _M_X64 #if defined __x86_64__ || defined _M_X64
// remove non-canonical pointers // remove non-canonical pointers
do do
{ {
const auto test = (int64_t)trace[cnt]; const auto test = (int64_t)trace[cnt];
const auto m1 = test >> 63; const auto m1 = test >> 63;
const auto m2 = test >> 47; const auto m2 = test >> 47;
if( m1 == m2 ) break; if( m1 == m2 ) break;
} }
while( --cnt > 0 ); while( --cnt > 0 );
for( uint64_t j=1; j<cnt; j++ ) for( uint64_t j=1; j<cnt; j++ )
{ {
const auto test = (int64_t)trace[j]; const auto test = (int64_t)trace[j];
const auto m1 = test >> 63; const auto m1 = test >> 63;
const auto m2 = test >> 47; const auto m2 = test >> 47;
if( m1 != m2 ) trace[j] = 0; if( m1 != m2 ) trace[j] = 0;
} }
#endif #endif
// skip kernel frames // skip kernel frames
uint64_t j; uint64_t j;
for( j=0; j<cnt; j++ ) for( j=0; j<cnt; j++ )
{
if( (int64_t)trace[j+1] >= 0 ) break;
}
if( j == cnt )
{
tracy_free( trace );
}
else
{
if( j > 0 )
{ {
cnt -= j; if( (int64_t)trace[j+1] >= 0 ) break;
memmove( trace+1, trace+1+j, sizeof( uint64_t ) * cnt );
} }
memcpy( trace, &cnt, sizeof( uint64_t ) ); if( j == cnt )
{
tracy_free( trace );
}
else
{
if( j > 0 )
{
cnt -= j;
memmove( trace+1, trace+1+j, sizeof( uint64_t ) * cnt );
}
memcpy( trace, &cnt, sizeof( uint64_t ) );
#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) #if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 )
t0 = s_ring[i].ConvertTimeToTsc( t0 ); t0 = s_ring[i].ConvertTimeToTsc( t0 );
#endif #endif
TracyLfqPrepare( QueueType::CallstackSample ); TracyLfqPrepare( QueueType::CallstackSample );
MemWrite( &item->callstackSampleFat.time, t0 ); MemWrite( &item->callstackSampleFat.time, t0 );
MemWrite( &item->callstackSampleFat.thread, (uint64_t)tid ); MemWrite( &item->callstackSampleFat.thread, (uint64_t)tid );
MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace ); MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace );
TracyLfqCommit; TracyLfqCommit;
}
} }
} }
} }
else
{
// Layout:
// u64 ip
// u32 pid, tid
// u64 time
uint32_t pid;
s_ring[i].Read( &pid, offset + sizeof( uint64_t ), sizeof( uint32_t ) );
if( pid == currentPid )
{
uint64_t ip, t0;
uint32_t tid;
s_ring[i].Read( &ip, offset, sizeof( uint64_t ) );
offset += sizeof( uint64_t ) + sizeof( uint32_t );
s_ring[i].Read( &tid, offset, sizeof( uint32_t ) );
offset += sizeof( uint32_t );
s_ring[i].Read( &t0, offset, sizeof( uint64_t ) );
#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 )
t0 = s_ring[i].ConvertTimeToTsc( t0 );
#endif
QueueType type;
switch( id )
{
case EventCpuCycles:
type = QueueType::HwSampleCpuCycle;
break;
case EventInstructionsRetired:
type = QueueType::HwSampleInstructionRetired;
break;
default:
assert( false );
break;
}
TracyLfqPrepare( type );
MemWrite( &item->hwSample.ip, ip );
MemWrite( &item->hwSample.thread, (uint64_t)tid );
MemWrite( &item->hwSample.time, t0 );
TracyLfqCommit;
}
}
} }
s_ring[i].Advance( hdr.size ); s_ring[i].Advance( hdr.size );
} }

View File

@ -9,7 +9,7 @@ namespace tracy
constexpr unsigned Lz4CompressBound( unsigned isize ) { return isize + ( isize / 255 ) + 16; } constexpr unsigned Lz4CompressBound( unsigned isize ) { return isize + ( isize / 255 ) + 16; }
enum : uint32_t { ProtocolVersion = 46 }; enum : uint32_t { ProtocolVersion = 47 };
enum : uint16_t { BroadcastVersion = 2 }; enum : uint16_t { BroadcastVersion = 2 };
using lz4sz_t = uint32_t; using lz4sz_t = uint32_t;

View File

@ -82,6 +82,8 @@ enum class QueueType : uint8_t
CodeInformation, CodeInformation,
SysTimeReport, SysTimeReport,
TidToPid, TidToPid,
HwSampleCpuCycle,
HwSampleInstructionRetired,
PlotConfig, PlotConfig,
ParamSetup, ParamSetup,
AckServerQueryNoop, AckServerQueryNoop,
@ -473,6 +475,13 @@ struct QueueTidToPid
uint64_t pid; uint64_t pid;
}; };
// Wire-format payload for a hardware performance-counter sample
// (QueueType::HwSampleCpuCycle / HwSampleInstructionRetired), accessed
// through QueueItem::hwSample. Filled by the sampling thread from a
// PERF_RECORD_SAMPLE with PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_TIME.
struct QueueHwSample
{
    int64_t time;    // sample timestamp; converted to TSC via ConvertTimeToTsc() when TRACY_HW_TIMER is active on x86
    uint64_t thread; // sampled thread id (u32 tid from the perf record, widened to 64 bits)
    uint64_t ip;     // instruction pointer at the moment the counter overflowed
};
enum class PlotFormatType : uint8_t enum class PlotFormatType : uint8_t
{ {
Number, Number,
@ -567,6 +576,7 @@ struct QueueItem
QueueContextSwitch contextSwitch; QueueContextSwitch contextSwitch;
QueueThreadWakeup threadWakeup; QueueThreadWakeup threadWakeup;
QueueTidToPid tidToPid; QueueTidToPid tidToPid;
QueueHwSample hwSample;
QueuePlotConfig plotConfig; QueuePlotConfig plotConfig;
QueueParamSetup paramSetup; QueueParamSetup paramSetup;
QueueCpuTopology cpuTopology; QueueCpuTopology cpuTopology;
@ -653,6 +663,8 @@ static constexpr size_t QueueDataSize[] = {
sizeof( QueueHeader ) + sizeof( QueueCodeInformation ), sizeof( QueueHeader ) + sizeof( QueueCodeInformation ),
sizeof( QueueHeader ) + sizeof( QueueSysTime ), sizeof( QueueHeader ) + sizeof( QueueSysTime ),
sizeof( QueueHeader ) + sizeof( QueueTidToPid ), sizeof( QueueHeader ) + sizeof( QueueTidToPid ),
sizeof( QueueHeader ) + sizeof( QueueHwSample ), // cpu cycle
sizeof( QueueHeader ) + sizeof( QueueHwSample ), // instruction retired
sizeof( QueueHeader ) + sizeof( QueuePlotConfig ), sizeof( QueueHeader ) + sizeof( QueuePlotConfig ),
sizeof( QueueHeader ) + sizeof( QueueParamSetup ), sizeof( QueueHeader ) + sizeof( QueueParamSetup ),
sizeof( QueueHeader ), // server query acknowledgement sizeof( QueueHeader ), // server query acknowledgement