mirror of
https://github.com/wolfpld/tracy.git
synced 2024-11-22 22:44:34 +00:00
Collect CPU cycles and instruction retirement events.
This commit is contained in:
parent
16101571e0
commit
7794443453
@ -678,7 +678,9 @@ static int perf_event_open( struct perf_event_attr* hw_event, pid_t pid, int cpu
|
|||||||
|
|
||||||
enum TraceEventId
|
enum TraceEventId
|
||||||
{
|
{
|
||||||
EventCallstack
|
EventCallstack,
|
||||||
|
EventCpuCycles,
|
||||||
|
EventInstructionsRetired
|
||||||
};
|
};
|
||||||
|
|
||||||
static void SetupSampling( int64_t& samplingPeriod )
|
static void SetupSampling( int64_t& samplingPeriod )
|
||||||
@ -690,22 +692,20 @@ static void SetupSampling( int64_t& samplingPeriod )
|
|||||||
samplingPeriod = GetSamplingPeriod();
|
samplingPeriod = GetSamplingPeriod();
|
||||||
|
|
||||||
s_numCpus = (int)std::thread::hardware_concurrency();
|
s_numCpus = (int)std::thread::hardware_concurrency();
|
||||||
s_ring = (RingBuffer<RingBufSize>*)tracy_malloc( sizeof( RingBuffer<RingBufSize> ) * s_numCpus );
|
s_ring = (RingBuffer<RingBufSize>*)tracy_malloc( sizeof( RingBuffer<RingBufSize> ) * s_numCpus * 3 );
|
||||||
s_numBuffers = 0;
|
s_numBuffers = 0;
|
||||||
|
|
||||||
|
// Stack traces
|
||||||
perf_event_attr pe = {};
|
perf_event_attr pe = {};
|
||||||
|
|
||||||
pe.type = PERF_TYPE_SOFTWARE;
|
pe.type = PERF_TYPE_SOFTWARE;
|
||||||
pe.size = sizeof( perf_event_attr );
|
pe.size = sizeof( perf_event_attr );
|
||||||
pe.config = PERF_COUNT_SW_CPU_CLOCK;
|
pe.config = PERF_COUNT_SW_CPU_CLOCK;
|
||||||
|
|
||||||
pe.sample_freq = GetSamplingFrequency();
|
pe.sample_freq = GetSamplingFrequency();
|
||||||
pe.sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_CALLCHAIN;
|
pe.sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_CALLCHAIN;
|
||||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION( 4, 8, 0 )
|
#if LINUX_VERSION_CODE >= KERNEL_VERSION( 4, 8, 0 )
|
||||||
pe.sample_max_stack = 127;
|
pe.sample_max_stack = 127;
|
||||||
#endif
|
#endif
|
||||||
pe.exclude_callchain_kernel = 1;
|
pe.exclude_callchain_kernel = 1;
|
||||||
|
|
||||||
pe.disabled = 1;
|
pe.disabled = 1;
|
||||||
pe.freq = 1;
|
pe.freq = 1;
|
||||||
#if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 )
|
#if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 )
|
||||||
@ -726,6 +726,46 @@ static void SetupSampling( int64_t& samplingPeriod )
|
|||||||
s_numBuffers++;
|
s_numBuffers++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CPU cycles
|
||||||
|
pe = {};
|
||||||
|
pe.type = PERF_TYPE_HARDWARE;
|
||||||
|
pe.size = sizeof( perf_event_attr );
|
||||||
|
pe.config = PERF_COUNT_HW_CPU_CYCLES;
|
||||||
|
pe.sample_freq = 25*1000*1000;
|
||||||
|
pe.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_TIME;
|
||||||
|
pe.disabled = 1;
|
||||||
|
pe.exclude_kernel = 1;
|
||||||
|
pe.exclude_idle = 1;
|
||||||
|
pe.precise_ip = 2;
|
||||||
|
#if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 )
|
||||||
|
pe.use_clockid = 1;
|
||||||
|
pe.clockid = CLOCK_MONOTONIC_RAW;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
for( int i=0; i<s_numCpus; i++ )
|
||||||
|
{
|
||||||
|
const int fd = perf_event_open( &pe, -1, i, -1, PERF_FLAG_FD_CLOEXEC );
|
||||||
|
if( fd != -1 )
|
||||||
|
{
|
||||||
|
new( s_ring+s_numBuffers ) RingBuffer<RingBufSize>( fd, EventCpuCycles );
|
||||||
|
s_numBuffers++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Instructions retired
|
||||||
|
pe.config = PERF_COUNT_HW_INSTRUCTIONS;
|
||||||
|
|
||||||
|
for( int i=0; i<s_numCpus; i++ )
|
||||||
|
{
|
||||||
|
const int fd = perf_event_open( &pe, -1, i, -1, PERF_FLAG_FD_CLOEXEC );
|
||||||
|
if( fd != -1 )
|
||||||
|
{
|
||||||
|
new( s_ring+s_numBuffers ) RingBuffer<RingBufSize>( fd, EventInstructionsRetired );
|
||||||
|
s_numBuffers++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
s_threadSampling = (Thread*)tracy_malloc( sizeof( Thread ) );
|
s_threadSampling = (Thread*)tracy_malloc( sizeof( Thread ) );
|
||||||
new(s_threadSampling) Thread( [] (void*) {
|
new(s_threadSampling) Thread( [] (void*) {
|
||||||
ThreadExitHandler threadExitHandler;
|
ThreadExitHandler threadExitHandler;
|
||||||
@ -760,77 +800,133 @@ static void SetupSampling( int64_t& samplingPeriod )
|
|||||||
s_ring[i].Read( &hdr, 0, sizeof( perf_event_header ) );
|
s_ring[i].Read( &hdr, 0, sizeof( perf_event_header ) );
|
||||||
if( hdr.type == PERF_RECORD_SAMPLE )
|
if( hdr.type == PERF_RECORD_SAMPLE )
|
||||||
{
|
{
|
||||||
uint32_t pid, tid;
|
|
||||||
uint64_t t0;
|
|
||||||
uint64_t cnt;
|
|
||||||
|
|
||||||
auto offset = sizeof( perf_event_header );
|
auto offset = sizeof( perf_event_header );
|
||||||
s_ring[i].Read( &pid, offset, sizeof( uint32_t ) );
|
const auto id = s_ring[i].GetId();
|
||||||
if( pid == currentPid )
|
if( id == EventCallstack )
|
||||||
{
|
{
|
||||||
offset += sizeof( uint32_t );
|
// Layout:
|
||||||
s_ring[i].Read( &tid, offset, sizeof( uint32_t ) );
|
// u32 pid, tid
|
||||||
offset += sizeof( uint32_t );
|
// u64 time
|
||||||
s_ring[i].Read( &t0, offset, sizeof( uint64_t ) );
|
// u64 cnt
|
||||||
offset += sizeof( uint64_t );
|
// u64 ip[cnt]
|
||||||
s_ring[i].Read( &cnt, offset, sizeof( uint64_t ) );
|
|
||||||
offset += sizeof( uint64_t );
|
|
||||||
|
|
||||||
if( cnt > 0 )
|
uint32_t pid;
|
||||||
|
s_ring[i].Read( &pid, offset, sizeof( uint32_t ) );
|
||||||
|
if( pid == currentPid )
|
||||||
{
|
{
|
||||||
auto trace = (uint64_t*)tracy_malloc( ( 1 + cnt ) * sizeof( uint64_t ) );
|
uint32_t tid;
|
||||||
s_ring[i].Read( trace+1, offset, sizeof( uint64_t ) * cnt );
|
uint64_t t0;
|
||||||
|
uint64_t cnt;
|
||||||
|
|
||||||
|
offset += sizeof( uint32_t );
|
||||||
|
s_ring[i].Read( &tid, offset, sizeof( uint32_t ) );
|
||||||
|
offset += sizeof( uint32_t );
|
||||||
|
s_ring[i].Read( &t0, offset, sizeof( uint64_t ) );
|
||||||
|
offset += sizeof( uint64_t );
|
||||||
|
s_ring[i].Read( &cnt, offset, sizeof( uint64_t ) );
|
||||||
|
offset += sizeof( uint64_t );
|
||||||
|
|
||||||
|
if( cnt > 0 )
|
||||||
|
{
|
||||||
|
auto trace = (uint64_t*)tracy_malloc( ( 1 + cnt ) * sizeof( uint64_t ) );
|
||||||
|
s_ring[i].Read( trace+1, offset, sizeof( uint64_t ) * cnt );
|
||||||
|
|
||||||
#if defined __x86_64__ || defined _M_X64
|
#if defined __x86_64__ || defined _M_X64
|
||||||
// remove non-canonical pointers
|
// remove non-canonical pointers
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
const auto test = (int64_t)trace[cnt];
|
const auto test = (int64_t)trace[cnt];
|
||||||
const auto m1 = test >> 63;
|
const auto m1 = test >> 63;
|
||||||
const auto m2 = test >> 47;
|
const auto m2 = test >> 47;
|
||||||
if( m1 == m2 ) break;
|
if( m1 == m2 ) break;
|
||||||
}
|
}
|
||||||
while( --cnt > 0 );
|
while( --cnt > 0 );
|
||||||
for( uint64_t j=1; j<cnt; j++ )
|
for( uint64_t j=1; j<cnt; j++ )
|
||||||
{
|
{
|
||||||
const auto test = (int64_t)trace[j];
|
const auto test = (int64_t)trace[j];
|
||||||
const auto m1 = test >> 63;
|
const auto m1 = test >> 63;
|
||||||
const auto m2 = test >> 47;
|
const auto m2 = test >> 47;
|
||||||
if( m1 != m2 ) trace[j] = 0;
|
if( m1 != m2 ) trace[j] = 0;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// skip kernel frames
|
// skip kernel frames
|
||||||
uint64_t j;
|
uint64_t j;
|
||||||
for( j=0; j<cnt; j++ )
|
for( j=0; j<cnt; j++ )
|
||||||
{
|
|
||||||
if( (int64_t)trace[j+1] >= 0 ) break;
|
|
||||||
}
|
|
||||||
if( j == cnt )
|
|
||||||
{
|
|
||||||
tracy_free( trace );
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if( j > 0 )
|
|
||||||
{
|
{
|
||||||
cnt -= j;
|
if( (int64_t)trace[j+1] >= 0 ) break;
|
||||||
memmove( trace+1, trace+1+j, sizeof( uint64_t ) * cnt );
|
|
||||||
}
|
}
|
||||||
memcpy( trace, &cnt, sizeof( uint64_t ) );
|
if( j == cnt )
|
||||||
|
{
|
||||||
|
tracy_free( trace );
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if( j > 0 )
|
||||||
|
{
|
||||||
|
cnt -= j;
|
||||||
|
memmove( trace+1, trace+1+j, sizeof( uint64_t ) * cnt );
|
||||||
|
}
|
||||||
|
memcpy( trace, &cnt, sizeof( uint64_t ) );
|
||||||
|
|
||||||
#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 )
|
#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 )
|
||||||
t0 = s_ring[i].ConvertTimeToTsc( t0 );
|
t0 = s_ring[i].ConvertTimeToTsc( t0 );
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
TracyLfqPrepare( QueueType::CallstackSample );
|
TracyLfqPrepare( QueueType::CallstackSample );
|
||||||
MemWrite( &item->callstackSampleFat.time, t0 );
|
MemWrite( &item->callstackSampleFat.time, t0 );
|
||||||
MemWrite( &item->callstackSampleFat.thread, (uint64_t)tid );
|
MemWrite( &item->callstackSampleFat.thread, (uint64_t)tid );
|
||||||
MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace );
|
MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace );
|
||||||
TracyLfqCommit;
|
TracyLfqCommit;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Layout:
|
||||||
|
// u64 ip
|
||||||
|
// u32 pid, tid
|
||||||
|
// u64 time
|
||||||
|
|
||||||
|
uint32_t pid;
|
||||||
|
s_ring[i].Read( &pid, offset + sizeof( uint64_t ), sizeof( uint32_t ) );
|
||||||
|
if( pid == currentPid )
|
||||||
|
{
|
||||||
|
uint64_t ip, t0;
|
||||||
|
uint32_t tid;
|
||||||
|
|
||||||
|
s_ring[i].Read( &ip, offset, sizeof( uint64_t ) );
|
||||||
|
offset += sizeof( uint64_t ) + sizeof( uint32_t );
|
||||||
|
s_ring[i].Read( &tid, offset, sizeof( uint32_t ) );
|
||||||
|
offset += sizeof( uint32_t );
|
||||||
|
s_ring[i].Read( &t0, offset, sizeof( uint64_t ) );
|
||||||
|
|
||||||
|
#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 )
|
||||||
|
t0 = s_ring[i].ConvertTimeToTsc( t0 );
|
||||||
|
#endif
|
||||||
|
|
||||||
|
QueueType type;
|
||||||
|
switch( id )
|
||||||
|
{
|
||||||
|
case EventCpuCycles:
|
||||||
|
type = QueueType::HwSampleCpuCycle;
|
||||||
|
break;
|
||||||
|
case EventInstructionsRetired:
|
||||||
|
type = QueueType::HwSampleInstructionRetired;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
assert( false );
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
TracyLfqPrepare( type );
|
||||||
|
MemWrite( &item->hwSample.ip, ip );
|
||||||
|
MemWrite( &item->hwSample.thread, (uint64_t)tid );
|
||||||
|
MemWrite( &item->hwSample.time, t0 );
|
||||||
|
TracyLfqCommit;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
s_ring[i].Advance( hdr.size );
|
s_ring[i].Advance( hdr.size );
|
||||||
}
|
}
|
||||||
|
@ -9,7 +9,7 @@ namespace tracy
|
|||||||
|
|
||||||
constexpr unsigned Lz4CompressBound( unsigned isize ) { return isize + ( isize / 255 ) + 16; }
|
constexpr unsigned Lz4CompressBound( unsigned isize ) { return isize + ( isize / 255 ) + 16; }
|
||||||
|
|
||||||
enum : uint32_t { ProtocolVersion = 46 };
|
enum : uint32_t { ProtocolVersion = 47 };
|
||||||
enum : uint16_t { BroadcastVersion = 2 };
|
enum : uint16_t { BroadcastVersion = 2 };
|
||||||
|
|
||||||
using lz4sz_t = uint32_t;
|
using lz4sz_t = uint32_t;
|
||||||
|
@ -82,6 +82,8 @@ enum class QueueType : uint8_t
|
|||||||
CodeInformation,
|
CodeInformation,
|
||||||
SysTimeReport,
|
SysTimeReport,
|
||||||
TidToPid,
|
TidToPid,
|
||||||
|
HwSampleCpuCycle,
|
||||||
|
HwSampleInstructionRetired,
|
||||||
PlotConfig,
|
PlotConfig,
|
||||||
ParamSetup,
|
ParamSetup,
|
||||||
AckServerQueryNoop,
|
AckServerQueryNoop,
|
||||||
@ -473,6 +475,13 @@ struct QueueTidToPid
|
|||||||
uint64_t pid;
|
uint64_t pid;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct QueueHwSample
|
||||||
|
{
|
||||||
|
int64_t time;
|
||||||
|
uint64_t thread;
|
||||||
|
uint64_t ip;
|
||||||
|
};
|
||||||
|
|
||||||
enum class PlotFormatType : uint8_t
|
enum class PlotFormatType : uint8_t
|
||||||
{
|
{
|
||||||
Number,
|
Number,
|
||||||
@ -567,6 +576,7 @@ struct QueueItem
|
|||||||
QueueContextSwitch contextSwitch;
|
QueueContextSwitch contextSwitch;
|
||||||
QueueThreadWakeup threadWakeup;
|
QueueThreadWakeup threadWakeup;
|
||||||
QueueTidToPid tidToPid;
|
QueueTidToPid tidToPid;
|
||||||
|
QueueHwSample hwSample;
|
||||||
QueuePlotConfig plotConfig;
|
QueuePlotConfig plotConfig;
|
||||||
QueueParamSetup paramSetup;
|
QueueParamSetup paramSetup;
|
||||||
QueueCpuTopology cpuTopology;
|
QueueCpuTopology cpuTopology;
|
||||||
@ -653,6 +663,8 @@ static constexpr size_t QueueDataSize[] = {
|
|||||||
sizeof( QueueHeader ) + sizeof( QueueCodeInformation ),
|
sizeof( QueueHeader ) + sizeof( QueueCodeInformation ),
|
||||||
sizeof( QueueHeader ) + sizeof( QueueSysTime ),
|
sizeof( QueueHeader ) + sizeof( QueueSysTime ),
|
||||||
sizeof( QueueHeader ) + sizeof( QueueTidToPid ),
|
sizeof( QueueHeader ) + sizeof( QueueTidToPid ),
|
||||||
|
sizeof( QueueHeader ) + sizeof( QueueHwSample ), // cpu cycle
|
||||||
|
sizeof( QueueHeader ) + sizeof( QueueHwSample ), // instruction retired
|
||||||
sizeof( QueueHeader ) + sizeof( QueuePlotConfig ),
|
sizeof( QueueHeader ) + sizeof( QueuePlotConfig ),
|
||||||
sizeof( QueueHeader ) + sizeof( QueueParamSetup ),
|
sizeof( QueueHeader ) + sizeof( QueueParamSetup ),
|
||||||
sizeof( QueueHeader ), // server query acknowledgement
|
sizeof( QueueHeader ), // server query acknowledgement
|
||||||
|
Loading…
Reference in New Issue
Block a user