mirror of
https://github.com/wolfpld/tracy.git
synced 2024-11-26 07:54:36 +00:00
5741bcfd32
If call stack capture is enabled for context switch data, the 64KB buffer is too small to work without overruns. However, if the default buffer size is increased, then the maximum locked memory limit is hit. This change keeps the small buffer size for all the buffers that may be used without escalated privileges. The context switch buffer is bigger, but it does not need to obey the limits, as the application is running as root, if it is to be used.
1533 lines
51 KiB
C++
1533 lines
51 KiB
C++
#include "TracyDebug.hpp"
|
|
#include "TracyStringHelpers.hpp"
|
|
#include "TracySysTrace.hpp"
|
|
#include "../common/TracySystem.hpp"
|
|
|
|
#ifdef TRACY_HAS_SYSTEM_TRACING
|
|
|
|
#ifndef TRACY_SAMPLING_HZ
|
|
# if defined _WIN32
|
|
# define TRACY_SAMPLING_HZ 8000
|
|
# elif defined __linux__
|
|
# define TRACY_SAMPLING_HZ 10000
|
|
# endif
|
|
#endif
|
|
|
|
namespace tracy
|
|
{
|
|
|
|
static constexpr int GetSamplingFrequency()
|
|
{
|
|
#if defined _WIN32
|
|
return TRACY_SAMPLING_HZ > 8000 ? 8000 : ( TRACY_SAMPLING_HZ < 1 ? 1 : TRACY_SAMPLING_HZ );
|
|
#else
|
|
return TRACY_SAMPLING_HZ > 1000000 ? 1000000 : ( TRACY_SAMPLING_HZ < 1 ? 1 : TRACY_SAMPLING_HZ );
|
|
#endif
|
|
}
|
|
|
|
static constexpr int GetSamplingPeriod()
|
|
{
|
|
return 1000000000 / GetSamplingFrequency();
|
|
}
|
|
|
|
}
|
|
|
|
# if defined _WIN32
|
|
|
|
# ifndef NOMINMAX
|
|
# define NOMINMAX
|
|
# endif
|
|
|
|
# define INITGUID
|
|
# include <assert.h>
|
|
# include <string.h>
|
|
# include <windows.h>
|
|
# include <dbghelp.h>
|
|
# include <evntrace.h>
|
|
# include <evntcons.h>
|
|
# include <psapi.h>
|
|
# include <winternl.h>
|
|
|
|
# include "../common/TracyAlloc.hpp"
|
|
# include "../common/TracySystem.hpp"
|
|
# include "TracyProfiler.hpp"
|
|
# include "TracyThread.hpp"
|
|
|
|
namespace tracy
|
|
{
|
|
|
|
static const GUID PerfInfoGuid = { 0xce1dbfb4, 0x137e, 0x4da6, { 0x87, 0xb0, 0x3f, 0x59, 0xaa, 0x10, 0x2c, 0xbc } };
|
|
static const GUID DxgKrnlGuid = { 0x802ec45a, 0x1e99, 0x4b83, { 0x99, 0x20, 0x87, 0xc9, 0x82, 0x77, 0xba, 0x9d } };
|
|
static const GUID ThreadV2Guid = { 0x3d6fa8d1, 0xfe05, 0x11d0, { 0x9d, 0xda, 0x00, 0xc0, 0x4f, 0xd7, 0xba, 0x7c } };
|
|
|
|
|
|
static TRACEHANDLE s_traceHandle;
|
|
static TRACEHANDLE s_traceHandle2;
|
|
static EVENT_TRACE_PROPERTIES* s_prop;
|
|
static DWORD s_pid;
|
|
|
|
static EVENT_TRACE_PROPERTIES* s_propVsync;
|
|
static TRACEHANDLE s_traceHandleVsync;
|
|
static TRACEHANDLE s_traceHandleVsync2;
|
|
Thread* s_threadVsync = nullptr;
|
|
|
|
struct CSwitch
|
|
{
|
|
uint32_t newThreadId;
|
|
uint32_t oldThreadId;
|
|
int8_t newThreadPriority;
|
|
int8_t oldThreadPriority;
|
|
uint8_t previousCState;
|
|
int8_t spareByte;
|
|
int8_t oldThreadWaitReason;
|
|
int8_t oldThreadWaitMode;
|
|
int8_t oldThreadState;
|
|
int8_t oldThreadWaitIdealProcessor;
|
|
uint32_t newThreadWaitTime;
|
|
uint32_t reserved;
|
|
};
|
|
|
|
struct ReadyThread
|
|
{
|
|
uint32_t threadId;
|
|
int8_t adjustReason;
|
|
int8_t adjustIncrement;
|
|
int8_t flag;
|
|
int8_t reserverd;
|
|
};
|
|
|
|
struct ThreadTrace
|
|
{
|
|
uint32_t processId;
|
|
uint32_t threadId;
|
|
uint32_t stackBase;
|
|
uint32_t stackLimit;
|
|
uint32_t userStackBase;
|
|
uint32_t userStackLimit;
|
|
uint32_t startAddr;
|
|
uint32_t win32StartAddr;
|
|
uint32_t tebBase;
|
|
uint32_t subProcessTag;
|
|
};
|
|
|
|
struct StackWalkEvent
|
|
{
|
|
uint64_t eventTimeStamp;
|
|
uint32_t stackProcess;
|
|
uint32_t stackThread;
|
|
uint64_t stack[192];
|
|
};
|
|
|
|
struct VSyncInfo
|
|
{
|
|
void* dxgAdapter;
|
|
uint32_t vidPnTargetId;
|
|
uint64_t scannedPhysicalAddress;
|
|
uint32_t vidPnSourceId;
|
|
uint32_t frameNumber;
|
|
int64_t frameQpcTime;
|
|
void* hFlipDevice;
|
|
uint32_t flipType;
|
|
uint64_t flipFenceId;
|
|
};
|
|
|
|
extern "C" typedef NTSTATUS (WINAPI *t_NtQueryInformationThread)( HANDLE, THREADINFOCLASS, PVOID, ULONG, PULONG );
|
|
extern "C" typedef BOOL (WINAPI *t_EnumProcessModules)( HANDLE, HMODULE*, DWORD, LPDWORD );
|
|
extern "C" typedef BOOL (WINAPI *t_GetModuleInformation)( HANDLE, HMODULE, LPMODULEINFO, DWORD );
|
|
extern "C" typedef DWORD (WINAPI *t_GetModuleBaseNameA)( HANDLE, HMODULE, LPSTR, DWORD );
|
|
extern "C" typedef HRESULT (WINAPI *t_GetThreadDescription)( HANDLE, PWSTR* );
|
|
|
|
t_NtQueryInformationThread NtQueryInformationThread = (t_NtQueryInformationThread)GetProcAddress( GetModuleHandleA( "ntdll.dll" ), "NtQueryInformationThread" );
|
|
t_EnumProcessModules _EnumProcessModules = (t_EnumProcessModules)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "K32EnumProcessModules" );
|
|
t_GetModuleInformation _GetModuleInformation = (t_GetModuleInformation)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "K32GetModuleInformation" );
|
|
t_GetModuleBaseNameA _GetModuleBaseNameA = (t_GetModuleBaseNameA)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "K32GetModuleBaseNameA" );
|
|
|
|
static t_GetThreadDescription _GetThreadDescription = 0;
|
|
|
|
|
|
void WINAPI EventRecordCallback( PEVENT_RECORD record )
|
|
{
|
|
#ifdef TRACY_ON_DEMAND
|
|
if( !GetProfiler().IsConnected() ) return;
|
|
#endif
|
|
|
|
const auto& hdr = record->EventHeader;
|
|
switch( hdr.ProviderId.Data1 )
|
|
{
|
|
case 0x3d6fa8d1: // Thread Guid
|
|
if( hdr.EventDescriptor.Opcode == 36 )
|
|
{
|
|
const auto cswitch = (const CSwitch*)record->UserData;
|
|
|
|
TracyLfqPrepare( QueueType::ContextSwitch );
|
|
MemWrite( &item->contextSwitch.time, hdr.TimeStamp.QuadPart );
|
|
MemWrite( &item->contextSwitch.oldThread, cswitch->oldThreadId );
|
|
MemWrite( &item->contextSwitch.newThread, cswitch->newThreadId );
|
|
MemWrite( &item->contextSwitch.cpu, record->BufferContext.ProcessorNumber );
|
|
MemWrite( &item->contextSwitch.reason, cswitch->oldThreadWaitReason );
|
|
MemWrite( &item->contextSwitch.state, cswitch->oldThreadState );
|
|
TracyLfqCommit;
|
|
}
|
|
else if( hdr.EventDescriptor.Opcode == 50 )
|
|
{
|
|
const auto rt = (const ReadyThread*)record->UserData;
|
|
|
|
TracyLfqPrepare( QueueType::ThreadWakeup );
|
|
MemWrite( &item->threadWakeup.time, hdr.TimeStamp.QuadPart );
|
|
MemWrite( &item->threadWakeup.thread, rt->threadId );
|
|
TracyLfqCommit;
|
|
}
|
|
else if( hdr.EventDescriptor.Opcode == 1 || hdr.EventDescriptor.Opcode == 3 )
|
|
{
|
|
const auto tt = (const ThreadTrace*)record->UserData;
|
|
|
|
uint64_t tid = tt->threadId;
|
|
if( tid == 0 ) return;
|
|
uint64_t pid = tt->processId;
|
|
TracyLfqPrepare( QueueType::TidToPid );
|
|
MemWrite( &item->tidToPid.tid, tid );
|
|
MemWrite( &item->tidToPid.pid, pid );
|
|
TracyLfqCommit;
|
|
}
|
|
break;
|
|
case 0xdef2fe46: // StackWalk Guid
|
|
if( hdr.EventDescriptor.Opcode == 32 )
|
|
{
|
|
const auto sw = (const StackWalkEvent*)record->UserData;
|
|
if( sw->stackProcess == s_pid )
|
|
{
|
|
const uint64_t sz = ( record->UserDataLength - 16 ) / 8;
|
|
if( sz > 0 )
|
|
{
|
|
auto trace = (uint64_t*)tracy_malloc( ( 1 + sz ) * sizeof( uint64_t ) );
|
|
memcpy( trace, &sz, sizeof( uint64_t ) );
|
|
memcpy( trace+1, sw->stack, sizeof( uint64_t ) * sz );
|
|
TracyLfqPrepare( QueueType::CallstackSample );
|
|
MemWrite( &item->callstackSampleFat.time, sw->eventTimeStamp );
|
|
MemWrite( &item->callstackSampleFat.thread, sw->stackThread );
|
|
MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace );
|
|
TracyLfqCommit;
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
static constexpr const char* VsyncName[] = {
|
|
"[0] Vsync",
|
|
"[1] Vsync",
|
|
"[2] Vsync",
|
|
"[3] Vsync",
|
|
"[4] Vsync",
|
|
"[5] Vsync",
|
|
"[6] Vsync",
|
|
"[7] Vsync",
|
|
"Vsync"
|
|
};
|
|
|
|
static uint32_t VsyncTarget[8] = {};
|
|
|
|
void WINAPI EventRecordCallbackVsync( PEVENT_RECORD record )
|
|
{
|
|
#ifdef TRACY_ON_DEMAND
|
|
if( !GetProfiler().IsConnected() ) return;
|
|
#endif
|
|
|
|
const auto& hdr = record->EventHeader;
|
|
assert( hdr.ProviderId.Data1 == 0x802EC45A );
|
|
assert( hdr.EventDescriptor.Id == 0x0011 );
|
|
|
|
const auto vs = (const VSyncInfo*)record->UserData;
|
|
|
|
int idx = 0;
|
|
do
|
|
{
|
|
if( VsyncTarget[idx] == 0 )
|
|
{
|
|
VsyncTarget[idx] = vs->vidPnTargetId;
|
|
break;
|
|
}
|
|
else if( VsyncTarget[idx] == vs->vidPnTargetId )
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
while( ++idx < 8 );
|
|
|
|
TracyLfqPrepare( QueueType::FrameMarkMsg );
|
|
MemWrite( &item->frameMark.time, hdr.TimeStamp.QuadPart );
|
|
MemWrite( &item->frameMark.name, uint64_t( VsyncName[idx] ) );
|
|
TracyLfqCommit;
|
|
}
|
|
|
|
static void SetupVsync()
|
|
{
|
|
#if _WIN32_WINNT >= _WIN32_WINNT_WINBLUE
|
|
const auto psz = sizeof( EVENT_TRACE_PROPERTIES ) + MAX_PATH;
|
|
s_propVsync = (EVENT_TRACE_PROPERTIES*)tracy_malloc( psz );
|
|
memset( s_propVsync, 0, sizeof( EVENT_TRACE_PROPERTIES ) );
|
|
s_propVsync->LogFileMode = EVENT_TRACE_REAL_TIME_MODE;
|
|
s_propVsync->Wnode.BufferSize = psz;
|
|
#ifdef TRACY_TIMER_QPC
|
|
s_propVsync->Wnode.ClientContext = 1;
|
|
#else
|
|
s_propVsync->Wnode.ClientContext = 3;
|
|
#endif
|
|
s_propVsync->LoggerNameOffset = sizeof( EVENT_TRACE_PROPERTIES );
|
|
strcpy( ((char*)s_propVsync) + sizeof( EVENT_TRACE_PROPERTIES ), "TracyVsync" );
|
|
|
|
auto backup = tracy_malloc( psz );
|
|
memcpy( backup, s_propVsync, psz );
|
|
|
|
const auto controlStatus = ControlTraceA( 0, "TracyVsync", s_propVsync, EVENT_TRACE_CONTROL_STOP );
|
|
if( controlStatus != ERROR_SUCCESS && controlStatus != ERROR_WMI_INSTANCE_NOT_FOUND )
|
|
{
|
|
tracy_free( backup );
|
|
tracy_free( s_propVsync );
|
|
return;
|
|
}
|
|
|
|
memcpy( s_propVsync, backup, psz );
|
|
tracy_free( backup );
|
|
|
|
const auto startStatus = StartTraceA( &s_traceHandleVsync, "TracyVsync", s_propVsync );
|
|
if( startStatus != ERROR_SUCCESS )
|
|
{
|
|
tracy_free( s_propVsync );
|
|
return;
|
|
}
|
|
|
|
EVENT_FILTER_EVENT_ID fe = {};
|
|
fe.FilterIn = TRUE;
|
|
fe.Count = 1;
|
|
fe.Events[0] = 0x0011; // VSyncDPC_Info
|
|
|
|
EVENT_FILTER_DESCRIPTOR desc = {};
|
|
desc.Ptr = (ULONGLONG)&fe;
|
|
desc.Size = sizeof( fe );
|
|
desc.Type = EVENT_FILTER_TYPE_EVENT_ID;
|
|
|
|
ENABLE_TRACE_PARAMETERS params = {};
|
|
params.Version = ENABLE_TRACE_PARAMETERS_VERSION_2;
|
|
params.EnableProperty = EVENT_ENABLE_PROPERTY_IGNORE_KEYWORD_0;
|
|
params.SourceId = s_propVsync->Wnode.Guid;
|
|
params.EnableFilterDesc = &desc;
|
|
params.FilterDescCount = 1;
|
|
|
|
uint64_t mask = 0x4000000000000001; // Microsoft_Windows_DxgKrnl_Performance | Base
|
|
if( EnableTraceEx2( s_traceHandleVsync, &DxgKrnlGuid, EVENT_CONTROL_CODE_ENABLE_PROVIDER, TRACE_LEVEL_INFORMATION, mask, mask, 0, ¶ms ) != ERROR_SUCCESS )
|
|
{
|
|
tracy_free( s_propVsync );
|
|
return;
|
|
}
|
|
|
|
char loggerName[MAX_PATH];
|
|
strcpy( loggerName, "TracyVsync" );
|
|
|
|
EVENT_TRACE_LOGFILEA log = {};
|
|
log.LoggerName = loggerName;
|
|
log.ProcessTraceMode = PROCESS_TRACE_MODE_REAL_TIME | PROCESS_TRACE_MODE_EVENT_RECORD | PROCESS_TRACE_MODE_RAW_TIMESTAMP;
|
|
log.EventRecordCallback = EventRecordCallbackVsync;
|
|
|
|
s_traceHandleVsync2 = OpenTraceA( &log );
|
|
if( s_traceHandleVsync2 == (TRACEHANDLE)INVALID_HANDLE_VALUE )
|
|
{
|
|
CloseTrace( s_traceHandleVsync );
|
|
tracy_free( s_propVsync );
|
|
return;
|
|
}
|
|
|
|
s_threadVsync = (Thread*)tracy_malloc( sizeof( Thread ) );
|
|
new(s_threadVsync) Thread( [] (void*) {
|
|
ThreadExitHandler threadExitHandler;
|
|
SetThreadPriority( GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL );
|
|
SetThreadName( "Tracy Vsync" );
|
|
ProcessTrace( &s_traceHandleVsync2, 1, nullptr, nullptr );
|
|
}, nullptr );
|
|
#endif
|
|
}
|
|
|
|
static constexpr int GetSamplingInterval()
|
|
{
|
|
return GetSamplingPeriod() / 100;
|
|
}
|
|
|
|
bool SysTraceStart( int64_t& samplingPeriod )
|
|
{
|
|
if( !_GetThreadDescription ) _GetThreadDescription = (t_GetThreadDescription)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "GetThreadDescription" );
|
|
|
|
s_pid = GetCurrentProcessId();
|
|
|
|
#if defined _WIN64
|
|
constexpr bool isOs64Bit = true;
|
|
#else
|
|
BOOL _iswow64;
|
|
IsWow64Process( GetCurrentProcess(), &_iswow64 );
|
|
const bool isOs64Bit = _iswow64;
|
|
#endif
|
|
|
|
TOKEN_PRIVILEGES priv = {};
|
|
priv.PrivilegeCount = 1;
|
|
priv.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
|
|
if( LookupPrivilegeValue( nullptr, SE_SYSTEM_PROFILE_NAME, &priv.Privileges[0].Luid ) == 0 ) return false;
|
|
|
|
HANDLE pt;
|
|
if( OpenProcessToken( GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &pt ) == 0 ) return false;
|
|
const auto adjust = AdjustTokenPrivileges( pt, FALSE, &priv, 0, nullptr, nullptr );
|
|
CloseHandle( pt );
|
|
if( adjust == 0 ) return false;
|
|
const auto status = GetLastError();
|
|
if( status != ERROR_SUCCESS ) return false;
|
|
|
|
if( isOs64Bit )
|
|
{
|
|
TRACE_PROFILE_INTERVAL interval = {};
|
|
interval.Interval = GetSamplingInterval();
|
|
const auto intervalStatus = TraceSetInformation( 0, TraceSampledProfileIntervalInfo, &interval, sizeof( interval ) );
|
|
if( intervalStatus != ERROR_SUCCESS ) return false;
|
|
samplingPeriod = GetSamplingPeriod();
|
|
}
|
|
|
|
const auto psz = sizeof( EVENT_TRACE_PROPERTIES ) + sizeof( KERNEL_LOGGER_NAME );
|
|
s_prop = (EVENT_TRACE_PROPERTIES*)tracy_malloc( psz );
|
|
memset( s_prop, 0, sizeof( EVENT_TRACE_PROPERTIES ) );
|
|
ULONG flags = 0;
|
|
#ifndef TRACY_NO_CONTEXT_SWITCH
|
|
flags = EVENT_TRACE_FLAG_CSWITCH | EVENT_TRACE_FLAG_DISPATCHER | EVENT_TRACE_FLAG_THREAD;
|
|
#endif
|
|
#ifndef TRACY_NO_SAMPLING
|
|
if( isOs64Bit ) flags |= EVENT_TRACE_FLAG_PROFILE;
|
|
#endif
|
|
s_prop->EnableFlags = flags;
|
|
s_prop->LogFileMode = EVENT_TRACE_REAL_TIME_MODE;
|
|
s_prop->Wnode.BufferSize = psz;
|
|
s_prop->Wnode.Flags = WNODE_FLAG_TRACED_GUID;
|
|
#ifdef TRACY_TIMER_QPC
|
|
s_prop->Wnode.ClientContext = 1;
|
|
#else
|
|
s_prop->Wnode.ClientContext = 3;
|
|
#endif
|
|
s_prop->Wnode.Guid = SystemTraceControlGuid;
|
|
s_prop->BufferSize = 1024;
|
|
s_prop->MinimumBuffers = std::thread::hardware_concurrency() * 4;
|
|
s_prop->MaximumBuffers = std::thread::hardware_concurrency() * 6;
|
|
s_prop->LoggerNameOffset = sizeof( EVENT_TRACE_PROPERTIES );
|
|
memcpy( ((char*)s_prop) + sizeof( EVENT_TRACE_PROPERTIES ), KERNEL_LOGGER_NAME, sizeof( KERNEL_LOGGER_NAME ) );
|
|
|
|
auto backup = tracy_malloc( psz );
|
|
memcpy( backup, s_prop, psz );
|
|
|
|
const auto controlStatus = ControlTrace( 0, KERNEL_LOGGER_NAME, s_prop, EVENT_TRACE_CONTROL_STOP );
|
|
if( controlStatus != ERROR_SUCCESS && controlStatus != ERROR_WMI_INSTANCE_NOT_FOUND )
|
|
{
|
|
tracy_free( backup );
|
|
tracy_free( s_prop );
|
|
return false;
|
|
}
|
|
|
|
memcpy( s_prop, backup, psz );
|
|
tracy_free( backup );
|
|
|
|
const auto startStatus = StartTrace( &s_traceHandle, KERNEL_LOGGER_NAME, s_prop );
|
|
if( startStatus != ERROR_SUCCESS )
|
|
{
|
|
tracy_free( s_prop );
|
|
return false;
|
|
}
|
|
|
|
if( isOs64Bit )
|
|
{
|
|
CLASSIC_EVENT_ID stackId[2] = {};
|
|
stackId[0].EventGuid = PerfInfoGuid;
|
|
stackId[0].Type = 46;
|
|
stackId[1].EventGuid = ThreadV2Guid;
|
|
stackId[1].Type = 36;
|
|
const auto stackStatus = TraceSetInformation( s_traceHandle, TraceStackTracingInfo, &stackId, sizeof( stackId ) );
|
|
if( stackStatus != ERROR_SUCCESS )
|
|
{
|
|
tracy_free( s_prop );
|
|
return false;
|
|
}
|
|
}
|
|
|
|
#ifdef UNICODE
|
|
WCHAR KernelLoggerName[sizeof( KERNEL_LOGGER_NAME )];
|
|
#else
|
|
char KernelLoggerName[sizeof( KERNEL_LOGGER_NAME )];
|
|
#endif
|
|
memcpy( KernelLoggerName, KERNEL_LOGGER_NAME, sizeof( KERNEL_LOGGER_NAME ) );
|
|
EVENT_TRACE_LOGFILE log = {};
|
|
log.LoggerName = KernelLoggerName;
|
|
log.ProcessTraceMode = PROCESS_TRACE_MODE_REAL_TIME | PROCESS_TRACE_MODE_EVENT_RECORD | PROCESS_TRACE_MODE_RAW_TIMESTAMP;
|
|
log.EventRecordCallback = EventRecordCallback;
|
|
|
|
s_traceHandle2 = OpenTrace( &log );
|
|
if( s_traceHandle2 == (TRACEHANDLE)INVALID_HANDLE_VALUE )
|
|
{
|
|
CloseTrace( s_traceHandle );
|
|
tracy_free( s_prop );
|
|
return false;
|
|
}
|
|
|
|
#ifndef TRACY_NO_VSYNC_CAPTURE
|
|
SetupVsync();
|
|
#endif
|
|
|
|
return true;
|
|
}
|
|
|
|
void SysTraceStop()
|
|
{
|
|
if( s_threadVsync )
|
|
{
|
|
CloseTrace( s_traceHandleVsync2 );
|
|
CloseTrace( s_traceHandleVsync );
|
|
s_threadVsync->~Thread();
|
|
tracy_free( s_threadVsync );
|
|
}
|
|
|
|
CloseTrace( s_traceHandle2 );
|
|
CloseTrace( s_traceHandle );
|
|
}
|
|
|
|
void SysTraceWorker( void* ptr )
|
|
{
|
|
ThreadExitHandler threadExitHandler;
|
|
SetThreadPriority( GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL );
|
|
SetThreadName( "Tracy SysTrace" );
|
|
ProcessTrace( &s_traceHandle2, 1, 0, 0 );
|
|
ControlTrace( 0, KERNEL_LOGGER_NAME, s_prop, EVENT_TRACE_CONTROL_STOP );
|
|
tracy_free( s_prop );
|
|
}
|
|
|
|
void SysTraceGetExternalName( uint64_t thread, const char*& threadName, const char*& name )
|
|
{
|
|
bool threadSent = false;
|
|
auto hnd = OpenThread( THREAD_QUERY_INFORMATION, FALSE, DWORD( thread ) );
|
|
if( hnd == 0 )
|
|
{
|
|
hnd = OpenThread( THREAD_QUERY_LIMITED_INFORMATION, FALSE, DWORD( thread ) );
|
|
}
|
|
if( hnd != 0 )
|
|
{
|
|
if( _GetThreadDescription )
|
|
{
|
|
PWSTR tmp;
|
|
_GetThreadDescription( hnd, &tmp );
|
|
char buf[256];
|
|
if( tmp )
|
|
{
|
|
auto ret = wcstombs( buf, tmp, 256 );
|
|
if( ret != 0 )
|
|
{
|
|
threadName = CopyString( buf, ret );
|
|
threadSent = true;
|
|
}
|
|
}
|
|
}
|
|
const auto pid = GetProcessIdOfThread( hnd );
|
|
if( !threadSent && NtQueryInformationThread && _EnumProcessModules && _GetModuleInformation && _GetModuleBaseNameA )
|
|
{
|
|
void* ptr;
|
|
ULONG retlen;
|
|
auto status = NtQueryInformationThread( hnd, (THREADINFOCLASS)9 /*ThreadQuerySetWin32StartAddress*/, &ptr, sizeof( &ptr ), &retlen );
|
|
if( status == 0 )
|
|
{
|
|
const auto phnd = OpenProcess( PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, FALSE, pid );
|
|
if( phnd != INVALID_HANDLE_VALUE )
|
|
{
|
|
HMODULE modules[1024];
|
|
DWORD needed;
|
|
if( _EnumProcessModules( phnd, modules, 1024 * sizeof( HMODULE ), &needed ) != 0 )
|
|
{
|
|
const auto sz = std::min( DWORD( needed / sizeof( HMODULE ) ), DWORD( 1024 ) );
|
|
for( DWORD i=0; i<sz; i++ )
|
|
{
|
|
MODULEINFO info;
|
|
if( _GetModuleInformation( phnd, modules[i], &info, sizeof( info ) ) != 0 )
|
|
{
|
|
if( (uint64_t)ptr >= (uint64_t)info.lpBaseOfDll && (uint64_t)ptr <= (uint64_t)info.lpBaseOfDll + (uint64_t)info.SizeOfImage )
|
|
{
|
|
char buf2[1024];
|
|
const auto modlen = _GetModuleBaseNameA( phnd, modules[i], buf2, 1024 );
|
|
if( modlen != 0 )
|
|
{
|
|
threadName = CopyString( buf2, modlen );
|
|
threadSent = true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
CloseHandle( phnd );
|
|
}
|
|
}
|
|
}
|
|
CloseHandle( hnd );
|
|
if( !threadSent )
|
|
{
|
|
threadName = CopyString( "???", 3 );
|
|
threadSent = true;
|
|
}
|
|
if( pid != 0 )
|
|
{
|
|
{
|
|
uint64_t _pid = pid;
|
|
TracyLfqPrepare( QueueType::TidToPid );
|
|
MemWrite( &item->tidToPid.tid, thread );
|
|
MemWrite( &item->tidToPid.pid, _pid );
|
|
TracyLfqCommit;
|
|
}
|
|
if( pid == 4 )
|
|
{
|
|
name = CopyStringFast( "System", 6 );
|
|
return;
|
|
}
|
|
else
|
|
{
|
|
const auto phnd = OpenProcess( PROCESS_QUERY_LIMITED_INFORMATION, FALSE, pid );
|
|
if( phnd != INVALID_HANDLE_VALUE )
|
|
{
|
|
char buf2[1024];
|
|
const auto sz = GetProcessImageFileNameA( phnd, buf2, 1024 );
|
|
CloseHandle( phnd );
|
|
if( sz != 0 )
|
|
{
|
|
auto ptr = buf2 + sz - 1;
|
|
while( ptr > buf2 && *ptr != '\\' ) ptr--;
|
|
if( *ptr == '\\' ) ptr++;
|
|
name = CopyStringFast( ptr );
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if( !threadSent )
|
|
{
|
|
threadName = CopyString( "???", 3 );
|
|
}
|
|
name = CopyStringFast( "???", 3 );
|
|
}
|
|
|
|
}
|
|
|
|
# elif defined __linux__
|
|
|
|
# include <sys/types.h>
|
|
# include <sys/stat.h>
|
|
# include <sys/wait.h>
|
|
# include <fcntl.h>
|
|
# include <inttypes.h>
|
|
# include <limits>
|
|
# include <poll.h>
|
|
# include <stdio.h>
|
|
# include <stdlib.h>
|
|
# include <string.h>
|
|
# include <unistd.h>
|
|
# include <atomic>
|
|
# include <thread>
|
|
# include <linux/perf_event.h>
|
|
# include <linux/version.h>
|
|
# include <sys/mman.h>
|
|
# include <sys/ioctl.h>
|
|
# include <sys/syscall.h>
|
|
|
|
# include "TracyProfiler.hpp"
|
|
# include "TracyRingBuffer.hpp"
|
|
# include "TracyThread.hpp"
|
|
|
|
namespace tracy
|
|
{
|
|
|
|
static std::atomic<bool> traceActive { false };
|
|
static int s_numCpus = 0;
|
|
static int s_numBuffers = 0;
|
|
static int s_ctxBufferIdx = 0;
|
|
|
|
static RingBuffer* s_ring = nullptr;
|
|
|
|
static const int ThreadHashSize = 4 * 1024;
|
|
static uint32_t s_threadHash[ThreadHashSize] = {};
|
|
|
|
static bool CurrentProcOwnsThread( uint32_t tid )
|
|
{
|
|
const auto hash = tid & ( ThreadHashSize-1 );
|
|
const auto hv = s_threadHash[hash];
|
|
if( hv == tid ) return true;
|
|
if( hv == -tid ) return false;
|
|
|
|
char path[256];
|
|
sprintf( path, "/proc/self/task/%d", tid );
|
|
struct stat st;
|
|
if( stat( path, &st ) == 0 )
|
|
{
|
|
s_threadHash[hash] = tid;
|
|
return true;
|
|
}
|
|
else
|
|
{
|
|
s_threadHash[hash] = -tid;
|
|
return false;
|
|
}
|
|
}
|
|
|
|
static int perf_event_open( struct perf_event_attr* hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags )
|
|
{
|
|
return syscall( __NR_perf_event_open, hw_event, pid, cpu, group_fd, flags );
|
|
}
|
|
|
|
enum TraceEventId
|
|
{
|
|
EventCallstack,
|
|
EventCpuCycles,
|
|
EventInstructionsRetired,
|
|
EventCacheReference,
|
|
EventCacheMiss,
|
|
EventBranchRetired,
|
|
EventBranchMiss,
|
|
EventContextSwitch,
|
|
EventWakeup,
|
|
};
|
|
|
|
static void ProbePreciseIp( perf_event_attr& pe, unsigned long long config0, unsigned long long config1, pid_t pid )
|
|
{
|
|
pe.config = config1;
|
|
pe.precise_ip = 3;
|
|
while( pe.precise_ip != 0 )
|
|
{
|
|
const int fd = perf_event_open( &pe, pid, 0, -1, PERF_FLAG_FD_CLOEXEC );
|
|
if( fd != -1 )
|
|
{
|
|
close( fd );
|
|
break;
|
|
}
|
|
pe.precise_ip--;
|
|
}
|
|
pe.config = config0;
|
|
while( pe.precise_ip != 0 )
|
|
{
|
|
const int fd = perf_event_open( &pe, pid, 0, -1, PERF_FLAG_FD_CLOEXEC );
|
|
if( fd != -1 )
|
|
{
|
|
close( fd );
|
|
break;
|
|
}
|
|
pe.precise_ip--;
|
|
}
|
|
TracyDebug( " Probed precise_ip: %i\n", pe.precise_ip );
|
|
}
|
|
|
|
static void ProbePreciseIp( perf_event_attr& pe, pid_t pid )
|
|
{
|
|
pe.precise_ip = 3;
|
|
while( pe.precise_ip != 0 )
|
|
{
|
|
const int fd = perf_event_open( &pe, pid, 0, -1, PERF_FLAG_FD_CLOEXEC );
|
|
if( fd != -1 )
|
|
{
|
|
close( fd );
|
|
break;
|
|
}
|
|
pe.precise_ip--;
|
|
}
|
|
TracyDebug( " Probed precise_ip: %i\n", pe.precise_ip );
|
|
}
|
|
|
|
static bool IsGenuineIntel()
|
|
{
|
|
#if defined __i386 || defined __x86_64__
|
|
uint32_t regs[4] = {};
|
|
__get_cpuid( 0, regs, regs+1, regs+2, regs+3 );
|
|
char manufacturer[12];
|
|
memcpy( manufacturer, regs+1, 4 );
|
|
memcpy( manufacturer+4, regs+3, 4 );
|
|
memcpy( manufacturer+8, regs+2, 4 );
|
|
return memcmp( manufacturer, "GenuineIntel", 12 ) == 0;
|
|
#else
|
|
return false;
|
|
#endif
|
|
}
|
|
|
|
static const char* ReadFile( const char* path )
|
|
{
|
|
int fd = open( path, O_RDONLY );
|
|
if( fd < 0 ) return nullptr;
|
|
|
|
static char tmp[64];
|
|
const auto cnt = read( fd, tmp, 63 );
|
|
close( fd );
|
|
if( cnt < 0 ) return nullptr;
|
|
tmp[cnt] = '\0';
|
|
return tmp;
|
|
}
|
|
|
|
#ifdef __ANDROID__
|
|
static const char* ReadFileElevated( const char* path )
|
|
{
|
|
// Explanation for "su root sh -c": there are 2 flavors of "su" in circulation
|
|
// on Android. The default Android su has the following syntax to run a command
|
|
// as root:
|
|
// su root 'command'
|
|
// and 'command' is exec'd not passed to a shell, so if shell interpretation is
|
|
// wanted, one needs to do:
|
|
// su root sh -c 'command'
|
|
// Besides that default Android 'su' command, some Android devices use a different
|
|
// su with a command-line interface closer to the familiar util-linux su found
|
|
// on Linux distributions. Fortunately, both the util-linux su and the one
|
|
// in https://github.com/topjohnwu/Magisk seem to be happy with the above
|
|
// `su root sh -c 'command'` command line syntax.
|
|
|
|
int pipefd[2];
|
|
if( pipe( pipefd ) == 0 )
|
|
{
|
|
const auto pid = fork();
|
|
if( pid == 0 )
|
|
{
|
|
// child
|
|
close( pipefd[0] );
|
|
dup2( open( "/dev/null", O_WRONLY ), STDERR_FILENO );
|
|
if( dup2( pipefd[1], STDOUT_FILENO ) >= 0 )
|
|
{
|
|
close( pipefd[1] );
|
|
char tmp[1024];
|
|
sprintf( tmp, "cat %s", path );
|
|
execlp( "su", "su", "root", "sh", "-c", tmp, (char*)nullptr );
|
|
exit( 1 );
|
|
}
|
|
exit( 0 );
|
|
}
|
|
else if( pid > 0 )
|
|
{
|
|
// parent
|
|
close( pipefd[1] );
|
|
static char tmp[64];
|
|
const auto sz = read( pipefd[0], tmp, 63 );
|
|
close( pipefd[0] );
|
|
waitpid( pid, nullptr, 0 );
|
|
if( sz <= 0 ) return nullptr;
|
|
tmp[sz] = '\0';
|
|
return tmp;
|
|
}
|
|
else
|
|
{
|
|
return nullptr;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
return nullptr;
|
|
}
|
|
}
|
|
#else
|
|
static const char* ReadFileElevated( const char* path )
|
|
{
|
|
return ReadFile( path );
|
|
}
|
|
#endif
|
|
|
|
bool SysTraceStart( int64_t& samplingPeriod )
|
|
{
|
|
#ifndef CLOCK_MONOTONIC_RAW
|
|
return false;
|
|
#endif
|
|
|
|
int paranoidLevel = 2;
|
|
const auto paranoidLevelStr = ReadFile( "/proc/sys/kernel/perf_event_paranoid" );
|
|
if( !paranoidLevelStr ) return false;
|
|
paranoidLevel = atoi( paranoidLevelStr );
|
|
TracyDebug( "perf_event_paranoid: %i\n", paranoidLevel );
|
|
|
|
int switchId = -1, wakeupId = -1;
|
|
const auto switchIdStr = ReadFileElevated( "/sys/kernel/debug/tracing/events/sched/sched_switch/id" );
|
|
if( switchIdStr ) switchId = atoi( switchIdStr );
|
|
const auto wakeupIdStr = ReadFileElevated( "/sys/kernel/debug/tracing/events/sched/sched_wakeup/id" );
|
|
if( wakeupIdStr ) wakeupId = atoi( wakeupIdStr );
|
|
|
|
TracyDebug( "sched_switch id: %i\nsched_wakeup id: %i\n", switchId, wakeupId );
|
|
|
|
#ifdef TRACY_NO_SAMPLE_RETIREMENT
|
|
const bool noRetirement = true;
|
|
#else
|
|
const char* noRetirementEnv = GetEnvVar( "TRACY_NO_SAMPLE_RETIREMENT" );
|
|
const bool noRetirement = noRetirementEnv && noRetirementEnv[0] == '1';
|
|
#endif
|
|
|
|
#ifdef TRACY_NO_SAMPLE_CACHE
|
|
const bool noCache = true;
|
|
#else
|
|
const char* noCacheEnv = GetEnvVar( "TRACY_NO_SAMPLE_CACHE" );
|
|
const bool noCache = noCacheEnv && noCacheEnv[0] == '1';
|
|
#endif
|
|
|
|
#ifdef TRACY_NO_SAMPLE_BRANCH
|
|
const bool noBranch = true;
|
|
#else
|
|
const char* noBranchEnv = GetEnvVar( "TRACY_NO_SAMPLE_BRANCH" );
|
|
const bool noBranch = noBranchEnv && noBranchEnv[0] == '1';
|
|
#endif
|
|
|
|
#ifdef TRACY_NO_CONTEXT_SWITCH
|
|
const bool noCtxSwitch = true;
|
|
#else
|
|
const char* noCtxSwitchEnv = GetEnvVar( "TRACY_NO_CONTEXT_SWITCH" );
|
|
const bool noCtxSwitch = noCtxSwitchEnv && noCtxSwitchEnv[0] == '1';
|
|
#endif
|
|
|
|
samplingPeriod = GetSamplingPeriod();
|
|
uint32_t currentPid = (uint32_t)getpid();
|
|
|
|
const auto maxNumBuffers = s_numCpus * (
|
|
1 + // software sampling
|
|
2 + // CPU cycles + instructions retired
|
|
2 + // cache reference + miss
|
|
2 + // branch retired + miss
|
|
2 // context switches + wakeups
|
|
);
|
|
s_numCpus = (int)std::thread::hardware_concurrency();
|
|
s_ring = (RingBuffer*)tracy_malloc( sizeof( RingBuffer ) * maxNumBuffers );
|
|
s_numBuffers = 0;
|
|
|
|
// software sampling
|
|
perf_event_attr pe = {};
|
|
pe.type = PERF_TYPE_SOFTWARE;
|
|
pe.size = sizeof( perf_event_attr );
|
|
pe.config = PERF_COUNT_SW_CPU_CLOCK;
|
|
pe.sample_freq = GetSamplingFrequency();
|
|
pe.sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_CALLCHAIN;
|
|
#if LINUX_VERSION_CODE >= KERNEL_VERSION( 4, 8, 0 )
|
|
pe.sample_max_stack = 127;
|
|
#endif
|
|
pe.disabled = 1;
|
|
pe.freq = 1;
|
|
pe.inherit = 1;
|
|
#if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 )
|
|
pe.use_clockid = 1;
|
|
pe.clockid = CLOCK_MONOTONIC_RAW;
|
|
#endif
|
|
|
|
TracyDebug( "Setup software sampling\n" );
|
|
ProbePreciseIp( pe, currentPid );
|
|
for( int i=0; i<s_numCpus; i++ )
|
|
{
|
|
int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
|
|
if( fd == -1 )
|
|
{
|
|
pe.exclude_kernel = 1;
|
|
ProbePreciseIp( pe, currentPid );
|
|
fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
|
|
if( fd == -1 )
|
|
{
|
|
for( int j=0; j<s_numBuffers; j++ ) s_ring[j].~RingBuffer();
|
|
tracy_free( s_ring );
|
|
return false;
|
|
}
|
|
TracyDebug( " No access to kernel samples\n" );
|
|
}
|
|
new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventCallstack );
|
|
s_numBuffers++;
|
|
TracyDebug( " Core %i ok\n", i );
|
|
}
|
|
|
|
// CPU cycles + instructions retired
|
|
pe = {};
|
|
pe.type = PERF_TYPE_HARDWARE;
|
|
pe.size = sizeof( perf_event_attr );
|
|
pe.sample_freq = 5000;
|
|
pe.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TIME;
|
|
pe.disabled = 1;
|
|
pe.exclude_kernel = 1;
|
|
pe.exclude_guest = 1;
|
|
pe.exclude_hv = 1;
|
|
pe.freq = 1;
|
|
pe.inherit = 1;
|
|
#if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 )
|
|
pe.use_clockid = 1;
|
|
pe.clockid = CLOCK_MONOTONIC_RAW;
|
|
#endif
|
|
|
|
if( !noRetirement )
|
|
{
|
|
TracyDebug( "Setup sampling cycles + retirement\n" );
|
|
ProbePreciseIp( pe, PERF_COUNT_HW_CPU_CYCLES, PERF_COUNT_HW_INSTRUCTIONS, currentPid );
|
|
for( int i=0; i<s_numCpus; i++ )
|
|
{
|
|
const int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
|
|
if( fd != -1 )
|
|
{
|
|
new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventCpuCycles );
|
|
s_numBuffers++;
|
|
TracyDebug( " Core %i ok\n", i );
|
|
}
|
|
}
|
|
|
|
pe.config = PERF_COUNT_HW_INSTRUCTIONS;
|
|
for( int i=0; i<s_numCpus; i++ )
|
|
{
|
|
const int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
|
|
if( fd != -1 )
|
|
{
|
|
new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventInstructionsRetired );
|
|
s_numBuffers++;
|
|
TracyDebug( " Core %i ok\n", i );
|
|
}
|
|
}
|
|
}
|
|
|
|
// cache reference + miss
|
|
if( !noCache )
|
|
{
|
|
TracyDebug( "Setup sampling CPU cache references + misses\n" );
|
|
ProbePreciseIp( pe, PERF_COUNT_HW_CACHE_REFERENCES, PERF_COUNT_HW_CACHE_MISSES, currentPid );
|
|
if( IsGenuineIntel() )
|
|
{
|
|
pe.precise_ip = 0;
|
|
TracyDebug( " CPU is GenuineIntel, forcing precise_ip down to 0\n" );
|
|
}
|
|
for( int i=0; i<s_numCpus; i++ )
|
|
{
|
|
const int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
|
|
if( fd != -1 )
|
|
{
|
|
new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventCacheReference );
|
|
s_numBuffers++;
|
|
TracyDebug( " Core %i ok\n", i );
|
|
}
|
|
}
|
|
|
|
pe.config = PERF_COUNT_HW_CACHE_MISSES;
|
|
for( int i=0; i<s_numCpus; i++ )
|
|
{
|
|
const int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
|
|
if( fd != -1 )
|
|
{
|
|
new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventCacheMiss );
|
|
s_numBuffers++;
|
|
TracyDebug( " Core %i ok\n", i );
|
|
}
|
|
}
|
|
}
|
|
|
|
// branch retired + miss
|
|
if( !noBranch )
|
|
{
|
|
TracyDebug( "Setup sampling CPU branch retirements + misses\n" );
|
|
ProbePreciseIp( pe, PERF_COUNT_HW_BRANCH_INSTRUCTIONS, PERF_COUNT_HW_BRANCH_MISSES, currentPid );
|
|
for( int i=0; i<s_numCpus; i++ )
|
|
{
|
|
const int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
|
|
if( fd != -1 )
|
|
{
|
|
new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventBranchRetired );
|
|
s_numBuffers++;
|
|
TracyDebug( " Core %i ok\n", i );
|
|
}
|
|
}
|
|
|
|
pe.config = PERF_COUNT_HW_BRANCH_MISSES;
|
|
for( int i=0; i<s_numCpus; i++ )
|
|
{
|
|
const int fd = perf_event_open( &pe, currentPid, i, -1, PERF_FLAG_FD_CLOEXEC );
|
|
if( fd != -1 )
|
|
{
|
|
new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventBranchMiss );
|
|
s_numBuffers++;
|
|
TracyDebug( " Core %i ok\n", i );
|
|
}
|
|
}
|
|
}
|
|
|
|
s_ctxBufferIdx = s_numBuffers;
|
|
|
|
// context switches
|
|
if( !noCtxSwitch && switchId != -1 )
|
|
{
|
|
pe = {};
|
|
pe.type = PERF_TYPE_TRACEPOINT;
|
|
pe.size = sizeof( perf_event_attr );
|
|
pe.sample_period = 1;
|
|
pe.sample_type = PERF_SAMPLE_TIME | PERF_SAMPLE_RAW | PERF_SAMPLE_CALLCHAIN;
|
|
#if LINUX_VERSION_CODE >= KERNEL_VERSION( 4, 8, 0 )
|
|
pe.sample_max_stack = 127;
|
|
#endif
|
|
pe.disabled = 1;
|
|
pe.inherit = 1;
|
|
pe.config = switchId;
|
|
#if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 )
|
|
pe.use_clockid = 1;
|
|
pe.clockid = CLOCK_MONOTONIC_RAW;
|
|
#endif
|
|
|
|
TracyDebug( "Setup context switch capture\n" );
|
|
for( int i=0; i<s_numCpus; i++ )
|
|
{
|
|
const int fd = perf_event_open( &pe, -1, i, -1, PERF_FLAG_FD_CLOEXEC );
|
|
if( fd != -1 )
|
|
{
|
|
new( s_ring+s_numBuffers ) RingBuffer( 256*1024, fd, EventContextSwitch, i );
|
|
s_numBuffers++;
|
|
TracyDebug( " Core %i ok\n", i );
|
|
}
|
|
}
|
|
|
|
if( wakeupId != -1 )
|
|
{
|
|
pe.config = wakeupId;
|
|
pe.config &= ~PERF_SAMPLE_CALLCHAIN;
|
|
|
|
TracyDebug( "Setup wakeup capture\n" );
|
|
for( int i=0; i<s_numCpus; i++ )
|
|
{
|
|
const int fd = perf_event_open( &pe, -1, i, -1, PERF_FLAG_FD_CLOEXEC );
|
|
if( fd != -1 )
|
|
{
|
|
new( s_ring+s_numBuffers ) RingBuffer( 64*1024, fd, EventWakeup, i );
|
|
s_numBuffers++;
|
|
TracyDebug( " Core %i ok\n", i );
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
TracyDebug( "Ringbuffers in use: %i\n", s_numBuffers );
|
|
|
|
traceActive.store( true, std::memory_order_relaxed );
|
|
return true;
|
|
}
|
|
|
|
void SysTraceStop()
|
|
{
|
|
traceActive.store( false, std::memory_order_relaxed );
|
|
}
|
|
|
|
static uint64_t* GetCallstackBlock( uint64_t cnt, RingBuffer& ring, uint64_t offset )
|
|
{
|
|
auto trace = (uint64_t*)tracy_malloc_fast( ( 1 + cnt ) * sizeof( uint64_t ) );
|
|
ring.Read( trace+1, offset, sizeof( uint64_t ) * cnt );
|
|
|
|
#if defined __x86_64__ || defined _M_X64
|
|
// remove non-canonical pointers
|
|
do
|
|
{
|
|
const auto test = (int64_t)trace[cnt];
|
|
const auto m1 = test >> 63;
|
|
const auto m2 = test >> 47;
|
|
if( m1 == m2 ) break;
|
|
}
|
|
while( --cnt > 0 );
|
|
for( uint64_t j=1; j<cnt; j++ )
|
|
{
|
|
const auto test = (int64_t)trace[j];
|
|
const auto m1 = test >> 63;
|
|
const auto m2 = test >> 47;
|
|
if( m1 != m2 ) trace[j] = 0;
|
|
}
|
|
#endif
|
|
|
|
for( uint64_t j=1; j<=cnt; j++ )
|
|
{
|
|
if( trace[j] >= (uint64_t)-4095 ) // PERF_CONTEXT_MAX
|
|
{
|
|
memmove( trace+j, trace+j+1, sizeof( uint64_t ) * ( cnt - j ) );
|
|
cnt--;
|
|
}
|
|
}
|
|
|
|
memcpy( trace, &cnt, sizeof( uint64_t ) );
|
|
return trace;
|
|
}
|
|
|
|
void SysTraceWorker( void* ptr )
|
|
{
|
|
ThreadExitHandler threadExitHandler;
|
|
SetThreadName( "Tracy Sampling" );
|
|
InitRpmalloc();
|
|
sched_param sp = { 5 };
|
|
pthread_setschedparam( pthread_self(), SCHED_FIFO, &sp );
|
|
for( int i=0; i<s_numBuffers; i++ ) s_ring[i].Enable();
|
|
for(;;)
|
|
{
|
|
bool hadData = false;
|
|
for( int i=0; i<s_ctxBufferIdx; i++ )
|
|
{
|
|
if( !traceActive.load( std::memory_order_relaxed ) ) break;
|
|
auto& ring = s_ring[i];
|
|
const auto head = ring.LoadHead();
|
|
const auto tail = ring.GetTail();
|
|
if( head == tail ) continue;
|
|
assert( head > tail );
|
|
hadData = true;
|
|
|
|
const auto end = head - tail;
|
|
uint64_t pos = 0;
|
|
while( pos < end )
|
|
{
|
|
perf_event_header hdr;
|
|
ring.Read( &hdr, pos, sizeof( perf_event_header ) );
|
|
if( hdr.type == PERF_RECORD_SAMPLE )
|
|
{
|
|
auto offset = pos + sizeof( perf_event_header );
|
|
const auto id = ring.GetId();
|
|
assert( id != EventContextSwitch );
|
|
if( id == EventCallstack )
|
|
{
|
|
// Layout:
|
|
// u32 pid, tid
|
|
// u64 time
|
|
// u64 cnt
|
|
// u64 ip[cnt]
|
|
|
|
uint32_t tid;
|
|
uint64_t t0;
|
|
uint64_t cnt;
|
|
|
|
offset += sizeof( uint32_t );
|
|
ring.Read( &tid, offset, sizeof( uint32_t ) );
|
|
offset += sizeof( uint32_t );
|
|
ring.Read( &t0, offset, sizeof( uint64_t ) );
|
|
offset += sizeof( uint64_t );
|
|
ring.Read( &cnt, offset, sizeof( uint64_t ) );
|
|
offset += sizeof( uint64_t );
|
|
|
|
if( cnt > 0 )
|
|
{
|
|
#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 )
|
|
t0 = ring.ConvertTimeToTsc( t0 );
|
|
#endif
|
|
auto trace = GetCallstackBlock( cnt, ring, offset );
|
|
|
|
TracyLfqPrepare( QueueType::CallstackSample );
|
|
MemWrite( &item->callstackSampleFat.time, t0 );
|
|
MemWrite( &item->callstackSampleFat.thread, tid );
|
|
MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace );
|
|
TracyLfqCommit;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// Layout:
|
|
// u64 ip
|
|
// u64 time
|
|
|
|
uint64_t ip, t0;
|
|
ring.Read( &ip, offset, sizeof( uint64_t ) );
|
|
offset += sizeof( uint64_t );
|
|
ring.Read( &t0, offset, sizeof( uint64_t ) );
|
|
|
|
#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 )
|
|
t0 = ring.ConvertTimeToTsc( t0 );
|
|
#endif
|
|
QueueType type;
|
|
switch( id )
|
|
{
|
|
case EventCpuCycles:
|
|
type = QueueType::HwSampleCpuCycle;
|
|
break;
|
|
case EventInstructionsRetired:
|
|
type = QueueType::HwSampleInstructionRetired;
|
|
break;
|
|
case EventCacheReference:
|
|
type = QueueType::HwSampleCacheReference;
|
|
break;
|
|
case EventCacheMiss:
|
|
type = QueueType::HwSampleCacheMiss;
|
|
break;
|
|
case EventBranchRetired:
|
|
type = QueueType::HwSampleBranchRetired;
|
|
break;
|
|
case EventBranchMiss:
|
|
type = QueueType::HwSampleBranchMiss;
|
|
break;
|
|
default:
|
|
assert( false );
|
|
break;
|
|
}
|
|
|
|
TracyLfqPrepare( type );
|
|
MemWrite( &item->hwSample.ip, ip );
|
|
MemWrite( &item->hwSample.time, t0 );
|
|
TracyLfqCommit;
|
|
}
|
|
}
|
|
pos += hdr.size;
|
|
}
|
|
assert( pos == end );
|
|
ring.Advance( end );
|
|
}
|
|
if( !traceActive.load( std::memory_order_relaxed ) ) break;
|
|
|
|
if( s_ctxBufferIdx != s_numBuffers )
|
|
{
|
|
const auto ctxBufNum = s_numBuffers - s_ctxBufferIdx;
|
|
|
|
int activeNum = 0;
|
|
bool active[512];
|
|
uint32_t end[512];
|
|
uint32_t pos[512];
|
|
for( int i=0; i<ctxBufNum; i++ )
|
|
{
|
|
const auto rbIdx = s_ctxBufferIdx + i;
|
|
const auto rbHead = s_ring[rbIdx].LoadHead();
|
|
const auto rbTail = s_ring[rbIdx].GetTail();
|
|
const auto rbActive = rbHead != rbTail;
|
|
|
|
active[i] = rbActive;
|
|
if( rbActive )
|
|
{
|
|
activeNum++;
|
|
end[i] = rbHead - rbTail;
|
|
pos[i] = 0;
|
|
}
|
|
else
|
|
{
|
|
end[i] = 0;
|
|
}
|
|
}
|
|
if( activeNum > 0 )
|
|
{
|
|
hadData = true;
|
|
while( activeNum > 0 )
|
|
{
|
|
int sel = -1;
|
|
int64_t t0 = std::numeric_limits<int64_t>::max();
|
|
for( int i=0; i<ctxBufNum; i++ )
|
|
{
|
|
if( !active[i] ) continue;
|
|
auto rbPos = pos[i];
|
|
assert( rbPos < end[i] );
|
|
const auto rbIdx = s_ctxBufferIdx + i;
|
|
perf_event_header hdr;
|
|
s_ring[rbIdx].Read( &hdr, rbPos, sizeof( perf_event_header ) );
|
|
if( hdr.type == PERF_RECORD_SAMPLE )
|
|
{
|
|
int64_t rbTime;
|
|
s_ring[rbIdx].Read( &rbTime, rbPos + sizeof( perf_event_header ), sizeof( int64_t ) );
|
|
if( rbTime < t0 )
|
|
{
|
|
t0 = rbTime;
|
|
sel = i;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
rbPos += hdr.size;
|
|
if( rbPos == end[i] )
|
|
{
|
|
active[i] = false;
|
|
activeNum--;
|
|
}
|
|
else
|
|
{
|
|
pos[i] = rbPos;
|
|
}
|
|
}
|
|
}
|
|
assert( sel >= 0 || activeNum == 0 );
|
|
if( sel >= 0 )
|
|
{
|
|
auto& ring = s_ring[s_ctxBufferIdx + sel];
|
|
auto rbPos = pos[sel];
|
|
auto offset = rbPos;
|
|
perf_event_header hdr;
|
|
ring.Read( &hdr, offset, sizeof( perf_event_header ) );
|
|
|
|
#if defined TRACY_HW_TIMER && ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 )
|
|
t0 = ring.ConvertTimeToTsc( t0 );
|
|
#endif
|
|
|
|
if( ring.GetId() == EventContextSwitch )
|
|
{
|
|
// Layout:
|
|
// u64 time
|
|
// u64 cnt
|
|
// u64 ip[cnt]
|
|
// u32 size
|
|
// u8 data[size]
|
|
// Data (not ABI stable, but has not changed since it was added, in 2009):
|
|
// u8 hdr[8]
|
|
// u8 prev_comm[16]
|
|
// u32 prev_pid
|
|
// u32 prev_prio
|
|
// u64 prev_state
|
|
// u8 next_comm[16]
|
|
// u32 next_pid
|
|
// u32 next_prio
|
|
|
|
offset += sizeof( perf_event_header ) + sizeof( uint64_t );
|
|
|
|
uint64_t cnt;
|
|
ring.Read( &cnt, offset, sizeof( uint64_t ) );
|
|
offset += sizeof( uint64_t );
|
|
const auto traceOffset = offset;
|
|
offset += sizeof( uint64_t ) * cnt + sizeof( uint32_t ) + 8 + 16;
|
|
|
|
uint32_t prev_pid, next_pid;
|
|
uint64_t prev_state;
|
|
|
|
ring.Read( &prev_pid, offset, sizeof( uint32_t ) );
|
|
offset += sizeof( uint32_t ) + sizeof( uint32_t );
|
|
ring.Read( &prev_state, offset, sizeof( uint64_t ) );
|
|
offset += sizeof( uint64_t ) + 16;
|
|
ring.Read( &next_pid, offset, sizeof( uint32_t ) );
|
|
|
|
uint8_t reason = 100;
|
|
uint8_t state;
|
|
|
|
if( prev_state & 0x0001 ) state = 104;
|
|
else if( prev_state & 0x0002 ) state = 101;
|
|
else if( prev_state & 0x0004 ) state = 105;
|
|
else if( prev_state & 0x0008 ) state = 106;
|
|
else if( prev_state & 0x0010 ) state = 108;
|
|
else if( prev_state & 0x0020 ) state = 109;
|
|
else if( prev_state & 0x0040 ) state = 110;
|
|
else if( prev_state & 0x0080 ) state = 102;
|
|
else state = 103;
|
|
|
|
TracyLfqPrepare( QueueType::ContextSwitch );
|
|
MemWrite( &item->contextSwitch.time, t0 );
|
|
MemWrite( &item->contextSwitch.oldThread, prev_pid );
|
|
MemWrite( &item->contextSwitch.newThread, next_pid );
|
|
MemWrite( &item->contextSwitch.cpu, uint8_t( ring.GetCpu() ) );
|
|
MemWrite( &item->contextSwitch.reason, reason );
|
|
MemWrite( &item->contextSwitch.state, state );
|
|
TracyLfqCommit;
|
|
|
|
if( cnt > 0 && prev_pid != 0 && CurrentProcOwnsThread( prev_pid ) )
|
|
{
|
|
auto trace = GetCallstackBlock( cnt, ring, traceOffset );
|
|
|
|
TracyLfqPrepare( QueueType::CallstackSampleContextSwitch );
|
|
MemWrite( &item->callstackSampleFat.time, t0 );
|
|
MemWrite( &item->callstackSampleFat.thread, prev_pid );
|
|
MemWrite( &item->callstackSampleFat.ptr, (uint64_t)trace );
|
|
TracyLfqCommit;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
assert( ring.GetId() == EventWakeup );
|
|
|
|
// Layout:
|
|
// u64 time
|
|
// u32 size
|
|
// u8 data[size]
|
|
// Data:
|
|
// u8 hdr[8]
|
|
// u8 comm[16]
|
|
// u32 pid
|
|
// u32 prio
|
|
// u64 target_cpu
|
|
|
|
offset += sizeof( perf_event_header ) + sizeof( uint64_t ) + sizeof( uint32_t ) + 8 + 16;
|
|
|
|
uint32_t pid;
|
|
ring.Read( &pid, offset, sizeof( uint32_t ) );
|
|
|
|
TracyLfqPrepare( QueueType::ThreadWakeup );
|
|
MemWrite( &item->threadWakeup.time, t0 );
|
|
MemWrite( &item->threadWakeup.thread, pid );
|
|
TracyLfqCommit;
|
|
}
|
|
|
|
rbPos += hdr.size;
|
|
if( rbPos == end[sel] )
|
|
{
|
|
active[sel] = false;
|
|
activeNum--;
|
|
}
|
|
else
|
|
{
|
|
pos[sel] = rbPos;
|
|
}
|
|
}
|
|
}
|
|
for( int i=0; i<ctxBufNum; i++ )
|
|
{
|
|
if( end[i] != 0 ) s_ring[s_ctxBufferIdx + i].Advance( end[i] );
|
|
}
|
|
}
|
|
}
|
|
if( !traceActive.load( std::memory_order_relaxed ) ) break;
|
|
if( !hadData )
|
|
{
|
|
std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) );
|
|
}
|
|
}
|
|
|
|
for( int i=0; i<s_numBuffers; i++ ) s_ring[i].~RingBuffer();
|
|
tracy_free_fast( s_ring );
|
|
}
|
|
|
|
void SysTraceGetExternalName( uint64_t thread, const char*& threadName, const char*& name )
|
|
{
|
|
FILE* f;
|
|
char fn[256];
|
|
sprintf( fn, "/proc/%" PRIu64 "/comm", thread );
|
|
f = fopen( fn, "rb" );
|
|
if( f )
|
|
{
|
|
char buf[256];
|
|
const auto sz = fread( buf, 1, 256, f );
|
|
if( sz > 0 && buf[sz-1] == '\n' ) buf[sz-1] = '\0';
|
|
threadName = CopyString( buf );
|
|
fclose( f );
|
|
}
|
|
else
|
|
{
|
|
threadName = CopyString( "???", 3 );
|
|
}
|
|
|
|
sprintf( fn, "/proc/%" PRIu64 "/status", thread );
|
|
f = fopen( fn, "rb" );
|
|
if( f )
|
|
{
|
|
int pid = -1;
|
|
size_t lsz = 1024;
|
|
auto line = (char*)tracy_malloc_fast( lsz );
|
|
for(;;)
|
|
{
|
|
auto rd = getline( &line, &lsz, f );
|
|
if( rd <= 0 ) break;
|
|
if( memcmp( "Tgid:\t", line, 6 ) == 0 )
|
|
{
|
|
pid = atoi( line + 6 );
|
|
break;
|
|
}
|
|
}
|
|
tracy_free_fast( line );
|
|
fclose( f );
|
|
if( pid >= 0 )
|
|
{
|
|
{
|
|
uint64_t _pid = pid;
|
|
TracyLfqPrepare( QueueType::TidToPid );
|
|
MemWrite( &item->tidToPid.tid, thread );
|
|
MemWrite( &item->tidToPid.pid, _pid );
|
|
TracyLfqCommit;
|
|
}
|
|
sprintf( fn, "/proc/%i/comm", pid );
|
|
f = fopen( fn, "rb" );
|
|
if( f )
|
|
{
|
|
char buf[256];
|
|
const auto sz = fread( buf, 1, 256, f );
|
|
if( sz > 0 && buf[sz-1] == '\n' ) buf[sz-1] = '\0';
|
|
name = CopyStringFast( buf );
|
|
fclose( f );
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
name = CopyStringFast( "???", 3 );
|
|
}
|
|
|
|
}
|
|
|
|
# endif
|
|
|
|
#endif
|