tracy/client/TracyProfiler.cpp

976 lines
28 KiB
C++
Raw Normal View History

#ifdef TRACY_ENABLE
2017-09-14 17:25:16 +00:00
#ifdef _MSC_VER
# include <winsock2.h>
2017-10-03 21:17:16 +00:00
# include <windows.h>
2017-09-14 17:25:16 +00:00
#else
# include <sys/time.h>
#endif
2017-10-03 21:17:16 +00:00
#ifdef _GNU_SOURCE
# include <errno.h>
#endif
2017-09-23 19:33:05 +00:00
#include <atomic>
2017-09-10 15:43:56 +00:00
#include <assert.h>
2017-09-23 19:09:46 +00:00
#include <chrono>
2017-09-14 17:25:16 +00:00
#include <limits>
2017-09-21 22:36:36 +00:00
#include <memory>
2018-04-01 18:04:35 +00:00
#include <mutex>
2017-10-30 13:59:05 +00:00
#include <stdlib.h>
2017-09-19 00:19:20 +00:00
#include <string.h>
2017-09-10 15:43:56 +00:00
#include "../common/TracyAlign.hpp"
#include "../common/TracyProtocol.hpp"
2017-09-11 20:51:47 +00:00
#include "../common/TracySocket.hpp"
#include "../common/TracySystem.hpp"
2017-10-14 14:59:43 +00:00
#include "tracy_rpmalloc.hpp"
2018-06-18 23:17:19 +00:00
#include "TracyCallstack.hpp"
2017-09-24 14:02:09 +00:00
#include "TracyScoped.hpp"
2017-09-10 15:43:56 +00:00
#include "TracyProfiler.hpp"
#include "TracyThread.hpp"
2017-09-10 15:43:56 +00:00
#ifdef __GNUC__
#define init_order( val ) __attribute__ ((init_priority(val)))
#else
#define init_order(x)
#endif
2018-04-27 14:58:45 +00:00
#if defined TRACY_HW_TIMER && __ARM_ARCH >= 6
# include <signal.h>
# include <setjmp.h>
#endif
2017-09-10 15:43:56 +00:00
namespace tracy
{
2017-10-14 14:59:43 +00:00
// RAII helper that performs process-wide rpmalloc initialization. An instance
// is placed among the init-order-controlled globals below so the allocator is
// ready before any profiler allocation happens.
struct RPMallocInit
{
    RPMallocInit() { rpmalloc_initialize(); }
};

// Per-thread rpmalloc initialization; instantiated as a thread_local below.
struct RPMallocThreadInit
{
    RPMallocThreadInit() { rpmalloc_thread_initialize(); }
};

// Wraps the profiler start timestamp so it can be given a controlled
// static-initialization order (see s_initTime below).
struct InitTimeWrapper
{
    int64_t val;
};
#if defined TRACY_HW_TIMER && __ARM_ARCH >= 6
// On ARM the virtual counter register (cntvct) may be inaccessible from user
// space, in which case reading it raises SIGILL. Probe it once at startup and
// select the implementation through the GetTimeImpl function pointer.
int64_t (*GetTimeImpl)();

// Portable fallback: C++ high_resolution_clock, in nanoseconds.
int64_t GetTimeImplFallback()
{
    return std::chrono::duration_cast<std::chrono::nanoseconds>( std::chrono::high_resolution_clock::now().time_since_epoch() ).count();
}

// Reads the ARM virtual counter register directly.
int64_t GetTimeImplCntvct()
{
    int64_t t;
#  ifdef __aarch64__
    asm volatile ( "mrs %0, cntvct_el0" : "=r" (t) );
#  else
    asm volatile ( "mrrc p15, 1, %Q0, %R0, c14" : "=r" (t) );
#  endif
    return t;
}

static sigjmp_buf SigIllEnv;

// Returns non-zero when control arrives here via siglongjmp from the SIGILL
// handler, i.e. when the cntvct probe faulted.
static int SetupHwTimerFailed()
{
    return sigsetjmp( SigIllEnv, 1 );
}

static void SetupHwTimerSigIllHandler( int /*signum*/ )
{
    siglongjmp( SigIllEnv, 1 );
}

// Probes hardware timer availability, selects GetTimeImpl accordingly, and
// returns the initial timestamp.
static int64_t SetupHwTimer()
{
    struct sigaction act, oldact;
    memset( &act, 0, sizeof( act ) );
    act.sa_handler = SetupHwTimerSigIllHandler;
    if( sigaction( SIGILL, &act, &oldact ) )
    {
        // Couldn't install the probe handler; don't risk crashing on SIGILL.
        GetTimeImpl = GetTimeImplFallback;
        return Profiler::GetTime();
    }
    if( SetupHwTimerFailed() )
    {
        // The cntvct read below faulted; restore the handler and fall back.
        sigaction( SIGILL, &oldact, nullptr );
        GetTimeImpl = GetTimeImplFallback;
        return Profiler::GetTime();
    }
    GetTimeImplCntvct();
    sigaction( SIGILL, &oldact, nullptr );
    GetTimeImpl = GetTimeImplCntvct;
    return Profiler::GetTime();
}
#else
static int64_t SetupHwTimer()
{
    return Profiler::GetTime();
}
#endif
// Returns a short, human-readable name of the current process, or "unknown"
// when the platform provides no way to query it.
static const char* GetProcessName()
{
    const char* processName = "unknown";
#if defined _MSC_VER
    static char buf[_MAX_PATH];
    GetModuleFileNameA( nullptr, buf, _MAX_PATH );
    // Strip the directory part of the module path.
    const char* ptr = buf;
    while( *ptr != '\0' ) ptr++;
    while( ptr > buf && *ptr != '\\' && *ptr != '/' ) ptr--;
    if( ptr > buf ) ptr++;
    processName = ptr;
#elif defined __ANDROID__
#  if __ANDROID_API__ >= 21
    auto buf = getprogname();
    if( buf ) processName = buf;
#  endif
#elif defined _GNU_SOURCE || defined __CYGWIN__
    // glibc/Cygwin expose the basename of argv[0] directly.
    processName = program_invocation_short_name;
#endif
    return processName;
}
enum { QueuePrealloc = 256 * 1024 };
2017-10-16 22:36:15 +00:00
// MSVC static initialization order solution. gcc/clang uses init_order() to avoid all this.
2017-09-10 15:43:56 +00:00
static Profiler* s_instance = nullptr;
static Thread* s_thread = nullptr;
2017-09-10 15:43:56 +00:00
2017-10-16 22:36:15 +00:00
// 1a. But s_queue is needed for initialization of variables in point 2.
extern moodycamel::ConcurrentQueue<QueueItem> s_queue;
static thread_local RPMallocThreadInit init_order(106) s_rpmalloc_thread_init;
2017-10-16 22:36:15 +00:00
// 2. If these variables would be in the .CRT$XCB section, they would be initialized only in main thread.
static thread_local moodycamel::ProducerToken init_order(107) s_token_detail( s_queue );
thread_local ProducerWrapper init_order(108) s_token { s_queue.get_explicit_producer( s_token_detail ) };
2017-10-16 22:36:15 +00:00
#ifdef _MSC_VER
// 1. Initialize these static variables before all other variables.
# pragma warning( disable : 4075 )
# pragma init_seg( ".CRT$XCB" )
#endif
2018-04-27 14:58:45 +00:00
static InitTimeWrapper init_order(101) s_initTime { SetupHwTimer() };
2017-10-16 23:07:34 +00:00
static RPMallocInit init_order(102) s_rpmalloc_init;
moodycamel::ConcurrentQueue<QueueItem> init_order(103) s_queue( QueuePrealloc );
std::atomic<uint32_t> init_order(104) s_lockCounter( 0 );
2018-06-22 13:10:23 +00:00
std::atomic<uint8_t> init_order(104) s_gpuCtxCounter( 0 );
2017-11-14 22:29:48 +00:00
thread_local GpuCtxWrapper init_order(104) s_gpuCtx { nullptr };
2018-06-17 16:14:37 +00:00
VkCtxWrapper init_order(104) s_vkCtx { nullptr };
2017-11-13 23:48:26 +00:00
#ifdef TRACY_COLLECT_THREAD_NAMES
struct ThreadNameData;
static std::atomic<ThreadNameData*> init_order(104) s_threadNameDataInstance( nullptr );
2018-07-30 16:12:42 +00:00
std::atomic<ThreadNameData*>& s_threadNameData = s_threadNameDataInstance;
#endif
2018-07-12 10:53:35 +00:00
#ifdef TRACY_ON_DEMAND
2018-07-13 18:20:37 +00:00
thread_local LuaZoneState init_order(104) s_luaZoneState { 0, false };
2018-07-12 10:53:35 +00:00
#endif
static Profiler init_order(105) s_profilerInstance;
2018-07-30 16:12:42 +00:00
Profiler& s_profiler = s_profilerInstance;
#ifdef _MSC_VER
# define DLL_EXPORT __declspec(dllexport)
#else
# define DLL_EXPORT __attribute__((visibility("default")))
#endif
// DLL exports to enable TracyClientDLL.cpp to retrieve the instances of Tracy objects and functions
DLL_EXPORT moodycamel::ConcurrentQueue<QueueItem>::ExplicitProducer* get_token()
{
return s_token.ptr;
}
DLL_EXPORT void*(*get_rpmalloc())(size_t size)
{
return rpmalloc;
}
DLL_EXPORT void(*get_rpfree())(void* ptr)
{
return rpfree;
}
#if defined TRACY_HW_TIMER && __ARM_ARCH >= 6
DLL_EXPORT int64_t(*get_GetTimeImpl())()
{
return GetTimeImpl;
}
#endif
DLL_EXPORT Profiler& get_profiler()
{
return s_profiler;
}
#ifdef TRACY_COLLECT_THREAD_NAMES
DLL_EXPORT std::atomic<ThreadNameData*>& get_threadNameData()
{
return s_threadNameData;
}
DLL_EXPORT void(*get_rpmalloc_thread_initialize())()
{
return rpmalloc_thread_initialize;
}
#endif
2017-10-16 22:36:15 +00:00
2017-11-02 11:56:13 +00:00
enum { BulkSize = TargetFrameSize / QueueItemSize };
2017-09-10 15:43:56 +00:00
// Sets up all profiler state and spawns the worker thread that talks to the
// server. Constructed exactly once, as the static s_profilerInstance.
Profiler::Profiler()
    : m_timeBegin( 0 )
    , m_mainThread( GetThreadHandle() )
    , m_epoch( std::chrono::duration_cast<std::chrono::seconds>( std::chrono::system_clock::now().time_since_epoch() ).count() )
    , m_shutdown( false )
    , m_sock( nullptr )
    , m_noExit( false )
    , m_stream( LZ4_createStream() )
    , m_buffer( (char*)tracy_malloc( TargetFrameSize*3 ) )
    , m_bufferOffset( 0 )
    , m_bufferStart( 0 )
    , m_itemBuf( (QueueItem*)tracy_malloc( sizeof( QueueItem ) * BulkSize ) )
    , m_lz4Buf( (char*)tracy_malloc( LZ4Size + sizeof( lz4sz_t ) ) )
    , m_serialQueue( 1024*1024 )
    , m_serialDequeue( 1024*1024 )
#ifdef TRACY_ON_DEMAND
    , m_isConnected( false )
    , m_frameCount( 0 )
    , m_deferredQueue( 64*1024 )
#endif
{
    assert( !s_instance );
    s_instance = this;

#ifdef _MSC_VER
    // 3. But these variables need to be initialized in main thread within the .CRT$XCB section. Do it here.
    s_token_detail = moodycamel::ProducerToken( s_queue );
    s_token = ProducerWrapper { s_queue.get_explicit_producer( s_token_detail ) };
#endif

    CalibrateTimer();
    CalibrateDelay();

#ifndef TRACY_NO_EXIT
    // TRACY_NO_EXIT=1 makes the client linger after main() returns until all
    // profiling data has been delivered to a server.
    const char* noExitEnv = getenv( "TRACY_NO_EXIT" );
    if( noExitEnv && noExitEnv[0] == '1' )
    {
        m_noExit = true;
    }
#endif

    s_thread = (Thread*)tracy_malloc( sizeof( Thread ) );
    new(s_thread) Thread( LaunchWorker, this );
    SetThreadName( s_thread->Handle(), "Tracy Profiler" );

#ifdef TRACY_HAS_CALLSTACK
    InitCallstack();
#endif

    // Publishing a non-zero begin time signals the worker thread (which spins
    // on this value) that initialization is complete.
    m_timeBegin.store( GetTime(), std::memory_order_relaxed );
}
// Requests worker shutdown, joins the worker thread, and releases all
// profiler-owned resources.
Profiler::~Profiler()
{
    m_shutdown.store( true, std::memory_order_relaxed );

    // The Thread destructor joins; the worker observes m_shutdown and drains
    // remaining data before exiting.
    s_thread->~Thread();
    tracy_free( s_thread );

    tracy_free( m_lz4Buf );
    tracy_free( m_itemBuf );
    tracy_free( m_buffer );
    LZ4_freeStream( m_stream );

    if( m_sock )
    {
        m_sock->~Socket();
        tracy_free( m_sock );
    }

    assert( s_instance );
    s_instance = nullptr;
}
// Returns true once the profiler destructor has requested shutdown.
bool Profiler::ShouldExit()
{
return s_instance->m_shutdown.load( std::memory_order_relaxed );
}
2017-09-10 15:43:56 +00:00
void Profiler::Worker()
{
rpmalloc_thread_initialize();
const auto procname = GetProcessName();
const auto pnsz = std::min<size_t>( strlen( procname ), WelcomeMessageProgramNameSize - 1 );
while( m_timeBegin.load( std::memory_order_relaxed ) == 0 ) std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) );
#ifdef TRACY_ON_DEMAND
uint8_t onDemand = 1;
#else
uint8_t onDemand = 0;
#endif
WelcomeMessage welcome;
MemWrite( &welcome.timerMul, m_timerMul );
MemWrite( &welcome.initBegin, s_initTime.val );
MemWrite( &welcome.initEnd, m_timeBegin.load( std::memory_order_relaxed ) );
MemWrite( &welcome.delay, m_delay );
MemWrite( &welcome.resolution, m_resolution );
MemWrite( &welcome.epoch, m_epoch );
MemWrite( &welcome.onDemand, onDemand );
memcpy( welcome.programName, procname, pnsz );
memset( welcome.programName + pnsz, 0, WelcomeMessageProgramNameSize - pnsz );
moodycamel::ConsumerToken token( s_queue );
2017-09-11 20:51:47 +00:00
ListenSocket listen;
listen.Listen( "8086", 8 );
2017-09-10 15:43:56 +00:00
for(;;)
{
2017-09-11 20:51:47 +00:00
for(;;)
{
2017-10-18 18:01:12 +00:00
#ifndef TRACY_NO_EXIT
if( !m_noExit && ShouldExit() ) return;
2017-10-18 18:01:12 +00:00
#endif
m_sock = listen.Accept();
if( m_sock ) break;
}
2017-09-11 20:51:47 +00:00
2018-07-10 19:50:00 +00:00
#ifdef TRACY_ON_DEMAND
ClearQueues( token );
2018-07-10 19:50:00 +00:00
m_isConnected.store( true, std::memory_order_relaxed );
#endif
LZ4_resetStream( m_stream );
m_sock->Send( &welcome, sizeof( welcome ) );
#ifdef TRACY_ON_DEMAND
OnDemandPayloadMessage onDemand;
onDemand.frames = m_frameCount.load( std::memory_order_relaxed );
m_sock->Send( &onDemand, sizeof( onDemand ) );
2018-07-11 10:28:40 +00:00
m_deferredLock.lock();
for( auto& item : m_deferredQueue )
{
const auto idx = MemRead<uint8_t>( &item.hdr.idx );
AppendData( &item, QueueDataSize[idx] );
}
m_deferredLock.unlock();
#endif
int keepAlive = 0;
2017-09-11 20:51:47 +00:00
for(;;)
{
2017-10-18 16:48:51 +00:00
const auto status = Dequeue( token );
2018-04-01 18:04:35 +00:00
const auto serialStatus = DequeueSerial();
if( status == ConnectionLost || serialStatus == ConnectionLost )
2017-09-11 20:51:47 +00:00
{
2017-10-18 16:48:51 +00:00
break;
2017-09-11 20:51:47 +00:00
}
2018-04-01 18:04:35 +00:00
else if( status == QueueEmpty && serialStatus == QueueEmpty )
2017-09-11 20:51:47 +00:00
{
2017-10-18 16:48:51 +00:00
if( ShouldExit() ) break;
if( m_bufferOffset != m_bufferStart )
{
if( !CommitData() ) break;
}
if( keepAlive == 500 )
{
QueueItem ka;
ka.hdr.type = QueueType::KeepAlive;
AppendData( &ka, QueueDataSize[ka.hdr.idx] );
if( !CommitData() ) break;
keepAlive = 0;
}
else
{
keepAlive++;
std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) );
}
}
else
{
keepAlive = 0;
2017-09-11 20:51:47 +00:00
}
2017-09-14 17:25:16 +00:00
while( m_sock->HasData() )
{
if( !HandleServerQuery() ) break;
2017-09-14 17:25:16 +00:00
}
}
2017-10-18 16:48:51 +00:00
if( ShouldExit() ) break;
2018-07-10 19:50:00 +00:00
#ifdef TRACY_ON_DEMAND
m_isConnected.store( false, std::memory_order_relaxed );
#endif
2017-10-18 16:48:51 +00:00
}
for(;;)
{
const auto status = Dequeue( token );
2018-04-01 18:04:35 +00:00
const auto serialStatus = DequeueSerial();
if( status == ConnectionLost || serialStatus == ConnectionLost )
{
break;
}
2018-04-01 18:04:35 +00:00
else if( status == QueueEmpty && serialStatus == QueueEmpty )
{
if( m_bufferOffset != m_bufferStart ) CommitData();
break;
}
while( m_sock->HasData() )
{
if( !HandleServerQuery() ) break;
}
}
2017-10-18 16:48:51 +00:00
QueueItem terminate;
MemWrite( &terminate.hdr.type, QueueType::Terminate );
2017-10-18 16:48:51 +00:00
if( !SendData( (const char*)&terminate, 1 ) ) return;
for(;;)
{
if( m_sock->HasData() )
{
while( m_sock->HasData() )
{
if( !HandleServerQuery() )
{
if( m_bufferOffset != m_bufferStart ) CommitData();
return;
}
}
2017-10-18 16:48:51 +00:00
while( Dequeue( token ) == Success ) {}
2018-04-01 18:04:35 +00:00
while( DequeueSerial() == Success ) {}
if( m_bufferOffset != m_bufferStart )
{
if( !CommitData() ) return;
}
2017-10-18 16:48:51 +00:00
}
else
{
if( m_bufferOffset != m_bufferStart ) CommitData();
2017-10-18 16:48:51 +00:00
std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) );
}
}
}
// Releases the heap payload referenced by a queue item that will never be
// sent (used when discarding queues). Items at or above Terminate carry no
// payload and are ignored.
static void FreeAssociatedMemory( const QueueItem& item )
{
    if( item.hdr.idx >= (int)QueueType::Terminate ) return;

    const auto type = item.hdr.type;
    uint64_t payload;
    if( type == QueueType::ZoneText || type == QueueType::ZoneName )
    {
        payload = MemRead<uint64_t>( &item.zoneText.text );
    }
    else if( type == QueueType::Message )
    {
        payload = MemRead<uint64_t>( &item.message.text );
    }
    else if( type == QueueType::ZoneBeginAllocSrcLoc )
    {
        payload = MemRead<uint64_t>( &item.zoneBegin.srcloc );
    }
    else if( type == QueueType::CallstackMemory )
    {
        payload = MemRead<uint64_t>( &item.callstackMemory.ptr );
    }
    else if( type == QueueType::Callstack )
    {
        payload = MemRead<uint64_t>( &item.callstack.ptr );
    }
    else
    {
        // Every payload-carrying type must be handled above.
        assert( false );
        return;
    }
    tracy_free( (void*)payload );
}
// Discards all currently queued items — both the concurrent queue and the
// serial queues — freeing any heap payloads they own. Called when a server
// connects in on-demand mode so stale events are not transmitted.
void Profiler::ClearQueues( moodycamel::ConsumerToken& token )
{
    for(;;)
    {
        const auto sz = s_queue.try_dequeue_bulk( token, m_itemBuf, BulkSize );
        if( sz == 0 ) break;
        for( size_t i=0; i<sz; i++ ) FreeAssociatedMemory( m_itemBuf[i] );
    }

    std::lock_guard<TracyMutex> lock( m_serialLock );
    for( auto& v : m_serialDequeue ) FreeAssociatedMemory( v );
    m_serialDequeue.clear();
    for( auto& v : m_serialQueue ) FreeAssociatedMemory( v );
    m_serialQueue.clear();
}
// Moves a batch of items from the concurrent queue into the network buffer.
// Items carrying heap payloads (strings, allocated source locations,
// callstacks) have the payload sent as a separate message and freed here.
// Returns Success, QueueEmpty, or ConnectionLost.
Profiler::DequeueStatus Profiler::Dequeue( moodycamel::ConsumerToken& token )
{
    const auto sz = s_queue.try_dequeue_bulk( token, m_itemBuf, BulkSize );
    if( sz > 0 )
    {
        auto end = m_itemBuf + sz;
        auto item = m_itemBuf;
        while( item != end )
        {
            uint64_t ptr;
            const auto idx = MemRead<uint8_t>( &item->hdr.idx );
            if( idx < (int)QueueType::Terminate )
            {
                switch( (QueueType)idx )
                {
                case QueueType::ZoneText:
                case QueueType::ZoneName:
                    ptr = MemRead<uint64_t>( &item->zoneText.text );
                    SendString( ptr, (const char*)ptr, QueueType::CustomStringData );
                    tracy_free( (void*)ptr );
                    break;
                case QueueType::Message:
                    ptr = MemRead<uint64_t>( &item->message.text );
                    SendString( ptr, (const char*)ptr, QueueType::CustomStringData );
                    tracy_free( (void*)ptr );
                    break;
                case QueueType::ZoneBeginAllocSrcLoc:
                    ptr = MemRead<uint64_t>( &item->zoneBegin.srcloc );
                    SendSourceLocationPayload( ptr );
                    tracy_free( (void*)ptr );
                    break;
                case QueueType::Callstack:
                    ptr = MemRead<uint64_t>( &item->callstack.ptr );
                    SendCallstackPayload( ptr );
                    tracy_free( (void*)ptr );
                    break;
                default:
                    assert( false );
                    break;
                }
            }
            if( !AppendData( item, QueueDataSize[idx] ) ) return ConnectionLost;
            item++;
        }
    }
    else
    {
        return QueueEmpty;
    }
    return Success;
}
// Drains the serial (strictly ordered) event queue. The pending queue is
// swapped with the dequeue buffer under the lock, so producers are blocked
// only for the duration of the swap. Returns Success, QueueEmpty, or
// ConnectionLost.
Profiler::DequeueStatus Profiler::DequeueSerial()
{
    {
        std::lock_guard<TracyMutex> lock( m_serialLock );
        m_serialQueue.swap( m_serialDequeue );
    }

    const auto sz = m_serialDequeue.size();
    if( sz > 0 )
    {
        auto item = m_serialDequeue.data();
        auto end = item + sz;
        while( item != end )
        {
            uint64_t ptr;
            const auto idx = MemRead<uint8_t>( &item->hdr.idx );
            if( idx < (int)QueueType::Terminate )
            {
                switch( (QueueType)idx )
                {
                case QueueType::CallstackMemory:
                    ptr = MemRead<uint64_t>( &item->callstackMemory.ptr );
                    SendCallstackPayload( ptr );
                    tracy_free( (void*)ptr );
                    break;
                default:
                    assert( false );
                    break;
                }
            }
            if( !AppendData( item, QueueDataSize[idx] ) ) return ConnectionLost;
            item++;
        }
        m_serialDequeue.clear();
    }
    else
    {
        return QueueEmpty;
    }
    return Success;
}
bool Profiler::AppendData( const void* data, size_t len )
{
auto ret = true;
ret = NeedDataSize( len );
2018-06-23 00:16:58 +00:00
AppendDataUnsafe( data, len );
return ret;
}
// Compresses and sends the pending [m_bufferStart, m_bufferOffset) region of
// the staging buffer. Returns false when sending failed (connection lost).
bool Profiler::CommitData()
{
bool ret = SendData( m_buffer + m_bufferStart, m_bufferOffset - m_bufferStart );
// Wrap the cursor back to the start before it can overrun the buffer
// (which is TargetFrameSize*3 bytes; see the constructor's m_buffer alloc).
if( m_bufferOffset > TargetFrameSize * 2 ) m_bufferOffset = 0;
m_bufferStart = m_bufferOffset;
return ret;
}
// Ensures len more bytes can be appended to the current frame, committing the
// pending data when they would not fit. Returns false if that commit failed.
bool Profiler::NeedDataSize( size_t len )
{
    if( m_bufferOffset - m_bufferStart + len <= TargetFrameSize ) return true;
    return CommitData();
}
bool Profiler::SendData( const char* data, size_t len )
{
2017-11-02 16:37:10 +00:00
const lz4sz_t lz4sz = LZ4_compress_fast_continue( m_stream, data, m_lz4Buf + sizeof( lz4sz_t ), (int)len, LZ4Size, 1 );
memcpy( m_lz4Buf, &lz4sz, sizeof( lz4sz ) );
return m_sock->Send( m_lz4Buf, lz4sz + sizeof( lz4sz_t ) ) != -1;
}
2018-06-19 17:00:57 +00:00
void Profiler::SendString( uint64_t str, const char* ptr, QueueType type )
{
2018-08-04 18:48:21 +00:00
assert( type == QueueType::StringData || type == QueueType::ThreadName || type == QueueType::CustomStringData || type == QueueType::PlotName || type == QueueType::FrameName );
2017-10-03 14:41:32 +00:00
QueueItem item;
MemWrite( &item.hdr.type, type );
MemWrite( &item.stringTransfer.ptr, str );
auto len = strlen( ptr );
assert( len <= std::numeric_limits<uint16_t>::max() );
2017-10-16 18:42:53 +00:00
auto l16 = uint16_t( len );
NeedDataSize( QueueDataSize[(int)type] + sizeof( l16 ) + l16 );
AppendDataUnsafe( &item, QueueDataSize[(int)type] );
AppendDataUnsafe( &l16, sizeof( l16 ) );
AppendDataUnsafe( ptr, l16 );
}
// Transfers a static SourceLocationData record to the server.
void Profiler::SendSourceLocation( uint64_t ptr )
{
    auto srcloc = (const SourceLocationData*)ptr;
    const auto color = srcloc->color;

    QueueItem item;
    MemWrite( &item.hdr.type, QueueType::SourceLocation );
    MemWrite( &item.srcloc.name, (uint64_t)srcloc->name );
    MemWrite( &item.srcloc.file, (uint64_t)srcloc->file );
    MemWrite( &item.srcloc.function, (uint64_t)srcloc->function );
    MemWrite( &item.srcloc.line, srcloc->line );
    // Split the color into channels: red in the low byte, then green, blue.
    MemWrite( &item.srcloc.r, uint8_t( color & 0xFF ) );
    MemWrite( &item.srcloc.g, uint8_t( ( color >> 8 ) & 0xFF ) );
    MemWrite( &item.srcloc.b, uint8_t( ( color >> 16 ) & 0xFF ) );
    AppendData( &item, QueueDataSize[(int)QueueType::SourceLocation] );
}
2018-06-19 17:00:57 +00:00
// Sends a dynamically allocated source location payload. The allocation is
// prefixed with a 32-bit total length (which includes the 4 prefix bytes);
// only the bytes after the prefix go on the wire.
void Profiler::SendSourceLocationPayload( uint64_t _ptr )
{
auto ptr = (const char*)_ptr;
QueueItem item;
MemWrite( &item.hdr.type, QueueType::SourceLocationPayload );
MemWrite( &item.stringTransfer.ptr, _ptr );
// First 4 bytes hold the total length; the wire length excludes them.
const auto len = *((uint32_t*)ptr);
assert( len <= std::numeric_limits<uint16_t>::max() );
assert( len > 4 );
const auto l16 = uint16_t( len - 4 );
// Reserve room for the whole message so a flush cannot split it.
NeedDataSize( QueueDataSize[(int)QueueType::SourceLocationPayload] + sizeof( l16 ) + l16 );
AppendDataUnsafe( &item, QueueDataSize[(int)QueueType::SourceLocationPayload] );
AppendDataUnsafe( &l16, sizeof( l16 ) );
AppendDataUnsafe( ptr + 4, l16 );
}
2018-06-19 17:09:43 +00:00
void Profiler::SendCallstackPayload( uint64_t _ptr )
{
auto ptr = (uintptr_t*)_ptr;
QueueItem item;
MemWrite( &item.hdr.type, QueueType::CallstackPayload );
MemWrite( &item.stringTransfer.ptr, _ptr );
const auto sz = *ptr++;
const auto len = sz * sizeof( uint64_t );
const auto l16 = uint16_t( len );
NeedDataSize( QueueDataSize[(int)QueueType::CallstackPayload] + sizeof( l16 ) + l16 );
AppendDataUnsafe( &item, QueueDataSize[(int)QueueType::CallstackPayload] );
AppendDataUnsafe( &l16, sizeof( l16 ) );
2018-06-19 17:09:43 +00:00
2018-08-01 12:07:30 +00:00
if( compile_time_condition<sizeof( uintptr_t ) == sizeof( uint64_t )>::value )
2018-06-19 17:09:43 +00:00
{
AppendDataUnsafe( ptr, sizeof( uint64_t ) * sz );
}
else
{
for( uintptr_t i=0; i<sz; i++ )
{
const auto val = uint64_t( *ptr++ );
AppendDataUnsafe( &val, sizeof( uint64_t ) );
}
2018-06-19 17:09:43 +00:00
}
}
2018-06-19 23:06:31 +00:00
// Resolves a callstack address to symbol name / file / line and sends the
// result. Compiles to a no-op when callstack support is not available.
void Profiler::SendCallstackFrame( uint64_t ptr )
{
#ifdef TRACY_HAS_CALLSTACK
    auto frame = DecodeCallstackPtr( ptr );

    SendString( uint64_t( frame.name ), frame.name, QueueType::CustomStringData );
    SendString( uint64_t( frame.file ), frame.file, QueueType::CustomStringData );

    QueueItem item;
    MemWrite( &item.hdr.type, QueueType::CallstackFrame );
    MemWrite( &item.callstackFrame.ptr, ptr );
    MemWrite( &item.callstackFrame.name, (uint64_t)frame.name );
    MemWrite( &item.callstackFrame.file, (uint64_t)frame.file );
    MemWrite( &item.callstackFrame.line, frame.line );
    AppendData( &item, QueueDataSize[(int)QueueType::CallstackFrame] );

    // Ownership of the decoded strings rests with us; release after sending.
    tracy_free( (void*)frame.name );
    tracy_free( (void*)frame.file );
#endif
}
static bool DontExit() { return false; }
bool Profiler::HandleServerQuery()
{
timeval tv;
tv.tv_sec = 0;
tv.tv_usec = 10000;
uint8_t type;
2017-10-17 20:02:47 +00:00
if( !m_sock->Read( &type, sizeof( type ), &tv, DontExit ) ) return false;
uint64_t ptr;
2017-10-17 20:02:47 +00:00
if( !m_sock->Read( &ptr, sizeof( ptr ), &tv, DontExit ) ) return false;
switch( type )
{
case ServerQueryString:
SendString( ptr, (const char*)ptr, QueueType::StringData );
break;
2017-09-21 23:55:02 +00:00
case ServerQueryThreadString:
2017-09-22 23:38:26 +00:00
if( ptr == m_mainThread )
{
SendString( ptr, "Main thread", QueueType::ThreadName );
}
else
{
SendString( ptr, GetThreadName( ptr ), QueueType::ThreadName );
}
2017-09-21 23:55:02 +00:00
break;
case ServerQuerySourceLocation:
SendSourceLocation( ptr );
break;
2017-10-13 01:36:59 +00:00
case ServerQueryPlotName:
SendString( ptr, (const char*)ptr, QueueType::PlotName );
break;
2017-10-18 16:48:51 +00:00
case ServerQueryTerminate:
return false;
2018-06-19 23:06:31 +00:00
case ServerQueryCallstackFrame:
SendCallstackFrame( ptr );
break;
2018-08-04 18:48:21 +00:00
case ServerQueryFrameName:
SendString( ptr, (const char*)ptr, QueueType::FrameName );
break;
default:
assert( false );
break;
}
return true;
}
2017-09-23 19:33:05 +00:00
// Computes m_timerMul, the factor converting raw hardware-timer ticks to
// nanoseconds, by sampling both the hardware timer and the wall clock
// around a 200 ms sleep. When no hardware timer is used the values are
// already nanoseconds and the multiplier is 1.
void Profiler::CalibrateTimer()
{
#ifdef TRACY_HW_TIMER
#  if __ARM_ARCH >= 6
    if( GetTimeImpl == GetTimeImplFallback )
    {
        // The chrono-based fallback already reports nanoseconds.
        m_timerMul = 1.;
        return;
    }
#  endif

    // Signal fences act as compiler barriers so the paired clock reads are
    // not reordered around each other or around the sleep.
    std::atomic_signal_fence( std::memory_order_acq_rel );
    const auto t0 = std::chrono::high_resolution_clock::now();
    const auto r0 = GetTime();
    std::atomic_signal_fence( std::memory_order_acq_rel );
    std::this_thread::sleep_for( std::chrono::milliseconds( 200 ) );
    std::atomic_signal_fence( std::memory_order_acq_rel );
    const auto t1 = std::chrono::high_resolution_clock::now();
    const auto r1 = GetTime();
    std::atomic_signal_fence( std::memory_order_acq_rel );

    const auto dt = std::chrono::duration_cast<std::chrono::nanoseconds>( t1 - t0 ).count();
    const auto dr = r1 - r0;
    // nanoseconds elapsed / timer ticks elapsed
    m_timerMul = double( dt ) / double( dr );
#else
    m_timerMul = 1.;
#endif
}
// Stand-in for a real scoped zone used by CalibrateDelay(): it mirrors the
// construction cost of a zone object without emitting any profiling events,
// providing the baseline for the delay measurement.
class FakeZone
{
public:
    FakeZone( const SourceLocationData* srcloc ) : m_id( (uint64_t)srcloc ) {}
    ~FakeZone() {}

private:
    // volatile keeps the compiler from optimizing the object away entirely.
    volatile uint64_t m_id;
};
void Profiler::CalibrateDelay()
{
enum { Iterations = 50000 };
enum { Events = Iterations * 2 }; // start + end
static_assert( Events * 2 < QueuePrealloc, "Delay calibration loop will allocate memory in queue" );
2017-10-10 23:04:21 +00:00
moodycamel::ProducerToken ptoken_detail( s_queue );
moodycamel::ConcurrentQueue<QueueItem>::ExplicitProducer* ptoken = s_queue.get_explicit_producer( ptoken_detail );
2017-09-24 14:02:09 +00:00
for( int i=0; i<Iterations; i++ )
{
static const tracy::SourceLocationData __tracy_source_location { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 };
{
2017-10-03 12:50:55 +00:00
Magic magic;
2017-10-10 23:27:22 +00:00
auto& tail = ptoken->get_tail_index();
2017-10-10 23:04:21 +00:00
auto item = ptoken->enqueue_begin<moodycamel::CanAlloc>( magic );
MemWrite( &item->hdr.type, QueueType::ZoneBegin );
MemWrite( &item->zoneBegin.thread, GetThreadHandle() );
#ifdef TRACY_RDTSCP_OPT
MemWrite( &item->zoneBegin.time, Profiler::GetTime( item->zoneBegin.cpu ) );
#else
uint32_t cpu;
MemWrite( &item->zoneBegin.time, Profiler::GetTime( cpu ) );
MemWrite( &item->zoneBegin.cpu, cpu );
#endif
MemWrite( &item->zoneBegin.srcloc, (uint64_t)&__tracy_source_location );
2017-10-10 23:27:22 +00:00
tail.store( magic + 1, std::memory_order_release );
}
{
2017-10-03 12:50:55 +00:00
Magic magic;
2017-10-10 23:27:22 +00:00
auto& tail = ptoken->get_tail_index();
2017-10-10 23:04:21 +00:00
auto item = ptoken->enqueue_begin<moodycamel::CanAlloc>( magic );
MemWrite( &item->hdr.type, QueueType::ZoneEnd );
MemWrite( &item->zoneEnd.thread, uint64_t( 0 ) );
#ifdef TRACY_RDTSCP_OPT
MemWrite( &item->zoneEnd.time, GetTime( item->zoneEnd.cpu ) );
#else
uint32_t cpu;
MemWrite( &item->zoneEnd.time, GetTime( cpu ) );
MemWrite( &item->zoneEnd.cpu, cpu );
#endif
2017-10-10 23:27:22 +00:00
tail.store( magic + 1, std::memory_order_release );
}
2017-09-24 14:02:09 +00:00
}
const auto f0 = GetTime();
2017-09-24 14:02:09 +00:00
for( int i=0; i<Iterations; i++ )
{
static const tracy::SourceLocationData __tracy_source_location { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 };
FakeZone ___tracy_scoped_zone( &__tracy_source_location );
2017-09-24 14:02:09 +00:00
}
const auto t0 = GetTime();
2017-09-24 14:02:09 +00:00
for( int i=0; i<Iterations; i++ )
{
static const tracy::SourceLocationData __tracy_source_location { nullptr, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 };
{
2017-10-03 12:50:55 +00:00
Magic magic;
2017-10-10 23:27:22 +00:00
auto& tail = ptoken->get_tail_index();
2017-10-10 23:04:21 +00:00
auto item = ptoken->enqueue_begin<moodycamel::CanAlloc>( magic );
MemWrite( &item->hdr.type, QueueType::ZoneBegin );
MemWrite( &item->zoneBegin.thread, GetThreadHandle() );
#ifdef TRACY_RDTSCP_OPT
MemWrite( &item->zoneBegin.time, Profiler::GetTime( item->zoneBegin.cpu ) );
#else
uint32_t cpu;
MemWrite( &item->zoneBegin.time, Profiler::GetTime( cpu ) );
MemWrite( &item->zoneBegin.cpu, cpu );
#endif
MemWrite( &item->zoneBegin.srcloc, (uint64_t)&__tracy_source_location );
2017-10-10 23:27:22 +00:00
tail.store( magic + 1, std::memory_order_release );
}
{
2017-10-03 12:50:55 +00:00
Magic magic;
2017-10-10 23:27:22 +00:00
auto& tail = ptoken->get_tail_index();
2017-10-10 23:04:21 +00:00
auto item = ptoken->enqueue_begin<moodycamel::CanAlloc>( magic );
MemWrite( &item->hdr.type, QueueType::ZoneEnd );
MemWrite( &item->zoneEnd.thread, uint64_t( 0 ) );
#ifdef TRACY_RDTSCP_OPT
MemWrite( &item->zoneEnd.time, GetTime( item->zoneEnd.cpu ) );
#else
uint32_t cpu;
MemWrite( &item->zoneEnd.time, GetTime( cpu ) );
MemWrite( &item->zoneEnd.cpu, cpu );
#endif
2017-10-10 23:27:22 +00:00
tail.store( magic + 1, std::memory_order_release );
}
2017-09-24 14:02:09 +00:00
}
const auto t1 = GetTime();
2017-09-24 14:02:09 +00:00
const auto dt = t1 - t0;
const auto df = t0 - f0;
m_delay = ( dt - df ) / Events;
2017-10-16 18:42:53 +00:00
auto mindiff = std::numeric_limits<int64_t>::max();
2017-09-29 16:29:39 +00:00
for( int i=0; i<Iterations * 10; i++ )
{
const auto t0i = GetTime();
const auto t1i = GetTime();
const auto dti = t1i - t0i;
if( dti > 0 && dti < mindiff ) mindiff = dti;
2017-09-29 16:29:39 +00:00
}
m_resolution = mindiff;
2017-09-24 14:02:09 +00:00
enum { Bulk = 1000 };
moodycamel::ConsumerToken token( s_queue );
int left = Events * 2;
QueueItem item[Bulk];
while( left != 0 )
{
const auto sz = s_queue.try_dequeue_bulk( token, item, std::min( left, (int)Bulk ) );
assert( sz > 0 );
2017-10-16 18:42:53 +00:00
left -= (int)sz;
2017-09-24 14:02:09 +00:00
}
}
}
#endif