tracy/client/TracyProfiler.hpp

669 lines
21 KiB
C++
Raw Normal View History

2017-09-10 15:43:56 +00:00
#ifndef __TRACYPROFILER_HPP__
#define __TRACYPROFILER_HPP__
2018-08-05 00:09:59 +00:00
#include <assert.h>
2017-09-10 15:43:56 +00:00
#include <atomic>
2017-09-10 18:08:42 +00:00
#include <stdint.h>
2017-10-14 15:20:37 +00:00
#include <string.h>
2017-09-10 15:43:56 +00:00
#include "tracy_concurrentqueue.h"
#include "TracyCallstack.hpp"
2019-02-21 20:59:02 +00:00
#include "TracySysTime.hpp"
2018-04-01 17:53:05 +00:00
#include "TracyFastVector.hpp"
2017-09-13 20:56:08 +00:00
#include "../common/TracyQueue.hpp"
#include "../common/TracyAlign.hpp"
#include "../common/TracyAlloc.hpp"
Use the fastest mutex available. The selection is based on the following test results: MSVC: === Lock test, 6 threads === => NonRecursiveBenaphore No contention: 11.641 ns/iter 2 thread contention: 141.559 ns/iter 3 thread contention: 242.733 ns/iter 4 thread contention: 409.807 ns/iter 5 thread contention: 561.544 ns/iter 6 thread contention: 785.845 ns/iter => std::mutex No contention: 19.190 ns/iter 2 thread contention: 39.305 ns/iter 3 thread contention: 58.999 ns/iter 4 thread contention: 59.532 ns/iter 5 thread contention: 103.539 ns/iter 6 thread contention: 110.314 ns/iter => std::shared_timed_mutex No contention: 45.487 ns/iter 2 thread contention: 96.351 ns/iter 3 thread contention: 142.871 ns/iter 4 thread contention: 184.999 ns/iter 5 thread contention: 336.608 ns/iter 6 thread contention: 542.551 ns/iter => std::shared_mutex No contention: 10.861 ns/iter 2 thread contention: 17.495 ns/iter 3 thread contention: 31.126 ns/iter 4 thread contention: 40.468 ns/iter 5 thread contention: 15.677 ns/iter 6 thread contention: 64.505 ns/iter Cygwin (clang): === Lock test, 6 threads === => NonRecursiveBenaphore No contention: 11.536 ns/iter 2 thread contention: 121.082 ns/iter 3 thread contention: 396.430 ns/iter 4 thread contention: 672.555 ns/iter 5 thread contention: 1327.761 ns/iter 6 thread contention: 14151.955 ns/iter => std::mutex No contention: 62.583 ns/iter 2 thread contention: 3990.464 ns/iter 3 thread contention: 7161.189 ns/iter 4 thread contention: 9870.820 ns/iter 5 thread contention: 12355.178 ns/iter 6 thread contention: 14694.903 ns/iter => std::shared_timed_mutex No contention: 91.687 ns/iter 2 thread contention: 1115.037 ns/iter 3 thread contention: 4183.792 ns/iter 4 thread contention: 15283.491 ns/iter 5 thread contention: 27812.477 ns/iter 6 thread contention: 35028.140 ns/iter => std::shared_mutex No contention: 91.764 ns/iter 2 thread contention: 1051.826 ns/iter 3 thread contention: 5574.720 ns/iter 4 thread contention: 15721.416 ns/iter 5 thread contention: 27721.487 ns/iter 6 thread contention: 35420.404 ns/iter Linux (x64): === Lock test, 6 threads === => NonRecursiveBenaphore No contention: 13.487 ns/iter 2 thread contention: 210.317 ns/iter 3 thread contention: 430.855 ns/iter 4 thread contention: 510.533 ns/iter 5 thread contention: 1003.609 ns/iter 6 thread contention: 1787.683 ns/iter => std::mutex No contention: 12.403 ns/iter 2 thread contention: 157.122 ns/iter 3 thread contention: 186.791 ns/iter 4 thread contention: 265.073 ns/iter 5 thread contention: 283.778 ns/iter 6 thread contention: 270.687 ns/iter => std::shared_timed_mutex No contention: 21.509 ns/iter 2 thread contention: 150.179 ns/iter 3 thread contention: 256.574 ns/iter 4 thread contention: 415.351 ns/iter 5 thread contention: 611.532 ns/iter 6 thread contention: 944.695 ns/iter => std::shared_mutex No contention: 20.805 ns/iter 2 thread contention: 157.034 ns/iter 3 thread contention: 244.025 ns/iter 4 thread contention: 406.269 ns/iter 5 thread contention: 387.985 ns/iter 6 thread contention: 468.550 ns/iter Linux (arm64): === Lock test, 6 threads === => NonRecursiveBenaphore No contention: 20.891 ns/iter 2 thread contention: 211.037 ns/iter 3 thread contention: 409.962 ns/iter 4 thread contention: 657.441 ns/iter 5 thread contention: 828.405 ns/iter 6 thread contention: 1131.827 ns/iter => std::mutex No contention: 50.884 ns/iter 2 thread contention: 103.620 ns/iter 3 thread contention: 332.429 ns/iter 4 thread contention: 620.802 ns/iter 5 thread contention: 783.943 ns/iter 6 thread contention: 834.002 ns/iter => std::shared_timed_mutex No contention: 64.948 ns/iter 2 thread contention: 173.191 ns/iter 3 thread contention: 490.352 ns/iter 4 thread contention: 660.668 ns/iter 5 thread contention: 1014.546 ns/iter 6 thread contention: 1451.553 ns/iter => std::shared_mutex No contention: 64.521 ns/iter 2 thread contention: 195.222 ns/iter 3 thread contention: 490.819 ns/iter 4 thread contention: 654.786 ns/iter 5 thread contention: 955.759 ns/iter 6 thread contention: 1282.544 ns/iter
2018-07-13 22:39:01 +00:00
#include "../common/TracyMutex.hpp"
#include "../common/TracyProtocol.hpp"
2017-09-10 18:09:14 +00:00
2019-01-19 11:03:30 +00:00
#if defined _WIN32 || defined __CYGWIN__
# include <intrin.h>
#endif
#ifdef __APPLE__
# include <TargetConditionals.h>
# include <mach/mach_time.h>
#endif
2019-01-19 11:03:30 +00:00
#if defined _WIN32 || defined __CYGWIN__ || ( ( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) && !defined __ANDROID__ ) || __ARM_ARCH >= 6
# define TRACY_HW_TIMER
#endif
2019-11-05 19:09:40 +00:00
#if !defined TRACY_HW_TIMER || ( __ARM_ARCH >= 6 && !defined CLOCK_MONOTONIC_RAW )
#include <chrono>
#endif
2019-01-14 19:55:37 +00:00
#ifndef TracyConcat
# define TracyConcat(x,y) TracyConcatIndirect(x,y)
#endif
#ifndef TracyConcatIndirect
# define TracyConcatIndirect(x,y) x##y
#endif
2017-09-10 15:43:56 +00:00
namespace tracy
{
2019-02-19 18:33:37 +00:00
class GpuCtx;
2019-02-19 17:38:08 +00:00
class Profiler;
class Socket;
2019-06-17 00:25:09 +00:00
class UdpBroadcast;
2019-02-19 18:33:37 +00:00
struct GpuCtxWrapper
{
2019-02-19 18:33:37 +00:00
GpuCtx* ptr;
};
TRACY_API moodycamel::ConcurrentQueue<QueueItem>::ExplicitProducer* GetToken();
TRACY_API Profiler& GetProfiler();
TRACY_API std::atomic<uint32_t>& GetLockCounter();
TRACY_API std::atomic<uint8_t>& GetGpuCtxCounter();
TRACY_API GpuCtxWrapper& GetGpuCtx();
TRACY_API uint64_t GetThreadHandle();
TRACY_API void InitRPMallocThread();
2019-02-19 18:33:37 +00:00
struct SourceLocationData
2017-11-14 22:29:48 +00:00
{
2019-02-19 18:33:37 +00:00
const char* name;
const char* function;
const char* file;
uint32_t line;
uint32_t color;
2017-11-14 22:29:48 +00:00
};
2018-07-13 18:20:37 +00:00
#ifdef TRACY_ON_DEMAND
struct LuaZoneState
{
uint32_t counter;
bool active;
};
#endif
#define TracyLfqPrepare( _type ) \
moodycamel::ConcurrentQueueDefaultTraits::index_t __magic; \
auto __token = GetToken(); \
auto& __tail = __token->get_tail_index(); \
auto item = __token->enqueue_begin( __magic ); \
MemWrite( &item->hdr.type, _type );
#define TracyLfqCommit \
__tail.store( __magic + 1, std::memory_order_release );
#define TracyLfqPrepareC( _type ) \
tracy::moodycamel::ConcurrentQueueDefaultTraits::index_t __magic; \
auto __token = tracy::GetToken(); \
auto& __tail = __token->get_tail_index(); \
auto item = __token->enqueue_begin( __magic ); \
tracy::MemWrite( &item->hdr.type, _type );
#define TracyLfqCommitC \
__tail.store( __magic + 1, std::memory_order_release );
2017-10-03 12:50:55 +00:00
2018-04-01 17:53:05 +00:00
typedef void(*ParameterCallback)( uint32_t idx, int32_t val );
2017-09-10 15:43:56 +00:00
class Profiler
{
2019-06-26 20:50:56 +00:00
struct FrameImageQueueItem
{
void* image;
uint64_t frame;
2019-06-26 20:50:56 +00:00
uint16_t w;
uint16_t h;
uint8_t offset;
bool flip;
};
2017-09-10 15:43:56 +00:00
public:
Profiler();
~Profiler();
static tracy_force_inline int64_t GetTime()
{
#ifdef TRACY_HW_TIMER
# if TARGET_OS_IOS == 1
return mach_absolute_time();
# elif __ARM_ARCH >= 6
# ifdef CLOCK_MONOTONIC_RAW
struct timespec ts;
clock_gettime( CLOCK_MONOTONIC_RAW, &ts );
return int64_t( ts.tv_sec ) * 1000000000ll + int64_t( ts.tv_nsec );
# else
return std::chrono::duration_cast<std::chrono::nanoseconds>( std::chrono::high_resolution_clock::now().time_since_epoch() ).count();
# endif
2019-01-19 11:03:30 +00:00
# elif defined _WIN32 || defined __CYGWIN__
Use rdtsc instead of rdtscp. But rdtscp is serializing! No, it's not. Quoting the Intel Instruction Set Reference: "The RDTSCP instruction is not a serializing instruction, but it does wait until all previous instructions have executed and all previous loads are globally visible. But it does not wait for previous stores to be globally visible, and subsequent instructions may begin execution before the read operation is performed.", "The RDTSC instruction is not a serializing instruction. It does not necessarily wait until all previous instructions have been executed before reading the counter. Similarly, subsequent instructions may begin execution before the read operation is performed." So, the difference is in waiting for prior instructions to finish executing. Notice that even in the rdtscp case, execution of the following instructions may commence before time measurement is finished and data stores may be still pending. But, you may say, Intel in its "How to Benchmark Code Execution Times" document shows that using rdtscp is superior to rdstc. Well, not exactly. What they do show is that when a *single function* is considered, there are ways to measure its execution time with little to no error. This is not what Tracy is doing. In our case there is no way to determine absolute "this is before" and "this is after" points of a zone, as we probably already are inside another zone. Stopping the CPU execution, so that a deeply nested zone may be measured with great precision, will skew the measurements of all parent zones. And this is not what we want to measure, anyway. We are not interested in how a *single function* behaves, but how a *whole program* behaves. The out-of-order CPU behavior may influence the measurements? Good! We are interested in that. We want to see *how* the code is really executed. How is *stopping* the CPU to make a timer read an appropriate thing to do, when we want to see how a program is performing? At least that's the theory. And besides all that, the profiling overhead is now reduced.
2019-10-20 18:52:33 +00:00
return int64_t( __rdtsc() );
2019-10-20 23:13:55 +00:00
# elif defined __i386 || defined _M_IX86
uint32_t eax, edx;
Use rdtsc instead of rdtscp. But rdtscp is serializing! No, it's not. Quoting the Intel Instruction Set Reference: "The RDTSCP instruction is not a serializing instruction, but it does wait until all previous instructions have executed and all previous loads are globally visible. But it does not wait for previous stores to be globally visible, and subsequent instructions may begin execution before the read operation is performed.", "The RDTSC instruction is not a serializing instruction. It does not necessarily wait until all previous instructions have been executed before reading the counter. Similarly, subsequent instructions may begin execution before the read operation is performed." So, the difference is in waiting for prior instructions to finish executing. Notice that even in the rdtscp case, execution of the following instructions may commence before time measurement is finished and data stores may be still pending. But, you may say, Intel in its "How to Benchmark Code Execution Times" document shows that using rdtscp is superior to rdstc. Well, not exactly. What they do show is that when a *single function* is considered, there are ways to measure its execution time with little to no error. This is not what Tracy is doing. In our case there is no way to determine absolute "this is before" and "this is after" points of a zone, as we probably already are inside another zone. Stopping the CPU execution, so that a deeply nested zone may be measured with great precision, will skew the measurements of all parent zones. And this is not what we want to measure, anyway. We are not interested in how a *single function* behaves, but how a *whole program* behaves. The out-of-order CPU behavior may influence the measurements? Good! We are interested in that. We want to see *how* the code is really executed. How is *stopping* the CPU to make a timer read an appropriate thing to do, when we want to see how a program is performing? At least that's the theory. And besides all that, the profiling overhead is now reduced.
2019-10-20 18:52:33 +00:00
asm volatile ( "rdtsc" : "=a" (eax), "=d" (edx) );
return ( uint64_t( edx ) << 32 ) + uint64_t( eax );
2019-10-20 23:13:55 +00:00
# elif defined __x86_64__ || defined _M_X64
uint64_t rax, rdx;
asm volatile ( "rdtsc" : "=a" (rax), "=d" (rdx) );
return ( rdx << 32 ) + rax;
# endif
#else
return std::chrono::duration_cast<std::chrono::nanoseconds>( std::chrono::high_resolution_clock::now().time_since_epoch() ).count();
#endif
}
tracy_force_inline uint32_t GetNextZoneId()
{
return m_zoneId.fetch_add( 1, std::memory_order_relaxed );
}
2019-08-12 11:27:15 +00:00
static tracy_force_inline QueueItem* QueueSerial()
{
auto& p = GetProfiler();
p.m_serialLock.lock();
return p.m_serialQueue.prepare_next();
}
static tracy_force_inline void QueueSerialFinish()
{
auto& p = GetProfiler();
p.m_serialQueue.commit_next();
p.m_serialLock.unlock();
}
static tracy_force_inline void SendFrameMark( const char* name )
{
if( !name ) GetProfiler().m_frameCount.fetch_add( 1, std::memory_order_relaxed );
#ifdef TRACY_ON_DEMAND
2019-02-19 17:38:08 +00:00
if( !GetProfiler().IsConnected() ) return;
2018-07-10 20:23:27 +00:00
#endif
TracyLfqPrepare( QueueType::FrameMarkMsg );
MemWrite( &item->frameMark.time, GetTime() );
MemWrite( &item->frameMark.name, uint64_t( name ) );
TracyLfqCommit;
}
2017-09-10 18:09:14 +00:00
2018-08-05 00:09:59 +00:00
static tracy_force_inline void SendFrameMark( const char* name, QueueType type )
2018-08-04 13:04:18 +00:00
{
assert( type == QueueType::FrameMarkMsgStart || type == QueueType::FrameMarkMsgEnd );
2018-08-04 13:04:18 +00:00
#ifdef TRACY_ON_DEMAND
2019-02-19 17:38:08 +00:00
if( !GetProfiler().IsConnected() ) return;
2018-08-04 13:04:18 +00:00
#endif
2019-08-12 11:27:15 +00:00
auto item = QueueSerial();
2018-08-05 00:09:59 +00:00
MemWrite( &item->hdr.type, type );
2018-08-04 13:04:18 +00:00
MemWrite( &item->frameMark.time, GetTime() );
MemWrite( &item->frameMark.name, uint64_t( name ) );
2019-08-12 11:27:15 +00:00
QueueSerialFinish();
2018-08-04 13:04:18 +00:00
}
2019-07-13 10:33:55 +00:00
static tracy_force_inline void SendFrameImage( const void* image, uint16_t w, uint16_t h, uint8_t offset, bool flip )
2019-06-06 19:39:54 +00:00
{
2019-06-26 20:50:56 +00:00
auto& profiler = GetProfiler();
2019-06-06 19:39:54 +00:00
#ifdef TRACY_ON_DEMAND
2019-06-26 20:50:56 +00:00
if( !profiler.IsConnected() ) return;
2019-06-06 19:39:54 +00:00
#endif
const auto sz = size_t( w ) * size_t( h ) * 4;
auto ptr = (char*)tracy_malloc( sz );
memcpy( ptr, image, sz );
2019-06-26 20:50:56 +00:00
profiler.m_fiLock.lock();
auto fi = profiler.m_fiQueue.prepare_next();
fi->image = ptr;
fi->frame = profiler.m_frameCount.load( std::memory_order_relaxed ) - offset;
2019-06-26 20:50:56 +00:00
fi->w = w;
fi->h = h;
fi->flip = flip;
profiler.m_fiQueue.commit_next();
profiler.m_fiLock.unlock();
2019-06-06 19:39:54 +00:00
}
2017-10-13 00:21:29 +00:00
static tracy_force_inline void PlotData( const char* name, int64_t val )
{
#ifdef TRACY_ON_DEMAND
2019-02-19 17:38:08 +00:00
if( !GetProfiler().IsConnected() ) return;
#endif
TracyLfqPrepare( QueueType::PlotData );
MemWrite( &item->plotData.name, (uint64_t)name );
MemWrite( &item->plotData.time, GetTime() );
MemWrite( &item->plotData.type, PlotDataType::Int );
MemWrite( &item->plotData.data.i, val );
TracyLfqCommit;
2017-10-13 00:21:29 +00:00
}
static tracy_force_inline void PlotData( const char* name, float val )
{
#ifdef TRACY_ON_DEMAND
2019-02-19 17:38:08 +00:00
if( !GetProfiler().IsConnected() ) return;
#endif
TracyLfqPrepare( QueueType::PlotData );
MemWrite( &item->plotData.name, (uint64_t)name );
MemWrite( &item->plotData.time, GetTime() );
MemWrite( &item->plotData.type, PlotDataType::Float );
MemWrite( &item->plotData.data.f, val );
TracyLfqCommit;
2017-10-13 00:21:29 +00:00
}
2017-10-13 00:07:03 +00:00
static tracy_force_inline void PlotData( const char* name, double val )
{
#ifdef TRACY_ON_DEMAND
2019-02-19 17:38:08 +00:00
if( !GetProfiler().IsConnected() ) return;
#endif
TracyLfqPrepare( QueueType::PlotData );
MemWrite( &item->plotData.name, (uint64_t)name );
MemWrite( &item->plotData.time, GetTime() );
MemWrite( &item->plotData.type, PlotDataType::Double );
MemWrite( &item->plotData.data.d, val );
TracyLfqCommit;
2017-10-13 00:07:03 +00:00
}
2019-11-05 15:47:41 +00:00
static tracy_force_inline void ConfigurePlot( const char* name, PlotFormatType type )
{
TracyLfqPrepare( QueueType::PlotConfig );
2019-11-05 15:47:41 +00:00
MemWrite( &item->plotConfig.name, (uint64_t)name );
MemWrite( &item->plotConfig.type, (uint8_t)type );
#ifdef TRACY_ON_DEMAND
GetProfiler().DeferItem( *item );
#endif
TracyLfqCommit;
2019-11-05 15:47:41 +00:00
}
2019-11-14 22:40:41 +00:00
static tracy_force_inline void Message( const char* txt, size_t size, int callstack )
2017-10-14 11:23:13 +00:00
{
2018-07-10 21:09:59 +00:00
#ifdef TRACY_ON_DEMAND
2019-02-19 17:38:08 +00:00
if( !GetProfiler().IsConnected() ) return;
2018-07-10 21:09:59 +00:00
#endif
auto ptr = (char*)tracy_malloc( size+1 );
2017-10-14 11:23:13 +00:00
memcpy( ptr, txt, size );
ptr[size] = '\0';
TracyLfqPrepare( callstack == 0 ? QueueType::Message : QueueType::MessageCallstack );
MemWrite( &item->message.time, GetTime() );
MemWrite( &item->message.text, (uint64_t)ptr );
TracyLfqCommit;
2019-11-14 22:40:41 +00:00
if( callstack != 0 ) tracy::GetProfiler().SendCallstack( callstack );
2017-10-14 11:23:13 +00:00
}
2019-11-14 22:40:41 +00:00
static tracy_force_inline void Message( const char* txt, int callstack )
2017-10-15 11:06:49 +00:00
{
2018-07-10 21:09:59 +00:00
#ifdef TRACY_ON_DEMAND
2019-02-19 17:38:08 +00:00
if( !GetProfiler().IsConnected() ) return;
2018-07-10 21:09:59 +00:00
#endif
TracyLfqPrepare( callstack == 0 ? QueueType::MessageLiteral : QueueType::MessageLiteralCallstack );
MemWrite( &item->message.time, GetTime() );
MemWrite( &item->message.text, (uint64_t)txt );
TracyLfqCommit;
2019-11-14 22:40:41 +00:00
if( callstack != 0 ) tracy::GetProfiler().SendCallstack( callstack );
2017-10-15 11:06:49 +00:00
}
2019-11-14 22:40:41 +00:00
static tracy_force_inline void MessageColor( const char* txt, size_t size, uint32_t color, int callstack )
2019-05-10 18:17:44 +00:00
{
#ifdef TRACY_ON_DEMAND
if( !GetProfiler().IsConnected() ) return;
#endif
auto ptr = (char*)tracy_malloc( size+1 );
memcpy( ptr, txt, size );
ptr[size] = '\0';
TracyLfqPrepare( callstack == 0 ? QueueType::MessageColor : QueueType::MessageColorCallstack );
2019-05-10 18:17:44 +00:00
MemWrite( &item->messageColor.time, GetTime() );
MemWrite( &item->messageColor.text, (uint64_t)ptr );
MemWrite( &item->messageColor.r, uint8_t( ( color ) & 0xFF ) );
MemWrite( &item->messageColor.g, uint8_t( ( color >> 8 ) & 0xFF ) );
MemWrite( &item->messageColor.b, uint8_t( ( color >> 16 ) & 0xFF ) );
TracyLfqCommit;
2019-11-14 22:40:41 +00:00
if( callstack != 0 ) tracy::GetProfiler().SendCallstack( callstack );
2019-05-10 18:17:44 +00:00
}
2019-11-14 22:40:41 +00:00
static tracy_force_inline void MessageColor( const char* txt, uint32_t color, int callstack )
2019-05-10 18:17:44 +00:00
{
#ifdef TRACY_ON_DEMAND
if( !GetProfiler().IsConnected() ) return;
#endif
TracyLfqPrepare( callstack == 0 ? QueueType::MessageLiteralColor : QueueType::MessageLiteralColorCallstack );
2019-05-10 18:17:44 +00:00
MemWrite( &item->messageColor.time, GetTime() );
MemWrite( &item->messageColor.text, (uint64_t)txt );
MemWrite( &item->messageColor.r, uint8_t( ( color ) & 0xFF ) );
MemWrite( &item->messageColor.g, uint8_t( ( color >> 8 ) & 0xFF ) );
MemWrite( &item->messageColor.b, uint8_t( ( color >> 16 ) & 0xFF ) );
TracyLfqCommit;
2019-11-14 22:40:41 +00:00
if( callstack != 0 ) tracy::GetProfiler().SendCallstack( callstack );
2019-05-10 18:17:44 +00:00
}
static tracy_force_inline void MessageAppInfo( const char* txt, size_t size )
{
auto ptr = (char*)tracy_malloc( size+1 );
memcpy( ptr, txt, size );
ptr[size] = '\0';
TracyLfqPrepare( QueueType::MessageAppInfo );
MemWrite( &item->message.time, GetTime() );
MemWrite( &item->message.text, (uint64_t)ptr );
#ifdef TRACY_ON_DEMAND
GetProfiler().DeferItem( *item );
#endif
TracyLfqCommit;
}
2018-03-31 19:56:05 +00:00
static tracy_force_inline void MemAlloc( const void* ptr, size_t size )
{
2018-07-11 23:36:01 +00:00
#ifdef TRACY_ON_DEMAND
2019-02-19 17:38:08 +00:00
if( !GetProfiler().IsConnected() ) return;
2018-07-11 23:36:01 +00:00
#endif
const auto thread = GetThreadHandle();
2019-02-19 17:38:08 +00:00
GetProfiler().m_serialLock.lock();
2018-06-20 21:29:44 +00:00
SendMemAlloc( QueueType::MemAlloc, thread, ptr, size );
2019-02-19 17:38:08 +00:00
GetProfiler().m_serialLock.unlock();
2018-03-31 19:56:05 +00:00
}
static tracy_force_inline void MemFree( const void* ptr )
{
2018-07-11 23:36:01 +00:00
#ifdef TRACY_ON_DEMAND
2019-02-19 17:38:08 +00:00
if( !GetProfiler().IsConnected() ) return;
2018-07-11 23:36:01 +00:00
#endif
const auto thread = GetThreadHandle();
2019-02-19 17:38:08 +00:00
GetProfiler().m_serialLock.lock();
2018-06-20 21:29:44 +00:00
SendMemFree( QueueType::MemFree, thread, ptr );
2019-02-19 17:38:08 +00:00
GetProfiler().m_serialLock.unlock();
2018-03-31 19:56:05 +00:00
}
static tracy_force_inline void MemAllocCallstack( const void* ptr, size_t size, int depth )
{
#ifdef TRACY_HAS_CALLSTACK
2019-10-05 00:11:45 +00:00
auto& profiler = GetProfiler();
2018-07-11 23:36:01 +00:00
# ifdef TRACY_ON_DEMAND
if( !profiler.IsConnected() ) return;
2018-07-11 23:36:01 +00:00
# endif
const auto thread = GetThreadHandle();
InitRPMallocThread();
auto callstack = Callstack( depth );
profiler.m_serialLock.lock();
2018-06-20 21:29:44 +00:00
SendMemAlloc( QueueType::MemAllocCallstack, thread, ptr, size );
SendCallstackMemory( callstack );
profiler.m_serialLock.unlock();
#else
MemAlloc( ptr, size );
#endif
}
static tracy_force_inline void MemFreeCallstack( const void* ptr, int depth )
{
#ifdef TRACY_HAS_CALLSTACK
2019-10-05 00:11:45 +00:00
auto& profiler = GetProfiler();
2018-07-11 23:36:01 +00:00
# ifdef TRACY_ON_DEMAND
if( !profiler.IsConnected() ) return;
2018-07-11 23:36:01 +00:00
# endif
const auto thread = GetThreadHandle();
InitRPMallocThread();
auto callstack = Callstack( depth );
profiler.m_serialLock.lock();
2018-06-20 21:29:44 +00:00
SendMemFree( QueueType::MemFreeCallstack, thread, ptr );
SendCallstackMemory( callstack );
profiler.m_serialLock.unlock();
#else
MemFree( ptr );
#endif
}
static tracy_force_inline void SendCallstack( int depth )
2018-06-21 22:56:01 +00:00
{
#ifdef TRACY_HAS_CALLSTACK
auto ptr = Callstack( depth );
TracyLfqPrepare( QueueType::Callstack );
MemWrite( &item->callstack.ptr, (uint64_t)ptr );
TracyLfqCommit;
2018-06-21 22:56:01 +00:00
#endif
}
static tracy_force_inline void ParameterRegister( ParameterCallback cb ) { GetProfiler().m_paramCallback = cb; }
static tracy_force_inline void ParameterSetup( uint32_t idx, const char* name, bool isBool, int32_t val )
{
TracyLfqPrepare( QueueType::ParamSetup );
tracy::MemWrite( &item->paramSetup.idx, idx );
tracy::MemWrite( &item->paramSetup.name, (uint64_t)name );
tracy::MemWrite( &item->paramSetup.isBool, (uint8_t)isBool );
tracy::MemWrite( &item->paramSetup.val, val );
#ifdef TRACY_ON_DEMAND
GetProfiler().DeferItem( *item );
#endif
TracyLfqCommit;
}
void SendCallstack( int depth, const char* skipBefore );
static void CutCallstack( void* callstack, const char* skipBefore );
static bool ShouldExit();
2018-07-10 19:50:00 +00:00
#ifdef TRACY_ON_DEMAND
tracy_force_inline bool IsConnected() const
2018-07-10 19:50:00 +00:00
{
return m_isConnected.load( std::memory_order_acquire );
2018-07-10 19:50:00 +00:00
}
2018-07-11 10:21:39 +00:00
tracy_force_inline uint64_t ConnectionId() const
{
return m_connectionId.load( std::memory_order_acquire );
}
2018-07-11 10:21:39 +00:00
tracy_force_inline void DeferItem( const QueueItem& item )
{
m_deferredLock.lock();
auto dst = m_deferredQueue.push_next();
memcpy( dst, &item, sizeof( item ) );
m_deferredLock.unlock();
}
2018-07-10 19:50:00 +00:00
#endif
void RequestShutdown() { m_shutdown.store( true, std::memory_order_relaxed ); m_shutdownManual.store( true, std::memory_order_relaxed ); }
bool HasShutdownFinished() const { return m_shutdownFinished.load( std::memory_order_relaxed ); }
void SendString( uint64_t ptr, const char* str, QueueType type );
// Allocated source location data layout:
// 4b payload size
// 4b color
// 4b source line
// fsz function name
// 1b null terminator
// ssz source file name
// 1b null terminator
// nsz zone name (optional)
static tracy_force_inline uint64_t AllocSourceLocation( uint32_t line, const char* source, const char* function )
{
const auto fsz = strlen( function );
const auto ssz = strlen( source );
const uint32_t sz = uint32_t( 4 + 4 + 4 + fsz + 1 + ssz + 1 );
auto ptr = (char*)tracy_malloc( sz );
memcpy( ptr, &sz, 4 );
memset( ptr + 4, 0, 4 );
memcpy( ptr + 8, &line, 4 );
memcpy( ptr + 12, function, fsz+1 );
memcpy( ptr + 12 + fsz + 1, source, ssz + 1 );
return uint64_t( ptr );
}
static tracy_force_inline uint64_t AllocSourceLocation( uint32_t line, const char* source, const char* function, const char* name, size_t nameSz )
{
const auto fsz = strlen( function );
const auto ssz = strlen( source );
const uint32_t sz = uint32_t( 4 + 4 + 4 + fsz + 1 + ssz + 1 + nameSz );
auto ptr = (char*)tracy_malloc( sz );
memcpy( ptr, &sz, 4 );
memset( ptr + 4, 0, 4 );
memcpy( ptr + 8, &line, 4 );
memcpy( ptr + 12, function, fsz+1 );
memcpy( ptr + 12 + fsz + 1, source, ssz + 1 );
memcpy( ptr + 12 + fsz + 1 + ssz + 1, name, nameSz );
return uint64_t( ptr );
}
2017-09-10 15:43:56 +00:00
private:
2019-11-14 17:24:29 +00:00
enum class DequeueStatus { DataDequeued, ConnectionLost, QueueEmpty };
2017-10-18 16:48:51 +00:00
static void LaunchWorker( void* ptr ) { ((Profiler*)ptr)->Worker(); }
2017-09-10 15:43:56 +00:00
void Worker();
2019-06-26 20:57:24 +00:00
static void LaunchCompressWorker( void* ptr ) { ((Profiler*)ptr)->CompressWorker(); }
void CompressWorker();
void ClearQueues( tracy::moodycamel::ConsumerToken& token );
void ClearSerial();
DequeueStatus Dequeue( tracy::moodycamel::ConsumerToken& token );
DequeueStatus DequeueContextSwitches( tracy::moodycamel::ConsumerToken& token, int64_t& timeStop );
2018-04-01 18:04:35 +00:00
DequeueStatus DequeueSerial();
bool CommitData();
tracy_force_inline bool AppendData( const void* data, size_t len )
{
const auto ret = NeedDataSize( len );
AppendDataUnsafe( data, len );
return ret;
}
tracy_force_inline bool NeedDataSize( size_t len )
{
assert( len <= TargetFrameSize );
bool ret = true;
if( m_bufferOffset - m_bufferStart + len > TargetFrameSize )
{
ret = CommitData();
}
return ret;
}
2017-10-18 16:48:51 +00:00
2018-06-23 00:16:58 +00:00
tracy_force_inline void AppendDataUnsafe( const void* data, size_t len )
{
memcpy( m_buffer + m_bufferOffset, data, len );
m_bufferOffset += int( len );
}
bool SendData( const char* data, size_t len );
void SendLongString( uint64_t ptr, const char* str, size_t len, QueueType type );
void SendSourceLocation( uint64_t ptr );
2018-06-19 17:00:57 +00:00
void SendSourceLocationPayload( uint64_t ptr );
2018-06-19 17:09:43 +00:00
void SendCallstackPayload( uint64_t ptr );
void SendCallstackPayload64( uint64_t ptr );
2019-02-28 19:30:07 +00:00
void SendCallstackAlloc( uint64_t ptr );
2018-06-19 23:06:31 +00:00
void SendCallstackFrame( uint64_t ptr );
bool HandleServerQuery();
2019-08-01 21:14:09 +00:00
void HandleDisconnect();
void HandleParameter( uint64_t payload );
2020-02-26 21:35:15 +00:00
void HandleSymbolQuery( uint64_t symbol );
2017-09-23 19:33:05 +00:00
void CalibrateTimer();
2017-09-24 14:02:09 +00:00
void CalibrateDelay();
2019-11-29 17:29:09 +00:00
void ReportTopology();
2017-09-23 19:33:05 +00:00
static tracy_force_inline void SendCallstackMemory( void* ptr )
2018-06-20 21:30:19 +00:00
{
#ifdef TRACY_HAS_CALLSTACK
2019-02-19 17:38:08 +00:00
auto item = GetProfiler().m_serialQueue.prepare_next();
2018-06-20 21:30:19 +00:00
MemWrite( &item->hdr.type, QueueType::CallstackMemory );
MemWrite( &item->callstackMemory.ptr, (uint64_t)ptr );
2019-02-19 17:38:08 +00:00
GetProfiler().m_serialQueue.commit_next();
2018-06-20 21:30:19 +00:00
#endif
}
2018-06-20 21:29:44 +00:00
static tracy_force_inline void SendMemAlloc( QueueType type, const uint64_t thread, const void* ptr, size_t size )
{
assert( type == QueueType::MemAlloc || type == QueueType::MemAllocCallstack );
2019-02-19 17:38:08 +00:00
auto item = GetProfiler().m_serialQueue.prepare_next();
2018-06-20 21:29:44 +00:00
MemWrite( &item->hdr.type, type );
MemWrite( &item->memAlloc.time, GetTime() );
MemWrite( &item->memAlloc.thread, thread );
MemWrite( &item->memAlloc.ptr, (uint64_t)ptr );
2018-08-01 12:07:30 +00:00
if( compile_time_condition<sizeof( size ) == 4>::value )
2018-06-20 21:29:44 +00:00
{
memcpy( &item->memAlloc.size, &size, 4 );
memset( &item->memAlloc.size + 4, 0, 2 );
}
else
{
assert( sizeof( size ) == 8 );
memcpy( &item->memAlloc.size, &size, 6 );
}
2019-02-19 17:38:08 +00:00
GetProfiler().m_serialQueue.commit_next();
2018-06-20 21:29:44 +00:00
}
static tracy_force_inline void SendMemFree( QueueType type, const uint64_t thread, const void* ptr )
{
assert( type == QueueType::MemFree || type == QueueType::MemFreeCallstack );
2019-02-19 17:38:08 +00:00
auto item = GetProfiler().m_serialQueue.prepare_next();
2018-06-20 21:29:44 +00:00
MemWrite( &item->hdr.type, type );
MemWrite( &item->memFree.time, GetTime() );
MemWrite( &item->memFree.thread, thread );
MemWrite( &item->memFree.ptr, (uint64_t)ptr );
2019-02-19 17:38:08 +00:00
GetProfiler().m_serialQueue.commit_next();
2018-06-20 21:29:44 +00:00
}
2017-09-23 19:33:05 +00:00
double m_timerMul;
2017-09-29 16:29:39 +00:00
uint64_t m_resolution;
2017-09-24 14:02:09 +00:00
uint64_t m_delay;
std::atomic<int64_t> m_timeBegin;
2017-09-22 23:37:07 +00:00
uint64_t m_mainThread;
uint64_t m_epoch;
2017-09-10 15:43:56 +00:00
std::atomic<bool> m_shutdown;
std::atomic<bool> m_shutdownManual;
std::atomic<bool> m_shutdownFinished;
2017-10-18 17:49:17 +00:00
Socket* m_sock;
2019-06-17 00:25:09 +00:00
UdpBroadcast* m_broadcast;
bool m_noExit;
uint32_t m_userPort;
std::atomic<uint32_t> m_zoneId;
2020-02-25 22:08:52 +00:00
int64_t m_samplingPeriod;
uint64_t m_threadCtx;
int64_t m_refTimeThread;
int64_t m_refTimeSerial;
2019-10-25 17:13:11 +00:00
int64_t m_refTimeCtx;
2019-10-25 17:52:01 +00:00
int64_t m_refTimeGpu;
void* m_stream; // LZ4_stream_t*
char* m_buffer;
int m_bufferOffset;
int m_bufferStart;
2017-11-02 11:56:13 +00:00
2017-11-02 16:37:10 +00:00
char* m_lz4Buf;
2018-04-01 17:53:05 +00:00
FastVector<QueueItem> m_serialQueue, m_serialDequeue;
Use the fastest mutex available. The selection is based on the following test results: MSVC: === Lock test, 6 threads === => NonRecursiveBenaphore No contention: 11.641 ns/iter 2 thread contention: 141.559 ns/iter 3 thread contention: 242.733 ns/iter 4 thread contention: 409.807 ns/iter 5 thread contention: 561.544 ns/iter 6 thread contention: 785.845 ns/iter => std::mutex No contention: 19.190 ns/iter 2 thread contention: 39.305 ns/iter 3 thread contention: 58.999 ns/iter 4 thread contention: 59.532 ns/iter 5 thread contention: 103.539 ns/iter 6 thread contention: 110.314 ns/iter => std::shared_timed_mutex No contention: 45.487 ns/iter 2 thread contention: 96.351 ns/iter 3 thread contention: 142.871 ns/iter 4 thread contention: 184.999 ns/iter 5 thread contention: 336.608 ns/iter 6 thread contention: 542.551 ns/iter => std::shared_mutex No contention: 10.861 ns/iter 2 thread contention: 17.495 ns/iter 3 thread contention: 31.126 ns/iter 4 thread contention: 40.468 ns/iter 5 thread contention: 15.677 ns/iter 6 thread contention: 64.505 ns/iter Cygwin (clang): === Lock test, 6 threads === => NonRecursiveBenaphore No contention: 11.536 ns/iter 2 thread contention: 121.082 ns/iter 3 thread contention: 396.430 ns/iter 4 thread contention: 672.555 ns/iter 5 thread contention: 1327.761 ns/iter 6 thread contention: 14151.955 ns/iter => std::mutex No contention: 62.583 ns/iter 2 thread contention: 3990.464 ns/iter 3 thread contention: 7161.189 ns/iter 4 thread contention: 9870.820 ns/iter 5 thread contention: 12355.178 ns/iter 6 thread contention: 14694.903 ns/iter => std::shared_timed_mutex No contention: 91.687 ns/iter 2 thread contention: 1115.037 ns/iter 3 thread contention: 4183.792 ns/iter 4 thread contention: 15283.491 ns/iter 5 thread contention: 27812.477 ns/iter 6 thread contention: 35028.140 ns/iter => std::shared_mutex No contention: 91.764 ns/iter 2 thread contention: 1051.826 ns/iter 3 thread contention: 5574.720 ns/iter 4 thread contention: 15721.416 ns/iter 5 thread contention: 27721.487 ns/iter 6 thread contention: 35420.404 ns/iter Linux (x64): === Lock test, 6 threads === => NonRecursiveBenaphore No contention: 13.487 ns/iter 2 thread contention: 210.317 ns/iter 3 thread contention: 430.855 ns/iter 4 thread contention: 510.533 ns/iter 5 thread contention: 1003.609 ns/iter 6 thread contention: 1787.683 ns/iter => std::mutex No contention: 12.403 ns/iter 2 thread contention: 157.122 ns/iter 3 thread contention: 186.791 ns/iter 4 thread contention: 265.073 ns/iter 5 thread contention: 283.778 ns/iter 6 thread contention: 270.687 ns/iter => std::shared_timed_mutex No contention: 21.509 ns/iter 2 thread contention: 150.179 ns/iter 3 thread contention: 256.574 ns/iter 4 thread contention: 415.351 ns/iter 5 thread contention: 611.532 ns/iter 6 thread contention: 944.695 ns/iter => std::shared_mutex No contention: 20.805 ns/iter 2 thread contention: 157.034 ns/iter 3 thread contention: 244.025 ns/iter 4 thread contention: 406.269 ns/iter 5 thread contention: 387.985 ns/iter 6 thread contention: 468.550 ns/iter Linux (arm64): === Lock test, 6 threads === => NonRecursiveBenaphore No contention: 20.891 ns/iter 2 thread contention: 211.037 ns/iter 3 thread contention: 409.962 ns/iter 4 thread contention: 657.441 ns/iter 5 thread contention: 828.405 ns/iter 6 thread contention: 1131.827 ns/iter => std::mutex No contention: 50.884 ns/iter 2 thread contention: 103.620 ns/iter 3 thread contention: 332.429 ns/iter 4 thread contention: 620.802 ns/iter 5 thread contention: 783.943 ns/iter 6 thread contention: 834.002 ns/iter => std::shared_timed_mutex No contention: 64.948 ns/iter 2 thread contention: 173.191 ns/iter 3 thread contention: 490.352 ns/iter 4 thread contention: 660.668 ns/iter 5 thread contention: 1014.546 ns/iter 6 thread contention: 1451.553 ns/iter => std::shared_mutex No contention: 64.521 ns/iter 2 thread contention: 195.222 ns/iter 3 thread contention: 490.819 ns/iter 4 thread contention: 654.786 ns/iter 5 thread contention: 955.759 ns/iter 6 thread contention: 1282.544 ns/iter
2018-07-13 22:39:01 +00:00
TracyMutex m_serialLock;
2018-07-10 19:50:00 +00:00
2019-06-26 20:50:56 +00:00
FastVector<FrameImageQueueItem> m_fiQueue, m_fiDequeue;
TracyMutex m_fiLock;
std::atomic<uint64_t> m_frameCount;
2018-07-10 19:50:00 +00:00
#ifdef TRACY_ON_DEMAND
std::atomic<bool> m_isConnected;
std::atomic<uint64_t> m_connectionId;
2018-07-11 10:14:28 +00:00
Use the fastest mutex available. The selection is based on the following test results: MSVC: === Lock test, 6 threads === => NonRecursiveBenaphore No contention: 11.641 ns/iter 2 thread contention: 141.559 ns/iter 3 thread contention: 242.733 ns/iter 4 thread contention: 409.807 ns/iter 5 thread contention: 561.544 ns/iter 6 thread contention: 785.845 ns/iter => std::mutex No contention: 19.190 ns/iter 2 thread contention: 39.305 ns/iter 3 thread contention: 58.999 ns/iter 4 thread contention: 59.532 ns/iter 5 thread contention: 103.539 ns/iter 6 thread contention: 110.314 ns/iter => std::shared_timed_mutex No contention: 45.487 ns/iter 2 thread contention: 96.351 ns/iter 3 thread contention: 142.871 ns/iter 4 thread contention: 184.999 ns/iter 5 thread contention: 336.608 ns/iter 6 thread contention: 542.551 ns/iter => std::shared_mutex No contention: 10.861 ns/iter 2 thread contention: 17.495 ns/iter 3 thread contention: 31.126 ns/iter 4 thread contention: 40.468 ns/iter 5 thread contention: 15.677 ns/iter 6 thread contention: 64.505 ns/iter Cygwin (clang): === Lock test, 6 threads === => NonRecursiveBenaphore No contention: 11.536 ns/iter 2 thread contention: 121.082 ns/iter 3 thread contention: 396.430 ns/iter 4 thread contention: 672.555 ns/iter 5 thread contention: 1327.761 ns/iter 6 thread contention: 14151.955 ns/iter => std::mutex No contention: 62.583 ns/iter 2 thread contention: 3990.464 ns/iter 3 thread contention: 7161.189 ns/iter 4 thread contention: 9870.820 ns/iter 5 thread contention: 12355.178 ns/iter 6 thread contention: 14694.903 ns/iter => std::shared_timed_mutex No contention: 91.687 ns/iter 2 thread contention: 1115.037 ns/iter 3 thread contention: 4183.792 ns/iter 4 thread contention: 15283.491 ns/iter 5 thread contention: 27812.477 ns/iter 6 thread contention: 35028.140 ns/iter => std::shared_mutex No contention: 91.764 ns/iter 2 thread contention: 1051.826 ns/iter 3 thread contention: 5574.720 ns/iter 4 thread contention: 15721.416 ns/iter 5 thread contention: 27721.487 ns/iter 6 thread contention: 35420.404 ns/iter Linux (x64): === Lock test, 6 threads === => NonRecursiveBenaphore No contention: 13.487 ns/iter 2 thread contention: 210.317 ns/iter 3 thread contention: 430.855 ns/iter 4 thread contention: 510.533 ns/iter 5 thread contention: 1003.609 ns/iter 6 thread contention: 1787.683 ns/iter => std::mutex No contention: 12.403 ns/iter 2 thread contention: 157.122 ns/iter 3 thread contention: 186.791 ns/iter 4 thread contention: 265.073 ns/iter 5 thread contention: 283.778 ns/iter 6 thread contention: 270.687 ns/iter => std::shared_timed_mutex No contention: 21.509 ns/iter 2 thread contention: 150.179 ns/iter 3 thread contention: 256.574 ns/iter 4 thread contention: 415.351 ns/iter 5 thread contention: 611.532 ns/iter 6 thread contention: 944.695 ns/iter => std::shared_mutex No contention: 20.805 ns/iter 2 thread contention: 157.034 ns/iter 3 thread contention: 244.025 ns/iter 4 thread contention: 406.269 ns/iter 5 thread contention: 387.985 ns/iter 6 thread contention: 468.550 ns/iter Linux (arm64): === Lock test, 6 threads === => NonRecursiveBenaphore No contention: 20.891 ns/iter 2 thread contention: 211.037 ns/iter 3 thread contention: 409.962 ns/iter 4 thread contention: 657.441 ns/iter 5 thread contention: 828.405 ns/iter 6 thread contention: 1131.827 ns/iter => std::mutex No contention: 50.884 ns/iter 2 thread contention: 103.620 ns/iter 3 thread contention: 332.429 ns/iter 4 thread contention: 620.802 ns/iter 5 thread contention: 783.943 ns/iter 6 thread contention: 834.002 ns/iter => std::shared_timed_mutex No contention: 64.948 ns/iter 2 thread contention: 173.191 ns/iter 3 thread contention: 490.352 ns/iter 4 thread contention: 660.668 ns/iter 5 thread contention: 1014.546 ns/iter 6 thread contention: 1451.553 ns/iter => std::shared_mutex No contention: 64.521 ns/iter 2 thread contention: 195.222 ns/iter 3 thread contention: 490.819 ns/iter 4 thread contention: 654.786 ns/iter 5 thread contention: 955.759 ns/iter 6 thread contention: 1282.544 ns/iter
2018-07-13 22:39:01 +00:00
TracyMutex m_deferredLock;
2018-07-11 10:14:28 +00:00
FastVector<QueueItem> m_deferredQueue;
2018-07-10 19:50:00 +00:00
#endif
2019-02-21 20:59:02 +00:00
#ifdef TRACY_HAS_SYSTIME
void ProcessSysTime();
SysTime m_sysTime;
uint64_t m_sysTimeLast = 0;
#else
void ProcessSysTime() {}
#endif
ParameterCallback m_paramCallback;
2017-09-10 15:43:56 +00:00
};
};
#endif