diff --git a/README.md b/README.md index 10e4b969..9955da42 100644 --- a/README.md +++ b/README.md @@ -102,11 +102,11 @@ Even if tracy is disabled, you still have to pay the no-op function call cost. T #### GPU profiling -Tracy provides bindings for profiling OpenGL execution time on GPU. To use it, you will need to include the `tracy/TracyOpenGL.hpp` header file and create an instance of `tracy::GpuCtx<>` class for each of your rendering contexts (typically you will only have one context). You need to keep track using the proper context yourself. The template parameter specifies the number of query events you anticipate to happen between event creation and collection. Each GPU zone requires two events and in some cases collection may happen after a couple frames (e.g. 5) have been submitted. +Tracy provides bindings for profiling OpenGL execution time on the GPU. To use it, you will need to include the `tracy/TracyOpenGL.hpp` header file and declare each of your rendering contexts using the `TracyGpuContext` macro (typically you will only have one context). Tracy expects no more than one context per thread and no context migration. -To mark GPU zone use the `TracyGpuZone( ctx, name )` macro, where `ctx` is a pointer to the `tracy::GpuCtx<>` class instance you have created and `name` is a string literal name of the zone. Alternatively you may use `TracyGpuZoneC( ctx, name, color )` to specify zone color. +To mark a GPU zone, use the `TracyGpuZone( name )` macro, where `name` is a string literal name of the zone. Alternatively, you may use `TracyGpuZoneC( name, color )` to specify the zone color. -You also need to periodically call the `Collect()` method of the `tracy::GpuCtx<>` class. A good place to do it is after swap buffers function call. +You also need to periodically collect the GPU events using the `TracyGpuCollect` macro. A good place to do it is after the swap buffers function call. 
## Good practices diff --git a/TracyOpenGL.hpp b/TracyOpenGL.hpp index 605b8bae..14726181 100644 --- a/TracyOpenGL.hpp +++ b/TracyOpenGL.hpp @@ -5,8 +5,10 @@ #ifndef TRACY_ENABLE -#define TracyGpuZone(x,y) -#define TracyGpuZoneC(x,y,z) +#define TracyGpuContext +#define TracyGpuZone(x) +#define TracyGpuZoneC(x,y) +#define TracyGpuCollect namespace tracy { @@ -26,71 +28,23 @@ public: #include "Tracy.hpp" #include "client/TracyProfiler.hpp" +#include "common/TracyAlloc.hpp" -#define TracyGpuZone( ctx, name ) static const tracy::SourceLocation __tracy_gpu_source_location { __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; auto ___tracy_gpu_zone = tracy::detail::__GpuHelper( ctx, name, &__tracy_gpu_source_location ); -#define TracyGpuZoneC( ctx, name, color ) static const tracy::SourceLocation __tracy_gpu_source_location { __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; auto ___tracy_gpu_zone = tracy::detail::__GpuHelper( ctx, name, &__tracy_gpu_source_location ); +#define TracyGpuContext tracy::s_gpuCtx = (tracy::GpuCtx*)tracy::tracy_malloc( sizeof( tracy::GpuCtx ) ); new(tracy::s_gpuCtx) tracy::GpuCtx; +#define TracyGpuZone( name ) static const tracy::SourceLocation __tracy_gpu_source_location { __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::GpuCtxScope ___tracy_gpu_zone( name, &__tracy_gpu_source_location ); +#define TracyGpuZoneC( name, color ) static const tracy::SourceLocation __tracy_gpu_source_location { __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::GpuCtxScope ___tracy_gpu_zone( name, &__tracy_gpu_source_location ); +#define TracyGpuCollect tracy::s_gpuCtx->Collect(); namespace tracy { extern std::atomic s_gpuCtxCounter; -template class GpuCtx; - -template -class __GpuCtxScope -{ -public: - tracy_force_inline __GpuCtxScope( GpuCtx& ctx, const char* name, const SourceLocation* srcloc ) - : m_ctx( ctx ) - { - glQueryCounter( m_ctx.NextQueryId(), GL_TIMESTAMP ); - - Magic magic; - auto& token = s_token.ptr; - auto& tail = token->get_tail_index(); - auto item = token->enqueue_begin( magic ); 
- item->hdr.type = QueueType::GpuZoneBegin; - item->gpuZoneBegin.cpuTime = Profiler::GetTime(); - item->gpuZoneBegin.name = (uint64_t)name; - item->gpuZoneBegin.srcloc = (uint64_t)srcloc; - item->gpuZoneBegin.context = m_ctx.GetId(); - tail.store( magic + 1, std::memory_order_release ); - } - - tracy_force_inline ~__GpuCtxScope() - { - glQueryCounter( m_ctx.NextQueryId(), GL_TIMESTAMP ); - - Magic magic; - auto& token = s_token.ptr; - auto& tail = token->get_tail_index(); - auto item = token->enqueue_begin( magic ); - item->hdr.type = QueueType::GpuZoneEnd; - item->gpuZoneEnd.cpuTime = Profiler::GetTime(); - item->gpuZoneEnd.thread = GetThreadHandle(); - item->gpuZoneEnd.context = m_ctx.GetId(); - tail.store( magic + 1, std::memory_order_release ); - } - -private: - GpuCtx& m_ctx; -}; - -namespace detail -{ -template -static tracy_force_inline __GpuCtxScope __GpuHelper( GpuCtx* ctx, const char* name, const SourceLocation* srcloc ) -{ - return ctx->SpawnZone( name, srcloc ); -} -} - -template class GpuCtx { - friend class __GpuCtxScope; - friend __GpuCtxScope detail::__GpuHelper( GpuCtx* ctx, const char* name, const SourceLocation* srcloc ); + friend class GpuCtxScope; + + enum { QueryCount = 1024 }; public: GpuCtx() @@ -98,7 +52,7 @@ public: , m_head( 0 ) , m_tail( 0 ) { - glGenQueries( Num, m_query ); + glGenQueries( QueryCount, m_query ); int64_t tgpu; glGetInteger64v( GL_TIMESTAMP, &tgpu ); @@ -111,6 +65,7 @@ public: item->hdr.type = QueueType::GpuNewContext; item->gpuNewContext.cputime = tcpu; item->gpuNewContext.gputime = tgpu; + item->gpuNewContext.thread = GetThreadHandle(); item->gpuNewContext.context = m_context; tail.store( magic + 1, std::memory_order_release ); } @@ -120,13 +75,13 @@ public: ZoneScopedC( 0x881111 ); auto start = m_tail; - auto end = m_head + Num; - auto cnt = ( end - start ) % Num; + auto end = m_head + QueryCount; + auto cnt = ( end - start ) % QueryCount; while( cnt > 1 ) { auto mid = start + cnt / 2; GLint available; - 
glGetQueryObjectiv( m_query[mid % Num], GL_QUERY_RESULT_AVAILABLE, &available ); + glGetQueryObjectiv( m_query[mid % QueryCount], GL_QUERY_RESULT_AVAILABLE, &available ); if( available ) { start = mid; @@ -135,10 +90,10 @@ public: { end = mid; } - cnt = ( end - start ) % Num; + cnt = ( end - start ) % QueryCount; } - start %= Num; + start %= QueryCount; while( m_tail != start ) { @@ -153,20 +108,15 @@ public: item->gpuTime.gpuTime = (int64_t)time; item->gpuTime.context = m_context; tail.store( magic + 1, std::memory_order_release ); - m_tail = ( m_tail + 1 ) % Num; + m_tail = ( m_tail + 1 ) % QueryCount; } } private: - tracy_force_inline __GpuCtxScope SpawnZone( const char* name, const SourceLocation* srcloc ) - { - return __GpuCtxScope( *this, name, srcloc ); - } - tracy_force_inline unsigned int NextQueryId() { const auto id = m_head; - m_head = ( m_head + 1 ) % Num; + m_head = ( m_head + 1 ) % QueryCount; assert( m_head != m_tail ); return m_query[id]; } @@ -176,13 +126,49 @@ private: return m_context; } - unsigned int m_query[Num]; + unsigned int m_query[QueryCount]; uint16_t m_context; unsigned int m_head; unsigned int m_tail; }; +extern thread_local GpuCtx* s_gpuCtx; + +class GpuCtxScope +{ +public: + tracy_force_inline GpuCtxScope( const char* name, const SourceLocation* srcloc ) + { + glQueryCounter( s_gpuCtx->NextQueryId(), GL_TIMESTAMP ); + + Magic magic; + auto& token = s_token.ptr; + auto& tail = token->get_tail_index(); + auto item = token->enqueue_begin( magic ); + item->hdr.type = QueueType::GpuZoneBegin; + item->gpuZoneBegin.cpuTime = Profiler::GetTime(); + item->gpuZoneBegin.name = (uint64_t)name; + item->gpuZoneBegin.srcloc = (uint64_t)srcloc; + item->gpuZoneBegin.context = s_gpuCtx->GetId(); + tail.store( magic + 1, std::memory_order_release ); + } + + tracy_force_inline ~GpuCtxScope() + { + glQueryCounter( s_gpuCtx->NextQueryId(), GL_TIMESTAMP ); + + Magic magic; + auto& token = s_token.ptr; + auto& tail = token->get_tail_index(); + auto item = 
token->enqueue_begin( magic ); + item->hdr.type = QueueType::GpuZoneEnd; + item->gpuZoneEnd.cpuTime = Profiler::GetTime(); + item->gpuZoneEnd.context = s_gpuCtx->GetId(); + tail.store( magic + 1, std::memory_order_release ); + } +}; + } #endif diff --git a/client/TracyProfiler.cpp b/client/TracyProfiler.cpp index 85ae0d0d..b451a4bb 100644 --- a/client/TracyProfiler.cpp +++ b/client/TracyProfiler.cpp @@ -100,6 +100,9 @@ moodycamel::ConcurrentQueue init_order(103) s_queue( QueuePrealloc ); std::atomic init_order(104) s_lockCounter( 0 ); std::atomic init_order(104) s_gpuCtxCounter( 0 ); +class GpuCtx; +thread_local GpuCtx* init_order(104) s_gpuCtx = nullptr; + #ifdef TRACY_COLLECT_THREAD_NAMES struct ThreadNameData; std::atomic init_order(104) s_threadNameData( nullptr ); diff --git a/common/TracyQueue.hpp b/common/TracyQueue.hpp index bfc7f4e8..bac349dd 100644 --- a/common/TracyQueue.hpp +++ b/common/TracyQueue.hpp @@ -146,6 +146,7 @@ struct QueueGpuNewContext { int64_t cputime; int64_t gputime; + uint64_t thread; uint16_t context; }; @@ -160,7 +161,6 @@ struct QueueGpuZoneBegin struct QueueGpuZoneEnd { int64_t cpuTime; - uint64_t thread; uint16_t context; }; diff --git a/server/TracyEvent.hpp b/server/TracyEvent.hpp index d6246e46..dedbf717 100644 --- a/server/TracyEvent.hpp +++ b/server/TracyEvent.hpp @@ -103,7 +103,6 @@ struct GpuEvent int64_t gpuEnd; int32_t srcloc; uint64_t name; - uint64_t thread; Vector child; }; @@ -131,6 +130,7 @@ struct ThreadData struct GpuCtxData { int64_t timeDiff; + uint64_t thread; Vector timeline; Vector stack; Vector queue; diff --git a/server/TracyView.cpp b/server/TracyView.cpp index c593461b..d6a88817 100644 --- a/server/TracyView.cpp +++ b/server/TracyView.cpp @@ -358,6 +358,7 @@ View::View( FileRead& f ) for( uint64_t i=0; i(); + f.Read( &ctx->thread, sizeof( ctx->thread ) ); ReadTimeline( f, ctx->timeline ); ctx->showFull = true; m_gpuData.push_back( ctx ); @@ -881,6 +882,7 @@ void View::ProcessGpuNewContext( const 
QueueGpuNewContext& ev ) assert( ev.context == m_gpuData.size() ); auto gpu = m_slab.AllocInit(); gpu->timeDiff = int64_t( ev.cputime * m_timerMul - ev.gputime ); + gpu->thread = ev.thread; gpu->showFull = true; std::lock_guard lock( m_lock ); m_gpuData.push_back( gpu ); @@ -901,7 +903,6 @@ void View::ProcessGpuZoneBegin( const QueueGpuZoneBegin& ev ) zone->gpuEnd = -1; zone->name = ev.name; zone->srcloc = ShrinkSourceLocation( ev.srcloc ); - zone->thread = 0; auto timeline = &ctx->timeline; if( !ctx->stack.empty() ) @@ -929,7 +930,6 @@ void View::ProcessGpuZoneEnd( const QueueGpuZoneEnd& ev ) std::lock_guard lock( m_lock ); zone->cpuEnd = ev.cpuTime * m_timerMul; - zone->thread = ev.thread; } void View::ProcessGpuTime( const QueueGpuTime& ev ) @@ -2063,7 +2063,7 @@ void View::DrawZones() offset += ostep; if( v->showFull ) { - const auto depth = DrawGpuZoneLevel( v->timeline, hover, pxns, wpos, offset, 0 ); + const auto depth = DrawGpuZoneLevel( v->timeline, hover, pxns, wpos, offset, 0, v->thread ); offset += ostep * depth; } offset += ostep * 0.2f; @@ -2125,7 +2125,7 @@ void View::DrawZones() draw->AddRectFilled( wpos + ImVec2( 0, offset ), wpos + ImVec2( ty + txtsz.x + 4, offset + ty ), 0x448888DD ); draw->AddRect( wpos + ImVec2( 0, offset ), wpos + ImVec2( ty + txtsz.x + 4, offset + ty ), 0x888888DD ); } - if( m_gpuInfoWindow && m_gpuInfoWindow->thread == v->id ) + if( m_gpuInfoWindow && m_gpuInfoWindowThread == v->id ) { draw->AddRectFilled( wpos + ImVec2( 0, offset ), wpos + ImVec2( ty + txtsz.x + 4, offset + ty ), 0x4488DD88 ); draw->AddRect( wpos + ImVec2( 0, offset ), wpos + ImVec2( ty + txtsz.x + 4, offset + ty ), 0x8888DD88 ); @@ -2406,7 +2406,7 @@ int View::DrawZoneLevel( const Vector& vec, bool hover, double pxns, return maxdepth; } -int View::DrawGpuZoneLevel( const Vector& vec, bool hover, double pxns, const ImVec2& wpos, int _offset, int depth ) +int View::DrawGpuZoneLevel( const Vector& vec, bool hover, double pxns, const ImVec2& wpos, int _offset, 
int depth, uint64_t thread ) { // cast to uint64_t, so that unended zones (end = -1) are still drawn auto it = std::lower_bound( vec.begin(), vec.end(), m_zvStart - m_delay, [] ( const auto& l, const auto& r ) { return (uint64_t)l->gpuEnd < (uint64_t)r; } ); @@ -2478,9 +2478,10 @@ int View::DrawGpuZoneLevel( const Vector& vec, bool hover, double pxn { m_zoneInfoWindow = nullptr; m_gpuInfoWindow = &ev; + m_gpuInfoWindowThread = thread; } - m_gpuThread = ev.thread; + m_gpuThread = thread; m_gpuStart = ev.cpuStart; m_gpuEnd = ev.cpuEnd; } @@ -2498,7 +2499,7 @@ int View::DrawGpuZoneLevel( const Vector& vec, bool hover, double pxn { if( !ev.child.empty() ) { - const auto d = DrawGpuZoneLevel( ev.child, hover, pxns, wpos, _offset, depth ); + const auto d = DrawGpuZoneLevel( ev.child, hover, pxns, wpos, _offset, depth, thread ); if( d > maxdepth ) maxdepth = d; } @@ -2559,9 +2560,10 @@ int View::DrawGpuZoneLevel( const Vector& vec, bool hover, double pxn { m_zoneInfoWindow = nullptr; m_gpuInfoWindow = &ev; + m_gpuInfoWindowThread = thread; } - m_gpuThread = ev.thread; + m_gpuThread = thread; m_gpuStart = ev.cpuStart; m_gpuEnd = ev.cpuEnd; } @@ -3839,6 +3841,7 @@ void View::Write( FileWrite& f ) f.Write( &sz, sizeof( sz ) ); for( auto& ctx : m_gpuData ) { + f.Write( &ctx->thread, sizeof( ctx->thread ) ); WriteTimeline( f, ctx->timeline ); } @@ -3888,7 +3891,6 @@ void View::WriteTimeline( FileWrite& f, const Vector& vec ) f.Write( &v->gpuEnd, sizeof( v->gpuEnd ) ); f.Write( &v->srcloc, sizeof( v->srcloc ) ); f.Write( &v->name, sizeof( v->name ) ); - f.Write( &v->thread, sizeof( v->thread ) ); WriteTimeline( f, v->child ); } } @@ -3932,7 +3934,6 @@ void View::ReadTimeline( FileRead& f, Vector& vec ) f.Read( &zone->gpuEnd, sizeof( zone->gpuEnd ) ); f.Read( &zone->srcloc, sizeof( zone->srcloc ) ); f.Read( &zone->name, sizeof( zone->name ) ); - f.Read( &zone->thread, sizeof( zone->thread ) ); ReadTimeline( f, zone->child ); } } diff --git a/server/TracyView.hpp 
b/server/TracyView.hpp index 670007f1..9b86b2b9 100644 --- a/server/TracyView.hpp +++ b/server/TracyView.hpp @@ -122,7 +122,7 @@ private: bool DrawZoneFrames(); void DrawZones(); int DrawZoneLevel( const Vector& vec, bool hover, double pxns, const ImVec2& wpos, int offset, int depth ); - int DrawGpuZoneLevel( const Vector& vec, bool hover, double pxns, const ImVec2& wpos, int offset, int depth ); + int DrawGpuZoneLevel( const Vector& vec, bool hover, double pxns, const ImVec2& wpos, int offset, int depth, uint64_t thread ); int DrawLocks( uint64_t tid, bool hover, double pxns, const ImVec2& wpos, int offset, LockHighlight& highlight ); int DrawPlots( int offset, double pxns, const ImVec2& wpos, bool hover ); void DrawPlotPoint( const ImVec2& wpos, float x, float y, int offset, uint32_t color, bool hover, bool hasPrev, double val, double prev, bool merged ); @@ -236,6 +236,7 @@ private: const MessageData* m_msgHighlight; const GpuEvent* m_gpuInfoWindow; const GpuEvent* m_gpuHighlight; + uint64_t m_gpuInfoWindowThread; bool m_drawRegion; int64_t m_regionStart;