Simplify GPU context handling.

2024-09-20 05:42:18 +00:00 · 2017-11-14 00:48:26 +01:00 · 2017-11-14 00:48:26 +01:00 · 5c872b2137
commit 5c872b2137
parent d56f44a220
7 changed files with 77 additions and 86 deletions
--- a/README.md
+++ b/README.md
@ -102,11 +102,11 @@ Even if tracy is disabled, you still have to pay the no-op function call cost. T

 #### GPU profiling

-Tracy provides bindings for profiling OpenGL execution time on GPU. To use it, you will need to include the `tracy/TracyOpenGL.hpp` header file and create an instance of `tracy::GpuCtx<>` class for each of your rendering contexts (typically you will only have one context). You need to keep track using the proper context yourself. The template parameter specifies the number of query events you anticipate to happen between event creation and collection. Each GPU zone requires two events and in some cases collection may happen after a couple frames (e.g. 5) have been submitted.
+Tracy provides bindings for profiling OpenGL execution time on the GPU. To use them, include the `tracy/TracyOpenGL.hpp` header file and declare each of your rendering contexts using the `TracyGpuContext` macro (typically you will only have one context). Tracy expects no more than one context per thread and expects contexts not to migrate between threads.

-To mark GPU zone use the `TracyGpuZone( ctx, name )` macro, where `ctx` is a pointer to the `tracy::GpuCtx<>` class instance you have created and `name` is a string literal name of the zone. Alternatively you may use `TracyGpuZoneC( ctx, name, color )` to specify zone color.
+To mark a GPU zone, use the `TracyGpuZone( name )` macro, where `name` is a string literal naming the zone. Alternatively, you may use `TracyGpuZoneC( name, color )` to also specify the zone color.

-You also need to periodically call the `Collect()` method of the `tracy::GpuCtx<>` class. A good place to do it is after swap buffers function call.
+You also need to periodically collect the GPU events using the `TracyGpuCollect` macro. A good place to do this is right after the swap buffers function call.

 ## Good practices

--- a/TracyOpenGL.hpp
+++ b/TracyOpenGL.hpp
@ -5,8 +5,10 @@

 #ifndef TRACY_ENABLE

+#define TracyGpuContext
 #define TracyGpuZone(x,y)
 #define TracyGpuZoneC(x,y,z)
+#define TracyGpuCollect

 namespace tracy
 {
@ -26,71 +28,23 @@ public:

 #include "Tracy.hpp"
 #include "client/TracyProfiler.hpp"
+#include "common/TracyAlloc.hpp"

-#define TracyGpuZone( ctx, name ) static const tracy::SourceLocation __tracy_gpu_source_location { __FUNCTION__,  __FILE__, (uint32_t)__LINE__, 0 }; auto ___tracy_gpu_zone = tracy::detail::__GpuHelper( ctx, name, &__tracy_gpu_source_location );
-#define TracyGpuZoneC( ctx, name, color ) static const tracy::SourceLocation __tracy_gpu_source_location { __FUNCTION__,  __FILE__, (uint32_t)__LINE__, color }; auto ___tracy_gpu_zone = tracy::detail::__GpuHelper( ctx, name, &__tracy_gpu_source_location );
+#define TracyGpuContext tracy::s_gpuCtx = (tracy::GpuCtx*)tracy::tracy_malloc( sizeof( tracy::GpuCtx ) ); new(tracy::s_gpuCtx) tracy::GpuCtx;
+#define TracyGpuZone( name ) static const tracy::SourceLocation __tracy_gpu_source_location { __FUNCTION__,  __FILE__, (uint32_t)__LINE__, 0 }; tracy::GpuCtxScope ___tracy_gpu_zone( name, &__tracy_gpu_source_location );
+#define TracyGpuZoneC( name, color ) static const tracy::SourceLocation __tracy_gpu_source_location { __FUNCTION__,  __FILE__, (uint32_t)__LINE__, color }; tracy::GpuCtxScope ___tracy_gpu_zone( name, &__tracy_gpu_source_location );
+#define TracyGpuCollect tracy::s_gpuCtx->Collect();

 namespace tracy
 {

 extern std::atomic<uint16_t> s_gpuCtxCounter;

-template<int Num> class GpuCtx;
-
-template<int Num>
-class __GpuCtxScope
-{
-public:
-    tracy_force_inline __GpuCtxScope( GpuCtx<Num>& ctx, const char* name, const SourceLocation* srcloc )
-        : m_ctx( ctx )
-    {
-        glQueryCounter( m_ctx.NextQueryId(), GL_TIMESTAMP );
-
-        Magic magic;
-        auto& token = s_token.ptr;
-        auto& tail = token->get_tail_index();
-        auto item = token->enqueue_begin<moodycamel::CanAlloc>( magic );
-        item->hdr.type = QueueType::GpuZoneBegin;
-        item->gpuZoneBegin.cpuTime = Profiler::GetTime();
-        item->gpuZoneBegin.name = (uint64_t)name;
-        item->gpuZoneBegin.srcloc = (uint64_t)srcloc;
-        item->gpuZoneBegin.context = m_ctx.GetId();
-        tail.store( magic + 1, std::memory_order_release );
-    }
-
-    tracy_force_inline ~__GpuCtxScope()
-    {
-        glQueryCounter( m_ctx.NextQueryId(), GL_TIMESTAMP );
-
-        Magic magic;
-        auto& token = s_token.ptr;
-        auto& tail = token->get_tail_index();
-        auto item = token->enqueue_begin<moodycamel::CanAlloc>( magic );
-        item->hdr.type = QueueType::GpuZoneEnd;
-        item->gpuZoneEnd.cpuTime = Profiler::GetTime();
-        item->gpuZoneEnd.thread = GetThreadHandle();
-        item->gpuZoneEnd.context = m_ctx.GetId();
-        tail.store( magic + 1, std::memory_order_release );
-    }
-
-private:
-    GpuCtx<Num>& m_ctx;
-};
-
-namespace detail
-{
-template<int Num>
-static tracy_force_inline __GpuCtxScope<Num> __GpuHelper( GpuCtx<Num>* ctx, const char* name, const SourceLocation* srcloc )
-{
-    return ctx->SpawnZone( name, srcloc );
-}
-}
-
-template<int Num>
 class GpuCtx
 {
-    friend class __GpuCtxScope<Num>;
-    friend __GpuCtxScope<Num> detail::__GpuHelper<Num>( GpuCtx<Num>* ctx, const char* name, const SourceLocation* srcloc );
+    friend class GpuCtxScope;
+
+    enum { QueryCount = 1024 };

 public:
    GpuCtx()
@ -98,7 +52,7 @@ public:
        , m_head( 0 )
        , m_tail( 0 )
    {
-        glGenQueries( Num, m_query );
+        glGenQueries( QueryCount, m_query );

        int64_t tgpu;
        glGetInteger64v( GL_TIMESTAMP, &tgpu );
@ -111,6 +65,7 @@ public:
        item->hdr.type = QueueType::GpuNewContext;
        item->gpuNewContext.cputime = tcpu;
        item->gpuNewContext.gputime = tgpu;
+        item->gpuNewContext.thread = GetThreadHandle();
        item->gpuNewContext.context = m_context;
        tail.store( magic + 1, std::memory_order_release );
    }
@ -120,13 +75,13 @@ public:
        ZoneScopedC( 0x881111 );

        auto start = m_tail;
-        auto end = m_head + Num;
-        auto cnt = ( end - start ) % Num;
+        auto end = m_head + QueryCount;
+        auto cnt = ( end - start ) % QueryCount;
        while( cnt > 1 )
        {
            auto mid = start + cnt / 2;
            GLint available;
-            glGetQueryObjectiv( m_query[mid % Num], GL_QUERY_RESULT_AVAILABLE, &available );
+            glGetQueryObjectiv( m_query[mid % QueryCount], GL_QUERY_RESULT_AVAILABLE, &available );
            if( available )
            {
                start = mid;
@ -135,10 +90,10 @@ public:
            {
                end = mid;
            }
-            cnt = ( end - start ) % Num;
+            cnt = ( end - start ) % QueryCount;
        }

-        start %= Num;
+        start %= QueryCount;

        while( m_tail != start )
        {
@ -153,20 +108,15 @@ public:
            item->gpuTime.gpuTime = (int64_t)time;
            item->gpuTime.context = m_context;
            tail.store( magic + 1, std::memory_order_release );
-            m_tail = ( m_tail + 1 ) % Num;
+            m_tail = ( m_tail + 1 ) % QueryCount;
        }
    }

 private:
-    tracy_force_inline __GpuCtxScope<Num> SpawnZone( const char* name, const SourceLocation* srcloc )
-    {
-        return __GpuCtxScope<Num>( *this, name, srcloc );
-    }
-
    tracy_force_inline unsigned int NextQueryId()
    {
        const auto id = m_head;
-        m_head = ( m_head + 1 ) % Num;
+        m_head = ( m_head + 1 ) % QueryCount;
        assert( m_head != m_tail );
        return m_query[id];
    }
@ -176,13 +126,49 @@ private:
        return m_context;
    }

-    unsigned int m_query[Num];
+    unsigned int m_query[QueryCount];
    uint16_t m_context;

    unsigned int m_head;
    unsigned int m_tail;
 };

+extern thread_local GpuCtx* s_gpuCtx;
+
+class GpuCtxScope
+{
+public:
+    tracy_force_inline GpuCtxScope( const char* name, const SourceLocation* srcloc )
+    {
+        glQueryCounter( s_gpuCtx->NextQueryId(), GL_TIMESTAMP );
+
+        Magic magic;
+        auto& token = s_token.ptr;
+        auto& tail = token->get_tail_index();
+        auto item = token->enqueue_begin<moodycamel::CanAlloc>( magic );
+        item->hdr.type = QueueType::GpuZoneBegin;
+        item->gpuZoneBegin.cpuTime = Profiler::GetTime();
+        item->gpuZoneBegin.name = (uint64_t)name;
+        item->gpuZoneBegin.srcloc = (uint64_t)srcloc;
+        item->gpuZoneBegin.context = s_gpuCtx->GetId();
+        tail.store( magic + 1, std::memory_order_release );
+    }
+
+    tracy_force_inline ~GpuCtxScope()
+    {
+        glQueryCounter( s_gpuCtx->NextQueryId(), GL_TIMESTAMP );
+
+        Magic magic;
+        auto& token = s_token.ptr;
+        auto& tail = token->get_tail_index();
+        auto item = token->enqueue_begin<moodycamel::CanAlloc>( magic );
+        item->hdr.type = QueueType::GpuZoneEnd;
+        item->gpuZoneEnd.cpuTime = Profiler::GetTime();
+        item->gpuZoneEnd.context = s_gpuCtx->GetId();
+        tail.store( magic + 1, std::memory_order_release );
+    }
+};
+
 }

 #endif
--- a/client/TracyProfiler.cpp
+++ b/client/TracyProfiler.cpp
@ -100,6 +100,9 @@ moodycamel::ConcurrentQueue<QueueItem> init_order(103) s_queue( QueuePrealloc );
 std::atomic<uint32_t> init_order(104) s_lockCounter( 0 );
 std::atomic<uint16_t> init_order(104) s_gpuCtxCounter( 0 );

+class GpuCtx;
+thread_local GpuCtx* init_order(104) s_gpuCtx = nullptr;
+
 #ifdef TRACY_COLLECT_THREAD_NAMES
 struct ThreadNameData;
 std::atomic<ThreadNameData*> init_order(104) s_threadNameData( nullptr );
--- a/common/TracyQueue.hpp
+++ b/common/TracyQueue.hpp
@ -146,6 +146,7 @@ struct QueueGpuNewContext
 {
    int64_t cputime;
    int64_t gputime;
+    uint64_t thread;
    uint16_t context;
 };

@ -160,7 +161,6 @@ struct QueueGpuZoneBegin
 struct QueueGpuZoneEnd
 {
    int64_t cpuTime;
-    uint64_t thread;
    uint16_t context;
 };

--- a/server/TracyEvent.hpp
+++ b/server/TracyEvent.hpp
@ -103,7 +103,6 @@ struct GpuEvent
    int64_t gpuEnd;
    int32_t srcloc;
    uint64_t name;
-    uint64_t thread;

    Vector<GpuEvent*> child;
 };
@ -131,6 +130,7 @@ struct ThreadData
 struct GpuCtxData
 {
    int64_t timeDiff;
+    uint64_t thread;
    Vector<GpuEvent*> timeline;
    Vector<GpuEvent*> stack;
    Vector<GpuEvent*> queue;
--- a/server/TracyView.cpp
+++ b/server/TracyView.cpp
@ -358,6 +358,7 @@ View::View( FileRead& f )
    for( uint64_t i=0; i<sz; i++ )
    {
        auto ctx = m_slab.AllocInit<GpuCtxData>();
+        f.Read( &ctx->thread, sizeof( ctx->thread ) );
        ReadTimeline( f, ctx->timeline );
        ctx->showFull = true;
        m_gpuData.push_back( ctx );
@ -881,6 +882,7 @@ void View::ProcessGpuNewContext( const QueueGpuNewContext& ev )
    assert( ev.context == m_gpuData.size() );
    auto gpu = m_slab.AllocInit<GpuCtxData>();
    gpu->timeDiff = int64_t( ev.cputime * m_timerMul - ev.gputime );
+    gpu->thread = ev.thread;
    gpu->showFull = true;
    std::lock_guard<std::mutex> lock( m_lock );
    m_gpuData.push_back( gpu );
@ -901,7 +903,6 @@ void View::ProcessGpuZoneBegin( const QueueGpuZoneBegin& ev )
    zone->gpuEnd = -1;
    zone->name = ev.name;
    zone->srcloc = ShrinkSourceLocation( ev.srcloc );
-    zone->thread = 0;

    auto timeline = &ctx->timeline;
    if( !ctx->stack.empty() )
@ -929,7 +930,6 @@ void View::ProcessGpuZoneEnd( const QueueGpuZoneEnd& ev )

    std::lock_guard<std::mutex> lock( m_lock );
    zone->cpuEnd = ev.cpuTime * m_timerMul;
-    zone->thread = ev.thread;
 }

 void View::ProcessGpuTime( const QueueGpuTime& ev )
@ -2063,7 +2063,7 @@ void View::DrawZones()
            offset += ostep;
            if( v->showFull )
            {
-                const auto depth = DrawGpuZoneLevel( v->timeline, hover, pxns, wpos, offset, 0 );
+                const auto depth = DrawGpuZoneLevel( v->timeline, hover, pxns, wpos, offset, 0, v->thread );
                offset += ostep * depth;
            }
            offset += ostep * 0.2f;
@ -2125,7 +2125,7 @@ void View::DrawZones()
            draw->AddRectFilled( wpos + ImVec2( 0, offset ), wpos + ImVec2( ty + txtsz.x + 4, offset + ty ), 0x448888DD );
            draw->AddRect( wpos + ImVec2( 0, offset ), wpos + ImVec2( ty + txtsz.x + 4, offset + ty ), 0x888888DD );
        }
-        if( m_gpuInfoWindow && m_gpuInfoWindow->thread == v->id )
+        if( m_gpuInfoWindow && m_gpuInfoWindowThread == v->id )
        {
            draw->AddRectFilled( wpos + ImVec2( 0, offset ), wpos + ImVec2( ty + txtsz.x + 4, offset + ty ), 0x4488DD88 );
            draw->AddRect( wpos + ImVec2( 0, offset ), wpos + ImVec2( ty + txtsz.x + 4, offset + ty ), 0x8888DD88 );
@ -2406,7 +2406,7 @@ int View::DrawZoneLevel( const Vector<ZoneEvent*>& vec, bool hover, double pxns,
    return maxdepth;
 }

-int View::DrawGpuZoneLevel( const Vector<GpuEvent*>& vec, bool hover, double pxns, const ImVec2& wpos, int _offset, int depth )
+int View::DrawGpuZoneLevel( const Vector<GpuEvent*>& vec, bool hover, double pxns, const ImVec2& wpos, int _offset, int depth, uint64_t thread )
 {
    // cast to uint64_t, so that unended zones (end = -1) are still drawn
    auto it = std::lower_bound( vec.begin(), vec.end(), m_zvStart - m_delay, [] ( const auto& l, const auto& r ) { return (uint64_t)l->gpuEnd < (uint64_t)r; } );
@ -2478,9 +2478,10 @@ int View::DrawGpuZoneLevel( const Vector<GpuEvent*>& vec, bool hover, double pxn
                    {
                        m_zoneInfoWindow = nullptr;
                        m_gpuInfoWindow = &ev;
+                        m_gpuInfoWindowThread = thread;
                    }

-                    m_gpuThread = ev.thread;
+                    m_gpuThread = thread;
                    m_gpuStart = ev.cpuStart;
                    m_gpuEnd = ev.cpuEnd;
                }
@ -2498,7 +2499,7 @@ int View::DrawGpuZoneLevel( const Vector<GpuEvent*>& vec, bool hover, double pxn
        {
            if( !ev.child.empty() )
            {
-                const auto d = DrawGpuZoneLevel( ev.child, hover, pxns, wpos, _offset, depth );
+                const auto d = DrawGpuZoneLevel( ev.child, hover, pxns, wpos, _offset, depth, thread );
                if( d > maxdepth ) maxdepth = d;
            }

@ -2559,9 +2560,10 @@ int View::DrawGpuZoneLevel( const Vector<GpuEvent*>& vec, bool hover, double pxn
                {
                    m_zoneInfoWindow = nullptr;
                    m_gpuInfoWindow = &ev;
+                    m_gpuInfoWindowThread = thread;
                }

-                m_gpuThread = ev.thread;
+                m_gpuThread = thread;
                m_gpuStart = ev.cpuStart;
                m_gpuEnd = ev.cpuEnd;
            }
@ -3839,6 +3841,7 @@ void View::Write( FileWrite& f )
    f.Write( &sz, sizeof( sz ) );
    for( auto& ctx : m_gpuData )
    {
+        f.Write( &ctx->thread, sizeof( ctx->thread ) );
        WriteTimeline( f, ctx->timeline );
    }

@ -3888,7 +3891,6 @@ void View::WriteTimeline( FileWrite& f, const Vector<GpuEvent*>& vec )
        f.Write( &v->gpuEnd, sizeof( v->gpuEnd ) );
        f.Write( &v->srcloc, sizeof( v->srcloc ) );
        f.Write( &v->name, sizeof( v->name ) );
-        f.Write( &v->thread, sizeof( v->thread ) );
        WriteTimeline( f, v->child );
    }
 }
@ -3932,7 +3934,6 @@ void View::ReadTimeline( FileRead& f, Vector<GpuEvent*>& vec )
        f.Read( &zone->gpuEnd, sizeof( zone->gpuEnd ) );
        f.Read( &zone->srcloc, sizeof( zone->srcloc ) );
        f.Read( &zone->name, sizeof( zone->name ) );
-        f.Read( &zone->thread, sizeof( zone->thread ) );
        ReadTimeline( f, zone->child );
    }
 }
--- a/server/TracyView.hpp
+++ b/server/TracyView.hpp
@ -122,7 +122,7 @@ private:
    bool DrawZoneFrames();
    void DrawZones();
    int DrawZoneLevel( const Vector<ZoneEvent*>& vec, bool hover, double pxns, const ImVec2& wpos, int offset, int depth );
-    int DrawGpuZoneLevel( const Vector<GpuEvent*>& vec, bool hover, double pxns, const ImVec2& wpos, int offset, int depth );
+    int DrawGpuZoneLevel( const Vector<GpuEvent*>& vec, bool hover, double pxns, const ImVec2& wpos, int offset, int depth, uint64_t thread );
    int DrawLocks( uint64_t tid, bool hover, double pxns, const ImVec2& wpos, int offset, LockHighlight& highlight );
    int DrawPlots( int offset, double pxns, const ImVec2& wpos, bool hover );
    void DrawPlotPoint( const ImVec2& wpos, float x, float y, int offset, uint32_t color, bool hover, bool hasPrev, double val, double prev, bool merged );
@ -236,6 +236,7 @@ private:
    const MessageData* m_msgHighlight;
    const GpuEvent* m_gpuInfoWindow;
    const GpuEvent* m_gpuHighlight;
+    uint64_t m_gpuInfoWindowThread;

    bool m_drawRegion;
    int64_t m_regionStart;