CPU-side GPU event transfer.

2024-11-22 22:44:34 +00:00 · 2017-11-11 21:09:48 +01:00 · 2017-11-11 21:09:48 +01:00 · 6fcdb924e8
commit 6fcdb924e8
parent b208df8829
5 changed files with 174 additions and 1 deletions
--- a/TracyOpenGL.hpp
+++ b/TracyOpenGL.hpp
@ -7,17 +7,76 @@

 #include "client/TracyProfiler.hpp"

+#define TracyGpuZone( ctx, name ) static const tracy::SourceLocation __tracy_gpu_source_location { __FUNCTION__,  __FILE__, (uint32_t)__LINE__, 0 }; auto ___tracy_gpu_zone = tracy::detail::__GpuHelper( ctx, name, &__tracy_gpu_source_location ); // Opens a GPU zone that ends when the enclosing scope exits. Expands to two declarations with fixed names, so use it at most once per scope and never as an unbraced if/else body.
+#define TracyGpuZoneC( ctx, name, color ) static const tracy::SourceLocation __tracy_gpu_source_location { __FUNCTION__,  __FILE__, (uint32_t)__LINE__, color }; auto ___tracy_gpu_zone = tracy::detail::__GpuHelper( ctx, name, &__tracy_gpu_source_location ); // Same expansion as TracyGpuZone, but passes `color` as the last SourceLocation initializer instead of 0.
+
 namespace tracy
 {

 extern std::atomic<uint16_t> s_gpuCtxCounter;

+template<int Num> class GpuCtx;
+
+template<int Num>
+class __GpuCtxScope    // Scope object for one GPU zone: the constructor emits GpuZoneBegin, the destructor emits GpuZoneEnd.
+{
+public:
+    tracy_force_inline __GpuCtxScope( GpuCtx<Num>& ctx, const char* name, const SourceLocation* srcloc )
+        : m_ctx( ctx )
+    {
+        glQueryCounter( m_ctx.NextQueryId(), GL_TIMESTAMP );    // request a GL timestamp into the context's next query object
+
+        Magic magic;
+        auto& token = s_token.ptr;
+        auto& tail = token->get_tail_index();
+        auto item = token->enqueue_begin<moodycamel::CanAlloc>( magic );
+        item->hdr.type = QueueType::GpuZoneBegin;
+        item->gpuZoneBegin.cpuTime = Profiler::GetTime();
+        item->gpuZoneBegin.name = (uint64_t)name;    // the pointer value is sent, not the characters
+        item->gpuZoneBegin.srcloc = (uint64_t)srcloc;    // likewise a pointer value
+        item->gpuZoneBegin.context = m_ctx.GetId();
+        tail.store( magic + 1, std::memory_order_release );    // release-store of the advanced tail, done after all item fields are written
+    }
+
+    tracy_force_inline ~__GpuCtxScope()
+    {
+        glQueryCounter( m_ctx.NextQueryId(), GL_TIMESTAMP );    // second timestamp query for this zone, taken at scope exit
+
+        Magic magic;
+        auto& token = s_token.ptr;
+        auto& tail = token->get_tail_index();
+        auto item = token->enqueue_begin<moodycamel::CanAlloc>( magic );
+        item->hdr.type = QueueType::GpuZoneEnd;
+        item->gpuZoneEnd.cpuTime = Profiler::GetTime();
+        item->gpuZoneEnd.thread = GetThreadHandle();    // the thread is reported only on the end item
+        item->gpuZoneEnd.context = m_ctx.GetId();
+        tail.store( magic + 1, std::memory_order_release );    // same release-store as in the constructor
+    }
+
+private:
+    GpuCtx<Num>& m_ctx;    // non-owning reference; the context must outlive this scope object
+};
+
+namespace detail
+{
+template<int Num>
+static tracy_force_inline __GpuCtxScope<Num> __GpuHelper( GpuCtx<Num>* ctx, const char* name, const SourceLocation* srcloc )    // Called by the TracyGpuZone macros; forwards to the private GpuCtx::SpawnZone.
+{
+    return ctx->SpawnZone( name, srcloc );    // ctx is dereferenced without a null check
+}
+}
+
 template<int Num>
 class GpuCtx
 {
+    friend class __GpuCtxScope<Num>;
+    friend __GpuCtxScope<Num> detail::__GpuHelper<Num>( GpuCtx<Num>* ctx, const char* name, const SourceLocation* srcloc );
+
 public:
    GpuCtx()
        : m_context( s_gpuCtxCounter.fetch_add( 1, std::memory_order_relaxed ) )
+        , m_head( 0 )
+        , m_tail( 0 )
    {
        glGenQueries( Num, m_query );

@ -37,8 +96,28 @@ public:
    }

 private:
+    tracy_force_inline __GpuCtxScope<Num> SpawnZone( const char* name, const SourceLocation* srcloc )    // Builds a zone scope object bound to this context.
+    {
+        return __GpuCtxScope<Num>( *this, name, srcloc );
+    }
+
+    tracy_force_inline unsigned int NextQueryId()    // Returns the query object at m_head and advances m_head, wrapping at Num.
+    {
+        const auto id = m_head;
+        m_head = ( m_head + 1 ) % Num;    // NOTE(review): m_tail is not consulted here — confirm a slot's result is read before it is reused
+        return m_query[id];
+    }
+
+    tracy_force_inline uint16_t GetId() const    // Returns the context id taken from s_gpuCtxCounter in the constructor.
+    {
+        return m_context;
+    }
+
    unsigned int m_query[Num];
    uint16_t m_context;
+
+    unsigned int m_head;
+    unsigned int m_tail;
 };

 }
--- a/common/TracyQueue.hpp
+++ b/common/TracyQueue.hpp
@ -30,6 +30,8 @@ enum class QueueType : uint8_t
    Message,
    MessageLiteral,
    GpuNewContext,
+    GpuZoneBegin,
+    GpuZoneEnd,
    NUM_TYPES
 };

@ -146,6 +148,21 @@ struct QueueGpuNewContext
    uint16_t context;
 };

+struct QueueGpuZoneBegin    // Queue payload for QueueType::GpuZoneBegin.
+{
+    int64_t cpuTime;    // Profiler::GetTime() sampled when the zone scope is constructed
+    uint64_t name;    // const char* zone name, carried as an integer
+    uint64_t srcloc;    // const SourceLocation*, carried as an integer
+    uint16_t context;    // id of the GpuCtx that emitted the item
+};
+
+struct QueueGpuZoneEnd    // Queue payload for QueueType::GpuZoneEnd.
+{
+    int64_t cpuTime;    // Profiler::GetTime() sampled when the zone scope is destroyed
+    uint64_t thread;    // GetThreadHandle() of the thread that closed the zone
+    uint16_t context;    // id of the GpuCtx that emitted the item
+};
+
 struct QueueHeader
 {
    union
@ -174,6 +191,8 @@ struct QueueItem
        QueuePlotData plotData;
        QueueMessage message;
        QueueGpuNewContext gpuNewContext;
+        QueueGpuZoneBegin gpuZoneBegin;
+        QueueGpuZoneEnd gpuZoneEnd;
    };
 };

@ -204,6 +223,8 @@ static const size_t QueueDataSize[] = {
    sizeof( QueueHeader ) + sizeof( QueueMessage ),
    sizeof( QueueHeader ) + sizeof( QueueMessage ),         // literal
    sizeof( QueueHeader ) + sizeof( QueueGpuNewContext ),
+    sizeof( QueueHeader ) + sizeof( QueueGpuZoneBegin ),
+    sizeof( QueueHeader ) + sizeof( QueueGpuZoneEnd ),
 };

 static_assert( QueueItemSize == 32, "Queue item size not 32 bytes" );
--- a/server/TracyEvent.hpp
+++ b/server/TracyEvent.hpp
@ -94,6 +94,22 @@ enum { LockEventSize = sizeof( LockEvent ) };
 enum { MaxLockThreads = sizeof( LockEvent::waitList ) * 8 };
 static_assert( std::numeric_limits<decltype(LockEvent::lockCount)>::max() >= MaxLockThreads, "Not enough space for lock count." );

+
+struct GpuEvent    // Server-side record of one GPU zone.
+{
+    int64_t cpuStart;    // cpuTime from the GpuZoneBegin item
+    int64_t cpuEnd;    // -1 until the matching GpuZoneEnd item is processed
+    int64_t gpuStart;    // initialised to int64_t max by ProcessGpuZoneBegin; no other writer is visible in this change
+    int64_t gpuEnd;    // initialised to -1 by ProcessGpuZoneBegin; no other writer is visible in this change
+    int32_t srcloc;    // NOTE(review): 32-bit, but ProcessGpuZoneBegin assigns the 64-bit queue value directly — confirm
+    uint64_t name;    // zone name pointer value as received from the client
+    uint64_t thread;    // 0 until GpuZoneEnd supplies the thread handle
+
+    Vector<GpuEvent*> child;    // zones opened while this one was the innermost open zone
+};
+
+enum { GpuEventSize = sizeof( GpuEvent ) };
+
 #pragma pack()


@ -115,6 +131,9 @@ struct ThreadData
 struct GpuCtxData
 {
    int64_t timeDiff;
+    Vector<GpuEvent*> timeline;
+    Vector<GpuEvent*> stack;
+    Vector<GpuEvent*> queue;
 };

 struct LockMap
--- a/server/TracyView.cpp
+++ b/server/TracyView.cpp
@ -591,6 +591,12 @@ void View::Process( const QueueItem& ev )
    case QueueType::GpuNewContext:
        ProcessGpuNewContext( ev.gpuNewContext );
        break;
+    case QueueType::GpuZoneBegin:
+        ProcessGpuZoneBegin( ev.gpuZoneBegin );
+        break;
+    case QueueType::GpuZoneEnd:
+        ProcessGpuZoneEnd( ev.gpuZoneEnd );
+        break;
    case QueueType::Terminate:
        m_terminate = true;
        break;
@ -850,12 +856,58 @@ void View::ProcessMessageLiteral( const QueueMessage& ev )
 void View::ProcessGpuNewContext( const QueueGpuNewContext& ev )
 {
    assert( ev.context == m_gpuData.size() );
-    auto gpu = m_slab.Alloc<GpuCtxData>();
+    auto gpu = m_slab.AllocInit<GpuCtxData>();
    gpu->timeDiff = int64_t( ev.cputime * m_timerMul - ev.gputime );
    std::lock_guard<std::mutex> lock( m_lock );
    m_gpuData.push_back( gpu );
 }

+void View::ProcessGpuZoneBegin( const QueueGpuZoneBegin& ev )    // Creates a GpuEvent for a zone-begin item and links it into the context's zone tree.
+{
+    assert( m_gpuData.size() > ev.context );    // context indexes m_gpuData below, so it must be strictly less than size()
+    auto ctx = m_gpuData[ev.context];
+
+    CheckString( ev.name );
+    CheckSourceLocation( ev.srcloc );
+
+    auto zone = m_slab.AllocInit<GpuEvent>();
+    zone->cpuStart = ev.cpuTime;
+    zone->cpuEnd = -1;    // placeholder until ProcessGpuZoneEnd sets it
+    zone->gpuStart = std::numeric_limits<int64_t>::max();
+    zone->gpuEnd = -1;
+    zone->name = ev.name;
+    zone->srcloc = ev.srcloc;    // NOTE(review): narrows uint64_t to GpuEvent's int32_t srcloc — confirm this is intended
+    zone->thread = 0;    // placeholder until ProcessGpuZoneEnd sets it
+
+    auto timeline = &ctx->timeline;
+    if( !ctx->stack.empty() )
+    {
+        timeline = &ctx->stack.back()->child;    // nest under the innermost open zone
+    }
+
+    m_lock.lock();    // only the push into the timeline is done under m_lock
+    timeline->push_back( zone );
+    m_lock.unlock();
+
+    ctx->stack.push_back( zone );
+    ctx->queue.push_back( zone );    // NOTE(review): presumably awaits the begin GPU timestamp; the consumer of queue is not visible here
+}
+
+void View::ProcessGpuZoneEnd( const QueueGpuZoneEnd& ev )    // Closes the innermost open zone of the context with the CPU end time and thread.
+{
+    assert( m_gpuData.size() > ev.context );    // context indexes m_gpuData below, so it must be strictly less than size()
+    auto ctx = m_gpuData[ev.context];
+
+    assert( !ctx->stack.empty() );    // an end item with no open zone is a protocol error
+    auto zone = ctx->stack.back();
+    ctx->stack.pop_back();
+    ctx->queue.push_back( zone );    // second queue entry for this zone; the first is pushed by ProcessGpuZoneBegin
+
+    std::lock_guard<std::mutex> lock( m_lock );    // the zone field writes below are done under m_lock
+    zone->cpuEnd = ev.cpuTime;
+    zone->thread = ev.thread;
+}
+
 void View::CheckString( uint64_t ptr )
 {
    if( m_strings.find( ptr ) != m_strings.end() ) return;
--- a/server/TracyView.hpp
+++ b/server/TracyView.hpp
@ -68,6 +68,8 @@ private:
    void ProcessMessage( const QueueMessage& ev );
    void ProcessMessageLiteral( const QueueMessage& ev );
    void ProcessGpuNewContext( const QueueGpuNewContext& ev );
+    void ProcessGpuZoneBegin( const QueueGpuZoneBegin& ev );
+    void ProcessGpuZoneEnd( const QueueGpuZoneEnd& ev );

    void CheckString( uint64_t ptr );
    void CheckThreadString( uint64_t id );