From 6fcdb924e8a9563f5a5c15cc43e00be48abd04fe Mon Sep 17 00:00:00 2001
From: Bartosz Taudul <wolf.pld@gmail.com>
Date: Sat, 11 Nov 2017 21:09:48 +0100
Subject: [PATCH] CPU-side GPU event transfer.

---
 TracyOpenGL.hpp       | 79 +++++++++++++++++++++++++++++++++++++++++++
 common/TracyQueue.hpp | 21 ++++++++++++
 server/TracyEvent.hpp | 19 +++++++++++
 server/TracyView.cpp  | 54 ++++++++++++++++++++++++++++-
 server/TracyView.hpp  |  2 ++
 5 files changed, 174 insertions(+), 1 deletion(-)
diff --git a/TracyOpenGL.hpp b/TracyOpenGL.hpp
index 4c2b99d2..5712db37 100644
--- a/TracyOpenGL.hpp
+++ b/TracyOpenGL.hpp
@@ -7,17 +7,76 @@
 
 #include "client/TracyProfiler.hpp"
 
+// Open a GPU zone that lasts until the end of the enclosing scope; `ctx` is a GpuCtx<Num>* and `color` fills the last SourceLocation field. Both macros declare the same two identifiers, so use at most one per scope.
+#define TracyGpuZone( ctx, name ) static const tracy::SourceLocation __tracy_gpu_source_location { __FUNCTION__,  __FILE__, (uint32_t)__LINE__, 0 }; auto ___tracy_gpu_zone = tracy::detail::__GpuHelper( ctx, name, &__tracy_gpu_source_location );
+#define TracyGpuZoneC( ctx, name, color ) static const tracy::SourceLocation __tracy_gpu_source_location { __FUNCTION__,  __FILE__, (uint32_t)__LINE__, color }; auto ___tracy_gpu_zone = tracy::detail::__GpuHelper( ctx, name, &__tracy_gpu_source_location );
 namespace tracy
 {
 
 extern std::atomic<uint16_t> s_gpuCtxCounter;
 
+template<int Num> class GpuCtx;
+
+// Scoped GPU zone: construction and destruction each issue a GL timestamp query and enqueue the matching begin/end event.
+template<int Num>
+class __GpuCtxScope
+{
+public:
+    tracy_force_inline __GpuCtxScope( GpuCtx<Num>& ctx, const char* name, const SourceLocation* srcloc ) : m_ctx( ctx )
+    {
+        glQueryCounter( m_ctx.NextQueryId(), GL_TIMESTAMP );
+        // Write a GpuZoneBegin item into the slot returned by enqueue_begin, then store the advanced tail index (release order).
+        Magic slot;
+        auto& producer = s_token.ptr;
+        auto& tailIndex = producer->get_tail_index();
+        auto zoneBegin = producer->enqueue_begin<moodycamel::CanAlloc>( slot );
+        zoneBegin->hdr.type = QueueType::GpuZoneBegin;
+        zoneBegin->gpuZoneBegin.cpuTime = Profiler::GetTime();
+        zoneBegin->gpuZoneBegin.name = reinterpret_cast<uint64_t>( name );
+        zoneBegin->gpuZoneBegin.srcloc = reinterpret_cast<uint64_t>( srcloc );
+        zoneBegin->gpuZoneBegin.context = m_ctx.GetId();
+        tailIndex.store( slot + 1, std::memory_order_release );
+    }
+
+    tracy_force_inline ~__GpuCtxScope()
+    {
+        glQueryCounter( m_ctx.NextQueryId(), GL_TIMESTAMP );
+        // Same enqueue sequence as the constructor; the end item carries the calling thread instead of name/srcloc.
+        Magic slot;
+        auto& producer = s_token.ptr;
+        auto& tailIndex = producer->get_tail_index();
+        auto zoneEnd = producer->enqueue_begin<moodycamel::CanAlloc>( slot );
+        zoneEnd->hdr.type = QueueType::GpuZoneEnd;
+        zoneEnd->gpuZoneEnd.cpuTime = Profiler::GetTime();
+        zoneEnd->gpuZoneEnd.thread = GetThreadHandle();
+        zoneEnd->gpuZoneEnd.context = m_ctx.GetId();
+        tailIndex.store( slot + 1, std::memory_order_release );
+    }
+
+private:
+    GpuCtx<Num>& m_ctx;
+};
+
+namespace detail
+{
+template<int Num>
+static tracy_force_inline __GpuCtxScope<Num> __GpuHelper( GpuCtx<Num>* ctx, const char* name, const SourceLocation* srcloc )
+{
+    return ctx->SpawnZone( name, srcloc );   // SpawnZone is private; GpuCtx befriends this helper so the TracyGpuZone* macros can reach it
+}
+}   // namespace detail
+
 template<int Num>
 class GpuCtx
 {
+    friend class __GpuCtxScope<Num>;
+    friend __GpuCtxScope<Num> detail::__GpuHelper<Num>( GpuCtx<Num>* ctx, const char* name, const SourceLocation* srcloc );
+
 public:
     GpuCtx()
         : m_context( s_gpuCtxCounter.fetch_add( 1, std::memory_order_relaxed ) )
+        , m_head( 0 )
+        , m_tail( 0 )
     {
         glGenQueries( Num, m_query );
 
@@ -37,8 +96,28 @@ public:
     }
 
 private:
+    tracy_force_inline __GpuCtxScope<Num> SpawnZone( const char* name, const SourceLocation* srcloc )
+    {
+        return __GpuCtxScope<Num>( *this, name, srcloc );   // the scope's constructor emits the zone-begin event for this context
+    }
+
+    tracy_force_inline unsigned int NextQueryId()
+    {
+        const unsigned int slot = m_head;   // index of the query handed out by this call
+        m_head = ( slot + 1 ) % Num;        // advance the head, wrapping after Num entries
+        return m_query[slot];
+    }
+
+    tracy_force_inline uint16_t GetId() const
+    {
+        return m_context;   // value taken from s_gpuCtxCounter when this context was constructed
+    }
+
     unsigned int m_query[Num];
     uint16_t m_context;
+
+    unsigned int m_head;
+    unsigned int m_tail;
 };
 
 }
diff --git a/common/TracyQueue.hpp b/common/TracyQueue.hpp
index 9d00944f..e272d3e0 100644
--- a/common/TracyQueue.hpp
+++ b/common/TracyQueue.hpp
@@ -30,6 +30,8 @@ enum class QueueType : uint8_t
     Message,
     MessageLiteral,
     GpuNewContext,
+    GpuZoneBegin,
+    GpuZoneEnd,
     NUM_TYPES
 };
 
@@ -146,6 +148,21 @@ struct QueueGpuNewContext
     uint16_t context;
 };
 
+struct QueueGpuZoneBegin    // Payload of a QueueType::GpuZoneBegin item.
+{
+    int64_t cpuTime;        // Profiler::GetTime() taken when the zone was opened
+    uint64_t name;          // pointer to the zone name string, cast to an integer by the sender
+    uint64_t srcloc;        // pointer to the zone's SourceLocation, cast to an integer by the sender
+    uint16_t context;       // id of the GpuCtx that opened the zone
+};
+
+struct QueueGpuZoneEnd      // Payload of a QueueType::GpuZoneEnd item.
+{
+    int64_t cpuTime;        // Profiler::GetTime() taken when the zone was closed
+    uint64_t thread;        // GetThreadHandle() of the thread that closed the zone
+    uint16_t context;       // id of the GpuCtx that owns the zone
+};
+
 struct QueueHeader
 {
     union
@@ -174,6 +191,8 @@ struct QueueItem
         QueuePlotData plotData;
         QueueMessage message;
         QueueGpuNewContext gpuNewContext;
+        QueueGpuZoneBegin gpuZoneBegin;
+        QueueGpuZoneEnd gpuZoneEnd;
     };
 };
 
@@ -204,6 +223,8 @@ static const size_t QueueDataSize[] = {
     sizeof( QueueHeader ) + sizeof( QueueMessage ),
     sizeof( QueueHeader ) + sizeof( QueueMessage ),         // literal
     sizeof( QueueHeader ) + sizeof( QueueGpuNewContext ),
+    sizeof( QueueHeader ) + sizeof( QueueGpuZoneBegin ),
+    sizeof( QueueHeader ) + sizeof( QueueGpuZoneEnd ),
 };
 
 static_assert( QueueItemSize == 32, "Queue item size not 32 bytes" );
diff --git a/server/TracyEvent.hpp b/server/TracyEvent.hpp
index d169587f..69d28fcd 100644
--- a/server/TracyEvent.hpp
+++ b/server/TracyEvent.hpp
@@ -94,6 +94,22 @@ enum { LockEventSize = sizeof( LockEvent ) };
 enum { MaxLockThreads = sizeof( LockEvent::waitList ) * 8 };
 static_assert( std::numeric_limits<decltype(LockEvent::lockCount)>::max() >= MaxLockThreads, "Not enough space for lock count." );
 
+
+struct GpuEvent             // One GPU zone as stored by the server.
+{
+    int64_t cpuStart;       // cpuTime of the GpuZoneBegin event
+    int64_t cpuEnd;         // -1 until the matching GpuZoneEnd arrives
+    int64_t gpuStart;       // initialised to int64 max; not filled in by the code visible in this change
+    int64_t gpuEnd;         // initialised to -1; not filled in by the code visible in this change
+    int32_t srcloc;         // NOTE(review): assigned from the 64-bit QueueGpuZoneBegin::srcloc - confirm the narrowing is intended
+    uint64_t name;          // name value copied from the begin event
+    uint64_t thread;        // 0 until the matching GpuZoneEnd arrives, then that event's thread value
+
+    Vector<GpuEvent*> child;    // zones begun while this one was the innermost open zone
+};
+
+enum { GpuEventSize = sizeof( GpuEvent ) };
+
 #pragma pack()
 
 
@@ -115,6 +131,9 @@ struct ThreadData
 struct GpuCtxData
 {
     int64_t timeDiff;
+    Vector<GpuEvent*> timeline;     // top-level zones (begun while no other zone of this context was open)
+    Vector<GpuEvent*> stack;        // zones begun but not yet ended, innermost last
+    Vector<GpuEvent*> queue;        // each zone is appended once on begin and once on end; the consumer is not part of this change
 };
 
 struct LockMap
diff --git a/server/TracyView.cpp b/server/TracyView.cpp
index cd4284eb..a81608e7 100644
--- a/server/TracyView.cpp
+++ b/server/TracyView.cpp
@@ -591,6 +591,12 @@ void View::Process( const QueueItem& ev )
     case QueueType::GpuNewContext:
         ProcessGpuNewContext( ev.gpuNewContext );
         break;
+    case QueueType::GpuZoneBegin:
+        ProcessGpuZoneBegin( ev.gpuZoneBegin );
+        break;
+    case QueueType::GpuZoneEnd:
+        ProcessGpuZoneEnd( ev.gpuZoneEnd );
+        break;
     case QueueType::Terminate:
         m_terminate = true;
         break;
@@ -850,12 +856,58 @@ void View::ProcessMessageLiteral( const QueueMessage& ev )
 void View::ProcessGpuNewContext( const QueueGpuNewContext& ev )
 {
     assert( ev.context == m_gpuData.size() );
-    auto gpu = m_slab.Alloc<GpuCtxData>();
+    auto gpu = m_slab.AllocInit<GpuCtxData>();    // NOTE(review): presumably AllocInit constructs the object, needed now that GpuCtxData has Vector members - confirm
     gpu->timeDiff = int64_t( ev.cputime * m_timerMul - ev.gputime );
     std::lock_guard<std::mutex> lock( m_lock );
     m_gpuData.push_back( gpu );
 }
 
+// Handles a GpuZoneBegin event: creates the GpuEvent, links it under the innermost open zone of its
+// context (or under the context's top-level timeline when none is open) and marks it as open.
+void View::ProcessGpuZoneBegin( const QueueGpuZoneBegin& ev )
+{
+    // Contexts are appended in id order, so a valid id is strictly below the number of known contexts.
+    assert( ev.context < m_gpuData.size() );
+    auto ctx = m_gpuData[ev.context];
+
+    CheckString( ev.name );
+    CheckSourceLocation( ev.srcloc );
+
+    auto zone = m_slab.AllocInit<GpuEvent>();
+    zone->cpuStart = ev.cpuTime;
+    zone->cpuEnd = -1;
+    zone->gpuStart = std::numeric_limits<int64_t>::max();
+    zone->gpuEnd = -1;
+    zone->name = ev.name;
+    zone->srcloc = ev.srcloc;   // NOTE(review): 64-bit value stored into int32_t GpuEvent::srcloc - confirm the narrowing is intended
+    zone->thread = 0;
+
+    // Nested zones hang off the innermost open zone; otherwise the zone is top-level.
+    auto timeline = ctx->stack.empty() ? &ctx->timeline : &ctx->stack.back()->child;
+
+    m_lock.lock();
+    timeline->push_back( zone );
+    m_lock.unlock();
+
+    ctx->stack.push_back( zone );
+    ctx->queue.push_back( zone );
+}
+
+// Handles a GpuZoneEnd event: closes the innermost open zone of the context and records its CPU end time and thread.
+void View::ProcessGpuZoneEnd( const QueueGpuZoneEnd& ev )
+{
+    assert( ev.context < m_gpuData.size() );    // ids index m_gpuData, so they must be strictly below its size
+    auto ctx = m_gpuData[ev.context];
+    assert( !ctx->stack.empty() );
+    auto zone = ctx->stack.back();
+    ctx->stack.pop_back();
+    ctx->queue.push_back( zone );               // second queue entry for this zone; the first was added on begin
+
+    std::lock_guard<std::mutex> lock( m_lock );
+    zone->cpuEnd = ev.cpuTime;
+    zone->thread = ev.thread;
+}
+
 void View::CheckString( uint64_t ptr )
 {
     if( m_strings.find( ptr ) != m_strings.end() ) return;
diff --git a/server/TracyView.hpp b/server/TracyView.hpp
index 4b71d768..14d161c9 100644
--- a/server/TracyView.hpp
+++ b/server/TracyView.hpp
@@ -68,6 +68,8 @@ private:
     void ProcessMessage( const QueueMessage& ev );
     void ProcessMessageLiteral( const QueueMessage& ev );
     void ProcessGpuNewContext( const QueueGpuNewContext& ev );
+    void ProcessGpuZoneBegin( const QueueGpuZoneBegin& ev );
+    void ProcessGpuZoneEnd( const QueueGpuZoneEnd& ev );
 
     void CheckString( uint64_t ptr );
     void CheckThreadString( uint64_t id );