#ifndef __TRACYD3D12_HPP__ #define __TRACYD3D12_HPP__ #ifndef TRACY_ENABLE #define TracyD3D12Context(device, queue) nullptr #define TracyD3D12Destroy(ctx) #define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) #define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) #define TracyD3D12Zone(ctx, cmdList, name) #define TracyD3D12ZoneC(ctx, cmdList, name, color) #define TracyD3D12Collect(ctx) namespace tracy { class D3D12ZoneScope {}; } using TracyD3D12Ctx = void*; #else #include "Tracy.hpp" #include "client/TracyProfiler.hpp" #include #include #include #include #include #include #include namespace tracy { // Command queue context. class D3D12QueueCtx { friend class D3D12ZoneScope; static constexpr uint32_t MaxQueries = 64 * 1024; // Queries are begin and end markers, so we can store half as many total time durations. Must be even! bool m_initialized = false; ID3D12Device* m_device; uint8_t m_context; Microsoft::WRL::ComPtr m_queryHeap; Microsoft::WRL::ComPtr m_readbackBuffer; uint32_t m_queryLimit = MaxQueries; uint32_t m_queryCounter = 0; uint32_t m_previousQueryCounter = 0; public: D3D12QueueCtx(ID3D12Device* device, ID3D12CommandQueue* queue) : m_device(device) , m_context(GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed)) { // Verify we support timestamp queries on this queue. if (queue->GetDesc().Type == D3D12_COMMAND_LIST_TYPE_COPY) { D3D12_FEATURE_DATA_D3D12_OPTIONS3 featureData{}; if (FAILED(device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS3, &featureData, sizeof(featureData)))) { assert(false && "Platform does not support profiling of copy queues."); } } uint64_t timestampFrequency; if (FAILED(queue->GetTimestampFrequency(×tampFrequency))) { assert(false && "Failed to get timestamp frequency."); } uint64_t cpuTimestamp; uint64_t gpuTimestamp; if (FAILED(queue->GetClockCalibration(&gpuTimestamp, &cpuTimestamp))) { assert(false && "Failed to get queue clock calibration."); } cpuTimestamp = Profiler::GetTime(); D3D12_QUERY_HEAP_DESC heapDesc{}; heapDesc.Type = queue->GetDesc().Type == D3D12_COMMAND_LIST_TYPE_COPY ? D3D12_QUERY_HEAP_TYPE_COPY_QUEUE_TIMESTAMP : D3D12_QUERY_HEAP_TYPE_TIMESTAMP; heapDesc.Count = m_queryLimit; heapDesc.NodeMask = 0; // #TODO: Support multiple adapters. while (FAILED(device->CreateQueryHeap(&heapDesc, IID_PPV_ARGS(&m_queryHeap)))) { m_queryLimit /= 2; heapDesc.Count = m_queryLimit; } // Create a readback buffer, which will be used as a destination for the query data. D3D12_RESOURCE_DESC readbackBufferDesc{}; readbackBufferDesc.Alignment = 0; readbackBufferDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER; readbackBufferDesc.Width = m_queryLimit * sizeof(uint64_t); readbackBufferDesc.Height = 1; readbackBufferDesc.DepthOrArraySize = 1; readbackBufferDesc.Format = DXGI_FORMAT_UNKNOWN; readbackBufferDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR; // Buffers are always row major. readbackBufferDesc.MipLevels = 1; readbackBufferDesc.SampleDesc.Count = 1; readbackBufferDesc.SampleDesc.Quality = 0; readbackBufferDesc.Flags = D3D12_RESOURCE_FLAG_NONE; D3D12_HEAP_PROPERTIES readbackHeapProps{}; readbackHeapProps.Type = D3D12_HEAP_TYPE_READBACK; readbackHeapProps.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN; readbackHeapProps.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN; readbackHeapProps.CreationNodeMask = 0; readbackHeapProps.VisibleNodeMask = 0; // #TODO: Support multiple adapters. if (FAILED(device->CreateCommittedResource(&readbackHeapProps, D3D12_HEAP_FLAG_NONE, &readbackBufferDesc, D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&m_readbackBuffer)))) { assert(false && "Failed to create query readback buffer."); } auto* item = Profiler::QueueSerial(); MemWrite(&item->hdr.type, QueueType::GpuNewContext); MemWrite(&item->gpuNewContext.cpuTime, cpuTimestamp); MemWrite(&item->gpuNewContext.gpuTime, gpuTimestamp); memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread)); MemWrite(&item->gpuNewContext.period, 1E+09f / static_cast(timestampFrequency)); MemWrite(&item->gpuNewContext.context, m_context); MemWrite(&item->gpuNewContext.accuracyBits, uint8_t{ 0 }); MemWrite(&item->gpuNewContext.type, GpuContextType::Direct3D12); #ifdef TRACY_ON_DEMAND GetProfiler().DeferItem(*item); #endif Profiler::QueueSerialFinish(); m_initialized = true; } ~D3D12QueueCtx() {} void Collect() { ZoneScopedC(Color::Red4); // Check to see if we have any new queries. if (m_queryCounter == m_previousQueryCounter) return; #ifdef TRACY_ON_DEMAND if (!GetProfiler().IsConnected()) { m_queryCounter = 0; return; } #endif // Batch submit all of our query data to the profiler. D3D12_RANGE mapRange{ 0, m_queryLimit * sizeof(uint64_t) }; // Map the readback buffer so we can fetch the query data from the GPU. void* readbackBufferMapping = nullptr; if (FAILED(m_readbackBuffer->Map(0, &mapRange, &readbackBufferMapping))) { assert(false && "Failed to map readback buffer."); } auto* timestampData = static_cast(readbackBufferMapping); // First off we need to sort our query data. Without this, out-of-order command list execution (with respect to CPU timeline recording) // would cause view artifacts in the viewer (zones disappear, take up the whole timeline, etc.) std::vector queryData; queryData.resize(m_queryCounter); if (m_previousQueryCounter + m_queryCounter <= m_queryLimit) // Make sure we don't need to loop over. { std::copy(timestampData + m_previousQueryCounter, timestampData + m_previousQueryCounter + m_queryCounter, queryData.begin()); } else { const auto firstBatch = (m_previousQueryCounter + m_queryCounter) - m_queryLimit; std::copy(timestampData + m_previousQueryCounter, timestampData + m_queryLimit, queryData.begin()); std::copy(timestampData, timestampData + (m_queryCounter - firstBatch), std::next(queryData.begin(), m_queryCounter - firstBatch)); } std::sort(queryData.begin(), queryData.end(), std::less{}); // Data is sorted, send it to the profiler. for (uint32_t index = 0; index < queryData.size(); ++index) { const auto timestamp = queryData[index]; const auto queryId = m_previousQueryCounter + index; auto* item = Profiler::QueueSerial(); MemWrite(&item->hdr.type, QueueType::GpuTime); MemWrite(&item->gpuTime.gpuTime, timestamp); MemWrite(&item->gpuTime.queryId, static_cast(queryId)); MemWrite(&item->gpuTime.context, m_context); Profiler::QueueSerialFinish(); } m_readbackBuffer->Unmap(0, nullptr); m_previousQueryCounter += m_queryCounter; m_queryCounter = 0; if (m_previousQueryCounter >= m_queryLimit) { m_previousQueryCounter -= m_queryLimit; } } private: tracy_force_inline uint32_t NextQueryId() { assert(m_queryCounter < m_queryLimit && "Submitted too many GPU queries! Consider increasing MaxQueries."); const uint32_t id = (m_previousQueryCounter + m_queryCounter) % m_queryLimit; m_queryCounter += 2; // Allocate space for a begin and end query. return id; } tracy_force_inline uint8_t GetId() const { return m_context; } }; class D3D12ZoneScope { const bool m_active; D3D12QueueCtx* m_ctx = nullptr; ID3D12GraphicsCommandList* m_cmdList = nullptr; uint32_t m_queryId = 0; // Used for tracking in nested zones. public: tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, const SourceLocationData* srcLocation, bool active) #ifdef TRACY_ON_DEMAND : m_active(active && GetProfiler().IsConnected()) #else : m_active(active) #endif { if (!m_active) return; m_ctx = ctx; m_cmdList = cmdList; m_queryId = ctx->NextQueryId(); cmdList->EndQuery(ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, m_queryId); auto* item = Profiler::QueueSerial(); #if defined(TRACY_HAS_CALLSTACK) && defined(TRACY_CALLSTACK) MemWrite(&item->hdr.type, QueueType::GpuZoneBeginCallstackSerial); #else MemWrite(&item->hdr.type, QueueType::GpuZoneBeginSerial); #endif MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime()); MemWrite(&item->gpuZoneBegin.srcloc, reinterpret_cast(srcLocation)); MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle()); MemWrite(&item->gpuZoneBegin.queryId, static_cast(m_queryId)); MemWrite(&item->gpuZoneBegin.context, ctx->GetId()); Profiler::QueueSerialFinish(); #if defined(TRACY_HAS_CALLSTACK) && defined(TRACY_CALLSTACK) GetProfiler().SendCallstack(TRACY_CALLSTACK); #endif } tracy_force_inline ~D3D12ZoneScope() { if (!m_active) return; const auto queryId = m_queryId + 1; // Our end query slot is immediately after the begin slot. m_cmdList->EndQuery(m_ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, queryId); auto* item = Profiler::QueueSerial(); MemWrite(&item->hdr.type, QueueType::GpuZoneEndSerial); MemWrite(&item->gpuZoneEnd.cpuTime, Profiler::GetTime()); MemWrite(&item->gpuZoneEnd.thread, GetThreadHandle()); MemWrite(&item->gpuZoneEnd.queryId, static_cast(queryId)); MemWrite(&item->gpuZoneEnd.context, m_ctx->GetId()); Profiler::QueueSerialFinish(); m_cmdList->ResolveQueryData(m_ctx->m_queryHeap.Get(), D3D12_QUERY_TYPE_TIMESTAMP, m_queryId, 2, m_ctx->m_readbackBuffer.Get(), m_queryId * sizeof(uint64_t)); } }; static inline D3D12QueueCtx* CreateD3D12Context(ID3D12Device* device, ID3D12CommandQueue* queue) { InitRPMallocThread(); auto* ctx = static_cast(tracy_malloc(sizeof(D3D12QueueCtx))); new (ctx) D3D12QueueCtx{ device, queue }; return ctx; } static inline void DestroyD3D12Context(D3D12QueueCtx* ctx) { ctx->~D3D12QueueCtx(); tracy_free(ctx); } } using TracyD3D12Ctx = tracy::D3D12QueueCtx*; #define TracyD3D12Context(device, queue) tracy::CreateD3D12Context(device, queue); #define TracyD3D12Destroy(ctx) tracy::DestroyD3D12Context(ctx); #define TracyD3D12NamedZone(ctx, varname, cmdList, name, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, __LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, __LINE__), active }; #define TracyD3D12NamedZoneC(ctx, varname, cmdList, name, color, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location, __LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::D3D12ZoneScope varname{ ctx, cmdList, &TracyConcat(__tracy_gpu_source_location, __LINE__), active }; #define TracyD3D12Zone(ctx, cmdList, name) TracyD3D12NamedZone(ctx, ___tracy_gpu_zone, cmdList, name, true) #define TracyD3D12ZoneC(ctx, cmdList, name, color) TracyD3D12NamedZoneC(ctx, ___tracy_gpu_zone, cmdList, name, color, true) #define TracyD3D12Collect(ctx) ctx->Collect(); #endif #endif