Merge pull request #70 from Xenonic/master

Implemented GPU synchronization for Direct3D 12
This commit is contained in:
Bartosz Taudul 2020-07-09 00:18:12 +02:00 committed by GitHub
commit 587fd3a0bd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 39 additions and 3 deletions

View File

@ -50,8 +50,8 @@ namespace tracy
bool m_initialized = false; bool m_initialized = false;
ID3D12Device* m_device; ID3D12Device* m_device = nullptr;
ID3D12CommandQueue* m_queue; ID3D12CommandQueue* m_queue = nullptr;
uint8_t m_context; uint8_t m_context;
Microsoft::WRL::ComPtr<ID3D12QueryHeap> m_queryHeap; Microsoft::WRL::ComPtr<ID3D12QueryHeap> m_queryHeap;
Microsoft::WRL::ComPtr<ID3D12Resource> m_readbackBuffer; Microsoft::WRL::ComPtr<ID3D12Resource> m_readbackBuffer;
@ -65,6 +65,9 @@ namespace tracy
Microsoft::WRL::ComPtr<ID3D12Fence> m_payloadFence; Microsoft::WRL::ComPtr<ID3D12Fence> m_payloadFence;
std::queue<D3D12QueryPayload> m_payloadQueue; std::queue<D3D12QueryPayload> m_payloadQueue;
int64_t m_prevCalibration = 0;
int64_t m_qpcToNs = int64_t{ 1000000000 / GetFrequencyQpc() };
public: public:
D3D12QueueCtx(ID3D12Device* device, ID3D12CommandQueue* queue) D3D12QueueCtx(ID3D12Device* device, ID3D12CommandQueue* queue)
: m_device(device) : m_device(device)
@ -98,6 +101,9 @@ namespace tracy
assert(false && "Failed to get queue clock calibration."); assert(false && "Failed to get queue clock calibration.");
} }
// Save the device cpu timestamp, not the profiler's timestamp.
m_prevCalibration = cpuTimestamp * m_qpcToNs;
cpuTimestamp = Profiler::GetTime(); cpuTimestamp = Profiler::GetTime();
D3D12_QUERY_HEAP_DESC heapDesc{}; D3D12_QUERY_HEAP_DESC heapDesc{};
@ -150,7 +156,7 @@ namespace tracy
memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread)); memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread));
MemWrite(&item->gpuNewContext.period, 1E+09f / static_cast<float>(timestampFrequency)); MemWrite(&item->gpuNewContext.period, 1E+09f / static_cast<float>(timestampFrequency));
MemWrite(&item->gpuNewContext.context, m_context); MemWrite(&item->gpuNewContext.context, m_context);
MemWrite(&item->gpuNewContext.flags, uint8_t{ 0 }); MemWrite(&item->gpuNewContext.flags, GpuContextCalibration);
MemWrite(&item->gpuNewContext.type, GpuContextType::Direct3D12); MemWrite(&item->gpuNewContext.type, GpuContextType::Direct3D12);
#ifdef TRACY_ON_DEMAND #ifdef TRACY_ON_DEMAND
@ -233,6 +239,34 @@ namespace tracy
} }
m_readbackBuffer->Unmap(0, nullptr); m_readbackBuffer->Unmap(0, nullptr);
// Recalibrate to account for drift.
uint64_t cpuTimestamp;
uint64_t gpuTimestamp;
if (FAILED(m_queue->GetClockCalibration(&gpuTimestamp, &cpuTimestamp)))
{
assert(false && "Failed to get queue clock calibration.");
}
cpuTimestamp *= m_qpcToNs;
const auto cpuDelta = cpuTimestamp - m_prevCalibration;
if (cpuDelta > 0)
{
m_prevCalibration = cpuTimestamp;
cpuTimestamp = Profiler::GetTime();
auto* item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuCalibration);
MemWrite(&item->gpuCalibration.gpuTime, gpuTimestamp);
MemWrite(&item->gpuCalibration.cpuTime, cpuTimestamp);
MemWrite(&item->gpuCalibration.cpuDelta, cpuDelta);
MemWrite(&item->gpuCalibration.context, m_context);
Profiler::QueueSerialFinish();
}
} }
private: private:

View File

@ -1229,6 +1229,8 @@ Using GPU zones is the same as the Vulkan implementation, where the \texttt{Trac
The macro \texttt{TracyD3D12NewFrame(ctx)} is used to mark a new frame, and should appear before or after recording command lists, similar to \texttt{FrameMark}. This macro is a key component that enables automatic query data synchronization, so the user doesn't have to worry about synchronizing GPU execution before invoking a collection. Event data can then be collected and sent to the profiler using the \texttt{TracyD3D12Collect(ctx)} macro. The macro \texttt{TracyD3D12NewFrame(ctx)} is used to mark a new frame, and should appear before or after recording command lists, similar to \texttt{FrameMark}. This macro is a key component that enables automatic query data synchronization, so the user doesn't have to worry about synchronizing GPU execution before invoking a collection. Event data can then be collected and sent to the profiler using the \texttt{TracyD3D12Collect(ctx)} macro.
Note that dynamic frequency scaling of the GPU can introduce artifacts that make the profiling results slightly inaccurate. To counter this, \texttt{ID3D12Device::SetStablePowerState()} can be used to enable accurate profiling, at the expense of some performance. If the machine is not in developer mode, calling this method will cause the device to be removed. Do not call \texttt{SetStablePowerState()} in shipping code.
\subsubsection{OpenCL} \subsubsection{OpenCL}
OpenCL support is achieved by including the \texttt{tracy/TracyOpenCL.hpp} header file. Tracing OpenCL requires the creation of a Tracy OpenCL context using the macro \texttt{TracyCLContext(context, device)}, which will return an instance of \texttt{TracyCLCtx} object that must be used when creating zones. The specified \texttt{device} must be part of the \texttt{context}. Cleanup is performed using the \texttt{TracyCLDestroy(ctx)} macro. Although not common, it is possible to create multiple OpenCL contexts for the same application. OpenCL support is achieved by including the \texttt{tracy/TracyOpenCL.hpp} header file. Tracing OpenCL requires the creation of a Tracy OpenCL context using the macro \texttt{TracyCLContext(context, device)}, which will return an instance of \texttt{TracyCLCtx} object that must be used when creating zones. The specified \texttt{device} must be part of the \texttt{context}. Cleanup is performed using the \texttt{TracyCLDestroy(ctx)} macro. Although not common, it is possible to create multiple OpenCL contexts for the same application.