Merge branch 'master' into master

This commit is contained in:
Andrew Depke 2020-06-08 23:50:20 -06:00 committed by GitHub
commit 39479b8d93
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 595 additions and 17 deletions

View File

@ -7,4 +7,5 @@ Till Rathmann <till.rathmann@gmx.de> (DLL support)
Sherief Farouk <sherief.personal@gmail.com> (compatibility fixes)
Dedmen Miller <dedmen@dedmen.de> (find zone bug fixes, improvements)
Michał Cichoń <michcic@gmail.com> (OSX call stack decoding backport)
Thales Sabino <thales@codeplay.com> (OpenCL support)
Andrew Depke <andrewdepke@gmail.com> (Direct3D 12 support)

4
NEWS
View File

@ -9,6 +9,9 @@ a mistake.
v0.7 (xxxx-xx-xx)
-----------------
This is the last release which will be able to load pre-v0.6 traces. Use the
update utility to convert your old traces now!
- chrome:tracing importer now imports zone metadata from "args" key.
- Added display of statistical mode to find zone menu.
- Automatic stack sampling is now available on windows.
@ -61,6 +64,7 @@ v0.7 (xxxx-xx-xx)
- Added ability to send simple integral values as extra payload for zones.
- Per-frame zone times on the frames plot can now display self time.
- Ability to bind only on localhost interface.
- OpenCL profiling.
v0.6.3 (2020-02-13)
-------------------

View File

@ -2,9 +2,9 @@
[![Sponsor](.github/sponsor.png)](https://github.com/sponsors/wolfpld/)
### A real time, nanosecond resolution, remote telemetry frame profiler for games and other applications.
### A real time, nanosecond resolution, remote telemetry, hybrid frame and sampling profiler for games and other applications.
Tracy supports profiling CPU (C, C++11, Lua), GPU (OpenGL, Vulkan), memory, locks, context switches, per-frame screenshots and more.
Tracy supports profiling CPU (C, C++11, Lua), GPU (OpenGL, Vulkan, OpenCL), memory, locks, context switches, per-frame screenshots and more.
For usage **and build process** instructions, consult the user manual [at the following address](https://github.com/wolfpld/tracy/releases).

View File

@ -150,7 +150,7 @@
# define ZoneScopedS( depth ) ZoneNamedS( ___tracy_scoped_zone, depth, true )
# define ZoneScopedNS( name, depth ) ZoneNamedNS( ___tracy_scoped_zone, name, depth, true )
# define ZoneScopedCS( color, depth ) ZoneNamedCS( ___tracy_scoped_zone, color, depth, true )
# define ZoneScopedNCS( name, color, depth ) ZoneNamedNCS( ___tracy_scoped_zone, name, color depth, true )
# define ZoneScopedNCS( name, color, depth ) ZoneNamedNCS( ___tracy_scoped_zone, name, color, depth, true )
# define TracyAllocS( ptr, size, depth ) tracy::Profiler::MemAllocCallstack( ptr, size, depth );
# define TracyFreeS( ptr, depth ) tracy::Profiler::MemFreeCallstack( ptr, depth );

333
TracyOpenCL.hpp Normal file
View File

@ -0,0 +1,333 @@
#ifndef __TRACYOPENCL_HPP__
#define __TRACYOPENCL_HPP__
#if !defined TRACY_ENABLE
#define TracyCLContext(x, y) nullptr
#define TracyCLDestroy(x)
#define TracyCLNamedZone(c, x, y, z, w)
#define TracyCLNamedZoneC(c, x, y, z, w, a)
#define TracyCLZone(c, x, y)
#define TracyCLZoneC(c, x, y, z)
#define TracyCLCollect(c)
#define TracyCLNamedZoneS(c, x, y, z, w, a)
#define TracyCLNamedZoneCS(c, x, y, z, w, v, a)
#define TracyCLZoneS(c, x, y, z)
#define TracyCLZoneCS(c, x, y, z, w)
namespace tracy
{
class OpenCLCtxScope {};
}
using TracyCLCtx = void*;
#else
#include <CL/cl.h>
#include <atomic>
#include <cassert>
#include "Tracy.hpp"
#include "client/TracyCallstack.hpp"
#include "client/TracyProfiler.hpp"
#include "common/TracyAlloc.hpp"
namespace tracy {
enum class EventPhase : uint8_t
{
Begin,
End
};
struct EventInfo
{
cl_event event;
EventPhase phase;
};
class OpenCLCtx
{
public:
enum { QueryCount = 64 * 1024 };
OpenCLCtx(cl_context context, cl_device_id device)
: m_contextId(GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed))
, m_head(0)
, m_tail(0)
{
assert(m_contextId != 255);
m_hostStartTime = Profiler::GetTime();
m_deviceStartTime = GetDeviceTimestamp(context, device);
auto item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuNewContext);
MemWrite(&item->gpuNewContext.cpuTime, m_hostStartTime);
MemWrite(&item->gpuNewContext.gpuTime, m_hostStartTime);
memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread));
MemWrite(&item->gpuNewContext.period, 1.0f);
MemWrite(&item->gpuNewContext.type, GpuContextType::OpenCL);
MemWrite(&item->gpuNewContext.context, (uint8_t) m_contextId);
MemWrite(&item->gpuNewContext.accuracyBits, (uint8_t)0);
#ifdef TRACY_ON_DEMAND
GetProfiler().DeferItem(*item);
#endif
Profiler::QueueSerialFinish();
}
void Collect()
{
ZoneScopedC(Color::Red4);
if (m_tail == m_head) return;
#ifdef TRACY_ON_DEMAND
if (!GetProfiler().IsConnected())
{
m_head = m_tail = 0;
}
#endif
while (m_tail != m_head)
{
EventInfo eventInfo = m_query[m_tail];
cl_event event = eventInfo.event;
cl_int eventStatus;
cl_int err = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr);
assert(err == CL_SUCCESS);
if (eventStatus != CL_COMPLETE) return;
cl_int eventInfoQuery = (eventInfo.phase == EventPhase::Begin)
? CL_PROFILING_COMMAND_START
: CL_PROFILING_COMMAND_END;
cl_ulong eventTimeStamp = 0;
err = clGetEventProfilingInfo(event, eventInfoQuery, sizeof(cl_ulong), &eventTimeStamp, nullptr);
assert(err == CL_SUCCESS);
assert(eventTimeStamp != 0);
auto item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuTime);
MemWrite(&item->gpuTime.gpuTime, TimestampOffset(eventTimeStamp));
MemWrite(&item->gpuTime.queryId, (uint16_t)m_tail);
MemWrite(&item->gpuTime.context, m_contextId);
Profiler::QueueSerialFinish();
if (eventInfo.phase == EventPhase::End)
{
// Done with the event, so release it
assert(clReleaseEvent(event) == CL_SUCCESS);
}
m_tail = (m_tail + 1) % QueryCount;
}
}
tracy_force_inline uint8_t GetId() const
{
return m_contextId;
}
tracy_force_inline unsigned int NextQueryId(EventInfo eventInfo)
{
const auto id = m_head;
m_head = (m_head + 1) % QueryCount;
assert(m_head != m_tail);
m_query[id] = eventInfo;
return id;
}
tracy_force_inline EventInfo& GetQuery(unsigned int id)
{
assert(id < QueryCount);
return m_query[id];
}
private:
tracy_force_inline int64_t GetHostStartTime() const
{
return m_hostStartTime;
}
tracy_force_inline int64_t GetDeviceStartTime() const
{
return m_deviceStartTime;
}
tracy_force_inline int64_t TimestampOffset(int64_t deviceTimestamp) const
{
return m_hostStartTime + (deviceTimestamp - m_deviceStartTime);
}
tracy_force_inline int64_t GetDeviceTimestamp(cl_context context, cl_device_id device) const
{
cl_ulong deviceTimestamp = 0;
cl_int err = CL_SUCCESS;
cl_command_queue queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
assert(err == CL_SUCCESS);
uint32_t dummyValue = 42;
cl_mem dummyBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(uint32_t), nullptr, &err);
assert(err == CL_SUCCESS);
cl_event writeBufferEvent;
err = clEnqueueWriteBuffer(queue, dummyBuffer, CL_TRUE, 0, sizeof(uint32_t), &dummyValue, 0, nullptr, &writeBufferEvent);
assert(err == CL_SUCCESS);
err = clWaitForEvents(1, &writeBufferEvent);
assert(err == CL_SUCCESS);
cl_int eventStatus;
err = clGetEventInfo(writeBufferEvent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr);
assert(err == CL_SUCCESS);
assert(eventStatus == CL_COMPLETE);
err = clGetEventProfilingInfo(writeBufferEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &deviceTimestamp, nullptr);
assert(err == CL_SUCCESS);
err = clReleaseEvent(writeBufferEvent);
assert(err == CL_SUCCESS);
err = clReleaseMemObject(dummyBuffer);
assert(err == CL_SUCCESS);
err = clReleaseCommandQueue(queue);
assert(err == CL_SUCCESS);
return (int64_t)deviceTimestamp;
}
unsigned int m_contextId;
EventInfo m_query[QueryCount];
unsigned int m_head;
unsigned int m_tail;
int64_t m_hostStartTime;
int64_t m_deviceStartTime;
};
class OpenCLCtxScope {
public:
tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, const SourceLocationData* srcLoc, bool is_active)
#ifdef TRACY_ON_DEMAND
: m_active(is_active&& GetProfiler().IsConnected())
#else
: m_active(is_active)
#endif
, m_ctx(ctx)
, m_event(nullptr)
{
if (!m_active) return;
m_beginQueryId = ctx->NextQueryId(EventInfo{ nullptr, EventPhase::Begin });
auto item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuZoneBeginSerial);
MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)srcLoc);
MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)m_beginQueryId);
MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
Profiler::QueueSerialFinish();
}
tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, const SourceLocationData* srcLoc, int depth, bool is_active)
#ifdef TRACY_ON_DEMAND
: m_active(is_active&& GetProfiler().IsConnected())
#else
: m_active(is_active)
#endif
, m_ctx(ctx)
, m_event(nullptr)
{
if (!m_active) return;
m_beginQueryId = ctx->NextQueryId(EventInfo{ nullptr, EventPhase::Begin });
auto item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuZoneBeginCallstackSerial);
MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)srcLoc);
MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)m_beginQueryId);
MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
Profiler::QueueSerialFinish();
GetProfiler().SendCallstack(depth);
}
tracy_force_inline void SetEvent(cl_event event)
{
m_event = event;
assert(clRetainEvent(m_event) == CL_SUCCESS);
m_ctx->GetQuery(m_beginQueryId).event = m_event;
}
tracy_force_inline ~OpenCLCtxScope()
{
const auto queryId = m_ctx->NextQueryId(EventInfo{ m_event, EventPhase::End });
auto item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuZoneEndSerial);
MemWrite(&item->gpuZoneEnd.cpuTime, Profiler::GetTime());
MemWrite(&item->gpuZoneEnd.thread, GetThreadHandle());
MemWrite(&item->gpuZoneEnd.queryId, (uint16_t)queryId);
MemWrite(&item->gpuZoneEnd.context, m_ctx->GetId());
Profiler::QueueSerialFinish();
}
const bool m_active;
OpenCLCtx* m_ctx;
cl_event m_event;
unsigned int m_beginQueryId;
};
static inline OpenCLCtx* CreateCLContext(cl_context context, cl_device_id device)
{
InitRPMallocThread();
auto ctx = (OpenCLCtx*)tracy_malloc(sizeof(OpenCLCtx));
new (ctx) OpenCLCtx(context, device);
return ctx;
}
static inline void DestroyCLContext(OpenCLCtx* ctx)
{
ctx->~OpenCLCtx();
tracy_free(ctx);
}
} // namespace tracy
using TracyCLCtx = tracy::OpenCLCtx*;
#define TracyCLContext(context, device) tracy::CreateCLContext(context, device);
#define TracyCLDestroy(ctx) tracy::DestroyCLContext(ctx);
#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK
# define TracyCLNamedZone(ctx, varname, name, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), TRACY_CALLSTACK, active );
# define TracyCLNamedZoneC(ctx, varname, name, color, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), TRACY_CALLSTACK, active );
# define TracyCLZone(ctx, name) TracyCLNamedZoneS(ctx, __tracy_gpu_zone, name, TRACY_CALLSTACK, true)
# define TracyCLZoneC(ctx, name, color) TracyCLNamedZoneCS(ctx, __tracy_gpu_zone, name, color, TRACY_CALLSTACK, true)
#else
# define TracyCLNamedZone(ctx, varname, name, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), active);
# define TracyCLNamedZoneC(ctx, varname, name, color, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), active);
# define TracyCLZone(ctx, name) TracyCLNamedZone(ctx, __tracy_gpu_zone, name, true)
# define TracyCLZoneC(ctx, name, color) TracyCLNamedZoneC(ctx, __tracy_gpu_zone, name, color, true )
#endif
#ifdef TRACY_HAS_CALLSTACK
# define TracyCLNamedZoneS(ctx, varname, name, depth, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), depth, active);
# define TracyCLNamedZoneCS(ctx, varname, name, color, depth, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), depth, active);
# define TracyCLZoneS(ctx, name, depth) TracyCLNamedZoneS(ctx, __tracy_gpu_zone, name, depth, true)
# define TracyCLZoneCS(ctx, name, color, depth) TracyCLNamedZoneCS(ctx, __tracy_gpu_zone, name, color, depth, true)
#else
#define TracyCLNamedZoneS(ctx, varname, name, depth, active) TracyCLNamedZone(ctx, varname, name, active)
#define TracyCLNamedZoneCS(ctx, varname, name, color, depth, active) TracyCLNamedZoneC(ctx, varname, name, color, active)
#define TracyCLZoneS(ctx, name, depth) TracyCLZone(ctx, name)
#define TracyCLZoneCS(ctx, name, color, depth) TracyCLZoneC(ctx, name, color)
#endif
#define TracyCLNamedZoneSetEvent(varname, event) varname.SetEvent(event)
#define TracyCLZoneSetEvent(event) __tracy_gpu_zone.SetEvent(event)
#define TracyCLCollect(ctx) ctx->Collect()
#endif
#endif

View File

@ -224,7 +224,7 @@ static int64_t SetupHwTimer()
#ifndef TRACY_TIMER_QPC
uint32_t regs[4];
CpuId( regs, 0x80000001 );
if( !( regs[3] & ( 1 << 27 ) ) ) InitFailure( "CPU doesn't support RDTSCP instruction." );
if( !( regs[3] & ( 1 << 4 ) ) ) InitFailure( "CPU doesn't support RDTSC instruction." );
CpuId( regs, 0x80000007 );
if( !( regs[3] & ( 1 << 8 ) ) )
{

View File

@ -9,7 +9,7 @@ namespace tracy
constexpr unsigned Lz4CompressBound( unsigned isize ) { return isize + ( isize / 255 ) + 16; }
enum : uint32_t { ProtocolVersion = 33 };
enum : uint32_t { ProtocolVersion = 34 };
enum : uint32_t { BroadcastVersion = 1 };
using lz4sz_t = uint32_t;

View File

@ -264,6 +264,7 @@ enum class GpuContextType : uint8_t
Invalid,
OpenGl,
Vulkan,
OpenCL,
Direct3D12
};

View File

@ -0,0 +1,14 @@
cmake_minimum_required(VERSION 3.0)
project(OpenCLVectorAdd)
find_package(OpenCL REQUIRED)
add_executable(OpenCLVectorAdd OpenCLVectorAdd.cpp)
add_library(TracyClient STATIC ../../TracyClient.cpp
../../TracyOpenCL.hpp)
target_include_directories(TracyClient PUBLIC ../../)
target_compile_definitions(TracyClient PUBLIC TRACY_ENABLE=1)
target_link_libraries(OpenCLVectorAdd PUBLIC OpenCL::OpenCL TracyClient)

View File

@ -0,0 +1,190 @@
#include <iostream>
#include <cassert>
#include <string>
#include <vector>
#include <numeric>
#include <CL/cl.h>
#include <Tracy.hpp>
#include <TracyOpenCL.hpp>
#define CL_ASSERT(err) \
if((err) != CL_SUCCESS) \
{ \
std::cerr << "OpenCL Call Returned " << err << std::endl; \
assert(false); \
}
const char kernelSource[] =
" void __kernel vectorAdd(global float* C, global float* A, global float* B, int N) "
" { "
" int i = get_global_id(0); "
" if (i < N) { "
" C[i] = A[i] + B[i]; "
" } "
" } ";
int main()
{
cl_platform_id platform;
cl_device_id device;
cl_context context;
cl_command_queue commandQueue;
cl_kernel vectorAddKernel;
cl_program program;
cl_int err;
cl_mem bufferA, bufferB, bufferC;
TracyCLCtx tracyCLCtx;
{
ZoneScopedN("OpenCL Init");
cl_uint numPlatforms = 0;
CL_ASSERT(clGetPlatformIDs(0, nullptr, &numPlatforms));
if (numPlatforms == 0)
{
std::cerr << "Cannot find OpenCL platform to run this application" << std::endl;
return 1;
}
CL_ASSERT(clGetPlatformIDs(1, &platform, nullptr));
size_t platformNameBufferSize = 0;
CL_ASSERT(clGetPlatformInfo(platform, CL_PLATFORM_NAME, 0, nullptr, &platformNameBufferSize));
std::string platformName(platformNameBufferSize, '\0');
CL_ASSERT(clGetPlatformInfo(platform, CL_PLATFORM_NAME, platformNameBufferSize, &platformName[0], nullptr));
std::cout << "OpenCL Platform: " << platformName << std::endl;
CL_ASSERT(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, nullptr));
size_t deviceNameBufferSize = 0;
CL_ASSERT(clGetDeviceInfo(device, CL_DEVICE_NAME, 0, nullptr, &deviceNameBufferSize));
std::string deviceName(deviceNameBufferSize, '\0');
CL_ASSERT(clGetDeviceInfo(device, CL_DEVICE_NAME, deviceNameBufferSize, &deviceName[0], nullptr));
std::cout << "OpenCL Device: " << deviceName << std::endl;
err = CL_SUCCESS;
context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &err);
CL_ASSERT(err);
size_t kernelSourceLength = sizeof(kernelSource);
const char* kernelSourceArray = { kernelSource };
program = clCreateProgramWithSource(context, 1, &kernelSourceArray, &kernelSourceLength, &err);
CL_ASSERT(err);
if (clBuildProgram(program, 1, &device, nullptr, nullptr, nullptr) != CL_SUCCESS)
{
size_t programBuildLogBufferSize = 0;
CL_ASSERT(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, nullptr, &programBuildLogBufferSize));
std::string programBuildLog(programBuildLogBufferSize, '\0');
CL_ASSERT(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, programBuildLogBufferSize, &programBuildLog[0], nullptr));
std::clog << programBuildLog << std::endl;
return 1;
}
vectorAddKernel = clCreateKernel(program, "vectorAdd", &err);
CL_ASSERT(err);
commandQueue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
CL_ASSERT(err);
}
tracyCLCtx = TracyCLContext(context, device);
size_t N = 10 * 1024 * 1024 / sizeof(float); // 10MB of floats
std::vector<float> hostA, hostB, hostC;
{
ZoneScopedN("Host Data Init");
hostA.resize(N);
hostB.resize(N);
hostC.resize(N);
std::iota(std::begin(hostA), std::end(hostA), 0);
std::iota(std::begin(hostB), std::end(hostB), 0);
}
{
ZoneScopedN("Host to Device Memory Copy");
bufferA = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), nullptr, &err);
CL_ASSERT(err);
bufferB = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), nullptr, &err);
CL_ASSERT(err);
bufferC = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), nullptr, &err);
CL_ASSERT(err);
cl_event writeBufferAEvent, writeBufferBEvent;
{
ZoneScopedN("Write Buffer A");
TracyCLZoneS(tracyCLCtx, "Write BufferA", 5);
CL_ASSERT(clEnqueueWriteBuffer(commandQueue, bufferA, CL_TRUE, 0, N * sizeof(float), hostA.data(), 0, nullptr, &writeBufferAEvent));
TracyCLZoneSetEvent(writeBufferAEvent);
}
{
ZoneScopedN("Write Buffer B");
TracyCLZone(tracyCLCtx, "Write BufferB");
CL_ASSERT(clEnqueueWriteBuffer(commandQueue, bufferB, CL_TRUE, 0, N * sizeof(float), hostB.data(), 0, nullptr, &writeBufferBEvent));
TracyCLZoneSetEvent(writeBufferBEvent);
}
}
for (int i = 0; i < 10; ++i)
{
ZoneScopedN("VectorAdd Kernel Launch");
TracyCLZoneC(tracyCLCtx, "VectorAdd Kernel", tracy::Color::Blue4);
CL_ASSERT(clSetKernelArg(vectorAddKernel, 0, sizeof(cl_mem), &bufferC));
CL_ASSERT(clSetKernelArg(vectorAddKernel, 1, sizeof(cl_mem), &bufferA));
CL_ASSERT(clSetKernelArg(vectorAddKernel, 2, sizeof(cl_mem), &bufferB));
CL_ASSERT(clSetKernelArg(vectorAddKernel, 3, sizeof(int), &static_cast<int>(N)));
cl_event vectorAddKernelEvent;
CL_ASSERT(clEnqueueNDRangeKernel(commandQueue, vectorAddKernel, 1, nullptr, &N, nullptr, 0, nullptr, &vectorAddKernelEvent));
CL_ASSERT(clWaitForEvents(1, &vectorAddKernelEvent));
TracyCLZoneSetEvent(vectorAddKernelEvent);
cl_ulong kernelStartTime, kernelEndTime;
CL_ASSERT(clGetEventProfilingInfo(vectorAddKernelEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernelStartTime, nullptr));
CL_ASSERT(clGetEventProfilingInfo(vectorAddKernelEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernelEndTime, nullptr));
std::cout << "VectorAdd Kernel Elapsed: " << ((kernelEndTime - kernelStartTime) / 1000) << " us" << std::endl;
}
{
ZoneScopedN("Device to Host Memory Copy");
TracyCLZone(tracyCLCtx, "Read Buffer C");
cl_event readbufferCEvent;
CL_ASSERT(clEnqueueReadBuffer(commandQueue, bufferC, CL_TRUE, 0, N * sizeof(float), hostC.data(), 0, nullptr, &readbufferCEvent));
TracyCLZoneSetEvent(readbufferCEvent);
}
CL_ASSERT(clFinish(commandQueue));
TracyCLCollect(tracyCLCtx);
{
ZoneScopedN("Checking results");
for (int i = 0; i < N; ++i)
{
assert(hostC[i] == hostA[i] + hostB[i]);
}
}
std::cout << "Results are correct!" << std::endl;
TracyCLDestroy(tracyCLCtx);
return 0;
}

View File

@ -117,7 +117,7 @@ Hello and welcome to the Tracy Profiler user manual! Here you will find all the
\section{A quick look at Tracy Profiler}
\label{quicklook}
Tracy is a real-time, nanosecond resolution \emph{hybrid frame and sampling profiler} that can be used for remote or embedded telemetry of games and other applications. It can profile CPU (C, C++11, Lua), GPU (OpenGL, Vulkan, Direct3D 12) and memory. It also can monitor locks held by threads and show where contention does happen.
Tracy is a real-time, nanosecond resolution \emph{hybrid frame and sampling profiler} that can be used for remote or embedded telemetry of games and other applications. It can profile CPU (C, C++11, Lua), GPU (OpenGL, Vulkan, Direct3D 12, OpenCL) and memory. It also can monitor locks held by threads and show where contention does happen.
While Tracy can perform statistical analysis of sampled call stack data, just like other \emph{statistical profilers} (such as VTune, perf or Very Sleepy), it mainly focuses on manual markup of the source code, which allows frame-by-frame inspection of the program execution. You will be able to see exactly which functions are called, how much time is spent in them, and how do they interact with each other in a multi-threaded environment. In contrast, the statistical analysis may show you the hot spots in your code, but it is unable to accurately pinpoint the underlying cause for semi-random frame stutter that may occur every couple of seconds.
@ -452,7 +452,7 @@ The first command will allow access to system CPU statistics. The second one wil
\paragraph{Cloud service providers}
In some cases you actually don't own the hardware, but lend it from someone else. In such circumstances you might be running inside a virtual machine, which may be configured to prohibit you from using the bare metal facilities needed by Tracy\footnote{Or you might just be using a quite old CPU, which doesn't have support for required features.}. One example of such limitation would be lack of access to a reliable time stamp register readings, which will prevent the application from starting with either 'CPU doesn't support RDTSCP instruction' or 'CPU doesn't support invariant TSC' error message. If you are using Windows, you may workaround this issue by rebuilding the profiled application with the \texttt{TRACY\_TIMER\_QPC} macro, but be aware that it will severely lower the resolution of timer readings.
In some cases you actually don't own the hardware, but lend it from someone else. In such circumstances you might be running inside a virtual machine, which may be configured to prohibit you from using the bare metal facilities needed by Tracy\footnote{Or you might just be using a quite old CPU, which doesn't have support for required features.}. One example of such limitation would be lack of access to a reliable time stamp register readings, which will prevent the application from starting with either 'CPU doesn't support RDTSC instruction' or 'CPU doesn't support invariant TSC' error message. If you are using Windows, you may workaround this issue by rebuilding the profiled application with the \texttt{TRACY\_TIMER\_QPC} macro, but be aware that it will severely lower the resolution of timer readings.
\subsubsection{Changing network port}
@ -691,7 +691,7 @@ Memory & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck \\
GPU zones (OpenGL) & \faCheck & \faCheck & \faCheck & \faPoo & \faPoo & \\
GPU zones (Vulkan) & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \\
Call stacks & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck \\
Symbol resolution & \faCheck & \faCheck & \faCheck & \faCheck & \faPoo & \faCheck \\
Symbol resolution & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck \\
Crash handling & \faCheck & \faCheck & \faCheck & \faTimes & \faTimes & \faTimes \\
CPU usage probing & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck & \faCheck \\
Context switches & \faCheck & \faCheck & \faCheck & \faTimes & \faPoo & \faTimes \\
@ -1207,6 +1207,16 @@ Using GPU zones is the same as the Vulkan implementation, where the \texttt{Trac
The macro \texttt{TracyD3D12NewFrame(ctx)} is used to mark a new frame, and should appear before or after recording command lists, similar to \texttt{FrameMark}. This macro is a key component that enables automatic query data synchronization, so the user doesn't have to worry about synchronizing GPU execution before invoking a collection. Event data can then be collected and sent to the profiler using the \texttt{TracyD3D12Collect(ctx)} macro.
\subsubsection{OpenCL}
OpenCL support is achieved by including the \texttt{tracy/TracyOpenCL.hpp} header file. Tracing OpenCL requires the creation of a Tracy OpenCL context using the macro \texttt{TracyCLContext(context, device)}, which will return an instance of \texttt{TracyCLCtx} object that must be used when creating zones. The specified \texttt{device} must be part of the \texttt{context}. Cleanup is performed using the \texttt{TracyCLDestroy(ctx)} macro. Although not common, it is possible to create multiple OpenCL contexts for the same application.
To mark an OpenCL zone one must make sure that a valid OpenCL \texttt{cl\_event} object is available. The event will be the object that Tracy will use to query profiling information from the OpenCL driver. For this to work, all OpenCL queues must be created with the \texttt{CL\_QUEUE\_PROFILING\_ENABLE} property.
OpenCL zones can be created with the \texttt{TracyCLZone(ctx, name)} where \texttt{name} will usually be a descriptive name for the operation represented by the \texttt{cl\_event}. Within the scope of the zone, you must call \texttt{TracyCLSetEvent(event)} for the event to be registered in Tracy.
Similarly to Vulkan and OpenGL, you also need to periodically collect the OpenCL events using the \texttt{TracyCLCollect(ctx)} macro. A good place to perform this operation is after a \texttt{clFinish}, since this will ensure that any previous queued OpenCL commands will have finished by this point.
\subsubsection{Multiple zones in one scope}
Putting more than one GPU zone macro in a single scope features the same issue as with the \texttt{ZoneScoped} macros, described in section~\ref{multizone} (but this time the variable name is \texttt{\_\_\_tracy\_gpu\_zone}).
@ -1279,6 +1289,15 @@ To have proper call stack information, the profiled application must be compiled
\item On MSVC open the project properties and go to \emph{Linker\textrightarrow Debugging\textrightarrow Generate Debug Info}, where the \emph{Generate Debug Information} option should be selected.
\item On gcc or clang remember to specify the debugging information \texttt{-g} parameter during compilation and omit the strip symbols \texttt{-s} parameter. Link the executable with an additional option \texttt{-rdynamic} (or \texttt{-{}-export-dynamic}, if you are passing parameters directly to the linker).
\item On OSX you may need to run \texttt{dsymutil} to extract the debugging data out of the executable binary.
\item On iOS you will have to add a \emph{New Run Script Phase} to your XCode project, which will execute the following shell script:
\begin{lstlisting}[language=sh]
cp -rf ${TARGET_BUILD_DIR}/${WRAPPER_NAME}.dSYM/* ${TARGET_BUILD_DIR}/${UNLOCALIZED_RESOURCES_FOLDER_PATH}/${PRODUCT_NAME}.dSYM
\end{lstlisting}
You will also need to setup proper dependencies, by setting the following input file:\\
\texttt{\$\{TARGET\_BUILD\_DIR\}/\$\{WRAPPER\_NAME\}.dSYM}, and the following output file:\\
\texttt{\$\{TARGET\_BUILD\_DIR\}/\$\{UNLOCALIZED\_RESOURCES\_FOLDER\_PATH\}/\$\{PRODUCT\_NAME\}.dSYM}.
\end{itemize}
You may also be interested in symbols from external libraries, especially if you have sampling profiling enabled (section~\ref{sampling}). In MSVC you can retrieve such symbols by going to \emph{Tools\textrightarrow Options\textrightarrow Debugging\textrightarrow Symbols} and selecting appropriate \emph{Symbol file (.pdb) location} servers. Note that additional symbols may significantly increase application startup times.

View File

@ -436,6 +436,7 @@ void SourceView::ParseSource( const char* fileName, const Worker& worker, const
{
if( m_file != fileName )
{
m_srcWidth = 0;
m_file = fileName;
m_fileStringIdx = worker.FindStringIdx( fileName );
m_lines.clear();
@ -510,6 +511,7 @@ bool SourceView::Disassemble( uint64_t symAddr, const Worker& worker )
m_maxJumpLevel = 0;
m_asmSelected = -1;
m_asmCountBase = -1;
m_asmWidth = 0;
if( symAddr == 0 ) return false;
m_cpuArch = worker.GetCpuArch();
if( m_cpuArch == CpuArchUnknown ) return false;
@ -889,6 +891,7 @@ void SourceView::Render( const Worker& worker, View& view )
void SourceView::RenderSimpleSourceView()
{
ImGui::SetNextWindowContentSize( ImVec2( m_srcWidth, 0 ) );
ImGui::BeginChild( "##sourceView", ImVec2( 0, 0 ), true, ImGuiWindowFlags_HorizontalScrollbar );
if( m_font ) ImGui::PushFont( m_font );
@ -915,6 +918,8 @@ void SourceView::RenderSimpleSourceView()
}
RenderLine( line, lineNum++, 0, 0, 0, nullptr );
}
const auto win = ImGui::GetCurrentWindowRead();
m_srcWidth = win->DC.CursorMaxPos.x - win->DC.CursorStartPos.x;
}
else
{
@ -1376,11 +1381,12 @@ void SourceView::RenderSymbolSourceView( uint32_t iptotal, unordered_flat_map<ui
}
const float bottom = m_srcSampleSelect.empty() ? 0 : ImGui::GetFrameHeight();
ImGui::SetNextWindowContentSize( ImVec2( m_srcWidth, 0 ) );
ImGui::BeginChild( "##sourceView", ImVec2( 0, -bottom ), true, ImGuiWindowFlags_NoMove | ImGuiWindowFlags_HorizontalScrollbar );
if( m_font ) ImGui::PushFont( m_font );
auto draw = ImGui::GetWindowDrawList();
const auto wpos = ImGui::GetWindowPos();
const auto wpos = ImGui::GetWindowPos() - ImVec2( ImGui::GetCurrentWindowRead()->Scroll.x, 0 );
const auto wh = ImGui::GetWindowHeight();
const auto ty = ImGui::GetFontSize();
const auto ts = ImGui::CalcTextSize( " " ).x;
@ -1410,6 +1416,8 @@ void SourceView::RenderSymbolSourceView( uint32_t iptotal, unordered_flat_map<ui
}
RenderLine( line, lineNum++, 0, iptotal, ipmax, &worker );
}
const auto win = ImGui::GetCurrentWindowRead();
m_srcWidth = win->DC.CursorMaxPos.x - win->DC.CursorStartPos.x;
}
else
{
@ -1435,7 +1443,7 @@ void SourceView::RenderSymbolSourceView( uint32_t iptotal, unordered_flat_map<ui
}
}
auto win = ImGui::GetCurrentWindow();
const auto win = ImGui::GetCurrentWindowRead();
if( win->ScrollbarY )
{
auto draw = ImGui::GetWindowDrawList();
@ -1654,6 +1662,7 @@ uint64_t SourceView::RenderSymbolAsmView( uint32_t iptotal, unordered_flat_map<u
#endif
const float bottom = m_asmSampleSelect.empty() ? 0 : ImGui::GetFrameHeight();
ImGui::SetNextWindowContentSize( ImVec2( m_asmWidth, 0 ) );
ImGui::BeginChild( "##asmView", ImVec2( 0, -bottom ), true, ImGuiWindowFlags_NoMove | ImGuiWindowFlags_HorizontalScrollbar );
if( m_font ) ImGui::PushFont( m_font );
@ -1680,6 +1689,8 @@ uint64_t SourceView::RenderSymbolAsmView( uint32_t iptotal, unordered_flat_map<u
}
RenderAsmLine( line, 0, iptotal, ipmax, worker, jumpOut, maxAddrLen, view );
}
const auto win = ImGui::GetCurrentWindowRead();
m_asmWidth = win->DC.CursorMaxPos.x - win->DC.CursorStartPos.x;
}
else
{
@ -1848,7 +1859,7 @@ uint64_t SourceView::RenderSymbolAsmView( uint32_t iptotal, unordered_flat_map<u
#endif
}
auto win = ImGui::GetCurrentWindow();
const auto win = ImGui::GetCurrentWindowRead();
if( win->ScrollbarY )
{
auto draw = ImGui::GetWindowDrawList();
@ -2080,7 +2091,7 @@ void SourceView::RenderLine( const Line& line, int lineNum, uint32_t ipcnt, uint
{
const auto ty = ImGui::GetFontSize();
auto draw = ImGui::GetWindowDrawList();
const auto w = ImGui::GetWindowWidth();
const auto w = std::max( m_srcWidth, ImGui::GetWindowWidth() );
const auto wpos = ImGui::GetCursorScreenPos();
if( m_fileStringIdx == m_hoveredSource && lineNum == m_hoveredLine )
{
@ -2257,7 +2268,7 @@ void SourceView::RenderAsmLine( AsmLine& line, uint32_t ipcnt, uint32_t iptotal,
{
const auto ty = ImGui::GetFontSize();
auto draw = ImGui::GetWindowDrawList();
const auto w = ImGui::GetWindowWidth();
const auto w = std::max( m_asmWidth, ImGui::GetWindowWidth() );
const auto wpos = ImGui::GetCursorScreenPos();
if( m_selectedAddressesHover.find( line.addr ) != m_selectedAddressesHover.end() )
{

View File

@ -239,6 +239,9 @@ private:
unordered_flat_set<uint32_t> m_srcSampleSelect;
uint32_t m_asmGroupSelect = -1;
uint32_t m_srcGroupSelect = -1;
float m_srcWidth;
float m_asmWidth;
};
}

View File

@ -7,7 +7,7 @@ namespace Version
{
enum { Major = 0 };
enum { Minor = 6 };
enum { Patch = 14 };
enum { Patch = 15 };
}
}

View File

@ -76,6 +76,7 @@ constexpr const char* GpuContextNames[] = {
"Invalid",
"OpenGL",
"Vulkan",
"OpenCL",
"Direct3D 12"
};
@ -1293,7 +1294,7 @@ void View::DrawFrames()
enum { MaxFrameTime = 50 * 1000 * 1000 }; // 50ms
ImGuiWindow* window = ImGui::GetCurrentWindow();
ImGuiWindow* window = ImGui::GetCurrentWindowRead();
if( window->SkipItems ) return;
auto& io = ImGui::GetIO();
@ -2300,7 +2301,7 @@ void View::DrawZones()
if( m_vd.zvStart == m_vd.zvEnd ) return;
assert( m_vd.zvStart < m_vd.zvEnd );
if( ImGui::GetCurrentWindow()->SkipItems ) return;
if( ImGui::GetCurrentWindowRead()->SkipItems ) return;
m_gpuThread = 0;
m_gpuStart = 0;
@ -2473,7 +2474,8 @@ void View::DrawZones()
draw->AddTriangle( wpos + ImVec2( to/2, oldOffset + to/2 ), wpos + ImVec2( to/2, oldOffset + ty - to/2 ), wpos + ImVec2( to/2 + th, oldOffset + ty * 0.5 ), 0xFF886666, 2.0f );
}
const bool isMultithreaded = (v->type == GpuContextType::Vulkan || v->type == GpuContextType::Direct3D12);
const bool isMultithreaded = (v->type == GpuContextType::Vulkan) || (v->type == GpuContextType::OpenCL) || (v->type == GpuContextType::Direct3D12);
char buf[64];
sprintf( buf, "%s context %zu", GpuContextNames[(int)v->type], i );
DrawTextContrast( draw, wpos + ImVec2( ty, oldOffset ), showFull ? 0xFFFFAAAA : 0xFF886666, buf );