Merge pull request #31 from mcleary/opencl-support

Add OpenCL trace support
2024-11-10 10:41:50 +00:00 · 2020-06-06 14:56:29 +02:00 · 2020-06-06 14:56:29 +02:00 · 57f1ef05c7
commit 57f1ef05c7
parent 06158de6da a46f83364e
7 changed files with 553 additions and 3 deletions
--- a/1
+++ b/1
@ -7,3 +7,4 @@ Till Rathmann <till.rathmann@gmx.de>                (DLL support)
 Sherief Farouk <sherief.personal@gmail.com>         (compatibility fixes)
 Dedmen Miller <dedmen@dedmen.de>                    (find zone bug fixes, improvements)
 Michał Cichoń <michcic@gmail.com>                   (OSX call stack decoding backport)
+Thales Sabino <thales@codeplay.com>                 (OpenCL support)
--- a/TracyOpenCL.hpp
+++ b/TracyOpenCL.hpp
@ -0,0 +1,333 @@
+#ifndef __TRACYOPENCL_HPP__
+#define __TRACYOPENCL_HPP__
+
+#if !defined TRACY_ENABLE
+
+// Profiling disabled: every macro below expands to nothing (or to a null
+// context). Each one takes exactly the same number of arguments as its
+// counterpart in the TRACY_ENABLE branch, so instrumented code builds
+// unchanged in both configurations.
+#define TracyCLContext(c, x) nullptr
+#define TracyCLDestroy(c)
+
+#define TracyCLNamedZone(c, x, y, z)
+#define TracyCLNamedZoneC(c, x, y, z, w)
+#define TracyCLZone(c, x)
+#define TracyCLZoneC(c, x, y)
+
+#define TracyCLNamedZoneS(c, x, y, z, w)
+#define TracyCLNamedZoneCS(c, x, y, z, w, v)
+#define TracyCLZoneS(c, x, y)
+#define TracyCLZoneCS(c, x, y, z)
+
+#define TracyCLNamedZoneSetEvent(x, e)
+#define TracyCLZoneSetEvent(e)
+
+#define TracyCLCollect(c)
+
+namespace tracy
+{
+    // Empty placeholder so code naming tracy::OpenCLCtxScope still compiles.
+    class OpenCLCtxScope {};
+}
+
+// With profiling disabled the context handle is just an opaque null pointer.
+using TracyCLCtx = void*;
+
+#else
+
+#include <CL/cl.h>
+
+#include <atomic>
+#include <cassert>
+
+#include "Tracy.hpp"
+#include "client/TracyCallstack.hpp"
+#include "client/TracyProfiler.hpp"
+#include "common/TracyAlloc.hpp"
+
+namespace tracy {
+
+    // Which profiling timestamp of an event a query entry stands for.
+    // OpenCLCtx::Collect maps Begin to CL_PROFILING_COMMAND_START and End to
+    // CL_PROFILING_COMMAND_END.
+    enum class EventPhase : uint8_t
+    {
+        Begin,
+        End
+    };
+
+    // One pending timestamp query: the OpenCL event to read from and which of
+    // its two timestamps (start or end) is wanted. Initialised positionally
+    // as EventInfo{ event, phase }, so the member order matters.
+    struct EventInfo
+    {
+        cl_event event;
+        EventPhase phase;
+    };
+
+    // Profiling context for one OpenCL context/device pair.
+    //
+    // Pending timestamp queries live in a fixed-size ring buffer (m_query) that
+    // is filled at m_head by NextQueryId() and drained at m_tail by Collect().
+    // Device timestamps are shifted into the host time base captured in the
+    // constructor before they are sent to the profiler.
+    class OpenCLCtx
+    {
+    public:
+        enum { QueryCount = 64 * 1024 };
+
+        // Allocates a GPU context id, samples a host/device timestamp pair and
+        // announces the new context to the profiler.
+        OpenCLCtx(cl_context context, cl_device_id device)
+            : m_contextId(GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed))
+            , m_head(0)
+            , m_tail(0)
+        {
+            assert(m_contextId != 255);
+
+            m_hostStartTime = Profiler::GetTime();
+            m_deviceStartTime = GetDeviceTimestamp(context, device);
+
+            auto item = Profiler::QueueSerial();
+            MemWrite(&item->hdr.type, QueueType::GpuNewContext);
+            MemWrite(&item->gpuNewContext.cpuTime, m_hostStartTime);
+            // GPU times are reported already shifted to host time (see
+            // TimestampOffset), so the GPU reference equals the host reference.
+            MemWrite(&item->gpuNewContext.gpuTime, m_hostStartTime);
+            memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread));
+            MemWrite(&item->gpuNewContext.period, 1.0f);
+            MemWrite(&item->gpuNewContext.type, GpuContextType::OpenCL);
+            MemWrite(&item->gpuNewContext.context, (uint8_t) m_contextId);
+            MemWrite(&item->gpuNewContext.accuracyBits, (uint8_t)0);
+#ifdef TRACY_ON_DEMAND
+            GetProfiler().DeferItem(*item);
+#endif
+            Profiler::QueueSerialFinish();
+        }
+
+        // Drains the query ring in order, sending one GpuTime item per query.
+        // Stops at the first event that has not completed yet; the remaining
+        // queries are picked up by a later call.
+        void Collect()
+        {
+            ZoneScopedC(Color::Red4);
+
+            if (m_tail == m_head) return;
+
+#ifdef TRACY_ON_DEMAND
+            if (!GetProfiler().IsConnected())
+            {
+                // Nobody is listening: discard the pending queries, but first give
+                // back the event references taken by OpenCLCtxScope::SetEvent so
+                // they are not leaked.
+                for (unsigned int idx = m_tail; idx != m_head; idx = (idx + 1) % QueryCount)
+                {
+                    const EventInfo& pending = m_query[idx];
+                    if (pending.phase == EventPhase::End && pending.event != nullptr)
+                    {
+                        clReleaseEvent(pending.event);
+                    }
+                }
+                m_head = m_tail = 0;
+                return;
+            }
+#endif
+
+            while (m_tail != m_head)
+            {
+                EventInfo eventInfo = m_query[m_tail];
+                cl_event event = eventInfo.event;
+                cl_int eventStatus;
+                cl_int err = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr);
+                assert(err == CL_SUCCESS);
+                if (eventStatus != CL_COMPLETE) return;
+
+                cl_int eventInfoQuery = (eventInfo.phase == EventPhase::Begin)
+                    ? CL_PROFILING_COMMAND_START
+                    : CL_PROFILING_COMMAND_END;
+
+                cl_ulong eventTimeStamp = 0;
+                err = clGetEventProfilingInfo(event, eventInfoQuery, sizeof(cl_ulong), &eventTimeStamp, nullptr);
+                assert(err == CL_SUCCESS);
+                assert(eventTimeStamp != 0);
+
+                auto item = Profiler::QueueSerial();
+                MemWrite(&item->hdr.type, QueueType::GpuTime);
+                MemWrite(&item->gpuTime.gpuTime, TimestampOffset(eventTimeStamp));
+                MemWrite(&item->gpuTime.queryId, (uint16_t)m_tail);
+                // Write the id as uint8_t, like the constructor and OpenCLCtxScope do.
+                MemWrite(&item->gpuTime.context, GetId());
+                Profiler::QueueSerialFinish();
+
+                if (eventInfo.phase == EventPhase::End)
+                {
+                    // Done with the event, so release it. The call must stay outside
+                    // assert(), otherwise it is compiled out when NDEBUG is defined.
+                    err = clReleaseEvent(event);
+                    assert(err == CL_SUCCESS);
+                }
+
+                m_tail = (m_tail + 1) % QueryCount;
+            }
+        }
+
+        // Context id as the 8-bit value written into queue items.
+        tracy_force_inline uint8_t GetId() const
+        {
+            return m_contextId;
+        }
+
+        // Stores eventInfo in the next free ring slot and returns that slot's index.
+        tracy_force_inline unsigned int NextQueryId(EventInfo eventInfo)
+        {
+            const auto id = m_head;
+            m_head = (m_head + 1) % QueryCount;
+            assert(m_head != m_tail);
+            m_query[id] = eventInfo;
+            return id;
+        }
+
+        // Mutable access to a previously reserved query slot.
+        tracy_force_inline EventInfo& GetQuery(unsigned int id)
+        {
+            assert(id < QueryCount);
+            return m_query[id];
+        }
+
+    private:
+        tracy_force_inline int64_t GetHostStartTime() const
+        {
+            return m_hostStartTime;
+        }
+
+        tracy_force_inline int64_t GetDeviceStartTime() const
+        {
+            return m_deviceStartTime;
+        }
+
+        // Shifts a device timestamp into the host time base sampled at construction.
+        tracy_force_inline int64_t TimestampOffset(int64_t deviceTimestamp) const
+        {
+            return m_hostStartTime + (deviceTimestamp - m_deviceStartTime);
+        }
+
+        // Obtains a device timestamp by running a tiny blocking buffer write on a
+        // temporary profiling-enabled queue and reading the END time of its event.
+        // All temporary OpenCL objects are released before returning.
+        tracy_force_inline int64_t GetDeviceTimestamp(cl_context context, cl_device_id device) const
+        {
+            cl_ulong deviceTimestamp = 0;
+            cl_int err = CL_SUCCESS;
+            cl_command_queue queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
+            assert(err == CL_SUCCESS);
+            uint32_t dummyValue = 42;
+            cl_mem dummyBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(uint32_t), nullptr, &err);
+            assert(err == CL_SUCCESS);
+            cl_event writeBufferEvent;
+            err = clEnqueueWriteBuffer(queue, dummyBuffer, CL_TRUE, 0, sizeof(uint32_t), &dummyValue, 0, nullptr, &writeBufferEvent);
+            assert(err == CL_SUCCESS);
+            err = clWaitForEvents(1, &writeBufferEvent);
+            assert(err == CL_SUCCESS);
+            cl_int eventStatus;
+            err = clGetEventInfo(writeBufferEvent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr);
+            assert(err == CL_SUCCESS);
+            assert(eventStatus == CL_COMPLETE);
+            err = clGetEventProfilingInfo(writeBufferEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &deviceTimestamp, nullptr);
+            assert(err == CL_SUCCESS);
+            err = clReleaseEvent(writeBufferEvent);
+            assert(err == CL_SUCCESS);
+            err = clReleaseMemObject(dummyBuffer);
+            assert(err == CL_SUCCESS);
+            err = clReleaseCommandQueue(queue);
+            assert(err == CL_SUCCESS);
+
+            return (int64_t)deviceTimestamp;
+        }
+
+        unsigned int m_contextId;
+
+        EventInfo m_query[QueryCount];
+        unsigned int m_head;
+        unsigned int m_tail;
+
+        int64_t m_hostStartTime;
+        int64_t m_deviceStartTime;
+    };
+
+    // Scoped GPU zone for one OpenCL command.
+    //
+    // The constructor reserves the "begin" query and emits the zone-begin item,
+    // SetEvent() attaches the cl_event whose profiling info provides the
+    // timestamps, and the destructor reserves the "end" query and emits the
+    // zone-end item. An inactive scope does nothing at all.
+    class OpenCLCtxScope {
+    public:
+        tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, const SourceLocationData* srcLoc, bool is_active)
+#ifdef TRACY_ON_DEMAND
+            : m_active(is_active&& GetProfiler().IsConnected())
+#else
+            : m_active(is_active)
+#endif
+            , m_ctx(ctx)
+            , m_event(nullptr)
+            , m_beginQueryId(0)
+        {
+            if (!m_active) return;
+
+            m_beginQueryId = ctx->NextQueryId(EventInfo{ nullptr, EventPhase::Begin });
+
+            auto item = Profiler::QueueSerial();
+            MemWrite(&item->hdr.type, QueueType::GpuZoneBeginSerial);
+            MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
+            MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)srcLoc);
+            MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
+            MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)m_beginQueryId);
+            MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
+            Profiler::QueueSerialFinish();
+        }
+
+        // Same as above, additionally sending a call stack of the given depth.
+        tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, const SourceLocationData* srcLoc, int depth, bool is_active)
+#ifdef TRACY_ON_DEMAND
+            : m_active(is_active&& GetProfiler().IsConnected())
+#else
+            : m_active(is_active)
+#endif
+            , m_ctx(ctx)
+            , m_event(nullptr)
+            , m_beginQueryId(0)
+        {
+            if (!m_active) return;
+
+            m_beginQueryId = ctx->NextQueryId(EventInfo{ nullptr, EventPhase::Begin });
+
+            auto item = Profiler::QueueSerial();
+            MemWrite(&item->hdr.type, QueueType::GpuZoneBeginCallstackSerial);
+            MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
+            MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)srcLoc);
+            MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
+            MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)m_beginQueryId);
+            MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
+            Profiler::QueueSerialFinish();
+
+            GetProfiler().SendCallstack(depth);
+        }
+
+        // Attaches the event to this zone. The event is retained here and released
+        // by OpenCLCtx::Collect once its end timestamp has been read.
+        tracy_force_inline void SetEvent(cl_event event)
+        {
+            // An inactive scope never reserved a begin query, so there is no slot to fill.
+            if (!m_active) return;
+
+            m_event = event;
+            // Keep the call outside assert() so the retain also happens with NDEBUG.
+            cl_int err = clRetainEvent(m_event);
+            assert(err == CL_SUCCESS);
+            (void)err;
+            m_ctx->GetQuery(m_beginQueryId).event = m_event;
+        }
+
+        tracy_force_inline ~OpenCLCtxScope()
+        {
+            // No zone begin was emitted for an inactive scope, so emit no zone end.
+            if (!m_active) return;
+
+            const auto queryId = m_ctx->NextQueryId(EventInfo{ m_event, EventPhase::End });
+
+            auto item = Profiler::QueueSerial();
+            MemWrite(&item->hdr.type, QueueType::GpuZoneEndSerial);
+            MemWrite(&item->gpuZoneEnd.cpuTime, Profiler::GetTime());
+            MemWrite(&item->gpuZoneEnd.thread, GetThreadHandle());
+            MemWrite(&item->gpuZoneEnd.queryId, (uint16_t)queryId);
+            MemWrite(&item->gpuZoneEnd.context, m_ctx->GetId());
+            Profiler::QueueSerialFinish();
+        }
+
+        const bool m_active;
+        OpenCLCtx* m_ctx;
+        cl_event m_event;
+        unsigned int m_beginQueryId;
+    };
+
+    // Builds an OpenCLCtx inside memory obtained from tracy_malloc and returns it.
+    // The result must be handed to DestroyCLContext when no longer needed.
+    static inline OpenCLCtx* CreateCLContext(cl_context context, cl_device_id device)
+    {
+        InitRPMallocThread();
+        void* const storage = tracy_malloc(sizeof(OpenCLCtx));
+        return new (storage) OpenCLCtx(context, device);
+    }
+
+    // Tears down a context made by CreateCLContext: runs the destructor by hand
+    // (the object was placement-constructed) and returns the memory via tracy_free.
+    static inline void DestroyCLContext(OpenCLCtx* ctx)
+    {
+        ctx->~OpenCLCtx();
+        tracy_free(ctx);
+    }
+
+}  // namespace tracy
+
+using TracyCLCtx = tracy::OpenCLCtx*;
+
+// Context creation / destruction. Note that both expansions already end in ';'.
+#define TracyCLContext(context, device) tracy::CreateCLContext(context, device);
+#define TracyCLDestroy(ctx) tracy::DestroyCLContext(ctx);
+// Zone macros. Each one defines a static SourceLocationData for the call site
+// and an OpenCLCtxScope variable (named `varname`, or `__tracy_gpu_zone` for
+// the unnamed variants). When TRACY_CALLSTACK is set, the plain variants also
+// capture a call stack of that depth.
+#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK
+#   define TracyCLNamedZone(ctx, varname, name, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), TRACY_CALLSTACK, active );
+#   define TracyCLNamedZoneC(ctx, varname, name, color, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), TRACY_CALLSTACK, active );
+#   define TracyCLZone(ctx, name) TracyCLNamedZoneS(ctx, __tracy_gpu_zone, name, TRACY_CALLSTACK, true)
+#   define TracyCLZoneC(ctx, name, color) TracyCLNamedZoneCS(ctx, __tracy_gpu_zone, name, color, TRACY_CALLSTACK, true)
+#else
+#   define TracyCLNamedZone(ctx, varname, name, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), active);
+#   define TracyCLNamedZoneC(ctx, varname, name, color, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), active);
+#   define TracyCLZone(ctx, name) TracyCLNamedZone(ctx, __tracy_gpu_zone, name, true)
+#   define TracyCLZoneC(ctx, name, color) TracyCLNamedZoneC(ctx, __tracy_gpu_zone, name, color, true )
+#endif
+
+// "S" variants take an explicit call stack depth. Without TRACY_HAS_CALLSTACK
+// the depth argument is dropped and they forward to the plain variants.
+#ifdef TRACY_HAS_CALLSTACK
+#   define TracyCLNamedZoneS(ctx, varname, name, depth, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), depth, active);
+#   define TracyCLNamedZoneCS(ctx, varname, name, color, depth, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), depth, active);
+#   define TracyCLZoneS(ctx, name, depth) TracyCLNamedZoneS(ctx, __tracy_gpu_zone, name, depth, true)
+#   define TracyCLZoneCS(ctx, name, color, depth) TracyCLNamedZoneCS(ctx, __tracy_gpu_zone, name, color, depth, true)
+#else
+#define TracyCLNamedZoneS(ctx, varname, name, depth, active) TracyCLNamedZone(ctx, varname, name, active)
+#define TracyCLNamedZoneCS(ctx, varname, name, color, depth, active) TracyCLNamedZoneC(ctx, varname, name, color, active)
+#define TracyCLZoneS(ctx, name, depth) TracyCLZone(ctx, name)
+#define TracyCLZoneCS(ctx, name, color, depth) TracyCLZoneC(ctx, name, color)
+#endif
+
+// Attach a cl_event to a named zone / to the zone created by the unnamed
+// TracyCLZone* macros in the current scope.
+#define TracyCLNamedZoneSetEvent(varname, event) (varname).SetEvent(event)
+#define TracyCLZoneSetEvent(event) __tracy_gpu_zone.SetEvent(event)
+
+// Send the timestamps of finished events to the profiler. The argument is
+// parenthesised so that any pointer-valued expression can be passed.
+#define TracyCLCollect(ctx) (ctx)->Collect()
+
+#endif
+
+#endif
--- a/common/TracyQueue.hpp
+++ b/common/TracyQueue.hpp
@ -263,7 +263,8 @@ enum class GpuContextType : uint8_t
 {
    Invalid,
    OpenGl,
-    Vulkan
+    Vulkan,
+    OpenCL
 };

 struct QueueGpuNewContext
--- a/examples/OpenCLVectorAdd/CMakeLists.txt
+++ b/examples/OpenCLVectorAdd/CMakeLists.txt
@ -0,0 +1,14 @@
+# The OpenCL::OpenCL imported target used below is provided by FindOpenCL
+# starting with CMake 3.7.
+cmake_minimum_required(VERSION 3.7)
+
+project(OpenCLVectorAdd)
+
+find_package(OpenCL REQUIRED)
+
+add_executable(OpenCLVectorAdd OpenCLVectorAdd.cpp)
+
+# Tracy client built as a static library with profiling switched on; the
+# PUBLIC definition and include path propagate to the example executable.
+add_library(TracyClient STATIC ../../TracyClient.cpp
+                               ../../TracyOpenCL.hpp)
+target_include_directories(TracyClient PUBLIC ../../)
+target_compile_definitions(TracyClient PUBLIC TRACY_ENABLE=1)
+
+target_link_libraries(OpenCLVectorAdd PUBLIC OpenCL::OpenCL TracyClient)
--- a/examples/OpenCLVectorAdd/OpenCLVectorAdd.cpp
+++ b/examples/OpenCLVectorAdd/OpenCLVectorAdd.cpp
@ -0,0 +1,190 @@
+#include <iostream>
+#include <cassert>
+#include <string>
+#include <vector>
+#include <numeric>
+
+#include <CL/cl.h>
+
+#include <Tracy.hpp>
+#include <TracyOpenCL.hpp>
+
+// Checks the result of an OpenCL call: on anything other than CL_SUCCESS the
+// code is printed to stderr and assert(false) fires.
+// The argument is evaluated exactly once (so a failing call is not issued a
+// second time while printing), and the do/while(0) wrapper makes the macro
+// behave like a single statement, e.g. in front of an `else`.
+#define CL_ASSERT(err)                                                              \
+    do                                                                              \
+    {                                                                               \
+        const cl_int clAssertResult_ = (err);                                       \
+        if (clAssertResult_ != CL_SUCCESS)                                          \
+        {                                                                           \
+            std::cerr << "OpenCL Call Returned " << clAssertResult_ << std::endl;   \
+            assert(false);                                                          \
+        }                                                                           \
+    } while (0)
+
+// OpenCL C source of the vectorAdd kernel: for every work-item index i below N
+// it stores A[i] + B[i] into C[i].
+const char kernelSource[] =
+"   void __kernel vectorAdd(global float* C, global float* A, global float* B, int N)  "
+"   {                                                                                  "
+"       int i = get_global_id(0);                                                      "
+"       if (i < N) {                                                                   "
+"           C[i] = A[i] + B[i];                                                        "
+"       }                                                                              "
+"   }                                                                                  ";
+
+// Example driver: sets up OpenCL on the first platform/device, adds two float
+// vectors on the device ten times while tracing every command with Tracy,
+// verifies the result on the host and releases all OpenCL objects.
+// Returns 0 on success and 1 if no platform is found or the kernel fails to build.
+int main()
+{
+    cl_platform_id platform;
+    cl_device_id device;
+    cl_context context;
+    cl_command_queue commandQueue;
+    cl_kernel vectorAddKernel;
+    cl_program program;
+    cl_int err;
+    cl_mem bufferA, bufferB, bufferC;
+
+    TracyCLCtx tracyCLCtx;
+
+    // Events handed to Tracy. The application keeps its own reference until the
+    // profiling data has been collected and releases them at the end.
+    std::vector<cl_event> submittedEvents;
+
+    {
+        ZoneScopedN("OpenCL Init");
+
+        cl_uint numPlatforms = 0;
+        CL_ASSERT(clGetPlatformIDs(0, nullptr, &numPlatforms));
+
+        if (numPlatforms == 0)
+        {
+            std::cerr << "Cannot find OpenCL platform to run this application" << std::endl;
+            return 1;
+        }
+
+        CL_ASSERT(clGetPlatformIDs(1, &platform, nullptr));
+
+        size_t platformNameBufferSize = 0;
+        CL_ASSERT(clGetPlatformInfo(platform, CL_PLATFORM_NAME, 0, nullptr, &platformNameBufferSize));
+        std::string platformName(platformNameBufferSize, '\0');
+        CL_ASSERT(clGetPlatformInfo(platform, CL_PLATFORM_NAME, platformNameBufferSize, &platformName[0], nullptr));
+
+        std::cout << "OpenCL Platform: " << platformName << std::endl;
+
+        CL_ASSERT(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, nullptr));
+        size_t deviceNameBufferSize = 0;
+        CL_ASSERT(clGetDeviceInfo(device, CL_DEVICE_NAME, 0, nullptr, &deviceNameBufferSize));
+        std::string deviceName(deviceNameBufferSize, '\0');
+        CL_ASSERT(clGetDeviceInfo(device, CL_DEVICE_NAME, deviceNameBufferSize, &deviceName[0], nullptr));
+
+        std::cout << "OpenCL Device: " << deviceName << std::endl;
+
+        err = CL_SUCCESS;
+        context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &err);
+        CL_ASSERT(err);
+
+        // The length passed to OpenCL must not count the terminating NUL.
+        size_t kernelSourceLength = sizeof(kernelSource) - 1;
+        const char* kernelSourceArray = { kernelSource };
+        program = clCreateProgramWithSource(context, 1, &kernelSourceArray, &kernelSourceLength, &err);
+        CL_ASSERT(err);
+
+        if (clBuildProgram(program, 1, &device, nullptr, nullptr, nullptr) != CL_SUCCESS)
+        {
+            size_t programBuildLogBufferSize = 0;
+            CL_ASSERT(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, nullptr, &programBuildLogBufferSize));
+            std::string programBuildLog(programBuildLogBufferSize, '\0');
+            CL_ASSERT(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, programBuildLogBufferSize, &programBuildLog[0], nullptr));
+            std::clog << programBuildLog << std::endl;
+            return 1;
+        }
+
+        vectorAddKernel = clCreateKernel(program, "vectorAdd", &err);
+        CL_ASSERT(err);
+
+        // Profiling must be enabled on the queue for event timestamps to be available.
+        commandQueue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
+        CL_ASSERT(err);
+    }
+
+    tracyCLCtx = TracyCLContext(context, device);
+
+    size_t N = 10 * 1024 * 1024 / sizeof(float); // 10MB of floats
+    std::vector<float> hostA, hostB, hostC;
+
+    {
+        ZoneScopedN("Host Data Init");
+        hostA.resize(N);
+        hostB.resize(N);
+        hostC.resize(N);
+
+        std::iota(std::begin(hostA), std::end(hostA), 0);
+        std::iota(std::begin(hostB), std::end(hostB), 0);
+    }
+
+    {
+        ZoneScopedN("Host to Device Memory Copy");
+
+        bufferA = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), nullptr, &err);
+        CL_ASSERT(err);
+        bufferB = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), nullptr, &err);
+        CL_ASSERT(err);
+        bufferC = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), nullptr, &err);
+        CL_ASSERT(err);
+
+        cl_event writeBufferAEvent, writeBufferBEvent;
+        {
+            ZoneScopedN("Write Buffer A");
+            TracyCLZoneS(tracyCLCtx, "Write BufferA", 5);
+
+            CL_ASSERT(clEnqueueWriteBuffer(commandQueue, bufferA, CL_TRUE, 0, N * sizeof(float), hostA.data(), 0, nullptr, &writeBufferAEvent));
+
+            TracyCLZoneSetEvent(writeBufferAEvent);
+            submittedEvents.push_back(writeBufferAEvent);
+        }
+        {
+            ZoneScopedN("Write Buffer B");
+            TracyCLZone(tracyCLCtx, "Write BufferB");
+
+            CL_ASSERT(clEnqueueWriteBuffer(commandQueue, bufferB, CL_TRUE, 0, N * sizeof(float), hostB.data(), 0, nullptr, &writeBufferBEvent));
+
+            TracyCLZoneSetEvent(writeBufferBEvent);
+            submittedEvents.push_back(writeBufferBEvent);
+        }
+    }
+
+    // clSetKernelArg needs the address of an object, so the element count gets a
+    // named variable of the kernel's argument type (`int N` in the kernel source).
+    const cl_int elementCount = static_cast<cl_int>(N);
+
+    for (int i = 0; i < 10; ++i)
+    {
+        ZoneScopedN("VectorAdd Kernel Launch");
+        TracyCLZoneC(tracyCLCtx, "VectorAdd Kernel", tracy::Color::Blue4);
+
+        CL_ASSERT(clSetKernelArg(vectorAddKernel, 0, sizeof(cl_mem), &bufferC));
+        CL_ASSERT(clSetKernelArg(vectorAddKernel, 1, sizeof(cl_mem), &bufferA));
+        CL_ASSERT(clSetKernelArg(vectorAddKernel, 2, sizeof(cl_mem), &bufferB));
+        CL_ASSERT(clSetKernelArg(vectorAddKernel, 3, sizeof(cl_int), &elementCount));
+
+        cl_event vectorAddKernelEvent;
+        CL_ASSERT(clEnqueueNDRangeKernel(commandQueue, vectorAddKernel, 1, nullptr, &N, nullptr, 0, nullptr, &vectorAddKernelEvent));
+
+        CL_ASSERT(clWaitForEvents(1, &vectorAddKernelEvent));
+
+        TracyCLZoneSetEvent(vectorAddKernelEvent);
+        submittedEvents.push_back(vectorAddKernelEvent);
+
+        cl_ulong kernelStartTime, kernelEndTime;
+        CL_ASSERT(clGetEventProfilingInfo(vectorAddKernelEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernelStartTime, nullptr));
+        CL_ASSERT(clGetEventProfilingInfo(vectorAddKernelEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernelEndTime, nullptr));
+        std::cout << "VectorAdd Kernel Elapsed: " << ((kernelEndTime - kernelStartTime) / 1000) << " us" << std::endl;
+    }
+
+    {
+        ZoneScopedN("Device to Host Memory Copy");
+        TracyCLZone(tracyCLCtx, "Read Buffer C");
+
+        cl_event readbufferCEvent;
+        CL_ASSERT(clEnqueueReadBuffer(commandQueue, bufferC, CL_TRUE, 0, N * sizeof(float), hostC.data(), 0, nullptr, &readbufferCEvent));
+        TracyCLZoneSetEvent(readbufferCEvent);
+        submittedEvents.push_back(readbufferCEvent);
+    }
+
+    CL_ASSERT(clFinish(commandQueue));
+
+    // Everything has finished on the device, so all event timestamps are available.
+    TracyCLCollect(tracyCLCtx);
+
+    {
+        ZoneScopedN("Checking results");
+
+        for (size_t i = 0; i < N; ++i)
+        {
+            assert(hostC[i] == hostA[i] + hostB[i]);
+        }
+    }
+
+    std::cout << "Results are correct!" << std::endl;
+
+    TracyCLDestroy(tracyCLCtx);
+
+    // Release the application's references only now, after profiling data has
+    // been collected from the events.
+    for (cl_event submittedEvent : submittedEvents)
+    {
+        CL_ASSERT(clReleaseEvent(submittedEvent));
+    }
+    CL_ASSERT(clReleaseMemObject(bufferA));
+    CL_ASSERT(clReleaseMemObject(bufferB));
+    CL_ASSERT(clReleaseMemObject(bufferC));
+    CL_ASSERT(clReleaseKernel(vectorAddKernel));
+    CL_ASSERT(clReleaseProgram(program));
+    CL_ASSERT(clReleaseCommandQueue(commandQueue));
+    CL_ASSERT(clReleaseContext(context));
+
+    return 0;
+}
--- a/manual/tracy.tex
+++ b/manual/tracy.tex
@ -1197,6 +1197,16 @@ To mark a GPU zone use the \texttt{TracyVkZone(ctx, cmdbuf, name)} macro, where

 You also need to periodically collect the GPU events using the \texttt{TracyVkCollect(ctx, cmdbuf)} macro\footnote{It is considerably faster than the OpenGL's \texttt{TracyGpuCollect}.}. The provided command buffer must be in the recording state and outside of a render pass instance.

+\subsubsection{OpenCL}
+
+OpenCL support is achieved by including the \texttt{tracy/TracyOpenCL.hpp} header file. Tracing OpenCL requires the creation of a Tracy OpenCL context using the macro \texttt{TracyCLContext(context, device)}, which will return an instance of \texttt{TracyCLCtx} object that must be used when creating zones. The specified \texttt{device} must be part of the \texttt{context}. Cleanup is performed using the \texttt{TracyCLDestroy(ctx)} macro. Although not common, it is possible to create multiple OpenCL contexts for the same application.
+
+To mark an OpenCL zone one must make sure that a valid OpenCL \texttt{cl\_event} object is available. The event will be the object that Tracy will use to query profiling information from the OpenCL driver. For this to work, all OpenCL queues must be created with the \texttt{CL\_QUEUE\_PROFILING\_ENABLE} property.
+
+OpenCL zones can be created with the \texttt{TracyCLZone(ctx, name)} macro, where \texttt{name} will usually be a descriptive name for the operation represented by the \texttt{cl\_event}. Within the scope of the zone, you must call \texttt{TracyCLZoneSetEvent(event)} for the event to be registered in Tracy.
+
+Similarly to Vulkan and OpenGL, you also need to periodically collect the OpenCL events using the \texttt{TracyCLCollect(ctx)} macro. A good place to perform this operation is after a \texttt{clFinish}, since this will ensure that any previous queued OpenCL commands will have finished by this point. 
+
 \subsubsection{Multiple zones in one scope}

 Putting more than one GPU zone macro in a single scope features the same issue as with the \texttt{ZoneScoped} macros, described in section~\ref{multizone} (but this time the variable name is \texttt{\_\_\_tracy\_gpu\_zone}).
--- a/server/TracyView.cpp
+++ b/server/TracyView.cpp
@ -75,7 +75,8 @@ constexpr const char* s_tracyStackFrames[] = {
 constexpr const char* GpuContextNames[] = {
    "Invalid",
    "OpenGL",
-    "Vulkan"
+    "Vulkan",
+    "OpenCL"
 };


@ -2472,7 +2473,7 @@ void View::DrawZones()
                    draw->AddTriangle( wpos + ImVec2( to/2, oldOffset + to/2 ), wpos + ImVec2( to/2, oldOffset + ty - to/2 ), wpos + ImVec2( to/2 + th, oldOffset + ty * 0.5 ), 0xFF886666, 2.0f );
                }

-                const bool isMultithreaded = v->type == GpuContextType::Vulkan;
+                const bool isMultithreaded = (v->type == GpuContextType::Vulkan) || (v->type == GpuContextType::OpenCL);
                char buf[64];
                sprintf( buf, "%s context %zu", GpuContextNames[(int)v->type], i );
                DrawTextContrast( draw, wpos + ImVec2( ty, oldOffset ), showFull ? 0xFFFFAAAA : 0xFF886666, buf );