From a46f83364e71bb2ae7e9b54d3029858cd8c8f815 Mon Sep 17 00:00:00 2001
From: Thales Sabino <thales@codeplay.com>
Date: Wed, 27 May 2020 16:26:52 +0100
Subject: [PATCH] Add OpenCL trace support

- Adds the file TracyOpenCL.hpp which contains the API to annotate OpenCL applications
- It works in a similar fashion to the Vulkan annotations
- Adds an example OpenCL application in examples/OpenCLVectorAdd
- Adds "OpenCL Context" to the UI
- Manual entry for annotating OpenCL zones
---
 AUTHORS                                      |   1 +
 TracyOpenCL.hpp                              | 333 +++++++++++++++++++
 common/TracyQueue.hpp                        |   3 +-
 examples/OpenCLVectorAdd/CMakeLists.txt      |  14 +
 examples/OpenCLVectorAdd/OpenCLVectorAdd.cpp | 190 +++++++++++
 manual/tracy.tex                             |  10 +
 server/TracyView.cpp                         |   5 +-
 7 files changed, 553 insertions(+), 3 deletions(-)
 create mode 100644 TracyOpenCL.hpp
 create mode 100644 examples/OpenCLVectorAdd/CMakeLists.txt
 create mode 100644 examples/OpenCLVectorAdd/OpenCLVectorAdd.cpp
diff --git a/AUTHORS b/AUTHORS
index 2719cf57..ca3b476e 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -7,3 +7,4 @@ Till Rathmann <till.rathmann@gmx.de>                (DLL support)
 Sherief Farouk <sherief.personal@gmail.com>         (compatibility fixes)
 Dedmen Miller <dedmen@dedmen.de>                    (find zone bug fixes, improvements)
 Michał Cichoń <michcic@gmail.com>                   (OSX call stack decoding backport)
+Thales Sabino <thales@codeplay.com>                 (OpenCL support)
diff --git a/TracyOpenCL.hpp b/TracyOpenCL.hpp
new file mode 100644
index 00000000..77c7de98
--- /dev/null
+++ b/TracyOpenCL.hpp
@@ -0,0 +1,333 @@
+#ifndef __TRACYOPENCL_HPP__
+#define __TRACYOPENCL_HPP__
+
+#if !defined TRACY_ENABLE
+
+#define TracyCLContext(x, y) nullptr
+#define TracyCLDestroy(x)
+#define TracyCLNamedZone(c, x, y, z, w)
+#define TracyCLNamedZoneC(c, x, y, z, w, a)
+#define TracyCLZone(c, x, y)
+#define TracyCLZoneC(c, x, y, z)
+#define TracyCLCollect(c)
+
+#define TracyCLNamedZoneS(c, x, y, z, w, a)
+#define TracyCLNamedZoneCS(c, x, y, z, w, v, a)
+#define TracyCLZoneS(c, x, y, z)
+#define TracyCLZoneCS(c, x, y, z, w)
+
+namespace tracy
+{
+    class OpenCLCtxScope {};
+}
+
+using TracyCLCtx = void*;
+
+#else
+
+#include <CL/cl.h>
+
+#include <atomic>
+#include <cassert>
+
+#include "Tracy.hpp"
+#include "client/TracyCallstack.hpp"
+#include "client/TracyProfiler.hpp"
+#include "common/TracyAlloc.hpp"
+
+namespace tracy {
+
+    enum class EventPhase : uint8_t
+    {
+        Begin,
+        End
+    };
+
+    struct EventInfo
+    {
+        cl_event event;
+        EventPhase phase;
+    };
+
+    class OpenCLCtx
+    {
+    public:
+        enum { QueryCount = 64 * 1024 };
+
+        OpenCLCtx(cl_context context, cl_device_id device)
+            : m_contextId(GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed))
+            , m_head(0)
+            , m_tail(0)
+        {
+            assert(m_contextId != 255);
+
+            m_hostStartTime = Profiler::GetTime();
+            m_deviceStartTime = GetDeviceTimestamp(context, device);
+
+            auto item = Profiler::QueueSerial();
+            MemWrite(&item->hdr.type, QueueType::GpuNewContext);
+            MemWrite(&item->gpuNewContext.cpuTime, m_hostStartTime);
+            MemWrite(&item->gpuNewContext.gpuTime, m_hostStartTime);
+            memset(&item->gpuNewContext.thread, 0, sizeof(item->gpuNewContext.thread));
+            MemWrite(&item->gpuNewContext.period, 1.0f);
+            MemWrite(&item->gpuNewContext.type, GpuContextType::OpenCL);
+            MemWrite(&item->gpuNewContext.context, (uint8_t) m_contextId);
+            MemWrite(&item->gpuNewContext.accuracyBits, (uint8_t)0);
+#ifdef TRACY_ON_DEMAND
+            GetProfiler().DeferItem(*item);
+#endif
+            Profiler::QueueSerialFinish();
+        }
+
+        void Collect()
+        {
+            ZoneScopedC(Color::Red4);
+
+            if (m_tail == m_head) return;
+
+#ifdef TRACY_ON_DEMAND
+            if (!GetProfiler().IsConnected())
+            {
+                m_head = m_tail = 0;
+            }
+#endif
+
+            while (m_tail != m_head)
+            {
+                EventInfo eventInfo = m_query[m_tail];
+                cl_event event = eventInfo.event;
+                cl_int eventStatus;
+                cl_int err = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr);
+                assert(err == CL_SUCCESS);
+                if (eventStatus != CL_COMPLETE) return;
+
+                cl_int eventInfoQuery = (eventInfo.phase == EventPhase::Begin)
+                    ? CL_PROFILING_COMMAND_START
+                    : CL_PROFILING_COMMAND_END;
+
+                cl_ulong eventTimeStamp = 0;
+                err = clGetEventProfilingInfo(event, eventInfoQuery, sizeof(cl_ulong), &eventTimeStamp, nullptr);
+                assert(err == CL_SUCCESS);
+                assert(eventTimeStamp != 0);
+
+                auto item = Profiler::QueueSerial();
+                MemWrite(&item->hdr.type, QueueType::GpuTime);
+                MemWrite(&item->gpuTime.gpuTime, TimestampOffset(eventTimeStamp));
+                MemWrite(&item->gpuTime.queryId, (uint16_t)m_tail);
+                MemWrite(&item->gpuTime.context, m_contextId);
+                Profiler::QueueSerialFinish();
+
+                if (eventInfo.phase == EventPhase::End)
+                {
+                    // Done with the event, so release it
+                    assert(clReleaseEvent(event) == CL_SUCCESS);
+                }
+
+                m_tail = (m_tail + 1) % QueryCount;
+            }
+        }
+
+        tracy_force_inline uint8_t GetId() const
+        {
+            return m_contextId;
+        }
+
+        tracy_force_inline unsigned int NextQueryId(EventInfo eventInfo)
+        {
+            const auto id = m_head;
+            m_head = (m_head + 1) % QueryCount;
+            assert(m_head != m_tail);
+            m_query[id] = eventInfo;
+            return id;
+        }
+
+        tracy_force_inline EventInfo& GetQuery(unsigned int id)
+        {
+            assert(id < QueryCount);
+            return m_query[id];
+        }
+
+    private:
+        tracy_force_inline int64_t GetHostStartTime() const
+        {
+            return m_hostStartTime;
+        }
+
+        tracy_force_inline int64_t GetDeviceStartTime() const
+        {
+            return m_deviceStartTime;
+        }
+
+        tracy_force_inline int64_t TimestampOffset(int64_t deviceTimestamp) const
+        {
+            return m_hostStartTime + (deviceTimestamp - m_deviceStartTime);
+        }
+
+        tracy_force_inline int64_t GetDeviceTimestamp(cl_context context, cl_device_id device) const
+        {
+            cl_ulong deviceTimestamp = 0;
+            cl_int err = CL_SUCCESS;
+            cl_command_queue queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
+            assert(err == CL_SUCCESS);
+            uint32_t dummyValue = 42;
+            cl_mem dummyBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(uint32_t), nullptr, &err);
+            assert(err == CL_SUCCESS);
+            cl_event writeBufferEvent;
+            err = clEnqueueWriteBuffer(queue, dummyBuffer, CL_TRUE, 0, sizeof(uint32_t), &dummyValue, 0, nullptr, &writeBufferEvent);
+            assert(err == CL_SUCCESS);
+            err = clWaitForEvents(1, &writeBufferEvent);
+            assert(err == CL_SUCCESS);
+            cl_int eventStatus;
+            err = clGetEventInfo(writeBufferEvent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr);
+            assert(err == CL_SUCCESS);
+            assert(eventStatus == CL_COMPLETE);
+            err = clGetEventProfilingInfo(writeBufferEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &deviceTimestamp, nullptr);
+            assert(err == CL_SUCCESS);
+            err = clReleaseEvent(writeBufferEvent);
+            assert(err == CL_SUCCESS);
+            err = clReleaseMemObject(dummyBuffer);
+            assert(err == CL_SUCCESS);
+            err = clReleaseCommandQueue(queue);
+            assert(err == CL_SUCCESS);
+
+            return (int64_t)deviceTimestamp;
+        }
+
+        unsigned int m_contextId;
+
+        EventInfo m_query[QueryCount];
+        unsigned int m_head;
+        unsigned int m_tail;
+
+        int64_t m_hostStartTime;
+        int64_t m_deviceStartTime;
+    };
+
+    class OpenCLCtxScope {
+    public:
+        tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, const SourceLocationData* srcLoc, bool is_active)
+#ifdef TRACY_ON_DEMAND
+            : m_active(is_active&& GetProfiler().IsConnected())
+#else
+            : m_active(is_active)
+#endif
+            , m_ctx(ctx)
+            , m_event(nullptr)
+        {
+            if (!m_active) return;
+
+            m_beginQueryId = ctx->NextQueryId(EventInfo{ nullptr, EventPhase::Begin });
+
+            auto item = Profiler::QueueSerial();
+            MemWrite(&item->hdr.type, QueueType::GpuZoneBeginSerial);
+            MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
+            MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)srcLoc);
+            MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
+            MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)m_beginQueryId);
+            MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
+            Profiler::QueueSerialFinish();
+        }
+
+        tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, const SourceLocationData* srcLoc, int depth, bool is_active)
+#ifdef TRACY_ON_DEMAND
+            : m_active(is_active&& GetProfiler().IsConnected())
+#else
+            : m_active(is_active)
+#endif
+            , m_ctx(ctx)
+            , m_event(nullptr)
+        {
+            if (!m_active) return;
+
+            m_beginQueryId = ctx->NextQueryId(EventInfo{ nullptr, EventPhase::Begin });
+
+            auto item = Profiler::QueueSerial();
+            MemWrite(&item->hdr.type, QueueType::GpuZoneBeginCallstackSerial);
+            MemWrite(&item->gpuZoneBegin.cpuTime, Profiler::GetTime());
+            MemWrite(&item->gpuZoneBegin.srcloc, (uint64_t)srcLoc);
+            MemWrite(&item->gpuZoneBegin.thread, GetThreadHandle());
+            MemWrite(&item->gpuZoneBegin.queryId, (uint16_t)m_beginQueryId);
+            MemWrite(&item->gpuZoneBegin.context, ctx->GetId());
+            Profiler::QueueSerialFinish();
+
+            GetProfiler().SendCallstack(depth);
+        }
+
+        tracy_force_inline void SetEvent(cl_event event)
+        {
+            m_event = event;
+            assert(clRetainEvent(m_event) == CL_SUCCESS);
+            m_ctx->GetQuery(m_beginQueryId).event = m_event;
+        }
+
+        tracy_force_inline ~OpenCLCtxScope()
+        {
+            const auto queryId = m_ctx->NextQueryId(EventInfo{ m_event, EventPhase::End });
+
+            auto item = Profiler::QueueSerial();
+            MemWrite(&item->hdr.type, QueueType::GpuZoneEndSerial);
+            MemWrite(&item->gpuZoneEnd.cpuTime, Profiler::GetTime());
+            MemWrite(&item->gpuZoneEnd.thread, GetThreadHandle());
+            MemWrite(&item->gpuZoneEnd.queryId, (uint16_t)queryId);
+            MemWrite(&item->gpuZoneEnd.context, m_ctx->GetId());
+            Profiler::QueueSerialFinish();
+        }
+
+        const bool m_active;
+        OpenCLCtx* m_ctx;
+        cl_event m_event;
+        unsigned int m_beginQueryId;
+    };
+
+    static inline OpenCLCtx* CreateCLContext(cl_context context, cl_device_id device)
+    {
+        InitRPMallocThread();
+        auto ctx = (OpenCLCtx*)tracy_malloc(sizeof(OpenCLCtx));
+        new (ctx) OpenCLCtx(context, device);
+        return ctx;
+    }
+
+    static inline void DestroyCLContext(OpenCLCtx* ctx)
+    {
+        ctx->~OpenCLCtx();
+        tracy_free(ctx);
+    }
+
+}  // namespace tracy
+
+using TracyCLCtx = tracy::OpenCLCtx*;
+
+#define TracyCLContext(context, device) tracy::CreateCLContext(context, device);
+#define TracyCLDestroy(ctx) tracy::DestroyCLContext(ctx);
+#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK
+#   define TracyCLNamedZone(ctx, varname, name, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), TRACY_CALLSTACK, active );
+#   define TracyCLNamedZoneC(ctx, varname, name, color, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__) { name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), TRACY_CALLSTACK, active );
+#   define TracyCLZone(ctx, name) TracyCLNamedZoneS(ctx, __tracy_gpu_zone, name, TRACY_CALLSTACK, true)
+#   define TracyCLZoneC(ctx, name, color) TracyCLNamedZoneCS(ctx, __tracy_gpu_zone, name, color, TRACY_CALLSTACK, true)
+#else
+#   define TracyCLNamedZone(ctx, varname, name, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), active);
+#   define TracyCLNamedZoneC(ctx, varname, name, color, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), active);
+#   define TracyCLZone(ctx, name) TracyCLNamedZone(ctx, __tracy_gpu_zone, name, true)
+#   define TracyCLZoneC(ctx, name, color) TracyCLNamedZoneC(ctx, __tracy_gpu_zone, name, color, true )
+#endif
+
+#ifdef TRACY_HAS_CALLSTACK
+#   define TracyCLNamedZoneS(ctx, varname, name, depth, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, 0 }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), depth, active);
+#   define TracyCLNamedZoneCS(ctx, varname, name, color, depth, active) static const tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,__LINE__){ name, __FUNCTION__, __FILE__, (uint32_t)__LINE__, color }; tracy::OpenCLCtxScope varname(ctx, &TracyConcat(__tracy_gpu_source_location,__LINE__), depth, active);
+#   define TracyCLZoneS(ctx, name, depth) TracyCLNamedZoneS(ctx, __tracy_gpu_zone, name, depth, true)
+#   define TracyCLZoneCS(ctx, name, color, depth) TracyCLNamedZoneCS(ctx, __tracy_gpu_zone, name, color, depth, true)
+#else
+#define TracyCLNamedZoneS(ctx, varname, name, depth, active) TracyCLNamedZone(ctx, varname, name, active)
+#define TracyCLNamedZoneCS(ctx, varname, name, color, depth, active) TracyCLNamedZoneC(ctx, varname, name, color, active)
+#define TracyCLZoneS(ctx, name, depth) TracyCLZone(ctx, name)
+#define TracyCLZoneCS(ctx, name, color, depth) TracyCLZoneC(ctx, name, color)
+#endif
+
+#define TracyCLNamedZoneSetEvent(varname, event) varname.SetEvent(event)
+#define TracyCLZoneSetEvent(event) __tracy_gpu_zone.SetEvent(event)
+
+#define TracyCLCollect(ctx) ctx->Collect()
+
+#endif
+
+#endif
diff --git a/common/TracyQueue.hpp b/common/TracyQueue.hpp
index f791410b..9257f5a6 100644
--- a/common/TracyQueue.hpp
+++ b/common/TracyQueue.hpp
@@ -263,7 +263,8 @@ enum class GpuContextType : uint8_t
 {
     Invalid,
     OpenGl,
-    Vulkan
+    Vulkan,
+    OpenCL
 };
 
 struct QueueGpuNewContext
diff --git a/examples/OpenCLVectorAdd/CMakeLists.txt b/examples/OpenCLVectorAdd/CMakeLists.txt
new file mode 100644
index 00000000..e5b0bfca
--- /dev/null
+++ b/examples/OpenCLVectorAdd/CMakeLists.txt
@@ -0,0 +1,14 @@
+cmake_minimum_required(VERSION 3.0)
+
+project(OpenCLVectorAdd)
+
+find_package(OpenCL REQUIRED)
+
+add_executable(OpenCLVectorAdd OpenCLVectorAdd.cpp)
+
+add_library(TracyClient STATIC ../../TracyClient.cpp
+                               ../../TracyOpenCL.hpp)
+target_include_directories(TracyClient PUBLIC ../../)
+target_compile_definitions(TracyClient PUBLIC TRACY_ENABLE=1)
+
+target_link_libraries(OpenCLVectorAdd PUBLIC OpenCL::OpenCL TracyClient)
diff --git a/examples/OpenCLVectorAdd/OpenCLVectorAdd.cpp b/examples/OpenCLVectorAdd/OpenCLVectorAdd.cpp
new file mode 100644
index 00000000..d499424f
--- /dev/null
+++ b/examples/OpenCLVectorAdd/OpenCLVectorAdd.cpp
@@ -0,0 +1,190 @@
+#include <iostream>
+#include <cassert>
+#include <string>
+#include <vector>
+#include <numeric>
+
+#include <CL/cl.h>
+
+#include <Tracy.hpp>
+#include <TracyOpenCL.hpp>
+
+#define CL_ASSERT(err)                                              \
+    if((err) != CL_SUCCESS)                                         \
+    {                                                               \
+        std::cerr << "OpenCL Call Returned " << err << std::endl;   \
+        assert(false);                                              \
+    }
+
+const char kernelSource[] =
+"   void __kernel vectorAdd(global float* C, global float* A, global float* B, int N)  "
+"   {                                                                                  "
+"       int i = get_global_id(0);                                                      "
+"       if (i < N) {                                                                   "
+"           C[i] = A[i] + B[i];                                                        "
+"       }                                                                              "
+"   }                                                                                  ";
+
+int main()
+{
+    cl_platform_id platform;
+    cl_device_id device;
+    cl_context context;
+    cl_command_queue commandQueue;
+    cl_kernel vectorAddKernel;
+    cl_program program;
+    cl_int err;
+    cl_mem bufferA, bufferB, bufferC;
+
+    TracyCLCtx tracyCLCtx;
+
+    {
+        ZoneScopedN("OpenCL Init");
+
+        cl_uint numPlatforms = 0;
+        CL_ASSERT(clGetPlatformIDs(0, nullptr, &numPlatforms));
+
+        if (numPlatforms == 0)
+        {
+            std::cerr << "Cannot find OpenCL platform to run this application" << std::endl;
+            return 1;
+        }
+
+        CL_ASSERT(clGetPlatformIDs(1, &platform, nullptr));
+
+        size_t platformNameBufferSize = 0;
+        CL_ASSERT(clGetPlatformInfo(platform, CL_PLATFORM_NAME, 0, nullptr, &platformNameBufferSize));
+        std::string platformName(platformNameBufferSize, '\0');
+        CL_ASSERT(clGetPlatformInfo(platform, CL_PLATFORM_NAME, platformNameBufferSize, &platformName[0], nullptr));
+
+        std::cout << "OpenCL Platform: " << platformName << std::endl;
+
+        CL_ASSERT(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, nullptr));
+        size_t deviceNameBufferSize = 0;
+        CL_ASSERT(clGetDeviceInfo(device, CL_DEVICE_NAME, 0, nullptr, &deviceNameBufferSize));
+        std::string deviceName(deviceNameBufferSize, '\0');
+        CL_ASSERT(clGetDeviceInfo(device, CL_DEVICE_NAME, deviceNameBufferSize, &deviceName[0], nullptr));
+
+        std::cout << "OpenCL Device: " << deviceName << std::endl;
+
+        err = CL_SUCCESS;
+        context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &err);
+        CL_ASSERT(err);
+
+        size_t kernelSourceLength = sizeof(kernelSource);
+        const char* kernelSourceArray = { kernelSource };
+        program = clCreateProgramWithSource(context, 1, &kernelSourceArray, &kernelSourceLength, &err);
+        CL_ASSERT(err);
+
+        if (clBuildProgram(program, 1, &device, nullptr, nullptr, nullptr) != CL_SUCCESS)
+        {
+            size_t programBuildLogBufferSize = 0;
+            CL_ASSERT(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, nullptr, &programBuildLogBufferSize));
+            std::string programBuildLog(programBuildLogBufferSize, '\0');
+            CL_ASSERT(clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, programBuildLogBufferSize, &programBuildLog[0], nullptr));
+            std::clog << programBuildLog << std::endl;
+            return 1;
+        }
+
+        vectorAddKernel = clCreateKernel(program, "vectorAdd", &err);
+        CL_ASSERT(err);
+
+        commandQueue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
+        CL_ASSERT(err);
+    }
+
+    tracyCLCtx = TracyCLContext(context, device);
+
+    size_t N = 10 * 1024 * 1024 / sizeof(float); // 10MB of floats
+    std::vector<float> hostA, hostB, hostC;
+
+    {
+        ZoneScopedN("Host Data Init");
+        hostA.resize(N);
+        hostB.resize(N);
+        hostC.resize(N);
+
+        std::iota(std::begin(hostA), std::end(hostA), 0);
+        std::iota(std::begin(hostB), std::end(hostB), 0);
+    }
+
+    {
+        ZoneScopedN("Host to Device Memory Copy");
+
+        bufferA = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), nullptr, &err);
+        CL_ASSERT(err);
+        bufferB = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), nullptr, &err);
+        CL_ASSERT(err);
+        bufferC = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), nullptr, &err);
+        CL_ASSERT(err);
+
+        cl_event writeBufferAEvent, writeBufferBEvent;
+        {
+            ZoneScopedN("Write Buffer A");
+            TracyCLZoneS(tracyCLCtx, "Write BufferA", 5);
+
+            CL_ASSERT(clEnqueueWriteBuffer(commandQueue, bufferA, CL_TRUE, 0, N * sizeof(float), hostA.data(), 0, nullptr, &writeBufferAEvent));
+
+            TracyCLZoneSetEvent(writeBufferAEvent);
+        }
+        {
+            ZoneScopedN("Write Buffer B");
+            TracyCLZone(tracyCLCtx, "Write BufferB");
+
+            CL_ASSERT(clEnqueueWriteBuffer(commandQueue, bufferB, CL_TRUE, 0, N * sizeof(float), hostB.data(), 0, nullptr, &writeBufferBEvent));
+
+            TracyCLZoneSetEvent(writeBufferBEvent);
+        }
+    }
+
+    for (int i = 0; i < 10; ++i)
+    {
+        ZoneScopedN("VectorAdd Kernel Launch");
+        TracyCLZoneC(tracyCLCtx, "VectorAdd Kernel", tracy::Color::Blue4);
+
+        CL_ASSERT(clSetKernelArg(vectorAddKernel, 0, sizeof(cl_mem), &bufferC));
+        CL_ASSERT(clSetKernelArg(vectorAddKernel, 1, sizeof(cl_mem), &bufferA));
+        CL_ASSERT(clSetKernelArg(vectorAddKernel, 2, sizeof(cl_mem), &bufferB));
+        CL_ASSERT(clSetKernelArg(vectorAddKernel, 3, sizeof(int), &static_cast<int>(N)));
+
+        cl_event vectorAddKernelEvent;
+        CL_ASSERT(clEnqueueNDRangeKernel(commandQueue, vectorAddKernel, 1, nullptr, &N, nullptr, 0, nullptr, &vectorAddKernelEvent));
+
+        CL_ASSERT(clWaitForEvents(1, &vectorAddKernelEvent));
+
+        TracyCLZoneSetEvent(vectorAddKernelEvent);
+
+        cl_ulong kernelStartTime, kernelEndTime;
+        CL_ASSERT(clGetEventProfilingInfo(vectorAddKernelEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernelStartTime, nullptr));
+        CL_ASSERT(clGetEventProfilingInfo(vectorAddKernelEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernelEndTime, nullptr));
+        std::cout << "VectorAdd Kernel Elapsed: " << ((kernelEndTime - kernelStartTime) / 1000) << " us" << std::endl;
+    }
+
+    {
+        ZoneScopedN("Device to Host Memory Copy");
+        TracyCLZone(tracyCLCtx, "Read Buffer C");
+
+        cl_event readbufferCEvent;
+        CL_ASSERT(clEnqueueReadBuffer(commandQueue, bufferC, CL_TRUE, 0, N * sizeof(float), hostC.data(), 0, nullptr, &readbufferCEvent));
+        TracyCLZoneSetEvent(readbufferCEvent);
+    }
+
+    CL_ASSERT(clFinish(commandQueue));
+
+    TracyCLCollect(tracyCLCtx);
+
+    {
+        ZoneScopedN("Checking results");
+
+        for (int i = 0; i < N; ++i)
+        {
+            assert(hostC[i] == hostA[i] + hostB[i]);
+        }
+    }
+
+    std::cout << "Results are correct!" << std::endl;
+
+    TracyCLDestroy(tracyCLCtx);
+
+    return 0;
+}
diff --git a/manual/tracy.tex b/manual/tracy.tex
index 6fe40bf5..b53ad6a3 100644
--- a/manual/tracy.tex
+++ b/manual/tracy.tex
@@ -1197,6 +1197,16 @@ To mark a GPU zone use the \texttt{TracyVkZone(ctx, cmdbuf, name)} macro, where
 
 You also need to periodically collect the GPU events using the \texttt{TracyVkCollect(ctx, cmdbuf)} macro\footnote{It is considerably faster than the OpenGL's \texttt{TracyGpuCollect}.}. The provided command buffer must be in the recording state and outside of a render pass instance.
 
+\subsubsection{OpenCL}
+
+OpenCL support is achieved by including the \texttt{tracy/TracyOpenCL.hpp} header file. Tracing OpenCL requires the creation of a Tracy OpenCL context using the macro \texttt{TracyCLContext(context, device)}, which will return an instance of \texttt{TracyCLCtx} object that must be used when creating zones. The specified \texttt{device} must be part of the \texttt{context}. Cleanup is performed using the \texttt{TracyCLDestroy(ctx)} macro. Although not common, it is possible to create multiple OpenCL contexts for the same application.
+
+To mark an OpenCL zone one must make sure that a valid OpenCL \texttt{cl\_event} object is available. The event will be the object that Tracy will use to query profiling information from the OpenCL driver. For this to work, all OpenCL queues must be created with the \texttt{CL\_QUEUE\_PROFILING\_ENABLE} property.
+
+OpenCL zones can be created with the \texttt{TracyCLZone(ctx, name)} where \texttt{name} will usually be a descriptive name for the operation represented by the \texttt{cl\_event}. Within the scope of the zone, you must call \texttt{TracyCLSetEvent(event)} for the event to be registered in Tracy.
+
+Similarly to Vulkan and OpenGL, you also need to periodically collect the OpenCL events using the \texttt{TracyCLCollect(ctx)} macro. A good place to perform this operation is after a \texttt{clFinish}, since this will ensure that any previous queued OpenCL commands will have finished by this point. 
+
 \subsubsection{Multiple zones in one scope}
 
 Putting more than one GPU zone macro in a single scope features the same issue as with the \texttt{ZoneScoped} macros, described in section~\ref{multizone} (but this time the variable name is \texttt{\_\_\_tracy\_gpu\_zone}).
diff --git a/server/TracyView.cpp b/server/TracyView.cpp
index 57581311..646c6267 100644
--- a/server/TracyView.cpp
+++ b/server/TracyView.cpp
@@ -75,7 +75,8 @@ constexpr const char* s_tracyStackFrames[] = {
 constexpr const char* GpuContextNames[] = {
     "Invalid",
     "OpenGL",
-    "Vulkan"
+    "Vulkan",
+    "OpenCL"
 };
 
 
@@ -2472,7 +2473,7 @@ void View::DrawZones()
                     draw->AddTriangle( wpos + ImVec2( to/2, oldOffset + to/2 ), wpos + ImVec2( to/2, oldOffset + ty - to/2 ), wpos + ImVec2( to/2 + th, oldOffset + ty * 0.5 ), 0xFF886666, 2.0f );
                 }
 
-                const bool isMultithreaded = v->type == GpuContextType::Vulkan;
+                const bool isMultithreaded = (v->type == GpuContextType::Vulkan) || (v->type == GpuContextType::OpenCL);
                 char buf[64];
                 sprintf( buf, "%s context %zu", GpuContextNames[(int)v->type], i );
                 DrawTextContrast( draw, wpos + ImVec2( ty, oldOffset ), showFull ? 0xFFFFAAAA : 0xFF886666, buf );