diff --git a/TracyOpenCL.hpp b/TracyOpenCL.hpp
index 1fd3e741..d477158d 100644
--- a/TracyOpenCL.hpp
+++ b/TracyOpenCL.hpp
@@ -35,12 +35,28 @@ using TracyCLCtx = void*;
 
 #include <atomic>
 #include <cassert>
+#include <sstream>
 
 #include "Tracy.hpp"
 #include "client/TracyCallstack.hpp"
 #include "client/TracyProfiler.hpp"
 #include "common/TracyAlloc.hpp"
 
+#define TRACY_CL_TO_STRING_INDIRECT(T) #T
+#define TRACY_CL_TO_STRING(T) TRACY_CL_TO_STRING_INDIRECT(T)
+#define TRACY_CL_ASSERT(p) if(!(p)) {                                                         \
+    TracyMessageL( "TRACY_CL_ASSERT failed on " __FILE__ ":" TRACY_CL_TO_STRING(__LINE__) );  \
+    assert(false && "TRACY_CL_ASSERT failed");                                                \
+}
+#define TRACY_CL_CHECK_ERROR(err) if(err != CL_SUCCESS) {                    \
+    std::ostringstream oss;                                                  \
+    oss << "TRACY_CL_CHECK_ERROR failed on " << __FILE__ << ":" << __LINE__  \
+        << ": error code " << err;                                           \
+    auto msg = oss.str();                                                    \
+    TracyMessage(msg.data(), msg.size());                                    \
+    assert(false && "TRACY_CL_CHECK_ERROR failed");                          \
+}
+
 namespace tracy {
 
     enum class EventPhase : uint8_t
@@ -66,34 +82,27 @@ namespace tracy {
             , m_tail(0)
         {
             int64_t tcpu, tgpu;
-            assert(m_contextId != 255);
+            TRACY_CL_ASSERT(m_contextId != 255);
 
             cl_int err = CL_SUCCESS;
             cl_command_queue queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
-            assert(err == CL_SUCCESS);
+            TRACY_CL_CHECK_ERROR(err)
             uint32_t dummyValue = 42;
             cl_mem dummyBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(uint32_t), nullptr, &err);
-            assert(err == CL_SUCCESS);
+            TRACY_CL_CHECK_ERROR(err)
             cl_event writeBufferEvent;
-            err = clEnqueueWriteBuffer(queue, dummyBuffer, CL_FALSE, 0, sizeof(uint32_t), &dummyValue, 0, nullptr, &writeBufferEvent);
-            assert(err == CL_SUCCESS);
-            err = clWaitForEvents(1, &writeBufferEvent);
+            TRACY_CL_CHECK_ERROR(clEnqueueWriteBuffer(queue, dummyBuffer, CL_FALSE, 0, sizeof(uint32_t), &dummyValue, 0, nullptr, &writeBufferEvent));
+            TRACY_CL_CHECK_ERROR(clWaitForEvents(1, &writeBufferEvent));
 
             tcpu = Profiler::GetTime();
 
-            assert(err == CL_SUCCESS);
             cl_int eventStatus;
-            err = clGetEventInfo(writeBufferEvent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr);
-            assert(err == CL_SUCCESS);
-            assert(eventStatus == CL_COMPLETE);
-            err = clGetEventProfilingInfo(writeBufferEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &tgpu, nullptr);
-            assert(err == CL_SUCCESS);
-            err = clReleaseEvent(writeBufferEvent);
-            assert(err == CL_SUCCESS);
-            err = clReleaseMemObject(dummyBuffer);
-            assert(err == CL_SUCCESS);
-            err = clReleaseCommandQueue(queue);
-            assert(err == CL_SUCCESS);
+            TRACY_CL_CHECK_ERROR(clGetEventInfo(writeBufferEvent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr));
+            TRACY_CL_ASSERT(eventStatus == CL_COMPLETE);
+            TRACY_CL_CHECK_ERROR(clGetEventProfilingInfo(writeBufferEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &tgpu, nullptr));
+            TRACY_CL_CHECK_ERROR(clReleaseEvent(writeBufferEvent));
+            TRACY_CL_CHECK_ERROR(clReleaseMemObject(dummyBuffer));
+            TRACY_CL_CHECK_ERROR(clReleaseCommandQueue(queue));
 
             auto item = Profiler::QueueSerial();
             MemWrite(&item->hdr.type, QueueType::GpuNewContext);
@@ -139,13 +148,23 @@ namespace tracy {
             }
 #endif
 
-            while (m_tail != m_head)
+            for (; m_tail != m_head; m_tail = (m_tail + 1) % QueryCount)
             {
-                EventInfo eventInfo = m_query[m_tail];
-                cl_event event = eventInfo.event;
+                EventInfo eventInfo = GetQuery(m_tail);
                 cl_int eventStatus;
-                cl_int err = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr);
-                assert(err == CL_SUCCESS);
+                cl_int err = clGetEventInfo(eventInfo.event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, nullptr);
+                if (err != CL_SUCCESS)
+                {
+                    std::ostringstream oss;
+                    oss << "clGetEventInfo falied with error code " << err << ", on event " << eventInfo.event << ", skipping...";
+                    auto msg = oss.str();
+                    TracyMessage(msg.data(), msg.size());
+                    if (eventInfo.event == nullptr) {
+                        TracyMessageL("A TracyCLZone must be paird with a TracyCLZoneSetEvent, check your code!");
+                    }
+                    assert(false && "clGetEventInfo failed, maybe a TracyCLZone is not paired with TracyCLZoneSetEvent");
+                    continue;
+                }
                 if (eventStatus != CL_COMPLETE) return;
 
                 cl_int eventInfoQuery = (eventInfo.phase == EventPhase::Begin)
@@ -153,9 +172,16 @@ namespace tracy {
                     : CL_PROFILING_COMMAND_END;
 
                 cl_ulong eventTimeStamp = 0;
-                err = clGetEventProfilingInfo(event, eventInfoQuery, sizeof(cl_ulong), &eventTimeStamp, nullptr);
-                assert(err == CL_SUCCESS);
-                assert(eventTimeStamp != 0);
+                err = clGetEventProfilingInfo(eventInfo.event, eventInfoQuery, sizeof(cl_ulong), &eventTimeStamp, nullptr);
+                if (err == CL_PROFILING_INFO_NOT_AVAILABLE)
+                {
+                    TracyMessageL("command queue is not created with CL_QUEUE_PROFILING_ENABLE flag, check your code!");
+                    assert(false && "command queue is not created with CL_QUEUE_PROFILING_ENABLE flag");
+                }
+                else
+                    TRACY_CL_CHECK_ERROR(err);
+
+                TRACY_CL_ASSERT(eventTimeStamp != 0);
 
                 auto item = Profiler::QueueSerial();
                 MemWrite(&item->hdr.type, QueueType::GpuTime);
@@ -167,11 +193,8 @@ namespace tracy {
                 if (eventInfo.phase == EventPhase::End)
                 {
                     // Done with the event, so release it
-                    err = clReleaseEvent(event);
-                    assert(err == CL_SUCCESS);
+                    TRACY_CL_CHECK_ERROR(clReleaseEvent(eventInfo.event));
                 }
-
-                m_tail = (m_tail + 1) % QueryCount;
             }
         }
 
@@ -184,14 +207,14 @@ namespace tracy {
         {
             const auto id = m_head;
             m_head = (m_head + 1) % QueryCount;
-            assert(m_head != m_tail);
+            TRACY_CL_ASSERT(m_head != m_tail);
             m_query[id] = eventInfo;
             return id;
         }
 
         tracy_force_inline EventInfo& GetQuery(unsigned int id)
         {
-            assert(id < QueryCount);
+            TRACY_CL_ASSERT(id < QueryCount);
             return m_query[id];
         }
 
@@ -200,8 +223,8 @@ namespace tracy {
         unsigned int m_contextId;
 
         EventInfo m_query[QueryCount];
-        unsigned int m_head;
-        unsigned int m_tail;
+        unsigned int m_head; // index at which a new event should be inserted
+        unsigned int m_tail; // oldest event
 
     };
 
@@ -259,8 +282,7 @@ namespace tracy {
         {
             if (!m_active) return;
             m_event = event;
-            cl_int err = clRetainEvent(m_event);
-            assert(err == CL_SUCCESS);
+            TRACY_CL_CHECK_ERROR(clRetainEvent(m_event));
             m_ctx->GetQuery(m_beginQueryId).event = m_event;
         }
 
diff --git a/examples/OpenCLVectorAdd/OpenCLVectorAdd.cpp b/examples/OpenCLVectorAdd/OpenCLVectorAdd.cpp
index b3ee0e44..a2560eae 100644
--- a/examples/OpenCLVectorAdd/OpenCLVectorAdd.cpp
+++ b/examples/OpenCLVectorAdd/OpenCLVectorAdd.cpp
@@ -1,3 +1,4 @@
+#include <algorithm>
 #include <iostream>
 #include <cassert>
 #include <string>
@@ -104,8 +105,8 @@ int main()
         hostB.resize(N);
         hostC.resize(N);
 
-        std::iota(std::begin(hostA), std::end(hostA), 0);
-        std::iota(std::begin(hostB), std::end(hostB), 0);
+        std::iota(std::begin(hostA), std::end(hostA), 0.0f);
+        std::iota(std::begin(hostB), std::end(hostB), 0.0f);
     }
 
     {
@@ -123,7 +124,7 @@ int main()
             ZoneScopedN("Write Buffer A");
             TracyCLZoneS(tracyCLCtx, "Write BufferA", 5);
 
-            CL_ASSERT(clEnqueueWriteBuffer(commandQueue, bufferA, CL_TRUE, 0, N * sizeof(float), hostA.data(), 0, nullptr, &writeBufferAEvent));
+            CL_ASSERT(clEnqueueWriteBuffer(commandQueue, bufferA, CL_FALSE, 0, N * sizeof(float), hostA.data(), 0, nullptr, &writeBufferAEvent));
 
             TracyCLZoneSetEvent(writeBufferAEvent);
         }
@@ -131,34 +132,44 @@ int main()
             ZoneScopedN("Write Buffer B");
             TracyCLZone(tracyCLCtx, "Write BufferB");
 
-            CL_ASSERT(clEnqueueWriteBuffer(commandQueue, bufferB, CL_TRUE, 0, N * sizeof(float), hostB.data(), 0, nullptr, &writeBufferBEvent));
+            CL_ASSERT(clEnqueueWriteBuffer(commandQueue, bufferB, CL_FALSE, 0, N * sizeof(float), hostB.data(), 0, nullptr, &writeBufferBEvent));
 
             TracyCLZoneSetEvent(writeBufferBEvent);
         }
     }
 
-    for (int i = 0; i < 10; ++i)
+    cl_int clN = static_cast<cl_int>(N);
+    const int numFrames = 10;
+    const int launchsPerFrame = 10;
+    constexpr int numLaunchs = numFrames * launchsPerFrame;
+    std::vector<cl_event> kernelLaunchEvts;
+    kernelLaunchEvts.reserve(numLaunchs);
+    for (int i = 0; i < numFrames; ++i)
     {
-        int n_value = static_cast<int>(N);
-        ZoneScopedN("VectorAdd Kernel Launch");
-        TracyCLZoneC(tracyCLCtx, "VectorAdd Kernel", tracy::Color::Blue4);
+        FrameMark;
+        for (int j = 0; j < launchsPerFrame; ++j) {
+            ZoneScopedN("VectorAdd Kernel Launch");
+            TracyCLZoneC(tracyCLCtx, "VectorAdd Kernel", tracy::Color::Blue4);
 
-        CL_ASSERT(clSetKernelArg(vectorAddKernel, 0, sizeof(cl_mem), &bufferC));
-        CL_ASSERT(clSetKernelArg(vectorAddKernel, 1, sizeof(cl_mem), &bufferA));
-        CL_ASSERT(clSetKernelArg(vectorAddKernel, 2, sizeof(cl_mem), &bufferB));
-        CL_ASSERT(clSetKernelArg(vectorAddKernel, 3, sizeof(int), &n_value));
+            CL_ASSERT(clSetKernelArg(vectorAddKernel, 0, sizeof(cl_mem), &bufferC));
+            CL_ASSERT(clSetKernelArg(vectorAddKernel, 1, sizeof(cl_mem), &bufferA));
+            CL_ASSERT(clSetKernelArg(vectorAddKernel, 2, sizeof(cl_mem), &bufferB));
+            CL_ASSERT(clSetKernelArg(vectorAddKernel, 3, sizeof(cl_int), &clN));
 
-        cl_event vectorAddKernelEvent;
-        CL_ASSERT(clEnqueueNDRangeKernel(commandQueue, vectorAddKernel, 1, nullptr, &N, nullptr, 0, nullptr, &vectorAddKernelEvent));
-
-        CL_ASSERT(clWaitForEvents(1, &vectorAddKernelEvent));
-
-        TracyCLZoneSetEvent(vectorAddKernelEvent);
-
-        cl_ulong kernelStartTime, kernelEndTime;
-        CL_ASSERT(clGetEventProfilingInfo(vectorAddKernelEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernelStartTime, nullptr));
-        CL_ASSERT(clGetEventProfilingInfo(vectorAddKernelEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernelEndTime, nullptr));
-        std::cout << "VectorAdd Kernel Elapsed: " << ((kernelEndTime - kernelStartTime) / 1000) << " us" << std::endl;
+            cl_event vectorAddKernelEvent;
+            CL_ASSERT(clEnqueueNDRangeKernel(commandQueue, vectorAddKernel, 1, nullptr, &N, nullptr, 0, nullptr, &vectorAddKernelEvent));
+            TracyCLZoneSetEvent(vectorAddKernelEvent);
+            CL_ASSERT(clRetainEvent(vectorAddKernelEvent));
+            kernelLaunchEvts.push_back(vectorAddKernelEvent);
+            std::cout << "VectorAdd Kernel Enqueued" << std::endl;
+        }
+        {
+            // Wait frame events to be finished
+            ZoneScopedN("clFinish");
+            CL_ASSERT(clFinish(commandQueue));
+        }
+        // You should collect on each 'frame' ends, so that streaming can be achieved.
+        TracyCLCollect(tracyCLCtx);
     }
 
     {
@@ -171,7 +182,25 @@ int main()
     }
 
     CL_ASSERT(clFinish(commandQueue));
+    std::vector<float> durations(kernelLaunchEvts.size());
+    for (int i=0; i<kernelLaunchEvts.size(); i++) {
+        cl_event evt = kernelLaunchEvts[i];
+        cl_ulong start;
+        cl_ulong end;
+        CL_ASSERT(clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, nullptr));
+        CL_ASSERT(clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, nullptr));
+        CL_ASSERT(clReleaseEvent(evt));
+        durations[i] = (end - start) * 0.001f;
+        std::cout << "VectorAdd Kernel " << i << " tooks " <<  static_cast<int>(durations[i]) << "us" << std::endl;
+    };
+    float avg = std::accumulate(durations.cbegin(), durations.cend(), 0.0f) / durations.size();
+    float stddev2 = std::accumulate(durations.cbegin(), durations.cend(), 0.0f, [avg](const float& acc, const float& v) {
+        auto d = v - avg;
+        return acc + d*d;
+    }) / (durations.size() - 1.0f);
+    std::cout << "VectorAdd runtime avg: " << avg << "us, std: " << sqrt(stddev2) << "us over " << numLaunchs << " runs." << std::endl;
 
+    // User should ensure all events are finished, in this case, collect after the clFinish will do the trick.
     TracyCLCollect(tracyCLCtx);
 
     {