Merge pull request #882 from slomp/slomp/metal-patch

Patching Apple/Metal back-end Debug issues
This commit is contained in:
Bartosz Taudul 2024-09-13 12:15:47 +02:00 committed by GitHub
commit 8724400884
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -49,7 +49,7 @@ namespace tracy
class MetalZoneScope {};
}
using TracyMetalCtx = void*;
using TracyMetalCtx = void;
#else
@ -58,8 +58,8 @@ using TracyMetalCtx = void*;
#endif
#include <atomic>
#include <assert.h>
#include <stdlib.h>
#include <cassert>
#include <cstdlib>
#include "Tracy.hpp"
#include "../client/TracyProfiler.hpp"
@ -70,14 +70,13 @@ using TracyMetalCtx = void*;
// ok to import if in obj-c code
#import <Metal/Metal.h>
#define VA_ARGS(...) , ##__VA_ARGS__
#define TRACY_METAL_VA_ARGS(...) , ##__VA_ARGS__
#define TracyMetalPanic(ret, msg, ...) do { \
char buffer [1024]; \
snprintf(buffer, sizeof(buffer), "TracyMetal: " msg VA_ARGS(__VA_ARGS__)); \
snprintf(buffer, sizeof(buffer), "TracyMetal: " msg TRACY_METAL_VA_ARGS(__VA_ARGS__)); \
TracyMessageC(buffer, strlen(buffer), tracy::Color::OrangeRed); \
fprintf(stderr, "%s\n", buffer); \
assert(false && "TracyMetal: " msg); \
ret; \
} while(false);
@ -90,9 +89,21 @@ using TracyMetalCtx = void*;
#endif//TRACY_METAL_DEBUG_MASK
#if TRACY_METAL_DEBUG_MASK
#define TracyMetalDebug(mask, ...) if (mask & TRACY_METAL_DEBUG_MASK) { __VA_ARGS__; }
#define TracyMetalDebugMasked(mask, ...) if constexpr (mask & TRACY_METAL_DEBUG_MASK) { __VA_ARGS__; }
#else
#define TracyMetalDebug(mask, ...)
#define TracyMetalDebugMasked(mask, ...)
#endif
#if TRACY_METAL_DEBUG_MASK & (1 << 1)
#define TracyMetalDebug_0b00010(...) __VA_ARGS__;
#else
#define TracyMetalDebug_0b00010(...)
#endif
#if TRACY_METAL_DEBUG_MASK & (1 << 4)
#define TracyMetalDebug_0b10000(...) __VA_ARGS__;
#else
#define TracyMetalDebug_0b10000(...)
#endif
#ifndef TracyMetalDebugZoneScopeWireTap
@ -109,89 +120,14 @@ class MetalCtx
enum { MaxQueries = 4 * 1024 }; // Metal: between 8 and 32768 _BYTES_...
public:
MetalCtx(id<MTLDevice> device)
: m_device(device)
{
ZoneScopedNC("TracyMetalCtx", tracy::Color::Red4);
TracyMetalDebug(1<<0, TracyMetalPanic(, "MTLCounterErrorValue = 0x%llx", MTLCounterErrorValue));
TracyMetalDebug(1<<0, TracyMetalPanic(, "MTLCounterDontSample = 0x%llx", MTLCounterDontSample));
if (m_device == nil)
{
TracyMetalPanic(return, "device is nil.");
}
if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtStageBoundary])
{
TracyMetalPanic(return, "ERROR: timestamp sampling at pipeline stage boundary is not supported.");
}
if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtDrawBoundary])
{
TracyMetalDebug(1<<0, fprintf(stderr, "WARNING: timestamp sampling at draw call boundary is not supported.\n"));
}
if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtBlitBoundary])
{
TracyMetalDebug(1<<0, fprintf(stderr, "WARNING: timestamp sampling at blit boundary is not supported.\n"));
}
if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtDispatchBoundary])
{
TracyMetalDebug(1<<0, fprintf(stderr, "WARNING: timestamp sampling at compute dispatch boundary is not supported.\n"));
}
if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtTileDispatchBoundary])
{
TracyMetalDebug(1<<0, fprintf(stderr, "WARNING: timestamp sampling at tile dispatch boundary is not supported.\n"));
}
m_counterSampleBuffers[0] = NewTimestampSampleBuffer(m_device, MaxQueries);
m_counterSampleBuffers[1] = NewTimestampSampleBuffer(m_device, MaxQueries);
m_timestampRequestTime.resize(MaxQueries);
MTLTimestamp cpuTimestamp = 0;
MTLTimestamp gpuTimestamp = 0;
[m_device sampleTimestamps:&cpuTimestamp gpuTimestamp:&gpuTimestamp];
m_mostRecentTimestamp = gpuTimestamp;
TracyMetalDebug(1<<0, TracyMetalPanic(, "Calibration: CPU timestamp (Metal): %llu", cpuTimestamp));
TracyMetalDebug(1<<0, TracyMetalPanic(, "Calibration: GPU timestamp (Metal): %llu", gpuTimestamp));
cpuTimestamp = Profiler::GetTime();
TracyMetalDebug(1<<0, TracyMetalPanic(, "Calibration: CPU timestamp (Tracy): %llu", cpuTimestamp));
float period = 1.0f;
m_contextId = GetGpuCtxCounter().fetch_add(1);
auto* item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuNewContext);
MemWrite(&item->gpuNewContext.cpuTime, int64_t(cpuTimestamp));
MemWrite(&item->gpuNewContext.gpuTime, int64_t(gpuTimestamp));
MemWrite(&item->gpuNewContext.thread, uint32_t(0)); // TODO: why not GetThreadHandle()?
MemWrite(&item->gpuNewContext.period, period);
MemWrite(&item->gpuNewContext.context, m_contextId);
//MemWrite(&item->gpuNewContext.flags, GpuContextCalibration);
MemWrite(&item->gpuNewContext.flags, GpuContextFlags(0));
MemWrite(&item->gpuNewContext.type, GpuContextType::Metal);
SubmitQueueItem(item);
}
~MetalCtx()
{
ZoneScopedNC("~TracyMetalCtx", tracy::Color::Red4);
TracyMetalDebug(1<<0, ZoneValue(m_previousCheckpoint.load()));
TracyMetalDebug(1<<0, ZoneValue(m_queryCounter.load()));
// collect the last remnants of Metal GPU activity...
// TODO: add a timeout to this loop?
while (m_previousCheckpoint.load() != m_queryCounter.load())
Collect();
}
static MetalCtx* Create(id<MTLDevice> device)
{
ZoneScopedNC("tracy::MetalCtx::Create", Color::Red4);
auto ctx = static_cast<MetalCtx*>(tracy_malloc(sizeof(MetalCtx)));
new (ctx) MetalCtx(device);
if (ctx->m_contextId == 255)
{
TracyMetalPanic(return nullptr, "ERROR: unable to create context.");
TracyMetalPanic({assert(false);} return nullptr, "ERROR: unable to create context.");
Destroy(ctx);
}
return ctx;
@ -199,6 +135,7 @@ public:
static void Destroy(MetalCtx* ctx)
{
ZoneScopedNC("tracy::MetalCtx::Destroy", Color::Red4);
ctx->~MetalCtx();
tracy_free(ctx);
}
@ -218,7 +155,7 @@ public:
bool Collect()
{
ZoneScopedNC("TracyMetal::Collect", Color::Red4);
ZoneScopedNC("tracy::MetalCtx::Collect", Color::Red4);
#ifdef TRACY_ON_DEMAND
if (!GetProfiler().IsConnected())
@ -238,8 +175,8 @@ public:
uintptr_t begin = m_previousCheckpoint.load();
uintptr_t latestCheckpoint = m_queryCounter.load(); // TODO: MTLEvent? MTLFence?;
TracyMetalDebug(1<<3, ZoneValue(begin));
TracyMetalDebug(1<<3, ZoneValue(latestCheckpoint));
TracyMetalDebugMasked(1<<3, ZoneValue(begin));
TracyMetalDebugMasked(1<<3, ZoneValue(latestCheckpoint));
uint32_t count = RingCount(begin, latestCheckpoint);
if (count == 0) // no pending timestamp queries
@ -260,7 +197,7 @@ public:
count = RingSize() - RingIndex(begin);
reallocateBuffer = true;
}
TracyMetalDebug(1<<3, ZoneValue(count));
TracyMetalDebugMasked(1<<3, ZoneValue(count));
auto buffer_idx = (begin / MaxQueries) % 2;
auto counterSampleBuffer = m_counterSampleBuffers[buffer_idx];
@ -270,7 +207,7 @@ public:
TracyMetalPanic(return false, "Collect: FULL! too many pending timestamp queries. [%llu, %llu] (%u)", begin, latestCheckpoint, count);
}
TracyMetalDebug(1<<3, TracyMetalPanic(, "Collect: [%llu, %llu] :: (%u)", begin, latestCheckpoint, count));
TracyMetalDebugMasked(1<<3, TracyMetalPanic(, "Collect: [%llu, %llu] :: (%u)", begin, latestCheckpoint, count));
NSRange range = NSMakeRange(RingIndex(begin), count);
NSData* data = [counterSampleBuffer resolveCounterRange:range];
@ -289,11 +226,11 @@ public:
int resolved = 0;
for (auto i = 0; i < numResolvedTimestamps; i += 2)
{
ZoneScopedN("TracyMetal::Collect::[i]");
TracyMetalDebug_0b10000( ZoneScopedN("tracy::MetalCtx::Collect::[i]") );
MTLTimestamp t_start = timestamps[i+0].timestamp;
MTLTimestamp t_end = timestamps[i+1].timestamp;
uint32_t k = RingIndex(begin + i);
TracyMetalDebug(1<<4, TracyMetalPanic(, "Collect: timestamp[%u] = %llu | timestamp[%u] = %llu | diff = %llu\n", k, t_start, k+1, t_end, (t_end - t_start)));
TracyMetalDebugMasked(1<<4, TracyMetalPanic(, "Collect: timestamp[%u] = %llu | timestamp[%u] = %llu | diff = %llu\n", k, t_start, k+1, t_end, (t_end - t_start)));
if ((t_start == MTLCounterErrorValue) || (t_end == MTLCounterErrorValue))
{
TracyMetalPanic(, "Collect: invalid timestamp (MTLCounterErrorValue) at %u.", k);
@ -306,17 +243,17 @@ public:
auto checkTime = std::chrono::high_resolution_clock::now();
auto requestTime = m_timestampRequestTime[k];
auto ms_in_flight = std::chrono::duration<float>(checkTime-requestTime).count()*1000.0f;
TracyMetalDebug(1<<4, TracyMetalPanic(, "Collect: invalid timestamp (zero) at %u [%.0fms in flight].", k, ms_in_flight));
TracyMetalDebugMasked(1<<4, TracyMetalPanic(, "Collect: invalid timestamp (zero) at %u [%.0fms in flight].", k, ms_in_flight));
const float timeout_ms = TRACY_METAL_TIMESTAMP_COLLECT_TIMEOUT * 1000.0f;
if (ms_in_flight < timeout_ms)
break;
ZoneScopedN("TracyMetal::Collect::Drop");
TracyMetalDebug_0b10000( ZoneScopedN("tracy::MetalCtx::Collect::Drop") );
TracyMetalPanic(, "Collect: giving up on timestamp at %u [%.0fms in flight].", k, ms_in_flight);
t_start = m_mostRecentTimestamp + 5;
t_end = t_start + 5;
}
TracyMetalDebug(1<<2, TracyFreeN((void*)(uintptr_t)(k+0), "TracyMetalGpuZone"));
TracyMetalDebug(1<<2, TracyFreeN((void*)(uintptr_t)(k+1), "TracyMetalGpuZone"));
TracyMetalDebugMasked(1<<2, TracyFreeN((void*)(uintptr_t)(k+0), "TracyMetalGpuZone"));
TracyMetalDebugMasked(1<<2, TracyFreeN((void*)(uintptr_t)(k+1), "TracyMetalGpuZone"));
{
auto* item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuTime);
@ -334,10 +271,10 @@ public:
Profiler::QueueSerialFinish();
}
m_mostRecentTimestamp = (t_end > m_mostRecentTimestamp) ? t_end : m_mostRecentTimestamp;
TracyMetalDebug(1<<1, TracyFreeN((void*)(uintptr_t)k, "TracyMetalTimestampQueryId"));
TracyMetalDebugMasked(1<<1, TracyFreeN((void*)(uintptr_t)k, "TracyMetalTimestampQueryId"));
resolved += 2;
}
TracyMetalDebug(1<<3, ZoneValue(RingCount(begin, m_previousCheckpoint.load())));
TracyMetalDebugMasked(1<<3, ZoneValue(RingCount(begin, m_previousCheckpoint.load())));
m_previousCheckpoint += resolved;
@ -357,6 +294,77 @@ public:
}
private:
MetalCtx(id<MTLDevice> device)
: m_device(device)
{
TracyMetalDebugMasked(1<<0, TracyMetalPanic(, "MTLCounterErrorValue = 0x%llx", MTLCounterErrorValue));
TracyMetalDebugMasked(1<<0, TracyMetalPanic(, "MTLCounterDontSample = 0x%llx", MTLCounterDontSample));
if (m_device == nil)
{
TracyMetalPanic({assert(false);} return, "device is nil.");
}
if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtStageBoundary])
{
TracyMetalPanic({assert(false);} return, "ERROR: timestamp sampling at pipeline stage boundary is not supported.");
}
if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtDrawBoundary])
{
TracyMetalDebugMasked(1<<0, fprintf(stderr, "WARNING: timestamp sampling at draw call boundary is not supported.\n"));
}
if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtBlitBoundary])
{
TracyMetalDebugMasked(1<<0, fprintf(stderr, "WARNING: timestamp sampling at blit boundary is not supported.\n"));
}
if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtDispatchBoundary])
{
TracyMetalDebugMasked(1<<0, fprintf(stderr, "WARNING: timestamp sampling at compute dispatch boundary is not supported.\n"));
}
if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtTileDispatchBoundary])
{
TracyMetalDebugMasked(1<<0, fprintf(stderr, "WARNING: timestamp sampling at tile dispatch boundary is not supported.\n"));
}
m_counterSampleBuffers[0] = NewTimestampSampleBuffer(m_device, MaxQueries);
m_counterSampleBuffers[1] = NewTimestampSampleBuffer(m_device, MaxQueries);
m_timestampRequestTime.resize(MaxQueries);
MTLTimestamp cpuTimestamp = 0;
MTLTimestamp gpuTimestamp = 0;
[m_device sampleTimestamps:&cpuTimestamp gpuTimestamp:&gpuTimestamp];
m_mostRecentTimestamp = gpuTimestamp;
TracyMetalDebugMasked(1<<0, TracyMetalPanic(, "Calibration: CPU timestamp (Metal): %llu", cpuTimestamp));
TracyMetalDebugMasked(1<<0, TracyMetalPanic(, "Calibration: GPU timestamp (Metal): %llu", gpuTimestamp));
cpuTimestamp = Profiler::GetTime();
TracyMetalDebugMasked(1<<0, TracyMetalPanic(, "Calibration: CPU timestamp (Tracy): %llu", cpuTimestamp));
float period = 1.0f;
m_contextId = GetGpuCtxCounter().fetch_add(1);
auto* item = Profiler::QueueSerial();
MemWrite(&item->hdr.type, QueueType::GpuNewContext);
MemWrite(&item->gpuNewContext.cpuTime, int64_t(cpuTimestamp));
MemWrite(&item->gpuNewContext.gpuTime, int64_t(gpuTimestamp));
MemWrite(&item->gpuNewContext.thread, uint32_t(0)); // TODO: why not GetThreadHandle()?
MemWrite(&item->gpuNewContext.period, period);
MemWrite(&item->gpuNewContext.context, m_contextId);
//MemWrite(&item->gpuNewContext.flags, GpuContextCalibration);
MemWrite(&item->gpuNewContext.flags, GpuContextFlags(0));
MemWrite(&item->gpuNewContext.type, GpuContextType::Metal);
SubmitQueueItem(item);
}
~MetalCtx()
{
// collect the last remnants of Metal GPU activity...
// TODO: add a timeout to this loop?
while (m_previousCheckpoint.load() != m_queryCounter.load())
Collect();
}
tracy_force_inline void SubmitQueueItem(QueueItem* item)
{
#ifdef TRACY_ON_DEMAND
@ -387,9 +395,9 @@ private:
tracy_force_inline Query NextQuery()
{
ZoneScopedNC("TracyMetal::NextQuery", tracy::Color::LightCoral);
TracyMetalDebug_0b00010( ZoneScopedNC("Tracy::MetalCtx::NextQuery", tracy::Color::LightCoral) );
auto id = m_queryCounter.fetch_add(2);
TracyMetalDebug(1<<1, ZoneValue(id));
TracyMetalDebug_0b00010( ZoneValue(id) );
auto count = RingCount(m_previousCheckpoint, id);
if (count >= MaxQueries)
{
@ -402,13 +410,13 @@ private:
);
}
uint32_t buffer_idx = (id / MaxQueries) % 2;
TracyMetalDebug(1<<1, ZoneValue(buffer_idx));
TracyMetalDebug_0b00010( ZoneValue(buffer_idx) );
auto buffer = m_counterSampleBuffers[buffer_idx];
if (buffer == nil)
TracyMetalPanic(, "NextQueryId: sample buffer is nil! (id=%llu)", id);
uint32_t idx = RingIndex(id);
TracyMetalDebug(1<<1, ZoneValue(idx));
TracyMetalDebug(1<<1, TracyAllocN((void*)(uintptr_t)idx, 2, "TracyMetalTimestampQueryId"));
TracyMetalDebug_0b00010( ZoneValue(idx) );
TracyMetalDebug_0b00010( TracyAllocN((void*)(uintptr_t)idx, 2, "TracyMetalTimestampQueryId") );
m_timestampRequestTime[idx] = std::chrono::high_resolution_clock::now();
return Query{ buffer, idx };
}
@ -420,7 +428,7 @@ private:
static id<MTLCounterSampleBuffer> NewTimestampSampleBuffer(id<MTLDevice> device, size_t count)
{
ZoneScopedN("TracyMetal::NewTimestampSampleBuffer");
ZoneScopedN("tracy::MetalCtx::NewTimestampSampleBuffer");
id<MTLCounterSet> timestampCounterSet = nil;
for (id<MTLCounterSet> counterSet in device.counterSets)
@ -433,7 +441,7 @@ private:
}
if (timestampCounterSet == nil)
{
TracyMetalPanic(return nil, "ERROR: timestamp counters are not supported on the platform.");
TracyMetalPanic({assert(false);} return nil, "ERROR: timestamp counters are not supported on the platform.");
}
MTLCounterSampleBufferDescriptor* sampleDescriptor = [[MTLCounterSampleBufferDescriptor alloc] init];
@ -447,7 +455,7 @@ private:
if (error != nil)
{
//NSLog(@"%@ | %@", error.localizedDescription, error.localizedFailureReason);
TracyMetalPanic(return nil,
TracyMetalPanic({assert(false);} return nil,
"ERROR: unable to create sample buffer for timestamp counters : %s | %s",
[error.localizedDescription cString], [error.localizedFailureReason cString]);
}
@ -483,7 +491,7 @@ public:
#endif
{
if ( !m_active ) return;
if (desc == nil) TracyMetalPanic(return, "compute pass descriptor is nil.");
if (desc == nil) TracyMetalPanic({assert(false);} return, "compute pass descriptor is nil.");
m_ctx = ctx;
auto& query = m_query = ctx->NextQuery();
@ -503,7 +511,7 @@ public:
#endif
{
if ( !m_active ) return;
if (desc == nil) TracyMetalPanic(return, "blit pass descriptor is nil.");
if (desc == nil) TracyMetalPanic({assert(false); }return, "blit pass descriptor is nil.");
m_ctx = ctx;
auto& query = m_query = ctx->NextQuery();
@ -523,7 +531,7 @@ public:
#endif
{
if ( !m_active ) return;
if (desc == nil) TracyMetalPanic(return, "render pass descriptor is nil.");
if (desc == nil) TracyMetalPanic({assert(false);} return, "render pass descriptor is nil.");
m_ctx = ctx;
auto& query = m_query = ctx->NextQuery();
@ -586,7 +594,7 @@ private:
MemWrite( &item->gpuZoneBegin.context, ctx->GetContextId() );
Profiler::QueueSerialFinish();
TracyMetalDebug(1<<2, TracyAllocN((void*)(uintptr_t)queryId, 1, "TracyMetalGpuZone"));
TracyMetalDebugMasked(1<<2, TracyAllocN((void*)(uintptr_t)queryId, 1, "TracyMetalGpuZone"));
}
static void SubmitZoneEndGpu(MetalCtx* ctx, uint32_t queryId)
@ -599,7 +607,7 @@ private:
MemWrite( &item->gpuZoneEnd.context, ctx->GetContextId() );
Profiler::QueueSerialFinish();
TracyMetalDebug(1<<2, TracyAllocN((void*)(uintptr_t)queryId, 1, "TracyMetalGpuZone"));
TracyMetalDebugMasked(1<<2, TracyAllocN((void*)(uintptr_t)queryId, 1, "TracyMetalGpuZone"));
}
MetalCtx::Query m_query = {};
@ -607,7 +615,7 @@ private:
}
using TracyMetalCtx = tracy::MetalCtx*;
using TracyMetalCtx = tracy::MetalCtx;
#define TracyMetalContext(device) tracy::MetalCtx::Create(device)
#define TracyMetalDestroy(ctx) tracy::MetalCtx::Destroy(ctx)
@ -620,6 +628,17 @@ using TracyMetalCtx = tracy::MetalCtx*;
#define TracyMetalCollect( ctx ) ctx->Collect();
#undef TracyMetalDebug_ZoneScopeWireTap
#undef TracyMetalDebug_0b00010
#undef TracyMetalDebug_0b10000
#undef TracyMetalDebugMasked
#undef TRACY_METAL_DEBUG_MASK
#undef TRACY_METAL_TIMESTAMP_COLLECT_TIMEOUT
#undef TracyMetalPanic
#undef TRACY_METAL_VA_ARGS
#endif
#endif//__TRACYMETAL_HMM__