addressing code review comments

This commit is contained in:
Marcos Slomp 2024-09-03 11:15:39 -07:00
parent fe51f02a25
commit 0ffa0be4fd

View File

@ -1,7 +1,9 @@
#ifndef __TRACYMETAL_HMM__ #ifndef __TRACYMETAL_HMM__
#define __TRACYMETAL_HMM__ #define __TRACYMETAL_HMM__
/* The Metal back-end in Tracy operates differently than other GPU back-ends like Vulkan, /* This file implements a Metal API back-end for Tracy (it has only been tested on Apple
Silicon devices, but it should also work on Intel-based Macs and older iOS devices).
The Metal back-end in Tracy operates differently than other GPU back-ends like Vulkan,
Direct3D and OpenGL. Specifically, TracyMetalZone() must be placed around the site where Direct3D and OpenGL. Specifically, TracyMetalZone() must be placed around the site where
a command encoder is created. This is because not all hardware supports timestamps at a command encoder is created. This is because not all hardware supports timestamps at
command granularity, and can only provide timestamps around an entire command encoder. command granularity, and can only provide timestamps around an entire command encoder.
@ -58,6 +60,10 @@ using TracyMetalCtx = void*;
#else #else
#if not __has_feature(objc_arc)
#error TracyMetal requires ARC to be enabled.
#endif
#include <atomic> #include <atomic>
#include <assert.h> #include <assert.h>
#include <stdlib.h> #include <stdlib.h>
@ -82,8 +88,13 @@ using TracyMetalCtx = void*;
ret; \ ret; \
} while(false); } while(false);
#ifndef TRACY_METAL_TIMESTAMP_COLLECT_TIMEOUT
#define TRACY_METAL_TIMESTAMP_COLLECT_TIMEOUT 0.200f
#endif//TRACY_METAL_TIMESTAMP_COLLECT_TIMEOUT
#ifndef TRACY_METAL_DEBUG_MASK
#define TRACY_METAL_DEBUG_MASK (0) #define TRACY_METAL_DEBUG_MASK (0)
#endif//TRACY_METAL_DEBUG_MASK
#if TRACY_METAL_DEBUG_MASK #if TRACY_METAL_DEBUG_MASK
#define TracyMetalDebug(mask, ...) if (mask & TRACY_METAL_DEBUG_MASK) { __VA_ARGS__; } #define TracyMetalDebug(mask, ...) if (mask & TRACY_METAL_DEBUG_MASK) { __VA_ARGS__; }
@ -91,9 +102,9 @@ using TracyMetalCtx = void*;
#define TracyMetalDebug(mask, ...) #define TracyMetalDebug(mask, ...)
#endif #endif
#ifndef TracyMetalZoneScopeWireTap #ifndef TracyMetalDebugZoneScopeWireTap
#define TracyMetalZoneScopeWireTap #define TracyMetalDebugZoneScopeWireTap
#endif//TracyMetalZoneScopeWireTap #endif//TracyMetalDebugZoneScopeWireTap
namespace tracy namespace tracy
{ {
@ -123,19 +134,19 @@ public:
} }
if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtDrawBoundary]) if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtDrawBoundary])
{ {
TracyMetalPanic(, "WARNING: timestamp sampling at draw call boundary is not supported."); TracyMetalDebug(1<<0, fprintf(stderr, "WARNING: timestamp sampling at draw call boundary is not supported.\n"));
} }
if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtBlitBoundary]) if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtBlitBoundary])
{ {
TracyMetalPanic(, "WARNING: timestamp sampling at blit boundary is not supported."); TracyMetalDebug(1<<0, fprintf(stderr, "WARNING: timestamp sampling at blit boundary is not supported.\n"));
} }
if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtDispatchBoundary]) if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtDispatchBoundary])
{ {
TracyMetalPanic(, "WARNING: timestamp sampling at compute dispatch boundary is not supported."); TracyMetalDebug(1<<0, fprintf(stderr, "WARNING: timestamp sampling at compute dispatch boundary is not supported.\n"));
} }
if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtTileDispatchBoundary]) if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtTileDispatchBoundary])
{ {
TracyMetalPanic(, "WARNING: timestamp sampling at tile dispatch boundary is not supported."); TracyMetalDebug(1<<0, fprintf(stderr, "WARNING: timestamp sampling at tile dispatch boundary is not supported.\n"));
} }
m_counterSampleBuffers[0] = NewTimestampSampleBuffer(m_device, MaxQueries); m_counterSampleBuffers[0] = NewTimestampSampleBuffer(m_device, MaxQueries);
@ -161,20 +172,20 @@ public:
MemWrite(&item->hdr.type, QueueType::GpuNewContext); MemWrite(&item->hdr.type, QueueType::GpuNewContext);
MemWrite(&item->gpuNewContext.cpuTime, int64_t(cpuTimestamp)); MemWrite(&item->gpuNewContext.cpuTime, int64_t(cpuTimestamp));
MemWrite(&item->gpuNewContext.gpuTime, int64_t(gpuTimestamp)); MemWrite(&item->gpuNewContext.gpuTime, int64_t(gpuTimestamp));
MemWrite(&item->gpuNewContext.thread, uint32_t(0)); // #TODO: why not GetThreadHandle()? MemWrite(&item->gpuNewContext.thread, uint32_t(0)); // TODO: why not GetThreadHandle()?
MemWrite(&item->gpuNewContext.period, period); MemWrite(&item->gpuNewContext.period, period);
MemWrite(&item->gpuNewContext.context, m_contextId); MemWrite(&item->gpuNewContext.context, m_contextId);
//MemWrite(&item->gpuNewContext.flags, GpuContextCalibration); //MemWrite(&item->gpuNewContext.flags, GpuContextCalibration);
MemWrite(&item->gpuNewContext.flags, GpuContextFlags(0)); MemWrite(&item->gpuNewContext.flags, GpuContextFlags(0));
MemWrite(&item->gpuNewContext.type, GpuContextType::Metal); MemWrite(&item->gpuNewContext.type, GpuContextType::Metal);
Profiler::QueueSerialFinish(); // TODO: DeferItem() for TRACY_ON_DEMAND SubmitQueueItem(item);
} }
~MetalCtx() ~MetalCtx()
{ {
ZoneScopedNC("~TracyMetalCtx", tracy::Color::Red4); ZoneScopedNC("~TracyMetalCtx", tracy::Color::Red4);
ZoneValue(m_previousCheckpoint.load()); TracyMetalDebug(1<<0, ZoneValue(m_previousCheckpoint.load()));
ZoneValue(m_queryCounter.load()); TracyMetalDebug(1<<0, ZoneValue(m_queryCounter.load()));
// collect the last remnants of Metal GPU activity... // collect the last remnants of Metal GPU activity...
// TODO: add a timeout to this loop? // TODO: add a timeout to this loop?
while (m_previousCheckpoint.load() != m_queryCounter.load()) while (m_previousCheckpoint.load() != m_queryCounter.load())
@ -204,15 +215,12 @@ public:
auto ptr = (char*)tracy_malloc( len ); auto ptr = (char*)tracy_malloc( len );
memcpy( ptr, name, len ); memcpy( ptr, name, len );
auto item = Profiler::QueueSerial(); auto* item = Profiler::QueueSerial();
MemWrite( &item->hdr.type, QueueType::GpuContextName ); MemWrite( &item->hdr.type, QueueType::GpuContextName );
MemWrite( &item->gpuContextNameFat.context, m_contextId ); MemWrite( &item->gpuContextNameFat.context, m_contextId );
MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr ); MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr );
MemWrite( &item->gpuContextNameFat.size, len ); MemWrite( &item->gpuContextNameFat.size, len );
#ifdef TRACY_ON_DEMAND SubmitQueueItem(item);
GetProfiler().DeferItem( *item );
#endif
Profiler::QueueSerialFinish();
} }
bool Collect() bool Collect()
@ -237,8 +245,8 @@ public:
uintptr_t begin = m_previousCheckpoint.load(); uintptr_t begin = m_previousCheckpoint.load();
uintptr_t latestCheckpoint = m_queryCounter.load(); // TODO: MTLEvent? MTLFence?; uintptr_t latestCheckpoint = m_queryCounter.load(); // TODO: MTLEvent? MTLFence?;
ZoneValue(begin); TracyMetalDebug(1<<3, ZoneValue(begin));
ZoneValue(latestCheckpoint); TracyMetalDebug(1<<3, ZoneValue(latestCheckpoint));
uint32_t count = RingCount(begin, latestCheckpoint); uint32_t count = RingCount(begin, latestCheckpoint);
if (count == 0) // no pending timestamp queries if (count == 0) // no pending timestamp queries
@ -259,7 +267,7 @@ public:
count = RingSize() - RingIndex(begin); count = RingSize() - RingIndex(begin);
reallocateBuffer = true; reallocateBuffer = true;
} }
ZoneValue(count); TracyMetalDebug(1<<3, ZoneValue(count));
auto buffer_idx = (begin / MaxQueries) % 2; auto buffer_idx = (begin / MaxQueries) % 2;
auto counterSampleBuffer = m_counterSampleBuffers[buffer_idx]; auto counterSampleBuffer = m_counterSampleBuffers[buffer_idx];
@ -306,7 +314,7 @@ public:
auto requestTime = m_timestampRequestTime[k]; auto requestTime = m_timestampRequestTime[k];
auto ms_in_flight = std::chrono::duration<float>(checkTime-requestTime).count()*1000.0f; auto ms_in_flight = std::chrono::duration<float>(checkTime-requestTime).count()*1000.0f;
TracyMetalDebug(1<<4, TracyMetalPanic(, "Collect: invalid timestamp (zero) at %u [%.0fms in flight].", k, ms_in_flight)); TracyMetalDebug(1<<4, TracyMetalPanic(, "Collect: invalid timestamp (zero) at %u [%.0fms in flight].", k, ms_in_flight));
const float timeout_ms = 200.0f; const float timeout_ms = TRACY_METAL_TIMESTAMP_COLLECT_TIMEOUT * 1000.0f;
if (ms_in_flight < timeout_ms) if (ms_in_flight < timeout_ms)
break; break;
ZoneScopedN("TracyMetal::Collect::Drop"); ZoneScopedN("TracyMetal::Collect::Drop");
@ -336,7 +344,7 @@ public:
TracyMetalDebug(1<<1, TracyFreeN((void*)(uintptr_t)k, "TracyMetalTimestampQueryId")); TracyMetalDebug(1<<1, TracyFreeN((void*)(uintptr_t)k, "TracyMetalTimestampQueryId"));
resolved += 2; resolved += 2;
} }
ZoneValue(RingCount(begin, m_previousCheckpoint.load())); TracyMetalDebug(1<<3, ZoneValue(RingCount(begin, m_previousCheckpoint.load())));
m_previousCheckpoint += resolved; m_previousCheckpoint += resolved;
@ -346,7 +354,9 @@ public:
// never happen so long as Collect is called frequently enough to prevent pending // never happen so long as Collect is called frequently enough to prevent pending
// timestamp query requests from piling up too quickly. // timestamp query requests from piling up too quickly.
if ((resolved == count) && (m_previousCheckpoint.load() % MaxQueries) == 0) if ((resolved == count) && (m_previousCheckpoint.load() % MaxQueries) == 0)
{
m_counterSampleBuffers[buffer_idx] = NewTimestampSampleBuffer(m_device, MaxQueries); m_counterSampleBuffers[buffer_idx] = NewTimestampSampleBuffer(m_device, MaxQueries);
}
//RecalibrateClocks(); // to account for drift //RecalibrateClocks(); // to account for drift
@ -354,6 +364,14 @@ public:
} }
private: private:
tracy_force_inline void SubmitQueueItem(QueueItem* item)
{
#ifdef TRACY_ON_DEMAND
GetProfiler().DeferItem(*item);
#endif
Profiler::QueueSerialFinish();
}
tracy_force_inline uint32_t RingIndex(uintptr_t index) tracy_force_inline uint32_t RingIndex(uintptr_t index)
{ {
index %= MaxQueries; index %= MaxQueries;
@ -378,7 +396,7 @@ private:
{ {
ZoneScopedNC("TracyMetal::NextQuery", tracy::Color::LightCoral); ZoneScopedNC("TracyMetal::NextQuery", tracy::Color::LightCoral);
auto id = m_queryCounter.fetch_add(2); auto id = m_queryCounter.fetch_add(2);
ZoneValue(id); TracyMetalDebug(1<<1, ZoneValue(id));
auto count = RingCount(m_previousCheckpoint, id); auto count = RingCount(m_previousCheckpoint, id);
if (count >= MaxQueries) if (count >= MaxQueries)
{ {
@ -386,17 +404,17 @@ private:
Query sentinel = Query{ m_counterSampleBuffers[1], MaxQueries-2 }; Query sentinel = Query{ m_counterSampleBuffers[1], MaxQueries-2 };
TracyMetalPanic( TracyMetalPanic(
return sentinel, return sentinel,
"NextQueryId: FULL! too many pending timestamp queries. [%llu, %llu] (%u)", "NextQueryId: FULL! too many pending timestamp queries. Consider calling TracyMetalCollect() more frequently. [%llu, %llu] (%u)",
m_previousCheckpoint.load(), id, count m_previousCheckpoint.load(), id, count
); );
} }
uint32_t buffer_idx = (id / MaxQueries) % 2; uint32_t buffer_idx = (id / MaxQueries) % 2;
ZoneValue(buffer_idx); TracyMetalDebug(1<<1, ZoneValue(buffer_idx));
auto buffer = m_counterSampleBuffers[buffer_idx]; auto buffer = m_counterSampleBuffers[buffer_idx];
if (buffer == nil) if (buffer == nil)
TracyMetalPanic(, "NextQueryId: sample buffer is nil! (id=%llu)", id); TracyMetalPanic(, "NextQueryId: sample buffer is nil! (id=%llu)", id);
uint32_t idx = RingIndex(id); uint32_t idx = RingIndex(id);
ZoneValue(idx); TracyMetalDebug(1<<1, ZoneValue(idx));
TracyMetalDebug(1<<1, TracyAllocN((void*)(uintptr_t)idx, 2, "TracyMetalTimestampQueryId")); TracyMetalDebug(1<<1, TracyAllocN((void*)(uintptr_t)idx, 2, "TracyMetalTimestampQueryId"));
m_timestampRequestTime[idx] = std::chrono::high_resolution_clock::now(); m_timestampRequestTime[idx] = std::chrono::high_resolution_clock::now();
return Query{ buffer, idx }; return Query{ buffer, idx };
@ -526,7 +544,7 @@ public:
SubmitZoneBeginGpu(ctx, query.idx + 0, srcloc); SubmitZoneBeginGpu(ctx, query.idx + 0, srcloc);
} }
#if 0 /* TODO: implement this constructor interfarce for "command-level" profiling, if the device supports it
tracy_force_inline MetalZoneScope( MetalCtx* ctx, id<MTLComputeCommandEncoder> cmdEncoder, const SourceLocationData* srcloc, bool is_active ) tracy_force_inline MetalZoneScope( MetalCtx* ctx, id<MTLComputeCommandEncoder> cmdEncoder, const SourceLocationData* srcloc, bool is_active )
#ifdef TRACY_ON_DEMAND #ifdef TRACY_ON_DEMAND
: m_active( is_active && GetProfiler().IsConnected() ) : m_active( is_active && GetProfiler().IsConnected() )
@ -544,7 +562,7 @@ public:
SubmitZoneBeginGpu(ctx, query.idx, srcloc); SubmitZoneBeginGpu(ctx, query.idx, srcloc);
} }
#endif */
tracy_force_inline ~MetalZoneScope() tracy_force_inline ~MetalZoneScope()
{ {
@ -553,13 +571,16 @@ public:
SubmitZoneEndGpu(m_ctx, m_query.idx + 1); SubmitZoneEndGpu(m_ctx, m_query.idx + 1);
} }
TracyMetalZoneScopeWireTap; TracyMetalDebugZoneScopeWireTap;
private: private:
const bool m_active; const bool m_active;
MetalCtx* m_ctx; MetalCtx* m_ctx;
/* TODO: declare it for "command-level" profiling
id<MTLComputeCommandEncoder> m_cmdEncoder; id<MTLComputeCommandEncoder> m_cmdEncoder;
*/
static void SubmitZoneBeginGpu(MetalCtx* ctx, uint32_t queryId, const SourceLocationData* srcloc) static void SubmitZoneBeginGpu(MetalCtx* ctx, uint32_t queryId, const SourceLocationData* srcloc)
{ {