From 37801e9e99c8b8aeb2c94f128d231ed7eee601a6 Mon Sep 17 00:00:00 2001 From: Tomer Shafir Date: Mon, 6 Apr 2026 20:27:18 +0300 Subject: [PATCH] [MCA] Enhance debug prints of processor resources (#190132) Previously, `computeProcResourceMasks()` would print resource masks in debug mode from multiple call sites, creating noise in the debug output. This patch aims to fix this and also print more info about the resources. It splits the debug prints for resources into 2 types: 1. No simulation - mask only 2. Simulation - mask + other info For 2, it consolidates printing in a single place, the `ResourceManager` constructor, which should cover all the other simulation cases indirectly: 1. `llvm/lib/MCA/HardwareUnits/ResourceManager.cpp` - covered 2. `llvm/lib/MCA/InstrBuilder.cpp` - should be covered indirectly - only used by `llvm-mca` before simulation that constructs a `ResourceManager` 3. `llvm/tools/llvm-mca/Views/SummaryView.cpp` - after simulation that constructs a `ResourceManager` 4. `llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp` - after simulation that constructs a `ResourceManager` It also adds `BufferSize` to the output, which should be useful for debugging scheduling model + MCA integration. For 1, it inlines mask-only printing into 2 other callers: 1. `llvm/include/llvm/MCA/Stages/InstructionTables.h` 2. `llvm/tools/llvm-exegesis/lib/SchedClassResolution.cpp` as they only use the masks there. I think this is a reasonable duplication across distinguishably different users/tools. Now every pair of callers, even across groups (1 and 2), effectively prints in a mutually exclusive way. The patch adds debug tests for the 3 new callers, in the corresponding root test directories, to encourage placing logically target-independent tests that just require some target at the root. I think this convention is more discoverable, and is pretty widely used in the project. 
--- .../llvm/MCA/Stages/InstructionTables.h | 4 ++ llvm/include/llvm/MCA/Support.h | 5 ++ .../lib/MCA/HardwareUnits/ResourceManager.cpp | 18 ++++++ llvm/lib/MCA/Support.cpp | 21 ++++--- ...alysis-processor-resource-masks-debug.test | 55 +++++++++++++++++++ .../llvm-mca/processor-resource-masks-debug.s | 29 ++++++++++ .../llvm-mca/processor-resources-debug.s | 29 ++++++++++ .../lib/SchedClassResolution.cpp | 4 ++ 8 files changed, 156 insertions(+), 9 deletions(-) create mode 100644 llvm/test/tools/llvm-exegesis/analysis-processor-resource-masks-debug.test create mode 100644 llvm/test/tools/llvm-mca/processor-resource-masks-debug.s create mode 100644 llvm/test/tools/llvm-mca/processor-resources-debug.s diff --git a/llvm/include/llvm/MCA/Stages/InstructionTables.h b/llvm/include/llvm/MCA/Stages/InstructionTables.h index 7a96e82dd995..8db2a6b0f14d 100644 --- a/llvm/include/llvm/MCA/Stages/InstructionTables.h +++ b/llvm/include/llvm/MCA/Stages/InstructionTables.h @@ -22,6 +22,9 @@ #include "llvm/MCA/Stages/Stage.h" #include "llvm/MCA/Support.h" #include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "llvm-mca" namespace llvm { namespace mca { @@ -35,6 +38,7 @@ public: InstructionTables(const MCSchedModel &Model) : SM(Model), Masks(Model.getNumProcResourceKinds()) { computeProcResourceMasks(Model, Masks); + LLVM_DEBUG(dumpProcResourceMasks(Model, Masks)); } bool hasWorkToComplete() const override { return false; } diff --git a/llvm/include/llvm/MCA/Support.h b/llvm/include/llvm/MCA/Support.h index ce2ac9b4b6cd..6875787833d9 100644 --- a/llvm/include/llvm/MCA/Support.h +++ b/llvm/include/llvm/MCA/Support.h @@ -96,6 +96,11 @@ public: LLVM_ABI void computeProcResourceMasks(const MCSchedModel &SM, MutableArrayRef Masks); +#ifndef NDEBUG +LLVM_ABI void dumpProcResourceMasks(const MCSchedModel &SM, + ArrayRef Masks); +#endif + // Returns the index of the highest bit set. 
For resource masks, the position of // the highest bit set can be used to construct a resource mask identifier. inline unsigned getResourceStateIndex(uint64_t Mask) { diff --git a/llvm/lib/MCA/HardwareUnits/ResourceManager.cpp b/llvm/lib/MCA/HardwareUnits/ResourceManager.cpp index 0429f7b6970d..7671348ff29d 100644 --- a/llvm/lib/MCA/HardwareUnits/ResourceManager.cpp +++ b/llvm/lib/MCA/HardwareUnits/ResourceManager.cpp @@ -135,6 +135,24 @@ ResourceManager::ResourceManager(const MCSchedModel &SM) Strategies[Index] = getStrategyFor(*Resources[Index]); } + // Print static resource information on debug mode + LLVM_DEBUG({ + dbgs() << "\nProcessor resources:\n"; + // Print InvalidUnit first to be consistent with scheduling model indexing + // schema + const MCProcResourceDesc &InvalidUnit = *SM.getProcResource(0); + dbgs() << "[ 0] - " << format_hex(ProcResID2Mask[0], 16) << " - " + << InvalidUnit.Name << "\n"; + for (unsigned I = 0, E = Resources.size(); I < E; ++I) { + const ResourceState &RS = *Resources[I]; + const unsigned ProcResID = RS.getProcResourceID(); + const MCProcResourceDesc &Desc = *SM.getProcResource(ProcResID); + dbgs() << '[' << format_decimal(ProcResID, 2) << "] " + << " - " << format_hex(RS.getResourceMask(), 16) << " - " + << Desc.Name << " (BufferSize=" << RS.getBufferSize() << ")\n"; + } + }); + for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) { uint64_t Mask = ProcResID2Mask[I]; unsigned Index = getResourceStateIndex(Mask); diff --git a/llvm/lib/MCA/Support.cpp b/llvm/lib/MCA/Support.cpp index 1f1f2ab8d2c3..45459f7a3a72 100644 --- a/llvm/lib/MCA/Support.cpp +++ b/llvm/lib/MCA/Support.cpp @@ -14,6 +14,7 @@ #include "llvm/MCA/Support.h" #include "llvm/MC/MCSchedule.h" +#include "llvm/Support/Debug.h" #include namespace llvm { @@ -67,17 +68,19 @@ void computeProcResourceMasks(const MCSchedModel &SM, } ProcResourceID++; } - - LLVM_DEBUG({ - dbgs() << "\nProcessor resource masks:\n"; - for (unsigned I = 0, E = 
SM.getNumProcResourceKinds(); I < E; ++I) { - const MCProcResourceDesc &Desc = *SM.getProcResource(I); - dbgs() << '[' << format_decimal(I, 2) << "] " << " - " - << format_hex(Masks[I], 16) << " - " << Desc.Name << '\n'; - } - }); } +#ifndef NDEBUG +void dumpProcResourceMasks(const MCSchedModel &SM, ArrayRef Masks) { + dbgs() << "\nProcessor resource masks:\n"; + for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) { + const MCProcResourceDesc &Desc = *SM.getProcResource(I); + dbgs() << '[' << format_decimal(I, 2) << "] " << " - " + << format_hex(Masks[I], 16) << " - " << Desc.Name << '\n'; + } +} +#endif + double computeBlockRThroughput(const MCSchedModel &SM, unsigned DispatchWidth, unsigned NumMicroOps, ArrayRef ProcResourceUsage) { diff --git a/llvm/test/tools/llvm-exegesis/analysis-processor-resource-masks-debug.test b/llvm/test/tools/llvm-exegesis/analysis-processor-resource-masks-debug.test new file mode 100644 index 000000000000..fa97d8fa9cd6 --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/analysis-processor-resource-masks-debug.test @@ -0,0 +1,55 @@ +# REQUIRES: asserts +# REQUIRES: aarch64-registered-target + +# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-inconsistencies-output-file=/dev/null -analysis-numpoints=1 --debug-only=exegesis-sched-class-resolution 2>&1 | FileCheck %s + +## Do not print detailed processor resources information without simulation +# CHECK-NOT: Processor resources: + +## Print mask-only information without simulation +# CHECK-COUNT-1: Processor resource masks: +# CHECK-NEXT: [ 0] - 0x00000000000000 - InvalidUnit +# CHECK-NEXT: [ 1] - 0x00000000000001 - V2UnitB +# CHECK-NEXT: [ 2] - 0x00000000000002 - V2UnitD +# CHECK-NEXT: [ 3] - 0x000000000081e0 - V2UnitF +# CHECK-NEXT: [ 4] - 0x00000000000004 - V2UnitFlg +# CHECK-NEXT: [ 5] - 0x000000000107e0 - V2UnitI +# CHECK-NEXT: [ 6] - 0x00000000020018 - V2UnitL +# CHECK-NEXT: [ 7] - 0x00000000000008 - V2UnitL2 +# CHECK-NEXT: [ 8] - 0x00000000000010 - 
V2UnitL01 +# CHECK-NEXT: [ 9] - 0x00000000040060 - V2UnitM +# CHECK-NEXT: [10] - 0x00000000000020 - V2UnitM0 +# CHECK-NEXT: [11] - 0x00000000000040 - V2UnitM1 +# CHECK-NEXT: [12] - 0x00000000080180 - V2UnitR +# CHECK-NEXT: [13] - 0x00000000100780 - V2UnitS +# CHECK-NEXT: [14] - 0x00000000000080 - V2UnitS0 +# CHECK-NEXT: [15] - 0x00000000000100 - V2UnitS1 +# CHECK-NEXT: [16] - 0x00000000000200 - V2UnitS2 +# CHECK-NEXT: [17] - 0x00000000000400 - V2UnitS3 +# CHECK-NEXT: [18] - 0x00000000207800 - V2UnitV +# CHECK-NEXT: [19] - 0x00000000000800 - V2UnitV0 +# CHECK-NEXT: [20] - 0x00000000001000 - V2UnitV1 +# CHECK-NEXT: [21] - 0x00000000002000 - V2UnitV2 +# CHECK-NEXT: [22] - 0x00000000004000 - V2UnitV3 +# CHECK-NEXT: [23] - 0x00000000401800 - V2UnitV01 +# CHECK-NEXT: [24] - 0x00000000802800 - V2UnitV02 +# CHECK-NEXT: [25] - 0x00000001005000 - V2UnitV13 +# CHECK-NEXT: [26] - 0x00000002006000 - V2UnitV23 + +--- +mode: latency +key: + instructions: + - 'ADDVv4i16v H16 D16' + config: '' + register_initial_values: + - 'D16=0x0' +cpu_name: neoverse-v2 +llvm_triple: aarch64 +min_instructions: 100 +measurements: + - { key: latency, value: 1.0, per_snippet_value: 1.0 } +error: '' +info: Repeating a single explicitly serial instruction +assembled_snippet: 10E4002F10BA710E10BA710E10BA710E10BA710EC0035FD6 +... 
diff --git a/llvm/test/tools/llvm-mca/processor-resource-masks-debug.s b/llvm/test/tools/llvm-mca/processor-resource-masks-debug.s new file mode 100644 index 000000000000..afb23fb3fdb9 --- /dev/null +++ b/llvm/test/tools/llvm-mca/processor-resource-masks-debug.s @@ -0,0 +1,29 @@ +# REQUIRES: asserts +# REQUIRES: aarch64-registered-target + +# RUN: llvm-mca < %s -mtriple=aarch64 -mcpu=apple-m1 -debug -instruction-tables 2>&1 | FileCheck %s + +# LLVM-MCA-BEGIN foo +add x2, x0, x1 +# LLVM-MCA-END + +## Do not print detailed processor resources information without simulation +# CHECK-NOT: Processor resources: + +## Print mask-only information without simulation +# CHECK-COUNT-1: Processor resource masks: +# CHECK-NEXT: [ 0] - 0x00000000000000 - InvalidUnit +# CHECK-NEXT: [ 1] - 0x00000000000001 - CyUnitB +# CHECK-NEXT: [ 2] - 0x00000000000002 - CyUnitBR +# CHECK-NEXT: [ 3] - 0x00000000000004 - CyUnitFloatDiv +# CHECK-NEXT: [ 4] - 0x00000000000008 - CyUnitI +# CHECK-NEXT: [ 5] - 0x00000000000010 - CyUnitID +# CHECK-NEXT: [ 6] - 0x00000000000020 - CyUnitIM +# CHECK-NEXT: [ 7] - 0x00000000000040 - CyUnitIS +# CHECK-NEXT: [ 8] - 0x00000000000080 - CyUnitIntDiv +# CHECK-NEXT: [ 9] - 0x00000000000100 - CyUnitLS +# CHECK-NEXT: [10] - 0x00000000000200 - CyUnitV +# CHECK-NEXT: [11] - 0x00000000000400 - CyUnitVC +# CHECK-NEXT: [12] - 0x00000000000800 - CyUnitVD +# CHECK-NEXT: [13] - 0x00000000001000 - CyUnitVM +# CHECK: [0] Code Region - foo diff --git a/llvm/test/tools/llvm-mca/processor-resources-debug.s b/llvm/test/tools/llvm-mca/processor-resources-debug.s new file mode 100644 index 000000000000..632516b2c2b1 --- /dev/null +++ b/llvm/test/tools/llvm-mca/processor-resources-debug.s @@ -0,0 +1,29 @@ +# REQUIRES: asserts +# REQUIRES: aarch64-registered-target + +# RUN: llvm-mca < %s -mtriple=aarch64 -mcpu=apple-m1 -debug 2>&1 | FileCheck %s + +# LLVM-MCA-BEGIN foo +add x2, x0, x1 +# LLVM-MCA-END + +## Print detailed processor resources information on simulation +# 
CHECK-COUNT-1: Processor resources: +# CHECK-NEXT: [ 0] - 0x00000000000000 - InvalidUnit +# CHECK-NEXT: [ 1] - 0x00000000000001 - CyUnitB (BufferSize=24) +# CHECK-NEXT: [ 2] - 0x00000000000002 - CyUnitBR (BufferSize=-1) +# CHECK-NEXT: [ 3] - 0x00000000000004 - CyUnitFloatDiv (BufferSize=-1) +# CHECK-NEXT: [ 4] - 0x00000000000008 - CyUnitI (BufferSize=48) +# CHECK-NEXT: [ 5] - 0x00000000000010 - CyUnitID (BufferSize=16) +# CHECK-NEXT: [ 6] - 0x00000000000020 - CyUnitIM (BufferSize=32) +# CHECK-NEXT: [ 7] - 0x00000000000040 - CyUnitIS (BufferSize=24) +# CHECK-NEXT: [ 8] - 0x00000000000080 - CyUnitIntDiv (BufferSize=-1) +# CHECK-NEXT: [ 9] - 0x00000000000100 - CyUnitLS (BufferSize=28) +# CHECK-NEXT: [10] - 0x00000000000200 - CyUnitV (BufferSize=48) +# CHECK-NEXT: [11] - 0x00000000000400 - CyUnitVC (BufferSize=16) +# CHECK-NEXT: [12] - 0x00000000000800 - CyUnitVD (BufferSize=16) +# CHECK-NEXT: [13] - 0x00000000001000 - CyUnitVM (BufferSize=32) +# CHECK: [0] Code Region - foo + +## Do not print mask-only information on simulation +# CHECK-NOT: Processor resource masks: diff --git a/llvm/tools/llvm-exegesis/lib/SchedClassResolution.cpp b/llvm/tools/llvm-exegesis/lib/SchedClassResolution.cpp index d6dfb65bf82e..788a03b55c5d 100644 --- a/llvm/tools/llvm-exegesis/lib/SchedClassResolution.cpp +++ b/llvm/tools/llvm-exegesis/lib/SchedClassResolution.cpp @@ -11,9 +11,12 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MCA/Support.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/FormatVariadic.h" #include +#define DEBUG_TYPE "exegesis-sched-class-resolution" + namespace llvm { namespace exegesis { @@ -55,6 +58,7 @@ getNonRedundantWriteProcRes(const MCSchedClassDesc &SCDesc, // Collect resource masks. SmallVector ProcResourceMasks(NumProcRes); mca::computeProcResourceMasks(SM, ProcResourceMasks); + LLVM_DEBUG(mca::dumpProcResourceMasks(SM, ProcResourceMasks)); // Sort entries by smaller resources for (basic) topological ordering. 
using ResourceMaskAndEntry = std::pair;