[MCA] Enhance debug prints of processor resources (#190132)

Previously, `computeProcResourceMasks()` would print resource masks in
debug mode from multiple call sites, creating noise in the debug output.
This patch aims to fix this and also print more info about the
resources.

It splits the debug prints for resources into 2 types:

1. No simulation - mask only
2. Simulation - mask + other info

For 2, it prints from a single place, the `ResourceManager`
constructor, which should cover all the other simulation cases
indirectly:

1. `llvm/lib/MCA/HardwareUnits/ResourceManager` - covered
2. `llvm/lib/MCA/InstrBuilder.cpp` - should be covered indirectly - only
used by `llvm-mca` before simulation that constructs a `ResourceManager`
3. `llvm/tools/llvm-mca/Views/SummaryView.cpp` - after simulation that
constructs a `ResourceManager`
4. `llvm/tools/llvm-mca/Views/BottleneckAnalysis.cpp` - after simulation
that constructs a `ResourceManager`

It also adds `BufferSize` to the output, which should be useful to debug
scheduling model + MCA integration.

For 1, it inlines mask-only printing into 2 other callers:

1. `llvm/include/llvm/MCA/Stages/InstructionTables.h`
2. `llvm/tools/llvm-exegesis/lib/SchedClassResolution.cpp`

as they only use the masks there. I think this is a reasonable
duplication across distinguishably different users/tools.

Now every pair of callers, even across groups (1 and 2), effectively
prints in a mutually exclusive way.

The patch adds debug tests for the 3 new callers, in the corresponding
root test directories, to encourage placing future logically
target-independent tests that just require some target at the root. I
think this convention is more discoverable, and is pretty widely used in
the project.
This commit is contained in:
Tomer Shafir 2026-04-06 20:27:18 +03:00 committed by GitHub
parent 72d4ce9889
commit 37801e9e99
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 156 additions and 9 deletions

View File

@ -22,6 +22,9 @@
#include "llvm/MCA/Stages/Stage.h"
#include "llvm/MCA/Support.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#define DEBUG_TYPE "llvm-mca"
namespace llvm {
namespace mca {
@ -35,6 +38,7 @@ public:
// Constructs the stage from the scheduling model `Model`. `Masks` is sized to
// one entry per processor resource kind and then filled in by
// computeProcResourceMasks().
InstructionTables(const MCSchedModel &Model)
: SM(Model), Masks(Model.getNumProcResourceKinds()) {
computeProcResourceMasks(Model, Masks);
// Mask-only dump of the table computed above; wrapped in LLVM_DEBUG so it is
// only emitted as debug output.
LLVM_DEBUG(dumpProcResourceMasks(Model, Masks));
}
bool hasWorkToComplete() const override { return false; }

View File

@ -96,6 +96,11 @@ public:
LLVM_ABI void computeProcResourceMasks(const MCSchedModel &SM,
MutableArrayRef<uint64_t> Masks);
#ifndef NDEBUG
// Debug-only helper (declared only when NDEBUG is not defined). Writes a
// "Processor resource masks:" header to dbgs(), followed by one line per
// processor resource kind in SM: its index, the corresponding entry of Masks
// in hexadecimal, and the resource name.
LLVM_ABI void dumpProcResourceMasks(const MCSchedModel &SM,
ArrayRef<uint64_t> Masks);
#endif
// Returns the index of the highest bit set. For resource masks, the position of
// the highest bit set can be used to construct a resource mask identifier.
inline unsigned getResourceStateIndex(uint64_t Mask) {

View File

@ -135,6 +135,24 @@ ResourceManager::ResourceManager(const MCSchedModel &SM)
Strategies[Index] = getStrategyFor(*Resources[Index]);
}
// Print static resource information on debug mode
LLVM_DEBUG({
dbgs() << "\nProcessor resources:\n";
// Print InvalidUnit first to be consistent with scheduling model indexing
// schema
const MCProcResourceDesc &InvalidUnit = *SM.getProcResource(0);
dbgs() << "[ 0] - " << format_hex(ProcResID2Mask[0], 16) << " - "
<< InvalidUnit.Name << "\n";
for (unsigned I = 0, E = Resources.size(); I < E; ++I) {
const ResourceState &RS = *Resources[I];
const unsigned ProcResID = RS.getProcResourceID();
const MCProcResourceDesc &Desc = *SM.getProcResource(ProcResID);
dbgs() << '[' << format_decimal(ProcResID, 2) << "] "
<< " - " << format_hex(RS.getResourceMask(), 16) << " - "
<< Desc.Name << " (BufferSize=" << RS.getBufferSize() << ")\n";
}
});
for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
uint64_t Mask = ProcResID2Mask[I];
unsigned Index = getResourceStateIndex(Mask);

View File

@ -14,6 +14,7 @@
#include "llvm/MCA/Support.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Support/Debug.h"
#include <numeric>
namespace llvm {
@ -67,17 +68,19 @@ void computeProcResourceMasks(const MCSchedModel &SM,
}
ProcResourceID++;
}
LLVM_DEBUG({
dbgs() << "\nProcessor resource masks:\n";
for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
const MCProcResourceDesc &Desc = *SM.getProcResource(I);
dbgs() << '[' << format_decimal(I, 2) << "] " << " - "
<< format_hex(Masks[I], 16) << " - " << Desc.Name << '\n';
}
});
}
#ifndef NDEBUG
void dumpProcResourceMasks(const MCSchedModel &SM, ArrayRef<uint64_t> Masks) {
dbgs() << "\nProcessor resource masks:\n";
for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
const MCProcResourceDesc &Desc = *SM.getProcResource(I);
dbgs() << '[' << format_decimal(I, 2) << "] " << " - "
<< format_hex(Masks[I], 16) << " - " << Desc.Name << '\n';
}
}
#endif
double computeBlockRThroughput(const MCSchedModel &SM, unsigned DispatchWidth,
unsigned NumMicroOps,
ArrayRef<unsigned> ProcResourceUsage) {

View File

@ -0,0 +1,55 @@
# REQUIRES: asserts
# REQUIRES: aarch64-registered-target
# RUN: llvm-exegesis -mode=analysis -benchmarks-file=%s -analysis-inconsistencies-output-file=/dev/null -analysis-numpoints=1 --debug-only=exegesis-sched-class-resolution 2>&1 | FileCheck %s
## Do not print detailed processor resources information without simulation
# CHECK-NOT: Processor resources:
## Print mask-only information without simulation
# CHECK-COUNT-1: Processor resource masks:
# CHECK-NEXT: [ 0] - 0x00000000000000 - InvalidUnit
# CHECK-NEXT: [ 1] - 0x00000000000001 - V2UnitB
# CHECK-NEXT: [ 2] - 0x00000000000002 - V2UnitD
# CHECK-NEXT: [ 3] - 0x000000000081e0 - V2UnitF
# CHECK-NEXT: [ 4] - 0x00000000000004 - V2UnitFlg
# CHECK-NEXT: [ 5] - 0x000000000107e0 - V2UnitI
# CHECK-NEXT: [ 6] - 0x00000000020018 - V2UnitL
# CHECK-NEXT: [ 7] - 0x00000000000008 - V2UnitL2
# CHECK-NEXT: [ 8] - 0x00000000000010 - V2UnitL01
# CHECK-NEXT: [ 9] - 0x00000000040060 - V2UnitM
# CHECK-NEXT: [10] - 0x00000000000020 - V2UnitM0
# CHECK-NEXT: [11] - 0x00000000000040 - V2UnitM1
# CHECK-NEXT: [12] - 0x00000000080180 - V2UnitR
# CHECK-NEXT: [13] - 0x00000000100780 - V2UnitS
# CHECK-NEXT: [14] - 0x00000000000080 - V2UnitS0
# CHECK-NEXT: [15] - 0x00000000000100 - V2UnitS1
# CHECK-NEXT: [16] - 0x00000000000200 - V2UnitS2
# CHECK-NEXT: [17] - 0x00000000000400 - V2UnitS3
# CHECK-NEXT: [18] - 0x00000000207800 - V2UnitV
# CHECK-NEXT: [19] - 0x00000000000800 - V2UnitV0
# CHECK-NEXT: [20] - 0x00000000001000 - V2UnitV1
# CHECK-NEXT: [21] - 0x00000000002000 - V2UnitV2
# CHECK-NEXT: [22] - 0x00000000004000 - V2UnitV3
# CHECK-NEXT: [23] - 0x00000000401800 - V2UnitV01
# CHECK-NEXT: [24] - 0x00000000802800 - V2UnitV02
# CHECK-NEXT: [25] - 0x00000001005000 - V2UnitV13
# CHECK-NEXT: [26] - 0x00000002006000 - V2UnitV23
---
mode: latency
key:
instructions:
- 'ADDVv4i16v H16 D16'
config: ''
register_initial_values:
- 'D16=0x0'
cpu_name: neoverse-v2
llvm_triple: aarch64
min_instructions: 100
measurements:
- { key: latency, value: 1.0, per_snippet_value: 1.0 }
error: ''
info: Repeating a single explicitly serial instruction
assembled_snippet: 10E4002F10BA710E10BA710E10BA710E10BA710EC0035FD6
...

View File

@ -0,0 +1,29 @@
# REQUIRES: asserts
# REQUIRES: aarch64-registered-target
# RUN: llvm-mca < %s -mtriple=aarch64 -mcpu=apple-m1 -debug -instruction-tables 2>&1 | FileCheck %s
# LLVM-MCA-BEGIN foo
add x2, x0, x1
# LLVM-MCA-END
## Do not print detailed processor resources information without simulation
# CHECK-NOT: Processor resources:
## Print mask-only information without simulation
# CHECK-COUNT-1: Processor resource masks:
# CHECK-NEXT: [ 0] - 0x00000000000000 - InvalidUnit
# CHECK-NEXT: [ 1] - 0x00000000000001 - CyUnitB
# CHECK-NEXT: [ 2] - 0x00000000000002 - CyUnitBR
# CHECK-NEXT: [ 3] - 0x00000000000004 - CyUnitFloatDiv
# CHECK-NEXT: [ 4] - 0x00000000000008 - CyUnitI
# CHECK-NEXT: [ 5] - 0x00000000000010 - CyUnitID
# CHECK-NEXT: [ 6] - 0x00000000000020 - CyUnitIM
# CHECK-NEXT: [ 7] - 0x00000000000040 - CyUnitIS
# CHECK-NEXT: [ 8] - 0x00000000000080 - CyUnitIntDiv
# CHECK-NEXT: [ 9] - 0x00000000000100 - CyUnitLS
# CHECK-NEXT: [10] - 0x00000000000200 - CyUnitV
# CHECK-NEXT: [11] - 0x00000000000400 - CyUnitVC
# CHECK-NEXT: [12] - 0x00000000000800 - CyUnitVD
# CHECK-NEXT: [13] - 0x00000000001000 - CyUnitVM
# CHECK: [0] Code Region - foo

View File

@ -0,0 +1,29 @@
# REQUIRES: asserts
# REQUIRES: aarch64-registered-target
# RUN: llvm-mca < %s -mtriple=aarch64 -mcpu=apple-m1 -debug 2>&1 | FileCheck %s
# LLVM-MCA-BEGIN foo
add x2, x0, x1
# LLVM-MCA-END
## Print detailed processor resources information on simulation
# CHECK-COUNT-1: Processor resources:
# CHECK-NEXT: [ 0] - 0x00000000000000 - InvalidUnit
# CHECK-NEXT: [ 1] - 0x00000000000001 - CyUnitB (BufferSize=24)
# CHECK-NEXT: [ 2] - 0x00000000000002 - CyUnitBR (BufferSize=-1)
# CHECK-NEXT: [ 3] - 0x00000000000004 - CyUnitFloatDiv (BufferSize=-1)
# CHECK-NEXT: [ 4] - 0x00000000000008 - CyUnitI (BufferSize=48)
# CHECK-NEXT: [ 5] - 0x00000000000010 - CyUnitID (BufferSize=16)
# CHECK-NEXT: [ 6] - 0x00000000000020 - CyUnitIM (BufferSize=32)
# CHECK-NEXT: [ 7] - 0x00000000000040 - CyUnitIS (BufferSize=24)
# CHECK-NEXT: [ 8] - 0x00000000000080 - CyUnitIntDiv (BufferSize=-1)
# CHECK-NEXT: [ 9] - 0x00000000000100 - CyUnitLS (BufferSize=28)
# CHECK-NEXT: [10] - 0x00000000000200 - CyUnitV (BufferSize=48)
# CHECK-NEXT: [11] - 0x00000000000400 - CyUnitVC (BufferSize=16)
# CHECK-NEXT: [12] - 0x00000000000800 - CyUnitVD (BufferSize=16)
# CHECK-NEXT: [13] - 0x00000000001000 - CyUnitVM (BufferSize=32)
# CHECK: [0] Code Region - foo
## Do not print mask-only information on simulation
# CHECK-NOT: Processor resource masks:

View File

@ -11,9 +11,12 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MCA/Support.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FormatVariadic.h"
#include <vector>
#define DEBUG_TYPE "exegesis-sched-class-resolution"
namespace llvm {
namespace exegesis {
@ -55,6 +58,7 @@ getNonRedundantWriteProcRes(const MCSchedClassDesc &SCDesc,
// Collect resource masks.
SmallVector<uint64_t> ProcResourceMasks(NumProcRes);
mca::computeProcResourceMasks(SM, ProcResourceMasks);
LLVM_DEBUG(mca::dumpProcResourceMasks(SM, ProcResourceMasks));
// Sort entries by smaller resources for (basic) topological ordering.
using ResourceMaskAndEntry = std::pair<uint64_t, const MCWriteProcResEntry *>;