
Skip creating a module if no function is going to be imported into it. Also includes a change so that if the first partition is empty (which can happen), globals with non-local linkage are imported into the first non-empty partition instead of always going into P0. I thought we'd need to change users of the SplitModule callback so they can deal with fewer modules than the number requested, but no: we already return only one module in some cases and that seems to be handled just fine. Fixes SWDEV-523146
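For illustration, here is a minimal sketch (a hypothetical caller, not part of this change) of how a consumer typically drives the callback; since it just collects whatever modules it is handed, receiving fewer modules than the number of partitions requested needs no interface change:

    // Hypothetical consumer of the splitting callback. With this change the
    // callback is only invoked for non-empty partitions, so Parts.size() may
    // end up smaller than NumParts.
    SmallVector<std::unique_ptr<Module>> Parts;
    splitAMDGPUModule(GetTTI, M, NumParts,
                      [&](std::unique_ptr<Module> MPart) {
                        Parts.push_back(std::move(MPart));
                      });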
//===- AMDGPUSplitModule.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file Implements a module splitting algorithm designed to support the
/// FullLTO --lto-partitions option for parallel codegen.
///
/// The role of this module splitting pass is the same as
/// lib/Transforms/Utils/SplitModule.cpp: load-balance the module's functions
/// across a set of N partitions to allow for parallel codegen.
///
/// The similarities mostly end here, as this pass achieves load-balancing in a
/// more elaborate fashion which is targeted towards AMDGPU modules. It can take
/// advantage of the structure of AMDGPU modules (which are mostly
/// self-contained) to allow for more efficient splitting without affecting
/// codegen negatively, or causing inaccurate resource usage analysis.
///
/// High-level pass overview:
///  - SplitGraph & associated classes
///     - Graph representation of the module and of the dependencies that
///       matter for splitting.
///  - RecursiveSearchSplitting
///     - Core splitting algorithm.
///  - SplitProposal
///     - Represents a suggested solution for splitting the input module. These
///       solutions can be scored to determine the best one when multiple
///       solutions are available.
///  - Driver/pass "run" function glues everything together.

#include "AMDGPUSplitModule.h"
#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <cassert>
#include <cmath>
#include <memory>
#include <utility>
#include <vector>

#ifndef NDEBUG
#include "llvm/Support/LockFileManager.h"
#endif

#define DEBUG_TYPE "amdgpu-split-module"

namespace llvm {
namespace {

static cl::opt<unsigned> MaxDepth(
    "amdgpu-module-splitting-max-depth",
    cl::desc(
        "maximum search depth. 0 forces a greedy approach. "
        "warning: the algorithm is up to O(2^N), where N is the max depth."),
    cl::init(8));

static cl::opt<float> LargeFnFactor(
    "amdgpu-module-splitting-large-threshold", cl::init(2.0f), cl::Hidden,
    cl::desc(
        "when max depth is reached and we can no longer branch out, this "
        "value determines if a function is worth merging into an already "
        "existing partition to reduce code duplication. This is a factor "
        "of the ideal partition size, e.g. 2.0 means we consider the "
        "function for merging if its cost (including its callees) is 2x the "
        "size of an ideal partition."));

static cl::opt<float> LargeFnOverlapForMerge(
    "amdgpu-module-splitting-merge-threshold", cl::init(0.7f), cl::Hidden,
    cl::desc("when a function is considered for merging into a partition that "
             "already contains some of its callees, do the merge if at least "
             "n% of the code it can reach is already present inside the "
             "partition; e.g. 0.7 means only merge >70%"));

static cl::opt<bool> NoExternalizeGlobals(
    "amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
    cl::desc("disables externalization of global variables with local "
             "linkage; may cause globals to be duplicated which increases "
             "binary size"));

static cl::opt<bool> NoExternalizeOnAddrTaken(
    "amdgpu-module-splitting-no-externalize-address-taken", cl::Hidden,
    cl::desc(
        "disables externalization of functions whose addresses are taken"));

static cl::opt<std::string>
    ModuleDotCfgOutput("amdgpu-module-splitting-print-module-dotcfg",
                       cl::Hidden,
                       cl::desc("output file to write out the dotgraph "
                                "representation of the input module"));

static cl::opt<std::string> PartitionSummariesOutput(
    "amdgpu-module-splitting-print-partition-summaries", cl::Hidden,
    cl::desc("output file to write out a summary of "
             "the partitions created for each module"));

#ifndef NDEBUG
static cl::opt<bool>
    UseLockFile("amdgpu-module-splitting-serial-execution", cl::Hidden,
                cl::desc("use a lock file so only one process in the system "
                         "can run this pass at once. useful to avoid mangled "
                         "debug output in multithreaded environments."));

static cl::opt<bool>
    DebugProposalSearch("amdgpu-module-splitting-debug-proposal-search",
                        cl::Hidden,
                        cl::desc("print all proposals received and whether "
                                 "they were rejected or accepted"));
#endif

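/// Convenience timer wrapper: groups this pass's timing regions under the
/// "AMDGPU Module Splitting" timer group, and only records them when
/// TimePassesIsEnabled (-time-passes) is set.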
struct SplitModuleTimer : NamedRegionTimer {
  SplitModuleTimer(StringRef Name, StringRef Desc)
      : NamedRegionTimer(Name, Desc, DEBUG_TYPE, "AMDGPU Module Splitting",
                         TimePassesIsEnabled) {}
};

//===----------------------------------------------------------------------===//
// Utils
//===----------------------------------------------------------------------===//

using CostType = InstructionCost::CostType;
using FunctionsCostMap = DenseMap<const Function *, CostType>;
using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;
static constexpr unsigned InvalidPID = -1;

/// \param Num numerator
/// \param Dem denominator
/// \returns a printable object to print (Num/Dem) as a percentage using
/// "%0.2f".
static auto formatRatioOf(CostType Num, CostType Dem) {
  CostType DemOr1 = Dem ? Dem : 1;
  return format("%0.2f", (static_cast<double>(Num) / DemOr1) * 100);
}

/// Checks whether a given function is non-copyable.
///
/// Non-copyable functions cannot be cloned into multiple partitions, and only
/// one copy of the function can be present across all partitions.
///
/// Kernel functions and external functions fall into this category. If we were
/// to clone them, we would end up with multiple symbol definitions and a very
/// unhappy linker.
static bool isNonCopyable(const Function &F) {
  return F.hasExternalLinkage() || !F.isDefinitionExact() ||
         AMDGPU::isEntryFunctionCC(F.getCallingConv());
}

/// If \p GV has local linkage, make it external + hidden.
static void externalize(GlobalValue &GV) {
  if (GV.hasLocalLinkage()) {
    GV.setLinkage(GlobalValue::ExternalLinkage);
    GV.setVisibility(GlobalValue::HiddenVisibility);
  }

  // Unnamed entities must be named consistently between modules. setName will
  // give a distinct name to each such entity.
  if (!GV.hasName())
    GV.setName("__llvmsplit_unnamed");
}

/// Cost analysis function. Calculates the cost of each function in \p M
///
/// \param GetTTI Abstract getter for TargetTransformInfo.
/// \param M Module to analyze.
/// \param CostMap[out] Resulting Function -> Cost map.
/// \return The module's total cost.
static CostType calculateFunctionCosts(GetTTIFn GetTTI, Module &M,
                                       FunctionsCostMap &CostMap) {
  SplitModuleTimer SMT("calculateFunctionCosts", "cost analysis");

  LLVM_DEBUG(dbgs() << "[cost analysis] calculating function costs\n");
  CostType ModuleCost = 0;
  [[maybe_unused]] CostType KernelCost = 0;

  for (auto &Fn : M) {
    if (Fn.isDeclaration())
      continue;

    CostType FnCost = 0;
    const auto &TTI = GetTTI(Fn);
    for (const auto &BB : Fn) {
      for (const auto &I : BB) {
        auto Cost =
            TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
        assert(Cost != InstructionCost::getMax());
        // Assume expensive if we can't tell the cost of an instruction.
        CostType CostVal = Cost.isValid()
                               ? Cost.getValue()
                               : (CostType)TargetTransformInfo::TCC_Expensive;
        assert((FnCost + CostVal) >= FnCost && "Overflow!");
        FnCost += CostVal;
      }
    }

    assert(FnCost != 0);

    CostMap[&Fn] = FnCost;
    assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
    ModuleCost += FnCost;

    if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv()))
      KernelCost += FnCost;
  }

  if (CostMap.empty())
    return 0;

  assert(ModuleCost);
  LLVM_DEBUG({
    const CostType FnCost = ModuleCost - KernelCost;
    dbgs() << " - total module cost is " << ModuleCost << ". kernels cost "
           << "" << KernelCost << " ("
           << format("%0.2f", (float(KernelCost) / ModuleCost) * 100)
           << "% of the module), functions cost " << FnCost << " ("
           << format("%0.2f", (float(FnCost) / ModuleCost) * 100)
           << "% of the module)\n";
  });

  return ModuleCost;
}

/// \return true if \p F can be indirectly called
static bool canBeIndirectlyCalled(const Function &F) {
  if (F.isDeclaration() || AMDGPU::isEntryFunctionCC(F.getCallingConv()))
    return false;
  return !F.hasLocalLinkage() ||
         F.hasAddressTaken(/*PutOffender=*/nullptr,
                           /*IgnoreCallbackUses=*/false,
                           /*IgnoreAssumeLikeCalls=*/true,
                           /*IgnoreLLVMUsed=*/true,
                           /*IgnoreARCAttachedCall=*/false,
                           /*IgnoreCastedDirectCall=*/true);
}

//===----------------------------------------------------------------------===//
// Graph-based Module Representation
//===----------------------------------------------------------------------===//

/// AMDGPUSplitModule's view of the source Module, as a graph of all components
/// that can be split into different modules.
///
/// The most trivial instance of this graph is just the CallGraph of the module,
/// but it is not guaranteed that the graph is strictly equal to the CG. It
/// currently always is but it's designed in a way that would eventually allow
/// us to create abstract nodes, or nodes for different entities such as global
/// variables or any other meaningful constraint we must consider.
///
/// The graph is only mutable by this class, and is generally not modified
/// after \ref SplitGraph::buildGraph runs. No consumers of the graph can
/// mutate it.
class SplitGraph {
public:
  class Node;

  enum class EdgeKind : uint8_t {
    /// The nodes are related through a direct call. This is a "strong" edge as
    /// it means the Src will directly reference the Dst.
    DirectCall,
    /// The nodes are related through an indirect call.
    /// This is a "weaker" edge and is only considered when traversing the graph
    /// starting from a kernel. We need this edge for resource usage analysis.
    ///
    /// The reason why we have this edge in the first place is due to how
    /// AMDGPUResourceUsageAnalysis works. In the presence of an indirect call,
    /// the resource usage of the kernel containing the indirect call is the
    /// max resource usage of all functions that can be indirectly called.
    IndirectCall,
  };

  /// An edge between two nodes. Edges are directional, and tagged with a
  /// "kind".
  struct Edge {
    Edge(Node *Src, Node *Dst, EdgeKind Kind)
        : Src(Src), Dst(Dst), Kind(Kind) {}

    Node *Src; ///< Source
    Node *Dst; ///< Destination
    EdgeKind Kind;
  };

  using EdgesVec = SmallVector<const Edge *, 0>;
  using edges_iterator = EdgesVec::const_iterator;
  using nodes_iterator = const Node *const *;

  SplitGraph(const Module &M, const FunctionsCostMap &CostMap,
             CostType ModuleCost)
      : M(M), CostMap(CostMap), ModuleCost(ModuleCost) {}

  void buildGraph(CallGraph &CG);

#ifndef NDEBUG
  bool verifyGraph() const;
#endif

  bool empty() const { return Nodes.empty(); }
  const iterator_range<nodes_iterator> nodes() const {
    return {Nodes.begin(), Nodes.end()};
  }
  const Node &getNode(unsigned ID) const { return *Nodes[ID]; }

  unsigned getNumNodes() const { return Nodes.size(); }
  BitVector createNodesBitVector() const { return BitVector(Nodes.size()); }

  const Module &getModule() const { return M; }

  CostType getModuleCost() const { return ModuleCost; }
  CostType getCost(const Function &F) const { return CostMap.at(&F); }

  /// \returns the aggregated cost of all nodes in \p BV (bits set to 1 = node
  /// IDs).
  CostType calculateCost(const BitVector &BV) const;

private:
  /// Retrieves the node for \p GV in \p Cache, or creates a new node for it and
  /// updates \p Cache.
  Node &getNode(DenseMap<const GlobalValue *, Node *> &Cache,
                const GlobalValue &GV);

  // Create a new edge between two nodes and add it to both nodes.
  const Edge &createEdge(Node &Src, Node &Dst, EdgeKind EK);

  const Module &M;
  const FunctionsCostMap &CostMap;
  CostType ModuleCost;

  // Final list of nodes with stable ordering.
  SmallVector<Node *> Nodes;

  SpecificBumpPtrAllocator<Node> NodesPool;

  // Edges are trivially destructible objects, so as a small optimization we
  // use a BumpPtrAllocator which avoids destructor calls but also makes
  // allocation faster.
  static_assert(
      std::is_trivially_destructible_v<Edge>,
      "Edge must be trivially destructible to use the BumpPtrAllocator");
  BumpPtrAllocator EdgesPool;
};

/// Nodes in the SplitGraph contain both incoming, and outgoing edges.
/// Incoming edges have this node as their Dst, and Outgoing ones have this node
/// as their Src.
///
/// Edge objects are shared by both nodes in Src/Dst. They provide immediate
/// feedback on how two nodes are related, and in which direction they are
/// related, which is valuable information to make splitting decisions.
///
/// Nodes are fundamentally abstract, and any consumers of the graph should
/// treat them as such. While a node will be a function most of the time, we
/// could also create nodes for any other reason. In the future, we could have
/// single nodes for multiple functions, or nodes for GVs, etc.
class SplitGraph::Node {
  friend class SplitGraph;

public:
  Node(unsigned ID, const GlobalValue &GV, CostType IndividualCost,
       bool IsNonCopyable)
      : ID(ID), GV(GV), IndividualCost(IndividualCost),
        IsNonCopyable(IsNonCopyable), IsEntryFnCC(false), IsGraphEntry(false) {
    if (auto *Fn = dyn_cast<Function>(&GV))
      IsEntryFnCC = AMDGPU::isEntryFunctionCC(Fn->getCallingConv());
  }

  /// A 0-indexed ID for the node. The maximum ID (exclusive) is the number of
  /// nodes in the graph. This ID can be used as an index in a BitVector.
  unsigned getID() const { return ID; }

  const Function &getFunction() const { return cast<Function>(GV); }

  /// \returns the cost to import this component into a given module, not
  /// accounting for any dependencies that may need to be imported as well.
  CostType getIndividualCost() const { return IndividualCost; }

  bool isNonCopyable() const { return IsNonCopyable; }
  bool isEntryFunctionCC() const { return IsEntryFnCC; }

  /// \returns whether this is an entry point in the graph. Entry points are
  /// defined as follows: if you take all entry points in the graph, and iterate
  /// their dependencies, you are guaranteed to visit all nodes in the graph at
  /// least once.
  bool isGraphEntryPoint() const { return IsGraphEntry; }

  StringRef getName() const { return GV.getName(); }

  bool hasAnyIncomingEdges() const { return IncomingEdges.size(); }
  bool hasAnyIncomingEdgesOfKind(EdgeKind EK) const {
    return any_of(IncomingEdges, [&](const auto *E) { return E->Kind == EK; });
  }

  bool hasAnyOutgoingEdges() const { return OutgoingEdges.size(); }
  bool hasAnyOutgoingEdgesOfKind(EdgeKind EK) const {
    return any_of(OutgoingEdges, [&](const auto *E) { return E->Kind == EK; });
  }

  iterator_range<edges_iterator> incoming_edges() const {
    return IncomingEdges;
  }

  iterator_range<edges_iterator> outgoing_edges() const {
    return OutgoingEdges;
  }

  bool shouldFollowIndirectCalls() const { return isEntryFunctionCC(); }

  /// Visit all children of this node in a recursive fashion. Also visits Self.
  /// If \ref shouldFollowIndirectCalls returns false, then this only follows
  /// DirectCall edges.
  ///
  /// \param Visitor Visitor Function.
  void visitAllDependencies(std::function<void(const Node &)> Visitor) const;

  /// Adds the dependencies of this node in \p BV by setting the bit
  /// corresponding to each node.
  ///
  /// Implemented using \ref visitAllDependencies, hence it follows the same
  /// rules regarding dependencies traversal.
  ///
  /// \param[out] BV The bitvector where the bits should be set.
  void getDependencies(BitVector &BV) const {
    visitAllDependencies([&](const Node &N) { BV.set(N.getID()); });
  }

private:
  void markAsGraphEntry() { IsGraphEntry = true; }

  unsigned ID;
  const GlobalValue &GV;
  CostType IndividualCost;
  bool IsNonCopyable : 1;
  bool IsEntryFnCC : 1;
  bool IsGraphEntry : 1;

  // TODO: Use a single sorted vector (with all incoming/outgoing edges grouped
  // together)
  EdgesVec IncomingEdges;
  EdgesVec OutgoingEdges;
};

void SplitGraph::Node::visitAllDependencies(
    std::function<void(const Node &)> Visitor) const {
  const bool FollowIndirect = shouldFollowIndirectCalls();
  // FIXME: If this can access SplitGraph in the future, use a BitVector
  // instead.
  DenseSet<const Node *> Seen;
  SmallVector<const Node *, 8> WorkList({this});
  while (!WorkList.empty()) {
    const Node *CurN = WorkList.pop_back_val();
    if (auto [It, Inserted] = Seen.insert(CurN); !Inserted)
      continue;

    Visitor(*CurN);

    for (const Edge *E : CurN->outgoing_edges()) {
      if (!FollowIndirect && E->Kind == EdgeKind::IndirectCall)
        continue;
      WorkList.push_back(E->Dst);
    }
  }
}

/// Checks if \p I has MD_callees and, if it does, parses it and puts the
/// functions in \p Callees.
///
/// \returns true if there was metadata and it was parsed correctly. false if
/// there was no MD or if it contained unknown entries and parsing failed.
/// If this returns false, \p Callees will contain incomplete information
/// and must not be used.
static bool handleCalleesMD(const Instruction &I,
                            SetVector<Function *> &Callees) {
  auto *MD = I.getMetadata(LLVMContext::MD_callees);
  if (!MD)
    return false;

  for (const auto &Op : MD->operands()) {
    Function *Callee = mdconst::extract_or_null<Function>(Op);
    if (!Callee)
      return false;
    Callees.insert(Callee);
  }

  return true;
}

void SplitGraph::buildGraph(CallGraph &CG) {
  SplitModuleTimer SMT("buildGraph", "graph construction");
  LLVM_DEBUG(
      dbgs()
      << "[build graph] constructing graph representation of the input\n");

  // FIXME(?): Is the callgraph really worth using if we have to iterate the
  // function again whenever it fails to give us enough information?

  // We build the graph by just iterating all functions in the module and
  // working on their direct callees. At the end, all nodes should be linked
  // together as expected.
  DenseMap<const GlobalValue *, Node *> Cache;
  SmallVector<const Function *> FnsWithIndirectCalls, IndirectlyCallableFns;
  for (const Function &Fn : M) {
    if (Fn.isDeclaration())
      continue;

    // Look at direct callees and create the necessary edges in the graph.
    SetVector<const Function *> DirectCallees;
    bool CallsExternal = false;
    for (auto &CGEntry : *CG[&Fn]) {
      auto *CGNode = CGEntry.second;
      if (auto *Callee = CGNode->getFunction()) {
        if (!Callee->isDeclaration())
          DirectCallees.insert(Callee);
      } else if (CGNode == CG.getCallsExternalNode())
        CallsExternal = true;
    }

    // Keep track of this function if it contains an indirect call and/or if it
    // can be indirectly called.
    if (CallsExternal) {
      LLVM_DEBUG(dbgs() << " [!] callgraph is incomplete for ";
                 Fn.printAsOperand(dbgs());
                 dbgs() << " - analyzing function\n");

      SetVector<Function *> KnownCallees;
      bool HasUnknownIndirectCall = false;
      for (const auto &Inst : instructions(Fn)) {
        // look at all calls without a direct callee.
        const auto *CB = dyn_cast<CallBase>(&Inst);
        if (!CB || CB->getCalledFunction())
          continue;

        // inline assembly can be ignored, unless InlineAsmIsIndirectCall is
        // true.
        if (CB->isInlineAsm()) {
          LLVM_DEBUG(dbgs() << " found inline assembly\n");
          continue;
        }

        if (handleCalleesMD(Inst, KnownCallees))
          continue;
        // If we failed to parse any !callees MD, or some was missing,
        // the entire KnownCallees list is now unreliable.
        KnownCallees.clear();

        // Everything else is handled conservatively. If we fall into the
        // conservative case don't bother analyzing further.
        HasUnknownIndirectCall = true;
        break;
      }

      if (HasUnknownIndirectCall) {
        LLVM_DEBUG(dbgs() << " indirect call found\n");
        FnsWithIndirectCalls.push_back(&Fn);
      } else if (!KnownCallees.empty())
        DirectCallees.insert_range(KnownCallees);
    }

    Node &N = getNode(Cache, Fn);
    for (const auto *Callee : DirectCallees)
      createEdge(N, getNode(Cache, *Callee), EdgeKind::DirectCall);

    if (canBeIndirectlyCalled(Fn))
      IndirectlyCallableFns.push_back(&Fn);
  }

  // Post-process functions with indirect calls.
  for (const Function *Fn : FnsWithIndirectCalls) {
    for (const Function *Candidate : IndirectlyCallableFns) {
      Node &Src = getNode(Cache, *Fn);
      Node &Dst = getNode(Cache, *Candidate);
      createEdge(Src, Dst, EdgeKind::IndirectCall);
    }
  }

  // Now, find all entry points.
  SmallVector<Node *, 16> CandidateEntryPoints;
  BitVector NodesReachableByKernels = createNodesBitVector();
  for (Node *N : Nodes) {
    // Functions with an Entry CC are always graph entry points too.
    if (N->isEntryFunctionCC()) {
      N->markAsGraphEntry();
      N->getDependencies(NodesReachableByKernels);
    } else if (!N->hasAnyIncomingEdgesOfKind(EdgeKind::DirectCall))
      CandidateEntryPoints.push_back(N);
  }

  for (Node *N : CandidateEntryPoints) {
    // This can be another entry point if it's not reachable by a kernel
    // TODO: We could sort all of the possible new entries in a stable order
    // (e.g. by cost), then consume them one by one until
    // NodesReachableByKernels is all 1s. It'd allow us to avoid
    // considering some nodes as non-entries in some specific cases.
    if (!NodesReachableByKernels.test(N->getID()))
      N->markAsGraphEntry();
  }

#ifndef NDEBUG
  assert(verifyGraph());
#endif
}

#ifndef NDEBUG
bool SplitGraph::verifyGraph() const {
  unsigned ExpectedID = 0;
  // Exceptionally using a set here in case IDs are messed up.
  DenseSet<const Node *> SeenNodes;
  DenseSet<const Function *> SeenFunctionNodes;
  for (const Node *N : Nodes) {
    if (N->getID() != (ExpectedID++)) {
      errs() << "Node IDs are incorrect!\n";
      return false;
    }

    if (!SeenNodes.insert(N).second) {
      errs() << "Node seen more than once!\n";
      return false;
    }

    if (&getNode(N->getID()) != N) {
      errs() << "getNode doesn't return the right node\n";
      return false;
    }

    for (const Edge *E : N->IncomingEdges) {
      if (!E->Src || !E->Dst || (E->Dst != N) ||
          (find(E->Src->OutgoingEdges, E) == E->Src->OutgoingEdges.end())) {
        errs() << "ill-formed incoming edges\n";
        return false;
      }
    }

    for (const Edge *E : N->OutgoingEdges) {
      if (!E->Src || !E->Dst || (E->Src != N) ||
          (find(E->Dst->IncomingEdges, E) == E->Dst->IncomingEdges.end())) {
        errs() << "ill-formed outgoing edges\n";
        return false;
      }
    }

    const Function &Fn = N->getFunction();
    if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv())) {
      if (N->hasAnyIncomingEdges()) {
        errs() << "Kernels cannot have incoming edges\n";
        return false;
      }
    }

    if (Fn.isDeclaration()) {
      errs() << "declarations shouldn't have nodes!\n";
      return false;
    }

    auto [It, Inserted] = SeenFunctionNodes.insert(&Fn);
    if (!Inserted) {
      errs() << "one function has multiple nodes!\n";
      return false;
    }
  }

  if (ExpectedID != Nodes.size()) {
    errs() << "Node IDs out of sync!\n";
    return false;
  }

  if (createNodesBitVector().size() != getNumNodes()) {
    errs() << "nodes bit vector doesn't have the right size!\n";
    return false;
  }

  // Check we respect the promise of Node::isGraphEntryPoint.
  BitVector BV = createNodesBitVector();
  for (const Node *N : nodes()) {
    if (N->isGraphEntryPoint())
      N->getDependencies(BV);
  }

  // Ensure each function in the module has an associated node.
  for (const auto &Fn : M) {
    if (!Fn.isDeclaration()) {
      if (!SeenFunctionNodes.contains(&Fn)) {
        errs() << "Fn has no associated node in the graph!\n";
        return false;
      }
    }
  }

  if (!BV.all()) {
    errs() << "not all nodes are reachable through the graph's entry points!\n";
    return false;
  }

  return true;
}
#endif

CostType SplitGraph::calculateCost(const BitVector &BV) const {
  CostType Cost = 0;
  for (unsigned NodeID : BV.set_bits())
    Cost += getNode(NodeID).getIndividualCost();
  return Cost;
}

SplitGraph::Node &
SplitGraph::getNode(DenseMap<const GlobalValue *, Node *> &Cache,
                    const GlobalValue &GV) {
  auto &N = Cache[&GV];
  if (N)
    return *N;

  CostType Cost = 0;
  bool NonCopyable = false;
  if (const Function *Fn = dyn_cast<Function>(&GV)) {
    NonCopyable = isNonCopyable(*Fn);
    Cost = CostMap.at(Fn);
  }
  N = new (NodesPool.Allocate()) Node(Nodes.size(), GV, Cost, NonCopyable);
  Nodes.push_back(N);
  assert(&getNode(N->getID()) == N);
  return *N;
}

const SplitGraph::Edge &SplitGraph::createEdge(Node &Src, Node &Dst,
                                               EdgeKind EK) {
  const Edge *E = new (EdgesPool.Allocate<Edge>(1)) Edge(&Src, &Dst, EK);
  Src.OutgoingEdges.push_back(E);
  Dst.IncomingEdges.push_back(E);
  return *E;
}

//===----------------------------------------------------------------------===//
// Split Proposals
//===----------------------------------------------------------------------===//

/// Represents a module splitting proposal.
///
/// Proposals are made of N BitVectors, one for each partition, where each bit
/// set indicates that the node is present and should be copied inside that
/// partition.
///
/// Proposals have several metrics attached so they can be compared/sorted,
/// which allows the driver to try multiple strategies, resulting in multiple
/// proposals, and choose the best one out of them.
class SplitProposal {
public:
  SplitProposal(const SplitGraph &SG, unsigned MaxPartitions) : SG(&SG) {
    Partitions.resize(MaxPartitions, {0, SG.createNodesBitVector()});
  }

  void setName(StringRef NewName) { Name = NewName; }
  StringRef getName() const { return Name; }

  const BitVector &operator[](unsigned PID) const {
    return Partitions[PID].second;
  }

  void add(unsigned PID, const BitVector &BV) {
    Partitions[PID].second |= BV;
    updateScore(PID);
  }

  void print(raw_ostream &OS) const;
  LLVM_DUMP_METHOD void dump() const { print(dbgs()); }

  // Find the cheapest partition (lowest cost). In case of ties, always returns
  // the highest partition number.
  unsigned findCheapestPartition() const;

  /// Calculate the CodeSize and Bottleneck scores.
  void calculateScores();

#ifndef NDEBUG
  void verifyCompleteness() const;
#endif

  /// Only available after \ref calculateScores is called.
  ///
  /// A positive number indicating the % of code duplication that this proposal
  /// creates. e.g. 0.2 means this proposal adds roughly 20% code size by
  /// duplicating some functions across partitions.
  ///
  /// Value is always rounded up to 2 decimal places.
  ///
  /// A perfect score would be 0.0, and anything approaching 1.0 is very bad.
  double getCodeSizeScore() const { return CodeSizeScore; }

  /// Only available after \ref calculateScores is called.
  ///
  /// A number between [0, 1] which indicates how big of a bottleneck is
  /// expected from the largest partition.
  ///
  /// A score of 1.0 means the biggest partition is as big as the source module,
  /// so build time will be equal to or greater than the build time of the
  /// initial input.
  ///
  /// Value is always rounded up to 2 decimal places.
  ///
  /// This is one of the metrics used to estimate this proposal's build time.
  double getBottleneckScore() const { return BottleneckScore; }

private:
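  /// Recomputes the cached cost of every partition and keeps TotalCost in
  /// sync. \p PID is the partition that just changed; currently all partitions
  /// are refreshed regardless of it.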
  void updateScore(unsigned PID) {
    assert(SG);
    for (auto &[PCost, Nodes] : Partitions) {
      TotalCost -= PCost;
      PCost = SG->calculateCost(Nodes);
      TotalCost += PCost;
    }
  }

  /// \see getCodeSizeScore
  double CodeSizeScore = 0.0;
  /// \see getBottleneckScore
  double BottleneckScore = 0.0;
  /// Aggregated cost of all partitions
  CostType TotalCost = 0;

  const SplitGraph *SG = nullptr;
  std::string Name;

  std::vector<std::pair<CostType, BitVector>> Partitions;
};

void SplitProposal::print(raw_ostream &OS) const {
  assert(SG);

  OS << "[proposal] " << Name << ", total cost:" << TotalCost
     << ", code size score:" << format("%0.3f", CodeSizeScore)
     << ", bottleneck score:" << format("%0.3f", BottleneckScore) << '\n';
  for (const auto &[PID, Part] : enumerate(Partitions)) {
    const auto &[Cost, NodeIDs] = Part;
    OS << " - P" << PID << " nodes:" << NodeIDs.count() << " cost: " << Cost
       << '|' << formatRatioOf(Cost, SG->getModuleCost()) << "%\n";
  }
}

unsigned SplitProposal::findCheapestPartition() const {
  assert(!Partitions.empty());
  CostType CurCost = std::numeric_limits<CostType>::max();
  unsigned CurPID = InvalidPID;
  for (const auto &[Idx, Part] : enumerate(Partitions)) {
    if (Part.first <= CurCost) {
      CurPID = Idx;
      CurCost = Part.first;
    }
  }
  assert(CurPID != InvalidPID);
  return CurPID;
}

void SplitProposal::calculateScores() {
  if (Partitions.empty())
    return;

  assert(SG);
  CostType LargestPCost = 0;
  for (auto &[PCost, Nodes] : Partitions) {
    if (PCost > LargestPCost)
      LargestPCost = PCost;
  }

  CostType ModuleCost = SG->getModuleCost();
  CodeSizeScore = double(TotalCost) / ModuleCost;
  assert(CodeSizeScore >= 0.0);

  BottleneckScore = double(LargestPCost) / ModuleCost;

  CodeSizeScore = std::ceil(CodeSizeScore * 100.0) / 100.0;
  BottleneckScore = std::ceil(BottleneckScore * 100.0) / 100.0;
}

#ifndef NDEBUG
void SplitProposal::verifyCompleteness() const {
  if (Partitions.empty())
    return;

  BitVector Result = Partitions[0].second;
  for (const auto &P : drop_begin(Partitions))
    Result |= P.second;
  assert(Result.all() && "some nodes are missing from this proposal!");
}
#endif

//===-- RecursiveSearchStrategy -------------------------------------------===//

/// Partitioning algorithm.
///
/// This is a recursive search algorithm that can explore multiple possibilities.
///
/// When a cluster of nodes can go into more than one partition, and we haven't
/// reached maximum search depth, we recurse and explore both options and their
/// consequences. Both branches will yield a proposal, and the driver will grade
/// both and choose the best one.
///
/// If max depth is reached, we will use some heuristics to make a choice. Most
/// of the time we will just use the least-pressured (cheapest) partition, but
/// if a cluster is particularly big and there is a good amount of overlap with
/// an existing partition, we will choose that partition instead.
class RecursiveSearchSplitting {
public:
  using SubmitProposalFn = function_ref<void(SplitProposal)>;

  RecursiveSearchSplitting(const SplitGraph &SG, unsigned NumParts,
                           SubmitProposalFn SubmitProposal);

  void run();

private:
  struct WorkListEntry {
    WorkListEntry(const BitVector &BV) : Cluster(BV) {}

    unsigned NumNonEntryNodes = 0;
    CostType TotalCost = 0;
    CostType CostExcludingGraphEntryPoints = 0;
    BitVector Cluster;
  };

  /// Collects all graph entry points' clusters and sorts them so the most
  /// expensive clusters are viewed first. This will merge clusters together if
  /// they share a non-copyable dependency.
  void setupWorkList();

  /// Recursive function that assigns the worklist item at \p Idx into a
  /// partition of \p SP.
  ///
  /// \p Depth is the current search depth. When this value is equal to
  /// \ref MaxDepth, we can no longer recurse.
  ///
  /// This function only recurses if there is more than one possible assignment,
  /// otherwise it is iterative to avoid creating a call stack that is as big as
  /// \ref WorkList.
  void pickPartition(unsigned Depth, unsigned Idx, SplitProposal SP);

  /// \return A pair: first element is the PID of the partition that has the
  /// most similarities with \p Entry, or \ref InvalidPID if no partition was
  /// found with at least one element in common. The second element is the
  /// aggregated cost of all dependencies in common between \p Entry and that
  /// partition.
  std::pair<unsigned, CostType>
  findMostSimilarPartition(const WorkListEntry &Entry, const SplitProposal &SP);

  const SplitGraph &SG;
  unsigned NumParts;
  SubmitProposalFn SubmitProposal;

  // A cluster is considered large when its cost, excluding entry points,
  // exceeds this value.
  CostType LargeClusterThreshold = 0;
  unsigned NumProposalsSubmitted = 0;
  SmallVector<WorkListEntry> WorkList;
};

RecursiveSearchSplitting::RecursiveSearchSplitting(
    const SplitGraph &SG, unsigned NumParts, SubmitProposalFn SubmitProposal)
    : SG(SG), NumParts(NumParts), SubmitProposal(SubmitProposal) {
  // Arbitrary max value as a safeguard. Anything above 10 will already be
  // slow, this is just a max value to prevent extreme resource exhaustion or
  // unbounded run time.
  if (MaxDepth > 16)
    report_fatal_error("[amdgpu-split-module] search depth of " +
                       Twine(MaxDepth) + " is too high!");
  LargeClusterThreshold =
      (LargeFnFactor != 0.0)
          ? CostType(((SG.getModuleCost() / NumParts) * LargeFnFactor))
          : std::numeric_limits<CostType>::max();
  LLVM_DEBUG(dbgs() << "[recursive search] large cluster threshold set at "
                    << LargeClusterThreshold << "\n");
}

void RecursiveSearchSplitting::run() {
  {
    SplitModuleTimer SMT("recursive_search_prepare", "preparing worklist");
    setupWorkList();
  }

  {
    SplitModuleTimer SMT("recursive_search_pick", "partitioning");
    SplitProposal SP(SG, NumParts);
    pickPartition(/*BranchDepth=*/0, /*Idx=*/0, SP);
  }
}

void RecursiveSearchSplitting::setupWorkList() {
  // e.g. if A and B are two worklist items, and they both call a non-copyable
  // dependency C, this does:
  //    A=C
  //    B=C
  // => NodeEC will create a single group (A, B, C) and we create a new
  // WorkList entry for that group.

  EquivalenceClasses<unsigned> NodeEC;
  for (const SplitGraph::Node *N : SG.nodes()) {
    if (!N->isGraphEntryPoint())
      continue;

    NodeEC.insert(N->getID());
    N->visitAllDependencies([&](const SplitGraph::Node &Dep) {
      if (&Dep != N && Dep.isNonCopyable())
        NodeEC.unionSets(N->getID(), Dep.getID());
    });
  }

  for (const auto &Node : NodeEC) {
    if (!Node->isLeader())
      continue;

    BitVector Cluster = SG.createNodesBitVector();
    for (unsigned M : NodeEC.members(*Node)) {
      const SplitGraph::Node &N = SG.getNode(M);
      if (N.isGraphEntryPoint())
        N.getDependencies(Cluster);
    }
    WorkList.emplace_back(std::move(Cluster));
  }

  // Calculate costs and other useful information.
  for (WorkListEntry &Entry : WorkList) {
    for (unsigned NodeID : Entry.Cluster.set_bits()) {
      const SplitGraph::Node &N = SG.getNode(NodeID);
      const CostType Cost = N.getIndividualCost();

      Entry.TotalCost += Cost;
      if (!N.isGraphEntryPoint()) {
        Entry.CostExcludingGraphEntryPoints += Cost;
        ++Entry.NumNonEntryNodes;
      }
    }
  }

  stable_sort(WorkList, [](const WorkListEntry &A, const WorkListEntry &B) {
    if (A.TotalCost != B.TotalCost)
      return A.TotalCost > B.TotalCost;

    if (A.CostExcludingGraphEntryPoints != B.CostExcludingGraphEntryPoints)
      return A.CostExcludingGraphEntryPoints > B.CostExcludingGraphEntryPoints;

    if (A.NumNonEntryNodes != B.NumNonEntryNodes)
      return A.NumNonEntryNodes > B.NumNonEntryNodes;

    return A.Cluster.count() > B.Cluster.count();
  });

  LLVM_DEBUG({
    dbgs() << "[recursive search] worklist:\n";
    for (const auto &[Idx, Entry] : enumerate(WorkList)) {
      dbgs() << " - [" << Idx << "]: ";
      for (unsigned NodeID : Entry.Cluster.set_bits())
        dbgs() << NodeID << " ";
      dbgs() << "(total_cost:" << Entry.TotalCost
             << ", cost_excl_entries:" << Entry.CostExcludingGraphEntryPoints
             << ")\n";
    }
  });
}

void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx,
                                             SplitProposal SP) {
  while (Idx < WorkList.size()) {
    // Step 1: Determine candidate PIDs.
    //
    const WorkListEntry &Entry = WorkList[Idx];
    const BitVector &Cluster = Entry.Cluster;

    // Default option is to do load-balancing, AKA assign to least pressured
    // partition.
    const unsigned CheapestPID = SP.findCheapestPartition();
    assert(CheapestPID != InvalidPID);

    // Explore assigning to the kernel that contains the most dependencies in
    // common.
    const auto [MostSimilarPID, SimilarDepsCost] =
        findMostSimilarPartition(Entry, SP);

    // We can choose to explore only one path if we only have one valid path, or
    // if we reached maximum search depth and can no longer branch out.
    unsigned SinglePIDToTry = InvalidPID;
    if (MostSimilarPID == InvalidPID) // no similar PID found
      SinglePIDToTry = CheapestPID;
    else if (MostSimilarPID == CheapestPID) // both landed on the same PID
      SinglePIDToTry = CheapestPID;
    else if (Depth >= MaxDepth) {
      // We have to choose one path. Use a heuristic to guess which one will be
      // more appropriate.
      if (Entry.CostExcludingGraphEntryPoints > LargeClusterThreshold) {
        // Check if the amount of code in common makes it worth it.
        assert(SimilarDepsCost && Entry.CostExcludingGraphEntryPoints);
        const double Ratio = static_cast<double>(SimilarDepsCost) /
                             Entry.CostExcludingGraphEntryPoints;
        assert(Ratio >= 0.0 && Ratio <= 1.0);
        if (Ratio > LargeFnOverlapForMerge) {
          // For debug, just print "L", so we'll see "L3=P3" for instance, which
          // will mean we reached max depth and chose P3 based on this
          // heuristic.
          LLVM_DEBUG(dbgs() << 'L');
          SinglePIDToTry = MostSimilarPID;
        }
      } else
        SinglePIDToTry = CheapestPID;
    }

    // Step 2: Explore candidates.

    // When we only explore one possible path, and thus branch depth doesn't
    // increase, do not recurse, iterate instead.
    if (SinglePIDToTry != InvalidPID) {
      LLVM_DEBUG(dbgs() << Idx << "=P" << SinglePIDToTry << ' ');
      // Only one path to explore, don't clone SP, don't increase depth.
      SP.add(SinglePIDToTry, Cluster);
      ++Idx;
      continue;
    }

    assert(MostSimilarPID != InvalidPID);

    // We explore multiple paths: recurse at increased depth, then stop this
    // function.

    LLVM_DEBUG(dbgs() << '\n');

    // lb = load balancing = put in cheapest partition
    {
      SplitProposal BranchSP = SP;
      LLVM_DEBUG(dbgs().indent(Depth)
                 << " [lb] " << Idx << "=P" << CheapestPID << "? ");
      BranchSP.add(CheapestPID, Cluster);
      pickPartition(Depth + 1, Idx + 1, BranchSP);
    }

    // ms = most similar = put in partition with the most in common
    {
      SplitProposal BranchSP = SP;
      LLVM_DEBUG(dbgs().indent(Depth)
                 << " [ms] " << Idx << "=P" << MostSimilarPID << "? ");
      BranchSP.add(MostSimilarPID, Cluster);
      pickPartition(Depth + 1, Idx + 1, BranchSP);
    }

    return;
  }

  // Step 3: If we assigned all WorkList items, submit the proposal.

  assert(Idx == WorkList.size());
  assert(NumProposalsSubmitted <= (2u << MaxDepth) &&
         "Search got out of bounds?");
  SP.setName("recursive_search (depth=" + std::to_string(Depth) + ") #" +
             std::to_string(NumProposalsSubmitted++));
  LLVM_DEBUG(dbgs() << '\n');
  SubmitProposal(SP);
}

std::pair<unsigned, CostType>
RecursiveSearchSplitting::findMostSimilarPartition(const WorkListEntry &Entry,
                                                   const SplitProposal &SP) {
  if (!Entry.NumNonEntryNodes)
    return {InvalidPID, 0};

  // We take the partition that is the most similar using Cost as a metric.
  // So we take the set of nodes in common, compute their aggregated cost, and
  // pick the partition with the highest cost in common.
  unsigned ChosenPID = InvalidPID;
  CostType ChosenCost = 0;
  for (unsigned PID = 0; PID < NumParts; ++PID) {
    BitVector BV = SP[PID];
    BV &= Entry.Cluster; // FIXME: & doesn't work between BVs?!

    if (BV.none())
      continue;

    const CostType Cost = SG.calculateCost(BV);

    if (ChosenPID == InvalidPID || ChosenCost < Cost ||
        (ChosenCost == Cost && PID > ChosenPID)) {
      ChosenPID = PID;
      ChosenCost = Cost;
    }
  }

  return {ChosenPID, ChosenCost};
}

//===----------------------------------------------------------------------===//
// DOTGraph Printing Support
//===----------------------------------------------------------------------===//

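/// Projects an edge onto its destination node so GraphTraits can expose a
/// node's outgoing edges as a range of child nodes.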
const SplitGraph::Node *mapEdgeToDst(const SplitGraph::Edge *E) {
  return E->Dst;
}

using SplitGraphEdgeDstIterator =
    mapped_iterator<SplitGraph::edges_iterator, decltype(&mapEdgeToDst)>;

} // namespace

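// GraphTraits specialization so generic graph utilities (e.g. WriteGraph) can
// walk the SplitGraph's nodes and their outgoing call edges.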
template <> struct GraphTraits<SplitGraph> {
  using NodeRef = const SplitGraph::Node *;
  using nodes_iterator = SplitGraph::nodes_iterator;
  using ChildIteratorType = SplitGraphEdgeDstIterator;

  using EdgeRef = const SplitGraph::Edge *;
  using ChildEdgeIteratorType = SplitGraph::edges_iterator;

  static NodeRef getEntryNode(NodeRef N) { return N; }

  static ChildIteratorType child_begin(NodeRef Ref) {
    return {Ref->outgoing_edges().begin(), mapEdgeToDst};
  }
  static ChildIteratorType child_end(NodeRef Ref) {
    return {Ref->outgoing_edges().end(), mapEdgeToDst};
  }

  static nodes_iterator nodes_begin(const SplitGraph &G) {
    return G.nodes().begin();
  }
  static nodes_iterator nodes_end(const SplitGraph &G) {
    return G.nodes().end();
  }
};

template <> struct DOTGraphTraits<SplitGraph> : public DefaultDOTGraphTraits {
  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}

  static std::string getGraphName(const SplitGraph &SG) {
    return SG.getModule().getName().str();
  }

  std::string getNodeLabel(const SplitGraph::Node *N, const SplitGraph &SG) {
    return N->getName().str();
  }

  static std::string getNodeDescription(const SplitGraph::Node *N,
                                        const SplitGraph &SG) {
    std::string Result;
    if (N->isEntryFunctionCC())
      Result += "entry-fn-cc ";
    if (N->isNonCopyable())
      Result += "non-copyable ";
    Result += "cost:" + std::to_string(N->getIndividualCost());
    return Result;
  }

  static std::string getNodeAttributes(const SplitGraph::Node *N,
                                       const SplitGraph &SG) {
    return N->hasAnyIncomingEdges() ? "" : "color=\"red\"";
  }

  static std::string getEdgeAttributes(const SplitGraph::Node *N,
                                       SplitGraphEdgeDstIterator EI,
                                       const SplitGraph &SG) {

    switch ((*EI.getCurrent())->Kind) {
    case SplitGraph::EdgeKind::DirectCall:
      return "";
    case SplitGraph::EdgeKind::IndirectCall:
      return "style=\"dashed\"";
    }
    llvm_unreachable("Unknown SplitGraph::EdgeKind enum");
  }
};

//===----------------------------------------------------------------------===//
// Driver
//===----------------------------------------------------------------------===//

namespace {

// If we didn't externalize GVs, then local GVs need to be conservatively
// imported into every module (including their initializers), and then cleaned
// up afterwards.
static bool needsConservativeImport(const GlobalValue *GV) {
  if (const auto *Var = dyn_cast<GlobalVariable>(GV))
    return Var->hasLocalLinkage();
  return isa<GlobalAlias>(GV);
}

/// Prints a summary of the partition \p N, represented by module \p M, to \p
/// OS.
static void printPartitionSummary(raw_ostream &OS, unsigned N, const Module &M,
                                  unsigned PartCost, unsigned ModuleCost) {
  OS << "*** Partition P" << N << " ***\n";

  for (const auto &Fn : M) {
    if (!Fn.isDeclaration())
      OS << " - [function] " << Fn.getName() << "\n";
  }

  for (const auto &GV : M.globals()) {
    if (GV.hasInitializer())
      OS << " - [global] " << GV.getName() << "\n";
  }

  OS << "Partition contains " << formatRatioOf(PartCost, ModuleCost)
     << "% of the source\n";
}

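/// Keeps \p Best up to date: \p New replaces it when the new proposal has a
/// strictly lower bottleneck score, with the code size score used as a tie
/// breaker.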
static void evaluateProposal(SplitProposal &Best, SplitProposal New) {
  SplitModuleTimer SMT("proposal_evaluation", "proposal ranking algorithm");

  LLVM_DEBUG({
    New.verifyCompleteness();
    if (DebugProposalSearch)
      New.print(dbgs());
  });

  const double CurBScore = Best.getBottleneckScore();
  const double CurCSScore = Best.getCodeSizeScore();
  const double NewBScore = New.getBottleneckScore();
  const double NewCSScore = New.getCodeSizeScore();

  // TODO: Improve this
  //    We can probably lower the precision of the comparison at first
  //    e.g. if we have
  //      - (Current): BScore: 0.489 CSCore 1.105
  //      - (New): BScore: 0.475 CSCore 1.305
  //    Currently we'd choose the new one because the bottleneck score is
  //    lower, but the new one duplicates more code. It may be worth it to
  //    discard the new proposal as the impact on build time is negligible.

  // Compare them
  bool IsBest = false;
  if (NewBScore < CurBScore)
    IsBest = true;
  else if (NewBScore == CurBScore)
    IsBest = (NewCSScore < CurCSScore); // Use code size as tie breaker.

  if (IsBest)
    Best = std::move(New);

  LLVM_DEBUG(if (DebugProposalSearch) {
    if (IsBest)
      dbgs() << "[search] new best proposal!\n";
    else
      dbgs() << "[search] discarding - not profitable\n";
  });
}

/// Trivial helper to create an identical copy of \p M.
static std::unique_ptr<Module> cloneAll(const Module &M) {
  ValueToValueMapTy VMap;
  return CloneModule(M, VMap, [&](const GlobalValue *GV) { return true; });
}

/// Writes \p SG as a DOTGraph to \ref ModuleDotCfgDir if requested.
static void writeDOTGraph(const SplitGraph &SG) {
  if (ModuleDotCfgOutput.empty())
    return;

  std::error_code EC;
  raw_fd_ostream OS(ModuleDotCfgOutput, EC);
  if (EC) {
    errs() << "[" DEBUG_TYPE "]: cannot open '" << ModuleDotCfgOutput
           << "' - DOTGraph will not be printed\n";
  }
  WriteGraph(OS, SG, /*ShortName=*/false,
             /*Title=*/SG.getModule().getName());
}

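/// Drives the split of \p M into at most \p NumParts partitions: builds the
/// SplitGraph, searches for the best SplitProposal, then clones one module per
/// non-empty partition and hands each one to \p ModuleCallback.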
static void splitAMDGPUModule(
    GetTTIFn GetTTI, Module &M, unsigned NumParts,
    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
  CallGraph CG(M);

  // Externalize functions whose addresses are taken.
  //
  // This is needed because partitioning is purely based on calls, but sometimes
  // a kernel/function may just look at the address of another local function
  // and not do anything (no calls). After partitioning, that local function may
  // end up in a different module (so it's just a declaration in the module
  // where its address is taken), which emits an "undefined hidden symbol"
  // linker error.
  //
  // Additionally, it guides partitioning to not duplicate this function if it's
  // called directly at some point.
  //
  // TODO: Could we be smarter about this? This makes all functions whose
  // addresses are taken non-copyable. We should probably model this type of
  // constraint in the graph and use it to guide splitting, instead of
  // externalizing like this. Maybe non-copyable should really mean "keep one
  // visible copy, then internalize all other copies" for some functions?
  if (!NoExternalizeOnAddrTaken) {
    for (auto &Fn : M) {
      // TODO: Should aliases count? Probably not but they're so rare I'm not
      // sure it's worth fixing.
      if (Fn.hasLocalLinkage() && Fn.hasAddressTaken()) {
        LLVM_DEBUG(dbgs() << "[externalize] "; Fn.printAsOperand(dbgs());
                   dbgs() << " because its address is taken\n");
        externalize(Fn);
      }
    }
  }

  // Externalize local GVs, which avoids duplicating their initializers, which
  // in turn helps keep code size in check.
  if (!NoExternalizeGlobals) {
    for (auto &GV : M.globals()) {
      if (GV.hasLocalLinkage())
        LLVM_DEBUG(dbgs() << "[externalize] GV " << GV.getName() << '\n');
      externalize(GV);
    }
  }

  // Start by calculating the cost of every function in the module, as well as
  // the module's overall cost.
  FunctionsCostMap FnCosts;
  const CostType ModuleCost = calculateFunctionCosts(GetTTI, M, FnCosts);

  // Build the SplitGraph, which represents the module's functions and models
  // their dependencies accurately.
  SplitGraph SG(M, FnCosts, ModuleCost);
  SG.buildGraph(CG);

  if (SG.empty()) {
    LLVM_DEBUG(
        dbgs()
        << "[!] no nodes in graph, input is empty - no splitting possible\n");
    ModuleCallback(cloneAll(M));
    return;
  }

  LLVM_DEBUG({
    dbgs() << "[graph] nodes:\n";
    for (const SplitGraph::Node *N : SG.nodes()) {
      dbgs() << " - [" << N->getID() << "]: " << N->getName() << " "
             << (N->isGraphEntryPoint() ? "(entry)" : "") << " "
             << (N->isNonCopyable() ? "(noncopyable)" : "") << "\n";
    }
  });

  writeDOTGraph(SG);

  LLVM_DEBUG(dbgs() << "[search] testing splitting strategies\n");

  std::optional<SplitProposal> Proposal;
  const auto EvaluateProposal = [&](SplitProposal SP) {
    SP.calculateScores();
    if (!Proposal)
      Proposal = std::move(SP);
    else
      evaluateProposal(*Proposal, std::move(SP));
  };

  // TODO: It would be very easy to create new strategies by just adding a base
  // class to RecursiveSearchSplitting and abstracting it away.
  RecursiveSearchSplitting(SG, NumParts, EvaluateProposal).run();
  LLVM_DEBUG(if (Proposal) dbgs() << "[search done] selected proposal: "
                                  << Proposal->getName() << "\n";);

  if (!Proposal) {
    LLVM_DEBUG(dbgs() << "[!] no proposal made, no splitting possible!\n");
    ModuleCallback(cloneAll(M));
    return;
  }

  LLVM_DEBUG(Proposal->print(dbgs()););

  std::optional<raw_fd_ostream> SummariesOS;
  if (!PartitionSummariesOutput.empty()) {
    std::error_code EC;
    SummariesOS.emplace(PartitionSummariesOutput, EC);
    if (EC)
      errs() << "[" DEBUG_TYPE "]: cannot open '" << PartitionSummariesOutput
             << "' - Partition summaries will not be printed\n";
  }

  // One module will import all GlobalValues that are not Functions
  // and are not subject to conservative import.
  bool ImportAllGVs = true;

  for (unsigned PID = 0; PID < NumParts; ++PID) {
    SplitModuleTimer SMT2("modules_creation",
                          "creating modules for each partition");
    LLVM_DEBUG(dbgs() << "[split] creating new modules\n");

    DenseSet<const Function *> FnsInPart;
    for (unsigned NodeID : (*Proposal)[PID].set_bits())
      FnsInPart.insert(&SG.getNode(NodeID).getFunction());

    // Don't create empty modules.
    if (FnsInPart.empty()) {
      LLVM_DEBUG(dbgs() << "[split] P" << PID
                        << " is empty, not creating module\n");
      continue;
    }

    ValueToValueMapTy VMap;
    CostType PartCost = 0;
    std::unique_ptr<Module> MPart(
        CloneModule(M, VMap, [&](const GlobalValue *GV) {
          // Functions go in their assigned partition.
          if (const auto *Fn = dyn_cast<Function>(GV)) {
            if (FnsInPart.contains(Fn)) {
              PartCost += SG.getCost(*Fn);
              return true;
            }
            return false;
          }

          // Everything else goes in the first non-empty module we create.
          return ImportAllGVs || needsConservativeImport(GV);
        }));

    ImportAllGVs = false;

    // FIXME: Aliases aren't seen often, and their handling isn't perfect so
    // bugs are possible.

    // Clean-up conservatively imported GVs without any users.
    for (auto &GV : make_early_inc_range(MPart->global_values())) {
      if (needsConservativeImport(&GV) && GV.use_empty())
        GV.eraseFromParent();
    }

    if (SummariesOS)
      printPartitionSummary(*SummariesOS, PID, *MPart, PartCost, ModuleCost);

    LLVM_DEBUG(
        printPartitionSummary(dbgs(), PID, *MPart, PartCost, ModuleCost));

    ModuleCallback(std::move(MPart));
  }
}
} // namespace

PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
                                             ModuleAnalysisManager &MAM) {
  SplitModuleTimer SMT(
      "total", "total pass runtime (incl. potentially waiting for lockfile)");

  FunctionAnalysisManager &FAM =
      MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };

  bool Done = false;
#ifndef NDEBUG
  if (UseLockFile) {
    SmallString<128> LockFilePath;
    sys::path::system_temp_directory(/*ErasedOnReboot=*/true, LockFilePath);
    sys::path::append(LockFilePath, "amdgpu-split-module-debug");
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " using lockfile '" << LockFilePath
                      << "'\n");

    while (true) {
      llvm::LockFileManager Lock(LockFilePath.str());
      bool Owned;
      if (Error Err = Lock.tryLock().moveInto(Owned)) {
        consumeError(std::move(Err));
        LLVM_DEBUG(
            dbgs() << "[amdgpu-split-module] unable to acquire lockfile, debug "
                      "output may be mangled by other processes\n");
      } else if (!Owned) {
        switch (Lock.waitForUnlockFor(std::chrono::seconds(90))) {
        case WaitForUnlockResult::Success:
          break;
        case WaitForUnlockResult::OwnerDied:
          continue; // try again to get the lock.
        case WaitForUnlockResult::Timeout:
          LLVM_DEBUG(
              dbgs()
              << "[amdgpu-split-module] unable to acquire lockfile, debug "
                 "output may be mangled by other processes\n");
          Lock.unsafeMaybeUnlock();
          break; // give up
        }
      }

      splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
      Done = true;
      break;
    }
  }
#endif

  if (!Done)
    splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);

  // We can change linkage/visibilities in the input, so consider that nothing
  // is preserved, just to be safe. This pass runs last anyway.
  return PreservedAnalyses::none();
}
} // namespace llvm