
Currently, when VPSlotTracker is initialized with a VPlan, its assignName method calls printAsOperand on each underlying instruction. Each such call recomputes slot numbers for the entire function, leading to O(N × M) complexity, where M is the number of instructions in the loop and N is the number of instructions in the function. This results in slow debug output for large loops. For example, printing costs of all instructions becomes O(M² × N), which is especially painful when enabling verbose dumps. This patch improves debugging performance by caching slot numbers using ModuleSlotTracker. It avoids redundant recomputation and makes debug output significantly faster.
467 lines
16 KiB
C++
467 lines
16 KiB
C++
//===- VPlanHelpers.h - VPlan-related auxiliary helpers -------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
/// \file
|
|
/// This file contains the declarations of different VPlan-related auxiliary
|
|
/// helpers.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANHELPERS_H
|
|
#define LLVM_TRANSFORMS_VECTORIZE_VPLANHELPERS_H
|
|
|
|
#include "VPlanAnalysis.h"
|
|
#include "VPlanDominatorTree.h"
|
|
#include "llvm/ADT/DenseMap.h"
|
|
#include "llvm/ADT/SmallPtrSet.h"
|
|
#include "llvm/ADT/SmallVector.h"
|
|
#include "llvm/Analysis/DomTreeUpdater.h"
|
|
#include "llvm/Analysis/TargetTransformInfo.h"
|
|
#include "llvm/IR/DebugLoc.h"
|
|
#include "llvm/IR/ModuleSlotTracker.h"
|
|
#include "llvm/Support/InstructionCost.h"
|
|
|
|
namespace llvm {
|
|
|
|
class AssumptionCache;
|
|
class BasicBlock;
|
|
class DominatorTree;
|
|
class InnerLoopVectorizer;
|
|
class IRBuilderBase;
|
|
class LoopInfo;
|
|
class SCEV;
|
|
class Type;
|
|
class VPBasicBlock;
|
|
class VPRegionBlock;
|
|
class VPlan;
|
|
class Value;
|
|
|
|
/// Returns a calculation for the total number of elements for a given \p VF.
|
|
/// For fixed width vectors this value is a constant, whereas for scalable
|
|
/// vectors it is an expression determined at runtime.
|
|
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF);
|
|
|
|
/// Return a value for Step multiplied by VF.
|
|
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
|
|
int64_t Step);
|
|
|
|
/// A helper function that returns how much we should divide the cost of a
|
|
/// predicated block by. Typically this is the reciprocal of the block
|
|
/// probability, i.e. if we return X we are assuming the predicated block will
|
|
/// execute once for every X iterations of the loop header so the block should
|
|
/// only contribute 1/X of its cost to the total cost calculation, but when
|
|
/// optimizing for code size it will just be 1 as code size costs don't depend
|
|
/// on execution probabilities.
|
|
///
|
|
/// TODO: We should use actual block probability here, if available. Currently,
|
|
/// we always assume predicated blocks have a 50% chance of executing.
|
|
inline unsigned
|
|
getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind) {
|
|
return CostKind == TTI::TCK_CodeSize ? 1 : 2;
|
|
}
|
|
|
|
/// A range of powers-of-2 vectorization factors with fixed start and
|
|
/// adjustable end. The range includes start and excludes end, e.g.,:
|
|
/// [1, 16) = {1, 2, 4, 8}
|
|
struct VFRange {
|
|
// A power of 2.
|
|
const ElementCount Start;
|
|
|
|
// A power of 2. If End <= Start range is empty.
|
|
ElementCount End;
|
|
|
|
bool isEmpty() const {
|
|
return End.getKnownMinValue() <= Start.getKnownMinValue();
|
|
}
|
|
|
|
VFRange(const ElementCount &Start, const ElementCount &End)
|
|
: Start(Start), End(End) {
|
|
assert(Start.isScalable() == End.isScalable() &&
|
|
"Both Start and End should have the same scalable flag");
|
|
assert(isPowerOf2_32(Start.getKnownMinValue()) &&
|
|
"Expected Start to be a power of 2");
|
|
assert(isPowerOf2_32(End.getKnownMinValue()) &&
|
|
"Expected End to be a power of 2");
|
|
}
|
|
|
|
/// Iterator to iterate over vectorization factors in a VFRange.
|
|
class iterator
|
|
: public iterator_facade_base<iterator, std::forward_iterator_tag,
|
|
ElementCount> {
|
|
ElementCount VF;
|
|
|
|
public:
|
|
iterator(ElementCount VF) : VF(VF) {}
|
|
|
|
bool operator==(const iterator &Other) const { return VF == Other.VF; }
|
|
|
|
ElementCount operator*() const { return VF; }
|
|
|
|
iterator &operator++() {
|
|
VF *= 2;
|
|
return *this;
|
|
}
|
|
};
|
|
|
|
iterator begin() { return iterator(Start); }
|
|
iterator end() {
|
|
assert(isPowerOf2_32(End.getKnownMinValue()));
|
|
return iterator(End);
|
|
}
|
|
};
|
|
|
|
/// In what follows, the term "input IR" refers to code that is fed into the
|
|
/// vectorizer whereas the term "output IR" refers to code that is generated by
|
|
/// the vectorizer.
|
|
|
|
/// VPLane provides a way to access lanes in both fixed width and scalable
|
|
/// vectors, where for the latter the lane index sometimes needs calculating
|
|
/// as a runtime expression.
|
|
class VPLane {
|
|
public:
|
|
/// Kind describes how to interpret Lane.
|
|
enum class Kind : uint8_t {
|
|
/// For First, Lane is the index into the first N elements of a
|
|
/// fixed-vector <N x <ElTy>> or a scalable vector <vscale x N x <ElTy>>.
|
|
First,
|
|
/// For ScalableLast, Lane is the offset from the start of the last
|
|
/// N-element subvector in a scalable vector <vscale x N x <ElTy>>. For
|
|
/// example, a Lane of 0 corresponds to lane `(vscale - 1) * N`, a Lane of
|
|
/// 1 corresponds to `((vscale - 1) * N) + 1`, etc.
|
|
ScalableLast
|
|
};
|
|
|
|
private:
|
|
/// in [0..VF)
|
|
unsigned Lane;
|
|
|
|
/// Indicates how the Lane should be interpreted, as described above.
|
|
Kind LaneKind = Kind::First;
|
|
|
|
public:
|
|
VPLane(unsigned Lane) : Lane(Lane) {}
|
|
VPLane(unsigned Lane, Kind LaneKind) : Lane(Lane), LaneKind(LaneKind) {}
|
|
|
|
static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); }
|
|
|
|
static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset) {
|
|
assert(Offset > 0 && Offset <= VF.getKnownMinValue() &&
|
|
"trying to extract with invalid offset");
|
|
unsigned LaneOffset = VF.getKnownMinValue() - Offset;
|
|
Kind LaneKind;
|
|
if (VF.isScalable())
|
|
// In this case 'LaneOffset' refers to the offset from the start of the
|
|
// last subvector with VF.getKnownMinValue() elements.
|
|
LaneKind = VPLane::Kind::ScalableLast;
|
|
else
|
|
LaneKind = VPLane::Kind::First;
|
|
return VPLane(LaneOffset, LaneKind);
|
|
}
|
|
|
|
static VPLane getLastLaneForVF(const ElementCount &VF) {
|
|
return getLaneFromEnd(VF, 1);
|
|
}
|
|
|
|
/// Returns a compile-time known value for the lane index and asserts if the
|
|
/// lane can only be calculated at runtime.
|
|
unsigned getKnownLane() const {
|
|
assert(LaneKind == Kind::First &&
|
|
"can only get known lane from the beginning");
|
|
return Lane;
|
|
}
|
|
|
|
/// Returns an expression describing the lane index that can be used at
|
|
/// runtime.
|
|
Value *getAsRuntimeExpr(IRBuilderBase &Builder, const ElementCount &VF) const;
|
|
|
|
/// Returns the Kind of lane offset.
|
|
Kind getKind() const { return LaneKind; }
|
|
|
|
/// Returns true if this is the first lane of the whole vector.
|
|
bool isFirstLane() const { return Lane == 0 && LaneKind == Kind::First; }
|
|
|
|
/// Maps the lane to a cache index based on \p VF.
|
|
unsigned mapToCacheIndex(const ElementCount &VF) const {
|
|
switch (LaneKind) {
|
|
case VPLane::Kind::ScalableLast:
|
|
assert(VF.isScalable() && Lane < VF.getKnownMinValue() &&
|
|
"ScalableLast can only be used with scalable VFs");
|
|
return VF.getKnownMinValue() + Lane;
|
|
default:
|
|
assert(Lane < VF.getKnownMinValue() &&
|
|
"Cannot extract lane larger than VF");
|
|
return Lane;
|
|
}
|
|
}
|
|
};
|
|
|
|
/// VPTransformState holds information passed down when "executing" a VPlan,
|
|
/// needed for generating the output IR.
|
|
struct VPTransformState {
|
|
VPTransformState(const TargetTransformInfo *TTI, ElementCount VF,
|
|
LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC,
|
|
IRBuilderBase &Builder, VPlan *Plan, Loop *CurrentParentLoop,
|
|
Type *CanonicalIVTy);
|
|
/// Target Transform Info.
|
|
const TargetTransformInfo *TTI;
|
|
|
|
/// The chosen Vectorization Factor of the loop being vectorized.
|
|
ElementCount VF;
|
|
|
|
/// Hold the index to generate specific scalar instructions. Null indicates
|
|
/// that all instances are to be generated, using either scalar or vector
|
|
/// instructions.
|
|
std::optional<VPLane> Lane;
|
|
|
|
struct DataState {
|
|
// Each value from the original loop, when vectorized, is represented by a
|
|
// vector value in the map.
|
|
DenseMap<const VPValue *, Value *> VPV2Vector;
|
|
|
|
DenseMap<const VPValue *, SmallVector<Value *, 4>> VPV2Scalars;
|
|
} Data;
|
|
|
|
/// Get the generated vector Value for a given VPValue \p Def if \p IsScalar
|
|
/// is false, otherwise return the generated scalar. \See set.
|
|
Value *get(const VPValue *Def, bool IsScalar = false);
|
|
|
|
/// Get the generated Value for a given VPValue and given Part and Lane.
|
|
Value *get(const VPValue *Def, const VPLane &Lane);
|
|
|
|
bool hasVectorValue(const VPValue *Def) {
|
|
return Data.VPV2Vector.contains(Def);
|
|
}
|
|
|
|
bool hasScalarValue(const VPValue *Def, VPLane Lane) {
|
|
auto I = Data.VPV2Scalars.find(Def);
|
|
if (I == Data.VPV2Scalars.end())
|
|
return false;
|
|
unsigned CacheIdx = Lane.mapToCacheIndex(VF);
|
|
return CacheIdx < I->second.size() && I->second[CacheIdx];
|
|
}
|
|
|
|
/// Set the generated vector Value for a given VPValue, if \p
|
|
/// IsScalar is false. If \p IsScalar is true, set the scalar in lane 0.
|
|
void set(const VPValue *Def, Value *V, bool IsScalar = false) {
|
|
if (IsScalar) {
|
|
set(Def, V, VPLane(0));
|
|
return;
|
|
}
|
|
assert((VF.isScalar() || isVectorizedTy(V->getType())) &&
|
|
"scalar values must be stored as (0, 0)");
|
|
Data.VPV2Vector[Def] = V;
|
|
}
|
|
|
|
/// Reset an existing vector value for \p Def and a given \p Part.
|
|
void reset(const VPValue *Def, Value *V) {
|
|
assert(Data.VPV2Vector.contains(Def) && "need to overwrite existing value");
|
|
Data.VPV2Vector[Def] = V;
|
|
}
|
|
|
|
/// Set the generated scalar \p V for \p Def and the given \p Lane.
|
|
void set(const VPValue *Def, Value *V, const VPLane &Lane) {
|
|
auto &Scalars = Data.VPV2Scalars[Def];
|
|
unsigned CacheIdx = Lane.mapToCacheIndex(VF);
|
|
if (Scalars.size() <= CacheIdx)
|
|
Scalars.resize(CacheIdx + 1);
|
|
assert(!Scalars[CacheIdx] && "should overwrite existing value");
|
|
Scalars[CacheIdx] = V;
|
|
}
|
|
|
|
/// Reset an existing scalar value for \p Def and a given \p Lane.
|
|
void reset(const VPValue *Def, Value *V, const VPLane &Lane) {
|
|
auto Iter = Data.VPV2Scalars.find(Def);
|
|
assert(Iter != Data.VPV2Scalars.end() &&
|
|
"need to overwrite existing value");
|
|
unsigned CacheIdx = Lane.mapToCacheIndex(VF);
|
|
assert(CacheIdx < Iter->second.size() &&
|
|
"need to overwrite existing value");
|
|
Iter->second[CacheIdx] = V;
|
|
}
|
|
|
|
/// Set the debug location in the builder using the debug location \p DL.
|
|
void setDebugLocFrom(DebugLoc DL);
|
|
|
|
/// Insert the scalar value of \p Def at \p Lane into \p Lane of \p WideValue
|
|
/// and return the resulting value.
|
|
Value *packScalarIntoVectorizedValue(const VPValue *Def, Value *WideValue,
|
|
const VPLane &Lane);
|
|
|
|
/// Hold state information used when constructing the CFG of the output IR,
|
|
/// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
|
|
struct CFGState {
|
|
/// The previous VPBasicBlock visited. Initially set to null.
|
|
VPBasicBlock *PrevVPBB = nullptr;
|
|
|
|
/// The previous IR BasicBlock created or used. Initially set to the new
|
|
/// header BasicBlock.
|
|
BasicBlock *PrevBB = nullptr;
|
|
|
|
/// The last IR BasicBlock in the output IR. Set to the exit block of the
|
|
/// vector loop.
|
|
BasicBlock *ExitBB = nullptr;
|
|
|
|
/// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case
|
|
/// of replication, maps the BasicBlock of the last replica created.
|
|
SmallDenseMap<const VPBasicBlock *, BasicBlock *> VPBB2IRBB;
|
|
|
|
/// Updater for the DominatorTree.
|
|
DomTreeUpdater DTU;
|
|
|
|
CFGState(DominatorTree *DT)
|
|
: DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy) {}
|
|
} CFG;
|
|
|
|
/// Hold a pointer to LoopInfo to register new basic blocks in the loop.
|
|
LoopInfo *LI;
|
|
|
|
/// Hold a pointer to AssumptionCache to register new assumptions after
|
|
/// replicating assume calls.
|
|
AssumptionCache *AC;
|
|
|
|
/// Hold a reference to the IRBuilder used to generate output IR code.
|
|
IRBuilderBase &Builder;
|
|
|
|
/// Pointer to the VPlan code is generated for.
|
|
VPlan *Plan;
|
|
|
|
/// The parent loop object for the current scope, or nullptr.
|
|
Loop *CurrentParentLoop = nullptr;
|
|
|
|
/// VPlan-based type analysis.
|
|
VPTypeAnalysis TypeAnalysis;
|
|
|
|
/// VPlan-based dominator tree.
|
|
VPDominatorTree VPDT;
|
|
};
|
|
|
|
/// Struct to hold various analysis needed for cost computations.
|
|
struct VPCostContext {
|
|
const TargetTransformInfo &TTI;
|
|
const TargetLibraryInfo &TLI;
|
|
VPTypeAnalysis Types;
|
|
LLVMContext &LLVMCtx;
|
|
LoopVectorizationCostModel &CM;
|
|
SmallPtrSet<Instruction *, 8> SkipCostComputation;
|
|
TargetTransformInfo::TargetCostKind CostKind;
|
|
|
|
VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
|
|
Type *CanIVTy, LoopVectorizationCostModel &CM,
|
|
TargetTransformInfo::TargetCostKind CostKind)
|
|
: TTI(TTI), TLI(TLI), Types(CanIVTy), LLVMCtx(CanIVTy->getContext()),
|
|
CM(CM), CostKind(CostKind) {}
|
|
|
|
/// Return the cost for \p UI with \p VF using the legacy cost model as
|
|
/// fallback until computing the cost of all recipes migrates to VPlan.
|
|
InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const;
|
|
|
|
/// Return true if the cost for \p UI shouldn't be computed, e.g. because it
|
|
/// has already been pre-computed.
|
|
bool skipCostComputation(Instruction *UI, bool IsVector) const;
|
|
|
|
/// Returns the OperandInfo for \p V, if it is a live-in.
|
|
TargetTransformInfo::OperandValueInfo getOperandInfo(VPValue *V) const;
|
|
|
|
/// Return true if \p I is considered uniform-after-vectorization in the
|
|
/// legacy cost model for \p VF. Only used to check for additional VPlan
|
|
/// simplifications.
|
|
bool isLegacyUniformAfterVectorization(Instruction *I, ElementCount VF) const;
|
|
};
|
|
|
|
/// This class can be used to assign names to VPValues. For VPValues without
|
|
/// underlying value, assign consecutive numbers and use those as names (wrapped
|
|
/// in vp<>). Otherwise, use the name from the underlying value (wrapped in
|
|
/// ir<>), appending a .V version number if there are multiple uses of the same
|
|
/// name. Allows querying names for VPValues for printing, similar to the
|
|
/// ModuleSlotTracker for IR values.
|
|
class VPSlotTracker {
|
|
/// Keep track of versioned names assigned to VPValues with underlying IR
|
|
/// values.
|
|
DenseMap<const VPValue *, std::string> VPValue2Name;
|
|
/// Keep track of the next number to use to version the base name.
|
|
StringMap<unsigned> BaseName2Version;
|
|
|
|
/// Number to assign to the next VPValue without underlying value.
|
|
unsigned NextSlot = 0;
|
|
|
|
/// Lazily created ModuleSlotTracker, used only when unnamed IR instructions
|
|
/// require slot tracking.
|
|
std::unique_ptr<ModuleSlotTracker> MST;
|
|
|
|
void assignName(const VPValue *V);
|
|
void assignNames(const VPlan &Plan);
|
|
void assignNames(const VPBasicBlock *VPBB);
|
|
std::string getName(const Value *V);
|
|
|
|
public:
|
|
VPSlotTracker(const VPlan *Plan = nullptr) {
|
|
if (Plan)
|
|
assignNames(*Plan);
|
|
}
|
|
|
|
/// Returns the name assigned to \p V, if there is one, otherwise try to
|
|
/// construct one from the underlying value, if there's one; else return
|
|
/// <badref>.
|
|
std::string getOrCreateName(const VPValue *V) const;
|
|
};
|
|
|
|
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
|
/// VPlanPrinter prints a given VPlan to a given output stream. The printing is
|
|
/// indented and follows the dot format.
|
|
class VPlanPrinter {
|
|
raw_ostream &OS;
|
|
const VPlan &Plan;
|
|
unsigned Depth = 0;
|
|
unsigned TabWidth = 2;
|
|
std::string Indent;
|
|
unsigned BID = 0;
|
|
SmallDenseMap<const VPBlockBase *, unsigned> BlockID;
|
|
|
|
VPSlotTracker SlotTracker;
|
|
|
|
/// Handle indentation.
|
|
void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); }
|
|
|
|
/// Print a given \p Block of the Plan.
|
|
void dumpBlock(const VPBlockBase *Block);
|
|
|
|
/// Print the information related to the CFG edges going out of a given
|
|
/// \p Block, followed by printing the successor blocks themselves.
|
|
void dumpEdges(const VPBlockBase *Block);
|
|
|
|
/// Print a given \p BasicBlock, including its VPRecipes, followed by printing
|
|
/// its successor blocks.
|
|
void dumpBasicBlock(const VPBasicBlock *BasicBlock);
|
|
|
|
/// Print a given \p Region of the Plan.
|
|
void dumpRegion(const VPRegionBlock *Region);
|
|
|
|
unsigned getOrCreateBID(const VPBlockBase *Block) {
|
|
return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++;
|
|
}
|
|
|
|
Twine getOrCreateName(const VPBlockBase *Block);
|
|
|
|
Twine getUID(const VPBlockBase *Block);
|
|
|
|
/// Print the information related to a CFG edge between two VPBlockBases.
|
|
void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden,
|
|
const Twine &Label);
|
|
|
|
public:
|
|
VPlanPrinter(raw_ostream &O, const VPlan &P)
|
|
: OS(O), Plan(P), SlotTracker(&P) {}
|
|
|
|
LLVM_DUMP_METHOD void dump();
|
|
};
|
|
#endif
|
|
|
|
} // end namespace llvm
|
|
|
|
#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANHELPERS_H
|