
//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
|
|
#include "AMDGPUTargetMachine.h"
|
|
#include "llvm/Analysis/AssumptionCache.h"
|
|
#include "llvm/Analysis/UniformityAnalysis.h"
|
|
#include "llvm/Analysis/ValueTracking.h"
|
|
#include "llvm/CodeGen/TargetPassConfig.h"
|
|
#include "llvm/IR/IRBuilder.h"
|
|
#include "llvm/IR/InstVisitor.h"
|
|
#include "llvm/IR/IntrinsicsAMDGPU.h"
|
|
#include "llvm/Support/CommandLine.h"
|
|
#include "llvm/Support/KnownBits.h"
|
|
#include "llvm/Transforms/Utils/Local.h"
|
|
|
|
#define DEBUG_TYPE "amdgpu-late-codegenprepare"
|
|
|
|
using namespace llvm;
|
|
|
|
// Scalar load widening needs to run after the load-store-vectorizer, as that
// pass does not handle overlapping cases. In addition, this pass extends the
// widening to handle scalar sub-dword loads that are naturally aligned but
// not dword aligned.
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));

namespace {

class AMDGPULateCodeGenPrepare
    : public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Function &F;
  const DataLayout &DL;
  const GCNSubtarget &ST;

  AssumptionCache *const AC;
  UniformityInfo &UA;

  SmallVector<WeakTrackingVH, 8> DeadInsts;

public:
  AMDGPULateCodeGenPrepare(Function &F, const GCNSubtarget &ST,
                           AssumptionCache *AC, UniformityInfo &UA)
      : F(F), DL(F.getDataLayout()), ST(ST), AC(AC), UA(UA) {}
  bool run();
  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, DL, 0, AC);
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

using ValueToValueMap = DenseMap<const Value *, Value *>;

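/// Coerce vectors of illegal element types that are live across basic blocks
/// into equivalent i32-based values, so that SelectionDAG does not scalarize
/// and widen them across the block boundary; the packed value is converted
/// back to the original type in front of its uses.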
class LiveRegOptimizer {
private:
  Module &Mod;
  const DataLayout &DL;
  const GCNSubtarget &ST;

  /// The scalar type to convert to.
  Type *const ConvertToScalar;
  /// The set of visited Instructions.
  SmallPtrSet<Instruction *, 4> Visited;
  /// Map of Value -> Converted Value.
  ValueToValueMap ValMap;
  /// Per-BB map of already-materialized conversions from the optimized type
  /// back to the original type.
  DenseMap<BasicBlock *, ValueToValueMap> BBUseValMap;

public:
  /// Calculate and return the type to convert to given a problematic \p
  /// OriginalType. In some instances, we may widen the type (e.g. v2i8 -> i32).
  Type *calculateConvertType(Type *OriginalType);
  /// Convert the virtual register defined by \p V to the compatible vector of
  /// legal type.
  Value *convertToOptType(Instruction *V, BasicBlock::iterator &InstPt);
  /// Convert the virtual register defined by \p V back to the original type \p
  /// ConvertType, stripping away the MSBs in cases where there was an imperfect
  /// fit (e.g. v2i32 -> v7i8).
  Value *convertFromOptType(Type *ConvertType, Instruction *V,
                            BasicBlock::iterator &InstPt,
                            BasicBlock *InsertBlock);
  /// Check for problematic PHI nodes or cross-bb values based on the value
  /// defined by \p I, and coerce to legal types if necessary. For a problematic
  /// PHI node, we coerce all incoming values in a single invocation.
  bool optimizeLiveType(Instruction *I,
                        SmallVectorImpl<WeakTrackingVH> &DeadInsts);

  // Whether or not the type should be replaced to avoid inefficient
  // legalization code.
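  // For example, a <4 x i8> is coerced because i8 is not a legal type here,
  // whereas a vector of already-legal i32 elements is left untouched.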
  bool shouldReplace(Type *ITy) {
    FixedVectorType *VTy = dyn_cast<FixedVectorType>(ITy);
    if (!VTy)
      return false;

    const auto *TLI = ST.getTargetLowering();

    Type *EltTy = VTy->getElementType();
    // If the element type is wider than the scalar type we pack into, we
    // cannot do any bit packing.
    if (!EltTy->isIntegerTy() ||
        EltTy->getScalarSizeInBits() > ConvertToScalar->getScalarSizeInBits())
      return false;

    // Only coerce illegal types.
    TargetLoweringBase::LegalizeKind LK =
        TLI->getTypeConversion(EltTy->getContext(), EVT::getEVT(EltTy, false));
    return LK.first != TargetLoweringBase::TypeLegal;
  }

  bool isOpLegal(Instruction *I) {
    return isa<StoreInst>(I) || isa<IntrinsicInst>(I);
  }

  bool isCoercionProfitable(Instruction *II) {
    SmallPtrSet<Instruction *, 4> CVisited;
    SmallVector<Instruction *, 4> UserList;

    // Check the users for profitable conditions (a cross-block user that can
    // natively handle the illegal vector).
    for (User *V : II->users())
      if (auto *UseInst = dyn_cast<Instruction>(V))
        UserList.push_back(UseInst);

    auto IsLookThru = [](Instruction *II) {
      if (const auto *Intr = dyn_cast<IntrinsicInst>(II))
        return Intr->getIntrinsicID() == Intrinsic::amdgcn_perm;
      return isa<PHINode>(II) || isa<ShuffleVectorInst>(II) ||
             isa<InsertElementInst>(II) || isa<ExtractElementInst>(II) ||
             isa<CastInst>(II);
    };

    while (!UserList.empty()) {
      auto CII = UserList.pop_back_val();
      if (!CVisited.insert(CII).second)
        continue;

      if (CII->getParent() == II->getParent() && !IsLookThru(II))
        continue;

      if (isOpLegal(CII))
        return true;

      if (IsLookThru(CII))
        for (User *V : CII->users())
          if (auto *UseInst = dyn_cast<Instruction>(V))
            UserList.push_back(UseInst);
    }
    return false;
  }

  LiveRegOptimizer(Module &Mod, const GCNSubtarget &ST)
      : Mod(Mod), DL(Mod.getDataLayout()), ST(ST),
        ConvertToScalar(Type::getInt32Ty(Mod.getContext())) {}
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::run() {
  // "Optimize" the virtual regs that cross basic block boundaries. When
  // building the SelectionDAG, vectors of illegal types that cross basic
  // blocks will be scalarized and widened, with each scalar living in its
  // own register. To work around this, the optimization converts the
  // vectors to equivalent vectors of legal type (which are converted back
  // before uses in subsequent blocks), to pack the bits into fewer physical
  // registers (used in CopyToReg/CopyFromReg pairs).
  LiveRegOptimizer LRO(*F.getParent(), ST);

  bool Changed = false;

  bool HasScalarSubwordLoads = ST.hasScalarSubwordLoads();

  for (auto &BB : reverse(F))
    for (Instruction &I : make_early_inc_range(reverse(BB))) {
      Changed |= !HasScalarSubwordLoads && visit(I);
      Changed |= LRO.optimizeLiveType(&I, DeadInsts);
    }

  RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts);
  return Changed;
}

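// For example, with the i32 pack-to scalar set in the constructor, a <3 x i8>
// (24 bits) converts to i32, a <4 x i16> (64 bits) converts to <2 x i32>, and
// a <7 x i8> (56 bits) rounds up to <2 x i32>.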
Type *LiveRegOptimizer::calculateConvertType(Type *OriginalType) {
  assert(OriginalType->getScalarSizeInBits() <=
         ConvertToScalar->getScalarSizeInBits());

  FixedVectorType *VTy = cast<FixedVectorType>(OriginalType);

  TypeSize OriginalSize = DL.getTypeSizeInBits(VTy);
  TypeSize ConvertScalarSize = DL.getTypeSizeInBits(ConvertToScalar);
  unsigned ConvertEltCount =
      (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;

  if (OriginalSize <= ConvertScalarSize)
    return IntegerType::get(Mod.getContext(), ConvertScalarSize);

  return VectorType::get(Type::getIntNTy(Mod.getContext(), ConvertScalarSize),
                         ConvertEltCount, false);
}

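// For example, a <7 x i8> def is widened to <8 x i8> with a shufflevector
// (the extra lane reads the implicit poison operand) and then bitcast to the
// packed <2 x i32> form.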
Value *LiveRegOptimizer::convertToOptType(Instruction *V,
                                          BasicBlock::iterator &InsertPt) {
  FixedVectorType *VTy = cast<FixedVectorType>(V->getType());
  Type *NewTy = calculateConvertType(V->getType());

  TypeSize OriginalSize = DL.getTypeSizeInBits(VTy);
  TypeSize NewSize = DL.getTypeSizeInBits(NewTy);

  IRBuilder<> Builder(V->getParent(), InsertPt);
  // If there is a bitsize match, we can fit the old vector into a new vector of
  // desired type.
  if (OriginalSize == NewSize)
    return Builder.CreateBitCast(V, NewTy, V->getName() + ".bc");

  // If there is a bitsize mismatch, we must use a wider vector.
  assert(NewSize > OriginalSize);
  uint64_t ExpandedVecElementCount = NewSize / VTy->getScalarSizeInBits();

  SmallVector<int, 8> ShuffleMask;
  uint64_t OriginalElementCount = VTy->getElementCount().getFixedValue();
  for (unsigned I = 0; I < OriginalElementCount; I++)
    ShuffleMask.push_back(I);

  for (uint64_t I = OriginalElementCount; I < ExpandedVecElementCount; I++)
    ShuffleMask.push_back(OriginalElementCount);

  Value *ExpandedVec = Builder.CreateShuffleVector(V, ShuffleMask);
  return Builder.CreateBitCast(ExpandedVec, NewTy, V->getName() + ".bc");
}

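// For example, a packed i32 is truncated to i24 and bitcast back to <3 x i8>,
// while a packed <2 x i32> is bitcast to <8 x i8> and a shufflevector keeps
// the low 7 lanes to recover a <7 x i8>.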
Value *LiveRegOptimizer::convertFromOptType(Type *ConvertType, Instruction *V,
                                            BasicBlock::iterator &InsertPt,
                                            BasicBlock *InsertBB) {
  FixedVectorType *NewVTy = cast<FixedVectorType>(ConvertType);

  TypeSize OriginalSize = DL.getTypeSizeInBits(V->getType());
  TypeSize NewSize = DL.getTypeSizeInBits(NewVTy);

  IRBuilder<> Builder(InsertBB, InsertPt);
  // If there is a bitsize match, we simply convert back to the original type.
  if (OriginalSize == NewSize)
    return Builder.CreateBitCast(V, NewVTy, V->getName() + ".bc");

  // If there is a bitsize mismatch, then we must have used a wider value to
  // hold the bits.
  assert(OriginalSize > NewSize);
  // For wide scalars, we can just truncate the value.
  if (!V->getType()->isVectorTy()) {
    Instruction *Trunc = cast<Instruction>(
        Builder.CreateTrunc(V, IntegerType::get(Mod.getContext(), NewSize)));
    return cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
  }

  // For wider vectors, we must strip the MSBs to convert back to the original
  // type.
  VectorType *ExpandedVT = VectorType::get(
      Type::getIntNTy(Mod.getContext(), NewVTy->getScalarSizeInBits()),
      (OriginalSize / NewVTy->getScalarSizeInBits()), false);
  Instruction *Converted =
      cast<Instruction>(Builder.CreateBitCast(V, ExpandedVT));

  unsigned NarrowElementCount = NewVTy->getElementCount().getFixedValue();
  SmallVector<int, 8> ShuffleMask(NarrowElementCount);
  std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);

  return Builder.CreateShuffleVector(Converted, ShuffleMask);
}

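// Overall flow: walk the def/use web rooted at I to collect problematic PHIs,
// cross-block defs, and their uses; materialize a packed copy next to each
// def; rebuild the PHIs in the packed type; and finally convert back to the
// original type in front of each use.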
bool LiveRegOptimizer::optimizeLiveType(
    Instruction *I, SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
  SmallVector<Instruction *, 4> Worklist;
  SmallPtrSet<PHINode *, 4> PhiNodes;
  SmallPtrSet<Instruction *, 4> Defs;
  SmallPtrSet<Instruction *, 4> Uses;

  Worklist.push_back(cast<Instruction>(I));
  while (!Worklist.empty()) {
    Instruction *II = Worklist.pop_back_val();

    if (!Visited.insert(II).second)
      continue;

    if (!shouldReplace(II->getType()))
      continue;

    if (!isCoercionProfitable(II))
      continue;

    if (PHINode *Phi = dyn_cast<PHINode>(II)) {
      PhiNodes.insert(Phi);
      // Collect all the incoming values of problematic PHI nodes.
      for (Value *V : Phi->incoming_values()) {
        // Repeat the collection process for newly found PHI nodes.
        if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
          if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
            Worklist.push_back(OpPhi);
          continue;
        }

        Instruction *IncInst = dyn_cast<Instruction>(V);
        // Other incoming value types (e.g. vector literals) are unhandled.
        if (!IncInst && !isa<ConstantAggregateZero>(V))
          return false;

        // Collect all other incoming values for coercion.
        if (IncInst)
          Defs.insert(IncInst);
      }
    }

    // Collect all relevant uses.
    for (User *V : II->users()) {
      // Repeat the collection process for problematic PHI nodes.
      if (PHINode *OpPhi = dyn_cast<PHINode>(V)) {
        if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
          Worklist.push_back(OpPhi);
        continue;
      }

      Instruction *UseInst = cast<Instruction>(V);
      // Collect all uses of PHINodes and any use that crosses BB boundaries.
      if (UseInst->getParent() != II->getParent() || isa<PHINode>(II)) {
        Uses.insert(UseInst);
        if (!isa<PHINode>(II))
          Defs.insert(II);
      }
    }
  }

  // Coerce and track the defs.
  for (Instruction *D : Defs) {
    if (!ValMap.contains(D)) {
      BasicBlock::iterator InsertPt = std::next(D->getIterator());
      Value *ConvertVal = convertToOptType(D, InsertPt);
      assert(ConvertVal);
      ValMap[D] = ConvertVal;
    }
  }

  // Construct new-typed PHI nodes.
  for (PHINode *Phi : PhiNodes) {
    ValMap[Phi] = PHINode::Create(calculateConvertType(Phi->getType()),
                                  Phi->getNumIncomingValues(),
                                  Phi->getName() + ".tc", Phi->getIterator());
  }

  // Connect all the PHI nodes with their new incoming values.
  for (PHINode *Phi : PhiNodes) {
    PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
    bool MissingIncVal = false;
    for (int I = 0, E = Phi->getNumIncomingValues(); I < E; I++) {
      Value *IncVal = Phi->getIncomingValue(I);
      if (isa<ConstantAggregateZero>(IncVal)) {
        Type *NewType = calculateConvertType(Phi->getType());
        NewPhi->addIncoming(ConstantInt::get(NewType, 0, false),
                            Phi->getIncomingBlock(I));
      } else if (Value *Val = ValMap.lookup(IncVal))
        NewPhi->addIncoming(Val, Phi->getIncomingBlock(I));
      else
        MissingIncVal = true;
    }
    if (MissingIncVal) {
      Value *DeadVal = ValMap[Phi];
      // The coercion chain of the PHI is broken. Delete the Phi
      // from the ValMap and any connected / user Phis.
      SmallVector<Value *, 4> PHIWorklist;
      SmallPtrSet<Value *, 4> VisitedPhis;
      PHIWorklist.push_back(DeadVal);
      while (!PHIWorklist.empty()) {
        Value *NextDeadValue = PHIWorklist.pop_back_val();
        VisitedPhis.insert(NextDeadValue);
        auto OriginalPhi =
            std::find_if(PhiNodes.begin(), PhiNodes.end(),
                         [this, &NextDeadValue](PHINode *CandPhi) {
                           return ValMap[CandPhi] == NextDeadValue;
                         });
        // This PHI may have already been removed from maps when
        // unwinding a previous Phi.
        if (OriginalPhi != PhiNodes.end())
          ValMap.erase(*OriginalPhi);

        DeadInsts.emplace_back(cast<Instruction>(NextDeadValue));

        for (User *U : NextDeadValue->users()) {
          if (!VisitedPhis.contains(cast<PHINode>(U)))
            PHIWorklist.push_back(U);
        }
      }
    } else {
      DeadInsts.emplace_back(cast<Instruction>(Phi));
    }
  }
  // Coerce back to the original type and replace the uses.
  for (Instruction *U : Uses) {
    // Replace all converted operands for a use.
    for (auto [OpIdx, Op] : enumerate(U->operands())) {
      if (Value *Val = ValMap.lookup(Op)) {
        Value *NewVal = nullptr;
        if (BBUseValMap.contains(U->getParent()) &&
            BBUseValMap[U->getParent()].contains(Val))
          NewVal = BBUseValMap[U->getParent()][Val];
        else {
          BasicBlock::iterator InsertPt = U->getParent()->getFirstNonPHIIt();
          // We may pick up ops that were previously converted for users in
          // other blocks. If there is an originally typed definition of the Op
          // already in this block, simply reuse it.
          if (isa<Instruction>(Op) && !isa<PHINode>(Op) &&
              U->getParent() == cast<Instruction>(Op)->getParent()) {
            NewVal = Op;
          } else {
            NewVal =
                convertFromOptType(Op->getType(), cast<Instruction>(ValMap[Op]),
                                   InsertPt, U->getParent());
            BBUseValMap[U->getParent()][ValMap[Op]] = NewVal;
          }
        }
        assert(NewVal);
        U->setOperand(OpIdx, NewVal);
      }
    }
  }

  return true;
}

bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address space.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  Type *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL.getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // The load must be at least naturally aligned.
  if (LI.getAlign() < DL.getABITypeAlign(Ty))
    return false;
  // It should be uniform, i.e. a scalar load.
  return UA.isUniform(&LI);
}

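// For example, a naturally aligned i16 load at byte offset 2 from a
// dword-aligned base,
//   %v = load i16, ptr addrspace(4) %gep, align 2
// is rewritten, roughly, as a dword-aligned load of the containing dword
// followed by a shift and truncate:
//   %w = load i32, ptr addrspace(4) %base, align 4
//   %s = lshr i32 %w, 16
//   %v = trunc i32 %s to i16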
bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if the load is already at least DWORD aligned, as that case is
  // handled in SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, DL);
  // If the base is not DWORD aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;

  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, we can simply promote the original load to DWORD
    // alignment.
    LI.setAlignment(Align(4));
    return true;
  }

  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned LdBits = DL.getTypeStoreSizeInBits(LI.getType());
  auto *IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  auto *NewPtr = IRB.CreateConstGEP1_64(
      IRB.getInt8Ty(),
      IRB.CreateAddrSpaceCast(Base, LI.getPointerOperand()->getType()),
      Offset - Adjust);

  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  unsigned ShAmt = Adjust * 8;
  Value *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt),
                      DL.typeSizeEqualsStoreSize(LI.getType()) ? IntNTy
                                                               : LI.getType()),
      LI.getType());
  LI.replaceAllUsesWith(NewVal);
  DeadInsts.emplace_back(&LI);

  return true;
}

PreservedAnalyses
AMDGPULateCodeGenPreparePass::run(Function &F, FunctionAnalysisManager &FAM) {
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  AssumptionCache &AC = FAM.getResult<AssumptionAnalysis>(F);
  UniformityInfo &UI = FAM.getResult<UniformityInfoAnalysis>(F);

  bool Changed = AMDGPULateCodeGenPrepare(F, ST, &AC, UI).run();

  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA = PreservedAnalyses::none();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

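// Legacy pass manager wrapper around the same implementation.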
class AMDGPULateCodeGenPrepareLegacy : public FunctionPass {
public:
  static char ID;

  AMDGPULateCodeGenPrepareLegacy() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.setPreservesAll();
  }

  bool runOnFunction(Function &F) override;
};

bool AMDGPULateCodeGenPrepareLegacy::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);

  AssumptionCache &AC =
      getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  UniformityInfo &UI =
      getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

  return AMDGPULateCodeGenPrepare(F, ST, &AC, UI).run();
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepareLegacy, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepareLegacy, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepareLegacy::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPrepareLegacyPass() {
  return new AMDGPULateCodeGenPrepareLegacy();
}