
As part of the "RemoveDIs" project, BasicBlock::iterator now carries a debug-info bit that's needed when getFirstNonPHI and similar feed into instruction insertion positions. Call-sites where that's necessary were updated a year ago; but to ensure some type safety however, we'd like to have all calls to moveBefore use iterators. This patch adds a (guaranteed dereferenceable) iterator-taking moveBefore, and changes a bunch of call-sites where it's obviously safe to change to use it by just calling getIterator() on an instruction pointer. A follow-up patch will contain less-obviously-safe changes. We'll eventually deprecate and remove the instruction-pointer insertBefore, but not before adding concise documentation of what considerations are needed (very few).
1638 lines · 64 KiB · C++
//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass merges loads/stores to/from sequential memory addresses into vector
// loads/stores. Although there's nothing GPU-specific in here, this pass is
// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
//
// (For simplicity below we talk about loads only, but everything also applies
// to stores.)
//
// This pass is intended to be run late in the pipeline, after other
// vectorization opportunities have been exploited. So the assumption here is
// that immediately following our new vector load we'll need to extract out the
// individual elements of the load, so we can operate on them individually.
//
// On CPUs this transformation is usually not beneficial, because extracting the
// elements of a vector register is expensive on most architectures. It's
// usually better just to load each element individually into its own scalar
// register.
//
// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
// "vector load" loads directly into a series of scalar registers. In effect,
// extracting the elements of the vector is free. It's therefore always
// beneficial to vectorize a sequence of loads on these architectures.
//
// Vectorizing (perhaps a better name might be "coalescing") loads can have
// large performance impacts on GPU kernels, and opportunities for vectorizing
// are common in GPU code. This pass tries very hard to find such
// opportunities; its runtime is quadratic in the number of loads in a BB.
//
// Some CPU architectures, such as ARM, have instructions that load into
// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
// could use this pass (with some modifications), but currently it implements
// its own pass to do something similar to what we do here.
//
// Overview of the algorithm and terminology in this pass:
//
// - Break up each basic block into pseudo-BBs, composed of instructions which
//   are guaranteed to transfer control to their successors.
// - Within a single pseudo-BB, find all loads, and group them into
//   "equivalence classes" according to getUnderlyingObject() and loaded
//   element size. Do the same for stores.
// - For each equivalence class, greedily build "chains". Each chain has a
//   leader instruction, and every other member of the chain has a known
//   constant offset from the first instr in the chain.
// - Break up chains so that they contain only contiguous accesses of legal
//   size with no intervening may-alias instrs.
// - Convert each chain to vector instructions.
//
// The O(n^2) behavior of this pass comes from initially building the chains.
// In the worst case we have to compare each new instruction to all of those
// that came before. To limit this, we only calculate the offset to the leaders
// of the N most recently-used chains.
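//
// As a rough illustration (hand-written example, not taken from the test
// suite; the exact IR depends on the target), two adjacent scalar loads
//
//   %p1 = getelementptr inbounds i32, ptr %p, i64 1
//   %x0 = load i32, ptr %p, align 4
//   %x1 = load i32, ptr %p1, align 4
//
// become, roughly,
//
//   %vec = load <2 x i32>, ptr %p, align 4
//   %x0 = extractelement <2 x i32> %vec, i32 0
//   %x1 = extractelement <2 x i32> %vec, i32 1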
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
|
|
#include "llvm/ADT/APInt.h"
|
|
#include "llvm/ADT/ArrayRef.h"
|
|
#include "llvm/ADT/DenseMap.h"
|
|
#include "llvm/ADT/MapVector.h"
|
|
#include "llvm/ADT/PostOrderIterator.h"
|
|
#include "llvm/ADT/STLExtras.h"
|
|
#include "llvm/ADT/Sequence.h"
|
|
#include "llvm/ADT/SmallPtrSet.h"
|
|
#include "llvm/ADT/SmallVector.h"
|
|
#include "llvm/ADT/Statistic.h"
|
|
#include "llvm/ADT/iterator_range.h"
|
|
#include "llvm/Analysis/AliasAnalysis.h"
|
|
#include "llvm/Analysis/AssumptionCache.h"
|
|
#include "llvm/Analysis/MemoryLocation.h"
|
|
#include "llvm/Analysis/ScalarEvolution.h"
|
|
#include "llvm/Analysis/TargetTransformInfo.h"
|
|
#include "llvm/Analysis/ValueTracking.h"
|
|
#include "llvm/Analysis/VectorUtils.h"
|
|
#include "llvm/IR/Attributes.h"
|
|
#include "llvm/IR/BasicBlock.h"
|
|
#include "llvm/IR/ConstantRange.h"
|
|
#include "llvm/IR/Constants.h"
|
|
#include "llvm/IR/DataLayout.h"
|
|
#include "llvm/IR/DerivedTypes.h"
|
|
#include "llvm/IR/Dominators.h"
|
|
#include "llvm/IR/Function.h"
|
|
#include "llvm/IR/GetElementPtrTypeIterator.h"
|
|
#include "llvm/IR/IRBuilder.h"
|
|
#include "llvm/IR/InstrTypes.h"
|
|
#include "llvm/IR/Instruction.h"
|
|
#include "llvm/IR/Instructions.h"
|
|
#include "llvm/IR/LLVMContext.h"
|
|
#include "llvm/IR/Module.h"
|
|
#include "llvm/IR/Type.h"
|
|
#include "llvm/IR/Value.h"
|
|
#include "llvm/InitializePasses.h"
|
|
#include "llvm/Pass.h"
|
|
#include "llvm/Support/Alignment.h"
|
|
#include "llvm/Support/Casting.h"
|
|
#include "llvm/Support/Debug.h"
|
|
#include "llvm/Support/KnownBits.h"
|
|
#include "llvm/Support/MathExtras.h"
|
|
#include "llvm/Support/ModRef.h"
|
|
#include "llvm/Support/raw_ostream.h"
|
|
#include "llvm/Transforms/Utils/Local.h"
|
|
#include <algorithm>
|
|
#include <cassert>
|
|
#include <cstdint>
|
|
#include <cstdlib>
|
|
#include <iterator>
|
|
#include <numeric>
|
|
#include <optional>
|
|
#include <tuple>
|
|
#include <type_traits>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
using namespace llvm;
|
|
|
|
#define DEBUG_TYPE "load-store-vectorizer"
|
|
|
|
STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
|
|
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
|
|
|
|
namespace {
|
|
|
|
// Equivalence class key, the initial tuple by which we group loads/stores.
|
|
// Loads/stores with different EqClassKeys are never merged.
|
|
//
|
|
// (We could in theory remove element-size from this tuple. We'd just need
|
|
// to fix up the vector packing/unpacking code.)
|
|
using EqClassKey =
|
|
std::tuple<const Value * /* result of getUnderlyingObject() */,
|
|
unsigned /* AddrSpace */,
|
|
unsigned /* Load/Store element size bits */,
|
|
char /* IsLoad; char b/c bool can't be a DenseMap key */
|
|
>;
|
|
[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
|
|
const EqClassKey &K) {
|
|
const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
|
|
return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
|
|
<< " of element size " << ElementSize << " bits in addrspace "
|
|
<< AddrSpace;
|
|
}
|
|
|
|
// A Chain is a set of instructions such that:
|
|
// - All instructions have the same equivalence class, so in particular all are
|
|
// loads, or all are stores.
|
|
// - We know the address accessed by the i'th chain elem relative to the
|
|
// chain's leader instruction, which is the first instr of the chain in BB
|
|
// order.
|
|
//
|
|
// Chains have two canonical orderings:
|
|
// - BB order, sorted by Instr->comesBefore.
|
|
// - Offset order, sorted by OffsetFromLeader.
|
|
// This pass switches back and forth between these orders.
|
|
struct ChainElem {
|
|
Instruction *Inst;
|
|
APInt OffsetFromLeader;
|
|
ChainElem(Instruction *Inst, APInt OffsetFromLeader)
|
|
: Inst(std::move(Inst)), OffsetFromLeader(std::move(OffsetFromLeader)) {}
|
|
};
|
|
using Chain = SmallVector<ChainElem, 1>;
|
|
|
|
void sortChainInBBOrder(Chain &C) {
|
|
sort(C, [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); });
|
|
}
|
|
|
|
void sortChainInOffsetOrder(Chain &C) {
|
|
sort(C, [](const auto &A, const auto &B) {
|
|
if (A.OffsetFromLeader != B.OffsetFromLeader)
|
|
return A.OffsetFromLeader.slt(B.OffsetFromLeader);
|
|
return A.Inst->comesBefore(B.Inst); // stable tiebreaker
|
|
});
|
|
}
|
|
|
|
[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {
|
|
for (const auto &E : C) {
|
|
dbgs() << " " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n";
|
|
}
|
|
}
|
|
|
|
using EquivalenceClassMap =
|
|
MapVector<EqClassKey, SmallVector<Instruction *, 8>>;
|
|
|
|
// FIXME: Assuming stack alignment of 4 is always good enough
|
|
constexpr unsigned StackAdjustedAlignment = 4;
|
|
|
|
Instruction *propagateMetadata(Instruction *I, const Chain &C) {
|
|
SmallVector<Value *, 8> Values;
|
|
for (const ChainElem &E : C)
|
|
Values.emplace_back(E.Inst);
|
|
return propagateMetadata(I, Values);
|
|
}
|
|
|
|
bool isInvariantLoad(const Instruction *I) {
|
|
const LoadInst *LI = dyn_cast<LoadInst>(I);
|
|
return LI != nullptr && LI->hasMetadata(LLVMContext::MD_invariant_load);
|
|
}
|
|
|
|
/// Reorders the instructions that I depends on (the instructions defining its
|
|
/// operands), to ensure they dominate I.
|
|
void reorder(Instruction *I) {
|
|
SmallPtrSet<Instruction *, 16> InstructionsToMove;
|
|
SmallVector<Instruction *, 16> Worklist;
|
|
|
|
Worklist.emplace_back(I);
|
|
while (!Worklist.empty()) {
|
|
Instruction *IW = Worklist.pop_back_val();
|
|
int NumOperands = IW->getNumOperands();
|
|
for (int Idx = 0; Idx < NumOperands; Idx++) {
|
|
Instruction *IM = dyn_cast<Instruction>(IW->getOperand(Idx));
|
|
if (!IM || IM->getOpcode() == Instruction::PHI)
|
|
continue;
|
|
|
|
// If IM is in another BB, no need to move it, because this pass only
|
|
// vectorizes instructions within one BB.
|
|
if (IM->getParent() != I->getParent())
|
|
continue;
|
|
|
|
assert(IM != I && "Unexpected cycle while re-ordering instructions");
|
|
|
|
if (!IM->comesBefore(I)) {
|
|
InstructionsToMove.insert(IM);
|
|
Worklist.emplace_back(IM);
|
|
}
|
|
}
|
|
}
|
|
|
|
// All instructions to move should follow I. Start from I, not from begin().
|
|
for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {
|
|
Instruction *IM = &*(BBI++);
|
|
if (!InstructionsToMove.contains(IM))
|
|
continue;
|
|
IM->moveBefore(I->getIterator());
|
|
}
|
|
}
|
|
|
|
class Vectorizer {
|
|
Function &F;
|
|
AliasAnalysis &AA;
|
|
AssumptionCache &AC;
|
|
DominatorTree &DT;
|
|
ScalarEvolution &SE;
|
|
TargetTransformInfo &TTI;
|
|
const DataLayout &DL;
|
|
IRBuilder<> Builder;
|
|
|
|
// We could erase instrs right after vectorizing them, but that can mess up
|
|
// our BB iterators, and also can make the equivalence class keys point to
|
|
// freed memory. This is fixable, but it's simpler just to wait until we're
|
|
// done with the BB and erase all at once.
|
|
SmallVector<Instruction *, 128> ToErase;
|
|
|
|
public:
|
|
Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
|
|
DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
|
|
: F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
|
|
DL(F.getDataLayout()), Builder(SE.getContext()) {}
|
|
|
|
bool run();
|
|
|
|
private:
|
|
static const unsigned MaxDepth = 3;
|
|
|
|
/// Runs the vectorizer on a "pseudo basic block", which is a range of
|
|
/// instructions [Begin, End) within one BB all of which have
|
|
/// isGuaranteedToTransferExecutionToSuccessor(I) == true.
|
|
bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);
|
|
|
|
/// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
|
|
/// in the same BB with the same value for getUnderlyingObject() etc.
|
|
bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
|
|
ArrayRef<Instruction *> EqClass);
|
|
|
|
/// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
|
|
/// where all instructions access a known, constant offset from the first
|
|
/// instruction.
|
|
bool runOnChain(Chain &C);
|
|
|
|
/// Splits the chain into subchains of instructions which read/write a
|
|
/// contiguous block of memory. Discards any length-1 subchains (because
|
|
/// there's nothing to vectorize in there).
|
|
std::vector<Chain> splitChainByContiguity(Chain &C);
|
|
|
|
/// Splits the chain into subchains where it's safe to hoist loads up to the
|
|
/// beginning of the sub-chain and it's safe to sink loads up to the end of
|
|
/// the sub-chain. Discards any length-1 subchains.
|
|
std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);
|
|
|
|
/// Splits the chain into subchains that make legal, aligned accesses.
|
|
/// Discards any length-1 subchains.
|
|
std::vector<Chain> splitChainByAlignment(Chain &C);
|
|
|
|
/// Converts the instrs in the chain into a single vectorized load or store.
|
|
/// Adds the old scalar loads/stores to ToErase.
|
|
bool vectorizeChain(Chain &C);
|
|
|
|
/// Tries to compute the offset in bytes PtrB - PtrA.
|
|
std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB,
|
|
Instruction *ContextInst,
|
|
unsigned Depth = 0);
|
|
std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB,
|
|
Instruction *ContextInst,
|
|
unsigned Depth);
|
|
std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB,
|
|
Instruction *ContextInst,
|
|
unsigned Depth);
|
|
|
|
/// Gets the element type of the vector that the chain will load or store.
|
|
/// This is nontrivial because the chain may contain elements of different
|
|
/// types; e.g. it's legal to have a chain that contains both i32 and float.
|
|
Type *getChainElemTy(const Chain &C);
|
|
|
|
/// Determines whether ChainElem can be moved up (if IsLoad) or down (if
|
|
/// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
|
|
/// instructions.
|
|
///
|
|
/// The map ChainElemOffsets must contain all of the elements in
|
|
/// [ChainBegin, ChainElem] and their offsets from some arbitrary base
|
|
/// address. It's ok if it contains additional entries.
|
|
template <bool IsLoadChain>
|
|
bool isSafeToMove(
|
|
Instruction *ChainElem, Instruction *ChainBegin,
|
|
const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);
|
|
|
|
/// Merges the equivalence classes if they have underlying objects that differ
|
|
/// by one level of indirection (i.e., one is a getelementptr and the other is
|
|
/// the base pointer in that getelementptr).
|
|
void mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const;
|
|
|
|
/// Collects loads and stores grouped by "equivalence class", where:
|
|
/// - all elements in an eq class are a load or all are a store,
|
|
/// - they all load/store the same element size (it's OK to have e.g. i8 and
|
|
/// <4 x i8> in the same class, but not i32 and <4 x i8>), and
|
|
/// - they all have the same value for getUnderlyingObject().
|
|
EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
|
|
BasicBlock::iterator End);
|
|
|
|
/// Partitions Instrs into "chains" where every instruction has a known
|
|
/// constant offset from the first instr in the chain.
|
|
///
|
|
/// Postcondition: For all i, ret[i][0].second == 0, because the first instr
|
|
/// in the chain is the leader, and an instr touches distance 0 from itself.
|
|
std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
|
|
};
|
|
|
|
class LoadStoreVectorizerLegacyPass : public FunctionPass {
|
|
public:
|
|
static char ID;
|
|
|
|
LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
|
|
initializeLoadStoreVectorizerLegacyPassPass(
|
|
*PassRegistry::getPassRegistry());
|
|
}
|
|
|
|
bool runOnFunction(Function &F) override;
|
|
|
|
StringRef getPassName() const override {
|
|
return "GPU Load and Store Vectorizer";
|
|
}
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
AU.addRequired<AAResultsWrapperPass>();
|
|
AU.addRequired<AssumptionCacheTracker>();
|
|
AU.addRequired<ScalarEvolutionWrapperPass>();
|
|
AU.addRequired<DominatorTreeWrapperPass>();
|
|
AU.addRequired<TargetTransformInfoWrapperPass>();
|
|
AU.setPreservesCFG();
|
|
}
|
|
};
|
|
|
|
} // end anonymous namespace
|
|
|
|
char LoadStoreVectorizerLegacyPass::ID = 0;
|
|
|
|
INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
|
|
"Vectorize load and Store instructions", false, false)
|
|
INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
|
|
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker);
|
|
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
|
|
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
|
|
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
|
|
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
|
|
INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
|
|
"Vectorize load and store instructions", false, false)
|
|
|
|
Pass *llvm::createLoadStoreVectorizerPass() {
|
|
return new LoadStoreVectorizerLegacyPass();
|
|
}
|
|
|
|
bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
|
|
// Don't vectorize when the attribute NoImplicitFloat is used.
|
|
if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
|
|
return false;
|
|
|
|
AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
|
|
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
|
ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
|
|
TargetTransformInfo &TTI =
|
|
getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
|
|
|
|
AssumptionCache &AC =
|
|
getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
|
|
|
|
return Vectorizer(F, AA, AC, DT, SE, TTI).run();
|
|
}
|
|
|
|
PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
|
|
FunctionAnalysisManager &AM) {
|
|
// Don't vectorize when the attribute NoImplicitFloat is used.
|
|
if (F.hasFnAttribute(Attribute::NoImplicitFloat))
|
|
return PreservedAnalyses::all();
|
|
|
|
AliasAnalysis &AA = AM.getResult<AAManager>(F);
|
|
DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
|
|
ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
|
|
TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
|
|
AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
|
|
|
|
bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run();
|
|
PreservedAnalyses PA;
|
|
PA.preserveSet<CFGAnalyses>();
|
|
return Changed ? PA : PreservedAnalyses::all();
|
|
}
|
|
|
|
bool Vectorizer::run() {
|
|
bool Changed = false;
|
|
// Break up the BB if there are any instrs which aren't guaranteed to transfer
|
|
// execution to their successor.
|
|
//
|
|
// Consider, for example:
|
|
//
|
|
// def assert_arr_len(int n) { if (n < 2) exit(); }
|
|
//
|
|
// load arr[0]
|
|
// call assert_array_len(arr.length)
|
|
// load arr[1]
|
|
//
|
|
// Even though assert_arr_len does not read or write any memory, we can't
|
|
// speculate the second load before the call. More info at
|
|
// https://github.com/llvm/llvm-project/issues/52950.
|
|
for (BasicBlock *BB : post_order(&F)) {
|
|
// BB must at least have a terminator.
|
|
assert(!BB->empty());
|
|
|
|
SmallVector<BasicBlock::iterator, 8> Barriers;
|
|
Barriers.emplace_back(BB->begin());
|
|
for (Instruction &I : *BB)
|
|
if (!isGuaranteedToTransferExecutionToSuccessor(&I))
|
|
Barriers.emplace_back(I.getIterator());
|
|
Barriers.emplace_back(BB->end());
|
|
|
|
for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End;
|
|
++It)
|
|
Changed |= runOnPseudoBB(*It, *std::next(It));
|
|
|
|
for (Instruction *I : ToErase) {
|
|
auto *PtrOperand = getLoadStorePointerOperand(I);
|
|
if (I->use_empty())
|
|
I->eraseFromParent();
|
|
RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
|
|
}
|
|
ToErase.clear();
|
|
}
|
|
|
|
return Changed;
|
|
}
|
|
|
|
bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
|
|
BasicBlock::iterator End) {
|
|
LLVM_DEBUG({
|
|
dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";
|
|
if (End != Begin->getParent()->end())
|
|
dbgs() << *End;
|
|
else
|
|
dbgs() << "<BB end>";
|
|
dbgs() << ")\n";
|
|
});
|
|
|
|
bool Changed = false;
|
|
for (const auto &[EqClassKey, EqClass] :
|
|
collectEquivalenceClasses(Begin, End))
|
|
Changed |= runOnEquivalenceClass(EqClassKey, EqClass);
|
|
|
|
return Changed;
|
|
}
|
|
|
|
bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
|
|
ArrayRef<Instruction *> EqClass) {
|
|
bool Changed = false;
|
|
|
|
LLVM_DEBUG({
|
|
dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
|
|
<< " keyed on " << EqClassKey << ":\n";
|
|
for (Instruction *I : EqClass)
|
|
dbgs() << " " << *I << "\n";
|
|
});
|
|
|
|
std::vector<Chain> Chains = gatherChains(EqClass);
|
|
LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
|
|
<< " nontrivial chains.\n";);
|
|
for (Chain &C : Chains)
|
|
Changed |= runOnChain(C);
|
|
return Changed;
|
|
}
|
|
|
|
bool Vectorizer::runOnChain(Chain &C) {
|
|
LLVM_DEBUG({
|
|
dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
|
|
dumpChain(C);
|
|
});
|
|
|
|
// Split up the chain into increasingly smaller chains, until we can finally
|
|
// vectorize the chains.
|
|
//
|
|
// (Don't be scared by the depth of the loop nest here. These operations are
|
|
// all at worst O(n lg n) in the number of instructions, and splitting chains
|
|
// doesn't change the number of instrs. So the whole loop nest is O(n lg n).)
|
|
bool Changed = false;
|
|
for (auto &C : splitChainByMayAliasInstrs(C))
|
|
for (auto &C : splitChainByContiguity(C))
|
|
for (auto &C : splitChainByAlignment(C))
|
|
Changed |= vectorizeChain(C);
|
|
return Changed;
|
|
}
|
|
|
|
std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
|
|
if (C.empty())
|
|
return {};
|
|
|
|
sortChainInBBOrder(C);
|
|
|
|
LLVM_DEBUG({
|
|
dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
|
|
dumpChain(C);
|
|
});
|
|
|
|
// We know that elements in the chain with non-overlapping offsets can't
|
|
// alias, but AA may not be smart enough to figure this out. Use a
|
|
// hashtable.
|
|
DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
|
|
for (const auto &E : C)
|
|
ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});
|
|
|
|
// Loads get hoisted up to the first load in the chain. Stores get sunk
|
|
// down to the last store in the chain. Our algorithm for loads is:
|
|
//
|
|
// - Take the first element of the chain. This is the start of a new chain.
|
|
// - Take the next element of `Chain` and check for may-alias instructions
|
|
// up to the start of NewChain. If no may-alias instrs, add it to
|
|
// NewChain. Otherwise, start a new NewChain.
|
|
//
|
|
// For stores it's the same except in the reverse direction.
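//
// For instance (an illustrative, hand-written example; %p, %p1, %q and the
// offsets are made up), in a load chain
//
//   %x0 = load i32, ptr %p    ; chain elem, OffsetFromLeader 0
//   store i32 0, ptr %q       ; may alias %p1 as far as AA can tell
//   %x1 = load i32, ptr %p1   ; chain elem, OffsetFromLeader 4
//
// the store blocks hoisting %x1 up to %x0, so we start a new chain at %x1,
// and both resulting length-1 chains are discarded.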
|
|
//
|
|
// We expect IsLoad to be an std::bool_constant.
|
|
auto Impl = [&](auto IsLoad) {
|
|
// MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
|
|
auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
|
|
if constexpr (IsLoad())
|
|
return std::make_pair(C.begin(), C.end());
|
|
else
|
|
return std::make_pair(C.rbegin(), C.rend());
|
|
}(IsLoad);
|
|
assert(ChainBegin != ChainEnd);
|
|
|
|
std::vector<Chain> Chains;
|
|
SmallVector<ChainElem, 1> NewChain;
|
|
NewChain.emplace_back(*ChainBegin);
|
|
for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
|
|
if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
|
|
ChainOffsets)) {
|
|
LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
|
|
<< *ChainIt->Inst << " into " << *ChainBegin->Inst
|
|
<< "\n");
|
|
NewChain.emplace_back(*ChainIt);
|
|
} else {
|
|
LLVM_DEBUG(
|
|
dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
|
|
<< *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
|
|
if (NewChain.size() > 1) {
|
|
LLVM_DEBUG({
|
|
dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
|
|
dumpChain(NewChain);
|
|
});
|
|
Chains.emplace_back(std::move(NewChain));
|
|
}
|
|
|
|
// Start a new chain.
|
|
NewChain = SmallVector<ChainElem, 1>({*ChainIt});
|
|
}
|
|
}
|
|
if (NewChain.size() > 1) {
|
|
LLVM_DEBUG({
|
|
dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
|
|
dumpChain(NewChain);
|
|
});
|
|
Chains.emplace_back(std::move(NewChain));
|
|
}
|
|
return Chains;
|
|
};
|
|
|
|
if (isa<LoadInst>(C[0].Inst))
|
|
return Impl(/*IsLoad=*/std::bool_constant<true>());
|
|
|
|
assert(isa<StoreInst>(C[0].Inst));
|
|
return Impl(/*IsLoad=*/std::bool_constant<false>());
|
|
}
|
|
|
|
std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
|
|
if (C.empty())
|
|
return {};
|
|
|
|
sortChainInOffsetOrder(C);
|
|
|
|
LLVM_DEBUG({
|
|
dbgs() << "LSV: splitChainByContiguity considering chain:\n";
|
|
dumpChain(C);
|
|
});
|
|
|
|
std::vector<Chain> Ret;
|
|
Ret.push_back({C.front()});
|
|
|
|
for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
|
|
// `Prev` accesses offsets [Prev.OffsetFromLeader, PrevReadEnd).
|
|
auto &CurChain = Ret.back();
|
|
const ChainElem &Prev = CurChain.back();
|
|
unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
|
|
assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
|
|
"collectEquivalenceClass");
|
|
APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;
|
|
|
|
// Add this instruction to the end of the current chain, or start a new one.
|
|
bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
|
|
LLVM_DEBUG(dbgs() << "LSV: Instructions are "
|
|
<< (AreContiguous ? "" : "not ") << "contiguous: "
|
|
<< *Prev.Inst << " (ends at offset " << PrevReadEnd
|
|
<< ") -> " << *It->Inst << " (starts at offset "
|
|
<< It->OffsetFromLeader << ")\n");
|
|
if (AreContiguous)
|
|
CurChain.push_back(*It);
|
|
else
|
|
Ret.push_back({*It});
|
|
}
|
|
|
|
// Filter out length-1 chains, these are uninteresting.
|
|
llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
|
|
return Ret;
|
|
}
|
|
|
|
Type *Vectorizer::getChainElemTy(const Chain &C) {
|
|
assert(!C.empty());
|
|
// The rules are:
|
|
// - If there are any pointer types in the chain, use an integer type.
|
|
// - Prefer an integer type if it appears in the chain.
|
|
// - Otherwise, use the first type in the chain.
|
|
//
|
|
// The rule about pointer types is a simplification when we merge e.g. a load
|
|
// of a ptr and a double. There's no direct conversion from a ptr to a
|
|
// double; it requires a ptrtoint followed by a bitcast.
|
|
//
|
|
// It's unclear to me if the other rules have any practical effect, but we do
|
|
// it to match this pass's previous behavior.
|
|
if (any_of(C, [](const ChainElem &E) {
|
|
return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy();
|
|
})) {
|
|
return Type::getIntNTy(
|
|
F.getContext(),
|
|
DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType()));
|
|
}
|
|
|
|
for (const ChainElem &E : C)
|
|
if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy())
|
|
return T;
|
|
return getLoadStoreType(C[0].Inst)->getScalarType();
|
|
}
|
|
|
|
std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
|
|
// We use a simple greedy algorithm.
|
|
// - Given a chain of length N, find all prefixes that
|
|
// (a) are not longer than the max register length, and
|
|
// (b) are a power of 2.
|
|
// - Starting from the longest prefix, try to create a vector of that length.
|
|
// - If one of them works, great. Repeat the algorithm on any remaining
|
|
// elements in the chain.
|
|
// - If none of them work, discard the first element and repeat on a chain
|
|
// of length N-1.
|
|
if (C.empty())
|
|
return {};
|
|
|
|
sortChainInOffsetOrder(C);
|
|
|
|
LLVM_DEBUG({
|
|
dbgs() << "LSV: splitChainByAlignment considering chain:\n";
|
|
dumpChain(C);
|
|
});
|
|
|
|
bool IsLoadChain = isa<LoadInst>(C[0].Inst);
|
|
auto GetVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,
|
|
unsigned ChainSizeBytes, VectorType *VecTy) {
|
|
return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize,
|
|
ChainSizeBytes, VecTy)
|
|
: TTI.getStoreVectorFactor(VF, LoadStoreSize,
|
|
ChainSizeBytes, VecTy);
|
|
};
|
|
|
|
#ifndef NDEBUG
|
|
for (const auto &E : C) {
|
|
Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
|
|
assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&
|
|
"Should have filtered out non-power-of-two elements in "
|
|
"collectEquivalenceClasses.");
|
|
}
|
|
#endif
|
|
|
|
unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
|
|
unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;
|
|
|
|
std::vector<Chain> Ret;
|
|
for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
|
|
// Find candidate chains of size not greater than the largest vector reg.
|
|
// These chains are over the closed interval [CBegin, CEnd].
|
|
SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
|
|
CandidateChains;
|
|
for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
|
|
APInt Sz = C[CEnd].OffsetFromLeader +
|
|
DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst)) -
|
|
C[CBegin].OffsetFromLeader;
|
|
if (Sz.sgt(VecRegBytes))
|
|
break;
|
|
CandidateChains.emplace_back(CEnd,
|
|
static_cast<unsigned>(Sz.getLimitedValue()));
|
|
}
|
|
|
|
// Consider the longest chain first.
|
|
for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();
|
|
It != End; ++It) {
|
|
auto [CEnd, SizeBytes] = *It;
|
|
LLVM_DEBUG(
|
|
dbgs() << "LSV: splitChainByAlignment considering candidate chain ["
|
|
<< *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n");
|
|
|
|
Type *VecElemTy = getChainElemTy(C);
|
|
// Note, VecElemTy is a power of 2, but might be less than one byte. For
|
|
// example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case
|
|
// VecElemTy would be i4.
|
|
unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy);
|
|
|
|
// SizeBytes and VecElemBits are powers of 2, so they divide evenly.
|
|
assert((8 * SizeBytes) % VecElemBits == 0);
|
|
unsigned NumVecElems = 8 * SizeBytes / VecElemBits;
|
|
FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems);
|
|
unsigned VF = 8 * VecRegBytes / VecElemBits;
|
|
|
|
// Check that TTI is happy with this vectorization factor.
|
|
unsigned TargetVF = GetVectorFactor(VF, VecElemBits,
|
|
VecElemBits * NumVecElems / 8, VecTy);
|
|
if (TargetVF != VF && TargetVF < NumVecElems) {
|
|
LLVM_DEBUG(
|
|
dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
|
|
"because TargetVF="
|
|
<< TargetVF << " != VF=" << VF
|
|
<< " and TargetVF < NumVecElems=" << NumVecElems << "\n");
|
|
continue;
|
|
}
|
|
|
|
// Is a load/store with this alignment allowed by TTI and at least as fast
|
|
// as an unvectorized load/store?
|
|
//
|
|
// TTI and F are passed as explicit captures to work around an apparent
// MSVC misparse.
|
|
auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI,
|
|
&F = F](Align Alignment) {
|
|
if (Alignment.value() % SizeBytes == 0)
|
|
return true;
|
|
unsigned VectorizedSpeed = 0;
|
|
bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
|
|
F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
|
|
if (!AllowsMisaligned) {
|
|
LLVM_DEBUG(dbgs()
|
|
<< "LSV: Access of " << SizeBytes << "B in addrspace "
|
|
<< AS << " with alignment " << Alignment.value()
|
|
<< " is misaligned, and therefore can't be vectorized.\n");
|
|
return false;
|
|
}
|
|
|
|
unsigned ElementwiseSpeed = 0;
|
|
(TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS,
|
|
Alignment, &ElementwiseSpeed);
|
|
if (VectorizedSpeed < ElementwiseSpeed) {
|
|
LLVM_DEBUG(dbgs()
|
|
<< "LSV: Access of " << SizeBytes << "B in addrspace "
|
|
<< AS << " with alignment " << Alignment.value()
|
|
<< " has relative speed " << VectorizedSpeed
|
|
<< ", which is lower than the elementwise speed of "
|
|
<< ElementwiseSpeed
|
|
<< ". Therefore this access won't be vectorized.\n");
|
|
return false;
|
|
}
|
|
return true;
|
|
};
|
|
|
|
// If we're loading/storing from an alloca, align it if possible.
|
|
//
|
|
// FIXME: We eagerly upgrade the alignment, regardless of whether TTI
|
|
// tells us this is beneficial. This feels a bit odd, but it matches
|
|
// existing tests. This isn't *so* bad, because at most we align to 4
|
|
// bytes (current value of StackAdjustedAlignment).
|
|
//
|
|
// FIXME: We will upgrade the alignment of the alloca even if it turns out
|
|
// we can't vectorize for some other reason.
|
|
Value *PtrOperand = getLoadStorePointerOperand(C[CBegin].Inst);
|
|
bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() &&
|
|
isa<AllocaInst>(PtrOperand->stripPointerCasts());
|
|
Align Alignment = getLoadStoreAlignment(C[CBegin].Inst);
|
|
Align PrefAlign = Align(StackAdjustedAlignment);
|
|
if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 &&
|
|
IsAllowedAndFast(PrefAlign)) {
|
|
Align NewAlign = getOrEnforceKnownAlignment(
|
|
PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT);
|
|
if (NewAlign >= Alignment) {
|
|
LLVM_DEBUG(dbgs()
|
|
<< "LSV: splitByChain upgrading alloca alignment from "
|
|
<< Alignment.value() << " to " << NewAlign.value()
|
|
<< "\n");
|
|
Alignment = NewAlign;
|
|
}
|
|
}
|
|
|
|
if (!IsAllowedAndFast(Alignment)) {
|
|
LLVM_DEBUG(
|
|
dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
|
|
"because its alignment is not AllowedAndFast: "
|
|
<< Alignment.value() << "\n");
|
|
continue;
|
|
}
|
|
|
|
if ((IsLoadChain &&
|
|
!TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) ||
|
|
(!IsLoadChain &&
|
|
!TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) {
|
|
LLVM_DEBUG(
|
|
dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
|
|
"because !isLegalToVectorizeLoad/StoreChain.");
|
|
continue;
|
|
}
|
|
|
|
// Hooray, we can vectorize this chain!
|
|
Chain &NewChain = Ret.emplace_back();
|
|
for (unsigned I = CBegin; I <= CEnd; ++I)
|
|
NewChain.emplace_back(C[I]);
|
|
CBegin = CEnd; // Skip over the instructions we've added to the chain.
|
|
break;
|
|
}
|
|
}
|
|
return Ret;
|
|
}
|
|
|
|
bool Vectorizer::vectorizeChain(Chain &C) {
|
|
if (C.size() < 2)
|
|
return false;
|
|
|
|
sortChainInOffsetOrder(C);
|
|
|
|
LLVM_DEBUG({
|
|
dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";
|
|
dumpChain(C);
|
|
});
|
|
|
|
Type *VecElemTy = getChainElemTy(C);
|
|
bool IsLoadChain = isa<LoadInst>(C[0].Inst);
|
|
unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
|
|
unsigned ChainBytes = std::accumulate(
|
|
C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {
|
|
return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
|
|
});
|
|
assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
|
|
// VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
|
|
// than 1 byte (e.g. VecTy == <32 x i1>).
|
|
Type *VecTy = FixedVectorType::get(
|
|
VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));
|
|
|
|
Align Alignment = getLoadStoreAlignment(C[0].Inst);
|
|
// If this is a load/store of an alloca, we might have upgraded the alloca's
|
|
// alignment earlier. Get the new alignment.
|
|
if (AS == DL.getAllocaAddrSpace()) {
|
|
Alignment = std::max(
|
|
Alignment,
|
|
getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst),
|
|
MaybeAlign(), DL, C[0].Inst, nullptr, &DT));
|
|
}
|
|
|
|
// All elements of the chain must have the same scalar-type size.
|
|
#ifndef NDEBUG
|
|
for (const ChainElem &E : C)
|
|
assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
|
|
DL.getTypeStoreSize(VecElemTy));
|
|
#endif
|
|
|
|
Instruction *VecInst;
|
|
if (IsLoadChain) {
|
|
// Loads get hoisted to the location of the first load in the chain. We may
|
|
// also need to hoist the (transitive) operands of the loads.
|
|
Builder.SetInsertPoint(
|
|
llvm::min_element(C, [](const auto &A, const auto &B) {
|
|
return A.Inst->comesBefore(B.Inst);
|
|
})->Inst);
|
|
|
|
// Chain is in offset order, so C[0] is the instr with the lowest offset,
|
|
// i.e. the root of the vector.
|
|
VecInst = Builder.CreateAlignedLoad(VecTy,
|
|
getLoadStorePointerOperand(C[0].Inst),
|
|
Alignment);
|
|
|
|
unsigned VecIdx = 0;
|
|
for (const ChainElem &E : C) {
|
|
Instruction *I = E.Inst;
|
|
Value *V;
|
|
Type *T = getLoadStoreType(I);
|
|
if (auto *VT = dyn_cast<FixedVectorType>(T)) {
|
|
auto Mask = llvm::to_vector<8>(
|
|
llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
|
|
V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
|
|
VecIdx += VT->getNumElements();
|
|
} else {
|
|
V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
|
|
I->getName());
|
|
++VecIdx;
|
|
}
|
|
if (V->getType() != I->getType())
|
|
V = Builder.CreateBitOrPointerCast(V, I->getType());
|
|
I->replaceAllUsesWith(V);
|
|
}
|
|
|
|
// Finally, we need to reorder the instrs in the BB so that the (transitive)
|
|
// operands of VecInst appear before it. To see why, suppose we have
|
|
// vectorized the following code:
|
|
//
|
|
// ptr1 = gep a, 1
|
|
// load1 = load i32 ptr1
|
|
// ptr0 = gep a, 0
|
|
// load0 = load i32 ptr0
|
|
//
|
|
// We will put the vectorized load at the location of the earliest load in
|
|
// the BB, i.e. load1. We get:
|
|
//
|
|
// ptr1 = gep a, 1
|
|
// loadv = load <2 x i32> ptr0
|
|
// load0 = extractelement loadv, 0
|
|
// load1 = extractelement loadv, 1
|
|
// ptr0 = gep a, 0
|
|
//
|
|
// Notice that loadv uses ptr0, which is defined *after* it!
|
|
reorder(VecInst);
|
|
} else {
|
|
// Stores get sunk to the location of the last store in the chain.
|
|
Builder.SetInsertPoint(llvm::max_element(C, [](auto &A, auto &B) {
|
|
return A.Inst->comesBefore(B.Inst);
|
|
})->Inst);
|
|
|
|
// Build the vector to store.
|
|
Value *Vec = PoisonValue::get(VecTy);
|
|
unsigned VecIdx = 0;
|
|
auto InsertElem = [&](Value *V) {
|
|
if (V->getType() != VecElemTy)
|
|
V = Builder.CreateBitOrPointerCast(V, VecElemTy);
|
|
Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
|
|
};
|
|
for (const ChainElem &E : C) {
|
|
auto *I = cast<StoreInst>(E.Inst);
|
|
if (FixedVectorType *VT =
|
|
dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
|
|
for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
|
|
InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
|
|
Builder.getInt32(J)));
|
|
}
|
|
} else {
|
|
InsertElem(I->getValueOperand());
|
|
}
|
|
}
|
|
|
|
// Chain is in offset order, so C[0] is the instr with the lowest offset,
|
|
// i.e. the root of the vector.
|
|
VecInst = Builder.CreateAlignedStore(
|
|
Vec,
|
|
getLoadStorePointerOperand(C[0].Inst),
|
|
Alignment);
|
|
}
|
|
|
|
propagateMetadata(VecInst, C);
|
|
|
|
for (const ChainElem &E : C)
|
|
ToErase.emplace_back(E.Inst);
|
|
|
|
++NumVectorInstructions;
|
|
NumScalarsVectorized += C.size();
|
|
return true;
|
|
}
|
|
|
|
template <bool IsLoadChain>
|
|
bool Vectorizer::isSafeToMove(
|
|
Instruction *ChainElem, Instruction *ChainBegin,
|
|
const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets) {
|
|
LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
|
|
<< *ChainBegin << ")\n");
|
|
|
|
assert(isa<LoadInst>(ChainElem) == IsLoadChain);
|
|
if (ChainElem == ChainBegin)
|
|
return true;
|
|
|
|
// Invariant loads can always be reordered; by definition they are not
|
|
// clobbered by stores.
|
|
if (isInvariantLoad(ChainElem))
|
|
return true;
|
|
|
|
auto BBIt = std::next([&] {
|
|
if constexpr (IsLoadChain)
|
|
return BasicBlock::reverse_iterator(ChainElem);
|
|
else
|
|
return BasicBlock::iterator(ChainElem);
|
|
}());
|
|
auto BBItEnd = std::next([&] {
|
|
if constexpr (IsLoadChain)
|
|
return BasicBlock::reverse_iterator(ChainBegin);
|
|
else
|
|
return BasicBlock::iterator(ChainBegin);
|
|
}());
|
|
|
|
const APInt &ChainElemOffset = ChainOffsets.at(ChainElem);
|
|
const unsigned ChainElemSize =
|
|
DL.getTypeStoreSize(getLoadStoreType(ChainElem));
|
|
|
|
for (; BBIt != BBItEnd; ++BBIt) {
|
|
Instruction *I = &*BBIt;
|
|
|
|
if (!I->mayReadOrWriteMemory())
|
|
continue;
|
|
|
|
// Loads can be reordered with other loads.
|
|
if (IsLoadChain && isa<LoadInst>(I))
|
|
continue;
|
|
|
|
// Stores can be sunk below invariant loads.
|
|
if (!IsLoadChain && isInvariantLoad(I))
|
|
continue;
|
|
|
|
// If I is in the chain, we can tell whether it aliases ChainElem by checking
// what offset ChainElem accesses. This may be better than AA is able to do.
|
|
//
|
|
// We should really only have duplicate offsets for stores (the duplicate
|
|
// loads should be CSE'ed), but in case we have a duplicate load, we'll
|
|
// split the chain so we don't have to handle this case specially.
|
|
if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
|
|
// I and ChainElem overlap if:
|
|
// - I and ChainElem have the same offset, OR
|
|
// - I's offset is less than ChainElem's, but I touches past the
|
|
// beginning of ChainElem, OR
|
|
// - ChainElem's offset is less than I's, but ChainElem touches past the
|
|
// beginning of I.
|
|
const APInt &IOffset = OffsetIt->second;
|
|
unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
|
|
if (IOffset == ChainElemOffset ||
|
|
(IOffset.sle(ChainElemOffset) &&
|
|
(IOffset + IElemSize).sgt(ChainElemOffset)) ||
|
|
(ChainElemOffset.sle(IOffset) &&
|
|
(ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
|
|
LLVM_DEBUG({
|
|
// Double check that AA also sees this alias. If not, we probably
|
|
// have a bug.
|
|
ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
|
|
assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
|
|
dbgs() << "LSV: Found alias in chain: " << *I << "\n";
|
|
});
|
|
return false; // We found an aliasing instruction; bail.
|
|
}
|
|
|
|
continue; // We're confident there's no alias.
|
|
}
|
|
|
|
LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
|
|
ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
|
|
if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
|
|
LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
|
|
<< " Aliasing instruction:\n"
|
|
<< " " << *I << '\n'
|
|
<< " Aliased instruction and pointer:\n"
|
|
<< " " << *ChainElem << '\n'
|
|
<< " " << *getLoadStorePointerOperand(ChainElem)
|
|
<< '\n');
|
|
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
static bool checkNoWrapFlags(Instruction *I, bool Signed) {
|
|
BinaryOperator *BinOpI = cast<BinaryOperator>(I);
|
|
return (Signed && BinOpI->hasNoSignedWrap()) ||
|
|
(!Signed && BinOpI->hasNoUnsignedWrap());
|
|
}
|
|
|
|
static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
|
|
unsigned MatchingOpIdxA, Instruction *AddOpB,
|
|
unsigned MatchingOpIdxB, bool Signed) {
|
|
LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
|
|
<< ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="
|
|
<< MatchingOpIdxA << ", AddOpB=" << *AddOpB
|
|
<< ", MatchingOpIdxB=" << MatchingOpIdxB
|
|
<< ", Signed=" << Signed << "\n");
|
|
// If both OpA and OpB are adds with NSW/NUW and with one of the operands
|
|
// being the same, we can guarantee that the transformation is safe if we can
|
|
// prove that OpA won't overflow when IdxDiff is added to OpA's other operand.
|
|
// For example:
|
|
// %tmp7 = add nsw i32 %tmp2, %v0
|
|
// %tmp8 = sext i32 %tmp7 to i64
|
|
// ...
|
|
// %tmp11 = add nsw i32 %v0, 1
|
|
// %tmp12 = add nsw i32 %tmp2, %tmp11
|
|
// %tmp13 = sext i32 %tmp12 to i64
|
|
//
|
|
// Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.
|
|
// It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds
|
|
// 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
|
|
assert(AddOpA->getOpcode() == Instruction::Add &&
|
|
AddOpB->getOpcode() == Instruction::Add &&
|
|
checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
|
|
if (AddOpA->getOperand(MatchingOpIdxA) ==
|
|
AddOpB->getOperand(MatchingOpIdxB)) {
|
|
Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
|
|
Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
|
|
Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
|
|
Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
|
|
// Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
|
|
if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
|
|
checkNoWrapFlags(OtherInstrB, Signed) &&
|
|
isa<ConstantInt>(OtherInstrB->getOperand(1))) {
|
|
int64_t CstVal =
|
|
cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
|
|
if (OtherInstrB->getOperand(0) == OtherOperandA &&
|
|
IdxDiff.getSExtValue() == CstVal)
|
|
return true;
|
|
}
|
|
// Match `x +nsw/nuw (y +nsw/nuw -IdxDiff)` and `x +nsw/nuw y`.
|
|
if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
|
|
checkNoWrapFlags(OtherInstrA, Signed) &&
|
|
isa<ConstantInt>(OtherInstrA->getOperand(1))) {
|
|
int64_t CstVal =
|
|
cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
|
|
if (OtherInstrA->getOperand(0) == OtherOperandB &&
|
|
IdxDiff.getSExtValue() == -CstVal)
|
|
return true;
|
|
}
|
|
// Match `x +nsw/nuw (y +nsw/nuw c)` and
|
|
// `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
|
|
if (OtherInstrA && OtherInstrB &&
|
|
OtherInstrA->getOpcode() == Instruction::Add &&
|
|
OtherInstrB->getOpcode() == Instruction::Add &&
|
|
checkNoWrapFlags(OtherInstrA, Signed) &&
|
|
checkNoWrapFlags(OtherInstrB, Signed) &&
|
|
isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
|
|
isa<ConstantInt>(OtherInstrB->getOperand(1))) {
|
|
int64_t CstValA =
|
|
cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
|
|
int64_t CstValB =
|
|
cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
|
|
if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
|
|
IdxDiff.getSExtValue() == (CstValB - CstValA))
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
|
|
Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
|
|
LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA
|
|
<< " PtrB=" << *PtrB << " ContextInst=" << *ContextInst
|
|
<< " Depth=" << Depth << "\n");
|
|
auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
|
|
auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
|
|
if (!GEPA || !GEPB)
|
|
return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth);
|
|
|
|
// Look through GEPs after checking they're the same except for the last
|
|
// index.
|
|
if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
|
|
GEPA->getPointerOperand() != GEPB->getPointerOperand())
|
|
return std::nullopt;
|
|
gep_type_iterator GTIA = gep_type_begin(GEPA);
|
|
gep_type_iterator GTIB = gep_type_begin(GEPB);
|
|
for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
|
|
if (GTIA.getOperand() != GTIB.getOperand())
|
|
return std::nullopt;
|
|
++GTIA;
|
|
++GTIB;
|
|
}
|
|
|
|
Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
|
|
Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
|
|
if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
|
|
OpA->getType() != OpB->getType())
|
|
return std::nullopt;
|
|
|
|
uint64_t Stride = GTIA.getSequentialElementStride(DL);
|
|
|
|
// Only look through a ZExt/SExt.
|
|
if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
|
|
return std::nullopt;
|
|
|
|
bool Signed = isa<SExtInst>(OpA);
|
|
|
|
// At this point A could be a function parameter, i.e. not an instruction
|
|
Value *ValA = OpA->getOperand(0);
|
|
OpB = dyn_cast<Instruction>(OpB->getOperand(0));
|
|
if (!OpB || ValA->getType() != OpB->getType())
|
|
return std::nullopt;
|
|
|
|
const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
|
|
const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
|
|
const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
|
|
if (IdxDiffSCEV == SE.getCouldNotCompute())
|
|
return std::nullopt;
|
|
|
|
ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV);
|
|
if (!IdxDiffRange.isSingleElement())
|
|
return std::nullopt;
|
|
APInt IdxDiff = *IdxDiffRange.getSingleElement();
|
|
|
|
LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
|
|
<< "\n");
|
|
|
|
// Now we need to prove that adding IdxDiff to ValA won't overflow.
|
|
bool Safe = false;
|
|
|
|
// First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
|
|
// ValA, we're okay.
|
|
if (OpB->getOpcode() == Instruction::Add &&
|
|
isa<ConstantInt>(OpB->getOperand(1)) &&
|
|
IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
|
|
checkNoWrapFlags(OpB, Signed))
|
|
Safe = true;
|
|
|
|
// Second attempt: check if we have eligible add NSW/NUW instruction
|
|
// sequences.
|
|
OpA = dyn_cast<Instruction>(ValA);
|
|
if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
|
|
OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
|
|
checkNoWrapFlags(OpB, Signed)) {
|
|
// In the checks below a matching operand in OpA and OpB is an operand which
|
|
// is the same in those two instructions. Below we account for possible
|
|
// orders of the operands of these add instructions.
|
|
for (unsigned MatchingOpIdxA : {0, 1})
|
|
for (unsigned MatchingOpIdxB : {0, 1})
|
|
if (!Safe)
|
|
Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
|
|
MatchingOpIdxB, Signed);
|
|
}
|
|
|
|
unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
|
|
|
|
// Third attempt:
|
|
//
|
|
// Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher
|
|
// order bit other than the sign bit are known to be zero in ValA, we can add
|
|
// Diff to it while guaranteeing no overflow of any sort.
|
|
//
|
|
// If IdxDiff is negative, do the same, but swap ValA and ValB.
|
|
if (!Safe) {
|
|
// When computing known bits, use the GEPs as context instructions, since
|
|
// they likely are in the same BB as the load/store.
|
|
KnownBits Known(BitWidth);
|
|
computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, 0, &AC,
|
|
ContextInst, &DT);
|
|
APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
|
|
if (Signed)
|
|
BitsAllowedToBeSet.clearBit(BitWidth - 1);
|
|
if (BitsAllowedToBeSet.ult(IdxDiff.abs()))
|
|
return std::nullopt;
|
|
Safe = true;
|
|
}
|
|
|
|
if (Safe)
|
|
return IdxDiff * Stride;
|
|
return std::nullopt;
|
|
}
|
|
|
|
std::optional<APInt> Vectorizer::getConstantOffsetSelects(
|
|
Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
|
|
if (Depth++ == MaxDepth)
|
|
return std::nullopt;
|
|
|
|
if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
|
|
if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
|
|
if (SelectA->getCondition() != SelectB->getCondition())
|
|
return std::nullopt;
|
|
LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
|
|
<< ", PtrB=" << *PtrB << ", ContextInst="
|
|
<< *ContextInst << ", Depth=" << Depth << "\n");
|
|
std::optional<APInt> TrueDiff = getConstantOffset(
|
|
SelectA->getTrueValue(), SelectB->getTrueValue(), ContextInst, Depth);
|
|
if (!TrueDiff)
|
|
return std::nullopt;
|
|
std::optional<APInt> FalseDiff =
|
|
getConstantOffset(SelectA->getFalseValue(), SelectB->getFalseValue(),
|
|
ContextInst, Depth);
|
|
if (TrueDiff == FalseDiff)
|
|
return TrueDiff;
|
|
}
|
|
}
|
|
return std::nullopt;
|
|
}
|
|
|
|
void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
|
|
if (EQClasses.size() < 2) // There is nothing to merge.
|
|
return;
|
|
|
|
// The reduced key has all elements of the ECClassKey except the underlying
|
|
// object. Check that EqClassKey has 4 elements and define the reduced key.
|
|
static_assert(std::tuple_size_v<EqClassKey> == 4,
|
|
"EqClassKey has changed - EqClassReducedKey needs changes too");
|
|
using EqClassReducedKey =
|
|
std::tuple<std::tuple_element_t<1, EqClassKey> /* AddrSpace */,
|
|
std::tuple_element_t<2, EqClassKey> /* Element size */,
|
|
std::tuple_element_t<3, EqClassKey> /* IsLoad; */>;
|
|
using ECReducedKeyToUnderlyingObjectMap =
|
|
MapVector<EqClassReducedKey,
|
|
SmallPtrSet<std::tuple_element_t<0, EqClassKey>, 4>>;
|
|
|
|
// Form a map from the reduced key (without the underlying object) to the
|
|
// underlying objects: 1 reduced key to many underlying objects, to form
|
|
// groups of potentially merge-able equivalence classes.
|
|
ECReducedKeyToUnderlyingObjectMap RedKeyToUOMap;
|
|
bool FoundPotentiallyOptimizableEC = false;
|
|
for (const auto &EC : EQClasses) {
|
|
const auto &Key = EC.first;
|
|
EqClassReducedKey RedKey{std::get<1>(Key), std::get<2>(Key),
|
|
std::get<3>(Key)};
|
|
RedKeyToUOMap[RedKey].insert(std::get<0>(Key));
|
|
if (RedKeyToUOMap[RedKey].size() > 1)
|
|
FoundPotentiallyOptimizableEC = true;
|
|
}
|
|
if (!FoundPotentiallyOptimizableEC)
|
|
return;
|
|
|
|
LLVM_DEBUG({
|
|
dbgs() << "LSV: mergeEquivalenceClasses: before merging:\n";
|
|
for (const auto &EC : EQClasses) {
|
|
dbgs() << " Key: {" << EC.first << "}\n";
|
|
for (const auto &Inst : EC.second)
|
|
dbgs() << " Inst: " << *Inst << '\n';
|
|
}
|
|
});
|
|
LLVM_DEBUG({
|
|
dbgs() << "LSV: mergeEquivalenceClasses: RedKeyToUOMap:\n";
|
|
for (const auto &RedKeyToUO : RedKeyToUOMap) {
|
|
dbgs() << " Reduced key: {" << std::get<0>(RedKeyToUO.first) << ", "
|
|
<< std::get<1>(RedKeyToUO.first) << ", "
|
|
<< static_cast<int>(std::get<2>(RedKeyToUO.first)) << "} --> "
|
|
<< RedKeyToUO.second.size() << " underlying objects:\n";
|
|
for (auto UObject : RedKeyToUO.second)
|
|
dbgs() << " " << *UObject << '\n';
|
|
}
|
|
});
|
|
|
|
using UObjectToUObjectMap = DenseMap<const Value *, const Value *>;
|
|
|
|
// Compute the ultimate targets for a set of underlying objects.
|
|
auto GetUltimateTargets =
|
|
[](SmallPtrSetImpl<const Value *> &UObjects) -> UObjectToUObjectMap {
|
|
UObjectToUObjectMap IndirectionMap;
|
|
for (const auto *UObject : UObjects) {
|
|
const unsigned MaxLookupDepth = 1; // look for 1-level indirections only
|
|
const auto *UltimateTarget = getUnderlyingObject(UObject, MaxLookupDepth);
|
|
if (UltimateTarget != UObject)
|
|
IndirectionMap[UObject] = UltimateTarget;
|
|
}
|
|
UObjectToUObjectMap UltimateTargetsMap;
|
|
for (const auto *UObject : UObjects) {
|
|
auto Target = UObject;
|
|
auto It = IndirectionMap.find(Target);
|
|
for (; It != IndirectionMap.end(); It = IndirectionMap.find(Target))
|
|
Target = It->second;
|
|
UltimateTargetsMap[UObject] = Target;
|
|
}
|
|
return UltimateTargetsMap;
|
|
};
|
|
|
|
// For each item in RedKeyToUOMap, if it has more than one underlying object,
|
|
// try to merge the equivalence classes.
|
|
for (auto &[RedKey, UObjects] : RedKeyToUOMap) {
|
|
if (UObjects.size() < 2)
|
|
continue;
|
|
auto UTMap = GetUltimateTargets(UObjects);
|
|
for (const auto &[UObject, UltimateTarget] : UTMap) {
|
|
if (UObject == UltimateTarget)
|
|
continue;
|
|
|
|
EqClassKey KeyFrom{UObject, std::get<0>(RedKey), std::get<1>(RedKey),
|
|
std::get<2>(RedKey)};
|
|
EqClassKey KeyTo{UltimateTarget, std::get<0>(RedKey), std::get<1>(RedKey),
|
|
std::get<2>(RedKey)};
|
|
// The entry for KeyFrom is guaranteed to exist, unlike KeyTo. Thus,
|
|
// request the reference to the instructions vector for KeyTo first.
|
|
const auto &VecTo = EQClasses[KeyTo];
|
|
const auto &VecFrom = EQClasses[KeyFrom];
|
|
SmallVector<Instruction *, 8> MergedVec;
|
|
std::merge(VecFrom.begin(), VecFrom.end(), VecTo.begin(), VecTo.end(),
|
|
std::back_inserter(MergedVec),
|
|
[](Instruction *A, Instruction *B) {
|
|
return A && B && A->comesBefore(B);
|
|
});
|
|
EQClasses[KeyTo] = std::move(MergedVec);
|
|
EQClasses.erase(KeyFrom);
|
|
}
|
|
}
|
|
LLVM_DEBUG({
|
|
dbgs() << "LSV: mergeEquivalenceClasses: after merging:\n";
|
|
for (const auto &EC : EQClasses) {
|
|
dbgs() << " Key: {" << EC.first << "}\n";
|
|
for (const auto &Inst : EC.second)
|
|
dbgs() << " Inst: " << *Inst << '\n';
|
|
}
|
|
});
|
|
}
|
|
|
|
EquivalenceClassMap
|
|
Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
|
|
BasicBlock::iterator End) {
|
|
EquivalenceClassMap Ret;
|
|
|
|
auto GetUnderlyingObject = [](const Value *Ptr) -> const Value * {
|
|
const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
|
|
if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
|
|
// The selects themselves are distinct instructions even if they share
// the same condition and evaluate to consecutive pointers for true and
// false values of the condition. Therefore using the selects themselves
// for grouping instructions would put consecutive accesses into different
// lists and they won't even be checked for being consecutive, and won't
// be vectorized.
|
|
return Sel->getCondition();
|
|
}
|
|
return ObjPtr;
|
|
};
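  // Illustrative (hypothetical) IR for the case above:
  //   %p = select i1 %c, ptr %a, ptr %b
  //   %q = select i1 %c, ptr %a1, ptr %b1
  // Grouping %p and %q by the shared condition %c keeps the two accesses in
  // one equivalence class; grouping by the selects themselves would split
  // them.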

  for (Instruction &I : make_range(Begin, End)) {
    auto *LI = dyn_cast<LoadInst>(&I);
    auto *SI = dyn_cast<StoreInst>(&I);
    if (!LI && !SI)
      continue;

    if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
      continue;

    if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
        (SI && !TTI.isLegalToVectorizeStore(SI)))
      continue;

    Type *Ty = getLoadStoreType(&I);
    if (!VectorType::isValidElementType(Ty->getScalarType()))
      continue;

    // Skip weird non-byte sizes. They probably aren't worth the effort of
    // handling correctly.
    unsigned TySize = DL.getTypeSizeInBits(Ty);
    if ((TySize % 8) != 0)
      continue;

    // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
    // functions currently use an integer type for the vectorized load/store
    // and do not support casting between that integer type and a vector of
    // pointers (e.g. i64 to <2 x i16*>).
    if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
      continue;

    Value *Ptr = getLoadStorePointerOperand(&I);
    unsigned AS = Ptr->getType()->getPointerAddressSpace();
    unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);

    unsigned VF = VecRegSize / TySize;
    VectorType *VecTy = dyn_cast<VectorType>(Ty);

    // Only handle power-of-two sized elements.
    if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) ||
        (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType()))))
      continue;

    // No point in looking at these if they're too big to vectorize.
    if (TySize > VecRegSize / 2 ||
        (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
      continue;
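
    // The equivalence class key groups accesses by underlying object, address
    // space, scalar element width, and load-vs-store, matching the fields of
    // EqClassKey.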
    Ret[{GetUnderlyingObject(Ptr), AS,
         DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
         /*IsLoad=*/LI != nullptr}]
        .emplace_back(&I);
  }

  mergeEquivalenceClasses(Ret);
  return Ret;
}

std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
  if (Instrs.empty())
    return {};

  unsigned AS = getLoadStoreAddressSpace(Instrs[0]);
  unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
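  // ASPtrBits is the index width of the address space; it is the bit width
  // used below for the zero offset of each new chain's leader.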

#ifndef NDEBUG
  // Check that Instrs is in BB order and all have the same addr space.
  for (size_t I = 1; I < Instrs.size(); ++I) {
    assert(Instrs[I - 1]->comesBefore(Instrs[I]));
    assert(getLoadStoreAddressSpace(Instrs[I]) == AS);
  }
#endif

  // Machinery to build an MRU-hashtable of Chains.
  //
  // (Ideally this could be done with MapVector, but as currently implemented,
  // moving an element to the front of a MapVector is O(n).)
  struct InstrListElem : ilist_node<InstrListElem>,
                         std::pair<Instruction *, Chain> {
    explicit InstrListElem(Instruction *I)
        : std::pair<Instruction *, Chain>(I, {}) {}
  };
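  // Each InstrListElem pairs a chain's leader instruction (first) with the
  // chain built so far (second); inheriting from ilist_node lets the element
  // live directly in the intrusive MRU list below.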
  struct InstrListElemDenseMapInfo {
    using PtrInfo = DenseMapInfo<InstrListElem *>;
    using IInfo = DenseMapInfo<Instruction *>;
    static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); }
    static InstrListElem *getTombstoneKey() {
      return PtrInfo::getTombstoneKey();
    }
    static unsigned getHashValue(const InstrListElem *E) {
      return IInfo::getHashValue(E->first);
    }
    static bool isEqual(const InstrListElem *A, const InstrListElem *B) {
      if (A == getEmptyKey() || B == getEmptyKey())
        return A == getEmptyKey() && B == getEmptyKey();
      if (A == getTombstoneKey() || B == getTombstoneKey())
        return A == getTombstoneKey() && B == getTombstoneKey();
      return IInfo::isEqual(A->first, B->first);
    }
  };
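  // The DenseMapInfo above hashes and compares elements by their leader
  // instruction, so a chain is identified by its first instruction.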
  SpecificBumpPtrAllocator<InstrListElem> Allocator;
  simple_ilist<InstrListElem> MRU;
  DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains;
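  // Allocator owns the chain elements, MRU keeps them in most-recently-matched
  // order, and Chains records the set of distinct chains (its size is used to
  // reserve the result vector below).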

  // Compare each instruction in `Instrs` to the leader of each of the N most
  // recently-used chains. This limits the O(n^2) behavior of this pass while
  // also allowing us to build arbitrarily long chains.
  for (Instruction *I : Instrs) {
    constexpr int MaxChainsToTry = 64;

    bool MatchFound = false;
    auto ChainIter = MRU.begin();
    for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end();
         ++J, ++ChainIter) {
      if (std::optional<APInt> Offset = getConstantOffset(
              getLoadStorePointerOperand(ChainIter->first),
              getLoadStorePointerOperand(I),
              /*ContextInst=*/
              (ChainIter->first->comesBefore(I) ? I : ChainIter->first))) {
        // `Offset` might not have the expected number of bits, if e.g. AS has a
        // different number of bits than opaque pointers.
        ChainIter->second.emplace_back(I, Offset.value());
        // Move ChainIter to the front of the MRU list.
        MRU.remove(*ChainIter);
        MRU.push_front(*ChainIter);
        MatchFound = true;
        break;
      }
    }

    if (!MatchFound) {
      APInt ZeroOffset(ASPtrBits, 0);
      InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I);
      E->second.emplace_back(I, ZeroOffset);
      MRU.push_front(*E);
      Chains.insert(E);
    }
  }

  std::vector<Chain> Ret;
  Ret.reserve(Chains.size());
  // Iterate over MRU rather than Chains so the order is deterministic.
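  // Keep only chains with more than one instruction; a singleton chain has
  // nothing to vectorize with.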
  for (auto &E : MRU)
    if (E.second.size() > 1)
      Ret.emplace_back(std::move(E.second));
  return Ret;
}

std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
                                                   Instruction *ContextInst,
                                                   unsigned Depth) {
  LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA
                    << ", PtrB=" << *PtrB << ", ContextInst= " << *ContextInst
                    << ", Depth=" << Depth << "\n");
  // We'll ultimately return a value of this bit width, even if computations
  // happen in a different width.
  unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
  APInt OffsetA(OrigBitWidth, 0);
  APInt OffsetB(OrigBitWidth, 0);
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
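  // If the stripped pointers have different store widths (e.g. they live in
  // address spaces with different pointer sizes), their offsets are not
  // directly comparable, so give up below.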
  unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
  if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
    return std::nullopt;

  // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
  // should properly handle a possible overflow and the value should fit into
  // the smallest data type used in the cast/gep chain.
  assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
         OffsetB.getSignificantBits() <= NewPtrBitWidth);

  OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
  OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
  if (PtrA == PtrB)
    return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth);
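  // For example (illustrative values only): if PtrA strips to base P with
  // OffsetA = 4 and PtrB strips to the same P with OffsetB = 12, the two
  // pointers are 12 - 4 = 8 bytes apart.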

  // Try to compute B - A.
  const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
  if (DistScev != SE.getCouldNotCompute()) {
    LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");
    ConstantRange DistRange = SE.getSignedRange(DistScev);
    if (DistRange.isSingleElement()) {
      // Handle index width (the width of Dist) != pointer width (the width of
      // the Offset*s at this point).
      APInt Dist = DistRange.getSingleElement()->sextOrTrunc(NewPtrBitWidth);
      return (OffsetB - OffsetA + Dist).sextOrTrunc(OrigBitWidth);
    }
  }
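  // SCEV could not prove a constant distance; fall back to the recursive
  // analysis of complex (GEP-based) address expressions.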
  if (std::optional<APInt> Diff =
          getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth))
    return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth()))
        .sextOrTrunc(OrigBitWidth);
  return std::nullopt;
}