
Most callers want a constant index. Instead of making every caller create a ConstantInt, we can do it in IRBuilder. This is similar to createInsertElement/createExtractElement.
1419 lines
59 KiB
C++
1419 lines
59 KiB
C++
//===-------- LoopIdiomVectorize.cpp - Loop idiom vectorization -----------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This pass implements a pass that recognizes certain loop idioms and
|
|
// transforms them into more optimized versions of the same loop. In cases
|
|
// where this happens, it can be a significant performance win.
|
|
//
|
|
// We currently support two loops:
|
|
//
|
|
// 1. A loop that finds the first mismatched byte in an array and returns the
|
|
// index, i.e. something like:
|
|
//
|
|
// while (++i != n) {
|
|
// if (a[i] != b[i])
|
|
// break;
|
|
// }
|
|
//
|
|
// In this example we can actually vectorize the loop despite the early exit,
|
|
// although the loop vectorizer does not support it. It requires some extra
|
|
// checks to deal with the possibility of faulting loads when crossing page
|
|
// boundaries. However, even with these checks it is still profitable to do the
|
|
// transformation.
|
|
//
|
|
// TODO List:
|
|
//
|
|
// * Add support for the inverse case where we scan for a matching element.
|
|
// * Permit 64-bit induction variable types.
|
|
// * Recognize loops that increment the IV *after* comparing bytes.
|
|
// * Allow 32-bit sign-extends of the IV used by the GEP.
|
|
//
|
|
// 2. A loop that finds the first matching character in an array among a set of
|
|
// possible matches, e.g.:
|
|
//
|
|
// for (; first != last; ++first)
|
|
// for (s_it = s_first; s_it != s_last; ++s_it)
|
|
// if (*first == *s_it)
|
|
// return first;
|
|
// return last;
|
|
//
|
|
// This corresponds to std::find_first_of (for arrays of bytes) from the C++
|
|
// standard library. This function can be implemented efficiently for targets
|
|
// that support @llvm.experimental.vector.match. For example, on AArch64 targets
|
|
// that implement SVE2, this lower to a MATCH instruction, which enables us to
|
|
// perform up to 16x16=256 comparisons in one go. This can lead to very
|
|
// significant speedups.
|
|
//
|
|
// TODO:
|
|
//
|
|
// * Add support for `find_first_not_of' loops (i.e. with not-equal comparison).
|
|
// * Make VF a configurable parameter (right now we assume 128-bit vectors).
|
|
// * Potentially adjust the cost model to let the transformation kick-in even if
|
|
// @llvm.experimental.vector.match doesn't have direct support in hardware.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// NOTE: This Pass matches really specific loop patterns because it's only
|
|
// supposed to be a temporary solution until our LoopVectorizer is powerful
|
|
// enough to vectorize them automatically.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
|
|
#include "llvm/Analysis/DomTreeUpdater.h"
|
|
#include "llvm/Analysis/LoopPass.h"
|
|
#include "llvm/Analysis/TargetTransformInfo.h"
|
|
#include "llvm/IR/Dominators.h"
|
|
#include "llvm/IR/IRBuilder.h"
|
|
#include "llvm/IR/Intrinsics.h"
|
|
#include "llvm/IR/MDBuilder.h"
|
|
#include "llvm/IR/PatternMatch.h"
|
|
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
|
|
|
using namespace llvm;
|
|
using namespace PatternMatch;
|
|
|
|
#define DEBUG_TYPE "loop-idiom-vectorize"
|
|
|
|
static cl::opt<bool> DisableAll("disable-loop-idiom-vectorize-all", cl::Hidden,
|
|
cl::init(false),
|
|
cl::desc("Disable Loop Idiom Vectorize Pass."));
|
|
|
|
static cl::opt<LoopIdiomVectorizeStyle>
|
|
LITVecStyle("loop-idiom-vectorize-style", cl::Hidden,
|
|
cl::desc("The vectorization style for loop idiom transform."),
|
|
cl::values(clEnumValN(LoopIdiomVectorizeStyle::Masked, "masked",
|
|
"Use masked vector intrinsics"),
|
|
clEnumValN(LoopIdiomVectorizeStyle::Predicated,
|
|
"predicated", "Use VP intrinsics")),
|
|
cl::init(LoopIdiomVectorizeStyle::Masked));
|
|
|
|
static cl::opt<bool>
|
|
DisableByteCmp("disable-loop-idiom-vectorize-bytecmp", cl::Hidden,
|
|
cl::init(false),
|
|
cl::desc("Proceed with Loop Idiom Vectorize Pass, but do "
|
|
"not convert byte-compare loop(s)."));
|
|
|
|
static cl::opt<unsigned>
|
|
ByteCmpVF("loop-idiom-vectorize-bytecmp-vf", cl::Hidden,
|
|
cl::desc("The vectorization factor for byte-compare patterns."),
|
|
cl::init(16));
|
|
|
|
static cl::opt<bool>
|
|
DisableFindFirstByte("disable-loop-idiom-vectorize-find-first-byte",
|
|
cl::Hidden, cl::init(false),
|
|
cl::desc("Do not convert find-first-byte loop(s)."));
|
|
|
|
static cl::opt<bool>
|
|
VerifyLoops("loop-idiom-vectorize-verify", cl::Hidden, cl::init(false),
|
|
cl::desc("Verify loops generated Loop Idiom Vectorize Pass."));
|
|
|
|
namespace {
|
|
class LoopIdiomVectorize {
|
|
LoopIdiomVectorizeStyle VectorizeStyle;
|
|
unsigned ByteCompareVF;
|
|
Loop *CurLoop = nullptr;
|
|
DominatorTree *DT;
|
|
LoopInfo *LI;
|
|
const TargetTransformInfo *TTI;
|
|
const DataLayout *DL;
|
|
|
|
// Blocks that will be used for inserting vectorized code.
|
|
BasicBlock *EndBlock = nullptr;
|
|
BasicBlock *VectorLoopPreheaderBlock = nullptr;
|
|
BasicBlock *VectorLoopStartBlock = nullptr;
|
|
BasicBlock *VectorLoopMismatchBlock = nullptr;
|
|
BasicBlock *VectorLoopIncBlock = nullptr;
|
|
|
|
public:
|
|
LoopIdiomVectorize(LoopIdiomVectorizeStyle S, unsigned VF, DominatorTree *DT,
|
|
LoopInfo *LI, const TargetTransformInfo *TTI,
|
|
const DataLayout *DL)
|
|
: VectorizeStyle(S), ByteCompareVF(VF), DT(DT), LI(LI), TTI(TTI), DL(DL) {
|
|
}
|
|
|
|
bool run(Loop *L);
|
|
|
|
private:
|
|
/// \name Countable Loop Idiom Handling
|
|
/// @{
|
|
|
|
bool runOnCountableLoop();
|
|
bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
|
|
SmallVectorImpl<BasicBlock *> &ExitBlocks);
|
|
|
|
bool recognizeByteCompare();
|
|
|
|
Value *expandFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU,
|
|
GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
|
|
Instruction *Index, Value *Start, Value *MaxLen);
|
|
|
|
Value *createMaskedFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU,
|
|
GetElementPtrInst *GEPA,
|
|
GetElementPtrInst *GEPB, Value *ExtStart,
|
|
Value *ExtEnd);
|
|
Value *createPredicatedFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU,
|
|
GetElementPtrInst *GEPA,
|
|
GetElementPtrInst *GEPB, Value *ExtStart,
|
|
Value *ExtEnd);
|
|
|
|
void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
|
|
PHINode *IndPhi, Value *MaxLen, Instruction *Index,
|
|
Value *Start, bool IncIdx, BasicBlock *FoundBB,
|
|
BasicBlock *EndBB);
|
|
|
|
bool recognizeFindFirstByte();
|
|
|
|
Value *expandFindFirstByte(IRBuilder<> &Builder, DomTreeUpdater &DTU,
|
|
unsigned VF, Type *CharTy, BasicBlock *ExitSucc,
|
|
BasicBlock *ExitFail, Value *SearchStart,
|
|
Value *SearchEnd, Value *NeedleStart,
|
|
Value *NeedleEnd);
|
|
|
|
void transformFindFirstByte(PHINode *IndPhi, unsigned VF, Type *CharTy,
|
|
BasicBlock *ExitSucc, BasicBlock *ExitFail,
|
|
Value *SearchStart, Value *SearchEnd,
|
|
Value *NeedleStart, Value *NeedleEnd);
|
|
/// @}
|
|
};
|
|
} // anonymous namespace
|
|
|
|
PreservedAnalyses LoopIdiomVectorizePass::run(Loop &L, LoopAnalysisManager &AM,
|
|
LoopStandardAnalysisResults &AR,
|
|
LPMUpdater &) {
|
|
if (DisableAll)
|
|
return PreservedAnalyses::all();
|
|
|
|
const auto *DL = &L.getHeader()->getDataLayout();
|
|
|
|
LoopIdiomVectorizeStyle VecStyle = VectorizeStyle;
|
|
if (LITVecStyle.getNumOccurrences())
|
|
VecStyle = LITVecStyle;
|
|
|
|
unsigned BCVF = ByteCompareVF;
|
|
if (ByteCmpVF.getNumOccurrences())
|
|
BCVF = ByteCmpVF;
|
|
|
|
LoopIdiomVectorize LIV(VecStyle, BCVF, &AR.DT, &AR.LI, &AR.TTI, DL);
|
|
if (!LIV.run(&L))
|
|
return PreservedAnalyses::all();
|
|
|
|
return PreservedAnalyses::none();
|
|
}
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// Implementation of LoopIdiomVectorize
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
bool LoopIdiomVectorize::run(Loop *L) {
|
|
CurLoop = L;
|
|
|
|
Function &F = *L->getHeader()->getParent();
|
|
if (DisableAll || F.hasOptSize())
|
|
return false;
|
|
|
|
if (F.hasFnAttribute(Attribute::NoImplicitFloat)) {
|
|
LLVM_DEBUG(dbgs() << DEBUG_TYPE << " is disabled on " << F.getName()
|
|
<< " due to its NoImplicitFloat attribute");
|
|
return false;
|
|
}
|
|
|
|
// If the loop could not be converted to canonical form, it must have an
|
|
// indirectbr in it, just give up.
|
|
if (!L->getLoopPreheader())
|
|
return false;
|
|
|
|
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" << F.getName() << "] Loop %"
|
|
<< CurLoop->getHeader()->getName() << "\n");
|
|
|
|
if (recognizeByteCompare())
|
|
return true;
|
|
|
|
if (recognizeFindFirstByte())
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
bool LoopIdiomVectorize::recognizeByteCompare() {
|
|
// Currently the transformation only works on scalable vector types, although
|
|
// there is no fundamental reason why it cannot be made to work for fixed
|
|
// width too.
|
|
|
|
// We also need to know the minimum page size for the target in order to
|
|
// generate runtime memory checks to ensure the vector version won't fault.
|
|
if (!TTI->supportsScalableVectors() || !TTI->getMinPageSize().has_value() ||
|
|
DisableByteCmp)
|
|
return false;
|
|
|
|
BasicBlock *Header = CurLoop->getHeader();
|
|
|
|
// In LoopIdiomVectorize::run we have already checked that the loop
|
|
// has a preheader so we can assume it's in a canonical form.
|
|
if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 2)
|
|
return false;
|
|
|
|
PHINode *PN = dyn_cast<PHINode>(&Header->front());
|
|
if (!PN || PN->getNumIncomingValues() != 2)
|
|
return false;
|
|
|
|
auto LoopBlocks = CurLoop->getBlocks();
|
|
// The first block in the loop should contain only 4 instructions, e.g.
|
|
//
|
|
// while.cond:
|
|
// %res.phi = phi i32 [ %start, %ph ], [ %inc, %while.body ]
|
|
// %inc = add i32 %res.phi, 1
|
|
// %cmp.not = icmp eq i32 %inc, %n
|
|
// br i1 %cmp.not, label %while.end, label %while.body
|
|
//
|
|
if (LoopBlocks[0]->sizeWithoutDebug() > 4)
|
|
return false;
|
|
|
|
// The second block should contain 7 instructions, e.g.
|
|
//
|
|
// while.body:
|
|
// %idx = zext i32 %inc to i64
|
|
// %idx.a = getelementptr inbounds i8, ptr %a, i64 %idx
|
|
// %load.a = load i8, ptr %idx.a
|
|
// %idx.b = getelementptr inbounds i8, ptr %b, i64 %idx
|
|
// %load.b = load i8, ptr %idx.b
|
|
// %cmp.not.ld = icmp eq i8 %load.a, %load.b
|
|
// br i1 %cmp.not.ld, label %while.cond, label %while.end
|
|
//
|
|
if (LoopBlocks[1]->sizeWithoutDebug() > 7)
|
|
return false;
|
|
|
|
// The incoming value to the PHI node from the loop should be an add of 1.
|
|
Value *StartIdx = nullptr;
|
|
Instruction *Index = nullptr;
|
|
if (!CurLoop->contains(PN->getIncomingBlock(0))) {
|
|
StartIdx = PN->getIncomingValue(0);
|
|
Index = dyn_cast<Instruction>(PN->getIncomingValue(1));
|
|
} else {
|
|
StartIdx = PN->getIncomingValue(1);
|
|
Index = dyn_cast<Instruction>(PN->getIncomingValue(0));
|
|
}
|
|
|
|
// Limit to 32-bit types for now
|
|
if (!Index || !Index->getType()->isIntegerTy(32) ||
|
|
!match(Index, m_c_Add(m_Specific(PN), m_One())))
|
|
return false;
|
|
|
|
// If we match the pattern, PN and Index will be replaced with the result of
|
|
// the cttz.elts intrinsic. If any other instructions are used outside of
|
|
// the loop, we cannot replace it.
|
|
for (BasicBlock *BB : LoopBlocks)
|
|
for (Instruction &I : *BB)
|
|
if (&I != PN && &I != Index)
|
|
for (User *U : I.users())
|
|
if (!CurLoop->contains(cast<Instruction>(U)))
|
|
return false;
|
|
|
|
// Match the branch instruction for the header
|
|
Value *MaxLen;
|
|
BasicBlock *EndBB, *WhileBB;
|
|
if (!match(Header->getTerminator(),
|
|
m_Br(m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(Index),
|
|
m_Value(MaxLen)),
|
|
m_BasicBlock(EndBB), m_BasicBlock(WhileBB))) ||
|
|
!CurLoop->contains(WhileBB))
|
|
return false;
|
|
|
|
// WhileBB should contain the pattern of load & compare instructions. Match
|
|
// the pattern and find the GEP instructions used by the loads.
|
|
BasicBlock *FoundBB;
|
|
BasicBlock *TrueBB;
|
|
Value *LoadA, *LoadB;
|
|
if (!match(WhileBB->getTerminator(),
|
|
m_Br(m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(LoadA),
|
|
m_Value(LoadB)),
|
|
m_BasicBlock(TrueBB), m_BasicBlock(FoundBB))) ||
|
|
!CurLoop->contains(TrueBB))
|
|
return false;
|
|
|
|
Value *A, *B;
|
|
if (!match(LoadA, m_Load(m_Value(A))) || !match(LoadB, m_Load(m_Value(B))))
|
|
return false;
|
|
|
|
LoadInst *LoadAI = cast<LoadInst>(LoadA);
|
|
LoadInst *LoadBI = cast<LoadInst>(LoadB);
|
|
if (!LoadAI->isSimple() || !LoadBI->isSimple())
|
|
return false;
|
|
|
|
GetElementPtrInst *GEPA = dyn_cast<GetElementPtrInst>(A);
|
|
GetElementPtrInst *GEPB = dyn_cast<GetElementPtrInst>(B);
|
|
|
|
if (!GEPA || !GEPB)
|
|
return false;
|
|
|
|
Value *PtrA = GEPA->getPointerOperand();
|
|
Value *PtrB = GEPB->getPointerOperand();
|
|
|
|
// Check we are loading i8 values from two loop invariant pointers
|
|
if (!CurLoop->isLoopInvariant(PtrA) || !CurLoop->isLoopInvariant(PtrB) ||
|
|
!GEPA->getResultElementType()->isIntegerTy(8) ||
|
|
!GEPB->getResultElementType()->isIntegerTy(8) ||
|
|
!LoadAI->getType()->isIntegerTy(8) ||
|
|
!LoadBI->getType()->isIntegerTy(8) || PtrA == PtrB)
|
|
return false;
|
|
|
|
// Check that the index to the GEPs is the index we found earlier
|
|
if (GEPA->getNumIndices() > 1 || GEPB->getNumIndices() > 1)
|
|
return false;
|
|
|
|
Value *IdxA = GEPA->getOperand(GEPA->getNumIndices());
|
|
Value *IdxB = GEPB->getOperand(GEPB->getNumIndices());
|
|
if (IdxA != IdxB || !match(IdxA, m_ZExt(m_Specific(Index))))
|
|
return false;
|
|
|
|
// We only ever expect the pre-incremented index value to be used inside the
|
|
// loop.
|
|
if (!PN->hasOneUse())
|
|
return false;
|
|
|
|
// Ensure that when the Found and End blocks are identical the PHIs have the
|
|
// supported format. We don't currently allow cases like this:
|
|
// while.cond:
|
|
// ...
|
|
// br i1 %cmp.not, label %while.end, label %while.body
|
|
//
|
|
// while.body:
|
|
// ...
|
|
// br i1 %cmp.not2, label %while.cond, label %while.end
|
|
//
|
|
// while.end:
|
|
// %final_ptr = phi ptr [ %c, %while.body ], [ %d, %while.cond ]
|
|
//
|
|
// Where the incoming values for %final_ptr are unique and from each of the
|
|
// loop blocks, but not actually defined in the loop. This requires extra
|
|
// work setting up the byte.compare block, i.e. by introducing a select to
|
|
// choose the correct value.
|
|
// TODO: We could add support for this in future.
|
|
if (FoundBB == EndBB) {
|
|
for (PHINode &EndPN : EndBB->phis()) {
|
|
Value *WhileCondVal = EndPN.getIncomingValueForBlock(Header);
|
|
Value *WhileBodyVal = EndPN.getIncomingValueForBlock(WhileBB);
|
|
|
|
// The value of the index when leaving the while.cond block is always the
|
|
// same as the end value (MaxLen) so we permit either. The value when
|
|
// leaving the while.body block should only be the index. Otherwise for
|
|
// any other values we only allow ones that are same for both blocks.
|
|
if (WhileCondVal != WhileBodyVal &&
|
|
((WhileCondVal != Index && WhileCondVal != MaxLen) ||
|
|
(WhileBodyVal != Index)))
|
|
return false;
|
|
}
|
|
}
|
|
|
|
LLVM_DEBUG(dbgs() << "FOUND IDIOM IN LOOP: \n"
|
|
<< *(EndBB->getParent()) << "\n\n");
|
|
|
|
// The index is incremented before the GEP/Load pair so we need to
|
|
// add 1 to the start value.
|
|
transformByteCompare(GEPA, GEPB, PN, MaxLen, Index, StartIdx, /*IncIdx=*/true,
|
|
FoundBB, EndBB);
|
|
return true;
|
|
}
|
|
|
|
Value *LoopIdiomVectorize::createMaskedFindMismatch(
|
|
IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA,
|
|
GetElementPtrInst *GEPB, Value *ExtStart, Value *ExtEnd) {
|
|
Type *I64Type = Builder.getInt64Ty();
|
|
Type *ResType = Builder.getInt32Ty();
|
|
Type *LoadType = Builder.getInt8Ty();
|
|
Value *PtrA = GEPA->getPointerOperand();
|
|
Value *PtrB = GEPB->getPointerOperand();
|
|
|
|
ScalableVectorType *PredVTy =
|
|
ScalableVectorType::get(Builder.getInt1Ty(), ByteCompareVF);
|
|
|
|
Value *InitialPred = Builder.CreateIntrinsic(
|
|
Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd});
|
|
|
|
Value *VecLen = Builder.CreateIntrinsic(Intrinsic::vscale, {I64Type}, {});
|
|
VecLen =
|
|
Builder.CreateMul(VecLen, ConstantInt::get(I64Type, ByteCompareVF), "",
|
|
/*HasNUW=*/true, /*HasNSW=*/true);
|
|
|
|
Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(),
|
|
Builder.getInt1(false));
|
|
|
|
BranchInst *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock);
|
|
Builder.Insert(JumpToVectorLoop);
|
|
|
|
DTU.applyUpdates({{DominatorTree::Insert, VectorLoopPreheaderBlock,
|
|
VectorLoopStartBlock}});
|
|
|
|
// Set up the first vector loop block by creating the PHIs, doing the vector
|
|
// loads and comparing the vectors.
|
|
Builder.SetInsertPoint(VectorLoopStartBlock);
|
|
PHINode *LoopPred = Builder.CreatePHI(PredVTy, 2, "mismatch_vec_loop_pred");
|
|
LoopPred->addIncoming(InitialPred, VectorLoopPreheaderBlock);
|
|
PHINode *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vec_index");
|
|
VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock);
|
|
Type *VectorLoadType =
|
|
ScalableVectorType::get(Builder.getInt8Ty(), ByteCompareVF);
|
|
Value *Passthru = ConstantInt::getNullValue(VectorLoadType);
|
|
|
|
Value *VectorLhsGep =
|
|
Builder.CreateGEP(LoadType, PtrA, VectorIndexPhi, "", GEPA->isInBounds());
|
|
Value *VectorLhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorLhsGep,
|
|
Align(1), LoopPred, Passthru);
|
|
|
|
Value *VectorRhsGep =
|
|
Builder.CreateGEP(LoadType, PtrB, VectorIndexPhi, "", GEPB->isInBounds());
|
|
Value *VectorRhsLoad = Builder.CreateMaskedLoad(VectorLoadType, VectorRhsGep,
|
|
Align(1), LoopPred, Passthru);
|
|
|
|
Value *VectorMatchCmp = Builder.CreateICmpNE(VectorLhsLoad, VectorRhsLoad);
|
|
VectorMatchCmp = Builder.CreateSelect(LoopPred, VectorMatchCmp, PFalse);
|
|
Value *VectorMatchHasActiveLanes = Builder.CreateOrReduce(VectorMatchCmp);
|
|
BranchInst *VectorEarlyExit = BranchInst::Create(
|
|
VectorLoopMismatchBlock, VectorLoopIncBlock, VectorMatchHasActiveLanes);
|
|
Builder.Insert(VectorEarlyExit);
|
|
|
|
DTU.applyUpdates(
|
|
{{DominatorTree::Insert, VectorLoopStartBlock, VectorLoopMismatchBlock},
|
|
{DominatorTree::Insert, VectorLoopStartBlock, VectorLoopIncBlock}});
|
|
|
|
// Increment the index counter and calculate the predicate for the next
|
|
// iteration of the loop. We branch back to the start of the loop if there
|
|
// is at least one active lane.
|
|
Builder.SetInsertPoint(VectorLoopIncBlock);
|
|
Value *NewVectorIndexPhi =
|
|
Builder.CreateAdd(VectorIndexPhi, VecLen, "",
|
|
/*HasNUW=*/true, /*HasNSW=*/true);
|
|
VectorIndexPhi->addIncoming(NewVectorIndexPhi, VectorLoopIncBlock);
|
|
Value *NewPred =
|
|
Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
|
|
{PredVTy, I64Type}, {NewVectorIndexPhi, ExtEnd});
|
|
LoopPred->addIncoming(NewPred, VectorLoopIncBlock);
|
|
|
|
Value *PredHasActiveLanes =
|
|
Builder.CreateExtractElement(NewPred, uint64_t(0));
|
|
BranchInst *VectorLoopBranchBack =
|
|
BranchInst::Create(VectorLoopStartBlock, EndBlock, PredHasActiveLanes);
|
|
Builder.Insert(VectorLoopBranchBack);
|
|
|
|
DTU.applyUpdates(
|
|
{{DominatorTree::Insert, VectorLoopIncBlock, VectorLoopStartBlock},
|
|
{DominatorTree::Insert, VectorLoopIncBlock, EndBlock}});
|
|
|
|
// If we found a mismatch then we need to calculate which lane in the vector
|
|
// had a mismatch and add that on to the current loop index.
|
|
Builder.SetInsertPoint(VectorLoopMismatchBlock);
|
|
PHINode *FoundPred = Builder.CreatePHI(PredVTy, 1, "mismatch_vec_found_pred");
|
|
FoundPred->addIncoming(VectorMatchCmp, VectorLoopStartBlock);
|
|
PHINode *LastLoopPred =
|
|
Builder.CreatePHI(PredVTy, 1, "mismatch_vec_last_loop_pred");
|
|
LastLoopPred->addIncoming(LoopPred, VectorLoopStartBlock);
|
|
PHINode *VectorFoundIndex =
|
|
Builder.CreatePHI(I64Type, 1, "mismatch_vec_found_index");
|
|
VectorFoundIndex->addIncoming(VectorIndexPhi, VectorLoopStartBlock);
|
|
|
|
Value *PredMatchCmp = Builder.CreateAnd(LastLoopPred, FoundPred);
|
|
Value *Ctz = Builder.CreateCountTrailingZeroElems(ResType, PredMatchCmp);
|
|
Ctz = Builder.CreateZExt(Ctz, I64Type);
|
|
Value *VectorLoopRes64 = Builder.CreateAdd(VectorFoundIndex, Ctz, "",
|
|
/*HasNUW=*/true, /*HasNSW=*/true);
|
|
return Builder.CreateTrunc(VectorLoopRes64, ResType);
|
|
}
|
|
|
|
Value *LoopIdiomVectorize::createPredicatedFindMismatch(
|
|
IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA,
|
|
GetElementPtrInst *GEPB, Value *ExtStart, Value *ExtEnd) {
|
|
Type *I64Type = Builder.getInt64Ty();
|
|
Type *I32Type = Builder.getInt32Ty();
|
|
Type *ResType = I32Type;
|
|
Type *LoadType = Builder.getInt8Ty();
|
|
Value *PtrA = GEPA->getPointerOperand();
|
|
Value *PtrB = GEPB->getPointerOperand();
|
|
|
|
auto *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock);
|
|
Builder.Insert(JumpToVectorLoop);
|
|
|
|
DTU.applyUpdates({{DominatorTree::Insert, VectorLoopPreheaderBlock,
|
|
VectorLoopStartBlock}});
|
|
|
|
// Set up the first Vector loop block by creating the PHIs, doing the vector
|
|
// loads and comparing the vectors.
|
|
Builder.SetInsertPoint(VectorLoopStartBlock);
|
|
auto *VectorIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vector_index");
|
|
VectorIndexPhi->addIncoming(ExtStart, VectorLoopPreheaderBlock);
|
|
|
|
// Calculate AVL by subtracting the vector loop index from the trip count
|
|
Value *AVL = Builder.CreateSub(ExtEnd, VectorIndexPhi, "avl", /*HasNUW=*/true,
|
|
/*HasNSW=*/true);
|
|
|
|
auto *VectorLoadType = ScalableVectorType::get(LoadType, ByteCompareVF);
|
|
auto *VF = ConstantInt::get(I32Type, ByteCompareVF);
|
|
|
|
Value *VL = Builder.CreateIntrinsic(Intrinsic::experimental_get_vector_length,
|
|
{I64Type}, {AVL, VF, Builder.getTrue()});
|
|
Value *GepOffset = VectorIndexPhi;
|
|
|
|
Value *VectorLhsGep =
|
|
Builder.CreateGEP(LoadType, PtrA, GepOffset, "", GEPA->isInBounds());
|
|
VectorType *TrueMaskTy =
|
|
VectorType::get(Builder.getInt1Ty(), VectorLoadType->getElementCount());
|
|
Value *AllTrueMask = Constant::getAllOnesValue(TrueMaskTy);
|
|
Value *VectorLhsLoad = Builder.CreateIntrinsic(
|
|
Intrinsic::vp_load, {VectorLoadType, VectorLhsGep->getType()},
|
|
{VectorLhsGep, AllTrueMask, VL}, nullptr, "lhs.load");
|
|
|
|
Value *VectorRhsGep =
|
|
Builder.CreateGEP(LoadType, PtrB, GepOffset, "", GEPB->isInBounds());
|
|
Value *VectorRhsLoad = Builder.CreateIntrinsic(
|
|
Intrinsic::vp_load, {VectorLoadType, VectorLhsGep->getType()},
|
|
{VectorRhsGep, AllTrueMask, VL}, nullptr, "rhs.load");
|
|
|
|
StringRef PredicateStr = CmpInst::getPredicateName(CmpInst::ICMP_NE);
|
|
auto *PredicateMDS = MDString::get(VectorLhsLoad->getContext(), PredicateStr);
|
|
Value *Pred = MetadataAsValue::get(VectorLhsLoad->getContext(), PredicateMDS);
|
|
Value *VectorMatchCmp = Builder.CreateIntrinsic(
|
|
Intrinsic::vp_icmp, {VectorLhsLoad->getType()},
|
|
{VectorLhsLoad, VectorRhsLoad, Pred, AllTrueMask, VL}, nullptr,
|
|
"mismatch.cmp");
|
|
Value *CTZ = Builder.CreateIntrinsic(
|
|
Intrinsic::vp_cttz_elts, {ResType, VectorMatchCmp->getType()},
|
|
{VectorMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(false), AllTrueMask,
|
|
VL});
|
|
Value *MismatchFound = Builder.CreateICmpNE(CTZ, VL);
|
|
auto *VectorEarlyExit = BranchInst::Create(VectorLoopMismatchBlock,
|
|
VectorLoopIncBlock, MismatchFound);
|
|
Builder.Insert(VectorEarlyExit);
|
|
|
|
DTU.applyUpdates(
|
|
{{DominatorTree::Insert, VectorLoopStartBlock, VectorLoopMismatchBlock},
|
|
{DominatorTree::Insert, VectorLoopStartBlock, VectorLoopIncBlock}});
|
|
|
|
// Increment the index counter and calculate the predicate for the next
|
|
// iteration of the loop. We branch back to the start of the loop if there
|
|
// is at least one active lane.
|
|
Builder.SetInsertPoint(VectorLoopIncBlock);
|
|
Value *VL64 = Builder.CreateZExt(VL, I64Type);
|
|
Value *NewVectorIndexPhi =
|
|
Builder.CreateAdd(VectorIndexPhi, VL64, "",
|
|
/*HasNUW=*/true, /*HasNSW=*/true);
|
|
VectorIndexPhi->addIncoming(NewVectorIndexPhi, VectorLoopIncBlock);
|
|
Value *ExitCond = Builder.CreateICmpNE(NewVectorIndexPhi, ExtEnd);
|
|
auto *VectorLoopBranchBack =
|
|
BranchInst::Create(VectorLoopStartBlock, EndBlock, ExitCond);
|
|
Builder.Insert(VectorLoopBranchBack);
|
|
|
|
DTU.applyUpdates(
|
|
{{DominatorTree::Insert, VectorLoopIncBlock, VectorLoopStartBlock},
|
|
{DominatorTree::Insert, VectorLoopIncBlock, EndBlock}});
|
|
|
|
// If we found a mismatch then we need to calculate which lane in the vector
|
|
// had a mismatch and add that on to the current loop index.
|
|
Builder.SetInsertPoint(VectorLoopMismatchBlock);
|
|
|
|
// Add LCSSA phis for CTZ and VectorIndexPhi.
|
|
auto *CTZLCSSAPhi = Builder.CreatePHI(CTZ->getType(), 1, "ctz");
|
|
CTZLCSSAPhi->addIncoming(CTZ, VectorLoopStartBlock);
|
|
auto *VectorIndexLCSSAPhi =
|
|
Builder.CreatePHI(VectorIndexPhi->getType(), 1, "mismatch_vector_index");
|
|
VectorIndexLCSSAPhi->addIncoming(VectorIndexPhi, VectorLoopStartBlock);
|
|
|
|
Value *CTZI64 = Builder.CreateZExt(CTZLCSSAPhi, I64Type);
|
|
Value *VectorLoopRes64 = Builder.CreateAdd(VectorIndexLCSSAPhi, CTZI64, "",
|
|
/*HasNUW=*/true, /*HasNSW=*/true);
|
|
return Builder.CreateTrunc(VectorLoopRes64, ResType);
|
|
}
|
|
|
|
Value *LoopIdiomVectorize::expandFindMismatch(
|
|
IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA,
|
|
GetElementPtrInst *GEPB, Instruction *Index, Value *Start, Value *MaxLen) {
|
|
Value *PtrA = GEPA->getPointerOperand();
|
|
Value *PtrB = GEPB->getPointerOperand();
|
|
|
|
// Get the arguments and types for the intrinsic.
|
|
BasicBlock *Preheader = CurLoop->getLoopPreheader();
|
|
BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
|
|
LLVMContext &Ctx = PHBranch->getContext();
|
|
Type *LoadType = Type::getInt8Ty(Ctx);
|
|
Type *ResType = Builder.getInt32Ty();
|
|
|
|
// Split block in the original loop preheader.
|
|
EndBlock = SplitBlock(Preheader, PHBranch, DT, LI, nullptr, "mismatch_end");
|
|
|
|
// Create the blocks that we're going to need:
|
|
// 1. A block for checking the zero-extended length exceeds 0
|
|
// 2. A block to check that the start and end addresses of a given array
|
|
// lie on the same page.
|
|
// 3. The vector loop preheader.
|
|
// 4. The first vector loop block.
|
|
// 5. The vector loop increment block.
|
|
// 6. A block we can jump to from the vector loop when a mismatch is found.
|
|
// 7. The first block of the scalar loop itself, containing PHIs , loads
|
|
// and cmp.
|
|
// 8. A scalar loop increment block to increment the PHIs and go back
|
|
// around the loop.
|
|
|
|
BasicBlock *MinItCheckBlock = BasicBlock::Create(
|
|
Ctx, "mismatch_min_it_check", EndBlock->getParent(), EndBlock);
|
|
|
|
// Update the terminator added by SplitBlock to branch to the first block
|
|
Preheader->getTerminator()->setSuccessor(0, MinItCheckBlock);
|
|
|
|
BasicBlock *MemCheckBlock = BasicBlock::Create(
|
|
Ctx, "mismatch_mem_check", EndBlock->getParent(), EndBlock);
|
|
|
|
VectorLoopPreheaderBlock = BasicBlock::Create(
|
|
Ctx, "mismatch_vec_loop_preheader", EndBlock->getParent(), EndBlock);
|
|
|
|
VectorLoopStartBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop",
|
|
EndBlock->getParent(), EndBlock);
|
|
|
|
VectorLoopIncBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop_inc",
|
|
EndBlock->getParent(), EndBlock);
|
|
|
|
VectorLoopMismatchBlock = BasicBlock::Create(Ctx, "mismatch_vec_loop_found",
|
|
EndBlock->getParent(), EndBlock);
|
|
|
|
BasicBlock *LoopPreHeaderBlock = BasicBlock::Create(
|
|
Ctx, "mismatch_loop_pre", EndBlock->getParent(), EndBlock);
|
|
|
|
BasicBlock *LoopStartBlock =
|
|
BasicBlock::Create(Ctx, "mismatch_loop", EndBlock->getParent(), EndBlock);
|
|
|
|
BasicBlock *LoopIncBlock = BasicBlock::Create(
|
|
Ctx, "mismatch_loop_inc", EndBlock->getParent(), EndBlock);
|
|
|
|
DTU.applyUpdates({{DominatorTree::Insert, Preheader, MinItCheckBlock},
|
|
{DominatorTree::Delete, Preheader, EndBlock}});
|
|
|
|
// Update LoopInfo with the new vector & scalar loops.
|
|
auto VectorLoop = LI->AllocateLoop();
|
|
auto ScalarLoop = LI->AllocateLoop();
|
|
|
|
if (CurLoop->getParentLoop()) {
|
|
CurLoop->getParentLoop()->addBasicBlockToLoop(MinItCheckBlock, *LI);
|
|
CurLoop->getParentLoop()->addBasicBlockToLoop(MemCheckBlock, *LI);
|
|
CurLoop->getParentLoop()->addBasicBlockToLoop(VectorLoopPreheaderBlock,
|
|
*LI);
|
|
CurLoop->getParentLoop()->addChildLoop(VectorLoop);
|
|
CurLoop->getParentLoop()->addBasicBlockToLoop(VectorLoopMismatchBlock, *LI);
|
|
CurLoop->getParentLoop()->addBasicBlockToLoop(LoopPreHeaderBlock, *LI);
|
|
CurLoop->getParentLoop()->addChildLoop(ScalarLoop);
|
|
} else {
|
|
LI->addTopLevelLoop(VectorLoop);
|
|
LI->addTopLevelLoop(ScalarLoop);
|
|
}
|
|
|
|
// Add the new basic blocks to their associated loops.
|
|
VectorLoop->addBasicBlockToLoop(VectorLoopStartBlock, *LI);
|
|
VectorLoop->addBasicBlockToLoop(VectorLoopIncBlock, *LI);
|
|
|
|
ScalarLoop->addBasicBlockToLoop(LoopStartBlock, *LI);
|
|
ScalarLoop->addBasicBlockToLoop(LoopIncBlock, *LI);
|
|
|
|
// Set up some types and constants that we intend to reuse.
|
|
Type *I64Type = Builder.getInt64Ty();
|
|
|
|
// Check the zero-extended iteration count > 0
|
|
Builder.SetInsertPoint(MinItCheckBlock);
|
|
Value *ExtStart = Builder.CreateZExt(Start, I64Type);
|
|
Value *ExtEnd = Builder.CreateZExt(MaxLen, I64Type);
|
|
// This check doesn't really cost us very much.
|
|
|
|
Value *LimitCheck = Builder.CreateICmpULE(Start, MaxLen);
|
|
BranchInst *MinItCheckBr =
|
|
BranchInst::Create(MemCheckBlock, LoopPreHeaderBlock, LimitCheck);
|
|
MinItCheckBr->setMetadata(
|
|
LLVMContext::MD_prof,
|
|
MDBuilder(MinItCheckBr->getContext()).createBranchWeights(99, 1));
|
|
Builder.Insert(MinItCheckBr);
|
|
|
|
DTU.applyUpdates(
|
|
{{DominatorTree::Insert, MinItCheckBlock, MemCheckBlock},
|
|
{DominatorTree::Insert, MinItCheckBlock, LoopPreHeaderBlock}});
|
|
|
|
// For each of the arrays, check the start/end addresses are on the same
|
|
// page.
|
|
Builder.SetInsertPoint(MemCheckBlock);
|
|
|
|
// The early exit in the original loop means that when performing vector
|
|
// loads we are potentially reading ahead of the early exit. So we could
|
|
// fault if crossing a page boundary. Therefore, we create runtime memory
|
|
// checks based on the minimum page size as follows:
|
|
// 1. Calculate the addresses of the first memory accesses in the loop,
|
|
// i.e. LhsStart and RhsStart.
|
|
// 2. Get the last accessed addresses in the loop, i.e. LhsEnd and RhsEnd.
|
|
// 3. Determine which pages correspond to all the memory accesses, i.e
|
|
// LhsStartPage, LhsEndPage, RhsStartPage, RhsEndPage.
|
|
// 4. If LhsStartPage == LhsEndPage and RhsStartPage == RhsEndPage, then
|
|
// we know we won't cross any page boundaries in the loop so we can
|
|
// enter the vector loop! Otherwise we fall back on the scalar loop.
|
|
Value *LhsStartGEP = Builder.CreateGEP(LoadType, PtrA, ExtStart);
|
|
Value *RhsStartGEP = Builder.CreateGEP(LoadType, PtrB, ExtStart);
|
|
Value *RhsStart = Builder.CreatePtrToInt(RhsStartGEP, I64Type);
|
|
Value *LhsStart = Builder.CreatePtrToInt(LhsStartGEP, I64Type);
|
|
Value *LhsEndGEP = Builder.CreateGEP(LoadType, PtrA, ExtEnd);
|
|
Value *RhsEndGEP = Builder.CreateGEP(LoadType, PtrB, ExtEnd);
|
|
Value *LhsEnd = Builder.CreatePtrToInt(LhsEndGEP, I64Type);
|
|
Value *RhsEnd = Builder.CreatePtrToInt(RhsEndGEP, I64Type);
|
|
|
|
const uint64_t MinPageSize = TTI->getMinPageSize().value();
|
|
const uint64_t AddrShiftAmt = llvm::Log2_64(MinPageSize);
|
|
Value *LhsStartPage = Builder.CreateLShr(LhsStart, AddrShiftAmt);
|
|
Value *LhsEndPage = Builder.CreateLShr(LhsEnd, AddrShiftAmt);
|
|
Value *RhsStartPage = Builder.CreateLShr(RhsStart, AddrShiftAmt);
|
|
Value *RhsEndPage = Builder.CreateLShr(RhsEnd, AddrShiftAmt);
|
|
Value *LhsPageCmp = Builder.CreateICmpNE(LhsStartPage, LhsEndPage);
|
|
Value *RhsPageCmp = Builder.CreateICmpNE(RhsStartPage, RhsEndPage);
|
|
|
|
Value *CombinedPageCmp = Builder.CreateOr(LhsPageCmp, RhsPageCmp);
|
|
BranchInst *CombinedPageCmpCmpBr = BranchInst::Create(
|
|
LoopPreHeaderBlock, VectorLoopPreheaderBlock, CombinedPageCmp);
|
|
CombinedPageCmpCmpBr->setMetadata(
|
|
LLVMContext::MD_prof, MDBuilder(CombinedPageCmpCmpBr->getContext())
|
|
.createBranchWeights(10, 90));
|
|
Builder.Insert(CombinedPageCmpCmpBr);
|
|
|
|
DTU.applyUpdates(
|
|
{{DominatorTree::Insert, MemCheckBlock, LoopPreHeaderBlock},
|
|
{DominatorTree::Insert, MemCheckBlock, VectorLoopPreheaderBlock}});
|
|
|
|
// Set up the vector loop preheader, i.e. calculate initial loop predicate,
|
|
// zero-extend MaxLen to 64-bits, determine the number of vector elements
|
|
// processed in each iteration, etc.
|
|
Builder.SetInsertPoint(VectorLoopPreheaderBlock);
|
|
|
|
// At this point we know two things must be true:
|
|
// 1. Start <= End
|
|
// 2. ExtMaxLen <= MinPageSize due to the page checks.
|
|
// Therefore, we know that we can use a 64-bit induction variable that
|
|
// starts from 0 -> ExtMaxLen and it will not overflow.
|
|
Value *VectorLoopRes = nullptr;
|
|
switch (VectorizeStyle) {
|
|
case LoopIdiomVectorizeStyle::Masked:
|
|
VectorLoopRes =
|
|
createMaskedFindMismatch(Builder, DTU, GEPA, GEPB, ExtStart, ExtEnd);
|
|
break;
|
|
case LoopIdiomVectorizeStyle::Predicated:
|
|
VectorLoopRes = createPredicatedFindMismatch(Builder, DTU, GEPA, GEPB,
|
|
ExtStart, ExtEnd);
|
|
break;
|
|
}
|
|
|
|
Builder.Insert(BranchInst::Create(EndBlock));
|
|
|
|
DTU.applyUpdates(
|
|
{{DominatorTree::Insert, VectorLoopMismatchBlock, EndBlock}});
|
|
|
|
// Generate code for scalar loop.
|
|
Builder.SetInsertPoint(LoopPreHeaderBlock);
|
|
Builder.Insert(BranchInst::Create(LoopStartBlock));
|
|
|
|
DTU.applyUpdates(
|
|
{{DominatorTree::Insert, LoopPreHeaderBlock, LoopStartBlock}});
|
|
|
|
Builder.SetInsertPoint(LoopStartBlock);
|
|
PHINode *IndexPhi = Builder.CreatePHI(ResType, 2, "mismatch_index");
|
|
IndexPhi->addIncoming(Start, LoopPreHeaderBlock);
|
|
|
|
// Otherwise compare the values
|
|
// Load bytes from each array and compare them.
|
|
Value *GepOffset = Builder.CreateZExt(IndexPhi, I64Type);
|
|
|
|
Value *LhsGep =
|
|
Builder.CreateGEP(LoadType, PtrA, GepOffset, "", GEPA->isInBounds());
|
|
Value *LhsLoad = Builder.CreateLoad(LoadType, LhsGep);
|
|
|
|
Value *RhsGep =
|
|
Builder.CreateGEP(LoadType, PtrB, GepOffset, "", GEPB->isInBounds());
|
|
Value *RhsLoad = Builder.CreateLoad(LoadType, RhsGep);
|
|
|
|
Value *MatchCmp = Builder.CreateICmpEQ(LhsLoad, RhsLoad);
|
|
// If we have a mismatch then exit the loop ...
|
|
BranchInst *MatchCmpBr = BranchInst::Create(LoopIncBlock, EndBlock, MatchCmp);
|
|
Builder.Insert(MatchCmpBr);
|
|
|
|
DTU.applyUpdates({{DominatorTree::Insert, LoopStartBlock, LoopIncBlock},
|
|
{DominatorTree::Insert, LoopStartBlock, EndBlock}});
|
|
|
|
// Have we reached the maximum permitted length for the loop?
|
|
Builder.SetInsertPoint(LoopIncBlock);
|
|
Value *PhiInc = Builder.CreateAdd(IndexPhi, ConstantInt::get(ResType, 1), "",
|
|
/*HasNUW=*/Index->hasNoUnsignedWrap(),
|
|
/*HasNSW=*/Index->hasNoSignedWrap());
|
|
IndexPhi->addIncoming(PhiInc, LoopIncBlock);
|
|
Value *IVCmp = Builder.CreateICmpEQ(PhiInc, MaxLen);
|
|
BranchInst *IVCmpBr = BranchInst::Create(EndBlock, LoopStartBlock, IVCmp);
|
|
Builder.Insert(IVCmpBr);
|
|
|
|
DTU.applyUpdates({{DominatorTree::Insert, LoopIncBlock, EndBlock},
|
|
{DominatorTree::Insert, LoopIncBlock, LoopStartBlock}});
|
|
|
|
// In the end block we need to insert a PHI node to deal with three cases:
|
|
// 1. We didn't find a mismatch in the scalar loop, so we return MaxLen.
|
|
// 2. We exitted the scalar loop early due to a mismatch and need to return
|
|
// the index that we found.
|
|
// 3. We didn't find a mismatch in the vector loop, so we return MaxLen.
|
|
// 4. We exitted the vector loop early due to a mismatch and need to return
|
|
// the index that we found.
|
|
Builder.SetInsertPoint(EndBlock, EndBlock->getFirstInsertionPt());
|
|
PHINode *ResPhi = Builder.CreatePHI(ResType, 4, "mismatch_result");
|
|
ResPhi->addIncoming(MaxLen, LoopIncBlock);
|
|
ResPhi->addIncoming(IndexPhi, LoopStartBlock);
|
|
ResPhi->addIncoming(MaxLen, VectorLoopIncBlock);
|
|
ResPhi->addIncoming(VectorLoopRes, VectorLoopMismatchBlock);
|
|
|
|
Value *FinalRes = Builder.CreateTrunc(ResPhi, ResType);
|
|
|
|
if (VerifyLoops) {
|
|
ScalarLoop->verifyLoop();
|
|
VectorLoop->verifyLoop();
|
|
if (!VectorLoop->isRecursivelyLCSSAForm(*DT, *LI))
|
|
report_fatal_error("Loops must remain in LCSSA form!");
|
|
if (!ScalarLoop->isRecursivelyLCSSAForm(*DT, *LI))
|
|
report_fatal_error("Loops must remain in LCSSA form!");
|
|
}
|
|
|
|
return FinalRes;
|
|
}
|
|
|
|
void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA,
|
|
GetElementPtrInst *GEPB,
|
|
PHINode *IndPhi, Value *MaxLen,
|
|
Instruction *Index, Value *Start,
|
|
bool IncIdx, BasicBlock *FoundBB,
|
|
BasicBlock *EndBB) {
|
|
|
|
// Insert the byte compare code at the end of the preheader block
|
|
BasicBlock *Preheader = CurLoop->getLoopPreheader();
|
|
BasicBlock *Header = CurLoop->getHeader();
|
|
BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
|
|
IRBuilder<> Builder(PHBranch);
|
|
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
|
|
Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc());
|
|
|
|
// Increment the pointer if this was done before the loads in the loop.
|
|
if (IncIdx)
|
|
Start = Builder.CreateAdd(Start, ConstantInt::get(Start->getType(), 1));
|
|
|
|
Value *ByteCmpRes =
|
|
expandFindMismatch(Builder, DTU, GEPA, GEPB, Index, Start, MaxLen);
|
|
|
|
// Replaces uses of index & induction Phi with intrinsic (we already
|
|
// checked that the the first instruction of Header is the Phi above).
|
|
assert(IndPhi->hasOneUse() && "Index phi node has more than one use!");
|
|
Index->replaceAllUsesWith(ByteCmpRes);
|
|
|
|
assert(PHBranch->isUnconditional() &&
|
|
"Expected preheader to terminate with an unconditional branch.");
|
|
|
|
// If no mismatch was found, we can jump to the end block. Create a
|
|
// new basic block for the compare instruction.
|
|
auto *CmpBB = BasicBlock::Create(Preheader->getContext(), "byte.compare",
|
|
Preheader->getParent());
|
|
CmpBB->moveBefore(EndBB);
|
|
|
|
// Replace the branch in the preheader with an always-true conditional branch.
|
|
// This ensures there is still a reference to the original loop.
|
|
Builder.CreateCondBr(Builder.getTrue(), CmpBB, Header);
|
|
PHBranch->eraseFromParent();
|
|
|
|
BasicBlock *MismatchEnd = cast<Instruction>(ByteCmpRes)->getParent();
|
|
DTU.applyUpdates({{DominatorTree::Insert, MismatchEnd, CmpBB}});
|
|
|
|
// Create the branch to either the end or found block depending on the value
|
|
// returned by the intrinsic.
|
|
Builder.SetInsertPoint(CmpBB);
|
|
if (FoundBB != EndBB) {
|
|
Value *FoundCmp = Builder.CreateICmpEQ(ByteCmpRes, MaxLen);
|
|
Builder.CreateCondBr(FoundCmp, EndBB, FoundBB);
|
|
DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB},
|
|
{DominatorTree::Insert, CmpBB, EndBB}});
|
|
|
|
} else {
|
|
Builder.CreateBr(FoundBB);
|
|
DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB}});
|
|
}
|
|
|
|
auto fixSuccessorPhis = [&](BasicBlock *SuccBB) {
|
|
for (PHINode &PN : SuccBB->phis()) {
|
|
// At this point we've already replaced all uses of the result from the
|
|
// loop with ByteCmp. Look through the incoming values to find ByteCmp,
|
|
// meaning this is a Phi collecting the results of the byte compare.
|
|
bool ResPhi = false;
|
|
for (Value *Op : PN.incoming_values())
|
|
if (Op == ByteCmpRes) {
|
|
ResPhi = true;
|
|
break;
|
|
}
|
|
|
|
// Any PHI that depended upon the result of the byte compare needs a new
|
|
// incoming value from CmpBB. This is because the original loop will get
|
|
// deleted.
|
|
if (ResPhi)
|
|
PN.addIncoming(ByteCmpRes, CmpBB);
|
|
else {
|
|
// There should be no other outside uses of other values in the
|
|
// original loop. Any incoming values should either:
|
|
// 1. Be for blocks outside the loop, which aren't interesting. Or ..
|
|
// 2. These are from blocks in the loop with values defined outside
|
|
// the loop. We should a similar incoming value from CmpBB.
|
|
for (BasicBlock *BB : PN.blocks())
|
|
if (CurLoop->contains(BB)) {
|
|
PN.addIncoming(PN.getIncomingValueForBlock(BB), CmpBB);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
// Ensure all Phis in the successors of CmpBB have an incoming value from it.
|
|
fixSuccessorPhis(EndBB);
|
|
if (EndBB != FoundBB)
|
|
fixSuccessorPhis(FoundBB);
|
|
|
|
// The new CmpBB block isn't part of the loop, but will need to be added to
|
|
// the outer loop if there is one.
|
|
if (!CurLoop->isOutermost())
|
|
CurLoop->getParentLoop()->addBasicBlockToLoop(CmpBB, *LI);
|
|
|
|
if (VerifyLoops && CurLoop->getParentLoop()) {
|
|
CurLoop->getParentLoop()->verifyLoop();
|
|
if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(*DT, *LI))
|
|
report_fatal_error("Loops must remain in LCSSA form!");
|
|
}
|
|
}
|
|
|
|
bool LoopIdiomVectorize::recognizeFindFirstByte() {
|
|
// Currently the transformation only works on scalable vector types, although
|
|
// there is no fundamental reason why it cannot be made to work for fixed
|
|
// vectors. We also need to know the target's minimum page size in order to
|
|
// generate runtime memory checks to ensure the vector version won't fault.
|
|
if (!TTI->supportsScalableVectors() || !TTI->getMinPageSize().has_value() ||
|
|
DisableFindFirstByte)
|
|
return false;
|
|
|
|
// Define some constants we need throughout.
|
|
BasicBlock *Header = CurLoop->getHeader();
|
|
LLVMContext &Ctx = Header->getContext();
|
|
|
|
// We are expecting the four blocks defined below: Header, MatchBB, InnerBB,
|
|
// and OuterBB. For now, we will bail our for almost anything else. The Four
|
|
// blocks contain one nested loop.
|
|
if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 4 ||
|
|
CurLoop->getSubLoops().size() != 1)
|
|
return false;
|
|
|
|
auto *InnerLoop = CurLoop->getSubLoops().front();
|
|
PHINode *IndPhi = dyn_cast<PHINode>(&Header->front());
|
|
if (!IndPhi || IndPhi->getNumIncomingValues() != 2)
|
|
return false;
|
|
|
|
// Check instruction counts.
|
|
auto LoopBlocks = CurLoop->getBlocks();
|
|
if (LoopBlocks[0]->sizeWithoutDebug() > 3 ||
|
|
LoopBlocks[1]->sizeWithoutDebug() > 4 ||
|
|
LoopBlocks[2]->sizeWithoutDebug() > 3 ||
|
|
LoopBlocks[3]->sizeWithoutDebug() > 3)
|
|
return false;
|
|
|
|
// Check that no instruction other than IndPhi has outside uses.
|
|
for (BasicBlock *BB : LoopBlocks)
|
|
for (Instruction &I : *BB)
|
|
if (&I != IndPhi)
|
|
for (User *U : I.users())
|
|
if (!CurLoop->contains(cast<Instruction>(U)))
|
|
return false;
|
|
|
|
// Match the branch instruction in the header. We are expecting an
|
|
// unconditional branch to the inner loop.
|
|
//
|
|
// Header:
|
|
// %14 = phi ptr [ %24, %OuterBB ], [ %3, %Header.preheader ]
|
|
// %15 = load i8, ptr %14, align 1
|
|
// br label %MatchBB
|
|
BasicBlock *MatchBB;
|
|
if (!match(Header->getTerminator(), m_UnconditionalBr(MatchBB)) ||
|
|
!InnerLoop->contains(MatchBB))
|
|
return false;
|
|
|
|
// MatchBB should be the entrypoint into the inner loop containing the
|
|
// comparison between a search element and a needle.
|
|
//
|
|
// MatchBB:
|
|
// %20 = phi ptr [ %7, %Header ], [ %17, %InnerBB ]
|
|
// %21 = load i8, ptr %20, align 1
|
|
// %22 = icmp eq i8 %15, %21
|
|
// br i1 %22, label %ExitSucc, label %InnerBB
|
|
BasicBlock *ExitSucc, *InnerBB;
|
|
Value *LoadSearch, *LoadNeedle;
|
|
CmpPredicate MatchPred;
|
|
if (!match(MatchBB->getTerminator(),
|
|
m_Br(m_ICmp(MatchPred, m_Value(LoadSearch), m_Value(LoadNeedle)),
|
|
m_BasicBlock(ExitSucc), m_BasicBlock(InnerBB))) ||
|
|
MatchPred != ICmpInst::ICMP_EQ || !InnerLoop->contains(InnerBB))
|
|
return false;
|
|
|
|
// We expect outside uses of `IndPhi' in ExitSucc (and only there).
|
|
for (User *U : IndPhi->users())
|
|
if (!CurLoop->contains(cast<Instruction>(U))) {
|
|
auto *PN = dyn_cast<PHINode>(U);
|
|
if (!PN || PN->getParent() != ExitSucc)
|
|
return false;
|
|
}
|
|
|
|
// Match the loads and check they are simple.
|
|
Value *Search, *Needle;
|
|
if (!match(LoadSearch, m_Load(m_Value(Search))) ||
|
|
!match(LoadNeedle, m_Load(m_Value(Needle))) ||
|
|
!cast<LoadInst>(LoadSearch)->isSimple() ||
|
|
!cast<LoadInst>(LoadNeedle)->isSimple())
|
|
return false;
|
|
|
|
// Check we are loading valid characters.
|
|
Type *CharTy = LoadSearch->getType();
|
|
if (!CharTy->isIntegerTy() || LoadNeedle->getType() != CharTy)
|
|
return false;
|
|
|
|
// Pick the vectorisation factor based on CharTy, work out the cost of the
|
|
// match intrinsic and decide if we should use it.
|
|
// Note: For the time being we assume 128-bit vectors.
|
|
unsigned VF = 128 / CharTy->getIntegerBitWidth();
|
|
SmallVector<Type *> Args = {
|
|
ScalableVectorType::get(CharTy, VF), FixedVectorType::get(CharTy, VF),
|
|
ScalableVectorType::get(Type::getInt1Ty(Ctx), VF)};
|
|
IntrinsicCostAttributes Attrs(Intrinsic::experimental_vector_match, Args[2],
|
|
Args);
|
|
if (TTI->getIntrinsicInstrCost(Attrs, TTI::TCK_SizeAndLatency) > 4)
|
|
return false;
|
|
|
|
// The loads come from two PHIs, each with two incoming values.
|
|
PHINode *PSearch = dyn_cast<PHINode>(Search);
|
|
PHINode *PNeedle = dyn_cast<PHINode>(Needle);
|
|
if (!PSearch || PSearch->getNumIncomingValues() != 2 || !PNeedle ||
|
|
PNeedle->getNumIncomingValues() != 2)
|
|
return false;
|
|
|
|
// One PHI comes from the outer loop (PSearch), the other one from the inner
|
|
// loop (PNeedle). PSearch effectively corresponds to IndPhi.
|
|
if (InnerLoop->contains(PSearch))
|
|
std::swap(PSearch, PNeedle);
|
|
if (PSearch != &Header->front() || PNeedle != &MatchBB->front())
|
|
return false;
|
|
|
|
// The incoming values of both PHI nodes should be a gep of 1.
|
|
Value *SearchStart = PSearch->getIncomingValue(0);
|
|
Value *SearchIndex = PSearch->getIncomingValue(1);
|
|
if (CurLoop->contains(PSearch->getIncomingBlock(0)))
|
|
std::swap(SearchStart, SearchIndex);
|
|
|
|
Value *NeedleStart = PNeedle->getIncomingValue(0);
|
|
Value *NeedleIndex = PNeedle->getIncomingValue(1);
|
|
if (InnerLoop->contains(PNeedle->getIncomingBlock(0)))
|
|
std::swap(NeedleStart, NeedleIndex);
|
|
|
|
// Match the GEPs.
|
|
if (!match(SearchIndex, m_GEP(m_Specific(PSearch), m_One())) ||
|
|
!match(NeedleIndex, m_GEP(m_Specific(PNeedle), m_One())))
|
|
return false;
|
|
|
|
// Check the GEPs result type matches `CharTy'.
|
|
GetElementPtrInst *GEPSearch = cast<GetElementPtrInst>(SearchIndex);
|
|
GetElementPtrInst *GEPNeedle = cast<GetElementPtrInst>(NeedleIndex);
|
|
if (GEPSearch->getResultElementType() != CharTy ||
|
|
GEPNeedle->getResultElementType() != CharTy)
|
|
return false;
|
|
|
|
// InnerBB should increment the address of the needle pointer.
|
|
//
|
|
// InnerBB:
|
|
// %17 = getelementptr inbounds i8, ptr %20, i64 1
|
|
// %18 = icmp eq ptr %17, %10
|
|
// br i1 %18, label %OuterBB, label %MatchBB
|
|
BasicBlock *OuterBB;
|
|
Value *NeedleEnd;
|
|
if (!match(InnerBB->getTerminator(),
|
|
m_Br(m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(GEPNeedle),
|
|
m_Value(NeedleEnd)),
|
|
m_BasicBlock(OuterBB), m_Specific(MatchBB))) ||
|
|
!CurLoop->contains(OuterBB))
|
|
return false;
|
|
|
|
// OuterBB should increment the address of the search element pointer.
|
|
//
|
|
// OuterBB:
|
|
// %24 = getelementptr inbounds i8, ptr %14, i64 1
|
|
// %25 = icmp eq ptr %24, %6
|
|
// br i1 %25, label %ExitFail, label %Header
|
|
BasicBlock *ExitFail;
|
|
Value *SearchEnd;
|
|
if (!match(OuterBB->getTerminator(),
|
|
m_Br(m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(GEPSearch),
|
|
m_Value(SearchEnd)),
|
|
m_BasicBlock(ExitFail), m_Specific(Header))))
|
|
return false;
|
|
|
|
if (!CurLoop->isLoopInvariant(SearchStart) ||
|
|
!CurLoop->isLoopInvariant(SearchEnd) ||
|
|
!CurLoop->isLoopInvariant(NeedleStart) ||
|
|
!CurLoop->isLoopInvariant(NeedleEnd))
|
|
return false;
|
|
|
|
LLVM_DEBUG(dbgs() << "Found idiom in loop: \n" << *CurLoop << "\n\n");
|
|
|
|
transformFindFirstByte(IndPhi, VF, CharTy, ExitSucc, ExitFail, SearchStart,
|
|
SearchEnd, NeedleStart, NeedleEnd);
|
|
return true;
|
|
}
|
|
|
|
Value *LoopIdiomVectorize::expandFindFirstByte(
|
|
IRBuilder<> &Builder, DomTreeUpdater &DTU, unsigned VF, Type *CharTy,
|
|
BasicBlock *ExitSucc, BasicBlock *ExitFail, Value *SearchStart,
|
|
Value *SearchEnd, Value *NeedleStart, Value *NeedleEnd) {
|
|
// Set up some types and constants that we intend to reuse.
|
|
auto *PtrTy = Builder.getPtrTy();
|
|
auto *I64Ty = Builder.getInt64Ty();
|
|
auto *PredVTy = ScalableVectorType::get(Builder.getInt1Ty(), VF);
|
|
auto *CharVTy = ScalableVectorType::get(CharTy, VF);
|
|
auto *ConstVF = ConstantInt::get(I64Ty, VF);
|
|
|
|
// Other common arguments.
|
|
BasicBlock *Preheader = CurLoop->getLoopPreheader();
|
|
LLVMContext &Ctx = Preheader->getContext();
|
|
Value *Passthru = ConstantInt::getNullValue(CharVTy);
|
|
|
|
// Split block in the original loop preheader.
|
|
// SPH is the new preheader to the old scalar loop.
|
|
BasicBlock *SPH = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
|
|
nullptr, "scalar_preheader");
|
|
|
|
// Create the blocks that we're going to use.
|
|
//
|
|
// We will have the following loops:
|
|
// (O) Outer loop where we iterate over the elements of the search array.
|
|
// (I) Inner loop where we iterate over the elements of the needle array.
|
|
//
|
|
// Overall, the blocks do the following:
|
|
// (0) Check if the arrays can't cross page boundaries. If so go to (1),
|
|
// otherwise fall back to the original scalar loop.
|
|
// (1) Load the search array. Go to (2).
|
|
// (2) (a) Load the needle array.
|
|
// (b) Splat the first element to the inactive lanes.
|
|
// (c) Check if any elements match. If so go to (3), otherwise go to (4).
|
|
// (3) Compute the index of the first match and exit.
|
|
// (4) Check if we've reached the end of the needle array. If not loop back to
|
|
// (2), otherwise go to (5).
|
|
// (5) Check if we've reached the end of the search array. If not loop back to
|
|
// (1), otherwise exit.
|
|
// Blocks (0,3) are not part of any loop. Blocks (1,5) and (2,4) belong to
|
|
// the outer and inner loops, respectively.
|
|
BasicBlock *BB0 = BasicBlock::Create(Ctx, "mem_check", SPH->getParent(), SPH);
|
|
BasicBlock *BB1 =
|
|
BasicBlock::Create(Ctx, "find_first_vec_header", SPH->getParent(), SPH);
|
|
BasicBlock *BB2 =
|
|
BasicBlock::Create(Ctx, "match_check_vec", SPH->getParent(), SPH);
|
|
BasicBlock *BB3 =
|
|
BasicBlock::Create(Ctx, "calculate_match", SPH->getParent(), SPH);
|
|
BasicBlock *BB4 =
|
|
BasicBlock::Create(Ctx, "needle_check_vec", SPH->getParent(), SPH);
|
|
BasicBlock *BB5 =
|
|
BasicBlock::Create(Ctx, "search_check_vec", SPH->getParent(), SPH);
|
|
|
|
// Update LoopInfo with the new loops.
|
|
auto OuterLoop = LI->AllocateLoop();
|
|
auto InnerLoop = LI->AllocateLoop();
|
|
|
|
if (auto ParentLoop = CurLoop->getParentLoop()) {
|
|
ParentLoop->addBasicBlockToLoop(BB0, *LI);
|
|
ParentLoop->addChildLoop(OuterLoop);
|
|
ParentLoop->addBasicBlockToLoop(BB3, *LI);
|
|
} else {
|
|
LI->addTopLevelLoop(OuterLoop);
|
|
}
|
|
|
|
// Add the inner loop to the outer.
|
|
OuterLoop->addChildLoop(InnerLoop);
|
|
|
|
// Add the new basic blocks to the corresponding loops.
|
|
OuterLoop->addBasicBlockToLoop(BB1, *LI);
|
|
OuterLoop->addBasicBlockToLoop(BB5, *LI);
|
|
InnerLoop->addBasicBlockToLoop(BB2, *LI);
|
|
InnerLoop->addBasicBlockToLoop(BB4, *LI);
|
|
|
|
// Update the terminator added by SplitBlock to branch to the first block.
|
|
Preheader->getTerminator()->setSuccessor(0, BB0);
|
|
DTU.applyUpdates({{DominatorTree::Delete, Preheader, SPH},
|
|
{DominatorTree::Insert, Preheader, BB0}});
|
|
|
|
// (0) Check if we could be crossing a page boundary; if so, fallback to the
|
|
// old scalar loops. Also create a predicate of VF elements to be used in the
|
|
// vector loops.
|
|
Builder.SetInsertPoint(BB0);
|
|
Value *ISearchStart =
|
|
Builder.CreatePtrToInt(SearchStart, I64Ty, "search_start_int");
|
|
Value *ISearchEnd =
|
|
Builder.CreatePtrToInt(SearchEnd, I64Ty, "search_end_int");
|
|
Value *INeedleStart =
|
|
Builder.CreatePtrToInt(NeedleStart, I64Ty, "needle_start_int");
|
|
Value *INeedleEnd =
|
|
Builder.CreatePtrToInt(NeedleEnd, I64Ty, "needle_end_int");
|
|
Value *PredVF =
|
|
Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
|
|
{ConstantInt::get(I64Ty, 0), ConstVF});
|
|
|
|
const uint64_t MinPageSize = TTI->getMinPageSize().value();
|
|
const uint64_t AddrShiftAmt = llvm::Log2_64(MinPageSize);
|
|
Value *SearchStartPage =
|
|
Builder.CreateLShr(ISearchStart, AddrShiftAmt, "search_start_page");
|
|
Value *SearchEndPage =
|
|
Builder.CreateLShr(ISearchEnd, AddrShiftAmt, "search_end_page");
|
|
Value *NeedleStartPage =
|
|
Builder.CreateLShr(INeedleStart, AddrShiftAmt, "needle_start_page");
|
|
Value *NeedleEndPage =
|
|
Builder.CreateLShr(INeedleEnd, AddrShiftAmt, "needle_end_page");
|
|
Value *SearchPageCmp =
|
|
Builder.CreateICmpNE(SearchStartPage, SearchEndPage, "search_page_cmp");
|
|
Value *NeedlePageCmp =
|
|
Builder.CreateICmpNE(NeedleStartPage, NeedleEndPage, "needle_page_cmp");
|
|
|
|
Value *CombinedPageCmp =
|
|
Builder.CreateOr(SearchPageCmp, NeedlePageCmp, "combined_page_cmp");
|
|
BranchInst *CombinedPageBr = Builder.CreateCondBr(CombinedPageCmp, SPH, BB1);
|
|
CombinedPageBr->setMetadata(LLVMContext::MD_prof,
|
|
MDBuilder(Ctx).createBranchWeights(10, 90));
|
|
DTU.applyUpdates(
|
|
{{DominatorTree::Insert, BB0, SPH}, {DominatorTree::Insert, BB0, BB1}});
|
|
|
|
// (1) Load the search array and branch to the inner loop.
|
|
Builder.SetInsertPoint(BB1);
|
|
PHINode *Search = Builder.CreatePHI(PtrTy, 2, "psearch");
|
|
Value *PredSearch = Builder.CreateIntrinsic(
|
|
Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
|
|
{Builder.CreatePtrToInt(Search, I64Ty), ISearchEnd}, nullptr,
|
|
"search_pred");
|
|
PredSearch = Builder.CreateAnd(PredVF, PredSearch, "search_masked");
|
|
Value *LoadSearch = Builder.CreateMaskedLoad(
|
|
CharVTy, Search, Align(1), PredSearch, Passthru, "search_load_vec");
|
|
Builder.CreateBr(BB2);
|
|
DTU.applyUpdates({{DominatorTree::Insert, BB1, BB2}});
|
|
|
|
// (2) Inner loop.
|
|
Builder.SetInsertPoint(BB2);
|
|
PHINode *Needle = Builder.CreatePHI(PtrTy, 2, "pneedle");
|
|
|
|
// (2.a) Load the needle array.
|
|
Value *PredNeedle = Builder.CreateIntrinsic(
|
|
Intrinsic::get_active_lane_mask, {PredVTy, I64Ty},
|
|
{Builder.CreatePtrToInt(Needle, I64Ty), INeedleEnd}, nullptr,
|
|
"needle_pred");
|
|
PredNeedle = Builder.CreateAnd(PredVF, PredNeedle, "needle_masked");
|
|
Value *LoadNeedle = Builder.CreateMaskedLoad(
|
|
CharVTy, Needle, Align(1), PredNeedle, Passthru, "needle_load_vec");
|
|
|
|
// (2.b) Splat the first element to the inactive lanes.
|
|
Value *Needle0 =
|
|
Builder.CreateExtractElement(LoadNeedle, uint64_t(0), "needle0");
|
|
Value *Needle0Splat = Builder.CreateVectorSplat(ElementCount::getScalable(VF),
|
|
Needle0, "needle0");
|
|
LoadNeedle = Builder.CreateSelect(PredNeedle, LoadNeedle, Needle0Splat,
|
|
"needle_splat");
|
|
LoadNeedle = Builder.CreateExtractVector(
|
|
FixedVectorType::get(CharTy, VF), LoadNeedle, uint64_t(0), "needle_vec");
|
|
|
|
// (2.c) Test if there's a match.
|
|
Value *MatchPred = Builder.CreateIntrinsic(
|
|
Intrinsic::experimental_vector_match, {CharVTy, LoadNeedle->getType()},
|
|
{LoadSearch, LoadNeedle, PredSearch}, nullptr, "match_pred");
|
|
Value *IfAnyMatch = Builder.CreateOrReduce(MatchPred);
|
|
Builder.CreateCondBr(IfAnyMatch, BB3, BB4);
|
|
DTU.applyUpdates(
|
|
{{DominatorTree::Insert, BB2, BB3}, {DominatorTree::Insert, BB2, BB4}});
|
|
|
|
// (3) We found a match. Compute the index of its location and exit.
|
|
Builder.SetInsertPoint(BB3);
|
|
PHINode *MatchLCSSA = Builder.CreatePHI(PtrTy, 1, "match_start");
|
|
PHINode *MatchPredLCSSA =
|
|
Builder.CreatePHI(MatchPred->getType(), 1, "match_vec");
|
|
Value *MatchCnt = Builder.CreateIntrinsic(
|
|
Intrinsic::experimental_cttz_elts, {I64Ty, MatchPred->getType()},
|
|
{MatchPredLCSSA, /*ZeroIsPoison=*/Builder.getInt1(true)}, nullptr,
|
|
"match_idx");
|
|
Value *MatchVal =
|
|
Builder.CreateGEP(CharTy, MatchLCSSA, MatchCnt, "match_res");
|
|
Builder.CreateBr(ExitSucc);
|
|
DTU.applyUpdates({{DominatorTree::Insert, BB3, ExitSucc}});
|
|
|
|
// (4) Check if we've reached the end of the needle array.
|
|
Builder.SetInsertPoint(BB4);
|
|
Value *NextNeedle =
|
|
Builder.CreateGEP(CharTy, Needle, ConstVF, "needle_next_vec");
|
|
Builder.CreateCondBr(Builder.CreateICmpULT(NextNeedle, NeedleEnd), BB2, BB5);
|
|
DTU.applyUpdates(
|
|
{{DominatorTree::Insert, BB4, BB2}, {DominatorTree::Insert, BB4, BB5}});
|
|
|
|
// (5) Check if we've reached the end of the search array.
|
|
Builder.SetInsertPoint(BB5);
|
|
Value *NextSearch =
|
|
Builder.CreateGEP(CharTy, Search, ConstVF, "search_next_vec");
|
|
Builder.CreateCondBr(Builder.CreateICmpULT(NextSearch, SearchEnd), BB1,
|
|
ExitFail);
|
|
DTU.applyUpdates({{DominatorTree::Insert, BB5, BB1},
|
|
{DominatorTree::Insert, BB5, ExitFail}});
|
|
|
|
// Set up the PHI nodes.
|
|
Search->addIncoming(SearchStart, BB0);
|
|
Search->addIncoming(NextSearch, BB5);
|
|
Needle->addIncoming(NeedleStart, BB1);
|
|
Needle->addIncoming(NextNeedle, BB4);
|
|
// These are needed to retain LCSSA form.
|
|
MatchLCSSA->addIncoming(Search, BB2);
|
|
MatchPredLCSSA->addIncoming(MatchPred, BB2);
|
|
|
|
if (VerifyLoops) {
|
|
OuterLoop->verifyLoop();
|
|
InnerLoop->verifyLoop();
|
|
if (!OuterLoop->isRecursivelyLCSSAForm(*DT, *LI))
|
|
report_fatal_error("Loops must remain in LCSSA form!");
|
|
}
|
|
|
|
return MatchVal;
|
|
}
|
|
|
|
void LoopIdiomVectorize::transformFindFirstByte(
|
|
PHINode *IndPhi, unsigned VF, Type *CharTy, BasicBlock *ExitSucc,
|
|
BasicBlock *ExitFail, Value *SearchStart, Value *SearchEnd,
|
|
Value *NeedleStart, Value *NeedleEnd) {
|
|
// Insert the find first byte code at the end of the preheader block.
|
|
BasicBlock *Preheader = CurLoop->getLoopPreheader();
|
|
BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
|
|
IRBuilder<> Builder(PHBranch);
|
|
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
|
|
Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc());
|
|
|
|
Value *MatchVal =
|
|
expandFindFirstByte(Builder, DTU, VF, CharTy, ExitSucc, ExitFail,
|
|
SearchStart, SearchEnd, NeedleStart, NeedleEnd);
|
|
|
|
assert(PHBranch->isUnconditional() &&
|
|
"Expected preheader to terminate with an unconditional branch.");
|
|
|
|
// Add new incoming values with the result of the transformation to PHINodes
|
|
// of ExitSucc that use IndPhi.
|
|
for (auto *U : llvm::make_early_inc_range(IndPhi->users())) {
|
|
auto *PN = dyn_cast<PHINode>(U);
|
|
if (PN && PN->getParent() == ExitSucc)
|
|
PN->addIncoming(MatchVal, cast<Instruction>(MatchVal)->getParent());
|
|
}
|
|
|
|
if (VerifyLoops && CurLoop->getParentLoop()) {
|
|
CurLoop->getParentLoop()->verifyLoop();
|
|
if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(*DT, *LI))
|
|
report_fatal_error("Loops must remain in LCSSA form!");
|
|
}
|
|
}
|