Currently the logic for introducing a header mask and predicating the
vector loop region is done inside introduceMasksAndLinearize.
This splits the tail folding part out into an individual VPlan transform
so that VPlanPredicator.cpp doesn't need to worry about tail folding,
which seemed to be a temporary measure according to a comment in
VPlanTransforms.h.
To perform tail folding independently, this splits the "body" of the
vector loop region between the phis in the header and the branch + iv
increment in the latch:
Before:
```
+-------------------------------------------+
|%iv = ... |
|... |
|%iv.next = add %iv, vfxuf |
|branch-on-count %iv.next, vector-trip-count|
+-------------------------------------------+
```
After:
```
+-------------------------------------------+
|%iv = ... |
|%wide.iv = widen-canonical-iv ... |
|%header-mask = icmp ule %wide.iv, BTC |---+
|branch-on-cond %header-mask | |
+-------------------------------------------+ |
| |
v |
+-------------------------------------------+ |
|... | |
+-------------------------------------------+ |
| |
v |
+-------------------------------------------+ |
|%iv.next = add %iv, vfxuf |<--+
|branch-on-count %iv.next, vector-trip-count|
+-------------------------------------------+
```
Phis are then inserted in the latch for any value in the loop body that
have outside uses, with poison as their incoming value from the header
edge.
The motivation for this is to allow us to share the same "predicate all
successor blocks" type of predication we do for tail folding, but for
early-exit loops in #172454. This may also allow us to directly emit an
EVL based header mask, instead of having to match + transform the
existing header mask in addExplicitVectorLength.
This also allows us to eventually handle recurrences in the same
transform, avoiding the need to special case tail folding in
addReductionResultComputation.
290 lines
11 KiB
C++
290 lines
11 KiB
C++
//===-- VPlanPredicator.cpp - VPlan predicator ----------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
///
|
|
/// \file
|
|
/// This file implements predication for VPlans.
|
|
///
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "VPRecipeBuilder.h"
|
|
#include "VPlan.h"
|
|
#include "VPlanCFG.h"
|
|
#include "VPlanPatternMatch.h"
|
|
#include "VPlanTransforms.h"
|
|
#include "VPlanUtils.h"
|
|
#include "llvm/ADT/PostOrderIterator.h"
|
|
|
|
using namespace llvm;
|
|
using namespace VPlanPatternMatch;
|
|
|
|
namespace {
|
|
class VPPredicator {
|
|
/// Builder to construct recipes to compute masks.
|
|
VPBuilder Builder;
|
|
|
|
/// When we if-convert we need to create edge masks. We have to cache values
|
|
/// so that we don't end up with exponential recursion/IR.
|
|
using EdgeMaskCacheTy =
|
|
DenseMap<std::pair<const VPBasicBlock *, const VPBasicBlock *>,
|
|
VPValue *>;
|
|
using BlockMaskCacheTy = DenseMap<const VPBasicBlock *, VPValue *>;
|
|
EdgeMaskCacheTy EdgeMaskCache;
|
|
|
|
BlockMaskCacheTy BlockMaskCache;
|
|
|
|
/// Create an edge mask for every destination of cases and/or default.
|
|
void createSwitchEdgeMasks(const VPInstruction *SI);
|
|
|
|
/// Computes and return the predicate of the edge between \p Src and \p Dst,
|
|
/// possibly inserting new recipes at \p Dst (using Builder's insertion point)
|
|
VPValue *createEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst);
|
|
|
|
/// Record \p Mask as the *entry* mask of \p VPBB, which is expected to not
|
|
/// already have a mask.
|
|
void setBlockInMask(const VPBasicBlock *VPBB, VPValue *Mask) {
|
|
// TODO: Include the masks as operands in the predicated VPlan directly to
|
|
// avoid keeping the map of masks beyond the predication transform.
|
|
assert(!getBlockInMask(VPBB) && "Mask already set");
|
|
BlockMaskCache[VPBB] = Mask;
|
|
}
|
|
|
|
/// Record \p Mask as the mask of the edge from \p Src to \p Dst. The edge is
|
|
/// expected to not have a mask already.
|
|
VPValue *setEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst,
|
|
VPValue *Mask) {
|
|
assert(Src != Dst && "Src and Dst must be different");
|
|
assert(!getEdgeMask(Src, Dst) && "Mask already set");
|
|
return EdgeMaskCache[{Src, Dst}] = Mask;
|
|
}
|
|
|
|
public:
|
|
/// Returns the *entry* mask for \p VPBB.
|
|
VPValue *getBlockInMask(const VPBasicBlock *VPBB) const {
|
|
return BlockMaskCache.lookup(VPBB);
|
|
}
|
|
|
|
/// Returns the precomputed predicate of the edge from \p Src to \p Dst.
|
|
VPValue *getEdgeMask(const VPBasicBlock *Src, const VPBasicBlock *Dst) const {
|
|
return EdgeMaskCache.lookup({Src, Dst});
|
|
}
|
|
|
|
/// Compute the predicate of \p VPBB.
|
|
void createBlockInMask(VPBasicBlock *VPBB);
|
|
|
|
/// Convert phi recipes in \p VPBB to VPBlendRecipes.
|
|
void convertPhisToBlends(VPBasicBlock *VPBB);
|
|
};
|
|
} // namespace
|
|
|
|
VPValue *VPPredicator::createEdgeMask(const VPBasicBlock *Src,
|
|
const VPBasicBlock *Dst) {
|
|
assert(is_contained(Dst->getPredecessors(), Src) && "Invalid edge");
|
|
|
|
// Look for cached value.
|
|
VPValue *EdgeMask = getEdgeMask(Src, Dst);
|
|
if (EdgeMask)
|
|
return EdgeMask;
|
|
|
|
VPValue *SrcMask = getBlockInMask(Src);
|
|
|
|
// If there's a single successor, there's no terminator recipe.
|
|
if (Src->getNumSuccessors() == 1)
|
|
return setEdgeMask(Src, Dst, SrcMask);
|
|
|
|
auto *Term = cast<VPInstruction>(Src->getTerminator());
|
|
if (Term->getOpcode() == Instruction::Switch) {
|
|
createSwitchEdgeMasks(Term);
|
|
return getEdgeMask(Src, Dst);
|
|
}
|
|
|
|
assert(Term->getOpcode() == VPInstruction::BranchOnCond &&
|
|
"Unsupported terminator");
|
|
if (Src->getSuccessors()[0] == Src->getSuccessors()[1])
|
|
return setEdgeMask(Src, Dst, SrcMask);
|
|
|
|
EdgeMask = Term->getOperand(0);
|
|
assert(EdgeMask && "No Edge Mask found for condition");
|
|
|
|
if (Src->getSuccessors()[0] != Dst)
|
|
EdgeMask = Builder.createNot(EdgeMask, Term->getDebugLoc());
|
|
|
|
if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
|
|
// The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
|
|
// is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
|
|
// instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
|
|
EdgeMask = Builder.createLogicalAnd(SrcMask, EdgeMask, Term->getDebugLoc());
|
|
}
|
|
|
|
return setEdgeMask(Src, Dst, EdgeMask);
|
|
}
|
|
|
|
void VPPredicator::createBlockInMask(VPBasicBlock *VPBB) {
|
|
// Start inserting after the block's phis, which be replaced by blends later.
|
|
Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
|
|
// All-one mask is modelled as no-mask following the convention for masked
|
|
// load/store/gather/scatter. Initialize BlockMask to no-mask.
|
|
VPValue *BlockMask = nullptr;
|
|
// This is the block mask. We OR all unique incoming edges.
|
|
for (auto *Predecessor : SetVector<VPBlockBase *>(
|
|
VPBB->getPredecessors().begin(), VPBB->getPredecessors().end())) {
|
|
VPValue *EdgeMask = createEdgeMask(cast<VPBasicBlock>(Predecessor), VPBB);
|
|
if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is
|
|
// too.
|
|
setBlockInMask(VPBB, EdgeMask);
|
|
return;
|
|
}
|
|
|
|
if (!BlockMask) { // BlockMask has its initial nullptr value.
|
|
BlockMask = EdgeMask;
|
|
continue;
|
|
}
|
|
|
|
BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
|
|
}
|
|
|
|
setBlockInMask(VPBB, BlockMask);
|
|
}
|
|
|
|
void VPPredicator::createSwitchEdgeMasks(const VPInstruction *SI) {
|
|
const VPBasicBlock *Src = SI->getParent();
|
|
|
|
// Create masks where SI is a switch. We create masks for all edges from SI's
|
|
// parent block at the same time. This is more efficient, as we can create and
|
|
// collect compares for all cases once.
|
|
VPValue *Cond = SI->getOperand(0);
|
|
VPBasicBlock *DefaultDst = cast<VPBasicBlock>(Src->getSuccessors()[0]);
|
|
MapVector<VPBasicBlock *, SmallVector<VPValue *>> Dst2Compares;
|
|
for (const auto &[Idx, Succ] : enumerate(drop_begin(Src->getSuccessors()))) {
|
|
VPBasicBlock *Dst = cast<VPBasicBlock>(Succ);
|
|
assert(!getEdgeMask(Src, Dst) && "Edge masks already created");
|
|
// Cases whose destination is the same as default are redundant and can
|
|
// be ignored - they will get there anyhow.
|
|
if (Dst == DefaultDst)
|
|
continue;
|
|
auto &Compares = Dst2Compares[Dst];
|
|
VPValue *V = SI->getOperand(Idx + 1);
|
|
Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V));
|
|
}
|
|
|
|
// We need to handle 2 separate cases below for all entries in Dst2Compares,
|
|
// which excludes destinations matching the default destination.
|
|
VPValue *SrcMask = getBlockInMask(Src);
|
|
VPValue *DefaultMask = nullptr;
|
|
for (const auto &[Dst, Conds] : Dst2Compares) {
|
|
// 1. Dst is not the default destination. Dst is reached if any of the
|
|
// cases with destination == Dst are taken. Join the conditions for each
|
|
// case whose destination == Dst using an OR.
|
|
VPValue *Mask = Conds[0];
|
|
for (VPValue *V : drop_begin(Conds))
|
|
Mask = Builder.createOr(Mask, V);
|
|
if (SrcMask)
|
|
Mask = Builder.createLogicalAnd(SrcMask, Mask);
|
|
setEdgeMask(Src, Dst, Mask);
|
|
|
|
// 2. Create the mask for the default destination, which is reached if
|
|
// none of the cases with destination != default destination are taken.
|
|
// Join the conditions for each case where the destination is != Dst using
|
|
// an OR and negate it.
|
|
DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;
|
|
}
|
|
|
|
if (DefaultMask) {
|
|
DefaultMask = Builder.createNot(DefaultMask);
|
|
if (SrcMask)
|
|
DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask);
|
|
} else {
|
|
// There are no destinations other than the default destination, so this is
|
|
// an unconditional branch.
|
|
DefaultMask = SrcMask;
|
|
}
|
|
setEdgeMask(Src, DefaultDst, DefaultMask);
|
|
}
|
|
|
|
void VPPredicator::convertPhisToBlends(VPBasicBlock *VPBB) {
|
|
SmallVector<VPPhi *> Phis;
|
|
for (VPRecipeBase &R : VPBB->phis())
|
|
Phis.push_back(cast<VPPhi>(&R));
|
|
for (VPPhi *PhiR : Phis) {
|
|
// The non-header Phi is converted into a Blend recipe below,
|
|
// so we don't have to worry about the insertion order and we can just use
|
|
// the builder. At this point we generate the predication tree. There may
|
|
// be duplications since this is a simple recursive scan, but future
|
|
// optimizations will clean it up.
|
|
|
|
auto NotPoison = make_filter_range(PhiR->incoming_values(), [](VPValue *V) {
|
|
return !match(V, m_Poison());
|
|
});
|
|
if (all_equal(NotPoison)) {
|
|
PhiR->replaceAllUsesWith(NotPoison.empty() ? PhiR->getIncomingValue(0)
|
|
: *NotPoison.begin());
|
|
PhiR->eraseFromParent();
|
|
continue;
|
|
}
|
|
|
|
SmallVector<VPValue *, 2> OperandsWithMask;
|
|
for (const auto &[InVPV, InVPBB] : PhiR->incoming_values_and_blocks()) {
|
|
OperandsWithMask.push_back(InVPV);
|
|
OperandsWithMask.push_back(getEdgeMask(InVPBB, VPBB));
|
|
}
|
|
PHINode *IRPhi = cast_or_null<PHINode>(PhiR->getUnderlyingValue());
|
|
auto *Blend =
|
|
new VPBlendRecipe(IRPhi, OperandsWithMask, *PhiR, PhiR->getDebugLoc());
|
|
Builder.insert(Blend);
|
|
PhiR->replaceAllUsesWith(Blend);
|
|
PhiR->eraseFromParent();
|
|
}
|
|
}
|
|
|
|
void VPlanTransforms::introduceMasksAndLinearize(VPlan &Plan) {
|
|
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
|
|
// Scan the body of the loop in a topological order to visit each basic block
|
|
// after having visited its predecessor basic blocks.
|
|
VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
|
|
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
|
|
Header);
|
|
VPPredicator Predicator;
|
|
for (VPBlockBase *VPB : RPOT) {
|
|
// Non-outer regions with VPBBs only are supported at the moment.
|
|
auto *VPBB = cast<VPBasicBlock>(VPB);
|
|
// Introduce the mask for VPBB, which may introduce needed edge masks, and
|
|
// convert all phi recipes of VPBB to blend recipes unless VPBB is the
|
|
// header.
|
|
if (VPBB != Header) {
|
|
Predicator.createBlockInMask(VPBB);
|
|
Predicator.convertPhisToBlends(VPBB);
|
|
}
|
|
|
|
VPValue *BlockMask = Predicator.getBlockInMask(VPBB);
|
|
if (!BlockMask)
|
|
continue;
|
|
|
|
// Mask all VPInstructions in the block.
|
|
for (VPRecipeBase &R : *VPBB) {
|
|
if (auto *VPI = dyn_cast<VPInstruction>(&R))
|
|
VPI->addMask(BlockMask);
|
|
}
|
|
}
|
|
|
|
// Linearize the blocks of the loop into one serial chain.
|
|
VPBlockBase *PrevVPBB = nullptr;
|
|
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
|
|
auto Successors = to_vector(VPBB->getSuccessors());
|
|
if (Successors.size() > 1)
|
|
VPBB->getTerminator()->eraseFromParent();
|
|
|
|
// Flatten the CFG in the loop. To do so, first disconnect VPBB from its
|
|
// successors. Then connect VPBB to the previously visited VPBB.
|
|
for (auto *Succ : Successors)
|
|
VPBlockUtils::disconnectBlocks(VPBB, Succ);
|
|
if (PrevVPBB)
|
|
VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
|
|
|
|
PrevVPBB = VPBB;
|
|
}
|
|
}
|