[SLP] Replace MainOp and AltOp in TreeEntry with InstructionsState. (#122443)

Add TreeEntry::hasState.
Add an assert to getTreeEntry.
Remove the OpValue parameter from the canReuseExtract function.
Remove the Opcode parameter from the ComputeMaxBitWidth lambda.
Han-Kuan Chen 2025-01-18 10:23:20 +08:00 committed by GitHub
parent 5b6a26ccdd
commit 07d496538f
4 changed files with 225 additions and 110 deletions
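Taken together, these changes collapse TreeEntry's two raw instruction pointers (MainOp/AltOp) into a single InstructionsState value and gate every opcode query behind the new TreeEntry::hasState(). The following self-contained C++ sketch shows the resulting shape; Instruction here is a minimal stand-in for llvm::Instruction, and both classes are trimmed to the accessors the patch routes through the state object, so this is an illustration rather than the real SLPVectorizer code.

// Stand-in for llvm::Instruction, just enough to make the sketch compile.
struct Instruction {
  unsigned Opcode = 0;
  unsigned getOpcode() const { return Opcode; }
};

class InstructionsState {
  Instruction *MainOp = nullptr; // Representative instruction of the bundle.
  Instruction *AltOp = nullptr;  // Alternate instruction (== MainOp if none).

public:
  InstructionsState() = default;
  InstructionsState(Instruction *Main, Instruction *Alt)
      : MainOp(Main), AltOp(Alt) {}

  static InstructionsState invalid() { return {}; }
  bool valid() const { return MainOp && AltOp; }
  explicit operator bool() const { return valid(); }

  Instruction *getMainOp() const { return MainOp; }
  Instruction *getAltOp() const { return AltOp; }
  unsigned getOpcode() const { return MainOp->getOpcode(); }
  unsigned getAltOpcode() const { return AltOp->getOpcode(); }
  bool isAltShuffle() const { return MainOp != AltOp; }
};

// TreeEntry keeps one state object instead of two pointers; opcode queries
// are only meaningful once hasState() is true.
struct TreeEntry {
  InstructionsState S = InstructionsState::invalid();

  bool hasState() const { return S.valid(); }
  Instruction *getMainOp() const { return S.getMainOp(); }
  unsigned getOpcode() const { return S.getOpcode(); }
  bool isAltShuffle() const { return S.isAltShuffle(); }
};

One behavioral shift follows from this: the old TreeEntry::getOpcode() returned 0 when MainOp was null, while the new one forwards to the state unconditionally. That is why the diff below inserts hasState() guards in front of nearly every getOpcode()/isAltShuffle() call on gather nodes.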


@@ -2414,15 +2414,17 @@ public:
     }

     /// Go through the instructions in VL and append their operands.
-    void appendOperandsOfVL(ArrayRef<Value *> VL, Instruction *VL0) {
+    void appendOperandsOfVL(ArrayRef<Value *> VL, const InstructionsState &S) {
       assert(!VL.empty() && "Bad VL");
       assert((empty() || VL.size() == getNumLanes()) &&
              "Expected same number of lanes");
+      assert(S.valid() && "InstructionsState is invalid.");
       // IntrinsicInst::isCommutative returns true if swapping the first "two"
       // arguments to the intrinsic produces the same result.
       constexpr unsigned IntrinsicNumOperands = 2;
-      unsigned NumOperands = VL0->getNumOperands();
-      ArgSize = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands : NumOperands;
+      Instruction *MainOp = S.getMainOp();
+      unsigned NumOperands = MainOp->getNumOperands();
+      ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
       OpsVec.resize(NumOperands);
       unsigned NumLanes = VL.size();
       for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
@@ -2441,19 +2443,19 @@ public:
           // operations or alternating sequences (e.g., +, -), we can safely
           // tell the inverse operations by checking commutativity.
           if (isa<PoisonValue>(VL[Lane])) {
-            if (auto *EI = dyn_cast<ExtractElementInst>(VL0)) {
+            if (auto *EI = dyn_cast<ExtractElementInst>(MainOp)) {
               if (OpIdx == 0) {
                 OpsVec[OpIdx][Lane] = {EI->getVectorOperand(), true, false};
                 continue;
               }
-            } else if (auto *EV = dyn_cast<ExtractValueInst>(VL0)) {
+            } else if (auto *EV = dyn_cast<ExtractValueInst>(MainOp)) {
               if (OpIdx == 0) {
                 OpsVec[OpIdx][Lane] = {EV->getAggregateOperand(), true, false};
                 continue;
               }
             }
             OpsVec[OpIdx][Lane] = {
-                PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true,
+                PoisonValue::get(MainOp->getOperand(OpIdx)->getType()), true,
                 false};
             continue;
           }
@@ -2566,11 +2568,12 @@ public:

   public:
     /// Initialize with all the operands of the instruction vector \p RootVL.
-    VLOperands(ArrayRef<Value *> RootVL, Instruction *VL0, const BoUpSLP &R)
+    VLOperands(ArrayRef<Value *> RootVL, const InstructionsState &S,
+               const BoUpSLP &R)
         : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
-          L(R.LI->getLoopFor((VL0->getParent()))) {
+          L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
       // Append all the operands of RootVL.
-      appendOperandsOfVL(RootVL, VL0);
+      appendOperandsOfVL(RootVL, S);
     }

     /// \Returns a value vector with the operands across all lanes for the
@@ -3043,7 +3046,7 @@ private:
   /// non-identity permutation that allows to reuse extract instructions.
   /// \param ResizeAllowed indicates whether it is allowed to handle subvector
   /// extract order.
-  bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
+  bool canReuseExtract(ArrayRef<Value *> VL,
                        SmallVectorImpl<unsigned> &CurrentOrder,
                        bool ResizeAllowed = false) const;

@@ -3270,7 +3273,7 @@ private:
     };

     /// Checks if the current node is a gather node.
-    bool isGather() const {return State == NeedToGather; }
+    bool isGather() const { return State == NeedToGather; }

     /// A vector of scalars.
     ValueList Scalars;
@@ -3334,9 +3337,9 @@ private:
     /// reordering of operands during buildTree_rec() and vectorizeTree().
     SmallVector<ValueList, 2> Operands;

-    /// The main/alternate instruction.
-    Instruction *MainOp = nullptr;
-    Instruction *AltOp = nullptr;
+    /// MainOp and AltOp are recorded inside. S should be obtained from
+    /// newTreeEntry.
+    InstructionsState S = InstructionsState::invalid();

     /// Interleaving factor for interleaved loads Vectorize nodes.
     unsigned InterleaveFactor = 0;
@@ -3360,10 +3363,10 @@ private:

     /// Set this bundle's operand from Scalars.
     void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
-      VLOperands Ops(Scalars, MainOp, R);
+      VLOperands Ops(Scalars, S, R);
       if (RequireReorder)
         Ops.reorder();
-      for (unsigned I : seq<unsigned>(MainOp->getNumOperands()))
+      for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands()))
         setOperand(I, Ops.getVL(I));
     }

@@ -3396,13 +3399,9 @@ private:
     }

     /// Some of the instructions in the list have alternate opcodes.
-    bool isAltShuffle() const { return MainOp != AltOp; }
+    bool isAltShuffle() const { return S.isAltShuffle(); }

-    bool isOpcodeOrAlt(Instruction *I) const {
-      unsigned CheckedOpcode = I->getOpcode();
-      return (getOpcode() == CheckedOpcode ||
-              getAltOpcode() == CheckedOpcode);
-    }
+    bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); }

     /// Chooses the correct key for scheduling data. If \p Op has the same (or
     /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
@@ -3411,31 +3410,24 @@ private:
       auto *I = dyn_cast<Instruction>(Op);
       if (I && isOpcodeOrAlt(I))
         return Op;
-      return MainOp;
+      return S.getMainOp();
     }

     void setOperations(const InstructionsState &S) {
       assert(S && "InstructionsState is invalid.");
-      MainOp = S.getMainOp();
-      AltOp = S.getAltOp();
+      this->S = S;
     }

-    Instruction *getMainOp() const {
-      return MainOp;
-    }
+    Instruction *getMainOp() const { return S.getMainOp(); }

-    Instruction *getAltOp() const {
-      return AltOp;
-    }
+    Instruction *getAltOp() const { return S.getAltOp(); }

     /// The main/alternate opcodes for the list of instructions.
-    unsigned getOpcode() const {
-      return MainOp ? MainOp->getOpcode() : 0;
-    }
+    unsigned getOpcode() const { return S.getOpcode(); }

-    unsigned getAltOpcode() const {
-      return AltOp ? AltOp->getOpcode() : 0;
-    }
+    unsigned getAltOpcode() const { return S.getAltOpcode(); }
+
+    bool hasState() const { return S.valid(); }

     /// When ReuseReorderShuffleIndices is empty it just returns position of \p
     /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
@@ -3531,16 +3523,13 @@ private:
         dbgs() << "CombinedVectorize\n";
         break;
       }
-      dbgs() << "MainOp: ";
-      if (MainOp)
-        dbgs() << *MainOp << "\n";
-      else
-        dbgs() << "NULL\n";
-      dbgs() << "AltOp: ";
-      if (AltOp)
-        dbgs() << *AltOp << "\n";
-      else
-        dbgs() << "NULL\n";
+      if (S) {
+        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
+        dbgs() << "AltOp: " << *S.getAltOp() << "\n";
+      } else {
+        dbgs() << "MainOp: NULL\n";
+        dbgs() << "AltOp: NULL\n";
+      }
       dbgs() << "VectorizedValue: ";
       if (VectorizedValue)
         dbgs() << *VectorizedValue << "\n";
@@ -3715,9 +3704,13 @@ private:
   }
 #endif

-  TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
+  TreeEntry *getTreeEntry(Value *V) {
+    assert(V && "V cannot be nullptr.");
+    return ScalarToTreeEntry.lookup(V);
+  }

   const TreeEntry *getTreeEntry(Value *V) const {
+    assert(V && "V cannot be nullptr.");
     return ScalarToTreeEntry.lookup(V);
   }
@@ -5615,7 +5608,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
     // Try build correct order for extractelement instructions.
     SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
                                 TE.ReuseShuffleIndices.end());
-    if (TE.getOpcode() == Instruction::ExtractElement &&
+    if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
         all_of(TE.Scalars, [Sz](Value *V) {
           if (isa<PoisonValue>(V))
             return true;
@@ -5777,10 +5770,11 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
       return std::nullopt; // No need to reorder.
     return std::move(Phis);
   }
-  if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) {
+  if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) &&
+      allSameType(TE.Scalars)) {
     // TODO: add analysis of other gather nodes with extractelement
     // instructions and other values/instructions, not only undefs.
-    if ((TE.getOpcode() == Instruction::ExtractElement ||
+    if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
         (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
          any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
        all_of(TE.Scalars, [](Value *V) {
@@ -5790,8 +5784,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
       // Check that gather of extractelements can be represented as
       // just a shuffle of a single vector.
       OrdersType CurrentOrder;
-      bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
-                                   /*ResizeAllowed=*/true);
+      bool Reuse =
+          canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
       if (Reuse || !CurrentOrder.empty())
         return std::move(CurrentOrder);
     }
@@ -5840,7 +5834,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
     return Order;
   // Check if can include the order of vectorized loads. For masked gathers do
   // extra analysis later, so include such nodes into a special list.
-  if (TE.isGather() && TE.getOpcode() == Instruction::Load) {
+  if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
     SmallVector<Value *> PointerOps;
     OrdersType CurrentOrder;
     LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
@@ -5955,7 +5949,7 @@ void BoUpSLP::reorderTopToBottom() {
     // Patterns like [fadd,fsub] can be combined into a single instruction in
     // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
     // to take into account their order when looking for the most used order.
-    if (TE->isAltShuffle()) {
+    if (TE->hasState() && TE->isAltShuffle()) {
       VectorType *VecTy =
           getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
       unsigned Opcode0 = TE->getOpcode();
@@ -6034,7 +6028,7 @@
         if (It != GathersToOrders.end())
           return It->second;
       }
-      if (OpTE->isAltShuffle()) {
+      if (OpTE->hasState() && OpTE->isAltShuffle()) {
         auto It = AltShufflesToOrders.find(OpTE);
         if (It != AltShufflesToOrders.end())
           return It->second;
@@ -7637,7 +7631,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
   }
   case Instruction::ExtractValue:
   case Instruction::ExtractElement: {
-    bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
+    bool Reuse = canReuseExtract(VL, CurrentOrder);
     // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
     if (!has_single_bit(VL.size()))
       return TreeEntry::NeedToGather;
@@ -8657,7 +8651,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                  TE->dump());

       ValueList Left, Right;
-      VLOperands Ops(VL, VL0, *this);
+      VLOperands Ops(VL, S, *this);
       if (cast<CmpInst>(VL0)->isCommutative()) {
         // Commutative predicate - collect + sort operands of the instructions
         // so that each side is more likely to have the same opcode.
@@ -8925,7 +8919,7 @@ unsigned BoUpSLP::canMapToVector(Type *T) const {
   return N;
 }

-bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
+bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
                               SmallVectorImpl<unsigned> &CurrentOrder,
                               bool ResizeAllowed) const {
   const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
@@ -9579,7 +9573,7 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {

   // Do not reorder nodes if it small (just 2 elements), all-constant or all
   // instructions have same opcode already.
-  if (TE.Scalars.size() == 2 || (TE.getOpcode() && !TE.isAltShuffle()) ||
+  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
       all_of(TE.Scalars, isConstant))
     return;

@@ -9798,7 +9792,7 @@ void BoUpSLP::transformNodes() {
     // Do not try partial vectorization for small nodes (<= 2), nodes with the
     // same opcode and same parent block or all constants.
     if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
-        !(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
+        !(!E.hasState() || E.getOpcode() == Instruction::Load ||
           E.isAltShuffle() || !allSameBlock(VL)) ||
         allConstant(VL) || isSplat(VL))
       continue;
@@ -9921,6 +9915,7 @@
           buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
           if (PrevSize + 1 == VectorizableTree.size() &&
               VectorizableTree[PrevSize]->isGather() &&
+              VectorizableTree[PrevSize]->hasState() &&
               VectorizableTree[PrevSize]->getOpcode() !=
                   Instruction::ExtractElement &&
               !isSplat(Slice)) {
@@ -9941,6 +9936,8 @@
         E.ReorderIndices.clear();
       }
     }
+    if (!E.hasState())
+      continue;
     switch (E.getOpcode()) {
     case Instruction::Load: {
       // No need to reorder masked gather loads, just reorder the scalar
@@ -10044,7 +10041,7 @@
   if (LoadEntriesToVectorize.empty()) {
     // Single load node - exit.
-    if (VectorizableTree.size() <= 1 &&
+    if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
         VectorizableTree.front()->getOpcode() == Instruction::Load)
       return;
     // Small graph with small VF - exit.
@@ -10060,7 +10057,7 @@
         getCanonicalGraphSize() <= SmallTree &&
         count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
                  [](const std::unique_ptr<TreeEntry> &TE) {
-                   return TE->isGather() &&
+                   return TE->isGather() && TE->hasState() &&
                           TE->getOpcode() == Instruction::Load &&
                           !allSameBlock(TE->Scalars);
                 }) == 1)
@@ -10076,13 +10073,13 @@
   for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
     TreeEntry &E = *TE;
     if (E.isGather() &&
-        (E.getOpcode() == Instruction::Load ||
-         (!E.getOpcode() && any_of(E.Scalars,
+        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
+         (!E.hasState() && any_of(E.Scalars,
                                    [&](Value *V) {
                                      return isa<LoadInst>(V) &&
                                             !isVectorized(V) &&
                                             !isDeleted(cast<Instruction>(V));
                                    }))) &&
         !isSplat(E.Scalars)) {
       for (Value *V : E.Scalars) {
         auto *LI = dyn_cast<LoadInst>(V);
@@ -10676,7 +10673,7 @@ public:
       bool PrevNodeFound = any_of(
           ArrayRef(R.VectorizableTree).take_front(E->Idx),
           [&](const std::unique_ptr<TreeEntry> &TE) {
-            return ((!TE->isAltShuffle() &&
+            return ((TE->hasState() && !TE->isAltShuffle() &&
                      TE->getOpcode() == Instruction::ExtractElement) ||
                     TE->isGather()) &&
                    all_of(enumerate(TE->Scalars), [&](auto &&Data) {
@@ -11801,7 +11798,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
       if (TE.get() == E)
         break;
-      if (TE->isAltShuffle() &&
+      if (TE->hasState() && TE->isAltShuffle() &&
           ((TE->getOpcode() == E->getOpcode() &&
             TE->getAltOpcode() == E->getAltOpcode()) ||
            (TE->getOpcode() == E->getAltOpcode() &&
@@ -11963,10 +11960,12 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
                   [this](Value *V) { return EphValues.contains(V); }) &&
            (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
             TE->Scalars.size() < Limit ||
-            ((TE->getOpcode() == Instruction::ExtractElement ||
+            (((TE->hasState() &&
+               TE->getOpcode() == Instruction::ExtractElement) ||
               all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
              isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
-            (TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()) ||
+            (TE->hasState() && TE->getOpcode() == Instruction::Load &&
+             !TE->isAltShuffle()) ||
             any_of(TE->Scalars, IsaPred<LoadInst>));
   };

@@ -12095,9 +12094,10 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
       !VectorizableTree.empty() &&
       all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
         return (TE->isGather() &&
-                TE->getOpcode() != Instruction::ExtractElement &&
+                (!TE->hasState() ||
+                 TE->getOpcode() != Instruction::ExtractElement) &&
                 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
-               TE->getOpcode() == Instruction::PHI;
+               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
       }))
     return true;

@@ -12115,7 +12115,7 @@
   // somewhere.
   bool IsAllowedSingleBVNode =
       VectorizableTree.size() > 1 ||
-      (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
+      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
@@ -12131,6 +12131,7 @@
     return false;

   if (VectorizableTree.back()->isGather() &&
+      VectorizableTree.back()->hasState() &&
       VectorizableTree.back()->isAltShuffle() &&
       VectorizableTree.back()->getVectorFactor() > 2 &&
       allSameBlock(VectorizableTree.back()->Scalars) &&
@@ -12155,7 +12156,7 @@ bool BoUpSLP::isTreeNotExtendable() const {
       getCanonicalGraphSize() <= SmallTree &&
       count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
                [](const std::unique_ptr<TreeEntry> &TE) {
-                 return TE->isGather() &&
+                 return TE->isGather() && TE->hasState() &&
                         TE->getOpcode() == Instruction::Load &&
                         !allSameBlock(TE->Scalars);
               }) == 1)
@@ -12167,7 +12168,7 @@
     TreeEntry &E = *VectorizableTree[Idx];
     if (!E.isGather())
       continue;
-    if (E.getOpcode() && E.getOpcode() != Instruction::Load)
+    if (E.hasState() && E.getOpcode() != Instruction::Load)
       return false;
     if (isSplat(E.Scalars) || allConstant(E.Scalars))
       continue;
@@ -12477,7 +12478,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
                  TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
       continue;
     }
-    if (TE.isGather()) {
+    if (TE.isGather() && TE.hasState()) {
       if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
           E && E->getVectorFactor() == TE.getVectorFactor() &&
           E->isSame(TE.Scalars)) {
@@ -13626,9 +13627,11 @@ BoUpSLP::isGatherShuffledEntry(
     if (!TE->UserTreeIndices.empty() &&
         TE->UserTreeIndices.front().UserTE->isGather() &&
         TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
-      assert((TE->Idx == 0 || TE->getOpcode() == Instruction::ExtractElement ||
-              isSplat(TE->Scalars)) &&
-             "Expected splat or extractelements only node.");
+      assert(
+          (TE->Idx == 0 ||
+           (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
+           isSplat(TE->Scalars)) &&
+          "Expected splat or extractelements only node.");
       return {};
     }
     unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
@@ -14921,14 +14924,15 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
       }
     }
     // Gather extracts after we check for full matched gathers only.
-    if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
-        ((E->getOpcode() == Instruction::Load ||
+    if (!ExtractShuffles.empty() || !E->hasState() ||
+        E->getOpcode() != Instruction::Load ||
+        (((E->hasState() && E->getOpcode() == Instruction::Load) ||
           any_of(E->Scalars, IsaPred<LoadInst>)) &&
         any_of(E->Scalars,
                [this](Value *V) {
                  return isa<LoadInst>(V) && getTreeEntry(V);
                })) ||
-        E->isAltShuffle() ||
+        (E->hasState() && E->isAltShuffle()) ||
         all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
         isSplat(E->Scalars) ||
         (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
@@ -15308,7 +15312,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
   auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
   if (E->isGather()) {
     // Set insert point for non-reduction initial nodes.
-    if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
+    if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
       setInsertPointAfterBundle(E);
     Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
     E->VectorizedValue = Vec;
@@ -18153,8 +18157,9 @@ static RecurKind getRdxKind(Value *V);
 void BoUpSLP::computeMinimumValueSizes() {
   // We only attempt to truncate integer expressions.
   bool IsStoreOrInsertElt =
-      VectorizableTree.front()->getOpcode() == Instruction::Store ||
-      VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
+      VectorizableTree.front()->hasState() &&
+      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
+       VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
   if ((IsStoreOrInsertElt || UserIgnoreList) &&
       ExtraBitWidthNodes.size() <= 1 &&
       (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
@@ -18195,10 +18200,9 @@ void BoUpSLP::computeMinimumValueSizes() {
     return;

   SmallVector<unsigned> ToDemote;
-  auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
-                                bool IsProfitableToDemoteRoot, unsigned Opcode,
-                                unsigned Limit, bool IsTruncRoot,
-                                bool IsSignedCmp) -> unsigned {
+  auto ComputeMaxBitWidth =
+      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
+          unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
     ToDemote.clear();
     // Check if the root is trunc and the next node is gather/buildvector, then
     // keep trunc in scalars, which is free in most cases.
@@ -18239,11 +18243,14 @@
       return MaxBitWidth;
     }

+    if (!E.hasState())
+      return 0u;
+
     unsigned VF = E.getVectorFactor();
     Type *ScalarTy = E.Scalars.front()->getType();
     unsigned ScalarTyNumElements = getNumElements(ScalarTy);
     auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
-    if (!TreeRootIT || !Opcode)
+    if (!TreeRootIT)
       return 0u;

     if (any_of(E.Scalars,
@@ -18315,6 +18322,7 @@
             IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
       return 0u;

+    unsigned Opcode = E.getOpcode();
     bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                 Opcode == Instruction::SExt ||
                                 Opcode == Instruction::ZExt || NumParts > 1;
@@ -18395,15 +18403,14 @@
   while (NodeIdx < VectorizableTree.size()) {
     ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
     unsigned Limit = 2;
-    unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
     if (IsTopRoot &&
         ReductionBitWidth ==
             DL->getTypeSizeInBits(
                 VectorizableTree.front()->Scalars.front()->getType()))
       Limit = 3;
     unsigned MaxBitWidth = ComputeMaxBitWidth(
-        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode,
-        Limit, IsTruncRoot, IsSignedCmp);
+        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
+        IsTruncRoot, IsSignedCmp);
     if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
       if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
         ReductionBitWidth = bit_ceil(MaxBitWidth);
@@ -18446,19 +18453,21 @@
         });
     IsSignedCmp =
         NodeIdx < VectorizableTree.size() &&
-        any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
-               [&](const EdgeInfo &EI) {
-                 return EI.UserTE->getOpcode() == Instruction::ICmp &&
-                        any_of(EI.UserTE->Scalars, [&](Value *V) {
-                          auto *IC = dyn_cast<ICmpInst>(V);
-                          return IC &&
-                                 (IC->isSigned() ||
-                                  !isKnownNonNegative(IC->getOperand(0),
-                                                      SimplifyQuery(*DL)) ||
-                                  !isKnownNonNegative(IC->getOperand(1),
-                                                      SimplifyQuery(*DL)));
-                        });
-               });
+        any_of(
+            VectorizableTree[NodeIdx]->UserTreeIndices,
+            [&](const EdgeInfo &EI) {
+              return (EI.UserTE->hasState() &&
+                      EI.UserTE->getOpcode() == Instruction::ICmp) &&
+                     any_of(EI.UserTE->Scalars, [&](Value *V) {
+                       auto *IC = dyn_cast<ICmpInst>(V);
+                       return IC &&
+                              (IC->isSigned() ||
+                               !isKnownNonNegative(IC->getOperand(0),
+                                                   SimplifyQuery(*DL)) ||
+                               !isKnownNonNegative(IC->getOperand(1),
+                                                   SimplifyQuery(*DL)));
+                     });
+            });
   }

   // If the maximum bit width we compute is less than the width of the roots'

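A recurring caller-side pattern in the hunks above: a gather TreeEntry may carry no InstructionsState at all, so hasState() must short-circuit before any opcode query. Below is a small usage sketch in terms of the stand-in types from the top of this page, where LoadOpcode is a hypothetical constant standing in for Instruction::Load.

// Checks whether a gather node is a gathered-load node. hasState() must be
// evaluated first; calling getOpcode() on a state-less entry is no longer valid.
constexpr unsigned LoadOpcode = 1; // hypothetical stand-in value

bool isGatheredLoadNode(const TreeEntry &TE) {
  return TE.hasState() && TE.getOpcode() == LoadOpcode;
}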

@@ -0,0 +1,64 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -S %s | FileCheck %s
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "aarch64-unknown-linux-gnu"
define void @foo(ptr %0) {
; CHECK-LABEL: @foo(
; CHECK-NEXT: vector.scevcheck:
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[TMP0:%.*]], i64 4
; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr null, i64 4
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x ptr> [[TMP1]], ptr [[SCEVGEP]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x ptr> [[TMP2]], <4 x ptr> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult <4 x ptr> [[TMP3]], zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i1> [[TMP4]], zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[SCEVGEP3]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
; CHECK-NEXT: [[TMP9:%.*]] = icmp ult <4 x ptr> [[TMP8]], zeroinitializer
; CHECK-NEXT: [[TMP10:%.*]] = and <4 x i1> [[TMP9]], zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]])
; CHECK-NEXT: [[OP_RDX:%.*]] = or i1 [[TMP11]], [[TMP12]]
; CHECK-NEXT: br i1 [[OP_RDX]], label [[DOTLR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: ret void
; CHECK: .lr.ph:
; CHECK-NEXT: ret void
;
vector.scevcheck:
%scevgep = getelementptr i8, ptr %0, i64 4
%scevgep3 = getelementptr i8, ptr null, i64 4
%bound011 = icmp ult ptr %scevgep, null
%found.conflict13 = and i1 %bound011, false
%bound014 = icmp ult ptr %scevgep, null
%found.conflict16 = and i1 %bound014, false
%conflict.rdx17 = or i1 %found.conflict13, %found.conflict16
%bound018 = icmp ult ptr %scevgep, null
%found.conflict20 = and i1 %bound018, false
%conflict.rdx21 = or i1 %conflict.rdx17, %found.conflict20
%bound022 = icmp ult ptr %0, null
%found.conflict24 = and i1 %bound022, false
%conflict.rdx25 = or i1 %conflict.rdx21, %found.conflict24
%bound026 = icmp ult ptr %0, null
%found.conflict28 = and i1 %bound026, false
%conflict.rdx29 = or i1 %conflict.rdx25, %found.conflict28
%bound030 = icmp ult ptr %0, null
%found.conflict32 = and i1 %bound030, false
%conflict.rdx33 = or i1 %conflict.rdx29, %found.conflict32
%bound034 = icmp ult ptr %0, null
%found.conflict36 = and i1 %bound034, false
%conflict.rdx37 = or i1 %conflict.rdx33, %found.conflict36
%bound038 = icmp ult ptr %scevgep3, null
%found.conflict40 = and i1 %bound038, false
%conflict.rdx41 = or i1 %conflict.rdx37, %found.conflict40
br i1 %conflict.rdx41, label %.lr.ph, label %vector.ph
vector.ph: ; preds = %vector.scevcheck
ret void
.lr.ph: ; preds = %vector.scevcheck
ret void
}


@@ -0,0 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -S %s | FileCheck %s
define i32 @test(i32 %minlib) {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[MUL2_I306:%.*]] = mul i32 [[MINLIB:%.*]], [[MINLIB]]
; CHECK-NEXT: [[MUL3_I307:%.*]] = mul i32 [[MUL2_I306]], [[MINLIB]]
; CHECK-NEXT: [[CMP183:%.*]] = icmp sgt i32 [[MUL3_I307]], 0
; CHECK-NEXT: ret i32 0
;
entry:
%mul2.i306 = mul i32 %minlib, %minlib
%mul3.i307 = mul i32 %mul2.i306, %minlib
%cmp183 = icmp sgt i32 %mul3.i307, 0
ret i32 0
}


@@ -0,0 +1,25 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -S %s | FileCheck %s
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
define void @_Z4blurN6Halide5Tools5ImageItEE(i1 %0, i1 %1, i1 %ident.check, i1 %ident.check56) {
; CHECK-LABEL: @_Z4blurN6Halide5Tools5ImageItEE(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP2:%.*]] = or i1 [[TMP0:%.*]], [[TMP1:%.*]]
; CHECK-NEXT: [[TMP3:%.*]] = or i1 [[IDENT_CHECK:%.*]], [[IDENT_CHECK56:%.*]]
; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP3]], [[TMP2]]
; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_BODY6_US_I_I:%.*]], label [[FOR_BODY6_US_I_I]]
; CHECK: for.body6.us.i.i:
; CHECK-NEXT: ret void
;
entry:
%2 = or i1 %0, %1
%3 = or i1 %ident.check, %ident.check56
%4 = or i1 %3, %2
br i1 %4, label %for.body6.us.i.i, label %for.body6.us.i.i
for.body6.us.i.i: ; preds = %entry, %entry
ret void
}