[SLP] Replace MainOp and AltOp in TreeEntry with InstructionsState. (#122443)

Add TreeEntry::hasState. Add assert for getTreeEntry. Remove the OpValue parameter from the canReuseExtract function. Remove the Opcode parameter from the ComputeMaxBitWidth lambda function.
2025-01-18 10:23:20 +08:00 · 2025-01-18 10:23:20 +08:00 · 07d496538f
commit 07d496538f
parent 5b6a26ccdd
4 changed files with 225 additions and 110 deletions
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@ -2414,15 +2414,17 @@ public:
    }

    /// Go through the instructions in VL and append their operands.
-    void appendOperandsOfVL(ArrayRef<Value *> VL, Instruction *VL0) {
+    void appendOperandsOfVL(ArrayRef<Value *> VL, const InstructionsState &S) {
      assert(!VL.empty() && "Bad VL");
      assert((empty() || VL.size() == getNumLanes()) &&
             "Expected same number of lanes");
+      assert(S.valid() && "InstructionsState is invalid.");
      // IntrinsicInst::isCommutative returns true if swapping the first "two"
      // arguments to the intrinsic produces the same result.
      constexpr unsigned IntrinsicNumOperands = 2;
-      unsigned NumOperands = VL0->getNumOperands();
-      ArgSize = isa<IntrinsicInst>(VL0) ? IntrinsicNumOperands : NumOperands;
+      Instruction *MainOp = S.getMainOp();
+      unsigned NumOperands = MainOp->getNumOperands();
+      ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
      OpsVec.resize(NumOperands);
      unsigned NumLanes = VL.size();
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
@ -2441,19 +2443,19 @@ public:
          // operations or alternating sequences (e.g., +, -), we can safely
          // tell the inverse operations by checking commutativity.
          if (isa<PoisonValue>(VL[Lane])) {
-            if (auto *EI = dyn_cast<ExtractElementInst>(VL0)) {
+            if (auto *EI = dyn_cast<ExtractElementInst>(MainOp)) {
              if (OpIdx == 0) {
                OpsVec[OpIdx][Lane] = {EI->getVectorOperand(), true, false};
                continue;
              }
-            } else if (auto *EV = dyn_cast<ExtractValueInst>(VL0)) {
+            } else if (auto *EV = dyn_cast<ExtractValueInst>(MainOp)) {
              if (OpIdx == 0) {
                OpsVec[OpIdx][Lane] = {EV->getAggregateOperand(), true, false};
                continue;
              }
            }
            OpsVec[OpIdx][Lane] = {
-                PoisonValue::get(VL0->getOperand(OpIdx)->getType()), true,
+                PoisonValue::get(MainOp->getOperand(OpIdx)->getType()), true,
                false};
            continue;
          }
@ -2566,11 +2568,12 @@ public:

  public:
    /// Initialize with all the operands of the instruction vector \p RootVL.
-    VLOperands(ArrayRef<Value *> RootVL, Instruction *VL0, const BoUpSLP &R)
+    VLOperands(ArrayRef<Value *> RootVL, const InstructionsState &S,
+               const BoUpSLP &R)
        : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
-          L(R.LI->getLoopFor((VL0->getParent()))) {
+          L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
      // Append all the operands of RootVL.
-      appendOperandsOfVL(RootVL, VL0);
+      appendOperandsOfVL(RootVL, S);
    }

    /// \Returns a value vector with the operands across all lanes for the
@ -3043,7 +3046,7 @@ private:
  /// non-identity permutation that allows to reuse extract instructions.
  /// \param ResizeAllowed indicates whether it is allowed to handle subvector
  /// extract order.
-  bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
+  bool canReuseExtract(ArrayRef<Value *> VL,
                       SmallVectorImpl<unsigned> &CurrentOrder,
                       bool ResizeAllowed = false) const;

@ -3270,7 +3273,7 @@ private:
    };

    /// Checks if the current node is a gather node.
-    bool isGather() const {return State == NeedToGather; }
+    bool isGather() const { return State == NeedToGather; }

    /// A vector of scalars.
    ValueList Scalars;
@ -3334,9 +3337,9 @@ private:
    /// reordering of operands during buildTree_rec() and vectorizeTree().
    SmallVector<ValueList, 2> Operands;

-    /// The main/alternate instruction.
-    Instruction *MainOp = nullptr;
-    Instruction *AltOp = nullptr;
+    /// MainOp and AltOp are recorded inside. S should be obtained from
+    /// newTreeEntry.
+    InstructionsState S = InstructionsState::invalid();

    /// Interleaving factor for interleaved loads Vectorize nodes.
    unsigned InterleaveFactor = 0;
@ -3360,10 +3363,10 @@ private:

    /// Set this bundle's operand from Scalars.
    void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
-      VLOperands Ops(Scalars, MainOp, R);
+      VLOperands Ops(Scalars, S, R);
      if (RequireReorder)
        Ops.reorder();
-      for (unsigned I : seq<unsigned>(MainOp->getNumOperands()))
+      for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands()))
        setOperand(I, Ops.getVL(I));
    }

@ -3396,13 +3399,9 @@ private:
    }

    /// Some of the instructions in the list have alternate opcodes.
-    bool isAltShuffle() const { return MainOp != AltOp; }
+    bool isAltShuffle() const { return S.isAltShuffle(); }

-    bool isOpcodeOrAlt(Instruction *I) const {
-      unsigned CheckedOpcode = I->getOpcode();
-      return (getOpcode() == CheckedOpcode ||
-              getAltOpcode() == CheckedOpcode);
-    }
+    bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); }

    /// Chooses the correct key for scheduling data. If \p Op has the same (or
    /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
@ -3411,31 +3410,24 @@ private:
      auto *I = dyn_cast<Instruction>(Op);
      if (I && isOpcodeOrAlt(I))
        return Op;
-      return MainOp;
+      return S.getMainOp();
    }

    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");
-      MainOp = S.getMainOp();
-      AltOp = S.getAltOp();
+      this->S = S;
    }

-    Instruction *getMainOp() const {
-      return MainOp;
-    }
+    Instruction *getMainOp() const { return S.getMainOp(); }

-    Instruction *getAltOp() const {
-      return AltOp;
-    }
+    Instruction *getAltOp() const { return S.getAltOp(); }

    /// The main/alternate opcodes for the list of instructions.
-    unsigned getOpcode() const {
-      return MainOp ? MainOp->getOpcode() : 0;
-    }
+    unsigned getOpcode() const { return S.getOpcode(); }

-    unsigned getAltOpcode() const {
-      return AltOp ? AltOp->getOpcode() : 0;
-    }
+    unsigned getAltOpcode() const { return S.getAltOpcode(); }
+
+    bool hasState() const { return S.valid(); }

    /// When ReuseReorderShuffleIndices is empty it just returns position of \p
    /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
@ -3531,16 +3523,13 @@ private:
        dbgs() << "CombinedVectorize\n";
        break;
      }
-      dbgs() << "MainOp: ";
-      if (MainOp)
-        dbgs() << *MainOp << "\n";
-      else
-        dbgs() << "NULL\n";
-      dbgs() << "AltOp: ";
-      if (AltOp)
-        dbgs() << *AltOp << "\n";
-      else
-        dbgs() << "NULL\n";
+      if (S) {
+        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
+        dbgs() << "AltOp: " << *S.getAltOp() << "\n";
+      } else {
+        dbgs() << "MainOp: NULL\n";
+        dbgs() << "AltOp: NULL\n";
+      }
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
@ -3715,9 +3704,13 @@ private:
  }
 #endif

-  TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
+  TreeEntry *getTreeEntry(Value *V) {
+    assert(V && "V cannot be nullptr.");
+    return ScalarToTreeEntry.lookup(V);
+  }

  const TreeEntry *getTreeEntry(Value *V) const {
+    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntry.lookup(V);
  }

@ -5615,7 +5608,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
    // Try build correct order for extractelement instructions.
    SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
                                TE.ReuseShuffleIndices.end());
-    if (TE.getOpcode() == Instruction::ExtractElement &&
+    if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
        all_of(TE.Scalars, [Sz](Value *V) {
          if (isa<PoisonValue>(V))
            return true;
@ -5777,10 +5770,11 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
      return std::nullopt; // No need to reorder.
    return std::move(Phis);
  }
-  if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) {
+  if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) &&
+      allSameType(TE.Scalars)) {
    // TODO: add analysis of other gather nodes with extractelement
    // instructions and other values/instructions, not only undefs.
-    if ((TE.getOpcode() == Instruction::ExtractElement ||
+    if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
         (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
          any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
        all_of(TE.Scalars, [](Value *V) {
@ -5790,8 +5784,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
      // Check that gather of extractelements can be represented as
      // just a shuffle of a single vector.
      OrdersType CurrentOrder;
-      bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
-                                   /*ResizeAllowed=*/true);
+      bool Reuse =
+          canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
      if (Reuse || !CurrentOrder.empty())
        return std::move(CurrentOrder);
    }
@ -5840,7 +5834,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
        return Order;
    // Check if can include the order of vectorized loads. For masked gathers do
    // extra analysis later, so include such nodes into a special list.
-    if (TE.isGather() && TE.getOpcode() == Instruction::Load) {
+    if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
      SmallVector<Value *> PointerOps;
      OrdersType CurrentOrder;
      LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
@ -5955,7 +5949,7 @@ void BoUpSLP::reorderTopToBottom() {
    // Patterns like [fadd,fsub] can be combined into a single instruction in
    // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
    // to take into account their order when looking for the most used order.
-    if (TE->isAltShuffle()) {
+    if (TE->hasState() && TE->isAltShuffle()) {
      VectorType *VecTy =
          getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
      unsigned Opcode0 = TE->getOpcode();
@ -6034,7 +6028,7 @@ void BoUpSLP::reorderTopToBottom() {
          if (It != GathersToOrders.end())
            return It->second;
        }
-        if (OpTE->isAltShuffle()) {
+        if (OpTE->hasState() && OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(OpTE);
          if (It != AltShufflesToOrders.end())
            return It->second;
@ -7637,7 +7631,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
-    bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
+    bool Reuse = canReuseExtract(VL, CurrentOrder);
    // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
    if (!has_single_bit(VL.size()))
      return TreeEntry::NeedToGather;
@ -8657,7 +8651,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                 TE->dump());

      ValueList Left, Right;
-      VLOperands Ops(VL, VL0, *this);
+      VLOperands Ops(VL, S, *this);
      if (cast<CmpInst>(VL0)->isCommutative()) {
        // Commutative predicate - collect + sort operands of the instructions
        // so that each side is more likely to have the same opcode.
@ -8925,7 +8919,7 @@ unsigned BoUpSLP::canMapToVector(Type *T) const {
  return N;
 }

-bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
+bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
                              SmallVectorImpl<unsigned> &CurrentOrder,
                              bool ResizeAllowed) const {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
@ -9579,7 +9573,7 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {

  // Do not reorder nodes if it small (just 2 elements), all-constant or all
  // instructions have same opcode already.
-  if (TE.Scalars.size() == 2 || (TE.getOpcode() && !TE.isAltShuffle()) ||
+  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
      all_of(TE.Scalars, isConstant))
    return;

@ -9798,7 +9792,7 @@ void BoUpSLP::transformNodes() {
      // Do not try partial vectorization for small nodes (<= 2), nodes with the
      // same opcode and same parent block or all constants.
      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
-          !(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
+          !(!E.hasState() || E.getOpcode() == Instruction::Load ||
            E.isAltShuffle() || !allSameBlock(VL)) ||
          allConstant(VL) || isSplat(VL))
        continue;
@ -9921,6 +9915,7 @@ void BoUpSLP::transformNodes() {
          buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
          if (PrevSize + 1 == VectorizableTree.size() &&
              VectorizableTree[PrevSize]->isGather() &&
+              VectorizableTree[PrevSize]->hasState() &&
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&
              !isSplat(Slice)) {
@ -9941,6 +9936,8 @@ void BoUpSLP::transformNodes() {
        E.ReorderIndices.clear();
      }
    }
+    if (!E.hasState())
+      continue;
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // No need to reorder masked gather loads, just reorder the scalar
@ -10044,7 +10041,7 @@ void BoUpSLP::transformNodes() {

  if (LoadEntriesToVectorize.empty()) {
    // Single load node - exit.
-    if (VectorizableTree.size() <= 1 &&
+    if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Load)
      return;
    // Small graph with small VF - exit.
@ -10060,7 +10057,7 @@ void BoUpSLP::transformNodes() {
        getCanonicalGraphSize() <= SmallTree &&
        count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
                 [](const std::unique_ptr<TreeEntry> &TE) {
-                   return TE->isGather() &&
+                   return TE->isGather() && TE->hasState() &&
                          TE->getOpcode() == Instruction::Load &&
                          !allSameBlock(TE->Scalars);
                 }) == 1)
@ -10076,13 +10073,13 @@ void BoUpSLP::transformNodes() {
  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    if (E.isGather() &&
-        (E.getOpcode() == Instruction::Load ||
-         (!E.getOpcode() && any_of(E.Scalars,
-                                   [&](Value *V) {
-                                     return isa<LoadInst>(V) &&
-                                            !isVectorized(V) &&
-                                            !isDeleted(cast<Instruction>(V));
-                                   }))) &&
+        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
+         (!E.hasState() && any_of(E.Scalars,
+                                  [&](Value *V) {
+                                    return isa<LoadInst>(V) &&
+                                           !isVectorized(V) &&
+                                           !isDeleted(cast<Instruction>(V));
+                                  }))) &&
        !isSplat(E.Scalars)) {
      for (Value *V : E.Scalars) {
        auto *LI = dyn_cast<LoadInst>(V);
@ -10676,7 +10673,7 @@ public:
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
-          return ((!TE->isAltShuffle() &&
+          return ((TE->hasState() && !TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  TE->isGather()) &&
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
@ -11801,7 +11798,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        if (TE.get() == E)
          break;
-        if (TE->isAltShuffle() &&
+        if (TE->hasState() && TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
@ -11963,10 +11960,12 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
                   [this](Value *V) { return EphValues.contains(V); }) &&
           (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
            TE->Scalars.size() < Limit ||
-            ((TE->getOpcode() == Instruction::ExtractElement ||
+            (((TE->hasState() &&
+               TE->getOpcode() == Instruction::ExtractElement) ||
              all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
             isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
-            (TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()) ||
+            (TE->hasState() && TE->getOpcode() == Instruction::Load &&
+             !TE->isAltShuffle()) ||
            any_of(TE->Scalars, IsaPred<LoadInst>));
  };

@ -12095,9 +12094,10 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
-                TE->getOpcode() != Instruction::ExtractElement &&
+                (!TE->hasState() ||
+                 TE->getOpcode() != Instruction::ExtractElement) &&
                count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
-               TE->getOpcode() == Instruction::PHI;
+               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
      }))
    return true;

@ -12115,7 +12115,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
  // somewhere.
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
-      (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
+      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
@ -12131,6 +12131,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
    return false;

  if (VectorizableTree.back()->isGather() &&
+      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      allSameBlock(VectorizableTree.back()->Scalars) &&
@ -12155,7 +12156,7 @@ bool BoUpSLP::isTreeNotExtendable() const {
        getCanonicalGraphSize() <= SmallTree &&
        count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
                 [](const std::unique_ptr<TreeEntry> &TE) {
-                   return TE->isGather() &&
+                   return TE->isGather() && TE->hasState() &&
                          TE->getOpcode() == Instruction::Load &&
                          !allSameBlock(TE->Scalars);
                 }) == 1)
@ -12167,7 +12168,7 @@ bool BoUpSLP::isTreeNotExtendable() const {
    TreeEntry &E = *VectorizableTree[Idx];
    if (!E.isGather())
      continue;
-    if (E.getOpcode() && E.getOpcode() != Instruction::Load)
+    if (E.hasState() && E.getOpcode() != Instruction::Load)
      return false;
    if (isSplat(E.Scalars) || allConstant(E.Scalars))
      continue;
@ -12477,7 +12478,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
          TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      continue;
    }
-    if (TE.isGather()) {
+    if (TE.isGather() && TE.hasState()) {
      if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
          E && E->getVectorFactor() == TE.getVectorFactor() &&
          E->isSame(TE.Scalars)) {
@ -13626,9 +13627,11 @@ BoUpSLP::isGatherShuffledEntry(
  if (!TE->UserTreeIndices.empty() &&
      TE->UserTreeIndices.front().UserTE->isGather() &&
      TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
-    assert((TE->Idx == 0 || TE->getOpcode() == Instruction::ExtractElement ||
-            isSplat(TE->Scalars)) &&
-           "Expected splat or extractelements only node.");
+    assert(
+        (TE->Idx == 0 ||
+         (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
+         isSplat(TE->Scalars)) &&
+        "Expected splat or extractelements only node.");
    return {};
  }
  unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
@ -14921,14 +14924,15 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
      }
    }
    // Gather extracts after we check for full matched gathers only.
-    if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
-        ((E->getOpcode() == Instruction::Load ||
+    if (!ExtractShuffles.empty() || !E->hasState() ||
+        E->getOpcode() != Instruction::Load ||
+        (((E->hasState() && E->getOpcode() == Instruction::Load) ||
          any_of(E->Scalars, IsaPred<LoadInst>)) &&
         any_of(E->Scalars,
                [this](Value *V) {
                  return isa<LoadInst>(V) && getTreeEntry(V);
                })) ||
-        E->isAltShuffle() ||
+        (E->hasState() && E->isAltShuffle()) ||
        all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
        isSplat(E->Scalars) ||
        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
@ -15308,7 +15312,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
  auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
  if (E->isGather()) {
    // Set insert point for non-reduction initial nodes.
-    if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
+    if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
    E->VectorizedValue = Vec;
@ -18153,8 +18157,9 @@ static RecurKind getRdxKind(Value *V);
 void BoUpSLP::computeMinimumValueSizes() {
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
-      VectorizableTree.front()->getOpcode() == Instruction::Store ||
-      VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
+      VectorizableTree.front()->hasState() &&
+      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
+       VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
@ -18195,10 +18200,9 @@ void BoUpSLP::computeMinimumValueSizes() {
    return;

  SmallVector<unsigned> ToDemote;
-  auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
-                                bool IsProfitableToDemoteRoot, unsigned Opcode,
-                                unsigned Limit, bool IsTruncRoot,
-                                bool IsSignedCmp) -> unsigned {
+  auto ComputeMaxBitWidth =
+      [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
+          unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
    ToDemote.clear();
    // Check if the root is trunc and the next node is gather/buildvector, then
    // keep trunc in scalars, which is free in most cases.
@ -18239,11 +18243,14 @@ void BoUpSLP::computeMinimumValueSizes() {
      return MaxBitWidth;
    }

+    if (!E.hasState())
+      return 0u;
+
    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
-    if (!TreeRootIT || !Opcode)
+    if (!TreeRootIT)
      return 0u;

    if (any_of(E.Scalars,
@ -18315,6 +18322,7 @@ void BoUpSLP::computeMinimumValueSizes() {
                IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
      return 0u;

+    unsigned Opcode = E.getOpcode();
    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
@ -18395,15 +18403,14 @@ void BoUpSLP::computeMinimumValueSizes() {
  while (NodeIdx < VectorizableTree.size()) {
    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
    unsigned Limit = 2;
-    unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
-        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode,
-        Limit, IsTruncRoot, IsSignedCmp);
+        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
+        IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
@ -18446,19 +18453,21 @@ void BoUpSLP::computeMinimumValueSizes() {
                 });
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
-          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
-                 [&](const EdgeInfo &EI) {
-                   return EI.UserTE->getOpcode() == Instruction::ICmp &&
-                          any_of(EI.UserTE->Scalars, [&](Value *V) {
-                            auto *IC = dyn_cast<ICmpInst>(V);
-                            return IC &&
-                                   (IC->isSigned() ||
-                                    !isKnownNonNegative(IC->getOperand(0),
-                                                        SimplifyQuery(*DL)) ||
-                                    !isKnownNonNegative(IC->getOperand(1),
-                                                        SimplifyQuery(*DL)));
-                          });
-                 });
+          any_of(
+              VectorizableTree[NodeIdx]->UserTreeIndices,
+              [&](const EdgeInfo &EI) {
+                return (EI.UserTE->hasState() &&
+                        EI.UserTE->getOpcode() == Instruction::ICmp) &&
+                       any_of(EI.UserTE->Scalars, [&](Value *V) {
+                         auto *IC = dyn_cast<ICmpInst>(V);
+                         return IC &&
+                                (IC->isSigned() ||
+                                 !isKnownNonNegative(IC->getOperand(0),
+                                                     SimplifyQuery(*DL)) ||
+                                 !isKnownNonNegative(IC->getOperand(1),
+                                                     SimplifyQuery(*DL)));
+                       });
+              });
    }

    // If the maximum bit width we compute is less than the width of the roots'
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/InstructionsState-is-invalid-0.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/InstructionsState-is-invalid-0.ll
@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=slp-vectorizer -S %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @foo(ptr %0) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  vector.scevcheck:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[TMP0:%.*]], i64 4
+; CHECK-NEXT:    [[SCEVGEP3:%.*]] = getelementptr i8, ptr null, i64 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x ptr> [[TMP1]], ptr [[SCEVGEP]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x ptr> [[TMP2]], <4 x ptr> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult <4 x ptr> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i1> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[SCEVGEP3]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ult <4 x ptr> [[TMP8]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = and <4 x i1> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = or i1 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    br i1 [[OP_RDX]], label [[DOTLR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    ret void
+; CHECK:       .lr.ph:
+; CHECK-NEXT:    ret void
+;
+vector.scevcheck:
+  %scevgep = getelementptr i8, ptr %0, i64 4
+  %scevgep3 = getelementptr i8, ptr null, i64 4
+  %bound011 = icmp ult ptr %scevgep, null
+  %found.conflict13 = and i1 %bound011, false
+  %bound014 = icmp ult ptr %scevgep, null
+  %found.conflict16 = and i1 %bound014, false
+  %conflict.rdx17 = or i1 %found.conflict13, %found.conflict16
+  %bound018 = icmp ult ptr %scevgep, null
+  %found.conflict20 = and i1 %bound018, false
+  %conflict.rdx21 = or i1 %conflict.rdx17, %found.conflict20
+  %bound022 = icmp ult ptr %0, null
+  %found.conflict24 = and i1 %bound022, false
+  %conflict.rdx25 = or i1 %conflict.rdx21, %found.conflict24
+  %bound026 = icmp ult ptr %0, null
+  %found.conflict28 = and i1 %bound026, false
+  %conflict.rdx29 = or i1 %conflict.rdx25, %found.conflict28
+  %bound030 = icmp ult ptr %0, null
+  %found.conflict32 = and i1 %bound030, false
+  %conflict.rdx33 = or i1 %conflict.rdx29, %found.conflict32
+  %bound034 = icmp ult ptr %0, null
+  %found.conflict36 = and i1 %bound034, false
+  %conflict.rdx37 = or i1 %conflict.rdx33, %found.conflict36
+  %bound038 = icmp ult ptr %scevgep3, null
+  %found.conflict40 = and i1 %bound038, false
+  %conflict.rdx41 = or i1 %conflict.rdx37, %found.conflict40
+  br i1 %conflict.rdx41, label %.lr.ph, label %vector.ph
+
+vector.ph:                                        ; preds = %vector.scevcheck
+  ret void
+
+.lr.ph:                                           ; preds = %vector.scevcheck
+  ret void
+}
--- a/llvm/test/Transforms/SLPVectorizer/InstructionsState-is-invalid-2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/InstructionsState-is-invalid-2.ll
@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=slp-vectorizer -S %s | FileCheck %s
+
+define i32 @test(i32 %minlib) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MUL2_I306:%.*]] = mul i32 [[MINLIB:%.*]], [[MINLIB]]
+; CHECK-NEXT:    [[MUL3_I307:%.*]] = mul i32 [[MUL2_I306]], [[MINLIB]]
+; CHECK-NEXT:    [[CMP183:%.*]] = icmp sgt i32 [[MUL3_I307]], 0
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %mul2.i306 = mul i32 %minlib, %minlib
+  %mul3.i307 = mul i32 %mul2.i306, %minlib
+  %cmp183 = icmp sgt i32 %mul3.i307, 0
+  ret i32 0
+}
--- a/llvm/test/Transforms/SLPVectorizer/X86/InstructionsState-is-invalid-1.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/InstructionsState-is-invalid-1.ll
@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=slp-vectorizer -S %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @_Z4blurN6Halide5Tools5ImageItEE(i1 %0, i1 %1, i1 %ident.check, i1 %ident.check56) {
+; CHECK-LABEL: @_Z4blurN6Halide5Tools5ImageItEE(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP2:%.*]] = or i1 [[TMP0:%.*]], [[TMP1:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = or i1 [[IDENT_CHECK:%.*]], [[IDENT_CHECK56:%.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or i1 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[FOR_BODY6_US_I_I:%.*]], label [[FOR_BODY6_US_I_I]]
+; CHECK:       for.body6.us.i.i:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %2 = or i1 %0, %1
+  %3 = or i1 %ident.check, %ident.check56
+  %4 = or i1 %3, %2
+  br i1 %4, label %for.body6.us.i.i, label %for.body6.us.i.i
+
+for.body6.us.i.i:                                 ; preds = %entry, %entry
+  ret void
+}