[MachinePipeliner] Remove isLoopCarriedDep and use DDG (#174394)

This patch completely removes `isLoopCarriedDep`, which was used previously to identify loop-carried dependencies in the DAG. Now that we have the DDG representation, this special handling is no longer necessary. Simply replacing its usage with the DDG causes several tests to fail, since cycle detection takes some of the validation-only edges in the DDG into account. To address this, this patch introduces extra edges in the DDG, which are used only for cycle detection and not for other parts of the pass (e.g., scheduling). The extra edges are determined to preserve the existing behavior of the pass as closely as possible, which makes the predicates for adding them somewhat complex. Split off from #135148, and the final patch in the series for #135148
2026-04-03 19:36:34 +09:00 · 2026-04-03 19:36:34 +09:00 · 9e516f5c58
commit 9e516f5c58
parent a2d3783b45
3 changed files with 57 additions and 394 deletions
--- a/llvm/include/llvm/CodeGen/MachinePipeliner.h
+++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h
@ -233,6 +233,14 @@ class SwingSchedulerDDG {
  struct SwingSchedulerDDGEdges {
    EdgesType Preds;
    EdgesType Succs;
+
+    /// This field is a subset of ValidationOnlyEdges. These edges are used only
+    /// by specific heuristics, mainly for cycle detection. Although they are
+    /// unnecessary in theory (i.e., ignoring them should still yield a valid
+    /// schedule), they are retained to preserve the existing behavior. Since we
+    /// only need which extra edges exist from a given SUnit, we only store the
+    /// destination SUnits.
+    SmallVector<SUnit *, 4> ExtraSuccs;
  };

  void initEdges(SUnit *SU);
@ -263,6 +271,8 @@ public:

  const EdgesType &getOutEdges(const SUnit *SU) const;

+  ArrayRef<SUnit *> getExtraOutEdges(const SUnit *SU) const;
+
  bool isValidSchedule(const SMSchedule &Schedule) const;
 };

@ -358,7 +368,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
      NumPaths = 0;
    }

-    void createAdjacencyStructure(SwingSchedulerDAG *DAG);
+    void createAdjacencyStructure(SwingSchedulerDDG *DDG);
    bool circuit(int V, int S, NodeSetType &NodeSets,
                 const SwingSchedulerDAG *DAG, bool HasBackedge = false);
    void unblock(int U);
@ -415,8 +425,6 @@ public:
    return ScheduleInfo[Node->NodeNum].ZeroLatencyHeight;
  }

-  bool isLoopCarriedDep(const SwingSchedulerDDGEdge &Edge) const;
-
  void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule);

  void fixupRegisterOverlaps(std::deque<SUnit *> &Instrs);
@ -527,13 +535,11 @@ public:
    SUnit *FirstNode = Nodes[0];
    SUnit *LastNode = Nodes[Nodes.size() - 1];

-    for (auto &PI : DDG->getInEdges(LastNode)) {
+    for (SUnit *SU : DDG->getExtraOutEdges(LastNode)) {
      // If we have an order dep that is potentially loop carried then a
-      // back-edge exists between the last node and the first node that isn't
-      // modeled in the DAG. Handle it manually by adding 1 to the distance of
-      // the last node.
-      if (PI.getSrc() != FirstNode || !PI.isOrderDep() ||
-          !DAG->isLoopCarriedDep(PI))
+      // back-edge exists between the last node and the first node in extra
+      // edges. Handle it manually by adding 1 to the distance of the last node.
+      if (SU != FirstNode)
        continue;
      unsigned &First = SUnitToDistance[FirstNode];
      unsigned Last = SUnitToDistance[LastNode];
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@ -1969,13 +1969,13 @@ unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {

 /// Create the adjacency structure of the nodes in the graph.
 void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
-    SwingSchedulerDAG *DAG) {
+    SwingSchedulerDDG *DDG) {
  BitVector Added(SUnits.size());
  DenseMap<int, int> OutputDeps;
  for (int i = 0, e = SUnits.size(); i != e; ++i) {
    Added.reset();
    // Add any successor to the adjacency matrix and exclude duplicates.
-    for (auto &OE : DAG->DDG->getOutEdges(&SUnits[i])) {
+    for (auto &OE : DDG->getOutEdges(&SUnits[i])) {
      // Only create a back-edge on the first and last nodes of a dependence
      // chain. This records any chains and adds them later.
      if (OE.isOutputDep()) {
@ -2007,22 +2007,17 @@ void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
        Added.set(N);
      }
    }
-    // A chain edge between a store and a load is treated as a back-edge in the
-    // adjacency matrix.
-    for (auto &IE : DAG->DDG->getInEdges(&SUnits[i])) {
-      SUnit *Src = IE.getSrc();
-      SUnit *Dst = IE.getDst();
-      if (!Dst->getInstr()->mayStore() || !DAG->isLoopCarriedDep(IE))
-        continue;
-      if (IE.isOrderDep() && Src->getInstr()->mayLoad()) {
-        int N = Src->NodeNum;
-        if (!Added.test(N)) {
-          AdjK[i].push_back(N);
-          Added.set(N);
-        }
+
+    // Also add any extra out edges to the adjacency matrix.
+    for (const SUnit *Dst : DDG->getExtraOutEdges(&SUnits[i])) {
+      int N = Dst->NodeNum;
+      if (!Added.test(N)) {
+        AdjK[i].push_back(N);
+        Added.set(N);
      }
    }
  }
+
  // Add back-edges in the adjacency matrix for the output dependences.
  for (auto &OD : OutputDeps)
    if (!Added.test(OD.second)) {
@ -2092,7 +2087,7 @@ void SwingSchedulerDAG::Circuits::unblock(int U) {
 void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
  Circuits Cir(SUnits, Topo);
  // Create the adjacency structure.
-  Cir.createAdjacencyStructure(this);
+  Cir.createAdjacencyStructure(&*DDG);
  for (int I = 0, E = SUnits.size(); I != E; ++I) {
    Cir.reset();
    Cir.circuit(I, I, NodeSets, this);
@ -3235,40 +3230,6 @@ bool SwingSchedulerDAG::mayOverlapInLaterIter(
  return true;
 }

-/// Return true for an order or output dependence that is loop carried
-/// potentially. A dependence is loop carried if the destination defines a value
-/// that may be used or defined by the source in a subsequent iteration.
-bool SwingSchedulerDAG::isLoopCarriedDep(
-    const SwingSchedulerDDGEdge &Edge) const {
-  if ((!Edge.isOrderDep() && !Edge.isOutputDep()) || Edge.isArtificial() ||
-      Edge.getDst()->isBoundaryNode())
-    return false;
-
-  if (!SwpPruneLoopCarried)
-    return true;
-
-  if (Edge.isOutputDep())
-    return true;
-
-  MachineInstr *SI = Edge.getSrc()->getInstr();
-  MachineInstr *DI = Edge.getDst()->getInstr();
-  assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
-
-  // Assume ordered loads and stores may have a loop carried dependence.
-  if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
-      SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
-      SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
-    return true;
-
-  if (!DI->mayLoadOrStore() || !SI->mayLoadOrStore())
-    return false;
-
-  // The conservative assumption is that a dependence between memory operations
-  // may be loop carried. The following code checks when it can be proved that
-  // there is no loop carried dependence.
-  return mayOverlapInLaterIter(DI, SI);
-}
-
 void SwingSchedulerDAG::postProcessDAG() {
  for (auto &M : Mutations)
    M->apply(this);
@ -4253,6 +4214,7 @@ void SwingSchedulerDDG::addEdge(const SUnit *SU,
                                const SwingSchedulerDDGEdge &Edge) {
  assert(!Edge.isValidationOnly() &&
         "Validation-only edges are not expected here.");
+
  auto &Edges = getEdges(SU);
  if (Edge.getSrc() == SU)
    Edges.Succs.push_back(Edge);
@ -4296,6 +4258,32 @@ SwingSchedulerDDG::SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU,
                                   /*IsValidationOnly=*/true);
        Edge.setDistance(1);
        ValidationOnlyEdges.push_back(Edge);
+
+        // Store the edge as an extra edge if it meets the following conditions:
+        //
+        //  - The edge is a loop-carried order dependency.
+        //  - The edge is a back edge in terms of the original instruction
+        //    order.
+        //  - The destination instruction may load.
+        //  - The source instruction may store but does not load.
+        //
+        // These conditions are inherited from a previous implementation to
+        // preserve the existing behavior and avoid regressions.
+        bool UseAsExtraEdge = [&]() {
+          if (Edge.getDistance() == 0 || !Edge.isOrderDep())
+            return false;
+
+          SUnit *Src = Edge.getSrc();
+          SUnit *Dst = Edge.getDst();
+          if (Src->NodeNum < Dst->NodeNum)
+            return false;
+
+          MachineInstr *SrcMI = Src->getInstr();
+          MachineInstr *DstMI = Dst->getInstr();
+          return DstMI->mayLoad() && !SrcMI->mayLoad() && SrcMI->mayStore();
+        }();
+        if (UseAsExtraEdge)
+          getEdges(Edge.getSrc()).ExtraSuccs.push_back(Edge.getDst());
      }
    }
  }
@ -4311,6 +4299,10 @@ SwingSchedulerDDG::getOutEdges(const SUnit *SU) const {
  return getEdges(SU).Succs;
 }

+ArrayRef<SUnit *> SwingSchedulerDDG::getExtraOutEdges(const SUnit *SU) const {
+  return getEdges(SU).ExtraSuccs;
+}
+
 /// Check if \p Schedule doesn't violate the validation-only dependencies.
 bool SwingSchedulerDDG::isValidSchedule(const SMSchedule &Schedule) const {
  unsigned II = Schedule.getInitiationInterval();
--- a/llvm/test/CodeGen/AArch64/sms-instruction-scheduled-at-correct-cycle.mir
+++ b/llvm/test/CodeGen/AArch64/sms-instruction-scheduled-at-correct-cycle.mir
@ -1,335 +0,0 @@
-# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -debug-only=pipeliner -pipeliner-max-stages=50 -pipeliner-max-mii=50 -pipeliner-enable-copytophi=0 -pipeliner-ii-search-range=30 2>&1 | FileCheck %s
-# REQUIRES: asserts
-
-# Test that each instruction must be scheduled between the early cycle and the late cycle. Previously there were cases where an instruction is scheduled outside of the valid range. See issue #93936 for details.
-
-# CHECK: {{^ *}}Try to schedule with 47
-# CHECK: {{^ *}}Inst (11)   %48:fpr128 = LDRQui %35:gpr64sp, 0 :: (load (s128) from %ir.lsr.iv63, align 4, !tbaa !0)
-# CHECK-EMPTY:
-# CHECK-NEXT: {{^ *}}es: ffffffe8 ls: ffffffe9
-# CHECK-NEXT: {{^ *}}Trying to insert node between -24 and -23 II: 47
-# CHECK-NEXT: {{^ *}}insert at cycle -24   %48:fpr128 = LDRQui %35:gpr64sp, 0 :: (load (s128) from %ir.lsr.iv63, align 4, !tbaa !0)
-
--- |
-  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
-  
-  define dso_local void @f(ptr nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, ptr nocapture noundef readonly %c, ptr nocapture noundef readonly %d, ptr nocapture noundef readonly %e, float noundef %f, i32 noundef %N) local_unnamed_addr {
-  entry:
-    %cmp16 = icmp sgt i32 %N, 0
-    br i1 %cmp16, label %for.body.preheader, label %for.cond.cleanup
-  
-  for.body.preheader:                               ; preds = %entry
-    %wide.trip.count = zext nneg i32 %N to i64
-    %min.iters.check = icmp ult i32 %N, 8
-    br i1 %min.iters.check, label %for.body.preheader37, label %vector.memcheck
-  
-  vector.memcheck:                                  ; preds = %for.body.preheader
-    %0 = ptrtoint ptr %a to i64
-    %1 = ptrtoint ptr %b to i64
-    %2 = ptrtoint ptr %c to i64
-    %3 = ptrtoint ptr %d to i64
-    %4 = ptrtoint ptr %e to i64
-    %5 = sub i64 %0, %1
-    %diff.check = icmp ult i64 %5, 32
-    %6 = sub i64 %0, %2
-    %diff.check22 = icmp ult i64 %6, 32
-    %conflict.rdx = or i1 %diff.check, %diff.check22
-    %7 = sub i64 %0, %3
-    %diff.check24 = icmp ult i64 %7, 32
-    %conflict.rdx25 = or i1 %conflict.rdx, %diff.check24
-    %8 = sub i64 %0, %4
-    %diff.check27 = icmp ult i64 %8, 32
-    %conflict.rdx28 = or i1 %conflict.rdx25, %diff.check27
-    br i1 %conflict.rdx28, label %for.body.preheader37, label %vector.ph
-  
-  vector.ph:                                        ; preds = %vector.memcheck
-    %n.vec = and i64 %wide.trip.count, 2147483640
-    %broadcast.splatinsert = insertelement <4 x float> poison, float %f, i64 0
-    %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
-    %scevgep54 = getelementptr i8, ptr %b, i64 16
-    %scevgep58 = getelementptr i8, ptr %a, i64 16
-    %scevgep62 = getelementptr i8, ptr %c, i64 16
-    %scevgep66 = getelementptr i8, ptr %e, i64 16
-    %scevgep70 = getelementptr i8, ptr %d, i64 16
-    br label %vector.body
-  
-  vector.body:                                      ; preds = %vector.body, %vector.ph
-    %lsr.iv71 = phi ptr [ %scevgep72, %vector.body ], [ %scevgep70, %vector.ph ]
-    %lsr.iv67 = phi ptr [ %scevgep68, %vector.body ], [ %scevgep66, %vector.ph ]
-    %lsr.iv63 = phi ptr [ %scevgep64, %vector.body ], [ %scevgep62, %vector.ph ]
-    %lsr.iv59 = phi ptr [ %scevgep60, %vector.body ], [ %scevgep58, %vector.ph ]
-    %lsr.iv55 = phi ptr [ %scevgep56, %vector.body ], [ %scevgep54, %vector.ph ]
-    %lsr.iv52 = phi i64 [ %lsr.iv.next53, %vector.body ], [ %n.vec, %vector.ph ]
-    %scevgep57 = getelementptr i8, ptr %lsr.iv55, i64 -16
-    %wide.load = load <4 x float>, ptr %scevgep57, align 4, !tbaa !6
-    %wide.load29 = load <4 x float>, ptr %lsr.iv55, align 4, !tbaa !6
-    %9 = fmul <4 x float> %wide.load, %broadcast.splat
-    %10 = fmul <4 x float> %wide.load29, %broadcast.splat
-    %scevgep65 = getelementptr i8, ptr %lsr.iv63, i64 -16
-    %wide.load30 = load <4 x float>, ptr %scevgep65, align 4, !tbaa !6
-    %wide.load31 = load <4 x float>, ptr %lsr.iv63, align 4, !tbaa !6
-    %scevgep73 = getelementptr i8, ptr %lsr.iv71, i64 -16
-    %wide.load32 = load <4 x float>, ptr %scevgep73, align 4, !tbaa !6
-    %wide.load33 = load <4 x float>, ptr %lsr.iv71, align 4, !tbaa !6
-    %11 = fsub <4 x float> %wide.load30, %wide.load32
-    %12 = fsub <4 x float> %wide.load31, %wide.load33
-    %13 = fmul <4 x float> %9, %11
-    %14 = fmul <4 x float> %10, %12
-    %scevgep69 = getelementptr i8, ptr %lsr.iv67, i64 -16
-    %wide.load34 = load <4 x float>, ptr %scevgep69, align 4, !tbaa !6
-    %wide.load35 = load <4 x float>, ptr %lsr.iv67, align 4, !tbaa !6
-    %15 = fdiv <4 x float> %13, %wide.load34
-    %16 = fdiv <4 x float> %14, %wide.load35
-    %scevgep61 = getelementptr i8, ptr %lsr.iv59, i64 -16
-    store <4 x float> %15, ptr %scevgep61, align 4, !tbaa !6
-    store <4 x float> %16, ptr %lsr.iv59, align 4, !tbaa !6
-    %lsr.iv.next53 = add nsw i64 %lsr.iv52, -8
-    %scevgep56 = getelementptr i8, ptr %lsr.iv55, i64 32
-    %scevgep60 = getelementptr i8, ptr %lsr.iv59, i64 32
-    %scevgep64 = getelementptr i8, ptr %lsr.iv63, i64 32
-    %scevgep68 = getelementptr i8, ptr %lsr.iv67, i64 32
-    %scevgep72 = getelementptr i8, ptr %lsr.iv71, i64 32
-    %17 = icmp eq i64 %lsr.iv.next53, 0
-    br i1 %17, label %middle.block, label %vector.body, !llvm.loop !10
-  
-  middle.block:                                     ; preds = %vector.body
-    %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
-    br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader37
-  
-  for.body.preheader37:                             ; preds = %vector.memcheck, %for.body.preheader, %middle.block
-    %indvars.iv.ph = phi i64 [ %n.vec, %middle.block ], [ 0, %for.body.preheader ], [ 0, %vector.memcheck ]
-    %18 = shl nuw nsw i64 %indvars.iv.ph, 2
-    %scevgep = getelementptr i8, ptr %a, i64 %18
-    %scevgep39 = getelementptr i8, ptr %e, i64 %18
-    %scevgep42 = getelementptr i8, ptr %d, i64 %18
-    %scevgep45 = getelementptr i8, ptr %c, i64 %18
-    %scevgep48 = getelementptr i8, ptr %b, i64 %18
-    %19 = sub i64 %wide.trip.count, %indvars.iv.ph
-    br label %for.body
-  
-  for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
-    ret void
-  
-  for.body:                                         ; preds = %for.body.preheader37, %for.body
-    %lsr.iv51 = phi i64 [ %19, %for.body.preheader37 ], [ %lsr.iv.next, %for.body ]
-    %lsr.iv49 = phi ptr [ %scevgep48, %for.body.preheader37 ], [ %scevgep50, %for.body ]
-    %lsr.iv46 = phi ptr [ %scevgep45, %for.body.preheader37 ], [ %scevgep47, %for.body ]
-    %lsr.iv43 = phi ptr [ %scevgep42, %for.body.preheader37 ], [ %scevgep44, %for.body ]
-    %lsr.iv40 = phi ptr [ %scevgep39, %for.body.preheader37 ], [ %scevgep41, %for.body ]
-    %lsr.iv = phi ptr [ %scevgep, %for.body.preheader37 ], [ %scevgep38, %for.body ]
-    %20 = load float, ptr %lsr.iv49, align 4, !tbaa !6
-    %mul = fmul float %20, %f
-    %21 = load float, ptr %lsr.iv46, align 4, !tbaa !6
-    %22 = load float, ptr %lsr.iv43, align 4, !tbaa !6
-    %sub = fsub float %21, %22
-    %mul5 = fmul float %mul, %sub
-    %23 = load float, ptr %lsr.iv40, align 4, !tbaa !6
-    %div = fdiv float %mul5, %23
-    store float %div, ptr %lsr.iv, align 4, !tbaa !6
-    %scevgep38 = getelementptr i8, ptr %lsr.iv, i64 4
-    %scevgep41 = getelementptr i8, ptr %lsr.iv40, i64 4
-    %scevgep44 = getelementptr i8, ptr %lsr.iv43, i64 4
-    %scevgep47 = getelementptr i8, ptr %lsr.iv46, i64 4
-    %scevgep50 = getelementptr i8, ptr %lsr.iv49, i64 4
-    %lsr.iv.next = add i64 %lsr.iv51, -1
-    %exitcond.not = icmp eq i64 %lsr.iv.next, 0
-    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
-  }
-  
-  !6 = !{!7, !7, i64 0}
-  !7 = !{!"float", !8, i64 0}
-  !8 = !{!"omnipotent char", !9, i64 0}
-  !9 = !{!"Simple C/C++ TBAA"}
-  !10 = distinct !{!10, !11, !12, !13}
-  !11 = !{!"llvm.loop.mustprogress"}
-  !12 = !{!"llvm.loop.isvectorized", i32 1}
-  !13 = !{!"llvm.loop.unroll.runtime.disable"}
-  !14 = distinct !{!14, !11, !12}
-
-...
---
-name:            f
-tracksRegLiveness: true
-liveins:
-  - { reg: '$x0', virtual-reg: '%39' }
-  - { reg: '$x1', virtual-reg: '%40' }
-  - { reg: '$x2', virtual-reg: '%41' }
-  - { reg: '$x3', virtual-reg: '%42' }
-  - { reg: '$x4', virtual-reg: '%43' }
-  - { reg: '$s0', virtual-reg: '%44' }
-  - { reg: '$w5', virtual-reg: '%45' }
-body:             |
-  bb.0.entry:
-    successors: %bb.1, %bb.7
-    liveins: $x0, $x1, $x2, $x3, $x4, $s0, $w5
-  
-    %45:gpr32common = COPY $w5
-    %44:fpr32 = COPY $s0
-    %43:gpr64common = COPY $x4
-    %42:gpr64common = COPY $x3
-    %41:gpr64common = COPY $x2
-    %40:gpr64common = COPY $x1
-    %39:gpr64common = COPY $x0
-    dead $wzr = SUBSWri %45, 1, 0, implicit-def $nzcv
-    Bcc 11, %bb.7, implicit $nzcv
-    B %bb.1
-  
-  bb.1.for.body.preheader:
-    successors: %bb.12, %bb.2
-  
-    %48:gpr32 = ORRWrs $wzr, %45, 0
-    %0:gpr64 = SUBREG_TO_REG killed %48, %subreg.sub_32
-    dead $wzr = SUBSWri %45, 8, 0, implicit-def $nzcv
-    Bcc 2, %bb.2, implicit $nzcv
-  
-  bb.12:
-    %49:gpr64all = COPY $xzr
-    %47:gpr64all = COPY %49
-    B %bb.6
-  
-  bb.2.vector.memcheck:
-    successors: %bb.6, %bb.11
-  
-    %55:gpr64common = SUBXrr %39, %40
-    %59:gpr64all = COPY $xzr
-    %51:gpr64all = COPY %59
-    dead $xzr = SUBSXri killed %55, 32, 0, implicit-def $nzcv
-    Bcc 3, %bb.6, implicit $nzcv
-    B %bb.11
-  
-  bb.11.vector.memcheck:
-    successors: %bb.6, %bb.10
-  
-    %56:gpr64common = SUBXrr %39, %41
-    dead $xzr = SUBSXri %56, 32, 0, implicit-def $nzcv
-    Bcc 3, %bb.6, implicit $nzcv
-    B %bb.10
-  
-  bb.10.vector.memcheck:
-    successors: %bb.6, %bb.9
-  
-    %57:gpr64common = SUBXrr %39, %42
-    dead $xzr = SUBSXri %57, 32, 0, implicit-def $nzcv
-    Bcc 3, %bb.6, implicit $nzcv
-    B %bb.9
-  
-  bb.9.vector.memcheck:
-    successors: %bb.6, %bb.3
-  
-    %58:gpr64common = SUBXrr %39, %43
-    dead $xzr = SUBSXri %58, 32, 0, implicit-def $nzcv
-    Bcc 3, %bb.6, implicit $nzcv
-    B %bb.3
-  
-  bb.3.vector.ph:
-    %64:gpr64common = ANDXri %0, 8027
-    %1:gpr64 = COPY %64
-    %66:fpr128 = IMPLICIT_DEF
-    %65:fpr128 = INSERT_SUBREG %66, %44, %subreg.ssub
-    %67:gpr64sp = ADDXri %40, 16, 0
-    %3:gpr64all = COPY %67
-    %68:gpr64sp = ADDXri %39, 16, 0
-    %4:gpr64all = COPY %68
-    %69:gpr64sp = ADDXri %41, 16, 0
-    %5:gpr64all = COPY %69
-    %70:gpr64sp = ADDXri %43, 16, 0
-    %6:gpr64all = COPY %70
-    %71:gpr64sp = ADDXri %42, 16, 0
-    %7:gpr64all = COPY %71
-  
-  bb.4.vector.body:
-    successors: %bb.5, %bb.4
-  
-    %8:gpr64sp = PHI %7, %bb.3, %19, %bb.4
-    %9:gpr64sp = PHI %6, %bb.3, %18, %bb.4
-    %10:gpr64sp = PHI %5, %bb.3, %17, %bb.4
-    %11:gpr64sp = PHI %4, %bb.3, %16, %bb.4
-    %12:gpr64sp = PHI %3, %bb.3, %15, %bb.4
-    %13:gpr64sp = PHI %1, %bb.3, %14, %bb.4
-    %72:fpr128 = LDURQi %12, -16 :: (load (s128) from %ir.scevgep57, align 4, !tbaa !6)
-    %73:fpr128 = LDRQui %12, 0 :: (load (s128) from %ir.lsr.iv55, align 4, !tbaa !6)
-    %74:fpr128 = nofpexcept FMULv4i32_indexed killed %72, %65, 0, implicit $fpcr
-    %75:fpr128 = nofpexcept FMULv4i32_indexed killed %73, %65, 0, implicit $fpcr
-    %76:fpr128 = LDURQi %10, -16 :: (load (s128) from %ir.scevgep65, align 4, !tbaa !6)
-    %77:fpr128 = LDRQui %10, 0 :: (load (s128) from %ir.lsr.iv63, align 4, !tbaa !6)
-    %78:fpr128 = LDURQi %8, -16 :: (load (s128) from %ir.scevgep73, align 4, !tbaa !6)
-    %79:fpr128 = LDRQui %8, 0 :: (load (s128) from %ir.lsr.iv71, align 4, !tbaa !6)
-    %80:fpr128 = nofpexcept FSUBv4f32 killed %76, killed %78, implicit $fpcr
-    %81:fpr128 = nofpexcept FSUBv4f32 killed %77, killed %79, implicit $fpcr
-    %82:fpr128 = nofpexcept FMULv4f32 killed %74, killed %80, implicit $fpcr
-    %83:fpr128 = nofpexcept FMULv4f32 killed %75, killed %81, implicit $fpcr
-    %84:fpr128 = LDURQi %9, -16 :: (load (s128) from %ir.scevgep69, align 4, !tbaa !6)
-    %85:fpr128 = LDRQui %9, 0 :: (load (s128) from %ir.lsr.iv67, align 4, !tbaa !6)
-    %86:fpr128 = nofpexcept FDIVv4f32 killed %82, killed %84, implicit $fpcr
-    %87:fpr128 = nofpexcept FDIVv4f32 killed %83, killed %85, implicit $fpcr
-    STURQi killed %86, %11, -16 :: (store (s128) into %ir.scevgep61, align 4, !tbaa !6)
-    STRQui killed %87, %11, 0 :: (store (s128) into %ir.lsr.iv59, align 4, !tbaa !6)
-    %88:gpr64 = nsw SUBSXri %13, 8, 0, implicit-def $nzcv
-    %14:gpr64all = COPY %88
-    %89:gpr64sp = ADDXri %12, 32, 0
-    %15:gpr64all = COPY %89
-    %90:gpr64sp = ADDXri %11, 32, 0
-    %16:gpr64all = COPY %90
-    %91:gpr64sp = ADDXri %10, 32, 0
-    %17:gpr64all = COPY %91
-    %92:gpr64sp = ADDXri %9, 32, 0
-    %18:gpr64all = COPY %92
-    %93:gpr64sp = ADDXri %8, 32, 0
-    %19:gpr64all = COPY %93
-    Bcc 1, %bb.4, implicit $nzcv
-    B %bb.5
-  
-  bb.5.middle.block:
-    dead $xzr = SUBSXrr %64, %0, implicit-def $nzcv
-    Bcc 0, %bb.7, implicit $nzcv
-    B %bb.6
-  
-  bb.6.for.body.preheader37:
-    %20:gpr64 = PHI %47, %bb.12, %51, %bb.2, %51, %bb.11, %51, %bb.10, %51, %bb.9, %1, %bb.5
-    %95:gpr64 = nuw nsw UBFMXri %20, 62, 61
-    %96:gpr64 = ADDXrr %39, %95
-    %21:gpr64all = COPY %96
-    %97:gpr64 = ADDXrr %43, %95
-    %22:gpr64all = COPY %97
-    %98:gpr64 = ADDXrr %42, %95
-    %23:gpr64all = COPY %98
-    %99:gpr64 = ADDXrr %41, %95
-    %24:gpr64all = COPY %99
-    %100:gpr64 = ADDXrr %40, %95
-    %25:gpr64all = COPY %100
-    %101:gpr64 = SUBXrr %0, %20
-    %26:gpr64all = COPY %101
-    B %bb.8
-  
-  bb.7.for.cond.cleanup:
-    RET_ReallyLR
-  
-  bb.8.for.body:
-    successors: %bb.7, %bb.8
-  
-    %27:gpr64sp = PHI %26, %bb.6, %38, %bb.8
-    %28:gpr64sp = PHI %25, %bb.6, %37, %bb.8
-    %29:gpr64sp = PHI %24, %bb.6, %36, %bb.8
-    %30:gpr64sp = PHI %23, %bb.6, %35, %bb.8
-    %31:gpr64sp = PHI %22, %bb.6, %34, %bb.8
-    %32:gpr64sp = PHI %21, %bb.6, %33, %bb.8
-    early-clobber %102:gpr64sp, %103:fpr32 = LDRSpost %28, 4 :: (load (s32) from %ir.lsr.iv49, !tbaa !6)
-    %104:fpr32 = nofpexcept FMULSrr killed %103, %44, implicit $fpcr
-    early-clobber %105:gpr64sp, %106:fpr32 = LDRSpost %29, 4 :: (load (s32) from %ir.lsr.iv46, !tbaa !6)
-    early-clobber %107:gpr64sp, %108:fpr32 = LDRSpost %30, 4 :: (load (s32) from %ir.lsr.iv43, !tbaa !6)
-    %109:fpr32 = nofpexcept FSUBSrr killed %106, killed %108, implicit $fpcr
-    %110:fpr32 = nofpexcept FMULSrr killed %104, killed %109, implicit $fpcr
-    early-clobber %111:gpr64sp, %112:fpr32 = LDRSpost %31, 4 :: (load (s32) from %ir.lsr.iv40, !tbaa !6)
-    %113:fpr32 = nofpexcept FDIVSrr killed %110, killed %112, implicit $fpcr
-    early-clobber %114:gpr64sp = STRSpost killed %113, %32, 4 :: (store (s32) into %ir.lsr.iv, !tbaa !6)
-    %33:gpr64all = COPY %114
-    %34:gpr64all = COPY %111
-    %35:gpr64all = COPY %107
-    %36:gpr64all = COPY %105
-    %37:gpr64all = COPY %102
-    %115:gpr64 = SUBSXri %27, 1, 0, implicit-def $nzcv
-    %38:gpr64all = COPY %115
-    Bcc 0, %bb.7, implicit $nzcv
-    B %bb.8
-
-...