From 7364203924cf9d464df4f6b9455ac6cd42c856ae Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <jeffrey.byrnes@amd.com>
Date: Mon, 30 Mar 2026 12:18:29 -0700
Subject: [PATCH] Reapply "[AMDGPU] Add HWUI pressure heuristics to coexec
 strategy (#184929)" (#189121)

Reland https://github.com/llvm/llvm-project/pull/184929 after fixing
some issues in the NDEBUG builds.

3a640ee is unchanged from the previously approved PR; the unreviewed
portion of this PR is 9cabd8d.
---
 .../AMDGPU/AMDGPUCoExecSchedStrategy.cpp      | 446 ++++++++++++-
 .../Target/AMDGPU/AMDGPUCoExecSchedStrategy.h | 290 ++++++++-
 .../AMDGPU/coexec-sched-effective-stall.mir   |  10 +-
 llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll  | 606 ++++++++++++++++++
 4 files changed, 1322 insertions(+), 30 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index 977c6f56ad15..d83f8fee2b2f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -15,6 +15,7 @@
 #include "llvm/Support/Debug.h"
 
 using namespace llvm;
+using namespace llvm::AMDGPU;
 
 #define DEBUG_TYPE "machine-scheduler"
 
@@ -41,6 +42,370 @@ static SUnit *pickOnlyChoice(SchedBoundary &Zone) {
   return OnlyChoice;
 }
 
+InstructionFlavor llvm::AMDGPU::classifyFlavor(const MachineInstr &MI,
+                                               const SIInstrInfo &SII) {
+  if (MI.isDebugInstr())
+    return InstructionFlavor::Other;
+
+  unsigned Opc = MI.getOpcode();
+
+  // Check for specific opcodes first.
+  if (Opc == AMDGPU::ATOMIC_FENCE || Opc == AMDGPU::S_WAIT_ASYNCCNT ||
+      Opc == AMDGPU::S_WAIT_TENSORCNT || Opc == AMDGPU::S_BARRIER_WAIT ||
+      Opc == AMDGPU::S_BARRIER_SIGNAL_IMM)
+    return InstructionFlavor::Fence;
+
+  if (SII.isLDSDMA(MI))
+    return InstructionFlavor::DMA;
+
+  if (SII.isMFMAorWMMA(MI))
+    return InstructionFlavor::WMMA;
+
+  if (SII.isTRANS(MI))
+    return InstructionFlavor::TRANS;
+
+  if (SII.isVALU(MI))
+    return InstructionFlavor::SingleCycleVALU;
+
+  if (SII.isDS(MI))
+    return InstructionFlavor::DS;
+
+  if (SII.isFLAT(MI) || SII.isFLATGlobal(MI) || SII.isFLATScratch(MI))
+    return InstructionFlavor::VMEM;
+
+  if (SII.isSALU(MI))
+    return InstructionFlavor::SALU;
+
+  return InstructionFlavor::Other;
+}
+
+SUnit *HardwareUnitInfo::getNextTargetSU(bool LookDeep) const {
+  for (auto *PrioritySU : PrioritySUs) {
+    if (!PrioritySU->isTopReady())
+      return PrioritySU;
+  }
+
+  if (!LookDeep)
+    return nullptr;
+
+  unsigned MinDepth = std::numeric_limits<unsigned int>::max();
+  SUnit *TargetSU = nullptr;
+  for (auto *SU : AllSUs) {
+    if (SU->isScheduled)
+      continue;
+
+    if (SU->isTopReady())
+      continue;
+
+    if (SU->getDepth() < MinDepth) {
+      MinDepth = SU->getDepth();
+      TargetSU = SU;
+    }
+  }
+  return TargetSU;
+}
+
+void HardwareUnitInfo::insert(SUnit *SU, unsigned BlockingCycles) {
+#ifndef NDEBUG
+  bool Inserted = AllSUs.insert(SU);
+  assert(Inserted);
+#else
+  AllSUs.insert(SU);
+#endif
+
+  TotalCycles += BlockingCycles;
+
+  if (PrioritySUs.empty()) {
+    PrioritySUs.insert(SU);
+    return;
+  }
+  unsigned SUDepth = SU->getDepth();
+  unsigned CurrDepth = (*PrioritySUs.begin())->getDepth();
+  if (SUDepth > CurrDepth)
+    return;
+
+  if (SUDepth == CurrDepth) {
+    PrioritySUs.insert(SU);
+    return;
+  }
+
+  // SU is lower depth and should be prioritized.
+  PrioritySUs.clear();
+  PrioritySUs.insert(SU);
+}
+
+void HardwareUnitInfo::markScheduled(SUnit *SU, unsigned BlockingCycles) {
+  // We may want to ignore some HWUIs (e.g. InstructionFlavor::Other). To do so,
+  // we just clear the HWUI. However, we still have instructions which map to
+  // this HWUI. Don't bother managing the state for these HWUI.
+  if (TotalCycles == 0)
+    return;
+
+  AllSUs.remove(SU);
+  PrioritySUs.remove(SU);
+
+  TotalCycles -= BlockingCycles;
+
+  if (AllSUs.empty())
+    return;
+  if (PrioritySUs.empty()) {
+    for (auto *Remaining : AllSUs) {
+      if (PrioritySUs.empty()) {
+        PrioritySUs.insert(Remaining);
+        continue;
+      }
+      unsigned RemainingDepth = Remaining->getDepth();
+      unsigned CurrDepth = (*PrioritySUs.begin())->getDepth();
+      if (RemainingDepth > CurrDepth)
+        continue;
+
+      if (RemainingDepth == CurrDepth) {
+        PrioritySUs.insert(Remaining);
+        continue;
+      }
+
+      // Remaining is lower depth and should be prioritized.
+      PrioritySUs.clear();
+      PrioritySUs.insert(Remaining);
+    }
+  }
+}
+
+HardwareUnitInfo *
+CandidateHeuristics::getHWUIFromFlavor(InstructionFlavor Flavor) {
+  // HWUInfo is reordered by sortHWUIResources(), so an entry's index does not
+  // necessarily equal its flavor; search by type instead.
+  for (HardwareUnitInfo &Unit : HWUInfo)
+    if (Unit.getType() == Flavor)
+      return &Unit;
+  return nullptr;
+}
+
+unsigned CandidateHeuristics::getHWUICyclesForInst(SUnit *SU) {
+  assert(SchedModel && SchedModel->hasInstrSchedModel());
+  unsigned ReleaseAtCycle = 0;
+  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+  for (TargetSchedModel::ProcResIter PI = SchedModel->getWriteProcResBegin(SC),
+                                     PE = SchedModel->getWriteProcResEnd(SC);
+       PI != PE; ++PI) {
+    ReleaseAtCycle = std::max(ReleaseAtCycle, (unsigned)PI->ReleaseAtCycle);
+  }
+  return ReleaseAtCycle;
+}
+
+void CandidateHeuristics::updateForScheduling(SUnit *SU) {
+  HardwareUnitInfo *HWUI =
+      getHWUIFromFlavor(classifyFlavor(*SU->getInstr(), *SII));
+  assert(HWUI);
+  HWUI->markScheduled(SU, getHWUICyclesForInst(SU));
+}
+
+void CandidateHeuristics::initialize(ScheduleDAGMI *SchedDAG,
+                                     const TargetSchedModel *TargetSchedModel,
+                                     const TargetRegisterInfo *TRI) {
+  DAG = SchedDAG;
+  SchedModel = TargetSchedModel;
+  assert(SchedModel && SchedModel->hasInstrSchedModel());
+
+  SRI = static_cast<const SIRegisterInfo *>(TRI);
+  SII = static_cast<const SIInstrInfo *>(DAG->TII);
+
+  HWUInfo.resize((int)InstructionFlavor::NUM_FLAVORS);
+
+  for (unsigned I = 0; I < HWUInfo.size(); I++) {
+    HWUInfo[I].reset();
+    HWUInfo[I].setType(I);
+  }
+
+  HWUInfo[(int)InstructionFlavor::WMMA].setProducesCoexecWindow(true);
+  HWUInfo[(int)InstructionFlavor::MultiCycleVALU].setProducesCoexecWindow(true);
+  HWUInfo[(int)InstructionFlavor::TRANS].setProducesCoexecWindow(true);
+
+  collectHWUIPressure();
+}
+
+void CandidateHeuristics::collectHWUIPressure() {
+  if (!SchedModel || !SchedModel->hasInstrSchedModel())
+    return;
+
+  for (auto &SU : DAG->SUnits) {
+    const InstructionFlavor Flavor = classifyFlavor(*SU.getInstr(), *SII);
+    HWUInfo[(int)(Flavor)].insert(&SU, getHWUICyclesForInst(&SU));
+  }
+
+  LLVM_DEBUG(dumpRegionSummary());
+}
+
+void CandidateHeuristics::dumpRegionSummary() {
+  MachineBasicBlock *BB = DAG->begin()->getParent();
+  dbgs() << "\n=== Region: " << DAG->MF.getName() << " BB" << BB->getNumber()
+         << " (" << DAG->SUnits.size() << " SUs) ===\n";
+
+  dbgs() << "\nHWUI Resource Pressure:\n";
+  for (auto &HWUI : HWUInfo) {
+    if (HWUI.getTotalCycles() == 0)
+      continue;
+
+    StringRef Name = getFlavorName(HWUI.getType());
+    dbgs() << "  " << Name << ": " << HWUI.getTotalCycles() << " cycles, "
+           << HWUI.size() << " instrs\n";
+  }
+  dbgs() << "\n";
+}
+
+void CandidateHeuristics::sortHWUIResources() {
+  // Highest priority should be first.
+  llvm::sort(HWUInfo, [](HardwareUnitInfo &A, HardwareUnitInfo &B) {
+    // Prefer CoexecWindow producers
+    if (A.producesCoexecWindow() != B.producesCoexecWindow())
+      return A.producesCoexecWindow();
+
+    // Prefer more demanded resources
+    if (A.getTotalCycles() != B.getTotalCycles())
+      return A.getTotalCycles() > B.getTotalCycles();
+
+    // In ties -- prefer the resource with fewer instructions
+    if (A.size() != B.size())
+      return A.size() < B.size();
+
+    // Default to Flavor order
+    return (unsigned)A.getType() < (unsigned)B.getType();
+  });
+}
+
+bool CandidateHeuristics::tryCriticalResourceDependency(
+    GenericSchedulerBase::SchedCandidate &TryCand,
+    GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const {
+
+  auto HasPrioritySU = [this, &Cand, &TryCand](unsigned ResourceIdx) {
+    const HardwareUnitInfo &HWUI = HWUInfo[ResourceIdx];
+
+    auto CandFlavor = classifyFlavor(*Cand.SU->getInstr(), *SII);
+    auto TryCandFlavor = classifyFlavor(*TryCand.SU->getInstr(), *SII);
+    bool LookDeep = (CandFlavor == InstructionFlavor::DS ||
+                     TryCandFlavor == InstructionFlavor::DS) &&
+                    HWUI.getType() == InstructionFlavor::WMMA;
+    auto *TargetSU = HWUI.getNextTargetSU(LookDeep);
+
+    // If we do not have a TargetSU for this resource, then it is not critical.
+    if (!TargetSU)
+      return false;
+
+    return true;
+  };
+
+  auto TryEnablesResource = [&Cand, &TryCand, this](unsigned ResourceIdx) {
+    const HardwareUnitInfo &HWUI = HWUInfo[ResourceIdx];
+    auto CandFlavor = classifyFlavor(*Cand.SU->getInstr(), *SII);
+
+    // We want to ensure our DS order matches WMMA order.
+    bool LookDeep = CandFlavor == InstructionFlavor::DS &&
+                    HWUI.getType() == InstructionFlavor::WMMA;
+    auto *TargetSU = HWUI.getNextTargetSU(LookDeep);
+    // TargetSU can be null: HasPrioritySU may have looked deeper than we do.
+    bool CandEnables = TargetSU && TargetSU != Cand.SU &&
+                       DAG->IsReachable(TargetSU, Cand.SU);
+    bool TryCandEnables = TargetSU && TargetSU != TryCand.SU &&
+                          DAG->IsReachable(TargetSU, TryCand.SU);
+
+    if (!CandEnables && !TryCandEnables)
+      return false;
+
+    if (CandEnables && !TryCandEnables) {
+      if (Cand.Reason > GenericSchedulerBase::RegCritical)
+        Cand.Reason = GenericSchedulerBase::RegCritical;
+
+      return true;
+    }
+
+    if (!CandEnables && TryCandEnables) {
+      TryCand.Reason = GenericSchedulerBase::RegCritical;
+      return true;
+    }
+
+    // Both enable, prefer the critical path.
+    unsigned CandHeight = Cand.SU->getHeight();
+    unsigned TryCandHeight = TryCand.SU->getHeight();
+
+    if (CandHeight > TryCandHeight) {
+      if (Cand.Reason > GenericSchedulerBase::RegCritical)
+        Cand.Reason = GenericSchedulerBase::RegCritical;
+
+      return true;
+    }
+
+    if (CandHeight < TryCandHeight) {
+      TryCand.Reason = GenericSchedulerBase::RegCritical;
+      return true;
+    }
+
+    // Same critical path, just prefer original candidate.
+    if (Cand.Reason > GenericSchedulerBase::RegCritical)
+      Cand.Reason = GenericSchedulerBase::RegCritical;
+
+    return true;
+  };
+
+  for (unsigned I = 0; I < HWUInfo.size(); I++) {
+    // A resource without a pending target SU is not critical; skip it and
+    // consider the next resource in HWUInfo order.
+    if (!HasPrioritySU(I))
+      continue;
+
+    bool Enabled = TryEnablesResource(I);
+    // A preference was recorded for this resource; otherwise try the next one.
+    if (Enabled)
+      return true;
+  }
+  return false;
+}
+
+bool CandidateHeuristics::tryCriticalResource(
+    GenericSchedulerBase::SchedCandidate &TryCand,
+    GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const {
+  for (unsigned I = 0; I < HWUInfo.size(); I++) {
+    const HardwareUnitInfo &HWUI = HWUInfo[I];
+
+    bool CandUsesCrit = HWUI.contains(Cand.SU);
+    bool TryCandUsesCrit = HWUI.contains(TryCand.SU);
+
+    if (!CandUsesCrit && !TryCandUsesCrit)
+      continue;
+
+    if (CandUsesCrit != TryCandUsesCrit) {
+      if (CandUsesCrit) {
+        if (Cand.Reason > GenericSchedulerBase::RegCritical)
+          Cand.Reason = GenericSchedulerBase::RegCritical;
+        return true;
+      }
+      TryCand.Reason = GenericSchedulerBase::RegCritical;
+      return true;
+    }
+
+    // Otherwise, both use the critical resource
+    // For longer latency InstructionFlavors, we should prioritize first by
+    // their enablement of critical resources
+    if (HWUI.getType() == InstructionFlavor::DS) {
+      if (tryCriticalResourceDependency(TryCand, Cand, Zone))
+        return true;
+    }
+
+    // Prioritize based on HWUI priorities.
+    SUnit *Match = HWUI.getHigherPriority(Cand.SU, TryCand.SU);
+    if (Match) {
+      if (Match == Cand.SU) {
+        if (Cand.Reason > GenericSchedulerBase::RegCritical)
+          Cand.Reason = GenericSchedulerBase::RegCritical;
+        return true;
+      }
+      TryCand.Reason = GenericSchedulerBase::RegCritical;
+      return true;
+    }
+  }
+
+  return false;
+}
+
 AMDGPUCoExecSchedStrategy::AMDGPUCoExecSchedStrategy(
     const MachineSchedContext *C)
     : GCNSchedStrategy(C) {
@@ -68,6 +433,12 @@ void AMDGPUCoExecSchedStrategy::initialize(ScheduleDAGMI *DAG) {
   RegionPolicy.OnlyBottomUp = false;
 
   GCNSchedStrategy::initialize(DAG);
+  Heurs.initialize(DAG, SchedModel, TRI);
+}
+
+void AMDGPUCoExecSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
+  Heurs.updateForScheduling(SU);
+  GCNSchedStrategy::schedNode(SU, IsTopNode);
 }
 
 SUnit *AMDGPUCoExecSchedStrategy::pickNode(bool &IsTopNode) {
@@ -82,6 +453,9 @@ SUnit *AMDGPUCoExecSchedStrategy::pickNode(bool &IsTopNode) {
 
   bool PickedPending = false;
   SUnit *SU = nullptr;
+#ifndef NDEBUG
+  SchedCandidate *PickedCand = nullptr;
+#endif
   do {
     PickedPending = false;
     SU = pickOnlyChoice(Top);
@@ -92,10 +466,15 @@ SUnit *AMDGPUCoExecSchedStrategy::pickNode(bool &IsTopNode) {
                         PickedPending, /*IsBottomUp=*/false);
       assert(TopCand.Reason != NoCand && "failed to find a candidate");
       SU = TopCand.SU;
+#ifndef NDEBUG
+      PickedCand = &TopCand;
+#endif
     }
     IsTopNode = true;
   } while (SU->isScheduled);
 
+  LLVM_DEBUG(if (PickedCand) dumpPickSummary(SU, IsTopNode, *PickedCand));
+
   if (PickedPending) {
     unsigned ReadyCycle = SU->TopReadyCycle;
     unsigned CurrentCycle = Top.getCurrCycle();
@@ -149,7 +528,7 @@ void AMDGPUCoExecSchedStrategy::pickNodeFromQueue(
       initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
                     VGPRPressure, IsBottomUp);
       SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
-      tryCandidate(Cand, TryCand, ZoneArg);
+      tryCandidateCoexec(Cand, TryCand, ZoneArg);
       if (TryCand.Reason != NoCand) {
         if (TryCand.ResDelta == SchedResourceDelta())
           TryCand.initResourceDelta(Zone.DAG, SchedModel);
@@ -157,7 +536,7 @@ void AMDGPUCoExecSchedStrategy::pickNodeFromQueue(
         PickedPending = FromPending;
         Cand.setBest(TryCand);
       } else {
-        printCandidateDecision(TryCand, Cand);
+        LLVM_DEBUG(printCandidateDecision(TryCand, Cand));
       }
     }
   };
@@ -169,9 +548,36 @@ void AMDGPUCoExecSchedStrategy::pickNodeFromQueue(
   EvaluateQueue(Zone.Pending, /*FromPending=*/true);
 }
 
-bool AMDGPUCoExecSchedStrategy::tryCandidate(SchedCandidate &Cand,
-                                             SchedCandidate &TryCand,
-                                             SchedBoundary *Zone) const {
+#ifndef NDEBUG
+void AMDGPUCoExecSchedStrategy::dumpPickSummary(SUnit *SU, bool IsTopNode,
+                                                SchedCandidate &Cand) {
+  const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
+  unsigned Cycle = IsTopNode ? Top.getCurrCycle() : Bot.getCurrCycle();
+
+  dbgs() << "=== Pick @ Cycle " << Cycle << " ===\n";
+
+  const InstructionFlavor Flavor = classifyFlavor(*SU->getInstr(), *SII);
+  dbgs() << "Picked: SU(" << SU->NodeNum << ") ";
+  SU->getInstr()->print(dbgs(), /*IsStandalone=*/true, /*SkipOpers=*/false,
+                        /*SkipDebugLoc=*/true);
+  dbgs() << " [" << getFlavorName(Flavor) << "]\n";
+
+  dbgs() << "  Reason: ";
+  if (LastAMDGPUReason != AMDGPUSchedReason::None)
+    dbgs() << getReasonName(LastAMDGPUReason);
+  else if (Cand.Reason != NoCand)
+    dbgs() << GenericSchedulerBase::getReasonStr(Cand.Reason);
+  else
+    dbgs() << "Unknown";
+  dbgs() << "\n\n";
+
+  LastAMDGPUReason = AMDGPUSchedReason::None;
+}
+#endif
+
+bool AMDGPUCoExecSchedStrategy::tryCandidateCoexec(SchedCandidate &Cand,
+                                                   SchedCandidate &TryCand,
+                                                   SchedBoundary *Zone) {
   // Initialize the candidate if needed.
   if (!Cand.isValid()) {
     TryCand.Reason = FirstValid;
@@ -196,17 +602,21 @@ bool AMDGPUCoExecSchedStrategy::tryCandidate(SchedCandidate &Cand,
   // "tie-breaking" in nature.
   bool SameBoundary = Zone != nullptr;
   if (SameBoundary) {
-    // For loops that are acyclic path limited, aggressively schedule for
-    // latency. Within an single cycle, whenever CurrMOps > 0, allow normal
-    // heuristics to take precedence.
-    if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
-        tryLatency(TryCand, Cand, *Zone))
-      return TryCand.Reason != NoCand;
-
-    // Otherwise compare candidates by the stall they would introduce if
+    // Compare candidates by the stall they would introduce if
     // scheduled in the current cycle.
     if (tryEffectiveStall(Cand, TryCand, *Zone))
       return TryCand.Reason != NoCand;
+
+    Heurs.sortHWUIResources();
+    if (Heurs.tryCriticalResource(TryCand, Cand, Zone)) {
+      LastAMDGPUReason = AMDGPUSchedReason::CritResourceBalance;
+      return TryCand.Reason != NoCand;
+    }
+
+    if (Heurs.tryCriticalResourceDependency(TryCand, Cand, Zone)) {
+      LastAMDGPUReason = AMDGPUSchedReason::CritResourceDep;
+      return TryCand.Reason != NoCand;
+    }
   }
 
   // Keep clustered nodes together to encourage downstream peephole
@@ -240,16 +650,6 @@ bool AMDGPUCoExecSchedStrategy::tryCandidate(SchedCandidate &Cand,
     return TryCand.Reason != NoCand;
 
   if (SameBoundary) {
-    // Avoid critical resource consumption and balance the schedule.
-    TryCand.initResourceDelta(DAG, SchedModel);
-    if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
-                TryCand, Cand, ResourceReduce))
-      return TryCand.Reason != NoCand;
-    if (tryGreater(TryCand.ResDelta.DemandedResources,
-                   Cand.ResDelta.DemandedResources, TryCand, Cand,
-                   ResourceDemand))
-      return TryCand.Reason != NoCand;
-
     // Avoid serializing long latency dependence chains.
     // For acyclic path limited loops, latency was already checked above.
     if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
index 07252c3fb45a..1684690cd829 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
@@ -19,12 +19,297 @@
 
 namespace llvm {
 
+namespace AMDGPU {
+
+//===----------------------------------------------------------------------===//
+// Instruction Flavor Classification
+//===----------------------------------------------------------------------===//
+
+enum class InstructionFlavor : uint8_t {
+  WMMA,            // WMMA/MFMA matrix operations
+  SingleCycleVALU, // Single-cycle VALU (not TRANS32, not multi-cycle CVT)
+  TRANS,           // Transcendental ops (v_exp, v_log, etc.)
+  MultiCycleVALU,  // VALU instructions with repeat rate > 1
+  VMEM,            // FLAT/GLOBAL memory operations
+  DS,              // LDS/GDS operations
+  SALU,            // Scalar ALU
+  DMA,             // Tensor DMA operations
+  Fence,           // Fences and waits
+  Other,           // Everything else
+  NUM_FLAVORS
+};
+
+inline StringRef getFlavorName(InstructionFlavor F) {
+  switch (F) {
+  case InstructionFlavor::WMMA:
+    return "WMMA";
+  case InstructionFlavor::SingleCycleVALU:
+    return "VALU(1c)";
+  case InstructionFlavor::TRANS:
+    return "TRANS";
+  case InstructionFlavor::MultiCycleVALU:
+    return "VALU(Nc)";
+  case InstructionFlavor::VMEM:
+    return "VMEM";
+  case InstructionFlavor::DS:
+    return "DS";
+  case InstructionFlavor::SALU:
+    return "SALU";
+  case InstructionFlavor::DMA:
+    return "DMA";
+  case InstructionFlavor::Fence:
+    return "Fence";
+  case InstructionFlavor::Other:
+    return "Other";
+  case InstructionFlavor::NUM_FLAVORS:
+    return "???";
+  }
+  llvm_unreachable("Unknown InstructionFlavor");
+}
+
+inline StringRef getFlavorShortName(InstructionFlavor F) {
+  switch (F) {
+  case InstructionFlavor::WMMA:
+    return "W";
+  case InstructionFlavor::SingleCycleVALU:
+    return "V";
+  case InstructionFlavor::TRANS:
+    return "T";
+  case InstructionFlavor::MultiCycleVALU:
+    return "C";
+  case InstructionFlavor::VMEM:
+    return "M";
+  case InstructionFlavor::DS:
+    return "D";
+  case InstructionFlavor::SALU:
+    return "S";
+  case InstructionFlavor::DMA:
+    return "X";
+  case InstructionFlavor::Fence:
+    return "F";
+  case InstructionFlavor::Other:
+    return "O";
+  case InstructionFlavor::NUM_FLAVORS:
+    return "?";
+  }
+  llvm_unreachable("Unknown InstructionFlavor");
+}
+
+InstructionFlavor classifyFlavor(const MachineInstr &MI,
+                                 const SIInstrInfo &SII);
+
+using FlavorGroup = SmallVector<InstructionFlavor, 4>;
+
+namespace FlavorGroups {
+inline FlavorGroup allVALU() {
+  return {InstructionFlavor::SingleCycleVALU, InstructionFlavor::TRANS,
+          InstructionFlavor::MultiCycleVALU};
+}
+inline FlavorGroup allMem() {
+  return {InstructionFlavor::VMEM, InstructionFlavor::DS,
+          InstructionFlavor::DMA};
+}
+inline FlavorGroup individual(InstructionFlavor F) { return {F}; }
+inline FlavorGroup all() {
+  FlavorGroup G;
+  for (unsigned I = 0;
+       I < static_cast<unsigned>(InstructionFlavor::NUM_FLAVORS); ++I)
+    G.push_back(static_cast<InstructionFlavor>(I));
+  return G;
+}
+} // namespace FlavorGroups
+
+/// AMDGPU-specific scheduling decision reasons. These provide more granularity
+/// than the generic CandReason enum for debugging purposes.
+enum class AMDGPUSchedReason : uint8_t {
+  None,
+  CritResourceBalance, // tryCriticalResource chose based on resource pressure
+  CritResourceDep,     // tryCriticalResourceDependency chose based on enabling
+  NUM_REASONS
+};
+
+inline StringRef getReasonName(AMDGPUSchedReason R) {
+  switch (R) {
+  case AMDGPUSchedReason::None:
+    return "None";
+  case AMDGPUSchedReason::CritResourceBalance:
+    return "CritResource";
+  case AMDGPUSchedReason::CritResourceDep:
+    return "CritResourceDep";
+  case AMDGPUSchedReason::NUM_REASONS:
+    return "???";
+  }
+  llvm_unreachable("Unknown AMDGPUSchedReason");
+}
+
+} // End namespace AMDGPU
+
+//===----------------------------------------------------------------------===//
+// Hardware Unit Information
+//===----------------------------------------------------------------------===//
+
+/// HardwareUnitInfo is a wrapper class which maps to some real hardware
+/// resource. This is used to model hardware resource pressure per region, and
+/// guide scheduling heuristics.
+class HardwareUnitInfo {
+private:
+  /// PrioritySUs maintains a list of the SUs we want to prioritize scheduling
+  /// for this HardwareUnit. This is used for agreement between
+  /// tryCriticalResourceDependency and tryCriticalResource: we schedule the
+  /// dependencies for a SU on critical resource, then schedule that same SU on
+  /// the critical resource. This agreement results in shorter live ranges and
+  /// more regular HardwareUnit access patterns. SUs are prioritized based on
+  /// depth for top-down scheduling.
+  SmallSetVector<SUnit *, 16> PrioritySUs;
+  /// All the SUs in the region that consume this resource
+  SmallSetVector<SUnit *, 16> AllSUs;
+  /// The total number of busy cycles for this HardwareUnit for a given region.
+  unsigned TotalCycles = 0;
+  // InstructionFlavor mapping; matches the value restored by reset().
+  AMDGPU::InstructionFlavor Type = AMDGPU::InstructionFlavor::Other;
+  // Whether or not instructions on this HardwareUnit may produce a window in
+  // which instructions in other HardwareUnits can coexecute. For example, WMMA
+  // / MFMA instructions may take multiple cycles, which may be overlapped with
+  // instructions on other HardwareUnits
+  bool ProducesCoexecWindow = false;
+
+public:
+  HardwareUnitInfo() = default;
+
+  unsigned size() const { return AllSUs.size(); }
+
+  unsigned getTotalCycles() const { return TotalCycles; }
+
+  void setType(unsigned TheType) {
+    assert(TheType < (unsigned)AMDGPU::InstructionFlavor::NUM_FLAVORS);
+    Type = (AMDGPU::InstructionFlavor)(TheType);
+  }
+
+  AMDGPU::InstructionFlavor getType() const { return Type; }
+
+  bool producesCoexecWindow() const { return ProducesCoexecWindow; }
+
+  void setProducesCoexecWindow(bool Val) { ProducesCoexecWindow = Val; }
+
+  bool contains(SUnit *SU) const { return AllSUs.contains(SU); }
+
+  /// \returns whichever of \p SU and \p Other is found first when walking
+  /// PrioritySUs in order, i.e. the SUnit with higher priority on this
+  /// HardwareUnit. If neither is in the PrioritySUs list, then neither has
+  /// priority over the other and nullptr is returned. If \p SU and \p Other
+  /// are the same SUnit and it is in the list, that SUnit is returned.
+  SUnit *getHigherPriority(SUnit *SU, SUnit *Other) const {
+    for (auto *SUOrder : PrioritySUs) {
+      if (SUOrder == SU)
+        return SU;
+
+      if (SUOrder == Other)
+        return Other;
+    }
+    return nullptr;
+  }
+
+  void reset() {
+    AllSUs.clear();
+    PrioritySUs.clear();
+    TotalCycles = 0;
+    Type = AMDGPU::InstructionFlavor::Other;
+    ProducesCoexecWindow = false;
+  }
+
+  /// \returns the next SU in PrioritySUs that is not ready. If \p LookDeep is
+  /// set, we will look beyond the PrioritySUs (if all the PrioritySUs are
+  /// ready) to AllSUs to attempt to find a target SU. When looking through
+  /// AllSUs we pick the target SU by minimal depth for top-down
+  /// scheduling. getNextTargetSU is useful for determining which SU on this
+  /// HardwareUnit we are trying to schedule - this info helps us determine
+  /// which dependencies to schedule. LookDeep is useful if the dependencies are
+  /// long latency (e.g. memory instructions). If we have many long latency
+  /// dependencies, it is beneficial to enable SUs multiple levels ahead.
+  SUnit *getNextTargetSU(bool LookDeep = false) const;
+  /// Insert the \p SU into the AllSUs and account its \p BlockingCycles into
+  /// the TotalCycles. This maintains the list of PrioritySUs.
+  void insert(SUnit *SU, unsigned BlockingCycles);
+  /// Update the state for \p SU being scheduled by removing it from the AllSUs
+  /// and reducing its \p BlockingCycles from the TotalCycles. This maintains
+  /// the list of PrioritySUs.
+  void markScheduled(SUnit *SU, unsigned BlockingCycles);
+};
+
+//===----------------------------------------------------------------------===//
+// Candidate Heuristics
+//===----------------------------------------------------------------------===//
+
+/// CandidateHeuristics contains state and implementations to facilitate making
+/// per instruction scheduling decisions; it contains methods used in
+/// tryCandidate to decide which instruction to schedule next.
+class CandidateHeuristics {
+protected:
+  ScheduleDAGMI *DAG = nullptr;
+  const SIInstrInfo *SII = nullptr;
+  const SIRegisterInfo *SRI = nullptr;
+  const TargetSchedModel *SchedModel = nullptr;
+  SmallVector<HardwareUnitInfo, 8> HWUInfo;
+
+  /// Walk over the region and collect total usage per HardwareUnit
+  void collectHWUIPressure();
+
+  /// Compute the blocking cycles for the appropriate HardwareUnit given an \p
+  /// SU
+  unsigned getHWUICyclesForInst(SUnit *SU);
+
+  /// Given a \p Flavor , find the corresponding HardwareUnit. \returns the
+  /// mapped HardwareUnit.
+  HardwareUnitInfo *getHWUIFromFlavor(AMDGPU::InstructionFlavor Flavor);
+
+public:
+  CandidateHeuristics() = default;
+
+  void initialize(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel,
+                  const TargetRegisterInfo *TRI);
+
+  /// Update the state to reflect that \p SU is going to be scheduled.
+  void updateForScheduling(SUnit *SU);
+
+  /// Sort the HWUInfo vector. After sorting, the HardwareUnits that are highest
+  /// priority are first. Priority is determined by maximizing coexecution and
+  /// keeping the critical HardwareUnit busy.
+  void sortHWUIResources();
+
+  /// Check for critical resource consumption. Prefer the candidate that uses
+  /// the most prioritized HardwareUnit. If both candidates use the same
+  /// HardwareUnit, prefer the candidate with higher priority on that
+  /// HardwareUnit.
+  bool tryCriticalResource(GenericSchedulerBase::SchedCandidate &TryCand,
+                           GenericSchedulerBase::SchedCandidate &Cand,
+                           SchedBoundary *Zone) const;
+
+  /// Check for dependencies of instructions that use prioritized HardwareUnits.
+  /// Prefer the candidate that is a dependency of an instruction that uses the
+  /// most prioritized HardwareUnit. If both candidates enable the same
+  /// HardwareUnit, prefer the candidate that enables the higher priority
+  /// instruction on that HardwareUnit.
+  bool
+  tryCriticalResourceDependency(GenericSchedulerBase::SchedCandidate &TryCand,
+                                GenericSchedulerBase::SchedCandidate &Cand,
+                                SchedBoundary *Zone) const;
+
+  void dumpRegionSummary();
+};
+
 class AMDGPUCoExecSchedStrategy final : public GCNSchedStrategy {
 protected:
-  bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
-                    SchedBoundary *Zone) const override;
   bool tryEffectiveStall(SchedCandidate &Cand, SchedCandidate &TryCand,
                          SchedBoundary &Zone) const;
+  AMDGPU::AMDGPUSchedReason LastAMDGPUReason = AMDGPU::AMDGPUSchedReason::None;
+  CandidateHeuristics Heurs;
+
+#ifndef NDEBUG
+  void dumpPickSummary(SUnit *SU, bool IsTopNode, SchedCandidate &Cand);
+#endif
+
+  bool tryCandidateCoexec(SchedCandidate &Cand, SchedCandidate &TryCand,
+                          SchedBoundary *Zone);
   void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
                          const RegPressureTracker &RPTracker,
                          SchedCandidate &Cand, bool &PickedPending,
@@ -38,6 +323,7 @@ public:
                   unsigned NumRegionInstrs) override;
   void initialize(ScheduleDAGMI *DAG) override;
   SUnit *pickNode(bool &IsTopNode) override;
+  void schedNode(SUnit *SU, bool IsTopNode) override;
 };
 
 ScheduleDAGInstrs *createGCNCoExecMachineScheduler(MachineSchedContext *C);
diff --git a/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
index f9f9a27e9af4..0a6f2fe9375d 100644
--- a/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
+++ b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
@@ -38,6 +38,7 @@ body: |
     ; COEXEC-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF4:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+    ; COEXEC-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; COEXEC-NEXT: [[DEF5:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF7:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
@@ -45,7 +46,6 @@ body: |
     ; COEXEC-NEXT: [[DEF9:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF10]], 0, 0, implicit $exec
-    ; COEXEC-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; COEXEC-NEXT: early-clobber %14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; COEXEC-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
     ; COEXEC-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F32_]], implicit %13, implicit %14
@@ -90,19 +90,19 @@ body: |
     ; DEFAULT-NEXT: S_ENDPGM 0, implicit %10, implicit %11
     ;
     ; COEXEC-LABEL: name: test-sched-pending-structural-stall
-    ; COEXEC: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; COEXEC: S_NOP 0
+    ; COEXEC-NEXT: S_NOP 0
+    ; COEXEC-NEXT: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF4:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+    ; COEXEC-NEXT: early-clobber %10:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; COEXEC-NEXT: [[DEF5:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF7:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
     ; COEXEC-NEXT: [[DEF9:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
-    ; COEXEC-NEXT: early-clobber %10:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    ; COEXEC-NEXT: S_NOP 0
-    ; COEXEC-NEXT: S_NOP 0
     ; COEXEC-NEXT: early-clobber %11:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; COEXEC-NEXT: S_ENDPGM 0, implicit %10, implicit %11
     %0:vreg_512_align2 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll b/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll
new file mode 100644
index 000000000000..c1e7bc005998
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll
@@ -0,0 +1,606 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -amdgpu-sched-strategy=coexec --enable-post-misched=0 --verify-misched  < %s | FileCheck -check-prefix=COEXEC %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250  < %s | FileCheck -check-prefix=GCN %s
+
+; ds_wmma: single-block loop that issues 16 ds.load.tr16.b128 loads from addrspace(3) (64-byte stride), concatenates them pairwise into <16 x half> operands, and feeds four independent accumulator chains of two dependent WMMAs each.
+define amdgpu_kernel void @ds_wmma(ptr addrspace(3) %base, ptr addrspace(1) %out, i1 %br0, i32 %delta) local_unnamed_addr #0 {
+; COEXEC-LABEL: ds_wmma:
+; COEXEC:       ; %bb.0: ; %entry
+; COEXEC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; COEXEC-NEXT:    v_mov_b32_e32 v0, 0
+; COEXEC-NEXT:    s_clause 0x1
+; COEXEC-NEXT:    s_load_b32 s2, s[4:5], 0x0 nv
+; COEXEC-NEXT:    s_load_b64 s[0:1], s[4:5], 0x10 nv
+; COEXEC-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; COEXEC-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v0
+; COEXEC-NEXT:    s_wait_kmcnt 0x0
+; COEXEC-NEXT:    s_bitcmp1_b32 s0, 0
+; COEXEC-NEXT:    s_cselect_b32 s0, -1, 0
+; COEXEC-NEXT:    v_mov_b32_e32 v5, v0
+; COEXEC-NEXT:    s_xor_b32 s0, s0, -1
+; COEXEC-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; COEXEC-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
+; COEXEC-NEXT:    v_mov_b32_e32 v6, v0
+; COEXEC-NEXT:    v_cmp_ne_u32_e64 s0, 1, v7
+; COEXEC-NEXT:    v_dual_mov_b32 v7, v0 :: v_dual_mov_b32 v8, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v24, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v9, v0 :: v_dual_mov_b32 v17, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v25, v0 :: v_dual_mov_b32 v10, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v26, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v11, v0 :: v_dual_mov_b32 v19, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v27, v0 :: v_dual_mov_b32 v12, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v20, v0 :: v_dual_mov_b32 v28, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v13, v0 :: v_dual_mov_b32 v21, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v29, v0 :: v_dual_mov_b32 v14, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v22, v0 :: v_dual_mov_b32 v30, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v15, v0 :: v_dual_mov_b32 v23, v0
+; COEXEC-NEXT:    v_mov_b32_e32 v31, v0
+; COEXEC-NEXT:  .LBB0_1: ; %loop
+; COEXEC-NEXT:    ; =>This Inner Loop Header: Depth=1
+; COEXEC-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; COEXEC-NEXT:    v_nop
+; COEXEC-NEXT:    v_nop
+; COEXEC-NEXT:    v_nop
+; COEXEC-NEXT:    v_nop
+; COEXEC-NEXT:    v_mov_b32_e32 v92, s2
+; COEXEC-NEXT:    s_add_co_i32 s2, s2, s1
+; COEXEC-NEXT:    ds_load_tr16_b128 v[32:35], v92 offset:128
+; COEXEC-NEXT:    ds_load_tr16_b128 v[40:43], v92
+; COEXEC-NEXT:    ds_load_tr16_b128 v[36:39], v92 offset:192
+; COEXEC-NEXT:    ds_load_tr16_b128 v[44:47], v92 offset:64
+; COEXEC-NEXT:    ds_load_tr16_b128 v[48:51], v92 offset:384
+; COEXEC-NEXT:    ds_load_tr16_b128 v[56:59], v92 offset:256
+; COEXEC-NEXT:    ds_load_tr16_b128 v[52:55], v92 offset:448
+; COEXEC-NEXT:    ds_load_tr16_b128 v[60:63], v92 offset:320
+; COEXEC-NEXT:    ds_load_tr16_b128 v[64:67], v92 offset:640
+; COEXEC-NEXT:    ds_load_tr16_b128 v[72:75], v92 offset:512
+; COEXEC-NEXT:    ds_load_tr16_b128 v[68:71], v92 offset:704
+; COEXEC-NEXT:    ds_load_tr16_b128 v[76:79], v92 offset:576
+; COEXEC-NEXT:    ds_load_tr16_b128 v[80:83], v92 offset:896
+; COEXEC-NEXT:    ds_load_tr16_b128 v[88:91], v92 offset:768
+; COEXEC-NEXT:    ds_load_tr16_b128 v[84:87], v92 offset:960
+; COEXEC-NEXT:    ds_load_tr16_b128 v[92:95], v92 offset:832
+; COEXEC-NEXT:    s_wait_dscnt 0xc
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[24:31], v[40:47], v[32:39], v[24:31]
+; COEXEC-NEXT:    s_wait_dscnt 0x8
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[56:63], v[48:55], v[16:23]
+; COEXEC-NEXT:    s_wait_dscnt 0x4
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[8:15], v[72:79], v[64:71], v[8:15]
+; COEXEC-NEXT:    s_wait_dscnt 0x0
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[0:7], v[88:95], v[80:87], v[0:7]
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[24:31], v[40:47], v[32:39], v[24:31]
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[56:63], v[48:55], v[16:23]
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[8:15], v[72:79], v[64:71], v[8:15]
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[0:7], v[88:95], v[80:87], v[0:7]
+; COEXEC-NEXT:    s_cbranch_vccnz .LBB0_1
+; COEXEC-NEXT:  ; %bb.2: ; %end
+; COEXEC-NEXT:    v_nop
+; COEXEC-NEXT:    v_mov_b32_e32 v32, 0
+; COEXEC-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8 nv
+; COEXEC-NEXT:    s_wait_kmcnt 0x0
+; COEXEC-NEXT:    s_clause 0x7
+; COEXEC-NEXT:    global_store_b128 v32, v[28:31], s[0:1] offset:16
+; COEXEC-NEXT:    global_store_b128 v32, v[24:27], s[0:1]
+; COEXEC-NEXT:    global_store_b128 v32, v[20:23], s[0:1] offset:144
+; COEXEC-NEXT:    global_store_b128 v32, v[16:19], s[0:1] offset:128
+; COEXEC-NEXT:    global_store_b128 v32, v[12:15], s[0:1] offset:272
+; COEXEC-NEXT:    global_store_b128 v32, v[8:11], s[0:1] offset:256
+; COEXEC-NEXT:    global_store_b128 v32, v[4:7], s[0:1] offset:400
+; COEXEC-NEXT:    global_store_b128 v32, v[0:3], s[0:1] offset:384
+; COEXEC-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; COEXEC-NEXT:    s_endpgm
+;
+; GCN-LABEL: ds_wmma:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x10 nv
+; GCN-NEXT:    s_load_b32 s2, s[4:5], 0x0 nv
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
+; GCN-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v0
+; GCN-NEXT:    v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v0
+; GCN-NEXT:    v_dual_mov_b32 v7, v0 :: v_dual_mov_b32 v8, v0
+; GCN-NEXT:    v_dual_mov_b32 v9, v0 :: v_dual_mov_b32 v10, v0
+; GCN-NEXT:    v_dual_mov_b32 v11, v0 :: v_dual_mov_b32 v12, v0
+; GCN-NEXT:    v_dual_mov_b32 v13, v0 :: v_dual_mov_b32 v14, v0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_bitcmp1_b32 s0, 0
+; GCN-NEXT:    v_dual_mov_b32 v15, v0 :: v_dual_mov_b32 v16, v0
+; GCN-NEXT:    s_cselect_b32 s0, -1, 0
+; GCN-NEXT:    v_dual_mov_b32 v17, v0 :: v_dual_mov_b32 v18, v0
+; GCN-NEXT:    s_xor_b32 s0, s0, -1
+; GCN-NEXT:    v_dual_mov_b32 v19, v0 :: v_dual_mov_b32 v20, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s0
+; GCN-NEXT:    v_dual_mov_b32 v21, v0 :: v_dual_mov_b32 v22, v0
+; GCN-NEXT:    v_dual_mov_b32 v23, v0 :: v_dual_mov_b32 v25, v0
+; GCN-NEXT:    v_mov_b32_e32 v26, v0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GCN-NEXT:    v_cmp_ne_u32_e64 s0, 1, v24
+; GCN-NEXT:    v_dual_mov_b32 v24, v0 :: v_dual_mov_b32 v27, v0
+; GCN-NEXT:    v_dual_mov_b32 v28, v0 :: v_dual_mov_b32 v29, v0
+; GCN-NEXT:    v_dual_mov_b32 v30, v0 :: v_dual_mov_b32 v31, v0
+; GCN-NEXT:  .LBB0_1: ; %loop
+; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    v_nop
+; GCN-NEXT:    v_nop
+; GCN-NEXT:    v_nop
+; GCN-NEXT:    v_nop
+; GCN-NEXT:    v_mov_b32_e32 v92, s2
+; GCN-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; GCN-NEXT:    s_add_co_i32 s2, s2, s1
+; GCN-NEXT:    ds_load_tr16_b128 v[32:35], v92
+; GCN-NEXT:    ds_load_tr16_b128 v[36:39], v92 offset:64
+; GCN-NEXT:    ds_load_tr16_b128 v[40:43], v92 offset:128
+; GCN-NEXT:    ds_load_tr16_b128 v[44:47], v92 offset:192
+; GCN-NEXT:    ds_load_tr16_b128 v[48:51], v92 offset:256
+; GCN-NEXT:    ds_load_tr16_b128 v[52:55], v92 offset:320
+; GCN-NEXT:    ds_load_tr16_b128 v[56:59], v92 offset:384
+; GCN-NEXT:    ds_load_tr16_b128 v[60:63], v92 offset:448
+; GCN-NEXT:    ds_load_tr16_b128 v[64:67], v92 offset:512
+; GCN-NEXT:    ds_load_tr16_b128 v[68:71], v92 offset:576
+; GCN-NEXT:    ds_load_tr16_b128 v[72:75], v92 offset:640
+; GCN-NEXT:    ds_load_tr16_b128 v[76:79], v92 offset:704
+; GCN-NEXT:    ds_load_tr16_b128 v[80:83], v92 offset:768
+; GCN-NEXT:    ds_load_tr16_b128 v[84:87], v92 offset:832
+; GCN-NEXT:    ds_load_tr16_b128 v[88:91], v92 offset:896
+; GCN-NEXT:    ds_load_tr16_b128 v[92:95], v92 offset:960
+; GCN-NEXT:    s_wait_dscnt 0xc
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31]
+; GCN-NEXT:    s_wait_dscnt 0x8
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23]
+; GCN-NEXT:    s_wait_dscnt 0x4
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15]
+; GCN-NEXT:    s_wait_dscnt 0x0
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7]
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31]
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23]
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15]
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7]
+; GCN-NEXT:    s_cbranch_vccnz .LBB0_1
+; GCN-NEXT:  ; %bb.2: ; %end
+; GCN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8 nv
+; GCN-NEXT:    v_nop
+; GCN-NEXT:    v_mov_b32_e32 v32, 0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_clause 0x7
+; GCN-NEXT:    global_store_b128 v32, v[28:31], s[0:1] offset:16
+; GCN-NEXT:    global_store_b128 v32, v[24:27], s[0:1]
+; GCN-NEXT:    global_store_b128 v32, v[20:23], s[0:1] offset:144
+; GCN-NEXT:    global_store_b128 v32, v[16:19], s[0:1] offset:128
+; GCN-NEXT:    global_store_b128 v32, v[12:15], s[0:1] offset:272
+; GCN-NEXT:    global_store_b128 v32, v[8:11], s[0:1] offset:256
+; GCN-NEXT:    global_store_b128 v32, v[4:7], s[0:1] offset:400
+; GCN-NEXT:    global_store_b128 v32, v[0:3], s[0:1] offset:384
+; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT:    s_endpgm
+entry:
+  ; The entry block does no work of its own; it only branches into the loop.
+  br label %loop
+  ; Loop-carried state: the byte offset into %base (starts at 0) and four zero-initialised <8 x float> accumulators; the back-edge condition is the kernel argument %br0.
+loop:
+  %baseOff = phi i32 [ 0, %entry ], [ %newBaseOff, %loop ]
+  %wvec0 = phi <8 x float> [ <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %wmma01,  %loop ]
+  %wvec1 = phi <8 x float> [ <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %wmma11,  %loop ]
+  %wvec2 = phi <8 x float> [ <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %wmma21,  %loop ]
+  %wvec3 = phi <8 x float> [ <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %wmma31,  %loop ]
+  %p0 = getelementptr inbounds nuw i8, ptr addrspace(3) %base, i32 %baseOff
+  %p1 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 64
+  %p2 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 128
+  %p3 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 192
+  %p4 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 256
+  %p5 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 320
+  %p6 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 384
+  %p7 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 448
+  %p8 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 512
+  %p9 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 576
+  %p10 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 640
+  %p11 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 704
+  %p12 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 768
+  %p13 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 832
+  %p14 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 896
+  %p15 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 960
+  %l0 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) %p0)
+  %l1 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p1)
+  %l2 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p2)
+  %l3 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p3)
+  %l4 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p4)
+  %l5 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p5)
+  %l6 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p6)
+  %l7 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p7)
+  %l8 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p8)
+  %l9 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p9)
+  %l10 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p10)
+  %l11 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p11)
+  %l12 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p12)
+  %l13 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p13)
+  %l14 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p14)
+  %l15 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p15)
+  %vec0 = shufflevector <8 x half> %l0, <8 x half> %l1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec1 = shufflevector <8 x half> %l2, <8 x half> %l3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec2 = shufflevector <8 x half> %l4, <8 x half> %l5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec3 = shufflevector <8 x half> %l6, <8 x half> %l7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec4 = shufflevector <8 x half> %l8, <8 x half> %l9, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec5 = shufflevector <8 x half> %l10, <8 x half> %l11, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec6 = shufflevector <8 x half> %l12, <8 x half> %l13, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec7 = shufflevector <8 x half> %l14, <8 x half> %l15, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %wmma00 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec0, i1 false, <16 x half> %vec1, i16 0, <8 x float> %wvec0, i1 false, i1 false)
+  %wmma01 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec0, i1 false, <16 x half> %vec1, i16 0, <8 x float> %wmma00, i1 false, i1 false)
+  %wmma10 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec2, i1 false, <16 x half> %vec3, i16 0, <8 x float> %wvec1, i1 false, i1 false)
+  %wmma11 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec2, i1 false, <16 x half> %vec3, i16 0, <8 x float> %wmma10, i1 false, i1 false)
+  %wmma20 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec4, i1 false, <16 x half> %vec5, i16 0, <8 x float> %wvec2, i1 false, i1 false)
+  %wmma21 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec4, i1 false, <16 x half> %vec5, i16 0, <8 x float> %wmma20, i1 false, i1 false)
+  %wmma30 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec6, i1 false, <16 x half> %vec7, i16 0, <8 x float> %wvec3, i1 false, i1 false)
+  %wmma31 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec6, i1 false, <16 x half> %vec7, i16 0, <8 x float> %wmma30, i1 false, i1 false)
+  %newBaseOff = or disjoint i32 %baseOff, %delta
+  br i1 %br0, label %loop, label %end
+  ; Exit block: store the four final accumulators to %out at byte offsets 0, 128, 256 and 384.
+end:
+  %out1 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i32 128
+  %out2 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i32 256
+  %out3 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i32 384
+  store <8 x float> %wmma01, ptr addrspace(1) %out, align 16
+  store <8 x float> %wmma11, ptr addrspace(1) %out1, align 16
+  store <8 x float> %wmma21, ptr addrspace(1) %out2, align 16
+  store <8 x float> %wmma31, ptr addrspace(1) %out3, align 16
+  ret void
+}
+
+define amdgpu_kernel void @ds_wmma_permute(ptr addrspace(3) %base, ptr addrspace(3) %base1, ptr addrspace(1) %out, i1 %br0, i32 %delta) local_unnamed_addr #0 {
+; COEXEC-LABEL: ds_wmma_permute:
+; COEXEC:       ; %bb.0: ; %entry
+; COEXEC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; COEXEC-NEXT:    s_mov_b32 s6, 0
+; COEXEC-NEXT:    s_clause 0x1
+; COEXEC-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0 nv
+; COEXEC-NEXT:    s_load_b64 s[0:1], s[4:5], 0x10 nv
+; COEXEC-NEXT:    v_mov_b32_e32 v0, 0
+; COEXEC-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; COEXEC-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
+; COEXEC-NEXT:    v_mov_b32_e32 v3, v0
+; COEXEC-NEXT:    s_wait_kmcnt 0x0
+; COEXEC-NEXT:    s_bitcmp1_b32 s0, 0
+; COEXEC-NEXT:    v_mov_b32_e32 v4, v0
+; COEXEC-NEXT:    s_cselect_b32 s0, -1, 0
+; COEXEC-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; COEXEC-NEXT:    s_xor_b32 s0, s0, -1
+; COEXEC-NEXT:    v_mov_b32_e32 v5, v0
+; COEXEC-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
+; COEXEC-NEXT:    v_mov_b32_e32 v6, v0
+; COEXEC-NEXT:    v_cmp_ne_u32_e64 s0, 1, v7
+; COEXEC-NEXT:    v_dual_mov_b32 v7, v0 :: v_dual_mov_b32 v8, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v24, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v9, v0 :: v_dual_mov_b32 v17, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v25, v0 :: v_dual_mov_b32 v10, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v26, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v11, v0 :: v_dual_mov_b32 v19, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v27, v0 :: v_dual_mov_b32 v12, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v20, v0 :: v_dual_mov_b32 v28, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v13, v0 :: v_dual_mov_b32 v21, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v29, v0 :: v_dual_mov_b32 v14, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v22, v0 :: v_dual_mov_b32 v30, v0
+; COEXEC-NEXT:    v_dual_mov_b32 v15, v0 :: v_dual_mov_b32 v23, v0
+; COEXEC-NEXT:    v_mov_b32_e32 v31, v0
+; COEXEC-NEXT:  .LBB1_1: ; %loop
+; COEXEC-NEXT:    ; =>This Inner Loop Header: Depth=1
+; COEXEC-NEXT:    s_add_co_i32 s7, s2, s6
+; COEXEC-NEXT:    s_add_co_i32 s8, s3, s6
+; COEXEC-NEXT:    s_add_co_i32 s6, s6, s1
+; COEXEC-NEXT:    v_nop
+; COEXEC-NEXT:    v_nop
+; COEXEC-NEXT:    v_nop
+; COEXEC-NEXT:    v_nop
+; COEXEC-NEXT:    v_mov_b32_e32 v124, s7
+; COEXEC-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; COEXEC-NEXT:    v_mov_b32_e32 v156, s8
+; COEXEC-NEXT:    ds_load_tr16_b128 v[32:35], v124
+; COEXEC-NEXT:    ds_load_tr16_b128 v[36:39], v124 offset:64
+; COEXEC-NEXT:    ds_load_tr16_b128 v[40:43], v156
+; COEXEC-NEXT:    ds_load_tr16_b128 v[44:47], v156 offset:64
+; COEXEC-NEXT:    ds_load_tr16_b128 v[48:51], v124 offset:256
+; COEXEC-NEXT:    ds_load_tr16_b128 v[56:59], v156 offset:256
+; COEXEC-NEXT:    ds_load_tr16_b128 v[52:55], v124 offset:320
+; COEXEC-NEXT:    ds_load_tr16_b128 v[60:63], v156 offset:320
+; COEXEC-NEXT:    ds_load_tr16_b128 v[64:67], v124 offset:512
+; COEXEC-NEXT:    ds_load_tr16_b128 v[72:75], v156 offset:512
+; COEXEC-NEXT:    ds_load_tr16_b128 v[68:71], v124 offset:576
+; COEXEC-NEXT:    ds_load_tr16_b128 v[76:79], v156 offset:576
+; COEXEC-NEXT:    ds_load_tr16_b128 v[80:83], v124 offset:768
+; COEXEC-NEXT:    ds_load_tr16_b128 v[88:91], v156 offset:768
+; COEXEC-NEXT:    ds_load_tr16_b128 v[84:87], v124 offset:832
+; COEXEC-NEXT:    ds_load_tr16_b128 v[92:95], v156 offset:832
+; COEXEC-NEXT:    ds_load_tr16_b128 v[96:99], v124 offset:128
+; COEXEC-NEXT:    ds_load_tr16_b128 v[104:107], v124 offset:384
+; COEXEC-NEXT:    ds_load_tr16_b128 v[112:115], v124 offset:640
+; COEXEC-NEXT:    ds_load_tr16_b128 v[120:123], v124 offset:896
+; COEXEC-NEXT:    ds_load_tr16_b128 v[128:131], v156 offset:128
+; COEXEC-NEXT:    ds_load_tr16_b128 v[136:139], v156 offset:384
+; COEXEC-NEXT:    ds_load_tr16_b128 v[144:147], v156 offset:640
+; COEXEC-NEXT:    s_wait_dscnt 0x13
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31]
+; COEXEC-NEXT:    ds_load_tr16_b128 v[152:155], v156 offset:896
+; COEXEC-NEXT:    ds_load_tr16_b128 v[100:103], v124 offset:192
+; COEXEC-NEXT:    ds_load_tr16_b128 v[108:111], v124 offset:448
+; COEXEC-NEXT:    ds_load_tr16_b128 v[116:119], v124 offset:704
+; COEXEC-NEXT:    ds_load_tr16_b128 v[124:127], v124 offset:960
+; COEXEC-NEXT:    ds_load_tr16_b128 v[132:135], v156 offset:192
+; COEXEC-NEXT:    ds_load_tr16_b128 v[140:143], v156 offset:448
+; COEXEC-NEXT:    s_wait_dscnt 0x16
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23]
+; COEXEC-NEXT:    ds_load_tr16_b128 v[148:151], v156 offset:704
+; COEXEC-NEXT:    ds_load_tr16_b128 v[156:159], v156 offset:960
+; COEXEC-NEXT:    s_wait_dscnt 0x14
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15]
+; COEXEC-NEXT:    s_wait_dscnt 0x10
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7]
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31]
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23]
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15]
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7]
+; COEXEC-NEXT:    s_wait_dscnt 0x3
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[24:31], v[96:103], v[128:135], v[24:31]
+; COEXEC-NEXT:    s_wait_dscnt 0x2
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[104:111], v[136:143], v[16:23]
+; COEXEC-NEXT:    s_wait_dscnt 0x1
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[8:15], v[112:119], v[144:151], v[8:15]
+; COEXEC-NEXT:    s_wait_dscnt 0x0
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[0:7], v[120:127], v[152:159], v[0:7]
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[24:31], v[96:103], v[128:135], v[24:31]
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[104:111], v[136:143], v[16:23]
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[8:15], v[112:119], v[144:151], v[8:15]
+; COEXEC-NEXT:    v_wmma_f32_16x16x32_f16 v[0:7], v[120:127], v[152:159], v[0:7]
+; COEXEC-NEXT:    s_cbranch_vccnz .LBB1_1
+; COEXEC-NEXT:  ; %bb.2: ; %end
+; COEXEC-NEXT:    v_mov_b32_e32 v32, 0
+; COEXEC-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8 nv
+; COEXEC-NEXT:    s_wait_kmcnt 0x0
+; COEXEC-NEXT:    s_clause 0x7
+; COEXEC-NEXT:    global_store_b128 v32, v[28:31], s[0:1] offset:16
+; COEXEC-NEXT:    global_store_b128 v32, v[24:27], s[0:1]
+; COEXEC-NEXT:    global_store_b128 v32, v[20:23], s[0:1] offset:144
+; COEXEC-NEXT:    global_store_b128 v32, v[16:19], s[0:1] offset:128
+; COEXEC-NEXT:    global_store_b128 v32, v[12:15], s[0:1] offset:272
+; COEXEC-NEXT:    global_store_b128 v32, v[8:11], s[0:1] offset:256
+; COEXEC-NEXT:    global_store_b128 v32, v[4:7], s[0:1] offset:400
+; COEXEC-NEXT:    global_store_b128 v32, v[0:3], s[0:1] offset:384
+; COEXEC-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; COEXEC-NEXT:    s_endpgm
+;
+; GCN-LABEL: ds_wmma_permute:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GCN-NEXT:    s_clause 0x1
+; GCN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x10 nv
+; GCN-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0 nv
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
+; GCN-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v0
+; GCN-NEXT:    v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v0
+; GCN-NEXT:    v_dual_mov_b32 v7, v0 :: v_dual_mov_b32 v8, v0
+; GCN-NEXT:    v_dual_mov_b32 v9, v0 :: v_dual_mov_b32 v10, v0
+; GCN-NEXT:    v_dual_mov_b32 v11, v0 :: v_dual_mov_b32 v12, v0
+; GCN-NEXT:    v_dual_mov_b32 v13, v0 :: v_dual_mov_b32 v14, v0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_bitcmp1_b32 s0, 0
+; GCN-NEXT:    v_dual_mov_b32 v15, v0 :: v_dual_mov_b32 v16, v0
+; GCN-NEXT:    s_cselect_b32 s0, -1, 0
+; GCN-NEXT:    v_dual_mov_b32 v17, v0 :: v_dual_mov_b32 v18, v0
+; GCN-NEXT:    s_xor_b32 s0, s0, -1
+; GCN-NEXT:    v_dual_mov_b32 v19, v0 :: v_dual_mov_b32 v20, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v24, 0, 1, s0
+; GCN-NEXT:    v_dual_mov_b32 v21, v0 :: v_dual_mov_b32 v22, v0
+; GCN-NEXT:    v_dual_mov_b32 v23, v0 :: v_dual_mov_b32 v25, v0
+; GCN-NEXT:    v_mov_b32_e32 v26, v0
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GCN-NEXT:    v_cmp_ne_u32_e64 s0, 1, v24
+; GCN-NEXT:    v_dual_mov_b32 v24, v0 :: v_dual_mov_b32 v27, v0
+; GCN-NEXT:    v_dual_mov_b32 v28, v0 :: v_dual_mov_b32 v29, v0
+; GCN-NEXT:    v_dual_mov_b32 v30, v0 :: v_dual_mov_b32 v31, v0
+; GCN-NEXT:  .LBB1_1: ; %loop
+; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    s_add_co_i32 s7, s2, s6
+; GCN-NEXT:    s_add_co_i32 s8, s3, s6
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    v_dual_mov_b32 v96, s7 :: v_dual_mov_b32 v97, s8
+; GCN-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; GCN-NEXT:    s_add_co_i32 s6, s6, s1
+; GCN-NEXT:    ds_load_tr16_b128 v[32:35], v96
+; GCN-NEXT:    ds_load_tr16_b128 v[36:39], v96 offset:64
+; GCN-NEXT:    ds_load_tr16_b128 v[40:43], v97
+; GCN-NEXT:    ds_load_tr16_b128 v[44:47], v97 offset:64
+; GCN-NEXT:    ds_load_tr16_b128 v[48:51], v96 offset:256
+; GCN-NEXT:    ds_load_tr16_b128 v[52:55], v96 offset:320
+; GCN-NEXT:    ds_load_tr16_b128 v[56:59], v97 offset:256
+; GCN-NEXT:    ds_load_tr16_b128 v[60:63], v97 offset:320
+; GCN-NEXT:    ds_load_tr16_b128 v[64:67], v96 offset:512
+; GCN-NEXT:    ds_load_tr16_b128 v[68:71], v96 offset:576
+; GCN-NEXT:    ds_load_tr16_b128 v[72:75], v97 offset:512
+; GCN-NEXT:    ds_load_tr16_b128 v[76:79], v97 offset:576
+; GCN-NEXT:    ds_load_tr16_b128 v[80:83], v96 offset:768
+; GCN-NEXT:    ds_load_tr16_b128 v[84:87], v96 offset:832
+; GCN-NEXT:    ds_load_tr16_b128 v[88:91], v97 offset:768
+; GCN-NEXT:    ds_load_tr16_b128 v[92:95], v97 offset:832
+; GCN-NEXT:    s_wait_dscnt 0xc
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31]
+; GCN-NEXT:    s_wait_dscnt 0x8
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23]
+; GCN-NEXT:    s_wait_dscnt 0x4
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15]
+; GCN-NEXT:    s_wait_dscnt 0x0
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7]
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31]
+; GCN-NEXT:    ds_load_tr16_b128 v[32:35], v96 offset:128
+; GCN-NEXT:    ds_load_tr16_b128 v[36:39], v96 offset:192
+; GCN-NEXT:    ds_load_tr16_b128 v[40:43], v97 offset:128
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23]
+; GCN-NEXT:    ds_load_tr16_b128 v[44:47], v97 offset:192
+; GCN-NEXT:    ds_load_tr16_b128 v[48:51], v96 offset:384
+; GCN-NEXT:    ds_load_tr16_b128 v[52:55], v96 offset:448
+; GCN-NEXT:    ds_load_tr16_b128 v[56:59], v97 offset:384
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15]
+; GCN-NEXT:    ds_load_tr16_b128 v[60:63], v97 offset:448
+; GCN-NEXT:    ds_load_tr16_b128 v[64:67], v96 offset:640
+; GCN-NEXT:    ds_load_tr16_b128 v[68:71], v96 offset:704
+; GCN-NEXT:    ds_load_tr16_b128 v[72:75], v97 offset:640
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7]
+; GCN-NEXT:    ds_load_tr16_b128 v[76:79], v97 offset:704
+; GCN-NEXT:    ds_load_tr16_b128 v[80:83], v96 offset:896
+; GCN-NEXT:    ds_load_tr16_b128 v[84:87], v96 offset:960
+; GCN-NEXT:    ds_load_tr16_b128 v[88:91], v97 offset:896
+; GCN-NEXT:    ds_load_tr16_b128 v[92:95], v97 offset:960
+; GCN-NEXT:    s_wait_dscnt 0xc
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31]
+; GCN-NEXT:    s_wait_dscnt 0x8
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23]
+; GCN-NEXT:    s_wait_dscnt 0x4
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15]
+; GCN-NEXT:    s_wait_dscnt 0x0
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7]
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31]
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23]
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15]
+; GCN-NEXT:    v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7]
+; GCN-NEXT:    s_cbranch_vccnz .LBB1_1
+; GCN-NEXT:  ; %bb.2: ; %end
+; GCN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8 nv
+; GCN-NEXT:    v_nop
+; GCN-NEXT:    v_mov_b32_e32 v32, 0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_clause 0x7
+; GCN-NEXT:    global_store_b128 v32, v[28:31], s[0:1] offset:16
+; GCN-NEXT:    global_store_b128 v32, v[24:27], s[0:1]
+; GCN-NEXT:    global_store_b128 v32, v[20:23], s[0:1] offset:144
+; GCN-NEXT:    global_store_b128 v32, v[16:19], s[0:1] offset:128
+; GCN-NEXT:    global_store_b128 v32, v[12:15], s[0:1] offset:272
+; GCN-NEXT:    global_store_b128 v32, v[8:11], s[0:1] offset:256
+; GCN-NEXT:    global_store_b128 v32, v[4:7], s[0:1] offset:400
+; GCN-NEXT:    global_store_b128 v32, v[0:3], s[0:1] offset:384
+; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT:    s_endpgm
+entry:
+
+  br label %loop
+
+loop:
+  %baseOff = phi i32 [ 0, %entry ], [ %newBaseOff, %loop ]
+  %wvec0 = phi <8 x float> [ <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %bwmma01,  %loop ]
+  %wvec1 = phi <8 x float> [ <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %bwmma11,  %loop ]
+  %wvec2 = phi <8 x float> [ <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %bwmma21,  %loop ]
+  %wvec3 = phi <8 x float> [ <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %bwmma31,  %loop ]
+  %p0 = getelementptr inbounds nuw i8, ptr addrspace(3) %base, i32 %baseOff
+  %p1 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 64
+  %p2 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 128
+  %p3 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 192
+  %p4 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 256
+  %p5 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 320
+  %p6 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 384
+  %p7 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 448
+  %p8 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 512
+  %p9 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 576
+  %p10 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 640
+  %p11 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 704
+  %p12 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 768
+  %p13 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 832
+  %p14 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 896
+  %p15 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 960
+  %bp0 = getelementptr inbounds nuw i8, ptr addrspace(3) %base1, i32 %baseOff
+  %bp1 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 64
+  %bp2 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 128
+  %bp3 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 192
+  %bp4 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 256
+  %bp5 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 320
+  %bp6 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 384
+  %bp7 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 448
+  %bp8 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 512
+  %bp9 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 576
+  %bp10 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 640
+  %bp11 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 704
+  %bp12 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 768
+  %bp13 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 832
+  %bp14 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 896
+  %bp15 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 960
+
+  %l0 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) %p0)
+  %l1 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p1)
+  %l2 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p2)
+  %l3 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p3)
+  %l4 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p4)
+  %l5 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p5)
+  %l6 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p6)
+  %l7 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p7)
+  %l8 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p8)
+  %l9 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p9)
+  %l10 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p10)
+  %l11 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p11)
+  %l12 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p12)
+  %l13 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p13)
+  %l14 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p14)
+  %l15 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p15)
+  %bl0 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) %bp0)
+  %bl1 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp1)
+  %bl2 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp2)
+  %bl3 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp3)
+  %bl4 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp4)
+  %bl5 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp5)
+  %bl6 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp6)
+  %bl7 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp7)
+  %bl8 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp8)
+  %bl9 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp9)
+  %bl10 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp10)
+  %bl11 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp11)
+  %bl12 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp12)
+  %bl13 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp13)
+  %bl14 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp14)
+  %bl15 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp15)
+  %vec0 = shufflevector <8 x half> %l0, <8 x half> %l1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec1 = shufflevector <8 x half> %l2, <8 x half> %l3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec2 = shufflevector <8 x half> %l4, <8 x half> %l5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec3 = shufflevector <8 x half> %l6, <8 x half> %l7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec4 = shufflevector <8 x half> %l8, <8 x half> %l9, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec5 = shufflevector <8 x half> %l10, <8 x half> %l11, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec6 = shufflevector <8 x half> %l12, <8 x half> %l13, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vec7 = shufflevector <8 x half> %l14, <8 x half> %l15, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %bvec0 = shufflevector <8 x half> %bl0, <8 x half> %bl1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %bvec1 = shufflevector <8 x half> %bl2, <8 x half> %bl3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %bvec2 = shufflevector <8 x half> %bl4, <8 x half> %bl5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %bvec3 = shufflevector <8 x half> %bl6, <8 x half> %bl7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %bvec4 = shufflevector <8 x half> %bl8, <8 x half> %bl9, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %bvec5 = shufflevector <8 x half> %bl10, <8 x half> %bl11, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %bvec6 = shufflevector <8 x half> %bl12, <8 x half> %bl13, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %bvec7 = shufflevector <8 x half> %bl14, <8 x half> %bl15, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %wmma00 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec0, i1 false, <16 x half> %bvec0, i16 0, <8 x float> %wvec0, i1 false, i1 false)
+  %bwmma00 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec0, i1 false, <16 x half> %bvec0, i16 0, <8 x float> %wmma00, i1 false, i1 false)
+  %wmma01 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec1, i1 false, <16 x half> %bvec1, i16 0, <8 x float> %bwmma00, i1 false, i1 false)
+  %bwmma01 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec1, i1 false, <16 x half> %bvec1, i16 0, <8 x float> %wmma01, i1 false, i1 false)
+  %wmma10 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec2, i1 false, <16 x half> %bvec2, i16 0, <8 x float> %wvec1, i1 false, i1 false)
+  %bwmma10 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec2, i1 false, <16 x half> %bvec2, i16 0, <8 x float> %wmma10, i1 false, i1 false)
+  %wmma11 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec3, i1 false, <16 x half> %bvec3, i16 0, <8 x float> %bwmma10, i1 false, i1 false)
+  %bwmma11 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec3, i1 false, <16 x half> %bvec3, i16 0, <8 x float> %wmma11, i1 false, i1 false)
+  %wmma20 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec4, i1 false, <16 x half> %bvec4, i16 0, <8 x float> %wvec2, i1 false, i1 false)
+  %bwmma20 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec4, i1 false, <16 x half> %bvec4, i16 0, <8 x float> %wmma20, i1 false, i1 false)
+  %wmma21 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec5, i1 false, <16 x half> %bvec5, i16 0, <8 x float> %bwmma20, i1 false, i1 false)
+  %bwmma21 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec5, i1 false, <16 x half> %bvec5, i16 0, <8 x float> %wmma21, i1 false, i1 false)
+  %wmma30 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec6, i1 false, <16 x half> %bvec6, i16 0, <8 x float> %wvec3, i1 false, i1 false)
+  %bwmma30 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec6, i1 false, <16 x half> %bvec6, i16 0, <8 x float> %wmma30, i1 false, i1 false)
+  %wmma31 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec7, i1 false, <16 x half> %bvec7, i16 0, <8 x float> %bwmma30, i1 false, i1 false)
+  %bwmma31 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec7, i1 false, <16 x half> %bvec7, i16 0, <8 x float> %wmma31, i1 false, i1 false)
+  %newBaseOff = or disjoint i32 %baseOff, %delta
+  br i1 %br0, label %loop, label %end
+
+end:
+  %out1 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i32 128
+  %out2 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i32 256
+  %out3 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i32 384
+  store <8 x float> %bwmma01, ptr addrspace(1) %out, align 16
+  store <8 x float> %bwmma11, ptr addrspace(1) %out1, align 16
+  store <8 x float> %bwmma21, ptr addrspace(1) %out2, align 16
+  store <8 x float> %bwmma31, ptr addrspace(1) %out3, align 16
+  ret void
+}
+
+
+attributes #0 = { "amdgpu-flat-work-group-size"="32,32" "amdgpu-waves-per-eu"="1,1" } ; attribute group #0: both values are "min,max" pairs, so flat work-group size is pinned to 32 and waves-per-EU to 1 (NOTE(review): presumably to keep the scheduled output deterministic for the checks - confirm)