From 7364203924cf9d464df4f6b9455ac6cd42c856ae Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Mon, 30 Mar 2026 12:18:29 -0700 Subject: [PATCH] Reapply "[AMDGPU] Add HWUI pressure heuristics to coexec strategy (#184929)" (#189121) Reland https://github.com/llvm/llvm-project/pull/184929 after fixing some issues in the NDEBUG builds. 3a640ee is unchanged from the previously approved PR, the unreviewed portion of this PR is 9cabd8d --- .../AMDGPU/AMDGPUCoExecSchedStrategy.cpp | 446 ++++++++++++- .../Target/AMDGPU/AMDGPUCoExecSchedStrategy.h | 290 ++++++++- .../AMDGPU/coexec-sched-effective-stall.mir | 10 +- llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll | 606 ++++++++++++++++++ 4 files changed, 1322 insertions(+), 30 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp index 977c6f56ad15..d83f8fee2b2f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp @@ -15,6 +15,7 @@ #include "llvm/Support/Debug.h" using namespace llvm; +using namespace llvm::AMDGPU; #define DEBUG_TYPE "machine-scheduler" @@ -41,6 +42,370 @@ static SUnit *pickOnlyChoice(SchedBoundary &Zone) { return OnlyChoice; } +InstructionFlavor llvm::AMDGPU::classifyFlavor(const MachineInstr &MI, + const SIInstrInfo &SII) { + if (MI.isDebugInstr()) + return InstructionFlavor::Other; + + unsigned Opc = MI.getOpcode(); + + // Check for specific opcodes first. 
+ if (Opc == AMDGPU::ATOMIC_FENCE || Opc == AMDGPU::S_WAIT_ASYNCCNT || + Opc == AMDGPU::S_WAIT_TENSORCNT || Opc == AMDGPU::S_BARRIER_WAIT || + Opc == AMDGPU::S_BARRIER_SIGNAL_IMM) + return InstructionFlavor::Fence; + + if (SII.isLDSDMA(MI)) + return InstructionFlavor::DMA; + + if (SII.isMFMAorWMMA(MI)) + return InstructionFlavor::WMMA; + + if (SII.isTRANS(MI)) + return InstructionFlavor::TRANS; + + if (SII.isVALU(MI)) + return InstructionFlavor::SingleCycleVALU; + + if (SII.isDS(MI)) + return InstructionFlavor::DS; + + if (SII.isFLAT(MI) || SII.isFLATGlobal(MI) || SII.isFLATScratch(MI)) + return InstructionFlavor::VMEM; + + if (SII.isSALU(MI)) + return InstructionFlavor::SALU; + + return InstructionFlavor::Other; +} + +SUnit *HardwareUnitInfo::getNextTargetSU(bool LookDeep) const { + for (auto *PrioritySU : PrioritySUs) { + if (!PrioritySU->isTopReady()) + return PrioritySU; + } + + if (!LookDeep) + return nullptr; + + unsigned MinDepth = std::numeric_limits::max(); + SUnit *TargetSU = nullptr; + for (auto *SU : AllSUs) { + if (SU->isScheduled) + continue; + + if (SU->isTopReady()) + continue; + + if (SU->getDepth() < MinDepth) { + MinDepth = SU->getDepth(); + TargetSU = SU; + } + } + return TargetSU; +} + +void HardwareUnitInfo::insert(SUnit *SU, unsigned BlockingCycles) { +#ifndef NDEBUG + bool Inserted = AllSUs.insert(SU); + assert(Inserted); +#else + AllSUs.insert(SU); +#endif + + TotalCycles += BlockingCycles; + + if (PrioritySUs.empty()) { + PrioritySUs.insert(SU); + return; + } + unsigned SUDepth = SU->getDepth(); + unsigned CurrDepth = (*PrioritySUs.begin())->getDepth(); + if (SUDepth > CurrDepth) + return; + + if (SUDepth == CurrDepth) { + PrioritySUs.insert(SU); + return; + } + + // SU is lower depth and should be prioritized. + PrioritySUs.clear(); + PrioritySUs.insert(SU); +} + +void HardwareUnitInfo::markScheduled(SUnit *SU, unsigned BlockingCycles) { + // We may want to ignore some HWUIs (e.g. InstructionFlavor::Other). 
To do so, + // we just clear the HWUI. However, we still have instructions which map to + // this HWUI. Don't bother managing the state for these HWUI. + if (TotalCycles == 0) + return; + + AllSUs.remove(SU); + PrioritySUs.remove(SU); + + TotalCycles -= BlockingCycles; + + if (AllSUs.empty()) + return; + if (PrioritySUs.empty()) { + for (auto SU : AllSUs) { + if (PrioritySUs.empty()) { + PrioritySUs.insert(SU); + continue; + } + unsigned SUDepth = SU->getDepth(); + unsigned CurrDepth = (*PrioritySUs.begin())->getDepth(); + if (SUDepth > CurrDepth) + continue; + + if (SUDepth == CurrDepth) { + PrioritySUs.insert(SU); + continue; + } + + // SU is lower depth and should be prioritized. + PrioritySUs.clear(); + PrioritySUs.insert(SU); + } + } +} + +HardwareUnitInfo * +CandidateHeuristics::getHWUIFromFlavor(InstructionFlavor Flavor) { + for (auto &HWUICand : HWUInfo) { + if (HWUICand.getType() == Flavor) { + return &HWUICand; + } + } + return nullptr; +} + +unsigned CandidateHeuristics::getHWUICyclesForInst(SUnit *SU) { + assert(SchedModel && SchedModel->hasInstrSchedModel()); + unsigned ReleaseAtCycle = 0; + const MCSchedClassDesc *SC = DAG->getSchedClass(SU); + for (TargetSchedModel::ProcResIter PI = SchedModel->getWriteProcResBegin(SC), + PE = SchedModel->getWriteProcResEnd(SC); + PI != PE; ++PI) { + ReleaseAtCycle = std::max(ReleaseAtCycle, (unsigned)PI->ReleaseAtCycle); + } + return ReleaseAtCycle; +} + +void CandidateHeuristics::updateForScheduling(SUnit *SU) { + HardwareUnitInfo *HWUI = + getHWUIFromFlavor(classifyFlavor(*SU->getInstr(), *SII)); + assert(HWUI); + HWUI->markScheduled(SU, getHWUICyclesForInst(SU)); +} + +void CandidateHeuristics::initialize(ScheduleDAGMI *SchedDAG, + const TargetSchedModel *TargetSchedModel, + const TargetRegisterInfo *TRI) { + DAG = SchedDAG; + SchedModel = TargetSchedModel; + assert(SchedModel && SchedModel->hasInstrSchedModel()); + + SRI = static_cast(TRI); + SII = static_cast(DAG->TII); + + 
HWUInfo.resize((int)InstructionFlavor::NUM_FLAVORS); + + for (unsigned I = 0; I < HWUInfo.size(); I++) { + HWUInfo[I].reset(); + HWUInfo[I].setType(I); + } + + HWUInfo[(int)InstructionFlavor::WMMA].setProducesCoexecWindow(true); + HWUInfo[(int)InstructionFlavor::MultiCycleVALU].setProducesCoexecWindow(true); + HWUInfo[(int)InstructionFlavor::TRANS].setProducesCoexecWindow(true); + + collectHWUIPressure(); +} + +void CandidateHeuristics::collectHWUIPressure() { + if (!SchedModel || !SchedModel->hasInstrSchedModel()) + return; + + for (auto &SU : DAG->SUnits) { + const InstructionFlavor Flavor = classifyFlavor(*SU.getInstr(), *SII); + HWUInfo[(int)(Flavor)].insert(&SU, getHWUICyclesForInst(&SU)); + } + + LLVM_DEBUG(dumpRegionSummary()); +} + +void CandidateHeuristics::dumpRegionSummary() { + MachineBasicBlock *BB = DAG->begin()->getParent(); + dbgs() << "\n=== Region: " << DAG->MF.getName() << " BB" << BB->getNumber() + << " (" << DAG->SUnits.size() << " SUs) ===\n"; + + dbgs() << "\nHWUI Resource Pressure:\n"; + for (auto &HWUI : HWUInfo) { + if (HWUI.getTotalCycles() == 0) + continue; + + StringRef Name = getFlavorName(HWUI.getType()); + dbgs() << " " << Name << ": " << HWUI.getTotalCycles() << " cycles, " + << HWUI.size() << " instrs\n"; + } + dbgs() << "\n"; +} + +void CandidateHeuristics::sortHWUIResources() { + // Highest priority should be first. 
+ llvm::sort(HWUInfo, [](HardwareUnitInfo &A, HardwareUnitInfo &B) { + // Prefer CoexecWindow producers + if (A.producesCoexecWindow() != B.producesCoexecWindow()) + return A.producesCoexecWindow(); + + // Prefer more demanded resources + if (A.getTotalCycles() != B.getTotalCycles()) + return A.getTotalCycles() > B.getTotalCycles(); + + // In ties -- prefer the resource with more instructions + if (A.size() != B.size()) + return A.size() < B.size(); + + // Default to Flavor order + return (unsigned)A.getType() < (unsigned)B.getType(); + }); +} + +bool CandidateHeuristics::tryCriticalResourceDependency( + GenericSchedulerBase::SchedCandidate &TryCand, + GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const { + + auto HasPrioritySU = [this, &Cand, &TryCand](unsigned ResourceIdx) { + const HardwareUnitInfo &HWUI = HWUInfo[ResourceIdx]; + + auto CandFlavor = classifyFlavor(*Cand.SU->getInstr(), *SII); + auto TryCandFlavor = classifyFlavor(*TryCand.SU->getInstr(), *SII); + bool LookDeep = (CandFlavor == InstructionFlavor::DS || + TryCandFlavor == InstructionFlavor::DS) && + HWUI.getType() == InstructionFlavor::WMMA; + auto *TargetSU = HWUI.getNextTargetSU(LookDeep); + + // If we do not have a TargetSU for this resource, then it is not critical. + if (!TargetSU) + return false; + + return true; + }; + + auto TryEnablesResource = [&Cand, &TryCand, this](unsigned ResourceIdx) { + const HardwareUnitInfo &HWUI = HWUInfo[ResourceIdx]; + auto CandFlavor = classifyFlavor(*Cand.SU->getInstr(), *SII); + + // We want to ensure our DS order matches WMMA order. 
+ bool LookDeep = CandFlavor == InstructionFlavor::DS && + HWUI.getType() == InstructionFlavor::WMMA; + auto *TargetSU = HWUI.getNextTargetSU(LookDeep); + + bool CandEnables = + TargetSU != Cand.SU && DAG->IsReachable(TargetSU, Cand.SU); + bool TryCandEnables = + TargetSU != TryCand.SU && DAG->IsReachable(TargetSU, TryCand.SU); + + if (!CandEnables && !TryCandEnables) + return false; + + if (CandEnables && !TryCandEnables) { + if (Cand.Reason > GenericSchedulerBase::RegCritical) + Cand.Reason = GenericSchedulerBase::RegCritical; + + return true; + } + + if (!CandEnables && TryCandEnables) { + TryCand.Reason = GenericSchedulerBase::RegCritical; + return true; + } + + // Both enable, prefer the critical path. + unsigned CandHeight = Cand.SU->getHeight(); + unsigned TryCandHeight = TryCand.SU->getHeight(); + + if (CandHeight > TryCandHeight) { + if (Cand.Reason > GenericSchedulerBase::RegCritical) + Cand.Reason = GenericSchedulerBase::RegCritical; + + return true; + } + + if (CandHeight < TryCandHeight) { + TryCand.Reason = GenericSchedulerBase::RegCritical; + return true; + } + + // Same critical path, just prefer original candidate. 
+ if (Cand.Reason > GenericSchedulerBase::RegCritical) + Cand.Reason = GenericSchedulerBase::RegCritical; + + return true; + }; + + for (unsigned I = 0; I < HWUInfo.size(); I++) { + // If we have encountered a resource that is not critical, then neither + // candidate enables a critical resource + if (!HasPrioritySU(I)) + continue; + + bool Enabled = TryEnablesResource(I); + // If neither has enabled the resource, continue to the next resource + if (Enabled) + return true; + } + return false; +} + +bool CandidateHeuristics::tryCriticalResource( + GenericSchedulerBase::SchedCandidate &TryCand, + GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const { + for (unsigned I = 0; I < HWUInfo.size(); I++) { + const HardwareUnitInfo &HWUI = HWUInfo[I]; + + bool CandUsesCrit = HWUI.contains(Cand.SU); + bool TryCandUsesCrit = HWUI.contains(TryCand.SU); + + if (!CandUsesCrit && !TryCandUsesCrit) + continue; + + if (CandUsesCrit != TryCandUsesCrit) { + if (CandUsesCrit) { + if (Cand.Reason > GenericSchedulerBase::RegCritical) + Cand.Reason = GenericSchedulerBase::RegCritical; + return true; + } + TryCand.Reason = GenericSchedulerBase::RegCritical; + return true; + } + + // Otherwise, both use the critical resource + // For longer latency InstructionFlavors, we should prioritize first by + // their enablement of critical resources + if (HWUI.getType() == InstructionFlavor::DS) { + if (tryCriticalResourceDependency(TryCand, Cand, Zone)) + return true; + } + + // Prioritize based on HWUI priorities. 
+ SUnit *Match = HWUI.getHigherPriority(Cand.SU, TryCand.SU); + if (Match) { + if (Match == Cand.SU) { + if (Cand.Reason > GenericSchedulerBase::RegCritical) + Cand.Reason = GenericSchedulerBase::RegCritical; + return true; + } + TryCand.Reason = GenericSchedulerBase::RegCritical; + return true; + } + } + + return false; +} + AMDGPUCoExecSchedStrategy::AMDGPUCoExecSchedStrategy( const MachineSchedContext *C) : GCNSchedStrategy(C) { @@ -68,6 +433,12 @@ void AMDGPUCoExecSchedStrategy::initialize(ScheduleDAGMI *DAG) { RegionPolicy.OnlyBottomUp = false; GCNSchedStrategy::initialize(DAG); + Heurs.initialize(DAG, SchedModel, TRI); +} + +void AMDGPUCoExecSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { + Heurs.updateForScheduling(SU); + GCNSchedStrategy::schedNode(SU, IsTopNode); } SUnit *AMDGPUCoExecSchedStrategy::pickNode(bool &IsTopNode) { @@ -82,6 +453,9 @@ SUnit *AMDGPUCoExecSchedStrategy::pickNode(bool &IsTopNode) { bool PickedPending = false; SUnit *SU = nullptr; +#ifndef NDEBUG + SchedCandidate *PickedCand = nullptr; +#endif do { PickedPending = false; SU = pickOnlyChoice(Top); @@ -92,10 +466,15 @@ SUnit *AMDGPUCoExecSchedStrategy::pickNode(bool &IsTopNode) { PickedPending, /*IsBottomUp=*/false); assert(TopCand.Reason != NoCand && "failed to find a candidate"); SU = TopCand.SU; +#ifndef NDEBUG + PickedCand = &TopCand; +#endif } IsTopNode = true; } while (SU->isScheduled); + LLVM_DEBUG(if (PickedCand) dumpPickSummary(SU, IsTopNode, *PickedCand)); + if (PickedPending) { unsigned ReadyCycle = SU->TopReadyCycle; unsigned CurrentCycle = Top.getCurrCycle(); @@ -149,7 +528,7 @@ void AMDGPUCoExecSchedStrategy::pickNodeFromQueue( initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure, VGPRPressure, IsBottomUp); SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? 
&Zone : nullptr; - tryCandidate(Cand, TryCand, ZoneArg); + tryCandidateCoexec(Cand, TryCand, ZoneArg); if (TryCand.Reason != NoCand) { if (TryCand.ResDelta == SchedResourceDelta()) TryCand.initResourceDelta(Zone.DAG, SchedModel); @@ -157,7 +536,7 @@ void AMDGPUCoExecSchedStrategy::pickNodeFromQueue( PickedPending = FromPending; Cand.setBest(TryCand); } else { - printCandidateDecision(TryCand, Cand); + LLVM_DEBUG(printCandidateDecision(TryCand, Cand)); } } }; @@ -169,9 +548,36 @@ void AMDGPUCoExecSchedStrategy::pickNodeFromQueue( EvaluateQueue(Zone.Pending, /*FromPending=*/true); } -bool AMDGPUCoExecSchedStrategy::tryCandidate(SchedCandidate &Cand, - SchedCandidate &TryCand, - SchedBoundary *Zone) const { +#ifndef NDEBUG +void AMDGPUCoExecSchedStrategy::dumpPickSummary(SUnit *SU, bool IsTopNode, + SchedCandidate &Cand) { + const SIInstrInfo *SII = static_cast(DAG->TII); + unsigned Cycle = IsTopNode ? Top.getCurrCycle() : Bot.getCurrCycle(); + + dbgs() << "=== Pick @ Cycle " << Cycle << " ===\n"; + + const InstructionFlavor Flavor = classifyFlavor(*SU->getInstr(), *SII); + dbgs() << "Picked: SU(" << SU->NodeNum << ") "; + SU->getInstr()->print(dbgs(), /*IsStandalone=*/true, /*SkipOpers=*/false, + /*SkipDebugLoc=*/true); + dbgs() << " [" << getFlavorName(Flavor) << "]\n"; + + dbgs() << " Reason: "; + if (LastAMDGPUReason != AMDGPUSchedReason::None) + dbgs() << getReasonName(LastAMDGPUReason); + else if (Cand.Reason != NoCand) + dbgs() << GenericSchedulerBase::getReasonStr(Cand.Reason); + else + dbgs() << "Unknown"; + dbgs() << "\n\n"; + + LastAMDGPUReason = AMDGPUSchedReason::None; +} +#endif + +bool AMDGPUCoExecSchedStrategy::tryCandidateCoexec(SchedCandidate &Cand, + SchedCandidate &TryCand, + SchedBoundary *Zone) { // Initialize the candidate if needed. if (!Cand.isValid()) { TryCand.Reason = FirstValid; @@ -196,17 +602,21 @@ bool AMDGPUCoExecSchedStrategy::tryCandidate(SchedCandidate &Cand, // "tie-breaking" in nature. 
bool SameBoundary = Zone != nullptr; if (SameBoundary) { - // For loops that are acyclic path limited, aggressively schedule for - // latency. Within an single cycle, whenever CurrMOps > 0, allow normal - // heuristics to take precedence. - if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() && - tryLatency(TryCand, Cand, *Zone)) - return TryCand.Reason != NoCand; - - // Otherwise compare candidates by the stall they would introduce if + // Compare candidates by the stall they would introduce if // scheduled in the current cycle. if (tryEffectiveStall(Cand, TryCand, *Zone)) return TryCand.Reason != NoCand; + + Heurs.sortHWUIResources(); + if (Heurs.tryCriticalResource(TryCand, Cand, Zone)) { + LastAMDGPUReason = AMDGPUSchedReason::CritResourceBalance; + return TryCand.Reason != NoCand; + } + + if (Heurs.tryCriticalResourceDependency(TryCand, Cand, Zone)) { + LastAMDGPUReason = AMDGPUSchedReason::CritResourceDep; + return TryCand.Reason != NoCand; + } } // Keep clustered nodes together to encourage downstream peephole @@ -240,16 +650,6 @@ bool AMDGPUCoExecSchedStrategy::tryCandidate(SchedCandidate &Cand, return TryCand.Reason != NoCand; if (SameBoundary) { - // Avoid critical resource consumption and balance the schedule. - TryCand.initResourceDelta(DAG, SchedModel); - if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, - TryCand, Cand, ResourceReduce)) - return TryCand.Reason != NoCand; - if (tryGreater(TryCand.ResDelta.DemandedResources, - Cand.ResDelta.DemandedResources, TryCand, Cand, - ResourceDemand)) - return TryCand.Reason != NoCand; - // Avoid serializing long latency dependence chains. // For acyclic path limited loops, latency was already checked above. 
if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency && diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h index 07252c3fb45a..1684690cd829 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h @@ -19,12 +19,297 @@ namespace llvm { +namespace AMDGPU { + +//===----------------------------------------------------------------------===// +// Instruction Flavor Classification +//===----------------------------------------------------------------------===// + +enum class InstructionFlavor : uint8_t { + WMMA, // WMMA/MFMA matrix operations + SingleCycleVALU, // Single-cycle VALU (not TRANS32, not multi-cycle CVT) + TRANS, // Transcendental ops (v_exp, v_log, etc.) + MultiCycleVALU, // VALU instructions with repeat rate > 1 + VMEM, // FLAT/GLOBAL memory operations + DS, // LDS/GDS operations + SALU, // Scalar ALU + DMA, // Tensor DMA operations + Fence, // Fences and waits + Other, // Everything else + NUM_FLAVORS +}; + +inline StringRef getFlavorName(InstructionFlavor F) { + switch (F) { + case InstructionFlavor::WMMA: + return "WMMA"; + case InstructionFlavor::SingleCycleVALU: + return "VALU(1c)"; + case InstructionFlavor::TRANS: + return "TRANS"; + case InstructionFlavor::MultiCycleVALU: + return "VALU(Nc)"; + case InstructionFlavor::VMEM: + return "VMEM"; + case InstructionFlavor::DS: + return "DS"; + case InstructionFlavor::SALU: + return "SALU"; + case InstructionFlavor::DMA: + return "DMA"; + case InstructionFlavor::Fence: + return "Fence"; + case InstructionFlavor::Other: + return "Other"; + case InstructionFlavor::NUM_FLAVORS: + return "???"; + } + llvm_unreachable("Unknown InstructionFlavor"); +} + +inline StringRef getFlavorShortName(InstructionFlavor F) { + switch (F) { + case InstructionFlavor::WMMA: + return "W"; + case InstructionFlavor::SingleCycleVALU: + return "V"; + case InstructionFlavor::TRANS: + 
return "T"; + case InstructionFlavor::MultiCycleVALU: + return "C"; + case InstructionFlavor::VMEM: + return "M"; + case InstructionFlavor::DS: + return "D"; + case InstructionFlavor::SALU: + return "S"; + case InstructionFlavor::DMA: + return "X"; + case InstructionFlavor::Fence: + return "F"; + case InstructionFlavor::Other: + return "O"; + case InstructionFlavor::NUM_FLAVORS: + return "?"; + } + llvm_unreachable("Unknown InstructionFlavor"); +} + +InstructionFlavor classifyFlavor(const MachineInstr &MI, + const SIInstrInfo &SII); + +using FlavorGroup = SmallVector; + +namespace FlavorGroups { +inline FlavorGroup allVALU() { + return {InstructionFlavor::SingleCycleVALU, InstructionFlavor::TRANS, + InstructionFlavor::MultiCycleVALU}; +} +inline FlavorGroup allMem() { + return {InstructionFlavor::VMEM, InstructionFlavor::DS, + InstructionFlavor::DMA}; +} +inline FlavorGroup individual(InstructionFlavor F) { return {F}; } +inline FlavorGroup all() { + FlavorGroup G; + for (unsigned I = 0; + I < static_cast(InstructionFlavor::NUM_FLAVORS); ++I) + G.push_back(static_cast(I)); + return G; +} +} // namespace FlavorGroups + +/// AMDGPU-specific scheduling decision reasons. These provide more granularity +/// than the generic CandReason enum for debugging purposes. 
+enum class AMDGPUSchedReason : uint8_t { + None, + CritResourceBalance, // tryCriticalResource chose based on resource pressure + CritResourceDep, // tryCriticalResourceDependency chose based on enabling + NUM_REASONS +}; + +inline StringRef getReasonName(AMDGPUSchedReason R) { + switch (R) { + case AMDGPUSchedReason::None: + return "None"; + case AMDGPUSchedReason::CritResourceBalance: + return "CritResource"; + case AMDGPUSchedReason::CritResourceDep: + return "CritResourceDep"; + case AMDGPUSchedReason::NUM_REASONS: + return "???"; + } + llvm_unreachable("Unknown AMDGPUSchedReason"); +} + +} // End namespace AMDGPU + +//===----------------------------------------------------------------------===// +// Hardware Unit Information +//===----------------------------------------------------------------------===// + +/// HardwareUnitInfo is a wrapper class which maps to some real hardware +/// resource. This is used to model hardware resource pressure per region, and +/// guide scheduling heuristics. +class HardwareUnitInfo { +private: + /// PrioritySUs maintains a list of the SUs we want to prioritize scheduling + /// for this HardwareUnit. This is used for agreement between + /// tryCriticalResourceDependency and tryCriticalResource: we schedule the + /// dependencies for a SU on critical resource, then schedule that same SU on + /// the critical resource. This agreement results in shorter live ranges and + /// more regular HardwareUnit access patterns. SUs are prioritized based on + /// depth for top-down scheduling. + SmallSetVector PrioritySUs; + /// All the SUs in the region that consume this resource + SmallSetVector AllSUs; + /// The total number of busy cycles for this HardwareUnit for a given region. + unsigned TotalCycles = 0; + // InstructionFlavor mapping + AMDGPU::InstructionFlavor Type; + // Whether or not instructions on this HardwareUnit may produce a window in + // which instructions in other HardwareUnits can coexecute. 
For example, WMMA + // / MFMA instructions may take multiple cycles, which may be overlapped with + // instructions on other HardwareUnits + bool ProducesCoexecWindow = false; + +public: + HardwareUnitInfo() {} + + unsigned size() { return AllSUs.size(); } + + unsigned getTotalCycles() { return TotalCycles; } + + void setType(unsigned TheType) { + assert(TheType < (unsigned)AMDGPU::InstructionFlavor::NUM_FLAVORS); + Type = (AMDGPU::InstructionFlavor)(TheType); + } + + AMDGPU::InstructionFlavor getType() const { return Type; } + + bool producesCoexecWindow() const { return ProducesCoexecWindow; } + + void setProducesCoexecWindow(bool Val) { ProducesCoexecWindow = Val; } + + bool contains(SUnit *SU) const { return AllSUs.contains(SU); } + + /// \returns true if there is a difference in priority between \p SU and \p + /// Other. If so, \returns the SUnit with higher priority. This + /// method looks through the PrioritySUs to determine if one SU is more + /// prioritized than the other. If neither are in the PrioritySUs list, then + /// neither have priority over each other. + SUnit *getHigherPriority(SUnit *SU, SUnit *Other) const { + for (auto *SUOrder : PrioritySUs) { + if (SUOrder == SU) + return SU; + + if (SUOrder == Other) + return Other; + } + return nullptr; + } + + void reset() { + AllSUs.clear(); + PrioritySUs.clear(); + TotalCycles = 0; + Type = AMDGPU::InstructionFlavor::Other; + ProducesCoexecWindow = false; + } + + /// \returns the next SU in PrioritySUs that is not ready. If \p LookDeep is + /// set, we will look beyond the PrioritySUs (if all the PrioritySUs are + /// ready) to AllSUs to attempt to find a target SU. When looking through + /// AllSUs we sort pick the target SU by minimal depth for top-down + /// scheduling. getNextTargetSU is useful for determining which SU on this + /// HardwareUnit we are trying to schedule - this info helps us determine + /// which dependencies to schedule. 
LookDeep is useful if the dependencies are + /// long latency (e.g. memory instructions). If we have many long latency + /// dependencies, it is beneficial to enable SUs multiple levels ahead. + SUnit *getNextTargetSU(bool LookDeep = false) const; + /// Insert the \p SU into the AllSUs and account its \p BlockingCycles into + /// the TotalCycles. This maintains the list of PrioritySUs. + void insert(SUnit *SU, unsigned BlockingCycles); + /// Update the state for \p SU being scheduled by removing it from the AllSus + /// and reducing its \p BlockingCycles from the TotalCycles. This maintains + /// the list of PrioritySUS. + void markScheduled(SUnit *SU, unsigned BlockingCycles); +}; + +//===----------------------------------------------------------------------===// +// Candidate Heuristics +//===----------------------------------------------------------------------===// + +/// CandidateHeuristics contains state and implementations to facilitate making +/// per instruction scheduling decisions; it contains methods used in +/// tryCandidate to decide which instruction to schedule next. +class CandidateHeuristics { +protected: + ScheduleDAGMI *DAG; + const SIInstrInfo *SII; + const SIRegisterInfo *SRI; + const TargetSchedModel *SchedModel; + SmallVector HWUInfo; + + /// Walk over the region and collect total usage per HardwareUnit + void collectHWUIPressure(); + + /// Compute the blocking cycles for the appropriate HardwareUnit given an \p + /// SU + unsigned getHWUICyclesForInst(SUnit *SU); + + /// Given a \p Flavor , find the corresponding HardwareUnit. \returns the + /// mapped HardwareUnit. + HardwareUnitInfo *getHWUIFromFlavor(AMDGPU::InstructionFlavor Flavor); + +public: + CandidateHeuristics() = default; + + void initialize(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel, + const TargetRegisterInfo *TRI); + + /// Update the state to reflect that \p SU is going to be scheduled. + void updateForScheduling(SUnit *SU); + + /// Sort the HWUInfo vector. 
After sorting, the HardwareUnits that are highest + /// priority are first. Priority is determined by maximizing coexecution and + /// keeping the critical HardwareUnit busy. + void sortHWUIResources(); + + /// Check for critical resource consumption. Prefer the candidate that uses + /// the most prioritized HardwareUnit. If both candidates use the same + /// HarwareUnit, prefer the candidate with higher priority on that + /// HardwareUnit. + bool tryCriticalResource(GenericSchedulerBase::SchedCandidate &TryCand, + GenericSchedulerBase::SchedCandidate &Cand, + SchedBoundary *Zone) const; + + /// Check for dependencies of instructions that use prioritized HardwareUnits. + /// Prefer the candidate that is a dependency of an instruction that uses the + /// most prioritized HardwareUnit. If both candidates enable the same + /// HardwareUnit, prefer the candidate that enables the higher priority + /// instruction on that HardwareUnit. + bool + tryCriticalResourceDependency(GenericSchedulerBase::SchedCandidate &TryCand, + GenericSchedulerBase::SchedCandidate &Cand, + SchedBoundary *Zone) const; + + void dumpRegionSummary(); +}; + class AMDGPUCoExecSchedStrategy final : public GCNSchedStrategy { protected: - bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, - SchedBoundary *Zone) const override; bool tryEffectiveStall(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary &Zone) const; + AMDGPU::AMDGPUSchedReason LastAMDGPUReason = AMDGPU::AMDGPUSchedReason::None; + CandidateHeuristics Heurs; + +#ifndef NDEBUG + void dumpPickSummary(SUnit *SU, bool IsTopNode, SchedCandidate &Cand); +#endif + + bool tryCandidateCoexec(SchedCandidate &Cand, SchedCandidate &TryCand, + SchedBoundary *Zone); void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, SchedCandidate &Cand, bool &PickedPending, @@ -38,6 +323,7 @@ public: unsigned NumRegionInstrs) override; void initialize(ScheduleDAGMI *DAG) override; SUnit 
*pickNode(bool &IsTopNode) override; + void schedNode(SUnit *SU, bool IsTopNode) override; }; ScheduleDAGInstrs *createGCNCoExecMachineScheduler(MachineSchedContext *C); diff --git a/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir index f9f9a27e9af4..0a6f2fe9375d 100644 --- a/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir +++ b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir @@ -38,6 +38,7 @@ body: | ; COEXEC-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF ; COEXEC-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF ; COEXEC-NEXT: [[DEF4:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF + ; COEXEC-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; COEXEC-NEXT: [[DEF5:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF ; COEXEC-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF ; COEXEC-NEXT: [[DEF7:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF @@ -45,7 +46,6 @@ body: | ; COEXEC-NEXT: [[DEF9:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF ; COEXEC-NEXT: [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF ; COEXEC-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF10]], 0, 0, implicit $exec - ; COEXEC-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; COEXEC-NEXT: early-clobber %14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; COEXEC-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; COEXEC-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F32_]], implicit %13, implicit 
%14 @@ -90,19 +90,19 @@ body: | ; DEFAULT-NEXT: S_ENDPGM 0, implicit %10, implicit %11 ; ; COEXEC-LABEL: name: test-sched-pending-structural-stall - ; COEXEC: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF + ; COEXEC: S_NOP 0 + ; COEXEC-NEXT: S_NOP 0 + ; COEXEC-NEXT: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF ; COEXEC-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF ; COEXEC-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF ; COEXEC-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF ; COEXEC-NEXT: [[DEF4:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF + ; COEXEC-NEXT: early-clobber %10:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; COEXEC-NEXT: [[DEF5:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF ; COEXEC-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF ; COEXEC-NEXT: [[DEF7:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF ; COEXEC-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF ; COEXEC-NEXT: [[DEF9:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF - ; COEXEC-NEXT: early-clobber %10:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - ; COEXEC-NEXT: S_NOP 0 - ; COEXEC-NEXT: S_NOP 0 ; COEXEC-NEXT: early-clobber %11:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec ; COEXEC-NEXT: S_ENDPGM 0, implicit %10, implicit %11 %0:vreg_512_align2 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll b/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll new file mode 100644 index 000000000000..c1e7bc005998 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/coexec-scheduler.ll @@ -0,0 +1,606 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 
-amdgpu-sched-strategy=coexec --enable-post-misched=0 --verify-misched < %s | FileCheck -check-prefix=COEXEC %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s + + +define amdgpu_kernel void @ds_wmma(ptr addrspace(3) %base, ptr addrspace(1) %out, i1 %br0, i32 %delta) local_unnamed_addr #0 { +; COEXEC-LABEL: ds_wmma: +; COEXEC: ; %bb.0: ; %entry +; COEXEC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; COEXEC-NEXT: v_mov_b32_e32 v0, 0 +; COEXEC-NEXT: s_clause 0x1 +; COEXEC-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; COEXEC-NEXT: s_load_b64 s[0:1], s[4:5], 0x10 nv +; COEXEC-NEXT: s_delay_alu instid0(VALU_DEP_1) +; COEXEC-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0 +; COEXEC-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v0 +; COEXEC-NEXT: s_wait_kmcnt 0x0 +; COEXEC-NEXT: s_bitcmp1_b32 s0, 0 +; COEXEC-NEXT: s_cselect_b32 s0, -1, 0 +; COEXEC-NEXT: v_mov_b32_e32 v5, v0 +; COEXEC-NEXT: s_xor_b32 s0, s0, -1 +; COEXEC-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; COEXEC-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 +; COEXEC-NEXT: v_mov_b32_e32 v6, v0 +; COEXEC-NEXT: v_cmp_ne_u32_e64 s0, 1, v7 +; COEXEC-NEXT: v_dual_mov_b32 v7, v0 :: v_dual_mov_b32 v8, v0 +; COEXEC-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v24, v0 +; COEXEC-NEXT: v_dual_mov_b32 v9, v0 :: v_dual_mov_b32 v17, v0 +; COEXEC-NEXT: v_dual_mov_b32 v25, v0 :: v_dual_mov_b32 v10, v0 +; COEXEC-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v26, v0 +; COEXEC-NEXT: v_dual_mov_b32 v11, v0 :: v_dual_mov_b32 v19, v0 +; COEXEC-NEXT: v_dual_mov_b32 v27, v0 :: v_dual_mov_b32 v12, v0 +; COEXEC-NEXT: v_dual_mov_b32 v20, v0 :: v_dual_mov_b32 v28, v0 +; COEXEC-NEXT: v_dual_mov_b32 v13, v0 :: v_dual_mov_b32 v21, v0 +; COEXEC-NEXT: v_dual_mov_b32 v29, v0 :: v_dual_mov_b32 v14, v0 +; COEXEC-NEXT: v_dual_mov_b32 v22, v0 :: v_dual_mov_b32 v30, v0 +; COEXEC-NEXT: v_dual_mov_b32 v15, v0 :: v_dual_mov_b32 
v23, v0 +; COEXEC-NEXT: v_mov_b32_e32 v31, v0 +; COEXEC-NEXT: .LBB0_1: ; %loop +; COEXEC-NEXT: ; =>This Inner Loop Header: Depth=1 +; COEXEC-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; COEXEC-NEXT: v_nop +; COEXEC-NEXT: v_nop +; COEXEC-NEXT: v_nop +; COEXEC-NEXT: v_nop +; COEXEC-NEXT: v_mov_b32_e32 v92, s2 +; COEXEC-NEXT: s_add_co_i32 s2, s2, s1 +; COEXEC-NEXT: ds_load_tr16_b128 v[32:35], v92 offset:128 +; COEXEC-NEXT: ds_load_tr16_b128 v[40:43], v92 +; COEXEC-NEXT: ds_load_tr16_b128 v[36:39], v92 offset:192 +; COEXEC-NEXT: ds_load_tr16_b128 v[44:47], v92 offset:64 +; COEXEC-NEXT: ds_load_tr16_b128 v[48:51], v92 offset:384 +; COEXEC-NEXT: ds_load_tr16_b128 v[56:59], v92 offset:256 +; COEXEC-NEXT: ds_load_tr16_b128 v[52:55], v92 offset:448 +; COEXEC-NEXT: ds_load_tr16_b128 v[60:63], v92 offset:320 +; COEXEC-NEXT: ds_load_tr16_b128 v[64:67], v92 offset:640 +; COEXEC-NEXT: ds_load_tr16_b128 v[72:75], v92 offset:512 +; COEXEC-NEXT: ds_load_tr16_b128 v[68:71], v92 offset:704 +; COEXEC-NEXT: ds_load_tr16_b128 v[76:79], v92 offset:576 +; COEXEC-NEXT: ds_load_tr16_b128 v[80:83], v92 offset:896 +; COEXEC-NEXT: ds_load_tr16_b128 v[88:91], v92 offset:768 +; COEXEC-NEXT: ds_load_tr16_b128 v[84:87], v92 offset:960 +; COEXEC-NEXT: ds_load_tr16_b128 v[92:95], v92 offset:832 +; COEXEC-NEXT: s_wait_dscnt 0xc +; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[40:47], v[32:39], v[24:31] +; COEXEC-NEXT: s_wait_dscnt 0x8 +; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[56:63], v[48:55], v[16:23] +; COEXEC-NEXT: s_wait_dscnt 0x4 +; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[72:79], v[64:71], v[8:15] +; COEXEC-NEXT: s_wait_dscnt 0x0 +; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[88:95], v[80:87], v[0:7] +; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[40:47], v[32:39], v[24:31] +; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[56:63], v[48:55], v[16:23] +; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[72:79], v[64:71], v[8:15] +; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], 
v[88:95], v[80:87], v[0:7] +; COEXEC-NEXT: s_cbranch_vccnz .LBB0_1 +; COEXEC-NEXT: ; %bb.2: ; %end +; COEXEC-NEXT: v_nop +; COEXEC-NEXT: v_mov_b32_e32 v32, 0 +; COEXEC-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv +; COEXEC-NEXT: s_wait_kmcnt 0x0 +; COEXEC-NEXT: s_clause 0x7 +; COEXEC-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:16 +; COEXEC-NEXT: global_store_b128 v32, v[24:27], s[0:1] +; COEXEC-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:144 +; COEXEC-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:128 +; COEXEC-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:272 +; COEXEC-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:256 +; COEXEC-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:400 +; COEXEC-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:384 +; COEXEC-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; COEXEC-NEXT: s_endpgm +; +; GCN-LABEL: ds_wmma: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x10 nv +; GCN-NEXT: s_load_b32 s2, s[4:5], 0x0 nv +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0 +; GCN-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v0 +; GCN-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v0 +; GCN-NEXT: v_dual_mov_b32 v7, v0 :: v_dual_mov_b32 v8, v0 +; GCN-NEXT: v_dual_mov_b32 v9, v0 :: v_dual_mov_b32 v10, v0 +; GCN-NEXT: v_dual_mov_b32 v11, v0 :: v_dual_mov_b32 v12, v0 +; GCN-NEXT: v_dual_mov_b32 v13, v0 :: v_dual_mov_b32 v14, v0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_bitcmp1_b32 s0, 0 +; GCN-NEXT: v_dual_mov_b32 v15, v0 :: v_dual_mov_b32 v16, v0 +; GCN-NEXT: s_cselect_b32 s0, -1, 0 +; GCN-NEXT: v_dual_mov_b32 v17, v0 :: v_dual_mov_b32 v18, v0 +; GCN-NEXT: s_xor_b32 s0, s0, -1 +; GCN-NEXT: v_dual_mov_b32 v19, v0 :: v_dual_mov_b32 v20, v0 +; GCN-NEXT: v_cndmask_b32_e64 v24, 0, 
1, s0 +; GCN-NEXT: v_dual_mov_b32 v21, v0 :: v_dual_mov_b32 v22, v0 +; GCN-NEXT: v_dual_mov_b32 v23, v0 :: v_dual_mov_b32 v25, v0 +; GCN-NEXT: v_mov_b32_e32 v26, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GCN-NEXT: v_cmp_ne_u32_e64 s0, 1, v24 +; GCN-NEXT: v_dual_mov_b32 v24, v0 :: v_dual_mov_b32 v27, v0 +; GCN-NEXT: v_dual_mov_b32 v28, v0 :: v_dual_mov_b32 v29, v0 +; GCN-NEXT: v_dual_mov_b32 v30, v0 :: v_dual_mov_b32 v31, v0 +; GCN-NEXT: .LBB0_1: ; %loop +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_nop +; GCN-NEXT: v_nop +; GCN-NEXT: v_nop +; GCN-NEXT: v_nop +; GCN-NEXT: v_mov_b32_e32 v92, s2 +; GCN-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GCN-NEXT: s_add_co_i32 s2, s2, s1 +; GCN-NEXT: ds_load_tr16_b128 v[32:35], v92 +; GCN-NEXT: ds_load_tr16_b128 v[36:39], v92 offset:64 +; GCN-NEXT: ds_load_tr16_b128 v[40:43], v92 offset:128 +; GCN-NEXT: ds_load_tr16_b128 v[44:47], v92 offset:192 +; GCN-NEXT: ds_load_tr16_b128 v[48:51], v92 offset:256 +; GCN-NEXT: ds_load_tr16_b128 v[52:55], v92 offset:320 +; GCN-NEXT: ds_load_tr16_b128 v[56:59], v92 offset:384 +; GCN-NEXT: ds_load_tr16_b128 v[60:63], v92 offset:448 +; GCN-NEXT: ds_load_tr16_b128 v[64:67], v92 offset:512 +; GCN-NEXT: ds_load_tr16_b128 v[68:71], v92 offset:576 +; GCN-NEXT: ds_load_tr16_b128 v[72:75], v92 offset:640 +; GCN-NEXT: ds_load_tr16_b128 v[76:79], v92 offset:704 +; GCN-NEXT: ds_load_tr16_b128 v[80:83], v92 offset:768 +; GCN-NEXT: ds_load_tr16_b128 v[84:87], v92 offset:832 +; GCN-NEXT: ds_load_tr16_b128 v[88:91], v92 offset:896 +; GCN-NEXT: ds_load_tr16_b128 v[92:95], v92 offset:960 +; GCN-NEXT: s_wait_dscnt 0xc +; GCN-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31] +; GCN-NEXT: s_wait_dscnt 0x8 +; GCN-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23] +; GCN-NEXT: s_wait_dscnt 0x4 +; GCN-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15] +; GCN-NEXT: s_wait_dscnt 0x0 +; GCN-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], 
v[88:95], v[0:7] +; GCN-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31] +; GCN-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23] +; GCN-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15] +; GCN-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7] +; GCN-NEXT: s_cbranch_vccnz .LBB0_1 +; GCN-NEXT: ; %bb.2: ; %end +; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv +; GCN-NEXT: v_nop +; GCN-NEXT: v_mov_b32_e32 v32, 0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_clause 0x7 +; GCN-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:16 +; GCN-NEXT: global_store_b128 v32, v[24:27], s[0:1] +; GCN-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:144 +; GCN-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:128 +; GCN-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:272 +; GCN-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:256 +; GCN-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:400 +; GCN-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:384 +; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +entry: + + br label %loop + +loop: + %baseOff = phi i32 [ 0, %entry ], [ %newBaseOff, %loop ] + %wvec0 = phi <8 x float> [ zeroinitializer, %entry ], [ %wmma01, %loop ] ; accumulator: zero on entry, then the previous iteration's chained WMMA result + %wvec1 = phi <8 x float> [ zeroinitializer, %entry ], [ %wmma11, %loop ] + %wvec2 = phi <8 x float> [ zeroinitializer, %entry ], [ %wmma21, %loop ] + %wvec3 = phi <8 x float> [ zeroinitializer, %entry ], [ %wmma31, %loop ] + %p0 = getelementptr inbounds nuw i8, ptr addrspace(3) %base, i32 %baseOff + %p1 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 64 + %p2 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 128 + %p3 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 192 + %p4 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 256 + %p5 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 320 + %p6 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 384 + %p7 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 448 + 
%p8 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 512 + %p9 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 576 + %p10 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 640 + %p11 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 704 + %p12 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 768 + %p13 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 832 + %p14 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 896 + %p15 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 960 + %l0 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) %p0) + %l1 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p1) + %l2 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p2) + %l3 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p3) + %l4 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p4) + %l5 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p5) + %l6 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p6) + %l7 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p7) + %l8 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p8) + %l9 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p9) + %l10 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p10) + %l11 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p11) + %l12 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p12) + %l13 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p13) + %l14 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p14) + 
%l15 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p15) + %vec0 = shufflevector <8 x half> %l0, <8 x half> %l1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> ; identity mask: concatenate two 8-element loads into one 16-element WMMA operand + %vec1 = shufflevector <8 x half> %l2, <8 x half> %l3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vec2 = shufflevector <8 x half> %l4, <8 x half> %l5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vec3 = shufflevector <8 x half> %l6, <8 x half> %l7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vec4 = shufflevector <8 x half> %l8, <8 x half> %l9, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vec5 = shufflevector <8 x half> %l10, <8 x half> %l11, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vec6 = shufflevector <8 x half> %l12, <8 x half> %l13, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %vec7 = shufflevector <8 x half> %l14, <8 x half> %l15, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %wmma00 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec0, i1 false, <16 x half> %vec1, i16 0, <8 x float> %wvec0, i1 false, i1 false) + %wmma01 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec0, i1 false, <16 x half> %vec1, i16 0, <8 x float> %wmma00, i1 false, i1 false) + %wmma10 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec2, i1 false, <16 x half> %vec3, i16 0, <8 x float> %wvec1, i1 false, i1 false) + %wmma11 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec2, i1 false, <16 x half> %vec3, i16 0, <8 x float> %wmma10, i1 false, i1 false) + %wmma20 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec4, i1 false, <16 x half> %vec5, i16 0, <8 x float> %wvec2, i1 false, i1 false) + %wmma21 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec4, i1 false, <16 x half> %vec5, i16 0, <8 x float> %wmma20, i1 false, i1 false) + %wmma30 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec6, i1 false, <16 x half> %vec7, i16 0, <8 x float> %wvec3, i1 false, i1 false) + %wmma31 = tail call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec6, i1 false, <16 x half> %vec7, i16 0, <8 x float> %wmma30, i1 false, i1 false) + %newBaseOff = or disjoint i32 %baseOff, %delta + br i1 %br0, label %loop, label %end + +end: + %out1 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i32 128 + %out2 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i32 256 + %out3 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i32 384 + store <8 x float> %wmma01, ptr addrspace(1) %out, align 16 + store <8 x float> %wmma11, ptr addrspace(1) %out1, align 16 + store <8 x float> %wmma21, ptr addrspace(1) %out2, align 16 + store <8 x float> %wmma31, ptr addrspace(1) %out3, align 16 + ret void +} + +define amdgpu_kernel void @ds_wmma_permute(ptr addrspace(3) %base, ptr addrspace(3) %base1, ptr addrspace(1) %out, i1 %br0, i32 %delta) local_unnamed_addr #0 { +; COEXEC-LABEL: ds_wmma_permute: +; COEXEC: ; %bb.0: ; %entry +; COEXEC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; COEXEC-NEXT: s_mov_b32 s6, 0 +; COEXEC-NEXT: s_clause 0x1 +; COEXEC-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; COEXEC-NEXT: s_load_b64 s[0:1], s[4:5], 0x10 nv +; COEXEC-NEXT: v_mov_b32_e32 v0, 0 +; COEXEC-NEXT: s_delay_alu instid0(VALU_DEP_1) +; COEXEC-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0 +; COEXEC-NEXT: v_mov_b32_e32 v3, v0 +; COEXEC-NEXT: s_wait_kmcnt 0x0 +; COEXEC-NEXT: s_bitcmp1_b32 s0, 0 +; COEXEC-NEXT: v_mov_b32_e32 v4, v0 +; COEXEC-NEXT: s_cselect_b32 s0, -1, 0 +; COEXEC-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; COEXEC-NEXT: s_xor_b32 s0, s0, -1 +; COEXEC-NEXT: v_mov_b32_e32 v5, v0 +; COEXEC-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 +; COEXEC-NEXT: v_mov_b32_e32 v6, v0 +; COEXEC-NEXT: v_cmp_ne_u32_e64 s0, 1, v7 +; COEXEC-NEXT: v_dual_mov_b32 v7, v0 :: v_dual_mov_b32 v8, v0 +; COEXEC-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v24, v0 +; COEXEC-NEXT: v_dual_mov_b32 
v9, v0 :: v_dual_mov_b32 v17, v0 +; COEXEC-NEXT: v_dual_mov_b32 v25, v0 :: v_dual_mov_b32 v10, v0 +; COEXEC-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v26, v0 +; COEXEC-NEXT: v_dual_mov_b32 v11, v0 :: v_dual_mov_b32 v19, v0 +; COEXEC-NEXT: v_dual_mov_b32 v27, v0 :: v_dual_mov_b32 v12, v0 +; COEXEC-NEXT: v_dual_mov_b32 v20, v0 :: v_dual_mov_b32 v28, v0 +; COEXEC-NEXT: v_dual_mov_b32 v13, v0 :: v_dual_mov_b32 v21, v0 +; COEXEC-NEXT: v_dual_mov_b32 v29, v0 :: v_dual_mov_b32 v14, v0 +; COEXEC-NEXT: v_dual_mov_b32 v22, v0 :: v_dual_mov_b32 v30, v0 +; COEXEC-NEXT: v_dual_mov_b32 v15, v0 :: v_dual_mov_b32 v23, v0 +; COEXEC-NEXT: v_mov_b32_e32 v31, v0 +; COEXEC-NEXT: .LBB1_1: ; %loop +; COEXEC-NEXT: ; =>This Inner Loop Header: Depth=1 +; COEXEC-NEXT: s_add_co_i32 s7, s2, s6 +; COEXEC-NEXT: s_add_co_i32 s8, s3, s6 +; COEXEC-NEXT: s_add_co_i32 s6, s6, s1 +; COEXEC-NEXT: v_nop +; COEXEC-NEXT: v_nop +; COEXEC-NEXT: v_nop +; COEXEC-NEXT: v_nop +; COEXEC-NEXT: v_mov_b32_e32 v124, s7 +; COEXEC-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; COEXEC-NEXT: v_mov_b32_e32 v156, s8 +; COEXEC-NEXT: ds_load_tr16_b128 v[32:35], v124 +; COEXEC-NEXT: ds_load_tr16_b128 v[36:39], v124 offset:64 +; COEXEC-NEXT: ds_load_tr16_b128 v[40:43], v156 +; COEXEC-NEXT: ds_load_tr16_b128 v[44:47], v156 offset:64 +; COEXEC-NEXT: ds_load_tr16_b128 v[48:51], v124 offset:256 +; COEXEC-NEXT: ds_load_tr16_b128 v[56:59], v156 offset:256 +; COEXEC-NEXT: ds_load_tr16_b128 v[52:55], v124 offset:320 +; COEXEC-NEXT: ds_load_tr16_b128 v[60:63], v156 offset:320 +; COEXEC-NEXT: ds_load_tr16_b128 v[64:67], v124 offset:512 +; COEXEC-NEXT: ds_load_tr16_b128 v[72:75], v156 offset:512 +; COEXEC-NEXT: ds_load_tr16_b128 v[68:71], v124 offset:576 +; COEXEC-NEXT: ds_load_tr16_b128 v[76:79], v156 offset:576 +; COEXEC-NEXT: ds_load_tr16_b128 v[80:83], v124 offset:768 +; COEXEC-NEXT: ds_load_tr16_b128 v[88:91], v156 offset:768 +; COEXEC-NEXT: ds_load_tr16_b128 v[84:87], v124 offset:832 +; COEXEC-NEXT: ds_load_tr16_b128 v[92:95], v156 
offset:832 +; COEXEC-NEXT: ds_load_tr16_b128 v[96:99], v124 offset:128 +; COEXEC-NEXT: ds_load_tr16_b128 v[104:107], v124 offset:384 +; COEXEC-NEXT: ds_load_tr16_b128 v[112:115], v124 offset:640 +; COEXEC-NEXT: ds_load_tr16_b128 v[120:123], v124 offset:896 +; COEXEC-NEXT: ds_load_tr16_b128 v[128:131], v156 offset:128 +; COEXEC-NEXT: ds_load_tr16_b128 v[136:139], v156 offset:384 +; COEXEC-NEXT: ds_load_tr16_b128 v[144:147], v156 offset:640 +; COEXEC-NEXT: s_wait_dscnt 0x13 +; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31] +; COEXEC-NEXT: ds_load_tr16_b128 v[152:155], v156 offset:896 +; COEXEC-NEXT: ds_load_tr16_b128 v[100:103], v124 offset:192 +; COEXEC-NEXT: ds_load_tr16_b128 v[108:111], v124 offset:448 +; COEXEC-NEXT: ds_load_tr16_b128 v[116:119], v124 offset:704 +; COEXEC-NEXT: ds_load_tr16_b128 v[124:127], v124 offset:960 +; COEXEC-NEXT: ds_load_tr16_b128 v[132:135], v156 offset:192 +; COEXEC-NEXT: ds_load_tr16_b128 v[140:143], v156 offset:448 +; COEXEC-NEXT: s_wait_dscnt 0x16 +; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23] +; COEXEC-NEXT: ds_load_tr16_b128 v[148:151], v156 offset:704 +; COEXEC-NEXT: ds_load_tr16_b128 v[156:159], v156 offset:960 +; COEXEC-NEXT: s_wait_dscnt 0x14 +; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15] +; COEXEC-NEXT: s_wait_dscnt 0x10 +; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7] +; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31] +; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23] +; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15] +; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7] +; COEXEC-NEXT: s_wait_dscnt 0x3 +; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[96:103], v[128:135], v[24:31] +; COEXEC-NEXT: s_wait_dscnt 0x2 +; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[104:111], v[136:143], v[16:23] +; 
COEXEC-NEXT: s_wait_dscnt 0x1 +; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[112:119], v[144:151], v[8:15] +; COEXEC-NEXT: s_wait_dscnt 0x0 +; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[120:127], v[152:159], v[0:7] +; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[96:103], v[128:135], v[24:31] +; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[104:111], v[136:143], v[16:23] +; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[112:119], v[144:151], v[8:15] +; COEXEC-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[120:127], v[152:159], v[0:7] +; COEXEC-NEXT: s_cbranch_vccnz .LBB1_1 +; COEXEC-NEXT: ; %bb.2: ; %end +; COEXEC-NEXT: v_mov_b32_e32 v32, 0 +; COEXEC-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv +; COEXEC-NEXT: s_wait_kmcnt 0x0 +; COEXEC-NEXT: s_clause 0x7 +; COEXEC-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:16 +; COEXEC-NEXT: global_store_b128 v32, v[24:27], s[0:1] +; COEXEC-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:144 +; COEXEC-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:128 +; COEXEC-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:272 +; COEXEC-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:256 +; COEXEC-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:400 +; COEXEC-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:384 +; COEXEC-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; COEXEC-NEXT: s_endpgm +; +; GCN-LABEL: ds_wmma_permute: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x10 nv +; GCN-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0 +; GCN-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v0 +; GCN-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v0 +; GCN-NEXT: v_dual_mov_b32 v7, v0 :: v_dual_mov_b32 v8, v0 +; 
GCN-NEXT: v_dual_mov_b32 v9, v0 :: v_dual_mov_b32 v10, v0 +; GCN-NEXT: v_dual_mov_b32 v11, v0 :: v_dual_mov_b32 v12, v0 +; GCN-NEXT: v_dual_mov_b32 v13, v0 :: v_dual_mov_b32 v14, v0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_bitcmp1_b32 s0, 0 +; GCN-NEXT: v_dual_mov_b32 v15, v0 :: v_dual_mov_b32 v16, v0 +; GCN-NEXT: s_cselect_b32 s0, -1, 0 +; GCN-NEXT: v_dual_mov_b32 v17, v0 :: v_dual_mov_b32 v18, v0 +; GCN-NEXT: s_xor_b32 s0, s0, -1 +; GCN-NEXT: v_dual_mov_b32 v19, v0 :: v_dual_mov_b32 v20, v0 +; GCN-NEXT: v_cndmask_b32_e64 v24, 0, 1, s0 +; GCN-NEXT: v_dual_mov_b32 v21, v0 :: v_dual_mov_b32 v22, v0 +; GCN-NEXT: v_dual_mov_b32 v23, v0 :: v_dual_mov_b32 v25, v0 +; GCN-NEXT: v_mov_b32_e32 v26, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GCN-NEXT: v_cmp_ne_u32_e64 s0, 1, v24 +; GCN-NEXT: v_dual_mov_b32 v24, v0 :: v_dual_mov_b32 v27, v0 +; GCN-NEXT: v_dual_mov_b32 v28, v0 :: v_dual_mov_b32 v29, v0 +; GCN-NEXT: v_dual_mov_b32 v30, v0 :: v_dual_mov_b32 v31, v0 +; GCN-NEXT: .LBB1_1: ; %loop +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_add_co_i32 s7, s2, s6 +; GCN-NEXT: s_add_co_i32 s8, s3, s6 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: v_dual_mov_b32 v96, s7 :: v_dual_mov_b32 v97, s8 +; GCN-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GCN-NEXT: s_add_co_i32 s6, s6, s1 +; GCN-NEXT: ds_load_tr16_b128 v[32:35], v96 +; GCN-NEXT: ds_load_tr16_b128 v[36:39], v96 offset:64 +; GCN-NEXT: ds_load_tr16_b128 v[40:43], v97 +; GCN-NEXT: ds_load_tr16_b128 v[44:47], v97 offset:64 +; GCN-NEXT: ds_load_tr16_b128 v[48:51], v96 offset:256 +; GCN-NEXT: ds_load_tr16_b128 v[52:55], v96 offset:320 +; GCN-NEXT: ds_load_tr16_b128 v[56:59], v97 offset:256 +; GCN-NEXT: ds_load_tr16_b128 v[60:63], v97 offset:320 +; GCN-NEXT: ds_load_tr16_b128 v[64:67], v96 offset:512 +; GCN-NEXT: ds_load_tr16_b128 v[68:71], v96 offset:576 +; GCN-NEXT: ds_load_tr16_b128 v[72:75], v97 offset:512 +; GCN-NEXT: ds_load_tr16_b128 v[76:79], v97 offset:576 +; GCN-NEXT: ds_load_tr16_b128 
v[80:83], v96 offset:768 +; GCN-NEXT: ds_load_tr16_b128 v[84:87], v96 offset:832 +; GCN-NEXT: ds_load_tr16_b128 v[88:91], v97 offset:768 +; GCN-NEXT: ds_load_tr16_b128 v[92:95], v97 offset:832 +; GCN-NEXT: s_wait_dscnt 0xc +; GCN-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31] +; GCN-NEXT: s_wait_dscnt 0x8 +; GCN-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23] +; GCN-NEXT: s_wait_dscnt 0x4 +; GCN-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15] +; GCN-NEXT: s_wait_dscnt 0x0 +; GCN-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7] +; GCN-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31] +; GCN-NEXT: ds_load_tr16_b128 v[32:35], v96 offset:128 +; GCN-NEXT: ds_load_tr16_b128 v[36:39], v96 offset:192 +; GCN-NEXT: ds_load_tr16_b128 v[40:43], v97 offset:128 +; GCN-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23] +; GCN-NEXT: ds_load_tr16_b128 v[44:47], v97 offset:192 +; GCN-NEXT: ds_load_tr16_b128 v[48:51], v96 offset:384 +; GCN-NEXT: ds_load_tr16_b128 v[52:55], v96 offset:448 +; GCN-NEXT: ds_load_tr16_b128 v[56:59], v97 offset:384 +; GCN-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15] +; GCN-NEXT: ds_load_tr16_b128 v[60:63], v97 offset:448 +; GCN-NEXT: ds_load_tr16_b128 v[64:67], v96 offset:640 +; GCN-NEXT: ds_load_tr16_b128 v[68:71], v96 offset:704 +; GCN-NEXT: ds_load_tr16_b128 v[72:75], v97 offset:640 +; GCN-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7] +; GCN-NEXT: ds_load_tr16_b128 v[76:79], v97 offset:704 +; GCN-NEXT: ds_load_tr16_b128 v[80:83], v96 offset:896 +; GCN-NEXT: ds_load_tr16_b128 v[84:87], v96 offset:960 +; GCN-NEXT: ds_load_tr16_b128 v[88:91], v97 offset:896 +; GCN-NEXT: ds_load_tr16_b128 v[92:95], v97 offset:960 +; GCN-NEXT: s_wait_dscnt 0xc +; GCN-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31] +; GCN-NEXT: s_wait_dscnt 0x8 +; GCN-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], 
v[56:63], v[16:23] +; GCN-NEXT: s_wait_dscnt 0x4 +; GCN-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15] +; GCN-NEXT: s_wait_dscnt 0x0 +; GCN-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7] +; GCN-NEXT: v_wmma_f32_16x16x32_f16 v[24:31], v[32:39], v[40:47], v[24:31] +; GCN-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[48:55], v[56:63], v[16:23] +; GCN-NEXT: v_wmma_f32_16x16x32_f16 v[8:15], v[64:71], v[72:79], v[8:15] +; GCN-NEXT: v_wmma_f32_16x16x32_f16 v[0:7], v[80:87], v[88:95], v[0:7] +; GCN-NEXT: s_cbranch_vccnz .LBB1_1 +; GCN-NEXT: ; %bb.2: ; %end +; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv +; GCN-NEXT: v_nop +; GCN-NEXT: v_mov_b32_e32 v32, 0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_clause 0x7 +; GCN-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:16 +; GCN-NEXT: global_store_b128 v32, v[24:27], s[0:1] +; GCN-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:144 +; GCN-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:128 +; GCN-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:272 +; GCN-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:256 +; GCN-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:400 +; GCN-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:384 +; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +entry: + + br label %loop + +loop: + %baseOff = phi i32 [ 0, %entry ], [ %newBaseOff, %loop ] + %wvec0 = phi <8 x float> [ , %entry ], [ %bwmma01, %loop ] + %wvec1 = phi <8 x float> [ , %entry ], [ %bwmma11, %loop ] + %wvec2 = phi <8 x float> [ , %entry ], [ %bwmma21, %loop ] + %wvec3 = phi <8 x float> [ , %entry ], [ %bwmma31, %loop ] + %p0 = getelementptr inbounds nuw i8, ptr addrspace(3) %base, i32 %baseOff + %p1 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 64 + %p2 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 128 + %p3 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 192 + %p4 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 
256 + %p5 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 320 + %p6 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 384 + %p7 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 448 + %p8 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 512 + %p9 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 576 + %p10 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 640 + %p11 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 704 + %p12 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 768 + %p13 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 832 + %p14 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 896 + %p15 = getelementptr inbounds nuw i8, ptr addrspace(3) %p0, i32 960 + %bp0 = getelementptr inbounds nuw i8, ptr addrspace(3) %base1, i32 %baseOff + %bp1 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 64 + %bp2 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 128 + %bp3 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 192 + %bp4 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 256 + %bp5 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 320 + %bp6 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 384 + %bp7 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 448 + %bp8 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 512 + %bp9 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 576 + %bp10 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 640 + %bp11 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 704 + %bp12 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 768 + %bp13 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 832 + %bp14 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 896 + %bp15 = getelementptr inbounds nuw i8, ptr addrspace(3) %bp0, i32 960 + + %l0 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr 
addrspace(3) %p0) + %l1 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p1) + %l2 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p2) + %l3 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p3) + %l4 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p4) + %l5 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p5) + %l6 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p6) + %l7 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p7) + %l8 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p8) + %l9 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p9) + %l10 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p10) + %l11 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p11) + %l12 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p12) + %l13 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p13) + %l14 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p14) + %l15 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %p15) + %bl0 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) %bp0) + %bl1 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp1) + %bl2 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp2) + %bl3 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp3) + %bl4 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp4) + %bl5 = tail call <8 x half> 
@llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp5) + %bl6 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp6) + %bl7 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp7) + %bl8 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp8) + %bl9 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp9) + %bl10 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp10) + %bl11 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp11) + %bl12 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp12) + %bl13 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp13) + %bl14 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp14) + %bl15 = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) nonnull %bp15) + %vec0 = shufflevector <8 x half> %l0, <8 x half> %l1, <16 x i32> + %vec1 = shufflevector <8 x half> %l2, <8 x half> %l3, <16 x i32> + %vec2 = shufflevector <8 x half> %l4, <8 x half> %l5, <16 x i32> + %vec3 = shufflevector <8 x half> %l6, <8 x half> %l7, <16 x i32> + %vec4 = shufflevector <8 x half> %l8, <8 x half> %l9, <16 x i32> + %vec5 = shufflevector <8 x half> %l10, <8 x half> %l11, <16 x i32> + %vec6 = shufflevector <8 x half> %l12, <8 x half> %l13, <16 x i32> + %vec7 = shufflevector <8 x half> %l14, <8 x half> %l15, <16 x i32> + %bvec0 = shufflevector <8 x half> %bl0, <8 x half> %bl1, <16 x i32> + %bvec1 = shufflevector <8 x half> %bl2, <8 x half> %bl3, <16 x i32> + %bvec2 = shufflevector <8 x half> %bl4, <8 x half> %bl5, <16 x i32> + %bvec3 = shufflevector <8 x half> %bl6, <8 x half> %bl7, <16 x i32> + %bvec4 = shufflevector <8 x half> %bl8, <8 x half> %bl9, <16 x i32> + %bvec5 = shufflevector <8 x half> %bl10, <8 x 
half> %bl11, <16 x i32> + %bvec6 = shufflevector <8 x half> %bl12, <8 x half> %bl13, <16 x i32> + %bvec7 = shufflevector <8 x half> %bl14, <8 x half> %bl15, <16 x i32> + %wmma00 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec0, i1 false, <16 x half> %bvec0, i16 0, <8 x float> %wvec0, i1 false, i1 false) + %bwmma00 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec0, i1 false, <16 x half> %bvec0, i16 0, <8 x float> %wmma00, i1 false, i1 false) + %wmma01 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec1, i1 false, <16 x half> %bvec1, i16 0, <8 x float> %bwmma00, i1 false, i1 false) + %bwmma01 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec1, i1 false, <16 x half> %bvec1, i16 0, <8 x float> %wmma01, i1 false, i1 false) + %wmma10 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec2, i1 false, <16 x half> %bvec2, i16 0, <8 x float> %wvec1, i1 false, i1 false) + %bwmma10 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec2, i1 false, <16 x half> %bvec2, i16 0, <8 x float> %wmma10, i1 false, i1 false) + %wmma11 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec3, i1 false, <16 x half> %bvec3, i16 0, <8 x float> %bwmma10, i1 false, i1 false) + %bwmma11 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec3, i1 false, <16 x half> %bvec3, i16 0, <8 x float> %wmma11, i1 false, i1 false) + %wmma20 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec4, i1 false, <16 x half> %bvec4, i16 0, <8 x float> %wvec2, i1 false, i1 false) + %bwmma20 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec4, i1 false, <16 x 
half> %bvec4, i16 0, <8 x float> %wmma20, i1 false, i1 false) + %wmma21 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec5, i1 false, <16 x half> %bvec5, i16 0, <8 x float> %bwmma20, i1 false, i1 false) + %bwmma21 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec5, i1 false, <16 x half> %bvec5, i16 0, <8 x float> %wmma21, i1 false, i1 false) + %wmma30 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec6, i1 false, <16 x half> %bvec6, i16 0, <8 x float> %wvec3, i1 false, i1 false) + %bwmma30 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec6, i1 false, <16 x half> %bvec6, i16 0, <8 x float> %wmma30, i1 false, i1 false) + %wmma31 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec7, i1 false, <16 x half> %bvec7, i16 0, <8 x float> %bwmma30, i1 false, i1 false) + %bwmma31 = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %vec7, i1 false, <16 x half> %bvec7, i16 0, <8 x float> %wmma31, i1 false, i1 false) + %newBaseOff = or disjoint i32 %baseOff, %delta + br i1 %br0, label %loop, label %end + +end: + %out1 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i32 128 + %out2 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i32 256 + %out3 = getelementptr inbounds nuw i8, ptr addrspace(1) %out, i32 384 + store <8 x float> %bwmma01, ptr addrspace(1) %out, align 16 + store <8 x float> %bwmma11, ptr addrspace(1) %out1, align 16 + store <8 x float> %bwmma21, ptr addrspace(1) %out2, align 16 + store <8 x float> %bwmma31, ptr addrspace(1) %out3, align 16 + ret void +} + + +attributes #0 = { "amdgpu-flat-work-group-size"="32,32" "amdgpu-waves-per-eu"="1,1" }