diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td index 34eda21de875..c1ca7d4fd8e7 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.td +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td @@ -194,6 +194,14 @@ def __builtin_amdgcn_raw_ptr_buffer_load_async_lds : AMDGPUBuiltin<"void(__amdgp def __builtin_amdgcn_struct_ptr_buffer_load_lds : AMDGPUBuiltin<"void(__amdgpu_buffer_rsrc_t, void address_space<3> *, _Constant unsigned int, int, int, int, _Constant int, _Constant int)", [], "vmem-to-lds-load-insts">; def __builtin_amdgcn_struct_ptr_buffer_load_async_lds : AMDGPUBuiltin<"void(__amdgpu_buffer_rsrc_t, void address_space<3> *, _Constant unsigned int, int, int, int, _Constant int, _Constant int)", [], "vmem-to-lds-load-insts">; +//===----------------------------------------------------------------------===// +// Async mark builtins. +//===----------------------------------------------------------------------===// + +// FIXME: Not supported on GFX12 yet. Will need a new feature when we do. +def __builtin_amdgcn_asyncmark : AMDGPUBuiltin<"void()", [], "vmem-to-lds-load-insts">; +def __builtin_amdgcn_wait_asyncmark : AMDGPUBuiltin<"void(_Constant unsigned short)", [], "vmem-to-lds-load-insts">; + //===----------------------------------------------------------------------===// // Ballot builtins. 
//===----------------------------------------------------------------------===// diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-asyncmark-errs.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-asyncmark-errs.cl new file mode 100644 index 000000000000..7d4a141fbde6 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-asyncmark-errs.cl @@ -0,0 +1,7 @@ +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -verify -S -o - %s + +void test_feature() { + __builtin_amdgcn_asyncmark(); // expected-error{{'__builtin_amdgcn_asyncmark' needs target feature vmem-to-lds-load-insts}} + __builtin_amdgcn_wait_asyncmark(0); // expected-error{{'__builtin_amdgcn_wait_asyncmark' needs target feature vmem-to-lds-load-insts}} +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-asyncmark.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-asyncmark.cl new file mode 100644 index 000000000000..976ae3cea5d6 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-asyncmark.cl @@ -0,0 +1,16 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s +// REQUIRES: amdgpu-registered-target + +// CHECK-LABEL: @test_invocation( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.amdgcn.asyncmark() +// CHECK-NEXT: call void @llvm.amdgcn.wait.asyncmark(i16 0) +// CHECK-NEXT: ret void +// +void test_invocation() { + __builtin_amdgcn_asyncmark(); + __builtin_amdgcn_wait_asyncmark(0); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 57afd6ccf3b8..66591519de73 100644 --- 
a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2839,6 +2839,15 @@ def int_amdgcn_global_load_async_lds : AMDGPUGlobalLoadLDS, ClangBuiltin<"__buil def int_amdgcn_pops_exiting_wave_id : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrHasSideEffects]>; +// Sets a marker in the stream of async requests. Modelled as InaccessibleMem. +def int_amdgcn_asyncmark : ClangBuiltin<"__builtin_amdgcn_asyncmark">, + Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; + +// Waits until the Nth previous marker is completed, if it exists. +def int_amdgcn_wait_asyncmark : + ClangBuiltin<"__builtin_amdgcn_wait_asyncmark">, + Intrinsic<[], [llvm_i16_ty], [ImmArg>, IntrNoMem, IntrHasSideEffects]>; + //===----------------------------------------------------------------------===// // GFX10 Intrinsics //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 3c9cd453bec8..82783dc95b2a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2393,6 +2393,12 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( case Intrinsic::amdgcn_global_load_lds: case Intrinsic::amdgcn_global_load_async_lds: return selectGlobalLoadLds(I); + case Intrinsic::amdgcn_asyncmark: + case Intrinsic::amdgcn_wait_asyncmark: + // FIXME: Not supported on GFX12 yet. Will need a new feature when we do. 
+ if (!Subtarget->hasVMemToLDSLoad()) + return false; + break; case Intrinsic::amdgcn_exp_compr: if (!STI.hasCompressedExport()) { Function &F = I.getMF()->getFunction(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 99c1ab8d379d..fc408aa30dd8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -347,7 +347,7 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { } } else { // We don't want these pseudo instructions encoded. They are - // placeholder terminator instructions and should only be printed as + // placeholder instructions and should only be printed as // comments. if (MI->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) { if (isVerbose()) @@ -361,6 +361,20 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { return; } + if (MI->getOpcode() == AMDGPU::ASYNCMARK) { + if (isVerbose()) + OutStreamer->emitRawComment(" asyncmark"); + return; + } + + if (MI->getOpcode() == AMDGPU::WAIT_ASYNCMARK) { + if (isVerbose()) { + OutStreamer->emitRawComment(" wait_asyncmark(" + + Twine(MI->getOperand(0).getImm()) + ")"); + } + return; + } + if (MI->getOpcode() == AMDGPU::SCHED_BARRIER) { if (isVerbose()) { std::string HexString; diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 7dfe0da7ef81..111867583fde 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -641,6 +641,24 @@ public: std::optional getExpertSchedulingEventType(const MachineInstr &Inst) const; + bool isAsync(const MachineInstr &MI) const { + if (!SIInstrInfo::isLDSDMA(MI)) + return false; + if (SIInstrInfo::usesASYNC_CNT(MI)) + return true; + const MachineOperand *Async = + TII->getNamedOperand(MI, AMDGPU::OpName::IsAsync); + return Async && (Async->getImm()); + } + + bool isNonAsyncLdsDmaWrite(const MachineInstr &MI) const { + return 
SIInstrInfo::mayWriteLDSThroughDMA(MI) && !isAsync(MI); + } + + bool isAsyncLdsDmaWrite(const MachineInstr &MI) const { + return SIInstrInfo::mayWriteLDSThroughDMA(MI) && isAsync(MI); + } + bool isVmemAccess(const MachineInstr &MI) const; bool generateWaitcntInstBefore(MachineInstr &MI, WaitcntBrackets &ScoreBrackets, @@ -769,11 +787,13 @@ public: AMDGPU::Waitcnt &Wait) const; void determineWaitForLDSDMA(InstCounterType T, VMEMID TID, AMDGPU::Waitcnt &Wait) const; + AMDGPU::Waitcnt determineAsyncWait(unsigned N); void tryClearSCCWriteEvent(MachineInstr *Inst); void applyWaitcnt(const AMDGPU::Waitcnt &Wait); void applyWaitcnt(InstCounterType T, unsigned Count); void updateByEvent(WaitEventType E, MachineInstr &MI); + void recordAsyncMark(MachineInstr &MI); bool hasPendingEvent() const { return !PendingEvents.empty(); } bool hasPendingEvent(WaitEventType E) const { @@ -865,11 +885,15 @@ private: unsigned OtherShift; }; + using CounterValueArray = std::array; + void determineWaitForScore(InstCounterType T, unsigned Score, AMDGPU::Waitcnt &Wait) const; static bool mergeScore(const MergeInfo &M, unsigned &Score, unsigned OtherScore); + bool mergeAsyncMarks(ArrayRef MergeInfos, + ArrayRef OtherMarks); iterator_range regunits(MCPhysReg Reg) const { assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC"); @@ -946,8 +970,8 @@ private: // TODO: Could we track SCC alongside SGPRs so it's not longer a special case? struct VMEMInfo { - // Scores for all instruction counters. - std::array Scores = {0}; + // Scores for all instruction counters. Zero-initialized. + CounterValueArray Scores{}; // Bitmask of the VmemTypes of VMEM instructions for this VGPR. unsigned VMEMTypes = 0; @@ -975,6 +999,14 @@ private: // Store representative LDS DMA operations. The only useful info here is // alias info. One store is kept per unique AAInfo. SmallVector LDSDMAStores; + + // State of all counters at each async mark encountered so far. 
+ SmallVector AsyncMarks; + static constexpr unsigned MaxAsyncMarks = 16; + + // Track the upper bound score for async operations that are not part of a + // mark yet. Initialized to all zeros. + CounterValueArray AsyncScore{}; }; class SIInsertWaitcntsLegacy : public MachineFunctionPass { @@ -1176,7 +1208,7 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { setScoreByOperand(Op, T, CurrScore); } if (Inst.mayStore() && - (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) { + (TII->isDS(Inst) || Context->isNonAsyncLdsDmaWrite(Inst))) { // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS // written can be accessed. A load from LDS to VMEM does not need a wait. // @@ -1220,6 +1252,14 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore); } + // FIXME: Not supported on GFX12 yet. Newer async operations use other + // counters too, so will need a map from instruction or event types to + // counter types. + if (Context->isAsyncLdsDmaWrite(Inst) && T == LOAD_CNT) { + assert(!SIInstrInfo::usesASYNC_CNT(Inst)); + AsyncScore[T] = CurrScore; + } + if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) { setRegScore(AMDGPU::SCC, T, CurrScore); PendingSCCWrite = &Inst; @@ -1227,13 +1267,28 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { } } +void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) { + // In the absence of loops, AsyncMarks can grow linearly with the program + // until we encounter a WAIT_ASYNCMARK. We could drop the oldest mark above a + // limit every time we push a new mark, but that seems like unnecessary work + // in practical cases. We do separately truncate the array when processing a + // loop, which should be sufficient. 
+ AsyncMarks.push_back(AsyncScore); + AsyncScore = {}; + LLVM_DEBUG({ + dbgs() << "recordAsyncMark:\n" << Inst; + for (const auto &Mark : AsyncMarks) { + llvm::interleaveComma(Mark, dbgs()); + dbgs() << '\n'; + } + }); +} + void WaitcntBrackets::print(raw_ostream &OS) const { const GCNSubtarget *ST = Context->ST; - OS << '\n'; for (auto T : inst_counter_types(Context->MaxCounter)) { unsigned SR = getScoreRange(T); - switch (T) { case LOAD_CNT: OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT(" @@ -1326,6 +1381,53 @@ void WaitcntBrackets::print(raw_ostream &OS) const { } OS << '\n'; + OS << "Async score: "; + if (AsyncScore.empty()) + OS << "none"; + else + llvm::interleaveComma(AsyncScore, OS); + OS << '\n'; + + OS << "Async marks: " << AsyncMarks.size() << '\n'; + + for (const auto &Mark : AsyncMarks) { + for (auto T : inst_counter_types()) { + unsigned MarkedScore = Mark[T]; + switch (T) { + case LOAD_CNT: + OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") + << "_CNT: " << MarkedScore; + break; + case DS_CNT: + OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") + << "_CNT: " << MarkedScore; + break; + case EXP_CNT: + OS << " EXP_CNT: " << MarkedScore; + break; + case STORE_CNT: + OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") + << "_CNT: " << MarkedScore; + break; + case SAMPLE_CNT: + OS << " SAMPLE_CNT: " << MarkedScore; + break; + case BVH_CNT: + OS << " BVH_CNT: " << MarkedScore; + break; + case KM_CNT: + OS << " KM_CNT: " << MarkedScore; + break; + case X_CNT: + OS << " X_CNT: " << MarkedScore; + break; + default: + OS << " UNKNOWN: " << MarkedScore; + break; + } + } + OS << '\n'; + } OS << '\n'; } @@ -1427,6 +1529,49 @@ void WaitcntBrackets::determineWaitForScore(InstCounterType T, } } +AMDGPU::Waitcnt WaitcntBrackets::determineAsyncWait(unsigned N) { + LLVM_DEBUG({ + dbgs() << "Need " << N << " async marks. 
Found " << AsyncMarks.size() + << ":\n"; + for (const auto &Mark : AsyncMarks) { + llvm::interleaveComma(Mark, dbgs()); + dbgs() << '\n'; + } + }); + + AMDGPU::Waitcnt Wait; + if (AsyncMarks.size() == MaxAsyncMarks) { + // Enforcing MaxAsyncMarks here is unnecessary work because the size of + // AsyncMarks is linear when traversing straightline code. But we do + // need to check if truncation may have occurred at a merge, and adjust N + // to ensure that a wait is generated. + LLVM_DEBUG(dbgs() << "Possible truncation. Ensuring a non-trivial wait.\n"); + N = std::min(N, (unsigned)MaxAsyncMarks - 1); + } + + if (AsyncMarks.size() <= N) { + LLVM_DEBUG(dbgs() << "No additional wait for async mark.\n"); + return Wait; + } + + size_t MarkIndex = AsyncMarks.size() - N - 1; + const auto &RequiredMark = AsyncMarks[MarkIndex]; + for (InstCounterType T : inst_counter_types()) + determineWaitForScore(T, RequiredMark[T], Wait); + + // Immediately remove the waited mark and all older ones. + // This happens BEFORE the wait is actually inserted, which is fine + // because we've already extracted the wait requirements. + LLVM_DEBUG({ + dbgs() << "Removing " << (MarkIndex + 1) + << " async marks after determining wait\n"; + }); + AsyncMarks.erase(AsyncMarks.begin(), AsyncMarks.begin() + MarkIndex + 1); + + LLVM_DEBUG(dbgs() << "Waits to add: " << Wait); + return Wait; +} + void WaitcntBrackets::determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg, AMDGPU::Waitcnt &Wait) const { if (Reg == AMDGPU::SCC) { @@ -1654,6 +1799,11 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( // possibility in an articial MIR test since such a situation cannot be // recreated by running the memory legalizer. 
II.eraseFromParent(); + } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) { + unsigned N = II.getOperand(0).getImm(); + LLVM_DEBUG(dbgs() << "Processing WAIT_ASYNCMARK: " << II << '\n';); + AMDGPU::Waitcnt OldWait = ScoreBrackets.determineAsyncWait(N); + Wait = Wait.combined(OldWait); } else { assert(Opcode == AMDGPU::S_WAITCNT_VSCNT); assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL); @@ -1907,6 +2057,8 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( // LDS, so no work required here yet. II.eraseFromParent(); continue; + } else if (Opcode == AMDGPU::WAIT_ASYNCMARK) { + reportFatalUsageError("WAIT_ASYNCMARK is not ready for GFX12 yet"); } else { std::optional CT = counterTypeForInstr(Opcode); assert(CT.has_value()); @@ -2231,6 +2383,7 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( bool SIInsertWaitcnts::generateWaitcntInstBefore( MachineInstr &MI, WaitcntBrackets &ScoreBrackets, MachineInstr *OldWaitcntInstr, PreheaderFlushFlags FlushFlags) { + LLVM_DEBUG(dbgs() << "\n*** GenerateWaitcntInstBefore: "; MI.print(dbgs());); setForceEmitWaitcnt(); assert(!MI.isMetaInstruction()); @@ -2785,6 +2938,84 @@ bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score, return OtherShifted > MyShifted; } +bool WaitcntBrackets::mergeAsyncMarks(ArrayRef MergeInfos, + ArrayRef OtherMarks) { + bool StrictDom = false; + + LLVM_DEBUG(dbgs() << "Merging async marks ..."); + // Early exit: both empty + if (AsyncMarks.empty() && OtherMarks.empty()) { + LLVM_DEBUG(dbgs() << " nothing to merge\n"); + return false; + } + LLVM_DEBUG(dbgs() << '\n'); + + // Determine maximum length needed after merging + auto MaxSize = (unsigned)std::max(AsyncMarks.size(), OtherMarks.size()); + + // For each backedge in isolation, the algorithm reaches a fixed point after + // the first call to merge(). This is unchanged even with the AsyncMarks + // array because we call mergeScore just like the other cases. 
+ // + // But in the rare pathological case, a nest of loops that pushes marks + // without waiting on any mark can cause AsyncMarks to grow very large. We cap + // it to a reasonable limit. We can tune this later or potentially introduce a + // user option to control the value. + MaxSize = std::min(MaxSize, MaxAsyncMarks); + + // Keep only the most recent marks within our limit. + if (AsyncMarks.size() > MaxSize) + AsyncMarks.erase(AsyncMarks.begin(), + AsyncMarks.begin() + (AsyncMarks.size() - MaxSize)); + + // Pad with zero-filled marks if our list is shorter. Zero represents "no + // pending async operations at this checkpoint" and acts as the identity + // element for max() during merging. We pad at the beginning since the marks + // need to be aligned in most-recent order. + CounterValueArray ZeroMark{}; + AsyncMarks.insert(AsyncMarks.begin(), MaxSize - AsyncMarks.size(), ZeroMark); + + LLVM_DEBUG({ + dbgs() << "Before merge:\n"; + for (const auto &Mark : AsyncMarks) { + llvm::interleaveComma(Mark, dbgs()); + dbgs() << '\n'; + } + }); + + LLVM_DEBUG({ + dbgs() << "Other marks:\n"; + for (const auto &Mark : OtherMarks) { + llvm::interleaveComma(Mark, dbgs()); + dbgs() << '\n'; + } + }); + + // Merge element-wise using the existing mergeScore function and the + // appropriate MergeInfo for each counter type. Iterate only while we have + // elements in both vectors. 
+ unsigned OtherSize = OtherMarks.size(); + unsigned OurSize = AsyncMarks.size(); + unsigned MergeCount = std::min(OtherSize, OurSize); + assert(OurSize == MaxSize); + for (unsigned Idx = 1; Idx <= MergeCount; ++Idx) { + for (auto T : inst_counter_types(Context->MaxCounter)) { + StrictDom |= mergeScore(MergeInfos[T], AsyncMarks[OurSize - Idx][T], + OtherMarks[OtherSize - Idx][T]); + } + } + + LLVM_DEBUG({ + dbgs() << "After merge:\n"; + for (const auto &Mark : AsyncMarks) { + llvm::interleaveComma(Mark, dbgs()); + dbgs() << '\n'; + } + }); + + return StrictDom; +} + /// Merge the pending events and associater score brackets of \p Other into /// this brackets status. /// @@ -2800,6 +3031,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { for (auto K : Other.SGPRs.keys()) SGPRs.try_emplace(K); + // Array to store MergeInfo for each counter type + MergeInfo MergeInfos[NUM_INST_CNTS]; + for (auto T : inst_counter_types(Context->MaxCounter)) { // Merge event flags for this counter const WaitEventSet &EventsForT = Context->getWaitEvents(T); @@ -2816,7 +3050,7 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { if (NewUB < ScoreLBs[T]) report_fatal_error("waitcnt score overflow"); - MergeInfo M; + MergeInfo &M = MergeInfos[T]; M.OldLB = ScoreLBs[T]; M.OtherLB = Other.ScoreLBs[T]; M.MyShift = NewUB - ScoreUBs[T]; @@ -2862,6 +3096,10 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { } } + StrictDom |= mergeAsyncMarks(MergeInfos, Other.AsyncMarks); + for (auto T : inst_counter_types(Context->MaxCounter)) + StrictDom |= mergeScore(MergeInfos[T], AsyncScore[T], Other.AsyncScore[T]); + purgeEmptyTrackingData(); return StrictDom; } @@ -2874,6 +3112,7 @@ static bool isWaitInstr(MachineInstr &Inst) { Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT || Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT || Opcode == AMDGPU::S_WAITCNT_lds_direct || + Opcode == AMDGPU::WAIT_ASYNCMARK || counterTypeForInstr(Opcode).has_value(); } @@ -2991,6 +3230,14 @@ bool 
SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, if (Block.getFirstTerminator() == Inst) FlushFlags = isPreheaderToFlush(Block, ScoreBrackets); + if (Inst.getOpcode() == AMDGPU::ASYNCMARK) { + // FIXME: Not supported on GFX12 yet. Will need a new feature when we do. + assert(ST->getGeneration() < AMDGPUSubtarget::GFX12); + ScoreBrackets.recordAsyncMark(Inst); + ++Iter; + continue; + } + // Generate an s_waitcnt instruction to be placed before Inst, if needed. Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr, FlushFlags); @@ -3432,7 +3679,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { if (!SuccBI.Incoming) { SuccBI.Dirty = true; if (SuccBII <= BII) { - LLVM_DEBUG(dbgs() << "repeat on backedge\n"); + LLVM_DEBUG(dbgs() << "Repeat on backedge without merge\n"); Repeat = true; } if (!MoveBracketsToSucc) { @@ -3440,11 +3687,20 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { } else { SuccBI.Incoming = std::make_unique(*Brackets); } - } else if (SuccBI.Incoming->merge(*Brackets)) { - SuccBI.Dirty = true; - if (SuccBII <= BII) { - LLVM_DEBUG(dbgs() << "repeat on backedge\n"); - Repeat = true; + } else { + LLVM_DEBUG({ + dbgs() << "Try to merge "; + MBB->printName(dbgs()); + dbgs() << " into "; + Succ->printName(dbgs()); + dbgs() << '\n'; + }); + if (SuccBI.Incoming->merge(*Brackets)) { + SuccBI.Dirty = true; + if (SuccBII <= BII) { + LLVM_DEBUG(dbgs() << "Repeat on backedge with merge\n"); + Repeat = true; + } } } } diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 1b0862e039c6..ce6e862104b4 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1712,11 +1712,21 @@ let SubtargetPredicate = HasWaitXcnt in { // Represents the point at which a wave must wait for all outstanding direct loads to LDS. // Typically inserted by the memory legalizer and consumed by SIInsertWaitcnts. 
- def S_WAITCNT_lds_direct : SPseudoInstSI<(outs), (ins)> { let hasSideEffects = 0; } +let SubtargetPredicate = HasVMemToLDSLoad in { +def ASYNCMARK : SPseudoInstSI<(outs), (ins), + [(int_amdgcn_asyncmark)]> { + let maybeAtomic = 0; +} +def WAIT_ASYNCMARK : SOPP_Pseudo <"", (ins s16imm:$simm16), "$simm16", + [(int_amdgcn_wait_asyncmark timm:$simm16)]> { + let maybeAtomic = 0; +} +} + def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_sethalt timm:$simm16)]>; def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16"> { diff --git a/llvm/test/CodeGen/AMDGPU/async-buffer-loads.ll b/llvm/test/CodeGen/AMDGPU/async-buffer-loads.ll index 93b51ff83deb..c6028497c941 100644 --- a/llvm/test/CodeGen/AMDGPU/async-buffer-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/async-buffer-loads.ll @@ -7,19 +7,24 @@ define float @raw.buffer.load(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds ; CHECK: ; %bb.0: ; %main_body ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 m0, s20 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 lds -; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 offset:4 glc lds -; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 offset:8 slc lds ; CHECK-NEXT: v_mov_b32_e32 v0, s20 -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 lds +; CHECK-NEXT: ; asyncmark +; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 offset:4 glc lds +; CHECK-NEXT: ; asyncmark +; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 offset:8 slc lds +; CHECK-NEXT: ; wait_asyncmark(1) +; CHECK-NEXT: s_waitcnt vmcnt(2) ; CHECK-NEXT: ds_read_b32 v0, v0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] main_body: call void @llvm.amdgcn.raw.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() call void 
@llvm.amdgcn.raw.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 4, i32 1) + call void @llvm.amdgcn.asyncmark() call void @llvm.amdgcn.raw.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 8, i32 2) + call void @llvm.amdgcn.wait.asyncmark(i16 1) %res = load float, ptr addrspace(3) %lds ret float %res } @@ -29,19 +34,24 @@ define float @raw.ptr.buffer.load(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) ; CHECK: ; %bb.0: ; %main_body ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b32 m0, s20 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 lds -; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 offset:4 glc lds -; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 offset:8 slc lds ; CHECK-NEXT: v_mov_b32_e32 v0, s20 -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 lds +; CHECK-NEXT: ; asyncmark +; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 offset:4 glc lds +; CHECK-NEXT: ; asyncmark +; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 offset:8 slc lds +; CHECK-NEXT: ; wait_asyncmark(1) +; CHECK-NEXT: s_waitcnt vmcnt(2) ; CHECK-NEXT: ds_read_b32 v0, v0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] main_body: call void @llvm.amdgcn.raw.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() call void @llvm.amdgcn.raw.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 4, i32 1) + call void @llvm.amdgcn.asyncmark() call void @llvm.amdgcn.raw.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 8, i32 2) + call void @llvm.amdgcn.wait.asyncmark(i16 1) %res = load float, ptr addrspace(3) %lds ret float %res } @@ -53,17 +63,23 @@ define float @struct.buffer.load(<4 x i32> 
inreg %rsrc, ptr addrspace(3) inreg % ; CHECK-NEXT: s_mov_b32 m0, s20 ; CHECK-NEXT: v_mov_b32_e32 v0, 8 ; CHECK-NEXT: buffer_load_dword v0, s[16:19], 0 idxen lds +; CHECK-NEXT: ; asyncmark ; CHECK-NEXT: buffer_load_dword v0, s[16:19], 0 idxen offset:4 glc lds +; CHECK-NEXT: ; asyncmark ; CHECK-NEXT: buffer_load_dword v0, s[16:19], 0 idxen offset:8 slc lds ; CHECK-NEXT: v_mov_b32_e32 v0, s20 -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; wait_asyncmark(1) +; CHECK-NEXT: s_waitcnt vmcnt(2) ; CHECK-NEXT: ds_read_b32 v0, v0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] main_body: call void @llvm.amdgcn.struct.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() call void @llvm.amdgcn.struct.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 4, i32 1) + call void @llvm.amdgcn.asyncmark() call void @llvm.amdgcn.struct.buffer.load.async.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 8, i32 2) + call void @llvm.amdgcn.wait.asyncmark(i16 1) %res = load float, ptr addrspace(3) %lds ret float %res } @@ -75,17 +91,23 @@ define float @struct.ptr.buffer.load(ptr addrspace(8) inreg %rsrc, ptr addrspace ; CHECK-NEXT: s_mov_b32 m0, s20 ; CHECK-NEXT: v_mov_b32_e32 v0, 8 ; CHECK-NEXT: buffer_load_dword v0, s[16:19], 0 idxen lds +; CHECK-NEXT: ; asyncmark ; CHECK-NEXT: buffer_load_dword v0, s[16:19], 0 idxen offset:4 glc lds +; CHECK-NEXT: ; asyncmark ; CHECK-NEXT: buffer_load_dword v0, s[16:19], 0 idxen offset:8 slc lds ; CHECK-NEXT: v_mov_b32_e32 v0, s20 -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; wait_asyncmark(1) +; CHECK-NEXT: s_waitcnt vmcnt(2) ; CHECK-NEXT: ds_read_b32 v0, v0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] main_body: call void 
@llvm.amdgcn.struct.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() call void @llvm.amdgcn.struct.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 4, i32 1) + call void @llvm.amdgcn.asyncmark() call void @llvm.amdgcn.struct.ptr.buffer.load.async.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 8, i32 0, i32 0, i32 8, i32 2) + call void @llvm.amdgcn.wait.asyncmark(i16 1) %res = load float, ptr addrspace(3) %lds ret float %res } diff --git a/llvm/test/CodeGen/AMDGPU/asyncmark-err.ll b/llvm/test/CodeGen/AMDGPU/asyncmark-err.ll new file mode 100644 index 000000000000..f929cb3e380b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/asyncmark-err.ll @@ -0,0 +1,19 @@ +; RUN: split-file %s %t +; RUN: not --crash llc -filetype=null -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 %t/mark.ll 2>&1 | FileCheck --ignore-case %s +; RUN: not llc -filetype=null -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 %t/mark.ll 2>&1 | FileCheck --ignore-case %s +; RUN: not --crash llc -filetype=null -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 %t/wait.ll 2>&1 | FileCheck --ignore-case %s +; RUN: not llc -filetype=null -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 %t/wait.ll 2>&1 | FileCheck --ignore-case %s + +; CHECK: LLVM ERROR: Cannot select + +;--- mark.ll +define void @async_err() { + call void @llvm.amdgcn.asyncmark() + ret void +} + +;--- wait.ll +define void @async_err() { + call void @llvm.amdgcn.wait.asyncmark(i16 0) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/asyncmark-max-pregfx12.ll b/llvm/test/CodeGen/AMDGPU/asyncmark-max-pregfx12.ll new file mode 100644 index 000000000000..93bf2aae8163 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/asyncmark-max-pregfx12.ll @@ -0,0 +1,279 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=gfx942 < %s | FileCheck %s +; RUN: llc 
-march=amdgcn -mcpu=gfx1010 < %s | FileCheck %s + +; Loop body exceeds MaxAsyncMarkers on first iteration +; Preloop: 5 markers +; Loop body: 18 markers + +; CHECK-LABEL: test_loop_exceeds_max_first_iteration: +; CHECK: ; wait_asyncmark(3) +; CHECK-NEXT: s_waitcnt vmcnt(3) + +define void @test_loop_exceeds_max_first_iteration(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 %n, ptr addrspace(1) %out) { +entry: + ; Preloop: 5 async LDS DMA operations + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + br label %loop_header + +loop_header: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop_body ] + %i.next = add i32 %i, 1 + %cmp = icmp slt i32 %i, %n + br i1 %cmp, label %loop_body, label %exit + +loop_body: + ; Loop body with 18 async operations + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) 
%lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void 
@llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + br label %loop_header + +exit: + call void @llvm.amdgcn.wait.asyncmark(i16 3) + %lds_val = load i32, ptr addrspace(3) %lds + store i32 %lds_val, ptr addrspace(1) %out + ret void +} + +; Loop body does not exceed MaxAsyncMarkers on first iteration +; Preloop: 5 markers +; Loop body: 5 markers + +; CHECK-LABEL: test_loop_needs_more_iterations: +; CHECK: ; wait_asyncmark(3) +; CHECK-NEXT: s_waitcnt vmcnt(3) + +define void @test_loop_needs_more_iterations(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 %n, ptr addrspace(1) %out) { +entry: + ; Preloop: 5 async LDS DMA operations + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + br label %loop_header + +loop_header: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop_body ] + %i.next = add i32 %i, 1 + %cmp = icmp slt i32 %i, %n + br i1 %cmp, label %loop_body, label %exit + +loop_body: + ; Loop body with 5 async operations + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr 
addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + br label %loop_header + +exit: + call void @llvm.amdgcn.wait.asyncmark(i16 3) + %lds_val = load i32, ptr addrspace(3) %lds + store i32 %lds_val, ptr addrspace(1) %out + ret void +} + +; Merge exceeds MaxAsyncMarkers + +; CHECK-LABEL: max_when_merged: +; CHECK: ; wait_asyncmark(17) +; CHECK-NEXT: s_waitcnt vmcnt(15) + +define void @max_when_merged(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 %n, ptr addrspace(1) %out) { +entry: + %cmp = icmp slt i32 0, %n + br i1 %cmp, label %then, label %else + +then: + ; 5 async operations + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + br label %endif + +else: + ; 18 async operations + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call 
void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void 
@llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + br label %endif + +endif: + call void @llvm.amdgcn.wait.asyncmark(i16 17) + %lds_val = load i32, ptr addrspace(3) %lds + store i32 %lds_val, ptr addrspace(1) %out + ret void +} + +; Straightline exceeds MaxAsyncMarkers + +; CHECK-LABEL: no_max_in_straightline: +; CHECK: ; wait_asyncmark(17) +; CHECK-NEXT: s_waitcnt vmcnt(17) + +define void @no_max_in_straightline(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 %n, ptr addrspace(1) %out) { + ; 18 async operations + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void 
@llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %in, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() + + call void @llvm.amdgcn.wait.asyncmark(i16 17) + %lds_val = load i32, ptr addrspace(3) %lds + store i32 %lds_val, ptr addrspace(1) %out + ret void +} diff --git 
a/llvm/test/CodeGen/AMDGPU/asyncmark-pregfx12.ll b/llvm/test/CodeGen/AMDGPU/asyncmark-pregfx12.ll index f78cd0b959a0..fbac459d3e2f 100644 --- a/llvm/test/CodeGen/AMDGPU/asyncmark-pregfx12.ll +++ b/llvm/test/CodeGen/AMDGPU/asyncmark-pregfx12.ll @@ -2,7 +2,46 @@ ; RUN: sed 's/.ASYNC/.async/' %s | llc -march=amdgcn -mcpu=gfx900 -o - | FileCheck %s -check-prefixes=WITHASYNC ; RUN: sed 's/.ASYNC//' %s | llc -march=amdgcn -mcpu=gfx900 -o - | FileCheck %s -check-prefixes=WITHOUT -; Test async operations with global_load_lds and global loads +; Demonstrate that wait.asyncmark is a code motion barrier for loads from LDS. +; This is the simplest demo possible. We don't actually use async ops, but just +; a pair of adjacent LDS loads. In the absence of the async mark, these get +; coalesced into a wider LDS load. + +define void @code_barrier(ptr addrspace(1) %foo, ptr addrspace(3) %lds, ptr addrspace(3) %out) { +; WITHASYNC-LABEL: code_barrier: +; WITHASYNC: ; %bb.0: +; WITHASYNC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WITHASYNC-NEXT: ds_read_b32 v0, v2 +; WITHASYNC-NEXT: ; wait_asyncmark(0) +; WITHASYNC-NEXT: ds_read_b32 v1, v2 offset:4 +; WITHASYNC-NEXT: s_waitcnt lgkmcnt(0) +; WITHASYNC-NEXT: v_add_u32_e32 v0, v0, v1 +; WITHASYNC-NEXT: ds_write_b32 v3, v0 +; WITHASYNC-NEXT: s_waitcnt lgkmcnt(0) +; WITHASYNC-NEXT: s_setpc_b64 s[30:31] +; +; WITHOUT-LABEL: code_barrier: +; WITHOUT: ; %bb.0: +; WITHOUT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WITHOUT-NEXT: ds_read_b32 v0, v2 +; WITHOUT-NEXT: ; wait_asyncmark(0) +; WITHOUT-NEXT: ds_read_b32 v1, v2 offset:4 +; WITHOUT-NEXT: s_waitcnt lgkmcnt(0) +; WITHOUT-NEXT: v_add_u32_e32 v0, v0, v1 +; WITHOUT-NEXT: ds_write_b32 v3, v0 +; WITHOUT-NEXT: s_waitcnt lgkmcnt(0) +; WITHOUT-NEXT: s_setpc_b64 s[30:31] + %lds_gep1 = getelementptr i32, ptr addrspace(3) %lds, i32 1 + %val1 = load i32, ptr addrspace(3) %lds + call void @llvm.amdgcn.wait.asyncmark(i16 0) + %val2 = load i32, ptr addrspace(3) %lds_gep1 + %sum = add i32 
%val1, %val2 + store i32 %sum, ptr addrspace(3) %out + ret void +} + +; Test async mark/wait with global_load_lds and global loads + ; This version uses wave barriers to enforce program order so that unrelated vmem ; instructions do not get reordered before reaching this point. @@ -10,24 +49,31 @@ define void @interleaved_global_and_dma(ptr addrspace(1) %foo, ptr addrspace(3) ; WITHASYNC-LABEL: interleaved_global_and_dma: ; WITHASYNC: ; %bb.0: ; %entry ; WITHASYNC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WITHASYNC-NEXT: v_readfirstlane_b32 s4, v2 ; WITHASYNC-NEXT: global_load_dword v7, v[3:4], off ; WITHASYNC-NEXT: global_load_dword v8, v[0:1], off -; WITHASYNC-NEXT: v_readfirstlane_b32 s4, v2 -; WITHASYNC-NEXT: ; wave barrier ; WITHASYNC-NEXT: s_mov_b32 m0, s4 +; WITHASYNC-NEXT: ; wave barrier +; WITHASYNC-NEXT: s_nop 0 +; WITHASYNC-NEXT: global_load_dword v[3:4], off lds +; WITHASYNC-NEXT: ; asyncmark ; WITHASYNC-NEXT: global_load_dword v0, v[0:1], off +; WITHASYNC-NEXT: ; wave barrier ; WITHASYNC-NEXT: s_nop 0 ; WITHASYNC-NEXT: global_load_dword v[3:4], off lds ; WITHASYNC-NEXT: ; wave barrier -; WITHASYNC-NEXT: global_load_dword v[3:4], off lds -; WITHASYNC-NEXT: ; wave barrier ; WITHASYNC-NEXT: global_load_dword v1, v[3:4], off +; WITHASYNC-NEXT: ; asyncmark +; WITHASYNC-NEXT: ; wait_asyncmark(1) +; WITHASYNC-NEXT: s_waitcnt vmcnt(3) +; WITHASYNC-NEXT: ds_read_b32 v3, v2 +; WITHASYNC-NEXT: ; wait_asyncmark(0) ; WITHASYNC-NEXT: s_waitcnt vmcnt(1) ; WITHASYNC-NEXT: ds_read_b32 v2, v2 -; WITHASYNC-NEXT: v_add_u32_e32 v3, v8, v7 -; WITHASYNC-NEXT: s_waitcnt lgkmcnt(0) -; WITHASYNC-NEXT: v_add3_u32 v0, v3, v2, v0 -; WITHASYNC-NEXT: s_waitcnt vmcnt(0) +; WITHASYNC-NEXT: v_add_u32_e32 v4, v8, v7 +; WITHASYNC-NEXT: s_waitcnt lgkmcnt(1) +; WITHASYNC-NEXT: v_add3_u32 v0, v4, v3, v0 +; WITHASYNC-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; WITHASYNC-NEXT: v_add3_u32 v0, v0, v1, v2 ; WITHASYNC-NEXT: global_store_dword v[5:6], v0, off ; WITHASYNC-NEXT: s_waitcnt vmcnt(0) 
@@ -36,24 +82,30 @@ define void @interleaved_global_and_dma(ptr addrspace(1) %foo, ptr addrspace(3) ; WITHOUT-LABEL: interleaved_global_and_dma: ; WITHOUT: ; %bb.0: ; %entry ; WITHOUT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WITHOUT-NEXT: v_readfirstlane_b32 s4, v2 ; WITHOUT-NEXT: global_load_dword v7, v[3:4], off ; WITHOUT-NEXT: global_load_dword v8, v[0:1], off -; WITHOUT-NEXT: v_readfirstlane_b32 s4, v2 -; WITHOUT-NEXT: ; wave barrier ; WITHOUT-NEXT: s_mov_b32 m0, s4 +; WITHOUT-NEXT: ; wave barrier +; WITHOUT-NEXT: s_nop 0 +; WITHOUT-NEXT: global_load_dword v[3:4], off lds +; WITHOUT-NEXT: ; asyncmark ; WITHOUT-NEXT: global_load_dword v0, v[0:1], off +; WITHOUT-NEXT: ; wave barrier ; WITHOUT-NEXT: s_nop 0 ; WITHOUT-NEXT: global_load_dword v[3:4], off lds ; WITHOUT-NEXT: ; wave barrier -; WITHOUT-NEXT: global_load_dword v[3:4], off lds -; WITHOUT-NEXT: ; wave barrier ; WITHOUT-NEXT: global_load_dword v1, v[3:4], off +; WITHOUT-NEXT: ; asyncmark +; WITHOUT-NEXT: ; wait_asyncmark(1) ; WITHOUT-NEXT: s_waitcnt vmcnt(1) +; WITHOUT-NEXT: ds_read_b32 v3, v2 +; WITHOUT-NEXT: ; wait_asyncmark(0) ; WITHOUT-NEXT: ds_read_b32 v2, v2 -; WITHOUT-NEXT: v_add_u32_e32 v3, v8, v7 -; WITHOUT-NEXT: s_waitcnt lgkmcnt(0) -; WITHOUT-NEXT: v_add3_u32 v0, v3, v2, v0 -; WITHOUT-NEXT: s_waitcnt vmcnt(0) +; WITHOUT-NEXT: v_add_u32_e32 v4, v8, v7 +; WITHOUT-NEXT: s_waitcnt lgkmcnt(1) +; WITHOUT-NEXT: v_add3_u32 v0, v4, v3, v0 +; WITHOUT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; WITHOUT-NEXT: v_add3_u32 v0, v0, v1, v2 ; WITHOUT-NEXT: global_store_dword v[5:6], v0, off ; WITHOUT-NEXT: s_waitcnt vmcnt(0) @@ -64,6 +116,7 @@ entry: %foo_v1 = load i32, ptr addrspace(1) %foo call void @llvm.amdgcn.wave.barrier() call void @llvm.amdgcn.global.load.ASYNC.lds(ptr addrspace(1) %bar, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() ; Second batch: global load, async global-to-LDS, global load %foo_v2 = load i32, ptr addrspace(1) %foo @@ -71,13 +124,16 @@ entry: call 
void @llvm.amdgcn.global.load.ASYNC.lds(ptr addrspace(1) %bar, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) call void @llvm.amdgcn.wave.barrier() %bar_v12 = load i32, ptr addrspace(1) %bar + call void @llvm.amdgcn.asyncmark() ; Wait for first async mark and read from LDS ; This results in vmcnt(3) corresponding to the second batch. + call void @llvm.amdgcn.wait.asyncmark(i16 1) %lds_val21 = load i32, ptr addrspace(3) %lds ; Wait for the next lds dma ; This results in vmcnt(1), corresponding to %bar_v12. Could have been combined with the lgkmcnt(1) for %lds_val21. + call void @llvm.amdgcn.wait.asyncmark(i16 0) %lds_val22 = load i32, ptr addrspace(3) %lds %sum1 = add i32 %foo_v1, %bar_v11 %sum2 = add i32 %sum1, %lds_val21 @@ -94,25 +150,31 @@ define void @interleaved_buffer_and_dma(ptr addrspace(8) inreg %buf, ptr addrspa ; WITHASYNC-LABEL: interleaved_buffer_and_dma: ; WITHASYNC: ; %bb.0: ; %entry ; WITHASYNC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WITHASYNC-NEXT: s_mov_b32 m0, s20 ; WITHASYNC-NEXT: global_load_dword v6, v[2:3], off ; WITHASYNC-NEXT: global_load_dword v7, v[0:1], off -; WITHASYNC-NEXT: s_mov_b32 m0, s20 -; WITHASYNC-NEXT: ; wave barrier ; WITHASYNC-NEXT: v_mov_b32_e32 v8, 0x54 +; WITHASYNC-NEXT: ; wave barrier +; WITHASYNC-NEXT: buffer_load_dword v8, s[16:19], 0 offen lds +; WITHASYNC-NEXT: ; asyncmark ; WITHASYNC-NEXT: global_load_dword v0, v[0:1], off ; WITHASYNC-NEXT: v_mov_b32_e32 v1, 0x58 -; WITHASYNC-NEXT: buffer_load_dword v8, s[16:19], 0 offen lds ; WITHASYNC-NEXT: ; wave barrier ; WITHASYNC-NEXT: buffer_load_dword v1, s[16:19], 0 offen lds ; WITHASYNC-NEXT: ; wave barrier ; WITHASYNC-NEXT: global_load_dword v1, v[2:3], off ; WITHASYNC-NEXT: v_mov_b32_e32 v2, s20 +; WITHASYNC-NEXT: ; asyncmark +; WITHASYNC-NEXT: ; wait_asyncmark(1) +; WITHASYNC-NEXT: s_waitcnt vmcnt(3) +; WITHASYNC-NEXT: ds_read_b32 v3, v2 +; WITHASYNC-NEXT: ; wait_asyncmark(0) ; WITHASYNC-NEXT: s_waitcnt vmcnt(1) ; WITHASYNC-NEXT: ds_read_b32 v2, v2 -; 
WITHASYNC-NEXT: v_add_u32_e32 v3, v7, v6 -; WITHASYNC-NEXT: s_waitcnt lgkmcnt(0) -; WITHASYNC-NEXT: v_add3_u32 v0, v3, v2, v0 -; WITHASYNC-NEXT: s_waitcnt vmcnt(0) +; WITHASYNC-NEXT: v_add_u32_e32 v6, v7, v6 +; WITHASYNC-NEXT: s_waitcnt lgkmcnt(1) +; WITHASYNC-NEXT: v_add3_u32 v0, v6, v3, v0 +; WITHASYNC-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; WITHASYNC-NEXT: v_add3_u32 v0, v0, v1, v2 ; WITHASYNC-NEXT: global_store_dword v[4:5], v0, off ; WITHASYNC-NEXT: s_waitcnt vmcnt(0) @@ -121,25 +183,30 @@ define void @interleaved_buffer_and_dma(ptr addrspace(8) inreg %buf, ptr addrspa ; WITHOUT-LABEL: interleaved_buffer_and_dma: ; WITHOUT: ; %bb.0: ; %entry ; WITHOUT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WITHOUT-NEXT: s_mov_b32 m0, s20 ; WITHOUT-NEXT: global_load_dword v6, v[2:3], off ; WITHOUT-NEXT: global_load_dword v7, v[0:1], off -; WITHOUT-NEXT: s_mov_b32 m0, s20 -; WITHOUT-NEXT: ; wave barrier ; WITHOUT-NEXT: v_mov_b32_e32 v8, 0x54 +; WITHOUT-NEXT: ; wave barrier +; WITHOUT-NEXT: buffer_load_dword v8, s[16:19], 0 offen lds +; WITHOUT-NEXT: ; asyncmark ; WITHOUT-NEXT: global_load_dword v0, v[0:1], off ; WITHOUT-NEXT: v_mov_b32_e32 v1, 0x58 -; WITHOUT-NEXT: buffer_load_dword v8, s[16:19], 0 offen lds ; WITHOUT-NEXT: ; wave barrier ; WITHOUT-NEXT: buffer_load_dword v1, s[16:19], 0 offen lds ; WITHOUT-NEXT: ; wave barrier ; WITHOUT-NEXT: global_load_dword v1, v[2:3], off ; WITHOUT-NEXT: v_mov_b32_e32 v2, s20 +; WITHOUT-NEXT: ; asyncmark +; WITHOUT-NEXT: ; wait_asyncmark(1) ; WITHOUT-NEXT: s_waitcnt vmcnt(1) +; WITHOUT-NEXT: ds_read_b32 v3, v2 +; WITHOUT-NEXT: ; wait_asyncmark(0) ; WITHOUT-NEXT: ds_read_b32 v2, v2 -; WITHOUT-NEXT: v_add_u32_e32 v3, v7, v6 -; WITHOUT-NEXT: s_waitcnt lgkmcnt(0) -; WITHOUT-NEXT: v_add3_u32 v0, v3, v2, v0 -; WITHOUT-NEXT: s_waitcnt vmcnt(0) +; WITHOUT-NEXT: v_add_u32_e32 v6, v7, v6 +; WITHOUT-NEXT: s_waitcnt lgkmcnt(1) +; WITHOUT-NEXT: v_add3_u32 v0, v6, v3, v0 +; WITHOUT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; WITHOUT-NEXT: v_add3_u32 
v0, v0, v1, v2 ; WITHOUT-NEXT: global_store_dword v[4:5], v0, off ; WITHOUT-NEXT: s_waitcnt vmcnt(0) @@ -150,6 +217,7 @@ entry: %foo_v1 = load i32, ptr addrspace(1) %foo call void @llvm.amdgcn.wave.barrier() call void @llvm.amdgcn.raw.ptr.buffer.load.ASYNC.lds(ptr addrspace(8) %buf, ptr addrspace(3) %lds, i32 4, i32 84, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() ; Second batch: global load, async global-to-LDS, global load. %foo_v2 = load i32, ptr addrspace(1) %foo @@ -157,13 +225,16 @@ entry: call void @llvm.amdgcn.raw.ptr.buffer.load.ASYNC.lds(ptr addrspace(8) %buf, ptr addrspace(3) %lds, i32 4, i32 88, i32 0, i32 0, i32 0) call void @llvm.amdgcn.wave.barrier() %bar_v12 = load i32, ptr addrspace(1) %bar + call void @llvm.amdgcn.asyncmark() ; Wait for first async mark and read from LDS. ; This results in vmcnt(3) corresponding to the second batch. + call void @llvm.amdgcn.wait.asyncmark(i16 1) %lds_val21 = load i32, ptr addrspace(3) %lds ; Wait for the next lds dma. ; This results in vmcnt(1) because the last global load is not async. 
+ call void @llvm.amdgcn.wait.asyncmark(i16 0) %lds_val22 = load i32, ptr addrspace(3) %lds %sum1 = add i32 %foo_v1, %bar_v11 %sum2 = add i32 %sum1, %lds_val21 @@ -185,29 +256,37 @@ define void @test_pipelined_loop(ptr addrspace(1) %foo, ptr addrspace(3) %lds, p ; WITHASYNC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; WITHASYNC-NEXT: v_readfirstlane_b32 s4, v2 ; WITHASYNC-NEXT: s_mov_b32 m0, s4 -; WITHASYNC-NEXT: s_nop 0 -; WITHASYNC-NEXT: global_load_dword v[0:1], off lds -; WITHASYNC-NEXT: global_load_dword v[0:1], off lds ; WITHASYNC-NEXT: v_mov_b32_e32 v5, 0 +; WITHASYNC-NEXT: global_load_dword v[0:1], off lds +; WITHASYNC-NEXT: ; asyncmark +; WITHASYNC-NEXT: global_load_dword v[0:1], off lds ; WITHASYNC-NEXT: s_mov_b32 s6, 2 ; WITHASYNC-NEXT: s_mov_b64 s[4:5], 0 -; WITHASYNC-NEXT: .LBB2_1: ; %loop_body +; WITHASYNC-NEXT: ; asyncmark +; WITHASYNC-NEXT: .LBB3_1: ; %loop_body ; WITHASYNC-NEXT: ; =>This Inner Loop Header: Depth=1 ; WITHASYNC-NEXT: v_readfirstlane_b32 s7, v2 ; WITHASYNC-NEXT: s_mov_b32 m0, s7 ; WITHASYNC-NEXT: s_add_i32 s6, s6, 1 ; WITHASYNC-NEXT: global_load_dword v[0:1], off lds -; WITHASYNC-NEXT: s_waitcnt vmcnt(0) +; WITHASYNC-NEXT: ; asyncmark +; WITHASYNC-NEXT: ; wait_asyncmark(2) +; WITHASYNC-NEXT: s_waitcnt vmcnt(2) ; WITHASYNC-NEXT: ds_read_b32 v6, v2 ; WITHASYNC-NEXT: v_cmp_ge_i32_e32 vcc, s6, v7 ; WITHASYNC-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; WITHASYNC-NEXT: s_waitcnt lgkmcnt(0) ; WITHASYNC-NEXT: v_add_u32_e32 v5, v5, v6 ; WITHASYNC-NEXT: s_andn2_b64 exec, exec, s[4:5] -; WITHASYNC-NEXT: s_cbranch_execnz .LBB2_1 +; WITHASYNC-NEXT: s_cbranch_execnz .LBB3_1 ; WITHASYNC-NEXT: ; %bb.2: ; %epilog ; WITHASYNC-NEXT: s_or_b64 exec, exec, s[4:5] -; WITHASYNC-NEXT: v_add_u32_e32 v0, v5, v6 +; WITHASYNC-NEXT: ; wait_asyncmark(1) +; WITHASYNC-NEXT: s_waitcnt vmcnt(1) +; WITHASYNC-NEXT: ds_read_b32 v0, v2 +; WITHASYNC-NEXT: ; wait_asyncmark(0) +; WITHASYNC-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; WITHASYNC-NEXT: v_add_u32_e32 v0, v5, v0 ; 
WITHASYNC-NEXT: global_store_dword v[3:4], v0, off ; WITHASYNC-NEXT: s_waitcnt vmcnt(0) ; WITHASYNC-NEXT: s_setpc_b64 s[30:31] @@ -217,18 +296,21 @@ define void @test_pipelined_loop(ptr addrspace(1) %foo, ptr addrspace(3) %lds, p ; WITHOUT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; WITHOUT-NEXT: v_readfirstlane_b32 s4, v2 ; WITHOUT-NEXT: s_mov_b32 m0, s4 -; WITHOUT-NEXT: s_nop 0 -; WITHOUT-NEXT: global_load_dword v[0:1], off lds -; WITHOUT-NEXT: global_load_dword v[0:1], off lds ; WITHOUT-NEXT: v_mov_b32_e32 v5, 0 +; WITHOUT-NEXT: global_load_dword v[0:1], off lds +; WITHOUT-NEXT: ; asyncmark +; WITHOUT-NEXT: global_load_dword v[0:1], off lds ; WITHOUT-NEXT: s_mov_b32 s6, 2 ; WITHOUT-NEXT: s_mov_b64 s[4:5], 0 -; WITHOUT-NEXT: .LBB2_1: ; %loop_body +; WITHOUT-NEXT: ; asyncmark +; WITHOUT-NEXT: .LBB3_1: ; %loop_body ; WITHOUT-NEXT: ; =>This Inner Loop Header: Depth=1 ; WITHOUT-NEXT: v_readfirstlane_b32 s7, v2 ; WITHOUT-NEXT: s_mov_b32 m0, s7 ; WITHOUT-NEXT: s_add_i32 s6, s6, 1 ; WITHOUT-NEXT: global_load_dword v[0:1], off lds +; WITHOUT-NEXT: ; asyncmark +; WITHOUT-NEXT: ; wait_asyncmark(2) ; WITHOUT-NEXT: s_waitcnt vmcnt(0) ; WITHOUT-NEXT: ds_read_b32 v6, v2 ; WITHOUT-NEXT: v_cmp_ge_i32_e32 vcc, s6, v7 @@ -236,19 +318,25 @@ define void @test_pipelined_loop(ptr addrspace(1) %foo, ptr addrspace(3) %lds, p ; WITHOUT-NEXT: s_waitcnt lgkmcnt(0) ; WITHOUT-NEXT: v_add_u32_e32 v5, v5, v6 ; WITHOUT-NEXT: s_andn2_b64 exec, exec, s[4:5] -; WITHOUT-NEXT: s_cbranch_execnz .LBB2_1 +; WITHOUT-NEXT: s_cbranch_execnz .LBB3_1 ; WITHOUT-NEXT: ; %bb.2: ; %epilog ; WITHOUT-NEXT: s_or_b64 exec, exec, s[4:5] -; WITHOUT-NEXT: v_add_u32_e32 v0, v5, v6 +; WITHOUT-NEXT: ; wait_asyncmark(1) +; WITHOUT-NEXT: ds_read_b32 v0, v2 +; WITHOUT-NEXT: ; wait_asyncmark(0) +; WITHOUT-NEXT: s_waitcnt lgkmcnt(0) +; WITHOUT-NEXT: v_add_u32_e32 v0, v5, v0 ; WITHOUT-NEXT: global_store_dword v[3:4], v0, off ; WITHOUT-NEXT: s_waitcnt vmcnt(0) ; WITHOUT-NEXT: s_setpc_b64 s[30:31] prolog: ; Load first 
iteration call void @llvm.amdgcn.global.load.ASYNC.lds(ptr addrspace(1) %foo, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() ; Load second iteration call void @llvm.amdgcn.global.load.ASYNC.lds(ptr addrspace(1) %foo, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() br label %loop_body @@ -258,8 +346,10 @@ loop_body: ; Load next iteration call void @llvm.amdgcn.global.load.ASYNC.lds(ptr addrspace(1) %foo, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() ; Wait for iteration i-2 and process + call void @llvm.amdgcn.wait.asyncmark(i16 2) %lds_idx = sub i32 %i, 2 %lds_val = load i32, ptr addrspace(3) %lds @@ -271,9 +361,11 @@ loop_body: epilog: ; Process remaining iterations + call void @llvm.amdgcn.wait.asyncmark(i16 1) %lds_val_n_2 = load i32, ptr addrspace(3) %lds %sum_e2 = add i32 %sum_i, %lds_val_n_2 + call void @llvm.amdgcn.wait.asyncmark(i16 0) %lds_val_n_1 = load i32, ptr addrspace(3) %lds %sum_e1 = add i32 %sum_e2, %lds_val_n_1 store i32 %sum_e2, ptr addrspace(1) %bar @@ -288,20 +380,22 @@ define void @test_pipelined_loop_with_global(ptr addrspace(1) %foo, ptr addrspac ; WITHASYNC: ; %bb.0: ; %prolog ; WITHASYNC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; WITHASYNC-NEXT: v_readfirstlane_b32 s4, v2 -; WITHASYNC-NEXT: global_load_dword v10, v[0:1], off -; WITHASYNC-NEXT: global_load_dword v8, v[0:1], off -; WITHASYNC-NEXT: global_load_dword v14, v[3:4], off -; WITHASYNC-NEXT: global_load_dword v9, v[3:4], off ; WITHASYNC-NEXT: s_mov_b32 m0, s4 +; WITHASYNC-NEXT: global_load_dword v10, v[0:1], off +; WITHASYNC-NEXT: global_load_dword v14, v[3:4], off ; WITHASYNC-NEXT: s_mov_b32 s6, 2 ; WITHASYNC-NEXT: global_load_dword v[0:1], off lds -; WITHASYNC-NEXT: global_load_dword v[0:1], off lds +; WITHASYNC-NEXT: ; asyncmark +; WITHASYNC-NEXT: global_load_dword v8, v[0:1], off +; WITHASYNC-NEXT: global_load_dword v9, v[3:4], off ; WITHASYNC-NEXT: s_mov_b64 s[4:5], 0 -; 
WITHASYNC-NEXT: s_waitcnt vmcnt(4) -; WITHASYNC-NEXT: v_mov_b32_e32 v13, v8 +; WITHASYNC-NEXT: global_load_dword v[0:1], off lds +; WITHASYNC-NEXT: ; asyncmark ; WITHASYNC-NEXT: s_waitcnt vmcnt(2) +; WITHASYNC-NEXT: v_mov_b32_e32 v13, v8 +; WITHASYNC-NEXT: s_waitcnt vmcnt(1) ; WITHASYNC-NEXT: v_mov_b32_e32 v15, v9 -; WITHASYNC-NEXT: .LBB3_1: ; %loop_body +; WITHASYNC-NEXT: .LBB4_1: ; %loop_body ; WITHASYNC-NEXT: ; =>This Inner Loop Header: Depth=1 ; WITHASYNC-NEXT: v_readfirstlane_b32 s7, v2 ; WITHASYNC-NEXT: s_waitcnt vmcnt(1) @@ -316,20 +410,29 @@ define void @test_pipelined_loop_with_global(ptr addrspace(1) %foo, ptr addrspac ; WITHASYNC-NEXT: v_mov_b32_e32 v16, v14 ; WITHASYNC-NEXT: v_mov_b32_e32 v17, v10 ; WITHASYNC-NEXT: v_mov_b32_e32 v10, v8 -; WITHASYNC-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; WITHASYNC-NEXT: v_mov_b32_e32 v14, v9 +; WITHASYNC-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; WITHASYNC-NEXT: ; asyncmark +; WITHASYNC-NEXT: ; wait_asyncmark(2) ; WITHASYNC-NEXT: s_andn2_b64 exec, exec, s[4:5] -; WITHASYNC-NEXT: s_cbranch_execnz .LBB3_1 +; WITHASYNC-NEXT: s_cbranch_execnz .LBB4_1 ; WITHASYNC-NEXT: ; %bb.2: ; %epilog ; WITHASYNC-NEXT: s_or_b64 exec, exec, s[4:5] -; WITHASYNC-NEXT: s_waitcnt vmcnt(0) ; WITHASYNC-NEXT: ds_read_b32 v0, v2 -; WITHASYNC-NEXT: v_add_u32_e32 v1, v17, v16 -; WITHASYNC-NEXT: v_add_u32_e32 v2, v13, v15 +; WITHASYNC-NEXT: ; wait_asyncmark(1) +; WITHASYNC-NEXT: s_waitcnt vmcnt(3) +; WITHASYNC-NEXT: ds_read_b32 v1, v2 +; WITHASYNC-NEXT: ; wait_asyncmark(0) +; WITHASYNC-NEXT: s_waitcnt vmcnt(0) +; WITHASYNC-NEXT: ds_read_b32 v2, v2 +; WITHASYNC-NEXT: v_add_u32_e32 v3, v17, v16 +; WITHASYNC-NEXT: s_waitcnt lgkmcnt(2) +; WITHASYNC-NEXT: v_add3_u32 v0, v3, v0, v12 +; WITHASYNC-NEXT: s_waitcnt lgkmcnt(1) +; WITHASYNC-NEXT: v_add3_u32 v0, v11, v0, v1 +; WITHASYNC-NEXT: v_add_u32_e32 v1, v13, v15 ; WITHASYNC-NEXT: s_waitcnt lgkmcnt(0) -; WITHASYNC-NEXT: v_add3_u32 v1, v1, v0, v12 -; WITHASYNC-NEXT: v_add3_u32 v1, v11, v1, v0 -; WITHASYNC-NEXT: 
v_add3_u32 v0, v2, v0, v1 +; WITHASYNC-NEXT: v_add3_u32 v0, v1, v2, v0 ; WITHASYNC-NEXT: global_store_dword v[5:6], v0, off ; WITHASYNC-NEXT: s_waitcnt vmcnt(0) ; WITHASYNC-NEXT: s_setpc_b64 s[30:31] @@ -338,20 +441,22 @@ define void @test_pipelined_loop_with_global(ptr addrspace(1) %foo, ptr addrspac ; WITHOUT: ; %bb.0: ; %prolog ; WITHOUT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; WITHOUT-NEXT: v_readfirstlane_b32 s4, v2 -; WITHOUT-NEXT: global_load_dword v10, v[0:1], off -; WITHOUT-NEXT: global_load_dword v8, v[0:1], off -; WITHOUT-NEXT: global_load_dword v14, v[3:4], off -; WITHOUT-NEXT: global_load_dword v9, v[3:4], off ; WITHOUT-NEXT: s_mov_b32 m0, s4 +; WITHOUT-NEXT: global_load_dword v10, v[0:1], off +; WITHOUT-NEXT: global_load_dword v14, v[3:4], off ; WITHOUT-NEXT: s_mov_b32 s6, 2 ; WITHOUT-NEXT: global_load_dword v[0:1], off lds -; WITHOUT-NEXT: global_load_dword v[0:1], off lds +; WITHOUT-NEXT: ; asyncmark +; WITHOUT-NEXT: global_load_dword v8, v[0:1], off +; WITHOUT-NEXT: global_load_dword v9, v[3:4], off ; WITHOUT-NEXT: s_mov_b64 s[4:5], 0 -; WITHOUT-NEXT: s_waitcnt vmcnt(4) -; WITHOUT-NEXT: v_mov_b32_e32 v13, v8 +; WITHOUT-NEXT: global_load_dword v[0:1], off lds +; WITHOUT-NEXT: ; asyncmark ; WITHOUT-NEXT: s_waitcnt vmcnt(2) +; WITHOUT-NEXT: v_mov_b32_e32 v13, v8 +; WITHOUT-NEXT: s_waitcnt vmcnt(1) ; WITHOUT-NEXT: v_mov_b32_e32 v15, v9 -; WITHOUT-NEXT: .LBB3_1: ; %loop_body +; WITHOUT-NEXT: .LBB4_1: ; %loop_body ; WITHOUT-NEXT: ; =>This Inner Loop Header: Depth=1 ; WITHOUT-NEXT: v_readfirstlane_b32 s7, v2 ; WITHOUT-NEXT: s_waitcnt vmcnt(1) @@ -366,20 +471,28 @@ define void @test_pipelined_loop_with_global(ptr addrspace(1) %foo, ptr addrspac ; WITHOUT-NEXT: v_mov_b32_e32 v16, v14 ; WITHOUT-NEXT: v_mov_b32_e32 v17, v10 ; WITHOUT-NEXT: v_mov_b32_e32 v10, v8 -; WITHOUT-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; WITHOUT-NEXT: v_mov_b32_e32 v14, v9 +; WITHOUT-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; WITHOUT-NEXT: ; asyncmark +; WITHOUT-NEXT: ; 
wait_asyncmark(2) ; WITHOUT-NEXT: s_andn2_b64 exec, exec, s[4:5] -; WITHOUT-NEXT: s_cbranch_execnz .LBB3_1 +; WITHOUT-NEXT: s_cbranch_execnz .LBB4_1 ; WITHOUT-NEXT: ; %bb.2: ; %epilog ; WITHOUT-NEXT: s_or_b64 exec, exec, s[4:5] ; WITHOUT-NEXT: s_waitcnt vmcnt(0) ; WITHOUT-NEXT: ds_read_b32 v0, v2 -; WITHOUT-NEXT: v_add_u32_e32 v1, v17, v16 -; WITHOUT-NEXT: v_add_u32_e32 v2, v13, v15 +; WITHOUT-NEXT: ; wait_asyncmark(1) +; WITHOUT-NEXT: ds_read_b32 v1, v2 +; WITHOUT-NEXT: ; wait_asyncmark(0) +; WITHOUT-NEXT: ds_read_b32 v2, v2 +; WITHOUT-NEXT: v_add_u32_e32 v3, v17, v16 +; WITHOUT-NEXT: s_waitcnt lgkmcnt(2) +; WITHOUT-NEXT: v_add3_u32 v0, v3, v0, v12 +; WITHOUT-NEXT: s_waitcnt lgkmcnt(1) +; WITHOUT-NEXT: v_add3_u32 v0, v11, v0, v1 +; WITHOUT-NEXT: v_add_u32_e32 v1, v13, v15 ; WITHOUT-NEXT: s_waitcnt lgkmcnt(0) -; WITHOUT-NEXT: v_add3_u32 v1, v1, v0, v12 -; WITHOUT-NEXT: v_add3_u32 v1, v11, v1, v0 -; WITHOUT-NEXT: v_add3_u32 v0, v2, v0, v1 +; WITHOUT-NEXT: v_add3_u32 v0, v1, v2, v0 ; WITHOUT-NEXT: global_store_dword v[5:6], v0, off ; WITHOUT-NEXT: s_waitcnt vmcnt(0) ; WITHOUT-NEXT: s_setpc_b64 s[30:31] @@ -388,11 +501,13 @@ prolog: %v0 = load i32, ptr addrspace(1) %foo %g0 = load i32, ptr addrspace(1) %bar call void @llvm.amdgcn.global.load.ASYNC.lds(ptr addrspace(1) %foo, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() ; Load second iteration %v1 = load i32, ptr addrspace(1) %foo %g1 = load i32, ptr addrspace(1) %bar call void @llvm.amdgcn.global.load.ASYNC.lds(ptr addrspace(1) %foo, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() br label %loop_body @@ -415,8 +530,10 @@ loop_body: %cur_v = load i32, ptr addrspace(1) %foo %cur_g = load i32, ptr addrspace(1) %bar call void @llvm.amdgcn.global.load.ASYNC.lds(ptr addrspace(1) %foo, ptr addrspace(3) %lds, i32 4, i32 0, i32 0) + call void @llvm.amdgcn.asyncmark() ; Wait for iteration i-2 and process + call void @llvm.amdgcn.wait.asyncmark(i16 2) %lds_idx = 
sub i32 %i, 2 %lds_val = load i32, ptr addrspace(3) %lds @@ -429,11 +546,13 @@ loop_body: epilog: ; Process remaining iterations + call void @llvm.amdgcn.wait.asyncmark(i16 1) %lds_val_n_2 = load i32, ptr addrspace(3) %lds %sum_e0 = add i32 %sum, %g1_phi %sum_e1 = add i32 %v1_phi, %sum_e0 %sum_e2 = add i32 %sum_e1, %lds_val_n_2 + call void @llvm.amdgcn.wait.asyncmark(i16 0) %lds_val_n_1 = load i32, ptr addrspace(3) %lds %sum_e3 = add i32 %cur_v, %cur_g %sum_e4 = add i32 %sum_e3, %lds_val_n_1