diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 78430438ea9f..c1f778efa3d8 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -3397,15 +3397,11 @@ bool SIInsertWaitcnts::isDSRead(const MachineInstr &MI) const { // Check if instruction is a store to LDS that is counted via DSCNT // (where that counter exists). bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(const MachineInstr &MI) const { - if (!MI.mayStore()) - return false; - if (SIInstrInfo::isDS(MI)) - return true; - return false; + return MI.mayStore() && SIInstrInfo::isDS(MI); } // Return flags indicating which counters should be flushed in the preheader of -// the given loop. We currently decide to flush in a few situations: +// the given loop. We currently decide to flush in the following situations: // For VMEM (FlushVmCnt): // 1. The loop contains vmem store(s), no vmem load and at least one use of a // vgpr containing a value that is loaded outside of the loop. (Only on @@ -3415,27 +3411,46 @@ bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(const MachineInstr &MI) const { // outside of the loop. // For DS (FlushDsCnt, GFX12+ only): // 3. The loop contains no DS reads, and at least one use of a vgpr containing -// a value that is DS loaded outside of the loop. +// a value that is DS read outside of the loop. // 4. The loop contains DS read(s), loaded values are not used in the same // iteration but in the next iteration (prefetch pattern), and at least one -// use of a vgpr containing a value that is DS loaded outside of the loop. +// use of a vgpr containing a value that is DS read outside of the loop. // Flushing in preheader reduces wait overhead if the wait requirement in -// iteration 1 would otherwise be more strict. +// iteration 1 would otherwise be more strict (but unfortunately preheader +// flush decision is taken before knowing that). +// 5. 
(Single-block loops only) The loop has DS prefetch reads with flush point +// tracking. Some DS reads may be used in the same iteration (creating +// "flush points"), but others remain unflushed at the backedge. When a DS +// read is consumed in the same iteration, it and all prior reads are +// "flushed" (FIFO order). No DS writes are allowed in the loop. +// TODO: Find a way to extend to multi-block loops. PreheaderFlushFlags SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML, const WaitcntBrackets &Brackets) { PreheaderFlushFlags Flags; bool HasVMemLoad = false; bool HasVMemStore = false; - bool UsesVgprLoadedOutsideVMEM = false; - bool UsesVgprLoadedOutsideDS = false; + bool UsesVgprVMEMLoadedOutside = false; + bool UsesVgprDSReadOutside = false; bool VMemInvalidated = false; // DS optimization only applies to GFX12+ where DS_CNT is separate. - bool DSInvalidated = !ST->hasExtendedWaitCounts(); + // Tracking status for "no DS read in loop" or "pure DS prefetch + // (use only in next iteration)". + bool TrackSimpleDSOpt = ST->hasExtendedWaitCounts(); DenseSet VgprUse; DenseSet VgprDefVMEM; DenseSet VgprDefDS; + // Track DS reads for prefetch pattern with flush points (single-block only). + // Keeps track of the last DS read (position counted from the top of the loop) + // to each VGPR. Read is considered consumed (and thus needs flushing) if + // the dest register has a use or is overwritten (by any later operation). + DenseMap LastDSReadPositionMap; + unsigned DSReadPosition = 0; + bool IsSingleBlock = ML->getNumBlocks() == 1; + bool TrackDSFlushPoint = ST->hasExtendedWaitCounts() && IsSingleBlock; + unsigned LastDSFlushPosition = 0; + for (MachineBasicBlock *MBB : ML->blocks()) { for (MachineInstr &MI : *MBB) { if (isVMEMOrFlatVMEM(MI)) { @@ -3445,12 +3460,30 @@ SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML, // TODO: Can we relax DSStore check? There may be cases where // these DS stores are drained prior to the end of MBB (or loop). 
if (mayStoreIncrementingDSCNT(MI)) { - // Early exit if both optimizations are invalidated. - // Otherwise, set invalid status and continue. + // Early exit if none of the optimizations are feasible. + // Otherwise, set tracking status appropriately and continue. if (VMemInvalidated) return Flags; - DSInvalidated = true; + TrackSimpleDSOpt = false; + TrackDSFlushPoint = false; } + bool IsDSRead = isDSRead(MI); + if (IsDSRead) + ++DSReadPosition; + + // Helper: if RU has a pending DS read, update LastDSFlushPosition + auto updateDSReadFlushTracking = [&](MCRegUnit RU) { + if (!TrackDSFlushPoint) + return; + if (auto It = LastDSReadPositionMap.find(RU); + It != LastDSReadPositionMap.end()) { + // RU defined by DSRead is used or overwritten. Need to complete + // the read, if not already implied by a later DSRead (to any RU) + // needing to complete in FIFO order. + LastDSFlushPosition = std::max(LastDSFlushPosition, It->second); + } + }; + for (const MachineOperand &Op : MI.all_uses()) { if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg())) continue; @@ -3461,14 +3494,17 @@ SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML, if (VgprDefVMEM.contains(RU)) VMemInvalidated = true; - // Check for DS loads used inside the loop + // Check for DS reads used inside the loop if (VgprDefDS.contains(RU)) - DSInvalidated = true; + TrackSimpleDSOpt = false; - // Early exit if both optimizations are invalidated - if (VMemInvalidated && DSInvalidated) + // Early exit if all optimizations are invalidated + if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint) return Flags; + // Check for flush points (DS read used in same iteration) + updateDSReadFlushTracking(RU); + VgprUse.insert(RU); // Check if this register has a pending VMEM load from outside the // loop (value loaded outside and used inside). 
@@ -3476,12 +3512,12 @@ SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML, if (Brackets.hasPendingVMEM(ID, LOAD_CNT) || Brackets.hasPendingVMEM(ID, SAMPLE_CNT) || Brackets.hasPendingVMEM(ID, BVH_CNT)) - UsesVgprLoadedOutsideVMEM = true; + UsesVgprVMEMLoadedOutside = true; // Check if loaded outside the loop via DS (not VMEM/FLAT). - // Only consider it a DS load if there's no pending VMEM load for + // Only consider it a DS read if there's no pending VMEM load for // this register, since FLAT can set both counters. else if (Brackets.hasPendingVMEM(ID, DS_CNT)) - UsesVgprLoadedOutsideDS = true; + UsesVgprDSReadOutside = true; } } @@ -3496,22 +3532,31 @@ SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML, VgprDefVMEM.insert(RU); } } - // Early exit if both optimizations are invalidated - if (VMemInvalidated && DSInvalidated) + // Early exit if all optimizations are invalidated + if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint) return Flags; } // DS read vgpr def // Note: Unlike VMEM, we DON'T invalidate when VgprUse.contains(RegNo). // If USE comes before DEF, it's the prefetch pattern (use value from - // previous iteration, load for next iteration). We should still flush + // previous iteration, read for next iteration). We should still flush // in preheader so iteration 1 doesn't need to wait inside the loop. // Only invalidate when DEF comes before USE (same-iteration consumption, // checked above when processing uses). 
- if (isDSRead(MI)) { + if (IsDSRead || TrackDSFlushPoint) { for (const MachineOperand &Op : MI.all_defs()) { + if (!TRI->isVectorRegister(*MRI, Op.getReg())) + continue; for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) { - VgprDefDS.insert(RU); + // Check for overwrite of pending DS read (flush point) by any + // instruction + updateDSReadFlushTracking(RU); + if (IsDSRead) { + VgprDefDS.insert(RU); + if (TrackDSFlushPoint) + LastDSReadPositionMap[RU] = DSReadPosition; + } } } } @@ -3519,17 +3564,23 @@ SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML, } // VMEM flush decision - if (!VMemInvalidated && UsesVgprLoadedOutsideVMEM && + if (!VMemInvalidated && UsesVgprVMEMLoadedOutside && ((!ST->hasVscnt() && HasVMemStore && !HasVMemLoad) || (HasVMemLoad && ST->hasVmemWriteVgprInOrder()))) Flags.FlushVmCnt = true; - // DS flush decision: flush if loop uses DS-loaded values from outside + // DS flush decision: + // Simple DS Opt: flush if loop uses DS read values from outside // and either has no DS reads in the loop, or DS reads whose results // are not used in the loop. - // DSInvalidated is pre-set to true on non-GFX12+ targets where DS_CNT - // is LGKM_CNT which also tracks FLAT/SMEM. 
- if (!DSInvalidated && UsesVgprLoadedOutsideDS) + bool SimpleDSOpt = TrackSimpleDSOpt && UsesVgprDSReadOutside; + // Prefetch with flush points: some DS reads used in same iteration, + // but unflushed reads remain at backedge + bool HasUnflushedDSReads = DSReadPosition > LastDSFlushPosition; + bool DSFlushPointPrefetch = + TrackDSFlushPoint && UsesVgprDSReadOutside && HasUnflushedDSReads; + + if (SimpleDSOpt || DSFlushPointPrefetch) Flags.FlushDsCnt = true; return Flags; diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-prefetch-flushed.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-prefetch-flushed.ll index 043ddbea98a4..c24fbcdb9e84 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-prefetch-flushed.ll +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-prefetch-flushed.ll @@ -2,7 +2,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -o - %s | FileCheck %s ; Test for DS prefetch with flush points: preheader has single ds_load_b64 (2xf32). ; Loop has DS loads where some are used in same iteration, others are prefetches. 
-; Expected: s_wait_dscnt at loop entry, not in preheader +; Expected: s_wait_dscnt 0 in preheader (preheader flush optimization) define amdgpu_kernel void @ds_prefetch_flushed(ptr addrspace(3) %lds, ptr addrspace(1) %out, i32 %n) { ; CHECK-LABEL: ds_prefetch_flushed: @@ -25,17 +25,19 @@ define amdgpu_kernel void @ds_prefetch_flushed(ptr addrspace(3) %lds, ptr addrsp ; CHECK-NEXT: s_mov_b32 s1, 0 ; CHECK-NEXT: ds_load_b64 v[0:1], v11 offset:4 ; CHECK-NEXT: ds_load_b64 v[2:3], v12 +; CHECK-NEXT: s_wait_dscnt 0x0 ; CHECK-NEXT: .LBB0_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_barrier_signal -1 -; CHECK-NEXT: s_wait_dscnt 0x0 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; CHECK-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[2:3] ; CHECK-NEXT: s_add_co_i32 s1, s1, 1 ; CHECK-NEXT: s_cmp_lt_i32 s1, s0 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; CHECK-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[0:1] +; CHECK-NEXT: s_wait_dscnt 0x1 ; CHECK-NEXT: v_pk_add_f32 v[6:7], v[8:9], v[6:7] +; CHECK-NEXT: s_wait_dscnt 0x0 ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) ; CHECK-NEXT: v_pk_add_f32 v[8:9], v[6:7], v[4:5] ; CHECK-NEXT: s_barrier_wait -1 diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-prefetch-flushed.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-prefetch-flushed.mir index 7811d3303535..39068b42fcf3 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-prefetch-flushed.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-prefetch-flushed.mir @@ -19,13 +19,13 @@ body: | ; CHECK-NEXT: $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec ; CHECK-NEXT: $vgpr28 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec ; CHECK-NEXT: $vgpr32 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec + ; CHECK-NEXT: S_WAIT_DSCNT 0 ; CHECK-NEXT: S_BRANCH 
%bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: liveins: $sgpr0, $vgpr0, $vgpr10_vgpr11_vgpr12_vgpr13, $vgpr28, $vgpr32 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_WAIT_DSCNT 2 ; CHECK-NEXT: $vgpr50 = V_ADD_F32_e32 $vgpr10, $vgpr11, implicit $mode, implicit $exec ; CHECK-NEXT: S_WAIT_DSCNT 1 ; CHECK-NEXT: $vgpr51 = V_ADD_F32_e32 $vgpr28, $vgpr28, implicit $mode, implicit $exec