[AMDGPU] Extend DS loop wait optimization with flush point tracking (#175658)

Add support for prefetch patterns where some DS loads are used in the same iteration (creating flush points) while others remain unflushed at the backedge. This complements the existing pure prefetch optimization (PR172728) by handling cases where partial same-iteration consumption occurs. Assisted-by: Cursor / claude-4.5-opus-high
2026-03-02 09:19:46 -08:00 · 2026-03-02 09:19:46 -08:00 · 9d1fd9ec1e
commit 9d1fd9ec1e
parent 447eba88c8
3 changed files with 89 additions and 36 deletions
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@ -3397,15 +3397,11 @@ bool SIInsertWaitcnts::isDSRead(const MachineInstr &MI) const {
 // Check if instruction is a store to LDS that is counted via DSCNT
 // (where that counter exists).
 bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(const MachineInstr &MI) const {
-  if (!MI.mayStore())
-    return false;
-  if (SIInstrInfo::isDS(MI))
-    return true;
-  return false;
+  return MI.mayStore() && SIInstrInfo::isDS(MI);
 }

 // Return flags indicating which counters should be flushed in the preheader of
-// the given loop. We currently decide to flush in a few situations:
+// the given loop. We currently decide to flush in the following situations:
 // For VMEM (FlushVmCnt):
 // 1. The loop contains vmem store(s), no vmem load and at least one use of a
 //    vgpr containing a value that is loaded outside of the loop. (Only on
@ -3415,27 +3411,46 @@ bool SIInsertWaitcnts::mayStoreIncrementingDSCNT(const MachineInstr &MI) const {
 //    outside of the loop.
 // For DS (FlushDsCnt, GFX12+ only):
 // 3. The loop contains no DS reads, and at least one use of a vgpr containing
-//    a value that is DS loaded outside of the loop.
+//    a value that is DS read outside of the loop.
 // 4. The loop contains DS read(s), loaded values are not used in the same
 //    iteration but in the next iteration (prefetch pattern), and at least one
-//    use of a vgpr containing a value that is DS loaded outside of the loop.
+//    use of a vgpr containing a value that is DS read outside of the loop.
 //    Flushing in preheader reduces wait overhead if the wait requirement in
-//    iteration 1 would otherwise be more strict.
+//    iteration 1 would otherwise be more strict (but unfortunately preheader
+//    flush decision is taken before knowing that).
+// 5. (Single-block loops only) The loop has DS prefetch reads with flush point
+//    tracking. Some DS reads may be used in the same iteration (creating
+//    "flush points"), but others remain unflushed at the backedge. When a DS
+//    read is consumed in the same iteration, it and all prior reads are
+//    "flushed" (FIFO order). No DS writes are allowed in the loop.
+//    TODO: Find a way to extend to multi-block loops.
 PreheaderFlushFlags
 SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
                                         const WaitcntBrackets &Brackets) {
  PreheaderFlushFlags Flags;
  bool HasVMemLoad = false;
  bool HasVMemStore = false;
-  bool UsesVgprLoadedOutsideVMEM = false;
-  bool UsesVgprLoadedOutsideDS = false;
+  bool UsesVgprVMEMLoadedOutside = false;
+  bool UsesVgprDSReadOutside = false;
  bool VMemInvalidated = false;
  // DS optimization only applies to GFX12+ where DS_CNT is separate.
-  bool DSInvalidated = !ST->hasExtendedWaitCounts();
+  // Tracking status for "no DS read in loop" or "pure DS prefetch
+  // (use only in next iteration)".
+  bool TrackSimpleDSOpt = ST->hasExtendedWaitCounts();
  DenseSet<MCRegUnit> VgprUse;
  DenseSet<MCRegUnit> VgprDefVMEM;
  DenseSet<MCRegUnit> VgprDefDS;

+  // Track DS reads for prefetch pattern with flush points (single-block only).
+  // Keeps track of the last DS read (position counted from the top of the loop)
+  // to each VGPR. Read is considered consumed (and thus needs flushing) if
+  // the dest register has a use or is overwritten (by any later opertions).
+  DenseMap<MCRegUnit, unsigned> LastDSReadPositionMap;
+  unsigned DSReadPosition = 0;
+  bool IsSingleBlock = ML->getNumBlocks() == 1;
+  bool TrackDSFlushPoint = ST->hasExtendedWaitCounts() && IsSingleBlock;
+  unsigned LastDSFlushPosition = 0;
+
  for (MachineBasicBlock *MBB : ML->blocks()) {
    for (MachineInstr &MI : *MBB) {
      if (isVMEMOrFlatVMEM(MI)) {
@ -3445,12 +3460,30 @@ SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
      // TODO: Can we relax DSStore check? There may be cases where
      // these DS stores are drained prior to the end of MBB (or loop).
      if (mayStoreIncrementingDSCNT(MI)) {
-        // Early exit if both optimizations are invalidated.
-        // Otherwise, set invalid status and continue.
+        // Early exit if none of the optimizations are feasible.
+        // Otherwise, set tracking status appropriately and continue.
        if (VMemInvalidated)
          return Flags;
-        DSInvalidated = true;
+        TrackSimpleDSOpt = false;
+        TrackDSFlushPoint = false;
      }
+      bool IsDSRead = isDSRead(MI);
+      if (IsDSRead)
+        ++DSReadPosition;
+
+      // Helper: if RU has a pending DS read, update LastDSFlushPosition
+      auto updateDSReadFlushTracking = [&](MCRegUnit RU) {
+        if (!TrackDSFlushPoint)
+          return;
+        if (auto It = LastDSReadPositionMap.find(RU);
+            It != LastDSReadPositionMap.end()) {
+          // RU defined by DSRead is used or overwritten. Need to complete
+          // the read, if not already implied by a later DSRead (to any RU)
+          // needing to complete in FIFO order.
+          LastDSFlushPosition = std::max(LastDSFlushPosition, It->second);
+        }
+      };
+
      for (const MachineOperand &Op : MI.all_uses()) {
        if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg()))
          continue;
@ -3461,14 +3494,17 @@ SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
          if (VgprDefVMEM.contains(RU))
            VMemInvalidated = true;

-          // Check for DS loads used inside the loop
+          // Check for DS reads used inside the loop
          if (VgprDefDS.contains(RU))
-            DSInvalidated = true;
+            TrackSimpleDSOpt = false;

-          // Early exit if both optimizations are invalidated
-          if (VMemInvalidated && DSInvalidated)
+          // Early exit if all optimizations are invalidated
+          if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
            return Flags;

+          // Check for flush points (DS read used in same iteration)
+          updateDSReadFlushTracking(RU);
+
          VgprUse.insert(RU);
          // Check if this register has a pending VMEM load from outside the
          // loop (value loaded outside and used inside).
@ -3476,12 +3512,12 @@ SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
          if (Brackets.hasPendingVMEM(ID, LOAD_CNT) ||
              Brackets.hasPendingVMEM(ID, SAMPLE_CNT) ||
              Brackets.hasPendingVMEM(ID, BVH_CNT))
-            UsesVgprLoadedOutsideVMEM = true;
+            UsesVgprVMEMLoadedOutside = true;
          // Check if loaded outside the loop via DS (not VMEM/FLAT).
-          // Only consider it a DS load if there's no pending VMEM load for
+          // Only consider it a DS read if there's no pending VMEM load for
          // this register, since FLAT can set both counters.
          else if (Brackets.hasPendingVMEM(ID, DS_CNT))
-            UsesVgprLoadedOutsideDS = true;
+            UsesVgprDSReadOutside = true;
        }
      }

@ -3496,22 +3532,31 @@ SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
            VgprDefVMEM.insert(RU);
          }
        }
-        // Early exit if both optimizations are invalidated
-        if (VMemInvalidated && DSInvalidated)
+        // Early exit if all optimizations are invalidated
+        if (VMemInvalidated && !TrackSimpleDSOpt && !TrackDSFlushPoint)
          return Flags;
      }

      // DS read vgpr def
      // Note: Unlike VMEM, we DON'T invalidate when VgprUse.contains(RegNo).
      // If USE comes before DEF, it's the prefetch pattern (use value from
-      // previous iteration, load for next iteration). We should still flush
+      // previous iteration, read for next iteration). We should still flush
      // in preheader so iteration 1 doesn't need to wait inside the loop.
      // Only invalidate when DEF comes before USE (same-iteration consumption,
      // checked above when processing uses).
-      if (isDSRead(MI)) {
+      if (IsDSRead || TrackDSFlushPoint) {
        for (const MachineOperand &Op : MI.all_defs()) {
+          if (!TRI->isVectorRegister(*MRI, Op.getReg()))
+            continue;
          for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
-            VgprDefDS.insert(RU);
+            // Check for overwrite of pending DS read (flush point) by any
+            // instruction
+            updateDSReadFlushTracking(RU);
+            if (IsDSRead) {
+              VgprDefDS.insert(RU);
+              if (TrackDSFlushPoint)
+                LastDSReadPositionMap[RU] = DSReadPosition;
+            }
          }
        }
      }
@ -3519,17 +3564,23 @@ SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
  }

  // VMEM flush decision
-  if (!VMemInvalidated && UsesVgprLoadedOutsideVMEM &&
+  if (!VMemInvalidated && UsesVgprVMEMLoadedOutside &&
      ((!ST->hasVscnt() && HasVMemStore && !HasVMemLoad) ||
       (HasVMemLoad && ST->hasVmemWriteVgprInOrder())))
    Flags.FlushVmCnt = true;

-  // DS flush decision: flush if loop uses DS-loaded values from outside
+  // DS flush decision:
+  // Simple DS Opt: flush if loop uses DS read values from outside
  // and either has no DS reads in the loop, or DS reads whose results
  // are not used in the loop.
-  // DSInvalidated is pre-set to true on non-GFX12+ targets where DS_CNT
-  // is LGKM_CNT which also tracks FLAT/SMEM.
-  if (!DSInvalidated && UsesVgprLoadedOutsideDS)
+  bool SimpleDSOpt = TrackSimpleDSOpt && UsesVgprDSReadOutside;
+  // Prefetch with flush points: some DS reads used in same iteration,
+  // but unflushed reads remain at backedge
+  bool HasUnflushedDSReads = DSReadPosition > LastDSFlushPosition;
+  bool DSFlushPointPrefetch =
+      TrackDSFlushPoint && UsesVgprDSReadOutside && HasUnflushedDSReads;
+
+  if (SimpleDSOpt || DSFlushPointPrefetch)
    Flags.FlushDsCnt = true;

  return Flags;
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-prefetch-flushed.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-prefetch-flushed.ll
@ -2,7 +2,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -o - %s | FileCheck %s
 ; Test for DS prefetch with flush points: preheader has single ds_load_b64 (2xf32).
 ; Loop has DS loads where some are used in same iteration, others are prefetches.
-; Expected: s_wait_dscnt at loop entry, not in preheader
+; Expected: s_wait_dscnt 0 in preheader (preheader flush optimization)

 define amdgpu_kernel void @ds_prefetch_flushed(ptr addrspace(3) %lds, ptr addrspace(1) %out, i32 %n) {
 ; CHECK-LABEL: ds_prefetch_flushed:
@ -25,17 +25,19 @@ define amdgpu_kernel void @ds_prefetch_flushed(ptr addrspace(3) %lds, ptr addrsp
 ; CHECK-NEXT:    s_mov_b32 s1, 0
 ; CHECK-NEXT:    ds_load_b64 v[0:1], v11 offset:4
 ; CHECK-NEXT:    ds_load_b64 v[2:3], v12
+; CHECK-NEXT:    s_wait_dscnt 0x0
 ; CHECK-NEXT:  .LBB0_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    s_barrier_signal -1
-; CHECK-NEXT:    s_wait_dscnt 0x0
 ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; CHECK-NEXT:    v_pk_add_f32 v[8:9], v[8:9], v[2:3]
 ; CHECK-NEXT:    s_add_co_i32 s1, s1, 1
 ; CHECK-NEXT:    s_cmp_lt_i32 s1, s0
-; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; CHECK-NEXT:    v_pk_add_f32 v[8:9], v[8:9], v[0:1]
+; CHECK-NEXT:    s_wait_dscnt 0x1
 ; CHECK-NEXT:    v_pk_add_f32 v[6:7], v[8:9], v[6:7]
+; CHECK-NEXT:    s_wait_dscnt 0x0
 ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; CHECK-NEXT:    v_pk_add_f32 v[8:9], v[6:7], v[4:5]
 ; CHECK-NEXT:    s_barrier_wait -1
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-prefetch-flushed.mir
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-loop-ds-prefetch-flushed.mir
@ -19,13 +19,13 @@ body: |
  ; CHECK-NEXT:   $vgpr10_vgpr11_vgpr12_vgpr13 = DS_READ_B128 $vgpr0, 0, 0, implicit $m0, implicit $exec
  ; CHECK-NEXT:   $vgpr28 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
  ; CHECK-NEXT:   $vgpr32 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec
+  ; CHECK-NEXT:   S_WAIT_DSCNT 0
  ; CHECK-NEXT:   S_BRANCH %bb.1
  ; CHECK-NEXT: {{  $}}
  ; CHECK-NEXT: bb.1:
  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
  ; CHECK-NEXT:   liveins: $sgpr0, $vgpr0, $vgpr10_vgpr11_vgpr12_vgpr13, $vgpr28, $vgpr32
  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   S_WAIT_DSCNT 2
  ; CHECK-NEXT:   $vgpr50 = V_ADD_F32_e32 $vgpr10, $vgpr11, implicit $mode, implicit $exec
  ; CHECK-NEXT:   S_WAIT_DSCNT 1
  ; CHECK-NEXT:   $vgpr51 = V_ADD_F32_e32 $vgpr28, $vgpr28, implicit $mode, implicit $exec