[AMDGPU][SIInsertWaitcnts][NFC] Move soft xcnt deletion to separate function (#181760)

This patch simplifies the logic of `insertWaitcntInBlock()` by moving the code that removes the redundant soft xcnt instructions to a new function: `removeRedundantSoftXcnts()`. While doing so, this patch also cleans up the logic a bit by dropping `AtomicRMWState` and the corresponding functions. This helps in several ways:
- `insertWaitcntInBlock()` will now do what its name suggests, i.e., only insert and not remove.
- It makes it clear that removal of soft xcnts is orthogonal to insertion of waitcnts.
- We won't have to worry about both erased and new instructions in `insertWaitcntInBlock()`'s loop.
The change should be NFC.
2026-02-17 09:24:08 -08:00 · 2026-02-17 09:24:08 -08:00 · 5addddf8f1
commit 5addddf8f1
parent 60e50a4729
1 changed files with 42 additions and 72 deletions
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@ -99,21 +99,6 @@ static unsigned getWaitCountMax(const AMDGPU::HardwareLimits &Limits,
  }
 }

-static bool isSoftXcnt(MachineInstr &MI) {
-  return MI.getOpcode() == AMDGPU::S_WAIT_XCNT_soft;
-}
-
-static bool isAtomicRMW(MachineInstr &MI) {
-  return (MI.getDesc().TSFlags & SIInstrFlags::maybeAtomic) && MI.mayLoad() &&
-         MI.mayStore();
-}
-
-enum class AtomicRMWState {
-  NewBlock,    // Start of a new atomic RMW block
-  InsideBlock, // Middle of an existing block
-  NotInBlock   // Not in an atomic RMW block
-};
-
 /// Integer IDs used to track vector memory locations we may have to wait on.
 /// Encoded as u16 chunks:
 ///
@ -676,10 +661,11 @@ public:
                             WaitcntBrackets &ScoreBrackets);
  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
                            WaitcntBrackets &ScoreBrackets);
+  /// Removes redundant Soft Xcnt Waitcnts in \p Block emitted by the Memory
+  /// Legalizer. Returns true if block was modified.
+  bool removeRedundantSoftXcnts(MachineBasicBlock &Block);
  void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                         bool ExpertMode) const;
-  AtomicRMWState getAtomicRMWState(MachineInstr &MI,
-                                   AtomicRMWState PrevState) const;
  const WaitEventSet &getWaitEvents(InstCounterType T) const {
    return WCG->getWaitEvents(T);
  }
@ -3126,39 +3112,6 @@ void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
      .addImm(EncodedReg);
 }

-// Track back-to-back atomic RMW instructions, referred to as a block.
-//
-// Determines whether \p MI starts a new atomic RMW block, is inside
-// an existing block, or is outside of a block. A block is broken when a
-// CU-scoped memory op or an atomic store is encountered. ALU ops
-// and non-memory instructions don't break a block. The function returns
-// the new state after processing the current instruction based on
-// \p PrevState, the previously captured state.
-AtomicRMWState
-SIInsertWaitcnts::getAtomicRMWState(MachineInstr &MI,
-                                    AtomicRMWState PrevState) const {
-  if (isAtomicRMW(MI)) {
-    // Transition from NotInBlock -> NewBlock -> InsideBlock.
-    if (PrevState == AtomicRMWState::NotInBlock)
-      return AtomicRMWState::NewBlock;
-    if (PrevState == AtomicRMWState::NewBlock)
-      return AtomicRMWState::InsideBlock;
-
-    return PrevState;
-  }
-
-  // LDS memory operations don't break the block.
-  if (TII->isDS(MI) || (TII->isFLAT(MI) && TII->mayAccessLDSThroughFlat(MI)))
-    return PrevState;
-
-  // Reset the atomic RMW block state when found other VMEM and SMEM operations.
-  if (MI.mayLoad() ^ MI.mayStore())
-    return AtomicRMWState::NotInBlock;
-
-  // Return the previous state otherwise.
-  return PrevState;
-}
-
 // Generate s_waitcnt instructions where needed.
 bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block,
@ -3187,7 +3140,6 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,

  // Walk over the instructions.
  MachineInstr *OldWaitcntInstr = nullptr;
-  AtomicRMWState RMWState = AtomicRMWState::NotInBlock;

  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
                                         E = Block.instr_end();
@ -3197,32 +3149,13 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
      ++Iter;
      continue;
    }
-    // Get the atomic RMW block state for current instruction.
-    RMWState = getAtomicRMWState(Inst, RMWState);
-
    // Track pre-existing waitcnts that were added in earlier iterations or by
    // the memory legalizer.
    if (isWaitInstr(Inst) ||
        (IsExpertMode && Inst.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR)) {
-      ++Iter;
-      bool IsSoftXcnt = isSoftXcnt(Inst);
-      // The Memory Legalizer conservatively inserts a soft xcnt before each
-      // atomic RMW operation. However, for sequences of back-to-back atomic
-      // RMWs, only the first s_wait_xcnt insertion is necessary. Optimize away
-      // the redundant soft xcnts when we're inside an atomic RMW block.
-      if (Iter != E && IsSoftXcnt) {
-        // Check if the next instruction can potentially change the atomic RMW
-        // state.
-        RMWState = getAtomicRMWState(*Iter, RMWState);
-      }
-
-      if (IsSoftXcnt && RMWState == AtomicRMWState::InsideBlock) {
-        // Delete this soft xcnt.
-        Inst.eraseFromParent();
-        Modified = true;
-      } else if (!OldWaitcntInstr) {
+      if (!OldWaitcntInstr)
        OldWaitcntInstr = &Inst;
-      }
+      ++Iter;
      continue;
    }

@ -3344,6 +3277,41 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
  return Modified;
 }

+bool SIInsertWaitcnts::removeRedundantSoftXcnts(MachineBasicBlock &Block) {
+  if (Block.size() <= 1)
+    return false;
+  // The Memory Legalizer conservatively inserts a soft xcnt before each
+  // atomic RMW operation. However, for sequences of back-to-back atomic
+  // RMWs, only the first s_wait_xcnt insertion is necessary. Optimize away
+  // the redundant soft xcnts.
+  bool Modified = false;
+  // Most recent atomic RMW that had a soft xcnt immediately before it.
+  MachineInstr *LastAtomicWithSoftXcnt = nullptr;
+
+  for (MachineInstr &MI : drop_begin(Block)) {
+    // A non-LDS instruction that only loads or only stores ends the sequence.
+    bool IsLDS =
+        TII->isDS(MI) || (TII->isFLAT(MI) && TII->mayAccessLDSThroughFlat(MI));
+    if (!IsLDS && (MI.mayLoad() ^ MI.mayStore()))
+      LastAtomicWithSoftXcnt = nullptr;
+
+    bool IsAtomicRMW = (MI.getDesc().TSFlags & SIInstrFlags::maybeAtomic) &&
+                       MI.mayLoad() && MI.mayStore();
+    MachineInstr &PrevMI = *MI.getPrevNode();
+    // MI is an atomic RMW whose immediate predecessor is a soft xcnt.
+    if (PrevMI.getOpcode() == AMDGPU::S_WAIT_XCNT_soft && IsAtomicRMW) {
+      // An earlier atomic RMW in this sequence already has a soft xcnt, so
+      // the one in front of MI is redundant; erase it.
+      if (LastAtomicWithSoftXcnt) {
+        PrevMI.eraseFromParent();
+        Modified = true;
+      }
+      LastAtomicWithSoftXcnt = &MI;
+    }
+  }
+  return Modified;
+}
+
 // Return flags indicating which counters should be flushed in the preheader.
 PreheaderFlushFlags
 SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
@ -3668,6 +3636,8 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
        }
      }

+      if (ST->hasWaitXcnt())
+        Modified |= removeRedundantSoftXcnts(*MBB);
      Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
      BI.Dirty = false;