[AMDGPU][SIMemoryLegalizer] Combine GFX10-11 CacheControl Classes (#168058)

Also breaks the long inheritance chains by making both `SIGfx10CacheControl` and `SIGfx12CacheControl` inherit from `SICacheControl` directly. With this patch, we now just have 3 `SICacheControl` implementations that each do their own thing, and there is no more code hidden 3 superclasses above (which made this code harder to read and maintain than it needed to be).
2025-11-18 10:12:56 +01:00 · 2025-11-18 10:12:56 +01:00 · 20795e06ed
commit 20795e06ed
parent ee1abb8d80
1 changed files with 39 additions and 121 deletions
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@ -404,7 +404,7 @@ public:

 /// Generates code sequences for the memory model of all GFX targets below
 /// GFX10.
-class SIGfx6CacheControl : public SICacheControl {
+class SIGfx6CacheControl final : public SICacheControl {
 public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
@ -443,14 +443,27 @@ public:
                     Position Pos) const override;
 };

-class SIGfx10CacheControl : public SIGfx6CacheControl {
+/// Generates code sequences for the memory model of GFX10/11.
+class SIGfx10CacheControl final : public SICacheControl {
 public:
-  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
+  SIGfx10CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

+  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
+                              SIAtomicScope Scope,
+                              SIAtomicAddrSpace AddrSpace) const override {
+    return false;
+  }
+
+  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
+                            SIAtomicScope Scope,
+                            SIAtomicAddrSpace AddrSpace) const override {
+    return false;
+  }
+
  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                      bool IsVolatile, bool IsNonTemporal,
@ -463,23 +476,17 @@ public:

  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
                     SIAtomicAddrSpace AddrSpace, Position Pos) const override;
+
+  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+                     SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
+                     Position Pos) const override {
+    return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
+                      IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
+                      /*AtomicsOnly=*/false);
+  }
 };

-class SIGfx11CacheControl : public SIGfx10CacheControl {
-public:
-  SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
-
-  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
-                             SIAtomicScope Scope,
-                             SIAtomicAddrSpace AddrSpace) const override;
-
-  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
-                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
-                                      bool IsVolatile, bool IsNonTemporal,
-                                      bool IsLastUse) const override;
-};
-
-class SIGfx12CacheControl : public SIGfx11CacheControl {
+class SIGfx12CacheControl final : public SICacheControl {
 protected:
  // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
  // \returns Returns true if \p MI is modified, false otherwise.
@ -504,7 +511,7 @@ protected:
                      SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;

 public:
-  SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {
+  SIGfx12CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {
    // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases
    // the behavior is the same if assuming GFX12.0 in CU mode.
    assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled());
@ -915,10 +922,8 @@ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx6CacheControl>(ST);
-  if (Generation < AMDGPUSubtarget::GFX11)
-    return std::make_unique<SIGfx10CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX12)
-    return std::make_unique<SIGfx11CacheControl>(ST);
+    return std::make_unique<SIGfx10CacheControl>(ST);
  return std::make_unique<SIGfx12CacheControl>(ST);
 }

@ -1438,8 +1443,7 @@ bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
 }

 bool SIGfx10CacheControl::enableLoadCacheBypass(
-    const MachineBasicBlock::iterator &MI,
-    SIAtomicScope Scope,
+    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;
@ -1450,7 +1454,9 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
    case SIAtomicScope::AGENT:
      // Set the L0 and L1 cache policies to MISS_EVICT.
      // Note: there is no L2 cache coherent bypass control at the ISA level.
-      Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC);
+      // For GFX10, set GLC+DLC, for GFX11, only set GLC.
+      Changed |=
+          enableCPolBits(MI, CPol::GLC | (AMDGPU::isGFX10(ST) ? CPol::DLC : 0));
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
@ -1504,6 +1510,10 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
      Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC);
    }

+    // GFX11: Set MALL NOALLOC for both load and store instructions.
+    if (AMDGPU::isGFX11(ST))
+      Changed |= enableCPolBits(MI, CPol::DLC);
+
    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
@ -1524,6 +1534,10 @@ bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
      Changed |= enableCPolBits(MI, CPol::GLC);
    Changed |= enableCPolBits(MI, CPol::SLC);

+    // GFX11: Set MALL NOALLOC for both load and store instructions.
+    if (AMDGPU::isGFX11(ST))
+      Changed |= enableCPolBits(MI, CPol::DLC);
+
    return Changed;
  }

@ -1722,102 +1736,6 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
  return Changed;
 }

-bool SIGfx11CacheControl::enableLoadCacheBypass(
-    const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
-    SIAtomicAddrSpace AddrSpace) const {
-  assert(MI->mayLoad() && !MI->mayStore());
-  bool Changed = false;
-
-  if (canAffectGlobalAddrSpace(AddrSpace)) {
-    switch (Scope) {
-    case SIAtomicScope::SYSTEM:
-    case SIAtomicScope::AGENT:
-      // Set the L0 and L1 cache policies to MISS_EVICT.
-      // Note: there is no L2 cache coherent bypass control at the ISA level.
-      Changed |= enableCPolBits(MI, CPol::GLC);
-      break;
-    case SIAtomicScope::WORKGROUP:
-      // In WGP mode the waves of a work-group can be executing on either CU of
-      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
-      // CU mode all waves of a work-group are on the same CU, and so the L0
-      // does not need to be bypassed.
-      if (!ST.isCuModeEnabled())
-        Changed |= enableCPolBits(MI, CPol::GLC);
-      break;
-    case SIAtomicScope::WAVEFRONT:
-    case SIAtomicScope::SINGLETHREAD:
-      // No cache to bypass.
-      break;
-    default:
-      llvm_unreachable("Unsupported synchronization scope");
-    }
-  }
-
-  /// The scratch address space does not need the global memory caches
-  /// to be bypassed as all memory operations by the same thread are
-  /// sequentially consistent, and no other thread can access scratch
-  /// memory.
-
-  /// Other address spaces do not have a cache.
-
-  return Changed;
-}
-
-bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
-    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
-    bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
-
-  // Only handle load and store, not atomic read-modify-write insructions. The
-  // latter use glc to indicate if the atomic returns a result and so must not
-  // be used for cache control.
-  assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
-
-  // Only update load and store, not LLVM IR atomic read-modify-write
-  // instructions. The latter are always marked as volatile so cannot sensibly
-  // handle it as do not want to pessimize all atomics. Also they do not support
-  // the nontemporal attribute.
-  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
-
-  bool Changed = false;
-
-  if (IsVolatile) {
-    // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
-    // and MISS_LRU for store instructions.
-    // Note: there is no L2 cache coherent bypass control at the ISA level.
-    if (Op == SIMemOp::LOAD)
-      Changed |= enableCPolBits(MI, CPol::GLC);
-
-    // Set MALL NOALLOC for load and store instructions.
-    Changed |= enableCPolBits(MI, CPol::DLC);
-
-    // Ensure operation has completed at system scope to cause all volatile
-    // operations to be visible outside the program in a global order. Do not
-    // request cross address space as only the global address space can be
-    // observable outside the program, so no need to cause a waitcnt for LDS
-    // address space operations.
-    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
-                          Position::AFTER, AtomicOrdering::Unordered,
-                          /*AtomicsOnly=*/false);
-    return Changed;
-  }
-
-  if (IsNonTemporal) {
-    // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
-    // and L2 cache policy to STREAM.
-    // For stores setting both GLC and SLC configures L0 and L1 cache policy
-    // to MISS_EVICT and the L2 cache policy to STREAM.
-    if (Op == SIMemOp::STORE)
-      Changed |= enableCPolBits(MI, CPol::GLC);
-    Changed |= enableCPolBits(MI, CPol::SLC);
-
-    // Set MALL NOALLOC for load and store instructions.
-    Changed |= enableCPolBits(MI, CPol::DLC);
-    return Changed;
-  }
-
-  return Changed;
-}
-
 bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
                                AMDGPU::CPol::CPol Value) const {
  MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);