[AMDGPU] Mark address space cast from private to flat as divergent if target supports globally addressable scratch (#152376)

Globally addressable scratch is a new feature introduced in gfx1250. However, this feature changes how scratch space is mapped into the flat aperture, making address space casts from private to flat no longer uniform.
2025-08-06 17:08:56 -04:00 · 2025-08-06 17:08:56 -04:00 · 351b38f266
commit 351b38f266
parent 381623eb11
4 changed files with 97 additions and 3 deletions
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@ -991,10 +991,21 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
    return true;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
-    if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
+    Intrinsic::ID IID = Intrinsic->getIntrinsicID();
+    switch (IID) {
+    case Intrinsic::read_register:
      return isReadRegisterSourceOfDivergence(Intrinsic);
-
-    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
+    case Intrinsic::amdgcn_addrspacecast_nonnull: {
+      unsigned SrcAS =
+          Intrinsic->getOperand(0)->getType()->getPointerAddressSpace();
+      unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();
+      return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
+             DstAS == AMDGPUAS::FLAT_ADDRESS &&
+             ST->hasGloballyAddressableScratch();
+    }
+    default:
+      return AMDGPU::isIntrinsicSourceOfDivergence(IID);
+    }
  }

  // Assume all function calls are a source of divergence.
@ -1008,6 +1019,15 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
  if (isa<InvokeInst>(V))
    return true;

+  // If the target supports globally addressable scratch, the mapping from
+  // scratch memory to the flat aperture changes therefore an address space cast
+  // is no longer uniform.
+  if (auto *CastI = dyn_cast<AddrSpaceCastInst>(V)) {
+    return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
+           CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
+           ST->hasGloballyAddressableScratch();
+  }
+
  return false;
 }

--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@ -10074,7 +10074,30 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,

 InstructionUniformity
 SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
+  const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
  unsigned opcode = MI.getOpcode();
+
+  auto HandleAddrSpaceCast = [this, &MRI](const MachineInstr &MI) {
+    Register Dst = MI.getOperand(0).getReg();
+    Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
+                                       : MI.getOperand(1).getReg();
+    LLT DstTy = MRI.getType(Dst);
+    LLT SrcTy = MRI.getType(Src);
+    unsigned DstAS = DstTy.getAddressSpace();
+    unsigned SrcAS = SrcTy.getAddressSpace();
+    return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
+                   DstAS == AMDGPUAS::FLAT_ADDRESS &&
+                   ST.hasGloballyAddressableScratch()
+               ? InstructionUniformity::NeverUniform
+               : InstructionUniformity::Default;
+  };
+
+  // If the target supports globally addressable scratch, the mapping from
+  // scratch memory to the flat aperture changes therefore an address space cast
+  // is no longer uniform.
+  if (opcode == TargetOpcode::G_ADDRSPACE_CAST)
+    return HandleAddrSpaceCast(MI);
+
  if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
    auto IID = GI->getIntrinsicID();
    if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
@ -10083,6 +10106,8 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
      return InstructionUniformity::AlwaysUniform;

    switch (IID) {
+    case Intrinsic::amdgcn_addrspacecast_nonnull:
+      return HandleAddrSpaceCast(MI);
    case Intrinsic::amdgcn_if:
    case Intrinsic::amdgcn_else:
      // FIXME: Uniform if second result
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/addrspacecast.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/addrspacecast.mir
@ -0,0 +1,35 @@
+# NOTE: This file is Generic MIR translation of llvm/test/Analysis/UniformityAnalysis/AMDGPU/addrspacecast.ll test file
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=print-machine-uniformity -filetype=null %s 2>&1 | FileCheck %s --check-prefix=UNI
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=print-machine-uniformity -filetype=null %s 2>&1 | FileCheck %s --check-prefix=DIV
+
+# UNI: ALL VALUES UNIFORM
+# DIV: DIVERGENT: %3: %3:_(p0) = G_ADDRSPACE_CAST %2:_(p5)
+# DIV: DIVERGENT: %4: %4:_(p0) = G_INTRINSIC intrinsic(@llvm.amdgcn.addrspacecast.nonnull), %2:_(p5)
+
+--- |
+  define void @foo() {
+    %alloca = alloca i32, align 4, addrspace(5)
+    %cast = addrspacecast ptr addrspace(5) %alloca to ptr
+    store i32 1, ptr %cast, align 4
+    %cast.1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) %alloca)
+    store i32 2, ptr %cast.1, align 4
+    ret void
+  }
+...
+---
+name:            foo
+stack:
+  - { id: 0, name: alloca, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+body:             |
+  bb.1 (%ir-block.0):
+    %10:_(s32) = G_CONSTANT i32 1
+    %12:_(s32) = G_CONSTANT i32 2
+    %8:_(p5) = G_FRAME_INDEX %stack.0.alloca
+    %9:_(p0) = G_ADDRSPACE_CAST %8(p5)
+    G_STORE %10(s32), %9(p0) :: (store (s32) into %ir.cast)
+    %11:_(p0) = G_INTRINSIC intrinsic(@llvm.amdgcn.addrspacecast.nonnull), %8(p5)
+    G_STORE %12(s32), %11(p0) :: (store (s32) into %ir.cast.1)
+    SI_RETURN
+...
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/addrspacecast.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/addrspacecast.ll
@ -0,0 +1,14 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s --check-prefix=UNI
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s --check-prefix=DIV
+
+; UNI: ALL VALUES UNIFORM
+; DIV: DIVERGENT:   %cast = addrspacecast ptr addrspace(5) %alloca to ptr
+; DIV: DIVERGENT:   %cast.1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) %alloca)
+define void @foo() {
+  %alloca = alloca i32, align 4, addrspace(5)
+  %cast = addrspacecast ptr addrspace(5) %alloca to ptr
+  store i32 1, ptr %cast
+  %cast.1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) %alloca)
+  store i32 2, ptr %cast.1
+  ret void
+}