[AMDGPU][GFX12.5] Add support for emitting memory operations with nv bit set (#179413)

- Add the `MOThreadPrivate` MachineMemOperand target flag.
- Set nv=1 on memory operations on GFX12.5 if the operation accesses a
  constant address space, is an invariant load, or has the
  `MOThreadPrivate` flag set.
This commit is contained in:
Pierre van Houtryve 2026-02-06 11:35:46 +01:00 committed by GitHub
parent d64a609b2b
commit b738491d2f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 415 additions and 14 deletions

View File

@ -9929,6 +9929,7 @@ SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
{MONoClobber, "amdgpu-noclobber"},
{MOLastUse, "amdgpu-last-use"},
{MOCooperative, "amdgpu-cooperative"},
{MOThreadPrivate, "amdgpu-thread-private"},
};
return ArrayRef(TargetFlags);

View File

@ -52,6 +52,11 @@ static const MachineMemOperand::Flags MOLastUse =
static const MachineMemOperand::Flags MOCooperative =
MachineMemOperand::MOTargetFlag3;
/// Mark the MMO of accesses to memory locations that are
/// never written to by other threads.
/// SIMemoryLegalizer treats an instruction whose MMOs all carry this flag
/// (or MachineMemOperand::MOInvariant) as a non-volatile access.
static const MachineMemOperand::Flags MOThreadPrivate =
MachineMemOperand::MOTargetFlag4;
/// Utility to store machine instructions worklist.
struct SIInstrWorklist {
SIInstrWorklist() = default;

View File

@ -398,6 +398,10 @@ public:
bool IsCrossAddrSpaceOrdering,
Position Pos) const = 0;
/// Handle operations that are considered non-volatile.
/// See \ref isNonVolatileMemoryAccess
/// The default implementation does nothing to \p MI.
/// \returns true if \p MI was changed, false otherwise; the legalizer
/// accumulates this result into its "changed" status.
virtual bool handleNonVolatile(MachineInstr &MI) const { return false; }
/// Virtual destructor to allow derivations to be deleted.
virtual ~SICacheControl() = default;
};
@ -555,6 +559,8 @@ public:
SIAtomicAddrSpace AddrSpace) const override {
return setAtomicScope(MI, Scope, AddrSpace);
}
/// Sets the NV cache-policy bit on \p MI when the subtarget has GFX12.5
/// instructions and \p MI has a cpol operand.
bool handleNonVolatile(MachineInstr &MI) const override;
};
class SIMemoryLegalizer final {
@ -899,6 +905,18 @@ SIMemOpAccess::getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const {
return constructFromMIWithMMO(MI);
}
/// \returns true if \p MI has one or more MMO, and all of them are fit for
/// being marked as non-volatile. This means that either they are accessing the
/// constant address space, are accessing a known invariant memory location, or
/// that they are marked with the non-volatile metadata/MMO flag.
static bool isNonVolatileMemoryAccess(const MachineInstr &MI) {
if (MI.getNumMemOperands() == 0)
return false;
return all_of(MI.memoperands(), [&](const MachineMemOperand *MMO) {
return MMO->getFlags() & (MOThreadPrivate | MachineMemOperand::MOInvariant);
});
}
SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
TII = ST.getInstrInfo();
IV = getIsaVersion(ST.getCPU());
@ -2061,6 +2079,17 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
return Changed;
}
/// Mark \p MI as non-volatile by OR-ing the NV bit into its cache-policy
/// operand. This only applies to subtargets with GFX12.5 instructions and to
/// instructions that actually have a cpol operand.
/// \returns true if the NV bit was written to \p MI, false if nothing was
/// done.
bool SIGfx12CacheControl::handleNonVolatile(MachineInstr &MI) const {
  // The NV CPol bit is only set on GFX12.5.
  if (ST.hasGFX1250Insts()) {
    // Instructions without a cpol operand cannot carry the bit.
    if (MachineOperand *CachePolicy = TII->getNamedOperand(MI, OpName::cpol)) {
      const auto WithNV = CachePolicy->getImm() | AMDGPU::CPol::NV;
      CachePolicy->setImm(WithNV);
      return true;
    }
  }
  return false;
}
bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
@ -2456,20 +2485,21 @@ bool SIMemoryLegalizer::run(MachineFunction &MF) {
MI = II->getIterator();
}
if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
continue;
if (const auto &MOI = MOA.getLoadInfo(MI)) {
Changed |= expandLoad(*MOI, MI);
} else if (const auto &MOI = MOA.getStoreInfo(MI)) {
Changed |= expandStore(*MOI, MI);
} else if (const auto &MOI = MOA.getLDSDMAInfo(MI)) {
Changed |= expandLDSDMA(*MOI, MI);
} else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) {
Changed |= expandAtomicFence(*MOI, MI);
} else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) {
Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
if (MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic) {
if (const auto &MOI = MOA.getLoadInfo(MI))
Changed |= expandLoad(*MOI, MI);
else if (const auto &MOI = MOA.getStoreInfo(MI))
Changed |= expandStore(*MOI, MI);
else if (const auto &MOI = MOA.getLDSDMAInfo(MI))
Changed |= expandLDSDMA(*MOI, MI);
else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
Changed |= expandAtomicFence(*MOI, MI);
else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
}
if (isNonVolatileMemoryAccess(*MI))
Changed |= CC->handleNonVolatile(*MI);
}
}

View File

@ -0,0 +1,365 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU,GFX12-CU-DAGISEL %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU,GFX12-CU-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-DAGISEL %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-GISEL %s
; Baseline: flat (addrspace 0) load and store without !invariant.load.
; Neither the gfx1200 nor the gfx1250 output carries the nv modifier.
define void @flat_i32_nonatomic(ptr addrspace(0) %in, ptr addrspace(0) %out) {
; GFX12-CU-LABEL: flat_i32_nonatomic:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_wait_expcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: flat_load_b32 v0, v[0:1]
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[2:3], v0
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: flat_i32_nonatomic:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: flat_store_b32 v[2:3], v0
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
entry:
%val = load i32, ptr addrspace(0) %in
store i32 %val, ptr addrspace(0) %out
ret void
}
; Flat (addrspace 0) load tagged with !invariant.load: the gfx1250 output
; emits the load with nv, while the gfx1200 output has no such modifier.
define i32 @md_invariant__flat_i32_nonatomic(ptr addrspace(0) %in, ptr addrspace(0) %out) {
; GFX12-CU-LABEL: md_invariant__flat_i32_nonatomic:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_wait_expcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: flat_load_b32 v0, v[0:1]
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: md_invariant__flat_i32_nonatomic:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: flat_load_b32 v0, v[0:1] nv
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
entry:
%val = load i32, ptr addrspace(0) %in, !invariant.load !0
ret i32 %val
}
; Baseline: global (addrspace 1) load and store without !invariant.load.
; Neither the gfx1200 nor the gfx1250 output carries the nv modifier.
define void @global_i32_nonatomic(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GFX12-CU-LABEL: global_i32_nonatomic:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_wait_expcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: global_load_b32 v0, v[0:1], off
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v[2:3], v0, off
; GFX12-CU-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: global_i32_nonatomic:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b32 v0, v[0:1], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_store_b32 v[2:3], v0, off
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
entry:
%val = load i32, ptr addrspace(1) %in
store i32 %val, ptr addrspace(1) %out
ret void
}
; Global (addrspace 1) load tagged with !invariant.load: the gfx1250 output
; emits the load with nv, while the gfx1200 output has no such modifier.
define i32 @md_invariant__global_i32_nonatomic(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GFX12-CU-LABEL: md_invariant__global_i32_nonatomic:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_wait_expcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: global_load_b32 v0, v[0:1], off
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: md_invariant__global_i32_nonatomic:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_load_b32 v0, v[0:1], off nv
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
entry:
%val = load i32, ptr addrspace(1) %in, !invariant.load !0
ret i32 %val
}
; addrspace(4) load without any metadata: the gfx1250 output still emits
; s_load_b32 with nv, while the gfx1200 output has no such modifier.
define i32 @scalar_i32_nonatomic(ptr addrspace(4) inreg %in) {
; GFX12-CU-LABEL: scalar_i32_nonatomic:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_wait_expcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: scalar_i32_nonatomic:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_b32 s0, s[0:1], 0x0 nv
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, s0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
entry:
%val = load i32, ptr addrspace(4) %in
ret i32 %val
}
; addrspace(4) load tagged with !invariant.load: the gfx1250 output emits
; s_load_b32 with nv, while the gfx1200 output has no such modifier.
define i32 @md_invariant__scalar_i32_nonatomic(ptr addrspace(4) inreg %in) {
; GFX12-CU-LABEL: md_invariant__scalar_i32_nonatomic:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_wait_expcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: md_invariant__scalar_i32_nonatomic:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_load_b32 s0, s[0:1], 0x0 nv
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, s0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
entry:
%val = load i32, ptr addrspace(4) %in, !invariant.load !0
ret i32 %val
}
; Baseline: scratch (addrspace 5) load and store without !invariant.load.
; Neither the gfx1200 nor the gfx1250 output carries the nv modifier.
define void @scratch_i32_nonatomic(ptr addrspace(5) %in, ptr addrspace(5) %out) {
; GFX12-CU-LABEL: scratch_i32_nonatomic:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_wait_expcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: scratch_load_b32 v0, v0, off
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: scratch_store_b32 v1, v0, off
; GFX12-CU-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: scratch_i32_nonatomic:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: scratch_load_b32 v0, v0, off
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: scratch_store_b32 v1, v0, off
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
entry:
%val = load i32, ptr addrspace(5) %in
store i32 %val, ptr addrspace(5) %out
ret void
}
; Scratch (addrspace 5) load tagged with !invariant.load: the gfx1250 output
; emits the load with nv, while the gfx1200 output has no such modifier.
define i32 @md_invariant__scratch_i32_nonatomic(ptr addrspace(5) %in, ptr addrspace(5) %out) {
; GFX12-CU-LABEL: md_invariant__scratch_i32_nonatomic:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_wait_expcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: scratch_load_b32 v0, v0, off
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: md_invariant__scratch_i32_nonatomic:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: scratch_load_b32 v0, v0, off nv
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
entry:
%val = load i32, ptr addrspace(5) %in, !invariant.load !0
ret i32 %val
}
; addrspace(6) load without any metadata: the gfx1250 output still emits
; s_load_b32 with nv, while the gfx1200 output has no such modifier.
define i32 @scalar32_i32_nonatomic(ptr addrspace(6) inreg %in) {
; GFX12-CU-LABEL: scalar32_i32_nonatomic:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_wait_expcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: s_mov_b32 s1, 0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: scalar32_i32_nonatomic:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_mov_b32 s1, 0
; GFX1250-NEXT: s_load_b32 s0, s[0:1], 0x0 nv
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, s0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
entry:
%val = load i32, ptr addrspace(6) %in
ret i32 %val
}
; addrspace(6) load tagged with !invariant.load: the gfx1250 output emits
; s_load_b32 with nv, while the gfx1200 output has no such modifier.
define i32 @md_invariant__scalar32_i32_nonatomic(ptr addrspace(6) inreg %in) {
; GFX12-CU-LABEL: md_invariant__scalar32_i32_nonatomic:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_wait_expcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: s_mov_b32 s1, 0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: md_invariant__scalar32_i32_nonatomic:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_mov_b32 s1, 0
; GFX1250-NEXT: s_load_b32 s0, s[0:1], 0x0 nv
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, s0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
entry:
%val = load i32, ptr addrspace(6) %in, !invariant.load !0
ret i32 %val
}
; Baseline: buffer (addrspace 7) load and store without !invariant.load.
; No nv modifier appears for either target. The SelectionDAG and GlobalISel
; outputs are checked under separate prefixes for this function.
define void @buffer_i32_nonatomic(ptr addrspace(7) inreg %in, ptr addrspace(7) inreg %out) {
; GFX12-CU-DAGISEL-LABEL: buffer_i32_nonatomic:
; GFX12-CU-DAGISEL: ; %bb.0: ; %entry
; GFX12-CU-DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-DAGISEL-NEXT: s_wait_expcnt 0x0
; GFX12-CU-DAGISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-DAGISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-DAGISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-DAGISEL-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s21
; GFX12-CU-DAGISEL-NEXT: s_mov_b32 s7, s20
; GFX12-CU-DAGISEL-NEXT: s_mov_b32 s6, s19
; GFX12-CU-DAGISEL-NEXT: s_mov_b32 s5, s18
; GFX12-CU-DAGISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen
; GFX12-CU-DAGISEL-NEXT: s_mov_b32 s4, s17
; GFX12-CU-DAGISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-DAGISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen
; GFX12-CU-DAGISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-CU-GISEL-LABEL: buffer_i32_nonatomic:
; GFX12-CU-GISEL: ; %bb.0: ; %entry
; GFX12-CU-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-GISEL-NEXT: s_wait_expcnt 0x0
; GFX12-CU-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-GISEL-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s21
; GFX12-CU-GISEL-NEXT: s_mov_b32 s4, s17
; GFX12-CU-GISEL-NEXT: s_mov_b32 s5, s18
; GFX12-CU-GISEL-NEXT: s_mov_b32 s6, s19
; GFX12-CU-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen
; GFX12-CU-GISEL-NEXT: s_mov_b32 s7, s20
; GFX12-CU-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen
; GFX12-CU-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-DAGISEL-LABEL: buffer_i32_nonatomic:
; GFX1250-DAGISEL: ; %bb.0: ; %entry
; GFX1250-DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-DAGISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-DAGISEL-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s21
; GFX1250-DAGISEL-NEXT: s_mov_b32 s7, s20
; GFX1250-DAGISEL-NEXT: s_mov_b32 s6, s19
; GFX1250-DAGISEL-NEXT: s_mov_b32 s5, s18
; GFX1250-DAGISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen
; GFX1250-DAGISEL-NEXT: s_mov_b32 s4, s17
; GFX1250-DAGISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-DAGISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen
; GFX1250-DAGISEL-NEXT: s_set_pc_i64 s[30:31]
;
; GFX1250-GISEL-LABEL: buffer_i32_nonatomic:
; GFX1250-GISEL: ; %bb.0: ; %entry
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s21
; GFX1250-GISEL-NEXT: s_mov_b32 s4, s17
; GFX1250-GISEL-NEXT: s_mov_b32 s5, s18
; GFX1250-GISEL-NEXT: s_mov_b32 s6, s19
; GFX1250-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen
; GFX1250-GISEL-NEXT: s_mov_b32 s7, s20
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen
; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31]
entry:
%val = load i32, ptr addrspace(7) %in
store i32 %val, ptr addrspace(7) %out
ret void
}
; Buffer (addrspace 7) load tagged with !invariant.load: the gfx1250 output
; emits buffer_load_b32 with nv, while the gfx1200 output has no such modifier.
define i32 @md_invariant__buffer_i32_nonatomic(ptr addrspace(7) inreg %in, ptr addrspace(7) inreg %out) {
; GFX12-CU-LABEL: md_invariant__buffer_i32_nonatomic:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_wait_expcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s16
; GFX12-CU-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: md_invariant__buffer_i32_nonatomic:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mov_b32_e32 v0, s16
; GFX1250-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen nv
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
entry:
%val = load i32, ptr addrspace(7) %in, !invariant.load !0
ret i32 %val
}
!0 = !{}

View File

@ -85,7 +85,7 @@ body: |
; GFX1250-LABEL: name: promote_async_load_u64
; GFX1250: liveins: $vgpr0, $sgpr4_sgpr5
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
; GFX1250-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 32 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
; GFX1250-NEXT: renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec
; GFX1250-NEXT: renamable $vgpr0 = V_AND_B32_e32 1023, killed $vgpr0, implicit $exec
; GFX1250-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B128_SADDR $vgpr1, $sgpr0_sgpr1, $vgpr0, 0, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3)