[AMDGPU][gfx1250] Use SCOPE_SE for stores that may hit scratch (#150586)

This commit is contained in:
Pierre van Houtryve 2025-07-28 11:40:56 +02:00 committed by GitHub
parent d4f9c11e06
commit 2ad4e93ded
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 196 additions and 79 deletions

View File

@ -552,7 +552,7 @@ public:
(!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
// FLAT and SCRATCH instructions may access scratch. Other VMEM
// instructions do not.
if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst))
if (TII->mayAccessScratchThroughFlat(Inst))
return SCRATCH_WRITE_ACCESS;
return VMEM_WRITE_ACCESS;
}
@ -565,7 +565,6 @@ public:
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
bool isVmemAccess(const MachineInstr &MI) const;
bool generateWaitcntInstBefore(MachineInstr &MI,
WaitcntBrackets &ScoreBrackets,
@ -2160,32 +2159,6 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
return false;
}
// This is a flat memory operation. Check to see if it has memory tokens for
// either scratch or FLAT.
bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
const MachineInstr &MI) const {
assert(TII->isFLAT(MI));
// SCRATCH instructions always access scratch.
if (TII->isFLATScratch(MI))
return true;
// GLOBAL instructions never access scratch.
if (TII->isFLATGlobal(MI))
return false;
// If there are no memory operands then conservatively assume the flat
// operation may access scratch.
if (MI.memoperands_empty())
return true;
// See if any memory operand specifies an address space that involves scratch.
return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
unsigned AS = Memop->getAddrSpace();
return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
});
}
bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
(TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));

View File

@ -4249,6 +4249,32 @@ bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
}
/// Determine whether \p MI may access scratch memory.
///
/// Only FLAT-encoded instructions are considered; everything else, and the
/// GLOBAL_ variant, yields false. SCRATCH_ instructions always yield true.
/// For the remaining FLAT_ instructions the answer is conservative: true
/// unless the function is marked "amdgpu-no-flat-scratch-init" or every memory
/// operand names an address space other than private/flat.
bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const {
  if (!isFLAT(MI) || isFLATGlobal(MI))
    return false;

  // SCRATCH instructions always access scratch. This must be tested before
  // the function attribute below; otherwise a SCRATCH_ instruction in a
  // function carrying that attribute would be misreported as not touching
  // scratch.
  // NOTE(review): assumes "amdgpu-no-flat-scratch-init" only rules out scratch
  // accesses made through plain FLAT_ instructions — confirm against the
  // AMDGPU usage documentation.
  if (isFLATScratch(MI))
    return true;

  // If flat scratch is not initialized, a FLAT_ instruction can never reach
  // scratch.
  if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init"))
    return false;

  // If there are no memory operands then conservatively assume the flat
  // operation may access scratch.
  if (MI.memoperands_empty())
    return true;

  // TODO: Consider teaching this to honor !noalias.addrspace information so
  // that accesses known not to be private are not treated as scratch.
  // See if any memory operand specifies an address space that involves scratch.
  return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
    unsigned AS = Memop->getAddrSpace();
    return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
  });
}
bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
// Skip the full operand and register alias search modifiesRegister
// does. There's only a handful of instructions that touch this, it's only an

View File

@ -678,6 +678,12 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::FLAT;
}
/// Query whether \p MI may access scratch memory.
///
/// \returns true for SCRATCH_ instructions, or FLAT_ instructions with
/// SCRATCH_ memory operands.
/// Conservatively correct; will return true if \p MI cannot be proven
/// to not hit scratch.
/// Instructions that are not FLAT-encoded, and GLOBAL_ instructions, always
/// yield false. FLAT_ instructions in a function marked
/// "amdgpu-no-flat-scratch-init" yield false as well.
bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
static bool isBlockLoadStore(uint16_t Opcode) {
switch (Opcode) {
case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:

View File

@ -321,7 +321,8 @@ public:
bool IsNonTemporal,
bool IsLastUse = false) const = 0;
virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
virtual bool finalizeStore(MachineBasicBlock::iterator &MI,
bool Atomic) const {
return false;
};
@ -602,7 +603,8 @@ public:
bool IsVolatile, bool IsNonTemporal,
bool IsLastUse) const override;
bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
bool finalizeStore(MachineBasicBlock::iterator &MI,
bool Atomic) const override;
bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
@ -2551,11 +2553,25 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
return Changed;
}
bool SIGfx12CacheControl::expandSystemScopeStore(
MachineBasicBlock::iterator &MI) const {
// Last cache-policy fix-up applied to a store. Depending on the subtarget it
// either inserts waits before a system-scope store or widens the scope of a
// store that may hit scratch. \p Atomic tells whether the caller is on the
// atomic store path. Returns whatever the invoked helper returns, or false
// when nothing applies.
bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI,
bool Atomic) const {
MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
return insertWaitsBeforeSystemScopeStore(MI);
// Without a cpol operand there is no scope field to inspect or change.
if (!CPol)
return false;
// Isolate the scope bits of the cache-policy immediate.
const unsigned Scope = CPol->getImm() & CPol::SCOPE;
// GFX12.0 only: Extra waits needed before system scope stores.
if (!ST.hasGFX1250Insts()) {
// The waits are only inserted for non-atomic stores at system scope.
if (!Atomic && Scope == CPol::SCOPE_SYS)
return insertWaitsBeforeSystemScopeStore(MI);
return false;
}
// GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address
// space.
// Only stores still at SCOPE_CU are widened; other scope values are kept.
if (TII->mayAccessScratchThroughFlat(*MI) && Scope == CPol::SCOPE_CU)
return setScope(MI, CPol::SCOPE_SE);
return false;
}
@ -2674,6 +2690,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
MOI.getIsCrossAddressSpaceOrdering(),
Position::BEFORE);
Changed |= CC->finalizeStore(MI, /*Atomic=*/true);
return Changed;
}
@ -2686,7 +2703,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
// GFX12 specific, scope(desired coherence domain in cache hierarchy) is
// instruction field, do not confuse it with atomic scope.
Changed |= CC->expandSystemScopeStore(MI);
Changed |= CC->finalizeStore(MI, /*Atomic=*/false);
return Changed;
}

View File

@ -0,0 +1,95 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-SDAG %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-GISEL %s
; Test that stores that may hit scratch are correctly promoted to SCOPE_SE.
; Store through a private (addrspace 5) pointer: the expected output is a
; scratch_store carrying scope:SCOPE_SE.
define void @test_scratch_store(ptr addrspace(5) %ptr, i32 %val) {
; GCN-LABEL: test_scratch_store:
; GCN: ; %bb.0:
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SE
; GCN-NEXT: s_set_pc_i64 s[30:31]
store i32 %val, ptr addrspace(5) %ptr
ret void
}
; Store through a generic pointer with no further address-space information:
; the expected flat_store carries scope:SCOPE_SE.
define void @test_unknown_flat_store(ptr %ptr, i32 %val) {
; GCN-LABEL: test_unknown_flat_store:
; GCN: ; %bb.0:
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GCN-NEXT: s_wait_dscnt 0x0
; GCN-NEXT: s_set_pc_i64 s[30:31]
store i32 %val, ptr %ptr
ret void
}
; Generic-pointer store in a function carrying attribute group #0
; ("amdgpu-no-flat-scratch-init"): the expected flat_store has no scope
; modifier.
define void @test_flat_store_no_scratch_alloc(ptr %ptr, i32 %val) #0 {
; GCN-LABEL: test_flat_store_no_scratch_alloc:
; GCN: ; %bb.0:
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: flat_store_b32 v[0:1], v2
; GCN-NEXT: s_wait_dscnt 0x0
; GCN-NEXT: s_set_pc_i64 s[30:31]
store i32 %val, ptr %ptr
ret void
}
; TODO: Handle !noalias.addrspace. The metadata on this store excludes address
; space 5 (private), so the store should not need to be treated as a potential
; scratch access; for now the expected flat_store still carries scope:SCOPE_SE.
define void @test_flat_store_noalias_addrspace(ptr %ptr, i32 %val) {
; GCN-LABEL: test_flat_store_noalias_addrspace:
; GCN: ; %bb.0:
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
; GCN-NEXT: s_wait_dscnt 0x0
; GCN-NEXT: s_set_pc_i64 s[30:31]
store i32 %val, ptr %ptr, !noalias.addrspace !{i32 5, i32 6}
ret void
}
; TODO: It would be nice to handle this case too. %ptr is a select between
; addrspacecasts of an addrspace(1) and an addrspace(3) pointer, so it is never
; derived from a private pointer; for now the expected flat_store still carries
; scope:SCOPE_SE.
define void @test_flat_store_select(ptr addrspace(1) %a, ptr addrspace(3) %b, i1 %cond, i32 %val) {
; GCN-SDAG-LABEL: test_flat_store_select:
; GCN-SDAG: ; %bb.0:
; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
; GCN-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v2
; GCN-SDAG-NEXT: v_and_b32_e32 v3, 1, v3
; GCN-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base
; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo
; GCN-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, s1, vcc_lo
; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GCN-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
; GCN-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v2, v0
; GCN-SDAG-NEXT: flat_store_b32 v[0:1], v4 scope:SCOPE_SE
; GCN-SDAG-NEXT: s_wait_dscnt 0x0
; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31]
;
; GCN-GISEL-LABEL: test_flat_store_select:
; GCN-GISEL: ; %bb.0:
; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
; GCN-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v2
; GCN-GISEL-NEXT: v_and_b32_e32 v3, 1, v3
; GCN-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base
; GCN-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo
; GCN-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, s1, vcc_lo
; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GCN-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
; GCN-GISEL-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v5, v1
; GCN-GISEL-NEXT: flat_store_b32 v[0:1], v4 scope:SCOPE_SE
; GCN-GISEL-NEXT: s_wait_dscnt 0x0
; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31]
%a.ascast = addrspacecast ptr addrspace(1) %a to ptr
%b.ascast = addrspacecast ptr addrspace(3) %b to ptr
%ptr = select i1 %cond, ptr %a.ascast, ptr %b.ascast
store i32 %val, ptr %ptr
ret void
}
attributes #0 = { "amdgpu-no-flat-scratch-init" }

View File

@ -124,27 +124,27 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac
; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
; GCN-SDAG-NEXT: s_clause 0xd
; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:52
; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:48
; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:44
; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 offset:40
; GCN-SDAG-NEXT: scratch_store_b32 off, v44, s32 offset:36
; GCN-SDAG-NEXT: scratch_store_b32 off, v45, s32 offset:32
; GCN-SDAG-NEXT: scratch_store_b32 off, v56, s32 offset:28
; GCN-SDAG-NEXT: scratch_store_b32 off, v57, s32 offset:24
; GCN-SDAG-NEXT: scratch_store_b32 off, v58, s32 offset:20
; GCN-SDAG-NEXT: scratch_store_b32 off, v59, s32 offset:16
; GCN-SDAG-NEXT: scratch_store_b32 off, v60, s32 offset:12
; GCN-SDAG-NEXT: scratch_store_b32 off, v61, s32 offset:8
; GCN-SDAG-NEXT: scratch_store_b32 off, v62, s32 offset:4
; GCN-SDAG-NEXT: scratch_store_b32 off, v63, s32
; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:52 scope:SCOPE_SE
; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:48 scope:SCOPE_SE
; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:44 scope:SCOPE_SE
; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 offset:40 scope:SCOPE_SE
; GCN-SDAG-NEXT: scratch_store_b32 off, v44, s32 offset:36 scope:SCOPE_SE
; GCN-SDAG-NEXT: scratch_store_b32 off, v45, s32 offset:32 scope:SCOPE_SE
; GCN-SDAG-NEXT: scratch_store_b32 off, v56, s32 offset:28 scope:SCOPE_SE
; GCN-SDAG-NEXT: scratch_store_b32 off, v57, s32 offset:24 scope:SCOPE_SE
; GCN-SDAG-NEXT: scratch_store_b32 off, v58, s32 offset:20 scope:SCOPE_SE
; GCN-SDAG-NEXT: scratch_store_b32 off, v59, s32 offset:16 scope:SCOPE_SE
; GCN-SDAG-NEXT: scratch_store_b32 off, v60, s32 offset:12 scope:SCOPE_SE
; GCN-SDAG-NEXT: scratch_store_b32 off, v61, s32 offset:8 scope:SCOPE_SE
; GCN-SDAG-NEXT: scratch_store_b32 off, v62, s32 offset:4 scope:SCOPE_SE
; GCN-SDAG-NEXT: scratch_store_b32 off, v63, s32 scope:SCOPE_SE
; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:224
; GCN-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:56 ; 16-byte Folded Spill
; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:56 scope:SCOPE_SE ; 16-byte Folded Spill
; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:240
; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:72 ; 16-byte Folded Spill
; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:72 scope:SCOPE_SE ; 16-byte Folded Spill
; GCN-SDAG-NEXT: s_clause 0xd
; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:192
; GCN-SDAG-NEXT: global_load_b128 v[14:17], v[0:1], off offset:208
@ -206,27 +206,27 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac
; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
; GCN-GISEL-NEXT: s_clause 0xf
; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:60
; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:56
; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:52
; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:48
; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:44
; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 offset:40
; GCN-GISEL-NEXT: scratch_store_b32 off, v46, s32 offset:36
; GCN-GISEL-NEXT: scratch_store_b32 off, v47, s32 offset:32
; GCN-GISEL-NEXT: scratch_store_b32 off, v56, s32 offset:28
; GCN-GISEL-NEXT: scratch_store_b32 off, v57, s32 offset:24
; GCN-GISEL-NEXT: scratch_store_b32 off, v58, s32 offset:20
; GCN-GISEL-NEXT: scratch_store_b32 off, v59, s32 offset:16
; GCN-GISEL-NEXT: scratch_store_b32 off, v60, s32 offset:12
; GCN-GISEL-NEXT: scratch_store_b32 off, v61, s32 offset:8
; GCN-GISEL-NEXT: scratch_store_b32 off, v62, s32 offset:4
; GCN-GISEL-NEXT: scratch_store_b32 off, v63, s32
; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:60 scope:SCOPE_SE
; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:56 scope:SCOPE_SE
; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:52 scope:SCOPE_SE
; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:48 scope:SCOPE_SE
; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:44 scope:SCOPE_SE
; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 offset:40 scope:SCOPE_SE
; GCN-GISEL-NEXT: scratch_store_b32 off, v46, s32 offset:36 scope:SCOPE_SE
; GCN-GISEL-NEXT: scratch_store_b32 off, v47, s32 offset:32 scope:SCOPE_SE
; GCN-GISEL-NEXT: scratch_store_b32 off, v56, s32 offset:28 scope:SCOPE_SE
; GCN-GISEL-NEXT: scratch_store_b32 off, v57, s32 offset:24 scope:SCOPE_SE
; GCN-GISEL-NEXT: scratch_store_b32 off, v58, s32 offset:20 scope:SCOPE_SE
; GCN-GISEL-NEXT: scratch_store_b32 off, v59, s32 offset:16 scope:SCOPE_SE
; GCN-GISEL-NEXT: scratch_store_b32 off, v60, s32 offset:12 scope:SCOPE_SE
; GCN-GISEL-NEXT: scratch_store_b32 off, v61, s32 offset:8 scope:SCOPE_SE
; GCN-GISEL-NEXT: scratch_store_b32 off, v62, s32 offset:4 scope:SCOPE_SE
; GCN-GISEL-NEXT: scratch_store_b32 off, v63, s32 scope:SCOPE_SE
; GCN-GISEL-NEXT: s_wait_xcnt 0x8
; GCN-GISEL-NEXT: v_dual_mov_b32 v46, v3 :: v_dual_mov_b32 v47, v4
; GCN-GISEL-NEXT: global_load_b128 v[2:5], v[0:1], off offset:32
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
; GCN-GISEL-NEXT: scratch_store_b128 off, v[2:5], s32 offset:80 ; 16-byte Folded Spill
; GCN-GISEL-NEXT: scratch_store_b128 off, v[2:5], s32 offset:80 scope:SCOPE_SE ; 16-byte Folded Spill
; GCN-GISEL-NEXT: s_clause 0xe
; GCN-GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:48
; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off offset:64
@ -244,7 +244,7 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac
; GCN-GISEL-NEXT: global_load_b128 v[60:63], v[0:1], off offset:16
; GCN-GISEL-NEXT: global_load_b128 v[0:3], v[0:1], off offset:240
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
; GCN-GISEL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:64 ; 16-byte Folded Spill
; GCN-GISEL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:64 scope:SCOPE_SE ; 16-byte Folded Spill
; GCN-GISEL-NEXT: scratch_load_b128 v[0:3], off, s32 offset:80 th:TH_LOAD_LU ; 16-byte Folded Reload
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
; GCN-GISEL-NEXT: s_clause 0xe
@ -299,10 +299,10 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
; GCN-SDAG-NEXT: s_clause 0x3
; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:12
; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:8
; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:4
; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32
; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:12 scope:SCOPE_SE
; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:8 scope:SCOPE_SE
; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:4 scope:SCOPE_SE
; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 scope:SCOPE_SE
; GCN-SDAG-NEXT: s_clause 0x7
; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:112
; GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off offset:96
@ -385,12 +385,12 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
; GCN-GISEL-NEXT: s_clause 0x5
; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:20
; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:16
; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:12
; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:8
; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:4
; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32
; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:20 scope:SCOPE_SE
; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:16 scope:SCOPE_SE
; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:12 scope:SCOPE_SE
; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:8 scope:SCOPE_SE
; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:4 scope:SCOPE_SE
; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 scope:SCOPE_SE
; GCN-GISEL-NEXT: s_clause 0x7
; GCN-GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:80
; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off