diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index dd3f2fe25a23..520c321cb7ea 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -552,7 +552,7 @@ public: (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) { // FLAT and SCRATCH instructions may access scratch. Other VMEM // instructions do not. - if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst)) + if (TII->mayAccessScratchThroughFlat(Inst)) return SCRATCH_WRITE_ACCESS; return VMEM_WRITE_ACCESS; } @@ -565,7 +565,6 @@ public: bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; - bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; bool isVmemAccess(const MachineInstr &MI) const; bool generateWaitcntInstBefore(MachineInstr &MI, WaitcntBrackets &ScoreBrackets, @@ -2160,32 +2159,6 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { return false; } -// This is a flat memory operation. Check to see if it has memory tokens for -// either scratch or FLAT. -bool SIInsertWaitcnts::mayAccessScratchThroughFlat( - const MachineInstr &MI) const { - assert(TII->isFLAT(MI)); - - // SCRATCH instructions always access scratch. - if (TII->isFLATScratch(MI)) - return true; - - // GLOBAL instructions never access scratch. - if (TII->isFLATGlobal(MI)) - return false; - - // If there are no memory operands then conservatively assume the flat - // operation may access scratch. - if (MI.memoperands_empty()) - return true; - - // See if any memory operand specifies an address space that involves scratch. 
- return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) { - unsigned AS = Memop->getAddrSpace(); - return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS; - }); -} - bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const { return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) || (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode())); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 8d6c1d046602..2aa6b4e82f9d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4249,6 +4249,32 @@ bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode); } +bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const { + if (!isFLAT(MI) || isFLATGlobal(MI)) + return false; + + // If scratch is not initialized, we can never access it. + if (MI.getMF()->getFunction().hasFnAttribute("amdgpu-no-flat-scratch-init")) + return false; + + // SCRATCH instructions always access scratch. + if (isFLATScratch(MI)) + return true; + + // If there are no memory operands then conservatively assume the flat + // operation may access scratch. + if (MI.memoperands_empty()) + return true; + + // TODO: Should this also consult !noalias.addrspace metadata? + + // See if any memory operand specifies an address space that involves scratch. + return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) { + unsigned AS = Memop->getAddrSpace(); + return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS; + }); +} + bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) { // Skip the full operand and register alias search modifiesRegister // does. 
There's only a handful of instructions that touch this, it's only an diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 2ffb7835c312..e042b59eb0f0 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -678,6 +678,12 @@ public: return get(Opcode).TSFlags & SIInstrFlags::FLAT; } + /// \returns true for SCRATCH_ instructions, and for FLAT_ instructions with + /// no memory operands or with a private/flat address space memory operand. + /// Conservatively correct; will return true if \p MI cannot be proven + /// to not hit scratch. + bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; + static bool isBlockLoadStore(uint16_t Opcode) { switch (Opcode) { case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 0e8a420fbb70..607825ea50e7 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -321,7 +321,8 @@ public: bool IsNonTemporal, bool IsLastUse = false) const = 0; - virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const { + virtual bool finalizeStore(MachineBasicBlock::iterator &MI, + bool Atomic) const { return false; }; @@ -602,7 +603,8 @@ public: bool IsVolatile, bool IsNonTemporal, bool IsLastUse) const override; - bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override; + bool finalizeStore(MachineBasicBlock::iterator &MI, + bool Atomic) const override; bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, @@ -2551,11 +2553,25 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( return Changed; } -bool SIGfx12CacheControl::expandSystemScopeStore( - MachineBasicBlock::iterator &MI) const { +bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI, + bool Atomic) const { MachineOperand *CPol = TII->getNamedOperand(*MI, 
OpName::cpol); - if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS)) - return insertWaitsBeforeSystemScopeStore(MI); + if (!CPol) + return false; + + const unsigned Scope = CPol->getImm() & CPol::SCOPE; + + // GFX12.0 only: Extra waits needed before system scope stores. + if (!ST.hasGFX1250Insts()) { + if (!Atomic && Scope == CPol::SCOPE_SYS) + return insertWaitsBeforeSystemScopeStore(MI); + return false; + } + + // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address + // space. + if (TII->mayAccessScratchThroughFlat(*MI) && Scope == CPol::SCOPE_CU) + return setScope(MI, CPol::SCOPE_SE); return false; } @@ -2674,6 +2690,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE); + Changed |= CC->finalizeStore(MI, /*Atomic=*/true); return Changed; } @@ -2686,7 +2703,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, // GFX12 specific, scope(desired coherence domain in cache hierarchy) is // instruction field, do not confuse it with atomic scope. - Changed |= CC->expandSystemScopeStore(MI); + Changed |= CC->finalizeStore(MI, /*Atomic=*/false); return Changed; } diff --git a/llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll b/llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll new file mode 100644 index 000000000000..d1e82a06077f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll @@ -0,0 +1,95 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-SDAG %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-GISEL %s + +; Test that stores that may hit scratch are correctly promoted to SCOPE_SE. 
+ +define void @test_scratch_store(ptr addrspace(5) %ptr, i32 %val) { +; GCN-LABEL: test_scratch_store: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SE +; GCN-NEXT: s_set_pc_i64 s[30:31] + store i32 %val, ptr addrspace(5) %ptr + ret void +} + +define void @test_unknown_flat_store(ptr %ptr, i32 %val) { +; GCN-LABEL: test_unknown_flat_store: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE +; GCN-NEXT: s_wait_dscnt 0x0 +; GCN-NEXT: s_set_pc_i64 s[30:31] + store i32 %val, ptr %ptr + ret void +} + +define void @test_flat_store_no_scratch_alloc(ptr %ptr, i32 %val) #0 { +; GCN-LABEL: test_flat_store_no_scratch_alloc: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: flat_store_b32 v[0:1], v2 +; GCN-NEXT: s_wait_dscnt 0x0 +; GCN-NEXT: s_set_pc_i64 s[30:31] + store i32 %val, ptr %ptr + ret void +} + +; TODO: Handle !noalias.addrspace (see mayAccessScratchThroughFlat); SCOPE_SE is still emitted here. +define void @test_flat_store_noalias_addrspace(ptr %ptr, i32 %val) { +; GCN-LABEL: test_flat_store_noalias_addrspace: +; GCN: ; %bb.0: +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE +; GCN-NEXT: s_wait_dscnt 0x0 +; GCN-NEXT: s_set_pc_i64 s[30:31] + store i32 %val, ptr %ptr, !noalias.addrspace !{i32 5, i32 6} + ret void +} + +; TODO: Would be nice to handle too: a select of global/LDS pointers cannot hit scratch. +define void @test_flat_store_select(ptr addrspace(1) %a, ptr addrspace(3) %b, i1 %cond, i32 %val) { +; GCN-SDAG-LABEL: test_flat_store_select: +; GCN-SDAG: ; %bb.0: +; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 +; GCN-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v2 +; GCN-SDAG-NEXT: v_and_b32_e32 v3, 1, v3 +; GCN-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base +; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo +; GCN-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, s1, vcc_lo +; 
GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GCN-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GCN-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v2, v0 +; GCN-SDAG-NEXT: flat_store_b32 v[0:1], v4 scope:SCOPE_SE +; GCN-SDAG-NEXT: s_wait_dscnt 0x0 +; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GCN-GISEL-LABEL: test_flat_store_select: +; GCN-GISEL: ; %bb.0: +; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-GISEL-NEXT: s_wait_kmcnt 0x0 +; GCN-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v2 +; GCN-GISEL-NEXT: v_and_b32_e32 v3, 1, v3 +; GCN-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base +; GCN-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo +; GCN-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, s1, vcc_lo +; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GCN-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 +; GCN-GISEL-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v5, v1 +; GCN-GISEL-NEXT: flat_store_b32 v[0:1], v4 scope:SCOPE_SE +; GCN-GISEL-NEXT: s_wait_dscnt 0x0 +; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31] + %a.ascast = addrspacecast ptr addrspace(1) %a to ptr + %b.ascast = addrspacecast ptr addrspace(3) %b to ptr + %ptr = select i1 %cond, ptr %a.ascast, ptr %b.ascast + store i32 %val, ptr %ptr + ret void +} + +attributes #0 = { "amdgpu-no-flat-scratch-init" } diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll index fd644a35f61e..3a898a921446 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll @@ -124,27 +124,27 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac ; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 ; GCN-SDAG-NEXT: s_clause 0xd -; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:52 -; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:48 -; GCN-SDAG-NEXT: scratch_store_b32 off, v42, 
s32 offset:44 -; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 offset:40 -; GCN-SDAG-NEXT: scratch_store_b32 off, v44, s32 offset:36 -; GCN-SDAG-NEXT: scratch_store_b32 off, v45, s32 offset:32 -; GCN-SDAG-NEXT: scratch_store_b32 off, v56, s32 offset:28 -; GCN-SDAG-NEXT: scratch_store_b32 off, v57, s32 offset:24 -; GCN-SDAG-NEXT: scratch_store_b32 off, v58, s32 offset:20 -; GCN-SDAG-NEXT: scratch_store_b32 off, v59, s32 offset:16 -; GCN-SDAG-NEXT: scratch_store_b32 off, v60, s32 offset:12 -; GCN-SDAG-NEXT: scratch_store_b32 off, v61, s32 offset:8 -; GCN-SDAG-NEXT: scratch_store_b32 off, v62, s32 offset:4 -; GCN-SDAG-NEXT: scratch_store_b32 off, v63, s32 +; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:52 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:48 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:44 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 offset:40 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v44, s32 offset:36 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v45, s32 offset:32 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v56, s32 offset:28 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v57, s32 offset:24 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v58, s32 offset:20 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v59, s32 offset:16 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v60, s32 offset:12 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v61, s32 offset:8 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v62, s32 offset:4 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v63, s32 scope:SCOPE_SE ; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:224 ; GCN-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 -; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:56 ; 16-byte Folded Spill +; GCN-SDAG-NEXT: scratch_store_b128 off, 
v[6:9], s32 offset:56 scope:SCOPE_SE ; 16-byte Folded Spill ; GCN-SDAG-NEXT: global_load_b128 v[6:9], v[0:1], off offset:240 ; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 -; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:72 ; 16-byte Folded Spill +; GCN-SDAG-NEXT: scratch_store_b128 off, v[6:9], s32 offset:72 scope:SCOPE_SE ; 16-byte Folded Spill ; GCN-SDAG-NEXT: s_clause 0xd ; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:192 ; GCN-SDAG-NEXT: global_load_b128 v[14:17], v[0:1], off offset:208 @@ -206,27 +206,27 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac ; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-GISEL-NEXT: s_wait_kmcnt 0x0 ; GCN-GISEL-NEXT: s_clause 0xf -; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:60 -; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:56 -; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:52 -; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:48 -; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:44 -; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 offset:40 -; GCN-GISEL-NEXT: scratch_store_b32 off, v46, s32 offset:36 -; GCN-GISEL-NEXT: scratch_store_b32 off, v47, s32 offset:32 -; GCN-GISEL-NEXT: scratch_store_b32 off, v56, s32 offset:28 -; GCN-GISEL-NEXT: scratch_store_b32 off, v57, s32 offset:24 -; GCN-GISEL-NEXT: scratch_store_b32 off, v58, s32 offset:20 -; GCN-GISEL-NEXT: scratch_store_b32 off, v59, s32 offset:16 -; GCN-GISEL-NEXT: scratch_store_b32 off, v60, s32 offset:12 -; GCN-GISEL-NEXT: scratch_store_b32 off, v61, s32 offset:8 -; GCN-GISEL-NEXT: scratch_store_b32 off, v62, s32 offset:4 -; GCN-GISEL-NEXT: scratch_store_b32 off, v63, s32 +; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:60 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:56 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:52 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:48 scope:SCOPE_SE +; 
GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:44 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 offset:40 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v46, s32 offset:36 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v47, s32 offset:32 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v56, s32 offset:28 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v57, s32 offset:24 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v58, s32 offset:20 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v59, s32 offset:16 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v60, s32 offset:12 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v61, s32 offset:8 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v62, s32 offset:4 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v63, s32 scope:SCOPE_SE ; GCN-GISEL-NEXT: s_wait_xcnt 0x8 ; GCN-GISEL-NEXT: v_dual_mov_b32 v46, v3 :: v_dual_mov_b32 v47, v4 ; GCN-GISEL-NEXT: global_load_b128 v[2:5], v[0:1], off offset:32 ; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 -; GCN-GISEL-NEXT: scratch_store_b128 off, v[2:5], s32 offset:80 ; 16-byte Folded Spill +; GCN-GISEL-NEXT: scratch_store_b128 off, v[2:5], s32 offset:80 scope:SCOPE_SE ; 16-byte Folded Spill ; GCN-GISEL-NEXT: s_clause 0xe ; GCN-GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:48 ; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off offset:64 @@ -244,7 +244,7 @@ define i32 @test_v64i32_load_store(ptr addrspace(1) %ptr, i32 %idx, ptr addrspac ; GCN-GISEL-NEXT: global_load_b128 v[60:63], v[0:1], off offset:16 ; GCN-GISEL-NEXT: global_load_b128 v[0:3], v[0:1], off offset:240 ; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 -; GCN-GISEL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:64 ; 16-byte Folded Spill +; GCN-GISEL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:64 scope:SCOPE_SE ; 16-byte Folded Spill ; GCN-GISEL-NEXT: scratch_load_b128 v[0:3], off, s32 offset:80 th:TH_LOAD_LU ; 
16-byte Folded Reload ; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 ; GCN-GISEL-NEXT: s_clause 0xe @@ -299,10 +299,10 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 ; GCN-SDAG-NEXT: s_clause 0x3 -; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:12 -; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:8 -; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:4 -; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 +; GCN-SDAG-NEXT: scratch_store_b32 off, v40, s32 offset:12 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v41, s32 offset:8 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v42, s32 offset:4 scope:SCOPE_SE +; GCN-SDAG-NEXT: scratch_store_b32 off, v43, s32 scope:SCOPE_SE ; GCN-SDAG-NEXT: s_clause 0x7 ; GCN-SDAG-NEXT: global_load_b128 v[10:13], v[0:1], off offset:112 ; GCN-SDAG-NEXT: global_load_b128 v[18:21], v[0:1], off offset:96 @@ -385,12 +385,12 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-GISEL-NEXT: s_wait_kmcnt 0x0 ; GCN-GISEL-NEXT: s_clause 0x5 -; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:20 -; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:16 -; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:12 -; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:8 -; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:4 -; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 +; GCN-GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:20 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v41, s32 offset:16 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v42, s32 offset:12 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v43, s32 offset:8 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v44, s32 offset:4 scope:SCOPE_SE +; GCN-GISEL-NEXT: scratch_store_b32 off, v45, s32 scope:SCOPE_SE ; 
GCN-GISEL-NEXT: s_clause 0x7 ; GCN-GISEL-NEXT: global_load_b128 v[6:9], v[0:1], off offset:80 ; GCN-GISEL-NEXT: global_load_b128 v[10:13], v[0:1], off