[AMDGPU] Select scale_offset for scratch instructions on gfx1250 (#150111)

This commit is contained in:
Stanislav Mekhanoshin 2025-07-22 15:24:55 -07:00 committed by GitHub
parent 2e2a8992f9
commit c6e560a25b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 411 additions and 114 deletions

View File

@ -2116,7 +2116,8 @@ bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
SDValue &VAddr, SDValue &SAddr, SDValue &VAddr, SDValue &SAddr,
SDValue &Offset) const { SDValue &Offset,
SDValue &CPol) const {
int64_t ImmOffset = 0; int64_t ImmOffset = 0;
SDValue LHS, RHS; SDValue LHS, RHS;
@ -2148,6 +2149,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset)) if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
return false; return false;
Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32); Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
return true; return true;
} }
} }
@ -2181,6 +2183,10 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
return false; return false;
SAddr = SelectSAddrFI(CurDAG, SAddr); SAddr = SelectSAddrFI(CurDAG, SAddr);
Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32); Offset = CurDAG->getSignedTargetConstant(ImmOffset, SDLoc(), MVT::i32);
bool ScaleOffset = SelectScaleOffset(N, VAddr, true /* IsSigned */);
CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
SDLoc(), MVT::i32);
return true; return true;
} }

View File

@ -175,7 +175,8 @@ private:
bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr, bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr,
uint64_t ImmOffset) const; uint64_t ImmOffset) const;
bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr, bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr,
SDValue &SAddr, SDValue &Offset) const; SDValue &SAddr, SDValue &Offset,
SDValue &CPol) const;
bool SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode, SDValue *SOffset, bool SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode, SDValue *SOffset,
SDValue *Offset, bool Imm32Only = false, SDValue *Offset, bool Imm32Only = false,

View File

@ -5895,22 +5895,32 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset)) if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
return std::nullopt; return std::nullopt;
unsigned CPol = selectScaleOffset(Root, RHS, true /* IsSigned */)
? AMDGPU::CPol::SCAL
: 0;
if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) { if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
int FI = LHSDef->MI->getOperand(1).getIndex(); int FI = LHSDef->MI->getOperand(1).getIndex();
return {{ return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
[=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
[=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
[=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
}}; }};
} }
if (!isSGPR(LHS))
if (auto Def = getDefSrcRegIgnoringCopies(LHS, *MRI))
LHS = Def->Reg;
if (!isSGPR(LHS)) if (!isSGPR(LHS))
return std::nullopt; return std::nullopt;
return {{ return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
[=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
[=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
[=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); } // cpol
}}; }};
} }

View File

@ -14,7 +14,7 @@ let WantsRoot = true in {
def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>; def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>;
def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>; def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>;
def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [], -10>; def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [], -10>;
def ScratchSVAddr : ComplexPattern<iPTR, 3, "SelectScratchSVAddr", [], [], -10>; def ScratchSVAddr : ComplexPattern<iPTR, 4, "SelectScratchSVAddr", [], [], -10>;
} }
class True16D16Table <string hiOp, string loOp> { class True16D16Table <string hiOp, string loOp> {
@ -1443,19 +1443,19 @@ class ScratchStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
>; >;
class ScratchLoadSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < class ScratchLoadSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset))), (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol))),
(inst $vaddr, $saddr, $offset, 0) (inst $vaddr, $saddr, $offset, $cpol)
>; >;
class ScratchStoreSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node, class ScratchStoreSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> : GCNPat < ValueType vt> : GCNPat <
(node vt:$data, (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset)), (node vt:$data, (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol)),
(inst getVregSrcForVT<vt>.ret:$data, $vaddr, $saddr, $offset) (inst getVregSrcForVT<vt>.ret:$data, $vaddr, $saddr, $offset, $cpol)
>; >;
class ScratchLoadSVaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < class ScratchLoadSVaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset), vt:$in)), (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset, CPol:$cpol), vt:$in)),
(inst $vaddr, $saddr, $offset, 0, $in) (inst $vaddr, $saddr, $offset, $cpol, $in)
>; >;
class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < class ScratchLoadSVaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <

View File

@ -257,20 +257,16 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) {
; GFX12: ; %bb.0: ; %bb ; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: v_mov_b32_e32 v2, 15 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
; GFX12-NEXT: v_mov_b32_e32 v2, 15
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshl_b32 s0, s0, 7 ; GFX12-NEXT: s_lshl_b32 s0, s0, 7
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: scratch_store_b32 v0, v2, s0 scope:SCOPE_SYS
; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX12-NEXT: scratch_store_b32 v0, v2, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS ; GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_endpgm ; GFX12-NEXT: s_endpgm
; ;
@ -357,20 +353,16 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) {
; UNALIGNED_GFX12: ; %bb.0: ; %bb ; UNALIGNED_GFX12: ; %bb.0: ; %bb
; UNALIGNED_GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 ; UNALIGNED_GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0
; UNALIGNED_GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; UNALIGNED_GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15 ; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; UNALIGNED_GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; UNALIGNED_GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15
; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0
; UNALIGNED_GFX12-NEXT: s_lshl_b32 s0, s0, 7 ; UNALIGNED_GFX12-NEXT: s_lshl_b32 s0, s0, 7
; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, s0 scope:SCOPE_SYS
; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, off scope:SCOPE_SYS
; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0
; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS
; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0
; UNALIGNED_GFX12-NEXT: s_endpgm ; UNALIGNED_GFX12-NEXT: s_endpgm
bb: bb:
@ -937,19 +929,17 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, 15
; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
; GFX12-NEXT: v_mov_b32_e32 v2, 15
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:384 scope:SCOPE_SYS ; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:384 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_lshl_b32 s0, s0, 7 ; GFX12-NEXT: s_lshl_b32 s0, s0, 7
; GFX12-NEXT: s_add_co_u32 s0, 0x100, s0 ; GFX12-NEXT: s_add_co_u32 s0, 0x100, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS
; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_endpgm ; GFX12-NEXT: s_endpgm
; ;
@ -1048,19 +1038,17 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
; UNALIGNED_GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; UNALIGNED_GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; UNALIGNED_GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0
; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15
; UNALIGNED_GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; UNALIGNED_GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15
; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0
; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, off offset:384 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, off offset:384 scope:SCOPE_SYS
; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0
; UNALIGNED_GFX12-NEXT: s_lshl_b32 s0, s0, 7 ; UNALIGNED_GFX12-NEXT: s_lshl_b32 s0, s0, 7
; UNALIGNED_GFX12-NEXT: s_add_co_u32 s0, 0x100, s0 ; UNALIGNED_GFX12-NEXT: s_add_co_u32 s0, 0x100, s0
; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS
; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0
; UNALIGNED_GFX12-NEXT: s_endpgm ; UNALIGNED_GFX12-NEXT: s_endpgm
bb: bb:
@ -1579,19 +1567,17 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) {
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, 15
; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
; GFX12-NEXT: v_mov_b32_e32 v2, 15
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16512 scope:SCOPE_SYS ; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16512 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_lshl_b32 s0, s0, 7 ; GFX12-NEXT: s_lshl_b32 s0, s0, 7
; GFX12-NEXT: s_add_co_u32 s0, 0x4000, s0 ; GFX12-NEXT: s_add_co_u32 s0, 0x4000, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS
; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_endpgm ; GFX12-NEXT: s_endpgm
; ;
@ -1692,19 +1678,17 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) {
; UNALIGNED_GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; UNALIGNED_GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; UNALIGNED_GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0
; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15
; UNALIGNED_GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; UNALIGNED_GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15
; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0
; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16512 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16512 scope:SCOPE_SYS
; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0
; UNALIGNED_GFX12-NEXT: s_lshl_b32 s0, s0, 7 ; UNALIGNED_GFX12-NEXT: s_lshl_b32 s0, s0, 7
; UNALIGNED_GFX12-NEXT: s_add_co_u32 s0, 0x4000, s0 ; UNALIGNED_GFX12-NEXT: s_add_co_u32 s0, 0x4000, s0
; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS
; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0
; UNALIGNED_GFX12-NEXT: s_endpgm ; UNALIGNED_GFX12-NEXT: s_endpgm
bb: bb:
@ -4060,9 +4044,7 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset(ptr a
; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: ; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
; GFX12: ; %bb.0: ; %bb ; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0 ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: scratch_store_b32 v0, v1, s0 offset:65512 scope:SCOPE_SYS
; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:65512 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_endpgm ; GFX12-NEXT: s_endpgm
; ;
@ -4113,9 +4095,7 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset(ptr a
; UNALIGNED_GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: ; UNALIGNED_GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
; UNALIGNED_GFX12: ; %bb.0: ; %bb ; UNALIGNED_GFX12: ; %bb.0: ; %bb
; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0 ; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v1, s0 offset:65512 scope:SCOPE_SYS
; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v1, off offset:65512 scope:SCOPE_SYS
; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0
; UNALIGNED_GFX12-NEXT: s_endpgm ; UNALIGNED_GFX12-NEXT: s_endpgm
bb: bb:
@ -4172,9 +4152,7 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(pt
; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: ; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
; GFX12: ; %bb.0: ; %bb ; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0 ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: scratch_store_b32 v0, v1, s0 offset:-16 scope:SCOPE_SYS
; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:-16 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_endpgm ; GFX12-NEXT: s_endpgm
; ;
@ -4223,9 +4201,7 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(pt
; UNALIGNED_GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: ; UNALIGNED_GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
; UNALIGNED_GFX12: ; %bb.0: ; %bb ; UNALIGNED_GFX12: ; %bb.0: ; %bb
; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0 ; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v1, s0 offset:-16 scope:SCOPE_SYS
; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v1, off offset:-16 scope:SCOPE_SYS
; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0
; UNALIGNED_GFX12-NEXT: s_endpgm ; UNALIGNED_GFX12-NEXT: s_endpgm
bb: bb:

View File

@ -150,13 +150,11 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_endpgm ; GFX12-GISEL-NEXT: s_endpgm
bb: bb:
@ -321,15 +319,14 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 2, v0 ; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 2, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_endpgm ; GFX12-GISEL-NEXT: s_endpgm
bb: bb:
@ -494,15 +491,14 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 4, v0 ; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 4, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_endpgm ; GFX12-GISEL-NEXT: s_endpgm
bb: bb:
@ -664,17 +660,15 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
; GFX12-GISEL-LABEL: soff2_voff1: ; GFX12-GISEL-LABEL: soff2_voff1:
; GFX12-GISEL: ; %bb.0: ; %bb ; GFX12-GISEL: ; %bb.0: ; %bb
; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_endpgm ; GFX12-GISEL-NEXT: s_endpgm
bb: bb:
@ -850,13 +844,11 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 2, v0 ; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 2, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_endpgm ; GFX12-GISEL-NEXT: s_endpgm
bb: bb:
@ -1032,13 +1024,11 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 4, v0 ; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 4, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_endpgm ; GFX12-GISEL-NEXT: s_endpgm
bb: bb:
@ -1200,17 +1190,15 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
; GFX12-GISEL-LABEL: soff4_voff1: ; GFX12-GISEL-LABEL: soff4_voff1:
; GFX12-GISEL: ; %bb.0: ; %bb ; GFX12-GISEL: ; %bb.0: ; %bb
; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_endpgm ; GFX12-GISEL-NEXT: s_endpgm
bb: bb:
@ -1386,13 +1374,11 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 2, v0 ; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 2, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_endpgm ; GFX12-GISEL-NEXT: s_endpgm
bb: bb:
@ -1565,13 +1551,11 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 4, v0 ; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 4, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_endpgm ; GFX12-GISEL-NEXT: s_endpgm
bb: bb:
@ -1672,9 +1656,7 @@ define amdgpu_kernel void @soff1_voff1_negative(i32 %soff) {
; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:-1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0
; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: s_endpgm ; GFX12-GISEL-NEXT: s_endpgm
bb: bb:

View File

@ -0,0 +1,322 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
; Private alloca indexed by a zext'd i32: the x4 f32 element stride folds into
; the scale_offset modifier with an 'off' saddr.
define amdgpu_ps float @scratch_load_b32_alloca_idxprom(i32 %idx) {
; GCN-LABEL: scratch_load_b32_alloca_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: scratch_load_b32 v0, v0, off scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
  %p = alloca [32 x i32], align 4, addrspace(5)
  %idxprom = zext i32 %idx to i64
  %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxprom
  %ret = load float, ptr addrspace(5) %arrayidx, align 4
  ret float %ret
}
; sext'd VGPR index plus SGPR base pointer: selected as SVS addressing
; (vaddr v0, saddr s0) with the x4 stride in scale_offset.
define amdgpu_ps float @scratch_load_b32_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
; GCN-LABEL: scratch_load_b32_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: scratch_load_b32 v0, v0, s0 scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
  %idxprom = sext i32 %idx to i64
  %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxprom
  %ret = load float, ptr addrspace(5) %arrayidx, align 4
  ret float %ret
}
; A plain 32-bit index (no extension in the IR) also takes scale_offset.
define amdgpu_ps float @scratch_load_b32_idx32(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
; GCN-LABEL: scratch_load_b32_idx32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: scratch_load_b32 v0, v0, s0 scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
  %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i32 %idx
  %ret = load float, ptr addrspace(5) %arrayidx, align 4
  ret float %ret
}
; Negative test: 8-byte element stride but a 4-byte load. The stride does not
; match the access size, so an explicit v_lshlrev remains and no scale_offset
; is emitted.
define amdgpu_ps float @scratch_load_b32_idxprom_wrong_stride(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
; GCN-LABEL: scratch_load_b32_idxprom_wrong_stride:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: scratch_load_b32 v0, v0, s0
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
  %idxprom = zext i32 %idx to i64
  %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(5) %p, i64 %idxprom
  %ret = load float, ptr addrspace(5) %arrayidx, align 4
  ret float %ret
}
; Constant index bias (+16 halfwords) is split out as the immediate offset:32
; (16 x 2 bytes) while the x2 stride goes into scale_offset.
define amdgpu_ps float @scratch_load_b16_idxprom_ioffset(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
; GCN-LABEL: scratch_load_b16_idxprom_ioffset:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: scratch_load_u16 v0, v0, s0 offset:32 scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
  %idxprom = sext i32 %idx to i64
  %idxadd = add i64 %idxprom, 16
  %arrayidx = getelementptr inbounds i16, ptr addrspace(5) %p, i64 %idxadd
  %ld = load i16, ptr addrspace(5) %arrayidx, align 2
  %ret.i32 = zext i16 %ld to i32
  %ret = bitcast i32 %ret.i32 to float
  ret float %ret
}
; 8-byte elements: the x8 stride is covered by scale_offset on the b64 load.
define amdgpu_ps <2 x float> @scratch_load_b64_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
; GCN-LABEL: scratch_load_b64_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: scratch_load_b64 v[0:1], v0, s0 scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
  %idxprom = zext i32 %idx to i64
  %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(5) %p, i64 %idxprom
  %ret = load <2 x float>, ptr addrspace(5) %arrayidx, align 4
  ret <2 x float> %ret
}
; 12-byte [3 x float] elements also take scale_offset on the b96 load.
define amdgpu_ps <3 x float> @scratch_load_b96_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
; GCN-LABEL: scratch_load_b96_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: scratch_load_b96 v[0:2], v0, s0 scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
  %idxprom = zext i32 %idx to i64
  %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(5) %p, i64 %idxprom
  %ret = load <3 x float>, ptr addrspace(5) %arrayidx, align 4
  ret <3 x float> %ret
}
; +16 element bias becomes the immediate offset:192 (16 x 12 bytes) combined
; with scale_offset on the b96 load.
define amdgpu_ps <3 x float> @scratch_load_b96_idxpromi_ioffset(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
; GCN-LABEL: scratch_load_b96_idxpromi_ioffset:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: scratch_load_b96 v[0:2], v0, s0 offset:192 scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
  %idxprom = zext i32 %idx to i64
  %idxadd = add i64 %idxprom, 16
  %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(5) %p, i64 %idxadd
  %ret = load <3 x float>, ptr addrspace(5) %arrayidx, align 4
  ret <3 x float> %ret
}
; 16-byte elements: scale_offset covers the x16 stride on the b128 load.
define amdgpu_ps <4 x float> @scratch_load_b128_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
; GCN-LABEL: scratch_load_b128_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: scratch_load_b128 v[0:3], v0, s0 scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
  %idxprom = zext i32 %idx to i64
  %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(5) %p, i64 %idxprom
  %ret = load <4 x float>, ptr addrspace(5) %arrayidx, align 4
  ret <4 x float> %ret
}
; The index is itself loaded from scratch with !range [0, 1024), so the zext
; is known-free and scale_offset is still selected for the second load.
define amdgpu_ps float @scratch_load_b32_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
; GCN-LABEL: scratch_load_b32_idxprom_range:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: scratch_load_b32 v0, v0, off
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: scratch_load_b32 v0, v0, s0 scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
  %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0
  %idxprom = zext i32 %idx to i64
  %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxprom
  %ret = load float, ptr addrspace(5) %arrayidx, align 4
  ret float %ret
}
; Range-limited loaded index plus a +16 element bias: offset:64 (16 x 4 bytes)
; combined with scale_offset.
define amdgpu_ps float @scratch_load_b32_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
; GCN-LABEL: scratch_load_b32_idxprom_range_ioffset:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: scratch_load_b32 v0, v0, off
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: scratch_load_b32 v0, v0, s0 offset:64 scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
  %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0
  %idxprom = zext i32 %idx to i64
  %idxadd = add i64 %idxprom, 16
  %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxadd
  %ret = load float, ptr addrspace(5) %arrayidx, align 4
  ret float %ret
}
; Byte-sized elements need no scaling: plain immediate offset:16 on the u8
; load, no scale_offset modifier.
define amdgpu_ps float @scratch_load_b8_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
; GCN-LABEL: scratch_load_b8_idxprom_range_ioffset:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: scratch_load_b32 v0, v0, off
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: scratch_load_u8 v0, v0, s0 offset:16
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
  %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0
  %idxprom = zext i32 %idx to i64
  %idxadd = add i64 %idxprom, 16
  %arrayidx = getelementptr inbounds i8, ptr addrspace(5) %p, i64 %idxadd
  %ld = load i8, ptr addrspace(5) %arrayidx
  %ret.i32 = zext i8 %ld to i32
  %ret = bitcast i32 %ret.i32 to float
  ret float %ret
}
; Range-limited loaded index with a halfword load: x2 stride in scale_offset.
define amdgpu_ps float @scratch_load_b16_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
; GCN-LABEL: scratch_load_b16_idxprom_range:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: scratch_load_b32 v0, v0, off
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: scratch_load_u16 v0, v0, s0 scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
  %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0
  %idxprom = zext i32 %idx to i64
  %arrayidx = getelementptr inbounds i16, ptr addrspace(5) %p, i64 %idxprom
  %ld = load i16, ptr addrspace(5) %arrayidx, align 2
  %ret.i32 = zext i16 %ld to i32
  %ret = bitcast i32 %ret.i32 to float
  ret float %ret
}
; Halfword load with +16 element bias: offset:32 (16 x 2 bytes) plus
; scale_offset.
define amdgpu_ps float @scratch_load_b16_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
; GCN-LABEL: scratch_load_b16_idxprom_range_ioffset:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: scratch_load_b32 v0, v0, off
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: scratch_load_u16 v0, v0, s0 offset:32 scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
  %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0
  %idxprom = zext i32 %idx to i64
  %idxadd = add i64 %idxprom, 16
  %arrayidx = getelementptr inbounds i16, ptr addrspace(5) %p, i64 %idxadd
  %ld = load i16, ptr addrspace(5) %arrayidx, align 2
  %ret.i32 = zext i16 %ld to i32
  %ret = bitcast i32 %ret.i32 to float
  ret float %ret
}
; Range-limited loaded index with 8-byte elements: scale_offset on b64.
define amdgpu_ps <2 x float> @scratch_load_b64_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
; GCN-LABEL: scratch_load_b64_idxprom_range:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: scratch_load_b32 v0, v0, off
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: scratch_load_b64 v[0:1], v0, s0 scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
  %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0
  %idxprom = zext i32 %idx to i64
  %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(5) %p, i64 %idxprom
  %ret = load <2 x float>, ptr addrspace(5) %arrayidx, align 4
  ret <2 x float> %ret
}
; Multiplication is unsigned here, yet the CHECK lines below show scale_offset
; being selected — NOTE(review): this "cannot match" remark looks stale; confirm.
; sext'd range-limited index with 12-byte elements on the b96 load.
define amdgpu_ps <3 x float> @scratch_load_b96_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
; GCN-LABEL: scratch_load_b96_idxprom_range:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: scratch_load_b32 v0, v0, off
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: scratch_load_b96 v[0:2], v0, s0 scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
  %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0
  %idxprom = sext i32 %idx to i64
  %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(5) %p, i64 %idxprom
  %ret = load <3 x float>, ptr addrspace(5) %arrayidx, align 4
  ret <3 x float> %ret
}
; sext'd range-limited index, +16 element bias: offset:192 (16 x 12 bytes)
; combined with scale_offset on b96.
define amdgpu_ps <3 x float> @scratch_load_b96_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
; GCN-LABEL: scratch_load_b96_idxprom_range_ioffset:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: scratch_load_b32 v0, v0, off
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: scratch_load_b96 v[0:2], v0, s0 offset:192 scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
  %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0
  %idxprom = sext i32 %idx to i64
  %idxadd = add i64 %idxprom, 16
  %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(5) %p, i64 %idxadd
  %ret = load <3 x float>, ptr addrspace(5) %arrayidx, align 4
  ret <3 x float> %ret
}
; Range-limited loaded index with 16-byte elements: scale_offset on b128.
define amdgpu_ps <4 x float> @scratch_load_b128_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
; GCN-LABEL: scratch_load_b128_idxprom_range:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: scratch_load_b32 v0, v0, off
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: scratch_load_b128 v[0:3], v0, s0 scale_offset
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: ; return to shader part epilog
entry:
  %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0
  %idxprom = zext i32 %idx to i64
  %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(5) %p, i64 %idxprom
  %ret = load <4 x float>, ptr addrspace(5) %arrayidx, align 4
  ret <4 x float> %ret
}
; Stores also select scale_offset: f32 store at (v0 scaled x4) + s0.
define amdgpu_ps void @scratch_store_b32_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
; GCN-LABEL: scratch_store_b32_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_mov_b32_e32 v1, 1.0
; GCN-NEXT: scratch_store_b32 v0, v1, s0 scale_offset
; GCN-NEXT: s_endpgm
entry:
  %idxprom = zext i32 %idx to i64
  %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxprom
  store float 1.0, ptr addrspace(5) %arrayidx, align 4
  ret void
}
; Halfword store: x2 stride folded into scale_offset.
define amdgpu_ps void @scratch_store_b16_idxprom(ptr addrspace(5) align 2 inreg %p, i32 %idx) {
; GCN-LABEL: scratch_store_b16_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_mov_b32_e32 v1, 1
; GCN-NEXT: scratch_store_b16 v0, v1, s0 scale_offset
; GCN-NEXT: s_endpgm
entry:
  %idxprom = zext i32 %idx to i64
  %arrayidx = getelementptr inbounds i16, ptr addrspace(5) %p, i64 %idxprom
  store i16 1, ptr addrspace(5) %arrayidx, align 2
  ret void
}
; Double store: x8 stride folded into scale_offset on the b64 store.
define amdgpu_ps void @scratch_store_b64_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
; GCN-LABEL: scratch_store_b64_idxprom:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_mov_b64_e32 v[2:3], 1.0
; GCN-NEXT: scratch_store_b64 v0, v[2:3], s0 scale_offset
; GCN-NEXT: s_endpgm
entry:
  %idxprom = zext i32 %idx to i64
  %arrayidx = getelementptr inbounds double, ptr addrspace(5) %p, i64 %idxprom
  store double 1.0, ptr addrspace(5) %arrayidx, align 4
  ret void
}
!0 = !{i32 0, i32 1024}