[AMDGPU] Implement LSR cost model for GFX9+ (#184138)

AMDGPU previously had no target-specific LSR cost model, so the generic
heuristic would often introduce extra induction variables and base-add
chains that hurt VALU throughput on GFX9+ (observed on gfx942).

Implement a custom cost model:

- isLSRCostLess(): prioritize per-iteration instruction count over setup
costs, penalize IV multiplies, and demote register count. Pre-GFX9 falls
back to the default comparator.
- getScalingFactorCost(): report that base+scale*index addressing
requires an extra ADD instruction, except on gfx1250+ where the
scale_offset bit folds a scale equal to the memory access size.
- isNumRegsMajorCostOfLSR(): return false.
- shouldDropLSRSolutionIfLessProfitable(): return true.

Assisted-by: Claude Opus
This commit is contained in:
michaelselehov 2026-03-23 12:18:11 +01:00 committed by GitHub
parent bd6e8a8c66
commit 621fc8774e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
20 changed files with 7424 additions and 7344 deletions

View File

@ -15,6 +15,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIModeRegisterDefaults.h"
@ -1736,3 +1737,50 @@ GCNTTIImpl::getInstructionUniformity(const Value *V) const {
return InstructionUniformity::Default;
}
/// Cost of a scaled index (base + Scale * index) in an addressing mode.
///
/// Returns 0 when the scale can be folded into the memory instruction, 1 when
/// a base register plus a non-zero scale needs a separate instruction, and
/// defers to the base implementation for every other addressing form.
InstructionCost GCNTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                                 StackOffset BaseOffset,
                                                 bool HasBaseReg, int64_t Scale,
                                                 unsigned AddrSpace) const {
  // Only the base+scale*index form is modelled here.
  if (!HasBaseReg || Scale == 0)
    return BaseT::getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg,
                                       Scale, AddrSpace);

  // gfx1250+ can fold base+scale*index when the scale matches the memory
  // access size (scale_offset bit). Without that feature, or without a sized
  // type to compare the scale against, the scaled index costs one instruction.
  if (!getST()->hasScaleOffset() || !Ty || !Ty->isSized())
    return 1;

  // scale_offset is supported for flat/global/constant/scratch (VMEM, max 128
  // bits) and constant_32bit (SMRD, capped to 128 bits here).
  const bool IsScaleOffsetAddrSpace =
      AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
      AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
      AMDGPU::isExtendedGlobalAddrSpace(AddrSpace);
  if (!IsScaleOffsetAddrSpace)
    return 1;

  // The access must be at most 16 bytes (128 bits); this check also keeps
  // getFixedValue() below from being reached for a size not known to fit.
  const TypeSize AccessBytes = getDataLayout().getTypeStoreSize(Ty);
  if (!TypeSize::isKnownLE(AccessBytes, TypeSize::getFixed(16)))
    return 1;

  // Free only when the scale is exactly the access size in bytes.
  const bool ScaleMatchesAccess =
      static_cast<int64_t>(AccessBytes.getFixedValue()) == Scale;
  return ScaleMatchesAccess ? 0 : 1;
}
/// Strict "A is cheaper than B" ordering for two LSR solutions.
///
/// The costs are compared lexicographically in this order: effective
/// instruction count (Insns + ScaleCost), NumIVMuls, AddRecCost, NumBaseAdds,
/// SetupCost, ImmCost and finally NumRegs.
///
/// NOTE(review): the change description says pre-GFX9 subtargets fall back to
/// the default comparator, but no subtarget check is present here - confirm
/// which of the two is intended.
bool GCNTTIImpl::isLSRCostLess(const TTI::LSRCost &A,
                               const TTI::LSRCost &B) const {
  // Favor lower per-iteration work over preheader/setup costs.
  // AMDGPU lacks rich addressing modes, so ScaleCost is folded into the
  // effective instruction count (base+scale*index requires a separate ADD).
  //
  // Form the sum in 64 bits: adding the two counters in `unsigned` could wrap
  // around for very large (e.g. saturated) values and make an expensive
  // solution compare as cheap.
  const uint64_t EffInsnsA = static_cast<uint64_t>(A.Insns) + A.ScaleCost;
  const uint64_t EffInsnsB = static_cast<uint64_t>(B.Insns) + B.ScaleCost;

  return std::tie(EffInsnsA, A.NumIVMuls, A.AddRecCost, A.NumBaseAdds,
                  A.SetupCost, A.ImmCost, A.NumRegs) <
         std::tie(EffInsnsB, B.NumIVMuls, B.AddRecCost, B.NumBaseAdds,
                  B.SetupCost, B.ImmCost, B.NumRegs);
}
/// Always returns false for this target: isLSRCostLess() compares NumRegs
/// last, so register count is not treated as the major LSR cost here.
bool GCNTTIImpl::isNumRegsMajorCostOfLSR() const {
// isLSRCostLess de-prioritizes register count; keep consistent.
return false;
}
/// Always returns true for this target.
/// NOTE(review): presumably this makes LSR keep the original loop when its
/// chosen solution is not cheaper than the baseline - confirm against the
/// TargetTransformInfo hook documentation.
bool GCNTTIImpl::shouldDropLSRSolutionIfLessProfitable() const {
// Prefer the baseline when LSR cannot clearly reduce per-iteration work.
return true;
}

View File

@ -311,6 +311,16 @@ public:
unsigned getNumberOfParts(Type *Tp) const override;
InstructionUniformity getInstructionUniformity(const Value *V) const override;
/// Cost of using \p Scale as the index scale in an addressing mode for an
/// access of type \p Ty in address space \p AddrSpace. The implementation
/// handles the base-register-plus-non-zero-scale form itself and defers to
/// the base class otherwise.
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
StackOffset BaseOffset, bool HasBaseReg,
int64_t Scale,
unsigned AddrSpace) const override;
/// Returns true if LSR cost \p A should be ranked as cheaper than \p B.
bool isLSRCostLess(const TTI::LSRCost &A,
const TTI::LSRCost &B) const override;
/// Whether register count is the major cost for LSR; the implementation
/// returns false.
bool isNumRegsMajorCostOfLSR() const override;
/// Whether a less profitable LSR solution should be dropped; the
/// implementation returns true.
bool shouldDropLSRSolutionIfLessProfitable() const override;
};
} // end namespace llvm

View File

@ -11,18 +11,21 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind {
; GCN-NEXT: s_cbranch_scc1 .LBB0_3
; GCN-NEXT: ; %bb.1: ; %for.body.preheader
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_addk_i32 s0, 0x80
; GCN-NEXT: s_mov_b32 s1, 0
; GCN-NEXT: s_and_b64 vcc, exec, -1
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: .LBB0_2: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_lshl_b32 s2, s1, 2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_i32 s2, s0, s2
; GCN-NEXT: s_addk_i32 s2, 0x80
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
; GCN-NEXT: ds_write_b32 v0, v1
; GCN-NEXT: s_add_i32 s0, s0, 4
; GCN-NEXT: s_add_i32 s1, s1, 1
; GCN-NEXT: s_mov_b64 vcc, vcc
; GCN-NEXT: s_cbranch_vccnz .LBB0_2
; GCN-NEXT: .LBB0_3: ; %for.exit
@ -109,18 +112,21 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi
; GCN-LABEL: loop_const_true:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_addk_i32 s0, 0x80
; GCN-NEXT: s_mov_b32 s1, 0
; GCN-NEXT: s_and_b64 vcc, exec, -1
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: .LBB1_1: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_lshl_b32 s2, s1, 2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_i32 s2, s0, s2
; GCN-NEXT: s_addk_i32 s2, 0x80
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
; GCN-NEXT: ds_write_b32 v0, v1
; GCN-NEXT: s_add_i32 s0, s0, 4
; GCN-NEXT: s_add_i32 s1, s1, 1
; GCN-NEXT: s_mov_b64 vcc, vcc
; GCN-NEXT: s_cbranch_vccnz .LBB1_1
; GCN-NEXT: ; %bb.2: ; %DummyReturnBlock
@ -368,22 +374,25 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind {
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_read_u8 v0, v0
; GCN-NEXT: s_load_dword s4, s[4:5], 0x9
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: s_bitcmp1_b32 s0, 0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_xor_b64 s[2:3], s[0:1], -1
; GCN-NEXT: s_add_i32 s0, s4, 0x80
; GCN-NEXT: v_readfirstlane_b32 s1, v0
; GCN-NEXT: s_bitcmp1_b32 s1, 0
; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], -1
; GCN-NEXT: s_mov_b32 s1, 0
; GCN-NEXT: s_and_b64 vcc, exec, s[2:3]
; GCN-NEXT: .LBB4_1: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_lshl_b32 s2, s1, 2
; GCN-NEXT: s_add_i32 s2, s0, s2
; GCN-NEXT: s_addk_i32 s2, 0x80
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
; GCN-NEXT: ds_write_b32 v0, v1
; GCN-NEXT: s_add_i32 s0, s0, 4
; GCN-NEXT: s_add_i32 s1, s1, 1
; GCN-NEXT: s_mov_b64 vcc, vcc
; GCN-NEXT: s_cbranch_vccz .LBB4_1
; GCN-NEXT: ; %bb.2: ; %for.exit

View File

@ -17,14 +17,13 @@ define amdgpu_kernel void @copy_to_reg_frameindex(ptr addrspace(1) %out, i32 %a,
; GFX7-NEXT: s_add_u32 s12, s12, s11
; GFX7-NEXT: s_addc_u32 s13, s13, 0
; GFX7-NEXT: s_mov_b32 s0, 0
; GFX7-NEXT: s_mov_b32 s1, 0
; GFX7-NEXT: .LBB0_1: ; %loop
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: s_add_i32 s1, s1, 1
; GFX7-NEXT: s_add_i32 s0, s0, 4
; GFX7-NEXT: s_cmp_lt_u32 s1, 16
; GFX7-NEXT: s_lshl_b32 s1, s0, 2
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_add_i32 s0, s0, 1
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_cmp_lt_u32 s0, 16
; GFX7-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
; GFX7-NEXT: s_cbranch_scc1 .LBB0_1
; GFX7-NEXT: ; %bb.2: ; %done
@ -45,14 +44,13 @@ define amdgpu_kernel void @copy_to_reg_frameindex(ptr addrspace(1) %out, i32 %a,
; GFX8-NEXT: s_add_u32 s88, s88, s11
; GFX8-NEXT: s_addc_u32 s89, s89, 0
; GFX8-NEXT: s_mov_b32 s0, 0
; GFX8-NEXT: s_mov_b32 s1, 0
; GFX8-NEXT: .LBB0_1: ; %loop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: s_add_i32 s1, s1, 1
; GFX8-NEXT: s_add_i32 s0, s0, 4
; GFX8-NEXT: s_cmp_lt_u32 s1, 16
; GFX8-NEXT: s_lshl_b32 s1, s0, 2
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_add_i32 s0, s0, 1
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_cmp_lt_u32 s0, 16
; GFX8-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
; GFX8-NEXT: s_cbranch_scc1 .LBB0_1
; GFX8-NEXT: ; %bb.2: ; %done

View File

@ -9,19 +9,21 @@ declare void @llvm.amdgcn.s.barrier() #1
; CHECK-LABEL: {{^}}signed_ds_offset_addressing_loop:
; CHECK: BB0_1:
; CHECK: v_add_i32_e32 [[VADDR:v[0-9]+]],
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]]
; SI-DAG: v_add_i32_e32 [[VADDR8:v[0-9]+]], vcc, 8, [[VADDR]]
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR8]]
; SI-DAG: v_add_i32_e32 [[VADDR0x80:v[0-9]+]], vcc, 0x80, [[VADDR]]
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x80]]
; SI-DAG: v_add_i32_e32 [[VADDR0x88:v[0-9]+]], vcc, 0x88, [[VADDR]]
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x88]]
; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], vcc, 0x100, [[VADDR]]
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x100]]
; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset1:2
; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:34
; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] offset:256
; SI-DAG: v_add_i32_e32 [[VADDR1:v[0-9]+]], vcc, 0xc20, [[VADDR]]
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR1]]
; SI-DAG: v_add_i32_e32 [[VADDR2:v[0-9]+]], vcc, {{s[0-9]+}}, [[VADDR]]
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR2]]
; SI-DAG: v_add_i32_e32 [[VADDR3:v[0-9]+]], vcc, 0xca0, [[VADDR]]
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR3]]
; SI-DAG: v_add_i32_e32 [[VADDR4:v[0-9]+]], vcc, 0xca8, [[VADDR]]
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR4]]
; SI-DAG: v_add_i32_e32 [[VADDR5:v[0-9]+]], vcc, 0xd20, [[VADDR]]
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR5]]
;
; CI: v_add_i32_e32 [[VADDRCI:v[0-9]+]], vcc, {{s[0-9]+}}, [[VADDR]]
; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDRCI]] offset0:8 offset1:10
; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDRCI]] offset0:40 offset1:42
; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] offset:3360
; CHECK: s_endpgm
define amdgpu_kernel void @signed_ds_offset_addressing_loop(ptr addrspace(1) noalias nocapture %out, ptr addrspace(3) noalias nocapture readonly %lptr, i32 %n) #2 {
entry:

View File

@ -2741,16 +2741,15 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
; GFX1250-SDAG: ; %bb.0: ; %bb
; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0
; GFX1250-SDAG-NEXT: s_movk_i32 s0, 0x100
; GFX1250-SDAG-NEXT: .LBB116_1: ; %bb3
; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400
; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s0, -1
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4
; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0
; GFX1250-SDAG-NEXT: s_cbranch_scc0 .LBB116_1
; GFX1250-SDAG-NEXT: ; %bb.2: ; %bb2
; GFX1250-SDAG-NEXT: s_endpgm
@ -2758,19 +2757,19 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
; GFX1250-GISEL-LABEL: flat_addr_64bit_lsr_iv:
; GFX1250-GISEL: ; %bb.0: ; %bb
; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX1250-GISEL-NEXT: s_movk_i32 s0, 0x100
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX1250-GISEL-NEXT: .LBB116_1: ; %bb3
; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 4, v[2:3]
; GFX1250-GISEL-NEXT: flat_load_b32 v4, v[4:5] scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x400, v2
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, s0, v0, 4
; GFX1250-GISEL-NEXT: v_add_nc_u32_e32 v2, -1, v2
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX1250-GISEL-NEXT: s_cbranch_vccz .LBB116_1
; GFX1250-GISEL-NEXT: ; %bb.2: ; %bb2
; GFX1250-GISEL-NEXT: s_endpgm
@ -2779,16 +2778,15 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
; GFX1250-NOECC: ; %bb.0: ; %bb
; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NOECC-NEXT: s_mov_b64 s[0:1], 0
; GFX1250-NOECC-NEXT: s_movk_i32 s0, 0x100
; GFX1250-NOECC-NEXT: .LBB116_1: ; %bb3
; GFX1250-NOECC-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1250-NOECC-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NOECC-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
; GFX1250-NOECC-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
; GFX1250-NOECC-NEXT: s_wait_dscnt 0x0
; GFX1250-NOECC-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
; GFX1250-NOECC-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX1250-NOECC-NEXT: s_wait_loadcnt 0x0
; GFX1250-NOECC-NEXT: s_cmp_eq_u32 s0, 0x400
; GFX1250-NOECC-NEXT: s_add_co_i32 s0, s0, -1
; GFX1250-NOECC-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4
; GFX1250-NOECC-NEXT: s_cmp_eq_u32 s0, 0
; GFX1250-NOECC-NEXT: s_cbranch_scc0 .LBB116_1
; GFX1250-NOECC-NEXT: ; %bb.2: ; %bb2
; GFX1250-NOECC-NEXT: s_endpgm
@ -2815,19 +2813,17 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
; GFX1250-SDAG: ; %bb.0: ; %bb
; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0
; GFX1250-SDAG-NEXT: s_movk_i32 s0, 0x100
; GFX1250-SDAG-NEXT: .LBB117_1: ; %bb3
; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400
; GFX1250-SDAG-NEXT: ; kill: killed $sgpr4_sgpr5
; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s0, -1
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4
; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0
; GFX1250-SDAG-NEXT: s_cbranch_scc0 .LBB117_1
; GFX1250-SDAG-NEXT: ; %bb.2: ; %bb2
; GFX1250-SDAG-NEXT: s_endpgm
@ -2835,22 +2831,21 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
; GFX1250-GISEL-LABEL: flat_addr_64bit_lsr_iv_multiload:
; GFX1250-GISEL: ; %bb.0: ; %bb
; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX1250-GISEL-NEXT: s_movk_i32 s0, 0x100
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX1250-GISEL-NEXT: .LBB117_1: ; %bb3
; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 4, v[2:3]
; GFX1250-GISEL-NEXT: ; kill: killed $vgpr4 killed $vgpr5
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
; GFX1250-GISEL-NEXT: flat_load_b32 v6, v[4:5] scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-GISEL-NEXT: flat_load_b32 v6, v[4:5] scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x400, v2
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, s0, v0, 4
; GFX1250-GISEL-NEXT: v_add_nc_u32_e32 v2, -1, v2
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
; GFX1250-GISEL-NEXT: s_cbranch_vccz .LBB117_1
; GFX1250-GISEL-NEXT: ; %bb.2: ; %bb2
; GFX1250-GISEL-NEXT: s_endpgm
@ -2859,19 +2854,17 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
; GFX1250-NOECC: ; %bb.0: ; %bb
; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-NOECC-NEXT: s_mov_b64 s[0:1], 0
; GFX1250-NOECC-NEXT: s_movk_i32 s0, 0x100
; GFX1250-NOECC-NEXT: .LBB117_1: ; %bb3
; GFX1250-NOECC-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1250-NOECC-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NOECC-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
; GFX1250-NOECC-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
; GFX1250-NOECC-NEXT: s_wait_dscnt 0x0
; GFX1250-NOECC-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
; GFX1250-NOECC-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX1250-NOECC-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NOECC-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
; GFX1250-NOECC-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX1250-NOECC-NEXT: s_wait_loadcnt 0x0
; GFX1250-NOECC-NEXT: s_cmp_eq_u32 s0, 0x400
; GFX1250-NOECC-NEXT: ; kill: killed $sgpr4_sgpr5
; GFX1250-NOECC-NEXT: s_add_co_i32 s0, s0, -1
; GFX1250-NOECC-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4
; GFX1250-NOECC-NEXT: s_cmp_eq_u32 s0, 0
; GFX1250-NOECC-NEXT: s_cbranch_scc0 .LBB117_1
; GFX1250-NOECC-NEXT: ; %bb.2: ; %bb2
; GFX1250-NOECC-NEXT: s_endpgm

View File

@ -4720,17 +4720,16 @@ define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr a
define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
; GFX9-LABEL: global_addr_64bit_lsr_iv:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: s_movk_i32 s0, 0x100
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: .LBB132_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_add_u32 s4, s2, s0
; GFX9-NEXT: s_addc_u32 s5, s3, s1
; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_add_u32 s0, s0, 4
; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX9-NEXT: s_add_i32 s0, s0, -1
; GFX9-NEXT: s_add_u32 s2, s2, 4
; GFX9-NEXT: s_addc_u32 s3, s3, 0
; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: s_cbranch_scc0 .LBB132_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
@ -4738,17 +4737,16 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
; GFX10-LABEL: global_addr_64bit_lsr_iv:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: s_movk_i32 s0, 0x100
; GFX10-NEXT: .LBB132_1: ; %bb3
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_add_u32 s4, s2, s0
; GFX10-NEXT: s_addc_u32 s5, s3, s1
; GFX10-NEXT: s_add_u32 s0, s0, 4
; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX10-NEXT: s_add_i32 s0, s0, -1
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_add_u32 s2, s2, 4
; GFX10-NEXT: s_addc_u32 s3, s3, 0
; GFX10-NEXT: s_cmp_eq_u32 s0, 0
; GFX10-NEXT: s_cbranch_scc0 .LBB132_1
; GFX10-NEXT: ; %bb.2: ; %bb2
; GFX10-NEXT: s_endpgm
@ -4756,17 +4754,15 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
; GFX11-LABEL: global_addr_64bit_lsr_iv:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_movk_i32 s0, 0x100
; GFX11-NEXT: .LBB132_1: ; %bb3
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_u32 s4, s2, s0
; GFX11-NEXT: s_addc_u32 s5, s3, s1
; GFX11-NEXT: s_add_u32 s0, s0, 4
; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_addc_u32 s1, s1, 0
; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX11-NEXT: s_add_i32 s0, s0, -1
; GFX11-NEXT: s_add_u32 s2, s2, 4
; GFX11-NEXT: s_addc_u32 s3, s3, 0
; GFX11-NEXT: s_cmp_eq_u32 s0, 0
; GFX11-NEXT: s_cbranch_scc0 .LBB132_1
; GFX11-NEXT: ; %bb.2: ; %bb2
; GFX11-NEXT: s_endpgm
@ -4774,15 +4770,14 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
; GFX12-SDAG-LABEL: global_addr_64bit_lsr_iv:
; GFX12-SDAG: ; %bb.0: ; %bb
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX12-SDAG-NEXT: s_mov_b64 s[0:1], 0
; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x100
; GFX12-SDAG-NEXT: .LBB132_1: ; %bb3
; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400
; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -1
; GFX12-SDAG-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4
; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0
; GFX12-SDAG-NEXT: s_cbranch_scc0 .LBB132_1
; GFX12-SDAG-NEXT: ; %bb.2: ; %bb2
; GFX12-SDAG-NEXT: s_endpgm
@ -4790,16 +4785,15 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
; GFX12-GISEL-LABEL: global_addr_64bit_lsr_iv:
; GFX12-GISEL: ; %bb.0: ; %bb
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX12-GISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x100
; GFX12-GISEL-NEXT: .LBB132_1: ; %bb3
; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: s_add_co_u32 s4, s2, s0
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s5, s3, s1
; GFX12-GISEL-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
; GFX12-GISEL-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
; GFX12-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_cmp_eq_u32 s0, 0x400
; GFX12-GISEL-NEXT: s_add_co_i32 s0, s0, -1
; GFX12-GISEL-NEXT: s_add_co_u32 s2, s2, 4
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s3, s3, 0
; GFX12-GISEL-NEXT: s_cmp_eq_u32 s0, 0
; GFX12-GISEL-NEXT: s_cbranch_scc0 .LBB132_1
; GFX12-GISEL-NEXT: ; %bb.2: ; %bb2
; GFX12-GISEL-NEXT: s_endpgm
@ -4824,20 +4818,18 @@ bb3: ; preds = %bb3, %bb
define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg %arg, ptr addrspace(1) inreg %arg.1) {
; GFX9-LABEL: global_addr_64bit_lsr_iv_multiload:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: s_movk_i32 s0, 0x100
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: .LBB133_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_add_u32 s4, s2, s0
; GFX9-NEXT: s_addc_u32 s5, s3, s1
; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_add_u32 s0, s0, 4
; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX9-NEXT: ; kill: killed $sgpr4 killed $sgpr5
; GFX9-NEXT: s_add_i32 s0, s0, -1
; GFX9-NEXT: s_add_u32 s2, s2, 4
; GFX9-NEXT: s_addc_u32 s3, s3, 0
; GFX9-NEXT: s_cmp_eq_u32 s0, 0
; GFX9-NEXT: s_cbranch_scc0 .LBB133_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
@ -4845,20 +4837,18 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg
; GFX10-LABEL: global_addr_64bit_lsr_iv_multiload:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: s_movk_i32 s0, 0x100
; GFX10-NEXT: .LBB133_1: ; %bb3
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_add_i32 s0, s0, -1
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_add_u32 s4, s2, s0
; GFX10-NEXT: s_addc_u32 s5, s3, s1
; GFX10-NEXT: s_add_u32 s0, s0, 4
; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX10-NEXT: ; kill: killed $sgpr4 killed $sgpr5
; GFX10-NEXT: s_add_u32 s2, s2, 4
; GFX10-NEXT: s_addc_u32 s3, s3, 0
; GFX10-NEXT: s_cmp_eq_u32 s0, 0
; GFX10-NEXT: s_cbranch_scc0 .LBB133_1
; GFX10-NEXT: ; %bb.2: ; %bb2
; GFX10-NEXT: s_endpgm
@ -4866,19 +4856,17 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg
; GFX11-LABEL: global_addr_64bit_lsr_iv_multiload:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_movk_i32 s0, 0x100
; GFX11-NEXT: .LBB133_1: ; %bb3
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_u32 s4, s2, s0
; GFX11-NEXT: s_addc_u32 s5, s3, s1
; GFX11-NEXT: s_add_u32 s0, s0, 4
; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_addc_u32 s1, s1, 0
; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX11-NEXT: s_add_i32 s0, s0, -1
; GFX11-NEXT: s_add_u32 s2, s2, 4
; GFX11-NEXT: s_addc_u32 s3, s3, 0
; GFX11-NEXT: s_cmp_eq_u32 s0, 0
; GFX11-NEXT: s_cbranch_scc0 .LBB133_1
; GFX11-NEXT: ; %bb.2: ; %bb2
; GFX11-NEXT: s_endpgm
@ -4886,17 +4874,16 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg
; GFX12-SDAG-LABEL: global_addr_64bit_lsr_iv_multiload:
; GFX12-SDAG: ; %bb.0: ; %bb
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX12-SDAG-NEXT: s_mov_b64 s[0:1], 0
; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x100
; GFX12-SDAG-NEXT: .LBB133_1: ; %bb3
; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400
; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -1
; GFX12-SDAG-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4
; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0
; GFX12-SDAG-NEXT: s_cbranch_scc0 .LBB133_1
; GFX12-SDAG-NEXT: ; %bb.2: ; %bb2
; GFX12-SDAG-NEXT: s_endpgm
@ -4904,18 +4891,17 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg
; GFX12-GISEL-LABEL: global_addr_64bit_lsr_iv_multiload:
; GFX12-GISEL: ; %bb.0: ; %bb
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX12-GISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x100
; GFX12-GISEL-NEXT: .LBB133_1: ; %bb3
; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: s_add_co_u32 s4, s2, s0
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s5, s3, s1
; GFX12-GISEL-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
; GFX12-GISEL-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
; GFX12-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
; GFX12-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_cmp_eq_u32 s0, 0x400
; GFX12-GISEL-NEXT: s_add_co_i32 s0, s0, -1
; GFX12-GISEL-NEXT: s_add_co_u32 s2, s2, 4
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s3, s3, 0
; GFX12-GISEL-NEXT: s_cmp_eq_u32 s0, 0
; GFX12-GISEL-NEXT: s_cbranch_scc0 .LBB133_1
; GFX12-GISEL-NEXT: ; %bb.2: ; %bb2
; GFX12-GISEL-NEXT: s_endpgm

View File

@ -8,44 +8,39 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0
; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: s_sub_i32 s4, 0, s6
; GFX9-NEXT: s_sub_i32 s2, 0, s6
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: s_mul_i32 s4, s4, s5
; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
; GFX9-NEXT: s_add_i32 s8, s5, s4
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-NEXT: s_mul_i32 s2, s2, s4
; GFX9-NEXT: s_mul_hi_u32 s2, s4, s2
; GFX9-NEXT: s_add_i32 s4, s4, s2
; GFX9-NEXT: s_mov_b32 s2, s3
; GFX9-NEXT: .LBB0_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_not_b32 s10, s5
; GFX9-NEXT: s_mul_i32 s9, s6, s5
; GFX9-NEXT: s_mul_i32 s10, s6, s10
; GFX9-NEXT: s_add_i32 s11, s5, 1
; GFX9-NEXT: s_sub_i32 s9, s7, s9
; GFX9-NEXT: s_add_i32 s10, s7, s10
; GFX9-NEXT: s_cmp_ge_u32 s9, s6
; GFX9-NEXT: s_cselect_b32 s11, s11, s5
; GFX9-NEXT: s_cselect_b32 s9, s10, s9
; GFX9-NEXT: s_add_i32 s10, s11, 1
; GFX9-NEXT: s_cmp_ge_u32 s9, s6
; GFX9-NEXT: s_cselect_b32 s9, s10, s11
; GFX9-NEXT: s_add_u32 s10, s0, s2
; GFX9-NEXT: s_addc_u32 s11, s1, s3
; GFX9-NEXT: s_add_i32 s7, s7, 1
; GFX9-NEXT: s_add_u32 s4, s4, s8
; GFX9-NEXT: s_addc_u32 s5, s5, 0
; GFX9-NEXT: s_add_u32 s2, s2, 4
; GFX9-NEXT: s_addc_u32 s3, s3, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x1000
; GFX9-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-NEXT: s_mul_hi_u32 s5, s2, s4
; GFX9-NEXT: s_mul_i32 s7, s5, s6
; GFX9-NEXT: s_sub_i32 s7, s2, s7
; GFX9-NEXT: s_add_i32 s8, s5, 1
; GFX9-NEXT: s_sub_i32 s9, s7, s6
; GFX9-NEXT: s_cmp_ge_u32 s7, s6
; GFX9-NEXT: s_cselect_b32 s5, s8, s5
; GFX9-NEXT: s_cselect_b32 s7, s9, s7
; GFX9-NEXT: s_add_i32 s8, s5, 1
; GFX9-NEXT: s_cmp_ge_u32 s7, s6
; GFX9-NEXT: s_cselect_b32 s5, s8, s5
; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], 2
; GFX9-NEXT: s_add_u32 s8, s0, s8
; GFX9-NEXT: s_addc_u32 s9, s1, s9
; GFX9-NEXT: s_add_i32 s2, s2, 1
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x400
; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
; GFX9-NEXT: s_cbranch_scc0 .LBB0_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
@ -55,45 +50,40 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT: s_mov_b32 s7, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX10-NEXT: s_sub_i32 s2, 0, s6
; GFX10-NEXT: s_sub_i32 s3, 0, s6
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT: v_readfirstlane_b32 s4, v0
; GFX10-NEXT: v_readfirstlane_b32 s2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_mul_i32 s2, s2, s4
; GFX10-NEXT: s_mul_hi_u32 s5, s4, s2
; GFX10-NEXT: s_mov_b64 s[2:3], 0
; GFX10-NEXT: s_add_i32 s8, s4, s5
; GFX10-NEXT: s_mov_b64 s[4:5], 0
; GFX10-NEXT: s_mul_i32 s3, s3, s2
; GFX10-NEXT: s_mul_hi_u32 s4, s2, s3
; GFX10-NEXT: s_mov_b32 s3, 0
; GFX10-NEXT: s_add_i32 s4, s2, s4
; GFX10-NEXT: s_mov_b32 s2, s3
; GFX10-NEXT: .LBB0_1: ; %bb3
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_mul_hi_u32 s5, s2, s4
; GFX10-NEXT: s_mul_i32 s7, s5, s6
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_not_b32 s10, s5
; GFX10-NEXT: s_mul_i32 s9, s6, s5
; GFX10-NEXT: s_mul_i32 s10, s6, s10
; GFX10-NEXT: s_sub_i32 s9, s7, s9
; GFX10-NEXT: s_add_i32 s11, s5, 1
; GFX10-NEXT: s_add_i32 s10, s7, s10
; GFX10-NEXT: s_cmp_ge_u32 s9, s6
; GFX10-NEXT: s_cselect_b32 s11, s11, s5
; GFX10-NEXT: s_cselect_b32 s9, s10, s9
; GFX10-NEXT: s_add_i32 s10, s11, 1
; GFX10-NEXT: s_cmp_ge_u32 s9, s6
; GFX10-NEXT: s_cselect_b32 s9, s10, s11
; GFX10-NEXT: s_add_u32 s10, s0, s2
; GFX10-NEXT: s_addc_u32 s11, s1, s3
; GFX10-NEXT: s_add_i32 s7, s7, 1
; GFX10-NEXT: s_add_u32 s4, s4, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: s_addc_u32 s5, s5, 0
; GFX10-NEXT: s_add_u32 s2, s2, 4
; GFX10-NEXT: s_addc_u32 s3, s3, 0
; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x1000
; GFX10-NEXT: global_store_dword v0, v1, s[10:11]
; GFX10-NEXT: s_add_i32 s8, s5, 1
; GFX10-NEXT: s_sub_i32 s7, s2, s7
; GFX10-NEXT: s_sub_i32 s9, s7, s6
; GFX10-NEXT: s_cmp_ge_u32 s7, s6
; GFX10-NEXT: s_cselect_b32 s5, s8, s5
; GFX10-NEXT: s_cselect_b32 s7, s9, s7
; GFX10-NEXT: s_add_i32 s8, s5, 1
; GFX10-NEXT: s_cmp_ge_u32 s7, s6
; GFX10-NEXT: s_cselect_b32 s5, s8, s5
; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], 2
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: s_add_u32 s8, s0, s8
; GFX10-NEXT: s_addc_u32 s9, s1, s9
; GFX10-NEXT: s_add_i32 s2, s2, 1
; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x400
; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
; GFX10-NEXT: s_cbranch_scc0 .LBB0_1
; GFX10-NEXT: ; %bb.2: ; %bb2
; GFX10-NEXT: s_endpgm
@ -103,49 +93,46 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_mov_b32 s7, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX11-NEXT: s_sub_i32 s2, 0, s6
; GFX11-NEXT: s_sub_i32 s3, 0, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_readfirstlane_b32 s2, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_mul_i32 s2, s2, s4
; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2
; GFX11-NEXT: s_mov_b64 s[2:3], 0
; GFX11-NEXT: s_add_i32 s8, s4, s5
; GFX11-NEXT: s_mov_b64 s[4:5], 0
; GFX11-NEXT: s_mul_i32 s3, s3, s2
; GFX11-NEXT: s_mul_hi_u32 s4, s2, s3
; GFX11-NEXT: s_mov_b32 s3, 0
; GFX11-NEXT: s_add_i32 s4, s2, s4
; GFX11-NEXT: s_mov_b32 s2, s3
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB0_1: ; %bb3
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_mul_hi_u32 s5, s2, s4
; GFX11-NEXT: s_mul_i32 s7, s5, s6
; GFX11-NEXT: s_add_i32 s8, s5, 1
; GFX11-NEXT: s_sub_i32 s7, s2, s7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_not_b32 s10, s5
; GFX11-NEXT: s_mul_i32 s9, s6, s5
; GFX11-NEXT: s_mul_i32 s10, s6, s10
; GFX11-NEXT: s_sub_i32 s9, s7, s9
; GFX11-NEXT: s_add_i32 s11, s5, 1
; GFX11-NEXT: s_add_i32 s10, s7, s10
; GFX11-NEXT: s_cmp_ge_u32 s9, s6
; GFX11-NEXT: s_cselect_b32 s11, s11, s5
; GFX11-NEXT: s_cselect_b32 s9, s10, s9
; GFX11-NEXT: s_add_i32 s10, s11, 1
; GFX11-NEXT: s_cmp_ge_u32 s9, s6
; GFX11-NEXT: s_cselect_b32 s9, s10, s11
; GFX11-NEXT: s_add_u32 s10, s0, s2
; GFX11-NEXT: s_addc_u32 s11, s1, s3
; GFX11-NEXT: s_add_i32 s7, s7, 1
; GFX11-NEXT: s_add_u32 s4, s4, s8
; GFX11-NEXT: v_mov_b32_e32 v1, s9
; GFX11-NEXT: s_addc_u32 s5, s5, 0
; GFX11-NEXT: s_add_u32 s2, s2, 4
; GFX11-NEXT: s_addc_u32 s3, s3, 0
; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000
; GFX11-NEXT: global_store_b32 v0, v1, s[10:11]
; GFX11-NEXT: s_sub_i32 s9, s7, s6
; GFX11-NEXT: s_cmp_ge_u32 s7, s6
; GFX11-NEXT: s_cselect_b32 s5, s8, s5
; GFX11-NEXT: s_cselect_b32 s7, s9, s7
; GFX11-NEXT: s_add_i32 s8, s5, 1
; GFX11-NEXT: s_cmp_ge_u32 s7, s6
; GFX11-NEXT: s_cselect_b32 s5, s8, s5
; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], 2
; GFX11-NEXT: v_mov_b32_e32 v1, s5
; GFX11-NEXT: s_add_u32 s8, s0, s8
; GFX11-NEXT: s_addc_u32 s9, s1, s9
; GFX11-NEXT: s_add_i32 s2, s2, 1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x400
; GFX11-NEXT: global_store_b32 v0, v1, s[8:9]
; GFX11-NEXT: s_cbranch_scc0 .LBB0_1
; GFX11-NEXT: ; %bb.2: ; %bb2
; GFX11-NEXT: s_endpgm
@ -171,42 +158,37 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0
; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: s_sub_i32 s4, 0, s6
; GFX9-NEXT: s_sub_i32 s2, 0, s6
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: s_mul_i32 s4, s4, s5
; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
; GFX9-NEXT: s_add_i32 s8, s5, s4
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-NEXT: s_mul_i32 s2, s2, s4
; GFX9-NEXT: s_mul_hi_u32 s2, s4, s2
; GFX9-NEXT: s_add_i32 s4, s4, s2
; GFX9-NEXT: s_mov_b32 s2, s3
; GFX9-NEXT: .LBB1_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_not_b32 s10, s5
; GFX9-NEXT: s_mul_i32 s9, s6, s5
; GFX9-NEXT: s_mul_i32 s10, s6, s10
; GFX9-NEXT: s_sub_i32 s9, s7, s9
; GFX9-NEXT: s_add_i32 s10, s7, s10
; GFX9-NEXT: s_cmp_ge_u32 s9, s6
; GFX9-NEXT: s_cselect_b32 s9, s10, s9
; GFX9-NEXT: s_sub_i32 s10, s9, s6
; GFX9-NEXT: s_cmp_ge_u32 s9, s6
; GFX9-NEXT: s_cselect_b32 s9, s10, s9
; GFX9-NEXT: s_add_u32 s10, s0, s2
; GFX9-NEXT: s_addc_u32 s11, s1, s3
; GFX9-NEXT: s_add_i32 s7, s7, 1
; GFX9-NEXT: s_add_u32 s4, s4, s8
; GFX9-NEXT: s_addc_u32 s5, s5, 0
; GFX9-NEXT: s_add_u32 s2, s2, 4
; GFX9-NEXT: s_addc_u32 s3, s3, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x1000
; GFX9-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-NEXT: s_mul_hi_u32 s5, s2, s4
; GFX9-NEXT: s_mul_i32 s5, s5, s6
; GFX9-NEXT: s_sub_i32 s5, s2, s5
; GFX9-NEXT: s_sub_i32 s7, s5, s6
; GFX9-NEXT: s_cmp_ge_u32 s5, s6
; GFX9-NEXT: s_cselect_b32 s5, s7, s5
; GFX9-NEXT: s_sub_i32 s7, s5, s6
; GFX9-NEXT: s_cmp_ge_u32 s5, s6
; GFX9-NEXT: s_cselect_b32 s5, s7, s5
; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], 2
; GFX9-NEXT: s_add_u32 s8, s0, s8
; GFX9-NEXT: s_addc_u32 s9, s1, s9
; GFX9-NEXT: s_add_i32 s2, s2, 1
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x400
; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
; GFX9-NEXT: s_cbranch_scc0 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
@ -216,43 +198,38 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT: s_mov_b32 s7, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX10-NEXT: s_sub_i32 s2, 0, s6
; GFX10-NEXT: s_sub_i32 s3, 0, s6
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT: v_readfirstlane_b32 s4, v0
; GFX10-NEXT: v_readfirstlane_b32 s2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_mul_i32 s2, s2, s4
; GFX10-NEXT: s_mul_hi_u32 s5, s4, s2
; GFX10-NEXT: s_mov_b64 s[2:3], 0
; GFX10-NEXT: s_add_i32 s8, s4, s5
; GFX10-NEXT: s_mov_b64 s[4:5], 0
; GFX10-NEXT: s_mul_i32 s3, s3, s2
; GFX10-NEXT: s_mul_hi_u32 s4, s2, s3
; GFX10-NEXT: s_mov_b32 s3, 0
; GFX10-NEXT: s_add_i32 s4, s2, s4
; GFX10-NEXT: s_mov_b32 s2, s3
; GFX10-NEXT: .LBB1_1: ; %bb3
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_not_b32 s9, s5
; GFX10-NEXT: s_mul_hi_u32 s5, s2, s4
; GFX10-NEXT: s_mul_i32 s5, s5, s6
; GFX10-NEXT: s_sub_i32 s5, s2, s5
; GFX10-NEXT: s_sub_i32 s7, s5, s6
; GFX10-NEXT: s_cmp_ge_u32 s5, s6
; GFX10-NEXT: s_cselect_b32 s5, s7, s5
; GFX10-NEXT: s_sub_i32 s7, s5, s6
; GFX10-NEXT: s_cmp_ge_u32 s5, s6
; GFX10-NEXT: s_cselect_b32 s5, s7, s5
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
; GFX10-NEXT: s_mul_i32 s10, s6, s5
; GFX10-NEXT: s_mul_i32 s9, s6, s9
; GFX10-NEXT: s_sub_i32 s10, s7, s10
; GFX10-NEXT: s_add_i32 s9, s7, s9
; GFX10-NEXT: s_cmp_ge_u32 s10, s6
; GFX10-NEXT: s_cselect_b32 s9, s9, s10
; GFX10-NEXT: s_sub_i32 s10, s9, s6
; GFX10-NEXT: s_cmp_ge_u32 s9, s6
; GFX10-NEXT: s_cselect_b32 s9, s10, s9
; GFX10-NEXT: s_add_u32 s10, s0, s2
; GFX10-NEXT: s_addc_u32 s11, s1, s3
; GFX10-NEXT: s_add_i32 s7, s7, 1
; GFX10-NEXT: s_add_u32 s4, s4, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: s_addc_u32 s5, s5, 0
; GFX10-NEXT: s_add_u32 s2, s2, 4
; GFX10-NEXT: s_addc_u32 s3, s3, 0
; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x1000
; GFX10-NEXT: global_store_dword v0, v1, s[10:11]
; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], 2
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: s_add_u32 s8, s0, s8
; GFX10-NEXT: s_addc_u32 s9, s1, s9
; GFX10-NEXT: s_add_i32 s2, s2, 1
; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x400
; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
; GFX10-NEXT: s_cbranch_scc0 .LBB1_1
; GFX10-NEXT: ; %bb.2: ; %bb2
; GFX10-NEXT: s_endpgm
@ -262,48 +239,45 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_mov_b32 s7, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX11-NEXT: s_sub_i32 s2, 0, s6
; GFX11-NEXT: s_sub_i32 s3, 0, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_readfirstlane_b32 s2, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_mul_i32 s2, s2, s4
; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2
; GFX11-NEXT: s_mov_b64 s[2:3], 0
; GFX11-NEXT: s_add_i32 s8, s4, s5
; GFX11-NEXT: s_mov_b64 s[4:5], 0
; GFX11-NEXT: s_mul_i32 s3, s3, s2
; GFX11-NEXT: s_mul_hi_u32 s4, s2, s3
; GFX11-NEXT: s_mov_b32 s3, 0
; GFX11-NEXT: s_add_i32 s4, s2, s4
; GFX11-NEXT: s_mov_b32 s2, s3
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB1_1: ; %bb3
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_mul_hi_u32 s5, s2, s4
; GFX11-NEXT: s_mul_i32 s5, s5, s6
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_sub_i32 s5, s2, s5
; GFX11-NEXT: s_sub_i32 s7, s5, s6
; GFX11-NEXT: s_cmp_ge_u32 s5, s6
; GFX11-NEXT: s_cselect_b32 s5, s7, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_not_b32 s9, s5
; GFX11-NEXT: s_mul_i32 s10, s6, s5
; GFX11-NEXT: s_mul_i32 s9, s6, s9
; GFX11-NEXT: s_sub_i32 s10, s7, s10
; GFX11-NEXT: s_add_i32 s9, s7, s9
; GFX11-NEXT: s_cmp_ge_u32 s10, s6
; GFX11-NEXT: s_cselect_b32 s9, s9, s10
; GFX11-NEXT: s_sub_i32 s7, s5, s6
; GFX11-NEXT: s_cmp_ge_u32 s5, s6
; GFX11-NEXT: s_cselect_b32 s5, s7, s5
; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], 2
; GFX11-NEXT: v_mov_b32_e32 v1, s5
; GFX11-NEXT: s_add_u32 s8, s0, s8
; GFX11-NEXT: s_addc_u32 s9, s1, s9
; GFX11-NEXT: s_add_i32 s2, s2, 1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_sub_i32 s10, s9, s6
; GFX11-NEXT: s_cmp_ge_u32 s9, s6
; GFX11-NEXT: s_cselect_b32 s9, s10, s9
; GFX11-NEXT: s_add_u32 s10, s0, s2
; GFX11-NEXT: s_addc_u32 s11, s1, s3
; GFX11-NEXT: s_add_i32 s7, s7, 1
; GFX11-NEXT: s_add_u32 s4, s4, s8
; GFX11-NEXT: v_mov_b32_e32 v1, s9
; GFX11-NEXT: s_addc_u32 s5, s5, 0
; GFX11-NEXT: s_add_u32 s2, s2, 4
; GFX11-NEXT: s_addc_u32 s3, s3, 0
; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000
; GFX11-NEXT: global_store_b32 v0, v1, s[10:11]
; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x400
; GFX11-NEXT: global_store_b32 v0, v1, s[8:9]
; GFX11-NEXT: s_cbranch_scc0 .LBB1_1
; GFX11-NEXT: ; %bb.2: ; %bb2
; GFX11-NEXT: s_endpgm

View File

@ -0,0 +1,42 @@
; RUN: llc -O3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
; Reduced from rocrand's threefry2x32_20 kernel.
; The AMDGPU LSR cost model should avoid creating a redundant VGPR induction
; variable when the loop already has a vector IV incremented by a uniform
; (SGPR) stride. Without the cost model fix, LSR introduces a second v_add
; in the loop body, wasting a VGPR and a VALU slot every iteration.
declare i32 @llvm.amdgcn.workitem.id.x() #0
; CHECK-LABEL: {{^}}lsr_vector_iv_cost:
; The loop must contain exactly one VALU add -- the single vector IV update.
; A second v_add_u32 here would mean LSR created a redundant IV.
; CHECK: {{^}}.LBB0_1:
; CHECK: v_add_u32
; CHECK-NOT: v_add_u32
; CHECK: s_cbranch
; Kernel under test: one loop whose vector IV (%iv.vec) starts at the workitem
; id and advances by the kernel argument %stride on every iteration. After the
; loop, the %or value computed in the final iteration is stored to %out.
define amdgpu_kernel void @lsr_vector_iv_cost(<2 x i32> %arg0, i32 %stride, ptr addrspace(1) %out) {
entry:
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %loop
loop:
; %iv.pn carries the previous iteration's %or; its only use here is %shr.
%iv.pn = phi i32 [ 0, %entry ], [ %or, %loop ]
; Vector IV: %tid on entry, %sum1 (= %iv.vec + %stride) on the backedge.
%iv.vec = phi i32 [ %tid, %entry ], [ %sum1, %loop ]
%sum1 = add i32 %iv.vec, %stride
; Element 0 of %arg0 is the same value on every iteration; it is added twice
; in the chain below.
%elt = extractelement <2 x i32> %arg0, i64 0
%sum2 = add i32 %sum1, %elt
%xor = xor i32 1, %sum2
%sum3 = add i32 %sum2, %xor
%sum4 = add i32 %sum3, %elt
%or = or i32 %sum4, %stride
; %shr has no users in this function -- presumably a leftover of the test
; reduction; NOTE(review): confirm it is still needed to reproduce the issue.
%shr = lshr i32 %iv.pn, 1
; Stay in the loop while the updated IV is unsigned-less-than 1024.
%cmp = icmp ult i32 %sum1, 1024
br i1 %cmp, label %loop, label %exit
exit:
store i32 %or, ptr addrspace(1) %out
ret void
}
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

View File

@ -19,11 +19,10 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
; CHECK-NEXT: .LBB0_1: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: v_add_nc_u32_e32 v3, -4, v3
; CHECK-NEXT: .LBB0_2: ; %Flow1
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
; CHECK-NEXT: v_cmp_ne_u32_e64 s5, 0, v1
; CHECK-NEXT: v_cmp_ne_u32_e64 s5, 0, v3
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; j lastloop entry
; CHECK-NEXT: ;;#ASMEND
@ -33,40 +32,41 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
; CHECK-NEXT: .LBB0_3: ; %for.body33
; CHECK-NEXT: ; =>This Loop Header: Depth=1
; CHECK-NEXT: ; Child Loop BB0_6 Depth 2
; CHECK-NEXT: v_mov_b32_e32 v3, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mov_b32_e32 v3, 0
; CHECK-NEXT: s_and_saveexec_b32 s7, s4
; CHECK-NEXT: s_cbranch_execz .LBB0_2
; CHECK-NEXT: ; %bb.4: ; %for.body51.preheader
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: v_mov_b32_e32 v4, v2
; CHECK-NEXT: s_mov_b32 s9, 4
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: s_mov_b32 s9, 0
; CHECK-NEXT: s_branch .LBB0_6
; CHECK-NEXT: .p2align 6
; CHECK-NEXT: .LBB0_5: ; %if.end118
; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
; CHECK-NEXT: v_add_nc_u32_e32 v4, 4, v1
; CHECK-NEXT: s_add_i32 s9, s9, 4
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; backedge
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_add_nc_u32_e32 v3, s9, v2
; CHECK-NEXT: v_cmp_ge_u32_e64 s5, v3, v0
; CHECK-NEXT: v_cmp_ge_u32_e64 s5, v4, v0
; CHECK-NEXT: s_or_b32 s8, s5, s8
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; CHECK-NEXT: s_cbranch_execz .LBB0_1
; CHECK-NEXT: .LBB0_6: ; %for.body51
; CHECK-NEXT: ; Parent Loop BB0_3 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: v_mov_b32_e32 v1, v4
; CHECK-NEXT: v_mov_b32_e32 v3, 1
; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB0_5
; CHECK-NEXT: ; %bb.7: ; %if.then112
; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2
; CHECK-NEXT: s_add_i32 s10, s9, 4
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mov_b32_e32 v3, s10
; CHECK-NEXT: ds_write_b32 v1, v3
; CHECK-NEXT: v_mov_b32_e32 v3, 0
; CHECK-NEXT: v_mov_b32_e32 v4, s9
; CHECK-NEXT: ds_write_b32 v3, v4
; CHECK-NEXT: s_branch .LBB0_5
; CHECK-NEXT: .LBB0_8: ; %for.body159.preheader
; CHECK-NEXT: s_inst_prefetch 0x2

View File

@ -62,7 +62,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
; CHECK-NEXT: v_mov_b32_e32 v45, 0
; CHECK-NEXT: v_mov_b32_e32 v46, 0
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_mov_b32_e32 v43, v0
; CHECK-NEXT: v_mov_b32_e32 v31, v40
@ -91,7 +91,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_mov_b32 s12, s51
; CHECK-NEXT: s_mov_b32 s13, s50
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: ds_write_b32 v45, v45 offset:15360
; CHECK-NEXT: ds_write_b32 v46, v46 offset:15360
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v43
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v43
@ -118,69 +118,66 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_cmpx_ne_u32_e32 0, v42
; CHECK-NEXT: s_cbranch_execz .LBB0_25
; CHECK-NEXT: ; %bb.1: ; %.preheader5
; CHECK-NEXT: v_mul_lo_u32 v0, v41, 14
; CHECK-NEXT: v_mul_lo_u32 v44, v41, 14
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: s_mov_b32 s5, 0
; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v0
; CHECK-NEXT: v_add_nc_u32_e32 v45, 0x3c04, v44
; CHECK-NEXT: .LBB0_2: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_add_nc_u32_e32 v1, s5, v44
; CHECK-NEXT: v_add_nc_u32_e32 v0, s5, v45
; CHECK-NEXT: s_add_i32 s5, s5, 1
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v42
; CHECK-NEXT: ds_write_b8 v1, v45
; CHECK-NEXT: ds_write_b8 v0, v46
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB0_2
; CHECK-NEXT: ; %bb.3:
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v42
; CHECK-NEXT: s_mov_b32 s55, 0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v45
; CHECK-NEXT: v_add_nc_u32_e32 v46, -1, v42
; CHECK-NEXT: s_mov_b32 s53, 0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v46
; CHECK-NEXT: s_and_b32 exec_lo, exec_lo, vcc_lo
; CHECK-NEXT: s_cbranch_execz .LBB0_25
; CHECK-NEXT: ; %bb.4:
; CHECK-NEXT: v_lshlrev_b32_e32 v43, 10, v43
; CHECK-NEXT: v_add_nc_u32_e32 v46, 0x3c05, v0
; CHECK-NEXT: v_mov_b32_e32 v47, 0
; CHECK-NEXT: s_mov_b32 s53, 0
; CHECK-NEXT: s_mov_b32 s54, 0
; CHECK-NEXT: .LBB0_5: ; =>This Loop Header: Depth=1
; CHECK-NEXT: ; Child Loop BB0_8 Depth 2
; CHECK-NEXT: ; Child Loop BB0_20 Depth 2
; CHECK-NEXT: v_add_nc_u32_e32 v0, s55, v44
; CHECK-NEXT: s_lshl_b32 s4, s55, 5
; CHECK-NEXT: s_add_i32 s54, s55, 1
; CHECK-NEXT: s_add_i32 s5, s55, 5
; CHECK-NEXT: v_or3_b32 v57, s4, v43, s54
; CHECK-NEXT: s_mov_b32 s4, s53
; CHECK-NEXT: s_lshl_b32 s5, s53, 5
; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v45
; CHECK-NEXT: s_add_i32 s53, s53, 1
; CHECK-NEXT: s_add_i32 s4, s4, 5
; CHECK-NEXT: v_or3_b32 v57, s5, v43, s53
; CHECK-NEXT: v_mov_b32_e32 v58, s53
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: ds_read_u8 v56, v0
; CHECK-NEXT: v_mov_b32_e32 v58, s54
; CHECK-NEXT: s_mov_b32 s68, exec_lo
; CHECK-NEXT: v_cmpx_lt_u32_e64 s5, v42
; CHECK-NEXT: s_mov_b32 s55, exec_lo
; CHECK-NEXT: v_cmpx_lt_u32_e64 s4, v42
; CHECK-NEXT: s_cbranch_execz .LBB0_17
; CHECK-NEXT: ; %bb.6: ; %.preheader2
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_mov_b32 s69, 0
; CHECK-NEXT: s_mov_b32 s80, 0
; CHECK-NEXT: s_mov_b32 s68, 0
; CHECK-NEXT: s_mov_b32 s69, s53
; CHECK-NEXT: s_branch .LBB0_8
; CHECK-NEXT: .LBB0_7: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81
; CHECK-NEXT: s_add_i32 s80, s80, 4
; CHECK-NEXT: s_add_i32 s4, s55, s80
; CHECK-NEXT: v_add_nc_u32_e32 v0, s80, v57
; CHECK-NEXT: s_add_i32 s5, s4, 5
; CHECK-NEXT: s_add_i32 s4, s4, 1
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s5, v42
; CHECK-NEXT: v_mov_b32_e32 v58, s4
; CHECK-NEXT: s_or_b32 s69, vcc_lo, s69
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s69
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s80
; CHECK-NEXT: s_add_i32 s4, s69, 4
; CHECK-NEXT: v_add_nc_u32_e32 v57, 4, v57
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s4, v42
; CHECK-NEXT: v_mov_b32_e32 v58, s69
; CHECK-NEXT: s_or_b32 s68, vcc_lo, s68
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s68
; CHECK-NEXT: s_cbranch_execz .LBB0_16
; CHECK-NEXT: .LBB0_8: ; Parent Loop BB0_5 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
; CHECK-NEXT: v_add_nc_u32_e32 v59, s80, v46
; CHECK-NEXT: v_add_nc_u32_e32 v58, s80, v57
; CHECK-NEXT: ds_read_u8 v0, v59
; CHECK-NEXT: v_add_nc_u32_e32 v58, s69, v45
; CHECK-NEXT: s_mov_b32 s69, s4
; CHECK-NEXT: ds_read_u8 v0, v58
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: s_and_saveexec_b32 s81, s4
; CHECK-NEXT: v_cmp_eq_u16_sdwa s5, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: s_and_saveexec_b32 s80, s5
; CHECK-NEXT: s_cbranch_execz .LBB0_10
; CHECK-NEXT: ; %bb.9: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: v_mov_b32_e32 v31, v40
@ -199,13 +196,13 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v58
; CHECK-NEXT: ds_write_b32 v0, v57
; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81
; CHECK-NEXT: ds_read_u8 v0, v59 offset:1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s80
; CHECK-NEXT: ds_read_u8 v0, v58 offset:1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: s_and_saveexec_b32 s81, s4
; CHECK-NEXT: s_and_saveexec_b32 s80, s4
; CHECK-NEXT: s_cbranch_execz .LBB0_12
; CHECK-NEXT: ; %bb.11: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: v_mov_b32_e32 v31, v40
@ -221,17 +218,17 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_mov_b32 s12, s51
; CHECK-NEXT: s_mov_b32 s13, s50
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_add_nc_u32_e32 v60, 1, v58
; CHECK-NEXT: v_add_nc_u32_e32 v59, 1, v57
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v60
; CHECK-NEXT: ds_write_b32 v0, v59
; CHECK-NEXT: .LBB0_12: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81
; CHECK-NEXT: ds_read_u8 v0, v59 offset:2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s80
; CHECK-NEXT: ds_read_u8 v0, v58 offset:2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: s_and_saveexec_b32 s81, s4
; CHECK-NEXT: s_and_saveexec_b32 s80, s4
; CHECK-NEXT: s_cbranch_execz .LBB0_14
; CHECK-NEXT: ; %bb.13: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: v_mov_b32_e32 v31, v40
@ -247,17 +244,17 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_mov_b32 s12, s51
; CHECK-NEXT: s_mov_b32 s13, s50
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_add_nc_u32_e32 v60, 2, v58
; CHECK-NEXT: v_add_nc_u32_e32 v59, 2, v57
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v60
; CHECK-NEXT: ds_write_b32 v0, v59
; CHECK-NEXT: .LBB0_14: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81
; CHECK-NEXT: ds_read_u8 v0, v59 offset:3
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s80
; CHECK-NEXT: ds_read_u8 v0, v58 offset:3
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: s_and_saveexec_b32 s81, s4
; CHECK-NEXT: s_and_saveexec_b32 s80, s4
; CHECK-NEXT: s_cbranch_execz .LBB0_7
; CHECK-NEXT: ; %bb.15: ; in Loop: Header=BB0_8 Depth=2
; CHECK-NEXT: v_mov_b32_e32 v31, v40
@ -273,19 +270,18 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_mov_b32 s12, s51
; CHECK-NEXT: s_mov_b32 s13, s50
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: v_add_nc_u32_e32 v58, 3, v58
; CHECK-NEXT: v_add_nc_u32_e32 v58, 3, v57
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v58
; CHECK-NEXT: s_branch .LBB0_7
; CHECK-NEXT: .LBB0_16: ; %Flow43
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s69
; CHECK-NEXT: v_mov_b32_e32 v57, v0
; CHECK-NEXT: .LBB0_17: ; %Flow44
; CHECK-NEXT: .LBB0_16: ; %Flow32
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s68
; CHECK-NEXT: .LBB0_17: ; %Flow33
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
; CHECK-NEXT: s_mov_b32 s55, exec_lo
; CHECK-NEXT: v_cmpx_lt_u32_e64 v58, v42
; CHECK-NEXT: s_cbranch_execz .LBB0_23
@ -306,7 +302,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: .LBB0_20: ; Parent Loop BB0_5 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v58
; CHECK-NEXT: ds_read_u8 v0, v0
; CHECK-NEXT: ds_read_u8 v0, v0 offset:15364
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: s_and_saveexec_b32 s69, s4
@ -330,24 +326,22 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v57
; CHECK-NEXT: s_branch .LBB0_19
; CHECK-NEXT: .LBB0_22: ; %Flow41
; CHECK-NEXT: .LBB0_22: ; %Flow30
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_inst_prefetch 0x2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s68
; CHECK-NEXT: .LBB0_23: ; %Flow42
; CHECK-NEXT: .LBB0_23: ; %Flow31
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
; CHECK-NEXT: ; %bb.24: ; in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s54, v45
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s53, v46
; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v47
; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46
; CHECK-NEXT: s_mov_b32 s55, s54
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_and_b32 s4, exec_lo, s4
; CHECK-NEXT: s_or_b32 s53, s4, s53
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s53
; CHECK-NEXT: s_or_b32 s54, s4, s54
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s54
; CHECK-NEXT: s_cbranch_execnz .LBB0_5
; CHECK-NEXT: .LBB0_25: ; %Flow49
; CHECK-NEXT: .LBB0_25: ; %Flow38
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 1
@ -828,7 +822,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: s_mov_b32 s13, s50
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_mul_lo_u32 v46, v0, 14
; CHECK-NEXT: v_mul_lo_u32 v44, v0, 14
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 1
; CHECK-NEXT: s_getpc_b64 s[16:17]
@ -842,7 +836,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: s_mov_b32 s13, s50
; CHECK-NEXT: s_mov_b32 s14, s33
; CHECK-NEXT: ds_write_b32 v43, v43 offset:15360
; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v46
; CHECK-NEXT: v_add_nc_u32_e32 v45, 0x3c04, v44
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v42
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v42
@ -866,52 +860,49 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_mov_b32_e32 v41, v0
; CHECK-NEXT: v_lshlrev_b32_e32 v42, 10, v42
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: s_mov_b32 s53, 0
; CHECK-NEXT: s_mov_b32 s52, 0
; CHECK-NEXT: ds_write_b8 v46, v43 offset:15364
; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v41
; CHECK-NEXT: ds_write_b8 v44, v43 offset:15364
; CHECK-NEXT: v_add_nc_u32_e32 v46, -1, v41
; CHECK-NEXT: .LBB1_1: ; %.37
; CHECK-NEXT: ; =>This Loop Header: Depth=1
; CHECK-NEXT: ; Child Loop BB1_3 Depth 2
; CHECK-NEXT: ; Child Loop BB1_8 Depth 2
; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v44
; CHECK-NEXT: s_lshl_b32 s5, s4, 5
; CHECK-NEXT: s_add_i32 s53, s4, 1
; CHECK-NEXT: s_add_i32 s6, s4, 5
; CHECK-NEXT: v_or3_b32 v47, s5, v42, s53
; CHECK-NEXT: s_mov_b32 s4, s53
; CHECK-NEXT: s_lshl_b32 s6, s53, 5
; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v45
; CHECK-NEXT: s_add_i32 s53, s53, 1
; CHECK-NEXT: s_add_i32 s5, s4, 5
; CHECK-NEXT: v_or3_b32 v56, s6, v42, s53
; CHECK-NEXT: v_mov_b32_e32 v57, s53
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: ds_read_u8 v46, v0
; CHECK-NEXT: v_mov_b32_e32 v56, s53
; CHECK-NEXT: s_mov_b32 s5, exec_lo
; CHECK-NEXT: v_cmpx_lt_u32_e64 s6, v41
; CHECK-NEXT: ds_read_u8 v47, v0
; CHECK-NEXT: s_mov_b32 s4, exec_lo
; CHECK-NEXT: v_cmpx_lt_u32_e64 s5, v41
; CHECK-NEXT: s_cbranch_execz .LBB1_5
; CHECK-NEXT: ; %bb.2: ; %.53.preheader
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_mov_b32 s6, 0
; CHECK-NEXT: s_mov_b32 s7, 0
; CHECK-NEXT: .LBB1_3: ; %.53
; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
; CHECK-NEXT: s_add_i32 s7, s7, 4
; CHECK-NEXT: s_add_i32 s7, s5, 4
; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
; CHECK-NEXT: s_add_i32 s8, s4, s7
; CHECK-NEXT: v_add_nc_u32_e32 v0, s7, v47
; CHECK-NEXT: s_add_i32 s9, s8, 5
; CHECK-NEXT: s_add_i32 s8, s8, 1
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s9, v41
; CHECK-NEXT: v_mov_b32_e32 v56, s8
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s7, v41
; CHECK-NEXT: v_add_nc_u32_e32 v56, 4, v56
; CHECK-NEXT: v_mov_b32_e32 v57, s5
; CHECK-NEXT: s_mov_b32 s5, s7
; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: s_cbranch_execnz .LBB1_3
; CHECK-NEXT: ; %bb.4: ; %Flow3
; CHECK-NEXT: ; %bb.4: ; %Flow4
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
; CHECK-NEXT: v_mov_b32_e32 v47, v0
; CHECK-NEXT: .LBB1_5: ; %Flow4
; CHECK-NEXT: .LBB1_5: ; %Flow5
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_mov_b32 s54, exec_lo
; CHECK-NEXT: v_cmpx_lt_u32_e64 v56, v41
; CHECK-NEXT: v_cmpx_lt_u32_e64 v57, v41
; CHECK-NEXT: s_cbranch_execz .LBB1_11
; CHECK-NEXT: ; %bb.6: ; %.103.preheader
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
@ -922,19 +913,19 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: .LBB1_7: ; %.114
; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s64
; CHECK-NEXT: v_add_nc_u32_e32 v57, 1, v57
; CHECK-NEXT: v_add_nc_u32_e32 v56, 1, v56
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v56, v41
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v57, v41
; CHECK-NEXT: s_or_b32 s55, vcc_lo, s55
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s55
; CHECK-NEXT: s_cbranch_execz .LBB1_10
; CHECK-NEXT: .LBB1_8: ; %.103
; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v56
; CHECK-NEXT: ds_read_u8 v0, v0
; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v57
; CHECK-NEXT: ds_read_u8 v0, v0 offset:15364
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v46, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v47, v0 src0_sel:BYTE_0 src1_sel:DWORD
; CHECK-NEXT: s_and_saveexec_b32 s64, s4
; CHECK-NEXT: s_cbranch_execz .LBB1_7
; CHECK-NEXT: ; %bb.9: ; %.110
@ -955,23 +946,22 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT: ds_write_b32 v0, v47
; CHECK-NEXT: ds_write_b32 v0, v56
; CHECK-NEXT: s_branch .LBB1_7
; CHECK-NEXT: .LBB1_10: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_inst_prefetch 0x2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
; CHECK-NEXT: .LBB1_11: ; %Flow2
; CHECK-NEXT: .LBB1_11: ; %Flow3
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s54
; CHECK-NEXT: ; %bb.12: ; %.32
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s53, v45
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s53, v46
; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v43
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
; CHECK-NEXT: s_and_b32 s4, exec_lo, s4
; CHECK-NEXT: s_or_b32 s52, s4, s52
; CHECK-NEXT: s_mov_b32 s4, s53
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s52
; CHECK-NEXT: s_cbranch_execnz .LBB1_1
; CHECK-NEXT: ; %bb.13: ; %.119

File diff suppressed because it is too large Load Diff

View File

@ -1060,38 +1060,38 @@ define void @memset_pattern_i64_as7_len33_dynval(ptr addrspace(7) inreg align 16
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, v1
; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v2
; GFX942-GISEL-NEXT: v_add_u32_e32 v2, s16, v0
; GFX942-GISEL-NEXT: s_mov_b64 s[4:5], 0
; GFX942-GISEL-NEXT: v_add_u32_e32 v1, s16, v0
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v2
; GFX942-GISEL-NEXT: .LBB14_1: ; %memset.pattern-expansion-main-body
; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 32, v2
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:16
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:32
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:48
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:64
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:80
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:96
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:112
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:128
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:144
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:160
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:176
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:192
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:208
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:224
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:240
; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:16
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:32
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:48
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:64
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:80
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:96
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:112
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:128
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:144
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:160
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:176
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:192
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:208
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:224
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:240
; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX942-GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GFX942-GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GFX942-GISEL-NEXT: v_add_u32_e32 v1, 0x2000, v1
; GFX942-GISEL-NEXT: v_add_u32_e32 v3, 0x2000, v3
; GFX942-GISEL-NEXT: s_and_b64 vcc, exec, s[4:5]
; GFX942-GISEL-NEXT: s_cbranch_vccnz .LBB14_1
; GFX942-GISEL-NEXT: ; %bb.2: ; %memset.pattern-expansion-residual-body.preheader
; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s16, v0
; GFX942-GISEL-NEXT: buffer_store_dwordx2 v[4:5], v0, s[0:3], 0 offen offset:256
; GFX942-GISEL-NEXT: buffer_store_dwordx2 v[4:5], v2, s[0:3], 0 offen offset:256
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31]
%p = getelementptr inbounds i8, ptr addrspace(7) %a, i32 %offset

View File

@ -1140,7 +1140,6 @@ define void @move_to_valu_vgpr_operand_phi(ptr addrspace(3) %out) {
; SI-LABEL: move_to_valu_vgpr_operand_phi:
; SI: ; %bb.0: ; %bb0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, 28, v0
; SI-NEXT: v_mov_b32_e32 v1, 1
; SI-NEXT: s_and_b64 vcc, exec, 0
; SI-NEXT: s_mov_b32 m0, -1
@ -1157,7 +1156,8 @@ define void @move_to_valu_vgpr_operand_phi(ptr addrspace(3) %out) {
; SI-NEXT: s_cbranch_scc1 .LBB20_1
; SI-NEXT: ; %bb.3: ; %bb2
; SI-NEXT: ; in Loop: Header=BB20_2 Depth=1
; SI-NEXT: ds_write_b32 v0, v1
; SI-NEXT: v_add_i32_e64 v2, s[4:5], 28, v0
; SI-NEXT: ds_write_b32 v2, v1
; SI-NEXT: s_mov_b64 vcc, vcc
; SI-NEXT: s_cbranch_vccz .LBB20_1
; SI-NEXT: ; %bb.4: ; %DummyReturnBlock
@ -1167,7 +1167,6 @@ define void @move_to_valu_vgpr_operand_phi(ptr addrspace(3) %out) {
; VI-LABEL: move_to_valu_vgpr_operand_phi:
; VI: ; %bb.0: ; %bb0
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 28, v0
; VI-NEXT: v_mov_b32_e32 v1, 1
; VI-NEXT: s_and_b64 vcc, exec, 0
; VI-NEXT: s_mov_b32 m0, -1
@ -1184,7 +1183,7 @@ define void @move_to_valu_vgpr_operand_phi(ptr addrspace(3) %out) {
; VI-NEXT: s_cbranch_scc1 .LBB20_1
; VI-NEXT: ; %bb.3: ; %bb2
; VI-NEXT: ; in Loop: Header=BB20_2 Depth=1
; VI-NEXT: ds_write_b32 v0, v1
; VI-NEXT: ds_write_b32 v0, v1 offset:28
; VI-NEXT: s_mov_b64 vcc, vcc
; VI-NEXT: s_cbranch_vccz .LBB20_1
; VI-NEXT: ; %bb.4: ; %DummyReturnBlock

View File

@ -247,27 +247,30 @@ define amdgpu_kernel void @simple_test_v_loop(ptr addrspace(1) %dst, ptr addrspa
; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc
; SI-NEXT: s_cbranch_execz .LBB4_3
; SI-NEXT: ; %bb.1: ; %loop.preheader
; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; SI-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s8, 64
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_add_i32_e32 v0, vcc, s8, v0
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_add_i32_e32 v0, vcc, s12, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s10
; SI-NEXT: s_mov_b32 s5, s11
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_mov_b32 s4, s14
; SI-NEXT: s_mov_b32 s5, s15
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: s_mov_b32 s0, s2
; SI-NEXT: s_mov_b32 s1, s2
; SI-NEXT: .LBB4_2: ; %loop
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT: s_add_i32 s8, s8, -1
; SI-NEXT: s_cmp_lg_u32 s8, 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_add_u32 s0, s0, 4
; SI-NEXT: s_addc_u32 s1, s1, 0
; SI-NEXT: s_cmpk_lg_i32 s0, 0x100
; SI-NEXT: v_add_i32_e32 v0, vcc, 4, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: s_cbranch_scc1 .LBB4_2
; SI-NEXT: .LBB4_3: ; %exit
; SI-NEXT: s_endpgm

View File

@ -12,40 +12,39 @@ define amdgpu_kernel void @ds_prefetch_pattern(ptr addrspace(3) %lds, ptr addrsp
; CHECK-NEXT: s_load_b32 s1, s[4:5], 0x0 nv
; CHECK-NEXT: s_load_b32 s0, s[4:5], 0x10 nv
; CHECK-NEXT: v_and_b32_e32 v12, 0x3ff, v0
; CHECK-NEXT: s_wait_kmcnt 0x0
; CHECK-NEXT: v_mov_b32_e32 v4, 0
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
; CHECK-NEXT: v_lshl_add_u32 v1, v12, 8, s1
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
; CHECK-NEXT: v_mov_b32_e32 v7, v4
; CHECK-NEXT: s_wait_kmcnt 0x0
; CHECK-NEXT: v_lshl_add_u32 v13, v12, 8, s1
; CHECK-NEXT: s_mov_b32 s1, 0
; CHECK-NEXT: ds_load_b128 v[4:7], v1
; CHECK-NEXT: ds_load_b128 v[8:11], v1 offset:16
; CHECK-NEXT: v_dual_add_nc_u32 v13, 32, v1 :: v_dual_mov_b32 v1, v0
; CHECK-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v0
; CHECK-NEXT: ds_load_b128 v[8:11], v13
; CHECK-NEXT: ds_load_b128 v[0:3], v13 offset:16
; CHECK-NEXT: s_wait_dscnt 0x0
; CHECK-NEXT: .LBB0_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: s_barrier_signal -1
; CHECK-NEXT: s_wait_dscnt 0x1
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; CHECK-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[6:7]
; CHECK-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[4:5]
; CHECK-NEXT: s_add_co_i32 s1, s1, 1
; CHECK-NEXT: s_wait_dscnt 0x1
; CHECK-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[10:11]
; CHECK-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[8:9]
; CHECK-NEXT: v_lshl_add_u32 v14, s1, 5, v13
; CHECK-NEXT: s_cmp_lt_i32 s1, s0
; CHECK-NEXT: s_wait_dscnt 0x0
; CHECK-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[10:11]
; CHECK-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[8:9]
; CHECK-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
; CHECK-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[0:1]
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; CHECK-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[2:3]
; CHECK-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[0:1]
; CHECK-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[6:7]
; CHECK-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[4:5]
; CHECK-NEXT: s_barrier_wait -1
; CHECK-NEXT: ds_load_b128 v[4:7], v13
; CHECK-NEXT: ds_load_b128 v[8:11], v13 offset:16
; CHECK-NEXT: v_add_nc_u32_e32 v13, 32, v13
; CHECK-NEXT: ds_load_b128 v[8:11], v14
; CHECK-NEXT: ds_load_b128 v[0:3], v14 offset:16
; CHECK-NEXT: s_cbranch_scc1 .LBB0_1
; CHECK-NEXT: ; %bb.2: ; %exit
; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
; CHECK-NEXT: s_wait_kmcnt 0x0
; CHECK-NEXT: global_store_b128 v12, v[0:3], s[0:1] scale_offset
; CHECK-NEXT: global_store_b128 v12, v[4:7], s[0:1] scale_offset
; CHECK-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()

View File

@ -4,17 +4,15 @@
; OPT-LABEL: @test_local_atomicrmw_addressing_loop_uniform_index_max_offset_i32(
; OPT: .lr.ph.preheader:
; OPT: %scevgep2 = getelementptr i8, ptr addrspace(3) %arg1, i32 65532
; OPT: br label %.lr.ph
; OPT: .lr.ph:
; OPT: %lsr.iv3 = phi ptr addrspace(3) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
; OPT: %lsr.iv2 = phi ptr addrspace(3) [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
; OPT: %lsr.iv1 = phi ptr addrspace(3) [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
; OPT: %tmp4 = atomicrmw add ptr addrspace(3) %lsr.iv3, i32 undef seq_cst, align 4
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(3) %lsr.iv2, i32 65532
; OPT: %tmp4 = atomicrmw add ptr addrspace(3) %scevgep4, i32 undef seq_cst, align 4
; OPT: %tmp7 = atomicrmw add ptr addrspace(3) %lsr.iv1, i32 undef seq_cst, align 4
; OPT: %0 = atomicrmw add ptr addrspace(3) %lsr.iv1, i32 %tmp8 seq_cst, align 4
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(3) %lsr.iv3, i32 4
; OPT: %scevgep3 = getelementptr i8, ptr addrspace(3) %lsr.iv2, i32 4
; OPT: br i1 %exitcond
define amdgpu_kernel void @test_local_atomicrmw_addressing_loop_uniform_index_max_offset_i32(ptr addrspace(3) noalias nocapture %arg0, ptr addrspace(3) noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
@ -46,15 +44,13 @@ bb:
; OPT-LABEL: test_local_cmpxchg_addressing_loop_uniform_index_max_offset_i32(
; OPT: .lr.ph.preheader:
; OPT: %scevgep2 = getelementptr i8, ptr addrspace(3) %arg1, i32 65532
; OPT: br label %.lr.ph
; OPT: .lr.ph:
; OPT: %lsr.iv3 = phi ptr addrspace(3) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
; OPT: %lsr.iv2 = phi ptr addrspace(3) [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
; OPT: %lsr.iv1 = phi ptr addrspace(3) [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
; OPT: %tmp4 = cmpxchg ptr addrspace(3) %lsr.iv3, i32 undef, i32 undef seq_cst monotonic, align 4
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(3) %lsr.iv3, i32 4
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(3) %lsr.iv2, i32 65532
; OPT: %tmp4 = cmpxchg ptr addrspace(3) %scevgep4, i32 undef, i32 undef seq_cst monotonic, align 4
; OPT: %scevgep3 = getelementptr i8, ptr addrspace(3) %lsr.iv2, i32 4
define amdgpu_kernel void @test_local_cmpxchg_addressing_loop_uniform_index_max_offset_i32(ptr addrspace(3) noalias nocapture %arg0, ptr addrspace(3) noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
%tmp = icmp sgt i32 %n, 0

View File

@ -4,13 +4,11 @@
; spaces are correctly handled.
; OPT-LABEL: @test_global_addressing_loop_uniform_index_max_offset_i32(
; OPT: .lr.ph.preheader:
; OPT: %scevgep2 = getelementptr i8, ptr addrspace(1) %arg1, i64 4095
; OPT: br label %.lr.ph
; OPT: {{^}}.lr.ph:
; OPT: %lsr.iv3 = phi ptr addrspace(1) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
; OPT: load i8, ptr addrspace(1) %lsr.iv3, align 1
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(1) %lsr.iv3, i64 1
; OPT: %lsr.iv2 = phi ptr addrspace(1) [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(1) %lsr.iv2, i64 4095
; OPT: load i8, ptr addrspace(1) %scevgep4, align 1
; OPT: %scevgep3 = getelementptr i8, ptr addrspace(1) %lsr.iv2, i64 1
define amdgpu_kernel void @test_global_addressing_loop_uniform_index_max_offset_i32(ptr addrspace(1) noalias nocapture %arg0, ptr addrspace(1) noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
%tmp = icmp sgt i32 %n, 0
@ -80,13 +78,11 @@ bb:
}
; OPT-LABEL: @test_local_addressing_loop_uniform_index_max_offset_i32(
; OPT: .lr.ph.preheader:
; OPT: %scevgep2 = getelementptr i8, ptr addrspace(3) %arg1, i32 65535
; OPT: br label %.lr.ph
; OPT: {{^}}.lr.ph
; OPT: %lsr.iv3 = phi ptr addrspace(3) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
; OPT: %tmp4 = load i8, ptr addrspace(3) %lsr.iv3, align 1
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(3) %lsr.iv3, i32 1
; OPT: {{^}}.lr.ph:
; OPT: %lsr.iv2 = phi ptr addrspace(3) [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(3) %lsr.iv2, i32 65535
; OPT: %tmp4 = load i8, ptr addrspace(3) %scevgep4, align 1
; OPT: %scevgep3 = getelementptr i8, ptr addrspace(3) %lsr.iv2, i32 1
define amdgpu_kernel void @test_local_addressing_loop_uniform_index_max_offset_i32(ptr addrspace(1) noalias nocapture %arg0, ptr addrspace(3) noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
%tmp = icmp sgt i32 %n, 0

View File

@ -12,22 +12,20 @@ define amdgpu_kernel void @scaledregtest() local_unnamed_addr {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: loopexit:
; CHECK-NEXT: [[SCEVGEP11_LCSSA:%.*]] = phi ptr addrspace(5) [ [[SCEVGEP11:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[SCEVGEP13_LCSSA:%.*]] = phi ptr [ [[SCEVGEP13:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: br label [[FOR_BODY_1:%.*]]
; CHECK: for.body.1:
; CHECK-NEXT: [[LSR_IV5:%.*]] = phi ptr addrspace(5) [ [[SCEVGEP6:%.*]], [[FOR_BODY_1]] ], [ [[SCEVGEP11_LCSSA]], [[LOOPEXIT:%.*]] ]
; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], [[FOR_BODY_1]] ], [ [[SCEVGEP13_LCSSA]], [[LOOPEXIT]] ]
; CHECK-NEXT: [[LSR_IV5:%.*]] = phi ptr addrspace(5) [ [[SCEVGEP6:%.*]], [[FOR_BODY_1]] ], [ [[SCEVGEP11_LCSSA:%.*]], [[LOOPEXIT:%.*]] ]
; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], [[FOR_BODY_1]] ], [ [[SCEVGEP13_LCSSA:%.*]], [[LOOPEXIT]] ]
; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[LSR_IV5]], align 8
; CHECK-NEXT: store ptr [[TMP0]], ptr [[LSR_IV1]], align 8
; CHECK-NEXT: [[SCEVGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 8
; CHECK-NEXT: [[SCEVGEP6]] = getelementptr i8, ptr addrspace(5) [[LSR_IV5]], i32 8
; CHECK-NEXT: br label [[FOR_BODY_1]]
; CHECK: for.body:
; CHECK-NEXT: [[LSR_IV12:%.*]] = phi ptr [ [[SCEVGEP13]], [[FOR_BODY]] ], [ null, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[LSR_IV10:%.*]] = phi ptr addrspace(5) [ [[SCEVGEP11]], [[FOR_BODY]] ], [ null, [[ENTRY]] ]
; CHECK-NEXT: [[SCEVGEP11]] = getelementptr i8, ptr addrspace(5) [[LSR_IV10]], i32 64
; CHECK-NEXT: [[SCEVGEP13]] = getelementptr i8, ptr [[LSR_IV12]], i64 64
; CHECK-NEXT: [[SCEVGEP11_LCSSA]] = phi ptr addrspace(5) [ [[SCEVGEP4:%.*]], [[FOR_BODY]] ], [ inttoptr (i32 64 to ptr addrspace(5)), [[ENTRY:%.*]] ]
; CHECK-NEXT: [[SCEVGEP13_LCSSA]] = phi ptr [ [[SCEVGEP:%.*]], [[FOR_BODY]] ], [ inttoptr (i64 64 to ptr), [[ENTRY]] ]
; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[SCEVGEP13_LCSSA]], i64 64
; CHECK-NEXT: [[SCEVGEP4]] = getelementptr i8, ptr addrspace(5) [[SCEVGEP11_LCSSA]], i32 64
; CHECK-NEXT: br i1 false, label [[LOOPEXIT]], label [[FOR_BODY]]
;
entry:

View File

@ -14,7 +14,7 @@ define amdgpu_kernel void @lsr_crash_preserve_addrspace_unknown_type() #0 {
; CHECK-NEXT: br label %[[BB1:.*]]
; CHECK: [[BB1]]:
; CHECK-NEXT: [[TMP:%.*]] = phi ptr addrspace(3) [ undef, %[[BB]] ], [ [[TMP18:%.*]], %[[BB17:.*]] ]
; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP]], i32 8
; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr inbounds [[TMP0:%.*]], ptr addrspace(3) [[TMP]], i64 0, i32 1
; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr addrspace(3) [[SCEVGEP1]], align 8
; CHECK-NEXT: br label %[[BB4:.*]]
; CHECK: [[BB4]]:
@ -26,14 +26,14 @@ define amdgpu_kernel void @lsr_crash_preserve_addrspace_unknown_type() #0 {
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 0, [[TMP10]]
; CHECK-NEXT: br i1 [[TMP11]], label %[[BB12:.*]], label %[[BB17]]
; CHECK: [[BB12]]:
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP]], i32 16
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr inbounds [[TMP0]], ptr addrspace(3) [[TMP]], i64 0, i32 2
; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(3) [[SCEVGEP]], align 4
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 0, [[TMP14]]
; CHECK-NEXT: br i1 [[TMP15]], label %[[BB16:.*]], label %[[BB17]]
; CHECK: [[BB16]]:
; CHECK-NEXT: unreachable
; CHECK: [[BB17]]:
; CHECK-NEXT: [[TMP18]] = getelementptr inbounds [[TMP0:%.*]], ptr addrspace(3) [[TMP]], i64 2
; CHECK-NEXT: [[TMP18]] = getelementptr inbounds [[TMP0]], ptr addrspace(3) [[TMP]], i64 2
; CHECK-NEXT: br label %[[BB1]]
;
bb: