[AMDGPU] Implement LSR cost model for GFX9+ (#184138)
AMDGPU previously had no target-specific LSR cost model, so the generic heuristic would often introduce extra induction variables and base-add chains that hurt VALU throughput on GFX9+ (observed on gfx942).

Implement a custom cost model:
- isLSRCostLess(): prioritize per-iteration instruction count over setup costs, penalize IV multiplies, and demote register count. Pre-GFX9 falls back to the default comparator.
- getScalingFactorCost(): report that base+scale*index addressing requires an extra ADD instruction.
- isNumRegsMajorCostOfLSR(): return false.
- shouldDropLSRSolutionIfLessProfitable(): return true.

Assisted-by: Claude Opus
This commit is contained in:
parent
bd6e8a8c66
commit
621fc8774e
@ -15,6 +15,7 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "AMDGPUTargetTransformInfo.h"
|
||||
#include "AMDGPUSubtarget.h"
|
||||
#include "AMDGPUTargetMachine.h"
|
||||
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
|
||||
#include "SIModeRegisterDefaults.h"
|
||||
@ -1736,3 +1737,50 @@ GCNTTIImpl::getInstructionUniformity(const Value *V) const {
|
||||
|
||||
return InstructionUniformity::Default;
|
||||
}
|
||||
|
||||
/// Cost of using a scaled index in an addressing mode.
///
/// For the base + Scale*index shape (\p HasBaseReg set and a non-zero
/// \p Scale) the result is 1, modelling one extra ADD, unless the subtarget
/// reports scale_offset support and the access qualifies for folding, in which
/// case the result is 0. Every other shape is delegated to the base
/// implementation.
InstructionCost GCNTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                                 StackOffset BaseOffset,
                                                 bool HasBaseReg, int64_t Scale,
                                                 unsigned AddrSpace) const {
  // Anything other than base + scale*index is handled generically.
  if (!HasBaseReg || Scale == 0)
    return BaseT::getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg,
                                       Scale, AddrSpace);

  // gfx1250+ exposes a scale_offset bit that folds base+scale*index when the
  // scale equals the memory access size. It applies to flat, global, constant
  // and scratch accesses (VMEM, at most 128 bits) and to constant_32bit
  // (SMRD, limited to 128 bits by the check below).
  if (getST()->hasScaleOffset() && Ty && Ty->isSized()) {
    const bool FoldableAddrSpace =
        AMDGPU::isExtendedGlobalAddrSpace(AddrSpace) ||
        AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
        AddrSpace == AMDGPUAS::PRIVATE_ADDRESS;
    if (FoldableAddrSpace) {
      const TypeSize AccessBytes = getDataLayout().getTypeStoreSize(Ty);
      // 16 bytes == 128 bits. The size check comes first so getFixedValue()
      // is only reached once the size is known to be at most 16 bytes.
      const bool FitsAndMatches =
          TypeSize::isKnownLE(AccessBytes, TypeSize::getFixed(16)) &&
          static_cast<int64_t>(AccessBytes.getFixedValue()) == Scale;
      if (FitsAndMatches)
        return 0;
    }
  }

  // The scaled index cannot be folded: account for a separate ADD.
  return 1;
}
|
||||
|
||||
/// Return true if LSR solution cost \p A is strictly cheaper than \p B.
///
/// The costs are compared lexicographically, most significant key first:
///   1. Insns + ScaleCost (effective per-iteration instruction count)
///   2. NumIVMuls
///   3. AddRecCost
///   4. NumBaseAdds
///   5. SetupCost
///   6. ImmCost
///   7. NumRegs
///
/// This function performs no subtarget check; the same ordering is applied
/// whenever this override is called.
bool GCNTTIImpl::isLSRCostLess(const TTI::LSRCost &A,
                               const TTI::LSRCost &B) const {
  // Favor lower per-iteration work over preheader/setup costs.
  // AMDGPU lacks rich addressing modes, so ScaleCost is folded into the
  // effective instruction count (base+scale*index requires a separate ADD).
  //
  // The sum is formed in 64 bits so that it cannot wrap around if both
  // fields carry very large (e.g. saturated "worst possible") values; a
  // wrapped 32-bit sum could otherwise rank such a cost as cheap.
  const uint64_t EffInsnsA = static_cast<uint64_t>(A.Insns) + A.ScaleCost;
  const uint64_t EffInsnsB = static_cast<uint64_t>(B.Insns) + B.ScaleCost;

  return std::tie(EffInsnsA, A.NumIVMuls, A.AddRecCost, A.NumBaseAdds,
                  A.SetupCost, A.ImmCost, A.NumRegs) <
         std::tie(EffInsnsB, B.NumIVMuls, B.AddRecCost, B.NumBaseAdds,
                  B.SetupCost, B.ImmCost, B.NumRegs);
}
|
||||
|
||||
/// Report whether register count is the dominant LSR cost on this target.
/// Always returns false.
bool GCNTTIImpl::isNumRegsMajorCostOfLSR() const {
  // Register count is the least significant key in isLSRCostLess, so this
  // hook answers "no" to stay in agreement with that ordering.
  return false;
}
|
||||
|
||||
/// Report whether an LSR solution that is less profitable than the baseline
/// should be dropped. Always returns true.
bool GCNTTIImpl::shouldDropLSRSolutionIfLessProfitable() const {
  // Keep the baseline unless LSR clearly lowers the per-iteration work.
  return true;
}
|
||||
|
||||
@ -311,6 +311,16 @@ public:
|
||||
unsigned getNumberOfParts(Type *Tp) const override;
|
||||
|
||||
InstructionUniformity getInstructionUniformity(const Value *V) const override;
|
||||
|
||||
/// Cost of a scaled index in an addressing mode. For base + Scale*index
/// (HasBaseReg set, Scale non-zero) the implementation returns 1, or 0 when
/// the subtarget reports scale_offset support and the access qualifies;
/// other shapes are forwarded to the base implementation.
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
|
||||
StackOffset BaseOffset, bool HasBaseReg,
|
||||
int64_t Scale,
|
||||
unsigned AddrSpace) const override;
|
||||
|
||||
/// Return true if LSR cost \p A is strictly cheaper than \p B. The
/// implementation compares Insns + ScaleCost first and NumRegs last.
bool isLSRCostLess(const TTI::LSRCost &A,
|
||||
const TTI::LSRCost &B) const override;
|
||||
/// Whether register count is the major LSR cost; the implementation always
/// returns false.
bool isNumRegsMajorCostOfLSR() const override;
|
||||
/// Whether a less profitable LSR solution should be dropped in favor of the
/// baseline; the implementation always returns true.
bool shouldDropLSRSolutionIfLessProfitable() const override;
|
||||
};
|
||||
|
||||
} // end namespace llvm
|
||||
|
||||
@ -11,18 +11,21 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind {
|
||||
; GCN-NEXT: s_cbranch_scc1 .LBB0_3
|
||||
; GCN-NEXT: ; %bb.1: ; %for.body.preheader
|
||||
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_addk_i32 s0, 0x80
|
||||
; GCN-NEXT: s_mov_b32 s1, 0
|
||||
; GCN-NEXT: s_and_b64 vcc, exec, -1
|
||||
; GCN-NEXT: s_mov_b32 m0, -1
|
||||
; GCN-NEXT: .LBB0_2: ; %for.body
|
||||
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: s_lshl_b32 s2, s1, 2
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_add_i32 s2, s0, s2
|
||||
; GCN-NEXT: s_addk_i32 s2, 0x80
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GCN-NEXT: ds_read_b32 v1, v0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
|
||||
; GCN-NEXT: ds_write_b32 v0, v1
|
||||
; GCN-NEXT: s_add_i32 s0, s0, 4
|
||||
; GCN-NEXT: s_add_i32 s1, s1, 1
|
||||
; GCN-NEXT: s_mov_b64 vcc, vcc
|
||||
; GCN-NEXT: s_cbranch_vccnz .LBB0_2
|
||||
; GCN-NEXT: .LBB0_3: ; %for.exit
|
||||
@ -109,18 +112,21 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi
|
||||
; GCN-LABEL: loop_const_true:
|
||||
; GCN: ; %bb.0: ; %entry
|
||||
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_addk_i32 s0, 0x80
|
||||
; GCN-NEXT: s_mov_b32 s1, 0
|
||||
; GCN-NEXT: s_and_b64 vcc, exec, -1
|
||||
; GCN-NEXT: s_mov_b32 m0, -1
|
||||
; GCN-NEXT: .LBB1_1: ; %for.body
|
||||
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: s_lshl_b32 s2, s1, 2
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_add_i32 s2, s0, s2
|
||||
; GCN-NEXT: s_addk_i32 s2, 0x80
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GCN-NEXT: ds_read_b32 v1, v0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
|
||||
; GCN-NEXT: ds_write_b32 v0, v1
|
||||
; GCN-NEXT: s_add_i32 s0, s0, 4
|
||||
; GCN-NEXT: s_add_i32 s1, s1, 1
|
||||
; GCN-NEXT: s_mov_b64 vcc, vcc
|
||||
; GCN-NEXT: s_cbranch_vccnz .LBB1_1
|
||||
; GCN-NEXT: ; %bb.2: ; %DummyReturnBlock
|
||||
@ -368,22 +374,25 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind {
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_mov_b32 m0, -1
|
||||
; GCN-NEXT: ds_read_u8 v0, v0
|
||||
; GCN-NEXT: s_load_dword s4, s[4:5], 0x9
|
||||
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GCN-NEXT: s_bitcmp1_b32 s0, 0
|
||||
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GCN-NEXT: s_xor_b64 s[2:3], s[0:1], -1
|
||||
; GCN-NEXT: s_add_i32 s0, s4, 0x80
|
||||
; GCN-NEXT: v_readfirstlane_b32 s1, v0
|
||||
; GCN-NEXT: s_bitcmp1_b32 s1, 0
|
||||
; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], -1
|
||||
; GCN-NEXT: s_mov_b32 s1, 0
|
||||
; GCN-NEXT: s_and_b64 vcc, exec, s[2:3]
|
||||
; GCN-NEXT: .LBB4_1: ; %for.body
|
||||
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: s_lshl_b32 s2, s1, 2
|
||||
; GCN-NEXT: s_add_i32 s2, s0, s2
|
||||
; GCN-NEXT: s_addk_i32 s2, 0x80
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GCN-NEXT: ds_read_b32 v1, v0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
|
||||
; GCN-NEXT: ds_write_b32 v0, v1
|
||||
; GCN-NEXT: s_add_i32 s0, s0, 4
|
||||
; GCN-NEXT: s_add_i32 s1, s1, 1
|
||||
; GCN-NEXT: s_mov_b64 vcc, vcc
|
||||
; GCN-NEXT: s_cbranch_vccz .LBB4_1
|
||||
; GCN-NEXT: ; %bb.2: ; %for.exit
|
||||
|
||||
@ -17,14 +17,13 @@ define amdgpu_kernel void @copy_to_reg_frameindex(ptr addrspace(1) %out, i32 %a,
|
||||
; GFX7-NEXT: s_add_u32 s12, s12, s11
|
||||
; GFX7-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GFX7-NEXT: s_mov_b32 s0, 0
|
||||
; GFX7-NEXT: s_mov_b32 s1, 0
|
||||
; GFX7-NEXT: .LBB0_1: ; %loop
|
||||
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-NEXT: s_add_i32 s1, s1, 1
|
||||
; GFX7-NEXT: s_add_i32 s0, s0, 4
|
||||
; GFX7-NEXT: s_cmp_lt_u32 s1, 16
|
||||
; GFX7-NEXT: s_lshl_b32 s1, s0, 2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: s_add_i32 s0, s0, 1
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-NEXT: s_cmp_lt_u32 s0, 16
|
||||
; GFX7-NEXT: buffer_store_dword v0, v1, s[12:15], 0 offen
|
||||
; GFX7-NEXT: s_cbranch_scc1 .LBB0_1
|
||||
; GFX7-NEXT: ; %bb.2: ; %done
|
||||
@ -45,14 +44,13 @@ define amdgpu_kernel void @copy_to_reg_frameindex(ptr addrspace(1) %out, i32 %a,
|
||||
; GFX8-NEXT: s_add_u32 s88, s88, s11
|
||||
; GFX8-NEXT: s_addc_u32 s89, s89, 0
|
||||
; GFX8-NEXT: s_mov_b32 s0, 0
|
||||
; GFX8-NEXT: s_mov_b32 s1, 0
|
||||
; GFX8-NEXT: .LBB0_1: ; %loop
|
||||
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX8-NEXT: s_add_i32 s1, s1, 1
|
||||
; GFX8-NEXT: s_add_i32 s0, s0, 4
|
||||
; GFX8-NEXT: s_cmp_lt_u32 s1, 16
|
||||
; GFX8-NEXT: s_lshl_b32 s1, s0, 2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: s_add_i32 s0, s0, 1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: s_cmp_lt_u32 s0, 16
|
||||
; GFX8-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
|
||||
; GFX8-NEXT: s_cbranch_scc1 .LBB0_1
|
||||
; GFX8-NEXT: ; %bb.2: ; %done
|
||||
|
||||
@ -9,19 +9,21 @@ declare void @llvm.amdgcn.s.barrier() #1
|
||||
; CHECK-LABEL: {{^}}signed_ds_offset_addressing_loop:
|
||||
; CHECK: BB0_1:
|
||||
; CHECK: v_add_i32_e32 [[VADDR:v[0-9]+]],
|
||||
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]]
|
||||
; SI-DAG: v_add_i32_e32 [[VADDR8:v[0-9]+]], vcc, 8, [[VADDR]]
|
||||
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR8]]
|
||||
; SI-DAG: v_add_i32_e32 [[VADDR0x80:v[0-9]+]], vcc, 0x80, [[VADDR]]
|
||||
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x80]]
|
||||
; SI-DAG: v_add_i32_e32 [[VADDR0x88:v[0-9]+]], vcc, 0x88, [[VADDR]]
|
||||
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x88]]
|
||||
; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], vcc, 0x100, [[VADDR]]
|
||||
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x100]]
|
||||
|
||||
; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset1:2
|
||||
; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:34
|
||||
; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] offset:256
|
||||
; SI-DAG: v_add_i32_e32 [[VADDR1:v[0-9]+]], vcc, 0xc20, [[VADDR]]
|
||||
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR1]]
|
||||
; SI-DAG: v_add_i32_e32 [[VADDR2:v[0-9]+]], vcc, {{s[0-9]+}}, [[VADDR]]
|
||||
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR2]]
|
||||
; SI-DAG: v_add_i32_e32 [[VADDR3:v[0-9]+]], vcc, 0xca0, [[VADDR]]
|
||||
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR3]]
|
||||
; SI-DAG: v_add_i32_e32 [[VADDR4:v[0-9]+]], vcc, 0xca8, [[VADDR]]
|
||||
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR4]]
|
||||
; SI-DAG: v_add_i32_e32 [[VADDR5:v[0-9]+]], vcc, 0xd20, [[VADDR]]
|
||||
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR5]]
|
||||
;
|
||||
; CI: v_add_i32_e32 [[VADDRCI:v[0-9]+]], vcc, {{s[0-9]+}}, [[VADDR]]
|
||||
; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDRCI]] offset0:8 offset1:10
|
||||
; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDRCI]] offset0:40 offset1:42
|
||||
; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] offset:3360
|
||||
; CHECK: s_endpgm
|
||||
define amdgpu_kernel void @signed_ds_offset_addressing_loop(ptr addrspace(1) noalias nocapture %out, ptr addrspace(3) noalias nocapture readonly %lptr, i32 %n) #2 {
|
||||
entry:
|
||||
|
||||
@ -2741,16 +2741,15 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
|
||||
; GFX1250-SDAG: ; %bb.0: ; %bb
|
||||
; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX1250-SDAG-NEXT: s_movk_i32 s0, 0x100
|
||||
; GFX1250-SDAG-NEXT: .LBB116_1: ; %bb3
|
||||
; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
|
||||
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
|
||||
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
|
||||
; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
|
||||
; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
|
||||
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400
|
||||
; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s0, -1
|
||||
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4
|
||||
; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0
|
||||
; GFX1250-SDAG-NEXT: s_cbranch_scc0 .LBB116_1
|
||||
; GFX1250-SDAG-NEXT: ; %bb.2: ; %bb2
|
||||
; GFX1250-SDAG-NEXT: s_endpgm
|
||||
@ -2758,19 +2757,19 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
|
||||
; GFX1250-GISEL-LABEL: flat_addr_64bit_lsr_iv:
|
||||
; GFX1250-GISEL: ; %bb.0: ; %bb
|
||||
; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX1250-GISEL-NEXT: s_movk_i32 s0, 0x100
|
||||
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
|
||||
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
||||
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX1250-GISEL-NEXT: .LBB116_1: ; %bb3
|
||||
; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
|
||||
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
|
||||
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
|
||||
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 4, v[2:3]
|
||||
; GFX1250-GISEL-NEXT: flat_load_b32 v4, v[4:5] scope:SCOPE_SYS
|
||||
; GFX1250-GISEL-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS
|
||||
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x400, v2
|
||||
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, s0, v0, 4
|
||||
; GFX1250-GISEL-NEXT: v_add_nc_u32_e32 v2, -1, v2
|
||||
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s0
|
||||
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
||||
; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
|
||||
; GFX1250-GISEL-NEXT: s_cbranch_vccz .LBB116_1
|
||||
; GFX1250-GISEL-NEXT: ; %bb.2: ; %bb2
|
||||
; GFX1250-GISEL-NEXT: s_endpgm
|
||||
@ -2779,16 +2778,15 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
|
||||
; GFX1250-NOECC: ; %bb.0: ; %bb
|
||||
; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NOECC-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX1250-NOECC-NEXT: s_movk_i32 s0, 0x100
|
||||
; GFX1250-NOECC-NEXT: .LBB116_1: ; %bb3
|
||||
; GFX1250-NOECC-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1250-NOECC-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1250-NOECC-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
|
||||
; GFX1250-NOECC-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
|
||||
; GFX1250-NOECC-NEXT: s_wait_dscnt 0x0
|
||||
; GFX1250-NOECC-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
|
||||
; GFX1250-NOECC-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
|
||||
; GFX1250-NOECC-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NOECC-NEXT: s_cmp_eq_u32 s0, 0x400
|
||||
; GFX1250-NOECC-NEXT: s_add_co_i32 s0, s0, -1
|
||||
; GFX1250-NOECC-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4
|
||||
; GFX1250-NOECC-NEXT: s_cmp_eq_u32 s0, 0
|
||||
; GFX1250-NOECC-NEXT: s_cbranch_scc0 .LBB116_1
|
||||
; GFX1250-NOECC-NEXT: ; %bb.2: ; %bb2
|
||||
; GFX1250-NOECC-NEXT: s_endpgm
|
||||
@ -2815,19 +2813,17 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
|
||||
; GFX1250-SDAG: ; %bb.0: ; %bb
|
||||
; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX1250-SDAG-NEXT: s_movk_i32 s0, 0x100
|
||||
; GFX1250-SDAG-NEXT: .LBB117_1: ; %bb3
|
||||
; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
|
||||
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
|
||||
; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
|
||||
; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
|
||||
; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
|
||||
; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
|
||||
; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
|
||||
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400
|
||||
; GFX1250-SDAG-NEXT: ; kill: killed $sgpr4_sgpr5
|
||||
; GFX1250-SDAG-NEXT: s_add_co_i32 s0, s0, -1
|
||||
; GFX1250-SDAG-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4
|
||||
; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0
|
||||
; GFX1250-SDAG-NEXT: s_cbranch_scc0 .LBB117_1
|
||||
; GFX1250-SDAG-NEXT: ; %bb.2: ; %bb2
|
||||
; GFX1250-SDAG-NEXT: s_endpgm
|
||||
@ -2835,22 +2831,21 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
|
||||
; GFX1250-GISEL-LABEL: flat_addr_64bit_lsr_iv_multiload:
|
||||
; GFX1250-GISEL: ; %bb.0: ; %bb
|
||||
; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX1250-GISEL-NEXT: s_movk_i32 s0, 0x100
|
||||
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
|
||||
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
||||
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX1250-GISEL-NEXT: .LBB117_1: ; %bb3
|
||||
; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
|
||||
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
|
||||
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 4, v[2:3]
|
||||
; GFX1250-GISEL-NEXT: ; kill: killed $vgpr4 killed $vgpr5
|
||||
; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
|
||||
; GFX1250-GISEL-NEXT: flat_load_b32 v6, v[4:5] scope:SCOPE_SYS
|
||||
; GFX1250-GISEL-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS
|
||||
; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX1250-GISEL-NEXT: flat_load_b32 v6, v[4:5] scope:SCOPE_SYS
|
||||
; GFX1250-GISEL-NEXT: flat_load_b32 v3, v[0:1] scope:SCOPE_SYS
|
||||
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x400, v2
|
||||
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, s0, v0, 4
|
||||
; GFX1250-GISEL-NEXT: v_add_nc_u32_e32 v2, -1, v2
|
||||
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s0
|
||||
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
||||
; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
|
||||
; GFX1250-GISEL-NEXT: s_cbranch_vccz .LBB117_1
|
||||
; GFX1250-GISEL-NEXT: ; %bb.2: ; %bb2
|
||||
; GFX1250-GISEL-NEXT: s_endpgm
|
||||
@ -2859,19 +2854,17 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
|
||||
; GFX1250-NOECC: ; %bb.0: ; %bb
|
||||
; GFX1250-NOECC-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NOECC-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NOECC-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX1250-NOECC-NEXT: s_movk_i32 s0, 0x100
|
||||
; GFX1250-NOECC-NEXT: .LBB117_1: ; %bb3
|
||||
; GFX1250-NOECC-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1250-NOECC-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1250-NOECC-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
|
||||
; GFX1250-NOECC-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
|
||||
; GFX1250-NOECC-NEXT: s_wait_dscnt 0x0
|
||||
; GFX1250-NOECC-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
|
||||
; GFX1250-NOECC-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
|
||||
; GFX1250-NOECC-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX1250-NOECC-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
|
||||
; GFX1250-NOECC-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
|
||||
; GFX1250-NOECC-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NOECC-NEXT: s_cmp_eq_u32 s0, 0x400
|
||||
; GFX1250-NOECC-NEXT: ; kill: killed $sgpr4_sgpr5
|
||||
; GFX1250-NOECC-NEXT: s_add_co_i32 s0, s0, -1
|
||||
; GFX1250-NOECC-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4
|
||||
; GFX1250-NOECC-NEXT: s_cmp_eq_u32 s0, 0
|
||||
; GFX1250-NOECC-NEXT: s_cbranch_scc0 .LBB117_1
|
||||
; GFX1250-NOECC-NEXT: ; %bb.2: ; %bb2
|
||||
; GFX1250-NOECC-NEXT: s_endpgm
|
||||
|
||||
@ -4720,17 +4720,16 @@ define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr a
|
||||
define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
|
||||
; GFX9-LABEL: global_addr_64bit_lsr_iv:
|
||||
; GFX9: ; %bb.0: ; %bb
|
||||
; GFX9-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX9-NEXT: s_movk_i32 s0, 0x100
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX9-NEXT: .LBB132_1: ; %bb3
|
||||
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX9-NEXT: s_add_u32 s4, s2, s0
|
||||
; GFX9-NEXT: s_addc_u32 s5, s3, s1
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_add_u32 s0, s0, 4
|
||||
; GFX9-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400
|
||||
; GFX9-NEXT: s_add_i32 s0, s0, -1
|
||||
; GFX9-NEXT: s_add_u32 s2, s2, 4
|
||||
; GFX9-NEXT: s_addc_u32 s3, s3, 0
|
||||
; GFX9-NEXT: s_cmp_eq_u32 s0, 0
|
||||
; GFX9-NEXT: s_cbranch_scc0 .LBB132_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %bb2
|
||||
; GFX9-NEXT: s_endpgm
|
||||
@ -4738,17 +4737,16 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
|
||||
; GFX10-LABEL: global_addr_64bit_lsr_iv:
|
||||
; GFX10: ; %bb.0: ; %bb
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX10-NEXT: s_movk_i32 s0, 0x100
|
||||
; GFX10-NEXT: .LBB132_1: ; %bb3
|
||||
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
|
||||
; GFX10-NEXT: s_add_u32 s4, s2, s0
|
||||
; GFX10-NEXT: s_addc_u32 s5, s3, s1
|
||||
; GFX10-NEXT: s_add_u32 s0, s0, 4
|
||||
; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc
|
||||
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400
|
||||
; GFX10-NEXT: s_add_i32 s0, s0, -1
|
||||
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
|
||||
; GFX10-NEXT: s_add_u32 s2, s2, 4
|
||||
; GFX10-NEXT: s_addc_u32 s3, s3, 0
|
||||
; GFX10-NEXT: s_cmp_eq_u32 s0, 0
|
||||
; GFX10-NEXT: s_cbranch_scc0 .LBB132_1
|
||||
; GFX10-NEXT: ; %bb.2: ; %bb2
|
||||
; GFX10-NEXT: s_endpgm
|
||||
@ -4756,17 +4754,15 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
|
||||
; GFX11-LABEL: global_addr_64bit_lsr_iv:
|
||||
; GFX11: ; %bb.0: ; %bb
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX11-NEXT: s_movk_i32 s0, 0x100
|
||||
; GFX11-NEXT: .LBB132_1: ; %bb3
|
||||
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_add_u32 s4, s2, s0
|
||||
; GFX11-NEXT: s_addc_u32 s5, s3, s1
|
||||
; GFX11-NEXT: s_add_u32 s0, s0, 4
|
||||
; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc
|
||||
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x400
|
||||
; GFX11-NEXT: s_add_i32 s0, s0, -1
|
||||
; GFX11-NEXT: s_add_u32 s2, s2, 4
|
||||
; GFX11-NEXT: s_addc_u32 s3, s3, 0
|
||||
; GFX11-NEXT: s_cmp_eq_u32 s0, 0
|
||||
; GFX11-NEXT: s_cbranch_scc0 .LBB132_1
|
||||
; GFX11-NEXT: ; %bb.2: ; %bb2
|
||||
; GFX11-NEXT: s_endpgm
|
||||
@ -4774,15 +4770,14 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
|
||||
; GFX12-SDAG-LABEL: global_addr_64bit_lsr_iv:
|
||||
; GFX12-SDAG: ; %bb.0: ; %bb
|
||||
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-SDAG-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x100
|
||||
; GFX12-SDAG-NEXT: .LBB132_1: ; %bb3
|
||||
; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
|
||||
; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
|
||||
; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
|
||||
; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
|
||||
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400
|
||||
; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -1
|
||||
; GFX12-SDAG-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4
|
||||
; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0
|
||||
; GFX12-SDAG-NEXT: s_cbranch_scc0 .LBB132_1
|
||||
; GFX12-SDAG-NEXT: ; %bb.2: ; %bb2
|
||||
; GFX12-SDAG-NEXT: s_endpgm
|
||||
@ -4790,16 +4785,15 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
|
||||
; GFX12-GISEL-LABEL: global_addr_64bit_lsr_iv:
|
||||
; GFX12-GISEL: ; %bb.0: ; %bb
|
||||
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-GISEL-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x100
|
||||
; GFX12-GISEL-NEXT: .LBB132_1: ; %bb3
|
||||
; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-GISEL-NEXT: s_add_co_u32 s4, s2, s0
|
||||
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s5, s3, s1
|
||||
; GFX12-GISEL-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
|
||||
; GFX12-GISEL-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
|
||||
; GFX12-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
|
||||
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-GISEL-NEXT: s_cmp_eq_u32 s0, 0x400
|
||||
; GFX12-GISEL-NEXT: s_add_co_i32 s0, s0, -1
|
||||
; GFX12-GISEL-NEXT: s_add_co_u32 s2, s2, 4
|
||||
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s3, s3, 0
|
||||
; GFX12-GISEL-NEXT: s_cmp_eq_u32 s0, 0
|
||||
; GFX12-GISEL-NEXT: s_cbranch_scc0 .LBB132_1
|
||||
; GFX12-GISEL-NEXT: ; %bb.2: ; %bb2
|
||||
; GFX12-GISEL-NEXT: s_endpgm
|
||||
@ -4824,20 +4818,18 @@ bb3: ; preds = %bb3, %bb
|
||||
define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg %arg, ptr addrspace(1) inreg %arg.1) {
|
||||
; GFX9-LABEL: global_addr_64bit_lsr_iv_multiload:
|
||||
; GFX9: ; %bb.0: ; %bb
|
||||
; GFX9-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX9-NEXT: s_movk_i32 s0, 0x100
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX9-NEXT: .LBB133_1: ; %bb3
|
||||
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX9-NEXT: s_add_u32 s4, s2, s0
|
||||
; GFX9-NEXT: s_addc_u32 s5, s3, s1
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
|
||||
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_add_u32 s0, s0, 4
|
||||
; GFX9-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400
|
||||
; GFX9-NEXT: ; kill: killed $sgpr4 killed $sgpr5
|
||||
; GFX9-NEXT: s_add_i32 s0, s0, -1
|
||||
; GFX9-NEXT: s_add_u32 s2, s2, 4
|
||||
; GFX9-NEXT: s_addc_u32 s3, s3, 0
|
||||
; GFX9-NEXT: s_cmp_eq_u32 s0, 0
|
||||
; GFX9-NEXT: s_cbranch_scc0 .LBB133_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %bb2
|
||||
; GFX9-NEXT: s_endpgm
|
||||
@ -4845,20 +4837,18 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg
|
||||
; GFX10-LABEL: global_addr_64bit_lsr_iv_multiload:
|
||||
; GFX10: ; %bb.0: ; %bb
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX10-NEXT: s_movk_i32 s0, 0x100
|
||||
; GFX10-NEXT: .LBB133_1: ; %bb3
|
||||
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_add_i32 s0, s0, -1
|
||||
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
|
||||
; GFX10-NEXT: s_add_u32 s4, s2, s0
|
||||
; GFX10-NEXT: s_addc_u32 s5, s3, s1
|
||||
; GFX10-NEXT: s_add_u32 s0, s0, 4
|
||||
; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400
|
||||
; GFX10-NEXT: ; kill: killed $sgpr4 killed $sgpr5
|
||||
; GFX10-NEXT: s_add_u32 s2, s2, 4
|
||||
; GFX10-NEXT: s_addc_u32 s3, s3, 0
|
||||
; GFX10-NEXT: s_cmp_eq_u32 s0, 0
|
||||
; GFX10-NEXT: s_cbranch_scc0 .LBB133_1
|
||||
; GFX10-NEXT: ; %bb.2: ; %bb2
|
||||
; GFX10-NEXT: s_endpgm
|
||||
@ -4866,19 +4856,17 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg
|
||||
; GFX11-LABEL: global_addr_64bit_lsr_iv_multiload:
|
||||
; GFX11: ; %bb.0: ; %bb
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX11-NEXT: s_movk_i32 s0, 0x100
|
||||
; GFX11-NEXT: .LBB133_1: ; %bb3
|
||||
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_add_u32 s4, s2, s0
|
||||
; GFX11-NEXT: s_addc_u32 s5, s3, s1
|
||||
; GFX11-NEXT: s_add_u32 s0, s0, 4
|
||||
; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc
|
||||
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc
|
||||
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x400
|
||||
; GFX11-NEXT: s_add_i32 s0, s0, -1
|
||||
; GFX11-NEXT: s_add_u32 s2, s2, 4
|
||||
; GFX11-NEXT: s_addc_u32 s3, s3, 0
|
||||
; GFX11-NEXT: s_cmp_eq_u32 s0, 0
|
||||
; GFX11-NEXT: s_cbranch_scc0 .LBB133_1
|
||||
; GFX11-NEXT: ; %bb.2: ; %bb2
|
||||
; GFX11-NEXT: s_endpgm
|
||||
@ -4886,17 +4874,16 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg
|
||||
; GFX12-SDAG-LABEL: global_addr_64bit_lsr_iv_multiload:
|
||||
; GFX12-SDAG: ; %bb.0: ; %bb
|
||||
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-SDAG-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x100
|
||||
; GFX12-SDAG-NEXT: .LBB133_1: ; %bb3
|
||||
; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1]
|
||||
; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
|
||||
; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
|
||||
; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
|
||||
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
|
||||
; GFX12-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
|
||||
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400
|
||||
; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -1
|
||||
; GFX12-SDAG-NEXT: s_add_nc_u64 s[2:3], s[2:3], 4
|
||||
; GFX12-SDAG-NEXT: s_cmp_eq_u32 s0, 0
|
||||
; GFX12-SDAG-NEXT: s_cbranch_scc0 .LBB133_1
|
||||
; GFX12-SDAG-NEXT: ; %bb.2: ; %bb2
|
||||
; GFX12-SDAG-NEXT: s_endpgm
|
||||
@ -4904,18 +4891,17 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg
|
||||
; GFX12-GISEL-LABEL: global_addr_64bit_lsr_iv_multiload:
|
||||
; GFX12-GISEL: ; %bb.0: ; %bb
|
||||
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-GISEL-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX12-GISEL-NEXT: s_movk_i32 s0, 0x100
|
||||
; GFX12-GISEL-NEXT: .LBB133_1: ; %bb3
|
||||
; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX12-GISEL-NEXT: s_add_co_u32 s4, s2, s0
|
||||
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s5, s3, s1
|
||||
; GFX12-GISEL-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
|
||||
; GFX12-GISEL-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
|
||||
; GFX12-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
|
||||
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-GISEL-NEXT: global_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS
|
||||
; GFX12-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
|
||||
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-GISEL-NEXT: s_cmp_eq_u32 s0, 0x400
|
||||
; GFX12-GISEL-NEXT: s_add_co_i32 s0, s0, -1
|
||||
; GFX12-GISEL-NEXT: s_add_co_u32 s2, s2, 4
|
||||
; GFX12-GISEL-NEXT: s_add_co_ci_u32 s3, s3, 0
|
||||
; GFX12-GISEL-NEXT: s_cmp_eq_u32 s0, 0
|
||||
; GFX12-GISEL-NEXT: s_cbranch_scc0 .LBB133_1
|
||||
; GFX12-GISEL-NEXT: ; %bb.2: ; %bb2
|
||||
; GFX12-GISEL-NEXT: s_endpgm
|
||||
|
||||
@ -8,44 +8,39 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
|
||||
; GFX9: ; %bb.0: ; %bb
|
||||
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX9-NEXT: s_mov_b32 s7, 0
|
||||
; GFX9-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX9-NEXT: s_mov_b32 s3, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
|
||||
; GFX9-NEXT: s_sub_i32 s4, 0, s6
|
||||
; GFX9-NEXT: s_sub_i32 s2, 0, s6
|
||||
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
|
||||
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
|
||||
; GFX9-NEXT: s_mul_i32 s4, s4, s5
|
||||
; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
|
||||
; GFX9-NEXT: s_add_i32 s8, s5, s4
|
||||
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
|
||||
; GFX9-NEXT: s_mul_i32 s2, s2, s4
|
||||
; GFX9-NEXT: s_mul_hi_u32 s2, s4, s2
|
||||
; GFX9-NEXT: s_add_i32 s4, s4, s2
|
||||
; GFX9-NEXT: s_mov_b32 s2, s3
|
||||
; GFX9-NEXT: .LBB0_1: ; %bb3
|
||||
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX9-NEXT: s_not_b32 s10, s5
|
||||
; GFX9-NEXT: s_mul_i32 s9, s6, s5
|
||||
; GFX9-NEXT: s_mul_i32 s10, s6, s10
|
||||
; GFX9-NEXT: s_add_i32 s11, s5, 1
|
||||
; GFX9-NEXT: s_sub_i32 s9, s7, s9
|
||||
; GFX9-NEXT: s_add_i32 s10, s7, s10
|
||||
; GFX9-NEXT: s_cmp_ge_u32 s9, s6
|
||||
; GFX9-NEXT: s_cselect_b32 s11, s11, s5
|
||||
; GFX9-NEXT: s_cselect_b32 s9, s10, s9
|
||||
; GFX9-NEXT: s_add_i32 s10, s11, 1
|
||||
; GFX9-NEXT: s_cmp_ge_u32 s9, s6
|
||||
; GFX9-NEXT: s_cselect_b32 s9, s10, s11
|
||||
; GFX9-NEXT: s_add_u32 s10, s0, s2
|
||||
; GFX9-NEXT: s_addc_u32 s11, s1, s3
|
||||
; GFX9-NEXT: s_add_i32 s7, s7, 1
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, s8
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, 0
|
||||
; GFX9-NEXT: s_add_u32 s2, s2, 4
|
||||
; GFX9-NEXT: s_addc_u32 s3, s3, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s9
|
||||
; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x1000
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[10:11]
|
||||
; GFX9-NEXT: s_mul_hi_u32 s5, s2, s4
|
||||
; GFX9-NEXT: s_mul_i32 s7, s5, s6
|
||||
; GFX9-NEXT: s_sub_i32 s7, s2, s7
|
||||
; GFX9-NEXT: s_add_i32 s8, s5, 1
|
||||
; GFX9-NEXT: s_sub_i32 s9, s7, s6
|
||||
; GFX9-NEXT: s_cmp_ge_u32 s7, s6
|
||||
; GFX9-NEXT: s_cselect_b32 s5, s8, s5
|
||||
; GFX9-NEXT: s_cselect_b32 s7, s9, s7
|
||||
; GFX9-NEXT: s_add_i32 s8, s5, 1
|
||||
; GFX9-NEXT: s_cmp_ge_u32 s7, s6
|
||||
; GFX9-NEXT: s_cselect_b32 s5, s8, s5
|
||||
; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], 2
|
||||
; GFX9-NEXT: s_add_u32 s8, s0, s8
|
||||
; GFX9-NEXT: s_addc_u32 s9, s1, s9
|
||||
; GFX9-NEXT: s_add_i32 s2, s2, 1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x400
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
|
||||
; GFX9-NEXT: s_cbranch_scc0 .LBB0_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %bb2
|
||||
; GFX9-NEXT: s_endpgm
|
||||
@ -55,45 +50,40 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: s_load_dword s6, s[4:5], 0x2c
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX10-NEXT: s_mov_b32 s7, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6
|
||||
; GFX10-NEXT: s_sub_i32 s2, 0, s6
|
||||
; GFX10-NEXT: s_sub_i32 s3, 0, s6
|
||||
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
|
||||
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s4, v0
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s2, v0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-NEXT: s_mul_i32 s2, s2, s4
|
||||
; GFX10-NEXT: s_mul_hi_u32 s5, s4, s2
|
||||
; GFX10-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX10-NEXT: s_add_i32 s8, s4, s5
|
||||
; GFX10-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX10-NEXT: s_mul_i32 s3, s3, s2
|
||||
; GFX10-NEXT: s_mul_hi_u32 s4, s2, s3
|
||||
; GFX10-NEXT: s_mov_b32 s3, 0
|
||||
; GFX10-NEXT: s_add_i32 s4, s2, s4
|
||||
; GFX10-NEXT: s_mov_b32 s2, s3
|
||||
; GFX10-NEXT: .LBB0_1: ; %bb3
|
||||
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10-NEXT: s_mul_hi_u32 s5, s2, s4
|
||||
; GFX10-NEXT: s_mul_i32 s7, s5, s6
|
||||
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
|
||||
; GFX10-NEXT: s_not_b32 s10, s5
|
||||
; GFX10-NEXT: s_mul_i32 s9, s6, s5
|
||||
; GFX10-NEXT: s_mul_i32 s10, s6, s10
|
||||
; GFX10-NEXT: s_sub_i32 s9, s7, s9
|
||||
; GFX10-NEXT: s_add_i32 s11, s5, 1
|
||||
; GFX10-NEXT: s_add_i32 s10, s7, s10
|
||||
; GFX10-NEXT: s_cmp_ge_u32 s9, s6
|
||||
; GFX10-NEXT: s_cselect_b32 s11, s11, s5
|
||||
; GFX10-NEXT: s_cselect_b32 s9, s10, s9
|
||||
; GFX10-NEXT: s_add_i32 s10, s11, 1
|
||||
; GFX10-NEXT: s_cmp_ge_u32 s9, s6
|
||||
; GFX10-NEXT: s_cselect_b32 s9, s10, s11
|
||||
; GFX10-NEXT: s_add_u32 s10, s0, s2
|
||||
; GFX10-NEXT: s_addc_u32 s11, s1, s3
|
||||
; GFX10-NEXT: s_add_i32 s7, s7, 1
|
||||
; GFX10-NEXT: s_add_u32 s4, s4, s8
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s9
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, 0
|
||||
; GFX10-NEXT: s_add_u32 s2, s2, 4
|
||||
; GFX10-NEXT: s_addc_u32 s3, s3, 0
|
||||
; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x1000
|
||||
; GFX10-NEXT: global_store_dword v0, v1, s[10:11]
|
||||
; GFX10-NEXT: s_add_i32 s8, s5, 1
|
||||
; GFX10-NEXT: s_sub_i32 s7, s2, s7
|
||||
; GFX10-NEXT: s_sub_i32 s9, s7, s6
|
||||
; GFX10-NEXT: s_cmp_ge_u32 s7, s6
|
||||
; GFX10-NEXT: s_cselect_b32 s5, s8, s5
|
||||
; GFX10-NEXT: s_cselect_b32 s7, s9, s7
|
||||
; GFX10-NEXT: s_add_i32 s8, s5, 1
|
||||
; GFX10-NEXT: s_cmp_ge_u32 s7, s6
|
||||
; GFX10-NEXT: s_cselect_b32 s5, s8, s5
|
||||
; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], 2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX10-NEXT: s_add_u32 s8, s0, s8
|
||||
; GFX10-NEXT: s_addc_u32 s9, s1, s9
|
||||
; GFX10-NEXT: s_add_i32 s2, s2, 1
|
||||
; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x400
|
||||
; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
|
||||
; GFX10-NEXT: s_cbranch_scc0 .LBB0_1
|
||||
; GFX10-NEXT: ; %bb.2: ; %bb2
|
||||
; GFX10-NEXT: s_endpgm
|
||||
@ -103,49 +93,46 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c
|
||||
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_mov_b32 s7, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6
|
||||
; GFX11-NEXT: s_sub_i32 s2, 0, s6
|
||||
; GFX11-NEXT: s_sub_i32 s3, 0, s6
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
|
||||
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
|
||||
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s2, v0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-NEXT: s_mul_i32 s2, s2, s4
|
||||
; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2
|
||||
; GFX11-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX11-NEXT: s_add_i32 s8, s4, s5
|
||||
; GFX11-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX11-NEXT: s_mul_i32 s3, s3, s2
|
||||
; GFX11-NEXT: s_mul_hi_u32 s4, s2, s3
|
||||
; GFX11-NEXT: s_mov_b32 s3, 0
|
||||
; GFX11-NEXT: s_add_i32 s4, s2, s4
|
||||
; GFX11-NEXT: s_mov_b32 s2, s3
|
||||
; GFX11-NEXT: .p2align 6
|
||||
; GFX11-NEXT: .LBB0_1: ; %bb3
|
||||
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_mul_hi_u32 s5, s2, s4
|
||||
; GFX11-NEXT: s_mul_i32 s7, s5, s6
|
||||
; GFX11-NEXT: s_add_i32 s8, s5, 1
|
||||
; GFX11-NEXT: s_sub_i32 s7, s2, s7
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_not_b32 s10, s5
|
||||
; GFX11-NEXT: s_mul_i32 s9, s6, s5
|
||||
; GFX11-NEXT: s_mul_i32 s10, s6, s10
|
||||
; GFX11-NEXT: s_sub_i32 s9, s7, s9
|
||||
; GFX11-NEXT: s_add_i32 s11, s5, 1
|
||||
; GFX11-NEXT: s_add_i32 s10, s7, s10
|
||||
; GFX11-NEXT: s_cmp_ge_u32 s9, s6
|
||||
; GFX11-NEXT: s_cselect_b32 s11, s11, s5
|
||||
; GFX11-NEXT: s_cselect_b32 s9, s10, s9
|
||||
; GFX11-NEXT: s_add_i32 s10, s11, 1
|
||||
; GFX11-NEXT: s_cmp_ge_u32 s9, s6
|
||||
; GFX11-NEXT: s_cselect_b32 s9, s10, s11
|
||||
; GFX11-NEXT: s_add_u32 s10, s0, s2
|
||||
; GFX11-NEXT: s_addc_u32 s11, s1, s3
|
||||
; GFX11-NEXT: s_add_i32 s7, s7, 1
|
||||
; GFX11-NEXT: s_add_u32 s4, s4, s8
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, s9
|
||||
; GFX11-NEXT: s_addc_u32 s5, s5, 0
|
||||
; GFX11-NEXT: s_add_u32 s2, s2, 4
|
||||
; GFX11-NEXT: s_addc_u32 s3, s3, 0
|
||||
; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000
|
||||
; GFX11-NEXT: global_store_b32 v0, v1, s[10:11]
|
||||
; GFX11-NEXT: s_sub_i32 s9, s7, s6
|
||||
; GFX11-NEXT: s_cmp_ge_u32 s7, s6
|
||||
; GFX11-NEXT: s_cselect_b32 s5, s8, s5
|
||||
; GFX11-NEXT: s_cselect_b32 s7, s9, s7
|
||||
; GFX11-NEXT: s_add_i32 s8, s5, 1
|
||||
; GFX11-NEXT: s_cmp_ge_u32 s7, s6
|
||||
; GFX11-NEXT: s_cselect_b32 s5, s8, s5
|
||||
; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], 2
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX11-NEXT: s_add_u32 s8, s0, s8
|
||||
; GFX11-NEXT: s_addc_u32 s9, s1, s9
|
||||
; GFX11-NEXT: s_add_i32 s2, s2, 1
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x400
|
||||
; GFX11-NEXT: global_store_b32 v0, v1, s[8:9]
|
||||
; GFX11-NEXT: s_cbranch_scc0 .LBB0_1
|
||||
; GFX11-NEXT: ; %bb.2: ; %bb2
|
||||
; GFX11-NEXT: s_endpgm
|
||||
@ -171,42 +158,37 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
|
||||
; GFX9: ; %bb.0: ; %bb
|
||||
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX9-NEXT: s_mov_b32 s7, 0
|
||||
; GFX9-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX9-NEXT: s_mov_b32 s3, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
|
||||
; GFX9-NEXT: s_sub_i32 s4, 0, s6
|
||||
; GFX9-NEXT: s_sub_i32 s2, 0, s6
|
||||
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
|
||||
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
|
||||
; GFX9-NEXT: s_mul_i32 s4, s4, s5
|
||||
; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
|
||||
; GFX9-NEXT: s_add_i32 s8, s5, s4
|
||||
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
|
||||
; GFX9-NEXT: s_mul_i32 s2, s2, s4
|
||||
; GFX9-NEXT: s_mul_hi_u32 s2, s4, s2
|
||||
; GFX9-NEXT: s_add_i32 s4, s4, s2
|
||||
; GFX9-NEXT: s_mov_b32 s2, s3
|
||||
; GFX9-NEXT: .LBB1_1: ; %bb3
|
||||
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX9-NEXT: s_not_b32 s10, s5
|
||||
; GFX9-NEXT: s_mul_i32 s9, s6, s5
|
||||
; GFX9-NEXT: s_mul_i32 s10, s6, s10
|
||||
; GFX9-NEXT: s_sub_i32 s9, s7, s9
|
||||
; GFX9-NEXT: s_add_i32 s10, s7, s10
|
||||
; GFX9-NEXT: s_cmp_ge_u32 s9, s6
|
||||
; GFX9-NEXT: s_cselect_b32 s9, s10, s9
|
||||
; GFX9-NEXT: s_sub_i32 s10, s9, s6
|
||||
; GFX9-NEXT: s_cmp_ge_u32 s9, s6
|
||||
; GFX9-NEXT: s_cselect_b32 s9, s10, s9
|
||||
; GFX9-NEXT: s_add_u32 s10, s0, s2
|
||||
; GFX9-NEXT: s_addc_u32 s11, s1, s3
|
||||
; GFX9-NEXT: s_add_i32 s7, s7, 1
|
||||
; GFX9-NEXT: s_add_u32 s4, s4, s8
|
||||
; GFX9-NEXT: s_addc_u32 s5, s5, 0
|
||||
; GFX9-NEXT: s_add_u32 s2, s2, 4
|
||||
; GFX9-NEXT: s_addc_u32 s3, s3, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s9
|
||||
; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x1000
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[10:11]
|
||||
; GFX9-NEXT: s_mul_hi_u32 s5, s2, s4
|
||||
; GFX9-NEXT: s_mul_i32 s5, s5, s6
|
||||
; GFX9-NEXT: s_sub_i32 s5, s2, s5
|
||||
; GFX9-NEXT: s_sub_i32 s7, s5, s6
|
||||
; GFX9-NEXT: s_cmp_ge_u32 s5, s6
|
||||
; GFX9-NEXT: s_cselect_b32 s5, s7, s5
|
||||
; GFX9-NEXT: s_sub_i32 s7, s5, s6
|
||||
; GFX9-NEXT: s_cmp_ge_u32 s5, s6
|
||||
; GFX9-NEXT: s_cselect_b32 s5, s7, s5
|
||||
; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], 2
|
||||
; GFX9-NEXT: s_add_u32 s8, s0, s8
|
||||
; GFX9-NEXT: s_addc_u32 s9, s1, s9
|
||||
; GFX9-NEXT: s_add_i32 s2, s2, 1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x400
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[8:9]
|
||||
; GFX9-NEXT: s_cbranch_scc0 .LBB1_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %bb2
|
||||
; GFX9-NEXT: s_endpgm
|
||||
@ -216,43 +198,38 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: s_load_dword s6, s[4:5], 0x2c
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX10-NEXT: s_mov_b32 s7, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6
|
||||
; GFX10-NEXT: s_sub_i32 s2, 0, s6
|
||||
; GFX10-NEXT: s_sub_i32 s3, 0, s6
|
||||
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
|
||||
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s4, v0
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s2, v0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-NEXT: s_mul_i32 s2, s2, s4
|
||||
; GFX10-NEXT: s_mul_hi_u32 s5, s4, s2
|
||||
; GFX10-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX10-NEXT: s_add_i32 s8, s4, s5
|
||||
; GFX10-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX10-NEXT: s_mul_i32 s3, s3, s2
|
||||
; GFX10-NEXT: s_mul_hi_u32 s4, s2, s3
|
||||
; GFX10-NEXT: s_mov_b32 s3, 0
|
||||
; GFX10-NEXT: s_add_i32 s4, s2, s4
|
||||
; GFX10-NEXT: s_mov_b32 s2, s3
|
||||
; GFX10-NEXT: .LBB1_1: ; %bb3
|
||||
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10-NEXT: s_not_b32 s9, s5
|
||||
; GFX10-NEXT: s_mul_hi_u32 s5, s2, s4
|
||||
; GFX10-NEXT: s_mul_i32 s5, s5, s6
|
||||
; GFX10-NEXT: s_sub_i32 s5, s2, s5
|
||||
; GFX10-NEXT: s_sub_i32 s7, s5, s6
|
||||
; GFX10-NEXT: s_cmp_ge_u32 s5, s6
|
||||
; GFX10-NEXT: s_cselect_b32 s5, s7, s5
|
||||
; GFX10-NEXT: s_sub_i32 s7, s5, s6
|
||||
; GFX10-NEXT: s_cmp_ge_u32 s5, s6
|
||||
; GFX10-NEXT: s_cselect_b32 s5, s7, s5
|
||||
; GFX10-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0)
|
||||
; GFX10-NEXT: s_mul_i32 s10, s6, s5
|
||||
; GFX10-NEXT: s_mul_i32 s9, s6, s9
|
||||
; GFX10-NEXT: s_sub_i32 s10, s7, s10
|
||||
; GFX10-NEXT: s_add_i32 s9, s7, s9
|
||||
; GFX10-NEXT: s_cmp_ge_u32 s10, s6
|
||||
; GFX10-NEXT: s_cselect_b32 s9, s9, s10
|
||||
; GFX10-NEXT: s_sub_i32 s10, s9, s6
|
||||
; GFX10-NEXT: s_cmp_ge_u32 s9, s6
|
||||
; GFX10-NEXT: s_cselect_b32 s9, s10, s9
|
||||
; GFX10-NEXT: s_add_u32 s10, s0, s2
|
||||
; GFX10-NEXT: s_addc_u32 s11, s1, s3
|
||||
; GFX10-NEXT: s_add_i32 s7, s7, 1
|
||||
; GFX10-NEXT: s_add_u32 s4, s4, s8
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s9
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, 0
|
||||
; GFX10-NEXT: s_add_u32 s2, s2, 4
|
||||
; GFX10-NEXT: s_addc_u32 s3, s3, 0
|
||||
; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x1000
|
||||
; GFX10-NEXT: global_store_dword v0, v1, s[10:11]
|
||||
; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], 2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX10-NEXT: s_add_u32 s8, s0, s8
|
||||
; GFX10-NEXT: s_addc_u32 s9, s1, s9
|
||||
; GFX10-NEXT: s_add_i32 s2, s2, 1
|
||||
; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x400
|
||||
; GFX10-NEXT: global_store_dword v0, v1, s[8:9]
|
||||
; GFX10-NEXT: s_cbranch_scc0 .LBB1_1
|
||||
; GFX10-NEXT: ; %bb.2: ; %bb2
|
||||
; GFX10-NEXT: s_endpgm
|
||||
@ -262,48 +239,45 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c
|
||||
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_mov_b32 s7, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6
|
||||
; GFX11-NEXT: s_sub_i32 s2, 0, s6
|
||||
; GFX11-NEXT: s_sub_i32 s3, 0, s6
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0)
|
||||
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
|
||||
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s2, v0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-NEXT: s_mul_i32 s2, s2, s4
|
||||
; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2
|
||||
; GFX11-NEXT: s_mov_b64 s[2:3], 0
|
||||
; GFX11-NEXT: s_add_i32 s8, s4, s5
|
||||
; GFX11-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX11-NEXT: s_mul_i32 s3, s3, s2
|
||||
; GFX11-NEXT: s_mul_hi_u32 s4, s2, s3
|
||||
; GFX11-NEXT: s_mov_b32 s3, 0
|
||||
; GFX11-NEXT: s_add_i32 s4, s2, s4
|
||||
; GFX11-NEXT: s_mov_b32 s2, s3
|
||||
; GFX11-NEXT: .p2align 6
|
||||
; GFX11-NEXT: .LBB1_1: ; %bb3
|
||||
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_mul_hi_u32 s5, s2, s4
|
||||
; GFX11-NEXT: s_mul_i32 s5, s5, s6
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_sub_i32 s5, s2, s5
|
||||
; GFX11-NEXT: s_sub_i32 s7, s5, s6
|
||||
; GFX11-NEXT: s_cmp_ge_u32 s5, s6
|
||||
; GFX11-NEXT: s_cselect_b32 s5, s7, s5
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_not_b32 s9, s5
|
||||
; GFX11-NEXT: s_mul_i32 s10, s6, s5
|
||||
; GFX11-NEXT: s_mul_i32 s9, s6, s9
|
||||
; GFX11-NEXT: s_sub_i32 s10, s7, s10
|
||||
; GFX11-NEXT: s_add_i32 s9, s7, s9
|
||||
; GFX11-NEXT: s_cmp_ge_u32 s10, s6
|
||||
; GFX11-NEXT: s_cselect_b32 s9, s9, s10
|
||||
; GFX11-NEXT: s_sub_i32 s7, s5, s6
|
||||
; GFX11-NEXT: s_cmp_ge_u32 s5, s6
|
||||
; GFX11-NEXT: s_cselect_b32 s5, s7, s5
|
||||
; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], 2
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX11-NEXT: s_add_u32 s8, s0, s8
|
||||
; GFX11-NEXT: s_addc_u32 s9, s1, s9
|
||||
; GFX11-NEXT: s_add_i32 s2, s2, 1
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_sub_i32 s10, s9, s6
|
||||
; GFX11-NEXT: s_cmp_ge_u32 s9, s6
|
||||
; GFX11-NEXT: s_cselect_b32 s9, s10, s9
|
||||
; GFX11-NEXT: s_add_u32 s10, s0, s2
|
||||
; GFX11-NEXT: s_addc_u32 s11, s1, s3
|
||||
; GFX11-NEXT: s_add_i32 s7, s7, 1
|
||||
; GFX11-NEXT: s_add_u32 s4, s4, s8
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, s9
|
||||
; GFX11-NEXT: s_addc_u32 s5, s5, 0
|
||||
; GFX11-NEXT: s_add_u32 s2, s2, 4
|
||||
; GFX11-NEXT: s_addc_u32 s3, s3, 0
|
||||
; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000
|
||||
; GFX11-NEXT: global_store_b32 v0, v1, s[10:11]
|
||||
; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x400
|
||||
; GFX11-NEXT: global_store_b32 v0, v1, s[8:9]
|
||||
; GFX11-NEXT: s_cbranch_scc0 .LBB1_1
|
||||
; GFX11-NEXT: ; %bb.2: ; %bb2
|
||||
; GFX11-NEXT: s_endpgm
|
||||
|
||||
42
llvm/test/CodeGen/AMDGPU/lsr-cost-model-vector-iv.ll
Normal file
42
llvm/test/CodeGen/AMDGPU/lsr-cost-model-vector-iv.ll
Normal file
@ -0,0 +1,42 @@
|
||||
; RUN: llc -O3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
|
||||
|
||||
; Reduced from rocrand's threefry2x32_20 kernel.
|
||||
; The AMDGPU LSR cost model should avoid creating a redundant VGPR induction
|
||||
; variable when the loop already has a vector IV incremented by a uniform
|
||||
; (SGPR) stride. Without the cost model fix, LSR introduces a second v_add
|
||||
; in the loop body, wasting a VGPR and a VALU slot every iteration.
|
||||
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #0
|
||||
|
||||
; CHECK-LABEL: {{^}}lsr_vector_iv_cost:
|
||||
; The loop must contain exactly one VALU add — the single vector IV update.
|
||||
; A second v_add_u32 here would mean LSR created a redundant IV.
|
||||
; CHECK: {{^}}.LBB0_1:
|
||||
; CHECK: v_add_u32
|
||||
; CHECK-NOT: v_add_u32
|
||||
; CHECK: s_cbranch
|
||||
; Kernel under test: a single loop carrying two recurrences.
;   %iv.vec - starts at the workitem id and is advanced by the kernel
;             argument %stride on every iteration; this is the vector IV
;             the CHECK lines above expect to be updated by one v_add_u32.
;   %iv.pn  - starts at 0 and carries the previous iteration's %or.
; The loop repeats while the updated vector IV (%sum1) is unsigned-less-than
; 1024; the last %or value is stored to %out after the loop.
define amdgpu_kernel void @lsr_vector_iv_cost(<2 x i32> %arg0, i32 %stride, ptr addrspace(1) %out) {
|
||||
entry:
|
||||
; Per-lane start value for the vector IV.
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
br label %loop
|
||||
|
||||
loop:
|
||||
%iv.pn = phi i32 [ 0, %entry ], [ %or, %loop ]
|
||||
%iv.vec = phi i32 [ %tid, %entry ], [ %sum1, %loop ]
|
||||
; The vector IV update: previous value plus the loop-invariant %stride.
%sum1 = add i32 %iv.vec, %stride
|
||||
; Lane 0 of the <2 x i32> kernel argument; loop-invariant.
%elt = extractelement <2 x i32> %arg0, i64 0
|
||||
; Chain of integer ops derived from the updated IV; its result (%or) feeds
; the %iv.pn recurrence and the store in %exit.
%sum2 = add i32 %sum1, %elt
|
||||
%xor = xor i32 1, %sum2
|
||||
%sum3 = add i32 %sum2, %xor
|
||||
%sum4 = add i32 %sum3, %elt
|
||||
%or = or i32 %sum4, %stride
|
||||
; NOTE(review): %shr has no users in this function; presumably left over
; from test reduction and kept so the IR matches the reduced reproducer.
%shr = lshr i32 %iv.pn, 1
|
||||
; Exit test is on the already-incremented vector IV, compared unsigned.
%cmp = icmp ult i32 %sum1, 1024
|
||||
br i1 %cmp, label %loop, label %exit
|
||||
|
||||
exit:
|
||||
; Publish the final %or so the loop's computation has an observable use.
store i32 %or, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
||||
@ -19,11 +19,10 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
|
||||
; CHECK-NEXT: .LBB0_1: ; %Flow
|
||||
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
|
||||
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v3, -4, v3
|
||||
; CHECK-NEXT: .LBB0_2: ; %Flow1
|
||||
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
|
||||
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s7
|
||||
; CHECK-NEXT: v_cmp_ne_u32_e64 s5, 0, v1
|
||||
; CHECK-NEXT: v_cmp_ne_u32_e64 s5, 0, v3
|
||||
; CHECK-NEXT: ;;#ASMSTART
|
||||
; CHECK-NEXT: ; j lastloop entry
|
||||
; CHECK-NEXT: ;;#ASMEND
|
||||
@ -33,40 +32,41 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49
|
||||
; CHECK-NEXT: .LBB0_3: ; %for.body33
|
||||
; CHECK-NEXT: ; =>This Loop Header: Depth=1
|
||||
; CHECK-NEXT: ; Child Loop BB0_6 Depth 2
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, 0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, 0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, 0
|
||||
; CHECK-NEXT: s_and_saveexec_b32 s7, s4
|
||||
; CHECK-NEXT: s_cbranch_execz .LBB0_2
|
||||
; CHECK-NEXT: ; %bb.4: ; %for.body51.preheader
|
||||
; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1
|
||||
; CHECK-NEXT: v_mov_b32_e32 v4, v2
|
||||
; CHECK-NEXT: s_mov_b32 s9, 4
|
||||
; CHECK-NEXT: s_mov_b32 s8, 0
|
||||
; CHECK-NEXT: s_mov_b32 s9, 0
|
||||
; CHECK-NEXT: s_branch .LBB0_6
|
||||
; CHECK-NEXT: .p2align 6
|
||||
; CHECK-NEXT: .LBB0_5: ; %if.end118
|
||||
; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2
|
||||
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v4, 4, v1
|
||||
; CHECK-NEXT: s_add_i32 s9, s9, 4
|
||||
; CHECK-NEXT: ;;#ASMSTART
|
||||
; CHECK-NEXT: ; backedge
|
||||
; CHECK-NEXT: ;;#ASMEND
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v3, s9, v2
|
||||
; CHECK-NEXT: v_cmp_ge_u32_e64 s5, v3, v0
|
||||
; CHECK-NEXT: v_cmp_ge_u32_e64 s5, v4, v0
|
||||
; CHECK-NEXT: s_or_b32 s8, s5, s8
|
||||
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
|
||||
; CHECK-NEXT: s_cbranch_execz .LBB0_1
|
||||
; CHECK-NEXT: .LBB0_6: ; %for.body51
|
||||
; CHECK-NEXT: ; Parent Loop BB0_3 Depth=1
|
||||
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, 1
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, v4
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, 1
|
||||
; CHECK-NEXT: s_and_saveexec_b32 s5, vcc_lo
|
||||
; CHECK-NEXT: s_cbranch_execz .LBB0_5
|
||||
; CHECK-NEXT: ; %bb.7: ; %if.then112
|
||||
; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2
|
||||
; CHECK-NEXT: s_add_i32 s10, s9, 4
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, 0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, s10
|
||||
; CHECK-NEXT: ds_write_b32 v1, v3
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, 0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v4, s9
|
||||
; CHECK-NEXT: ds_write_b32 v3, v4
|
||||
; CHECK-NEXT: s_branch .LBB0_5
|
||||
; CHECK-NEXT: .LBB0_8: ; %for.body159.preheader
|
||||
; CHECK-NEXT: s_inst_prefetch 0x2
|
||||
|
||||
@ -62,7 +62,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
|
||||
; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11]
|
||||
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
|
||||
; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
|
||||
; CHECK-NEXT: v_mov_b32_e32 v45, 0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v46, 0
|
||||
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; CHECK-NEXT: v_mov_b32_e32 v43, v0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v31, v40
|
||||
@ -91,7 +91,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
|
||||
; CHECK-NEXT: s_mov_b32 s12, s51
|
||||
; CHECK-NEXT: s_mov_b32 s13, s50
|
||||
; CHECK-NEXT: s_mov_b32 s14, s33
|
||||
; CHECK-NEXT: ds_write_b32 v45, v45 offset:15360
|
||||
; CHECK-NEXT: ds_write_b32 v46, v46 offset:15360
|
||||
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v43
|
||||
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v43
|
||||
@ -118,69 +118,66 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
|
||||
; CHECK-NEXT: v_cmpx_ne_u32_e32 0, v42
|
||||
; CHECK-NEXT: s_cbranch_execz .LBB0_25
|
||||
; CHECK-NEXT: ; %bb.1: ; %.preheader5
|
||||
; CHECK-NEXT: v_mul_lo_u32 v0, v41, 14
|
||||
; CHECK-NEXT: v_mul_lo_u32 v44, v41, 14
|
||||
; CHECK-NEXT: s_mov_b32 s4, 0
|
||||
; CHECK-NEXT: s_mov_b32 s5, 0
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v0
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v45, 0x3c04, v44
|
||||
; CHECK-NEXT: .LBB0_2: ; =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v1, s5, v44
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, s5, v45
|
||||
; CHECK-NEXT: s_add_i32 s5, s5, 1
|
||||
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v42
|
||||
; CHECK-NEXT: ds_write_b8 v1, v45
|
||||
; CHECK-NEXT: ds_write_b8 v0, v46
|
||||
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
|
||||
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
|
||||
; CHECK-NEXT: s_cbranch_execnz .LBB0_2
|
||||
; CHECK-NEXT: ; %bb.3:
|
||||
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v42
|
||||
; CHECK-NEXT: s_mov_b32 s55, 0
|
||||
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v45
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v46, -1, v42
|
||||
; CHECK-NEXT: s_mov_b32 s53, 0
|
||||
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v46
|
||||
; CHECK-NEXT: s_and_b32 exec_lo, exec_lo, vcc_lo
|
||||
; CHECK-NEXT: s_cbranch_execz .LBB0_25
|
||||
; CHECK-NEXT: ; %bb.4:
|
||||
; CHECK-NEXT: v_lshlrev_b32_e32 v43, 10, v43
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v46, 0x3c05, v0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v47, 0
|
||||
; CHECK-NEXT: s_mov_b32 s53, 0
|
||||
; CHECK-NEXT: s_mov_b32 s54, 0
|
||||
; CHECK-NEXT: .LBB0_5: ; =>This Loop Header: Depth=1
|
||||
; CHECK-NEXT: ; Child Loop BB0_8 Depth 2
|
||||
; CHECK-NEXT: ; Child Loop BB0_20 Depth 2
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, s55, v44
|
||||
; CHECK-NEXT: s_lshl_b32 s4, s55, 5
|
||||
; CHECK-NEXT: s_add_i32 s54, s55, 1
|
||||
; CHECK-NEXT: s_add_i32 s5, s55, 5
|
||||
; CHECK-NEXT: v_or3_b32 v57, s4, v43, s54
|
||||
; CHECK-NEXT: s_mov_b32 s4, s53
|
||||
; CHECK-NEXT: s_lshl_b32 s5, s53, 5
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v45
|
||||
; CHECK-NEXT: s_add_i32 s53, s53, 1
|
||||
; CHECK-NEXT: s_add_i32 s4, s4, 5
|
||||
; CHECK-NEXT: v_or3_b32 v57, s5, v43, s53
|
||||
; CHECK-NEXT: v_mov_b32_e32 v58, s53
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: ds_read_u8 v56, v0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v58, s54
|
||||
; CHECK-NEXT: s_mov_b32 s68, exec_lo
|
||||
; CHECK-NEXT: v_cmpx_lt_u32_e64 s5, v42
|
||||
; CHECK-NEXT: s_mov_b32 s55, exec_lo
|
||||
; CHECK-NEXT: v_cmpx_lt_u32_e64 s4, v42
|
||||
; CHECK-NEXT: s_cbranch_execz .LBB0_17
|
||||
; CHECK-NEXT: ; %bb.6: ; %.preheader2
|
||||
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
|
||||
; CHECK-NEXT: s_mov_b32 s69, 0
|
||||
; CHECK-NEXT: s_mov_b32 s80, 0
|
||||
; CHECK-NEXT: s_mov_b32 s68, 0
|
||||
; CHECK-NEXT: s_mov_b32 s69, s53
|
||||
; CHECK-NEXT: s_branch .LBB0_8
|
||||
; CHECK-NEXT: .LBB0_7: ; in Loop: Header=BB0_8 Depth=2
|
||||
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81
|
||||
; CHECK-NEXT: s_add_i32 s80, s80, 4
|
||||
; CHECK-NEXT: s_add_i32 s4, s55, s80
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, s80, v57
|
||||
; CHECK-NEXT: s_add_i32 s5, s4, 5
|
||||
; CHECK-NEXT: s_add_i32 s4, s4, 1
|
||||
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s5, v42
|
||||
; CHECK-NEXT: v_mov_b32_e32 v58, s4
|
||||
; CHECK-NEXT: s_or_b32 s69, vcc_lo, s69
|
||||
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s69
|
||||
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s80
|
||||
; CHECK-NEXT: s_add_i32 s4, s69, 4
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v57, 4, v57
|
||||
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s4, v42
|
||||
; CHECK-NEXT: v_mov_b32_e32 v58, s69
|
||||
; CHECK-NEXT: s_or_b32 s68, vcc_lo, s68
|
||||
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s68
|
||||
; CHECK-NEXT: s_cbranch_execz .LBB0_16
|
||||
; CHECK-NEXT: .LBB0_8: ; Parent Loop BB0_5 Depth=1
|
||||
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v59, s80, v46
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v58, s80, v57
|
||||
; CHECK-NEXT: ds_read_u8 v0, v59
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v58, s69, v45
|
||||
; CHECK-NEXT: s_mov_b32 s69, s4
|
||||
; CHECK-NEXT: ds_read_u8 v0, v58
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; CHECK-NEXT: s_and_saveexec_b32 s81, s4
|
||||
; CHECK-NEXT: v_cmp_eq_u16_sdwa s5, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; CHECK-NEXT: s_and_saveexec_b32 s80, s5
|
||||
; CHECK-NEXT: s_cbranch_execz .LBB0_10
|
||||
; CHECK-NEXT: ; %bb.9: ; in Loop: Header=BB0_8 Depth=2
|
||||
; CHECK-NEXT: v_mov_b32_e32 v31, v40
|
||||
@ -199,13 +196,13 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
|
||||
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; CHECK-NEXT: ds_write_b32 v0, v58
|
||||
; CHECK-NEXT: ds_write_b32 v0, v57
|
||||
; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_8 Depth=2
|
||||
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81
|
||||
; CHECK-NEXT: ds_read_u8 v0, v59 offset:1
|
||||
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s80
|
||||
; CHECK-NEXT: ds_read_u8 v0, v58 offset:1
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; CHECK-NEXT: s_and_saveexec_b32 s81, s4
|
||||
; CHECK-NEXT: s_and_saveexec_b32 s80, s4
|
||||
; CHECK-NEXT: s_cbranch_execz .LBB0_12
|
||||
; CHECK-NEXT: ; %bb.11: ; in Loop: Header=BB0_8 Depth=2
|
||||
; CHECK-NEXT: v_mov_b32_e32 v31, v40
|
||||
@ -221,17 +218,17 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
|
||||
; CHECK-NEXT: s_mov_b32 s12, s51
|
||||
; CHECK-NEXT: s_mov_b32 s13, s50
|
||||
; CHECK-NEXT: s_mov_b32 s14, s33
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v60, 1, v58
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v59, 1, v57
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
|
||||
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; CHECK-NEXT: ds_write_b32 v0, v60
|
||||
; CHECK-NEXT: ds_write_b32 v0, v59
|
||||
; CHECK-NEXT: .LBB0_12: ; in Loop: Header=BB0_8 Depth=2
|
||||
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81
|
||||
; CHECK-NEXT: ds_read_u8 v0, v59 offset:2
|
||||
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s80
|
||||
; CHECK-NEXT: ds_read_u8 v0, v58 offset:2
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; CHECK-NEXT: s_and_saveexec_b32 s81, s4
|
||||
; CHECK-NEXT: s_and_saveexec_b32 s80, s4
|
||||
; CHECK-NEXT: s_cbranch_execz .LBB0_14
|
||||
; CHECK-NEXT: ; %bb.13: ; in Loop: Header=BB0_8 Depth=2
|
||||
; CHECK-NEXT: v_mov_b32_e32 v31, v40
|
||||
@ -247,17 +244,17 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
|
||||
; CHECK-NEXT: s_mov_b32 s12, s51
|
||||
; CHECK-NEXT: s_mov_b32 s13, s50
|
||||
; CHECK-NEXT: s_mov_b32 s14, s33
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v60, 2, v58
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v59, 2, v57
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
|
||||
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; CHECK-NEXT: ds_write_b32 v0, v60
|
||||
; CHECK-NEXT: ds_write_b32 v0, v59
|
||||
; CHECK-NEXT: .LBB0_14: ; in Loop: Header=BB0_8 Depth=2
|
||||
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s81
|
||||
; CHECK-NEXT: ds_read_u8 v0, v59 offset:3
|
||||
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s80
|
||||
; CHECK-NEXT: ds_read_u8 v0, v58 offset:3
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; CHECK-NEXT: s_and_saveexec_b32 s81, s4
|
||||
; CHECK-NEXT: s_and_saveexec_b32 s80, s4
|
||||
; CHECK-NEXT: s_cbranch_execz .LBB0_7
|
||||
; CHECK-NEXT: ; %bb.15: ; in Loop: Header=BB0_8 Depth=2
|
||||
; CHECK-NEXT: v_mov_b32_e32 v31, v40
|
||||
@ -273,19 +270,18 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
|
||||
; CHECK-NEXT: s_mov_b32 s12, s51
|
||||
; CHECK-NEXT: s_mov_b32 s13, s50
|
||||
; CHECK-NEXT: s_mov_b32 s14, s33
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v58, 3, v58
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v58, 3, v57
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
|
||||
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; CHECK-NEXT: ds_write_b32 v0, v58
|
||||
; CHECK-NEXT: s_branch .LBB0_7
|
||||
; CHECK-NEXT: .LBB0_16: ; %Flow43
|
||||
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
|
||||
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s69
|
||||
; CHECK-NEXT: v_mov_b32_e32 v57, v0
|
||||
; CHECK-NEXT: .LBB0_17: ; %Flow44
|
||||
; CHECK-NEXT: .LBB0_16: ; %Flow32
|
||||
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
|
||||
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s68
|
||||
; CHECK-NEXT: .LBB0_17: ; %Flow33
|
||||
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
|
||||
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
|
||||
; CHECK-NEXT: s_mov_b32 s55, exec_lo
|
||||
; CHECK-NEXT: v_cmpx_lt_u32_e64 v58, v42
|
||||
; CHECK-NEXT: s_cbranch_execz .LBB0_23
|
||||
@ -306,7 +302,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
|
||||
; CHECK-NEXT: .LBB0_20: ; Parent Loop BB0_5 Depth=1
|
||||
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v58
|
||||
; CHECK-NEXT: ds_read_u8 v0, v0
|
||||
; CHECK-NEXT: ds_read_u8 v0, v0 offset:15364
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v56, v0 src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; CHECK-NEXT: s_and_saveexec_b32 s69, s4
|
||||
@ -330,24 +326,22 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
|
||||
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; CHECK-NEXT: ds_write_b32 v0, v57
|
||||
; CHECK-NEXT: s_branch .LBB0_19
|
||||
; CHECK-NEXT: .LBB0_22: ; %Flow41
|
||||
; CHECK-NEXT: .LBB0_22: ; %Flow30
|
||||
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
|
||||
; CHECK-NEXT: s_inst_prefetch 0x2
|
||||
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s68
|
||||
; CHECK-NEXT: .LBB0_23: ; %Flow42
|
||||
; CHECK-NEXT: .LBB0_23: ; %Flow31
|
||||
; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1
|
||||
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
|
||||
; CHECK-NEXT: ; %bb.24: ; in Loop: Header=BB0_5 Depth=1
|
||||
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s54, v45
|
||||
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s53, v46
|
||||
; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v47
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46
|
||||
; CHECK-NEXT: s_mov_b32 s55, s54
|
||||
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
|
||||
; CHECK-NEXT: s_and_b32 s4, exec_lo, s4
|
||||
; CHECK-NEXT: s_or_b32 s53, s4, s53
|
||||
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s53
|
||||
; CHECK-NEXT: s_or_b32 s54, s4, s54
|
||||
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s54
|
||||
; CHECK-NEXT: s_cbranch_execnz .LBB0_5
|
||||
; CHECK-NEXT: .LBB0_25: ; %Flow49
|
||||
; CHECK-NEXT: .LBB0_25: ; %Flow38
|
||||
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52
|
||||
; CHECK-NEXT: v_mov_b32_e32 v31, v40
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, 1
|
||||
@ -828,7 +822,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
|
||||
; CHECK-NEXT: s_mov_b32 s13, s50
|
||||
; CHECK-NEXT: s_mov_b32 s14, s33
|
||||
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; CHECK-NEXT: v_mul_lo_u32 v46, v0, 14
|
||||
; CHECK-NEXT: v_mul_lo_u32 v44, v0, 14
|
||||
; CHECK-NEXT: v_mov_b32_e32 v31, v40
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, 1
|
||||
; CHECK-NEXT: s_getpc_b64 s[16:17]
|
||||
@ -842,7 +836,7 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
|
||||
; CHECK-NEXT: s_mov_b32 s13, s50
|
||||
; CHECK-NEXT: s_mov_b32 s14, s33
|
||||
; CHECK-NEXT: ds_write_b32 v43, v43 offset:15360
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v46
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v45, 0x3c04, v44
|
||||
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v42
|
||||
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v42
|
||||
@ -866,52 +860,49 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
|
||||
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; CHECK-NEXT: v_mov_b32_e32 v41, v0
|
||||
; CHECK-NEXT: v_lshlrev_b32_e32 v42, 10, v42
|
||||
; CHECK-NEXT: s_mov_b32 s4, 0
|
||||
; CHECK-NEXT: s_mov_b32 s53, 0
|
||||
; CHECK-NEXT: s_mov_b32 s52, 0
|
||||
; CHECK-NEXT: ds_write_b8 v46, v43 offset:15364
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v45, -1, v41
|
||||
; CHECK-NEXT: ds_write_b8 v44, v43 offset:15364
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v46, -1, v41
|
||||
; CHECK-NEXT: .LBB1_1: ; %.37
|
||||
; CHECK-NEXT: ; =>This Loop Header: Depth=1
|
||||
; CHECK-NEXT: ; Child Loop BB1_3 Depth 2
|
||||
; CHECK-NEXT: ; Child Loop BB1_8 Depth 2
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v44
|
||||
; CHECK-NEXT: s_lshl_b32 s5, s4, 5
|
||||
; CHECK-NEXT: s_add_i32 s53, s4, 1
|
||||
; CHECK-NEXT: s_add_i32 s6, s4, 5
|
||||
; CHECK-NEXT: v_or3_b32 v47, s5, v42, s53
|
||||
; CHECK-NEXT: s_mov_b32 s4, s53
|
||||
; CHECK-NEXT: s_lshl_b32 s6, s53, 5
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, s4, v45
|
||||
; CHECK-NEXT: s_add_i32 s53, s53, 1
|
||||
; CHECK-NEXT: s_add_i32 s5, s4, 5
|
||||
; CHECK-NEXT: v_or3_b32 v56, s6, v42, s53
|
||||
; CHECK-NEXT: v_mov_b32_e32 v57, s53
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: ds_read_u8 v46, v0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v56, s53
|
||||
; CHECK-NEXT: s_mov_b32 s5, exec_lo
|
||||
; CHECK-NEXT: v_cmpx_lt_u32_e64 s6, v41
|
||||
; CHECK-NEXT: ds_read_u8 v47, v0
|
||||
; CHECK-NEXT: s_mov_b32 s4, exec_lo
|
||||
; CHECK-NEXT: v_cmpx_lt_u32_e64 s5, v41
|
||||
; CHECK-NEXT: s_cbranch_execz .LBB1_5
|
||||
; CHECK-NEXT: ; %bb.2: ; %.53.preheader
|
||||
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
|
||||
; CHECK-NEXT: s_mov_b32 s6, 0
|
||||
; CHECK-NEXT: s_mov_b32 s7, 0
|
||||
; CHECK-NEXT: .LBB1_3: ; %.53
|
||||
; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1
|
||||
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
|
||||
; CHECK-NEXT: s_add_i32 s7, s7, 4
|
||||
; CHECK-NEXT: s_add_i32 s7, s5, 4
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
|
||||
; CHECK-NEXT: s_add_i32 s8, s4, s7
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, s7, v47
|
||||
; CHECK-NEXT: s_add_i32 s9, s8, 5
|
||||
; CHECK-NEXT: s_add_i32 s8, s8, 1
|
||||
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s9, v41
|
||||
; CHECK-NEXT: v_mov_b32_e32 v56, s8
|
||||
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s7, v41
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v56, 4, v56
|
||||
; CHECK-NEXT: v_mov_b32_e32 v57, s5
|
||||
; CHECK-NEXT: s_mov_b32 s5, s7
|
||||
; CHECK-NEXT: s_or_b32 s6, vcc_lo, s6
|
||||
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s6
|
||||
; CHECK-NEXT: s_cbranch_execnz .LBB1_3
|
||||
; CHECK-NEXT: ; %bb.4: ; %Flow3
|
||||
; CHECK-NEXT: ; %bb.4: ; %Flow4
|
||||
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
|
||||
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s6
|
||||
; CHECK-NEXT: v_mov_b32_e32 v47, v0
|
||||
; CHECK-NEXT: .LBB1_5: ; %Flow4
|
||||
; CHECK-NEXT: .LBB1_5: ; %Flow5
|
||||
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
|
||||
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s5
|
||||
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4
|
||||
; CHECK-NEXT: s_mov_b32 s54, exec_lo
|
||||
; CHECK-NEXT: v_cmpx_lt_u32_e64 v56, v41
|
||||
; CHECK-NEXT: v_cmpx_lt_u32_e64 v57, v41
|
||||
; CHECK-NEXT: s_cbranch_execz .LBB1_11
|
||||
; CHECK-NEXT: ; %bb.6: ; %.103.preheader
|
||||
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
|
||||
@ -922,19 +913,19 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
|
||||
; CHECK-NEXT: .LBB1_7: ; %.114
|
||||
; CHECK-NEXT: ; in Loop: Header=BB1_8 Depth=2
|
||||
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s64
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v57, 1, v57
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v56, 1, v56
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47
|
||||
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v56, v41
|
||||
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, v57, v41
|
||||
; CHECK-NEXT: s_or_b32 s55, vcc_lo, s55
|
||||
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s55
|
||||
; CHECK-NEXT: s_cbranch_execz .LBB1_10
|
||||
; CHECK-NEXT: .LBB1_8: ; %.103
|
||||
; CHECK-NEXT: ; Parent Loop BB1_1 Depth=1
|
||||
; CHECK-NEXT: ; => This Inner Loop Header: Depth=2
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v56
|
||||
; CHECK-NEXT: ds_read_u8 v0, v0
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v0, v44, v57
|
||||
; CHECK-NEXT: ds_read_u8 v0, v0 offset:15364
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v46, v0 src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; CHECK-NEXT: v_cmp_eq_u16_sdwa s4, v47, v0 src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; CHECK-NEXT: s_and_saveexec_b32 s64, s4
|
||||
; CHECK-NEXT: s_cbranch_execz .LBB1_7
|
||||
; CHECK-NEXT: ; %bb.9: ; %.110
|
||||
@ -955,23 +946,22 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43
|
||||
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; CHECK-NEXT: ds_write_b32 v0, v47
|
||||
; CHECK-NEXT: ds_write_b32 v0, v56
|
||||
; CHECK-NEXT: s_branch .LBB1_7
|
||||
; CHECK-NEXT: .LBB1_10: ; %Flow
|
||||
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
|
||||
; CHECK-NEXT: s_inst_prefetch 0x2
|
||||
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
|
||||
; CHECK-NEXT: .LBB1_11: ; %Flow2
|
||||
; CHECK-NEXT: .LBB1_11: ; %Flow3
|
||||
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
|
||||
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s54
|
||||
; CHECK-NEXT: ; %bb.12: ; %.32
|
||||
; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1
|
||||
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s53, v45
|
||||
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s53, v46
|
||||
; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v43
|
||||
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4
|
||||
; CHECK-NEXT: s_and_b32 s4, exec_lo, s4
|
||||
; CHECK-NEXT: s_or_b32 s52, s4, s52
|
||||
; CHECK-NEXT: s_mov_b32 s4, s53
|
||||
; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s52
|
||||
; CHECK-NEXT: s_cbranch_execnz .LBB1_1
|
||||
; CHECK-NEXT: ; %bb.13: ; %.119
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -1060,38 +1060,38 @@ define void @memset_pattern_i64_as7_len33_dynval(ptr addrspace(7) inreg align 16
|
||||
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, v1
|
||||
; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v2
|
||||
; GFX942-GISEL-NEXT: v_add_u32_e32 v2, s16, v0
|
||||
; GFX942-GISEL-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GFX942-GISEL-NEXT: v_add_u32_e32 v1, s16, v0
|
||||
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
|
||||
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
|
||||
; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
|
||||
; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v2
|
||||
; GFX942-GISEL-NEXT: .LBB14_1: ; %memset.pattern-expansion-main-body
|
||||
; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 32, v2
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:16
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:32
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:48
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:64
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:80
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:96
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:112
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:128
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:144
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:160
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:176
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:192
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:208
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:224
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:240
|
||||
; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 32, v0
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:16
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:32
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:48
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:64
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:80
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:96
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:112
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:128
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:144
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:160
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:176
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:192
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:208
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:224
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx4 v[4:7], v3, s[0:3], 0 offen offset:240
|
||||
; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX942-GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
|
||||
; GFX942-GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
|
||||
; GFX942-GISEL-NEXT: v_add_u32_e32 v1, 0x2000, v1
|
||||
; GFX942-GISEL-NEXT: v_add_u32_e32 v3, 0x2000, v3
|
||||
; GFX942-GISEL-NEXT: s_and_b64 vcc, exec, s[4:5]
|
||||
; GFX942-GISEL-NEXT: s_cbranch_vccnz .LBB14_1
|
||||
; GFX942-GISEL-NEXT: ; %bb.2: ; %memset.pattern-expansion-residual-body.preheader
|
||||
; GFX942-GISEL-NEXT: v_add_u32_e32 v0, s16, v0
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx2 v[4:5], v0, s[0:3], 0 offen offset:256
|
||||
; GFX942-GISEL-NEXT: buffer_store_dwordx2 v[4:5], v2, s[0:3], 0 offen offset:256
|
||||
; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
%p = getelementptr inbounds i8, ptr addrspace(7) %a, i32 %offset
|
||||
|
||||
@ -1140,7 +1140,6 @@ define void @move_to_valu_vgpr_operand_phi(ptr addrspace(3) %out) {
|
||||
; SI-LABEL: move_to_valu_vgpr_operand_phi:
|
||||
; SI: ; %bb.0: ; %bb0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: v_add_i32_e32 v0, vcc, 28, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, 1
|
||||
; SI-NEXT: s_and_b64 vcc, exec, 0
|
||||
; SI-NEXT: s_mov_b32 m0, -1
|
||||
@ -1157,7 +1156,8 @@ define void @move_to_valu_vgpr_operand_phi(ptr addrspace(3) %out) {
|
||||
; SI-NEXT: s_cbranch_scc1 .LBB20_1
|
||||
; SI-NEXT: ; %bb.3: ; %bb2
|
||||
; SI-NEXT: ; in Loop: Header=BB20_2 Depth=1
|
||||
; SI-NEXT: ds_write_b32 v0, v1
|
||||
; SI-NEXT: v_add_i32_e64 v2, s[4:5], 28, v0
|
||||
; SI-NEXT: ds_write_b32 v2, v1
|
||||
; SI-NEXT: s_mov_b64 vcc, vcc
|
||||
; SI-NEXT: s_cbranch_vccz .LBB20_1
|
||||
; SI-NEXT: ; %bb.4: ; %DummyReturnBlock
|
||||
@ -1167,7 +1167,6 @@ define void @move_to_valu_vgpr_operand_phi(ptr addrspace(3) %out) {
|
||||
; VI-LABEL: move_to_valu_vgpr_operand_phi:
|
||||
; VI: ; %bb.0: ; %bb0
|
||||
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; VI-NEXT: v_add_u32_e32 v0, vcc, 28, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, 1
|
||||
; VI-NEXT: s_and_b64 vcc, exec, 0
|
||||
; VI-NEXT: s_mov_b32 m0, -1
|
||||
@ -1184,7 +1183,7 @@ define void @move_to_valu_vgpr_operand_phi(ptr addrspace(3) %out) {
|
||||
; VI-NEXT: s_cbranch_scc1 .LBB20_1
|
||||
; VI-NEXT: ; %bb.3: ; %bb2
|
||||
; VI-NEXT: ; in Loop: Header=BB20_2 Depth=1
|
||||
; VI-NEXT: ds_write_b32 v0, v1
|
||||
; VI-NEXT: ds_write_b32 v0, v1 offset:28
|
||||
; VI-NEXT: s_mov_b64 vcc, vcc
|
||||
; VI-NEXT: s_cbranch_vccz .LBB20_1
|
||||
; VI-NEXT: ; %bb.4: ; %DummyReturnBlock
|
||||
|
||||
@ -247,27 +247,30 @@ define amdgpu_kernel void @simple_test_v_loop(ptr addrspace(1) %dst, ptr addrspa
|
||||
; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc
|
||||
; SI-NEXT: s_cbranch_execz .LBB4_3
|
||||
; SI-NEXT: ; %bb.1: ; %loop.preheader
|
||||
; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
|
||||
; SI-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x9
|
||||
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; SI-NEXT: s_mov_b64 s[0:1], 0
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s8, 64
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s9
|
||||
; SI-NEXT: v_add_i32_e32 v0, vcc, s8, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s13
|
||||
; SI-NEXT: v_add_i32_e32 v0, vcc, s12, v0
|
||||
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_mov_b32 s4, s10
|
||||
; SI-NEXT: s_mov_b32 s5, s11
|
||||
; SI-NEXT: s_mov_b32 s7, s3
|
||||
; SI-NEXT: s_mov_b32 s4, s14
|
||||
; SI-NEXT: s_mov_b32 s5, s15
|
||||
; SI-NEXT: s_mov_b32 s3, s7
|
||||
; SI-NEXT: s_mov_b32 s0, s2
|
||||
; SI-NEXT: s_mov_b32 s1, s2
|
||||
; SI-NEXT: .LBB4_2: ; %loop
|
||||
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
|
||||
; SI-NEXT: s_add_i32 s8, s8, -1
|
||||
; SI-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
|
||||
; SI-NEXT: s_add_u32 s0, s0, 4
|
||||
; SI-NEXT: s_addc_u32 s1, s1, 0
|
||||
; SI-NEXT: s_cmpk_lg_i32 s0, 0x100
|
||||
; SI-NEXT: v_add_i32_e32 v0, vcc, 4, v0
|
||||
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; SI-NEXT: s_cbranch_scc1 .LBB4_2
|
||||
; SI-NEXT: .LBB4_3: ; %exit
|
||||
; SI-NEXT: s_endpgm
|
||||
|
||||
@ -12,40 +12,39 @@ define amdgpu_kernel void @ds_prefetch_pattern(ptr addrspace(3) %lds, ptr addrsp
|
||||
; CHECK-NEXT: s_load_b32 s1, s[4:5], 0x0 nv
|
||||
; CHECK-NEXT: s_load_b32 s0, s[4:5], 0x10 nv
|
||||
; CHECK-NEXT: v_and_b32_e32 v12, 0x3ff, v0
|
||||
; CHECK-NEXT: s_wait_kmcnt 0x0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v4, 0
|
||||
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; CHECK-NEXT: v_lshl_add_u32 v1, v12, 8, s1
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, 0
|
||||
; CHECK-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
|
||||
; CHECK-NEXT: v_mov_b32_e32 v7, v4
|
||||
; CHECK-NEXT: s_wait_kmcnt 0x0
|
||||
; CHECK-NEXT: v_lshl_add_u32 v13, v12, 8, s1
|
||||
; CHECK-NEXT: s_mov_b32 s1, 0
|
||||
; CHECK-NEXT: ds_load_b128 v[4:7], v1
|
||||
; CHECK-NEXT: ds_load_b128 v[8:11], v1 offset:16
|
||||
; CHECK-NEXT: v_dual_add_nc_u32 v13, 32, v1 :: v_dual_mov_b32 v1, v0
|
||||
; CHECK-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v0
|
||||
; CHECK-NEXT: ds_load_b128 v[8:11], v13
|
||||
; CHECK-NEXT: ds_load_b128 v[0:3], v13 offset:16
|
||||
; CHECK-NEXT: s_wait_dscnt 0x0
|
||||
; CHECK-NEXT: .LBB0_1: ; %loop
|
||||
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: s_barrier_signal -1
|
||||
; CHECK-NEXT: s_wait_dscnt 0x1
|
||||
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
|
||||
; CHECK-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[6:7]
|
||||
; CHECK-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[4:5]
|
||||
; CHECK-NEXT: s_add_co_i32 s1, s1, 1
|
||||
; CHECK-NEXT: s_wait_dscnt 0x1
|
||||
; CHECK-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[10:11]
|
||||
; CHECK-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[8:9]
|
||||
; CHECK-NEXT: v_lshl_add_u32 v14, s1, 5, v13
|
||||
; CHECK-NEXT: s_cmp_lt_i32 s1, s0
|
||||
; CHECK-NEXT: s_wait_dscnt 0x0
|
||||
; CHECK-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[10:11]
|
||||
; CHECK-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[8:9]
|
||||
; CHECK-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[2:3]
|
||||
; CHECK-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[0:1]
|
||||
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; CHECK-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[2:3]
|
||||
; CHECK-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[0:1]
|
||||
; CHECK-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[6:7]
|
||||
; CHECK-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[4:5]
|
||||
; CHECK-NEXT: s_barrier_wait -1
|
||||
; CHECK-NEXT: ds_load_b128 v[4:7], v13
|
||||
; CHECK-NEXT: ds_load_b128 v[8:11], v13 offset:16
|
||||
; CHECK-NEXT: v_add_nc_u32_e32 v13, 32, v13
|
||||
; CHECK-NEXT: ds_load_b128 v[8:11], v14
|
||||
; CHECK-NEXT: ds_load_b128 v[0:3], v14 offset:16
|
||||
; CHECK-NEXT: s_cbranch_scc1 .LBB0_1
|
||||
; CHECK-NEXT: ; %bb.2: ; %exit
|
||||
; CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; CHECK-NEXT: s_wait_kmcnt 0x0
|
||||
; CHECK-NEXT: global_store_b128 v12, v[0:3], s[0:1] scale_offset
|
||||
; CHECK-NEXT: global_store_b128 v12, v[4:7], s[0:1] scale_offset
|
||||
; CHECK-NEXT: s_endpgm
|
||||
entry:
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
|
||||
@ -4,17 +4,15 @@
|
||||
|
||||
; OPT-LABEL: @test_local_atomicrmw_addressing_loop_uniform_index_max_offset_i32(
|
||||
|
||||
; OPT: .lr.ph.preheader:
|
||||
; OPT: %scevgep2 = getelementptr i8, ptr addrspace(3) %arg1, i32 65532
|
||||
; OPT: br label %.lr.ph
|
||||
; OPT: .lr.ph:
|
||||
; OPT: %lsr.iv3 = phi ptr addrspace(3) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
|
||||
; OPT: %lsr.iv2 = phi ptr addrspace(3) [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
|
||||
; OPT: %lsr.iv1 = phi ptr addrspace(3) [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
|
||||
; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
|
||||
; OPT: %tmp4 = atomicrmw add ptr addrspace(3) %lsr.iv3, i32 undef seq_cst, align 4
|
||||
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(3) %lsr.iv2, i32 65532
|
||||
; OPT: %tmp4 = atomicrmw add ptr addrspace(3) %scevgep4, i32 undef seq_cst, align 4
|
||||
; OPT: %tmp7 = atomicrmw add ptr addrspace(3) %lsr.iv1, i32 undef seq_cst, align 4
|
||||
; OPT: %0 = atomicrmw add ptr addrspace(3) %lsr.iv1, i32 %tmp8 seq_cst, align 4
|
||||
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(3) %lsr.iv3, i32 4
|
||||
; OPT: %scevgep3 = getelementptr i8, ptr addrspace(3) %lsr.iv2, i32 4
|
||||
; OPT: br i1 %exitcond
|
||||
define amdgpu_kernel void @test_local_atomicrmw_addressing_loop_uniform_index_max_offset_i32(ptr addrspace(3) noalias nocapture %arg0, ptr addrspace(3) noalias nocapture readonly %arg1, i32 %n) #0 {
|
||||
bb:
|
||||
@ -46,15 +44,13 @@ bb:
|
||||
|
||||
; OPT-LABEL: test_local_cmpxchg_addressing_loop_uniform_index_max_offset_i32(
|
||||
|
||||
; OPT: .lr.ph.preheader:
|
||||
; OPT: %scevgep2 = getelementptr i8, ptr addrspace(3) %arg1, i32 65532
|
||||
; OPT: br label %.lr.ph
|
||||
; OPT: .lr.ph:
|
||||
; OPT: %lsr.iv3 = phi ptr addrspace(3) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
|
||||
; OPT: %lsr.iv2 = phi ptr addrspace(3) [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
|
||||
; OPT: %lsr.iv1 = phi ptr addrspace(3) [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
|
||||
; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
|
||||
; OPT: %tmp4 = cmpxchg ptr addrspace(3) %lsr.iv3, i32 undef, i32 undef seq_cst monotonic, align 4
|
||||
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(3) %lsr.iv3, i32 4
|
||||
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(3) %lsr.iv2, i32 65532
|
||||
; OPT: %tmp4 = cmpxchg ptr addrspace(3) %scevgep4, i32 undef, i32 undef seq_cst monotonic, align 4
|
||||
; OPT: %scevgep3 = getelementptr i8, ptr addrspace(3) %lsr.iv2, i32 4
|
||||
define amdgpu_kernel void @test_local_cmpxchg_addressing_loop_uniform_index_max_offset_i32(ptr addrspace(3) noalias nocapture %arg0, ptr addrspace(3) noalias nocapture readonly %arg1, i32 %n) #0 {
|
||||
bb:
|
||||
%tmp = icmp sgt i32 %n, 0
|
||||
|
||||
@ -4,13 +4,11 @@
|
||||
; spaces are correctly handled.
|
||||
|
||||
; OPT-LABEL: @test_global_addressing_loop_uniform_index_max_offset_i32(
|
||||
; OPT: .lr.ph.preheader:
|
||||
; OPT: %scevgep2 = getelementptr i8, ptr addrspace(1) %arg1, i64 4095
|
||||
; OPT: br label %.lr.ph
|
||||
; OPT: {{^}}.lr.ph:
|
||||
; OPT: %lsr.iv3 = phi ptr addrspace(1) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
|
||||
; OPT: load i8, ptr addrspace(1) %lsr.iv3, align 1
|
||||
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(1) %lsr.iv3, i64 1
|
||||
; OPT: %lsr.iv2 = phi ptr addrspace(1) [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
|
||||
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(1) %lsr.iv2, i64 4095
|
||||
; OPT: load i8, ptr addrspace(1) %scevgep4, align 1
|
||||
; OPT: %scevgep3 = getelementptr i8, ptr addrspace(1) %lsr.iv2, i64 1
|
||||
define amdgpu_kernel void @test_global_addressing_loop_uniform_index_max_offset_i32(ptr addrspace(1) noalias nocapture %arg0, ptr addrspace(1) noalias nocapture readonly %arg1, i32 %n) #0 {
|
||||
bb:
|
||||
%tmp = icmp sgt i32 %n, 0
|
||||
@ -80,13 +78,11 @@ bb:
|
||||
}
|
||||
|
||||
; OPT-LABEL: @test_local_addressing_loop_uniform_index_max_offset_i32(
|
||||
; OPT: .lr.ph.preheader:
|
||||
; OPT: %scevgep2 = getelementptr i8, ptr addrspace(3) %arg1, i32 65535
|
||||
; OPT: br label %.lr.ph
|
||||
; OPT: {{^}}.lr.ph
|
||||
; OPT: %lsr.iv3 = phi ptr addrspace(3) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
|
||||
; OPT: %tmp4 = load i8, ptr addrspace(3) %lsr.iv3, align 1
|
||||
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(3) %lsr.iv3, i32 1
|
||||
; OPT: {{^}}.lr.ph:
|
||||
; OPT: %lsr.iv2 = phi ptr addrspace(3) [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
|
||||
; OPT: %scevgep4 = getelementptr i8, ptr addrspace(3) %lsr.iv2, i32 65535
|
||||
; OPT: %tmp4 = load i8, ptr addrspace(3) %scevgep4, align 1
|
||||
; OPT: %scevgep3 = getelementptr i8, ptr addrspace(3) %lsr.iv2, i32 1
|
||||
define amdgpu_kernel void @test_local_addressing_loop_uniform_index_max_offset_i32(ptr addrspace(1) noalias nocapture %arg0, ptr addrspace(3) noalias nocapture readonly %arg1, i32 %n) #0 {
|
||||
bb:
|
||||
%tmp = icmp sgt i32 %n, 0
|
||||
|
||||
@ -12,22 +12,20 @@ define amdgpu_kernel void @scaledregtest() local_unnamed_addr {
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
||||
; CHECK: loopexit:
|
||||
; CHECK-NEXT: [[SCEVGEP11_LCSSA:%.*]] = phi ptr addrspace(5) [ [[SCEVGEP11:%.*]], [[FOR_BODY]] ]
|
||||
; CHECK-NEXT: [[SCEVGEP13_LCSSA:%.*]] = phi ptr [ [[SCEVGEP13:%.*]], [[FOR_BODY]] ]
|
||||
; CHECK-NEXT: br label [[FOR_BODY_1:%.*]]
|
||||
; CHECK: for.body.1:
|
||||
; CHECK-NEXT: [[LSR_IV5:%.*]] = phi ptr addrspace(5) [ [[SCEVGEP6:%.*]], [[FOR_BODY_1]] ], [ [[SCEVGEP11_LCSSA]], [[LOOPEXIT:%.*]] ]
|
||||
; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], [[FOR_BODY_1]] ], [ [[SCEVGEP13_LCSSA]], [[LOOPEXIT]] ]
|
||||
; CHECK-NEXT: [[LSR_IV5:%.*]] = phi ptr addrspace(5) [ [[SCEVGEP6:%.*]], [[FOR_BODY_1]] ], [ [[SCEVGEP11_LCSSA:%.*]], [[LOOPEXIT:%.*]] ]
|
||||
; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], [[FOR_BODY_1]] ], [ [[SCEVGEP13_LCSSA:%.*]], [[LOOPEXIT]] ]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[LSR_IV5]], align 8
|
||||
; CHECK-NEXT: store ptr [[TMP0]], ptr [[LSR_IV1]], align 8
|
||||
; CHECK-NEXT: [[SCEVGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 8
|
||||
; CHECK-NEXT: [[SCEVGEP6]] = getelementptr i8, ptr addrspace(5) [[LSR_IV5]], i32 8
|
||||
; CHECK-NEXT: br label [[FOR_BODY_1]]
|
||||
; CHECK: for.body:
|
||||
; CHECK-NEXT: [[LSR_IV12:%.*]] = phi ptr [ [[SCEVGEP13]], [[FOR_BODY]] ], [ null, [[ENTRY:%.*]] ]
|
||||
; CHECK-NEXT: [[LSR_IV10:%.*]] = phi ptr addrspace(5) [ [[SCEVGEP11]], [[FOR_BODY]] ], [ null, [[ENTRY]] ]
|
||||
; CHECK-NEXT: [[SCEVGEP11]] = getelementptr i8, ptr addrspace(5) [[LSR_IV10]], i32 64
|
||||
; CHECK-NEXT: [[SCEVGEP13]] = getelementptr i8, ptr [[LSR_IV12]], i64 64
|
||||
; CHECK-NEXT: [[SCEVGEP11_LCSSA]] = phi ptr addrspace(5) [ [[SCEVGEP4:%.*]], [[FOR_BODY]] ], [ inttoptr (i32 64 to ptr addrspace(5)), [[ENTRY:%.*]] ]
|
||||
; CHECK-NEXT: [[SCEVGEP13_LCSSA]] = phi ptr [ [[SCEVGEP:%.*]], [[FOR_BODY]] ], [ inttoptr (i64 64 to ptr), [[ENTRY]] ]
|
||||
; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[SCEVGEP13_LCSSA]], i64 64
|
||||
; CHECK-NEXT: [[SCEVGEP4]] = getelementptr i8, ptr addrspace(5) [[SCEVGEP11_LCSSA]], i32 64
|
||||
; CHECK-NEXT: br i1 false, label [[LOOPEXIT]], label [[FOR_BODY]]
|
||||
;
|
||||
entry:
|
||||
|
||||
@ -14,7 +14,7 @@ define amdgpu_kernel void @lsr_crash_preserve_addrspace_unknown_type() #0 {
|
||||
; CHECK-NEXT: br label %[[BB1:.*]]
|
||||
; CHECK: [[BB1]]:
|
||||
; CHECK-NEXT: [[TMP:%.*]] = phi ptr addrspace(3) [ undef, %[[BB]] ], [ [[TMP18:%.*]], %[[BB17:.*]] ]
|
||||
; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP]], i32 8
|
||||
; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr inbounds [[TMP0:%.*]], ptr addrspace(3) [[TMP]], i64 0, i32 1
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr addrspace(3) [[SCEVGEP1]], align 8
|
||||
; CHECK-NEXT: br label %[[BB4:.*]]
|
||||
; CHECK: [[BB4]]:
|
||||
@ -26,14 +26,14 @@ define amdgpu_kernel void @lsr_crash_preserve_addrspace_unknown_type() #0 {
|
||||
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 0, [[TMP10]]
|
||||
; CHECK-NEXT: br i1 [[TMP11]], label %[[BB12:.*]], label %[[BB17]]
|
||||
; CHECK: [[BB12]]:
|
||||
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP]], i32 16
|
||||
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr inbounds [[TMP0]], ptr addrspace(3) [[TMP]], i64 0, i32 2
|
||||
; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(3) [[SCEVGEP]], align 4
|
||||
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 0, [[TMP14]]
|
||||
; CHECK-NEXT: br i1 [[TMP15]], label %[[BB16:.*]], label %[[BB17]]
|
||||
; CHECK: [[BB16]]:
|
||||
; CHECK-NEXT: unreachable
|
||||
; CHECK: [[BB17]]:
|
||||
; CHECK-NEXT: [[TMP18]] = getelementptr inbounds [[TMP0:%.*]], ptr addrspace(3) [[TMP]], i64 2
|
||||
; CHECK-NEXT: [[TMP18]] = getelementptr inbounds [[TMP0]], ptr addrspace(3) [[TMP]], i64 2
|
||||
; CHECK-NEXT: br label %[[BB1]]
|
||||
;
|
||||
bb:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user