[AMDGPU] Reland "Remove redundant s_cmp_lg_* sX, 0" (#164201)
Reland PR https://github.com/llvm/llvm-project/pull/162352. Fixed by excluding SI_PC_ADD_REL_OFFSET from the instructions that set SCC = DST!=0. Passes check-libc-amdgcn-amd-amdhsa now.

Distribution of the instructions that allowed a redundant S_CMP to be deleted in the check-libc-amdgcn-amd-amdhsa test:
```
S_AND_B32 485
S_AND_B64 47
S_ANDN2_B32 42
S_ANDN2_B64 277492
S_CSELECT_B64 17631
S_LSHL_B32 6
S_OR_B64 11
```
---------
Signed-off-by: John Lu <John.Lu@amd.com>
Co-authored-by: Matt Arsenault <arsenm2@gmail.com>
This commit is contained in:
parent
411be14eab
commit
9abbec66bf
@ -10628,6 +10628,59 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
|
||||
if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
|
||||
return false;
|
||||
|
||||
const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
|
||||
this]() -> bool {
|
||||
if (CmpValue != 0)
|
||||
return false;
|
||||
|
||||
MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
|
||||
if (!Def || Def->getParent() != CmpInstr.getParent())
|
||||
return false;
|
||||
|
||||
const auto foldableSelect = [](MachineInstr *Def) -> bool {
|
||||
if (Def->getOpcode() == AMDGPU::S_CSELECT_B32 ||
|
||||
Def->getOpcode() == AMDGPU::S_CSELECT_B64) {
|
||||
bool Op1IsNonZeroImm =
|
||||
Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0;
|
||||
bool Op2IsZeroImm =
|
||||
Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0;
|
||||
if (Op1IsNonZeroImm && Op2IsZeroImm)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
// For S_OP that set SCC = DST!=0, do the transformation
|
||||
//
|
||||
// s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
|
||||
|
||||
// If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
|
||||
// for S_CSELECT* already has the same value that will be calculated by
|
||||
// s_cmp_lg_*
|
||||
//
|
||||
// s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero
|
||||
// imm), 0)
|
||||
if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(Def))
|
||||
return false;
|
||||
|
||||
MachineInstr *KillsSCC = nullptr;
|
||||
for (MachineInstr &MI :
|
||||
make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
|
||||
if (MI.modifiesRegister(AMDGPU::SCC, &RI))
|
||||
return false;
|
||||
if (MI.killsRegister(AMDGPU::SCC, &RI))
|
||||
KillsSCC = &MI;
|
||||
}
|
||||
|
||||
if (MachineOperand *SccDef =
|
||||
Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
|
||||
SccDef->setIsDead(false);
|
||||
if (KillsSCC)
|
||||
KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
|
||||
CmpInstr.eraseFromParent();
|
||||
return true;
|
||||
};
|
||||
|
||||
const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
|
||||
this](int64_t ExpectedValue, unsigned SrcSize,
|
||||
bool IsReversible, bool IsSigned) -> bool {
|
||||
@ -10702,16 +10755,20 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
|
||||
if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
|
||||
return false;
|
||||
|
||||
for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
|
||||
I != E; ++I) {
|
||||
if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
|
||||
I->killsRegister(AMDGPU::SCC, &RI))
|
||||
MachineInstr *KillsSCC = nullptr;
|
||||
for (MachineInstr &MI :
|
||||
make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
|
||||
if (MI.modifiesRegister(AMDGPU::SCC, &RI))
|
||||
return false;
|
||||
if (MI.killsRegister(AMDGPU::SCC, &RI))
|
||||
KillsSCC = &MI;
|
||||
}
|
||||
|
||||
MachineOperand *SccDef =
|
||||
Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
|
||||
SccDef->setIsDead(false);
|
||||
if (KillsSCC)
|
||||
KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
|
||||
CmpInstr.eraseFromParent();
|
||||
|
||||
if (!MRI->use_nodbg_empty(DefReg)) {
|
||||
@ -10755,7 +10812,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
|
||||
case AMDGPU::S_CMP_LG_I32:
|
||||
case AMDGPU::S_CMPK_LG_U32:
|
||||
case AMDGPU::S_CMPK_LG_I32:
|
||||
return optimizeCmpAnd(0, 32, true, false);
|
||||
return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect();
|
||||
case AMDGPU::S_CMP_GT_U32:
|
||||
case AMDGPU::S_CMPK_GT_U32:
|
||||
return optimizeCmpAnd(0, 32, false, false);
|
||||
@ -10763,7 +10820,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
|
||||
case AMDGPU::S_CMPK_GT_I32:
|
||||
return optimizeCmpAnd(0, 32, false, true);
|
||||
case AMDGPU::S_CMP_LG_U64:
|
||||
return optimizeCmpAnd(0, 64, true, false);
|
||||
return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect();
|
||||
}
|
||||
|
||||
return false;
|
||||
|
||||
@ -714,6 +714,52 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
static bool setsSCCifResultIsNonZero(const MachineInstr &MI) {
|
||||
switch (MI.getOpcode()) {
|
||||
case AMDGPU::S_ABSDIFF_I32:
|
||||
case AMDGPU::S_ABS_I32:
|
||||
case AMDGPU::S_AND_B32:
|
||||
case AMDGPU::S_AND_B64:
|
||||
case AMDGPU::S_ANDN2_B32:
|
||||
case AMDGPU::S_ANDN2_B64:
|
||||
case AMDGPU::S_ASHR_I32:
|
||||
case AMDGPU::S_ASHR_I64:
|
||||
case AMDGPU::S_BCNT0_I32_B32:
|
||||
case AMDGPU::S_BCNT0_I32_B64:
|
||||
case AMDGPU::S_BCNT1_I32_B32:
|
||||
case AMDGPU::S_BCNT1_I32_B64:
|
||||
case AMDGPU::S_BFE_I32:
|
||||
case AMDGPU::S_BFE_I64:
|
||||
case AMDGPU::S_BFE_U32:
|
||||
case AMDGPU::S_BFE_U64:
|
||||
case AMDGPU::S_LSHL_B32:
|
||||
case AMDGPU::S_LSHL_B64:
|
||||
case AMDGPU::S_LSHR_B32:
|
||||
case AMDGPU::S_LSHR_B64:
|
||||
case AMDGPU::S_NAND_B32:
|
||||
case AMDGPU::S_NAND_B64:
|
||||
case AMDGPU::S_NOR_B32:
|
||||
case AMDGPU::S_NOR_B64:
|
||||
case AMDGPU::S_NOT_B32:
|
||||
case AMDGPU::S_NOT_B64:
|
||||
case AMDGPU::S_OR_B32:
|
||||
case AMDGPU::S_OR_B64:
|
||||
case AMDGPU::S_ORN2_B32:
|
||||
case AMDGPU::S_ORN2_B64:
|
||||
case AMDGPU::S_QUADMASK_B32:
|
||||
case AMDGPU::S_QUADMASK_B64:
|
||||
case AMDGPU::S_WQM_B32:
|
||||
case AMDGPU::S_WQM_B64:
|
||||
case AMDGPU::S_XNOR_B32:
|
||||
case AMDGPU::S_XNOR_B64:
|
||||
case AMDGPU::S_XOR_B32:
|
||||
case AMDGPU::S_XOR_B64:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static bool isEXP(const MachineInstr &MI) {
|
||||
return MI.getDesc().TSFlags & SIInstrFlags::EXP;
|
||||
}
|
||||
|
||||
@ -140,7 +140,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
|
||||
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
||||
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
|
||||
; CHECK-NEXT: ; %bb.1: ; %false
|
||||
; CHECK-NEXT: s_mov_b32 s0, 33
|
||||
@ -345,7 +344,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
|
||||
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
|
||||
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
|
||||
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
|
||||
; CHECK-NEXT: ; %bb.1: ; %false
|
||||
; CHECK-NEXT: s_mov_b32 s0, 33
|
||||
|
||||
@ -143,7 +143,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
|
||||
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
|
||||
; CHECK-NEXT: ; %bb.1: ; %false
|
||||
; CHECK-NEXT: s_mov_b32 s0, 33
|
||||
@ -348,7 +347,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
|
||||
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
|
||||
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
|
||||
; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
|
||||
; CHECK-NEXT: ; %bb.1: ; %false
|
||||
; CHECK-NEXT: s_mov_b32 s0, 33
|
||||
|
||||
@ -180,11 +180,7 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B
|
||||
; CHECK-LABEL: s_add64_32:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_add_u32 s0, s0, s2
|
||||
; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
|
||||
; CHECK-NEXT: s_addc_u32 s1, s1, s3
|
||||
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
|
||||
; CHECK-NEXT: s_addc_u32 s2, s4, 0
|
||||
; CHECK-NEXT: ; return to shader part epilog
|
||||
%sum64 = add i64 %val64A, %val64B
|
||||
@ -199,14 +195,10 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B
|
||||
define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
|
||||
; CHECK-LABEL: s_uadd_v2i64:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_add_u32 s10, s2, s6
|
||||
; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0
|
||||
; CHECK-NEXT: s_addc_u32 s8, s3, s7
|
||||
; CHECK-NEXT: s_add_u32 s6, s2, s6
|
||||
; CHECK-NEXT: s_addc_u32 s7, s3, s7
|
||||
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; CHECK-NEXT: s_add_u32 s0, s0, s4
|
||||
; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
|
||||
; CHECK-NEXT: s_addc_u32 s1, s1, s5
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, s0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, s1
|
||||
@ -215,8 +207,8 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
|
||||
; CHECK-NEXT: v_readfirstlane_b32 s0, v7
|
||||
; CHECK-NEXT: v_readfirstlane_b32 s2, v6
|
||||
; CHECK-NEXT: v_mov_b32_e32 v4, s10
|
||||
; CHECK-NEXT: v_mov_b32_e32 v5, s8
|
||||
; CHECK-NEXT: v_mov_b32_e32 v4, s6
|
||||
; CHECK-NEXT: v_mov_b32_e32 v5, s7
|
||||
; CHECK-NEXT: s_mov_b32 s1, s0
|
||||
; CHECK-NEXT: s_mov_b32 s3, s2
|
||||
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
|
||||
@ -233,14 +225,10 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
|
||||
define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
|
||||
; CHECK-LABEL: s_usub_v2i64:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_sub_u32 s10, s2, s6
|
||||
; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0
|
||||
; CHECK-NEXT: s_subb_u32 s8, s3, s7
|
||||
; CHECK-NEXT: s_sub_u32 s6, s2, s6
|
||||
; CHECK-NEXT: s_subb_u32 s7, s3, s7
|
||||
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; CHECK-NEXT: s_sub_u32 s0, s0, s4
|
||||
; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
|
||||
; CHECK-NEXT: s_subb_u32 s1, s1, s5
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, s0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, s1
|
||||
@ -249,8 +237,8 @@ define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
|
||||
; CHECK-NEXT: v_readfirstlane_b32 s0, v7
|
||||
; CHECK-NEXT: v_readfirstlane_b32 s2, v6
|
||||
; CHECK-NEXT: v_mov_b32_e32 v4, s10
|
||||
; CHECK-NEXT: v_mov_b32_e32 v5, s8
|
||||
; CHECK-NEXT: v_mov_b32_e32 v4, s6
|
||||
; CHECK-NEXT: v_mov_b32_e32 v5, s7
|
||||
; CHECK-NEXT: s_mov_b32 s1, s0
|
||||
; CHECK-NEXT: s_mov_b32 s3, s2
|
||||
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
|
||||
@ -268,8 +256,6 @@ define amdgpu_ps i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval)
|
||||
; CHECK-LABEL: s_uadd_i64:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_add_u32 s0, s0, s2
|
||||
; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0
|
||||
; CHECK-NEXT: s_addc_u32 s1, s1, s3
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, s0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, s1
|
||||
@ -292,8 +278,6 @@ define amdgpu_ps i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
|
||||
; CHECK-LABEL: s_uadd_p1:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_add_u32 s0, s0, 1
|
||||
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
|
||||
; CHECK-NEXT: s_addc_u32 s1, s1, 0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, s0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, s1
|
||||
@ -339,8 +323,6 @@ define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
|
||||
; CHECK-LABEL: s_usub_p1:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_sub_u32 s0, s0, 1
|
||||
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
|
||||
; CHECK-NEXT: s_subb_u32 s1, s1, 0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, s0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, s1
|
||||
@ -363,8 +345,6 @@ define amdgpu_ps i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
|
||||
; CHECK-LABEL: s_usub_n1:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_sub_u32 s0, s0, -1
|
||||
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
|
||||
; CHECK-NEXT: s_subb_u32 s1, s1, -1
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, s0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, s1
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -612,12 +612,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX8-NEXT: s_mov_b32 m0, s3
|
||||
; GFX8-NEXT: v_readlane_b32 s8, v0, s3
|
||||
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX8-NEXT: v_readlane_b32 s6, v0, s3
|
||||
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
|
||||
; GFX8-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX8-NEXT: s_add_i32 s2, s2, s6
|
||||
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -653,12 +652,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX9-NEXT: s_mov_b32 m0, s3
|
||||
; GFX9-NEXT: v_readlane_b32 s8, v0, s3
|
||||
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX9-NEXT: v_readlane_b32 s6, v0, s3
|
||||
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
|
||||
; GFX9-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX9-NEXT: s_add_i32 s2, s2, s6
|
||||
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -693,11 +691,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
|
||||
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
|
||||
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -733,11 +730,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
|
||||
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
|
||||
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
|
||||
; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
|
||||
; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
|
||||
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
|
||||
; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
|
||||
; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -774,11 +770,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
|
||||
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
|
||||
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
|
||||
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
@ -818,11 +813,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
|
||||
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
|
||||
; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
|
||||
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
|
||||
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
|
||||
; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
|
||||
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
|
||||
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
|
||||
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
@ -859,11 +853,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
|
||||
; GFX12W64-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
|
||||
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
|
||||
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
|
||||
; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
@ -901,15 +894,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX12W32-NEXT: ; implicit-def: $vgpr0
|
||||
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
|
||||
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12W32-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
|
||||
; GFX12W32-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
|
||||
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
|
||||
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
|
||||
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
|
||||
; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
|
||||
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
|
||||
; GFX12W32-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
|
||||
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
@ -999,12 +992,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
|
||||
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX8-NEXT: s_mov_b32 m0, s3
|
||||
; GFX8-NEXT: v_readlane_b32 s8, v0, s3
|
||||
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX8-NEXT: v_readlane_b32 s6, v0, s3
|
||||
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
|
||||
; GFX8-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX8-NEXT: s_add_i32 s2, s2, s6
|
||||
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX8-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -1042,12 +1034,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
|
||||
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX9-NEXT: s_mov_b32 m0, s3
|
||||
; GFX9-NEXT: v_readlane_b32 s8, v0, s3
|
||||
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX9-NEXT: v_readlane_b32 s6, v0, s3
|
||||
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
|
||||
; GFX9-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX9-NEXT: s_add_i32 s2, s2, s6
|
||||
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -1084,11 +1075,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
|
||||
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
|
||||
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
|
||||
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -1127,11 +1117,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
|
||||
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
|
||||
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
|
||||
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
|
||||
; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
|
||||
; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
|
||||
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
|
||||
; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
|
||||
; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -1171,11 +1160,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
|
||||
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
|
||||
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
|
||||
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
|
||||
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
@ -1218,11 +1206,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
|
||||
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
|
||||
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
|
||||
; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
|
||||
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
|
||||
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
|
||||
; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
|
||||
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
|
||||
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
|
||||
; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
@ -1261,11 +1248,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
|
||||
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
|
||||
; GFX12W64-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
|
||||
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
|
||||
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
|
||||
; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX12W64-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
@ -1306,15 +1292,15 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
|
||||
; GFX12W32-NEXT: ; implicit-def: $vgpr0
|
||||
; GFX12W32-NEXT: .LBB3_1: ; %ComputeLoop
|
||||
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12W32-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
|
||||
; GFX12W32-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
|
||||
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
|
||||
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
|
||||
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
|
||||
; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
|
||||
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
|
||||
; GFX12W32-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
|
||||
; GFX12W32-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
@ -2073,12 +2059,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX8-NEXT: s_mov_b32 m0, s3
|
||||
; GFX8-NEXT: v_readlane_b32 s8, v0, s3
|
||||
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX8-NEXT: v_readlane_b32 s6, v0, s3
|
||||
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
|
||||
; GFX8-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX8-NEXT: s_add_i32 s2, s2, s6
|
||||
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX8-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -2114,12 +2099,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX9-NEXT: s_mov_b32 m0, s3
|
||||
; GFX9-NEXT: v_readlane_b32 s8, v0, s3
|
||||
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX9-NEXT: v_readlane_b32 s6, v0, s3
|
||||
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
|
||||
; GFX9-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX9-NEXT: s_add_i32 s2, s2, s6
|
||||
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -2154,11 +2138,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
|
||||
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
|
||||
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -2194,11 +2177,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
|
||||
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
|
||||
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
|
||||
; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
|
||||
; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
|
||||
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
|
||||
; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
|
||||
; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -2235,11 +2217,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
|
||||
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
|
||||
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
|
||||
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
@ -2279,11 +2260,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
|
||||
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
|
||||
; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
|
||||
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
|
||||
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
|
||||
; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
|
||||
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
|
||||
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
|
||||
; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
@ -2321,11 +2301,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
|
||||
; GFX12W64-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
|
||||
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
|
||||
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
|
||||
; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
@ -2363,15 +2342,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX12W32-NEXT: ; implicit-def: $vgpr0
|
||||
; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop
|
||||
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12W32-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
|
||||
; GFX12W32-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
|
||||
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
|
||||
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
|
||||
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
|
||||
; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
|
||||
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
|
||||
; GFX12W32-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
|
||||
; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
|
||||
@ -717,12 +717,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
|
||||
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2
|
||||
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2
|
||||
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0
|
||||
; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
|
||||
; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s3
|
||||
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
@ -762,12 +761,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
|
||||
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2
|
||||
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2
|
||||
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0
|
||||
; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
|
||||
; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s3
|
||||
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
@ -805,13 +803,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1
|
||||
; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
|
||||
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1]
|
||||
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7
|
||||
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
|
||||
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7
|
||||
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
|
||||
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2
|
||||
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2
|
||||
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
|
||||
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8
|
||||
; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
@ -853,11 +850,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
|
||||
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
|
||||
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1
|
||||
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3
|
||||
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032_ITERATIVE-NEXT: s_add_i32 s6, s6, s2
|
||||
; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
@ -897,14 +893,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0
|
||||
; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
|
||||
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1]
|
||||
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
|
||||
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7
|
||||
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
|
||||
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
|
||||
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2
|
||||
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
|
||||
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
|
||||
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8
|
||||
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
@ -949,11 +944,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
|
||||
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1
|
||||
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
|
||||
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
|
||||
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
|
||||
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1132_ITERATIVE-NEXT: s_add_i32 s6, s6, s2
|
||||
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
@ -993,14 +987,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0
|
||||
; GFX1264_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
|
||||
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1]
|
||||
; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
|
||||
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
|
||||
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2
|
||||
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
|
||||
; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s7
|
||||
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
|
||||
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7
|
||||
; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
|
||||
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
|
||||
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8
|
||||
; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
@ -1028,6 +1022,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
|
||||
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
|
||||
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xf1ff
|
||||
; GFX1264_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0
|
||||
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1
|
||||
; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null
|
||||
@ -1041,15 +1036,15 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0
|
||||
; GFX1232_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
|
||||
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
|
||||
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
|
||||
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
|
||||
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1
|
||||
; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
|
||||
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
|
||||
; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
|
||||
; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2
|
||||
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
|
||||
; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
@ -2363,7 +2358,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX8_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
|
||||
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
@ -2416,7 +2410,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX9_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
|
||||
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
@ -2462,13 +2455,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
|
||||
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2
|
||||
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2
|
||||
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s2
|
||||
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2
|
||||
; GFX1064_ITERATIVE-NEXT: s_add_u32 s6, s6, s3
|
||||
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8
|
||||
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
@ -2515,13 +2507,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
|
||||
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
|
||||
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1
|
||||
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s1
|
||||
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1
|
||||
; GFX1032_ITERATIVE-NEXT: s_add_u32 s6, s6, s2
|
||||
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
|
||||
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
@ -2569,14 +2560,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2
|
||||
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2
|
||||
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
|
||||
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2
|
||||
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
|
||||
; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3
|
||||
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8
|
||||
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
@ -2626,14 +2616,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
|
||||
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
|
||||
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
|
||||
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
|
||||
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
|
||||
; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2
|
||||
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
|
||||
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
@ -2677,16 +2666,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GFX1264_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
|
||||
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1]
|
||||
; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1]
|
||||
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
|
||||
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8
|
||||
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8
|
||||
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8
|
||||
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8
|
||||
; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
|
||||
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
|
||||
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10
|
||||
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10
|
||||
; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
|
||||
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10
|
||||
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10
|
||||
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9]
|
||||
; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
|
||||
; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
@ -2731,17 +2720,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
|
||||
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
|
||||
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
|
||||
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
|
||||
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
|
||||
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
|
||||
; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1
|
||||
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
|
||||
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
|
||||
; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8
|
||||
; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
|
||||
; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
|
||||
; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
|
||||
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
@ -4490,12 +4479,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
|
||||
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2
|
||||
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
|
||||
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0
|
||||
; GFX8_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
|
||||
; GFX8_ITERATIVE-NEXT: s_add_i32 s12, s12, s3
|
||||
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
|
||||
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
@ -4550,12 +4538,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
|
||||
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2
|
||||
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
|
||||
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0
|
||||
; GFX9_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
|
||||
; GFX9_ITERATIVE-NEXT: s_add_i32 s12, s12, s3
|
||||
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
|
||||
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
@ -4608,13 +4595,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr2
|
||||
; GFX1064_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
|
||||
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1]
|
||||
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
|
||||
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s6
|
||||
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s6
|
||||
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
|
||||
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
|
||||
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s2
|
||||
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1064_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
|
||||
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064_ITERATIVE-NEXT: s_add_i32 s12, s12, s7
|
||||
; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
|
||||
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
@ -4670,11 +4656,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
|
||||
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
|
||||
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s8, s1
|
||||
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3
|
||||
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032_ITERATIVE-NEXT: s_add_i32 s8, s8, s2
|
||||
; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
|
||||
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
@ -4728,14 +4713,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2
|
||||
; GFX1164_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
|
||||
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[0:1]
|
||||
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
|
||||
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
|
||||
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s6
|
||||
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s6
|
||||
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
|
||||
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s2
|
||||
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1164_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
|
||||
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164_ITERATIVE-NEXT: s_add_i32 s12, s12, s7
|
||||
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
|
||||
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
@ -4799,11 +4783,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
|
||||
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
|
||||
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s8, s1
|
||||
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
|
||||
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1132_ITERATIVE-NEXT: s_add_i32 s8, s8, s2
|
||||
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
|
||||
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
@ -4861,14 +4844,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0
|
||||
; GFX1264_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
|
||||
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1]
|
||||
; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
|
||||
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
|
||||
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2
|
||||
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
|
||||
; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s7
|
||||
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
|
||||
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7
|
||||
; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
|
||||
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
|
||||
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8
|
||||
; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
|
||||
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
@ -4896,6 +4879,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
|
||||
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
|
||||
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xf1ff
|
||||
; GFX1264_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0
|
||||
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1
|
||||
; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null
|
||||
@ -4909,15 +4893,15 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0
|
||||
; GFX1232_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
|
||||
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
|
||||
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
|
||||
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
|
||||
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1
|
||||
; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
|
||||
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
|
||||
; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
|
||||
; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2
|
||||
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
|
||||
; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
|
||||
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
@ -6673,7 +6657,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX8_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
|
||||
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
|
||||
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
@ -6746,7 +6729,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX9_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
|
||||
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
|
||||
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
@ -6812,13 +6794,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
|
||||
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2
|
||||
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
|
||||
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2
|
||||
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
|
||||
; GFX1064_ITERATIVE-NEXT: s_add_u32 s8, s8, s3
|
||||
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s9, s9, s6
|
||||
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
|
||||
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
@ -6883,13 +6864,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
|
||||
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s1
|
||||
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
|
||||
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1
|
||||
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
|
||||
; GFX1032_ITERATIVE-NEXT: s_add_u32 s8, s8, s2
|
||||
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
|
||||
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
|
||||
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
@ -6955,14 +6935,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2
|
||||
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
|
||||
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2
|
||||
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
|
||||
; GFX1164_ITERATIVE-NEXT: s_add_u32 s8, s8, s3
|
||||
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s9, s9, s6
|
||||
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
|
||||
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
@ -7036,14 +7015,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s1
|
||||
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
|
||||
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1
|
||||
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
|
||||
; GFX1132_ITERATIVE-NEXT: s_add_u32 s8, s8, s2
|
||||
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
|
||||
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
|
||||
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
@ -7109,16 +7087,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GFX1264_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
|
||||
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1]
|
||||
; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1]
|
||||
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
|
||||
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8
|
||||
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8
|
||||
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8
|
||||
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8
|
||||
; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
|
||||
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
|
||||
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10
|
||||
; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10
|
||||
; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
|
||||
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10
|
||||
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10
|
||||
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9]
|
||||
; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
|
||||
; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
|
||||
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
@ -7163,17 +7141,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
|
||||
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GFX1232_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
|
||||
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
|
||||
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
|
||||
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
|
||||
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
|
||||
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
|
||||
; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1
|
||||
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
|
||||
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
|
||||
; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8
|
||||
; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
|
||||
; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
|
||||
; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
|
||||
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
|
||||
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -611,12 +611,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX8-NEXT: s_mov_b32 m0, s3
|
||||
; GFX8-NEXT: v_readlane_b32 s8, v0, s3
|
||||
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX8-NEXT: v_readlane_b32 s6, v0, s3
|
||||
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
|
||||
; GFX8-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX8-NEXT: s_add_i32 s2, s2, s6
|
||||
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -652,12 +651,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX9-NEXT: s_mov_b32 m0, s3
|
||||
; GFX9-NEXT: v_readlane_b32 s8, v0, s3
|
||||
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX9-NEXT: v_readlane_b32 s6, v0, s3
|
||||
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
|
||||
; GFX9-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX9-NEXT: s_add_i32 s2, s2, s6
|
||||
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -692,11 +690,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
|
||||
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
|
||||
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -732,11 +729,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
|
||||
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
|
||||
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
|
||||
; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
|
||||
; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
|
||||
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
|
||||
; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
|
||||
; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -773,11 +769,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
|
||||
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
|
||||
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
|
||||
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
@ -817,11 +812,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
|
||||
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
|
||||
; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
|
||||
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
|
||||
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
|
||||
; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
|
||||
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
|
||||
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
|
||||
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
@ -858,11 +852,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
|
||||
; GFX12W64-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
|
||||
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
|
||||
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
|
||||
; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
@ -900,15 +893,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX12W32-NEXT: ; implicit-def: $vgpr0
|
||||
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
|
||||
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12W32-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
|
||||
; GFX12W32-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
|
||||
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
|
||||
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
|
||||
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
|
||||
; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
|
||||
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
|
||||
; GFX12W32-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
|
||||
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
@ -1665,12 +1658,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX8-NEXT: s_mov_b32 m0, s3
|
||||
; GFX8-NEXT: v_readlane_b32 s8, v0, s3
|
||||
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX8-NEXT: v_readlane_b32 s6, v0, s3
|
||||
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
|
||||
; GFX8-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX8-NEXT: s_add_i32 s2, s2, s6
|
||||
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX8-NEXT: s_cbranch_scc1 .LBB6_1
|
||||
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -1706,12 +1698,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX9-NEXT: s_mov_b32 m0, s3
|
||||
; GFX9-NEXT: v_readlane_b32 s8, v0, s3
|
||||
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX9-NEXT: v_readlane_b32 s6, v0, s3
|
||||
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
|
||||
; GFX9-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX9-NEXT: s_add_i32 s2, s2, s6
|
||||
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -1746,11 +1737,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
|
||||
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
|
||||
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_1
|
||||
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -1786,11 +1776,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
|
||||
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
|
||||
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
|
||||
; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
|
||||
; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
|
||||
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
|
||||
; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
|
||||
; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_1
|
||||
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -1827,11 +1816,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
|
||||
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
|
||||
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
|
||||
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1
|
||||
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
@ -1871,11 +1859,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
|
||||
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
|
||||
; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
|
||||
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
|
||||
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
|
||||
; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
|
||||
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
|
||||
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
|
||||
; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1
|
||||
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
@ -1913,11 +1900,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
|
||||
; GFX12W64-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
|
||||
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
|
||||
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
|
||||
; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX12W64-NEXT: s_cbranch_scc1 .LBB6_1
|
||||
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
@ -1955,15 +1941,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX12W32-NEXT: ; implicit-def: $vgpr0
|
||||
; GFX12W32-NEXT: .LBB6_1: ; %ComputeLoop
|
||||
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12W32-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
|
||||
; GFX12W32-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
|
||||
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
|
||||
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
|
||||
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
|
||||
; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
|
||||
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
|
||||
; GFX12W32-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
|
||||
; GFX12W32-NEXT: s_cbranch_scc1 .LBB6_1
|
||||
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
|
||||
@ -628,12 +628,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX8-NEXT: s_mov_b32 m0, s3
|
||||
; GFX8-NEXT: v_readlane_b32 s8, v0, s3
|
||||
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX8-NEXT: v_readlane_b32 s6, v0, s3
|
||||
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
|
||||
; GFX8-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX8-NEXT: s_add_i32 s2, s2, s6
|
||||
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -670,12 +669,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX9-NEXT: s_mov_b32 m0, s3
|
||||
; GFX9-NEXT: v_readlane_b32 s8, v0, s3
|
||||
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX9-NEXT: v_readlane_b32 s6, v0, s3
|
||||
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
|
||||
; GFX9-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX9-NEXT: s_add_i32 s2, s2, s6
|
||||
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -711,11 +709,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
|
||||
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
|
||||
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -752,11 +749,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
|
||||
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
|
||||
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
|
||||
; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
|
||||
; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
|
||||
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
|
||||
; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
|
||||
; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -794,11 +790,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
|
||||
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
|
||||
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
|
||||
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
@ -839,11 +834,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
|
||||
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
|
||||
; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
|
||||
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
|
||||
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
|
||||
; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
|
||||
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
|
||||
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
|
||||
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
@ -880,11 +874,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
|
||||
; GFX12W64-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
|
||||
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
|
||||
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
|
||||
; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
@ -923,15 +916,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX12W32-NEXT: ; implicit-def: $vgpr0
|
||||
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
|
||||
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12W32-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
|
||||
; GFX12W32-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
|
||||
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
|
||||
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
|
||||
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
|
||||
; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
|
||||
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
|
||||
; GFX12W32-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
|
||||
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
@ -1833,12 +1826,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX8-NEXT: s_mov_b32 m0, s3
|
||||
; GFX8-NEXT: v_readlane_b32 s8, v0, s3
|
||||
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX8-NEXT: v_readlane_b32 s6, v0, s3
|
||||
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
|
||||
; GFX8-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX8-NEXT: s_add_i32 s2, s2, s6
|
||||
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX8-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -1875,12 +1867,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX9-NEXT: s_mov_b32 m0, s3
|
||||
; GFX9-NEXT: v_readlane_b32 s8, v0, s3
|
||||
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX9-NEXT: v_readlane_b32 s6, v0, s3
|
||||
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
|
||||
; GFX9-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX9-NEXT: s_add_i32 s2, s2, s6
|
||||
; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -1916,11 +1907,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
|
||||
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
|
||||
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -1957,11 +1947,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
|
||||
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
|
||||
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
|
||||
; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
|
||||
; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
|
||||
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
|
||||
; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
|
||||
; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -1999,11 +1988,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
|
||||
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
|
||||
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
|
||||
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
|
||||
; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
@ -2044,11 +2032,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
|
||||
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
|
||||
; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
|
||||
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
|
||||
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
|
||||
; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
|
||||
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
|
||||
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
|
||||
; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
@ -2086,11 +2073,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
|
||||
; GFX12W64-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
|
||||
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
|
||||
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
|
||||
; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
@ -2129,15 +2115,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
|
||||
; GFX12W32-NEXT: ; implicit-def: $vgpr0
|
||||
; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop
|
||||
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12W32-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
|
||||
; GFX12W32-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
|
||||
; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
|
||||
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
|
||||
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
|
||||
; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
|
||||
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
|
||||
; GFX12W32-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
|
||||
; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
|
||||
|
||||
@ -704,7 +704,6 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
|
||||
; CISI-NEXT: s_add_u32 s4, s4, s6
|
||||
; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0
|
||||
; CISI-NEXT: s_or_b32 s6, s12, s13
|
||||
; CISI-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; CISI-NEXT: s_addc_u32 s5, s5, s7
|
||||
; CISI-NEXT: s_mov_b32 s8, s0
|
||||
; CISI-NEXT: s_mov_b32 s9, s1
|
||||
@ -725,16 +724,14 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: s_add_u32 s2, s4, s6
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: s_add_u32 s0, s4, s6
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_addc_u32 s1, s5, s7
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s1
|
||||
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; VI-NEXT: s_addc_u32 s0, s5, s7
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s0
|
||||
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s3
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
|
||||
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
@ -746,12 +743,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
|
||||
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_add_u32 s2, s12, s14
|
||||
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: s_addc_u32 s0, s13, s15
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: s_add_u32 s0, s12, s14
|
||||
; GFX9-NEXT: s_addc_u32 s1, s13, s15
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
|
||||
@ -764,10 +759,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
|
||||
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX1010-NEXT: s_add_u32 s0, s12, s14
|
||||
; GFX1010-NEXT: s_cselect_b32 s1, -1, 0
|
||||
; GFX1010-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX1010-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX1010-NEXT: s_addc_u32 s1, s13, s15
|
||||
; GFX1010-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX1010-NEXT: s_cselect_b32 s0, -1, 0
|
||||
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
|
||||
@ -781,10 +774,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
|
||||
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX1030W32-NEXT: s_add_u32 s4, s4, s6
|
||||
; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0
|
||||
; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GFX1030W32-NEXT: s_addc_u32 s5, s5, s7
|
||||
; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0
|
||||
; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
|
||||
@ -798,10 +789,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
|
||||
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX1030W64-NEXT: s_add_u32 s4, s4, s6
|
||||
; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
|
||||
; GFX1030W64-NEXT: s_addc_u32 s5, s5, s7
|
||||
; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
|
||||
@ -814,10 +803,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
|
||||
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_add_u32 s4, s4, s6
|
||||
; GFX11-NEXT: s_cselect_b32 s6, -1, 0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX11-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GFX11-NEXT: s_addc_u32 s5, s5, s7
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
|
||||
@ -831,10 +818,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
|
||||
; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_add_co_u32 s0, s12, s14
|
||||
; GFX1250-NEXT: s_cselect_b32 s1, -1, 0
|
||||
; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
|
||||
; GFX1250-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX1250-NEXT: s_add_co_ci_u32 s1, s13, s15
|
||||
; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
|
||||
; GFX1250-NEXT: s_cselect_b32 s0, -1, 0
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
|
||||
@ -1691,7 +1676,6 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
|
||||
; CISI-NEXT: s_sub_u32 s4, s4, s6
|
||||
; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0
|
||||
; CISI-NEXT: s_or_b32 s6, s12, s13
|
||||
; CISI-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; CISI-NEXT: s_subb_u32 s5, s5, s7
|
||||
; CISI-NEXT: s_mov_b32 s8, s0
|
||||
; CISI-NEXT: s_mov_b32 s9, s1
|
||||
@ -1712,16 +1696,14 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: s_sub_u32 s2, s4, s6
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: s_sub_u32 s0, s4, s6
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_subb_u32 s1, s5, s7
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s1
|
||||
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; VI-NEXT: s_subb_u32 s0, s5, s7
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s0
|
||||
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s3
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
|
||||
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
@ -1733,12 +1715,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
|
||||
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_sub_u32 s2, s12, s14
|
||||
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: s_subb_u32 s0, s13, s15
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: s_sub_u32 s0, s12, s14
|
||||
; GFX9-NEXT: s_subb_u32 s1, s13, s15
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
|
||||
@ -1751,10 +1731,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
|
||||
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX1010-NEXT: s_sub_u32 s0, s12, s14
|
||||
; GFX1010-NEXT: s_cselect_b32 s1, -1, 0
|
||||
; GFX1010-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX1010-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX1010-NEXT: s_subb_u32 s1, s13, s15
|
||||
; GFX1010-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX1010-NEXT: s_cselect_b32 s0, -1, 0
|
||||
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
|
||||
@ -1768,10 +1746,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
|
||||
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX1030W32-NEXT: s_sub_u32 s4, s4, s6
|
||||
; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0
|
||||
; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GFX1030W32-NEXT: s_subb_u32 s5, s5, s7
|
||||
; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0
|
||||
; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
|
||||
@ -1785,10 +1761,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
|
||||
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX1030W64-NEXT: s_sub_u32 s4, s4, s6
|
||||
; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
|
||||
; GFX1030W64-NEXT: s_subb_u32 s5, s5, s7
|
||||
; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
|
||||
@ -1801,10 +1775,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
|
||||
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_sub_u32 s4, s4, s6
|
||||
; GFX11-NEXT: s_cselect_b32 s6, -1, 0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX11-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GFX11-NEXT: s_subb_u32 s5, s5, s7
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
|
||||
@ -1818,10 +1790,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
|
||||
; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_sub_co_u32 s0, s12, s14
|
||||
; GFX1250-NEXT: s_cselect_b32 s1, -1, 0
|
||||
; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
|
||||
; GFX1250-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX1250-NEXT: s_sub_co_ci_u32 s1, s13, s15
|
||||
; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
|
||||
; GFX1250-NEXT: s_cselect_b32 s0, -1, 0
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
|
||||
@ -2218,49 +2188,46 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
|
||||
; VI-NEXT: s_addc_u32 s6, s7, s9
|
||||
; VI-NEXT: s_addc_u32 s8, s8, 0
|
||||
; VI-NEXT: v_readfirstlane_b32 s7, v0
|
||||
; VI-NEXT: s_add_u32 s12, s6, s7
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s12
|
||||
; VI-NEXT: s_add_u32 s10, s6, s7
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s10
|
||||
; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s4, v0, 0
|
||||
; VI-NEXT: s_addc_u32 s13, 0, s8
|
||||
; VI-NEXT: s_mul_i32 s8, s4, s13
|
||||
; VI-NEXT: s_addc_u32 s11, 0, s8
|
||||
; VI-NEXT: s_mul_i32 s8, s4, s11
|
||||
; VI-NEXT: v_readfirstlane_b32 s9, v1
|
||||
; VI-NEXT: s_add_i32 s8, s9, s8
|
||||
; VI-NEXT: s_mul_i32 s9, s5, s12
|
||||
; VI-NEXT: s_add_i32 s14, s8, s9
|
||||
; VI-NEXT: s_sub_i32 s10, s3, s14
|
||||
; VI-NEXT: s_mul_i32 s9, s5, s10
|
||||
; VI-NEXT: s_add_i32 s12, s8, s9
|
||||
; VI-NEXT: s_sub_i32 s13, s3, s12
|
||||
; VI-NEXT: v_readfirstlane_b32 s8, v0
|
||||
; VI-NEXT: s_sub_u32 s15, s2, s8
|
||||
; VI-NEXT: s_sub_u32 s14, s2, s8
|
||||
; VI-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; VI-NEXT: s_cmp_lg_u64 s[8:9], 0
|
||||
; VI-NEXT: s_subb_u32 s16, s10, s5
|
||||
; VI-NEXT: s_sub_u32 s17, s15, s4
|
||||
; VI-NEXT: s_cselect_b64 s[10:11], -1, 0
|
||||
; VI-NEXT: s_cmp_lg_u64 s[10:11], 0
|
||||
; VI-NEXT: s_subb_u32 s10, s16, 0
|
||||
; VI-NEXT: s_cmp_ge_u32 s10, s5
|
||||
; VI-NEXT: s_cselect_b32 s11, -1, 0
|
||||
; VI-NEXT: s_cmp_ge_u32 s17, s4
|
||||
; VI-NEXT: s_subb_u32 s13, s13, s5
|
||||
; VI-NEXT: s_sub_u32 s15, s14, s4
|
||||
; VI-NEXT: s_subb_u32 s13, s13, 0
|
||||
; VI-NEXT: s_cmp_ge_u32 s13, s5
|
||||
; VI-NEXT: s_cselect_b32 s16, -1, 0
|
||||
; VI-NEXT: s_cmp_eq_u32 s10, s5
|
||||
; VI-NEXT: s_cselect_b32 s10, s16, s11
|
||||
; VI-NEXT: s_add_u32 s11, s12, 1
|
||||
; VI-NEXT: s_addc_u32 s16, s13, 0
|
||||
; VI-NEXT: s_add_u32 s17, s12, 2
|
||||
; VI-NEXT: s_addc_u32 s18, s13, 0
|
||||
; VI-NEXT: s_cmp_lg_u32 s10, 0
|
||||
; VI-NEXT: s_cselect_b32 s10, s17, s11
|
||||
; VI-NEXT: s_cselect_b32 s11, s18, s16
|
||||
; VI-NEXT: s_cmp_ge_u32 s15, s4
|
||||
; VI-NEXT: s_cselect_b32 s15, -1, 0
|
||||
; VI-NEXT: s_cmp_eq_u32 s13, s5
|
||||
; VI-NEXT: s_cselect_b32 s13, s15, s16
|
||||
; VI-NEXT: s_add_u32 s15, s10, 1
|
||||
; VI-NEXT: s_addc_u32 s16, s11, 0
|
||||
; VI-NEXT: s_add_u32 s17, s10, 2
|
||||
; VI-NEXT: s_addc_u32 s18, s11, 0
|
||||
; VI-NEXT: s_cmp_lg_u32 s13, 0
|
||||
; VI-NEXT: s_cselect_b32 s13, s17, s15
|
||||
; VI-NEXT: s_cselect_b32 s15, s18, s16
|
||||
; VI-NEXT: s_cmp_lg_u64 s[8:9], 0
|
||||
; VI-NEXT: s_subb_u32 s3, s3, s14
|
||||
; VI-NEXT: s_subb_u32 s3, s3, s12
|
||||
; VI-NEXT: s_cmp_ge_u32 s3, s5
|
||||
; VI-NEXT: s_cselect_b32 s8, -1, 0
|
||||
; VI-NEXT: s_cmp_ge_u32 s15, s4
|
||||
; VI-NEXT: s_cmp_ge_u32 s14, s4
|
||||
; VI-NEXT: s_cselect_b32 s9, -1, 0
|
||||
; VI-NEXT: s_cmp_eq_u32 s3, s5
|
||||
; VI-NEXT: s_cselect_b32 s3, s9, s8
|
||||
; VI-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; VI-NEXT: s_cselect_b32 s9, s11, s13
|
||||
; VI-NEXT: s_cselect_b32 s8, s10, s12
|
||||
; VI-NEXT: s_cselect_b32 s9, s15, s11
|
||||
; VI-NEXT: s_cselect_b32 s8, s13, s10
|
||||
; VI-NEXT: s_cbranch_execnz .LBB16_4
|
||||
; VI-NEXT: .LBB16_2:
|
||||
; VI-NEXT: v_cvt_f32_u32_e32 v0, s4
|
||||
@ -2311,8 +2278,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
|
||||
; GFX9-NEXT: ; %bb.1:
|
||||
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
|
||||
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
|
||||
; GFX9-NEXT: s_sub_u32 s10, 0, s6
|
||||
; GFX9-NEXT: s_subb_u32 s11, 0, s7
|
||||
; GFX9-NEXT: s_sub_u32 s8, 0, s6
|
||||
; GFX9-NEXT: s_subb_u32 s9, 0, s7
|
||||
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
|
||||
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
|
||||
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
@ -2321,109 +2288,102 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
|
||||
; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
|
||||
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GFX9-NEXT: v_readfirstlane_b32 s12, v1
|
||||
; GFX9-NEXT: v_readfirstlane_b32 s8, v0
|
||||
; GFX9-NEXT: s_mul_i32 s9, s10, s12
|
||||
; GFX9-NEXT: s_mul_hi_u32 s14, s10, s8
|
||||
; GFX9-NEXT: s_mul_i32 s13, s11, s8
|
||||
; GFX9-NEXT: s_add_i32 s9, s14, s9
|
||||
; GFX9-NEXT: s_add_i32 s9, s9, s13
|
||||
; GFX9-NEXT: s_mul_i32 s15, s10, s8
|
||||
; GFX9-NEXT: s_mul_i32 s14, s8, s9
|
||||
; GFX9-NEXT: s_mul_hi_u32 s16, s8, s15
|
||||
; GFX9-NEXT: s_mul_hi_u32 s13, s8, s9
|
||||
; GFX9-NEXT: v_readfirstlane_b32 s10, v1
|
||||
; GFX9-NEXT: v_readfirstlane_b32 s11, v0
|
||||
; GFX9-NEXT: s_mul_i32 s12, s8, s10
|
||||
; GFX9-NEXT: s_mul_hi_u32 s14, s8, s11
|
||||
; GFX9-NEXT: s_mul_i32 s13, s9, s11
|
||||
; GFX9-NEXT: s_add_i32 s12, s14, s12
|
||||
; GFX9-NEXT: s_add_i32 s12, s12, s13
|
||||
; GFX9-NEXT: s_mul_i32 s15, s8, s11
|
||||
; GFX9-NEXT: s_mul_i32 s14, s11, s12
|
||||
; GFX9-NEXT: s_mul_hi_u32 s16, s11, s15
|
||||
; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12
|
||||
; GFX9-NEXT: s_add_u32 s14, s16, s14
|
||||
; GFX9-NEXT: s_addc_u32 s13, 0, s13
|
||||
; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15
|
||||
; GFX9-NEXT: s_mul_i32 s15, s12, s15
|
||||
; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15
|
||||
; GFX9-NEXT: s_mul_i32 s15, s10, s15
|
||||
; GFX9-NEXT: s_add_u32 s14, s14, s15
|
||||
; GFX9-NEXT: s_mul_hi_u32 s16, s12, s9
|
||||
; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12
|
||||
; GFX9-NEXT: s_addc_u32 s13, s13, s17
|
||||
; GFX9-NEXT: s_addc_u32 s14, s16, 0
|
||||
; GFX9-NEXT: s_mul_i32 s9, s12, s9
|
||||
; GFX9-NEXT: s_add_u32 s9, s13, s9
|
||||
; GFX9-NEXT: s_mul_i32 s12, s10, s12
|
||||
; GFX9-NEXT: s_add_u32 s12, s13, s12
|
||||
; GFX9-NEXT: s_addc_u32 s13, 0, s14
|
||||
; GFX9-NEXT: s_add_u32 s14, s8, s9
|
||||
; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
|
||||
; GFX9-NEXT: s_addc_u32 s12, s12, s13
|
||||
; GFX9-NEXT: s_mul_i32 s8, s10, s12
|
||||
; GFX9-NEXT: s_mul_hi_u32 s9, s10, s14
|
||||
; GFX9-NEXT: s_add_i32 s8, s9, s8
|
||||
; GFX9-NEXT: s_mul_i32 s11, s11, s14
|
||||
; GFX9-NEXT: s_add_i32 s8, s8, s11
|
||||
; GFX9-NEXT: s_mul_i32 s10, s10, s14
|
||||
; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10
|
||||
; GFX9-NEXT: s_mul_i32 s13, s12, s10
|
||||
; GFX9-NEXT: s_mul_i32 s16, s14, s8
|
||||
; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10
|
||||
; GFX9-NEXT: s_mul_hi_u32 s15, s14, s8
|
||||
; GFX9-NEXT: s_add_u32 s10, s10, s16
|
||||
; GFX9-NEXT: s_add_u32 s11, s11, s12
|
||||
; GFX9-NEXT: s_addc_u32 s10, s10, s13
|
||||
; GFX9-NEXT: s_mul_i32 s12, s8, s10
|
||||
; GFX9-NEXT: s_mul_hi_u32 s13, s8, s11
|
||||
; GFX9-NEXT: s_add_i32 s12, s13, s12
|
||||
; GFX9-NEXT: s_mul_i32 s9, s9, s11
|
||||
; GFX9-NEXT: s_add_i32 s12, s12, s9
|
||||
; GFX9-NEXT: s_mul_i32 s8, s8, s11
|
||||
; GFX9-NEXT: s_mul_hi_u32 s13, s10, s8
|
||||
; GFX9-NEXT: s_mul_i32 s14, s10, s8
|
||||
; GFX9-NEXT: s_mul_i32 s16, s11, s12
|
||||
; GFX9-NEXT: s_mul_hi_u32 s8, s11, s8
|
||||
; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12
|
||||
; GFX9-NEXT: s_add_u32 s8, s8, s16
|
||||
; GFX9-NEXT: s_addc_u32 s15, 0, s15
|
||||
; GFX9-NEXT: s_add_u32 s10, s10, s13
|
||||
; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8
|
||||
; GFX9-NEXT: s_addc_u32 s10, s15, s11
|
||||
; GFX9-NEXT: s_add_u32 s8, s8, s14
|
||||
; GFX9-NEXT: s_mul_hi_u32 s9, s10, s12
|
||||
; GFX9-NEXT: s_addc_u32 s8, s15, s13
|
||||
; GFX9-NEXT: s_addc_u32 s9, s9, 0
|
||||
; GFX9-NEXT: s_mul_i32 s8, s12, s8
|
||||
; GFX9-NEXT: s_add_u32 s8, s10, s8
|
||||
; GFX9-NEXT: s_addc_u32 s10, 0, s9
|
||||
; GFX9-NEXT: s_add_u32 s11, s14, s8
|
||||
; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
|
||||
; GFX9-NEXT: s_addc_u32 s8, s12, s10
|
||||
; GFX9-NEXT: s_mul_i32 s10, s2, s8
|
||||
; GFX9-NEXT: s_mul_hi_u32 s12, s2, s11
|
||||
; GFX9-NEXT: s_mul_hi_u32 s9, s2, s8
|
||||
; GFX9-NEXT: s_add_u32 s10, s12, s10
|
||||
; GFX9-NEXT: s_mul_i32 s12, s10, s12
|
||||
; GFX9-NEXT: s_add_u32 s8, s8, s12
|
||||
; GFX9-NEXT: s_addc_u32 s9, 0, s9
|
||||
; GFX9-NEXT: s_mul_hi_u32 s13, s3, s11
|
||||
; GFX9-NEXT: s_mul_i32 s11, s3, s11
|
||||
; GFX9-NEXT: s_add_u32 s10, s10, s11
|
||||
; GFX9-NEXT: s_mul_hi_u32 s12, s3, s8
|
||||
; GFX9-NEXT: s_addc_u32 s9, s9, s13
|
||||
; GFX9-NEXT: s_addc_u32 s10, s12, 0
|
||||
; GFX9-NEXT: s_add_u32 s8, s11, s8
|
||||
; GFX9-NEXT: s_addc_u32 s9, s10, s9
|
||||
; GFX9-NEXT: s_mul_i32 s11, s2, s9
|
||||
; GFX9-NEXT: s_mul_hi_u32 s12, s2, s8
|
||||
; GFX9-NEXT: s_mul_hi_u32 s10, s2, s9
|
||||
; GFX9-NEXT: s_add_u32 s11, s12, s11
|
||||
; GFX9-NEXT: s_addc_u32 s10, 0, s10
|
||||
; GFX9-NEXT: s_mul_hi_u32 s13, s3, s8
|
||||
; GFX9-NEXT: s_mul_i32 s8, s3, s8
|
||||
; GFX9-NEXT: s_add_u32 s12, s9, s8
|
||||
; GFX9-NEXT: s_addc_u32 s13, 0, s10
|
||||
; GFX9-NEXT: s_mul_i32 s8, s6, s13
|
||||
; GFX9-NEXT: s_mul_hi_u32 s9, s6, s12
|
||||
; GFX9-NEXT: s_add_u32 s8, s11, s8
|
||||
; GFX9-NEXT: s_mul_hi_u32 s12, s3, s9
|
||||
; GFX9-NEXT: s_addc_u32 s8, s10, s13
|
||||
; GFX9-NEXT: s_addc_u32 s10, s12, 0
|
||||
; GFX9-NEXT: s_mul_i32 s9, s3, s9
|
||||
; GFX9-NEXT: s_add_u32 s11, s8, s9
|
||||
; GFX9-NEXT: s_addc_u32 s10, 0, s10
|
||||
; GFX9-NEXT: s_mul_i32 s8, s6, s10
|
||||
; GFX9-NEXT: s_mul_hi_u32 s9, s6, s11
|
||||
; GFX9-NEXT: s_add_i32 s8, s9, s8
|
||||
; GFX9-NEXT: s_mul_i32 s9, s7, s12
|
||||
; GFX9-NEXT: s_add_i32 s14, s8, s9
|
||||
; GFX9-NEXT: s_sub_i32 s10, s3, s14
|
||||
; GFX9-NEXT: s_mul_i32 s8, s6, s12
|
||||
; GFX9-NEXT: s_sub_u32 s15, s2, s8
|
||||
; GFX9-NEXT: s_mul_i32 s9, s7, s11
|
||||
; GFX9-NEXT: s_add_i32 s12, s8, s9
|
||||
; GFX9-NEXT: s_sub_i32 s13, s3, s12
|
||||
; GFX9-NEXT: s_mul_i32 s8, s6, s11
|
||||
; GFX9-NEXT: s_sub_u32 s14, s2, s8
|
||||
; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
|
||||
; GFX9-NEXT: s_subb_u32 s16, s10, s7
|
||||
; GFX9-NEXT: s_sub_u32 s17, s15, s6
|
||||
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
|
||||
; GFX9-NEXT: s_subb_u32 s10, s16, 0
|
||||
; GFX9-NEXT: s_cmp_ge_u32 s10, s7
|
||||
; GFX9-NEXT: s_cselect_b32 s11, -1, 0
|
||||
; GFX9-NEXT: s_cmp_ge_u32 s17, s6
|
||||
; GFX9-NEXT: s_subb_u32 s13, s13, s7
|
||||
; GFX9-NEXT: s_sub_u32 s15, s14, s6
|
||||
; GFX9-NEXT: s_subb_u32 s13, s13, 0
|
||||
; GFX9-NEXT: s_cmp_ge_u32 s13, s7
|
||||
; GFX9-NEXT: s_cselect_b32 s16, -1, 0
|
||||
; GFX9-NEXT: s_cmp_eq_u32 s10, s7
|
||||
; GFX9-NEXT: s_cselect_b32 s10, s16, s11
|
||||
; GFX9-NEXT: s_add_u32 s11, s12, 1
|
||||
; GFX9-NEXT: s_addc_u32 s16, s13, 0
|
||||
; GFX9-NEXT: s_add_u32 s17, s12, 2
|
||||
; GFX9-NEXT: s_addc_u32 s18, s13, 0
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s10, 0
|
||||
; GFX9-NEXT: s_cselect_b32 s10, s17, s11
|
||||
; GFX9-NEXT: s_cselect_b32 s11, s18, s16
|
||||
; GFX9-NEXT: s_cmp_ge_u32 s15, s6
|
||||
; GFX9-NEXT: s_cselect_b32 s15, -1, 0
|
||||
; GFX9-NEXT: s_cmp_eq_u32 s13, s7
|
||||
; GFX9-NEXT: s_cselect_b32 s13, s15, s16
|
||||
; GFX9-NEXT: s_add_u32 s15, s11, 1
|
||||
; GFX9-NEXT: s_addc_u32 s16, s10, 0
|
||||
; GFX9-NEXT: s_add_u32 s17, s11, 2
|
||||
; GFX9-NEXT: s_addc_u32 s18, s10, 0
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s13, 0
|
||||
; GFX9-NEXT: s_cselect_b32 s13, s17, s15
|
||||
; GFX9-NEXT: s_cselect_b32 s15, s18, s16
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
|
||||
; GFX9-NEXT: s_subb_u32 s3, s3, s14
|
||||
; GFX9-NEXT: s_subb_u32 s3, s3, s12
|
||||
; GFX9-NEXT: s_cmp_ge_u32 s3, s7
|
||||
; GFX9-NEXT: s_cselect_b32 s8, -1, 0
|
||||
; GFX9-NEXT: s_cmp_ge_u32 s15, s6
|
||||
; GFX9-NEXT: s_cmp_ge_u32 s14, s6
|
||||
; GFX9-NEXT: s_cselect_b32 s9, -1, 0
|
||||
; GFX9-NEXT: s_cmp_eq_u32 s3, s7
|
||||
; GFX9-NEXT: s_cselect_b32 s3, s9, s8
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX9-NEXT: s_cselect_b32 s9, s11, s13
|
||||
; GFX9-NEXT: s_cselect_b32 s8, s10, s12
|
||||
; GFX9-NEXT: s_cselect_b32 s9, s15, s10
|
||||
; GFX9-NEXT: s_cselect_b32 s8, s13, s11
|
||||
; GFX9-NEXT: s_cbranch_execnz .LBB16_3
|
||||
; GFX9-NEXT: .LBB16_2:
|
||||
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
|
||||
@ -2503,44 +2463,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
|
||||
; GFX1010-NEXT: s_add_u32 s11, s12, s11
|
||||
; GFX1010-NEXT: s_addc_u32 s12, 0, s13
|
||||
; GFX1010-NEXT: s_add_u32 s8, s8, s11
|
||||
; GFX1010-NEXT: s_cselect_b32 s11, -1, 0
|
||||
; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s8
|
||||
; GFX1010-NEXT: s_cmp_lg_u32 s11, 0
|
||||
; GFX1010-NEXT: s_mul_i32 s11, s9, s8
|
||||
; GFX1010-NEXT: s_addc_u32 s5, s5, s12
|
||||
; GFX1010-NEXT: s_mul_i32 s10, s10, s8
|
||||
; GFX1010-NEXT: s_mul_hi_u32 s11, s9, s8
|
||||
; GFX1010-NEXT: s_mul_i32 s12, s9, s8
|
||||
; GFX1010-NEXT: s_mul_i32 s9, s9, s5
|
||||
; GFX1010-NEXT: s_mul_hi_u32 s12, s8, s11
|
||||
; GFX1010-NEXT: s_add_i32 s9, s13, s9
|
||||
; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s11
|
||||
; GFX1010-NEXT: s_mul_i32 s10, s10, s8
|
||||
; GFX1010-NEXT: s_add_i32 s9, s11, s9
|
||||
; GFX1010-NEXT: s_mul_i32 s11, s5, s12
|
||||
; GFX1010-NEXT: s_add_i32 s9, s9, s10
|
||||
; GFX1010-NEXT: s_mul_i32 s10, s5, s11
|
||||
; GFX1010-NEXT: s_mul_hi_u32 s10, s8, s12
|
||||
; GFX1010-NEXT: s_mul_i32 s15, s8, s9
|
||||
; GFX1010-NEXT: s_mul_hi_u32 s14, s8, s9
|
||||
; GFX1010-NEXT: s_add_u32 s12, s12, s15
|
||||
; GFX1010-NEXT: s_add_u32 s10, s10, s15
|
||||
; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s12
|
||||
; GFX1010-NEXT: s_addc_u32 s14, 0, s14
|
||||
; GFX1010-NEXT: s_mul_hi_u32 s11, s5, s9
|
||||
; GFX1010-NEXT: s_add_u32 s10, s12, s10
|
||||
; GFX1010-NEXT: s_mul_hi_u32 s12, s5, s9
|
||||
; GFX1010-NEXT: s_add_u32 s10, s10, s11
|
||||
; GFX1010-NEXT: s_mul_i32 s9, s5, s9
|
||||
; GFX1010-NEXT: s_addc_u32 s10, s14, s13
|
||||
; GFX1010-NEXT: s_addc_u32 s11, s11, 0
|
||||
; GFX1010-NEXT: s_addc_u32 s11, s12, 0
|
||||
; GFX1010-NEXT: s_add_u32 s9, s10, s9
|
||||
; GFX1010-NEXT: s_addc_u32 s10, 0, s11
|
||||
; GFX1010-NEXT: s_add_u32 s8, s8, s9
|
||||
; GFX1010-NEXT: s_cselect_b32 s9, -1, 0
|
||||
; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s8
|
||||
; GFX1010-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX1010-NEXT: s_mul_hi_u32 s9, s3, s8
|
||||
; GFX1010-NEXT: s_addc_u32 s5, s5, s10
|
||||
; GFX1010-NEXT: s_mul_i32 s8, s3, s8
|
||||
; GFX1010-NEXT: s_mul_hi_u32 s9, s2, s8
|
||||
; GFX1010-NEXT: s_mul_i32 s12, s2, s5
|
||||
; GFX1010-NEXT: s_mul_hi_u32 s10, s2, s5
|
||||
; GFX1010-NEXT: s_add_u32 s11, s11, s12
|
||||
; GFX1010-NEXT: s_addc_u32 s10, 0, s10
|
||||
; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s5
|
||||
; GFX1010-NEXT: s_mul_hi_u32 s10, s3, s8
|
||||
; GFX1010-NEXT: s_mul_i32 s8, s3, s8
|
||||
; GFX1010-NEXT: s_add_u32 s9, s9, s12
|
||||
; GFX1010-NEXT: s_addc_u32 s11, 0, s11
|
||||
; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s5
|
||||
; GFX1010-NEXT: s_add_u32 s8, s11, s8
|
||||
; GFX1010-NEXT: s_add_u32 s8, s9, s8
|
||||
; GFX1010-NEXT: s_mul_i32 s5, s3, s5
|
||||
; GFX1010-NEXT: s_addc_u32 s8, s10, s9
|
||||
; GFX1010-NEXT: s_addc_u32 s8, s11, s10
|
||||
; GFX1010-NEXT: s_addc_u32 s9, s13, 0
|
||||
; GFX1010-NEXT: s_add_u32 s5, s8, s5
|
||||
; GFX1010-NEXT: s_addc_u32 s8, 0, s9
|
||||
@ -2553,11 +2509,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
|
||||
; GFX1010-NEXT: s_sub_i32 s11, s3, s9
|
||||
; GFX1010-NEXT: s_sub_u32 s10, s2, s10
|
||||
; GFX1010-NEXT: s_cselect_b32 s12, -1, 0
|
||||
; GFX1010-NEXT: s_cmp_lg_u32 s12, 0
|
||||
; GFX1010-NEXT: s_subb_u32 s11, s11, s7
|
||||
; GFX1010-NEXT: s_sub_u32 s13, s10, s6
|
||||
; GFX1010-NEXT: s_cselect_b32 s14, -1, 0
|
||||
; GFX1010-NEXT: s_cmp_lg_u32 s14, 0
|
||||
; GFX1010-NEXT: s_subb_u32 s11, s11, 0
|
||||
; GFX1010-NEXT: s_cmp_ge_u32 s11, s7
|
||||
; GFX1010-NEXT: s_cselect_b32 s14, -1, 0
|
||||
@ -2663,44 +2616,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
|
||||
; GFX1030W32-NEXT: s_add_u32 s11, s12, s11
|
||||
; GFX1030W32-NEXT: s_addc_u32 s12, 0, s13
|
||||
; GFX1030W32-NEXT: s_add_u32 s8, s8, s11
|
||||
; GFX1030W32-NEXT: s_cselect_b32 s11, -1, 0
|
||||
; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s8
|
||||
; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0
|
||||
; GFX1030W32-NEXT: s_mul_i32 s11, s9, s8
|
||||
; GFX1030W32-NEXT: s_addc_u32 s7, s7, s12
|
||||
; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8
|
||||
; GFX1030W32-NEXT: s_mul_hi_u32 s11, s9, s8
|
||||
; GFX1030W32-NEXT: s_mul_i32 s12, s9, s8
|
||||
; GFX1030W32-NEXT: s_mul_i32 s9, s9, s7
|
||||
; GFX1030W32-NEXT: s_mul_hi_u32 s12, s8, s11
|
||||
; GFX1030W32-NEXT: s_add_i32 s9, s13, s9
|
||||
; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s11
|
||||
; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8
|
||||
; GFX1030W32-NEXT: s_add_i32 s9, s11, s9
|
||||
; GFX1030W32-NEXT: s_mul_i32 s11, s7, s12
|
||||
; GFX1030W32-NEXT: s_add_i32 s9, s9, s10
|
||||
; GFX1030W32-NEXT: s_mul_i32 s10, s7, s11
|
||||
; GFX1030W32-NEXT: s_mul_hi_u32 s10, s8, s12
|
||||
; GFX1030W32-NEXT: s_mul_i32 s15, s8, s9
|
||||
; GFX1030W32-NEXT: s_mul_hi_u32 s14, s8, s9
|
||||
; GFX1030W32-NEXT: s_add_u32 s12, s12, s15
|
||||
; GFX1030W32-NEXT: s_add_u32 s10, s10, s15
|
||||
; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s12
|
||||
; GFX1030W32-NEXT: s_addc_u32 s14, 0, s14
|
||||
; GFX1030W32-NEXT: s_mul_hi_u32 s11, s7, s9
|
||||
; GFX1030W32-NEXT: s_add_u32 s10, s12, s10
|
||||
; GFX1030W32-NEXT: s_mul_hi_u32 s12, s7, s9
|
||||
; GFX1030W32-NEXT: s_add_u32 s10, s10, s11
|
||||
; GFX1030W32-NEXT: s_mul_i32 s9, s7, s9
|
||||
; GFX1030W32-NEXT: s_addc_u32 s10, s14, s13
|
||||
; GFX1030W32-NEXT: s_addc_u32 s11, s11, 0
|
||||
; GFX1030W32-NEXT: s_addc_u32 s11, s12, 0
|
||||
; GFX1030W32-NEXT: s_add_u32 s9, s10, s9
|
||||
; GFX1030W32-NEXT: s_addc_u32 s10, 0, s11
|
||||
; GFX1030W32-NEXT: s_add_u32 s8, s8, s9
|
||||
; GFX1030W32-NEXT: s_cselect_b32 s9, -1, 0
|
||||
; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s8
|
||||
; GFX1030W32-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX1030W32-NEXT: s_mul_hi_u32 s9, s3, s8
|
||||
; GFX1030W32-NEXT: s_addc_u32 s7, s7, s10
|
||||
; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8
|
||||
; GFX1030W32-NEXT: s_mul_hi_u32 s9, s2, s8
|
||||
; GFX1030W32-NEXT: s_mul_i32 s12, s2, s7
|
||||
; GFX1030W32-NEXT: s_mul_hi_u32 s10, s2, s7
|
||||
; GFX1030W32-NEXT: s_add_u32 s11, s11, s12
|
||||
; GFX1030W32-NEXT: s_addc_u32 s10, 0, s10
|
||||
; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s7
|
||||
; GFX1030W32-NEXT: s_mul_hi_u32 s10, s3, s8
|
||||
; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8
|
||||
; GFX1030W32-NEXT: s_add_u32 s9, s9, s12
|
||||
; GFX1030W32-NEXT: s_addc_u32 s11, 0, s11
|
||||
; GFX1030W32-NEXT: s_mul_hi_u32 s13, s3, s7
|
||||
; GFX1030W32-NEXT: s_add_u32 s8, s11, s8
|
||||
; GFX1030W32-NEXT: s_add_u32 s8, s9, s8
|
||||
; GFX1030W32-NEXT: s_mul_i32 s7, s3, s7
|
||||
; GFX1030W32-NEXT: s_addc_u32 s8, s10, s9
|
||||
; GFX1030W32-NEXT: s_addc_u32 s8, s11, s10
|
||||
; GFX1030W32-NEXT: s_addc_u32 s9, s13, 0
|
||||
; GFX1030W32-NEXT: s_add_u32 s7, s8, s7
|
||||
; GFX1030W32-NEXT: s_addc_u32 s8, 0, s9
|
||||
@ -2713,11 +2662,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
|
||||
; GFX1030W32-NEXT: s_sub_i32 s11, s3, s9
|
||||
; GFX1030W32-NEXT: s_sub_u32 s10, s2, s10
|
||||
; GFX1030W32-NEXT: s_cselect_b32 s12, -1, 0
|
||||
; GFX1030W32-NEXT: s_cmp_lg_u32 s12, 0
|
||||
; GFX1030W32-NEXT: s_subb_u32 s11, s11, s5
|
||||
; GFX1030W32-NEXT: s_sub_u32 s13, s10, s4
|
||||
; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0
|
||||
; GFX1030W32-NEXT: s_cmp_lg_u32 s14, 0
|
||||
; GFX1030W32-NEXT: s_subb_u32 s11, s11, 0
|
||||
; GFX1030W32-NEXT: s_cmp_ge_u32 s11, s5
|
||||
; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0
|
||||
@ -2790,8 +2736,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
|
||||
; GFX1030W64-NEXT: ; %bb.1:
|
||||
; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4
|
||||
; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v1, s5
|
||||
; GFX1030W64-NEXT: s_sub_u32 s9, 0, s4
|
||||
; GFX1030W64-NEXT: s_subb_u32 s10, 0, s5
|
||||
; GFX1030W64-NEXT: s_sub_u32 s8, 0, s4
|
||||
; GFX1030W64-NEXT: s_subb_u32 s9, 0, s5
|
||||
; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0
|
||||
; GFX1030W64-NEXT: v_rcp_f32_e32 v0, v0
|
||||
; GFX1030W64-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
@ -2800,109 +2746,102 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
|
||||
; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0
|
||||
; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GFX1030W64-NEXT: v_readfirstlane_b32 s8, v1
|
||||
; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v0
|
||||
; GFX1030W64-NEXT: s_mul_i32 s7, s9, s8
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s12, s9, s6
|
||||
; GFX1030W64-NEXT: s_mul_i32 s11, s10, s6
|
||||
; GFX1030W64-NEXT: s_add_i32 s7, s12, s7
|
||||
; GFX1030W64-NEXT: s_mul_i32 s13, s9, s6
|
||||
; GFX1030W64-NEXT: s_add_i32 s7, s7, s11
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s13
|
||||
; GFX1030W64-NEXT: s_mul_i32 s15, s6, s7
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s14, s8, s13
|
||||
; GFX1030W64-NEXT: s_mul_i32 s11, s8, s13
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s13, s6, s7
|
||||
; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v1
|
||||
; GFX1030W64-NEXT: v_readfirstlane_b32 s7, v0
|
||||
; GFX1030W64-NEXT: s_mul_i32 s10, s8, s6
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s12, s8, s7
|
||||
; GFX1030W64-NEXT: s_mul_i32 s11, s9, s7
|
||||
; GFX1030W64-NEXT: s_add_i32 s10, s12, s10
|
||||
; GFX1030W64-NEXT: s_mul_i32 s13, s8, s7
|
||||
; GFX1030W64-NEXT: s_add_i32 s10, s10, s11
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s12, s7, s13
|
||||
; GFX1030W64-NEXT: s_mul_i32 s15, s7, s10
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s14, s6, s13
|
||||
; GFX1030W64-NEXT: s_mul_i32 s11, s6, s13
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s10
|
||||
; GFX1030W64-NEXT: s_add_u32 s12, s12, s15
|
||||
; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s16, s8, s7
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s16, s6, s10
|
||||
; GFX1030W64-NEXT: s_add_u32 s11, s12, s11
|
||||
; GFX1030W64-NEXT: s_mul_i32 s7, s8, s7
|
||||
; GFX1030W64-NEXT: s_mul_i32 s10, s6, s10
|
||||
; GFX1030W64-NEXT: s_addc_u32 s11, s13, s14
|
||||
; GFX1030W64-NEXT: s_addc_u32 s12, s16, 0
|
||||
; GFX1030W64-NEXT: s_add_u32 s7, s11, s7
|
||||
; GFX1030W64-NEXT: s_addc_u32 s11, 0, s12
|
||||
; GFX1030W64-NEXT: s_add_u32 s12, s6, s7
|
||||
; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s13, s9, s12
|
||||
; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
|
||||
; GFX1030W64-NEXT: s_mul_i32 s6, s9, s12
|
||||
; GFX1030W64-NEXT: s_addc_u32 s8, s8, s11
|
||||
; GFX1030W64-NEXT: s_mul_i32 s10, s10, s12
|
||||
; GFX1030W64-NEXT: s_mul_i32 s9, s9, s8
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s7, s12, s6
|
||||
; GFX1030W64-NEXT: s_add_i32 s9, s13, s9
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s11, s8, s6
|
||||
; GFX1030W64-NEXT: s_add_i32 s9, s9, s10
|
||||
; GFX1030W64-NEXT: s_mul_i32 s6, s8, s6
|
||||
; GFX1030W64-NEXT: s_mul_i32 s14, s12, s9
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s13, s12, s9
|
||||
; GFX1030W64-NEXT: s_add_u32 s7, s7, s14
|
||||
; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s9
|
||||
; GFX1030W64-NEXT: s_add_u32 s6, s7, s6
|
||||
; GFX1030W64-NEXT: s_mul_i32 s9, s8, s9
|
||||
; GFX1030W64-NEXT: s_addc_u32 s6, s13, s11
|
||||
; GFX1030W64-NEXT: s_addc_u32 s7, s10, 0
|
||||
; GFX1030W64-NEXT: s_add_u32 s6, s6, s9
|
||||
; GFX1030W64-NEXT: s_addc_u32 s9, 0, s7
|
||||
; GFX1030W64-NEXT: s_add_u32 s10, s12, s6
|
||||
; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s11, s2, s10
|
||||
; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s6, s3, s10
|
||||
; GFX1030W64-NEXT: s_addc_u32 s7, s8, s9
|
||||
; GFX1030W64-NEXT: s_mul_i32 s8, s3, s10
|
||||
; GFX1030W64-NEXT: s_mul_i32 s10, s2, s7
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s9, s2, s7
|
||||
; GFX1030W64-NEXT: s_add_u32 s10, s11, s10
|
||||
; GFX1030W64-NEXT: s_addc_u32 s9, 0, s9
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s7
|
||||
; GFX1030W64-NEXT: s_add_u32 s8, s10, s8
|
||||
; GFX1030W64-NEXT: s_addc_u32 s11, 0, s12
|
||||
; GFX1030W64-NEXT: s_add_u32 s7, s7, s10
|
||||
; GFX1030W64-NEXT: s_addc_u32 s6, s6, s11
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s7
|
||||
; GFX1030W64-NEXT: s_mul_i32 s11, s8, s7
|
||||
; GFX1030W64-NEXT: s_mul_i32 s8, s8, s6
|
||||
; GFX1030W64-NEXT: s_mul_i32 s9, s9, s7
|
||||
; GFX1030W64-NEXT: s_add_i32 s8, s10, s8
|
||||
; GFX1030W64-NEXT: s_mul_i32 s10, s6, s11
|
||||
; GFX1030W64-NEXT: s_add_i32 s8, s8, s9
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s9, s7, s11
|
||||
; GFX1030W64-NEXT: s_mul_i32 s14, s7, s8
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s8
|
||||
; GFX1030W64-NEXT: s_add_u32 s9, s9, s14
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s11
|
||||
; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s11, s6, s8
|
||||
; GFX1030W64-NEXT: s_add_u32 s9, s9, s10
|
||||
; GFX1030W64-NEXT: s_mul_i32 s8, s6, s8
|
||||
; GFX1030W64-NEXT: s_addc_u32 s9, s13, s12
|
||||
; GFX1030W64-NEXT: s_addc_u32 s10, s11, 0
|
||||
; GFX1030W64-NEXT: s_add_u32 s8, s9, s8
|
||||
; GFX1030W64-NEXT: s_addc_u32 s9, 0, s10
|
||||
; GFX1030W64-NEXT: s_add_u32 s7, s7, s8
|
||||
; GFX1030W64-NEXT: s_addc_u32 s6, s6, s9
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s8, s2, s7
|
||||
; GFX1030W64-NEXT: s_mul_i32 s11, s2, s6
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s10, s2, s6
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s9, s3, s7
|
||||
; GFX1030W64-NEXT: s_mul_i32 s7, s3, s7
|
||||
; GFX1030W64-NEXT: s_addc_u32 s6, s9, s6
|
||||
; GFX1030W64-NEXT: s_add_u32 s8, s8, s11
|
||||
; GFX1030W64-NEXT: s_addc_u32 s10, 0, s10
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s6
|
||||
; GFX1030W64-NEXT: s_add_u32 s7, s8, s7
|
||||
; GFX1030W64-NEXT: s_mul_i32 s6, s3, s6
|
||||
; GFX1030W64-NEXT: s_addc_u32 s7, s10, s9
|
||||
; GFX1030W64-NEXT: s_addc_u32 s8, s12, 0
|
||||
; GFX1030W64-NEXT: s_add_u32 s10, s6, s7
|
||||
; GFX1030W64-NEXT: s_add_u32 s10, s7, s6
|
||||
; GFX1030W64-NEXT: s_addc_u32 s11, 0, s8
|
||||
; GFX1030W64-NEXT: s_mul_hi_u32 s6, s4, s10
|
||||
; GFX1030W64-NEXT: s_mul_i32 s7, s4, s11
|
||||
; GFX1030W64-NEXT: s_mul_i32 s8, s5, s10
|
||||
; GFX1030W64-NEXT: s_add_i32 s6, s6, s7
|
||||
; GFX1030W64-NEXT: s_add_i32 s12, s6, s8
|
||||
; GFX1030W64-NEXT: s_add_i32 s8, s6, s8
|
||||
; GFX1030W64-NEXT: s_mul_i32 s6, s4, s10
|
||||
; GFX1030W64-NEXT: s_sub_i32 s8, s3, s12
|
||||
; GFX1030W64-NEXT: s_sub_u32 s13, s2, s6
|
||||
; GFX1030W64-NEXT: s_sub_i32 s9, s3, s8
|
||||
; GFX1030W64-NEXT: s_sub_u32 s12, s2, s6
|
||||
; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
|
||||
; GFX1030W64-NEXT: s_subb_u32 s14, s8, s5
|
||||
; GFX1030W64-NEXT: s_sub_u32 s15, s13, s4
|
||||
; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
|
||||
; GFX1030W64-NEXT: s_subb_u32 s8, s14, 0
|
||||
; GFX1030W64-NEXT: s_cmp_ge_u32 s8, s5
|
||||
; GFX1030W64-NEXT: s_cselect_b32 s9, -1, 0
|
||||
; GFX1030W64-NEXT: s_cmp_ge_u32 s15, s4
|
||||
; GFX1030W64-NEXT: s_subb_u32 s9, s9, s5
|
||||
; GFX1030W64-NEXT: s_sub_u32 s13, s12, s4
|
||||
; GFX1030W64-NEXT: s_subb_u32 s9, s9, 0
|
||||
; GFX1030W64-NEXT: s_cmp_ge_u32 s9, s5
|
||||
; GFX1030W64-NEXT: s_cselect_b32 s14, -1, 0
|
||||
; GFX1030W64-NEXT: s_cmp_eq_u32 s8, s5
|
||||
; GFX1030W64-NEXT: s_cselect_b32 s8, s14, s9
|
||||
; GFX1030W64-NEXT: s_add_u32 s9, s10, 1
|
||||
; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4
|
||||
; GFX1030W64-NEXT: s_cselect_b32 s13, -1, 0
|
||||
; GFX1030W64-NEXT: s_cmp_eq_u32 s9, s5
|
||||
; GFX1030W64-NEXT: s_cselect_b32 s9, s13, s14
|
||||
; GFX1030W64-NEXT: s_add_u32 s13, s10, 1
|
||||
; GFX1030W64-NEXT: s_addc_u32 s14, s11, 0
|
||||
; GFX1030W64-NEXT: s_add_u32 s15, s10, 2
|
||||
; GFX1030W64-NEXT: s_addc_u32 s16, s11, 0
|
||||
; GFX1030W64-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX1030W64-NEXT: s_cselect_b32 s15, s15, s9
|
||||
; GFX1030W64-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX1030W64-NEXT: s_cselect_b32 s13, s15, s13
|
||||
; GFX1030W64-NEXT: s_cselect_b32 s14, s16, s14
|
||||
; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
|
||||
; GFX1030W64-NEXT: s_subb_u32 s3, s3, s12
|
||||
; GFX1030W64-NEXT: s_subb_u32 s3, s3, s8
|
||||
; GFX1030W64-NEXT: s_cmp_ge_u32 s3, s5
|
||||
; GFX1030W64-NEXT: s_cselect_b32 s6, -1, 0
|
||||
; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4
|
||||
; GFX1030W64-NEXT: s_cmp_ge_u32 s12, s4
|
||||
; GFX1030W64-NEXT: s_cselect_b32 s7, -1, 0
|
||||
; GFX1030W64-NEXT: s_cmp_eq_u32 s3, s5
|
||||
; GFX1030W64-NEXT: s_cselect_b32 s3, s7, s6
|
||||
; GFX1030W64-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX1030W64-NEXT: s_cselect_b32 s7, s14, s11
|
||||
; GFX1030W64-NEXT: s_cselect_b32 s6, s15, s10
|
||||
; GFX1030W64-NEXT: s_cselect_b32 s6, s13, s10
|
||||
; GFX1030W64-NEXT: s_cbranch_execnz .LBB16_3
|
||||
; GFX1030W64-NEXT: .LBB16_2:
|
||||
; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4
|
||||
@ -2988,44 +2927,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
|
||||
; GFX11-NEXT: s_add_u32 s11, s12, s11
|
||||
; GFX11-NEXT: s_addc_u32 s12, 0, s13
|
||||
; GFX11-NEXT: s_add_u32 s8, s8, s11
|
||||
; GFX11-NEXT: s_cselect_b32 s11, -1, 0
|
||||
; GFX11-NEXT: s_mul_hi_u32 s13, s9, s8
|
||||
; GFX11-NEXT: s_cmp_lg_u32 s11, 0
|
||||
; GFX11-NEXT: s_mul_i32 s11, s9, s8
|
||||
; GFX11-NEXT: s_addc_u32 s7, s7, s12
|
||||
; GFX11-NEXT: s_mul_i32 s10, s10, s8
|
||||
; GFX11-NEXT: s_mul_hi_u32 s11, s9, s8
|
||||
; GFX11-NEXT: s_mul_i32 s12, s9, s8
|
||||
; GFX11-NEXT: s_mul_i32 s9, s9, s7
|
||||
; GFX11-NEXT: s_mul_hi_u32 s12, s8, s11
|
||||
; GFX11-NEXT: s_add_i32 s9, s13, s9
|
||||
; GFX11-NEXT: s_mul_hi_u32 s13, s7, s11
|
||||
; GFX11-NEXT: s_mul_i32 s10, s10, s8
|
||||
; GFX11-NEXT: s_add_i32 s9, s11, s9
|
||||
; GFX11-NEXT: s_mul_i32 s11, s7, s12
|
||||
; GFX11-NEXT: s_add_i32 s9, s9, s10
|
||||
; GFX11-NEXT: s_mul_i32 s10, s7, s11
|
||||
; GFX11-NEXT: s_mul_hi_u32 s10, s8, s12
|
||||
; GFX11-NEXT: s_mul_i32 s15, s8, s9
|
||||
; GFX11-NEXT: s_mul_hi_u32 s14, s8, s9
|
||||
; GFX11-NEXT: s_add_u32 s12, s12, s15
|
||||
; GFX11-NEXT: s_add_u32 s10, s10, s15
|
||||
; GFX11-NEXT: s_mul_hi_u32 s13, s7, s12
|
||||
; GFX11-NEXT: s_addc_u32 s14, 0, s14
|
||||
; GFX11-NEXT: s_mul_hi_u32 s11, s7, s9
|
||||
; GFX11-NEXT: s_add_u32 s10, s12, s10
|
||||
; GFX11-NEXT: s_mul_hi_u32 s12, s7, s9
|
||||
; GFX11-NEXT: s_add_u32 s10, s10, s11
|
||||
; GFX11-NEXT: s_mul_i32 s9, s7, s9
|
||||
; GFX11-NEXT: s_addc_u32 s10, s14, s13
|
||||
; GFX11-NEXT: s_addc_u32 s11, s11, 0
|
||||
; GFX11-NEXT: s_addc_u32 s11, s12, 0
|
||||
; GFX11-NEXT: s_add_u32 s9, s10, s9
|
||||
; GFX11-NEXT: s_addc_u32 s10, 0, s11
|
||||
; GFX11-NEXT: s_add_u32 s8, s8, s9
|
||||
; GFX11-NEXT: s_cselect_b32 s9, -1, 0
|
||||
; GFX11-NEXT: s_mul_hi_u32 s11, s2, s8
|
||||
; GFX11-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX11-NEXT: s_mul_hi_u32 s9, s3, s8
|
||||
; GFX11-NEXT: s_addc_u32 s7, s7, s10
|
||||
; GFX11-NEXT: s_mul_i32 s8, s3, s8
|
||||
; GFX11-NEXT: s_mul_hi_u32 s9, s2, s8
|
||||
; GFX11-NEXT: s_mul_i32 s12, s2, s7
|
||||
; GFX11-NEXT: s_mul_hi_u32 s10, s2, s7
|
||||
; GFX11-NEXT: s_add_u32 s11, s11, s12
|
||||
; GFX11-NEXT: s_addc_u32 s10, 0, s10
|
||||
; GFX11-NEXT: s_mul_hi_u32 s11, s2, s7
|
||||
; GFX11-NEXT: s_mul_hi_u32 s10, s3, s8
|
||||
; GFX11-NEXT: s_mul_i32 s8, s3, s8
|
||||
; GFX11-NEXT: s_add_u32 s9, s9, s12
|
||||
; GFX11-NEXT: s_addc_u32 s11, 0, s11
|
||||
; GFX11-NEXT: s_mul_hi_u32 s13, s3, s7
|
||||
; GFX11-NEXT: s_add_u32 s8, s11, s8
|
||||
; GFX11-NEXT: s_add_u32 s8, s9, s8
|
||||
; GFX11-NEXT: s_mul_i32 s7, s3, s7
|
||||
; GFX11-NEXT: s_addc_u32 s8, s10, s9
|
||||
; GFX11-NEXT: s_addc_u32 s8, s11, s10
|
||||
; GFX11-NEXT: s_addc_u32 s9, s13, 0
|
||||
; GFX11-NEXT: s_add_u32 s7, s8, s7
|
||||
; GFX11-NEXT: s_addc_u32 s8, 0, s9
|
||||
@ -3035,17 +2970,14 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
|
||||
; GFX11-NEXT: s_add_i32 s9, s9, s10
|
||||
; GFX11-NEXT: s_mul_i32 s10, s4, s7
|
||||
; GFX11-NEXT: s_add_i32 s9, s9, s11
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_sub_i32 s11, s3, s9
|
||||
; GFX11-NEXT: s_sub_u32 s10, s2, s10
|
||||
; GFX11-NEXT: s_cselect_b32 s12, -1, 0
|
||||
; GFX11-NEXT: s_cmp_lg_u32 s12, 0
|
||||
; GFX11-NEXT: s_subb_u32 s11, s11, s5
|
||||
; GFX11-NEXT: s_sub_u32 s13, s10, s4
|
||||
; GFX11-NEXT: s_cselect_b32 s14, -1, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_cmp_lg_u32 s14, 0
|
||||
; GFX11-NEXT: s_subb_u32 s11, s11, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_cmp_ge_u32 s11, s5
|
||||
; GFX11-NEXT: s_cselect_b32 s14, -1, 0
|
||||
; GFX11-NEXT: s_cmp_ge_u32 s13, s4
|
||||
@ -3118,9 +3050,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
|
||||
; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7]
|
||||
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1250-NEXT: s_and_b64 s[4:5], s[4:5], 0xffffffff00000000
|
||||
; GFX1250-NEXT: s_cmp_lg_u64 s[4:5], 0
|
||||
; GFX1250-NEXT: s_cbranch_scc0 .LBB16_4
|
||||
; GFX1250-NEXT: ; %bb.1:
|
||||
; GFX1250-NEXT: s_cvt_f32_u32 s4, s6
|
||||
@ -3155,12 +3086,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
|
||||
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[4:5], s[12:13]
|
||||
; GFX1250-NEXT: s_add_co_u32 s8, s8, s12
|
||||
; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
|
||||
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX1250-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX1250-NEXT: s_add_co_ci_u32 s9, s9, s13
|
||||
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1250-NEXT: s_mul_u64 s[10:11], s[10:11], s[8:9]
|
||||
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1250-NEXT: s_mul_hi_u32 s13, s8, s11
|
||||
; GFX1250-NEXT: s_mul_i32 s12, s8, s11
|
||||
; GFX1250-NEXT: s_mul_hi_u32 s4, s8, s10
|
||||
@ -3175,19 +3103,17 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
|
||||
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1250-NEXT: s_add_nc_u64 s[10:11], s[4:5], s[10:11]
|
||||
; GFX1250-NEXT: s_add_co_u32 s8, s8, s10
|
||||
; GFX1250-NEXT: s_cselect_b32 s10, -1, 0
|
||||
; GFX1250-NEXT: s_mul_hi_u32 s4, s2, s8
|
||||
; GFX1250-NEXT: s_cmp_lg_u32 s10, 0
|
||||
; GFX1250-NEXT: s_mul_hi_u32 s12, s3, s8
|
||||
; GFX1250-NEXT: s_add_co_ci_u32 s10, s9, s11
|
||||
; GFX1250-NEXT: s_mul_i32 s11, s3, s8
|
||||
; GFX1250-NEXT: s_mul_hi_u32 s4, s2, s8
|
||||
; GFX1250-NEXT: s_mul_hi_u32 s11, s3, s8
|
||||
; GFX1250-NEXT: s_mul_i32 s12, s3, s8
|
||||
; GFX1250-NEXT: s_mul_hi_u32 s9, s2, s10
|
||||
; GFX1250-NEXT: s_mul_i32 s8, s2, s10
|
||||
; GFX1250-NEXT: s_mul_hi_u32 s13, s3, s10
|
||||
; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[8:9]
|
||||
; GFX1250-NEXT: s_mul_i32 s10, s3, s10
|
||||
; GFX1250-NEXT: s_add_co_u32 s4, s8, s11
|
||||
; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s12
|
||||
; GFX1250-NEXT: s_add_co_u32 s4, s8, s12
|
||||
; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s11
|
||||
; GFX1250-NEXT: s_add_co_ci_u32 s11, s13, 0
|
||||
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[10:11]
|
||||
@ -3202,10 +3128,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
|
||||
; GFX1250-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, s7
|
||||
; GFX1250-NEXT: s_sub_co_u32 s13, s4, s6
|
||||
; GFX1250-NEXT: s_cselect_b32 s14, -1, 0
|
||||
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX1250-NEXT: s_cmp_lg_u32 s14, 0
|
||||
; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, 0
|
||||
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1250-NEXT: s_cmp_ge_u32 s12, s7
|
||||
; GFX1250-NEXT: s_cselect_b32 s14, -1, 0
|
||||
; GFX1250-NEXT: s_cmp_ge_u32 s13, s6
|
||||
|
||||
@ -714,9 +714,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
|
||||
; VI-NEXT: s_lshl_b32 s2, s2, 8
|
||||
; VI-NEXT: s_or_b32 s2, s2, s3
|
||||
; VI-NEXT: s_lshl_b32 s3, s2, 16
|
||||
; VI-NEXT: s_and_b32 s2, s2, 0xffff
|
||||
; VI-NEXT: s_flbit_i32_b32 s3, s3
|
||||
; VI-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; VI-NEXT: s_and_b32 s2, s2, 0xffff
|
||||
; VI-NEXT: s_cselect_b32 s2, s3, 32
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
||||
@ -1491,7 +1491,6 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_lshr_b32 s4, s6, 16
|
||||
; SI-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; SI-NEXT: s_cbranch_scc0 .LBB14_4
|
||||
; SI-NEXT: ; %bb.1: ; %else
|
||||
; SI-NEXT: s_mov_b32 s11, 0xf000
|
||||
@ -1521,7 +1520,6 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_lshr_b32 s4, s6, 16
|
||||
; VI-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; VI-NEXT: s_cbranch_scc0 .LBB14_4
|
||||
; VI-NEXT: ; %bb.1: ; %else
|
||||
; VI-NEXT: s_mov_b32 s11, 0xf000
|
||||
|
||||
@ -14,7 +14,6 @@ define i32 @s_add_co_select_user() {
|
||||
; GFX7-NEXT: s_add_u32 s7, s6, s6
|
||||
; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GFX7-NEXT: s_or_b32 s4, s4, s5
|
||||
; GFX7-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX7-NEXT: s_addc_u32 s8, s6, 0
|
||||
; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec
|
||||
@ -31,8 +30,6 @@ define i32 @s_add_co_select_user() {
|
||||
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_add_u32 s7, s6, s6
|
||||
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
|
||||
; GFX9-NEXT: s_addc_u32 s8, s6, 0
|
||||
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
|
||||
@ -49,8 +46,6 @@ define i32 @s_add_co_select_user() {
|
||||
; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_add_u32 s5, s4, s4
|
||||
; GFX10-NEXT: s_cselect_b32 s6, -1, 0
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GFX10-NEXT: s_addc_u32 s6, s4, 0
|
||||
; GFX10-NEXT: s_cselect_b32 s7, -1, 0
|
||||
; GFX10-NEXT: s_and_b32 s7, s7, exec_lo
|
||||
@ -67,16 +62,13 @@ define i32 @s_add_co_select_user() {
|
||||
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_add_u32 s1, s0, s0
|
||||
; GFX11-NEXT: s_cselect_b32 s2, -1, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX11-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX11-NEXT: s_cselect_b32 s3, -1, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_and_b32 s3, s3, exec_lo
|
||||
; GFX11-NEXT: s_cselect_b32 s2, s2, 0
|
||||
; GFX11-NEXT: s_cmp_gt_u32 s0, 31
|
||||
; GFX11-NEXT: s_cselect_b32 s0, s1, s2
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
bb:
|
||||
@ -104,7 +96,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
|
||||
; GFX7-NEXT: s_add_u32 s0, s2, s2
|
||||
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX7-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX7-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX7-NEXT: s_addc_u32 s0, s2, 0
|
||||
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX7-NEXT: s_andn2_b64 vcc, exec, s[0:1]
|
||||
@ -125,12 +116,10 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
|
||||
;
|
||||
; GFX9-LABEL: s_add_co_br_user:
|
||||
; GFX9: ; %bb.0: ; %bb
|
||||
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0
|
||||
; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_add_u32 s0, s2, s2
|
||||
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: s_addc_u32 s0, s2, 0
|
||||
; GFX9-NEXT: s_add_u32 s1, s0, s0
|
||||
; GFX9-NEXT: s_addc_u32 s0, s0, 0
|
||||
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX9-NEXT: s_andn2_b64 vcc, exec, s[0:1]
|
||||
; GFX9-NEXT: s_cbranch_vccnz .LBB1_2
|
||||
@ -153,8 +142,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
|
||||
; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_add_u32 s1, s0, s0
|
||||
; GFX10-NEXT: s_cselect_b32 s1, -1, 0
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX10-NEXT: s_addc_u32 s0, s0, 0
|
||||
; GFX10-NEXT: s_cselect_b32 s0, -1, 0
|
||||
; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s0
|
||||
@ -178,11 +165,9 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
|
||||
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_add_u32 s1, s0, s0
|
||||
; GFX11-NEXT: s_cselect_b32 s1, -1, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX11-NEXT: s_addc_u32 s0, s0, 0
|
||||
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
|
||||
; GFX11-NEXT: s_cbranch_vccnz .LBB1_2
|
||||
; GFX11-NEXT: ; %bb.1: ; %bb0
|
||||
|
||||
@ -1117,7 +1117,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_and_b32 s3, s1, 0x1ff
|
||||
; SI-NEXT: s_or_b32 s0, s3, s0
|
||||
; SI-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
|
||||
; SI-NEXT: s_lshr_b32 s0, s1, 8
|
||||
@ -1169,7 +1168,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_and_b32 s3, s1, 0x1ff
|
||||
; VI-NEXT: s_or_b32 s0, s3, s0
|
||||
; VI-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
|
||||
; VI-NEXT: s_lshr_b32 s0, s1, 8
|
||||
@ -1217,7 +1215,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_and_b32 s3, s1, 0x1ff
|
||||
; GFX9-NEXT: s_or_b32 s0, s3, s0
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s1, 8
|
||||
@ -1264,11 +1261,9 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
|
||||
; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
|
||||
; GFX11-TRUE16: ; %bb.0:
|
||||
; GFX11-TRUE16-NEXT: s_and_b32 s3, s1, 0x1ff
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-TRUE16-NEXT: s_or_b32 s0, s3, s0
|
||||
; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX11-TRUE16-NEXT: s_cselect_b32 s0, -1, 0
|
||||
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
|
||||
; GFX11-TRUE16-NEXT: s_bfe_u32 s0, s1, 0xb0014
|
||||
; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 8
|
||||
@ -1320,11 +1315,9 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
|
||||
; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
|
||||
; GFX11-FAKE16: ; %bb.0:
|
||||
; GFX11-FAKE16-NEXT: s_and_b32 s3, s1, 0x1ff
|
||||
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-FAKE16-NEXT: s_or_b32 s0, s3, s0
|
||||
; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX11-FAKE16-NEXT: s_cselect_b32 s0, -1, 0
|
||||
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
|
||||
; GFX11-FAKE16-NEXT: s_bfe_u32 s0, s1, 0xb0014
|
||||
; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 8
|
||||
@ -4023,7 +4016,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
|
||||
; SI-NEXT: s_and_b32 s6, s4, 0xffe
|
||||
; SI-NEXT: s_and_b32 s4, s1, 0x1ff
|
||||
; SI-NEXT: s_or_b32 s0, s4, s0
|
||||
; SI-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; SI-NEXT: v_cvt_f16_f32_e32 v0, s5
|
||||
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
|
||||
@ -4066,7 +4058,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
|
||||
; SI-NEXT: s_and_b32 s5, s0, 0xffe
|
||||
; SI-NEXT: s_and_b32 s0, s3, 0x1ff
|
||||
; SI-NEXT: s_or_b32 s0, s0, s2
|
||||
; SI-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
|
||||
; SI-NEXT: v_readfirstlane_b32 s0, v2
|
||||
@ -4120,10 +4111,9 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
|
||||
; VI-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_lshr_b32 s5, s3, 8
|
||||
; VI-NEXT: s_and_b32 s6, s3, 0x1ff
|
||||
; VI-NEXT: s_and_b32 s5, s5, 0xffe
|
||||
; VI-NEXT: s_and_b32 s6, s3, 0x1ff
|
||||
; VI-NEXT: s_or_b32 s2, s6, s2
|
||||
; VI-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; VI-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
|
||||
; VI-NEXT: s_bfe_u32 s3, s3, 0xb0014
|
||||
@ -4163,7 +4153,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
|
||||
; VI-NEXT: s_and_b32 s7, s2, 0xffe
|
||||
; VI-NEXT: s_and_b32 s2, s1, 0x1ff
|
||||
; VI-NEXT: s_or_b32 s0, s2, s0
|
||||
; VI-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
|
||||
; VI-NEXT: s_bfe_u32 s1, s1, 0xb0014
|
||||
@ -4209,10 +4198,9 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
|
||||
; GFX9-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_lshr_b32 s5, s3, 8
|
||||
; GFX9-NEXT: s_and_b32 s6, s3, 0x1ff
|
||||
; GFX9-NEXT: s_and_b32 s5, s5, 0xffe
|
||||
; GFX9-NEXT: s_and_b32 s6, s3, 0x1ff
|
||||
; GFX9-NEXT: s_or_b32 s2, s6, s2
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
|
||||
; GFX9-NEXT: s_bfe_u32 s6, s3, 0xb0014
|
||||
@ -4254,7 +4242,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
|
||||
; GFX9-NEXT: s_and_b32 s6, s2, 0xffe
|
||||
; GFX9-NEXT: s_and_b32 s2, s1, 0x1ff
|
||||
; GFX9-NEXT: s_or_b32 s0, s2, s0
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
|
||||
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@ -4301,11 +4288,10 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
|
||||
;
|
||||
; GFX11-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_and_b32 s5, s3, 0x1ff
|
||||
; GFX11-NEXT: s_lshr_b32 s6, s3, 8
|
||||
; GFX11-NEXT: s_or_b32 s2, s5, s2
|
||||
; GFX11-NEXT: s_and_b32 s5, s6, 0xffe
|
||||
; GFX11-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX11-NEXT: s_lshr_b32 s5, s3, 8
|
||||
; GFX11-NEXT: s_and_b32 s6, s3, 0x1ff
|
||||
; GFX11-NEXT: s_and_b32 s5, s5, 0xffe
|
||||
; GFX11-NEXT: s_or_b32 s2, s6, s2
|
||||
; GFX11-NEXT: s_cselect_b32 s2, -1, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
|
||||
@ -4348,13 +4334,12 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
|
||||
; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f
|
||||
; GFX11-NEXT: s_cselect_b32 s2, s5, s6
|
||||
; GFX11-NEXT: s_lshr_b32 s3, s3, 16
|
||||
; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff
|
||||
; GFX11-NEXT: s_lshr_b32 s5, s1, 8
|
||||
; GFX11-NEXT: s_and_b32 s3, s3, 0x8000
|
||||
; GFX11-NEXT: s_or_b32 s0, s6, s0
|
||||
; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff
|
||||
; GFX11-NEXT: s_and_b32 s5, s5, 0xffe
|
||||
; GFX11-NEXT: s_or_b32 s2, s3, s2
|
||||
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX11-NEXT: s_or_b32 s0, s6, s0
|
||||
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
|
||||
|
||||
@ -599,10 +599,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
|
||||
; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
|
||||
; SI-GISEL-NEXT: s_and_b32 s6, s6, 0xffe
|
||||
; SI-GISEL-NEXT: s_or_b32 s4, s7, s4
|
||||
; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; SI-GISEL-NEXT: s_or_b32 s4, s6, s4
|
||||
; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
|
||||
; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9
|
||||
; SI-GISEL-NEXT: s_lshl_b32 s7, s3, 12
|
||||
@ -711,10 +709,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
|
||||
; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10
|
||||
; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
|
||||
; VI-GISEL-NEXT: s_or_b32 s2, s6, s2
|
||||
; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; VI-GISEL-NEXT: s_or_b32 s2, s5, s2
|
||||
; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4
|
||||
; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12
|
||||
@ -824,10 +820,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
|
||||
; GFX9-GISEL-NEXT: s_addk_i32 s4, 0xfc10
|
||||
; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
|
||||
; GFX9-GISEL-NEXT: s_or_b32 s2, s6, s2
|
||||
; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX9-GISEL-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX9-GISEL-NEXT: s_or_b32 s2, s5, s2
|
||||
; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX9-GISEL-NEXT: s_sub_i32 s7, 1, s4
|
||||
; GFX9-GISEL-NEXT: s_lshl_b32 s6, s4, 12
|
||||
@ -937,10 +931,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
|
||||
; GFX950-GISEL-NEXT: s_addk_i32 s4, 0xfc10
|
||||
; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
|
||||
; GFX950-GISEL-NEXT: s_or_b32 s2, s6, s2
|
||||
; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX950-GISEL-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX950-GISEL-NEXT: s_or_b32 s2, s5, s2
|
||||
; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX950-GISEL-NEXT: s_sub_i32 s7, 1, s4
|
||||
; GFX950-GISEL-NEXT: s_lshl_b32 s6, s4, 12
|
||||
@ -1118,17 +1110,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000
|
||||
@ -1175,17 +1165,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000
|
||||
@ -1366,17 +1354,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s4, 0xfc10
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s6, 1, s4
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000
|
||||
@ -1423,17 +1409,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s4, 0xfc10
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s6, 1, s4
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000
|
||||
@ -2154,10 +2138,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
|
||||
; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
|
||||
; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe
|
||||
; SI-GISEL-NEXT: s_or_b32 s4, s9, s4
|
||||
; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; SI-GISEL-NEXT: s_or_b32 s4, s8, s4
|
||||
; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; SI-GISEL-NEXT: s_lshl_b32 s8, s8, 9
|
||||
; SI-GISEL-NEXT: s_lshl_b32 s9, s3, 12
|
||||
@ -2193,12 +2175,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
|
||||
; SI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
|
||||
; SI-GISEL-NEXT: s_addk_i32 s5, 0xfc10
|
||||
; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe
|
||||
; SI-GISEL-NEXT: s_or_b32 s6, s9, s6
|
||||
; SI-GISEL-NEXT: s_or_b32 s3, s4, s3
|
||||
; SI-GISEL-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; SI-GISEL-NEXT: s_or_b32 s4, s9, s6
|
||||
; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; SI-GISEL-NEXT: s_or_b32 s4, s8, s4
|
||||
; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
|
||||
; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9
|
||||
; SI-GISEL-NEXT: s_lshl_b32 s8, s5, 12
|
||||
@ -2355,10 +2335,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
|
||||
; VI-GISEL-NEXT: s_addk_i32 s2, 0xfc10
|
||||
; VI-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
|
||||
; VI-GISEL-NEXT: s_or_b32 s4, s8, s4
|
||||
; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; VI-GISEL-NEXT: s_or_b32 s3, s3, s4
|
||||
; VI-GISEL-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; VI-GISEL-NEXT: s_sub_i32 s9, 1, s2
|
||||
; VI-GISEL-NEXT: s_lshl_b32 s8, s2, 12
|
||||
@ -2392,14 +2370,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
|
||||
; VI-GISEL-NEXT: s_or_b32 s2, s3, s2
|
||||
; VI-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
|
||||
; VI-GISEL-NEXT: s_lshr_b32 s4, s7, 8
|
||||
; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
|
||||
; VI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
|
||||
; VI-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
|
||||
; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
|
||||
; VI-GISEL-NEXT: s_or_b32 s5, s5, s6
|
||||
; VI-GISEL-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; VI-GISEL-NEXT: s_or_b32 s4, s4, s5
|
||||
; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; VI-GISEL-NEXT: s_sub_i32 s8, 1, s3
|
||||
; VI-GISEL-NEXT: s_lshl_b32 s6, s3, 12
|
||||
@ -2555,10 +2531,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
|
||||
; GFX9-GISEL-NEXT: s_addk_i32 s2, 0xfc10
|
||||
; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
|
||||
; GFX9-GISEL-NEXT: s_or_b32 s4, s8, s4
|
||||
; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GFX9-GISEL-NEXT: s_or_b32 s3, s3, s4
|
||||
; GFX9-GISEL-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GFX9-GISEL-NEXT: s_sub_i32 s9, 1, s2
|
||||
; GFX9-GISEL-NEXT: s_lshl_b32 s8, s2, 12
|
||||
@ -2592,14 +2566,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
|
||||
; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
|
||||
; GFX9-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
|
||||
; GFX9-GISEL-NEXT: s_lshr_b32 s4, s7, 8
|
||||
; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
|
||||
; GFX9-GISEL-NEXT: s_addk_i32 s3, 0xfc10
|
||||
; GFX9-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
|
||||
; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
|
||||
; GFX9-GISEL-NEXT: s_or_b32 s5, s5, s6
|
||||
; GFX9-GISEL-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX9-GISEL-NEXT: s_or_b32 s4, s4, s5
|
||||
; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX9-GISEL-NEXT: s_sub_i32 s8, 1, s3
|
||||
; GFX9-GISEL-NEXT: s_lshl_b32 s6, s3, 12
|
||||
@ -2752,10 +2724,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
|
||||
; GFX950-GISEL-NEXT: s_addk_i32 s2, 0xfc10
|
||||
; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
|
||||
; GFX950-GISEL-NEXT: s_or_b32 s4, s8, s4
|
||||
; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GFX950-GISEL-NEXT: s_or_b32 s3, s3, s4
|
||||
; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GFX950-GISEL-NEXT: s_sub_i32 s9, 1, s2
|
||||
; GFX950-GISEL-NEXT: s_lshl_b32 s8, s2, 12
|
||||
@ -2789,14 +2759,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
|
||||
; GFX950-GISEL-NEXT: s_or_b32 s2, s3, s2
|
||||
; GFX950-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
|
||||
; GFX950-GISEL-NEXT: s_lshr_b32 s4, s7, 8
|
||||
; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
|
||||
; GFX950-GISEL-NEXT: s_addk_i32 s3, 0xfc10
|
||||
; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
|
||||
; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
|
||||
; GFX950-GISEL-NEXT: s_or_b32 s5, s5, s6
|
||||
; GFX950-GISEL-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX950-GISEL-NEXT: s_or_b32 s4, s4, s5
|
||||
; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX950-GISEL-NEXT: s_sub_i32 s8, 1, s3
|
||||
; GFX950-GISEL-NEXT: s_lshl_b32 s6, s3, 12
|
||||
@ -3073,17 +3041,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s2, 0xfc10
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s8, 1, s2
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000
|
||||
@ -3115,19 +3081,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s8, s6
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4
|
||||
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000
|
||||
@ -3176,17 +3140,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s2, 0xfc10
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s8, 1, s2
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000
|
||||
@ -3218,19 +3180,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s8, s6
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4
|
||||
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000
|
||||
@ -3511,17 +3471,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s2, 0xfc10
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s8, 1, s2
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000
|
||||
@ -3553,19 +3511,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s2, 0x40f
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s4, 0xfc10
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s8, s6
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s6, 1, s4
|
||||
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000
|
||||
@ -3614,17 +3570,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s2, 0xfc10
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s8, 1, s2
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000
|
||||
@ -3656,19 +3610,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s2, 0x40f
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s4, 0xfc10
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s8, s6
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s6, 1, s4
|
||||
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000
|
||||
|
||||
@ -182,7 +182,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
|
||||
; SI-NEXT: s_and_b32 s1, s7, 0x1ff
|
||||
; SI-NEXT: s_and_b32 s8, s0, 0xffe
|
||||
; SI-NEXT: s_or_b32 s0, s1, s6
|
||||
; SI-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014
|
||||
@ -237,7 +236,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
|
||||
; VI-SDAG-NEXT: s_and_b32 s8, s4, 0xffe
|
||||
; VI-SDAG-NEXT: s_and_b32 s4, s7, 0x1ff
|
||||
; VI-SDAG-NEXT: s_or_b32 s4, s4, s6
|
||||
; VI-SDAG-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; VI-SDAG-NEXT: s_mov_b32 s1, s5
|
||||
; VI-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
|
||||
@ -290,10 +288,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
|
||||
; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10
|
||||
; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
|
||||
; VI-GISEL-NEXT: s_or_b32 s2, s6, s2
|
||||
; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; VI-GISEL-NEXT: s_or_b32 s2, s5, s2
|
||||
; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4
|
||||
; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12
|
||||
@ -335,11 +331,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
|
||||
; GFX10-SDAG: ; %bb.0:
|
||||
; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff
|
||||
; GFX10-SDAG-NEXT: s_lshr_b32 s5, s3, 8
|
||||
; GFX10-SDAG-NEXT: s_or_b32 s2, s4, s2
|
||||
; GFX10-SDAG-NEXT: s_and_b32 s4, s5, 0xffe
|
||||
; GFX10-SDAG-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX10-SDAG-NEXT: s_lshr_b32 s4, s3, 8
|
||||
; GFX10-SDAG-NEXT: s_and_b32 s5, s3, 0x1ff
|
||||
; GFX10-SDAG-NEXT: s_and_b32 s4, s4, 0xffe
|
||||
; GFX10-SDAG-NEXT: s_or_b32 s2, s5, s2
|
||||
; GFX10-SDAG-NEXT: s_cselect_b32 s2, -1, 0
|
||||
; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
|
||||
; GFX10-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014
|
||||
@ -387,16 +382,14 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
|
||||
; GFX10-GISEL: ; %bb.0:
|
||||
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
|
||||
; GFX10-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
|
||||
; GFX10-GISEL-NEXT: s_lshr_b32 s5, s3, 8
|
||||
; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2
|
||||
; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
|
||||
; GFX10-GISEL-NEXT: s_addk_i32 s4, 0xfc10
|
||||
; GFX10-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
|
||||
; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2
|
||||
; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX10-GISEL-NEXT: s_or_b32 s2, s5, s2
|
||||
; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX10-GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX10-GISEL-NEXT: s_sub_i32 s6, 1, s4
|
||||
; GFX10-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
|
||||
@ -438,11 +431,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
|
||||
; GFX11-SDAG: ; %bb.0:
|
||||
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff
|
||||
; GFX11-SDAG-NEXT: s_lshr_b32 s5, s3, 8
|
||||
; GFX11-SDAG-NEXT: s_or_b32 s2, s4, s2
|
||||
; GFX11-SDAG-NEXT: s_and_b32 s4, s5, 0xffe
|
||||
; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX11-SDAG-NEXT: s_lshr_b32 s4, s3, 8
|
||||
; GFX11-SDAG-NEXT: s_and_b32 s5, s3, 0x1ff
|
||||
; GFX11-SDAG-NEXT: s_and_b32 s4, s4, 0xffe
|
||||
; GFX11-SDAG-NEXT: s_or_b32 s2, s5, s2
|
||||
; GFX11-SDAG-NEXT: s_cselect_b32 s2, -1, 0
|
||||
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
|
||||
@ -498,17 +490,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
|
||||
; GFX11-GISEL: ; %bb.0:
|
||||
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
|
||||
; GFX11-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
|
||||
; GFX11-GISEL-NEXT: s_lshr_b32 s5, s3, 8
|
||||
; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2
|
||||
; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
|
||||
; GFX11-GISEL-NEXT: s_addk_i32 s4, 0xfc10
|
||||
; GFX11-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
|
||||
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2
|
||||
; GFX11-GISEL-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-GISEL-NEXT: s_or_b32 s2, s5, s2
|
||||
; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX11-GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX11-GISEL-NEXT: s_sub_i32 s6, 1, s4
|
||||
; GFX11-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
|
||||
|
||||
@ -472,7 +472,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -536,11 +535,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
|
||||
; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
|
||||
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
|
||||
; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -606,7 +604,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
|
||||
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
|
||||
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -660,12 +657,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
|
||||
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
|
||||
; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -710,9 +706,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
|
||||
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
|
||||
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -1690,7 +1685,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -1754,11 +1748,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
|
||||
; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
|
||||
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
|
||||
; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -1824,7 +1817,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
|
||||
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -1878,12 +1870,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
|
||||
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
|
||||
; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -1928,9 +1919,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
|
||||
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
|
||||
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -2968,7 +2958,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -3032,11 +3021,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
|
||||
; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
|
||||
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
|
||||
; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -3102,7 +3090,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
|
||||
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -3156,12 +3143,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
|
||||
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
|
||||
; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -3206,9 +3192,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
|
||||
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
|
||||
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -3742,7 +3727,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -3806,11 +3790,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
|
||||
; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop
|
||||
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
|
||||
; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -3876,7 +3859,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
|
||||
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1
|
||||
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -3930,12 +3912,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
|
||||
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
|
||||
; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -3980,9 +3961,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
|
||||
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
|
||||
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -5019,7 +4999,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB8_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -5083,11 +5062,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
|
||||
; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
|
||||
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
|
||||
; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -5153,7 +5131,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
|
||||
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1
|
||||
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -5207,12 +5184,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
|
||||
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
|
||||
; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -5270,9 +5246,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
|
||||
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
|
||||
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -6284,7 +6259,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
|
||||
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB10_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -6354,7 +6328,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
|
||||
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -6424,7 +6397,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1
|
||||
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -6485,8 +6457,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
|
||||
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -6550,7 +6520,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -7717,7 +7686,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
|
||||
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB12_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -7787,7 +7755,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
|
||||
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -7857,7 +7824,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1
|
||||
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -7918,8 +7884,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
|
||||
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -7983,7 +7947,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -9150,7 +9113,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
|
||||
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB14_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -9220,7 +9182,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
|
||||
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -9290,7 +9251,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1
|
||||
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -9351,8 +9311,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
|
||||
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -9416,7 +9374,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -10065,7 +10022,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
|
||||
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB15_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -10135,7 +10091,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
|
||||
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -10205,7 +10160,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1
|
||||
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -10266,8 +10220,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
|
||||
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -10331,7 +10283,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -11498,7 +11449,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
|
||||
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB17_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -11568,7 +11518,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
|
||||
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -11638,7 +11587,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1
|
||||
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -11699,8 +11647,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
|
||||
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -11764,7 +11710,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
|
||||
@ -381,13 +381,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
|
||||
; GFX9-NEXT: .LBB1_1: ; %ComputeLoop
|
||||
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX9-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
|
||||
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
|
||||
; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
|
||||
; GFX9-NEXT: v_max_f32_e32 v2, v2, v1
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -457,7 +456,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
|
||||
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -513,7 +511,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
|
||||
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
|
||||
; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
|
||||
@ -562,8 +559,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
|
||||
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -610,11 +606,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
|
||||
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
|
||||
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
|
||||
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -1420,13 +1414,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
|
||||
; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
|
||||
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX9-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
|
||||
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
|
||||
; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
|
||||
; GFX9-NEXT: v_max_f32_e32 v2, v2, v1
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -1496,7 +1489,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
|
||||
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -1552,7 +1544,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
|
||||
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
|
||||
; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
@ -1601,8 +1592,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
|
||||
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -1649,11 +1639,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
|
||||
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
|
||||
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
|
||||
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -2459,13 +2447,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
|
||||
; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
|
||||
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX9-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
|
||||
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
|
||||
; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
|
||||
; GFX9-NEXT: v_max_f32_e32 v2, v2, v1
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -2535,7 +2522,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
|
||||
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -2591,7 +2577,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
|
||||
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
|
||||
; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
@ -2640,8 +2625,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
|
||||
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -2688,11 +2672,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
|
||||
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
|
||||
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
|
||||
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -3591,7 +3573,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
|
||||
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -3665,7 +3646,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
|
||||
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -3724,7 +3704,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -3774,8 +3753,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
|
||||
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -3841,10 +3819,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
|
||||
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
|
||||
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -4859,7 +4836,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
|
||||
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB9_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -4933,7 +4909,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
|
||||
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -4992,7 +4967,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1
|
||||
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -5042,8 +5016,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
|
||||
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -5109,10 +5082,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
|
||||
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
|
||||
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -6127,7 +6099,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
|
||||
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB11_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -6201,7 +6172,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
|
||||
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -6260,7 +6230,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1
|
||||
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -6310,8 +6279,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
|
||||
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -6377,10 +6345,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
|
||||
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
|
||||
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
|
||||
@ -381,13 +381,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
|
||||
; GFX9-NEXT: .LBB1_1: ; %ComputeLoop
|
||||
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX9-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
|
||||
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
|
||||
; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: v_min_f32_e32 v2, v1, v2
|
||||
; GFX9-NEXT: v_min_f32_e32 v2, v2, v1
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -457,7 +456,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
|
||||
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -513,7 +511,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
|
||||
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
|
||||
; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
|
||||
@ -562,8 +559,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
|
||||
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -610,11 +606,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
|
||||
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
|
||||
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
|
||||
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -1420,13 +1414,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
|
||||
; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
|
||||
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX9-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
|
||||
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
|
||||
; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: v_min_f32_e32 v2, v1, v2
|
||||
; GFX9-NEXT: v_min_f32_e32 v2, v2, v1
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -1496,7 +1489,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
|
||||
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -1552,7 +1544,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
|
||||
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
|
||||
; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
@ -1601,8 +1592,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
|
||||
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -1649,11 +1639,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
|
||||
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
|
||||
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
|
||||
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -2459,13 +2447,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
|
||||
; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
|
||||
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX9-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
|
||||
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
|
||||
; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: v_min_f32_e32 v2, v1, v2
|
||||
; GFX9-NEXT: v_min_f32_e32 v2, v2, v1
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -2535,7 +2522,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
|
||||
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -2591,7 +2577,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
|
||||
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
|
||||
; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
@ -2640,8 +2625,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
|
||||
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -2688,11 +2672,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
|
||||
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
|
||||
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
|
||||
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -3591,7 +3573,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
|
||||
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -3665,7 +3646,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
|
||||
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -3724,7 +3704,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -3774,8 +3753,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
|
||||
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -3841,10 +3819,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
|
||||
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
|
||||
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -4859,7 +4836,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
|
||||
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB9_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -4933,7 +4909,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
|
||||
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -4992,7 +4967,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1
|
||||
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -5042,8 +5016,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
|
||||
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -5109,10 +5082,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
|
||||
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
|
||||
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -6127,7 +6099,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
|
||||
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB11_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -6201,7 +6172,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
|
||||
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -6260,7 +6230,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1
|
||||
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -6310,8 +6279,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
|
||||
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -6377,10 +6345,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
|
||||
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
|
||||
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
|
||||
@ -532,7 +532,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -596,11 +595,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
|
||||
; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
|
||||
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
|
||||
; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -666,7 +664,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
|
||||
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
|
||||
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -720,12 +717,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
|
||||
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
|
||||
; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -783,9 +779,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
|
||||
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
|
||||
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -1862,7 +1857,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -1926,11 +1920,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
|
||||
; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
|
||||
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
|
||||
; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -1996,7 +1989,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
|
||||
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -2050,12 +2042,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
|
||||
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
|
||||
; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -2113,9 +2104,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
|
||||
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
|
||||
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -3192,7 +3182,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -3256,11 +3245,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
|
||||
; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
|
||||
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
|
||||
; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -3326,7 +3314,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
|
||||
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -3380,12 +3367,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
|
||||
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
|
||||
; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -3443,9 +3429,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
|
||||
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
|
||||
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -4018,7 +4003,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -4082,11 +4066,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
|
||||
; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop
|
||||
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
|
||||
; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -4152,7 +4135,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
|
||||
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1
|
||||
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -4206,12 +4188,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
|
||||
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
|
||||
; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -4269,9 +4250,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
|
||||
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
|
||||
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -5347,7 +5327,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
|
||||
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB8_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -5411,11 +5390,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
|
||||
; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
|
||||
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
|
||||
; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -5481,7 +5459,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
|
||||
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1
|
||||
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -5535,12 +5512,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
|
||||
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
|
||||
; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
|
||||
; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -5598,9 +5574,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
|
||||
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
|
||||
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
|
||||
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
@ -6612,7 +6587,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
|
||||
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB10_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -6682,7 +6656,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
|
||||
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -6752,7 +6725,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1
|
||||
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -6813,8 +6785,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
|
||||
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -6878,7 +6848,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -8044,7 +8013,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
|
||||
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB12_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -8114,7 +8082,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
|
||||
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -8184,7 +8151,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1
|
||||
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -8245,8 +8211,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
|
||||
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -8310,7 +8274,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -9477,7 +9440,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
|
||||
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB14_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -9547,7 +9509,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
|
||||
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -9617,7 +9578,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1
|
||||
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -9678,8 +9638,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
|
||||
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -9743,7 +9701,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -10392,7 +10349,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
|
||||
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB15_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -10462,7 +10418,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
|
||||
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -10532,7 +10487,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1
|
||||
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -10593,8 +10547,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
|
||||
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -10658,7 +10610,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -11824,7 +11775,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
|
||||
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: s_cbranch_scc1 .LBB17_1
|
||||
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -11894,7 +11844,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
|
||||
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1
|
||||
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -11964,7 +11913,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
|
||||
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1
|
||||
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -12025,8 +11973,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
|
||||
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
|
||||
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1
|
||||
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
@ -12090,7 +12036,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
|
||||
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
|
||||
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1
|
||||
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
|
||||
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
|
||||
|
||||
@ -136,19 +136,17 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
|
||||
; GFX11-NEXT: .LBB2_6: ; %bb18
|
||||
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s13, v0
|
||||
; GFX11-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX11-NEXT: s_cselect_b32 s1, -1, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
|
||||
; GFX11-NEXT: s_and_b32 s1, s8, s1
|
||||
; GFX11-NEXT: s_and_b32 s1, s1, exec_lo
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: s_cselect_b32 s13, -1, 0
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13
|
||||
; GFX11-NEXT: s_and_b32 s13, s8, s13
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: s_and_b32 s13, s13, exec_lo
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s19, v2
|
||||
; GFX11-NEXT: s_cselect_b32 s1, s19, s13
|
||||
; GFX11-NEXT: s_and_b32 s13, 0xffff, s0
|
||||
; GFX11-NEXT: s_cselect_b32 s1, s19, s1
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX11-NEXT: s_cmp_lg_u32 s13, 0
|
||||
; GFX11-NEXT: s_and_b32 s13, 0xffff, s0
|
||||
; GFX11-NEXT: s_cselect_b32 s13, -1, 0
|
||||
; GFX11-NEXT: s_and_b32 s20, s9, exec_lo
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13
|
||||
|
||||
@ -8265,12 +8265,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
|
||||
; GFX12-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_readlane_b32 s6, v1, s3
|
||||
; GFX12-NEXT: s_lshl_b32 s7, 1, s3
|
||||
; GFX12-NEXT: v_writelane_b32 v0, s0, s3
|
||||
; GFX12-NEXT: s_lshl_b32 s3, 1, s3
|
||||
; GFX12-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12-NEXT: s_and_not1_b32 s1, s1, s7
|
||||
; GFX12-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX12-NEXT: s_and_not1_b32 s1, s1, s3
|
||||
; GFX12-NEXT: s_add_f32 s0, s0, s6
|
||||
; GFX12-NEXT: s_cbranch_scc1 .LBB28_5
|
||||
; GFX12-NEXT: ; %bb.6: ; %ComputeEnd
|
||||
@ -8351,14 +8349,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
|
||||
; GFX942-NEXT: .LBB28_5: ; %ComputeLoop
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX942-NEXT: v_readfirstlane_b32 s8, v1
|
||||
; GFX942-NEXT: v_readlane_b32 s9, v2, s3
|
||||
; GFX942-NEXT: v_readfirstlane_b32 s6, v1
|
||||
; GFX942-NEXT: s_mov_b32 m0, s3
|
||||
; GFX942-NEXT: v_readlane_b32 s8, v2, s3
|
||||
; GFX942-NEXT: v_writelane_b32 v0, s6, m0
|
||||
; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX942-NEXT: v_writelane_b32 v0, s8, m0
|
||||
; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX942-NEXT: v_add_f32_e32 v1, s9, v1
|
||||
; GFX942-NEXT: v_add_f32_e32 v1, s8, v1
|
||||
; GFX942-NEXT: s_cbranch_scc1 .LBB28_5
|
||||
; GFX942-NEXT: ; %bb.6: ; %ComputeEnd
|
||||
; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
|
||||
@ -8440,15 +8437,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
|
||||
; GFX11-NEXT: .LBB28_5: ; %ComputeLoop
|
||||
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-NEXT: s_ctz_i32_b32 s1, s0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s3, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_readlane_b32 s6, v2, s1
|
||||
; GFX11-NEXT: s_lshl_b32 s7, 1, s1
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_and_not1_b32 s0, s0, s7
|
||||
; GFX11-NEXT: v_writelane_b32 v0, s3, s1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_add_f32_e32 v1, s6, v1
|
||||
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX11-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX11-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX11-NEXT: s_cbranch_scc1 .LBB28_5
|
||||
; GFX11-NEXT: ; %bb.6: ; %ComputeEnd
|
||||
; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
|
||||
@ -8528,11 +8524,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
|
||||
; GFX10-NEXT: s_ff1_i32_b32 s1, s0
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s3, v1
|
||||
; GFX10-NEXT: v_readlane_b32 s6, v2, s1
|
||||
; GFX10-NEXT: s_lshl_b32 s7, 1, s1
|
||||
; GFX10-NEXT: s_andn2_b32 s0, s0, s7
|
||||
; GFX10-NEXT: v_writelane_b32 v0, s3, s1
|
||||
; GFX10-NEXT: v_add_f32_e32 v1, s6, v1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX10-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX10-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX10-NEXT: s_cbranch_scc1 .LBB28_5
|
||||
; GFX10-NEXT: ; %bb.6: ; %ComputeEnd
|
||||
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
|
||||
@ -8609,14 +8604,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
|
||||
; GFX90A-NEXT: .LBB28_5: ; %ComputeLoop
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX90A-NEXT: v_readfirstlane_b32 s8, v1
|
||||
; GFX90A-NEXT: v_readlane_b32 s9, v2, s3
|
||||
; GFX90A-NEXT: v_readfirstlane_b32 s6, v1
|
||||
; GFX90A-NEXT: s_mov_b32 m0, s3
|
||||
; GFX90A-NEXT: v_readlane_b32 s8, v2, s3
|
||||
; GFX90A-NEXT: v_writelane_b32 v0, s6, m0
|
||||
; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX90A-NEXT: v_writelane_b32 v0, s8, m0
|
||||
; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1
|
||||
; GFX90A-NEXT: v_add_f32_e32 v1, s8, v1
|
||||
; GFX90A-NEXT: s_cbranch_scc1 .LBB28_5
|
||||
; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd
|
||||
; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
|
||||
@ -8692,14 +8686,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
|
||||
; GFX908-NEXT: .LBB28_5: ; %ComputeLoop
|
||||
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX908-NEXT: v_readfirstlane_b32 s8, v1
|
||||
; GFX908-NEXT: v_readlane_b32 s9, v2, s3
|
||||
; GFX908-NEXT: v_readfirstlane_b32 s6, v1
|
||||
; GFX908-NEXT: s_mov_b32 m0, s3
|
||||
; GFX908-NEXT: v_readlane_b32 s8, v2, s3
|
||||
; GFX908-NEXT: v_writelane_b32 v0, s6, m0
|
||||
; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX908-NEXT: v_writelane_b32 v0, s8, m0
|
||||
; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX908-NEXT: v_add_f32_e32 v1, s9, v1
|
||||
; GFX908-NEXT: v_add_f32_e32 v1, s8, v1
|
||||
; GFX908-NEXT: s_cbranch_scc1 .LBB28_5
|
||||
; GFX908-NEXT: ; %bb.6: ; %ComputeEnd
|
||||
; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
|
||||
@ -8776,14 +8769,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
|
||||
; GFX8-NEXT: .LBB28_5: ; %ComputeLoop
|
||||
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s8, v1
|
||||
; GFX8-NEXT: v_readlane_b32 s9, v2, s3
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s6, v1
|
||||
; GFX8-NEXT: s_mov_b32 m0, s3
|
||||
; GFX8-NEXT: v_readlane_b32 s8, v2, s3
|
||||
; GFX8-NEXT: v_writelane_b32 v0, s6, m0
|
||||
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX8-NEXT: v_writelane_b32 v0, s8, m0
|
||||
; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX8-NEXT: v_add_f32_e32 v1, s9, v1
|
||||
; GFX8-NEXT: v_add_f32_e32 v1, s8, v1
|
||||
; GFX8-NEXT: s_cbranch_scc1 .LBB28_5
|
||||
; GFX8-NEXT: ; %bb.6: ; %ComputeEnd
|
||||
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
|
||||
@ -9130,12 +9122,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
|
||||
; GFX12-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_readlane_b32 s6, v1, s3
|
||||
; GFX12-NEXT: s_lshl_b32 s7, 1, s3
|
||||
; GFX12-NEXT: v_writelane_b32 v0, s0, s3
|
||||
; GFX12-NEXT: s_lshl_b32 s3, 1, s3
|
||||
; GFX12-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12-NEXT: s_and_not1_b32 s1, s1, s7
|
||||
; GFX12-NEXT: s_wait_alu 0xfffe
|
||||
; GFX12-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX12-NEXT: s_and_not1_b32 s1, s1, s3
|
||||
; GFX12-NEXT: s_add_f32 s0, s0, s6
|
||||
; GFX12-NEXT: s_cbranch_scc1 .LBB29_5
|
||||
; GFX12-NEXT: ; %bb.6: ; %ComputeEnd
|
||||
@ -9212,14 +9202,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
|
||||
; GFX942-NEXT: .LBB29_5: ; %ComputeLoop
|
||||
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX942-NEXT: v_readfirstlane_b32 s8, v1
|
||||
; GFX942-NEXT: v_readlane_b32 s9, v2, s3
|
||||
; GFX942-NEXT: v_readfirstlane_b32 s6, v1
|
||||
; GFX942-NEXT: s_mov_b32 m0, s3
|
||||
; GFX942-NEXT: v_readlane_b32 s8, v2, s3
|
||||
; GFX942-NEXT: v_writelane_b32 v0, s6, m0
|
||||
; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX942-NEXT: v_writelane_b32 v0, s8, m0
|
||||
; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX942-NEXT: v_add_f32_e32 v1, s9, v1
|
||||
; GFX942-NEXT: v_add_f32_e32 v1, s8, v1
|
||||
; GFX942-NEXT: s_cbranch_scc1 .LBB29_5
|
||||
; GFX942-NEXT: ; %bb.6: ; %ComputeEnd
|
||||
; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
|
||||
@ -9296,15 +9285,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
|
||||
; GFX11-NEXT: .LBB29_5: ; %ComputeLoop
|
||||
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-NEXT: s_ctz_i32_b32 s1, s0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s3, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_readlane_b32 s6, v2, s1
|
||||
; GFX11-NEXT: s_lshl_b32 s7, 1, s1
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_and_not1_b32 s0, s0, s7
|
||||
; GFX11-NEXT: v_writelane_b32 v0, s3, s1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_add_f32_e32 v1, s6, v1
|
||||
; GFX11-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX11-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX11-NEXT: s_and_not1_b32 s0, s0, s1
|
||||
; GFX11-NEXT: s_cbranch_scc1 .LBB29_5
|
||||
; GFX11-NEXT: ; %bb.6: ; %ComputeEnd
|
||||
; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
|
||||
@ -9377,11 +9365,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
|
||||
; GFX10-NEXT: s_ff1_i32_b32 s1, s0
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s3, v1
|
||||
; GFX10-NEXT: v_readlane_b32 s6, v2, s1
|
||||
; GFX10-NEXT: s_lshl_b32 s7, 1, s1
|
||||
; GFX10-NEXT: s_andn2_b32 s0, s0, s7
|
||||
; GFX10-NEXT: v_writelane_b32 v0, s3, s1
|
||||
; GFX10-NEXT: v_add_f32_e32 v1, s6, v1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX10-NEXT: s_lshl_b32 s1, 1, s1
|
||||
; GFX10-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; GFX10-NEXT: s_cbranch_scc1 .LBB29_5
|
||||
; GFX10-NEXT: ; %bb.6: ; %ComputeEnd
|
||||
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
|
||||
@ -9453,14 +9440,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
|
||||
; GFX90A-NEXT: .LBB29_5: ; %ComputeLoop
|
||||
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX90A-NEXT: v_readfirstlane_b32 s8, v1
|
||||
; GFX90A-NEXT: v_readlane_b32 s9, v2, s3
|
||||
; GFX90A-NEXT: v_readfirstlane_b32 s6, v1
|
||||
; GFX90A-NEXT: s_mov_b32 m0, s3
|
||||
; GFX90A-NEXT: v_readlane_b32 s8, v2, s3
|
||||
; GFX90A-NEXT: v_writelane_b32 v0, s6, m0
|
||||
; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX90A-NEXT: v_writelane_b32 v0, s8, m0
|
||||
; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1
|
||||
; GFX90A-NEXT: v_add_f32_e32 v1, s8, v1
|
||||
; GFX90A-NEXT: s_cbranch_scc1 .LBB29_5
|
||||
; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd
|
||||
; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
|
||||
@ -9533,14 +9519,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
|
||||
; GFX908-NEXT: .LBB29_5: ; %ComputeLoop
|
||||
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX908-NEXT: v_readfirstlane_b32 s8, v1
|
||||
; GFX908-NEXT: v_readlane_b32 s9, v2, s3
|
||||
; GFX908-NEXT: v_readfirstlane_b32 s6, v1
|
||||
; GFX908-NEXT: s_mov_b32 m0, s3
|
||||
; GFX908-NEXT: v_readlane_b32 s8, v2, s3
|
||||
; GFX908-NEXT: v_writelane_b32 v0, s6, m0
|
||||
; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX908-NEXT: v_writelane_b32 v0, s8, m0
|
||||
; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX908-NEXT: v_add_f32_e32 v1, s9, v1
|
||||
; GFX908-NEXT: v_add_f32_e32 v1, s8, v1
|
||||
; GFX908-NEXT: s_cbranch_scc1 .LBB29_5
|
||||
; GFX908-NEXT: ; %bb.6: ; %ComputeEnd
|
||||
; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
|
||||
@ -9614,14 +9599,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
|
||||
; GFX8-NEXT: .LBB29_5: ; %ComputeLoop
|
||||
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s8, v1
|
||||
; GFX8-NEXT: v_readlane_b32 s9, v2, s3
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s6, v1
|
||||
; GFX8-NEXT: s_mov_b32 m0, s3
|
||||
; GFX8-NEXT: v_readlane_b32 s8, v2, s3
|
||||
; GFX8-NEXT: v_writelane_b32 v0, s6, m0
|
||||
; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
|
||||
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX8-NEXT: v_writelane_b32 v0, s8, m0
|
||||
; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX8-NEXT: v_add_f32_e32 v1, s9, v1
|
||||
; GFX8-NEXT: v_add_f32_e32 v1, s8, v1
|
||||
; GFX8-NEXT: s_cbranch_scc1 .LBB29_5
|
||||
; GFX8-NEXT: ; %bb.6: ; %ComputeEnd
|
||||
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
|
||||
|
||||
@ -388,9 +388,8 @@ body: |
|
||||
; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
|
||||
; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY]], implicit-def $scc
|
||||
; GCN-NEXT: S_NOP 0, implicit killed $scc
|
||||
; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc
|
||||
; GCN-NEXT: S_BITCMP1_B32 killed [[COPY]], 0, implicit-def $scc
|
||||
; GCN-NEXT: S_NOP 0, implicit $scc
|
||||
; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
|
||||
; GCN-NEXT: S_BRANCH %bb.1
|
||||
; GCN-NEXT: {{ $}}
|
||||
@ -416,6 +415,80 @@ body: |
|
||||
bb.2:
|
||||
S_ENDPGM 0
|
||||
|
||||
...
|
||||
---
|
||||
name: xor_1_cmp_lg_0_killed_scc
|
||||
body: |
|
||||
; GCN-LABEL: name: xor_1_cmp_lg_0_killed_scc
|
||||
; GCN: bb.0:
|
||||
; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
|
||||
; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
|
||||
; GCN-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 1, killed [[COPY]], implicit-def $scc
|
||||
; GCN-NEXT: S_NOP 0, implicit $scc
|
||||
; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
|
||||
; GCN-NEXT: S_BRANCH %bb.1
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: bb.1:
|
||||
; GCN-NEXT: successors: %bb.2(0x80000000)
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: bb.2:
|
||||
; GCN-NEXT: S_ENDPGM 0
|
||||
bb.0:
|
||||
successors: %bb.1(0x40000000), %bb.2(0x40000000)
|
||||
liveins: $sgpr0, $vgpr0_vgpr1
|
||||
|
||||
%0:sreg_32 = COPY $sgpr0
|
||||
%1:sreg_32 = S_XOR_B32 1, killed %0, implicit-def $scc
|
||||
S_NOP 0, implicit killed $scc
|
||||
S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc
|
||||
S_CBRANCH_SCC0 %bb.2, implicit $scc
|
||||
S_BRANCH %bb.1
|
||||
|
||||
bb.1:
|
||||
successors: %bb.2(0x80000000)
|
||||
|
||||
bb.2:
|
||||
S_ENDPGM 0
|
||||
|
||||
...
|
||||
---
|
||||
name: absdiff_1_cmp_lg_0_killed_scc
|
||||
body: |
|
||||
; GCN-LABEL: name: absdiff_1_cmp_lg_0_killed_scc
|
||||
; GCN: bb.0:
|
||||
; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
|
||||
; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
|
||||
; GCN-NEXT: [[S_ABSDIFF_I32_:%[0-9]+]]:sreg_32 = S_ABSDIFF_I32 1, killed [[COPY]], implicit-def $scc
|
||||
; GCN-NEXT: S_NOP 0, implicit $scc
|
||||
; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
|
||||
; GCN-NEXT: S_BRANCH %bb.1
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: bb.1:
|
||||
; GCN-NEXT: successors: %bb.2(0x80000000)
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: bb.2:
|
||||
; GCN-NEXT: S_ENDPGM 0
|
||||
bb.0:
|
||||
successors: %bb.1(0x40000000), %bb.2(0x40000000)
|
||||
liveins: $sgpr0, $vgpr0_vgpr1
|
||||
|
||||
%0:sreg_32 = COPY $sgpr0
|
||||
%1:sreg_32 = S_ABSDIFF_I32 1, killed %0, implicit-def $scc
|
||||
S_NOP 0, implicit killed $scc
|
||||
S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc
|
||||
S_CBRANCH_SCC0 %bb.2, implicit $scc
|
||||
S_BRANCH %bb.1
|
||||
|
||||
bb.1:
|
||||
successors: %bb.2(0x80000000)
|
||||
|
||||
bb.2:
|
||||
S_ENDPGM 0
|
||||
|
||||
...
|
||||
|
||||
---
|
||||
@ -2070,8 +2143,7 @@ body: |
|
||||
; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
|
||||
; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def dead $scc
|
||||
; GCN-NEXT: S_CMP_LG_U32 killed [[S_AND_B32_]], 0, implicit-def $scc
|
||||
; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def $scc
|
||||
; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
|
||||
; GCN-NEXT: S_BRANCH %bb.1
|
||||
; GCN-NEXT: {{ $}}
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
|
||||
|
||||
declare i32 @llvm.ctpop.i32(i32)
|
||||
declare i64 @llvm.ctpop.i64(i64)
|
||||
@ -10,7 +10,6 @@ define amdgpu_ps i32 @shl32(i32 inreg %val0, i32 inreg %val1) {
|
||||
; CHECK-LABEL: shl32:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_lshl_b32 s0, s0, s1
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@ -25,7 +24,6 @@ define amdgpu_ps i32 @shl64(i64 inreg %val0, i64 inreg %val1) {
|
||||
; CHECK-LABEL: shl64:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@ -40,7 +38,6 @@ define amdgpu_ps i32 @lshr32(i32 inreg %val0, i32 inreg %val1) {
|
||||
; CHECK-LABEL: lshr32:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_lshr_b32 s0, s0, s1
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@ -55,7 +52,6 @@ define amdgpu_ps i32 @lshr64(i64 inreg %val0, i64 inreg %val1) {
|
||||
; CHECK-LABEL: lshr64:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@ -70,7 +66,6 @@ define amdgpu_ps i32 @ashr32(i32 inreg %val0, i32 inreg %val1) {
|
||||
; CHECK-LABEL: ashr32:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_ashr_i32 s0, s0, s1
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@ -85,7 +80,6 @@ define amdgpu_ps i32 @ashr64(i64 inreg %val0, i64 inreg %val1) {
|
||||
; CHECK-LABEL: ashr64:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_ashr_i64 s[0:1], s[0:1], s2
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@ -100,7 +94,6 @@ define amdgpu_ps i32 @abs32(i32 inreg %val0) {
|
||||
; CHECK-LABEL: abs32:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_abs_i32 s0, s0
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; CHECK-NEXT: ;;#ASMSTART
|
||||
; CHECK-NEXT: ; use s0
|
||||
; CHECK-NEXT: ;;#ASMEND
|
||||
@ -121,7 +114,6 @@ define amdgpu_ps i32 @and32(i32 inreg %val0, i32 inreg %val1) {
|
||||
; CHECK-LABEL: and32:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_and_b32 s0, s0, s1
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@ -136,7 +128,6 @@ define amdgpu_ps i32 @and64(i64 inreg %val0, i64 inreg %val1) {
|
||||
; CHECK-LABEL: and64:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@ -151,7 +142,6 @@ define amdgpu_ps i32 @or32(i32 inreg %val0, i32 inreg %val1) {
|
||||
; CHECK-LABEL: or32:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_or_b32 s0, s0, s1
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@ -166,7 +156,6 @@ define amdgpu_ps i32 @or64(i64 inreg %val0, i64 inreg %val1) {
|
||||
; CHECK-LABEL: or64:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@ -181,7 +170,6 @@ define amdgpu_ps i32 @xor32(i32 inreg %val0, i32 inreg %val1) {
|
||||
; CHECK-LABEL: xor32:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_xor_b32 s0, s0, s1
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@ -196,7 +184,6 @@ define amdgpu_ps i32 @xor64(i64 inreg %val0, i64 inreg %val1) {
|
||||
; CHECK-LABEL: xor64:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@ -211,7 +198,6 @@ define amdgpu_ps i32 @nand32(i32 inreg %val0, i32 inreg %val1) {
|
||||
; CHECK-LABEL: nand32:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_nand_b32 s0, s0, s1
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; CHECK-NEXT: ;;#ASMSTART
|
||||
; CHECK-NEXT: ; use s0
|
||||
; CHECK-NEXT: ;;#ASMEND
|
||||
@ -231,7 +217,6 @@ define amdgpu_ps i32 @nand64(i64 inreg %val0, i64 inreg %val1) {
|
||||
; CHECK-LABEL: nand64:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3]
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; CHECK-NEXT: ;;#ASMSTART
|
||||
; CHECK-NEXT: ; use s[0:1]
|
||||
; CHECK-NEXT: ;;#ASMEND
|
||||
@ -251,7 +236,6 @@ define amdgpu_ps i32 @nor32(i32 inreg %val0, i32 inreg %val1) {
|
||||
; CHECK-LABEL: nor32:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_nor_b32 s0, s0, s1
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; CHECK-NEXT: ;;#ASMSTART
|
||||
; CHECK-NEXT: ; use s0
|
||||
; CHECK-NEXT: ;;#ASMEND
|
||||
@ -271,7 +255,6 @@ define amdgpu_ps i32 @nor64(i64 inreg %val0, i64 inreg %val1) {
|
||||
; CHECK-LABEL: nor64:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3]
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; CHECK-NEXT: ;;#ASMSTART
|
||||
; CHECK-NEXT: ; use s[0:1]
|
||||
; CHECK-NEXT: ;;#ASMEND
|
||||
@ -291,7 +274,6 @@ define amdgpu_ps i32 @xnor32(i32 inreg %val0, i32 inreg %val1) {
|
||||
; CHECK-LABEL: xnor32:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_xnor_b32 s0, s0, s1
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; CHECK-NEXT: ;;#ASMSTART
|
||||
; CHECK-NEXT: ; use s0
|
||||
; CHECK-NEXT: ;;#ASMEND
|
||||
@ -311,7 +293,6 @@ define amdgpu_ps i32 @xnor64(i64 inreg %val0, i64 inreg %val1) {
|
||||
; CHECK-LABEL: xnor64:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_xnor_b64 s[0:1], s[0:1], s[2:3]
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; CHECK-NEXT: ;;#ASMSTART
|
||||
; CHECK-NEXT: ; use s[0:1]
|
||||
; CHECK-NEXT: ;;#ASMEND
|
||||
@ -331,7 +312,6 @@ define amdgpu_ps i32 @andn232(i32 inreg %val0, i32 inreg %val1) {
|
||||
; CHECK-LABEL: andn232:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_andn2_b32 s0, s0, s1
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@ -347,7 +327,6 @@ define amdgpu_ps i32 @nandn264(i64 inreg %val0, i64 inreg %val1) {
|
||||
; CHECK-LABEL: nandn264:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@ -363,7 +342,6 @@ define amdgpu_ps i32 @orn232(i32 inreg %val0, i32 inreg %val1) {
|
||||
; CHECK-LABEL: orn232:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_orn2_b32 s0, s0, s1
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@ -379,7 +357,6 @@ define amdgpu_ps i32 @orn264(i64 inreg %val0, i64 inreg %val1) {
|
||||
; CHECK-LABEL: orn264:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_orn2_b64 s[0:1], s[0:1], s[2:3]
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@ -395,7 +372,6 @@ define amdgpu_ps i32 @bfe_i32(i32 inreg %val0) {
|
||||
; CHECK-LABEL: bfe_i32:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_bfe_i32 s0, s0, 0x80010
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@ -433,7 +409,6 @@ define amdgpu_ps i32 @bfe_u32(i32 inreg %val0) {
|
||||
; CHECK-LABEL: bfe_u32:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_bfe_u32 s0, s0, 0x80010
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
|
||||
@ -513,7 +488,6 @@ define amdgpu_ps i32 @bcnt132(i32 inreg %val0) {
|
||||
; CHECK-LABEL: bcnt132:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; CHECK-NEXT: ;;#ASMSTART
|
||||
; CHECK-NEXT: ; use s0
|
||||
; CHECK-NEXT: ;;#ASMEND
|
||||
@ -552,7 +526,6 @@ define amdgpu_ps i32 @quadmask32(i32 inreg %val0) {
|
||||
; CHECK-LABEL: quadmask32:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_quadmask_b32 s0, s0
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; CHECK-NEXT: ;;#ASMSTART
|
||||
; CHECK-NEXT: ; use s0
|
||||
; CHECK-NEXT: ;;#ASMEND
|
||||
@ -571,7 +544,6 @@ define amdgpu_ps i32 @quadmask64(i64 inreg %val0) {
|
||||
; CHECK-LABEL: quadmask64:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_quadmask_b64 s[0:1], s[0:1]
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; CHECK-NEXT: ;;#ASMSTART
|
||||
; CHECK-NEXT: ; use s[0:1]
|
||||
; CHECK-NEXT: ;;#ASMEND
|
||||
@ -590,7 +562,6 @@ define amdgpu_ps i32 @not32(i32 inreg %val0) {
|
||||
; CHECK-LABEL: not32:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_not_b32 s0, s0
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; CHECK-NEXT: ;;#ASMSTART
|
||||
; CHECK-NEXT: ; use s0
|
||||
; CHECK-NEXT: ;;#ASMEND
|
||||
@ -609,7 +580,6 @@ define amdgpu_ps i32 @not64(i64 inreg %val0) {
|
||||
; CHECK-LABEL: not64:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_not_b64 s[0:1], s[0:1]
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; CHECK-NEXT: ;;#ASMSTART
|
||||
; CHECK-NEXT: ; use s[0:1]
|
||||
; CHECK-NEXT: ;;#ASMEND
|
||||
@ -623,3 +593,35 @@ define amdgpu_ps i32 @not64(i64 inreg %val0) {
|
||||
%zext = zext i1 %cmp to i32
|
||||
ret i32 %zext
|
||||
}
|
||||
|
||||
|
||||
; --------------------------------------------------------------------------------
|
||||
; Negative tests
|
||||
; --------------------------------------------------------------------------------
|
||||
|
||||
@1 = extern_weak dso_local addrspace(4) constant i32
|
||||
|
||||
define amdgpu_ps i32 @si_pc_add_rel_offset_must_not_optimize() {
|
||||
; CHECK-LABEL: si_pc_add_rel_offset_must_not_optimize:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_getpc_b64 s[0:1]
|
||||
; CHECK-NEXT: s_add_u32 s0, s0, __unnamed_1@rel32@lo+4
|
||||
; CHECK-NEXT: s_addc_u32 s1, s1, __unnamed_1@rel32@hi+12
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; CHECK-NEXT: s_cbranch_scc0 .LBB35_2
|
||||
; CHECK-NEXT: ; %bb.1: ; %endif
|
||||
; CHECK-NEXT: s_mov_b32 s0, 1
|
||||
; CHECK-NEXT: s_branch .LBB35_3
|
||||
; CHECK-NEXT: .LBB35_2: ; %if
|
||||
; CHECK-NEXT: s_mov_b32 s0, 0
|
||||
; CHECK-NEXT: s_branch .LBB35_3
|
||||
; CHECK-NEXT: .LBB35_3:
|
||||
%cmp = icmp ne ptr addrspace(4) @1, null
|
||||
br i1 %cmp, label %endif, label %if
|
||||
|
||||
if:
|
||||
ret i32 0
|
||||
|
||||
endif:
|
||||
ret i32 1
|
||||
}
|
||||
|
||||
@ -12,8 +12,6 @@ define amdgpu_ps i32 @s_uaddo_pseudo(i32 inreg %val0) {
|
||||
; CHECK-LABEL: s_uaddo_pseudo:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_add_u32 s0, s0, 1
|
||||
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; CHECK-NEXT: s_addc_u32 s0, 1, 0
|
||||
; CHECK-NEXT: ; return to shader part epilog
|
||||
%pair = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %val0, i32 1)
|
||||
@ -32,8 +30,6 @@ define amdgpu_ps i32 @s_usubo_pseudo(i32 inreg %val0, i32 inreg %val1) {
|
||||
; CHECK-LABEL: s_usubo_pseudo:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_sub_u32 s0, s0, 1
|
||||
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
|
||||
; CHECK-NEXT: s_subb_u32 s0, s1, 0
|
||||
; CHECK-NEXT: ; return to shader part epilog
|
||||
%pair = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %val0, i32 1)
|
||||
|
||||
@ -56,10 +56,9 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
|
||||
; GCN-NEXT: s_addc_u32 s15, 0, s16
|
||||
; GCN-NEXT: s_add_u32 s16, s0, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s16
|
||||
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GCN-NEXT: v_mul_hi_u32 v0, s12, v0
|
||||
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s0, s0, s1
|
||||
; GCN-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GCN-NEXT: s_addc_u32 s14, s14, s15
|
||||
; GCN-NEXT: s_mul_i32 s0, s12, s14
|
||||
; GCN-NEXT: v_readfirstlane_b32 s1, v0
|
||||
@ -90,7 +89,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
|
||||
; GCN-NEXT: s_add_u32 s15, s16, s0
|
||||
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s0, s0, s1
|
||||
; GCN-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GCN-NEXT: s_addc_u32 s14, s14, s12
|
||||
; GCN-NEXT: s_ashr_i32 s12, s7, 31
|
||||
; GCN-NEXT: s_add_u32 s0, s6, s12
|
||||
@ -116,52 +114,50 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
|
||||
; GCN-NEXT: v_readfirstlane_b32 s4, v0
|
||||
; GCN-NEXT: s_addc_u32 s4, s4, 0
|
||||
; GCN-NEXT: s_mul_i32 s14, s7, s14
|
||||
; GCN-NEXT: s_add_u32 s14, s1, s14
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s14
|
||||
; GCN-NEXT: s_add_u32 s16, s1, s14
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s16
|
||||
; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
|
||||
; GCN-NEXT: s_addc_u32 s15, 0, s4
|
||||
; GCN-NEXT: s_addc_u32 s17, 0, s4
|
||||
; GCN-NEXT: s_mov_b32 s1, s5
|
||||
; GCN-NEXT: s_mul_i32 s4, s10, s15
|
||||
; GCN-NEXT: s_mul_i32 s4, s10, s17
|
||||
; GCN-NEXT: v_readfirstlane_b32 s5, v0
|
||||
; GCN-NEXT: s_add_i32 s4, s5, s4
|
||||
; GCN-NEXT: s_mul_i32 s5, s11, s14
|
||||
; GCN-NEXT: s_add_i32 s16, s4, s5
|
||||
; GCN-NEXT: s_sub_i32 s17, s7, s16
|
||||
; GCN-NEXT: s_mul_i32 s4, s10, s14
|
||||
; GCN-NEXT: s_mul_i32 s5, s11, s16
|
||||
; GCN-NEXT: s_add_i32 s18, s4, s5
|
||||
; GCN-NEXT: s_sub_i32 s14, s7, s18
|
||||
; GCN-NEXT: s_mul_i32 s4, s10, s16
|
||||
; GCN-NEXT: s_sub_u32 s6, s6, s4
|
||||
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s18, s4, s5
|
||||
; GCN-NEXT: s_cmp_lg_u32 s18, 0
|
||||
; GCN-NEXT: s_subb_u32 s17, s17, s11
|
||||
; GCN-NEXT: s_sub_u32 s19, s6, s10
|
||||
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s15, s4, s5
|
||||
; GCN-NEXT: s_subb_u32 s19, s14, s11
|
||||
; GCN-NEXT: s_sub_u32 s20, s6, s10
|
||||
; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s14, s14, s15
|
||||
; GCN-NEXT: s_subb_u32 s14, s19, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s14, s11
|
||||
; GCN-NEXT: s_cselect_b32 s15, -1, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s20, s10
|
||||
; GCN-NEXT: s_cselect_b32 s19, -1, 0
|
||||
; GCN-NEXT: s_cmp_eq_u32 s14, s11
|
||||
; GCN-NEXT: s_cselect_b32 s14, s19, s15
|
||||
; GCN-NEXT: s_add_u32 s15, s16, 1
|
||||
; GCN-NEXT: s_addc_u32 s19, s17, 0
|
||||
; GCN-NEXT: s_add_u32 s20, s16, 2
|
||||
; GCN-NEXT: s_addc_u32 s21, s17, 0
|
||||
; GCN-NEXT: s_cmp_lg_u32 s14, 0
|
||||
; GCN-NEXT: s_cselect_b32 s14, s20, s15
|
||||
; GCN-NEXT: s_cselect_b32 s15, s21, s19
|
||||
; GCN-NEXT: s_or_b32 s4, s4, s5
|
||||
; GCN-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GCN-NEXT: s_subb_u32 s4, s17, 0
|
||||
; GCN-NEXT: s_subb_u32 s4, s7, s18
|
||||
; GCN-NEXT: s_cmp_ge_u32 s4, s11
|
||||
; GCN-NEXT: s_cselect_b32 s5, -1, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s19, s10
|
||||
; GCN-NEXT: s_cselect_b32 s17, -1, 0
|
||||
; GCN-NEXT: s_cmp_eq_u32 s4, s11
|
||||
; GCN-NEXT: s_cselect_b32 s4, s17, s5
|
||||
; GCN-NEXT: s_add_u32 s5, s14, 1
|
||||
; GCN-NEXT: s_addc_u32 s17, s15, 0
|
||||
; GCN-NEXT: s_add_u32 s19, s14, 2
|
||||
; GCN-NEXT: s_addc_u32 s20, s15, 0
|
||||
; GCN-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GCN-NEXT: s_cselect_b32 s4, s19, s5
|
||||
; GCN-NEXT: s_cselect_b32 s5, s20, s17
|
||||
; GCN-NEXT: s_cmp_lg_u32 s18, 0
|
||||
; GCN-NEXT: s_subb_u32 s7, s7, s16
|
||||
; GCN-NEXT: s_cmp_ge_u32 s7, s11
|
||||
; GCN-NEXT: s_cselect_b32 s16, -1, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s6, s10
|
||||
; GCN-NEXT: s_cselect_b32 s6, -1, 0
|
||||
; GCN-NEXT: s_cmp_eq_u32 s7, s11
|
||||
; GCN-NEXT: s_cselect_b32 s6, s6, s16
|
||||
; GCN-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GCN-NEXT: s_cselect_b32 s5, s5, s15
|
||||
; GCN-NEXT: s_cselect_b32 s4, s4, s14
|
||||
; GCN-NEXT: s_cmp_eq_u32 s4, s11
|
||||
; GCN-NEXT: s_cselect_b32 s4, s6, s5
|
||||
; GCN-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GCN-NEXT: s_cselect_b32 s5, s15, s17
|
||||
; GCN-NEXT: s_cselect_b32 s4, s14, s16
|
||||
; GCN-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9]
|
||||
; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
|
||||
; GCN-NEXT: s_sub_u32 s4, s4, s6
|
||||
@ -208,7 +204,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
|
||||
; GCN-IR-NEXT: s_add_u32 s18, s16, 1
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
|
||||
; GCN-IR-NEXT: s_or_b32 s10, s10, s11
|
||||
; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0
|
||||
; GCN-IR-NEXT: s_addc_u32 s10, s17, 0
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
|
||||
; GCN-IR-NEXT: s_sub_i32 s16, 63, s16
|
||||
@ -242,7 +237,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
|
||||
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
|
||||
; GCN-IR-NEXT: s_or_b32 s20, s20, s21
|
||||
; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0
|
||||
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
|
||||
; GCN-IR-NEXT: s_mov_b64 s[12:13], s[8:9]
|
||||
@ -1195,10 +1189,9 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
|
||||
; GCN-NEXT: s_addc_u32 s12, 0, s13
|
||||
; GCN-NEXT: s_add_u32 s13, s8, s9
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s13
|
||||
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
|
||||
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s8, s8, s9
|
||||
; GCN-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GCN-NEXT: s_addc_u32 s11, s11, s12
|
||||
; GCN-NEXT: s_mul_i32 s8, s2, s11
|
||||
; GCN-NEXT: v_readfirstlane_b32 s9, v0
|
||||
@ -1229,7 +1222,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
|
||||
; GCN-NEXT: s_add_u32 s2, s13, s2
|
||||
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s8, s8, s9
|
||||
; GCN-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GCN-NEXT: s_addc_u32 s8, s11, s10
|
||||
; GCN-NEXT: v_mul_hi_u32 v1, s2, 24
|
||||
; GCN-NEXT: v_mul_hi_u32 v0, s8, 24
|
||||
@ -1238,48 +1230,46 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
|
||||
; GCN-NEXT: v_readfirstlane_b32 s10, v1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s9, v0
|
||||
; GCN-NEXT: s_add_u32 s8, s10, s8
|
||||
; GCN-NEXT: s_addc_u32 s10, 0, s9
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s10
|
||||
; GCN-NEXT: s_addc_u32 s12, 0, s9
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s12
|
||||
; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
|
||||
; GCN-NEXT: s_mul_i32 s8, s7, s10
|
||||
; GCN-NEXT: s_mul_i32 s8, s7, s12
|
||||
; GCN-NEXT: v_readfirstlane_b32 s9, v0
|
||||
; GCN-NEXT: s_add_i32 s11, s9, s8
|
||||
; GCN-NEXT: s_sub_i32 s12, 0, s11
|
||||
; GCN-NEXT: s_mul_i32 s8, s6, s10
|
||||
; GCN-NEXT: s_sub_u32 s13, 24, s8
|
||||
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s14, s8, s9
|
||||
; GCN-NEXT: s_cmp_lg_u32 s14, 0
|
||||
; GCN-NEXT: s_subb_u32 s12, s12, s7
|
||||
; GCN-NEXT: s_sub_u32 s15, s13, s6
|
||||
; GCN-NEXT: s_add_i32 s13, s9, s8
|
||||
; GCN-NEXT: s_sub_i32 s10, 0, s13
|
||||
; GCN-NEXT: s_mul_i32 s8, s6, s12
|
||||
; GCN-NEXT: s_sub_u32 s14, 24, s8
|
||||
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s11, s8, s9
|
||||
; GCN-NEXT: s_subb_u32 s15, s10, s7
|
||||
; GCN-NEXT: s_sub_u32 s16, s14, s6
|
||||
; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s10, s10, s11
|
||||
; GCN-NEXT: s_subb_u32 s10, s15, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s10, s7
|
||||
; GCN-NEXT: s_cselect_b32 s11, -1, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s16, s6
|
||||
; GCN-NEXT: s_cselect_b32 s15, -1, 0
|
||||
; GCN-NEXT: s_cmp_eq_u32 s10, s7
|
||||
; GCN-NEXT: s_cselect_b32 s10, s15, s11
|
||||
; GCN-NEXT: s_add_u32 s11, s12, 1
|
||||
; GCN-NEXT: s_addc_u32 s15, 0, 0
|
||||
; GCN-NEXT: s_add_u32 s16, s12, 2
|
||||
; GCN-NEXT: s_addc_u32 s17, 0, 0
|
||||
; GCN-NEXT: s_cmp_lg_u32 s10, 0
|
||||
; GCN-NEXT: s_cselect_b32 s10, s16, s11
|
||||
; GCN-NEXT: s_cselect_b32 s11, s17, s15
|
||||
; GCN-NEXT: s_or_b32 s8, s8, s9
|
||||
; GCN-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GCN-NEXT: s_subb_u32 s8, s12, 0
|
||||
; GCN-NEXT: s_subb_u32 s8, 0, s13
|
||||
; GCN-NEXT: s_cmp_ge_u32 s8, s7
|
||||
; GCN-NEXT: s_cselect_b32 s9, -1, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s15, s6
|
||||
; GCN-NEXT: s_cselect_b32 s12, -1, 0
|
||||
; GCN-NEXT: s_cmp_eq_u32 s8, s7
|
||||
; GCN-NEXT: s_cselect_b32 s8, s12, s9
|
||||
; GCN-NEXT: s_add_u32 s9, s10, 1
|
||||
; GCN-NEXT: s_addc_u32 s12, 0, 0
|
||||
; GCN-NEXT: s_add_u32 s15, s10, 2
|
||||
; GCN-NEXT: s_addc_u32 s16, 0, 0
|
||||
; GCN-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GCN-NEXT: s_cselect_b32 s8, s15, s9
|
||||
; GCN-NEXT: s_cselect_b32 s9, s16, s12
|
||||
; GCN-NEXT: s_cmp_lg_u32 s14, 0
|
||||
; GCN-NEXT: s_subb_u32 s11, 0, s11
|
||||
; GCN-NEXT: s_cmp_ge_u32 s11, s7
|
||||
; GCN-NEXT: s_cselect_b32 s12, -1, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s13, s6
|
||||
; GCN-NEXT: s_cmp_ge_u32 s14, s6
|
||||
; GCN-NEXT: s_cselect_b32 s6, -1, 0
|
||||
; GCN-NEXT: s_cmp_eq_u32 s11, s7
|
||||
; GCN-NEXT: s_cselect_b32 s6, s6, s12
|
||||
; GCN-NEXT: s_cmp_eq_u32 s8, s7
|
||||
; GCN-NEXT: s_cselect_b32 s6, s6, s9
|
||||
; GCN-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GCN-NEXT: s_cselect_b32 s7, s9, 0
|
||||
; GCN-NEXT: s_cselect_b32 s6, s8, s10
|
||||
; GCN-NEXT: s_cselect_b32 s7, s11, 0
|
||||
; GCN-NEXT: s_cselect_b32 s6, s10, s12
|
||||
; GCN-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
|
||||
; GCN-NEXT: s_sub_u32 s6, s6, s4
|
||||
; GCN-NEXT: s_subb_u32 s7, s7, s4
|
||||
@ -1315,7 +1305,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
|
||||
; GCN-IR-NEXT: s_add_u32 s12, s10, 1
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
|
||||
; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GCN-IR-NEXT: s_addc_u32 s8, s11, 0
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GCN-IR-NEXT: s_sub_i32 s10, 63, s10
|
||||
@ -1348,7 +1337,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
|
||||
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
|
||||
; GCN-IR-NEXT: s_or_b32 s18, s18, s19
|
||||
; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0
|
||||
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
|
||||
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7]
|
||||
|
||||
@ -1513,7 +1513,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
|
||||
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8
|
||||
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9
|
||||
; GCN-NEXT: s_sub_u32 s3, 0, s8
|
||||
; GCN-NEXT: s_subb_u32 s12, 0, s9
|
||||
; GCN-NEXT: s_subb_u32 s10, 0, s9
|
||||
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
|
||||
; GCN-NEXT: v_rcp_f32_e32 v0, v0
|
||||
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
@ -1522,56 +1522,52 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
|
||||
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
|
||||
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GCN-NEXT: v_readfirstlane_b32 s13, v1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s10, v0
|
||||
; GCN-NEXT: s_mul_i32 s11, s3, s13
|
||||
; GCN-NEXT: s_mul_hi_u32 s15, s3, s10
|
||||
; GCN-NEXT: s_mul_i32 s14, s12, s10
|
||||
; GCN-NEXT: s_add_i32 s11, s15, s11
|
||||
; GCN-NEXT: s_add_i32 s11, s11, s14
|
||||
; GCN-NEXT: s_mul_i32 s16, s3, s10
|
||||
; GCN-NEXT: s_mul_i32 s15, s10, s11
|
||||
; GCN-NEXT: s_mul_hi_u32 s17, s10, s16
|
||||
; GCN-NEXT: s_mul_hi_u32 s14, s10, s11
|
||||
; GCN-NEXT: v_readfirstlane_b32 s11, v1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s12, v0
|
||||
; GCN-NEXT: s_mul_i32 s13, s3, s11
|
||||
; GCN-NEXT: s_mul_hi_u32 s15, s3, s12
|
||||
; GCN-NEXT: s_mul_i32 s14, s10, s12
|
||||
; GCN-NEXT: s_add_i32 s13, s15, s13
|
||||
; GCN-NEXT: s_add_i32 s13, s13, s14
|
||||
; GCN-NEXT: s_mul_i32 s16, s3, s12
|
||||
; GCN-NEXT: s_mul_i32 s15, s12, s13
|
||||
; GCN-NEXT: s_mul_hi_u32 s17, s12, s16
|
||||
; GCN-NEXT: s_mul_hi_u32 s14, s12, s13
|
||||
; GCN-NEXT: s_add_u32 s15, s17, s15
|
||||
; GCN-NEXT: s_addc_u32 s14, 0, s14
|
||||
; GCN-NEXT: s_mul_hi_u32 s18, s13, s16
|
||||
; GCN-NEXT: s_mul_i32 s16, s13, s16
|
||||
; GCN-NEXT: s_mul_hi_u32 s18, s11, s16
|
||||
; GCN-NEXT: s_mul_i32 s16, s11, s16
|
||||
; GCN-NEXT: s_add_u32 s15, s15, s16
|
||||
; GCN-NEXT: s_mul_hi_u32 s17, s13, s11
|
||||
; GCN-NEXT: s_mul_hi_u32 s17, s11, s13
|
||||
; GCN-NEXT: s_addc_u32 s14, s14, s18
|
||||
; GCN-NEXT: s_addc_u32 s15, s17, 0
|
||||
; GCN-NEXT: s_mul_i32 s11, s13, s11
|
||||
; GCN-NEXT: s_add_u32 s11, s14, s11
|
||||
; GCN-NEXT: s_mul_i32 s13, s11, s13
|
||||
; GCN-NEXT: s_add_u32 s13, s14, s13
|
||||
; GCN-NEXT: s_addc_u32 s14, 0, s15
|
||||
; GCN-NEXT: s_add_u32 s15, s10, s11
|
||||
; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0
|
||||
; GCN-NEXT: s_addc_u32 s13, s13, s14
|
||||
; GCN-NEXT: s_mul_i32 s10, s3, s13
|
||||
; GCN-NEXT: s_mul_hi_u32 s11, s3, s15
|
||||
; GCN-NEXT: s_add_i32 s10, s11, s10
|
||||
; GCN-NEXT: s_mul_i32 s12, s12, s15
|
||||
; GCN-NEXT: s_add_i32 s10, s10, s12
|
||||
; GCN-NEXT: s_mul_i32 s3, s3, s15
|
||||
; GCN-NEXT: s_mul_hi_u32 s12, s13, s3
|
||||
; GCN-NEXT: s_mul_i32 s14, s13, s3
|
||||
; GCN-NEXT: s_mul_i32 s17, s15, s10
|
||||
; GCN-NEXT: s_mul_hi_u32 s3, s15, s3
|
||||
; GCN-NEXT: s_mul_hi_u32 s16, s15, s10
|
||||
; GCN-NEXT: s_add_u32 s12, s12, s13
|
||||
; GCN-NEXT: s_addc_u32 s11, s11, s14
|
||||
; GCN-NEXT: s_mul_i32 s13, s3, s11
|
||||
; GCN-NEXT: s_mul_hi_u32 s14, s3, s12
|
||||
; GCN-NEXT: s_add_i32 s13, s14, s13
|
||||
; GCN-NEXT: s_mul_i32 s10, s10, s12
|
||||
; GCN-NEXT: s_add_i32 s13, s13, s10
|
||||
; GCN-NEXT: s_mul_i32 s3, s3, s12
|
||||
; GCN-NEXT: s_mul_hi_u32 s14, s11, s3
|
||||
; GCN-NEXT: s_mul_i32 s15, s11, s3
|
||||
; GCN-NEXT: s_mul_i32 s17, s12, s13
|
||||
; GCN-NEXT: s_mul_hi_u32 s3, s12, s3
|
||||
; GCN-NEXT: s_mul_hi_u32 s16, s12, s13
|
||||
; GCN-NEXT: s_add_u32 s3, s3, s17
|
||||
; GCN-NEXT: s_addc_u32 s16, 0, s16
|
||||
; GCN-NEXT: s_add_u32 s3, s3, s14
|
||||
; GCN-NEXT: s_mul_hi_u32 s11, s13, s10
|
||||
; GCN-NEXT: s_addc_u32 s3, s16, s12
|
||||
; GCN-NEXT: s_addc_u32 s11, s11, 0
|
||||
; GCN-NEXT: s_mul_i32 s10, s13, s10
|
||||
; GCN-NEXT: s_add_u32 s3, s3, s10
|
||||
; GCN-NEXT: s_addc_u32 s12, 0, s11
|
||||
; GCN-NEXT: s_add_u32 s3, s15, s3
|
||||
; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0
|
||||
; GCN-NEXT: s_addc_u32 s14, s13, s12
|
||||
; GCN-NEXT: s_add_u32 s3, s3, s15
|
||||
; GCN-NEXT: s_mul_hi_u32 s10, s11, s13
|
||||
; GCN-NEXT: s_addc_u32 s3, s16, s14
|
||||
; GCN-NEXT: s_addc_u32 s10, s10, 0
|
||||
; GCN-NEXT: s_mul_i32 s13, s11, s13
|
||||
; GCN-NEXT: s_add_u32 s3, s3, s13
|
||||
; GCN-NEXT: s_addc_u32 s10, 0, s10
|
||||
; GCN-NEXT: s_add_u32 s3, s12, s3
|
||||
; GCN-NEXT: s_addc_u32 s14, s11, s10
|
||||
; GCN-NEXT: s_ashr_i32 s10, s5, 31
|
||||
; GCN-NEXT: s_add_u32 s12, s4, s10
|
||||
; GCN-NEXT: s_mov_b32 s11, s10
|
||||
@ -1600,11 +1596,9 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
|
||||
; GCN-NEXT: s_mul_i32 s3, s8, s3
|
||||
; GCN-NEXT: s_sub_u32 s3, s12, s3
|
||||
; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
|
||||
; GCN-NEXT: s_subb_u32 s12, s16, s9
|
||||
; GCN-NEXT: s_sub_u32 s18, s3, s8
|
||||
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
|
||||
; GCN-NEXT: s_subb_u32 s19, s12, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s19, s9
|
||||
; GCN-NEXT: s_cselect_b32 s20, -1, 0
|
||||
@ -1614,12 +1608,10 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
|
||||
; GCN-NEXT: s_cselect_b32 s20, s21, s20
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
|
||||
; GCN-NEXT: s_subb_u32 s12, s12, s9
|
||||
; GCN-NEXT: s_sub_u32 s21, s18, s8
|
||||
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
|
||||
; GCN-NEXT: s_sub_u32 s16, s18, s8
|
||||
; GCN-NEXT: s_subb_u32 s12, s12, 0
|
||||
; GCN-NEXT: s_cmp_lg_u32 s20, 0
|
||||
; GCN-NEXT: s_cselect_b32 s16, s21, s18
|
||||
; GCN-NEXT: s_cselect_b32 s16, s16, s18
|
||||
; GCN-NEXT: s_cselect_b32 s12, s12, s19
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
|
||||
; GCN-NEXT: s_subb_u32 s5, s13, s5
|
||||
@ -1931,11 +1923,9 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
|
||||
; TONGA-NEXT: v_readfirstlane_b32 s14, v0
|
||||
; TONGA-NEXT: s_sub_u32 s12, s12, s14
|
||||
; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0
|
||||
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
|
||||
; TONGA-NEXT: s_subb_u32 s3, s3, s7
|
||||
; TONGA-NEXT: s_sub_u32 s18, s12, s6
|
||||
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
|
||||
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
|
||||
; TONGA-NEXT: s_subb_u32 s19, s3, 0
|
||||
; TONGA-NEXT: s_cmp_ge_u32 s19, s7
|
||||
; TONGA-NEXT: s_cselect_b32 s20, -1, 0
|
||||
@ -1945,12 +1935,10 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
|
||||
; TONGA-NEXT: s_cselect_b32 s20, s21, s20
|
||||
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
|
||||
; TONGA-NEXT: s_subb_u32 s3, s3, s7
|
||||
; TONGA-NEXT: s_sub_u32 s21, s18, s6
|
||||
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
|
||||
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
|
||||
; TONGA-NEXT: s_sub_u32 s16, s18, s6
|
||||
; TONGA-NEXT: s_subb_u32 s3, s3, 0
|
||||
; TONGA-NEXT: s_cmp_lg_u32 s20, 0
|
||||
; TONGA-NEXT: s_cselect_b32 s16, s21, s18
|
||||
; TONGA-NEXT: s_cselect_b32 s16, s16, s18
|
||||
; TONGA-NEXT: s_cselect_b32 s3, s3, s19
|
||||
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
|
||||
; TONGA-NEXT: s_subb_u32 s5, s13, s5
|
||||
@ -2730,7 +2718,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6
|
||||
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7
|
||||
; GCN-NEXT: s_sub_u32 s9, 0, s6
|
||||
; GCN-NEXT: s_subb_u32 s16, 0, s7
|
||||
; GCN-NEXT: s_subb_u32 s14, 0, s7
|
||||
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
|
||||
; GCN-NEXT: v_rcp_f32_e32 v0, v0
|
||||
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
@ -2739,56 +2727,52 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
|
||||
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GCN-NEXT: v_readfirstlane_b32 s17, v1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s14, v0
|
||||
; GCN-NEXT: s_mul_i32 s15, s9, s17
|
||||
; GCN-NEXT: s_mul_hi_u32 s19, s9, s14
|
||||
; GCN-NEXT: s_mul_i32 s18, s16, s14
|
||||
; GCN-NEXT: s_add_i32 s15, s19, s15
|
||||
; GCN-NEXT: s_add_i32 s15, s15, s18
|
||||
; GCN-NEXT: s_mul_i32 s20, s9, s14
|
||||
; GCN-NEXT: s_mul_i32 s19, s14, s15
|
||||
; GCN-NEXT: s_mul_hi_u32 s21, s14, s20
|
||||
; GCN-NEXT: s_mul_hi_u32 s18, s14, s15
|
||||
; GCN-NEXT: v_readfirstlane_b32 s15, v1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s16, v0
|
||||
; GCN-NEXT: s_mul_i32 s17, s9, s15
|
||||
; GCN-NEXT: s_mul_hi_u32 s19, s9, s16
|
||||
; GCN-NEXT: s_mul_i32 s18, s14, s16
|
||||
; GCN-NEXT: s_add_i32 s17, s19, s17
|
||||
; GCN-NEXT: s_add_i32 s17, s17, s18
|
||||
; GCN-NEXT: s_mul_i32 s20, s9, s16
|
||||
; GCN-NEXT: s_mul_i32 s19, s16, s17
|
||||
; GCN-NEXT: s_mul_hi_u32 s21, s16, s20
|
||||
; GCN-NEXT: s_mul_hi_u32 s18, s16, s17
|
||||
; GCN-NEXT: s_add_u32 s19, s21, s19
|
||||
; GCN-NEXT: s_addc_u32 s18, 0, s18
|
||||
; GCN-NEXT: s_mul_hi_u32 s22, s17, s20
|
||||
; GCN-NEXT: s_mul_i32 s20, s17, s20
|
||||
; GCN-NEXT: s_mul_hi_u32 s22, s15, s20
|
||||
; GCN-NEXT: s_mul_i32 s20, s15, s20
|
||||
; GCN-NEXT: s_add_u32 s19, s19, s20
|
||||
; GCN-NEXT: s_mul_hi_u32 s21, s17, s15
|
||||
; GCN-NEXT: s_mul_hi_u32 s21, s15, s17
|
||||
; GCN-NEXT: s_addc_u32 s18, s18, s22
|
||||
; GCN-NEXT: s_addc_u32 s19, s21, 0
|
||||
; GCN-NEXT: s_mul_i32 s15, s17, s15
|
||||
; GCN-NEXT: s_add_u32 s15, s18, s15
|
||||
; GCN-NEXT: s_mul_i32 s17, s15, s17
|
||||
; GCN-NEXT: s_add_u32 s17, s18, s17
|
||||
; GCN-NEXT: s_addc_u32 s18, 0, s19
|
||||
; GCN-NEXT: s_add_u32 s19, s14, s15
|
||||
; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
|
||||
; GCN-NEXT: s_addc_u32 s17, s17, s18
|
||||
; GCN-NEXT: s_mul_i32 s14, s9, s17
|
||||
; GCN-NEXT: s_mul_hi_u32 s15, s9, s19
|
||||
; GCN-NEXT: s_add_i32 s14, s15, s14
|
||||
; GCN-NEXT: s_mul_i32 s16, s16, s19
|
||||
; GCN-NEXT: s_add_i32 s14, s14, s16
|
||||
; GCN-NEXT: s_mul_i32 s9, s9, s19
|
||||
; GCN-NEXT: s_mul_hi_u32 s16, s17, s9
|
||||
; GCN-NEXT: s_mul_i32 s18, s17, s9
|
||||
; GCN-NEXT: s_mul_i32 s21, s19, s14
|
||||
; GCN-NEXT: s_mul_hi_u32 s9, s19, s9
|
||||
; GCN-NEXT: s_mul_hi_u32 s20, s19, s14
|
||||
; GCN-NEXT: s_add_u32 s16, s16, s17
|
||||
; GCN-NEXT: s_addc_u32 s15, s15, s18
|
||||
; GCN-NEXT: s_mul_i32 s17, s9, s15
|
||||
; GCN-NEXT: s_mul_hi_u32 s18, s9, s16
|
||||
; GCN-NEXT: s_add_i32 s17, s18, s17
|
||||
; GCN-NEXT: s_mul_i32 s14, s14, s16
|
||||
; GCN-NEXT: s_add_i32 s17, s17, s14
|
||||
; GCN-NEXT: s_mul_i32 s9, s9, s16
|
||||
; GCN-NEXT: s_mul_hi_u32 s18, s15, s9
|
||||
; GCN-NEXT: s_mul_i32 s19, s15, s9
|
||||
; GCN-NEXT: s_mul_i32 s21, s16, s17
|
||||
; GCN-NEXT: s_mul_hi_u32 s9, s16, s9
|
||||
; GCN-NEXT: s_mul_hi_u32 s20, s16, s17
|
||||
; GCN-NEXT: s_add_u32 s9, s9, s21
|
||||
; GCN-NEXT: s_addc_u32 s20, 0, s20
|
||||
; GCN-NEXT: s_add_u32 s9, s9, s18
|
||||
; GCN-NEXT: s_mul_hi_u32 s15, s17, s14
|
||||
; GCN-NEXT: s_addc_u32 s9, s20, s16
|
||||
; GCN-NEXT: s_addc_u32 s15, s15, 0
|
||||
; GCN-NEXT: s_mul_i32 s14, s17, s14
|
||||
; GCN-NEXT: s_add_u32 s9, s9, s14
|
||||
; GCN-NEXT: s_addc_u32 s16, 0, s15
|
||||
; GCN-NEXT: s_add_u32 s9, s19, s9
|
||||
; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
|
||||
; GCN-NEXT: s_addc_u32 s18, s17, s16
|
||||
; GCN-NEXT: s_add_u32 s9, s9, s19
|
||||
; GCN-NEXT: s_mul_hi_u32 s14, s15, s17
|
||||
; GCN-NEXT: s_addc_u32 s9, s20, s18
|
||||
; GCN-NEXT: s_addc_u32 s14, s14, 0
|
||||
; GCN-NEXT: s_mul_i32 s17, s15, s17
|
||||
; GCN-NEXT: s_add_u32 s9, s9, s17
|
||||
; GCN-NEXT: s_addc_u32 s14, 0, s14
|
||||
; GCN-NEXT: s_add_u32 s9, s16, s9
|
||||
; GCN-NEXT: s_addc_u32 s18, s15, s14
|
||||
; GCN-NEXT: s_ashr_i32 s14, s11, 31
|
||||
; GCN-NEXT: s_add_u32 s16, s10, s14
|
||||
; GCN-NEXT: s_mov_b32 s15, s14
|
||||
@ -2817,11 +2801,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; GCN-NEXT: s_mul_i32 s9, s6, s9
|
||||
; GCN-NEXT: s_sub_u32 s9, s16, s9
|
||||
; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
|
||||
; GCN-NEXT: s_subb_u32 s16, s20, s7
|
||||
; GCN-NEXT: s_sub_u32 s22, s9, s6
|
||||
; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
|
||||
; GCN-NEXT: s_subb_u32 s23, s16, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s23, s7
|
||||
; GCN-NEXT: s_cselect_b32 s24, -1, 0
|
||||
@ -2831,12 +2813,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; GCN-NEXT: s_cselect_b32 s24, s25, s24
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
|
||||
; GCN-NEXT: s_subb_u32 s16, s16, s7
|
||||
; GCN-NEXT: s_sub_u32 s25, s22, s6
|
||||
; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
|
||||
; GCN-NEXT: s_sub_u32 s20, s22, s6
|
||||
; GCN-NEXT: s_subb_u32 s16, s16, 0
|
||||
; GCN-NEXT: s_cmp_lg_u32 s24, 0
|
||||
; GCN-NEXT: s_cselect_b32 s20, s25, s22
|
||||
; GCN-NEXT: s_cselect_b32 s20, s20, s22
|
||||
; GCN-NEXT: s_cselect_b32 s16, s16, s23
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
|
||||
; GCN-NEXT: s_subb_u32 s11, s17, s11
|
||||
@ -2887,7 +2867,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10
|
||||
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11
|
||||
; GCN-NEXT: s_sub_u32 s3, 0, s10
|
||||
; GCN-NEXT: s_subb_u32 s14, 0, s11
|
||||
; GCN-NEXT: s_subb_u32 s12, 0, s11
|
||||
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
|
||||
; GCN-NEXT: v_rcp_f32_e32 v0, v0
|
||||
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
@ -2896,56 +2876,52 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
|
||||
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GCN-NEXT: v_readfirstlane_b32 s15, v1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s12, v0
|
||||
; GCN-NEXT: s_mul_i32 s13, s3, s15
|
||||
; GCN-NEXT: s_mul_hi_u32 s17, s3, s12
|
||||
; GCN-NEXT: s_mul_i32 s16, s14, s12
|
||||
; GCN-NEXT: s_add_i32 s13, s17, s13
|
||||
; GCN-NEXT: s_add_i32 s13, s13, s16
|
||||
; GCN-NEXT: s_mul_i32 s18, s3, s12
|
||||
; GCN-NEXT: s_mul_i32 s17, s12, s13
|
||||
; GCN-NEXT: s_mul_hi_u32 s19, s12, s18
|
||||
; GCN-NEXT: s_mul_hi_u32 s16, s12, s13
|
||||
; GCN-NEXT: v_readfirstlane_b32 s13, v1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s14, v0
|
||||
; GCN-NEXT: s_mul_i32 s15, s3, s13
|
||||
; GCN-NEXT: s_mul_hi_u32 s17, s3, s14
|
||||
; GCN-NEXT: s_mul_i32 s16, s12, s14
|
||||
; GCN-NEXT: s_add_i32 s15, s17, s15
|
||||
; GCN-NEXT: s_add_i32 s15, s15, s16
|
||||
; GCN-NEXT: s_mul_i32 s18, s3, s14
|
||||
; GCN-NEXT: s_mul_i32 s17, s14, s15
|
||||
; GCN-NEXT: s_mul_hi_u32 s19, s14, s18
|
||||
; GCN-NEXT: s_mul_hi_u32 s16, s14, s15
|
||||
; GCN-NEXT: s_add_u32 s17, s19, s17
|
||||
; GCN-NEXT: s_addc_u32 s16, 0, s16
|
||||
; GCN-NEXT: s_mul_hi_u32 s20, s15, s18
|
||||
; GCN-NEXT: s_mul_i32 s18, s15, s18
|
||||
; GCN-NEXT: s_mul_hi_u32 s20, s13, s18
|
||||
; GCN-NEXT: s_mul_i32 s18, s13, s18
|
||||
; GCN-NEXT: s_add_u32 s17, s17, s18
|
||||
; GCN-NEXT: s_mul_hi_u32 s19, s15, s13
|
||||
; GCN-NEXT: s_mul_hi_u32 s19, s13, s15
|
||||
; GCN-NEXT: s_addc_u32 s16, s16, s20
|
||||
; GCN-NEXT: s_addc_u32 s17, s19, 0
|
||||
; GCN-NEXT: s_mul_i32 s13, s15, s13
|
||||
; GCN-NEXT: s_add_u32 s13, s16, s13
|
||||
; GCN-NEXT: s_mul_i32 s15, s13, s15
|
||||
; GCN-NEXT: s_add_u32 s15, s16, s15
|
||||
; GCN-NEXT: s_addc_u32 s16, 0, s17
|
||||
; GCN-NEXT: s_add_u32 s17, s12, s13
|
||||
; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
|
||||
; GCN-NEXT: s_addc_u32 s15, s15, s16
|
||||
; GCN-NEXT: s_mul_i32 s12, s3, s15
|
||||
; GCN-NEXT: s_mul_hi_u32 s13, s3, s17
|
||||
; GCN-NEXT: s_add_i32 s12, s13, s12
|
||||
; GCN-NEXT: s_mul_i32 s14, s14, s17
|
||||
; GCN-NEXT: s_add_i32 s12, s12, s14
|
||||
; GCN-NEXT: s_mul_i32 s3, s3, s17
|
||||
; GCN-NEXT: s_mul_hi_u32 s14, s15, s3
|
||||
; GCN-NEXT: s_mul_i32 s16, s15, s3
|
||||
; GCN-NEXT: s_mul_i32 s19, s17, s12
|
||||
; GCN-NEXT: s_mul_hi_u32 s3, s17, s3
|
||||
; GCN-NEXT: s_mul_hi_u32 s18, s17, s12
|
||||
; GCN-NEXT: s_add_u32 s14, s14, s15
|
||||
; GCN-NEXT: s_addc_u32 s13, s13, s16
|
||||
; GCN-NEXT: s_mul_i32 s15, s3, s13
|
||||
; GCN-NEXT: s_mul_hi_u32 s16, s3, s14
|
||||
; GCN-NEXT: s_add_i32 s15, s16, s15
|
||||
; GCN-NEXT: s_mul_i32 s12, s12, s14
|
||||
; GCN-NEXT: s_add_i32 s15, s15, s12
|
||||
; GCN-NEXT: s_mul_i32 s3, s3, s14
|
||||
; GCN-NEXT: s_mul_hi_u32 s16, s13, s3
|
||||
; GCN-NEXT: s_mul_i32 s17, s13, s3
|
||||
; GCN-NEXT: s_mul_i32 s19, s14, s15
|
||||
; GCN-NEXT: s_mul_hi_u32 s3, s14, s3
|
||||
; GCN-NEXT: s_mul_hi_u32 s18, s14, s15
|
||||
; GCN-NEXT: s_add_u32 s3, s3, s19
|
||||
; GCN-NEXT: s_addc_u32 s18, 0, s18
|
||||
; GCN-NEXT: s_add_u32 s3, s3, s16
|
||||
; GCN-NEXT: s_mul_hi_u32 s13, s15, s12
|
||||
; GCN-NEXT: s_addc_u32 s3, s18, s14
|
||||
; GCN-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GCN-NEXT: s_mul_i32 s12, s15, s12
|
||||
; GCN-NEXT: s_add_u32 s3, s3, s12
|
||||
; GCN-NEXT: s_addc_u32 s14, 0, s13
|
||||
; GCN-NEXT: s_add_u32 s3, s17, s3
|
||||
; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
|
||||
; GCN-NEXT: s_addc_u32 s16, s15, s14
|
||||
; GCN-NEXT: s_add_u32 s3, s3, s17
|
||||
; GCN-NEXT: s_mul_hi_u32 s12, s13, s15
|
||||
; GCN-NEXT: s_addc_u32 s3, s18, s16
|
||||
; GCN-NEXT: s_addc_u32 s12, s12, 0
|
||||
; GCN-NEXT: s_mul_i32 s15, s13, s15
|
||||
; GCN-NEXT: s_add_u32 s3, s3, s15
|
||||
; GCN-NEXT: s_addc_u32 s12, 0, s12
|
||||
; GCN-NEXT: s_add_u32 s3, s14, s3
|
||||
; GCN-NEXT: s_addc_u32 s16, s13, s12
|
||||
; GCN-NEXT: s_ashr_i32 s12, s5, 31
|
||||
; GCN-NEXT: s_add_u32 s14, s4, s12
|
||||
; GCN-NEXT: s_mov_b32 s13, s12
|
||||
@ -2974,11 +2950,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; GCN-NEXT: s_mul_i32 s3, s10, s3
|
||||
; GCN-NEXT: s_sub_u32 s3, s14, s3
|
||||
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
|
||||
; GCN-NEXT: s_subb_u32 s14, s18, s11
|
||||
; GCN-NEXT: s_sub_u32 s20, s3, s10
|
||||
; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
|
||||
; GCN-NEXT: s_subb_u32 s21, s14, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s21, s11
|
||||
; GCN-NEXT: s_cselect_b32 s22, -1, 0
|
||||
@ -2988,12 +2962,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; GCN-NEXT: s_cselect_b32 s22, s23, s22
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
|
||||
; GCN-NEXT: s_subb_u32 s14, s14, s11
|
||||
; GCN-NEXT: s_sub_u32 s23, s20, s10
|
||||
; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
|
||||
; GCN-NEXT: s_sub_u32 s18, s20, s10
|
||||
; GCN-NEXT: s_subb_u32 s14, s14, 0
|
||||
; GCN-NEXT: s_cmp_lg_u32 s22, 0
|
||||
; GCN-NEXT: s_cselect_b32 s18, s23, s20
|
||||
; GCN-NEXT: s_cselect_b32 s18, s18, s20
|
||||
; GCN-NEXT: s_cselect_b32 s14, s14, s21
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
|
||||
; GCN-NEXT: s_subb_u32 s5, s15, s5
|
||||
@ -3463,11 +3435,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; TONGA-NEXT: v_readfirstlane_b32 s14, v0
|
||||
; TONGA-NEXT: s_sub_u32 s12, s12, s14
|
||||
; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0
|
||||
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
|
||||
; TONGA-NEXT: s_subb_u32 s1, s1, s7
|
||||
; TONGA-NEXT: s_sub_u32 s18, s12, s6
|
||||
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
|
||||
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
|
||||
; TONGA-NEXT: s_subb_u32 s19, s1, 0
|
||||
; TONGA-NEXT: s_cmp_ge_u32 s19, s7
|
||||
; TONGA-NEXT: s_cselect_b32 s20, -1, 0
|
||||
@ -3477,12 +3447,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; TONGA-NEXT: s_cselect_b32 s20, s21, s20
|
||||
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
|
||||
; TONGA-NEXT: s_subb_u32 s1, s1, s7
|
||||
; TONGA-NEXT: s_sub_u32 s21, s18, s6
|
||||
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
|
||||
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
|
||||
; TONGA-NEXT: s_sub_u32 s16, s18, s6
|
||||
; TONGA-NEXT: s_subb_u32 s1, s1, 0
|
||||
; TONGA-NEXT: s_cmp_lg_u32 s20, 0
|
||||
; TONGA-NEXT: s_cselect_b32 s16, s21, s18
|
||||
; TONGA-NEXT: s_cselect_b32 s16, s16, s18
|
||||
; TONGA-NEXT: s_cselect_b32 s1, s1, s19
|
||||
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
|
||||
; TONGA-NEXT: s_subb_u32 s3, s13, s3
|
||||
@ -4934,7 +4902,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6
|
||||
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7
|
||||
; GCN-NEXT: s_sub_u32 s17, 0, s6
|
||||
; GCN-NEXT: s_subb_u32 s24, 0, s7
|
||||
; GCN-NEXT: s_subb_u32 s22, 0, s7
|
||||
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
|
||||
; GCN-NEXT: v_rcp_f32_e32 v0, v0
|
||||
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
@ -4943,56 +4911,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
|
||||
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GCN-NEXT: v_readfirstlane_b32 s25, v1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s22, v0
|
||||
; GCN-NEXT: s_mul_i32 s23, s17, s25
|
||||
; GCN-NEXT: s_mul_hi_u32 s27, s17, s22
|
||||
; GCN-NEXT: s_mul_i32 s26, s24, s22
|
||||
; GCN-NEXT: s_add_i32 s23, s27, s23
|
||||
; GCN-NEXT: s_add_i32 s23, s23, s26
|
||||
; GCN-NEXT: s_mul_i32 s28, s17, s22
|
||||
; GCN-NEXT: s_mul_i32 s27, s22, s23
|
||||
; GCN-NEXT: s_mul_hi_u32 s29, s22, s28
|
||||
; GCN-NEXT: s_mul_hi_u32 s26, s22, s23
|
||||
; GCN-NEXT: v_readfirstlane_b32 s23, v1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s24, v0
|
||||
; GCN-NEXT: s_mul_i32 s25, s17, s23
|
||||
; GCN-NEXT: s_mul_hi_u32 s27, s17, s24
|
||||
; GCN-NEXT: s_mul_i32 s26, s22, s24
|
||||
; GCN-NEXT: s_add_i32 s25, s27, s25
|
||||
; GCN-NEXT: s_add_i32 s25, s25, s26
|
||||
; GCN-NEXT: s_mul_i32 s28, s17, s24
|
||||
; GCN-NEXT: s_mul_i32 s27, s24, s25
|
||||
; GCN-NEXT: s_mul_hi_u32 s29, s24, s28
|
||||
; GCN-NEXT: s_mul_hi_u32 s26, s24, s25
|
||||
; GCN-NEXT: s_add_u32 s27, s29, s27
|
||||
; GCN-NEXT: s_addc_u32 s26, 0, s26
|
||||
; GCN-NEXT: s_mul_hi_u32 s30, s25, s28
|
||||
; GCN-NEXT: s_mul_i32 s28, s25, s28
|
||||
; GCN-NEXT: s_mul_hi_u32 s30, s23, s28
|
||||
; GCN-NEXT: s_mul_i32 s28, s23, s28
|
||||
; GCN-NEXT: s_add_u32 s27, s27, s28
|
||||
; GCN-NEXT: s_mul_hi_u32 s29, s25, s23
|
||||
; GCN-NEXT: s_mul_hi_u32 s29, s23, s25
|
||||
; GCN-NEXT: s_addc_u32 s26, s26, s30
|
||||
; GCN-NEXT: s_addc_u32 s27, s29, 0
|
||||
; GCN-NEXT: s_mul_i32 s23, s25, s23
|
||||
; GCN-NEXT: s_add_u32 s23, s26, s23
|
||||
; GCN-NEXT: s_mul_i32 s25, s23, s25
|
||||
; GCN-NEXT: s_add_u32 s25, s26, s25
|
||||
; GCN-NEXT: s_addc_u32 s26, 0, s27
|
||||
; GCN-NEXT: s_add_u32 s27, s22, s23
|
||||
; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
|
||||
; GCN-NEXT: s_addc_u32 s25, s25, s26
|
||||
; GCN-NEXT: s_mul_i32 s22, s17, s25
|
||||
; GCN-NEXT: s_mul_hi_u32 s23, s17, s27
|
||||
; GCN-NEXT: s_add_i32 s22, s23, s22
|
||||
; GCN-NEXT: s_mul_i32 s24, s24, s27
|
||||
; GCN-NEXT: s_add_i32 s22, s22, s24
|
||||
; GCN-NEXT: s_mul_i32 s17, s17, s27
|
||||
; GCN-NEXT: s_mul_hi_u32 s24, s25, s17
|
||||
; GCN-NEXT: s_mul_i32 s26, s25, s17
|
||||
; GCN-NEXT: s_mul_i32 s29, s27, s22
|
||||
; GCN-NEXT: s_mul_hi_u32 s17, s27, s17
|
||||
; GCN-NEXT: s_mul_hi_u32 s28, s27, s22
|
||||
; GCN-NEXT: s_add_u32 s24, s24, s25
|
||||
; GCN-NEXT: s_addc_u32 s23, s23, s26
|
||||
; GCN-NEXT: s_mul_i32 s25, s17, s23
|
||||
; GCN-NEXT: s_mul_hi_u32 s26, s17, s24
|
||||
; GCN-NEXT: s_add_i32 s25, s26, s25
|
||||
; GCN-NEXT: s_mul_i32 s22, s22, s24
|
||||
; GCN-NEXT: s_add_i32 s25, s25, s22
|
||||
; GCN-NEXT: s_mul_i32 s17, s17, s24
|
||||
; GCN-NEXT: s_mul_hi_u32 s26, s23, s17
|
||||
; GCN-NEXT: s_mul_i32 s27, s23, s17
|
||||
; GCN-NEXT: s_mul_i32 s29, s24, s25
|
||||
; GCN-NEXT: s_mul_hi_u32 s17, s24, s17
|
||||
; GCN-NEXT: s_mul_hi_u32 s28, s24, s25
|
||||
; GCN-NEXT: s_add_u32 s17, s17, s29
|
||||
; GCN-NEXT: s_addc_u32 s28, 0, s28
|
||||
; GCN-NEXT: s_add_u32 s17, s17, s26
|
||||
; GCN-NEXT: s_mul_hi_u32 s23, s25, s22
|
||||
; GCN-NEXT: s_addc_u32 s17, s28, s24
|
||||
; GCN-NEXT: s_addc_u32 s23, s23, 0
|
||||
; GCN-NEXT: s_mul_i32 s22, s25, s22
|
||||
; GCN-NEXT: s_add_u32 s17, s17, s22
|
||||
; GCN-NEXT: s_addc_u32 s24, 0, s23
|
||||
; GCN-NEXT: s_add_u32 s17, s27, s17
|
||||
; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
|
||||
; GCN-NEXT: s_addc_u32 s26, s25, s24
|
||||
; GCN-NEXT: s_add_u32 s17, s17, s27
|
||||
; GCN-NEXT: s_mul_hi_u32 s22, s23, s25
|
||||
; GCN-NEXT: s_addc_u32 s17, s28, s26
|
||||
; GCN-NEXT: s_addc_u32 s22, s22, 0
|
||||
; GCN-NEXT: s_mul_i32 s25, s23, s25
|
||||
; GCN-NEXT: s_add_u32 s17, s17, s25
|
||||
; GCN-NEXT: s_addc_u32 s22, 0, s22
|
||||
; GCN-NEXT: s_add_u32 s17, s24, s17
|
||||
; GCN-NEXT: s_addc_u32 s26, s23, s22
|
||||
; GCN-NEXT: s_ashr_i32 s22, s19, 31
|
||||
; GCN-NEXT: s_add_u32 s24, s18, s22
|
||||
; GCN-NEXT: s_mov_b32 s23, s22
|
||||
@ -5021,11 +4985,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; GCN-NEXT: s_mul_i32 s17, s6, s17
|
||||
; GCN-NEXT: s_sub_u32 s17, s24, s17
|
||||
; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
|
||||
; GCN-NEXT: s_subb_u32 s24, s28, s7
|
||||
; GCN-NEXT: s_sub_u32 s30, s17, s6
|
||||
; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0
|
||||
; GCN-NEXT: s_subb_u32 s31, s24, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s31, s7
|
||||
; GCN-NEXT: s_cselect_b32 s33, -1, 0
|
||||
@ -5035,12 +4997,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; GCN-NEXT: s_cselect_b32 s33, s34, s33
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0
|
||||
; GCN-NEXT: s_subb_u32 s24, s24, s7
|
||||
; GCN-NEXT: s_sub_u32 s34, s30, s6
|
||||
; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0
|
||||
; GCN-NEXT: s_sub_u32 s28, s30, s6
|
||||
; GCN-NEXT: s_subb_u32 s24, s24, 0
|
||||
; GCN-NEXT: s_cmp_lg_u32 s33, 0
|
||||
; GCN-NEXT: s_cselect_b32 s28, s34, s30
|
||||
; GCN-NEXT: s_cselect_b32 s28, s28, s30
|
||||
; GCN-NEXT: s_cselect_b32 s24, s24, s31
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
|
||||
; GCN-NEXT: s_subb_u32 s19, s25, s19
|
||||
@ -5091,7 +5051,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s18
|
||||
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s19
|
||||
; GCN-NEXT: s_sub_u32 s13, 0, s18
|
||||
; GCN-NEXT: s_subb_u32 s22, 0, s19
|
||||
; GCN-NEXT: s_subb_u32 s20, 0, s19
|
||||
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
|
||||
; GCN-NEXT: v_rcp_f32_e32 v0, v0
|
||||
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
@ -5100,56 +5060,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
|
||||
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GCN-NEXT: v_readfirstlane_b32 s23, v1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s20, v0
|
||||
; GCN-NEXT: s_mul_i32 s21, s13, s23
|
||||
; GCN-NEXT: s_mul_hi_u32 s25, s13, s20
|
||||
; GCN-NEXT: s_mul_i32 s24, s22, s20
|
||||
; GCN-NEXT: s_add_i32 s21, s25, s21
|
||||
; GCN-NEXT: s_add_i32 s21, s21, s24
|
||||
; GCN-NEXT: s_mul_i32 s26, s13, s20
|
||||
; GCN-NEXT: s_mul_i32 s25, s20, s21
|
||||
; GCN-NEXT: s_mul_hi_u32 s27, s20, s26
|
||||
; GCN-NEXT: s_mul_hi_u32 s24, s20, s21
|
||||
; GCN-NEXT: v_readfirstlane_b32 s21, v1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s22, v0
|
||||
; GCN-NEXT: s_mul_i32 s23, s13, s21
|
||||
; GCN-NEXT: s_mul_hi_u32 s25, s13, s22
|
||||
; GCN-NEXT: s_mul_i32 s24, s20, s22
|
||||
; GCN-NEXT: s_add_i32 s23, s25, s23
|
||||
; GCN-NEXT: s_add_i32 s23, s23, s24
|
||||
; GCN-NEXT: s_mul_i32 s26, s13, s22
|
||||
; GCN-NEXT: s_mul_i32 s25, s22, s23
|
||||
; GCN-NEXT: s_mul_hi_u32 s27, s22, s26
|
||||
; GCN-NEXT: s_mul_hi_u32 s24, s22, s23
|
||||
; GCN-NEXT: s_add_u32 s25, s27, s25
|
||||
; GCN-NEXT: s_addc_u32 s24, 0, s24
|
||||
; GCN-NEXT: s_mul_hi_u32 s28, s23, s26
|
||||
; GCN-NEXT: s_mul_i32 s26, s23, s26
|
||||
; GCN-NEXT: s_mul_hi_u32 s28, s21, s26
|
||||
; GCN-NEXT: s_mul_i32 s26, s21, s26
|
||||
; GCN-NEXT: s_add_u32 s25, s25, s26
|
||||
; GCN-NEXT: s_mul_hi_u32 s27, s23, s21
|
||||
; GCN-NEXT: s_mul_hi_u32 s27, s21, s23
|
||||
; GCN-NEXT: s_addc_u32 s24, s24, s28
|
||||
; GCN-NEXT: s_addc_u32 s25, s27, 0
|
||||
; GCN-NEXT: s_mul_i32 s21, s23, s21
|
||||
; GCN-NEXT: s_add_u32 s21, s24, s21
|
||||
; GCN-NEXT: s_mul_i32 s23, s21, s23
|
||||
; GCN-NEXT: s_add_u32 s23, s24, s23
|
||||
; GCN-NEXT: s_addc_u32 s24, 0, s25
|
||||
; GCN-NEXT: s_add_u32 s25, s20, s21
|
||||
; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
|
||||
; GCN-NEXT: s_addc_u32 s23, s23, s24
|
||||
; GCN-NEXT: s_mul_i32 s20, s13, s23
|
||||
; GCN-NEXT: s_mul_hi_u32 s21, s13, s25
|
||||
; GCN-NEXT: s_add_i32 s20, s21, s20
|
||||
; GCN-NEXT: s_mul_i32 s22, s22, s25
|
||||
; GCN-NEXT: s_add_i32 s20, s20, s22
|
||||
; GCN-NEXT: s_mul_i32 s13, s13, s25
|
||||
; GCN-NEXT: s_mul_hi_u32 s22, s23, s13
|
||||
; GCN-NEXT: s_mul_i32 s24, s23, s13
|
||||
; GCN-NEXT: s_mul_i32 s27, s25, s20
|
||||
; GCN-NEXT: s_mul_hi_u32 s13, s25, s13
|
||||
; GCN-NEXT: s_mul_hi_u32 s26, s25, s20
|
||||
; GCN-NEXT: s_add_u32 s22, s22, s23
|
||||
; GCN-NEXT: s_addc_u32 s21, s21, s24
|
||||
; GCN-NEXT: s_mul_i32 s23, s13, s21
|
||||
; GCN-NEXT: s_mul_hi_u32 s24, s13, s22
|
||||
; GCN-NEXT: s_add_i32 s23, s24, s23
|
||||
; GCN-NEXT: s_mul_i32 s20, s20, s22
|
||||
; GCN-NEXT: s_add_i32 s23, s23, s20
|
||||
; GCN-NEXT: s_mul_i32 s13, s13, s22
|
||||
; GCN-NEXT: s_mul_hi_u32 s24, s21, s13
|
||||
; GCN-NEXT: s_mul_i32 s25, s21, s13
|
||||
; GCN-NEXT: s_mul_i32 s27, s22, s23
|
||||
; GCN-NEXT: s_mul_hi_u32 s13, s22, s13
|
||||
; GCN-NEXT: s_mul_hi_u32 s26, s22, s23
|
||||
; GCN-NEXT: s_add_u32 s13, s13, s27
|
||||
; GCN-NEXT: s_addc_u32 s26, 0, s26
|
||||
; GCN-NEXT: s_add_u32 s13, s13, s24
|
||||
; GCN-NEXT: s_mul_hi_u32 s21, s23, s20
|
||||
; GCN-NEXT: s_addc_u32 s13, s26, s22
|
||||
; GCN-NEXT: s_addc_u32 s21, s21, 0
|
||||
; GCN-NEXT: s_mul_i32 s20, s23, s20
|
||||
; GCN-NEXT: s_add_u32 s13, s13, s20
|
||||
; GCN-NEXT: s_addc_u32 s22, 0, s21
|
||||
; GCN-NEXT: s_add_u32 s13, s25, s13
|
||||
; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
|
||||
; GCN-NEXT: s_addc_u32 s24, s23, s22
|
||||
; GCN-NEXT: s_add_u32 s13, s13, s25
|
||||
; GCN-NEXT: s_mul_hi_u32 s20, s21, s23
|
||||
; GCN-NEXT: s_addc_u32 s13, s26, s24
|
||||
; GCN-NEXT: s_addc_u32 s20, s20, 0
|
||||
; GCN-NEXT: s_mul_i32 s23, s21, s23
|
||||
; GCN-NEXT: s_add_u32 s13, s13, s23
|
||||
; GCN-NEXT: s_addc_u32 s20, 0, s20
|
||||
; GCN-NEXT: s_add_u32 s13, s22, s13
|
||||
; GCN-NEXT: s_addc_u32 s24, s21, s20
|
||||
; GCN-NEXT: s_ashr_i32 s20, s15, 31
|
||||
; GCN-NEXT: s_add_u32 s22, s14, s20
|
||||
; GCN-NEXT: s_mov_b32 s21, s20
|
||||
@ -5178,11 +5134,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; GCN-NEXT: s_mul_i32 s13, s18, s13
|
||||
; GCN-NEXT: s_sub_u32 s13, s22, s13
|
||||
; GCN-NEXT: s_cselect_b64 s[24:25], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0
|
||||
; GCN-NEXT: s_subb_u32 s22, s26, s19
|
||||
; GCN-NEXT: s_sub_u32 s28, s13, s18
|
||||
; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
|
||||
; GCN-NEXT: s_subb_u32 s29, s22, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s29, s19
|
||||
; GCN-NEXT: s_cselect_b32 s30, -1, 0
|
||||
@ -5192,12 +5146,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; GCN-NEXT: s_cselect_b32 s30, s31, s30
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
|
||||
; GCN-NEXT: s_subb_u32 s22, s22, s19
|
||||
; GCN-NEXT: s_sub_u32 s31, s28, s18
|
||||
; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
|
||||
; GCN-NEXT: s_sub_u32 s26, s28, s18
|
||||
; GCN-NEXT: s_subb_u32 s22, s22, 0
|
||||
; GCN-NEXT: s_cmp_lg_u32 s30, 0
|
||||
; GCN-NEXT: s_cselect_b32 s26, s31, s28
|
||||
; GCN-NEXT: s_cselect_b32 s26, s26, s28
|
||||
; GCN-NEXT: s_cselect_b32 s22, s22, s29
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0
|
||||
; GCN-NEXT: s_subb_u32 s15, s23, s15
|
||||
@ -5257,7 +5209,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s14
|
||||
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s15
|
||||
; GCN-NEXT: s_sub_u32 s9, 0, s14
|
||||
; GCN-NEXT: s_subb_u32 s18, 0, s15
|
||||
; GCN-NEXT: s_subb_u32 s16, 0, s15
|
||||
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
|
||||
; GCN-NEXT: v_rcp_f32_e32 v0, v0
|
||||
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
@ -5266,56 +5218,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
|
||||
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GCN-NEXT: v_readfirstlane_b32 s19, v1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s16, v0
|
||||
; GCN-NEXT: s_mul_i32 s17, s9, s19
|
||||
; GCN-NEXT: s_mul_hi_u32 s21, s9, s16
|
||||
; GCN-NEXT: s_mul_i32 s20, s18, s16
|
||||
; GCN-NEXT: s_add_i32 s17, s21, s17
|
||||
; GCN-NEXT: s_add_i32 s17, s17, s20
|
||||
; GCN-NEXT: s_mul_i32 s22, s9, s16
|
||||
; GCN-NEXT: s_mul_i32 s21, s16, s17
|
||||
; GCN-NEXT: s_mul_hi_u32 s23, s16, s22
|
||||
; GCN-NEXT: s_mul_hi_u32 s20, s16, s17
|
||||
; GCN-NEXT: v_readfirstlane_b32 s17, v1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s18, v0
|
||||
; GCN-NEXT: s_mul_i32 s19, s9, s17
|
||||
; GCN-NEXT: s_mul_hi_u32 s21, s9, s18
|
||||
; GCN-NEXT: s_mul_i32 s20, s16, s18
|
||||
; GCN-NEXT: s_add_i32 s19, s21, s19
|
||||
; GCN-NEXT: s_add_i32 s19, s19, s20
|
||||
; GCN-NEXT: s_mul_i32 s22, s9, s18
|
||||
; GCN-NEXT: s_mul_i32 s21, s18, s19
|
||||
; GCN-NEXT: s_mul_hi_u32 s23, s18, s22
|
||||
; GCN-NEXT: s_mul_hi_u32 s20, s18, s19
|
||||
; GCN-NEXT: s_add_u32 s21, s23, s21
|
||||
; GCN-NEXT: s_addc_u32 s20, 0, s20
|
||||
; GCN-NEXT: s_mul_hi_u32 s24, s19, s22
|
||||
; GCN-NEXT: s_mul_i32 s22, s19, s22
|
||||
; GCN-NEXT: s_mul_hi_u32 s24, s17, s22
|
||||
; GCN-NEXT: s_mul_i32 s22, s17, s22
|
||||
; GCN-NEXT: s_add_u32 s21, s21, s22
|
||||
; GCN-NEXT: s_mul_hi_u32 s23, s19, s17
|
||||
; GCN-NEXT: s_mul_hi_u32 s23, s17, s19
|
||||
; GCN-NEXT: s_addc_u32 s20, s20, s24
|
||||
; GCN-NEXT: s_addc_u32 s21, s23, 0
|
||||
; GCN-NEXT: s_mul_i32 s17, s19, s17
|
||||
; GCN-NEXT: s_add_u32 s17, s20, s17
|
||||
; GCN-NEXT: s_mul_i32 s19, s17, s19
|
||||
; GCN-NEXT: s_add_u32 s19, s20, s19
|
||||
; GCN-NEXT: s_addc_u32 s20, 0, s21
|
||||
; GCN-NEXT: s_add_u32 s21, s16, s17
|
||||
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
|
||||
; GCN-NEXT: s_addc_u32 s19, s19, s20
|
||||
; GCN-NEXT: s_mul_i32 s16, s9, s19
|
||||
; GCN-NEXT: s_mul_hi_u32 s17, s9, s21
|
||||
; GCN-NEXT: s_add_i32 s16, s17, s16
|
||||
; GCN-NEXT: s_mul_i32 s18, s18, s21
|
||||
; GCN-NEXT: s_add_i32 s16, s16, s18
|
||||
; GCN-NEXT: s_mul_i32 s9, s9, s21
|
||||
; GCN-NEXT: s_mul_hi_u32 s18, s19, s9
|
||||
; GCN-NEXT: s_mul_i32 s20, s19, s9
|
||||
; GCN-NEXT: s_mul_i32 s23, s21, s16
|
||||
; GCN-NEXT: s_mul_hi_u32 s9, s21, s9
|
||||
; GCN-NEXT: s_mul_hi_u32 s22, s21, s16
|
||||
; GCN-NEXT: s_add_u32 s18, s18, s19
|
||||
; GCN-NEXT: s_addc_u32 s17, s17, s20
|
||||
; GCN-NEXT: s_mul_i32 s19, s9, s17
|
||||
; GCN-NEXT: s_mul_hi_u32 s20, s9, s18
|
||||
; GCN-NEXT: s_add_i32 s19, s20, s19
|
||||
; GCN-NEXT: s_mul_i32 s16, s16, s18
|
||||
; GCN-NEXT: s_add_i32 s19, s19, s16
|
||||
; GCN-NEXT: s_mul_i32 s9, s9, s18
|
||||
; GCN-NEXT: s_mul_hi_u32 s20, s17, s9
|
||||
; GCN-NEXT: s_mul_i32 s21, s17, s9
|
||||
; GCN-NEXT: s_mul_i32 s23, s18, s19
|
||||
; GCN-NEXT: s_mul_hi_u32 s9, s18, s9
|
||||
; GCN-NEXT: s_mul_hi_u32 s22, s18, s19
|
||||
; GCN-NEXT: s_add_u32 s9, s9, s23
|
||||
; GCN-NEXT: s_addc_u32 s22, 0, s22
|
||||
; GCN-NEXT: s_add_u32 s9, s9, s20
|
||||
; GCN-NEXT: s_mul_hi_u32 s17, s19, s16
|
||||
; GCN-NEXT: s_addc_u32 s9, s22, s18
|
||||
; GCN-NEXT: s_addc_u32 s17, s17, 0
|
||||
; GCN-NEXT: s_mul_i32 s16, s19, s16
|
||||
; GCN-NEXT: s_add_u32 s9, s9, s16
|
||||
; GCN-NEXT: s_addc_u32 s18, 0, s17
|
||||
; GCN-NEXT: s_add_u32 s9, s21, s9
|
||||
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
|
||||
; GCN-NEXT: s_addc_u32 s20, s19, s18
|
||||
; GCN-NEXT: s_add_u32 s9, s9, s21
|
||||
; GCN-NEXT: s_mul_hi_u32 s16, s17, s19
|
||||
; GCN-NEXT: s_addc_u32 s9, s22, s20
|
||||
; GCN-NEXT: s_addc_u32 s16, s16, 0
|
||||
; GCN-NEXT: s_mul_i32 s19, s17, s19
|
||||
; GCN-NEXT: s_add_u32 s9, s9, s19
|
||||
; GCN-NEXT: s_addc_u32 s16, 0, s16
|
||||
; GCN-NEXT: s_add_u32 s9, s18, s9
|
||||
; GCN-NEXT: s_addc_u32 s20, s17, s16
|
||||
; GCN-NEXT: s_ashr_i32 s16, s11, 31
|
||||
; GCN-NEXT: s_add_u32 s18, s10, s16
|
||||
; GCN-NEXT: s_mov_b32 s17, s16
|
||||
@ -5344,11 +5292,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; GCN-NEXT: s_mul_i32 s9, s14, s9
|
||||
; GCN-NEXT: s_sub_u32 s9, s18, s9
|
||||
; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
|
||||
; GCN-NEXT: s_subb_u32 s18, s22, s15
|
||||
; GCN-NEXT: s_sub_u32 s24, s9, s14
|
||||
; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
|
||||
; GCN-NEXT: s_subb_u32 s25, s18, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s25, s15
|
||||
; GCN-NEXT: s_cselect_b32 s26, -1, 0
|
||||
@ -5358,12 +5304,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; GCN-NEXT: s_cselect_b32 s26, s27, s26
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
|
||||
; GCN-NEXT: s_subb_u32 s18, s18, s15
|
||||
; GCN-NEXT: s_sub_u32 s27, s24, s14
|
||||
; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
|
||||
; GCN-NEXT: s_sub_u32 s22, s24, s14
|
||||
; GCN-NEXT: s_subb_u32 s18, s18, 0
|
||||
; GCN-NEXT: s_cmp_lg_u32 s26, 0
|
||||
; GCN-NEXT: s_cselect_b32 s22, s27, s24
|
||||
; GCN-NEXT: s_cselect_b32 s22, s22, s24
|
||||
; GCN-NEXT: s_cselect_b32 s18, s18, s25
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
|
||||
; GCN-NEXT: s_subb_u32 s11, s19, s11
|
||||
@ -5420,7 +5364,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10
|
||||
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11
|
||||
; GCN-NEXT: s_sub_u32 s3, 0, s10
|
||||
; GCN-NEXT: s_subb_u32 s14, 0, s11
|
||||
; GCN-NEXT: s_subb_u32 s12, 0, s11
|
||||
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
|
||||
; GCN-NEXT: v_rcp_f32_e32 v0, v0
|
||||
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
@ -5429,56 +5373,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
|
||||
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GCN-NEXT: v_readfirstlane_b32 s15, v1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s12, v0
|
||||
; GCN-NEXT: s_mul_i32 s13, s3, s15
|
||||
; GCN-NEXT: s_mul_hi_u32 s17, s3, s12
|
||||
; GCN-NEXT: s_mul_i32 s16, s14, s12
|
||||
; GCN-NEXT: s_add_i32 s13, s17, s13
|
||||
; GCN-NEXT: s_add_i32 s13, s13, s16
|
||||
; GCN-NEXT: s_mul_i32 s18, s3, s12
|
||||
; GCN-NEXT: s_mul_i32 s17, s12, s13
|
||||
; GCN-NEXT: s_mul_hi_u32 s19, s12, s18
|
||||
; GCN-NEXT: s_mul_hi_u32 s16, s12, s13
|
||||
; GCN-NEXT: v_readfirstlane_b32 s13, v1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s14, v0
|
||||
; GCN-NEXT: s_mul_i32 s15, s3, s13
|
||||
; GCN-NEXT: s_mul_hi_u32 s17, s3, s14
|
||||
; GCN-NEXT: s_mul_i32 s16, s12, s14
|
||||
; GCN-NEXT: s_add_i32 s15, s17, s15
|
||||
; GCN-NEXT: s_add_i32 s15, s15, s16
|
||||
; GCN-NEXT: s_mul_i32 s18, s3, s14
|
||||
; GCN-NEXT: s_mul_i32 s17, s14, s15
|
||||
; GCN-NEXT: s_mul_hi_u32 s19, s14, s18
|
||||
; GCN-NEXT: s_mul_hi_u32 s16, s14, s15
|
||||
; GCN-NEXT: s_add_u32 s17, s19, s17
|
||||
; GCN-NEXT: s_addc_u32 s16, 0, s16
|
||||
; GCN-NEXT: s_mul_hi_u32 s20, s15, s18
|
||||
; GCN-NEXT: s_mul_i32 s18, s15, s18
|
||||
; GCN-NEXT: s_mul_hi_u32 s20, s13, s18
|
||||
; GCN-NEXT: s_mul_i32 s18, s13, s18
|
||||
; GCN-NEXT: s_add_u32 s17, s17, s18
|
||||
; GCN-NEXT: s_mul_hi_u32 s19, s15, s13
|
||||
; GCN-NEXT: s_mul_hi_u32 s19, s13, s15
|
||||
; GCN-NEXT: s_addc_u32 s16, s16, s20
|
||||
; GCN-NEXT: s_addc_u32 s17, s19, 0
|
||||
; GCN-NEXT: s_mul_i32 s13, s15, s13
|
||||
; GCN-NEXT: s_add_u32 s13, s16, s13
|
||||
; GCN-NEXT: s_mul_i32 s15, s13, s15
|
||||
; GCN-NEXT: s_add_u32 s15, s16, s15
|
||||
; GCN-NEXT: s_addc_u32 s16, 0, s17
|
||||
; GCN-NEXT: s_add_u32 s17, s12, s13
|
||||
; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
|
||||
; GCN-NEXT: s_addc_u32 s15, s15, s16
|
||||
; GCN-NEXT: s_mul_i32 s12, s3, s15
|
||||
; GCN-NEXT: s_mul_hi_u32 s13, s3, s17
|
||||
; GCN-NEXT: s_add_i32 s12, s13, s12
|
||||
; GCN-NEXT: s_mul_i32 s14, s14, s17
|
||||
; GCN-NEXT: s_add_i32 s12, s12, s14
|
||||
; GCN-NEXT: s_mul_i32 s3, s3, s17
|
||||
; GCN-NEXT: s_mul_hi_u32 s14, s15, s3
|
||||
; GCN-NEXT: s_mul_i32 s16, s15, s3
|
||||
; GCN-NEXT: s_mul_i32 s19, s17, s12
|
||||
; GCN-NEXT: s_mul_hi_u32 s3, s17, s3
|
||||
; GCN-NEXT: s_mul_hi_u32 s18, s17, s12
|
||||
; GCN-NEXT: s_add_u32 s14, s14, s15
|
||||
; GCN-NEXT: s_addc_u32 s13, s13, s16
|
||||
; GCN-NEXT: s_mul_i32 s15, s3, s13
|
||||
; GCN-NEXT: s_mul_hi_u32 s16, s3, s14
|
||||
; GCN-NEXT: s_add_i32 s15, s16, s15
|
||||
; GCN-NEXT: s_mul_i32 s12, s12, s14
|
||||
; GCN-NEXT: s_add_i32 s15, s15, s12
|
||||
; GCN-NEXT: s_mul_i32 s3, s3, s14
|
||||
; GCN-NEXT: s_mul_hi_u32 s16, s13, s3
|
||||
; GCN-NEXT: s_mul_i32 s17, s13, s3
|
||||
; GCN-NEXT: s_mul_i32 s19, s14, s15
|
||||
; GCN-NEXT: s_mul_hi_u32 s3, s14, s3
|
||||
; GCN-NEXT: s_mul_hi_u32 s18, s14, s15
|
||||
; GCN-NEXT: s_add_u32 s3, s3, s19
|
||||
; GCN-NEXT: s_addc_u32 s18, 0, s18
|
||||
; GCN-NEXT: s_add_u32 s3, s3, s16
|
||||
; GCN-NEXT: s_mul_hi_u32 s13, s15, s12
|
||||
; GCN-NEXT: s_addc_u32 s3, s18, s14
|
||||
; GCN-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GCN-NEXT: s_mul_i32 s12, s15, s12
|
||||
; GCN-NEXT: s_add_u32 s3, s3, s12
|
||||
; GCN-NEXT: s_addc_u32 s14, 0, s13
|
||||
; GCN-NEXT: s_add_u32 s3, s17, s3
|
||||
; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
|
||||
; GCN-NEXT: s_addc_u32 s16, s15, s14
|
||||
; GCN-NEXT: s_add_u32 s3, s3, s17
|
||||
; GCN-NEXT: s_mul_hi_u32 s12, s13, s15
|
||||
; GCN-NEXT: s_addc_u32 s3, s18, s16
|
||||
; GCN-NEXT: s_addc_u32 s12, s12, 0
|
||||
; GCN-NEXT: s_mul_i32 s15, s13, s15
|
||||
; GCN-NEXT: s_add_u32 s3, s3, s15
|
||||
; GCN-NEXT: s_addc_u32 s12, 0, s12
|
||||
; GCN-NEXT: s_add_u32 s3, s14, s3
|
||||
; GCN-NEXT: s_addc_u32 s16, s13, s12
|
||||
; GCN-NEXT: s_ashr_i32 s12, s5, 31
|
||||
; GCN-NEXT: s_add_u32 s14, s4, s12
|
||||
; GCN-NEXT: s_mov_b32 s13, s12
|
||||
@ -5507,11 +5447,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; GCN-NEXT: s_mul_i32 s3, s10, s3
|
||||
; GCN-NEXT: s_sub_u32 s3, s14, s3
|
||||
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
|
||||
; GCN-NEXT: s_subb_u32 s14, s18, s11
|
||||
; GCN-NEXT: s_sub_u32 s20, s3, s10
|
||||
; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
|
||||
; GCN-NEXT: s_subb_u32 s21, s14, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s21, s11
|
||||
; GCN-NEXT: s_cselect_b32 s22, -1, 0
|
||||
@ -5521,12 +5459,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; GCN-NEXT: s_cselect_b32 s22, s23, s22
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
|
||||
; GCN-NEXT: s_subb_u32 s14, s14, s11
|
||||
; GCN-NEXT: s_sub_u32 s23, s20, s10
|
||||
; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
|
||||
; GCN-NEXT: s_sub_u32 s18, s20, s10
|
||||
; GCN-NEXT: s_subb_u32 s14, s14, 0
|
||||
; GCN-NEXT: s_cmp_lg_u32 s22, 0
|
||||
; GCN-NEXT: s_cselect_b32 s18, s23, s20
|
||||
; GCN-NEXT: s_cselect_b32 s18, s18, s20
|
||||
; GCN-NEXT: s_cselect_b32 s14, s14, s21
|
||||
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
|
||||
; GCN-NEXT: s_subb_u32 s5, s15, s5
|
||||
@ -6299,11 +6235,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; TONGA-NEXT: v_readfirstlane_b32 s14, v8
|
||||
; TONGA-NEXT: s_sub_u32 s12, s12, s14
|
||||
; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0
|
||||
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
|
||||
; TONGA-NEXT: s_subb_u32 s1, s1, s7
|
||||
; TONGA-NEXT: s_sub_u32 s18, s12, s6
|
||||
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
|
||||
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
|
||||
; TONGA-NEXT: s_subb_u32 s19, s1, 0
|
||||
; TONGA-NEXT: s_cmp_ge_u32 s19, s7
|
||||
; TONGA-NEXT: s_cselect_b32 s20, -1, 0
|
||||
@ -6313,12 +6247,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
|
||||
; TONGA-NEXT: s_cselect_b32 s20, s21, s20
|
||||
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
|
||||
; TONGA-NEXT: s_subb_u32 s1, s1, s7
|
||||
; TONGA-NEXT: s_sub_u32 s21, s18, s6
|
||||
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
|
||||
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
|
||||
; TONGA-NEXT: s_sub_u32 s16, s18, s6
|
||||
; TONGA-NEXT: s_subb_u32 s1, s1, 0
|
||||
; TONGA-NEXT: s_cmp_lg_u32 s20, 0
|
||||
; TONGA-NEXT: s_cselect_b32 s16, s21, s18
|
||||
; TONGA-NEXT: s_cselect_b32 s16, s16, s18
|
||||
; TONGA-NEXT: s_cselect_b32 s1, s1, s19
|
||||
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
|
||||
; TONGA-NEXT: s_subb_u32 s3, s13, s3
|
||||
|
||||
@ -51,10 +51,9 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
|
||||
; GCN-NEXT: s_addc_u32 s13, 0, s14
|
||||
; GCN-NEXT: s_add_u32 s14, s0, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s14
|
||||
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
|
||||
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s0, s0, s1
|
||||
; GCN-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GCN-NEXT: s_addc_u32 s12, s12, s13
|
||||
; GCN-NEXT: s_mul_i32 s0, s10, s12
|
||||
; GCN-NEXT: v_readfirstlane_b32 s1, v0
|
||||
@ -85,7 +84,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
|
||||
; GCN-NEXT: s_add_u32 s11, s14, s0
|
||||
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s0, s0, s1
|
||||
; GCN-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GCN-NEXT: s_addc_u32 s1, s12, s10
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GCN-NEXT: v_mul_hi_u32 v1, s6, v0
|
||||
@ -115,46 +113,43 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
|
||||
; GCN-NEXT: v_readfirstlane_b32 s10, v0
|
||||
; GCN-NEXT: s_add_i32 s5, s10, s5
|
||||
; GCN-NEXT: s_mul_i32 s10, s9, s4
|
||||
; GCN-NEXT: s_add_i32 s10, s5, s10
|
||||
; GCN-NEXT: s_sub_i32 s11, s7, s10
|
||||
; GCN-NEXT: s_add_i32 s12, s5, s10
|
||||
; GCN-NEXT: s_sub_i32 s10, s7, s12
|
||||
; GCN-NEXT: s_mul_i32 s4, s8, s4
|
||||
; GCN-NEXT: s_sub_u32 s6, s6, s4
|
||||
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s12, s4, s5
|
||||
; GCN-NEXT: s_cmp_lg_u32 s12, 0
|
||||
; GCN-NEXT: s_subb_u32 s11, s11, s9
|
||||
; GCN-NEXT: s_sub_u32 s13, s6, s8
|
||||
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s11, s4, s5
|
||||
; GCN-NEXT: s_subb_u32 s13, s10, s9
|
||||
; GCN-NEXT: s_sub_u32 s14, s6, s8
|
||||
; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s15, s10, s11
|
||||
; GCN-NEXT: s_subb_u32 s15, s13, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s15, s9
|
||||
; GCN-NEXT: s_cselect_b32 s16, -1, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s14, s8
|
||||
; GCN-NEXT: s_cselect_b32 s17, -1, 0
|
||||
; GCN-NEXT: s_cmp_eq_u32 s15, s9
|
||||
; GCN-NEXT: s_cselect_b32 s16, s17, s16
|
||||
; GCN-NEXT: s_or_b32 s10, s10, s11
|
||||
; GCN-NEXT: s_subb_u32 s13, s13, s9
|
||||
; GCN-NEXT: s_sub_u32 s17, s14, s8
|
||||
; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s10, s10, s11
|
||||
; GCN-NEXT: s_subb_u32 s10, s13, 0
|
||||
; GCN-NEXT: s_cmp_lg_u32 s16, 0
|
||||
; GCN-NEXT: s_cselect_b32 s11, s17, s14
|
||||
; GCN-NEXT: s_cselect_b32 s10, s10, s15
|
||||
; GCN-NEXT: s_or_b32 s4, s4, s5
|
||||
; GCN-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GCN-NEXT: s_subb_u32 s14, s11, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s14, s9
|
||||
; GCN-NEXT: s_subb_u32 s4, s7, s12
|
||||
; GCN-NEXT: s_cmp_ge_u32 s4, s9
|
||||
; GCN-NEXT: s_cselect_b32 s5, -1, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s13, s8
|
||||
; GCN-NEXT: s_cselect_b32 s15, -1, 0
|
||||
; GCN-NEXT: s_cmp_eq_u32 s14, s9
|
||||
; GCN-NEXT: s_cselect_b32 s15, s15, s5
|
||||
; GCN-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GCN-NEXT: s_subb_u32 s11, s11, s9
|
||||
; GCN-NEXT: s_sub_u32 s16, s13, s8
|
||||
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s4, s4, s5
|
||||
; GCN-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GCN-NEXT: s_subb_u32 s4, s11, 0
|
||||
; GCN-NEXT: s_cmp_lg_u32 s15, 0
|
||||
; GCN-NEXT: s_cselect_b32 s5, s16, s13
|
||||
; GCN-NEXT: s_cselect_b32 s4, s4, s14
|
||||
; GCN-NEXT: s_cmp_lg_u32 s12, 0
|
||||
; GCN-NEXT: s_subb_u32 s7, s7, s10
|
||||
; GCN-NEXT: s_cmp_ge_u32 s7, s9
|
||||
; GCN-NEXT: s_cselect_b32 s10, -1, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s6, s8
|
||||
; GCN-NEXT: s_cselect_b32 s8, -1, 0
|
||||
; GCN-NEXT: s_cmp_eq_u32 s7, s9
|
||||
; GCN-NEXT: s_cselect_b32 s8, s8, s10
|
||||
; GCN-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GCN-NEXT: s_cselect_b32 s4, s4, s7
|
||||
; GCN-NEXT: s_cselect_b32 s5, s5, s6
|
||||
; GCN-NEXT: s_cselect_b32 s7, -1, 0
|
||||
; GCN-NEXT: s_cmp_eq_u32 s4, s9
|
||||
; GCN-NEXT: s_cselect_b32 s5, s7, s5
|
||||
; GCN-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GCN-NEXT: s_cselect_b32 s4, s10, s4
|
||||
; GCN-NEXT: s_cselect_b32 s5, s11, s6
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
@ -187,7 +182,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
|
||||
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
|
||||
; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
|
||||
@ -221,7 +215,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
|
||||
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
|
||||
; GCN-IR-NEXT: s_or_b32 s18, s18, s19
|
||||
; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0
|
||||
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
|
||||
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
|
||||
@ -1016,10 +1009,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
|
||||
; GCN-NEXT: s_addc_u32 s13, 0, s14
|
||||
; GCN-NEXT: s_add_u32 s14, s8, s9
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s14
|
||||
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
|
||||
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s8, s8, s9
|
||||
; GCN-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GCN-NEXT: s_addc_u32 s12, s12, s13
|
||||
; GCN-NEXT: s_mul_i32 s8, s10, s12
|
||||
; GCN-NEXT: v_readfirstlane_b32 s9, v0
|
||||
@ -1050,7 +1042,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
|
||||
; GCN-NEXT: s_add_u32 s11, s14, s8
|
||||
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s8, s8, s9
|
||||
; GCN-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GCN-NEXT: s_addc_u32 s10, s12, s10
|
||||
; GCN-NEXT: s_ashr_i32 s8, s7, 31
|
||||
; GCN-NEXT: s_add_u32 s6, s6, s8
|
||||
@ -1083,46 +1074,43 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
|
||||
; GCN-NEXT: v_readfirstlane_b32 s12, v0
|
||||
; GCN-NEXT: s_add_i32 s11, s12, s11
|
||||
; GCN-NEXT: s_mul_i32 s12, s5, s10
|
||||
; GCN-NEXT: s_add_i32 s12, s11, s12
|
||||
; GCN-NEXT: s_sub_i32 s13, s7, s12
|
||||
; GCN-NEXT: s_add_i32 s14, s11, s12
|
||||
; GCN-NEXT: s_sub_i32 s12, s7, s14
|
||||
; GCN-NEXT: s_mul_i32 s10, s4, s10
|
||||
; GCN-NEXT: s_sub_u32 s6, s6, s10
|
||||
; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s14, s10, s11
|
||||
; GCN-NEXT: s_cmp_lg_u32 s14, 0
|
||||
; GCN-NEXT: s_subb_u32 s13, s13, s5
|
||||
; GCN-NEXT: s_sub_u32 s15, s6, s4
|
||||
; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s13, s10, s11
|
||||
; GCN-NEXT: s_subb_u32 s15, s12, s5
|
||||
; GCN-NEXT: s_sub_u32 s16, s6, s4
|
||||
; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s17, s12, s13
|
||||
; GCN-NEXT: s_subb_u32 s17, s15, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s17, s5
|
||||
; GCN-NEXT: s_cselect_b32 s18, -1, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s16, s4
|
||||
; GCN-NEXT: s_cselect_b32 s19, -1, 0
|
||||
; GCN-NEXT: s_cmp_eq_u32 s17, s5
|
||||
; GCN-NEXT: s_cselect_b32 s18, s19, s18
|
||||
; GCN-NEXT: s_or_b32 s12, s12, s13
|
||||
; GCN-NEXT: s_subb_u32 s15, s15, s5
|
||||
; GCN-NEXT: s_sub_u32 s19, s16, s4
|
||||
; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s12, s12, s13
|
||||
; GCN-NEXT: s_subb_u32 s12, s15, 0
|
||||
; GCN-NEXT: s_cmp_lg_u32 s18, 0
|
||||
; GCN-NEXT: s_cselect_b32 s13, s19, s16
|
||||
; GCN-NEXT: s_cselect_b32 s12, s12, s17
|
||||
; GCN-NEXT: s_or_b32 s10, s10, s11
|
||||
; GCN-NEXT: s_cmp_lg_u32 s10, 0
|
||||
; GCN-NEXT: s_subb_u32 s16, s13, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s16, s5
|
||||
; GCN-NEXT: s_cselect_b32 s11, -1, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s15, s4
|
||||
; GCN-NEXT: s_cselect_b32 s17, -1, 0
|
||||
; GCN-NEXT: s_cmp_eq_u32 s16, s5
|
||||
; GCN-NEXT: s_cselect_b32 s17, s17, s11
|
||||
; GCN-NEXT: s_cmp_lg_u32 s10, 0
|
||||
; GCN-NEXT: s_subb_u32 s13, s13, s5
|
||||
; GCN-NEXT: s_sub_u32 s18, s15, s4
|
||||
; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s10, s10, s11
|
||||
; GCN-NEXT: s_cmp_lg_u32 s10, 0
|
||||
; GCN-NEXT: s_subb_u32 s10, s13, 0
|
||||
; GCN-NEXT: s_cmp_lg_u32 s17, 0
|
||||
; GCN-NEXT: s_cselect_b32 s11, s18, s15
|
||||
; GCN-NEXT: s_cselect_b32 s10, s10, s16
|
||||
; GCN-NEXT: s_cmp_lg_u32 s14, 0
|
||||
; GCN-NEXT: s_subb_u32 s7, s7, s12
|
||||
; GCN-NEXT: s_subb_u32 s7, s7, s14
|
||||
; GCN-NEXT: s_cmp_ge_u32 s7, s5
|
||||
; GCN-NEXT: s_cselect_b32 s12, -1, 0
|
||||
; GCN-NEXT: s_cselect_b32 s10, -1, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s6, s4
|
||||
; GCN-NEXT: s_cselect_b32 s4, -1, 0
|
||||
; GCN-NEXT: s_cmp_eq_u32 s7, s5
|
||||
; GCN-NEXT: s_cselect_b32 s4, s4, s12
|
||||
; GCN-NEXT: s_cselect_b32 s4, s4, s10
|
||||
; GCN-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GCN-NEXT: s_cselect_b32 s5, s10, s7
|
||||
; GCN-NEXT: s_cselect_b32 s4, s11, s6
|
||||
; GCN-NEXT: s_cselect_b32 s5, s12, s7
|
||||
; GCN-NEXT: s_cselect_b32 s4, s13, s6
|
||||
; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
|
||||
; GCN-NEXT: s_sub_u32 s4, s4, s8
|
||||
; GCN-NEXT: s_subb_u32 s5, s5, s8
|
||||
@ -1170,7 +1158,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
|
||||
; GCN-IR-NEXT: s_add_u32 s16, s14, 1
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
|
||||
; GCN-IR-NEXT: s_or_b32 s10, s10, s11
|
||||
; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0
|
||||
; GCN-IR-NEXT: s_addc_u32 s10, s15, 0
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
|
||||
; GCN-IR-NEXT: s_sub_i32 s14, 63, s14
|
||||
@ -1204,7 +1191,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
|
||||
; GCN-IR-NEXT: s_add_u32 s18, s18, 1
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
|
||||
; GCN-IR-NEXT: s_or_b32 s20, s20, s21
|
||||
; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0
|
||||
; GCN-IR-NEXT: s_addc_u32 s19, s19, 0
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
|
||||
; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3]
|
||||
@ -1369,10 +1355,9 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
|
||||
; GCN-NEXT: s_addc_u32 s10, 0, s11
|
||||
; GCN-NEXT: s_add_u32 s11, s6, s7
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s11
|
||||
; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
|
||||
; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s6, s6, s7
|
||||
; GCN-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GCN-NEXT: s_addc_u32 s9, s9, s10
|
||||
; GCN-NEXT: s_mul_i32 s6, s2, s9
|
||||
; GCN-NEXT: v_readfirstlane_b32 s7, v0
|
||||
@ -1403,7 +1388,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
|
||||
; GCN-NEXT: s_add_u32 s2, s11, s2
|
||||
; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s6, s6, s7
|
||||
; GCN-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GCN-NEXT: s_addc_u32 s6, s9, s8
|
||||
; GCN-NEXT: v_mul_hi_u32 v1, s2, 24
|
||||
; GCN-NEXT: v_mul_hi_u32 v0, s6, 24
|
||||
@ -1418,45 +1402,42 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
|
||||
; GCN-NEXT: s_mul_i32 s7, s5, s6
|
||||
; GCN-NEXT: s_mul_i32 s6, s4, s6
|
||||
; GCN-NEXT: v_readfirstlane_b32 s8, v0
|
||||
; GCN-NEXT: s_add_i32 s8, s8, s7
|
||||
; GCN-NEXT: s_sub_i32 s9, 0, s8
|
||||
; GCN-NEXT: s_sub_u32 s10, 24, s6
|
||||
; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s11, s6, s7
|
||||
; GCN-NEXT: s_cmp_lg_u32 s11, 0
|
||||
; GCN-NEXT: s_subb_u32 s9, s9, s5
|
||||
; GCN-NEXT: s_sub_u32 s12, s10, s4
|
||||
; GCN-NEXT: s_add_i32 s10, s8, s7
|
||||
; GCN-NEXT: s_sub_i32 s8, 0, s10
|
||||
; GCN-NEXT: s_sub_u32 s11, 24, s6
|
||||
; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s9, s6, s7
|
||||
; GCN-NEXT: s_subb_u32 s12, s8, s5
|
||||
; GCN-NEXT: s_sub_u32 s13, s11, s4
|
||||
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s14, s8, s9
|
||||
; GCN-NEXT: s_subb_u32 s14, s12, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s14, s5
|
||||
; GCN-NEXT: s_cselect_b32 s15, -1, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s13, s4
|
||||
; GCN-NEXT: s_cselect_b32 s16, -1, 0
|
||||
; GCN-NEXT: s_cmp_eq_u32 s14, s5
|
||||
; GCN-NEXT: s_cselect_b32 s15, s16, s15
|
||||
; GCN-NEXT: s_or_b32 s8, s8, s9
|
||||
; GCN-NEXT: s_subb_u32 s12, s12, s5
|
||||
; GCN-NEXT: s_sub_u32 s16, s13, s4
|
||||
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s8, s8, s9
|
||||
; GCN-NEXT: s_subb_u32 s8, s12, 0
|
||||
; GCN-NEXT: s_cmp_lg_u32 s15, 0
|
||||
; GCN-NEXT: s_cselect_b32 s9, s16, s13
|
||||
; GCN-NEXT: s_cselect_b32 s8, s8, s14
|
||||
; GCN-NEXT: s_or_b32 s6, s6, s7
|
||||
; GCN-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GCN-NEXT: s_subb_u32 s13, s9, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s13, s5
|
||||
; GCN-NEXT: s_subb_u32 s6, 0, s10
|
||||
; GCN-NEXT: s_cmp_ge_u32 s6, s5
|
||||
; GCN-NEXT: s_cselect_b32 s7, -1, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s12, s4
|
||||
; GCN-NEXT: s_cselect_b32 s14, -1, 0
|
||||
; GCN-NEXT: s_cmp_eq_u32 s13, s5
|
||||
; GCN-NEXT: s_cselect_b32 s14, s14, s7
|
||||
; GCN-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GCN-NEXT: s_subb_u32 s9, s9, s5
|
||||
; GCN-NEXT: s_sub_u32 s15, s12, s4
|
||||
; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s6, s6, s7
|
||||
; GCN-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GCN-NEXT: s_subb_u32 s6, s9, 0
|
||||
; GCN-NEXT: s_cmp_lg_u32 s14, 0
|
||||
; GCN-NEXT: s_cselect_b32 s7, s15, s12
|
||||
; GCN-NEXT: s_cselect_b32 s6, s6, s13
|
||||
; GCN-NEXT: s_cmp_lg_u32 s11, 0
|
||||
; GCN-NEXT: s_subb_u32 s8, 0, s8
|
||||
; GCN-NEXT: s_cmp_ge_u32 s8, s5
|
||||
; GCN-NEXT: s_cselect_b32 s9, -1, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s10, s4
|
||||
; GCN-NEXT: s_cmp_ge_u32 s11, s4
|
||||
; GCN-NEXT: s_cselect_b32 s4, -1, 0
|
||||
; GCN-NEXT: s_cmp_eq_u32 s8, s5
|
||||
; GCN-NEXT: s_cselect_b32 s4, s4, s9
|
||||
; GCN-NEXT: s_cmp_eq_u32 s6, s5
|
||||
; GCN-NEXT: s_cselect_b32 s4, s4, s7
|
||||
; GCN-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GCN-NEXT: s_cselect_b32 s4, s6, s8
|
||||
; GCN-NEXT: s_cselect_b32 s5, s7, s10
|
||||
; GCN-NEXT: s_cselect_b32 s4, s8, s6
|
||||
; GCN-NEXT: s_cselect_b32 s5, s9, s11
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
@ -1489,7 +1470,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
|
||||
; GCN-IR-NEXT: s_add_u32 s8, s2, 1
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
|
||||
; GCN-IR-NEXT: s_or_b32 s9, s10, s11
|
||||
; GCN-IR-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GCN-IR-NEXT: s_addc_u32 s3, s3, 0
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
|
||||
; GCN-IR-NEXT: s_sub_i32 s2, 63, s2
|
||||
@ -1522,7 +1502,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
|
||||
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
|
||||
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
|
||||
; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
|
||||
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
|
||||
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[6:7]
|
||||
|
||||
@ -18,7 +18,6 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; SI-NEXT: s_or_b32 s0, s0, s1
|
||||
; SI-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; SI-NEXT: s_addc_u32 s3, s3, s9
|
||||
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
@ -35,10 +34,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: s_add_u32 s2, s2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; VI-NEXT: s_addc_u32 s3, s3, s5
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s3
|
||||
@ -53,14 +50,12 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
|
||||
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_add_u32 s6, s2, s6
|
||||
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
|
||||
; GFX9-NEXT: s_addc_u32 s4, s3, s7
|
||||
; GFX9-NEXT: s_add_u32 s4, s2, s6
|
||||
; GFX9-NEXT: s_addc_u32 s5, s3, s7
|
||||
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
@ -73,8 +68,6 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_add_u32 s2, s2, s6
|
||||
; GFX10-NEXT: s_cselect_b32 s4, -1, 0
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX10-NEXT: s_addc_u32 s3, s3, s7
|
||||
; GFX10-NEXT: s_cselect_b32 s4, -1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
|
||||
@ -91,14 +84,12 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_add_u32 s2, s2, s4
|
||||
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX11-NEXT: s_addc_u32 s3, s3, s5
|
||||
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
@ -444,7 +435,6 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
|
||||
; SI-NEXT: s_add_u32 s4, s4, s6
|
||||
; SI-NEXT: s_cselect_b64 s[12:13], -1, 0
|
||||
; SI-NEXT: s_or_b32 s6, s12, s13
|
||||
; SI-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; SI-NEXT: s_addc_u32 s5, s5, s7
|
||||
; SI-NEXT: s_mov_b32 s8, s0
|
||||
; SI-NEXT: s_mov_b32 s9, s1
|
||||
@ -465,16 +455,14 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: s_add_u32 s2, s4, s6
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: s_add_u32 s0, s4, s6
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_addc_u32 s1, s5, s7
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s1
|
||||
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; VI-NEXT: s_addc_u32 s0, s5, s7
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s0
|
||||
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s3
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
|
||||
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
@ -486,12 +474,10 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
|
||||
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_add_u32 s2, s12, s14
|
||||
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: s_addc_u32 s0, s13, s15
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: s_add_u32 s0, s12, s14
|
||||
; GFX9-NEXT: s_addc_u32 s1, s13, s15
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
|
||||
@ -504,10 +490,8 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_add_u32 s0, s12, s14
|
||||
; GFX10-NEXT: s_cselect_b32 s1, -1, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX10-NEXT: s_addc_u32 s1, s13, s15
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-NEXT: s_cselect_b32 s0, -1, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
|
||||
@ -520,10 +504,8 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
|
||||
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_add_u32 s4, s4, s6
|
||||
; GFX11-NEXT: s_cselect_b32 s6, -1, 0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX11-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GFX11-NEXT: s_addc_u32 s5, s5, s7
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
|
||||
|
||||
@ -148,7 +148,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
|
||||
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
|
||||
; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
|
||||
@ -182,7 +181,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
|
||||
; GCN-IR-NEXT: s_add_u32 s10, s10, 1
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
|
||||
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
|
||||
; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
|
||||
; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
|
||||
; GCN-IR-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
@ -831,10 +829,9 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
|
||||
; GCN-NEXT: s_addc_u32 s10, 0, s11
|
||||
; GCN-NEXT: s_add_u32 s11, s4, s5
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s11
|
||||
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
|
||||
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s4, s4, s5
|
||||
; GCN-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GCN-NEXT: s_addc_u32 s9, s9, s10
|
||||
; GCN-NEXT: s_mul_i32 s4, s6, s9
|
||||
; GCN-NEXT: v_readfirstlane_b32 s5, v0
|
||||
@ -865,7 +862,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
|
||||
; GCN-NEXT: s_add_u32 s8, s11, s4
|
||||
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s4, s4, s5
|
||||
; GCN-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GCN-NEXT: s_addc_u32 s4, s9, s6
|
||||
; GCN-NEXT: v_mul_hi_u32 v1, s8, 24
|
||||
; GCN-NEXT: v_mul_hi_u32 v0, s4, 24
|
||||
@ -874,52 +870,50 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
|
||||
; GCN-NEXT: v_readfirstlane_b32 s8, v1
|
||||
; GCN-NEXT: v_readfirstlane_b32 s5, v0
|
||||
; GCN-NEXT: s_add_u32 s4, s8, s4
|
||||
; GCN-NEXT: s_addc_u32 s8, 0, s5
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GCN-NEXT: s_addc_u32 s10, 0, s5
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s10
|
||||
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
|
||||
; GCN-NEXT: s_mov_b32 s4, s0
|
||||
; GCN-NEXT: s_mov_b32 s5, s1
|
||||
; GCN-NEXT: s_mul_i32 s0, s3, s8
|
||||
; GCN-NEXT: s_mul_i32 s0, s3, s10
|
||||
; GCN-NEXT: v_readfirstlane_b32 s1, v0
|
||||
; GCN-NEXT: s_add_i32 s9, s1, s0
|
||||
; GCN-NEXT: s_sub_i32 s10, 0, s9
|
||||
; GCN-NEXT: s_mul_i32 s0, s2, s8
|
||||
; GCN-NEXT: s_sub_u32 s11, 24, s0
|
||||
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s12, s0, s1
|
||||
; GCN-NEXT: s_cmp_lg_u32 s12, 0
|
||||
; GCN-NEXT: s_subb_u32 s10, s10, s3
|
||||
; GCN-NEXT: s_sub_u32 s13, s11, s2
|
||||
; GCN-NEXT: s_add_i32 s11, s1, s0
|
||||
; GCN-NEXT: s_sub_i32 s8, 0, s11
|
||||
; GCN-NEXT: s_mul_i32 s0, s2, s10
|
||||
; GCN-NEXT: s_sub_u32 s12, 24, s0
|
||||
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s9, s0, s1
|
||||
; GCN-NEXT: s_subb_u32 s13, s8, s3
|
||||
; GCN-NEXT: s_sub_u32 s14, s12, s2
|
||||
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s8, s8, s9
|
||||
; GCN-NEXT: s_subb_u32 s8, s13, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s8, s3
|
||||
; GCN-NEXT: s_cselect_b32 s9, -1, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s14, s2
|
||||
; GCN-NEXT: s_cselect_b32 s13, -1, 0
|
||||
; GCN-NEXT: s_cmp_eq_u32 s8, s3
|
||||
; GCN-NEXT: s_cselect_b32 s8, s13, s9
|
||||
; GCN-NEXT: s_add_u32 s9, s10, 1
|
||||
; GCN-NEXT: s_addc_u32 s13, 0, 0
|
||||
; GCN-NEXT: s_add_u32 s14, s10, 2
|
||||
; GCN-NEXT: s_addc_u32 s15, 0, 0
|
||||
; GCN-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GCN-NEXT: s_cselect_b32 s8, s14, s9
|
||||
; GCN-NEXT: s_cselect_b32 s9, s15, s13
|
||||
; GCN-NEXT: s_or_b32 s0, s0, s1
|
||||
; GCN-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GCN-NEXT: s_subb_u32 s0, s10, 0
|
||||
; GCN-NEXT: s_subb_u32 s0, 0, s11
|
||||
; GCN-NEXT: s_cmp_ge_u32 s0, s3
|
||||
; GCN-NEXT: s_cselect_b32 s1, -1, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s13, s2
|
||||
; GCN-NEXT: s_cselect_b32 s10, -1, 0
|
||||
; GCN-NEXT: s_cmp_eq_u32 s0, s3
|
||||
; GCN-NEXT: s_cselect_b32 s0, s10, s1
|
||||
; GCN-NEXT: s_add_u32 s1, s8, 1
|
||||
; GCN-NEXT: s_addc_u32 s10, 0, 0
|
||||
; GCN-NEXT: s_add_u32 s13, s8, 2
|
||||
; GCN-NEXT: s_addc_u32 s14, 0, 0
|
||||
; GCN-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GCN-NEXT: s_cselect_b32 s0, s13, s1
|
||||
; GCN-NEXT: s_cselect_b32 s1, s14, s10
|
||||
; GCN-NEXT: s_cmp_lg_u32 s12, 0
|
||||
; GCN-NEXT: s_subb_u32 s9, 0, s9
|
||||
; GCN-NEXT: s_cmp_ge_u32 s9, s3
|
||||
; GCN-NEXT: s_cselect_b32 s10, -1, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s11, s2
|
||||
; GCN-NEXT: s_cmp_ge_u32 s12, s2
|
||||
; GCN-NEXT: s_cselect_b32 s2, -1, 0
|
||||
; GCN-NEXT: s_cmp_eq_u32 s9, s3
|
||||
; GCN-NEXT: s_cselect_b32 s2, s2, s10
|
||||
; GCN-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GCN-NEXT: s_cselect_b32 s1, s1, 0
|
||||
; GCN-NEXT: s_cselect_b32 s0, s0, s8
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GCN-NEXT: s_cmp_eq_u32 s0, s3
|
||||
; GCN-NEXT: s_cselect_b32 s0, s2, s1
|
||||
; GCN-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GCN-NEXT: s_cselect_b32 s0, s9, 0
|
||||
; GCN-NEXT: s_cselect_b32 s1, s8, s10
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GCN-NEXT: s_endpgm
|
||||
;
|
||||
@ -945,7 +939,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
|
||||
; GCN-IR-NEXT: s_add_u32 s10, s8, 1
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
|
||||
; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
|
||||
@ -978,7 +971,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
|
||||
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
|
||||
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
|
||||
; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
|
||||
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
|
||||
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
|
||||
@ -1317,7 +1309,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
|
||||
; GCN-IR-NEXT: s_add_u32 s11, s8, 1
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
|
||||
; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
|
||||
@ -1347,7 +1338,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
|
||||
; GCN-IR-NEXT: s_add_u32 s10, s10, 1
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0
|
||||
; GCN-IR-NEXT: s_or_b32 s12, s12, s13
|
||||
; GCN-IR-NEXT: s_cmp_lg_u32 s12, 0
|
||||
; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0
|
||||
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
|
||||
|
||||
@ -51,10 +51,9 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
|
||||
; GCN-NEXT: s_addc_u32 s13, 0, s14
|
||||
; GCN-NEXT: s_add_u32 s14, s0, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s14
|
||||
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
|
||||
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s0, s0, s1
|
||||
; GCN-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GCN-NEXT: s_addc_u32 s12, s12, s13
|
||||
; GCN-NEXT: s_mul_i32 s0, s10, s12
|
||||
; GCN-NEXT: v_readfirstlane_b32 s1, v0
|
||||
@ -85,7 +84,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
|
||||
; GCN-NEXT: s_add_u32 s11, s14, s0
|
||||
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s0, s0, s1
|
||||
; GCN-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GCN-NEXT: s_addc_u32 s1, s12, s10
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GCN-NEXT: v_mul_hi_u32 v1, s6, v0
|
||||
@ -115,46 +113,43 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
|
||||
; GCN-NEXT: v_readfirstlane_b32 s10, v0
|
||||
; GCN-NEXT: s_add_i32 s5, s10, s5
|
||||
; GCN-NEXT: s_mul_i32 s10, s9, s4
|
||||
; GCN-NEXT: s_add_i32 s10, s5, s10
|
||||
; GCN-NEXT: s_sub_i32 s11, s7, s10
|
||||
; GCN-NEXT: s_add_i32 s12, s5, s10
|
||||
; GCN-NEXT: s_sub_i32 s10, s7, s12
|
||||
; GCN-NEXT: s_mul_i32 s4, s8, s4
|
||||
; GCN-NEXT: s_sub_u32 s6, s6, s4
|
||||
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s12, s4, s5
|
||||
; GCN-NEXT: s_cmp_lg_u32 s12, 0
|
||||
; GCN-NEXT: s_subb_u32 s11, s11, s9
|
||||
; GCN-NEXT: s_sub_u32 s13, s6, s8
|
||||
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s11, s4, s5
|
||||
; GCN-NEXT: s_subb_u32 s13, s10, s9
|
||||
; GCN-NEXT: s_sub_u32 s14, s6, s8
|
||||
; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s15, s10, s11
|
||||
; GCN-NEXT: s_subb_u32 s15, s13, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s15, s9
|
||||
; GCN-NEXT: s_cselect_b32 s16, -1, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s14, s8
|
||||
; GCN-NEXT: s_cselect_b32 s17, -1, 0
|
||||
; GCN-NEXT: s_cmp_eq_u32 s15, s9
|
||||
; GCN-NEXT: s_cselect_b32 s16, s17, s16
|
||||
; GCN-NEXT: s_or_b32 s10, s10, s11
|
||||
; GCN-NEXT: s_subb_u32 s13, s13, s9
|
||||
; GCN-NEXT: s_sub_u32 s17, s14, s8
|
||||
; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s10, s10, s11
|
||||
; GCN-NEXT: s_subb_u32 s10, s13, 0
|
||||
; GCN-NEXT: s_cmp_lg_u32 s16, 0
|
||||
; GCN-NEXT: s_cselect_b32 s11, s17, s14
|
||||
; GCN-NEXT: s_cselect_b32 s10, s10, s15
|
||||
; GCN-NEXT: s_or_b32 s4, s4, s5
|
||||
; GCN-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GCN-NEXT: s_subb_u32 s14, s11, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s14, s9
|
||||
; GCN-NEXT: s_subb_u32 s4, s7, s12
|
||||
; GCN-NEXT: s_cmp_ge_u32 s4, s9
|
||||
; GCN-NEXT: s_cselect_b32 s5, -1, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s13, s8
|
||||
; GCN-NEXT: s_cselect_b32 s15, -1, 0
|
||||
; GCN-NEXT: s_cmp_eq_u32 s14, s9
|
||||
; GCN-NEXT: s_cselect_b32 s15, s15, s5
|
||||
; GCN-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GCN-NEXT: s_subb_u32 s11, s11, s9
|
||||
; GCN-NEXT: s_sub_u32 s16, s13, s8
|
||||
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s4, s4, s5
|
||||
; GCN-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GCN-NEXT: s_subb_u32 s4, s11, 0
|
||||
; GCN-NEXT: s_cmp_lg_u32 s15, 0
|
||||
; GCN-NEXT: s_cselect_b32 s5, s16, s13
|
||||
; GCN-NEXT: s_cselect_b32 s4, s4, s14
|
||||
; GCN-NEXT: s_cmp_lg_u32 s12, 0
|
||||
; GCN-NEXT: s_subb_u32 s7, s7, s10
|
||||
; GCN-NEXT: s_cmp_ge_u32 s7, s9
|
||||
; GCN-NEXT: s_cselect_b32 s10, -1, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s6, s8
|
||||
; GCN-NEXT: s_cselect_b32 s8, -1, 0
|
||||
; GCN-NEXT: s_cmp_eq_u32 s7, s9
|
||||
; GCN-NEXT: s_cselect_b32 s8, s8, s10
|
||||
; GCN-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GCN-NEXT: s_cselect_b32 s4, s4, s7
|
||||
; GCN-NEXT: s_cselect_b32 s5, s5, s6
|
||||
; GCN-NEXT: s_cselect_b32 s7, -1, 0
|
||||
; GCN-NEXT: s_cmp_eq_u32 s4, s9
|
||||
; GCN-NEXT: s_cselect_b32 s5, s7, s5
|
||||
; GCN-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GCN-NEXT: s_cselect_b32 s4, s10, s4
|
||||
; GCN-NEXT: s_cselect_b32 s5, s11, s6
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
@ -187,7 +182,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
|
||||
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
|
||||
; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
|
||||
@ -221,7 +215,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
|
||||
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
|
||||
; GCN-IR-NEXT: s_or_b32 s18, s18, s19
|
||||
; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0
|
||||
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
|
||||
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
|
||||
@ -853,10 +846,9 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
|
||||
; GCN-NEXT: s_addc_u32 s10, 0, s11
|
||||
; GCN-NEXT: s_add_u32 s11, s4, s5
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s11
|
||||
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
|
||||
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s4, s4, s5
|
||||
; GCN-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GCN-NEXT: s_addc_u32 s9, s9, s10
|
||||
; GCN-NEXT: s_mul_i32 s4, s6, s9
|
||||
; GCN-NEXT: v_readfirstlane_b32 s5, v0
|
||||
@ -887,7 +879,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
|
||||
; GCN-NEXT: s_add_u32 s8, s11, s4
|
||||
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s4, s4, s5
|
||||
; GCN-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GCN-NEXT: s_addc_u32 s4, s9, s6
|
||||
; GCN-NEXT: v_mul_hi_u32 v1, s8, 24
|
||||
; GCN-NEXT: v_mul_hi_u32 v0, s4, 24
|
||||
@ -903,46 +894,43 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
|
||||
; GCN-NEXT: s_mov_b32 s5, s1
|
||||
; GCN-NEXT: s_mul_i32 s0, s3, s8
|
||||
; GCN-NEXT: v_readfirstlane_b32 s1, v0
|
||||
; GCN-NEXT: s_add_i32 s9, s1, s0
|
||||
; GCN-NEXT: s_sub_i32 s10, 0, s9
|
||||
; GCN-NEXT: s_add_i32 s10, s1, s0
|
||||
; GCN-NEXT: s_sub_i32 s9, 0, s10
|
||||
; GCN-NEXT: s_mul_i32 s0, s2, s8
|
||||
; GCN-NEXT: s_sub_u32 s8, 24, s0
|
||||
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s11, s0, s1
|
||||
; GCN-NEXT: s_cmp_lg_u32 s11, 0
|
||||
; GCN-NEXT: s_subb_u32 s10, s10, s3
|
||||
; GCN-NEXT: s_sub_u32 s12, s8, s2
|
||||
; GCN-NEXT: s_sub_u32 s11, 24, s0
|
||||
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s8, s0, s1
|
||||
; GCN-NEXT: s_subb_u32 s12, s9, s3
|
||||
; GCN-NEXT: s_sub_u32 s13, s11, s2
|
||||
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s14, s8, s9
|
||||
; GCN-NEXT: s_subb_u32 s14, s12, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s14, s3
|
||||
; GCN-NEXT: s_cselect_b32 s15, -1, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s13, s2
|
||||
; GCN-NEXT: s_cselect_b32 s16, -1, 0
|
||||
; GCN-NEXT: s_cmp_eq_u32 s14, s3
|
||||
; GCN-NEXT: s_cselect_b32 s15, s16, s15
|
||||
; GCN-NEXT: s_or_b32 s8, s8, s9
|
||||
; GCN-NEXT: s_subb_u32 s12, s12, s3
|
||||
; GCN-NEXT: s_sub_u32 s16, s13, s2
|
||||
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s8, s8, s9
|
||||
; GCN-NEXT: s_subb_u32 s8, s12, 0
|
||||
; GCN-NEXT: s_cmp_lg_u32 s15, 0
|
||||
; GCN-NEXT: s_cselect_b32 s9, s16, s13
|
||||
; GCN-NEXT: s_cselect_b32 s8, s8, s14
|
||||
; GCN-NEXT: s_or_b32 s0, s0, s1
|
||||
; GCN-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GCN-NEXT: s_subb_u32 s13, s10, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s13, s3
|
||||
; GCN-NEXT: s_subb_u32 s0, 0, s10
|
||||
; GCN-NEXT: s_cmp_ge_u32 s0, s3
|
||||
; GCN-NEXT: s_cselect_b32 s1, -1, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s12, s2
|
||||
; GCN-NEXT: s_cselect_b32 s14, -1, 0
|
||||
; GCN-NEXT: s_cmp_eq_u32 s13, s3
|
||||
; GCN-NEXT: s_cselect_b32 s14, s14, s1
|
||||
; GCN-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GCN-NEXT: s_subb_u32 s10, s10, s3
|
||||
; GCN-NEXT: s_sub_u32 s15, s12, s2
|
||||
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GCN-NEXT: s_or_b32 s0, s0, s1
|
||||
; GCN-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GCN-NEXT: s_subb_u32 s0, s10, 0
|
||||
; GCN-NEXT: s_cmp_lg_u32 s14, 0
|
||||
; GCN-NEXT: s_cselect_b32 s1, s15, s12
|
||||
; GCN-NEXT: s_cselect_b32 s0, s0, s13
|
||||
; GCN-NEXT: s_cmp_lg_u32 s11, 0
|
||||
; GCN-NEXT: s_subb_u32 s9, 0, s9
|
||||
; GCN-NEXT: s_cmp_ge_u32 s9, s3
|
||||
; GCN-NEXT: s_cselect_b32 s10, -1, 0
|
||||
; GCN-NEXT: s_cmp_ge_u32 s8, s2
|
||||
; GCN-NEXT: s_cmp_ge_u32 s11, s2
|
||||
; GCN-NEXT: s_cselect_b32 s2, -1, 0
|
||||
; GCN-NEXT: s_cmp_eq_u32 s9, s3
|
||||
; GCN-NEXT: s_cselect_b32 s2, s2, s10
|
||||
; GCN-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GCN-NEXT: s_cselect_b32 s0, s0, s9
|
||||
; GCN-NEXT: s_cselect_b32 s1, s1, s8
|
||||
; GCN-NEXT: s_cmp_eq_u32 s0, s3
|
||||
; GCN-NEXT: s_cselect_b32 s1, s2, s1
|
||||
; GCN-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GCN-NEXT: s_cselect_b32 s0, s8, s0
|
||||
; GCN-NEXT: s_cselect_b32 s1, s9, s11
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
@ -970,7 +958,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
|
||||
; GCN-IR-NEXT: s_add_u32 s10, s8, 1
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
|
||||
; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
|
||||
@ -1003,7 +990,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
|
||||
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
|
||||
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
|
||||
; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
|
||||
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
|
||||
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
|
||||
@ -1093,7 +1079,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
|
||||
; GCN-IR-NEXT: s_add_u32 s11, s8, 1
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
|
||||
; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
|
||||
@ -1123,7 +1108,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
|
||||
; GCN-IR-NEXT: s_add_u32 s12, s12, 1
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0
|
||||
; GCN-IR-NEXT: s_or_b32 s14, s14, s15
|
||||
; GCN-IR-NEXT: s_cmp_lg_u32 s14, 0
|
||||
; GCN-IR-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0
|
||||
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
|
||||
|
||||
@ -18,7 +18,6 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; SI-NEXT: s_or_b32 s0, s0, s1
|
||||
; SI-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; SI-NEXT: s_subb_u32 s3, s3, s9
|
||||
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
@ -35,10 +34,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: s_sub_u32 s2, s2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; VI-NEXT: s_subb_u32 s3, s3, s5
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s3
|
||||
@ -53,14 +50,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
|
||||
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_sub_u32 s6, s2, s6
|
||||
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
|
||||
; GFX9-NEXT: s_subb_u32 s4, s3, s7
|
||||
; GFX9-NEXT: s_sub_u32 s4, s2, s6
|
||||
; GFX9-NEXT: s_subb_u32 s5, s3, s7
|
||||
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
@ -73,8 +68,6 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_sub_u32 s2, s2, s6
|
||||
; GFX10-NEXT: s_cselect_b32 s4, -1, 0
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX10-NEXT: s_subb_u32 s3, s3, s7
|
||||
; GFX10-NEXT: s_cselect_b32 s4, -1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
|
||||
@ -91,14 +84,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_sub_u32 s2, s2, s4
|
||||
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX11-NEXT: s_subb_u32 s3, s3, s5
|
||||
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
@ -443,7 +434,6 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
|
||||
; SI-NEXT: s_sub_u32 s4, s4, s6
|
||||
; SI-NEXT: s_cselect_b64 s[12:13], -1, 0
|
||||
; SI-NEXT: s_or_b32 s6, s12, s13
|
||||
; SI-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; SI-NEXT: s_subb_u32 s5, s5, s7
|
||||
; SI-NEXT: s_mov_b32 s8, s0
|
||||
; SI-NEXT: s_mov_b32 s9, s1
|
||||
@ -464,16 +454,14 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
|
||||
; VI: ; %bb.0:
|
||||
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: s_sub_u32 s2, s4, s6
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: s_sub_u32 s0, s4, s6
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_subb_u32 s1, s5, s7
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s1
|
||||
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; VI-NEXT: s_subb_u32 s0, s5, s7
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s0
|
||||
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s3
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
|
||||
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
@ -485,12 +473,10 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
|
||||
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_sub_u32 s2, s12, s14
|
||||
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX9-NEXT: s_subb_u32 s0, s13, s15
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: s_sub_u32 s0, s12, s14
|
||||
; GFX9-NEXT: s_subb_u32 s1, s13, s15
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
|
||||
@ -503,10 +489,8 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_sub_u32 s0, s12, s14
|
||||
; GFX10-NEXT: s_cselect_b32 s1, -1, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX10-NEXT: s_subb_u32 s1, s13, s15
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-NEXT: s_cselect_b32 s0, -1, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
|
||||
@ -519,10 +503,8 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
|
||||
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_sub_u32 s4, s4, s6
|
||||
; GFX11-NEXT: s_cselect_b32 s6, -1, 0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX11-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GFX11-NEXT: s_subb_u32 s5, s5, s7
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
|
||||
|
||||
@ -774,44 +774,40 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
|
||||
; GFX1032-NEXT: s_add_u32 s11, s12, s11
|
||||
; GFX1032-NEXT: s_addc_u32 s12, 0, s13
|
||||
; GFX1032-NEXT: s_add_u32 s8, s8, s11
|
||||
; GFX1032-NEXT: s_cselect_b32 s11, -1, 0
|
||||
; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s8
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s11, 0
|
||||
; GFX1032-NEXT: s_mul_i32 s11, s9, s8
|
||||
; GFX1032-NEXT: s_addc_u32 s5, s5, s12
|
||||
; GFX1032-NEXT: s_mul_i32 s10, s10, s8
|
||||
; GFX1032-NEXT: s_mul_hi_u32 s11, s9, s8
|
||||
; GFX1032-NEXT: s_mul_i32 s12, s9, s8
|
||||
; GFX1032-NEXT: s_mul_i32 s9, s9, s5
|
||||
; GFX1032-NEXT: s_mul_hi_u32 s12, s8, s11
|
||||
; GFX1032-NEXT: s_add_i32 s9, s13, s9
|
||||
; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s11
|
||||
; GFX1032-NEXT: s_mul_i32 s10, s10, s8
|
||||
; GFX1032-NEXT: s_add_i32 s9, s11, s9
|
||||
; GFX1032-NEXT: s_mul_i32 s11, s5, s12
|
||||
; GFX1032-NEXT: s_add_i32 s9, s9, s10
|
||||
; GFX1032-NEXT: s_mul_i32 s10, s5, s11
|
||||
; GFX1032-NEXT: s_mul_hi_u32 s10, s8, s12
|
||||
; GFX1032-NEXT: s_mul_i32 s15, s8, s9
|
||||
; GFX1032-NEXT: s_mul_hi_u32 s14, s8, s9
|
||||
; GFX1032-NEXT: s_add_u32 s12, s12, s15
|
||||
; GFX1032-NEXT: s_add_u32 s10, s10, s15
|
||||
; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s12
|
||||
; GFX1032-NEXT: s_addc_u32 s14, 0, s14
|
||||
; GFX1032-NEXT: s_mul_hi_u32 s11, s5, s9
|
||||
; GFX1032-NEXT: s_add_u32 s10, s12, s10
|
||||
; GFX1032-NEXT: s_mul_hi_u32 s12, s5, s9
|
||||
; GFX1032-NEXT: s_add_u32 s10, s10, s11
|
||||
; GFX1032-NEXT: s_mul_i32 s9, s5, s9
|
||||
; GFX1032-NEXT: s_addc_u32 s10, s14, s13
|
||||
; GFX1032-NEXT: s_addc_u32 s11, s11, 0
|
||||
; GFX1032-NEXT: s_addc_u32 s11, s12, 0
|
||||
; GFX1032-NEXT: s_add_u32 s9, s10, s9
|
||||
; GFX1032-NEXT: s_addc_u32 s10, 0, s11
|
||||
; GFX1032-NEXT: s_add_u32 s8, s8, s9
|
||||
; GFX1032-NEXT: s_cselect_b32 s9, -1, 0
|
||||
; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s8
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX1032-NEXT: s_mul_hi_u32 s9, s3, s8
|
||||
; GFX1032-NEXT: s_addc_u32 s5, s5, s10
|
||||
; GFX1032-NEXT: s_mul_i32 s8, s3, s8
|
||||
; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s8
|
||||
; GFX1032-NEXT: s_mul_i32 s12, s2, s5
|
||||
; GFX1032-NEXT: s_mul_hi_u32 s10, s2, s5
|
||||
; GFX1032-NEXT: s_add_u32 s11, s11, s12
|
||||
; GFX1032-NEXT: s_addc_u32 s10, 0, s10
|
||||
; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s5
|
||||
; GFX1032-NEXT: s_mul_hi_u32 s10, s3, s8
|
||||
; GFX1032-NEXT: s_mul_i32 s8, s3, s8
|
||||
; GFX1032-NEXT: s_add_u32 s9, s9, s12
|
||||
; GFX1032-NEXT: s_addc_u32 s11, 0, s11
|
||||
; GFX1032-NEXT: s_mul_hi_u32 s13, s3, s5
|
||||
; GFX1032-NEXT: s_add_u32 s8, s11, s8
|
||||
; GFX1032-NEXT: s_add_u32 s8, s9, s8
|
||||
; GFX1032-NEXT: s_mul_i32 s5, s3, s5
|
||||
; GFX1032-NEXT: s_addc_u32 s8, s10, s9
|
||||
; GFX1032-NEXT: s_addc_u32 s8, s11, s10
|
||||
; GFX1032-NEXT: s_addc_u32 s9, s13, 0
|
||||
; GFX1032-NEXT: s_add_u32 s5, s8, s5
|
||||
; GFX1032-NEXT: s_addc_u32 s8, 0, s9
|
||||
@ -824,11 +820,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
|
||||
; GFX1032-NEXT: s_sub_i32 s11, s3, s9
|
||||
; GFX1032-NEXT: s_sub_u32 s10, s2, s10
|
||||
; GFX1032-NEXT: s_cselect_b32 s12, -1, 0
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s12, 0
|
||||
; GFX1032-NEXT: s_subb_u32 s11, s11, s1
|
||||
; GFX1032-NEXT: s_sub_u32 s13, s10, s0
|
||||
; GFX1032-NEXT: s_cselect_b32 s14, -1, 0
|
||||
; GFX1032-NEXT: s_cmp_lg_u32 s14, 0
|
||||
; GFX1032-NEXT: s_subb_u32 s11, s11, 0
|
||||
; GFX1032-NEXT: s_cmp_ge_u32 s11, s1
|
||||
; GFX1032-NEXT: s_cselect_b32 s14, -1, 0
|
||||
@ -901,8 +894,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
|
||||
; GFX1064-NEXT: ; %bb.1:
|
||||
; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0
|
||||
; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s1
|
||||
; GFX1064-NEXT: s_sub_u32 s9, 0, s0
|
||||
; GFX1064-NEXT: s_subb_u32 s10, 0, s1
|
||||
; GFX1064-NEXT: s_sub_u32 s8, 0, s0
|
||||
; GFX1064-NEXT: s_subb_u32 s9, 0, s1
|
||||
; GFX1064-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
|
||||
; GFX1064-NEXT: v_rcp_f32_e32 v0, v0
|
||||
; GFX1064-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
@ -911,109 +904,102 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
|
||||
; GFX1064-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
|
||||
; GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
; GFX1064-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GFX1064-NEXT: v_readfirstlane_b32 s8, v1
|
||||
; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
|
||||
; GFX1064-NEXT: s_mul_i32 s5, s9, s8
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s12, s9, s4
|
||||
; GFX1064-NEXT: s_mul_i32 s11, s10, s4
|
||||
; GFX1064-NEXT: s_add_i32 s5, s12, s5
|
||||
; GFX1064-NEXT: s_mul_i32 s13, s9, s4
|
||||
; GFX1064-NEXT: s_add_i32 s5, s5, s11
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s13
|
||||
; GFX1064-NEXT: s_mul_i32 s15, s4, s5
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s14, s8, s13
|
||||
; GFX1064-NEXT: s_mul_i32 s11, s8, s13
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s13, s4, s5
|
||||
; GFX1064-NEXT: v_readfirstlane_b32 s4, v1
|
||||
; GFX1064-NEXT: v_readfirstlane_b32 s5, v0
|
||||
; GFX1064-NEXT: s_mul_i32 s10, s8, s4
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s12, s8, s5
|
||||
; GFX1064-NEXT: s_mul_i32 s11, s9, s5
|
||||
; GFX1064-NEXT: s_add_i32 s10, s12, s10
|
||||
; GFX1064-NEXT: s_mul_i32 s13, s8, s5
|
||||
; GFX1064-NEXT: s_add_i32 s10, s10, s11
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s12, s5, s13
|
||||
; GFX1064-NEXT: s_mul_i32 s15, s5, s10
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s14, s4, s13
|
||||
; GFX1064-NEXT: s_mul_i32 s11, s4, s13
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s13, s5, s10
|
||||
; GFX1064-NEXT: s_add_u32 s12, s12, s15
|
||||
; GFX1064-NEXT: s_addc_u32 s13, 0, s13
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s16, s8, s5
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s16, s4, s10
|
||||
; GFX1064-NEXT: s_add_u32 s11, s12, s11
|
||||
; GFX1064-NEXT: s_mul_i32 s5, s8, s5
|
||||
; GFX1064-NEXT: s_mul_i32 s10, s4, s10
|
||||
; GFX1064-NEXT: s_addc_u32 s11, s13, s14
|
||||
; GFX1064-NEXT: s_addc_u32 s12, s16, 0
|
||||
; GFX1064-NEXT: s_add_u32 s5, s11, s5
|
||||
; GFX1064-NEXT: s_addc_u32 s11, 0, s12
|
||||
; GFX1064-NEXT: s_add_u32 s12, s4, s5
|
||||
; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s13, s9, s12
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
|
||||
; GFX1064-NEXT: s_mul_i32 s4, s9, s12
|
||||
; GFX1064-NEXT: s_addc_u32 s8, s8, s11
|
||||
; GFX1064-NEXT: s_mul_i32 s10, s10, s12
|
||||
; GFX1064-NEXT: s_mul_i32 s9, s9, s8
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s5, s12, s4
|
||||
; GFX1064-NEXT: s_add_i32 s9, s13, s9
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s11, s8, s4
|
||||
; GFX1064-NEXT: s_add_i32 s9, s9, s10
|
||||
; GFX1064-NEXT: s_mul_i32 s4, s8, s4
|
||||
; GFX1064-NEXT: s_mul_i32 s14, s12, s9
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s13, s12, s9
|
||||
; GFX1064-NEXT: s_add_u32 s5, s5, s14
|
||||
; GFX1064-NEXT: s_addc_u32 s13, 0, s13
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s9
|
||||
; GFX1064-NEXT: s_add_u32 s4, s5, s4
|
||||
; GFX1064-NEXT: s_mul_i32 s9, s8, s9
|
||||
; GFX1064-NEXT: s_addc_u32 s4, s13, s11
|
||||
; GFX1064-NEXT: s_addc_u32 s5, s10, 0
|
||||
; GFX1064-NEXT: s_add_u32 s4, s4, s9
|
||||
; GFX1064-NEXT: s_addc_u32 s9, 0, s5
|
||||
; GFX1064-NEXT: s_add_u32 s10, s12, s4
|
||||
; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s11, s2, s10
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s4, s3, s10
|
||||
; GFX1064-NEXT: s_addc_u32 s5, s8, s9
|
||||
; GFX1064-NEXT: s_mul_i32 s8, s3, s10
|
||||
; GFX1064-NEXT: s_mul_i32 s10, s2, s5
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s9, s2, s5
|
||||
; GFX1064-NEXT: s_add_u32 s10, s11, s10
|
||||
; GFX1064-NEXT: s_addc_u32 s9, 0, s9
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s5
|
||||
; GFX1064-NEXT: s_add_u32 s8, s10, s8
|
||||
; GFX1064-NEXT: s_addc_u32 s11, 0, s12
|
||||
; GFX1064-NEXT: s_add_u32 s5, s5, s10
|
||||
; GFX1064-NEXT: s_addc_u32 s4, s4, s11
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s5
|
||||
; GFX1064-NEXT: s_mul_i32 s11, s8, s5
|
||||
; GFX1064-NEXT: s_mul_i32 s8, s8, s4
|
||||
; GFX1064-NEXT: s_mul_i32 s9, s9, s5
|
||||
; GFX1064-NEXT: s_add_i32 s8, s10, s8
|
||||
; GFX1064-NEXT: s_mul_i32 s10, s4, s11
|
||||
; GFX1064-NEXT: s_add_i32 s8, s8, s9
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s9, s5, s11
|
||||
; GFX1064-NEXT: s_mul_i32 s14, s5, s8
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s13, s5, s8
|
||||
; GFX1064-NEXT: s_add_u32 s9, s9, s14
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s11
|
||||
; GFX1064-NEXT: s_addc_u32 s13, 0, s13
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s11, s4, s8
|
||||
; GFX1064-NEXT: s_add_u32 s9, s9, s10
|
||||
; GFX1064-NEXT: s_mul_i32 s8, s4, s8
|
||||
; GFX1064-NEXT: s_addc_u32 s9, s13, s12
|
||||
; GFX1064-NEXT: s_addc_u32 s10, s11, 0
|
||||
; GFX1064-NEXT: s_add_u32 s8, s9, s8
|
||||
; GFX1064-NEXT: s_addc_u32 s9, 0, s10
|
||||
; GFX1064-NEXT: s_add_u32 s5, s5, s8
|
||||
; GFX1064-NEXT: s_addc_u32 s4, s4, s9
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s5
|
||||
; GFX1064-NEXT: s_mul_i32 s11, s2, s4
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s4
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s9, s3, s5
|
||||
; GFX1064-NEXT: s_mul_i32 s5, s3, s5
|
||||
; GFX1064-NEXT: s_addc_u32 s4, s9, s4
|
||||
; GFX1064-NEXT: s_add_u32 s8, s8, s11
|
||||
; GFX1064-NEXT: s_addc_u32 s10, 0, s10
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s4
|
||||
; GFX1064-NEXT: s_add_u32 s5, s8, s5
|
||||
; GFX1064-NEXT: s_mul_i32 s4, s3, s4
|
||||
; GFX1064-NEXT: s_addc_u32 s5, s10, s9
|
||||
; GFX1064-NEXT: s_addc_u32 s8, s12, 0
|
||||
; GFX1064-NEXT: s_add_u32 s10, s4, s5
|
||||
; GFX1064-NEXT: s_add_u32 s10, s5, s4
|
||||
; GFX1064-NEXT: s_addc_u32 s11, 0, s8
|
||||
; GFX1064-NEXT: s_mul_hi_u32 s4, s0, s10
|
||||
; GFX1064-NEXT: s_mul_i32 s5, s0, s11
|
||||
; GFX1064-NEXT: s_mul_i32 s8, s1, s10
|
||||
; GFX1064-NEXT: s_add_i32 s4, s4, s5
|
||||
; GFX1064-NEXT: s_add_i32 s12, s4, s8
|
||||
; GFX1064-NEXT: s_add_i32 s8, s4, s8
|
||||
; GFX1064-NEXT: s_mul_i32 s4, s0, s10
|
||||
; GFX1064-NEXT: s_sub_i32 s8, s3, s12
|
||||
; GFX1064-NEXT: s_sub_u32 s13, s2, s4
|
||||
; GFX1064-NEXT: s_sub_i32 s9, s3, s8
|
||||
; GFX1064-NEXT: s_sub_u32 s12, s2, s4
|
||||
; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
|
||||
; GFX1064-NEXT: s_subb_u32 s14, s8, s1
|
||||
; GFX1064-NEXT: s_sub_u32 s15, s13, s0
|
||||
; GFX1064-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[8:9], 0
|
||||
; GFX1064-NEXT: s_subb_u32 s8, s14, 0
|
||||
; GFX1064-NEXT: s_cmp_ge_u32 s8, s1
|
||||
; GFX1064-NEXT: s_cselect_b32 s9, -1, 0
|
||||
; GFX1064-NEXT: s_cmp_ge_u32 s15, s0
|
||||
; GFX1064-NEXT: s_subb_u32 s9, s9, s1
|
||||
; GFX1064-NEXT: s_sub_u32 s13, s12, s0
|
||||
; GFX1064-NEXT: s_subb_u32 s9, s9, 0
|
||||
; GFX1064-NEXT: s_cmp_ge_u32 s9, s1
|
||||
; GFX1064-NEXT: s_cselect_b32 s14, -1, 0
|
||||
; GFX1064-NEXT: s_cmp_eq_u32 s8, s1
|
||||
; GFX1064-NEXT: s_cselect_b32 s8, s14, s9
|
||||
; GFX1064-NEXT: s_add_u32 s9, s10, 1
|
||||
; GFX1064-NEXT: s_cmp_ge_u32 s13, s0
|
||||
; GFX1064-NEXT: s_cselect_b32 s13, -1, 0
|
||||
; GFX1064-NEXT: s_cmp_eq_u32 s9, s1
|
||||
; GFX1064-NEXT: s_cselect_b32 s9, s13, s14
|
||||
; GFX1064-NEXT: s_add_u32 s13, s10, 1
|
||||
; GFX1064-NEXT: s_addc_u32 s14, s11, 0
|
||||
; GFX1064-NEXT: s_add_u32 s15, s10, 2
|
||||
; GFX1064-NEXT: s_addc_u32 s16, s11, 0
|
||||
; GFX1064-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX1064-NEXT: s_cselect_b32 s15, s15, s9
|
||||
; GFX1064-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX1064-NEXT: s_cselect_b32 s13, s15, s13
|
||||
; GFX1064-NEXT: s_cselect_b32 s14, s16, s14
|
||||
; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
|
||||
; GFX1064-NEXT: s_subb_u32 s3, s3, s12
|
||||
; GFX1064-NEXT: s_subb_u32 s3, s3, s8
|
||||
; GFX1064-NEXT: s_cmp_ge_u32 s3, s1
|
||||
; GFX1064-NEXT: s_cselect_b32 s4, -1, 0
|
||||
; GFX1064-NEXT: s_cmp_ge_u32 s13, s0
|
||||
; GFX1064-NEXT: s_cmp_ge_u32 s12, s0
|
||||
; GFX1064-NEXT: s_cselect_b32 s5, -1, 0
|
||||
; GFX1064-NEXT: s_cmp_eq_u32 s3, s1
|
||||
; GFX1064-NEXT: s_cselect_b32 s1, s5, s4
|
||||
; GFX1064-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX1064-NEXT: s_cselect_b32 s5, s14, s11
|
||||
; GFX1064-NEXT: s_cselect_b32 s4, s15, s10
|
||||
; GFX1064-NEXT: s_cselect_b32 s4, s13, s10
|
||||
; GFX1064-NEXT: s_cbranch_execnz .LBB15_3
|
||||
; GFX1064-NEXT: .LBB15_2:
|
||||
; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0
|
||||
|
||||
@ -271,7 +271,6 @@ define i1 @workgroup_nonzero() {
|
||||
; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; DAGISEL-GFX8-NEXT: s_or_b32 s4, s12, s13
|
||||
; DAGISEL-GFX8-NEXT: s_or_b32 s4, s4, s14
|
||||
; DAGISEL-GFX8-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; DAGISEL-GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
|
||||
; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
@ -281,7 +280,6 @@ define i1 @workgroup_nonzero() {
|
||||
; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; DAGISEL-GFX942-NEXT: s_or_b32 s0, s12, s13
|
||||
; DAGISEL-GFX942-NEXT: s_or_b32 s0, s0, s14
|
||||
; DAGISEL-GFX942-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; DAGISEL-GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; DAGISEL-GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
@ -299,8 +297,6 @@ define i1 @workgroup_nonzero() {
|
||||
; DAGISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0
|
||||
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
|
||||
; DAGISEL-GFX12-NEXT: s_or_b32 s0, s0, s1
|
||||
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
|
||||
; DAGISEL-GFX12-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; DAGISEL-GFX12-NEXT: s_cselect_b32 s0, -1, 0
|
||||
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
|
||||
; DAGISEL-GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
|
||||
@ -311,7 +307,6 @@ define i1 @workgroup_nonzero() {
|
||||
; GISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-GFX8-NEXT: s_or_b32 s4, s12, s13
|
||||
; GISEL-GFX8-NEXT: s_or_b32 s4, s4, s14
|
||||
; GISEL-GFX8-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GISEL-GFX8-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GISEL-GFX8-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
|
||||
@ -321,7 +316,6 @@ define i1 @workgroup_nonzero() {
|
||||
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-GFX942-NEXT: s_or_b32 s0, s12, s13
|
||||
; GISEL-GFX942-NEXT: s_or_b32 s0, s0, s14
|
||||
; GISEL-GFX942-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GISEL-GFX942-NEXT: s_cselect_b32 s0, 1, 0
|
||||
; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GISEL-GFX942-NEXT: s_setpc_b64 s[30:31]
|
||||
@ -339,8 +333,6 @@ define i1 @workgroup_nonzero() {
|
||||
; GISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0
|
||||
; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
|
||||
; GISEL-GFX12-NEXT: s_or_b32 s0, s0, s1
|
||||
; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
|
||||
; GISEL-GFX12-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GISEL-GFX12-NEXT: s_cselect_b32 s0, 1, 0
|
||||
; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
|
||||
; GISEL-GFX12-NEXT: v_mov_b32_e32 v0, s0
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user