[AMDGPU][ISel] setcc peephole for comparisons with upper 32 bits of a 64-bit register pair (#177662)
This optimisation is motivated by [this minimal example](https://godbolt.org/z/9o83GvGsM). In particular, if `mask` is a 64-bit integer, a select such as `auto out = ((mask >> 32) != 0) ? a : b;` is lowered so that the null-check on the upper 32 bits of `mask` becomes a `v_cmp_lt_u64` with the integer `1 << 32` stored in a pair of VGPRs (effectively wasting two VGPRs). More generally, if a 64-bit integer (whose lower 32 bits are known to be irrelevant) is compared with a 64-bit constant, two VGPRs are wasted to construct this constant. This patch modifies ISel to take advantage of how 64-bit values are stored in pairs of VGPRs (or SGPRs), and truncates the 64-bit constant to its upper 32 bits where possible, so that the comparison is performed on the upper 32-bit register only. Alive2 proof for the analogous middle-end transformation: https://alive2.llvm.org/ce/z/zizKms
This commit is contained in:
parent
8066cc9fc4
commit
26add8a272
@ -17171,6 +17171,26 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
|
||||
return LHS.getOperand(0);
|
||||
}
|
||||
}
|
||||
|
||||
// setcc v.64, 0xXXXX'XXXX'0000'0000, lt/ge
|
||||
// => setcc v.hi32, 0xXXXX'XXXX, lt/ge
|
||||
//
|
||||
// setcc v.64, 0xXXXX'XXXX'FFFF'FFFF, le/gt
|
||||
// => setcc v.hi32, 0xXXXX'XXXX, le/gt
|
||||
if (VT == MVT::i64) {
|
||||
const uint64_t Mask32 = maskTrailingOnes<uint64_t>(32);
|
||||
const uint64_t CRHSInt = CRHSVal.getZExtValue();
|
||||
|
||||
if ( // setcc v.64, 0xXXXX'XXXX'0000'0000, lt/ge
|
||||
((CRHSInt & Mask32) == 0 && (CC == ISD::SETULT || CC == ISD::SETUGE ||
|
||||
CC == ISD::SETLT || CC == ISD::SETGE)) ||
|
||||
// setcc v.64, 0xXXXX'XXXX'FFFF'FFFF, le/gt
|
||||
((CRHSInt & Mask32) == Mask32 &&
|
||||
(CC == ISD::SETULE || CC == ISD::SETUGT || CC == ISD::SETLE ||
|
||||
CC == ISD::SETGT)))
|
||||
return DAG.getSetCC(SL, N->getValueType(0), getHiHalf64(LHS, DAG),
|
||||
DAG.getConstant(CRHSInt >> 32, SL, MVT::i32), CC);
|
||||
}
|
||||
}
|
||||
|
||||
// Eliminate setcc by using carryout from add/sub instruction
|
||||
|
||||
@ -568,17 +568,19 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
|
||||
; GFX908-NEXT: ; %bb.3: ; %bb14
|
||||
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
|
||||
; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
|
||||
; GFX908-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1
|
||||
; GFX908-NEXT: s_cmp_lt_i32 s11, 0
|
||||
; GFX908-NEXT: s_mov_b32 s13, s12
|
||||
; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3]
|
||||
; GFX908-NEXT: s_cselect_b64 s[18:19], -1, 0
|
||||
; GFX908-NEXT: s_cmp_gt_i32 s11, -1
|
||||
; GFX908-NEXT: v_mov_b32_e32 v4, s12
|
||||
; GFX908-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v6
|
||||
; GFX908-NEXT: v_mov_b32_e32 v6, s12
|
||||
; GFX908-NEXT: v_mov_b32_e32 v8, s12
|
||||
; GFX908-NEXT: v_mov_b32_e32 v5, s13
|
||||
; GFX908-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; GFX908-NEXT: v_mov_b32_e32 v7, s13
|
||||
; GFX908-NEXT: v_mov_b32_e32 v9, s13
|
||||
; GFX908-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0
|
||||
; GFX908-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[2:3]
|
||||
; GFX908-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v10
|
||||
; GFX908-NEXT: v_mov_b32_e32 v11, v5
|
||||
; GFX908-NEXT: s_mov_b64 s[20:21], s[14:15]
|
||||
; GFX908-NEXT: v_mov_b32_e32 v10, v4
|
||||
@ -599,9 +601,9 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
|
||||
; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; GFX908-NEXT: s_add_u32 s20, s20, s4
|
||||
; GFX908-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[2:3]
|
||||
; GFX908-NEXT: s_addc_u32 s21, s21, s5
|
||||
; GFX908-NEXT: s_mov_b64 s[22:23], 0
|
||||
; GFX908-NEXT: v_cmp_lt_i32_e64 s[24:25], -1, v3
|
||||
; GFX908-NEXT: s_andn2_b64 vcc, exec, s[24:25]
|
||||
; GFX908-NEXT: s_cbranch_vccz .LBB3_9
|
||||
; GFX908-NEXT: .LBB3_5: ; %bb16
|
||||
@ -728,15 +730,17 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
|
||||
; GFX90A-NEXT: ; %bb.3: ; %bb14
|
||||
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
|
||||
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
|
||||
; GFX90A-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1
|
||||
; GFX90A-NEXT: s_cmp_lt_i32 s11, 0
|
||||
; GFX90A-NEXT: s_mov_b32 s13, s12
|
||||
; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[2:3]
|
||||
; GFX90A-NEXT: s_cselect_b64 s[18:19], -1, 0
|
||||
; GFX90A-NEXT: s_cmp_gt_i32 s11, -1
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[12:13], s[12:13] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v8
|
||||
; GFX90A-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1]
|
||||
; GFX90A-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0
|
||||
; GFX90A-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[2:3]
|
||||
; GFX90A-NEXT: s_mov_b64 s[20:21], s[14:15]
|
||||
; GFX90A-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v12
|
||||
; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
|
||||
; GFX90A-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX90A-NEXT: v_readfirstlane_b32 s9, v4
|
||||
@ -756,8 +760,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
|
||||
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; GFX90A-NEXT: s_add_u32 s20, s20, s4
|
||||
; GFX90A-NEXT: s_addc_u32 s21, s21, s5
|
||||
; GFX90A-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[4:5]
|
||||
; GFX90A-NEXT: s_mov_b64 s[22:23], 0
|
||||
; GFX90A-NEXT: v_cmp_lt_i32_e64 s[24:25], -1, v5
|
||||
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[24:25]
|
||||
; GFX90A-NEXT: s_cbranch_vccz .LBB3_9
|
||||
; GFX90A-NEXT: .LBB3_5: ; %bb16
|
||||
|
||||
@ -541,19 +541,20 @@ define amdgpu_kernel void @commute_sgt_neg1_i64(ptr addrspace(1) %out, ptr addrs
|
||||
; GCN-LABEL: commute_sgt_neg1_i64:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
|
||||
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCN-NEXT: s_mov_b32 s6, 0
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
|
||||
; GCN-NEXT: buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:4
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
|
||||
; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[3:4]
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
|
||||
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
|
||||
; GCN-NEXT: v_not_b32_e32 v0, v3
|
||||
; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0
|
||||
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
|
||||
; GCN-NEXT: s_endpgm
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
|
||||
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
|
||||
|
||||
@ -13,8 +13,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
|
||||
; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v1, vcc
|
||||
; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v2, vcc
|
||||
; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
|
||||
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
|
||||
; GFX9-NEXT: v_ashrrev_i32_e32 v17, 31, v3
|
||||
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v9, v1, v9, vcc
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v8, v0, v8, vcc
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v11, vcc
|
||||
@ -22,9 +21,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
|
||||
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, 0, v4
|
||||
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, 0, v5, vcc
|
||||
; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, 0, v6, vcc
|
||||
; GFX9-NEXT: v_ashrrev_i32_e32 v17, 31, v3
|
||||
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, 0, v7, vcc
|
||||
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7]
|
||||
; GFX9-NEXT: v_ashrrev_i32_e32 v18, 31, v7
|
||||
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v7
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v21, v5, v1, vcc
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v22, v4, v0, vcc
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
|
||||
@ -53,6 +52,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
|
||||
; GFX9-NEXT: v_add_u32_e32 v3, 32, v3
|
||||
; GFX9-NEXT: v_min_u32_e32 v3, v3, v6
|
||||
; GFX9-NEXT: v_ffbh_u32_e32 v6, v8
|
||||
; GFX9-NEXT: v_ashrrev_i32_e32 v18, 31, v7
|
||||
; GFX9-NEXT: v_add_u32_e32 v6, 32, v6
|
||||
; GFX9-NEXT: v_ffbh_u32_e32 v7, v9
|
||||
; GFX9-NEXT: v_min_u32_e32 v6, v6, v7
|
||||
|
||||
@ -8,14 +8,14 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
|
||||
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v20, 0
|
||||
; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
|
||||
; SDAG-NEXT: v_ashrrev_i32_e32 v24, 31, v3
|
||||
; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v11
|
||||
; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
|
||||
; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc
|
||||
; SDAG-NEXT: v_mov_b32_e32 v26, v24
|
||||
; SDAG-NEXT: v_mov_b32_e32 v27, v25
|
||||
; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v2, vcc
|
||||
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
|
||||
; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v19, v1, v17, s[4:5]
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v18, v0, v16, s[4:5]
|
||||
; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v3, vcc
|
||||
@ -23,31 +23,31 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
|
||||
; SDAG-NEXT: v_ffbh_u32_e32 v1, v18
|
||||
; SDAG-NEXT: v_ffbh_u32_e32 v2, v19
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v17, v3, v0, s[4:5]
|
||||
; SDAG-NEXT: v_or_b32_e32 v0, v18, v16
|
||||
; SDAG-NEXT: v_sub_i32_e32 v3, vcc, 0, v8
|
||||
; SDAG-NEXT: v_or_b32_e32 v0, v18, v16
|
||||
; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], 32, v1
|
||||
; SDAG-NEXT: v_ffbh_u32_e32 v22, v16
|
||||
; SDAG-NEXT: v_or_b32_e32 v1, v19, v17
|
||||
; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc
|
||||
; SDAG-NEXT: v_or_b32_e32 v1, v19, v17
|
||||
; SDAG-NEXT: v_min_u32_e32 v2, v21, v2
|
||||
; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], 32, v22
|
||||
; SDAG-NEXT: v_ffbh_u32_e32 v22, v17
|
||||
; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
|
||||
; SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11]
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v28, v9, v23, s[6:7]
|
||||
; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v10, vcc
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v29, v8, v3, s[6:7]
|
||||
; SDAG-NEXT: v_min_u32_e32 v1, v21, v22
|
||||
; SDAG-NEXT: v_add_i32_e64 v3, s[8:9], 64, v2
|
||||
; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v11
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v28, v9, v23, s[4:5]
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v29, v8, v3, s[4:5]
|
||||
; SDAG-NEXT: v_subb_u32_e32 v3, vcc, 0, v10, vcc
|
||||
; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[0:1]
|
||||
; SDAG-NEXT: v_min_u32_e32 v0, v21, v22
|
||||
; SDAG-NEXT: v_add_i32_e64 v1, s[8:9], 64, v2
|
||||
; SDAG-NEXT: v_addc_u32_e64 v8, s[8:9], 0, 0, s[8:9]
|
||||
; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v11, vcc
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v0, s[6:7]
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v3, s[4:5]
|
||||
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v10, v8, 0, vcc
|
||||
; SDAG-NEXT: v_cndmask_b32_e32 v8, v3, v1, vcc
|
||||
; SDAG-NEXT: v_cndmask_b32_e32 v8, v1, v0, vcc
|
||||
; SDAG-NEXT: v_ffbh_u32_e32 v1, v29
|
||||
; SDAG-NEXT: v_ffbh_u32_e32 v21, v28
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v9, s[6:7]
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v9, s[4:5]
|
||||
; SDAG-NEXT: v_or_b32_e32 v0, v29, v2
|
||||
; SDAG-NEXT: v_add_i32_e32 v9, vcc, 32, v1
|
||||
; SDAG-NEXT: v_ffbh_u32_e32 v11, v2
|
||||
@ -57,12 +57,12 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
|
||||
; SDAG-NEXT: v_ffbh_u32_e32 v21, v3
|
||||
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
|
||||
; SDAG-NEXT: v_min_u32_e32 v0, v11, v21
|
||||
; SDAG-NEXT: v_add_i32_e64 v1, s[6:7], 64, v9
|
||||
; SDAG-NEXT: v_addc_u32_e64 v9, s[6:7], 0, 0, s[6:7]
|
||||
; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[2:3]
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, 0, s[6:7]
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[6:7]
|
||||
; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
|
||||
; SDAG-NEXT: v_add_i32_e64 v1, s[4:5], 64, v9
|
||||
; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, s[4:5]
|
||||
; SDAG-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[2:3]
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, 0, s[4:5]
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
|
||||
; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
||||
; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v0, v8
|
||||
; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v9, v10, vcc
|
||||
; SDAG-NEXT: v_xor_b32_e32 v0, 0x7f, v8
|
||||
@ -208,7 +208,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
|
||||
; SDAG-NEXT: v_mov_b32_e32 v23, v19
|
||||
; SDAG-NEXT: v_subb_u32_e32 v3, vcc, 0, v5, vcc
|
||||
; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v6, vcc
|
||||
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
|
||||
; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v7
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[4:5]
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[4:5]
|
||||
; SDAG-NEXT: v_subb_u32_e32 v2, vcc, 0, v7, vcc
|
||||
@ -226,7 +226,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
|
||||
; SDAG-NEXT: v_ffbh_u32_e32 v30, v9
|
||||
; SDAG-NEXT: v_min_u32_e32 v6, v16, v6
|
||||
; SDAG-NEXT: v_subb_u32_e32 v16, vcc, 0, v14, vcc
|
||||
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15]
|
||||
; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v15
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v28, v13, v17, s[4:5]
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v29, v12, v7, s[4:5]
|
||||
; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3]
|
||||
@ -833,7 +833,7 @@ define <2 x i128> @v_sdiv_v2i128_v_pow2k(<2 x i128> %lhs) {
|
||||
; SDAG-NEXT: v_mov_b32_e32 v19, v18
|
||||
; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc
|
||||
; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v2, vcc
|
||||
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
|
||||
; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v13, v1, v10, s[4:5]
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v12, v0, v8, s[4:5]
|
||||
; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v3, vcc
|
||||
@ -991,7 +991,7 @@ define <2 x i128> @v_sdiv_v2i128_v_pow2k(<2 x i128> %lhs) {
|
||||
; SDAG-NEXT: v_mov_b32_e32 v21, v20
|
||||
; SDAG-NEXT: v_subb_u32_e32 v3, vcc, 0, v5, vcc
|
||||
; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v6, vcc
|
||||
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
|
||||
; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v7
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v11, v5, v3, s[4:5]
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v10, v4, v2, s[4:5]
|
||||
; SDAG-NEXT: v_subb_u32_e32 v2, vcc, 0, v7, vcc
|
||||
@ -2855,61 +2855,61 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
|
||||
; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v19, 0
|
||||
; SDAG-NEXT: v_ashrrev_i32_e32 v28, 31, v3
|
||||
; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
|
||||
; SDAG-NEXT: v_ashrrev_i32_e32 v28, 31, v3
|
||||
; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc
|
||||
; SDAG-NEXT: v_mov_b32_e32 v29, v28
|
||||
; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v2, vcc
|
||||
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
|
||||
; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v17, v1, v17, s[4:5]
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v16, v0, v16, s[4:5]
|
||||
; SDAG-NEXT: v_subb_u32_e32 v1, vcc, 0, v3, vcc
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v0, v2, v18, s[4:5]
|
||||
; SDAG-NEXT: v_ffbh_u32_e32 v18, v16
|
||||
; SDAG-NEXT: v_ffbh_u32_e32 v20, v17
|
||||
; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 0, v8
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
|
||||
; SDAG-NEXT: v_or_b32_e32 v2, v16, v0
|
||||
; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 32, v18
|
||||
; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 0, v8
|
||||
; SDAG-NEXT: v_ffbh_u32_e32 v22, v0
|
||||
; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc
|
||||
; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 32, v18
|
||||
; SDAG-NEXT: v_or_b32_e32 v3, v17, v1
|
||||
; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc
|
||||
; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], 32, v22
|
||||
; SDAG-NEXT: v_min_u32_e32 v18, v18, v20
|
||||
; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], 32, v22
|
||||
; SDAG-NEXT: v_ffbh_u32_e32 v22, v1
|
||||
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[10:11]
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v30, v9, v23, s[4:5]
|
||||
; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v10, vcc
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v31, v8, v21, s[4:5]
|
||||
; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3]
|
||||
; SDAG-NEXT: v_min_u32_e32 v3, v20, v22
|
||||
; SDAG-NEXT: v_ffbh_u32_e32 v20, v1
|
||||
; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
|
||||
; SDAG-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v11
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v30, v9, v23, s[6:7]
|
||||
; SDAG-NEXT: v_subb_u32_e32 v2, vcc, 0, v10, vcc
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v31, v8, v21, s[6:7]
|
||||
; SDAG-NEXT: v_min_u32_e32 v3, v22, v20
|
||||
; SDAG-NEXT: v_add_i32_e64 v8, s[8:9], 64, v18
|
||||
; SDAG-NEXT: v_addc_u32_e64 v18, s[8:9], 0, 0, s[8:9]
|
||||
; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v11, vcc
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v9, s[4:5]
|
||||
; SDAG-NEXT: v_addc_u32_e64 v9, s[8:9], 0, 0, s[8:9]
|
||||
; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v11, vcc
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7]
|
||||
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, 0, vcc
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v20, v9, 0, vcc
|
||||
; SDAG-NEXT: v_cndmask_b32_e32 v10, v8, v3, vcc
|
||||
; SDAG-NEXT: v_ffbh_u32_e32 v9, v31
|
||||
; SDAG-NEXT: v_ffbh_u32_e32 v21, v30
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v20, s[4:5]
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v18, s[6:7]
|
||||
; SDAG-NEXT: v_or_b32_e32 v8, v31, v2
|
||||
; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v9
|
||||
; SDAG-NEXT: v_ffbh_u32_e32 v20, v2
|
||||
; SDAG-NEXT: v_ffbh_u32_e32 v18, v2
|
||||
; SDAG-NEXT: v_or_b32_e32 v9, v30, v3
|
||||
; SDAG-NEXT: v_min_u32_e32 v11, v11, v21
|
||||
; SDAG-NEXT: v_add_i32_e32 v20, vcc, 32, v20
|
||||
; SDAG-NEXT: v_add_i32_e32 v18, vcc, 32, v18
|
||||
; SDAG-NEXT: v_ffbh_u32_e32 v21, v3
|
||||
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
|
||||
; SDAG-NEXT: v_min_u32_e32 v8, v20, v21
|
||||
; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 64, v11
|
||||
; SDAG-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, s[4:5]
|
||||
; SDAG-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[2:3]
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, 0, s[4:5]
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[4:5]
|
||||
; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
|
||||
; SDAG-NEXT: v_min_u32_e32 v8, v18, v21
|
||||
; SDAG-NEXT: v_add_i32_e64 v9, s[6:7], 64, v11
|
||||
; SDAG-NEXT: v_addc_u32_e64 v11, s[6:7], 0, 0, s[6:7]
|
||||
; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[2:3]
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, 0, s[6:7]
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[6:7]
|
||||
; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
|
||||
; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v8, v10
|
||||
; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v18, vcc
|
||||
; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v20, vcc
|
||||
; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v10
|
||||
; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v19, vcc
|
||||
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11]
|
||||
@ -3051,7 +3051,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
|
||||
; SDAG-NEXT: v_mov_b32_e32 v35, v32
|
||||
; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc
|
||||
; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v6, vcc
|
||||
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
|
||||
; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v7
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v11, v5, v11, s[4:5]
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v10, v4, v10, s[4:5]
|
||||
; SDAG-NEXT: v_subb_u32_e32 v5, vcc, 0, v7, vcc
|
||||
@ -3069,7 +3069,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
|
||||
; SDAG-NEXT: v_ffbh_u32_e32 v24, v5
|
||||
; SDAG-NEXT: v_min_u32_e32 v18, v18, v20
|
||||
; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v14, vcc
|
||||
; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15]
|
||||
; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v15
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v36, v13, v23, s[4:5]
|
||||
; SDAG-NEXT: v_cndmask_b32_e64 v37, v12, v21, s[4:5]
|
||||
; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[6:7]
|
||||
|
||||
@ -125,17 +125,14 @@ define amdgpu_kernel void @uniform_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x
|
||||
; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
|
||||
; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
|
||||
; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
|
||||
; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
|
||||
; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
|
||||
; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[DEF]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
|
||||
; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]]
|
||||
; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY6]], implicit-def dead $scc
|
||||
; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc
|
||||
; GCN-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $scc
|
||||
; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
|
||||
; GCN-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY killed [[S_MOV_B64_]]
|
||||
; GCN-NEXT: [[V_CMP_LT_I64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I64_e64 killed [[REG_SEQUENCE2]], [[COPY8]], implicit $exec
|
||||
; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], killed [[COPY7]], implicit-def dead $scc
|
||||
; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GCN-NEXT: S_CMP_LT_I32 killed [[COPY5]], killed [[S_MOV_B32_2]], implicit-def $scc
|
||||
; GCN-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY $scc
|
||||
; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY8]], killed [[COPY7]], implicit-def dead $scc
|
||||
; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
|
||||
; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.2, addrspace 1)
|
||||
; GCN-NEXT: S_ENDPGM 0
|
||||
@ -155,12 +152,12 @@ define i1 @divergent_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x, i1 %z) {
|
||||
; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
|
||||
; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
|
||||
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[DEF]], %subreg.sub0, [[COPY1]], %subreg.sub1
|
||||
; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
|
||||
; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY]], implicit $exec
|
||||
; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_AND_B32_e64_]], 1, implicit $exec
|
||||
; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
|
||||
; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY killed [[S_MOV_B64_]]
|
||||
; GCN-NEXT: [[V_CMP_LT_I64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I64_e64 killed [[REG_SEQUENCE]], [[COPY2]], implicit $exec
|
||||
; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], killed [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GCN-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 killed [[COPY2]], killed [[S_MOV_B32_]], implicit $exec
|
||||
; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I32_e64_]], killed [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
|
||||
; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
|
||||
; GCN-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]]
|
||||
; GCN-NEXT: SI_RETURN implicit $vgpr0
|
||||
|
||||
@ -137,9 +137,9 @@ define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
|
||||
; GCN-NEXT: .LBB1_4: ; %exit
|
||||
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000
|
||||
; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5]
|
||||
; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -1, v5
|
||||
; GCN-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc
|
||||
; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[6:7]
|
||||
; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -1, v7
|
||||
; GCN-NEXT: v_cndmask_b32_e32 v2, -1, v1, vcc
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, -1
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, -1
|
||||
@ -205,13 +205,13 @@ define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
|
||||
; GCN-NEXT: .LBB2_4: ; %exit
|
||||
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000
|
||||
; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5]
|
||||
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 0, v5
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
|
||||
; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7]
|
||||
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 0, v7
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v2, v1, -1, vcc
|
||||
; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[8:9]
|
||||
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 0, v9
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc
|
||||
; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11]
|
||||
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 0, v11
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v6, v1, -1, vcc
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, -1
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, -1
|
||||
@ -295,14 +295,14 @@ define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
|
||||
; GCN-NEXT: .LBB3_4: ; %exit
|
||||
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000
|
||||
; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7]
|
||||
; GCN-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[8:9]
|
||||
; GCN-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11]
|
||||
; GCN-NEXT: v_cmp_gt_i64_e64 s[8:9], 0, v[12:13]
|
||||
; GCN-NEXT: v_cmp_gt_i64_e64 s[10:11], 0, v[14:15]
|
||||
; GCN-NEXT: v_cmp_gt_i64_e64 s[12:13], 0, v[16:17]
|
||||
; GCN-NEXT: v_cmp_gt_i64_e64 s[14:15], 0, v[18:19]
|
||||
; GCN-NEXT: v_cmp_gt_i64_e64 s[16:17], 0, v[4:5]
|
||||
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 0, v7
|
||||
; GCN-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v9
|
||||
; GCN-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v11
|
||||
; GCN-NEXT: v_cmp_gt_i32_e64 s[8:9], 0, v13
|
||||
; GCN-NEXT: v_cmp_gt_i32_e64 s[10:11], 0, v15
|
||||
; GCN-NEXT: v_cmp_gt_i32_e64 s[12:13], 0, v17
|
||||
; GCN-NEXT: v_cmp_gt_i32_e64 s[14:15], 0, v19
|
||||
; GCN-NEXT: v_cmp_gt_i32_e64 s[16:17], 0, v5
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v0, v1, -1, s[16:17]
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v2, v1, -1, vcc
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v4, v1, -1, s[4:5]
|
||||
|
||||
@ -21,8 +21,7 @@ define bfloat @sitofp_i128_to_bf16(i128 %x) {
|
||||
; GCN-NEXT: v_subb_co_u32_e32 v5, vcc, 0, v1, vcc
|
||||
; GCN-NEXT: v_subb_co_u32_e32 v6, vcc, 0, v2, vcc
|
||||
; GCN-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v3, vcc
|
||||
; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
|
||||
; GCN-NEXT: ; implicit-def: $vgpr8
|
||||
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
|
||||
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
|
||||
; GCN-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
||||
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
|
||||
@ -40,6 +39,7 @@ define bfloat @sitofp_i128_to_bf16(i128 %x) {
|
||||
; GCN-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc
|
||||
; GCN-NEXT: v_sub_u32_e32 v2, 0x80, v7
|
||||
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 25, v2
|
||||
; GCN-NEXT: ; implicit-def: $vgpr8
|
||||
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
||||
; GCN-NEXT: ; %bb.2: ; %itofp-if-else
|
||||
|
||||
@ -17,8 +17,7 @@ define float @sitofp_i128_to_f32(i128 %x) {
|
||||
; SDAG-NEXT: v_subb_co_u32_e32 v5, vcc, 0, v1, vcc
|
||||
; SDAG-NEXT: v_subb_co_u32_e32 v6, vcc, 0, v2, vcc
|
||||
; SDAG-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v3, vcc
|
||||
; SDAG-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
|
||||
; SDAG-NEXT: ; implicit-def: $vgpr8
|
||||
; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
|
||||
; SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
|
||||
; SDAG-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
||||
; SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
|
||||
@ -36,6 +35,7 @@ define float @sitofp_i128_to_f32(i128 %x) {
|
||||
; SDAG-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc
|
||||
; SDAG-NEXT: v_sub_u32_e32 v2, 0x80, v7
|
||||
; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v2
|
||||
; SDAG-NEXT: ; implicit-def: $vgpr8
|
||||
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
||||
; SDAG-NEXT: ; %bb.2: ; %itofp-if-else
|
||||
@ -527,8 +527,7 @@ define double @sitofp_i128_to_f64(i128 %x) {
|
||||
; SDAG-NEXT: v_subb_co_u32_e32 v1, vcc, 0, v5, vcc
|
||||
; SDAG-NEXT: v_subb_co_u32_e32 v6, vcc, 0, v2, vcc
|
||||
; SDAG-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v3, vcc
|
||||
; SDAG-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
|
||||
; SDAG-NEXT: ; implicit-def: $vgpr10
|
||||
; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
|
||||
; SDAG-NEXT: v_cndmask_b32_e32 v6, v2, v6, vcc
|
||||
; SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
|
||||
; SDAG-NEXT: v_cndmask_b32_e32 v7, v3, v7, vcc
|
||||
@ -546,6 +545,7 @@ define double @sitofp_i128_to_f64(i128 %x) {
|
||||
; SDAG-NEXT: v_cndmask_b32_e32 v9, v1, v0, vcc
|
||||
; SDAG-NEXT: v_sub_u32_e32 v2, 0x80, v9
|
||||
; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 54, v2
|
||||
; SDAG-NEXT: ; implicit-def: $vgpr10
|
||||
; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
||||
@ -1105,8 +1105,7 @@ define half @sitofp_i128_to_f16(i128 %x) {
|
||||
; SDAG-NEXT: v_subb_co_u32_e32 v5, vcc, 0, v1, vcc
|
||||
; SDAG-NEXT: v_subb_co_u32_e32 v6, vcc, 0, v2, vcc
|
||||
; SDAG-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v3, vcc
|
||||
; SDAG-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
|
||||
; SDAG-NEXT: ; implicit-def: $vgpr8
|
||||
; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
|
||||
; SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
|
||||
; SDAG-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
||||
; SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
|
||||
@ -1124,6 +1123,7 @@ define half @sitofp_i128_to_f16(i128 %x) {
|
||||
; SDAG-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc
|
||||
; SDAG-NEXT: v_sub_u32_e32 v2, 0x80, v7
|
||||
; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v2
|
||||
; SDAG-NEXT: ; implicit-def: $vgpr8
|
||||
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
|
||||
; SDAG-NEXT: ; %bb.2: ; %itofp-if-else
|
||||
|
||||
@ -14,7 +14,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
|
||||
; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v1, vcc
|
||||
; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v2, vcc
|
||||
; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
|
||||
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
|
||||
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
|
||||
; GFX9-NEXT: v_ashrrev_i32_e32 v20, 31, v3
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
|
||||
@ -24,8 +24,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
|
||||
; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v5, vcc
|
||||
; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v6, vcc
|
||||
; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v7, vcc
|
||||
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7]
|
||||
; GFX9-NEXT: v_mov_b32_e32 v21, v20
|
||||
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v7
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v22, v5, v9, vcc
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v23, v4, v8, vcc
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v11, vcc
|
||||
@ -70,6 +69,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
|
||||
; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v9, vcc
|
||||
; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f
|
||||
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7]
|
||||
; GFX9-NEXT: v_mov_b32_e32 v21, v20
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
|
||||
; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
|
||||
|
||||
@ -20,18 +20,19 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; SI-NEXT: s_add_u32 s10, s2, s8
|
||||
; SI-NEXT: s_addc_u32 s11, s3, s9
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
|
||||
; SI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[8:9], 0
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_add_u32 s0, s2, s8
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s11
|
||||
; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v0
|
||||
; SI-NEXT: s_addc_u32 s1, s3, s9
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
|
||||
; SI-NEXT: s_cmp_lt_i32 s9, 0
|
||||
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; SI-NEXT: s_xor_b64 s[2:3], s[2:3], vcc
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
|
||||
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
@ -41,18 +42,19 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s2
|
||||
; VI-NEXT: s_add_u32 s6, s2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s3
|
||||
; VI-NEXT: s_addc_u32 s7, s3, s5
|
||||
; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[4:5], 0
|
||||
; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[1:2]
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: s_add_u32 s0, s2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc
|
||||
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s7
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s3
|
||||
; VI-NEXT: s_addc_u32 s1, s3, s5
|
||||
; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[2:3]
|
||||
; VI-NEXT: s_cmp_lt_i32 s5, 0
|
||||
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; VI-NEXT: s_xor_b64 s[2:3], s[2:3], vcc
|
||||
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
|
||||
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -67,11 +69,12 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
|
||||
; GFX9-NEXT: s_add_u32 s4, s2, s6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: s_addc_u32 s5, s3, s7
|
||||
; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[6:7], 0
|
||||
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc
|
||||
; GFX9-NEXT: s_cmp_lt_i32 s7, 0
|
||||
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], vcc
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
||||
@ -86,9 +89,10 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_add_u32 s4, s2, s6
|
||||
; GFX10-NEXT: s_addc_u32 s5, s3, s7
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0
|
||||
; GFX10-NEXT: s_cmp_lt_i32 s7, 0
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3]
|
||||
; GFX10-NEXT: s_xor_b32 s2, s6, s2
|
||||
; GFX10-NEXT: s_cselect_b32 s3, -1, 0
|
||||
; GFX10-NEXT: s_xor_b32 s2, s3, s2
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
|
||||
; GFX10-NEXT: v_add_co_u32 v0, s2, s4, v0
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s5, 0, s2
|
||||
@ -104,13 +108,14 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_add_u32 s6, s2, s4
|
||||
; GFX11-NEXT: s_addc_u32 s7, s3, s5
|
||||
; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[4:5], 0
|
||||
; GFX11-NEXT: s_cmp_lt_i32 s5, 0
|
||||
; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], s[2:3]
|
||||
; GFX11-NEXT: s_xor_b32 s2, s4, s2
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: s_cselect_b32 s3, -1, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_xor_b32 s2, s3, s2
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_add_co_u32 v0, s2, s6, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s2
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
@ -355,7 +360,8 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
|
||||
; SI-NEXT: s_addc_u32 s13, s5, s7
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s5
|
||||
; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
|
||||
; SI-NEXT: v_cmp_lt_i64_e64 s[4:5], s[6:7], 0
|
||||
; SI-NEXT: s_cmp_lt_i32 s7, 0
|
||||
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s12
|
||||
; SI-NEXT: s_mov_b32 s8, s0
|
||||
; SI-NEXT: s_mov_b32 s9, s1
|
||||
@ -381,10 +387,11 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_addc_u32 s1, s5, s7
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s5
|
||||
; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
|
||||
; VI-NEXT: s_cmp_lt_i32 s7, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s3
|
||||
; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
|
||||
; VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
|
||||
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s0
|
||||
; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
|
||||
@ -402,11 +409,12 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s12
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s13
|
||||
; GFX9-NEXT: s_addc_u32 s1, s13, s15
|
||||
; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[14:15], 0
|
||||
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
|
||||
; GFX9-NEXT: s_cmp_lt_i32 s15, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
|
||||
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], vcc
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; GFX9-NEXT: global_store_byte v2, v0, s[10:11]
|
||||
@ -419,11 +427,12 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_add_u32 s0, s12, s14
|
||||
; GFX10-NEXT: s_addc_u32 s1, s13, s15
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[14:15], 0
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[12:13]
|
||||
; GFX10-NEXT: s_cmp_lt_i32 s15, 0
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[0:1], s[12:13]
|
||||
; GFX10-NEXT: s_cselect_b32 s3, -1, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX10-NEXT: s_xor_b32 s0, s2, s3
|
||||
; GFX10-NEXT: s_xor_b32 s0, s3, s2
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
|
||||
; GFX10-NEXT: global_store_byte v2, v3, s[10:11]
|
||||
@ -435,11 +444,12 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_add_u32 s8, s4, s6
|
||||
; GFX11-NEXT: s_addc_u32 s9, s5, s7
|
||||
; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0
|
||||
; GFX11-NEXT: s_cmp_lt_i32 s7, 0
|
||||
; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5]
|
||||
; GFX11-NEXT: s_cselect_b32 s5, -1, 0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
|
||||
; GFX11-NEXT: s_xor_b32 s4, s6, s4
|
||||
; GFX11-NEXT: s_xor_b32 s4, s5, s4
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
@ -478,11 +488,11 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2
|
||||
; SI-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc
|
||||
; SI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
|
||||
; SI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
|
||||
; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0
|
||||
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
|
||||
; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
|
||||
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0
|
||||
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
@ -503,11 +513,11 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2
|
||||
; VI-NEXT: v_addc_u32_e32 v9, vcc, v1, v3, vcc
|
||||
; VI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
|
||||
; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1]
|
||||
; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9]
|
||||
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
|
||||
; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
|
||||
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9]
|
||||
; VI-NEXT: flat_store_byte v[6:7], v0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@ -521,11 +531,11 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
|
||||
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
|
||||
; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
|
||||
; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9]
|
||||
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
|
||||
; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9]
|
||||
; GFX9-NEXT: global_store_byte v6, v0, s[10:11]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
@ -540,9 +550,9 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
|
||||
; GFX10-NEXT: s_xor_b32 s0, vcc_lo, s0
|
||||
; GFX10-NEXT: v_cmp_gt_i32_e64 s0, 0, v3
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
|
||||
; GFX10-NEXT: s_xor_b32 s0, s0, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
|
||||
; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9]
|
||||
; GFX10-NEXT: global_store_byte v6, v0, s[10:11]
|
||||
@ -560,9 +570,9 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
|
||||
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
|
||||
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
|
||||
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
|
||||
; GFX11-NEXT: s_xor_b32 s0, vcc_lo, s0
|
||||
; GFX11-NEXT: v_cmp_gt_i32_e64 s0, 0, v3
|
||||
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
|
||||
; GFX11-NEXT: s_xor_b32 s0, s0, vcc_lo
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
|
||||
@ -430,7 +430,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
|
||||
; GFX6-NEXT: v_add_i32_e32 v4, vcc, v0, v2
|
||||
; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc
|
||||
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
|
||||
; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
|
||||
; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3
|
||||
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v5
|
||||
; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
|
||||
@ -443,7 +443,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
|
||||
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
|
||||
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc
|
||||
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
|
||||
; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
|
||||
; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3
|
||||
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v5
|
||||
; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
|
||||
@ -456,7 +456,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2
|
||||
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
|
||||
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
|
||||
; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
|
||||
; GFX9-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3
|
||||
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v5
|
||||
; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
|
||||
@ -468,7 +468,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3]
|
||||
; GFX10-NEXT: v_cmp_gt_i32_e64 s4, 0, v3
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
|
||||
; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v5
|
||||
; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
|
||||
@ -481,7 +481,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
|
||||
; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
|
||||
; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3]
|
||||
; GFX11-NEXT: v_cmp_gt_i32_e64 s0, 0, v3
|
||||
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
|
||||
; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v5
|
||||
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
|
||||
|
||||
202
llvm/test/CodeGen/AMDGPU/setcc-select.ll
Normal file
202
llvm/test/CodeGen/AMDGPU/setcc-select.ll
Normal file
@ -0,0 +1,202 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s
|
||||
|
||||
define i32 @select.hi32.sgpr.ult(i64 inreg %mask, i32 inreg %a, i32 inreg %b) {
|
||||
; CHECK-LABEL: select.hi32.sgpr.ult:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CHECK-NEXT: s_cmp_lt_u32 s17, 0xaaaaaaaa
|
||||
; CHECK-NEXT: s_cselect_b32 s4, s18, s19
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, s4
|
||||
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||
%mask.hi.z = icmp ult i64 %mask, u0xaaaaaaaa00000000
|
||||
%ret = select i1 %mask.hi.z, i32 %a, i32 %b
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @select.hi32.sgpr.uge(i64 inreg %mask, i32 inreg %a, i32 inreg %b) {
|
||||
; CHECK-LABEL: select.hi32.sgpr.uge:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CHECK-NEXT: s_cmp_gt_u32 s17, 0xaaaaaaa9
|
||||
; CHECK-NEXT: s_cselect_b32 s4, s18, s19
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, s4
|
||||
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||
%mask.hi.nz = icmp uge i64 %mask, u0xaaaaaaaa00000000
|
||||
%ret = select i1 %mask.hi.nz, i32 %a, i32 %b
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @select.hi32.sgpr.ule(i64 inreg %mask, i32 inreg %a, i32 inreg %b) {
|
||||
; CHECK-LABEL: select.hi32.sgpr.ule:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CHECK-NEXT: s_cmp_lt_u32 s17, 0xaaaaaaab
|
||||
; CHECK-NEXT: s_cselect_b32 s4, s18, s19
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, s4
|
||||
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||
%mask.hi.z = icmp ule i64 %mask, u0xaaaaaaaaffffffff
|
||||
%ret = select i1 %mask.hi.z, i32 %a, i32 %b
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @select.hi32.sgpr.ugt(i64 inreg %mask, i32 inreg %a, i32 inreg %b) {
|
||||
; CHECK-LABEL: select.hi32.sgpr.ugt:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CHECK-NEXT: s_cmp_gt_u32 s17, 0xaaaaaaaa
|
||||
; CHECK-NEXT: s_cselect_b32 s4, s18, s19
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, s4
|
||||
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||
%mask.hi.nz = icmp ugt i64 %mask, u0xaaaaaaaaffffffff
|
||||
%ret = select i1 %mask.hi.nz, i32 %a, i32 %b
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @select.hi32.vgpr.ult(i64 %mask, i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: select.hi32.vgpr.ult:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CHECK-NEXT: s_mov_b32 s4, 0xaaaaaaaa
|
||||
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v1
|
||||
; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
||||
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||
%mask.hi.z = icmp ult i64 %mask, u0xaaaaaaaa00000000
|
||||
%ret = select i1 %mask.hi.z, i32 %a, i32 %b
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @select.hi32.vgpr.uge(i64 %mask, i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: select.hi32.vgpr.uge:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CHECK-NEXT: s_mov_b32 s4, 0xaaaaaaa9
|
||||
; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s4, v1
|
||||
; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
||||
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||
%mask.hi.nz = icmp uge i64 %mask, u0xaaaaaaaa00000000
|
||||
%ret = select i1 %mask.hi.nz, i32 %a, i32 %b
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @select.hi32.vgpr.ule(i64 %mask, i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: select.hi32.vgpr.ule:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CHECK-NEXT: s_mov_b32 s4, 0xaaaaaaab
|
||||
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v1
|
||||
; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
||||
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||
%mask.hi.z = icmp ule i64 %mask, u0xaaaaaaaaffffffff
|
||||
%ret = select i1 %mask.hi.z, i32 %a, i32 %b
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @select.hi32.vgpr.ugt(i64 %mask, i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: select.hi32.vgpr.ugt:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CHECK-NEXT: s_mov_b32 s4, 0xaaaaaaaa
|
||||
; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s4, v1
|
||||
; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
||||
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||
%mask.hi.nz = icmp ugt i64 %mask, u0xaaaaaaaaffffffff
|
||||
%ret = select i1 %mask.hi.nz, i32 %a, i32 %b
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @select.hi32.sgpr.multiuse(i64 inreg %mask, i32 inreg %a, i32 inreg %b, i32 inreg %c, i32 inreg %d) {
|
||||
; CHECK-LABEL: select.hi32.sgpr.multiuse:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CHECK-NEXT: s_cmp_lt_u32 s17, 0xaaaaaaaa
|
||||
; CHECK-NEXT: s_cselect_b32 s4, s18, s19
|
||||
; CHECK-NEXT: s_cselect_b32 s5, s20, s21
|
||||
; CHECK-NEXT: s_add_i32 s4, s4, s5
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, s4
|
||||
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||
%mask.hi.nz = icmp ult i64 %mask, u0xaaaaaaaa00000000
|
||||
%ab = select i1 %mask.hi.nz, i32 %a, i32 %b
|
||||
%cd = select i1 %mask.hi.nz, i32 %c, i32 %d
|
||||
%ret = add i32 %ab, %cd
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
|
||||
define i32 @select.hi32.vgpr.multiuse(i64 %mask, i32 %a, i32 %b, i32 %c, i32 %d) {
|
||||
; CHECK-LABEL: select.hi32.vgpr.multiuse:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CHECK-NEXT: s_mov_b32 s4, 0xaaaaaaaa
|
||||
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v1
|
||||
; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
||||
; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
|
||||
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
|
||||
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||
%mask.hi.nz = icmp ult i64 %mask, u0xaaaaaaaa00000000
|
||||
%ab = select i1 %mask.hi.nz, i32 %a, i32 %b
|
||||
%cd = select i1 %mask.hi.nz, i32 %c, i32 %d
|
||||
%ret = add i32 %ab, %cd
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @select.bad.sgpr.ule(i64 inreg %mask, i32 inreg %a, i32 inreg %b) {
|
||||
; CHECK-LABEL: select.bad.sgpr.ule:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, 1
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, 0xaaaaaaaa
|
||||
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1]
|
||||
; CHECK-NEXT: s_and_b64 s[4:5], vcc, exec
|
||||
; CHECK-NEXT: s_cselect_b32 s4, s18, s19
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, s4
|
||||
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||
%test = icmp ule i64 %mask, u0xaaaaaaaa00000000
|
||||
%ret = select i1 %test, i32 %a, i32 %b
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @select.bad.sgpr.ult(i64 inreg %mask, i32 inreg %a, i32 inreg %b) {
|
||||
; CHECK-LABEL: select.bad.sgpr.ult:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, -1
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, 0xaaaaaaaa
|
||||
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1]
|
||||
; CHECK-NEXT: s_and_b64 s[4:5], vcc, exec
|
||||
; CHECK-NEXT: s_cselect_b32 s4, s18, s19
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, s4
|
||||
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||
%test = icmp ult i64 %mask, u0xaaaaaaaaffffffff
|
||||
%ret = select i1 %test, i32 %a, i32 %b
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
|
||||
define i32 @select.bad.vgpr.ule(i64 %mask, i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: select.bad.vgpr.ule:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CHECK-NEXT: s_mov_b32 s4, 1
|
||||
; CHECK-NEXT: s_mov_b32 s5, 0xaaaaaaaa
|
||||
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
|
||||
; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
||||
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||
%test = icmp ule i64 %mask, u0xaaaaaaaa00000000
|
||||
%ret = select i1 %test, i32 %a, i32 %b
|
||||
ret i32 %ret
|
||||
}
|
||||
|
||||
define i32 @select.bad.vgpr.ult(i64 %mask, i32 %a, i32 %b) {
|
||||
; CHECK-LABEL: select.bad.vgpr.ult:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; CHECK-NEXT: s_mov_b32 s4, -1
|
||||
; CHECK-NEXT: s_mov_b32 s5, 0xaaaaaaaa
|
||||
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
|
||||
; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
||||
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||
%test = icmp ult i64 %mask, u0xaaaaaaaaffffffff
|
||||
%ret = select i1 %test, i32 %a, i32 %b
|
||||
ret i32 %ret
|
||||
}
|
||||
@ -1047,7 +1047,7 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o
|
||||
; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 glc
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
|
||||
; SI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[2:3]
|
||||
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3
|
||||
; SI-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
|
||||
; SI-NEXT: v_cndmask_b32_e32 v2, 2, v4, vcc
|
||||
; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
|
||||
@ -1069,10 +1069,10 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
|
||||
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
|
||||
; VI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
|
||||
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc
|
||||
; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc
|
||||
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
|
||||
@ -1089,7 +1089,7 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] glc dlc
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
|
||||
; GFX10-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc
|
||||
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
|
||||
@ -1108,7 +1108,7 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: global_load_b64 v[2:3], v4, s[4:5] glc dlc
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
|
||||
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
|
||||
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
|
||||
; GFX11-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc
|
||||
; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
|
||||
@ -1127,7 +1127,7 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: global_load_b64 v[2:3], v4, s[4:5] scope:SCOPE_SYS
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
|
||||
; GFX12-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
|
||||
; GFX12-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
|
||||
; GFX12-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc
|
||||
; GFX12-NEXT: global_store_b64 v4, v[0:1], s[0:1]
|
||||
|
||||
@ -9,11 +9,11 @@ define amdgpu_kernel void @widen_vselect_and_mask_v4f64(<4 x double> %arg) #0 {
|
||||
; GCN-LABEL: widen_vselect_and_mask_v4f64:
|
||||
; GCN: ; %bb.0: ; %bb
|
||||
; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b32 s6, -1
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], 16
|
||||
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCN-NEXT: s_mov_b32 s6, -1
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, v0
|
||||
@ -22,13 +22,11 @@ define amdgpu_kernel void @widen_vselect_and_mask_v4f64(<4 x double> %arg) #0 {
|
||||
; GCN-NEXT: s_waitcnt expcnt(0)
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3]
|
||||
; GCN-NEXT: v_cmp_neq_f64_e64 s[0:1], s[0:1], 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[1:2]
|
||||
; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
|
||||
; GCN-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
|
||||
; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec
|
||||
; GCN-NEXT: s_cselect_b32 s0, 0x3ff00000, 0
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; GCN-NEXT: s_endpgm
|
||||
@ -54,18 +52,17 @@ define amdgpu_kernel void @widen_vselect_and_mask_v4i64(<4 x i64> %arg) #0 {
|
||||
; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GCN-NEXT: s_mov_b32 s10, -1
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-NEXT: s_mov_b64 s[8:9], 16
|
||||
; GCN-NEXT: s_mov_b32 s11, 0xf000
|
||||
; GCN-NEXT: s_mov_b32 s10, -1
|
||||
; GCN-NEXT: v_mov_b32_e32 v2, v1
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GCN-NEXT: v_mov_b32_e32 v4, v1
|
||||
; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[0:1], 0
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[2:3]
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[2:3]
|
||||
; GCN-NEXT: v_cmp_ne_u64_e64 s[0:1], s[0:1], 0
|
||||
; GCN-NEXT: v_mov_b32_e32 v6, v5
|
||||
; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[5:6]
|
||||
; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
|
||||
; GCN-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
|
||||
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
|
||||
; GCN-NEXT: buffer_store_dwordx4 v[1:4], off, s[8:11], 0
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user