From 26add8a27277f1495ef705d323fdee12975ab950 Mon Sep 17 00:00:00 2001 From: zGoldthorpe Date: Wed, 11 Feb 2026 13:41:03 -0700 Subject: [PATCH] [AMDGPU][ISel] `setcc` peephole for comparisons with upper 32 bits of a 64-bit register pair (#177662) This optimisation is motivated by [this minimal example](https://godbolt.org/z/9o83GvGsM). Particularly, if `mask` is a 64-bit integer, a select ```cpp auto out = ((mask >> 32) != 0) ? a : b; ``` converts the null-check on the higher 32-bits of `mask` into a `v_cmp_lt_u64` with the integer `1 << 32` stored in a pair of VGPRs (effectively wasting two VGPRs). More generally, if a 64-bit integer (whose lower 32 bits are known to be irrelevant) is compared with a 64-bit constant, two VGPRs are wasted to construct this constant. This patch modifies ISel to take advantage of how 64-bit values are stored in pairs of VGPRs (or SGPRs), and truncates the 64-bit constant to its upper 32-bit constant where possible. Alive2 proof for analogous middle-end transformation: https://alive2.llvm.org/ce/z/zizKms --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 20 ++ .../AMDGPU/agpr-copy-no-free-registers.ll | 24 ++- llvm/test/CodeGen/AMDGPU/commute-compares.ll | 15 +- llvm/test/CodeGen/AMDGPU/div_i128.ll | 8 +- llvm/test/CodeGen/AMDGPU/div_v2i128.ll | 108 +++++----- .../AMDGPU/divergence-driven-trunc-to-i1.ll | 19 +- llvm/test/CodeGen/AMDGPU/extract-subvector.ll | 28 +-- llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll | 4 +- llvm/test/CodeGen/AMDGPU/itofp.i128.ll | 12 +- llvm/test/CodeGen/AMDGPU/rem_i128.ll | 6 +- llvm/test/CodeGen/AMDGPU/saddo.ll | 112 +++++----- llvm/test/CodeGen/AMDGPU/saddsat.ll | 10 +- llvm/test/CodeGen/AMDGPU/setcc-select.ll | 202 ++++++++++++++++++ llvm/test/CodeGen/AMDGPU/v_cndmask.ll | 14 +- .../CodeGen/AMDGPU/widen-vselect-and-mask.ll | 15 +- 15 files changed, 414 insertions(+), 183 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/setcc-select.ll diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 202e0afdfccf..fe1d24f7803b 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -17171,6 +17171,26 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, return LHS.getOperand(0); } } + + // setcc v.64, 0xXXXX'XXXX'0000'0000, lt/ge + // => setcc v.hi32, 0xXXXX'XXXX, lt/ge + // + // setcc v.64, 0xXXXX'XXXX'FFFF'FFFF, le/gt + // => setcc v.hi32, 0xXXXX'XXXX, le/gt + if (VT == MVT::i64) { + const uint64_t Mask32 = maskTrailingOnes(32); + const uint64_t CRHSInt = CRHSVal.getZExtValue(); + + if ( // setcc v.64, 0xXXXX'XXXX'0000'0000, lt/ge + ((CRHSInt & Mask32) == 0 && (CC == ISD::SETULT || CC == ISD::SETUGE || + CC == ISD::SETLT || CC == ISD::SETGE)) || + // setcc v.64, 0xXXXX'XXXX'FFFF'FFFF, le/gt + ((CRHSInt & Mask32) == Mask32 && + (CC == ISD::SETULE || CC == ISD::SETUGT || CC == ISD::SETLE || + CC == ISD::SETGT))) + return DAG.getSetCC(SL, N->getValueType(0), getHiHalf64(LHS, DAG), + DAG.getConstant(CRHSInt >> 32, SL, MVT::i32), CC); + } } // Eliminate setcc by using carryout from add/sub instruction diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 7a8ca5674487..3096820df6b7 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -568,17 +568,19 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ; %bb.3: ; %bb14 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX908-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1 +; GFX908-NEXT: s_cmp_lt_i32 s11, 0 ; GFX908-NEXT: s_mov_b32 s13, s12 -; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] +; GFX908-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GFX908-NEXT: s_cmp_gt_i32 s11, -1 ; GFX908-NEXT: v_mov_b32_e32 v4, s12 -; GFX908-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v6 ; GFX908-NEXT: v_mov_b32_e32 v6, s12 ; GFX908-NEXT: v_mov_b32_e32 v8, s12 ; GFX908-NEXT: v_mov_b32_e32 v5, s13 +; GFX908-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX908-NEXT: v_mov_b32_e32 v7, s13 ; GFX908-NEXT: v_mov_b32_e32 v9, s13 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0 +; GFX908-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[2:3] +; GFX908-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v10 ; GFX908-NEXT: v_mov_b32_e32 v11, v5 ; GFX908-NEXT: s_mov_b64 s[20:21], s[14:15] ; GFX908-NEXT: v_mov_b32_e32 v10, v4 @@ -599,9 +601,9 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX908-NEXT: s_add_u32 s20, s20, s4 -; GFX908-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[2:3] ; GFX908-NEXT: s_addc_u32 s21, s21, s5 ; GFX908-NEXT: s_mov_b64 s[22:23], 0 +; GFX908-NEXT: v_cmp_lt_i32_e64 s[24:25], -1, v3 ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[24:25] ; GFX908-NEXT: s_cbranch_vccz .LBB3_9 ; GFX908-NEXT: .LBB3_5: ; %bb16 @@ -728,15 +730,17 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: ; %bb.3: ; %bb14 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GFX90A-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1 +; GFX90A-NEXT: s_cmp_lt_i32 s11, 0 ; GFX90A-NEXT: s_mov_b32 s13, s12 -; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[2:3] +; GFX90A-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GFX90A-NEXT: s_cmp_gt_i32 s11, -1 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[12:13], s[12:13] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v8 +; GFX90A-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1] -; GFX90A-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0 +; GFX90A-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[2:3] ; GFX90A-NEXT: s_mov_b64 s[20:21], s[14:15] +; GFX90A-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v12 ; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s9, v4 @@ -756,8 +760,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX90A-NEXT: s_add_u32 s20, s20, s4 ; GFX90A-NEXT: s_addc_u32 s21, s21, s5 -; GFX90A-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[4:5] ; GFX90A-NEXT: s_mov_b64 s[22:23], 0 +; GFX90A-NEXT: v_cmp_lt_i32_e64 s[24:25], -1, v5 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[24:25] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_9 ; GFX90A-NEXT: .LBB3_5: ; %bb16 diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares.ll b/llvm/test/CodeGen/AMDGPU/commute-compares.ll index ce4609495b0e..e4fb014af46a 100644 --- a/llvm/test/CodeGen/AMDGPU/commute-compares.ll +++ b/llvm/test/CodeGen/AMDGPU/commute-compares.ll @@ -541,19 +541,20 @@ define amdgpu_kernel void @commute_sgt_neg1_i64(ptr addrspace(1) %out, ptr addrs ; GCN-LABEL: commute_sgt_neg1_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] -; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-NEXT: buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:4 +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[3:4] -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: v_not_b32_e32 v0, v3 +; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index 5a4aa4effac0..167ef82a8d94 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -13,8 +13,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v1, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v2, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc -; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] -; GFX9-NEXT: v_ashrrev_i32_e32 v17, 31, v3 +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v1, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v8, v0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v11, vcc @@ -22,9 +21,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, 0, v4 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, 0, v5, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, 0, v6, vcc +; GFX9-NEXT: v_ashrrev_i32_e32 v17, 31, v3 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, 0, v7, vcc -; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7] -; GFX9-NEXT: v_ashrrev_i32_e32 v18, 31, v7 +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v21, v5, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v22, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc @@ -53,6 +52,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_add_u32_e32 v3, 32, v3 ; GFX9-NEXT: v_min_u32_e32 v3, v3, v6 ; GFX9-NEXT: v_ffbh_u32_e32 v6, v8 +; GFX9-NEXT: v_ashrrev_i32_e32 v18, 31, v7 ; GFX9-NEXT: v_add_u32_e32 v6, 32, v6 ; GFX9-NEXT: v_ffbh_u32_e32 v7, v9 ; GFX9-NEXT: v_min_u32_e32 v6, v6, v7 diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index 2b434c54da9c..52410c6d3698 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -8,14 +8,14 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 ; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f ; SDAG-NEXT: v_ashrrev_i32_e32 v24, 31, v3 ; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v11 -; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f ; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v26, v24 ; SDAG-NEXT: v_mov_b32_e32 v27, v25 ; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v2, vcc -; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] +; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3 ; SDAG-NEXT: v_cndmask_b32_e64 v19, v1, v17, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v18, v0, v16, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v3, vcc @@ -23,31 +23,31 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_ffbh_u32_e32 v1, v18 ; SDAG-NEXT: v_ffbh_u32_e32 v2, v19 ; SDAG-NEXT: v_cndmask_b32_e64 v17, v3, v0, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v0, v18, v16 ; SDAG-NEXT: v_sub_i32_e32 v3, vcc, 0, v8 +; SDAG-NEXT: v_or_b32_e32 v0, v18, v16 ; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], 32, v1 ; SDAG-NEXT: v_ffbh_u32_e32 v22, v16 -; SDAG-NEXT: v_or_b32_e32 v1, v19, v17 ; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc +; SDAG-NEXT: v_or_b32_e32 v1, v19, v17 ; SDAG-NEXT: v_min_u32_e32 v2, v21, v2 ; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], 32, v22 ; SDAG-NEXT: v_ffbh_u32_e32 v22, v17 -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v28, v9, v23, s[6:7] -; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v10, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v29, v8, v3, s[6:7] -; SDAG-NEXT: v_min_u32_e32 v1, v21, v22 -; SDAG-NEXT: v_add_i32_e64 v3, s[8:9], 64, v2 +; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v11 +; SDAG-NEXT: v_cndmask_b32_e64 v28, v9, v23, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v29, v8, v3, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, 0, v10, vcc +; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[0:1] +; SDAG-NEXT: v_min_u32_e32 v0, v21, v22 +; SDAG-NEXT: v_add_i32_e64 v1, s[8:9], 64, v2 ; SDAG-NEXT: v_addc_u32_e64 v8, s[8:9], 0, 0, s[8:9] ; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v11, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v0, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v3, s[4:5] ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] ; SDAG-NEXT: v_cndmask_b32_e64 v10, v8, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v8, v3, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v8, v1, v0, vcc ; SDAG-NEXT: v_ffbh_u32_e32 v1, v29 ; SDAG-NEXT: v_ffbh_u32_e32 v21, v28 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v9, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v9, s[4:5] ; SDAG-NEXT: v_or_b32_e32 v0, v29, v2 ; SDAG-NEXT: v_add_i32_e32 v9, vcc, 32, v1 ; SDAG-NEXT: v_ffbh_u32_e32 v11, v2 @@ -57,12 +57,12 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_ffbh_u32_e32 v21, v3 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; SDAG-NEXT: v_min_u32_e32 v0, v11, v21 -; SDAG-NEXT: v_add_i32_e64 v1, s[6:7], 64, v9 -; SDAG-NEXT: v_addc_u32_e64 v9, s[6:7], 0, 0, s[6:7] -; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, 0, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[6:7] -; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] +; SDAG-NEXT: v_add_i32_e64 v1, s[4:5], 64, v9 +; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5] +; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v0, v8 ; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v9, v10, vcc ; SDAG-NEXT: v_xor_b32_e32 v0, 0x7f, v8 @@ -208,7 +208,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_mov_b32_e32 v23, v19 ; SDAG-NEXT: v_subb_u32_e32 v3, vcc, 0, v5, vcc ; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v6, vcc -; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] +; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v7 ; SDAG-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v2, vcc, 0, v7, vcc @@ -226,7 +226,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_ffbh_u32_e32 v30, v9 ; SDAG-NEXT: v_min_u32_e32 v6, v16, v6 ; SDAG-NEXT: v_subb_u32_e32 v16, vcc, 0, v14, vcc -; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15] +; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v15 ; SDAG-NEXT: v_cndmask_b32_e64 v28, v13, v17, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v29, v12, v7, s[4:5] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3] @@ -833,7 +833,7 @@ define <2 x i128> @v_sdiv_v2i128_v_pow2k(<2 x i128> %lhs) { ; SDAG-NEXT: v_mov_b32_e32 v19, v18 ; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc ; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v2, vcc -; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] +; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3 ; SDAG-NEXT: v_cndmask_b32_e64 v13, v1, v10, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v12, v0, v8, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v3, vcc @@ -991,7 +991,7 @@ define <2 x i128> @v_sdiv_v2i128_v_pow2k(<2 x i128> %lhs) { ; SDAG-NEXT: v_mov_b32_e32 v21, v20 ; SDAG-NEXT: v_subb_u32_e32 v3, vcc, 0, v5, vcc ; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v6, vcc -; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] +; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v7 ; SDAG-NEXT: v_cndmask_b32_e64 v11, v5, v3, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v10, v4, v2, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v2, vcc, 0, v7, vcc @@ -2855,61 +2855,61 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 -; SDAG-NEXT: v_ashrrev_i32_e32 v28, 31, v3 ; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f +; SDAG-NEXT: v_ashrrev_i32_e32 v28, 31, v3 ; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v29, v28 ; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v2, vcc -; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] +; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3 ; SDAG-NEXT: v_cndmask_b32_e64 v17, v1, v17, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v16, v0, v16, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, 0, v3, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v0, v2, v18, s[4:5] ; SDAG-NEXT: v_ffbh_u32_e32 v18, v16 ; SDAG-NEXT: v_ffbh_u32_e32 v20, v17 -; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 0, v8 ; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; SDAG-NEXT: v_or_b32_e32 v2, v16, v0 -; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 32, v18 +; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 0, v8 ; SDAG-NEXT: v_ffbh_u32_e32 v22, v0 -; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc +; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 32, v18 ; SDAG-NEXT: v_or_b32_e32 v3, v17, v1 +; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc +; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], 32, v22 ; SDAG-NEXT: v_min_u32_e32 v18, v18, v20 -; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], 32, v22 -; SDAG-NEXT: v_ffbh_u32_e32 v22, v1 -; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v30, v9, v23, s[4:5] -; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v10, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v31, v8, v21, s[4:5] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3] -; SDAG-NEXT: v_min_u32_e32 v3, v20, v22 +; SDAG-NEXT: v_ffbh_u32_e32 v20, v1 +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] +; SDAG-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v11 +; SDAG-NEXT: v_cndmask_b32_e64 v30, v9, v23, s[6:7] +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, 0, v10, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v31, v8, v21, s[6:7] +; SDAG-NEXT: v_min_u32_e32 v3, v22, v20 ; SDAG-NEXT: v_add_i32_e64 v8, s[8:9], 64, v18 -; SDAG-NEXT: v_addc_u32_e64 v18, s[8:9], 0, 0, s[8:9] -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v11, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v9, s[4:5] +; SDAG-NEXT: v_addc_u32_e64 v9, s[8:9], 0, 0, s[8:9] +; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v11, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v20, v9, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v10, v8, v3, vcc ; SDAG-NEXT: v_ffbh_u32_e32 v9, v31 ; SDAG-NEXT: v_ffbh_u32_e32 v21, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v20, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v18, s[6:7] ; SDAG-NEXT: v_or_b32_e32 v8, v31, v2 ; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v9 -; SDAG-NEXT: v_ffbh_u32_e32 v20, v2 +; SDAG-NEXT: v_ffbh_u32_e32 v18, v2 ; SDAG-NEXT: v_or_b32_e32 v9, v30, v3 ; SDAG-NEXT: v_min_u32_e32 v11, v11, v21 -; SDAG-NEXT: v_add_i32_e32 v20, vcc, 32, v20 +; SDAG-NEXT: v_add_i32_e32 v18, vcc, 32, v18 ; SDAG-NEXT: v_ffbh_u32_e32 v21, v3 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; SDAG-NEXT: v_min_u32_e32 v8, v20, v21 -; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 64, v11 -; SDAG-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[4:5] -; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SDAG-NEXT: v_min_u32_e32 v8, v18, v21 +; SDAG-NEXT: v_add_i32_e64 v9, s[6:7], 64, v11 +; SDAG-NEXT: v_addc_u32_e64 v11, s[6:7], 0, 0, s[6:7] +; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, 0, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[6:7] +; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] ; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v8, v10 -; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v18, vcc +; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v20, vcc ; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v10 ; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v19, vcc ; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11] @@ -3051,7 +3051,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_mov_b32_e32 v35, v32 ; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc ; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v6, vcc -; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] +; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v7 ; SDAG-NEXT: v_cndmask_b32_e64 v11, v5, v11, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v10, v4, v10, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v5, vcc, 0, v7, vcc @@ -3069,7 +3069,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_ffbh_u32_e32 v24, v5 ; SDAG-NEXT: v_min_u32_e32 v18, v18, v20 ; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v14, vcc -; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15] +; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v15 ; SDAG-NEXT: v_cndmask_b32_e64 v36, v13, v23, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v37, v12, v21, s[4:5] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll index 0299cc60bfc8..1bbe503bfe07 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll @@ -125,17 +125,14 @@ define amdgpu_kernel void @uniform_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3 ; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3 - ; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[DEF]], %subreg.sub0, killed [[COPY5]], %subreg.sub1 ; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]] ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY6]], implicit-def dead $scc ; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc ; GCN-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $scc - ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GCN-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY killed [[S_MOV_B64_]] - ; GCN-NEXT: [[V_CMP_LT_I64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I64_e64 killed [[REG_SEQUENCE2]], [[COPY8]], implicit $exec - ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], killed [[COPY7]], implicit-def dead $scc + ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: S_CMP_LT_I32 killed [[COPY5]], killed [[S_MOV_B32_2]], implicit-def $scc + ; GCN-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY $scc + ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY8]], killed [[COPY7]], implicit-def dead $scc ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec ; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.2, addrspace 1) ; GCN-NEXT: S_ENDPGM 0 @@ -155,12 +152,12 @@ define i1 @divergent_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x, i1 %z) { ; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[DEF]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 ; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY]], implicit $exec ; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_AND_B32_e64_]], 1, implicit $exec - ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY killed [[S_MOV_B64_]] - ; GCN-NEXT: [[V_CMP_LT_I64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I64_e64 killed [[REG_SEQUENCE]], [[COPY2]], implicit $exec - ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], killed [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 killed [[COPY2]], killed [[S_MOV_B32_]], implicit $exec + ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I32_e64_]], killed [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec ; GCN-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] ; GCN-NEXT: SI_RETURN implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll index 87d7a73c5c01..4279b4f285b6 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll @@ -137,9 +137,9 @@ define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-NEXT: .LBB1_4: ; %exit ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000 -; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5] +; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -1, v5 ; GCN-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc -; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[6:7] +; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -1, v7 ; GCN-NEXT: v_cndmask_b32_e32 v2, -1, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v1, -1 ; GCN-NEXT: v_mov_b32_e32 v3, -1 @@ -205,13 +205,13 @@ define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-NEXT: .LBB2_4: ; %exit ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000 -; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5] +; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 0, v5 ; GCN-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc -; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7] +; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 0, v7 ; GCN-NEXT: v_cndmask_b32_e64 v2, v1, -1, vcc -; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[8:9] +; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 0, v9 ; GCN-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc -; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11] +; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 0, v11 ; GCN-NEXT: v_cndmask_b32_e64 v6, v1, -1, vcc ; GCN-NEXT: v_mov_b32_e32 v1, -1 ; GCN-NEXT: v_mov_b32_e32 v3, -1 @@ -295,14 +295,14 @@ define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-NEXT: .LBB3_4: ; %exit ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000 -; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7] -; GCN-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[8:9] -; GCN-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11] -; GCN-NEXT: v_cmp_gt_i64_e64 s[8:9], 0, v[12:13] -; GCN-NEXT: v_cmp_gt_i64_e64 s[10:11], 0, v[14:15] -; GCN-NEXT: v_cmp_gt_i64_e64 s[12:13], 0, v[16:17] -; GCN-NEXT: v_cmp_gt_i64_e64 s[14:15], 0, v[18:19] -; GCN-NEXT: v_cmp_gt_i64_e64 s[16:17], 0, v[4:5] +; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 0, v7 +; GCN-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v9 +; GCN-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v11 +; GCN-NEXT: v_cmp_gt_i32_e64 s[8:9], 0, v13 +; GCN-NEXT: v_cmp_gt_i32_e64 s[10:11], 0, v15 +; GCN-NEXT: v_cmp_gt_i32_e64 s[12:13], 0, v17 +; GCN-NEXT: v_cmp_gt_i32_e64 s[14:15], 0, v19 +; GCN-NEXT: v_cmp_gt_i32_e64 s[16:17], 0, v5 ; GCN-NEXT: v_cndmask_b32_e64 v0, v1, -1, s[16:17] ; GCN-NEXT: v_cndmask_b32_e64 v2, v1, -1, vcc ; GCN-NEXT: v_cndmask_b32_e64 v4, v1, -1, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll index eee3352fa745..e775ed84f453 100644 --- a/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll +++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll @@ -21,8 +21,7 @@ define bfloat @sitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: v_subb_co_u32_e32 v5, vcc, 0, v1, vcc ; GCN-NEXT: v_subb_co_u32_e32 v6, vcc, 0, v2, vcc ; GCN-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v3, vcc -; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] -; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GCN-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc @@ -40,6 +39,7 @@ define bfloat @sitofp_i128_to_bf16(i128 %x) { ; GCN-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc ; GCN-NEXT: v_sub_u32_e32 v2, 0x80, v7 ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 25, v2 +; GCN-NEXT: ; implicit-def: $vgpr8 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-NEXT: ; %bb.2: ; %itofp-if-else diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll index 5a15ec8ec3c8..c08b0304239f 100644 --- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll @@ -17,8 +17,7 @@ define float @sitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: v_subb_co_u32_e32 v5, vcc, 0, v1, vcc ; SDAG-NEXT: v_subb_co_u32_e32 v6, vcc, 0, v2, vcc ; SDAG-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v3, vcc -; SDAG-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 ; SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc @@ -36,6 +35,7 @@ define float @sitofp_i128_to_f32(i128 %x) { ; SDAG-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc ; SDAG-NEXT: v_sub_u32_e32 v2, 0x80, v7 ; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v2 +; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SDAG-NEXT: ; %bb.2: ; %itofp-if-else @@ -527,8 +527,7 @@ define double @sitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: v_subb_co_u32_e32 v1, vcc, 0, v5, vcc ; SDAG-NEXT: v_subb_co_u32_e32 v6, vcc, 0, v2, vcc ; SDAG-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v3, vcc -; SDAG-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: ; implicit-def: $vgpr10 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 ; SDAG-NEXT: v_cndmask_b32_e32 v6, v2, v6, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v7, v3, v7, vcc @@ -546,6 +545,7 @@ define double @sitofp_i128_to_f64(i128 %x) { ; SDAG-NEXT: v_cndmask_b32_e32 v9, v1, v0, vcc ; SDAG-NEXT: v_sub_u32_e32 v2, 0x80, v9 ; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 54, v2 +; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -1105,8 +1105,7 @@ define half @sitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: v_subb_co_u32_e32 v5, vcc, 0, v1, vcc ; SDAG-NEXT: v_subb_co_u32_e32 v6, vcc, 0, v2, vcc ; SDAG-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v3, vcc -; SDAG-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 ; SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc @@ -1124,6 +1123,7 @@ define half @sitofp_i128_to_f16(i128 %x) { ; SDAG-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc ; SDAG-NEXT: v_sub_u32_e32 v2, 0x80, v7 ; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v2 +; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SDAG-NEXT: ; %bb.2: ; %itofp-if-else diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index 4e1f0c0538bb..941b1fa66c49 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -14,7 +14,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v1, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v2, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc -; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 ; GFX9-NEXT: v_ashrrev_i32_e32 v20, 31, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc @@ -24,8 +24,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v5, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v6, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v7, vcc -; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7] -; GFX9-NEXT: v_mov_b32_e32 v21, v20 +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v22, v5, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v23, v4, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v11, vcc @@ -70,6 +69,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v9, vcc ; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v21, v20 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll index 8861b7726a4c..0e9ea5430d0e 100644 --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -20,18 +20,19 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_add_u32 s10, s2, s8 -; SI-NEXT: s_addc_u32 s11, s3, s9 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1] -; SI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[8:9], 0 ; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_add_u32 s0, s2, s8 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v0 +; SI-NEXT: s_addc_u32 s1, s3, s9 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; SI-NEXT: s_cmp_lt_i32 s9, 0 +; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; SI-NEXT: s_xor_b64 s[2:3], s[2:3], vcc +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -41,18 +42,19 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: s_add_u32 s6, s2, s4 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: s_addc_u32 s7, s3, s5 -; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[4:5], 0 -; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[1:2] ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_add_u32 s0, s2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_addc_u32 s1, s3, s5 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[2:3] +; VI-NEXT: s_cmp_lt_i32 s5, 0 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: s_xor_b64 s[2:3], s[2:3], vcc +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -67,11 +69,12 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; GFX9-NEXT: s_add_u32 s4, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_addc_u32 s5, s3, s7 -; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[6:7], 0 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc +; GFX9-NEXT: s_cmp_lt_i32 s7, 0 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -86,9 +89,10 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s4, s2, s6 ; GFX10-NEXT: s_addc_u32 s5, s3, s7 -; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0 +; GFX10-NEXT: s_cmp_lt_i32 s7, 0 ; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3] -; GFX10-NEXT: s_xor_b32 s2, s6, s2 +; GFX10-NEXT: s_cselect_b32 s3, -1, 0 +; GFX10-NEXT: s_xor_b32 s2, s3, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX10-NEXT: v_add_co_u32 v0, s2, s4, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s5, 0, s2 @@ -104,13 +108,14 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s6, s2, s4 ; GFX11-NEXT: s_addc_u32 s7, s3, s5 -; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[4:5], 0 +; GFX11-NEXT: s_cmp_lt_i32 s5, 0 ; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], s[2:3] -; GFX11-NEXT: s_xor_b32 s2, s4, s2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s2, s3, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s2, s6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm @@ -355,7 +360,8 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_addc_u32 s13, s5, s7 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] -; SI-NEXT: v_cmp_lt_i64_e64 s[4:5], s[6:7], 0 +; SI-NEXT: s_cmp_lt_i32 s7, 0 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 @@ -381,10 +387,11 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_addc_u32 s1, s5, s7 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] +; VI-NEXT: s_cmp_lt_i32 s7, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] -; VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc @@ -402,11 +409,12 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: s_addc_u32 s1, s13, s15 -; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[14:15], 0 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: s_cmp_lt_i32 s15, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9-NEXT: global_store_byte v2, v0, s[10:11] @@ -419,11 +427,12 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s12, s14 ; GFX10-NEXT: s_addc_u32 s1, s13, s15 -; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[14:15], 0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[12:13] +; GFX10-NEXT: s_cmp_lt_i32 s15, 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[0:1], s[12:13] +; GFX10-NEXT: s_cselect_b32 s3, -1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_xor_b32 s0, s2, s3 +; GFX10-NEXT: s_xor_b32 s0, s3, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX10-NEXT: global_store_byte v2, v3, s[10:11] @@ -435,11 +444,12 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s8, s4, s6 ; GFX11-NEXT: s_addc_u32 s9, s5, s7 -; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0 +; GFX11-NEXT: s_cmp_lt_i32 s7, 0 ; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5] +; GFX11-NEXT: s_cselect_b32 s5, -1, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, s8 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9 -; GFX11-NEXT: s_xor_b32 s4, s6, s4 +; GFX11-NEXT: s_xor_b32 s4, s5, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 ; GFX11-NEXT: s_clause 0x1 @@ -478,11 +488,11 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2 ; SI-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc -; SI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] ; SI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1] -; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0 +; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 ; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -503,11 +513,11 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v9, vcc, v1, v3, vcc -; VI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] ; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1] -; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9] +; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 ; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9] ; VI-NEXT: flat_store_byte v[6:7], v0 ; VI-NEXT: s_endpgm ; @@ -521,11 +531,11 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1] -; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9] +; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 ; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9] ; GFX9-NEXT: global_store_byte v6, v0, s[10:11] ; GFX9-NEXT: s_endpgm ; @@ -540,9 +550,9 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo -; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] -; GFX10-NEXT: s_xor_b32 s0, vcc_lo, s0 +; GFX10-NEXT: v_cmp_gt_i32_e64 s0, 0, v3 +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] +; GFX10-NEXT: s_xor_b32 s0, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9] ; GFX10-NEXT: global_store_byte v6, v0, s[10:11] @@ -560,9 +570,9 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo -; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] -; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] -; GFX11-NEXT: s_xor_b32 s0, vcc_lo, s0 +; GFX11-NEXT: v_cmp_gt_i32_e64 s0, 0, v3 +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] +; GFX11-NEXT: s_xor_b32 s0, s0, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll index 9debb88dd0d7..5d5f3be9ce9f 100644 --- a/llvm/test/CodeGen/AMDGPU/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll @@ -430,7 +430,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] +; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc @@ -443,7 +443,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] +; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3 ; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc @@ -456,7 +456,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] +; GFX9-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc @@ -468,7 +468,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo -; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3] +; GFX10-NEXT: v_cmp_gt_i32_e64 s4, 0, v3 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo @@ -481,7 +481,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo -; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3] +; GFX11-NEXT: v_cmp_gt_i32_e64 s0, 0, v3 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v5 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/setcc-select.ll b/llvm/test/CodeGen/AMDGPU/setcc-select.ll new file mode 100644 index 000000000000..30c669c46ac1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/setcc-select.ll @@ -0,0 +1,202 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s + +define i32 @select.hi32.sgpr.ult(i64 inreg %mask, i32 inreg %a, i32 inreg %b) { +; CHECK-LABEL: select.hi32.sgpr.ult: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_cmp_lt_u32 s17, 0xaaaaaaaa +; CHECK-NEXT: s_cselect_b32 s4, s18, s19 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %mask.hi.z = icmp ult i64 %mask, u0xaaaaaaaa00000000 + %ret = select i1 %mask.hi.z, i32 %a, i32 %b + ret i32 %ret +} + +define i32 @select.hi32.sgpr.uge(i64 inreg %mask, i32 inreg %a, i32 inreg %b) { +; CHECK-LABEL: select.hi32.sgpr.uge: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_cmp_gt_u32 s17, 0xaaaaaaa9 +; CHECK-NEXT: s_cselect_b32 s4, s18, s19 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %mask.hi.nz = icmp uge i64 %mask, u0xaaaaaaaa00000000 + %ret = select i1 %mask.hi.nz, i32 %a, i32 %b + ret i32 %ret +} + +define i32 @select.hi32.sgpr.ule(i64 inreg %mask, i32 inreg %a, i32 inreg %b) { +; CHECK-LABEL: select.hi32.sgpr.ule: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_cmp_lt_u32 s17, 0xaaaaaaab +; CHECK-NEXT: s_cselect_b32 s4, s18, s19 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %mask.hi.z = icmp ule i64 %mask, u0xaaaaaaaaffffffff + %ret = select i1 %mask.hi.z, i32 %a, i32 %b + ret i32 %ret +} + +define i32 @select.hi32.sgpr.ugt(i64 inreg %mask, i32 inreg %a, i32 inreg %b) { +; CHECK-LABEL: select.hi32.sgpr.ugt: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_cmp_gt_u32 s17, 0xaaaaaaaa +; CHECK-NEXT: s_cselect_b32 s4, s18, s19 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %mask.hi.nz = icmp ugt i64 %mask, u0xaaaaaaaaffffffff + %ret = select i1 %mask.hi.nz, i32 %a, i32 %b + ret i32 %ret +} + +define i32 @select.hi32.vgpr.ult(i64 %mask, i32 %a, i32 %b) { +; CHECK-LABEL: select.hi32.vgpr.ult: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, 0xaaaaaaaa +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; CHECK-NEXT: s_setpc_b64 s[30:31] + %mask.hi.z = icmp ult i64 %mask, u0xaaaaaaaa00000000 + %ret = select i1 %mask.hi.z, i32 %a, i32 %b + ret i32 %ret +} + +define i32 @select.hi32.vgpr.uge(i64 %mask, i32 %a, i32 %b) { +; CHECK-LABEL: select.hi32.vgpr.uge: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, 0xaaaaaaa9 +; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s4, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; CHECK-NEXT: s_setpc_b64 s[30:31] + %mask.hi.nz = icmp uge i64 %mask, u0xaaaaaaaa00000000 + %ret = select i1 %mask.hi.nz, i32 %a, i32 %b + ret i32 %ret +} + +define i32 @select.hi32.vgpr.ule(i64 %mask, i32 %a, i32 %b) { +; CHECK-LABEL: select.hi32.vgpr.ule: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, 0xaaaaaaab +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; CHECK-NEXT: s_setpc_b64 s[30:31] + %mask.hi.z = icmp ule i64 %mask, u0xaaaaaaaaffffffff + %ret = select i1 %mask.hi.z, i32 %a, i32 %b + ret i32 %ret +} + +define i32 @select.hi32.vgpr.ugt(i64 %mask, i32 %a, i32 %b) { +; CHECK-LABEL: select.hi32.vgpr.ugt: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, 0xaaaaaaaa +; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s4, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; CHECK-NEXT: s_setpc_b64 s[30:31] + %mask.hi.nz = icmp ugt i64 %mask, u0xaaaaaaaaffffffff + %ret = select i1 %mask.hi.nz, i32 %a, i32 %b + ret i32 %ret +} + +define i32 @select.hi32.sgpr.multiuse(i64 inreg %mask, i32 inreg %a, i32 inreg %b, i32 inreg %c, i32 inreg %d) { +; CHECK-LABEL: select.hi32.sgpr.multiuse: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_cmp_lt_u32 s17, 0xaaaaaaaa +; CHECK-NEXT: s_cselect_b32 s4, s18, s19 +; CHECK-NEXT: s_cselect_b32 s5, s20, s21 +; CHECK-NEXT: s_add_i32 s4, s4, s5 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %mask.hi.nz = icmp ult i64 %mask, u0xaaaaaaaa00000000 + %ab = select i1 %mask.hi.nz, i32 %a, i32 %b + %cd = select i1 %mask.hi.nz, i32 %c, i32 %d + %ret = add i32 %ab, %cd + ret i32 %ret +} + + +define i32 @select.hi32.vgpr.multiuse(i64 %mask, i32 %a, i32 %b, i32 %c, i32 %d) { +; CHECK-LABEL: select.hi32.vgpr.multiuse: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, 0xaaaaaaaa +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %mask.hi.nz = icmp ult i64 %mask, u0xaaaaaaaa00000000 + %ab = select i1 %mask.hi.nz, i32 %a, i32 %b + %cd = select i1 %mask.hi.nz, i32 %c, i32 %d + %ret = add i32 %ab, %cd + ret i32 %ret +} + +define i32 @select.bad.sgpr.ule(i64 inreg %mask, i32 inreg %a, i32 inreg %b) { +; CHECK-LABEL: select.bad.sgpr.ule: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: v_mov_b32_e32 v1, 0xaaaaaaaa +; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] +; CHECK-NEXT: s_and_b64 s[4:5], vcc, exec +; CHECK-NEXT: s_cselect_b32 s4, s18, s19 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %test = icmp ule i64 %mask, u0xaaaaaaaa00000000 + %ret = select i1 %test, i32 %a, i32 %b + ret i32 %ret +} + +define i32 @select.bad.sgpr.ult(i64 inreg %mask, i32 inreg %a, i32 inreg %b) { +; CHECK-LABEL: select.bad.sgpr.ult: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, -1 +; CHECK-NEXT: v_mov_b32_e32 v1, 0xaaaaaaaa +; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] +; CHECK-NEXT: s_and_b64 s[4:5], vcc, exec +; CHECK-NEXT: s_cselect_b32 s4, s18, s19 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %test = icmp ult i64 %mask, u0xaaaaaaaaffffffff + %ret = select i1 %test, i32 %a, i32 %b + ret i32 %ret +} + + +define i32 @select.bad.vgpr.ule(i64 %mask, i32 %a, i32 %b) { +; CHECK-LABEL: select.bad.vgpr.ule: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, 1 +; CHECK-NEXT: s_mov_b32 s5, 0xaaaaaaaa +; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] +; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; CHECK-NEXT: s_setpc_b64 s[30:31] + %test = icmp ule i64 %mask, u0xaaaaaaaa00000000 + %ret = select i1 %test, i32 %a, i32 %b + ret i32 %ret +} + +define i32 @select.bad.vgpr.ult(i64 %mask, i32 %a, i32 %b) { +; CHECK-LABEL: select.bad.vgpr.ult: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, -1 +; CHECK-NEXT: s_mov_b32 s5, 0xaaaaaaaa +; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] +; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; CHECK-NEXT: s_setpc_b64 s[30:31] + %test = icmp ult i64 %mask, u0xaaaaaaaaffffffff + %ret = select i1 %test, i32 %a, i32 %b + ret i32 %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll index c3a7e2ae4f34..23172eb2d815 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -1047,7 +1047,7 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o ; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 s[2:3], s[10:11] -; SI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[2:3] +; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3 ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; SI-NEXT: v_cndmask_b32_e32 v2, 2, v4, vcc ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 @@ -1069,10 +1069,10 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1] +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc +; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] @@ -1089,7 +1089,7 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1] +; GFX10-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc ; GFX10-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] @@ -1108,7 +1108,7 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_b64 v[2:3], v4, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1] +; GFX11-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc ; GFX11-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] @@ -1127,7 +1127,7 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_load_b64 v[2:3], v4, s[4:5] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1] +; GFX12-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 ; GFX12-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc ; GFX12-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc ; GFX12-NEXT: global_store_b64 v4, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll b/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll index 69f6c38d55a2..ecb0c8eb9e0b 100644 --- a/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll @@ -9,11 +9,11 @@ define amdgpu_kernel void @widen_vselect_and_mask_v4f64(<4 x double> %arg) #0 { ; GCN-LABEL: widen_vselect_and_mask_v4f64: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 -; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b64 s[4:5], 16 ; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v0 @@ -22,13 +22,11 @@ define amdgpu_kernel void @widen_vselect_and_mask_v4f64(<4 x double> %arg) #0 { ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3] ; GCN-NEXT: v_cmp_neq_f64_e64 s[0:1], s[0:1], 0 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[1:2] +; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 ; GCN-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GCN-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 ; GCN-NEXT: s_mov_b64 s[4:5], 0 -; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm @@ -54,18 +52,17 @@ define amdgpu_kernel void @widen_vselect_and_mask_v4i64(<4 x i64> %arg) #0 { ; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b64 s[4:5], 0 +; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b64 s[8:9], 16 ; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: v_mov_b32_e32 v2, v1 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: v_mov_b32_e32 v4, v1 ; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[0:1], 0 -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[2:3] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[2:3] ; GCN-NEXT: v_cmp_ne_u64_e64 s[0:1], s[0:1], 0 -; GCN-NEXT: v_mov_b32_e32 v6, v5 -; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[5:6] +; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 ; GCN-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GCN-NEXT: buffer_store_dwordx4 v[1:4], off, s[8:11], 0