This is a second attempt at "[SelectionDAG] Expand CTTZ_ELTS[_ZERO_POISON] and handle splitting" (#188220). That PR had to be reverted in 7d39664a6ae8daaf186b65578492244d96a50bf2 because it caused crashes on AMDGPU, where scalarization was not supported, and further crashes on PowerPC, where the case of a vector needing to be widened was not handled. Tests for these are added in AMDGPU/cttz-elts.ll, RISCV/rvv/cttz-elts-scalarize.ll and PowerPC/cttz-elts.ll. The former crash has been fixed by adding DAGTypeLegalizer::ScalarizeVecOp_CTTZ_ELTS. The second crash has been fixed by reworking TargetLowering::expandCttzElts. The expansion for CTTZ_ELTS is nearly identical to VECTOR_FIND_LAST_ACTIVE, except that it uses a reverse step vector and subtracts the result from VF. The easiest way to fix these crashes without introducing regressions is to reuse the VECTOR_FIND_LAST_ACTIVE expansion, which already handles the case where the vector needs to be widened. This means that the node now needs to take in a boolean vector argument and uses VSELECT instead of an AND to zero out inactive lanes, so the op promotion code has also been shared.
74 lines
2.7 KiB
LLVM
74 lines
2.7 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
|
|
; RUN: llc -mtriple=amdgcn < %s | FileCheck %s
|
|
|
|
; v1i1: llvm.experimental.cttz.elts on a single-element i1 vector, with the
; second operand set to `i1 true` (the zero-is-poison flag per the LangRef).
; The expected code only materializes the constant 0 in v0 and returns; no
; compare or select is emitted.
; NOTE(review): presumably this covers scalarization of the one-element vector
; operand on amdgcn - confirm against the commit that added the test.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define i32 @v1i1(<1 x i1> %x) {
|
|
; CHECK-LABEL: v1i1:
|
|
; CHECK: ; %bb.0:
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: v_mov_b32_e32 v0, 0
|
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
|
%y = call i32 @llvm.experimental.cttz.elts(<1 x i1> %x, i1 true)
|
|
ret i32 %y
|
|
}
|
|
|
|
; v4i1: llvm.experimental.cttz.elts on a four-element i1 vector, with the
; second operand set to `i1 true` (the zero-is-poison flag per the LangRef).
; The expected code masks v0, v1 and v2 down to their low bit, compares each
; against 1, and combines the results through a chain of v_cndmask selects.
; The checked sequence never references v3.
; NOTE(review): presumably the fourth element arrives in v3 and need not be
; tested because an all-zero input is poison - confirm.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define i32 @v4i1(<4 x i1> %x) {
|
|
; CHECK-LABEL: v4i1:
|
|
; CHECK: ; %bb.0:
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: v_and_b32_e32 v2, 1, v2
|
|
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
|
|
; CHECK-NEXT: v_and_b32_e32 v1, 1, v1
|
|
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
|
|
; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v1
|
|
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
|
|
; CHECK-NEXT: v_add_i32_e64 v1, s[4:5], 1, v1
|
|
; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
|
|
; CHECK-NEXT: s_xor_b64 s[6:7], vcc, -1
|
|
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
|
|
; CHECK-NEXT: s_xor_b64 s[6:7], s[4:5], -1
|
|
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[6:7]
|
|
; CHECK-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc
|
|
; CHECK-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5]
|
|
; CHECK-NEXT: v_or_b32_e32 v0, 2, v0
|
|
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 2, v1
|
|
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
|
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
|
%y = call i32 @llvm.experimental.cttz.elts(<4 x i1> %x, i1 true)
|
|
ret i32 %y
|
|
}
|
|
|
|
; v1i8: llvm.experimental.cttz.elts on a single-element i8 vector, with the
; second operand set to `i1 true` (the zero-is-poison flag per the LangRef).
; As in the <1 x i1> case, the expected code only materializes the constant 0
; in v0 and returns.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define i32 @v1i8(<1 x i8> %x) {
|
|
; CHECK-LABEL: v1i8:
|
|
; CHECK: ; %bb.0:
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: v_mov_b32_e32 v0, 0
|
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
|
%y = call i32 @llvm.experimental.cttz.elts(<1 x i8> %x, i1 true)
|
|
ret i32 %y
|
|
}
|
|
|
|
; v4i8: llvm.experimental.cttz.elts on a four-element i8 vector, with the
; second operand set to `i1 true` (the zero-is-poison flag per the LangRef).
; The expected code masks v0, v1 and v2 with 0xff, compares each against 0,
; and combines the results through a chain of v_cndmask selects. v3 and v4
; are written before they are read, so they serve only as temporaries.
; NOTE(review): presumably the fourth element arrives in v3 and need not be
; tested because an all-zero input is poison - confirm.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define i32 @v4i8(<4 x i8> %x) {
|
|
; CHECK-LABEL: v4i8:
|
|
; CHECK: ; %bb.0:
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: v_and_b32_e32 v2, 0xff, v2
|
|
; CHECK-NEXT: v_and_b32_e32 v0, 0xff, v0
|
|
; CHECK-NEXT: v_and_b32_e32 v1, 0xff, v1
|
|
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
|
|
; CHECK-NEXT: v_addc_u32_e64 v1, s[4:5], 1, 0, vcc
|
|
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
|
|
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
|
|
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
|
|
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
|
|
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
|
; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
|
|
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
|
|
; CHECK-NEXT: v_cndmask_b32_e32 v1, 1, v4, vcc
|
|
; CHECK-NEXT: v_or_b32_e32 v1, 2, v1
|
|
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0
|
|
; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
|
|
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
|
%y = call i32 @llvm.experimental.cttz.elts(<4 x i8> %x, i1 true)
|
|
ret i32 %y
|
|
}
|