Luke Lau 598f3535fa
[SelectionDAG] Expand CTTZ_ELTS[_ZERO_POISON] and handle legalization (#188691)
This is a second attempt at "[SelectionDAG] Expand
CTTZ_ELTS[_ZERO_POISON] and handle splitting" (#188220)

That PR had to be reverted in 7d39664a6ae8daaf186b65578492244d96a50bf2
because we had crashes on AMDGPU since we didn't have scalarization
support, and other crashes on PowerPC because we didn't handle the case
when a vector needed to be widened. Tests for these are added in
AMDGPU/cttz-elts.ll, RISCV/rvv/cttz-elts-scalarize.ll and
PowerPC/cttz-elts.ll.

The former crash has been fixed by adding
DAGTypeLegalizer::ScalarizeVecOp_CTTZ_ELTS.

The second crash has been fixed by reworking
TargetLowering::expandCttzElts. The expansion for CTTZ_ELTS is nearly
identical to VECTOR_FIND_LAST_ACTIVE, except it uses a reverse step
vector and subtracts the result from VF. The easiest way to fix these
crashes without introducing regressions is to reuse the
VECTOR_FIND_LAST_ACTIVE expansion, which already handles the case where
the vector needs to be widened.

This means that the node now needs to take in a boolean vector argument
and uses VSELECT instead of an AND to zero out inactive lanes, so the op
promotion code has also been shared.
2026-03-31 07:25:57 +00:00

74 lines
2.7 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn < %s | FileCheck %s
; Single-element <1 x i1> input to llvm.experimental.cttz.elts, called with
; the second argument set to true.
; The expected output contains no comparison on the input at all: it just
; materializes the constant 0 in v0 and returns.
; NOTE(review): the i1 true operand is presumably the "zero is poison" flag,
; which would explain folding to 0 -- confirm against the LangRef.
define i32 @v1i1(<1 x i1> %x) {
; CHECK-LABEL: v1i1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%y = call i32 @llvm.experimental.cttz.elts(<1 x i1> %x, i1 true)
ret i32 %y
}
; Four-element <4 x i1> input to llvm.experimental.cttz.elts, called with the
; second argument set to true.
; The expected sequence masks v0, v1 and v2 down to their low bit, compares
; each against 1, and combines the per-element results with v_cndmask selects
; into the i32 result in v0. v3 is never referenced.
; NOTE(review): v3 presumably holds the fourth element, which would not be
; needed if an all-zero input is poison -- confirm against the AMDGPU calling
; convention and the LangRef.
define i32 @v4i1(<4 x i1> %x) {
; CHECK-LABEL: v4i1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v2, 1, v2
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_and_b32_e32 v1, 1, v1
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v1
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
; CHECK-NEXT: v_add_i32_e64 v1, s[4:5], 1, v1
; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; CHECK-NEXT: s_xor_b64 s[6:7], vcc, -1
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
; CHECK-NEXT: s_xor_b64 s[6:7], s[4:5], -1
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[6:7]
; CHECK-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc
; CHECK-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5]
; CHECK-NEXT: v_or_b32_e32 v0, 2, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 2, v1
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%y = call i32 @llvm.experimental.cttz.elts(<4 x i1> %x, i1 true)
ret i32 %y
}
; Single-element <1 x i8> input to llvm.experimental.cttz.elts, called with
; the second argument set to true.
; As with the <1 x i1> case in this file, the expected output is just the
; constant 0 moved into v0 followed by the return; the input is not inspected.
; NOTE(review): the i1 true operand is presumably the "zero is poison" flag
; -- confirm against the LangRef.
define i32 @v1i8(<1 x i8> %x) {
; CHECK-LABEL: v1i8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%y = call i32 @llvm.experimental.cttz.elts(<1 x i8> %x, i1 true)
ret i32 %y
}
; Four-element <4 x i8> input to llvm.experimental.cttz.elts, called with the
; second argument set to true.
; The expected sequence masks v0, v1 and v2 with 0xff, compares each against
; zero, and combines the per-element results with v_cndmask selects into the
; i32 result in v0. v3 and v4 appear only as written temporaries; the incoming
; value of v3 is never read.
; NOTE(review): v3 presumably carries the fourth element on entry -- confirm
; against the AMDGPU calling convention.
define i32 @v4i8(<4 x i8> %x) {
; CHECK-LABEL: v4i8:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_and_b32_e32 v2, 0xff, v2
; CHECK-NEXT: v_and_b32_e32 v0, 0xff, v0
; CHECK-NEXT: v_and_b32_e32 v1, 0xff, v1
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; CHECK-NEXT: v_addc_u32_e64 v1, s[4:5], 1, 0, vcc
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; CHECK-NEXT: v_cndmask_b32_e32 v1, 1, v4, vcc
; CHECK-NEXT: v_or_b32_e32 v1, 2, v1
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0
; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%y = call i32 @llvm.experimental.cttz.elts(<4 x i8> %x, i1 true)
ret i32 %y
}