[LLVM][AArch64] Optimize sign bit tests with TST instruction for SIGN_EXTEND patterns (#158061)

In some cases LLVM does not generate optimal code for a sign-bit test on a
sign-extended value. For example, it currently emits:
```
sxtb w8, w0
cmp w8, #0
csel w0, w1, w2, lt
```
This can be optimized to:
```
tst w0, #0x80
csel w0, w1, w2, ne
```

This optimization is only applied when the following conditions are met:
1. The comparison is setlt (signed less than)
2. The right-hand side is zero
3. The left-hand side is a sign extension operation (SIGN_EXTEND or
SIGN_EXTEND_INREG)
4. The sign-extended value has only one use (hasOneUse())
5. The original type is an integer type
This commit is contained in:
guan jian 2025-09-29 16:53:50 +08:00 committed by GitHub
parent edc76e15ed
commit 8d57211d6f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 97 additions and 28 deletions

View File

@ -11778,6 +11778,28 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
return DAG.getNode(ISD::AND, DL, VT, LHS, Shift);
}
// Check for sign bit test patterns that can use TST optimization.
// (SELECT_CC setlt, sign_extend_inreg, 0, tval, fval)
// -> TST %operand, sign_bit; CSEL
// (SELECT_CC setlt, sign_extend, 0, tval, fval)
// -> TST %operand, sign_bit; CSEL
if (CC == ISD::SETLT && RHSC && RHSC->isZero() && LHS.hasOneUse() &&
(LHS.getOpcode() == ISD::SIGN_EXTEND_INREG ||
LHS.getOpcode() == ISD::SIGN_EXTEND)) {
uint64_t SignBitPos;
std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
EVT TestVT = LHS.getValueType();
SDValue SignBitConst = DAG.getConstant(1ULL << SignBitPos, DL, TestVT);
SDValue TST =
DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(TestVT, MVT::i32),
LHS, SignBitConst);
SDValue Flags = TST.getValue(1);
return DAG.getNode(AArch64ISD::CSEL, DL, TVal.getValueType(), TVal, FVal,
DAG.getConstant(AArch64CC::NE, DL, MVT::i32), Flags);
}
// Canonicalise absolute difference patterns:
// select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc ->
// select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc

View File

@ -78,9 +78,8 @@ B:
define i32 @g_i8_sign_extend_inreg(i8 %in, i32 %a, i32 %b) nounwind {
; CHECK-LABEL: g_i8_sign_extend_inreg:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sxtb w8, w0
; CHECK-NEXT: cmp w8, #0
; CHECK-NEXT: csel w8, w1, w2, mi
; CHECK-NEXT: tst w0, #0x80
; CHECK-NEXT: csel w8, w1, w2, ne
; CHECK-NEXT: add w0, w8, w0, uxtb
; CHECK-NEXT: ret
entry:
@ -100,9 +99,8 @@ B:
define i32 @g_i16_sign_extend_inreg(i16 %in, i32 %a, i32 %b) nounwind {
; CHECK-LABEL: g_i16_sign_extend_inreg:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sxth w8, w0
; CHECK-NEXT: cmp w8, #0
; CHECK-NEXT: csel w8, w1, w2, mi
; CHECK-NEXT: tst w0, #0x8000
; CHECK-NEXT: csel w8, w1, w2, ne
; CHECK-NEXT: add w0, w8, w0, uxth
; CHECK-NEXT: ret
entry:
@ -167,10 +165,8 @@ B:
define i64 @g_i32_sign_extend_i64(i32 %in, i64 %a, i64 %b) nounwind {
; CHECK-LABEL: g_i32_sign_extend_i64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-NEXT: sxtw x8, w0
; CHECK-NEXT: cmp x8, #0
; CHECK-NEXT: csel x8, x1, x2, mi
; CHECK-NEXT: tst w0, #0x80000000
; CHECK-NEXT: csel x8, x1, x2, ne
; CHECK-NEXT: add x0, x8, w0, uxtw
; CHECK-NEXT: ret
entry:

View File

@ -2093,3 +2093,54 @@ define <2 x i1> @icmp_slt_v2i64_Zero_LHS(<2 x i64> %a) {
%c = icmp slt <2 x i64> <i64 0, i64 0>, %a
ret <2 x i1> %c
}
; Test TST optimization for i8 sign bit testing with cross-type select
; This tests the pattern: icmp slt i8 %val, 0; select i1 %cmp, i32 %a, i32 %b
; The optimization should convert sxtb+cmp to tst for sign bit testing.
define i32 @i8_signbit_tst_constants(i8 %x, i8 %y) {
; CHECK-SD-LABEL: i8_signbit_tst_constants:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: add w9, w0, w1
; CHECK-SD-NEXT: mov w8, #42 // =0x2a
; CHECK-SD-NEXT: tst w9, #0x80
; CHECK-SD-NEXT: mov w9, #20894 // =0x519e
; CHECK-SD-NEXT: csel w0, w9, w8, ne
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: i8_signbit_tst_constants:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: add w8, w0, w1
; CHECK-GI-NEXT: mov w9, #42 // =0x2a
; CHECK-GI-NEXT: mov w10, #20894 // =0x519e
; CHECK-GI-NEXT: sxtb w8, w8
; CHECK-GI-NEXT: cmp w8, #0
; CHECK-GI-NEXT: csel w0, w10, w9, mi
; CHECK-GI-NEXT: ret
%add = add i8 %x, %y
%cmp = icmp slt i8 %add, 0
%sel = select i1 %cmp, i32 20894, i32 42
ret i32 %sel
}
; Test i8 sign bit testing with variable (non-constant) select values.
; SelectionDAG is expected to emit tst+csel; GlobalISel still emits sxtb+cmp.
define i32 @i8_signbit_variables(i8 %x, i8 %y, i32 %a, i32 %b) {
; CHECK-SD-LABEL: i8_signbit_variables:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: add w8, w0, w1
; CHECK-SD-NEXT: tst w8, #0x80
; CHECK-SD-NEXT: csel w0, w2, w3, ne
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: i8_signbit_variables:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: add w8, w0, w1
; CHECK-GI-NEXT: sxtb w8, w8
; CHECK-GI-NEXT: cmp w8, #0
; CHECK-GI-NEXT: csel w0, w2, w3, mi
; CHECK-GI-NEXT: ret
%add = add i8 %x, %y
%cmp = icmp slt i8 %add, 0
%sel = select i1 %cmp, i32 %a, i32 %b
ret i32 %sel
}

View File

@ -26,9 +26,9 @@ define i32 @reduce_and_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-LABEL: reduce_and_v1i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: smov w8, v0.b[0]
; CHECK-NEXT: cmp w8, #0
; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: umov w8, v0.b[0]
; CHECK-NEXT: tst w8, #0x80
; CHECK-NEXT: csel w0, w0, w1, ne
; CHECK-NEXT: ret
%x = icmp slt <1 x i8> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x)
@ -120,9 +120,9 @@ define i32 @reduce_and_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-LABEL: reduce_and_v1i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: smov w8, v0.h[0]
; CHECK-NEXT: cmp w8, #0
; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: umov w8, v0.h[0]
; CHECK-NEXT: tst w8, #0x8000
; CHECK-NEXT: csel w0, w0, w1, ne
; CHECK-NEXT: ret
%x = icmp slt <1 x i16> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x)
@ -305,9 +305,9 @@ define i32 @reduce_or_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-LABEL: reduce_or_v1i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: smov w8, v0.b[0]
; CHECK-NEXT: cmp w8, #0
; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: umov w8, v0.b[0]
; CHECK-NEXT: tst w8, #0x80
; CHECK-NEXT: csel w0, w0, w1, ne
; CHECK-NEXT: ret
%x = icmp slt <1 x i8> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x)
@ -399,9 +399,9 @@ define i32 @reduce_or_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-LABEL: reduce_or_v1i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: smov w8, v0.h[0]
; CHECK-NEXT: cmp w8, #0
; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: umov w8, v0.h[0]
; CHECK-NEXT: tst w8, #0x8000
; CHECK-NEXT: csel w0, w0, w1, ne
; CHECK-NEXT: ret
%x = icmp slt <1 x i16> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x)
@ -584,9 +584,9 @@ define i32 @reduce_xor_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-LABEL: reduce_xor_v1i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: smov w8, v0.b[0]
; CHECK-NEXT: cmp w8, #0
; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: umov w8, v0.b[0]
; CHECK-NEXT: tst w8, #0x80
; CHECK-NEXT: csel w0, w0, w1, ne
; CHECK-NEXT: ret
%x = icmp slt <1 x i8> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x)
@ -679,9 +679,9 @@ define i32 @reduce_xor_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-LABEL: reduce_xor_v1i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: smov w8, v0.h[0]
; CHECK-NEXT: cmp w8, #0
; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: umov w8, v0.h[0]
; CHECK-NEXT: tst w8, #0x8000
; CHECK-NEXT: csel w0, w0, w1, ne
; CHECK-NEXT: ret
%x = icmp slt <1 x i16> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x)