woruyu bbcebec3af
[DAG] Refactor X86 combineVSelectWithAllOnesOrZeros fold into a generic DAG Combine (#145298)
This PR resolves https://github.com/llvm/llvm-project/issues/144513

The modification include five pattern :
1.vselect Cond, 0, 0 → 0
2.vselect Cond, -1, 0 → bitcast Cond
3.vselect Cond, -1, x → or Cond, x
4.vselect Cond, x, 0 → and Cond, x
5.vselect Cond, 000..., X -> andn Cond, X

1-4 have been migrated to DAGCombine. 5 still in x86 code.

The reason is that you cannot use the andn instruction directly in
DAGCombine, you can only use and+xor, which will introduce optimization
order issues. For example, in the x86 backend, select Cond, 0, x →
(~Cond) & x, the backend will first check whether the cond node of
(~Cond) is a setcc node. If so, it will modify the comparison operator
of the condition.So the x86 backend cannot complete the optimization of
andn.In short, I think it is a better choice to keep the pattern of
vselect Cond, 000..., X instead of and+xor in combineDAG.

For commit, the first is code changes and x86 test(note 1), the second
is tests in other backend(node 2).

---------

Co-authored-by: Simon Pilgrim <llvm-dev@redking.me.uk>
2025-07-02 15:07:48 +01:00

202 lines
8.3 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs -o - %s | FileCheck %s
define <8 x i16> @concat_add(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; CHECK-LABEL: concat_add:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: mov v1.d[1], v3.d[0]
; CHECK-NEXT: mov v0.d[1], v2.d[0]
; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%x = add <4 x i16> %a, %b
%y = add <4 x i16> %c, %d
%z = shufflevector <4 x i16> %x, <4 x i16> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %z
}
define <8 x i16> @concat_addtunc(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; CHECK-LABEL: concat_addtunc:
; CHECK: // %bb.0:
; CHECK-NEXT: add v2.4s, v2.4s, v3.4s
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; CHECK-NEXT: ret
%x = add <4 x i32> %a, %b
%y = add <4 x i32> %c, %d
%xt = trunc <4 x i32> %x to <4 x i16>
%yt = trunc <4 x i32> %y to <4 x i16>
%z = shufflevector <4 x i16> %xt, <4 x i16> %yt, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %z
}
define <8 x i16> @concat_addtunc2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; CHECK-LABEL: concat_addtunc2:
; CHECK: // %bb.0:
; CHECK-NEXT: uzp1 v1.8h, v1.8h, v3.8h
; CHECK-NEXT: uzp1 v0.8h, v0.8h, v2.8h
; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%at = trunc <4 x i32> %a to <4 x i16>
%bt = trunc <4 x i32> %b to <4 x i16>
%ct = trunc <4 x i32> %c to <4 x i16>
%dt = trunc <4 x i32> %d to <4 x i16>
%x = add <4 x i16> %at, %bt
%y = add <4 x i16> %ct, %dt
%z = shufflevector <4 x i16> %x, <4 x i16> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %z
}
define <8 x i16> @concat_sub(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; CHECK-LABEL: concat_sub:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: mov v1.d[1], v3.d[0]
; CHECK-NEXT: mov v0.d[1], v2.d[0]
; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%x = sub <4 x i16> %a, %b
%y = sub <4 x i16> %c, %d
%z = shufflevector <4 x i16> %x, <4 x i16> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %z
}
define <8 x i16> @concat_mul(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; CHECK-LABEL: concat_mul:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: mov v1.d[1], v3.d[0]
; CHECK-NEXT: mov v0.d[1], v2.d[0]
; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%x = mul <4 x i16> %a, %b
%y = mul <4 x i16> %c, %d
%z = shufflevector <4 x i16> %x, <4 x i16> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %z
}
define <8 x i16> @concat_xor(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; CHECK-LABEL: concat_xor:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: mov v1.d[1], v3.d[0]
; CHECK-NEXT: mov v0.d[1], v2.d[0]
; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%x = xor <4 x i16> %a, %b
%y = xor <4 x i16> %c, %d
%z = shufflevector <4 x i16> %x, <4 x i16> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %z
}
define <8 x half> @concat_fadd(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) {
; CHECK-LABEL: concat_fadd:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: mov v1.d[1], v3.d[0]
; CHECK-NEXT: mov v0.d[1], v2.d[0]
; CHECK-NEXT: fadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%x = fadd <4 x half> %a, %b
%y = fadd <4 x half> %c, %d
%z = shufflevector <4 x half> %x, <4 x half> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x half> %z
}
define <8 x half> @concat_fmul(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) {
; CHECK-LABEL: concat_fmul:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: mov v1.d[1], v3.d[0]
; CHECK-NEXT: mov v0.d[1], v2.d[0]
; CHECK-NEXT: fmul v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%x = fmul <4 x half> %a, %b
%y = fmul <4 x half> %c, %d
%z = shufflevector <4 x half> %x, <4 x half> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x half> %z
}
define <8 x half> @concat_min(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) {
; CHECK-LABEL: concat_min:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: mov v1.d[1], v3.d[0]
; CHECK-NEXT: mov v0.d[1], v2.d[0]
; CHECK-NEXT: fminnm v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%x = call <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b)
%y = call <4 x half> @llvm.minnum.v4f16(<4 x half> %c, <4 x half> %d)
%z = shufflevector <4 x half> %x, <4 x half> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x half> %z
}
define <8 x half> @concat_minmax(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) {
; CHECK-LABEL: concat_minmax:
; CHECK: // %bb.0:
; CHECK-NEXT: fmaxnm v2.4h, v2.4h, v3.4h
; CHECK-NEXT: fminnm v0.4h, v0.4h, v1.4h
; CHECK-NEXT: mov v0.d[1], v2.d[0]
; CHECK-NEXT: ret
%x = call <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b)
%y = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %c, <4 x half> %d)
%z = shufflevector <4 x half> %x, <4 x half> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x half> %z
}
define <16 x i8> @signOf_neon(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
; CHECK-LABEL: signOf_neon:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldp q1, q2, [x0]
; CHECK-NEXT: movi v0.16b, #1
; CHECK-NEXT: ldp q3, q4, [x1]
; CHECK-NEXT: cmhi v5.8h, v1.8h, v3.8h
; CHECK-NEXT: cmhi v6.8h, v2.8h, v4.8h
; CHECK-NEXT: cmhi v1.8h, v3.8h, v1.8h
; CHECK-NEXT: cmhi v2.8h, v4.8h, v2.8h
; CHECK-NEXT: uzp1 v3.16b, v5.16b, v6.16b
; CHECK-NEXT: uzp1 v1.16b, v1.16b, v2.16b
; CHECK-NEXT: and v0.16b, v3.16b, v0.16b
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
; CHECK-NEXT: ret
entry:
%0 = load <8 x i16>, ptr %a, align 2
%add.ptr = getelementptr inbounds i8, ptr %a, i64 16
%1 = load <8 x i16>, ptr %add.ptr, align 2
%2 = load <8 x i16>, ptr %b, align 2
%add.ptr6 = getelementptr inbounds i8, ptr %b, i64 16
%3 = load <8 x i16>, ptr %add.ptr6, align 2
%cmp.i33 = icmp ugt <8 x i16> %0, %2
%cmp.i31 = icmp ugt <8 x i16> %1, %3
%cmp.i29 = icmp ugt <8 x i16> %2, %0
%cmp.i = icmp ugt <8 x i16> %3, %1
%vmovn.i38.neg = zext <8 x i1> %cmp.i33 to <8 x i8>
%vmovn.i37.neg = zext <8 x i1> %cmp.i31 to <8 x i8>
%4 = select <8 x i1> %cmp.i29, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> %vmovn.i38.neg
%5 = select <8 x i1> %cmp.i, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> %vmovn.i37.neg
%or.i = shufflevector <8 x i8> %4, <8 x i8> %5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x i8> %or.i
}