
This patch reassociates `add(add(vecreduce(a), b), add(vecreduce(c), d))` into `add(vecreduce(add(a, c)), add(b, d))`, to combine the reductions into a single node. This comes up after unrolling vectorized loops. There is another small change to move reassociateReduction inside fadd outside of a AllowNewConst block, as new constants will not be created and it should be OK to perform the combine later after legalization.
1425 lines
48 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc --mtriple=aarch64-eabi < %s -global-isel=false | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc --mtriple=aarch64-eabi < %s -global-isel=true | FileCheck %s --check-prefixes=CHECK,CHECK-GI

; Reduction of fadd over two differently sized vectors, combined with fadd.
define float @add_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: add_f32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fadd v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    fadd v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-SD-NEXT:    faddp s0, v0.2s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_f32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fadd v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    faddp v1.4s, v2.4s, v2.4s
; CHECK-GI-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-GI-NEXT:    faddp s1, v1.2s
; CHECK-GI-NEXT:    faddp s0, v0.2s
; CHECK-GI-NEXT:    fadd s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
  %r = fadd fast float %r1, %r2
  ret float %r
}

; Same-width fadd reductions: should merge into one reduction of fadd(a, b).
define float @add_f32_same(<4 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: add_f32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fadd v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-SD-NEXT:    faddp s0, v0.2s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_f32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-GI-NEXT:    faddp v1.4s, v1.4s, v1.4s
; CHECK-GI-NEXT:    faddp s0, v0.2s
; CHECK-GI-NEXT:    faddp s1, v1.2s
; CHECK-GI-NEXT:    fadd s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
  %r = fadd fast float %r1, %r2
  ret float %r
}

; fmul reductions over mixed widths, combined with fmul.
define float @fmul_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmul_f32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    fmul v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmul_f32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fmul v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    mov d3, v2.d[1]
; CHECK-GI-NEXT:    mov d1, v0.d[1]
; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-GI-NEXT:    fmul v1.2s, v2.2s, v3.2s
; CHECK-GI-NEXT:    mov s2, v0.s[1]
; CHECK-GI-NEXT:    mov s3, v1.s[1]
; CHECK-GI-NEXT:    fmul s0, s0, s2
; CHECK-GI-NEXT:    fmul s1, s1, s3
; CHECK-GI-NEXT:    fmul s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
  %r = fmul fast float %r1, %r2
  ret float %r
}

; Same-width fmul reductions combined with fmul.
define float @fmul_f32_same(<4 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmul_f32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmul_f32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d2, v0.d[1]
; CHECK-GI-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
; CHECK-GI-NEXT:    mov s2, v0.s[1]
; CHECK-GI-NEXT:    mov s3, v1.s[1]
; CHECK-GI-NEXT:    fmul s0, s0, s2
; CHECK-GI-NEXT:    fmul s1, s1, s3
; CHECK-GI-NEXT:    fmul s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
  %r = fmul fast float %r1, %r2
  ret float %r
}

; fmin reductions (minnum semantics) over mixed widths, combined with minnum.
define float @fmin_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmin_f32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fminnm v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    fminnm v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    fminnmv s0, v0.4s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmin_f32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fminnm v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    fminnmv s1, v2.4s
; CHECK-GI-NEXT:    fminnmv s0, v0.4s
; CHECK-GI-NEXT:    fminnm s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a)
  %r2 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b)
  %r = call float @llvm.minnum.f32(float %r1, float %r2)
  ret float %r
}

; Same-width fmin reductions combined with minnum.
define float @fmin_f32_same(<4 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmin_f32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fminnm v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    fminnmv s0, v0.4s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmin_f32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fminnmv s0, v0.4s
; CHECK-GI-NEXT:    fminnmv s1, v1.4s
; CHECK-GI-NEXT:    fminnm s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
  %r2 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b)
  %r = call float @llvm.minnum.f32(float %r1, float %r2)
  ret float %r
}

; fmax reductions (maxnum semantics) over mixed widths, combined with maxnum.
define float @fmax_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmax_f32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmaxnm v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    fmaxnm v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    fmaxnmv s0, v0.4s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmax_f32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fmaxnm v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    fmaxnmv s1, v2.4s
; CHECK-GI-NEXT:    fmaxnmv s0, v0.4s
; CHECK-GI-NEXT:    fmaxnm s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a)
  %r2 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b)
  %r = call float @llvm.maxnum.f32(float %r1, float %r2)
  ret float %r
}

; Same-width fmax reductions combined with maxnum.
define float @fmax_f32_same(<4 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmax_f32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmaxnm v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    fmaxnmv s0, v0.4s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmax_f32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fmaxnmv s0, v0.4s
; CHECK-GI-NEXT:    fmaxnmv s1, v1.4s
; CHECK-GI-NEXT:    fmaxnm s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
  %r2 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b)
  %r = call float @llvm.maxnum.f32(float %r1, float %r2)
  ret float %r
}

; fminimum reductions (NaN-propagating) over mixed widths, combined with minimum.
define float @fminimum_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fminimum_f32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmin v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    fmin v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    fminv s0, v0.4s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fminimum_f32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fmin v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    fminv s1, v2.4s
; CHECK-GI-NEXT:    fminv s0, v0.4s
; CHECK-GI-NEXT:    fmin s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> %a)
  %r2 = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %b)
  %r = call float @llvm.minimum.f32(float %r1, float %r2)
  ret float %r
}

; Same-width fminimum reductions combined with minimum.
define float @fminimum_f32_same(<4 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fminimum_f32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmin v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    fminv s0, v0.4s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fminimum_f32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fminv s0, v0.4s
; CHECK-GI-NEXT:    fminv s1, v1.4s
; CHECK-GI-NEXT:    fmin s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %a)
  %r2 = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %b)
  %r = call float @llvm.minimum.f32(float %r1, float %r2)
  ret float %r
}

; fmaximum reductions (NaN-propagating) over mixed widths, combined with maximum.
define float @fmaximum_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmaximum_f32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmax v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    fmax v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    fmaxv s0, v0.4s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmaximum_f32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fmax v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    fmaxv s1, v2.4s
; CHECK-GI-NEXT:    fmaxv s0, v0.4s
; CHECK-GI-NEXT:    fmax s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> %a)
  %r2 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %b)
  %r = call float @llvm.maximum.f32(float %r1, float %r2)
  ret float %r
}

; Same-width fmaximum reductions combined with maximum.
define float @fmaximum_f32_same(<4 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmaximum_f32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmax v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    fmaxv s0, v0.4s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmaximum_f32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fmaxv s0, v0.4s
; CHECK-GI-NEXT:    fmaxv s1, v1.4s
; CHECK-GI-NEXT:    fmax s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %a)
  %r2 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %b)
  %r = call float @llvm.maximum.f32(float %r1, float %r2)
  ret float %r
}

; These next two tests have mismatched minnum/minimum combinations (reduction
; kind differs from the scalar combining op), so they must NOT be merged into a
; single reduction.
define float @fminimumnum_f32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: fminimumnum_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fminv s0, v0.4s
; CHECK-NEXT:    fminv s1, v1.4s
; CHECK-NEXT:    fminnm s0, s0, s1
; CHECK-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %a)
  %r2 = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %b)
  %r = call float @llvm.minnum.f32(float %r1, float %r2)
  ret float %r
}

; Mismatched fmax (maxnum) reductions combined with maximum — must not merge.
define float @fmaxnumimum_f32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: fmaxnumimum_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmaxnmv s0, v0.4s
; CHECK-NEXT:    fmaxnmv s1, v1.4s
; CHECK-NEXT:    fmax s0, s0, s1
; CHECK-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
  %r2 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b)
  %r = call float @llvm.maximum.f32(float %r1, float %r2)
  ret float %r
}

; Integer add reductions over mixed widths, combined with add.
define i32 @add_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: add_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    add v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    addv s1, v2.4s
; CHECK-GI-NEXT:    addv s0, v0.4s
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    add w0, w8, w9
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %b)
  %r = add i32 %r1, %r2
  ret i32 %r
}

; Add reductions of zero-extended inputs; SDAG can use uaddlp/uadalp pairwise adds.
define i16 @add_ext_i16(<16 x i8> %a, <16 x i8> %b) {
; CHECK-SD-LABEL: add_ext_i16:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    uaddlp v1.8h, v1.16b
; CHECK-SD-NEXT:    uadalp v1.8h, v0.16b
; CHECK-SD-NEXT:    addv h0, v1.8h
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_ext_i16:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    uaddlv h0, v0.16b
; CHECK-GI-NEXT:    uaddlv h1, v1.16b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    add w0, w8, w9
; CHECK-GI-NEXT:    ret
  %ae = zext <16 x i8> %a to <16 x i16>
  %be = zext <16 x i8> %b to <16 x i16>
  %r1 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %ae)
  %r2 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %be)
  %r = add i16 %r1, %r2
  ret i16 %r
}

; Mixed-width add reductions of zero-extended inputs.
define i16 @add_ext_v32i16(<32 x i8> %a, <16 x i8> %b) {
; CHECK-SD-LABEL: add_ext_v32i16:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    uaddl2 v3.8h, v0.16b, v1.16b
; CHECK-SD-NEXT:    uaddl v0.8h, v0.8b, v1.8b
; CHECK-SD-NEXT:    add v0.8h, v0.8h, v3.8h
; CHECK-SD-NEXT:    uadalp v0.8h, v2.16b
; CHECK-SD-NEXT:    addv h0, v0.8h
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_ext_v32i16:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    uaddlv h0, v0.16b
; CHECK-GI-NEXT:    uaddlv h1, v1.16b
; CHECK-GI-NEXT:    uaddlv h2, v2.16b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    add w8, w8, w9
; CHECK-GI-NEXT:    fmov w9, s2
; CHECK-GI-NEXT:    add w0, w8, w9
; CHECK-GI-NEXT:    ret
  %ae = zext <32 x i8> %a to <32 x i16>
  %be = zext <16 x i8> %b to <16 x i16>
  %r1 = call i16 @llvm.vector.reduce.add.i16.v32i16(<32 x i16> %ae)
  %r2 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %be)
  %r = add i16 %r1, %r2
  ret i16 %r
}

; Integer mul reductions over mixed widths, combined with mul.
define i32 @mul_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: mul_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    mul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    mul v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    mul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT:    mov w8, v0.s[1]
; CHECK-SD-NEXT:    fmov w9, s0
; CHECK-SD-NEXT:    mul w0, w9, w8
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: mul_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d3, v0.d[1]
; CHECK-GI-NEXT:    mov d4, v1.d[1]
; CHECK-GI-NEXT:    mul v0.2s, v0.2s, v3.2s
; CHECK-GI-NEXT:    mul v1.2s, v1.2s, v4.2s
; CHECK-GI-NEXT:    mov d3, v2.d[1]
; CHECK-GI-NEXT:    mul v0.2s, v0.2s, v1.2s
; CHECK-GI-NEXT:    mul v1.2s, v2.2s, v3.2s
; CHECK-GI-NEXT:    mov w8, v0.s[1]
; CHECK-GI-NEXT:    fmov w10, s0
; CHECK-GI-NEXT:    mov w9, v1.s[1]
; CHECK-GI-NEXT:    mul w8, w10, w8
; CHECK-GI-NEXT:    fmov w10, s1
; CHECK-GI-NEXT:    mul w9, w10, w9
; CHECK-GI-NEXT:    mul w0, w8, w9
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.mul.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %b)
  %r = mul i32 %r1, %r2
  ret i32 %r
}

; Same-width mul reductions combined with mul.
define i32 @mul_i32_same(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: mul_i32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    mul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    mul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT:    mov w8, v0.s[1]
; CHECK-SD-NEXT:    fmov w9, s0
; CHECK-SD-NEXT:    mul w0, w9, w8
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: mul_i32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d2, v0.d[1]
; CHECK-GI-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NEXT:    mul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT:    mul v1.2s, v1.2s, v3.2s
; CHECK-GI-NEXT:    mov w8, v0.s[1]
; CHECK-GI-NEXT:    mov w9, v1.s[1]
; CHECK-GI-NEXT:    fmov w10, s0
; CHECK-GI-NEXT:    fmov w11, s1
; CHECK-GI-NEXT:    mul w8, w10, w8
; CHECK-GI-NEXT:    mul w9, w11, w9
; CHECK-GI-NEXT:    mul w0, w8, w9
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %b)
  %r = mul i32 %r1, %r2
  ret i32 %r
}

; AND reductions over mixed widths, combined with and.
define i32 @and_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: and_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT:    and v0.16b, v0.16b, v2.16b
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT:    fmov x8, d0
; CHECK-SD-NEXT:    lsr x9, x8, #32
; CHECK-SD-NEXT:    and w0, w8, w9
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: and_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT:    mov d1, v2.d[1]
; CHECK-GI-NEXT:    mov d3, v0.d[1]
; CHECK-GI-NEXT:    and v1.8b, v2.8b, v1.8b
; CHECK-GI-NEXT:    and v0.8b, v0.8b, v3.8b
; CHECK-GI-NEXT:    mov w8, v1.s[1]
; CHECK-GI-NEXT:    fmov w10, s1
; CHECK-GI-NEXT:    mov w9, v0.s[1]
; CHECK-GI-NEXT:    fmov w11, s0
; CHECK-GI-NEXT:    and w8, w10, w8
; CHECK-GI-NEXT:    and w8, w11, w8
; CHECK-GI-NEXT:    and w0, w8, w9
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.and.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %b)
  %r = and i32 %r1, %r2
  ret i32 %r
}

; Same-width AND reductions combined with and.
define i32 @and_i32_same(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: and_i32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT:    fmov x8, d0
; CHECK-SD-NEXT:    lsr x9, x8, #32
; CHECK-SD-NEXT:    and w0, w8, w9
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: and_i32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d2, v0.d[1]
; CHECK-GI-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT:    and v1.8b, v1.8b, v3.8b
; CHECK-GI-NEXT:    mov w8, v0.s[1]
; CHECK-GI-NEXT:    mov w9, v1.s[1]
; CHECK-GI-NEXT:    fmov w10, s0
; CHECK-GI-NEXT:    fmov w11, s1
; CHECK-GI-NEXT:    and w8, w10, w8
; CHECK-GI-NEXT:    and w9, w11, w9
; CHECK-GI-NEXT:    and w0, w8, w9
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %b)
  %r = and i32 %r1, %r2
  ret i32 %r
}

; OR reductions over mixed widths, combined with or.
define i32 @or_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: or_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    orr v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT:    orr v0.16b, v0.16b, v2.16b
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    orr v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT:    fmov x8, d0
; CHECK-SD-NEXT:    lsr x9, x8, #32
; CHECK-SD-NEXT:    orr w0, w8, w9
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: or_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    orr v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT:    mov d1, v2.d[1]
; CHECK-GI-NEXT:    mov d3, v0.d[1]
; CHECK-GI-NEXT:    orr v1.8b, v2.8b, v1.8b
; CHECK-GI-NEXT:    orr v0.8b, v0.8b, v3.8b
; CHECK-GI-NEXT:    mov w8, v1.s[1]
; CHECK-GI-NEXT:    fmov w10, s1
; CHECK-GI-NEXT:    mov w9, v0.s[1]
; CHECK-GI-NEXT:    fmov w11, s0
; CHECK-GI-NEXT:    orr w8, w10, w8
; CHECK-GI-NEXT:    orr w8, w11, w8
; CHECK-GI-NEXT:    orr w0, w8, w9
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.or.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %b)
  %r = or i32 %r1, %r2
  ret i32 %r
}

; Same-width OR reductions combined with or.
define i32 @or_i32_same(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: or_i32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    orr v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    orr v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT:    fmov x8, d0
; CHECK-SD-NEXT:    lsr x9, x8, #32
; CHECK-SD-NEXT:    orr w0, w8, w9
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: or_i32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d2, v0.d[1]
; CHECK-GI-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NEXT:    orr v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT:    orr v1.8b, v1.8b, v3.8b
; CHECK-GI-NEXT:    mov w8, v0.s[1]
; CHECK-GI-NEXT:    mov w9, v1.s[1]
; CHECK-GI-NEXT:    fmov w10, s0
; CHECK-GI-NEXT:    fmov w11, s1
; CHECK-GI-NEXT:    orr w8, w10, w8
; CHECK-GI-NEXT:    orr w9, w11, w9
; CHECK-GI-NEXT:    orr w0, w8, w9
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %b)
  %r = or i32 %r1, %r2
  ret i32 %r
}

; XOR reductions over mixed widths, combined with xor.
define i32 @xor_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: xor_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    eor v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT:    eor v0.16b, v0.16b, v2.16b
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    eor v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT:    fmov x8, d0
; CHECK-SD-NEXT:    lsr x9, x8, #32
; CHECK-SD-NEXT:    eor w0, w8, w9
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: xor_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    eor v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT:    mov d1, v2.d[1]
; CHECK-GI-NEXT:    mov d3, v0.d[1]
; CHECK-GI-NEXT:    eor v1.8b, v2.8b, v1.8b
; CHECK-GI-NEXT:    eor v0.8b, v0.8b, v3.8b
; CHECK-GI-NEXT:    mov w8, v1.s[1]
; CHECK-GI-NEXT:    fmov w10, s1
; CHECK-GI-NEXT:    mov w9, v0.s[1]
; CHECK-GI-NEXT:    fmov w11, s0
; CHECK-GI-NEXT:    eor w8, w10, w8
; CHECK-GI-NEXT:    eor w8, w11, w8
; CHECK-GI-NEXT:    eor w0, w8, w9
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.xor.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %b)
  %r = xor i32 %r1, %r2
  ret i32 %r
}

; Same-width XOR reductions combined with xor.
define i32 @xor_i32_same(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: xor_i32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    eor v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    eor v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT:    fmov x8, d0
; CHECK-SD-NEXT:    lsr x9, x8, #32
; CHECK-SD-NEXT:    eor w0, w8, w9
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: xor_i32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d2, v0.d[1]
; CHECK-GI-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NEXT:    eor v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT:    eor v1.8b, v1.8b, v3.8b
; CHECK-GI-NEXT:    mov w8, v0.s[1]
; CHECK-GI-NEXT:    mov w9, v1.s[1]
; CHECK-GI-NEXT:    fmov w10, s0
; CHECK-GI-NEXT:    fmov w11, s1
; CHECK-GI-NEXT:    eor w8, w10, w8
; CHECK-GI-NEXT:    eor w9, w11, w9
; CHECK-GI-NEXT:    eor w0, w8, w9
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %b)
  %r = xor i32 %r1, %r2
  ret i32 %r
}

; Unsigned min reductions over mixed widths, combined with umin.
define i32 @umin_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: umin_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    umin v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    umin v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    uminv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: umin_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    umin v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    uminv s1, v2.4s
; CHECK-GI-NEXT:    uminv s0, v0.4s
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    cmp w8, w9
; CHECK-GI-NEXT:    fcsel s0, s0, s1, lo
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.umin.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.umin.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

; Same-width umin reductions combined with umin.
define i32 @umin_i32_same(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: umin_i32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    umin v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    uminv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: umin_i32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    uminv s0, v0.4s
; CHECK-GI-NEXT:    uminv s1, v1.4s
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    cmp w8, w9
; CHECK-GI-NEXT:    fcsel s0, s0, s1, lo
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.umin.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

; Unsigned max reductions over mixed widths, combined with umax.
define i32 @umax_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: umax_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    umax v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    umax v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    umaxv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: umax_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    umax v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    umaxv s1, v2.4s
; CHECK-GI-NEXT:    umaxv s0, v0.4s
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    cmp w8, w9
; CHECK-GI-NEXT:    fcsel s0, s0, s1, hi
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.umax.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.umax.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

; Same-width umax reductions combined with umax.
define i32 @umax_i32_same(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: umax_i32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    umax v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    umaxv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: umax_i32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    umaxv s0, v0.4s
; CHECK-GI-NEXT:    umaxv s1, v1.4s
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    cmp w8, w9
; CHECK-GI-NEXT:    fcsel s0, s0, s1, hi
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.umax.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

; Signed min reductions over mixed widths, combined with smin.
define i32 @smin_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: smin_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    smin v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    smin v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    sminv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: smin_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    smin v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    sminv s1, v2.4s
; CHECK-GI-NEXT:    sminv s0, v0.4s
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    cmp w8, w9
; CHECK-GI-NEXT:    fcsel s0, s0, s1, lt
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.smin.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.smin.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

; Same-width smin reductions combined with smin.
define i32 @smin_i32_same(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: smin_i32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    smin v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    sminv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: smin_i32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    sminv s0, v0.4s
; CHECK-GI-NEXT:    sminv s1, v1.4s
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    cmp w8, w9
; CHECK-GI-NEXT:    fcsel s0, s0, s1, lt
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.smin.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

; Signed max reductions over mixed widths, combined with smax.
define i32 @smax_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: smax_i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    smax v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    smax v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    smaxv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: smax_i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    smax v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    smaxv s1, v2.4s
; CHECK-GI-NEXT:    smaxv s0, v0.4s
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    cmp w8, w9
; CHECK-GI-NEXT:    fcsel s0, s0, s1, gt
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.smax.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.smax.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

; Same-width smax reductions combined with smax.
define i32 @smax_i32_same(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: smax_i32_same:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    smax v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    smaxv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: smax_i32_same:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    smaxv s0, v0.4s
; CHECK-GI-NEXT:    smaxv s1, v1.4s
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    fmov w9, s1
; CHECK-GI-NEXT:    cmp w8, w9
; CHECK-GI-NEXT:    fcsel s0, s0, s1, gt
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.smax.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

; add(add(vecreduce(a), c), add(vecreduce(b), d)) with fast-math: SDAG should
; reassociate into add(vecreduce(add(a, b)), add(c, d)), using one reduction.
define float @nested_fadd_f32(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-SD-LABEL: nested_fadd_f32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fadd v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    fadd s2, s2, s3
; CHECK-SD-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-SD-NEXT:    faddp s0, v0.2s
; CHECK-SD-NEXT:    fadd s0, s0, s2
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: nested_fadd_f32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-GI-NEXT:    faddp v1.4s, v1.4s, v1.4s
; CHECK-GI-NEXT:    faddp s0, v0.2s
; CHECK-GI-NEXT:    faddp s1, v1.2s
; CHECK-GI-NEXT:    fadd s0, s0, s2
; CHECK-GI-NEXT:    fadd s1, s1, s3
; CHECK-GI-NEXT:    fadd s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
  %a1 = fadd fast float %r1, %c
  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
  %a2 = fadd fast float %r2, %d
  %r = fadd fast float %a1, %a2
  ret float %r
}

; Same shape as nested_fadd_f32 but without fast-math flags: reassociation is
; not allowed, so the sequential scalarized reductions must be kept.
define float @nested_fadd_f32_slow(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-SD-LABEL: nested_fadd_f32_slow:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    mov s4, v1.s[2]
; CHECK-SD-NEXT:    mov s5, v0.s[2]
; CHECK-SD-NEXT:    faddp s6, v0.2s
; CHECK-SD-NEXT:    faddp s7, v1.2s
; CHECK-SD-NEXT:    mov s1, v1.s[3]
; CHECK-SD-NEXT:    mov s0, v0.s[3]
; CHECK-SD-NEXT:    fadd s5, s6, s5
; CHECK-SD-NEXT:    fadd s4, s7, s4
; CHECK-SD-NEXT:    fadd s0, s5, s0
; CHECK-SD-NEXT:    fadd s1, s4, s1
; CHECK-SD-NEXT:    fadd s0, s0, s2
; CHECK-SD-NEXT:    fadd s1, s1, s3
; CHECK-SD-NEXT:    fadd s0, s0, s1
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: nested_fadd_f32_slow:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov s4, v0.s[2]
; CHECK-GI-NEXT:    faddp s5, v0.2s
; CHECK-GI-NEXT:    mov s6, v1.s[2]
; CHECK-GI-NEXT:    faddp s7, v1.2s
; CHECK-GI-NEXT:    mov s0, v0.s[3]
; CHECK-GI-NEXT:    mov s1, v1.s[3]
; CHECK-GI-NEXT:    fadd s4, s5, s4
; CHECK-GI-NEXT:    fadd s5, s7, s6
; CHECK-GI-NEXT:    fadd s0, s4, s0
; CHECK-GI-NEXT:    fadd s1, s5, s1
; CHECK-GI-NEXT:    fadd s0, s0, s2
; CHECK-GI-NEXT:    fadd s1, s1, s3
; CHECK-GI-NEXT:    fadd s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
  %a1 = fadd float %r1, %c
  %r2 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
  %a2 = fadd float %r2, %d
  %r = fadd float %a1, %a2
  ret float %r
}

; Nested fast fmul reductions: SDAG combines the two fmul reductions into a
; single vector fmul reduction tree.
define float @nested_mul_f32(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-SD-LABEL: nested_mul_f32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: fmul s2, s2, s3
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: fmul s0, s0, v0.s[1]
; CHECK-SD-NEXT: fmul s0, s0, s2
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_mul_f32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d4, v0.d[1]
; CHECK-GI-NEXT: mov d5, v1.d[1]
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v4.2s
; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v5.2s
; CHECK-GI-NEXT: mov s4, v0.s[1]
; CHECK-GI-NEXT: mov s5, v1.s[1]
; CHECK-GI-NEXT: fmul s0, s0, s4
; CHECK-GI-NEXT: fmul s1, s1, s5
; CHECK-GI-NEXT: fmul s0, s0, s2
; CHECK-GI-NEXT: fmul s1, s1, s3
; CHECK-GI-NEXT: fmul s0, s0, s1
; CHECK-GI-NEXT: ret
  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
  %a1 = fmul fast float %r1, %c
  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
  %a2 = fmul fast float %r2, %d
  %r = fmul fast float %a1, %a2
  ret float %r
}

; Integer add is always reassociable: SDAG merges the two add reductions into
; one add.4s + addv.
define i32 @nested_add_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_add_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: add w8, w0, w1
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w9, s0
; CHECK-SD-NEXT: add w0, w9, w8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_add_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: addv s0, v0.4s
; CHECK-GI-NEXT: addv s1, v1.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w8, w8, w0
; CHECK-GI-NEXT: add w9, w9, w1
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  %a1 = add i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
  %a2 = add i32 %r2, %d
  %r = add i32 %a1, %a2
  ret i32 %r
}

; Commuted variant: the first reduction is the RHS of its add
; (add(c, vecreduce(a))). The combine still fires for SDAG.
define i32 @nested_add_c1_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_add_c1_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: add w8, w0, w1
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w9, s0
; CHECK-SD-NEXT: add w0, w9, w8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_add_c1_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: addv s0, v0.4s
; CHECK-GI-NEXT: addv s1, v1.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w8, w0, w8
; CHECK-GI-NEXT: add w9, w9, w1
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  %a1 = add i32 %c, %r1
  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
  %a2 = add i32 %r2, %d
  %r = add i32 %a1, %a2
  ret i32 %r
}

; Commuted variant: the second reduction is the RHS of its add
; (add(d, vecreduce(b))). The combine still fires for SDAG.
define i32 @nested_add_c2_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_add_c2_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: add w8, w0, w1
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w9, s0
; CHECK-SD-NEXT: add w0, w9, w8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_add_c2_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: addv s0, v0.4s
; CHECK-GI-NEXT: addv s1, v1.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w8, w8, w0
; CHECK-GI-NEXT: add w9, w1, w9
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  %a1 = add i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
  %a2 = add i32 %d, %r2
  %r = add i32 %a1, %a2
  ret i32 %r
}

; Four add reductions: repeated reassociation collapses everything into a
; single addv for SDAG.
define i32 @nested_add_manyreduct_i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; CHECK-SD-LABEL: nested_add_manyreduct_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: add v1.4s, v1.4s, v3.4s
; CHECK-SD-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_add_manyreduct_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: addv s0, v0.4s
; CHECK-GI-NEXT: addv s2, v2.4s
; CHECK-GI-NEXT: addv s1, v1.4s
; CHECK-GI-NEXT: addv s3, v3.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s2
; CHECK-GI-NEXT: fmov w10, s1
; CHECK-GI-NEXT: fmov w11, s3
; CHECK-GI-NEXT: add w8, w8, w9
; CHECK-GI-NEXT: add w9, w10, w11
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  %r3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %c)
  %a1 = add i32 %r1, %r3
  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
  %r4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %d)
  %a2 = add i32 %r2, %r4
  %r = add i32 %a1, %a2
  ret i32 %r
}

; Nested mul reductions: combined into one vector mul reduction for SDAG.
define i32 @nested_mul_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_mul_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: mul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: mul w8, w0, w1
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: mov w9, v0.s[1]
; CHECK-SD-NEXT: fmov w10, s0
; CHECK-SD-NEXT: mul w9, w10, w9
; CHECK-SD-NEXT: mul w0, w9, w8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_mul_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d2, v0.d[1]
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: mul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT: mul v1.2s, v1.2s, v3.2s
; CHECK-GI-NEXT: mov w8, v0.s[1]
; CHECK-GI-NEXT: fmov w10, s0
; CHECK-GI-NEXT: mov w9, v1.s[1]
; CHECK-GI-NEXT: mul w8, w10, w8
; CHECK-GI-NEXT: fmov w10, s1
; CHECK-GI-NEXT: mul w9, w10, w9
; CHECK-GI-NEXT: mul w8, w8, w0
; CHECK-GI-NEXT: mul w9, w9, w1
; CHECK-GI-NEXT: mul w0, w8, w9
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a)
  %a1 = mul i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %b)
  %a2 = mul i32 %r2, %d
  %r = mul i32 %a1, %a2
  ret i32 %r
}

; Nested and reductions: combined into one vector and reduction for SDAG.
define i32 @nested_and_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_and_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT: and w8, w0, w1
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: fmov x9, d0
; CHECK-SD-NEXT: lsr x10, x9, #32
; CHECK-SD-NEXT: and w8, w9, w8
; CHECK-SD-NEXT: and w0, w8, w10
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_and_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d2, v0.d[1]
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT: and v1.8b, v1.8b, v3.8b
; CHECK-GI-NEXT: mov w8, v0.s[1]
; CHECK-GI-NEXT: mov w9, v1.s[1]
; CHECK-GI-NEXT: fmov w10, s0
; CHECK-GI-NEXT: fmov w11, s1
; CHECK-GI-NEXT: and w10, w10, w0
; CHECK-GI-NEXT: and w11, w11, w1
; CHECK-GI-NEXT: and w8, w10, w8
; CHECK-GI-NEXT: and w9, w11, w9
; CHECK-GI-NEXT: and w0, w8, w9
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
  %a1 = and i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %b)
  %a2 = and i32 %r2, %d
  %r = and i32 %a1, %a2
  ret i32 %r
}

; Nested or reductions: combined into one vector or reduction for SDAG.
define i32 @nested_or_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_or_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT: orr w8, w0, w1
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: fmov x9, d0
; CHECK-SD-NEXT: lsr x10, x9, #32
; CHECK-SD-NEXT: orr w8, w9, w8
; CHECK-SD-NEXT: orr w0, w8, w10
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_or_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d2, v0.d[1]
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: orr v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT: orr v1.8b, v1.8b, v3.8b
; CHECK-GI-NEXT: mov w8, v0.s[1]
; CHECK-GI-NEXT: mov w9, v1.s[1]
; CHECK-GI-NEXT: fmov w10, s0
; CHECK-GI-NEXT: fmov w11, s1
; CHECK-GI-NEXT: orr w10, w10, w0
; CHECK-GI-NEXT: orr w11, w11, w1
; CHECK-GI-NEXT: orr w8, w10, w8
; CHECK-GI-NEXT: orr w9, w11, w9
; CHECK-GI-NEXT: orr w0, w8, w9
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
  %a1 = or i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %b)
  %a2 = or i32 %r2, %d
  %r = or i32 %a1, %a2
  ret i32 %r
}

; Nested xor reductions: combined into one vector xor reduction for SDAG.
define i32 @nested_xor_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_xor_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: eor v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT: eor w8, w0, w1
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: fmov x9, d0
; CHECK-SD-NEXT: lsr x10, x9, #32
; CHECK-SD-NEXT: eor w8, w9, w8
; CHECK-SD-NEXT: eor w0, w8, w10
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_xor_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d2, v0.d[1]
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: eor v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT: eor v1.8b, v1.8b, v3.8b
; CHECK-GI-NEXT: mov w8, v0.s[1]
; CHECK-GI-NEXT: mov w9, v1.s[1]
; CHECK-GI-NEXT: fmov w10, s0
; CHECK-GI-NEXT: fmov w11, s1
; CHECK-GI-NEXT: eor w10, w10, w0
; CHECK-GI-NEXT: eor w11, w11, w1
; CHECK-GI-NEXT: eor w8, w10, w8
; CHECK-GI-NEXT: eor w9, w11, w9
; CHECK-GI-NEXT: eor w0, w8, w9
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
  %a1 = xor i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %b)
  %a2 = xor i32 %r2, %d
  %r = xor i32 %a1, %a2
  ret i32 %r
}

; Nested smin reductions: combined into one sminv for SDAG.
define i32 @nested_smin_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_smin_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: smin v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: cmp w0, w1
; CHECK-SD-NEXT: csel w8, w0, w1, lt
; CHECK-SD-NEXT: sminv s0, v0.4s
; CHECK-SD-NEXT: fmov w9, s0
; CHECK-SD-NEXT: cmp w9, w8
; CHECK-SD-NEXT: csel w0, w9, w8, lt
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_smin_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: sminv s0, v0.4s
; CHECK-GI-NEXT: sminv s1, v1.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: cmp w8, w0
; CHECK-GI-NEXT: csel w8, w8, w0, lt
; CHECK-GI-NEXT: cmp w9, w1
; CHECK-GI-NEXT: csel w9, w9, w1, lt
; CHECK-GI-NEXT: cmp w8, w9
; CHECK-GI-NEXT: csel w0, w8, w9, lt
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.smin.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.smin.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.smin.i32(i32 %a1, i32 %a2)
  ret i32 %r
}

; Nested smax reductions: combined into one smaxv for SDAG.
define i32 @nested_smax_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_smax_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: smax v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: cmp w0, w1
; CHECK-SD-NEXT: csel w8, w0, w1, gt
; CHECK-SD-NEXT: smaxv s0, v0.4s
; CHECK-SD-NEXT: fmov w9, s0
; CHECK-SD-NEXT: cmp w9, w8
; CHECK-SD-NEXT: csel w0, w9, w8, gt
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_smax_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: smaxv s0, v0.4s
; CHECK-GI-NEXT: smaxv s1, v1.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: cmp w8, w0
; CHECK-GI-NEXT: csel w8, w8, w0, gt
; CHECK-GI-NEXT: cmp w9, w1
; CHECK-GI-NEXT: csel w9, w9, w1, gt
; CHECK-GI-NEXT: cmp w8, w9
; CHECK-GI-NEXT: csel w0, w8, w9, gt
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.smax.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.smax.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.smax.i32(i32 %a1, i32 %a2)
  ret i32 %r
}

; Nested umin reductions: combined into one uminv for SDAG.
define i32 @nested_umin_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_umin_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: umin v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: cmp w0, w1
; CHECK-SD-NEXT: csel w8, w0, w1, lo
; CHECK-SD-NEXT: uminv s0, v0.4s
; CHECK-SD-NEXT: fmov w9, s0
; CHECK-SD-NEXT: cmp w9, w8
; CHECK-SD-NEXT: csel w0, w9, w8, lo
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_umin_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: uminv s0, v0.4s
; CHECK-GI-NEXT: uminv s1, v1.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: cmp w8, w0
; CHECK-GI-NEXT: csel w8, w8, w0, lo
; CHECK-GI-NEXT: cmp w9, w1
; CHECK-GI-NEXT: csel w9, w9, w1, lo
; CHECK-GI-NEXT: cmp w8, w9
; CHECK-GI-NEXT: csel w0, w8, w9, lo
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.umin.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.umin.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.umin.i32(i32 %a1, i32 %a2)
  ret i32 %r
}

; Nested umax reductions: combined into one umaxv for SDAG.
define i32 @nested_umax_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_umax_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: umax v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: cmp w0, w1
; CHECK-SD-NEXT: csel w8, w0, w1, hi
; CHECK-SD-NEXT: umaxv s0, v0.4s
; CHECK-SD-NEXT: fmov w9, s0
; CHECK-SD-NEXT: cmp w9, w8
; CHECK-SD-NEXT: csel w0, w9, w8, hi
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_umax_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: umaxv s0, v0.4s
; CHECK-GI-NEXT: umaxv s1, v1.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: cmp w8, w0
; CHECK-GI-NEXT: csel w8, w8, w0, hi
; CHECK-GI-NEXT: cmp w9, w1
; CHECK-GI-NEXT: csel w9, w9, w1, hi
; CHECK-GI-NEXT: cmp w8, w9
; CHECK-GI-NEXT: csel w0, w8, w9, hi
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.umax.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.umax.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.umax.i32(i32 %a1, i32 %a2)
  ret i32 %r
}

; Nested fmin reductions: combined into one fminnmv for SDAG.
define float @nested_fmin_float(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-SD-LABEL: nested_fmin_float:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fminnm v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: fminnm s2, s2, s3
; CHECK-SD-NEXT: fminnmv s0, v0.4s
; CHECK-SD-NEXT: fminnm s0, s0, s2
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_fmin_float:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fminnmv s0, v0.4s
; CHECK-GI-NEXT: fminnmv s1, v1.4s
; CHECK-GI-NEXT: fminnm s0, s0, s2
; CHECK-GI-NEXT: fminnm s1, s1, s3
; CHECK-GI-NEXT: fminnm s0, s0, s1
; CHECK-GI-NEXT: ret
  %r1 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
  %a1 = call float @llvm.minnum.f32(float %r1, float %c)
  %r2 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b)
  %a2 = call float @llvm.minnum.f32(float %r2, float %d)
  %r = call float @llvm.minnum.f32(float %a1, float %a2)
  ret float %r
}

; Nested fmax reductions: combined into one fmaxnmv for SDAG.
define float @nested_fmax_float(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-SD-LABEL: nested_fmax_float:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmaxnm v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: fmaxnm s2, s2, s3
; CHECK-SD-NEXT: fmaxnmv s0, v0.4s
; CHECK-SD-NEXT: fmaxnm s0, s0, s2
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_fmax_float:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmaxnmv s0, v0.4s
; CHECK-GI-NEXT: fmaxnmv s1, v1.4s
; CHECK-GI-NEXT: fmaxnm s0, s0, s2
; CHECK-GI-NEXT: fmaxnm s1, s1, s3
; CHECK-GI-NEXT: fmaxnm s0, s0, s1
; CHECK-GI-NEXT: ret
  %r1 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
  %a1 = call float @llvm.maxnum.f32(float %r1, float %c)
  %r2 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b)
  %a2 = call float @llvm.maxnum.f32(float %r2, float %d)
  %r = call float @llvm.maxnum.f32(float %a1, float %a2)
  ret float %r
}

; Intrinsic declarations (kept verbatim; extraction junk removed).
declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fminimum.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fminimum.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmaximum.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmaximum.v4f32(<4 x float>)
declare i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32>)
declare i16 @llvm.vector.reduce.add.i16.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16>)
declare i32 @llvm.vector.reduce.mul.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.and.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.or.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.xor.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umin.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umax.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smin.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smax.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32>)
declare float @llvm.minnum.f32(float, float)
declare float @llvm.maxnum.f32(float, float)
declare float @llvm.minimum.f32(float, float)
declare float @llvm.maximum.f32(float, float)
declare i32 @llvm.umin.i32(i32, i32)
declare i32 @llvm.umax.i32(i32, i32)
declare i32 @llvm.smin.i32(i32, i32)
declare i32 @llvm.smax.i32(i32, i32)