If we have a vector FP division with a splatted divisor, use getVectorMinNumElements when scaling the number of uses by the splat factor.

For AArch64 the combine kicks in for the <vscale x 4 x float> case, because scaling the number of uses by the splat factor puts it above the fdiv threshold (3), but the codegen is worse (splat + vector fdiv + vector fmul) than in the <vscale x 2 x double> case (splat + vector fdiv). If the combine could be converted into a scalar FP division by scalarizeBinOpOfSplats it might be cheaper, but that appears to be predicated on the isExtractVecEltCheap TLI function, which is implemented for x86 but not for AArch64. Perhaps, for now, combineRepeatedFPDivisors should only scale the number of uses by the splat factor if the division can be converted into a scalar op.

Reviewed By: sdesmalen

Differential Revision: https://reviews.llvm.org/D118343
221 lines
8.3 KiB
LLVM
221 lines
8.3 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-unknown-unknown < %s | FileCheck %s

; The following test cases check that three divisions by a common divisor D
;   a / D; b / D; c / D;
; =>
;   recip = 1.0 / D; a * recip; b * recip; c * recip;
; three_fdiv_float: three scalar float divisions that share the divisor %D,
; compiled with attribute group #0.
; The expected assembly divides once to form 1.0 / D in s4, multiplies %a, %b
; and %c by that reciprocal, and then branches to foo_3f (the tail call).
define void @three_fdiv_float(float %D, float %a, float %b, float %c) #0 {
; CHECK-LABEL: three_fdiv_float:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s4, #1.00000000
; CHECK-NEXT: fdiv s4, s4, s0
; CHECK-NEXT: fmul s0, s1, s4
; CHECK-NEXT: fmul s1, s2, s4
; CHECK-NEXT: fmul s2, s3, s4
; CHECK-NEXT: b foo_3f
  %div = fdiv float %a, %D
  %div1 = fdiv float %b, %D
  %div2 = fdiv float %c, %D
  tail call void @foo_3f(float %div, float %div1, float %div2)
  ret void
}

; three_fdiv_double: three scalar double divisions that share the divisor %D,
; compiled with attribute group #0.
; The expected assembly divides once to form 1.0 / D in d4, multiplies %a, %b
; and %c by that reciprocal, and then branches to foo_3d (the tail call).
define void @three_fdiv_double(double %D, double %a, double %b, double %c) #0 {
; CHECK-LABEL: three_fdiv_double:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d4, #1.00000000
; CHECK-NEXT: fdiv d4, d4, d0
; CHECK-NEXT: fmul d0, d1, d4
; CHECK-NEXT: fmul d1, d2, d4
; CHECK-NEXT: fmul d2, d3, d4
; CHECK-NEXT: b foo_3d
  %div = fdiv double %a, %D
  %div1 = fdiv double %b, %D
  %div2 = fdiv double %c, %D
  tail call void @foo_3d(double %div, double %div1, double %div2)
  ret void
}

; three_fdiv_4xfloat: three <4 x float> divisions that share the vector
; divisor %D, compiled with attribute group #0.
; The expected assembly performs one vector fdiv of a 1.0 vector by %D (v4.4s)
; followed by three vector fmuls, and then branches to foo_3_4xf.
define void @three_fdiv_4xfloat(<4 x float> %D, <4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
; CHECK-LABEL: three_fdiv_4xfloat:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov v4.4s, #1.00000000
; CHECK-NEXT: fdiv v4.4s, v4.4s, v0.4s
; CHECK-NEXT: fmul v0.4s, v1.4s, v4.4s
; CHECK-NEXT: fmul v1.4s, v2.4s, v4.4s
; CHECK-NEXT: fmul v2.4s, v3.4s, v4.4s
; CHECK-NEXT: b foo_3_4xf
  %div = fdiv <4 x float> %a, %D
  %div1 = fdiv <4 x float> %b, %D
  %div2 = fdiv <4 x float> %c, %D
  tail call void @foo_3_4xf(<4 x float> %div, <4 x float> %div1, <4 x float> %div2)
  ret void
}

; three_fdiv_2xdouble: three <2 x double> divisions that share the vector
; divisor %D, compiled with attribute group #0.
; The expected assembly performs one vector fdiv of a 1.0 vector by %D (v4.2d)
; followed by three vector fmuls, and then branches to foo_3_2xd.
define void @three_fdiv_2xdouble(<2 x double> %D, <2 x double> %a, <2 x double> %b, <2 x double> %c) #0 {
; CHECK-LABEL: three_fdiv_2xdouble:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov v4.2d, #1.00000000
; CHECK-NEXT: fdiv v4.2d, v4.2d, v0.2d
; CHECK-NEXT: fmul v0.2d, v1.2d, v4.2d
; CHECK-NEXT: fmul v1.2d, v2.2d, v4.2d
; CHECK-NEXT: fmul v2.2d, v3.2d, v4.2d
; CHECK-NEXT: b foo_3_2xd
  %div = fdiv <2 x double> %a, %D
  %div1 = fdiv <2 x double> %b, %D
  %div2 = fdiv <2 x double> %c, %D
  tail call void @foo_3_2xd(<2 x double> %div, <2 x double> %div1, <2 x double> %div2)
  ret void
}

; The following test cases check that two FDIVs are never combined when
; neither of them calculates a reciprocal.
; two_fdiv_float: only two scalar float divisions share the divisor %D.
; The expected assembly keeps both fdiv instructions (no reciprocal and no
; fmul), moves the first quotient into s0, and branches to foo_2f.
define void @two_fdiv_float(float %D, float %a, float %b) #0 {
; CHECK-LABEL: two_fdiv_float:
; CHECK: // %bb.0:
; CHECK-NEXT: fdiv s3, s1, s0
; CHECK-NEXT: fdiv s1, s2, s0
; CHECK-NEXT: fmov s0, s3
; CHECK-NEXT: b foo_2f
  %div = fdiv float %a, %D
  %div1 = fdiv float %b, %D
  tail call void @foo_2f(float %div, float %div1)
  ret void
}

; two_fdiv_double: only two scalar double divisions share the divisor %D.
; The expected assembly keeps both fdiv instructions (no reciprocal and no
; fmul), moves the first quotient into d0, and branches to foo_2d.
define void @two_fdiv_double(double %D, double %a, double %b) #0 {
; CHECK-LABEL: two_fdiv_double:
; CHECK: // %bb.0:
; CHECK-NEXT: fdiv d3, d1, d0
; CHECK-NEXT: fdiv d1, d2, d0
; CHECK-NEXT: fmov d0, d3
; CHECK-NEXT: b foo_2d
  %div = fdiv double %a, %D
  %div1 = fdiv double %b, %D
  tail call void @foo_2d(double %div, double %div1)
  ret void
}

; splat_three_fdiv_4xfloat: the scalar %D is splatted into a <4 x float>
; (insertelement into lane 0 + shufflevector with an all-zero mask) and used
; as the divisor of three vector divisions.
; The expected assembly broadcasts %D with dup, performs a single vector fdiv
; of 1.0 by the splat, and multiplies %a, %b and %c by the result.
define void @splat_three_fdiv_4xfloat(float %D, <4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
; CHECK-LABEL: splat_three_fdiv_4xfloat:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov v4.4s, #1.00000000
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
; CHECK-NEXT: dup v0.4s, v0.s[0]
; CHECK-NEXT: fdiv v4.4s, v4.4s, v0.4s
; CHECK-NEXT: fmul v0.4s, v1.4s, v4.4s
; CHECK-NEXT: fmul v1.4s, v2.4s, v4.4s
; CHECK-NEXT: fmul v2.4s, v3.4s, v4.4s
; CHECK-NEXT: b foo_3_4xf
  %D.ins = insertelement <4 x float> poison, float %D, i64 0
  %splat = shufflevector <4 x float> %D.ins, <4 x float> poison, <4 x i32> zeroinitializer
  %div = fdiv <4 x float> %a, %splat
  %div1 = fdiv <4 x float> %b, %splat
  %div2 = fdiv <4 x float> %c, %splat
  tail call void @foo_3_4xf(<4 x float> %div, <4 x float> %div1, <4 x float> %div2)
  ret void
}

; splat_fdiv_v4f32: a single <4 x float> division whose divisor is a splat of
; the scalar %D, compiled with attribute group #1.
; The expected assembly still uses the reciprocal form: dup of %D, one vector
; fdiv of 1.0 by the splat, then one fmul with %a.
; NOTE(review): presumably the single use is scaled by the four lanes of the
; splat and so exceeds the repeated-divisor threshold - confirm against
; combineRepeatedFPDivisors.
define <4 x float> @splat_fdiv_v4f32(float %D, <4 x float> %a) #1 {
; CHECK-LABEL: splat_fdiv_v4f32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov v2.4s, #1.00000000
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
; CHECK-NEXT: dup v0.4s, v0.s[0]
; CHECK-NEXT: fdiv v0.4s, v2.4s, v0.4s
; CHECK-NEXT: fmul v0.4s, v1.4s, v0.4s
; CHECK-NEXT: ret
entry:
  %D.ins = insertelement <4 x float> poison, float %D, i64 0
  %splat = shufflevector <4 x float> %D.ins, <4 x float> poison, <4 x i32> zeroinitializer
  %div = fdiv <4 x float> %a, %splat
  ret <4 x float> %div
}

; splat_fdiv_nxv4f32: a single scalable <vscale x 4 x float> division whose
; divisor is a splat of the scalar %D, compiled with attribute group #1 (+sve).
; The expected assembly uses the reciprocal form: broadcast of %D into z0.s,
; a predicated fdivr against a 1.0 vector, then one fmul with %a.
; NOTE(review): presumably the single use is scaled by the minimum element
; count (4) of the scalable type and so exceeds the repeated-divisor
; threshold - confirm against combineRepeatedFPDivisors.
define <vscale x 4 x float> @splat_fdiv_nxv4f32(float %D, <vscale x 4 x float> %a) #1 {
; CHECK-LABEL: splat_fdiv_nxv4f32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
; CHECK-NEXT: fmov z2.s, #1.00000000
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov z0.s, s0
; CHECK-NEXT: fdivr z0.s, p0/m, z0.s, z2.s
; CHECK-NEXT: fmul z0.s, z1.s, z0.s
; CHECK-NEXT: ret
entry:
  %D.ins = insertelement <vscale x 4 x float> poison, float %D, i64 0
  %splat = shufflevector <vscale x 4 x float> %D.ins, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
  %div = fdiv <vscale x 4 x float> %a, %splat
  ret <vscale x 4 x float> %div
}

; splat_three_fdiv_nxv4f32: three scalable <vscale x 4 x float> divisions that
; share a splat of the scalar %D as divisor, compiled with attribute group #1
; (+sve).
; The expected assembly broadcasts %D into z0.s, performs one predicated fdiv
; of a 1.0 vector by the splat, multiplies %a, %b and %c by the result, and
; branches to foo_3_nxv4f32.
define void @splat_three_fdiv_nxv4f32(float %D, <vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c) #1 {
; CHECK-LABEL: splat_three_fdiv_nxv4f32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
; CHECK-NEXT: fmov z4.s, #1.00000000
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov z0.s, s0
; CHECK-NEXT: fdiv z4.s, p0/m, z4.s, z0.s
; CHECK-NEXT: fmul z0.s, z1.s, z4.s
; CHECK-NEXT: fmul z1.s, z2.s, z4.s
; CHECK-NEXT: fmul z2.s, z3.s, z4.s
; CHECK-NEXT: b foo_3_nxv4f32
entry:
  %D.ins = insertelement <vscale x 4 x float> poison, float %D, i64 0
  %splat = shufflevector <vscale x 4 x float> %D.ins, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
  %div = fdiv <vscale x 4 x float> %a, %splat
  %div1 = fdiv <vscale x 4 x float> %b, %splat
  %div2 = fdiv <vscale x 4 x float> %c, %splat
  tail call void @foo_3_nxv4f32(<vscale x 4 x float> %div, <vscale x 4 x float> %div1, <vscale x 4 x float> %div2)
  ret void
}

; splat_fdiv_nxv2f64: a single scalable <vscale x 2 x double> division whose
; divisor is a splat of the scalar %D, compiled with attribute group #1 (+sve).
; The expected assembly keeps a plain division: broadcast of %D into z0.d and
; one predicated fdivr with %a; no 1.0 constant and no fmul are emitted.
define <vscale x 2 x double> @splat_fdiv_nxv2f64(double %D, <vscale x 2 x double> %a) #1 {
; CHECK-LABEL: splat_fdiv_nxv2f64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov z0.d, d0
; CHECK-NEXT: fdivr z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: ret
entry:
  %D.ins = insertelement <vscale x 2 x double> poison, double %D, i64 0
  %splat = shufflevector <vscale x 2 x double> %D.ins, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
  %div = fdiv <vscale x 2 x double> %a, %splat
  ret <vscale x 2 x double> %div
}

; splat_two_fdiv_nxv2f64: two scalable <vscale x 2 x double> divisions that
; share a splat of the scalar %D as divisor, compiled with attribute group #1
; (+sve).
; The expected assembly uses the reciprocal form: broadcast of %D into z0.d,
; one predicated fdiv of a 1.0 vector by the splat, two fmuls, and a branch
; to foo_2_nxv2f64.
; NOTE(review): presumably two uses scaled by the minimum element count (2)
; exceed the repeated-divisor threshold - confirm against
; combineRepeatedFPDivisors.
define void @splat_two_fdiv_nxv2f64(double %D, <vscale x 2 x double> %a, <vscale x 2 x double> %b) #1 {
; CHECK-LABEL: splat_two_fdiv_nxv2f64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: fmov z3.d, #1.00000000
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov z0.d, d0
; CHECK-NEXT: fdiv z3.d, p0/m, z3.d, z0.d
; CHECK-NEXT: fmul z0.d, z1.d, z3.d
; CHECK-NEXT: fmul z1.d, z2.d, z3.d
; CHECK-NEXT: b foo_2_nxv2f64
entry:
  %D.ins = insertelement <vscale x 2 x double> poison, double %D, i64 0
  %splat = shufflevector <vscale x 2 x double> %D.ins, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
  %div = fdiv <vscale x 2 x double> %a, %splat
  %div1 = fdiv <vscale x 2 x double> %b, %splat
  tail call void @foo_2_nxv2f64(<vscale x 2 x double> %div, <vscale x 2 x double> %div1)
  ret void
}

; External functions that receive the quotients computed by the test
; functions in this file; only their declarations are needed here.
declare void @foo_3f(float, float, float)
declare void @foo_3d(double, double, double)
declare void @foo_3_4xf(<4 x float>, <4 x float>, <4 x float>)
declare void @foo_3_2xd(<2 x double>, <2 x double>, <2 x double>)
declare void @foo_2f(float, float)
declare void @foo_2d(double, double)
declare void @foo_3_nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @foo_2_nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)

; #0 sets "unsafe-fp-math"="true"; #1 sets the same and additionally enables
; the SVE target feature for the scalable-vector tests.
attributes #0 = { "unsafe-fp-math"="true" }
attributes #1 = { "unsafe-fp-math"="true" "target-features"="+sve" }