Craig Topper b34eef7b41 [X86] Remove another weird scalar sqrt/rcp/rsqrt pattern.
This pattern turned a vector sqrt/rcp/rsqrt operation of sse_load_f32/f64 into the scalar instruction for the operation and put undef into the upper bits. For correctness, the resulting code should still perform the sqrt/rcp/rsqrt on the upper bits after the load is extended, since that's what the operation asked for. In particular, when the upper bits are 0 we need to calculate the sqrt/rcp/rsqrt of the zeroes and keep the result in the upper bits. This implies we should still be using the packed instruction.

The only test case for this pattern is one I just added so there was no coverage of this.

llvm-svn: 288784
2016-12-06 08:08:12 +00:00

376 lines
13 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
; Packed double-precision add of two 256-bit vectors (%x + %y).
; The assertions expect a single register-register vaddpd on ymm registers.
define <4 x double> @addpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp {
; CHECK-LABEL: addpd256:
; CHECK:       ## BB#0: ## %entry
; CHECK-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
entry:
  %sum = fadd <4 x double> %x, %y
  ret <4 x double> %sum
}
; Packed double-precision add of %y and a constant vector.
; The assertions expect the constant to appear as a RIP-relative memory
; operand of vaddpd rather than being loaded by a separate instruction.
define <4 x double> @addpd256fold(<4 x double> %y) nounwind uwtable readnone ssp {
; CHECK-LABEL: addpd256fold:
; CHECK:       ## BB#0: ## %entry
; CHECK-NEXT:    vaddpd {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
entry:
  %sum = fadd <4 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00>
  ret <4 x double> %sum
}
; Packed single-precision add of two 256-bit vectors (%x + %y).
; The assertions expect a single register-register vaddps on ymm registers.
define <8 x float> @addps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp {
; CHECK-LABEL: addps256:
; CHECK:       ## BB#0: ## %entry
; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
entry:
  %sum = fadd <8 x float> %x, %y
  ret <8 x float> %sum
}
; Packed single-precision add of %y and a constant vector.
; The assertions expect the constant to appear as a RIP-relative memory
; operand of vaddps rather than being loaded by a separate instruction.
define <8 x float> @addps256fold(<8 x float> %y) nounwind uwtable readnone ssp {
; CHECK-LABEL: addps256fold:
; CHECK:       ## BB#0: ## %entry
; CHECK-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
entry:
  %sum = fadd <8 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000>
  ret <8 x float> %sum
}
; Packed double-precision subtract of two 256-bit vectors (%x - %y).
; The assertions expect a single register-register vsubpd on ymm registers.
define <4 x double> @subpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp {
; CHECK-LABEL: subpd256:
; CHECK:       ## BB#0: ## %entry
; CHECK-NEXT:    vsubpd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
entry:
  %diff = fsub <4 x double> %x, %y
  ret <4 x double> %diff
}
; Packed double-precision subtract whose right-hand operand is loaded from
; the pointer argument %x (32-byte aligned load).
; The assertions expect the load to be folded into vsubpd as a (%rdi)
; memory operand.
define <4 x double> @subpd256fold(<4 x double> %y, <4 x double>* nocapture %x) nounwind uwtable readonly ssp {
; CHECK-LABEL: subpd256fold:
; CHECK:       ## BB#0: ## %entry
; CHECK-NEXT:    vsubpd (%rdi), %ymm0, %ymm0
; CHECK-NEXT:    retq
entry:
  %rhs = load <4 x double>, <4 x double>* %x, align 32
  %diff = fsub <4 x double> %y, %rhs
  ret <4 x double> %diff
}
; Packed single-precision subtract of two 256-bit vectors (%x - %y).
; The assertions expect a single register-register vsubps on ymm registers.
define <8 x float> @subps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp {
; CHECK-LABEL: subps256:
; CHECK:       ## BB#0: ## %entry
; CHECK-NEXT:    vsubps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
entry:
  %diff = fsub <8 x float> %x, %y
  ret <8 x float> %diff
}
; Packed single-precision subtract whose right-hand operand is loaded from
; the pointer argument %x (32-byte aligned load).
; The assertions expect the load to be folded into vsubps as a (%rdi)
; memory operand.
define <8 x float> @subps256fold(<8 x float> %y, <8 x float>* nocapture %x) nounwind uwtable readonly ssp {
; CHECK-LABEL: subps256fold:
; CHECK:       ## BB#0: ## %entry
; CHECK-NEXT:    vsubps (%rdi), %ymm0, %ymm0
; CHECK-NEXT:    retq
entry:
  %rhs = load <8 x float>, <8 x float>* %x, align 32
  %diff = fsub <8 x float> %y, %rhs
  ret <8 x float> %diff
}
; Packed double-precision multiply of two 256-bit vectors (%x * %y).
; The assertions expect a single register-register vmulpd on ymm registers.
define <4 x double> @mulpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp {
; CHECK-LABEL: mulpd256:
; CHECK:       ## BB#0: ## %entry
; CHECK-NEXT:    vmulpd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
entry:
  %prod = fmul <4 x double> %x, %y
  ret <4 x double> %prod
}
; Packed double-precision multiply of %y by a constant vector.
; The assertions expect the constant to appear as a RIP-relative memory
; operand of vmulpd rather than being loaded by a separate instruction.
define <4 x double> @mulpd256fold(<4 x double> %y) nounwind uwtable readnone ssp {
; CHECK-LABEL: mulpd256fold:
; CHECK:       ## BB#0: ## %entry
; CHECK-NEXT:    vmulpd {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
entry:
  %prod = fmul <4 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00>
  ret <4 x double> %prod
}
; Packed single-precision multiply of two 256-bit vectors (%x * %y).
; The assertions expect a single register-register vmulps on ymm registers.
define <8 x float> @mulps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp {
; CHECK-LABEL: mulps256:
; CHECK:       ## BB#0: ## %entry
; CHECK-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
entry:
  %prod = fmul <8 x float> %x, %y
  ret <8 x float> %prod
}
; Packed single-precision multiply of %y by a constant vector.
; The assertions expect the constant to appear as a RIP-relative memory
; operand of vmulps rather than being loaded by a separate instruction.
define <8 x float> @mulps256fold(<8 x float> %y) nounwind uwtable readnone ssp {
; CHECK-LABEL: mulps256fold:
; CHECK:       ## BB#0: ## %entry
; CHECK-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
entry:
  %prod = fmul <8 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000>
  ret <8 x float> %prod
}
; Packed double-precision divide of two 256-bit vectors (%x / %y).
; The assertions expect a single register-register vdivpd on ymm registers.
define <4 x double> @divpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp {
; CHECK-LABEL: divpd256:
; CHECK:       ## BB#0: ## %entry
; CHECK-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
entry:
  %quot = fdiv <4 x double> %x, %y
  ret <4 x double> %quot
}
; Packed double-precision divide of %y by a constant vector.
; The assertions expect the constant divisor to appear as a RIP-relative
; memory operand of vdivpd rather than being loaded separately.
define <4 x double> @divpd256fold(<4 x double> %y) nounwind uwtable readnone ssp {
; CHECK-LABEL: divpd256fold:
; CHECK:       ## BB#0: ## %entry
; CHECK-NEXT:    vdivpd {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
entry:
  %quot = fdiv <4 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00>
  ret <4 x double> %quot
}
; Packed single-precision divide of two 256-bit vectors (%x / %y).
; The assertions expect a single register-register vdivps on ymm registers.
define <8 x float> @divps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp {
; CHECK-LABEL: divps256:
; CHECK:       ## BB#0: ## %entry
; CHECK-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    retq
entry:
  %quot = fdiv <8 x float> %x, %y
  ret <8 x float> %quot
}
; Packed single-precision divide of %y by a constant vector.
; The assertions expect the constant divisor to appear as a RIP-relative
; memory operand of vdivps rather than being loaded separately.
define <8 x float> @divps256fold(<8 x float> %y) nounwind uwtable readnone ssp {
; CHECK-LABEL: divps256fold:
; CHECK:       ## BB#0: ## %entry
; CHECK-NEXT:    vdivps {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
entry:
  %quot = fdiv <8 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000>
  ret <8 x float> %quot
}
; Tail call to @sqrtf (marked readnone at the call site) on a scalar float.
; The assertions expect the call to be replaced by an inline vsqrtss
; immediately followed by the return.
define float @sqrtA(float %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: sqrtA:
; CHECK:       ## BB#0: ## %entry
; CHECK-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
entry:
  %root = tail call float @sqrtf(float %a) nounwind readnone
  ret float %root
}
; External double-precision square root, called by @sqrtB below.
declare double @sqrt(double) readnone

; Tail call to @sqrt (marked readnone at the call site) on a scalar double.
; The assertions expect the call to be replaced by an inline vsqrtsd
; immediately followed by the return.
define double @sqrtB(double %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: sqrtB:
; CHECK:       ## BB#0: ## %entry
; CHECK-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
entry:
  %root = tail call double @sqrt(double %a) nounwind readnone
  ret double %root
}
; External single-precision square root, called by @sqrtA earlier in this file.
declare float @sqrtf(float) readnone
; 256-bit integer add with 64-bit elements.
; The assertions expect the operation to be split into two 128-bit vpaddq
; instructions: the upper halves are pulled out with vextractf128, added,
; and the result is reinserted with vinsertf128.
define <4 x i64> @vpaddq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
; CHECK-LABEL: vpaddq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; CHECK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %sum = add <4 x i64> %i, %j
  ret <4 x i64> %sum
}
; 256-bit integer add with 32-bit elements.
; The assertions expect the operation to be split into two 128-bit vpaddd
; instructions: the upper halves are pulled out with vextractf128, added,
; and the result is reinserted with vinsertf128.
define <8 x i32> @vpaddd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
; CHECK-LABEL: vpaddd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %sum = add <8 x i32> %i, %j
  ret <8 x i32> %sum
}
; 256-bit integer add with 16-bit elements.
; The assertions expect the operation to be split into two 128-bit vpaddw
; instructions: the upper halves are pulled out with vextractf128, added,
; and the result is reinserted with vinsertf128.
define <16 x i16> @vpaddw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
; CHECK-LABEL: vpaddw:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %sum = add <16 x i16> %i, %j
  ret <16 x i16> %sum
}
; 256-bit integer add with 8-bit elements.
; The assertions expect the operation to be split into two 128-bit vpaddb
; instructions: the upper halves are pulled out with vextractf128, added,
; and the result is reinserted with vinsertf128.
define <32 x i8> @vpaddb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
; CHECK-LABEL: vpaddb:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %sum = add <32 x i8> %i, %j
  ret <32 x i8> %sum
}
; 256-bit integer subtract (%i - %j) with 64-bit elements.
; The assertions expect the operation to be split into two 128-bit vpsubq
; instructions: the upper halves are pulled out with vextractf128,
; subtracted, and the result is reinserted with vinsertf128.
define <4 x i64> @vpsubq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
; CHECK-LABEL: vpsubq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
; CHECK-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %diff = sub <4 x i64> %i, %j
  ret <4 x i64> %diff
}
; 256-bit integer subtract (%i - %j) with 32-bit elements.
; The assertions expect the operation to be split into two 128-bit vpsubd
; instructions: the upper halves are pulled out with vextractf128,
; subtracted, and the result is reinserted with vinsertf128.
define <8 x i32> @vpsubd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
; CHECK-LABEL: vpsubd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
; CHECK-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %diff = sub <8 x i32> %i, %j
  ret <8 x i32> %diff
}
; 256-bit integer subtract (%i - %j) with 16-bit elements.
; The assertions expect the operation to be split into two 128-bit vpsubw
; instructions: the upper halves are pulled out with vextractf128,
; subtracted, and the result is reinserted with vinsertf128.
define <16 x i16> @vpsubw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
; CHECK-LABEL: vpsubw:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
; CHECK-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %diff = sub <16 x i16> %i, %j
  ret <16 x i16> %diff
}
; 256-bit integer subtract (%i - %j) with 8-bit elements.
; The assertions expect the operation to be split into two 128-bit vpsubb
; instructions: the upper halves are pulled out with vextractf128,
; subtracted, and the result is reinserted with vinsertf128.
define <32 x i8> @vpsubb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
; CHECK-LABEL: vpsubb:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
; CHECK-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %diff = sub <32 x i8> %i, %j
  ret <32 x i8> %diff
}
; 256-bit integer multiply with 32-bit elements.
; The assertions expect the operation to be split into two 128-bit vpmulld
; instructions: the upper halves are pulled out with vextractf128,
; multiplied, and the result is reinserted with vinsertf128.
define <8 x i32> @vpmulld(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
; CHECK-LABEL: vpmulld:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT:    vpmulld %xmm2, %xmm3, %xmm2
; CHECK-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %prod = mul <8 x i32> %i, %j
  ret <8 x i32> %prod
}
; 256-bit integer multiply with 16-bit elements.
; The assertions expect the operation to be split into two 128-bit vpmullw
; instructions: the upper halves are pulled out with vextractf128,
; multiplied, and the result is reinserted with vinsertf128.
define <16 x i16> @vpmullw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
; CHECK-LABEL: vpmullw:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
; CHECK-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %prod = mul <16 x i16> %i, %j
  ret <16 x i16> %prod
}
; 256-bit integer multiply with 64-bit elements.
; The assertions expect each 128-bit half to be computed separately from
; three vpmuludq partial products, combined with vpsrlq/vpsllq shifts by 32
; and vpaddq; the upper half is extracted with vextractf128 first and
; reinserted with vinsertf128 at the end.
define <4 x i64> @mul_v4i64(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
; CHECK-LABEL: mul_v4i64:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT:    vpmuludq %xmm2, %xmm3, %xmm4
; CHECK-NEXT:    vpsrlq $32, %xmm2, %xmm5
; CHECK-NEXT:    vpmuludq %xmm5, %xmm3, %xmm5
; CHECK-NEXT:    vpsllq $32, %xmm5, %xmm5
; CHECK-NEXT:    vpaddq %xmm5, %xmm4, %xmm4
; CHECK-NEXT:    vpsrlq $32, %xmm3, %xmm3
; CHECK-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; CHECK-NEXT:    vpsllq $32, %xmm2, %xmm2
; CHECK-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
; CHECK-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
; CHECK-NEXT:    vpsrlq $32, %xmm1, %xmm4
; CHECK-NEXT:    vpmuludq %xmm4, %xmm0, %xmm4
; CHECK-NEXT:    vpsllq $32, %xmm4, %xmm4
; CHECK-NEXT:    vpaddq %xmm4, %xmm3, %xmm3
; CHECK-NEXT:    vpsrlq $32, %xmm0, %xmm0
; CHECK-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpsllq $32, %xmm0, %xmm0
; CHECK-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %prod = mul <4 x i64> %i, %j
  ret <4 x i64> %prod
}
; Scalar single-precision sqrt intrinsic operating on a <4 x float> value.
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
; Applies the sqrt.ss intrinsic to a vector whose element 0 comes from a
; scalar float load and whose remaining elements are undef.
; The assertions expect a separate vmovss load followed by a register-form
; vsqrtss, i.e. the load is not used directly as the vsqrtss operand.
; NOTE(review): the load address is `undef` in addrspace(1) with align 8;
; presumably only the shape of the selected instructions matters here --
; confirm before replacing the address with a real pointer.
define <4 x float> @int_sqrt_ss() {
; CHECK-LABEL: int_sqrt_ss:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
%x0 = load float, float addrspace(1)* undef, align 8
%x1 = insertelement <4 x float> undef, float %x0, i32 0
%x2 = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %x1) nounwind
ret <4 x float> %x2
}
; Vector sqrt of a <2 x double> whose element 0 comes from a scalar double
; load through %a0 and whose element 1 is undef (function is optsize).
; The assertions expect the load to stay a separate vmovsd and the sqrt to
; use the packed vsqrtpd form, not a scalar sqrt instruction.
define <2 x double> @vector_sqrt_scalar_load(double* %a0) optsize {
; CHECK-LABEL: vector_sqrt_scalar_load:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    vsqrtpd %xmm0, %xmm0
; CHECK-NEXT:    retq
  %elt = load double, double* %a0
  %vec = insertelement <2 x double> undef, double %elt, i32 0
  %root = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %vec)
  ret <2 x double> %root
}

; Generic vector sqrt intrinsic used by @vector_sqrt_scalar_load above.
declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) nounwind readnone