llvm-project/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
Phoebe Wang 76e14deb4a
[X86][BreakFalseDeps] Using reverse order for undef register selection (#137569)
BreakFalseDeps picks the best register for undef operands if
instructions have false dependency. The problem is if the instruction is
close to the beginning of the function, ReachingDefAnalysis is
over-optimistic about the unused registers, which results in collisions with
registers just defined in the caller.

This patch changes the selection of the undef register to use a reverse order,
which reduces the probability of register collisions between caller and
callee. It brings improvement in some of our internal benchmarks with
negligible effect on other benchmarks.
2025-06-11 22:08:20 +08:00

1734 lines
80 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=CHECK-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK-AVX,CHECK-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK-AVX,CHECK-AVX512F,CHECK-NO-FASTFMA
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx -fp-contract=fast | FileCheck %s --check-prefixes=CHECK-AVX,CHECK-AVX512F,CHECK-FMA
declare i16 @llvm.umax.i16(i16, i16)
declare i64 @llvm.umin.i64(i64, i64)
declare <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float>, <4 x i32>)
define <4 x float> @fmul_pow2_4xfloat(<4 x i32> %i) {
; CHECK-SSE-LABEL: fmul_pow2_4xfloat:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: pslld $23, %xmm0
; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX2-LABEL: fmul_pow2_4xfloat:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: vpslld $23, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1091567616,1091567616,1091567616,1091567616]
; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
; CHECK-NO-FASTFMA-LABEL: fmul_pow2_4xfloat:
; CHECK-NO-FASTFMA: # %bb.0:
; CHECK-NO-FASTFMA-NEXT: vpslld $23, %xmm0, %xmm0
; CHECK-NO-FASTFMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1091567616,1091567616,1091567616,1091567616]
; CHECK-NO-FASTFMA-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NO-FASTFMA-NEXT: retq
;
; CHECK-FMA-LABEL: fmul_pow2_4xfloat:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vpslld $23, %xmm0, %xmm0
; CHECK-FMA-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; CHECK-FMA-NEXT: retq
%p2 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %i
%p2_f = uitofp <4 x i32> %p2 to <4 x float>
%r = fmul <4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, %p2_f
ret <4 x float> %r
}
define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) {
; CHECK-SSE-LABEL: fmul_pow2_ldexp_4xfloat:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: subq $56, %rsp
; CHECK-SSE-NEXT: .cfi_def_cfa_offset 64
; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; CHECK-SSE-NEXT: movd %xmm1, %edi
; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-SSE-NEXT: callq ldexpf@PLT
; CHECK-SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-SSE-NEXT: # xmm0 = mem[2,3,2,3]
; CHECK-SSE-NEXT: movd %xmm0, %edi
; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-SSE-NEXT: callq ldexpf@PLT
; CHECK-SSE-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-SSE-NEXT: movd %xmm0, %edi
; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-SSE-NEXT: callq ldexpf@PLT
; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-SSE-NEXT: # xmm0 = mem[1,1,1,1]
; CHECK-SSE-NEXT: movd %xmm0, %edi
; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-SSE-NEXT: callq ldexpf@PLT
; CHECK-SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-SSE-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
; CHECK-SSE-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-SSE-NEXT: movaps %xmm1, %xmm0
; CHECK-SSE-NEXT: addq $56, %rsp
; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: fmul_pow2_ldexp_4xfloat:
; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: subq $40, %rsp
; CHECK-AVX-NEXT: .cfi_def_cfa_offset 48
; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX-NEXT: vextractps $1, %xmm0, %edi
; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX-NEXT: callq ldexpf@PLT
; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-AVX-NEXT: vmovd %xmm0, %edi
; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX-NEXT: callq ldexpf@PLT
; CHECK-AVX-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-AVX-NEXT: vextractps $2, %xmm0, %edi
; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX-NEXT: callq ldexpf@PLT
; CHECK-AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-AVX-NEXT: vextractps $3, %xmm0, %edi
; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX-NEXT: callq ldexpf@PLT
; CHECK-AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; CHECK-AVX-NEXT: addq $40, %rsp
; CHECK-AVX-NEXT: .cfi_def_cfa_offset 8
; CHECK-AVX-NEXT: retq
%r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, <4 x i32> %i)
ret <4 x float> %r
}
define <4 x float> @fdiv_pow2_4xfloat(<4 x i32> %i) {
; CHECK-SSE-LABEL: fdiv_pow2_4xfloat:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: pslld $23, %xmm0
; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm1 = [1091567616,1091567616,1091567616,1091567616]
; CHECK-SSE-NEXT: psubd %xmm0, %xmm1
; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: fdiv_pow2_4xfloat:
; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: vpslld $23, %xmm0, %xmm0
; CHECK-AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1091567616,1091567616,1091567616,1091567616]
; CHECK-AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; CHECK-AVX-NEXT: retq
%p2 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %i
%p2_f = uitofp <4 x i32> %p2 to <4 x float>
%r = fdiv <4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, %p2_f
ret <4 x float> %r
}
declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>)
define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; CHECK-SSE-LABEL: fmul_pow2_8xhalf:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: subq $104, %rsp
; CHECK-SSE-NEXT: .cfi_def_cfa_offset 112
; CHECK-SSE-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; CHECK-SSE-NEXT: pslld $23, %xmm1
; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
; CHECK-SSE-NEXT: paddd %xmm2, %xmm1
; CHECK-SSE-NEXT: cvttps2dq %xmm1, %xmm1
; CHECK-SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-SSE-NEXT: pslld $16, %xmm1
; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-SSE-NEXT: pslld $23, %xmm0
; CHECK-SSE-NEXT: paddd %xmm2, %xmm0
; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0
; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-SSE-NEXT: pslld $16, %xmm0
; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-SSE-NEXT: psrld $16, %xmm0
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-SSE-NEXT: psrlq $48, %xmm0
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; CHECK-SSE-NEXT: cvtdq2ps %xmm1, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-SSE-NEXT: psrld $16, %xmm0
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-SSE-NEXT: psrlq $48, %xmm0
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-SSE-NEXT: pxor %xmm1, %xmm1
; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-SSE-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-SSE-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-SSE-NEXT: addq $104, %rsp
; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX2-LABEL: fmul_pow2_8xhalf:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: subq $120, %rsp
; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 128
; CHECK-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
; CHECK-AVX2-NEXT: vpsllvd %ymm0, %ymm1, %ymm0
; CHECK-AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; CHECK-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; CHECK-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; CHECK-AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-AVX2-NEXT: vzeroupper
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-AVX2-NEXT: vzeroupper
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = mem[1,0]
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-AVX2-NEXT: vzeroupper
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = mem[1,0]
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = mem[3,3,3,3]
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; CHECK-AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; CHECK-AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-AVX2-NEXT: addq $120, %rsp
; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8
; CHECK-AVX2-NEXT: retq
;
; CHECK-NO-FASTFMA-LABEL: fmul_pow2_8xhalf:
; CHECK-NO-FASTFMA: # %bb.0:
; CHECK-NO-FASTFMA-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NO-FASTFMA-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
; CHECK-NO-FASTFMA-NEXT: vpsllvd %ymm0, %ymm1, %ymm0
; CHECK-NO-FASTFMA-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NO-FASTFMA-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; CHECK-NO-FASTFMA-NEXT: vcvtdq2ps %ymm0, %ymm0
; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0
; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %ymm0
; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} ymm1 = [8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3]
; CHECK-NO-FASTFMA-NEXT: vmulps %ymm1, %ymm0, %ymm0
; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0
; CHECK-NO-FASTFMA-NEXT: vzeroupper
; CHECK-NO-FASTFMA-NEXT: retq
;
; CHECK-FMA-LABEL: fmul_pow2_8xhalf:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
; CHECK-FMA-NEXT: vpsllvw %xmm0, %xmm1, %xmm0
; CHECK-FMA-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-FMA-NEXT: vcvtdq2ps %ymm0, %ymm0
; CHECK-FMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0
; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %ymm0
; CHECK-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; CHECK-FMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0
; CHECK-FMA-NEXT: vzeroupper
; CHECK-FMA-NEXT: retq
%p2 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %i
%p2_f = uitofp <8 x i16> %p2 to <8 x half>
%r = fmul <8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, %p2_f
ret <8 x half> %r
}
define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
; CHECK-SSE-LABEL: fmul_pow2_ldexp_8xhalf:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: subq $72, %rsp
; CHECK-SSE-NEXT: .cfi_def_cfa_offset 80
; CHECK-SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; CHECK-SSE-NEXT: pextrw $7, %xmm0, %eax
; CHECK-SSE-NEXT: movswl %ax, %edi
; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-SSE-NEXT: callq ldexpf@PLT
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-SSE-NEXT: pextrw $6, %xmm0, %eax
; CHECK-SSE-NEXT: movswl %ax, %edi
; CHECK-SSE-NEXT: movd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-SSE-NEXT: callq ldexpf@PLT
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-SSE-NEXT: pextrw $5, %xmm0, %eax
; CHECK-SSE-NEXT: movswl %ax, %edi
; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-SSE-NEXT: callq ldexpf@PLT
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-SSE-NEXT: pextrw $4, %xmm0, %eax
; CHECK-SSE-NEXT: movswl %ax, %edi
; CHECK-SSE-NEXT: movd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-SSE-NEXT: callq ldexpf@PLT
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-SSE-NEXT: pextrw $3, %xmm0, %eax
; CHECK-SSE-NEXT: movswl %ax, %edi
; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-SSE-NEXT: callq ldexpf@PLT
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-SSE-NEXT: pextrw $2, %xmm0, %eax
; CHECK-SSE-NEXT: movswl %ax, %edi
; CHECK-SSE-NEXT: movd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-SSE-NEXT: callq ldexpf@PLT
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-SSE-NEXT: pextrw $1, %xmm0, %eax
; CHECK-SSE-NEXT: movswl %ax, %edi
; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-SSE-NEXT: callq ldexpf@PLT
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-SSE-NEXT: movd %xmm0, %eax
; CHECK-SSE-NEXT: movswl %ax, %edi
; CHECK-SSE-NEXT: movd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-SSE-NEXT: callq ldexpf@PLT
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-SSE-NEXT: addq $72, %rsp
; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX2-LABEL: fmul_pow2_ldexp_8xhalf:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: subq $72, %rsp
; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 80
; CHECK-AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; CHECK-AVX2-NEXT: vpextrw $7, %xmm0, %eax
; CHECK-AVX2-NEXT: movswl %ax, %edi
; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX2-NEXT: callq ldexpf@PLT
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX2-NEXT: vpextrw $6, %xmm0, %eax
; CHECK-AVX2-NEXT: movswl %ax, %edi
; CHECK-AVX2-NEXT: vmovd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX2-NEXT: callq ldexpf@PLT
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX2-NEXT: vpextrw $5, %xmm0, %eax
; CHECK-AVX2-NEXT: movswl %ax, %edi
; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX2-NEXT: callq ldexpf@PLT
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX2-NEXT: vpextrw $4, %xmm0, %eax
; CHECK-AVX2-NEXT: movswl %ax, %edi
; CHECK-AVX2-NEXT: vmovd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX2-NEXT: callq ldexpf@PLT
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX2-NEXT: vpextrw $3, %xmm0, %eax
; CHECK-AVX2-NEXT: movswl %ax, %edi
; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX2-NEXT: callq ldexpf@PLT
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX2-NEXT: vpextrw $2, %xmm0, %eax
; CHECK-AVX2-NEXT: movswl %ax, %edi
; CHECK-AVX2-NEXT: vmovd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX2-NEXT: callq ldexpf@PLT
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX2-NEXT: vpextrw $1, %xmm0, %eax
; CHECK-AVX2-NEXT: movswl %ax, %edi
; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX2-NEXT: callq ldexpf@PLT
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX2-NEXT: vmovd %xmm0, %eax
; CHECK-AVX2-NEXT: movswl %ax, %edi
; CHECK-AVX2-NEXT: vmovd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX2-NEXT: callq ldexpf@PLT
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-AVX2-NEXT: addq $72, %rsp
; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8
; CHECK-AVX2-NEXT: retq
;
; CHECK-AVX512F-LABEL: fmul_pow2_ldexp_8xhalf:
; CHECK-AVX512F: # %bb.0:
; CHECK-AVX512F-NEXT: subq $72, %rsp
; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 80
; CHECK-AVX512F-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; CHECK-AVX512F-NEXT: vpextrw $7, %xmm0, %eax
; CHECK-AVX512F-NEXT: movswl %ax, %edi
; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX512F-NEXT: callq ldexpf@PLT
; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX512F-NEXT: vpextrw $6, %xmm0, %eax
; CHECK-AVX512F-NEXT: movswl %ax, %edi
; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX512F-NEXT: callq ldexpf@PLT
; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX512F-NEXT: vpextrw $5, %xmm0, %eax
; CHECK-AVX512F-NEXT: movswl %ax, %edi
; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX512F-NEXT: callq ldexpf@PLT
; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX512F-NEXT: vpextrw $4, %xmm0, %eax
; CHECK-AVX512F-NEXT: movswl %ax, %edi
; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX512F-NEXT: callq ldexpf@PLT
; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-AVX512F-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX512F-NEXT: vpextrw $3, %xmm0, %eax
; CHECK-AVX512F-NEXT: movswl %ax, %edi
; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX512F-NEXT: callq ldexpf@PLT
; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX512F-NEXT: vpextrw $2, %xmm0, %eax
; CHECK-AVX512F-NEXT: movswl %ax, %edi
; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX512F-NEXT: callq ldexpf@PLT
; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX512F-NEXT: vpextrw $1, %xmm0, %eax
; CHECK-AVX512F-NEXT: movswl %ax, %edi
; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX512F-NEXT: callq ldexpf@PLT
; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax
; CHECK-AVX512F-NEXT: movswl %ax, %edi
; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX512F-NEXT: callq ldexpf@PLT
; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-AVX512F-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-AVX512F-NEXT: addq $72, %rsp
; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 8
; CHECK-AVX512F-NEXT: retq
%r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, <8 x i16> %i)
ret <8 x half> %r
}
; 8192.0 (half 0xH7000 = 2^13) divided by a power of two folds to integer
; arithmetic on the f16 exponent field: result bits = 0x7000 - (i << 10),
; i.e. psllw $10 + psubw from the splat constant 28672.
define <8 x half> @fdiv_pow2_8xhalf(<8 x i16> %i) {
; CHECK-SSE-LABEL: fdiv_pow2_8xhalf:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: psllw $10, %xmm0
; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
; CHECK-SSE-NEXT: psubw %xmm0, %xmm1
; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX2-LABEL: fdiv_pow2_8xhalf:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: vpsllw $10, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
; CHECK-AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0
; CHECK-AVX2-NEXT: retq
;
; CHECK-NO-FASTFMA-LABEL: fdiv_pow2_8xhalf:
; CHECK-NO-FASTFMA: # %bb.0:
; CHECK-NO-FASTFMA-NEXT: vpsllw $10, %xmm0, %xmm0
; CHECK-NO-FASTFMA-NEXT: vpbroadcastw {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
; CHECK-NO-FASTFMA-NEXT: vpsubw %xmm0, %xmm1, %xmm0
; CHECK-NO-FASTFMA-NEXT: retq
;
; CHECK-FMA-LABEL: fdiv_pow2_8xhalf:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vpsllw $10, %xmm0, %xmm0
; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [28672,28672,28672,28672,28672,28672,28672,28672]
; CHECK-FMA-NEXT: vpsubw %xmm0, %xmm1, %xmm0
; CHECK-FMA-NEXT: retq
  %p2 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %i
  %p2_f = uitofp <8 x i16> %p2 to <8 x half>
  %r = fdiv <8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, %p2_f
  ret <8 x half> %r
}
; FIXME: The movzbl is unnecessary. It would be UB for the upper bits to be set
; in the original IR.
; 9.0 * (1 << cnt) folds to integer math: shift cnt into the f64 exponent
; position (bit 52) and add it to the bit pattern of 9.0 (0x4022000000000000).
define double @fmul_pow_shl_cnt(i64 %cnt) nounwind {
; CHECK-SSE-LABEL: fmul_pow_shl_cnt:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: movzbl %dil, %eax
; CHECK-SSE-NEXT: shlq $52, %rax
; CHECK-SSE-NEXT: movabsq $4621256167635550208, %rcx # imm = 0x4022000000000000
; CHECK-SSE-NEXT: addq %rax, %rcx
; CHECK-SSE-NEXT: movq %rcx, %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: fmul_pow_shl_cnt:
; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: movzbl %dil, %eax
; CHECK-AVX-NEXT: shlq $52, %rax
; CHECK-AVX-NEXT: movabsq $4621256167635550208, %rcx # imm = 0x4022000000000000
; CHECK-AVX-NEXT: addq %rax, %rcx
; CHECK-AVX-NEXT: vmovq %rcx, %xmm0
; CHECK-AVX-NEXT: retq
  %shl = shl nuw i64 1, %cnt
  %conv = uitofp i64 %shl to double
  %mul = fmul double 9.000000e+00, %conv
  ret double %mul
}
; FIXME: The movzbl is unnecessary. It would be UB for the upper bits to be set
; in the original IR.
; Same fold with base 2 and a negative multiplier: -9.0 * (2 << cnt) becomes
; (cnt + 1) shifted into the exponent and added to the bits of -9.0
; (0xC022000000000000); the sign bit is carried by the constant.
define double @fmul_pow_shl_cnt2(i64 %cnt) nounwind {
; CHECK-SSE-LABEL: fmul_pow_shl_cnt2:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: movzbl %dil, %eax
; CHECK-SSE-NEXT: incl %eax
; CHECK-SSE-NEXT: shlq $52, %rax
; CHECK-SSE-NEXT: movabsq $-4602115869219225600, %rcx # imm = 0xC022000000000000
; CHECK-SSE-NEXT: addq %rax, %rcx
; CHECK-SSE-NEXT: movq %rcx, %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: fmul_pow_shl_cnt2:
; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: movzbl %dil, %eax
; CHECK-AVX-NEXT: incl %eax
; CHECK-AVX-NEXT: shlq $52, %rax
; CHECK-AVX-NEXT: movabsq $-4602115869219225600, %rcx # imm = 0xC022000000000000
; CHECK-AVX-NEXT: addq %rax, %rcx
; CHECK-AVX-NEXT: vmovq %rcx, %xmm0
; CHECK-AVX-NEXT: retq
  %shl = shl nuw i64 2, %cnt
  %conv = uitofp i64 %shl to double
  %mul = fmul double -9.000000e+00, %conv
  ret double %mul
}
; Make sure we do a movzbl of the input register.
; The i8 count is explicitly zero-extended in the IR, so here the movzbl is
; required (unlike the FIXME cases above where the upper bits would be UB).
define double @fmul_pow_shl_cnt3(i8 %cnt) nounwind {
; CHECK-SSE-LABEL: fmul_pow_shl_cnt3:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: movzbl %dil, %eax
; CHECK-SSE-NEXT: shlq $52, %rax
; CHECK-SSE-NEXT: movabsq $-4602115869219225600, %rcx # imm = 0xC022000000000000
; CHECK-SSE-NEXT: addq %rax, %rcx
; CHECK-SSE-NEXT: movq %rcx, %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: fmul_pow_shl_cnt3:
; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: movzbl %dil, %eax
; CHECK-AVX-NEXT: shlq $52, %rax
; CHECK-AVX-NEXT: movabsq $-4602115869219225600, %rcx # imm = 0xC022000000000000
; CHECK-AVX-NEXT: addq %rax, %rcx
; CHECK-AVX-NEXT: vmovq %rcx, %xmm0
; CHECK-AVX-NEXT: retq
  %zext_cnt = zext i8 %cnt to i64
  %shl = shl nuw i64 1, %zext_cnt
  %conv = uitofp i64 %shl to double
  %mul = fmul double -9.000000e+00, %conv
  ret double %mul
}
; FIXME: The movzbl is unnecessary. It would be UB for the upper bits to be set
; in the original IR.
; A select between two powers of two (1<<cnt vs 2<<cnt) still folds: the
; select becomes a cmov between cnt and cnt+1, which is then shifted into the
; f32 exponent (bit 23) and added to the bits of 9.0f (0x41100000).
define float @fmul_pow_select(i32 %cnt, i1 %c) nounwind {
; CHECK-SSE-LABEL: fmul_pow_select:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: movzbl %dil, %eax
; CHECK-SSE-NEXT: leal 1(%rax), %ecx
; CHECK-SSE-NEXT: testb $1, %sil
; CHECK-SSE-NEXT: cmovnel %eax, %ecx
; CHECK-SSE-NEXT: shll $23, %ecx
; CHECK-SSE-NEXT: addl $1091567616, %ecx # imm = 0x41100000
; CHECK-SSE-NEXT: movd %ecx, %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: fmul_pow_select:
; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: movzbl %dil, %eax
; CHECK-AVX-NEXT: leal 1(%rax), %ecx
; CHECK-AVX-NEXT: testb $1, %sil
; CHECK-AVX-NEXT: cmovnel %eax, %ecx
; CHECK-AVX-NEXT: shll $23, %ecx
; CHECK-AVX-NEXT: addl $1091567616, %ecx # imm = 0x41100000
; CHECK-AVX-NEXT: vmovd %ecx, %xmm0
; CHECK-AVX-NEXT: retq
  %shl2 = shl nuw i32 2, %cnt
  %shl1 = shl nuw i32 1, %cnt
  %shl = select i1 %c, i32 %shl1, i32 %shl2
  %conv = uitofp i32 %shl to float
  %mul = fmul float 9.000000e+00, %conv
  ret float %mul
}
; FIXME: The movzbl is unnecessary. It would be UB for the upper bits to be set
; in the original IR.
; umin of two powers of two is still a power of two, so the fold applies:
; 8<<cnt contributes cnt+3 to the exponent, clamped at 13 (8192 = 2^13) via
; cmp/cmov before being merged into the bits of 9.0f.
define float @fmul_fly_pow_mul_min_pow2(i64 %cnt) nounwind {
; CHECK-SSE-LABEL: fmul_fly_pow_mul_min_pow2:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: movzbl %dil, %eax
; CHECK-SSE-NEXT: addl $3, %eax
; CHECK-SSE-NEXT: cmpl $13, %eax
; CHECK-SSE-NEXT: movl $13, %ecx
; CHECK-SSE-NEXT: cmovbl %eax, %ecx
; CHECK-SSE-NEXT: shll $23, %ecx
; CHECK-SSE-NEXT: addl $1091567616, %ecx # imm = 0x41100000
; CHECK-SSE-NEXT: movd %ecx, %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: fmul_fly_pow_mul_min_pow2:
; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: movzbl %dil, %eax
; CHECK-AVX-NEXT: addl $3, %eax
; CHECK-AVX-NEXT: cmpl $13, %eax
; CHECK-AVX-NEXT: movl $13, %ecx
; CHECK-AVX-NEXT: cmovbl %eax, %ecx
; CHECK-AVX-NEXT: shll $23, %ecx
; CHECK-AVX-NEXT: addl $1091567616, %ecx # imm = 0x41100000
; CHECK-AVX-NEXT: vmovd %ecx, %xmm0
; CHECK-AVX-NEXT: retq
  %shl8 = shl nuw i64 8, %cnt
  %shl = call i64 @llvm.umin.i64(i64 %shl8, i64 8192)
  %conv = uitofp i64 %shl to float
  %mul = fmul float 9.000000e+00, %conv
  ret float %mul
}
; FIXME: The movzbl is unnecessary. It would be UB for the upper bits to be set
; in the original IR.
; umax of two powers of two is likewise a power of two: the larger exponent
; (cnt vs cnt+1) is selected with cmp/cmov, shifted to bit 52, and or'ed into
; the bit pattern of 3.0 (0x4008000000000000).
define double @fmul_pow_mul_max_pow2(i16 %cnt) nounwind {
; CHECK-SSE-LABEL: fmul_pow_mul_max_pow2:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: movzbl %dil, %eax
; CHECK-SSE-NEXT: leaq 1(%rax), %rcx
; CHECK-SSE-NEXT: cmpq %rcx, %rax
; CHECK-SSE-NEXT: cmovaq %rax, %rcx
; CHECK-SSE-NEXT: shlq $52, %rcx
; CHECK-SSE-NEXT: movabsq $4613937818241073152, %rax # imm = 0x4008000000000000
; CHECK-SSE-NEXT: orq %rcx, %rax
; CHECK-SSE-NEXT: movq %rax, %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: fmul_pow_mul_max_pow2:
; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: movzbl %dil, %eax
; CHECK-AVX-NEXT: leaq 1(%rax), %rcx
; CHECK-AVX-NEXT: cmpq %rcx, %rax
; CHECK-AVX-NEXT: cmovaq %rax, %rcx
; CHECK-AVX-NEXT: shlq $52, %rcx
; CHECK-AVX-NEXT: movabsq $4613937818241073152, %rax # imm = 0x4008000000000000
; CHECK-AVX-NEXT: orq %rcx, %rax
; CHECK-AVX-NEXT: vmovq %rax, %xmm0
; CHECK-AVX-NEXT: retq
  %shl2 = shl nuw i16 2, %cnt
  %shl1 = shl nuw i16 1, %cnt
  %shl = call i16 @llvm.umax.i16(i16 %shl1, i16 %shl2)
  %conv = uitofp i16 %shl to double
  %mul = fmul double 3.000000e+00, %conv
  ret double %mul
}
; Negative test: the shifted base is a variable (%v), not a known power of
; two, so the exponent-bits fold must not fire; a real uitofp + fmul remains.
define double @fmul_pow_shl_cnt_fail_maybe_non_pow2(i64 %v, i64 %cnt) nounwind {
; CHECK-SSE-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: movq %rsi, %rcx
; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx
; CHECK-SSE-NEXT: shlq %cl, %rdi
; CHECK-SSE-NEXT: movq %rdi, %xmm1
; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE-NEXT: movapd %xmm1, %xmm0
; CHECK-SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; CHECK-SSE-NEXT: addsd %xmm1, %xmm0
; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: movq %rsi, %rcx
; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
; CHECK-AVX2-NEXT: shlq %cl, %rdi
; CHECK-AVX2-NEXT: vmovq %rdi, %xmm0
; CHECK-AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; CHECK-AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
; CHECK-NO-FASTFMA: # %bb.0:
; CHECK-NO-FASTFMA-NEXT: movq %rsi, %rcx
; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx
; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rdi
; CHECK-NO-FASTFMA-NEXT: vcvtusi2sd %rdi, %xmm15, %xmm0
; CHECK-NO-FASTFMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-NO-FASTFMA-NEXT: retq
;
; CHECK-FMA-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: shlxq %rsi, %rdi, %rax
; CHECK-FMA-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0
; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-FMA-NEXT: retq
  %shl = shl nuw i64 %v, %cnt
  %conv = uitofp i64 %shl to double
  %mul = fmul double 9.000000e+00, %conv
  ret double %mul
}
; Negative test: v2i64 -> v2f32 uitofp is expensive without AVX512DQ, so the
; fold is skipped and the conversion is scalarized (only the skx/FMA run uses
; vcvtqq2ps directly).
define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nounwind {
; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,2]
; CHECK-SSE-NEXT: movdqa %xmm2, %xmm3
; CHECK-SSE-NEXT: psllq %xmm1, %xmm3
; CHECK-SSE-NEXT: psllq %xmm0, %xmm2
; CHECK-SSE-NEXT: movq %xmm2, %rax
; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0
; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
; CHECK-SSE-NEXT: movq %xmm1, %rax
; CHECK-SSE-NEXT: xorps %xmm1, %xmm1
; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
; CHECK-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,2]
; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
; CHECK-AVX2-NEXT: vpextrq $1, %xmm0, %rax
; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1
; CHECK-AVX2-NEXT: vmovq %xmm0, %rax
; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0
; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1]
; CHECK-AVX2-NEXT: vmulps %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
; CHECK-NO-FASTFMA: # %bb.0:
; CHECK-NO-FASTFMA-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,2]
; CHECK-NO-FASTFMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
; CHECK-NO-FASTFMA-NEXT: vpextrq $1, %xmm0, %rax
; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1
; CHECK-NO-FASTFMA-NEXT: vmovq %xmm0, %rax
; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0
; CHECK-NO-FASTFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1]
; CHECK-NO-FASTFMA-NEXT: vmulps %xmm1, %xmm0, %xmm0
; CHECK-NO-FASTFMA-NEXT: retq
;
; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2]
; CHECK-FMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0
; CHECK-FMA-NEXT: vcvtqq2ps %xmm0, %xmm0
; CHECK-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; CHECK-FMA-NEXT: retq
  %shl = shl nsw nuw <2 x i64> <i64 2, i64 2>, %cnt
  %conv = uitofp <2 x i64> %shl to <2 x float>
  %mul = fmul <2 x float> <float 15.000000e+00, float 15.000000e+00>, %conv
  ret <2 x float> %mul
}
; Vector form of the fold: 15.0 * (2 << cnt) per lane becomes psllq $52 of
; the counts plus a constant holding the bit pattern of 30.0 per element.
define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: psllq $52, %xmm0
; CHECK-SSE-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: vpsllq $52, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec:
; CHECK-NO-FASTFMA: # %bb.0:
; CHECK-NO-FASTFMA-NEXT: vpsllq $52, %xmm0, %xmm0
; CHECK-NO-FASTFMA-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-NO-FASTFMA-NEXT: retq
;
; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vpsllq $52, %xmm0, %xmm0
; CHECK-FMA-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
; CHECK-FMA-NEXT: retq
  %shl = shl nsw nuw <2 x i64> <i64 2, i64 2>, %cnt
  %conv = uitofp <2 x i64> %shl to <2 x double>
  %mul = fmul <2 x double> <double 15.000000e+00, double 15.000000e+00>, %conv
  ret <2 x double> %mul
}
; With -fp-contract=fast (CHECK-FMA) the fmul must stay a real fmul so the
; trailing fadd can contract into vfmadd132ps; without fast FMA the
; multiply folds into pslld $23 + paddd on the exponent bits as usual.
define <4 x float> @fmul_pow_shl_cnt_vec_preserve_fma(<4 x i32> %cnt, <4 x float> %add) nounwind {
; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: pslld $23, %xmm0
; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: addps %xmm1, %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: vpslld $23, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1092616192,1092616192,1092616192,1092616192]
; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
; CHECK-NO-FASTFMA: # %bb.0:
; CHECK-NO-FASTFMA-NEXT: vpslld $23, %xmm0, %xmm0
; CHECK-NO-FASTFMA-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1092616192,1092616192,1092616192,1092616192]
; CHECK-NO-FASTFMA-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; CHECK-NO-FASTFMA-NEXT: vaddps %xmm1, %xmm0, %xmm0
; CHECK-NO-FASTFMA-NEXT: retq
;
; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_preserve_fma:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2]
; CHECK-FMA-NEXT: vpsllvd %xmm0, %xmm2, %xmm0
; CHECK-FMA-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-FMA-NEXT: vfmadd132ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm1
; CHECK-FMA-NEXT: retq
  %shl = shl nsw nuw <4 x i32> <i32 2, i32 2, i32 2, i32 2>, %cnt
  %conv = uitofp <4 x i32> %shl to <4 x float>
  %mul = fmul <4 x float> <float 5.000000e+00, float 5.000000e+00, float 5.000000e+00, float 5.000000e+00>, %conv
  %res = fadd <4 x float> %mul, %add
  ret <4 x float> %res
}
; Non-splat fp multiplier (<15.0, 14.0>): the fold still applies, with the
; per-lane doubled constants carried in the paddq memory operand.
define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwind {
; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: psllq $52, %xmm0
; CHECK-SSE-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo:
; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: vpsllq $52, %xmm0, %xmm0
; CHECK-AVX-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX-NEXT: retq
  %shl = shl nsw nuw <2 x i64> <i64 2, i64 2>, %cnt
  %conv = uitofp <2 x i64> %shl to <2 x double>
  %mul = fmul <2 x double> <double 15.000000e+00, double 14.000000e+00>, %conv
  ret <2 x double> %mul
}
; Non-splat shift base (<2, 1>): also folds, with the differing base exponents
; absorbed into the per-lane paddq constant.
define <2 x double> @fmul_pow_shl_cnt_vec_non_splat2_todo(<2 x i64> %cnt) nounwind {
; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: psllq $52, %xmm0
; CHECK-SSE-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo:
; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: vpsllq $52, %xmm0, %xmm0
; CHECK-AVX-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX-NEXT: retq
  %shl = shl nsw nuw <2 x i64> <i64 2, i64 1>, %cnt
  %conv = uitofp <2 x i64> %shl to <2 x double>
  %mul = fmul <2 x double> <double 15.000000e+00, double 15.000000e+00>, %conv
  ret <2 x double> %mul
}
; Negative test: an i16 shift can exceed the f16 exponent range, so the fold
; is rejected and the conversion/multiply is done for real (via f32
; libcalls or vcvtps2ph, depending on target features).
define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
; CHECK-SSE-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: subq $40, %rsp
; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-SSE-NEXT: pslld $23, %xmm0
; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0
; CHECK-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; CHECK-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,2,u,u,u,u,u,u]
; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-SSE-NEXT: psrld $16, %xmm0
; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; CHECK-SSE-NEXT: cvtdq2ps %xmm1, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSE-NEXT: addq $40, %rsp
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: subq $56, %rsp
; CHECK-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0]
; CHECK-AVX2-NEXT: vpsllvd %ymm0, %ymm1, %ymm0
; CHECK-AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-AVX2-NEXT: vpextrw $2, %xmm0, %eax
; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0
; CHECK-AVX2-NEXT: vzeroupper
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-AVX2-NEXT: vpextrw $0, %xmm0, %eax
; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0
; CHECK-AVX2-NEXT: vzeroupper
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-AVX2-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; CHECK-AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; CHECK-AVX2-NEXT: addq $56, %rsp
; CHECK-AVX2-NEXT: retq
;
; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
; CHECK-NO-FASTFMA: # %bb.0:
; CHECK-NO-FASTFMA-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NO-FASTFMA-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0]
; CHECK-NO-FASTFMA-NEXT: vpsllvd %ymm0, %ymm1, %ymm0
; CHECK-NO-FASTFMA-NEXT: vpmovdw %zmm0, %ymm0
; CHECK-NO-FASTFMA-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NO-FASTFMA-NEXT: vcvtdq2ps %ymm0, %ymm0
; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0
; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %ymm0
; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1]
; CHECK-NO-FASTFMA-NEXT: vmulps %ymm1, %ymm0, %ymm0
; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0
; CHECK-NO-FASTFMA-NEXT: vzeroupper
; CHECK-NO-FASTFMA-NEXT: retq
;
; CHECK-FMA-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
; CHECK-FMA-NEXT: vpsllvw %xmm0, %xmm1, %xmm0
; CHECK-FMA-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-FMA-NEXT: vcvtdq2ps %ymm0, %ymm0
; CHECK-FMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0
; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %ymm0
; CHECK-FMA-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; CHECK-FMA-NEXT: vcvtps2ph $4, %ymm0, %xmm0
; CHECK-FMA-NEXT: vzeroupper
; CHECK-FMA-NEXT: retq
  %shl = shl nsw nuw <2 x i16> <i16 2, i16 2>, %cnt
  %conv = uitofp <2 x i16> %shl to <2 x half>
  %mul = fmul <2 x half> <half 15.000000e+00, half 15.000000e+00>, %conv
  ret <2 x half> %mul
}
; Negative test: the multiplier 9.745314e+288 is near the f64 exponent limit,
; so adding an unbounded i64 cnt to its exponent could overflow; the fold is
; not performed and a real uitofp + fmul is emitted.
define double @fmul_pow_shl_cnt_fail_maybe_bad_exp(i64 %cnt) nounwind {
; CHECK-SSE-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: movq %rdi, %rcx
; CHECK-SSE-NEXT: movl $1, %eax
; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx
; CHECK-SSE-NEXT: shlq %cl, %rax
; CHECK-SSE-NEXT: movq %rax, %xmm1
; CHECK-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; CHECK-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE-NEXT: movapd %xmm1, %xmm0
; CHECK-SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; CHECK-SSE-NEXT: addsd %xmm1, %xmm0
; CHECK-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: movq %rdi, %rcx
; CHECK-AVX2-NEXT: movl $1, %eax
; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
; CHECK-AVX2-NEXT: shlq %cl, %rax
; CHECK-AVX2-NEXT: vmovq %rax, %xmm0
; CHECK-AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; CHECK-AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; CHECK-AVX2-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
;
; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
; CHECK-NO-FASTFMA: # %bb.0:
; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx
; CHECK-NO-FASTFMA-NEXT: movl $1, %eax
; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx
; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax
; CHECK-NO-FASTFMA-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0
; CHECK-NO-FASTFMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-NO-FASTFMA-NEXT: retq
;
; CHECK-FMA-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: movl $1, %eax
; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax
; CHECK-FMA-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0
; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-FMA-NEXT: retq
  %shl = shl nuw i64 1, %cnt
  %conv = uitofp i64 %shl to double
  %mul = fmul double 9.745314e+288, %conv
  ret double %mul
}
; FIXME: The movzbl is unnecessary. It would be UB for the upper bits to be set
; in the original IR.
; Counterpart to the bad-exp case above: with an i16 count the shift amount is
; bounded, so 9.745314e+288 * 2^cnt cannot overflow the f64 exponent and the
; fold to shlq $52 + add of the constant's bit pattern is safe.
define double @fmul_pow_shl_cnt_safe(i16 %cnt) nounwind {
; CHECK-SSE-LABEL: fmul_pow_shl_cnt_safe:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: movzbl %dil, %eax
; CHECK-SSE-NEXT: shlq $52, %rax
; CHECK-SSE-NEXT: movabsq $8930638061065157010, %rcx # imm = 0x7BEFFFFFFF5F3992
; CHECK-SSE-NEXT: addq %rax, %rcx
; CHECK-SSE-NEXT: movq %rcx, %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: fmul_pow_shl_cnt_safe:
; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: movzbl %dil, %eax
; CHECK-AVX-NEXT: shlq $52, %rax
; CHECK-AVX-NEXT: movabsq $8930638061065157010, %rcx # imm = 0x7BEFFFFFFF5F3992
; CHECK-AVX-NEXT: addq %rax, %rcx
; CHECK-AVX-NEXT: vmovq %rcx, %xmm0
; CHECK-AVX-NEXT: retq
  %shl = shl nuw i16 1, %cnt
  %conv = uitofp i16 %shl to double
  %mul = fmul double 9.745314e+288, %conv
  ret double %mul
}
; fdiv by a power of two folds to exponent subtraction: 1.0 / 2^cnt becomes
; psubq of (cnt << 52) from the bits of 1.0 (4607182418800017408 =
; 0x3FF0000000000000) per lane.
define <2 x double> @fdiv_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_vec:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: psllq $52, %xmm0
; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4607182418800017408,4607182418800017408]
; CHECK-SSE-NEXT: psubq %xmm0, %xmm1
; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: fdiv_pow_shl_cnt_vec:
; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: vpsllq $52, %xmm0, %xmm0
; CHECK-AVX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4607182418800017408,4607182418800017408]
; CHECK-AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; CHECK-AVX-NEXT: retq
  %shl = shl nuw <2 x i64> <i64 1, i64 1>, %cnt
  %conv = uitofp <2 x i64> %shl to <2 x double>
  %mul = fdiv <2 x double> <double 1.000000e+00, double 1.000000e+00>, %conv
  ret <2 x double> %mul
}
; Unlike the fmul case, the fdiv fold fires even though v2i64 -> v2f32 uitofp
; is expensive, because the conversion is eliminated entirely: 1.0f / 2^cnt is
; psubd of (cnt << 23) from the bits of 1.0f (1065353216 = 0x3F800000).
define <2 x float> @fdiv_pow_shl_cnt_vec_with_expensive_cast(<2 x i64> %cnt) nounwind {
; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; CHECK-SSE-NEXT: pslld $23, %xmm1
; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm0 = [1065353216,1065353216,u,u]
; CHECK-SSE-NEXT: psubd %xmm1, %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast:
; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-AVX-NEXT: vpslld $23, %xmm0, %xmm0
; CHECK-AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216]
; CHECK-AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; CHECK-AVX-NEXT: retq
  %shl = shl nuw <2 x i64> <i64 1, i64 1>, %cnt
  %conv = uitofp <2 x i64> %shl to <2 x float>
  %mul = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %conv
  ret <2 x float> %mul
}
; Negative test: the shl here carries no nuw flag, so 8 << cnt may wrap to
; zero and the division could produce inf; the fold must not fire, and a
; real (signed-path) uitofp + divss is emitted instead.
define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind {
; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: movq %rdi, %rcx
; CHECK-SSE-NEXT: movl $8, %eax
; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx
; CHECK-SSE-NEXT: shlq %cl, %rax
; CHECK-SSE-NEXT: testq %rax, %rax
; CHECK-SSE-NEXT: js .LBB23_1
; CHECK-SSE-NEXT: # %bb.2:
; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
; CHECK-SSE-NEXT: jmp .LBB23_3
; CHECK-SSE-NEXT: .LBB23_1:
; CHECK-SSE-NEXT: shrq %rax
; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
; CHECK-SSE-NEXT: addss %xmm1, %xmm1
; CHECK-SSE-NEXT: .LBB23_3:
; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-SSE-NEXT: divss %xmm1, %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: movq %rdi, %rcx
; CHECK-AVX2-NEXT: movl $8, %eax
; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
; CHECK-AVX2-NEXT: shlq %cl, %rax
; CHECK-AVX2-NEXT: testq %rax, %rax
; CHECK-AVX2-NEXT: js .LBB23_1
; CHECK-AVX2-NEXT: # %bb.2:
; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0
; CHECK-AVX2-NEXT: jmp .LBB23_3
; CHECK-AVX2-NEXT: .LBB23_1:
; CHECK-AVX2-NEXT: shrq %rax
; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0
; CHECK-AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
; CHECK-AVX2-NEXT: .LBB23_3:
; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
; CHECK-AVX2-NEXT: retq
;
; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
; CHECK-NO-FASTFMA: # %bb.0:
; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx
; CHECK-NO-FASTFMA-NEXT: movl $8, %eax
; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx
; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax
; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0
; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
; CHECK-NO-FASTFMA-NEXT: retq
;
; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: movl $8, %eax
; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax
; CHECK-FMA-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0
; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
; CHECK-FMA-NEXT: retq
  %shl = shl i64 8, %cnt
  %conv = uitofp i64 %shl to float
  %mul = fdiv float -9.000000e+00, %conv
  ret float %mul
}
; Negative test: the conversion is 'sitofp', so for large %cnt the shifted
; value is interpreted as a negative integer and the divisor is no longer a
; positive power of two — the fold must not fire. All RUN lines expect a
; signed convert ((v)cvtsi2ss) followed by a real (v)divss.
define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind {
; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: movq %rdi, %rcx
; CHECK-SSE-NEXT: movl $8, %eax
; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx
; CHECK-SSE-NEXT: shlq %cl, %rax
; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-SSE-NEXT: divss %xmm1, %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: movq %rdi, %rcx
; CHECK-AVX2-NEXT: movl $8, %eax
; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
; CHECK-AVX2-NEXT: shlq %cl, %rax
; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0
; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
; CHECK-AVX2-NEXT: retq
;
; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
; CHECK-NO-FASTFMA: # %bb.0:
; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx
; CHECK-NO-FASTFMA-NEXT: movl $8, %eax
; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx
; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax
; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0
; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
; CHECK-NO-FASTFMA-NEXT: retq
;
; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_neg_int:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: movl $8, %eax
; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax
; CHECK-FMA-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0
; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
; CHECK-FMA-NEXT: retq
  ; sitofp (not uitofp): the divisor can come out negative for large counts.
%shl = shl i64 8, %cnt
%conv = sitofp i64 %shl to float
%mul = fdiv float -9.000000e+00, %conv
ret float %mul
}
; Positive test: the count is masked to [0, 31], so (8 << %cnt) is a known
; in-range power of two and the fdiv folds away entirely — the expected code
; builds the result bit pattern directly (shift the count into the float
; exponent field at bit 23 and subtract it from the constant numerator's
; bit pattern); no divss/cvt instructions appear in the checks.
define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
; CHECK-SSE-LABEL: fdiv_pow_shl_cnt:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: andl $31, %edi
; CHECK-SSE-NEXT: shll $23, %edi
; CHECK-SSE-NEXT: movl $-1115684864, %eax # imm = 0xBD800000
; CHECK-SSE-NEXT: subl %edi, %eax
; CHECK-SSE-NEXT: movd %eax, %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: fdiv_pow_shl_cnt:
; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: andl $31, %edi
; CHECK-AVX-NEXT: shll $23, %edi
; CHECK-AVX-NEXT: movl $-1115684864, %eax # imm = 0xBD800000
; CHECK-AVX-NEXT: subl %edi, %eax
; CHECK-AVX-NEXT: vmovd %eax, %xmm0
; CHECK-AVX-NEXT: retq
  ; Mask bounds the shift amount, making the divisor a provable power of two.
%cnt = and i64 %cnt_in, 31
%shl = shl i64 8, %cnt
%conv = sitofp i64 %shl to float
%mul = fdiv float -0.500000e+00, %conv
ret float %mul
}
; Negative test: the i32 shift count is unbounded, so 2^%cnt can exceed what
; half's exponent range can absorb (numerator 0xH7000 = 8192.0) — presumably
; the quotient could underflow, so the fold must not fire. The checks expect a
; real convert + divide, via libcalls (__truncsfhf2/__extendhfsf2) without
; AVX-512 and via vcvtps2ph/vcvtph2ps with it.
define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: pushq %rax
; CHECK-SSE-NEXT: movl %edi, %ecx
; CHECK-SSE-NEXT: movl $1, %eax
; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
; CHECK-SSE-NEXT: shll %cl, %eax
; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-SSE-NEXT: divss %xmm0, %xmm1
; CHECK-SSE-NEXT: movaps %xmm1, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: popq %rax
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: pushq %rax
; CHECK-AVX2-NEXT: movl %edi, %ecx
; CHECK-AVX2-NEXT: movl $1, %eax
; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
; CHECK-AVX2-NEXT: shll %cl, %eax
; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: popq %rax
; CHECK-AVX2-NEXT: retq
;
; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
; CHECK-NO-FASTFMA: # %bb.0:
; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx
; CHECK-NO-FASTFMA-NEXT: movl $1, %eax
; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx
; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax
; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0
; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %xmm0
; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-NO-FASTFMA-NEXT: retq
;
; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: movl $1, %eax
; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax
; CHECK-FMA-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0
; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %xmm0
; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-FMA-NEXT: retq
  ; i32 count: 2^cnt can be far outside half's representable exponent range.
%shl = shl nuw i32 1, %cnt
%conv = uitofp i32 %shl to half
%mul = fdiv half 0xH7000, %conv
ret half %mul
}
; Positive test: i16 count with 'nuw' keeps 2^%cnt small enough that the
; quotient stays in half's normal range, so the fdiv folds to integer
; arithmetic on the half bit pattern — shift the count into the exponent
; field at bit 10 and subtract from the numerator's bits (0x7000 = 8192.0).
; No divide or conversion instructions appear in the checks.
define half @fdiv_pow_shl_cnt_in_bounds(i16 %cnt) nounwind {
; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_in_bounds:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: shll $10, %edi
; CHECK-SSE-NEXT: movl $28672, %eax # imm = 0x7000
; CHECK-SSE-NEXT: subl %edi, %eax
; CHECK-SSE-NEXT: pinsrw $0, %eax, %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: fdiv_pow_shl_cnt_in_bounds:
; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: shll $10, %edi
; CHECK-AVX-NEXT: movl $28672, %eax # imm = 0x7000
; CHECK-AVX-NEXT: subl %edi, %eax
; CHECK-AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; CHECK-AVX-NEXT: retq
%shl = shl nuw i16 1, %cnt
%conv = uitofp i16 %shl to half
%mul = fdiv half 0xH7000, %conv
ret half %mul
}
; Positive test: same exponent-subtraction fold as fdiv_pow_shl_cnt_in_bounds
; but with a smaller numerator (0x4800 = 8.0 in half), exercising the bound
; check with a different starting exponent. Still no divide in the checks.
define half @fdiv_pow_shl_cnt_in_bounds2(i16 %cnt) nounwind {
; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_in_bounds2:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: shll $10, %edi
; CHECK-SSE-NEXT: movl $18432, %eax # imm = 0x4800
; CHECK-SSE-NEXT: subl %edi, %eax
; CHECK-SSE-NEXT: pinsrw $0, %eax, %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: fdiv_pow_shl_cnt_in_bounds2:
; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: shll $10, %edi
; CHECK-AVX-NEXT: movl $18432, %eax # imm = 0x4800
; CHECK-AVX-NEXT: subl %edi, %eax
; CHECK-AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; CHECK-AVX-NEXT: retq
%shl = shl nuw i16 1, %cnt
%conv = uitofp i16 %shl to half
%mul = fdiv half 0xH4800, %conv
ret half %mul
}
; Negative test: with numerator 0xH4000 (2.0) the quotient's exponent can
; reach the bottom of half's normal range for large i16 counts (NOTE(review):
; presumably the underflow/denormal risk is what blocks the fold — confirm
; against the DAGCombiner bound), so no exponent-subtraction fold happens and
; a real convert + divide sequence is expected on every RUN line.
define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: pushq %rax
; CHECK-SSE-NEXT: movl %edi, %ecx
; CHECK-SSE-NEXT: movl $1, %eax
; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
; CHECK-SSE-NEXT: shll %cl, %eax
; CHECK-SSE-NEXT: movzwl %ax, %eax
; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-SSE-NEXT: divss %xmm0, %xmm1
; CHECK-SSE-NEXT: movaps %xmm1, %xmm0
; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
; CHECK-SSE-NEXT: popq %rax
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: pushq %rax
; CHECK-AVX2-NEXT: movl %edi, %ecx
; CHECK-AVX2-NEXT: movl $1, %eax
; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
; CHECK-AVX2-NEXT: shll %cl, %eax
; CHECK-AVX2-NEXT: movzwl %ax, %eax
; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: popq %rax
; CHECK-AVX2-NEXT: retq
;
; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
; CHECK-NO-FASTFMA: # %bb.0:
; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx
; CHECK-NO-FASTFMA-NEXT: movl $1, %eax
; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx
; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax
; CHECK-NO-FASTFMA-NEXT: movzwl %ax, %eax
; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0
; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %xmm0
; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-NO-FASTFMA-NEXT: retq
;
; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: movl $1, %eax
; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax
; CHECK-FMA-NEXT: movzwl %ax, %eax
; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0
; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %xmm0
; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-FMA-NEXT: retq
%shl = shl nuw i16 1, %cnt
%conv = uitofp i16 %shl to half
%mul = fdiv half 0xH4000, %conv
ret half %mul
}
; FIXME: The movzbl is unnecessary. It would be UB for the upper bits to be set
; in the original IR.
; Positive test: an i32 power-of-two divisor always fits in double's exponent
; range, so the fdiv folds to integer arithmetic on the double bit pattern —
; the count is shifted into the exponent field at bit 52 and subtracted from
; the numerator's bits. The movzbl is the FIXME'd redundant zero-extension
; noted in the comment above this function.
define double @fdiv_pow_shl_cnt32_to_dbl_okay(i32 %cnt) nounwind {
; CHECK-SSE-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: movzbl %dil, %eax
; CHECK-SSE-NEXT: shlq $52, %rax
; CHECK-SSE-NEXT: movabsq $3936146074321813504, %rcx # imm = 0x36A0000000000000
; CHECK-SSE-NEXT: subq %rax, %rcx
; CHECK-SSE-NEXT: movq %rcx, %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: movzbl %dil, %eax
; CHECK-AVX-NEXT: shlq $52, %rax
; CHECK-AVX-NEXT: movabsq $3936146074321813504, %rcx # imm = 0x36A0000000000000
; CHECK-AVX-NEXT: subq %rax, %rcx
; CHECK-AVX-NEXT: vmovq %rcx, %xmm0
; CHECK-AVX-NEXT: retq
%shl = shl nuw i32 1, %cnt
%conv = uitofp i32 %shl to double
%mul = fdiv double 0x36A0000000000000, %conv
ret double %mul
}
; Negative test: the numerator (0x3a1fffff00000000, printed as 1.00974148E-28)
; is already tiny and sits just below the power-of-two used in the companion
; "okay" test, so dividing by 2^%cnt can leave float's safe range —
; presumably the underflow risk blocks the fold. Every RUN line expects a
; real convert + (v)divss.
define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind {
; CHECK-SSE-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: movl %edi, %ecx
; CHECK-SSE-NEXT: movl $1, %eax
; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
; CHECK-SSE-NEXT: shll %cl, %eax
; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0]
; CHECK-SSE-NEXT: divss %xmm1, %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: movl %edi, %ecx
; CHECK-AVX2-NEXT: movl $1, %eax
; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
; CHECK-AVX2-NEXT: shll %cl, %eax
; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0
; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0]
; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
; CHECK-AVX2-NEXT: retq
;
; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
; CHECK-NO-FASTFMA: # %bb.0:
; CHECK-NO-FASTFMA-NEXT: movl %edi, %ecx
; CHECK-NO-FASTFMA-NEXT: movl $1, %eax
; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx
; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax
; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0
; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0]
; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
; CHECK-NO-FASTFMA-NEXT: retq
;
; CHECK-FMA-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
; CHECK-FMA: # %bb.0:
; CHECK-FMA-NEXT: movl $1, %eax
; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax
; CHECK-FMA-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0
; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0]
; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
; CHECK-FMA-NEXT: retq
%shl = shl nuw i32 1, %cnt
%conv = uitofp i32 %shl to float
%mul = fdiv float 0x3a1fffff00000000, %conv
ret float %mul
}
; FIXME: The movzbl is unnecessary. It would be UB for the upper bits to be set
; in the original IR.
; Positive test: companion to fdiv_pow_shl_cnt32_out_of_bounds2 with the
; numerator bumped to the exact power of two 0x3a20000000000000, so the fdiv
; folds to integer exponent arithmetic on the float bit pattern (count shifted
; to bit 23, subtracted from the constant 0x11000000). The movzbl is the
; FIXME'd redundant zero-extension noted in the comment above this function.
define float @fdiv_pow_shl_cnt32_okay(i32 %cnt) nounwind {
; CHECK-SSE-LABEL: fdiv_pow_shl_cnt32_okay:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: movzbl %dil, %eax
; CHECK-SSE-NEXT: shll $23, %eax
; CHECK-SSE-NEXT: movl $285212672, %ecx # imm = 0x11000000
; CHECK-SSE-NEXT: subl %eax, %ecx
; CHECK-SSE-NEXT: movd %ecx, %xmm0
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: fdiv_pow_shl_cnt32_okay:
; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: movzbl %dil, %eax
; CHECK-AVX-NEXT: shll $23, %eax
; CHECK-AVX-NEXT: movl $285212672, %ecx # imm = 0x11000000
; CHECK-AVX-NEXT: subl %eax, %ecx
; CHECK-AVX-NEXT: vmovd %ecx, %xmm0
; CHECK-AVX-NEXT: retq
%shl = shl nuw i32 1, %cnt
%conv = uitofp i32 %shl to float
%mul = fdiv float 0x3a20000000000000, %conv
ret float %mul
}
; Regression test, presumably for llvm/llvm-project issue #128528 (per the
; function name): a select of two small constants converted via uitofp to
; x86_fp80 and multiplied by a constant. The checks pin the x87 lowering —
; materialize the selected integer with cmovne, spill it to the stack for
; fildl, then fmull against a constant-pool operand.
define x86_fp80 @pr128528(i1 %cond) {
; CHECK-SSE-LABEL: pr128528:
; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: testb $1, %dil
; CHECK-SSE-NEXT: movl $8, %eax
; CHECK-SSE-NEXT: movl $1, %ecx
; CHECK-SSE-NEXT: cmovnel %eax, %ecx
; CHECK-SSE-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
; CHECK-SSE-NEXT: fildl -{{[0-9]+}}(%rsp)
; CHECK-SSE-NEXT: fmull {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: pr128528:
; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: testb $1, %dil
; CHECK-AVX-NEXT: movl $8, %eax
; CHECK-AVX-NEXT: movl $1, %ecx
; CHECK-AVX-NEXT: cmovnel %eax, %ecx
; CHECK-AVX-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
; CHECK-AVX-NEXT: fildl -{{[0-9]+}}(%rsp)
; CHECK-AVX-NEXT: fmull {{\.?LCPI[0-9]+_[0-9]+}}(%rip)
; CHECK-AVX-NEXT: retq
%sub9 = select i1 %cond, i32 8, i32 1
%conv = uitofp i32 %sub9 to x86_fp80
%mul = fmul x86_fp80 %conv, 0xK4007D055555555555800
ret x86_fp80 %mul
}