llvm-project/llvm/test/CodeGen/X86/vector-narrow-binop.ll
Guozhi Wei f1d8345a2a [TwoAddressInstructionPass] Create register mapping for registers with multiple uses in the current MBB
Currently we create register mappings only for registers used once in the current
MBB. For registers with multiple uses, when all the uses are in the current MBB,
we can also create mappings for them in the same way, according to the last use.
For example:

    %reg101 = ...
            = ... reg101
    %reg103 = ADD %reg101, %reg102

We can create mapping between %reg101 and %reg103.

Differential Revision: https://reviews.llvm.org/D113193
2021-11-29 19:01:59 -08:00

196 lines
7.7 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX,AVX512
; PR32790: computes ((%a + %b) & %c) - %d on <8 x i32> operands.
;
; AVX1 has support for 256-bit bitwise logic because the FP variants were included.
; If using those ops requires extra insert/extract though, it's probably not worth it.
;
; What the assertions below pin down:
;   - SSE2 run: every op is done as two 128-bit halves (xmm registers only).
;   - AVX1 run: the integer add/sub are split into 128-bit halves via
;     vextractf128/vinsertf128, and the 'and' is kept as 128-bit vpand on each
;     half instead of being widened to a 256-bit logic op.
;   - AVX2 and AVX512 runs: one 256-bit (ymm) instruction per IR operation.
define <8 x i32> @PR32790(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
; SSE-LABEL: PR32790:
; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: paddd %xmm3, %xmm1
; SSE-NEXT: pand %xmm5, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: psubd %xmm6, %xmm0
; SSE-NEXT: psubd %xmm7, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: PR32790:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm1
; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR32790:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsubd %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: PR32790:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpsubd %ymm3, %ymm0, %ymm0
; AVX512-NEXT: retq
; The bitwise 'and' sits between two integer ops (add and sub) on the same value.
%add = add <8 x i32> %a, %b
%and = and <8 x i32> %add, %c
%sub = sub <8 x i32> %and, %d
ret <8 x i32> %sub
}
; In a more extreme case, even the later AVX targets should avoid extract/insert just
; because 256-bit ops are supported.
;
; The IR concatenates %a with %b and %c with %d into 256-bit vectors, ANDs them,
; splits the result back into 128-bit halves and subtracts the high half from
; the low half, i.e. the result is (%a & %c) - (%b & %d).
; All runs are expected to emit two 128-bit 'and' instructions plus one 128-bit
; subtract; none of the assertions below mention a ymm register.
define <4 x i32> @do_not_use_256bit_op(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; SSE-LABEL: do_not_use_256bit_op:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: do_not_use_256bit_op:
; AVX: # %bb.0:
; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
; Identity-mask shuffles of two <4 x i32> inputs: plain concatenation to <8 x i32>.
%concat1 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%concat2 = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%and = and <8 x i32> %concat1, %concat2
; Low half (elements 0-3) and high half (elements 4-7) of the 256-bit 'and'.
%extract1 = shufflevector <8 x i32> %and, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2 = shufflevector <8 x i32> %and, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%sub = sub <4 x i32> %extract1, %extract2
ret <4 x i32> %sub
}
; When extracting from a vector binop, the source width should be a multiple of the destination width.
; https://bugs.llvm.org/show_bug.cgi?id=39511
;
; Here a <3 x float> result is taken from a <4 x float> fadd, so the source
; element count (4) is not a multiple of the destination element count (3).
; Every run is expected to emit a single full-width packed add with the
; constant operand loaded from the constant pool.
; Note that the pointer parameter %b is not referenced in the body.
define <3 x float> @PR39511(<4 x float> %t0, <3 x float>* %b) {
; SSE-LABEL: PR39511:
; SSE: # %bb.0:
; SSE-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: PR39511:
; AVX: # %bb.0:
; AVX-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%add = fadd <4 x float> %t0, <float 1.0, float 2.0, float 3.0, float 4.0>
; Keep elements 0-2 of the sum; element 3 is dropped.
%ext = shufflevector <4 x float> %add, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
ret <3 x float> %ext
}
; When extracting from a vector binop, we need to be extracting
; by a width of at least 1 of the original vector elements.
; https://bugs.llvm.org/show_bug.cgi?id=39893
;
; The binop here works on i32 elements, but the final shuffle picks individual
; i8 elements out of its bitcast result, i.e. pieces narrower than one element
; of the original binop type.
; The assertions below expect the subtract to stay a full 128-bit integer op
; (pxor + psubd), followed by a shift, a shuffle and a byte interleave that
; assemble the two result bytes.
define <2 x i8> @PR39893(<2 x i32> %x, <8 x i8> %y) {
; SSE-LABEL: PR39893:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: psubd %xmm0, %xmm2
; SSE-NEXT: psrld $16, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: PR39893:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpsubd %xmm0, %xmm2, %xmm0
; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: retq
; Element 0 is the negation of %x[0]; element 1 has an undef operand.
%sub = sub <2 x i32> <i32 0, i32 undef>, %x
%bc = bitcast <2 x i32> %sub to <8 x i8>
; Mask index 10 selects element 2 of the second operand (%bc); index 4 selects
; element 4 of the first operand (%y).
%shuffle = shufflevector <8 x i8> %y, <8 x i8> %bc, <2 x i32> <i32 10, i32 4>
ret <2 x i8> %shuffle
}
; Floating-point variant with the same shape as @PR39893 (presumably a companion
; test for the same bug report -- confirm against the upstream history).
; A <2 x float> fsub is bitcast to <8 x i8> and only its first two bytes are
; returned, so the extracted piece is again narrower than one element of the
; binop type.
; The assertions below expect a zeroing xor followed by a packed subtract; the
; SSE2 sequence additionally copies the result into xmm0.
define <2 x i8> @PR39893_2(<2 x float> %x) {
; SSE-LABEL: PR39893_2:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: subps %xmm0, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: PR39893_2:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vsubps %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
; 0.0 - %x, element-wise.
%fsub = fsub <2 x float> zeroinitializer, %x
%bc = bitcast <2 x float> %fsub to <8 x i8>
; Bytes 0 and 1 of the bitcast value.
%shuffle = shufflevector <8 x i8> %bc, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
ret <2 x i8> %shuffle
}
; Squares a 4-element shuffle of %x and %y, adds the upper half of the products
; onto the lower half, and returns lane 1 of that sum in lane 0 of the result.
; All other result lanes are undef, so the only defined output value is
; x[0]*x[0] + y[1]*y[1].
;
; In the assertions below every run does the multiplies and the add on 128-bit
; (xmm) registers, although the IR types are <4 x double>. The AVX512 run
; expects one multiply and the add to be fused into vfmadd231pd; the other runs
; expect separate multiply and add instructions.
define <4 x double> @fmul_v2f64(<2 x double> %x, <2 x double> %y) {
; SSE-LABEL: fmul_v2f64:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm1, %xmm2
; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: mulpd %xmm2, %xmm2
; SSE-NEXT: mulpd %xmm1, %xmm1
; SSE-NEXT: addpd %xmm1, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: movapd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: fmul_v2f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],xmm0[0]
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX1-NEXT: vmulpd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmulpd %xmm2, %xmm2, %xmm1
; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: fmul_v2f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm1[0],xmm0[0]
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vmulpd %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmulpd %xmm2, %xmm2, %xmm1
; AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: retq
;
; AVX512-LABEL: fmul_v2f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT: vmulpd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vfmadd231pd {{.*#+}} xmm0 = (xmm2 * xmm2) + xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512-NEXT: retq
; Mask indices 0-1 address %x and 2-3 address %y, so %s = <y[0], x[0], x[1], y[1]>.
%s = shufflevector <2 x double> %x, <2 x double> %y, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
%bo = fmul fast <4 x double> %s, %s
; Move the upper two products into the lower two lanes; the rest is undef.
%ext = shufflevector <4 x double> %bo, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%add = fadd fast <4 x double> %bo, %ext
; Only lane 0 of the result is defined; it receives lane 1 of %add.
%rdx = shufflevector <4 x double> %add, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
ret <4 x double> %rdx
}