Craig Topper 8dfb9627b7 [X86] Make v32i16/v64i8 legal types without avx512bw. Use custom splitting instead.
This moves v32i16/v64i8 to a model consistent with how we
treat integer types with avx1.

This does change the ABI for types vXi16/vXi8 vectors larger than
512 bits to pass in multiple zmms instead of multiple ymms. We'd
already hacked some code to make v64i8/v32i16 pass in zmm.

Cost model is still a bit of a mess. In some place I tried to
match existing behavior. But really we need to account for
splitting and concating costs. Cost model for shuffles is
especially pessimistic.

Differential Revision: https://reviews.llvm.org/D76212
2020-04-15 12:17:18 -07:00

303 lines
12 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 --check-prefix=SSE2-PROMOTE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 --check-prefix=SSE41-PROMOTE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
define <4 x i16> @mulhuw_v4i16(<4 x i16> %a, <4 x i16> %b) {
; SSE-LABEL: mulhuw_v4i16:
; SSE: # %bb.0:
; SSE-NEXT: pmulhuw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mulhuw_v4i16:
; AVX: # %bb.0:
; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%a1 = zext <4 x i16> %a to <4 x i32>
%b1 = zext <4 x i16> %b to <4 x i32>
%c = mul <4 x i32> %a1, %b1
%d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
%e = trunc <4 x i32> %d to <4 x i16>
ret <4 x i16> %e
}
define <4 x i16> @mulhw_v4i16(<4 x i16> %a, <4 x i16> %b) {
; SSE-LABEL: mulhw_v4i16:
; SSE: # %bb.0:
; SSE-NEXT: pmulhw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mulhw_v4i16:
; AVX: # %bb.0:
; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%a1 = sext <4 x i16> %a to <4 x i32>
%b1 = sext <4 x i16> %b to <4 x i32>
%c = mul <4 x i32> %a1, %b1
%d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
%e = trunc <4 x i32> %d to <4 x i16>
ret <4 x i16> %e
}
define <8 x i16> @mulhuw_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: mulhuw_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pmulhuw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mulhuw_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%a1 = zext <8 x i16> %a to <8 x i32>
%b1 = zext <8 x i16> %b to <8 x i32>
%c = mul <8 x i32> %a1, %b1
%d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%e = trunc <8 x i32> %d to <8 x i16>
ret <8 x i16> %e
}
define <8 x i16> @mulhw_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: mulhw_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pmulhw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mulhw_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%a1 = sext <8 x i16> %a to <8 x i32>
%b1 = sext <8 x i16> %b to <8 x i32>
%c = mul <8 x i32> %a1, %b1
%d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%e = trunc <8 x i32> %d to <8 x i16>
ret <8 x i16> %e
}
define <16 x i16> @mulhuw_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: mulhuw_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: pmulhuw %xmm2, %xmm0
; SSE-NEXT: pmulhuw %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: mulhuw_v16i16:
; AVX: # %bb.0:
; AVX-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
%a1 = zext <16 x i16> %a to <16 x i32>
%b1 = zext <16 x i16> %b to <16 x i32>
%c = mul <16 x i32> %a1, %b1
%d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%e = trunc <16 x i32> %d to <16 x i16>
ret <16 x i16> %e
}
define <16 x i16> @mulhw_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: mulhw_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: pmulhw %xmm2, %xmm0
; SSE-NEXT: pmulhw %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: mulhw_v16i16:
; AVX: # %bb.0:
; AVX-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
%a1 = sext <16 x i16> %a to <16 x i32>
%b1 = sext <16 x i16> %b to <16 x i32>
%c = mul <16 x i32> %a1, %b1
%d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%e = trunc <16 x i32> %d to <16 x i16>
ret <16 x i16> %e
}
define <32 x i16> @mulhuw_v32i16(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: mulhuw_v32i16:
; SSE: # %bb.0:
; SSE-NEXT: pmulhuw %xmm4, %xmm0
; SSE-NEXT: pmulhuw %xmm5, %xmm1
; SSE-NEXT: pmulhuw %xmm6, %xmm2
; SSE-NEXT: pmulhuw %xmm7, %xmm3
; SSE-NEXT: retq
;
; AVX2-LABEL: mulhuw_v32i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmulhuw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mulhuw_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mulhuw_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmulhuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%a1 = zext <32 x i16> %a to <32 x i32>
%b1 = zext <32 x i16> %b to <32 x i32>
%c = mul <32 x i32> %a1, %b1
%d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%e = trunc <32 x i32> %d to <32 x i16>
ret <32 x i16> %e
}
define <32 x i16> @mulhw_v32i16(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: mulhw_v32i16:
; SSE: # %bb.0:
; SSE-NEXT: pmulhw %xmm4, %xmm0
; SSE-NEXT: pmulhw %xmm5, %xmm1
; SSE-NEXT: pmulhw %xmm6, %xmm2
; SSE-NEXT: pmulhw %xmm7, %xmm3
; SSE-NEXT: retq
;
; AVX2-LABEL: mulhw_v32i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmulhw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mulhw_v32i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT: vpmulhw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mulhw_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmulhw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%a1 = sext <32 x i16> %a to <32 x i32>
%b1 = sext <32 x i16> %b to <32 x i32>
%c = mul <32 x i32> %a1, %b1
%d = lshr <32 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%e = trunc <32 x i32> %d to <32 x i16>
ret <32 x i16> %e
}
define <64 x i16> @mulhuw_v64i16(<64 x i16> %a, <64 x i16> %b) {
; SSE-LABEL: mulhuw_v64i16:
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT: pmulhuw {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT: movdqa %xmm7, 112(%rdi)
; SSE-NEXT: movdqa %xmm6, 96(%rdi)
; SSE-NEXT: movdqa %xmm5, 80(%rdi)
; SSE-NEXT: movdqa %xmm4, 64(%rdi)
; SSE-NEXT: movdqa %xmm3, 48(%rdi)
; SSE-NEXT: movdqa %xmm2, 32(%rdi)
; SSE-NEXT: movdqa %xmm1, 16(%rdi)
; SSE-NEXT: movdqa %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX2-LABEL: mulhuw_v64i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhuw %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpmulhuw %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpmulhuw %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpmulhuw %ymm7, %ymm3, %ymm3
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mulhuw_v64i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
; AVX512F-NEXT: vpmulhuw %ymm4, %ymm5, %ymm4
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpmulhuw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mulhuw_v64i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmulhuw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpmulhuw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: retq
%a1 = zext <64 x i16> %a to <64 x i32>
%b1 = zext <64 x i16> %b to <64 x i32>
%c = mul <64 x i32> %a1, %b1
%d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%e = trunc <64 x i32> %d to <64 x i16>
ret <64 x i16> %e
}
define <64 x i16> @mulhw_v64i16(<64 x i16> %a, <64 x i16> %b) {
; SSE-LABEL: mulhw_v64i16:
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT: pmulhw {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT: movdqa %xmm7, 112(%rdi)
; SSE-NEXT: movdqa %xmm6, 96(%rdi)
; SSE-NEXT: movdqa %xmm5, 80(%rdi)
; SSE-NEXT: movdqa %xmm4, 64(%rdi)
; SSE-NEXT: movdqa %xmm3, 48(%rdi)
; SSE-NEXT: movdqa %xmm2, 32(%rdi)
; SSE-NEXT: movdqa %xmm1, 16(%rdi)
; SSE-NEXT: movdqa %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX2-LABEL: mulhw_v64i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhw %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpmulhw %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpmulhw %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpmulhw %ymm7, %ymm3, %ymm3
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mulhw_v64i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
; AVX512F-NEXT: vpmulhw %ymm4, %ymm5, %ymm4
; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm2
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
; AVX512F-NEXT: vpmulhw %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpmulhw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mulhw_v64i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmulhw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpmulhw %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: retq
%a1 = sext <64 x i16> %a to <64 x i32>
%b1 = sext <64 x i16> %b to <64 x i32>
%c = mul <64 x i32> %a1, %b1
%d = lshr <64 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%e = trunc <64 x i32> %d to <64 x i16>
ret <64 x i16> %e
}