Alexey Bataev 62af7252f1 [SLP] Fixed cost model for horizontal reduction.
Currently, when the cost of scalar operations is evaluated, the vector type
is used for the scalar operations. This patch fixes that issue and also fixes
the evaluation of the vector operations cost.
Several tests showed that the vector cost model is too optimistic. It
allowed vectorization of 8 or fewer add/fadd operations, even though the
scalar code is faster. In practice, vector code provides better performance
only for 16 or more operations.

Differential Revision: https://reviews.llvm.org/D26277

llvm-svn: 288398
2016-12-01 18:42:42 +00:00

368 lines
18 KiB
LLVM

; Cost-model test for vector reduction patterns, targeting x86_64-apple-darwin.
; Every invocation below runs the cost-model analysis with
; -costmodel-reduxcost=true and differs only in -mcpu and in the FileCheck
; prefix used to verify the output - core2 uses the default prefix, corei7
; uses SSE3, corei7-avx uses AVX and core-avx2 uses AVX2.
; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core2 -mtriple=x86_64-apple-darwin | FileCheck %s
; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=corei7 -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=SSE3
; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=corei7-avx -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=AVX
; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core-avx2 -mtriple=x86_64-apple-darwin | FileCheck %s --check-prefix=AVX2
; fadd reduction of a <4 x float> by repeated halving: lanes 2-3 are added
; onto lanes 0-1, then lane 1 is added onto lane 0, and lane 0 is extracted.
define fastcc float @reduction_cost_float(<4 x float> %rdx) {
%rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
%rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
; Check that we recognize the tree starting at the extractelement as a
; reduction.
; The label spells out the full function name so that it cannot also match
; the output printed for @reduction_cost_int.
; CHECK-LABEL: reduction_cost_float
; CHECK: cost of 9 {{.*}} extractelement
%r = extractelement <4 x float> %bin.rdx8, i32 0
ret float %r
}
; Integer add reduction of an <8 x i32> by repeated halving, in three steps:
; lanes 4-7 are added onto lanes 0-3, lanes 2-3 onto lanes 0-1, and lane 1
; onto lane 0. Lane 0 of the last add is extracted and returned.
define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) {
%rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef,
<8 x i32> <i32 4 , i32 5, i32 6, i32 7,
i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx = add <8 x i32> %rdx, %rdx.shuf
%rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef,
<8 x i32> <i32 2 , i32 3, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
%rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef,
<8 x i32> <i32 1 , i32 undef, i32 undef, i32 undef,
i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
; Expected cost of the extractelement for the default (core2) prefix and for
; the corei7-avx prefix; both are anchored to this function by a label.
; CHECK-LABEL: reduction_cost_int
; CHECK: cost of 11 {{.*}} extractelement
; AVX-LABEL: reduction_cost_int
; AVX: cost of 5 {{.*}} extractelement
%r = extractelement <8 x i32> %bin.rdx.3, i32 0
ret i32 %r
}
; Pairwise fadd reduction of a <4 x float>: the even lanes (0, 2) and the odd
; lanes (1, 3) are shuffled into separate vectors and added, then lane 0 and
; lane 1 of that sum are shuffled out and added. The extracted lane 0 is
; finally added to the scalar %f1.
define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
%rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
<4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
%rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
<4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
%bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
%rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
<4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
%rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
<4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
; The label is quoted so that it matches only the analysis header printed for
; this function and not the ones for @pairwise_hadd_assoc or
; @pairwise_hadd_skip_first, whose names begin with the same text.
; CHECK-LABEL: 'pairwise_hadd'
; CHECK: cost of 11 {{.*}} extractelement
%r = extractelement <4 x float> %bin.rdx.1, i32 0
%r2 = fadd float %r, %f1
ret float %r2
}
; Pairwise fadd reduction of a <4 x float> in which the first-level fadd takes
; the odd-lane shuffle as its left operand and the even-lane shuffle as its
; right operand. The second level adds lane 0 and lane 1 of that sum, and the
; extracted lane 0 is added to the scalar %f1.
define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
%rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
<4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
%rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
<4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
; Operands are in odd/even order here (shuf.0.1 first, then shuf.0.0).
%bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0
%rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
<4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
%rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
<4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
; Expected cost of the extractelement for the default (core2) prefix.
; CHECK-LABEL: pairwise_hadd_assoc
; CHECK: cost of 11 {{.*}} extractelement
%r = extractelement <4 x float> %bin.rdx.1, i32 0
%r2 = fadd float %r, %f1
ret float %r2
}
; Pairwise fadd reduction of a <4 x float> whose second level has no lane-0
; shuffle: the first-level sum %bin.rdx.0 is used directly as the left operand
; and only the lane-1 shuffle is materialized. The extracted lane 0 is added
; to the scalar %f1.
define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
%rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
<4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
%rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
<4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
%bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
%rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
<4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; Left operand is the unshuffled first-level sum.
%bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
; Expected cost of the extractelement for the default (core2) prefix.
; CHECK-LABEL: pairwise_hadd_skip_first
; CHECK: cost of 11 {{.*}} extractelement
%r = extractelement <4 x float> %bin.rdx.1, i32 0
%r2 = fadd float %r, %f1
ret float %r2
}
; Single-step fadd reduction of a <2 x double>: lane 1 is shuffled into lane 0
; and added onto the input, then lane 0 is extracted. %f1 is not referenced in
; the body.
define fastcc double @no_pairwise_reduction2double(<2 x double> %rdx, double %f1) {
%rdx.shuf = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
%bin.rdx = fadd <2 x double> %rdx, %rdx.shuf
; Expected cost of the extractelement for the three non-default prefixes. No
; label directive precedes these checks in this function.
; SSE3: cost of 2 {{.*}} extractelement
; AVX: cost of 2 {{.*}} extractelement
; AVX2: cost of 2 {{.*}} extractelement
%r = extractelement <2 x double> %bin.rdx, i32 0
ret double %r
}
; Two-step halving fadd reduction of a <4 x float>: lanes 2-3 are added onto
; lanes 0-1, then lane 1 onto lane 0, and lane 0 is extracted. %f1 is not
; referenced in the body.
define fastcc float @no_pairwise_reduction4float(<4 x float> %rdx, float %f1) {
%rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
%rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
; Expected cost of the extractelement for the three non-default prefixes. No
; label directive precedes these checks in this function.
; SSE3: cost of 4 {{.*}} extractelement
; AVX: cost of 3 {{.*}} extractelement
; AVX2: cost of 3 {{.*}} extractelement
%r = extractelement <4 x float> %bin.rdx8, i32 0
ret float %r
}
; Two-step halving fadd reduction of a <4 x double>: lanes 2-3 are added onto
; lanes 0-1, then lane 1 onto lane 0, and lane 0 is extracted. %f1 is not
; referenced in the body.
define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1) {
%rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%bin.rdx = fadd <4 x double> %rdx, %rdx.shuf
%rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7
; Expected cost of the extractelement; only the two AVX-family prefixes are
; checked here, and no label directive precedes them.
; AVX: cost of 3 {{.*}} extractelement
; AVX2: cost of 3 {{.*}} extractelement
%r = extractelement <4 x double> %bin.rdx8, i32 0
ret double %r
}
; Three-step halving fadd reduction of an <8 x float>: lanes 4-7 are added
; onto lanes 0-3, lanes 2-3 onto lanes 0-1, then lane 1 onto lane 0, and
; lane 0 is extracted. %f1 is not referenced in the body.
define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) {
%rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx4 = fadd <8 x float> %rdx, %rdx.shuf3
%rdx.shuf = shufflevector <8 x float> %bin.rdx4, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf
%rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7
; Expected cost of the extractelement; only the two AVX-family prefixes are
; checked here, and no label directive precedes them.
; AVX: cost of 4 {{.*}} extractelement
; AVX2: cost of 4 {{.*}} extractelement
%r = extractelement <8 x float> %bin.rdx8, i32 0
ret float %r
}
; Single-step integer add reduction of a <2 x i64>: lane 1 is shuffled into
; lane 0 and added onto the input, then lane 0 is extracted. %f1 is not
; referenced in the body.
define fastcc i64 @no_pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
%rdx.shuf = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%bin.rdx = add <2 x i64> %rdx, %rdx.shuf
; Expected cost of the extractelement for the three non-default prefixes. No
; label directive precedes these checks in this function.
; SSE3: cost of 2 {{.*}} extractelement
; AVX: cost of 1 {{.*}} extractelement
; AVX2: cost of 1 {{.*}} extractelement
%r = extractelement <2 x i64> %bin.rdx, i32 0
ret i64 %r
}
; Two-step halving integer add reduction of a <4 x i32>: lanes 2-3 are added
; onto lanes 0-1, then lane 1 onto lane 0, and lane 0 is extracted. %f1 is not
; referenced in the body.
define fastcc i32 @no_pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
%rdx.shuf = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%bin.rdx = add <4 x i32> %rdx, %rdx.shuf
%rdx.shuf7 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%bin.rdx8 = add <4 x i32> %bin.rdx, %rdx.shuf7
; Expected cost of the extractelement for the three non-default prefixes. No
; label directive precedes these checks in this function.
; SSE3: cost of 3 {{.*}} extractelement
; AVX: cost of 3 {{.*}} extractelement
; AVX2: cost of 3 {{.*}} extractelement
%r = extractelement <4 x i32> %bin.rdx8, i32 0
ret i32 %r
}
; Two-step halving integer add reduction of a <4 x i64>: lanes 2-3 are added
; onto lanes 0-1, then lane 1 onto lane 0, and lane 0 is extracted. %f1 is not
; referenced in the body.
define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
%rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%bin.rdx = add <4 x i64> %rdx, %rdx.shuf
%rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7
; Expected cost of the extractelement; only the two AVX-family prefixes are
; checked here, and no label directive precedes them.
; AVX: cost of 3 {{.*}} extractelement
; AVX2: cost of 3 {{.*}} extractelement
%r = extractelement <4 x i64> %bin.rdx8, i32 0
ret i64 %r
}
; Three-step halving integer add reduction of an <8 x i16>: lanes 4-7 are
; added onto lanes 0-3, lanes 2-3 onto lanes 0-1, then lane 1 onto lane 0, and
; lane 0 is extracted. %f1 is not referenced in the body.
define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
%rdx.shuf3 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx4 = add <8 x i16> %rdx, %rdx.shuf3
%rdx.shuf = shufflevector <8 x i16> %bin.rdx4, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
%rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
; Expected cost of the extractelement for the three non-default prefixes. No
; label directive precedes these checks in this function.
; SSE3: cost of 4 {{.*}} extractelement
; AVX: cost of 4 {{.*}} extractelement
; AVX2: cost of 4 {{.*}} extractelement
%r = extractelement <8 x i16> %bin.rdx8, i32 0
ret i16 %r
}
; Three-step halving integer add reduction of an <8 x i32>: lanes 4-7 are
; added onto lanes 0-3, lanes 2-3 onto lanes 0-1, then lane 1 onto lane 0, and
; lane 0 is extracted. %f1 is not referenced in the body.
define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
%rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx4 = add <8 x i32> %rdx, %rdx.shuf3
%rdx.shuf = shufflevector <8 x i32> %bin.rdx4, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf
%rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7
; Expected cost of the extractelement; only the two AVX-family prefixes are
; checked here, and no label directive precedes them.
; AVX: cost of 5 {{.*}} extractelement
; AVX2: cost of 5 {{.*}} extractelement
%r = extractelement <8 x i32> %bin.rdx8, i32 0
ret i32 %r
}
; Single-level pairwise fadd reduction of a <2 x double>: lane 0 and lane 1
; are each shuffled into lane 0 of their own vector, the two vectors are
; added, and lane 0 is extracted. %f1 is not referenced in the body.
define fastcc double @pairwise_reduction2double(<2 x double> %rdx, double %f1) {
%rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
%rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
%bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
; Expected cost of the extractelement for the three non-default prefixes. No
; label directive precedes these checks in this function.
; SSE3: cost of 2 {{.*}} extractelement
; AVX: cost of 2 {{.*}} extractelement
; AVX2: cost of 2 {{.*}} extractelement
%r = extractelement <2 x double> %bin.rdx8, i32 0
ret double %r
}
; Two-level pairwise fadd reduction of a <4 x float>: level one adds the even
; lanes (0, 2) to the odd lanes (1, 3); level two adds lane 0 to lane 1 of
; that sum. Lane 0 is then extracted. %f1 is not referenced in the body.
define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) {
%rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
%rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
%bin.rdx = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
%rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
%rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
; Expected cost of the extractelement for the three non-default prefixes. No
; label directive precedes these checks in this function.
; SSE3: cost of 4 {{.*}} extractelement
; AVX: cost of 4 {{.*}} extractelement
; AVX2: cost of 4 {{.*}} extractelement
%r = extractelement <4 x float> %bin.rdx8, i32 0
ret float %r
}
; Two-level pairwise fadd reduction of a <4 x double>: level one adds the even
; lanes (0, 2) to the odd lanes (1, 3); level two adds lane 0 to lane 1 of
; that sum. Lane 0 is then extracted. %f1 is not referenced in the body.
define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) {
%rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
%rdx.shuf.0.1 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
%bin.rdx = fadd <4 x double> %rdx.shuf.0.0, %rdx.shuf.0.1
%rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
%rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1
; Expected cost of the extractelement; only the two AVX-family prefixes are
; checked here, and no label directive precedes them.
; AVX: cost of 5 {{.*}} extractelement
; AVX2: cost of 5 {{.*}} extractelement
%r = extractelement <4 x double> %bin.rdx8, i32 0
ret double %r
}
; Three-level pairwise fadd reduction of an <8 x float>: level one adds the
; even lanes (0, 2, 4, 6) to the odd lanes (1, 3, 5, 7); level two adds lanes
; (0, 2) to lanes (1, 3) of that sum; level three adds lane 0 to lane 1.
; Lane 0 is then extracted. %f1 is not referenced in the body.
define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) {
%rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>
%rdx.shuf.0.1 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx = fadd <8 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
%rdx.shuf.1.0 = shufflevector <8 x float> %bin.rdx, <8 x float> undef,<8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%rdx.shuf.1.1 = shufflevector <8 x float> %bin.rdx, <8 x float> undef,<8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx8 = fadd <8 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
%rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef,<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef,<8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1
; Expected cost of the extractelement; only the two AVX-family prefixes are
; checked here, and no label directive precedes them.
; AVX: cost of 7 {{.*}} extractelement
; AVX2: cost of 7 {{.*}} extractelement
%r = extractelement <8 x float> %bin.rdx9, i32 0
ret float %r
}
; Single-level pairwise integer add reduction of a <2 x i64>: lane 0 and
; lane 1 are each shuffled into lane 0 of their own vector, the two vectors
; are added, and lane 0 is extracted. %f1 is not referenced in the body.
define fastcc i64 @pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) {
%rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 0, i32 undef>
%rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
; Expected cost of the extractelement for the three non-default prefixes. No
; label directive precedes these checks in this function.
; SSE3: cost of 2 {{.*}} extractelement
; AVX: cost of 1 {{.*}} extractelement
; AVX2: cost of 1 {{.*}} extractelement
%r = extractelement <2 x i64> %bin.rdx8, i32 0
ret i64 %r
}
; Two-level pairwise integer add reduction of a <4 x i32>: level one adds the
; even lanes (0, 2) to the odd lanes (1, 3); level two adds lane 0 to lane 1
; of that sum. Lane 0 is then extracted. %f1 is not referenced in the body.
define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) {
%rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
%rdx.shuf.0.1 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
%bin.rdx = add <4 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
%rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
%rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
; Expected cost of the extractelement for the three non-default prefixes. No
; label directive precedes these checks in this function.
; SSE3: cost of 3 {{.*}} extractelement
; AVX: cost of 3 {{.*}} extractelement
; AVX2: cost of 3 {{.*}} extractelement
%r = extractelement <4 x i32> %bin.rdx8, i32 0
ret i32 %r
}
; Two-level pairwise integer add reduction of a <4 x i64>: level one adds the
; even lanes (0, 2) to the odd lanes (1, 3); level two adds lane 0 to lane 1
; of that sum. Lane 0 is then extracted. %f1 is not referenced in the body.
define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) {
%rdx.shuf.0.0 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
%rdx.shuf.0.1 = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
%bin.rdx = add <4 x i64> %rdx.shuf.0.0, %rdx.shuf.0.1
%rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
%rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1
; Expected cost of the extractelement; only the two AVX-family prefixes are
; checked here, and no label directive precedes them.
; AVX: cost of 5 {{.*}} extractelement
; AVX2: cost of 5 {{.*}} extractelement
%r = extractelement <4 x i64> %bin.rdx8, i32 0
ret i64 %r
}
; Three-level pairwise integer add reduction of an <8 x i16>: level one adds
; the even lanes (0, 2, 4, 6) to the odd lanes (1, 3, 5, 7); level two adds
; lanes (0, 2) to lanes (1, 3) of that sum; level three adds lane 0 to lane 1.
; Lane 0 is then extracted. %f1 is not referenced in the body.
define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
%rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>
%rdx.shuf.0.1 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx = add <8 x i16> %rdx.shuf.0.0, %rdx.shuf.0.1
%rdx.shuf.1.0 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef,<8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%rdx.shuf.1.1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef,<8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx8 = add <8 x i16> %rdx.shuf.1.0, %rdx.shuf.1.1
%rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef,<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef,<8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
; Expected cost of the extractelement for the three non-default prefixes. No
; label directive precedes these checks in this function.
; SSE3: cost of 5 {{.*}} extractelement
; AVX: cost of 5 {{.*}} extractelement
; AVX2: cost of 5 {{.*}} extractelement
%r = extractelement <8 x i16> %bin.rdx9, i32 0
ret i16 %r
}
; Three-level pairwise integer add reduction of an <8 x i32>: level one adds
; the even lanes (0, 2, 4, 6) to the odd lanes (1, 3, 5, 7); level two adds
; lanes (0, 2) to lanes (1, 3) of that sum; level three adds lane 0 to lane 1.
; Lane 0 is then extracted. %f1 is not referenced in the body.
define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) {
%rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6,i32 undef, i32 undef, i32 undef, i32 undef>
%rdx.shuf.0.1 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7,i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx = add <8 x i32> %rdx.shuf.0.0, %rdx.shuf.0.1
%rdx.shuf.1.0 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef,<8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%rdx.shuf.1.1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef,<8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx8 = add <8 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1
%rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef,<8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef,<8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1
; Expected cost of the extractelement; only the two AVX-family prefixes are
; checked here, and no label directive precedes them.
; AVX: cost of 5 {{.*}} extractelement
; AVX2: cost of 5 {{.*}} extractelement
%r = extractelement <8 x i32> %bin.rdx9, i32 0
ret i32 %r
}