Sanjay Patel ce97ce3a5d [VectorCombine] try to form a better extractelement
Extracting to the same index that we are going to insert back into
allows forming select ("blend") shuffles and enables further transforms.

Admittedly, this is a quick-fix for a more general problem that I'm
hoping to solve by adding transforms for patterns that start with an
insertelement.

But this might resolve some regressions known to be caused by the
extract-extract transform (although I have not gotten more details on
those yet).

In the motivating case from PR34724:
https://bugs.llvm.org/show_bug.cgi?id=34724

The combination of subsequent instcombine and codegen transforms gets us this improvement:

  vmovshdup	%xmm0, %xmm2    ## xmm2 = xmm0[1,1,3,3]
  vhaddps	%xmm1, %xmm1, %xmm4
  vmovshdup	%xmm1, %xmm3    ## xmm3 = xmm1[1,1,3,3]
  vaddps	%xmm0, %xmm2, %xmm0
  vaddps	%xmm1, %xmm3, %xmm1
  vshufps	$200, %xmm4, %xmm0, %xmm0 ## xmm0 = xmm0[0,2],xmm4[0,3]
  vinsertps	$177, %xmm1, %xmm0, %xmm0 ## xmm0 = zero,xmm0[1,2],xmm1[2]

  -->

  vmovshdup	%xmm0, %xmm2    ## xmm2 = xmm0[1,1,3,3]
  vhaddps	%xmm1, %xmm1, %xmm1
  vaddps	%xmm0, %xmm2, %xmm0
  vshufps	$200, %xmm1, %xmm0, %xmm0 ## xmm0 = xmm0[0,2],xmm1[0,3]

Differential Revision: https://reviews.llvm.org/D76623
2020-04-03 13:55:13 -04:00

191 lines
7.6 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX
define i1 @cmp_v4i32(<4 x float> %arg, <4 x float> %arg1) {
; CHECK-LABEL: @cmp_v4i32(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[T:%.*]] = bitcast <4 x float> [[ARG:%.*]] to <4 x i32>
; CHECK-NEXT: [[T3:%.*]] = bitcast <4 x float> [[ARG1:%.*]] to <4 x i32>
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i32> [[T]], [[T3]]
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
; CHECK-NEXT: br i1 [[TMP1]], label [[BB6:%.*]], label [[BB18:%.*]]
; CHECK: bb6:
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[T]], [[T3]]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
; CHECK-NEXT: br i1 [[TMP3]], label [[BB10:%.*]], label [[BB18]]
; CHECK: bb10:
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[T]], [[T3]]
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2
; CHECK-NEXT: br i1 [[TMP5]], label [[BB14:%.*]], label [[BB18]]
; CHECK: bb14:
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i32> [[T]], [[T3]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 3
; CHECK-NEXT: br label [[BB18]]
; CHECK: bb18:
; CHECK-NEXT: [[T19:%.*]] = phi i1 [ false, [[BB10]] ], [ false, [[BB6]] ], [ false, [[BB:%.*]] ], [ [[TMP7]], [[BB14]] ]
; CHECK-NEXT: ret i1 [[T19]]
;
bb:
%t = bitcast <4 x float> %arg to <4 x i32>
%t2 = extractelement <4 x i32> %t, i32 0
%t3 = bitcast <4 x float> %arg1 to <4 x i32>
%t4 = extractelement <4 x i32> %t3, i32 0
%t5 = icmp eq i32 %t2, %t4
br i1 %t5, label %bb6, label %bb18
bb6:
%t7 = extractelement <4 x i32> %t, i32 1
%t8 = extractelement <4 x i32> %t3, i32 1
%t9 = icmp eq i32 %t7, %t8
br i1 %t9, label %bb10, label %bb18
bb10:
%t11 = extractelement <4 x i32> %t, i32 2
%t12 = extractelement <4 x i32> %t3, i32 2
%t13 = icmp eq i32 %t11, %t12
br i1 %t13, label %bb14, label %bb18
bb14:
%t15 = extractelement <4 x i32> %t, i32 3
%t16 = extractelement <4 x i32> %t3, i32 3
%t17 = icmp eq i32 %t15, %t16
br label %bb18
bb18:
%t19 = phi i1 [ false, %bb10 ], [ false, %bb6 ], [ false, %bb ], [ %t17, %bb14 ]
ret i1 %t19
}
define i32 @cmp_v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z) {
; SSE-LABEL: @cmp_v2f64(
; SSE-NEXT: entry:
; SSE-NEXT: [[X1:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
; SSE-NEXT: [[Y1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 1
; SSE-NEXT: [[CMP1:%.*]] = fcmp oeq double [[X1]], [[Y1]]
; SSE-NEXT: br i1 [[CMP1]], label [[T:%.*]], label [[F:%.*]]
; SSE: t:
; SSE-NEXT: [[Z1:%.*]] = extractelement <2 x double> [[Z:%.*]], i32 1
; SSE-NEXT: [[CMP2:%.*]] = fcmp ogt double [[Y1]], [[Z1]]
; SSE-NEXT: [[E:%.*]] = select i1 [[CMP2]], i32 42, i32 99
; SSE-NEXT: ret i32 [[E]]
; SSE: f:
; SSE-NEXT: ret i32 0
;
; AVX-LABEL: @cmp_v2f64(
; AVX-NEXT: entry:
; AVX-NEXT: [[TMP0:%.*]] = fcmp oeq <2 x double> [[X:%.*]], [[Y:%.*]]
; AVX-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
; AVX-NEXT: br i1 [[TMP1]], label [[T:%.*]], label [[F:%.*]]
; AVX: t:
; AVX-NEXT: [[TMP2:%.*]] = fcmp ogt <2 x double> [[Y]], [[Z:%.*]]
; AVX-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
; AVX-NEXT: [[E:%.*]] = select i1 [[TMP3]], i32 42, i32 99
; AVX-NEXT: ret i32 [[E]]
; AVX: f:
; AVX-NEXT: ret i32 0
;
entry:
%x1 = extractelement <2 x double> %x, i32 1
%y1 = extractelement <2 x double> %y, i32 1
%cmp1 = fcmp oeq double %x1, %y1
br i1 %cmp1, label %t, label %f
t:
%z1 = extractelement <2 x double> %z, i32 1
%cmp2 = fcmp ogt double %y1, %z1
%e = select i1 %cmp2, i32 42, i32 99
ret i32 %e
f:
ret i32 0
}
define i1 @cmp01_v2f64(<2 x double> %x, <2 x double> %y) {
; SSE-LABEL: @cmp01_v2f64(
; SSE-NEXT: [[X0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 0
; SSE-NEXT: [[Y1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 1
; SSE-NEXT: [[CMP:%.*]] = fcmp oge double [[X0]], [[Y1]]
; SSE-NEXT: ret i1 [[CMP]]
;
; AVX-LABEL: @cmp01_v2f64(
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
; AVX-NEXT: [[TMP2:%.*]] = fcmp oge <2 x double> [[X:%.*]], [[TMP1]]
; AVX-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
; AVX-NEXT: ret i1 [[TMP3]]
;
%x0 = extractelement <2 x double> %x, i32 0
%y1 = extractelement <2 x double> %y, i32 1
%cmp = fcmp oge double %x0, %y1
ret i1 %cmp
}
define i1 @cmp10_v2f64(<2 x double> %x, <2 x double> %y) {
; SSE-LABEL: @cmp10_v2f64(
; SSE-NEXT: [[X1:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
; SSE-NEXT: [[Y0:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 0
; SSE-NEXT: [[CMP:%.*]] = fcmp ule double [[X1]], [[Y0]]
; SSE-NEXT: ret i1 [[CMP]]
;
; AVX-LABEL: @cmp10_v2f64(
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[X:%.*]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
; AVX-NEXT: [[TMP2:%.*]] = fcmp ule <2 x double> [[TMP1]], [[Y:%.*]]
; AVX-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0
; AVX-NEXT: ret i1 [[TMP3]]
;
%x1 = extractelement <2 x double> %x, i32 1
%y0 = extractelement <2 x double> %y, i32 0
%cmp = fcmp ule double %x1, %y0
ret i1 %cmp
}
define i1 @cmp12_v4i32(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: @cmp12_v4i32(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X:%.*]], [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
; CHECK-NEXT: ret i1 [[TMP3]]
;
%x1 = extractelement <4 x i32> %x, i32 1
%y2 = extractelement <4 x i32> %y, i32 2
%cmp = icmp sgt i32 %x1, %y2
ret i1 %cmp
}
define <4 x i1> @ins_fcmp_ext_ext(<4 x float> %a, <4 x i1> %b) {
; SSE-LABEL: @ins_fcmp_ext_ext(
; SSE-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 1
; SSE-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
; SSE-NEXT: [[A21:%.*]] = fcmp ugt float [[A2]], [[A1]]
; SSE-NEXT: [[R:%.*]] = insertelement <4 x i1> [[B:%.*]], i1 [[A21]], i32 2
; SSE-NEXT: ret <4 x i1> [[R]]
;
; AVX-LABEL: @ins_fcmp_ext_ext(
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
; AVX-NEXT: [[TMP2:%.*]] = fcmp ugt <4 x float> [[A]], [[TMP1]]
; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
; AVX-NEXT: [[R:%.*]] = insertelement <4 x i1> [[B:%.*]], i1 [[TMP3]], i32 2
; AVX-NEXT: ret <4 x i1> [[R]]
;
%a1 = extractelement <4 x float> %a, i32 1
%a2 = extractelement <4 x float> %a, i32 2
%a21 = fcmp ugt float %a2, %a1
%r = insertelement <4 x i1> %b, i1 %a21, i32 2
ret <4 x i1> %r
}
define <4 x i1> @ins_icmp_ext_ext(<4 x i32> %a, <4 x i1> %b) {
; CHECK-LABEL: @ins_icmp_ext_ext(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <4 x i32> [[TMP1]], [[A]]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i64 3
; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i1> [[B:%.*]], i1 [[TMP3]], i32 3
; CHECK-NEXT: ret <4 x i1> [[R]]
;
%a3 = extractelement <4 x i32> %a, i32 3
%a2 = extractelement <4 x i32> %a, i32 2
%a23 = icmp ule i32 %a2, %a3
%r = insertelement <4 x i1> %b, i1 %a23, i32 3
ret <4 x i1> %r
}