Alexey Bataev d65cc85977 [SLP]Do not schedule instructions with constants/argument/phi operands and external users.
There is no need to schedule entry nodes in which none of the instructions
read or write memory and whose operands are all constants, arguments, phis,
or instructions from other blocks, or whose users are all phis or
instructions from other blocks.
The resulting vector instructions can be placed at
the beginning of the basic block without scheduling (if the operands do
not need to be scheduled) or at the end of the block (if the users are
outside of the block).
This may save some compile time and scheduling resources.

Differential Revision: https://reviews.llvm.org/D121121
2022-03-17 11:03:45 -07:00

544 lines
30 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s --check-prefix=GENERIC
; RUN: opt -S -mcpu=kryo -slp-vectorizer -dce -instcombine < %s | FileCheck %s --check-prefix=KRYO
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"
; These tests check that we vectorize the index calculations in the
; gather-reduce pattern shown below. We check cases having i32 and i64
; subtraction.
;
; int gather_reduce_8x16(short *a, short *b, short *g, int n) {
; int sum = 0;
; for (int i = 0; i < n ; ++i) {
; sum += g[*a++ - b[0]]; sum += g[*a++ - b[1]];
; sum += g[*a++ - b[2]]; sum += g[*a++ - b[3]];
; sum += g[*a++ - b[4]]; sum += g[*a++ - b[5]];
; sum += g[*a++ - b[6]]; sum += g[*a++ - b[7]];
; }
; return sum;
; }
; i32 variant of the gather-reduce loop. Each iteration of for.body reads eight
; consecutive i16 values starting at the %a pointer phi and eight from %b
; (offsets 0..7), zero-extends both to i32, subtracts them, and uses each
; difference as an index into %g; the loaded i16 values are zero-extended and
; added into the running sum. The assertions below expect, with and without
; -mcpu=kryo, that the %a/%b loads, the zexts and the subs become 8-wide
; vector operations, while each index is extracted per lane and the loads from
; %g and the additions stay scalar.
define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
; GENERIC-LABEL: @gather_reduce_8x16_i32(
; GENERIC-NEXT: entry:
; GENERIC-NEXT: [[CMP_99:%.*]] = icmp sgt i32 [[N:%.*]], 0
; GENERIC-NEXT: br i1 [[CMP_99]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; GENERIC: for.body.preheader:
; GENERIC-NEXT: br label [[FOR_BODY:%.*]]
; GENERIC: for.cond.cleanup.loopexit:
; GENERIC-NEXT: br label [[FOR_COND_CLEANUP]]
; GENERIC: for.cond.cleanup:
; GENERIC-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
; GENERIC-NEXT: ret i32 [[SUM_0_LCSSA]]
; GENERIC: for.body:
; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; GENERIC-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; GENERIC-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
; GENERIC-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
; GENERIC-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
; GENERIC-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
; GENERIC-NEXT: [[TMP3:%.*]] = bitcast i16* [[B:%.*]] to <8 x i16>*
; GENERIC-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2
; GENERIC-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
; GENERIC-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
; GENERIC-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i64 0
; GENERIC-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
; GENERIC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[G:%.*]], i64 [[TMP8]]
; GENERIC-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
; GENERIC-NEXT: [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
; GENERIC-NEXT: [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
; GENERIC-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i64 1
; GENERIC-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
; GENERIC-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP11]]
; GENERIC-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
; GENERIC-NEXT: [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
; GENERIC-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
; GENERIC-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i64 2
; GENERIC-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
; GENERIC-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP14]]
; GENERIC-NEXT: [[TMP15:%.*]] = load i16, i16* [[ARRAYIDX19]], align 2
; GENERIC-NEXT: [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
; GENERIC-NEXT: [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
; GENERIC-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i64 3
; GENERIC-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
; GENERIC-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP17]]
; GENERIC-NEXT: [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX28]], align 2
; GENERIC-NEXT: [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
; GENERIC-NEXT: [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
; GENERIC-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i64 4
; GENERIC-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
; GENERIC-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP20]]
; GENERIC-NEXT: [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX37]], align 2
; GENERIC-NEXT: [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
; GENERIC-NEXT: [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
; GENERIC-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i64 5
; GENERIC-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
; GENERIC-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP23]]
; GENERIC-NEXT: [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX46]], align 2
; GENERIC-NEXT: [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
; GENERIC-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
; GENERIC-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i64 6
; GENERIC-NEXT: [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
; GENERIC-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP26]]
; GENERIC-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
; GENERIC-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
; GENERIC-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
; GENERIC-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
; GENERIC-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
; GENERIC-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
; GENERIC-NEXT: [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2
; GENERIC-NEXT: [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
; GENERIC-NEXT: [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
; GENERIC-NEXT: [[INC]] = add nuw nsw i32 [[I_0103]], 1
; GENERIC-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
; GENERIC-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
;
; KRYO-LABEL: @gather_reduce_8x16_i32(
; KRYO-NEXT: entry:
; KRYO-NEXT: [[CMP_99:%.*]] = icmp sgt i32 [[N:%.*]], 0
; KRYO-NEXT: br i1 [[CMP_99]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; KRYO: for.body.preheader:
; KRYO-NEXT: br label [[FOR_BODY:%.*]]
; KRYO: for.cond.cleanup.loopexit:
; KRYO-NEXT: br label [[FOR_COND_CLEANUP]]
; KRYO: for.cond.cleanup:
; KRYO-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
; KRYO-NEXT: ret i32 [[SUM_0_LCSSA]]
; KRYO: for.body:
; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; KRYO-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; KRYO-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
; KRYO-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
; KRYO-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
; KRYO-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
; KRYO-NEXT: [[TMP3:%.*]] = bitcast i16* [[B:%.*]] to <8 x i16>*
; KRYO-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2
; KRYO-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
; KRYO-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
; KRYO-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i64 0
; KRYO-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
; KRYO-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[G:%.*]], i64 [[TMP8]]
; KRYO-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
; KRYO-NEXT: [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
; KRYO-NEXT: [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
; KRYO-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i64 1
; KRYO-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
; KRYO-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP11]]
; KRYO-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
; KRYO-NEXT: [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
; KRYO-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
; KRYO-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i64 2
; KRYO-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
; KRYO-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP14]]
; KRYO-NEXT: [[TMP15:%.*]] = load i16, i16* [[ARRAYIDX19]], align 2
; KRYO-NEXT: [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
; KRYO-NEXT: [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
; KRYO-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i64 3
; KRYO-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
; KRYO-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP17]]
; KRYO-NEXT: [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX28]], align 2
; KRYO-NEXT: [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
; KRYO-NEXT: [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
; KRYO-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i64 4
; KRYO-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
; KRYO-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP20]]
; KRYO-NEXT: [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX37]], align 2
; KRYO-NEXT: [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
; KRYO-NEXT: [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
; KRYO-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i64 5
; KRYO-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
; KRYO-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP23]]
; KRYO-NEXT: [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX46]], align 2
; KRYO-NEXT: [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
; KRYO-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
; KRYO-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i64 6
; KRYO-NEXT: [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
; KRYO-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP26]]
; KRYO-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
; KRYO-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
; KRYO-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
; KRYO-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
; KRYO-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
; KRYO-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
; KRYO-NEXT: [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2
; KRYO-NEXT: [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
; KRYO-NEXT: [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
; KRYO-NEXT: [[INC]] = add nuw nsw i32 [[I_0103]], 1
; KRYO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
; KRYO-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
;
; Skip the loop when %n <= 0; the phi in for.cond.cleanup then yields 0.
entry:
%cmp.99 = icmp sgt i32 %n, 0
br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader:
br label %for.body
for.cond.cleanup.loopexit:
br label %for.cond.cleanup
for.cond.cleanup:
%sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]
ret i32 %sum.0.lcssa
; Loop-carried values: iteration counter, running sum, and current %a pointer.
for.body:
%i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
%sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]
%a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]
; The eight groups below repeat one pattern with increasing offsets: form the
; next lane's %a and %b addresses, load and zero-extend the current lane's
; pair to i32, subtract, load from %g at that i32 index, and add into the sum.
%incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1
%0 = load i16, i16* %a.addr.0101, align 2
%conv = zext i16 %0 to i32
%incdec.ptr1 = getelementptr inbounds i16, i16* %b, i64 1
%1 = load i16, i16* %b, align 2
%conv2 = zext i16 %1 to i32
%sub = sub nsw i32 %conv, %conv2
%arrayidx = getelementptr inbounds i16, i16* %g, i32 %sub
%2 = load i16, i16* %arrayidx, align 2
%conv3 = zext i16 %2 to i32
%add = add nsw i32 %conv3, %sum.0102
%incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2
%3 = load i16, i16* %incdec.ptr, align 2
%conv5 = zext i16 %3 to i32
%incdec.ptr6 = getelementptr inbounds i16, i16* %b, i64 2
%4 = load i16, i16* %incdec.ptr1, align 2
%conv7 = zext i16 %4 to i32
%sub8 = sub nsw i32 %conv5, %conv7
%arrayidx10 = getelementptr inbounds i16, i16* %g, i32 %sub8
%5 = load i16, i16* %arrayidx10, align 2
%conv11 = zext i16 %5 to i32
%add12 = add nsw i32 %add, %conv11
%incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3
%6 = load i16, i16* %incdec.ptr4, align 2
%conv14 = zext i16 %6 to i32
%incdec.ptr15 = getelementptr inbounds i16, i16* %b, i64 3
%7 = load i16, i16* %incdec.ptr6, align 2
%conv16 = zext i16 %7 to i32
%sub17 = sub nsw i32 %conv14, %conv16
%arrayidx19 = getelementptr inbounds i16, i16* %g, i32 %sub17
%8 = load i16, i16* %arrayidx19, align 2
%conv20 = zext i16 %8 to i32
%add21 = add nsw i32 %add12, %conv20
%incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4
%9 = load i16, i16* %incdec.ptr13, align 2
%conv23 = zext i16 %9 to i32
%incdec.ptr24 = getelementptr inbounds i16, i16* %b, i64 4
%10 = load i16, i16* %incdec.ptr15, align 2
%conv25 = zext i16 %10 to i32
%sub26 = sub nsw i32 %conv23, %conv25
%arrayidx28 = getelementptr inbounds i16, i16* %g, i32 %sub26
%11 = load i16, i16* %arrayidx28, align 2
%conv29 = zext i16 %11 to i32
%add30 = add nsw i32 %add21, %conv29
%incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5
%12 = load i16, i16* %incdec.ptr22, align 2
%conv32 = zext i16 %12 to i32
%incdec.ptr33 = getelementptr inbounds i16, i16* %b, i64 5
%13 = load i16, i16* %incdec.ptr24, align 2
%conv34 = zext i16 %13 to i32
%sub35 = sub nsw i32 %conv32, %conv34
%arrayidx37 = getelementptr inbounds i16, i16* %g, i32 %sub35
%14 = load i16, i16* %arrayidx37, align 2
%conv38 = zext i16 %14 to i32
%add39 = add nsw i32 %add30, %conv38
%incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6
%15 = load i16, i16* %incdec.ptr31, align 2
%conv41 = zext i16 %15 to i32
%incdec.ptr42 = getelementptr inbounds i16, i16* %b, i64 6
%16 = load i16, i16* %incdec.ptr33, align 2
%conv43 = zext i16 %16 to i32
%sub44 = sub nsw i32 %conv41, %conv43
%arrayidx46 = getelementptr inbounds i16, i16* %g, i32 %sub44
%17 = load i16, i16* %arrayidx46, align 2
%conv47 = zext i16 %17 to i32
%add48 = add nsw i32 %add39, %conv47
%incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7
%18 = load i16, i16* %incdec.ptr40, align 2
%conv50 = zext i16 %18 to i32
%incdec.ptr51 = getelementptr inbounds i16, i16* %b, i64 7
%19 = load i16, i16* %incdec.ptr42, align 2
%conv52 = zext i16 %19 to i32
%sub53 = sub nsw i32 %conv50, %conv52
%arrayidx55 = getelementptr inbounds i16, i16* %g, i32 %sub53
%20 = load i16, i16* %arrayidx55, align 2
%conv56 = zext i16 %20 to i32
%add57 = add nsw i32 %add48, %conv56
; Last group: %incdec.ptr58 (current %a pointer + 8 elements) is fed back
; through the pointer phi for the next iteration; %b is never advanced.
%incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8
%21 = load i16, i16* %incdec.ptr49, align 2
%conv59 = zext i16 %21 to i32
%22 = load i16, i16* %incdec.ptr51, align 2
%conv61 = zext i16 %22 to i32
%sub62 = sub nsw i32 %conv59, %conv61
%arrayidx64 = getelementptr inbounds i16, i16* %g, i32 %sub62
%23 = load i16, i16* %arrayidx64, align 2
%conv65 = zext i16 %23 to i32
%add66 = add nsw i32 %add57, %conv65
; Loop control: leave the loop once the incremented counter equals %n.
%inc = add nuw nsw i32 %i.0103, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}
; i64 variant of the gather-reduce loop. Each iteration of for.body reads eight
; consecutive i16 values starting at the %a pointer phi and eight from %b
; (offsets 0..7), zero-extends both to i64, subtracts them in i64, and uses
; each difference as an index into %g; the loaded i16 values are zero-extended
; to i32 and added into the running sum. Although the input arithmetic is i64,
; the assertions below expect, with and without -mcpu=kryo, that the zexts and
; the subtraction are performed on 8-wide i32 vectors, with each extracted lane
; sign-extended to i64 before the scalar load from %g.
define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
; GENERIC-LABEL: @gather_reduce_8x16_i64(
; GENERIC-NEXT: entry:
; GENERIC-NEXT: [[CMP_99:%.*]] = icmp sgt i32 [[N:%.*]], 0
; GENERIC-NEXT: br i1 [[CMP_99]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; GENERIC: for.body.preheader:
; GENERIC-NEXT: br label [[FOR_BODY:%.*]]
; GENERIC: for.cond.cleanup.loopexit:
; GENERIC-NEXT: br label [[FOR_COND_CLEANUP]]
; GENERIC: for.cond.cleanup:
; GENERIC-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
; GENERIC-NEXT: ret i32 [[SUM_0_LCSSA]]
; GENERIC: for.body:
; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; GENERIC-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; GENERIC-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
; GENERIC-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
; GENERIC-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
; GENERIC-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
; GENERIC-NEXT: [[TMP3:%.*]] = bitcast i16* [[B:%.*]] to <8 x i16>*
; GENERIC-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2
; GENERIC-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
; GENERIC-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
; GENERIC-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i64 0
; GENERIC-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
; GENERIC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[G:%.*]], i64 [[TMP8]]
; GENERIC-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
; GENERIC-NEXT: [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
; GENERIC-NEXT: [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
; GENERIC-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i64 1
; GENERIC-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
; GENERIC-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP11]]
; GENERIC-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
; GENERIC-NEXT: [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
; GENERIC-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
; GENERIC-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i64 2
; GENERIC-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
; GENERIC-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP14]]
; GENERIC-NEXT: [[TMP15:%.*]] = load i16, i16* [[ARRAYIDX19]], align 2
; GENERIC-NEXT: [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
; GENERIC-NEXT: [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
; GENERIC-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i64 3
; GENERIC-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
; GENERIC-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP17]]
; GENERIC-NEXT: [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX28]], align 2
; GENERIC-NEXT: [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
; GENERIC-NEXT: [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
; GENERIC-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i64 4
; GENERIC-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
; GENERIC-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP20]]
; GENERIC-NEXT: [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX37]], align 2
; GENERIC-NEXT: [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
; GENERIC-NEXT: [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
; GENERIC-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i64 5
; GENERIC-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
; GENERIC-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP23]]
; GENERIC-NEXT: [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX46]], align 2
; GENERIC-NEXT: [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
; GENERIC-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
; GENERIC-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i64 6
; GENERIC-NEXT: [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
; GENERIC-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP26]]
; GENERIC-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
; GENERIC-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
; GENERIC-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
; GENERIC-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
; GENERIC-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
; GENERIC-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
; GENERIC-NEXT: [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2
; GENERIC-NEXT: [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
; GENERIC-NEXT: [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
; GENERIC-NEXT: [[INC]] = add nuw nsw i32 [[I_0103]], 1
; GENERIC-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
; GENERIC-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
;
; KRYO-LABEL: @gather_reduce_8x16_i64(
; KRYO-NEXT: entry:
; KRYO-NEXT: [[CMP_99:%.*]] = icmp sgt i32 [[N:%.*]], 0
; KRYO-NEXT: br i1 [[CMP_99]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; KRYO: for.body.preheader:
; KRYO-NEXT: br label [[FOR_BODY:%.*]]
; KRYO: for.cond.cleanup.loopexit:
; KRYO-NEXT: br label [[FOR_COND_CLEANUP]]
; KRYO: for.cond.cleanup:
; KRYO-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
; KRYO-NEXT: ret i32 [[SUM_0_LCSSA]]
; KRYO: for.body:
; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; KRYO-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; KRYO-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
; KRYO-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
; KRYO-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
; KRYO-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
; KRYO-NEXT: [[TMP3:%.*]] = bitcast i16* [[B:%.*]] to <8 x i16>*
; KRYO-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2
; KRYO-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
; KRYO-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
; KRYO-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i64 0
; KRYO-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
; KRYO-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[G:%.*]], i64 [[TMP8]]
; KRYO-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
; KRYO-NEXT: [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
; KRYO-NEXT: [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
; KRYO-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i64 1
; KRYO-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
; KRYO-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP11]]
; KRYO-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
; KRYO-NEXT: [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
; KRYO-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
; KRYO-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i64 2
; KRYO-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
; KRYO-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP14]]
; KRYO-NEXT: [[TMP15:%.*]] = load i16, i16* [[ARRAYIDX19]], align 2
; KRYO-NEXT: [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
; KRYO-NEXT: [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
; KRYO-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i64 3
; KRYO-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
; KRYO-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP17]]
; KRYO-NEXT: [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX28]], align 2
; KRYO-NEXT: [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
; KRYO-NEXT: [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
; KRYO-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i64 4
; KRYO-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
; KRYO-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP20]]
; KRYO-NEXT: [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX37]], align 2
; KRYO-NEXT: [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
; KRYO-NEXT: [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
; KRYO-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i64 5
; KRYO-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
; KRYO-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP23]]
; KRYO-NEXT: [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX46]], align 2
; KRYO-NEXT: [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
; KRYO-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
; KRYO-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i64 6
; KRYO-NEXT: [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
; KRYO-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP26]]
; KRYO-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
; KRYO-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
; KRYO-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
; KRYO-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
; KRYO-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
; KRYO-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
; KRYO-NEXT: [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2
; KRYO-NEXT: [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
; KRYO-NEXT: [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
; KRYO-NEXT: [[INC]] = add nuw nsw i32 [[I_0103]], 1
; KRYO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
; KRYO-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
;
; Skip the loop when %n <= 0; the phi in for.cond.cleanup then yields 0.
entry:
%cmp.99 = icmp sgt i32 %n, 0
br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader:
br label %for.body
for.cond.cleanup.loopexit:
br label %for.cond.cleanup
for.cond.cleanup:
%sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]
ret i32 %sum.0.lcssa
; Loop-carried values: iteration counter, running sum, and current %a pointer.
for.body:
%i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
%sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]
%a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]
; The eight groups below repeat one pattern with increasing offsets: form the
; next lane's %a and %b addresses, load and zero-extend the current lane's
; pair to i64, subtract, load from %g at that i64 index, and add into the sum.
%incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1
%0 = load i16, i16* %a.addr.0101, align 2
%conv = zext i16 %0 to i64
%incdec.ptr1 = getelementptr inbounds i16, i16* %b, i64 1
%1 = load i16, i16* %b, align 2
%conv2 = zext i16 %1 to i64
%sub = sub nsw i64 %conv, %conv2
%arrayidx = getelementptr inbounds i16, i16* %g, i64 %sub
%2 = load i16, i16* %arrayidx, align 2
%conv3 = zext i16 %2 to i32
%add = add nsw i32 %conv3, %sum.0102
%incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2
%3 = load i16, i16* %incdec.ptr, align 2
%conv5 = zext i16 %3 to i64
%incdec.ptr6 = getelementptr inbounds i16, i16* %b, i64 2
%4 = load i16, i16* %incdec.ptr1, align 2
%conv7 = zext i16 %4 to i64
%sub8 = sub nsw i64 %conv5, %conv7
%arrayidx10 = getelementptr inbounds i16, i16* %g, i64 %sub8
%5 = load i16, i16* %arrayidx10, align 2
%conv11 = zext i16 %5 to i32
%add12 = add nsw i32 %add, %conv11
%incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3
%6 = load i16, i16* %incdec.ptr4, align 2
%conv14 = zext i16 %6 to i64
%incdec.ptr15 = getelementptr inbounds i16, i16* %b, i64 3
%7 = load i16, i16* %incdec.ptr6, align 2
%conv16 = zext i16 %7 to i64
%sub17 = sub nsw i64 %conv14, %conv16
%arrayidx19 = getelementptr inbounds i16, i16* %g, i64 %sub17
%8 = load i16, i16* %arrayidx19, align 2
%conv20 = zext i16 %8 to i32
%add21 = add nsw i32 %add12, %conv20
%incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4
%9 = load i16, i16* %incdec.ptr13, align 2
%conv23 = zext i16 %9 to i64
%incdec.ptr24 = getelementptr inbounds i16, i16* %b, i64 4
%10 = load i16, i16* %incdec.ptr15, align 2
%conv25 = zext i16 %10 to i64
%sub26 = sub nsw i64 %conv23, %conv25
%arrayidx28 = getelementptr inbounds i16, i16* %g, i64 %sub26
%11 = load i16, i16* %arrayidx28, align 2
%conv29 = zext i16 %11 to i32
%add30 = add nsw i32 %add21, %conv29
%incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5
%12 = load i16, i16* %incdec.ptr22, align 2
%conv32 = zext i16 %12 to i64
%incdec.ptr33 = getelementptr inbounds i16, i16* %b, i64 5
%13 = load i16, i16* %incdec.ptr24, align 2
%conv34 = zext i16 %13 to i64
%sub35 = sub nsw i64 %conv32, %conv34
%arrayidx37 = getelementptr inbounds i16, i16* %g, i64 %sub35
%14 = load i16, i16* %arrayidx37, align 2
%conv38 = zext i16 %14 to i32
%add39 = add nsw i32 %add30, %conv38
%incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6
%15 = load i16, i16* %incdec.ptr31, align 2
%conv41 = zext i16 %15 to i64
%incdec.ptr42 = getelementptr inbounds i16, i16* %b, i64 6
%16 = load i16, i16* %incdec.ptr33, align 2
%conv43 = zext i16 %16 to i64
%sub44 = sub nsw i64 %conv41, %conv43
%arrayidx46 = getelementptr inbounds i16, i16* %g, i64 %sub44
%17 = load i16, i16* %arrayidx46, align 2
%conv47 = zext i16 %17 to i32
%add48 = add nsw i32 %add39, %conv47
%incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7
%18 = load i16, i16* %incdec.ptr40, align 2
%conv50 = zext i16 %18 to i64
%incdec.ptr51 = getelementptr inbounds i16, i16* %b, i64 7
%19 = load i16, i16* %incdec.ptr42, align 2
%conv52 = zext i16 %19 to i64
%sub53 = sub nsw i64 %conv50, %conv52
%arrayidx55 = getelementptr inbounds i16, i16* %g, i64 %sub53
%20 = load i16, i16* %arrayidx55, align 2
%conv56 = zext i16 %20 to i32
%add57 = add nsw i32 %add48, %conv56
; Last group: %incdec.ptr58 (current %a pointer + 8 elements) is fed back
; through the pointer phi for the next iteration; %b is never advanced.
%incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8
%21 = load i16, i16* %incdec.ptr49, align 2
%conv59 = zext i16 %21 to i64
%22 = load i16, i16* %incdec.ptr51, align 2
%conv61 = zext i16 %22 to i64
%sub62 = sub nsw i64 %conv59, %conv61
%arrayidx64 = getelementptr inbounds i16, i16* %g, i64 %sub62
%23 = load i16, i16* %arrayidx64, align 2
%conv65 = zext i16 %23 to i32
%add66 = add nsw i32 %add57, %conv65
; Loop control: leave the loop once the incremented counter equals %n.
%inc = add nuw nsw i32 %i.0103, 1
%exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}