Alexey Bataev d65cc85977 [SLP]Do not schedule instructions with constants/argument/phi operands and external users.
No need to schedule entry nodes where all instructions are not memory
read/write instructions and their operands are either constants, or
arguments, or phis, or instructions from others blocks, or their users
are phis or from the other blocks.
The resulting vector instructions can be placed at
the beginning of the basic block without scheduling (if operands does
not need to be scheduled) or at the end of the block (if users are
outside of the block).
It may save some compile time and scheduling resources.

Differential Revision: https://reviews.llvm.org/D121121
2022-03-17 11:03:45 -07:00

164 lines
9.0 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -slp-vectorizer -S | FileCheck %s --check-prefix=DEFAULT
; RUN: opt < %s -slp-schedule-budget=0 -slp-min-tree-size=0 -slp-threshold=-30 -slp-vectorizer -S | FileCheck %s --check-prefix=GATHER
; RUN: opt < %s -slp-schedule-budget=0 -slp-threshold=-30 -slp-vectorizer -S | FileCheck %s --check-prefix=MAX-COST
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"
@a = common global [80 x i8] zeroinitializer, align 16
define void @PR28330(i32 %n) {
; DEFAULT-LABEL: @PR28330(
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1
; DEFAULT-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
; DEFAULT-NEXT: br label [[FOR_BODY:%.*]]
; DEFAULT: for.body:
; DEFAULT-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; DEFAULT-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
; DEFAULT-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
; DEFAULT-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], [[P17]]
; DEFAULT-NEXT: br label [[FOR_BODY]]
;
; GATHER-LABEL: @PR28330(
; GATHER-NEXT: entry:
; GATHER-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1
; GATHER-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
; GATHER-NEXT: br label [[FOR_BODY:%.*]]
; GATHER: for.body:
; GATHER-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; GATHER-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
; GATHER-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], [[P17]]
; GATHER-NEXT: br label [[FOR_BODY]]
;
; MAX-COST-LABEL: @PR28330(
; MAX-COST-NEXT: entry:
; MAX-COST-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1
; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
; MAX-COST-NEXT: br label [[FOR_BODY:%.*]]
; MAX-COST: for.body:
; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; MAX-COST-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
; MAX-COST-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
; MAX-COST-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], [[P17]]
; MAX-COST-NEXT: br label [[FOR_BODY]]
;
entry:
%p0 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1
%p1 = icmp eq i8 %p0, 0
%p2 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2
%p3 = icmp eq i8 %p2, 0
%p4 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
%p5 = icmp eq i8 %p4, 0
%p6 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
%p7 = icmp eq i8 %p6, 0
%p8 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
%p9 = icmp eq i8 %p8, 0
%p10 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
%p11 = icmp eq i8 %p10, 0
%p12 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
%p13 = icmp eq i8 %p12, 0
%p14 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
%p15 = icmp eq i8 %p14, 0
br label %for.body
for.body:
%p17 = phi i32 [ %p34, %for.body ], [ 0, %entry ]
%p19 = select i1 %p1, i32 -720, i32 -80
%p20 = add i32 %p17, %p19
%p21 = select i1 %p3, i32 -720, i32 -80
%p22 = add i32 %p20, %p21
%p23 = select i1 %p5, i32 -720, i32 -80
%p24 = add i32 %p22, %p23
%p25 = select i1 %p7, i32 -720, i32 -80
%p26 = add i32 %p24, %p25
%p27 = select i1 %p9, i32 -720, i32 -80
%p28 = add i32 %p26, %p27
%p29 = select i1 %p11, i32 -720, i32 -80
%p30 = add i32 %p28, %p29
%p31 = select i1 %p13, i32 -720, i32 -80
%p32 = add i32 %p30, %p31
%p33 = select i1 %p15, i32 -720, i32 -80
%p34 = add i32 %p32, %p33
br label %for.body
}
define void @PR32038(i32 %n) {
; DEFAULT-LABEL: @PR32038(
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1
; DEFAULT-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
; DEFAULT-NEXT: br label [[FOR_BODY:%.*]]
; DEFAULT: for.body:
; DEFAULT-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; DEFAULT-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
; DEFAULT-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
; DEFAULT-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], -5
; DEFAULT-NEXT: br label [[FOR_BODY]]
;
; GATHER-LABEL: @PR32038(
; GATHER-NEXT: entry:
; GATHER-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1
; GATHER-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
; GATHER-NEXT: br label [[FOR_BODY:%.*]]
; GATHER: for.body:
; GATHER-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; GATHER-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
; GATHER-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], -5
; GATHER-NEXT: br label [[FOR_BODY]]
;
; MAX-COST-LABEL: @PR32038(
; MAX-COST-NEXT: entry:
; MAX-COST-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1
; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
; MAX-COST-NEXT: br label [[FOR_BODY:%.*]]
; MAX-COST: for.body:
; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; MAX-COST-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
; MAX-COST-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
; MAX-COST-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], -5
; MAX-COST-NEXT: br label [[FOR_BODY]]
;
entry:
%p0 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1
%p1 = icmp eq i8 %p0, 0
%p2 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2
%p3 = icmp eq i8 %p2, 0
%p4 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
%p5 = icmp eq i8 %p4, 0
%p6 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
%p7 = icmp eq i8 %p6, 0
%p8 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
%p9 = icmp eq i8 %p8, 0
%p10 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
%p11 = icmp eq i8 %p10, 0
%p12 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
%p13 = icmp eq i8 %p12, 0
%p14 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
%p15 = icmp eq i8 %p14, 0
br label %for.body
for.body:
%p17 = phi i32 [ %p34, %for.body ], [ 0, %entry ]
%p19 = select i1 %p1, i32 -720, i32 -80
%p20 = add i32 -5, %p19
%p21 = select i1 %p3, i32 -720, i32 -80
%p22 = add i32 %p20, %p21
%p23 = select i1 %p5, i32 -720, i32 -80
%p24 = add i32 %p22, %p23
%p25 = select i1 %p7, i32 -720, i32 -80
%p26 = add i32 %p24, %p25
%p27 = select i1 %p9, i32 -720, i32 -80
%p28 = add i32 %p26, %p27
%p29 = select i1 %p11, i32 -720, i32 -80
%p30 = add i32 %p28, %p29
%p31 = select i1 %p13, i32 -720, i32 -80
%p32 = add i32 %p30, %p31
%p33 = select i1 %p15, i32 -720, i32 -80
%p34 = add i32 %p32, %p33
br label %for.body
}