When a recipe can be safely sunk and all of its users are outside the vector loop region, in the same dedicated exit block, the recipe does not need to be executed on every iteration. This patch extends the VPlan-based LICM (Loop Invariant Code Motion) to also sink such recipes from the vector loop region into the exit block. This reduces redundant computation and improves cost model accuracy.

- TODO: Support nested loop sinking.
- TODO: Support sinking `VPReplicateRecipe` (requires `replicateByVF` fixes).
- TODO: Support recipes with multiple defined values (e.g., interleaved loads).
- TODO: Clone recipes without users to all exit blocks.
- TODO: Support PHI node users by checking incoming value blocks.
- TODO: Support sinking when users are in multiple blocks.
- TODO: Clone recipes when users are on multiple exit paths.

Co-authored-by: Luke Lau <luke@igalia.com>
---------
Co-authored-by: Luke Lau <luke@igalia.com>
Co-authored-by: Luke Lau <luke_lau@icloud.com>
95 lines
4.1 KiB
LLVM
95 lines
4.1 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
|
|
; The lit command line below runs only the loop vectorizer, forcing a vector
; width of 4 and an interleave count of 1, merges stderr into stdout, and pipes
; the result to FileCheck using this file as the pattern source.
; RUN: opt -passes='loop-vectorize' -force-vector-width=4 -force-vector-interleave=1 -S < %s 2>&1 | FileCheck %s
|
|
|
|
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
|
|
|
|
; Make sure the selects generated from reduction are always emitted
|
|
; in deterministic order.
|
|
;
|
|
; @foo: counted loop of 10 iterations (%iv starts at 0 and the loop exits once
; %iv.next equals 10) carrying two independent i32 add reductions (+5 and +3)
; whose final values are summed in %exit. The function carries the entry-count
; profile data in !1.
; NOTE(review): the autogenerated expectations below keep only the two vector
; adds and the induction update in the vector body, and place the lane mask
; (icmp ule against 9), both selects and both reduce.add calls in the middle
; block, with the select for the +5 reduction ahead of the one for the +3
; reduction -- confirm this still matches the intended output when regenerating.
define i32 @foo() !prof !1 {
|
|
; CHECK-LABEL: define i32 @foo() {{.*}}{
|
|
; CHECK-NEXT: [[T16:.*:]]
|
|
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
|
|
; CHECK: [[VECTOR_PH]]:
|
|
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
|
|
; CHECK: [[VECTOR_BODY]]:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI_1:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[ADD_5:%.*]], %[[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[VEC_PHI_2:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[ADD_3:%.*]], %[[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[ADD_3]] = add <4 x i32> splat (i32 3), [[VEC_PHI_2]]
|
|
; CHECK-NEXT: [[ADD_5]] = add <4 x i32> [[VEC_PHI_1]], splat (i32 5)
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
|
|
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12
|
|
; CHECK-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
|
|
; CHECK: [[MIDDLE_BLOCK]]:
|
|
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
|
|
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
|
|
; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
|
|
; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IV]], splat (i64 9)
|
|
; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[ADD_5]], <4 x i32> [[VEC_PHI_1]]
|
|
; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[ADD_3]], <4 x i32> [[VEC_PHI_2]]
|
|
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
|
|
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
|
|
; CHECK-NEXT: br label %[[EXIT:.*]]
|
|
; CHECK: [[EXIT]]:
|
|
; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP7]], [[TMP6]]
|
|
; CHECK-NEXT: ret i32 [[ADD]]
|
|
;
|
|
entry:
|
|
br label %loop
|
|
|
|
|
; Loop body: %iv counts iterations; %red.1 and %red.2 are the two running sums,
; both starting at 0.
loop:
|
|
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
|
|
%red.1 = phi i32 [ 0, %entry ], [ %red.1.next, %loop ]
|
|
%red.2 = phi i32 [ 0, %entry ], [ %red.2.next, %loop ]
|
|
; The reduction updates are deliberately written in the opposite order from
; their header phis: %red.2 is updated first, then %red.1.
%red.2.next = add i32 3, %red.2
|
|
%red.1.next = add i32 %red.1, 5
|
|
%iv.next = add nuw nsw i64 %iv, 1
|
|
; Leave the loop once %iv.next reaches 10; the branch weights come from !2.
%ec = icmp eq i64 %iv.next, 10
|
|
br i1 %ec, label %exit, label %loop, !prof !2
|
|
|
|
; Single-incoming phis forward the final reduction values out of the loop
; (%r.2 is listed before %r.1); their sum is the return value.
exit:
|
|
%r.2 = phi i32 [ %red.2.next, %loop ]
|
|
%r.1 = phi i32 [ %red.1.next, %loop ]
|
|
%add = add i32 %r.2, %r.1
|
|
ret i32 %add
|
|
}
|
|
|
|
; Make sure we do not fail when checking for ordered reduction. This test just
|
|
; exercises the path and bails out without performing vectorization.
|
|
; @quux: two-block loop (%header, %latch) whose back-edge condition is the i1
; argument %arg. It carries a double value that starts at 13.0 and has 1.0
; added on every trip; the value leaving the loop is returned.
; The autogenerated expectations below mirror the input block for block
; (entry, header, latch, exit) with scalar instructions only.
define double @quux(i1 %arg) {
|
|
; CHECK-LABEL: define double @quux(
|
|
; CHECK-SAME: i1 [[ARG:%.*]]) {
|
|
; CHECK-NEXT: [[ENTRY:.*]]:
|
|
; CHECK-NEXT: br label %[[HEADER:.*]]
|
|
; CHECK: [[HEADER]]:
|
|
; CHECK-NEXT: [[TMP5:%.*]] = phi double [ 1.300000e+01, %[[ENTRY]] ], [ [[TMP:%.*]], %[[LATCH:.*]] ]
|
|
; CHECK-NEXT: [[TMP6:%.*]] = fadd double [[TMP5]], 1.000000e+00
|
|
; CHECK-NEXT: br label %[[LATCH]]
|
|
; CHECK: [[LATCH]]:
|
|
; CHECK-NEXT: [[TMP]] = phi double [ [[TMP6]], %[[HEADER]] ]
|
|
; CHECK-NEXT: br i1 [[ARG]], label %[[HEADER]], label %[[EXIT:.*]]
|
|
; CHECK: [[EXIT]]:
|
|
; CHECK-NEXT: [[R:%.*]] = phi double [ [[TMP]], %[[LATCH]] ]
|
|
; CHECK-NEXT: ret double [[R]]
|
|
;
|
|
entry:
|
|
br label %header
|
|
|
|
|
; %tmp5 is 13.0 on entry and otherwise takes the value of the latch phi %tmp.
header:
|
|
%tmp5 = phi double [ 1.300000e+01, %entry ], [ %tmp, %latch ]
|
|
%tmp6 = fadd double %tmp5, 1.000000e+00
|
|
br label %latch
|
|
|
|
|
; %tmp is a single-incoming phi that forwards %tmp6; the loop repeats while
; %arg is true and exits otherwise.
latch:
|
|
%tmp = phi double [ %tmp6, %header ]
|
|
br i1 %arg, label %header, label %exit
|
|
|
|
|
; Single-incoming phi carrying the final value out of the loop.
exit:
|
|
%r = phi double [ %tmp, %latch ]
|
|
ret double %r
|
|
}
|
|
|
|
; Profile metadata. !1 is the function entry count attached to @foo through
; its !prof attachment.
!1 = !{!"function_entry_count", i64 801}
|
|
; !2 holds the branch weights attached to the conditional branch that ends the
; %loop block of @foo.
!2 = !{!"branch_weights", i32 746, i32 1}
|