Mel Chen 8c6658aca6
[VPlan] Sink recipes from the vector loop region in licm. (#168031)
When a recipe can be safely sunk and all of its users are outside the
vector loop region in the same dedicated exit block, the recipe does not
need to be executed on every iteration.
This patch extends the VPlan-based LICM (Loop Invariant Code Motion) to
also sink such recipes from the vector loop region into the exit block.
This reduces redundant computation and improves cost model accuracy.

TODO: Support nested loop sinking
TODO: Support sinking `VPReplicateRecipe` (requires `replicateByVF`
fixes)
TODO: Support recipes with multiple defined values (e.g., interleaved
loads)
TODO: Clone recipes without users to all exit blocks
TODO: Support PHI node users by checking incoming value blocks
TODO: Support sinking when users are in multiple blocks
TODO: Clone recipes when users are on multiple exit paths

Co-authored-by: Luke Lau <luke@igalia.com>

---------

Co-authored-by: Luke Lau <luke@igalia.com>
Co-authored-by: Luke Lau <luke_lau@icloud.com>
2026-02-03 07:57:15 +00:00

101 lines
4.6 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -force-vector-width=4 -force-vector-interleave=1 -passes=loop-vectorize -S %s | FileCheck %s
; The invocation above runs only the loop vectorizer, with the vector width
; forced to 4 and the interleave count forced to 1, and pipes the textual IR
; into FileCheck so it is matched against the expectations embedded in each
; function below.
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
; @test: select-based reduction over a short count-down loop.
;
; %extra.iter is %N masked with 7, so it lies in [0, 7]. When it is 0 the
; loop is bypassed and the function returns 0. Otherwise the loop body
; executes %extra.iter times, with %iv counting down by one per iteration
; until %iv.next reaches 0. On every iteration %sel keeps %next when it is
; signed-greater than 10 and yields 10 otherwise. %x is never read.
;
; In the expected output the lane mask (an icmp ule against the splat of
; trip-count-minus-1) and the select that applies it are emitted in
; middle.block, ahead of the llvm.vector.reduce.smax call; vector.body holds
; only the icmp/select pair and the induction update.
; NOTE(review): the small upper bound on the trip count is presumably what
; makes the vectorizer fold the tail here - confirm against the cost model.
define i32 @test(i64 %N, i32 %x) {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[EXTRA_ITER:%.*]] = and i64 [[N:%.*]], 7
; CHECK-NEXT: br label [[CHECK:%.*]]
; CHECK: check:
; CHECK-NEXT: [[EXTRA_ITER_CHECK:%.*]] = icmp eq i64 [[EXTRA_ITER]], 0
; CHECK-NEXT: br i1 [[EXTRA_ITER_CHECK]], label [[EXIT:%.*]], label [[LOOP_PREHEADER:%.*]]
; CHECK: loop.preheader:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[EXTRA_ITER]], 3
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[EXTRA_ITER]], 1
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[VEC_PHI]], splat (i32 10)
; CHECK-NEXT: [[TMP2]] = select <4 x i1> [[TMP1]], <4 x i32> [[VEC_PHI]], <4 x i32> splat (i32 10)
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
; CHECK-NEXT: [[TMP6:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[TMP2]], <4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP3]])
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: exit.loopexit:
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
; CHECK-NEXT: [[RESULT:%.*]] = phi i32 [ 0, [[CHECK]] ], [ [[TMP5]], [[LOOP]] ]
; CHECK-NEXT: ret i32 [[RESULT]]
;
entry:
; Keep only the low three bits of %N as the iteration count.
%extra.iter = and i64 %N, 7
br label %check
check:
; Nothing to iterate over when the masked count is zero: go straight to exit.
%extra.iter.check = icmp eq i64 %extra.iter, 0
br i1 %extra.iter.check, label %exit, label %loop
loop:
; %next is the running value (0 on the first iteration); %iv is the number of
; iterations still to execute, including the current one.
%next = phi i32 [ %sel, %loop ], [ 0, %check ]
%iv = phi i64 [ %iv.next, %loop ], [ %extra.iter, %check ]
; %sel = (%next > 10, signed) ? %next : 10
%sel.cond = icmp sgt i32 %next, 10
%sel = select i1 %sel.cond, i32 %next, i32 10
%iv.next = add nsw i64 %iv, -1
; Leave the loop once the decremented counter reaches zero.
%ec = icmp eq i64 %iv.next, 0
br i1 %ec, label %exit, label %loop
exit:
; %sel when arriving from the loop, 0 when the loop was bypassed.
%result = phi i32 [ %sel, %loop], [ 0, %check ]
ret i32 %result
}
; @pr66895_tail_fold_reduction_exit_inst_gets_simplified: multiply reduction
; whose update is a multiplication by 1.
;
; %iv starts at 12 and is decremented by one per iteration; the exit test
; compares %iv itself (not %iv.next) against 0, so the body executes 13 times.
; %red starts at 0 and %red.next is %red * 1; the last %red.next is returned
; through the LCSSA phi in the exit block. %n is never read.
;
; In the expected output the vector reduction phi is fed back into itself
; unchanged, the vector loop steps by 4 up to 16, and middle.block reduces the
; phi directly with llvm.vector.reduce.mul.
; NOTE(review): the function name suggests this is a regression test for issue
; 66895 about a tail-folded reduction whose exit instruction gets simplified -
; confirm against the issue tracker.
define i32 @pr66895_tail_fold_reduction_exit_inst_gets_simplified(i32 %n) {
; CHECK-LABEL: @pr66895_tail_fold_reduction_exit_inst_gets_simplified(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[VEC_PHI]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[VEC_PHI]])
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: exit:
; CHECK-NEXT: ret i32 [[TMP3]]
;
entry:
br label %loop
loop:
; %iv counts down from the constant 12; %red is the reduction value, seeded with 0.
%iv = phi i32 [ 12, %entry ], [ %iv.next, %loop ]
%red = phi i32 [ 0, %entry ], [ %red.next, %loop ]
%iv.next = add i32 %iv, -1
; Multiplying by 1 leaves the reduction value unchanged.
%red.next = mul i32 %red, 1
; The exit test uses the pre-decrement %iv, so the iteration with %iv == 0 still executes.
%ec = icmp eq i32 %iv, 0
br i1 %ec, label %exit, label %loop
exit:
; Single-entry phi that carries %red.next out of the loop.
%red.lcssa = phi i32 [ %red.next, %loop ]
ret i32 %red.lcssa
}