[LoopRotate] Use SCEV exit counts to improve rotation profitability (#187483)

Most loop transformations, like unrolling and vectorization, expect the
latch branch to be countable. Allow rotation, if it turns the latch from
uncountable to countable.

This uses SCEV to check for countable exits if CheckExitCount is set.
Currently it is not set for the LPM1 run (where SCEV is not used by
other passes), only in LPM.

With that, the compile-time impact is mostly neutral:

https://llvm-compile-time-tracker.com/compare.php?from=eba342d0ba930a404a026c80aada51c43974f0db&to=2e676337b45fae63ce9498116d8e6e43772363c5&stat=instructions:u

ClamAV is consistently slower (~+0.15%) and 7zip is faster in most cases
(~-0.13%).

Across a large test set based on C/C++ workloads, this rotates ~0.8%
more loops with ~2.68M rotated loops.

For the test set, ~2.7% more loops are runtime-unrolled and +6.36% more
early exit loops vectorized on ARM64 macOS.

This fixes a regression where std::ranges::find_last loops stopped
being runtime-unrolled after 5f648c370e, which changed the loop
structure so that we stopped rotating them.

https://clang.godbolt.org/z/6baeE1av6

Based on https://github.com/llvm/llvm-project/pull/162654.

Co-authored-by:  Marek Sedláček <mr.mareksedlacek@gmail.com>

PR: https://github.com/llvm/llvm-project/pull/187483
This commit is contained in:
Florian Hahn 2026-03-20 10:21:15 +00:00 committed by GitHub
parent 14de6dafee
commit 21f439f132
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 376 additions and 28 deletions

View File

@ -24,7 +24,7 @@ class Loop;
class LoopRotatePass : public PassInfoMixin<LoopRotatePass> {
public:
LoopRotatePass(bool EnableHeaderDuplication = true,
bool PrepareForLTO = false);
bool PrepareForLTO = false, bool CheckExitCount = false);
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR, LPMUpdater &U);
@ -34,6 +34,7 @@ public:
private:
const bool EnableHeaderDuplication;
const bool PrepareForLTO;
const bool CheckExitCount;
};
}

View File

@ -37,7 +37,8 @@ LLVM_ABI bool LoopRotation(Loop *L, LoopInfo *LI,
DominatorTree *DT, ScalarEvolution *SE,
MemorySSAUpdater *MSSAU, const SimplifyQuery &SQ,
bool RotationOnly, unsigned Threshold,
bool IsUtilMode, bool PrepareForLTO = false);
bool IsUtilMode, bool PrepareForLTO = false,
bool CheckExitCount = false);
} // namespace llvm

View File

@ -1274,17 +1274,25 @@ Expected<LICMOptions> parseLICMOptions(StringRef Params) {
return Result;
}
Expected<std::pair<bool, bool>> parseLoopRotateOptions(StringRef Params) {
std::pair<bool, bool> Result = {true, false};
/// Options accepted by the "loop-rotate" pass when it is named in a textual
/// pass pipeline; filled in by parseLoopRotateOptions. Each option is enabled
/// by its bare name and disabled by the same name prefixed with "no-".
struct LoopRotateOptions {
/// Controlled by "header-duplication"; enabled by default.
bool EnableHeaderDuplication = true;
/// Controlled by "prepare-for-lto"; disabled by default.
bool PrepareForLTO = false;
/// Controlled by "check-exit-count"; disabled by default.
bool CheckExitCount = false;
};
Expected<LoopRotateOptions> parseLoopRotateOptions(StringRef Params) {
LoopRotateOptions Result;
while (!Params.empty()) {
StringRef ParamName;
std::tie(ParamName, Params) = Params.split(';');
bool Enable = !ParamName.consume_front("no-");
if (ParamName == "header-duplication") {
Result.first = Enable;
Result.EnableHeaderDuplication = Enable;
} else if (ParamName == "prepare-for-lto") {
Result.second = Enable;
Result.PrepareForLTO = Enable;
} else if (ParamName == "check-exit-count") {
Result.CheckExitCount = Enable;
} else {
return make_error<StringError>(
formatv("invalid LoopRotate pass parameter '{}'", ParamName).str(),

View File

@ -1579,7 +1579,7 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
// Disable header duplication at -Oz.
LPM.addPass(LoopRotatePass(EnableLoopHeaderDuplication ||
Level != OptimizationLevel::Oz,
LTOPreLink));
LTOPreLink, /*CheckExitCount=*/true));
// Some loops may have become dead by now. Try to delete them.
// FIXME: see discussion in https://reviews.llvm.org/D112851,
// this may need to be revisited once we run GVN before loop deletion

View File

@ -800,12 +800,14 @@ LOOP_PASS_WITH_PARAMS(
parseLICMOptions, "allowspeculation;no-allowspeculation")
LOOP_PASS_WITH_PARAMS(
"loop-rotate", "LoopRotatePass",
[](std::pair<bool, bool> Params) {
return LoopRotatePass(Params.first, Params.second);
[](LoopRotateOptions Params) {
return LoopRotatePass(Params.EnableHeaderDuplication, Params.PrepareForLTO,
Params.CheckExitCount);
},
parseLoopRotateOptions,
"no-header-duplication;header-duplication;"
"no-prepare-for-lto;prepare-for-lto")
"no-prepare-for-lto;prepare-for-lto;"
"no-check-exit-count;check-exit-count")
LOOP_PASS_WITH_PARAMS(
"simple-loop-unswitch", "SimpleLoopUnswitchPass",
[](std::pair<bool, bool> Params) {

View File

@ -38,9 +38,10 @@ static cl::opt<bool> PrepareForLTOOption(
cl::desc("Run loop-rotation in the prepare-for-lto stage. This option "
"should be used for testing only."));
LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication, bool PrepareForLTO)
LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication, bool PrepareForLTO,
bool CheckExitCount)
: EnableHeaderDuplication(EnableHeaderDuplication),
PrepareForLTO(PrepareForLTO) {}
PrepareForLTO(PrepareForLTO), CheckExitCount(CheckExitCount) {}
void LoopRotatePass::printPipeline(
raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
@ -53,7 +54,11 @@ void LoopRotatePass::printPipeline(
if (!PrepareForLTO)
OS << "no-";
OS << "prepare-for-lto";
OS << "prepare-for-lto;";
if (!CheckExitCount)
OS << "no-";
OS << "check-exit-count";
OS << ">";
}
@ -74,9 +79,10 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM,
std::optional<MemorySSAUpdater> MSSAU;
if (AR.MSSA)
MSSAU = MemorySSAUpdater(AR.MSSA);
bool Changed = LoopRotation(&L, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE,
MSSAU ? &*MSSAU : nullptr, SQ, false, Threshold,
false, PrepareForLTO || PrepareForLTOOption);
bool Changed =
LoopRotation(&L, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE,
MSSAU ? &*MSSAU : nullptr, SQ, false, Threshold, false,
PrepareForLTO || PrepareForLTOOption, CheckExitCount);
if (!Changed)
return PreservedAnalyses::all();

View File

@ -63,16 +63,18 @@ class LoopRotate {
bool RotationOnly;
bool IsUtilMode;
bool PrepareForLTO;
bool CheckExitCount;
public:
LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI,
const TargetTransformInfo *TTI, AssumptionCache *AC,
DominatorTree *DT, ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
const SimplifyQuery &SQ, bool RotationOnly, bool IsUtilMode,
bool PrepareForLTO)
bool PrepareForLTO, bool CheckExitCount)
: MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE),
MSSAU(MSSAU), SQ(SQ), RotationOnly(RotationOnly),
IsUtilMode(IsUtilMode), PrepareForLTO(PrepareForLTO) {}
IsUtilMode(IsUtilMode), PrepareForLTO(PrepareForLTO),
CheckExitCount(CheckExitCount) {}
bool processLoop(Loop *L);
private:
@ -178,11 +180,12 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
}
}
// Assuming both header and latch are exiting, look for a phi which is only
// used outside the loop (via a LCSSA phi) in the exit from the header.
// This means that rotating the loop can remove the phi.
static bool profitableToRotateLoopExitingLatch(Loop *L) {
// Assuming both header and latch are exiting, check if rotating is profitable:
// either a header phi becomes dead, or rotating makes the latch exit count
// computable (enabling downstream optimizations like unrolling/vectorization).
static bool profitableToRotateLoopExitingLatch(Loop *L, ScalarEvolution *SE) {
BasicBlock *Header = L->getHeader();
BasicBlock *Latch = L->getLoopLatch();
CondBrInst *BI = dyn_cast<CondBrInst>(Header->getTerminator());
BasicBlock *HeaderExit = BI->getSuccessor(0);
if (L->contains(HeaderExit))
@ -196,6 +199,13 @@ static bool profitableToRotateLoopExitingLatch(Loop *L) {
continue;
return true;
}
// Check if rotating would make the latch exit count computable, enabling
// optimizations like runtime unrolling and vectorization.
if (SE && isa<SCEVCouldNotCompute>(SE->getExitCount(L, Latch)) &&
!isa<SCEVCouldNotCompute>(SE->getExitCount(L, Header)))
return true;
return false;
}
@ -363,7 +373,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
// Rotate if the loop latch was just simplified. Or if it makes the loop exit
// count computable. Or if we think it will be profitable.
if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false &&
!profitableToRotateLoopExitingLatch(L))
!profitableToRotateLoopExitingLatch(L, CheckExitCount ? SE : nullptr))
return Rotated;
// Check size of original header and reject loop if it is very big or we can't
@ -965,8 +975,9 @@ bool llvm::LoopRotation(Loop *L, LoopInfo *LI, const TargetTransformInfo *TTI,
ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
const SimplifyQuery &SQ, bool RotationOnly = true,
unsigned Threshold = unsigned(-1),
bool IsUtilMode = true, bool PrepareForLTO) {
bool IsUtilMode = true, bool PrepareForLTO,
bool CheckExitCount) {
LoopRotate LR(Threshold, LI, TTI, AC, DT, SE, MSSAU, SQ, RotationOnly,
IsUtilMode, PrepareForLTO);
IsUtilMode, PrepareForLTO, CheckExitCount);
return LR.processLoop(L);
}

View File

@ -4,7 +4,7 @@
; CHECK-0: function(adce),function(adce)
; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='module(rpo-function-attrs,require<globals-aa>,function(float2int,lower-constant-intrinsics,loop(loop-rotate)),invalidate<globals-aa>)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-1
; CHECK-1: rpo-function-attrs,require<globals-aa>,function(float2int,lower-constant-intrinsics,loop(loop-rotate<header-duplication;no-prepare-for-lto>)),invalidate<globals-aa>
; CHECK-1: rpo-function-attrs,require<globals-aa>,function(float2int,lower-constant-intrinsics,loop(loop-rotate<header-duplication;no-prepare-for-lto;no-check-exit-count>)),invalidate<globals-aa>
;; Test that we get ClassName printed when there is no ClassName to pass-name mapping (as is the case for the BitcodeWriterPass).
; RUN: opt -o /dev/null -disable-verify -print-pipeline-passes -passes='function(mem2reg)' < %s -disable-pipeline-verification | FileCheck %s --match-full-lines --check-prefixes=CHECK-3
@ -66,7 +66,7 @@
;; Test that the loop-nest-pass lnicm is printed with the other loop-passes in the pipeline.
; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(loop-mssa(licm,loop-rotate,loop-deletion,lnicm,loop-rotate))' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-23
; CHECK-23: function(loop-mssa(licm<allowspeculation>,loop-rotate<header-duplication;no-prepare-for-lto>,loop-deletion,lnicm<allowspeculation>,loop-rotate<header-duplication;no-prepare-for-lto>))
; CHECK-23: function(loop-mssa(licm<allowspeculation>,loop-rotate<header-duplication;no-prepare-for-lto;no-check-exit-count>,loop-deletion,lnicm<allowspeculation>,loop-rotate<header-duplication;no-prepare-for-lto;no-check-exit-count>))
;; Test that -debugify and -check-debugify is printed correctly.
; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='debugify,no-op-function,check-debugify' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-24
@ -110,7 +110,7 @@
; CHECK-32: cgscc(function<no-rerun>(no-op-function))
; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(loop(loop-rotate<no-header-duplication;no-prepare-for-lto>))' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-33
; CHECK-33: function(loop(loop-rotate<no-header-duplication;no-prepare-for-lto>))
; CHECK-33: function(loop(loop-rotate<no-header-duplication;no-prepare-for-lto;no-check-exit-count>))
; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='globaldce' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-34
; CHECK-34: globaldce

View File

@ -0,0 +1,155 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -passes='loop(loop-rotate<header-duplication>)' %s | FileCheck %s --check-prefixes=CHECK,NO-EXIT-COUNT
; RUN: opt -S -passes='loop(loop-rotate<header-duplication;check-exit-count>)' %s | FileCheck %s --check-prefixes=CHECK,EXIT-COUNT
; Computable header exit, data-dependent latch exit. Rotated with check-exit-count.
define ptr @search_loop(ptr %begin, ptr %end, i8 %val) {
; NO-EXIT-COUNT-LABEL: @search_loop(
; NO-EXIT-COUNT-NEXT: entry:
; NO-EXIT-COUNT-NEXT: [[CMP_ENTRY:%.*]] = icmp eq ptr [[BEGIN:%.*]], [[END:%.*]]
; NO-EXIT-COUNT-NEXT: br i1 [[CMP_ENTRY]], label [[EXIT:%.*]], label [[FOR_COND_PREHEADER:%.*]]
; NO-EXIT-COUNT: for.cond.preheader:
; NO-EXIT-COUNT-NEXT: br label [[FOR_COND:%.*]]
; NO-EXIT-COUNT: for.cond:
; NO-EXIT-COUNT-NEXT: [[PTR:%.*]] = phi ptr [ [[PTR_DEC:%.*]], [[FOR_BODY:%.*]] ], [ [[END]], [[FOR_COND_PREHEADER]] ]
; NO-EXIT-COUNT-NEXT: [[PTR_DEC]] = getelementptr inbounds i8, ptr [[PTR]], i64 -1
; NO-EXIT-COUNT-NEXT: [[CMP_END:%.*]] = icmp eq ptr [[PTR_DEC]], [[BEGIN]]
; NO-EXIT-COUNT-NEXT: br i1 [[CMP_END]], label [[NOT_FOUND:%.*]], label [[FOR_BODY]]
; NO-EXIT-COUNT: for.body:
; NO-EXIT-COUNT-NEXT: [[LOAD:%.*]] = load i8, ptr [[PTR_DEC]], align 1
; NO-EXIT-COUNT-NEXT: [[CMP_VAL:%.*]] = icmp eq i8 [[LOAD]], [[VAL:%.*]]
; NO-EXIT-COUNT-NEXT: br i1 [[CMP_VAL]], label [[FOUND:%.*]], label [[FOR_COND]]
; NO-EXIT-COUNT: found:
; NO-EXIT-COUNT-NEXT: [[PTR_DEC_LCSSA1:%.*]] = phi ptr [ [[PTR_DEC]], [[FOR_BODY]] ]
; NO-EXIT-COUNT-NEXT: ret ptr [[PTR_DEC_LCSSA1]]
; NO-EXIT-COUNT: not.found:
; NO-EXIT-COUNT-NEXT: br label [[EXIT]]
; NO-EXIT-COUNT: exit:
; NO-EXIT-COUNT-NEXT: ret ptr [[END]]
;
; EXIT-COUNT-LABEL: @search_loop(
; EXIT-COUNT-NEXT: entry:
; EXIT-COUNT-NEXT: [[CMP_ENTRY:%.*]] = icmp eq ptr [[BEGIN:%.*]], [[END:%.*]]
; EXIT-COUNT-NEXT: br i1 [[CMP_ENTRY]], label [[EXIT:%.*]], label [[FOR_COND_PREHEADER:%.*]]
; EXIT-COUNT: for.cond.preheader:
; EXIT-COUNT-NEXT: [[PTR_DEC2:%.*]] = getelementptr inbounds i8, ptr [[END]], i64 -1
; EXIT-COUNT-NEXT: [[CMP_END3:%.*]] = icmp eq ptr [[PTR_DEC2]], [[BEGIN]]
; EXIT-COUNT-NEXT: br i1 [[CMP_END3]], label [[NOT_FOUND:%.*]], label [[FOR_BODY_LR_PH:%.*]]
; EXIT-COUNT: for.body.lr.ph:
; EXIT-COUNT-NEXT: br label [[FOR_BODY:%.*]]
; EXIT-COUNT: for.cond:
; EXIT-COUNT-NEXT: [[PTR:%.*]] = phi ptr [ [[PTR_DEC4:%.*]], [[FOR_BODY]] ]
; EXIT-COUNT-NEXT: [[PTR_DEC:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 -1
; EXIT-COUNT-NEXT: [[CMP_END:%.*]] = icmp eq ptr [[PTR_DEC]], [[BEGIN]]
; EXIT-COUNT-NEXT: br i1 [[CMP_END]], label [[FOR_COND_NOT_FOUND_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; EXIT-COUNT: for.body:
; EXIT-COUNT-NEXT: [[PTR_DEC4]] = phi ptr [ [[PTR_DEC2]], [[FOR_BODY_LR_PH]] ], [ [[PTR_DEC]], [[FOR_COND:%.*]] ]
; EXIT-COUNT-NEXT: [[LOAD:%.*]] = load i8, ptr [[PTR_DEC4]], align 1
; EXIT-COUNT-NEXT: [[CMP_VAL:%.*]] = icmp eq i8 [[LOAD]], [[VAL:%.*]]
; EXIT-COUNT-NEXT: br i1 [[CMP_VAL]], label [[FOUND:%.*]], label [[FOR_COND]]
; EXIT-COUNT: found:
; EXIT-COUNT-NEXT: [[PTR_DEC_LCSSA1:%.*]] = phi ptr [ [[PTR_DEC4]], [[FOR_BODY]] ]
; EXIT-COUNT-NEXT: ret ptr [[PTR_DEC_LCSSA1]]
; EXIT-COUNT: for.cond.not.found_crit_edge:
; EXIT-COUNT-NEXT: br label [[NOT_FOUND]]
; EXIT-COUNT: not.found:
; EXIT-COUNT-NEXT: br label [[EXIT]]
; EXIT-COUNT: exit:
; EXIT-COUNT-NEXT: ret ptr [[END]]
;
entry:
%cmp.entry = icmp eq ptr %begin, %end
br i1 %cmp.entry, label %exit, label %for.cond
for.cond:
%ptr = phi ptr [ %end, %entry ], [ %ptr.dec, %for.body ]
%ptr.dec = getelementptr inbounds i8, ptr %ptr, i64 -1
%cmp.end = icmp eq ptr %ptr.dec, %begin
br i1 %cmp.end, label %not.found, label %for.body
for.body:
%load = load i8, ptr %ptr.dec, align 1
%cmp.val = icmp eq i8 %load, %val
br i1 %cmp.val, label %found, label %for.cond
found:
ret ptr %ptr.dec
not.found:
br label %exit
exit:
ret ptr %end
}
; Both exits are memory-dependent (not computable). Not rotated.
define i32 @both_mem_dependent(ptr %p, ptr %q, i32 %n) {
; CHECK-LABEL: @both_mem_dependent(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BODY:%.*]] ]
; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[P:%.*]], align 4
; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[LOAD1]], 0
; CHECK-NEXT: br i1 [[CMP1]], label [[EXIT1:%.*]], label [[BODY]]
; CHECK: body:
; CHECK-NEXT: [[LOAD2:%.*]] = load i32, ptr [[Q:%.*]], align 4
; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[LOAD2]], 0
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
; CHECK-NEXT: br i1 [[CMP2]], label [[EXIT2:%.*]], label [[LOOP]]
; CHECK: exit1:
; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i32 [ [[IV]], [[LOOP]] ]
; CHECK-NEXT: ret i32 [[IV_LCSSA]]
; CHECK: exit2:
; CHECK-NEXT: [[IV_LCSSA1:%.*]] = phi i32 [ [[IV]], [[BODY]] ]
; CHECK-NEXT: ret i32 [[IV_LCSSA1]]
;
entry:
br label %loop
loop:
%iv = phi i32 [ 0, %entry ], [ %iv.next, %body ]
%load1 = load i32, ptr %p, align 4
%cmp1 = icmp eq i32 %load1, 0
br i1 %cmp1, label %exit1, label %body
body:
%load2 = load i32, ptr %q, align 4
%cmp2 = icmp eq i32 %load2, 0
%iv.next = add i32 %iv, 1
br i1 %cmp2, label %exit2, label %loop
exit1:
ret i32 %iv
exit2:
ret i32 %iv
}
; Single-block loop: the only block is both header and exiting latch, so the
; loop is already in rotated form and is left unchanged.
define void @already_rotated(ptr %p, i32 %n) {
; CHECK-LABEL: @already_rotated(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT: store i32 [[IV]], ptr [[P:%.*]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add nuw i32 [[IV]], 1
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], [[N:%.*]]
; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
entry:
br label %loop
loop:
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
store i32 %iv, ptr %p, align 4
%iv.next = add nuw i32 %iv, 1
%cmp = icmp eq i32 %iv.next, %n
br i1 %cmp, label %exit, label %loop
exit:
ret void
}

View File

@ -0,0 +1,164 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -S -O3 %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
; Test that a search loop with computable header exit gets rotated and
; runtime-unrolled at -O3.
define ptr @search_loop_unrolled(ptr %begin, ptr %end, i8 %val) {
; CHECK-LABEL: define ptr @search_loop_unrolled(
; CHECK-SAME: ptr readnone captures(address) [[BEGIN:%.*]], ptr readonly captures(address, ret: address, provenance) [[END:%.*]], i8 [[VAL:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[CMP_ENTRY:%.*]] = icmp eq ptr [[BEGIN]], [[END]]
; CHECK-NEXT: [[PTR_DEC1:%.*]] = getelementptr inbounds i8, ptr [[END]], i64 -1
; CHECK-NEXT: [[CMP_END2:%.*]] = icmp eq ptr [[PTR_DEC1]], [[BEGIN]]
; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[CMP_ENTRY]], i1 true, i1 [[CMP_END2]]
; CHECK-NEXT: br i1 [[OR_COND]], label %[[COMMON_RET:.*]], label %[[FOR_COND:.*]]
; CHECK: [[FOR_COND]]:
; CHECK-NEXT: [[END5:%.*]] = ptrtoint ptr [[END]] to i64
; CHECK-NEXT: [[BEGIN6:%.*]] = ptrtoint ptr [[BEGIN]] to i64
; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[BEGIN6]], -1
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], [[END5]]
; CHECK-NEXT: [[TMP2:%.*]] = freeze i64 [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], -1
; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP2]], 3
; CHECK-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0
; CHECK-NEXT: br i1 [[LCMP_MOD_NOT]], label %[[FOR_BODY_PROL_LOOPEXIT:.*]], label %[[FOR_BODY_PROL:.*]]
; CHECK: [[FOR_BODY_PROL]]:
; CHECK-NEXT: [[PTR_DEC5:%.*]] = phi ptr [ [[PTR_DEC4:%.*]], %[[FOR_COND_PROL:.*]] ], [ [[PTR_DEC1]], %[[FOR_COND]] ]
; CHECK-NEXT: [[PROL_ITER:%.*]] = phi i64 [ [[PROL_ITER_NEXT:%.*]], %[[FOR_COND_PROL]] ], [ 0, %[[FOR_COND]] ]
; CHECK-NEXT: [[LOAD_PROL:%.*]] = load i8, ptr [[PTR_DEC5]], align 1
; CHECK-NEXT: [[CMP_VAL_PROL:%.*]] = icmp eq i8 [[LOAD_PROL]], [[VAL]]
; CHECK-NEXT: br i1 [[CMP_VAL_PROL]], label %[[COMMON_RET]], label %[[FOR_COND_PROL]]
; CHECK: [[FOR_COND_PROL]]:
; CHECK-NEXT: [[PTR_DEC4]] = getelementptr inbounds i8, ptr [[PTR_DEC5]], i64 -1
; CHECK-NEXT: [[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1
; CHECK-NEXT: [[PROL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[PROL_ITER_NEXT]], [[XTRAITER]]
; CHECK-NEXT: br i1 [[PROL_ITER_CMP_NOT]], label %[[FOR_BODY_PROL_LOOPEXIT]], label %[[FOR_BODY_PROL]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[FOR_BODY_PROL_LOOPEXIT]]:
; CHECK-NEXT: [[PTR_DEC3_UNR:%.*]] = phi ptr [ [[PTR_DEC1]], %[[FOR_COND]] ], [ [[PTR_DEC4]], %[[FOR_COND_PROL]] ]
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 3
; CHECK-NEXT: br i1 [[TMP4]], label %[[COMMON_RET]], label %[[FOR_BODY:.*]]
; CHECK: [[FOR_COND1:.*]]:
; CHECK-NEXT: [[PTR_DEC:%.*]] = getelementptr inbounds i8, ptr [[PTR_DEC3:%.*]], i64 -1
; CHECK-NEXT: [[LOAD_1:%.*]] = load i8, ptr [[PTR_DEC]], align 1
; CHECK-NEXT: [[CMP_VAL_1:%.*]] = icmp eq i8 [[LOAD_1]], [[VAL]]
; CHECK-NEXT: br i1 [[CMP_VAL_1]], label %[[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT14:.*]], label %[[FOR_COND_1:.*]]
; CHECK: [[FOR_COND_1]]:
; CHECK-NEXT: [[PTR_DEC_1:%.*]] = getelementptr inbounds i8, ptr [[PTR_DEC3]], i64 -2
; CHECK-NEXT: [[LOAD_2:%.*]] = load i8, ptr [[PTR_DEC_1]], align 1
; CHECK-NEXT: [[CMP_VAL_2:%.*]] = icmp eq i8 [[LOAD_2]], [[VAL]]
; CHECK-NEXT: br i1 [[CMP_VAL_2]], label %[[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT12:.*]], label %[[FOR_COND_2:.*]]
; CHECK: [[FOR_COND_2]]:
; CHECK-NEXT: [[PTR_DEC_2:%.*]] = getelementptr inbounds i8, ptr [[PTR_DEC3]], i64 -3
; CHECK-NEXT: [[LOAD_3:%.*]] = load i8, ptr [[PTR_DEC_2]], align 1
; CHECK-NEXT: [[CMP_VAL_3:%.*]] = icmp eq i8 [[LOAD_3]], [[VAL]]
; CHECK-NEXT: br i1 [[CMP_VAL_3]], label %[[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT10:.*]], label %[[FOR_COND_3:.*]]
; CHECK: [[FOR_COND_3]]:
; CHECK-NEXT: [[PTR_DEC_3:%.*]] = getelementptr inbounds i8, ptr [[PTR_DEC3]], i64 -4
; CHECK-NEXT: [[CMP_END:%.*]] = icmp eq ptr [[PTR_DEC_3]], [[BEGIN]]
; CHECK-NEXT: br i1 [[CMP_END]], label %[[COMMON_RET]], label %[[FOR_BODY]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[PTR_DEC3]] = phi ptr [ [[PTR_DEC_3]], %[[FOR_COND_3]] ], [ [[PTR_DEC3_UNR]], %[[FOR_BODY_PROL_LOOPEXIT]] ]
; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[PTR_DEC3]], align 1
; CHECK-NEXT: [[CMP_VAL:%.*]] = icmp eq i8 [[LOAD]], [[VAL]]
; CHECK-NEXT: br i1 [[CMP_VAL]], label %[[COMMON_RET]], label %[[FOR_COND1]]
; CHECK: [[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT10]]:
; CHECK-NEXT: [[PTR_DEC_2_LE:%.*]] = getelementptr inbounds i8, ptr [[PTR_DEC3]], i64 -3
; CHECK-NEXT: br label %[[COMMON_RET]]
; CHECK: [[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT12]]:
; CHECK-NEXT: [[PTR_DEC_1_LE:%.*]] = getelementptr inbounds i8, ptr [[PTR_DEC3]], i64 -2
; CHECK-NEXT: br label %[[COMMON_RET]]
; CHECK: [[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT14]]:
; CHECK-NEXT: [[PTR_DEC_LE:%.*]] = getelementptr inbounds i8, ptr [[PTR_DEC3]], i64 -1
; CHECK-NEXT: br label %[[COMMON_RET]]
; CHECK: [[COMMON_RET]]:
; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi ptr [ [[END]], %[[ENTRY]] ], [ [[END]], %[[FOR_BODY_PROL_LOOPEXIT]] ], [ [[PTR_DEC3]], %[[FOR_BODY]] ], [ [[PTR_DEC_LE]], %[[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT14]] ], [ [[PTR_DEC_2_LE]], %[[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT10]] ], [ [[PTR_DEC_1_LE]], %[[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT12]] ], [ [[END]], %[[FOR_COND_3]] ], [ [[PTR_DEC5]], %[[FOR_BODY_PROL]] ]
; CHECK-NEXT: ret ptr [[COMMON_RET_OP]]
;
entry:
%cmp.entry = icmp eq ptr %begin, %end
br i1 %cmp.entry, label %exit, label %for.cond
for.cond:
%ptr = phi ptr [ %end, %entry ], [ %ptr.dec, %for.body ]
%ptr.dec = getelementptr inbounds i8, ptr %ptr, i64 -1
%cmp.end = icmp eq ptr %ptr.dec, %begin
br i1 %cmp.end, label %not.found, label %for.body
for.body:
%load = load i8, ptr %ptr.dec, align 1
%cmp.val = icmp eq i8 %load, %val
br i1 %cmp.val, label %found, label %for.cond
found:
ret ptr %ptr.dec
not.found:
br label %exit
exit:
ret ptr %end
}
define i64 @rotate_needed_to_vectorize(ptr noalias %scan, ptr noalias %match) {
; CHECK-LABEL: define i64 @rotate_needed_to_vectorize(
; CHECK-SAME: ptr noalias readonly captures(none) [[SCAN:%.*]], ptr noalias readonly captures(none) [[MATCH:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[HEADER:.*]]
; CHECK: [[HEADER]]:
; CHECK-NEXT: [[LEN:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[EXIT:.*]] ]
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr [[SCAN]], i64 [[LEN]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[GEP1]], align 1
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds nuw i8, ptr [[MATCH]], i64 [[LEN]]
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[GEP2]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[TMP3:%.*]] = freeze <16 x i1> [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i1> [[TMP3]] to i16
; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i16 [[TMP4]], 0
; CHECK-NEXT: br i1 [[DOTNOT]], label %[[EXIT]], label %[[VECTOR_EARLY_EXIT:.*]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[LEN]], 16
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP5]], label %[[EXIT1:.*]], label %[[HEADER]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: [[VECTOR_EARLY_EXIT]]:
; CHECK-NEXT: [[TMP6:%.*]] = tail call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> [[TMP3]], i1 false)
; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[LEN]], [[TMP6]]
; CHECK-NEXT: br label %[[EXIT1]]
; CHECK: [[EXIT1]]:
; CHECK-NEXT: [[LEN_LCSSA:%.*]] = phi i64 [ [[TMP7]], %[[VECTOR_EARLY_EXIT]] ], [ 1024, %[[EXIT]] ]
; CHECK-NEXT: ret i64 [[LEN_LCSSA]]
;
entry:
call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %scan, i64 1024) ]
call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %match, i64 1024) ]
br label %header
header:
%len = phi i64 [ 0, %entry ], [ %len.next, %body ]
%cmp.limit = icmp eq i64 %len, 1024
br i1 %cmp.limit, label %exit, label %body
body:
%gep1 = getelementptr inbounds i8, ptr %scan, i64 %len
%v1 = load i8, ptr %gep1, align 1
%gep2 = getelementptr inbounds i8, ptr %match, i64 %len
%v2 = load i8, ptr %gep2, align 1
%cmp.data = icmp eq i8 %v1, %v2
%len.next = add nuw i64 %len, 1
br i1 %cmp.data, label %header, label %mismatch
mismatch:
br label %exit
exit:
%result = phi i64 [ 1024, %header ], [ %len, %mismatch ]
ret i64 %result
}
;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.unroll.disable"}
; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]]}
; CHECK: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
;.