[LoopRotate] Use SCEV exit counts to improve rotation profitability (#187483)
Most loop transformations, like unrolling and vectorization, expect the
latch branch to be countable. Allow rotation if it turns the latch from
uncountable to countable.
This uses SCEV to check for countable exits if CheckExitCount is set.
Currently it is not set for the LPM1 run (where SCEV is not used by
other passes), only in LPM.
With that, the compile-time impact is mostly neutral:
https://llvm-compile-time-tracker.com/compare.php?from=eba342d0ba930a404a026c80aada51c43974f0db&to=2e676337b45fae63ce9498116d8e6e43772363c5&stat=instructions:u
ClamAV is consistently slower (~+0.15%) and 7zip is faster in most cases
(~-0.13%).
Across a large test set based on C/C++ workloads, this rotates ~0.8%
more loops with ~2.68M rotated loops.
For the test set, ~2.7% more loops are runtime-unrolled and ~6.36% more
early-exit loops are vectorized on ARM64 macOS.
This fixes a regression where std::ranges::find_last loops stopped
being runtime-unrolled after 5f648c370e, which changed the loop
structure so that we stopped rotating.
https://clang.godbolt.org/z/6baeE1av6
Based on https://github.com/llvm/llvm-project/pull/162654.
Co-authored-by: Marek Sedláček <mr.mareksedlacek@gmail.com>
PR: https://github.com/llvm/llvm-project/pull/187483
This commit is contained in:
parent
14de6dafee
commit
21f439f132
@ -24,7 +24,7 @@ class Loop;
|
||||
class LoopRotatePass : public PassInfoMixin<LoopRotatePass> {
|
||||
public:
|
||||
LoopRotatePass(bool EnableHeaderDuplication = true,
|
||||
bool PrepareForLTO = false);
|
||||
bool PrepareForLTO = false, bool CheckExitCount = false);
|
||||
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
|
||||
LoopStandardAnalysisResults &AR, LPMUpdater &U);
|
||||
|
||||
@ -34,6 +34,7 @@ public:
|
||||
private:
|
||||
const bool EnableHeaderDuplication;
|
||||
const bool PrepareForLTO;
|
||||
const bool CheckExitCount;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@ -37,7 +37,8 @@ LLVM_ABI bool LoopRotation(Loop *L, LoopInfo *LI,
|
||||
DominatorTree *DT, ScalarEvolution *SE,
|
||||
MemorySSAUpdater *MSSAU, const SimplifyQuery &SQ,
|
||||
bool RotationOnly, unsigned Threshold,
|
||||
bool IsUtilMode, bool PrepareForLTO = false);
|
||||
bool IsUtilMode, bool PrepareForLTO = false,
|
||||
bool CheckExitCount = false);
|
||||
|
||||
} // namespace llvm
|
||||
|
||||
|
||||
@ -1274,17 +1274,25 @@ Expected<LICMOptions> parseLICMOptions(StringRef Params) {
|
||||
return Result;
|
||||
}
|
||||
|
||||
Expected<std::pair<bool, bool>> parseLoopRotateOptions(StringRef Params) {
|
||||
std::pair<bool, bool> Result = {true, false};
|
||||
struct LoopRotateOptions {
|
||||
bool EnableHeaderDuplication = true;
|
||||
bool PrepareForLTO = false;
|
||||
bool CheckExitCount = false;
|
||||
};
|
||||
|
||||
Expected<LoopRotateOptions> parseLoopRotateOptions(StringRef Params) {
|
||||
LoopRotateOptions Result;
|
||||
while (!Params.empty()) {
|
||||
StringRef ParamName;
|
||||
std::tie(ParamName, Params) = Params.split(';');
|
||||
|
||||
bool Enable = !ParamName.consume_front("no-");
|
||||
if (ParamName == "header-duplication") {
|
||||
Result.first = Enable;
|
||||
Result.EnableHeaderDuplication = Enable;
|
||||
} else if (ParamName == "prepare-for-lto") {
|
||||
Result.second = Enable;
|
||||
Result.PrepareForLTO = Enable;
|
||||
} else if (ParamName == "check-exit-count") {
|
||||
Result.CheckExitCount = Enable;
|
||||
} else {
|
||||
return make_error<StringError>(
|
||||
formatv("invalid LoopRotate pass parameter '{}'", ParamName).str(),
|
||||
|
||||
@ -1579,7 +1579,7 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
|
||||
// Disable header duplication at -Oz.
|
||||
LPM.addPass(LoopRotatePass(EnableLoopHeaderDuplication ||
|
||||
Level != OptimizationLevel::Oz,
|
||||
LTOPreLink));
|
||||
LTOPreLink, /*CheckExitCount=*/true));
|
||||
// Some loops may have become dead by now. Try to delete them.
|
||||
// FIXME: see discussion in https://reviews.llvm.org/D112851,
|
||||
// this may need to be revisited once we run GVN before loop deletion
|
||||
|
||||
@ -800,12 +800,14 @@ LOOP_PASS_WITH_PARAMS(
|
||||
parseLICMOptions, "allowspeculation;no-allowspeculation")
|
||||
LOOP_PASS_WITH_PARAMS(
|
||||
"loop-rotate", "LoopRotatePass",
|
||||
[](std::pair<bool, bool> Params) {
|
||||
return LoopRotatePass(Params.first, Params.second);
|
||||
[](LoopRotateOptions Params) {
|
||||
return LoopRotatePass(Params.EnableHeaderDuplication, Params.PrepareForLTO,
|
||||
Params.CheckExitCount);
|
||||
},
|
||||
parseLoopRotateOptions,
|
||||
"no-header-duplication;header-duplication;"
|
||||
"no-prepare-for-lto;prepare-for-lto")
|
||||
"no-prepare-for-lto;prepare-for-lto;"
|
||||
"no-check-exit-count;check-exit-count")
|
||||
LOOP_PASS_WITH_PARAMS(
|
||||
"simple-loop-unswitch", "SimpleLoopUnswitchPass",
|
||||
[](std::pair<bool, bool> Params) {
|
||||
|
||||
@ -38,9 +38,10 @@ static cl::opt<bool> PrepareForLTOOption(
|
||||
cl::desc("Run loop-rotation in the prepare-for-lto stage. This option "
|
||||
"should be used for testing only."));
|
||||
|
||||
LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication, bool PrepareForLTO)
|
||||
LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication, bool PrepareForLTO,
|
||||
bool CheckExitCount)
|
||||
: EnableHeaderDuplication(EnableHeaderDuplication),
|
||||
PrepareForLTO(PrepareForLTO) {}
|
||||
PrepareForLTO(PrepareForLTO), CheckExitCount(CheckExitCount) {}
|
||||
|
||||
void LoopRotatePass::printPipeline(
|
||||
raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
|
||||
@ -53,7 +54,11 @@ void LoopRotatePass::printPipeline(
|
||||
|
||||
if (!PrepareForLTO)
|
||||
OS << "no-";
|
||||
OS << "prepare-for-lto";
|
||||
OS << "prepare-for-lto;";
|
||||
|
||||
if (!CheckExitCount)
|
||||
OS << "no-";
|
||||
OS << "check-exit-count";
|
||||
OS << ">";
|
||||
}
|
||||
|
||||
@ -74,9 +79,10 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM,
|
||||
std::optional<MemorySSAUpdater> MSSAU;
|
||||
if (AR.MSSA)
|
||||
MSSAU = MemorySSAUpdater(AR.MSSA);
|
||||
bool Changed = LoopRotation(&L, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE,
|
||||
MSSAU ? &*MSSAU : nullptr, SQ, false, Threshold,
|
||||
false, PrepareForLTO || PrepareForLTOOption);
|
||||
bool Changed =
|
||||
LoopRotation(&L, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE,
|
||||
MSSAU ? &*MSSAU : nullptr, SQ, false, Threshold, false,
|
||||
PrepareForLTO || PrepareForLTOOption, CheckExitCount);
|
||||
|
||||
if (!Changed)
|
||||
return PreservedAnalyses::all();
|
||||
|
||||
@ -63,16 +63,18 @@ class LoopRotate {
|
||||
bool RotationOnly;
|
||||
bool IsUtilMode;
|
||||
bool PrepareForLTO;
|
||||
bool CheckExitCount;
|
||||
|
||||
public:
|
||||
LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI,
|
||||
const TargetTransformInfo *TTI, AssumptionCache *AC,
|
||||
DominatorTree *DT, ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
|
||||
const SimplifyQuery &SQ, bool RotationOnly, bool IsUtilMode,
|
||||
bool PrepareForLTO)
|
||||
bool PrepareForLTO, bool CheckExitCount)
|
||||
: MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE),
|
||||
MSSAU(MSSAU), SQ(SQ), RotationOnly(RotationOnly),
|
||||
IsUtilMode(IsUtilMode), PrepareForLTO(PrepareForLTO) {}
|
||||
IsUtilMode(IsUtilMode), PrepareForLTO(PrepareForLTO),
|
||||
CheckExitCount(CheckExitCount) {}
|
||||
bool processLoop(Loop *L);
|
||||
|
||||
private:
|
||||
@ -178,11 +180,12 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
|
||||
}
|
||||
}
|
||||
|
||||
// Assuming both header and latch are exiting, look for a phi which is only
|
||||
// used outside the loop (via a LCSSA phi) in the exit from the header.
|
||||
// This means that rotating the loop can remove the phi.
|
||||
static bool profitableToRotateLoopExitingLatch(Loop *L) {
|
||||
// Assuming both header and latch are exiting, check if rotating is profitable:
|
||||
// either a header phi becomes dead, or rotating makes the latch exit count
|
||||
// computable (enabling downstream optimizations like unrolling/vectorization).
|
||||
static bool profitableToRotateLoopExitingLatch(Loop *L, ScalarEvolution *SE) {
|
||||
BasicBlock *Header = L->getHeader();
|
||||
BasicBlock *Latch = L->getLoopLatch();
|
||||
CondBrInst *BI = dyn_cast<CondBrInst>(Header->getTerminator());
|
||||
BasicBlock *HeaderExit = BI->getSuccessor(0);
|
||||
if (L->contains(HeaderExit))
|
||||
@ -196,6 +199,13 @@ static bool profitableToRotateLoopExitingLatch(Loop *L) {
|
||||
continue;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check if rotating would make the latch exit count computable, enabling
|
||||
// optimizations like runtime unrolling and vectorization.
|
||||
if (SE && isa<SCEVCouldNotCompute>(SE->getExitCount(L, Latch)) &&
|
||||
!isa<SCEVCouldNotCompute>(SE->getExitCount(L, Header)))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -363,7 +373,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
|
||||
// Rotate if the loop latch was just simplified. Or if it makes the loop exit
|
||||
// count computable. Or if we think it will be profitable.
|
||||
if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false &&
|
||||
!profitableToRotateLoopExitingLatch(L))
|
||||
!profitableToRotateLoopExitingLatch(L, CheckExitCount ? SE : nullptr))
|
||||
return Rotated;
|
||||
|
||||
// Check size of original header and reject loop if it is very big or we can't
|
||||
@ -965,8 +975,9 @@ bool llvm::LoopRotation(Loop *L, LoopInfo *LI, const TargetTransformInfo *TTI,
|
||||
ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
|
||||
const SimplifyQuery &SQ, bool RotationOnly = true,
|
||||
unsigned Threshold = unsigned(-1),
|
||||
bool IsUtilMode = true, bool PrepareForLTO) {
|
||||
bool IsUtilMode = true, bool PrepareForLTO,
|
||||
bool CheckExitCount) {
|
||||
LoopRotate LR(Threshold, LI, TTI, AC, DT, SE, MSSAU, SQ, RotationOnly,
|
||||
IsUtilMode, PrepareForLTO);
|
||||
IsUtilMode, PrepareForLTO, CheckExitCount);
|
||||
return LR.processLoop(L);
|
||||
}
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
; CHECK-0: function(adce),function(adce)
|
||||
|
||||
; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='module(rpo-function-attrs,require<globals-aa>,function(float2int,lower-constant-intrinsics,loop(loop-rotate)),invalidate<globals-aa>)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-1
|
||||
; CHECK-1: rpo-function-attrs,require<globals-aa>,function(float2int,lower-constant-intrinsics,loop(loop-rotate<header-duplication;no-prepare-for-lto>)),invalidate<globals-aa>
|
||||
; CHECK-1: rpo-function-attrs,require<globals-aa>,function(float2int,lower-constant-intrinsics,loop(loop-rotate<header-duplication;no-prepare-for-lto;no-check-exit-count>)),invalidate<globals-aa>
|
||||
|
||||
;; Test that we get ClassName printed when there is no ClassName to pass-name mapping (as is the case for the BitcodeWriterPass).
|
||||
; RUN: opt -o /dev/null -disable-verify -print-pipeline-passes -passes='function(mem2reg)' < %s -disable-pipeline-verification | FileCheck %s --match-full-lines --check-prefixes=CHECK-3
|
||||
@ -66,7 +66,7 @@
|
||||
|
||||
;; Test that the loop-nest-pass lnicm is printed with the other loop-passes in the pipeline.
|
||||
; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(loop-mssa(licm,loop-rotate,loop-deletion,lnicm,loop-rotate))' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-23
|
||||
; CHECK-23: function(loop-mssa(licm<allowspeculation>,loop-rotate<header-duplication;no-prepare-for-lto>,loop-deletion,lnicm<allowspeculation>,loop-rotate<header-duplication;no-prepare-for-lto>))
|
||||
; CHECK-23: function(loop-mssa(licm<allowspeculation>,loop-rotate<header-duplication;no-prepare-for-lto;no-check-exit-count>,loop-deletion,lnicm<allowspeculation>,loop-rotate<header-duplication;no-prepare-for-lto;no-check-exit-count>))
|
||||
|
||||
;; Test that -debugify and -check-debugify is printed correctly.
|
||||
; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='debugify,no-op-function,check-debugify' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-24
|
||||
@ -110,7 +110,7 @@
|
||||
; CHECK-32: cgscc(function<no-rerun>(no-op-function))
|
||||
|
||||
; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(loop(loop-rotate<no-header-duplication;no-prepare-for-lto>))' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-33
|
||||
; CHECK-33: function(loop(loop-rotate<no-header-duplication;no-prepare-for-lto>))
|
||||
; CHECK-33: function(loop(loop-rotate<no-header-duplication;no-prepare-for-lto;no-check-exit-count>))
|
||||
|
||||
; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='globaldce' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-34
|
||||
; CHECK-34: globaldce
|
||||
|
||||
155
llvm/test/Transforms/LoopRotate/rotate-exitcount.ll
Normal file
155
llvm/test/Transforms/LoopRotate/rotate-exitcount.ll
Normal file
@ -0,0 +1,155 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
||||
; RUN: opt -S -passes='loop(loop-rotate<header-duplication>)' %s | FileCheck %s --check-prefixes=CHECK,NO-EXIT-COUNT
|
||||
; RUN: opt -S -passes='loop(loop-rotate<header-duplication;check-exit-count>)' %s | FileCheck %s --check-prefixes=CHECK,EXIT-COUNT
|
||||
|
||||
; Computable header exit, data-dependent latch exit. Rotated with check-exit-count.
|
||||
define ptr @search_loop(ptr %begin, ptr %end, i8 %val) {
|
||||
; NO-EXIT-COUNT-LABEL: @search_loop(
|
||||
; NO-EXIT-COUNT-NEXT: entry:
|
||||
; NO-EXIT-COUNT-NEXT: [[CMP_ENTRY:%.*]] = icmp eq ptr [[BEGIN:%.*]], [[END:%.*]]
|
||||
; NO-EXIT-COUNT-NEXT: br i1 [[CMP_ENTRY]], label [[EXIT:%.*]], label [[FOR_COND_PREHEADER:%.*]]
|
||||
; NO-EXIT-COUNT: for.cond.preheader:
|
||||
; NO-EXIT-COUNT-NEXT: br label [[FOR_COND:%.*]]
|
||||
; NO-EXIT-COUNT: for.cond:
|
||||
; NO-EXIT-COUNT-NEXT: [[PTR:%.*]] = phi ptr [ [[PTR_DEC:%.*]], [[FOR_BODY:%.*]] ], [ [[END]], [[FOR_COND_PREHEADER]] ]
|
||||
; NO-EXIT-COUNT-NEXT: [[PTR_DEC]] = getelementptr inbounds i8, ptr [[PTR]], i64 -1
|
||||
; NO-EXIT-COUNT-NEXT: [[CMP_END:%.*]] = icmp eq ptr [[PTR_DEC]], [[BEGIN]]
|
||||
; NO-EXIT-COUNT-NEXT: br i1 [[CMP_END]], label [[NOT_FOUND:%.*]], label [[FOR_BODY]]
|
||||
; NO-EXIT-COUNT: for.body:
|
||||
; NO-EXIT-COUNT-NEXT: [[LOAD:%.*]] = load i8, ptr [[PTR_DEC]], align 1
|
||||
; NO-EXIT-COUNT-NEXT: [[CMP_VAL:%.*]] = icmp eq i8 [[LOAD]], [[VAL:%.*]]
|
||||
; NO-EXIT-COUNT-NEXT: br i1 [[CMP_VAL]], label [[FOUND:%.*]], label [[FOR_COND]]
|
||||
; NO-EXIT-COUNT: found:
|
||||
; NO-EXIT-COUNT-NEXT: [[PTR_DEC_LCSSA1:%.*]] = phi ptr [ [[PTR_DEC]], [[FOR_BODY]] ]
|
||||
; NO-EXIT-COUNT-NEXT: ret ptr [[PTR_DEC_LCSSA1]]
|
||||
; NO-EXIT-COUNT: not.found:
|
||||
; NO-EXIT-COUNT-NEXT: br label [[EXIT]]
|
||||
; NO-EXIT-COUNT: exit:
|
||||
; NO-EXIT-COUNT-NEXT: ret ptr [[END]]
|
||||
;
|
||||
; EXIT-COUNT-LABEL: @search_loop(
|
||||
; EXIT-COUNT-NEXT: entry:
|
||||
; EXIT-COUNT-NEXT: [[CMP_ENTRY:%.*]] = icmp eq ptr [[BEGIN:%.*]], [[END:%.*]]
|
||||
; EXIT-COUNT-NEXT: br i1 [[CMP_ENTRY]], label [[EXIT:%.*]], label [[FOR_COND_PREHEADER:%.*]]
|
||||
; EXIT-COUNT: for.cond.preheader:
|
||||
; EXIT-COUNT-NEXT: [[PTR_DEC2:%.*]] = getelementptr inbounds i8, ptr [[END]], i64 -1
|
||||
; EXIT-COUNT-NEXT: [[CMP_END3:%.*]] = icmp eq ptr [[PTR_DEC2]], [[BEGIN]]
|
||||
; EXIT-COUNT-NEXT: br i1 [[CMP_END3]], label [[NOT_FOUND:%.*]], label [[FOR_BODY_LR_PH:%.*]]
|
||||
; EXIT-COUNT: for.body.lr.ph:
|
||||
; EXIT-COUNT-NEXT: br label [[FOR_BODY:%.*]]
|
||||
; EXIT-COUNT: for.cond:
|
||||
; EXIT-COUNT-NEXT: [[PTR:%.*]] = phi ptr [ [[PTR_DEC4:%.*]], [[FOR_BODY]] ]
|
||||
; EXIT-COUNT-NEXT: [[PTR_DEC:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 -1
|
||||
; EXIT-COUNT-NEXT: [[CMP_END:%.*]] = icmp eq ptr [[PTR_DEC]], [[BEGIN]]
|
||||
; EXIT-COUNT-NEXT: br i1 [[CMP_END]], label [[FOR_COND_NOT_FOUND_CRIT_EDGE:%.*]], label [[FOR_BODY]]
|
||||
; EXIT-COUNT: for.body:
|
||||
; EXIT-COUNT-NEXT: [[PTR_DEC4]] = phi ptr [ [[PTR_DEC2]], [[FOR_BODY_LR_PH]] ], [ [[PTR_DEC]], [[FOR_COND:%.*]] ]
|
||||
; EXIT-COUNT-NEXT: [[LOAD:%.*]] = load i8, ptr [[PTR_DEC4]], align 1
|
||||
; EXIT-COUNT-NEXT: [[CMP_VAL:%.*]] = icmp eq i8 [[LOAD]], [[VAL:%.*]]
|
||||
; EXIT-COUNT-NEXT: br i1 [[CMP_VAL]], label [[FOUND:%.*]], label [[FOR_COND]]
|
||||
; EXIT-COUNT: found:
|
||||
; EXIT-COUNT-NEXT: [[PTR_DEC_LCSSA1:%.*]] = phi ptr [ [[PTR_DEC4]], [[FOR_BODY]] ]
|
||||
; EXIT-COUNT-NEXT: ret ptr [[PTR_DEC_LCSSA1]]
|
||||
; EXIT-COUNT: for.cond.not.found_crit_edge:
|
||||
; EXIT-COUNT-NEXT: br label [[NOT_FOUND]]
|
||||
; EXIT-COUNT: not.found:
|
||||
; EXIT-COUNT-NEXT: br label [[EXIT]]
|
||||
; EXIT-COUNT: exit:
|
||||
; EXIT-COUNT-NEXT: ret ptr [[END]]
|
||||
;
|
||||
entry:
|
||||
%cmp.entry = icmp eq ptr %begin, %end
|
||||
br i1 %cmp.entry, label %exit, label %for.cond
|
||||
|
||||
for.cond:
|
||||
%ptr = phi ptr [ %end, %entry ], [ %ptr.dec, %for.body ]
|
||||
%ptr.dec = getelementptr inbounds i8, ptr %ptr, i64 -1
|
||||
%cmp.end = icmp eq ptr %ptr.dec, %begin
|
||||
br i1 %cmp.end, label %not.found, label %for.body
|
||||
|
||||
for.body:
|
||||
%load = load i8, ptr %ptr.dec, align 1
|
||||
%cmp.val = icmp eq i8 %load, %val
|
||||
br i1 %cmp.val, label %found, label %for.cond
|
||||
|
||||
found:
|
||||
ret ptr %ptr.dec
|
||||
|
||||
not.found:
|
||||
br label %exit
|
||||
|
||||
exit:
|
||||
ret ptr %end
|
||||
}
|
||||
|
||||
; Both exits are memory-dependent (not computable). Not rotated.
|
||||
define i32 @both_mem_dependent(ptr %p, ptr %q, i32 %n) {
|
||||
; CHECK-LABEL: @both_mem_dependent(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: br label [[LOOP:%.*]]
|
||||
; CHECK: loop:
|
||||
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BODY:%.*]] ]
|
||||
; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[P:%.*]], align 4
|
||||
; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[LOAD1]], 0
|
||||
; CHECK-NEXT: br i1 [[CMP1]], label [[EXIT1:%.*]], label [[BODY]]
|
||||
; CHECK: body:
|
||||
; CHECK-NEXT: [[LOAD2:%.*]] = load i32, ptr [[Q:%.*]], align 4
|
||||
; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[LOAD2]], 0
|
||||
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
|
||||
; CHECK-NEXT: br i1 [[CMP2]], label [[EXIT2:%.*]], label [[LOOP]]
|
||||
; CHECK: exit1:
|
||||
; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i32 [ [[IV]], [[LOOP]] ]
|
||||
; CHECK-NEXT: ret i32 [[IV_LCSSA]]
|
||||
; CHECK: exit2:
|
||||
; CHECK-NEXT: [[IV_LCSSA1:%.*]] = phi i32 [ [[IV]], [[BODY]] ]
|
||||
; CHECK-NEXT: ret i32 [[IV_LCSSA1]]
|
||||
;
|
||||
entry:
|
||||
br label %loop
|
||||
|
||||
loop:
|
||||
%iv = phi i32 [ 0, %entry ], [ %iv.next, %body ]
|
||||
%load1 = load i32, ptr %p, align 4
|
||||
%cmp1 = icmp eq i32 %load1, 0
|
||||
br i1 %cmp1, label %exit1, label %body
|
||||
|
||||
body:
|
||||
%load2 = load i32, ptr %q, align 4
|
||||
%cmp2 = icmp eq i32 %load2, 0
|
||||
%iv.next = add i32 %iv, 1
|
||||
br i1 %cmp2, label %exit2, label %loop
|
||||
|
||||
exit1:
|
||||
ret i32 %iv
|
||||
|
||||
exit2:
|
||||
ret i32 %iv
|
||||
}
|
||||
|
||||
; Latch is unconditional, already properly structured.
|
||||
define void @already_rotated(ptr %p, i32 %n) {
|
||||
; CHECK-LABEL: @already_rotated(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: br label [[LOOP:%.*]]
|
||||
; CHECK: loop:
|
||||
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
|
||||
; CHECK-NEXT: store i32 [[IV]], ptr [[P:%.*]], align 4
|
||||
; CHECK-NEXT: [[IV_NEXT]] = add nuw i32 [[IV]], 1
|
||||
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], [[N:%.*]]
|
||||
; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
|
||||
; CHECK: exit:
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
br label %loop
|
||||
|
||||
loop:
|
||||
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
|
||||
store i32 %iv, ptr %p, align 4
|
||||
%iv.next = add nuw i32 %iv, 1
|
||||
%cmp = icmp eq i32 %iv.next, %n
|
||||
br i1 %cmp, label %exit, label %loop
|
||||
|
||||
exit:
|
||||
ret void
|
||||
}
|
||||
@ -0,0 +1,164 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
|
||||
; RUN: opt -S -O3 %s | FileCheck %s
|
||||
|
||||
target triple = "aarch64-unknown-linux-gnu"
|
||||
|
||||
; Test that a search loop with computable header exit gets rotated and
|
||||
; runtime-unrolled at -O3.
|
||||
define ptr @search_loop_unrolled(ptr %begin, ptr %end, i8 %val) {
|
||||
; CHECK-LABEL: define ptr @search_loop_unrolled(
|
||||
; CHECK-SAME: ptr readnone captures(address) [[BEGIN:%.*]], ptr readonly captures(address, ret: address, provenance) [[END:%.*]], i8 [[VAL:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
|
||||
; CHECK-NEXT: [[ENTRY:.*]]:
|
||||
; CHECK-NEXT: [[CMP_ENTRY:%.*]] = icmp eq ptr [[BEGIN]], [[END]]
|
||||
; CHECK-NEXT: [[PTR_DEC1:%.*]] = getelementptr inbounds i8, ptr [[END]], i64 -1
|
||||
; CHECK-NEXT: [[CMP_END2:%.*]] = icmp eq ptr [[PTR_DEC1]], [[BEGIN]]
|
||||
; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[CMP_ENTRY]], i1 true, i1 [[CMP_END2]]
|
||||
; CHECK-NEXT: br i1 [[OR_COND]], label %[[COMMON_RET:.*]], label %[[FOR_COND:.*]]
|
||||
; CHECK: [[FOR_COND]]:
|
||||
; CHECK-NEXT: [[END5:%.*]] = ptrtoint ptr [[END]] to i64
|
||||
; CHECK-NEXT: [[BEGIN6:%.*]] = ptrtoint ptr [[BEGIN]] to i64
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[BEGIN6]], -1
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], [[END5]]
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = freeze i64 [[TMP1]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], -1
|
||||
; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP2]], 3
|
||||
; CHECK-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0
|
||||
; CHECK-NEXT: br i1 [[LCMP_MOD_NOT]], label %[[FOR_BODY_PROL_LOOPEXIT:.*]], label %[[FOR_BODY_PROL:.*]]
|
||||
; CHECK: [[FOR_BODY_PROL]]:
|
||||
; CHECK-NEXT: [[PTR_DEC5:%.*]] = phi ptr [ [[PTR_DEC4:%.*]], %[[FOR_COND_PROL:.*]] ], [ [[PTR_DEC1]], %[[FOR_COND]] ]
|
||||
; CHECK-NEXT: [[PROL_ITER:%.*]] = phi i64 [ [[PROL_ITER_NEXT:%.*]], %[[FOR_COND_PROL]] ], [ 0, %[[FOR_COND]] ]
|
||||
; CHECK-NEXT: [[LOAD_PROL:%.*]] = load i8, ptr [[PTR_DEC5]], align 1
|
||||
; CHECK-NEXT: [[CMP_VAL_PROL:%.*]] = icmp eq i8 [[LOAD_PROL]], [[VAL]]
|
||||
; CHECK-NEXT: br i1 [[CMP_VAL_PROL]], label %[[COMMON_RET]], label %[[FOR_COND_PROL]]
|
||||
; CHECK: [[FOR_COND_PROL]]:
|
||||
; CHECK-NEXT: [[PTR_DEC4]] = getelementptr inbounds i8, ptr [[PTR_DEC5]], i64 -1
|
||||
; CHECK-NEXT: [[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1
|
||||
; CHECK-NEXT: [[PROL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[PROL_ITER_NEXT]], [[XTRAITER]]
|
||||
; CHECK-NEXT: br i1 [[PROL_ITER_CMP_NOT]], label %[[FOR_BODY_PROL_LOOPEXIT]], label %[[FOR_BODY_PROL]], !llvm.loop [[LOOP0:![0-9]+]]
|
||||
; CHECK: [[FOR_BODY_PROL_LOOPEXIT]]:
|
||||
; CHECK-NEXT: [[PTR_DEC3_UNR:%.*]] = phi ptr [ [[PTR_DEC1]], %[[FOR_COND]] ], [ [[PTR_DEC4]], %[[FOR_COND_PROL]] ]
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 3
|
||||
; CHECK-NEXT: br i1 [[TMP4]], label %[[COMMON_RET]], label %[[FOR_BODY:.*]]
|
||||
; CHECK: [[FOR_COND1:.*]]:
|
||||
; CHECK-NEXT: [[PTR_DEC:%.*]] = getelementptr inbounds i8, ptr [[PTR_DEC3:%.*]], i64 -1
|
||||
; CHECK-NEXT: [[LOAD_1:%.*]] = load i8, ptr [[PTR_DEC]], align 1
|
||||
; CHECK-NEXT: [[CMP_VAL_1:%.*]] = icmp eq i8 [[LOAD_1]], [[VAL]]
|
||||
; CHECK-NEXT: br i1 [[CMP_VAL_1]], label %[[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT14:.*]], label %[[FOR_COND_1:.*]]
|
||||
; CHECK: [[FOR_COND_1]]:
|
||||
; CHECK-NEXT: [[PTR_DEC_1:%.*]] = getelementptr inbounds i8, ptr [[PTR_DEC3]], i64 -2
|
||||
; CHECK-NEXT: [[LOAD_2:%.*]] = load i8, ptr [[PTR_DEC_1]], align 1
|
||||
; CHECK-NEXT: [[CMP_VAL_2:%.*]] = icmp eq i8 [[LOAD_2]], [[VAL]]
|
||||
; CHECK-NEXT: br i1 [[CMP_VAL_2]], label %[[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT12:.*]], label %[[FOR_COND_2:.*]]
|
||||
; CHECK: [[FOR_COND_2]]:
|
||||
; CHECK-NEXT: [[PTR_DEC_2:%.*]] = getelementptr inbounds i8, ptr [[PTR_DEC3]], i64 -3
|
||||
; CHECK-NEXT: [[LOAD_3:%.*]] = load i8, ptr [[PTR_DEC_2]], align 1
|
||||
; CHECK-NEXT: [[CMP_VAL_3:%.*]] = icmp eq i8 [[LOAD_3]], [[VAL]]
|
||||
; CHECK-NEXT: br i1 [[CMP_VAL_3]], label %[[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT10:.*]], label %[[FOR_COND_3:.*]]
|
||||
; CHECK: [[FOR_COND_3]]:
|
||||
; CHECK-NEXT: [[PTR_DEC_3:%.*]] = getelementptr inbounds i8, ptr [[PTR_DEC3]], i64 -4
|
||||
; CHECK-NEXT: [[CMP_END:%.*]] = icmp eq ptr [[PTR_DEC_3]], [[BEGIN]]
|
||||
; CHECK-NEXT: br i1 [[CMP_END]], label %[[COMMON_RET]], label %[[FOR_BODY]]
|
||||
; CHECK: [[FOR_BODY]]:
|
||||
; CHECK-NEXT: [[PTR_DEC3]] = phi ptr [ [[PTR_DEC_3]], %[[FOR_COND_3]] ], [ [[PTR_DEC3_UNR]], %[[FOR_BODY_PROL_LOOPEXIT]] ]
|
||||
; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[PTR_DEC3]], align 1
|
||||
; CHECK-NEXT: [[CMP_VAL:%.*]] = icmp eq i8 [[LOAD]], [[VAL]]
|
||||
; CHECK-NEXT: br i1 [[CMP_VAL]], label %[[COMMON_RET]], label %[[FOR_COND1]]
|
||||
; CHECK: [[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT10]]:
|
||||
; CHECK-NEXT: [[PTR_DEC_2_LE:%.*]] = getelementptr inbounds i8, ptr [[PTR_DEC3]], i64 -3
|
||||
; CHECK-NEXT: br label %[[COMMON_RET]]
|
||||
; CHECK: [[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT12]]:
|
||||
; CHECK-NEXT: [[PTR_DEC_1_LE:%.*]] = getelementptr inbounds i8, ptr [[PTR_DEC3]], i64 -2
|
||||
; CHECK-NEXT: br label %[[COMMON_RET]]
|
||||
; CHECK: [[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT14]]:
|
||||
; CHECK-NEXT: [[PTR_DEC_LE:%.*]] = getelementptr inbounds i8, ptr [[PTR_DEC3]], i64 -1
|
||||
; CHECK-NEXT: br label %[[COMMON_RET]]
|
||||
; CHECK: [[COMMON_RET]]:
|
||||
; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi ptr [ [[END]], %[[ENTRY]] ], [ [[END]], %[[FOR_BODY_PROL_LOOPEXIT]] ], [ [[PTR_DEC3]], %[[FOR_BODY]] ], [ [[PTR_DEC_LE]], %[[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT14]] ], [ [[PTR_DEC_2_LE]], %[[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT10]] ], [ [[PTR_DEC_1_LE]], %[[COMMON_RET_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT12]] ], [ [[END]], %[[FOR_COND_3]] ], [ [[PTR_DEC5]], %[[FOR_BODY_PROL]] ]
|
||||
; CHECK-NEXT: ret ptr [[COMMON_RET_OP]]
|
||||
;
|
||||
entry:
|
||||
%cmp.entry = icmp eq ptr %begin, %end
|
||||
br i1 %cmp.entry, label %exit, label %for.cond
|
||||
|
||||
for.cond:
|
||||
%ptr = phi ptr [ %end, %entry ], [ %ptr.dec, %for.body ]
|
||||
%ptr.dec = getelementptr inbounds i8, ptr %ptr, i64 -1
|
||||
%cmp.end = icmp eq ptr %ptr.dec, %begin
|
||||
br i1 %cmp.end, label %not.found, label %for.body
|
||||
|
||||
for.body:
|
||||
%load = load i8, ptr %ptr.dec, align 1
|
||||
%cmp.val = icmp eq i8 %load, %val
|
||||
br i1 %cmp.val, label %found, label %for.cond
|
||||
|
||||
found:
|
||||
ret ptr %ptr.dec
|
||||
|
||||
not.found:
|
||||
br label %exit
|
||||
|
||||
exit:
|
||||
ret ptr %end
|
||||
}
|
||||
|
||||
define i64 @rotate_needed_to_vectorize(ptr noalias %scan, ptr noalias %match) {
|
||||
; CHECK-LABEL: define i64 @rotate_needed_to_vectorize(
|
||||
; CHECK-SAME: ptr noalias readonly captures(none) [[SCAN:%.*]], ptr noalias readonly captures(none) [[MATCH:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
|
||||
; CHECK-NEXT: [[ENTRY:.*]]:
|
||||
; CHECK-NEXT: br label %[[HEADER:.*]]
|
||||
; CHECK: [[HEADER]]:
|
||||
; CHECK-NEXT: [[LEN:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[EXIT:.*]] ]
|
||||
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr [[SCAN]], i64 [[LEN]]
|
||||
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[GEP1]], align 1
|
||||
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds nuw i8, ptr [[MATCH]], i64 [[LEN]]
|
||||
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[GEP2]], align 1
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = freeze <16 x i1> [[TMP2]]
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i1> [[TMP3]] to i16
|
||||
; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i16 [[TMP4]], 0
|
||||
; CHECK-NEXT: br i1 [[DOTNOT]], label %[[EXIT]], label %[[VECTOR_EARLY_EXIT:.*]]
|
||||
; CHECK: [[EXIT]]:
|
||||
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[LEN]], 16
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
|
||||
; CHECK-NEXT: br i1 [[TMP5]], label %[[EXIT1:.*]], label %[[HEADER]], !llvm.loop [[LOOP2:![0-9]+]]
|
||||
; CHECK: [[VECTOR_EARLY_EXIT]]:
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = tail call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> [[TMP3]], i1 false)
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[LEN]], [[TMP6]]
|
||||
; CHECK-NEXT: br label %[[EXIT1]]
|
||||
; CHECK: [[EXIT1]]:
|
||||
; CHECK-NEXT: [[LEN_LCSSA:%.*]] = phi i64 [ [[TMP7]], %[[VECTOR_EARLY_EXIT]] ], [ 1024, %[[EXIT]] ]
|
||||
; CHECK-NEXT: ret i64 [[LEN_LCSSA]]
|
||||
;
|
||||
entry:
|
||||
call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %scan, i64 1024) ]
|
||||
call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %match, i64 1024) ]
|
||||
br label %header
|
||||
|
||||
header:
|
||||
%len = phi i64 [ 0, %entry ], [ %len.next, %body ]
|
||||
%cmp.limit = icmp eq i64 %len, 1024
|
||||
br i1 %cmp.limit, label %exit, label %body
|
||||
|
||||
body:
|
||||
%gep1 = getelementptr inbounds i8, ptr %scan, i64 %len
|
||||
%v1 = load i8, ptr %gep1, align 1
|
||||
%gep2 = getelementptr inbounds i8, ptr %match, i64 %len
|
||||
%v2 = load i8, ptr %gep2, align 1
|
||||
%cmp.data = icmp eq i8 %v1, %v2
|
||||
%len.next = add nuw i64 %len, 1
|
||||
br i1 %cmp.data, label %header, label %mismatch
|
||||
|
||||
mismatch:
|
||||
br label %exit
|
||||
|
||||
exit:
|
||||
%result = phi i64 [ 1024, %header ], [ %len, %mismatch ]
|
||||
ret i64 %result
|
||||
}
|
||||
;.
|
||||
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
|
||||
; CHECK: [[META1]] = !{!"llvm.loop.unroll.disable"}
|
||||
; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]]}
|
||||
; CHECK: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
|
||||
; CHECK: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
|
||||
;.
|
||||
Loading…
x
Reference in New Issue
Block a user