
D84108 exposed a bad interaction between inlining and loop-rotation during regular LTO, which is causing notable regressions in at least CINT2006/473.astar. The problem boils down to: we now rotate a loop just before the vectorizer which requires duplicating a function call in the preheader when compiling the individual files ('prepare for LTO'). But this then prevents further inlining of the function during LTO. This patch tries to resolve this issue by making LoopRotate more conservative with respect to rotating loops that have inline-able calls during the 'prepare for LTO' stage. I think this change intuitively improves the current situation in general. Loop-rotate tries hard to avoid creating headers that are 'too big'. At the moment, it assumes all inlining already happened and the cost of duplicating a call is equal to just doing the call. But with LTO, inlining also happens during full LTO and it is possible that a previously duplicated call is actually a huge function which gets inlined during LTO. From the perspective of LV, not much should change overall. Most loops calling user-provided functions won't get vectorized to start with (unless we can infer that the function does not touch memory, has no other side effects). If we do not inline the 'inline-able' call during the LTO stage, we merely delayed loop-rotation & vectorization. If we inline during LTO, chances should be very high that the inlined code is itself vectorizable or the user call was not vectorizable to start with. There could of course be scenarios where we inline a sufficiently large function with code not profitable to vectorize, which would have be vectorized earlier (by scalarzing the call). But even in that case, there probably is no big performance impact, because it should be mostly down to the cost-model to reject vectorization in that case. And then the version with scalarized calls should also not be beneficial. In a way, LV should have strictly more information after inlining and make more accurate decisions (barring cost-model issues). There is of course plenty of room for things to go wrong unexpectedly, so we need to keep a close look at actual performance and address any follow-up issues. I took a look at the impact on statistics for MultiSource/SPEC2000/SPEC2006. There are a few benchmarks with fewer loops rotated, but no change to the number of loops vectorized. Reviewed By: sanwou01 Differential Revision: https://reviews.llvm.org/D94232
153 lines
5.9 KiB
C++
153 lines
5.9 KiB
C++
//===- LoopRotation.cpp - Loop Rotation Pass ------------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This file implements Loop Rotation Pass.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "llvm/Transforms/Scalar/LoopRotation.h"
|
|
#include "llvm/ADT/Statistic.h"
|
|
#include "llvm/Analysis/AssumptionCache.h"
|
|
#include "llvm/Analysis/InstructionSimplify.h"
|
|
#include "llvm/Analysis/LoopPass.h"
|
|
#include "llvm/Analysis/MemorySSA.h"
|
|
#include "llvm/Analysis/MemorySSAUpdater.h"
|
|
#include "llvm/Analysis/ScalarEvolution.h"
|
|
#include "llvm/Analysis/TargetTransformInfo.h"
|
|
#include "llvm/InitializePasses.h"
|
|
#include "llvm/Support/CommandLine.h"
|
|
#include "llvm/Support/Debug.h"
|
|
#include "llvm/Transforms/Scalar.h"
|
|
#include "llvm/Transforms/Scalar/LoopPassManager.h"
|
|
#include "llvm/Transforms/Utils/LoopRotationUtils.h"
|
|
#include "llvm/Transforms/Utils/LoopUtils.h"
|
|
using namespace llvm;
|
|
|
|
#define DEBUG_TYPE "loop-rotate"
|
|
|
|
static cl::opt<unsigned> DefaultRotationThreshold(
|
|
"rotation-max-header-size", cl::init(16), cl::Hidden,
|
|
cl::desc("The default maximum header size for automatic loop rotation"));
|
|
|
|
static cl::opt<bool> PrepareForLTOOption(
|
|
"rotation-prepare-for-lto", cl::init(false), cl::Hidden,
|
|
cl::desc("Run loop-rotation in the prepare-for-lto stage. This option "
|
|
"should be used for testing only."));
|
|
|
|
LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication, bool PrepareForLTO)
|
|
: EnableHeaderDuplication(EnableHeaderDuplication),
|
|
PrepareForLTO(PrepareForLTO) {}
|
|
|
|
PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM,
|
|
LoopStandardAnalysisResults &AR,
|
|
LPMUpdater &) {
|
|
// Vectorization requires loop-rotation. Use default threshold for loops the
|
|
// user explicitly marked for vectorization, even when header duplication is
|
|
// disabled.
|
|
int Threshold = EnableHeaderDuplication ||
|
|
hasVectorizeTransformation(&L) == TM_ForcedByUser
|
|
? DefaultRotationThreshold
|
|
: 0;
|
|
const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
|
|
const SimplifyQuery SQ = getBestSimplifyQuery(AR, DL);
|
|
|
|
Optional<MemorySSAUpdater> MSSAU;
|
|
if (AR.MSSA)
|
|
MSSAU = MemorySSAUpdater(AR.MSSA);
|
|
bool Changed =
|
|
LoopRotation(&L, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE,
|
|
MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, SQ, false,
|
|
Threshold, false, PrepareForLTO || PrepareForLTOOption);
|
|
|
|
if (!Changed)
|
|
return PreservedAnalyses::all();
|
|
|
|
if (AR.MSSA && VerifyMemorySSA)
|
|
AR.MSSA->verifyMemorySSA();
|
|
|
|
auto PA = getLoopPassPreservedAnalyses();
|
|
if (AR.MSSA)
|
|
PA.preserve<MemorySSAAnalysis>();
|
|
return PA;
|
|
}
|
|
|
|
namespace {
|
|
|
|
class LoopRotateLegacyPass : public LoopPass {
|
|
unsigned MaxHeaderSize;
|
|
bool PrepareForLTO;
|
|
|
|
public:
|
|
static char ID; // Pass ID, replacement for typeid
|
|
LoopRotateLegacyPass(int SpecifiedMaxHeaderSize = -1,
|
|
bool PrepareForLTO = false)
|
|
: LoopPass(ID), PrepareForLTO(PrepareForLTO) {
|
|
initializeLoopRotateLegacyPassPass(*PassRegistry::getPassRegistry());
|
|
if (SpecifiedMaxHeaderSize == -1)
|
|
MaxHeaderSize = DefaultRotationThreshold;
|
|
else
|
|
MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize);
|
|
}
|
|
|
|
// LCSSA form makes instruction renaming easier.
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
AU.addRequired<AssumptionCacheTracker>();
|
|
AU.addRequired<TargetTransformInfoWrapperPass>();
|
|
if (EnableMSSALoopDependency)
|
|
AU.addPreserved<MemorySSAWrapperPass>();
|
|
getLoopAnalysisUsage(AU);
|
|
}
|
|
|
|
bool runOnLoop(Loop *L, LPPassManager &LPM) override {
|
|
if (skipLoop(L))
|
|
return false;
|
|
Function &F = *L->getHeader()->getParent();
|
|
|
|
auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
|
|
const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
|
|
auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
|
|
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
|
auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
|
|
const SimplifyQuery SQ = getBestSimplifyQuery(*this, F);
|
|
Optional<MemorySSAUpdater> MSSAU;
|
|
if (EnableMSSALoopDependency) {
|
|
// Not requiring MemorySSA and getting it only if available will split
|
|
// the loop pass pipeline when LoopRotate is being run first.
|
|
auto *MSSAA = getAnalysisIfAvailable<MemorySSAWrapperPass>();
|
|
if (MSSAA)
|
|
MSSAU = MemorySSAUpdater(&MSSAA->getMSSA());
|
|
}
|
|
// Vectorization requires loop-rotation. Use default threshold for loops the
|
|
// user explicitly marked for vectorization, even when header duplication is
|
|
// disabled.
|
|
int Threshold = hasVectorizeTransformation(L) == TM_ForcedByUser
|
|
? DefaultRotationThreshold
|
|
: MaxHeaderSize;
|
|
|
|
return LoopRotation(L, LI, TTI, AC, &DT, &SE,
|
|
MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, SQ,
|
|
false, Threshold, false,
|
|
PrepareForLTO || PrepareForLTOOption);
|
|
}
|
|
};
|
|
} // end namespace
|
|
|
|
char LoopRotateLegacyPass::ID = 0;
|
|
INITIALIZE_PASS_BEGIN(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops",
|
|
false, false)
|
|
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
|
|
INITIALIZE_PASS_DEPENDENCY(LoopPass)
|
|
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
|
|
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
|
|
INITIALIZE_PASS_END(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops", false,
|
|
false)
|
|
|
|
Pass *llvm::createLoopRotatePass(int MaxHeaderSize, bool PrepareForLTO) {
|
|
return new LoopRotateLegacyPass(MaxHeaderSize, PrepareForLTO);
|
|
}
|