llvm-project/llvm/lib/Analysis/CtxProfAnalysis.cpp

//===- CtxProfAnalysis.cpp - contextual profile analysis ------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implementation of the contextual profile analysis, which maintains contextual
// profiling info through IPO passes.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/CtxProfAnalysis.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/IR/Analysis.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/ProfileData/PGOCtxProfReader.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include <deque>
#include <memory>

#define DEBUG_TYPE "ctx_prof"

using namespace llvm;
cl::opt<std::string>
    UseCtxProfile("use-ctx-profile", cl::init(""), cl::Hidden,
                  cl::desc("Use the specified contextual profile file"));

static cl::opt<CtxProfAnalysisPrinterPass::PrintMode> PrintLevel(
    "ctx-profile-printer-level",
    cl::init(CtxProfAnalysisPrinterPass::PrintMode::YAML), cl::Hidden,
    cl::values(clEnumValN(CtxProfAnalysisPrinterPass::PrintMode::Everything,
                          "everything", "print everything - most verbose"),
               clEnumValN(CtxProfAnalysisPrinterPass::PrintMode::YAML, "yaml",
                          "just the yaml representation of the profile")),
    cl::desc("Verbosity level of the contextual profile printer pass."));

static cl::opt<bool> ForceIsInSpecializedModule(
    "ctx-profile-force-is-specialized", cl::init(false),
    cl::desc("Treat the given module as-if it were containing the "
             "post-thinlink module containing the root"));

const char *AssignGUIDPass::GUIDMetadataName = "guid";

namespace llvm {
class ProfileAnnotatorImpl final {
  friend class ProfileAnnotator;
  class BBInfo;
  struct EdgeInfo {
    BBInfo *const Src;
    BBInfo *const Dest;
    std::optional<uint64_t> Count;

    explicit EdgeInfo(BBInfo &Src, BBInfo &Dest) : Src(&Src), Dest(&Dest) {}
  };

  class BBInfo {
    std::optional<uint64_t> Count;
    // OutEdges is dimensioned to match the number of terminator operands.
    // Entries in the vector match the index in the terminator operand list. In
    // some cases - see `shouldExcludeEdge` and its implementation - an entry
    // will be nullptr.
    // InEdges doesn't have the above constraint.
    SmallVector<EdgeInfo *> OutEdges;
    SmallVector<EdgeInfo *> InEdges;
    size_t UnknownCountOutEdges = 0;
    size_t UnknownCountInEdges = 0;

    // Pass AssumeAllKnown when we try to propagate counts from edges to BBs -
    // because all the edge counters must be known.
    // Return std::nullopt if there were no edges to sum. The user can decide
    // how to interpret that.
    std::optional<uint64_t> getEdgeSum(const SmallVector<EdgeInfo *> &Edges,
                                       bool AssumeAllKnown) const {
      std::optional<uint64_t> Sum;
      for (const auto *E : Edges) {
        // `Edges` may be `OutEdges`, case in which `E` could be nullptr.
        if (E) {
          if (!Sum.has_value())
            Sum = 0;
          *Sum += (AssumeAllKnown ? *E->Count : E->Count.value_or(0U));
        }
      }
      return Sum;
    }

    bool computeCountFrom(const SmallVector<EdgeInfo *> &Edges) {
      assert(!Count.has_value());
      Count = getEdgeSum(Edges, true);
      return Count.has_value();
    }

    void setSingleUnknownEdgeCount(SmallVector<EdgeInfo *> &Edges) {
      uint64_t KnownSum = getEdgeSum(Edges, false).value_or(0U);
      uint64_t EdgeVal = *Count > KnownSum ? *Count - KnownSum : 0U;
      EdgeInfo *E = nullptr;
      for (auto *I : Edges)
        if (I && !I->Count.has_value()) {
          E = I;
#ifdef NDEBUG
          break;
#else
          assert((!E || E == I) &&
                 "Expected exactly one edge to have an unknown count, "
                 "found a second one");
          continue;
#endif
        }
      assert(E && "Expected exactly one edge to have an unknown count");
      assert(!E->Count.has_value());
      E->Count = EdgeVal;
      assert(E->Src->UnknownCountOutEdges > 0);
      assert(E->Dest->UnknownCountInEdges > 0);
      --E->Src->UnknownCountOutEdges;
      --E->Dest->UnknownCountInEdges;
    }

  public:
    BBInfo(size_t NumInEdges, size_t NumOutEdges, std::optional<uint64_t> Count)
        : Count(Count) {
      // For in edges, we just want to pre-allocate enough space, since we know
      // it at this stage. For out edges, we will insert edges at the indices
      // corresponding to positions in this BB's terminator instruction, so we
      // construct a default (nullptr values)-initialized vector. A nullptr edge
      // corresponds to those that are excluded (see shouldExcludeEdge).
      InEdges.reserve(NumInEdges);
      OutEdges.resize(NumOutEdges);
    }

    bool tryTakeCountFromKnownOutEdges(const BasicBlock &BB) {
      if (!UnknownCountOutEdges) {
        return computeCountFrom(OutEdges);
      }
      return false;
    }

    bool tryTakeCountFromKnownInEdges(const BasicBlock &BB) {
      if (!UnknownCountInEdges) {
        return computeCountFrom(InEdges);
      }
      return false;
    }

    void addInEdge(EdgeInfo &Info) {
      InEdges.push_back(&Info);
      ++UnknownCountInEdges;
    }

    // For the out edges, we care about the position we place them in, which is
    // the position in terminator instruction's list (at construction). Later,
    // we build branch_weights metadata with edge frequency values matching
    // these positions.
    void addOutEdge(size_t Index, EdgeInfo &Info) {
      OutEdges[Index] = &Info;
      ++UnknownCountOutEdges;
    }

    bool hasCount() const { return Count.has_value(); }

    uint64_t getCount() const { return *Count; }

    bool trySetSingleUnknownInEdgeCount() {
      if (UnknownCountInEdges == 1) {
        setSingleUnknownEdgeCount(InEdges);
        return true;
      }
      return false;
    }

    bool trySetSingleUnknownOutEdgeCount() {
      if (UnknownCountOutEdges == 1) {
        setSingleUnknownEdgeCount(OutEdges);
        return true;
      }
      return false;
    }
    size_t getNumOutEdges() const { return OutEdges.size(); }

    uint64_t getEdgeCount(size_t Index) const {
      if (auto *E = OutEdges[Index])
        return *E->Count;
      return 0U;
    }
  };

  const Function &F;
  ArrayRef<uint64_t> Counters;
  // To be accessed through getBBInfo() after construction.
  std::map<const BasicBlock *, BBInfo> BBInfos;
  std::vector<EdgeInfo> EdgeInfos;

  // The only criteria for exclusion is faux suspend -> exit edges in presplit
  // coroutines. The API serves for readability, currently.
  bool shouldExcludeEdge(const BasicBlock &Src, const BasicBlock &Dest) const {
    return llvm::isPresplitCoroSuspendExitEdge(Src, Dest);
  }

  BBInfo &getBBInfo(const BasicBlock &BB) { return BBInfos.find(&BB)->second; }

  const BBInfo &getBBInfo(const BasicBlock &BB) const {
    return BBInfos.find(&BB)->second;
  }

  // validation function after we propagate the counters: all BBs and edges'
  // counters must have a value.
  bool allCountersAreAssigned() const {
    for (const auto &BBInfo : BBInfos)
      if (!BBInfo.second.hasCount())
        return false;
    for (const auto &EdgeInfo : EdgeInfos)
      if (!EdgeInfo.Count.has_value())
        return false;
    return true;
  }

  /// Check that all paths from the entry basic block that use edges with
  /// non-zero counts arrive at a basic block with no successors (i.e. "exit")
  bool allTakenPathsExit() const {
    std::deque<const BasicBlock *> Worklist;
    DenseSet<const BasicBlock *> Visited;
    Worklist.push_back(&F.getEntryBlock());
    bool HitExit = false;
    while (!Worklist.empty()) {
      const auto *BB = Worklist.front();
      Worklist.pop_front();
      if (!Visited.insert(BB).second)
        continue;
      if (succ_size(BB) == 0) {
        if (isa<UnreachableInst>(BB->getTerminator()))
          return false;
        HitExit = true;
        continue;
      }
      if (succ_size(BB) == 1) {
        Worklist.push_back(BB->getUniqueSuccessor());
        continue;
      }
      const auto &BBInfo = getBBInfo(*BB);
      bool HasAWayOut = false;
      for (auto I = 0U; I < BB->getTerminator()->getNumSuccessors(); ++I) {
        const auto *Succ = BB->getTerminator()->getSuccessor(I);
        if (!shouldExcludeEdge(*BB, *Succ)) {
          if (BBInfo.getEdgeCount(I) > 0) {
            HasAWayOut = true;
            Worklist.push_back(Succ);
          }
        }
      }
      if (!HasAWayOut)
        return false;
    }
    return HitExit;
  }

  bool allNonColdSelectsHaveProfile() const {
    for (const auto &BB : F) {
      if (getBBInfo(BB).getCount() > 0) {
        for (const auto &I : BB) {
          if (const auto *SI = dyn_cast<SelectInst>(&I)) {
            if (const auto *Inst = CtxProfAnalysis::getSelectInstrumentation(
                    *const_cast<SelectInst *>(SI))) {
              auto Index = Inst->getIndex()->getZExtValue();
              assert(Index < Counters.size());
              if (Counters[Index] == 0)
                return false;
            }
          }
        }
      }
    }
    return true;
  }

  // This is an adaptation of PGOUseFunc::populateCounters.
  // FIXME(mtrofin): look into factoring the code to share one implementation.
  void propagateCounterValues() {
    bool KeepGoing = true;
    while (KeepGoing) {
      KeepGoing = false;
      for (const auto &BB : F) {
        auto &Info = getBBInfo(BB);
        if (!Info.hasCount())
          KeepGoing |= Info.tryTakeCountFromKnownOutEdges(BB) ||
                       Info.tryTakeCountFromKnownInEdges(BB);
        if (Info.hasCount()) {
          KeepGoing |= Info.trySetSingleUnknownOutEdgeCount();
          KeepGoing |= Info.trySetSingleUnknownInEdgeCount();
        }
      }
    }
    assert(allCountersAreAssigned() &&
           "[ctx-prof] Expected all counters have been assigned.");
    assert(allTakenPathsExit() &&
           "[ctx-prof] Encountered a BB with more than one successor, where "
           "all outgoing edges have a 0 count. This occurs in non-exiting "
           "functions (message pumps, usually) which are not supported in the "
           "contextual profiling case");
    assert(allNonColdSelectsHaveProfile() &&
           "[ctx-prof] All non-cold select instructions were expected to have "
           "a profile.");
  }

public:
  ProfileAnnotatorImpl(const Function &F, ArrayRef<uint64_t> Counters)
      : F(F), Counters(Counters) {
    assert(!F.isDeclaration());
    assert(!Counters.empty());
    size_t NrEdges = 0;
    for (const auto &BB : F) {
      std::optional<uint64_t> Count;
      if (auto *Ins = CtxProfAnalysis::getBBInstrumentation(
              const_cast<BasicBlock &>(BB))) {
        auto Index = Ins->getIndex()->getZExtValue();
        assert(Index < Counters.size() &&
               "The index must be inside the counters vector by construction - "
               "tripping this assertion indicates a bug in how the contextual "
               "profile is managed by IPO transforms");
        (void)Index;
        Count = Counters[Ins->getIndex()->getZExtValue()];
      } else if (isa<UnreachableInst>(BB.getTerminator())) {
        // The program presumably didn't crash.
        Count = 0;
      }
      auto [It, Ins] =
          BBInfos.insert({&BB, {pred_size(&BB), succ_size(&BB), Count}});
      (void)Ins;
      assert(Ins && "We iterate through the function's BBs, no reason to "
                    "insert one more than once");
      NrEdges += llvm::count_if(successors(&BB), [&](const auto *Succ) {
        return !shouldExcludeEdge(BB, *Succ);
      });
    }
    // Pre-allocate the vector, we want references to its contents to be stable.
    EdgeInfos.reserve(NrEdges);
    for (const auto &BB : F) {
      auto &Info = getBBInfo(BB);
      for (auto I = 0U; I < BB.getTerminator()->getNumSuccessors(); ++I) {
        const auto *Succ = BB.getTerminator()->getSuccessor(I);
        if (!shouldExcludeEdge(BB, *Succ)) {
          auto &EI = EdgeInfos.emplace_back(getBBInfo(BB), getBBInfo(*Succ));
          Info.addOutEdge(I, EI);
          getBBInfo(*Succ).addInEdge(EI);
        }
      }
    }
    assert(EdgeInfos.capacity() == NrEdges &&
           "The capacity of EdgeInfos should have stayed unchanged it was "
           "populated, because we need pointers to its contents to be stable");
    propagateCounterValues();
  }

  uint64_t getBBCount(const BasicBlock &BB) { return getBBInfo(BB).getCount(); }
};

} // namespace llvm

ProfileAnnotator::ProfileAnnotator(const Function &F,
                                   ArrayRef<uint64_t> RawCounters)
    : PImpl(std::make_unique<ProfileAnnotatorImpl>(F, RawCounters)) {}

ProfileAnnotator::~ProfileAnnotator() = default;

uint64_t ProfileAnnotator::getBBCount(const BasicBlock &BB) const {
  return PImpl->getBBCount(BB);
}

bool ProfileAnnotator::getSelectInstrProfile(SelectInst &SI,
                                             uint64_t &TrueCount,
                                             uint64_t &FalseCount) const {
  const auto &BBInfo = PImpl->getBBInfo(*SI.getParent());
  TrueCount = FalseCount = 0;
  if (BBInfo.getCount() == 0)
    return false;

  auto *Step = CtxProfAnalysis::getSelectInstrumentation(SI);
  if (!Step)
    return false;
  auto Index = Step->getIndex()->getZExtValue();
  assert(Index < PImpl->Counters.size() &&
         "The index of the step instruction must be inside the "
         "counters vector by "
         "construction - tripping this assertion indicates a bug in "
         "how the contextual profile is managed by IPO transforms");
  auto TotalCount = BBInfo.getCount();
  TrueCount = PImpl->Counters[Index];
  FalseCount = (TotalCount > TrueCount ? TotalCount - TrueCount : 0U);
  return true;
}

bool ProfileAnnotator::getOutgoingBranchWeights(
    BasicBlock &BB, SmallVectorImpl<uint64_t> &Profile,
    uint64_t &MaxCount) const {
  Profile.clear();

  if (succ_size(&BB) < 2)
    return false;

  auto *Term = BB.getTerminator();
  Profile.resize(Term->getNumSuccessors());

  const auto &BBInfo = PImpl->getBBInfo(BB);
  MaxCount = 0;
  for (unsigned SuccIdx = 0, Size = BBInfo.getNumOutEdges(); SuccIdx < Size;
       ++SuccIdx) {
    uint64_t EdgeCount = BBInfo.getEdgeCount(SuccIdx);
    if (EdgeCount > MaxCount)
      MaxCount = EdgeCount;
    Profile[SuccIdx] = EdgeCount;
  }
  return MaxCount > 0;
}

PreservedAnalyses AssignGUIDPass::run(Module &M, ModuleAnalysisManager &MAM) {
  for (auto &F : M.functions()) {
    if (F.isDeclaration())
      continue;
    if (F.getMetadata(GUIDMetadataName))
      continue;
    const GlobalValue::GUID GUID = F.getGUID();
    F.setMetadata(GUIDMetadataName,
                  MDNode::get(M.getContext(),
                              {ConstantAsMetadata::get(ConstantInt::get(
                                  Type::getInt64Ty(M.getContext()), GUID))}));
  }
  return PreservedAnalyses::none();
}

GlobalValue::GUID AssignGUIDPass::getGUID(const Function &F) {
  if (F.isDeclaration()) {
    assert(GlobalValue::isExternalLinkage(F.getLinkage()));
    return F.getGUID();
  }
  auto *MD = F.getMetadata(GUIDMetadataName);
  assert(MD && "guid not found for defined function");
  return cast<ConstantInt>(cast<ConstantAsMetadata>(MD->getOperand(0))
                               ->getValue()
                               ->stripPointerCasts())
      ->getZExtValue();
}
AnalysisKey CtxProfAnalysis::Key;

CtxProfAnalysis::CtxProfAnalysis(std::optional<StringRef> Profile)
    : Profile([&]() -> std::optional<StringRef> {
        if (Profile)
          return *Profile;
        if (UseCtxProfile.getNumOccurrences())
          return UseCtxProfile;
        return std::nullopt;
      }()) {}

PGOContextualProfile CtxProfAnalysis::run(Module &M,
                                          ModuleAnalysisManager &MAM) {
  if (!Profile)
    return {};
  ErrorOr<std::unique_ptr<MemoryBuffer>> MB = MemoryBuffer::getFile(*Profile);
  if (auto EC = MB.getError()) {
    M.getContext().emitError("could not open contextual profile file: " +
                             EC.message());
    return {};
  }
  PGOCtxProfileReader Reader(MB.get()->getBuffer());
  auto MaybeProfiles = Reader.loadProfiles();
  if (!MaybeProfiles) {
    M.getContext().emitError("contextual profile file is invalid: " +
                             toString(MaybeProfiles.takeError()));
    return {};
  }

  // FIXME: We should drive this from ThinLTO, but for the time being, use the
  // module name as indicator.
  // We want to *only* keep the contextual profiles in modules that capture
  // context trees. That allows us to compute specific PSIs, for example.
  auto DetermineRootsInModule = [&M]() -> const DenseSet<GlobalValue::GUID> {
    DenseSet<GlobalValue::GUID> ProfileRootsInModule;
    auto ModName = M.getName();
    auto Filename = sys::path::filename(ModName);
    // Drop the file extension.
    Filename = Filename.substr(0, Filename.find_last_of('.'));
    // See if it parses
    APInt Guid;
    // getAsInteger returns true if there are more chars to read other than the
    // integer. So the "false" test is what we want.
    if (!Filename.getAsInteger(0, Guid))
      ProfileRootsInModule.insert(Guid.getZExtValue());
    return ProfileRootsInModule;
  };
  const auto ProfileRootsInModule = DetermineRootsInModule();
  PGOContextualProfile Result;

  // the logic from here on allows for modules that contain - by design - more
  // than one root. We currently don't support that, because the determination
  // happens based on the module name matching the root guid, but the logic can
  // avoid assuming that.
  if (!ProfileRootsInModule.empty()) {
    Result.IsInSpecializedModule = true;
    // Trim first the roots that aren't in this module.
    for (auto &[RootGuid, _] :
         llvm::make_early_inc_range(MaybeProfiles->Contexts))
      if (!ProfileRootsInModule.contains(RootGuid))
        MaybeProfiles->Contexts.erase(RootGuid);
    // we can also drop the flat profiles
    MaybeProfiles->FlatProfiles.clear();
  }

  for (const auto &F : M) {
    if (F.isDeclaration())
      continue;
    auto GUID = AssignGUIDPass::getGUID(F);
    assert(GUID && "guid not found for defined function");
    const auto &Entry = F.begin();
    uint32_t MaxCounters = 0; // we expect at least a counter.
    for (const auto &I : *Entry)
      if (auto *C = dyn_cast<InstrProfIncrementInst>(&I)) {
        MaxCounters =
            static_cast<uint32_t>(C->getNumCounters()->getZExtValue());
        break;
      }
    if (!MaxCounters)
      continue;
    uint32_t MaxCallsites = 0;
    for (const auto &BB : F)
      for (const auto &I : BB)
        if (auto *C = dyn_cast<InstrProfCallsite>(&I)) {
          MaxCallsites =
              static_cast<uint32_t>(C->getNumCounters()->getZExtValue());
          break;
        }
    auto [It, Ins] = Result.FuncInfo.insert(
        {GUID, PGOContextualProfile::FunctionInfo(F.getName())});
    (void)Ins;
    assert(Ins);
    It->second.NextCallsiteIndex = MaxCallsites;
    It->second.NextCounterIndex = MaxCounters;
  }
  // If we made it this far, the Result is valid - which we mark by setting
  // .Profiles.
  Result.Profiles = std::move(*MaybeProfiles);
  Result.initIndex();
  return Result;
}

GlobalValue::GUID
PGOContextualProfile::getDefinedFunctionGUID(const Function &F) const {
  if (auto It = FuncInfo.find(AssignGUIDPass::getGUID(F)); It != FuncInfo.end())
    return It->first;
  return 0;
}

CtxProfAnalysisPrinterPass::CtxProfAnalysisPrinterPass(raw_ostream &OS)
    : OS(OS), Mode(PrintLevel) {}

PreservedAnalyses CtxProfAnalysisPrinterPass::run(Module &M,
                                                  ModuleAnalysisManager &MAM) {
  CtxProfAnalysis::Result &C = MAM.getResult<CtxProfAnalysis>(M);
  if (C.contexts().empty()) {
    OS << "No contextual profile was provided.\n";
    return PreservedAnalyses::all();
  }

  if (Mode == PrintMode::Everything) {
    OS << "Function Info:\n";
    for (const auto &[Guid, FuncInfo] : C.FuncInfo)
      OS << Guid << " : " << FuncInfo.Name
         << ". MaxCounterID: " << FuncInfo.NextCounterIndex
         << ". MaxCallsiteID: " << FuncInfo.NextCallsiteIndex << "\n";
  }

  if (Mode == PrintMode::Everything)
    OS << "\nCurrent Profile:\n";
  convertCtxProfToYaml(OS, C.profiles());
  OS << "\n";
  if (Mode == PrintMode::YAML)
    return PreservedAnalyses::all();

  OS << "\nFlat Profile:\n";
  auto Flat = C.flatten();
  for (const auto &[Guid, Counters] : Flat) {
    OS << Guid << " : ";
    for (auto V : Counters)
      OS << V << " ";
    OS << "\n";
  }
  return PreservedAnalyses::all();
}

InstrProfCallsite *CtxProfAnalysis::getCallsiteInstrumentation(CallBase &CB) {
  if (!InstrProfCallsite::canInstrumentCallsite(CB))
    return nullptr;
  for (auto *Prev = CB.getPrevNode(); Prev; Prev = Prev->getPrevNode()) {
    if (auto *IPC = dyn_cast<InstrProfCallsite>(Prev))
      return IPC;
    assert(!isa<CallBase>(Prev) &&
           "didn't expect to find another call, that's not the callsite "
           "instrumentation, before an instrumentable callsite");
  }
  return nullptr;
}

InstrProfIncrementInst *CtxProfAnalysis::getBBInstrumentation(BasicBlock &BB) {
  for (auto &I : BB)
    if (auto *Incr = dyn_cast<InstrProfIncrementInst>(&I))
      if (!isa<InstrProfIncrementInstStep>(&I))
        return Incr;
  return nullptr;
}

InstrProfIncrementInstStep *
CtxProfAnalysis::getSelectInstrumentation(SelectInst &SI) {
  Instruction *Prev = &SI;
  while ((Prev = Prev->getPrevNode()))
    if (auto *Step = dyn_cast<InstrProfIncrementInstStep>(Prev))
      return Step;
  return nullptr;
}

template <class ProfTy>
static void preorderVisitOneRoot(ProfTy &Profile,
                                 function_ref<void(ProfTy &)> Visitor) {
  std::function<void(ProfTy &)> Traverser = [&](auto &Ctx) {
    Visitor(Ctx);
    for (auto &[_, SubCtxSet] : Ctx.callsites())
      for (auto &[__, Subctx] : SubCtxSet)
        Traverser(Subctx);
  };
  Traverser(Profile);
}

template <class ProfilesTy, class ProfTy>
static void preorderVisit(ProfilesTy &Profiles,
                          function_ref<void(ProfTy &)> Visitor) {
  for (auto &[_, P] : Profiles)
    preorderVisitOneRoot<ProfTy>(P, Visitor);
}

void PGOContextualProfile::initIndex() {
  // Initialize the head of the index list for each function. We don't need it
  // after this point.
  DenseMap<GlobalValue::GUID, PGOCtxProfContext *> InsertionPoints;
  for (auto &[Guid, FI] : FuncInfo)
    InsertionPoints[Guid] = &FI.Index;
  preorderVisit<PGOCtxProfContext::CallTargetMapTy, PGOCtxProfContext>(
      Profiles.Contexts, [&](PGOCtxProfContext &Ctx) {
        auto InsertIt = InsertionPoints.find(Ctx.guid());
        if (InsertIt == InsertionPoints.end())
          return;
        // Insert at the end of the list. Since we traverse in preorder, it
        // means that when we iterate the list from the beginning, we'd
        // encounter the contexts in the order we would have, should we have
        // performed a full preorder traversal.
        InsertIt->second->Next = &Ctx;
        Ctx.Previous = InsertIt->second;
        InsertIt->second = &Ctx;
      });
}

bool PGOContextualProfile::isInSpecializedModule() const {
  return ForceIsInSpecializedModule.getNumOccurrences() > 0
             ? ForceIsInSpecializedModule
             : IsInSpecializedModule;
}

void PGOContextualProfile::update(Visitor V, const Function &F) {
  assert(isFunctionKnown(F));
  GlobalValue::GUID G = getDefinedFunctionGUID(F);
  for (auto *Node = FuncInfo.find(G)->second.Index.Next; Node;
       Node = Node->Next)
    V(*reinterpret_cast<PGOCtxProfContext *>(Node));
}

void PGOContextualProfile::visit(ConstVisitor V, const Function *F) const {
  if (!F)
    return preorderVisit<const PGOCtxProfContext::CallTargetMapTy,
                         const PGOCtxProfContext>(Profiles.Contexts, V);
  assert(isFunctionKnown(*F));
  GlobalValue::GUID G = getDefinedFunctionGUID(*F);
  for (const auto *Node = FuncInfo.find(G)->second.Index.Next; Node;
       Node = Node->Next)
    V(*reinterpret_cast<const PGOCtxProfContext *>(Node));
}

const CtxProfFlatProfile PGOContextualProfile::flatten() const {
  CtxProfFlatProfile Flat;
  auto Accummulate = [](SmallVectorImpl<uint64_t> &Into,
                        const SmallVectorImpl<uint64_t> &From,
                        uint64_t SamplingRate) {
    if (Into.empty())
      Into.resize(From.size());
    assert(Into.size() == From.size() &&
           "All contexts corresponding to a function should have the exact "
           "same number of counters.");
    for (size_t I = 0, E = Into.size(); I < E; ++I)
      Into[I] += From[I] * SamplingRate;
  };

  for (const auto &[_, CtxRoot] : Profiles.Contexts) {
    const uint64_t SamplingFactor = CtxRoot.getTotalRootEntryCount();
    preorderVisitOneRoot<const PGOCtxProfContext>(
        CtxRoot, [&](const PGOCtxProfContext &Ctx) {
          Accummulate(Flat[Ctx.guid()], Ctx.counters(), SamplingFactor);
        });

    for (const auto &[G, Unh] : CtxRoot.getUnhandled())
      Accummulate(Flat[G], Unh, SamplingFactor);
  }
  // We don't sample "Flat" currently, so sampling rate is 1.
  for (const auto &[G, FC] : Profiles.FlatProfiles)
    Accummulate(Flat[G], FC, /*SamplingRate=*/1);
  return Flat;
}

const CtxProfFlatIndirectCallProfile
PGOContextualProfile::flattenVirtCalls() const {
  CtxProfFlatIndirectCallProfile Ret;
  for (const auto &[_, CtxRoot] : Profiles.Contexts) {
    const uint64_t TotalRootEntryCount = CtxRoot.getTotalRootEntryCount();
    preorderVisitOneRoot<const PGOCtxProfContext>(
        CtxRoot, [&](const PGOCtxProfContext &Ctx) {
          auto &Targets = Ret[Ctx.guid()];
          for (const auto &[ID, SubctxSet] : Ctx.callsites())
            for (const auto &Subctx : SubctxSet)
              Targets[ID][Subctx.first] +=
                  Subctx.second.getEntrycount() * TotalRootEntryCount;
        });
  }
  return Ret;
}

void CtxProfAnalysis::collectIndirectCallPromotionList(
    CallBase &IC, Result &Profile,
    SetVector<std::pair<CallBase *, Function *>> &Candidates) {
  const auto *Instr = CtxProfAnalysis::getCallsiteInstrumentation(IC);
  if (!Instr)
    return;
  Module &M = *IC.getParent()->getModule();
  const uint32_t CallID = Instr->getIndex()->getZExtValue();
  Profile.visit(
      [&](const PGOCtxProfContext &Ctx) {
        const auto &Targets = Ctx.callsites().find(CallID);
        if (Targets == Ctx.callsites().end())
          return;
        for (const auto &[Guid, _] : Targets->second)
          if (auto Name = Profile.getFunctionName(Guid); !Name.empty())
            if (auto *Target = M.getFunction(Name))
              if (Target->hasFnAttribute(Attribute::AlwaysInline))
                Candidates.insert({&IC, Target});
      },
      IC.getCaller());
}