//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements support for context disambiguation of allocation
// calls for profile guided heap optimization. Specifically, it uses Memprof
// profiles which indicate context specific allocation behavior (currently
// distinguishing cold vs hot memory allocations). Cloning is performed to
// expose the cold allocation call contexts, and the allocation calls are
// subsequently annotated with an attribute for later transformation.
//
// The transformations can be performed either directly on IR (regular LTO), or
// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
// Both types of LTO operate on the same base graph representation, which
// uses CRTP to support either IR or Index formats.
//
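// For example, if the profile shows that an allocation is cold only when its
// enclosing function is reached from one particular caller, that caller can be
// redirected to a clone of the function in which the allocation call is
// annotated cold, while other callers keep the original (not cold) allocation.
//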
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/CallPromotionUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Instrumentation.h"
#include <deque>
#include <sstream>
#include <unordered_map>
#include <vector>
using namespace llvm;
using namespace llvm::memprof;

#define DEBUG_TYPE "memprof-context-disambiguation"

STATISTIC(FunctionClonesAnalysis,
          "Number of function clones created during whole program analysis");
STATISTIC(FunctionClonesThinBackend,
          "Number of function clones created during ThinLTO backend");
STATISTIC(FunctionsClonedThinBackend,
          "Number of functions that had clones created during ThinLTO backend");
STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
                            "cloned) during whole program analysis");
STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
                         "during whole program analysis");
STATISTIC(AllocTypeNotColdThinBackend,
          "Number of not cold static allocations (possibly cloned) during "
          "ThinLTO backend");
STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
                                    "(possibly cloned) during ThinLTO backend");
STATISTIC(OrigAllocsThinBackend,
          "Number of original (not cloned) allocations with memprof profiles "
          "during ThinLTO backend");
STATISTIC(
    AllocVersionsThinBackend,
    "Number of allocation versions (including clones) during ThinLTO backend");
STATISTIC(MaxAllocVersionsThinBackend,
          "Maximum number of allocation versions created for an original "
          "allocation during ThinLTO backend");
STATISTIC(UnclonableAllocsThinBackend,
          "Number of unclonable ambiguous allocations during ThinLTO backend");
STATISTIC(RemovedEdgesWithMismatchedCallees,
          "Number of edges removed due to mismatched callees (profiled vs IR)");
STATISTIC(FoundProfiledCalleeCount,
          "Number of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeDepth,
          "Aggregate depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeMaxDepth,
          "Maximum depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeNonUniquelyCount,
          "Number of profiled callees found via multiple tail call chains");
STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");

static cl::opt<std::string> DotFilePathPrefix(
    "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
    cl::value_desc("filename"),
    cl::desc("Specify the path prefix of the MemProf dot files."));

static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
                                 cl::Hidden,
                                 cl::desc("Export graph to dot files."));

// How much of the graph to export to dot.
enum DotScope {
  All,     // The full CCG graph.
  Alloc,   // Only contexts for the specified allocation.
  Context, // Only the specified context.
};

static cl::opt<DotScope> DotGraphScope(
    "memprof-dot-scope", cl::desc("Scope of graph to export to dot"),
    cl::Hidden, cl::init(DotScope::All),
    cl::values(
        clEnumValN(DotScope::All, "all", "Export full callsite graph"),
        clEnumValN(DotScope::Alloc, "alloc",
                   "Export only nodes with contexts feeding given "
                   "-memprof-dot-alloc-id"),
        clEnumValN(DotScope::Context, "context",
                   "Export only nodes with given -memprof-dot-context-id")));

static cl::opt<unsigned>
    AllocIdForDot("memprof-dot-alloc-id", cl::init(0), cl::Hidden,
                  cl::desc("Id of alloc to export if -memprof-dot-scope=alloc "
                           "or to highlight if -memprof-dot-scope=all"));

static cl::opt<unsigned> ContextIdForDot(
    "memprof-dot-context-id", cl::init(0), cl::Hidden,
    cl::desc("Id of context to export if -memprof-dot-scope=context or to "
             "highlight otherwise"));

static cl::opt<bool>
    DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
            cl::desc("Dump CallingContextGraph to stdout after each stage."));

static cl::opt<bool>
    VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
              cl::desc("Perform verification checks on CallingContextGraph."));

static cl::opt<bool>
    VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
                cl::desc("Perform frequent verification checks on nodes."));

static cl::opt<std::string> MemProfImportSummary(
    "memprof-import-summary",
    cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
    cl::Hidden);

static cl::opt<unsigned>
    TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
                        cl::Hidden,
                        cl::desc("Max depth to recursively search for missing "
                                 "frames through tail calls."));

// Optionally enable cloning of callsites involved with recursive cycles
static cl::opt<bool> AllowRecursiveCallsites(
    "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
    cl::desc("Allow cloning of callsites involved in recursive cycles"));

static cl::opt<bool> CloneRecursiveContexts(
    "memprof-clone-recursive-contexts", cl::init(true), cl::Hidden,
    cl::desc("Allow cloning of contexts through recursive cycles"));

// When disabled, try to detect and prevent cloning of recursive contexts.
// This is only necessary until we support cloning through recursive cycles.
// Leave on by default for now, as disabling requires a little bit of compile
// time overhead and doesn't affect correctness; it will just inflate the cold
// hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
static cl::opt<bool> AllowRecursiveContexts(
    "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
    cl::desc("Allow cloning of contexts having recursive cycles"));

namespace llvm {
cl::opt<bool> EnableMemProfContextDisambiguation(
    "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
    cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));

// Indicate we are linking with an allocator that supports hot/cold operator
// new interfaces.
cl::opt<bool> SupportsHotColdNew(
    "supports-hot-cold-new", cl::init(false), cl::Hidden,
    cl::desc("Linking with hot/cold operator new interfaces"));

static cl::opt<bool> MemProfRequireDefinitionForPromotion(
    "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
    cl::desc(
        "Require target function definition when promoting indirect calls"));
} // namespace llvm

extern cl::opt<bool> MemProfReportHintedSizes;
extern cl::opt<unsigned> MinClonedColdBytePercent;

namespace {
/// CRTP base for graphs built from either IR or ThinLTO summary index.
///
/// The graph represents the call contexts in all memprof metadata on allocation
/// calls, with nodes for the allocations themselves, as well as for the calls
/// in each context. The graph is initially built from the allocation memprof
/// metadata (or summary) MIBs. It is then updated to match calls with callsite
/// metadata onto the nodes, updating it to reflect any inlining performed on
/// those calls.
///
/// Each MIB (representing an allocation's call context with allocation
/// behavior) is assigned a unique context id during the graph build. The edges
/// and nodes in the graph are decorated with the context ids they carry. This
/// is used to correctly update the graph when cloning is performed so that we
/// can uniquify the context for a single (possibly cloned) allocation.
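///
/// The DerivedCCG template parameter is the concrete graph type
/// (ModuleCallsiteContextGraph for IR, IndexCallsiteContextGraph for the
/// summary index); format-specific queries such as getStackId and
/// calleeMatchesFunc are dispatched to the derived class via static_cast
/// (CRTP), while the cloning logic itself lives in this base class.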
template <typename DerivedCCG, typename FuncTy, typename CallTy>
class CallsiteContextGraph {
public:
  CallsiteContextGraph() = default;
  CallsiteContextGraph(const CallsiteContextGraph &) = default;
  CallsiteContextGraph(CallsiteContextGraph &&) = default;

  /// Main entry point to perform analysis and transformations on graph.
  bool process();

  /// Perform cloning on the graph necessary to uniquely identify the allocation
  /// behavior of an allocation based on its context.
  void identifyClones();

  /// Assign callsite clones to functions, cloning functions as needed to
  /// accommodate the combinations of their callsite clones reached by callers.
  /// For regular LTO this clones functions and callsites in the IR, but for
  /// ThinLTO the cloning decisions are noted in the summaries and later applied
  /// in applyImport.
  bool assignFunctions();

  void dump() const;
  void print(raw_ostream &OS) const;
  void printTotalSizes(raw_ostream &OS) const;

  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const CallsiteContextGraph &CCG) {
    CCG.print(OS);
    return OS;
  }

  friend struct GraphTraits<
      const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
  friend struct DOTGraphTraits<
      const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;

  void exportToDot(std::string Label) const;

  /// Represents a function clone via FuncTy pointer and clone number pair.
  struct FuncInfo final
      : public std::pair<FuncTy *, unsigned /*Clone number*/> {
    using Base = std::pair<FuncTy *, unsigned>;
    FuncInfo(const Base &B) : Base(B) {}
    FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
    explicit operator bool() const { return this->first != nullptr; }
    FuncTy *func() const { return this->first; }
    unsigned cloneNo() const { return this->second; }
  };

  /// Represents a callsite clone via CallTy and clone number pair.
  struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
    using Base = std::pair<CallTy, unsigned>;
    CallInfo(const Base &B) : Base(B) {}
    CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
        : Base(Call, CloneNo) {}
    explicit operator bool() const { return (bool)this->first; }
    CallTy call() const { return this->first; }
    unsigned cloneNo() const { return this->second; }
    void setCloneNo(unsigned N) { this->second = N; }
    void print(raw_ostream &OS) const {
      if (!operator bool()) {
        assert(!cloneNo());
        OS << "null Call";
        return;
      }
      call()->print(OS);
      OS << "\t(clone " << cloneNo() << ")";
    }
    void dump() const {
      print(dbgs());
      dbgs() << "\n";
    }
    friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
      Call.print(OS);
      return OS;
    }
  };

  struct ContextEdge;

  /// Node in the Callsite Context Graph
  struct ContextNode {
    // Keep this for now since in the IR case where we have an Instruction* it
    // is not as immediately discoverable. Used for printing richer information
    // when dumping graph.
    bool IsAllocation;

    // Keeps track of when the Call was reset to null because there was
    // recursion.
    bool Recursive = false;

    // This will be formed by ORing together the AllocationType enum values
    // for contexts including this node.
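    // For example, a node reached by both cold and not-cold contexts will have
    // AllocTypes == ((uint8_t)AllocationType::Cold |
    //                (uint8_t)AllocationType::NotCold).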
    uint8_t AllocTypes = 0;

    // The corresponding allocation or interior call. This is the primary call
    // for which we have created this node.
    CallInfo Call;

    // List of other calls that can be treated the same as the primary call
    // through cloning. I.e. located in the same function and have the same
    // (possibly pruned) stack ids. They will be updated the same way as the
    // primary call when assigning to function clones.
    SmallVector<CallInfo, 0> MatchingCalls;

    // For alloc nodes this is a unique id assigned when constructed, and for
    // callsite stack nodes it is the original stack id when the node is
    // constructed from the memprof MIB metadata on the alloc nodes. Note that
    // this is only used when matching callsite metadata onto the stack nodes
    // created when processing the allocation memprof MIBs, and for labeling
    // nodes in the dot graph. Therefore we don't bother to assign a value for
    // clones.
    uint64_t OrigStackOrAllocId = 0;

    // Edges to all callees in the profiled call stacks.
    // TODO: Should this be a map (from Callee node) for more efficient lookup?
    std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;

    // Edges to all callers in the profiled call stacks.
    // TODO: Should this be a map (from Caller node) for more efficient lookup?
    std::vector<std::shared_ptr<ContextEdge>> CallerEdges;

    // Returns true if we need to look at the callee edges for determining the
    // node context ids and allocation type.
    bool useCallerEdgesForContextInfo() const {
      // Typically if the callee edges are empty either the caller edges are
      // also empty, or this is an allocation (leaf node). However, if we are
      // allowing recursive callsites and contexts this will be violated for
      // incompletely cloned recursive cycles.
      assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
             (AllowRecursiveCallsites && AllowRecursiveContexts));
      // When cloning for a recursive context, during cloning we might be in the
      // midst of cloning for a recurrence and have moved context ids off of a
      // caller edge onto the clone but not yet off of the incoming caller
      // (back) edge. If we don't look at those we miss the fact that this node
      // still has context ids of interest.
      return IsAllocation || CloneRecursiveContexts;
    }

    // Compute the context ids for this node from the union of its edge context
    // ids.
    DenseSet<uint32_t> getContextIds() const {
      unsigned Count = 0;
      // Compute the number of ids for reserve below. In general we only need to
      // look at one set of edges, typically the callee edges, since other than
      // allocations and in some cases during recursion cloning, all the context
      // ids on the callers should also flow out via callee edges.
      for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
        Count += Edge->getContextIds().size();
      DenseSet<uint32_t> ContextIds;
      ContextIds.reserve(Count);
      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
          CalleeEdges, useCallerEdgesForContextInfo()
                           ? CallerEdges
                           : std::vector<std::shared_ptr<ContextEdge>>());
      for (const auto &Edge : Edges)
        ContextIds.insert_range(Edge->getContextIds());
      return ContextIds;
    }

    // Compute the allocation type for this node from the OR of its edge
    // allocation types.
    uint8_t computeAllocType() const {
      uint8_t BothTypes =
          (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
      uint8_t AllocType = (uint8_t)AllocationType::None;
      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
          CalleeEdges, useCallerEdgesForContextInfo()
                           ? CallerEdges
                           : std::vector<std::shared_ptr<ContextEdge>>());
      for (const auto &Edge : Edges) {
        AllocType |= Edge->AllocTypes;
        // Bail early if alloc type reached both, no further refinement.
        if (AllocType == BothTypes)
          return AllocType;
      }
      return AllocType;
    }

    // The context ids set for this node is empty if its edge context ids are
    // also all empty.
    bool emptyContextIds() const {
      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
          CalleeEdges, useCallerEdgesForContextInfo()
                           ? CallerEdges
                           : std::vector<std::shared_ptr<ContextEdge>>());
      for (const auto &Edge : Edges) {
        if (!Edge->getContextIds().empty())
          return false;
      }
      return true;
    }

    // List of clones of this ContextNode, initially empty.
    std::vector<ContextNode *> Clones;

    // If a clone, points to the original uncloned node.
    ContextNode *CloneOf = nullptr;

    ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}

    ContextNode(bool IsAllocation, CallInfo C)
        : IsAllocation(IsAllocation), Call(C) {}

    void addClone(ContextNode *Clone) {
      if (CloneOf) {
        CloneOf->Clones.push_back(Clone);
        Clone->CloneOf = CloneOf;
      } else {
        Clones.push_back(Clone);
        assert(!Clone->CloneOf);
        Clone->CloneOf = this;
      }
    }

    ContextNode *getOrigNode() {
      if (!CloneOf)
        return this;
      return CloneOf;
    }

    void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
                               unsigned int ContextId);

    ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
    ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
    void eraseCalleeEdge(const ContextEdge *Edge);
    void eraseCallerEdge(const ContextEdge *Edge);

    void setCall(CallInfo C) { Call = C; }

    bool hasCall() const { return (bool)Call.call(); }

    void printCall(raw_ostream &OS) const { Call.print(OS); }

    // True if this node was effectively removed from the graph, in which case
    // it should have an allocation type of None and empty context ids.
    bool isRemoved() const {
      // Typically if the callee edges are empty either the caller edges are
      // also empty, or this is an allocation (leaf node). However, if we are
      // allowing recursive callsites and contexts this will be violated for
      // incompletely cloned recursive cycles.
      assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
             (AllocTypes == (uint8_t)AllocationType::None) ==
                 emptyContextIds());
      return AllocTypes == (uint8_t)AllocationType::None;
    }

    void dump() const;
    void print(raw_ostream &OS) const;

    friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
      Node.print(OS);
      return OS;
    }
  };

  /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
  /// callee.
  struct ContextEdge {
    ContextNode *Callee;
    ContextNode *Caller;

    // This will be formed by ORing together the AllocationType enum values
    // for contexts including this edge.
    uint8_t AllocTypes = 0;

    // Set just before initiating cloning when cloning of recursive contexts is
    // enabled. Used to defer cloning of backedges until we have done cloning of
    // the callee node for non-backedge caller edges. This exposes cloning
    // opportunities through the backedge of the cycle.
    // TODO: Note that this is not updated during cloning, and it is unclear
    // whether that would be needed.
    bool IsBackedge = false;

    // The set of IDs for contexts including this edge.
    DenseSet<uint32_t> ContextIds;

    ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
                DenseSet<uint32_t> ContextIds)
        : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
          ContextIds(std::move(ContextIds)) {}

    DenseSet<uint32_t> &getContextIds() { return ContextIds; }

    // Helper to clear the fields of this edge when we are removing it from the
    // graph.
    inline void clear() {
      ContextIds.clear();
      AllocTypes = (uint8_t)AllocationType::None;
      Caller = nullptr;
      Callee = nullptr;
    }

    // Check if edge was removed from the graph. This is useful while iterating
    // over a copy of edge lists when performing operations that mutate the
    // graph in ways that might remove one of the edges.
    inline bool isRemoved() const {
      if (Callee || Caller)
        return false;
      // Any edges that have been removed from the graph but are still in a
      // shared_ptr somewhere should have all fields null'ed out by clear()
      // above.
      assert(AllocTypes == (uint8_t)AllocationType::None);
      assert(ContextIds.empty());
      return true;
    }

    void dump() const;
    void print(raw_ostream &OS) const;

    friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
      Edge.print(OS);
      return OS;
    }
  };

  /// Helpers to remove edges that have allocation type None (due to not
  /// carrying any context ids) after transformations.
  void removeNoneTypeCalleeEdges(ContextNode *Node);
  void removeNoneTypeCallerEdges(ContextNode *Node);
  void
  recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
                                       DenseSet<const ContextNode *> &Visited);

protected:
  /// Get a list of nodes corresponding to the stack ids in the given callsite
  /// context.
  template <class NodeT, class IteratorT>
  std::vector<uint64_t>
  getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);

  /// Adds nodes for the given allocation and any stack ids on its memprof MIB
  /// metadata (or summary).
  ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);

  /// Adds nodes for the given MIB stack ids.
  template <class NodeT, class IteratorT>
  void addStackNodesForMIB(ContextNode *AllocNode,
                           CallStack<NodeT, IteratorT> &StackContext,
                           CallStack<NodeT, IteratorT> &CallsiteContext,
                           AllocationType AllocType,
                           ArrayRef<ContextTotalSize> ContextSizeInfo);

  /// Matches all callsite metadata (or summary) to the nodes created for
  /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
  /// inlining performed on those callsite instructions.
  void updateStackNodes();

  /// Update graph to conservatively handle any callsite stack nodes that target
  /// multiple different callee target functions.
  void handleCallsitesWithMultipleTargets();

  /// Mark backedges via the standard DFS based backedge algorithm.
  void markBackedges();

  // Try to partition calls on the given node (already placed into the AllCalls
  // array) by callee function, creating new copies of Node as needed to hold
  // calls with different callees, and moving the callee edges appropriately.
  // Returns true if partitioning was successful.
  bool partitionCallsByCallee(
      ContextNode *Node, ArrayRef<CallInfo> AllCalls,
      std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);

  /// Save lists of calls with MemProf metadata in each function, for faster
  /// iteration.
  MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;

  /// Map from callsite node to the enclosing caller function.
  std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;

  // When exporting to dot, and an allocation id is specified, contains the
  // context ids on that allocation.
  DenseSet<uint32_t> DotAllocContextIds;

private:
  using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;

  // Structure to keep track of information for each call as we are matching
  // non-allocation callsites onto context nodes created from the allocation
  // call metadata / summary contexts.
  struct CallContextInfo {
    // The callsite we're trying to match.
    CallTy Call;
    // The callsites stack ids that have a context node in the graph.
    std::vector<uint64_t> StackIds;
    // The function containing this callsite.
    const FuncTy *Func;
    // Initially empty, if needed this will be updated to contain the context
    // ids for use in a new context node created for this callsite.
    DenseSet<uint32_t> ContextIds;
  };

  /// Helper to remove edge from graph, updating edge iterator if it is provided
  /// (in which case CalleeIter indicates which edge list is being iterated).
  /// This will also perform the necessary clearing of the ContextEdge members
  /// to enable later checking if the edge has been removed (since we may have
  /// other copies of the shared_ptr in existence, and in fact rely on this to
  /// enable removal while iterating over a copy of a node's edge list).
  void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
                           bool CalleeIter = true);

  /// Assigns the given Node to calls at or inlined into the location with
  /// the Node's stack id, after post order traversing and processing its
  /// caller nodes. Uses the call information recorded in the given
  /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
  /// as needed. Called by updateStackNodes which sets up the given
  /// StackIdToMatchingCalls map.
  void assignStackNodesPostOrder(
      ContextNode *Node, DenseSet<const ContextNode *> &Visited,
      DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
      DenseMap<CallInfo, CallInfo> &CallToMatchingCall);

  /// Duplicates the given set of context ids, updating the provided
  /// map from each original id with the newly generated context ids,
  /// and returning the new duplicated id set.
  DenseSet<uint32_t> duplicateContextIds(
      const DenseSet<uint32_t> &StackSequenceContextIds,
      DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);

  /// Propagates all duplicated context ids across the graph.
  void propagateDuplicateContextIds(
      const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);

  /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
  /// else to its callers. Also updates OrigNode's edges to remove any context
  /// ids moved to the newly created edge.
  void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
                      bool TowardsCallee,
                      DenseSet<uint32_t> RemainingContextIds);

  /// Get the stack id corresponding to the given Id or Index (for IR this will
  /// return itself, for a summary index this will return the id recorded in the
  /// index for that stack id index value).
  uint64_t getStackId(uint64_t IdOrIndex) const {
    return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
  }

  /// Returns true if the given call targets the callee of the given edge, or if
  /// we were able to identify the call chain through intermediate tail calls.
  /// In the latter case new context nodes are added to the graph for the
  /// identified tail calls, and their synthesized nodes are added to
  /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
  /// the updated edges and to prepare it for an increment in the caller.
  bool
  calleesMatch(CallTy Call, EdgeIter &EI,
               MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);

  // Return the callee function of the given call, or nullptr if it can't be
  // determined.
  const FuncTy *getCalleeFunc(CallTy Call) {
    return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
  }

  /// Returns true if the given call targets the given function, or if we were
  /// able to identify the call chain through intermediate tail calls (in which
  /// case FoundCalleeChain will be populated).
  bool calleeMatchesFunc(
      CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
      std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
    return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
        Call, Func, CallerFunc, FoundCalleeChain);
  }

  /// Returns true if both call instructions have the same callee.
  bool sameCallee(CallTy Call1, CallTy Call2) {
    return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
  }

  /// Get a list of nodes corresponding to the stack ids in the given
  /// callsite's context.
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
    return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
        Call);
  }

  /// Get the last stack id in the context for callsite.
  uint64_t getLastStackId(CallTy Call) {
    return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
  }

  /// Update the allocation call to record type of allocated memory.
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
    AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
    static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
  }

  /// Get the AllocationType assigned to the given allocation instruction clone.
  AllocationType getAllocationCallType(const CallInfo &Call) const {
    return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
  }

  /// Update non-allocation call to invoke (possibly cloned) function
  /// CalleeFunc.
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
    static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
  }

  /// Clone the given function for the given callsite, recording mapping of all
  /// of the function's tracked calls to their new versions in the CallMap.
  /// Assigns new clones to clone number CloneNo.
  FuncInfo cloneFunctionForCallsite(
      FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
      std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
    return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
        Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
  }

  /// Gets a label to use in the dot graph for the given call clone in the given
  /// function.
  std::string getLabel(const FuncTy *Func, const CallTy Call,
                       unsigned CloneNo) const {
    return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
  }

  // Create and return a new ContextNode.
  ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
                             CallInfo C = CallInfo()) {
    NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
    auto *NewNode = NodeOwner.back().get();
    if (F)
      NodeToCallingFunc[NewNode] = F;
    return NewNode;
  }

  /// Helpers to find the node corresponding to the given call or stackid.
  ContextNode *getNodeForInst(const CallInfo &C);
  ContextNode *getNodeForAlloc(const CallInfo &C);
  ContextNode *getNodeForStackId(uint64_t StackId);

  /// Computes the alloc type corresponding to the given context ids, by
  /// unioning their recorded alloc types.
  uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const;

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets), optimized for the case
  /// when Node1Ids is smaller than Node2Ids.
  uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
                                  const DenseSet<uint32_t> &Node2Ids) const;

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets).
  uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
                              const DenseSet<uint32_t> &Node2Ids) const;

  /// Create a clone of Edge's callee and move Edge to that new callee node,
  /// performing the necessary context id and allocation type updates.
  /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
  /// moved to an edge to the new callee.
  ContextNode *
  moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                           DenseSet<uint32_t> ContextIdsToMove = {});

  /// Change the callee of Edge to existing callee clone NewCallee, performing
  /// the necessary context id and allocation type updates.
  /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
  /// moved to an edge to the new callee.
  void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                                     ContextNode *NewCallee,
                                     bool NewClone = false,
                                     DenseSet<uint32_t> ContextIdsToMove = {});

  /// Change the caller of the edge at the given callee edge iterator to be
  /// NewCaller, performing the necessary context id and allocation type
  /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
  /// a simplified version of it as we always move the given edge and all of its
  /// context ids.
  void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
                                 ContextNode *NewCaller);

  /// Recursive helper for marking backedges via DFS.
  void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
                     DenseSet<const ContextNode *> &CurrentStack);

  /// Recursively perform cloning on the graph for the given Node and its
  /// callers, in order to uniquely identify the allocation behavior of an
  /// allocation given its context. The context ids of the allocation being
  /// processed are given in AllocContextIds.
  void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
                      const DenseSet<uint32_t> &AllocContextIds);

  /// Map from each context ID to the AllocationType assigned to that context.
  DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;

  /// Map from each contextID to the profiled full contexts and their total
  /// sizes (there may be more than one due to context trimming),
  /// optionally populated when requested (via MemProfReportHintedSizes or
  /// MinClonedColdBytePercent).
  DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;

  /// Identifies the context node created for a stack id when adding the MIB
  /// contexts to the graph. This is used to locate the context nodes when
  /// trying to assign the corresponding callsites with those stack ids to these
  /// nodes.
  DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;

  /// Maps to track the calls to their corresponding nodes in the graph.
  MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
  MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;

  /// Owner of all ContextNode unique_ptrs.
  std::vector<std::unique_ptr<ContextNode>> NodeOwner;

  /// Perform sanity checks on graph when requested.
  void check() const;

  /// Keeps track of the last unique context id assigned.
  unsigned int LastContextId = 0;
};

template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextNode =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextEdge =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using FuncInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using CallInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;

/// CRTP derived class for graphs built from IR (regular LTO).
class ModuleCallsiteContextGraph
    : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                                  Instruction *> {
public:
  ModuleCallsiteContextGraph(
      Module &M,
      llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);

private:
  friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                              Instruction *>;

  uint64_t getStackId(uint64_t IdOrIndex) const;
  const Function *getCalleeFunc(Instruction *Call);
  bool calleeMatchesFunc(
      Instruction *Call, const Function *Func, const Function *CallerFunc,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
  bool sameCallee(Instruction *Call1, Instruction *Call2);
  bool findProfiledCalleeThroughTailCalls(
      const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
      bool &FoundMultipleCalleeChains);
  uint64_t getLastStackId(Instruction *Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  AllocationType getAllocationCallType(const CallInfo &Call) const;
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                       Instruction *>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           std::map<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const Function *Func, const Instruction *Call,
                       unsigned CloneNo) const;

  const Module &Mod;
  llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
};

/// Represents a call in the summary index graph, which can either be an
/// allocation or an interior callsite node in an allocation's context.
/// Holds a pointer to the corresponding data structure in the index.
struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
  IndexCall() : PointerUnion() {}
  IndexCall(std::nullptr_t) : IndexCall() {}
  IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
  IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
  IndexCall(PointerUnion PT) : PointerUnion(PT) {}

  IndexCall *operator->() { return this; }

  void print(raw_ostream &OS) const {
    PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this;
    if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(Base)) {
      OS << *AI;
    } else {
      auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(Base);
      assert(CI);
      OS << *CI;
    }
  }
};
} // namespace

namespace llvm {
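// Allow IndexCall to be used with the cast machinery (isa/dyn_cast) by
// simplifying it to the underlying PointerUnion of summary data structures.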
template <> struct simplify_type<IndexCall> {
  using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>;
  static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
};
template <> struct simplify_type<const IndexCall> {
  using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>;
  static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
};
} // namespace llvm

namespace {
/// CRTP derived class for graphs built from summary index (ThinLTO).
class IndexCallsiteContextGraph
    : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                                  IndexCall> {
public:
  IndexCallsiteContextGraph(
      ModuleSummaryIndex &Index,
      llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
          isPrevailing);

  ~IndexCallsiteContextGraph() {
    // Now that we are done with the graph it is safe to add the new
    // CallsiteInfo structs to the function summary vectors. The graph nodes
    // point into locations within these vectors, so we don't want to add them
    // any earlier.
    for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
      auto *FS = I.first;
      for (auto &Callsite : I.second)
        FS->addCallsite(*Callsite.second);
    }
  }

private:
  friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                              IndexCall>;

  uint64_t getStackId(uint64_t IdOrIndex) const;
  const FunctionSummary *getCalleeFunc(IndexCall &Call);
  bool calleeMatchesFunc(
      IndexCall &Call, const FunctionSummary *Func,
      const FunctionSummary *CallerFunc,
      std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
  bool sameCallee(IndexCall &Call1, IndexCall &Call2);
  bool findProfiledCalleeThroughTailCalls(
      ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
      std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
      bool &FoundMultipleCalleeChains);
  uint64_t getLastStackId(IndexCall &Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  AllocationType getAllocationCallType(const CallInfo &Call) const;
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                       IndexCall>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           std::map<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
                       unsigned CloneNo) const;

  // Saves mapping from function summaries containing memprof records back to
  // their VI, for use in checking and debugging.
  std::map<const FunctionSummary *, ValueInfo> FSToVIMap;

  const ModuleSummaryIndex &Index;
  llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
      isPrevailing;

  // Saves/owns the callsite info structures synthesized for missing tail call
  // frames that we discover while building the graph.
  // It maps from the summary of the function making the tail call, to a map
  // of callee ValueInfo to corresponding synthesized callsite info.
  std::unordered_map<FunctionSummary *,
                     std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
      FunctionCalleesToSynthesizedCallsiteInfos;
};
} // namespace

namespace llvm {
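// DenseMapInfo specializations so that CallInfo and IndexCall can be used as
// keys in DenseMap and MapVector, reusing the underlying pair/PointerUnion
// implementations.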
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
    ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
    : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
    IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
    : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
template <>
struct DenseMapInfo<IndexCall>
    : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
} // end namespace llvm

namespace {

// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
// type we should actually use on the corresponding allocation.
// If we can't clone a node that has NotCold+Cold alloc type, we will fall
// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
// from NotCold.
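// For example, an edge or node carrying both NotCold and Cold contexts is
// treated as NotCold here, so it can share a (non-cold) clone with callers
// that only have NotCold contexts.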
AllocationType allocTypeToUse(uint8_t AllocTypes) {
  assert(AllocTypes != (uint8_t)AllocationType::None);
  if (AllocTypes ==
      ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
    return AllocationType::NotCold;
  else
    return (AllocationType)AllocTypes;
}

// Helper to check if the alloc types for all edges recorded in the
// InAllocTypes vector match the alloc types for all edges in the Edges
// vector.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool allocTypesMatch(
    const std::vector<uint8_t> &InAllocTypes,
    const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
        &Edges) {
  // This should be called only when the InAllocTypes vector was computed for
  // this set of Edges. Make sure the sizes are the same.
  assert(InAllocTypes.size() == Edges.size());
  return std::equal(
      InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
      [](const uint8_t &l,
         const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
        // Can share if one of the edges is None type - don't
        // care about the type along that edge as it doesn't
        // exist for those context ids.
        if (l == (uint8_t)AllocationType::None ||
            r->AllocTypes == (uint8_t)AllocationType::None)
          return true;
        return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
      });
}

// Helper to check if the alloc types for all edges recorded in the
// InAllocTypes vector match the alloc types for callee edges in the given
// clone. Because the InAllocTypes were computed from the original node's callee
// edges, and other cloning could have happened after this clone was created, we
// need to find the matching clone callee edge, which may or may not exist.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool allocTypesMatchClone(
    const std::vector<uint8_t> &InAllocTypes,
    const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
  const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
  assert(Node);
  // InAllocTypes should have been computed for the original node's callee
  // edges.
  assert(InAllocTypes.size() == Node->CalleeEdges.size());
  // First create a map of the clone callee edge callees to the edge alloc type.
  DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
      EdgeCalleeMap;
  for (const auto &E : Clone->CalleeEdges) {
    assert(!EdgeCalleeMap.contains(E->Callee));
    EdgeCalleeMap[E->Callee] = E->AllocTypes;
  }
  // Next, walk the original node's callees, and look for the corresponding
  // clone edge to that callee.
  for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
    auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
    // Not found is ok, we will simply add an edge if we use this clone.
    if (Iter == EdgeCalleeMap.end())
      continue;
    // Can share if one of the edges is None type - don't
    // care about the type along that edge as it doesn't
    // exist for those context ids.
    if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
        Iter->second == (uint8_t)AllocationType::None)
      continue;
    if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
      return false;
  }
  return true;
}

} // end anonymous namespace

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
    const CallInfo &C) {
  ContextNode *Node = getNodeForAlloc(C);
  if (Node)
    return Node;

  return NonAllocationCallToContextNodeMap.lookup(C);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
    const CallInfo &C) {
  return AllocationCallToContextNodeMap.lookup(C);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
    uint64_t StackId) {
  auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
  if (StackEntryNode != StackEntryIdToContextNodeMap.end())
    return StackEntryNode->second;
  return nullptr;
}
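
// Adds a caller edge from this node to Caller carrying the given context id,
// or, if an edge to Caller already exists, merges the allocation type and
// context id into that edge.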
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
                          unsigned int ContextId) {
  for (auto &Edge : CallerEdges) {
    if (Edge->Caller == Caller) {
      Edge->AllocTypes |= (uint8_t)AllocType;
      Edge->getContextIds().insert(ContextId);
      return;
    }
  }
  std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
      this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
  CallerEdges.push_back(Edge);
  Caller->CalleeEdges.push_back(Edge);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
    ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
  assert(!EI || (*EI)->get() == Edge);
  assert(!Edge->isRemoved());
  // Save the Caller and Callee pointers so we can erase Edge from their edge
  // lists after clearing Edge below. We do the clearing first in case it is
  // destructed after removing from the edge lists (if those were the last
  // shared_ptr references to Edge).
  auto *Callee = Edge->Callee;
  auto *Caller = Edge->Caller;

  // Make sure the edge fields are cleared out so we can properly detect
  // removed edges if Edge is not destructed because there is still a shared_ptr
  // reference.
  Edge->clear();

#ifndef NDEBUG
  auto CalleeCallerCount = Callee->CallerEdges.size();
  auto CallerCalleeCount = Caller->CalleeEdges.size();
#endif
  if (!EI) {
    Callee->eraseCallerEdge(Edge);
    Caller->eraseCalleeEdge(Edge);
  } else if (CalleeIter) {
    Callee->eraseCallerEdge(Edge);
    *EI = Caller->CalleeEdges.erase(*EI);
  } else {
    Caller->eraseCalleeEdge(Edge);
    *EI = Callee->CallerEdges.erase(*EI);
  }
  assert(Callee->CallerEdges.size() < CalleeCallerCount);
  assert(Caller->CalleeEdges.size() < CallerCalleeCount);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<
    DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
  for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
    auto Edge = *EI;
    if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
      assert(Edge->ContextIds.empty());
      removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
    } else
      ++EI;
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<
    DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
  for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
    auto Edge = *EI;
    if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
      assert(Edge->ContextIds.empty());
      Edge->Caller->eraseCalleeEdge(Edge.get());
      EI = Node->CallerEdges.erase(EI);
    } else
      ++EI;
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    findEdgeFromCallee(const ContextNode *Callee) {
  for (const auto &Edge : CalleeEdges)
    if (Edge->Callee == Callee)
      return Edge.get();
  return nullptr;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    findEdgeFromCaller(const ContextNode *Caller) {
  for (const auto &Edge : CallerEdges)
    if (Edge->Caller == Caller)
      return Edge.get();
  return nullptr;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    eraseCalleeEdge(const ContextEdge *Edge) {
  auto EI = llvm::find_if(
      CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
        return CalleeEdge.get() == Edge;
      });
  assert(EI != CalleeEdges.end());
  CalleeEdges.erase(EI);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    eraseCallerEdge(const ContextEdge *Edge) {
  auto EI = llvm::find_if(
      CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
        return CallerEdge.get() == Edge;
      });
  assert(EI != CallerEdges.end());
  CallerEdges.erase(EI);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
    DenseSet<uint32_t> &ContextIds) const {
  uint8_t BothTypes =
      (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
  uint8_t AllocType = (uint8_t)AllocationType::None;
  for (auto Id : ContextIds) {
    AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
    // Bail early if alloc type reached both, no further refinement.
    if (AllocType == BothTypes)
      return AllocType;
  }
  return AllocType;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
    const DenseSet<uint32_t> &Node1Ids,
    const DenseSet<uint32_t> &Node2Ids) const {
  uint8_t BothTypes =
      (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
  uint8_t AllocType = (uint8_t)AllocationType::None;
  for (auto Id : Node1Ids) {
    if (!Node2Ids.count(Id))
      continue;
    AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
    // Bail early if alloc type reached both, no further refinement.
    if (AllocType == BothTypes)
      return AllocType;
  }
  return AllocType;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
    const DenseSet<uint32_t> &Node1Ids,
    const DenseSet<uint32_t> &Node2Ids) const {
  if (Node1Ids.size() < Node2Ids.size())
    return intersectAllocTypesImpl(Node1Ids, Node2Ids);
  else
    return intersectAllocTypesImpl(Node2Ids, Node1Ids);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
    CallInfo Call, const FuncTy *F) {
  assert(!getNodeForAlloc(Call));
  ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call);
  AllocationCallToContextNodeMap[Call] = AllocNode;
  // Use LastContextId as a unique id for MIB allocation nodes.
  AllocNode->OrigStackOrAllocId = LastContextId;
  // Alloc type should be updated as we add in the MIBs. We should assert
  // afterwards that it is not still None.
  AllocNode->AllocTypes = (uint8_t)AllocationType::None;

  return AllocNode;
}

static std::string getAllocTypeString(uint8_t AllocTypes) {
  if (!AllocTypes)
    return "None";
  std::string Str;
  if (AllocTypes & (uint8_t)AllocationType::NotCold)
    Str += "NotCold";
  if (AllocTypes & (uint8_t)AllocationType::Cold)
    Str += "Cold";
  return Str;
}

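// Assigns a new context id for this MIB (incrementing LastContextId), records
// its allocation type, then walks the MIB's stack ids (skipping any shared
// with the allocation's own callsite context, i.e. frames inlined into the
// allocation call), creating or reusing a ContextNode per stack id and linking
// each one to the previous node with a caller edge carrying the new id.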
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
template <class NodeT, class IteratorT>
|
|
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
|
|
ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
|
|
CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
|
|
ArrayRef<ContextTotalSize> ContextSizeInfo) {
|
|
// Treating the hot alloc type as NotCold before the disambiguation for "hot"
|
|
// is done.
|
|
if (AllocType == AllocationType::Hot)
|
|
AllocType = AllocationType::NotCold;
|
|
|
|
ContextIdToAllocationType[++LastContextId] = AllocType;
|
|
|
|
if (!ContextSizeInfo.empty()) {
|
|
auto &Entry = ContextIdToContextSizeInfos[LastContextId];
|
|
Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
|
|
}
|
|
|
|
// Update alloc type and context ids for this MIB.
|
|
AllocNode->AllocTypes |= (uint8_t)AllocType;
|
|
|
|
// Now add or update nodes for each stack id in alloc's context.
|
|
// Later when processing the stack ids on non-alloc callsites we will adjust
|
|
// for any inlining in the context.
|
|
ContextNode *PrevNode = AllocNode;
|
|
// Look for recursion (direct recursion should have been collapsed by
|
|
// module summary analysis, here we should just be detecting mutual
|
|
// recursion). Mark these nodes so we don't try to clone.
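  // For example, a mutually recursive chain A -> B -> A produces the same
  // stack id twice within a single context; the repeated id is detected via
  // StackIdSet below and marks the node Recursive (unless
  // AllowRecursiveCallsites is enabled).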
|
|
SmallSet<uint64_t, 8> StackIdSet;
|
|
  // Skip any stack ids that are already on the allocation call itself (i.e.
  // due to inlining), by starting after the shared prefix with the callsite
  // context.
|
|
for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
|
|
ContextIter != StackContext.end(); ++ContextIter) {
|
|
auto StackId = getStackId(*ContextIter);
|
|
ContextNode *StackNode = getNodeForStackId(StackId);
|
|
if (!StackNode) {
|
|
StackNode = createNewNode(/*IsAllocation=*/false);
|
|
StackEntryIdToContextNodeMap[StackId] = StackNode;
|
|
StackNode->OrigStackOrAllocId = StackId;
|
|
}
|
|
// Marking a node recursive will prevent its cloning completely, even for
|
|
// non-recursive contexts flowing through it.
|
|
if (!AllowRecursiveCallsites) {
|
|
auto Ins = StackIdSet.insert(StackId);
|
|
if (!Ins.second)
|
|
StackNode->Recursive = true;
|
|
}
|
|
StackNode->AllocTypes |= (uint8_t)AllocType;
|
|
PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
|
|
PrevNode = StackNode;
|
|
}
|
|
}
|
|
|
|
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
DenseSet<uint32_t>
|
|
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
|
|
const DenseSet<uint32_t> &StackSequenceContextIds,
|
|
DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
|
|
DenseSet<uint32_t> NewContextIds;
|
|
for (auto OldId : StackSequenceContextIds) {
|
|
NewContextIds.insert(++LastContextId);
|
|
OldToNewContextIds[OldId].insert(LastContextId);
|
|
assert(ContextIdToAllocationType.count(OldId));
|
|
// The new context has the same allocation type as original.
|
|
ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
|
|
if (DotAllocContextIds.contains(OldId))
|
|
DotAllocContextIds.insert(LastContextId);
|
|
}
|
|
return NewContextIds;
|
|
}
|
|
|
|
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
|
|
propagateDuplicateContextIds(
|
|
const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
|
|
// Build a set of duplicated context ids corresponding to the input id set.
|
|
auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
|
|
DenseSet<uint32_t> NewIds;
|
|
for (auto Id : ContextIds)
|
|
if (auto NewId = OldToNewContextIds.find(Id);
|
|
NewId != OldToNewContextIds.end())
|
|
NewIds.insert_range(NewId->second);
|
|
return NewIds;
|
|
};
|
|
|
|
// Recursively update context ids sets along caller edges.
|
|
auto UpdateCallers = [&](ContextNode *Node,
|
|
DenseSet<const ContextEdge *> &Visited,
|
|
auto &&UpdateCallers) -> void {
|
|
for (const auto &Edge : Node->CallerEdges) {
|
|
auto Inserted = Visited.insert(Edge.get());
|
|
if (!Inserted.second)
|
|
continue;
|
|
ContextNode *NextNode = Edge->Caller;
|
|
DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
|
|
// Only need to recursively iterate to NextNode via this caller edge if
|
|
// it resulted in any added ids to NextNode.
|
|
if (!NewIdsToAdd.empty()) {
|
|
Edge->getContextIds().insert_range(NewIdsToAdd);
|
|
UpdateCallers(NextNode, Visited, UpdateCallers);
|
|
}
|
|
}
|
|
};
|
|
|
|
DenseSet<const ContextEdge *> Visited;
|
|
for (auto &Entry : AllocationCallToContextNodeMap) {
|
|
auto *Node = Entry.second;
|
|
UpdateCallers(Node, Visited, UpdateCallers);
|
|
}
|
|
}
|
|
|
|
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
|
|
ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
|
|
// This must be passed by value to make a copy since it will be adjusted
|
|
// as ids are moved.
|
|
DenseSet<uint32_t> RemainingContextIds) {
|
|
auto &OrigEdges =
|
|
TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
|
|
DenseSet<uint32_t> RecursiveContextIds;
|
|
DenseSet<uint32_t> AllCallerContextIds;
|
|
if (AllowRecursiveCallsites) {
|
|
    // Identify which context ids are recursive, which is needed to properly
    // update the RemainingContextIds set. The relevant recursive context ids
    // are those that appear in multiple edges.
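    // For example, a context id that appears on more than one of OrigNode's
    // edges must correspond to a recursive cycle passing through this node.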
|
|
for (auto &CE : OrigEdges) {
|
|
AllCallerContextIds.reserve(CE->getContextIds().size());
|
|
for (auto Id : CE->getContextIds())
|
|
if (!AllCallerContextIds.insert(Id).second)
|
|
RecursiveContextIds.insert(Id);
|
|
}
|
|
}
|
|
// Increment iterator in loop so that we can remove edges as needed.
|
|
for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
|
|
auto Edge = *EI;
|
|
DenseSet<uint32_t> NewEdgeContextIds;
|
|
DenseSet<uint32_t> NotFoundContextIds;
|
|
    // Remove any matching context ids from Edge, returning the set that were
    // found and removed; these become the new edge's context ids. Also update
    // the remaining (not found) ids.
|
|
set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
|
|
NotFoundContextIds);
|
|
// Update the remaining context ids set for the later edges. This is a
|
|
// compile time optimization.
|
|
if (RecursiveContextIds.empty()) {
|
|
// No recursive ids, so all of the previously remaining context ids that
|
|
// were not seen on this edge are the new remaining set.
|
|
RemainingContextIds.swap(NotFoundContextIds);
|
|
} else {
|
|
// Keep the recursive ids in the remaining set as we expect to see those
|
|
// on another edge. We can remove the non-recursive remaining ids that
|
|
// were seen on this edge, however. We already have the set of remaining
|
|
// ids that were on this edge (in NewEdgeContextIds). Figure out which are
|
|
// non-recursive and only remove those. Note that despite the higher
|
|
// overhead of updating the remaining context ids set when recursion
|
|
// handling is enabled, it was found to be at worst performance neutral
|
|
// and in one case a clear win.
|
|
DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
|
|
set_difference(NewEdgeContextIds, RecursiveContextIds);
|
|
set_subtract(RemainingContextIds, NonRecursiveRemainingCurEdgeIds);
|
|
}
|
|
// If no matching context ids for this edge, skip it.
|
|
if (NewEdgeContextIds.empty()) {
|
|
++EI;
|
|
continue;
|
|
}
|
|
if (TowardsCallee) {
|
|
uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
|
|
auto NewEdge = std::make_shared<ContextEdge>(
|
|
Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
|
|
NewNode->CalleeEdges.push_back(NewEdge);
|
|
NewEdge->Callee->CallerEdges.push_back(NewEdge);
|
|
} else {
|
|
uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
|
|
auto NewEdge = std::make_shared<ContextEdge>(
|
|
NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
|
|
NewNode->CallerEdges.push_back(NewEdge);
|
|
NewEdge->Caller->CalleeEdges.push_back(NewEdge);
|
|
}
|
|
// Remove old edge if context ids empty.
|
|
if (Edge->getContextIds().empty()) {
|
|
removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee);
|
|
continue;
|
|
}
|
|
++EI;
|
|
}
|
|
}
|
|
|
|
template <typename DerivedCCG, typename FuncTy, typename CallTy>
static void checkEdge(
    const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
  // Confirm that alloc type is not None and that we have at least one context
  // id.
  assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
  assert(!Edge->ContextIds.empty());
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
|
|
bool CheckEdges = true) {
|
|
if (Node->isRemoved())
|
|
return;
|
|
#ifndef NDEBUG
|
|
// Compute node's context ids once for use in asserts.
|
|
auto NodeContextIds = Node->getContextIds();
|
|
#endif
|
|
// Node's context ids should be the union of both its callee and caller edge
|
|
// context ids.
|
|
if (Node->CallerEdges.size()) {
|
|
DenseSet<uint32_t> CallerEdgeContextIds(
|
|
Node->CallerEdges.front()->ContextIds);
|
|
for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
|
|
if (CheckEdges)
|
|
checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
|
|
set_union(CallerEdgeContextIds, Edge->ContextIds);
|
|
}
|
|
// Node can have more context ids than callers if some contexts terminate at
|
|
// node and some are longer. If we are allowing recursive callsites and
|
|
// contexts this will be violated for incompletely cloned recursive cycles,
|
|
// so skip the checking in that case.
|
|
assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
|
|
NodeContextIds == CallerEdgeContextIds ||
|
|
set_is_subset(CallerEdgeContextIds, NodeContextIds));
|
|
}
|
|
if (Node->CalleeEdges.size()) {
|
|
DenseSet<uint32_t> CalleeEdgeContextIds(
|
|
Node->CalleeEdges.front()->ContextIds);
|
|
for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
|
|
if (CheckEdges)
|
|
checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
|
|
set_union(CalleeEdgeContextIds, Edge->getContextIds());
|
|
}
|
|
// If we are allowing recursive callsites and contexts this will be violated
|
|
// for incompletely cloned recursive cycles, so skip the checking in that
|
|
// case.
|
|
assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
|
|
NodeContextIds == CalleeEdgeContextIds);
|
|
}
|
|
// FIXME: Since this checking is only invoked under an option, we should
|
|
// change the error checking from using assert to something that will trigger
|
|
// an error on a release build.
|
|
#ifndef NDEBUG
|
|
// Make sure we don't end up with duplicate edges between the same caller and
|
|
// callee.
|
|
DenseSet<ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet;
|
|
for (const auto &E : Node->CalleeEdges)
|
|
NodeSet.insert(E->Callee);
|
|
assert(NodeSet.size() == Node->CalleeEdges.size());
|
|
#endif
|
|
}
|
|
|
|
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
|
|
assignStackNodesPostOrder(
|
|
ContextNode *Node, DenseSet<const ContextNode *> &Visited,
|
|
DenseMap<uint64_t, std::vector<CallContextInfo>>
|
|
&StackIdToMatchingCalls,
|
|
DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
|
|
auto Inserted = Visited.insert(Node);
|
|
if (!Inserted.second)
|
|
return;
|
|
// Post order traversal. Iterate over a copy since we may add nodes and
|
|
// therefore new callers during the recursive call, invalidating any
|
|
// iterator over the original edge vector. We don't need to process these
|
|
// new nodes as they were already processed on creation.
|
|
auto CallerEdges = Node->CallerEdges;
|
|
for (auto &Edge : CallerEdges) {
|
|
// Skip any that have been removed during the recursion.
|
|
if (Edge->isRemoved()) {
|
|
assert(!is_contained(Node->CallerEdges, Edge));
|
|
continue;
|
|
}
|
|
assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
|
|
CallToMatchingCall);
|
|
}
|
|
|
|
// If this node's stack id is in the map, update the graph to contain new
|
|
// nodes representing any inlining at interior callsites. Note we move the
|
|
// associated context ids over to the new nodes.
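  // For example, a callsite that was inlined through several frames records
  // one stack id per frame; a single new node representing that inlined
  // sequence takes over the matched context ids from the original per-frame
  // stack nodes handled below.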
|
|
|
|
// Ignore this node if it is for an allocation or we didn't record any
|
|
// stack id lists ending at it.
|
|
if (Node->IsAllocation ||
|
|
!StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
|
|
return;
|
|
|
|
auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
|
|
// Handle the simple case first. A single call with a single stack id.
|
|
// In this case there is no need to create any new context nodes, simply
|
|
// assign the context node for stack id to this Call.
|
|
if (Calls.size() == 1) {
|
|
auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
|
|
if (Ids.size() == 1) {
|
|
assert(SavedContextIds.empty());
|
|
// It should be this Node
|
|
assert(Node == getNodeForStackId(Ids[0]));
|
|
if (Node->Recursive)
|
|
return;
|
|
Node->setCall(Call);
|
|
NonAllocationCallToContextNodeMap[Call] = Node;
|
|
NodeToCallingFunc[Node] = Func;
|
|
return;
|
|
}
|
|
}
|
|
|
|
#ifndef NDEBUG
|
|
// Find the node for the last stack id, which should be the same
|
|
// across all calls recorded for this id, and is this node's id.
|
|
uint64_t LastId = Node->OrigStackOrAllocId;
|
|
ContextNode *LastNode = getNodeForStackId(LastId);
|
|
// We should only have kept stack ids that had nodes.
|
|
assert(LastNode);
|
|
assert(LastNode == Node);
|
|
#else
|
|
ContextNode *LastNode = Node;
|
|
#endif
|
|
|
|
// Compute the last node's context ids once, as it is shared by all calls in
|
|
// this entry.
|
|
DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
|
|
|
|
bool PrevIterCreatedNode = false;
|
|
bool CreatedNode = false;
|
|
for (unsigned I = 0; I < Calls.size();
|
|
I++, PrevIterCreatedNode = CreatedNode) {
|
|
CreatedNode = false;
|
|
auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
|
|
// Skip any for which we didn't assign any ids, these don't get a node in
|
|
// the graph.
|
|
if (SavedContextIds.empty()) {
|
|
// If this call has a matching call (located in the same function and
|
|
// having the same stack ids), simply add it to the context node created
|
|
// for its matching call earlier. These can be treated the same through
|
|
// cloning and get updated at the same time.
|
|
if (!CallToMatchingCall.contains(Call))
|
|
continue;
|
|
auto MatchingCall = CallToMatchingCall[Call];
|
|
if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
|
|
// This should only happen if we had a prior iteration, and it didn't
|
|
// create a node because of the below recomputation of context ids
|
|
// finding none remaining and continuing early.
|
|
assert(I > 0 && !PrevIterCreatedNode);
|
|
continue;
|
|
}
|
|
NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
|
|
Call);
|
|
continue;
|
|
}
|
|
|
|
assert(LastId == Ids.back());
|
|
|
|
// Recompute the context ids for this stack id sequence (the
|
|
// intersection of the context ids of the corresponding nodes).
|
|
    // Start with the ids we saved in the map for this call, which could be
    // duplicated context ids. We have to recompute as we might have had
    // overlap between the saved context ids for different last nodes, and
    // removed them already during the post order traversal.
|
|
set_intersect(SavedContextIds, LastNodeContextIds);
|
|
ContextNode *PrevNode = LastNode;
|
|
bool Skip = false;
|
|
// Iterate backwards through the stack Ids, starting after the last Id
|
|
// in the list, which was handled once outside for all Calls.
|
|
for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
|
|
auto Id = *IdIter;
|
|
ContextNode *CurNode = getNodeForStackId(Id);
|
|
// We should only have kept stack ids that had nodes and weren't
|
|
// recursive.
|
|
assert(CurNode);
|
|
assert(!CurNode->Recursive);
|
|
|
|
auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
|
|
if (!Edge) {
|
|
Skip = true;
|
|
break;
|
|
}
|
|
PrevNode = CurNode;
|
|
|
|
// Update the context ids, which is the intersection of the ids along
|
|
// all edges in the sequence.
|
|
set_intersect(SavedContextIds, Edge->getContextIds());
|
|
|
|
// If we now have no context ids for clone, skip this call.
|
|
if (SavedContextIds.empty()) {
|
|
Skip = true;
|
|
break;
|
|
}
|
|
}
|
|
if (Skip)
|
|
continue;
|
|
|
|
// Create new context node.
|
|
ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call);
|
|
NonAllocationCallToContextNodeMap[Call] = NewNode;
|
|
CreatedNode = true;
|
|
NewNode->AllocTypes = computeAllocType(SavedContextIds);
|
|
|
|
ContextNode *FirstNode = getNodeForStackId(Ids[0]);
|
|
assert(FirstNode);
|
|
|
|
    // Connect to callees of innermost stack frame in inlined call chain.
    // This updates context ids for FirstNode's callees to reflect those
    // moved to NewNode.
|
|
connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
|
|
|
|
    // Connect to callers of outermost stack frame in inlined call chain.
    // This updates context ids for LastNode's callers to reflect those
    // moved to NewNode.
|
|
connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
|
|
|
|
// Now we need to remove context ids from edges/nodes between First and
|
|
// Last Node.
|
|
PrevNode = nullptr;
|
|
for (auto Id : Ids) {
|
|
ContextNode *CurNode = getNodeForStackId(Id);
|
|
// We should only have kept stack ids that had nodes.
|
|
assert(CurNode);
|
|
|
|
// Remove the context ids moved to NewNode from CurNode, and the
|
|
// edge from the prior node.
|
|
if (PrevNode) {
|
|
auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
|
|
assert(PrevEdge);
|
|
set_subtract(PrevEdge->getContextIds(), SavedContextIds);
|
|
if (PrevEdge->getContextIds().empty())
|
|
removeEdgeFromGraph(PrevEdge);
|
|
}
|
|
// Since we update the edges from leaf to tail, only look at the callee
|
|
// edges. This isn't an alloc node, so if there are no callee edges, the
|
|
// alloc type is None.
|
|
CurNode->AllocTypes = CurNode->CalleeEdges.empty()
|
|
? (uint8_t)AllocationType::None
|
|
: CurNode->computeAllocType();
|
|
PrevNode = CurNode;
|
|
}
|
|
if (VerifyNodes) {
|
|
checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
|
|
for (auto Id : Ids) {
|
|
ContextNode *CurNode = getNodeForStackId(Id);
|
|
// We should only have kept stack ids that had nodes.
|
|
assert(CurNode);
|
|
checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
|
|
// Map of stack id to all calls with that as the last (outermost caller)
|
|
// callsite id that has a context node (some might not due to pruning
|
|
// performed during matching of the allocation profile contexts).
|
|
// The CallContextInfo contains the Call and a list of its stack ids with
|
|
// ContextNodes, the function containing Call, and the set of context ids
|
|
// the analysis will eventually identify for use in any new node created
|
|
// for that callsite.
|
|
DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
|
|
for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
|
|
for (auto &Call : CallsWithMetadata) {
|
|
// Ignore allocations, already handled.
|
|
if (AllocationCallToContextNodeMap.count(Call))
|
|
continue;
|
|
auto StackIdsWithContextNodes =
|
|
getStackIdsWithContextNodesForCall(Call.call());
|
|
// If there were no nodes created for MIBs on allocs (maybe this was in
|
|
// the unambiguous part of the MIB stack that was pruned), ignore.
|
|
if (StackIdsWithContextNodes.empty())
|
|
continue;
|
|
// Otherwise, record this Call along with the list of ids for the last
|
|
// (outermost caller) stack id with a node.
|
|
StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
|
|
{Call.call(), StackIdsWithContextNodes, Func, {}});
|
|
}
|
|
}
|
|
|
|
// First make a pass through all stack ids that correspond to a call,
|
|
// as identified in the above loop. Compute the context ids corresponding to
|
|
  // each of these calls when they correspond to multiple stack ids due to
  // inlining. Perform any duplication of context ids required when
|
|
// there is more than one call with the same stack ids. Their (possibly newly
|
|
// duplicated) context ids are saved in the StackIdToMatchingCalls map.
|
|
DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
|
|
// Save a map from each call to any that are found to match it. I.e. located
|
|
// in the same function and have the same (possibly pruned) stack ids. We use
|
|
// this to avoid creating extra graph nodes as they can be treated the same.
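  // For example, two calls in the same function that share identical
  // (possibly pruned) stack id sequences are recorded here, and the second
  // later simply becomes a MatchingCall of the context node created for the
  // first.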
|
|
DenseMap<CallInfo, CallInfo> CallToMatchingCall;
|
|
for (auto &It : StackIdToMatchingCalls) {
|
|
auto &Calls = It.getSecond();
|
|
// Skip single calls with a single stack id. These don't need a new node.
|
|
if (Calls.size() == 1) {
|
|
auto &Ids = Calls[0].StackIds;
|
|
if (Ids.size() == 1)
|
|
continue;
|
|
}
|
|
// In order to do the best and maximal matching of inlined calls to context
|
|
// node sequences we will sort the vectors of stack ids in descending order
|
|
// of length, and within each length, lexicographically by stack id. The
|
|
// latter is so that we can specially handle calls that have identical stack
|
|
// id sequences (either due to cloning or artificially because of the MIB
|
|
// context pruning). Those with the same Ids are then sorted by function to
|
|
// facilitate efficiently mapping them to the same context node.
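    // For example, stack id sequences {1, 2, 3}, {1, 4} and {1, 2} would be
    // ordered {1, 2, 3}, {1, 2}, {1, 4}: longest first, then lexicographically
    // by stack id within the same length.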
|
|
// Because the functions are pointers, to ensure a stable sort first assign
|
|
// each function pointer to its first index in the Calls array, and then use
|
|
// that to sort by.
|
|
DenseMap<const FuncTy *, unsigned> FuncToIndex;
|
|
for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
|
|
FuncToIndex.insert({CallCtxInfo.Func, Idx});
|
|
std::stable_sort(
|
|
Calls.begin(), Calls.end(),
|
|
[&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
|
|
return A.StackIds.size() > B.StackIds.size() ||
|
|
(A.StackIds.size() == B.StackIds.size() &&
|
|
(A.StackIds < B.StackIds ||
|
|
(A.StackIds == B.StackIds &&
|
|
FuncToIndex[A.Func] < FuncToIndex[B.Func])));
|
|
});
|
|
|
|
// Find the node for the last stack id, which should be the same
|
|
// across all calls recorded for this id, and is the id for this
|
|
// entry in the StackIdToMatchingCalls map.
|
|
uint64_t LastId = It.getFirst();
|
|
ContextNode *LastNode = getNodeForStackId(LastId);
|
|
// We should only have kept stack ids that had nodes.
|
|
assert(LastNode);
|
|
|
|
if (LastNode->Recursive)
|
|
continue;
|
|
|
|
// Initialize the context ids with the last node's. We will subsequently
|
|
// refine the context ids by computing the intersection along all edges.
|
|
DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
|
|
assert(!LastNodeContextIds.empty());
|
|
|
|
#ifndef NDEBUG
|
|
// Save the set of functions seen for a particular set of the same stack
|
|
// ids. This is used to ensure that they have been correctly sorted to be
|
|
// adjacent in the Calls list, since we rely on that to efficiently place
|
|
// all such matching calls onto the same context node.
|
|
DenseSet<const FuncTy *> MatchingIdsFuncSet;
|
|
#endif
|
|
|
|
for (unsigned I = 0; I < Calls.size(); I++) {
|
|
auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
|
|
assert(SavedContextIds.empty());
|
|
assert(LastId == Ids.back());
|
|
|
|
#ifndef NDEBUG
|
|
// If this call has a different set of ids than the last one, clear the
|
|
// set used to ensure they are sorted properly.
|
|
if (I > 0 && Ids != Calls[I - 1].StackIds)
|
|
MatchingIdsFuncSet.clear();
|
|
#endif
|
|
|
|
// First compute the context ids for this stack id sequence (the
|
|
// intersection of the context ids of the corresponding nodes).
|
|
// Start with the remaining saved ids for the last node.
|
|
assert(!LastNodeContextIds.empty());
|
|
DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
|
|
|
|
ContextNode *PrevNode = LastNode;
|
|
ContextNode *CurNode = LastNode;
|
|
bool Skip = false;
|
|
|
|
// Iterate backwards through the stack Ids, starting after the last Id
|
|
// in the list, which was handled once outside for all Calls.
|
|
for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
|
|
auto Id = *IdIter;
|
|
CurNode = getNodeForStackId(Id);
|
|
// We should only have kept stack ids that had nodes.
|
|
assert(CurNode);
|
|
|
|
if (CurNode->Recursive) {
|
|
Skip = true;
|
|
break;
|
|
}
|
|
|
|
auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
|
|
// If there is no edge then the nodes belong to different MIB contexts,
|
|
// and we should skip this inlined context sequence. For example, this
|
|
// particular inlined context may include stack ids A->B, and we may
|
|
// indeed have nodes for both A and B, but it is possible that they were
|
|
// never profiled in sequence in a single MIB for any allocation (i.e.
|
|
// we might have profiled an allocation that involves the callsite A,
|
|
// but through a different one of its callee callsites, and we might
|
|
// have profiled an allocation that involves callsite B, but reached
|
|
// from a different caller callsite).
|
|
if (!Edge) {
|
|
Skip = true;
|
|
break;
|
|
}
|
|
PrevNode = CurNode;
|
|
|
|
// Update the context ids, which is the intersection of the ids along
|
|
// all edges in the sequence.
|
|
set_intersect(StackSequenceContextIds, Edge->getContextIds());
|
|
|
|
// If we now have no context ids for clone, skip this call.
|
|
if (StackSequenceContextIds.empty()) {
|
|
Skip = true;
|
|
break;
|
|
}
|
|
}
|
|
if (Skip)
|
|
continue;
|
|
|
|
// If some of this call's stack ids did not have corresponding nodes (due
|
|
// to pruning), don't include any context ids for contexts that extend
|
|
// beyond these nodes. Otherwise we would be matching part of unrelated /
|
|
// not fully matching stack contexts. To do this, subtract any context ids
|
|
// found in caller nodes of the last node found above.
|
|
if (Ids.back() != getLastStackId(Call)) {
|
|
for (const auto &PE : LastNode->CallerEdges) {
|
|
set_subtract(StackSequenceContextIds, PE->getContextIds());
|
|
if (StackSequenceContextIds.empty())
|
|
break;
|
|
}
|
|
// If we now have no context ids for clone, skip this call.
|
|
if (StackSequenceContextIds.empty())
|
|
continue;
|
|
}
|
|
|
|
#ifndef NDEBUG
|
|
// If the prior call had the same stack ids this set would not be empty.
|
|
// Check if we already have a call that "matches" because it is located
|
|
// in the same function. If the Calls list was sorted properly we should
|
|
// not encounter this situation as all such entries should be adjacent
|
|
// and processed in bulk further below.
|
|
assert(!MatchingIdsFuncSet.contains(Func));
|
|
|
|
MatchingIdsFuncSet.insert(Func);
|
|
#endif
|
|
|
|
// Check if the next set of stack ids is the same (since the Calls vector
|
|
// of tuples is sorted by the stack ids we can just look at the next one).
|
|
// If so, save them in the CallToMatchingCall map so that they get
|
|
// assigned to the same context node, and skip them.
|
|
bool DuplicateContextIds = false;
|
|
for (unsigned J = I + 1; J < Calls.size(); J++) {
|
|
auto &CallCtxInfo = Calls[J];
|
|
auto &NextIds = CallCtxInfo.StackIds;
|
|
if (NextIds != Ids)
|
|
break;
|
|
auto *NextFunc = CallCtxInfo.Func;
|
|
if (NextFunc != Func) {
|
|
// We have another Call with the same ids but that cannot share this
|
|
// node, must duplicate ids for it.
|
|
DuplicateContextIds = true;
|
|
break;
|
|
}
|
|
auto &NextCall = CallCtxInfo.Call;
|
|
CallToMatchingCall[NextCall] = Call;
|
|
// Update I so that it gets incremented correctly to skip this call.
|
|
I = J;
|
|
}
|
|
|
|
// If we don't have duplicate context ids, then we can assign all the
|
|
// context ids computed for the original node sequence to this call.
|
|
// If there are duplicate calls with the same stack ids then we synthesize
|
|
// new context ids that are duplicates of the originals. These are
|
|
// assigned to SavedContextIds, which is a reference into the map entry
|
|
// for this call, allowing us to access these ids later on.
|
|
OldToNewContextIds.reserve(OldToNewContextIds.size() +
|
|
StackSequenceContextIds.size());
|
|
SavedContextIds =
|
|
DuplicateContextIds
|
|
? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
|
|
: StackSequenceContextIds;
|
|
assert(!SavedContextIds.empty());
|
|
|
|
if (!DuplicateContextIds) {
|
|
// Update saved last node's context ids to remove those that are
|
|
// assigned to other calls, so that it is ready for the next call at
|
|
// this stack id.
|
|
set_subtract(LastNodeContextIds, StackSequenceContextIds);
|
|
if (LastNodeContextIds.empty())
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Propagate the duplicate context ids over the graph.
|
|
propagateDuplicateContextIds(OldToNewContextIds);
|
|
|
|
if (VerifyCCG)
|
|
check();
|
|
|
|
// Now perform a post-order traversal over the graph, starting with the
|
|
// allocation nodes, essentially processing nodes from callers to callees.
|
|
// For any that contains an id in the map, update the graph to contain new
|
|
// nodes representing any inlining at interior callsites. Note we move the
|
|
// associated context ids over to the new nodes.
|
|
DenseSet<const ContextNode *> Visited;
|
|
for (auto &Entry : AllocationCallToContextNodeMap)
|
|
assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
|
|
CallToMatchingCall);
|
|
if (VerifyCCG)
|
|
check();
|
|
}
|
|
|
|
uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
  CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
      Call->getMetadata(LLVMContext::MD_callsite));
  return CallsiteContext.back();
}

uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
  assert(isa<CallsiteInfo *>(Call));
  CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
      CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
  // Need to convert index into stack id.
  return Index.getStackIdAtIndex(CallsiteContext.back());
}

static const std::string MemProfCloneSuffix = ".memprof.";

static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
  // We use CloneNo == 0 to refer to the original version, which doesn't get
  // renamed with a suffix.
  if (!CloneNo)
    return Base.str();
  return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
}

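// For example, clone 1 of a function "foo" is named "foo.memprof.1" and is
// recognized as a clone by isMemProfClone() below, while clone 0 keeps the
// original name "foo".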
static bool isMemProfClone(const Function &F) {
  return F.getName().contains(MemProfCloneSuffix);
}

std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
|
|
const Instruction *Call,
|
|
unsigned CloneNo) const {
|
|
return (Twine(Call->getFunction()->getName()) + " -> " +
|
|
cast<CallBase>(Call)->getCalledFunction()->getName())
|
|
.str();
|
|
}
|
|
|
|
std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
|
|
const IndexCall &Call,
|
|
unsigned CloneNo) const {
|
|
auto VI = FSToVIMap.find(Func);
|
|
assert(VI != FSToVIMap.end());
|
|
if (isa<AllocInfo *>(Call))
|
|
return (VI->second.name() + " -> alloc").str();
|
|
else {
|
|
auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call);
|
|
return (VI->second.name() + " -> " +
|
|
getMemProfFuncName(Callsite->Callee.name(),
|
|
Callsite->Clones[CloneNo]))
|
|
.str();
|
|
}
|
|
}
|
|
|
|
std::vector<uint64_t>
|
|
ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
|
|
Instruction *Call) {
|
|
CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
|
|
Call->getMetadata(LLVMContext::MD_callsite));
|
|
return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
|
|
CallsiteContext);
|
|
}
|
|
|
|
std::vector<uint64_t>
|
|
IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
|
|
assert(isa<CallsiteInfo *>(Call));
|
|
CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
|
|
CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
|
|
return getStackIdsWithContextNodes<CallsiteInfo,
|
|
SmallVector<unsigned>::const_iterator>(
|
|
CallsiteContext);
|
|
}
|
|
|
|
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
template <class NodeT, class IteratorT>
|
|
std::vector<uint64_t>
|
|
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
|
|
CallStack<NodeT, IteratorT> &CallsiteContext) {
|
|
std::vector<uint64_t> StackIds;
|
|
for (auto IdOrIndex : CallsiteContext) {
|
|
auto StackId = getStackId(IdOrIndex);
|
|
ContextNode *Node = getNodeForStackId(StackId);
|
|
if (!Node)
|
|
break;
|
|
StackIds.push_back(StackId);
|
|
}
|
|
return StackIds;
|
|
}
|
|
|
|
ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
|
|
Module &M,
|
|
llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
|
|
: Mod(M), OREGetter(OREGetter) {
|
|
for (auto &F : M) {
|
|
std::vector<CallInfo> CallsWithMetadata;
|
|
for (auto &BB : F) {
|
|
for (auto &I : BB) {
|
|
if (!isa<CallBase>(I))
|
|
continue;
|
|
if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
|
|
CallsWithMetadata.push_back(&I);
|
|
auto *AllocNode = addAllocNode(&I, &F);
|
|
auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
|
|
assert(CallsiteMD);
|
|
CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
|
|
// Add all of the MIBs and their stack nodes.
|
|
for (auto &MDOp : MemProfMD->operands()) {
|
|
auto *MIBMD = cast<const MDNode>(MDOp);
|
|
std::vector<ContextTotalSize> ContextSizeInfo;
|
|
// Collect the context size information if it exists.
|
|
if (MIBMD->getNumOperands() > 2) {
|
|
for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
|
|
MDNode *ContextSizePair =
|
|
dyn_cast<MDNode>(MIBMD->getOperand(I));
|
|
assert(ContextSizePair->getNumOperands() == 2);
|
|
uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
|
|
ContextSizePair->getOperand(0))
|
|
->getZExtValue();
|
|
uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
|
|
ContextSizePair->getOperand(1))
|
|
->getZExtValue();
|
|
ContextSizeInfo.push_back({FullStackId, TotalSize});
|
|
}
|
|
}
|
|
MDNode *StackNode = getMIBStackNode(MIBMD);
|
|
assert(StackNode);
|
|
CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
|
|
addStackNodesForMIB<MDNode, MDNode::op_iterator>(
|
|
AllocNode, StackContext, CallsiteContext,
|
|
getMIBAllocType(MIBMD), ContextSizeInfo);
|
|
}
|
|
// If exporting the graph to dot and an allocation id of interest was
|
|
// specified, record all the context ids for this allocation node.
|
|
if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
|
|
DotAllocContextIds = AllocNode->getContextIds();
|
|
assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
|
|
// Memprof and callsite metadata on memory allocations no longer
|
|
// needed.
|
|
I.setMetadata(LLVMContext::MD_memprof, nullptr);
|
|
I.setMetadata(LLVMContext::MD_callsite, nullptr);
|
|
}
|
|
// For callsite metadata, add to list for this function for later use.
|
|
else if (I.getMetadata(LLVMContext::MD_callsite)) {
|
|
CallsWithMetadata.push_back(&I);
|
|
}
|
|
}
|
|
}
|
|
if (!CallsWithMetadata.empty())
|
|
FuncToCallsWithMetadata[&F] = CallsWithMetadata;
|
|
}
|
|
|
|
if (DumpCCG) {
|
|
dbgs() << "CCG before updating call stack chains:\n";
|
|
dbgs() << *this;
|
|
}
|
|
|
|
if (ExportToDot)
|
|
exportToDot("prestackupdate");
|
|
|
|
updateStackNodes();
|
|
|
|
handleCallsitesWithMultipleTargets();
|
|
|
|
markBackedges();
|
|
|
|
// Strip off remaining callsite metadata, no longer needed.
|
|
for (auto &FuncEntry : FuncToCallsWithMetadata)
|
|
for (auto &Call : FuncEntry.second)
|
|
Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
|
|
}
|
|
|
|
IndexCallsiteContextGraph::IndexCallsiteContextGraph(
|
|
ModuleSummaryIndex &Index,
|
|
llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
|
|
isPrevailing)
|
|
: Index(Index), isPrevailing(isPrevailing) {
|
|
for (auto &I : Index) {
|
|
auto VI = Index.getValueInfo(I);
|
|
for (auto &S : VI.getSummaryList()) {
|
|
// We should only add the prevailing nodes. Otherwise we may try to clone
|
|
// in a weak copy that won't be linked (and may be different than the
|
|
// prevailing version).
|
|
// We only keep the memprof summary on the prevailing copy now when
|
|
// building the combined index, as a space optimization, however don't
|
|
// rely on this optimization. The linker doesn't resolve local linkage
|
|
// values so don't check whether those are prevailing.
|
|
if (!GlobalValue::isLocalLinkage(S->linkage()) &&
|
|
!isPrevailing(VI.getGUID(), S.get()))
|
|
continue;
|
|
auto *FS = dyn_cast<FunctionSummary>(S.get());
|
|
if (!FS)
|
|
continue;
|
|
std::vector<CallInfo> CallsWithMetadata;
|
|
if (!FS->allocs().empty()) {
|
|
for (auto &AN : FS->mutableAllocs()) {
|
|
// This can happen because of recursion elimination handling that
|
|
// currently exists in ModuleSummaryAnalysis. Skip these for now.
|
|
// We still added them to the summary because we need to be able to
|
|
// correlate properly in applyImport in the backends.
|
|
if (AN.MIBs.empty())
|
|
continue;
|
|
IndexCall AllocCall(&AN);
|
|
CallsWithMetadata.push_back(AllocCall);
|
|
auto *AllocNode = addAllocNode(AllocCall, FS);
|
|
// Pass an empty CallStack to the CallsiteContext (second)
|
|
// parameter, since for ThinLTO we already collapsed out the inlined
|
|
// stack ids on the allocation call during ModuleSummaryAnalysis.
|
|
CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
|
|
EmptyContext;
|
|
unsigned I = 0;
|
|
assert(
|
|
(!MemProfReportHintedSizes && MinClonedColdBytePercent >= 100) ||
|
|
AN.ContextSizeInfos.size() == AN.MIBs.size());
|
|
// Now add all of the MIBs and their stack nodes.
|
|
for (auto &MIB : AN.MIBs) {
|
|
CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
|
|
StackContext(&MIB);
|
|
std::vector<ContextTotalSize> ContextSizeInfo;
|
|
if (!AN.ContextSizeInfos.empty()) {
|
|
for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
|
|
ContextSizeInfo.push_back({FullStackId, TotalSize});
|
|
}
|
|
addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
|
|
AllocNode, StackContext, EmptyContext, MIB.AllocType,
|
|
ContextSizeInfo);
|
|
I++;
|
|
}
|
|
// If exporting the graph to dot and an allocation id of interest was
|
|
// specified, record all the context ids for this allocation node.
|
|
if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
|
|
DotAllocContextIds = AllocNode->getContextIds();
|
|
assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
|
|
          // Initialize version 0 on the summary alloc node to the current alloc
          // type, unless it has both types in which case make it default, so
          // that in the case where we aren't able to clone, the original version
          // always ends up with the default allocation behavior.
|
|
AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
|
|
}
|
|
}
|
|
// For callsite metadata, add to list for this function for later use.
|
|
if (!FS->callsites().empty())
|
|
for (auto &SN : FS->mutableCallsites()) {
|
|
IndexCall StackNodeCall(&SN);
|
|
CallsWithMetadata.push_back(StackNodeCall);
|
|
}
|
|
|
|
if (!CallsWithMetadata.empty())
|
|
FuncToCallsWithMetadata[FS] = CallsWithMetadata;
|
|
|
|
if (!FS->allocs().empty() || !FS->callsites().empty())
|
|
FSToVIMap[FS] = VI;
|
|
}
|
|
}
|
|
|
|
if (DumpCCG) {
|
|
dbgs() << "CCG before updating call stack chains:\n";
|
|
dbgs() << *this;
|
|
}
|
|
|
|
if (ExportToDot)
|
|
exportToDot("prestackupdate");
|
|
|
|
updateStackNodes();
|
|
|
|
handleCallsitesWithMultipleTargets();
|
|
|
|
markBackedges();
|
|
}
|
|
|
|
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
void CallsiteContextGraph<DerivedCCG, FuncTy,
|
|
CallTy>::handleCallsitesWithMultipleTargets() {
|
|
  // Look for and work around callsites that call multiple functions.
  // This can happen for indirect calls, which need better handling, and in
  // rarer cases (e.g. macro expansion).
|
|
// TODO: To fix this for indirect calls we will want to perform speculative
|
|
// devirtualization using either the normal PGO info with ICP, or using the
|
|
// information in the profiled MemProf contexts. We can do this prior to
|
|
// this transformation for regular LTO, and for ThinLTO we can simulate that
|
|
// effect in the summary and perform the actual speculative devirtualization
|
|
// while cloning in the ThinLTO backend.
|
|
|
|
// Keep track of the new nodes synthesized for discovered tail calls missing
|
|
// from the profiled contexts.
|
|
MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
|
|
|
|
std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
|
|
for (auto &Entry : NonAllocationCallToContextNodeMap) {
|
|
auto *Node = Entry.second;
|
|
assert(Node->Clones.empty());
|
|
// Check all node callees and see if in the same function.
|
|
// We need to check all of the calls recorded in this Node, because in some
|
|
// cases we may have had multiple calls with the same debug info calling
|
|
// different callees. This can happen, for example, when an object is
|
|
    // constructed in the parameter list - the destructor call of the object has
|
|
// the same debug info (line/col) as the call the object was passed to.
|
|
// Here we will prune any that don't match all callee nodes.
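    // (The pruning happens below: calls that don't match every callee node are
    // dropped from MatchingCalls, and if no call matches at all the node's
    // call is cleared so that it is ignored during cloning.)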
|
|
std::vector<CallInfo> AllCalls;
|
|
AllCalls.reserve(Node->MatchingCalls.size() + 1);
|
|
AllCalls.push_back(Node->Call);
|
|
llvm::append_range(AllCalls, Node->MatchingCalls);
|
|
|
|
// First see if we can partition the calls by callee function, creating new
|
|
// nodes to host each set of calls calling the same callees. This is
|
|
    // necessary to support indirect calls with ThinLTO, for which we
|
|
// synthesized CallsiteInfo records for each target. They will all have the
|
|
// same callsite stack ids and would be sharing a context node at this
|
|
// point. We need to perform separate cloning for each, which will be
|
|
// applied along with speculative devirtualization in the ThinLTO backends
|
|
// as needed. Note this does not currently support looking through tail
|
|
// calls, it is unclear if we need that for indirect call targets.
|
|
// First partition calls by callee func. Map indexed by func, value is
|
|
// struct with list of matching calls, assigned node.
|
|
if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
|
|
continue;
|
|
|
|
auto It = AllCalls.begin();
|
|
// Iterate through the calls until we find the first that matches.
|
|
for (; It != AllCalls.end(); ++It) {
|
|
auto ThisCall = *It;
|
|
bool Match = true;
|
|
for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
|
|
++EI) {
|
|
auto Edge = *EI;
|
|
if (!Edge->Callee->hasCall())
|
|
continue;
|
|
assert(NodeToCallingFunc.count(Edge->Callee));
|
|
// Check if the called function matches that of the callee node.
|
|
if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) {
|
|
Match = false;
|
|
break;
|
|
}
|
|
}
|
|
// Found a call that matches the callee nodes, we can quit now.
|
|
if (Match) {
|
|
// If the first match is not the primary call on the Node, update it
|
|
// now. We will update the list of matching calls further below.
|
|
if (Node->Call != ThisCall) {
|
|
Node->setCall(ThisCall);
|
|
// We need to update the NonAllocationCallToContextNodeMap, but don't
|
|
// want to do this during iteration over that map, so save the calls
|
|
// that need updated entries.
|
|
NewCallToNode.push_back({ThisCall, Node});
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
// We will update this list below (or leave it cleared if there was no
|
|
// match found above).
|
|
Node->MatchingCalls.clear();
|
|
// If we hit the end of the AllCalls vector, no call matching the callee
|
|
// nodes was found, clear the call information in the node.
|
|
if (It == AllCalls.end()) {
|
|
RemovedEdgesWithMismatchedCallees++;
|
|
// Work around by setting Node to have a null call, so it gets
|
|
// skipped during cloning. Otherwise assignFunctions will assert
|
|
// because its data structures are not designed to handle this case.
|
|
Node->setCall(CallInfo());
|
|
continue;
|
|
}
|
|
// Now add back any matching calls that call the same function as the
|
|
// matching primary call on Node.
|
|
for (++It; It != AllCalls.end(); ++It) {
|
|
auto ThisCall = *It;
|
|
if (!sameCallee(Node->Call.call(), ThisCall.call()))
|
|
continue;
|
|
Node->MatchingCalls.push_back(ThisCall);
|
|
}
|
|
}
|
|
|
|
// Remove all mismatched nodes identified in the above loop from the node map
|
|
// (checking whether they have a null call which is set above). For a
|
|
// MapVector like NonAllocationCallToContextNodeMap it is much more efficient
|
|
// to do the removal via remove_if than by individually erasing entries above.
|
|
// Also remove any entries if we updated the node's primary call above.
|
|
NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
|
|
return !it.second->hasCall() || it.second->Call != it.first;
|
|
});
|
|
|
|
// Add entries for any new primary calls recorded above.
|
|
for (auto &[Call, Node] : NewCallToNode)
|
|
NonAllocationCallToContextNodeMap[Call] = Node;
|
|
|
|
// Add the new nodes after the above loop so that the iteration is not
|
|
// invalidated.
|
|
for (auto &[Call, Node] : TailCallToContextNodeMap)
|
|
NonAllocationCallToContextNodeMap[Call] = Node;
|
|
}
|
|
|
|
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
|
|
ContextNode *Node, ArrayRef<CallInfo> AllCalls,
|
|
std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
|
|
// Struct to keep track of all the calls having the same callee function,
|
|
// and the node we eventually assign to them. Eventually we will record the
|
|
// context node assigned to this group of calls.
|
|
struct CallsWithSameCallee {
|
|
std::vector<CallInfo> Calls;
|
|
ContextNode *Node = nullptr;
|
|
};
|
|
|
|
// First partition calls by callee function. Build map from each function
|
|
// to the list of matching calls.
|
|
DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo;
|
|
for (auto ThisCall : AllCalls) {
|
|
auto *F = getCalleeFunc(ThisCall.call());
|
|
if (F)
|
|
CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
|
|
}
|
|
|
|
// Next, walk through all callee edges. For each callee node, get its
|
|
// containing function and see if it was recorded in the above map (meaning we
|
|
// have at least one matching call). Build another map from each callee node
|
|
// with a matching call to the structure instance created above containing all
|
|
// the calls.
|
|
DenseMap<ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo;
|
|
for (const auto &Edge : Node->CalleeEdges) {
|
|
if (!Edge->Callee->hasCall())
|
|
continue;
|
|
const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
|
|
if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
|
|
CalleeNodeToCallInfo[Edge->Callee] =
|
|
&CalleeFuncToCallInfo[ProfiledCalleeFunc];
|
|
}
|
|
|
|
  // If there are no entries in the second map, then there were no matching
  // calls/callees; nothing to do here. Return so we can go to the handling
  // that looks through tail calls.
|
|
if (CalleeNodeToCallInfo.empty())
|
|
return false;
|
|
|
|
// Walk through all callee edges again. Any and all callee edges that didn't
|
|
// match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
|
|
// new caller node (UnmatchedCalleesNode) which gets a null call so that it is
|
|
// ignored during cloning. If it is in the map, then we use the node recorded
|
|
// in that entry (creating it if needed), and move the callee edge to it.
|
|
// The first callee will use the original node instead of creating a new one.
|
|
// Note that any of the original calls on this node (in AllCalls) that didn't
|
|
// have a callee function automatically get dropped from the node as part of
|
|
// this process.
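  // For example, if this node's calls are {c1 -> F, c2 -> G} and it has callee
  // edges into both F and G, the first group encountered (say c1 with the F
  // edge) keeps the original node, while c2 and the G edge are moved to a
  // newly created node.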
|
|
ContextNode *UnmatchedCalleesNode = nullptr;
|
|
// Track whether we already assigned original node to a callee.
|
|
bool UsedOrigNode = false;
|
|
assert(NodeToCallingFunc[Node]);
|
|
// Iterate over a copy of Node's callee edges, since we may need to remove
|
|
// edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and
|
|
// makes it less error-prone.
|
|
auto CalleeEdges = Node->CalleeEdges;
|
|
for (auto &Edge : CalleeEdges) {
|
|
if (!Edge->Callee->hasCall())
|
|
continue;
|
|
|
|
// Will be updated below to point to whatever (caller) node this callee edge
|
|
// should be moved to.
|
|
ContextNode *CallerNodeToUse = nullptr;
|
|
|
|
// Handle the case where there were no matching calls first. Move this
|
|
// callee edge to the UnmatchedCalleesNode, creating it if needed.
|
|
if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
|
|
if (!UnmatchedCalleesNode)
|
|
UnmatchedCalleesNode =
|
|
createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
|
|
CallerNodeToUse = UnmatchedCalleesNode;
|
|
} else {
|
|
// Look up the information recorded for this callee node, and use the
|
|
// recorded caller node (creating it if needed).
|
|
auto *Info = CalleeNodeToCallInfo[Edge->Callee];
|
|
if (!Info->Node) {
|
|
// If we haven't assigned any callees to the original node use it.
|
|
if (!UsedOrigNode) {
|
|
Info->Node = Node;
|
|
// Clear the set of matching calls which will be updated below.
|
|
Node->MatchingCalls.clear();
|
|
UsedOrigNode = true;
|
|
} else
|
|
Info->Node =
|
|
createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
|
|
assert(!Info->Calls.empty());
|
|
// The first call becomes the primary call for this caller node, and the
|
|
// rest go in the matching calls list.
|
|
Info->Node->setCall(Info->Calls.front());
|
|
llvm::append_range(Info->Node->MatchingCalls,
|
|
llvm::drop_begin(Info->Calls));
|
|
// Save the primary call to node correspondence so that we can update
|
|
// the NonAllocationCallToContextNodeMap, which is being iterated in the
|
|
// caller of this function.
|
|
NewCallToNode.push_back({Info->Node->Call, Info->Node});
|
|
}
|
|
CallerNodeToUse = Info->Node;
|
|
}
|
|
|
|
    // Don't need to move the edge if we are using the original node.
|
|
if (CallerNodeToUse == Node)
|
|
continue;
|
|
|
|
moveCalleeEdgeToNewCaller(Edge, CallerNodeToUse);
|
|
}
|
|
// Now that we are done moving edges, clean up any caller edges that ended
|
|
// up with no type or context ids. During moveCalleeEdgeToNewCaller all
|
|
// caller edges from Node are replicated onto the new callers, and it
|
|
// simplifies the handling to leave them until we have moved all
|
|
// edges/context ids.
|
|
for (auto &I : CalleeNodeToCallInfo)
|
|
removeNoneTypeCallerEdges(I.second->Node);
|
|
if (UnmatchedCalleesNode)
|
|
removeNoneTypeCallerEdges(UnmatchedCalleesNode);
|
|
removeNoneTypeCallerEdges(Node);
|
|
|
|
return true;
|
|
}
|
|
|
|
uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
  // In the Module (IR) case this is already the Id.
  return IdOrIndex;
}

uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
  // In the Index case this is an index into the stack id list in the summary
  // index, convert it to an Id.
  return Index.getStackIdAtIndex(IdOrIndex);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
|
|
CallTy Call, EdgeIter &EI,
|
|
MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
|
|
auto Edge = *EI;
|
|
const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
|
|
const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
|
|
// Will be populated in order of callee to caller if we find a chain of tail
|
|
// calls between the profiled caller and callee.
|
|
std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
|
|
if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
|
|
FoundCalleeChain))
|
|
return false;
|
|
|
|
// The usual case where the profiled callee matches that of the IR/summary.
|
|
if (FoundCalleeChain.empty())
|
|
return true;
|
|
|
|
auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
|
|
auto *CurEdge = Callee->findEdgeFromCaller(Caller);
|
|
// If there is already an edge between these nodes, simply update it and
|
|
// return.
|
|
if (CurEdge) {
|
|
CurEdge->ContextIds.insert_range(Edge->ContextIds);
|
|
CurEdge->AllocTypes |= Edge->AllocTypes;
|
|
return;
|
|
}
|
|
// Otherwise, create a new edge and insert it into the caller and callee
|
|
// lists.
|
|
auto NewEdge = std::make_shared<ContextEdge>(
|
|
Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
|
|
Callee->CallerEdges.push_back(NewEdge);
|
|
if (Caller == Edge->Caller) {
|
|
// If we are inserting the new edge into the current edge's caller, insert
|
|
// the new edge before the current iterator position, and then increment
|
|
// back to the current edge.
|
|
EI = Caller->CalleeEdges.insert(EI, NewEdge);
|
|
++EI;
|
|
assert(*EI == Edge &&
|
|
"Iterator position not restored after insert and increment");
|
|
} else
|
|
Caller->CalleeEdges.push_back(NewEdge);
|
|
};
|
|
|
|
// Create new nodes for each found callee and connect in between the profiled
|
|
// caller and callee.
|
|
auto *CurCalleeNode = Edge->Callee;
|
|
for (auto &[NewCall, Func] : FoundCalleeChain) {
|
|
ContextNode *NewNode = nullptr;
|
|
// First check if we have already synthesized a node for this tail call.
|
|
if (TailCallToContextNodeMap.count(NewCall)) {
|
|
NewNode = TailCallToContextNodeMap[NewCall];
|
|
NewNode->AllocTypes |= Edge->AllocTypes;
|
|
} else {
|
|
FuncToCallsWithMetadata[Func].push_back({NewCall});
|
|
// Create Node and record node info.
|
|
NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall);
|
|
TailCallToContextNodeMap[NewCall] = NewNode;
|
|
NewNode->AllocTypes = Edge->AllocTypes;
|
|
}
|
|
|
|
// Hook up node to its callee node
|
|
AddEdge(NewNode, CurCalleeNode);
|
|
|
|
CurCalleeNode = NewNode;
|
|
}
|
|
|
|
// Hook up edge's original caller to new callee node.
|
|
AddEdge(Edge->Caller, CurCalleeNode);
|
|
|
|
#ifndef NDEBUG
|
|
// Save this because Edge's fields get cleared below when removed.
|
|
auto *Caller = Edge->Caller;
|
|
#endif
|
|
|
|
// Remove old edge
|
|
removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
|
|
|
|
// To simplify the increment of EI in the caller, subtract one from EI.
|
|
// In the final AddEdge call we would have either added a new callee edge,
|
|
// to Edge->Caller, or found an existing one. Either way we are guaranteed
|
|
// that there is at least one callee edge.
|
|
assert(!Caller->CalleeEdges.empty());
|
|
--EI;
|
|
|
|
return true;
|
|
}
|
|
|
|
bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
|
|
const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
|
|
std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
|
|
bool &FoundMultipleCalleeChains) {
|
|
// Stop recursive search if we have already explored the maximum specified
|
|
// depth.
|
|
if (Depth > TailCallSearchDepth)
|
|
return false;
|
|
|
|
auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
|
|
FoundCalleeChain.push_back({Callsite, F});
|
|
};
|
|
|
|
auto *CalleeFunc = dyn_cast<Function>(CurCallee);
|
|
if (!CalleeFunc) {
|
|
auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
|
|
assert(Alias);
|
|
CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
|
|
assert(CalleeFunc);
|
|
}
|
|
|
|
// Look for tail calls in this function, and check if they either call the
|
|
// profiled callee directly, or indirectly (via a recursive search).
|
|
// Only succeed if there is a single unique tail call chain found between the
|
|
// profiled caller and callee, otherwise we could perform incorrect cloning.
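  // For example, if the profile recorded caller C invoking the allocating
  // function F, but in the IR C actually calls T which tail calls F, the
  // chain through T is recorded in FoundCalleeChain so that the caller
  // (calleesMatch) can synthesize the missing callsite node for T.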
|
|
bool FoundSingleCalleeChain = false;
|
|
for (auto &BB : *CalleeFunc) {
|
|
for (auto &I : BB) {
|
|
auto *CB = dyn_cast<CallBase>(&I);
|
|
if (!CB || !CB->isTailCall())
|
|
continue;
|
|
auto *CalledValue = CB->getCalledOperand();
|
|
auto *CalledFunction = CB->getCalledFunction();
|
|
if (CalledValue && !CalledFunction) {
|
|
CalledValue = CalledValue->stripPointerCasts();
|
|
// Stripping pointer casts can reveal a called function.
|
|
CalledFunction = dyn_cast<Function>(CalledValue);
|
|
}
|
|
// Check if this is an alias to a function. If so, get the
|
|
// called aliasee for the checks below.
|
|
if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
|
|
assert(!CalledFunction &&
|
|
"Expected null called function in callsite for alias");
|
|
CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
|
|
}
|
|
if (!CalledFunction)
|
|
continue;
|
|
if (CalledFunction == ProfiledCallee) {
|
|
if (FoundSingleCalleeChain) {
|
|
FoundMultipleCalleeChains = true;
|
|
return false;
|
|
}
|
|
FoundSingleCalleeChain = true;
|
|
FoundProfiledCalleeCount++;
|
|
FoundProfiledCalleeDepth += Depth;
|
|
if (Depth > FoundProfiledCalleeMaxDepth)
|
|
FoundProfiledCalleeMaxDepth = Depth;
|
|
SaveCallsiteInfo(&I, CalleeFunc);
|
|
} else if (findProfiledCalleeThroughTailCalls(
|
|
ProfiledCallee, CalledFunction, Depth + 1,
|
|
FoundCalleeChain, FoundMultipleCalleeChains)) {
|
|
// findProfiledCalleeThroughTailCalls should not have returned
|
|
// true if FoundMultipleCalleeChains.
|
|
assert(!FoundMultipleCalleeChains);
|
|
if (FoundSingleCalleeChain) {
|
|
FoundMultipleCalleeChains = true;
|
|
return false;
|
|
}
|
|
FoundSingleCalleeChain = true;
|
|
SaveCallsiteInfo(&I, CalleeFunc);
|
|
} else if (FoundMultipleCalleeChains)
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return FoundSingleCalleeChain;
|
|
}
|
|
|
|
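// Return the Function called by this instruction, looking through pointer
// casts and aliases. Returns nullptr for indirect calls or when the callee
// cannot be resolved to a Function.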
const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
|
|
auto *CB = dyn_cast<CallBase>(Call);
|
|
if (!CB->getCalledOperand() || CB->isIndirectCall())
|
|
return nullptr;
|
|
auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
|
|
auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
|
|
if (Alias)
|
|
return dyn_cast<Function>(Alias->getAliasee());
|
|
return dyn_cast<Function>(CalleeVal);
|
|
}
|
|
|
|
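// Check whether this IR call actually calls Func, either directly, via an
// alias to Func, or indirectly through a unique chain of tail calls. Any
// discovered tail call chain is recorded in FoundCalleeChain so the caller
// can fix up the graph with the intermediate callsites.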
bool ModuleCallsiteContextGraph::calleeMatchesFunc(
|
|
Instruction *Call, const Function *Func, const Function *CallerFunc,
|
|
std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
|
|
auto *CB = dyn_cast<CallBase>(Call);
|
|
if (!CB->getCalledOperand() || CB->isIndirectCall())
|
|
return false;
|
|
auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
|
|
auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
|
|
if (CalleeFunc == Func)
|
|
return true;
|
|
auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
|
|
if (Alias && Alias->getAliasee() == Func)
|
|
return true;
|
|
|
|
// Recursively search for the profiled callee through tail calls starting with
|
|
// the actual Callee. The discovered tail call chain is saved in
|
|
// FoundCalleeChain, and we will fixup the graph to include these callsites
|
|
// after returning.
|
|
// FIXME: We will currently redo the same recursive walk if we find the same
|
|
// mismatched callee from another callsite. We can improve this with more
|
|
// bookkeeping of the created chain of new nodes for each mismatch.
|
|
unsigned Depth = 1;
|
|
bool FoundMultipleCalleeChains = false;
|
|
if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
|
|
FoundCalleeChain,
|
|
FoundMultipleCalleeChains)) {
|
|
LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
|
|
<< Func->getName() << " from " << CallerFunc->getName()
|
|
<< " that actually called " << CalleeVal->getName()
|
|
<< (FoundMultipleCalleeChains
|
|
? " (found multiple possible chains)"
|
|
: "")
|
|
<< "\n");
|
|
if (FoundMultipleCalleeChains)
|
|
FoundProfiledCalleeNonUniquelyCount++;
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
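// Compare the callees of two direct calls by stripping pointer casts and
// comparing the resulting called Function pointers.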
bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
|
|
Instruction *Call2) {
|
|
auto *CB1 = cast<CallBase>(Call1);
|
|
if (!CB1->getCalledOperand() || CB1->isIndirectCall())
|
|
return false;
|
|
auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
|
|
auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1);
|
|
auto *CB2 = cast<CallBase>(Call2);
|
|
if (!CB2->getCalledOperand() || CB2->isIndirectCall())
|
|
return false;
|
|
auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
|
|
auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2);
|
|
return CalleeFunc1 == CalleeFunc2;
|
|
}
|
|
|
|
bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
|
|
ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
|
|
std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
|
|
bool &FoundMultipleCalleeChains) {
|
|
// Stop recursive search if we have already explored the maximum specified
|
|
// depth.
|
|
if (Depth > TailCallSearchDepth)
|
|
return false;
|
|
|
|
auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
|
|
// Make a CallsiteInfo for each discovered callee, if one hasn't already
|
|
// been synthesized.
|
|
if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
|
|
!FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
|
|
// StackIds is empty (we don't have debug info available in the index for
|
|
// these callsites)
|
|
FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
|
|
std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
|
|
CallsiteInfo *NewCallsiteInfo =
|
|
FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
|
|
FoundCalleeChain.push_back({NewCallsiteInfo, FS});
|
|
};
|
|
|
|
// Look for tail calls in this function, and check if they either call the
|
|
// profiled callee directly, or indirectly (via a recursive search).
|
|
// Only succeed if there is a single unique tail call chain found between the
|
|
// profiled caller and callee, otherwise we could perform incorrect cloning.
|
|
bool FoundSingleCalleeChain = false;
|
|
for (auto &S : CurCallee.getSummaryList()) {
|
|
if (!GlobalValue::isLocalLinkage(S->linkage()) &&
|
|
!isPrevailing(CurCallee.getGUID(), S.get()))
|
|
continue;
|
|
auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
|
|
if (!FS)
|
|
continue;
|
|
auto FSVI = CurCallee;
|
|
auto *AS = dyn_cast<AliasSummary>(S.get());
|
|
if (AS)
|
|
FSVI = AS->getAliaseeVI();
|
|
for (auto &CallEdge : FS->calls()) {
|
|
if (!CallEdge.second.hasTailCall())
|
|
continue;
|
|
if (CallEdge.first == ProfiledCallee) {
|
|
if (FoundSingleCalleeChain) {
|
|
FoundMultipleCalleeChains = true;
|
|
return false;
|
|
}
|
|
FoundSingleCalleeChain = true;
|
|
FoundProfiledCalleeCount++;
|
|
FoundProfiledCalleeDepth += Depth;
|
|
if (Depth > FoundProfiledCalleeMaxDepth)
|
|
FoundProfiledCalleeMaxDepth = Depth;
|
|
CreateAndSaveCallsiteInfo(CallEdge.first, FS);
|
|
// Add FS to FSToVIMap in case it isn't already there.
|
|
assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
|
|
FSToVIMap[FS] = FSVI;
|
|
} else if (findProfiledCalleeThroughTailCalls(
|
|
ProfiledCallee, CallEdge.first, Depth + 1,
|
|
FoundCalleeChain, FoundMultipleCalleeChains)) {
|
|
// findProfiledCalleeThroughTailCalls should not have returned
|
|
// true if FoundMultipleCalleeChains.
|
|
assert(!FoundMultipleCalleeChains);
|
|
if (FoundSingleCalleeChain) {
|
|
FoundMultipleCalleeChains = true;
|
|
return false;
|
|
}
|
|
FoundSingleCalleeChain = true;
|
|
CreateAndSaveCallsiteInfo(CallEdge.first, FS);
|
|
// Add FS to FSToVIMap in case it isn't already there.
|
|
assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
|
|
FSToVIMap[FS] = FSVI;
|
|
} else if (FoundMultipleCalleeChains)
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return FoundSingleCalleeChain;
|
|
}
|
|
|
|
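// Index version: the callee ValueInfo is recorded on the CallsiteInfo
// summary. Return the FunctionSummary from its first summary entry, or
// nullptr if the summary list is empty or that entry is not a function.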
const FunctionSummary *
|
|
IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
|
|
ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
|
|
if (Callee.getSummaryList().empty())
|
|
return nullptr;
|
|
return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
|
|
}
|
|
|
|
bool IndexCallsiteContextGraph::calleeMatchesFunc(
|
|
IndexCall &Call, const FunctionSummary *Func,
|
|
const FunctionSummary *CallerFunc,
|
|
std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
|
|
ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
|
|
// If there is no summary list then this is a call to an externally defined
|
|
// symbol.
|
|
AliasSummary *Alias =
|
|
Callee.getSummaryList().empty()
|
|
? nullptr
|
|
: dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
|
|
assert(FSToVIMap.count(Func));
|
|
auto FuncVI = FSToVIMap[Func];
|
|
if (Callee == FuncVI ||
|
|
// If callee is an alias, check the aliasee, since only function
|
|
// summary base objects will contain the stack node summaries and thus
|
|
// get a context node.
|
|
(Alias && Alias->getAliaseeVI() == FuncVI))
|
|
return true;
|
|
|
|
// Recursively search for the profiled callee through tail calls starting with
|
|
// the actual Callee. The discovered tail call chain is saved in
|
|
// FoundCalleeChain, and we will fixup the graph to include these callsites
|
|
// after returning.
|
|
// FIXME: We will currently redo the same recursive walk if we find the same
|
|
// mismatched callee from another callsite. We can improve this with more
|
|
// bookkeeping of the created chain of new nodes for each mismatch.
|
|
unsigned Depth = 1;
|
|
bool FoundMultipleCalleeChains = false;
|
|
if (!findProfiledCalleeThroughTailCalls(
|
|
FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
|
|
LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
|
|
<< " from " << FSToVIMap[CallerFunc]
|
|
<< " that actually called " << Callee
|
|
<< (FoundMultipleCalleeChains
|
|
? " (found multiple possible chains)"
|
|
: "")
|
|
<< "\n");
|
|
if (FoundMultipleCalleeChains)
|
|
FoundProfiledCalleeNonUniquelyCount++;
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
|
|
ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Call1)->Callee;
|
|
ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Call2)->Callee;
|
|
return Callee1 == Callee2;
|
|
}
|
|
|
|
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
|
|
const {
|
|
print(dbgs());
|
|
dbgs() << "\n";
|
|
}
|
|
|
|
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
|
|
raw_ostream &OS) const {
|
|
OS << "Node " << this << "\n";
|
|
OS << "\t";
|
|
printCall(OS);
|
|
if (Recursive)
|
|
OS << " (recursive)";
|
|
OS << "\n";
|
|
if (!MatchingCalls.empty()) {
|
|
OS << "\tMatchingCalls:\n";
|
|
for (auto &MatchingCall : MatchingCalls) {
|
|
OS << "\t";
|
|
MatchingCall.print(OS);
|
|
OS << "\n";
|
|
}
|
|
}
|
|
OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
|
|
OS << "\tContextIds:";
|
|
// Make a copy of the computed context ids that we can sort for stability.
|
|
auto ContextIds = getContextIds();
|
|
std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
|
|
std::sort(SortedIds.begin(), SortedIds.end());
|
|
for (auto Id : SortedIds)
|
|
OS << " " << Id;
|
|
OS << "\n";
|
|
OS << "\tCalleeEdges:\n";
|
|
for (auto &Edge : CalleeEdges)
|
|
OS << "\t\t" << *Edge << "\n";
|
|
OS << "\tCallerEdges:\n";
|
|
for (auto &Edge : CallerEdges)
|
|
OS << "\t\t" << *Edge << "\n";
|
|
if (!Clones.empty()) {
|
|
OS << "\tClones: ";
|
|
ListSeparator LS;
|
|
for (auto *Clone : Clones)
|
|
OS << LS << Clone;
|
|
OS << "\n";
|
|
} else if (CloneOf) {
|
|
OS << "\tClone of " << CloneOf << "\n";
|
|
}
|
|
}
|
|
|
|
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
|
|
const {
|
|
print(dbgs());
|
|
dbgs() << "\n";
|
|
}
|
|
|
|
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
|
|
raw_ostream &OS) const {
|
|
OS << "Edge from Callee " << Callee << " to Caller: " << Caller
|
|
<< (IsBackedge ? " (BE)" : "")
|
|
<< " AllocTypes: " << getAllocTypeString(AllocTypes);
|
|
OS << " ContextIds:";
|
|
std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
|
|
std::sort(SortedIds.begin(), SortedIds.end());
|
|
for (auto Id : SortedIds)
|
|
OS << " " << Id;
|
|
}
|
|
|
|
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
|
|
print(dbgs());
|
|
}
|
|
|
|
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
|
|
raw_ostream &OS) const {
|
|
OS << "Callsite Context Graph:\n";
|
|
using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
|
|
for (const auto Node : nodes<GraphType>(this)) {
|
|
if (Node->isRemoved())
|
|
continue;
|
|
Node->print(OS);
|
|
OS << "\n";
|
|
}
|
|
}
|
|
|
|
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
|
|
raw_ostream &OS) const {
|
|
using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
|
|
for (const auto Node : nodes<GraphType>(this)) {
|
|
if (Node->isRemoved())
|
|
continue;
|
|
if (!Node->IsAllocation)
|
|
continue;
|
|
DenseSet<uint32_t> ContextIds = Node->getContextIds();
|
|
auto AllocTypeFromCall = getAllocationCallType(Node->Call);
|
|
std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
|
|
std::sort(SortedIds.begin(), SortedIds.end());
|
|
for (auto Id : SortedIds) {
|
|
auto TypeI = ContextIdToAllocationType.find(Id);
|
|
assert(TypeI != ContextIdToAllocationType.end());
|
|
auto CSI = ContextIdToContextSizeInfos.find(Id);
|
|
if (CSI != ContextIdToContextSizeInfos.end()) {
|
|
for (auto &Info : CSI->second) {
|
|
OS << "MemProf hinting: "
|
|
<< getAllocTypeString((uint8_t)TypeI->second)
|
|
<< " full allocation context " << Info.FullStackId
|
|
<< " with total size " << Info.TotalSize << " is "
|
|
<< getAllocTypeString(Node->AllocTypes) << " after cloning";
|
|
if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
|
|
OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
|
|
<< " due to cold byte percent";
|
|
// Print the internal context id to aid debugging and visualization.
|
|
OS << " (context id " << Id << ")";
|
|
OS << "\n";
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
|
|
using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
|
|
for (const auto Node : nodes<GraphType>(this)) {
|
|
checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
|
|
for (auto &Edge : Node->CallerEdges)
|
|
checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
|
|
}
|
|
}
|
|
|
|
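// GraphTraits specialization so the context graph can be traversed with
// LLVM's generic graph iterators and emitted via WriteGraph. The children of
// a node are the callees reached through its callee edges.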
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
|
|
using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
|
|
using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
|
|
|
|
using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
|
|
static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
|
|
|
|
using nodes_iterator =
|
|
mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
|
|
decltype(&getNode)>;
|
|
|
|
static nodes_iterator nodes_begin(GraphType G) {
|
|
return nodes_iterator(G->NodeOwner.begin(), &getNode);
|
|
}
|
|
|
|
static nodes_iterator nodes_end(GraphType G) {
|
|
return nodes_iterator(G->NodeOwner.end(), &getNode);
|
|
}
|
|
|
|
static NodeRef getEntryNode(GraphType G) {
|
|
return G->NodeOwner.begin()->get();
|
|
}
|
|
|
|
using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
|
|
static const ContextNode<DerivedCCG, FuncTy, CallTy> *
|
|
GetCallee(const EdgePtrTy &P) {
|
|
return P->Callee;
|
|
}
|
|
|
|
using ChildIteratorType =
|
|
mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
|
|
DerivedCCG, FuncTy, CallTy>>>::const_iterator,
|
|
decltype(&GetCallee)>;
|
|
|
|
static ChildIteratorType child_begin(NodeRef N) {
|
|
return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
|
|
}
|
|
|
|
static ChildIteratorType child_end(NodeRef N) {
|
|
return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
|
|
}
|
|
};
|
|
|
|
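// DOTGraphTraits specialization controlling how the context graph is
// rendered: node labels show the original stack/allocation id and matched
// call, fill colors encode the allocation types, and nodes/edges containing
// the requested context or allocation's contexts can be highlighted.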
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
|
|
: public DefaultDOTGraphTraits {
|
|
DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {
|
|
// If the user requested the full graph to be exported, but provided an
|
|
// allocation id, or if the user gave a context id and requested more than
|
|
// just a specific context to be exported, note that highlighting is
|
|
// enabled.
|
|
DoHighlight =
|
|
(AllocIdForDot.getNumOccurrences() && DotGraphScope == DotScope::All) ||
|
|
(ContextIdForDot.getNumOccurrences() &&
|
|
DotGraphScope != DotScope::Context);
|
|
}
|
|
|
|
using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
|
|
using GTraits = GraphTraits<GraphType>;
|
|
using NodeRef = typename GTraits::NodeRef;
|
|
using ChildIteratorType = typename GTraits::ChildIteratorType;
|
|
|
|
static std::string getNodeLabel(NodeRef Node, GraphType G) {
|
|
std::string LabelString =
|
|
(Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
|
|
Twine(Node->OrigStackOrAllocId))
|
|
.str();
|
|
LabelString += "\n";
|
|
if (Node->hasCall()) {
|
|
auto Func = G->NodeToCallingFunc.find(Node);
|
|
assert(Func != G->NodeToCallingFunc.end());
|
|
LabelString +=
|
|
G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
|
|
} else {
|
|
LabelString += "null call";
|
|
if (Node->Recursive)
|
|
LabelString += " (recursive)";
|
|
else
|
|
LabelString += " (external)";
|
|
}
|
|
return LabelString;
|
|
}
|
|
|
|
static std::string getNodeAttributes(NodeRef Node, GraphType G) {
|
|
auto ContextIds = Node->getContextIds();
|
|
// If highlighting is enabled, see if this node contains any of the context ids
|
|
// of interest. If so, it will use a different color and a larger fontsize
|
|
// (which makes the node larger as well).
|
|
bool Highlight = false;
|
|
if (DoHighlight) {
|
|
assert(ContextIdForDot.getNumOccurrences() ||
|
|
AllocIdForDot.getNumOccurrences());
|
|
if (ContextIdForDot.getNumOccurrences())
|
|
Highlight = ContextIds.contains(ContextIdForDot);
|
|
else
|
|
Highlight = set_intersects(ContextIds, G->DotAllocContextIds);
|
|
}
|
|
std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
|
|
getContextIds(ContextIds) + "\"")
|
|
.str();
|
|
// Default fontsize is 14
|
|
if (Highlight)
|
|
AttributeString += ",fontsize=\"30\"";
|
|
AttributeString +=
|
|
(Twine(",fillcolor=\"") + getColor(Node->AllocTypes, Highlight) + "\"")
|
|
.str();
|
|
if (Node->CloneOf) {
|
|
AttributeString += ",color=\"blue\"";
|
|
AttributeString += ",style=\"filled,bold,dashed\"";
|
|
} else
|
|
AttributeString += ",style=\"filled\"";
|
|
return AttributeString;
|
|
}
|
|
|
|
static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
|
|
GraphType G) {
|
|
auto &Edge = *(ChildIter.getCurrent());
|
|
// If highlighting is enabled, see if this edge contains any of the context ids
|
|
// of interest. If so, it will use a different color and a heavier arrow
|
|
// size and weight (the larger weight makes the highlighted path
|
|
// straighter).
|
|
bool Highlight = false;
|
|
if (DoHighlight) {
|
|
assert(ContextIdForDot.getNumOccurrences() ||
|
|
AllocIdForDot.getNumOccurrences());
|
|
if (ContextIdForDot.getNumOccurrences())
|
|
Highlight = Edge->ContextIds.contains(ContextIdForDot);
|
|
else
|
|
Highlight = set_intersects(Edge->ContextIds, G->DotAllocContextIds);
|
|
}
|
|
auto Color = getColor(Edge->AllocTypes, Highlight);
|
|
std::string AttributeString =
|
|
(Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
|
|
// fillcolor is the arrow head and color is the line
|
|
Twine(",fillcolor=\"") + Color + "\"" + Twine(",color=\"") + Color +
|
|
"\"")
|
|
.str();
|
|
if (Edge->IsBackedge)
|
|
AttributeString += ",style=\"dotted\"";
|
|
// Default penwidth and weight are both 1.
|
|
if (Highlight)
|
|
AttributeString += ",penwidth=\"2.0\",weight=\"2\"";
|
|
return AttributeString;
|
|
}
|
|
|
|
// Since the NodeOwner list includes nodes that are no longer connected to
|
|
// the graph, skip them here.
|
|
static bool isNodeHidden(NodeRef Node, GraphType G) {
|
|
if (Node->isRemoved())
|
|
return true;
|
|
// If a scope smaller than the full graph was requested, see if this node
|
|
// contains any of the context ids of interest.
|
|
if (DotGraphScope == DotScope::Alloc)
|
|
return !set_intersects(Node->getContextIds(), G->DotAllocContextIds);
|
|
if (DotGraphScope == DotScope::Context)
|
|
return !Node->getContextIds().contains(ContextIdForDot);
|
|
return false;
|
|
}
|
|
|
|
private:
|
|
static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
|
|
std::string IdString = "ContextIds:";
|
|
if (ContextIds.size() < 100) {
|
|
std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
|
|
std::sort(SortedIds.begin(), SortedIds.end());
|
|
for (auto Id : SortedIds)
|
|
IdString += (" " + Twine(Id)).str();
|
|
} else {
|
|
IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
|
|
}
|
|
return IdString;
|
|
}
|
|
|
|
static std::string getColor(uint8_t AllocTypes, bool Highlight) {
|
|
// If DoHighlight is not enabled, we want to use the highlight colors for
// NotCold and Cold, and the non-highlight color for NotCold+Cold. This
// keeps compatibility with the color scheme used before highlighting was
// supported, and for NotCold+Cold the non-highlight color is also a bit
// more readable.
|
|
if (AllocTypes == (uint8_t)AllocationType::NotCold)
|
|
// Color "brown1" actually looks like a lighter red.
|
|
return !DoHighlight || Highlight ? "brown1" : "lightpink";
|
|
if (AllocTypes == (uint8_t)AllocationType::Cold)
|
|
return !DoHighlight || Highlight ? "cyan" : "lightskyblue";
|
|
if (AllocTypes ==
|
|
((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
|
|
return Highlight ? "magenta" : "mediumorchid1";
|
|
return "gray";
|
|
}
|
|
|
|
static std::string getNodeId(NodeRef Node) {
|
|
std::stringstream SStream;
|
|
SStream << std::hex << "N0x" << (unsigned long long)Node;
|
|
std::string Result = SStream.str();
|
|
return Result;
|
|
}
|
|
|
|
// True if we should highlight a specific context or allocation's contexts in
|
|
// the emitted graph.
|
|
static bool DoHighlight;
|
|
};
|
|
|
|
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
bool DOTGraphTraits<
|
|
const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>::DoHighlight =
|
|
false;
|
|
|
|
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
|
|
std::string Label) const {
|
|
WriteGraph(this, "", false, Label,
|
|
DotFilePathPrefix + "ccg." + Label + ".dot");
|
|
}
|
|
|
|
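// Create a new clone of Edge's callee node and move the given context ids
// (or all of Edge's ids, if ContextIdsToMove is empty) from Edge onto an
// edge to the new clone, which is returned.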
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
|
|
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
|
|
const std::shared_ptr<ContextEdge> &Edge,
|
|
DenseSet<uint32_t> ContextIdsToMove) {
|
|
ContextNode *Node = Edge->Callee;
|
|
assert(NodeToCallingFunc.count(Node));
|
|
ContextNode *Clone =
|
|
createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call);
|
|
Node->addClone(Clone);
|
|
Clone->MatchingCalls = Node->MatchingCalls;
|
|
moveEdgeToExistingCalleeClone(Edge, Clone, /*NewClone=*/true,
|
|
ContextIdsToMove);
|
|
return Clone;
|
|
}
|
|
|
|
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
|
|
moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
|
|
ContextNode *NewCallee, bool NewClone,
|
|
DenseSet<uint32_t> ContextIdsToMove) {
|
|
// NewCallee and Edge's current callee must be clones of the same original
|
|
// node (Edge's current callee may be the original node too).
|
|
assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
|
|
|
|
bool EdgeIsRecursive = Edge->Callee == Edge->Caller;
|
|
|
|
ContextNode *OldCallee = Edge->Callee;
|
|
|
|
// We might already have an edge to the new callee from earlier cloning for a
|
|
// different allocation. If one exists we will reuse it.
|
|
auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
|
|
|
|
// Callers will pass an empty ContextIdsToMove set when they want to move the
|
|
// edge. Copy in Edge's ids for simplicity.
|
|
if (ContextIdsToMove.empty())
|
|
ContextIdsToMove = Edge->getContextIds();
|
|
|
|
// If we are moving all of Edge's ids, then just move the whole Edge.
|
|
// Otherwise only move the specified subset, to a new edge if needed.
|
|
if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
|
|
// First, update the alloc types on New Callee from Edge.
|
|
// Do this before we potentially clear Edge's fields below!
|
|
NewCallee->AllocTypes |= Edge->AllocTypes;
|
|
// Moving the whole Edge.
|
|
if (ExistingEdgeToNewCallee) {
|
|
// Since we already have an edge to NewCallee, simply move the ids
|
|
// onto it, and remove the existing Edge.
|
|
ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
|
|
ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
|
|
assert(Edge->ContextIds == ContextIdsToMove);
|
|
removeEdgeFromGraph(Edge.get());
|
|
} else {
|
|
// Otherwise just reconnect Edge to NewCallee.
|
|
Edge->Callee = NewCallee;
|
|
NewCallee->CallerEdges.push_back(Edge);
|
|
// Remove it from callee where it was previously connected.
|
|
OldCallee->eraseCallerEdge(Edge.get());
|
|
// Don't need to update Edge's context ids since we are simply
|
|
// reconnecting it.
|
|
}
|
|
} else {
|
|
// Only moving a subset of Edge's ids.
|
|
// Compute the alloc type of the subset of ids being moved.
|
|
auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
|
|
if (ExistingEdgeToNewCallee) {
|
|
// Since we already have an edge to NewCallee, simply move the ids
|
|
// onto it.
|
|
ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
|
|
ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
|
|
} else {
|
|
// Otherwise, create a new edge to NewCallee for the ids being moved.
|
|
auto NewEdge = std::make_shared<ContextEdge>(
|
|
NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
|
|
Edge->Caller->CalleeEdges.push_back(NewEdge);
|
|
NewCallee->CallerEdges.push_back(NewEdge);
|
|
}
|
|
// In either case, need to update the alloc types on NewCallee, and remove
|
|
// those ids and update the alloc type on the original Edge.
|
|
NewCallee->AllocTypes |= CallerEdgeAllocType;
|
|
set_subtract(Edge->ContextIds, ContextIdsToMove);
|
|
Edge->AllocTypes = computeAllocType(Edge->ContextIds);
|
|
}
|
|
// Now walk the old callee node's callee edges and move Edge's context ids
|
|
// over to the corresponding edge into the clone (which is created here if
|
|
// this is a newly created clone).
|
|
for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
|
|
ContextNode *CalleeToUse = OldCalleeEdge->Callee;
|
|
// If this is a direct recursion edge, use NewCallee (the clone) as the
|
|
// callee as well, so that any edge updated/created here is also direct
|
|
// recursive.
|
|
if (CalleeToUse == OldCallee) {
|
|
// If this is a recursive edge, see if we already moved a recursive edge
|
|
// (which would have to have been this one) - if we were only moving a
|
|
// subset of context ids it would still be on OldCallee.
|
|
if (EdgeIsRecursive) {
|
|
assert(OldCalleeEdge == Edge);
|
|
continue;
|
|
}
|
|
CalleeToUse = NewCallee;
|
|
}
|
|
// The context ids moving to the new callee are the subset of this edge's
|
|
// context ids and the context ids on the caller edge being moved.
|
|
DenseSet<uint32_t> EdgeContextIdsToMove =
|
|
set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
|
|
set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
|
|
OldCalleeEdge->AllocTypes =
|
|
computeAllocType(OldCalleeEdge->getContextIds());
|
|
if (!NewClone) {
|
|
// Update context ids / alloc type on corresponding edge to NewCallee.
|
|
// There is a chance this may not exist if we are reusing an existing
|
|
// clone, specifically during function assignment, where we would have
|
|
// removed none type edges after creating the clone. If we can't find
|
|
// a corresponding edge there, fall through to the cloning below.
|
|
if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) {
|
|
NewCalleeEdge->getContextIds().insert_range(EdgeContextIdsToMove);
|
|
NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
|
|
continue;
|
|
}
|
|
}
|
|
auto NewEdge = std::make_shared<ContextEdge>(
|
|
CalleeToUse, NewCallee, computeAllocType(EdgeContextIdsToMove),
|
|
EdgeContextIdsToMove);
|
|
NewCallee->CalleeEdges.push_back(NewEdge);
|
|
NewEdge->Callee->CallerEdges.push_back(NewEdge);
|
|
}
|
|
// Recompute the node alloc type now that its callee edges have been
|
|
// updated (since we will compute from those edges).
|
|
OldCallee->AllocTypes = OldCallee->computeAllocType();
|
|
// OldCallee alloc type should be None iff its context id set is now empty.
|
|
assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
|
|
OldCallee->emptyContextIds());
|
|
if (VerifyCCG) {
|
|
checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
|
|
checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
|
|
for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
|
|
checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
|
|
/*CheckEdges=*/false);
|
|
for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
|
|
checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
|
|
/*CheckEdges=*/false);
|
|
}
|
|
}
|
|
|
|
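// Move the given callee edge from its current caller onto NewCaller,
// reusing an existing parallel edge on NewCaller when present, and migrate
// the corresponding context ids from the old caller's caller edges to
// NewCaller's caller edges.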
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
|
|
moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
|
|
ContextNode *NewCaller) {
|
|
auto *OldCallee = Edge->Callee;
|
|
auto *NewCallee = OldCallee;
|
|
// If this edge was direct recursive, make any new/updated edge also direct
|
|
// recursive to NewCaller.
|
|
bool Recursive = Edge->Caller == Edge->Callee;
|
|
if (Recursive)
|
|
NewCallee = NewCaller;
|
|
|
|
ContextNode *OldCaller = Edge->Caller;
|
|
OldCaller->eraseCalleeEdge(Edge.get());
|
|
|
|
// We might already have an edge to the new caller. If one exists we will
|
|
// reuse it.
|
|
auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee);
|
|
|
|
if (ExistingEdgeToNewCaller) {
|
|
// Since we already have an edge to NewCaller, simply move the ids
|
|
// onto it, and remove the existing Edge.
|
|
ExistingEdgeToNewCaller->getContextIds().insert_range(
|
|
Edge->getContextIds());
|
|
ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
|
|
Edge->ContextIds.clear();
|
|
Edge->AllocTypes = (uint8_t)AllocationType::None;
|
|
OldCallee->eraseCallerEdge(Edge.get());
|
|
} else {
|
|
// Otherwise just reconnect Edge to NewCaller.
|
|
Edge->Caller = NewCaller;
|
|
NewCaller->CalleeEdges.push_back(Edge);
|
|
if (Recursive) {
|
|
assert(NewCallee == NewCaller);
|
|
// In the case of (direct) recursive edges, we update the callee as well
|
|
// so that it becomes recursive on the new caller.
|
|
Edge->Callee = NewCallee;
|
|
NewCallee->CallerEdges.push_back(Edge);
|
|
OldCallee->eraseCallerEdge(Edge.get());
|
|
}
|
|
// Don't need to update Edge's context ids since we are simply
|
|
// reconnecting it.
|
|
}
|
|
// In either case, need to update the alloc types on New Caller.
|
|
NewCaller->AllocTypes |= Edge->AllocTypes;
|
|
|
|
// Now walk the old caller node's caller edges and move Edge's context ids
|
|
// over to the corresponding edge into the node (which is created here if
|
|
// this is a newly created node). We can tell whether this is a newly created
|
|
// node by seeing if it has any caller edges yet.
|
|
#ifndef NDEBUG
|
|
bool IsNewNode = NewCaller->CallerEdges.empty();
|
|
#endif
|
|
// If we just moved a direct recursive edge, presumably its context ids should
|
|
// also flow out of OldCaller via some other non-recursive callee edge. We
|
|
// don't want to remove the recursive context ids from other caller edges yet,
|
|
// otherwise the context ids get into an inconsistent state on OldCaller.
|
|
// We will update these context ids on the non-recursive caller edge when and
|
|
// if they are updated on the non-recursive callee.
|
|
if (!Recursive) {
|
|
for (auto &OldCallerEdge : OldCaller->CallerEdges) {
|
|
auto OldCallerCaller = OldCallerEdge->Caller;
|
|
// The context ids moving to the new caller are the subset of this edge's
|
|
// context ids and the context ids on the callee edge being moved.
|
|
DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection(
|
|
OldCallerEdge->getContextIds(), Edge->getContextIds());
|
|
if (OldCaller == OldCallerCaller) {
|
|
OldCallerCaller = NewCaller;
|
|
// Don't actually move this one. The caller will move it directly via a
|
|
// call to this function with this as the Edge if it is appropriate to
|
|
// move to a different node that has a matching callee (itself).
|
|
continue;
|
|
}
|
|
set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
|
|
OldCallerEdge->AllocTypes =
|
|
computeAllocType(OldCallerEdge->getContextIds());
|
|
// In this function we expect that any pre-existing node already has edges
|
|
// from the same callers as the old node. That should be true in the
|
|
// current use case, where we will remove None-type edges after copying
|
|
// over all caller edges from the callee.
|
|
auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller);
|
|
// Since we would have skipped caller edges when moving a direct recursive
|
|
// edge, this may not hold true when recursive handling is enabled.
|
|
assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites);
|
|
if (ExistingCallerEdge) {
|
|
ExistingCallerEdge->getContextIds().insert_range(EdgeContextIdsToMove);
|
|
ExistingCallerEdge->AllocTypes |=
|
|
computeAllocType(EdgeContextIdsToMove);
|
|
continue;
|
|
}
|
|
auto NewEdge = std::make_shared<ContextEdge>(
|
|
NewCaller, OldCallerCaller, computeAllocType(EdgeContextIdsToMove),
|
|
EdgeContextIdsToMove);
|
|
NewCaller->CallerEdges.push_back(NewEdge);
|
|
NewEdge->Caller->CalleeEdges.push_back(NewEdge);
|
|
}
|
|
}
|
|
// Recompute the node alloc type now that its caller edges have been
|
|
// updated (since we will compute from those edges).
|
|
OldCaller->AllocTypes = OldCaller->computeAllocType();
|
|
// OldCaller alloc type should be None iff its context id set is now empty.
|
|
assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
|
|
OldCaller->emptyContextIds());
|
|
if (VerifyCCG) {
|
|
checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
|
|
checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
|
|
for (const auto &OldCallerEdge : OldCaller->CallerEdges)
|
|
checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
|
|
/*CheckEdges=*/false);
|
|
for (const auto &NewCallerEdge : NewCaller->CallerEdges)
|
|
checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
|
|
/*CheckEdges=*/false);
|
|
}
|
|
}
|
|
|
|
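// Remove callee edges with a None alloc type from Node, then do the same
// recursively for its clones and (via its caller edges) the nodes above it,
// visiting each node at most once.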
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
|
|
recursivelyRemoveNoneTypeCalleeEdges(
|
|
ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
|
|
auto Inserted = Visited.insert(Node);
|
|
if (!Inserted.second)
|
|
return;
|
|
|
|
removeNoneTypeCalleeEdges(Node);
|
|
|
|
for (auto *Clone : Node->Clones)
|
|
recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
|
|
|
|
// The recursive call may remove some of this Node's caller edges.
|
|
// Iterate over a copy and skip any that were removed.
|
|
auto CallerEdges = Node->CallerEdges;
|
|
for (auto &Edge : CallerEdges) {
|
|
// Skip any that have been removed by an earlier recursive call.
|
|
if (Edge->isRemoved()) {
|
|
assert(!is_contained(Node->CallerEdges, Edge));
|
|
continue;
|
|
}
|
|
recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
|
|
}
|
|
}
|
|
|
|
// This is the standard DFS based backedge discovery algorithm.
|
|
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges() {
|
|
// If we are cloning recursive contexts, find and mark backedges from all root
|
|
// callers, using the typical DFS based backedge analysis.
|
|
if (!CloneRecursiveContexts)
|
|
return;
|
|
DenseSet<const ContextNode *> Visited;
|
|
DenseSet<const ContextNode *> CurrentStack;
|
|
for (auto &Entry : NonAllocationCallToContextNodeMap) {
|
|
auto *Node = Entry.second;
|
|
if (Node->isRemoved())
|
|
continue;
|
|
// It is a root if it doesn't have callers.
|
|
if (!Node->CallerEdges.empty())
|
|
continue;
|
|
markBackedges(Node, Visited, CurrentStack);
|
|
assert(CurrentStack.empty());
|
|
}
|
|
}
|
|
|
|
// Recursive helper for above markBackedges method.
|
|
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
|
|
ContextNode *Node, DenseSet<const ContextNode *> &Visited,
|
|
DenseSet<const ContextNode *> &CurrentStack) {
|
|
auto I = Visited.insert(Node);
|
|
// We should only call this for unvisited nodes.
|
|
assert(I.second);
|
|
(void)I;
|
|
for (auto &CalleeEdge : Node->CalleeEdges) {
|
|
auto *Callee = CalleeEdge->Callee;
|
|
if (Visited.count(Callee)) {
|
|
// Since this was already visited we need to check if it is currently on
|
|
// the recursive stack in which case it is a backedge.
|
|
if (CurrentStack.count(Callee))
|
|
CalleeEdge->IsBackedge = true;
|
|
continue;
|
|
}
|
|
CurrentStack.insert(Callee);
|
|
markBackedges(Callee, Visited, CurrentStack);
|
|
CurrentStack.erase(Callee);
|
|
}
|
|
}
|
|
|
|
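// Driver for cloning: invoked once per allocation node, cloning caller
// nodes as needed to disambiguate the allocation types reaching each
// allocation, then removing any callee edges left with a None alloc type.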
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
|
|
DenseSet<const ContextNode *> Visited;
|
|
for (auto &Entry : AllocationCallToContextNodeMap) {
|
|
Visited.clear();
|
|
identifyClones(Entry.second, Visited, Entry.second->getContextIds());
|
|
}
|
|
Visited.clear();
|
|
for (auto &Entry : AllocationCallToContextNodeMap)
|
|
recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
|
|
if (VerifyCCG)
|
|
check();
|
|
}
|
|
|
|
// Helper function to check whether an AllocType is cold, notcold, or both.
|
|
bool checkColdOrNotCold(uint8_t AllocType) {
|
|
return (AllocType == (uint8_t)AllocationType::Cold) ||
|
|
(AllocType == (uint8_t)AllocationType::NotCold) ||
|
|
(AllocType ==
|
|
((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
|
|
}
|
|
|
|
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
|
|
ContextNode *Node, DenseSet<const ContextNode *> &Visited,
|
|
const DenseSet<uint32_t> &AllocContextIds) {
|
|
if (VerifyNodes)
|
|
checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
|
|
assert(!Node->CloneOf);
|
|
|
|
// If Node has a null call, then either it wasn't found in the module (regular
|
|
// LTO) or summary index (ThinLTO), or there were other conditions blocking
|
|
// cloning (e.g. recursion, calls multiple targets, etc).
|
|
// Do this here so that we don't try to recursively clone callers below, which
|
|
// isn't useful at least for this node.
|
|
if (!Node->hasCall())
|
|
return;
|
|
|
|
// No need to look at any callers if allocation type already unambiguous.
|
|
if (hasSingleAllocType(Node->AllocTypes))
|
|
return;
|
|
|
|
#ifndef NDEBUG
|
|
auto Insert =
|
|
#endif
|
|
Visited.insert(Node);
|
|
// We should not have visited this node yet.
|
|
assert(Insert.second);
|
|
// The recursive call to identifyClones may delete the current edge from the
|
|
// CallerEdges vector. Make a copy and iterate on that, simpler than passing
|
|
// in an iterator and having recursive call erase from it. Other edges may
|
|
// also get removed during the recursion, which will have null Callee and
|
|
// Caller pointers (and are deleted later), so we skip those below.
|
|
{
|
|
auto CallerEdges = Node->CallerEdges;
|
|
for (auto &Edge : CallerEdges) {
|
|
// Skip any that have been removed by an earlier recursive call.
|
|
if (Edge->isRemoved()) {
|
|
assert(!is_contained(Node->CallerEdges, Edge));
|
|
continue;
|
|
}
|
|
// Defer backedges. See comments further below where these edges are
|
|
// handled during the cloning of this Node.
|
|
if (Edge->IsBackedge) {
|
|
// We should only mark these if cloning recursive contexts, where we
|
|
// need to do this deferral.
|
|
assert(CloneRecursiveContexts);
|
|
continue;
|
|
}
|
|
// Ignore any caller we previously visited via another edge.
|
|
if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
|
|
identifyClones(Edge->Caller, Visited, AllocContextIds);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Check if we reached an unambiguous call or have only a single caller.
|
|
if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
|
|
return;
|
|
|
|
// We need to clone.
|
|
|
|
// Try to keep the original version as alloc type NotCold. This will make
|
|
// cases with indirect calls or any other situation with an unknown call to
|
|
// the original function get the default behavior. We do this by sorting the
|
|
// CallerEdges of the Node we will clone by alloc type.
|
|
//
|
|
// Give NotCold edge the lowest sort priority so those edges are at the end of
|
|
// the caller edges vector, and stay on the original version (since the below
|
|
// code clones greedily until it finds all remaining edges have the same type
|
|
// and leaves the remaining ones on the original Node).
|
|
//
|
|
// We shouldn't actually have any None type edges, so the sorting priority for
|
|
// that is arbitrary, and we assert in that case below.
|
|
const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
|
|
/*Cold*/ 1,
|
|
/*NotColdCold*/ 2};
|
|
std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
|
|
[&](const std::shared_ptr<ContextEdge> &A,
|
|
const std::shared_ptr<ContextEdge> &B) {
|
|
// Edges with non-empty context ids should be sorted before
|
|
// those with empty context ids.
|
|
if (A->ContextIds.empty())
|
|
// Either B ContextIds are non-empty (in which case we
|
|
// should return false because B < A), or B ContextIds
|
|
// are empty, in which case they are equal, and we should
|
|
// maintain the original relative ordering.
|
|
return false;
|
|
if (B->ContextIds.empty())
|
|
return true;
|
|
|
|
if (A->AllocTypes == B->AllocTypes)
|
|
// Use the first context id for each edge as a
|
|
// tie-breaker.
|
|
return *A->ContextIds.begin() < *B->ContextIds.begin();
|
|
return AllocTypeCloningPriority[A->AllocTypes] <
|
|
AllocTypeCloningPriority[B->AllocTypes];
|
|
});
|
|
|
|
assert(Node->AllocTypes != (uint8_t)AllocationType::None);
|
|
|
|
DenseSet<uint32_t> RecursiveContextIds;
|
|
assert(AllowRecursiveContexts || !CloneRecursiveContexts);
|
|
// If we are allowing recursive callsites, but have also disabled recursive
|
|
// contexts, look for context ids that show up in multiple caller edges.
|
|
if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
|
|
DenseSet<uint32_t> AllCallerContextIds;
|
|
for (auto &CE : Node->CallerEdges) {
|
|
// Reserve space for the largest set of caller context ids, since we know the
|
|
// final set will be at least that large.
|
|
AllCallerContextIds.reserve(CE->getContextIds().size());
|
|
for (auto Id : CE->getContextIds())
|
|
if (!AllCallerContextIds.insert(Id).second)
|
|
RecursiveContextIds.insert(Id);
|
|
}
|
|
}
|
|
|
|
// Iterate until we find no more opportunities for disambiguating the alloc
|
|
// types via cloning. In most cases this loop will terminate once the Node
|
|
// has a single allocation type, in which case no more cloning is needed.
|
|
// Iterate over a copy of Node's caller edges, since we may need to remove
|
|
// edges in the moveEdgeTo* methods, and this simplifies the handling and
|
|
// makes it less error-prone.
|
|
auto CallerEdges = Node->CallerEdges;
|
|
for (auto &CallerEdge : CallerEdges) {
|
|
// Skip any that have been removed by an earlier recursive call.
|
|
if (CallerEdge->isRemoved()) {
|
|
assert(!is_contained(Node->CallerEdges, CallerEdge));
|
|
continue;
|
|
}
|
|
assert(CallerEdge->Callee == Node);
|
|
|
|
// See if cloning the prior caller edge left this node with a single alloc
|
|
// type or a single caller. In that case no more cloning of Node is needed.
|
|
if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
|
|
break;
|
|
|
|
// If the caller was not successfully matched to a call in the IR/summary,
|
|
// there is no point in trying to clone for it as we can't update that call.
|
|
if (!CallerEdge->Caller->hasCall())
|
|
continue;
|
|
|
|
// Only need to process the ids along this edge pertaining to the given
|
|
// allocation.
|
|
auto CallerEdgeContextsForAlloc =
|
|
set_intersection(CallerEdge->getContextIds(), AllocContextIds);
|
|
if (!RecursiveContextIds.empty())
|
|
CallerEdgeContextsForAlloc =
|
|
set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
|
|
if (CallerEdgeContextsForAlloc.empty())
|
|
continue;
|
|
|
|
auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
|
|
|
|
// Compute the node callee edge alloc types corresponding to the context ids
|
|
// for this caller edge.
|
|
std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
|
|
CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
|
|
for (auto &CalleeEdge : Node->CalleeEdges)
|
|
CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
|
|
CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
|
|
|
|
// Don't clone if doing so will not disambiguate any alloc types amongst
|
|
// caller edges (including the callee edges that would be cloned).
|
|
// Otherwise we will simply move all edges to the clone.
|
|
//
|
|
// First check if by cloning we will disambiguate the caller allocation
|
|
// type from node's allocation type. Query allocTypeToUse so that we don't
|
|
// bother cloning to distinguish NotCold+Cold from NotCold. Note that
|
|
// neither of these should be None type.
|
|
//
|
|
// Then check if by cloning node at least one of the callee edges will be
|
|
// disambiguated by splitting out different context ids.
|
|
//
|
|
// However, always do the cloning if this is a backedge, in which case we
|
|
// have not yet cloned along this caller edge.
|
|
assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
|
|
assert(Node->AllocTypes != (uint8_t)AllocationType::None);
|
|
if (!CallerEdge->IsBackedge &&
|
|
allocTypeToUse(CallerAllocTypeForAlloc) ==
|
|
allocTypeToUse(Node->AllocTypes) &&
|
|
allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
|
|
CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
|
|
continue;
|
|
}
|
|
|
|
if (CallerEdge->IsBackedge) {
|
|
// We should only mark these if cloning recursive contexts, where we
|
|
// need to do this deferral.
|
|
assert(CloneRecursiveContexts);
|
|
DeferredBackedges++;
|
|
}
|
|
|
|
// If this is a backedge, we now do recursive cloning starting from its
|
|
// caller since we may have moved unambiguous caller contexts to a clone
|
|
// of this Node in a previous iteration of the current loop, giving more
|
|
// opportunity for cloning through the backedge. Because we sorted the
|
|
// caller edges earlier so that cold caller edges are first, we would have
|
|
// visited and cloned this node for any unambiguously cold non-recursive
|
|
// callers before any ambiguous backedge callers. Note that we don't do this
|
|
// if the caller is already cloned or visited during cloning (e.g. via a
|
|
// different context path from the allocation).
|
|
// TODO: Can we do better in the case where the caller was already visited?
|
|
if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
|
|
!Visited.count(CallerEdge->Caller)) {
|
|
const auto OrigIdCount = CallerEdge->getContextIds().size();
|
|
// Now do the recursive cloning of this backedge's caller, which was
|
|
// deferred earlier.
|
|
identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
|
|
removeNoneTypeCalleeEdges(CallerEdge->Caller);
|
|
// See if the recursive call to identifyClones moved the context ids to a
|
|
// new edge from this node to a clone of caller, and switch to looking at
|
|
// that new edge so that we clone Node for the new caller clone.
|
|
bool UpdatedEdge = false;
|
|
if (OrigIdCount > CallerEdge->getContextIds().size()) {
|
|
for (auto E : Node->CallerEdges) {
|
|
// Only interested in clones of the current edge's caller.
|
|
if (E->Caller->CloneOf != CallerEdge->Caller)
|
|
continue;
|
|
// See if this edge contains any of the context ids originally on the
|
|
// current caller edge.
|
|
auto CallerEdgeContextsForAllocNew =
|
|
set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
|
|
if (CallerEdgeContextsForAllocNew.empty())
|
|
continue;
|
|
// Make sure we don't pick a previously existing caller edge of this
|
|
// Node, which would be processed on a different iteration of the
|
|
// outer loop over the saved CallerEdges.
|
|
if (llvm::is_contained(CallerEdges, E))
|
|
continue;
|
|
// The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
|
|
// are updated further below for all cases where we just invoked
|
|
// identifyClones recursively.
|
|
CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
|
|
CallerEdge = E;
|
|
UpdatedEdge = true;
|
|
break;
|
|
}
|
|
}
|
|
// If cloning removed this edge (and we didn't update it to a new edge
|
|
// above), we're done with this edge. It's possible we moved all of the
|
|
// context ids to an existing clone, in which case there's no need to do
|
|
// further processing for them.
|
|
if (CallerEdge->isRemoved())
|
|
continue;
|
|
|
|
// Now we need to update the information used for the cloning decisions
|
|
// further below, as we may have modified edges and their context ids.
|
|
|
|
// Note if we changed the CallerEdge above we would have already updated
|
|
// the context ids.
|
|
if (!UpdatedEdge) {
|
|
CallerEdgeContextsForAlloc = set_intersection(
|
|
CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
|
|
if (CallerEdgeContextsForAlloc.empty())
|
|
continue;
|
|
}
|
|
// Update the other information that depends on the edges and on the now
|
|
// updated CallerEdgeContextsForAlloc.
|
|
CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
|
|
CalleeEdgeAllocTypesForCallerEdge.clear();
|
|
for (auto &CalleeEdge : Node->CalleeEdges) {
|
|
CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
|
|
CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
|
|
}
|
|
}
|
|
|
|
// First see if we can use an existing clone. Check each clone and its
|
|
// callee edges for matching alloc types.
|
|
ContextNode *Clone = nullptr;
|
|
for (auto *CurClone : Node->Clones) {
|
|
if (allocTypeToUse(CurClone->AllocTypes) !=
|
|
allocTypeToUse(CallerAllocTypeForAlloc))
|
|
continue;
|
|
|
|
bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
|
|
hasSingleAllocType(CallerAllocTypeForAlloc);
|
|
// The above check should mean that if both have single alloc types that
|
|
// they should be equal.
|
|
assert(!BothSingleAlloc ||
|
|
CurClone->AllocTypes == CallerAllocTypeForAlloc);
|
|
|
|
// If either both have a single alloc type (which are the same), or if the
|
|
// clone's callee edges have the same alloc types as those for the current
|
|
// allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
|
|
// then we can reuse this clone.
|
|
if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
|
|
CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
|
|
Clone = CurClone;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// The edge iterator is adjusted when we move the CallerEdge to the clone.
|
|
if (Clone)
|
|
moveEdgeToExistingCalleeClone(CallerEdge, Clone, /*NewClone=*/false,
|
|
CallerEdgeContextsForAlloc);
|
|
else
|
|
Clone = moveEdgeToNewCalleeClone(CallerEdge, CallerEdgeContextsForAlloc);
|
|
|
|
// Sanity check that no alloc types on clone or its edges are None.
|
|
assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
|
|
}
|
|
|
|
// We should still have some context ids on the original Node.
|
|
assert(!Node->emptyContextIds());
|
|
|
|
// Sanity check that no alloc types on node or edges are None.
|
|
assert(Node->AllocTypes != (uint8_t)AllocationType::None);
|
|
|
|
if (VerifyNodes)
|
|
checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
|
|
}
|
|
|
|
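// Record the final allocation type on the IR allocation call via a
// "memprof" string function attribute, and emit an optimization remark
// noting the annotation.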
void ModuleCallsiteContextGraph::updateAllocationCall(
|
|
CallInfo &Call, AllocationType AllocType) {
|
|
std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
|
|
auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
|
|
"memprof", AllocTypeString);
|
|
cast<CallBase>(Call.call())->addFnAttr(A);
|
|
OREGetter(Call.call()->getFunction())
|
|
.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
|
|
<< ore::NV("AllocationCall", Call.call()) << " in clone "
|
|
<< ore::NV("Caller", Call.call()->getFunction())
|
|
<< " marked with memprof allocation attribute "
|
|
<< ore::NV("Attribute", AllocTypeString));
|
|
}
|
|
|
|
void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
|
|
AllocationType AllocType) {
|
|
auto *AI = cast<AllocInfo *>(Call.call());
|
|
assert(AI);
|
|
assert(AI->Versions.size() > Call.cloneNo());
|
|
AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
|
|
}
|
|
|
|
AllocationType
|
|
ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
|
|
const auto *CB = cast<CallBase>(Call.call());
|
|
if (!CB->getAttributes().hasFnAttr("memprof"))
|
|
return AllocationType::None;
|
|
return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
|
|
? AllocationType::Cold
|
|
: AllocationType::NotCold;
|
|
}
|
|
|
|
AllocationType
|
|
IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
|
|
const auto *AI = cast<AllocInfo *>(Call.call());
|
|
assert(AI->Versions.size() > Call.cloneNo());
|
|
return (AllocationType)AI->Versions[Call.cloneNo()];
|
|
}
|
|
|
|
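// Point the IR call at the assigned function clone (the original callee is
// kept for clone number 0) and emit an optimization remark.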
void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
|
|
FuncInfo CalleeFunc) {
|
|
if (CalleeFunc.cloneNo() > 0)
|
|
cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
|
|
OREGetter(CallerCall.call()->getFunction())
|
|
.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
|
|
<< ore::NV("Call", CallerCall.call()) << " in clone "
|
|
<< ore::NV("Caller", CallerCall.call()->getFunction())
|
|
<< " assigned to call function clone "
|
|
<< ore::NV("Callee", CalleeFunc.func()));
|
|
}
|
|
|
|
void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
|
|
FuncInfo CalleeFunc) {
|
|
auto *CI = cast<CallsiteInfo *>(CallerCall.call());
|
|
assert(CI &&
|
|
"Caller cannot be an allocation which should not have profiled calls");
|
|
assert(CI->Clones.size() > CallerCall.cloneNo());
|
|
CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
|
|
}
|
|
|
|
CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
|
|
Instruction *>::FuncInfo
|
|
ModuleCallsiteContextGraph::cloneFunctionForCallsite(
|
|
FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
|
|
std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
|
|
// Use existing LLVM facilities for cloning and obtaining Call in clone
|
|
ValueToValueMapTy VMap;
|
|
auto *NewFunc = CloneFunction(Func.func(), VMap);
|
|
std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
|
|
assert(!Func.func()->getParent()->getFunction(Name));
|
|
NewFunc->setName(Name);
|
|
for (auto &Inst : CallsWithMetadataInFunc) {
|
|
// This map always has the initial version in it.
|
|
assert(Inst.cloneNo() == 0);
|
|
CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
|
|
}
|
|
OREGetter(Func.func())
|
|
.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
|
|
<< "created clone " << ore::NV("NewFunction", NewFunc));
|
|
return {NewFunc, CloneNo};
|
|
}
|
|
|
|
CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
|
|
IndexCall>::FuncInfo
|
|
IndexCallsiteContextGraph::cloneFunctionForCallsite(
|
|
FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
|
|
std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
|
|
// Check how many clones we have of Call (and therefore function).
|
|
// The next clone number is the current size of versions array.
|
|
// Confirm this matches the CloneNo provided by the caller, which is based on
|
|
// the number of function clones we have.
|
|
assert(CloneNo == (isa<AllocInfo *>(Call.call())
|
|
? cast<AllocInfo *>(Call.call())->Versions.size()
|
|
: cast<CallsiteInfo *>(Call.call())->Clones.size()));
|
|
// Walk all the instructions in this function. Create a new version for
|
|
// each (by adding an entry to the Versions/Clones summary array), and copy
|
|
// over the version being called for the function clone being cloned here.
|
|
// Additionally, add an entry to the CallMap for the new function clone,
|
|
// mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
|
|
// to the new call clone.
|
|
for (auto &Inst : CallsWithMetadataInFunc) {
|
|
// This map always has the initial version in it.
|
|
assert(Inst.cloneNo() == 0);
|
|
if (auto *AI = dyn_cast<AllocInfo *>(Inst.call())) {
|
|
assert(AI->Versions.size() == CloneNo);
|
|
// We assign the allocation type later (in updateAllocationCall), just add
|
|
// an entry for it here.
|
|
AI->Versions.push_back(0);
|
|
} else {
|
|
auto *CI = cast<CallsiteInfo *>(Inst.call());
|
|
assert(CI && CI->Clones.size() == CloneNo);
|
|
// We assign the clone number later (in updateCall), just add an entry for
|
|
// it here.
|
|
CI->Clones.push_back(0);
|
|
}
|
|
CallMap[Inst] = {Inst.call(), CloneNo};
|
|
}
|
|
return {Func.func(), CloneNo};
|
|
}
|
|
|
|
// This method assigns cloned callsites to functions, cloning the functions as
// needed. The assignment is greedy and proceeds roughly as follows:
//
// For each function Func:
//   For each call with graph Node having clones:
//     Initialize ClonesWorklist to Node and its clones
//     Initialize NodeCloneCount to 0
//     While ClonesWorklist is not empty:
//        Clone = pop front ClonesWorklist
//        NodeCloneCount++
//        If Func has been cloned less than NodeCloneCount times:
//           If NodeCloneCount is 1:
//                Assign Clone to original Func
//                Continue
//           Create a new function clone
//           If other callers not assigned to call a function clone yet:
//              Assign them to call new function clone
//              Continue
//           Assign any other caller calling the cloned version to new clone
//
//        For each caller of Clone:
//           If caller is assigned to call a specific function clone:
//             If we cannot assign Clone to that function clone:
//               Create new callsite Clone NewClone
//               Add NewClone to ClonesWorklist
//               Continue
//             Assign Clone to existing caller's called function clone
//           Else:
//             If Clone not already assigned to a function clone:
//                Assign to first function clone without assignment
//             Assign caller to selected function clone
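//
// Illustrative example (hypothetical names, not part of the algorithm above):
// suppose function A contains callsite C whose graph node has one clone C'.
// Processing C first assigns it (and its unassigned callers) to the original
// A. Processing C' then finds only one copy of A exists, so A is cloned
// (e.g. to "A.memprof.1" via getMemProfFuncName), C' is assigned to that
// clone, and the callers reaching C' are recorded as calling the new clone.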
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
|
|
bool Changed = false;
|
|
|
|
// Keep track of the assignment of nodes (callsites) to function clones they
|
|
// call.
|
|
DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
|
|
|
|
// Update caller node to call function version CalleeFunc, by recording the
|
|
// assignment in CallsiteToCalleeFuncCloneMap.
|
|
auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
|
|
const FuncInfo &CalleeFunc) {
|
|
assert(Caller->hasCall());
|
|
CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
|
|
};
|
|
|
|
// Walk all functions for which we saw calls with memprof metadata, and handle
// cloning for each of their calls.
|
|
for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
|
|
FuncInfo OrigFunc(Func);
|
|
// Map from each clone of OrigFunc to a map of remappings of each call of
|
|
// interest (from original uncloned call to the corresponding cloned call in
|
|
// that function clone).
|
|
std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
|
|
for (auto &Call : CallsWithMetadata) {
|
|
ContextNode *Node = getNodeForInst(Call);
|
|
// Skip call if we do not have a node for it (all uses of its stack ids
|
|
// were either on inlined chains or pruned from the MIBs), or if we did
|
|
// not create any clones for it.
|
|
if (!Node || Node->Clones.empty())
|
|
continue;
|
|
assert(Node->hasCall() &&
|
|
"Not having a call should have prevented cloning");
|
|
|
|
// Track the assignment of function clones to clones of the current
|
|
// callsite Node being handled.
|
|
std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
|
|
|
|
// Assign callsite version CallsiteClone to function version FuncClone,
|
|
// and also assign (possibly cloned) Call to CallsiteClone.
|
|
auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
|
|
CallInfo &Call,
|
|
ContextNode *CallsiteClone,
|
|
bool IsAlloc) {
|
|
// Record the clone of callsite node assigned to this function clone.
|
|
FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
|
|
|
|
assert(FuncClonesToCallMap.count(FuncClone));
|
|
std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
|
|
CallInfo CallClone(Call);
|
|
if (CallMap.count(Call))
|
|
CallClone = CallMap[Call];
|
|
CallsiteClone->setCall(CallClone);
|
|
// Need to do the same for all matching calls.
|
|
for (auto &MatchingCall : Node->MatchingCalls) {
|
|
CallInfo CallClone(MatchingCall);
|
|
if (CallMap.count(MatchingCall))
|
|
CallClone = CallMap[MatchingCall];
|
|
// Updates the call in the list.
|
|
MatchingCall = CallClone;
|
|
}
|
|
};
|
|
|
|
// Keep track of the clones of callsite Node that need to be assigned to
|
|
// function clones. This list may be expanded in the loop body below if we
|
|
// find additional cloning is required.
|
|
std::deque<ContextNode *> ClonesWorklist;
|
|
// Ignore original Node if we moved all of its contexts to clones.
|
|
if (!Node->emptyContextIds())
|
|
ClonesWorklist.push_back(Node);
|
|
llvm::append_range(ClonesWorklist, Node->Clones);
|
|
|
|
// Now walk through all of the clones of this callsite Node that we need,
|
|
// and determine the assignment to a corresponding clone of the current
|
|
// function (creating new function clones as needed).
|
|
unsigned NodeCloneCount = 0;
|
|
while (!ClonesWorklist.empty()) {
|
|
ContextNode *Clone = ClonesWorklist.front();
|
|
ClonesWorklist.pop_front();
|
|
NodeCloneCount++;
|
|
if (VerifyNodes)
|
|
checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
|
|
|
|
// Need to create a new function clone if we have more callsite clones
|
|
// than existing function clones, which would have been assigned to an
|
|
// earlier clone in the list (we assign callsite clones to function
|
|
// clones greedily).
|
|
if (FuncClonesToCallMap.size() < NodeCloneCount) {
|
|
// If this is the first callsite copy, assign to original function.
|
|
if (NodeCloneCount == 1) {
|
|
// Since FuncClonesToCallMap is empty in this case, no clones have
|
|
// been created for this function yet, and no callers should have
|
|
// been assigned a function clone for this callee node yet.
|
|
assert(llvm::none_of(
|
|
Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
|
|
return CallsiteToCalleeFuncCloneMap.count(E->Caller);
|
|
}));
|
|
// Initialize with empty call map, assign Clone to original function
|
|
// and its callers, and skip to the next clone.
|
|
FuncClonesToCallMap[OrigFunc] = {};
|
|
AssignCallsiteCloneToFuncClone(
|
|
OrigFunc, Call, Clone,
|
|
AllocationCallToContextNodeMap.count(Call));
|
|
for (auto &CE : Clone->CallerEdges) {
|
|
// Ignore any caller that does not have a recorded callsite Call.
|
|
if (!CE->Caller->hasCall())
|
|
continue;
|
|
RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
|
|
}
|
|
continue;
|
|
}
|
|
|
|
// First locate which copy of OrigFunc to clone again. If a caller
|
|
// of this callsite clone was already assigned to call a particular
|
|
// function clone, we need to redirect all of those callers to the
|
|
// new function clone, and update their other callees within this
|
|
// function.
|
|
FuncInfo PreviousAssignedFuncClone;
|
|
auto EI = llvm::find_if(
|
|
Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
|
|
return CallsiteToCalleeFuncCloneMap.count(E->Caller);
|
|
});
|
|
bool CallerAssignedToCloneOfFunc = false;
|
|
if (EI != Clone->CallerEdges.end()) {
|
|
const std::shared_ptr<ContextEdge> &Edge = *EI;
|
|
PreviousAssignedFuncClone =
|
|
CallsiteToCalleeFuncCloneMap[Edge->Caller];
|
|
CallerAssignedToCloneOfFunc = true;
|
|
}
|
|
|
|
// Clone function and save it along with the CallInfo map created
|
|
// during cloning in the FuncClonesToCallMap.
|
|
std::map<CallInfo, CallInfo> NewCallMap;
|
|
unsigned CloneNo = FuncClonesToCallMap.size();
|
|
assert(CloneNo > 0 && "Clone 0 is the original function, which "
|
|
"should already exist in the map");
|
|
FuncInfo NewFuncClone = cloneFunctionForCallsite(
|
|
OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
|
|
FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
|
|
FunctionClonesAnalysis++;
|
|
Changed = true;
|
|
|
|
// If no caller callsites were already assigned to a clone of this
|
|
// function, we can simply assign this clone to the new func clone
|
|
// and update all callers to it, then skip to the next clone.
|
|
if (!CallerAssignedToCloneOfFunc) {
|
|
AssignCallsiteCloneToFuncClone(
|
|
NewFuncClone, Call, Clone,
|
|
AllocationCallToContextNodeMap.count(Call));
|
|
for (auto &CE : Clone->CallerEdges) {
|
|
// Ignore any caller that does not have a recorded callsite Call.
|
|
if (!CE->Caller->hasCall())
|
|
continue;
|
|
RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
|
|
}
|
|
continue;
|
|
}
|
|
|
|
// We may need to do additional node cloning in this case.
|
|
// Reset the CallsiteToCalleeFuncCloneMap entry for any callers
|
|
// that were previously assigned to call PreviousAssignedFuncClone,
|
|
// to record that they now call NewFuncClone.
|
|
// The none type edge removal may remove some of this Clone's caller
|
|
// edges, if it is reached via another of its caller's callees.
|
|
// Iterate over a copy and skip any that were removed.
|
|
auto CallerEdges = Clone->CallerEdges;
|
|
for (auto CE : CallerEdges) {
|
|
// Skip any that have been removed on an earlier iteration.
|
|
if (CE->isRemoved()) {
|
|
assert(!is_contained(Clone->CallerEdges, CE));
|
|
continue;
|
|
}
|
|
assert(CE);
|
|
// Ignore any caller that does not have a recorded callsite Call.
|
|
if (!CE->Caller->hasCall())
|
|
continue;
|
|
|
|
if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
|
|
// We subsequently fall through to later handling that
|
|
// will perform any additional cloning required for
|
|
// callers that were calling other function clones.
|
|
CallsiteToCalleeFuncCloneMap[CE->Caller] !=
|
|
PreviousAssignedFuncClone)
|
|
continue;
|
|
|
|
RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
|
|
|
|
// If we are cloning a function that was already assigned to some
|
|
// callers, then essentially we are creating new callsite clones
|
|
// of the other callsites in that function that are reached by those
|
|
// callers. Clone the other callees of the current callsite's caller
|
|
// that were already assigned to PreviousAssignedFuncClone
|
|
// accordingly. This is important since we subsequently update the
|
|
// calls from the nodes in the graph and their assignments to callee
|
|
// functions recorded in CallsiteToCalleeFuncCloneMap.
|
|
// The none type edge removal may remove some of this caller's
|
|
// callee edges, if it is reached via another of its callees.
|
|
// Iterate over a copy and skip any that were removed.
|
|
auto CalleeEdges = CE->Caller->CalleeEdges;
|
|
for (auto CalleeEdge : CalleeEdges) {
|
|
// Skip any that have been removed on an earlier iteration when
|
|
// cleaning up newly None type callee edges.
|
|
if (CalleeEdge->isRemoved()) {
|
|
assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
|
|
continue;
|
|
}
|
|
assert(CalleeEdge);
|
|
ContextNode *Callee = CalleeEdge->Callee;
|
|
// Skip the current callsite; we are looking for other
// callsites Caller calls, as well as any that do not have a
// recorded callsite Call.
|
|
if (Callee == Clone || !Callee->hasCall())
|
|
continue;
|
|
// Skip direct recursive calls. We don't need/want to clone the
|
|
// caller node again, and this loop will not behave as expected if
|
|
// we tried.
|
|
if (Callee == CalleeEdge->Caller)
|
|
continue;
|
|
ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
|
|
removeNoneTypeCalleeEdges(NewClone);
|
|
// Moving the edge may have resulted in some none type
|
|
// callee edges on the original Callee.
|
|
removeNoneTypeCalleeEdges(Callee);
|
|
assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
|
|
// If the Callee node was already assigned to call a specific
|
|
// function version, make sure its new clone is assigned to call
|
|
// that same function clone.
|
|
if (CallsiteToCalleeFuncCloneMap.count(Callee))
|
|
RecordCalleeFuncOfCallsite(
|
|
NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
|
|
// Update NewClone with the new Call clone of this callsite's Call
|
|
// created for the new function clone created earlier.
|
|
// Recall that we have already ensured when building the graph
|
|
// that each caller can only call callsites within the same
|
|
// function, so we are guaranteed that Callee Call is in the
|
|
// current OrigFunc.
|
|
// CallMap is set up as indexed by original Call at clone 0.
|
|
CallInfo OrigCall(Callee->getOrigNode()->Call);
|
|
OrigCall.setCloneNo(0);
|
|
std::map<CallInfo, CallInfo> &CallMap =
|
|
FuncClonesToCallMap[NewFuncClone];
|
|
assert(CallMap.count(OrigCall));
|
|
CallInfo NewCall(CallMap[OrigCall]);
|
|
assert(NewCall);
|
|
NewClone->setCall(NewCall);
|
|
// Need to do the same for all matching calls.
|
|
for (auto &MatchingCall : NewClone->MatchingCalls) {
|
|
CallInfo OrigMatchingCall(MatchingCall);
|
|
OrigMatchingCall.setCloneNo(0);
|
|
assert(CallMap.count(OrigMatchingCall));
|
|
CallInfo NewCall(CallMap[OrigMatchingCall]);
|
|
assert(NewCall);
|
|
// Updates the call in the list.
|
|
MatchingCall = NewCall;
|
|
}
|
|
}
|
|
}
|
|
// Fall through to handling below to perform the recording of the
|
|
// function for this callsite clone. This enables handling of cases
|
|
// where the callers were assigned to different clones of a function.
|
|
}
|
|
|
|
// See if we can use existing function clone. Walk through
|
|
// all caller edges to see if any have already been assigned to
|
|
// a clone of this callsite's function. If we can use it, do so. If not,
|
|
// because that function clone is already assigned to a different clone
|
|
// of this callsite, then we need to clone again.
|
|
// Basically, this checking is needed to handle the case where different
|
|
// caller functions/callsites may need versions of this function
|
|
// containing different mixes of callsite clones across the different
|
|
// callsites within the function. If that happens, we need to create
|
|
// additional function clones to handle the various combinations.
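//
// Illustrative example (hypothetical): OrigFunc contains callsites C1 and C2.
// One caller's contexts require (C1 clone, original C2) while another's
// require (C1 clone, C2 clone); no single clone of OrigFunc can hold both
// combinations, so an additional function clone (and callsite clone) is
// created here.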
|
|
//
|
|
// Keep track of any new clones of this callsite created by the
|
|
// following loop, as well as any existing clone that we decided to
|
|
// assign this clone to.
|
|
std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
|
|
FuncInfo FuncCloneAssignedToCurCallsiteClone;
|
|
// Iterate over a copy of Clone's caller edges, since we may need to
|
|
// remove edges in the moveEdgeTo* methods, and this simplifies the
|
|
// handling and makes it less error-prone.
|
|
auto CloneCallerEdges = Clone->CallerEdges;
|
|
for (auto &Edge : CloneCallerEdges) {
|
|
// Skip removed edges (due to direct recursive edges updated when
|
|
// updating callee edges when moving an edge and subsequently
|
|
// removed by call to removeNoneTypeCalleeEdges on the Clone).
|
|
if (Edge->isRemoved())
|
|
continue;
|
|
// Ignore any caller that does not have a recorded callsite Call.
|
|
if (!Edge->Caller->hasCall())
|
|
continue;
|
|
// If this caller is already assigned to call a version of OrigFunc, we need
// to ensure we can assign this callsite clone to that function clone.
|
|
if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
|
|
FuncInfo FuncCloneCalledByCaller =
|
|
CallsiteToCalleeFuncCloneMap[Edge->Caller];
|
|
// First we need to confirm that this function clone is available
|
|
// for use by this callsite node clone.
|
|
//
|
|
// While FuncCloneToCurNodeCloneMap is built only for this Node and
|
|
// its callsite clones, one of those callsite clones X could have
|
|
// been assigned to the same function clone called by Edge's caller
|
|
// - if Edge's caller calls another callsite within Node's original
|
|
// function, and that callsite has another caller reaching clone X.
|
|
// We need to clone Node again in this case.
|
|
if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
|
|
FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
|
|
Clone) ||
|
|
// Detect when we have multiple callers of this callsite that
|
|
// have already been assigned to specific, and different, clones
|
|
// of OrigFunc (due to other unrelated callsites in Func they
|
|
// reach via call contexts). Is this Clone of callsite Node
|
|
// assigned to a different clone of OrigFunc? If so, clone Node
|
|
// again.
|
|
(FuncCloneAssignedToCurCallsiteClone &&
|
|
FuncCloneAssignedToCurCallsiteClone !=
|
|
FuncCloneCalledByCaller)) {
|
|
// We need to use a different newly created callsite clone, in
|
|
// order to assign it to another new function clone on a
|
|
// subsequent iteration over the Clones array (adjusted below).
|
|
// Note we specifically do not reset the
|
|
// CallsiteToCalleeFuncCloneMap entry for this caller, so that
|
|
// when this new clone is processed later we know which version of
|
|
// the function to copy (so that other callsite clones we have
|
|
// assigned to that function clone are properly cloned over). See
|
|
// comments in the function cloning handling earlier.
|
|
|
|
// Check if we already have cloned this callsite again while
|
|
// walking through caller edges, for a caller calling the same
|
|
// function clone. If so, we can move this edge to that new clone
|
|
// rather than creating yet another new clone.
|
|
if (FuncCloneToNewCallsiteCloneMap.count(
|
|
FuncCloneCalledByCaller)) {
|
|
ContextNode *NewClone =
|
|
FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
|
|
moveEdgeToExistingCalleeClone(Edge, NewClone);
|
|
// Cleanup any none type edges cloned over.
|
|
removeNoneTypeCalleeEdges(NewClone);
|
|
} else {
|
|
// Create a new callsite clone.
|
|
ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
|
|
removeNoneTypeCalleeEdges(NewClone);
|
|
FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
|
|
NewClone;
|
|
// Add to list of clones and process later.
|
|
ClonesWorklist.push_back(NewClone);
|
|
assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
|
|
}
|
|
// Moving the caller edge may have resulted in some none type
|
|
// callee edges.
|
|
removeNoneTypeCalleeEdges(Clone);
|
|
// We will handle the newly created callsite clone in a subsequent
|
|
// iteration over this Node's Clones.
|
|
continue;
|
|
}
|
|
|
|
// Otherwise, we can use the function clone already assigned to this
|
|
// caller.
|
|
if (!FuncCloneAssignedToCurCallsiteClone) {
|
|
FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
|
|
// Assign Clone to FuncCloneCalledByCaller
|
|
AssignCallsiteCloneToFuncClone(
|
|
FuncCloneCalledByCaller, Call, Clone,
|
|
AllocationCallToContextNodeMap.count(Call));
|
|
} else
|
|
// Don't need to do anything - callsite is already calling this
|
|
// function clone.
|
|
assert(FuncCloneAssignedToCurCallsiteClone ==
|
|
FuncCloneCalledByCaller);
|
|
|
|
} else {
|
|
// We have not already assigned this caller to a version of
|
|
// OrigFunc. Do the assignment now.
|
|
|
|
// First check if we have already assigned this callsite clone to a
|
|
// clone of OrigFunc for another caller during this iteration over
|
|
// its caller edges.
|
|
if (!FuncCloneAssignedToCurCallsiteClone) {
|
|
// Find first function in FuncClonesToCallMap without an assigned
|
|
// clone of this callsite Node. We should always have one
|
|
// available at this point due to the earlier cloning when the
|
|
// FuncClonesToCallMap size was smaller than the clone number.
|
|
for (auto &CF : FuncClonesToCallMap) {
|
|
if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
|
|
FuncCloneAssignedToCurCallsiteClone = CF.first;
|
|
break;
|
|
}
|
|
}
|
|
assert(FuncCloneAssignedToCurCallsiteClone);
|
|
// Assign Clone to FuncCloneAssignedToCurCallsiteClone
|
|
AssignCallsiteCloneToFuncClone(
|
|
FuncCloneAssignedToCurCallsiteClone, Call, Clone,
|
|
AllocationCallToContextNodeMap.count(Call));
|
|
} else
|
|
assert(FuncCloneToCurNodeCloneMap
|
|
[FuncCloneAssignedToCurCallsiteClone] == Clone);
|
|
// Update callers to record function version called.
|
|
RecordCalleeFuncOfCallsite(Edge->Caller,
|
|
FuncCloneAssignedToCurCallsiteClone);
|
|
}
|
|
}
|
|
}
|
|
if (VerifyCCG) {
|
|
checkNode<DerivedCCG, FuncTy, CallTy>(Node);
|
|
for (const auto &PE : Node->CalleeEdges)
|
|
checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
|
|
for (const auto &CE : Node->CallerEdges)
|
|
checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
|
|
for (auto *Clone : Node->Clones) {
|
|
checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
|
|
for (const auto &PE : Clone->CalleeEdges)
|
|
checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
|
|
for (const auto &CE : Clone->CallerEdges)
|
|
checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
uint8_t BothTypes =
(uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
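// I.e., an alloc type mask with both Cold and NotCold set; used below to
// detect allocations whose remaining contexts are still ambiguous.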
|
|
|
|
auto UpdateCalls = [&](ContextNode *Node,
|
|
DenseSet<const ContextNode *> &Visited,
|
|
auto &&UpdateCalls) {
|
|
auto Inserted = Visited.insert(Node);
|
|
if (!Inserted.second)
|
|
return;
|
|
|
|
for (auto *Clone : Node->Clones)
|
|
UpdateCalls(Clone, Visited, UpdateCalls);
|
|
|
|
for (auto &Edge : Node->CallerEdges)
|
|
UpdateCalls(Edge->Caller, Visited, UpdateCalls);
|
|
|
|
// Skip if either no call to update, or if we ended up with no context ids
|
|
// (we moved all edges onto other clones).
|
|
if (!Node->hasCall() || Node->emptyContextIds())
|
|
return;
|
|
|
|
if (Node->IsAllocation) {
|
|
auto AT = allocTypeToUse(Node->AllocTypes);
|
|
// If the allocation type is ambiguous, and more aggressive hinting
// has been enabled via the MinClonedColdBytePercent flag, see if this
// allocation should be hinted cold anyway because the fraction of bytes
// allocated cold is at least the given threshold.
|
|
if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
|
|
!ContextIdToContextSizeInfos.empty()) {
|
|
uint64_t TotalCold = 0;
|
|
uint64_t Total = 0;
|
|
for (auto Id : Node->getContextIds()) {
|
|
auto TypeI = ContextIdToAllocationType.find(Id);
|
|
assert(TypeI != ContextIdToAllocationType.end());
|
|
auto CSI = ContextIdToContextSizeInfos.find(Id);
|
|
if (CSI != ContextIdToContextSizeInfos.end()) {
|
|
for (auto &Info : CSI->second) {
|
|
Total += Info.TotalSize;
|
|
if (TypeI->second == AllocationType::Cold)
|
|
TotalCold += Info.TotalSize;
|
|
}
|
|
}
|
|
}
|
|
if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
|
|
AT = AllocationType::Cold;
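// Worked example (hypothetical numbers): with MinClonedColdBytePercent=90,
// TotalCold=950 and Total=1000 gives 95000 >= 90000, so the ambiguous
// allocation is hinted cold; with TotalCold=850 it keeps the type chosen
// by allocTypeToUse above.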
|
|
}
|
|
updateAllocationCall(Node->Call, AT);
|
|
assert(Node->MatchingCalls.empty());
|
|
return;
|
|
}
|
|
|
|
if (!CallsiteToCalleeFuncCloneMap.count(Node))
|
|
return;
|
|
|
|
auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
|
|
updateCall(Node->Call, CalleeFunc);
|
|
// Update all the matching calls as well.
|
|
for (auto &Call : Node->MatchingCalls)
|
|
updateCall(Call, CalleeFunc);
|
|
};
|
|
|
|
// Performs DFS traversal starting from allocation nodes to update calls to
|
|
// reflect cloning decisions recorded earlier. For regular LTO this will
|
|
// update the actual calls in the IR to call the appropriate function clone
|
|
// (and add attributes to allocation calls), whereas for ThinLTO the decisions
|
|
// are recorded in the summary entries.
|
|
DenseSet<const ContextNode *> Visited;
|
|
for (auto &Entry : AllocationCallToContextNodeMap)
|
|
UpdateCalls(Entry.second, Visited, UpdateCalls);
|
|
|
|
return Changed;
|
|
}
|
|
|
|
static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
|
|
Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
|
|
std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
|
|
&FuncToAliasMap) {
|
|
// The first "clone" is the original copy; we should only call this if we
// needed to create new clones.
|
|
assert(NumClones > 1);
|
|
SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
|
|
VMaps.reserve(NumClones - 1);
|
|
FunctionsClonedThinBackend++;
|
|
for (unsigned I = 1; I < NumClones; I++) {
|
|
VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
|
|
auto *NewF = CloneFunction(&F, *VMaps.back());
|
|
FunctionClonesThinBackend++;
|
|
// Strip memprof and callsite metadata from clone as they are no longer
|
|
// needed.
|
|
for (auto &BB : *NewF) {
|
|
for (auto &Inst : BB) {
|
|
Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
|
|
Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
|
|
}
|
|
}
|
|
std::string Name = getMemProfFuncName(F.getName(), I);
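// E.g., for a function named "foo", clone 1 would typically be named
// "foo.memprof.1" (the exact suffix is determined by getMemProfFuncName).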
|
|
auto *PrevF = M.getFunction(Name);
|
|
if (PrevF) {
|
|
// We might have created this when adjusting a callsite in another
// function. It should be a declaration.
|
|
assert(PrevF->isDeclaration());
|
|
NewF->takeName(PrevF);
|
|
PrevF->replaceAllUsesWith(NewF);
|
|
PrevF->eraseFromParent();
|
|
} else
|
|
NewF->setName(Name);
|
|
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
|
|
<< "created clone " << ore::NV("NewFunction", NewF));
|
|
|
|
// Now handle aliases to this function, and clone those as well.
|
|
if (!FuncToAliasMap.count(&F))
|
|
continue;
|
|
for (auto *A : FuncToAliasMap[&F]) {
|
|
std::string Name = getMemProfFuncName(A->getName(), I);
|
|
auto *PrevA = M.getNamedAlias(Name);
|
|
auto *NewA = GlobalAlias::create(A->getValueType(),
|
|
A->getType()->getPointerAddressSpace(),
|
|
A->getLinkage(), Name, NewF);
|
|
NewA->copyAttributesFrom(A);
|
|
if (PrevA) {
|
|
// We might have created this when adjusting a callsite in another
// function. It should be a declaration.
|
|
assert(PrevA->isDeclaration());
|
|
NewA->takeName(PrevA);
|
|
PrevA->replaceAllUsesWith(NewA);
|
|
PrevA->eraseFromParent();
|
|
}
|
|
}
|
|
}
|
|
return VMaps;
|
|
}
|
|
|
|
// Locate the summary for F. This is complicated by the fact that it might
|
|
// have been internalized or promoted.
|
|
static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
|
|
const ModuleSummaryIndex *ImportSummary,
|
|
const Function *CallingFunc = nullptr) {
|
|
// FIXME: Ideally we would retain the original GUID in some fashion on the
|
|
// function (e.g. as metadata), but for now do our best to locate the
|
|
// summary without that information.
|
|
ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
|
|
if (!TheFnVI)
|
|
// See if theFn was internalized, by checking index directly with
|
|
// original name (this avoids the name adjustment done by getGUID() for
|
|
// internal symbols).
|
|
TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
|
|
if (TheFnVI)
|
|
return TheFnVI;
|
|
// Now query with the original name before any promotion was performed.
|
|
StringRef OrigName =
|
|
ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
|
|
// When this pass is enabled, we always add thinlto_src_file provenance
|
|
// metadata to imported function definitions, which allows us to recreate the
|
|
// original internal symbol's GUID.
|
|
auto SrcFileMD = F.getMetadata("thinlto_src_file");
|
|
// If this is a call to an imported/promoted local for which we didn't import
|
|
// the definition, the metadata will not exist on the declaration. However,
|
|
// since we are doing this early, before any inlining in the LTO backend, we
|
|
// can simply look at the metadata on the calling function which must have
|
|
// been from the same module if F was an internal symbol originally.
|
|
if (!SrcFileMD && F.isDeclaration()) {
|
|
// We would only call this for a declaration for a direct callsite, in which
|
|
// case the caller would have provided the calling function pointer.
|
|
assert(CallingFunc);
|
|
SrcFileMD = CallingFunc->getMetadata("thinlto_src_file");
|
|
// If this is a promoted local (OrigName != F.getName()), since this is a
|
|
// declaration, it must be imported from a different module and therefore we
|
|
// should always find the metadata on its calling function. Any call to a
|
|
// promoted local that came from this module should still be a definition.
|
|
assert(SrcFileMD || OrigName == F.getName());
|
|
}
|
|
StringRef SrcFile = M.getSourceFileName();
|
|
if (SrcFileMD)
|
|
SrcFile = dyn_cast<MDString>(SrcFileMD->getOperand(0))->getString();
|
|
std::string OrigId = GlobalValue::getGlobalIdentifier(
|
|
OrigName, GlobalValue::InternalLinkage, SrcFile);
|
|
TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
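// E.g. (hypothetical), an internal function "bar" originally defined in
// "a.cpp" is looked up via the global identifier built from ("bar",
// InternalLinkage, "a.cpp"), i.e. the same GUID computed for it when the
// summary index was built.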
|
|
// An internal function in the original module may have gotten a numbered
// suffix if we imported an external function with the same name. This happens
// automatically during IR linking for naming conflicts. It would have to
// still be internal in that case (otherwise it would have been renamed on
// promotion, in which case we wouldn't have a naming conflict).
|
|
if (!TheFnVI && OrigName == F.getName() && F.hasLocalLinkage() &&
|
|
F.getName().contains('.')) {
|
|
OrigName = F.getName().rsplit('.').first;
|
|
OrigId = GlobalValue::getGlobalIdentifier(
|
|
OrigName, GlobalValue::InternalLinkage, SrcFile);
|
|
TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
|
|
}
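// E.g. (hypothetical), a local "foo" that clashed with an imported "foo"
// during IR linking may appear here with a numbered suffix such as "foo.1";
// stripping the suffix recovers the name recorded in the index.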
|
|
// The only way we may not have a VI is if this is a declaration created for
|
|
// an imported reference. For distributed ThinLTO we may not have a VI for
|
|
// such declarations in the distributed summary.
|
|
assert(TheFnVI || F.isDeclaration());
|
|
return TheFnVI;
|
|
}
|
|
|
|
bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
|
|
Module &M) {
|
|
ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
|
|
Symtab = std::make_unique<InstrProfSymtab>();
|
|
// Don't add canonical names, to avoid mapping multiple functions to the same
// symtab entry when they share the same root name with "." suffixes stripped.
// If we pick the wrong one then this could lead to incorrect ICP and calling
// a memprof clone that we don't actually create (resulting in linker unsats).
// What this means is that the GUID of the function (or its PGOFuncName
// metadata) *must* match that in the VP metadata to allow promotion.
// In practice this should not be a limitation, since local functions should
// have PGOFuncName metadata and global function names shouldn't need any
// special handling (they should not get the ".llvm.*" suffix that the
// canonicalization handling is attempting to strip).
|
|
if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
|
|
std::string SymtabFailure = toString(std::move(E));
|
|
M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool MemProfContextDisambiguation::applyImport(Module &M) {
|
|
assert(ImportSummary);
|
|
bool Changed = false;
|
|
|
|
// We also need to clone any aliases that reference cloned functions, because
|
|
// the modified callsites may invoke via the alias. Keep track of the aliases
|
|
// for each function.
|
|
std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
|
|
FuncToAliasMap;
|
|
for (auto &A : M.aliases()) {
|
|
auto *Aliasee = A.getAliaseeObject();
|
|
if (auto *F = dyn_cast<Function>(Aliasee))
|
|
FuncToAliasMap[F].insert(&A);
|
|
}
|
|
|
|
if (!initializeIndirectCallPromotionInfo(M))
|
|
return false;
|
|
|
|
for (auto &F : M) {
|
|
if (F.isDeclaration() || isMemProfClone(F))
|
|
continue;
|
|
|
|
OptimizationRemarkEmitter ORE(&F);
|
|
|
|
SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
|
|
bool ClonesCreated = false;
|
|
unsigned NumClonesCreated = 0;
|
|
auto CloneFuncIfNeeded = [&](unsigned NumClones) {
|
|
// We should at least have version 0 which is the original copy.
|
|
assert(NumClones > 0);
|
|
// If only one copy is needed, use the original.
|
|
if (NumClones == 1)
|
|
return;
|
|
// If we already performed cloning of this function, confirm that the
|
|
// requested number of clones matches (the thin link should ensure the
|
|
// number of clones for each constituent callsite is consistent within
|
|
// each function), before returning.
|
|
if (ClonesCreated) {
|
|
assert(NumClonesCreated == NumClones);
|
|
return;
|
|
}
|
|
VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
|
|
// The first "clone" is the original copy, which doesn't have a VMap.
|
|
assert(VMaps.size() == NumClones - 1);
|
|
Changed = true;
|
|
ClonesCreated = true;
|
|
NumClonesCreated = NumClones;
|
|
};
|
|
|
|
auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
|
|
Function *CalledFunction) {
|
|
// Perform cloning if not yet done.
|
|
CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
|
|
|
|
assert(!isMemProfClone(*CalledFunction));
|
|
|
|
// Update the calls per the summary info.
|
|
// Save orig name since it gets updated in the first iteration
|
|
// below.
|
|
auto CalleeOrigName = CalledFunction->getName();
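// Illustrative example (hypothetical values): StackNode.Clones == {0, 2}
// means copy 0 of this caller keeps calling CalleeOrigName, while copy 1
// (reached through VMaps[0]) is redirected below to clone 2 of the callee.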
|
|
for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
|
|
// Do nothing if this version calls the original version of its
|
|
// callee.
|
|
if (!StackNode.Clones[J])
|
|
continue;
|
|
auto NewF = M.getOrInsertFunction(
|
|
getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
|
|
CalledFunction->getFunctionType());
|
|
CallBase *CBClone;
|
|
// Copy 0 is the original function.
|
|
if (!J)
|
|
CBClone = CB;
|
|
else
|
|
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
|
|
CBClone->setCalledFunction(NewF);
|
|
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
|
|
<< ore::NV("Call", CBClone) << " in clone "
|
|
<< ore::NV("Caller", CBClone->getFunction())
|
|
<< " assigned to call function clone "
|
|
<< ore::NV("Callee", NewF.getCallee()));
|
|
}
|
|
};
|
|
|
|
// Locate the summary for F.
|
|
ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
|
|
// If not found, this could be an imported local (see comment in
|
|
// findValueInfoForFunc). Skip for now as it will be cloned in its original
|
|
// module (where it would have been promoted to global scope so should
|
|
// satisfy any reference in this module).
|
|
if (!TheFnVI)
|
|
continue;
|
|
|
|
auto *GVSummary =
|
|
ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
|
|
if (!GVSummary) {
// Must have been imported, use the summary which matches the definition
// (there might be multiple if this was a linkonce_odr).
|
|
auto SrcModuleMD = F.getMetadata("thinlto_src_module");
|
|
assert(SrcModuleMD &&
|
|
"enable-import-metadata is needed to emit thinlto_src_module");
|
|
StringRef SrcModule =
|
|
dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
|
|
for (auto &GVS : TheFnVI.getSummaryList()) {
|
|
if (GVS->modulePath() == SrcModule) {
|
|
GVSummary = GVS.get();
|
|
break;
|
|
}
|
|
}
|
|
assert(GVSummary && GVSummary->modulePath() == SrcModule);
|
|
}
|
|
|
|
// If this was an imported alias, skip it as we won't have the function
// summary, and it should be cloned in the original module.
|
|
if (isa<AliasSummary>(GVSummary))
|
|
continue;
|
|
|
|
auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
|
|
|
|
if (FS->allocs().empty() && FS->callsites().empty())
|
|
continue;
|
|
|
|
auto SI = FS->callsites().begin();
|
|
auto AI = FS->allocs().begin();
|
|
|
|
// To handle callsite infos synthesized for tail calls which have missing
|
|
// frames in the profiled context, map callee VI to the synthesized callsite
|
|
// info.
|
|
DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
|
|
// Iterate the callsites for this function in reverse, since we place all
|
|
// those synthesized for tail calls at the end.
|
|
for (auto CallsiteIt = FS->callsites().rbegin();
|
|
CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
|
|
auto &Callsite = *CallsiteIt;
|
|
// Stop as soon as we see a non-synthesized callsite info (see comment
|
|
// above loop). All the entries added for discovered tail calls have empty
|
|
// stack ids.
|
|
if (!Callsite.StackIdIndices.empty())
|
|
break;
|
|
MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
|
|
}
|
|
|
|
// Keeps track of needed ICP for the function.
|
|
SmallVector<ICallAnalysisData> ICallAnalysisInfo;
|
|
|
|
// Assume for now that the instructions are in the exact same order
|
|
// as when the summary was created, but confirm this is correct by
|
|
// matching the stack ids.
|
|
for (auto &BB : F) {
|
|
for (auto &I : BB) {
|
|
auto *CB = dyn_cast<CallBase>(&I);
|
|
// Same handling as when creating module summary.
|
|
if (!mayHaveMemprofSummary(CB))
|
|
continue;
|
|
|
|
auto *CalledValue = CB->getCalledOperand();
|
|
auto *CalledFunction = CB->getCalledFunction();
|
|
if (CalledValue && !CalledFunction) {
|
|
CalledValue = CalledValue->stripPointerCasts();
|
|
// Stripping pointer casts can reveal a called function.
|
|
CalledFunction = dyn_cast<Function>(CalledValue);
|
|
}
|
|
// Check if this is an alias to a function. If so, get the
|
|
// called aliasee for the checks below.
|
|
if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
|
|
assert(!CalledFunction &&
|
|
"Expected null called function in callsite for alias");
|
|
CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
|
|
}
|
|
|
|
CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
|
|
I.getMetadata(LLVMContext::MD_callsite));
|
|
auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
|
|
|
|
// Include allocs that were already assigned a memprof function
|
|
// attribute in the statistics.
|
|
if (CB->getAttributes().hasFnAttr("memprof")) {
|
|
assert(!MemProfMD);
|
|
CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
|
|
? AllocTypeColdThinBackend++
|
|
: AllocTypeNotColdThinBackend++;
|
|
OrigAllocsThinBackend++;
|
|
AllocVersionsThinBackend++;
|
|
if (!MaxAllocVersionsThinBackend)
|
|
MaxAllocVersionsThinBackend = 1;
|
|
continue;
|
|
}
|
|
|
|
if (MemProfMD) {
|
|
// Consult the next alloc node.
|
|
assert(AI != FS->allocs().end());
|
|
auto &AllocNode = *(AI++);
|
|
|
|
// Sanity check that the MIB stack ids match between the summary and
|
|
// instruction metadata.
|
|
auto MIBIter = AllocNode.MIBs.begin();
|
|
for (auto &MDOp : MemProfMD->operands()) {
|
|
assert(MIBIter != AllocNode.MIBs.end());
|
|
LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter =
|
|
MIBIter->StackIdIndices.begin();
|
|
auto *MIBMD = cast<const MDNode>(MDOp);
|
|
MDNode *StackMDNode = getMIBStackNode(MIBMD);
|
|
assert(StackMDNode);
|
|
CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
|
|
auto ContextIterBegin =
|
|
StackContext.beginAfterSharedPrefix(CallsiteContext);
|
|
// Skip the checking on the first iteration.
|
|
uint64_t LastStackContextId =
|
|
(ContextIterBegin != StackContext.end() &&
|
|
*ContextIterBegin == 0)
|
|
? 1
|
|
: 0;
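// (I.e., seed LastStackContextId with a value guaranteed to differ from the
// first frame's id, so the duplicate-skipping check in the loop below cannot
// fire on the first iteration.)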
|
|
for (auto ContextIter = ContextIterBegin;
|
|
ContextIter != StackContext.end(); ++ContextIter) {
|
|
// If this is a direct recursion, simply skip the duplicate
|
|
// entries, to be consistent with how the summary ids were
|
|
// generated during ModuleSummaryAnalysis.
|
|
if (LastStackContextId == *ContextIter)
|
|
continue;
|
|
LastStackContextId = *ContextIter;
|
|
assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
|
|
assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
|
|
*ContextIter);
|
|
StackIdIndexIter++;
|
|
}
|
|
MIBIter++;
|
|
}
|
|
|
|
// Perform cloning if not yet done.
|
|
CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
|
|
|
|
OrigAllocsThinBackend++;
|
|
AllocVersionsThinBackend += AllocNode.Versions.size();
|
|
if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
|
|
MaxAllocVersionsThinBackend = AllocNode.Versions.size();
|
|
|
|
// If there is only one version, that means we didn't end up
// considering this function for cloning, and in that case the alloc
// will still be None type or should have gotten the default NotCold.
// Skip it, but only after calling the clone helper above, since that does
// some sanity checks confirming we haven't decided that cloning is needed.
// We might have a single version that is cold due to the
// MinClonedColdBytePercent heuristic, so make sure we don't skip in that
// case.
|
|
if (AllocNode.Versions.size() == 1 &&
|
|
(AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
|
|
assert((AllocationType)AllocNode.Versions[0] ==
|
|
AllocationType::NotCold ||
|
|
(AllocationType)AllocNode.Versions[0] ==
|
|
AllocationType::None);
|
|
UnclonableAllocsThinBackend++;
|
|
continue;
|
|
}
|
|
|
|
// All versions should have a singular allocation type.
|
|
assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
|
|
return Type == ((uint8_t)AllocationType::NotCold |
|
|
(uint8_t)AllocationType::Cold);
|
|
}));
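// E.g., Versions == {NotCold, Cold} is the expected shape here (hypothetical
// values); an entry equal to (NotCold | Cold) would mean the thin link left
// that version ambiguous.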
|
|
|
|
// Update the allocation types per the summary info.
|
|
for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
|
|
// Ignore any that didn't get an assigned allocation type.
|
|
if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
|
|
continue;
|
|
AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
|
|
AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
|
|
: AllocTypeNotColdThinBackend++;
|
|
std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
|
|
auto A = llvm::Attribute::get(F.getContext(), "memprof",
|
|
AllocTypeString);
|
|
CallBase *CBClone;
|
|
// Copy 0 is the original function.
|
|
if (!J)
|
|
CBClone = CB;
|
|
else
|
|
// Since VMaps are only created for new clones, we index with
|
|
// clone J-1 (J==0 is the original clone and does not have a VMaps
|
|
// entry).
|
|
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
|
|
CBClone->addFnAttr(A);
|
|
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
|
|
<< ore::NV("AllocationCall", CBClone) << " in clone "
|
|
<< ore::NV("Caller", CBClone->getFunction())
|
|
<< " marked with memprof allocation attribute "
|
|
<< ore::NV("Attribute", AllocTypeString));
|
|
}
|
|
} else if (!CallsiteContext.empty()) {
|
|
if (!CalledFunction) {
|
|
#ifndef NDEBUG
|
|
// We should have skipped inline assembly calls.
|
|
auto *CI = dyn_cast<CallInst>(CB);
|
|
assert(!CI || !CI->isInlineAsm());
|
|
#endif
|
|
// We should have skipped direct calls via a Constant.
|
|
assert(CalledValue && !isa<Constant>(CalledValue));
|
|
|
|
// This is an indirect call, see if we have profile information and
|
|
// whether any clones were recorded for the profiled targets (that
|
|
// we synthesized CallsiteInfo summary records for when building the
|
|
// index).
|
|
auto NumClones =
|
|
recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);
|
|
|
|
// Perform cloning if not yet done. This is done here in case
|
|
// we don't need to do ICP, but might need to clone this
|
|
// function as it is the target of other cloned calls.
|
|
if (NumClones)
|
|
CloneFuncIfNeeded(NumClones);
|
|
}
|
|
|
|
else {
|
|
// Consult the next callsite node.
|
|
assert(SI != FS->callsites().end());
|
|
auto &StackNode = *(SI++);
|
|
|
|
#ifndef NDEBUG
|
|
// Sanity check that the stack ids match between the summary and
|
|
// instruction metadata.
|
|
auto StackIdIndexIter = StackNode.StackIdIndices.begin();
|
|
for (auto StackId : CallsiteContext) {
|
|
assert(StackIdIndexIter != StackNode.StackIdIndices.end());
|
|
assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
|
|
StackId);
|
|
StackIdIndexIter++;
|
|
}
|
|
#endif
|
|
|
|
CloneCallsite(StackNode, CB, CalledFunction);
|
|
}
|
|
} else if (CB->isTailCall() && CalledFunction) {
|
|
// Locate the synthesized callsite info for the callee VI, if any was
|
|
// created, and use that for cloning.
|
|
ValueInfo CalleeVI =
|
|
findValueInfoForFunc(*CalledFunction, M, ImportSummary, &F);
|
|
if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
|
|
auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
|
|
assert(Callsite != MapTailCallCalleeVIToCallsite.end());
|
|
CloneCallsite(Callsite->second, CB, CalledFunction);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Now do any promotion required for cloning.
|
|
performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
|
|
}
|
|
|
|
// We skip some of the functions and instructions above, so remove all the
|
|
// metadata in a single sweep here.
|
|
for (auto &F : M) {
|
|
// We can skip memprof clones because createFunctionClones already strips
|
|
// the metadata from the newly created clones.
|
|
if (F.isDeclaration() || isMemProfClone(F))
|
|
continue;
|
|
for (auto &BB : F) {
|
|
for (auto &I : BB) {
|
|
if (!isa<CallBase>(I))
|
|
continue;
|
|
I.setMetadata(LLVMContext::MD_memprof, nullptr);
|
|
I.setMetadata(LLVMContext::MD_callsite, nullptr);
|
|
}
|
|
}
|
|
}
|
|
|
|
return Changed;
|
|
}
|
|
|
|
unsigned MemProfContextDisambiguation::recordICPInfo(
|
|
CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
|
|
ArrayRef<CallsiteInfo>::iterator &SI,
|
|
SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
|
|
// First see if we have profile information for this indirect call.
|
|
uint32_t NumCandidates;
|
|
uint64_t TotalCount;
|
|
auto CandidateProfileData =
|
|
ICallAnalysis->getPromotionCandidatesForInstruction(CB, TotalCount,
|
|
NumCandidates);
|
|
if (CandidateProfileData.empty())
|
|
return 0;
|
|
|
|
// Iterate through all of the candidate profiled targets along with the
// CallsiteInfo summary records synthesized for them when building the index,
// and see if any are cloned and/or refer to clones.
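// For example (hypothetical values): a candidate whose StackNode.Clones is
// {0, 1} requires ICP, since clone 1 of this caller must reach clone 1 of
// the target; if every entry is 0 for all candidates, no promotion is needed.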
|
|
bool ICPNeeded = false;
|
|
unsigned NumClones = 0;
|
|
size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
|
|
for (const auto &Candidate : CandidateProfileData) {
|
|
#ifndef NDEBUG
|
|
auto CalleeValueInfo =
|
|
#endif
|
|
ImportSummary->getValueInfo(Candidate.Value);
|
|
// We might not have a ValueInfo if this is a distributed
// ThinLTO backend and we decided not to import that function.
|
|
assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
|
|
assert(SI != AllCallsites.end());
|
|
auto &StackNode = *(SI++);
|
|
// See if any of the clones of the indirect callsite for this
|
|
// profiled target should call a cloned version of the profiled
|
|
// target. We only need to do the ICP here if so.
|
|
ICPNeeded |= llvm::any_of(StackNode.Clones,
|
|
[](unsigned CloneNo) { return CloneNo != 0; });
|
|
// Every callsite in the same function should have been cloned the same
|
|
// number of times.
|
|
assert(!NumClones || NumClones == StackNode.Clones.size());
|
|
NumClones = StackNode.Clones.size();
|
|
}
|
|
if (!ICPNeeded)
|
|
return NumClones;
|
|
// Save information for ICP, which is performed later to avoid messing up the
|
|
// current function traversal.
|
|
ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
|
|
TotalCount, CallsiteInfoStartIndex});
|
|
return NumClones;
|
|
}
|
|
|
|
void MemProfContextDisambiguation::performICP(
|
|
Module &M, ArrayRef<CallsiteInfo> AllCallsites,
|
|
ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
|
|
ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
|
|
OptimizationRemarkEmitter &ORE) {
|
|
// Now do any promotion required for cloning. Specifically, for each
// recorded ICP candidate (which was only recorded because one clone of that
// candidate should call a cloned target), we perform ICP (speculative
// devirtualization) for each clone of the callsite, and update its callee
// to the appropriate clone. Note that the ICP compares against the original
// version of the target, which is what is in the vtable.
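// Conceptually, each promoted call clone is guarded roughly as follows
// (a sketch, not the exact IR produced by promoteIndirectCall):
//   if (callee == @Target)            ; compare against the original symbol
//     call @Target.memprof.N(...)     ; direct call retargeted to the clone
//   else
//     call %callee(...)               ; fallback remains indirect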
for (auto &Info : ICallAnalysisInfo) {
|
|
auto *CB = Info.CB;
|
|
auto CallsiteIndex = Info.CallsiteInfoStartIndex;
|
|
auto TotalCount = Info.TotalCount;
|
|
unsigned NumPromoted = 0;
|
|
unsigned NumClones = 0;
|
|
|
|
for (auto &Candidate : Info.CandidateProfileData) {
|
|
auto &StackNode = AllCallsites[CallsiteIndex++];
|
|
|
|
// All calls in the same function must have the same number of clones.
|
|
assert(!NumClones || NumClones == StackNode.Clones.size());
|
|
NumClones = StackNode.Clones.size();
|
|
|
|
// See if the target is in the module. If it wasn't imported, it is
|
|
// possible that this profile could have been collected on a different
|
|
// target (or version of the code), and we need to be conservative
|
|
// (similar to what is done in the ICP pass).
|
|
Function *TargetFunction = Symtab->getFunction(Candidate.Value);
|
|
if (TargetFunction == nullptr ||
|
|
// Any ThinLTO global dead symbol removal should have already
|
|
// occurred, so it should be safe to promote when the target is a
|
|
// declaration.
|
|
// TODO: Remove internal option once more fully tested.
|
|
(MemProfRequireDefinitionForPromotion &&
|
|
TargetFunction->isDeclaration())) {
|
|
ORE.emit([&]() {
|
|
return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
|
|
<< "Memprof cannot promote indirect call: target with md5sum "
|
|
<< ore::NV("target md5sum", Candidate.Value) << " not found";
|
|
});
|
|
// FIXME: See if we can use the new declaration importing support to
|
|
// at least get the declarations imported for this case. Hot indirect
|
|
// targets should have been imported normally, however.
|
|
continue;
|
|
}
|
|
|
|
// Check if legal to promote
|
|
const char *Reason = nullptr;
|
|
if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
|
|
ORE.emit([&]() {
|
|
return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
|
|
<< "Memprof cannot promote indirect call to "
|
|
<< ore::NV("TargetFunction", TargetFunction)
|
|
<< " with count of " << ore::NV("TotalCount", TotalCount)
|
|
<< ": " << Reason;
|
|
});
|
|
continue;
|
|
}
|
|
|
|
assert(!isMemProfClone(*TargetFunction));
|
|
|
|
// Handle each call clone, applying ICP so that each clone directly
|
|
// calls the specified callee clone, guarded by the appropriate ICP
|
|
// check.
|
|
CallBase *CBClone = CB;
|
|
for (unsigned J = 0; J < NumClones; J++) {
|
|
// Copy 0 is the original function.
|
|
if (J > 0)
|
|
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
|
|
// We do the promotion using the original name, so that the comparison
|
|
// is against the name in the vtable. Then just below, change the new
|
|
// direct call to call the cloned function.
|
|
auto &DirectCall =
|
|
pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
|
|
TotalCount, isSamplePGO, &ORE);
|
|
auto *TargetToUse = TargetFunction;
|
|
// Call original if this version calls the original version of its
|
|
// callee.
|
|
if (StackNode.Clones[J]) {
|
|
TargetToUse =
|
|
cast<Function>(M.getOrInsertFunction(
|
|
getMemProfFuncName(TargetFunction->getName(),
|
|
StackNode.Clones[J]),
|
|
TargetFunction->getFunctionType())
|
|
.getCallee());
|
|
}
|
|
DirectCall.setCalledFunction(TargetToUse);
|
|
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
|
|
<< ore::NV("Call", CBClone) << " in clone "
|
|
<< ore::NV("Caller", CBClone->getFunction())
|
|
<< " promoted and assigned to call function clone "
|
|
<< ore::NV("Callee", TargetToUse));
|
|
}
|
|
|
|
// Update TotalCount (all clones should get same count above)
|
|
TotalCount -= Candidate.Count;
|
|
NumPromoted++;
|
|
}
|
|
// Adjust the MD.prof metadata for all clones, now that we have the new
|
|
// TotalCount and the number promoted.
|
|
CallBase *CBClone = CB;
|
|
for (unsigned J = 0; J < NumClones; J++) {
|
|
// Copy 0 is the original function.
|
|
if (J > 0)
|
|
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
|
|
// First delete the old one.
|
|
CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
|
|
// If all targets were promoted, we don't need the MD.prof metadata.
// Otherwise we need to update it with the remaining un-promoted records.
|
|
if (TotalCount != 0)
|
|
annotateValueSite(
|
|
M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
|
|
TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
|
|
}
|
|
}
|
|
}
|
|
|
|
template <typename DerivedCCG, typename FuncTy, typename CallTy>
|
|
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
|
|
if (DumpCCG) {
|
|
dbgs() << "CCG before cloning:\n";
|
|
dbgs() << *this;
|
|
}
|
|
if (ExportToDot)
|
|
exportToDot("postbuild");
|
|
|
|
if (VerifyCCG) {
|
|
check();
|
|
}
|
|
|
|
identifyClones();
|
|
|
|
if (VerifyCCG) {
|
|
check();
|
|
}
|
|
|
|
if (DumpCCG) {
|
|
dbgs() << "CCG after cloning:\n";
|
|
dbgs() << *this;
|
|
}
|
|
if (ExportToDot)
|
|
exportToDot("cloned");
|
|
|
|
bool Changed = assignFunctions();
|
|
|
|
if (DumpCCG) {
|
|
dbgs() << "CCG after assigning function clones:\n";
|
|
dbgs() << *this;
|
|
}
|
|
if (ExportToDot)
|
|
exportToDot("clonefuncassign");
|
|
|
|
if (MemProfReportHintedSizes)
|
|
printTotalSizes(errs());
|
|
|
|
return Changed;
|
|
}
|
|
|
|
bool MemProfContextDisambiguation::processModule(
|
|
Module &M,
|
|
llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
|
|
|
|
// If we have an import summary, then the cloning decisions were made during
|
|
// the thin link on the index. Apply them and return.
|
|
if (ImportSummary)
|
|
return applyImport(M);
|
|
|
|
// TODO: If/when other types of memprof cloning are enabled beyond just for
|
|
// hot and cold, we will need to change this to individually control the
|
|
// AllocationType passed to addStackNodesForMIB during CCG construction.
|
|
// Note that we specifically check this after applying imports above, so that
|
|
// the option isn't needed to be passed to distributed ThinLTO backend
|
|
// clang processes, which won't necessarily have visibility into the linker
|
|
// dependences. Instead the information is communicated from the LTO link to
|
|
// the backends via the combined summary index.
|
|
if (!SupportsHotColdNew)
|
|
return false;
|
|
|
|
ModuleCallsiteContextGraph CCG(M, OREGetter);
|
|
return CCG.process();
|
|
}
|
|
|
|
MemProfContextDisambiguation::MemProfContextDisambiguation(
|
|
const ModuleSummaryIndex *Summary, bool isSamplePGO)
|
|
: ImportSummary(Summary), isSamplePGO(isSamplePGO) {
|
|
// Check the dot graph printing options once here, to make sure we have valid
|
|
// and expected combinations.
|
|
if (DotGraphScope == DotScope::Alloc && !AllocIdForDot.getNumOccurrences())
|
|
llvm::report_fatal_error(
|
|
"-memprof-dot-scope=alloc requires -memprof-dot-alloc-id");
|
|
if (DotGraphScope == DotScope::Context &&
|
|
!ContextIdForDot.getNumOccurrences())
|
|
llvm::report_fatal_error(
|
|
"-memprof-dot-scope=context requires -memprof-dot-context-id");
|
|
if (DotGraphScope == DotScope::All && AllocIdForDot.getNumOccurrences() &&
|
|
ContextIdForDot.getNumOccurrences())
|
|
llvm::report_fatal_error(
|
|
"-memprof-dot-scope=all can't have both -memprof-dot-alloc-id and "
|
|
"-memprof-dot-context-id");
|
|
if (ImportSummary) {
|
|
// The MemProfImportSummary should only be used for testing ThinLTO
|
|
// distributed backend handling via opt, in which case we don't have a
|
|
// summary from the pass pipeline.
|
|
assert(MemProfImportSummary.empty());
|
|
return;
|
|
}
|
|
if (MemProfImportSummary.empty())
|
|
return;
|
|
|
|
auto ReadSummaryFile =
|
|
errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
|
|
if (!ReadSummaryFile) {
|
|
logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
|
|
"Error loading file '" + MemProfImportSummary +
|
|
"': ");
|
|
return;
|
|
}
|
|
auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
|
|
if (!ImportSummaryForTestingOrErr) {
|
|
logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
|
|
"Error parsing file '" + MemProfImportSummary +
|
|
"': ");
|
|
return;
|
|
}
|
|
ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
|
|
ImportSummary = ImportSummaryForTesting.get();
|
|
}
|
|
|
|
PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
|
|
ModuleAnalysisManager &AM) {
|
|
auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
|
|
auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
|
|
return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
|
|
};
|
|
if (!processModule(M, OREGetter))
|
|
return PreservedAnalyses::all();
|
|
return PreservedAnalyses::none();
|
|
}
|
|
|
|
void MemProfContextDisambiguation::run(
|
|
ModuleSummaryIndex &Index,
|
|
llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
|
|
isPrevailing) {
|
|
// TODO: If/when other types of memprof cloning are enabled beyond just for
|
|
// hot and cold, we will need to change this to individually control the
|
|
// AllocationType passed to addStackNodesForMIB during CCG construction.
|
|
// The index was set from the option, so these should be in sync.
|
|
assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
|
|
if (!SupportsHotColdNew)
|
|
return;
|
|
|
|
IndexCallsiteContextGraph CCG(Index, isPrevailing);
|
|
CCG.process();
|
|
}
|