//===- bolt/Profile/StaleProfileMatching.cpp - Profile data matching ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// BOLT often has to deal with profiles collected on binaries built from several
// revisions behind release. As a result, a certain percentage of functions is
// considered stale and not optimized. This file implements the ability to match
// a profile to functions that are not 100% binary identical, thus increasing
// the optimization coverage and boosting the performance of applications.
//
// The algorithm consists of two phases: matching and inference:
// - At the matching phase, we try to "guess" as many block and jump counts from
//   the stale profile as possible. To this end, the content of each basic block
//   is hashed and stored in the (yaml) profile. When BOLT optimizes a binary,
//   it computes block hashes and identifies the corresponding entries in the
//   stale profile. It yields a partial profile for every CFG in the binary.
// - At the inference phase, we employ a network flow-based algorithm (profi) to
//   reconstruct "realistic" block and jump counts from the partial profile
//   generated at the first stage. In practice, we don't always produce proper
//   profile data but the majority (e.g., >90%) of CFGs get the correct counts.
//
//===----------------------------------------------------------------------===//

#include "bolt/Core/HashUtilities.h"
#include "bolt/Profile/YAMLProfileReader.h"
#include "llvm/ADT/Bitfields.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/MC/MCPseudoProbe.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/xxhash.h"
#include "llvm/Transforms/Utils/SampleProfileInference.h"

#include <queue>

using namespace llvm;

#undef DEBUG_TYPE
#define DEBUG_TYPE "bolt-prof"

namespace opts {

extern cl::opt<bool> TimeRewrite;
extern cl::OptionCategory BoltOptCategory;

cl::opt<bool>
    InferStaleProfile("infer-stale-profile",
                      cl::desc("Infer counts from stale profile data."),
                      cl::init(false), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingMinMatchedBlock(
    "stale-matching-min-matched-block",
    cl::desc("Percentage threshold of matched basic blocks at which stale "
             "profile inference is executed."),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingMaxFuncSize(
    "stale-matching-max-func-size",
    cl::desc("The maximum size of a function to consider for inference."),
    cl::init(10000), cl::Hidden, cl::cat(BoltOptCategory));
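// Example invocation (illustrative; file names are hypothetical): inference is
// requested explicitly when feeding BOLT a possibly-stale profile:
//   llvm-bolt app -o app.bolt --data=profile.yaml --infer-stale-profile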
// Parameters of the profile inference algorithm. The default values are tuned
// on several benchmarks.
static cl::opt<bool> StaleMatchingEvenFlowDistribution(
    "stale-matching-even-flow-distribution",
    cl::desc("Try to evenly distribute flow when there are multiple equally "
             "likely options."),
    cl::init(true), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<bool> StaleMatchingRebalanceUnknown(
    "stale-matching-rebalance-unknown",
    cl::desc("Evenly re-distribute flow among unknown subgraphs."),
    cl::init(false), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<bool> StaleMatchingJoinIslands(
    "stale-matching-join-islands",
    cl::desc("Join isolated components having positive flow."), cl::init(true),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingCostBlockInc(
    "stale-matching-cost-block-inc",
    cl::desc("The cost of increasing a block count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingCostBlockDec(
    "stale-matching-cost-block-dec",
    cl::desc("The cost of decreasing a block count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingCostJumpInc(
    "stale-matching-cost-jump-inc",
    cl::desc("The cost of increasing a jump count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingCostJumpDec(
    "stale-matching-cost-jump-dec",
    cl::desc("The cost of decreasing a jump count by one."), cl::init(150),
    cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingCostBlockUnknownInc(
    "stale-matching-cost-block-unknown-inc",
    cl::desc("The cost of increasing an unknown block count by one."),
    cl::init(1), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingCostJumpUnknownInc(
    "stale-matching-cost-jump-unknown-inc",
    cl::desc("The cost of increasing an unknown jump count by one."),
    cl::init(140), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> StaleMatchingCostJumpUnknownFTInc(
    "stale-matching-cost-jump-unknown-ft-inc",
    cl::desc(
        "The cost of increasing an unknown fall-through jump count by one."),
    cl::init(3), cl::ReallyHidden, cl::cat(BoltOptCategory));

cl::opt<bool> StaleMatchingWithPseudoProbes(
    "stale-matching-with-pseudo-probes",
    cl::desc("Turns on stale matching with block pseudo probes."),
    cl::init(false), cl::ReallyHidden, cl::cat(BoltOptCategory));

} // namespace opts

namespace llvm {
namespace bolt {

/// An object wrapping several components of a basic block hash. The combined
/// (blended) hash is represented and stored as one uint64_t, while individual
/// components are of smaller size (e.g., uint16_t or uint8_t).
struct BlendedBlockHash {
private:
  using ValueOffset = Bitfield::Element<uint16_t, 0, 16>;
  using ValueOpcode = Bitfield::Element<uint16_t, 16, 16>;
  using ValueInstr = Bitfield::Element<uint16_t, 32, 16>;
  using ValuePred = Bitfield::Element<uint8_t, 48, 8>;
  using ValueSucc = Bitfield::Element<uint8_t, 56, 8>;

public:
  explicit BlendedBlockHash() {}

  explicit BlendedBlockHash(uint64_t Hash) {
    Offset = Bitfield::get<ValueOffset>(Hash);
    OpcodeHash = Bitfield::get<ValueOpcode>(Hash);
    InstrHash = Bitfield::get<ValueInstr>(Hash);
    PredHash = Bitfield::get<ValuePred>(Hash);
    SuccHash = Bitfield::get<ValueSucc>(Hash);
  }

  /// Combine the blended hash into uint64_t.
  uint64_t combine() const {
    uint64_t Hash = 0;
    Bitfield::set<ValueOffset>(Hash, Offset);
    Bitfield::set<ValueOpcode>(Hash, OpcodeHash);
    Bitfield::set<ValueInstr>(Hash, InstrHash);
    Bitfield::set<ValuePred>(Hash, PredHash);
    Bitfield::set<ValueSucc>(Hash, SuccHash);
    return Hash;
  }
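  // For reference, the layout of the combined 64-bit hash, from the low bits
  // up (per the Bitfield::Element definitions above):
  //   [ 0..15] Offset   [16..31] OpcodeHash   [32..47] InstrHash
  //   [48..55] PredHash [56..63] SuccHash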
  /// Compute a distance between two given blended hashes. The smaller the
  /// distance, the more similar two blocks are. For identical basic blocks,
  /// the distance is zero.
  uint64_t distance(const BlendedBlockHash &BBH) const {
    assert(OpcodeHash == BBH.OpcodeHash &&
           "incorrect blended hash distance computation");
    uint64_t Dist = 0;
    // Account for NeighborHash
    Dist += SuccHash == BBH.SuccHash ? 0 : 1;
    Dist += PredHash == BBH.PredHash ? 0 : 1;
    Dist <<= 16;
    // Account for InstrHash
    Dist += InstrHash == BBH.InstrHash ? 0 : 1;
    Dist <<= 16;
    // Account for Offset
    Dist += (Offset >= BBH.Offset ? Offset - BBH.Offset : BBH.Offset - Offset);
    return Dist;
  }

  /// The offset of the basic block from the function start.
  uint16_t Offset{0};
  /// (Loose) Hash of the basic block instructions, excluding operands.
  uint16_t OpcodeHash{0};
  /// (Strong) Hash of the basic block instructions, including opcodes and
  /// operands.
  uint16_t InstrHash{0};
  /// (Loose) Hashes of the predecessors of the basic block.
  uint8_t PredHash{0};
  /// (Loose) Hashes of the successors of the basic block.
  uint8_t SuccHash{0};
};

/// The object is used to identify and match basic blocks in a BinaryFunction
/// given their hashes computed on a binary built from several revisions behind
/// release.
class StaleMatcher {
public:
  /// Initialize stale matcher.
  void init(const std::vector<FlowBlock *> &Blocks,
            const std::vector<BlendedBlockHash> &Hashes,
            const std::vector<uint64_t> &CallHashes) {
    assert(Blocks.size() == Hashes.size() &&
           Hashes.size() == CallHashes.size() &&
           "incorrect matcher initialization");
    for (size_t I = 0; I < Blocks.size(); I++) {
      FlowBlock *Block = Blocks[I];
      uint16_t OpHash = Hashes[I].OpcodeHash;
      OpHashToBlocks[OpHash].push_back(std::make_pair(Hashes[I], Block));
      if (CallHashes[I])
        CallHashToBlocks[CallHashes[I]].push_back(
            std::make_pair(Hashes[I], Block));
    }
  }

  /// Creates a mapping from a pseudo probe to a flow block.
  void mapProbeToBB(const MCDecodedPseudoProbe *Probe, FlowBlock *Block) {
    BBPseudoProbeToBlock[Probe] = Block;
  }

  enum MatchMethod : char {
    MATCH_EXACT = 0,
    MATCH_PROBE_EXACT,
    MATCH_PROBE_LOOSE,
    MATCH_OPCODE,
    MATCH_CALL,
    NO_MATCH
  };

  /// Find the most similar flow block for a profile block given blended hash.
  std::pair<const FlowBlock *, MatchMethod>
  matchBlockStrict(BlendedBlockHash BlendedHash) {
    const auto &[Block, ExactHash] = matchWithOpcodes(BlendedHash);
    if (Block && ExactHash)
      return {Block, MATCH_EXACT};
    return {nullptr, NO_MATCH};
  }

  /// Find the most similar flow block for a profile block given pseudo probes.
  std::pair<const FlowBlock *, MatchMethod> matchBlockProbe(
      const ArrayRef<yaml::bolt::PseudoProbeInfo> PseudoProbes,
      const YAMLProfileReader::InlineTreeNodeMapTy &InlineTreeNodeMap) {
    const auto &[ProbeBlock, ExactProbe] =
        matchWithPseudoProbes(PseudoProbes, InlineTreeNodeMap);
    if (ProbeBlock)
      return {ProbeBlock, ExactProbe ? MATCH_PROBE_EXACT : MATCH_PROBE_LOOSE};
    return {nullptr, NO_MATCH};
  }

  /// Find the most similar flow block for a profile block given its hashes.
  std::pair<const FlowBlock *, MatchMethod>
  matchBlockLoose(BlendedBlockHash BlendedHash, uint64_t CallHash) {
    if (const FlowBlock *CallBlock = matchWithCalls(BlendedHash, CallHash))
      return {CallBlock, MATCH_CALL};
    if (const FlowBlock *OpcodeBlock = matchWithOpcodes(BlendedHash).first)
      return {OpcodeBlock, MATCH_OPCODE};
    return {nullptr, NO_MATCH};
  }

  /// Returns true if the two basic blocks (in the binary and in the profile)
  /// corresponding to the given hashes are matched to each other with a high
  /// confidence.
  static bool isHighConfidenceMatch(BlendedBlockHash Hash1,
                                    BlendedBlockHash Hash2) {
    return Hash1.InstrHash == Hash2.InstrHash;
  }
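  // Note on precedence: callers are expected to try matchBlockStrict first,
  // then matchBlockProbe, and only then matchBlockLoose; see matchWeights
  // below, where earlier (higher-confidence) matches are never overridden.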
private:
  using HashBlockPairType = std::pair<BlendedBlockHash, FlowBlock *>;
  std::unordered_map<uint16_t, std::vector<HashBlockPairType>> OpHashToBlocks;
  std::unordered_map<uint64_t, std::vector<HashBlockPairType>> CallHashToBlocks;
  DenseMap<const MCDecodedPseudoProbe *, FlowBlock *> BBPseudoProbeToBlock;

  // Uses OpcodeHash to find the most similar block for a given hash.
  std::pair<const FlowBlock *, bool>
  matchWithOpcodes(BlendedBlockHash BlendedHash) const {
    auto BlockIt = OpHashToBlocks.find(BlendedHash.OpcodeHash);
    if (BlockIt == OpHashToBlocks.end())
      return {nullptr, false};
    FlowBlock *BestBlock = nullptr;
    uint64_t BestDist = std::numeric_limits<uint64_t>::max();
    BlendedBlockHash BestHash;
    for (const auto &[Hash, Block] : BlockIt->second) {
      uint64_t Dist = Hash.distance(BlendedHash);
      if (BestBlock == nullptr || Dist < BestDist) {
        BestDist = Dist;
        BestBlock = Block;
        BestHash = Hash;
      }
    }
    return {BestBlock, isHighConfidenceMatch(BestHash, BlendedHash)};
  }

  // Uses CallHash to find the most similar block for a given hash.
  const FlowBlock *matchWithCalls(BlendedBlockHash BlendedHash,
                                  uint64_t CallHash) const {
    if (!CallHash)
      return nullptr;
    auto BlockIt = CallHashToBlocks.find(CallHash);
    if (BlockIt == CallHashToBlocks.end())
      return nullptr;
    FlowBlock *BestBlock = nullptr;
    uint64_t BestDist = std::numeric_limits<uint64_t>::max();
    for (const auto &[Hash, Block] : BlockIt->second) {
      uint64_t Dist = Hash.OpcodeHash > BlendedHash.OpcodeHash
                          ? Hash.OpcodeHash - BlendedHash.OpcodeHash
                          : BlendedHash.OpcodeHash - Hash.OpcodeHash;
      if (BestBlock == nullptr || Dist < BestDist) {
        BestDist = Dist;
        BestBlock = Block;
      }
    }
    return BestBlock;
  }

  /// Matches a profile block with a binary block based on pseudo probes.
  /// Returns the best matching block (or nullptr) and whether the match is
  /// unambiguous.
  std::pair<const FlowBlock *, bool> matchWithPseudoProbes(
      const ArrayRef<yaml::bolt::PseudoProbeInfo> BlockPseudoProbes,
      const YAMLProfileReader::InlineTreeNodeMapTy &InlineTreeNodeMap) const {
    if (!opts::StaleMatchingWithPseudoProbes)
      return {nullptr, false};

    DenseMap<const FlowBlock *, uint32_t> FlowBlockMatchCount;

    auto matchProfileProbeToBlock =
        [&](uint32_t NodeId, uint64_t ProbeId) -> const FlowBlock * {
      const MCDecodedPseudoProbeInlineTree *BinaryNode =
          InlineTreeNodeMap.getInlineTreeNode(NodeId);
      if (!BinaryNode)
        return nullptr;
      const MCDecodedPseudoProbe Dummy(0, ProbeId, PseudoProbeType::Block, 0, 0,
                                       nullptr);
      ArrayRef<MCDecodedPseudoProbe> BinaryProbes = BinaryNode->getProbes();
      auto BinaryProbeIt = llvm::lower_bound(
          BinaryProbes, Dummy, [](const auto &LHS, const auto &RHS) {
            return LHS.getIndex() < RHS.getIndex();
          });
      if (BinaryProbeIt == BinaryNode->getProbes().end() ||
          BinaryProbeIt->getIndex() != ProbeId)
        return nullptr;
      auto It = BBPseudoProbeToBlock.find(&*BinaryProbeIt);
      if (It == BBPseudoProbeToBlock.end())
        return nullptr;
      return It->second;
    };

    auto matchPseudoProbeInfo =
        [&](const yaml::bolt::PseudoProbeInfo &ProfileProbe, uint32_t NodeId) {
          for (uint64_t Index = 0; Index < 64; ++Index)
            if (ProfileProbe.BlockMask & 1ull << Index)
              ++FlowBlockMatchCount[matchProfileProbeToBlock(NodeId,
                                                             Index + 1)];
          for (const auto &ProfileProbes :
               {ProfileProbe.BlockProbes, ProfileProbe.IndCallProbes,
                ProfileProbe.CallProbes})
            for (uint64_t ProfileProbe : ProfileProbes)
              ++FlowBlockMatchCount[matchProfileProbeToBlock(NodeId,
                                                             ProfileProbe)];
        };

    for (const yaml::bolt::PseudoProbeInfo &ProfileProbe : BlockPseudoProbes) {
      if (!ProfileProbe.InlineTreeNodes.empty())
        for (uint32_t ProfileInlineTreeNode : ProfileProbe.InlineTreeNodes)
          matchPseudoProbeInfo(ProfileProbe, ProfileInlineTreeNode);
      else
        matchPseudoProbeInfo(ProfileProbe, ProfileProbe.InlineTreeIndex);
    }
    uint32_t BestMatchCount = 0;
    uint32_t TotalMatchCount = 0;
    const FlowBlock *BestMatchBlock = nullptr;
    for (const auto &[FlowBlock, Count] : FlowBlockMatchCount) {
      TotalMatchCount += Count;
      if (Count < BestMatchCount || (Count == BestMatchCount && BestMatchBlock))
        continue;
      BestMatchBlock = FlowBlock;
      BestMatchCount = Count;
    }
    // The match is unambiguous only if every matched profile probe voted for
    // the same flow block.
    return {BestMatchBlock, BestMatchCount == TotalMatchCount};
  }
};
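// Illustrative use of StaleMatcher (hypothetical variable values; the real
// call sites are in matchWeights below):
//   StaleMatcher Matcher;
//   Matcher.init(Blocks, BlendedHashes, CallHashes);
//   auto [Block, Method] = Matcher.matchBlockStrict(YamlHash);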
void BinaryFunction::computeBlockHashes(HashFunction HashFunction) const {
  if (size() == 0)
    return;

  assert(hasCFG() && "the function is expected to have CFG");

  std::vector<BlendedBlockHash> BlendedHashes(BasicBlocks.size());
  std::vector<uint64_t> OpcodeHashes(BasicBlocks.size());
  // Initialize hash components.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    assert(BB->getIndex() == I && "incorrect block index");
    BlendedHashes[I].Offset = BB->getOffset();
    // Hashing complete instructions.
    std::string InstrHashStr = hashBlock(
        BC, *BB, [&](const MCOperand &Op) { return hashInstOperand(BC, Op); });
    if (HashFunction == HashFunction::StdHash) {
      uint64_t InstrHash = std::hash<std::string>{}(InstrHashStr);
      BlendedHashes[I].InstrHash = (uint16_t)hash_value(InstrHash);
    } else if (HashFunction == HashFunction::XXH3) {
      uint64_t InstrHash = llvm::xxh3_64bits(InstrHashStr);
      BlendedHashes[I].InstrHash = (uint16_t)InstrHash;
    } else {
      llvm_unreachable("Unhandled HashFunction");
    }
    // Hashing opcodes.
    std::string OpcodeHashStr = hashBlockLoose(BC, *BB);
    if (HashFunction == HashFunction::StdHash) {
      OpcodeHashes[I] = std::hash<std::string>{}(OpcodeHashStr);
      BlendedHashes[I].OpcodeHash = (uint16_t)hash_value(OpcodeHashes[I]);
    } else if (HashFunction == HashFunction::XXH3) {
      OpcodeHashes[I] = llvm::xxh3_64bits(OpcodeHashStr);
      BlendedHashes[I].OpcodeHash = (uint16_t)OpcodeHashes[I];
    } else {
      llvm_unreachable("Unhandled HashFunction");
    }
  }

  // Initialize neighbor hash.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    // Append hashes of successors.
    uint64_t Hash = 0;
    for (BinaryBasicBlock *SuccBB : BB->successors()) {
      uint64_t SuccHash = OpcodeHashes[SuccBB->getIndex()];
      Hash = hashing::detail::hash_16_bytes(Hash, SuccHash);
    }
    if (HashFunction == HashFunction::StdHash) {
      // Compatibility with old behavior.
      BlendedHashes[I].SuccHash = (uint8_t)hash_value(Hash);
    } else {
      BlendedHashes[I].SuccHash = (uint8_t)Hash;
    }

    // Append hashes of predecessors.
    Hash = 0;
    for (BinaryBasicBlock *PredBB : BB->predecessors()) {
      uint64_t PredHash = OpcodeHashes[PredBB->getIndex()];
      Hash = hashing::detail::hash_16_bytes(Hash, PredHash);
    }
    if (HashFunction == HashFunction::StdHash) {
      // Compatibility with old behavior.
      BlendedHashes[I].PredHash = (uint8_t)hash_value(Hash);
    } else {
      BlendedHashes[I].PredHash = (uint8_t)Hash;
    }
  }

  // Assign hashes.
  for (size_t I = 0; I < BasicBlocks.size(); I++) {
    const BinaryBasicBlock *BB = BasicBlocks[I];
    BB->setHash(BlendedHashes[I].combine());
  }
}
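// Note: every hash component above is truncated to 16 (or 8) bits, so
// collisions within a bucket are expected; StaleMatcher resolves them by
// picking the candidate with the smallest BlendedBlockHash::distance().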
// TODO: mediate the difference between flow function construction here in BOLT
// and in the compiler by splitting blocks with exception throwing calls at the
// call and adding the landing pad as the successor.
/// Create a wrapper flow function to use with the profile inference algorithm,
/// and initialize its jumps and metadata.
FlowFunction
createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) {
  FlowFunction Func;

  // Add a special "dummy" source so that there is always a unique entry point.
  FlowBlock EntryBlock;
  EntryBlock.Index = 0;
  Func.Blocks.push_back(EntryBlock);

  // Create FlowBlock for every basic block in the binary function.
  for (const BinaryBasicBlock *BB : BlockOrder) {
    Func.Blocks.emplace_back();
    FlowBlock &Block = Func.Blocks.back();
    Block.Index = Func.Blocks.size() - 1;
    (void)BB;
    assert(Block.Index == BB->getIndex() + 1 &&
           "incorrectly assigned basic block index");
  }

  // Add a special "dummy" sink block so there is always a unique sink.
  FlowBlock SinkBlock;
  SinkBlock.Index = Func.Blocks.size();
  Func.Blocks.push_back(SinkBlock);

  // Create FlowJump for each jump between basic blocks in the binary function.
  std::vector<uint64_t> InDegree(Func.Blocks.size(), 0);
  for (const BinaryBasicBlock *SrcBB : BlockOrder) {
    std::unordered_set<const BinaryBasicBlock *> UniqueSuccs;
    // Collect regular jumps
    for (const BinaryBasicBlock *DstBB : SrcBB->successors()) {
      // Ignoring parallel edges
      if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
        continue;

      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = DstBB->getIndex() + 1;
      InDegree[Jump.Target]++;
      UniqueSuccs.insert(DstBB);
    }
    // TODO: set jump from exit block to landing pad to Unlikely.
    // If the block is an exit, add a dummy edge from it to the sink block.
    if (UniqueSuccs.empty()) {
      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = Func.Blocks.size() - 1;
      InDegree[Jump.Target]++;
    }

    // Collect jumps to landing pads
    for (const BinaryBasicBlock *DstBB : SrcBB->landing_pads()) {
      // Ignoring parallel edges
      if (UniqueSuccs.find(DstBB) != UniqueSuccs.end())
        continue;

      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = SrcBB->getIndex() + 1;
      Jump.Target = DstBB->getIndex() + 1;
      InDegree[Jump.Target]++;
      UniqueSuccs.insert(DstBB);
    }
  }

  // Add dummy edges to the extra sources. If there are multiple entry blocks,
  // add an unlikely edge from 0 to the subsequent ones. Skips the sink block.
  assert(InDegree[0] == 0 && "dummy entry blocks shouldn't have predecessors");
  for (uint64_t I = 1; I < Func.Blocks.size() - 1; I++) {
    const BinaryBasicBlock *BB = BlockOrder[I - 1];
    if (BB->isEntryPoint() || InDegree[I] == 0) {
      Func.Jumps.emplace_back();
      FlowJump &Jump = Func.Jumps.back();
      Jump.Source = 0;
      Jump.Target = I;
      if (!BB->isEntryPoint())
        Jump.IsUnlikely = true;
    }
  }

  // Create necessary metadata for the flow function
  for (FlowJump &Jump : Func.Jumps) {
    assert(Jump.Source < Func.Blocks.size());
    Func.Blocks[Jump.Source].SuccJumps.push_back(&Jump);
    assert(Jump.Target < Func.Blocks.size());
    Func.Blocks[Jump.Target].PredJumps.push_back(&Jump);
  }
  return Func;
}
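// Resulting indexing (for reference): with N binary blocks, Func.Blocks[0] is
// the dummy source, Func.Blocks[1..N] mirror BlockOrder[0..N-1], and
// Func.Blocks[N + 1] is the dummy sink.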
/// Assign initial block/jump weights based on the stale profile data. The goal
/// is to extract as much information from the stale profile as possible. Here
/// we assume that each basic block is specified via a hash value computed from
/// its content and the hashes of the unchanged basic blocks stay the same
/// across different revisions of the binary. Blocks may also have pseudo probe
/// information in the profile and the binary which is used for matching.
/// Whenever there is a count in the profile with the hash corresponding to one
/// of the basic blocks in the binary, the count is "matched" to the block.
/// Similarly, if both the source and the target of a count in the profile are
/// matched to a jump in the binary, the count is recorded in CFG.
size_t matchWeights(
    BinaryContext &BC, const BinaryFunction::BasicBlockOrderType &BlockOrder,
    const yaml::bolt::BinaryFunctionProfile &YamlBF, FlowFunction &Func,
    HashFunction HashFunction, YAMLProfileReader::ProfileLookupMap &IdToYamlBF,
    const BinaryFunction &BF,
    const ArrayRef<YAMLProfileReader::ProbeMatchSpec> ProbeMatchSpecs) {

  assert(Func.Blocks.size() == BlockOrder.size() + 2);

  std::vector<uint64_t> CallHashes;
  std::vector<FlowBlock *> Blocks;
  std::vector<BlendedBlockHash> BlendedHashes;
  for (uint64_t I = 0; I < BlockOrder.size(); I++) {
    const BinaryBasicBlock *BB = BlockOrder[I];
    assert(BB->getHash() != 0 && "empty hash of BinaryBasicBlock");

    std::string CallHashStr = hashBlockCalls(BC, *BB);
    if (CallHashStr.empty()) {
      CallHashes.push_back(0);
    } else {
      if (HashFunction == HashFunction::StdHash)
        CallHashes.push_back(std::hash<std::string>{}(CallHashStr));
      else if (HashFunction == HashFunction::XXH3)
        CallHashes.push_back(llvm::xxh3_64bits(CallHashStr));
      else
        llvm_unreachable("Unhandled HashFunction");
    }

    Blocks.push_back(&Func.Blocks[I + 1]);
    BlendedBlockHash BlendedHash(BB->getHash());
    BlendedHashes.push_back(BlendedHash);
    LLVM_DEBUG(dbgs() << "BB with index " << I << " has hash = "
                      << Twine::utohexstr(BB->getHash()) << "\n");
  }
  StaleMatcher Matcher;
  // Collects function pseudo probes for use in the StaleMatcher.
  if (opts::StaleMatchingWithPseudoProbes) {
    const MCPseudoProbeDecoder *Decoder = BC.getPseudoProbeDecoder();
    assert(Decoder &&
           "If pseudo probes are in use, pseudo probe decoder should exist");
    const AddressProbesMap &ProbeMap = Decoder->getAddress2ProbesMap();
    const uint64_t FuncAddr = BF.getAddress();
    for (const MCDecodedPseudoProbe &Probe :
         ProbeMap.find(FuncAddr, FuncAddr + BF.getSize()))
      if (const BinaryBasicBlock *BB =
              BF.getBasicBlockContainingOffset(Probe.getAddress() - FuncAddr))
        Matcher.mapProbeToBB(&Probe, Blocks[BB->getIndex()]);
  }
  Matcher.init(Blocks, BlendedHashes, CallHashes);

  using FlowBlockTy =
      std::pair<const FlowBlock *, const yaml::bolt::BinaryBasicBlockProfile *>;
  using ProfileBlockMatchMap = DenseMap<uint32_t, FlowBlockTy>;
  // Binary profile => block index => matched block + its block profile
  DenseMap<const yaml::bolt::BinaryFunctionProfile *, ProfileBlockMatchMap>
      MatchedBlocks;
  // Map of FlowBlock and matching method.
  DenseMap<const FlowBlock *, StaleMatcher::MatchMethod> MatchedFlowBlocks;

  auto addMatchedBlock =
      [&](std::pair<const FlowBlock *, StaleMatcher::MatchMethod> BlockMethod,
          const yaml::bolt::BinaryFunctionProfile &YamlBP,
          const yaml::bolt::BinaryBasicBlockProfile &YamlBB) {
        const auto &[MatchedBlock, Method] = BlockMethod;
        if (!MatchedBlock)
          return;
        // Don't override earlier matches
        if (MatchedFlowBlocks.contains(MatchedBlock))
          return;
        MatchedFlowBlocks.try_emplace(MatchedBlock, Method);
        MatchedBlocks[&YamlBP][YamlBB.Index] = {MatchedBlock, &YamlBB};
      };

  // Match blocks from the profile to the blocks in CFG by strict hash.
  for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
    // Update matching stats.
    ++BC.Stats.NumStaleBlocks;
    BC.Stats.StaleSampleCount += YamlBB.ExecCount;

    assert(YamlBB.Hash != 0 && "empty hash of BinaryBasicBlockProfile");
    BlendedBlockHash YamlHash(YamlBB.Hash);
    addMatchedBlock(Matcher.matchBlockStrict(YamlHash), YamlBF, YamlBB);
  }
  // Match blocks from the profile to the blocks in CFG by pseudo probes.
  for (const auto &[InlineNodeMap, YamlBP] : ProbeMatchSpecs) {
    for (const yaml::bolt::BinaryBasicBlockProfile &BB : YamlBP.get().Blocks)
      if (!BB.PseudoProbes.empty())
        addMatchedBlock(Matcher.matchBlockProbe(BB.PseudoProbes, InlineNodeMap),
                        YamlBP, BB);
  }
  // Match blocks from the profile to the blocks in CFG with loose methods.
  for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) {
    assert(YamlBB.Hash != 0 && "empty hash of BinaryBasicBlockProfile");
    BlendedBlockHash YamlHash(YamlBB.Hash);

    std::string CallHashStr = hashBlockCalls(IdToYamlBF, YamlBB);
    uint64_t CallHash = 0;
    if (!CallHashStr.empty()) {
      if (HashFunction == HashFunction::StdHash)
        CallHash = std::hash<std::string>{}(CallHashStr);
      else if (HashFunction == HashFunction::XXH3)
        CallHash = llvm::xxh3_64bits(CallHashStr);
      else
        llvm_unreachable("Unhandled HashFunction");
    }
    auto [MatchedBlock, Method] = Matcher.matchBlockLoose(YamlHash, CallHash);
    if (MatchedBlock == nullptr && YamlBB.Index == 0) {
      MatchedBlock = Blocks[0];
      // Report as loose match
      Method = StaleMatcher::MATCH_OPCODE;
    }
    if (!MatchedBlock) {
      LLVM_DEBUG(dbgs() << "Couldn't match yaml block (bid = " << YamlBB.Index
                        << ")" << " with hash " << Twine::utohexstr(YamlBB.Hash)
                        << "\n");
      continue;
    }
    addMatchedBlock({MatchedBlock, Method}, YamlBF, YamlBB);
  }

  // Match jumps from the profile to the jumps from CFG
  std::vector<uint64_t> OutWeight(Func.Blocks.size(), 0);
  std::vector<uint64_t> InWeight(Func.Blocks.size(), 0);

  for (const auto &[YamlBF, MatchMap] : MatchedBlocks) {
    for (const auto &[YamlBBIdx, FlowBlockProfile] : MatchMap) {
      const auto &[MatchedBlock, YamlBB] = FlowBlockProfile;
      StaleMatcher::MatchMethod Method = MatchedFlowBlocks.lookup(MatchedBlock);
      BlendedBlockHash BinHash = BlendedHashes[MatchedBlock->Index - 1];
      LLVM_DEBUG(dbgs() << "Matched yaml block (bid = " << YamlBBIdx << ")"
                        << " with hash " << Twine::utohexstr(YamlBB->Hash)
                        << " to BB (index = " << MatchedBlock->Index - 1 << ")"
                        << " with hash " << Twine::utohexstr(BinHash.combine())
                        << "\n");
      (void)BinHash;
      uint64_t ExecCount = YamlBB->ExecCount;
      // Update matching stats accounting for the matched block.
      switch (Method) {
      case StaleMatcher::MATCH_EXACT:
        ++BC.Stats.NumExactMatchedBlocks;
        BC.Stats.ExactMatchedSampleCount += ExecCount;
        LLVM_DEBUG(dbgs() << "  exact match\n");
        break;
      case StaleMatcher::MATCH_PROBE_EXACT:
        ++BC.Stats.NumPseudoProbeExactMatchedBlocks;
        BC.Stats.PseudoProbeExactMatchedSampleCount += ExecCount;
        LLVM_DEBUG(dbgs() << "  exact pseudo probe match\n");
        break;
      case StaleMatcher::MATCH_PROBE_LOOSE:
        ++BC.Stats.NumPseudoProbeLooseMatchedBlocks;
        BC.Stats.PseudoProbeLooseMatchedSampleCount += ExecCount;
        LLVM_DEBUG(dbgs() << "  loose pseudo probe match\n");
        break;
      case StaleMatcher::MATCH_CALL:
        ++BC.Stats.NumCallMatchedBlocks;
        BC.Stats.CallMatchedSampleCount += ExecCount;
        LLVM_DEBUG(dbgs() << "  call match\n");
        break;
      case StaleMatcher::MATCH_OPCODE:
        ++BC.Stats.NumLooseMatchedBlocks;
        BC.Stats.LooseMatchedSampleCount += ExecCount;
        LLVM_DEBUG(dbgs() << "  loose match\n");
        break;
      case StaleMatcher::NO_MATCH:
        LLVM_DEBUG(dbgs() << "  no match\n");
      }
    }

    for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF->Blocks) {
      for (const yaml::bolt::SuccessorInfo &YamlSI : YamlBB.Successors) {
        if (YamlSI.Count == 0)
          continue;

        // Try to find the jump for a given (src, dst) pair from the profile
        // and assign the jump weight based on the profile count
        const uint64_t SrcIndex = YamlBB.Index;
        const uint64_t DstIndex = YamlSI.Index;

        const FlowBlock *MatchedSrcBlock = MatchMap.lookup(SrcIndex).first;
        const FlowBlock *MatchedDstBlock = MatchMap.lookup(DstIndex).first;

        if (MatchedSrcBlock != nullptr && MatchedDstBlock != nullptr) {
          // Find a jump between the two blocks
          FlowJump *Jump = nullptr;
          for (FlowJump *SuccJump : MatchedSrcBlock->SuccJumps) {
            if (SuccJump->Target == MatchedDstBlock->Index) {
              Jump = SuccJump;
              break;
            }
          }
          // Assign the weight, if the corresponding jump is found
          if (Jump != nullptr) {
            Jump->Weight = YamlSI.Count;
            Jump->HasUnknownWeight = false;
          }
        }
        // Assign the weight for the src block, if it is found
        if (MatchedSrcBlock != nullptr)
          OutWeight[MatchedSrcBlock->Index] += YamlSI.Count;
        // Assign the weight for the dst block, if it is found
        if (MatchedDstBlock != nullptr)
          InWeight[MatchedDstBlock->Index] += YamlSI.Count;
      }
    }
  }

  // Assign block counts based on in-/out- jumps
  for (FlowBlock &Block : Func.Blocks) {
    if (OutWeight[Block.Index] == 0 && InWeight[Block.Index] == 0) {
      assert(Block.HasUnknownWeight && "unmatched block with a positive count");
      continue;
    }
    Block.HasUnknownWeight = false;
    Block.Weight = std::max(OutWeight[Block.Index], InWeight[Block.Index]);
  }

  return MatchedBlocks[&YamlBF].size();
}
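// For reference, an approximate shape of a YAML profile block entry consumed
// by matchWeights above (values are hypothetical; field names follow the BOLT
// YAML profile format):
//   - bid: 1                              # YamlBB.Index
//     hash: 0xE3A1C5F0D4B29876            # YamlBB.Hash (a blended hash)
//     exec: 100                           # YamlBB.ExecCount
//     succ: [ { bid: 2, cnt: 80 }, { bid: 3, cnt: 20 } ]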
/// The function finds all blocks that are unreachable from the entry block or
/// do not have a path to an exit, and marks all such blocks 'cold' so that
/// profi does not send any flow to them.
void preprocessUnreachableBlocks(FlowFunction &Func) {
  const uint64_t NumBlocks = Func.Blocks.size();

  // Start bfs from the source
  std::queue<uint64_t> Queue;
  std::vector<bool> VisitedEntry(NumBlocks, false);
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.isEntry()) {
      Queue.push(I);
      VisitedEntry[I] = true;
      break;
    }
  }
  while (!Queue.empty()) {
    const uint64_t Src = Queue.front();
    Queue.pop();
    for (FlowJump *Jump : Func.Blocks[Src].SuccJumps) {
      const uint64_t Dst = Jump->Target;
      if (!VisitedEntry[Dst]) {
        Queue.push(Dst);
        VisitedEntry[Dst] = true;
      }
    }
  }

  // Start bfs from all sinks
  std::vector<bool> VisitedExit(NumBlocks, false);
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.isExit() && VisitedEntry[I]) {
      Queue.push(I);
      VisitedExit[I] = true;
    }
  }
  while (!Queue.empty()) {
    const uint64_t Src = Queue.front();
    Queue.pop();
    for (FlowJump *Jump : Func.Blocks[Src].PredJumps) {
      const uint64_t Dst = Jump->Source;
      if (!VisitedExit[Dst]) {
        Queue.push(Dst);
        VisitedExit[Dst] = true;
      }
    }
  }

  // Make all blocks of zero weight so that flow is not sent
  for (uint64_t I = 0; I < NumBlocks; I++) {
    FlowBlock &Block = Func.Blocks[I];
    if (Block.Weight == 0)
      continue;
    if (!VisitedEntry[I] || !VisitedExit[I]) {
      Block.Weight = 0;
      Block.HasUnknownWeight = true;
      Block.IsUnlikely = true;
      for (FlowJump *Jump : Block.SuccJumps) {
        if (Jump->Source == Block.Index && Jump->Target == Block.Index) {
          Jump->Weight = 0;
          Jump->HasUnknownWeight = true;
          Jump->IsUnlikely = true;
        }
      }
    }
  }
}

/// Decide if stale profile matching can be applied for a given function.
/// Currently we skip inference for (very) large instances and for instances
/// having "unexpected" control flow (e.g., having no sink basic blocks).
bool canApplyInference(const FlowFunction &Func,
                       const yaml::bolt::BinaryFunctionProfile &YamlBF,
                       const uint64_t &MatchedBlocks) {
  if (Func.Blocks.size() > opts::StaleMatchingMaxFuncSize)
    return false;

  if (MatchedBlocks * 100 <
      opts::StaleMatchingMinMatchedBlock * YamlBF.Blocks.size())
    return false;

  // Returns false if the artificial sink block has no predecessors, meaning
  // there are no exit blocks.
  if (Func.Blocks[Func.Blocks.size() - 1].isEntry())
    return false;

  return true;
}

/// Apply the profile inference algorithm for a given flow function.
void applyInference(FlowFunction &Func) {
  ProfiParams Params;
  // Set the params from the command-line flags.
  Params.EvenFlowDistribution = opts::StaleMatchingEvenFlowDistribution;
  Params.RebalanceUnknown = opts::StaleMatchingRebalanceUnknown;
  Params.JoinIslands = opts::StaleMatchingJoinIslands;

  Params.CostBlockInc = opts::StaleMatchingCostBlockInc;
  Params.CostBlockEntryInc = opts::StaleMatchingCostBlockInc;
  Params.CostBlockDec = opts::StaleMatchingCostBlockDec;
  Params.CostBlockEntryDec = opts::StaleMatchingCostBlockDec;
  Params.CostBlockUnknownInc = opts::StaleMatchingCostBlockUnknownInc;

  Params.CostJumpInc = opts::StaleMatchingCostJumpInc;
  Params.CostJumpFTInc = opts::StaleMatchingCostJumpInc;
  Params.CostJumpDec = opts::StaleMatchingCostJumpDec;
  Params.CostJumpFTDec = opts::StaleMatchingCostJumpDec;
  Params.CostJumpUnknownInc = opts::StaleMatchingCostJumpUnknownInc;
  Params.CostJumpUnknownFTInc = opts::StaleMatchingCostJumpUnknownFTInc;

  applyFlowInference(Params, Func);
}
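// With the default costs above, changing a matched block or jump count is
// expensive (150 per unit) while raising the count of an unknown (unmatched)
// block is nearly free (1 per unit), so inference prefers routing flow through
// unmatched blocks over contradicting matched counts.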
/// Collect inferred counts from the flow function and update annotations in
/// the binary function.
void assignProfile(BinaryFunction &BF,
                   const BinaryFunction::BasicBlockOrderType &BlockOrder,
                   FlowFunction &Func) {
  BinaryContext &BC = BF.getBinaryContext();

  assert(Func.Blocks.size() == BlockOrder.size() + 2);
  for (uint64_t I = 0; I < BlockOrder.size(); I++) {
    FlowBlock &Block = Func.Blocks[I + 1];
    BinaryBasicBlock *BB = BlockOrder[I];

    // Update block's count
    BB->setExecutionCount(Block.Flow);

    // Update jump counts: (i) clean existing counts and then (ii) set new ones
    auto BI = BB->branch_info_begin();
    for (const BinaryBasicBlock *DstBB : BB->successors()) {
      (void)DstBB;
      BI->Count = 0;
      BI->MispredictedCount = 0;
      ++BI;
    }
    for (FlowJump *Jump : Block.SuccJumps) {
      if (Jump->IsUnlikely)
        continue;
      if (Jump->Flow == 0)
        continue;

      // Skips the artificial sink block.
      if (Jump->Target == Func.Blocks.size() - 1)
        continue;
      BinaryBasicBlock &SuccBB = *BlockOrder[Jump->Target - 1];
      // Check if the edge corresponds to a regular jump or a landing pad
      if (BB->getSuccessor(SuccBB.getLabel())) {
        BinaryBasicBlock::BinaryBranchInfo &BI = BB->getBranchInfo(SuccBB);
        BI.Count += Jump->Flow;
      } else {
        BinaryBasicBlock *LP = BB->getLandingPad(SuccBB.getLabel());
        if (LP && LP->getKnownExecutionCount() < Jump->Flow)
          LP->setExecutionCount(Jump->Flow);
      }
    }

    // Update call-site annotations
    auto setOrUpdateAnnotation = [&](MCInst &Instr, StringRef Name,
                                     uint64_t Count) {
      if (BC.MIB->hasAnnotation(Instr, Name))
        BC.MIB->removeAnnotation(Instr, Name);
      // Do not add zero-count annotations
      if (Count == 0)
        return;
      BC.MIB->addAnnotation(Instr, Name, Count);
    };

    for (MCInst &Instr : *BB) {
      // Ignore pseudo instructions
      if (BC.MIB->isPseudo(Instr))
        continue;
      // Ignore jump tables
      const MCInst *LastInstr = BB->getLastNonPseudoInstr();
      if (BC.MIB->getJumpTable(*LastInstr) && LastInstr == &Instr)
        continue;

      if (BC.MIB->isIndirectCall(Instr) || BC.MIB->isIndirectBranch(Instr)) {
        auto &ICSP = BC.MIB->getOrCreateAnnotationAs<IndirectCallSiteProfile>(
            Instr, "CallProfile");
        if (!ICSP.empty()) {
          // Try to evenly distribute the counts among the call sites
          const uint64_t TotalCount = Block.Flow;
          const uint64_t NumSites = ICSP.size();
          for (uint64_t Idx = 0; Idx < ICSP.size(); Idx++) {
            IndirectCallProfile &CSP = ICSP[Idx];
            uint64_t CountPerSite = TotalCount / NumSites;
            // When counts cannot be exactly distributed, increase by 1 the
            // counts of the first (TotalCount % NumSites) call sites
            if (Idx < TotalCount % NumSites)
              CountPerSite++;
            CSP.Count = CountPerSite;
          }
        } else {
          ICSP.emplace_back(nullptr, Block.Flow, 0);
        }
      } else if (BC.MIB->getConditionalTailCall(Instr)) {
        // We don't know exactly the number of times the conditional tail call
        // is executed; conservatively, set it to the count of the block.
        setOrUpdateAnnotation(Instr, "CTCTakenCount", Block.Flow);
        BC.MIB->removeAnnotation(Instr, "CTCMispredCount");
      } else if (BC.MIB->isCall(Instr)) {
        setOrUpdateAnnotation(Instr, "Count", Block.Flow);
      }
    }
  }

  // Update the function's execution count and mark the function inferred.
  BF.setExecutionCount(Func.Blocks[0].Flow);
  BF.setHasInferredProfile(true);
}
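// Example of the even distribution above (hypothetical numbers): with
// Block.Flow = 10 and three recorded call sites, the counts become 4, 3, and
// 3; the first (10 % 3 == 1) site receives one extra unit.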
bool YAMLProfileReader::inferStaleProfile(
    BinaryFunction &BF, const yaml::bolt::BinaryFunctionProfile &YamlBF,
    const ArrayRef<ProbeMatchSpec> ProbeMatchSpecs) {
  NamedRegionTimer T("inferStaleProfile", "stale profile inference", "rewrite",
                     "Rewrite passes", opts::TimeRewrite);
  if (!BF.hasCFG())
    return false;

  LLVM_DEBUG(dbgs() << "BOLT-INFO: applying profile inference for "
                    << "\"" << BF.getPrintName() << "\"\n");

  // Make sure that block hashes are up to date.
  BF.computeBlockHashes(YamlBP.Header.HashFunction);

  const BinaryFunction::BasicBlockOrderType BlockOrder(
      BF.getLayout().block_begin(), BF.getLayout().block_end());

  // Create a wrapper flow function to use with the profile inference algorithm.
  FlowFunction Func = createFlowFunction(BlockOrder);

  // Match as many block/jump counts from the stale profile as possible;
  // MatchedBlocks tracks the number of matched blocks.
  size_t MatchedBlocks =
      matchWeights(BF.getBinaryContext(), BlockOrder, YamlBF, Func,
                   YamlBP.Header.HashFunction, IdToYamLBF, BF, ProbeMatchSpecs);

  // Adjust the flow function by marking unreachable blocks Unlikely so that
  // they don't get any counts assigned.
  preprocessUnreachableBlocks(Func);

  // Check if profile inference can be applied for the instance.
  if (!canApplyInference(Func, YamlBF, MatchedBlocks))
    return false;

  // Apply the profile inference algorithm.
  applyInference(Func);

  // Collect inferred counts and update function annotations.
  assignProfile(BF, BlockOrder, Func);

  // As of now, we always mark the binary function as having a "correct"
  // profile. In the future, we may discard the results for instances with poor
  // inference metrics and keep such functions un-optimized.
  return true;
}

} // end namespace bolt
} // end namespace llvm