//===- bolt/Passes/SplitFunctions.cpp - Pass for splitting function code --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the SplitFunctions pass.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/SplitFunctions.h"
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include "bolt/Core/FunctionLayout.h"
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Utils/CommandLineOpts.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormatVariadic.h"
#include <algorithm>
#include <iterator>
#include <memory>
#include <numeric>
#include <random>
#include <vector>

#define DEBUG_TYPE "bolt-opts"

using namespace llvm;
using namespace bolt;

namespace {
class DeprecatedSplitFunctionOptionParser : public cl::parser<bool> {
public:
  explicit DeprecatedSplitFunctionOptionParser(cl::Option &O)
      : cl::parser<bool>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, bool &Value) {
    if (Arg == "2" || Arg == "3") {
      Value = true;
      errs() << formatv("BOLT-WARNING: specifying non-boolean value \"{0}\" "
                        "for option -{1} is deprecated\n",
                        Arg, ArgName);
      return false;
    }
    return cl::parser<bool>::parse(O, ArgName, Arg, Value);
  }
};
} // namespace

namespace opts {

extern cl::OptionCategory BoltOptCategory;

extern cl::opt<bool> SplitEH;
extern cl::opt<unsigned> ExecutionCountThreshold;
extern cl::opt<uint32_t> RandomSeed;

static cl::opt<bool> AggressiveSplitting(
    "split-all-cold", cl::desc("outline as many cold basic blocks as possible"),
    cl::cat(BoltOptCategory));

static cl::opt<unsigned> SplitAlignThreshold(
    "split-align-threshold",
    cl::desc("when deciding to split a function, apply this alignment "
             "while doing the size comparison (see -split-threshold). "
             "Default value: 2."),
    cl::init(2),

    cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool, false, DeprecatedSplitFunctionOptionParser>
    SplitFunctions("split-functions",
                   cl::desc("split functions into fragments"),
                   cl::cat(BoltOptCategory));

static cl::opt<unsigned> SplitThreshold(
    "split-threshold",
    cl::desc("split function only if its main size is reduced by more than "
             "the given number of bytes. Default value: 0, i.e. split iff the "
             "size is reduced. Note that on some architectures the size can "
             "increase after splitting."),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<SplitFunctionsStrategy> SplitStrategy(
    "split-strategy", cl::init(SplitFunctionsStrategy::Profile2),
    cl::values(clEnumValN(SplitFunctionsStrategy::Profile2, "profile2",
                          "split each function into a hot and cold fragment "
                          "using profiling information")),
    cl::values(clEnumValN(SplitFunctionsStrategy::CDSplit, "cdsplit",
                          "split each function into a hot, warm, and cold "
                          "fragment using profiling information")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::Random2, "random2",
        "split each function into a hot and cold fragment at a randomly chosen "
        "split point (ignoring any available profiling information)")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::RandomN, "randomN",
        "split each function into N fragments at randomly chosen split "
        "points (ignoring any available profiling information)")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::All, "all",
        "split all basic blocks of each function into fragments such that each "
        "fragment contains exactly a single basic block")),
    cl::desc("strategy used to partition blocks into fragments"),
    cl::cat(BoltOptCategory));

static cl::opt<double> CallScale(
    "call-scale",
    cl::desc("Call score scale coefficient (when --split-strategy=cdsplit)"),
    cl::init(0.95), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<double>
    CallPower("call-power",
              cl::desc("Call score power (when --split-strategy=cdsplit)"),
              cl::init(0.05), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<double>
    JumpPower("jump-power",
              cl::desc("Jump score power (when --split-strategy=cdsplit)"),
              cl::init(0.15), cl::ReallyHidden, cl::cat(BoltOptCategory));
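
// Illustrative invocation exercising the options defined above (input/output
// names are placeholders; other llvm-bolt flags omitted):
//   llvm-bolt app -o app.bolt --split-functions --split-strategy=cdsplit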
} // namespace opts

namespace {
bool hasFullProfile(const BinaryFunction &BF) {
  return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
    return BB.getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE;
  });
}

bool allBlocksCold(const BinaryFunction &BF) {
  return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
    return BB.getExecutionCount() == 0;
  });
}

struct SplitProfile2 final : public SplitStrategy {
  bool canSplit(const BinaryFunction &BF) override {
    return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
  }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    for (BinaryBasicBlock *const BB : llvm::make_range(Start, End)) {
      if (BB->getExecutionCount() == 0)
        BB->setFragmentNum(FragmentNum::cold());
    }
  }
};

struct SplitCacheDirected final : public SplitStrategy {
  BinaryContext &BC;
  using BasicBlockOrder = BinaryFunction::BasicBlockOrderType;

  bool canSplit(const BinaryFunction &BF) override {
    return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
  }

  explicit SplitCacheDirected(BinaryContext &BC) : BC(BC) {
    initializeAuxiliaryVariables();
    buildCallGraph();
  }

  // When some functions are hot-warm split and others are hot-warm-cold split,
  // we do not want to change the fragment numbers of the blocks in the
  // hot-warm split functions.
  bool compactFragments() override { return false; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    BasicBlockOrder BlockOrder(Start, End);
    BinaryFunction &BF = *BlockOrder.front()->getFunction();
    // No need to re-split small functions.
    if (BlockOrder.size() <= 2)
      return;

    size_t BestSplitIndex = findSplitIndex(BF, BlockOrder);
    assert(BestSplitIndex < BlockOrder.size());

    // Assign fragments based on the computed best split index.
    // All basic blocks with index up to the best split index become hot.
    // All remaining blocks become warm or cold, depending on whether their
    // execution count is greater than zero.
    for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
      BinaryBasicBlock *BB = BlockOrder[Index];
      if (Index <= BestSplitIndex)
        BB->setFragmentNum(FragmentNum::main());
      else
        BB->setFragmentNum(BB->getKnownExecutionCount() > 0
                               ? FragmentNum::warm()
                               : FragmentNum::cold());
    }
  }

private:
  struct CallInfo {
    size_t Length;
    size_t Count;
  };

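  // Bookkeeping for one candidate split point; findSplitIndex() ranks
  // candidates by sum() and keeps the highest-scoring one.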
  struct SplitScore {
    size_t SplitIndex = size_t(-1);
    size_t HotSizeReduction = 0;
    double LocalScore = 0;
    double CoverCallScore = 0;

    double sum() const { return LocalScore + CoverCallScore; }
  };

  // Auxiliary variables used by the algorithm.
  size_t TotalNumBlocks{0};
  size_t OrigHotSectionSize{0};
  DenseMap<const BinaryBasicBlock *, size_t> GlobalIndices;
  DenseMap<const BinaryBasicBlock *, size_t> BBSizes;
  DenseMap<const BinaryBasicBlock *, size_t> BBOffsets;

  // Call graph.
  std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callers;
  std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callees;

  bool shouldConsiderForCallGraph(const BinaryFunction &BF) {
    // Only a subset of the functions in the binary will be considered
    // for initializing auxiliary variables and building the call graph.
    return BF.hasValidIndex() && BF.hasValidProfile() && !BF.empty();
  }

  void initializeAuxiliaryVariables() {
    for (BinaryFunction *BF : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*BF))
        continue;

      // Calculate the size of each BB after hot-cold splitting.
      // This populates BinaryBasicBlock::OutputAddressRange which
      // can be used to compute the size of each BB.
      BC.calculateEmittedSize(*BF, /*FixBranches=*/true);

      for (BinaryBasicBlock *BB : BF->getLayout().blocks()) {
        // Unique global index.
        GlobalIndices[BB] = TotalNumBlocks;
        TotalNumBlocks++;

        // Block size after hot-cold splitting.
        BBSizes[BB] = BB->getOutputSize();

        // Hot block offset after hot-cold splitting.
        BBOffsets[BB] = OrigHotSectionSize;
        if (!BB->isSplit())
          OrigHotSectionSize += BBSizes[BB];
      }
    }
  }

  void buildCallGraph() {
    Callers.resize(TotalNumBlocks);
    Callees.resize(TotalNumBlocks);
    for (const BinaryFunction *SrcFunction : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*SrcFunction))
        continue;

      for (const BinaryBasicBlock &SrcBB : SrcFunction->blocks()) {
        // Skip blocks that are not executed.
        if (SrcBB.getKnownExecutionCount() == 0)
          continue;

        // Find call instructions and extract target symbols from each one.
        for (const MCInst &Inst : SrcBB) {
          if (!BC.MIB->isCall(Inst))
            continue;

          // Call info.
          const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
          // Ignore calls w/o information.
          if (!DstSym)
            continue;

          const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
          // Ignore calls that do not have a valid target, but do not ignore
          // recursive calls, because the caller block could be moved to warm.
          if (!DstFunction || DstFunction->getLayout().block_empty())
            continue;

          const BinaryBasicBlock *DstBB = &(DstFunction->front());

          // Record the call only if DstBB is also in the set of functions
          // considered for the call graph.
          if (GlobalIndices.contains(DstBB)) {
            Callers[GlobalIndices[DstBB]].push_back(&SrcBB);
            Callees[GlobalIndices[&SrcBB]].push_back(DstBB);
          }
        }
      }
    }
  }

  /// Populate BinaryBasicBlock::OutputAddressRange with estimated basic block
  /// start and end addresses for hot and warm basic blocks, assuming hot-warm
  /// splitting happens at \p SplitIndex. Also return estimated end addresses
  /// of the hot fragment before and after splitting.
  /// The estimations take into account the potential addition of branch
  /// instructions due to split fall through branches as well as the need to
  /// use longer branch instructions for split (un)conditional branches.
  std::pair<size_t, size_t>
  estimatePostSplitBBAddress(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex) {
    assert(SplitIndex < BlockOrder.size() && "Invalid split index");

    // Update function layout assuming hot-warm splitting at SplitIndex.
    for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
      BinaryBasicBlock *BB = BlockOrder[Index];
      if (BB->getFragmentNum() == FragmentNum::cold())
        break;
      BB->setFragmentNum(Index <= SplitIndex ? FragmentNum::main()
                                             : FragmentNum::warm());
    }
    BinaryFunction *BF = BlockOrder[0]->getFunction();
    BF->getLayout().update(BlockOrder);
    // Populate BB.OutputAddressRange under the updated layout.
    BC.calculateEmittedSize(*BF);

    // Populate BB.OutputAddressRange with estimated new start and end
    // addresses and compute the old end address of the hot section and the
    // new end address of the hot section.
    size_t OldHotEndAddr{0};
    size_t NewHotEndAddr{0};
    size_t CurrentAddr = BBOffsets[BlockOrder[0]];
    for (BinaryBasicBlock *BB : BlockOrder) {
      // We only care about new addresses of blocks in hot/warm.
      if (BB->getFragmentNum() == FragmentNum::cold())
        break;
      const size_t NewSize = BB->getOutputSize();
      BB->setOutputStartAddress(CurrentAddr);
      CurrentAddr += NewSize;
      BB->setOutputEndAddress(CurrentAddr);
      if (BB->getLayoutIndex() == SplitIndex) {
        NewHotEndAddr = CurrentAddr;
        // Approximate the start address of the warm fragment of the current
        // function using the original hot section size.
        CurrentAddr = OrigHotSectionSize;
      }
      OldHotEndAddr = BBOffsets[BB] + BBSizes[BB];
    }
    return std::make_pair(OldHotEndAddr, NewHotEndAddr);
  }

  /// Get a collection of "shortenable" calls, that is, calls of type X->Y
  /// when the function order is [... X ... BF ... Y ...].
  /// If the hot fragment size of BF is reduced, then such calls are guaranteed
  /// to get shorter by the reduced hot fragment size.
  std::vector<CallInfo> extractCoverCalls(const BinaryFunction &BF) {
    // Record the length and the count of the calls that can be shortened.
    std::vector<CallInfo> CoverCalls;
    if (opts::CallScale == 0)
      return CoverCalls;

    const BinaryFunction *ThisBF = &BF;
    const BinaryBasicBlock *ThisBB = &(ThisBF->front());
    const size_t ThisGI = GlobalIndices[ThisBB];

    for (const BinaryFunction *DstBF : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*DstBF))
        continue;

      const BinaryBasicBlock *DstBB = &(DstBF->front());
      if (DstBB->getKnownExecutionCount() == 0)
        continue;

      const size_t DstGI = GlobalIndices[DstBB];
      for (const BinaryBasicBlock *SrcBB : Callers[DstGI]) {
        const BinaryFunction *SrcBF = SrcBB->getFunction();
        if (ThisBF == SrcBF)
          continue;

        const size_t CallCount = SrcBB->getKnownExecutionCount();

        const size_t SrcGI = GlobalIndices[SrcBB];

        const bool IsCoverCall = (SrcGI < ThisGI && ThisGI < DstGI) ||
                                 (DstGI <= ThisGI && ThisGI < SrcGI);
        if (!IsCoverCall)
          continue;

        const size_t SrcBBEndAddr = BBOffsets[SrcBB] + BBSizes[SrcBB];
        const size_t DstBBStartAddr = BBOffsets[DstBB];
        const size_t CallLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        const CallInfo CI{CallLength, CallCount};
        CoverCalls.emplace_back(CI);
      }
    }
    return CoverCalls;
  }

  /// Compute the edge score of a call edge.
  double computeCallScore(uint64_t CallCount, size_t CallLength) {
    // Increase call lengths by 1 to avoid raising 0 to a negative power.
    return opts::CallScale * static_cast<double>(CallCount) /
           std::pow(static_cast<double>(CallLength + 1), opts::CallPower);
  }

  /// Compute the edge score of a jump (branch) edge.
  double computeJumpScore(uint64_t JumpCount, size_t JumpLength) {
    // Increase jump lengths by 1 to avoid raising 0 to a negative power.
    return static_cast<double>(JumpCount) /
           std::pow(static_cast<double>(JumpLength + 1), opts::JumpPower);
  }
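
  // Both edge scores decay with distance as count / (length + 1)^power, so a
  // larger total score corresponds to frequently taken edges spanning shorter
  // distances, which is what the cache-directed split tries to achieve.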

  /// Compute the sum of scores over jumps within \p BlockOrder given
  /// \p SplitIndex. Increment Score.LocalScore in place by the sum.
  void computeJumpScore(const BasicBlockOrder &BlockOrder,
                        const size_t SplitIndex, SplitScore &Score) {

    for (const BinaryBasicBlock *SrcBB : BlockOrder) {
      if (SrcBB->getKnownExecutionCount() == 0)
        continue;

      const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;

      for (const auto Pair : zip(SrcBB->successors(), SrcBB->branch_info())) {
        const BinaryBasicBlock *DstBB = std::get<0>(Pair);
        const BinaryBasicBlock::BinaryBranchInfo &Branch = std::get<1>(Pair);
        const size_t JumpCount = Branch.Count;

        if (JumpCount == 0)
          continue;

        const size_t DstBBStartAddr = DstBB->getOutputAddressRange().first;
        const size_t NewJumpLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        Score.LocalScore += computeJumpScore(JumpCount, NewJumpLength);
      }
    }
  }

  /// Compute the sum of scores over calls originating in the current function
  /// given \p SplitIndex. Increment Score.LocalScore in place by the sum.
  void computeLocalCallScore(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex, SplitScore &Score) {
    if (opts::CallScale == 0)
      return;

    // Global index of the last block in the current function.
    // This is later used to determine whether a call originating in the
    // current function targets a function that comes after the current
    // function.
    const size_t LastGlobalIndex = GlobalIndices[BlockOrder.back()];

    // The length of calls originating in the input function can increase /
    // decrease depending on the splitting decision.
    for (const BinaryBasicBlock *SrcBB : BlockOrder) {
      const size_t CallCount = SrcBB->getKnownExecutionCount();
      // If SrcBB does not call any functions, skip it.
      if (CallCount == 0)
        continue;

      // Obtain an estimate on the end address of the src basic block
      // after splitting at SplitIndex.
      const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;

      for (const BinaryBasicBlock *DstBB : Callees[GlobalIndices[SrcBB]]) {
        // Obtain an estimate on the start address of the dst basic block
        // after splitting at SplitIndex. If DstBB is in a function before
        // the current function, then its start address remains unchanged.
        size_t DstBBStartAddr = BBOffsets[DstBB];
        // If DstBB is in a function after the current function, then its
        // start address should be adjusted based on the reduction in hot size.
        if (GlobalIndices[DstBB] > LastGlobalIndex) {
          assert(DstBBStartAddr >= Score.HotSizeReduction);
          DstBBStartAddr -= Score.HotSizeReduction;
        }
        const size_t NewCallLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        Score.LocalScore += computeCallScore(CallCount, NewCallLength);
      }
    }
  }

  /// Compute the sum of splitting scores for cover calls of the input
  /// function. Increment Score.CoverCallScore in place by the sum.
  void computeCoverCallScore(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex,
                             const std::vector<CallInfo> &CoverCalls,
                             SplitScore &Score) {
    if (opts::CallScale == 0)
      return;

    for (const CallInfo CI : CoverCalls) {
      assert(CI.Length >= Score.HotSizeReduction &&
             "Length of cover calls must exceed reduced size of hot fragment.");
      // Compute the new length of the call, which is shorter than the original
      // one by the size of the split fragment minus the total size increase.
      const size_t NewCallLength = CI.Length - Score.HotSizeReduction;
      Score.CoverCallScore += computeCallScore(CI.Count, NewCallLength);
    }
  }

  /// Compute the split score of splitting a function at a given index.
  /// The split score consists of local score and cover score. This function
  /// returns a SplitScore object containing the local score and cover score
  /// of the current splitting index. For easier bookkeeping and comparison,
  /// it also stores the split index and the resulting reduction in hot
  /// fragment size.
  SplitScore computeSplitScore(const BinaryFunction &BF,
                               const BasicBlockOrder &BlockOrder,
                               const size_t SplitIndex,
                               const std::vector<CallInfo> &CoverCalls) {
    // Populate BinaryBasicBlock::OutputAddressRange with estimated
    // new start and end addresses after hot-warm splitting at SplitIndex.
    size_t OldHotEnd;
    size_t NewHotEnd;
    std::tie(OldHotEnd, NewHotEnd) =
        estimatePostSplitBBAddress(BlockOrder, SplitIndex);

    SplitScore Score;
    Score.SplitIndex = SplitIndex;

    // It's not worth splitting if OldHotEnd < NewHotEnd.
    if (OldHotEnd < NewHotEnd)
      return Score;

    // Hot fragment size reduction due to splitting.
    Score.HotSizeReduction = OldHotEnd - NewHotEnd;

    // The first part of LocalScore is the sum over call edges originating in
    // the input function. These edges can get shorter or longer depending on
    // SplitIndex. Score.LocalScore is incremented in place.
    computeLocalCallScore(BlockOrder, SplitIndex, Score);

    // The second part of LocalScore is the sum over jump edges with both src
    // and dst basic blocks in the current function. Score.LocalScore is
    // incremented in place.
    computeJumpScore(BlockOrder, SplitIndex, Score);

    // Compute CoverCallScore and store it in Score in place.
    computeCoverCallScore(BlockOrder, SplitIndex, CoverCalls, Score);
    return Score;
  }

  /// Find the most likely successor of a basic block when it has one or two
  /// successors. Return nullptr otherwise.
  const BinaryBasicBlock *getMostLikelySuccessor(const BinaryBasicBlock *BB) {
    if (BB->succ_size() == 1)
      return BB->getSuccessor();
    if (BB->succ_size() == 2) {
      uint64_t TakenCount = BB->getTakenBranchInfo().Count;
      assert(TakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
      uint64_t NonTakenCount = BB->getFallthroughBranchInfo().Count;
      assert(NonTakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
      if (TakenCount > NonTakenCount)
        return BB->getConditionalSuccessor(true);
      else if (TakenCount < NonTakenCount)
        return BB->getConditionalSuccessor(false);
    }
    return nullptr;
  }

  /// Find the best index for splitting. The returned value is the index of the
  /// last hot basic block. Hence, "no splitting" is equivalent to returning a
  /// value which is one less than the size of the function.
  size_t findSplitIndex(const BinaryFunction &BF,
                        const BasicBlockOrder &BlockOrder) {
    assert(BlockOrder.size() > 2);
    // Find all function calls that can be shortened if we move blocks of the
    // current function to warm/cold.
    const std::vector<CallInfo> CoverCalls = extractCoverCalls(BF);

    // Find the existing hot-cold splitting index.
    size_t HotColdIndex = 0;
    while (HotColdIndex + 1 < BlockOrder.size()) {
      if (BlockOrder[HotColdIndex + 1]->getFragmentNum() == FragmentNum::cold())
        break;
      HotColdIndex++;
    }
    assert(HotColdIndex + 1 == BlockOrder.size() ||
           (BlockOrder[HotColdIndex]->getFragmentNum() == FragmentNum::main() &&
            BlockOrder[HotColdIndex + 1]->getFragmentNum() ==
                FragmentNum::cold()));

    // Try all possible split indices up to HotColdIndex (blocks with
    // Index <= SplitIndex stay hot) and find the one maximizing the
    // splitting score.
    SplitScore BestScore;
    for (size_t Index = 0; Index <= HotColdIndex; Index++) {
      const BinaryBasicBlock *LastHotBB = BlockOrder[Index];
      assert(LastHotBB->getFragmentNum() != FragmentNum::cold());

      // Do not break the jump to the most likely successor.
      if (Index + 1 < BlockOrder.size() &&
          BlockOrder[Index + 1] == getMostLikelySuccessor(LastHotBB))
        continue;

      const SplitScore Score =
          computeSplitScore(BF, BlockOrder, Index, CoverCalls);
      if (Score.sum() > BestScore.sum())
        BestScore = Score;
    }

    // If we don't find a good splitting point, fall back to the original one.
    if (BestScore.SplitIndex == size_t(-1))
      return HotColdIndex;

    return BestScore.SplitIndex;
  }
};

struct SplitRandom2 final : public SplitStrategy {
  std::minstd_rand0 Gen;

  SplitRandom2() : Gen(opts::RandomSeed.getValue()) {}

  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
    const DiffT NumBlocks = End - Start;
    assert(NumBlocks > 0 && "Cannot fragment empty function");

    // We want to split at least one block.
    const auto LastSplitPoint = std::max<DiffT>(NumBlocks - 1, 1);
    std::uniform_int_distribution<DiffT> Dist(1, LastSplitPoint);
    const DiffT SplitPoint = Dist(Gen);
    for (BinaryBasicBlock *BB : llvm::make_range(Start + SplitPoint, End))
      BB->setFragmentNum(FragmentNum::cold());

    LLVM_DEBUG(dbgs() << formatv("BOLT-DEBUG: randomly chose last {0} (out of "
                                 "{1} possible) blocks to split\n",
                                 NumBlocks - SplitPoint, End - Start));
  }
};

struct SplitRandomN final : public SplitStrategy {
  std::minstd_rand0 Gen;

  SplitRandomN() : Gen(opts::RandomSeed.getValue()) {}

  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
    const DiffT NumBlocks = End - Start;
    assert(NumBlocks > 0 && "Cannot fragment empty function");

    // With n blocks, there are n-1 places to split them.
    const DiffT MaximumSplits = NumBlocks - 1;
    // We want to generate at least two fragments if possible, but if there is
    // only one block, no splits are possible.
    const auto MinimumSplits = std::min<DiffT>(MaximumSplits, 1);
    std::uniform_int_distribution<DiffT> Dist(MinimumSplits, MaximumSplits);
    // Choose how many splits to perform.
    const DiffT NumSplits = Dist(Gen);

    // Draw split points from a lottery.
    SmallVector<unsigned, 0> Lottery(MaximumSplits);
    // Start the lottery at 1, because there is no meaningful split point
    // before the first block.
    std::iota(Lottery.begin(), Lottery.end(), 1u);
    std::shuffle(Lottery.begin(), Lottery.end(), Gen);
    Lottery.resize(NumSplits);
    llvm::sort(Lottery);

    // Add a one-past-the-end entry to the lottery.
    Lottery.push_back(NumBlocks);

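    // Example: with NumBlocks = 5 the lottery is a shuffle of {1, 2, 3, 4};
    // if NumSplits = 2 and the sorted draw is {2, 4}, then blocks 0-1 land in
    // fragment 0, blocks 2-3 in fragment 1, and block 4 in fragment 2.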
    unsigned LotteryIndex = 0;
    unsigned BBPos = 0;
    for (BinaryBasicBlock *const BB : make_range(Start, End)) {
      // Check whether to start a new fragment.
      if (BBPos >= Lottery[LotteryIndex])
        ++LotteryIndex;

      // Because LotteryIndex is 0-based and cold fragments are 1-based, we can
      // use the index to assign fragments.
      BB->setFragmentNum(FragmentNum(LotteryIndex));

      ++BBPos;
    }
  }
};

struct SplitAll final : public SplitStrategy {
  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override {
    // Keeping empty fragments allows us to test that empty fragments do not
    // generate symbols.
    return false;
  }

  void fragment(const BlockIt Start, const BlockIt End) override {
    unsigned Fragment = 0;
    for (BinaryBasicBlock *const BB : llvm::make_range(Start, End))
      BB->setFragmentNum(FragmentNum(Fragment++));
  }
};
} // namespace

namespace llvm {
namespace bolt {

bool SplitFunctions::shouldOptimize(const BinaryFunction &BF) const {
  // Apply execution count threshold.
  if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
    return false;

  return BinaryFunctionPass::shouldOptimize(BF);
}

Error SplitFunctions::runOnFunctions(BinaryContext &BC) {
  if (!opts::SplitFunctions)
    return Error::success();

  if (BC.IsLinuxKernel && BC.BOLTReserved.empty()) {
    BC.errs() << "BOLT-ERROR: split functions require reserved space in the "
                 "Linux kernel binary\n";
    exit(1);
  }

  // If the split strategy is not CDSplit, then a second run of the pass is not
  // needed after function reordering.
  if (BC.HasFinalizedFunctionOrder &&
      opts::SplitStrategy != SplitFunctionsStrategy::CDSplit)
    return Error::success();

  std::unique_ptr<SplitStrategy> Strategy;
  bool ForceSequential = false;

  switch (opts::SplitStrategy) {
  case SplitFunctionsStrategy::CDSplit:
    // CDSplit runs two splitting passes: hot-cold splitting (SplitProfile2)
    // before function reordering and hot-warm-cold splitting
    // (SplitCacheDirected) after function reordering.
    if (BC.HasFinalizedFunctionOrder)
      Strategy = std::make_unique<SplitCacheDirected>(BC);
    else
      Strategy = std::make_unique<SplitProfile2>();
    opts::AggressiveSplitting = true;
    BC.HasWarmSection = true;
    break;
  case SplitFunctionsStrategy::Profile2:
    Strategy = std::make_unique<SplitProfile2>();
    break;
  case SplitFunctionsStrategy::Random2:
    Strategy = std::make_unique<SplitRandom2>();
    // If we split functions randomly, we need to ensure that across runs with
    // the same input, we generate random numbers for each function in the same
    // order.
    ForceSequential = true;
    break;
  case SplitFunctionsStrategy::RandomN:
    Strategy = std::make_unique<SplitRandomN>();
    ForceSequential = true;
    break;
  case SplitFunctionsStrategy::All:
    Strategy = std::make_unique<SplitAll>();
    break;
  }

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
      [&](BinaryFunction &BF) { splitFunction(BF, *Strategy); }, SkipFunc,
      "SplitFunctions", ForceSequential);

  if (SplitBytesHot + SplitBytesCold > 0)
    BC.outs() << "BOLT-INFO: splitting separates " << SplitBytesHot
              << " hot bytes from " << SplitBytesCold << " cold bytes "
              << format("(%.2lf%% of split functions is hot).\n",
                        100.0 * SplitBytesHot /
                            (SplitBytesHot + SplitBytesCold));
  return Error::success();
}

void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy &S) {
  if (BF.empty())
    return;

  if (!S.canSplit(BF))
    return;

  FunctionLayout &Layout = BF.getLayout();
  BinaryFunction::BasicBlockOrderType PreSplitLayout(Layout.block_begin(),
                                                     Layout.block_end());

  BinaryContext &BC = BF.getBinaryContext();
  size_t OriginalHotSize;
  size_t HotSize;
  size_t ColdSize;
  if (BC.isX86()) {
    std::tie(OriginalHotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " pre-split is <0x"
                      << Twine::utohexstr(OriginalHotSize) << ", 0x"
                      << Twine::utohexstr(ColdSize) << ">\n");
  }

  BinaryFunction::BasicBlockOrderType NewLayout(Layout.block_begin(),
                                                Layout.block_end());
  // Never outline the first basic block.
  NewLayout.front()->setCanOutline(false);
  for (BinaryBasicBlock *const BB : NewLayout) {
    if (!BB->canOutline())
      continue;

    // Do not split extra entry points on AArch64. They can be referenced via
    // ADR instructions, and when this happens, these blocks cannot be placed
    // far away due to the limited range of the ADR instruction.
    if (BC.isAArch64() && BB->isEntryPoint()) {
      BB->setCanOutline(false);
      continue;
    }

    if (BF.hasEHRanges() && !opts::SplitEH) {
      // We cannot move landing pads (or rather entry points for landing pads).
      if (BB->isLandingPad()) {
        BB->setCanOutline(false);
        continue;
      }
      // We cannot move a block that can throw since the exception-handling
      // runtime cannot deal with split functions. However, if we can guarantee
      // that the block never throws, it is safe to move the block to
      // decrease the size of the function.
      for (MCInst &Instr : *BB) {
        if (BC.MIB->isInvoke(Instr)) {
          BB->setCanOutline(false);
          break;
        }
      }
    }

    // Outlining blocks with dynamic branches is not supported yet.
    if (BC.IsLinuxKernel) {
      if (llvm::any_of(
              *BB, [&](MCInst &Inst) { return BC.MIB->isDynamicBranch(Inst); }))
        BB->setCanOutline(false);
    }
  }

  BF.getLayout().updateLayoutIndices();
  S.fragment(NewLayout.begin(), NewLayout.end());

  // Make sure all non-outlineable blocks are in the main fragment.
  for (BinaryBasicBlock *const BB : NewLayout) {
    if (!BB->canOutline())
      BB->setFragmentNum(FragmentNum::main());
  }

  if (opts::AggressiveSplitting) {
    // All blocks with zero count that we can move go to the end of the
    // function, even if they were natural to cluster formation and were seen
    // in between hot basic blocks.
    llvm::stable_sort(NewLayout, [&](const BinaryBasicBlock *const A,
                                     const BinaryBasicBlock *const B) {
      return A->getFragmentNum() < B->getFragmentNum();
    });
  } else if (BF.hasEHRanges() && !opts::SplitEH) {
    // Typically, functions with exception handling have landing pads at the
    // end. We cannot move the beginnings of landing pads, but we can move
    // zero-count blocks comprising landing pads to the end and thus facilitate
    // splitting.
    auto FirstLP = NewLayout.begin();
    while ((*FirstLP)->isLandingPad())
      ++FirstLP;

    std::stable_sort(FirstLP, NewLayout.end(),
                     [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
                       return A->getFragmentNum() < B->getFragmentNum();
                     });
  }

  // Make sure that fragment numbers are non-decreasing in layout order.
  FragmentNum CurrentFragment = NewLayout.back()->getFragmentNum();
  for (BinaryBasicBlock *const BB : reverse(NewLayout)) {
    if (BB->getFragmentNum() > CurrentFragment)
      BB->setFragmentNum(CurrentFragment);
    CurrentFragment = BB->getFragmentNum();
  }

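  // With compaction, fragment numbers are renumbered to be contiguous, e.g.
  // fragments {0, 2, 5} become {0, 1, 2}.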
  if (S.compactFragments()) {
    FragmentNum CurrentFragment = FragmentNum::main();
    FragmentNum NewFragment = FragmentNum::main();
    for (BinaryBasicBlock *const BB : NewLayout) {
      if (BB->getFragmentNum() > CurrentFragment) {
        CurrentFragment = BB->getFragmentNum();
        NewFragment = FragmentNum(NewFragment.get() + 1);
      }
      BB->setFragmentNum(NewFragment);
    }
  }

  const bool LayoutUpdated = BF.getLayout().update(NewLayout);

  // For shared objects, invoke instructions and corresponding landing pads
  // have to be placed in the same fragment. When we split them, create
  // trampoline landing pads that will redirect the execution to real LPs.
  TrampolineSetType Trampolines;
  if (!BC.HasFixedLoadAddress && BF.hasEHRanges() && BF.isSplit())
    Trampolines = createEHTrampolines(BF);

  // Check the new size to see if it's worth splitting the function.
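  // The split is kept only if the aligned hot-fragment size shrinks by more
  // than opts::SplitThreshold bytes; otherwise it is reverted below.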
  if (BC.isX86() && LayoutUpdated) {
    std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " post-split is <0x" << Twine::utohexstr(HotSize)
                      << ", 0x" << Twine::utohexstr(ColdSize) << ">\n");
    if (alignTo(OriginalHotSize, opts::SplitAlignThreshold) <=
        alignTo(HotSize, opts::SplitAlignThreshold) + opts::SplitThreshold) {
      if (opts::Verbosity >= 2) {
        BC.outs() << "BOLT-INFO: Reversing splitting of function "
                  << formatv("{0}:\n {1:x}, {2:x} -> {3:x}\n", BF, HotSize,
                             ColdSize, OriginalHotSize);
      }

      // Reverse the action of createEHTrampolines(). The trampolines will be
      // placed immediately before the matching destination, resulting in no
      // extra code.
      if (PreSplitLayout.size() != BF.size())
        PreSplitLayout = mergeEHTrampolines(BF, PreSplitLayout, Trampolines);

      for (BinaryBasicBlock &BB : BF)
        BB.setFragmentNum(FragmentNum::main());
      BF.getLayout().update(PreSplitLayout);
    } else {
      SplitBytesHot += HotSize;
      SplitBytesCold += ColdSize;
    }
  }

  // Fix branches if the splitting decision of the pass after function
  // reordering is different from that of the pass before function reordering.
  if (LayoutUpdated && BC.HasFinalizedFunctionOrder)
    BF.fixBranches();
}

SplitFunctions::TrampolineSetType
SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
  const auto &MIB = BF.getBinaryContext().MIB;

  // Map real landing pads to the corresponding trampolines.
  TrampolineSetType LPTrampolines;

  // Iterate over a copy of the basic blocks since we are adding new blocks to
  // the function, which will invalidate its iterators.
  std::vector<BinaryBasicBlock *> Blocks(BF.pbegin(), BF.pend());
  for (BinaryBasicBlock *BB : Blocks) {
    for (MCInst &Instr : *BB) {
      const std::optional<MCPlus::MCLandingPad> EHInfo = MIB->getEHInfo(Instr);
      if (!EHInfo || !EHInfo->first)
        continue;

      const MCSymbol *LPLabel = EHInfo->first;
      BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(LPLabel);
      if (BB->getFragmentNum() == LPBlock->getFragmentNum())
        continue;

      const MCSymbol *TrampolineLabel = nullptr;
      const TrampolineKey Key(BB->getFragmentNum(), LPLabel);
      auto Iter = LPTrampolines.find(Key);
      if (Iter != LPTrampolines.end()) {
        TrampolineLabel = Iter->second;
      } else {
        // Create a trampoline basic block in the same fragment as the thrower.
        // Note: there's no need to insert the jump instruction, it will be
        // added by fixBranches().
        BinaryBasicBlock *TrampolineBB = BF.addBasicBlock();
        TrampolineBB->setFragmentNum(BB->getFragmentNum());
        TrampolineBB->setExecutionCount(LPBlock->getExecutionCount());
        TrampolineBB->addSuccessor(LPBlock, TrampolineBB->getExecutionCount());
        TrampolineBB->setCFIState(LPBlock->getCFIState());
        TrampolineLabel = TrampolineBB->getLabel();
        LPTrampolines.insert(std::make_pair(Key, TrampolineLabel));
      }

      // Substitute the landing pad with the trampoline.
      MIB->updateEHInfo(Instr,
                        MCPlus::MCLandingPad(TrampolineLabel, EHInfo->second));
    }
  }

  if (LPTrampolines.empty())
    return LPTrampolines;

  // All trampoline blocks were added to the end of the function. Place them at
  // the end of corresponding fragments.
  BinaryFunction::BasicBlockOrderType NewLayout(BF.getLayout().block_begin(),
                                                BF.getLayout().block_end());
  stable_sort(NewLayout, [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
    return A->getFragmentNum() < B->getFragmentNum();
  });
  BF.getLayout().update(NewLayout);

  // Conservatively introduce branch instructions.
  BF.fixBranches();

  // Update the exception-handling CFG for the function.
  BF.recomputeLandingPads();

  return LPTrampolines;
}

SplitFunctions::BasicBlockOrderType SplitFunctions::mergeEHTrampolines(
    BinaryFunction &BF, SplitFunctions::BasicBlockOrderType &Layout,
    const SplitFunctions::TrampolineSetType &Trampolines) const {
  DenseMap<const MCSymbol *, SmallVector<const MCSymbol *, 0>>
      IncomingTrampolines;
  for (const auto &Entry : Trampolines) {
    IncomingTrampolines[Entry.getFirst().Target].emplace_back(
        Entry.getSecond());
  }

  BasicBlockOrderType MergedLayout;
  for (BinaryBasicBlock *BB : Layout) {
    auto Iter = IncomingTrampolines.find(BB->getLabel());
    if (Iter != IncomingTrampolines.end()) {
      for (const MCSymbol *const Trampoline : Iter->getSecond()) {
        BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(Trampoline);
        assert(LPBlock && "Could not find matching landing pad block.");
        MergedLayout.push_back(LPBlock);
      }
    }
    MergedLayout.push_back(BB);
  }

  return MergedLayout;
}

} // namespace bolt
} // namespace llvm