//===- bolt/Passes/SplitFunctions.cpp - Pass for splitting function code --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the SplitFunctions pass.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/SplitFunctions.h"
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include "bolt/Core/FunctionLayout.h"
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Utils/CommandLineOpts.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormatVariadic.h"
#include <algorithm>
#include <iterator>
#include <memory>
#include <numeric>
#include <random>
#include <vector>

#define DEBUG_TYPE "bolt-opts"

using namespace llvm;
using namespace bolt;

namespace {
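// Parses the deprecated numeric forms of -split-functions: the legacy values
// "2" and "3" are accepted as "true" and trigger a deprecation warning; any
// other value falls through to the regular boolean parser.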
class DeprecatedSplitFunctionOptionParser : public cl::parser<bool> {
public:
  explicit DeprecatedSplitFunctionOptionParser(cl::Option &O)
      : cl::parser<bool>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, bool &Value) {
    if (Arg == "2" || Arg == "3") {
      Value = true;
      errs() << formatv("BOLT-WARNING: specifying non-boolean value \"{0}\" "
                        "for option -{1} is deprecated\n",
                        Arg, ArgName);
      return false;
    }
    return cl::parser<bool>::parse(O, ArgName, Arg, Value);
  }
};
} // namespace

namespace opts {

extern cl::OptionCategory BoltOptCategory;

extern cl::opt<bool> SplitEH;
extern cl::opt<unsigned> ExecutionCountThreshold;
extern cl::opt<uint32_t> RandomSeed;

static cl::opt<bool> AggressiveSplitting(
    "split-all-cold", cl::desc("outline as many cold basic blocks as possible"),
    cl::cat(BoltOptCategory));

static cl::opt<unsigned> SplitAlignThreshold(
    "split-align-threshold",
    cl::desc("when deciding to split a function, apply this alignment "
             "while doing the size comparison (see -split-threshold). "
             "Default value: 2."),
    cl::init(2),
    cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool, false, DeprecatedSplitFunctionOptionParser>
    SplitFunctions("split-functions",
                   cl::desc("split functions into fragments"),
                   cl::cat(BoltOptCategory));

static cl::opt<unsigned> SplitThreshold(
    "split-threshold",
    cl::desc("split function only if its main size is reduced by more than "
             "the given number of bytes. Default value: 0, i.e. split iff the "
             "size is reduced. Note that on some architectures the size can "
             "increase after splitting."),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<SplitFunctionsStrategy> SplitStrategy(
    "split-strategy", cl::init(SplitFunctionsStrategy::Profile2),
    cl::values(clEnumValN(SplitFunctionsStrategy::Profile2, "profile2",
                          "split each function into a hot and cold fragment "
                          "using profiling information")),
    cl::values(clEnumValN(SplitFunctionsStrategy::CDSplit, "cdsplit",
                          "split each function into a hot, warm, and cold "
                          "fragment using profiling information")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::Random2, "random2",
        "split each function into a hot and cold fragment at a randomly chosen "
        "split point (ignoring any available profiling information)")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::RandomN, "randomN",
        "split each function into N fragments at randomly chosen split "
        "points (ignoring any available profiling information)")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::All, "all",
        "split all basic blocks of each function into fragments such that each "
        "fragment contains exactly a single basic block")),
    cl::desc("strategy used to partition blocks into fragments"),
    cl::cat(BoltOptCategory));

static cl::opt<double> CallScale(
    "call-scale",
    cl::desc("Call score scale coefficient (when --split-strategy=cdsplit)"),
    cl::init(0.95), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<double>
    CallPower("call-power",
              cl::desc("Call score power (when --split-strategy=cdsplit)"),
              cl::init(0.05), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<double>
    JumpPower("jump-power",
              cl::desc("Jump score power (when --split-strategy=cdsplit)"),
              cl::init(0.15), cl::ReallyHidden, cl::cat(BoltOptCategory));
} // namespace opts

namespace {
bool hasFullProfile(const BinaryFunction &BF) {
  return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
    return BB.getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE;
  });
}

bool allBlocksCold(const BinaryFunction &BF) {
  return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
    return BB.getExecutionCount() == 0;
  });
}

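// Profile2 strategy: split each function into a hot and a cold fragment,
// moving every block with a zero execution count to the cold fragment.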
struct SplitProfile2 final : public SplitStrategy {
  bool canSplit(const BinaryFunction &BF) override {
    return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
  }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    for (BinaryBasicBlock *const BB : llvm::make_range(Start, End)) {
      if (BB->getExecutionCount() == 0)
        BB->setFragmentNum(FragmentNum::cold());
    }
  }
};

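// Cache-directed splitting (CDSplit): chooses a hot-warm split point by
// estimating post-split block addresses and maximizing a locality score over
// call and jump edges; zero-count blocks stay in the cold fragment.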
struct SplitCacheDirected final : public SplitStrategy {
  BinaryContext &BC;
  using BasicBlockOrder = BinaryFunction::BasicBlockOrderType;

  bool canSplit(const BinaryFunction &BF) override {
    return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
  }

  explicit SplitCacheDirected(BinaryContext &BC) : BC(BC) {
    initializeAuxiliaryVariables();
    buildCallGraph();
  }

  // When some functions are hot-warm split and others are hot-warm-cold split,
  // we do not want to change the fragment numbers of the blocks in the hot-warm
  // split functions.
  bool compactFragments() override { return false; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    BasicBlockOrder BlockOrder(Start, End);
    BinaryFunction &BF = *BlockOrder.front()->getFunction();
    // No need to re-split small functions.
    if (BlockOrder.size() <= 2)
      return;

    size_t BestSplitIndex = findSplitIndex(BF, BlockOrder);
    assert(BestSplitIndex < BlockOrder.size());

    // Assign fragments based on the computed best split index.
    // All basic blocks with index up to the best split index become hot.
    // All remaining blocks are warm or cold depending on whether their
    // execution count is greater than zero.
    for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
      BinaryBasicBlock *BB = BlockOrder[Index];
      if (Index <= BestSplitIndex)
        BB->setFragmentNum(FragmentNum::main());
      else
        BB->setFragmentNum(BB->getKnownExecutionCount() > 0
                               ? FragmentNum::warm()
                               : FragmentNum::cold());
    }
  }

private:
  struct CallInfo {
    size_t Length;
    size_t Count;
  };

  struct SplitScore {
    size_t SplitIndex = size_t(-1);
    size_t HotSizeReduction = 0;
    double LocalScore = 0;
    double CoverCallScore = 0;

    double sum() const { return LocalScore + CoverCallScore; }
  };

  // Auxiliary variables used by the algorithm.
  size_t TotalNumBlocks{0};
  size_t OrigHotSectionSize{0};
  DenseMap<const BinaryBasicBlock *, size_t> GlobalIndices;
  DenseMap<const BinaryBasicBlock *, size_t> BBSizes;
  DenseMap<const BinaryBasicBlock *, size_t> BBOffsets;

  // Call graph.
  std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callers;
  std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callees;

  bool shouldConsiderForCallGraph(const BinaryFunction &BF) {
    // Only a subset of the functions in the binary will be considered
    // for initializing auxiliary variables and building the call graph.
    return BF.hasValidIndex() && BF.hasValidProfile() && !BF.empty();
  }

  void initializeAuxiliaryVariables() {
    for (BinaryFunction *BF : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*BF))
        continue;

      // Calculate the size of each BB after hot-cold splitting.
      // This populates BinaryBasicBlock::OutputAddressRange which
      // can be used to compute the size of each BB.
      BC.calculateEmittedSize(*BF, /*FixBranches=*/true);

      for (BinaryBasicBlock *BB : BF->getLayout().blocks()) {
        // Unique global index.
        GlobalIndices[BB] = TotalNumBlocks;
        TotalNumBlocks++;

        // Block size after hot-cold splitting.
        BBSizes[BB] = BB->getOutputSize();

        // Hot block offset after hot-cold splitting.
        BBOffsets[BB] = OrigHotSectionSize;
        if (!BB->isSplit())
          OrigHotSectionSize += BBSizes[BB];
      }
    }
  }

  void buildCallGraph() {
    Callers.resize(TotalNumBlocks);
    Callees.resize(TotalNumBlocks);
    for (const BinaryFunction *SrcFunction : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*SrcFunction))
        continue;

      for (BinaryBasicBlock &SrcBB : SrcFunction->blocks()) {
        // Skip blocks that are not executed.
        if (SrcBB.getKnownExecutionCount() == 0)
          continue;

        // Find call instructions and extract target symbols from each one.
        for (const MCInst &Inst : SrcBB) {
          if (!BC.MIB->isCall(Inst))
            continue;

          // Call info.
          const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
          // Ignore calls w/o information.
          if (!DstSym)
            continue;

          const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
          // Ignore calls that do not have a valid target, but do not ignore
          // recursive calls, because the caller block could be moved to warm.
          if (!DstFunction || DstFunction->getLayout().block_empty())
            continue;

          const BinaryBasicBlock *DstBB = &(DstFunction->front());

          // Record the call only if DstBB is also in the functions considered
          // for the call graph.
          if (GlobalIndices.contains(DstBB)) {
            Callers[GlobalIndices[DstBB]].push_back(&SrcBB);
            Callees[GlobalIndices[&SrcBB]].push_back(DstBB);
          }
        }
      }
    }
  }

  /// Populate BinaryBasicBlock::OutputAddressRange with estimated basic block
  /// start and end addresses for hot and warm basic blocks, assuming hot-warm
  /// splitting happens at \p SplitIndex. Also return estimated end addresses
  /// of the hot fragment before and after splitting.
  /// The estimations take into account the potential addition of branch
  /// instructions due to split fall-through branches as well as the need to
  /// use longer branch instructions for split (un)conditional branches.
  std::pair<size_t, size_t>
  estimatePostSplitBBAddress(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex) {
    assert(SplitIndex < BlockOrder.size() && "Invalid split index");

    // Update function layout assuming hot-warm splitting at SplitIndex.
    for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
      BinaryBasicBlock *BB = BlockOrder[Index];
      if (BB->getFragmentNum() == FragmentNum::cold())
        break;
      BB->setFragmentNum(Index <= SplitIndex ? FragmentNum::main()
                                             : FragmentNum::warm());
    }
    BinaryFunction *BF = BlockOrder[0]->getFunction();
    BF->getLayout().update(BlockOrder);
    // Populate BB.OutputAddressRange under the updated layout.
    BC.calculateEmittedSize(*BF);

    // Populate BB.OutputAddressRange with estimated new start and end addresses
    // and compute the old end address of the hot section and the new end
    // address of the hot section.
    size_t OldHotEndAddr{0};
    size_t NewHotEndAddr{0};
    size_t CurrentAddr = BBOffsets[BlockOrder[0]];
    for (BinaryBasicBlock *BB : BlockOrder) {
      // We only care about new addresses of blocks in hot/warm.
      if (BB->getFragmentNum() == FragmentNum::cold())
        break;
      const size_t NewSize = BB->getOutputSize();
      BB->setOutputStartAddress(CurrentAddr);
      CurrentAddr += NewSize;
      BB->setOutputEndAddress(CurrentAddr);
      if (BB->getLayoutIndex() == SplitIndex) {
        NewHotEndAddr = CurrentAddr;
        // Approximate the start address of the warm fragment of the current
        // function using the original hot section size.
        CurrentAddr = OrigHotSectionSize;
      }
      OldHotEndAddr = BBOffsets[BB] + BBSizes[BB];
    }
    return std::make_pair(OldHotEndAddr, NewHotEndAddr);
  }

  /// Get a collection of "shortenable" calls, that is, calls of type X->Y
  /// when the function order is [... X ... BF ... Y ...].
  /// If the hot fragment size of BF is reduced, then such calls are guaranteed
  /// to get shorter by the reduced hot fragment size.
  std::vector<CallInfo> extractCoverCalls(const BinaryFunction &BF) {
    // Record the length and the count of the calls that can be shortened.
    std::vector<CallInfo> CoverCalls;
    if (opts::CallScale == 0)
      return CoverCalls;

    const BinaryFunction *ThisBF = &BF;
    const BinaryBasicBlock *ThisBB = &(ThisBF->front());
    const size_t ThisGI = GlobalIndices[ThisBB];

    for (const BinaryFunction *DstBF : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*DstBF))
        continue;

      const BinaryBasicBlock *DstBB = &(DstBF->front());
      if (DstBB->getKnownExecutionCount() == 0)
        continue;

      const size_t DstGI = GlobalIndices[DstBB];
      for (const BinaryBasicBlock *SrcBB : Callers[DstGI]) {
        const BinaryFunction *SrcBF = SrcBB->getFunction();
        if (ThisBF == SrcBF)
          continue;

        const size_t CallCount = SrcBB->getKnownExecutionCount();

        const size_t SrcGI = GlobalIndices[SrcBB];

        const bool IsCoverCall = (SrcGI < ThisGI && ThisGI < DstGI) ||
                                 (DstGI <= ThisGI && ThisGI < SrcGI);
        if (!IsCoverCall)
          continue;

        const size_t SrcBBEndAddr = BBOffsets[SrcBB] + BBSizes[SrcBB];
        const size_t DstBBStartAddr = BBOffsets[DstBB];
        const size_t CallLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        const CallInfo CI{CallLength, CallCount};
        CoverCalls.emplace_back(CI);
      }
    }
    return CoverCalls;
  }

  /// Compute the edge score of a call edge.
  double computeCallScore(uint64_t CallCount, size_t CallLength) {
    // Increase call lengths by 1 to avoid raising 0 to a negative power.
    return opts::CallScale * static_cast<double>(CallCount) /
           std::pow(static_cast<double>(CallLength + 1), opts::CallPower);
  }

  /// Compute the edge score of a jump (branch) edge.
  double computeJumpScore(uint64_t JumpCount, size_t JumpLength) {
    // Increase jump lengths by 1 to avoid raising 0 to a negative power.
    return static_cast<double>(JumpCount) /
           std::pow(static_cast<double>(JumpLength + 1), opts::JumpPower);
  }

  /// Compute the sum of scores over jumps within \p BlockOrder given
  /// \p SplitIndex. Increment Score.LocalScore in place by the sum.
  void computeJumpScore(const BasicBlockOrder &BlockOrder,
                        const size_t SplitIndex, SplitScore &Score) {

    for (const BinaryBasicBlock *SrcBB : BlockOrder) {
      if (SrcBB->getKnownExecutionCount() == 0)
        continue;

      const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;

      for (const auto Pair : zip(SrcBB->successors(), SrcBB->branch_info())) {
        const BinaryBasicBlock *DstBB = std::get<0>(Pair);
        const BinaryBasicBlock::BinaryBranchInfo &Branch = std::get<1>(Pair);
        const size_t JumpCount = Branch.Count;

        if (JumpCount == 0)
          continue;

        const size_t DstBBStartAddr = DstBB->getOutputAddressRange().first;
        const size_t NewJumpLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        Score.LocalScore += computeJumpScore(JumpCount, NewJumpLength);
      }
    }
  }

  /// Compute the sum of scores over calls originating in the current function
  /// given \p SplitIndex. Increment Score.LocalScore in place by the sum.
  void computeLocalCallScore(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex, SplitScore &Score) {
    if (opts::CallScale == 0)
      return;

    // Global index of the last block in the current function.
    // This is later used to determine whether a call originating in the
    // current function targets a function that comes after the current
    // function.
    const size_t LastGlobalIndex = GlobalIndices[BlockOrder.back()];

    // The length of calls originating in the input function can increase /
    // decrease depending on the splitting decision.
    for (const BinaryBasicBlock *SrcBB : BlockOrder) {
      const size_t CallCount = SrcBB->getKnownExecutionCount();
      // If SrcBB does not call any functions, skip it.
      if (CallCount == 0)
        continue;

      // Obtain an estimate on the end address of the src basic block
      // after splitting at SplitIndex.
      const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;

      for (const BinaryBasicBlock *DstBB : Callees[GlobalIndices[SrcBB]]) {
        // Obtain an estimate on the start address of the dst basic block
        // after splitting at SplitIndex. If DstBB is in a function before
        // the current function, then its start address remains unchanged.
        size_t DstBBStartAddr = BBOffsets[DstBB];
        // If DstBB is in a function after the current function, then its
        // start address should be adjusted based on the reduction in hot size.
        if (GlobalIndices[DstBB] > LastGlobalIndex) {
          assert(DstBBStartAddr >= Score.HotSizeReduction);
          DstBBStartAddr -= Score.HotSizeReduction;
        }
        const size_t NewCallLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        Score.LocalScore += computeCallScore(CallCount, NewCallLength);
      }
    }
  }

  /// Compute the sum of splitting scores for cover calls of the input function.
  /// Increment Score.CoverCallScore in place by the sum.
  void computeCoverCallScore(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex,
                             const std::vector<CallInfo> &CoverCalls,
                             SplitScore &Score) {
    if (opts::CallScale == 0)
      return;

    for (const CallInfo CI : CoverCalls) {
      assert(CI.Length >= Score.HotSizeReduction &&
             "Length of cover calls must exceed reduced size of hot fragment.");
      // Compute the new length of the call, which is shorter than the original
      // one by the size of the split fragment minus the total size increase.
      const size_t NewCallLength = CI.Length - Score.HotSizeReduction;
      Score.CoverCallScore += computeCallScore(CI.Count, NewCallLength);
    }
  }

  /// Compute the split score of splitting a function at a given index.
  /// The split score consists of local score and cover score. This function
  /// returns \p Score of SplitScore type. It contains the local score and
  /// cover score of the current splitting index. For easier bookkeeping and
  /// comparison, it also stores the split index and the resulting reduction
  /// in hot fragment size.
  SplitScore computeSplitScore(const BinaryFunction &BF,
                               const BasicBlockOrder &BlockOrder,
                               const size_t SplitIndex,
                               const std::vector<CallInfo> &CoverCalls) {
    // Populate BinaryBasicBlock::OutputAddressRange with estimated
    // new start and end addresses after hot-warm splitting at SplitIndex.
    size_t OldHotEnd;
    size_t NewHotEnd;
    std::tie(OldHotEnd, NewHotEnd) =
        estimatePostSplitBBAddress(BlockOrder, SplitIndex);

    SplitScore Score;
    Score.SplitIndex = SplitIndex;

    // It's not worth splitting if OldHotEnd < NewHotEnd.
    if (OldHotEnd < NewHotEnd)
      return Score;

    // Hot fragment size reduction due to splitting.
    Score.HotSizeReduction = OldHotEnd - NewHotEnd;

    // The first part of LocalScore is the sum over call edges originating in
    // the input function. These edges can get shorter or longer depending on
    // SplitIndex. Score.LocalScore is incremented in place.
    computeLocalCallScore(BlockOrder, SplitIndex, Score);

    // The second part of LocalScore is the sum over jump edges with src basic
    // block and dst basic block in the current function. Score.LocalScore is
    // incremented in place.
    computeJumpScore(BlockOrder, SplitIndex, Score);

    // Compute CoverCallScore and store it in Score in place.
    computeCoverCallScore(BlockOrder, SplitIndex, CoverCalls, Score);
    return Score;
  }

  /// Find the most likely successor of a basic block when it has one or two
  /// successors. Return nullptr otherwise.
  const BinaryBasicBlock *getMostLikelySuccessor(const BinaryBasicBlock *BB) {
    if (BB->succ_size() == 1)
      return BB->getSuccessor();
    if (BB->succ_size() == 2) {
      uint64_t TakenCount = BB->getTakenBranchInfo().Count;
      assert(TakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
      uint64_t NonTakenCount = BB->getFallthroughBranchInfo().Count;
      assert(NonTakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
      if (TakenCount > NonTakenCount)
        return BB->getConditionalSuccessor(true);
      else if (TakenCount < NonTakenCount)
        return BB->getConditionalSuccessor(false);
    }
    return nullptr;
  }

  /// Find the best index for splitting. The returned value is the index of the
  /// last hot basic block. Hence, "no splitting" is equivalent to returning the
  /// value which is one less than the size of the function.
  size_t findSplitIndex(const BinaryFunction &BF,
                        const BasicBlockOrder &BlockOrder) {
    assert(BlockOrder.size() > 2);
    // Find all function calls that can be shortened if we move blocks of the
    // current function to warm/cold.
    const std::vector<CallInfo> CoverCalls = extractCoverCalls(BF);

    // Find the existing hot-cold splitting index.
    size_t HotColdIndex = 0;
    while (HotColdIndex + 1 < BlockOrder.size()) {
      if (BlockOrder[HotColdIndex + 1]->getFragmentNum() == FragmentNum::cold())
        break;
      HotColdIndex++;
    }
    assert(HotColdIndex + 1 == BlockOrder.size() ||
           (BlockOrder[HotColdIndex]->getFragmentNum() == FragmentNum::main() &&
            BlockOrder[HotColdIndex + 1]->getFragmentNum() ==
                FragmentNum::cold()));

    // Try all possible split indices up to HotColdIndex (blocks that have
    // Index <= SplitIndex are in hot) and find the one maximizing the
    // splitting score.
    SplitScore BestScore;
    for (size_t Index = 0; Index <= HotColdIndex; Index++) {
      const BinaryBasicBlock *LastHotBB = BlockOrder[Index];
      assert(LastHotBB->getFragmentNum() != FragmentNum::cold());

      // Do not break the jump to the most likely successor.
      if (Index + 1 < BlockOrder.size() &&
          BlockOrder[Index + 1] == getMostLikelySuccessor(LastHotBB))
        continue;

      const SplitScore Score =
          computeSplitScore(BF, BlockOrder, Index, CoverCalls);
      if (Score.sum() > BestScore.sum())
        BestScore = Score;
    }

    // If we don't find a good splitting point, fall back to the original one.
    if (BestScore.SplitIndex == size_t(-1))
      return HotColdIndex;

    return BestScore.SplitIndex;
  }
};

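// Random2 strategy: pick a single split point uniformly at random and move
// every block after it to the cold fragment.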
struct SplitRandom2 final : public SplitStrategy {
  std::minstd_rand0 Gen;

  SplitRandom2() : Gen(opts::RandomSeed.getValue()) {}

  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
    const DiffT NumBlocks = End - Start;
    assert(NumBlocks > 0 && "Cannot fragment empty function");

    // We want to split at least one block.
    const auto LastSplitPoint = std::max<DiffT>(NumBlocks - 1, 1);
    std::uniform_int_distribution<DiffT> Dist(1, LastSplitPoint);
    const DiffT SplitPoint = Dist(Gen);
    for (BinaryBasicBlock *BB : llvm::make_range(Start + SplitPoint, End))
      BB->setFragmentNum(FragmentNum::cold());

    LLVM_DEBUG(dbgs() << formatv("BOLT-DEBUG: randomly chose last {0} (out of "
                                 "{1} possible) blocks to split\n",
                                 NumBlocks - SplitPoint, End - Start));
  }
};

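// RandomN strategy: draw a random number of split points at random layout
// positions, producing several fragments per function.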
struct SplitRandomN final : public SplitStrategy {
  std::minstd_rand0 Gen;

  SplitRandomN() : Gen(opts::RandomSeed.getValue()) {}

  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
    const DiffT NumBlocks = End - Start;
    assert(NumBlocks > 0 && "Cannot fragment empty function");

    // With n blocks, there are n-1 places to split them.
    const DiffT MaximumSplits = NumBlocks - 1;
    // We want to generate at least two fragments if possible, but if there is
    // only one block, no splits are possible.
    const auto MinimumSplits = std::min<DiffT>(MaximumSplits, 1);
    std::uniform_int_distribution<DiffT> Dist(MinimumSplits, MaximumSplits);
    // Choose how many splits to perform.
    const DiffT NumSplits = Dist(Gen);

    // Draw split points from a lottery.
    SmallVector<unsigned, 0> Lottery(MaximumSplits);
    // Start the lottery at 1, because there is no meaningful split point
    // before the first block.
    std::iota(Lottery.begin(), Lottery.end(), 1u);
    std::shuffle(Lottery.begin(), Lottery.end(), Gen);
    Lottery.resize(NumSplits);
    llvm::sort(Lottery);

    // Add a one-past-the-end entry to the lottery.
    Lottery.push_back(NumBlocks);

    unsigned LotteryIndex = 0;
    unsigned BBPos = 0;
    for (BinaryBasicBlock *const BB : make_range(Start, End)) {
      // Check whether to start a new fragment.
      if (BBPos >= Lottery[LotteryIndex])
        ++LotteryIndex;

      // Because LotteryIndex is 0-based and cold fragments are 1-based, we can
      // use the index to assign fragments.
      BB->setFragmentNum(FragmentNum(LotteryIndex));

      ++BBPos;
    }
  }
};

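// All strategy: place every basic block of the function into its own fragment.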
struct SplitAll final : public SplitStrategy {
  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override {
    // Keeping empty fragments allows us to test that empty fragments do not
    // generate symbols.
    return false;
  }

  void fragment(const BlockIt Start, const BlockIt End) override {
    unsigned Fragment = 0;
    for (BinaryBasicBlock *const BB : llvm::make_range(Start, End))
      BB->setFragmentNum(FragmentNum(Fragment++));
  }
};
} // namespace

namespace llvm {
namespace bolt {

bool SplitFunctions::shouldOptimize(const BinaryFunction &BF) const {
  // Apply the execution count threshold.
  if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
    return false;

  return BinaryFunctionPass::shouldOptimize(BF);
}

Error SplitFunctions::runOnFunctions(BinaryContext &BC) {
  if (!opts::SplitFunctions)
    return Error::success();

  if (BC.IsLinuxKernel && BC.BOLTReserved.empty()) {
    BC.errs() << "BOLT-ERROR: split functions require reserved space in the "
                 "Linux kernel binary\n";
    exit(1);
  }

  // If the split strategy is not CDSplit, then a second run of the pass is not
  // needed after function reordering.
  if (BC.HasFinalizedFunctionOrder &&
      opts::SplitStrategy != SplitFunctionsStrategy::CDSplit)
    return Error::success();

  std::unique_ptr<SplitStrategy> Strategy;
  bool ForceSequential = false;

  switch (opts::SplitStrategy) {
  case SplitFunctionsStrategy::CDSplit:
    // CDSplit runs two splitting passes: hot-cold splitting (SplitProfile2)
    // before function reordering and hot-warm-cold splitting
    // (SplitCacheDirected) after function reordering.
    if (BC.HasFinalizedFunctionOrder)
      Strategy = std::make_unique<SplitCacheDirected>(BC);
    else
      Strategy = std::make_unique<SplitProfile2>();
    opts::AggressiveSplitting = true;
    BC.HasWarmSection = true;
    break;
  case SplitFunctionsStrategy::Profile2:
    Strategy = std::make_unique<SplitProfile2>();
    break;
  case SplitFunctionsStrategy::Random2:
    Strategy = std::make_unique<SplitRandom2>();
    // If we split functions randomly, we need to ensure that across runs with
    // the same input, we generate random numbers for each function in the same
    // order.
    ForceSequential = true;
    break;
  case SplitFunctionsStrategy::RandomN:
    Strategy = std::make_unique<SplitRandomN>();
    ForceSequential = true;
    break;
  case SplitFunctionsStrategy::All:
    Strategy = std::make_unique<SplitAll>();
    break;
  }

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
      [&](BinaryFunction &BF) { splitFunction(BF, *Strategy); }, SkipFunc,
      "SplitFunctions", ForceSequential);

  if (SplitBytesHot + SplitBytesCold > 0)
    BC.outs() << "BOLT-INFO: splitting separates " << SplitBytesHot
              << " hot bytes from " << SplitBytesCold << " cold bytes "
              << format("(%.2lf%% of split functions is hot).\n",
                        100.0 * SplitBytesHot /
                            (SplitBytesHot + SplitBytesCold));
  return Error::success();
}

void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy &S) {
  if (BF.empty())
    return;

  if (!S.canSplit(BF))
    return;

  FunctionLayout &Layout = BF.getLayout();
  BinaryFunction::BasicBlockOrderType PreSplitLayout(Layout.block_begin(),
                                                     Layout.block_end());

  BinaryContext &BC = BF.getBinaryContext();
  size_t OriginalHotSize;
  size_t HotSize;
  size_t ColdSize;
  if (BC.isX86()) {
    std::tie(OriginalHotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " pre-split is <0x"
                      << Twine::utohexstr(OriginalHotSize) << ", 0x"
                      << Twine::utohexstr(ColdSize) << ">\n");
  }

  BinaryFunction::BasicBlockOrderType NewLayout(Layout.block_begin(),
                                                Layout.block_end());
  // Never outline the first basic block.
  NewLayout.front()->setCanOutline(false);
  for (BinaryBasicBlock *const BB : NewLayout) {
    if (!BB->canOutline())
      continue;

    // Do not split extra entry points in AArch64. They can be referenced via
    // ADR instructions, and when this happens, these blocks cannot be placed
    // far away due to the limited range of ADR.
    if (BC.isAArch64() && BB->isEntryPoint()) {
      BB->setCanOutline(false);
      continue;
    }

    if (BF.hasEHRanges() && !opts::SplitEH) {
      // We cannot move landing pads (or rather entry points for landing pads).
      if (BB->isLandingPad()) {
        BB->setCanOutline(false);
        continue;
      }
      // We cannot move a block that can throw since the exception-handling
      // runtime cannot deal with split functions. However, if we can guarantee
      // that the block never throws, it is safe to move the block to
      // decrease the size of the function.
      for (MCInst &Instr : *BB) {
        if (BC.MIB->isInvoke(Instr)) {
          BB->setCanOutline(false);
          break;
        }
      }
    }

    // Outlining blocks with dynamic branches is not supported yet.
    if (BC.IsLinuxKernel) {
      if (llvm::any_of(
              *BB, [&](MCInst &Inst) { return BC.MIB->isDynamicBranch(Inst); }))
        BB->setCanOutline(false);
    }
  }

  BF.getLayout().updateLayoutIndices();
  S.fragment(NewLayout.begin(), NewLayout.end());

  // Make sure all non-outlineable blocks are in the main fragment.
  for (BinaryBasicBlock *const BB : NewLayout) {
    if (!BB->canOutline())
      BB->setFragmentNum(FragmentNum::main());
  }

  if (opts::AggressiveSplitting) {
    // All blocks with 0 count that we can move go to the end of the function,
    // even if they were natural to cluster formation and were seen in-between
    // hot basic blocks.
    llvm::stable_sort(NewLayout, [&](const BinaryBasicBlock *const A,
                                     const BinaryBasicBlock *const B) {
      return A->getFragmentNum() < B->getFragmentNum();
    });
  } else if (BF.hasEHRanges() && !opts::SplitEH) {
    // Typically functions with exception handling have landing pads at the end.
    // We cannot move the beginning of landing pads, but we can move 0-count
    // blocks comprising landing pads to the end and thus facilitate splitting.
    auto FirstLP = NewLayout.begin();
    while ((*FirstLP)->isLandingPad())
      ++FirstLP;

    std::stable_sort(FirstLP, NewLayout.end(),
                     [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
                       return A->getFragmentNum() < B->getFragmentNum();
                     });
  }

  // Make sure that fragment numbers are non-decreasing along the layout.
  FragmentNum CurrentFragment = NewLayout.back()->getFragmentNum();
  for (BinaryBasicBlock *const BB : reverse(NewLayout)) {
    if (BB->getFragmentNum() > CurrentFragment)
      BB->setFragmentNum(CurrentFragment);
    CurrentFragment = BB->getFragmentNum();
  }

  if (S.compactFragments()) {
    FragmentNum CurrentFragment = FragmentNum::main();
    FragmentNum NewFragment = FragmentNum::main();
    for (BinaryBasicBlock *const BB : NewLayout) {
      if (BB->getFragmentNum() > CurrentFragment) {
        CurrentFragment = BB->getFragmentNum();
        NewFragment = FragmentNum(NewFragment.get() + 1);
      }
      BB->setFragmentNum(NewFragment);
    }
  }

  const bool LayoutUpdated = BF.getLayout().update(NewLayout);

  // For shared objects, invoke instructions and corresponding landing pads
  // have to be placed in the same fragment. When we split them, create
  // trampoline landing pads that will redirect the execution to real LPs.
  TrampolineSetType Trampolines;
  if (!BC.HasFixedLoadAddress && BF.hasEHRanges() && BF.isSplit())
    Trampolines = createEHTrampolines(BF);

  // Check the new size to see if it's worth splitting the function.
  if (BC.isX86() && LayoutUpdated) {
    std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " post-split is <0x" << Twine::utohexstr(HotSize)
                      << ", 0x" << Twine::utohexstr(ColdSize) << ">\n");
    if (alignTo(OriginalHotSize, opts::SplitAlignThreshold) <=
        alignTo(HotSize, opts::SplitAlignThreshold) + opts::SplitThreshold) {
      if (opts::Verbosity >= 2) {
        BC.outs() << "BOLT-INFO: Reversing splitting of function "
                  << formatv("{0}:\n  {1:x}, {2:x} -> {3:x}\n", BF, HotSize,
                             ColdSize, OriginalHotSize);
      }

      // Reverse the action of createEHTrampolines(). The trampolines will be
      // placed immediately before the matching destination, resulting in no
      // extra code.
      if (PreSplitLayout.size() != BF.size())
        PreSplitLayout = mergeEHTrampolines(BF, PreSplitLayout, Trampolines);

      for (BinaryBasicBlock &BB : BF)
        BB.setFragmentNum(FragmentNum::main());
      BF.getLayout().update(PreSplitLayout);
    } else {
      SplitBytesHot += HotSize;
      SplitBytesCold += ColdSize;
    }
  }

  // Fix branches if the splitting decision of the pass after function
  // reordering is different from that of the pass before function reordering.
  if (LayoutUpdated && BC.HasFinalizedFunctionOrder)
    BF.fixBranches();
}

SplitFunctions::TrampolineSetType
SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
  const auto &MIB = BF.getBinaryContext().MIB;

  // Map real landing pads to the corresponding trampolines.
  TrampolineSetType LPTrampolines;

  // Iterate over a copy of the basic blocks since we are adding new blocks to
  // the function, which will invalidate its iterators.
  std::vector<BinaryBasicBlock *> Blocks(BF.pbegin(), BF.pend());
  for (BinaryBasicBlock *BB : Blocks) {
    for (MCInst &Instr : *BB) {
      const std::optional<MCPlus::MCLandingPad> EHInfo = MIB->getEHInfo(Instr);
      if (!EHInfo || !EHInfo->first)
        continue;

      const MCSymbol *LPLabel = EHInfo->first;
      BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(LPLabel);
      if (BB->getFragmentNum() == LPBlock->getFragmentNum())
        continue;

      const MCSymbol *TrampolineLabel = nullptr;
      const TrampolineKey Key(BB->getFragmentNum(), LPLabel);
      auto Iter = LPTrampolines.find(Key);
      if (Iter != LPTrampolines.end()) {
        TrampolineLabel = Iter->second;
      } else {
        // Create a trampoline basic block in the same fragment as the thrower.
        // Note: there's no need to insert the jump instruction, it will be
        // added by fixBranches().
        BinaryBasicBlock *TrampolineBB = BF.addBasicBlock();
        TrampolineBB->setFragmentNum(BB->getFragmentNum());
        TrampolineBB->setExecutionCount(LPBlock->getExecutionCount());
        TrampolineBB->addSuccessor(LPBlock, TrampolineBB->getExecutionCount());
        TrampolineBB->setCFIState(LPBlock->getCFIState());
        TrampolineLabel = TrampolineBB->getLabel();
        LPTrampolines.insert(std::make_pair(Key, TrampolineLabel));
      }

      // Substitute the landing pad with the trampoline.
      MIB->updateEHInfo(Instr,
                        MCPlus::MCLandingPad(TrampolineLabel, EHInfo->second));
    }
  }

  if (LPTrampolines.empty())
    return LPTrampolines;

  // All trampoline blocks were added to the end of the function. Place them at
  // the end of corresponding fragments.
  BinaryFunction::BasicBlockOrderType NewLayout(BF.getLayout().block_begin(),
                                                BF.getLayout().block_end());
  stable_sort(NewLayout, [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
    return A->getFragmentNum() < B->getFragmentNum();
  });
  BF.getLayout().update(NewLayout);

  // Conservatively introduce branch instructions.
  BF.fixBranches();

  // Update the exception-handling CFG for the function.
  BF.recomputeLandingPads();

  return LPTrampolines;
}

SplitFunctions::BasicBlockOrderType SplitFunctions::mergeEHTrampolines(
    BinaryFunction &BF, SplitFunctions::BasicBlockOrderType &Layout,
    const SplitFunctions::TrampolineSetType &Trampolines) const {
  DenseMap<const MCSymbol *, SmallVector<const MCSymbol *, 0>>
      IncomingTrampolines;
  for (const auto &Entry : Trampolines) {
    IncomingTrampolines[Entry.getFirst().Target].emplace_back(
        Entry.getSecond());
  }

  BasicBlockOrderType MergedLayout;
  for (BinaryBasicBlock *BB : Layout) {
    auto Iter = IncomingTrampolines.find(BB->getLabel());
    if (Iter != IncomingTrampolines.end()) {
      for (const MCSymbol *const Trampoline : Iter->getSecond()) {
        BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(Trampoline);
        assert(LPBlock && "Could not find matching landing pad block.");
        MergedLayout.push_back(LPBlock);
      }
    }
    MergedLayout.push_back(BB);
  }

  return MergedLayout;
}

} // namespace bolt
} // namespace llvm