llvm-project
3275 строк · 123.8 Кб
1//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements inline cost analysis.
10//
11//===----------------------------------------------------------------------===//
12
13#include "llvm/Analysis/InlineCost.h"
14#include "llvm/ADT/STLExtras.h"
15#include "llvm/ADT/SetVector.h"
16#include "llvm/ADT/SmallPtrSet.h"
17#include "llvm/ADT/SmallVector.h"
18#include "llvm/ADT/Statistic.h"
19#include "llvm/Analysis/AssumptionCache.h"
20#include "llvm/Analysis/BlockFrequencyInfo.h"
21#include "llvm/Analysis/CodeMetrics.h"
22#include "llvm/Analysis/ConstantFolding.h"
23#include "llvm/Analysis/InstructionSimplify.h"
24#include "llvm/Analysis/LoopInfo.h"
25#include "llvm/Analysis/MemoryBuiltins.h"
26#include "llvm/Analysis/OptimizationRemarkEmitter.h"
27#include "llvm/Analysis/ProfileSummaryInfo.h"
28#include "llvm/Analysis/TargetLibraryInfo.h"
29#include "llvm/Analysis/TargetTransformInfo.h"
30#include "llvm/Analysis/ValueTracking.h"
31#include "llvm/Config/llvm-config.h"
32#include "llvm/IR/AssemblyAnnotationWriter.h"
33#include "llvm/IR/CallingConv.h"
34#include "llvm/IR/DataLayout.h"
35#include "llvm/IR/Dominators.h"
36#include "llvm/IR/GetElementPtrTypeIterator.h"
37#include "llvm/IR/GlobalAlias.h"
38#include "llvm/IR/InstVisitor.h"
39#include "llvm/IR/IntrinsicInst.h"
40#include "llvm/IR/Operator.h"
41#include "llvm/IR/PatternMatch.h"
42#include "llvm/Support/CommandLine.h"
43#include "llvm/Support/Debug.h"
44#include "llvm/Support/FormattedStream.h"
45#include "llvm/Support/raw_ostream.h"
46#include <climits>
47#include <limits>
48#include <optional>
49
50using namespace llvm;
51
52#define DEBUG_TYPE "inline-cost"
53
54STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
55
56static cl::opt<int>
57DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
58cl::desc("Default amount of inlining to perform"));
59
60// We introduce this option since there is a minor compile-time win by avoiding
61// addition of TTI attributes (target-features in particular) to inline
62// candidates when they are guaranteed to be the same as top level methods in
63// some use cases. If we avoid adding the attribute, we need an option to avoid
64// checking these attributes.
65static cl::opt<bool> IgnoreTTIInlineCompatible(
66"ignore-tti-inline-compatible", cl::Hidden, cl::init(false),
67cl::desc("Ignore TTI attributes compatibility check between callee/caller "
68"during inline cost calculation"));
69
70static cl::opt<bool> PrintInstructionComments(
71"print-instruction-comments", cl::Hidden, cl::init(false),
72cl::desc("Prints comments for instruction based on inline cost analysis"));
73
74static cl::opt<int> InlineThreshold(
75"inline-threshold", cl::Hidden, cl::init(225),
76cl::desc("Control the amount of inlining to perform (default = 225)"));
77
78static cl::opt<int> HintThreshold(
79"inlinehint-threshold", cl::Hidden, cl::init(325),
80cl::desc("Threshold for inlining functions with inline hint"));
81
82static cl::opt<int>
83ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
84cl::init(45),
85cl::desc("Threshold for inlining cold callsites"));
86
87static cl::opt<bool> InlineEnableCostBenefitAnalysis(
88"inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
89cl::desc("Enable the cost-benefit analysis for the inliner"));
90
91// InlineSavingsMultiplier overrides per TTI multipliers iff it is
92// specified explicitly in command line options. This option is exposed
93// for tuning and testing.
94static cl::opt<int> InlineSavingsMultiplier(
95"inline-savings-multiplier", cl::Hidden, cl::init(8),
96cl::desc("Multiplier to multiply cycle savings by during inlining"));
97
98// InlineSavingsProfitableMultiplier overrides per TTI multipliers iff it is
99// specified explicitly in command line options. This option is exposed
100// for tuning and testing.
101static cl::opt<int> InlineSavingsProfitableMultiplier(
102"inline-savings-profitable-multiplier", cl::Hidden, cl::init(4),
103cl::desc("A multiplier on top of cycle savings to decide whether the "
104"savings won't justify the cost"));
105
106static cl::opt<int>
107InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
108cl::desc("The maximum size of a callee that get's "
109"inlined without sufficient cycle savings"));
110
111// We introduce this threshold to help performance of instrumentation based
112// PGO before we actually hook up inliner with analysis passes such as BPI and
113// BFI.
114static cl::opt<int> ColdThreshold(
115"inlinecold-threshold", cl::Hidden, cl::init(45),
116cl::desc("Threshold for inlining functions with cold attribute"));
117
118static cl::opt<int>
119HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
120cl::desc("Threshold for hot callsites "));
121
122static cl::opt<int> LocallyHotCallSiteThreshold(
123"locally-hot-callsite-threshold", cl::Hidden, cl::init(525),
124cl::desc("Threshold for locally hot callsites "));
125
126static cl::opt<int> ColdCallSiteRelFreq(
127"cold-callsite-rel-freq", cl::Hidden, cl::init(2),
128cl::desc("Maximum block frequency, expressed as a percentage of caller's "
129"entry frequency, for a callsite to be cold in the absence of "
130"profile information."));
131
132static cl::opt<uint64_t> HotCallSiteRelFreq(
133"hot-callsite-rel-freq", cl::Hidden, cl::init(60),
134cl::desc("Minimum block frequency, expressed as a multiple of caller's "
135"entry frequency, for a callsite to be hot in the absence of "
136"profile information."));
137
138static cl::opt<int>
139InstrCost("inline-instr-cost", cl::Hidden, cl::init(5),
140cl::desc("Cost of a single instruction when inlining"));
141
142static cl::opt<int>
143MemAccessCost("inline-memaccess-cost", cl::Hidden, cl::init(0),
144cl::desc("Cost of load/store instruction when inlining"));
145
146static cl::opt<int> CallPenalty(
147"inline-call-penalty", cl::Hidden, cl::init(25),
148cl::desc("Call penalty that is applied per callsite when inlining"));
149
150static cl::opt<size_t>
151StackSizeThreshold("inline-max-stacksize", cl::Hidden,
152cl::init(std::numeric_limits<size_t>::max()),
153cl::desc("Do not inline functions with a stack size "
154"that exceeds the specified limit"));
155
156static cl::opt<size_t> RecurStackSizeThreshold(
157"recursive-inline-max-stacksize", cl::Hidden,
158cl::init(InlineConstants::TotalAllocaSizeRecursiveCaller),
159cl::desc("Do not inline recursive functions with a stack "
160"size that exceeds the specified limit"));
161
162static cl::opt<bool> OptComputeFullInlineCost(
163"inline-cost-full", cl::Hidden,
164cl::desc("Compute the full inline cost of a call site even when the cost "
165"exceeds the threshold."));
166
167static cl::opt<bool> InlineCallerSupersetNoBuiltin(
168"inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
169cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
170"attributes."));
171
172static cl::opt<bool> DisableGEPConstOperand(
173"disable-gep-const-evaluation", cl::Hidden, cl::init(false),
174cl::desc("Disables evaluation of GetElementPtr with constant operands"));
175
176namespace llvm {
177std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) {
178if (Attr.isValid()) {
179int AttrValue = 0;
180if (!Attr.getValueAsString().getAsInteger(10, AttrValue))
181return AttrValue;
182}
183return std::nullopt;
184}
185
186std::optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) {
187return getStringFnAttrAsInt(CB.getFnAttr(AttrKind));
188}
189
190std::optional<int> getStringFnAttrAsInt(Function *F, StringRef AttrKind) {
191return getStringFnAttrAsInt(F->getFnAttribute(AttrKind));
192}
193
194namespace InlineConstants {
195int getInstrCost() { return InstrCost; }
196
197} // namespace InlineConstants
198
199} // namespace llvm
200
201namespace {
202class InlineCostCallAnalyzer;
203
/// Snapshot of the inline cost and threshold taken immediately before and
/// immediately after one particular instruction was analyzed.
struct InstructionCostDetail {
  int CostBefore = 0;
  int CostAfter = 0;
  int ThresholdBefore = 0;
  int ThresholdAfter = 0;

  /// Change in threshold across the instruction (after minus before).
  int getThresholdDelta() const {
    const int Delta = ThresholdAfter - ThresholdBefore;
    return Delta;
  }

  /// Change in cost across the instruction (after minus before).
  int getCostDelta() const {
    const int Delta = CostAfter - CostBefore;
    return Delta;
  }

  /// True when the threshold recorded after differs from the one before.
  bool hasThresholdChanged() const {
    return !(ThresholdAfter == ThresholdBefore);
  }
};
218
219class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
220private:
221InlineCostCallAnalyzer *const ICCA;
222
223public:
224InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}
225void emitInstructionAnnot(const Instruction *I,
226formatted_raw_ostream &OS) override;
227};
228
229/// Carry out call site analysis, in order to evaluate inlinability.
230/// NOTE: the type is currently used as implementation detail of functions such
231/// as llvm::getInlineCost. Note the function_ref constructor parameters - the
232/// expectation is that they come from the outer scope, from the wrapper
233/// functions. If we want to support constructing CallAnalyzer objects where
234/// lambdas are provided inline at construction, or where the object needs to
235/// otherwise survive past the scope of the provided functions, we need to
236/// revisit the argument types.
237class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
238typedef InstVisitor<CallAnalyzer, bool> Base;
239friend class InstVisitor<CallAnalyzer, bool>;
240
241protected:
242virtual ~CallAnalyzer() = default;
243/// The TargetTransformInfo available for this compilation.
244const TargetTransformInfo &TTI;
245
246/// Getter for the cache of @llvm.assume intrinsics.
247function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
248
249/// Getter for BlockFrequencyInfo
250function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
251
252/// Profile summary information.
253ProfileSummaryInfo *PSI;
254
255/// The called function.
256Function &F;
257
258// Cache the DataLayout since we use it a lot.
259const DataLayout &DL;
260
261/// The OptimizationRemarkEmitter available for this compilation.
262OptimizationRemarkEmitter *ORE;
263
264/// The candidate callsite being analyzed. Please do not use this to do
265/// analysis in the caller function; we want the inline cost query to be
266/// easily cacheable. Instead, use the cover function paramHasAttr.
267CallBase &CandidateCall;
268
269/// Extension points for handling callsite features.
270// Called before a basic block was analyzed.
271virtual void onBlockStart(const BasicBlock *BB) {}
272
273/// Called after a basic block was analyzed.
274virtual void onBlockAnalyzed(const BasicBlock *BB) {}
275
276/// Called before an instruction was analyzed
277virtual void onInstructionAnalysisStart(const Instruction *I) {}
278
279/// Called after an instruction was analyzed
280virtual void onInstructionAnalysisFinish(const Instruction *I) {}
281
282/// Called at the end of the analysis of the callsite. Return the outcome of
283/// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
284/// the reason it can't.
285virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }
286/// Called when we're about to start processing a basic block, and every time
287/// we are done processing an instruction. Return true if there is no point in
288/// continuing the analysis (e.g. we've determined already the call site is
289/// too expensive to inline)
290virtual bool shouldStop() { return false; }
291
292/// Called before the analysis of the callee body starts (with callsite
293/// contexts propagated). It checks callsite-specific information. Return a
294/// reason analysis can't continue if that's the case, or 'true' if it may
295/// continue.
296virtual InlineResult onAnalysisStart() { return InlineResult::success(); }
297/// Called if the analysis engine decides SROA cannot be done for the given
298/// alloca.
299virtual void onDisableSROA(AllocaInst *Arg) {}
300
301/// Called the analysis engine determines load elimination won't happen.
302virtual void onDisableLoadElimination() {}
303
304/// Called when we visit a CallBase, before the analysis starts. Return false
305/// to stop further processing of the instruction.
306virtual bool onCallBaseVisitStart(CallBase &Call) { return true; }
307
308/// Called to account for a call.
309virtual void onCallPenalty() {}
310
311/// Called to account for a load or store.
312virtual void onMemAccess(){};
313
314/// Called to account for the expectation the inlining would result in a load
315/// elimination.
316virtual void onLoadEliminationOpportunity() {}
317
318/// Called to account for the cost of argument setup for the Call in the
319/// callee's body (not the callsite currently under analysis).
320virtual void onCallArgumentSetup(const CallBase &Call) {}
321
322/// Called to account for a load relative intrinsic.
323virtual void onLoadRelativeIntrinsic() {}
324
325/// Called to account for a lowered call.
326virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
327}
328
329/// Account for a jump table of given size. Return false to stop further
330/// processing the switch instruction
331virtual bool onJumpTable(unsigned JumpTableSize) { return true; }
332
333/// Account for a case cluster of given size. Return false to stop further
334/// processing of the instruction.
335virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }
336
337/// Called at the end of processing a switch instruction, with the given
338/// number of case clusters.
339virtual void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
340bool DefaultDestUndefined) {}
341
342/// Called to account for any other instruction not specifically accounted
343/// for.
344virtual void onMissedSimplification() {}
345
346/// Start accounting potential benefits due to SROA for the given alloca.
347virtual void onInitializeSROAArg(AllocaInst *Arg) {}
348
349/// Account SROA savings for the AllocaInst value.
350virtual void onAggregateSROAUse(AllocaInst *V) {}
351
352bool handleSROA(Value *V, bool DoNotDisable) {
353// Check for SROA candidates in comparisons.
354if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
355if (DoNotDisable) {
356onAggregateSROAUse(SROAArg);
357return true;
358}
359disableSROAForArg(SROAArg);
360}
361return false;
362}
363
364bool IsCallerRecursive = false;
365bool IsRecursiveCall = false;
366bool ExposesReturnsTwice = false;
367bool HasDynamicAlloca = false;
368bool ContainsNoDuplicateCall = false;
369bool HasReturn = false;
370bool HasIndirectBr = false;
371bool HasUninlineableIntrinsic = false;
372bool InitsVargArgs = false;
373
374/// Number of bytes allocated statically by the callee.
375uint64_t AllocatedSize = 0;
376unsigned NumInstructions = 0;
377unsigned NumVectorInstructions = 0;
378
379/// While we walk the potentially-inlined instructions, we build up and
380/// maintain a mapping of simplified values specific to this callsite. The
381/// idea is to propagate any special information we have about arguments to
382/// this call through the inlinable section of the function, and account for
383/// likely simplifications post-inlining. The most important aspect we track
384/// is CFG altering simplifications -- when we prove a basic block dead, that
385/// can cause dramatic shifts in the cost of inlining a function.
386DenseMap<Value *, Constant *> SimplifiedValues;
387
388/// Keep track of the values which map back (through function arguments) to
389/// allocas on the caller stack which could be simplified through SROA.
390DenseMap<Value *, AllocaInst *> SROAArgValues;
391
392/// Keep track of Allocas for which we believe we may get SROA optimization.
393DenseSet<AllocaInst *> EnabledSROAAllocas;
394
395/// Keep track of values which map to a pointer base and constant offset.
396DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
397
398/// Keep track of dead blocks due to the constant arguments.
399SmallPtrSet<BasicBlock *, 16> DeadBlocks;
400
401/// The mapping of the blocks to their known unique successors due to the
402/// constant arguments.
403DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;
404
405/// Model the elimination of repeated loads that is expected to happen
406/// whenever we simplify away the stores that would otherwise cause them to be
407/// loads.
408bool EnableLoadElimination = true;
409
410/// Whether we allow inlining for recursive call.
411bool AllowRecursiveCall = false;
412
413SmallPtrSet<Value *, 16> LoadAddrSet;
414
415AllocaInst *getSROAArgForValueOrNull(Value *V) const {
416auto It = SROAArgValues.find(V);
417if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
418return nullptr;
419return It->second;
420}
421
422// Custom simplification helper routines.
423bool isAllocaDerivedArg(Value *V);
424void disableSROAForArg(AllocaInst *SROAArg);
425void disableSROA(Value *V);
426void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
427void disableLoadElimination();
428bool isGEPFree(GetElementPtrInst &GEP);
429bool canFoldInboundsGEP(GetElementPtrInst &I);
430bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
431bool simplifyCallSite(Function *F, CallBase &Call);
432bool simplifyInstruction(Instruction &I);
433bool simplifyIntrinsicCallIsConstant(CallBase &CB);
434bool simplifyIntrinsicCallObjectSize(CallBase &CB);
435ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
436
437/// Return true if the given argument to the function being considered for
438/// inlining has the given attribute set either at the call site or the
439/// function declaration. Primarily used to inspect call site specific
440/// attributes since these can be more precise than the ones on the callee
441/// itself.
442bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
443
444/// Return true if the given value is known non null within the callee if
445/// inlined through this particular callsite.
446bool isKnownNonNullInCallee(Value *V);
447
448/// Return true if size growth is allowed when inlining the callee at \p Call.
449bool allowSizeGrowth(CallBase &Call);
450
451// Custom analysis routines.
452InlineResult analyzeBlock(BasicBlock *BB,
453SmallPtrSetImpl<const Value *> &EphValues);
454
455// Disable several entry points to the visitor so we don't accidentally use
456// them by declaring but not defining them here.
457void visit(Module *);
458void visit(Module &);
459void visit(Function *);
460void visit(Function &);
461void visit(BasicBlock *);
462void visit(BasicBlock &);
463
464// Provide base case for our instruction visit.
465bool visitInstruction(Instruction &I);
466
467// Our visit overrides.
468bool visitAlloca(AllocaInst &I);
469bool visitPHI(PHINode &I);
470bool visitGetElementPtr(GetElementPtrInst &I);
471bool visitBitCast(BitCastInst &I);
472bool visitPtrToInt(PtrToIntInst &I);
473bool visitIntToPtr(IntToPtrInst &I);
474bool visitCastInst(CastInst &I);
475bool visitCmpInst(CmpInst &I);
476bool visitSub(BinaryOperator &I);
477bool visitBinaryOperator(BinaryOperator &I);
478bool visitFNeg(UnaryOperator &I);
479bool visitLoad(LoadInst &I);
480bool visitStore(StoreInst &I);
481bool visitExtractValue(ExtractValueInst &I);
482bool visitInsertValue(InsertValueInst &I);
483bool visitCallBase(CallBase &Call);
484bool visitReturnInst(ReturnInst &RI);
485bool visitBranchInst(BranchInst &BI);
486bool visitSelectInst(SelectInst &SI);
487bool visitSwitchInst(SwitchInst &SI);
488bool visitIndirectBrInst(IndirectBrInst &IBI);
489bool visitResumeInst(ResumeInst &RI);
490bool visitCleanupReturnInst(CleanupReturnInst &RI);
491bool visitCatchReturnInst(CatchReturnInst &RI);
492bool visitUnreachableInst(UnreachableInst &I);
493
494public:
495CallAnalyzer(Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
496function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
497function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
498ProfileSummaryInfo *PSI = nullptr,
499OptimizationRemarkEmitter *ORE = nullptr)
500: TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
501PSI(PSI), F(Callee), DL(F.getDataLayout()), ORE(ORE),
502CandidateCall(Call) {}
503
504InlineResult analyze();
505
506std::optional<Constant *> getSimplifiedValue(Instruction *I) {
507if (SimplifiedValues.contains(I))
508return SimplifiedValues[I];
509return std::nullopt;
510}
511
512// Keep a bunch of stats about the cost savings found so we can print them
513// out when debugging.
514unsigned NumConstantArgs = 0;
515unsigned NumConstantOffsetPtrArgs = 0;
516unsigned NumAllocaArgs = 0;
517unsigned NumConstantPtrCmps = 0;
518unsigned NumConstantPtrDiffs = 0;
519unsigned NumInstructionsSimplified = 0;
520
521void dump();
522};
523
// Estimate how many comparisons a switch lowered as a binary search needs.
// The number of comparisons equals the number of nodes in the search tree.
// For n case clusters the node count f(n) obeys:
//   f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
//   f(n) = n,                       when n <= 3.
// For n > 3 this yields a binary tree whose leaves are either f(2) or f(3),
// so the leaves contribute n comparisons in total, while the number of
// non-leaf nodes is:
//   2^(log2(n) - 1) - 1
//   = 2^log2(n) * 2^-1 - 1
//   = n / 2 - 1.
// Adding leaf and non-leaf comparisons gives the simple closed form:
//   n + n / 2 - 1 = n * 3 / 2 - 1
int64_t getExpectedNumberOfCompare(int NumCaseCluster) {
  // Widen first so that the multiplication is carried out in 64 bits.
  const int64_t Clusters = NumCaseCluster;
  const int64_t Tripled = 3 * Clusters;
  return Tripled / 2 - 1;
}
542
543/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
544/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
545class InlineCostCallAnalyzer final : public CallAnalyzer {
546const bool ComputeFullInlineCost;
547int LoadEliminationCost = 0;
548/// Bonus to be applied when percentage of vector instructions in callee is
549/// high (see more details in updateThreshold).
550int VectorBonus = 0;
551/// Bonus to be applied when the callee has only one reachable basic block.
552int SingleBBBonus = 0;
553
554/// Tunable parameters that control the analysis.
555const InlineParams &Params;
556
557// This DenseMap stores the delta change in cost and threshold after
558// accounting for the given instruction. The map is filled only with the
559// flag PrintInstructionComments on.
560DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap;
561
562/// Upper bound for the inlining cost. Bonuses are being applied to account
563/// for speculative "expected profit" of the inlining decision.
564int Threshold = 0;
565
566/// The amount of StaticBonus applied.
567int StaticBonusApplied = 0;
568
569/// Attempt to evaluate indirect calls to boost its inline cost.
570const bool BoostIndirectCalls;
571
572/// Ignore the threshold when finalizing analysis.
573const bool IgnoreThreshold;
574
575// True if the cost-benefit-analysis-based inliner is enabled.
576const bool CostBenefitAnalysisEnabled;
577
578/// Inlining cost measured in abstract units, accounts for all the
579/// instructions expected to be executed for a given function invocation.
580/// Instructions that are statically proven to be dead based on call-site
581/// arguments are not counted here.
582int Cost = 0;
583
584// The cumulative cost at the beginning of the basic block being analyzed. At
585// the end of analyzing each basic block, "Cost - CostAtBBStart" represents
586// the size of that basic block.
587int CostAtBBStart = 0;
588
589// The static size of live but cold basic blocks. This is "static" in the
590// sense that it's not weighted by profile counts at all.
591int ColdSize = 0;
592
593// Whether inlining is decided by cost-threshold analysis.
594bool DecidedByCostThreshold = false;
595
596// Whether inlining is decided by cost-benefit analysis.
597bool DecidedByCostBenefit = false;
598
599// The cost-benefit pair computed by cost-benefit analysis.
600std::optional<CostBenefitPair> CostBenefit;
601
602bool SingleBB = true;
603
604unsigned SROACostSavings = 0;
605unsigned SROACostSavingsLost = 0;
606
607/// The mapping of caller Alloca values to their accumulated cost savings. If
608/// we have to disable SROA for one of the allocas, this tells us how much
609/// cost must be added.
610DenseMap<AllocaInst *, int> SROAArgCosts;
611
612/// Return true if \p Call is a cold callsite.
613bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);
614
615/// Update Threshold based on callsite properties such as callee
616/// attributes and callee hotness for PGO builds. The Callee is explicitly
617/// passed to support analyzing indirect calls whose target is inferred by
618/// analysis.
619void updateThreshold(CallBase &Call, Function &Callee);
620/// Return a higher threshold if \p Call is a hot callsite.
621std::optional<int> getHotCallSiteThreshold(CallBase &Call,
622BlockFrequencyInfo *CallerBFI);
623
624/// Handle a capped 'int' increment for Cost.
625void addCost(int64_t Inc) {
626Inc = std::clamp<int64_t>(Inc, INT_MIN, INT_MAX);
627Cost = std::clamp<int64_t>(Inc + Cost, INT_MIN, INT_MAX);
628}
629
630void onDisableSROA(AllocaInst *Arg) override {
631auto CostIt = SROAArgCosts.find(Arg);
632if (CostIt == SROAArgCosts.end())
633return;
634addCost(CostIt->second);
635SROACostSavings -= CostIt->second;
636SROACostSavingsLost += CostIt->second;
637SROAArgCosts.erase(CostIt);
638}
639
640void onDisableLoadElimination() override {
641addCost(LoadEliminationCost);
642LoadEliminationCost = 0;
643}
644
645bool onCallBaseVisitStart(CallBase &Call) override {
646if (std::optional<int> AttrCallThresholdBonus =
647getStringFnAttrAsInt(Call, "call-threshold-bonus"))
648Threshold += *AttrCallThresholdBonus;
649
650if (std::optional<int> AttrCallCost =
651getStringFnAttrAsInt(Call, "call-inline-cost")) {
652addCost(*AttrCallCost);
653// Prevent further processing of the call since we want to override its
654// inline cost, not just add to it.
655return false;
656}
657return true;
658}
659
660void onCallPenalty() override { addCost(CallPenalty); }
661
662void onMemAccess() override { addCost(MemAccessCost); }
663
664void onCallArgumentSetup(const CallBase &Call) override {
665// Pay the price of the argument setup. We account for the average 1
666// instruction per call argument setup here.
667addCost(Call.arg_size() * InstrCost);
668}
669void onLoadRelativeIntrinsic() override {
670// This is normally lowered to 4 LLVM instructions.
671addCost(3 * InstrCost);
672}
673void onLoweredCall(Function *F, CallBase &Call,
674bool IsIndirectCall) override {
675// We account for the average 1 instruction per call argument setup here.
676addCost(Call.arg_size() * InstrCost);
677
678// If we have a constant that we are calling as a function, we can peer
679// through it and see the function target. This happens not infrequently
680// during devirtualization and so we want to give it a hefty bonus for
681// inlining, but cap that bonus in the event that inlining wouldn't pan out.
682// Pretend to inline the function, with a custom threshold.
683if (IsIndirectCall && BoostIndirectCalls) {
684auto IndirectCallParams = Params;
685IndirectCallParams.DefaultThreshold =
686InlineConstants::IndirectCallThreshold;
687/// FIXME: if InlineCostCallAnalyzer is derived from, this may need
688/// to instantiate the derived class.
689InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
690GetAssumptionCache, GetBFI, PSI, ORE, false);
691if (CA.analyze().isSuccess()) {
692// We were able to inline the indirect call! Subtract the cost from the
693// threshold to get the bonus we want to apply, but don't go below zero.
694Cost -= std::max(0, CA.getThreshold() - CA.getCost());
695}
696} else
697// Otherwise simply add the cost for merely making the call.
698addCost(TTI.getInlineCallPenalty(CandidateCall.getCaller(), Call,
699CallPenalty));
700}
701
702void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
703bool DefaultDestUndefined) override {
704// If suitable for a jump table, consider the cost for the table size and
705// branch to destination.
706// Maximum valid cost increased in this function.
707if (JumpTableSize) {
708// Suppose a default branch includes one compare and one conditional
709// branch if it's reachable.
710if (!DefaultDestUndefined)
711addCost(2 * InstrCost);
712// Suppose a jump table requires one load and one jump instruction.
713int64_t JTCost =
714static_cast<int64_t>(JumpTableSize) * InstrCost + 2 * InstrCost;
715addCost(JTCost);
716return;
717}
718
719if (NumCaseCluster <= 3) {
720// Suppose a comparison includes one compare and one conditional branch.
721// We can reduce a set of instructions if the default branch is
722// undefined.
723addCost((NumCaseCluster - DefaultDestUndefined) * 2 * InstrCost);
724return;
725}
726
727int64_t ExpectedNumberOfCompare =
728getExpectedNumberOfCompare(NumCaseCluster);
729int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InstrCost;
730
731addCost(SwitchCost);
732}
733void onMissedSimplification() override { addCost(InstrCost); }
734
735void onInitializeSROAArg(AllocaInst *Arg) override {
736assert(Arg != nullptr &&
737"Should not initialize SROA costs for null value.");
738auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
739SROACostSavings += SROAArgCost;
740SROAArgCosts[Arg] = SROAArgCost;
741}
742
743void onAggregateSROAUse(AllocaInst *SROAArg) override {
744auto CostIt = SROAArgCosts.find(SROAArg);
745assert(CostIt != SROAArgCosts.end() &&
746"expected this argument to have a cost");
747CostIt->second += InstrCost;
748SROACostSavings += InstrCost;
749}
750
751void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }
752
753void onBlockAnalyzed(const BasicBlock *BB) override {
754if (CostBenefitAnalysisEnabled) {
755// Keep track of the static size of live but cold basic blocks. For now,
756// we define a cold basic block to be one that's never executed.
757assert(GetBFI && "GetBFI must be available");
758BlockFrequencyInfo *BFI = &(GetBFI(F));
759assert(BFI && "BFI must be available");
760auto ProfileCount = BFI->getBlockProfileCount(BB);
761if (*ProfileCount == 0)
762ColdSize += Cost - CostAtBBStart;
763}
764
765auto *TI = BB->getTerminator();
766// If we had any successors at this point, than post-inlining is likely to
767// have them as well. Note that we assume any basic blocks which existed
768// due to branches or switches which folded above will also fold after
769// inlining.
770if (SingleBB && TI->getNumSuccessors() > 1) {
771// Take off the bonus we applied to the threshold.
772Threshold -= SingleBBBonus;
773SingleBB = false;
774}
775}
776
777void onInstructionAnalysisStart(const Instruction *I) override {
778// This function is called to store the initial cost of inlining before
779// the given instruction was assessed.
780if (!PrintInstructionComments)
781return;
782InstructionCostDetailMap[I].CostBefore = Cost;
783InstructionCostDetailMap[I].ThresholdBefore = Threshold;
784}
785
786void onInstructionAnalysisFinish(const Instruction *I) override {
787// This function is called to find new values of cost and threshold after
788// the instruction has been assessed.
789if (!PrintInstructionComments)
790return;
791InstructionCostDetailMap[I].CostAfter = Cost;
792InstructionCostDetailMap[I].ThresholdAfter = Threshold;
793}
794
795bool isCostBenefitAnalysisEnabled() {
796if (!PSI || !PSI->hasProfileSummary())
797return false;
798
799if (!GetBFI)
800return false;
801
802if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) {
803// Honor the explicit request from the user.
804if (!InlineEnableCostBenefitAnalysis)
805return false;
806} else {
807// Otherwise, require instrumentation profile.
808if (!PSI->hasInstrumentationProfile())
809return false;
810}
811
812auto *Caller = CandidateCall.getParent()->getParent();
813if (!Caller->getEntryCount())
814return false;
815
816BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
817if (!CallerBFI)
818return false;
819
820// For now, limit to hot call site.
821if (!PSI->isHotCallSite(CandidateCall, CallerBFI))
822return false;
823
824// Make sure we have a nonzero entry count.
825auto EntryCount = F.getEntryCount();
826if (!EntryCount || !EntryCount->getCount())
827return false;
828
829BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
830if (!CalleeBFI)
831return false;
832
833return true;
834}
835
836// A helper function to choose between command line override and default.
837unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const {
838if (InlineSavingsMultiplier.getNumOccurrences())
839return InlineSavingsMultiplier;
840return TTI.getInliningCostBenefitAnalysisSavingsMultiplier();
841}
842
843// A helper function to choose between command line override and default.
844unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const {
845if (InlineSavingsProfitableMultiplier.getNumOccurrences())
846return InlineSavingsProfitableMultiplier;
847return TTI.getInliningCostBenefitAnalysisProfitableMultiplier();
848}
849
850void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) {
851if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt(
852CandidateCall, "inline-cycle-savings-for-test")) {
853CycleSavings = *AttrCycleSavings;
854}
855
856if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt(
857CandidateCall, "inline-runtime-cost-for-test")) {
858Size = *AttrRuntimeCost;
859}
860}
861
862// Determine whether we should inline the given call site, taking into account
863// both the size cost and the cycle savings. Return std::nullopt if we don't
864// have sufficient profiling information to determine.
865std::optional<bool> costBenefitAnalysis() {
866if (!CostBenefitAnalysisEnabled)
867return std::nullopt;
868
869// buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
870// for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by
871// falling back to the cost-based metric.
872// TODO: Improve this hacky condition.
873if (Threshold == 0)
874return std::nullopt;
875
876assert(GetBFI);
877BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
878assert(CalleeBFI);
879
880// The cycle savings expressed as the sum of InstrCost
881// multiplied by the estimated dynamic count of each instruction we can
882// avoid. Savings come from the call site cost, such as argument setup and
883// the call instruction, as well as the instructions that are folded.
884//
885// We use 128-bit APInt here to avoid potential overflow. This variable
886// should stay well below 10^^24 (or 2^^80) in practice. This "worst" case
887// assumes that we can avoid or fold a billion instructions, each with a
888// profile count of 10^^15 -- roughly the number of cycles for a 24-hour
889// period on a 4GHz machine.
890APInt CycleSavings(128, 0);
891
892for (auto &BB : F) {
893APInt CurrentSavings(128, 0);
894for (auto &I : BB) {
895if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
896// Count a conditional branch as savings if it becomes unconditional.
897if (BI->isConditional() &&
898isa_and_nonnull<ConstantInt>(
899SimplifiedValues.lookup(BI->getCondition()))) {
900CurrentSavings += InstrCost;
901}
902} else if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
903if (isa_and_present<ConstantInt>(SimplifiedValues.lookup(SI->getCondition())))
904CurrentSavings += InstrCost;
905} else if (Value *V = dyn_cast<Value>(&I)) {
906// Count an instruction as savings if we can fold it.
907if (SimplifiedValues.count(V)) {
908CurrentSavings += InstrCost;
909}
910}
911}
912
913auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
914CurrentSavings *= *ProfileCount;
915CycleSavings += CurrentSavings;
916}
917
918// Compute the cycle savings per call.
919auto EntryProfileCount = F.getEntryCount();
920assert(EntryProfileCount && EntryProfileCount->getCount());
921auto EntryCount = EntryProfileCount->getCount();
922CycleSavings += EntryCount / 2;
923CycleSavings = CycleSavings.udiv(EntryCount);
924
925// Compute the total savings for the call site.
926auto *CallerBB = CandidateCall.getParent();
927BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
928CycleSavings += getCallsiteCost(TTI, this->CandidateCall, DL);
929CycleSavings *= *CallerBFI->getBlockProfileCount(CallerBB);
930
931// Remove the cost of the cold basic blocks to model the runtime cost more
932// accurately. Both machine block placement and function splitting could
933// place cold blocks further from hot blocks.
934int Size = Cost - ColdSize;
935
936// Allow tiny callees to be inlined regardless of whether they meet the
937// savings threshold.
938Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1;
939
940OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size);
941CostBenefit.emplace(APInt(128, Size), CycleSavings);
942
943// Let R be the ratio of CycleSavings to Size. We accept the inlining
944// opportunity if R is really high and reject if R is really low. If R is
945// somewhere in the middle, we fall back to the cost-based analysis.
946//
947// Specifically, let R = CycleSavings / Size, we accept the inlining
948// opportunity if:
949//
950// PSI->getOrCompHotCountThreshold()
951// R > -------------------------------------------------
952// getInliningCostBenefitAnalysisSavingsMultiplier()
953//
954// and reject the inlining opportunity if:
955//
956// PSI->getOrCompHotCountThreshold()
957// R <= ----------------------------------------------------
958// getInliningCostBenefitAnalysisProfitableMultiplier()
959//
960// Otherwise, we fall back to the cost-based analysis.
961//
962// Implementation-wise, use multiplication (CycleSavings * Multiplier,
963// HotCountThreshold * Size) rather than division to avoid precision loss.
964APInt Threshold(128, PSI->getOrCompHotCountThreshold());
965Threshold *= Size;
966
967APInt UpperBoundCycleSavings = CycleSavings;
968UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier();
969if (UpperBoundCycleSavings.uge(Threshold))
970return true;
971
972APInt LowerBoundCycleSavings = CycleSavings;
973LowerBoundCycleSavings *=
974getInliningCostBenefitAnalysisProfitableMultiplier();
975if (LowerBoundCycleSavings.ult(Threshold))
976return false;
977
978// Otherwise, fall back to the cost-based analysis.
979return std::nullopt;
980}
981
982InlineResult finalizeAnalysis() override {
983// Loops generally act a lot like calls in that they act like barriers to
984// movement, require a certain amount of setup, etc. So when optimising for
985// size, we penalise any call sites that perform loops. We do this after all
986// other costs here, so will likely only be dealing with relatively small
987// functions (and hence DT and LI will hopefully be cheap).
988auto *Caller = CandidateCall.getFunction();
989if (Caller->hasMinSize()) {
990DominatorTree DT(F);
991LoopInfo LI(DT);
992int NumLoops = 0;
993for (Loop *L : LI) {
994// Ignore loops that will not be executed
995if (DeadBlocks.count(L->getHeader()))
996continue;
997NumLoops++;
998}
999addCost(NumLoops * InlineConstants::LoopPenalty);
1000}
1001
1002// We applied the maximum possible vector bonus at the beginning. Now,
1003// subtract the excess bonus, if any, from the Threshold before
1004// comparing against Cost.
1005if (NumVectorInstructions <= NumInstructions / 10)
1006Threshold -= VectorBonus;
1007else if (NumVectorInstructions <= NumInstructions / 2)
1008Threshold -= VectorBonus / 2;
1009
1010if (std::optional<int> AttrCost =
1011getStringFnAttrAsInt(CandidateCall, "function-inline-cost"))
1012Cost = *AttrCost;
1013
1014if (std::optional<int> AttrCostMult = getStringFnAttrAsInt(
1015CandidateCall,
1016InlineConstants::FunctionInlineCostMultiplierAttributeName))
1017Cost *= *AttrCostMult;
1018
1019if (std::optional<int> AttrThreshold =
1020getStringFnAttrAsInt(CandidateCall, "function-inline-threshold"))
1021Threshold = *AttrThreshold;
1022
1023if (auto Result = costBenefitAnalysis()) {
1024DecidedByCostBenefit = true;
1025if (*Result)
1026return InlineResult::success();
1027else
1028return InlineResult::failure("Cost over threshold.");
1029}
1030
1031if (IgnoreThreshold)
1032return InlineResult::success();
1033
1034DecidedByCostThreshold = true;
1035return Cost < std::max(1, Threshold)
1036? InlineResult::success()
1037: InlineResult::failure("Cost over threshold.");
1038}
1039
1040bool shouldStop() override {
1041if (IgnoreThreshold || ComputeFullInlineCost)
1042return false;
1043// Bail out the moment we cross the threshold. This means we'll under-count
1044// the cost, but only when undercounting doesn't matter.
1045if (Cost < Threshold)
1046return false;
1047DecidedByCostThreshold = true;
1048return true;
1049}
1050
1051void onLoadEliminationOpportunity() override {
1052LoadEliminationCost += InstrCost;
1053}
1054
1055InlineResult onAnalysisStart() override {
1056// Perform some tweaks to the cost and threshold based on the direct
1057// callsite information.
1058
1059// We want to more aggressively inline vector-dense kernels, so up the
1060// threshold, and we'll lower it if the % of vector instructions gets too
1061// low. Note that these bonuses are some what arbitrary and evolved over
1062// time by accident as much as because they are principled bonuses.
1063//
1064// FIXME: It would be nice to remove all such bonuses. At least it would be
1065// nice to base the bonus values on something more scientific.
1066assert(NumInstructions == 0);
1067assert(NumVectorInstructions == 0);
1068
1069// Update the threshold based on callsite properties
1070updateThreshold(CandidateCall, F);
1071
1072// While Threshold depends on commandline options that can take negative
1073// values, we want to enforce the invariant that the computed threshold and
1074// bonuses are non-negative.
1075assert(Threshold >= 0);
1076assert(SingleBBBonus >= 0);
1077assert(VectorBonus >= 0);
1078
1079// Speculatively apply all possible bonuses to Threshold. If cost exceeds
1080// this Threshold any time, and cost cannot decrease, we can stop processing
1081// the rest of the function body.
1082Threshold += (SingleBBBonus + VectorBonus);
1083
1084// Give out bonuses for the callsite, as the instructions setting them up
1085// will be gone after inlining.
1086addCost(-getCallsiteCost(TTI, this->CandidateCall, DL));
1087
1088// If this function uses the coldcc calling convention, prefer not to inline
1089// it.
1090if (F.getCallingConv() == CallingConv::Cold)
1091Cost += InlineConstants::ColdccPenalty;
1092
1093LLVM_DEBUG(dbgs() << " Initial cost: " << Cost << "\n");
1094
1095// Check if we're done. This can happen due to bonuses and penalties.
1096if (Cost >= Threshold && !ComputeFullInlineCost)
1097return InlineResult::failure("high cost");
1098
1099return InlineResult::success();
1100}
1101
1102public:
1103InlineCostCallAnalyzer(
1104Function &Callee, CallBase &Call, const InlineParams &Params,
1105const TargetTransformInfo &TTI,
1106function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
1107function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
1108ProfileSummaryInfo *PSI = nullptr,
1109OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
1110bool IgnoreThreshold = false)
1111: CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI, ORE),
1112ComputeFullInlineCost(OptComputeFullInlineCost ||
1113Params.ComputeFullInlineCost || ORE ||
1114isCostBenefitAnalysisEnabled()),
1115Params(Params), Threshold(Params.DefaultThreshold),
1116BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
1117CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
1118Writer(this) {
1119AllowRecursiveCall = *Params.AllowRecursiveCall;
1120}
1121
1122/// Annotation Writer for instruction details
1123InlineCostAnnotationWriter Writer;
1124
1125void dump();
1126
1127// Prints the same analysis as dump(), but its definition is not dependent
1128// on the build.
1129void print(raw_ostream &OS);
1130
1131std::optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
1132if (InstructionCostDetailMap.contains(I))
1133return InstructionCostDetailMap[I];
1134return std::nullopt;
1135}
1136
1137virtual ~InlineCostCallAnalyzer() = default;
1138int getThreshold() const { return Threshold; }
1139int getCost() const { return Cost; }
1140int getStaticBonusApplied() const { return StaticBonusApplied; }
1141std::optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; }
1142bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; }
1143bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; }
1144};
1145
1146// Return true if CB is the sole call to local function Callee.
1147static bool isSoleCallToLocalFunction(const CallBase &CB,
1148const Function &Callee) {
1149return Callee.hasLocalLinkage() && Callee.hasOneLiveUse() &&
1150&Callee == CB.getCalledFunction();
1151}
1152
1153class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
1154private:
1155InlineCostFeatures Cost = {};
1156
1157// FIXME: These constants are taken from the heuristic-based cost visitor.
1158// These should be removed entirely in a later revision to avoid reliance on
1159// heuristics in the ML inliner.
1160static constexpr int JTCostMultiplier = 2;
1161static constexpr int CaseClusterCostMultiplier = 2;
1162static constexpr int SwitchDefaultDestCostMultiplier = 2;
1163static constexpr int SwitchCostMultiplier = 2;
1164
1165// FIXME: These are taken from the heuristic-based cost visitor: we should
1166// eventually abstract these to the CallAnalyzer to avoid duplication.
1167unsigned SROACostSavingOpportunities = 0;
1168int VectorBonus = 0;
1169int SingleBBBonus = 0;
1170int Threshold = 5;
1171
1172DenseMap<AllocaInst *, unsigned> SROACosts;
1173
1174void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) {
1175Cost[static_cast<size_t>(Feature)] += Delta;
1176}
1177
1178void set(InlineCostFeatureIndex Feature, int64_t Value) {
1179Cost[static_cast<size_t>(Feature)] = Value;
1180}
1181
1182void onDisableSROA(AllocaInst *Arg) override {
1183auto CostIt = SROACosts.find(Arg);
1184if (CostIt == SROACosts.end())
1185return;
1186
1187increment(InlineCostFeatureIndex::sroa_losses, CostIt->second);
1188SROACostSavingOpportunities -= CostIt->second;
1189SROACosts.erase(CostIt);
1190}
1191
1192void onDisableLoadElimination() override {
1193set(InlineCostFeatureIndex::load_elimination, 1);
1194}
1195
1196void onCallPenalty() override {
1197increment(InlineCostFeatureIndex::call_penalty, CallPenalty);
1198}
1199
1200void onCallArgumentSetup(const CallBase &Call) override {
1201increment(InlineCostFeatureIndex::call_argument_setup,
1202Call.arg_size() * InstrCost);
1203}
1204
1205void onLoadRelativeIntrinsic() override {
1206increment(InlineCostFeatureIndex::load_relative_intrinsic, 3 * InstrCost);
1207}
1208
1209void onLoweredCall(Function *F, CallBase &Call,
1210bool IsIndirectCall) override {
1211increment(InlineCostFeatureIndex::lowered_call_arg_setup,
1212Call.arg_size() * InstrCost);
1213
1214if (IsIndirectCall) {
1215InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0,
1216/*HintThreshold*/ {},
1217/*ColdThreshold*/ {},
1218/*OptSizeThreshold*/ {},
1219/*OptMinSizeThreshold*/ {},
1220/*HotCallSiteThreshold*/ {},
1221/*LocallyHotCallSiteThreshold*/ {},
1222/*ColdCallSiteThreshold*/ {},
1223/*ComputeFullInlineCost*/ true,
1224/*EnableDeferral*/ true};
1225IndirectCallParams.DefaultThreshold =
1226InlineConstants::IndirectCallThreshold;
1227
1228InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
1229GetAssumptionCache, GetBFI, PSI, ORE, false,
1230true);
1231if (CA.analyze().isSuccess()) {
1232increment(InlineCostFeatureIndex::nested_inline_cost_estimate,
1233CA.getCost());
1234increment(InlineCostFeatureIndex::nested_inlines, 1);
1235}
1236} else {
1237onCallPenalty();
1238}
1239}
1240
1241void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster,
1242bool DefaultDestUndefined) override {
1243if (JumpTableSize) {
1244if (!DefaultDestUndefined)
1245increment(InlineCostFeatureIndex::switch_default_dest_penalty,
1246SwitchDefaultDestCostMultiplier * InstrCost);
1247int64_t JTCost = static_cast<int64_t>(JumpTableSize) * InstrCost +
1248JTCostMultiplier * InstrCost;
1249increment(InlineCostFeatureIndex::jump_table_penalty, JTCost);
1250return;
1251}
1252
1253if (NumCaseCluster <= 3) {
1254increment(InlineCostFeatureIndex::case_cluster_penalty,
1255(NumCaseCluster - DefaultDestUndefined) *
1256CaseClusterCostMultiplier * InstrCost);
1257return;
1258}
1259
1260int64_t ExpectedNumberOfCompare =
1261getExpectedNumberOfCompare(NumCaseCluster);
1262
1263int64_t SwitchCost =
1264ExpectedNumberOfCompare * SwitchCostMultiplier * InstrCost;
1265increment(InlineCostFeatureIndex::switch_penalty, SwitchCost);
1266}
1267
1268void onMissedSimplification() override {
1269increment(InlineCostFeatureIndex::unsimplified_common_instructions,
1270InstrCost);
1271}
1272
1273void onInitializeSROAArg(AllocaInst *Arg) override {
1274auto SROAArgCost = TTI.getCallerAllocaCost(&CandidateCall, Arg);
1275SROACosts[Arg] = SROAArgCost;
1276SROACostSavingOpportunities += SROAArgCost;
1277}
1278
1279void onAggregateSROAUse(AllocaInst *Arg) override {
1280SROACosts.find(Arg)->second += InstrCost;
1281SROACostSavingOpportunities += InstrCost;
1282}
1283
1284void onBlockAnalyzed(const BasicBlock *BB) override {
1285if (BB->getTerminator()->getNumSuccessors() > 1)
1286set(InlineCostFeatureIndex::is_multiple_blocks, 1);
1287Threshold -= SingleBBBonus;
1288}
1289
1290InlineResult finalizeAnalysis() override {
1291auto *Caller = CandidateCall.getFunction();
1292if (Caller->hasMinSize()) {
1293DominatorTree DT(F);
1294LoopInfo LI(DT);
1295for (Loop *L : LI) {
1296// Ignore loops that will not be executed
1297if (DeadBlocks.count(L->getHeader()))
1298continue;
1299increment(InlineCostFeatureIndex::num_loops,
1300InlineConstants::LoopPenalty);
1301}
1302}
1303set(InlineCostFeatureIndex::dead_blocks, DeadBlocks.size());
1304set(InlineCostFeatureIndex::simplified_instructions,
1305NumInstructionsSimplified);
1306set(InlineCostFeatureIndex::constant_args, NumConstantArgs);
1307set(InlineCostFeatureIndex::constant_offset_ptr_args,
1308NumConstantOffsetPtrArgs);
1309set(InlineCostFeatureIndex::sroa_savings, SROACostSavingOpportunities);
1310
1311if (NumVectorInstructions <= NumInstructions / 10)
1312Threshold -= VectorBonus;
1313else if (NumVectorInstructions <= NumInstructions / 2)
1314Threshold -= VectorBonus / 2;
1315
1316set(InlineCostFeatureIndex::threshold, Threshold);
1317
1318return InlineResult::success();
1319}
1320
1321bool shouldStop() override { return false; }
1322
1323void onLoadEliminationOpportunity() override {
1324increment(InlineCostFeatureIndex::load_elimination, 1);
1325}
1326
1327InlineResult onAnalysisStart() override {
1328increment(InlineCostFeatureIndex::callsite_cost,
1329-1 * getCallsiteCost(TTI, this->CandidateCall, DL));
1330
1331set(InlineCostFeatureIndex::cold_cc_penalty,
1332(F.getCallingConv() == CallingConv::Cold));
1333
1334set(InlineCostFeatureIndex::last_call_to_static_bonus,
1335isSoleCallToLocalFunction(CandidateCall, F));
1336
1337// FIXME: we shouldn't repeat this logic in both the Features and Cost
1338// analyzer - instead, we should abstract it to a common method in the
1339// CallAnalyzer
1340int SingleBBBonusPercent = 50;
1341int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1342Threshold += TTI.adjustInliningThreshold(&CandidateCall);
1343Threshold *= TTI.getInliningThresholdMultiplier();
1344SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
1345VectorBonus = Threshold * VectorBonusPercent / 100;
1346Threshold += (SingleBBBonus + VectorBonus);
1347
1348return InlineResult::success();
1349}
1350
1351public:
1352InlineCostFeaturesAnalyzer(
1353const TargetTransformInfo &TTI,
1354function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
1355function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
1356ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
1357CallBase &Call)
1358: CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI) {}
1359
1360const InlineCostFeatures &features() const { return Cost; }
1361};
1362
1363} // namespace
1364
1365/// Test whether the given value is an Alloca-derived function argument.
1366bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
1367return SROAArgValues.count(V);
1368}
1369
1370void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
1371onDisableSROA(SROAArg);
1372EnabledSROAAllocas.erase(SROAArg);
1373disableLoadElimination();
1374}
1375
1376void InlineCostAnnotationWriter::emitInstructionAnnot(
1377const Instruction *I, formatted_raw_ostream &OS) {
1378// The cost of inlining of the given instruction is printed always.
1379// The threshold delta is printed only when it is non-zero. It happens
1380// when we decided to give a bonus at a particular instruction.
1381std::optional<InstructionCostDetail> Record = ICCA->getCostDetails(I);
1382if (!Record)
1383OS << "; No analysis for the instruction";
1384else {
1385OS << "; cost before = " << Record->CostBefore
1386<< ", cost after = " << Record->CostAfter
1387<< ", threshold before = " << Record->ThresholdBefore
1388<< ", threshold after = " << Record->ThresholdAfter << ", ";
1389OS << "cost delta = " << Record->getCostDelta();
1390if (Record->hasThresholdChanged())
1391OS << ", threshold delta = " << Record->getThresholdDelta();
1392}
1393auto C = ICCA->getSimplifiedValue(const_cast<Instruction *>(I));
1394if (C) {
1395OS << ", simplified to ";
1396(*C)->print(OS, true);
1397}
1398OS << "\n";
1399}
1400
1401/// If 'V' maps to a SROA candidate, disable SROA for it.
1402void CallAnalyzer::disableSROA(Value *V) {
1403if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
1404disableSROAForArg(SROAArg);
1405}
1406}
1407
1408void CallAnalyzer::disableLoadElimination() {
1409if (EnableLoadElimination) {
1410onDisableLoadElimination();
1411EnableLoadElimination = false;
1412}
1413}
1414
1415/// Accumulate a constant GEP offset into an APInt if possible.
1416///
1417/// Returns false if unable to compute the offset for any reason. Respects any
1418/// simplified values known during the analysis of this callsite.
1419bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
1420unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
1421assert(IntPtrWidth == Offset.getBitWidth());
1422
1423for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
1424GTI != GTE; ++GTI) {
1425ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
1426if (!OpC)
1427if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
1428OpC = dyn_cast<ConstantInt>(SimpleOp);
1429if (!OpC)
1430return false;
1431if (OpC->isZero())
1432continue;
1433
1434// Handle a struct index, which adds its field offset to the pointer.
1435if (StructType *STy = GTI.getStructTypeOrNull()) {
1436unsigned ElementIdx = OpC->getZExtValue();
1437const StructLayout *SL = DL.getStructLayout(STy);
1438Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
1439continue;
1440}
1441
1442APInt TypeSize(IntPtrWidth, GTI.getSequentialElementStride(DL));
1443Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
1444}
1445return true;
1446}
1447
1448/// Use TTI to check whether a GEP is free.
1449///
1450/// Respects any simplified values known during the analysis of this callsite.
1451bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
1452SmallVector<Value *, 4> Operands;
1453Operands.push_back(GEP.getOperand(0));
1454for (const Use &Op : GEP.indices())
1455if (Constant *SimpleOp = SimplifiedValues.lookup(Op))
1456Operands.push_back(SimpleOp);
1457else
1458Operands.push_back(Op);
1459return TTI.getInstructionCost(&GEP, Operands,
1460TargetTransformInfo::TCK_SizeAndLatency) ==
1461TargetTransformInfo::TCC_Free;
1462}
1463
1464bool CallAnalyzer::visitAlloca(AllocaInst &I) {
1465disableSROA(I.getOperand(0));
1466
1467// Check whether inlining will turn a dynamic alloca into a static
1468// alloca and handle that case.
1469if (I.isArrayAllocation()) {
1470Constant *Size = SimplifiedValues.lookup(I.getArraySize());
1471if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
1472// Sometimes a dynamic alloca could be converted into a static alloca
1473// after this constant prop, and become a huge static alloca on an
1474// unconditional CFG path. Avoid inlining if this is going to happen above
1475// a threshold.
1476// FIXME: If the threshold is removed or lowered too much, we could end up
1477// being too pessimistic and prevent inlining non-problematic code. This
1478// could result in unintended perf regressions. A better overall strategy
1479// is needed to track stack usage during inlining.
1480Type *Ty = I.getAllocatedType();
1481AllocatedSize = SaturatingMultiplyAdd(
1482AllocSize->getLimitedValue(),
1483DL.getTypeAllocSize(Ty).getKnownMinValue(), AllocatedSize);
1484if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline)
1485HasDynamicAlloca = true;
1486return false;
1487}
1488}
1489
1490// Accumulate the allocated size.
1491if (I.isStaticAlloca()) {
1492Type *Ty = I.getAllocatedType();
1493AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty).getKnownMinValue(),
1494AllocatedSize);
1495}
1496
1497// FIXME: This is overly conservative. Dynamic allocas are inefficient for
1498// a variety of reasons, and so we would like to not inline them into
1499// functions which don't currently have a dynamic alloca. This simply
1500// disables inlining altogether in the presence of a dynamic alloca.
1501if (!I.isStaticAlloca())
1502HasDynamicAlloca = true;
1503
1504return false;
1505}
1506
1507bool CallAnalyzer::visitPHI(PHINode &I) {
1508// FIXME: We need to propagate SROA *disabling* through phi nodes, even
1509// though we don't want to propagate it's bonuses. The idea is to disable
1510// SROA if it *might* be used in an inappropriate manner.
1511
1512// Phi nodes are always zero-cost.
1513// FIXME: Pointer sizes may differ between different address spaces, so do we
1514// need to use correct address space in the call to getPointerSizeInBits here?
1515// Or could we skip the getPointerSizeInBits call completely? As far as I can
1516// see the ZeroOffset is used as a dummy value, so we can probably use any
1517// bit width for the ZeroOffset?
1518APInt ZeroOffset = APInt::getZero(DL.getPointerSizeInBits(0));
1519bool CheckSROA = I.getType()->isPointerTy();
1520
1521// Track the constant or pointer with constant offset we've seen so far.
1522Constant *FirstC = nullptr;
1523std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
1524Value *FirstV = nullptr;
1525
1526for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
1527BasicBlock *Pred = I.getIncomingBlock(i);
1528// If the incoming block is dead, skip the incoming block.
1529if (DeadBlocks.count(Pred))
1530continue;
1531// If the parent block of phi is not the known successor of the incoming
1532// block, skip the incoming block.
1533BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
1534if (KnownSuccessor && KnownSuccessor != I.getParent())
1535continue;
1536
1537Value *V = I.getIncomingValue(i);
1538// If the incoming value is this phi itself, skip the incoming value.
1539if (&I == V)
1540continue;
1541
1542Constant *C = dyn_cast<Constant>(V);
1543if (!C)
1544C = SimplifiedValues.lookup(V);
1545
1546std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
1547if (!C && CheckSROA)
1548BaseAndOffset = ConstantOffsetPtrs.lookup(V);
1549
1550if (!C && !BaseAndOffset.first)
1551// The incoming value is neither a constant nor a pointer with constant
1552// offset, exit early.
1553return true;
1554
1555if (FirstC) {
1556if (FirstC == C)
1557// If we've seen a constant incoming value before and it is the same
1558// constant we see this time, continue checking the next incoming value.
1559continue;
1560// Otherwise early exit because we either see a different constant or saw
1561// a constant before but we have a pointer with constant offset this time.
1562return true;
1563}
1564
1565if (FirstV) {
1566// The same logic as above, but check pointer with constant offset here.
1567if (FirstBaseAndOffset == BaseAndOffset)
1568continue;
1569return true;
1570}
1571
1572if (C) {
1573// This is the 1st time we've seen a constant, record it.
1574FirstC = C;
1575continue;
1576}
1577
1578// The remaining case is that this is the 1st time we've seen a pointer with
1579// constant offset, record it.
1580FirstV = V;
1581FirstBaseAndOffset = BaseAndOffset;
1582}
1583
1584// Check if we can map phi to a constant.
1585if (FirstC) {
1586SimplifiedValues[&I] = FirstC;
1587return true;
1588}
1589
1590// Check if we can map phi to a pointer with constant offset.
1591if (FirstBaseAndOffset.first) {
1592ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
1593
1594if (auto *SROAArg = getSROAArgForValueOrNull(FirstV))
1595SROAArgValues[&I] = SROAArg;
1596}
1597
1598return true;
1599}
1600
1601/// Check we can fold GEPs of constant-offset call site argument pointers.
1602/// This requires target data and inbounds GEPs.
1603///
1604/// \return true if the specified GEP can be folded.
1605bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
1606// Check if we have a base + offset for the pointer.
1607std::pair<Value *, APInt> BaseAndOffset =
1608ConstantOffsetPtrs.lookup(I.getPointerOperand());
1609if (!BaseAndOffset.first)
1610return false;
1611
1612// Check if the offset of this GEP is constant, and if so accumulate it
1613// into Offset.
1614if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
1615return false;
1616
1617// Add the result as a new mapping to Base + Offset.
1618ConstantOffsetPtrs[&I] = BaseAndOffset;
1619
1620return true;
1621}
1622
1623bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
1624auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand());
1625
1626// Lambda to check whether a GEP's indices are all constant.
1627auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
1628for (const Use &Op : GEP.indices())
1629if (!isa<Constant>(Op) && !SimplifiedValues.lookup(Op))
1630return false;
1631return true;
1632};
1633
1634if (!DisableGEPConstOperand)
1635if (simplifyInstruction(I))
1636return true;
1637
1638if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
1639if (SROAArg)
1640SROAArgValues[&I] = SROAArg;
1641
1642// Constant GEPs are modeled as free.
1643return true;
1644}
1645
1646// Variable GEPs will require math and will disable SROA.
1647if (SROAArg)
1648disableSROAForArg(SROAArg);
1649return isGEPFree(I);
1650}
1651
1652/// Simplify \p I if its operands are constants and update SimplifiedValues.
1653bool CallAnalyzer::simplifyInstruction(Instruction &I) {
1654SmallVector<Constant *> COps;
1655for (Value *Op : I.operands()) {
1656Constant *COp = dyn_cast<Constant>(Op);
1657if (!COp)
1658COp = SimplifiedValues.lookup(Op);
1659if (!COp)
1660return false;
1661COps.push_back(COp);
1662}
1663auto *C = ConstantFoldInstOperands(&I, COps, DL);
1664if (!C)
1665return false;
1666SimplifiedValues[&I] = C;
1667return true;
1668}
1669
1670/// Try to simplify a call to llvm.is.constant.
1671///
1672/// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since
1673/// we expect calls of this specific intrinsic to be infrequent.
1674///
1675/// FIXME: Given that we know CB's parent (F) caller
1676/// (CandidateCall->getParent()->getParent()), we might be able to determine
1677/// whether inlining F into F's caller would change how the call to
1678/// llvm.is.constant would evaluate.
1679bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) {
1680Value *Arg = CB.getArgOperand(0);
1681auto *C = dyn_cast<Constant>(Arg);
1682
1683if (!C)
1684C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(Arg));
1685
1686Type *RT = CB.getFunctionType()->getReturnType();
1687SimplifiedValues[&CB] = ConstantInt::get(RT, C ? 1 : 0);
1688return true;
1689}
1690
1691bool CallAnalyzer::simplifyIntrinsicCallObjectSize(CallBase &CB) {
1692// As per the langref, "The fourth argument to llvm.objectsize determines if
1693// the value should be evaluated at runtime."
1694if (cast<ConstantInt>(CB.getArgOperand(3))->isOne())
1695return false;
1696
1697Value *V = lowerObjectSizeCall(&cast<IntrinsicInst>(CB), DL, nullptr,
1698/*MustSucceed=*/true);
1699Constant *C = dyn_cast_or_null<Constant>(V);
1700if (C)
1701SimplifiedValues[&CB] = C;
1702return C;
1703}
1704
1705bool CallAnalyzer::visitBitCast(BitCastInst &I) {
1706// Propagate constants through bitcasts.
1707if (simplifyInstruction(I))
1708return true;
1709
1710// Track base/offsets through casts
1711std::pair<Value *, APInt> BaseAndOffset =
1712ConstantOffsetPtrs.lookup(I.getOperand(0));
1713// Casts don't change the offset, just wrap it up.
1714if (BaseAndOffset.first)
1715ConstantOffsetPtrs[&I] = BaseAndOffset;
1716
1717// Also look for SROA candidates here.
1718if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1719SROAArgValues[&I] = SROAArg;
1720
1721// Bitcasts are always zero cost.
1722return true;
1723}
1724
1725bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
1726// Propagate constants through ptrtoint.
1727if (simplifyInstruction(I))
1728return true;
1729
1730// Track base/offset pairs when converted to a plain integer provided the
1731// integer is large enough to represent the pointer.
1732unsigned IntegerSize = I.getType()->getScalarSizeInBits();
1733unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
1734if (IntegerSize == DL.getPointerSizeInBits(AS)) {
1735std::pair<Value *, APInt> BaseAndOffset =
1736ConstantOffsetPtrs.lookup(I.getOperand(0));
1737if (BaseAndOffset.first)
1738ConstantOffsetPtrs[&I] = BaseAndOffset;
1739}
1740
1741// This is really weird. Technically, ptrtoint will disable SROA. However,
1742// unless that ptrtoint is *used* somewhere in the live basic blocks after
1743// inlining, it will be nuked, and SROA should proceed. All of the uses which
1744// would block SROA would also block SROA if applied directly to a pointer,
1745// and so we can just add the integer in here. The only places where SROA is
1746// preserved either cannot fire on an integer, or won't in-and-of themselves
1747// disable SROA (ext) w/o some later use that we would see and disable.
1748if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1749SROAArgValues[&I] = SROAArg;
1750
1751return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
1752TargetTransformInfo::TCC_Free;
1753}
1754
1755bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
1756// Propagate constants through ptrtoint.
1757if (simplifyInstruction(I))
1758return true;
1759
1760// Track base/offset pairs when round-tripped through a pointer without
1761// modifications provided the integer is not too large.
1762Value *Op = I.getOperand(0);
1763unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
1764if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
1765std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
1766if (BaseAndOffset.first)
1767ConstantOffsetPtrs[&I] = BaseAndOffset;
1768}
1769
1770// "Propagate" SROA here in the same manner as we do for ptrtoint above.
1771if (auto *SROAArg = getSROAArgForValueOrNull(Op))
1772SROAArgValues[&I] = SROAArg;
1773
1774return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
1775TargetTransformInfo::TCC_Free;
1776}
1777
1778bool CallAnalyzer::visitCastInst(CastInst &I) {
1779// Propagate constants through casts.
1780if (simplifyInstruction(I))
1781return true;
1782
1783// Disable SROA in the face of arbitrary casts we don't explicitly list
1784// elsewhere.
1785disableSROA(I.getOperand(0));
1786
1787// If this is a floating-point cast, and the target says this operation
1788// is expensive, this may eventually become a library call. Treat the cost
1789// as such.
1790switch (I.getOpcode()) {
1791case Instruction::FPTrunc:
1792case Instruction::FPExt:
1793case Instruction::UIToFP:
1794case Instruction::SIToFP:
1795case Instruction::FPToUI:
1796case Instruction::FPToSI:
1797if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
1798onCallPenalty();
1799break;
1800default:
1801break;
1802}
1803
1804return TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
1805TargetTransformInfo::TCC_Free;
1806}
1807
1808bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
1809return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
1810}
1811
1812bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
1813// Does the *call site* have the NonNull attribute set on an argument? We
1814// use the attribute on the call site to memoize any analysis done in the
1815// caller. This will also trip if the callee function has a non-null
1816// parameter attribute, but that's a less interesting case because hopefully
1817// the callee would already have been simplified based on that.
1818if (Argument *A = dyn_cast<Argument>(V))
1819if (paramHasAttr(A, Attribute::NonNull))
1820return true;
1821
1822// Is this an alloca in the caller? This is distinct from the attribute case
1823// above because attributes aren't updated within the inliner itself and we
1824// always want to catch the alloca derived case.
1825if (isAllocaDerivedArg(V))
1826// We can actually predict the result of comparisons between an
1827// alloca-derived value and null. Note that this fires regardless of
1828// SROA firing.
1829return true;
1830
1831return false;
1832}
1833
1834bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
1835// If the normal destination of the invoke or the parent block of the call
1836// site is unreachable-terminated, there is little point in inlining this
1837// unless there is literally zero cost.
1838// FIXME: Note that it is possible that an unreachable-terminated block has a
1839// hot entry. For example, in below scenario inlining hot_call_X() may be
1840// beneficial :
1841// main() {
1842// hot_call_1();
1843// ...
1844// hot_call_N()
1845// exit(0);
1846// }
1847// For now, we are not handling this corner case here as it is rare in real
1848// code. In future, we should elaborate this based on BPI and BFI in more
1849// general threshold adjusting heuristics in updateThreshold().
1850if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
1851if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
1852return false;
1853} else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
1854return false;
1855
1856return true;
1857}
1858
1859bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
1860BlockFrequencyInfo *CallerBFI) {
1861// If global profile summary is available, then callsite's coldness is
1862// determined based on that.
1863if (PSI && PSI->hasProfileSummary())
1864return PSI->isColdCallSite(Call, CallerBFI);
1865
1866// Otherwise we need BFI to be available.
1867if (!CallerBFI)
1868return false;
1869
1870// Determine if the callsite is cold relative to caller's entry. We could
1871// potentially cache the computation of scaled entry frequency, but the added
1872// complexity is not worth it unless this scaling shows up high in the
1873// profiles.
1874const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
1875auto CallSiteBB = Call.getParent();
1876auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
1877auto CallerEntryFreq =
1878CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
1879return CallSiteFreq < CallerEntryFreq * ColdProb;
1880}
1881
1882std::optional<int>
1883InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
1884BlockFrequencyInfo *CallerBFI) {
1885
1886// If global profile summary is available, then callsite's hotness is
1887// determined based on that.
1888if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(Call, CallerBFI))
1889return Params.HotCallSiteThreshold;
1890
1891// Otherwise we need BFI to be available and to have a locally hot callsite
1892// threshold.
1893if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
1894return std::nullopt;
1895
1896// Determine if the callsite is hot relative to caller's entry. We could
1897// potentially cache the computation of scaled entry frequency, but the added
1898// complexity is not worth it unless this scaling shows up high in the
1899// profiles.
1900const BasicBlock *CallSiteBB = Call.getParent();
1901BlockFrequency CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
1902BlockFrequency CallerEntryFreq = CallerBFI->getEntryFreq();
1903std::optional<BlockFrequency> Limit = CallerEntryFreq.mul(HotCallSiteRelFreq);
1904if (Limit && CallSiteFreq >= *Limit)
1905return Params.LocallyHotCallSiteThreshold;
1906
1907// Otherwise treat it normally.
1908return std::nullopt;
1909}
1910
1911void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
1912// If no size growth is allowed for this inlining, set Threshold to 0.
1913if (!allowSizeGrowth(Call)) {
1914Threshold = 0;
1915return;
1916}
1917
1918Function *Caller = Call.getCaller();
1919
1920// return min(A, B) if B is valid.
1921auto MinIfValid = [](int A, std::optional<int> B) {
1922return B ? std::min(A, *B) : A;
1923};
1924
1925// return max(A, B) if B is valid.
1926auto MaxIfValid = [](int A, std::optional<int> B) {
1927return B ? std::max(A, *B) : A;
1928};
1929
1930// Various bonus percentages. These are multiplied by Threshold to get the
1931// bonus values.
1932// SingleBBBonus: This bonus is applied if the callee has a single reachable
1933// basic block at the given callsite context. This is speculatively applied
1934// and withdrawn if more than one basic block is seen.
1935//
1936// LstCallToStaticBonus: This large bonus is applied to ensure the inlining
1937// of the last call to a static function as inlining such functions is
1938// guaranteed to reduce code size.
1939//
1940// These bonus percentages may be set to 0 based on properties of the caller
1941// and the callsite.
1942int SingleBBBonusPercent = 50;
1943int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1944int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
1945
1946// Lambda to set all the above bonus and bonus percentages to 0.
1947auto DisallowAllBonuses = [&]() {
1948SingleBBBonusPercent = 0;
1949VectorBonusPercent = 0;
1950LastCallToStaticBonus = 0;
1951};
1952
1953// Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
1954// and reduce the threshold if the caller has the necessary attribute.
1955if (Caller->hasMinSize()) {
1956Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
1957// For minsize, we want to disable the single BB bonus and the vector
1958// bonuses, but not the last-call-to-static bonus. Inlining the last call to
1959// a static function will, at the minimum, eliminate the parameter setup and
1960// call/return instructions.
1961SingleBBBonusPercent = 0;
1962VectorBonusPercent = 0;
1963} else if (Caller->hasOptSize())
1964Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
1965
1966// Adjust the threshold based on inlinehint attribute and profile based
1967// hotness information if the caller does not have MinSize attribute.
1968if (!Caller->hasMinSize()) {
1969if (Callee.hasFnAttribute(Attribute::InlineHint))
1970Threshold = MaxIfValid(Threshold, Params.HintThreshold);
1971
1972// FIXME: After switching to the new passmanager, simplify the logic below
1973// by checking only the callsite hotness/coldness as we will reliably
1974// have local profile information.
1975//
1976// Callsite hotness and coldness can be determined if sample profile is
1977// used (which adds hotness metadata to calls) or if caller's
1978// BlockFrequencyInfo is available.
1979BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr;
1980auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
1981if (!Caller->hasOptSize() && HotCallSiteThreshold) {
1982LLVM_DEBUG(dbgs() << "Hot callsite.\n");
1983// FIXME: This should update the threshold only if it exceeds the
1984// current threshold, but AutoFDO + ThinLTO currently relies on this
1985// behavior to prevent inlining of hot callsites during ThinLTO
1986// compile phase.
1987Threshold = *HotCallSiteThreshold;
1988} else if (isColdCallSite(Call, CallerBFI)) {
1989LLVM_DEBUG(dbgs() << "Cold callsite.\n");
1990// Do not apply bonuses for a cold callsite including the
1991// LastCallToStatic bonus. While this bonus might result in code size
1992// reduction, it can cause the size of a non-cold caller to increase
1993// preventing it from being inlined.
1994DisallowAllBonuses();
1995Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
1996} else if (PSI) {
1997// Use callee's global profile information only if we have no way of
1998// determining this via callsite information.
1999if (PSI->isFunctionEntryHot(&Callee)) {
2000LLVM_DEBUG(dbgs() << "Hot callee.\n");
2001// If callsite hotness can not be determined, we may still know
2002// that the callee is hot and treat it as a weaker hint for threshold
2003// increase.
2004Threshold = MaxIfValid(Threshold, Params.HintThreshold);
2005} else if (PSI->isFunctionEntryCold(&Callee)) {
2006LLVM_DEBUG(dbgs() << "Cold callee.\n");
2007// Do not apply bonuses for a cold callee including the
2008// LastCallToStatic bonus. While this bonus might result in code size
2009// reduction, it can cause the size of a non-cold caller to increase
2010// preventing it from being inlined.
2011DisallowAllBonuses();
2012Threshold = MinIfValid(Threshold, Params.ColdThreshold);
2013}
2014}
2015}
2016
2017Threshold += TTI.adjustInliningThreshold(&Call);
2018
2019// Finally, take the target-specific inlining threshold multiplier into
2020// account.
2021Threshold *= TTI.getInliningThresholdMultiplier();
2022
2023SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
2024VectorBonus = Threshold * VectorBonusPercent / 100;
2025
2026// If there is only one call of the function, and it has internal linkage,
2027// the cost of inlining it drops dramatically. It may seem odd to update
2028// Cost in updateThreshold, but the bonus depends on the logic in this method.
2029if (isSoleCallToLocalFunction(Call, F)) {
2030Cost -= LastCallToStaticBonus;
2031StaticBonusApplied = LastCallToStaticBonus;
2032}
2033}
2034
2035bool CallAnalyzer::visitCmpInst(CmpInst &I) {
2036Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2037// First try to handle simplified comparisons.
2038if (simplifyInstruction(I))
2039return true;
2040
2041if (I.getOpcode() == Instruction::FCmp)
2042return false;
2043
2044// Otherwise look for a comparison between constant offset pointers with
2045// a common base.
2046Value *LHSBase, *RHSBase;
2047APInt LHSOffset, RHSOffset;
2048std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
2049if (LHSBase) {
2050std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
2051if (RHSBase && LHSBase == RHSBase) {
2052// We have common bases, fold the icmp to a constant based on the
2053// offsets.
2054SimplifiedValues[&I] = ConstantInt::getBool(
2055I.getType(),
2056ICmpInst::compare(LHSOffset, RHSOffset, I.getPredicate()));
2057++NumConstantPtrCmps;
2058return true;
2059}
2060}
2061
2062auto isImplicitNullCheckCmp = [](const CmpInst &I) {
2063for (auto *User : I.users())
2064if (auto *Instr = dyn_cast<Instruction>(User))
2065if (!Instr->getMetadata(LLVMContext::MD_make_implicit))
2066return false;
2067return true;
2068};
2069
2070// If the comparison is an equality comparison with null, we can simplify it
2071// if we know the value (argument) can't be null
2072if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1))) {
2073if (isKnownNonNullInCallee(I.getOperand(0))) {
2074bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
2075SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
2076: ConstantInt::getFalse(I.getType());
2077return true;
2078}
2079// Implicit null checks act as unconditional branches and their comparisons
2080// should be treated as simplified and free of cost.
2081if (isImplicitNullCheckCmp(I))
2082return true;
2083}
2084return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
2085}
2086
2087bool CallAnalyzer::visitSub(BinaryOperator &I) {
2088// Try to handle a special case: we can fold computing the difference of two
2089// constant-related pointers.
2090Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2091Value *LHSBase, *RHSBase;
2092APInt LHSOffset, RHSOffset;
2093std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
2094if (LHSBase) {
2095std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
2096if (RHSBase && LHSBase == RHSBase) {
2097// We have common bases, fold the subtract to a constant based on the
2098// offsets.
2099Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
2100Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
2101if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
2102SimplifiedValues[&I] = C;
2103++NumConstantPtrDiffs;
2104return true;
2105}
2106}
2107}
2108
2109// Otherwise, fall back to the generic logic for simplifying and handling
2110// instructions.
2111return Base::visitSub(I);
2112}
2113
2114bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
2115Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
2116Constant *CLHS = dyn_cast<Constant>(LHS);
2117if (!CLHS)
2118CLHS = SimplifiedValues.lookup(LHS);
2119Constant *CRHS = dyn_cast<Constant>(RHS);
2120if (!CRHS)
2121CRHS = SimplifiedValues.lookup(RHS);
2122
2123Value *SimpleV = nullptr;
2124if (auto FI = dyn_cast<FPMathOperator>(&I))
2125SimpleV = simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS,
2126FI->getFastMathFlags(), DL);
2127else
2128SimpleV =
2129simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
2130
2131if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
2132SimplifiedValues[&I] = C;
2133
2134if (SimpleV)
2135return true;
2136
2137// Disable any SROA on arguments to arbitrary, unsimplified binary operators.
2138disableSROA(LHS);
2139disableSROA(RHS);
2140
2141// If the instruction is floating point, and the target says this operation
2142// is expensive, this may eventually become a library call. Treat the cost
2143// as such. Unless it's fneg which can be implemented with an xor.
2144using namespace llvm::PatternMatch;
2145if (I.getType()->isFloatingPointTy() &&
2146TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
2147!match(&I, m_FNeg(m_Value())))
2148onCallPenalty();
2149
2150return false;
2151}
2152
2153bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
2154Value *Op = I.getOperand(0);
2155Constant *COp = dyn_cast<Constant>(Op);
2156if (!COp)
2157COp = SimplifiedValues.lookup(Op);
2158
2159Value *SimpleV = simplifyFNegInst(
2160COp ? COp : Op, cast<FPMathOperator>(I).getFastMathFlags(), DL);
2161
2162if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
2163SimplifiedValues[&I] = C;
2164
2165if (SimpleV)
2166return true;
2167
2168// Disable any SROA on arguments to arbitrary, unsimplified fneg.
2169disableSROA(Op);
2170
2171return false;
2172}
2173
2174bool CallAnalyzer::visitLoad(LoadInst &I) {
2175if (handleSROA(I.getPointerOperand(), I.isSimple()))
2176return true;
2177
2178// If the data is already loaded from this address and hasn't been clobbered
2179// by any stores or calls, this load is likely to be redundant and can be
2180// eliminated.
2181if (EnableLoadElimination &&
2182!LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
2183onLoadEliminationOpportunity();
2184return true;
2185}
2186
2187onMemAccess();
2188return false;
2189}
2190
2191bool CallAnalyzer::visitStore(StoreInst &I) {
2192if (handleSROA(I.getPointerOperand(), I.isSimple()))
2193return true;
2194
2195// The store can potentially clobber loads and prevent repeated loads from
2196// being eliminated.
2197// FIXME:
2198// 1. We can probably keep an initial set of eliminatable loads substracted
2199// from the cost even when we finally see a store. We just need to disable
2200// *further* accumulation of elimination savings.
2201// 2. We should probably at some point thread MemorySSA for the callee into
2202// this and then use that to actually compute *really* precise savings.
2203disableLoadElimination();
2204
2205onMemAccess();
2206return false;
2207}
2208
2209bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
2210// Constant folding for extract value is trivial.
2211if (simplifyInstruction(I))
2212return true;
2213
2214// SROA can't look through these, but they may be free.
2215return Base::visitExtractValue(I);
2216}
2217
2218bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
2219// Constant folding for insert value is trivial.
2220if (simplifyInstruction(I))
2221return true;
2222
2223// SROA can't look through these, but they may be free.
2224return Base::visitInsertValue(I);
2225}
2226
2227/// Try to simplify a call site.
2228///
2229/// Takes a concrete function and callsite and tries to actually simplify it by
2230/// analyzing the arguments and call itself with instsimplify. Returns true if
2231/// it has simplified the callsite to some other entity (a constant), making it
2232/// free.
2233bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
2234// FIXME: Using the instsimplify logic directly for this is inefficient
2235// because we have to continually rebuild the argument list even when no
2236// simplifications can be performed. Until that is fixed with remapping
2237// inside of instsimplify, directly constant fold calls here.
2238if (!canConstantFoldCallTo(&Call, F))
2239return false;
2240
2241// Try to re-map the arguments to constants.
2242SmallVector<Constant *, 4> ConstantArgs;
2243ConstantArgs.reserve(Call.arg_size());
2244for (Value *I : Call.args()) {
2245Constant *C = dyn_cast<Constant>(I);
2246if (!C)
2247C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(I));
2248if (!C)
2249return false; // This argument doesn't map to a constant.
2250
2251ConstantArgs.push_back(C);
2252}
2253if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
2254SimplifiedValues[&Call] = C;
2255return true;
2256}
2257
2258return false;
2259}
2260
2261bool CallAnalyzer::visitCallBase(CallBase &Call) {
2262if (!onCallBaseVisitStart(Call))
2263return true;
2264
2265if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
2266!F.hasFnAttribute(Attribute::ReturnsTwice)) {
2267// This aborts the entire analysis.
2268ExposesReturnsTwice = true;
2269return false;
2270}
2271if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
2272ContainsNoDuplicateCall = true;
2273
2274Function *F = Call.getCalledFunction();
2275bool IsIndirectCall = !F;
2276if (IsIndirectCall) {
2277// Check if this happens to be an indirect function call to a known function
2278// in this inline context. If not, we've done all we can.
2279Value *Callee = Call.getCalledOperand();
2280F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
2281if (!F || F->getFunctionType() != Call.getFunctionType()) {
2282onCallArgumentSetup(Call);
2283
2284if (!Call.onlyReadsMemory())
2285disableLoadElimination();
2286return Base::visitCallBase(Call);
2287}
2288}
2289
2290assert(F && "Expected a call to a known function");
2291
2292// When we have a concrete function, first try to simplify it directly.
2293if (simplifyCallSite(F, Call))
2294return true;
2295
2296// Next check if it is an intrinsic we know about.
2297// FIXME: Lift this into part of the InstVisitor.
2298if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
2299switch (II->getIntrinsicID()) {
2300default:
2301if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
2302disableLoadElimination();
2303return Base::visitCallBase(Call);
2304
2305case Intrinsic::load_relative:
2306onLoadRelativeIntrinsic();
2307return false;
2308
2309case Intrinsic::memset:
2310case Intrinsic::memcpy:
2311case Intrinsic::memmove:
2312disableLoadElimination();
2313// SROA can usually chew through these intrinsics, but they aren't free.
2314return false;
2315case Intrinsic::icall_branch_funnel:
2316case Intrinsic::localescape:
2317HasUninlineableIntrinsic = true;
2318return false;
2319case Intrinsic::vastart:
2320InitsVargArgs = true;
2321return false;
2322case Intrinsic::launder_invariant_group:
2323case Intrinsic::strip_invariant_group:
2324if (auto *SROAArg = getSROAArgForValueOrNull(II->getOperand(0)))
2325SROAArgValues[II] = SROAArg;
2326return true;
2327case Intrinsic::is_constant:
2328return simplifyIntrinsicCallIsConstant(Call);
2329case Intrinsic::objectsize:
2330return simplifyIntrinsicCallObjectSize(Call);
2331}
2332}
2333
2334if (F == Call.getFunction()) {
2335// This flag will fully abort the analysis, so don't bother with anything
2336// else.
2337IsRecursiveCall = true;
2338if (!AllowRecursiveCall)
2339return false;
2340}
2341
2342if (TTI.isLoweredToCall(F)) {
2343onLoweredCall(F, Call, IsIndirectCall);
2344}
2345
2346if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
2347disableLoadElimination();
2348return Base::visitCallBase(Call);
2349}
2350
2351bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
2352// At least one return instruction will be free after inlining.
2353bool Free = !HasReturn;
2354HasReturn = true;
2355return Free;
2356}
2357
2358bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
2359// We model unconditional branches as essentially free -- they really
2360// shouldn't exist at all, but handling them makes the behavior of the
2361// inliner more regular and predictable. Interestingly, conditional branches
2362// which will fold away are also free.
2363return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
2364BI.getMetadata(LLVMContext::MD_make_implicit) ||
2365isa_and_nonnull<ConstantInt>(
2366SimplifiedValues.lookup(BI.getCondition()));
2367}
2368
2369bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
2370bool CheckSROA = SI.getType()->isPointerTy();
2371Value *TrueVal = SI.getTrueValue();
2372Value *FalseVal = SI.getFalseValue();
2373
2374Constant *TrueC = dyn_cast<Constant>(TrueVal);
2375if (!TrueC)
2376TrueC = SimplifiedValues.lookup(TrueVal);
2377Constant *FalseC = dyn_cast<Constant>(FalseVal);
2378if (!FalseC)
2379FalseC = SimplifiedValues.lookup(FalseVal);
2380Constant *CondC =
2381dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));
2382
2383if (!CondC) {
2384// Select C, X, X => X
2385if (TrueC == FalseC && TrueC) {
2386SimplifiedValues[&SI] = TrueC;
2387return true;
2388}
2389
2390if (!CheckSROA)
2391return Base::visitSelectInst(SI);
2392
2393std::pair<Value *, APInt> TrueBaseAndOffset =
2394ConstantOffsetPtrs.lookup(TrueVal);
2395std::pair<Value *, APInt> FalseBaseAndOffset =
2396ConstantOffsetPtrs.lookup(FalseVal);
2397if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
2398ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
2399
2400if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal))
2401SROAArgValues[&SI] = SROAArg;
2402return true;
2403}
2404
2405return Base::visitSelectInst(SI);
2406}
2407
2408// Select condition is a constant.
2409Value *SelectedV = CondC->isAllOnesValue() ? TrueVal
2410: (CondC->isNullValue()) ? FalseVal
2411: nullptr;
2412if (!SelectedV) {
2413// Condition is a vector constant that is not all 1s or all 0s. If all
2414// operands are constants, ConstantFoldSelectInstruction() can handle the
2415// cases such as select vectors.
2416if (TrueC && FalseC) {
2417if (auto *C = ConstantFoldSelectInstruction(CondC, TrueC, FalseC)) {
2418SimplifiedValues[&SI] = C;
2419return true;
2420}
2421}
2422return Base::visitSelectInst(SI);
2423}
2424
2425// Condition is either all 1s or all 0s. SI can be simplified.
2426if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
2427SimplifiedValues[&SI] = SelectedC;
2428return true;
2429}
2430
2431if (!CheckSROA)
2432return true;
2433
2434std::pair<Value *, APInt> BaseAndOffset =
2435ConstantOffsetPtrs.lookup(SelectedV);
2436if (BaseAndOffset.first) {
2437ConstantOffsetPtrs[&SI] = BaseAndOffset;
2438
2439if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV))
2440SROAArgValues[&SI] = SROAArg;
2441}
2442
2443return true;
2444}
2445
2446bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
2447// We model unconditional switches as free, see the comments on handling
2448// branches.
2449if (isa<ConstantInt>(SI.getCondition()))
2450return true;
2451if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
2452if (isa<ConstantInt>(V))
2453return true;
2454
2455// Assume the most general case where the switch is lowered into
2456// either a jump table, bit test, or a balanced binary tree consisting of
2457// case clusters without merging adjacent clusters with the same
2458// destination. We do not consider the switches that are lowered with a mix
2459// of jump table/bit test/binary search tree. The cost of the switch is
2460// proportional to the size of the tree or the size of jump table range.
2461//
2462// NB: We convert large switches which are just used to initialize large phi
2463// nodes to lookup tables instead in simplifycfg, so this shouldn't prevent
2464// inlining those. It will prevent inlining in cases where the optimization
2465// does not (yet) fire.
2466
2467unsigned JumpTableSize = 0;
2468BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr;
2469unsigned NumCaseCluster =
2470TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);
2471
2472onFinalizeSwitch(JumpTableSize, NumCaseCluster, SI.defaultDestUndefined());
2473return false;
2474}
2475
2476bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
2477// We never want to inline functions that contain an indirectbr. This is
2478// incorrect because all the blockaddress's (in static global initializers
2479// for example) would be referring to the original function, and this
2480// indirect jump would jump from the inlined copy of the function into the
2481// original function which is extremely undefined behavior.
2482// FIXME: This logic isn't really right; we can safely inline functions with
2483// indirectbr's as long as no other function or global references the
2484// blockaddress of a block within the current function.
2485HasIndirectBr = true;
2486return false;
2487}
2488
2489bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
2490// FIXME: It's not clear that a single instruction is an accurate model for
2491// the inline cost of a resume instruction.
2492return false;
2493}
2494
2495bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
2496// FIXME: It's not clear that a single instruction is an accurate model for
2497// the inline cost of a cleanupret instruction.
2498return false;
2499}
2500
2501bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
2502// FIXME: It's not clear that a single instruction is an accurate model for
2503// the inline cost of a catchret instruction.
2504return false;
2505}
2506
2507bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
2508// FIXME: It might be reasonably to discount the cost of instructions leading
2509// to unreachable as they have the lowest possible impact on both runtime and
2510// code size.
2511return true; // No actual code is needed for unreachable.
2512}
2513
2514bool CallAnalyzer::visitInstruction(Instruction &I) {
2515// Some instructions are free. All of the free intrinsics can also be
2516// handled by SROA, etc.
2517if (TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
2518TargetTransformInfo::TCC_Free)
2519return true;
2520
2521// We found something we don't understand or can't handle. Mark any SROA-able
2522// values in the operand list as no longer viable.
2523for (const Use &Op : I.operands())
2524disableSROA(Op);
2525
2526return false;
2527}
2528
2529/// Analyze a basic block for its contribution to the inline cost.
2530///
2531/// This method walks the analyzer over every instruction in the given basic
2532/// block and accounts for their cost during inlining at this callsite. It
2533/// aborts early if the threshold has been exceeded or an impossible to inline
2534/// construct has been detected. It returns false if inlining is no longer
2535/// viable, and true if inlining remains viable.
2536InlineResult
2537CallAnalyzer::analyzeBlock(BasicBlock *BB,
2538SmallPtrSetImpl<const Value *> &EphValues) {
2539for (Instruction &I : *BB) {
2540// FIXME: Currently, the number of instructions in a function regardless of
2541// our ability to simplify them during inline to constants or dead code,
2542// are actually used by the vector bonus heuristic. As long as that's true,
2543// we have to special case debug intrinsics here to prevent differences in
2544// inlining due to debug symbols. Eventually, the number of unsimplified
2545// instructions shouldn't factor into the cost computation, but until then,
2546// hack around it here.
2547// Similarly, skip pseudo-probes.
2548if (I.isDebugOrPseudoInst())
2549continue;
2550
2551// Skip ephemeral values.
2552if (EphValues.count(&I))
2553continue;
2554
2555++NumInstructions;
2556if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy())
2557++NumVectorInstructions;
2558
2559// If the instruction simplified to a constant, there is no cost to this
2560// instruction. Visit the instructions using our InstVisitor to account for
2561// all of the per-instruction logic. The visit tree returns true if we
2562// consumed the instruction in any way, and false if the instruction's base
2563// cost should count against inlining.
2564onInstructionAnalysisStart(&I);
2565
2566if (Base::visit(&I))
2567++NumInstructionsSimplified;
2568else
2569onMissedSimplification();
2570
2571onInstructionAnalysisFinish(&I);
2572using namespace ore;
2573// If the visit this instruction detected an uninlinable pattern, abort.
2574InlineResult IR = InlineResult::success();
2575if (IsRecursiveCall && !AllowRecursiveCall)
2576IR = InlineResult::failure("recursive");
2577else if (ExposesReturnsTwice)
2578IR = InlineResult::failure("exposes returns twice");
2579else if (HasDynamicAlloca)
2580IR = InlineResult::failure("dynamic alloca");
2581else if (HasIndirectBr)
2582IR = InlineResult::failure("indirect branch");
2583else if (HasUninlineableIntrinsic)
2584IR = InlineResult::failure("uninlinable intrinsic");
2585else if (InitsVargArgs)
2586IR = InlineResult::failure("varargs");
2587if (!IR.isSuccess()) {
2588if (ORE)
2589ORE->emit([&]() {
2590return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2591&CandidateCall)
2592<< NV("Callee", &F) << " has uninlinable pattern ("
2593<< NV("InlineResult", IR.getFailureReason())
2594<< ") and cost is not fully computed";
2595});
2596return IR;
2597}
2598
2599// If the caller is a recursive function then we don't want to inline
2600// functions which allocate a lot of stack space because it would increase
2601// the caller stack usage dramatically.
2602if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) {
2603auto IR =
2604InlineResult::failure("recursive and allocates too much stack space");
2605if (ORE)
2606ORE->emit([&]() {
2607return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2608&CandidateCall)
2609<< NV("Callee", &F) << " is "
2610<< NV("InlineResult", IR.getFailureReason())
2611<< ". Cost is not fully computed";
2612});
2613return IR;
2614}
2615
2616if (shouldStop())
2617return InlineResult::failure(
2618"Call site analysis is not favorable to inlining.");
2619}
2620
2621return InlineResult::success();
2622}
2623
2624/// Compute the base pointer and cumulative constant offsets for V.
2625///
2626/// This strips all constant offsets off of V, leaving it the base pointer, and
2627/// accumulates the total constant offset applied in the returned constant. It
2628/// returns 0 if V is not a pointer, and returns the constant '0' if there are
2629/// no constant offsets applied.
2630ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
2631if (!V->getType()->isPointerTy())
2632return nullptr;
2633
2634unsigned AS = V->getType()->getPointerAddressSpace();
2635unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
2636APInt Offset = APInt::getZero(IntPtrWidth);
2637
2638// Even though we don't look through PHI nodes, we could be called on an
2639// instruction in an unreachable block, which may be on a cycle.
2640SmallPtrSet<Value *, 4> Visited;
2641Visited.insert(V);
2642do {
2643if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
2644if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
2645return nullptr;
2646V = GEP->getPointerOperand();
2647} else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
2648if (GA->isInterposable())
2649break;
2650V = GA->getAliasee();
2651} else {
2652break;
2653}
2654assert(V->getType()->isPointerTy() && "Unexpected operand type!");
2655} while (Visited.insert(V).second);
2656
2657Type *IdxPtrTy = DL.getIndexType(V->getType());
2658return cast<ConstantInt>(ConstantInt::get(IdxPtrTy, Offset));
2659}
2660
2661/// Find dead blocks due to deleted CFG edges during inlining.
2662///
2663/// If we know the successor of the current block, \p CurrBB, has to be \p
2664/// NextBB, the other successors of \p CurrBB are dead if these successors have
2665/// no live incoming CFG edges. If one block is found to be dead, we can
2666/// continue growing the dead block list by checking the successors of the dead
2667/// blocks to see if all their incoming edges are dead or not.
2668void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
2669auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
2670// A CFG edge is dead if the predecessor is dead or the predecessor has a
2671// known successor which is not the one under exam.
2672return (DeadBlocks.count(Pred) ||
2673(KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
2674};
2675
2676auto IsNewlyDead = [&](BasicBlock *BB) {
2677// If all the edges to a block are dead, the block is also dead.
2678return (!DeadBlocks.count(BB) &&
2679llvm::all_of(predecessors(BB),
2680[&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
2681};
2682
2683for (BasicBlock *Succ : successors(CurrBB)) {
2684if (Succ == NextBB || !IsNewlyDead(Succ))
2685continue;
2686SmallVector<BasicBlock *, 4> NewDead;
2687NewDead.push_back(Succ);
2688while (!NewDead.empty()) {
2689BasicBlock *Dead = NewDead.pop_back_val();
2690if (DeadBlocks.insert(Dead).second)
2691// Continue growing the dead block lists.
2692for (BasicBlock *S : successors(Dead))
2693if (IsNewlyDead(S))
2694NewDead.push_back(S);
2695}
2696}
2697}
2698
2699/// Analyze a call site for potential inlining.
2700///
2701/// Returns true if inlining this call is viable, and false if it is not
2702/// viable. It computes the cost and adjusts the threshold based on numerous
2703/// factors and heuristics. If this method returns false but the computed cost
2704/// is below the computed threshold, then inlining was forcibly disabled by
2705/// some artifact of the routine.
2706InlineResult CallAnalyzer::analyze() {
2707++NumCallsAnalyzed;
2708
2709auto Result = onAnalysisStart();
2710if (!Result.isSuccess())
2711return Result;
2712
2713if (F.empty())
2714return InlineResult::success();
2715
2716Function *Caller = CandidateCall.getFunction();
2717// Check if the caller function is recursive itself.
2718for (User *U : Caller->users()) {
2719CallBase *Call = dyn_cast<CallBase>(U);
2720if (Call && Call->getFunction() == Caller) {
2721IsCallerRecursive = true;
2722break;
2723}
2724}
2725
2726// Populate our simplified values by mapping from function arguments to call
2727// arguments with known important simplifications.
2728auto CAI = CandidateCall.arg_begin();
2729for (Argument &FAI : F.args()) {
2730assert(CAI != CandidateCall.arg_end());
2731if (Constant *C = dyn_cast<Constant>(CAI))
2732SimplifiedValues[&FAI] = C;
2733
2734Value *PtrArg = *CAI;
2735if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
2736ConstantOffsetPtrs[&FAI] = std::make_pair(PtrArg, C->getValue());
2737
2738// We can SROA any pointer arguments derived from alloca instructions.
2739if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
2740SROAArgValues[&FAI] = SROAArg;
2741onInitializeSROAArg(SROAArg);
2742EnabledSROAAllocas.insert(SROAArg);
2743}
2744}
2745++CAI;
2746}
2747NumConstantArgs = SimplifiedValues.size();
2748NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
2749NumAllocaArgs = SROAArgValues.size();
2750
2751// FIXME: If a caller has multiple calls to a callee, we end up recomputing
2752// the ephemeral values multiple times (and they're completely determined by
2753// the callee, so this is purely duplicate work).
2754SmallPtrSet<const Value *, 32> EphValues;
2755CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);
2756
2757// The worklist of live basic blocks in the callee *after* inlining. We avoid
2758// adding basic blocks of the callee which can be proven to be dead for this
2759// particular call site in order to get more accurate cost estimates. This
2760// requires a somewhat heavyweight iteration pattern: we need to walk the
2761// basic blocks in a breadth-first order as we insert live successors. To
2762// accomplish this, prioritizing for small iterations because we exit after
2763// crossing our threshold, we use a small-size optimized SetVector.
2764typedef SmallSetVector<BasicBlock *, 16> BBSetVector;
2765BBSetVector BBWorklist;
2766BBWorklist.insert(&F.getEntryBlock());
2767
2768// Note that we *must not* cache the size, this loop grows the worklist.
2769for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
2770if (shouldStop())
2771break;
2772
2773BasicBlock *BB = BBWorklist[Idx];
2774if (BB->empty())
2775continue;
2776
2777onBlockStart(BB);
2778
2779// Disallow inlining a blockaddress with uses other than strictly callbr.
2780// A blockaddress only has defined behavior for an indirect branch in the
2781// same function, and we do not currently support inlining indirect
2782// branches. But, the inliner may not see an indirect branch that ends up
2783// being dead code at a particular call site. If the blockaddress escapes
2784// the function, e.g., via a global variable, inlining may lead to an
2785// invalid cross-function reference.
2786// FIXME: pr/39560: continue relaxing this overt restriction.
2787if (BB->hasAddressTaken())
2788for (User *U : BlockAddress::get(&*BB)->users())
2789if (!isa<CallBrInst>(*U))
2790return InlineResult::failure("blockaddress used outside of callbr");
2791
2792// Analyze the cost of this block. If we blow through the threshold, this
2793// returns false, and we can bail on out.
2794InlineResult IR = analyzeBlock(BB, EphValues);
2795if (!IR.isSuccess())
2796return IR;
2797
2798Instruction *TI = BB->getTerminator();
2799
2800// Add in the live successors by first checking whether we have terminator
2801// that may be simplified based on the values simplified by this call.
2802if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
2803if (BI->isConditional()) {
2804Value *Cond = BI->getCondition();
2805if (ConstantInt *SimpleCond =
2806dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2807BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
2808BBWorklist.insert(NextBB);
2809KnownSuccessors[BB] = NextBB;
2810findDeadBlocks(BB, NextBB);
2811continue;
2812}
2813}
2814} else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
2815Value *Cond = SI->getCondition();
2816if (ConstantInt *SimpleCond =
2817dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2818BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
2819BBWorklist.insert(NextBB);
2820KnownSuccessors[BB] = NextBB;
2821findDeadBlocks(BB, NextBB);
2822continue;
2823}
2824}
2825
2826// If we're unable to select a particular successor, just count all of
2827// them.
2828for (BasicBlock *Succ : successors(BB))
2829BBWorklist.insert(Succ);
2830
2831onBlockAnalyzed(BB);
2832}
2833
2834// If this is a noduplicate call, we can still inline as long as
2835// inlining this would cause the removal of the caller (so the instruction
2836// is not actually duplicated, just moved).
2837if (!isSoleCallToLocalFunction(CandidateCall, F) && ContainsNoDuplicateCall)
2838return InlineResult::failure("noduplicate");
2839
2840// If the callee's stack size exceeds the user-specified threshold,
2841// do not let it be inlined.
2842// The command line option overrides a limit set in the function attributes.
2843size_t FinalStackSizeThreshold = StackSizeThreshold;
2844if (!StackSizeThreshold.getNumOccurrences())
2845if (std::optional<int> AttrMaxStackSize = getStringFnAttrAsInt(
2846Caller, InlineConstants::MaxInlineStackSizeAttributeName))
2847FinalStackSizeThreshold = *AttrMaxStackSize;
2848if (AllocatedSize > FinalStackSizeThreshold)
2849return InlineResult::failure("stacksize");
2850
2851return finalizeAnalysis();
2852}
2853
2854void InlineCostCallAnalyzer::print(raw_ostream &OS) {
2855#define DEBUG_PRINT_STAT(x) OS << " " #x ": " << x << "\n"
2856if (PrintInstructionComments)
2857F.print(OS, &Writer);
2858DEBUG_PRINT_STAT(NumConstantArgs);
2859DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
2860DEBUG_PRINT_STAT(NumAllocaArgs);
2861DEBUG_PRINT_STAT(NumConstantPtrCmps);
2862DEBUG_PRINT_STAT(NumConstantPtrDiffs);
2863DEBUG_PRINT_STAT(NumInstructionsSimplified);
2864DEBUG_PRINT_STAT(NumInstructions);
2865DEBUG_PRINT_STAT(SROACostSavings);
2866DEBUG_PRINT_STAT(SROACostSavingsLost);
2867DEBUG_PRINT_STAT(LoadEliminationCost);
2868DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
2869DEBUG_PRINT_STAT(Cost);
2870DEBUG_PRINT_STAT(Threshold);
2871#undef DEBUG_PRINT_STAT
2872}
2873
2874#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2875/// Dump stats about this call's analysis.
2876LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); }
2877#endif
2878
2879/// Test that there are no attribute conflicts between Caller and Callee
2880/// that prevent inlining.
2881static bool functionsHaveCompatibleAttributes(
2882Function *Caller, Function *Callee, TargetTransformInfo &TTI,
2883function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
2884// Note that CalleeTLI must be a copy not a reference. The legacy pass manager
2885// caches the most recently created TLI in the TargetLibraryInfoWrapperPass
2886// object, and always returns the same object (which is overwritten on each
2887// GetTLI call). Therefore we copy the first result.
2888auto CalleeTLI = GetTLI(*Callee);
2889return (IgnoreTTIInlineCompatible ||
2890TTI.areInlineCompatible(Caller, Callee)) &&
2891GetTLI(*Caller).areInlineCompatible(CalleeTLI,
2892InlineCallerSupersetNoBuiltin) &&
2893AttributeFuncs::areInlineCompatible(*Caller, *Callee);
2894}
2895
2896int llvm::getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call,
2897const DataLayout &DL) {
2898int64_t Cost = 0;
2899for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
2900if (Call.isByValArgument(I)) {
2901// We approximate the number of loads and stores needed by dividing the
2902// size of the byval type by the target's pointer size.
2903PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
2904unsigned TypeSize = DL.getTypeSizeInBits(Call.getParamByValType(I));
2905unsigned AS = PTy->getAddressSpace();
2906unsigned PointerSize = DL.getPointerSizeInBits(AS);
2907// Ceiling division.
2908unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
2909
2910// If it generates more than 8 stores it is likely to be expanded as an
2911// inline memcpy so we take that as an upper bound. Otherwise we assume
2912// one load and one store per word copied.
2913// FIXME: The maxStoresPerMemcpy setting from the target should be used
2914// here instead of a magic number of 8, but it's not available via
2915// DataLayout.
2916NumStores = std::min(NumStores, 8U);
2917
2918Cost += 2 * NumStores * InstrCost;
2919} else {
2920// For non-byval arguments subtract off one instruction per call
2921// argument.
2922Cost += InstrCost;
2923}
2924}
2925// The call instruction also disappears after inlining.
2926Cost += InstrCost;
2927Cost += TTI.getInlineCallPenalty(Call.getCaller(), Call, CallPenalty);
2928
2929return std::min<int64_t>(Cost, INT_MAX);
2930}
2931
2932InlineCost llvm::getInlineCost(
2933CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
2934function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
2935function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
2936function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
2937ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2938return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
2939GetAssumptionCache, GetTLI, GetBFI, PSI, ORE);
2940}
2941
2942std::optional<int> llvm::getInliningCostEstimate(
2943CallBase &Call, TargetTransformInfo &CalleeTTI,
2944function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
2945function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
2946ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2947const InlineParams Params = {/* DefaultThreshold*/ 0,
2948/*HintThreshold*/ {},
2949/*ColdThreshold*/ {},
2950/*OptSizeThreshold*/ {},
2951/*OptMinSizeThreshold*/ {},
2952/*HotCallSiteThreshold*/ {},
2953/*LocallyHotCallSiteThreshold*/ {},
2954/*ColdCallSiteThreshold*/ {},
2955/*ComputeFullInlineCost*/ true,
2956/*EnableDeferral*/ true};
2957
2958InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
2959GetAssumptionCache, GetBFI, PSI, ORE, true,
2960/*IgnoreThreshold*/ true);
2961auto R = CA.analyze();
2962if (!R.isSuccess())
2963return std::nullopt;
2964return CA.getCost();
2965}
2966
2967std::optional<InlineCostFeatures> llvm::getInliningCostFeatures(
2968CallBase &Call, TargetTransformInfo &CalleeTTI,
2969function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
2970function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
2971ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2972InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, PSI,
2973ORE, *Call.getCalledFunction(), Call);
2974auto R = CFA.analyze();
2975if (!R.isSuccess())
2976return std::nullopt;
2977return CFA.features();
2978}
2979
2980std::optional<InlineResult> llvm::getAttributeBasedInliningDecision(
2981CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
2982function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
2983
2984// Cannot inline indirect calls.
2985if (!Callee)
2986return InlineResult::failure("indirect call");
2987
2988// When callee coroutine function is inlined into caller coroutine function
2989// before coro-split pass,
2990// coro-early pass can not handle this quiet well.
2991// So we won't inline the coroutine function if it have not been unsplited
2992if (Callee->isPresplitCoroutine())
2993return InlineResult::failure("unsplited coroutine call");
2994
2995// Never inline calls with byval arguments that does not have the alloca
2996// address space. Since byval arguments can be replaced with a copy to an
2997// alloca, the inlined code would need to be adjusted to handle that the
2998// argument is in the alloca address space (so it is a little bit complicated
2999// to solve).
3000unsigned AllocaAS = Callee->getDataLayout().getAllocaAddrSpace();
3001for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
3002if (Call.isByValArgument(I)) {
3003PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
3004if (PTy->getAddressSpace() != AllocaAS)
3005return InlineResult::failure("byval arguments without alloca"
3006" address space");
3007}
3008
3009// Calls to functions with always-inline attributes should be inlined
3010// whenever possible.
3011if (Call.hasFnAttr(Attribute::AlwaysInline)) {
3012if (Call.getAttributes().hasFnAttr(Attribute::NoInline))
3013return InlineResult::failure("noinline call site attribute");
3014
3015auto IsViable = isInlineViable(*Callee);
3016if (IsViable.isSuccess())
3017return InlineResult::success();
3018return InlineResult::failure(IsViable.getFailureReason());
3019}
3020
3021// Never inline functions with conflicting attributes (unless callee has
3022// always-inline attribute).
3023Function *Caller = Call.getCaller();
3024if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI, GetTLI))
3025return InlineResult::failure("conflicting attributes");
3026
3027// Don't inline this call if the caller has the optnone attribute.
3028if (Caller->hasOptNone())
3029return InlineResult::failure("optnone attribute");
3030
3031// Don't inline a function that treats null pointer as valid into a caller
3032// that does not have this attribute.
3033if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
3034return InlineResult::failure("nullptr definitions incompatible");
3035
3036// Don't inline functions which can be interposed at link-time.
3037if (Callee->isInterposable())
3038return InlineResult::failure("interposable");
3039
3040// Don't inline functions marked noinline.
3041if (Callee->hasFnAttribute(Attribute::NoInline))
3042return InlineResult::failure("noinline function attribute");
3043
3044// Don't inline call sites marked noinline.
3045if (Call.isNoInline())
3046return InlineResult::failure("noinline call site attribute");
3047
3048return std::nullopt;
3049}
3050
3051InlineCost llvm::getInlineCost(
3052CallBase &Call, Function *Callee, const InlineParams &Params,
3053TargetTransformInfo &CalleeTTI,
3054function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
3055function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
3056function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
3057ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
3058
3059auto UserDecision =
3060llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);
3061
3062if (UserDecision) {
3063if (UserDecision->isSuccess())
3064return llvm::InlineCost::getAlways("always inline attribute");
3065return llvm::InlineCost::getNever(UserDecision->getFailureReason());
3066}
3067
3068LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
3069<< "... (caller:" << Call.getCaller()->getName()
3070<< ")\n");
3071
3072InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
3073GetAssumptionCache, GetBFI, PSI, ORE);
3074InlineResult ShouldInline = CA.analyze();
3075
3076LLVM_DEBUG(CA.dump());
3077
3078// Always make cost benefit based decision explicit.
3079// We use always/never here since threshold is not meaningful,
3080// as it's not what drives cost-benefit analysis.
3081if (CA.wasDecidedByCostBenefit()) {
3082if (ShouldInline.isSuccess())
3083return InlineCost::getAlways("benefit over cost",
3084CA.getCostBenefitPair());
3085else
3086return InlineCost::getNever("cost over benefit", CA.getCostBenefitPair());
3087}
3088
3089if (CA.wasDecidedByCostThreshold())
3090return InlineCost::get(CA.getCost(), CA.getThreshold(),
3091CA.getStaticBonusApplied());
3092
3093// No details on how the decision was made, simply return always or never.
3094return ShouldInline.isSuccess()
3095? InlineCost::getAlways("empty function")
3096: InlineCost::getNever(ShouldInline.getFailureReason());
3097}
3098
3099InlineResult llvm::isInlineViable(Function &F) {
3100bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
3101for (BasicBlock &BB : F) {
3102// Disallow inlining of functions which contain indirect branches.
3103if (isa<IndirectBrInst>(BB.getTerminator()))
3104return InlineResult::failure("contains indirect branches");
3105
3106// Disallow inlining of blockaddresses which are used by non-callbr
3107// instructions.
3108if (BB.hasAddressTaken())
3109for (User *U : BlockAddress::get(&BB)->users())
3110if (!isa<CallBrInst>(*U))
3111return InlineResult::failure("blockaddress used outside of callbr");
3112
3113for (auto &II : BB) {
3114CallBase *Call = dyn_cast<CallBase>(&II);
3115if (!Call)
3116continue;
3117
3118// Disallow recursive calls.
3119Function *Callee = Call->getCalledFunction();
3120if (&F == Callee)
3121return InlineResult::failure("recursive call");
3122
3123// Disallow calls which expose returns-twice to a function not previously
3124// attributed as such.
3125if (!ReturnsTwice && isa<CallInst>(Call) &&
3126cast<CallInst>(Call)->canReturnTwice())
3127return InlineResult::failure("exposes returns-twice attribute");
3128
3129if (Callee)
3130switch (Callee->getIntrinsicID()) {
3131default:
3132break;
3133case llvm::Intrinsic::icall_branch_funnel:
3134// Disallow inlining of @llvm.icall.branch.funnel because current
3135// backend can't separate call targets from call arguments.
3136return InlineResult::failure(
3137"disallowed inlining of @llvm.icall.branch.funnel");
3138case llvm::Intrinsic::localescape:
3139// Disallow inlining functions that call @llvm.localescape. Doing this
3140// correctly would require major changes to the inliner.
3141return InlineResult::failure(
3142"disallowed inlining of @llvm.localescape");
3143case llvm::Intrinsic::vastart:
3144// Disallow inlining of functions that initialize VarArgs with
3145// va_start.
3146return InlineResult::failure(
3147"contains VarArgs initialized with va_start");
3148}
3149}
3150}
3151
3152return InlineResult::success();
3153}
3154
3155// APIs to create InlineParams based on command line flags and/or other
3156// parameters.
3157
3158InlineParams llvm::getInlineParams(int Threshold) {
3159InlineParams Params;
3160
3161// This field is the threshold to use for a callee by default. This is
3162// derived from one or more of:
3163// * optimization or size-optimization levels,
3164// * a value passed to createFunctionInliningPass function, or
3165// * the -inline-threshold flag.
3166// If the -inline-threshold flag is explicitly specified, that is used
3167// irrespective of anything else.
3168if (InlineThreshold.getNumOccurrences() > 0)
3169Params.DefaultThreshold = InlineThreshold;
3170else
3171Params.DefaultThreshold = Threshold;
3172
3173// Set the HintThreshold knob from the -inlinehint-threshold.
3174Params.HintThreshold = HintThreshold;
3175
3176// Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
3177Params.HotCallSiteThreshold = HotCallSiteThreshold;
3178
3179// If the -locally-hot-callsite-threshold is explicitly specified, use it to
3180// populate LocallyHotCallSiteThreshold. Later, we populate
3181// Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
3182// we know that optimization level is O3 (in the getInlineParams variant that
3183// takes the opt and size levels).
3184// FIXME: Remove this check (and make the assignment unconditional) after
3185// addressing size regression issues at O2.
3186if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
3187Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
3188
3189// Set the ColdCallSiteThreshold knob from the
3190// -inline-cold-callsite-threshold.
3191Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
3192
3193// Set the OptMinSizeThreshold and OptSizeThreshold params only if the
3194// -inlinehint-threshold commandline option is not explicitly given. If that
3195// option is present, then its value applies even for callees with size and
3196// minsize attributes.
3197// If the -inline-threshold is not specified, set the ColdThreshold from the
3198// -inlinecold-threshold even if it is not explicitly passed. If
3199// -inline-threshold is specified, then -inlinecold-threshold needs to be
3200// explicitly specified to set the ColdThreshold knob
3201if (InlineThreshold.getNumOccurrences() == 0) {
3202Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
3203Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
3204Params.ColdThreshold = ColdThreshold;
3205} else if (ColdThreshold.getNumOccurrences() > 0) {
3206Params.ColdThreshold = ColdThreshold;
3207}
3208return Params;
3209}
3210
3211InlineParams llvm::getInlineParams() {
3212return getInlineParams(DefaultThreshold);
3213}
3214
3215// Compute the default threshold for inlining based on the opt level and the
3216// size opt level.
3217static int computeThresholdFromOptLevels(unsigned OptLevel,
3218unsigned SizeOptLevel) {
3219if (OptLevel > 2)
3220return InlineConstants::OptAggressiveThreshold;
3221if (SizeOptLevel == 1) // -Os
3222return InlineConstants::OptSizeThreshold;
3223if (SizeOptLevel == 2) // -Oz
3224return InlineConstants::OptMinSizeThreshold;
3225return DefaultThreshold;
3226}
3227
3228InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
3229auto Params =
3230getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
3231// At O3, use the value of -locally-hot-callsite-threshold option to populate
3232// Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
3233// when it is specified explicitly.
3234if (OptLevel > 2)
3235Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
3236return Params;
3237}
3238
3239PreservedAnalyses
3240InlineCostAnnotationPrinterPass::run(Function &F,
3241FunctionAnalysisManager &FAM) {
3242PrintInstructionComments = true;
3243std::function<AssumptionCache &(Function &)> GetAssumptionCache =
3244[&](Function &F) -> AssumptionCache & {
3245return FAM.getResult<AssumptionAnalysis>(F);
3246};
3247Module *M = F.getParent();
3248ProfileSummaryInfo PSI(*M);
3249DataLayout DL(M);
3250TargetTransformInfo TTI(DL);
3251// FIXME: Redesign the usage of InlineParams to expand the scope of this pass.
3252// In the current implementation, the type of InlineParams doesn't matter as
3253// the pass serves only for verification of inliner's decisions.
3254// We can add a flag which determines InlineParams for this run. Right now,
3255// the default InlineParams are used.
3256const InlineParams Params = llvm::getInlineParams();
3257for (BasicBlock &BB : F) {
3258for (Instruction &I : BB) {
3259if (CallInst *CI = dyn_cast<CallInst>(&I)) {
3260Function *CalledFunction = CI->getCalledFunction();
3261if (!CalledFunction || CalledFunction->isDeclaration())
3262continue;
3263OptimizationRemarkEmitter ORE(CalledFunction);
3264InlineCostCallAnalyzer ICCA(*CalledFunction, *CI, Params, TTI,
3265GetAssumptionCache, nullptr, &PSI, &ORE);
3266ICCA.analyze();
3267OS << " Analyzing call of " << CalledFunction->getName()
3268<< "... (caller:" << CI->getCaller()->getName() << ")\n";
3269ICCA.print(OS);
3270OS << "\n";
3271}
3272}
3273}
3274return PreservedAnalyses::all();
3275}
3276