llvm-project
480 строк · 17.9 Кб
1//===- BypassSlowDivision.cpp - Bypass slow division ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains an optimization for div and rem on architectures that
10// execute short instructions significantly faster than longer instructions.
11// For example, on Intel Atom 32-bit divides are slow enough that during
12// runtime it is profitable to check the value of the operands, and if they are
13// positive and less than 256 use an unsigned 8-bit divide.
14//
15//===----------------------------------------------------------------------===//
16
17#include "llvm/Transforms/Utils/BypassSlowDivision.h"
18#include "llvm/ADT/DenseMap.h"
19#include "llvm/ADT/STLExtras.h"
20#include "llvm/ADT/SmallPtrSet.h"
21#include "llvm/Transforms/Utils/Local.h"
22#include "llvm/Analysis/ValueTracking.h"
23#include "llvm/IR/BasicBlock.h"
24#include "llvm/IR/Constants.h"
25#include "llvm/IR/DerivedTypes.h"
26#include "llvm/IR/Function.h"
27#include "llvm/IR/IRBuilder.h"
28#include "llvm/IR/Instruction.h"
29#include "llvm/IR/Instructions.h"
30#include "llvm/IR/Module.h"
31#include "llvm/IR/Type.h"
32#include "llvm/IR/Value.h"
33#include "llvm/Support/Casting.h"
34#include "llvm/Support/KnownBits.h"
35#include <cassert>
36#include <cstdint>
37
38using namespace llvm;
39
40#define DEBUG_TYPE "bypass-slow-division"
41
42namespace {
43
44struct QuotRemPair {
45Value *Quotient;
46Value *Remainder;
47
48QuotRemPair(Value *InQuotient, Value *InRemainder)
49: Quotient(InQuotient), Remainder(InRemainder) {}
50};
51
52/// A quotient and remainder, plus a BB from which they logically "originate".
53/// If you use Quotient or Remainder in a Phi node, you should use BB as its
54/// corresponding predecessor.
55struct QuotRemWithBB {
56BasicBlock *BB = nullptr;
57Value *Quotient = nullptr;
58Value *Remainder = nullptr;
59};
60
61using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>;
62using BypassWidthsTy = DenseMap<unsigned, unsigned>;
63using VisitedSetTy = SmallPtrSet<Instruction *, 4>;
64
65enum ValueRange {
66/// Operand definitely fits into BypassType. No runtime checks are needed.
67VALRNG_KNOWN_SHORT,
68/// A runtime check is required, as value range is unknown.
69VALRNG_UNKNOWN,
70/// Operand is unlikely to fit into BypassType. The bypassing should be
71/// disabled.
72VALRNG_LIKELY_LONG
73};
74
75class FastDivInsertionTask {
76bool IsValidTask = false;
77Instruction *SlowDivOrRem = nullptr;
78IntegerType *BypassType = nullptr;
79BasicBlock *MainBB = nullptr;
80
81bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
82ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
83QuotRemWithBB createSlowBB(BasicBlock *Successor);
84QuotRemWithBB createFastBB(BasicBlock *Successor);
85QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
86BasicBlock *PhiBB);
87Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
88std::optional<QuotRemPair> insertFastDivAndRem();
89
90bool isSignedOp() {
91return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
92SlowDivOrRem->getOpcode() == Instruction::SRem;
93}
94
95bool isDivisionOp() {
96return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
97SlowDivOrRem->getOpcode() == Instruction::UDiv;
98}
99
100Type *getSlowType() { return SlowDivOrRem->getType(); }
101
102public:
103FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);
104
105Value *getReplacement(DivCacheTy &Cache);
106};
107
108} // end anonymous namespace
109
110FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
111const BypassWidthsTy &BypassWidths) {
112switch (I->getOpcode()) {
113case Instruction::UDiv:
114case Instruction::SDiv:
115case Instruction::URem:
116case Instruction::SRem:
117SlowDivOrRem = I;
118break;
119default:
120// I is not a div/rem operation.
121return;
122}
123
124// Skip division on vector types. Only optimize integer instructions.
125IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType());
126if (!SlowType)
127return;
128
129// Skip if this bitwidth is not bypassed.
130auto BI = BypassWidths.find(SlowType->getBitWidth());
131if (BI == BypassWidths.end())
132return;
133
134// Get type for div/rem instruction with bypass bitwidth.
135IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
136BypassType = BT;
137
138// The original basic block.
139MainBB = I->getParent();
140
141// The instruction is indeed a slow div or rem operation.
142IsValidTask = true;
143}
144
145/// Reuses previously-computed dividend or remainder from the current BB if
146/// operands and operation are identical. Otherwise calls insertFastDivAndRem to
147/// perform the optimization and caches the resulting dividend and remainder.
148/// If no replacement can be generated, nullptr is returned.
149Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
150// First, make sure that the task is valid.
151if (!IsValidTask)
152return nullptr;
153
154// Then, look for a value in Cache.
155Value *Dividend = SlowDivOrRem->getOperand(0);
156Value *Divisor = SlowDivOrRem->getOperand(1);
157DivRemMapKey Key(isSignedOp(), Dividend, Divisor);
158auto CacheI = Cache.find(Key);
159
160if (CacheI == Cache.end()) {
161// If previous instance does not exist, try to insert fast div.
162std::optional<QuotRemPair> OptResult = insertFastDivAndRem();
163// Bail out if insertFastDivAndRem has failed.
164if (!OptResult)
165return nullptr;
166CacheI = Cache.insert({Key, *OptResult}).first;
167}
168
169QuotRemPair &Value = CacheI->second;
170return isDivisionOp() ? Value.Quotient : Value.Remainder;
171}
172
173/// Check if a value looks like a hash.
174///
175/// The routine is expected to detect values computed using the most common hash
176/// algorithms. Typically, hash computations end with one of the following
177/// instructions:
178///
179/// 1) MUL with a constant wider than BypassType
180/// 2) XOR instruction
181///
182/// And even if we are wrong and the value is not a hash, it is still quite
183/// unlikely that such values will fit into BypassType.
184///
185/// To detect string hash algorithms like FNV we have to look through PHI-nodes.
186/// It is implemented as a depth-first search for values that look neither long
187/// nor hash-like.
188bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) {
189Instruction *I = dyn_cast<Instruction>(V);
190if (!I)
191return false;
192
193switch (I->getOpcode()) {
194case Instruction::Xor:
195return true;
196case Instruction::Mul: {
197// After Constant Hoisting pass, long constants may be represented as
198// bitcast instructions. As a result, some constants may look like an
199// instruction at first, and an additional check is necessary to find out if
200// an operand is actually a constant.
201Value *Op1 = I->getOperand(1);
202ConstantInt *C = dyn_cast<ConstantInt>(Op1);
203if (!C && isa<BitCastInst>(Op1))
204C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0));
205return C && C->getValue().getSignificantBits() > BypassType->getBitWidth();
206}
207case Instruction::PHI:
208// Stop IR traversal in case of a crazy input code. This limits recursion
209// depth.
210if (Visited.size() >= 16)
211return false;
212// Do not visit nodes that have been visited already. We return true because
213// it means that we couldn't find any value that doesn't look hash-like.
214if (!Visited.insert(I).second)
215return true;
216return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) {
217// Ignore undef values as they probably don't affect the division
218// operands.
219return getValueRange(V, Visited) == VALRNG_LIKELY_LONG ||
220isa<UndefValue>(V);
221});
222default:
223return false;
224}
225}
226
227/// Check if an integer value fits into our bypass type.
228ValueRange FastDivInsertionTask::getValueRange(Value *V,
229VisitedSetTy &Visited) {
230unsigned ShortLen = BypassType->getBitWidth();
231unsigned LongLen = V->getType()->getIntegerBitWidth();
232
233assert(LongLen > ShortLen && "Value type must be wider than BypassType");
234unsigned HiBits = LongLen - ShortLen;
235
236const DataLayout &DL = SlowDivOrRem->getDataLayout();
237KnownBits Known(LongLen);
238
239computeKnownBits(V, Known, DL);
240
241if (Known.countMinLeadingZeros() >= HiBits)
242return VALRNG_KNOWN_SHORT;
243
244if (Known.countMaxLeadingZeros() < HiBits)
245return VALRNG_LIKELY_LONG;
246
247// Long integer divisions are often used in hashtable implementations. It's
248// not worth bypassing such divisions because hash values are extremely
249// unlikely to have enough leading zeros. The call below tries to detect
250// values that are unlikely to fit BypassType (including hashes).
251if (isHashLikeValue(V, Visited))
252return VALRNG_LIKELY_LONG;
253
254return VALRNG_UNKNOWN;
255}
256
257/// Add new basic block for slow div and rem operations and put it before
258/// SuccessorBB.
259QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
260QuotRemWithBB DivRemPair;
261DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
262MainBB->getParent(), SuccessorBB);
263IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
264Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
265
266Value *Dividend = SlowDivOrRem->getOperand(0);
267Value *Divisor = SlowDivOrRem->getOperand(1);
268
269if (isSignedOp()) {
270DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor);
271DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor);
272} else {
273DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor);
274DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor);
275}
276
277Builder.CreateBr(SuccessorBB);
278return DivRemPair;
279}
280
281/// Add new basic block for fast div and rem operations and put it before
282/// SuccessorBB.
283QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
284QuotRemWithBB DivRemPair;
285DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
286MainBB->getParent(), SuccessorBB);
287IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
288Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
289
290Value *Dividend = SlowDivOrRem->getOperand(0);
291Value *Divisor = SlowDivOrRem->getOperand(1);
292Value *ShortDivisorV =
293Builder.CreateCast(Instruction::Trunc, Divisor, BypassType);
294Value *ShortDividendV =
295Builder.CreateCast(Instruction::Trunc, Dividend, BypassType);
296
297// udiv/urem because this optimization only handles positive numbers.
298Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV);
299Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV);
300DivRemPair.Quotient =
301Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType());
302DivRemPair.Remainder =
303Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType());
304Builder.CreateBr(SuccessorBB);
305
306return DivRemPair;
307}
308
309/// Creates Phi nodes for result of Div and Rem.
310QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
311QuotRemWithBB &RHS,
312BasicBlock *PhiBB) {
313IRBuilder<> Builder(PhiBB, PhiBB->begin());
314Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
315PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2);
316QuoPhi->addIncoming(LHS.Quotient, LHS.BB);
317QuoPhi->addIncoming(RHS.Quotient, RHS.BB);
318PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2);
319RemPhi->addIncoming(LHS.Remainder, LHS.BB);
320RemPhi->addIncoming(RHS.Remainder, RHS.BB);
321return QuotRemPair(QuoPhi, RemPhi);
322}
323
324/// Creates a runtime check to test whether both the divisor and dividend fit
325/// into BypassType. The check is inserted at the end of MainBB. True return
326/// value means that the operands fit. Either of the operands may be NULL if it
327/// doesn't need a runtime check.
328Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
329assert((Op1 || Op2) && "Nothing to check");
330IRBuilder<> Builder(MainBB, MainBB->end());
331Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
332
333Value *OrV;
334if (Op1 && Op2)
335OrV = Builder.CreateOr(Op1, Op2);
336else
337OrV = Op1 ? Op1 : Op2;
338
339// BitMask is inverted to check if the operands are
340// larger than the bypass type
341uint64_t BitMask = ~BypassType->getBitMask();
342Value *AndV = Builder.CreateAnd(OrV, BitMask);
343
344// Compare operand values
345Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);
346return Builder.CreateICmpEQ(AndV, ZeroV);
347}
348
349/// Substitutes the div/rem instruction with code that checks the value of the
350/// operands and uses a shorter-faster div/rem instruction when possible.
351std::optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
352Value *Dividend = SlowDivOrRem->getOperand(0);
353Value *Divisor = SlowDivOrRem->getOperand(1);
354
355VisitedSetTy SetL;
356ValueRange DividendRange = getValueRange(Dividend, SetL);
357if (DividendRange == VALRNG_LIKELY_LONG)
358return std::nullopt;
359
360VisitedSetTy SetR;
361ValueRange DivisorRange = getValueRange(Divisor, SetR);
362if (DivisorRange == VALRNG_LIKELY_LONG)
363return std::nullopt;
364
365bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT);
366bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT);
367
368if (DividendShort && DivisorShort) {
369// If both operands are known to be short then just replace the long
370// division with a short one in-place. Since we're not introducing control
371// flow in this case, narrowing the division is always a win, even if the
372// divisor is a constant (and will later get replaced by a multiplication).
373
374IRBuilder<> Builder(SlowDivOrRem);
375Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
376Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
377Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
378Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
379Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
380Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
381return QuotRemPair(ExtDiv, ExtRem);
382}
383
384if (isa<ConstantInt>(Divisor)) {
385// If the divisor is not a constant, DAGCombiner will convert it to a
386// multiplication by a magic constant. It isn't clear if it is worth
387// introducing control flow to get a narrower multiply.
388return std::nullopt;
389}
390
391// After Constant Hoisting pass, long constants may be represented as
392// bitcast instructions. As a result, some constants may look like an
393// instruction at first, and an additional check is necessary to find out if
394// an operand is actually a constant.
395if (auto *BCI = dyn_cast<BitCastInst>(Divisor))
396if (BCI->getParent() == SlowDivOrRem->getParent() &&
397isa<ConstantInt>(BCI->getOperand(0)))
398return std::nullopt;
399
400IRBuilder<> Builder(MainBB, MainBB->end());
401Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
402
403if (DividendShort && !isSignedOp()) {
404// If the division is unsigned and Dividend is known to be short, then
405// either
406// 1) Divisor is less or equal to Dividend, and the result can be computed
407// with a short division.
408// 2) Divisor is greater than Dividend. In this case, no division is needed
409// at all: The quotient is 0 and the remainder is equal to Dividend.
410//
411// So instead of checking at runtime whether Divisor fits into BypassType,
412// we emit a runtime check to differentiate between these two cases. This
413// lets us entirely avoid a long div.
414
415// Split the basic block before the div/rem.
416BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
417// Remove the unconditional branch from MainBB to SuccessorBB.
418MainBB->back().eraseFromParent();
419QuotRemWithBB Long;
420Long.BB = MainBB;
421Long.Quotient = ConstantInt::get(getSlowType(), 0);
422Long.Remainder = Dividend;
423QuotRemWithBB Fast = createFastBB(SuccessorBB);
424QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
425Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
426Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
427return Result;
428} else {
429// General case. Create both slow and fast div/rem pairs and choose one of
430// them at runtime.
431
432// Split the basic block before the div/rem.
433BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
434// Remove the unconditional branch from MainBB to SuccessorBB.
435MainBB->back().eraseFromParent();
436QuotRemWithBB Fast = createFastBB(SuccessorBB);
437QuotRemWithBB Slow = createSlowBB(SuccessorBB);
438QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
439Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
440DivisorShort ? nullptr : Divisor);
441Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
442return Result;
443}
444}
445
446/// This optimization identifies DIV/REM instructions in a BB that can be
447/// profitably bypassed and carried out with a shorter, faster divide.
448bool llvm::bypassSlowDivision(BasicBlock *BB,
449const BypassWidthsTy &BypassWidths) {
450DivCacheTy PerBBDivCache;
451
452bool MadeChange = false;
453Instruction *Next = &*BB->begin();
454while (Next != nullptr) {
455// We may add instructions immediately after I, but we want to skip over
456// them.
457Instruction *I = Next;
458Next = Next->getNextNode();
459
460// Ignore dead code to save time and avoid bugs.
461if (I->hasNUses(0))
462continue;
463
464FastDivInsertionTask Task(I, BypassWidths);
465if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
466I->replaceAllUsesWith(Replacement);
467I->eraseFromParent();
468MadeChange = true;
469}
470}
471
472// Above we eagerly create divs and rems, as pairs, so that we can efficiently
473// create divrem machine instructions. Now erase any unused divs / rems so we
474// don't leave extra instructions sitting around.
475for (auto &KV : PerBBDivCache)
476for (Value *V : {KV.second.Quotient, KV.second.Remainder})
477RecursivelyDeleteTriviallyDeadInstructions(V);
478
479return MadeChange;
480}
481