llvm-project
624 строки · 22.1 Кб
1//===--- CloneDetection.cpp - Finds code clones in an AST -------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// This file implements classes for searching and analyzing source code clones.
10///
11//===----------------------------------------------------------------------===//
12
13#include "clang/Analysis/CloneDetection.h"14#include "clang/AST/Attr.h"15#include "clang/AST/DataCollection.h"16#include "clang/AST/DeclTemplate.h"17#include "clang/Basic/SourceManager.h"18#include "llvm/Support/MD5.h"19#include "llvm/Support/Path.h"20
21using namespace clang;22
23StmtSequence::StmtSequence(const CompoundStmt *Stmt, const Decl *D,24unsigned StartIndex, unsigned EndIndex)25: S(Stmt), D(D), StartIndex(StartIndex), EndIndex(EndIndex) {26assert(Stmt && "Stmt must not be a nullptr");27assert(StartIndex < EndIndex && "Given array should not be empty");28assert(EndIndex <= Stmt->size() && "Given array too big for this Stmt");29}
30
31StmtSequence::StmtSequence(const Stmt *Stmt, const Decl *D)32: S(Stmt), D(D), StartIndex(0), EndIndex(0) {}33
34StmtSequence::StmtSequence()35: S(nullptr), D(nullptr), StartIndex(0), EndIndex(0) {}36
37bool StmtSequence::contains(const StmtSequence &Other) const {38// If both sequences reside in different declarations, they can never contain39// each other.40if (D != Other.D)41return false;42
43const SourceManager &SM = getASTContext().getSourceManager();44
45// Otherwise check if the start and end locations of the current sequence46// surround the other sequence.47bool StartIsInBounds =48SM.isBeforeInTranslationUnit(getBeginLoc(), Other.getBeginLoc()) ||49getBeginLoc() == Other.getBeginLoc();50if (!StartIsInBounds)51return false;52
53bool EndIsInBounds =54SM.isBeforeInTranslationUnit(Other.getEndLoc(), getEndLoc()) ||55Other.getEndLoc() == getEndLoc();56return EndIsInBounds;57}
58
59StmtSequence::iterator StmtSequence::begin() const {60if (!holdsSequence()) {61return &S;62}63auto CS = cast<CompoundStmt>(S);64return CS->body_begin() + StartIndex;65}
66
67StmtSequence::iterator StmtSequence::end() const {68if (!holdsSequence()) {69return reinterpret_cast<StmtSequence::iterator>(&S) + 1;70}71auto CS = cast<CompoundStmt>(S);72return CS->body_begin() + EndIndex;73}
74
75ASTContext &StmtSequence::getASTContext() const {76assert(D);77return D->getASTContext();78}
79
80SourceLocation StmtSequence::getBeginLoc() const {81return front()->getBeginLoc();82}
83
84SourceLocation StmtSequence::getEndLoc() const { return back()->getEndLoc(); }85
86SourceRange StmtSequence::getSourceRange() const {87return SourceRange(getBeginLoc(), getEndLoc());88}
89
90void CloneDetector::analyzeCodeBody(const Decl *D) {91assert(D);92assert(D->hasBody());93
94Sequences.push_back(StmtSequence(D->getBody(), D));95}
96
97/// Returns true if and only if \p Stmt contains at least one other
98/// sequence in the \p Group.
99static bool containsAnyInGroup(StmtSequence &Seq,100CloneDetector::CloneGroup &Group) {101for (StmtSequence &GroupSeq : Group) {102if (Seq.contains(GroupSeq))103return true;104}105return false;106}
107
108/// Returns true if and only if all sequences in \p OtherGroup are
109/// contained by a sequence in \p Group.
110static bool containsGroup(CloneDetector::CloneGroup &Group,111CloneDetector::CloneGroup &OtherGroup) {112// We have less sequences in the current group than we have in the other,113// so we will never fulfill the requirement for returning true. This is only114// possible because we know that a sequence in Group can contain at most115// one sequence in OtherGroup.116if (Group.size() < OtherGroup.size())117return false;118
119for (StmtSequence &Stmt : Group) {120if (!containsAnyInGroup(Stmt, OtherGroup))121return false;122}123return true;124}
125
126void OnlyLargestCloneConstraint::constrain(127std::vector<CloneDetector::CloneGroup> &Result) {128std::vector<unsigned> IndexesToRemove;129
130// Compare every group in the result with the rest. If one groups contains131// another group, we only need to return the bigger group.132// Note: This doesn't scale well, so if possible avoid calling any heavy133// function from this loop to minimize the performance impact.134for (unsigned i = 0; i < Result.size(); ++i) {135for (unsigned j = 0; j < Result.size(); ++j) {136// Don't compare a group with itself.137if (i == j)138continue;139
140if (containsGroup(Result[j], Result[i])) {141IndexesToRemove.push_back(i);142break;143}144}145}146
147// Erasing a list of indexes from the vector should be done with decreasing148// indexes. As IndexesToRemove is constructed with increasing values, we just149// reverse iterate over it to get the desired order.150for (unsigned I : llvm::reverse(IndexesToRemove))151Result.erase(Result.begin() + I);152}
153
154bool FilenamePatternConstraint::isAutoGenerated(155const CloneDetector::CloneGroup &Group) {156if (IgnoredFilesPattern.empty() || Group.empty() ||157!IgnoredFilesRegex->isValid())158return false;159
160for (const StmtSequence &S : Group) {161const SourceManager &SM = S.getASTContext().getSourceManager();162StringRef Filename = llvm::sys::path::filename(163SM.getFilename(S.getContainingDecl()->getLocation()));164if (IgnoredFilesRegex->match(Filename))165return true;166}167
168return false;169}
170
171/// This class defines what a type II code clone is: If it collects for two
172/// statements the same data, then those two statements are considered to be
173/// clones of each other.
174///
175/// All collected data is forwarded to the given data consumer of the type T.
176/// The data consumer class needs to provide a member method with the signature:
177/// update(StringRef Str)
178namespace {179template <class T>180class CloneTypeIIStmtDataCollector181: public ConstStmtVisitor<CloneTypeIIStmtDataCollector<T>> {182ASTContext &Context;183/// The data sink to which all data is forwarded.184T &DataConsumer;185
186template <class Ty> void addData(const Ty &Data) {187data_collection::addDataToConsumer(DataConsumer, Data);188}189
190public:191CloneTypeIIStmtDataCollector(const Stmt *S, ASTContext &Context,192T &DataConsumer)193: Context(Context), DataConsumer(DataConsumer) {194this->Visit(S);195}196
197// Define a visit method for each class to collect data and subsequently visit
198// all parent classes. This uses a template so that custom visit methods by us
199// take precedence.
200#define DEF_ADD_DATA(CLASS, CODE) \201template <class = void> void Visit##CLASS(const CLASS *S) { \202CODE; \203ConstStmtVisitor<CloneTypeIIStmtDataCollector<T>>::Visit##CLASS(S); \204}205
206#include "clang/AST/StmtDataCollectors.inc"207
208// Type II clones ignore variable names and literals, so let's skip them.
209#define SKIP(CLASS) \210void Visit##CLASS(const CLASS *S) { \211ConstStmtVisitor<CloneTypeIIStmtDataCollector<T>>::Visit##CLASS(S); \212}213SKIP(DeclRefExpr)214SKIP(MemberExpr)215SKIP(IntegerLiteral)216SKIP(FloatingLiteral)217SKIP(StringLiteral)218SKIP(CXXBoolLiteralExpr)219SKIP(CharacterLiteral)220#undef SKIP221};222} // end anonymous namespace223
224static size_t createHash(llvm::MD5 &Hash) {225size_t HashCode;226
227// Create the final hash code for the current Stmt.228llvm::MD5::MD5Result HashResult;229Hash.final(HashResult);230
231// Copy as much as possible of the generated hash code to the Stmt's hash232// code.233std::memcpy(&HashCode, &HashResult,234std::min(sizeof(HashCode), sizeof(HashResult)));235
236return HashCode;237}
238
239/// Generates and saves a hash code for the given Stmt.
240/// \param S The given Stmt.
241/// \param D The Decl containing S.
242/// \param StmtsByHash Output parameter that will contain the hash codes for
243/// each StmtSequence in the given Stmt.
244/// \return The hash code of the given Stmt.
245///
246/// If the given Stmt is a CompoundStmt, this method will also generate
247/// hashes for all possible StmtSequences in the children of this Stmt.
248static size_t249saveHash(const Stmt *S, const Decl *D,250std::vector<std::pair<size_t, StmtSequence>> &StmtsByHash) {251llvm::MD5 Hash;252ASTContext &Context = D->getASTContext();253
254CloneTypeIIStmtDataCollector<llvm::MD5>(S, Context, Hash);255
256auto CS = dyn_cast<CompoundStmt>(S);257SmallVector<size_t, 8> ChildHashes;258
259for (const Stmt *Child : S->children()) {260if (Child == nullptr) {261ChildHashes.push_back(0);262continue;263}264size_t ChildHash = saveHash(Child, D, StmtsByHash);265Hash.update(266StringRef(reinterpret_cast<char *>(&ChildHash), sizeof(ChildHash)));267ChildHashes.push_back(ChildHash);268}269
270if (CS) {271// If we're in a CompoundStmt, we hash all possible combinations of child272// statements to find clones in those subsequences.273// We first go through every possible starting position of a subsequence.274for (unsigned Pos = 0; Pos < CS->size(); ++Pos) {275// Then we try all possible lengths this subsequence could have and276// reuse the same hash object to make sure we only hash every child277// hash exactly once.278llvm::MD5 Hash;279for (unsigned Length = 1; Length <= CS->size() - Pos; ++Length) {280// Grab the current child hash and put it into our hash. We do281// -1 on the index because we start counting the length at 1.282size_t ChildHash = ChildHashes[Pos + Length - 1];283Hash.update(284StringRef(reinterpret_cast<char *>(&ChildHash), sizeof(ChildHash)));285// If we have at least two elements in our subsequence, we can start286// saving it.287if (Length > 1) {288llvm::MD5 SubHash = Hash;289StmtsByHash.push_back(std::make_pair(290createHash(SubHash), StmtSequence(CS, D, Pos, Pos + Length)));291}292}293}294}295
296size_t HashCode = createHash(Hash);297StmtsByHash.push_back(std::make_pair(HashCode, StmtSequence(S, D)));298return HashCode;299}
300
301namespace {302/// Wrapper around FoldingSetNodeID that it can be used as the template
303/// argument of the StmtDataCollector.
304class FoldingSetNodeIDWrapper {305
306llvm::FoldingSetNodeID &FS;307
308public:309FoldingSetNodeIDWrapper(llvm::FoldingSetNodeID &FS) : FS(FS) {}310
311void update(StringRef Str) { FS.AddString(Str); }312};313} // end anonymous namespace314
315/// Writes the relevant data from all statements and child statements
316/// in the given StmtSequence into the given FoldingSetNodeID.
317static void CollectStmtSequenceData(const StmtSequence &Sequence,318FoldingSetNodeIDWrapper &OutputData) {319for (const Stmt *S : Sequence) {320CloneTypeIIStmtDataCollector<FoldingSetNodeIDWrapper>(321S, Sequence.getASTContext(), OutputData);322
323for (const Stmt *Child : S->children()) {324if (!Child)325continue;326
327CollectStmtSequenceData(StmtSequence(Child, Sequence.getContainingDecl()),328OutputData);329}330}331}
332
333/// Returns true if both sequences are clones of each other.
334static bool areSequencesClones(const StmtSequence &LHS,335const StmtSequence &RHS) {336// We collect the data from all statements in the sequence as we did before337// when generating a hash value for each sequence. But this time we don't338// hash the collected data and compare the whole data set instead. This339// prevents any false-positives due to hash code collisions.340llvm::FoldingSetNodeID DataLHS, DataRHS;341FoldingSetNodeIDWrapper LHSWrapper(DataLHS);342FoldingSetNodeIDWrapper RHSWrapper(DataRHS);343
344CollectStmtSequenceData(LHS, LHSWrapper);345CollectStmtSequenceData(RHS, RHSWrapper);346
347return DataLHS == DataRHS;348}
349
350void RecursiveCloneTypeIIHashConstraint::constrain(351std::vector<CloneDetector::CloneGroup> &Sequences) {352// FIXME: Maybe we can do this in-place and don't need this additional vector.353std::vector<CloneDetector::CloneGroup> Result;354
355for (CloneDetector::CloneGroup &Group : Sequences) {356// We assume in the following code that the Group is non-empty, so we357// skip all empty groups.358if (Group.empty())359continue;360
361std::vector<std::pair<size_t, StmtSequence>> StmtsByHash;362
363// Generate hash codes for all children of S and save them in StmtsByHash.364for (const StmtSequence &S : Group) {365saveHash(S.front(), S.getContainingDecl(), StmtsByHash);366}367
368// Sort hash_codes in StmtsByHash.369llvm::stable_sort(StmtsByHash, llvm::less_first());370
371// Check for each StmtSequence if its successor has the same hash value.372// We don't check the last StmtSequence as it has no successor.373// Note: The 'size - 1 ' in the condition is safe because we check for an374// empty Group vector at the beginning of this function.375for (unsigned i = 0; i < StmtsByHash.size() - 1; ++i) {376const auto Current = StmtsByHash[i];377
378// It's likely that we just found a sequence of StmtSequences that379// represent a CloneGroup, so we create a new group and start checking and380// adding the StmtSequences in this sequence.381CloneDetector::CloneGroup NewGroup;382
383size_t PrototypeHash = Current.first;384
385for (; i < StmtsByHash.size(); ++i) {386// A different hash value means we have reached the end of the sequence.387if (PrototypeHash != StmtsByHash[i].first) {388// The current sequence could be the start of a new CloneGroup. So we389// decrement i so that we visit it again in the outer loop.390// Note: i can never be 0 at this point because we are just comparing391// the hash of the Current StmtSequence with itself in the 'if' above.392assert(i != 0);393--i;394break;395}396// Same hash value means we should add the StmtSequence to the current397// group.398NewGroup.push_back(StmtsByHash[i].second);399}400
401// We created a new clone group with matching hash codes and move it to402// the result vector.403Result.push_back(NewGroup);404}405}406// Sequences is the output parameter, so we copy our result into it.407Sequences = Result;408}
409
410void RecursiveCloneTypeIIVerifyConstraint::constrain(411std::vector<CloneDetector::CloneGroup> &Sequences) {412CloneConstraint::splitCloneGroups(413Sequences, [](const StmtSequence &A, const StmtSequence &B) {414return areSequencesClones(A, B);415});416}
417
418size_t MinComplexityConstraint::calculateStmtComplexity(419const StmtSequence &Seq, std::size_t Limit,420const std::string &ParentMacroStack) {421if (Seq.empty())422return 0;423
424size_t Complexity = 1;425
426ASTContext &Context = Seq.getASTContext();427
428// Look up what macros expanded into the current statement.429std::string MacroStack =430data_collection::getMacroStack(Seq.getBeginLoc(), Context);431
432// First, check if ParentMacroStack is not empty which means we are currently433// dealing with a parent statement which was expanded from a macro.434// If this parent statement was expanded from the same macros as this435// statement, we reduce the initial complexity of this statement to zero.436// This causes that a group of statements that were generated by a single437// macro expansion will only increase the total complexity by one.438// Note: This is not the final complexity of this statement as we still439// add the complexity of the child statements to the complexity value.440if (!ParentMacroStack.empty() && MacroStack == ParentMacroStack) {441Complexity = 0;442}443
444// Iterate over the Stmts in the StmtSequence and add their complexity values445// to the current complexity value.446if (Seq.holdsSequence()) {447for (const Stmt *S : Seq) {448Complexity += calculateStmtComplexity(449StmtSequence(S, Seq.getContainingDecl()), Limit, MacroStack);450if (Complexity >= Limit)451return Limit;452}453} else {454for (const Stmt *S : Seq.front()->children()) {455Complexity += calculateStmtComplexity(456StmtSequence(S, Seq.getContainingDecl()), Limit, MacroStack);457if (Complexity >= Limit)458return Limit;459}460}461return Complexity;462}
463
464void MatchingVariablePatternConstraint::constrain(465std::vector<CloneDetector::CloneGroup> &CloneGroups) {466CloneConstraint::splitCloneGroups(467CloneGroups, [](const StmtSequence &A, const StmtSequence &B) {468VariablePattern PatternA(A);469VariablePattern PatternB(B);470return PatternA.countPatternDifferences(PatternB) == 0;471});472}
473
474void CloneConstraint::splitCloneGroups(475std::vector<CloneDetector::CloneGroup> &CloneGroups,476llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)>477Compare) {478std::vector<CloneDetector::CloneGroup> Result;479for (auto &HashGroup : CloneGroups) {480// Contains all indexes in HashGroup that were already added to a481// CloneGroup.482std::vector<char> Indexes;483Indexes.resize(HashGroup.size());484
485for (unsigned i = 0; i < HashGroup.size(); ++i) {486// Skip indexes that are already part of a CloneGroup.487if (Indexes[i])488continue;489
490// Pick the first unhandled StmtSequence and consider it as the491// beginning492// of a new CloneGroup for now.493// We don't add i to Indexes because we never iterate back.494StmtSequence Prototype = HashGroup[i];495CloneDetector::CloneGroup PotentialGroup = {Prototype};496++Indexes[i];497
498// Check all following StmtSequences for clones.499for (unsigned j = i + 1; j < HashGroup.size(); ++j) {500// Skip indexes that are already part of a CloneGroup.501if (Indexes[j])502continue;503
504// If a following StmtSequence belongs to our CloneGroup, we add it.505const StmtSequence &Candidate = HashGroup[j];506
507if (!Compare(Prototype, Candidate))508continue;509
510PotentialGroup.push_back(Candidate);511// Make sure we never visit this StmtSequence again.512++Indexes[j];513}514
515// Otherwise, add it to the result and continue searching for more516// groups.517Result.push_back(PotentialGroup);518}519
520assert(llvm::all_of(Indexes, [](char c) { return c == 1; }));521}522CloneGroups = Result;523}
524
525void VariablePattern::addVariableOccurence(const VarDecl *VarDecl,526const Stmt *Mention) {527// First check if we already reference this variable528for (size_t KindIndex = 0; KindIndex < Variables.size(); ++KindIndex) {529if (Variables[KindIndex] == VarDecl) {530// If yes, add a new occurrence that points to the existing entry in531// the Variables vector.532Occurences.emplace_back(KindIndex, Mention);533return;534}535}536// If this variable wasn't already referenced, add it to the list of537// referenced variables and add a occurrence that points to this new entry.538Occurences.emplace_back(Variables.size(), Mention);539Variables.push_back(VarDecl);540}
541
542void VariablePattern::addVariables(const Stmt *S) {543// Sometimes we get a nullptr (such as from IfStmts which often have nullptr544// children). We skip such statements as they don't reference any545// variables.546if (!S)547return;548
549// Check if S is a reference to a variable. If yes, add it to the pattern.550if (auto D = dyn_cast<DeclRefExpr>(S)) {551if (auto VD = dyn_cast<VarDecl>(D->getDecl()->getCanonicalDecl()))552addVariableOccurence(VD, D);553}554
555// Recursively check all children of the given statement.556for (const Stmt *Child : S->children()) {557addVariables(Child);558}559}
560
561unsigned VariablePattern::countPatternDifferences(562const VariablePattern &Other,563VariablePattern::SuspiciousClonePair *FirstMismatch) {564unsigned NumberOfDifferences = 0;565
566assert(Other.Occurences.size() == Occurences.size());567for (unsigned i = 0; i < Occurences.size(); ++i) {568auto ThisOccurence = Occurences[i];569auto OtherOccurence = Other.Occurences[i];570if (ThisOccurence.KindID == OtherOccurence.KindID)571continue;572
573++NumberOfDifferences;574
575// If FirstMismatch is not a nullptr, we need to store information about576// the first difference between the two patterns.577if (FirstMismatch == nullptr)578continue;579
580// Only proceed if we just found the first difference as we only store581// information about the first difference.582if (NumberOfDifferences != 1)583continue;584
585const VarDecl *FirstSuggestion = nullptr;586// If there is a variable available in the list of referenced variables587// which wouldn't break the pattern if it is used in place of the588// current variable, we provide this variable as the suggested fix.589if (OtherOccurence.KindID < Variables.size())590FirstSuggestion = Variables[OtherOccurence.KindID];591
592// Store information about the first clone.593FirstMismatch->FirstCloneInfo =594VariablePattern::SuspiciousClonePair::SuspiciousCloneInfo(595Variables[ThisOccurence.KindID], ThisOccurence.Mention,596FirstSuggestion);597
598// Same as above but with the other clone. We do this for both clones as599// we don't know which clone is the one containing the unintended600// pattern error.601const VarDecl *SecondSuggestion = nullptr;602if (ThisOccurence.KindID < Other.Variables.size())603SecondSuggestion = Other.Variables[ThisOccurence.KindID];604
605// Store information about the second clone.606FirstMismatch->SecondCloneInfo =607VariablePattern::SuspiciousClonePair::SuspiciousCloneInfo(608Other.Variables[OtherOccurence.KindID], OtherOccurence.Mention,609SecondSuggestion);610
611// SuspiciousClonePair guarantees that the first clone always has a612// suggested variable associated with it. As we know that one of the two613// clones in the pair always has suggestion, we swap the two clones614// in case the first clone has no suggested variable which means that615// the second clone has a suggested variable and should be first.616if (!FirstMismatch->FirstCloneInfo.Suggestion)617std::swap(FirstMismatch->FirstCloneInfo, FirstMismatch->SecondCloneInfo);618
619// This ensures that we always have at least one suggestion in a pair.620assert(FirstMismatch->FirstCloneInfo.Suggestion);621}622
623return NumberOfDifferences;624}
625