1 //===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 /// 10 /// \file 11 /// This file defines classes for searching and analyzing source code clones. 12 /// 13 //===----------------------------------------------------------------------===// 14 15 #ifndef LLVM_CLANG_AST_CLONEDETECTION_H 16 #define LLVM_CLANG_AST_CLONEDETECTION_H 17 18 #include "clang/AST/StmtVisitor.h" 19 #include "llvm/Support/Regex.h" 20 #include <vector> 21 22 namespace clang { 23 24 class Stmt; 25 class Decl; 26 class VarDecl; 27 class ASTContext; 28 class CompoundStmt; 29 30 /// Identifies a list of statements. 31 /// 32 /// Can either identify a single arbitrary Stmt object, a continuous sequence of 33 /// child statements inside a CompoundStmt or no statements at all. 34 class StmtSequence { 35 /// If this object identifies a sequence of statements inside a CompoundStmt, 36 /// S points to this CompoundStmt. If this object only identifies a single 37 /// Stmt, then S is a pointer to this Stmt. 38 const Stmt *S; 39 40 /// The declaration that contains the statements. 41 const Decl *D; 42 43 /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence 44 /// instance is representing the CompoundStmt children inside the array 45 /// [StartIndex, EndIndex). 46 unsigned StartIndex; 47 unsigned EndIndex; 48 49 public: 50 /// Constructs a StmtSequence holding multiple statements. 51 /// 52 /// The resulting StmtSequence identifies a continuous sequence of statements 53 /// in the body of the given CompoundStmt. Which statements of the body should 54 /// be identified needs to be specified by providing a start and end index 55 /// that describe a non-empty sub-array in the body of the given CompoundStmt. 56 /// 57 /// \param Stmt A CompoundStmt that contains all statements in its body. 58 /// \param D The Decl containing this Stmt. 59 /// \param StartIndex The inclusive start index in the children array of 60 /// \p Stmt 61 /// \param EndIndex The exclusive end index in the children array of \p Stmt. 62 StmtSequence(const CompoundStmt *Stmt, const Decl *D, unsigned StartIndex, 63 unsigned EndIndex); 64 65 /// Constructs a StmtSequence holding a single statement. 66 /// 67 /// \param Stmt An arbitrary Stmt. 68 /// \param D The Decl containing this Stmt. 69 StmtSequence(const Stmt *Stmt, const Decl *D); 70 71 /// Constructs an empty StmtSequence. 72 StmtSequence(); 73 74 typedef const Stmt *const *iterator; 75 76 /// Returns an iterator pointing to the first statement in this sequence. 77 iterator begin() const; 78 79 /// Returns an iterator pointing behind the last statement in this sequence. 80 iterator end() const; 81 82 /// Returns the first statement in this sequence. 83 /// 84 /// This method should only be called on a non-empty StmtSequence object. 85 const Stmt *front() const { 86 assert(!empty()); 87 return begin()[0]; 88 } 89 90 /// Returns the last statement in this sequence. 91 /// 92 /// This method should only be called on a non-empty StmtSequence object. 93 const Stmt *back() const { 94 assert(!empty()); 95 return begin()[size() - 1]; 96 } 97 98 /// Returns the number of statements this object holds. 99 unsigned size() const { 100 if (holdsSequence()) 101 return EndIndex - StartIndex; 102 if (S == nullptr) 103 return 0; 104 return 1; 105 } 106 107 /// Returns true if and only if this StmtSequence contains no statements. 108 bool empty() const { return size() == 0; } 109 110 /// Returns the related ASTContext for the stored Stmts. 111 ASTContext &getASTContext() const; 112 113 /// Returns the declaration that contains the stored Stmts. 114 const Decl *getContainingDecl() const { 115 assert(D); 116 return D; 117 } 118 119 /// Returns true if this objects holds a list of statements. 120 bool holdsSequence() const { return EndIndex != 0; } 121 122 /// Returns the start sourcelocation of the first statement in this sequence. 123 /// 124 /// This method should only be called on a non-empty StmtSequence object. 125 SourceLocation getStartLoc() const; 126 127 /// Returns the end sourcelocation of the last statement in this sequence. 128 /// 129 /// This method should only be called on a non-empty StmtSequence object. 130 SourceLocation getEndLoc() const; 131 132 /// Returns the source range of the whole sequence - from the beginning 133 /// of the first statement to the end of the last statement. 134 SourceRange getSourceRange() const; 135 136 bool operator==(const StmtSequence &Other) const { 137 return std::tie(S, StartIndex, EndIndex) == 138 std::tie(Other.S, Other.StartIndex, Other.EndIndex); 139 } 140 141 bool operator!=(const StmtSequence &Other) const { 142 return std::tie(S, StartIndex, EndIndex) != 143 std::tie(Other.S, Other.StartIndex, Other.EndIndex); 144 } 145 146 /// Returns true if and only if this sequence covers a source range that 147 /// contains the source range of the given sequence \p Other. 148 /// 149 /// This method should only be called on a non-empty StmtSequence object 150 /// and passed a non-empty StmtSequence object. 151 bool contains(const StmtSequence &Other) const; 152 }; 153 154 /// Searches for similar subtrees in the AST. 155 /// 156 /// First, this class needs several declarations with statement bodies which 157 /// can be passed via analyzeCodeBody. Afterwards all statements can be 158 /// searched for clones by calling findClones with a given list of constraints 159 /// that should specify the wanted properties of the clones. 160 /// 161 /// The result of findClones can be further constrained with the constrainClones 162 /// method. 163 /// 164 /// This class only searches for clones in exectuable source code 165 /// (e.g. function bodies). Other clones (e.g. cloned comments or declarations) 166 /// are not supported. 167 class CloneDetector { 168 169 public: 170 /// A collection of StmtSequences that share an arbitrary property. 171 typedef llvm::SmallVector<StmtSequence, 8> CloneGroup; 172 173 /// Generates and stores search data for all statements in the body of 174 /// the given Decl. 175 void analyzeCodeBody(const Decl *D); 176 177 /// Constrains the given list of clone groups with the given constraint. 178 /// 179 /// The constraint is expected to have a method with the signature 180 /// `void constrain(std::vector<CloneDetector::CloneGroup> &Sequences)` 181 /// as this is the interface that the CloneDetector uses for applying the 182 /// constraint. The constraint is supposed to directly modify the passed list 183 /// so that all clones in the list fulfill the specific property this 184 /// constraint ensures. 185 template <typename T> 186 static void constrainClones(std::vector<CloneGroup> &CloneGroups, T C) { 187 C.constrain(CloneGroups); 188 } 189 190 /// Constrains the given list of clone groups with the given list of 191 /// constraints. 192 /// 193 /// The constraints are applied in sequence in the order in which they are 194 /// passed to this function. 195 template <typename T1, typename... Ts> 196 static void constrainClones(std::vector<CloneGroup> &CloneGroups, T1 C, 197 Ts... ConstraintList) { 198 constrainClones(CloneGroups, C); 199 constrainClones(CloneGroups, ConstraintList...); 200 } 201 202 /// Searches for clones in all previously passed statements. 203 /// \param Result Output parameter to which all created clone groups are 204 /// added. 205 /// \param ConstraintList The constraints that should be applied to the 206 // result. 207 template <typename... Ts> 208 void findClones(std::vector<CloneGroup> &Result, Ts... ConstraintList) { 209 // The initial assumption is that there is only one clone group and every 210 // statement is a clone of the others. This clone group will then be 211 // split up with the help of the constraints. 212 CloneGroup AllClones; 213 AllClones.reserve(Sequences.size()); 214 for (const auto &C : Sequences) { 215 AllClones.push_back(C); 216 } 217 218 Result.push_back(AllClones); 219 220 constrainClones(Result, ConstraintList...); 221 } 222 223 private: 224 CloneGroup Sequences; 225 }; 226 227 /// This class is a utility class that contains utility functions for building 228 /// custom constraints. 229 class CloneConstraint { 230 public: 231 /// Removes all groups by using a filter function. 232 /// \param CloneGroups The list of CloneGroups that is supposed to be 233 /// filtered. 234 /// \param Filter The filter function that should return true for all groups 235 /// that should be removed from the list. 236 static void filterGroups( 237 std::vector<CloneDetector::CloneGroup> &CloneGroups, 238 llvm::function_ref<bool(const CloneDetector::CloneGroup &)> Filter) { 239 CloneGroups.erase( 240 std::remove_if(CloneGroups.begin(), CloneGroups.end(), Filter), 241 CloneGroups.end()); 242 } 243 244 /// Splits the given CloneGroups until the given Compare function returns true 245 /// for all clones in a single group. 246 /// \param CloneGroups A list of CloneGroups that should be modified. 247 /// \param Compare The comparison function that all clones are supposed to 248 /// pass. Should return true if and only if two clones belong 249 /// to the same CloneGroup. 250 static void splitCloneGroups( 251 std::vector<CloneDetector::CloneGroup> &CloneGroups, 252 llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)> 253 Compare); 254 }; 255 256 /// This constraint moves clones into clone groups of type II via hashing. 257 /// 258 /// Clones with different hash values are moved into separate clone groups. 259 /// Collisions are possible, and this constraint does nothing to address this 260 /// them. Add the slower RecursiveCloneTypeIIVerifyConstraint later in the 261 /// constraint chain, not necessarily immediately, to eliminate hash collisions 262 /// through a more detailed analysis. 263 class RecursiveCloneTypeIIHashConstraint { 264 public: 265 void constrain(std::vector<CloneDetector::CloneGroup> &Sequences); 266 }; 267 268 /// This constraint moves clones into clone groups of type II by comparing them. 269 /// 270 /// Clones that aren't type II clones are moved into separate clone groups. 271 /// In contrast to the RecursiveCloneTypeIIHashConstraint, all clones in a clone 272 /// group are guaranteed to be be type II clones of each other, but it is too 273 /// slow to efficiently handle large amounts of clones. 274 class RecursiveCloneTypeIIVerifyConstraint { 275 public: 276 void constrain(std::vector<CloneDetector::CloneGroup> &Sequences); 277 }; 278 279 /// Ensures that every clone has at least the given complexity. 280 /// 281 /// Complexity is here defined as the total amount of children of a statement. 282 /// This constraint assumes the first statement in the group is representative 283 /// for all other statements in the group in terms of complexity. 284 class MinComplexityConstraint { 285 unsigned MinComplexity; 286 287 public: 288 MinComplexityConstraint(unsigned MinComplexity) 289 : MinComplexity(MinComplexity) {} 290 291 /// Calculates the complexity of the given StmtSequence. 292 /// \param Limit The limit of complexity we probe for. After reaching 293 /// this limit during calculation, this method is exiting 294 /// early to improve performance and returns this limit. 295 size_t calculateStmtComplexity(const StmtSequence &Seq, std::size_t Limit, 296 const std::string &ParentMacroStack = ""); 297 298 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) { 299 CloneConstraint::filterGroups( 300 CloneGroups, [this](const CloneDetector::CloneGroup &A) { 301 if (!A.empty()) 302 return calculateStmtComplexity(A.front(), MinComplexity) < 303 MinComplexity; 304 else 305 return false; 306 }); 307 } 308 }; 309 310 /// Ensures that all clone groups contain at least the given amount of clones. 311 class MinGroupSizeConstraint { 312 unsigned MinGroupSize; 313 314 public: 315 MinGroupSizeConstraint(unsigned MinGroupSize = 2) 316 : MinGroupSize(MinGroupSize) {} 317 318 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) { 319 CloneConstraint::filterGroups(CloneGroups, 320 [this](const CloneDetector::CloneGroup &A) { 321 return A.size() < MinGroupSize; 322 }); 323 } 324 }; 325 326 /// Ensures that no clone group fully contains another clone group. 327 struct OnlyLargestCloneConstraint { 328 void constrain(std::vector<CloneDetector::CloneGroup> &Result); 329 }; 330 331 struct FilenamePatternConstraint { 332 StringRef IgnoredFilesPattern; 333 std::shared_ptr<llvm::Regex> IgnoredFilesRegex; 334 335 FilenamePatternConstraint(StringRef IgnoredFilesPattern) 336 : IgnoredFilesPattern(IgnoredFilesPattern) { 337 IgnoredFilesRegex = std::make_shared<llvm::Regex>("^(" + 338 IgnoredFilesPattern.str() + "$)"); 339 } 340 341 bool isAutoGenerated(const CloneDetector::CloneGroup &Group); 342 343 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) { 344 CloneConstraint::filterGroups( 345 CloneGroups, [this](const CloneDetector::CloneGroup &Group) { 346 return isAutoGenerated(Group); 347 }); 348 } 349 }; 350 351 /// Analyzes the pattern of the referenced variables in a statement. 352 class VariablePattern { 353 354 /// Describes an occurence of a variable reference in a statement. 355 struct VariableOccurence { 356 /// The index of the associated VarDecl in the Variables vector. 357 size_t KindID; 358 /// The statement in the code where the variable was referenced. 359 const Stmt *Mention; 360 361 VariableOccurence(size_t KindID, const Stmt *Mention) 362 : KindID(KindID), Mention(Mention) {} 363 }; 364 365 /// All occurences of referenced variables in the order of appearance. 366 std::vector<VariableOccurence> Occurences; 367 /// List of referenced variables in the order of appearance. 368 /// Every item in this list is unique. 369 std::vector<const VarDecl *> Variables; 370 371 /// Adds a new variable referenced to this pattern. 372 /// \param VarDecl The declaration of the variable that is referenced. 373 /// \param Mention The SourceRange where this variable is referenced. 374 void addVariableOccurence(const VarDecl *VarDecl, const Stmt *Mention); 375 376 /// Adds each referenced variable from the given statement. 377 void addVariables(const Stmt *S); 378 379 public: 380 /// Creates an VariablePattern object with information about the given 381 /// StmtSequence. 382 VariablePattern(const StmtSequence &Sequence) { 383 for (const Stmt *S : Sequence) 384 addVariables(S); 385 } 386 387 /// Describes two clones that reference their variables in a different pattern 388 /// which could indicate a programming error. 389 struct SuspiciousClonePair { 390 /// Utility class holding the relevant information about a single 391 /// clone in this pair. 392 struct SuspiciousCloneInfo { 393 /// The variable which referencing in this clone was against the pattern. 394 const VarDecl *Variable; 395 /// Where the variable was referenced. 396 const Stmt *Mention; 397 /// The variable that should have been referenced to follow the pattern. 398 /// If Suggestion is a nullptr then it's not possible to fix the pattern 399 /// by referencing a different variable in this clone. 400 const VarDecl *Suggestion; 401 SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention, 402 const VarDecl *Suggestion) 403 : Variable(Variable), Mention(Mention), Suggestion(Suggestion) {} 404 SuspiciousCloneInfo() {} 405 }; 406 /// The first clone in the pair which always has a suggested variable. 407 SuspiciousCloneInfo FirstCloneInfo; 408 /// This other clone in the pair which can have a suggested variable. 409 SuspiciousCloneInfo SecondCloneInfo; 410 }; 411 412 /// Counts the differences between this pattern and the given one. 413 /// \param Other The given VariablePattern to compare with. 414 /// \param FirstMismatch Output parameter that will be filled with information 415 /// about the first difference between the two patterns. This parameter 416 /// can be a nullptr, in which case it will be ignored. 417 /// \return Returns the number of differences between the pattern this object 418 /// is following and the given VariablePattern. 419 /// 420 /// For example, the following statements all have the same pattern and this 421 /// function would return zero: 422 /// 423 /// if (a < b) return a; return b; 424 /// if (x < y) return x; return y; 425 /// if (u2 < u1) return u2; return u1; 426 /// 427 /// But the following statement has a different pattern (note the changed 428 /// variables in the return statements) and would have two differences when 429 /// compared with one of the statements above. 430 /// 431 /// if (a < b) return b; return a; 432 /// 433 /// This function should only be called if the related statements of the given 434 /// pattern and the statements of this objects are clones of each other. 435 unsigned countPatternDifferences( 436 const VariablePattern &Other, 437 VariablePattern::SuspiciousClonePair *FirstMismatch = nullptr); 438 }; 439 440 /// Ensures that all clones reference variables in the same pattern. 441 struct MatchingVariablePatternConstraint { 442 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups); 443 }; 444 445 } // end namespace clang 446 447 #endif // LLVM_CLANG_AST_CLONEDETECTION_H 448