1 //===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 /// 10 /// /file 11 /// This file defines classes for searching and anlyzing source code clones. 12 /// 13 //===----------------------------------------------------------------------===// 14 15 #ifndef LLVM_CLANG_AST_CLONEDETECTION_H 16 #define LLVM_CLANG_AST_CLONEDETECTION_H 17 18 #include "clang/Basic/SourceLocation.h" 19 #include "llvm/ADT/SmallVector.h" 20 #include <vector> 21 22 namespace clang { 23 24 class Stmt; 25 class Decl; 26 class VarDecl; 27 class ASTContext; 28 class CompoundStmt; 29 30 /// Identifies a list of statements. 31 /// 32 /// Can either identify a single arbitrary Stmt object, a continuous sequence of 33 /// child statements inside a CompoundStmt or no statements at all. 34 class StmtSequence { 35 /// If this object identifies a sequence of statements inside a CompoundStmt, 36 /// S points to this CompoundStmt. If this object only identifies a single 37 /// Stmt, then S is a pointer to this Stmt. 38 const Stmt *S; 39 40 /// The declaration that contains the statements. 41 const Decl *D; 42 43 /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence 44 /// instance is representing the CompoundStmt children inside the array 45 /// [StartIndex, EndIndex). 46 unsigned StartIndex; 47 unsigned EndIndex; 48 49 public: 50 /// Constructs a StmtSequence holding multiple statements. 51 /// 52 /// The resulting StmtSequence identifies a continuous sequence of statements 53 /// in the body of the given CompoundStmt. Which statements of the body should 54 /// be identified needs to be specified by providing a start and end index 55 /// that describe a non-empty sub-array in the body of the given CompoundStmt. 56 /// 57 /// \param Stmt A CompoundStmt that contains all statements in its body. 58 /// \param D The Decl containing this Stmt. 59 /// \param StartIndex The inclusive start index in the children array of 60 /// \p Stmt 61 /// \param EndIndex The exclusive end index in the children array of \p Stmt. 62 StmtSequence(const CompoundStmt *Stmt, const Decl *D, unsigned StartIndex, 63 unsigned EndIndex); 64 65 /// Constructs a StmtSequence holding a single statement. 66 /// 67 /// \param Stmt An arbitrary Stmt. 68 /// \param D The Decl containing this Stmt. 69 StmtSequence(const Stmt *Stmt, const Decl *D); 70 71 /// Constructs an empty StmtSequence. 72 StmtSequence(); 73 74 typedef const Stmt *const *iterator; 75 76 /// Returns an iterator pointing to the first statement in this sequence. 77 iterator begin() const; 78 79 /// Returns an iterator pointing behind the last statement in this sequence. 80 iterator end() const; 81 82 /// Returns the first statement in this sequence. 83 /// 84 /// This method should only be called on a non-empty StmtSequence object. 85 const Stmt *front() const { 86 assert(!empty()); 87 return begin()[0]; 88 } 89 90 /// Returns the last statement in this sequence. 91 /// 92 /// This method should only be called on a non-empty StmtSequence object. 93 const Stmt *back() const { 94 assert(!empty()); 95 return begin()[size() - 1]; 96 } 97 98 /// Returns the number of statements this object holds. 99 unsigned size() const { 100 if (holdsSequence()) 101 return EndIndex - StartIndex; 102 if (S == nullptr) 103 return 0; 104 return 1; 105 } 106 107 /// Returns true if and only if this StmtSequence contains no statements. 108 bool empty() const { return size() == 0; } 109 110 /// Returns the related ASTContext for the stored Stmts. 111 ASTContext &getASTContext() const; 112 113 /// Returns the declaration that contains the stored Stmts. 114 const Decl *getContainingDecl() const { 115 assert(D); 116 return D; 117 } 118 119 /// Returns true if this objects holds a list of statements. 120 bool holdsSequence() const { return EndIndex != 0; } 121 122 /// Returns the start sourcelocation of the first statement in this sequence. 123 /// 124 /// This method should only be called on a non-empty StmtSequence object. 125 SourceLocation getStartLoc() const; 126 127 /// Returns the end sourcelocation of the last statement in this sequence. 128 /// 129 /// This method should only be called on a non-empty StmtSequence object. 130 SourceLocation getEndLoc() const; 131 132 /// Returns the source range of the whole sequence - from the beginning 133 /// of the first statement to the end of the last statement. 134 SourceRange getSourceRange() const; 135 136 bool operator==(const StmtSequence &Other) const { 137 return std::tie(S, StartIndex, EndIndex) == 138 std::tie(Other.S, Other.StartIndex, Other.EndIndex); 139 } 140 141 bool operator!=(const StmtSequence &Other) const { 142 return std::tie(S, StartIndex, EndIndex) != 143 std::tie(Other.S, Other.StartIndex, Other.EndIndex); 144 } 145 146 /// Returns true if and only if this sequence covers a source range that 147 /// contains the source range of the given sequence \p Other. 148 /// 149 /// This method should only be called on a non-empty StmtSequence object 150 /// and passed a non-empty StmtSequence object. 151 bool contains(const StmtSequence &Other) const; 152 }; 153 154 /// Searches for similar subtrees in the AST. 155 /// 156 /// First, this class needs several declarations with statement bodies which 157 /// can be passed via analyzeCodeBody. Afterwards all statements can be 158 /// searched for clones by calling findClones with a given list of constraints 159 /// that should specify the wanted properties of the clones. 160 /// 161 /// The result of findClones can be further constrained with the constrainClones 162 /// method. 163 /// 164 /// This class only searches for clones in exectuable source code 165 /// (e.g. function bodies). Other clones (e.g. cloned comments or declarations) 166 /// are not supported. 167 class CloneDetector { 168 169 public: 170 /// A collection of StmtSequences that share an arbitrary property. 171 typedef llvm::SmallVector<StmtSequence, 8> CloneGroup; 172 173 /// Generates and stores search data for all statements in the body of 174 /// the given Decl. 175 void analyzeCodeBody(const Decl *D); 176 177 /// Constrains the given list of clone groups with the given constraint. 178 /// 179 /// The constraint is expected to have a method with the signature 180 /// `void constrain(std::vector<CloneDetector::CloneGroup> &Sequences)` 181 /// as this is the interface that the CloneDetector uses for applying the 182 /// constraint. The constraint is supposed to directly modify the passed list 183 /// so that all clones in the list fulfill the specific property this 184 /// constraint ensures. 185 template <typename T> 186 static void constrainClones(std::vector<CloneGroup> &CloneGroups, T C) { 187 C.constrain(CloneGroups); 188 } 189 190 /// Constrains the given list of clone groups with the given list of 191 /// constraints. 192 /// 193 /// The constraints are applied in sequence in the order in which they are 194 /// passed to this function. 195 template <typename T1, typename... Ts> 196 static void constrainClones(std::vector<CloneGroup> &CloneGroups, T1 C, 197 Ts... ConstraintList) { 198 constrainClones(CloneGroups, C); 199 constrainClones(CloneGroups, ConstraintList...); 200 } 201 202 /// Searches for clones in all previously passed statements. 203 /// \param Result Output parameter to which all created clone groups are 204 /// added. 205 /// \param ConstraintList The constraints that should be applied to the 206 // result. 207 template <typename... Ts> 208 void findClones(std::vector<CloneGroup> &Result, Ts... ConstraintList) { 209 // The initial assumption is that there is only one clone group and every 210 // statement is a clone of the others. This clone group will then be 211 // split up with the help of the constraints. 212 CloneGroup AllClones; 213 AllClones.reserve(Sequences.size()); 214 for (const auto &C : Sequences) { 215 AllClones.push_back(C); 216 } 217 218 Result.push_back(AllClones); 219 220 constrainClones(Result, ConstraintList...); 221 } 222 223 private: 224 CloneGroup Sequences; 225 }; 226 227 /// This class is a utility class that contains utility functions for building 228 /// custom constraints. 229 class CloneConstraint { 230 public: 231 /// Removes all groups by using a filter function. 232 /// \param CloneGroups The list of CloneGroups that is supposed to be 233 /// filtered. 234 /// \param Filter The filter function that should return true for all groups 235 /// that should be removed from the list. 236 static void 237 filterGroups(std::vector<CloneDetector::CloneGroup> &CloneGroups, 238 std::function<bool(const CloneDetector::CloneGroup &)> Filter) { 239 CloneGroups.erase( 240 std::remove_if(CloneGroups.begin(), CloneGroups.end(), Filter), 241 CloneGroups.end()); 242 } 243 244 /// Splits the given CloneGroups until the given Compare function returns true 245 /// for all clones in a single group. 246 /// \param CloneGroups A list of CloneGroups that should be modified. 247 /// \param Compare The comparison function that all clones are supposed to 248 /// pass. Should return true if and only if two clones belong 249 /// to the same CloneGroup. 250 static void splitCloneGroups( 251 std::vector<CloneDetector::CloneGroup> &CloneGroups, 252 std::function<bool(const StmtSequence &, const StmtSequence &)> Compare); 253 }; 254 255 /// Searches all children of the given clones for type II clones (i.e. they are 256 /// identical in every aspect beside the used variable names). 257 class RecursiveCloneTypeIIConstraint { 258 259 /// Generates and saves a hash code for the given Stmt. 260 /// \param S The given Stmt. 261 /// \param D The Decl containing S. 262 /// \param StmtsByHash Output parameter that will contain the hash codes for 263 /// each StmtSequence in the given Stmt. 264 /// \return The hash code of the given Stmt. 265 /// 266 /// If the given Stmt is a CompoundStmt, this method will also generate 267 /// hashes for all possible StmtSequences in the children of this Stmt. 268 size_t saveHash(const Stmt *S, const Decl *D, 269 std::vector<std::pair<size_t, StmtSequence>> &StmtsByHash); 270 271 public: 272 void constrain(std::vector<CloneDetector::CloneGroup> &Sequences); 273 }; 274 275 /// Ensures that every clone has at least the given complexity. 276 /// 277 /// Complexity is here defined as the total amount of children of a statement. 278 /// This constraint assumes the first statement in the group is representative 279 /// for all other statements in the group in terms of complexity. 280 class MinComplexityConstraint { 281 unsigned MinComplexity; 282 283 public: 284 MinComplexityConstraint(unsigned MinComplexity) 285 : MinComplexity(MinComplexity) {} 286 287 size_t calculateStmtComplexity(const StmtSequence &Seq, 288 const std::string &ParentMacroStack = ""); 289 290 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) { 291 CloneConstraint::filterGroups( 292 CloneGroups, [this](const CloneDetector::CloneGroup &A) { 293 if (!A.empty()) 294 return calculateStmtComplexity(A.front()) < MinComplexity; 295 else 296 return false; 297 }); 298 } 299 }; 300 301 /// Ensures that all clone groups contain at least the given amount of clones. 302 class MinGroupSizeConstraint { 303 unsigned MinGroupSize; 304 305 public: 306 MinGroupSizeConstraint(unsigned MinGroupSize = 2) 307 : MinGroupSize(MinGroupSize) {} 308 309 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) { 310 CloneConstraint::filterGroups(CloneGroups, 311 [this](const CloneDetector::CloneGroup &A) { 312 return A.size() < MinGroupSize; 313 }); 314 } 315 }; 316 317 /// Ensures that no clone group fully contains another clone group. 318 struct OnlyLargestCloneConstraint { 319 void constrain(std::vector<CloneDetector::CloneGroup> &Result); 320 }; 321 322 /// Analyzes the pattern of the referenced variables in a statement. 323 class VariablePattern { 324 325 /// Describes an occurence of a variable reference in a statement. 326 struct VariableOccurence { 327 /// The index of the associated VarDecl in the Variables vector. 328 size_t KindID; 329 /// The statement in the code where the variable was referenced. 330 const Stmt *Mention; 331 332 VariableOccurence(size_t KindID, const Stmt *Mention) 333 : KindID(KindID), Mention(Mention) {} 334 }; 335 336 /// All occurences of referenced variables in the order of appearance. 337 std::vector<VariableOccurence> Occurences; 338 /// List of referenced variables in the order of appearance. 339 /// Every item in this list is unique. 340 std::vector<const VarDecl *> Variables; 341 342 /// Adds a new variable referenced to this pattern. 343 /// \param VarDecl The declaration of the variable that is referenced. 344 /// \param Mention The SourceRange where this variable is referenced. 345 void addVariableOccurence(const VarDecl *VarDecl, const Stmt *Mention); 346 347 /// Adds each referenced variable from the given statement. 348 void addVariables(const Stmt *S); 349 350 public: 351 /// Creates an VariablePattern object with information about the given 352 /// StmtSequence. 353 VariablePattern(const StmtSequence &Sequence) { 354 for (const Stmt *S : Sequence) 355 addVariables(S); 356 } 357 358 /// Describes two clones that reference their variables in a different pattern 359 /// which could indicate a programming error. 360 struct SuspiciousClonePair { 361 /// Utility class holding the relevant information about a single 362 /// clone in this pair. 363 struct SuspiciousCloneInfo { 364 /// The variable which referencing in this clone was against the pattern. 365 const VarDecl *Variable; 366 /// Where the variable was referenced. 367 const Stmt *Mention; 368 /// The variable that should have been referenced to follow the pattern. 369 /// If Suggestion is a nullptr then it's not possible to fix the pattern 370 /// by referencing a different variable in this clone. 371 const VarDecl *Suggestion; 372 SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention, 373 const VarDecl *Suggestion) 374 : Variable(Variable), Mention(Mention), Suggestion(Suggestion) {} 375 SuspiciousCloneInfo() {} 376 }; 377 /// The first clone in the pair which always has a suggested variable. 378 SuspiciousCloneInfo FirstCloneInfo; 379 /// This other clone in the pair which can have a suggested variable. 380 SuspiciousCloneInfo SecondCloneInfo; 381 }; 382 383 /// Counts the differences between this pattern and the given one. 384 /// \param Other The given VariablePattern to compare with. 385 /// \param FirstMismatch Output parameter that will be filled with information 386 /// about the first difference between the two patterns. This parameter 387 /// can be a nullptr, in which case it will be ignored. 388 /// \return Returns the number of differences between the pattern this object 389 /// is following and the given VariablePattern. 390 /// 391 /// For example, the following statements all have the same pattern and this 392 /// function would return zero: 393 /// 394 /// if (a < b) return a; return b; 395 /// if (x < y) return x; return y; 396 /// if (u2 < u1) return u2; return u1; 397 /// 398 /// But the following statement has a different pattern (note the changed 399 /// variables in the return statements) and would have two differences when 400 /// compared with one of the statements above. 401 /// 402 /// if (a < b) return b; return a; 403 /// 404 /// This function should only be called if the related statements of the given 405 /// pattern and the statements of this objects are clones of each other. 406 unsigned countPatternDifferences( 407 const VariablePattern &Other, 408 VariablePattern::SuspiciousClonePair *FirstMismatch = nullptr); 409 }; 410 411 /// Ensures that all clones reference variables in the same pattern. 412 struct MatchingVariablePatternConstraint { 413 void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups); 414 }; 415 416 } // end namespace clang 417 418 #endif // LLVM_CLANG_AST_CLONEDETECTION_H 419