Home | History | Annotate | Download | only in Analysis
      1 //===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 ///
     10 /// \file
     11 /// This file defines classes for searching and analyzing source code clones.
     12 ///
     13 //===----------------------------------------------------------------------===//
     14 
     15 #ifndef LLVM_CLANG_AST_CLONEDETECTION_H
     16 #define LLVM_CLANG_AST_CLONEDETECTION_H
     17 
     18 #include "clang/AST/StmtVisitor.h"
     19 #include "llvm/Support/Regex.h"
     20 #include <vector>
     21 
     22 namespace clang {
     23 
     24 class Stmt;
     25 class Decl;
     26 class VarDecl;
     27 class ASTContext;
     28 class CompoundStmt;
     29 
     30 /// Identifies a list of statements.
     31 ///
     32 /// Can either identify a single arbitrary Stmt object, a continuous sequence of
     33 /// child statements inside a CompoundStmt or no statements at all.
     34 class StmtSequence {
     35   /// If this object identifies a sequence of statements inside a CompoundStmt,
     36   /// S points to this CompoundStmt. If this object only identifies a single
     37   /// Stmt, then S is a pointer to this Stmt.
     38   const Stmt *S;
     39 
     40   /// The declaration that contains the statements.
     41   const Decl *D;
     42 
     43   /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence
     44   /// instance is representing the CompoundStmt children inside the array
     45   /// [StartIndex, EndIndex).
     46   unsigned StartIndex;
     47   unsigned EndIndex;
     48 
     49 public:
     50   /// Constructs a StmtSequence holding multiple statements.
     51   ///
     52   /// The resulting StmtSequence identifies a continuous sequence of statements
     53   /// in the body of the given CompoundStmt. Which statements of the body should
     54   /// be identified needs to be specified by providing a start and end index
     55   /// that describe a non-empty sub-array in the body of the given CompoundStmt.
     56   ///
     57   /// \param Stmt A CompoundStmt that contains all statements in its body.
     58   /// \param D The Decl containing this Stmt.
     59   /// \param StartIndex The inclusive start index in the children array of
     60   ///                   \p Stmt
     61   /// \param EndIndex The exclusive end index in the children array of \p Stmt.
     62   StmtSequence(const CompoundStmt *Stmt, const Decl *D, unsigned StartIndex,
     63                unsigned EndIndex);
     64 
     65   /// Constructs a StmtSequence holding a single statement.
     66   ///
     67   /// \param Stmt An arbitrary Stmt.
     68   /// \param D The Decl containing this Stmt.
     69   StmtSequence(const Stmt *Stmt, const Decl *D);
     70 
     71   /// Constructs an empty StmtSequence.
     72   StmtSequence();
     73 
     74   typedef const Stmt *const *iterator;
     75 
     76   /// Returns an iterator pointing to the first statement in this sequence.
     77   iterator begin() const;
     78 
     79   /// Returns an iterator pointing behind the last statement in this sequence.
     80   iterator end() const;
     81 
     82   /// Returns the first statement in this sequence.
     83   ///
     84   /// This method should only be called on a non-empty StmtSequence object.
     85   const Stmt *front() const {
     86     assert(!empty());
     87     return begin()[0];
     88   }
     89 
     90   /// Returns the last statement in this sequence.
     91   ///
     92   /// This method should only be called on a non-empty StmtSequence object.
     93   const Stmt *back() const {
     94     assert(!empty());
     95     return begin()[size() - 1];
     96   }
     97 
     98   /// Returns the number of statements this object holds.
     99   unsigned size() const {
    100     if (holdsSequence())
    101       return EndIndex - StartIndex;
    102     if (S == nullptr)
    103       return 0;
    104     return 1;
    105   }
    106 
    107   /// Returns true if and only if this StmtSequence contains no statements.
    108   bool empty() const { return size() == 0; }
    109 
    110   /// Returns the related ASTContext for the stored Stmts.
    111   ASTContext &getASTContext() const;
    112 
    113   /// Returns the declaration that contains the stored Stmts.
    114   const Decl *getContainingDecl() const {
    115     assert(D);
    116     return D;
    117   }
    118 
    119   /// Returns true if this objects holds a list of statements.
    120   bool holdsSequence() const { return EndIndex != 0; }
    121 
    122   /// Returns the start sourcelocation of the first statement in this sequence.
    123   ///
    124   /// This method should only be called on a non-empty StmtSequence object.
    125   SourceLocation getStartLoc() const;
    126 
    127   /// Returns the end sourcelocation of the last statement in this sequence.
    128   ///
    129   /// This method should only be called on a non-empty StmtSequence object.
    130   SourceLocation getEndLoc() const;
    131 
    132   /// Returns the source range of the whole sequence - from the beginning
    133   /// of the first statement to the end of the last statement.
    134   SourceRange getSourceRange() const;
    135 
    136   bool operator==(const StmtSequence &Other) const {
    137     return std::tie(S, StartIndex, EndIndex) ==
    138            std::tie(Other.S, Other.StartIndex, Other.EndIndex);
    139   }
    140 
    141   bool operator!=(const StmtSequence &Other) const {
    142     return std::tie(S, StartIndex, EndIndex) !=
    143            std::tie(Other.S, Other.StartIndex, Other.EndIndex);
    144   }
    145 
    146   /// Returns true if and only if this sequence covers a source range that
    147   /// contains the source range of the given sequence \p Other.
    148   ///
    149   /// This method should only be called on a non-empty StmtSequence object
    150   /// and passed a non-empty StmtSequence object.
    151   bool contains(const StmtSequence &Other) const;
    152 };
    153 
    154 /// Searches for similar subtrees in the AST.
    155 ///
    156 /// First, this class needs several declarations with statement bodies which
    157 /// can be passed via analyzeCodeBody. Afterwards all statements can be
    158 /// searched for clones by calling findClones with a given list of constraints
    159 /// that should specify the wanted properties of the clones.
    160 ///
    161 /// The result of findClones can be further constrained with the constrainClones
    162 /// method.
    163 ///
    164 /// This class only searches for clones in exectuable source code
    165 /// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
    166 /// are not supported.
    167 class CloneDetector {
    168 
    169 public:
    170   /// A collection of StmtSequences that share an arbitrary property.
    171   typedef llvm::SmallVector<StmtSequence, 8> CloneGroup;
    172 
    173   /// Generates and stores search data for all statements in the body of
    174   /// the given Decl.
    175   void analyzeCodeBody(const Decl *D);
    176 
    177   /// Constrains the given list of clone groups with the given constraint.
    178   ///
    179   /// The constraint is expected to have a method with the signature
    180   ///     `void constrain(std::vector<CloneDetector::CloneGroup> &Sequences)`
    181   /// as this is the interface that the CloneDetector uses for applying the
    182   /// constraint. The constraint is supposed to directly modify the passed list
    183   /// so that all clones in the list fulfill the specific property this
    184   /// constraint ensures.
    185   template <typename T>
    186   static void constrainClones(std::vector<CloneGroup> &CloneGroups, T C) {
    187     C.constrain(CloneGroups);
    188   }
    189 
    190   /// Constrains the given list of clone groups with the given list of
    191   /// constraints.
    192   ///
    193   /// The constraints are applied in sequence in the order in which they are
    194   /// passed to this function.
    195   template <typename T1, typename... Ts>
    196   static void constrainClones(std::vector<CloneGroup> &CloneGroups, T1 C,
    197                               Ts... ConstraintList) {
    198     constrainClones(CloneGroups, C);
    199     constrainClones(CloneGroups, ConstraintList...);
    200   }
    201 
    202   /// Searches for clones in all previously passed statements.
    203   /// \param Result Output parameter to which all created clone groups are
    204   ///               added.
    205   /// \param ConstraintList The constraints that should be applied to the
    206   //         result.
    207   template <typename... Ts>
    208   void findClones(std::vector<CloneGroup> &Result, Ts... ConstraintList) {
    209     // The initial assumption is that there is only one clone group and every
    210     // statement is a clone of the others. This clone group will then be
    211     // split up with the help of the constraints.
    212     CloneGroup AllClones;
    213     AllClones.reserve(Sequences.size());
    214     for (const auto &C : Sequences) {
    215       AllClones.push_back(C);
    216     }
    217 
    218     Result.push_back(AllClones);
    219 
    220     constrainClones(Result, ConstraintList...);
    221   }
    222 
    223 private:
    224   CloneGroup Sequences;
    225 };
    226 
    227 /// This class is a utility class that contains utility functions for building
    228 /// custom constraints.
    229 class CloneConstraint {
    230 public:
    231   /// Removes all groups by using a filter function.
    232   /// \param CloneGroups The list of CloneGroups that is supposed to be
    233   ///                    filtered.
    234   /// \param Filter The filter function that should return true for all groups
    235   ///               that should be removed from the list.
    236   static void filterGroups(
    237       std::vector<CloneDetector::CloneGroup> &CloneGroups,
    238       llvm::function_ref<bool(const CloneDetector::CloneGroup &)> Filter) {
    239     CloneGroups.erase(
    240         std::remove_if(CloneGroups.begin(), CloneGroups.end(), Filter),
    241         CloneGroups.end());
    242   }
    243 
    244   /// Splits the given CloneGroups until the given Compare function returns true
    245   /// for all clones in a single group.
    246   /// \param CloneGroups A list of CloneGroups that should be modified.
    247   /// \param Compare The comparison function that all clones are supposed to
    248   ///                pass. Should return true if and only if two clones belong
    249   ///                to the same CloneGroup.
    250   static void splitCloneGroups(
    251       std::vector<CloneDetector::CloneGroup> &CloneGroups,
    252       llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)>
    253           Compare);
    254 };
    255 
    256 /// This constraint moves clones into clone groups of type II via hashing.
    257 ///
    258 /// Clones with different hash values are moved into separate clone groups.
    259 /// Collisions are possible, and this constraint does nothing to address this
    260 /// them. Add the slower RecursiveCloneTypeIIVerifyConstraint later in the
    261 /// constraint chain, not necessarily immediately, to eliminate hash collisions
    262 /// through a more detailed analysis.
    263 class RecursiveCloneTypeIIHashConstraint {
    264 public:
    265   void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
    266 };
    267 
    268 /// This constraint moves clones into clone groups of type II by comparing them.
    269 ///
    270 /// Clones that aren't type II clones are moved into separate clone groups.
    271 /// In contrast to the RecursiveCloneTypeIIHashConstraint, all clones in a clone
    272 /// group are guaranteed to be be type II clones of each other, but it is too
    273 /// slow to efficiently handle large amounts of clones.
    274 class RecursiveCloneTypeIIVerifyConstraint {
    275 public:
    276   void constrain(std::vector<CloneDetector::CloneGroup> &Sequences);
    277 };
    278 
    279 /// Ensures that every clone has at least the given complexity.
    280 ///
    281 /// Complexity is here defined as the total amount of children of a statement.
    282 /// This constraint assumes the first statement in the group is representative
    283 /// for all other statements in the group in terms of complexity.
    284 class MinComplexityConstraint {
    285   unsigned MinComplexity;
    286 
    287 public:
    288   MinComplexityConstraint(unsigned MinComplexity)
    289       : MinComplexity(MinComplexity) {}
    290 
    291   /// Calculates the complexity of the given StmtSequence.
    292   /// \param Limit The limit of complexity we probe for. After reaching
    293   ///              this limit during calculation, this method is exiting
    294   ///              early to improve performance and returns this limit.
    295   size_t calculateStmtComplexity(const StmtSequence &Seq, std::size_t Limit,
    296                                  const std::string &ParentMacroStack = "");
    297 
    298   void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
    299     CloneConstraint::filterGroups(
    300         CloneGroups, [this](const CloneDetector::CloneGroup &A) {
    301           if (!A.empty())
    302             return calculateStmtComplexity(A.front(), MinComplexity) <
    303                    MinComplexity;
    304           else
    305             return false;
    306         });
    307   }
    308 };
    309 
    310 /// Ensures that all clone groups contain at least the given amount of clones.
    311 class MinGroupSizeConstraint {
    312   unsigned MinGroupSize;
    313 
    314 public:
    315   MinGroupSizeConstraint(unsigned MinGroupSize = 2)
    316       : MinGroupSize(MinGroupSize) {}
    317 
    318   void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
    319     CloneConstraint::filterGroups(CloneGroups,
    320                                   [this](const CloneDetector::CloneGroup &A) {
    321                                     return A.size() < MinGroupSize;
    322                                   });
    323   }
    324 };
    325 
    326 /// Ensures that no clone group fully contains another clone group.
    327 struct OnlyLargestCloneConstraint {
    328   void constrain(std::vector<CloneDetector::CloneGroup> &Result);
    329 };
    330 
    331 struct FilenamePatternConstraint {
    332   StringRef IgnoredFilesPattern;
    333   std::shared_ptr<llvm::Regex> IgnoredFilesRegex;
    334 
    335   FilenamePatternConstraint(StringRef IgnoredFilesPattern)
    336       : IgnoredFilesPattern(IgnoredFilesPattern) {
    337     IgnoredFilesRegex = std::make_shared<llvm::Regex>("^(" +
    338         IgnoredFilesPattern.str() + "$)");
    339   }
    340 
    341   bool isAutoGenerated(const CloneDetector::CloneGroup &Group);
    342 
    343   void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) {
    344     CloneConstraint::filterGroups(
    345         CloneGroups, [this](const CloneDetector::CloneGroup &Group) {
    346           return isAutoGenerated(Group);
    347         });
    348   }
    349 };
    350 
    351 /// Analyzes the pattern of the referenced variables in a statement.
    352 class VariablePattern {
    353 
    354   /// Describes an occurence of a variable reference in a statement.
    355   struct VariableOccurence {
    356     /// The index of the associated VarDecl in the Variables vector.
    357     size_t KindID;
    358     /// The statement in the code where the variable was referenced.
    359     const Stmt *Mention;
    360 
    361     VariableOccurence(size_t KindID, const Stmt *Mention)
    362         : KindID(KindID), Mention(Mention) {}
    363   };
    364 
    365   /// All occurences of referenced variables in the order of appearance.
    366   std::vector<VariableOccurence> Occurences;
    367   /// List of referenced variables in the order of appearance.
    368   /// Every item in this list is unique.
    369   std::vector<const VarDecl *> Variables;
    370 
    371   /// Adds a new variable referenced to this pattern.
    372   /// \param VarDecl The declaration of the variable that is referenced.
    373   /// \param Mention The SourceRange where this variable is referenced.
    374   void addVariableOccurence(const VarDecl *VarDecl, const Stmt *Mention);
    375 
    376   /// Adds each referenced variable from the given statement.
    377   void addVariables(const Stmt *S);
    378 
    379 public:
    380   /// Creates an VariablePattern object with information about the given
    381   /// StmtSequence.
    382   VariablePattern(const StmtSequence &Sequence) {
    383     for (const Stmt *S : Sequence)
    384       addVariables(S);
    385   }
    386 
    387   /// Describes two clones that reference their variables in a different pattern
    388   /// which could indicate a programming error.
    389   struct SuspiciousClonePair {
    390     /// Utility class holding the relevant information about a single
    391     /// clone in this pair.
    392     struct SuspiciousCloneInfo {
    393       /// The variable which referencing in this clone was against the pattern.
    394       const VarDecl *Variable;
    395       /// Where the variable was referenced.
    396       const Stmt *Mention;
    397       /// The variable that should have been referenced to follow the pattern.
    398       /// If Suggestion is a nullptr then it's not possible to fix the pattern
    399       /// by referencing a different variable in this clone.
    400       const VarDecl *Suggestion;
    401       SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention,
    402                           const VarDecl *Suggestion)
    403           : Variable(Variable), Mention(Mention), Suggestion(Suggestion) {}
    404       SuspiciousCloneInfo() {}
    405     };
    406     /// The first clone in the pair which always has a suggested variable.
    407     SuspiciousCloneInfo FirstCloneInfo;
    408     /// This other clone in the pair which can have a suggested variable.
    409     SuspiciousCloneInfo SecondCloneInfo;
    410   };
    411 
    412   /// Counts the differences between this pattern and the given one.
    413   /// \param Other The given VariablePattern to compare with.
    414   /// \param FirstMismatch Output parameter that will be filled with information
    415   ///        about the first difference between the two patterns. This parameter
    416   ///        can be a nullptr, in which case it will be ignored.
    417   /// \return Returns the number of differences between the pattern this object
    418   ///         is following and the given VariablePattern.
    419   ///
    420   /// For example, the following statements all have the same pattern and this
    421   /// function would return zero:
    422   ///
    423   ///   if (a < b) return a; return b;
    424   ///   if (x < y) return x; return y;
    425   ///   if (u2 < u1) return u2; return u1;
    426   ///
    427   /// But the following statement has a different pattern (note the changed
    428   /// variables in the return statements) and would have two differences when
    429   /// compared with one of the statements above.
    430   ///
    431   ///   if (a < b) return b; return a;
    432   ///
    433   /// This function should only be called if the related statements of the given
    434   /// pattern and the statements of this objects are clones of each other.
    435   unsigned countPatternDifferences(
    436       const VariablePattern &Other,
    437       VariablePattern::SuspiciousClonePair *FirstMismatch = nullptr);
    438 };
    439 
    440 /// Ensures that all clones reference variables in the same pattern.
    441 struct MatchingVariablePatternConstraint {
    442   void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups);
    443 };
    444 
    445 } // end namespace clang
    446 
    447 #endif // LLVM_CLANG_AST_CLONEDETECTION_H
    448