Home | History | Annotate | Download | only in analyzer
      1 #!/usr/bin/env python
      2 
      3 """
      4 CmpRuns - A simple tool for comparing two static analyzer runs to determine
      5 which reports have been added, removed, or changed.
      6 
      7 This is designed to support automated testing using the static analyzer, from
      8 two perspectives: 
      9   1. To monitor changes in the static analyzer's reports on real code bases, for
     10      regression testing.
     11 
  2. For use by end users who want to integrate regular static analyzer testing
     into a buildbot-like environment.
     14 
     15 Usage:
     16 
     17     # Load the results of both runs, to obtain lists of the corresponding
     18     # AnalysisDiagnostic objects.
     19     #
     20     resultsA = loadResultsFromSingleRun(singleRunInfoA, deleteEmpty)
     21     resultsB = loadResultsFromSingleRun(singleRunInfoB, deleteEmpty)
     22     
     23     # Generate a relation from diagnostics in run A to diagnostics in run B 
     24     # to obtain a list of triples (a, b, confidence). 
     25     diff = compareResults(resultsA, resultsB)
     26            
     27 """
     28 
     29 import os
     30 import plistlib
     31 import CmpRuns
     32 
     33 # Information about analysis run:
     34 # path - the analysis output directory
     35 # root - the name of the root directory, which will be disregarded when 
     36 # determining the source file name
     37 class SingleRunInfo:
     38     def __init__(self, path, root="", verboseLog=None):
     39         self.path = path
     40         self.root = root
     41         self.verboseLog = verboseLog
     42 
     43 class AnalysisDiagnostic:
     44     def __init__(self, data, report, htmlReport):
     45         self._data = data
     46         self._loc = self._data['location']
     47         self._report = report
     48         self._htmlReport = htmlReport
     49 
     50     def getFileName(self):
     51         root = self._report.run.root
     52         fileName = self._report.files[self._loc['file']]
     53         if fileName.startswith(root) :
     54             return fileName[len(root):]  
     55         return fileName
     56 
     57     def getLine(self):
     58         return self._loc['line']
     59         
     60     def getColumn(self):
     61         return self._loc['col']
     62 
     63     def getCategory(self):
     64         return self._data['category']
     65 
     66     def getDescription(self):
     67         return self._data['description']
     68 
     69     def getIssueIdentifier(self) :
     70         id = self.getFileName() + "+"
     71         if 'issue_context' in self._data :
     72           id += self._data['issue_context'] + "+"
     73         if 'issue_hash' in self._data :
     74           id += str(self._data['issue_hash'])
     75         return id
     76 
     77     def getReport(self):
     78         if self._htmlReport is None:
     79             return " "
     80         return os.path.join(self._report.run.path, self._htmlReport)
     81 
     82     def getReadableName(self):
     83         return '%s:%d:%d, %s: %s' % (self.getFileName(), self.getLine(), 
     84                                      self.getColumn(), self.getCategory(), 
     85                                      self.getDescription())
     86         
     87     # Note, the data format is not an API and may change from one analyzer 
     88     # version to another.        
     89     def getRawData(self):
     90         return self._data
     91 
     92 class multidict:
     93     def __init__(self, elts=()):
     94         self.data = {}
     95         for key,value in elts:
     96             self[key] = value
     97     
     98     def __getitem__(self, item):
     99         return self.data[item]
    100     def __setitem__(self, key, value):
    101         if key in self.data:
    102             self.data[key].append(value)
    103         else:
    104             self.data[key] = [value]
    105     def items(self):
    106         return self.data.items()
    107     def values(self):
    108         return self.data.values()
    109     def keys(self):
    110         return self.data.keys()
    111     def __len__(self):
    112         return len(self.data)
    113     def get(self, key, default=None):
    114         return self.data.get(key, default)
    115 
    116 class CmpOptions:
    117     def __init__(self, verboseLog=None, rootA="", rootB=""):
    118         self.rootA = rootA
    119         self.rootB = rootB
    120         self.verboseLog = verboseLog
    121 
    122 class AnalysisReport:
    123     def __init__(self, run, files):
    124         self.run = run
    125         self.files = files
    126         self.diagnostics = []
    127 
    128 class AnalysisRun:
    129     def __init__(self, info):
    130         self.path = info.path
    131         self.root = info.root
    132         self.info = info
    133         self.reports = []
    134         # Cumulative list of all diagnostics from all the reports.
    135         self.diagnostics = []
    136         self.clang_version = None
    137     
    138     def getClangVersion(self):
    139         return self.clang_version
    140 
    141 
    142 # Backward compatibility API. 
    143 def loadResults(path, opts, root = "", deleteEmpty=True):
    144     return loadResultsFromSingleRun(SingleRunInfo(path, root, opts.verboseLog),
    145                                     deleteEmpty)
    146 
# Load the results of the analyses from a given output folder.
# - info is the SingleRunInfo object
# - deleteEmpty specifies whether empty plist files should be deleted
    150 def loadResultsFromSingleRun(info, deleteEmpty=True):
    151     path = info.path
    152     run = AnalysisRun(info)
    153     
    154     for (dirpath, dirnames, filenames) in os.walk(path):
    155         for f in filenames:
    156             if (not f.endswith('plist')):
    157                 continue
    158     
    159             p = os.path.join(dirpath, f)
    160             data = plistlib.readPlist(p)
    161     
    162             # We want to retrieve the clang version even if there are no 
    163             # reports. Assume that all reports were created using the same 
    164             # clang version (this is always true and is more efficient).
    165             if ('clang_version' in data) :
    166                 if (run.clang_version == None) :
    167                     run.clang_version = data.pop('clang_version')
    168                 else:
    169                     data.pop('clang_version')
    170                 
    171             # Ignore/delete empty reports.
    172             if not data['files']:
    173                 if deleteEmpty == True:
    174                     os.remove(p)
    175                 continue
    176     
    177             # Extract the HTML reports, if they exists.
    178             if 'HTMLDiagnostics_files' in data['diagnostics'][0]:
    179                 htmlFiles = []
    180                 for d in data['diagnostics']:
    181                     # FIXME: Why is this named files, when does it have multiple
    182                     # files?
    183                     assert len(d['HTMLDiagnostics_files']) == 1
    184                     htmlFiles.append(d.pop('HTMLDiagnostics_files')[0])
    185             else:
    186                 htmlFiles = [None] * len(data['diagnostics'])
    187             
    188             report = AnalysisReport(run, data.pop('files'))
    189             diagnostics = [AnalysisDiagnostic(d, report, h) 
    190                            for d,h in zip(data.pop('diagnostics'),
    191                                           htmlFiles)]
    192     
    193             assert not data
    194             
    195             report.diagnostics.extend(diagnostics)
    196             run.reports.append(report)
    197             run.diagnostics.extend(diagnostics)
    198             
    199     return run
    200 
    201 def cmpAnalysisDiagnostic(d) :
    202     return d.getIssueIdentifier()
    203 
    204 def compareResults(A, B):
    205     """
    206     compareResults - Generate a relation from diagnostics in run A to
    207     diagnostics in run B.
    208 
    209     The result is the relation as a list of triples (a, b, confidence) where
    210     each element {a,b} is None or an element from the respective run, and
    211     confidence is a measure of the match quality (where 0 indicates equality,
    212     and None is used if either element is None).
    213     """
    214 
    215     res = []
    216 
    217     # Quickly eliminate equal elements.
    218     neqA = []
    219     neqB = []
    220     eltsA = list(A.diagnostics)
    221     eltsB = list(B.diagnostics)
    222     eltsA.sort(key = cmpAnalysisDiagnostic)
    223     eltsB.sort(key = cmpAnalysisDiagnostic)
    224     while eltsA and eltsB:
    225         a = eltsA.pop()
    226         b = eltsB.pop()
    227         if (a.getIssueIdentifier() == b.getIssueIdentifier()) :
    228             res.append((a, b, 0))
    229         elif a.getIssueIdentifier() > b.getIssueIdentifier():
    230             eltsB.append(b)
    231             neqA.append(a)
    232         else:
    233             eltsA.append(a)
    234             neqB.append(b)
    235     neqA.extend(eltsA)
    236     neqB.extend(eltsB)
    237 
    238     # FIXME: Add fuzzy matching. One simple and possible effective idea would be
    239     # to bin the diagnostics, print them in a normalized form (based solely on
    240     # the structure of the diagnostic), compute the diff, then use that as the
    241     # basis for matching. This has the nice property that we don't depend in any
    242     # way on the diagnostic format.
    243 
    244     for a in neqA:
    245         res.append((a, None, None))
    246     for b in neqB:
    247         res.append((None, b, None))
    248 
    249     return res
    250 
    251 def dumpScanBuildResultsDiff(dirA, dirB, opts, deleteEmpty=True):
    252     # Load the run results.
    253     resultsA = loadResults(dirA, opts, opts.rootA, deleteEmpty)
    254     resultsB = loadResults(dirB, opts, opts.rootB, deleteEmpty)
    255     
    256     # Open the verbose log, if given.
    257     if opts.verboseLog:
    258         auxLog = open(opts.verboseLog, "wb")
    259     else:
    260         auxLog = None
    261 
    262     diff = compareResults(resultsA, resultsB)
    263     foundDiffs = 0
    264     for res in diff:
    265         a,b,confidence = res
    266         if a is None:
    267             print "ADDED: %r" % b.getReadableName()
    268             foundDiffs += 1
    269             if auxLog:
    270                 print >>auxLog, ("('ADDED', %r, %r)" % (b.getReadableName(),
    271                                                         b.getReport()))
    272         elif b is None:
    273             print "REMOVED: %r" % a.getReadableName()
    274             foundDiffs += 1
    275             if auxLog:
    276                 print >>auxLog, ("('REMOVED', %r, %r)" % (a.getReadableName(),
    277                                                           a.getReport()))
    278         elif confidence:
    279             print "CHANGED: %r to %r" % (a.getReadableName(),
    280                                          b.getReadableName())
    281             foundDiffs += 1
    282             if auxLog:
    283                 print >>auxLog, ("('CHANGED', %r, %r, %r, %r)" 
    284                                  % (a.getReadableName(),
    285                                     b.getReadableName(),
    286                                     a.getReport(),
    287                                     b.getReport()))
    288         else:
    289             pass
    290 
    291     TotalReports = len(resultsB.diagnostics)
    292     print "TOTAL REPORTS: %r" % TotalReports
    293     print "TOTAL DIFFERENCES: %r" % foundDiffs
    294     if auxLog:
    295         print >>auxLog, "('TOTAL NEW REPORTS', %r)" % TotalReports
    296         print >>auxLog, "('TOTAL DIFFERENCES', %r)" % foundDiffs
    297         
    298     return foundDiffs    
    299 
    300 def main():
    301     from optparse import OptionParser
    302     parser = OptionParser("usage: %prog [options] [dir A] [dir B]")
    303     parser.add_option("", "--rootA", dest="rootA",
    304                       help="Prefix to ignore on source files for directory A",
    305                       action="store", type=str, default="")
    306     parser.add_option("", "--rootB", dest="rootB",
    307                       help="Prefix to ignore on source files for directory B",
    308                       action="store", type=str, default="")
    309     parser.add_option("", "--verbose-log", dest="verboseLog",
    310                       help="Write additional information to LOG [default=None]",
    311                       action="store", type=str, default=None,
    312                       metavar="LOG")
    313     (opts, args) = parser.parse_args()
    314 
    315     if len(args) != 2:
    316         parser.error("invalid number of arguments")
    317 
    318     dirA,dirB = args
    319 
    320     dumpScanBuildResultsDiff(dirA, dirB, opts)    
    321 
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
    324