#!/usr/bin/env python

"""
CmpRuns - A simple tool for comparing two static analyzer runs to determine
which reports have been added, removed, or changed.

This is designed to support automated testing using the static analyzer, from
two perspectives:
  1. To monitor changes in the static analyzer's reports on real code bases,
     for regression testing.

  2. For use by end users who want to integrate regular static analyzer
     testing into a buildbot-like environment.

Usage:

    # Load the results of both runs, to obtain lists of the corresponding
    # AnalysisDiagnostic objects.
    #
    resultsA = loadResultsFromSingleRun(singleRunInfoA, deleteEmpty)
    resultsB = loadResultsFromSingleRun(singleRunInfoB, deleteEmpty)

    # Generate a relation from diagnostics in run A to diagnostics in run B
    # to obtain a list of triples (a, b, confidence).
    diff = compareResults(resultsA, resultsB)

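    # A sketch of consuming the resulting triples, mirroring what
    # dumpScanBuildResultsDiff() below does (the variable names here are
    # illustrative, not part of this module's API):
    for a, b, confidence in diff:
        if a is None:
            print "ADDED: %r" % b.getReadableName()
        elif b is None:
            print "REMOVED: %r" % a.getReadableName()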
"""

import os
import plistlib

# Information about analysis run:
# path - the analysis output directory
# root - the name of the root directory, which will be disregarded when
#        determining the source file name
class SingleRunInfo:
    def __init__(self, path, root="", verboseLog=None):
        self.path = path
        self.root = root
        self.verboseLog = verboseLog
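
# For example, a run might be described as follows (the path and root are
# purely illustrative values, not defaults used by this module):
#   infoA = SingleRunInfo("outputA/", root="/src/projA", verboseLog=None)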

class AnalysisDiagnostic:
    def __init__(self, data, report, htmlReport):
        self._data = data
        self._loc = self._data['location']
        self._report = report
        self._htmlReport = htmlReport

    def getFileName(self):
        root = self._report.run.root
        fileName = self._report.files[self._loc['file']]
        if fileName.startswith(root):
            return fileName[len(root):]
        return fileName

    def getLine(self):
        return self._loc['line']

    def getColumn(self):
        return self._loc['col']

    def getCategory(self):
        return self._data['category']

    def getDescription(self):
        return self._data['description']

    def getIssueIdentifier(self):
        id = self.getFileName() + "+"
        if 'issue_context' in self._data:
            id += self._data['issue_context'] + "+"
        if 'issue_hash' in self._data:
            id += str(self._data['issue_hash'])
        return id

    def getReport(self):
        if self._htmlReport is None:
            return " "
        return os.path.join(self._report.run.path, self._htmlReport)

    def getReadableName(self):
        return '%s:%d:%d, %s: %s' % (self.getFileName(), self.getLine(),
                                     self.getColumn(), self.getCategory(),
                                     self.getDescription())

    # Note, the data format is not an API and may change from one analyzer
    # version to another.
    def getRawData(self):
        return self._data

# A simple dictionary-like container that maps each key to the list of all
# values that have been stored under that key.
class multidict:
    def __init__(self, elts=()):
        self.data = {}
        for key, value in elts:
            self[key] = value

    def __getitem__(self, item):
        return self.data[item]
    def __setitem__(self, key, value):
        if key in self.data:
            self.data[key].append(value)
        else:
            self.data[key] = [value]
    def items(self):
        return self.data.items()
    def values(self):
        return self.data.values()
    def keys(self):
        return self.data.keys()
    def __len__(self):
        return len(self.data)
    def get(self, key, default=None):
        return self.data.get(key, default)

class CmpOptions:
    def __init__(self, verboseLog=None, rootA="", rootB=""):
        self.rootA = rootA
        self.rootB = rootB
        self.verboseLog = verboseLog

class AnalysisReport:
    def __init__(self, run, files):
        self.run = run
        self.files = files
        self.diagnostics = []

class AnalysisRun:
    def __init__(self, info):
        self.path = info.path
        self.root = info.root
        self.info = info
        self.reports = []
        # Cumulative list of all diagnostics from all the reports.
        self.diagnostics = []
        self.clang_version = None

    def getClangVersion(self):
        return self.clang_version

    def readSingleFile(self, p, deleteEmpty):
        data = plistlib.readPlist(p)

        # We want to retrieve the clang version even if there are no
        # reports. Assume that all reports were created using the same
        # clang version (this is always true and is more efficient).
        if 'clang_version' in data:
            if self.clang_version is None:
                self.clang_version = data.pop('clang_version')
            else:
                data.pop('clang_version')

        # Ignore/delete empty reports.
        if not data['files']:
            if deleteEmpty:
                os.remove(p)
            return

        # Extract the HTML reports, if they exist.
        if 'HTMLDiagnostics_files' in data['diagnostics'][0]:
            htmlFiles = []
            for d in data['diagnostics']:
                # FIXME: Why is this named files, when does it have multiple
                # files?
                assert len(d['HTMLDiagnostics_files']) == 1
                htmlFiles.append(d.pop('HTMLDiagnostics_files')[0])
        else:
            htmlFiles = [None] * len(data['diagnostics'])

        report = AnalysisReport(self, data.pop('files'))
        diagnostics = [AnalysisDiagnostic(d, report, h)
                       for d, h in zip(data.pop('diagnostics'),
                                       htmlFiles)]

        assert not data

        report.diagnostics.extend(diagnostics)
        self.reports.append(report)
        self.diagnostics.extend(diagnostics)


# Backward compatibility API.
def loadResults(path, opts, root="", deleteEmpty=True):
    return loadResultsFromSingleRun(SingleRunInfo(path, root, opts.verboseLog),
                                    deleteEmpty)

# Load results of the analyses from a given output folder.
# - info is the SingleRunInfo object
# - deleteEmpty specifies if the empty plist files should be deleted
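#
# For example (the output path and root below are illustrative, not values
# this module defines):
#   run = loadResultsFromSingleRun(SingleRunInfo("outputA/", "/src/projA"))
#   for d in run.diagnostics:
#       print d.getReadableName()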
def loadResultsFromSingleRun(info, deleteEmpty=True):
    path = info.path
    run = AnalysisRun(info)

    if os.path.isfile(path):
        run.readSingleFile(path, deleteEmpty)
    else:
        for (dirpath, dirnames, filenames) in os.walk(path):
            for f in filenames:
                if not f.endswith('plist'):
                    continue
                p = os.path.join(dirpath, f)
                run.readSingleFile(p, deleteEmpty)

    return run

def cmpAnalysisDiagnostic(d):
    return d.getIssueIdentifier()

def compareResults(A, B):
    """
    compareResults - Generate a relation from diagnostics in run A to
    diagnostics in run B.

    The result is the relation as a list of triples (a, b, confidence) where
    each element {a,b} is None or an element from the respective run, and
    confidence is a measure of the match quality (where 0 indicates equality,
    and None is used if either element is None).
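
    For illustration, entries in the returned list take one of these forms
    (the diagnostic names are placeholders, not values produced by this
    docstring):
        (diagA, diagB, 0)      # the same issue appears in both runs
        (diagA, None, None)    # the issue appears only in run A (removed)
        (None, diagB, None)    # the issue appears only in run B (added)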
    """

    res = []

    # Quickly eliminate equal elements.
    neqA = []
    neqB = []
    eltsA = list(A.diagnostics)
    eltsB = list(B.diagnostics)
    eltsA.sort(key=cmpAnalysisDiagnostic)
    eltsB.sort(key=cmpAnalysisDiagnostic)
    while eltsA and eltsB:
        a = eltsA.pop()
        b = eltsB.pop()
        if a.getIssueIdentifier() == b.getIssueIdentifier():
            res.append((a, b, 0))
        elif a.getIssueIdentifier() > b.getIssueIdentifier():
            eltsB.append(b)
            neqA.append(a)
        else:
            eltsA.append(a)
            neqB.append(b)
    neqA.extend(eltsA)
    neqB.extend(eltsB)

    # FIXME: Add fuzzy matching. One simple and possibly effective idea would
    # be to bin the diagnostics, print them in a normalized form (based solely
    # on the structure of the diagnostic), compute the diff, then use that as
    # the basis for matching. This has the nice property that we don't depend
    # in any way on the diagnostic format.
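    #
    # A rough sketch of that idea, left as a comment because it is not
    # implemented here (normalize() is a hypothetical helper):
    #   normA = multidict((normalize(a), a) for a in neqA)
    #   normB = multidict((normalize(b), b) for b in neqB)
    #   for key in set(normA.keys()) & set(normB.keys()):
    #       pair each element of normA[key] with one of normB[key],
    #       recording a nonzero confidence for the fuzzy match
    # where normalize() would render a diagnostic based only on its structure,
    # e.g. its category and description with file and line details dropped.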

    for a in neqA:
        res.append((a, None, None))
    for b in neqB:
        res.append((None, b, None))

    return res

def dumpScanBuildResultsDiff(dirA, dirB, opts, deleteEmpty=True):
    # Load the run results.
    resultsA = loadResults(dirA, opts, opts.rootA, deleteEmpty)
    resultsB = loadResults(dirB, opts, opts.rootB, deleteEmpty)

    # Open the verbose log, if given.
    if opts.verboseLog:
        auxLog = open(opts.verboseLog, "wb")
    else:
        auxLog = None

    diff = compareResults(resultsA, resultsB)
    foundDiffs = 0
    for res in diff:
        a, b, confidence = res
        if a is None:
            print "ADDED: %r" % b.getReadableName()
            foundDiffs += 1
            if auxLog:
                print >>auxLog, ("('ADDED', %r, %r)" % (b.getReadableName(),
                                                        b.getReport()))
        elif b is None:
            print "REMOVED: %r" % a.getReadableName()
            foundDiffs += 1
            if auxLog:
                print >>auxLog, ("('REMOVED', %r, %r)" % (a.getReadableName(),
                                                          a.getReport()))
        elif confidence:
            print "CHANGED: %r to %r" % (a.getReadableName(),
                                         b.getReadableName())
            foundDiffs += 1
            if auxLog:
                print >>auxLog, ("('CHANGED', %r, %r, %r, %r)"
                                 % (a.getReadableName(),
                                    b.getReadableName(),
                                    a.getReport(),
                                    b.getReport()))
        else:
            pass

    TotalReports = len(resultsB.diagnostics)
    print "TOTAL REPORTS: %r" % TotalReports
    print "TOTAL DIFFERENCES: %r" % foundDiffs
    if auxLog:
        print >>auxLog, "('TOTAL NEW REPORTS', %r)" % TotalReports
        print >>auxLog, "('TOTAL DIFFERENCES', %r)" % foundDiffs

    return foundDiffs

def main():
    from optparse import OptionParser
    parser = OptionParser("usage: %prog [options] [dir A] [dir B]")
    parser.add_option("", "--rootA", dest="rootA",
                      help="Prefix to ignore on source files for directory A",
                      action="store", type=str, default="")
    parser.add_option("", "--rootB", dest="rootB",
                      help="Prefix to ignore on source files for directory B",
                      action="store", type=str, default="")
    parser.add_option("", "--verbose-log", dest="verboseLog",
                      help="Write additional information to LOG [default=None]",
                      action="store", type=str, default=None,
                      metavar="LOG")
    (opts, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("invalid number of arguments")

    dirA, dirB = args

    dumpScanBuildResultsDiff(dirA, dirB, opts)

if __name__ == '__main__':
    main()