Home | History | Annotate | Download | only in Lib
      1 """Utilities for comparing files and directories.
      2 
      3 Classes:
      4     dircmp
      5 
      6 Functions:
    cmp(f1, f2, shallow=True) -> bool
    cmpfiles(a, b, common, shallow=True) -> ([], [], [])
      9     clear_cache()
     10 
     11 """
     12 
     13 import os
     14 import stat
     15 from itertools import filterfalse
     16 
# Public names of this module.
__all__ = ['clear_cache', 'cmp', 'dircmp', 'cmpfiles', 'DEFAULT_IGNORES']

# Results of earlier content comparisons made by cmp(), keyed by
# (f1, f2, signature of f1, signature of f2).
_cache = {}
# Number of bytes _do_cmp() reads from each file per iteration.
BUFSIZE = 8*1024

# Names dircmp leaves out of its listings when no ignore list is given.
DEFAULT_IGNORES = [
    'RCS', 'CVS', 'tags', '.git', '.hg', '.bzr', '_darcs', '__pycache__']
     24 
     25 def clear_cache():
     26     """Clear the filecmp cache."""
     27     _cache.clear()
     28 
     29 def cmp(f1, f2, shallow=True):
     30     """Compare two files.
     31 
     32     Arguments:
     33 
     34     f1 -- First file name
     35 
     36     f2 -- Second file name
     37 
     38     shallow -- Just check stat signature (do not read the files).
     39                defaults to True.
     40 
     41     Return value:
     42 
     43     True if the files are the same, False otherwise.
     44 
     45     This function uses a cache for past comparisons and the results,
     46     with cache entries invalidated if their stat information
     47     changes.  The cache may be cleared by calling clear_cache().
     48 
     49     """
     50 
     51     s1 = _sig(os.stat(f1))
     52     s2 = _sig(os.stat(f2))
     53     if s1[0] != stat.S_IFREG or s2[0] != stat.S_IFREG:
     54         return False
     55     if shallow and s1 == s2:
     56         return True
     57     if s1[1] != s2[1]:
     58         return False
     59 
     60     outcome = _cache.get((f1, f2, s1, s2))
     61     if outcome is None:
     62         outcome = _do_cmp(f1, f2)
     63         if len(_cache) > 100:      # limit the maximum size of the cache
     64             clear_cache()
     65         _cache[f1, f2, s1, s2] = outcome
     66     return outcome
     67 
     68 def _sig(st):
     69     return (stat.S_IFMT(st.st_mode),
     70             st.st_size,
     71             st.st_mtime)
     72 
     73 def _do_cmp(f1, f2):
     74     bufsize = BUFSIZE
     75     with open(f1, 'rb') as fp1, open(f2, 'rb') as fp2:
     76         while True:
     77             b1 = fp1.read(bufsize)
     78             b2 = fp2.read(bufsize)
     79             if b1 != b2:
     80                 return False
     81             if not b1:
     82                 return True
     83 
     84 # Directory comparison class.
     85 #
     86 class dircmp:
     87     """A class that manages the comparison of 2 directories.
     88 
     89     dircmp(a, b, ignore=None, hide=None)
     90       A and B are directories.
     91       IGNORE is a list of names to ignore,
     92         defaults to DEFAULT_IGNORES.
     93       HIDE is a list of names to hide,
     94         defaults to [os.curdir, os.pardir].
     95 
     96     High level usage:
     97       x = dircmp(dir1, dir2)
     98       x.report() -> prints a report on the differences between dir1 and dir2
     99        or
    100       x.report_partial_closure() -> prints report on differences between dir1
    101             and dir2, and reports on common immediate subdirectories.
    102       x.report_full_closure() -> like report_partial_closure,
    103             but fully recursive.
    104 
    105     Attributes:
    106      left_list, right_list: The files in dir1 and dir2,
    107         filtered by hide and ignore.
    108      common: a list of names in both dir1 and dir2.
    109      left_only, right_only: names only in dir1, dir2.
    110      common_dirs: subdirectories in both dir1 and dir2.
    111      common_files: files in both dir1 and dir2.
    112      common_funny: names in both dir1 and dir2 where the type differs between
    113         dir1 and dir2, or the name is not stat-able.
    114      same_files: list of identical files.
    115      diff_files: list of filenames which differ.
    116      funny_files: list of files which could not be compared.
    117      subdirs: a dictionary of dircmp objects, keyed by names in common_dirs.
    118      """
    119 
    120     def __init__(self, a, b, ignore=None, hide=None): # Initialize
    121         self.left = a
    122         self.right = b
    123         if hide is None:
    124             self.hide = [os.curdir, os.pardir] # Names never to be shown
    125         else:
    126             self.hide = hide
    127         if ignore is None:
    128             self.ignore = DEFAULT_IGNORES
    129         else:
    130             self.ignore = ignore
    131 
    132     def phase0(self): # Compare everything except common subdirectories
    133         self.left_list = _filter(os.listdir(self.left),
    134                                  self.hide+self.ignore)
    135         self.right_list = _filter(os.listdir(self.right),
    136                                   self.hide+self.ignore)
    137         self.left_list.sort()
    138         self.right_list.sort()
    139 
    140     def phase1(self): # Compute common names
    141         a = dict(zip(map(os.path.normcase, self.left_list), self.left_list))
    142         b = dict(zip(map(os.path.normcase, self.right_list), self.right_list))
    143         self.common = list(map(a.__getitem__, filter(b.__contains__, a)))
    144         self.left_only = list(map(a.__getitem__, filterfalse(b.__contains__, a)))
    145         self.right_only = list(map(b.__getitem__, filterfalse(a.__contains__, b)))
    146 
    147     def phase2(self): # Distinguish files, directories, funnies
    148         self.common_dirs = []
    149         self.common_files = []
    150         self.common_funny = []
    151 
    152         for x in self.common:
    153             a_path = os.path.join(self.left, x)
    154             b_path = os.path.join(self.right, x)
    155 
    156             ok = 1
    157             try:
    158                 a_stat = os.stat(a_path)
    159             except OSError as why:
    160                 # print('Can\'t stat', a_path, ':', why.args[1])
    161                 ok = 0
    162             try:
    163                 b_stat = os.stat(b_path)
    164             except OSError as why:
    165                 # print('Can\'t stat', b_path, ':', why.args[1])
    166                 ok = 0
    167 
    168             if ok:
    169                 a_type = stat.S_IFMT(a_stat.st_mode)
    170                 b_type = stat.S_IFMT(b_stat.st_mode)
    171                 if a_type != b_type:
    172                     self.common_funny.append(x)
    173                 elif stat.S_ISDIR(a_type):
    174                     self.common_dirs.append(x)
    175                 elif stat.S_ISREG(a_type):
    176                     self.common_files.append(x)
    177                 else:
    178                     self.common_funny.append(x)
    179             else:
    180                 self.common_funny.append(x)
    181 
    182     def phase3(self): # Find out differences between common files
    183         xx = cmpfiles(self.left, self.right, self.common_files)
    184         self.same_files, self.diff_files, self.funny_files = xx
    185 
    186     def phase4(self): # Find out differences between common subdirectories
    187         # A new dircmp object is created for each common subdirectory,
    188         # these are stored in a dictionary indexed by filename.
    189         # The hide and ignore properties are inherited from the parent
    190         self.subdirs = {}
    191         for x in self.common_dirs:
    192             a_x = os.path.join(self.left, x)
    193             b_x = os.path.join(self.right, x)
    194             self.subdirs[x]  = dircmp(a_x, b_x, self.ignore, self.hide)
    195 
    196     def phase4_closure(self): # Recursively call phase4() on subdirectories
    197         self.phase4()
    198         for sd in self.subdirs.values():
    199             sd.phase4_closure()
    200 
    201     def report(self): # Print a report on the differences between a and b
    202         # Output format is purposely lousy
    203         print('diff', self.left, self.right)
    204         if self.left_only:
    205             self.left_only.sort()
    206             print('Only in', self.left, ':', self.left_only)
    207         if self.right_only:
    208             self.right_only.sort()
    209             print('Only in', self.right, ':', self.right_only)
    210         if self.same_files:
    211             self.same_files.sort()
    212             print('Identical files :', self.same_files)
    213         if self.diff_files:
    214             self.diff_files.sort()
    215             print('Differing files :', self.diff_files)
    216         if self.funny_files:
    217             self.funny_files.sort()
    218             print('Trouble with common files :', self.funny_files)
    219         if self.common_dirs:
    220             self.common_dirs.sort()
    221             print('Common subdirectories :', self.common_dirs)
    222         if self.common_funny:
    223             self.common_funny.sort()
    224             print('Common funny cases :', self.common_funny)
    225 
    226     def report_partial_closure(self): # Print reports on self and on subdirs
    227         self.report()
    228         for sd in self.subdirs.values():
    229             print()
    230             sd.report()
    231 
    232     def report_full_closure(self): # Report on self and subdirs recursively
    233         self.report()
    234         for sd in self.subdirs.values():
    235             print()
    236             sd.report_full_closure()
    237 
    238     methodmap = dict(subdirs=phase4,
    239                      same_files=phase3, diff_files=phase3, funny_files=phase3,
    240                      common_dirs = phase2, common_files=phase2, common_funny=phase2,
    241                      common=phase1, left_only=phase1, right_only=phase1,
    242                      left_list=phase0, right_list=phase0)
    243 
    244     def __getattr__(self, attr):
    245         if attr not in self.methodmap:
    246             raise AttributeError(attr)
    247         self.methodmap[attr](self)
    248         return getattr(self, attr)
    249 
    250 def cmpfiles(a, b, common, shallow=True):
    251     """Compare common files in two directories.
    252 
    253     a, b -- directory names
    254     common -- list of file names found in both directories
    255     shallow -- if true, do comparison based solely on stat() information
    256 
    257     Returns a tuple of three lists:
    258       files that compare equal
    259       files that are different
    260       filenames that aren't regular files.
    261 
    262     """
    263     res = ([], [], [])
    264     for x in common:
    265         ax = os.path.join(a, x)
    266         bx = os.path.join(b, x)
    267         res[_cmp(ax, bx, shallow)].append(x)
    268     return res
    269 
    270 
    271 # Compare two files.
    272 # Return:
    273 #       0 for equal
    274 #       1 for different
    275 #       2 for funny cases (can't stat, etc.)
    276 #
    277 def _cmp(a, b, sh, abs=abs, cmp=cmp):
    278     try:
    279         return not abs(cmp(a, b, sh))
    280     except OSError:
    281         return 2
    282 
    283 
    284 # Return a copy with items that occur in skip removed.
    285 #
    286 def _filter(flist, skip):
    287     return list(filterfalse(skip.__contains__, flist))
    288 
    289 
    290 # Demonstration and testing.
    291 #
    292 def demo():
    293     import sys
    294     import getopt
    295     options, args = getopt.getopt(sys.argv[1:], 'r')
    296     if len(args) != 2:
    297         raise getopt.GetoptError('need exactly two args', None)
    298     dd = dircmp(args[0], args[1])
    299     if ('-r', '') in options:
    300         dd.report_full_closure()
    301     else:
    302         dd.report()
    303 
# Run the demonstration when this file is executed as a script.
if __name__ == '__main__':
    demo()
    306