"""Utilities for comparing files and directories.

Classes:
    dircmp

Functions:
    cmp(f1, f2, shallow=True) -> bool
    cmpfiles(a, b, common, shallow=True) -> ([], [], [])
    clear_cache()

"""

import os
import stat
from itertools import filterfalse

__all__ = ['clear_cache', 'cmp', 'dircmp', 'cmpfiles', 'DEFAULT_IGNORES']

# Maps (f1, f2, sig1, sig2) -> bool result of a previous content comparison.
_cache = {}
# Chunk size, in bytes, used when reading files for content comparison.
BUFSIZE = 8*1024

DEFAULT_IGNORES = [
    'RCS', 'CVS', 'tags', '.git', '.hg', '.bzr', '_darcs', '__pycache__']


def clear_cache():
    """Clear the filecmp cache."""
    _cache.clear()


def cmp(f1, f2, shallow=True):
    """Compare two files.

    Arguments:

    f1 -- First file name

    f2 -- Second file name

    shallow -- Just check stat signature (do not read the files).
               defaults to True.

    Return value:

    True if the files are the same, False otherwise.  If either name does
    not refer to a regular file the result is always False.

    Raises OSError if either file cannot be stat'ed or opened.

    This function uses a cache for past comparisons and the results,
    with cache entries invalidated if their stat information
    changes.  The cache may be cleared by calling clear_cache().

    """

    s1 = _sig(os.stat(f1))
    s2 = _sig(os.stat(f2))
    # Only regular files can be compared; anything else is "different".
    if s1[0] != stat.S_IFREG or s2[0] != stat.S_IFREG:
        return False
    # Identical (type, size, mtime) is accepted as equality in shallow mode.
    if shallow and s1 == s2:
        return True
    # Files of different sizes cannot have equal contents.
    if s1[1] != s2[1]:
        return False

    # The signatures are part of the key, so a file that changed since the
    # cached comparison simply misses the cache.
    key = (f1, f2, s1, s2)
    outcome = _cache.get(key)
    if outcome is None:
        outcome = _do_cmp(f1, f2)
        if len(_cache) > 100:      # limit the maximum size of the cache
            clear_cache()
        _cache[key] = outcome
    return outcome


def _sig(st):
    """Return the (file type, size, mtime) signature of a stat result."""
    return (stat.S_IFMT(st.st_mode),
            st.st_size,
            st.st_mtime)


def _do_cmp(f1, f2):
    """Return True if files f1 and f2 have byte-for-byte equal contents."""
    bufsize = BUFSIZE
    with open(f1, 'rb') as fp1, open(f2, 'rb') as fp2:
        while True:
            b1 = fp1.read(bufsize)
            b2 = fp2.read(bufsize)
            if b1 != b2:
                return False
            # Both chunks are equal here, so an empty b1 means both files
            # reached EOF together.
            if not b1:
                return True


# Directory comparison class.
#
class dircmp:
    """A class that manages the comparison of 2 directories.

    dircmp(a, b, ignore=None, hide=None, *, shallow=True)
      A and B are directories.
      IGNORE is a list of names to ignore,
        defaults to DEFAULT_IGNORES.
      HIDE is a list of names to hide,
        defaults to [os.curdir, os.pardir].
      SHALLOW specifies whether to just check the stat signature (do not read
        the files).
        defaults to True.

    High level usage:
      x = dircmp(dir1, dir2)
      x.report() -> prints a report on the differences between dir1 and dir2
       or
      x.report_partial_closure() -> prints report on differences between dir1
            and dir2, and reports on common immediate subdirectories.
      x.report_full_closure() -> like report_partial_closure,
            but fully recursive.

    Attributes:
     left_list, right_list: The files in dir1 and dir2,
        filtered by hide and ignore.
     common: a list of names in both dir1 and dir2.
     left_only, right_only: names only in dir1, dir2.
     common_dirs: subdirectories in both dir1 and dir2.
     common_files: files in both dir1 and dir2.
     common_funny: names in both dir1 and dir2 where the type differs between
        dir1 and dir2, or the name is not stat-able.
     same_files: list of identical files.
     diff_files: list of filenames which differ.
     funny_files: list of files which could not be compared.
     subdirs: a dictionary of dircmp instances (or instances of the
        subclass in use), keyed by names in common_dirs.

    The attributes above are computed lazily, on first access, by the
    phaseN() methods (see methodmap and __getattr__).
     """

    def __init__(self, a, b, ignore=None, hide=None, *, shallow=True):
        """Record the two directories and the comparison options."""
        self.left = a
        self.right = b
        if hide is None:
            self.hide = [os.curdir, os.pardir]  # Names never to be shown
        else:
            self.hide = hide
        if ignore is None:
            self.ignore = DEFAULT_IGNORES
        else:
            self.ignore = ignore
        self.shallow = shallow

    def phase0(self):
        """List both directories, dropping hidden and ignored names."""
        skip = self.hide + self.ignore
        self.left_list = _filter(os.listdir(self.left), skip)
        self.right_list = _filter(os.listdir(self.right), skip)
        self.left_list.sort()
        self.right_list.sort()

    def phase1(self):
        """Compute common names and the names unique to each side."""
        # Names are matched on their normcase()d form, but the original
        # spelling is what gets reported.
        a = dict(zip(map(os.path.normcase, self.left_list), self.left_list))
        b = dict(zip(map(os.path.normcase, self.right_list), self.right_list))
        self.common = list(map(a.__getitem__, filter(b.__contains__, a)))
        self.left_only = list(map(a.__getitem__, filterfalse(b.__contains__, a)))
        self.right_only = list(map(b.__getitem__, filterfalse(a.__contains__, b)))

    def phase2(self):
        """Split the common names into directories, files and funnies."""
        self.common_dirs = []
        self.common_files = []
        self.common_funny = []

        for x in self.common:
            a_path = os.path.join(self.left, x)
            b_path = os.path.join(self.right, x)

            try:
                a_stat = os.stat(a_path)
                b_stat = os.stat(b_path)
            except (OSError, ValueError):
                # Not stat-able on at least one side.  ValueError covers
                # names os.stat() rejects outright (e.g. embedded NUL).
                self.common_funny.append(x)
                continue

            a_type = stat.S_IFMT(a_stat.st_mode)
            b_type = stat.S_IFMT(b_stat.st_mode)
            if a_type != b_type:
                self.common_funny.append(x)
            elif stat.S_ISDIR(a_type):
                self.common_dirs.append(x)
            elif stat.S_ISREG(a_type):
                self.common_files.append(x)
            else:
                # Same type on both sides, but neither file nor directory.
                self.common_funny.append(x)

    def phase3(self):
        """Compare the common files, honouring self.shallow."""
        xx = cmpfiles(self.left, self.right, self.common_files, self.shallow)
        self.same_files, self.diff_files, self.funny_files = xx

    def phase4(self):
        """Create a comparison object for every common subdirectory."""
        # A new dircmp (or dircmp-subclass) object is created for each common
        # subdirectory, these are stored in a dictionary indexed by filename.
        # The hide, ignore and shallow properties are inherited from the
        # parent.
        self.subdirs = {}
        for x in self.common_dirs:
            a_x = os.path.join(self.left, x)
            b_x = os.path.join(self.right, x)
            # self.__class__ rather than dircmp, so that subclasses keep
            # their overridden behaviour all the way down the tree.
            self.subdirs[x] = self.__class__(a_x, b_x, self.ignore, self.hide,
                                             shallow=self.shallow)

    def phase4_closure(self):
        """Recursively call phase4() on subdirectories."""
        self.phase4()
        for sd in self.subdirs.values():
            sd.phase4_closure()

    def report(self):
        """Print a report on the differences between a and b.

        Note that the reported lists are sorted in place.
        """
        # Output format is purposely lousy
        print('diff', self.left, self.right)
        if self.left_only:
            self.left_only.sort()
            print('Only in', self.left, ':', self.left_only)
        if self.right_only:
            self.right_only.sort()
            print('Only in', self.right, ':', self.right_only)
        if self.same_files:
            self.same_files.sort()
            print('Identical files :', self.same_files)
        if self.diff_files:
            self.diff_files.sort()
            print('Differing files :', self.diff_files)
        if self.funny_files:
            self.funny_files.sort()
            print('Trouble with common files :', self.funny_files)
        if self.common_dirs:
            self.common_dirs.sort()
            print('Common subdirectories :', self.common_dirs)
        if self.common_funny:
            self.common_funny.sort()
            print('Common funny cases :', self.common_funny)

    def report_partial_closure(self):
        """Print reports on self and on the immediate common subdirs."""
        self.report()
        for sd in self.subdirs.values():
            print()
            sd.report()

    def report_full_closure(self):
        """Print reports on self and, recursively, on all common subdirs."""
        self.report()
        for sd in self.subdirs.values():
            print()
            sd.report_full_closure()

    # Maps each lazily computed attribute to the phase that produces it.
    methodmap = dict(subdirs=phase4,
                     same_files=phase3, diff_files=phase3, funny_files=phase3,
                     common_dirs=phase2, common_files=phase2, common_funny=phase2,
                     common=phase1, left_only=phase1, right_only=phase1,
                     left_list=phase0, right_list=phase0)

    def __getattr__(self, attr):
        """Compute a lazy attribute on first access.

        Only called when normal lookup fails; runs the phase registered for
        ATTR in methodmap (which sets the attribute on the instance) and then
        returns the freshly stored value.  Unknown names raise AttributeError.
        """
        if attr not in self.methodmap:
            raise AttributeError(attr)
        self.methodmap[attr](self)
        return getattr(self, attr)


def cmpfiles(a, b, common, shallow=True):
    """Compare common files in two directories.

    a, b -- directory names
    common -- list of file names found in both directories
    shallow -- if true, do comparison based solely on stat() information

    Returns a tuple of three lists:
      files that compare equal
      files that are different
      filenames that aren't regular files.

    """
    res = ([], [], [])
    for x in common:
        ax = os.path.join(a, x)
        bx = os.path.join(b, x)
        # _cmp() yields 0, 1 or 2, which selects the matching result list.
        res[_cmp(ax, bx, shallow)].append(x)
    return res


# Compare two files.
# Return:
#       0 for equal
#       1 for different
#       2 for funny cases (can't stat, etc.)
#
def _cmp(a, b, sh, abs=abs, cmp=cmp):
    try:
        return not abs(cmp(a, b, sh))
    except (OSError, ValueError):
        # ValueError: the path itself is rejected by os.stat()/open(),
        # e.g. because it contains an embedded NUL character.
        return 2


# Return a copy with items that occur in skip removed.
#
def _filter(flist, skip):
    return list(filterfalse(skip.__contains__, flist))


# Demonstration and testing.
#
def demo():
    """Command-line entry point: compare two directories (-r to recurse)."""
    import sys
    import getopt
    options, args = getopt.getopt(sys.argv[1:], 'r')
    if len(args) != 2:
        raise getopt.GetoptError('need exactly two args', None)
    dd = dircmp(args[0], args[1])
    if ('-r', '') in options:
        dd.report_full_closure()
    else:
        dd.report()


if __name__ == '__main__':
    demo()