"""Diffs one repo source tree against an upstream repo source tree.

Matches the projects from a Gerrit repo workspace to the projects
of an upstream workspace. After identifying the projects that exist in both
the downstream and the upstream workspace, it then diffs each project.

Finally, the results of the project matching and diffing are reported.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import csv
import datetime
import multiprocessing
import multiprocessing.pool
import os
import re
import subprocess
import xml.etree.ElementTree as et

import git_commits_not_upstreamed


def get_projects(source_tree):
    """Retrieve the dict of project names and paths.

    Reads `<source_tree>/.repo/manifest.xml` and collects the `project`
    elements that are direct children of the manifest root.

    Args:
      source_tree: A path to the source tree.

    Returns:
      A dict of absolute project paths keyed by project names. Projects whose
      `groups` attribute contains 'notdefault', or whose path does not exist
      on disk, are left out.
    """
    manifest = os.path.join(source_tree, '.repo', 'manifest.xml')
    root = et.parse(manifest).getroot()

    projects = {}
    for project in root.findall('project'):
        # Ignore projects that are not synced by default.
        if 'notdefault' in project.get('groups', ''):
            continue

        name = project.get('name')
        # The project path defaults to the project name when not given.
        path = os.path.abspath(
            os.path.join(source_tree, project.get('path', name)))

        # Only keep projects whose files actually exist.
        if not os.path.exists(path):
            continue

        projects[name] = path

    return projects


def git(args):
    """Run a git command and return what it printed on stdout.

    Args:
      args: A list of arguments to be sent to the git command.

    Returns:
      The output of the git command as a native `str`.

    Raises:
      subprocess.CalledProcessError: If git exits with a non-zero status.
    """
    command = ['git']
    command.extend(args)
    # stderr is discarded so git's progress chatter does not reach the console.
    with open(os.devnull, 'w') as devnull:
        output = subprocess.check_output(command, stderr=devnull)

    # On Python 3 check_output returns bytes; callers split and strip the
    # result with str arguments, so hand back text. On Python 2 the output is
    # already a str and is returned untouched.
    if not isinstance(output, str):
        output = output.decode('utf-8', 'replace')
    return output


def get_revision_diff_stats(directory, rev_a, rev_b):
    """Retrieves stats of diff between two git revisions.

    Args:
      directory: A path to the git directory to diff.
      rev_a: A git revision to diff.
      rev_b: A git revision to diff.

    Returns:
      A dict with the count of files modified ('file'), lines added
      ('insertion') and lines removed ('deletion'). Counts that do not appear
      in the `git diff --shortstat` output stay at 0.
    """
    stats = {
        'file': 0,
        'insertion': 0,
        'deletion': 0,
    }

    git_diffstat = git(
        ['-C', directory, 'diff', '--shortstat', rev_a, rev_b])
    # The shortstat output is a comma separated list such as
    # " 3 files changed, 10 insertions(+), 2 deletions(-)"; each element starts
    # with its count and contains one of the stats keys as a substring.
    for element in git_diffstat.split(','):
        for key in stats:
            if key in element:
                stats[key] = int(element.split()[0])

    return stats


def get_project_stats(upstream_dir, downstream_dir):
    """Retrieves stats of diff between two git projects.

    Diffs a downstream directory against an upstream directory.
    Lines that exist only in the downstream directory are considered
    insertions. Lines that exist only in the upstream directory are considered
    deletions.

    Args:
      upstream_dir: A path to the upstream directory to compare.
      downstream_dir: A path to the downstream directory to compare.

    Returns:
      A dict with the count of files modified, lines added
      and lines removed. All counts are 0 when either directory is missing.
    """
    stats = {
        'file': 0,
        'insertion': 0,
        'deletion': 0,
    }

    if upstream_dir and downstream_dir:
        print('Diffing %s vs %s' % (downstream_dir, upstream_dir))
        # Fetch the upstream project into the downstream one so that both
        # histories can be compared as FETCH_HEAD (upstream) vs HEAD.
        git(['-C', downstream_dir, 'fetch', '--update-shallow', upstream_dir])
        stats = get_revision_diff_stats(downstream_dir, 'FETCH_HEAD', 'HEAD')

    return stats


def match_project_by_root_commits(
        downstream_project_name, downstream_project_path,
        upstream_root_commits):
    """Match a downstream project to an upstream project by root commits.

    Find all root commits in a downstream project and find a matching
    upstream project that has a root commit in common.

    Args:
      downstream_project_name: A string with the downstream project name.
      downstream_project_path: A string with the downstream project path.
      upstream_root_commits: A dict of root commits and their upstream
        projects.

    Returns:
      A string with the matched upstream project name, or None when no root
      commit is shared or when the first shared root commit belongs to more
      than one upstream project.
    """
    upstream_match = None
    downstream_root_commits = find_root_commits_in_path(
        downstream_project_path)
    for root in downstream_root_commits:
        if root in upstream_root_commits:
            upstream_project_list = upstream_root_commits[root]
            if len(upstream_project_list) > 1:
                # Ambiguous match: report it and leave the project unmatched.
                print('Warning: ' + downstream_project_name +
                      ' matches multiple projects')
                print(upstream_project_list)
            else:
                upstream_match = upstream_project_list[0]['name']
            # Once there's a root commit match, stop looking for a project
            # match.
            break

    return upstream_match


def match_projects(upstream_projects, downstream_projects):
    """Match downstream projects to upstream projects.

    Args:
      upstream_projects: A dict of upstream project paths keyed by name.
      downstream_projects: A dict of downstream project paths keyed by name.

    Returns:
      A list of dicts with 'upstream' and 'downstream' project names. Either
      entry is None when the project has no counterpart on that side.
    """
    project_matches = []

    # Keep track of the upstream projects that have not been matched.
    unmatched_upstream_projects = set(upstream_projects.keys())

    upstream_root_commits = find_root_commits_in_projects(upstream_projects)
    # Match all downstream projects to an upstream project.
    for downstream_name, downstream_path in downstream_projects.items():
        if downstream_name in upstream_projects:
            # First try to match projects by name.
            upstream_match = downstream_name
        else:
            # If there is no project name match then try matching by commit.
            upstream_match = match_project_by_root_commits(
                downstream_name, downstream_path, upstream_root_commits)

        project_matches.append({
            'upstream': upstream_match,
            'downstream': downstream_name,
        })
        unmatched_upstream_projects.discard(upstream_match)

    # Add all upstream projects that have not been matched.
    for project in unmatched_upstream_projects:
        project_matches.append({
            'upstream': project,
            'downstream': None,
        })

    return project_matches


def filter_exclusion_list(projects, exclusion_file):
    """Removes all projects that match the exclusion patterns.

    Args:
      projects: A dict of project paths keyed by project names.
      exclusion_file: Path to a file with one regular expression per line, or
        a false value to skip filtering.

    Returns:
      A dict of the projects whose name does not match any exclusion pattern
      (patterns are anchored at the start of the name, as with `re.match`).
      When there are no patterns, `projects` itself is returned.
    """
    exclusion_list = []
    if exclusion_file:
        with open(exclusion_file) as f:
            exclusion_list = [line.strip() for line in f]

    # Drop blank lines: an empty alternative in the joined pattern matches
    # every name and would silently exclude all projects.
    exclusion_list = [pattern for pattern in exclusion_list if pattern]
    if not exclusion_list:
        return projects

    exclusion_regex = re.compile('|'.join(exclusion_list))

    filtered = {}
    for name, path in projects.items():
        if exclusion_regex.match(name):
            print('Excluding ' + name)
        else:
            filtered[name] = path

    return filtered


def get_all_projects_stats(upstream_source_tree,
                           downstream_source_tree,
                           exclusion_file):
    """Finds the stats of all projects in a source tree.

    Args:
      upstream_source_tree: A string with the path to the upstream gerrit
        source tree.
      downstream_source_tree: A string with the path to the downstream gerrit
        source tree.
      exclusion_file: A string with the path to the exclusion file.

    Returns:
      A list of dicts of matching upstream and downstream projects
      including stats for projects that match.
    """
    upstream_projects = get_projects_with_filter(
        upstream_source_tree, exclusion_file)
    downstream_projects = get_projects_with_filter(
        downstream_source_tree, exclusion_file)

    matches = match_projects(upstream_projects, downstream_projects)

    # One worker thread per CPU; each worker runs the git commands for a match.
    pool = multiprocessing.pool.ThreadPool(
        processes=multiprocessing.cpu_count())
    try:
        return pool.map(
            lambda match: stats_from_match(
                upstream_projects,
                downstream_projects,
                match,
            ),
            matches,
        )
    finally:
        # Release the worker threads even when a diff fails.
        pool.close()
        pool.join()


def stats_from_match(upstream_projects, downstream_projects, match):
    """Finds the stats of a single match of two projects.

    Args:
      upstream_projects: dict of upstream project paths keyed by project name
      downstream_projects: dict of downstream project paths keyed by project
        name
      match: a single match dict obtained from match_projects

    Returns:
      A dict of stats for this particular match: the diff counts, a 'status'
      label, the 'downstream_path' and the 'upstream'/'downstream' names.
    """

    def display_status(upstream_project_name,
                       downstream_project_name,
                       project_stats):
        """Classify the match into the status label used in the report."""
        if not upstream_project_name:
            return 'Downstream Only Projects'
        elif not downstream_project_name:
            return 'Upstream Only Projects'
        elif project_stats['file'] == 0:
            return 'Intact Projects'
        elif upstream_project_name == downstream_project_name:
            return 'Modified Projects'
        # Matched by root commit under a different name, with changes.
        return 'Forked Projects'

    upstream_project_name = match['upstream']
    downstream_project_name = match['downstream']
    downstream_path = downstream_projects.get(downstream_project_name)

    project_stats = get_project_stats(
        upstream_projects.get(upstream_project_name),
        downstream_path,
    )
    project_stats.update({
        'status': display_status(
            upstream_project_name,
            downstream_project_name,
            project_stats
        ),
        'downstream_path': downstream_path,
    })
    project_stats.update(match)
    return project_stats


def get_projects_with_filter(source_tree, exclusion_file):
    """Helper function to get projects with an exclusion file filter applied.

    Args:
      source_tree: A path to the source tree.
      exclusion_file: Path to the exclusion file, or a false value.

    Returns:
      A dict of project paths keyed by project names, minus excluded projects.
    """
    return filter_exclusion_list(
        get_projects(source_tree),
        exclusion_file,
    )


def find_root_commits_in_path(path):
    """Returns a list of root commits in a git project path."""
    print('Analyzing history of ' + path)
    # --max-parents=0 keeps only the commits without a parent.
    rev_list = git(['-C', path, 'rev-list', '--max-parents=0', 'HEAD'])
    return rev_list.splitlines()


def find_root_commits_in_projects(projects):
    """Returns a dict of root commits with all projects with that root commit.

    Args:
      projects: A dict of project paths keyed by project names.

    Returns:
      A dict keyed by root commit; each value is a list of dicts with the
      'name' and 'path' of every project that has that root commit.
    """
    root_commits = {}
    for name, path in projects.items():
        for root in find_root_commits_in_path(path):
            root_commits.setdefault(root, []).append({
                'name': name,
                'path': path,
            })
    return root_commits


def get_commit_stats_in_project(project):
    """Extract commits that have not been upstreamed in a specific project.

    Args:
      project: A dict with the project 'name' and its 'downstream_path'.

    Returns:
      A dict with the project 'name' and 'stats', a list of dicts holding the
      'commit', 'author' and 'subject' of each commit not upstreamed.
    """
    name = project['name']
    path = project['downstream_path']
    print('Finding commits not upstreamed in ' + name)
    commits = git_commits_not_upstreamed.find('FETCH_HEAD', 'HEAD', path)
    print('Found commits not upstreamed in ' + name)

    stats = []
    for commit in commits:
        author = git(
            ['-C', path, 'show', '--no-patch', '--format=%ae', commit])
        subject = git(
            ['-C', path, 'show', '--no-patch', '--format=%s', commit])
        stats.append({
            'commit': commit,
            'author': author.strip(),
            'subject': subject.strip(),
        })

    return {
        'name': name,
        'stats': stats,
    }


def get_all_commits_stats(project_stats):
    """Extract commits that have not been upstreamed in all projects.

    Args:
      project_stats: A list of dicts of matching upstream and downstream
        projects including stats for projects that match.

    Returns:
      A dict keyed by downstream project name; each value is the list of
      commit dicts not upstreamed for that project.
    """
    downstream_stats = {match['downstream']: match for match in project_stats}

    # Only analyze modified projects. Each entry is a copy carrying the extra
    # 'name' key, so the caller's dicts are left untouched.
    modified_projects = [
        dict(stats, name=name)
        for name, stats in downstream_stats.items()
        if stats['status'].startswith('Modified')
    ]

    pool = multiprocessing.Pool()
    try:
        results = pool.map(get_commit_stats_in_project, modified_projects)
    finally:
        # Release the worker processes even when a project fails.
        pool.close()
        pool.join()

    return {result['name']: result['stats'] for result in results}


def write_commit_csv(commit_stats, commit_output_file):
    """Write commit data to a CSV file.

    Args:
      commit_stats: The dict of the stats for all commits, keyed by
        downstream project name.
      commit_output_file: Path to the output file.
    """
    with open(commit_output_file, 'w') as f:
        fieldnames = [
            'Date',
            'Commit',
            'Downstream Project',
            'Author',
            'Subject',
        ]
        today = datetime.datetime.today().strftime('%Y/%m/%d')
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for project, stats in commit_stats.items():
            for stat in stats:
                writer.writerow({
                    'Date': today,
                    'Commit': stat['commit'],
                    'Downstream Project': project,
                    'Author': stat['author'],
                    'Subject': stat['subject'],
                })
    print('Wrote commit stats to ' + commit_output_file)


def write_project_csv(project_stats, commit_stats, project_output_file):
    """Write project comparison data to a CSV file.

    Args:
      project_stats: The list of the stats dicts for all projects.
      commit_stats: The dict of the stats for all commits, keyed by
        downstream project name.
      project_output_file: Path to the output file.
    """
    with open(project_output_file, 'w') as f:
        fieldnames = [
            'Date',
            'Downstream Project',
            'Upstream Project',
            'Diff Status',
            'Files Changed',
            'Line Insertions',
            'Line Deletions',
            'Line Changes',
            'Commits Not Upstreamed',
        ]
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        today = datetime.datetime.today().strftime('%Y/%m/%d')
        for stat in project_stats:
            downstream_project = stat['downstream']
            # Projects without commit stats report zero commits.
            commits_not_upstreamed = 0
            if downstream_project in commit_stats:
                commits_not_upstreamed = len(commit_stats[downstream_project])
            writer.writerow({
                'Date': today,
                'Downstream Project': downstream_project,
                'Upstream Project': stat['upstream'],
                'Diff Status': stat['status'],
                'Files Changed': stat['file'],
                'Line Insertions': stat['insertion'],
                'Line Deletions': stat['deletion'],
                'Line Changes': stat['insertion'] + stat['deletion'],
                'Commits Not Upstreamed': commits_not_upstreamed,
            })
    print('Wrote project stats to ' + project_output_file)


def diff(upstream_source_tree, downstream_source_tree, project_output_file,
         commit_output_file, exclusions_file):
    """Diff one repo source tree against another.

    Args:
      upstream_source_tree: A string with the path to a gerrit source tree.
      downstream_source_tree: A string with the path to a gerrit source tree.
      project_output_file: Path to the project output file.
      commit_output_file: Path to the commit output file.
      exclusions_file: Path to exclusions file.
    """
    project_stats = get_all_projects_stats(upstream_source_tree,
                                           downstream_source_tree,
                                           exclusions_file)
    commit_stats = get_all_commits_stats(project_stats)
    write_commit_csv(commit_stats, commit_output_file)
    write_project_csv(project_stats, commit_stats, project_output_file)


def main():
    """Parse the command line and run the diff of the two source trees."""
    parser = argparse.ArgumentParser(
        description='Diff a repo source tree against an upstream source tree.')
    parser.add_argument(
        'upstream_path', help='Path to an upstream source tree.')
    parser.add_argument(
        'downstream_path', help='Path to a downstream source tree.')
    parser.add_argument(
        '-p',
        '--project_output_file',
        help='Path to write the project output file',
        default='project.csv',)
    parser.add_argument(
        '-c',
        '--commit_output_file',
        help='Path to write the commit output file',
        default='commit.csv',)
    parser.add_argument(
        '-e',
        '--exclusions_file',
        # Adjacent literals are concatenated as-is, so each piece carries its
        # own trailing space.
        help='Path to file with a list of project names to be excluded from '
        'the diff. You may use a regular expression to match project names as '
        'described in https://docs.python.org/2/howto/regex.html',
        default='',
    )
    args = parser.parse_args()
    upstream_source_tree = os.path.abspath(args.upstream_path)
    downstream_source_tree = os.path.abspath(args.downstream_path)
    project_output_file = os.path.abspath(args.project_output_file)
    commit_output_file = os.path.abspath(args.commit_output_file)
    # Keep an empty exclusions path empty: abspath('') would turn it into the
    # current directory and enable filtering.
    exclusions_file = ''
    if args.exclusions_file:
        exclusions_file = os.path.abspath(args.exclusions_file)

    diff(upstream_source_tree, downstream_source_tree, project_output_file,
         commit_output_file, exclusions_file)


if __name__ == '__main__':
    main()