Home | History | Annotate | Download | only in repo_diff
"""Diffs one repo source tree against an upstream repo source tree.
      2 
      3 Matches the projects from a Gerrit repo workspace to the projects
of an upstream workspace. After identifying the projects that exist both in
the downstream and the upstream workspace, it then diffs each project.
      6 
      7 Finally, the results of the project matching and diffing are reported.
      8 
      9 """
     10 
     11 from __future__ import absolute_import
     12 from __future__ import division
     13 from __future__ import print_function
     14 import argparse
     15 import csv
     16 import datetime
     17 import multiprocessing
     18 import multiprocessing.pool
     19 import os
     20 import re
     21 import subprocess
     22 import xml.etree.ElementTree as et
     23 import git_commits_not_upstreamed
     24 
     25 
     26 def get_projects(source_tree):
     27   """Retrieve the dict of projects names and paths.
     28 
     29   Args:
     30     source_tree: A path to the source tree.
     31 
     32   Returns:
     33     A dict of project paths keyed by project names.
     34   """
     35 
     36   projects = {}
     37 
     38   manifest = source_tree + '/.repo/manifest.xml'
     39   tree = et.parse(manifest)
     40   root = tree.getroot()
     41 
     42   for project in root.findall('project'):
     43     # Ignore projects that are not synced by default
     44     if 'notdefault' in project.get('groups', ''):
     45       continue
     46     path = project.get('path', project.get('name'))
     47     path = os.path.abspath(os.path.join(source_tree, path))
     48     name = project.get('name')
     49 
     50     # check if project files actually exist
     51     if not os.path.exists(path):
     52       continue
     53 
     54     projects[name] = path
     55 
     56   return projects
     57 
     58 
     59 def git(args):
     60   """Git command.
     61 
     62   Args:
     63     args: A list of arguments to be sent to the git command.
     64 
     65   Returns:
     66     The output of the git command.
     67   """
     68 
     69   command = ['git']
     70   command.extend(args)
     71   with open(os.devnull, 'w') as devull:
     72     return subprocess.check_output(command, stderr=devull)
     73 
     74 
     75 def get_revision_diff_stats(directory, rev_a, rev_b):
     76   """Retrieves stats of diff between two git revisions.
     77 
     78   Args:
     79     directory: A path to the git directory to diff.
     80     rev_a: A git revision to diff.
     81     rev_b: A git revision to diff.
     82 
     83   Returns:
     84     A dict with the count of files modified, lines added
     85     and lines removed.
     86   """
     87   stats = {
     88       'file': 0,
     89       'insertion': 0,
     90       'deletion': 0,
     91   }
     92 
     93   git_diffstat = git(
     94       ['-C', directory, 'diff', '--shortstat', rev_a, rev_b])
     95   for element in git_diffstat.split(','):
     96     for key in stats:
     97       if key in element:
     98         stats[key] = int(element.split()[0])
     99 
    100   return stats
    101 
    102 
    103 def get_project_stats(upstream_dir, downstream_dir):
    104   """Retrieves stats of diff between two git projects.
    105 
    106   Diffs a downstream directory against an upstream directory.
    107   Lines that exist only in the downstream directory are considered insertions.
    108   Lines that exist only in the upstream directory are considered deletions.
    109 
    110   Args:
    111     upstream_dir: A path to the upstream directory to compare.
    112     downstream_dir: A path to the downstream directory to compare.
    113 
    114   Returns:
    115     A dict with the count of files modified, lines added
    116     and lines removed.
    117   """
    118   stats = {
    119       'file': 0,
    120       'insertion': 0,
    121       'deletion': 0,
    122   }
    123 
    124   if upstream_dir and downstream_dir:
    125     print('Diffing %s vs %s' % (downstream_dir, upstream_dir))
    126     git(['-C', downstream_dir, 'fetch', '--update-shallow', upstream_dir])
    127     stats = get_revision_diff_stats(downstream_dir, 'FETCH_HEAD', 'HEAD')
    128 
    129   return stats
    130 
    131 
    132 def match_project_by_root_commits(
    133     downstream_project_name, downstream_project_path, upstream_root_commits):
    134   """Match a downstream project to an upstream project using their root commits.
    135 
    136   Find all root commits in a downstream project and find a matching
    137   upstream project that have a root commit in common.
    138 
    139   Args:
    140     downstream_project_name: A string with the downstream project name.
    141     downstream_project_path: A string with the downstream project path.
    142     upstream_root_commits: A dict of root commits and their upstream project.
    143 
    144   Returns:
    145     A string with the matched upstream project name.
    146   """
    147   upstream_match = None
    148   downstream_root_commits = find_root_commits_in_path(downstream_project_path)
    149   for root in downstream_root_commits:
    150     if root in upstream_root_commits:
    151       upstream_project_list = upstream_root_commits[root]
    152       if len(upstream_project_list) > 1:
    153         print('Warning: ' + downstream_project_name +
    154               ' matches multiple projects')
    155         print(upstream_project_list)
    156       else:
    157         upstream_match = upstream_project_list[0]['name']
    158       # Once there's a root commit match, stop looking for a project match
    159       break
    160 
    161   return upstream_match
    162 
    163 
    164 def match_projects(upstream_projects, downstream_projects):
    165   """Match downstream projects to upstream projects.
    166 
    167   Args:
    168     upstream_projects: A dict of upstream projects.
    169     downstream_projects: A dict of downstream projects.
    170 
    171   Returns:
    172     A list of upstream and downstream project pairs.
    173   """
    174 
    175   project_matches = []
    176 
    177   # keep a list of upstream projects that have not been matched
    178   unmatched_upstream_projects = set(upstream_projects.keys())
    179 
    180   upstream_root_commits = find_root_commits_in_projects(upstream_projects)
    181   # Match all downstream projects to an upstream project
    182   for downstream_name, downstream_path in downstream_projects.iteritems():
    183     # First try to match projects by name
    184     if downstream_name in upstream_projects:
    185       upstream_match = downstream_name
    186     # If there is no project name match then try matching by commit
    187     else:
    188       upstream_match = match_project_by_root_commits(
    189           downstream_name, downstream_path, upstream_root_commits)
    190 
    191     project_matches.append({
    192         'upstream': upstream_match,
    193         'downstream': downstream_name,
    194     })
    195     unmatched_upstream_projects.discard(upstream_match)
    196 
    197   # Add all upstream projects that have not been matched
    198   for project in unmatched_upstream_projects:
    199     project_matches.append({
    200         'upstream': project,
    201         'downstream': None,
    202     })
    203 
    204   return project_matches
    205 
    206 
    207 def filter_exclusion_list(projects, exclusion_file):
    208   """Removes all projects that match the exclusion patterns."""
    209 
    210   filtered = {}
    211 
    212   exclusion_list = []
    213   if exclusion_file:
    214     with open(exclusion_file) as f:
    215       exclusion_list = f.readlines()
    216   exclusion_list = [line.strip() for line in exclusion_list]
    217   exclusion_pattern = '|'.join(exclusion_list)
    218 
    219   if exclusion_pattern:
    220     for name, path in projects.iteritems():
    221       if re.match(exclusion_pattern, name):
    222         print('Excluding ' + name)
    223       else:
    224         filtered[name] = path
    225   else:
    226     filtered = projects
    227 
    228   return filtered
    229 
    230 
    231 def get_all_projects_stats(upstream_source_tree,
    232                            downstream_source_tree,
    233                            exclusion_file):
    234   """Finds the stats of all project in a source tree.
    235 
    236   Args:
    237     upstream_source_tree: A string with the path to the upstream gerrit
    238       source tree.
    239     downstream_source_tree: A string with the path to the downstream gerrit
    240       source tree.
    241     exclusion_file: A string with the path to the exclusion file.
    242 
    243   Returns:
    244     A list of dicts of matching upstream and downstream projects
    245     including stats for projects that matches.
    246   """
    247   upstream_projects, downstream_projects = map(
    248     lambda t: get_projects_with_filter(t, exclusion_file),
    249     (upstream_source_tree, downstream_source_tree),
    250   )
    251 
    252   return multiprocessing.pool.ThreadPool(
    253     processes=multiprocessing.cpu_count()
    254   ).map(
    255     lambda match: stats_from_match(
    256       upstream_projects,
    257       downstream_projects,
    258       match,
    259     ),
    260     match_projects(upstream_projects, downstream_projects),
    261   )
    262 
    263 
    264 def stats_from_match(upstream_projects, downstream_projects, match):
    265   """Finds the stats of a single match of two projects.
    266 
    267   Args:
    268     upstream_projects: list of dicts obtained from get_project_stats
    269     downstream_projects: list of dicts obtained from get_project_stats
    270     match: a single match dict obtained from match_projects
    271 
    272   Returns:
    273     A dict of stats for this particular match
    274   """
    275 
    276   def display_status(upstream_project_name,
    277                       downstream_project_name,
    278                       project_stats):
    279     if not upstream_project_name:
    280       return 'Downstream Only Projects'
    281     elif not downstream_project_name:
    282       return 'Upstream Only Projects'
    283     elif project_stats['file'] == 0:
    284       return 'Intact Projects'
    285     elif upstream_project_name == downstream_project_name:
    286       return 'Modified Projects'
    287     return 'Forked Projects'
    288 
    289   upstream_project_name = match['upstream']
    290   downstream_project_name = match['downstream']
    291 
    292   project_stats = get_project_stats(
    293     upstream_projects.get(upstream_project_name),
    294     downstream_projects.get(downstream_project_name),
    295   )
    296   project_stats.update({
    297     'status': display_status(
    298       upstream_project_name,
    299       downstream_project_name,
    300       project_stats
    301     ),
    302     'downstream_path': downstream_projects.get(downstream_project_name)
    303   })
    304   project_stats.update(match)
    305   return project_stats
    306 
    307 
    308 def get_projects_with_filter(source_tree, exclusion_file):
    309   """ Helper function to get projects with an exclusion file filter applied."""
    310   return filter_exclusion_list(
    311     get_projects(source_tree),
    312     exclusion_file,
    313   )
    314 
    315 
    316 def find_root_commits_in_path(path):
    317   """Returns a list of root commits in a git project path."""
    318   print('Analyzing history of ' + path)
    319   rev_list = git(['-C', path, 'rev-list', '--max-parents=0', 'HEAD'])
    320   return rev_list.splitlines()
    321 
    322 
    323 def find_root_commits_in_projects(projects):
    324   """Returns a dict of root commits with all projects with that root commit."""
    325   root_commits = {}
    326   for name, path in projects.iteritems():
    327     for root in find_root_commits_in_path(path):
    328       root_list = root_commits.get(root, [])
    329       root_list.append({
    330           'name': name,
    331           'path': path,
    332       })
    333       root_commits[root] = root_list
    334   return root_commits
    335 
    336 
    337 def get_commit_stats_in_project(project):
    338   """Extract commits that have not been upstreamed in a specific project.
    339 
    340   Args:
    341     project: A dict of a project name and path.
    342 
    343   Returns:
    344     A dict of commits not upstreamed.
    345   """
    346   name = project['name']
    347   path = project['downstream_path']
    348   print('Finding commits not upstreamed in ' + name)
    349   commits = git_commits_not_upstreamed.find('FETCH_HEAD', 'HEAD', path)
    350   print('Found commits not upstreamed in ' + name)
    351   stats = []
    352   for commit in commits:
    353     author = git(['-C', path, 'show', '--no-patch', '--format=%ae', commit])
    354     author = author.strip()
    355     subject = git(['-C', path, 'show', '--no-patch', '--format=%s', commit])
    356     subject = subject.strip()
    357     stats.append({
    358         'commit': commit,
    359         'author': author,
    360         'subject': subject,
    361     })
    362 
    363   return {
    364       'name': name,
    365       'stats': stats,
    366   }
    367 
    368 
    369 def get_all_commits_stats(project_stats):
    370   """Extract commits that have not been upstreamed in all projects.
    371 
    372   Args:
    373     project_stats: A dict of matching upstream and downstream projects
    374       including stats for projects that matches.
    375 
    376   Returns:
    377     A dict of commits not upstreamed.
    378   """
    379   commit_stats = {}
    380   downstream_stats = {match['downstream']: match for match in project_stats}
    381 
    382   # Only analyze modified projects
    383   modified_projects = []
    384   for name, stats in downstream_stats.iteritems():
    385     if stats['status'].startswith('Modified'):
    386       stats['name'] = name
    387       modified_projects.append(stats)
    388 
    389   pool = multiprocessing.Pool()
    390 
    391   commit_stats = pool.map(get_commit_stats_in_project, modified_projects)
    392 
    393   commit_stats = {stats['name']: stats['stats'] for stats in commit_stats}
    394 
    395   return commit_stats
    396 
    397 
    398 def write_commit_csv(commit_stats, commit_output_file):
    399   """Write project comparison data to a CSV file.
    400 
    401   Args:
    402     commit_stats: The dict of the stats for all commits.
    403     commit_output_file: Path to the output file.
    404   """
    405   with open(commit_output_file, 'w') as f:
    406     fieldnames = [
    407         'Date',
    408         'Commit',
    409         'Downstream Project',
    410         'Author',
    411         'Subject',
    412     ]
    413     today = datetime.datetime.today().strftime('%Y/%m/%d')
    414     writer = csv.DictWriter(f, fieldnames=fieldnames)
    415     writer.writeheader()
    416     for project, stats in commit_stats.iteritems():
    417       for stat in stats:
    418         writer.writerow({
    419             'Date': today,
    420             'Commit': stat['commit'],
    421             'Downstream Project': project,
    422             'Author': stat['author'],
    423             'Subject': stat['subject'],
    424         })
    425   print('Wrote commit stats to ' + commit_output_file)
    426 
    427 
    428 def write_project_csv(project_stats, commit_stats, project_output_file):
    429   """Write project comparison data to a CSV file.
    430 
    431   Args:
    432     project_stats: The dict of the stats for all projects.
    433     commit_stats: The dict of the stats for all commits.
    434     project_output_file: Path to the output file.
    435   """
    436   with open(project_output_file, 'w') as f:
    437     fieldnames = [
    438         'Date',
    439         'Downstream Project',
    440         'Upstream Project',
    441         'Diff Status',
    442         'Files Changed',
    443         'Line Insertions',
    444         'Line Deletions',
    445         'Line Changes',
    446         'Commits Not Upstreamed',
    447     ]
    448     writer = csv.DictWriter(f, fieldnames=fieldnames)
    449     writer.writeheader()
    450     today = datetime.datetime.today().strftime('%Y/%m/%d')
    451     for stat in project_stats:
    452       commits_not_upstreamed = 0
    453       downstream_project = stat['downstream']
    454       if downstream_project in commit_stats:
    455         commits_not_upstreamed = len(commit_stats[downstream_project])
    456       writer.writerow({
    457           'Date': today,
    458           'Downstream Project': downstream_project,
    459           'Upstream Project': stat['upstream'],
    460           'Diff Status': stat['status'],
    461           'Files Changed': stat['file'],
    462           'Line Insertions': stat['insertion'],
    463           'Line Deletions': stat['deletion'],
    464           'Line Changes': stat['insertion'] + stat['deletion'],
    465           'Commits Not Upstreamed': commits_not_upstreamed,
    466       })
    467   print('Wrote project stats to ' + project_output_file)
    468 
    469 
    470 def diff(upstream_source_tree, downstream_source_tree, project_output_file,
    471          commit_output_file, exclusions_file):
    472   """Diff one repo source tree against another.
    473 
    474   Args:
    475     upstream_source_tree: A string with the path to a gerrit source tree.
    476     downstream_source_tree: A string with the path to a gerrit source tree.
    477     project_output_file: Path to the project output file.
    478     commit_output_file: Path to the commit output file.
    479     exclusions_file: Path to exclusions file.
    480   """
    481   project_stats = get_all_projects_stats(upstream_source_tree,
    482                                          downstream_source_tree,
    483                                          exclusions_file)
    484   commit_stats = get_all_commits_stats(project_stats)
    485   write_commit_csv(commit_stats, commit_output_file)
    486   write_project_csv(project_stats, commit_stats, project_output_file)
    487 
    488 
    489 def main():
    490   parser = argparse.ArgumentParser(
    491       description='Diff a repo source tree against an upstream source tree.')
    492   parser.add_argument('upstream_path', help='Path to an upstream source tree.')
    493   parser.add_argument(
    494       'downstream_path', help='Path to a downstream source tree.')
    495   parser.add_argument(
    496       '-p',
    497       '--project_output_file',
    498       help='Path to write the project output file',
    499       default='project.csv',)
    500   parser.add_argument(
    501       '-c',
    502       '--commit_output_file',
    503       help='Path to write the commit output file',
    504       default='commit.csv',)
    505   parser.add_argument(
    506       '-e',
    507       '--exclusions_file',
    508       help='Path to file with a list of project names to be excluded from'
    509       'the diff. You may use a regular expression to match project names as'
    510       'described in https://docs.python.org/2/howto/regex.html',
    511       default='',
    512   )
    513   args = parser.parse_args()
    514   upstream_source_tree = os.path.abspath(args.upstream_path)
    515   downstream_source_tree = os.path.abspath(args.downstream_path)
    516   project_output_file = os.path.abspath(args.project_output_file)
    517   commit_output_file = os.path.abspath(args.commit_output_file)
    518   exclusions_file = ''
    519   if args.exclusions_file:
    520     exclusions_file = os.path.abspath(args.exclusions_file)
    521 
    522   diff(upstream_source_tree, downstream_source_tree, project_output_file,
    523        commit_output_file, exclusions_file)
    524 
    525 
    526 if __name__ == '__main__':
    527   main()
    528