Home | History | Annotate | Download | only in tools
      1 #!/usr/bin/perl -w
      2 # Copyright (c) 2013 The Chromium Authors. All rights reserved.
      3 # Use of this source code is governed by a BSD-style license that can be
      4 # found in the LICENSE file.
      5 
      6 # Use: find_copyrights.pl <start-from> [exclude-dir ...]
      7 
      8 use strict;
      9 use warnings;
     10 use File::Basename;
     11 
     12 sub check_is_generated_file($);
     13 sub start_copyright_parsing();
     14 
     15 my $progname = basename($0);
     16 
     17 my $root_dir = shift @ARGV;
     18 my @find_args = ();
     19 while (@ARGV) {
     20     my $path = shift @ARGV;
     21     push @find_args, qw'-not ( -path', "*/$path/*", qw'-prune )'
     22 }
     23 push @find_args, qw(-follow -type f -print);
     24 
     25 open FIND, '-|', 'find', $root_dir, @find_args
     26             or die "$progname: Couldn't exec find: $!\n";
     27 my $check_regex = '\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)' .
     28     '|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?' .
     29     '|tex|mli?)$';
     30 my @files = ();
     31 while (<FIND>) {
     32     chomp;
     33     push @files, $_ unless (-z $_ || !m%$check_regex%);
     34 }
     35 close FIND;
     36 
     37 my $generated_file_scan_boundary = 25;
     38 while (@files) {
     39     my $file = shift @files;
     40     my $file_header = '';
     41     my %copyrights;
     42     open (F, "<$file") or die "$progname: Unable to access $file\n";
     43     my $parse_copyright = start_copyright_parsing();
     44     while (<F>) {
     45         $file_header .= $_ unless $. > $generated_file_scan_boundary;
     46         my $copyright_match = $parse_copyright->($_, $.);
     47         if ($copyright_match) {
     48             $copyrights{lc("$copyright_match")} = "$copyright_match";
     49         }
     50     }
     51     close(F);
     52     my $copyright = join(" / ", values %copyrights);
     53     print "$file\t";
     54     if (check_is_generated_file($file_header)) {
     55         print "GENERATED FILE";
     56     } else {
     57         print ($copyright or "*No copyright*");
     58     }
     59     print "\n";
     60 }
     61 
     62 sub check_is_generated_file($) {
     63     my $license = uc($_[0]);
     64     # Remove Python multiline comments to avoid false positives
     65     if (index($license, '"""') != -1) {
     66         $license =~ s/"""[^"]*(?:"""|$)//mg;
     67     }
     68     if (index($license, "'''") != -1) {
     69         $license =~ s/'''[^']*(?:'''|$)//mg;
     70     }
     71     # Quick checks using index.
     72     if (index($license, 'ALL CHANGES MADE IN THIS FILE WILL BE LOST') != -1) {
     73         return 1;
     74     }
     75     if (index($license, 'DO NOT EDIT') != -1 ||
     76         index($license, 'DO NOT DELETE') != -1 ||
     77         index($license, 'GENERATED') != -1) {
     78         return ($license =~ /(All changes made in this file will be lost' .
     79             'DO NOT (EDIT|delete this file)|Generated (at|automatically|data)' .
     80             '|Automatically generated|\Wgenerated\s+(?:\w+\s+)*file\W)/i);
     81     }
     82     return 0;
     83 }
     84 
     85 sub are_within_increasing_progression($$$) {
     86     my $delta = $_[0] - $_[1];
     87     return $delta >= 0 && $delta <= $_[2];
     88 }
     89 
     90 sub start_copyright_parsing() {
     91     my $max_line_numbers_proximity = 3;
     92     # Set up the defaults the way that proximity checks will not succeed.
     93     my $last_a_item_line_number = -200;
     94     my $last_b_item_line_number = -100;
     95 
     96     return sub {
     97         my $line = $_[0];
     98         my $line_number = $_[1];
     99 
    100         # Remove C / C++ strings to avoid false positives.
    101         if (index($line, '"') != -1) {
    102             $line =~ s/"[^"\\]*(?:\\.[^"\\]*)*"//g;
    103         }
    104 
    105         my $uc_line = uc($line);
    106 
    107         # Record '(a)' and '(b)' last occurences in C++ comments.
    108         my $cpp_comment_idx = index($uc_line, '//');
    109         if ($cpp_comment_idx != -1) {
    110             if (index($uc_line, '(A)') > $cpp_comment_idx) {
    111                 $last_a_item_line_number = $line_number;
    112             }
    113             if (index($uc_line, '(B)') > $cpp_comment_idx) {
    114                 $last_b_item_line_number = $line_number;
    115             }
    116         }
    117 
    118         # Fast bailout, uses the same patterns as the regexp.
    119         if (index($uc_line, 'COPYRIGHT') == -1 &&
    120             index($uc_line, 'COPR.') == -1 &&
    121             index($uc_line, '\x{00a9}') == -1 &&
    122             index($uc_line, '\xc2\xa9') == -1) {
    123 
    124             my $c_item_index = index($uc_line, '(C)');
    125             return '' if ($c_item_index == -1);
    126             # Filter out 'c' used as a list item inside C++ comments.
    127             # E.g. "// blah-blah (a) blah\n// blah-blah (b) and (c) blah"
    128             if ($c_item_index > $cpp_comment_idx &&
    129                 are_within_increasing_progression(
    130                     $line_number,
    131                     $last_b_item_line_number,
    132                     $max_line_numbers_proximity) &&
    133                 are_within_increasing_progression(
    134                     $last_b_item_line_number,
    135                     $last_a_item_line_number,
    136                     $max_line_numbers_proximity)) {
    137                 return '';
    138             }
    139         }
    140 
    141         my $copyright_indicator_regex =
    142             '(?:copyright|copr\.|\x{00a9}|\xc2\xa9|\(c\))';
    143         my $copyright_disindicator_regex =
    144             '\b(?:info(?:rmation)?|notice|and|or)\b';
    145 
    146         my $copyright = '';
    147         if ($line =~ m%\W$copyright_indicator_regex(?::\s*|\s+)(\w.*)$%i) {
    148             my $match = $1;
    149             if ($match !~ m%^\s*$copyright_disindicator_regex%i) {
    150                 $match =~ s/([,.])?\s*$//;
    151                 $match =~ s/$copyright_indicator_regex//ig;
    152                 $match =~ s/^\s+//;
    153                 $match =~ s/\s{2,}/ /g;
    154                 $match =~ s/\\@/@/g;
    155                 $copyright = $match;
    156             }
    157         }
    158 
    159         return $copyright;
    160     }
    161 }
    162