Home | History | Annotate | Download | only in genprops
      1 #!/usr/bin/python2.4
      2 # Copyright (c) 2009 International Business Machines
      3 # Corporation and others. All Rights Reserved.
      4 #
      5 #   file name:  ucdcopy.py
      6 #   encoding:   US-ASCII
      7 #   tab size:   8 (not used)
      8 #   indentation:4
      9 #
     10 #   created on: 2009aug04
     11 #   created by: Markus W. Scherer
     12 #
     13 # Copy Unicode Character Database (ucd) files from a tree
     14 # of files downloaded from ftp://www.unicode.org/Public/5.2.0/
     15 # to a folder like ICU's source/data/unidata/
     16 # and modify some of the files to make them more compact.
     17 #
     18 # Invoke with two command-line parameters, for the source
     19 # and destination folders.
     20 
     21 import os
     22 import os.path
     23 import re
     24 import shutil
     25 import sys
     26 
     27 _strip_re = re.compile("^([0-9a-fA-F]+.+?) *#.*")
     28 _code_point_re = re.compile("\s*([0-9a-fA-F]+)\s*;")
     29 
     30 def CopyAndStripWithOptionalMerge(s, t, do_merge):
     31   in_file = open(s, "r")
     32   out_file = open(t, "w")
     33   first = -1  # First code point with first_data.
     34   last = -1  # Last code point with first_data.
     35   first_data = ""  # Common data for code points [first..last].
     36   for line in in_file:
     37     match = _strip_re.match(line)
     38     if match:
     39       line = match.group(1)
     40     else:
     41       line = line.rstrip()
     42     if do_merge:
     43       match = _code_point_re.match(line)
     44       if match:
     45         c = int(match.group(1), 16)
     46         data = line[match.end() - 1:]
     47       else:
     48         c = -1
     49         data = ""
     50       if last >= 0 and (c != (last + 1) or data != first_data):
     51         # output the current range
     52         if first == last:
     53           out_file.write("%04X%s\n" % (first, first_data))
     54         else:
     55           out_file.write("%04X..%04X%s\n" % (first, last, first_data))
     56         first = -1
     57         last = -1
     58         first_data = ""
     59       if c < 0:
     60         # no data on this line, output as is
     61         out_file.write(line)
     62         out_file.write("\n")
     63       else:
     64         # data on this line, store for possible range compaction
     65         if last < 0:
     66           # set as the first line in a possible range
     67           first = c
     68           last = c
     69           first_data = data
     70         else:
     71           # must be c == (last + 1) and data == first_data
     72           # because of previous conditions
     73           # continue with the current range
     74           last = c
     75     else:
     76       # Only strip, don't merge: just output the stripped line.
     77       out_file.write(line)
     78       out_file.write("\n")
     79   if do_merge and last >= 0:
     80     # output the last range in the file
     81     if first == last:
     82       out_file.write("%04X%s\n" % (first, first_data))
     83     else:
     84       out_file.write("%04X..%04X%s\n" % (first, last, first_data))
     85     first = -1
     86     last = -1
     87     first_data = ""
     88   in_file.close()
     89   out_file.flush()
     90   out_file.close()
     91 
     92 
     93 def CopyAndStrip(s, t):
     94   """Copies a file and removes comments behind data lines but not in others."""
     95   CopyAndStripWithOptionalMerge(s, t, False)
     96 
     97 
     98 def CopyAndStripAndMerge(s, t):
     99   """Copies and strips a file and merges lines.
    100 
    101   Copies a file, removes comments, and
    102   merges lines with adjacent code point ranges and identical per-code point
    103   data lines into one line with range syntax.
    104   """
    105   CopyAndStripWithOptionalMerge(s, t, True)
    106 
    107 
    108 _unidata_files = {
    109   # Simply copy these files.
    110   "BidiMirroring.txt": shutil.copy,
    111   "BidiTest.txt": shutil.copy,
    112   "Blocks.txt": shutil.copy,
    113   "CaseFolding.txt": shutil.copy,
    114   "DerivedAge.txt": shutil.copy,
    115   "DerivedBidiClass.txt": shutil.copy,
    116   "DerivedJoiningGroup.txt": shutil.copy,
    117   "DerivedJoiningType.txt": shutil.copy,
    118   "DerivedNumericValues.txt": shutil.copy,
    119   "NameAliases.txt": shutil.copy,
    120   "NormalizationCorrections.txt": shutil.copy,
    121   "PropertyAliases.txt": shutil.copy,
    122   "PropertyValueAliases.txt": shutil.copy,
    123   "SpecialCasing.txt": shutil.copy,
    124   "UnicodeData.txt": shutil.copy,
    125 
    126   # Copy these files and remove comments behind data lines but not in others.
    127   "DerivedCoreProperties.txt": CopyAndStrip,
    128   "DerivedNormalizationProps.txt": CopyAndStrip,
    129   "GraphemeBreakProperty.txt": CopyAndStrip,
    130   "NormalizationTest.txt": CopyAndStrip,
    131   "PropList.txt": CopyAndStrip,
    132   "Scripts.txt": CopyAndStrip,
    133   "SentenceBreakProperty.txt": CopyAndStrip,
    134   "WordBreakProperty.txt": CopyAndStrip,
    135 
    136   # Also merge lines with adjacent code point ranges.
    137   "EastAsianWidth.txt": CopyAndStripAndMerge,
    138   "LineBreak.txt": CopyAndStripAndMerge
    139 }
    140 
    141 _file_version_re = re.compile("^([a-zA-Z0-9]+)" +
    142                               "-[0-9](?:\\.[0-9])*(?:d[0-9]+)?" +
    143                               "(\\.[a-z]+)$")
    144 
    145 def main():
    146   source_root = sys.argv[1]
    147   dest_root = sys.argv[2]
    148   source_files = []
    149   for root, dirs, files in os.walk(source_root):
    150     for file in files:
    151       source_files.append(os.path.join(root, file))
    152   files_processed = set()
    153   for source_file in source_files:
    154     basename = os.path.basename(source_file)
    155     match = _file_version_re.match(basename)
    156     if match:
    157       basename = match.group(1) + match.group(2)
    158       print basename
    159     if basename in _unidata_files:
    160       if basename in files_processed:
    161         print "duplicate file basename %s!" % basename
    162         sys.exit(1)
    163       files_processed.add(basename)
    164       dest_file = os.path.join(dest_root, basename)
    165       _unidata_files[basename](source_file, dest_file)
    166 
    167 
    168 if __name__ == "__main__":
    169   main()
    170