Home | History | Annotate | Download | only in localedata
      1 #!/usr/bin/env python
      2 #
      3 # Copyright 2016 The Android Open Source Project. All Rights Reserved.
      4 #
      5 # Licensed under the Apache License, Version 2.0 (the "License");
      6 # you may not use this file except in compliance with the License.
      7 # You may obtain a copy of the License at
      8 #
      9 #    http://www.apache.org/licenses/LICENSE-2.0
     10 #
     11 #    Unless required by applicable law or agreed to in writing, software
     12 #    distributed under the License is distributed on an "AS IS" BASIS,
     13 #    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 #    See the License for the specific language governing permissions and
     15 #    limitations under the License.
     16 #
     17 
     18 """Generate a C++ data table containing locale data."""
     19 
     20 import collections
     21 import glob
     22 import os.path
     23 import sys
     24 
     25 
     26 def get_locale_parts(locale):
     27     """Split a locale into three parts, for langauge, script, and region."""
     28     parts = locale.split('_')
     29     if len(parts) == 1:
     30         return (parts[0], None, None)
     31     elif len(parts) == 2:
     32         if len(parts[1]) == 4:  # parts[1] is a script
     33             return (parts[0], parts[1], None)
     34         else:
     35             return (parts[0], None, parts[1])
     36     else:
     37         assert len(parts) == 3
     38         return tuple(parts)
     39 
     40 
     41 def read_likely_subtags(input_file_name):
     42     """Read and parse ICU's likelySubtags.txt."""
     43     with open(input_file_name) as input_file:
     44         likely_script_dict = {
     45             # Android's additions for pseudo-locales. These internal codes make
     46             # sure that the pseudo-locales would not match other English or
     47             # Arabic locales. (We can't use private-use ISO 15924 codes, since
     48             # they may be used by apps for other purposes.)
     49             "en_XA": "~~~A",
     50             "ar_XB": "~~~B",
     51             # Removed data from later versions of ICU
     52             "ji": "Hebr", # Old code for Yiddish, still used in Java and Android
     53         }
     54         representative_locales = {
     55             # Android's additions
     56             "en_Latn_GB", # representative for en_Latn_001
     57             "es_Latn_MX", # representative for es_Latn_419
     58             "es_Latn_US", # representative for es_Latn_419 (not the best idea,
     59                           # but Android has been shipping with it for quite a
     60                           # while. Fortunately, MX < US, so if both exist, MX
     61                           # would be chosen.)
     62         }
     63         for line in input_file:
     64             line = unicode(line, 'UTF-8').strip(u' \n\uFEFF').encode('UTF-8')
     65             if line.startswith('//'):
     66                 continue
     67             if '{' in line and '}' in line:
     68                 from_locale = line[:line.index('{')]
     69                 to_locale = line[line.index('"')+1:line.rindex('"')]
     70                 from_lang, from_scr, from_region = get_locale_parts(from_locale)
     71                 _, to_scr, to_region = get_locale_parts(to_locale)
     72                 if from_lang == 'und':
     73                     continue  # not very useful for our purposes
     74                 if from_region is None and to_region not in ['001', 'ZZ']:
     75                     representative_locales.add(to_locale)
     76                 if from_scr is None:
     77                     likely_script_dict[from_locale] = to_scr
     78         return likely_script_dict, frozenset(representative_locales)
     79 
     80 
     81 # From packLanguageOrRegion() in ResourceTypes.cpp
     82 def pack_language_or_region(inp, base):
     83     """Pack langauge or region in a two-byte tuple."""
     84     if inp is None:
     85         return (0, 0)
     86     elif len(inp) == 2:
     87         return ord(inp[0]), ord(inp[1])
     88     else:
     89         assert len(inp) == 3
     90         base = ord(base)
     91         first = ord(inp[0]) - base
     92         second = ord(inp[1]) - base
     93         third = ord(inp[2]) - base
     94 
     95         return (0x80 | (third << 2) | (second >>3),
     96                 ((second << 5) | first) & 0xFF)
     97 
     98 
     99 # From packLanguage() in ResourceTypes.cpp
    100 def pack_language(language):
    101     """Pack language in a two-byte tuple."""
    102     return pack_language_or_region(language, 'a')
    103 
    104 
    105 # From packRegion() in ResourceTypes.cpp
    106 def pack_region(region):
    107     """Pack region in a two-byte tuple."""
    108     return pack_language_or_region(region, '0')
    109 
    110 
    111 def pack_to_uint32(locale):
    112     """Pack language+region of locale into a 32-bit unsigned integer."""
    113     lang, _, region = get_locale_parts(locale)
    114     plang = pack_language(lang)
    115     pregion = pack_region(region)
    116     return (plang[0] << 24) | (plang[1] << 16) | (pregion[0] << 8) | pregion[1]
    117 
    118 
    119 def dump_script_codes(all_scripts):
    120     """Dump the SCRIPT_CODES table."""
    121     print 'const char SCRIPT_CODES[][4] = {'
    122     for index, script in enumerate(all_scripts):
    123         print "    /* %-2d */ {'%c', '%c', '%c', '%c'}," % (
    124             index, script[0], script[1], script[2], script[3])
    125     print '};'
    126     print
    127 
    128 
    129 def dump_script_data(likely_script_dict, all_scripts):
    130     """Dump the script data."""
    131     print
    132     print 'const std::unordered_map<uint32_t, uint8_t> LIKELY_SCRIPTS({'
    133     for locale in sorted(likely_script_dict.keys()):
    134         script = likely_script_dict[locale]
    135         print '    {0x%08Xu, %2du}, // %s -> %s' % (
    136             pack_to_uint32(locale),
    137             all_scripts.index(script),
    138             locale.replace('_', '-'),
    139             script)
    140     print '});'
    141 
    142 
    143 def pack_to_uint64(locale):
    144     """Pack a full locale into a 64-bit unsigned integer."""
    145     _, script, _ = get_locale_parts(locale)
    146     return ((pack_to_uint32(locale) << 32) |
    147             (ord(script[0]) << 24) |
    148             (ord(script[1]) << 16) |
    149             (ord(script[2]) << 8) |
    150             ord(script[3]))
    151 
    152 
    153 def dump_representative_locales(representative_locales):
    154     """Dump the set of representative locales."""
    155     print
    156     print 'std::unordered_set<uint64_t> REPRESENTATIVE_LOCALES({'
    157     for locale in sorted(representative_locales):
    158         print '    0x%08Xllu, // %s' % (
    159             pack_to_uint64(locale),
    160             locale)
    161     print '});'
    162 
    163 
    164 def read_and_dump_likely_data(icu_data_dir):
    165     """Read and dump the likely-script data."""
    166     likely_subtags_txt = os.path.join(icu_data_dir, 'misc', 'likelySubtags.txt')
    167     likely_script_dict, representative_locales = read_likely_subtags(
    168         likely_subtags_txt)
    169 
    170     all_scripts = list(set(likely_script_dict.values()))
    171     assert len(all_scripts) <= 256
    172     all_scripts.sort()
    173 
    174     dump_script_codes(all_scripts)
    175     dump_script_data(likely_script_dict, all_scripts)
    176     dump_representative_locales(representative_locales)
    177     return likely_script_dict
    178 
    179 
    180 def read_parent_data(icu_data_dir):
    181     """Read locale parent data from ICU data files."""
    182     all_icu_data_files = glob.glob(os.path.join(icu_data_dir, '*', '*.txt'))
    183     parent_dict = {}
    184     for data_file in all_icu_data_files:
    185         locale = os.path.splitext(os.path.basename(data_file))[0]
    186         with open(data_file) as input_file:
    187             for line in input_file:
    188                 if '%%Parent' in line:
    189                     parent = line[line.index('"')+1:line.rindex('"')]
    190                     if locale in parent_dict:
    191                         # Different files shouldn't have different parent info
    192                         assert parent_dict[locale] == parent
    193                     else:
    194                         parent_dict[locale] = parent
    195                 elif locale.startswith('ar_') and 'default{"latn"}' in line:
    196                     # Arabic parent overrides for ASCII digits. Since
    197                     # Unicode extensions are not supported in ResourceTypes,
    198                     # we will use ar-015 (Arabic, Northern Africa) instead
    199                     # of the more correct ar-u-nu-latn.
    200                     parent_dict[locale] = 'ar_015'
    201     return parent_dict
    202 
    203 
    204 def get_likely_script(locale, likely_script_dict):
    205     """Find the likely script for a locale, given the likely-script dictionary.
    206     """
    207     if locale.count('_') == 2:
    208         # it already has a script
    209         return locale.split('_')[1]
    210     elif locale in likely_script_dict:
    211         return likely_script_dict[locale]
    212     else:
    213         language = locale.split('_')[0]
    214         return likely_script_dict[language]
    215 
    216 
    217 def dump_parent_data(script_organized_dict):
    218     """Dump information for parents of locales."""
    219     sorted_scripts = sorted(script_organized_dict.keys())
    220     print
    221     for script in sorted_scripts:
    222         parent_dict = script_organized_dict[script]
    223         print ('const std::unordered_map<uint32_t, uint32_t> %s_PARENTS({'
    224             % script.upper())
    225         for locale in sorted(parent_dict.keys()):
    226             parent = parent_dict[locale]
    227             print '    {0x%08Xu, 0x%08Xu}, // %s -> %s' % (
    228                 pack_to_uint32(locale),
    229                 pack_to_uint32(parent),
    230                 locale.replace('_', '-'),
    231                 parent.replace('_', '-'))
    232         print '});'
    233         print
    234 
    235     print 'const struct {'
    236     print '    const char script[4];'
    237     print '    const std::unordered_map<uint32_t, uint32_t>* map;'
    238     print '} SCRIPT_PARENTS[] = {'
    239     for script in sorted_scripts:
    240         print "    {{'%c', '%c', '%c', '%c'}, &%s_PARENTS}," % (
    241             script[0], script[1], script[2], script[3],
    242             script.upper())
    243     print '};'
    244 
    245 
    246 def dump_parent_tree_depth(parent_dict):
    247     """Find and dump the depth of the parent tree."""
    248     max_depth = 1
    249     for locale, _ in parent_dict.items():
    250         depth = 1
    251         while locale in parent_dict:
    252             locale = parent_dict[locale]
    253             depth += 1
    254         max_depth = max(max_depth, depth)
    255     assert max_depth < 5 # Our algorithms assume small max_depth
    256     print
    257     print 'const size_t MAX_PARENT_DEPTH = %d;' % max_depth
    258 
    259 
    260 def read_and_dump_parent_data(icu_data_dir, likely_script_dict):
    261     """Read parent data from ICU and dump it."""
    262     parent_dict = read_parent_data(icu_data_dir)
    263     script_organized_dict = collections.defaultdict(dict)
    264     for locale in parent_dict:
    265         parent = parent_dict[locale]
    266         if parent == 'root':
    267             continue
    268         script = get_likely_script(locale, likely_script_dict)
    269         script_organized_dict[script][locale] = parent_dict[locale]
    270     dump_parent_data(script_organized_dict)
    271     dump_parent_tree_depth(parent_dict)
    272 
    273 
    274 def main():
    275     """Read the data files from ICU and dump the output to a C++ file."""
    276     source_root = sys.argv[1]
    277     icu_data_dir = os.path.join(
    278         source_root,
    279         'external', 'icu', 'icu4c', 'source', 'data')
    280 
    281     print '// Auto-generated by %s' % sys.argv[0]
    282     print
    283     likely_script_dict = read_and_dump_likely_data(icu_data_dir)
    284     read_and_dump_parent_data(icu_data_dir, likely_script_dict)
    285 
    286 
# Generate the tables only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
    289