Home | History | Annotate | Download | only in icu
      1 package org.unicode.cldr.icu;
      2 
      3 import java.io.File;
      4 import java.util.ArrayList;
      5 import java.util.Collection;
      6 import java.util.HashMap;
      7 import java.util.HashSet;
      8 import java.util.List;
      9 import java.util.Map;
     10 import java.util.Set;
     11 import java.util.regex.Matcher;
     12 import java.util.regex.Pattern;
     13 
     14 import org.unicode.cldr.util.Builder;
     15 import org.unicode.cldr.util.CLDRFile;
     16 import org.unicode.cldr.util.CLDRFile.DraftStatus;
     17 import org.unicode.cldr.util.Factory;
     18 import org.unicode.cldr.util.PatternCache;
     19 import org.xml.sax.Attributes;
     20 import org.xml.sax.SAXException;
     21 
     22 import com.ibm.icu.impl.Utility;
     23 import com.ibm.icu.text.MessageFormat;
     24 
     25 /**
     26  * Converts CLDR collation files to the ICU format.
     27  * @author jchye
     28  */
     29 public class CollationMapper extends Mapper {
     30     private static Pattern SPECIALS_PATH = PatternCache.get("//ldml/special/icu:([\\w_]++)\\[@icu:([\\w_]++)=\"([^\"]++)\"]");
     31     private String sourceDir;
     32     private Factory specialFactory;
     33     private Set<String> validSubLocales = new HashSet<String>();
     34 
     35     // TODO: CLDR 28 ticket #8289 "Move collator CLDR settings into ICU format"
     36     // deprecated the collation sub-elements
     37     // import, settings, suppress_contractions, and optimize
     38     // and changed the data from XML syntax to ICU syntax.
     39     // Remove conversion of these elements when we do not need to handle old data any more.
     40 
     41     // Some settings have to be converted to numbers.
     42     private Map<String, String> settingsMap = Builder.with(new HashMap<String, String>())
     43         .put("primary", "1")
     44         .put("secondary", "2")
     45         .put("tertiary", "3")
     46         .put("quarternary", "4")
     47         .put("identical", "5")
     48         .put("on", "2")
     49         .get();
     50 
     51     /**
     52      * @param sourceDir the source dir of the collation files
     53      * @param specialFactory the factory for any ICU-specific collation info
     54      */
     55     public CollationMapper(String sourceDir, Factory specialFactory) {
     56         this.sourceDir = sourceDir;
     57         this.specialFactory = specialFactory;
     58     }
     59 
     60     /**
     61      * @return CLDR data converted to an ICU-friendly format
     62      */
     63     @Override
     64     public IcuData[] fillFromCldr(String locale) {
     65         List<IcuData> dataList = new ArrayList<IcuData>();
     66         IcuData mainLocale = new IcuData("common/collation/" + locale + ".xml", locale, true);
     67         CollationHandler handler = new CollationHandler(mainLocale);
     68         File file = new File(sourceDir, locale + ".xml");
     69         MapperUtils.parseFile(file, handler);
     70         dataList.add(mainLocale);
     71 
     72         String[] subLocales = handler.getSubLocales();
     73         if (subLocales != null) {
     74             for (String subLocale : subLocales) {
     75                 dataList.add(fillSubLocale(locale, subLocale));
     76                 validSubLocales.add(subLocale);
     77             }
     78         }
     79 
     80         if (hasSpecialFile(locale)) {
     81             CLDRFile specialFile = specialFactory.make(locale, false);
     82             mainLocale.setFileComment("ICU <specials> source: <path>/xml/collation/" + locale + ".xml");
     83             for (String path : specialFile) {
     84                 String fullPath = specialFile.getFullXPath(path);
     85                 Matcher matcher = SPECIALS_PATH.matcher(fullPath);
     86                 if (matcher.matches()) {
     87                     mainLocale.add(
     88                         MessageFormat.format("/{0}:process({1})", matcher.group(1), matcher.group(2)),
     89                         matcher.group(3));
     90                 }
     91             }
     92         }
     93 
     94         return MapperUtils.toArray(dataList);
     95     }
     96 
     97     /**
     98      * Creates an IcuData object for the specified sublocale
     99      * @param locale the parent of the sublocale
    100      * @param subLocale the sublocale
    101      * @return
    102      */
    103     private IcuData fillSubLocale(String locale, String subLocale) {
    104         IcuData icuData = new IcuData("icu-config.xml & build.xml", subLocale, true);
    105         icuData.setFileComment("validSubLocale of \"" + locale + "\"");
    106         icuData.add("/___", "");
    107         return icuData;
    108     }
    109 
    110     /**
    111      * @param filename
    112      * @return true if a special XML file with the specified filename is available.
    113      */
    114     private boolean hasSpecialFile(String filename) {
    115         return specialFactory != null && specialFactory.getAvailable().contains(filename);
    116     }
    117 
    118     /**
    119      * The XML handler for collation data.
    120      */
    121     private class CollationHandler extends MapperUtils.EmptyHandler {
    122         private IcuData icuData;
    123         private StringBuilder currentText = new StringBuilder();
    124         private String collationType;
    125         private boolean isShort;
    126         private List<String> properties = new ArrayList<String>();
    127         private List<String> rules = new ArrayList<String>();
    128         private String[] subLocales;
    129 
    130         public CollationHandler(IcuData icuData) {
    131             this.icuData = icuData;
    132         }
    133 
    134         @Override
    135         public void startElement(String uri, String localName, String qName, Attributes attr) throws SAXException {
    136             if (qName.equals("collation")) {
    137                 DraftStatus status = DraftStatus.forString(attr.getValue("draft"));
    138                 collationType = status.compareTo(DraftStatus.contributed) < 0 ? null : attr.getValue("type");
    139                 isShort = attr.getValue("alt") != null;
    140                 properties.clear();
    141                 rules.clear();
    142             } else if (qName.equals("collations")) {
    143                 String validSubLocales = attr.getValue("validSubLocales");
    144                 if (validSubLocales != null) {
    145                     subLocales = validSubLocales.split("\\s++");
    146                 }
    147             } else if (qName.equals("version")) {
    148                 icuData.add("/Version", MapperUtils.formatVersion(attr.getValue("number")));
    149             }
    150             if (collationType == null) return;
    151 
    152             // Collation-specific elements.
    153             if (qName.equals("settings")) {
    154                 for (int i = 0; i < attr.getLength(); i++) {
    155                     String name = attr.getLocalName(i);
    156                     String value = attr.getValue(i);
    157                     if (name.equals("strength") || name.equals("backwards")) {
    158                         value = settingsMap.get(value);
    159                     } else if (name.equals("hiraganaQuaternary")) {
    160                         name = "hiraganaQ";
    161                     }
    162                     properties.add(name + " " + value);
    163                 }
    164             } else if (qName.equals("import")) {
    165                 String value = attr.getValue("source");
    166                 String type = attr.getValue("type");
    167                 if (type != null) {
    168                     value += "-u-co-" + type;
    169                 }
    170                 properties.add("import " + value);
    171             }
    172         }
    173 
    174         @Override
    175         public void endElement(String uri, String localName, String qName) throws SAXException {
    176             // collationType will only be null if the draft status is insufficient.
    177             if (qName.equals("defaultCollation")) {
    178                 icuData.add("/collations/default", currentText.toString());
    179             } else if (collationType == null) {
    180                 currentText.setLength(0);
    181                 return;
    182             }
    183 
    184             if (qName.equals("suppress_contractions")) {
    185                 properties.add("suppressContractions " + currentText.toString());
    186             } else if (qName.equals("cr")) {
    187                 String[] lines = currentText.toString().split("\n");
    188                 for (String line : lines) {
    189                     int commentPos = Utility.quotedIndexOf(line, 0, line.length(), "#");
    190                     if (commentPos > -1) {
    191                         line = line.substring(0, commentPos);
    192                     }
    193                     line = line.trim();
    194                     if (line.length() > 0) {
    195                         rules.add(line);
    196                     }
    197                 }
    198             } else if (qName.equals("collation")) {
    199                 // Add attributes before the main rules.
    200                 StringBuilder attrBuffer = new StringBuilder();
    201                 if (properties.size() > 0) {
    202                     for (String property : properties) {
    203                         attrBuffer.append('[').append(property).append(']');
    204                     }
    205                     rules.add(0, attrBuffer.toString());
    206                 }
    207 
    208                 String[] rulesArray;
    209                 if (rules.size() == 0) {
    210                     rulesArray = new String[] { "" };
    211                 } else {
    212                     rulesArray = new String[rules.size()];
    213                     rules.toArray(rulesArray);
    214                 }
    215 
    216                 String rbPath = "/collations/" + collationType + "/Sequence";
    217                 // Always prefer the short version.
    218                 if (isShort || !icuData.containsKey(rbPath)) {
    219                     icuData.replace(rbPath, rulesArray);
    220                     icuData.replace("/collations/" + collationType + "/Version", new String[] { CLDRFile.GEN_VERSION });
    221                 }
    222             }
    223             currentText.setLength(0);
    224         }
    225 
    226         @Override
    227         public void characters(char[] ch, int start, int length) throws SAXException {
    228             currentText.append(ch, start, length);
    229         }
    230 
    231         public String[] getSubLocales() {
    232             return subLocales;
    233         }
    234     }
    235 
    236     @Override
    237     public Collection<String> getAvailable() {
    238         return MapperUtils.getNames(sourceDir);
    239     }
    240 
    241     @Override
    242     public Makefile generateMakefile(Collection<String> aliases) {
    243         Makefile makefile = new Makefile("COLLATION");
    244         makefile.addSyntheticAlias(aliases);
    245         makefile.addAliasSource();
    246         // Split sources into locales and sublocales.
    247         List<String> subLocales = new ArrayList<String>();
    248         List<String> locales = new ArrayList<String>();
    249         locales.add("$(COLLATION_EMPTY_SOURCE)");
    250         for (String source : sources) {
    251             if (validSubLocales.contains(source)) {
    252                 subLocales.add(source);
    253             } else {
    254                 locales.add(source);
    255             }
    256         }
    257         makefile.addEntry("COLLATION_EMPTY_SOURCE",
    258             "Empty locales, used for validSubLocale fallback.",
    259             subLocales);
    260         makefile.addSource(locales);
    261         return makefile;
    262     }
    263 }
    264