1 package org.unicode.cldr.icu; 2 3 import java.io.File; 4 import java.util.ArrayList; 5 import java.util.Collection; 6 import java.util.HashMap; 7 import java.util.HashSet; 8 import java.util.List; 9 import java.util.Map; 10 import java.util.Set; 11 import java.util.regex.Matcher; 12 import java.util.regex.Pattern; 13 14 import org.unicode.cldr.util.Builder; 15 import org.unicode.cldr.util.CLDRFile; 16 import org.unicode.cldr.util.CLDRFile.DraftStatus; 17 import org.unicode.cldr.util.Factory; 18 import org.unicode.cldr.util.PatternCache; 19 import org.xml.sax.Attributes; 20 import org.xml.sax.SAXException; 21 22 import com.ibm.icu.impl.Utility; 23 import com.ibm.icu.text.MessageFormat; 24 25 /** 26 * Converts CLDR collation files to the ICU format. 27 * @author jchye 28 */ 29 public class CollationMapper extends Mapper { 30 private static Pattern SPECIALS_PATH = PatternCache.get("//ldml/special/icu:([\\w_]++)\\[@icu:([\\w_]++)=\"([^\"]++)\"]"); 31 private String sourceDir; 32 private Factory specialFactory; 33 private Set<String> validSubLocales = new HashSet<String>(); 34 35 // TODO: CLDR 28 ticket #8289 "Move collator CLDR settings into ICU format" 36 // deprecated the collation sub-elements 37 // import, settings, suppress_contractions, and optimize 38 // and changed the data from XML syntax to ICU syntax. 39 // Remove conversion of these elements when we do not need to handle old data any more. 40 41 // Some settings have to be converted to numbers. 42 private Map<String, String> settingsMap = Builder.with(new HashMap<String, String>()) 43 .put("primary", "1") 44 .put("secondary", "2") 45 .put("tertiary", "3") 46 .put("quarternary", "4") 47 .put("identical", "5") 48 .put("on", "2") 49 .get(); 50 51 /** 52 * @param sourceDir the source dir of the collation files 53 * @param specialFactory the factory for any ICU-specific collation info 54 */ 55 public CollationMapper(String sourceDir, Factory specialFactory) { 56 this.sourceDir = sourceDir; 57 this.specialFactory = specialFactory; 58 } 59 60 /** 61 * @return CLDR data converted to an ICU-friendly format 62 */ 63 @Override 64 public IcuData[] fillFromCldr(String locale) { 65 List<IcuData> dataList = new ArrayList<IcuData>(); 66 IcuData mainLocale = new IcuData("common/collation/" + locale + ".xml", locale, true); 67 CollationHandler handler = new CollationHandler(mainLocale); 68 File file = new File(sourceDir, locale + ".xml"); 69 MapperUtils.parseFile(file, handler); 70 dataList.add(mainLocale); 71 72 String[] subLocales = handler.getSubLocales(); 73 if (subLocales != null) { 74 for (String subLocale : subLocales) { 75 dataList.add(fillSubLocale(locale, subLocale)); 76 validSubLocales.add(subLocale); 77 } 78 } 79 80 if (hasSpecialFile(locale)) { 81 CLDRFile specialFile = specialFactory.make(locale, false); 82 mainLocale.setFileComment("ICU <specials> source: <path>/xml/collation/" + locale + ".xml"); 83 for (String path : specialFile) { 84 String fullPath = specialFile.getFullXPath(path); 85 Matcher matcher = SPECIALS_PATH.matcher(fullPath); 86 if (matcher.matches()) { 87 mainLocale.add( 88 MessageFormat.format("/{0}:process({1})", matcher.group(1), matcher.group(2)), 89 matcher.group(3)); 90 } 91 } 92 } 93 94 return MapperUtils.toArray(dataList); 95 } 96 97 /** 98 * Creates an IcuData object for the specified sublocale 99 * @param locale the parent of the sublocale 100 * @param subLocale the sublocale 101 * @return 102 */ 103 private IcuData fillSubLocale(String locale, String subLocale) { 104 IcuData icuData = new IcuData("icu-config.xml & build.xml", subLocale, true); 105 icuData.setFileComment("validSubLocale of \"" + locale + "\""); 106 icuData.add("/___", ""); 107 return icuData; 108 } 109 110 /** 111 * @param filename 112 * @return true if a special XML file with the specified filename is available. 113 */ 114 private boolean hasSpecialFile(String filename) { 115 return specialFactory != null && specialFactory.getAvailable().contains(filename); 116 } 117 118 /** 119 * The XML handler for collation data. 120 */ 121 private class CollationHandler extends MapperUtils.EmptyHandler { 122 private IcuData icuData; 123 private StringBuilder currentText = new StringBuilder(); 124 private String collationType; 125 private boolean isShort; 126 private List<String> properties = new ArrayList<String>(); 127 private List<String> rules = new ArrayList<String>(); 128 private String[] subLocales; 129 130 public CollationHandler(IcuData icuData) { 131 this.icuData = icuData; 132 } 133 134 @Override 135 public void startElement(String uri, String localName, String qName, Attributes attr) throws SAXException { 136 if (qName.equals("collation")) { 137 DraftStatus status = DraftStatus.forString(attr.getValue("draft")); 138 collationType = status.compareTo(DraftStatus.contributed) < 0 ? null : attr.getValue("type"); 139 isShort = attr.getValue("alt") != null; 140 properties.clear(); 141 rules.clear(); 142 } else if (qName.equals("collations")) { 143 String validSubLocales = attr.getValue("validSubLocales"); 144 if (validSubLocales != null) { 145 subLocales = validSubLocales.split("\\s++"); 146 } 147 } else if (qName.equals("version")) { 148 icuData.add("/Version", MapperUtils.formatVersion(attr.getValue("number"))); 149 } 150 if (collationType == null) return; 151 152 // Collation-specific elements. 153 if (qName.equals("settings")) { 154 for (int i = 0; i < attr.getLength(); i++) { 155 String name = attr.getLocalName(i); 156 String value = attr.getValue(i); 157 if (name.equals("strength") || name.equals("backwards")) { 158 value = settingsMap.get(value); 159 } else if (name.equals("hiraganaQuaternary")) { 160 name = "hiraganaQ"; 161 } 162 properties.add(name + " " + value); 163 } 164 } else if (qName.equals("import")) { 165 String value = attr.getValue("source"); 166 String type = attr.getValue("type"); 167 if (type != null) { 168 value += "-u-co-" + type; 169 } 170 properties.add("import " + value); 171 } 172 } 173 174 @Override 175 public void endElement(String uri, String localName, String qName) throws SAXException { 176 // collationType will only be null if the draft status is insufficient. 177 if (qName.equals("defaultCollation")) { 178 icuData.add("/collations/default", currentText.toString()); 179 } else if (collationType == null) { 180 currentText.setLength(0); 181 return; 182 } 183 184 if (qName.equals("suppress_contractions")) { 185 properties.add("suppressContractions " + currentText.toString()); 186 } else if (qName.equals("cr")) { 187 String[] lines = currentText.toString().split("\n"); 188 for (String line : lines) { 189 int commentPos = Utility.quotedIndexOf(line, 0, line.length(), "#"); 190 if (commentPos > -1) { 191 line = line.substring(0, commentPos); 192 } 193 line = line.trim(); 194 if (line.length() > 0) { 195 rules.add(line); 196 } 197 } 198 } else if (qName.equals("collation")) { 199 // Add attributes before the main rules. 200 StringBuilder attrBuffer = new StringBuilder(); 201 if (properties.size() > 0) { 202 for (String property : properties) { 203 attrBuffer.append('[').append(property).append(']'); 204 } 205 rules.add(0, attrBuffer.toString()); 206 } 207 208 String[] rulesArray; 209 if (rules.size() == 0) { 210 rulesArray = new String[] { "" }; 211 } else { 212 rulesArray = new String[rules.size()]; 213 rules.toArray(rulesArray); 214 } 215 216 String rbPath = "/collations/" + collationType + "/Sequence"; 217 // Always prefer the short version. 218 if (isShort || !icuData.containsKey(rbPath)) { 219 icuData.replace(rbPath, rulesArray); 220 icuData.replace("/collations/" + collationType + "/Version", new String[] { CLDRFile.GEN_VERSION }); 221 } 222 } 223 currentText.setLength(0); 224 } 225 226 @Override 227 public void characters(char[] ch, int start, int length) throws SAXException { 228 currentText.append(ch, start, length); 229 } 230 231 public String[] getSubLocales() { 232 return subLocales; 233 } 234 } 235 236 @Override 237 public Collection<String> getAvailable() { 238 return MapperUtils.getNames(sourceDir); 239 } 240 241 @Override 242 public Makefile generateMakefile(Collection<String> aliases) { 243 Makefile makefile = new Makefile("COLLATION"); 244 makefile.addSyntheticAlias(aliases); 245 makefile.addAliasSource(); 246 // Split sources into locales and sublocales. 247 List<String> subLocales = new ArrayList<String>(); 248 List<String> locales = new ArrayList<String>(); 249 locales.add("$(COLLATION_EMPTY_SOURCE)"); 250 for (String source : sources) { 251 if (validSubLocales.contains(source)) { 252 subLocales.add(source); 253 } else { 254 locales.add(source); 255 } 256 } 257 makefile.addEntry("COLLATION_EMPTY_SOURCE", 258 "Empty locales, used for validSubLocale fallback.", 259 subLocales); 260 makefile.addSource(locales); 261 return makefile; 262 } 263 } 264