Home | History | Annotate | Download | only in tool
      1 package org.unicode.cldr.tool;
      2 
      3 import java.io.IOException;
      4 import java.util.Arrays;
      5 import java.util.Collection;
      6 import java.util.EnumMap;
      7 import java.util.LinkedHashMap;
      8 import java.util.Map;
      9 import java.util.Map.Entry;
     10 import java.util.Set;
     11 import java.util.TreeSet;
     12 
     13 import org.unicode.cldr.draft.FileUtilities;
     14 import org.unicode.cldr.tool.FormattedFileWriter.Anchors;
     15 import org.unicode.cldr.util.Annotations;
     16 import org.unicode.cldr.util.Annotations.AnnotationSet;
     17 import org.unicode.cldr.util.CLDRFile;
     18 import org.unicode.cldr.util.CLDRPaths;
     19 import org.unicode.cldr.util.CldrUtility;
     20 import org.unicode.cldr.util.Factory;
     21 import org.unicode.cldr.util.FileCopier;
     22 import org.unicode.cldr.util.LanguageGroup;
     23 import org.unicode.cldr.util.LanguageTagParser;
     24 import org.unicode.cldr.util.LocaleIDParser;
     25 
     26 import com.google.common.collect.Multimap;
     27 import com.google.common.collect.TreeMultimap;
     28 import com.ibm.icu.dev.util.CollectionUtilities;
     29 import com.ibm.icu.impl.Relation;
     30 import com.ibm.icu.impl.Row;
     31 import com.ibm.icu.impl.Row.R3;
     32 import com.ibm.icu.impl.Utility;
     33 import com.ibm.icu.text.RuleBasedCollator;
     34 import com.ibm.icu.text.UnicodeSet;
     35 import com.ibm.icu.util.ULocale;
     36 
     37 public class ChartAnnotations extends Chart {
     38 
     39     private static final String LDML_ANNOTATIONS = "<a href='http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-general.html#Annotations'>LDML Annotations</a>";
     40 
     41     private static final String MAIN_HEADER = "<p>Annotations provide names and keywords for Unicode characters, currently focusing on emoji. "
     42         + "If you see any problems, please <a target='_blank' href='http://unicode.org/cldr/trac/newticket'>file a ticket</a> with the corrected values for the locale. "
     43         + "For the XML data used for these charts, see "
     44         + "<a href='http://unicode.org/repos/cldr/tags/latest/common/annotations/'>latest-release annotations </a> "
     45         + "or <a href='http://unicode.org/repos/cldr/tags/latest/common/annotations/'>beta annotations</a>. "
     46         + "For more information, see " + LDML_ANNOTATIONS + ".</p>";
     47     private static final boolean DEBUG = false;
     48     private static final String DIR = CLDRPaths.CHART_DIRECTORY + "annotations/";
     49 
     50     public static void main(String[] args) {
     51         new ChartAnnotations().writeChart(null);
     52     }
     53 
     54     @Override
     55     public String getDirectory() {
     56         return DIR;
     57     }
     58 
     59     @Override
     60     public String getTitle() {
     61         return "Annotation Charts";
     62     }
     63 
     64     @Override
     65     public String getFileName() {
     66         return "index";
     67     }
     68 
     69     @Override
     70     public String getExplanation() {
     71         return MAIN_HEADER + "<p>The charts are presented in groups of related languages, for easier comparison.<p>";
     72     }
     73 
     74     public void writeContents(FormattedFileWriter pw) throws IOException {
     75         FileCopier.ensureDirectoryExists(DIR);
     76         FileCopier.copy(Chart.class, "index.css", DIR);
     77 
     78         FormattedFileWriter.Anchors anchors = new FormattedFileWriter.Anchors();
     79         writeSubcharts(anchors);
     80         pw.setIndex("Main Chart Index", "../index.html");
     81         pw.write(anchors.toString());
     82     }
     83 
     84     static final UnicodeSet EXTRAS = new UnicodeSet()
     85         .addAll(Arrays.asList(
     86             "", "", "#", "", "", "", "", "", "", "", "", "",
     87             "", "", "", "", "", "",
     88             "", "", "", "", "", "", "", "", "", "", "",
     89             "", "", "", "", "", "",
     90             "",
     91             "#",
     92             "",
     93             "",
     94             "",
     95             "","",
     96             "",
     97             "",
     98             "",""
     99             ))
    100         .freeze();
    101 
    102     public void writeSubcharts(Anchors anchors) throws IOException {
    103         Set<String> locales = Annotations.getAvailableLocales();
    104 
    105         AnnotationSet english = Annotations.getDataSet("en");
    106         UnicodeSet s = new UnicodeSet(english.keySet()).addAll(EXTRAS).freeze();
    107 
    108         // set up right order for columns
    109 
    110         Map<String, String> nameToCode = new LinkedHashMap<String, String>();
    111         Relation<LanguageGroup, R3<Integer, String, String>> groupToNameAndCodeSorted = Relation.of(
    112             new EnumMap<LanguageGroup, Set<R3<Integer, String, String>>>(LanguageGroup.class),
    113             TreeSet.class);
    114 
    115         Multimap<String, String> localeToSub = TreeMultimap.create();
    116         LanguageTagParser ltp = new LanguageTagParser();
    117 
    118         for (String locale : locales) {
    119             ltp.set(locale);
    120             if (locale.equals("root")) {
    121                 continue;
    122             }
    123             if (locale.equals("en")) { // make first
    124                 continue;
    125             }
    126             String region = ltp.getRegion();
    127             if (!region.isEmpty()) {
    128                 localeToSub.put(ltp.getLanguageScript(), locale);
    129                 continue;
    130             }
    131 
    132             if (locale.startsWith("en")) {
    133                 int debug = 0;
    134             }
    135             String name = ENGLISH.getName(locale, true);
    136             int baseEnd = locale.indexOf('_');
    137             ULocale loc = new ULocale(baseEnd < 0 ? locale : locale.substring(0, baseEnd));
    138             LanguageGroup group = LanguageGroup.get(loc);
    139             int rank = LanguageGroup.rankInGroup(loc);
    140             groupToNameAndCodeSorted.put(group, Row.of(rank, name, locale));
    141         }
    142 
    143         for (Entry<LanguageGroup, Set<R3<Integer, String, String>>> groupPairs : groupToNameAndCodeSorted.keyValuesSet()) {
    144             LanguageGroup group = groupPairs.getKey();
    145             String ename = ENGLISH.getName("en", true);
    146             nameToCode.clear();
    147             nameToCode.put(ename, "en"); // always have english first
    148 
    149             // add English variants if they exist
    150 
    151             for (R3<Integer, String, String> pair : groupPairs.getValue()) {
    152                 String name = pair.get1();
    153                 String locale = pair.get2();
    154                 if (locale.startsWith("en_")) {
    155                     nameToCode.put(name, locale);
    156                 }
    157             }
    158 
    159             for (R3<Integer, String, String> pair : groupPairs.getValue()) {
    160                 String name = pair.get1();
    161                 String locale = pair.get2();
    162 
    163                 nameToCode.put(name, locale);
    164                 System.out.println(pair);
    165             }
    166             // now build table with right order for columns
    167             double width = ((int) ((99.0 / (locales.size() + 1)) * 1000)) / 1000.0;
    168             //String widthString = "class='source' width='"+ width + "%'";
    169             String widthStringTarget = "class='target' width='" + width + "%'";
    170 
    171             TablePrinter tablePrinter = new TablePrinter()
    172                 .addColumn("Char", "class='source' width='1%'", CldrUtility.getDoubleLinkMsg(), "class='source-image'", true)
    173                 .addColumn("Hex", "class='source' width='1%'", null, "class='source'", true)
    174             //.addColumn("Formal Name", "class='source' width='" + width + "%'", null, "class='source'", true)
    175             ;
    176 
    177             for (Entry<String, String> entry : nameToCode.entrySet()) {
    178                 String name = entry.getKey();
    179                 tablePrinter.addColumn(name, widthStringTarget, null, "class='target'", true);
    180             }
    181             // sort the characters
    182             Set<String> sorted = new TreeSet<>(RBC);
    183             Multimap<String, String> valueToSub = TreeMultimap.create();
    184 
    185             for (String cp : s.addAllTo(sorted)) {
    186                 tablePrinter
    187                     .addRow()
    188                     .addCell(cp)
    189                     .addCell(Utility.hex(cp, 4, " "))
    190                 //.addCell(getName(cp))
    191                 ;
    192                 for (Entry<String, String> nameAndLocale : nameToCode.entrySet()) {
    193                     String name = nameAndLocale.getKey();
    194                     String locale = nameAndLocale.getValue();
    195 
    196                     AnnotationSet annotations = Annotations.getDataSet(locale);
    197                     AnnotationSet parentAnnotations = Annotations.getDataSet(LocaleIDParser.getParent(locale));
    198                     String baseAnnotation = annotations.toString(cp, true, parentAnnotations);
    199                     String baseAnnotationOriginal = baseAnnotation;
    200 
    201                     if (DEBUG) System.out.println(name + ":" + annotations.toString(cp, false, null));
    202                     Collection<String> subs = localeToSub.get(locale);
    203                     if (!subs.isEmpty()) {
    204                         valueToSub.clear();
    205                         for (String sub : subs) {
    206                             AnnotationSet subAnnotations = Annotations.getDataSet(sub);
    207                             AnnotationSet subParentAnnotations = Annotations.getDataSet(LocaleIDParser.getParent(locale));
    208                             String baseAnnotation2 = subAnnotations.toString(cp, true, subParentAnnotations);
    209                             if (!baseAnnotation2.equals(baseAnnotationOriginal)) {
    210                                 valueToSub.put(baseAnnotation2, sub);
    211                             }
    212                         }
    213                         for (Entry<String, Collection<String>> entry : valueToSub.asMap().entrySet()) {
    214                             baseAnnotation += "<hr><i>" + CollectionUtilities.join(entry.getValue(), ", ") + "</i>: " + entry.getKey();
    215                         }
    216                     }
    217                     tablePrinter.addCell(baseAnnotation);
    218                 }
    219                 tablePrinter.finishRow();
    220             }
    221             final String name = group.toString();
    222             new Subchart(name + " Annotations", FileUtilities.anchorize(name), tablePrinter).writeChart(anchors);
    223         }
    224     }
    225 
    226     static final int FIRST_REGIONAL = 0x1F1E6;
    227     static final int LAST_REGIONAL = 0x1F1FF;
    228 
    229     public static int getRegionalIndicator(int firstCodepoint) {
    230         return FIRST_REGIONAL <= firstCodepoint && firstCodepoint <= LAST_REGIONAL ? firstCodepoint - FIRST_REGIONAL + 'A' : -1;
    231     }
    232 
    233 //    private String getName(String cp) {
    234 //        int ri1 = getRegionalIndicator(cp.codePointAt(0));
    235 //        if (ri1 >= 0) {
    236 //            int ri2 = getRegionalIndicator(cp.codePointAt(2));
    237 //            return ENGLISH.getName(CLDRFile.TERRITORY_NAME, String.valueOf((char) ri1) + String.valueOf((char) ri2));
    238 //        }
    239 //        String result = NAMES80.get(cp);
    240 //        return result != null ? result : UCharacter.getName(cp, ", ");
    241 //    }
    242 //
    243 //    private static UnicodeMap<String> NAMES80 = new UnicodeMap<>();
    244 //    static {
    245 //        String[][] data = {
    246 //            { "", "EMOJI MODIFIER FITZPATRICK TYPE-1-2" },
    247 //            { "", "EMOJI MODIFIER FITZPATRICK TYPE-3" },
    248 //            { "", "EMOJI MODIFIER FITZPATRICK TYPE-4" },
    249 //            { "", "EMOJI MODIFIER FITZPATRICK TYPE-5" },
    250 //            { "", "EMOJI MODIFIER FITZPATRICK TYPE-6" },
    251 //            { "", "ZIPPER-MOUTH FACE" },
    252 //            { "", "MONEY-MOUTH FACE" },
    253 //            { "", "FACE WITH THERMOMETER" },
    254 //            { "", "NERD FACE" },
    255 //            { "", "THINKING FACE" },
    256 //            { "", "FACE WITH ROLLING EYES" },
    257 //            { "", "UPSIDE-DOWN FACE" },
    258 //            { "", "FACE WITH HEAD-BANDAGE" },
    259 //            { "", "ROBOT FACE" },
    260 //            { "", "HUGGING FACE" },
    261 //            { "", "SIGN OF THE HORNS" },
    262 //            { "", "CRAB (also Cancer)" },
    263 //            { "", "SCORPION (also Scorpio)" },
    264 //            { "", "LION FACE (also Leo)" },
    265 //            { "", "BOW AND ARROW (also Sagittarius)" },
    266 //            { "", "AMPHORA (also Aquarius)" },
    267 //            { "", "PLACE OF WORSHIP" },
    268 //            { "", "KAABA" },
    269 //            { "", "MOSQUE" },
    270 //            { "", "SYNAGOGUE" },
    271 //            { "", "MENORAH WITH NINE BRANCHES" },
    272 //            { "", "PRAYER BEADS" },
    273 //            { "", "HOT DOG" },
    274 //            { "", "TACO" },
    275 //            { "", "BURRITO" },
    276 //            { "", "CHEESE WEDGE" },
    277 //            { "", "POPCORN" },
    278 //            { "", "BOTTLE WITH POPPING CORK" },
    279 //            { "", "TURKEY" },
    280 //            { "", "UNICORN FACE" },
    281 //            { "", "CRICKET BAT AND BALL" },
    282 //            { "", "VOLLEYBALL" },
    283 //            { "", "FIELD HOCKEY STICK AND BALL" },
    284 //            { "", "ICE HOCKEY STICK AND PUCK" },
    285 //            { "", "TABLE TENNIS PADDLE AND BALL" },
    286 //            { "", "BADMINTON RACQUET AND SHUTTLECOCK" } };
    287 //        for (String[] pair : data) {
    288 //            NAMES80.put(pair[0], pair[1]);
    289 //        }
    290 //        NAMES80.freeze();
    291 //    }
    292 
    293     private class Subchart extends Chart {
    294         String title;
    295         String file;
    296         private TablePrinter tablePrinter;
    297 
    298         @Override
    299         public boolean getShowDate() {
    300             return false;
    301         }
    302 
    303         public Subchart(String title, String file, TablePrinter tablePrinter) {
    304             super();
    305             this.title = title;
    306             this.file = file;
    307             this.tablePrinter = tablePrinter;
    308         }
    309 
    310         @Override
    311         public String getDirectory() {
    312             return DIR;
    313         }
    314 
    315         @Override
    316         public String getTitle() {
    317             return title;
    318         }
    319 
    320         @Override
    321         public String getFileName() {
    322             return file;
    323         }
    324 
    325         @Override
    326         public String getExplanation() {
    327             return MAIN_HEADER
    328                 + "<p>This table shows the annotations for a group of related languages (plus English) for easier comparison. "
    329                 + "The first item is the <b>short name</b> (also the text-to-speech phrase). "
    330                 + "It is bolded for clarity, and marked with a * for searching on this page. "
    331                 + "The remaining phrases are <b>keywords</b> (labels), separated by |. "
    332                 + "The keywords plus the words in the short name are typically used for search and predictive typing.<p>\n"
    333                 + "<p>Most short names and keywords that can be constructed with the mechanism in " + LDML_ANNOTATIONS + " are omitted. "
    334                 + "However, a few are included for comparison: "
    335                 + CollectionUtilities.join(EXTRAS.addAllTo(new TreeSet<>()), ", ") + ". "
    336                 + "In this chart, missing items are marked with " + Annotations.MISSING_MARKER + ", "
    337                 + "fallback constructed items with " + Annotations.BAD_MARKER + ", "
    338                 + "substituted English values with " + Annotations.ENGLISH_MARKER + ", and "
    339                 + "values equal to their parent locales values are replaced with " + Annotations.EQUIVALENT + ".</p>\n";
    340         }
    341 
    342         @Override
    343         public void writeContents(FormattedFileWriter pw) throws IOException {
    344             pw.write(tablePrinter.toTable());
    345         }
    346     }
    347 
    /**
     * Collator built from the {@code emoji} collation rules found in the root collation file;
     * used to order the characters (rows) of every chart.
     */
    public static RuleBasedCollator RBC;
    static {
        final String emojiRulesPath = "//ldml/collations/collation[@type=\"emoji\"][@visibility=\"external\"]/cr";
        final Factory collationFactory = Factory.make(CLDRPaths.COMMON_DIRECTORY + "collation/", ".*");
        final CLDRFile rootCollation = collationFactory.make("root", false);
        final String emojiRules = rootCollation.getStringValue(emojiRulesPath);

        // Disabled workaround, kept for reference:
//        if (!rules.contains("'#'")) {
//            rules = rules.replace("#", "'#'").replace("*", "'*'"); //hack for 8288
//        }

        try {
            RBC = new RuleBasedCollator(emojiRules);
        } catch (Exception e) {
            // Any failure to build the collator is rethrown unchecked with its cause preserved.
            throw new IllegalArgumentException(e);
        }
    }
    364 
    365 //    static final Set<String> ENGLISH_LABELS = new LinkedHashSet<>(Arrays.asList(
    366 //        "flag", "nature", "objects", "people", "places", "symbols", "travel", "animal",
    367 //        "office", "sign", "word", "time", "food", "person", "weather", "activity",
    368 //        "vehicle", "restaurant", "communication", "emotion", "geometric", "mark",
    369 //        "education", "gesture", "japanese", "symbol", "congratulation", "body", "clothing"));
    370 
    371 //    static class Annotations {
    372 //
    373 //        final UnicodeRelation<String> values = new UnicodeRelation<>();
    374 //
    375 //        static Factory cldrFactory = Factory.make(CLDRPaths.COMMON_DIRECTORY + "annotations/", ".*");
    376 //
    377 //        static Set<String> getAvailableLocales() {
    378 //            return cldrFactory.getAvailable();
    379 //        }
    380 //
    381 //        static Map<String, Annotations> cache = new ConcurrentHashMap<>();
    382 //
    383 //        static synchronized Annotations make(String locale) {
    384 //            Annotations result = cache.get(locale);
    385 //            if (result == null) {
    386 //                CLDRFile file = cldrFactory.make(locale, false); // for now, don't resolve
    387 //                result = new Annotations();
    388 //                LinkedHashSet<String> values = new LinkedHashSet<>();
    389 //                XPathParts parts = new XPathParts();
    390 //                Splitter sp = Splitter.on(';').omitEmptyStrings().trimResults();
    391 //                for (String path : file) {
    392 //                    if (path.startsWith("//ldml/identity")) {
    393 //                        continue;
    394 //                    }
    395 //                    String value = file.getStringValue(path);
    396 //                    String fullPath = file.getFullXPath(path);
    397 //                    String cpString = parts.set(fullPath).getAttributeValue(-1, "cp");
    398 //                    UnicodeSet cps = new UnicodeSet(cpString);
    399 //                    String tts = parts.set(fullPath).getAttributeValue(-1, "tts");
    400 //                    values.clear();
    401 //                    if (tts != null) {
    402 //                        values.add(tts.trim()); // always first value
    403 //                    }
    404 //                    values.addAll(sp.splitToList(value));
    405 //                    result.values.addAll(cps, values);
    406 //                }
    407 //
    408 //                // remove labels
    409 //
    410 //                if (locale.equals("en")) {
    411 //                    for (Entry<String, Set<String>> item : result.values.keyValues()) {
    412 //                        String key = item.getKey();
    413 //                        Set<String> valueSet = new LinkedHashSet<>(item.getValue());
    414 //                        for (String skip : ENGLISH_LABELS) {
    415 //                            if (valueSet.contains(skip)) {
    416 //                                result.values.remove(key, skip);
    417 //                                if (result.values.get(key) == null) {
    418 //                                    result.values.add(key, skip); // restore
    419 //                                    break;
    420 //                                }
    421 //                            }
    422 //                        }
    423 //                        Set<String> newSet = result.values.get(key);
    424 //                        if (!valueSet.equals(newSet)) {
    425 //                            if (DEBUG) System.out.println("dropping labels from " + item.getKey() + ", old: " + valueSet + ", new: " + newSet);
    426 //                        }
    427 //                    }
    428 //                }
    429 //                result.values.freeze();
    430 //                cache.put(locale, result);
    431 //            }
    432 //            return result;
    433 //        }
    434 //    }
    435 }
    436