Home | History | Annotate | Download | only in translit
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4 **********************************************************************
      5 * Copyright (c) 2003-2012, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 * Author: Alan Liu
      9 * Created: February 11 2003
     10 * Since: ICU 2.6
     11 **********************************************************************
     12 */
     13 package com.ibm.icu.dev.tool.translit;
     14 
     15 import java.io.FileOutputStream;
     16 import java.io.IOException;
     17 import java.io.PrintStream;
     18 import java.util.Collections;
     19 import java.util.Comparator;
     20 import java.util.HashMap;
     21 import java.util.Iterator;
     22 import java.util.Locale;
     23 import java.util.Map;
     24 import java.util.Set;
     25 import java.util.TreeSet;
     26 import java.util.Vector;
     27 
     28 import com.ibm.icu.impl.Utility;
     29 import com.ibm.icu.lang.UCharacter;
     30 import com.ibm.icu.text.BreakIterator;
     31 import com.ibm.icu.text.UTF16;
     32 import com.ibm.icu.text.UnicodeSet;
     33 
     34 /**
     35  * This class produces the data tables used by the closeOver() method
     36  * of UnicodeSet.
     37  *
     38  * Whenever the Unicode database changes, this tool must be re-run
     39  * (AFTER the data file(s) underlying ICU4J are udpated).
     40  *
     41  * The output of this tool should then be pasted into the appropriate
     42  * files:
     43  *
     44  * ICU4J: com.ibm.icu.text.UnicodeSet.java
     45  * ICU4C: /icu/source/common/uniset.cpp
     46  */
     47 class UnicodeSetCloseOver {
     48 
     49     // Our output files
     50     static final String JAVA_OUT          = "to_UnicodeSet.java";
     51     static final String JAVA_CHARPROP_OUT = "to_UCharacterProperty.java";
     52     static final String C_SET_OUT         = "to_uniset.cpp";
     53     static final String C_UCHAR_OUT       = "to_uchar.c";
     54 
     55     // Source code "do not edit" warning
     56     static final String WARNING = "MACHINE-GENERATED; Unicode version " +
     57         UCharacter.getUnicodeVersion() +
     58         "; DO NOT EDIT; See " +
     59         UnicodeSetCloseOver.class.getName();
     60 
     61     // Case folding options flag.  This must correspond to the options
     62     // used in UnicodeSet.closeOver() in Java and C++.
     63     static final boolean DEFAULT_CASE_MAP = true; // false for Turkish
     64 
     65     public static void main(String[] args) throws IOException {
     66         System.out.println("This tool will generate several output files.  Each is named according");
     67         System.out.println("the target file.  For example, the contents of to_UnicodeSet.java should");
     68         System.out.println("be pasted into UnicodeSet.java.");
     69         System.out.println();
     70 
     71         generateCaseData();
     72     }
     73 
     74     /**
     75      * Create a map of String => Set.  The String in this case is a
     76      * folded string for which
     77      * UCharacter.foldCase(folded. DEFAULT_CASE_MAP).equals(folded).
     78      * The Set contains all single-character strings x for which
     79      * UCharacter.foldCase(x, DEFAULT_CASE_MAP).equals(folded), as
     80      * well as folded itself.
     81      */
     82     static Map createCaseFoldEquivalencyClasses() {
     83         Map equivClasses = new HashMap();
     84         for (int i = 0; i <= 0x10FFFF; ++i) {
     85             int cat = UCharacter.getType(i);
     86             if (cat == Character.UNASSIGNED || cat == Character.PRIVATE_USE)
     87                 continue;
     88 
     89             String cp = UTF16.valueOf(i);
     90             String folded = UCharacter.foldCase(cp, DEFAULT_CASE_MAP);
     91             if (folded.equals(cp)) continue;
     92 
     93             // At this point, have different case folding.  Add
     94             // the code point and its folded equivalent into the
     95             // equivalency class.
     96             TreeSet s = (TreeSet) equivClasses.get(folded);
     97             if (s == null) {
     98                 s = new TreeSet();
     99                 s.add(folded); // add the case fold result itself
    100                 equivClasses.put(folded, s);
    101             }
    102             s.add(cp);
    103         }
    104         return equivClasses;
    105     }
    106 
    107     /**
    108      * Analyze the case fold equivalency classes.  Break them into two
    109      * groups: 'pairs', and 'nonpairs'.  Create a tally of the length
    110      * configurations of the nonpairs.
    111      *
    112      * Length configurations of equivalency classes, as of Unicode
    113      * 3.2.  Most of the classes (83%) have two single codepoints.
    114      * Here "112:28" means there are 28 equivalency classes with 2
    115      * single codepoints and one string of length 2.
    116      *
    117      * 11:656
    118      * 111:16
    119      * 1111:3
    120      * 112:28
    121      * 113:2
    122      * 12:31
    123      * 13:12
    124      * 22:38
    125      *
    126      * Note: This method does not count the frequencies of the
    127      * different length configurations (as shown above after ':'); it
    128      * merely records which configurations occur.
    129      *
    130      * @param pairs Accumulate equivalency classes that consist of
    131      * exactly two codepoints here.  This is 83+% of the classes.
    132      * E.g., {"a", "A"}.
    133      * @param nonpairs Accumulate other equivalency classes here, as
    134      * lists of strings.  E,g, {"st", "\uFB05", "\uFB06"}.
    135      * @param lengths Accumulate a list of unique length structures,
    136      * not including pairs.  Each length structure is represented by a
    137      * string of digits.  The digit string "12" means the equivalency
    138      * class contains a single code point and a string of length 2.
    139      * Typical contents of 'lengths': { "111", "1111", "112",
    140      * "113", "12", "13", "22" }.  Note the absence of "11".
    141      */
    142     static void analyzeCaseData(Map equivClasses,
    143                                 StringBuffer pairs,
    144                                 Vector nonpairs,
    145                                 Vector lengths) {
    146         Iterator i = new TreeSet(equivClasses.keySet()).iterator();
    147         StringBuffer buf = new StringBuffer();
    148         while (i.hasNext()) {
    149             Object key = i.next();
    150             Vector v = new Vector((Set) equivClasses.get(key));
    151             if (v.size() == 2) {
    152                 String a = (String) v.elementAt(0);
    153                 String b = (String) v.elementAt(1);
    154                 if (a.length() == 1 && b.length() == 1) {
    155                     pairs.append(a).append(b);
    156                     continue;
    157                     // Note that pairs are included in 'lengths'
    158                 }
    159             }
    160             String[] a = new String[v.size()];
    161             v.toArray(a);
    162             nonpairs.add(a);
    163             //int singleCount = 0;
    164             //int stringCount = 0;
    165             // Make a string of the lengths, e.g., "111" means 3
    166             // single code points; "13" means a single code point
    167             // and a string of length 3.
    168             v.clear();
    169             for (int j=0; j<a.length; ++j) {
    170                 v.add(new Integer(a[j].length()));
    171             }
    172             Collections.sort(v);
    173             buf.setLength(0);
    174             for (int j=0; j<v.size(); ++j) {
    175                 buf.append(String.valueOf(v.elementAt(j)));
    176             }
    177             if (!lengths.contains(buf.toString())) {
    178                 lengths.add(buf.toString());
    179             }
    180         }
    181     }
    182 
    183     @SuppressWarnings("resource")
    184     static void generateCaseData() throws IOException {
    185 
    186         Map equivClasses = createCaseFoldEquivalencyClasses();
    187 
    188         // Accumulate equivalency classes that consist of exactly
    189         // two codepoints here.  This is 83+% of the classes.
    190         // E.g., {"a", "A"}.
    191         StringBuffer pairs = new StringBuffer();
    192 
    193         // Accumulate other equivalency classes here, as lists
    194         // of strings.  E,g, {"st", "\uFB05", "\uFB06"}.
    195         Vector nonpairs = new Vector(); // contains String[]
    196         Vector lengths = new Vector(); // "111", "12", "22", etc.
    197 
    198         analyzeCaseData(equivClasses, pairs, nonpairs, lengths);
    199 
    200         //-------------------------------------------------------------
    201         // Emit Java source
    202         PrintStream out = new PrintStream(new FileOutputStream(JAVA_OUT));
    203         System.out.println("Writing " + JAVA_OUT);
    204 
    205         out.println("    // " + WARNING);
    206         out.println("    private static final String CASE_PAIRS =");
    207         out.println(Utility.formatForSource(pairs.toString()) + ";");
    208         out.println();
    209         out.println("    // " + WARNING);
    210         out.println("    private static final String[][] CASE_NONPAIRS = {");
    211         for (int j=0; j<nonpairs.size(); ++j) {
    212             String[] a = (String[]) nonpairs.elementAt(j);
    213             out.print("        {");
    214             for (int k=0; k<a.length; ++k) {
    215                 if (k != 0) out.print(", ");
    216                 out.print(Utility.format1ForSource(a[k]));
    217             }
    218             out.println("},");
    219         }
    220         out.println("    };");
    221 
    222         //-------------------------------------------------------------
    223         // Emit C++ source
    224 
    225         // In C++, the pairs are again emitted in an array, but this
    226         // array is the final representation form -- it will not be
    227         // reprocessed into a hash.  It will be binary searched by
    228         // looking at the even elements [0], [2], [4], etc., and
    229         // ignoring the odd elements.  The even elements must contain
    230         // the folded members of the pairs.  That is, in the pair
    231         // {'A', 'a'}, the even element must be 'a', not 'A'.  Then a
    232         // code point to be located is first folded ('Y' => 'y') then
    233         // it binary searched against [0]='A', [2]='B', etc.  When a
    234         // match is found at k, the pair is [k], [k+1].
    235 
    236         out = new PrintStream(new FileOutputStream(C_SET_OUT));
    237         System.out.println("Writing " + C_SET_OUT);
    238 
    239         // Sort the pairs.  They must be ordered by the folded element.
    240         // Store these as two-character strings, with charAt(0) being
    241         // the folded member of the pair.
    242         TreeSet sortPairs = new TreeSet(new Comparator() {
    243             public int compare(Object a, Object b) {
    244                 return ((int) ((String) a).charAt(0)) -
    245                        ((int) ((String) b).charAt(0));
    246             }
    247             public boolean equals(Object obj) {
    248                 return false;
    249             }
    250         });
    251         for (int i=0; i<pairs.length(); i+=2) {
    252             String a = String.valueOf(pairs.charAt(i));
    253             String b = String.valueOf(pairs.charAt(i+1));
    254             String folded = UCharacter.foldCase(a, DEFAULT_CASE_MAP);
    255             if (a.equals(folded)) {
    256                 sortPairs.add(a + b);
    257             } else {
    258                 sortPairs.add(b + a);
    259             }
    260         }
    261 
    262         // Emit the pairs
    263         out.println("// " + WARNING);
    264         out.println("static const UChar CASE_PAIRS[] = {");
    265         Iterator it = sortPairs.iterator();
    266         while (it.hasNext()) {
    267             out.print("    ");
    268             int n = 0;
    269             while (n++ < 5 && it.hasNext()) {
    270                 String s = (String) it.next();
    271                 //out.print((int) s.charAt(0) + "," +
    272                 //                 (int) s.charAt(1) + ",");
    273                 out.print("0x" + Utility.hex(s.charAt(0)) + ",0x" +
    274                                  Utility.hex(s.charAt(1)) + ",");
    275             }
    276             out.println();
    277         }
    278         out.println("};");
    279         out.println();
    280 
    281         // The non-pairs are encoded in the following way.  All the
    282         // single codepoints in each class are grouped together
    283         // followed by a zero.  Then each multi-character string is
    284         // added, followed by a zero.  Finally, another zero is added.
    285         // Some examples:
    286         //  {"iQ", "R"}           =>  [ 'R', 0, 'i', 'Q', 0, 0 ]
    287         //  {"S", "D", "F", "G"}  =>  [ 'S', 'D', 'F', 'G', 0, 0 ]
    288         //  {"jW", "jY"}          =>  [ 0, 'j', 'W', 0, 'j', 'Y', 0, 0 ]
    289         // The end-result is a short, flat array of UChar values that
    290         // can be used to initialize a UChar[] array in C.
    291 
    292         int maxLen = 0; // Maximum encoded length of any class, including zeros
    293         out.println("// " + WARNING);
    294         out.println("static const CaseEquivClass CASE_NONPAIRS[] = {");
    295         for (int j=0; j<nonpairs.size(); ++j) {
    296             int len = 0;
    297             String[] a = (String[]) nonpairs.elementAt(j);
    298             out.print("    {");
    299             // Emit single code points
    300             for (int k=0; k<a.length; ++k) {
    301                 if (a[k].length() != 1) continue;
    302                 //out.print((int) a[k].charAt(0) + ",");
    303                 out.print("0x"+Utility.hex(a[k].charAt(0)) + ",");
    304                 ++len;
    305             }
    306             out.print("0,  "); // End of single code points
    307             ++len;
    308             // Emit multi-character strings
    309             for (int k=0; k<a.length; ++k) {
    310                 if (a[k].length() == 1) continue;
    311                 for (int m=0; m<a[k].length(); ++m) {
    312                     //out.print((int) a[k].charAt(m) + ",");
    313                     out.print("0x"+Utility.hex(a[k].charAt(m)) + ",");
    314                     ++len;
    315                 }
    316                 out.print("0, "); // End of string
    317                 ++len;
    318             }
    319             out.println("0},"); // End of equivalency class
    320             ++len;
    321             if (len > maxLen) maxLen = len;
    322         }
    323         out.println("};");
    324 
    325         // Make sure the CaseEquivClass data can fit.
    326         if (maxLen > 8) {
    327             throw new RuntimeException("Must adjust CaseEquivClass to accomodate " + maxLen + " UChars");
    328         }
    329 
    330         // Also make sure that we can map into this array using a
    331         // CompactByteArray.  We could do this check above, but we
    332         // keep it here, adjacent to the maxLen check.  We use one
    333         // value (-1 == 255) to indicate "no value."
    334         if (nonpairs.size() > 255) {
    335             throw new RuntimeException("Too many CASE_NONPAIRS array elements to be indexed by a CompactByteArray");
    336         }
    337 
    338         //-------------------------------------------------------------
    339         // Case-unique set:  All characters c for which closeOver(c)==c.
    340 
    341         // UPDATE: Instead of using this, we're using the related
    342         // notion of Case_Sensitive.  See below.  Note that
    343         // Case_Sensitive != ^Case_Unique.
    344 
    345         if (false) {
    346             UnicodeSet caseUnique = new UnicodeSet();
    347             for (int i = 0; i <= 0x10FFFF; ++i) {
    348                 String cp = UTF16.valueOf(i);
    349                 if (equivClasses.get(UCharacter.foldCase(cp, DEFAULT_CASE_MAP)) == null) {
    350                     caseUnique.add(i);
    351                 }
    352             }
    353             // out.println("caseUnique = " + caseUnique.toPattern(true));
    354         }
    355 
    356         UnicodeSet caseSensitive = getCaseSensitive();
    357         //System.out.println("caseSensitive = " + caseSensitive.toPattern(true));
    358 
    359         // Now for C, emit an array of ranges
    360         out = new PrintStream(new FileOutputStream(C_UCHAR_OUT));
    361         System.out.println("Writing " + C_UCHAR_OUT);
    362 
    363         out.println("/* " + WARNING + " */");
    364         emitUCharRangesArray(out, caseSensitive, "CASE_SENSITIVE_RANGES");
    365 
    366         // For Java, emit a string with the ranges (each pair of chars
    367         // in the string is a range).
    368         out = new PrintStream(new FileOutputStream(JAVA_CHARPROP_OUT));
    369         System.out.println("Writing " + JAVA_CHARPROP_OUT);
    370         out.println("    // " + WARNING);
    371         emitRangesString(out, caseSensitive, "CASE_SENSITIVE_RANGES");
    372     }
    373 
    374     /**
    375      * Create the set of case-sensitive characters.  These are characters
    376      * that participate in any case mapping operation as a source or
    377      * as a member of a target string.
    378      */
    379     static UnicodeSet getCaseSensitive() {
    380         UnicodeSet caseSensitive = new UnicodeSet();
    381         Locale loc = Locale.US;
    382         BreakIterator bi = BreakIterator.getTitleInstance(loc);
    383         for (int c = 0; c <= 0x10FFFF; ++c) {
    384             String cp = UTF16.valueOf(c);
    385             for (int j=0; j<4; ++j) {
    386                 String s = null;
    387                 switch (j) {
    388                 case 0: s = UCharacter.toUpperCase(loc, cp); break;
    389                 case 1: s = UCharacter.toLowerCase(loc, cp); break;
    390                 case 2: s = UCharacter.toTitleCase(loc, cp, bi); break;
    391                 case 3: s = UCharacter.foldCase(cp, DEFAULT_CASE_MAP); break;
    392                 }
    393                 if (!s.equals(cp)) {
    394                     int cc;
    395                     for (int k=0; k<s.length(); k+=UTF16.getCharCount(cc)) {
    396                         cc = UTF16.charAt(s, k);
    397                         caseSensitive.add(cc);
    398                     }
    399                     for (int k=0; k<cp.length(); k+=UTF16.getCharCount(cc)) {
    400                         cc = UTF16.charAt(cp, k);
    401                         caseSensitive.add(cc);
    402                     }
    403                 }
    404             }
    405             // Also do the single-codepoint API.  This shouldn't add any
    406             // code points, but we do it for completeness.
    407             for (int j=0; j<4; ++j) {
    408                 int d = 0;
    409                 switch (j) {
    410                 case 0: d = UCharacter.toUpperCase(c); break;
    411                 case 1: d = UCharacter.toLowerCase(c); break;
    412                 case 2: d = UCharacter.toTitleCase(c); break;
    413                 case 3: d = UCharacter.foldCase(c, DEFAULT_CASE_MAP); break;
    414                 }
    415                 if (d != c) {
    416                     if (!caseSensitive.contains(c) ||
    417                         !caseSensitive.contains(d)) {
    418                         System.out.println("Warning: code point " + c +
    419                                            " => " + d + " created NEW MAPPING"+
    420                                            " for Case_Sensitive");
    421                     }
    422                     caseSensitive.add(c);
    423                     caseSensitive.add(d);
    424                 }
    425             }
    426         }
    427         return caseSensitive;
    428     }
    429 
    430     /**
    431      * Given a UnicodeSet, emit it as an array of UChar pairs.  Each
    432      * pair will be the start/end of a range.  Code points >= U+10000
    433      * will be represented as surrogate pairs.
    434      */
    435     static void emitUCharRangesArray(PrintStream out, UnicodeSet set, String id) {
    436         // Store the pairs in a StringBuffer.  This handles surrogate
    437         // representation.
    438         StringBuffer buf = new StringBuffer();
    439         for (int i=0; i<set.getRangeCount(); ++i) {
    440             UTF16.append(buf, set.getRangeStart(i));
    441             UTF16.append(buf, set.getRangeEnd(i));
    442         }
    443         // Emit the pairs
    444         out.println("static const UChar " + id + "[] = {");
    445         for (int i=0; i<buf.length(); ) {
    446             out.print("    ");
    447             for (int n=0; n++<10 && i<buf.length(); ++i) {
    448                 out.print("0x" + Utility.hex(buf.charAt(i), 4) + ',');
    449             }
    450             out.println();
    451         }
    452         out.println("};");
    453         out.println("#define " + id + "_LENGTH (sizeof(" + id +
    454                     ")/sizeof(" + id + "[0]))");
    455     }
    456 
    457     /**
    458      * Given a UnicodeSet, emit it as a Java string.  The most economical
    459      * format is not the pattern, but instead a pairs list, with each
    460      * range pair represented as two adjacent characters.
    461      */
    462     static void emitRangesString(PrintStream out, UnicodeSet set, String id) {
    463         // Store the pairs in a StringBuffer.  This handles surrogate
    464         // representation.
    465         StringBuffer buf = new StringBuffer();
    466         for (int i=0; i<set.getRangeCount(); ++i) {
    467             UTF16.append(buf, set.getRangeStart(i));
    468             UTF16.append(buf, set.getRangeEnd(i));
    469         }
    470         // Emit the pairs
    471         out.println("    private static final String " + id + " =");
    472         out.println(Utility.formatForSource(buf.toString()) + ";");
    473     }
    474 }
    475