tool/translit/UnicodeSetCloseOver.java

//  2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
**********************************************************************
* Copyright (c) 2003-2012, International Business Machines
* Corporation and others.  All Rights Reserved.
**********************************************************************
* Author: Alan Liu
* Created: February 11 2003
* Since: ICU 2.6
**********************************************************************
*/
package com.ibm.icu.dev.tool.translit;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.Vector;

import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;

/**
 * This class produces the data tables used by the closeOver() method
 * of UnicodeSet.
 *
 * Whenever the Unicode database changes, this tool must be re-run
 * (AFTER the data file(s) underlying ICU4J are udpated).
 *
 * The output of this tool should then be pasted into the appropriate
 * files:
 *
 * ICU4J: com.ibm.icu.text.UnicodeSet.java
 * ICU4C: /icu/source/common/uniset.cpp
 */
class UnicodeSetCloseOver {

    // Our output files
    static final String JAVA_OUT          = "to_UnicodeSet.java";
    static final String JAVA_CHARPROP_OUT = "to_UCharacterProperty.java";
    static final String C_SET_OUT         = "to_uniset.cpp";
    static final String C_UCHAR_OUT       = "to_uchar.c";

    // Source code "do not edit" warning
    static final String WARNING = "MACHINE-GENERATED; Unicode version " +
        UCharacter.getUnicodeVersion() +
        "; DO NOT EDIT; See " +
        UnicodeSetCloseOver.class.getName();

    // Case folding options flag.  This must correspond to the options
    // used in UnicodeSet.closeOver() in Java and C++.
    static final boolean DEFAULT_CASE_MAP = true; // false for Turkish

    public static void main(String[] args) throws IOException {
        System.out.println("This tool will generate several output files.  Each is named according");
        System.out.println("the target file.  For example, the contents of to_UnicodeSet.java should");
        System.out.println("be pasted into UnicodeSet.java.");
        System.out.println();

        generateCaseData();
    }

    /**
     * Create a map of String => Set.  The String in this case is a
     * folded string for which
     * UCharacter.foldCase(folded. DEFAULT_CASE_MAP).equals(folded).
     * The Set contains all single-character strings x for which
     * UCharacter.foldCase(x, DEFAULT_CASE_MAP).equals(folded), as
     * well as folded itself.
     */
    static Map createCaseFoldEquivalencyClasses() {
        Map equivClasses = new HashMap();
        for (int i = 0; i <= 0x10FFFF; ++i) {
            int cat = UCharacter.getType(i);
            if (cat == Character.UNASSIGNED || cat == Character.PRIVATE_USE)
                continue;

            String cp = UTF16.valueOf(i);
            String folded = UCharacter.foldCase(cp, DEFAULT_CASE_MAP);
            if (folded.equals(cp)) continue;

            // At this point, have different case folding.  Add
            // the code point and its folded equivalent into the
            // equivalency class.
            TreeSet s = (TreeSet) equivClasses.get(folded);
            if (s == null) {
                s = new TreeSet();
                s.add(folded); // add the case fold result itself
                equivClasses.put(folded, s);
            }
            s.add(cp);
        }
        return equivClasses;
    }

    /**
     * Analyze the case fold equivalency classes.  Break them into two
     * groups: 'pairs', and 'nonpairs'.  Create a tally of the length
     * configurations of the nonpairs.
     *
     * Length configurations of equivalency classes, as of Unicode
     * 3.2.  Most of the classes (83%) have two single codepoints.
     * Here "112:28" means there are 28 equivalency classes with 2
     * single codepoints and one string of length 2.
     *
     * 11:656
     * 111:16
     * 1111:3
     * 112:28
     * 113:2
     * 12:31
     * 13:12
     * 22:38
     *
     * Note: This method does not count the frequencies of the
     * different length configurations (as shown above after ':'); it
     * merely records which configurations occur.
     *
     * @param pairs Accumulate equivalency classes that consist of
     * exactly two codepoints here.  This is 83+% of the classes.
     * E.g., {"a", "A"}.
     * @param nonpairs Accumulate other equivalency classes here, as
     * lists of strings.  E,g, {"st", "\uFB05", "\uFB06"}.
     * @param lengths Accumulate a list of unique length structures,
     * not including pairs.  Each length structure is represented by a
     * string of digits.  The digit string "12" means the equivalency
     * class contains a single code point and a string of length 2.
     * Typical contents of 'lengths': { "111", "1111", "112",
     * "113", "12", "13", "22" }.  Note the absence of "11".
     */
    static void analyzeCaseData(Map equivClasses,
                                StringBuffer pairs,
                                Vector nonpairs,
                                Vector lengths) {
        Iterator i = new TreeSet(equivClasses.keySet()).iterator();
        StringBuffer buf = new StringBuffer();
        while (i.hasNext()) {
            Object key = i.next();
            Vector v = new Vector((Set) equivClasses.get(key));
            if (v.size() == 2) {
                String a = (String) v.elementAt(0);
                String b = (String) v.elementAt(1);
                if (a.length() == 1 && b.length() == 1) {
                    pairs.append(a).append(b);
                    continue;
                    // Note that pairs are included in 'lengths'
                }
            }
            String[] a = new String[v.size()];
            v.toArray(a);
            nonpairs.add(a);
            //int singleCount = 0;
            //int stringCount = 0;
            // Make a string of the lengths, e.g., "111" means 3
            // single code points; "13" means a single code point
            // and a string of length 3.
            v.clear();
            for (int j=0; j<a.length; ++j) {
                v.add(new Integer(a[j].length()));
            }
            Collections.sort(v);
            buf.setLength(0);
            for (int j=0; j<v.size(); ++j) {
                buf.append(String.valueOf(v.elementAt(j)));
            }
            if (!lengths.contains(buf.toString())) {
                lengths.add(buf.toString());
            }
        }
    }

    @SuppressWarnings("resource")
    static void generateCaseData() throws IOException {

        Map equivClasses = createCaseFoldEquivalencyClasses();

        // Accumulate equivalency classes that consist of exactly
        // two codepoints here.  This is 83+% of the classes.
        // E.g., {"a", "A"}.
        StringBuffer pairs = new StringBuffer();

        // Accumulate other equivalency classes here, as lists
        // of strings.  E,g, {"st", "\uFB05", "\uFB06"}.
        Vector nonpairs = new Vector(); // contains String[]
        Vector lengths = new Vector(); // "111", "12", "22", etc.

        analyzeCaseData(equivClasses, pairs, nonpairs, lengths);

        //-------------------------------------------------------------
        // Emit Java source
        PrintStream out = new PrintStream(new FileOutputStream(JAVA_OUT));
        System.out.println("Writing " + JAVA_OUT);

        out.println("    // " + WARNING);
        out.println("    private static final String CASE_PAIRS =");
        out.println(Utility.formatForSource(pairs.toString()) + ";");
        out.println();
        out.println("    // " + WARNING);
        out.println("    private static final String[][] CASE_NONPAIRS = {");
        for (int j=0; j<nonpairs.size(); ++j) {
            String[] a = (String[]) nonpairs.elementAt(j);
            out.print("        {");
            for (int k=0; k<a.length; ++k) {
                if (k != 0) out.print(", ");
                out.print(Utility.format1ForSource(a[k]));
            }
            out.println("},");
        }
        out.println("    };");

        //-------------------------------------------------------------
        // Emit C++ source

        // In C++, the pairs are again emitted in an array, but this
        // array is the final representation form -- it will not be
        // reprocessed into a hash.  It will be binary searched by
        // looking at the even elements [0], [2], [4], etc., and
        // ignoring the odd elements.  The even elements must contain
        // the folded members of the pairs.  That is, in the pair
        // {'A', 'a'}, the even element must be 'a', not 'A'.  Then a
        // code point to be located is first folded ('Y' => 'y') then
        // it binary searched against [0]='A', [2]='B', etc.  When a
        // match is found at k, the pair is [k], [k+1].

        out = new PrintStream(new FileOutputStream(C_SET_OUT));
        System.out.println("Writing " + C_SET_OUT);

        // Sort the pairs.  They must be ordered by the folded element.
        // Store these as two-character strings, with charAt(0) being
        // the folded member of the pair.
        TreeSet sortPairs = new TreeSet(new Comparator() {
            public int compare(Object a, Object b) {
                return ((int) ((String) a).charAt(0)) -
                       ((int) ((String) b).charAt(0));
            }
            public boolean equals(Object obj) {
                return false;
            }
        });
        for (int i=0; i<pairs.length(); i+=2) {
            String a = String.valueOf(pairs.charAt(i));
            String b = String.valueOf(pairs.charAt(i+1));
            String folded = UCharacter.foldCase(a, DEFAULT_CASE_MAP);
            if (a.equals(folded)) {
                sortPairs.add(a + b);
            } else {
                sortPairs.add(b + a);
            }
        }

        // Emit the pairs
        out.println("// " + WARNING);
        out.println("static const UChar CASE_PAIRS[] = {");
        Iterator it = sortPairs.iterator();
        while (it.hasNext()) {
            out.print("    ");
            int n = 0;
            while (n++ < 5 && it.hasNext()) {
                String s = (String) it.next();
                //out.print((int) s.charAt(0) + "," +
                //                 (int) s.charAt(1) + ",");
                out.print("0x" + Utility.hex(s.charAt(0)) + ",0x" +
                                 Utility.hex(s.charAt(1)) + ",");
            }
            out.println();
        }
        out.println("};");
        out.println();

        // The non-pairs are encoded in the following way.  All the
        // single codepoints in each class are grouped together
        // followed by a zero.  Then each multi-character string is
        // added, followed by a zero.  Finally, another zero is added.
        // Some examples:
        //  {"iQ", "R"}           =>  [ 'R', 0, 'i', 'Q', 0, 0 ]
        //  {"S", "D", "F", "G"}  =>  [ 'S', 'D', 'F', 'G', 0, 0 ]
        //  {"jW", "jY"}          =>  [ 0, 'j', 'W', 0, 'j', 'Y', 0, 0 ]
        // The end-result is a short, flat array of UChar values that
        // can be used to initialize a UChar[] array in C.

        int maxLen = 0; // Maximum encoded length of any class, including zeros
        out.println("// " + WARNING);
        out.println("static const CaseEquivClass CASE_NONPAIRS[] = {");
        for (int j=0; j<nonpairs.size(); ++j) {
            int len = 0;
            String[] a = (String[]) nonpairs.elementAt(j);
            out.print("    {");
            // Emit single code points
            for (int k=0; k<a.length; ++k) {
                if (a[k].length() != 1) continue;
                //out.print((int) a[k].charAt(0) + ",");
                out.print("0x"+Utility.hex(a[k].charAt(0)) + ",");
                ++len;
            }
            out.print("0,  "); // End of single code points
            ++len;
            // Emit multi-character strings
            for (int k=0; k<a.length; ++k) {
                if (a[k].length() == 1) continue;
                for (int m=0; m<a[k].length(); ++m) {
                    //out.print((int) a[k].charAt(m) + ",");
                    out.print("0x"+Utility.hex(a[k].charAt(m)) + ",");
                    ++len;
                }
                out.print("0, "); // End of string
                ++len;
            }
            out.println("0},"); // End of equivalency class
            ++len;
            if (len > maxLen) maxLen = len;
        }
        out.println("};");

        // Make sure the CaseEquivClass data can fit.
        if (maxLen > 8) {
            throw new RuntimeException("Must adjust CaseEquivClass to accomodate " + maxLen + " UChars");
        }

        // Also make sure that we can map into this array using a
        // CompactByteArray.  We could do this check above, but we
        // keep it here, adjacent to the maxLen check.  We use one
        // value (-1 == 255) to indicate "no value."
        if (nonpairs.size() > 255) {
            throw new RuntimeException("Too many CASE_NONPAIRS array elements to be indexed by a CompactByteArray");
        }

        //-------------------------------------------------------------
        // Case-unique set:  All characters c for which closeOver(c)==c.

        // UPDATE: Instead of using this, we're using the related
        // notion of Case_Sensitive.  See below.  Note that
        // Case_Sensitive != ^Case_Unique.

        if (false) {
            UnicodeSet caseUnique = new UnicodeSet();
            for (int i = 0; i <= 0x10FFFF; ++i) {
                String cp = UTF16.valueOf(i);
                if (equivClasses.get(UCharacter.foldCase(cp, DEFAULT_CASE_MAP)) == null) {
                    caseUnique.add(i);
                }
            }
            // out.println("caseUnique = " + caseUnique.toPattern(true));
        }

        UnicodeSet caseSensitive = getCaseSensitive();
        //System.out.println("caseSensitive = " + caseSensitive.toPattern(true));

        // Now for C, emit an array of ranges
        out = new PrintStream(new FileOutputStream(C_UCHAR_OUT));
        System.out.println("Writing " + C_UCHAR_OUT);

        out.println("/* " + WARNING + " */");
        emitUCharRangesArray(out, caseSensitive, "CASE_SENSITIVE_RANGES");

        // For Java, emit a string with the ranges (each pair of chars
        // in the string is a range).
        out = new PrintStream(new FileOutputStream(JAVA_CHARPROP_OUT));
        System.out.println("Writing " + JAVA_CHARPROP_OUT);
        out.println("    // " + WARNING);
        emitRangesString(out, caseSensitive, "CASE_SENSITIVE_RANGES");
    }

    /**
     * Create the set of case-sensitive characters.  These are characters
     * that participate in any case mapping operation as a source or
     * as a member of a target string.
     */
    static UnicodeSet getCaseSensitive() {
        UnicodeSet caseSensitive = new UnicodeSet();
        Locale loc = Locale.US;
        BreakIterator bi = BreakIterator.getTitleInstance(loc);
        for (int c = 0; c <= 0x10FFFF; ++c) {
            String cp = UTF16.valueOf(c);
            for (int j=0; j<4; ++j) {
                String s = null;
                switch (j) {
                case 0: s = UCharacter.toUpperCase(loc, cp); break;
                case 1: s = UCharacter.toLowerCase(loc, cp); break;
                case 2: s = UCharacter.toTitleCase(loc, cp, bi); break;
                case 3: s = UCharacter.foldCase(cp, DEFAULT_CASE_MAP); break;
                }
                if (!s.equals(cp)) {
                    int cc;
                    for (int k=0; k<s.length(); k+=UTF16.getCharCount(cc)) {
                        cc = UTF16.charAt(s, k);
                        caseSensitive.add(cc);
                    }
                    for (int k=0; k<cp.length(); k+=UTF16.getCharCount(cc)) {
                        cc = UTF16.charAt(cp, k);
                        caseSensitive.add(cc);
                    }
                }
            }
            // Also do the single-codepoint API.  This shouldn't add any
            // code points, but we do it for completeness.
            for (int j=0; j<4; ++j) {
                int d = 0;
                switch (j) {
                case 0: d = UCharacter.toUpperCase(c); break;
                case 1: d = UCharacter.toLowerCase(c); break;
                case 2: d = UCharacter.toTitleCase(c); break;
                case 3: d = UCharacter.foldCase(c, DEFAULT_CASE_MAP); break;
                }
                if (d != c) {
                    if (!caseSensitive.contains(c) ||
                        !caseSensitive.contains(d)) {
                        System.out.println("Warning: code point " + c +
                                           " => " + d + " created NEW MAPPING"+
                                           " for Case_Sensitive");
                    }
                    caseSensitive.add(c);
                    caseSensitive.add(d);
                }
            }
        }
        return caseSensitive;
    }

    /**
     * Given a UnicodeSet, emit it as an array of UChar pairs.  Each
     * pair will be the start/end of a range.  Code points >= U+10000
     * will be represented as surrogate pairs.
     */
    static void emitUCharRangesArray(PrintStream out, UnicodeSet set, String id) {
        // Store the pairs in a StringBuffer.  This handles surrogate
        // representation.
        StringBuffer buf = new StringBuffer();
        for (int i=0; i<set.getRangeCount(); ++i) {
            UTF16.append(buf, set.getRangeStart(i));
            UTF16.append(buf, set.getRangeEnd(i));
        }
        // Emit the pairs
        out.println("static const UChar " + id + "[] = {");
        for (int i=0; i<buf.length(); ) {
            out.print("    ");
            for (int n=0; n++<10 && i<buf.length(); ++i) {
                out.print("0x" + Utility.hex(buf.charAt(i), 4) + ',');
            }
            out.println();
        }
        out.println("};");
        out.println("#define " + id + "_LENGTH (sizeof(" + id +
                    ")/sizeof(" + id + "[0]))");
    }

    /**
     * Given a UnicodeSet, emit it as a Java string.  The most economical
     * format is not the pattern, but instead a pairs list, with each
     * range pair represented as two adjacent characters.
     */
    static void emitRangesString(PrintStream out, UnicodeSet set, String id) {
        // Store the pairs in a StringBuffer.  This handles surrogate
        // representation.
        StringBuffer buf = new StringBuffer();
        for (int i=0; i<set.getRangeCount(); ++i) {
            UTF16.append(buf, set.getRangeStart(i));
            UTF16.append(buf, set.getRangeEnd(i));
        }
        // Emit the pairs
        out.println("    private static final String " + id + " =");
        out.println(Utility.formatForSource(buf.toString()) + ";");
    }
}