Home | History | Annotate | Download | only in layout
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /**
      4  *******************************************************************************
      5  * Copyright (C) 2002-2010, International Business Machines Corporation and    *
      6  * others. All Rights Reserved.                                                *
      7  *******************************************************************************
      8  */
      9 
     10 package com.ibm.icu.dev.tool.layout;
     11 
     12 import java.util.Vector;
     13 
     14 import com.ibm.icu.impl.Utility;
     15 import com.ibm.icu.lang.UCharacter;
     16 import com.ibm.icu.lang.UScript;
     17 import com.ibm.icu.text.CanonicalIterator;
     18 import com.ibm.icu.text.UTF16;
     19 import com.ibm.icu.text.UnicodeSet;
     20 
     21 public class CanonicalCharacterData
     22 {
     23     private static int THRESHOLD = 4;
     24 
     25     public class Record
     26     {
     27         // TODO: might want to save arrays of Char32's rather than UTF16 strings...
     28         Record(int character, int script)
     29         {
     30             String char32 = UCharacter.toString(character);
     31             CanonicalIterator iterator = new CanonicalIterator(char32);
     32             Vector equivs = new Vector();
     33 
     34             composed = character;
     35 
     36             for (String equiv = iterator.next(); equiv != null; equiv = iterator.next()) {
     37                 // Skip all equivalents of length 1; it's either the original
     38                 // characeter or something like Angstrom for A-Ring, which we don't care about
     39                 if (UTF16.countCodePoint(equiv) > 1) {
     40                     equivs.add(equiv);
     41                 }
     42             }
     43 
     44             int nEquivalents = equivs.size();
     45 
     46             if (nEquivalents > maxEquivalents[script]) {
     47                 maxEquivalents[script] = nEquivalents;
     48             }
     49 
     50             if (nEquivalents > 0) {
     51                 equivalents = new String[nEquivalents];
     52 
     53                 if (nEquivalents > THRESHOLD) {
     54                     dumpEquivalents(character, equivs);
     55                 }
     56 
     57                 sortEquivalents(equivalents, equivs);
     58             }
     59         }
     60 
     61         public int getComposedCharacter()
     62         {
     63             return composed;
     64         }
     65 
     66         public int countEquivalents()
     67         {
     68             if (equivalents == null) {
     69                 return 0;
     70             }
     71 
     72             return equivalents.length;
     73         }
     74 
     75         public String[] getEquivalents()
     76         {
     77             return equivalents;
     78         }
     79 
     80         public String getEquivalent(int index)
     81         {
     82             if (equivalents == null || index < 0 || index >= equivalents.length) {
     83                 return null;
     84             }
     85 
     86             return equivalents[index];
     87         }
     88 
     89         private void dumpEquivalents(int character, Vector equivs)
     90         {
     91             int count = equivs.size();
     92 
     93             System.out.println(Utility.hex(character, 6) + " - " + count + ":");
     94 
     95             for (int i = 0; i < count; i += 1) {
     96                 String equiv = (String) equivs.elementAt(i);
     97                 int codePoints = UTF16.countCodePoint(equiv);
     98 
     99                 for (int c = 0; c < codePoints; c += 1) {
    100                     if (c > 0) {
    101                         System.out.print(" ");
    102                     }
    103 
    104                     System.out.print(Utility.hex(UTF16.charAt(equiv, c), 6));
    105                 }
    106 
    107                 System.out.println();
    108             }
    109 
    110             System.out.println();
    111         }
    112 
    113         private int composed;
    114         private String[] equivalents = null;
    115     }
    116 
    117     public CanonicalCharacterData()
    118     {
    119         // nothing to do...
    120     }
    121 
    122     public void add(int character)
    123     {
    124         int script = UScript.getScript(character);
    125         Vector recordVector = recordVectors[script];
    126 
    127         if (recordVector == null) {
    128             recordVector = recordVectors[script] = new Vector();
    129         }
    130 
    131         recordVector.add(new Record(character, script));
    132     }
    133 
    134     public int getMaxEquivalents(int script)
    135     {
    136         if (script < 0 || script >= UScript.CODE_LIMIT) {
    137             return 0;
    138         }
    139 
    140         return maxEquivalents[script];
    141     }
    142 
    143     public Record getRecord(int script, int index)
    144     {
    145         if (script < 0 || script >= UScript.CODE_LIMIT) {
    146             return null;
    147         }
    148 
    149         Vector recordVector = recordVectors[script];
    150 
    151         if (recordVector == null || index < 0 || index >= recordVector.size()) {
    152             return null;
    153         }
    154 
    155         return (Record) recordVector.elementAt(index);
    156     }
    157 
    158     public int countRecords(int script)
    159     {
    160         if (script < 0 || script >= UScript.CODE_LIMIT ||
    161             recordVectors[script] == null) {
    162             return 0;
    163         }
    164 
    165         return recordVectors[script].size();
    166     }
    167 
    168     public static CanonicalCharacterData factory(UnicodeSet characterSet)
    169     {
    170         int charCount = characterSet.size();
    171         CanonicalCharacterData data = new CanonicalCharacterData();
    172 
    173         System.out.println("There are " + charCount + " characters with a canonical decomposition.");
    174 
    175         for (int i = 0; i < charCount; i += 1) {
    176             data.add(characterSet.charAt(i));
    177         }
    178 
    179         return data;
    180     }
    181 
    182     private static int compareEquivalents(String a, String b)
    183     {
    184         int result = UTF16.countCodePoint(a) - UTF16.countCodePoint(b);
    185 
    186         if (result == 0) {
    187             return a.compareTo(b);
    188         }
    189 
    190         return result;
    191     }
    192 
    193     //
    194     // Straight insertion sort from Knuth vol. III, pg. 81
    195     //
    196     private static void sortEquivalents(String[] equivalents, Vector unsorted)
    197     {
    198         int nEquivalents = equivalents.length;
    199 
    200         for (int e = 0; e < nEquivalents; e += 1) {
    201             String v = (String) unsorted.elementAt(e);
    202             int i;
    203 
    204             for (i = e - 1; i >= 0; i -= 1) {
    205                 if (compareEquivalents(v, equivalents[i]) >= 0) {
    206                   break;
    207                 }
    208 
    209                 equivalents[i + 1] = equivalents[i];
    210             }
    211 
    212             equivalents[i + 1] = v;
    213        }
    214     }
    215 
    216     private Vector recordVectors[] = new Vector[UScript.CODE_LIMIT];
    217     private int maxEquivalents[] = new int[UScript.CODE_LIMIT];
    218 
    219 }
    220