Home | History | Annotate | Download | only in layout
      1 /**
      2  *******************************************************************************
      3  * Copyright (C) 2002-2010, International Business Machines Corporation and    *
      4  * others. All Rights Reserved.                                                *
      5  *******************************************************************************
      6  */
      7 
      8 package com.ibm.icu.dev.tool.layout;
      9 
     10 import java.util.Vector;
     11 
     12 import com.ibm.icu.impl.Utility;
     13 import com.ibm.icu.lang.UCharacter;
     14 import com.ibm.icu.lang.UScript;
     15 import com.ibm.icu.text.CanonicalIterator;
     16 import com.ibm.icu.text.UTF16;
     17 import com.ibm.icu.text.UnicodeSet;
     18 
     19 public class CanonicalCharacterData
     20 {
     21     private static int THRESHOLD = 4;
     22 
     23     public class Record
     24     {
     25         // TODO: might want to save arrays of Char32's rather than UTF16 strings...
     26         Record(int character, int script)
     27         {
     28             String char32 = UCharacter.toString(character);
     29             CanonicalIterator iterator = new CanonicalIterator(char32);
     30             Vector equivs = new Vector();
     31 
     32             composed = character;
     33 
     34             for (String equiv = iterator.next(); equiv != null; equiv = iterator.next()) {
     35                 // Skip all equivalents of length 1; it's either the original
     36                 // characeter or something like Angstrom for A-Ring, which we don't care about
     37                 if (UTF16.countCodePoint(equiv) > 1) {
     38                     equivs.add(equiv);
     39                 }
     40             }
     41 
     42             int nEquivalents = equivs.size();
     43 
     44             if (nEquivalents > maxEquivalents[script]) {
     45                 maxEquivalents[script] = nEquivalents;
     46             }
     47 
     48             if (nEquivalents > 0) {
     49                 equivalents = new String[nEquivalents];
     50 
     51                 if (nEquivalents > THRESHOLD) {
     52                     dumpEquivalents(character, equivs);
     53                 }
     54 
     55                 sortEquivalents(equivalents, equivs);
     56             }
     57         }
     58 
     59         public int getComposedCharacter()
     60         {
     61             return composed;
     62         }
     63 
     64         public int countEquivalents()
     65         {
     66             if (equivalents == null) {
     67                 return 0;
     68             }
     69 
     70             return equivalents.length;
     71         }
     72 
     73         public String[] getEquivalents()
     74         {
     75             return equivalents;
     76         }
     77 
     78         public String getEquivalent(int index)
     79         {
     80             if (equivalents == null || index < 0 || index >= equivalents.length) {
     81                 return null;
     82             }
     83 
     84             return equivalents[index];
     85         }
     86 
     87         private void dumpEquivalents(int character, Vector equivs)
     88         {
     89             int count = equivs.size();
     90 
     91             System.out.println(Utility.hex(character, 6) + " - " + count + ":");
     92 
     93             for (int i = 0; i < count; i += 1) {
     94                 String equiv = (String) equivs.elementAt(i);
     95                 int codePoints = UTF16.countCodePoint(equiv);
     96 
     97                 for (int c = 0; c < codePoints; c += 1) {
     98                     if (c > 0) {
     99                         System.out.print(" ");
    100                     }
    101 
    102                     System.out.print(Utility.hex(UTF16.charAt(equiv, c), 6));
    103                 }
    104 
    105                 System.out.println();
    106             }
    107 
    108             System.out.println();
    109         }
    110 
    111         private int composed;
    112         private String[] equivalents = null;
    113     }
    114 
    115     public CanonicalCharacterData()
    116     {
    117         // nothing to do...
    118     }
    119 
    120     public void add(int character)
    121     {
    122         int script = UScript.getScript(character);
    123         Vector recordVector = recordVectors[script];
    124 
    125         if (recordVector == null) {
    126             recordVector = recordVectors[script] = new Vector();
    127         }
    128 
    129         recordVector.add(new Record(character, script));
    130     }
    131 
    132     public int getMaxEquivalents(int script)
    133     {
    134         if (script < 0 || script >= UScript.CODE_LIMIT) {
    135             return 0;
    136         }
    137 
    138         return maxEquivalents[script];
    139     }
    140 
    141     public Record getRecord(int script, int index)
    142     {
    143         if (script < 0 || script >= UScript.CODE_LIMIT) {
    144             return null;
    145         }
    146 
    147         Vector recordVector = recordVectors[script];
    148 
    149         if (recordVector == null || index < 0 || index >= recordVector.size()) {
    150             return null;
    151         }
    152 
    153         return (Record) recordVector.elementAt(index);
    154     }
    155 
    156     public int countRecords(int script)
    157     {
    158         if (script < 0 || script >= UScript.CODE_LIMIT ||
    159             recordVectors[script] == null) {
    160             return 0;
    161         }
    162 
    163         return recordVectors[script].size();
    164     }
    165 
    166     public static CanonicalCharacterData factory(UnicodeSet characterSet)
    167     {
    168         int charCount = characterSet.size();
    169         CanonicalCharacterData data = new CanonicalCharacterData();
    170 
    171         System.out.println("There are " + charCount + " characters with a canonical decomposition.");
    172 
    173         for (int i = 0; i < charCount; i += 1) {
    174             data.add(characterSet.charAt(i));
    175         }
    176 
    177         return data;
    178     }
    179 
    180     private static int compareEquivalents(String a, String b)
    181     {
    182         int result = UTF16.countCodePoint(a) - UTF16.countCodePoint(b);
    183 
    184         if (result == 0) {
    185             return a.compareTo(b);
    186         }
    187 
    188         return result;
    189     }
    190 
    191     //
    192     // Straight insertion sort from Knuth vol. III, pg. 81
    193     //
    194     private static void sortEquivalents(String[] equivalents, Vector unsorted)
    195     {
    196         int nEquivalents = equivalents.length;
    197 
    198         for (int e = 0; e < nEquivalents; e += 1) {
    199             String v = (String) unsorted.elementAt(e);
    200             int i;
    201 
    202             for (i = e - 1; i >= 0; i -= 1) {
    203                 if (compareEquivalents(v, equivalents[i]) >= 0) {
    204                   break;
    205                 }
    206 
    207                 equivalents[i + 1] = equivalents[i];
    208             }
    209 
    210             equivalents[i + 1] = v;
    211        }
    212     }
    213 
    214     private Vector recordVectors[] = new Vector[UScript.CODE_LIMIT];
    215     private int maxEquivalents[] = new int[UScript.CODE_LIMIT];
    216 
    217 }
    218