Home | History | Annotate | Download | only in layout
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /**
      4  *******************************************************************************
      5  * Copyright (C) 2002-2010, International Business Machines Corporation and    *
      6  * others. All Rights Reserved.                                                *
      7  *******************************************************************************
      8  */
      9 
     10 
     11 package com.ibm.icu.dev.tool.layout;
     12 
     13 import com.ibm.icu.lang.UCharacter;
     14 import com.ibm.icu.lang.UScript;
     15 import com.ibm.icu.text.UTF16;
     16 import com.ibm.icu.text.UnicodeSet;
     17 
     18 /**
     19  * @author Eric Mader
     20  *
     21  * Notes:
     22  *
     23  * The property \p{Decomposition_Type=Canonical} will match all characters with a canonical
     24  * decomposition.
     25  *
     26  * So "[[\\p{Latin}\\p{Greek}\\p{Cyrillic}] & [\\p{Decomposition_Type=Canonical}]]"
     27  * will match all Latin, Greek and Cyrillic characters with a canonical decomposition.
     28  *
     29  * Are these three scripts enough? Do we want to collect them all at once and distribute by script,
     30  * or process them one script at a time? It's probably a good idea to build a single table for
     31  * however many scripts there are.
     32  *
     33  * It might be better to collect all the characters that have a canonical decomposition and just
     34  * sort them into however many scripts there are... unless we'll get characters in COMMON???
     35  */
     36 public class CanonGSUBBuilder
     37 {
     38     static public String convertArabicString(int type, int ligature, String decomp, ClassTable isolClassTable)
     39     {
     40         int leftType  = ArabicShaping.VALUE_NONE;
     41         int rightType = ArabicShaping.VALUE_NONE;
     42 
     43         switch (type) {
     44             case UCharacter.DecompositionType.ISOLATED:
     45                 break;
     46 
     47             case UCharacter.DecompositionType.FINAL:
     48                 rightType = ArabicShaping.VALUE_LEFT;
     49                 break;
     50 
     51             case UCharacter.DecompositionType.INITIAL:
     52                 leftType = ArabicShaping.VALUE_RIGHT;
     53                 break;
     54 
     55             case UCharacter.DecompositionType.MEDIAL:
     56                rightType = ArabicShaping.VALUE_LEFT;
     57                leftType  = ArabicShaping.VALUE_RIGHT;
     58                break;
     59 
     60            default:
     61                return decomp + UCharacter.toString(ligature);
     62         }
     63 
     64         char[] chars = decomp.toCharArray();
     65 
     66         ArabicShaping.shape(chars, leftType, rightType, isolClassTable);
     67 
     68         return new String(chars) + UCharacter.toString(ligature);
     69     }
     70 
     71     static void buildArabicContextualForms(ArabicCharacterData data, ClassTable initClassTable, ClassTable mediClassTable,
     72                                      ClassTable finaClassTable, ClassTable isolClassTable)
     73     {
     74         System.out.print("Finding Arabic contextual forms... ");
     75 
     76         for (int i = 0; i < data.countRecords(); i += 1) {
     77             ArabicCharacterData.Record record = data.getRecord(i);
     78             String decomposition = record.getDecomposition();
     79 
     80             if (decomposition != null && decomposition.length() == 1) {
     81                 int contextual = record.getCodePoint();
     82                 int isolated   = UTF16.charAt(record.getDecomposition(), 0);
     83 
     84                 switch (record.getDecompositionType()) {
     85                 case UCharacter.DecompositionType.INITIAL:
     86                     initClassTable.addMapping(isolated, contextual);
     87                     break;
     88 
     89                 case UCharacter.DecompositionType.MEDIAL:
     90                     mediClassTable.addMapping(isolated, contextual);
     91                     break;
     92 
     93                case UCharacter.DecompositionType.FINAL:
     94                    finaClassTable.addMapping(isolated, contextual);
     95                    break;
     96 
     97                case UCharacter.DecompositionType.ISOLATED:
     98                    isolClassTable.addMapping(isolated, contextual);
     99                    break;
    100 
    101                default:
    102                    // issue some error message?
    103                    break;
    104                 }
    105             }
    106         }
    107 
    108         System.out.println("Done.");
    109     }
    110 
    111     static LigatureTree buildArabicLigatureTree(ArabicCharacterData data, ClassTable isolClassTable)
    112     {
    113         LigatureTree contextualTree = new LigatureTree();
    114         int ligatureCount = 0;
    115 
    116         System.out.print("Building Arabic ligature tree... ");
    117 
    118         for (int i = 0; i < data.countRecords(); i += 1) {
    119             ArabicCharacterData.Record record = data.getRecord(i);
    120             String decomposition = record.getDecomposition();
    121 
    122             if (decomposition != null && decomposition.length() > 1) {
    123                 int ligature   = record.getCodePoint();
    124                 int decompType = record.getDecompositionType();
    125 
    126                 switch (decompType) {
    127                 case UCharacter.DecompositionType.FINAL:
    128                 case UCharacter.DecompositionType.INITIAL:
    129                 case UCharacter.DecompositionType.MEDIAL:
    130                 case UCharacter.DecompositionType.ISOLATED:
    131                     contextualTree.insert(convertArabicString(decompType, ligature, decomposition, isolClassTable));
    132                     ligatureCount += 1;
    133                     break;
    134 
    135                 case UCharacter.DecompositionType.CANONICAL:
    136                     //cannonicalTree.insert(decomposition + UCharacter.toString(ligature));
    137                     break;
    138                 }
    139             }
    140         }
    141 
    142         System.out.println(ligatureCount + " ligatures.");
    143 
    144         return contextualTree;
    145     }
    146 
    147     static final int SIMPLE_GLYPH = 1;
    148     static final int LIGATURE_GLYPH = 2;
    149     static final int MARK_GLYPH = 3;
    150     static final int COMPONENT_GLYPH = 4;
    151 
    152     static final int categoryClassMap[] = {
    153     0,              // UNASSIGNED
    154     SIMPLE_GLYPH,   // UPPERCASE_LETTER
    155     SIMPLE_GLYPH,   // LOWERCASE_LETTER
    156     SIMPLE_GLYPH,   // TITLECASE_LETTER
    157     SIMPLE_GLYPH,   // MODIFIER_LETTER
    158     SIMPLE_GLYPH,   // OTHER_LETTER
    159     MARK_GLYPH,     // NON_SPACING_MARK
    160     MARK_GLYPH,     // ENCLOSING_MARK ??
    161     MARK_GLYPH,     // COMBINING_SPACING_MARK ??
    162     SIMPLE_GLYPH,   // DECIMAL_NUMBER
    163     SIMPLE_GLYPH,   // LETTER_NUMBER
    164     SIMPLE_GLYPH,   // OTHER_NUMBER;
    165     0,              // SPACE_SEPARATOR
    166     0,              // LINE_SEPARATOR
    167     0,              // PARAGRAPH_SEPARATOR
    168     0,              // CONTROL
    169     0,              // FORMAT
    170     0,              // PRIVATE_USE
    171     0,              // SURROGATE
    172     SIMPLE_GLYPH,   // DASH_PUNCTUATION
    173     SIMPLE_GLYPH,   // START_PUNCTUATION
    174     SIMPLE_GLYPH,   // END_PUNCTUATION
    175     SIMPLE_GLYPH,   // CONNECTOR_PUNCTUATION
    176     SIMPLE_GLYPH,   // OTHER_PUNCTUATION
    177     SIMPLE_GLYPH,   // MATH_SYMBOL;
    178     SIMPLE_GLYPH,   // CURRENCY_SYMBOL
    179     SIMPLE_GLYPH,   // MODIFIER_SYMBOL
    180     SIMPLE_GLYPH,   // OTHER_SYMBOL
    181     SIMPLE_GLYPH,   // INITIAL_PUNCTUATION
    182     SIMPLE_GLYPH    // FINAL_PUNCTUATION
    183     };
    184 
    185     static int getGlyphClass(ArabicCharacterData.Record record)
    186     {
    187         String decomp = record.getDecomposition();
    188 
    189         if (decomp != null && decomp.length() > 1) {
    190             return LIGATURE_GLYPH;
    191         }
    192 
    193         return categoryClassMap[record.getGeneralCategory()];
    194     }
    195 
    196     static void addArabicGlyphClasses(ArabicCharacterData data, ClassTable classTable)
    197     {
    198         System.out.print("Adding Arabic glyph classes... ");
    199 
    200         for (int i = 0; i < data.countRecords(); i += 1) {
    201             ArabicCharacterData.Record record = data.getRecord(i);
    202             classTable.addMapping(record.getCodePoint(), getGlyphClass(record));
    203         }
    204 
    205         System.out.println("Done.");
    206     }
    207 
    208     private static void buildArabicTables(ScriptList scriptList, FeatureList featureList,
    209                                                 LookupList lookupList, ClassTable classTable) {
    210         // TODO: Might want to have the ligature table builder explicitly check for ligatures
    211         // which start with space and tatweel rather than pulling them out here...
    212         UnicodeSet arabicBlock   = new UnicodeSet("[[\\p{block=Arabic}] & [[:Cf:][:Po:][:So:][:Mn:][:Nd:][:Lm:]]]");
    213         UnicodeSet oddLigatures  = new UnicodeSet("[\\uFC5E-\\uFC63\\uFCF2-\\uFCF4\\uFE70-\\uFE7F]");
    214         UnicodeSet arabicLetters = new UnicodeSet("[\\p{Arabic}]");
    215         ArabicCharacterData arabicData = ArabicCharacterData.factory(arabicLetters.addAll(arabicBlock).removeAll(oddLigatures));
    216 
    217         addArabicGlyphClasses(arabicData, classTable);
    218 
    219         ClassTable initClassTable = new ClassTable();
    220         ClassTable mediClassTable = new ClassTable();
    221         ClassTable finaClassTable = new ClassTable();
    222         ClassTable isolClassTable = new ClassTable();
    223 
    224         buildArabicContextualForms(arabicData, initClassTable, mediClassTable, finaClassTable, isolClassTable);
    225         isolClassTable.snapshot();
    226         LigatureTree ligaTree = buildArabicLigatureTree(arabicData, isolClassTable);
    227 
    228         LigatureTreeWalker ligaWalker = new LigatureTreeWalker();
    229 
    230         ligaTree.walk(ligaWalker);
    231 
    232         Lookup initLookup, mediLookup, finaLookup, ligaLookup;
    233 
    234         initLookup = new Lookup(Lookup.GSST_Single, 0);
    235         initLookup.addSubtable(initClassTable);
    236 
    237         mediLookup = new Lookup(Lookup.GSST_Single, 0);
    238         mediLookup.addSubtable(mediClassTable);
    239 
    240         finaLookup = new Lookup(Lookup.GSST_Single, 0);
    241         finaLookup.addSubtable(finaClassTable);
    242 
    243         ligaLookup = new Lookup(Lookup.GSST_Ligature, Lookup.LF_IgnoreMarks);
    244         ligaLookup.addSubtable(ligaWalker);
    245 
    246         Feature init = new Feature("init");
    247         Feature medi = new Feature("medi");
    248         Feature fina = new Feature("fina");
    249         Feature liga = new Feature("liga");
    250 
    251         init.addLookup(lookupList.addLookup(initLookup));
    252         medi.addLookup(lookupList.addLookup(mediLookup));
    253         fina.addLookup(lookupList.addLookup(finaLookup));
    254         liga.addLookup(lookupList.addLookup(ligaLookup));
    255 
    256         featureList.addFeature(init);
    257         featureList.addFeature(medi);
    258         featureList.addFeature(fina);
    259         featureList.addFeature(liga);
    260 
    261         scriptList.addFeature("arab", "(default)", init);
    262         scriptList.addFeature("arab", "(default)", medi);
    263         scriptList.addFeature("arab", "(default)", fina);
    264         scriptList.addFeature("arab", "(default)", liga);
    265 
    266         System.out.println();
    267     }
    268 
    269     public static void buildLigatureTree(CanonicalCharacterData data, int script, LigatureTree ligatureTree)
    270     {
    271         int ligatureCount = 0;
    272 
    273         System.out.print("building composition ligature tree for " + UScript.getName(script) + "... ");
    274 
    275         for (int i = 0; i < data.countRecords(script); i += 1) {
    276             CanonicalCharacterData.Record record = data.getRecord(script, i);
    277             String composed = UCharacter.toString(record.getComposedCharacter());
    278 
    279             for (int e = 0; e < record.countEquivalents(); e += 1) {
    280                 String equivalent = record.getEquivalent(e);
    281 
    282                 ligatureTree.insert(equivalent + composed);
    283                 ligatureCount += 1;
    284             }
    285         }
    286 
    287         System.out.println(ligatureCount + " ligatures.");
    288     }
    289 
    290     public static DecompTable[] buildDecompTables(CanonicalCharacterData data, int script)
    291     {
    292         int maxDecompCount = data.getMaxEquivalents(script);
    293         DecompTable[] decompTables = new DecompTable[maxDecompCount];
    294 
    295         System.out.print("Building decompositon tables for " + UScript.getName(script) +
    296                          "... total decompositions: " + data.countRecords(script) +
    297                          ", max: " + maxDecompCount + "...");
    298 
    299         for (int i = 0; i < maxDecompCount; i += 1) {
    300             DecompTable table = new DecompTable();
    301 
    302             for (int r = 0; r < data.countRecords(script); r += 1) {
    303                 CanonicalCharacterData.Record record = data.getRecord(script, r);
    304 
    305                 if (record.countEquivalents() > i) {
    306                     table.add(record.getComposedCharacter(), record.getEquivalent(i));
    307                 }
    308             }
    309 
    310             decompTables[i] = table;
    311         }
    312 
    313         System.out.println(" Done.");
    314 
    315         return decompTables;
    316     }
    317 
    318     public static int[] buildLookups(CanonicalCharacterData data, LookupList lookupList, int script)
    319     {
    320         int[] lookups = new int[2];
    321 
    322         DecompTable[] decompTables = buildDecompTables(data, script);
    323 
    324         LigatureTree compTree = new LigatureTree();
    325 
    326         buildLigatureTree(data, script, compTree);
    327 
    328         System.out.println();
    329 
    330         LigatureTreeWalker compWalker = new LigatureTreeWalker();
    331 
    332         compTree.walk(compWalker);
    333 
    334         Lookup compLookup, dcmpLookup;
    335         //int compLookupIndex, dcmpLookupIndex;
    336 
    337         compLookup = new Lookup(Lookup.GSST_Ligature, 0);
    338         compLookup.addSubtable(compWalker);
    339 
    340         dcmpLookup = new Lookup(Lookup.GSST_Multiple, 0);
    341         for (int i = 0; i < decompTables.length; i += 1) {
    342             dcmpLookup.addSubtable(decompTables[i]);
    343         }
    344 
    345         lookups[0] = lookupList.addLookup(compLookup);
    346         lookups[1] = lookupList.addLookup(dcmpLookup);
    347 
    348         return lookups;
    349     }
    350 
    351     public static void addLookups(Feature feature, int[] lookups)
    352     {
    353         for (int i = 0; i < lookups.length; i += 1) {
    354             feature.addLookup(lookups[i]);
    355         }
    356     }
    357 
    358     /*
    359      * Hebrew mark order taken from the SBL Hebrew Font manual
    360      * Arabic mark order per Thomas Milo: hamza < shadda < combining_alef < sukun, vowel_marks < madda < qur'anic_marks
    361      */
    362     public static ClassTable buildCombiningClassTable()
    363     {
    364         UnicodeSet markSet = new UnicodeSet("[\\P{CanonicalCombiningClass=0}]");
    365         ClassTable exceptions = new ClassTable();
    366         ClassTable combiningClasses = new ClassTable();
    367         int markCount = markSet.size();
    368 
    369         exceptions.addMapping(0x05C1,  10); // Point Shin Dot
    370         exceptions.addMapping(0x05C2,  11); // Point Sin Dot
    371         exceptions.addMapping(0x05BC,  21); // Point Dagesh or Mapiq
    372         exceptions.addMapping(0x05BF,  23); // Point Rafe
    373         exceptions.addMapping(0x05B9,  27); // Point Holam
    374         exceptions.addMapping(0x0323, 220); // Comb. Dot Below (low punctum)
    375         exceptions.addMapping(0x0591, 220); // Accent Etnahta
    376         exceptions.addMapping(0x0596, 220); // Accent Tipeha
    377         exceptions.addMapping(0x059B, 220); // Accent Tevir
    378         exceptions.addMapping(0x05A3, 220); // Accent Munah
    379         exceptions.addMapping(0x05A4, 220); // Accent Mahapakh
    380         exceptions.addMapping(0x05A5, 220); // Accent Merkha
    381         exceptions.addMapping(0x05A6, 220); // Accent Merkha Kefula
    382         exceptions.addMapping(0x05A7, 220); // Accent Darga
    383         exceptions.addMapping(0x05AA, 220); // Accent Yerah Ben Yomo
    384         exceptions.addMapping(0x05B0, 220); // Point Sheva
    385         exceptions.addMapping(0x05B1, 220); // Point Hataf Segol
    386         exceptions.addMapping(0x05B2, 220); // Point Hataf Patah
    387         exceptions.addMapping(0x05B3, 220); // Point Hataf Qamats
    388         exceptions.addMapping(0x05B4, 220); // Point Hiriq
    389         exceptions.addMapping(0x05B5, 220); // Point Tsere
    390         exceptions.addMapping(0x05B6, 220); // Point Segol
    391         exceptions.addMapping(0x05B7, 220); // Point Patah
    392         exceptions.addMapping(0x05B8, 220); // Point Qamats
    393         exceptions.addMapping(0x05BB, 220); // Point Qubuts
    394         exceptions.addMapping(0x05BD, 220); // Point Meteg
    395         exceptions.addMapping(0x059A, 222); // Accent Yetiv
    396         exceptions.addMapping(0x05AD, 222); // Accent Dehi
    397         exceptions.addMapping(0x05C4, 230); // Mark Upper Dot (high punctum)
    398         exceptions.addMapping(0x0593, 230); // Accent Shalshelet
    399         exceptions.addMapping(0x0594, 230); // Accent Zaqef Qatan
    400         exceptions.addMapping(0x0595, 230); // Accent Zaqef Gadol
    401         exceptions.addMapping(0x0597, 230); // Accent Revia
    402         exceptions.addMapping(0x0598, 230); // Accent Zarqa
    403         exceptions.addMapping(0x059F, 230); // Accent Qarney Para
    404         exceptions.addMapping(0x059E, 230); // Accent Gershayim
    405         exceptions.addMapping(0x059D, 230); // Accent Geresh Muqdam
    406         exceptions.addMapping(0x059C, 230); // Accent Geresh
    407         exceptions.addMapping(0x0592, 230); // Accent Segolta
    408         exceptions.addMapping(0x05A0, 230); // Accent Telisha Gedola
    409         exceptions.addMapping(0x05AC, 230); // Accent Iluy
    410         exceptions.addMapping(0x05A8, 230); // Accent Qadma
    411         exceptions.addMapping(0x05AB, 230); // Accent Ole
    412         exceptions.addMapping(0x05AF, 230); // Mark Masora Circle
    413         exceptions.addMapping(0x05A1, 230); // Accent Pazer
    414       //exceptions.addMapping(0x0307, 230); // Mark Number/Masora Dot
    415         exceptions.addMapping(0x05AE, 232); // Accent Zinor
    416         exceptions.addMapping(0x05A9, 232); // Accent Telisha Qetana
    417         exceptions.addMapping(0x0599, 232); // Accent Pashta
    418 
    419         exceptions.addMapping(0x0655,  27); // ARABIC HAMZA BELOW
    420         exceptions.addMapping(0x0654,  27); // ARABIC HAMZA ABOVE
    421 
    422         exceptions.addMapping(0x0651,  28); // ARABIC SHADDA
    423 
    424         exceptions.addMapping(0x0656,  29); // ARABIC SUBSCRIPT ALEF
    425         exceptions.addMapping(0x0670,  29); // ARABIC LETTER SUPERSCRIPT ALEF
    426 
    427         exceptions.addMapping(0x064D,  30); // ARABIC KASRATAN
    428         exceptions.addMapping(0x0650,  30); // ARABIC KASRA
    429 
    430         exceptions.addMapping(0x0652,  31); // ARABIC SUKUN
    431         exceptions.addMapping(0x06E1,  31); // ARABIC SMALL HIGH DOTLESS HEAD OF KHAH
    432 
    433         exceptions.addMapping(0x064B,  31); // ARABIC FATHATAN
    434         exceptions.addMapping(0x064C,  31); // ARABIC DAMMATAN
    435         exceptions.addMapping(0x064E,  31); // ARABIC FATHA
    436         exceptions.addMapping(0x064F,  31); // ARABIC DAMMA
    437         exceptions.addMapping(0x0657,  31); // ARABIC INVERTED DAMMA
    438         exceptions.addMapping(0x0658,  31); // ARABIC MARK NOON GHUNNA
    439 
    440         exceptions.addMapping(0x0653,  32); // ARABIC MADDAH ABOVE
    441 
    442         exceptions.snapshot();
    443 
    444         for (int i = 0; i < markCount; i += 1) {
    445             int mark = markSet.charAt(i);
    446             int markClass = exceptions.getGlyphClassID(mark);
    447 
    448             if (markClass == 0) {
    449                 markClass = UCharacter.getCombiningClass(mark);
    450             }
    451 
    452             combiningClasses.addMapping(mark, markClass);
    453         }
    454 
    455         combiningClasses.snapshot();
    456         return combiningClasses;
    457     }
    458 
    459     public static void buildDecompTables(String fileName)
    460     {
    461         // F900 - FAFF are compatibility ideographs. They all decompose to a single other character, and can be ignored.
    462       //UnicodeSet decompSet = new UnicodeSet("[[[\\P{Hangul}] & [\\p{DecompositionType=Canonical}]] - [\uF900-\uFAFF]]");
    463         UnicodeSet decompSet = new UnicodeSet("[[\\p{DecompositionType=Canonical}] & [\\P{FullCompositionExclusion}] & [\\P{Hangul}]]");
    464         CanonicalCharacterData data = CanonicalCharacterData.factory(decompSet);
    465         ClassTable classTable = new ClassTable();
    466 
    467         LookupList  lookupList  = new LookupList();
    468         FeatureList featureList = new FeatureList();
    469         ScriptList  scriptList  = new ScriptList();
    470 
    471         // build common, inherited lookups...
    472 //        int[] commonLookups = buildLookups(data, lookupList, UScript.COMMON);
    473 //        int[] inheritedLookups = buildLookups(data, lookupList, UScript.INHERITED);
    474 
    475         for (int script = 0; script < UScript.CODE_LIMIT; script += 1) {
    476 
    477             // This is a bit lame, but it's the only way I can think of
    478             // to make this work w/o knowing the values of COMMON and INHERITED...
    479             if (script == UScript.COMMON || script == UScript.INHERITED ||
    480                 data.getMaxEquivalents(script) == 0) {
    481                 continue;
    482             }
    483 
    484             int[] lookups = buildLookups(data, lookupList, script);
    485 
    486             Feature ccmp = new Feature("ccmp");
    487 
    488             addLookups(ccmp, lookups);
    489 //            addLookups(ccmp, commonLookups);
    490 //            addLookups(ccmp, inheritedLookups);
    491 
    492             featureList.addFeature(ccmp);
    493 
    494             String scriptTag = TagUtilities.tagLabel(UScript.getShortName(script));
    495 
    496             scriptList.addFeature(scriptTag, "(default)", ccmp);
    497 
    498             if (script == UScript.ARABIC) {
    499                 buildArabicTables(scriptList, featureList, lookupList, classTable);
    500             }
    501         }
    502 
    503         featureList.finalizeFeatureList();
    504 
    505         ClassTable markClassTable = buildCombiningClassTable();
    506 
    507         GSUBWriter gsubWriter = new GSUBWriter("Canon", scriptList, featureList, lookupList);
    508         GDEFWriter gdefWriter = new GDEFWriter("Canon", classTable, markClassTable);
    509         String[] includeFiles = {"LETypes.h", "CanonShaping.h"};
    510 
    511         LigatureModuleWriter writer = new LigatureModuleWriter();
    512 
    513         writer.openFile(fileName);
    514         writer.writeHeader(null, includeFiles);
    515         writer.writeTable(gsubWriter);
    516         writer.writeTable(gdefWriter);
    517         writer.writeTrailer();
    518         writer.closeFile();
    519     }
    520 
    521     public static void main(String[] args)
    522     {
    523         buildDecompTables(args[0]);
    524     }
    525 }
    526