Home | History | Annotate | Download | only in layout
      1 /*
      2  *
      3  * (C) Copyright IBM Corp. 1998-2007 - All Rights Reserved
      4  *
      5  * Developed at DIT - Government of Bhutan
      6  *
      7  * Contact person: Pema Geyleg - <pema_geyleg (at) druknet.bt>
      8  *
      9  * This file is a modification of the ICU file KhmerReordering.cpp
     10  * by Jens Herden and Javier Sola who have given all their possible rights to IBM and the Governement of Bhutan
     11  * A first module for Dzongkha was developed by Karunakar under Panlocalisation funding.
     12  * Assistance for this module has been received from Namgay Thinley, Christopher Fynn and Javier Sola
     13  *
     14  */
     15 
     16 //#include <stdio.h>
     17 #include "LETypes.h"
     18 #include "OpenTypeTables.h"
     19 #include "TibetanReordering.h"
     20 #include "LEGlyphStorage.h"
     21 
     22 
     23 U_NAMESPACE_BEGIN
     24 
     25 // Characters that get refered to by name...
     26 enum
     27 {
     28     C_DOTTED_CIRCLE = 0x25CC,
     29     C_PRE_NUMBER_MARK = 0x0F3F
     30  };
     31 
     32 
     33 enum
     34 {
     35     // simple classes, they are used in the statetable (in this file) to control the length of a syllable
     36     // they are also used to know where a character should be placed (location in reference to the base character)
     37     // and also to know if a character, when independtly displayed, should be displayed with a dotted-circle to
     38     // indicate error in syllable construction
     39     _xx = TibetanClassTable::CC_RESERVED,
     40     _ba = TibetanClassTable::CC_BASE,
     41     _sj = TibetanClassTable::CC_SUBJOINED | TibetanClassTable::CF_DOTTED_CIRCLE | TibetanClassTable::CF_POS_BELOW,
     42     _tp = TibetanClassTable::CC_TSA_PHRU  | TibetanClassTable::CF_DOTTED_CIRCLE | TibetanClassTable::CF_POS_ABOVE,
     43     _ac = TibetanClassTable::CC_A_CHUNG |  TibetanClassTable::CF_DOTTED_CIRCLE | TibetanClassTable::CF_POS_BELOW,
     44     _cs = TibetanClassTable::CC_COMP_SANSKRIT | TibetanClassTable::CF_DOTTED_CIRCLE | TibetanClassTable::CF_POS_BELOW,
     45     _ha = TibetanClassTable::CC_HALANTA | TibetanClassTable::CF_DOTTED_CIRCLE | TibetanClassTable::CF_POS_BELOW,
     46     _bv = TibetanClassTable::CC_BELOW_VOWEL | TibetanClassTable::CF_DOTTED_CIRCLE | TibetanClassTable::CF_POS_BELOW,
     47     _av = TibetanClassTable::CC_ABOVE_VOWEL | TibetanClassTable::CF_DOTTED_CIRCLE | TibetanClassTable::CF_POS_ABOVE,
     48     _an = TibetanClassTable::CC_ANUSVARA | TibetanClassTable::CF_DOTTED_CIRCLE | TibetanClassTable::CF_POS_ABOVE,
     49     _cb = TibetanClassTable::CC_CANDRABINDU | TibetanClassTable::CF_DOTTED_CIRCLE | TibetanClassTable::CF_POS_ABOVE,
     50     _vs = TibetanClassTable::CC_VISARGA | TibetanClassTable::CF_DOTTED_CIRCLE| TibetanClassTable::CF_POS_AFTER,
     51     _as = TibetanClassTable::CC_ABOVE_S_MARK | TibetanClassTable::CF_DOTTED_CIRCLE | TibetanClassTable::CF_POS_ABOVE,
     52     _bs = TibetanClassTable::CC_BELOW_S_MARK | TibetanClassTable::CF_DOTTED_CIRCLE | TibetanClassTable::CF_POS_BELOW,
     53     _di = TibetanClassTable::CC_DIGIT | TibetanClassTable::CF_DIGIT,
     54     _pd = TibetanClassTable::CC_PRE_DIGIT_MARK | TibetanClassTable::CF_DOTTED_CIRCLE | TibetanClassTable::CF_PREDIGIT | TibetanClassTable::CF_POS_BEFORE ,
     55     _bd = TibetanClassTable::CC_POST_BELOW_DIGIT_M | TibetanClassTable::CF_DOTTED_CIRCLE | TibetanClassTable::CF_POS_AFTER
     56 };
     57 
     58 
     59 // Character class tables
     60 //_xx Non Combining characters
     61 //_ba Base Consonants
     62 //_sj Subjoined consonants
     63 //_tp Tsa - phru
     64 //_ac A-chung, Vowel Lengthening mark
     65 //_cs Precomposed Sanskrit vowel + subjoined consonants
     66 //_ha Halanta/Virama
     67 //_bv Below vowel
     68 //_av above vowel
     69 //_an Anusvara
     70 //_cb Candrabindu
     71 //_vs Visaraga/Post mark
     72 //_as Upper Stress marks
     73 //_bs Lower Stress marks
     74 //_di Digit
     75 //_pd Number pre combining, Needs reordering
     76 //_bd Other number combining marks
     77 
     78 static const TibetanClassTable::CharClass tibetanCharClasses[] =
     79 {
     80    // 0    1    2    3    4    5    6    7    8    9   a     b   c    d     e   f
     81     _xx, _ba, _xx, _xx, _ba, _ba, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 0F00 - 0F0F 0
     82     _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _bd, _bd, _xx, _xx, _xx, _xx, _xx, _xx, // 0F10 - 0F1F 1
     83     _di, _di, _di, _di, _di, _di, _di, _di, _di, _di, _xx, _xx, _xx, _xx, _xx, _xx, // 0F20 - 0F2F 2
     84     _xx, _xx, _xx, _xx, _xx, _bs, _xx, _bs, _xx, _tp, _xx, _xx, _xx, _xx, _bd, _pd, // 0F30 - 0F3F 3
     85     _ba, _ba, _ba, _ba, _ba, _ba, _ba, _ba, _xx, _ba, _ba, _ba, _ba, _ba, _ba, _ba, // 0F40 - 0F4F 4
     86     _ba, _ba, _ba, _ba, _ba, _ba, _ba, _ba, _ba, _ba, _ba, _ba, _ba, _ba, _ba, _ba, // 0F50 - 0F5F 5
     87     _ba, _ba, _ba, _ba, _ba, _ba, _ba, _ba, _ba, _ba, _ba, _xx, _xx, _xx, _xx, _xx, // 0F60 - 0F6F 6
     88     _xx, _ac, _av, _cs, _bv, _bv, _cs, _cs, _cs, _cs, _av, _av, _av, _av, _an, _vs, // 0F70 - 0F7F 7
     89     _av, _cs, _cb, _cb, _ha, _xx, _as, _as, _ba, _ba, _ba, _ba, _xx, _xx, _xx, _xx, // 0F80 - 0F8F 8
     90     _sj, _sj, _sj, _sj, _sj, _sj, _sj, _sj, _xx, _sj, _sj, _sj, _sj, _sj, _sj, _sj, // 0F90 - 0F9F 9
     91     _sj, _sj, _sj, _sj, _sj, _sj, _sj, _sj, _sj, _sj, _sj, _sj, _sj, _sj, _sj, _sj, // 0FA0 - 0FAF a
     92     _sj, _sj, _sj, _sj, _sj, _sj, _sj, _sj, _sj, _sj, _sj, _sj, _sj, _xx, _sj, _sj, // 0FB0 - 0FBF b
     93     _xx, _xx, _xx, _xx, _xx, _xx, _bs, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 0FC0 - 0FCF c
     94     _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx,// 0FD0 - 0FDF  d
     95     _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 0FE0 - 0FEF e
     96     _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, // 0FF0 - 0FFF f
     97 };
     98 
     99 
    100 //
    101 // Tibetan Class Tables
    102 //
    103 
    104 //
    105 // The range of characters defined in the above table is defined here. For Tibetan 0F00 to 0FFF
    106 // Even if the Tibetan range is bigger, most of the characters are not combinable, and therefore treated
    107 // as _xx
    108 static const TibetanClassTable tibetanClassTable = {0x0F00, 0x0FFF, tibetanCharClasses};
    109 
    110 
    111 // Below we define how a character in the input string is either in the tibetanCharClasses table
    112 // (in which case we get its type back), or an unknown object in which case we get _xx (CC_RESERVED) back
    113 TibetanClassTable::CharClass TibetanClassTable::getCharClass(LEUnicode ch) const
    114 {
    115     if (ch < firstChar || ch > lastChar) {
    116         return CC_RESERVED;
    117     }
    118 
    119     return classTable[ch - firstChar];
    120 }
    121 
    122 const TibetanClassTable *TibetanClassTable::getTibetanClassTable()
    123 {
    124     return &tibetanClassTable;
    125 }
    126 
    127 
    128 
    129 class TibetanReorderingOutput : public UMemory {
    130 private:
    131     le_int32 fSyllableCount;
    132     le_int32 fOutIndex;
    133     LEUnicode *fOutChars;
    134 
    135     LEGlyphStorage &fGlyphStorage;
    136 
    137 
    138 public:
    139     TibetanReorderingOutput(LEUnicode *outChars, LEGlyphStorage &glyphStorage)
    140         : fSyllableCount(0), fOutIndex(0), fOutChars(outChars), fGlyphStorage(glyphStorage)
    141     {
    142         // nothing else to do...
    143     }
    144 
    145     ~TibetanReorderingOutput()
    146     {
    147         // nothing to do here...
    148     }
    149 
    150     void reset()
    151     {
    152         fSyllableCount += 1;
    153     }
    154 
    155     void writeChar(LEUnicode ch, le_uint32 charIndex, FeatureMask featureMask)
    156     {
    157         LEErrorCode success = LE_NO_ERROR;
    158 
    159         fOutChars[fOutIndex] = ch;
    160 
    161         fGlyphStorage.setCharIndex(fOutIndex, charIndex, success);
    162         fGlyphStorage.setAuxData(fOutIndex, featureMask, success);
    163 
    164         fOutIndex += 1;
    165     }
    166 
    167     le_int32 getOutputIndex()
    168     {
    169         return fOutIndex;
    170     }
    171 };
    172 
    173 
    174 //TODO remove unused flags
    175 #define ccmpFeatureTag LE_CCMP_FEATURE_TAG
    176 #define blwfFeatureTag LE_BLWF_FEATURE_TAG
    177 #define pstfFeatureTag LE_PSTF_FEATURE_TAG
    178 #define presFeatureTag LE_PRES_FEATURE_TAG
    179 #define blwsFeatureTag LE_BLWS_FEATURE_TAG
    180 #define abvsFeatureTag LE_ABVS_FEATURE_TAG
    181 #define pstsFeatureTag LE_PSTS_FEATURE_TAG
    182 
    183 #define blwmFeatureTag LE_BLWM_FEATURE_TAG
    184 #define abvmFeatureTag LE_ABVM_FEATURE_TAG
    185 #define distFeatureTag LE_DIST_FEATURE_TAG
    186 
    187 #define prefFeatureTag LE_PREF_FEATURE_TAG
    188 #define abvfFeatureTag LE_ABVF_FEATURE_TAG
    189 #define cligFeatureTag LE_CLIG_FEATURE_TAG
    190 #define mkmkFeatureTag LE_MKMK_FEATURE_TAG
    191 
    192 // Shaping features
    193 #define prefFeatureMask 0x80000000UL
    194 #define blwfFeatureMask 0x40000000UL
    195 #define abvfFeatureMask 0x20000000UL
    196 #define pstfFeatureMask 0x10000000UL
    197 #define presFeatureMask 0x08000000UL
    198 #define blwsFeatureMask 0x04000000UL
    199 #define abvsFeatureMask 0x02000000UL
    200 #define pstsFeatureMask 0x01000000UL
    201 #define cligFeatureMask 0x00800000UL
    202 #define ccmpFeatureMask 0x00040000UL
    203 
    204 // Positioning features
    205 #define distFeatureMask 0x00400000UL
    206 #define blwmFeatureMask 0x00200000UL
    207 #define abvmFeatureMask 0x00100000UL
    208 #define mkmkFeatureMask 0x00080000UL
    209 
    210 #define tagPref    (ccmpFeatureMask | prefFeatureMask | presFeatureMask | cligFeatureMask | distFeatureMask)
    211 #define tagAbvf    (ccmpFeatureMask | abvfFeatureMask | abvsFeatureMask | cligFeatureMask | distFeatureMask | abvmFeatureMask | mkmkFeatureMask)
    212 #define tagPstf    (ccmpFeatureMask | blwfFeatureMask | blwsFeatureMask | prefFeatureMask | presFeatureMask | pstfFeatureMask | pstsFeatureMask | cligFeatureMask | distFeatureMask | blwmFeatureMask)
    213 #define tagBlwf    (ccmpFeatureMask | blwfFeatureMask | blwsFeatureMask | cligFeatureMask | distFeatureMask | blwmFeatureMask | mkmkFeatureMask)
    214 #define tagDefault (ccmpFeatureMask | prefFeatureMask | blwfFeatureMask | presFeatureMask | blwsFeatureMask | cligFeatureMask | distFeatureMask | abvmFeatureMask | blwmFeatureMask | mkmkFeatureMask)
    215 
    216 
    217 
    218 // These are in the order in which the features need to be applied
    219 // for correct processing
    220 static const FeatureMap featureMap[] =
    221 {
    222     // Shaping features
    223     {ccmpFeatureTag, ccmpFeatureMask},
    224     {prefFeatureTag, prefFeatureMask},
    225     {blwfFeatureTag, blwfFeatureMask},
    226     {abvfFeatureTag, abvfFeatureMask},
    227     {pstfFeatureTag, pstfFeatureMask},
    228     {presFeatureTag, presFeatureMask},
    229     {blwsFeatureTag, blwsFeatureMask},
    230     {abvsFeatureTag, abvsFeatureMask},
    231     {pstsFeatureTag, pstsFeatureMask},
    232     {cligFeatureTag, cligFeatureMask},
    233 
    234     // Positioning features
    235     {distFeatureTag, distFeatureMask},
    236     {blwmFeatureTag, blwmFeatureMask},
    237     {abvmFeatureTag, abvmFeatureMask},
    238     {mkmkFeatureTag, mkmkFeatureMask},
    239 };
    240 
    241 static const le_int32 featureMapCount = LE_ARRAY_SIZE(featureMap);
    242 
    243 // The stateTable is used to calculate the end (the length) of a well
    244 // formed Tibetan Syllable.
    245 //
    246 // Each horizontal line is ordered exactly the same way as the values in TibetanClassTable
    247 // CharClassValues in TibetanReordering.h This coincidence of values allows the
    248 // follow up of the table.
    249 //
    250 // Each line corresponds to a state, which does not necessarily need to be a type
    251 // of component... for example, state 2 is a base, with is always a first character
    252 // in the syllable, but the state could be produced a consonant of any type when
    253 // it is the first character that is analysed (in ground state).
    254 //
    255 static const le_int8 tibetanStateTable[][TibetanClassTable::CC_COUNT] =
    256 {
    257 
    258 
    259     //Dzongkha state table
    260     //xx  ba  sj  tp  ac  cs  ha  bv  av  an  cb  vs  as  bs  di  pd  bd
    261     { 1,  2,  4,  3,  8,  7,  9, 10, 14, 13, 17, 18, 19, 19, 20, 21, 21,}, //  0 - ground state
    262     {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,}, //  1 - exit state (or sign to the right of the syllable)
    263     {-1, -1,  4,  3,  8,  7,  9, 10, 14, 13, 17, 18, 19, 19, -1, -1, -1,}, //  2 - Base consonant
    264     {-1, -1,  5, -1,  8,  7, -1, 10, 14, 13, 17, 18, 19, 19, -1, -1, -1,}, //  3 - Tsa phru after base
    265     {-1, -1,  4,  6,  8,  7,  9, 10, 14, 13, 17, 18, 19, 19, -1, -1, -1,}, //  4 - Subjoined consonant after base
    266     {-1, -1,  5, -1,  8,  7, -1, 10, 14, 13, 17, 18, 19, 19, -1, -1, -1,}, //  5 - Subjoined consonant after tsa phru
    267     {-1, -1, -1, -1,  8,  7, -1, 10, 14, 13, 17, 18, 19, 19, -1, -1, -1,}, //  6 - Tsa phru after subjoined consonant
    268     {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 19, 19, -1, -1, -1,}, //  7 - Pre Composed Sanskrit
    269     {-1, -1, -1, -1, -1, -1, -1, 10, 14, 13, 17, 18, 19, 19, -1, -1, -1,}, //  8 - A-chung
    270     {-1, -1, -1, -1, -1, -1, -1, -1, 14, 13, 17, -1, 19, 19, -1, -1, -1,}, //  9 - Halanta
    271     {-1, -1, -1, -1, -1, -1, -1, 11, 14, 13, 17, 18, 19, 19, -1, -1, -1,}, // 10 - below vowel 1
    272     {-1, -1, -1, -1, -1, -1, -1, 12, 14, 13, 17, 18, 19, 19, -1, -1, -1,}, // 11 - below vowel 2
    273     {-1, -1, -1, -1, -1, -1, -1, -1, 14, 13, 17, 18, 19, 19, -1, -1, -1,}, // 12 - below vowel 3
    274     {-1, -1, -1, -1, -1, -1, -1, -1, 14, 17, 17, 18, 19, 19, -1, -1, -1,}, // 13 - Anusvara before vowel
    275     {-1, -1, -1, -1, -1, -1, -1, -1, 15, 17, 17, 18, 19, 19, -1, -1, -1,}, // 14 - above vowel 1
    276     {-1, -1, -1, -1, -1, -1, -1, -1, 16, 17, 17, 18, 19, 19, -1, -1, -1,}, // 15 - above vowel 2
    277     {-1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 17, 18, 19, 19, -1, -1, -1,}, // 16 - above vowel 3
    278     {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 18, 19, 19, -1, -1, -1,}, // 17 - Anusvara or Candrabindu after vowel
    279     {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 19, 19, -1, -1, -1,}, // 18 - Visarga
    280     {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,}, // 19 - strss mark
    281     {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 21, 21,}, // 20 - digit
    282     {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,}, // 21 - digit mark
    283 
    284 
    285 };
    286 
    287 
    288 const FeatureMap *TibetanReordering::getFeatureMap(le_int32 &count)
    289 {
    290     count = featureMapCount;
    291 
    292     return featureMap;
    293 }
    294 
    295 
    296 // Given an input string of characters and a location in which to start looking
    297 // calculate, using the state table, which one is the last character of the syllable
    298 // that starts in the starting position.
    299 le_int32 TibetanReordering::findSyllable(const TibetanClassTable *classTable, const LEUnicode *chars, le_int32 prev, le_int32 charCount)
    300 {
    301     le_int32 cursor = prev;
    302     le_int8 state = 0;
    303 
    304     while (cursor < charCount) {
    305         TibetanClassTable::CharClass charClass = (classTable->getCharClass(chars[cursor]) & TibetanClassTable::CF_CLASS_MASK);
    306 
    307         state = tibetanStateTable[state][charClass];
    308 
    309         if (state < 0) {
    310             break;
    311         }
    312 
    313         cursor += 1;
    314     }
    315 
    316     return cursor;
    317 }
    318 
    319 
    320 // This is the real reordering function as applied to the Tibetan language
    321 
    322 le_int32 TibetanReordering::reorder(const LEUnicode *chars, le_int32 charCount, le_int32,
    323                                   LEUnicode *outChars, LEGlyphStorage &glyphStorage)
    324 {
    325     const TibetanClassTable *classTable = TibetanClassTable::getTibetanClassTable();
    326 
    327     TibetanReorderingOutput output(outChars, glyphStorage);
    328     TibetanClassTable::CharClass charClass;
    329     le_int32 i, prev = 0;
    330 
    331     // This loop only exits when we reach the end of a run, which may contain
    332     // several syllables.
    333     while (prev < charCount) {
    334         le_int32 syllable = findSyllable(classTable, chars, prev, charCount);
    335 
    336         output.reset();
    337 
    338         // shall we add a dotted circle?
    339         // If in the position in which the base should be (first char in the string) there is
    340         // a character that has the Dotted circle flag (a character that cannot be a base)
    341         // then write a dotted circle
    342         if (classTable->getCharClass(chars[prev]) & TibetanClassTable::CF_DOTTED_CIRCLE) {
    343             output.writeChar(C_DOTTED_CIRCLE, prev, tagDefault);
    344         }
    345 
    346         // copy the rest to output, inverting the pre-number mark if present after a digit.
    347         for (i = prev; i < syllable; i += 1) {
    348             charClass = classTable->getCharClass(chars[i]);
    349 
    350            if ((TibetanClassTable::CF_DIGIT & charClass)
    351               && ( classTable->getCharClass(chars[i+1]) & TibetanClassTable::CF_PREDIGIT))
    352            {
    353          		 output.writeChar(C_PRE_NUMBER_MARK, i, tagPref);
    354                          output.writeChar(chars[i], i+1 , tagPref);
    355 			i += 1;
    356           } else {
    357             switch (charClass & TibetanClassTable::CF_POS_MASK) {
    358 
    359             	// If the present character is a number, and the next character is a pre-number combining mark
    360             // then the two characters are reordered
    361 
    362                 case TibetanClassTable::CF_POS_ABOVE :
    363                     output.writeChar(chars[i], i, tagAbvf);
    364                     break;
    365 
    366                 case TibetanClassTable::CF_POS_AFTER :
    367                     output.writeChar(chars[i], i, tagPstf);
    368                     break;
    369 
    370                 case TibetanClassTable::CF_POS_BELOW :
    371                     output.writeChar(chars[i], i, tagBlwf);
    372                     break;
    373 
    374                 default:
    375                     // default - any other characters
    376                    output.writeChar(chars[i], i, tagDefault);
    377                     break;
    378             } // switch
    379           } // if
    380         } // for
    381 
    382         prev = syllable; // move the pointer to the start of next syllable
    383     }
    384 
    385     return output.getOutputIndex();
    386 }
    387 
    388 
    389 U_NAMESPACE_END
    390