Home | History | Annotate | Download | only in i18n
      1 /*
      2  **********************************************************************
      3  *   Copyright (C) 2005-2010, International Business Machines
      4  *   Corporation and others.  All Rights Reserved.
      5  **********************************************************************
      6  */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #include "cmemory.h"
     11 
     12 #if !UCONFIG_NO_CONVERSION
     13 #include "csrsbcs.h"
     14 
     15 #define N_GRAM_SIZE 3
     16 #define N_GRAM_MASK 0xFFFFFF
     17 
     18 U_NAMESPACE_BEGIN
     19 
     20 NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap)
     21   :byteIndex(0), ngram(0)
     22 {
     23     ngramList = theNgramList;
     24     charMap   = theCharMap;
     25 
     26     ngramCount = hitCount = 0;
     27 }
     28 
     29 /*
     30  * Binary search for value in table, which must have exactly 64 entries.
     31  */
     32 
     33 int32_t NGramParser::search(const int32_t *table, int32_t value)
     34 {
     35     int32_t index = 0;
     36 
     37     if (table[index + 32] <= value) {
     38         index += 32;
     39     }
     40 
     41     if (table[index + 16] <= value) {
     42         index += 16;
     43     }
     44 
     45     if (table[index + 8] <= value) {
     46         index += 8;
     47     }
     48 
     49     if (table[index + 4] <= value) {
     50         index += 4;
     51     }
     52 
     53     if (table[index + 2] <= value) {
     54         index += 2;
     55     }
     56 
     57     if (table[index + 1] <= value) {
     58         index += 1;
     59     }
     60 
     61     if (table[index] > value) {
     62         index -= 1;
     63     }
     64 
     65     if (index < 0 || table[index] != value) {
     66         return -1;
     67     }
     68 
     69     return index;
     70 }
     71 
     72 void NGramParser::lookup(int32_t thisNgram)
     73 {
     74     ngramCount += 1;
     75 
     76     if (search(ngramList, thisNgram) >= 0) {
     77         hitCount += 1;
     78     }
     79 
     80 }
     81 
     82 void NGramParser::addByte(int32_t b)
     83 {
     84     ngram = ((ngram << 8) + b) & N_GRAM_MASK;
     85     lookup(ngram);
     86 }
     87 
     88 int32_t NGramParser::nextByte(InputText *det)
     89 {
     90     if (byteIndex >= det->fInputLen) {
     91         return -1;
     92     }
     93 
     94     return det->fInputBytes[byteIndex++];
     95 }
     96 
     97 int32_t NGramParser::parse(InputText *det)
     98 {
     99     int32_t b;
    100     bool ignoreSpace = FALSE;
    101 
    102     while ((b = nextByte(det)) >= 0) {
    103         uint8_t mb = charMap[b];
    104 
    105         // TODO: 0x20 might not be a space in all character sets...
    106         if (mb != 0) {
    107             if (!(mb == 0x20 && ignoreSpace)) {
    108                 addByte(mb);
    109             }
    110 
    111             ignoreSpace = (mb == 0x20);
    112         }
    113     }
    114 
    115     // TODO: Is this OK? The buffer could have ended in the middle of a word...
    116     addByte(0x20);
    117 
    118     double rawPercent = (double) hitCount / (double) ngramCount;
    119 
    120     //            if (rawPercent <= 2.0) {
    121     //                return 0;
    122     //            }
    123 
    124     // TODO - This is a bit of a hack to take care of a case
    125     // were we were getting a confidence of 135...
    126     if (rawPercent > 0.33) {
    127         return 98;
    128     }
    129 
    130     return (int32_t) (rawPercent * 300.0);
    131 }
    132 
    133 CharsetRecog_sbcs::CharsetRecog_sbcs()
    134 : haveC1Bytes(FALSE)
    135 {
    136     // nothing else to do
    137 }
    138 
    139 CharsetRecog_sbcs::~CharsetRecog_sbcs()
    140 {
    141     // nothing to do
    142 }
    143 
    144 int32_t CharsetRecog_sbcs::match_sbcs(InputText *det, const int32_t ngrams[],  const uint8_t byteMap[])
    145 {
    146     NGramParser parser(ngrams, byteMap);
    147     int32_t result;
    148 
    149     haveC1Bytes = det->fC1Bytes;
    150     result = parser.parse(det);
    151 
    152     return result;
    153 }
    154 
    155 static const uint8_t charMap_8859_1[] = {
            // 256-entry byte map (32 rows of 8), indexed by input byte value.
            // NGramParser::parse() reads a map of this shape as charMap[b]: 0x00 means
            // the byte is dropped, 0x20 is handled as a space, and any other value is
            // what gets added to the running n-gram.
            // Here 0x27 maps to 0x00, and 0x41-0x5A / 0x61-0x7A both map to 0x61-0x7A.
            // Entries 0xC0-0xDE hold 0xE0-0xFE, except that 0xD7 (like 0xF7) holds 0x20.
            // NOTE(review): presumably the map for ISO-8859-1 input, going by the name;
            // the code that selects it is outside this view.
    156     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    157     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    158     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    159     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    160     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    161     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    162     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    163     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    164     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    165     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    166     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    167     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    168     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    169     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    170     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    171     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    172     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    173     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    174     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    175     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    176     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    177     0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
    178     0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
    179     0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
    180     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    181     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    182     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
    183     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
    184     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    185     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    186     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
    187     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
    188 };
    189 
    190 static const uint8_t charMap_8859_2[] = {
            // 256-entry byte map (32 rows of 8), indexed by input byte value.
            // NGramParser::parse() reads a map of this shape as charMap[b]: 0x00 means
            // the byte is dropped, 0x20 is handled as a space, and any other value is
            // what gets added to the running n-gram.
            // Here 0x27 maps to 0x00, and 0x41-0x5A / 0x61-0x7A both map to 0x61-0x7A.
            // NOTE(review): presumably the map for ISO-8859-2 input, going by the name;
            // the code that selects it is outside this view.
    191     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    192     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    193     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    194     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    195     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    196     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    197     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    198     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    199     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    200     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    201     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    202     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    203     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    204     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    205     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    206     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    207     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    208     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    209     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    210     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    211     0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0x20,
    212     0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
    213     0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0xB7,
    214     0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
    215     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    216     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    217     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
    218     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
    219     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    220     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    221     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
    222     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
    223 };
    224 
    225 static const uint8_t charMap_8859_5[] = {
            // 256-entry byte map (32 rows of 8), indexed by input byte value.
            // NGramParser::parse() reads a map of this shape as charMap[b]: 0x00 means
            // the byte is dropped, 0x20 is handled as a space, and any other value is
            // what gets added to the running n-gram.
            // Here 0x27 maps to 0x00, and 0x41-0x5A / 0x61-0x7A both map to 0x61-0x7A.
            // Entries 0xB0-0xCF hold the same values (0xD0-0xEF) as entries 0xD0-0xEF.
            // NOTE(review): presumably the map for ISO-8859-5 input, going by the name;
            // the code that selects it is outside this view.
    226     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    227     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    228     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    229     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    230     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    231     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    232     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    233     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    234     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    235     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    236     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    237     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    238     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    239     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    240     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    241     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    242     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    243     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    244     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    245     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    246     0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    247     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
    248     0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
    249     0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
    250     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    251     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    252     0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
    253     0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
    254     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    255     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    256     0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    257     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
    258 };
    259 
    260 static const uint8_t charMap_8859_6[] = {
            // 256-entry byte map (32 rows of 8), indexed by input byte value.
            // NGramParser::parse() reads a map of this shape as charMap[b]: 0x00 means
            // the byte is dropped, 0x20 is handled as a space, and any other value is
            // what gets added to the running n-gram.
            // Here 0x27 maps to 0x00, and 0x41-0x5A / 0x61-0x7A both map to 0x61-0x7A.
            // Above 0x7F only 0xC1-0xDA and 0xE0-0xEA map to themselves; the rest is 0x20.
            // NOTE(review): presumably the map for ISO-8859-6 input, going by the name;
            // the code that selects it is outside this view.
    261     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    262     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    263     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    264     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    265     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    266     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    267     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    268     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    269     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    270     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    271     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    272     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    273     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    274     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    275     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    276     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    277     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    278     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    279     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    280     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    281     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    282     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    283     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    284     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    285     0x20, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
    286     0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
    287     0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
    288     0xD8, 0xD9, 0xDA, 0x20, 0x20, 0x20, 0x20, 0x20,
    289     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    290     0xE8, 0xE9, 0xEA, 0x20, 0x20, 0x20, 0x20, 0x20,
    291     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    292     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    293 };
    294 
    295 static const uint8_t charMap_8859_7[] = {
            // 256-entry byte map (32 rows of 8), indexed by input byte value.
            // NGramParser::parse() reads a map of this shape as charMap[b]: 0x00 means
            // the byte is dropped, 0x20 is handled as a space, and any other value is
            // what gets added to the running n-gram.
            // Here 0x27 maps to 0x00, and 0x41-0x5A / 0x61-0x7A both map to 0x61-0x7A.
            // NOTE(review): presumably the map for ISO-8859-7 input, going by the name;
            // the code that selects it is outside this view.
    296     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    297     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    298     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    299     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    300     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    301     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    302     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    303     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    304     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    305     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    306     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    307     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    308     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    309     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    310     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    311     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    312     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    313     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    314     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    315     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    316     0x20, 0xA1, 0xA2, 0x20, 0x20, 0x20, 0x20, 0x20,
    317     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    318     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xDC, 0x20,
    319     0xDD, 0xDE, 0xDF, 0x20, 0xFC, 0x20, 0xFD, 0xFE,
    320     0xC0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    321     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    322     0xF0, 0xF1, 0x20, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    323     0xF8, 0xF9, 0xFA, 0xFB, 0xDC, 0xDD, 0xDE, 0xDF,
    324     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    325     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    326     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    327     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
    328 };
    329 
    330 static const uint8_t charMap_8859_8[] = {
            // 256-entry byte map (32 rows of 8), indexed by input byte value.
            // NGramParser::parse() reads a map of this shape as charMap[b]: 0x00 means
            // the byte is dropped, 0x20 is handled as a space, and any other value is
            // what gets added to the running n-gram.
            // Here 0x27 maps to 0x00, and 0x41-0x5A / 0x61-0x7A both map to 0x61-0x7A.
            // Above 0x7F only 0xB5 and 0xE0-0xFA map to themselves; the rest is 0x20.
            // NOTE(review): presumably the map for ISO-8859-8 input, going by the name;
            // the code that selects it is outside this view.
    331     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    332     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    333     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    334     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    335     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    336     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    337     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    338     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    339     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    340     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    341     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    342     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    343     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    344     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    345     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    346     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    347     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    348     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    349     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    350     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    351     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    352     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    353     0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
    354     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    355     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    356     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    357     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    358     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    359     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    360     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    361     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    362     0xF8, 0xF9, 0xFA, 0x20, 0x20, 0x20, 0x20, 0x20,
    363 };
    364 
    365 static const uint8_t charMap_8859_9[] = {
            // 256-entry byte map (32 rows of 8), indexed by input byte value.
            // NGramParser::parse() reads a map of this shape as charMap[b]: 0x00 means
            // the byte is dropped, 0x20 is handled as a space, and any other value is
            // what gets added to the running n-gram.
            // Here 0x27 maps to 0x00, and 0x41-0x5A / 0x61-0x7A both map to 0x61-0x7A.
            // Entry 0xDD holds 0x69 here, whereas entry 0xFD holds 0xFD.
            // NOTE(review): presumably the map for ISO-8859-9 input, going by the name;
            // the code that selects it is outside this view.
    366     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    367     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    368     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    369     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    370     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    371     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    372     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    373     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    374     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    375     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    376     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    377     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    378     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    379     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    380     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    381     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    382     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    383     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    384     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    385     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    386     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    387     0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
    388     0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
    389     0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
    390     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    391     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    392     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
    393     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x69, 0xFE, 0xDF,
    394     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    395     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    396     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
    397     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
    398 };
    399 
    400 static const int32_t ngrams_windows_1251[] = {
            // 64 entries in ascending order: the layout NGramParser::search() requires.
            // Each value packs three bytes into 24 bits in the order NGramParser::addByte()
            // shifts them in (oldest byte in bits 16-23, newest in bits 0-7).
            // NOTE(review): presumably n-grams of text already mapped through
            // charMap_windows_1251; the code that pairs them is outside this view.
    401     0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE,
    402     0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED,
    403     0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2,
    404     0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520,
    405 };
    406 
    407 static const uint8_t charMap_windows_1251[] = {
            // 256-entry byte map (32 rows of 8), indexed by input byte value.
            // NGramParser::parse() reads a map of this shape as charMap[b]: 0x00 means
            // the byte is dropped, 0x20 is handled as a space, and any other value is
            // what gets added to the running n-gram.
            // Here 0x27 maps to 0x00, and 0x41-0x5A / 0x61-0x7A both map to 0x61-0x7A.
            // Entries 0xC0-0xDF hold the same values (0xE0-0xFF) as entries 0xE0-0xFF.
            // NOTE(review): presumably the map for windows-1251 input, going by the name;
            // the code that selects it is outside this view.
    408     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    409     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    410     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    411     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    412     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    413     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    414     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    415     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    416     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    417     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    418     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    419     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    420     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    421     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    422     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    423     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    424     0x90, 0x83, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
    425     0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
    426     0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    427     0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
    428     0x20, 0xA2, 0xA2, 0xBC, 0x20, 0xB4, 0x20, 0x20,
    429     0xB8, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0xBF,
    430     0x20, 0x20, 0xB3, 0xB3, 0xB4, 0xB5, 0x20, 0x20,
    431     0xB8, 0x20, 0xBA, 0x20, 0xBC, 0xBE, 0xBE, 0xBF,
    432     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    433     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    434     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    435     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
    436     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    437     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    438     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    439     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
    440 };
    441 
    442 static const int32_t ngrams_windows_1256[] = {
            // 64 entries in ascending order: the layout NGramParser::search() requires.
            // Each value packs three bytes into 24 bits in the order NGramParser::addByte()
            // shifts them in (oldest byte in bits 16-23, newest in bits 0-7).
            // NOTE(review): presumably n-grams of text already mapped through
            // charMap_windows_1256; the code that pairs them is outside this view.
    443     0x20C7E1, 0x20C7E4, 0x20C8C7, 0x20DAE1, 0x20DDED, 0x20E1E1, 0x20E3E4, 0x20E6C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E120, 0xC7E1C3, 0xC7E1C7, 0xC7E1C8,
    444     0xC7E1CA, 0xC7E1CC, 0xC7E1CD, 0xC7E1CF, 0xC7E1D3, 0xC7E1DA, 0xC7E1DE, 0xC7E1E3, 0xC7E1E6, 0xC7E1ED, 0xC7E320, 0xC7E420, 0xC7E4CA, 0xC820C7, 0xC920C7, 0xC920DD,
    445     0xC920E1, 0xC920E3, 0xC920E6, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xDA20C7, 0xDAE1EC, 0xDDED20, 0xE120C7, 0xE1C920, 0xE1EC20, 0xE1ED20,
    446     0xE320C7, 0xE3C720, 0xE3C920, 0xE3E420, 0xE420C7, 0xE520C7, 0xE5C720, 0xE6C7E1, 0xE6E420, 0xEC20C7, 0xED20C7, 0xED20E3, 0xED20E6, 0xEDC920, 0xEDD120, 0xEDE420,
    447 };
    448 
    449 static const uint8_t charMap_windows_1256[] = {
            // 256-entry byte map (32 rows of 8), indexed by input byte value.
            // NGramParser::parse() reads a map of this shape as charMap[b]: 0x00 means
            // the byte is dropped, 0x20 is handled as a space, and any other value is
            // what gets added to the running n-gram.
            // Here 0x27 maps to 0x00, and 0x41-0x5A / 0x61-0x7A both map to 0x61-0x7A.
            // Entries 0xC0-0xEF map to themselves, except 0xD7 which holds 0x20.
            // NOTE(review): presumably the map for windows-1256 input, going by the name;
            // the code that selects it is outside this view.
    450     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    451     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    452     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    453     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    454     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    455     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    456     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    457     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    458     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    459     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    460     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    461     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    462     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    463     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    464     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    465     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    466     0x20, 0x81, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
    467     0x88, 0x20, 0x8A, 0x20, 0x9C, 0x8D, 0x8E, 0x8F,
    468     0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    469     0x98, 0x20, 0x9A, 0x20, 0x9C, 0x20, 0x20, 0x9F,
    470     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    471     0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
    472     0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
    473     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    474     0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
    475     0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
    476     0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0x20,
    477     0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
    478     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    479     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    480     0x20, 0x20, 0x20, 0x20, 0xF4, 0x20, 0x20, 0x20,
    481     0x20, 0xF9, 0x20, 0xFB, 0xFC, 0x20, 0x20, 0xFF,
    482 };
    483 
    484 static const int32_t ngrams_KOI8_R[] = {
            // 64 entries in ascending order: the layout NGramParser::search() requires.
            // Each value packs three bytes into 24 bits in the order NGramParser::addByte()
            // shifts them in (oldest byte in bits 16-23, newest in bits 0-7).
            // NOTE(review): presumably n-grams of text already mapped through
            // charMap_KOI8_R; the code that pairs them is outside this view.
    485     0x20C4CF, 0x20C920, 0x20CBC1, 0x20CBCF, 0x20CEC1, 0x20CEC5, 0x20CFC2, 0x20D0CF, 0x20D0D2, 0x20D2C1, 0x20D3CF, 0x20D3D4, 0x20D4CF, 0x20D720, 0x20D7CF, 0x20DAC1,
    486     0x20DCD4, 0x20DED4, 0xC1CEC9, 0xC1D4D8, 0xC5CCD8, 0xC5CEC9, 0xC5D3D4, 0xC5D420, 0xC7CF20, 0xC920D0, 0xC9C520, 0xC9C920, 0xC9D120, 0xCCC5CE, 0xCCC920, 0xCCD8CE,
    487     0xCEC120, 0xCEC520, 0xCEC9C5, 0xCEC9D1, 0xCECF20, 0xCECFD7, 0xCF20D0, 0xCF20D3, 0xCF20D7, 0xCFC7CF, 0xCFCA20, 0xCFCCD8, 0xCFCD20, 0xCFD3D4, 0xCFD720, 0xCFD7C1,
    488     0xD0CFCC, 0xD0D2C5, 0xD0D2C9, 0xD0D2CF, 0xD2C1D7, 0xD2C5C4, 0xD3D120, 0xD3D4C1, 0xD3D4C9, 0xD3D4D7, 0xD4C5CC, 0xD4CF20, 0xD4CFD2, 0xD4D820, 0xD9C820, 0xDED4CF,
    489 };
    490 
    491 static const uint8_t charMap_KOI8_R[] = {
            // 256-entry byte map (32 rows of 8), indexed by input byte value.
            // NGramParser::parse() reads a map of this shape as charMap[b]: 0x00 means
            // the byte is dropped, 0x20 is handled as a space, and any other value is
            // what gets added to the running n-gram.
            // Here 0x27 maps to 0x00, and 0x41-0x5A / 0x61-0x7A both map to 0x61-0x7A.
            // Entries 0xE0-0xFF hold the same values (0xC0-0xDF) as entries 0xC0-0xDF,
            // and 0xA3 / 0xB3 both hold 0xA3.
            // NOTE(review): presumably the map for KOI8-R input, going by the name;
            // the code that selects it is outside this view.
    492     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    493     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    494     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    495     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    496     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    497     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    498     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    499     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    500     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    501     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    502     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    503     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    504     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    505     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    506     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    507     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    508     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    509     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    510     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    511     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    512     0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
    513     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    514     0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
    515     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    516     0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
    517     0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
    518     0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
    519     0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
    520     0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
    521     0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
    522     0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
    523     0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
    524 };
    525 
    526 static const int32_t ngrams_IBM424_he_rtl[] = {
            // 64 entries in ascending order: the layout NGramParser::search() requires.
            // Each value packs three bytes into 24 bits (oldest byte in bits 16-23).
            // The byte 0x40 appears here where the 0x20-based tables in this file use 0x20.
            // NOTE(review): presumably the right-to-left variant for IBM424 text mapped
            // through charMap_IBM424_he; the code that uses it is outside this view.
    527     0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641,
    528     0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045,
    529     0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056,
    530     0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069,
    531 };
    532 
    533 static const int32_t ngrams_IBM424_he_ltr[] = {
            // 64 entries in ascending order: the layout NGramParser::search() requires.
            // Each value packs three bytes into 24 bits (oldest byte in bits 16-23).
            // The byte 0x40 appears here where the 0x20-based tables in this file use 0x20.
            // NOTE(review): presumably the left-to-right variant for IBM424 text mapped
            // through charMap_IBM424_he; the code that uses it is outside this view.
    534     0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141,
    535     0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054,
    536     0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140, 0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940,
    537     0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540, 0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651,
    538 };
    539 
    540 static const uint8_t charMap_IBM424_he[] = {
            // 256-entry byte map (16 rows of 16), indexed by input byte value; the row and
            // column labels below give the high and low nibble of the index.
            // The filler value is 0x40 rather than 0x20, and the single 0x00 entry is at 0x7D.
            // NOTE(review): NGramParser::parse() as visible here only special-cases 0x00
            // and 0x20, so 0x40 would not be collapsed like a space on that path;
            // presumably this table is consumed by code outside this view - confirm.
    541 /*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
    542 /* 0- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    543 /* 1- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    544 /* 2- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    545 /* 3- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    546 /* 4- */    0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    547 /* 5- */    0x40, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    548 /* 6- */    0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    549 /* 7- */    0x40, 0x71, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00, 0x40, 0x40,
    550 /* 8- */    0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    551 /* 9- */    0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    552 /* A- */    0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    553 /* B- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    554 /* C- */    0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    555 /* D- */    0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    556 /* E- */    0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    557 /* F- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    558 };
    559 
    560 static const int32_t ngrams_IBM420_ar_rtl[] = {
            // 64 entries in ascending order: the layout NGramParser::search() requires.
            // Each value packs three bytes into 24 bits (oldest byte in bits 16-23).
            // The byte 0x40 appears here where the 0x20-based tables in this file use 0x20.
            // NOTE(review): presumably the right-to-left variant for IBM420 text mapped
            // through charMap_IBM420_ar; the code that uses it is outside this view.
    561     0x4056B1, 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56, 0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158,
    562     0x56B163, 0x56B167, 0x56B169, 0x56B173, 0x56B178, 0x56B19A, 0x56B1AD, 0x56B1BB, 0x56B1CF, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x6240AB,
    563     0x6240B1, 0x6240BB, 0x6240CF, 0x634056, 0x734056, 0x736240, 0x754056, 0x756240, 0x784056, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB1DA40, 0xB1DC40,
    564     0xBB4056, 0xBB5640, 0xBB6240, 0xBBBD40, 0xBD4056, 0xBF4056, 0xBF5640, 0xCF56B1, 0xCFBD40, 0xDA4056, 0xDC4056, 0xDC40BB, 0xDC40CF, 0xDC6240, 0xDC7540, 0xDCBD40,
    565 };
    566 
    567 static const int32_t ngrams_IBM420_ar_ltr[] = {
            // 64 entries in ascending order: the layout NGramParser::search() requires.
            // Each value packs three bytes into 24 bits (oldest byte in bits 16-23).
            // The byte 0x40 appears here where the 0x20-based tables in this file use 0x20.
            // NOTE(review): presumably the left-to-right variant for IBM420 text mapped
            // through charMap_IBM420_ar; the code that uses it is outside this view.
    568     0x404656, 0x4056BB, 0x4056BF, 0x406273, 0x406275, 0x4062B1, 0x4062BB, 0x4062DC, 0x406356, 0x407556, 0x4075DC, 0x40B156, 0x40BB56, 0x40BD56, 0x40BDBB, 0x40BDCF,
    569     0x40BDDC, 0x40DAB1, 0x40DCAB, 0x40DCB1, 0x49B156, 0x564056, 0x564058, 0x564062, 0x564063, 0x564073, 0x564075, 0x564078, 0x56409A, 0x5640B1, 0x5640BB, 0x5640BD,
    570     0x5640BF, 0x5640DA, 0x5640DC, 0x565840, 0x56B156, 0x56CF40, 0x58B156, 0x63B156, 0x63BD56, 0x67B156, 0x69B156, 0x73B156, 0x78B156, 0x9AB156, 0xAB4062, 0xADB156,
    571     0xB14062, 0xB15640, 0xB156CF, 0xB19A40, 0xB1B140, 0xBB4062, 0xBB40DC, 0xBBB156, 0xBD5640, 0xBDBB40, 0xCF4062, 0xCF40DC, 0xCFB156, 0xDAB19A, 0xDCAB40, 0xDCB156
    572 };
    573 
    574 static const uint8_t charMap_IBM420_ar[]= {
            // 256-entry byte map (16 rows of 16), indexed by input byte value; the row and
            // column labels below give the high and low nibble of the index.
            // The filler value is 0x40 rather than 0x20, and no entry is 0x00.
            // NOTE(review): NGramParser::parse() as visible here only special-cases 0x00
            // and 0x20, so 0x40 would not be collapsed like a space on that path;
            // presumably this table is consumed by code outside this view - confirm.
    575 /*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
    576 /* 0- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    577 /* 1- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    578 /* 2- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    579 /* 3- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    580 /* 4- */    0x40, 0x40, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    581 /* 5- */    0x40, 0x51, 0x52, 0x40, 0x40, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    582 /* 6- */    0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    583 /* 7- */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    584 /* 8- */    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
    585 /* 9- */    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
    586 /* A- */    0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
    587 /* B- */    0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
    588 /* C- */    0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0xCB, 0x40, 0xCD, 0x40, 0xCF,
    589 /* D- */    0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
    590 /* E- */    0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF,
    591 /* F- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40,
    592 };
    593 
    594 //ISO-8859-1,2,5,6,7,8,9 Ngrams
    595 static const int32_t ngrams_8859_1_en[] = {
    596     0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F,
    597     0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74,
    598     0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420,
    599     0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320,
    600 };
    601 
    602 static const int32_t ngrams_8859_1_da[] = {
    603     0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620,
    604     0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320,
    605     0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520,
    606     0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572,
    607 };
    608 
    609 static const int32_t ngrams_8859_1_de[] = {
    610     0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F,
    611     0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220,
    612     0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465,
    613     0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572,
    614 };
    615 
    616 static const int32_t ngrams_8859_1_es[] = {
    617     0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
    618     0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C,
    619     0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064,
    620     0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20,
    621 };
    622 
    623 static const int32_t ngrams_8859_1_fr[] = {
    624     0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E,
    625     0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20,
    626     0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420,
    627     0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220,
    628 };
    629 
    630 static const int32_t ngrams_8859_1_it[] = {
    631     0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073,
    632     0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220,
    633     0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20,
    634     0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F,
    635 };
    636 
    637 static const int32_t ngrams_8859_1_nl[] = {
    638     0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665,
    639     0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E,
    640     0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F,
    641     0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F,
    642 };
    643 
    644 static const int32_t ngrams_8859_1_no[] = {
    645     0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469,
    646     0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474,
    647     0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65,
    648     0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572,
    649 };
    650 
    651 static const int32_t ngrams_8859_1_pt[] = {
    652     0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
    653     0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20,
    654     0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065,
    655     0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F,
    656 };
    657 
    658 static const int32_t ngrams_8859_1_sv[] = {
    659     0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469,
    660     0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220,
    661     0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20,
    662     0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220,
    663 };
    664 
    665 static const int32_t ngrams_8859_2_cs[] = {
    666     0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F,
    667     0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465,
    668     0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865,
    669     0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564,
    670 };
    671 
    672 static const int32_t ngrams_8859_2_hu[] = {
    673     0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69,
    674     0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20,
    675     0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061,
    676     0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320,
    677 };
    678 
    679 static const int32_t ngrams_8859_2_pl[] = {
    680     0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779,
    681     0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20,
    682     0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769,
    683     0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720,
    684 };
    685 
    686 static const int32_t ngrams_8859_2_ro[] = {
    687     0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69,
    688     0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070,
    689     0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72,
    690     0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20,
    691 };
    692 
    693 static const int32_t ngrams_8859_5_ru[] = {
    694     0x20D220, 0x20D2DE, 0x20D4DE, 0x20D7D0, 0x20D820, 0x20DAD0, 0x20DADE, 0x20DDD0, 0x20DDD5, 0x20DED1, 0x20DFDE, 0x20DFE0, 0x20E0D0, 0x20E1DE, 0x20E1E2, 0x20E2DE,
    695     0x20E7E2, 0x20EDE2, 0xD0DDD8, 0xD0E2EC, 0xD3DE20, 0xD5DBEC, 0xD5DDD8, 0xD5E1E2, 0xD5E220, 0xD820DF, 0xD8D520, 0xD8D820, 0xD8EF20, 0xDBD5DD, 0xDBD820, 0xDBECDD,
    696     0xDDD020, 0xDDD520, 0xDDD8D5, 0xDDD8EF, 0xDDDE20, 0xDDDED2, 0xDE20D2, 0xDE20DF, 0xDE20E1, 0xDED220, 0xDED2D0, 0xDED3DE, 0xDED920, 0xDEDBEC, 0xDEDC20, 0xDEE1E2,
    697     0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2, 0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520,
    698 };
    699 
    700 static const int32_t ngrams_8859_6_ar[] = {
    701     0x20C7E4, 0x20C7E6, 0x20C8C7, 0x20D9E4, 0x20E1EA, 0x20E4E4, 0x20E5E6, 0x20E8C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E420, 0xC7E4C3, 0xC7E4C7, 0xC7E4C8,
    702     0xC7E4CA, 0xC7E4CC, 0xC7E4CD, 0xC7E4CF, 0xC7E4D3, 0xC7E4D9, 0xC7E4E2, 0xC7E4E5, 0xC7E4E8, 0xC7E4EA, 0xC7E520, 0xC7E620, 0xC7E6CA, 0xC820C7, 0xC920C7, 0xC920E1,
    703     0xC920E4, 0xC920E5, 0xC920E8, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xD920C7, 0xD9E4E9, 0xE1EA20, 0xE420C7, 0xE4C920, 0xE4E920, 0xE4EA20,
    704     0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620,
    705 };
    706 
    707 static const int32_t ngrams_8859_7_el[] = {
    708     0x20E1ED, 0x20E1F0, 0x20E3E9, 0x20E4E9, 0x20E5F0, 0x20E720, 0x20EAE1, 0x20ECE5, 0x20EDE1, 0x20EF20, 0x20F0E1, 0x20F0EF, 0x20F0F1, 0x20F3F4, 0x20F3F5, 0x20F4E7,
    709     0x20F4EF, 0xDFE120, 0xE120E1, 0xE120F4, 0xE1E920, 0xE1ED20, 0xE1F0FC, 0xE1F220, 0xE3E9E1, 0xE5E920, 0xE5F220, 0xE720F4, 0xE7ED20, 0xE7F220, 0xE920F4, 0xE9E120,
    710     0xE9EADE, 0xE9F220, 0xEAE1E9, 0xEAE1F4, 0xECE520, 0xED20E1, 0xED20E5, 0xED20F0, 0xEDE120, 0xEFF220, 0xEFF520, 0xF0EFF5, 0xF0F1EF, 0xF0FC20, 0xF220E1, 0xF220E5,
    711     0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20,
    712 };
    713 
    714 static const int32_t ngrams_8859_8_I_he[] = {
    715     0x20E0E5, 0x20E0E7, 0x20E0E9, 0x20E0FA, 0x20E1E9, 0x20E1EE, 0x20E4E0, 0x20E4E5, 0x20E4E9, 0x20E4EE, 0x20E4F2, 0x20E4F9, 0x20E4FA, 0x20ECE0, 0x20ECE4, 0x20EEE0,
    716     0x20F2EC, 0x20F9EC, 0xE0FA20, 0xE420E0, 0xE420E1, 0xE420E4, 0xE420EC, 0xE420EE, 0xE420F9, 0xE4E5E0, 0xE5E020, 0xE5ED20, 0xE5EF20, 0xE5F820, 0xE5FA20, 0xE920E4,
    717     0xE9E420, 0xE9E5FA, 0xE9E9ED, 0xE9ED20, 0xE9EF20, 0xE9F820, 0xE9FA20, 0xEC20E0, 0xEC20E4, 0xECE020, 0xECE420, 0xED20E0, 0xED20E1, 0xED20E4, 0xED20EC, 0xED20EE,
    718     0xED20F9, 0xEEE420, 0xEF20E4, 0xF0E420, 0xF0E920, 0xF0E9ED, 0xF2EC20, 0xF820E4, 0xF8E9ED, 0xF9EC20, 0xFA20E0, 0xFA20E1, 0xFA20E4, 0xFA20EC, 0xFA20EE, 0xFA20F9,
    719 };
    720 
    721 static const int32_t ngrams_8859_8_he[] = {
    722     0x20E0E5, 0x20E0EC, 0x20E4E9, 0x20E4EC, 0x20E4EE, 0x20E4F0, 0x20E9F0, 0x20ECF2, 0x20ECF9, 0x20EDE5, 0x20EDE9, 0x20EFE5, 0x20EFE9, 0x20F8E5, 0x20F8E9, 0x20FAE0,
    723     0x20FAE5, 0x20FAE9, 0xE020E4, 0xE020EC, 0xE020ED, 0xE020FA, 0xE0E420, 0xE0E5E4, 0xE0EC20, 0xE0EE20, 0xE120E4, 0xE120ED, 0xE120FA, 0xE420E4, 0xE420E9, 0xE420EC,
    724     0xE420ED, 0xE420EF, 0xE420F8, 0xE420FA, 0xE4EC20, 0xE5E020, 0xE5E420, 0xE7E020, 0xE9E020, 0xE9E120, 0xE9E420, 0xEC20E4, 0xEC20ED, 0xEC20FA, 0xECF220, 0xECF920,
    725     0xEDE9E9, 0xEDE9F0, 0xEDE9F8, 0xEE20E4, 0xEE20ED, 0xEE20FA, 0xEEE120, 0xEEE420, 0xF2E420, 0xF920E4, 0xF920ED, 0xF920FA, 0xF9E420, 0xFAE020, 0xFAE420, 0xFAE5E9,
    726 };
    727 
    728 static const int32_t ngrams_8859_9_tr[] = {
    729     0x206261, 0x206269, 0x206275, 0x206461, 0x206465, 0x206765, 0x206861, 0x20696C, 0x206B61, 0x206B6F, 0x206D61, 0x206F6C, 0x207361, 0x207461, 0x207665, 0x207961,
    730     0x612062, 0x616B20, 0x616C61, 0x616D61, 0x616E20, 0x616EFD, 0x617220, 0x617261, 0x6172FD, 0x6173FD, 0x617961, 0x626972, 0x646120, 0x646520, 0x646920, 0x652062,
    731     0x65206B, 0x656469, 0x656E20, 0x657220, 0x657269, 0x657369, 0x696C65, 0x696E20, 0x696E69, 0x697220, 0x6C616E, 0x6C6172, 0x6C6520, 0x6C6572, 0x6E2061, 0x6E2062,
    732     0x6E206B, 0x6E6461, 0x6E6465, 0x6E6520, 0x6E6920, 0x6E696E, 0x6EFD20, 0x72696E, 0x72FD6E, 0x766520, 0x796120, 0x796F72, 0xFD6E20, 0xFD6E64, 0xFD6EFD, 0xFDF0FD,
    733 };
    734 
    735 CharsetRecog_8859_1::~CharsetRecog_8859_1()
    736 {
    737     // nothing to do
    738 }
    739 
    740 const char *CharsetRecog_8859_1::getName() const
    741 {
    742     return haveC1Bytes? "windows-1252" : "ISO-8859-1";
    743 }
    744 
    745 const char *CharsetRecog_8859_1_en::getLanguage() const
    746 {
    747     return "en";
    748 }
    749 
    750 CharsetRecog_8859_1_en::~CharsetRecog_8859_1_en()
    751 {
    752     // nothing to do
    753 }
    754 
    755 int32_t CharsetRecog_8859_1_en::match(InputText *textIn)
    756 {
    757     int32_t result = match_sbcs(textIn, ngrams_8859_1_en, charMap_8859_1);
    758 
    759    // printf("8859_1_en: result = %d\n", result);
    760     return result; //match_sbcs(textIn, ngrams, charMap);
    761 }
    762 
    763 CharsetRecog_8859_1_da::~CharsetRecog_8859_1_da()
    764 {
    765     // nothing to do
    766 }
    767 
    768 const char *CharsetRecog_8859_1_da::getLanguage() const
    769 {
    770     return "da";
    771 }
    772 
    773 int32_t CharsetRecog_8859_1_da::match(InputText *textIn)
    774 {
    775     return match_sbcs(textIn, ngrams_8859_1_da, charMap_8859_1);
    776 }
    777 
    778 CharsetRecog_8859_1_de::~CharsetRecog_8859_1_de() {}
    779 
    780 const char *CharsetRecog_8859_1_de::getLanguage() const
    781 {
    782     return "de";
    783 }
    784 
    785 int32_t CharsetRecog_8859_1_de::match(InputText *textIn)
    786 {
    787     return match_sbcs(textIn, ngrams_8859_1_de, charMap_8859_1);
    788 }
    789 
    790 CharsetRecog_8859_1_es::~CharsetRecog_8859_1_es()
    791 {
    792     // nothing to do
    793 }
    794 
    795 const char *CharsetRecog_8859_1_es::getLanguage() const
    796 {
    797     return "es";
    798 }
    799 
    800 int32_t CharsetRecog_8859_1_es::match(InputText *textIn)
    801 {
    802     return match_sbcs(textIn, ngrams_8859_1_es, charMap_8859_1);
    803 }
    804 
    805 CharsetRecog_8859_1_fr::~CharsetRecog_8859_1_fr()
    806 {
    807     // nothing to do
    808 }
    809 
    810 const char *CharsetRecog_8859_1_fr::getLanguage() const
    811 {
    812     return "fr";
    813 }
    814 
    815 int32_t CharsetRecog_8859_1_fr::match(InputText *textIn)
    816 {
    817     return match_sbcs(textIn, ngrams_8859_1_fr, charMap_8859_1);
    818 }
    819 
    820 CharsetRecog_8859_1_it::~CharsetRecog_8859_1_it()
    821 {
    822     // nothing to do
    823 }
    824 
    825 const char *CharsetRecog_8859_1_it::getLanguage() const
    826 {
    827     return "it";
    828 }
    829 
    830 int32_t CharsetRecog_8859_1_it::match(InputText *textIn)
    831 {
    832     return match_sbcs(textIn, ngrams_8859_1_it, charMap_8859_1);
    833 }
    834 
    835 CharsetRecog_8859_1_nl::~CharsetRecog_8859_1_nl()
    836 {
    837     // nothing to do
    838 }
    839 
    840 const char *CharsetRecog_8859_1_nl::getLanguage() const
    841 {
    842     return "nl";
    843 }
    844 
    845 int32_t CharsetRecog_8859_1_nl::match(InputText *textIn)
    846 {
    847     return match_sbcs(textIn, ngrams_8859_1_nl, charMap_8859_1);
    848 }
    849 
    850 CharsetRecog_8859_1_no::~CharsetRecog_8859_1_no() {}
    851 
    852 const char *CharsetRecog_8859_1_no::getLanguage() const
    853 {
    854     return "no";
    855 }
    856 
    857 int32_t CharsetRecog_8859_1_no::match(InputText *textIn)
    858 {
    859     return match_sbcs(textIn, ngrams_8859_1_no, charMap_8859_1);
    860 }
    861 
    862 CharsetRecog_8859_1_pt::~CharsetRecog_8859_1_pt()
    863 {
    864     // nothing to do
    865 }
    866 
    867 const char *CharsetRecog_8859_1_pt::getLanguage() const
    868 {
    869     return "pt";
    870 }
    871 
    872 int32_t CharsetRecog_8859_1_pt::match(InputText *textIn)
    873 {
    874     return match_sbcs(textIn, ngrams_8859_1_pt, charMap_8859_1);
    875 }
    876 
    877 CharsetRecog_8859_1_sv::~CharsetRecog_8859_1_sv() {}
    878 
    879 const char *CharsetRecog_8859_1_sv::getLanguage() const
    880 {
    881     return "sv";
    882 }
    883 
    884 int32_t CharsetRecog_8859_1_sv::match(InputText *textIn)
    885 {
    886     return match_sbcs(textIn, ngrams_8859_1_sv, charMap_8859_1);
    887 }
    888 
    889 CharsetRecog_8859_2::~CharsetRecog_8859_2()
    890 {
    891     // nothing to do
    892 }
    893 
    894 const char *CharsetRecog_8859_2::getName() const
    895 {
    896     return haveC1Bytes? "windows-1250" : "ISO-8859-2";
    897 }
    898 
    899 CharsetRecog_8859_2_cs::~CharsetRecog_8859_2_cs()
    900 {
    901     // nothing to do
    902 }
    903 
    904 const char *CharsetRecog_8859_2_cs::getLanguage() const
    905 {
    906     return "cs";
    907 }
    908 
    909 int32_t CharsetRecog_8859_2_cs::match(InputText *textIn)
    910 {
    911     return match_sbcs(textIn, ngrams_8859_2_cs, charMap_8859_2);
    912 }
    913 
    914 CharsetRecog_8859_2_hu::~CharsetRecog_8859_2_hu()
    915 {
    916     // nothing to do
    917 }
    918 
    919 const char *CharsetRecog_8859_2_hu::getLanguage() const
    920 {
    921     return "hu";
    922 }
    923 
    924 int32_t CharsetRecog_8859_2_hu::match(InputText *textIn)
    925 {
    926     return match_sbcs(textIn, ngrams_8859_2_hu, charMap_8859_2);
    927 }
    928 
    929 CharsetRecog_8859_2_pl::~CharsetRecog_8859_2_pl()
    930 {
    931     // nothing to do
    932 }
    933 
    934 const char *CharsetRecog_8859_2_pl::getLanguage() const
    935 {
    936     return "pl";
    937 }
    938 
    939 int32_t CharsetRecog_8859_2_pl::match(InputText *textIn)
    940 {
    941     return match_sbcs(textIn, ngrams_8859_2_pl, charMap_8859_2);
    942 }
    943 
    944 CharsetRecog_8859_2_ro::~CharsetRecog_8859_2_ro()
    945 {
    946     // nothing to do
    947 }
    948 
    949 const char *CharsetRecog_8859_2_ro::getLanguage() const
    950 {
    951     return "ro";
    952 }
    953 
    954 int32_t CharsetRecog_8859_2_ro::match(InputText *textIn)
    955 {
    956     return match_sbcs(textIn, ngrams_8859_2_ro, charMap_8859_2);
    957 }
    958 
    959 CharsetRecog_8859_5::~CharsetRecog_8859_5()
    960 {
    961     // nothing to do
    962 }
    963 
    964 const char *CharsetRecog_8859_5::getName() const
    965 {
    966     return "ISO-8859-5";
    967 }
    968 
    969 CharsetRecog_8859_5_ru::~CharsetRecog_8859_5_ru()
    970 {
    971     // nothing to do
    972 }
    973 
    974 const char *CharsetRecog_8859_5_ru::getLanguage() const
    975 {
    976     return "ru";
    977 }
    978 
    979 int32_t CharsetRecog_8859_5_ru::match(InputText *textIn)
    980 {
    981     return match_sbcs(textIn, ngrams_8859_5_ru, charMap_8859_5);
    982 }
    983 
    984 CharsetRecog_8859_6::~CharsetRecog_8859_6()
    985 {
    986     // nothing to do
    987 }
    988 
    989 const char *CharsetRecog_8859_6::getName() const
    990 {
    991     return "ISO-8859-6";
    992 }
    993 
    994 CharsetRecog_8859_6_ar::~CharsetRecog_8859_6_ar()
    995 {
    996     // nothing to do
    997 }
    998 
    999 const char *CharsetRecog_8859_6_ar::getLanguage() const
   1000 {
   1001     return "ar";
   1002 }
   1003 
   1004 int32_t CharsetRecog_8859_6_ar::match(InputText *textIn)
   1005 {
   1006     return match_sbcs(textIn, ngrams_8859_6_ar, charMap_8859_6);
   1007 }
   1008 
   1009 CharsetRecog_8859_7::~CharsetRecog_8859_7()
   1010 {
   1011     // nothing to do
   1012 }
   1013 
   1014 const char *CharsetRecog_8859_7::getName() const
   1015 {
   1016     return haveC1Bytes? "windows-1253" : "ISO-8859-7";
   1017 }
   1018 
   1019 CharsetRecog_8859_7_el::~CharsetRecog_8859_7_el()
   1020 {
   1021     // nothing to do
   1022 }
   1023 
   1024 const char *CharsetRecog_8859_7_el::getLanguage() const
   1025 {
   1026     return "el";
   1027 }
   1028 
   1029 int32_t CharsetRecog_8859_7_el::match(InputText *textIn)
   1030 {
   1031     return match_sbcs(textIn, ngrams_8859_7_el, charMap_8859_7);
   1032 }
   1033 
   1034 CharsetRecog_8859_8::~CharsetRecog_8859_8()
   1035 {
   1036     // nothing to do
   1037 }
   1038 
   1039 const char *CharsetRecog_8859_8::getName() const
   1040 {
   1041     return haveC1Bytes? "windows-1255" : "ISO-8859-8";
   1042 }
   1043 
   1044 CharsetRecog_8859_8_I_he::~CharsetRecog_8859_8_I_he ()
   1045 {
   1046     // nothing to do
   1047 }
   1048 
   1049 const char *CharsetRecog_8859_8_I_he::getName() const
   1050 {
   1051     return haveC1Bytes? "windows-1255" : "ISO-8859-8-I";
   1052 }
   1053 
   1054 const char *CharsetRecog_8859_8_I_he::getLanguage() const
   1055 {
   1056     return "he";
   1057 }
   1058 
   1059 int32_t CharsetRecog_8859_8_I_he::match(InputText *textIn)
   1060 {
   1061     return match_sbcs(textIn, ngrams_8859_8_I_he, charMap_8859_8);
   1062 }
   1063 
   1064 CharsetRecog_8859_8_he::~CharsetRecog_8859_8_he()
   1065 {
   1066     // od ot gnihton
   1067 }
   1068 
   1069 const char *CharsetRecog_8859_8_he::getLanguage() const
   1070 {
   1071     return "he";
   1072 }
   1073 
   1074 int32_t CharsetRecog_8859_8_he::match(InputText *textIn)
   1075 {
   1076     return match_sbcs(textIn, ngrams_8859_8_he, charMap_8859_8);
   1077 }
   1078 
   1079 CharsetRecog_8859_9::~CharsetRecog_8859_9()
   1080 {
   1081     // nothing to do
   1082 }
   1083 
   1084 const char *CharsetRecog_8859_9::getName() const
   1085 {
   1086     return haveC1Bytes? "windows-1254" : "ISO-8859-9";
   1087 }
   1088 
   1089 CharsetRecog_8859_9_tr::~CharsetRecog_8859_9_tr ()
   1090 {
   1091     // nothing to do
   1092 }
   1093 
   1094 const char *CharsetRecog_8859_9_tr::getLanguage() const
   1095 {
   1096     return "tr";
   1097 }
   1098 
   1099 int32_t CharsetRecog_8859_9_tr::match(InputText *textIn)
   1100 {
   1101     return match_sbcs(textIn, ngrams_8859_9_tr, charMap_8859_9);
   1102 }
   1103 
   1104 CharsetRecog_windows_1256::~CharsetRecog_windows_1256()
   1105 {
   1106     // nothing to do
   1107 }
   1108 
   1109 const char *CharsetRecog_windows_1256::getName() const
   1110 {
   1111     return  "windows-1256";
   1112 }
   1113 
   1114 const char *CharsetRecog_windows_1256::getLanguage() const
   1115 {
   1116     return "ar";
   1117 }
   1118 
   1119 int32_t CharsetRecog_windows_1256::match(InputText *textIn)
   1120 {
   1121     return match_sbcs(textIn, ngrams_windows_1256, charMap_windows_1256);
   1122 }
   1123 
   1124 CharsetRecog_windows_1251::~CharsetRecog_windows_1251()
   1125 {
   1126     // nothing to do
   1127 }
   1128 
   1129 const char *CharsetRecog_windows_1251::getName() const
   1130 {
   1131     return  "windows-1251";
   1132 }
   1133 
   1134 const char *CharsetRecog_windows_1251::getLanguage() const
   1135 {
   1136     return "ru";
   1137 }
   1138 
   1139 int32_t CharsetRecog_windows_1251::match(InputText *textIn)
   1140 {
   1141     return match_sbcs(textIn, ngrams_windows_1251, charMap_windows_1251);
   1142 }
   1143 
   1144 CharsetRecog_KOI8_R::~CharsetRecog_KOI8_R()
   1145 {
   1146     // nothing to do
   1147 }
   1148 
   1149 const char *CharsetRecog_KOI8_R::getName() const
   1150 {
   1151     return  "KOI8-R";
   1152 }
   1153 
   1154 const char *CharsetRecog_KOI8_R::getLanguage() const
   1155 {
   1156     return "ru";
   1157 }
   1158 
   1159 int32_t CharsetRecog_KOI8_R::match(InputText *textIn)
   1160 {
   1161     return match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R);
   1162 }
   1163 
   1164 CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he()
   1165 {
   1166     // nothing to do
   1167 }
   1168 
   1169 const char *CharsetRecog_IBM424_he::getLanguage() const
   1170 {
   1171     return "he";
   1172 }
   1173 
   1174 CharsetRecog_IBM424_he_rtl::~CharsetRecog_IBM424_he_rtl()
   1175 {
   1176     // nothing to do
   1177 }
   1178 
   1179 const char *CharsetRecog_IBM424_he_rtl::getName() const
   1180 {
   1181     return  "IBM424_rtl";
   1182 }
   1183 
   1184 int32_t CharsetRecog_IBM424_he_rtl::match(InputText *textIn)
   1185 {
   1186     return match_sbcs(textIn, ngrams_IBM424_he_rtl, charMap_IBM424_he);
   1187 }
   1188 
   1189 CharsetRecog_IBM424_he_ltr::~CharsetRecog_IBM424_he_ltr()
   1190 {
   1191     // nothing to do
   1192 }
   1193 
   1194 const char *CharsetRecog_IBM424_he_ltr::getName() const
   1195 {
   1196     return  "IBM424_ltr";
   1197 }
   1198 
   1199 int32_t CharsetRecog_IBM424_he_ltr::match(InputText *textIn)
   1200 {
   1201     return match_sbcs(textIn, ngrams_IBM424_he_ltr, charMap_IBM424_he);
   1202 }
   1203 
   1204 static const uint8_t unshapeMap_IBM420[] = {
   1205 /*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
   1206 /* 0- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
   1207 /* 1- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
   1208 /* 2- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
   1209 /* 3- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
   1210 /* 4- */    0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
   1211 /* 5- */    0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
   1212 /* 6- */    0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
   1213 /* 7- */    0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
   1214 /* 8- */    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F,
   1215 /* 9- */    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E,
   1216 /* A- */    0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF,
   1217 /* B- */    0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF,
   1218 /* C- */    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF,
   1219 /* D- */    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF,
   1220 /* E- */    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
   1221 /* F- */    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
   1222 };
   1223 
   1224 CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar()
   1225 {
   1226     // nothing to do
   1227 }
   1228 
   1229 const char *CharsetRecog_IBM420_ar::getLanguage() const
   1230 {
   1231     return "ar";
   1232 }
   1233 
   1234 void CharsetRecog_IBM420_ar::matchInit(InputText *textIn) {
   1235     prev_fInputBytesLength = textIn->fInputLen;
   1236     prev_fInputBytes = textIn->fInputBytes;
   1237 
   1238     int32_t length = 0;
   1239     uint8_t *bb = unshape(prev_fInputBytes, prev_fInputBytesLength, length);
   1240 
   1241     if (bb != NULL) {
   1242         textIn->fInputBytes = bb;
   1243         textIn->fInputLen = length;
   1244 
   1245         deleteBuffer = TRUE;
   1246     } else {
   1247         deleteBuffer = FALSE;
   1248     }
   1249 }
   1250 
   1251 uint8_t *CharsetRecog_IBM420_ar::unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) {
   1252     uint8_t *resultArray = unshapeLamAlef(inputBytes, inputBytesLength, length);
   1253 
   1254     if (resultArray != NULL) {
   1255         for (int32_t i = 0; i < inputBytesLength; i++) {
   1256             resultArray[i] = unshapeMap_IBM420[resultArray[i]];
   1257         }
   1258     }
   1259 
   1260     return resultArray;
   1261 }
   1262 
   1263 uint8_t *CharsetRecog_IBM420_ar::unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) {
   1264     int32_t bigBufferLength = inputBytesLength * 2;
   1265     uint8_t *bigBuffer = (uint8_t *)uprv_malloc(bigBufferLength);
   1266     uint8_t *resultBuffer = NULL;
   1267 
   1268     if (bigBuffer != NULL) {
   1269         int32_t bufferIndex;
   1270         uint8_t unshapedLamAlef[] = { 0xb1, 0x56 };
   1271 
   1272         for (int32_t i = bufferIndex = 0; i < inputBytesLength; i++) {
   1273             if (isLamAlef(inputBytes[i])) {
   1274                 bigBuffer[bufferIndex++] = unshapedLamAlef[0];
   1275                 bigBuffer[bufferIndex++] = unshapedLamAlef[1];
   1276             } else {
   1277                 bigBuffer[bufferIndex++] = inputBytes[i];
   1278             }
   1279         }
   1280 
   1281         length = bufferIndex;
   1282         resultBuffer = (uint8_t *)uprv_malloc(length);
   1283         if (resultBuffer != NULL) {
   1284             uprv_memcpy(resultBuffer, bigBuffer, length);
   1285         }
   1286     }
   1287 
   1288     if (bigBuffer != NULL) {
   1289         uprv_free(bigBuffer);
   1290     }
   1291 
   1292     return resultBuffer;
   1293 }
   1294 
   1295 void CharsetRecog_IBM420_ar::matchFinish(InputText *textIn) {
   1296     if (deleteBuffer) {
   1297         uprv_free(textIn->fInputBytes);
   1298 
   1299         textIn->fInputBytes = prev_fInputBytes;
   1300         textIn->fInputLen = prev_fInputBytesLength;
   1301     }
   1302 }
   1303 
   1304 UBool CharsetRecog_IBM420_ar::isLamAlef(uint8_t b) {
   1305     uint8_t shapedLamAlef[] = {
   1306         0xb2, 0xb3, 0xb4, 0xb5, 0xb7, 0xb8
   1307     };
   1308 
   1309     for (uint32_t i = 0; i < sizeof(shapedLamAlef); i++) {
   1310         if (b == shapedLamAlef[i]) {
   1311             return TRUE;
   1312         }
   1313     }
   1314 
   1315     return FALSE;
   1316 }
   1317 
   1318 CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl()
   1319 {
   1320     // nothing to do
   1321 }
   1322 
   1323 const char *CharsetRecog_IBM420_ar_rtl::getName() const
   1324 {
   1325     return  "IBM420_rtl";
   1326 }
   1327 
   1328 int32_t CharsetRecog_IBM420_ar_rtl::match(InputText *textIn)
   1329 {
   1330     return match_sbcs(textIn, ngrams_IBM420_ar_rtl, charMap_IBM420_ar);
   1331 }
   1332 
   1333 CharsetRecog_IBM420_ar_ltr::~CharsetRecog_IBM420_ar_ltr()
   1334 {
   1335     // nothing to do
   1336 }
   1337 
   1338 const char *CharsetRecog_IBM420_ar_ltr::getName() const
   1339 {
   1340     return  "IBM420_ltr";
   1341 }
   1342 
   1343 int32_t CharsetRecog_IBM420_ar_ltr::match(InputText *textIn)
   1344 {
   1345     return match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420_ar);
   1346 }
   1347 
   1348 U_NAMESPACE_END
   1349 #endif
   1350 
   1351