Home | History | Annotate | Download | only in compact_lang_det
      1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef ENCODINGS_COMPACT_LANG_DET_CLDUTIL_H_
      6 #define ENCODINGS_COMPACT_LANG_DET_CLDUTIL_H_
      7 
      8 #include <string>
      9 #include "encodings/compact_lang_det/ext_lang_enc.h"
     10 #include "encodings/compact_lang_det/tote.h"
     11 #include "encodings/compact_lang_det/win/cld_basictypes.h"
     12 #include "encodings/compact_lang_det/win/cld_commandlineflags.h"
     13 #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
     14 
     15 namespace cld {
     16 
     17   // Hash bucket for four-way associative lookup with < 64K buckets.
     18   // 32 bytes per bucket, 8-byte entries (4-byte key plus 4-byte value each).
     19   typedef struct {
     20     uint32 key[4];        // hashed word to look up, one key per entry
     21     uint32 value[4];      // packed three lang numbers and probability subscript
     22   } SmallWordProbBucket4;
     23 
     24   // Hash bucket for four-way associative lookup with >= 64K buckets.
     25   // 24 bytes per bucket, 6-byte entries (2-byte key plus 4-byte value each).
     26   typedef struct {
     27     uint16 key[4];        // Half of hashed word to look up; other
     28                           //  half is used to pick the bucket
     29     uint32 value[4];      // packed three lang numbers and probability subscript
     30   } LargeQuadProbBucket4;
     31 
     32   // Hash bucket for four-way associative lookup, indirect probabilities.
     33   // 16 bytes per bucket, 4-byte entries; key and value share a single word.
     34   typedef struct {
     35     uint32 keyvalue[4];   // Upper part of word is hash, lower is indirect prob subscript
     36   } IndirectProbBucket4;
     37 
     38 
     39   // This describes a complete CLD table, consisting of
     40   // a main lookup table, an indirect language/probability table, and
     41   // three constants.
     42   // The main table key is a quadgram, bigram, or longword hash, with
     43   // part of the key used to select a bucket modulo kCLDTableSize,
     44   // and the rest matched against the key portion of four entries in a bucket,
     45   // defined by kCLDTableKeyMask. The remaining bits of an entry, defined
     46   // by ~kCLDTableKeyMask, are usually a subscript in the indirect table.
     47   //
     48   // By using part of the key to select a bucket, those key bits do not need
     49   // to be stored in the main table entries, saving space (typically 2 bytes).
     50   //
     51   // By using an indirect table for lang/prob triples, only the subscript needs
     52   // to be stored in the main table entries, saving space (typically 2 bytes).
     53   //
     54   // Each entry in the indirect table has three languages and three
     55   // corresponding probabilities, packed into four bytes.
     56   //
     57   // The build date constant is included just for version tracking and is not
     58   // otherwise used.
     59   //
     60   // Different-size tables can be linked in for different production
     61   // environments. By going indirect through this struct, the runtime code is
     62   // insensitive to the actual sizes.
     63   //
     64   // An empty placeholder table can be described by a table size of 1
     65   // bucket, a keymask of 0xffffffff, a degenerate bucket of four no-match
     66   // entries, and a degenerate indirect table of one no-languages entry.
     67   //
     68   //
     69   struct CLDTableSummary {  // Describes one complete CLD table; all members are const
     70     const IndirectProbBucket4* kCLDTable;
     71                                         // Main lookup table. Each bucket has four
     72                                         //  entries, part key and part indirect subscript
     73     const uint32* kCLDTableInd;         // Indirect table; each entry is three packed lang/prob
     74     const int kCLDTableSize;            // Bucket count of kCLDTable
     75     const int kCLDTableIndSize;         // Entries count of kCLDTableInd
     76     const int kCLDTableKeyMask;         // Mask hash key. NOTE(review): signed int, yet a keymask of 0xffffffff is mentioned above -- confirm intended type
     77     const int kCLDTableBuildDate;       // yyyymmdd, for version tracking only
     78   };
     79 
     80 
     81   // Keeps per-character 0-12 language probabilities for CTJKVZ-- in that order:
     82   // Chinese ChineseT Japanese Korean Vietnamese Zhuang (probs[0] through probs[5])
     83   // (2 bytes unused, probs[6] and probs[7], for alignment padding and future)
     84   typedef struct {
     85     uint8 probs[8];
     86   } UnigramProbArray;
     87 
     88   // Map 8-bit subscript to CTJKVZ probabilities (one UnigramProbArray per subscript)
     89   // Target runtime probabilities for CTJK + VZ
     90   // Hand-generated to cover a reasonable range of choices; blank lines appear to group entries by their largest value
     91   static const int kTargetCTJKVZProbsSize = 242;  // NOTE(review): only 239 initializers are listed below, so the last 3 entries are implicitly all zero
     92   static const UnigramProbArray kTargetCTJKVZProbs[kTargetCTJKVZProbsSize] = {
     93     {{0,0,0,0,0,0,0,0}},
     94     {{0,0,0,0,0,12,0,0}},
     95     {{0,0,0,0,12,0,0,0}},
     96     {{0,0,0,12,0,0,0,0}},
     97     {{0,0,12,0,0,0,0,0}},
     98     {{0,12,0,0,0,0,0,0}},
     99     {{12,0,0,0,0,0,0,0}},
    100 
    101     {{8,0,0,0,4,0,0,0}},
    102     {{8,0,0,4,0,0,0,0}},
    103     {{8,0,4,0,0,0,0,0}},
    104     {{8,4,0,0,0,0,0,0}},
    105     {{8,2,0,2,0,0,0,0}},
    106     {{0,0,0,0,0,8,0,0}},
    107     {{0,4,8,0,0,0,0,0}},
    108     {{4,0,0,0,0,8,0,0}},
    109     {{0,0,8,0,0,0,0,0}},
    110     {{8,2,2,0,0,0,0,0}},
    111     {{0,8,4,0,0,0,0,0}},
    112     {{8,0,0,0,0,4,0,0}},
    113     {{0,8,2,0,0,0,0,0}},
    114     {{4,8,0,0,0,0,0,0}},
    115     {{2,8,0,2,0,0,0,0}},
    116     {{2,2,8,0,0,0,0,0}},
    117     {{0,8,0,0,0,0,0,0}},
    118     {{0,2,8,0,0,0,0,0}},
    119     {{2,8,2,0,0,0,0,0}},
    120     {{8,0,0,0,0,0,0,0}},
    121     {{2,8,0,0,0,0,0,0}},
    122     {{8,2,0,0,0,0,0,0}},
    123 
    124     {{0,6,2,0,2,0,0,0}},
    125     {{2,0,0,0,6,0,0,0}},
    126     {{4,0,0,0,6,0,0,0}},
    127     {{4,6,0,0,4,0,0,0}},
    128     {{4,6,2,0,2,0,0,0}},
    129     {{4,6,4,0,2,0,0,0}},
    130     {{5,4,6,0,0,0,0,0}},
    131     {{6,0,0,0,4,0,0,0}},
    132     {{6,0,2,0,4,0,0,0}},
    133     {{6,0,4,0,4,0,0,0}},
    134     {{6,2,0,0,4,0,0,0}},
    135     {{6,2,2,0,4,0,0,0}},
    136     {{6,2,4,0,2,0,0,0}},
    137     {{6,4,0,0,2,0,0,0}},
    138     {{6,4,2,0,2,0,0,0}},
    139     {{0,0,6,2,0,0,0,0}},
    140     {{0,6,2,0,0,2,0,0}},
    141     {{2,2,2,0,0,6,0,0}},
    142     {{2,2,6,4,0,0,0,0}},
    143     {{2,4,0,0,0,6,0,0}},
    144     {{2,6,0,4,0,0,0,0}},
    145     {{2,6,2,4,0,0,0,0}},
    146     {{2,6,4,4,0,0,0,0}},
    147     {{4,0,2,0,0,6,0,0}},
    148     {{4,2,6,2,0,0,0,0}},
    149     {{4,4,2,0,0,6,0,0}},
    150     {{4,6,4,0,0,2,0,0}},
    151     {{6,0,2,0,0,2,0,0}},
    152     {{6,2,0,0,0,2,0,0}},
    153     {{6,2,2,0,0,4,0,0}},
    154     {{6,2,4,0,0,2,0,0}},
    155     {{4,6,2,0,0,4,0,0}},
    156     {{6,4,2,0,0,4,0,0}},
    157     {{2,0,0,0,0,6,0,0}},
    158     {{6,2,0,2,0,0,0,0}},
    159     {{2,2,0,0,0,6,0,0}},
    160     {{6,2,6,0,0,0,0,0}},
    161     {{6,4,2,0,0,2,0,0}},
    162     {{6,4,2,2,0,0,0,0}},
    163     {{4,6,4,2,0,0,0,0}},
    164     {{6,0,2,0,0,4,0,0}},
    165     {{6,0,4,0,0,2,0,0}},
    166     {{6,0,6,0,0,0,0,0}},
    167     {{6,2,2,0,0,0,0,0}},
    168     {{6,4,0,0,0,2,0,0}},
    169     {{6,4,5,0,0,0,0,0}},
    170     {{0,6,0,2,0,0,0,0}},
    171     {{0,6,2,2,0,0,0,0}},
    172     {{2,6,0,2,0,0,0,0}},
    173     {{2,6,2,2,0,0,0,0}},
    174     {{4,2,0,0,0,6,0,0}},
    175     {{6,4,0,0,0,4,0,0}},
    176     {{6,4,0,2,0,0,0,0}},
    177     {{6,6,0,2,0,0,0,0}},
    178     {{6,0,4,0,0,4,0,0}},
    179     {{6,2,0,0,0,4,0,0}},
    180     {{6,6,2,2,0,0,0,0}},
    181     {{4,6,0,0,0,2,0,0}},
    182     {{2,6,6,0,0,0,0,0}},
    183     {{4,5,6,0,0,0,0,0}},
    184     {{4,6,0,2,0,0,0,0}},
    185     {{6,2,0,0,0,6,0,0}},
    186     {{0,6,4,2,0,0,0,0}},
    187     {{4,0,6,0,0,0,0,0}},
    188     {{2,6,4,2,0,0,0,0}},
    189     {{4,6,0,0,0,4,0,0}},
    190     {{6,2,2,0,0,0,0,0}},
    191     {{4,6,2,2,0,0,0,0}},
    192     {{4,6,5,0,0,0,0,0}},
    193     {{6,0,2,0,0,0,0,0}},
    194     {{6,4,4,0,0,0,0,0}},
    195     {{4,2,6,0,0,0,0,0}},
    196     {{2,0,6,0,0,0,0,0}},
    197     {{4,4,0,0,0,6,0,0}},
    198     {{4,4,6,0,0,0,0,0}},
    199     {{4,6,2,0,0,2,0,0}},
    200     {{2,2,6,0,0,0,0,0}},
    201     {{2,4,6,0,0,0,0,0}},
    202     {{0,6,6,0,0,0,0,0}},
    203     {{6,2,4,0,0,0,0,0}},
    204     {{0,4,6,0,0,0,0,0}},
    205     {{4,0,0,0,0,6,0,0}},
    206     {{4,6,4,0,0,0,0,0}},
    207     {{6,0,0,0,0,6,0,0}},
    208     {{6,0,0,0,0,2,0,0}},
    209     {{6,0,4,0,0,0,0,0}},
    210     {{6,5,4,0,0,0,0,0}},
    211     {{0,2,6,0,0,0,0,0}},
    212     {{0,0,6,0,0,0,0,0}},
    213     {{6,6,2,0,0,0,0,0}},
    214     {{2,6,4,0,0,0,0,0}},
    215     {{6,4,2,0,0,0,0,0}},
    216     {{2,6,2,0,0,0,0,0}},
    217     {{2,6,0,0,0,0,0,0}},
    218     {{6,0,0,0,0,4,0,0}},
    219     {{6,4,0,0,0,0,0,0}},
    220     {{6,6,0,0,0,0,0,0}},
    221     {{5,6,4,0,0,0,0,0}},
    222     {{0,6,0,0,0,0,0,0}},
    223     {{6,2,0,0,0,0,0,0}},
    224     {{0,6,2,0,0,0,0,0}},
    225     {{4,6,2,0,0,0,0,0}},
    226     {{0,6,4,0,0,0,0,0}},
    227     {{4,6,0,0,0,0,0,0}},
    228     {{6,0,0,0,0,0,0,0}},
    229     {{6,6,5,0,0,0,0,0}},
    230     {{6,5,6,0,0,0,0,0}},
    231     {{5,6,6,0,0,0,0,0}},
    232     {{5,5,6,0,0,0,0,0}},
    233     {{5,6,5,0,0,0,0,0}},
    234     {{6,5,5,0,0,0,0,0}},
    235     {{6,6,6,0,0,0,0,0}},
    236     {{6,5,0,0,0,0,0,0}},
    237     {{6,0,5,0,0,0,0,0}},
    238     {{0,6,5,0,0,0,0,0}},
    239     {{5,6,0,0,0,0,0,0}},
    240     {{5,0,6,0,0,0,0,0}},
    241     {{0,5,6,0,0,0,0,0}},
    242 
    243     {{0,0,0,0,4,0,0,0}},
    244     {{0,0,0,4,0,0,0,0}},
    245     {{2,2,0,0,4,0,0,0}},
    246     {{2,2,2,0,4,0,0,0}},
    247     {{2,4,0,0,2,0,0,0}},
    248     {{2,4,2,0,2,0,0,0}},
    249     {{2,4,4,0,2,0,0,0}},
    250     {{4,0,2,0,4,0,0,0}},
    251     {{4,0,4,0,2,0,0,0}},
    252     {{4,2,0,0,2,0,0,0}},
    253     {{4,2,2,0,2,0,0,0}},
    254     {{4,4,0,0,2,0,0,0}},
    255     {{4,4,2,0,2,0,0,0}},
    256     {{4,4,4,0,2,0,0,0}},
    257     {{0,2,2,4,0,0,0,0}},
    258     {{2,2,4,2,0,0,0,0}},
    259     {{2,4,4,0,0,2,0,0}},
    260     {{2,4,4,2,0,0,0,0}},
    261     {{4,0,4,0,0,2,0,0}},
    262     {{4,0,4,0,0,4,0,0}},
    263     {{4,2,2,4,0,0,0,0}},
    264     {{4,4,0,2,0,0,0,0}},
    265     {{2,2,0,4,0,0,0,0}},
    266     {{2,4,2,2,0,0,0,0}},
    267     {{4,4,2,2,0,0,0,0}},
    268     {{4,0,4,0,0,0,0,0}},
    269     {{4,4,4,0,0,4,0,0}},
    270     {{0,4,0,2,0,0,0,0}},
    271     {{0,4,2,2,0,0,0,0}},
    272     {{4,0,2,0,0,2,0,0}},
    273     {{4,2,0,0,0,4,0,0}},
    274     {{2,2,2,0,0,4,0,0}},
    275     {{4,0,0,2,0,0,0,0}},
    276     {{4,4,4,0,0,2,0,0}},
    277     {{4,0,0,0,0,4,0,0}},
    278     {{4,0,2,0,0,4,0,0}},
    279     {{4,2,0,0,0,2,0,0}},
    280     {{4,2,2,0,0,2,0,0}},
    281     {{2,4,0,2,0,0,0,0}},
    282     {{2,2,0,0,0,4,0,0}},
    283     {{2,4,0,0,0,4,0,0}},
    284     {{2,4,2,0,0,4,0,0}},
    285     {{4,2,4,0,0,0,0,0}},
    286     {{2,0,4,0,0,0,0,0}},
    287     {{4,0,2,0,0,0,0,0}},
    288     {{4,4,0,0,0,4,0,0}},
    289     {{4,4,2,0,0,4,0,0}},
    290     {{0,4,4,0,0,0,0,0}},
    291     {{4,4,0,0,0,2,0,0}},
    292     {{2,4,0,0,0,2,0,0}},
    293     {{2,2,4,0,0,0,0,0}},
    294     {{0,2,4,0,0,0,0,0}},
    295     {{4,2,2,0,0,0,0,0}},
    296     {{2,4,2,0,0,2,0,0}},
    297     {{4,4,4,0,0,0,0,0}},
    298     {{2,4,4,0,0,0,0,0}},
    299     {{0,0,4,0,0,0,0,0}},
    300     {{0,4,2,0,0,0,0,0}},
    301     {{4,4,2,0,0,2,0,0}},
    302     {{2,4,2,0,0,0,0,0}},
    303     {{4,2,0,0,0,0,0,0}},
    304     {{4,4,0,0,0,0,0,0}},
    305     {{4,4,2,0,0,0,0,0}},
    306     {{2,4,0,0,0,0,0,0}},
    307     {{0,4,0,0,0,0,0,0}},
    308     {{4,0,0,0,0,0,0,0}},
    309     {{0,0,0,4,4,0,0,0}},
    310     {{0,0,4,0,4,0,0,0}},
    311     {{0,0,4,4,0,0,0,0}},
    312     {{0,4,0,0,4,0,0,0}},
    313     {{0,4,0,4,0,0,0,0}},
    314     {{4,0,0,0,4,0,0,0}},
    315     {{4,0,0,4,0,0,0,0}},
    316 
    317     {{2,0,0,0,0,0,0,0}},
    318     {{0,2,0,0,0,0,0,0}},
    319     {{0,2,0,2,2,0,0,0}},
    320     {{0,2,2,0,2,0,0,0}},
    321     {{2,0,0,2,2,0,0,0}},
    322     {{2,0,2,0,2,0,0,0}},
    323     {{2,0,2,2,0,0,0,0}},
    324     {{2,2,0,0,2,0,0,0}},
    325     {{2,2,2,2,0,0,0,0}},
    326     {{2,2,0,2,0,0,0,0}},
    327     {{2,2,0,0,0,0,0,0}},
    328     {{0,0,2,0,0,0,0,0}},
    329     {{0,2,2,0,0,0,0,0}},
    330     {{2,2,2,0,0,0,0,0}},
    331     {{0,0,0,2,0,0,0,0}},
    332     {{2,0,2,0,0,0,0,0}},
    333     {{0,2,0,2,0,0,0,0}},
    334     {{0,0,2,2,0,0,0,0}},
    335     {{0,2,2,2,0,0,0,0}},
    336   };
    337 
    338 
    339 
    340 
    341   // Byte-indexed table, 32 entries per row: 1 to skip ASCII space (0x20), vowels AEIOU aeiou and UTF-8 continuation bytes 80-BF
    342   static const uint8 kSkipSpaceVowelContinue[256] = {
    343     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    344     1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    345     0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
    346     0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,
    347 
    348     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    349     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    350     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    351     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    352   };
    353 
    354   // Byte-indexed table, 32 entries per row: 1 to skip ASCII space (0x20), and UTF-8 continuation bytes 80-BF
    355   static const uint8 kSkipSpaceContinue[256] = {
    356     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    357     1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    358     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    359     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    360 
    361     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    362     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    363     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    364     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    365   };
    366 
    367 
    368   // Subscripted by lscript number (one entry per ULScript_* value, in the order of the trailing comments). If != UNKNOWN, use nilgrams to determine language of this script
    369   static const Language kOnlyLanguagePerLScript[] = {
    370     ENGLISH,            // ULScript_Common, [no words should be in this script]
    371     UNKNOWN_LANGUAGE,   // ULScript_Latin,
    372     //UNKNOWN_LANGUAGE,   // ULScript_Greek,  Jan 2009: change so we can score quads
    373     GREEK,              // ULScript_Greek,  Mar 2009: change back; do gibberish separately
    374     UNKNOWN_LANGUAGE,   // ULScript_Cyrillic,
    375     ARMENIAN,           // ULScript_Armenian,
    376     UNKNOWN_LANGUAGE,   // ULScript_Hebrew,
    377     UNKNOWN_LANGUAGE,   // ULScript_Arabic,
    378     SYRIAC,             // ULScript_Syriac,
    379     DHIVEHI,            // ULScript_Thaana,
    380     UNKNOWN_LANGUAGE,   // ULScript_Devanagari,
    381     UNKNOWN_LANGUAGE,   // ULScript_Bengali,
    382     PUNJABI,            // ULScript_Gurmukhi,
    383     GUJARATI,           // ULScript_Gujarati,
    384     ORIYA,              // ULScript_Oriya,
    385     TAMIL,              // ULScript_Tamil,
    386     TELUGU,             // ULScript_Telugu,
    387     KANNADA,            // ULScript_Kannada,
    388     MALAYALAM,          // ULScript_Malayalam,
    389     SINHALESE,          // ULScript_Sinhala,
    390     THAI,               // ULScript_Thai,
    391     LAOTHIAN,           // ULScript_Lao,
    392     UNKNOWN_LANGUAGE,   // ULScript_Tibetan,
    393     BURMESE,            // ULScript_Myanmar,
    394     GEORGIAN,           // ULScript_Georgian,
    395     UNKNOWN_LANGUAGE,   // ULScript_HanCJK,
    396     UNKNOWN_LANGUAGE,   // ULScript_Ethiopic,
    397     CHEROKEE,           // ULScript_Cherokee,
    398     INUKTITUT,          // ULScript_Canadian_Aboriginal,
    399     X_OGHAM,            // ULScript_Ogham,
    400     X_RUNIC,            // ULScript_Runic,
    401     KHMER,              // ULScript_Khmer,
    402     MONGOLIAN,          // ULScript_Mongolian,
    403     X_YI,               // ULScript_Yi,
    404     X_OLD_ITALIC,       // ULScript_Old_Italic,
    405     X_GOTHIC,           // ULScript_Gothic,
    406     X_DESERET,          // ULScript_Deseret,
    407     ENGLISH,            // ULScript_Inherited, [no words should be in this script]
    408     TAGALOG,            // ULScript_Tagalog,
    409     X_HANUNOO,          // ULScript_Hanunoo,
    410     X_BUHID,            // ULScript_Buhid,
    411     X_TAGBANWA,         // ULScript_Tagbanwa,
    412     LIMBU,              // ULScript_Limbu,
    413     X_TAI_LE,           // ULScript_Tai_Le,
    414     X_LINEAR_B,         // ULScript_Linear_B,
    415     X_UGARITIC,         // ULScript_Ugaritic,
    416     X_SHAVIAN,          // ULScript_Shavian,
    417     X_OSMANYA,          // ULScript_Osmanya,
    418     X_CYPRIOT,          // ULScript_Cypriot,
    419     X_BUGINESE,         // ULScript_Buginese,
    420     X_COPTIC,           // ULScript_Coptic,
    421     X_NEW_TAI_LUE,      // ULScript_New_Tai_Lue,
    422     X_GLAGOLITIC,       // ULScript_Glagolitic,
    423     X_TIFINAGH,         // ULScript_Tifinagh,
    424     X_SYLOTI_NAGRI,     // ULScript_Syloti_Nagri,
    425     X_OLD_PERSIAN,      // ULScript_Old_Persian,
    426     X_KHAROSHTHI,       // ULScript_Kharoshthi,
    427     X_BALINESE,         // ULScript_Balinese,
    428     X_CUNEIFORM,        // ULScript_Cuneiform,
    429     X_PHOENICIAN,       // ULScript_Phoenician,
    430     X_PHAGS_PA,         // ULScript_Phags_Pa,
    431     X_NKO,              // ULScript_Nko,
    432 
    433     // Unicode 5.1
    434     X_SUDANESE,         // ULScript_Sundanese, NOTE(review): enumerator spelling differs from the script name; it must match the Language enum declared elsewhere
    435     X_LEPCHA,           // ULScript_Lepcha,
    436     X_OL_CHIKI,         // ULScript_Ol_Chiki,
    437     X_VAI,              // ULScript_Vai,
    438     X_SAURASHTRA,       // ULScript_Saurashtra,
    439     X_KAYAH_LI,         // ULScript_Kayah_Li,
    440     X_REJANG,           // ULScript_Rejang,
    441     X_LYCIAN,           // ULScript_Lycian,
    442     X_CARIAN,           // ULScript_Carian,
    443     X_LYDIAN,           // ULScript_Lydian,
    444     X_CHAM,             // ULScript_Cham,
    445   };
    446 
    447   COMPILE_ASSERT(arraysize(kOnlyLanguagePerLScript) == ULScript_NUM_SCRIPTS,
    448        kOnlyLanguagePerLScript_has_incorrect_length);
    449 
    450 
    451   // This is, in a sense, the complement of the table above: a script has an entry here only where that table has UNKNOWN_LANGUAGE.
    452   // Subscripted by lscript number. If != UNKNOWN, determines a default language of this script
    453   static const Language kDefaultLanguagePerLScript[] = {
    454     UNKNOWN_LANGUAGE,            // ULScript_Common, [no words should be in this script]
    455     ENGLISH,   // ULScript_Latin,
    456     UNKNOWN_LANGUAGE,              // ULScript_Greek,
    457     RUSSIAN,   // ULScript_Cyrillic,
    458     UNKNOWN_LANGUAGE,           // ULScript_Armenian,
    459     HEBREW,   // ULScript_Hebrew,
    460     ARABIC,   // ULScript_Arabic,
    461     UNKNOWN_LANGUAGE,             // ULScript_Syriac,
    462     UNKNOWN_LANGUAGE,            // ULScript_Thaana,
    463     HINDI,   // ULScript_Devanagari,
    464     BENGALI,   // ULScript_Bengali,
    465     UNKNOWN_LANGUAGE,            // ULScript_Gurmukhi,
    466     UNKNOWN_LANGUAGE,           // ULScript_Gujarati,
    467     UNKNOWN_LANGUAGE,              // ULScript_Oriya,
    468     UNKNOWN_LANGUAGE,              // ULScript_Tamil,
    469     UNKNOWN_LANGUAGE,             // ULScript_Telugu,
    470     UNKNOWN_LANGUAGE,            // ULScript_Kannada,
    471     UNKNOWN_LANGUAGE,          // ULScript_Malayalam,
    472     UNKNOWN_LANGUAGE,          // ULScript_Sinhala,
    473     UNKNOWN_LANGUAGE,               // ULScript_Thai,
    474     UNKNOWN_LANGUAGE,           // ULScript_Lao,
    475     TIBETAN,   // ULScript_Tibetan,
    476     UNKNOWN_LANGUAGE,            // ULScript_Myanmar,
    477     UNKNOWN_LANGUAGE,           // ULScript_Georgian,
    478     CHINESE,   // ULScript_HanCJK,
    479     AMHARIC,   // ULScript_Ethiopic,
    480     UNKNOWN_LANGUAGE,           // ULScript_Cherokee,
    481     UNKNOWN_LANGUAGE,          // ULScript_Canadian_Aboriginal,
    482     UNKNOWN_LANGUAGE,            // ULScript_Ogham,
    483     UNKNOWN_LANGUAGE,            // ULScript_Runic,
    484     UNKNOWN_LANGUAGE,              // ULScript_Khmer,
    485     UNKNOWN_LANGUAGE,          // ULScript_Mongolian,
    486     UNKNOWN_LANGUAGE,               // ULScript_Yi,
    487     UNKNOWN_LANGUAGE,       // ULScript_Old_Italic,
    488     UNKNOWN_LANGUAGE,           // ULScript_Gothic,
    489     UNKNOWN_LANGUAGE,          // ULScript_Deseret,
    490     UNKNOWN_LANGUAGE,            // ULScript_Inherited, [no words should be in this script]
    491     UNKNOWN_LANGUAGE,            // ULScript_Tagalog,
    492     UNKNOWN_LANGUAGE,          // ULScript_Hanunoo,
    493     UNKNOWN_LANGUAGE,            // ULScript_Buhid,
    494     UNKNOWN_LANGUAGE,         // ULScript_Tagbanwa,
    495     UNKNOWN_LANGUAGE,              // ULScript_Limbu,
    496     UNKNOWN_LANGUAGE,           // ULScript_Tai_Le,
    497     UNKNOWN_LANGUAGE,         // ULScript_Linear_B,
    498     UNKNOWN_LANGUAGE,         // ULScript_Ugaritic,
    499     UNKNOWN_LANGUAGE,          // ULScript_Shavian,
    500     UNKNOWN_LANGUAGE,          // ULScript_Osmanya,
    501     UNKNOWN_LANGUAGE,          // ULScript_Cypriot,
    502     UNKNOWN_LANGUAGE,         // ULScript_Buginese,
    503     UNKNOWN_LANGUAGE,           // ULScript_Coptic,
    504     UNKNOWN_LANGUAGE,      // ULScript_New_Tai_Lue,
    505     UNKNOWN_LANGUAGE,       // ULScript_Glagolitic,
    506     UNKNOWN_LANGUAGE,         // ULScript_Tifinagh,
    507     UNKNOWN_LANGUAGE,     // ULScript_Syloti_Nagri,
    508     UNKNOWN_LANGUAGE,      // ULScript_Old_Persian,
    509     UNKNOWN_LANGUAGE,       // ULScript_Kharoshthi,
    510     UNKNOWN_LANGUAGE,         // ULScript_Balinese,
    511     UNKNOWN_LANGUAGE,        // ULScript_Cuneiform,
    512     UNKNOWN_LANGUAGE,       // ULScript_Phoenician,
    513     UNKNOWN_LANGUAGE,         // ULScript_Phags_Pa,
    514     UNKNOWN_LANGUAGE,              // ULScript_Nko,
    515 
    516     // Unicode 5.1
    517     UNKNOWN_LANGUAGE,         // ULScript_Sundanese,
    518     UNKNOWN_LANGUAGE,           // ULScript_Lepcha,
    519     UNKNOWN_LANGUAGE,         // ULScript_Ol_Chiki,
    520     UNKNOWN_LANGUAGE,              // ULScript_Vai,
    521     UNKNOWN_LANGUAGE,       // ULScript_Saurashtra,
    522     UNKNOWN_LANGUAGE,         // ULScript_Kayah_Li,
    523     UNKNOWN_LANGUAGE,           // ULScript_Rejang,
    524     UNKNOWN_LANGUAGE,           // ULScript_Lycian,
    525     UNKNOWN_LANGUAGE,           // ULScript_Carian,
    526     UNKNOWN_LANGUAGE,           // ULScript_Lydian,
    527     UNKNOWN_LANGUAGE,             // ULScript_Cham,
    528   };
    529 
    530   COMPILE_ASSERT(arraysize(kDefaultLanguagePerLScript) == ULScript_NUM_SCRIPTS,
    531        kDefaultLanguagePerLScript_has_incorrect_length);
    532 
    533 
    534   // True (1) for standalone languages (only lang in a script)
    535   // Subscripted by packed language number; 210 initializers are listed (presumably EXT_NUM_LANGUAGES + 1 -- confirm)
    536   // If 1, we will use nilgrams to determine language
    537   static const uint8 kIsStandaloneLang[EXT_NUM_LANGUAGES + 1] = {
    538      0,
    539      0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,1,0,    // GREEK
    540      0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
    541      0,1,0,0,1, 0,1,0,0,0, 0,0,1,1,0, 0,0,0,0,1,    // MALAYALAM..KANNADA
    542      1,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 1,0,0,0,1,    // PUNJABI..SINHALESE
    543      0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,1,1,0,    // ARMENIAN..LAOTHIAN
    544 
    545      0,0,0,0,1, 0,1,1,1,0, 1,0,0,0,0, 0,0,0,0,0,    // KHMER..ORIYA
    546      0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
    547      0,1,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,    // INUKTITUT
    548 
    549      0,0,0,0,0,                                     // [160..164]
    550     // Add new language standalone bit just before here
    551      0,0,0,0,0, 1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1,
    552      1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1,
    553 
    554      1,1,1,1,
    555    };
    556 
    557    // True (1) for languages of ULScript_HanCJK; the row comments name the languages flagged in that row
    558    // (Vietnamese and Zhuang also have Latin script quadgrams)
    559    // Subscripted by packed language number
    560    static const uint8 kIsUnigramLang[EXT_NUM_LANGUAGES + 1] = {
    561       0,
    562       0,0,0,0,0, 0,0,0,1,1, 0,0,0,0,0, 0,1,0,0,0,    // JAPANESE KOREAN CHINESE
    563       0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,    //
    564       0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,    //
    565       0,0,0,0,0, 0,1,0,0,1, 0,0,0,0,0, 0,0,0,0,0,    // VIETNAMESE CHINESE_T
    566       0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,    //
    567 
    568       0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,    //
    569       0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,    //
    570       0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 1,0,0,0,0,    // ZHUANG
    571 
    572       0,0,0,0,0,                                     // [160..164]
    573      // Add new language unigram bit just before here
    574 
    575       0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,    //
    576       0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,    //
    577 
    578       0,0,0,0,
    579    };
    580 
    581 
    582   // True (1) for ULScript_HanCJK only: the single 1 is at subscript 24, the position of ULScript_HanCJK in the per-script tables above
    583   // Subscripted by lscript number
    584   static const uint8 kScoreUniPerLScript[] = {
    585     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1,0,0,0,0,0,0,0,
    586     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    587     0,0,0,0,0,0,0,0,
    588   };
    589 
    590   COMPILE_ASSERT(arraysize(kScoreUniPerLScript) == ULScript_NUM_SCRIPTS,
    591        kScoreUniPerLScript_has_incorrect_length);
    592 
    593 
    594   // Defines Top40 packed languages
    595 
    596   // Tier 0/1 Language enum list (16)
    597   //   ENGLISH, /*no en_GB,*/ FRENCH, ITALIAN, GERMAN, SPANISH,    // E - FIGS
    598   //   DUTCH, CHINESE, CHINESE_T, JAPANESE, KOREAN,
    599   //   PORTUGUESE, RUSSIAN, POLISH, TURKISH, THAI,
    600   //   ARABIC,
    601   //
    602   // Tier 2 Language enum list (22)
    603   //   SWEDISH, FINNISH, DANISH, /*no pt-PT,*/ ROMANIAN, HUNGARIAN,
    604   //   HEBREW, INDONESIAN, CZECH, GREEK, NORWEGIAN,
    605   //   VIETNAMESE, BULGARIAN, CROATIAN, LITHUANIAN, SLOVAK,
    606   //   TAGALOG, SLOVENIAN, SERBIAN, CATALAN, LATVIAN,
    607   //   UKRAINIAN, HINDI,
    608   //
    609   //   use SERBO_CROATIAN instead of BOSNIAN, SERBIAN, CROATIAN, MONTENEGRIN(21)
    610   //
    611   // Include IgnoreMe (TG_UNKNOWN_LANGUAGE, 25+1) as a top 40
    612 
    613   // NOTE: packed, i.e. Language enum + 1, so subscript 0 corresponds to no Language enum value; 1 marks a Top40 language
    614   static const uint8 kIsPackedTop40[EXT_NUM_LANGUAGES + 1] = {
    615     0,
    616     1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,1, 1,1,1,1,0,
    617     1,1,1,1,0, 1,0,1,0,0, 0,0,1,1,1, 1,0,0,1,0,
    618     0,0,0,0,0, 0,0,0,0,0, 0,0,0,1,1, 1,0,0,0,0,
    619     0,0,0,1,0, 0,1,0,1,1, 0,0,0,0,0, 0,0,0,0,0,
    620     0,0,0,0,0, 0,0,0,0,0, 0,0,1,0,0, 0,0,0,0,0,
    621 
    622     0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
    623     0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
    624     0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
    625 
    626     0,0,0,0,0,                                    // [160..164]
    627     // Add new language top40 bit just before here
    628 
    629     0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
    630     0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0, 0,0,0,0,0,
    631 
    632     0,0,0,0,
    633   };
    634 
    635 
    636 
    637   // Table has 234 eight-byte entries. Each entry has a five-byte array and
    638   // a three-byte array of log base 2 probabilities in the range 0..11.
    639   // The intended use is to express five or three probabilities in a single-byte
    640   // subscript, then decode via this table. These probabilities are
    641   // intended to go with an array of five or three language numbers.
    642   //
    643   // The corresponding language numbers will have to be sorted by descending
    644   // probability, then the actual probability subscript chosen to match the
    645   // closest available entry in this table.
    646   //
    647   // Pattern of probability values:
    648   // hi 3/4 1/2 1/4 lo    hi mid lo
    649   // where "3/4" is (hi*3+lo)/4, "1/2" is (hi+lo)/2, and "1/4" is (hi+lo*3)/4 and
    650   // mid is one of 3/4 1/2 or 1/4.
    651   // There are three groups of 78 (=12*13/2) entries, with hi running 0..11 and
    652   // lo running 0..hi. Only the first group is used for five-entry lookups.
    653   // The mid value in the first group is 1/2, the second group 3/4, and the
    654   // third group 1/4. For three-entry lookups, this allows the mid entry to be
    655   // somewhat higher or lower than the midpoint, to allow a better match to the
    656   // original probabilities.
    657   static const int kLgProbV2TblSize = 234;
    658   static const uint8 kLgProbV2Tbl[kLgProbV2TblSize * 8] = {
    659     1,1,1,1,1, 1,1,1,     // [0]
    660     2,2,2,1,1, 2,2,1,     // [1]
    661     2,2,2,2,2, 2,2,2,
    662     3,3,2,2,1, 3,2,1,     // [3]
    663     3,3,3,2,2, 3,3,2,
    664     3,3,3,3,3, 3,3,3,
    665     4,3,3,2,1, 4,3,1,     // [6]
    666     4,4,3,3,2, 4,3,2,
    667     4,4,4,3,3, 4,4,3,
    668     4,4,4,4,4, 4,4,4,
    669     5,4,3,2,1, 5,3,1,     // [10]
    670     5,4,4,3,2, 5,4,2,
    671     5,5,4,4,3, 5,4,3,
    672     5,5,5,4,4, 5,5,4,
    673     5,5,5,5,5, 5,5,5,
    674     6,5,4,2,1, 6,4,1,     // [15]
    675     6,5,4,3,2, 6,4,2,
    676     6,5,5,4,3, 6,5,3,
    677     6,6,5,5,4, 6,5,4,
    678     6,6,6,5,5, 6,6,5,
    679     6,6,6,6,6, 6,6,6,
    680     7,6,4,3,1, 7,4,1,     // [21]
    681     7,6,5,3,2, 7,5,2,
    682     7,6,5,4,3, 7,5,3,
    683     7,6,6,5,4, 7,6,4,
    684     7,7,6,6,5, 7,6,5,
    685     7,7,7,6,6, 7,7,6,
    686     7,7,7,7,7, 7,7,7,
    687     8,6,5,3,1, 8,5,1,     // [28]
    688     8,7,5,4,2, 8,5,2,
    689     8,7,6,4,3, 8,6,3,
    690     8,7,6,5,4, 8,6,4,
    691     8,7,7,6,5, 8,7,5,
    692     8,8,7,7,6, 8,7,6,
    693     8,8,8,7,7, 8,8,7,
    694     8,8,8,8,8, 8,8,8,
    695     9,7,5,3,1, 9,5,1,     // [36]
    696     9,7,6,4,2, 9,6,2,
    697     9,8,6,5,3, 9,6,3,
    698     9,8,7,5,4, 9,7,4,
    699     9,8,7,6,5, 9,7,5,
    700     9,8,8,7,6, 9,8,6,
    701     9,9,8,8,7, 9,8,7,
    702     9,9,9,8,8, 9,9,8,
    703     9,9,9,9,9, 9,9,9,
    704     10,8,6,3,1, 10,6,1,   // [45]
    705     10,8,6,4,2, 10,6,2,
    706     10,8,7,5,3, 10,7,3,
    707     10,9,7,6,4, 10,7,4,
    708     10,9,8,6,5, 10,8,5,
    709     10,9,8,7,6, 10,8,6,
    710     10,9,9,8,7, 10,9,7,
    711     10,10,9,9,8, 10,9,8,
    712     10,10,10,9,9, 10,10,9,
    713     10,10,10,10,10, 10,10,10,
    714     11,9,6,4,1, 11,6,1,   // [55]
    715     11,9,7,4,2, 11,7,2,
    716     11,9,7,5,3, 11,7,3,
    717     11,9,8,6,4, 11,8,4,
    718     11,10,8,7,5, 11,8,5,
    719     11,10,9,7,6, 11,9,6,
    720     11,10,9,8,7, 11,9,7,
    721     11,10,10,9,8, 11,10,8,
    722     11,11,10,10,9, 11,10,9,
    723     11,11,11,10,10, 11,11,10,
    724     11,11,11,11,11, 11,11,11,
    725     12,9,7,4,1, 12,7,1,   // [66]
    726     12,10,7,5,2, 12,7,2,
    727     12,10,8,5,3, 12,8,3,
    728     12,10,8,6,4, 12,8,4,
    729     12,10,9,7,5, 12,9,5,
    730     12,11,9,8,6, 12,9,6,
    731     12,11,10,8,7, 12,10,7,
    732     12,11,10,9,8, 12,10,8,
    733     12,11,11,10,9, 12,11,9,
    734     12,12,11,11,10, 12,11,10,
    735     12,12,12,11,11, 12,12,11,
    736     12,12,12,12,12, 12,12,12,
    737 
    738     1,1,1,1,1, 1,1,1,
    739     2,2,2,1,1, 2,2,1,
    740     2,2,2,2,2, 2,2,2,
    741     3,3,2,2,1, 3,3,1,
    742     3,3,3,2,2, 3,3,2,
    743     3,3,3,3,3, 3,3,3,
    744     4,3,3,2,1, 4,3,1,
    745     4,4,3,3,2, 4,4,2,
    746     4,4,4,3,3, 4,4,3,
    747     4,4,4,4,4, 4,4,4,
    748     5,4,3,2,1, 5,4,1,
    749     5,4,4,3,2, 5,4,2,
    750     5,5,4,4,3, 5,5,3,
    751     5,5,5,4,4, 5,5,4,
    752     5,5,5,5,5, 5,5,5,
    753     6,5,4,2,1, 6,5,1,
    754     6,5,4,3,2, 6,5,2,
    755     6,5,5,4,3, 6,5,3,
    756     6,6,5,5,4, 6,6,4,
    757     6,6,6,5,5, 6,6,5,
    758     6,6,6,6,6, 6,6,6,
    759     7,6,4,3,1, 7,6,1,
    760     7,6,5,3,2, 7,6,2,
    761     7,6,5,4,3, 7,6,3,
    762     7,6,6,5,4, 7,6,4,
    763     7,7,6,6,5, 7,7,5,
    764     7,7,7,6,6, 7,7,6,
    765     7,7,7,7,7, 7,7,7,
    766     8,6,5,3,1, 8,6,1,
    767     8,7,5,4,2, 8,7,2,
    768     8,7,6,4,3, 8,7,3,
    769     8,7,6,5,4, 8,7,4,
    770     8,7,7,6,5, 8,7,5,
    771     8,8,7,7,6, 8,8,6,
    772     8,8,8,7,7, 8,8,7,
    773     8,8,8,8,8, 8,8,8,
    774     9,7,5,3,1, 9,7,1,
    775     9,7,6,4,2, 9,7,2,
    776     9,8,6,5,3, 9,8,3,
    777     9,8,7,5,4, 9,8,4,
    778     9,8,7,6,5, 9,8,5,
    779     9,8,8,7,6, 9,8,6,
    780     9,9,8,8,7, 9,9,7,
    781     9,9,9,8,8, 9,9,8,
    782     9,9,9,9,9, 9,9,9,
    783     10,8,6,3,1, 10,8,1,
    784     10,8,6,4,2, 10,8,2,
    785     10,8,7,5,3, 10,8,3,
    786     10,9,7,6,4, 10,9,4,
    787     10,9,8,6,5, 10,9,5,
    788     10,9,8,7,6, 10,9,6,
    789     10,9,9,8,7, 10,9,7,
    790     10,10,9,9,8, 10,10,8,
    791     10,10,10,9,9, 10,10,9,
    792     10,10,10,10,10, 10,10,10,
    793     11,9,6,4,1, 11,9,1,
    794     11,9,7,4,2, 11,9,2,
    795     11,9,7,5,3, 11,9,3,
    796     11,9,8,6,4, 11,9,4,
    797     11,10,8,7,5, 11,10,5,
    798     11,10,9,7,6, 11,10,6,
    799     11,10,9,8,7, 11,10,7,
    800     11,10,10,9,8, 11,10,8,
    801     11,11,10,10,9, 11,11,9,
    802     11,11,11,10,10, 11,11,10,
    803     11,11,11,11,11, 11,11,11,
    804     12,9,7,4,1, 12,9,1,
    805     12,10,7,5,2, 12,10,2,
    806     12,10,8,5,3, 12,10,3,
    807     12,10,8,6,4, 12,10,4,
    808     12,10,9,7,5, 12,10,5,
    809     12,11,9,8,6, 12,11,6,
    810     12,11,10,8,7, 12,11,7,
    811     12,11,10,9,8, 12,11,8,
    812     12,11,11,10,9, 12,11,9,
    813     12,12,11,11,10, 12,12,10,
    814     12,12,12,11,11, 12,12,11,
    815     12,12,12,12,12, 12,12,12,
    816 
    817     1,1,1,1,1, 1,1,1,
    818     2,2,2,1,1, 2,1,1,
    819     2,2,2,2,2, 2,2,2,
    820     3,3,2,2,1, 3,2,1,
    821     3,3,3,2,2, 3,2,2,
    822     3,3,3,3,3, 3,3,3,
    823     4,3,3,2,1, 4,2,1,
    824     4,4,3,3,2, 4,3,2,
    825     4,4,4,3,3, 4,3,3,
    826     4,4,4,4,4, 4,4,4,
    827     5,4,3,2,1, 5,2,1,
    828     5,4,4,3,2, 5,3,2,
    829     5,5,4,4,3, 5,4,3,
    830     5,5,5,4,4, 5,4,4,
    831     5,5,5,5,5, 5,5,5,
    832     6,5,4,2,1, 6,2,1,
    833     6,5,4,3,2, 6,3,2,
    834     6,5,5,4,3, 6,4,3,
    835     6,6,5,5,4, 6,5,4,
    836     6,6,6,5,5, 6,5,5,
    837     6,6,6,6,6, 6,6,6,
    838     7,6,4,3,1, 7,3,1,
    839     7,6,5,3,2, 7,3,2,
    840     7,6,5,4,3, 7,4,3,
    841     7,6,6,5,4, 7,5,4,
    842     7,7,6,6,5, 7,6,5,
    843     7,7,7,6,6, 7,6,6,
    844     7,7,7,7,7, 7,7,7,
    845     8,6,5,3,1, 8,3,1,
    846     8,7,5,4,2, 8,4,2,
    847     8,7,6,4,3, 8,4,3,
    848     8,7,6,5,4, 8,5,4,
    849     8,7,7,6,5, 8,6,5,
    850     8,8,7,7,6, 8,7,6,
    851     8,8,8,7,7, 8,7,7,
    852     8,8,8,8,8, 8,8,8,
    853     9,7,5,3,1, 9,3,1,
    854     9,7,6,4,2, 9,4,2,
    855     9,8,6,5,3, 9,5,3,
    856     9,8,7,5,4, 9,5,4,
    857     9,8,7,6,5, 9,6,5,
    858     9,8,8,7,6, 9,7,6,
    859     9,9,8,8,7, 9,8,7,
    860     9,9,9,8,8, 9,8,8,
    861     9,9,9,9,9, 9,9,9,
    862     10,8,6,3,1, 10,3,1,
    863     10,8,6,4,2, 10,4,2,
    864     10,8,7,5,3, 10,5,3,
    865     10,9,7,6,4, 10,6,4,
    866     10,9,8,6,5, 10,6,5,
    867     10,9,8,7,6, 10,7,6,
    868     10,9,9,8,7, 10,8,7,
    869     10,10,9,9,8, 10,9,8,
    870     10,10,10,9,9, 10,9,9,
    871     10,10,10,10,10, 10,10,10,
    872     11,9,6,4,1, 11,4,1,
    873     11,9,7,4,2, 11,4,2,
    874     11,9,7,5,3, 11,5,3,
    875     11,9,8,6,4, 11,6,4,
    876     11,10,8,7,5, 11,7,5,
    877     11,10,9,7,6, 11,7,6,
    878     11,10,9,8,7, 11,8,7,
    879     11,10,10,9,8, 11,9,8,
    880     11,11,10,10,9, 11,10,9,
    881     11,11,11,10,10, 11,10,10,
    882     11,11,11,11,11, 11,11,11,
    883     12,9,7,4,1, 12,4,1,
    884     12,10,7,5,2, 12,5,2,
    885     12,10,8,5,3, 12,5,3,
    886     12,10,8,6,4, 12,6,4,
    887     12,10,9,7,5, 12,7,5,
    888     12,11,9,8,6, 12,8,6,
    889     12,11,10,8,7, 12,8,7,
    890     12,11,10,9,8, 12,9,8,
    891     12,11,11,10,9, 12,10,9,
    892     12,12,11,11,10, 12,11,10,
    893     12,12,12,11,11, 12,11,11,
    894     12,12,12,12,12, 12,12,12,
    895   };
    896 
    897   // Backmap a single desired probability into an entry in kLgProbV2Tbl
          // Subscripted by the desired probability p (0..12, hence 13 slots). For
          // p >= 1 the stored subscript is p*(p-1)/2, which lines up with the
          // "// [10]", "// [15]", ... markers on the rows of kLgProbV2Tbl that begin
          // with 5, 6, ...; p == 0 shares entry 0 with p == 1.
    898   static const uint8 kLgProbV2TblBackmap[13] = {
    899     0,                                                // p = 0
    900     0, 1, 3, 6,   10, 15, 21, 28,   36, 45, 55, 66,   // p = 1..12
    901   };
    902 
    903 
    904   // Always advances one UTF-8 character
          // 256 byte counts in byte-value order, 32 per row: 1 for 0x00-0xBF (ASCII
          // and the continuation-byte range 0x80-0xBF), 2 for 0xC0-0xDF, 3 for
          // 0xE0-0xEF and 4 for 0xF0-0xFF. No entry is 0.
          // NOTE(review): presumably subscripted by the byte at the current scan
          // position to get the number of bytes to step -- confirm against callers.
    905   static const uint8 kAdvanceOneChar[256] = {
    906     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   // 0x00-0x1F
    907     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   // 0x20-0x3F
    908     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   // 0x40-0x5F
    909     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   // 0x60-0x7F
    910
    911     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   // 0x80-0x9F
    912     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   // 0xA0-0xBF
    913     2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,   // 0xC0-0xDF
    914     3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4,   // 0xE0-0xFF
    915   };
    916 
    917   // Does not advance past space or cr/lf/nul
          // More precisely, the entry is 0 for every byte 0x00-0x20 (all control
          // bytes plus space), not only cr/lf/nul. Every other byte has the same
          // count as in kAdvanceOneChar: 1 for 0x21-0xBF, 2 for 0xC0-0xDF, 3 for
          // 0xE0-0xEF and 4 for 0xF0-0xFF.
    918   static const uint8 kAdvanceOneCharButSpace[256] = {
    919     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x00-0x1F
    920     0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   // 0x20-0x3F (0x20 = space)
    921     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   // 0x40-0x5F
    922     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   // 0x60-0x7F
    923
    924     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   // 0x80-0x9F
    925     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   // 0xA0-0xBF
    926     2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,   // 0xC0-0xDF
    927     3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4,   // 0xE0-0xFF
    928   };
    929 
    930   // Advances *only* on space or ASCII vowel (or illegal byte)
          // Entries are 1 for 0x00-0x20 (control bytes and space), for A E I O U in
          // either case, and for 0x80-0xBF (the continuation-byte range, i.e. the
          // "illegal byte" case). Every other entry, including all of 0xC0-0xFF,
          // is 0.
    931   static const uint8 kAdvanceOneCharSpaceVowel[256] = {
    932     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   // 0x00-0x1F
    933     1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x20-0x3F (0x20 = space)
    934     0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,   // 0x40-0x5F (A E I O U)
    935     0,1,0,0,0,1,0,0, 0,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,   // 0x60-0x7F (a e i o u)
    936
    937     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   // 0x80-0x9F
    938     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   // 0xA0-0xBF
    939     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xC0-0xDF
    940     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xE0-0xFF
    941   };
    942 
    943   // Advances *only* on space (or illegal byte)
          // Entries are 1 for 0x00-0x20 (control bytes and space) and for 0x80-0xBF
          // (the continuation-byte range, i.e. the "illegal byte" case). Every
          // other entry, including all of 0xC0-0xFF, is 0.
    944   static const uint8 kAdvanceOneCharSpace[256] = {
    945     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   // 0x00-0x1F
    946     1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x20-0x3F (0x20 = space)
    947     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x40-0x5F
    948     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x60-0x7F
    949
    950     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   // 0x80-0x9F
    951     1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,   // 0xA0-0xBF
    952     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xC0-0xDF
    953     0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xE0-0xFF
    954   };
    955 
    956 
    957 //------------------------------------------------------------------------------
    958 // General
    959 //------------------------------------------------------------------------------
    // Smaller of two ints; yields b when the two compare equal.
    static inline int minint(int a, int b) {
      if (a < b) {
        return a;
      }
      return b;
    }
    // Larger of two ints; yields b when the two compare equal.
    static inline int maxint(int a, int b) {
      if (a > b) {
        return a;
      }
      return b;
    }
    962 
    963   // Here to make available for debugging
          // Only the prototypes appear here; no definitions are visible in this part
          // of the header.
          // NOTE(review): presumably both return a reliability on the 0..100 scale
          // noted at kMinReliable -- confirm against the implementation.
    964   int ReliabilityDelta(int value1, int value2, int count);
    965   int ReliabilityMainstream(int topscore, int len, int mean_score);
    966 
    967   // Returns "0" for too small
    968   inline const char* MyExtLanguageCode(Language lang) {
    969     return ExtLanguageCode(lang);
    970   }
    971 
    972   // Map script into Latin, Cyrillic, Arabic, Other. Used in keeping track of
    973   // amount of training data for language-script combinations
    974   inline int LScript4(UnicodeLScript lscript) {
    975     if (lscript == ULScript_Latin) {return 0;}
    976     if (lscript == ULScript_Cyrillic) {return 1;}
    977     if (lscript == ULScript_Arabic) {return 2;}
    978     return 3;
    979   }
    980 
    981 
    982   // Routines to access 3 or 5 log probabilities in a single byte.
    983 
    984   // Return address of 8-byte entry[i]
    985   inline const uint8* LgProb2TblEntry(int i) {
    986     return &kLgProbV2Tbl[i * 8];
    987   }
    988 
    989   // Return one of five probabilities in an entry
    990   // CURRENTLY UNUSED
    991   inline uint8 LgProb5(const uint8* entry, int j) {
    992     return entry[j];
    993   }
    994 
    995   // Return one of three probabilities in an entry
    996   inline uint8 LgProb3(const uint8* entry, int j) {
    997     return entry[j + 5];
    998   }
    999 
   1000 
   1001 
   1002 //------------------------------------------------------------------------------
   1003 // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores
   1004 //------------------------------------------------------------------------------
   1005 
   1006   // Pick up 1..12 bytes and hash them via mask/shift/add. NO pre/post
   1007   // OVERSHOOTS up to 3 bytes
          // NOTE(review): "OVERSHOOTS" presumably means bytes beyond
          // word_ptr + bytecount may be read, so the buffer needs that much
          // readable slack -- confirm against the implementation.
   1008   uint32 BiHashV25(const char* word_ptr, int bytecount);
   1009
   1010   // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
   1011   // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
          // NOTE(review): "UNDERSHOOTS 1 byte" presumably means the byte before
          // word_ptr is also read -- confirm against the implementation.
   1012   uint32 QuadHashV25(const char* word_ptr, int bytecount);
   1013
   1014   // Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
   1015   // OVERSHOOTS up to 3 bytes
   1016   uint32 QuadHashV25Underscore(const char* word_ptr, int bytecount);
   1017
   1018
   1019   // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
   1020   // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
   1021   // For runtime use of tables V3
          // The uint64 result is the kind of value OctaHashV3Lookup4() takes as its
          // longwordhash argument.
   1022   uint64 OctaHash40(const char* word_ptr, int bytecount);
   1023
          // NOTE(review): undocumented in the original; presumably the pre/post '_'
          // counterpart of OctaHash40, by analogy with QuadHashV25Underscore --
          // confirm against the implementation.
   1024   uint64 OctaHash40underscore(const char* word_ptr, int bytecount);
   1025 
   1026 
   1027   // From 32-bit gram FP, return hash table subscript and remaining key
   1028   inline void QuadFPJustHash(uint32 quadhash,
   1029                                   uint32 keymask,
   1030                                   int bucketcount,
   1031                                   uint32* subscr, uint32* hashkey) {
   1032     *subscr = (quadhash + (quadhash >> 12)) & (bucketcount - 1);
   1033     *hashkey = quadhash & keymask;
   1034   }
   1035 
   1036   // Look up 32-bit gram FP in caller-passed table
   1037   // Typical size 256K entries (1.5MB)
   1038   // Two-byte hashkey
   1039   inline const uint32 QuadHashV3Lookup4(const cld::CLDTableSummary* gram_obj,
   1040                                         uint32 quadhash) {
   1041 
   1042     uint32 subscr, hashkey;
   1043     const IndirectProbBucket4* quadtable = gram_obj->kCLDTable;
   1044     uint32 keymask = gram_obj->kCLDTableKeyMask;
   1045     int bucketcount = gram_obj->kCLDTableSize;
   1046     QuadFPJustHash(quadhash, keymask, bucketcount, &subscr, &hashkey);
   1047     const IndirectProbBucket4* bucket_ptr = &quadtable[subscr];
   1048     // Four-way associative, 4 compares
   1049     if (((hashkey ^ bucket_ptr->keyvalue[0]) & keymask) == 0) {
   1050       return bucket_ptr->keyvalue[0];
   1051     }
   1052     if (((hashkey ^ bucket_ptr->keyvalue[1]) & keymask) == 0) {
   1053       return bucket_ptr->keyvalue[1];
   1054     }
   1055     if (((hashkey ^ bucket_ptr->keyvalue[2]) & keymask) == 0) {
   1056       return bucket_ptr->keyvalue[2];
   1057     }
   1058     if (((hashkey ^ bucket_ptr->keyvalue[3]) & keymask) == 0) {
   1059       return bucket_ptr->keyvalue[3];
   1060     }
   1061     return 0;
   1062   }
   1063 
   1064 
   1065   // Map 40 bits to subscript, hashkey, expected 18-22 bit subscript (min 16)
   1066   //     wwwwwwww xxxxxxxx xxxxxxxx yyyyyyyy yyyyyyyy
   1067   //   + ........ ....wwww wwwwxxxx xxxxxxxx xxxxyyyy
   1068   //     00000000 00000000 00000011 11111111 11111111 (18-bit bucketcount-1)
   1069   //
   1070   // hashkey:
   1071   //              wwwwxxxx xxxxxxxx xxxx.... ........ (20-bit keymask)
   1072   // 12-bit shift in subscript mixes in ~4 letters x 4 bits each
   1073 
   1074   // From 40-bit gram FP, return hash table subscript and remaining key
   1075   inline void OctaFPJustHash(uint64 longwordhash,
   1076                                     uint32 keymask,
   1077                                     int bucketcount,
   1078                                     uint32* subscr, uint32* hashkey) {
   1079     uint32 temp = (longwordhash + (longwordhash >> 12)) & (bucketcount - 1);
   1080     *subscr = temp;
   1081     temp = longwordhash >> 4;
   1082     *hashkey = temp & keymask;
   1083   }
   1084 
   1085   // Look up 40-bit gram FP in caller-passed table
   1086   // Typical size 256K-4M entries (1-16MB)
   1087   // 24-12 bit hashkey packed with 8-20 bit indirect lang/probs
   1088   // keymask is 0xfffff000 for 20-bit hashkey and 12-bit indirect
   1089   inline const uint32 OctaHashV3Lookup4(const cld::CLDTableSummary* gram_obj,
   1090                                             uint64 longwordhash) {
   1091     uint32 subscr, hashkey;
   1092     const IndirectProbBucket4* octatable = gram_obj->kCLDTable;
   1093     uint32 keymask = gram_obj->kCLDTableKeyMask;
   1094     int bucketcount = gram_obj->kCLDTableSize;
   1095     OctaFPJustHash(longwordhash, keymask, bucketcount,
   1096                           &subscr, &hashkey);
   1097     const IndirectProbBucket4* bucket_ptr = &octatable[subscr];
   1098     // Four-way associative, 4 compares
   1099     if (((hashkey ^ bucket_ptr->keyvalue[0]) & keymask) == 0) {
   1100       return bucket_ptr->keyvalue[0];
   1101     }
   1102     if (((hashkey ^ bucket_ptr->keyvalue[1]) & keymask) == 0) {
   1103       return bucket_ptr->keyvalue[1];
   1104     }
   1105     if (((hashkey ^ bucket_ptr->keyvalue[2]) & keymask) == 0) {
   1106       return bucket_ptr->keyvalue[2];
   1107     }
   1108     if (((hashkey ^ bucket_ptr->keyvalue[3]) & keymask) == 0) {
   1109       return bucket_ptr->keyvalue[3];
   1110     }
   1111     return 0;
   1112   }
   1113 
   1114 
   1115 
   1116 //------------------------------------------------------------------------------
   1117 // Scoring single groups of letters
   1118 //------------------------------------------------------------------------------
   1119 
   1120   // UNIGRAM score one => tote
   1121   // Input: 1-byte entry of subscript into unigram probs, plus
   1122   //  an accumulator tote.
   1123   // Output: running sums in tote updated
          // Prototype only; the definition is not visible in this part of the header.
   1124   void ProcessProbV25UniTote(int propval, Tote* tote);
   1125
   1126   // BIGRAM, QUADGRAM, OCTAGRAM score one => tote
   1127   // Input: 4-byte entry of 3 language numbers and one probability subscript,
   1128   //  plus an accumulator tote. (language 0 means unused entry)
   1129   // Output: running sums in tote updated
          // Prototype only; the definition is not visible in this part of the header.
   1130   void ProcessProbV25Tote(uint32 probs, Tote* tote);
   1131 
   1132 
   1133 //------------------------------------------------------------------------------
   1134 // Routines to accumulate probabilities
   1135 //------------------------------------------------------------------------------
   1136 
   1137   // Score up to n=gram_limit unigrams, returning number of bytes consumed
   1138   // Caller supplies table, such as compact_lang_det_generated_ctjkvz_b1_obj
          // NOTE(review): tote_grams is a non-const int*, so presumably a running
          // gram count updated by the call -- confirm against the implementation.
   1139   int DoUniScoreV3(const UTF8PropObj* unigram_obj,
   1140                    const char* isrc, int srclen, int advance_by,
   1141                    int* tote_grams, int gram_limit, Tote* chunk_tote);
   1142
   1143
   1144   // Score all words in isrc, using languages that have bigrams (CJK)
   1145   // Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj
   1146   // Return number of bigrams that hit in the hash table
   1147   int DoBigramScoreV3(const cld::CLDTableSummary* bigram_obj,
   1148                       const char* isrc, int srclen, Tote* chunk_tote);
   1149
   1150
   1151   // Score up to n=gram_limit quadgrams, returning number of bytes consumed
   1152   // Caller supplies table, such as &kQuadTable_obj or &kGibberishTable_obj
          // Takes the same trailing arguments as DoUniScoreV3, but a CLDTableSummary
          // table instead of a UTF8PropObj.
   1153   int DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj,
   1154                     const char* isrc, int srclen, int advance_by,
   1155                     int* tote_grams, int gram_limit, Tote* chunk_tote);
   1156
   1157   // Score all octagrams (words) in isrc, using languages that have quadgrams
   1158   // Caller supplies table, such as &kLongWord8Table_obj
   1159   // Return number of words that hit in the hash table
   1160   int DoOctaScoreV3(const cld::CLDTableSummary* octagram_obj,
   1161                     const char* isrc, int srclen, Tote* chunk_tote);
   1162 
   1163 //------------------------------------------------------------------------------
   1164 // Reliability calculations, for single language and between languages
   1165 //------------------------------------------------------------------------------
   1166 
   1167   // Reliability = 0..100
          // NOTE(review): kMinReliable is presumably the lowest reliability value
          // treated as reliable -- confirm against its users.
   1168   static const int kMinReliable = 75;
   1169
   1170   // Calculate ratio of score per 1KB vs. expected score per 1KB
          // for language lang in script lscript, given score over bytes bytes.
   1171   double GetNormalizedScore(Language lang, UnicodeLScript lscript,
   1172                           int bytes, int score);
   1173
   1174   // Calculate reliability of len bytes of script lscript with chunk_tote
   1175   int GetReliability(int len, UnicodeLScript lscript, const Tote* chunk_tote);
   1176 
   1177 
   1178 //------------------------------------------------------------------------------
   1179 // Miscellaneous
   1180 //------------------------------------------------------------------------------
   1181 
   1182   // Make languages packed into uint32 values non-zero
   1183   // These routines later could remap so languages not in QuadHash tables are not
   1184   // represented, and so that any thrashing in accumulation is eliminated
   1185   uint8 inline PackLanguage(Language lang) {
   1186     return static_cast<uint8>(lang + 1);}
   1187 
   1188   Language inline UnpackLanguage(int ilang) {
   1189     return static_cast<Language>(ilang - 1);}
   1190 
   1191   // Useful single-byte tests
   // True for bytes 0x80..0xBF, i.e. those whose top two bits are 10.
   inline bool IsUTF8ContinueByte(char c) {
     const unsigned char byte = static_cast<unsigned char>(c);
     return (byte & 0xC0) == 0x80;
   }
   // True for bytes 0x80..0xFF, i.e. those with the top bit set.
   inline bool IsUTF8HighByte(char c) {
     const unsigned char byte = static_cast<unsigned char>(c);
     return (byte & 0x80) != 0;
   }
   1196 
   1197 
   1198   // Demote all languages except Top40 and plus_one
   1199   // Do this just before sorting
          // NOTE(review): packed_plus_one is presumably a language number in the
          // PackLanguage() encoding (Language + 1) -- confirm against callers.
   1200   void DemoteNotTop40(Tote* chunk_tote, int packed_plus_one);
   1201 
   1202 }       // End namespace cld
   1203 
   1204 
   1205 #endif  // ENCODINGS_COMPACT_LANG_DET_CLDUTIL_H_
   1206