Home | History | Annotate | Download | only in compact_lang_det
      1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include <stdio.h>
      6 #include <string.h>
      7 //#include <sys/time.h>                               // for gettimeofday
      8 #include <string>
      9 
     10 #include "encodings/lang_enc.h"
     11 
     12 #include "encodings/compact_lang_det/compact_lang_det.h"
     13 #include "encodings/compact_lang_det/compact_lang_det_impl.h"
     14 #include "encodings/compact_lang_det/getonescriptspan.h"
     15 #include "encodings/compact_lang_det/letterscript_enum.h"
     16 #include "encodings/compact_lang_det/tote.h"
     17 #include "encodings/compact_lang_det/utf8propjustletter.h"
     18 #include "encodings/compact_lang_det/utf8propletterscriptnum.h"
     19 #include "encodings/compact_lang_det/utf8scannotjustletterspecial.h"
     20 
     21 #include "encodings/compact_lang_det/cldutil_dbg.h"
     22 
     23 #include "encodings/compact_lang_det/win/cld_basictypes.h"
     24 #include "encodings/compact_lang_det/win/cld_commandlineflags.h"
     25 #include "encodings/compact_lang_det/win/cld_google.h"
     26 #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
     27 
     28 // Linker supplies the right tables
        // The four objects below are only declared in this file (extern); their
        // definitions come from other translation units chosen at link time.
        // NOTE(review): judging by the names these are a UTF-8 property table plus
        // the CJK-bigram, quadgram and long-word scoring tables -- confirm against
        // the files that define them.
     29 extern const UTF8PropObj compact_lang_det_generated_ctjkvz_b1_obj;
     30 extern const cld::CLDTableSummary kCjkBiTable_obj;
     31 extern const cld::CLDTableSummary kQuadTable_obj;
     32 extern const cld::CLDTableSummary kLongWord8Table_obj;
     33 
        // Flags defined by this file. Each DEFINE_* invocation gives a flag name,
        // its default value and a help string; every boolean flag defaults to false.
        // The first two concern HTML span output and word scoring.
     34 DEFINE_bool(cld_html, false, "Print language spans in HTML on stderr");
     35 DEFINE_bool(cld_forcewords, false, "Score all words, in addition to quads");
     36 
        // Diagnostic output switches, followed by two integer limits: the text
        // limit is expressed in KB and the smoothing width in quadgrams.
     37 DEFINE_bool(cld_showme, false, "Put squeeze/repeat points into HTML text");
     38 DEFINE_bool(cld_echotext, false, "Print each scriptspan to stderr");
     39 DEFINE_int32(cld_textlimit, 160, "Examine only initial n KB of actual text");
     40 // 20 quadgrams is about 80 bytes or about 12 words in real text
     41 DEFINE_int32(cld_smoothwidth, 20, "Smoothing window width in quadgrams");
     42 
     43 
        // File-local tuning constants. All are plain ints with internal linkage.
        //
        // Language-hint boosts.
     44 static const int kLangHintInitial = 12;  // Boost language by N initially
     45 static const int kLangHintBoost = 12;    // Boost language by N/16 per quadgram
     46 
        // Short-span handling (sizes in bytes).
     47 static const int kShortSpanThresh = 32;       // Bytes
     48 static const int kMaxSecondChanceLen = 1024;  // Look at first 1K of short spans
     49 
        // Cheap test that decides whether squeezing ("sqz") should be attempted.
     50 static const int kCheapSqueezeTestThresh = 4096;  // Only look for squeezing
     51                                                   // after this many text bytes
     52 static const int kCheapSqueezeTestLen = 256;   // Bytes to test to trigger sqz
     53 static const int kSpacesTriggerPercent = 25;   // Trigger sqz if >=25% spaces
     54 static const int kPredictTriggerPercent = 67;  // Trigger sqz if >=67% predicted
     55 
        // Per-chunk squeeze thresholds, applied once squeezing is under way.
     56 static const int kChunksizeDefault = 48;      // Squeeze 48-byte chunks
     57 static const int kSpacesThreshPercent = 25;   // Squeeze if >=25% spaces
     58 static const int kPredictThreshPercent = 40;  // Squeeze if >=40% predicted
     59 
        // NOTE(review): presumably the furthest a scan for a space may run -- confirm
        // at the use site.
     60 static const int kMaxSpaceScan = 32;          // Bytes
     61 
        // NOTE(review): the two percentages look like acceptance thresholds for the
        // top language alone and for the top two combined -- confirm at the use site.
     62 static const int kGoodLang1Percent = 70;
     63 static const int kGoodLang1and2Percent = 93;
     64 static const int kShortTextThresh = 256;      // Bytes
     65 
        // Bounds on chunk size, measured in quadgrams.
     66 static const int kMinChunkSizeQuads = 4;      // Chunk is at least four quads
     67 static const int kMaxChunkSizeQuads = 1024;   // Chunk is at most 1K quads
     68 
        // Word-scoring span sizes (bytes).
     69 static const int kDefaultWordSpan = 256;      // Scan at least this many initial
     70                                               // bytes with word scoring
     71 static const int kReallyBigWordSpan = 9999999;  // Forces word scoring all text
     72 
     73 static const int kMinReliableSeq = 50;      // Record in seq if >= 50% reliable
     74 
        // Do not change this value without changing the cheap compressor as well.
     75 static const int kPredictionTableSize = 4096;   // Must be exactly 4096 for
     76                                                 // cheap compressor
     77 
     78 //
     79 // Generated by dsites 2008.07.07 from 10% of Base
     80 //
     81 
     82 // Three packed language probs, subscripted by Encoding
        // One 32-bit word per Encoding value, in enum order. The trailing comment on
        // each row names the encoding and the languages (with weights) packed into
        // the word; rows of 0x00000000 list no languages.
        // NOTE(review): comparing rows with kLanguageHintProbs, the three upper bytes
        // look like (Language value + 1) for up to three languages and the low byte
        // like a selector for their weights -- confirm against the code that
        // unpacks these words.
     83 static const uint32 kEncodingHintProbs[] = {
     84   0x00000000,  // ASCII
     85   0x18120cd5,  // Latin2   POLISH.11 CZECH.5 HUNGARIAN.3
     86   0x1d3a4bc9,  // Latin3   AZERBAIJANI.10 BASQUE.3 CROATIAN.1
     87   0x030819d4,  // Latin4   ESTONIAN.11 ITALIAN.4 DUTCH.2
     88   0x00000000,  // ISO-8859-5
     89   0x00003742,  // Arabic   ARABIC.12
     90   0x00000000,  // Greek
     91   0x00000742,  // Hebrew   HEBREW.12
     92   0x00002242,  // Latin5   TURKISH.12
     93   0x060419c9,  // Latin6   ESTONIAN.10 FINNISH.3 GERMAN.1
     94   0x00000942,  // EUC-JP   Japanese.12
     95   0x00000942,  // SJS   Japanese.12
     96   0x00000942,  // JIS   Japanese.12
     97   0x00004642,  // BIG5   ChineseT.12
     98   0x00001142,  // GB   Chinese.12
     99   0x46295fcd,  // EUC-CN   UIGHUR.10 MALAY.6 ChineseT.5
    100   0x00000a42,  // KSC   Korean.12
    101   0x00000000,  // Unicode
    102   0x03104674,  // EUC   ChineseT.9 SWEDISH.8 DUTCH.3
    103   0x00000000,  // CNS
    104   0x0f1146c3,  // BIG5-CP950   ChineseT.9 Chinese.5 SPANISH.4
    105   0x00000942,  // CP932   Japanese.12
    106   0x00000000,  // UTF8
    107   0x00000000,  // Unknown
    108   0x00000000,  // ASCII-7-bit
    109   0x00000000,  // KOI8R
    110   0x00000000,  // CP1251
    111   0x00000000,  // CP1252
    112   0x00000000,  // KOI8U
    113   0x451d12cd,  // CP1250   CZECH.10 CROATIAN.6 SLOVAK.5
    114   0x0d06052a,  // ISO-8859-15   FRENCH.9 GERMAN.8 PORTUGUESE.7
    115   0x00002242,  // CP1254   TURKISH.12
    116   0x191516be,  // CP1257   LITHUANIAN.8 LATVIAN.7 ESTONIAN.7
    117   0x08003642,  // ISO-8859-11   THAI.12 ITALIAN.1
    118   0x00000000,  // CP874
    119   0x00003742,  // CP1256   ARABIC.12
    120   0x00000742,  // CP1255   HEBREW.12
    121   0x00000000,  // ISO-8859-8-I
    122   0x00000000,  // VISUAL
    123   0x00000000,  // CP852
    124   0x39001242,  // CSN_369103   CZECH.12 ESPERANTO.1
    125   0x00000000,  // CP1253
    126   0x00000000,  // CP866
    127   0x2e001944,  // ISO-8859-13   ESTONIAN.12 ALBANIAN.3
    128   0x08090a74,  // ISO-2022-KR   Korean.9 Japanese.8 ITALIAN.3
    129   0x00001142,  // GBK   Chinese.12
    130   0x4600113d,  // GB18030   Chinese.11 ChineseT.7
    131   0x00004642,  // BIG5_HKSCS   ChineseT.12
    132   0x00000000,  // ISO_2022_CN
    133   0x00000000,  // TSCII
    134   0x00000000,  // TAM
    135   0x00000000,  // TAB
    136   0x00000000,  // JAGRAN
    137   0x00000000,  // MACINTOSH
    138   0x00000000,  // UTF7
    139   0x00000000,  // BHASKAR
    140   0x00000000,  // HTCHANAKYA
    141   0x090646ca,  // UTF-16BE   ChineseT.10 GERMAN.4 Japanese.2
    142   0x00000000,  // UTF-16LE
    143   0x00000000,  // UTF-32BE
    144   0x00000000,  // UTF-32LE
    145   0x00000000,  // X-BINARYENC
    146   0x06001142,  // HZ-GB-2312   Chinese.12 GERMAN.1
    147   0x461109c2,  // X-UTF8UTF8   Japanese.9 Chinese.5 ChineseT.3
    148   0x00000000,  // X-TAM-ELANGO
    149   0x00000000,  // X-TAM-LTTMBARANI
    150   0x00000000,  // X-TAM-SHREE
    151   0x00000000,  // X-TAM-TBOOMIS
    152   0x00000000,  // X-TAM-TMNEWS
    153   0x00000000,  // X-TAM-WEBTAMIL
    154   0x00000000,  // X-KDDI-Shift_JIS
    155   0x00000000,  // X-DoCoMo-Shift_JIS
    156   0x00000000,  // X-SoftBank-Shift_JIS
    157   0x00000000,  // X-KDDI-ISO-2022-JP
    158   0x00000000,  // X-SoftBank-ISO-2022-JP
    159 };
    160 
        // Fails the build unless the table has exactly NUM_ENCODINGS rows, so a new
        // Encoding value cannot be added without also adding its row above.
    161 COMPILE_ASSERT(arraysize(kEncodingHintProbs) == NUM_ENCODINGS,
    162                kEncodingHintProbs_has_incorrect_size);
    163 
    164 //
    165 // Generated by dsites 2008.07.07 from 10% of Base
    166 //
    167 
    168 // Three packed language probs, subscripted by (anchor) language
        // One 32-bit word per Language value, in enum order. The trailing comment on
        // each row names the anchor language and then the languages (with weights)
        // packed into the word; rows of 0x00000000 list no languages.
        // Note that a row need not point back at its own language: per the row
        // comments, KHMER lists TURKISH and INUKTITUT lists FRENCH.
        // NOTE(review): the second-lowest byte of most non-zero rows equals the row's
        // own position + 1 (DANISH is row 1 and carries 0x02), which suggests the
        // upper bytes hold (Language value + 1) -- confirm against the code that
        // unpacks these words.
    169 static const uint32 kLanguageHintProbs[] = {
    170   0x00000000,  // ENGLISH
    171   0x00000242,  // DANISH   DANISH.12
    172   0x00000342,  // DUTCH   DUTCH.12
    173   0x00000442,  // FINNISH   FINNISH.12
    174   0x00000542,  // FRENCH   FRENCH.12
    175   0x00000642,  // GERMAN   GERMAN.12
    176   0x00000742,  // HEBREW   HEBREW.12
    177   0x00000842,  // ITALIAN   ITALIAN.12
    178   0x00000942,  // Japanese   Japanese.12
    179   0x00000a42,  // Korean   Korean.12
    180   0x51000b43,  // NORWEGIAN   NORWEGIAN.12 NORWEGIAN_N.2
    181   0x00000c42,  // POLISH   POLISH.12
    182   0x00000d42,  // PORTUGUESE   PORTUGUESE.12
    183   0x00000000,  // RUSSIAN
    184   0x00000f42,  // SPANISH   SPANISH.12
    185   0x00001042,  // SWEDISH   SWEDISH.12
    186   0x00001142,  // Chinese   Chinese.12
    187   0x00001242,  // CZECH   CZECH.12
    188   0x00000000,  // GREEK
    189   0x47001442,  // ICELANDIC   ICELANDIC.12 FAROESE.1
    190   0x00001542,  // LATVIAN   LATVIAN.12
    191   0x00001642,  // LITHUANIAN   LITHUANIAN.12
    192   0x00001742,  // ROMANIAN   ROMANIAN.12
    193   0x00001842,  // HUNGARIAN   HUNGARIAN.12
    194   0x00001942,  // ESTONIAN   ESTONIAN.12
    195   0x00000000,  // TG_UNKNOWN_LANGUAGE
    196   0x00000000,  // Unknown
    197   0x00001c42,  // BULGARIAN   BULGARIAN.12
    198   0x00001d42,  // CROATIAN   CROATIAN.12
    199   0x1e001d46,  // SERBIAN   CROATIAN.12 SERBIAN.5
    200   0x00000000,  // IRISH
    201   0x0f00203d,  // GALICIAN   GALICIAN.11 SPANISH.7
    202   0x5e00213a,  // TAGALOG   TAGALOG.11 SOMALI.4
    203   0x00002242,  // TURKISH   TURKISH.12
    204   0x00002342,  // UKRAINIAN   UKRAINIAN.12
    205   0x00000000,  // HINDI
    206   0x1c1e25d4,  // MACEDONIAN   MACEDONIAN.11 SERBIAN.4 BULGARIAN.2
    207   0x00002642,  // BENGALI   BENGALI.12
    208   0x00002742,  // INDONESIAN   INDONESIAN.12
    209   0x00000000,  // LATIN
    210   0x2700293c,  // MALAY   MALAY.11 INDONESIAN.6
    211   0x00000000,  // MALAYALAM
    212   0x00000000,  // WELSH
    213   0x00000000,  // NEPALI
    214   0x00000000,  // TELUGU
    215   0x00002e42,  // ALBANIAN   ALBANIAN.12
    216   0x00000000,  // TAMIL
    217   0x00003042,  // BELARUSIAN   BELARUSIAN.12
    218   0x00000000,  // JAVANESE
    219   0x00000000,  // OCCITAN
    220   0x375f3330,  // URDU   URDU.10 UIGHUR.7 ARABIC.4
    221   0x41003436,  // BIHARI   BIHARI.10 MARATHI.10
    222   0x00000000,  // GUJARATI
    223   0x0a4636b2,  // THAI   THAI.7 ChineseT.3 Korean.2
    224   0x00003742,  // ARABIC   ARABIC.12
    225   0x00003842,  // CATALAN   CATALAN.12
    226   0x00003942,  // ESPERANTO   ESPERANTO.12
    227   0x00003a42,  // BASQUE   BASQUE.12
    228   0x00000000,  // INTERLINGUA
    229   0x00000000,  // KANNADA
    230   0x05060cca,  // PUNJABI   POLISH.10 GERMAN.4 FRENCH.2
    231   0x00000000,  // SCOTS_GAELIC
    232   0x00003f42,  // SWAHILI   SWAHILI.12
    233   0x00004042,  // SLOVENIAN   SLOVENIAN.12
    234   0x00004142,  // MARATHI   MARATHI.12
    235   0x00004242,  // MALTESE   MALTESE.12
    236   0x00004342,  // VIETNAMESE   VIETNAMESE.12
    237   0x00000000,  // FRISIAN
    238   0x12004543,  // SLOVAK   SLOVAK.12 CZECH.2
    239   0x00004642,  // ChineseT   ChineseT.12
    240   0x00000000,  // FAROESE
    241   0x00000000,  // SUNDANESE
    242   0x79004944,  // UZBEK   UZBEK.12 TAJIK.3
    243   0x4d004a46,  // AMHARIC   AMHARIC.12 TIGRINYA.5
    244   0x00004b42,  // AZERBAIJANI   AZERBAIJANI.12
    245   0x00000000,  // GEORGIAN
    246   0x00000000,  // TIGRINYA
    247   0x00004e42,  // PERSIAN   PERSIAN.12
    248   0x00000000,  // BOSNIAN
    249   0x00000000,  // SINHALESE
    250   0x00000000,  // NORWEGIAN_N
    251   0x00000000,  // PORTUGUESE_P
    252   0x00000000,  // PORTUGUESE_B
    253   0x00000000,  // XHOSA
    254   0x00000000,  // ZULU
    255   0x00000000,  // GUARANI
    256   0x00000000,  // SESOTHO
    257   0x00000000,  // TURKMEN
    258   0x7a005933,  // KYRGYZ   KYRGYZ.10 TATAR.7
    259   0x00000000,  // BRETON
    260   0x00000000,  // TWI
    261   0x00000000,  // YIDDISH
    262   0x00000000,  // SERBO_CROATIAN
    263   0x00000000,  // SOMALI
    264   0x00005f42,  // UIGHUR   UIGHUR.12
    265   0x00006042,  // KURDISH   KURDISH.12
    266   0x00006142,  // MONGOLIAN   MONGOLIAN.12
    267   0x051130c9,  // ARMENIAN   BELARUSIAN.10 Chinese.3 FRENCH.1
    268   0x020f0521,  // LAOTHIAN   FRENCH.8 SPANISH.7 DANISH.6
    269   0x64004e35,  // SINDHI   PERSIAN.10 SINDHI.9
    270   0x00000000,  // RHAETO_ROMANCE
    271   0x00006642,  // AFRIKAANS   AFRIKAANS.12
    272   0x00000000,  // LUXEMBOURGISH
    273   0x00006842,  // BURMESE   BURMESE.12
    274   0x00002242,  // KHMER   TURKISH.12
    275   0x88006a3c,  // TIBETAN   TIBETAN.11 DZONGKHA.6
    276   0x00000000,  // DHIVEHI
    277   0x00000000,  // CHEROKEE
    278   0x00000000,  // SYRIAC
    279   0x00000000,  // LIMBU
    280   0x00000000,  // ORIYA
    281   0x00000000,  // ASSAMESE
    282   0x00000000,  // CORSICAN
    283   0x00000000,  // INTERLINGUE
    284   0x00007342,  // KAZAKH   KAZAKH.12
    285   0x00000000,  // LINGALA
    286   0x00000000,  // MOLDAVIAN
    287   0x5f007645,  // PASHTO   PASHTO.12 UIGHUR.4
    288   0x00000000,  // QUECHUA
    289   0x00000000,  // SHONA
    290   0x00007942,  // TAJIK   TAJIK.12
    291   0x00000000,  // TATAR
    292   0x00000000,  // TONGA
    293   0x00000000,  // YORUBA
    294   0x00000000,  // CREOLES_AND_PIDGINS_ENGLISH_BASED
    295   0x00000000,  // CREOLES_AND_PIDGINS_FRENCH_BASED
    296   0x00000000,  // CREOLES_AND_PIDGINS_PORTUGUESE_BASED
    297   0x00000000,  // CREOLES_AND_PIDGINS_OTHER
    298   0x00000000,  // MAORI
    299   0x00000000,  // WOLOF
    300   0x00000000,  // ABKHAZIAN
    301   0x00000000,  // AFAR
    302   0x00000000,  // AYMARA
    303   0x00000000,  // BASHKIR
    304   0x00000000,  // BISLAMA
    305   0x00000000,  // DZONGKHA
    306   0x00000000,  // FIJIAN
    307   0x00000000,  // GREENLANDIC
    308   0x00000000,  // HAUSA
    309   0x00000000,  // HAITIAN_CREOLE
    310   0x00000000,  // INUPIAK
    311   0x00000542,  // INUKTITUT   FRENCH.12
    312   0x00000000,  // KASHMIRI
    313   0x00000000,  // KINYARWANDA
    314   0x00000000,  // MALAGASY
    315   0x00000000,  // NAURU
    316   0x00000000,  // OROMO
    317   0x00000000,  // RUNDI
    318   0x00000000,  // SAMOAN
    319   0x00000000,  // SANGO
    320   0x344197d3,  // SANSKRIT   SANSKRIT.11 MARATHI.4 BIHARI.1
    321   0x00000000,  // SISWANT
    322   0x00000000,  // TSONGA
    323   0x00000000,  // TSWANA
    324   0x00000000,  // VOLAPUK
    325   0x00000000,  // ZHUANG
    326   0x00000000,  // KHASI
    327   0x00000000,  // SCOTS
    328   0x00000000,  // GANDA
    329   0x00000000,  // MANX
    330   0x00000000,  // MONTENEGRIN
    331   // Add new language hints just before here (just use 0x00000000)
    332 };
    333 
        // Fails the build unless the table has exactly NUM_LANGUAGES rows, so a new
        // Language value cannot be added without also adding its row above.
    334 COMPILE_ASSERT(arraysize(kLanguageHintProbs) == NUM_LANGUAGES,
    335                kLanguageHintProbs_has_incorrect_size);
    336 
    337 //
    338 // Generated by dsites 2008.07.07 from 10% of Base
    339 //
    340 
        // One row of a keyed hint table: a fixed-width key plus one packed word of
        // language probabilities.
    341 struct HintEntry {
    342   char key[4];    // Exactly four key bytes; there is no room for a terminator
    343   uint32 probs;   // Packed language probabilities for this key
    344 };
    345 
    346 
    347 // Massaged TLD, followed by three packed language probs
    348 // Hand-removed 4 items dsites 2008.07.15
        // Each key is written as four byte values; the trailing comment on the row
        // spells the same key in ASCII, where shorter TLDs are padded with '_'
        // (0x5f). The packed word uses the same layout as the other hint tables in
        // this file.
        // Rows are listed in ascending key order.
        // NOTE(review): presumably the lookup relies on that order (e.g. a binary
        // search) -- confirm before inserting rows anywhere but in sorted position.
        // The four rows that are commented out (coop, edu_, gov_, jobs) are the
        // hand-removed items; kTLDHintProbsSize counts only the 201 active rows.
        // Because the array is declared with that explicit size, removing a row
        // without lowering the constant would leave a zero-filled entry at the end
        // rather than cause a compile error.
    349 static const int kTLDHintProbsSize = 201;
    350 static const HintEntry kTLDHintProbs[kTLDHintProbsSize] = {   // MaxRange 12
    351   {{0x61,0x63,0x5f,0x5f}, 0x0a000945},   // ac__ Japanese.12 Korean.4
    352   {{0x61,0x64,0x5f,0x5f}, 0x00003842},   // ad__ CATALAN.12
    353   {{0x61,0x65,0x5f,0x5f}, 0x00003742},   // ae__ ARABIC.12
    354   {{0x61,0x66,0x5f,0x5f}, 0x4e00763d},   // af__ PASHTO.11 PERSIAN.7
    355   {{0x61,0x67,0x5f,0x5f}, 0x09000643},   // ag__ GERMAN.12 Japanese.2
    356   {{0x61,0x69,0x5f,0x5f}, 0x0c180938},   // ai__ Japanese.11 HUNGARIAN.7 POLISH.2
    357   {{0x61,0x6c,0x5f,0x5f}, 0x00002e42},   // al__ ALBANIAN.12
    358   {{0x61,0x6e,0x5f,0x5f}, 0x6e00033d},   // an__ DUTCH.11 LIMBU.7
    359   {{0x61,0x6f,0x5f,0x5f}, 0x05000d42},   // ao__ PORTUGUESE.12 FRENCH.1
    360   {{0x61,0x71,0x5f,0x5f}, 0x05000f29},   // aq__ SPANISH.9 FRENCH.6
    361   {{0x61,0x72,0x5f,0x5f}, 0x00000f42},   // ar__ SPANISH.12
    362   {{0x61,0x73,0x5f,0x5f}, 0x0f120bcd},   // as__ NORWEGIAN.10 CZECH.6 SPANISH.5
    363   {{0x61,0x74,0x5f,0x5f}, 0x00000642},   // at__ GERMAN.12
    364   {{0x61,0x77,0x5f,0x5f}, 0x0f000345},   // aw__ DUTCH.12 SPANISH.4
    365   {{0x61,0x78,0x5f,0x5f}, 0x00001042},   // ax__ SWEDISH.12
    366   {{0x61,0x7a,0x5f,0x5f}, 0x00004b42},   // az__ AZERBAIJANI.12
    367   {{0x62,0x61,0x5f,0x5f}, 0x00001d42},   // ba__ CROATIAN.12
    368   {{0x62,0x62,0x5f,0x5f}, 0x00002842},   // bb__ LATIN.12
    369   {{0x62,0x64,0x5f,0x5f}, 0x00002642},   // bd__ BENGALI.12
    370   {{0x62,0x65,0x5f,0x5f}, 0x05000335},   // be__ DUTCH.10 FRENCH.9
    371   {{0x62,0x66,0x5f,0x5f}, 0x00000542},   // bf__ FRENCH.12
    372   {{0x62,0x67,0x5f,0x5f}, 0x00001c42},   // bg__ BULGARIAN.12
    373   {{0x62,0x68,0x5f,0x5f}, 0x00003742},   // bh__ ARABIC.12
    374   {{0x62,0x69,0x5f,0x5f}, 0x0f00053f},   // bi__ FRENCH.11 SPANISH.9
    375   {{0x62,0x6a,0x5f,0x5f}, 0x00000542},   // bj__ FRENCH.12
    376   {{0x62,0x6d,0x5f,0x5f}, 0x98043929},   // bm__ ESPERANTO.9 FINNISH.8 SISWANT.6
    377   {{0x62,0x6e,0x5f,0x5f}, 0x00002942},   // bn__ MALAY.12
    378   {{0x62,0x6f,0x5f,0x5f}, 0x00000f42},   // bo__ SPANISH.12
    379   {{0x62,0x72,0x5f,0x5f}, 0x00000d42},   // br__ PORTUGUESE.12
    380   {{0x62,0x74,0x5f,0x5f}, 0x00008842},   // bt__ DZONGKHA.12
    381   {{0x62,0x77,0x5f,0x5f}, 0x06059ac4},   // bw__ TSWANA.9 FRENCH.6 GERMAN.5
    382   {{0x62,0x79,0x5f,0x5f}, 0x00003024},   // by__ BELARUSIAN.9
    383   {{0x62,0x7a,0x5f,0x5f}, 0x0f0a0924},   // bz__ Japanese.9 Korean.5 SPANISH.1
    384   {{0x63,0x61,0x5f,0x5f}, 0x00000542},   // ca__ FRENCH.12
    385   {{0x63,0x61,0x74,0x5f}, 0x00003842},   // cat_ CATALAN.12
    386   {{0x63,0x64,0x5f,0x5f}, 0x06051224},   // cd__ CZECH.9 FRENCH.5 GERMAN.1
    387   {{0x63,0x66,0x5f,0x5f}, 0x00000542},   // cf__ FRENCH.12
    388   {{0x63,0x67,0x5f,0x5f}, 0x00000542},   // cg__ FRENCH.12
    389   {{0x63,0x68,0x5f,0x5f}, 0x08050638},   // ch__ GERMAN.11 FRENCH.7 ITALIAN.2
    390   {{0x63,0x69,0x5f,0x5f}, 0x00000542},   // ci__ FRENCH.12
    391   {{0x63,0x6c,0x5f,0x5f}, 0x00000f42},   // cl__ SPANISH.12
    392   {{0x63,0x6d,0x5f,0x5f}, 0x00000542},   // cm__ FRENCH.12
    393   {{0x63,0x6e,0x5f,0x5f}, 0x00001142},   // cn__ Chinese.12
    394   {{0x63,0x6f,0x5f,0x5f}, 0x00000f42},   // co__ SPANISH.12
    395 // {{0x63,0x6f,0x6f,0x70}, 0x0f0509cd},   // coop Japanese.10 FRENCH.6 SPANISH.5
    396   {{0x63,0x72,0x5f,0x5f}, 0x00000f42},   // cr__ SPANISH.12
    397   {{0x63,0x75,0x5f,0x5f}, 0x00000f42},   // cu__ SPANISH.12
    398   {{0x63,0x76,0x5f,0x5f}, 0x00000d42},   // cv__ PORTUGUESE.12
    399   {{0x63,0x78,0x5f,0x5f}, 0x223a091f},   // cx__ Japanese.8 BASQUE.6 TURKISH.4
    400   {{0x63,0x79,0x5f,0x5f}, 0x150622ba},   // cy__ TURKISH.8 GERMAN.4 LATVIAN.3
    401   {{0x63,0x7a,0x5f,0x5f}, 0x00001242},   // cz__ CZECH.12
    402   {{0x64,0x65,0x5f,0x5f}, 0x00000642},   // de__ GERMAN.12
    403   {{0x64,0x6b,0x5f,0x5f}, 0x00000242},   // dk__ DANISH.12
    404   {{0x64,0x6f,0x5f,0x5f}, 0x21000f42},   // do__ SPANISH.12 TAGALOG.1
    405   {{0x64,0x7a,0x5f,0x5f}, 0x37000535},   // dz__ FRENCH.10 ARABIC.9
    406   {{0x65,0x63,0x5f,0x5f}, 0x00000f42},   // ec__ SPANISH.12
    407 // {{0x65,0x64,0x75,0x5f}, 0x2e0f3873},   // edu_ CATALAN.9 SPANISH.7 ALBANIAN.2
    408   {{0x65,0x65,0x5f,0x5f}, 0x00001942},   // ee__ ESTONIAN.12
    409   {{0x65,0x67,0x5f,0x5f}, 0x05003742},   // eg__ ARABIC.12 FRENCH.1
    410   {{0x65,0x72,0x5f,0x5f}, 0x00000b42},   // er__ NORWEGIAN.12
    411   {{0x65,0x73,0x5f,0x5f}, 0x38200fd4},   // es__ SPANISH.11 GALICIAN.4 CATALAN.2
    412   {{0x65,0x74,0x5f,0x5f}, 0x39004a39},   // et__ AMHARIC.11 ESPERANTO.3
    413   {{0x66,0x69,0x5f,0x5f}, 0x10000444},   // fi__ FINNISH.12 SWEDISH.3
    414   {{0x66,0x6a,0x5f,0x5f}, 0x050489e0},   // fj__ FIJIAN.12 FINNISH.5 FRENCH.3
    415   {{0x66,0x6f,0x5f,0x5f}, 0x00004742},   // fo__ FAROESE.12
    416   {{0x66,0x72,0x5f,0x5f}, 0x00000542},   // fr__ FRENCH.12
    417   {{0x67,0x61,0x5f,0x5f}, 0x00000542},   // ga__ FRENCH.12
    418   {{0x67,0x64,0x5f,0x5f}, 0x061d05d5},   // gd__ FRENCH.11 CROATIAN.5 GERMAN.3
    419   {{0x67,0x65,0x5f,0x5f}, 0x00004c2d},   // ge__ GEORGIAN.10
    420   {{0x67,0x66,0x5f,0x5f}, 0x00000542},   // gf__ FRENCH.12
    421   {{0x67,0x67,0x5f,0x5f}, 0x06002244},   // gg__ TURKISH.12 GERMAN.3
    422   {{0x67,0x68,0x5f,0x5f}, 0x05000436},   // gh__ FINNISH.10 FRENCH.10
    423   {{0x67,0x69,0x5f,0x5f}, 0x0f0538ce},   // gi__ CATALAN.10 FRENCH.7 SPANISH.6
    424   {{0x67,0x6c,0x5f,0x5f}, 0x398a0238},   // gl__ DANISH.11 GREENLANDIC.7 ESPERANTO.2
    425   {{0x67,0x6d,0x5f,0x5f}, 0x0600043e},   // gm__ FINNISH.11 GERMAN.8
    426   {{0x67,0x6e,0x5f,0x5f}, 0x00000542},   // gn__ FRENCH.12
    427 // {{0x67,0x6f,0x76,0x5f}, 0x05000f25},   // gov_ SPANISH.9 FRENCH.2
    428   {{0x67,0x70,0x5f,0x5f}, 0x00000542},   // gp__ FRENCH.12
    429   {{0x67,0x71,0x5f,0x5f}, 0x0f000547},   // gq__ FRENCH.12 SPANISH.6
    430   {{0x67,0x73,0x5f,0x5f}, 0x00000942},   // gs__ Japanese.12
    431   {{0x67,0x74,0x5f,0x5f}, 0x00000f42},   // gt__ SPANISH.12
    432   {{0x68,0x6b,0x5f,0x5f}, 0x11004643},   // hk__ ChineseT.12 Chinese.2
    433   {{0x68,0x6d,0x5f,0x5f}, 0x4606092e},   // hm__ Japanese.10 GERMAN.6 ChineseT.2
    434   {{0x68,0x6e,0x5f,0x5f}, 0x00000f42},   // hn__ SPANISH.12
    435   {{0x68,0x72,0x5f,0x5f}, 0x00001d42},   // hr__ CROATIAN.12
    436   {{0x68,0x74,0x5f,0x5f}, 0x0f000542},   // ht__ FRENCH.12 SPANISH.1
    437   {{0x68,0x75,0x5f,0x5f}, 0x00001842},   // hu__ HUNGARIAN.12
    438   {{0x69,0x64,0x5f,0x5f}, 0x00002742},   // id__ INDONESIAN.12
    439   {{0x69,0x65,0x5f,0x5f}, 0x050c1f24},   // ie__ IRISH.9 POLISH.5 FRENCH.1
    440   {{0x69,0x6c,0x5f,0x5f}, 0x00000742},   // il__ HEBREW.12
    441   {{0x69,0x6e,0x74,0x5f}, 0x0f060574},   // int_ FRENCH.9 GERMAN.8 SPANISH.3
    442   {{0x69,0x6f,0x5f,0x5f}, 0x11090fd5},   // io__ SPANISH.11 Japanese.5 Chinese.3
    443   {{0x69,0x71,0x5f,0x5f}, 0x60003744},   // iq__ ARABIC.12 KURDISH.3
    444   {{0x69,0x72,0x5f,0x5f}, 0x00004e42},   // ir__ PERSIAN.12
    445   {{0x69,0x73,0x5f,0x5f}, 0x00001442},   // is__ ICELANDIC.12
    446   {{0x69,0x74,0x5f,0x5f}, 0x00000842},   // it__ ITALIAN.12
    447   {{0x6a,0x65,0x5f,0x5f}, 0x29050328},   // je__ DUTCH.9 FRENCH.7 MALAY.5
    448   {{0x6a,0x6d,0x5f,0x5f}, 0x040f0576},   // jm__ FRENCH.9 SPANISH.8 FINNISH.5
    449   {{0x6a,0x6f,0x5f,0x5f}, 0x00003742},   // jo__ ARABIC.12
    450 // {{0x6a,0x6f,0x62,0x73}, 0x0f060329},   // jobs DUTCH.9 GERMAN.8 SPANISH.6
    451   {{0x6a,0x70,0x5f,0x5f}, 0x00000942},   // jp__ Japanese.12
    452   {{0x6b,0x65,0x5f,0x5f}, 0x040f3fc3},   // ke__ SWAHILI.9 SPANISH.5 FINNISH.4
    453   {{0x6b,0x69,0x5f,0x5f}, 0x04000643},   // ki__ GERMAN.12 FINNISH.2
    454   {{0x6b,0x6d,0x5f,0x5f}, 0x00000542},   // km__ FRENCH.12
    455   {{0x6b,0x70,0x5f,0x5f}, 0x00000a42},   // kp__ Korean.12
    456   {{0x6b,0x72,0x5f,0x5f}, 0x00000a42},   // kr__ Korean.12
    457   {{0x6b,0x77,0x5f,0x5f}, 0x00003742},   // kw__ ARABIC.12
    458   {{0x6b,0x79,0x5f,0x5f}, 0x0500083f},   // ky__ ITALIAN.11 FRENCH.9
    459   {{0x6b,0x7a,0x5f,0x5f}, 0x0000732d},   // kz__ KAZAKH.10
    460   {{0x6c,0x62,0x5f,0x5f}, 0x05003747},   // lb__ ARABIC.12 FRENCH.6
    461   {{0x6c,0x63,0x5f,0x5f}, 0x09000645},   // lc__ GERMAN.12 Japanese.4
    462   {{0x6c,0x69,0x5f,0x5f}, 0x1600063d},   // li__ GERMAN.11 LITHUANIAN.7
    463   {{0x6c,0x73,0x5f,0x5f}, 0x00005742},   // ls__ SESOTHO.12
    464   {{0x6c,0x74,0x5f,0x5f}, 0x00001642},   // lt__ LITHUANIAN.12
    465   {{0x6c,0x75,0x5f,0x5f}, 0x0600053d},   // lu__ FRENCH.11 GERMAN.7
    466   {{0x6c,0x76,0x5f,0x5f}, 0x00001542},   // lv__ LATVIAN.12
    467   {{0x6c,0x79,0x5f,0x5f}, 0x05003744},   // ly__ ARABIC.12 FRENCH.3
    468   {{0x6d,0x61,0x5f,0x5f}, 0x3700053d},   // ma__ FRENCH.11 ARABIC.7
    469   {{0x6d,0x63,0x5f,0x5f}, 0x00000542},   // mc__ FRENCH.12
    470   {{0x6d,0x64,0x5f,0x5f}, 0x00001724},   // md__ ROMANIAN.9
    471   {{0x6d,0x65,0x5f,0x5f}, 0x00001d42},   // me__ CROATIAN.12
    472   {{0x6d,0x67,0x5f,0x5f}, 0x00000542},   // mg__ FRENCH.12
    473   {{0x6d,0x6b,0x5f,0x5f}, 0x1c002543},   // mk__ MACEDONIAN.12 BULGARIAN.2
    474   {{0x6d,0x6c,0x5f,0x5f}, 0x00000542},   // ml__ FRENCH.12
    475   {{0x6d,0x6e,0x5f,0x5f}, 0x00006142},   // mn__ MONGOLIAN.12
    476   {{0x6d,0x6f,0x5f,0x5f}, 0x110d4631},   // mo__ ChineseT.10 PORTUGUESE.8 Chinese.5
    477   {{0x6d,0x71,0x5f,0x5f}, 0x00000542},   // mq__ FRENCH.12
    478   {{0x6d,0x72,0x5f,0x5f}, 0x37000535},   // mr__ FRENCH.10 ARABIC.9
    479   {{0x6d,0x73,0x5f,0x5f}, 0x090f06d5},   // ms__ GERMAN.11 SPANISH.5 Japanese.3
    480   {{0x6d,0x74,0x5f,0x5f}, 0x00004242},   // mt__ MALTESE.12
    481   {{0x6d,0x75,0x5f,0x5f}, 0x05000934},   // mu__ Japanese.10 FRENCH.8
    482   {{0x6d,0x76,0x5f,0x5f}, 0x28000436},   // mv__ FINNISH.10 LATIN.10
    483   {{0x6d,0x77,0x5f,0x5f}, 0x0611092a},   // mw__ Japanese.9 Chinese.8 GERMAN.7
    484   {{0x6d,0x78,0x5f,0x5f}, 0x00000f42},   // mx__ SPANISH.12
    485   {{0x6d,0x79,0x5f,0x5f}, 0x00002942},   // my__ MALAY.12
    486   {{0x6d,0x7a,0x5f,0x5f}, 0x00000d42},   // mz__ PORTUGUESE.12
    487   {{0x6e,0x61,0x5f,0x5f}, 0x06006644},   // na__ AFRIKAANS.12 GERMAN.3
    488   {{0x6e,0x63,0x5f,0x5f}, 0x00000542},   // nc__ FRENCH.12
    489   {{0x6e,0x65,0x5f,0x5f}, 0x8b000542},   // ne__ FRENCH.12 HAUSA.1
    490   {{0x6e,0x66,0x5f,0x5f}, 0x00000542},   // nf__ FRENCH.12
    491   {{0x6e,0x69,0x5f,0x5f}, 0x00000f42},   // ni__ SPANISH.12
    492   {{0x6e,0x6c,0x5f,0x5f}, 0x00000342},   // nl__ DUTCH.12
    493   {{0x6e,0x6f,0x5f,0x5f}, 0x51000b43},   // no__ NORWEGIAN.12 NORWEGIAN_N.2
    494   {{0x6e,0x75,0x5f,0x5f}, 0x0300103b},   // nu__ SWEDISH.11 DUTCH.5
    495   {{0x6f,0x6d,0x5f,0x5f}, 0x00003742},   // om__ ARABIC.12
    496   {{0x70,0x61,0x5f,0x5f}, 0x00000f42},   // pa__ SPANISH.12
    497   {{0x70,0x65,0x5f,0x5f}, 0x00000f42},   // pe__ SPANISH.12
    498   {{0x70,0x66,0x5f,0x5f}, 0x00000542},   // pf__ FRENCH.12
    499   {{0x70,0x67,0x5f,0x5f}, 0x00000f24},   // pg__ SPANISH.9
    500   {{0x70,0x68,0x5f,0x5f}, 0x00002142},   // ph__ TAGALOG.12
    501   {{0x70,0x6b,0x5f,0x5f}, 0x00003342},   // pk__ URDU.12
    502   {{0x70,0x6c,0x5f,0x5f}, 0x30000c42},   // pl__ POLISH.12 BELARUSIAN.1
    503   {{0x70,0x6e,0x5f,0x5f}, 0x04000644},   // pn__ GERMAN.12 FINNISH.3
    504   {{0x70,0x72,0x5f,0x5f}, 0x00000f42},   // pr__ SPANISH.12
    505   {{0x70,0x72,0x6f,0x5f}, 0x46050fd5},   // pro_ SPANISH.11 FRENCH.5 ChineseT.3
    506   {{0x70,0x73,0x5f,0x5f}, 0x00003742},   // ps__ ARABIC.12
    507   {{0x70,0x74,0x5f,0x5f}, 0x00000d42},   // pt__ PORTUGUESE.12
    508   {{0x70,0x79,0x5f,0x5f}, 0x00000f42},   // py__ SPANISH.12
    509   {{0x71,0x61,0x5f,0x5f}, 0x00003742},   // qa__ ARABIC.12
    510   {{0x72,0x65,0x5f,0x5f}, 0x00000542},   // re__ FRENCH.12
    511   {{0x72,0x6f,0x5f,0x5f}, 0x00001742},   // ro__ ROMANIAN.12
    512   {{0x72,0x73,0x5f,0x5f}, 0x00001d42},   // rs__ CROATIAN.12
    513   {{0x72,0x77,0x5f,0x5f}, 0x9000053e},   // rw__ FRENCH.11 KINYARWANDA.8
    514   {{0x73,0x61,0x5f,0x5f}, 0x00003742},   // sa__ ARABIC.12
    515   {{0x73,0x62,0x5f,0x5f}, 0x00000442},   // sb__ FINNISH.12
    516   {{0x73,0x63,0x5f,0x5f}, 0x060f092f},   // sc__ Japanese.10 SPANISH.7 GERMAN.3
    517   {{0x73,0x64,0x5f,0x5f}, 0x00003742},   // sd__ ARABIC.12
    518   {{0x73,0x65,0x5f,0x5f}, 0x00001042},   // se__ SWEDISH.12
    519   {{0x73,0x69,0x5f,0x5f}, 0x00004042},   // si__ SLOVENIAN.12
    520   {{0x73,0x6b,0x5f,0x5f}, 0x12004543},   // sk__ SLOVAK.12 CZECH.2
    521   {{0x73,0x6d,0x5f,0x5f}, 0x00000842},   // sm__ ITALIAN.12
    522   {{0x73,0x6e,0x5f,0x5f}, 0x00000542},   // sn__ FRENCH.12
    523   {{0x73,0x72,0x5f,0x5f}, 0x03001e44},   // sr__ SERBIAN.12 DUTCH.3
    524   {{0x73,0x76,0x5f,0x5f}, 0x00000f42},   // sv__ SPANISH.12
    525   {{0x73,0x79,0x5f,0x5f}, 0x00003742},   // sy__ ARABIC.12
    526   {{0x74,0x63,0x5f,0x5f}, 0x0a2206cd},   // tc__ GERMAN.10 TURKISH.6 Korean.5
    527   {{0x74,0x66,0x5f,0x5f}, 0x00000642},   // tf__ GERMAN.12
    528   {{0x74,0x67,0x5f,0x5f}, 0x00000542},   // tg__ FRENCH.12
    529   {{0x74,0x68,0x5f,0x5f}, 0x9e0936c9},   // th__ THAI.10 Japanese.3 SCOTS.1
    530   {{0x74,0x6a,0x5f,0x5f}, 0x00007924},   // tj__ TAJIK.9
    531   {{0x74,0x6c,0x5f,0x5f}, 0x060f0dcd},   // tl__ PORTUGUESE.10 SPANISH.6 GERMAN.5
    532   {{0x74,0x6e,0x5f,0x5f}, 0x3700053e},   // tn__ FRENCH.11 ARABIC.8
    533   {{0x74,0x6f,0x5f,0x5f}, 0x064609c5},   // to__ Japanese.9 ChineseT.7 GERMAN.6
    534   {{0x74,0x70,0x5f,0x5f}, 0x06000944},   // tp__ Japanese.12 GERMAN.3
    535   {{0x74,0x72,0x5f,0x5f}, 0x00002242},   // tr__ TURKISH.12
    536   {{0x74,0x72,0x61,0x76}, 0x064509c3},   // trav Japanese.9 SLOVAK.5 GERMAN.4
    537   {{0x74,0x74,0x5f,0x5f}, 0x0f00063e},   // tt__ GERMAN.11 SPANISH.8
    538   {{0x74,0x77,0x5f,0x5f}, 0x00004642},   // tw__ ChineseT.12
    539   {{0x74,0x7a,0x5f,0x5f}, 0x00003f42},   // tz__ SWAHILI.12
    540   {{0x75,0x61,0x5f,0x5f}, 0x0000232d},   // ua__ UKRAINIAN.10
    541   {{0x75,0x79,0x5f,0x5f}, 0x00000f42},   // uy__ SPANISH.12
    542   {{0x75,0x7a,0x5f,0x5f}, 0x0000492d},   // uz__ UZBEK.10
    543   {{0x76,0x61,0x5f,0x5f}, 0x060f0828},   // va__ ITALIAN.9 SPANISH.7 GERMAN.5
    544   {{0x76,0x63,0x5f,0x5f}, 0x0d000939},   // vc__ Japanese.11 PORTUGUESE.3
    545   {{0x76,0x65,0x5f,0x5f}, 0x00000f42},   // ve__ SPANISH.12
    546   {{0x76,0x67,0x5f,0x5f}, 0x09000f43},   // vg__ SPANISH.12 Japanese.2
    547   {{0x76,0x69,0x5f,0x5f}, 0x00002942},   // vi__ MALAY.12
    548   {{0x76,0x6e,0x5f,0x5f}, 0x00004342},   // vn__ VIETNAMESE.12
    549   {{0x76,0x75,0x5f,0x5f}, 0x00000642},   // vu__ GERMAN.12
    550   {{0x77,0x73,0x5f,0x5f}, 0x4b0f0624},   // ws__ GERMAN.9 SPANISH.5 AZERBAIJANI.1
    551   {{0x79,0x65,0x5f,0x5f}, 0x00003742},   // ye__ ARABIC.12
    552   {{0x79,0x75,0x5f,0x5f}, 0x1e001d3d},   // yu__ CROATIAN.11 SERBIAN.7
    553   {{0x7a,0x61,0x5f,0x5f}, 0x00006642},   // za__ AFRIKAANS.12
    554   {{0x7a,0x6d,0x5f,0x5f}, 0x0b000435},   // zm__ FINNISH.10 NORWEGIAN.9
    555   {{0x7a,0x77,0x5f,0x5f}, 0x3f00783e},   // zw__ SHONA.11 SWAHILI.8
    556 };
    557 
    558 
    559 // Statistically closest language, based on quadgram table
    560 // Those that are far from other languages map to UNKNOWN_LANGUAGE
    561 // Subscripted by Language
    562 //
    563 // From lang_correlation.txt and hand-edits
    564 // sed 's/^\([^ ]*\) \([^ ]*\) coef=0\.\(..\).*$/
    565 //   (\3 >= kMinCorrPercent) ? \2 : UNKNOWN_LANGUAGE,
    566 //   \/\/ \1/' lang_correlation.txt >/tmp/closest_lang_decl.txt
    567 //
        // kMinCorrPercent is the cut-off, in percent, that every row of
        // kClosestAltLanguage compares its correlation figure against: a row keeps
        // its related language only when the figure is >= this value, otherwise it
        // becomes UNKNOWN_LANGUAGE.
    568 static const int kMinCorrPercent = 24;        // Pick off how close you want
    569                                               // 24 catches PERSIAN <== ARABIC
    570                                               // but not SPANISH <== PORTUGUESE
        // Stand-in written in table rows for languages that have no related
        // language at all. It is only ever read, so it is const: that prevents
        // accidental modification and lets it take part in constant initialization
        // of the const table that refers to it.
    571 static const Language Unknown = UNKNOWN_LANGUAGE;
    572 
// Closest alternate language for each language.
// Subscripted by Language
//
// Every row has the form
//   (corr >= kMinCorrPercent) ? alternate : UNKNOWN_LANGUAGE,  // language
// where corr is the correlation percent for the pair and the trailing comment
// names the language the row belongs to. A row whose corr is below
// kMinCorrPercent therefore yields UNKNOWN_LANGUAGE.
// The COMPILE_ASSERT after the table checks only the number of rows against
// NUM_LANGUAGES; keeping the rows in Language order is up to the editor.
static const Language kClosestAltLanguage[] = {
  (28 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE,  // ENGLISH
  (36 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // DANISH
  (31 >= kMinCorrPercent) ? AFRIKAANS : UNKNOWN_LANGUAGE,  // DUTCH
  (15 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE,  // FINNISH
  (11 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // FRENCH
  (17 >= kMinCorrPercent) ? LUXEMBOURGISH : UNKNOWN_LANGUAGE,  // GERMAN
  (27 >= kMinCorrPercent) ? YIDDISH : UNKNOWN_LANGUAGE,  // HEBREW
  (16 >= kMinCorrPercent) ? CORSICAN : UNKNOWN_LANGUAGE,  // ITALIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Japanese
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Korean
  (41 >= kMinCorrPercent) ? NORWEGIAN_N : UNKNOWN_LANGUAGE,  // NORWEGIAN
  ( 5 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // POLISH
  (23 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // PORTUGUESE
  (33 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE,  // RUSSIAN
  (28 >= kMinCorrPercent) ? GALICIAN : UNKNOWN_LANGUAGE,  // SPANISH
  (17 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // SWEDISH
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Chinese
  (42 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // CZECH
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GREEK
  (35 >= kMinCorrPercent) ? FAROESE : UNKNOWN_LANGUAGE,  // ICELANDIC
  ( 7 >= kMinCorrPercent) ? LITHUANIAN : UNKNOWN_LANGUAGE,  // LATVIAN
  ( 7 >= kMinCorrPercent) ? LATVIAN : UNKNOWN_LANGUAGE,  // LITHUANIAN
  ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // ROMANIAN
  ( 4 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // HUNGARIAN
  (15 >= kMinCorrPercent) ? FINNISH : UNKNOWN_LANGUAGE,  // ESTONIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Ignore
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Unknown
  (33 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // BULGARIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CROATIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SERBIAN
  (24 >= kMinCorrPercent) ? SCOTS_GAELIC : UNKNOWN_LANGUAGE,  // IRISH
  (28 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // GALICIAN
  ( 8 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE,  // TAGALOG
  (29 >= kMinCorrPercent) ? AZERBAIJANI : UNKNOWN_LANGUAGE,  // TURKISH
  (28 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // UKRAINIAN
  (37 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE,  // HINDI
  (29 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE,  // MACEDONIAN
  (14 >= kMinCorrPercent) ? ASSAMESE : UNKNOWN_LANGUAGE,  // BENGALI
  (46 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE,  // INDONESIAN
  ( 9 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE,  // LATIN
  (46 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE,  // MALAY
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MALAYALAM
  ( 4 >= kMinCorrPercent) ? BRETON : UNKNOWN_LANGUAGE,  // WELSH
  ( 8 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // NEPALI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // TELUGU
  ( 3 >= kMinCorrPercent) ? ESPERANTO : UNKNOWN_LANGUAGE,  // ALBANIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // TAMIL
  (22 >= kMinCorrPercent) ? UKRAINIAN : UNKNOWN_LANGUAGE,  // BELARUSIAN
  (15 >= kMinCorrPercent) ? SUNDANESE : UNKNOWN_LANGUAGE,  // JAVANESE
  (19 >= kMinCorrPercent) ? CATALAN : UNKNOWN_LANGUAGE,  // OCCITAN
  (27 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // URDU
  (36 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // BIHARI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GUJARATI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // THAI
  (24 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // ARABIC
  (19 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // CATALAN
  ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // ESPERANTO
  ( 3 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // BASQUE
  ( 9 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // INTERLINGUA
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // KANNADA
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PUNJABI
  (24 >= kMinCorrPercent) ? IRISH : UNKNOWN_LANGUAGE,  // SCOTS_GAELIC
  ( 7 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // SWAHILI
  (28 >= kMinCorrPercent) ? SERBO_CROATIAN : UNKNOWN_LANGUAGE,  // SLOVENIAN
  (37 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // MARATHI
  ( 3 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // MALTESE
  ( 1 >= kMinCorrPercent) ? YORUBA : UNKNOWN_LANGUAGE,  // VIETNAMESE
  (15 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // FRISIAN
  (42 >= kMinCorrPercent) ? CZECH : UNKNOWN_LANGUAGE,  // SLOVAK
  // Original ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ChineseT
  (24 >= kMinCorrPercent) ? CHINESE : UNKNOWN_LANGUAGE,  // ChineseT
  (35 >= kMinCorrPercent) ? ICELANDIC : UNKNOWN_LANGUAGE,  // FAROESE
  (15 >= kMinCorrPercent) ? JAVANESE : UNKNOWN_LANGUAGE,  // SUNDANESE
  (17 >= kMinCorrPercent) ? TAJIK : UNKNOWN_LANGUAGE,  // UZBEK
  ( 7 >= kMinCorrPercent) ? TIGRINYA : UNKNOWN_LANGUAGE,  // AMHARIC
  (29 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE,  // AZERBAIJANI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GEORGIAN
  ( 7 >= kMinCorrPercent) ? AMHARIC : UNKNOWN_LANGUAGE,  // TIGRINYA
  (27 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE,  // PERSIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // BOSNIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SINHALESE
  (41 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // NORWEGIAN_N
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PORTUGUESE_P
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PORTUGUESE_B
  (37 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE,  // XHOSA
  (37 >= kMinCorrPercent) ? XHOSA : UNKNOWN_LANGUAGE,  // ZULU
  ( 2 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // GUARANI
  (29 >= kMinCorrPercent) ? TSWANA : UNKNOWN_LANGUAGE,  // SESOTHO
  ( 7 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE,  // TURKMEN
  ( 8 >= kMinCorrPercent) ? KAZAKH : UNKNOWN_LANGUAGE,  // KYRGYZ
  ( 5 >= kMinCorrPercent) ? FRENCH : UNKNOWN_LANGUAGE,  // BRETON
  ( 3 >= kMinCorrPercent) ? GANDA : UNKNOWN_LANGUAGE,  // TWI
  (27 >= kMinCorrPercent) ? HEBREW : UNKNOWN_LANGUAGE,  // YIDDISH
  (28 >= kMinCorrPercent) ? SLOVENIAN : UNKNOWN_LANGUAGE,  // SERBO_CROATIAN
  (12 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE,  // SOMALI
  ( 9 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE,  // UIGHUR
  (15 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // KURDISH
  ( 6 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE,  // MONGOLIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ARMENIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // LAOTHIAN
  ( 8 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE,  // SINDHI
  (10 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // RHAETO_ROMANCE
  (31 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // AFRIKAANS
  (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // LUXEMBOURGISH
  ( 2 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE,  // BURMESE
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // KHMER
  (45 >= kMinCorrPercent) ? DZONGKHA : UNKNOWN_LANGUAGE,  // TIBETAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // DHIVEHI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CHEROKEE
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SYRIAC
  ( 8 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // LIMBU
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ORIYA
  (14 >= kMinCorrPercent) ? BENGALI : UNKNOWN_LANGUAGE,  // ASSAMESE
  (16 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // CORSICAN
  ( 5 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE,  // INTERLINGUE
  ( 8 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE,  // KAZAKH
  ( 4 >= kMinCorrPercent) ? SWAHILI : UNKNOWN_LANGUAGE,  // LINGALA
  (11 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // MOLDAVIAN
  (19 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // PASHTO
  ( 5 >= kMinCorrPercent) ? AYMARA : UNKNOWN_LANGUAGE,  // QUECHUA
  ( 5 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // SHONA
  (17 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE,  // TAJIK
  (13 >= kMinCorrPercent) ? BASHKIR : UNKNOWN_LANGUAGE,  // TATAR
  (11 >= kMinCorrPercent) ? SAMOAN : UNKNOWN_LANGUAGE,  // TONGA
  ( 2 >= kMinCorrPercent) ? TWI : UNKNOWN_LANGUAGE,  // YORUBA
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_ENGLISH_BASED
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_FRENCH_BASED
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_PORTUGUESE_BASED
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_OTHER
  ( 6 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // MAORI
  ( 3 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE,  // WOLOF
  ( 1 >= kMinCorrPercent) ? MONGOLIAN : UNKNOWN_LANGUAGE,  // ABKHAZIAN
  ( 8 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE,  // AFAR
  ( 5 >= kMinCorrPercent) ? QUECHUA : UNKNOWN_LANGUAGE,  // AYMARA
  (13 >= kMinCorrPercent) ? TATAR : UNKNOWN_LANGUAGE,  // BASHKIR
  ( 3 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // BISLAMA
  (45 >= kMinCorrPercent) ? TIBETAN : UNKNOWN_LANGUAGE,  // DZONGKHA
  ( 4 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // FIJIAN
  ( 7 >= kMinCorrPercent) ? INUPIAK : UNKNOWN_LANGUAGE,  // GREENLANDIC
  ( 3 >= kMinCorrPercent) ? AFAR : UNKNOWN_LANGUAGE,  // HAUSA
  ( 3 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // HAITIAN_CREOLE
  ( 7 >= kMinCorrPercent) ? GREENLANDIC : UNKNOWN_LANGUAGE,  // INUPIAK
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // INUKTITUT
  ( 4 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // KASHMIRI
  (30 >= kMinCorrPercent) ? RUNDI : UNKNOWN_LANGUAGE,  // KINYARWANDA
  ( 2 >= kMinCorrPercent) ? TAGALOG : UNKNOWN_LANGUAGE,  // MALAGASY
  (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // NAURU
  (12 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE,  // OROMO
  (30 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // RUNDI
  (11 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // SAMOAN
  ( 1 >= kMinCorrPercent) ? LINGALA : UNKNOWN_LANGUAGE,  // SANGO
  (32 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE,  // SANSKRIT
  (16 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE,  // SISWANT
  ( 5 >= kMinCorrPercent) ? SISWANT : UNKNOWN_LANGUAGE,  // TSONGA
  (29 >= kMinCorrPercent) ? SESOTHO : UNKNOWN_LANGUAGE,  // TSWANA
  ( 2 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE,  // VOLAPUK
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ZHUANG
  ( 1 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE,  // KHASI
  (28 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // SCOTS
  (15 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // GANDA
  ( 7 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // MANX
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MONTENEGRIN
};

// Fail the build if a language is added without adding a row above.
COMPILE_ASSERT(arraysize(kClosestAltLanguage) == NUM_LANGUAGES,
               kClosestAltLanguage_has_incorrect_size);
    741 
    742 
    743 inline bool FlagFinish(int flags) {return (flags & kCLDFlagFinish) != 0;}
    744 inline bool FlagSqueeze(int flags) {return (flags & kCLDFlagSqueeze) != 0;}
    745 inline bool FlagRepeats(int flags) {return (flags & kCLDFlagRepeats) != 0;}
    746 inline bool FlagTop40(int flags) {return (flags & kCLDFlagTop40) != 0;}
    747 inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;}
    748 inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;}
    749 inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;}
    750 
    751 
    752 
    753 
//------------------------------------------------------------------------------
// For --cld_html debugging output. Not thread safe
//------------------------------------------------------------------------------
// Mutable file-scope state shared by every caller in this file, which is why
// the debugging output is not thread safe.
// NOTE(review): presumably these remember the language and reliability of the
// previously printed span -- confirm against the code that reads them.
static Language prior_lang = UNKNOWN_LANGUAGE;
static bool prior_unreliable = false;

//------------------------------------------------------------------------------
// End For --cld_html debugging output
//------------------------------------------------------------------------------
    763 
    764 
    765 // Backscan to word boundary, returning how many bytes n to go back
    766 // so that src - n is non-space ans src - n - 1 is space.
    767 // If not found in kMaxSpaceScan bytes, return 0
    768 int BackscanToSpace(const char* src, int limit) {
    769   int n = 0;
    770   limit = cld::minint(limit, kMaxSpaceScan);
    771   while (n < limit) {
    772     if (src[-n - 1] == ' ') {return n;}    // We are at _X
    773     ++n;
    774   }
    775   return 0;
    776 }
    777 
    778 // Forwardscan to word boundary, returning how many bytes n to go forward
    779 // so that src + n is non-space ans src + n - 1 is space.
    780 // If not found in kMaxSpaceScan bytes, return 0
    781 int ForwardscanToSpace(const char* src, int limit) {
    782   int n = 0;
    783   limit = cld::minint(limit, kMaxSpaceScan);
    784   while (n < limit) {
    785     if (src[n] == ' ') {return n + 1;}    // We are at _X
    786     ++n;
    787   }
    788   return 0;
    789 }
    790 
    791 
    792 // This uses a cheap predictor to get a measure of compression, and
    793 // hence a measure of repetitiveness. It works on complete UTF-8 characters
    794 // instead of bytes, because three-byte UTF-8 Indic, etc. text compress highly
    795 // all the time when done with a byte-based count. Sigh.
    796 //
    797 // To allow running prediction across multiple chunks, caller passes in current
    798 // 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
    799 //
    800 // Returns the number of *bytes* correctly predicted, increments by 1..4 for
    801 // each correctly-predicted character.
    802 //
    803 // NOTE: Overruns by up to three bytes. Not a problem with valid UTF-8 text
    804 //
    805 int CountPredictedBytes(const char* isrc, int srclen, int* hash, int* tbl) {
    806   int p_count = 0;
    807   const uint8* src = reinterpret_cast<const uint8*>(isrc);
    808   const uint8* srclimit = src + srclen;
    809   int local_hash = *hash;
    810 
    811   while (src < srclimit) {
    812     int c = src[0];
    813     int incr = 1;
    814 
    815     // Pick up one char and length
    816     if (c < 0xc0) {
    817       // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
    818       // Do nothing more
    819     } else if ((c & 0xe0) == 0xc0) {
    820       // Two-byte
    821       c = (c << 8) | src[1];
    822       incr = 2;
    823     } else if ((c & 0xf0) == 0xe0) {
    824       // Three-byte
    825       c = (c << 16) | (src[1] << 8) | src[2];
    826       incr = 3;
    827     } else {
    828       // Four-byte
    829       c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
    830       incr = 4;
    831     }
    832     src += incr;
    833 
    834     int p = tbl[local_hash];            // Prediction
    835     tbl[local_hash] = c;                // Update prediction
    836     p_count += (c == p);                // Count good predictions
    837 
    838     local_hash = ((local_hash << 4) ^ c) & 0xfff;
    839   }
    840 
    841   *hash = local_hash;
    842   return p_count;
    843 }
    844 
    845 
    846 
    847 // Counts number of spaces; a little faster than one-at-a-time
    848 // Doesn't count odd bytes at end
    849 int CountSpaces4(const char* src, int src_len) {
    850   int s_count = 0;
    851   for (int i = 0; i < (src_len & ~3); i += 4) {
    852     s_count += (src[i] == ' ');
    853     s_count += (src[i+1] == ' ');
    854     s_count += (src[i+2] == ' ');
    855     s_count += (src[i+3] == ' ');
    856   }
    857   return s_count;
    858 }
    859 
    860 // Remove words of text that have more than half their letters predicted
    861 // correctly by our cheap predictor, moving the remaining words in-place
    862 // to the front of the input buffer.
    863 //
    864 // To allow running prediction across multiple chunks, caller passes in current
    865 // 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
    866 //
    867 // Return the new, possibly-shorter length
    868 //
    869 // Result Buffer ALWAYS has leading space and trailing space space space NUL,
    870 // if input does
    871 //
    872 int CheapRepWordsInplace(char* isrc, int srclen, int* hash, int* tbl) {
    873   const uint8* src = reinterpret_cast<const uint8*>(isrc);
    874   const uint8* srclimit = src + srclen;
    875   char* dst = isrc;
    876   int local_hash = *hash;
    877   char* word_dst = dst;           // Start of next word
    878   int good_predict_bytes = 0;
    879   int word_length_bytes = 0;
    880 
    881   while (src < srclimit) {
    882     int c = src[0];
    883     int incr = 1;
    884     *dst++ = c;
    885 
    886     if (c == ' ') {
    887       if ((good_predict_bytes * 2) > word_length_bytes) {
    888         // Word is well-predicted: backup to start of this word
    889         dst = word_dst;
    890         if (FLAGS_cld_showme) {
    891           // Mark the deletion point with period
    892           // Don't repeat multiple periods
    893           // Cannot mark with more bytes or may overwrite unseen input
    894           if ((isrc < (dst - 2)) && (dst[-2] != '.')) {
    895             *dst++ = '.';
    896             *dst++ = ' ';
    897           }
    898         }
    899       }
    900       word_dst = dst;              // Start of next word
    901       good_predict_bytes = 0;
    902       word_length_bytes = 0;
    903     }
    904 
    905     // Pick up one char and length
    906     if (c < 0xc0) {
    907       // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
    908       // Do nothing more
    909     } else if ((c & 0xe0) == 0xc0) {
    910       // Two-byte
    911       *dst++ = src[1];
    912       c = (c << 8) | src[1];
    913       incr = 2;
    914     } else if ((c & 0xf0) == 0xe0) {
    915       // Three-byte
    916       *dst++ = src[1];
    917       *dst++ = src[2];
    918       c = (c << 16) | (src[1] << 8) | src[2];
    919       incr = 3;
    920     } else {
    921       // Four-byte
    922       *dst++ = src[1];
    923       *dst++ = src[2];
    924       *dst++ = src[3];
    925       c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
    926       incr = 4;
    927     }
    928     src += incr;
    929     word_length_bytes += incr;
    930 
    931     int p = tbl[local_hash];            // Prediction
    932     tbl[local_hash] = c;                // Update prediction
    933     if (c == p) {
    934       good_predict_bytes += incr;       // Count good predictions
    935     }
    936 
    937     local_hash = ((local_hash << 4) ^ c) & 0xfff;
    938   }
    939 
    940   *hash = local_hash;
    941 
    942   if ((dst - isrc) < (srclen - 3)) {
    943     // Pad and make last char clean UTF-8 by putting following spaces
    944     dst[0] = ' ';
    945     dst[1] = ' ';
    946     dst[2] = ' ';
    947     dst[3] = '\0';
    948   } else   if ((dst - isrc) < srclen) {
    949     // Make last char clean UTF-8 by putting following space off the end
    950     dst[0] = ' ';
    951   }
    952 
    953   return static_cast<int>(dst - isrc);
    954 }
    955 
    956 
    957 // Remove portions of text that have a high density of spaces, or that are
    958 // overly repetitive, squeezing the remaining text in-place to the front of the
    959 // input buffer.
    960 //
    961 // Squeezing looks at density of space/prediced chars in fixed-size chunks,
    962 // specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes.
    963 //
    964 // Return the new, possibly-shorter length
    965 //
    966 // Result Buffer ALWAYS has leading space and trailing space space space NUL,
    967 // if input does
    968 //
    969 int CompactLangDetImpl::CheapSqueezeInplace(char* isrc,
    970                                             int srclen,
    971                                             int ichunksize) {
    972   char* src = isrc;
    973   char* dst = src;
    974   char* srclimit = src + srclen;
    975   bool skipping = false;
    976 
    977   int hash = 0;
    978   // Allocate local prediction table.
    979   int* predict_tbl = new int[kPredictionTableSize];
    980   memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
    981 
    982   int chunksize = ichunksize;
    983   if (chunksize == 0) {chunksize = kChunksizeDefault;}
    984   int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
    985   int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
    986 
    987   while (src < srclimit) {
    988     int remaining_bytes = srclimit - src;
    989     int len = cld::minint(chunksize, remaining_bytes);
    990     // Make len land us on a UTF-8 character boundary, and also fix
    991     // mispredictions because we could get out of phase.
    992     // Loop always terminates at trailing space in buffer.
    993     while ((src[len] & 0xc0) == 0x80)
    994       ++len; // Move past continuation bytes
    995 
    996     int space_n = CountSpaces4(src, len);
    997     int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
    998     if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
    999       // Skip the text
   1000       if (!skipping) {
   1001         // Keeping-to-skipping transition; do it at a space
   1002         int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
   1003         dst -= n;
   1004         skipping = true;
   1005         if (FLAGS_cld_showme) {
   1006           // Mark the deletion point with black square U+25A0
   1007           *dst++ = 0xe2;
   1008           *dst++ = 0x96;
   1009           *dst++ = 0xa0;
   1010           *dst++ = ' ';
   1011         }
   1012         if (dst == isrc) {
   1013           // Force a leading space if the first chunk is deleted
   1014           *dst++ = ' ';
   1015         }
   1016       }
   1017     } else {
   1018       // Keep the text
   1019       if (skipping) {
   1020         // Skipping-to-keeping transition; do it at a space
   1021         int n = ForwardscanToSpace(src, len);
   1022         src += n;
   1023         remaining_bytes -= n;   // Shrink remaining length
   1024         len -= n;
   1025         skipping = false;
   1026       }
   1027       // "len" can be negative in some cases
   1028       if (len > 0) {
   1029         memmove(dst, src, len);
   1030         dst += len;
   1031       }
   1032     }
   1033     src += len;
   1034   }
   1035 
   1036   if ((dst - isrc) < (srclen - 3)) {
   1037     // Pad and make last char clean UTF-8 by putting following spaces
   1038     dst[0] = ' ';
   1039     dst[1] = ' ';
   1040     dst[2] = ' ';
   1041     dst[3] = '\0';
   1042   } else   if ((dst - isrc) < srclen) {
   1043     // Make last char clean UTF-8 by putting following space off the end
   1044     dst[0] = ' ';
   1045   }
   1046 
   1047   // Deallocate local prediction table
   1048   delete[] predict_tbl;
   1049   return static_cast<int>(dst - isrc);
   1050 }
   1051 
   1052 // Timing 2.8GHz P4 (dsites 2008.03.20) with 170KB input
   1053 //  About 90 MB/sec, with or without memcpy, chunksize 48 or 4096
   1054 //  Just CountSpaces is about 340 MB/sec
   1055 //  Byte-only CountPredictedBytes is about 150 MB/sec
   1056 //  Byte-only CountPredictedBytes, conditional tbl[] = is about 85! MB/sec
   1057 //  Byte-only CountPredictedBytes is about 180 MB/sec, byte tbl, byte/int c
   1058 //  Unjammed byte-only both = 170 MB/sec
   1059 //  Jammed byte-only both = 120 MB/sec
   1060 //  Back to original w/slight updates, 110 MB/sec
   1061 //
   1062 bool CheapSqueezeTriggerTest(const char* src, int srclen, int testsize) {
   1063   // Don't trigger at all on short text
   1064   if (srclen < testsize) {return false;}
   1065   int space_thresh = (testsize * kSpacesTriggerPercent) / 100;
   1066   int predict_thresh = (testsize * kPredictTriggerPercent) / 100;
   1067   int hash = 0;
   1068   // Allocate local prediction table.
   1069   int* predict_tbl = new int[kPredictionTableSize];
   1070   memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
   1071 
   1072   bool retval = false;
   1073   if ((CountSpaces4(src, testsize) >= space_thresh) ||
   1074       (CountPredictedBytes(src, testsize, &hash, predict_tbl) >=
   1075        predict_thresh)) {
   1076     retval = true;
   1077   }
   1078   // Deallocate local prediction table
   1079   delete[] predict_tbl;
   1080   return retval;
   1081 }
   1082 
   1083 
   1084 
// Close pairs (correlation) language_enum/language_enum
//  id/ms (0.47)    38/40    [1]
//  bo/dz (0.46)    105/135  [2]
//  cz/sk (0.43)    17/68    [3]
//  no/nn (0.42)    10/80    [4]
//  hi/mr (0.38)    35/64    [5]
//  xh/zu (0.37)    83/84    [6]
// Subscripted by packed language, gives 0 or a subscript in closepair
// scoring array inside doc_tote
//
// The bracketed number above is the value stored for both members of a pair.
// Each nonzero entry sits one slot past the language_enum value listed above
// (for example the two 1s are at subscripts 39 and 41); subscript 0 is the
// lone leading 0 on its own line, and each following line holds 32 entries.
static const uint8 kClosePair[EXT_NUM_LANGUAGES + 1] = {
  0,
  0,0,0,0,0,0,0,0, 0,0,4,0,0,0,0,0, 0,3,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,5,0,0,1,0, 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  5,0,0,0,3,0,0,0, 0,0,0,0,0,0,0,0, 4,0,0,6,6,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,2,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,2, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  // Add new language close-pair number just before here (just use 0)
};
   1103 
   1104 
   1105 // Delete any extended languages from doc_tote
   1106 void RemoveExtendedLanguages(ToteWithReliability* doc_tote) {
   1107   for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
   1108     if (cld::UnpackLanguage(doc_tote->Key(sub)) >= NUM_LANGUAGES) {
   1109       // Effectively remove the extended language by setting key&score to zero
   1110       if (FLAGS_dbgscore) {
   1111         fprintf(stderr, "{-%s} ",
   1112                 ExtLanguageCode(cld::UnpackLanguage(doc_tote->Key(sub))));
   1113       }
   1114 
   1115       // Delete entry
   1116       doc_tote->SetKey(sub, 0);
   1117       doc_tote->SetValue(sub, 0);
   1118       doc_tote->SetReliability(sub, 0);
   1119     }
   1120   }
   1121 }
   1122 
// Minimum reliability percent (stored reliability score divided by stored
// byte count) a language needs in RemoveUnreliableLanguages to be kept.
static const int kMinReliableKeepPercent = 41;  // Remove lang if reli < this

// For Tier3 languages, require a minimum number of bytes to be first-place lang
// RemoveUnreliableLanguages zeroes the reliability of a Tier3 entry that has
// fewer bytes than this, unless that entry accounts for all of the bytes.
static const int kGoodFirstT3MinBytes = 24;         // <this => no first
   1127 
   1128 // Move bytes for unreliable langs to another lang or UNKNOWN
   1129 // doc_tote is sorted, so cannot Add
   1130 //
   1131 // If both CHINESE and CHINESET are present and unreliable, do not delete both;
   1132 // merge both into CHINESE.
   1133 //
   1134 //dsites 2009.03.19
   1135 // we also want to remove Tier3 languages as the first lang if there is very
   1136 // little text like ej1 ej2 ej3 ej4
   1137 // maybe fold this back in earlier
   1138 //
   1139 void RemoveUnreliableLanguages(ToteWithReliability* doc_tote) {
   1140   // Prepass to merge some low-reliablility languages
   1141   int total_bytes = 0;
   1142   for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
   1143     int plang = doc_tote->Key(sub);
   1144     if (plang == 0) {continue;}                     // Empty slot
   1145 
   1146     Language lang = cld::UnpackLanguage(plang);
   1147     int bytes = doc_tote->Value(sub);
   1148     int reli = doc_tote->Reliability(sub);
   1149     if (bytes == 0) {continue;}                     // Zero bytes
   1150     total_bytes += bytes;
   1151 
   1152     // Reliable percent is stored reliable score over stored bytecount
   1153     int reliable_percent = reli / bytes;
   1154     if (reliable_percent >= kMinReliableKeepPercent) {continue;}   // Keeper
   1155 
   1156     // This language is too unreliable to keep, but we might merge it.
   1157     Language altlang = UNKNOWN_LANGUAGE;
   1158     if (lang < NUM_LANGUAGES) {altlang = kClosestAltLanguage[lang];}
   1159     if (altlang == UNKNOWN_LANGUAGE) {continue;}    // No alternative
   1160 
   1161     // Look for alternative in doc_tote
   1162     int altsub = doc_tote->Find(cld::PackLanguage(altlang));
   1163     if (altsub < 0) {continue;}                     // No alternative text
   1164 
   1165     int bytes2 = doc_tote->Value(altsub);
   1166     int reli2 = doc_tote->Reliability(altsub);
   1167     if (bytes2 == 0) {continue;}                    // Zero bytes
   1168 
   1169     // Reliable percent is stored reliable score over stored bytecount
   1170     int reliable_percent2 = reli2 / bytes2;
   1171 
   1172     // Merge one language into the other. Break ties toward lower lang #
   1173     int tosub = altsub;
   1174     int fromsub = sub;
   1175     bool into_lang = false;
   1176     if ((reliable_percent2 < reliable_percent) ||
   1177         ((reliable_percent2 == reliable_percent) && (lang < altlang))) {
   1178       tosub = sub;
   1179       fromsub = altsub;
   1180       into_lang = true;
   1181     }
   1182 
   1183     // Make sure reliability doesn't drop and is enough to avoid delete
   1184     int newpercent = cld::maxint(reliable_percent, reliable_percent2);
   1185     newpercent = cld::maxint(newpercent, kMinReliableKeepPercent);
   1186     int newbytes = bytes + bytes2;
   1187     int newreli = newpercent * newbytes;
   1188 
   1189     doc_tote->SetKey(fromsub, 0);
   1190     doc_tote->SetValue(fromsub, 0);
   1191     doc_tote->SetReliability(fromsub, 0);
   1192     doc_tote->SetValue(tosub, newbytes);
   1193     doc_tote->SetReliability(tosub, newreli);
   1194 
   1195     // Show fate of unreliable languages if at least 10 bytes
   1196     if (FLAGS_cld_html /*&& (newpercent >= 10)*/ && (newbytes >= 10)) {
   1197       if (into_lang) {
   1198         fprintf(stderr, "{Unreli %s.%d(%dB) => %s} ",
   1199                 ExtLanguageCode(altlang), reliable_percent2, bytes2,
   1200                 ExtLanguageCode(lang));
   1201       } else {
   1202         fprintf(stderr, "{Unreli %s.%d(%dB) => %s} ",
   1203                 ExtLanguageCode(lang), reliable_percent, bytes,
   1204                 ExtLanguageCode(altlang));
   1205       }
   1206     }
   1207   }
   1208 
   1209 
   1210   // Pass to delete any remaining unreliable languages
   1211   for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
   1212     int plang = doc_tote->Key(sub);
   1213     if (plang == 0) {continue;}                     // Empty slot
   1214 
   1215     Language lang = cld::UnpackLanguage(plang);
   1216     int bytes = doc_tote->Value(sub);
   1217     int reli = doc_tote->Reliability(sub);
   1218     if (bytes == 0) {continue;}                     // Zero bytes
   1219 
   1220     bool is_tier3 = (cld::kIsPackedTop40[plang] == 0);
   1221     if (is_tier3 &&
   1222         (bytes < kGoodFirstT3MinBytes) &&
   1223         (bytes < total_bytes)) {
   1224       reli = 0;                                     // Too-short tier3
   1225     }
   1226 
   1227     // Reliable percent is stored as reliable score over stored bytecount
   1228     int reliable_percent = reli / bytes;
   1229     if (reliable_percent >= kMinReliableKeepPercent) {continue;}  // Keeper
   1230 
   1231     // Delete unreliable entry
   1232     doc_tote->SetKey(sub, 0);
   1233     doc_tote->SetValue(sub, 0);
   1234     doc_tote->SetReliability(sub, 0);
   1235 
   1236     // Show fate of unreliable languages if at least 10 bytes
   1237     if (FLAGS_cld_html /*&& (reliable_percent >= 10)*/ && (bytes >= 10)) {
   1238       fprintf(stderr, "{Unreli %s.%d(%dB)} ",
   1239               ExtLanguageCode(lang), reliable_percent, bytes);
   1240     }
   1241   }
   1242 
   1243   if (FLAGS_cld_html) {fprintf(stderr, "<br>\n");}
   1244 }
   1245 
   1246 
   1247 // Move less likely byte count to more likely for close pairs of languages
   1248 void RefineScoredClosePairs(ToteWithReliability* doc_tote) {
   1249   for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
   1250     int close_packedlang = doc_tote->Key(sub);
   1251     int subscr = kClosePair[close_packedlang];
   1252     if (subscr == 0) {continue;}
   1253 
   1254     // We have a close pair language -- if the other one is also scored and the
   1255     // longword score differs enough, put all our eggs into one basket
   1256 
   1257     // Nonzero longword score: Go look for the other of this pair
   1258     for (int sub2 = sub + 1; sub2 < doc_tote->MaxSize(); ++sub2) {
   1259       if (kClosePair[doc_tote->Key(sub2)] == subscr) {
   1260         // We have a matching pair
   1261         int close_packedlang2 = doc_tote->Key(sub2);
   1262 
   1263         // Move all the text bytes from lower byte-count to higher one
   1264         int from_sub, to_sub;
   1265         Language from_lang, to_lang;
   1266         if (doc_tote->Value(sub) < doc_tote->Value(sub2)) {
   1267           from_sub = sub;
   1268           to_sub = sub2;
   1269           from_lang = cld::UnpackLanguage(close_packedlang);
   1270           to_lang = cld::UnpackLanguage(close_packedlang2);
   1271         } else {
   1272           from_sub = sub2;
   1273           to_sub = sub;
   1274           from_lang = cld::UnpackLanguage(close_packedlang2);
   1275           to_lang = cld::UnpackLanguage(close_packedlang);
   1276         }
   1277 
   1278         // Move all the bytes smaller => larger of the pair
   1279         if (FLAGS_cld_html || FLAGS_dbgscore) {
   1280           // Show fate of closepair language
   1281           int val = doc_tote->Value(from_sub);
   1282           int reli = doc_tote->Reliability(from_sub);
   1283           int reliable_percent = reli / (val ? val : 1);  // avoid zdiv
   1284           fprintf(stderr, "{CloseLangPair: %s.%d%%(%dB) => %s} ",
   1285                   ExtLanguageCode(from_lang),
   1286                   reliable_percent,
   1287                   doc_tote->Value(from_sub),
   1288                   ExtLanguageCode(to_lang));
   1289         }
   1290         int sum = doc_tote->Value(to_sub) + doc_tote->Value(from_sub);
   1291         doc_tote->SetValue(to_sub, sum);
   1292         doc_tote->SetReliability(to_sub, 100 * sum);
   1293 
   1294         // Delete old entry
   1295         doc_tote->SetKey(from_sub, 0);
   1296         doc_tote->SetValue(from_sub, 0);
   1297         doc_tote->SetReliability(from_sub, 0);
   1298 
   1299         break;    // Exit inner for sub2 loop
   1300       }
   1301     }     // End for sub2
   1302   }   // End for sub
   1303 }
   1304 
   1305 
   1306 void ApplyLanguageHints(Tote* chunk_tote, int tote_grams,
   1307                         uint8* lang_hint_boost) {
   1308   // Need 8 quad/unigrams to give full hint boost, else derate linearly
   1309   if (tote_grams > 8) {
   1310     tote_grams = 8;
   1311   }
   1312   for (int sub = 0; sub < chunk_tote->MaxSize(); ++sub) {
   1313     // Hint boosts are per packed subscript
   1314     int lang_sub = chunk_tote->Key(sub);
   1315     int new_value = chunk_tote->Value(sub) +
   1316       ((lang_hint_boost[lang_sub] * tote_grams) >> 3);
   1317     chunk_tote->SetValue(sub, new_value);
   1318     if (FLAGS_dbgscore && (lang_hint_boost[lang_sub] > 0)) {
   1319       fprintf(stderr, "[%s+=%d*%d/8] ",
   1320               ExtLanguageCode(cld::UnpackLanguage(lang_sub)),
   1321               lang_hint_boost[lang_sub], tote_grams);
   1322     }
   1323   }
   1324 }
   1325 
   1326 
   1327 void PrintHtmlEscapedText(FILE* f, const char* txt, int len) {
   1328   for (int i = 0; i < len; ++i) {
   1329     char c = txt[i];
   1330     if (c == '<') {
   1331       fprintf(f, "&lt;");
   1332     } else if (c == '>') {
   1333       fprintf(f, "&gt;");
   1334     } else if (c == '&') {
   1335       fprintf(f, "&amp;");
   1336     } else if (c == '\'') {
   1337       fprintf(f, "&apos;");
   1338     } else if (c == '"') {
   1339       fprintf(f, "&quot;");
   1340     } else {
   1341       fprintf(f, "%c", c);
   1342     }
   1343   }
   1344   fprintf(f, "<br>\n");
   1345 }
   1346 
   1347 
   1348 // Add one chunk's score to running document score
   1349 // If the top language is UNKNOWN_LANGUAGE, score nothing. This is used to
   1350 // positively identify text to be ignored, such as link farms.
   1351 // Sort before scoring and reinit afterward
   1352 //
   1353 // src and srclen are just for debug output
   1354 void ScoreChunkIntoDoc(const char* src, int srclen, int advance_by,
   1355                        UnicodeLScript lscript,
   1356                        Tote* chunk_tote,
   1357                        ToteWithReliability* doc_tote,
   1358                        int tote_grams,
   1359                        uint8* lang_hint_boost) {
   1360   // Apply hints before sorting
   1361   if (lang_hint_boost) {
   1362     ApplyLanguageHints(chunk_tote, tote_grams, lang_hint_boost);
   1363   }
   1364 
   1365   // Sort to get top two languages
   1366   chunk_tote->Sort(2);
   1367   Language cur_lang = cld::UnpackLanguage(chunk_tote->Key(0));
   1368 
   1369   // Return if empty
   1370   if (cur_lang < 0) {
   1371     chunk_tote->Reinit();
   1372     return;
   1373   }
   1374 
   1375   bool cur_unreliable = false;
   1376 
   1377   // Reliability is a function of mean script score per KB of text
   1378   int len = chunk_tote->GetByteCount();
   1379   int reliability = cld::GetReliability((len * 2) / advance_by,
   1380                                         lscript,
   1381                                         chunk_tote);
   1382   cur_unreliable = (reliability < cld::kMinReliable);
   1383 
   1384   // If tote_grams=0, always reliable
   1385   // If tote_grams=1, always unreliable
   1386   if (tote_grams == 0) {
   1387     reliability = 100;
   1388     cur_unreliable = false;
   1389   } else if (tote_grams == 1) {
   1390     reliability = 0;
   1391     cur_unreliable = true;
   1392   }
   1393 
   1394 #if 0
   1395   // TEMP
   1396   if (FLAGS_cld_html) {
   1397     if (reliability >= kMinReliableKeepPercent) {
   1398       fprintf(stderr, "R%d%% ", reliability);
   1399     } else {
   1400       fprintf(stderr, "--R%d%% ", reliability);
   1401     }
   1402   }
   1403 #endif
   1404 
   1405   // Track the sequence of language fragments [result currently unused]
   1406   ////if (reliability >= kMinReliableSeq) {
   1407   ////  doc_tote->AddSeq(chunk_tote->Key(0));
   1408   ////}
   1409 
   1410   if (cur_unreliable && (chunk_tote->Key(1) != 0)) {
   1411     // Unreliable and two top contenders, split byte count 5/8 - 3/8
   1412     int top_len = ((len * 5) + 4) >> 3;
   1413     int second_len = len - top_len;
   1414 
   1415     doc_tote->Add(chunk_tote->Key(0),
   1416                   top_len, chunk_tote->Value(0), reliability);
   1417     doc_tote->Add(chunk_tote->Key(1),
   1418                   second_len, chunk_tote->Value(1), reliability);
   1419     if (FLAGS_dbgscore) {
   1420       fprintf(stderr, "{+%s.%d.%dR(%dB) +%s.%d.%dR(%dB)} ",
   1421               ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(0))),
   1422               chunk_tote->Value(0),
   1423               reliability,
   1424               top_len,
   1425               ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(1))),
   1426               chunk_tote->Value(1),
   1427               reliability,
   1428               second_len);
   1429     }
   1430   } else {
   1431     // Reliable or single contender
   1432     doc_tote->Add(chunk_tote->Key(0),
   1433                   len, chunk_tote->Value(0), reliability);
   1434     if (FLAGS_dbgscore) {
   1435       fprintf(stderr, "{+%s.%d.%dR(%dB)} ",
   1436               ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(0))),
   1437               chunk_tote->Value(0),
   1438               reliability,
   1439               len);
   1440     }
   1441   }
   1442 
   1443   if (FLAGS_cld_html) {
   1444     if (cur_lang < 0) {cur_lang = UNKNOWN_LANGUAGE;}
   1445     cld::PrintLang(stderr, chunk_tote,
   1446               cur_lang, cur_unreliable,
   1447               prior_lang, prior_unreliable);
   1448     prior_lang = cur_lang;
   1449     prior_unreliable = cur_unreliable;
   1450 
   1451     string temp(src, srclen);
   1452     if (temp[0] == '=') {
   1453       // Rewrite =ScriptX= or =SwitchX= as =Xxxx= for script code Xxxx
   1454       temp = "=Buffered_";
   1455       temp.append(UnicodeLScriptCode(lscript));
   1456       temp.append("=");
   1457     }
   1458     cld::PrintText(stderr, cur_lang, temp);
   1459   }
   1460 
   1461   chunk_tote->Reinit();
   1462 }
   1463 
   1464 
   1465 void PrintTopLang(Language top_lang) {
   1466   if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {
   1467     fprintf(stderr, "[] ");
   1468   } else {
   1469     fprintf(stderr, "[%s] ", ExtLanguageName(top_lang));
   1470     prior_lang = top_lang;
   1471   }
   1472 }
   1473 
   1474 void PrintTopLangSpeculative(Language top_lang) {
   1475   fprintf(stderr, "<span style=\"color:#%06X;\">", 0xa0a0a0);
   1476   if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {
   1477     fprintf(stderr, "[] ");
   1478   } else {
   1479     fprintf(stderr, "[%s] ", ExtLanguageName(top_lang));
   1480     prior_lang = top_lang;
   1481   }
   1482   fprintf(stderr, "</span>\n");
   1483 }
   1484 
   1485 
   1486 // Add one chunk's score to running document score
   1487 // Convenience function with constant src text
   1488 void ScoreChunkIntoDoc2(const char* src, int advance_by,
   1489                        UnicodeLScript lscript,
   1490                        Tote* chunk_tote,
   1491                        ToteWithReliability* doc_tote,
   1492                        int tote_grams,
   1493                        uint8* lang_hint_boost) {
   1494   int srclen = static_cast<int>(strlen(src));
   1495   ScoreChunkIntoDoc(src, srclen, advance_by, lscript, chunk_tote,
   1496                     doc_tote, tote_grams, lang_hint_boost);
   1497 }
   1498 
   1499 
   1500 // Score one scriptspan using the only language for that script
   1501 void ScoreNilgrams(getone::LangSpan* scriptspan, int lang,
   1502                   ToteWithReliability* doc_tote,
   1503                   uint8* lang_hint_boost,
   1504                   int flags, Language plus_one) {
   1505   // For debugging only. Not thread-safe
   1506   prior_lang = UNKNOWN_LANGUAGE;
   1507   prior_unreliable = false;
   1508 
   1509   const char* src = scriptspan->text;
   1510   int len = scriptspan->text_bytes;
   1511 
   1512   Tote chunk_tote;
   1513   // Score 1000 for 1000 bytes
   1514   chunk_tote.AddGram();
   1515   chunk_tote.Add(lang, scriptspan->text_bytes);
   1516   chunk_tote.AddBytes(scriptspan->text_bytes);
   1517   int advance_by = 2;
   1518   int tote_grams = 0;   // Indicates fully reliable
   1519   ScoreChunkIntoDoc(src, len, advance_by,
   1520                     scriptspan->script, &chunk_tote,
   1521                     doc_tote, tote_grams, lang_hint_boost);
   1522 }
   1523 
   1524 // Score one scriptspan using unigrams
   1525 // Updates tote_grams
   1526 static void ScoreUnigrams(const UTF8PropObj* unigram_obj,
   1527                       getone::LangSpan* scriptspan,
   1528                       int* tote_grams, int gram_limit,
   1529                       Tote* chunk_tote,
   1530                       ToteWithReliability* doc_tote,
   1531                       uint8* lang_hint_boost,
   1532                       int advance_by, int flags,
   1533                    int* initial_word_span, Language plus_one) {
   1534   // chunk_tote may have partial sum coming in
   1535   const char* src = scriptspan->text;
   1536   const char* srclimit = src + scriptspan->text_bytes;
   1537 
   1538   // For debugging only. Not thread-safe
   1539   prior_lang = UNKNOWN_LANGUAGE;
   1540   prior_unreliable = false;
   1541 
   1542   // Break text up into multiple chunks and score each
   1543   while (src < srclimit) {
   1544     // Updates tote_grams
   1545     int len = cld::DoUniScoreV3(unigram_obj,
   1546                                  src, srclimit - src, advance_by,
   1547                                  tote_grams, gram_limit, chunk_tote);
   1548     if (FlagUseWords(flags) || (*initial_word_span > 0)) {
   1549       // Use bigram scoring in addition to quadgrams
   1550       cld::DoBigramScoreV3(&kCjkBiTable_obj,
   1551                            src, len, chunk_tote);
   1552     }
   1553     chunk_tote->AddBytes(len);
   1554     *initial_word_span -= len;
   1555 
   1556     if (*tote_grams >= gram_limit) {
   1557       // Add this chunk to doc totals
   1558       // Remove all but top40 if asked
   1559       if (FlagTop40(flags)) {
   1560         cld::DemoteNotTop40(chunk_tote, cld::PackLanguage(plus_one));
   1561       }
   1562 
   1563       // Sort, accumulate into doc total, reinit
   1564       ScoreChunkIntoDoc(src, len, advance_by,
   1565                         scriptspan->script, chunk_tote,
   1566                         doc_tote, *tote_grams, lang_hint_boost);
   1567       *tote_grams = 0;
   1568     } else {
   1569       if (FLAGS_cld_html) {
   1570         string temp(src, len);
   1571         Language top_lang = cld::UnpackLanguage(chunk_tote->CurrentTopKey());
   1572         PrintTopLangSpeculative(top_lang);
   1573         cld::PrintText(stderr, top_lang, temp);
   1574       }
   1575     }
   1576     src += len;
   1577   }
   1578   // chunk_tote may have partial sum going out
   1579 }
   1580 
   1581 // Back up one UTF-8 character
   1582 const uint8* BackOneUTF8(const uint8* p) {
   1583   const uint8* retval = p - 1;
   1584   if ((*retval & 0xc0) == 0x80) {--retval;}
   1585   if ((*retval & 0xc0) == 0x80) {--retval;}
   1586   if ((*retval & 0xc0) == 0x80) {--retval;}
   1587   return retval;
   1588 }
   1589 
   1590 
   1591 // Score one scriptspan using quadgrams
   1592 // Incoming chunk_tote may have partial accumulation
   1593 static void ScoreQuadgrams(const cld::CLDTableSummary* quadgram_obj,
   1594                        getone::LangSpan* scriptspan,
   1595                        int* tote_grams, int gram_limit,
   1596                        Tote* chunk_tote,
   1597                        ToteWithReliability* doc_tote,
   1598                        uint8* lang_hint_boost,
   1599                        int advance_by, int flags,
   1600                        int* initial_word_span, Language plus_one) {
   1601   // chunk_tote may have partial sum coming in
   1602   const char* src = scriptspan->text;
   1603   const char* srclimit = src + scriptspan->text_bytes;
   1604   const char* lastscored_src = src;
   1605 
   1606   // For debugging only. Not thread-safe
   1607   prior_lang = UNKNOWN_LANGUAGE;
   1608   prior_unreliable = false;
   1609 
   1610   // Break text up into multiple chunks and score each
   1611   while (src < srclimit) {
   1612     // Updates tote_grams
   1613     int len = cld::DoQuadScoreV3(quadgram_obj,
   1614                                  src, srclimit - src, advance_by,
   1615                                  tote_grams, gram_limit, chunk_tote);
   1616     if (FlagUseWords(flags) || (*initial_word_span > 0)) {
   1617       // Use word scoring in addition to quadgrams
   1618       cld::DoOctaScoreV3(&kLongWord8Table_obj,
   1619                          src, len, chunk_tote);
   1620     }
   1621     chunk_tote->AddBytes(len);
   1622     *initial_word_span -= len;
   1623 
   1624     if (*tote_grams >= gram_limit) {
   1625       // Remove all but top40 if asked
   1626       if (FlagTop40(flags)) {
   1627         cld::DemoteNotTop40(chunk_tote, cld::PackLanguage(plus_one));
   1628       }
   1629 
   1630       // Sort, accumulate into doc total, reinit
   1631       ScoreChunkIntoDoc(src, len, advance_by,
   1632                         scriptspan->script, chunk_tote,
   1633                         doc_tote, *tote_grams, lang_hint_boost);
   1634       lastscored_src = src + len;
   1635       *tote_grams = 0;
   1636     } else {
   1637       if (FLAGS_cld_html) {
   1638         string temp(src, len);
   1639         Language top_lang = cld::UnpackLanguage(chunk_tote->CurrentTopKey());
   1640         PrintTopLangSpeculative(top_lang);
   1641         cld::PrintText(stderr, top_lang, temp);
   1642       }
   1643     }
   1644     src += len;
   1645   }
   1646 }
   1647 
   1648 
   1649 
   1650 void PrintLangs(FILE* f, const Language* language3, const int* percent3,
   1651                 const int* text_bytes, const bool* is_reliable) {
   1652   fprintf(f, "<br>&nbsp;&nbsp;Initial_Languages ");
   1653   if (language3[0] != UNKNOWN_LANGUAGE) {
   1654     fprintf(f, "%s%s(%d%%)  ",
   1655             ExtLanguageName(language3[0]),
   1656             *is_reliable ? "" : "*",
   1657             percent3[0]);
   1658   }
   1659   if (language3[1] != UNKNOWN_LANGUAGE) {
   1660     fprintf(f, "%s(%d%%)  ", ExtLanguageName(language3[1]), percent3[1]);
   1661   }
   1662   if (language3[2] != UNKNOWN_LANGUAGE) {
   1663     fprintf(f, "%s(%d%%)  ", ExtLanguageName(language3[2]), percent3[2]);
   1664   }
   1665   fprintf(f, "%d bytes \n", *text_bytes);
   1666 
   1667   fprintf(f, "<br>\n");
   1668 }
   1669 
   1670 
   1671 // Start the tote with a count of one for the default language for script
   1672 void InitScriptToteLang(Tote* script_tote, UnicodeLScript lscript) {
   1673   Language defaultlang = cld::kDefaultLanguagePerLScript[lscript];
   1674   script_tote->Add(cld::PackLanguage(defaultlang), 1);
   1675   script_tote->AddBytes(1);
   1676 #if 0
   1677   if (FLAGS_cld_html) {
   1678     cld::PrintLang(stderr, script_tote,
   1679               defaultlang, false,
   1680               UNKNOWN_LANGUAGE, false);
   1681     prior_lang = cur_lang;
   1682     string temp("+1");
   1683     cld::PrintText(stderr, defaultlang, temp);
   1684   }
   1685 #endif
   1686 }
   1687 
   1688 static const char* const kToteName[4] =
   1689   {"=Latn=", "=Hani=", "=Script2=", "=Script3="};
   1690 static const char* const kToteSwitch[4] =
   1691   {"=Latn=", "=Hani=", "=Switch2=", "=Switch3="};
   1692 
   1693 
   1694 
   1695 // Upper to lower, keep digits, everything else to minus '-' (2d)
   1696 static const char kCharsetToLowerTbl[256] = {
   1697   0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
   1698   0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
   1699   0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
   1700   0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37, 0x38,0x39,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
   1701 
   1702   0x2d,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
   1703   0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x2d,0x2d,0x2d,0x2d,0x2d,
   1704   0x2d,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
   1705   0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x2d,0x2d,0x2d,0x2d,0x2d,
   1706 
   1707   0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
   1708   0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
   1709   0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
   1710   0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
   1711 
   1712   0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
   1713   0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
   1714   0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
   1715   0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,
   1716 };
   1717 
   1718 
   1719 static const char kIsAlpha[256] = {
   1720   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
   1721   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
   1722   0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,
   1723   0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,
   1724 
   1725   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
   1726   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
   1727   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
   1728   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
   1729 };
   1730 
   1731 static const char kIsDigit[256] = {
   1732   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
   1733   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1, 1,1,0,0,0,0,0,0,
   1734   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
   1735   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
   1736 
   1737   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
   1738   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
   1739   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
   1740   0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
   1741 };
   1742 
   1743 // Normalize ASCII string to first 4 alphabetic/digit chars
   1744 // Letters are forced to lowercase ASCII
   1745 // Used to normalize TLD values
   1746 void MakeChar4(const char* str, char* norm) {
   1747   memcpy(norm, "____", 4);     // four underscores
   1748   int l_ptr = 0;
   1749   for (int i = 0; i < strlen(str); ++i) {
   1750     uint8 uc = static_cast<uint8>(str[i]);
   1751     if (kIsAlpha[uc] | kIsDigit[uc]) {
   1752       if (l_ptr < 4) {                  // Else ignore
   1753         norm[l_ptr] = kCharsetToLowerTbl[uc];
   1754         l_ptr++;
   1755       }
   1756     }
   1757   }
   1758 }
   1759 
   1760 // Find subscript of matching key in first 4 bytes of sorted hint array, or -1
   1761 static int HintBinaryLookup4(const HintEntry* hintprobs, int hintprobssize,
   1762                      const char* norm_key) {
   1763   // Key is always in range [lo..hi)
   1764   int lo = 0;
   1765   int hi = hintprobssize;
   1766   while (lo < hi) {
   1767     int mid = (lo + hi) >> 1;
   1768     int comp = memcmp(&hintprobs[mid].key[0], norm_key, 4);
   1769     if (comp < 0) {
   1770       lo = mid + 1;
   1771     } else if (comp > 0) {
   1772       hi = mid;
   1773     } else {
   1774       return mid;
   1775     }
   1776   }
   1777   return -1;
   1778 }
   1779 
   1780 
   1781 // Increment the initial probabilities based on a per-TLD probs entry
   1782 void ApplyTLDHint(uint8* lang_hint_boost, const char* tld_hint) {
   1783   if (FLAGS_dbgscore) {
   1784     fprintf(stderr, "TLD hint %s\n", tld_hint);
   1785   }
   1786   char normalized_tld[8];
   1787   MakeChar4(tld_hint, normalized_tld);
   1788   int n = HintBinaryLookup4(kTLDHintProbs, kTLDHintProbsSize,
   1789                            normalized_tld);
   1790   // TLD is four bytes, probability entry is 4 bytes
   1791   if (n >= 0) {
   1792     uint32 probs = kTLDHintProbs[n].probs;
   1793 
   1794     uint8 prob123 = (probs >> 0) & 0xff;
   1795     const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
   1796     uint8 top1 = (probs >> 8) & 0xff;
   1797     if (top1 > 0) {lang_hint_boost[top1] += cld::LgProb3(prob123_entry, 0);}
   1798     uint8 top2 = (probs >> 16) & 0xff;
   1799     if (top2 > 0) {lang_hint_boost[top2] += cld::LgProb3(prob123_entry, 1);}
   1800     uint8 top3 = (probs >> 24) & 0xff;
   1801     if (top3 > 0) {lang_hint_boost[top3] += cld::LgProb3(prob123_entry, 2);}
   1802   }
   1803 }
   1804 
   1805 
   1806 // Increment the initial probabilities based on a per-encoding probs entry
   1807 void ApplyEncodingHint(uint8* lang_hint_boost, int encoding_hint) {
   1808   if (FLAGS_dbgscore) {
   1809     Encoding tempenc = static_cast<Encoding>(encoding_hint);
   1810     fprintf(stderr, "ENC hint %s\n", EncodingName(tempenc));
   1811   }
   1812   if (encoding_hint < ISO_8859_1) {return;}
   1813   if (encoding_hint >= NUM_ENCODINGS) {return;}
   1814   uint32 probs = kEncodingHintProbs[encoding_hint];
   1815 
   1816   uint8 prob123 = (probs >> 0) & 0xff;
   1817   const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
   1818   uint8 top1 = (probs >> 8) & 0xff;
   1819   if (top1 > 0) {lang_hint_boost[top1] += cld::LgProb3(prob123_entry, 0);}
   1820   uint8 top2 = (probs >> 16) & 0xff;
   1821   if (top2 > 0) {lang_hint_boost[top2] += cld::LgProb3(prob123_entry, 1);}
   1822   uint8 top3 = (probs >> 24) & 0xff;
   1823   if (top3 > 0) {lang_hint_boost[top3] += cld::LgProb3(prob123_entry, 2);}
   1824 }
   1825 
   1826 
   1827 // Increment the initial probability for given language by fixed amount
   1828 // Does not recognize extended languages as hints
   1829 void ApplyLanguageHint(uint8* lang_hint_boost, Language language_hint) {
   1830   if (FLAGS_dbgscore) {
   1831     fprintf(stderr, "LANG hint %s\n", ExtLanguageName(language_hint));
   1832   }
   1833   if (language_hint < ENGLISH) {return;}
   1834   if (language_hint >= NUM_LANGUAGES) {return;}
   1835   uint32 probs = kLanguageHintProbs[language_hint];
   1836 
   1837   uint8 prob123 = (probs >> 0) & 0xff;
   1838   const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
   1839   uint8 top1 = (probs >> 8) & 0xff;
   1840   if (top1 > 0) {lang_hint_boost[top1] += cld::LgProb3(prob123_entry, 0);}
   1841   uint8 top2 = (probs >> 16) & 0xff;
   1842   if (top2 > 0) {lang_hint_boost[top2] += cld::LgProb3(prob123_entry, 1);}
   1843   uint8 top3 = (probs >> 24) & 0xff;
   1844   if (top3 > 0) {lang_hint_boost[top3] += cld::LgProb3(prob123_entry, 2);}
   1845 }
   1846 
   1847 // Extract return values before fixups
   1848 void ExtractLangEtc(ToteWithReliability* doc_tote, int total_text_bytes,
   1849                     int* reliable_percent3, Language* language3, int* percent3,
   1850                     double*  normalized_score3,
   1851                     int* text_bytes, bool* is_reliable) {
   1852   reliable_percent3[0] = 0;
   1853   reliable_percent3[1] = 0;
   1854   reliable_percent3[2] = 0;
   1855   language3[0] = UNKNOWN_LANGUAGE;
   1856   language3[1] = UNKNOWN_LANGUAGE;
   1857   language3[2] = UNKNOWN_LANGUAGE;
   1858   percent3[0] = 100;
   1859   percent3[1] = 0;
   1860   percent3[2] = 0;
   1861   normalized_score3[0] = 0.0;
   1862   normalized_score3[1] = 0.0;
   1863   normalized_score3[2] = 0.0;
   1864 
   1865   *text_bytes = total_text_bytes;
   1866   *is_reliable = false;
   1867 
   1868   int bytecount1 = total_text_bytes;
   1869   int bytecount2 = 0;
   1870   int bytecount3 = 0;
   1871 
   1872   int lang1 = doc_tote->Key(0);
   1873   if (lang1 != 0) {
   1874     // We have a top language
   1875     language3[0] = cld::UnpackLanguage(lang1);
   1876     bytecount1 = doc_tote->Value(0);
   1877     int reli1 = doc_tote->Reliability(0);
   1878     reliable_percent3[0] = reli1 / (bytecount1 ? bytecount1 : 1);  // avoid zdiv
   1879     normalized_score3[0] = cld::GetNormalizedScore(language3[0],
   1880                                                   ULScript_Common,
   1881                                                   bytecount1,
   1882                                                   doc_tote->Score(0));
   1883   }
   1884 
   1885   int lang2 = doc_tote->Key(1);
   1886   if (lang2 != 0) {
   1887     language3[1] = cld::UnpackLanguage(lang2);
   1888     bytecount2 = doc_tote->Value(1);
   1889     int reli2 = doc_tote->Reliability(1);
   1890     reliable_percent3[1] = reli2 / (bytecount2 ? bytecount2 : 1);  // avoid zdiv
   1891     normalized_score3[1] = cld::GetNormalizedScore(language3[1],
   1892                                                   ULScript_Common,
   1893                                                   bytecount2,
   1894                                                   doc_tote->Score(1));
   1895   }
   1896 
   1897   int lang3 = doc_tote->Key(2);
   1898   if (lang3 != 0) {
   1899     language3[2] = cld::UnpackLanguage(lang3);
   1900     bytecount3 = doc_tote->Value(2);
   1901     int reli3 = doc_tote->Reliability(2);
   1902     reliable_percent3[2] = reli3 / (bytecount3 ? bytecount3 : 1);  // avoid zdiv
   1903     normalized_score3[2] = cld::GetNormalizedScore(language3[2],
   1904                                                   ULScript_Common,
   1905                                                   bytecount3,
   1906                                                   doc_tote->Score(2));
   1907   }
   1908 
   1909   // Increase total bytes to sum (top 3) if low for some reason
   1910   int total_bytecount12 = bytecount1 + bytecount2;
   1911   int total_bytecount123 = total_bytecount12 + bytecount3;
   1912   if (total_text_bytes < total_bytecount123) {
   1913     total_text_bytes = total_bytecount123;
   1914     *text_bytes = total_text_bytes;
   1915   }
   1916 
   1917   // Sum minus previous % gives better roundoff behavior than bytecount/total
   1918   int total_text_bytes_div = cld::maxint(1, total_text_bytes);    // Avoid zdiv
   1919   percent3[0] = (bytecount1 * 100) / total_text_bytes_div;
   1920   percent3[1] = (total_bytecount12 * 100) / total_text_bytes_div;
   1921   percent3[2] = (total_bytecount123 * 100) / total_text_bytes_div;
   1922   percent3[2] -= percent3[1];
   1923   percent3[1] -= percent3[0];
   1924 
   1925   // Roundoff, say 96% 1.6% 1.4%, will produce non-obvious 96% 1% 2%
   1926   // Fix this explicitly
   1927   if (percent3[1] < percent3[2]) {
   1928     ++percent3[1];
   1929     --percent3[2];
   1930   }
   1931   if (percent3[0] < percent3[1]) {
   1932     ++percent3[0];
   1933     --percent3[1];
   1934   }
   1935 
   1936   *text_bytes = total_text_bytes;
   1937 
   1938   if (lang1 != 0) {
   1939     // We have a top language
   1940     // Its reliability is overal result reliability
   1941     int bytecount = doc_tote->Value(0);
   1942     int reli = doc_tote->Reliability(0);
   1943     int reliable_percent = reli / (bytecount ? bytecount : 1);  // avoid zdiv
   1944     *is_reliable = reliable_percent >= cld::kMinReliable;
   1945   } else {
   1946     // No top language at all. This can happen with zero text or 100% Klingon
   1947     // if extended=false. Just return all UNKNOWN_LANGUAGE, reliable.
   1948     *is_reliable = true;
   1949   }
   1950 }
   1951 
   1952 bool IsFIGS(Language lang) {
   1953   if (lang == FRENCH) {return true;}
   1954   if (lang == ITALIAN) {return true;}
   1955   if (lang == GERMAN) {return true;}
   1956   if (lang == SPANISH) {return true;}
   1957   return false;
   1958 }
   1959 
   1960 bool IsEFIGS(Language lang) {
   1961   if (lang == ENGLISH) {return true;}
   1962   if (lang == FRENCH) {return true;}
   1963   if (lang == ITALIAN) {return true;}
   1964   if (lang == GERMAN) {return true;}
   1965   if (lang == SPANISH) {return true;}
   1966   return false;
   1967 }
   1968 
   1969 static const int kNonEnBoilerplateMinPercent = 17;    // <this => no second
   1970 static const int kNonFIGSBoilerplateMinPercent = 20;  // <this => no second
   1971 static const int kGoodFirstMinPercent = 26;           // <this => UNK
   1972 static const int kGoodFirstReliableMinPercent = 51;   // <this => unreli
   1973 static const int kIgnoreMaxPercent = 95;              // >this => unreli
   1974 static const int kKeepMinPercent = 2;                 // <this => unreli
   1975 
   1976 // For Tier3 languages, require more bytes of text to override
   1977 // the first-place language
   1978 static const int kGoodSecondT1T2MinBytes = 15;        // <this => no second
   1979 static const int kGoodSecondT3MinBytes = 128;         // <this => no second
   1980                                                       //
   1981 
   1982 // Calculate a single summary language for the document, and its reliability.
   1983 // Returns language3[0] or language3[1] or ENGLISH or UNKNOWN_LANGUAGE
   1984 // This is the heart of matching human-rater perception.
   1985 // reliable_percent3[] is currently unused
   1986 //
   1987 // Do not return Tier3 second language unless there are at least 128 bytes
   1988 void CalcSummaryLang(ToteWithReliability* doc_tote, int total_text_bytes,
   1989                      const int* reliable_percent3,
   1990                      const Language* language3,
   1991                      const int* percent3,
   1992                      Language* summary_lang, bool* is_reliable) {
   1993   // Vector of active languages; changes if we delete some
   1994   int slot_count = 3;
   1995   int active_slot[3] = {0, 1, 2};
   1996 
   1997   int ignore_percent = 0;
   1998   int return_percent = percent3[0];   // Default to top lang
   1999   *summary_lang = language3[0];
   2000   *is_reliable = true;
   2001   if (percent3[0] < kKeepMinPercent) {*is_reliable = false;}
   2002 
   2003   // If any of top 3 is IGNORE, remove it and increment ignore_percent
   2004   for (int i = 0; i < 3; ++i) {
   2005     if (language3[i] == TG_UNKNOWN_LANGUAGE) {
   2006       ignore_percent += percent3[i];
   2007       // Move the rest up, levaing input vectors unchanged
   2008       for (int j=i+1; j < 3; ++j) {
   2009         active_slot[j - 1] = active_slot[j];
   2010       }
   2011       -- slot_count;
   2012       // Logically remove Ignore from percentage-text calculation
   2013       // (extra 1 in 101 avoids zdiv, biases slightly small)
   2014       return_percent = (percent3[0] * 100) / (101 - ignore_percent);
   2015       *summary_lang = language3[active_slot[0]];
   2016       if (percent3[active_slot[0]] < kKeepMinPercent) {*is_reliable = false;}
   2017     }
   2018   }
   2019 
   2020 
   2021   // If English and X, where X (not UNK) is big enough,
   2022   // assume the English is boilerplate and return X.
   2023   // Logically remove English from percentage-text calculation
   2024   int second_bytes = (total_text_bytes * percent3[active_slot[1]]) / 100;
   2025   // Require more bytes of text for Tier3 languages
   2026   int minbytesneeded = kGoodSecondT1T2MinBytes;
   2027   int plang_second =  cld::PackLanguage(language3[active_slot[1]]);
   2028   bool is_tier3 = (cld::kIsPackedTop40[plang_second] == 0);
   2029   if (is_tier3) {
   2030     minbytesneeded = kGoodSecondT3MinBytes;
   2031   }
   2032 
   2033   if ((language3[active_slot[0]] == ENGLISH) &&
   2034       (language3[active_slot[1]] != ENGLISH) &&
   2035       (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
   2036       (percent3[active_slot[1]] >= kNonEnBoilerplateMinPercent) &&
   2037       (second_bytes >= minbytesneeded)) {
   2038     ignore_percent += percent3[active_slot[0]];
   2039     return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
   2040     *summary_lang = language3[active_slot[1]];
   2041     if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
   2042 
   2043   // Else If FIGS and X, where X (not UNK, EFIGS) is big enough,
   2044   // assume the FIGS is boilerplate and return X.
   2045   // Logically remove FIGS from percentage-text calculation
   2046   } else if (IsFIGS(language3[active_slot[0]]) &&
   2047              !IsEFIGS(language3[active_slot[1]]) &&
   2048              (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
   2049              (percent3[active_slot[1]] >= kNonFIGSBoilerplateMinPercent) &&
   2050              (second_bytes >= minbytesneeded)) {
   2051     ignore_percent += percent3[active_slot[0]];
   2052     return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
   2053     *summary_lang = language3[active_slot[1]];
   2054     if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
   2055 
   2056   // Else we are returning the first language, but want to improve its
   2057   // return_percent if the second language should be ignored
   2058   } else  if ((language3[active_slot[1]] == ENGLISH) &&
   2059               (language3[active_slot[0]] != ENGLISH)) {
   2060     ignore_percent += percent3[active_slot[1]];
   2061     return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
   2062   } else  if (IsFIGS(language3[active_slot[1]]) &&
   2063               !IsEFIGS(language3[active_slot[0]])) {
   2064     ignore_percent += percent3[active_slot[1]];
   2065     return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
   2066   }
   2067 
   2068   // If return percent is too small (too many languages), return UNKNOWN
   2069   if ((return_percent < kGoodFirstMinPercent)) {
   2070     *summary_lang = UNKNOWN_LANGUAGE;
   2071     *is_reliable = false;
   2072   }
   2073 
   2074   // If return percent is small, return language but set unreliable.
   2075   if ((return_percent < kGoodFirstReliableMinPercent)) {
   2076     *is_reliable = false;
   2077   }
   2078 
   2079   // If ignore percent is too large, set unreliable.
   2080   if ((ignore_percent > kIgnoreMaxPercent)) {
   2081     *is_reliable = false;
   2082   }
   2083 
   2084   // If we removed all the active languages, return UNKNOWN
   2085   if (slot_count == 0) {
   2086     *summary_lang = UNKNOWN_LANGUAGE;
   2087     *is_reliable = false;
   2088   }
   2089 }
   2090 
   2091 
   2092 
   2093 // Result vector must be exactly three items
   2094 Language CompactLangDetImpl::DetectLanguageSummaryV25(
   2095                         const CompactLangDet::DetectionTables* tables,
   2096                         const char* buffer,
   2097                         int buffer_length,
   2098                         bool is_plain_text,
   2099                         const char* tld_hint,       // "id" boosts Indonesian
   2100                         int encoding_hint,          // SJS boosts Japanese
   2101                         Language language_hint,     // ITALIAN boosts it
   2102                         bool allow_extended_lang,
   2103                         int flags,
   2104                         Language plus_one,
   2105                         Language* language3,
   2106                         int* percent3,
   2107                         double* normalized_score3,
   2108                         int* text_bytes,
   2109                         bool* is_reliable) {
   2110   if (!tables) {
   2111     static const CompactLangDet::DetectionTables default_cld_tables = {
   2112       &kQuadTable_obj,
   2113       &compact_lang_det_generated_ctjkvz_b1_obj
   2114     };
   2115     tables = &default_cld_tables;
   2116   }
   2117   language3[0] = UNKNOWN_LANGUAGE;
   2118   language3[1] = UNKNOWN_LANGUAGE;
   2119   language3[2] = UNKNOWN_LANGUAGE;
   2120   percent3[0] = 100;
   2121   percent3[1] = 0;
   2122   percent3[2] = 0;
   2123   normalized_score3[0] = 0.0;
   2124   normalized_score3[1] = 0.0;
   2125   normalized_score3[2] = 0.0;
   2126   *text_bytes = 0;
   2127   *is_reliable = false;
   2128 
   2129   // Document totals
   2130   ToteWithReliability doc_tote;   // Reliability = 0..100
   2131 
   2132   // Vector of packed per-language boosts (just one filled in from hints)
   2133   uint8 lang_hint_boost[EXT_NUM_LANGUAGES + 1];
   2134   memset(lang_hint_boost, 0, sizeof(lang_hint_boost));
   2135 
   2136   // Apply hints,if any
   2137   if ((tld_hint != NULL) && (tld_hint[0] != '\0')) {
   2138     ApplyTLDHint(lang_hint_boost, tld_hint);
   2139   }
   2140   if (encoding_hint != UNKNOWN_ENCODING) {
   2141     ApplyEncodingHint(lang_hint_boost, encoding_hint);
   2142   }
   2143   if (language_hint != UNKNOWN_LANGUAGE) {
   2144     ApplyLanguageHint(lang_hint_boost, language_hint);
   2145   }
   2146 
   2147 
   2148   // Four individual script totals, Latin, Han, other2, other3
   2149   int next_other_tote = 2;
   2150 
   2151   // Four totes for up to four different scripts pending at once
   2152   Tote totes[4];                  // [0] Latn  [1] Hani  [2] other  [3] other
   2153   bool tote_seen[4] = {false, false, false, false};
   2154   int tote_grams[4] = {0, 0, 0, 0};     // Number in partial chunk
   2155   UnicodeLScript tote_script[4] =
   2156     {ULScript_Latin, ULScript_HanCJK, ULScript_Common, ULScript_Common};
   2157 
   2158   // Loop through text spans in a single script
   2159   ScriptScanner ss(buffer, buffer_length, is_plain_text);
   2160   getone::LangSpan scriptspan;
   2161 
   2162   scriptspan.text = NULL;
   2163   scriptspan.text_bytes = 0;
   2164   scriptspan.offset = 0;
   2165   scriptspan.script = ULScript_Common;
   2166   scriptspan.lang = UNKNOWN_LANGUAGE;
   2167 
   2168   int total_text_bytes = 0;
   2169   int textlimit = FLAGS_cld_textlimit << 10;    // in KB
   2170   if (textlimit == 0) {textlimit = 0x7fffffff;}
   2171 
   2172   int advance_by = 2;                   // Advance 2 bytes
   2173   int advance_limit = textlimit >> 3;   // For first 1/8 of max document
   2174 
   2175   int initial_word_span = kDefaultWordSpan;
   2176   if (FLAGS_cld_forcewords) {
   2177     initial_word_span = kReallyBigWordSpan;
   2178   }
   2179 
   2180   // Pick up chunk sizes
   2181   // Smoothwidth is units of quadgrams, about 2.5 chars (unigrams) each
   2182   // Sanity check -- force into a reasonable range
   2183   int chunksizequads = FLAGS_cld_smoothwidth;
   2184   chunksizequads = cld::minint(cld::maxint(chunksizequads, kMinChunkSizeQuads),
   2185                                kMaxChunkSizeQuads);
   2186   int chunksizeunis = (chunksizequads * 5) >> 1;
   2187 
   2188   // Varying short-span limit doesn't work well -- skips too much beyond 20KB
   2189   // int spantooshortlimit = advance_by * FLAGS_cld_smoothwidth;
   2190   int spantooshortlimit = kShortSpanThresh;
   2191 
   2192   // For debugging only. Not thread-safe
   2193   prior_lang = UNKNOWN_LANGUAGE;
   2194   prior_unreliable = false;
   2195 
   2196   // Allocate full-document prediction table for finding repeating words
   2197   int hash = 0;
   2198   int* predict_tbl = new int[kPredictionTableSize];
   2199   if (FlagRepeats(flags)) {
   2200     memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
   2201   }
   2202 
   2203   // Loop through scriptspans accumulating number of text bytes in each language
   2204   while (ss.GetOneScriptSpanLower(&scriptspan)) {
   2205     UnicodeLScript lscript = scriptspan.script;
   2206 
   2207     // Echo text if asked to
   2208     if (FLAGS_cld_echotext) {
   2209       PrintHtmlEscapedText(stderr, scriptspan.text, scriptspan.text_bytes);
   2210     }
   2211 
   2212     // Squeeze out big chunks of text span if asked to
   2213     if (FlagSqueeze(flags)) {
   2214       // Remove repetitive or mostly-spaces chunks
   2215       int newlen;
   2216       int chunksize = 0;    // Use the default
   2217       newlen = CheapSqueezeInplace(scriptspan.text, scriptspan.text_bytes,
   2218                                    chunksize);
   2219       scriptspan.text_bytes = newlen;
   2220     } else {
   2221       // Check now and then to see if we should be squeezing
   2222       if ((total_text_bytes >= kCheapSqueezeTestThresh) &&
   2223           !FlagFinish(flags) &&
   2224           ((getone::kMaxScriptBuffer >> 1) < scriptspan.text_bytes) &&
   2225           CheapSqueezeTriggerTest(scriptspan.text,
   2226                                     scriptspan.text_bytes,
   2227                                     kCheapSqueezeTestLen)) {
   2228         // Recursive call with big-chunk squeezing set
   2229         if (FLAGS_cld_html || FLAGS_dbgscore) {
   2230           fprintf(stderr,
   2231                   "<br>---text_bytes[%d] Recursive(Squeeze)---<br><br>\n",
   2232                   total_text_bytes);
   2233         }
   2234         // Deallocate full-document prediction table
   2235         delete[] predict_tbl;
   2236 
   2237         return DetectLanguageSummaryV25(
   2238                           tables,
   2239                           buffer,
   2240                           buffer_length,
   2241                           is_plain_text,
   2242                           tld_hint,               // "id" boosts Indonesian
   2243                           encoding_hint,          // SJS boosts Japanese
   2244                           language_hint,          // ITALIAN boosts it
   2245                           allow_extended_lang,
   2246                           flags | kCLDFlagSqueeze,
   2247                           plus_one,
   2248                           language3,
   2249                           percent3,
   2250                           normalized_score3,
   2251                           text_bytes,
   2252                           is_reliable);
   2253       }
   2254     }
   2255 
   2256     // Remove repetitive words if asked to
   2257     if (FlagRepeats(flags)) {
   2258       // Remove repetitive words
   2259       int newlen;
   2260       newlen = CheapRepWordsInplace(scriptspan.text, scriptspan.text_bytes,
   2261                                     &hash, predict_tbl);
   2262       scriptspan.text_bytes = newlen;
   2263     }
   2264 
   2265     // The real scoring
   2266     // Accumulate directly into the document total, or accmulate in one of four
   2267     // chunk totals. The purpose of the multiple chunk totals is to piece
   2268     // together short choppy pieces of text in alternating scripts. One total is
   2269     // dedicated to Latin text, one to Han text, and the other two are dynamicly
   2270     // assigned.
   2271     Language onlylang = cld::kOnlyLanguagePerLScript[lscript];
   2272 
   2273     if (onlylang != UNKNOWN_LANGUAGE) {
   2274       // This entire script run is in a single language.
   2275       ScoreNilgrams(&scriptspan, cld::PackLanguage(onlylang), &doc_tote,
   2276                    lang_hint_boost, flags, plus_one);
   2277     } else if (cld::kScoreUniPerLScript[lscript] != 0) {
   2278       // This entire script run's languages can be distinguished by uni-grams
   2279       // Accumulate in hani_tote
   2280       int tote_num = 1;
   2281       if (!tote_seen[tote_num]) {
   2282         tote_seen[tote_num] = true;
   2283         // Default language gets 1 byte
   2284         total_text_bytes += 1;
   2285         InitScriptToteLang(&totes[tote_num], lscript);
   2286       }
   2287       ScoreUnigrams(tables->unigram_obj,
   2288                        &scriptspan, &tote_grams[tote_num], chunksizeunis,
   2289                        &totes[tote_num],
   2290                        &doc_tote, lang_hint_boost,
   2291                        advance_by, flags, &initial_word_span, plus_one);
   2292     } else {
   2293       // This entire script-run's languages can be distinguished by quad-grams
   2294       // Accumulate in latn_tote or script0/1_tote
   2295       int tote_num = -1;
   2296       for (int t = 0; t < 4; ++t) {
   2297         if (lscript == tote_script[t]) {
   2298           tote_num = t;
   2299           break;
   2300         }
   2301       }
   2302       if (tote_num < 0) {
   2303         // Need to allocate other0/1
   2304         tote_num = next_other_tote;
   2305         next_other_tote ^= 1;     // Round-robin
   2306         if (tote_seen[tote_num]) {
   2307           // Flush previous
   2308           ScoreChunkIntoDoc2(kToteSwitch[tote_num], advance_by,
   2309                              tote_script[tote_num], &totes[tote_num],
   2310                              &doc_tote, tote_grams[tote_num], lang_hint_boost);
   2311           totes[tote_num].Reinit();
   2312         }
   2313         tote_script[tote_num] = lscript;
   2314       }
   2315 
   2316       if (!tote_seen[tote_num]) {
   2317         tote_seen[tote_num] = true;
   2318         // Default language gets 1 byte
   2319         total_text_bytes += 1;
   2320         InitScriptToteLang(&totes[tote_num], lscript);
   2321       }
   2322 
   2323       // The actual accumulation, possibly with word scoring also
   2324       ScoreQuadgrams(tables->quadgram_obj, &scriptspan, &tote_grams[tote_num],
   2325                         chunksizequads,
   2326                         &totes[tote_num],
   2327                         &doc_tote, lang_hint_boost,
   2328                         advance_by, flags, &initial_word_span, plus_one);
   2329     }
   2330 
   2331     total_text_bytes += scriptspan.text_bytes;
   2332 
   2333     // For long documents, do less-dense samples the further along we go.
   2334     // This is to keep speed sublinear in document size.
   2335     if (total_text_bytes > advance_limit) {
   2336       if (total_text_bytes > textlimit) {
   2337         // Don't look at rest of doc
   2338         if (FLAGS_cld_html || FLAGS_dbgscore) {
   2339           fprintf(stderr, "<br>---text_bytes[%d] textlimit %d reached---<br>",
   2340                   total_text_bytes, textlimit);
   2341         }
   2342         break;
   2343       }
   2344       advance_by <<= 1;         // Double advance bytes
   2345       advance_limit <<= 1;      // Double limit until next change
   2346       spantooshortlimit <<= 1;  // Double short-span size
   2347       if (FLAGS_cld_html || FLAGS_dbgscore) {
   2348         fprintf(stderr, "<br>---text_bytes[%d] advance_by doubled to %d---<br>",
   2349                 total_text_bytes, advance_by);
   2350       }
   2351     }
   2352   }     // End while (ss.GetOneScriptSpanLower())
   2353 
   2354   // Deallocate full-document prediction table
   2355   delete[] predict_tbl;
   2356 
   2357   // Flush pending totals
   2358   for (int tote_num = 0; tote_num < 4; ++tote_num) {
   2359     if (tote_seen[tote_num]) {
   2360       ScoreChunkIntoDoc2(kToteName[tote_num], advance_by,
   2361                          tote_script[tote_num], &totes[tote_num], &doc_tote,
   2362                          tote_grams[tote_num], lang_hint_boost);
   2363     }
   2364   }
   2365 
   2366   // If extended langauges are disallowed, remove them here
   2367   if (!allow_extended_lang) {
   2368     RemoveExtendedLanguages(&doc_tote);
   2369   }
   2370 
   2371   // Force close pairs to one or the other
   2372   RefineScoredClosePairs(&doc_tote);
   2373 
   2374 
   2375   // Calculate return results
   2376   // Find top three byte counts in tote heap
   2377   int reliable_percent3[3];
   2378 
   2379 
   2380   // Cannot use Add, etc. after sorting
   2381   doc_tote.Sort(3);
   2382 
   2383   ExtractLangEtc(&doc_tote, total_text_bytes,
   2384                  reliable_percent3, language3, percent3, normalized_score3,
   2385                  text_bytes, is_reliable);
   2386 
   2387   bool have_good_answer = false;
   2388   if (FlagFinish(flags)) {
   2389     // Force a result
   2390     have_good_answer = true;
   2391   } else if (total_text_bytes <= kShortTextThresh) {
   2392     // Don't recurse on short text -- we already did word scores
   2393     have_good_answer = true;
   2394   } else if (*is_reliable &&
   2395              (percent3[0] >= kGoodLang1Percent)) {
   2396     have_good_answer = true;
   2397   } else if (*is_reliable &&
   2398              ((percent3[0] + percent3[1]) >= kGoodLang1and2Percent)) {
   2399     have_good_answer = true;
   2400   }
   2401 
   2402 
   2403   if (have_good_answer) {
   2404     // This is the real, non-recursive return
   2405 
   2406     // Move bytes for unreliable langs to another lang or UNKNOWN
   2407     RemoveUnreliableLanguages(&doc_tote);
   2408 
   2409     // Redo the result extraction after the removal above
   2410     doc_tote.Sort(3);
   2411     ExtractLangEtc(&doc_tote, total_text_bytes,
   2412                    reliable_percent3, language3, percent3, normalized_score3,
   2413                    text_bytes, is_reliable);
   2414 
   2415 #if 0
   2416     // OLD code, replaced by CalcSummaryLang
   2417     //
   2418     // Suppress ignore-me text, TG_UNKNOWN_LANGUAGE if 2nd or 3rd language
   2419     // Force it to English if first language
   2420     if (language3[2] == TG_UNKNOWN_LANGUAGE) {
   2421       reliable_percent3[2] = 0;
   2422       language3[2] = UNKNOWN_LANGUAGE;
   2423       percent3[2] = 0;
   2424     } else if (language3[1] == TG_UNKNOWN_LANGUAGE) {
   2425       // Move up lower language
   2426       reliable_percent3[1] = reliable_percent3[2];
   2427       language3[1] = language3[2];
   2428       percent3[1] = percent3[2];
   2429       reliable_percent3[2] = 0;
   2430       language3[2] = UNKNOWN_LANGUAGE;
   2431       percent3[2] = 0;
   2432     } else if (language3[0] == TG_UNKNOWN_LANGUAGE) {
   2433       language3[0] = ENGLISH;
   2434     }
   2435 
   2436     if (language3[0] == UNKNOWN_LANGUAGE) {
   2437       // Last-ditch test for some result, but it is UNKNOWN_LANGUAGE
   2438       // Force it to English (should not happen)
   2439       language3[0] = ENGLISH;
   2440       percent3[0] = 100;
   2441       *is_reliable = true;
   2442     }
   2443 #endif
   2444 
   2445 
   2446 #if 0
   2447     // Scaffolding to reveal subset sequence lang distribution across doc text
   2448     // Track the sequence of language fragments [result currently unused]
   2449     if (FLAGS_cld_html) {
   2450       static const int kMaxSubsetSeq = 12;
   2451       uint8 subseq[kMaxSubsetSeq];
   2452       doc_tote.ExtractSeq(kMaxSubsetSeq, subseq);
   2453 
   2454       fprintf(stderr, "<br>\nSubset Sequence[%d]: ", kMaxSubsetSeq);
   2455       for (int i = 0; i < kMaxSubsetSeq; ++i) {
   2456         fprintf(stderr, "%s ", ExtLanguageCode(cld::UnpackLanguage(subseq[i])));
   2457         if ((i % 4) == 3) {fprintf(stderr, "&nbsp; ");}
   2458       }
   2459       fprintf(stderr, "&nbsp;&nbsp; ");
   2460 
   2461       for (int i = 0; i < 3; ++i) {
   2462         if (language3[i] != UNKNOWN_LANGUAGE) {
   2463           fprintf(stderr, "%s.%d(%d%%) ",
   2464                   ExtLanguageCode(language3[i]),
   2465                   reliable_percent3[i],
   2466                   percent3[i]);
   2467         }
   2468       }
   2469 
   2470       fprintf(stderr, "%d B ", total_text_bytes);
   2471       fprintf(stderr, "<br>\n");
   2472     }
   2473     // End Scaffolding to reveal subset sequence lang distribution
   2474 #endif
   2475 
   2476     Language summary_lang;
   2477     CalcSummaryLang(&doc_tote, total_text_bytes,
   2478                     reliable_percent3, language3, percent3,
   2479                     &summary_lang, is_reliable);
   2480 
   2481     if (FLAGS_cld_html) {
   2482       for (int i = 0; i < 3; ++i) {
   2483         if (language3[i] != UNKNOWN_LANGUAGE) {
   2484           fprintf(stderr, "%s.%d(%d%%) ",
   2485                   ExtLanguageCode(language3[i]),
   2486                   reliable_percent3[i],
   2487                   percent3[i]);
   2488         }
   2489       }
   2490 
   2491       fprintf(stderr, "%d B ", total_text_bytes);
   2492       fprintf(stderr, "= %s%c ",
   2493               ExtLanguageName(summary_lang), is_reliable ? ' ' : '*');
   2494       fprintf(stderr, "<br>\n");
   2495     }
   2496 
   2497     return summary_lang;
   2498   }
   2499 
   2500   // Not a good answer -- do recursive call to refine
   2501   if (FLAGS_cld_html || FLAGS_dbgscore) {
   2502     // This is what we hope to improve on in the recursive call, if any
   2503     PrintLangs(stderr, language3, percent3, text_bytes, is_reliable);
   2504   }
   2505 
   2506   // For restriction to Top40 + one, the one is 1st/2nd lang that is not Top40
   2507   // For this purpose, we treate "Ignore" as top40
   2508   Language new_plus_one = UNKNOWN_LANGUAGE;
   2509   if (cld::kIsPackedTop40[cld::PackLanguage(language3[0])] == 0) {
   2510     new_plus_one = language3[0];
   2511   } else if (cld::kIsPackedTop40[cld::PackLanguage(language3[1])] == 0) {
   2512     new_plus_one = language3[1];
   2513   }
   2514 
   2515   if (total_text_bytes < kShortTextThresh) {
   2516       // Short text: Recursive call with top40 and short set
   2517       if (FLAGS_cld_html || FLAGS_dbgscore) {
   2518         fprintf(stderr, "&nbsp;&nbsp;---text_bytes[%d] "
   2519                 "Recursive(Top40/Rep/Short/Words)---<br><br>\n",
   2520                 total_text_bytes);
   2521       }
   2522       return DetectLanguageSummaryV25(
   2523                         tables,
   2524                         buffer,
   2525                         buffer_length,
   2526                         is_plain_text,
   2527                         tld_hint,               // "id" boosts Indonesian
   2528                         encoding_hint,          // SJS boosts Japanese
   2529                         language_hint,          // ITALIAN boosts it
   2530                         allow_extended_lang,
   2531                         flags | kCLDFlagTop40 | kCLDFlagRepeats |
   2532                           kCLDFlagShort | kCLDFlagUseWords | kCLDFlagFinish,
   2533                         new_plus_one,
   2534                         language3,
   2535                         percent3,
   2536                         normalized_score3,
   2537                         text_bytes,
   2538                         is_reliable);
   2539   }
   2540 
   2541   // Longer text: Recursive call with top40 set
   2542   if (FLAGS_cld_html || FLAGS_dbgscore) {
   2543     fprintf(stderr,
   2544             "&nbsp;&nbsp;---text_bytes[%d] Recursive(Top40/Rep)---<br><br>\n",
   2545             total_text_bytes);
   2546   }
   2547   return DetectLanguageSummaryV25(
   2548                         tables,
   2549                         buffer,
   2550                         buffer_length,
   2551                         is_plain_text,
   2552                         tld_hint,               // "id" boosts Indonesian
   2553                         encoding_hint,          // SJS boosts Japanese
   2554                         language_hint,          // ITALIAN boosts it
   2555                         allow_extended_lang,
   2556                         flags | kCLDFlagTop40 | kCLDFlagRepeats |
   2557                           kCLDFlagFinish,
   2558                         new_plus_one,
   2559                         language3,
   2560                         percent3,
   2561                         normalized_score3,
   2562                         text_bytes,
   2563                         is_reliable);
   2564 }   // End CompactLangDetImpl::DetectLanguageSummaryV25
   2565