1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include <stdio.h> 6 #include <string.h> 7 //#include <sys/time.h> // for gettimeofday 8 #include <string> 9 10 #include "encodings/lang_enc.h" 11 12 #include "encodings/compact_lang_det/compact_lang_det.h" 13 #include "encodings/compact_lang_det/compact_lang_det_impl.h" 14 #include "encodings/compact_lang_det/getonescriptspan.h" 15 #include "encodings/compact_lang_det/letterscript_enum.h" 16 #include "encodings/compact_lang_det/tote.h" 17 #include "encodings/compact_lang_det/utf8propjustletter.h" 18 #include "encodings/compact_lang_det/utf8propletterscriptnum.h" 19 #include "encodings/compact_lang_det/utf8scannotjustletterspecial.h" 20 21 #include "encodings/compact_lang_det/cldutil_dbg.h" 22 23 #include "encodings/compact_lang_det/win/cld_basictypes.h" 24 #include "encodings/compact_lang_det/win/cld_commandlineflags.h" 25 #include "encodings/compact_lang_det/win/cld_google.h" 26 #include "encodings/compact_lang_det/win/cld_utf8statetable.h" 27 28 // Linker supplies the right tables 29 extern const UTF8PropObj compact_lang_det_generated_ctjkvz_b1_obj; 30 extern const cld::CLDTableSummary kCjkBiTable_obj; 31 extern const cld::CLDTableSummary kQuadTable_obj; 32 extern const cld::CLDTableSummary kLongWord8Table_obj; 33 34 DEFINE_bool(cld_html, false, "Print language spans in HTML on stderr"); 35 DEFINE_bool(cld_forcewords, false, "Score all words, in addition to quads"); 36 37 DEFINE_bool(cld_showme, false, "Put squeeze/repeat points into HTML text"); 38 DEFINE_bool(cld_echotext, false, "Print each scriptspan to stderr"); 39 DEFINE_int32(cld_textlimit, 160, "Examine only initial n KB of actual text"); 40 // 20 quadgrams is about 80 bytes or about 12 words in real text 41 DEFINE_int32(cld_smoothwidth, 20, "Smoothing window width in quadgrams"); 42 43 44 static const int kLangHintInitial = 12; 
// Boost language by N initially 45 static const int kLangHintBoost = 12; // Boost language by N/16 per quadgram 46 47 static const int kShortSpanThresh = 32; // Bytes 48 static const int kMaxSecondChanceLen = 1024; // Look at first 1K of short spans 49 50 static const int kCheapSqueezeTestThresh = 4096; // Only look for squeezing 51 // after this many text bytes 52 static const int kCheapSqueezeTestLen = 256; // Bytes to test to trigger sqz 53 static const int kSpacesTriggerPercent = 25; // Trigger sqz if >=25% spaces 54 static const int kPredictTriggerPercent = 67; // Trigger sqz if >=67% predicted 55 56 static const int kChunksizeDefault = 48; // Squeeze 48-byte chunks 57 static const int kSpacesThreshPercent = 25; // Squeeze if >=25% spaces 58 static const int kPredictThreshPercent = 40; // Squeeze if >=40% predicted 59 60 static const int kMaxSpaceScan = 32; // Bytes 61 62 static const int kGoodLang1Percent = 70; 63 static const int kGoodLang1and2Percent = 93; 64 static const int kShortTextThresh = 256; // Bytes 65 66 static const int kMinChunkSizeQuads = 4; // Chunk is at least four quads 67 static const int kMaxChunkSizeQuads = 1024; // Chunk is at most 1K quads 68 69 static const int kDefaultWordSpan = 256; // Scan at least this many initial 70 // bytes with word scoring 71 static const int kReallyBigWordSpan = 9999999; // Forces word scoring all text 72 73 static const int kMinReliableSeq = 50; // Record in seq if >= 50% reliable 74 75 static const int kPredictionTableSize = 4096; // Must be exactly 4096 for 76 // cheap compressor 77 78 // 79 // Generated by dsites 2008.07.07 from 10% of Base 80 // 81 82 // Three packed language probs, subscripted by Encoding 83 static const uint32 kEncodingHintProbs[] = { 84 0x00000000, // ASCII 85 0x18120cd5, // Latin2 POLISH.11 CZECH.5 HUNGARIAN.3 86 0x1d3a4bc9, // Latin3 AZERBAIJANI.10 BASQUE.3 CROATIAN.1 87 0x030819d4, // Latin4 ESTONIAN.11 ITALIAN.4 DUTCH.2 88 0x00000000, // ISO-8859-5 89 0x00003742, // Arabic 
ARABIC.12 90 0x00000000, // Greek 91 0x00000742, // Hebrew HEBREW.12 92 0x00002242, // Latin5 TURKISH.12 93 0x060419c9, // Latin6 ESTONIAN.10 FINNISH.3 GERMAN.1 94 0x00000942, // EUC-JP Japanese.12 95 0x00000942, // SJS Japanese.12 96 0x00000942, // JIS Japanese.12 97 0x00004642, // BIG5 ChineseT.12 98 0x00001142, // GB Chinese.12 99 0x46295fcd, // EUC-CN UIGHUR.10 MALAY.6 ChineseT.5 100 0x00000a42, // KSC Korean.12 101 0x00000000, // Unicode 102 0x03104674, // EUC ChineseT.9 SWEDISH.8 DUTCH.3 103 0x00000000, // CNS 104 0x0f1146c3, // BIG5-CP950 ChineseT.9 Chinese.5 SPANISH.4 105 0x00000942, // CP932 Japanese.12 106 0x00000000, // UTF8 107 0x00000000, // Unknown 108 0x00000000, // ASCII-7-bit 109 0x00000000, // KOI8R 110 0x00000000, // CP1251 111 0x00000000, // CP1252 112 0x00000000, // KOI8U 113 0x451d12cd, // CP1250 CZECH.10 CROATIAN.6 SLOVAK.5 114 0x0d06052a, // ISO-8859-15 FRENCH.9 GERMAN.8 PORTUGUESE.7 115 0x00002242, // CP1254 TURKISH.12 116 0x191516be, // CP1257 LITHUANIAN.8 LATVIAN.7 ESTONIAN.7 117 0x08003642, // ISO-8859-11 THAI.12 ITALIAN.1 118 0x00000000, // CP874 119 0x00003742, // CP1256 ARABIC.12 120 0x00000742, // CP1255 HEBREW.12 121 0x00000000, // ISO-8859-8-I 122 0x00000000, // VISUAL 123 0x00000000, // CP852 124 0x39001242, // CSN_369103 CZECH.12 ESPERANTO.1 125 0x00000000, // CP1253 126 0x00000000, // CP866 127 0x2e001944, // ISO-8859-13 ESTONIAN.12 ALBANIAN.3 128 0x08090a74, // ISO-2022-KR Korean.9 Japanese.8 ITALIAN.3 129 0x00001142, // GBK Chinese.12 130 0x4600113d, // GB18030 Chinese.11 ChineseT.7 131 0x00004642, // BIG5_HKSCS ChineseT.12 132 0x00000000, // ISO_2022_CN 133 0x00000000, // TSCII 134 0x00000000, // TAM 135 0x00000000, // TAB 136 0x00000000, // JAGRAN 137 0x00000000, // MACINTOSH 138 0x00000000, // UTF7 139 0x00000000, // BHASKAR 140 0x00000000, // HTCHANAKYA 141 0x090646ca, // UTF-16BE ChineseT.10 GERMAN.4 Japanese.2 142 0x00000000, // UTF-16LE 143 0x00000000, // UTF-32BE 144 0x00000000, // UTF-32LE 145 0x00000000, // 
X-BINARYENC 146 0x06001142, // HZ-GB-2312 Chinese.12 GERMAN.1 147 0x461109c2, // X-UTF8UTF8 Japanese.9 Chinese.5 ChineseT.3 148 0x00000000, // X-TAM-ELANGO 149 0x00000000, // X-TAM-LTTMBARANI 150 0x00000000, // X-TAM-SHREE 151 0x00000000, // X-TAM-TBOOMIS 152 0x00000000, // X-TAM-TMNEWS 153 0x00000000, // X-TAM-WEBTAMIL 154 0x00000000, // X-KDDI-Shift_JIS 155 0x00000000, // X-DoCoMo-Shift_JIS 156 0x00000000, // X-SoftBank-Shift_JIS 157 0x00000000, // X-KDDI-ISO-2022-JP 158 0x00000000, // X-SoftBank-ISO-2022-JP 159 }; 160 161 COMPILE_ASSERT(arraysize(kEncodingHintProbs) == NUM_ENCODINGS, 162 kEncodingHintProbs_has_incorrect_size); 163 164 // 165 // Generated by dsites 2008.07.07 from 10% of Base 166 // 167 168 // Three packed language probs, subscripted by (anchor) language 169 static const uint32 kLanguageHintProbs[] = { 170 0x00000000, // ENGLISH 171 0x00000242, // DANISH DANISH.12 172 0x00000342, // DUTCH DUTCH.12 173 0x00000442, // FINNISH FINNISH.12 174 0x00000542, // FRENCH FRENCH.12 175 0x00000642, // GERMAN GERMAN.12 176 0x00000742, // HEBREW HEBREW.12 177 0x00000842, // ITALIAN ITALIAN.12 178 0x00000942, // Japanese Japanese.12 179 0x00000a42, // Korean Korean.12 180 0x51000b43, // NORWEGIAN NORWEGIAN.12 NORWEGIAN_N.2 181 0x00000c42, // POLISH POLISH.12 182 0x00000d42, // PORTUGUESE PORTUGUESE.12 183 0x00000000, // RUSSIAN 184 0x00000f42, // SPANISH SPANISH.12 185 0x00001042, // SWEDISH SWEDISH.12 186 0x00001142, // Chinese Chinese.12 187 0x00001242, // CZECH CZECH.12 188 0x00000000, // GREEK 189 0x47001442, // ICELANDIC ICELANDIC.12 FAROESE.1 190 0x00001542, // LATVIAN LATVIAN.12 191 0x00001642, // LITHUANIAN LITHUANIAN.12 192 0x00001742, // ROMANIAN ROMANIAN.12 193 0x00001842, // HUNGARIAN HUNGARIAN.12 194 0x00001942, // ESTONIAN ESTONIAN.12 195 0x00000000, // TG_UNKNOWN_LANGUAGE 196 0x00000000, // Unknown 197 0x00001c42, // BULGARIAN BULGARIAN.12 198 0x00001d42, // CROATIAN CROATIAN.12 199 0x1e001d46, // SERBIAN CROATIAN.12 SERBIAN.5 200 0x00000000, // 
IRISH 201 0x0f00203d, // GALICIAN GALICIAN.11 SPANISH.7 202 0x5e00213a, // TAGALOG TAGALOG.11 SOMALI.4 203 0x00002242, // TURKISH TURKISH.12 204 0x00002342, // UKRAINIAN UKRAINIAN.12 205 0x00000000, // HINDI 206 0x1c1e25d4, // MACEDONIAN MACEDONIAN.11 SERBIAN.4 BULGARIAN.2 207 0x00002642, // BENGALI BENGALI.12 208 0x00002742, // INDONESIAN INDONESIAN.12 209 0x00000000, // LATIN 210 0x2700293c, // MALAY MALAY.11 INDONESIAN.6 211 0x00000000, // MALAYALAM 212 0x00000000, // WELSH 213 0x00000000, // NEPALI 214 0x00000000, // TELUGU 215 0x00002e42, // ALBANIAN ALBANIAN.12 216 0x00000000, // TAMIL 217 0x00003042, // BELARUSIAN BELARUSIAN.12 218 0x00000000, // JAVANESE 219 0x00000000, // OCCITAN 220 0x375f3330, // URDU URDU.10 UIGHUR.7 ARABIC.4 221 0x41003436, // BIHARI BIHARI.10 MARATHI.10 222 0x00000000, // GUJARATI 223 0x0a4636b2, // THAI THAI.7 ChineseT.3 Korean.2 224 0x00003742, // ARABIC ARABIC.12 225 0x00003842, // CATALAN CATALAN.12 226 0x00003942, // ESPERANTO ESPERANTO.12 227 0x00003a42, // BASQUE BASQUE.12 228 0x00000000, // INTERLINGUA 229 0x00000000, // KANNADA 230 0x05060cca, // PUNJABI POLISH.10 GERMAN.4 FRENCH.2 231 0x00000000, // SCOTS_GAELIC 232 0x00003f42, // SWAHILI SWAHILI.12 233 0x00004042, // SLOVENIAN SLOVENIAN.12 234 0x00004142, // MARATHI MARATHI.12 235 0x00004242, // MALTESE MALTESE.12 236 0x00004342, // VIETNAMESE VIETNAMESE.12 237 0x00000000, // FRISIAN 238 0x12004543, // SLOVAK SLOVAK.12 CZECH.2 239 0x00004642, // ChineseT ChineseT.12 240 0x00000000, // FAROESE 241 0x00000000, // SUNDANESE 242 0x79004944, // UZBEK UZBEK.12 TAJIK.3 243 0x4d004a46, // AMHARIC AMHARIC.12 TIGRINYA.5 244 0x00004b42, // AZERBAIJANI AZERBAIJANI.12 245 0x00000000, // GEORGIAN 246 0x00000000, // TIGRINYA 247 0x00004e42, // PERSIAN PERSIAN.12 248 0x00000000, // BOSNIAN 249 0x00000000, // SINHALESE 250 0x00000000, // NORWEGIAN_N 251 0x00000000, // PORTUGUESE_P 252 0x00000000, // PORTUGUESE_B 253 0x00000000, // XHOSA 254 0x00000000, // ZULU 255 0x00000000, // GUARANI 256 
0x00000000, // SESOTHO 257 0x00000000, // TURKMEN 258 0x7a005933, // KYRGYZ KYRGYZ.10 TATAR.7 259 0x00000000, // BRETON 260 0x00000000, // TWI 261 0x00000000, // YIDDISH 262 0x00000000, // SERBO_CROATIAN 263 0x00000000, // SOMALI 264 0x00005f42, // UIGHUR UIGHUR.12 265 0x00006042, // KURDISH KURDISH.12 266 0x00006142, // MONGOLIAN MONGOLIAN.12 267 0x051130c9, // ARMENIAN BELARUSIAN.10 Chinese.3 FRENCH.1 268 0x020f0521, // LAOTHIAN FRENCH.8 SPANISH.7 DANISH.6 269 0x64004e35, // SINDHI PERSIAN.10 SINDHI.9 270 0x00000000, // RHAETO_ROMANCE 271 0x00006642, // AFRIKAANS AFRIKAANS.12 272 0x00000000, // LUXEMBOURGISH 273 0x00006842, // BURMESE BURMESE.12 274 0x00002242, // KHMER TURKISH.12 275 0x88006a3c, // TIBETAN TIBETAN.11 DZONGKHA.6 276 0x00000000, // DHIVEHI 277 0x00000000, // CHEROKEE 278 0x00000000, // SYRIAC 279 0x00000000, // LIMBU 280 0x00000000, // ORIYA 281 0x00000000, // ASSAMESE 282 0x00000000, // CORSICAN 283 0x00000000, // INTERLINGUE 284 0x00007342, // KAZAKH KAZAKH.12 285 0x00000000, // LINGALA 286 0x00000000, // MOLDAVIAN 287 0x5f007645, // PASHTO PASHTO.12 UIGHUR.4 288 0x00000000, // QUECHUA 289 0x00000000, // SHONA 290 0x00007942, // TAJIK TAJIK.12 291 0x00000000, // TATAR 292 0x00000000, // TONGA 293 0x00000000, // YORUBA 294 0x00000000, // CREOLES_AND_PIDGINS_ENGLISH_BASED 295 0x00000000, // CREOLES_AND_PIDGINS_FRENCH_BASED 296 0x00000000, // CREOLES_AND_PIDGINS_PORTUGUESE_BASED 297 0x00000000, // CREOLES_AND_PIDGINS_OTHER 298 0x00000000, // MAORI 299 0x00000000, // WOLOF 300 0x00000000, // ABKHAZIAN 301 0x00000000, // AFAR 302 0x00000000, // AYMARA 303 0x00000000, // BASHKIR 304 0x00000000, // BISLAMA 305 0x00000000, // DZONGKHA 306 0x00000000, // FIJIAN 307 0x00000000, // GREENLANDIC 308 0x00000000, // HAUSA 309 0x00000000, // HAITIAN_CREOLE 310 0x00000000, // INUPIAK 311 0x00000542, // INUKTITUT FRENCH.12 312 0x00000000, // KASHMIRI 313 0x00000000, // KINYARWANDA 314 0x00000000, // MALAGASY 315 0x00000000, // NAURU 316 0x00000000, // OROMO 317 
0x00000000, // RUNDI 318 0x00000000, // SAMOAN 319 0x00000000, // SANGO 320 0x344197d3, // SANSKRIT SANSKRIT.11 MARATHI.4 BIHARI.1 321 0x00000000, // SISWANT 322 0x00000000, // TSONGA 323 0x00000000, // TSWANA 324 0x00000000, // VOLAPUK 325 0x00000000, // ZHUANG 326 0x00000000, // KHASI 327 0x00000000, // SCOTS 328 0x00000000, // GANDA 329 0x00000000, // MANX 330 0x00000000, // MONTENEGRIN 331 // Add new language hints just before here (just use 0x00000000) 332 }; 333 334 COMPILE_ASSERT(arraysize(kLanguageHintProbs) == NUM_LANGUAGES, 335 kLanguageHintProbs_has_incorrect_size); 336 337 // 338 // Generated by dsites 2008.07.07 from 10% of Base 339 // 340 341 typedef struct { 342 char key[4]; 343 uint32 probs; 344 } HintEntry; 345 346 347 // Massaged TLD, followed by three packed language probs 348 // Hand-removed 4 items dsites 2008.07.15 349 static const int kTLDHintProbsSize = 201; 350 static const HintEntry kTLDHintProbs[kTLDHintProbsSize] = { // MaxRange 12 351 {{0x61,0x63,0x5f,0x5f}, 0x0a000945}, // ac__ Japanese.12 Korean.4 352 {{0x61,0x64,0x5f,0x5f}, 0x00003842}, // ad__ CATALAN.12 353 {{0x61,0x65,0x5f,0x5f}, 0x00003742}, // ae__ ARABIC.12 354 {{0x61,0x66,0x5f,0x5f}, 0x4e00763d}, // af__ PASHTO.11 PERSIAN.7 355 {{0x61,0x67,0x5f,0x5f}, 0x09000643}, // ag__ GERMAN.12 Japanese.2 356 {{0x61,0x69,0x5f,0x5f}, 0x0c180938}, // ai__ Japanese.11 HUNGARIAN.7 POLISH.2 357 {{0x61,0x6c,0x5f,0x5f}, 0x00002e42}, // al__ ALBANIAN.12 358 {{0x61,0x6e,0x5f,0x5f}, 0x6e00033d}, // an__ DUTCH.11 LIMBU.7 359 {{0x61,0x6f,0x5f,0x5f}, 0x05000d42}, // ao__ PORTUGUESE.12 FRENCH.1 360 {{0x61,0x71,0x5f,0x5f}, 0x05000f29}, // aq__ SPANISH.9 FRENCH.6 361 {{0x61,0x72,0x5f,0x5f}, 0x00000f42}, // ar__ SPANISH.12 362 {{0x61,0x73,0x5f,0x5f}, 0x0f120bcd}, // as__ NORWEGIAN.10 CZECH.6 SPANISH.5 363 {{0x61,0x74,0x5f,0x5f}, 0x00000642}, // at__ GERMAN.12 364 {{0x61,0x77,0x5f,0x5f}, 0x0f000345}, // aw__ DUTCH.12 SPANISH.4 365 {{0x61,0x78,0x5f,0x5f}, 0x00001042}, // ax__ SWEDISH.12 366 
{{0x61,0x7a,0x5f,0x5f}, 0x00004b42}, // az__ AZERBAIJANI.12 367 {{0x62,0x61,0x5f,0x5f}, 0x00001d42}, // ba__ CROATIAN.12 368 {{0x62,0x62,0x5f,0x5f}, 0x00002842}, // bb__ LATIN.12 369 {{0x62,0x64,0x5f,0x5f}, 0x00002642}, // bd__ BENGALI.12 370 {{0x62,0x65,0x5f,0x5f}, 0x05000335}, // be__ DUTCH.10 FRENCH.9 371 {{0x62,0x66,0x5f,0x5f}, 0x00000542}, // bf__ FRENCH.12 372 {{0x62,0x67,0x5f,0x5f}, 0x00001c42}, // bg__ BULGARIAN.12 373 {{0x62,0x68,0x5f,0x5f}, 0x00003742}, // bh__ ARABIC.12 374 {{0x62,0x69,0x5f,0x5f}, 0x0f00053f}, // bi__ FRENCH.11 SPANISH.9 375 {{0x62,0x6a,0x5f,0x5f}, 0x00000542}, // bj__ FRENCH.12 376 {{0x62,0x6d,0x5f,0x5f}, 0x98043929}, // bm__ ESPERANTO.9 FINNISH.8 SISWANT.6 377 {{0x62,0x6e,0x5f,0x5f}, 0x00002942}, // bn__ MALAY.12 378 {{0x62,0x6f,0x5f,0x5f}, 0x00000f42}, // bo__ SPANISH.12 379 {{0x62,0x72,0x5f,0x5f}, 0x00000d42}, // br__ PORTUGUESE.12 380 {{0x62,0x74,0x5f,0x5f}, 0x00008842}, // bt__ DZONGKHA.12 381 {{0x62,0x77,0x5f,0x5f}, 0x06059ac4}, // bw__ TSWANA.9 FRENCH.6 GERMAN.5 382 {{0x62,0x79,0x5f,0x5f}, 0x00003024}, // by__ BELARUSIAN.9 383 {{0x62,0x7a,0x5f,0x5f}, 0x0f0a0924}, // bz__ Japanese.9 Korean.5 SPANISH.1 384 {{0x63,0x61,0x5f,0x5f}, 0x00000542}, // ca__ FRENCH.12 385 {{0x63,0x61,0x74,0x5f}, 0x00003842}, // cat_ CATALAN.12 386 {{0x63,0x64,0x5f,0x5f}, 0x06051224}, // cd__ CZECH.9 FRENCH.5 GERMAN.1 387 {{0x63,0x66,0x5f,0x5f}, 0x00000542}, // cf__ FRENCH.12 388 {{0x63,0x67,0x5f,0x5f}, 0x00000542}, // cg__ FRENCH.12 389 {{0x63,0x68,0x5f,0x5f}, 0x08050638}, // ch__ GERMAN.11 FRENCH.7 ITALIAN.2 390 {{0x63,0x69,0x5f,0x5f}, 0x00000542}, // ci__ FRENCH.12 391 {{0x63,0x6c,0x5f,0x5f}, 0x00000f42}, // cl__ SPANISH.12 392 {{0x63,0x6d,0x5f,0x5f}, 0x00000542}, // cm__ FRENCH.12 393 {{0x63,0x6e,0x5f,0x5f}, 0x00001142}, // cn__ Chinese.12 394 {{0x63,0x6f,0x5f,0x5f}, 0x00000f42}, // co__ SPANISH.12 395 // {{0x63,0x6f,0x6f,0x70}, 0x0f0509cd}, // coop Japanese.10 FRENCH.6 SPANISH.5 396 {{0x63,0x72,0x5f,0x5f}, 0x00000f42}, // cr__ SPANISH.12 397 
{{0x63,0x75,0x5f,0x5f}, 0x00000f42}, // cu__ SPANISH.12 398 {{0x63,0x76,0x5f,0x5f}, 0x00000d42}, // cv__ PORTUGUESE.12 399 {{0x63,0x78,0x5f,0x5f}, 0x223a091f}, // cx__ Japanese.8 BASQUE.6 TURKISH.4 400 {{0x63,0x79,0x5f,0x5f}, 0x150622ba}, // cy__ TURKISH.8 GERMAN.4 LATVIAN.3 401 {{0x63,0x7a,0x5f,0x5f}, 0x00001242}, // cz__ CZECH.12 402 {{0x64,0x65,0x5f,0x5f}, 0x00000642}, // de__ GERMAN.12 403 {{0x64,0x6b,0x5f,0x5f}, 0x00000242}, // dk__ DANISH.12 404 {{0x64,0x6f,0x5f,0x5f}, 0x21000f42}, // do__ SPANISH.12 TAGALOG.1 405 {{0x64,0x7a,0x5f,0x5f}, 0x37000535}, // dz__ FRENCH.10 ARABIC.9 406 {{0x65,0x63,0x5f,0x5f}, 0x00000f42}, // ec__ SPANISH.12 407 // {{0x65,0x64,0x75,0x5f}, 0x2e0f3873}, // edu_ CATALAN.9 SPANISH.7 ALBANIAN.2 408 {{0x65,0x65,0x5f,0x5f}, 0x00001942}, // ee__ ESTONIAN.12 409 {{0x65,0x67,0x5f,0x5f}, 0x05003742}, // eg__ ARABIC.12 FRENCH.1 410 {{0x65,0x72,0x5f,0x5f}, 0x00000b42}, // er__ NORWEGIAN.12 411 {{0x65,0x73,0x5f,0x5f}, 0x38200fd4}, // es__ SPANISH.11 GALICIAN.4 CATALAN.2 412 {{0x65,0x74,0x5f,0x5f}, 0x39004a39}, // et__ AMHARIC.11 ESPERANTO.3 413 {{0x66,0x69,0x5f,0x5f}, 0x10000444}, // fi__ FINNISH.12 SWEDISH.3 414 {{0x66,0x6a,0x5f,0x5f}, 0x050489e0}, // fj__ FIJIAN.12 FINNISH.5 FRENCH.3 415 {{0x66,0x6f,0x5f,0x5f}, 0x00004742}, // fo__ FAROESE.12 416 {{0x66,0x72,0x5f,0x5f}, 0x00000542}, // fr__ FRENCH.12 417 {{0x67,0x61,0x5f,0x5f}, 0x00000542}, // ga__ FRENCH.12 418 {{0x67,0x64,0x5f,0x5f}, 0x061d05d5}, // gd__ FRENCH.11 CROATIAN.5 GERMAN.3 419 {{0x67,0x65,0x5f,0x5f}, 0x00004c2d}, // ge__ GEORGIAN.10 420 {{0x67,0x66,0x5f,0x5f}, 0x00000542}, // gf__ FRENCH.12 421 {{0x67,0x67,0x5f,0x5f}, 0x06002244}, // gg__ TURKISH.12 GERMAN.3 422 {{0x67,0x68,0x5f,0x5f}, 0x05000436}, // gh__ FINNISH.10 FRENCH.10 423 {{0x67,0x69,0x5f,0x5f}, 0x0f0538ce}, // gi__ CATALAN.10 FRENCH.7 SPANISH.6 424 {{0x67,0x6c,0x5f,0x5f}, 0x398a0238}, // gl__ DANISH.11 GREENLANDIC.7 ESPERANTO.2 425 {{0x67,0x6d,0x5f,0x5f}, 0x0600043e}, // gm__ FINNISH.11 GERMAN.8 426 
{{0x67,0x6e,0x5f,0x5f}, 0x00000542}, // gn__ FRENCH.12 427 // {{0x67,0x6f,0x76,0x5f}, 0x05000f25}, // gov_ SPANISH.9 FRENCH.2 428 {{0x67,0x70,0x5f,0x5f}, 0x00000542}, // gp__ FRENCH.12 429 {{0x67,0x71,0x5f,0x5f}, 0x0f000547}, // gq__ FRENCH.12 SPANISH.6 430 {{0x67,0x73,0x5f,0x5f}, 0x00000942}, // gs__ Japanese.12 431 {{0x67,0x74,0x5f,0x5f}, 0x00000f42}, // gt__ SPANISH.12 432 {{0x68,0x6b,0x5f,0x5f}, 0x11004643}, // hk__ ChineseT.12 Chinese.2 433 {{0x68,0x6d,0x5f,0x5f}, 0x4606092e}, // hm__ Japanese.10 GERMAN.6 ChineseT.2 434 {{0x68,0x6e,0x5f,0x5f}, 0x00000f42}, // hn__ SPANISH.12 435 {{0x68,0x72,0x5f,0x5f}, 0x00001d42}, // hr__ CROATIAN.12 436 {{0x68,0x74,0x5f,0x5f}, 0x0f000542}, // ht__ FRENCH.12 SPANISH.1 437 {{0x68,0x75,0x5f,0x5f}, 0x00001842}, // hu__ HUNGARIAN.12 438 {{0x69,0x64,0x5f,0x5f}, 0x00002742}, // id__ INDONESIAN.12 439 {{0x69,0x65,0x5f,0x5f}, 0x050c1f24}, // ie__ IRISH.9 POLISH.5 FRENCH.1 440 {{0x69,0x6c,0x5f,0x5f}, 0x00000742}, // il__ HEBREW.12 441 {{0x69,0x6e,0x74,0x5f}, 0x0f060574}, // int_ FRENCH.9 GERMAN.8 SPANISH.3 442 {{0x69,0x6f,0x5f,0x5f}, 0x11090fd5}, // io__ SPANISH.11 Japanese.5 Chinese.3 443 {{0x69,0x71,0x5f,0x5f}, 0x60003744}, // iq__ ARABIC.12 KURDISH.3 444 {{0x69,0x72,0x5f,0x5f}, 0x00004e42}, // ir__ PERSIAN.12 445 {{0x69,0x73,0x5f,0x5f}, 0x00001442}, // is__ ICELANDIC.12 446 {{0x69,0x74,0x5f,0x5f}, 0x00000842}, // it__ ITALIAN.12 447 {{0x6a,0x65,0x5f,0x5f}, 0x29050328}, // je__ DUTCH.9 FRENCH.7 MALAY.5 448 {{0x6a,0x6d,0x5f,0x5f}, 0x040f0576}, // jm__ FRENCH.9 SPANISH.8 FINNISH.5 449 {{0x6a,0x6f,0x5f,0x5f}, 0x00003742}, // jo__ ARABIC.12 450 // {{0x6a,0x6f,0x62,0x73}, 0x0f060329}, // jobs DUTCH.9 GERMAN.8 SPANISH.6 451 {{0x6a,0x70,0x5f,0x5f}, 0x00000942}, // jp__ Japanese.12 452 {{0x6b,0x65,0x5f,0x5f}, 0x040f3fc3}, // ke__ SWAHILI.9 SPANISH.5 FINNISH.4 453 {{0x6b,0x69,0x5f,0x5f}, 0x04000643}, // ki__ GERMAN.12 FINNISH.2 454 {{0x6b,0x6d,0x5f,0x5f}, 0x00000542}, // km__ FRENCH.12 455 {{0x6b,0x70,0x5f,0x5f}, 0x00000a42}, // kp__ 
Korean.12 456 {{0x6b,0x72,0x5f,0x5f}, 0x00000a42}, // kr__ Korean.12 457 {{0x6b,0x77,0x5f,0x5f}, 0x00003742}, // kw__ ARABIC.12 458 {{0x6b,0x79,0x5f,0x5f}, 0x0500083f}, // ky__ ITALIAN.11 FRENCH.9 459 {{0x6b,0x7a,0x5f,0x5f}, 0x0000732d}, // kz__ KAZAKH.10 460 {{0x6c,0x62,0x5f,0x5f}, 0x05003747}, // lb__ ARABIC.12 FRENCH.6 461 {{0x6c,0x63,0x5f,0x5f}, 0x09000645}, // lc__ GERMAN.12 Japanese.4 462 {{0x6c,0x69,0x5f,0x5f}, 0x1600063d}, // li__ GERMAN.11 LITHUANIAN.7 463 {{0x6c,0x73,0x5f,0x5f}, 0x00005742}, // ls__ SESOTHO.12 464 {{0x6c,0x74,0x5f,0x5f}, 0x00001642}, // lt__ LITHUANIAN.12 465 {{0x6c,0x75,0x5f,0x5f}, 0x0600053d}, // lu__ FRENCH.11 GERMAN.7 466 {{0x6c,0x76,0x5f,0x5f}, 0x00001542}, // lv__ LATVIAN.12 467 {{0x6c,0x79,0x5f,0x5f}, 0x05003744}, // ly__ ARABIC.12 FRENCH.3 468 {{0x6d,0x61,0x5f,0x5f}, 0x3700053d}, // ma__ FRENCH.11 ARABIC.7 469 {{0x6d,0x63,0x5f,0x5f}, 0x00000542}, // mc__ FRENCH.12 470 {{0x6d,0x64,0x5f,0x5f}, 0x00001724}, // md__ ROMANIAN.9 471 {{0x6d,0x65,0x5f,0x5f}, 0x00001d42}, // me__ CROATIAN.12 472 {{0x6d,0x67,0x5f,0x5f}, 0x00000542}, // mg__ FRENCH.12 473 {{0x6d,0x6b,0x5f,0x5f}, 0x1c002543}, // mk__ MACEDONIAN.12 BULGARIAN.2 474 {{0x6d,0x6c,0x5f,0x5f}, 0x00000542}, // ml__ FRENCH.12 475 {{0x6d,0x6e,0x5f,0x5f}, 0x00006142}, // mn__ MONGOLIAN.12 476 {{0x6d,0x6f,0x5f,0x5f}, 0x110d4631}, // mo__ ChineseT.10 PORTUGUESE.8 Chinese.5 477 {{0x6d,0x71,0x5f,0x5f}, 0x00000542}, // mq__ FRENCH.12 478 {{0x6d,0x72,0x5f,0x5f}, 0x37000535}, // mr__ FRENCH.10 ARABIC.9 479 {{0x6d,0x73,0x5f,0x5f}, 0x090f06d5}, // ms__ GERMAN.11 SPANISH.5 Japanese.3 480 {{0x6d,0x74,0x5f,0x5f}, 0x00004242}, // mt__ MALTESE.12 481 {{0x6d,0x75,0x5f,0x5f}, 0x05000934}, // mu__ Japanese.10 FRENCH.8 482 {{0x6d,0x76,0x5f,0x5f}, 0x28000436}, // mv__ FINNISH.10 LATIN.10 483 {{0x6d,0x77,0x5f,0x5f}, 0x0611092a}, // mw__ Japanese.9 Chinese.8 GERMAN.7 484 {{0x6d,0x78,0x5f,0x5f}, 0x00000f42}, // mx__ SPANISH.12 485 {{0x6d,0x79,0x5f,0x5f}, 0x00002942}, // my__ MALAY.12 486 
{{0x6d,0x7a,0x5f,0x5f}, 0x00000d42}, // mz__ PORTUGUESE.12 487 {{0x6e,0x61,0x5f,0x5f}, 0x06006644}, // na__ AFRIKAANS.12 GERMAN.3 488 {{0x6e,0x63,0x5f,0x5f}, 0x00000542}, // nc__ FRENCH.12 489 {{0x6e,0x65,0x5f,0x5f}, 0x8b000542}, // ne__ FRENCH.12 HAUSA.1 490 {{0x6e,0x66,0x5f,0x5f}, 0x00000542}, // nf__ FRENCH.12 491 {{0x6e,0x69,0x5f,0x5f}, 0x00000f42}, // ni__ SPANISH.12 492 {{0x6e,0x6c,0x5f,0x5f}, 0x00000342}, // nl__ DUTCH.12 493 {{0x6e,0x6f,0x5f,0x5f}, 0x51000b43}, // no__ NORWEGIAN.12 NORWEGIAN_N.2 494 {{0x6e,0x75,0x5f,0x5f}, 0x0300103b}, // nu__ SWEDISH.11 DUTCH.5 495 {{0x6f,0x6d,0x5f,0x5f}, 0x00003742}, // om__ ARABIC.12 496 {{0x70,0x61,0x5f,0x5f}, 0x00000f42}, // pa__ SPANISH.12 497 {{0x70,0x65,0x5f,0x5f}, 0x00000f42}, // pe__ SPANISH.12 498 {{0x70,0x66,0x5f,0x5f}, 0x00000542}, // pf__ FRENCH.12 499 {{0x70,0x67,0x5f,0x5f}, 0x00000f24}, // pg__ SPANISH.9 500 {{0x70,0x68,0x5f,0x5f}, 0x00002142}, // ph__ TAGALOG.12 501 {{0x70,0x6b,0x5f,0x5f}, 0x00003342}, // pk__ URDU.12 502 {{0x70,0x6c,0x5f,0x5f}, 0x30000c42}, // pl__ POLISH.12 BELARUSIAN.1 503 {{0x70,0x6e,0x5f,0x5f}, 0x04000644}, // pn__ GERMAN.12 FINNISH.3 504 {{0x70,0x72,0x5f,0x5f}, 0x00000f42}, // pr__ SPANISH.12 505 {{0x70,0x72,0x6f,0x5f}, 0x46050fd5}, // pro_ SPANISH.11 FRENCH.5 ChineseT.3 506 {{0x70,0x73,0x5f,0x5f}, 0x00003742}, // ps__ ARABIC.12 507 {{0x70,0x74,0x5f,0x5f}, 0x00000d42}, // pt__ PORTUGUESE.12 508 {{0x70,0x79,0x5f,0x5f}, 0x00000f42}, // py__ SPANISH.12 509 {{0x71,0x61,0x5f,0x5f}, 0x00003742}, // qa__ ARABIC.12 510 {{0x72,0x65,0x5f,0x5f}, 0x00000542}, // re__ FRENCH.12 511 {{0x72,0x6f,0x5f,0x5f}, 0x00001742}, // ro__ ROMANIAN.12 512 {{0x72,0x73,0x5f,0x5f}, 0x00001d42}, // rs__ CROATIAN.12 513 {{0x72,0x77,0x5f,0x5f}, 0x9000053e}, // rw__ FRENCH.11 KINYARWANDA.8 514 {{0x73,0x61,0x5f,0x5f}, 0x00003742}, // sa__ ARABIC.12 515 {{0x73,0x62,0x5f,0x5f}, 0x00000442}, // sb__ FINNISH.12 516 {{0x73,0x63,0x5f,0x5f}, 0x060f092f}, // sc__ Japanese.10 SPANISH.7 GERMAN.3 517 {{0x73,0x64,0x5f,0x5f}, 
0x00003742}, // sd__ ARABIC.12 518 {{0x73,0x65,0x5f,0x5f}, 0x00001042}, // se__ SWEDISH.12 519 {{0x73,0x69,0x5f,0x5f}, 0x00004042}, // si__ SLOVENIAN.12 520 {{0x73,0x6b,0x5f,0x5f}, 0x12004543}, // sk__ SLOVAK.12 CZECH.2 521 {{0x73,0x6d,0x5f,0x5f}, 0x00000842}, // sm__ ITALIAN.12 522 {{0x73,0x6e,0x5f,0x5f}, 0x00000542}, // sn__ FRENCH.12 523 {{0x73,0x72,0x5f,0x5f}, 0x03001e44}, // sr__ SERBIAN.12 DUTCH.3 524 {{0x73,0x76,0x5f,0x5f}, 0x00000f42}, // sv__ SPANISH.12 525 {{0x73,0x79,0x5f,0x5f}, 0x00003742}, // sy__ ARABIC.12 526 {{0x74,0x63,0x5f,0x5f}, 0x0a2206cd}, // tc__ GERMAN.10 TURKISH.6 Korean.5 527 {{0x74,0x66,0x5f,0x5f}, 0x00000642}, // tf__ GERMAN.12 528 {{0x74,0x67,0x5f,0x5f}, 0x00000542}, // tg__ FRENCH.12 529 {{0x74,0x68,0x5f,0x5f}, 0x9e0936c9}, // th__ THAI.10 Japanese.3 SCOTS.1 530 {{0x74,0x6a,0x5f,0x5f}, 0x00007924}, // tj__ TAJIK.9 531 {{0x74,0x6c,0x5f,0x5f}, 0x060f0dcd}, // tl__ PORTUGUESE.10 SPANISH.6 GERMAN.5 532 {{0x74,0x6e,0x5f,0x5f}, 0x3700053e}, // tn__ FRENCH.11 ARABIC.8 533 {{0x74,0x6f,0x5f,0x5f}, 0x064609c5}, // to__ Japanese.9 ChineseT.7 GERMAN.6 534 {{0x74,0x70,0x5f,0x5f}, 0x06000944}, // tp__ Japanese.12 GERMAN.3 535 {{0x74,0x72,0x5f,0x5f}, 0x00002242}, // tr__ TURKISH.12 536 {{0x74,0x72,0x61,0x76}, 0x064509c3}, // trav Japanese.9 SLOVAK.5 GERMAN.4 537 {{0x74,0x74,0x5f,0x5f}, 0x0f00063e}, // tt__ GERMAN.11 SPANISH.8 538 {{0x74,0x77,0x5f,0x5f}, 0x00004642}, // tw__ ChineseT.12 539 {{0x74,0x7a,0x5f,0x5f}, 0x00003f42}, // tz__ SWAHILI.12 540 {{0x75,0x61,0x5f,0x5f}, 0x0000232d}, // ua__ UKRAINIAN.10 541 {{0x75,0x79,0x5f,0x5f}, 0x00000f42}, // uy__ SPANISH.12 542 {{0x75,0x7a,0x5f,0x5f}, 0x0000492d}, // uz__ UZBEK.10 543 {{0x76,0x61,0x5f,0x5f}, 0x060f0828}, // va__ ITALIAN.9 SPANISH.7 GERMAN.5 544 {{0x76,0x63,0x5f,0x5f}, 0x0d000939}, // vc__ Japanese.11 PORTUGUESE.3 545 {{0x76,0x65,0x5f,0x5f}, 0x00000f42}, // ve__ SPANISH.12 546 {{0x76,0x67,0x5f,0x5f}, 0x09000f43}, // vg__ SPANISH.12 Japanese.2 547 {{0x76,0x69,0x5f,0x5f}, 0x00002942}, // vi__ 
MALAY.12 548 {{0x76,0x6e,0x5f,0x5f}, 0x00004342}, // vn__ VIETNAMESE.12 549 {{0x76,0x75,0x5f,0x5f}, 0x00000642}, // vu__ GERMAN.12 550 {{0x77,0x73,0x5f,0x5f}, 0x4b0f0624}, // ws__ GERMAN.9 SPANISH.5 AZERBAIJANI.1 551 {{0x79,0x65,0x5f,0x5f}, 0x00003742}, // ye__ ARABIC.12 552 {{0x79,0x75,0x5f,0x5f}, 0x1e001d3d}, // yu__ CROATIAN.11 SERBIAN.7 553 {{0x7a,0x61,0x5f,0x5f}, 0x00006642}, // za__ AFRIKAANS.12 554 {{0x7a,0x6d,0x5f,0x5f}, 0x0b000435}, // zm__ FINNISH.10 NORWEGIAN.9 555 {{0x7a,0x77,0x5f,0x5f}, 0x3f00783e}, // zw__ SHONA.11 SWAHILI.8 556 }; 557 558 559 // Statistically closest language, based on quadgram table 560 // Those that are far from other languges map to UNKNOWN_LANGUAGE 561 // Subscripted by Language 562 // 563 // From lang_correlation.txt and hand-edits 564 // sed 's/^\([^ ]*\) \([^ ]*\) coef=0\.\(..\).*$/ 565 // (\3 >= kMinCorrPercent) ? \2 : UNKNOWN_LANGUAGE, 566 // \/\/ \1/' lang_correlation.txt >/tmp/closest_lang_decl.txt 567 // 568 static const int kMinCorrPercent = 24; // Pick off how close you want 569 // 24 catches PERSIAN <== ARABIC 570 // but not SPANISH <== PORTUGESE 571 static Language Unknown = UNKNOWN_LANGUAGE; 572 573 // Subscripted by Language 574 static const Language kClosestAltLanguage[] = { 575 (28 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE, // ENGLISH 576 (36 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // DANISH 577 (31 >= kMinCorrPercent) ? AFRIKAANS : UNKNOWN_LANGUAGE, // DUTCH 578 (15 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE, // FINNISH 579 (11 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // FRENCH 580 (17 >= kMinCorrPercent) ? LUXEMBOURGISH : UNKNOWN_LANGUAGE, // GERMAN 581 (27 >= kMinCorrPercent) ? YIDDISH : UNKNOWN_LANGUAGE, // HEBREW 582 (16 >= kMinCorrPercent) ? CORSICAN : UNKNOWN_LANGUAGE, // ITALIAN 583 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Japanese 584 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Korean 585 (41 >= kMinCorrPercent) ? 
NORWEGIAN_N : UNKNOWN_LANGUAGE, // NORWEGIAN 586 ( 5 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // POLISH 587 (23 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // PORTUGUESE 588 (33 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE, // RUSSIAN 589 (28 >= kMinCorrPercent) ? GALICIAN : UNKNOWN_LANGUAGE, // SPANISH 590 (17 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // SWEDISH 591 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Chinese 592 (42 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // CZECH 593 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GREEK 594 (35 >= kMinCorrPercent) ? FAROESE : UNKNOWN_LANGUAGE, // ICELANDIC 595 ( 7 >= kMinCorrPercent) ? LITHUANIAN : UNKNOWN_LANGUAGE, // LATVIAN 596 ( 7 >= kMinCorrPercent) ? LATVIAN : UNKNOWN_LANGUAGE, // LITHUANIAN 597 ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // ROMANIAN 598 ( 4 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE, // HUNGARIAN 599 (15 >= kMinCorrPercent) ? FINNISH : UNKNOWN_LANGUAGE, // ESTONIAN 600 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Ignore 601 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // Unknown 602 (33 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // BULGARIAN 603 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CROATIAN 604 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SERBIAN 605 (24 >= kMinCorrPercent) ? SCOTS_GAELIC : UNKNOWN_LANGUAGE, // IRISH 606 (28 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // GALICIAN 607 ( 8 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE, // TAGALOG 608 (29 >= kMinCorrPercent) ? AZERBAIJANI : UNKNOWN_LANGUAGE, // TURKISH 609 (28 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // UKRAINIAN 610 (37 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE, // HINDI 611 (29 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE, // MACEDONIAN 612 (14 >= kMinCorrPercent) ? ASSAMESE : UNKNOWN_LANGUAGE, // BENGALI 613 (46 >= kMinCorrPercent) ? 
MALAY : UNKNOWN_LANGUAGE, // INDONESIAN 614 ( 9 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE, // LATIN 615 (46 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE, // MALAY 616 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MALAYALAM 617 ( 4 >= kMinCorrPercent) ? BRETON : UNKNOWN_LANGUAGE, // WELSH 618 ( 8 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // NEPALI 619 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // TELUGU 620 ( 3 >= kMinCorrPercent) ? ESPERANTO : UNKNOWN_LANGUAGE, // ALBANIAN 621 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // TAMIL 622 (22 >= kMinCorrPercent) ? UKRAINIAN : UNKNOWN_LANGUAGE, // BELARUSIAN 623 (15 >= kMinCorrPercent) ? SUNDANESE : UNKNOWN_LANGUAGE, // JAVANESE 624 (19 >= kMinCorrPercent) ? CATALAN : UNKNOWN_LANGUAGE, // OCCITAN 625 (27 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // URDU 626 (36 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // BIHARI 627 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GUJARATI 628 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // THAI 629 (24 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // ARABIC 630 (19 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // CATALAN 631 ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // ESPERANTO 632 ( 3 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // BASQUE 633 ( 9 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE, // INTERLINGUA 634 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // KANNADA 635 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PUNJABI 636 (24 >= kMinCorrPercent) ? IRISH : UNKNOWN_LANGUAGE, // SCOTS_GAELIC 637 ( 7 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // SWAHILI 638 (28 >= kMinCorrPercent) ? SERBO_CROATIAN : UNKNOWN_LANGUAGE, // SLOVENIAN 639 (37 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // MARATHI 640 ( 3 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // MALTESE 641 ( 1 >= kMinCorrPercent) ? 
YORUBA : UNKNOWN_LANGUAGE, // VIETNAMESE 642 (15 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // FRISIAN 643 (42 >= kMinCorrPercent) ? CZECH : UNKNOWN_LANGUAGE, // SLOVAK 644 // Original ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ChineseT 645 (24 >= kMinCorrPercent) ? CHINESE : UNKNOWN_LANGUAGE, // ChineseT 646 (35 >= kMinCorrPercent) ? ICELANDIC : UNKNOWN_LANGUAGE, // FAROESE 647 (15 >= kMinCorrPercent) ? JAVANESE : UNKNOWN_LANGUAGE, // SUNDANESE 648 (17 >= kMinCorrPercent) ? TAJIK : UNKNOWN_LANGUAGE, // UZBEK 649 ( 7 >= kMinCorrPercent) ? TIGRINYA : UNKNOWN_LANGUAGE, // AMHARIC 650 (29 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE, // AZERBAIJANI 651 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // GEORGIAN 652 ( 7 >= kMinCorrPercent) ? AMHARIC : UNKNOWN_LANGUAGE, // TIGRINYA 653 (27 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE, // PERSIAN 654 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // BOSNIAN 655 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SINHALESE 656 (41 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE, // NORWEGIAN_N 657 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PORTUGUESE_P 658 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // PORTUGUESE_B 659 (37 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE, // XHOSA 660 (37 >= kMinCorrPercent) ? XHOSA : UNKNOWN_LANGUAGE, // ZULU 661 ( 2 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE, // GUARANI 662 (29 >= kMinCorrPercent) ? TSWANA : UNKNOWN_LANGUAGE, // SESOTHO 663 ( 7 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE, // TURKMEN 664 ( 8 >= kMinCorrPercent) ? KAZAKH : UNKNOWN_LANGUAGE, // KYRGYZ 665 ( 5 >= kMinCorrPercent) ? FRENCH : UNKNOWN_LANGUAGE, // BRETON 666 ( 3 >= kMinCorrPercent) ? GANDA : UNKNOWN_LANGUAGE, // TWI 667 (27 >= kMinCorrPercent) ? HEBREW : UNKNOWN_LANGUAGE, // YIDDISH 668 (28 >= kMinCorrPercent) ? SLOVENIAN : UNKNOWN_LANGUAGE, // SERBO_CROATIAN 669 (12 >= kMinCorrPercent) ? 
OROMO : UNKNOWN_LANGUAGE, // SOMALI 670 ( 9 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE, // UIGHUR 671 (15 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // KURDISH 672 ( 6 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE, // MONGOLIAN 673 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ARMENIAN 674 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // LAOTHIAN 675 ( 8 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE, // SINDHI 676 (10 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // RHAETO_ROMANCE 677 (31 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // AFRIKAANS 678 (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // LUXEMBOURGISH 679 ( 2 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE, // BURMESE 680 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // KHMER 681 (45 >= kMinCorrPercent) ? DZONGKHA : UNKNOWN_LANGUAGE, // TIBETAN 682 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // DHIVEHI 683 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CHEROKEE 684 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // SYRIAC 685 ( 8 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE, // LIMBU 686 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ORIYA 687 (14 >= kMinCorrPercent) ? BENGALI : UNKNOWN_LANGUAGE, // ASSAMESE 688 (16 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE, // CORSICAN 689 ( 5 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE, // INTERLINGUE 690 ( 8 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE, // KAZAKH 691 ( 4 >= kMinCorrPercent) ? SWAHILI : UNKNOWN_LANGUAGE, // LINGALA 692 (11 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE, // MOLDAVIAN 693 (19 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE, // PASHTO 694 ( 5 >= kMinCorrPercent) ? AYMARA : UNKNOWN_LANGUAGE, // QUECHUA 695 ( 5 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // SHONA 696 (17 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE, // TAJIK 697 (13 >= kMinCorrPercent) ? 
BASHKIR : UNKNOWN_LANGUAGE, // TATAR 698 (11 >= kMinCorrPercent) ? SAMOAN : UNKNOWN_LANGUAGE, // TONGA 699 ( 2 >= kMinCorrPercent) ? TWI : UNKNOWN_LANGUAGE, // YORUBA 700 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_ENGLISH_BASED 701 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_FRENCH_BASED 702 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_PORTUGUESE_BASED 703 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // CREOLES_AND_PIDGINS_OTHER 704 ( 6 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // MAORI 705 ( 3 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE, // WOLOF 706 ( 1 >= kMinCorrPercent) ? MONGOLIAN : UNKNOWN_LANGUAGE, // ABKHAZIAN 707 ( 8 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE, // AFAR 708 ( 5 >= kMinCorrPercent) ? QUECHUA : UNKNOWN_LANGUAGE, // AYMARA 709 (13 >= kMinCorrPercent) ? TATAR : UNKNOWN_LANGUAGE, // BASHKIR 710 ( 3 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // BISLAMA 711 (45 >= kMinCorrPercent) ? TIBETAN : UNKNOWN_LANGUAGE, // DZONGKHA 712 ( 4 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE, // FIJIAN 713 ( 7 >= kMinCorrPercent) ? INUPIAK : UNKNOWN_LANGUAGE, // GREENLANDIC 714 ( 3 >= kMinCorrPercent) ? AFAR : UNKNOWN_LANGUAGE, // HAUSA 715 ( 3 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE, // HAITIAN_CREOLE 716 ( 7 >= kMinCorrPercent) ? GREENLANDIC : UNKNOWN_LANGUAGE, // INUPIAK 717 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // INUKTITUT 718 ( 4 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE, // KASHMIRI 719 (30 >= kMinCorrPercent) ? RUNDI : UNKNOWN_LANGUAGE, // KINYARWANDA 720 ( 2 >= kMinCorrPercent) ? TAGALOG : UNKNOWN_LANGUAGE, // MALAGASY 721 (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE, // NAURU 722 (12 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE, // OROMO 723 (30 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // RUNDI 724 (11 >= kMinCorrPercent) ? 
TONGA : UNKNOWN_LANGUAGE, // SAMOAN 725 ( 1 >= kMinCorrPercent) ? LINGALA : UNKNOWN_LANGUAGE, // SANGO 726 (32 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE, // SANSKRIT 727 (16 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE, // SISWANT 728 ( 5 >= kMinCorrPercent) ? SISWANT : UNKNOWN_LANGUAGE, // TSONGA 729 (29 >= kMinCorrPercent) ? SESOTHO : UNKNOWN_LANGUAGE, // TSWANA 730 ( 2 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE, // VOLAPUK 731 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // ZHUANG 732 ( 1 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE, // KHASI 733 (28 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // SCOTS 734 (15 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE, // GANDA 735 ( 7 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE, // MANX 736 ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE, // MONTENEGRIN 737 }; 738 739 COMPILE_ASSERT(arraysize(kClosestAltLanguage) == NUM_LANGUAGES, 740 kClosestAltLanguage_has_incorrect_size); 741 742 743 inline bool FlagFinish(int flags) {return (flags & kCLDFlagFinish) != 0;} 744 inline bool FlagSqueeze(int flags) {return (flags & kCLDFlagSqueeze) != 0;} 745 inline bool FlagRepeats(int flags) {return (flags & kCLDFlagRepeats) != 0;} 746 inline bool FlagTop40(int flags) {return (flags & kCLDFlagTop40) != 0;} 747 inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;} 748 inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;} 749 inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;} 750 751 752 753 754 //------------------------------------------------------------------------------ 755 // For --cld_html debugging output. 
Not thread safe 756 //------------------------------------------------------------------------------ 757 static Language prior_lang = UNKNOWN_LANGUAGE; 758 static bool prior_unreliable = false; 759 760 //------------------------------------------------------------------------------ 761 // End For --cld_html debugging output 762 //------------------------------------------------------------------------------ 763 764 765 // Backscan to word boundary, returning how many bytes n to go back 766 // so that src - n is non-space ans src - n - 1 is space. 767 // If not found in kMaxSpaceScan bytes, return 0 768 int BackscanToSpace(const char* src, int limit) { 769 int n = 0; 770 limit = cld::minint(limit, kMaxSpaceScan); 771 while (n < limit) { 772 if (src[-n - 1] == ' ') {return n;} // We are at _X 773 ++n; 774 } 775 return 0; 776 } 777 778 // Forwardscan to word boundary, returning how many bytes n to go forward 779 // so that src + n is non-space ans src + n - 1 is space. 780 // If not found in kMaxSpaceScan bytes, return 0 781 int ForwardscanToSpace(const char* src, int limit) { 782 int n = 0; 783 limit = cld::minint(limit, kMaxSpaceScan); 784 while (n < limit) { 785 if (src[n] == ' ') {return n + 1;} // We are at _X 786 ++n; 787 } 788 return 0; 789 } 790 791 792 // This uses a cheap predictor to get a measure of compression, and 793 // hence a measure of repetitiveness. It works on complete UTF-8 characters 794 // instead of bytes, because three-byte UTF-8 Indic, etc. text compress highly 795 // all the time when done with a byte-based count. Sigh. 796 // 797 // To allow running prediction across multiple chunks, caller passes in current 798 // 12-bit hash value and int[4096] prediction table. Caller inits these to 0. 799 // 800 // Returns the number of *bytes* correctly predicted, increments by 1..4 for 801 // each correctly-predicted character. 802 // 803 // NOTE: Overruns by up to three bytes. 
// Not a problem with valid UTF-8 text
//
// Scans srclen bytes of isrc one UTF-8 character at a time. Each character is
// compared with the character last seen after the same 12-bit rolling hash of
// the preceding text (tbl[hash]); a match counts as a correct prediction.
//   isrc, srclen  text to scan; a multi-byte character starting inside the
//                 range is read in full even if it extends past srclen
//   hash          in/out: rolling hash, always masked to 0..0xfff on exit
//   tbl           in/out: prediction table indexed by that hash
// Returns the total length in bytes (1..4 per character) of the correctly
// predicted characters, so the result is directly comparable with byte-based
// thresholds.
int CountPredictedBytes(const char* isrc, int srclen, int* hash, int* tbl) {
  int p_count = 0;
  // Unsigned bytes, so the lead-byte tests against 0xc0/0xe0/0xf0 are valid
  const unsigned char* src = reinterpret_cast<const unsigned char*>(isrc);
  const unsigned char* srclimit = src + srclen;
  int local_hash = *hash;

  while (src < srclimit) {
    int c = src[0];
    int incr = 1;

    // Pick up one char and length
    if (c < 0xc0) {
      // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
      // Do nothing more
    } else if ((c & 0xe0) == 0xc0) {
      // Two-byte
      c = (c << 8) | src[1];
      incr = 2;
    } else if ((c & 0xf0) == 0xe0) {
      // Three-byte
      c = (c << 16) | (src[1] << 8) | src[2];
      incr = 3;
    } else {
      // Four-byte
      c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
      incr = 4;
    }
    src += incr;

    int p = tbl[local_hash];            // Prediction
    tbl[local_hash] = c;                // Update prediction
    if (c == p) {
      // Count the bytes of the predicted character, not just one per
      // character; otherwise multi-byte scripts are under-counted.
      p_count += incr;
    }

    local_hash = ((local_hash << 4) ^ c) & 0xfff;
  }

  *hash = local_hash;
  return p_count;
}



// Counts the space bytes in src, four bytes per iteration; a little faster
// than one-at-a-time.
// Only the first (src_len rounded down to a multiple of 4) bytes are examined:
// the 0..3 odd bytes at the end are not counted.
int CountSpaces4(const char* src, int src_len) {
  int s_count = 0;
  const int whole_quads_len = src_len & ~3;
  for (int i = 0; i < whole_quads_len; i += 4) {
    s_count += (src[i] == ' ');
    s_count += (src[i+1] == ' ');
    s_count += (src[i+2] == ' ');
    s_count += (src[i+3] == ' ');
  }
  return s_count;
}

// Remove words of text that have more than half their letters predicted
// correctly by our cheap predictor, moving the remaining words in-place
// to the front of the input buffer.
//
// To allow running prediction across multiple chunks, caller passes in current
// 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
866 // 867 // Return the new, possibly-shorter length 868 // 869 // Result Buffer ALWAYS has leading space and trailing space space space NUL, 870 // if input does 871 // 872 int CheapRepWordsInplace(char* isrc, int srclen, int* hash, int* tbl) { 873 const uint8* src = reinterpret_cast<const uint8*>(isrc); 874 const uint8* srclimit = src + srclen; 875 char* dst = isrc; 876 int local_hash = *hash; 877 char* word_dst = dst; // Start of next word 878 int good_predict_bytes = 0; 879 int word_length_bytes = 0; 880 881 while (src < srclimit) { 882 int c = src[0]; 883 int incr = 1; 884 *dst++ = c; 885 886 if (c == ' ') { 887 if ((good_predict_bytes * 2) > word_length_bytes) { 888 // Word is well-predicted: backup to start of this word 889 dst = word_dst; 890 if (FLAGS_cld_showme) { 891 // Mark the deletion point with period 892 // Don't repeat multiple periods 893 // Cannot mark with more bytes or may overwrite unseen input 894 if ((isrc < (dst - 2)) && (dst[-2] != '.')) { 895 *dst++ = '.'; 896 *dst++ = ' '; 897 } 898 } 899 } 900 word_dst = dst; // Start of next word 901 good_predict_bytes = 0; 902 word_length_bytes = 0; 903 } 904 905 // Pick up one char and length 906 if (c < 0xc0) { 907 // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx 908 // Do nothing more 909 } else if ((c & 0xe0) == 0xc0) { 910 // Two-byte 911 *dst++ = src[1]; 912 c = (c << 8) | src[1]; 913 incr = 2; 914 } else if ((c & 0xf0) == 0xe0) { 915 // Three-byte 916 *dst++ = src[1]; 917 *dst++ = src[2]; 918 c = (c << 16) | (src[1] << 8) | src[2]; 919 incr = 3; 920 } else { 921 // Four-byte 922 *dst++ = src[1]; 923 *dst++ = src[2]; 924 *dst++ = src[3]; 925 c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3]; 926 incr = 4; 927 } 928 src += incr; 929 word_length_bytes += incr; 930 931 int p = tbl[local_hash]; // Prediction 932 tbl[local_hash] = c; // Update prediction 933 if (c == p) { 934 good_predict_bytes += incr; // Count good predictions 935 } 936 937 local_hash = ((local_hash << 4) ^ c) & 
0xfff; 938 } 939 940 *hash = local_hash; 941 942 if ((dst - isrc) < (srclen - 3)) { 943 // Pad and make last char clean UTF-8 by putting following spaces 944 dst[0] = ' '; 945 dst[1] = ' '; 946 dst[2] = ' '; 947 dst[3] = '\0'; 948 } else if ((dst - isrc) < srclen) { 949 // Make last char clean UTF-8 by putting following space off the end 950 dst[0] = ' '; 951 } 952 953 return static_cast<int>(dst - isrc); 954 } 955 956 957 // Remove portions of text that have a high density of spaces, or that are 958 // overly repetitive, squeezing the remaining text in-place to the front of the 959 // input buffer. 960 // 961 // Squeezing looks at density of space/prediced chars in fixed-size chunks, 962 // specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes. 963 // 964 // Return the new, possibly-shorter length 965 // 966 // Result Buffer ALWAYS has leading space and trailing space space space NUL, 967 // if input does 968 // 969 int CompactLangDetImpl::CheapSqueezeInplace(char* isrc, 970 int srclen, 971 int ichunksize) { 972 char* src = isrc; 973 char* dst = src; 974 char* srclimit = src + srclen; 975 bool skipping = false; 976 977 int hash = 0; 978 // Allocate local prediction table. 979 int* predict_tbl = new int[kPredictionTableSize]; 980 memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); 981 982 int chunksize = ichunksize; 983 if (chunksize == 0) {chunksize = kChunksizeDefault;} 984 int space_thresh = (chunksize * kSpacesThreshPercent) / 100; 985 int predict_thresh = (chunksize * kPredictThreshPercent) / 100; 986 987 while (src < srclimit) { 988 int remaining_bytes = srclimit - src; 989 int len = cld::minint(chunksize, remaining_bytes); 990 // Make len land us on a UTF-8 character boundary, and also fix 991 // mispredictions because we could get out of phase. 992 // Loop always terminates at trailing space in buffer. 
993 while ((src[len] & 0xc0) == 0x80) 994 ++len; // Move past continuation bytes 995 996 int space_n = CountSpaces4(src, len); 997 int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl); 998 if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) { 999 // Skip the text 1000 if (!skipping) { 1001 // Keeping-to-skipping transition; do it at a space 1002 int n = BackscanToSpace(dst, static_cast<int>(dst - isrc)); 1003 dst -= n; 1004 skipping = true; 1005 if (FLAGS_cld_showme) { 1006 // Mark the deletion point with black square U+25A0 1007 *dst++ = 0xe2; 1008 *dst++ = 0x96; 1009 *dst++ = 0xa0; 1010 *dst++ = ' '; 1011 } 1012 if (dst == isrc) { 1013 // Force a leading space if the first chunk is deleted 1014 *dst++ = ' '; 1015 } 1016 } 1017 } else { 1018 // Keep the text 1019 if (skipping) { 1020 // Skipping-to-keeping transition; do it at a space 1021 int n = ForwardscanToSpace(src, len); 1022 src += n; 1023 remaining_bytes -= n; // Shrink remaining length 1024 len -= n; 1025 skipping = false; 1026 } 1027 // "len" can be negative in some cases 1028 if (len > 0) { 1029 memmove(dst, src, len); 1030 dst += len; 1031 } 1032 } 1033 src += len; 1034 } 1035 1036 if ((dst - isrc) < (srclen - 3)) { 1037 // Pad and make last char clean UTF-8 by putting following spaces 1038 dst[0] = ' '; 1039 dst[1] = ' '; 1040 dst[2] = ' '; 1041 dst[3] = '\0'; 1042 } else if ((dst - isrc) < srclen) { 1043 // Make last char clean UTF-8 by putting following space off the end 1044 dst[0] = ' '; 1045 } 1046 1047 // Deallocate local prediction table 1048 delete[] predict_tbl; 1049 return static_cast<int>(dst - isrc); 1050 } 1051 1052 // Timing 2.8GHz P4 (dsites 2008.03.20) with 170KB input 1053 // About 90 MB/sec, with or without memcpy, chunksize 48 or 4096 1054 // Just CountSpaces is about 340 MB/sec 1055 // Byte-only CountPredictedBytes is about 150 MB/sec 1056 // Byte-only CountPredictedBytes, conditional tbl[] = is about 85! 
MB/sec 1057 // Byte-only CountPredictedBytes is about 180 MB/sec, byte tbl, byte/int c 1058 // Unjammed byte-only both = 170 MB/sec 1059 // Jammed byte-only both = 120 MB/sec 1060 // Back to original w/slight updates, 110 MB/sec 1061 // 1062 bool CheapSqueezeTriggerTest(const char* src, int srclen, int testsize) { 1063 // Don't trigger at all on short text 1064 if (srclen < testsize) {return false;} 1065 int space_thresh = (testsize * kSpacesTriggerPercent) / 100; 1066 int predict_thresh = (testsize * kPredictTriggerPercent) / 100; 1067 int hash = 0; 1068 // Allocate local prediction table. 1069 int* predict_tbl = new int[kPredictionTableSize]; 1070 memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); 1071 1072 bool retval = false; 1073 if ((CountSpaces4(src, testsize) >= space_thresh) || 1074 (CountPredictedBytes(src, testsize, &hash, predict_tbl) >= 1075 predict_thresh)) { 1076 retval = true; 1077 } 1078 // Deallocate local prediction table 1079 delete[] predict_tbl; 1080 return retval; 1081 } 1082 1083 1084 1085 // Close pairs (correlation) language_enum/language_enum 1086 // id/ms (0.47) 38/40 [1] 1087 // bo/dz (0.46) 105/135 [2] 1088 // cz/sk (0.43) 17/68 [3] 1089 // no/nn (0.42) 10/80 [4] 1090 // hi/mr (0.38) 35/64 [5] 1091 // xh/zu (0.37) 83/84 [6] 1092 // Subscripted by packed language, gives 0 or a subscript in closepair 1093 // scoring array inside doc_tote 1094 static const uint8 kClosePair[EXT_NUM_LANGUAGES + 1] = { 1095 0, 1096 0,0,0,0,0,0,0,0, 0,0,4,0,0,0,0,0, 0,3,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1097 0,0,0,5,0,0,1,0, 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1098 5,0,0,0,3,0,0,0, 0,0,0,0,0,0,0,0, 4,0,0,6,6,0,0,0, 0,0,0,0,0,0,0,0, 1099 0,0,0,0,0,0,0,0, 0,2,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1100 0,0,0,0,0,0,0,2, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1101 // Add new language close-pair number just before here (just use 0) 1102 }; 1103 1104 1105 // Delete any extended languages from doc_tote 1106 void 
RemoveExtendedLanguages(ToteWithReliability* doc_tote) { 1107 for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) { 1108 if (cld::UnpackLanguage(doc_tote->Key(sub)) >= NUM_LANGUAGES) { 1109 // Effectively remove the extended language by setting key&score to zero 1110 if (FLAGS_dbgscore) { 1111 fprintf(stderr, "{-%s} ", 1112 ExtLanguageCode(cld::UnpackLanguage(doc_tote->Key(sub)))); 1113 } 1114 1115 // Delete entry 1116 doc_tote->SetKey(sub, 0); 1117 doc_tote->SetValue(sub, 0); 1118 doc_tote->SetReliability(sub, 0); 1119 } 1120 } 1121 } 1122 1123 static const int kMinReliableKeepPercent = 41; // Remove lang if reli < this 1124 1125 // For Tier3 languages, require a minimum number of bytes to be first-place lang 1126 static const int kGoodFirstT3MinBytes = 24; // <this => no first 1127 1128 // Move bytes for unreliable langs to another lang or UNKNOWN 1129 // doc_tote is sorted, so cannot Add 1130 // 1131 // If both CHINESE and CHINESET are present and unreliable, do not delete both; 1132 // merge both into CHINESE. 
1133 // 1134 //dsites 2009.03.19 1135 // we also want to remove Tier3 languages as the first lang if there is very 1136 // little text like ej1 ej2 ej3 ej4 1137 // maybe fold this back in earlier 1138 // 1139 void RemoveUnreliableLanguages(ToteWithReliability* doc_tote) { 1140 // Prepass to merge some low-reliablility languages 1141 int total_bytes = 0; 1142 for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) { 1143 int plang = doc_tote->Key(sub); 1144 if (plang == 0) {continue;} // Empty slot 1145 1146 Language lang = cld::UnpackLanguage(plang); 1147 int bytes = doc_tote->Value(sub); 1148 int reli = doc_tote->Reliability(sub); 1149 if (bytes == 0) {continue;} // Zero bytes 1150 total_bytes += bytes; 1151 1152 // Reliable percent is stored reliable score over stored bytecount 1153 int reliable_percent = reli / bytes; 1154 if (reliable_percent >= kMinReliableKeepPercent) {continue;} // Keeper 1155 1156 // This language is too unreliable to keep, but we might merge it. 1157 Language altlang = UNKNOWN_LANGUAGE; 1158 if (lang < NUM_LANGUAGES) {altlang = kClosestAltLanguage[lang];} 1159 if (altlang == UNKNOWN_LANGUAGE) {continue;} // No alternative 1160 1161 // Look for alternative in doc_tote 1162 int altsub = doc_tote->Find(cld::PackLanguage(altlang)); 1163 if (altsub < 0) {continue;} // No alternative text 1164 1165 int bytes2 = doc_tote->Value(altsub); 1166 int reli2 = doc_tote->Reliability(altsub); 1167 if (bytes2 == 0) {continue;} // Zero bytes 1168 1169 // Reliable percent is stored reliable score over stored bytecount 1170 int reliable_percent2 = reli2 / bytes2; 1171 1172 // Merge one language into the other. 
Break ties toward lower lang # 1173 int tosub = altsub; 1174 int fromsub = sub; 1175 bool into_lang = false; 1176 if ((reliable_percent2 < reliable_percent) || 1177 ((reliable_percent2 == reliable_percent) && (lang < altlang))) { 1178 tosub = sub; 1179 fromsub = altsub; 1180 into_lang = true; 1181 } 1182 1183 // Make sure reliability doesn't drop and is enough to avoid delete 1184 int newpercent = cld::maxint(reliable_percent, reliable_percent2); 1185 newpercent = cld::maxint(newpercent, kMinReliableKeepPercent); 1186 int newbytes = bytes + bytes2; 1187 int newreli = newpercent * newbytes; 1188 1189 doc_tote->SetKey(fromsub, 0); 1190 doc_tote->SetValue(fromsub, 0); 1191 doc_tote->SetReliability(fromsub, 0); 1192 doc_tote->SetValue(tosub, newbytes); 1193 doc_tote->SetReliability(tosub, newreli); 1194 1195 // Show fate of unreliable languages if at least 10 bytes 1196 if (FLAGS_cld_html /*&& (newpercent >= 10)*/ && (newbytes >= 10)) { 1197 if (into_lang) { 1198 fprintf(stderr, "{Unreli %s.%d(%dB) => %s} ", 1199 ExtLanguageCode(altlang), reliable_percent2, bytes2, 1200 ExtLanguageCode(lang)); 1201 } else { 1202 fprintf(stderr, "{Unreli %s.%d(%dB) => %s} ", 1203 ExtLanguageCode(lang), reliable_percent, bytes, 1204 ExtLanguageCode(altlang)); 1205 } 1206 } 1207 } 1208 1209 1210 // Pass to delete any remaining unreliable languages 1211 for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) { 1212 int plang = doc_tote->Key(sub); 1213 if (plang == 0) {continue;} // Empty slot 1214 1215 Language lang = cld::UnpackLanguage(plang); 1216 int bytes = doc_tote->Value(sub); 1217 int reli = doc_tote->Reliability(sub); 1218 if (bytes == 0) {continue;} // Zero bytes 1219 1220 bool is_tier3 = (cld::kIsPackedTop40[plang] == 0); 1221 if (is_tier3 && 1222 (bytes < kGoodFirstT3MinBytes) && 1223 (bytes < total_bytes)) { 1224 reli = 0; // Too-short tier3 1225 } 1226 1227 // Reliable percent is stored as reliable score over stored bytecount 1228 int reliable_percent = reli / bytes; 1229 if 
(reliable_percent >= kMinReliableKeepPercent) {continue;} // Keeper 1230 1231 // Delete unreliable entry 1232 doc_tote->SetKey(sub, 0); 1233 doc_tote->SetValue(sub, 0); 1234 doc_tote->SetReliability(sub, 0); 1235 1236 // Show fate of unreliable languages if at least 10 bytes 1237 if (FLAGS_cld_html /*&& (reliable_percent >= 10)*/ && (bytes >= 10)) { 1238 fprintf(stderr, "{Unreli %s.%d(%dB)} ", 1239 ExtLanguageCode(lang), reliable_percent, bytes); 1240 } 1241 } 1242 1243 if (FLAGS_cld_html) {fprintf(stderr, "<br>\n");} 1244 } 1245 1246 1247 // Move less likely byte count to more likely for close pairs of languages 1248 void RefineScoredClosePairs(ToteWithReliability* doc_tote) { 1249 for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) { 1250 int close_packedlang = doc_tote->Key(sub); 1251 int subscr = kClosePair[close_packedlang]; 1252 if (subscr == 0) {continue;} 1253 1254 // We have a close pair language -- if the other one is also scored and the 1255 // longword score differs enough, put all our eggs into one basket 1256 1257 // Nonzero longword score: Go look for the other of this pair 1258 for (int sub2 = sub + 1; sub2 < doc_tote->MaxSize(); ++sub2) { 1259 if (kClosePair[doc_tote->Key(sub2)] == subscr) { 1260 // We have a matching pair 1261 int close_packedlang2 = doc_tote->Key(sub2); 1262 1263 // Move all the text bytes from lower byte-count to higher one 1264 int from_sub, to_sub; 1265 Language from_lang, to_lang; 1266 if (doc_tote->Value(sub) < doc_tote->Value(sub2)) { 1267 from_sub = sub; 1268 to_sub = sub2; 1269 from_lang = cld::UnpackLanguage(close_packedlang); 1270 to_lang = cld::UnpackLanguage(close_packedlang2); 1271 } else { 1272 from_sub = sub2; 1273 to_sub = sub; 1274 from_lang = cld::UnpackLanguage(close_packedlang2); 1275 to_lang = cld::UnpackLanguage(close_packedlang); 1276 } 1277 1278 // Move all the bytes smaller => larger of the pair 1279 if (FLAGS_cld_html || FLAGS_dbgscore) { 1280 // Show fate of closepair language 1281 int val = 
doc_tote->Value(from_sub); 1282 int reli = doc_tote->Reliability(from_sub); 1283 int reliable_percent = reli / (val ? val : 1); // avoid zdiv 1284 fprintf(stderr, "{CloseLangPair: %s.%d%%(%dB) => %s} ", 1285 ExtLanguageCode(from_lang), 1286 reliable_percent, 1287 doc_tote->Value(from_sub), 1288 ExtLanguageCode(to_lang)); 1289 } 1290 int sum = doc_tote->Value(to_sub) + doc_tote->Value(from_sub); 1291 doc_tote->SetValue(to_sub, sum); 1292 doc_tote->SetReliability(to_sub, 100 * sum); 1293 1294 // Delete old entry 1295 doc_tote->SetKey(from_sub, 0); 1296 doc_tote->SetValue(from_sub, 0); 1297 doc_tote->SetReliability(from_sub, 0); 1298 1299 break; // Exit inner for sub2 loop 1300 } 1301 } // End for sub2 1302 } // End for sub 1303 } 1304 1305 1306 void ApplyLanguageHints(Tote* chunk_tote, int tote_grams, 1307 uint8* lang_hint_boost) { 1308 // Need 8 quad/unigrams to give full hint boost, else derate linearly 1309 if (tote_grams > 8) { 1310 tote_grams = 8; 1311 } 1312 for (int sub = 0; sub < chunk_tote->MaxSize(); ++sub) { 1313 // Hint boosts are per packed subscript 1314 int lang_sub = chunk_tote->Key(sub); 1315 int new_value = chunk_tote->Value(sub) + 1316 ((lang_hint_boost[lang_sub] * tote_grams) >> 3); 1317 chunk_tote->SetValue(sub, new_value); 1318 if (FLAGS_dbgscore && (lang_hint_boost[lang_sub] > 0)) { 1319 fprintf(stderr, "[%s+=%d*%d/8] ", 1320 ExtLanguageCode(cld::UnpackLanguage(lang_sub)), 1321 lang_hint_boost[lang_sub], tote_grams); 1322 } 1323 } 1324 } 1325 1326 1327 void PrintHtmlEscapedText(FILE* f, const char* txt, int len) { 1328 for (int i = 0; i < len; ++i) { 1329 char c = txt[i]; 1330 if (c == '<') { 1331 fprintf(f, "<"); 1332 } else if (c == '>') { 1333 fprintf(f, ">"); 1334 } else if (c == '&') { 1335 fprintf(f, "&"); 1336 } else if (c == '\'') { 1337 fprintf(f, "'"); 1338 } else if (c == '"') { 1339 fprintf(f, """); 1340 } else { 1341 fprintf(f, "%c", c); 1342 } 1343 } 1344 fprintf(f, "<br>\n"); 1345 } 1346 1347 1348 // Add one chunk's score to 
running document score 1349 // If the top language is UNKNOWN_LANGUAGE, score nothing. This is used to 1350 // positively identify text to be ignored, such as link farms. 1351 // Sort before scoring and reinit afterward 1352 // 1353 // src and srclen are just for debug output 1354 void ScoreChunkIntoDoc(const char* src, int srclen, int advance_by, 1355 UnicodeLScript lscript, 1356 Tote* chunk_tote, 1357 ToteWithReliability* doc_tote, 1358 int tote_grams, 1359 uint8* lang_hint_boost) { 1360 // Apply hints before sorting 1361 if (lang_hint_boost) { 1362 ApplyLanguageHints(chunk_tote, tote_grams, lang_hint_boost); 1363 } 1364 1365 // Sort to get top two languages 1366 chunk_tote->Sort(2); 1367 Language cur_lang = cld::UnpackLanguage(chunk_tote->Key(0)); 1368 1369 // Return if empty 1370 if (cur_lang < 0) { 1371 chunk_tote->Reinit(); 1372 return; 1373 } 1374 1375 bool cur_unreliable = false; 1376 1377 // Reliability is a function of mean script score per KB of text 1378 int len = chunk_tote->GetByteCount(); 1379 int reliability = cld::GetReliability((len * 2) / advance_by, 1380 lscript, 1381 chunk_tote); 1382 cur_unreliable = (reliability < cld::kMinReliable); 1383 1384 // If tote_grams=0, always reliable 1385 // If tote_grams=1, always unreliable 1386 if (tote_grams == 0) { 1387 reliability = 100; 1388 cur_unreliable = false; 1389 } else if (tote_grams == 1) { 1390 reliability = 0; 1391 cur_unreliable = true; 1392 } 1393 1394 #if 0 1395 // TEMP 1396 if (FLAGS_cld_html) { 1397 if (reliability >= kMinReliableKeepPercent) { 1398 fprintf(stderr, "R%d%% ", reliability); 1399 } else { 1400 fprintf(stderr, "--R%d%% ", reliability); 1401 } 1402 } 1403 #endif 1404 1405 // Track the sequence of language fragments [result currently unused] 1406 ////if (reliability >= kMinReliableSeq) { 1407 //// doc_tote->AddSeq(chunk_tote->Key(0)); 1408 ////} 1409 1410 if (cur_unreliable && (chunk_tote->Key(1) != 0)) { 1411 // Unreliable and two top contenders, split byte count 5/8 - 3/8 1412 
int top_len = ((len * 5) + 4) >> 3; 1413 int second_len = len - top_len; 1414 1415 doc_tote->Add(chunk_tote->Key(0), 1416 top_len, chunk_tote->Value(0), reliability); 1417 doc_tote->Add(chunk_tote->Key(1), 1418 second_len, chunk_tote->Value(1), reliability); 1419 if (FLAGS_dbgscore) { 1420 fprintf(stderr, "{+%s.%d.%dR(%dB) +%s.%d.%dR(%dB)} ", 1421 ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(0))), 1422 chunk_tote->Value(0), 1423 reliability, 1424 top_len, 1425 ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(1))), 1426 chunk_tote->Value(1), 1427 reliability, 1428 second_len); 1429 } 1430 } else { 1431 // Reliable or single contender 1432 doc_tote->Add(chunk_tote->Key(0), 1433 len, chunk_tote->Value(0), reliability); 1434 if (FLAGS_dbgscore) { 1435 fprintf(stderr, "{+%s.%d.%dR(%dB)} ", 1436 ExtLanguageCode(cld::UnpackLanguage(chunk_tote->Key(0))), 1437 chunk_tote->Value(0), 1438 reliability, 1439 len); 1440 } 1441 } 1442 1443 if (FLAGS_cld_html) { 1444 if (cur_lang < 0) {cur_lang = UNKNOWN_LANGUAGE;} 1445 cld::PrintLang(stderr, chunk_tote, 1446 cur_lang, cur_unreliable, 1447 prior_lang, prior_unreliable); 1448 prior_lang = cur_lang; 1449 prior_unreliable = cur_unreliable; 1450 1451 string temp(src, srclen); 1452 if (temp[0] == '=') { 1453 // Rewrite =ScriptX= or =SwitchX= as =Xxxx= for script code Xxxx 1454 temp = "=Buffered_"; 1455 temp.append(UnicodeLScriptCode(lscript)); 1456 temp.append("="); 1457 } 1458 cld::PrintText(stderr, cur_lang, temp); 1459 } 1460 1461 chunk_tote->Reinit(); 1462 } 1463 1464 1465 void PrintTopLang(Language top_lang) { 1466 if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) { 1467 fprintf(stderr, "[] "); 1468 } else { 1469 fprintf(stderr, "[%s] ", ExtLanguageName(top_lang)); 1470 prior_lang = top_lang; 1471 } 1472 } 1473 1474 void PrintTopLangSpeculative(Language top_lang) { 1475 fprintf(stderr, "<span style=\"color:#%06X;\">", 0xa0a0a0); 1476 if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) { 1477 
fprintf(stderr, "[] "); 1478 } else { 1479 fprintf(stderr, "[%s] ", ExtLanguageName(top_lang)); 1480 prior_lang = top_lang; 1481 } 1482 fprintf(stderr, "</span>\n"); 1483 } 1484 1485 1486 // Add one chunk's score to running document score 1487 // Convenience function with constant src text 1488 void ScoreChunkIntoDoc2(const char* src, int advance_by, 1489 UnicodeLScript lscript, 1490 Tote* chunk_tote, 1491 ToteWithReliability* doc_tote, 1492 int tote_grams, 1493 uint8* lang_hint_boost) { 1494 int srclen = static_cast<int>(strlen(src)); 1495 ScoreChunkIntoDoc(src, srclen, advance_by, lscript, chunk_tote, 1496 doc_tote, tote_grams, lang_hint_boost); 1497 } 1498 1499 1500 // Score one scriptspan using the only language for that script 1501 void ScoreNilgrams(getone::LangSpan* scriptspan, int lang, 1502 ToteWithReliability* doc_tote, 1503 uint8* lang_hint_boost, 1504 int flags, Language plus_one) { 1505 // For debugging only. Not thread-safe 1506 prior_lang = UNKNOWN_LANGUAGE; 1507 prior_unreliable = false; 1508 1509 const char* src = scriptspan->text; 1510 int len = scriptspan->text_bytes; 1511 1512 Tote chunk_tote; 1513 // Score 1000 for 1000 bytes 1514 chunk_tote.AddGram(); 1515 chunk_tote.Add(lang, scriptspan->text_bytes); 1516 chunk_tote.AddBytes(scriptspan->text_bytes); 1517 int advance_by = 2; 1518 int tote_grams = 0; // Indicates fully reliable 1519 ScoreChunkIntoDoc(src, len, advance_by, 1520 scriptspan->script, &chunk_tote, 1521 doc_tote, tote_grams, lang_hint_boost); 1522 } 1523 1524 // Score one scriptspan using unigrams 1525 // Updates tote_grams 1526 static void ScoreUnigrams(const UTF8PropObj* unigram_obj, 1527 getone::LangSpan* scriptspan, 1528 int* tote_grams, int gram_limit, 1529 Tote* chunk_tote, 1530 ToteWithReliability* doc_tote, 1531 uint8* lang_hint_boost, 1532 int advance_by, int flags, 1533 int* initial_word_span, Language plus_one) { 1534 // chunk_tote may have partial sum coming in 1535 const char* src = scriptspan->text; 1536 const char* 
srclimit = src + scriptspan->text_bytes; 1537 1538 // For debugging only. Not thread-safe 1539 prior_lang = UNKNOWN_LANGUAGE; 1540 prior_unreliable = false; 1541 1542 // Break text up into multiple chunks and score each 1543 while (src < srclimit) { 1544 // Updates tote_grams 1545 int len = cld::DoUniScoreV3(unigram_obj, 1546 src, srclimit - src, advance_by, 1547 tote_grams, gram_limit, chunk_tote); 1548 if (FlagUseWords(flags) || (*initial_word_span > 0)) { 1549 // Use bigram scoring in addition to quadgrams 1550 cld::DoBigramScoreV3(&kCjkBiTable_obj, 1551 src, len, chunk_tote); 1552 } 1553 chunk_tote->AddBytes(len); 1554 *initial_word_span -= len; 1555 1556 if (*tote_grams >= gram_limit) { 1557 // Add this chunk to doc totals 1558 // Remove all but top40 if asked 1559 if (FlagTop40(flags)) { 1560 cld::DemoteNotTop40(chunk_tote, cld::PackLanguage(plus_one)); 1561 } 1562 1563 // Sort, accumulate into doc total, reinit 1564 ScoreChunkIntoDoc(src, len, advance_by, 1565 scriptspan->script, chunk_tote, 1566 doc_tote, *tote_grams, lang_hint_boost); 1567 *tote_grams = 0; 1568 } else { 1569 if (FLAGS_cld_html) { 1570 string temp(src, len); 1571 Language top_lang = cld::UnpackLanguage(chunk_tote->CurrentTopKey()); 1572 PrintTopLangSpeculative(top_lang); 1573 cld::PrintText(stderr, top_lang, temp); 1574 } 1575 } 1576 src += len; 1577 } 1578 // chunk_tote may have partial sum going out 1579 } 1580 1581 // Back up one UTF-8 character 1582 const uint8* BackOneUTF8(const uint8* p) { 1583 const uint8* retval = p - 1; 1584 if ((*retval & 0xc0) == 0x80) {--retval;} 1585 if ((*retval & 0xc0) == 0x80) {--retval;} 1586 if ((*retval & 0xc0) == 0x80) {--retval;} 1587 return retval; 1588 } 1589 1590 1591 // Score one scriptspan using quadgrams 1592 // Incoming chunk_tote may have partial accumulation 1593 static void ScoreQuadgrams(const cld::CLDTableSummary* quadgram_obj, 1594 getone::LangSpan* scriptspan, 1595 int* tote_grams, int gram_limit, 1596 Tote* chunk_tote, 1597 
ToteWithReliability* doc_tote, 1598 uint8* lang_hint_boost, 1599 int advance_by, int flags, 1600 int* initial_word_span, Language plus_one) { 1601 // chunk_tote may have partial sum coming in 1602 const char* src = scriptspan->text; 1603 const char* srclimit = src + scriptspan->text_bytes; 1604 const char* lastscored_src = src; 1605 1606 // For debugging only. Not thread-safe 1607 prior_lang = UNKNOWN_LANGUAGE; 1608 prior_unreliable = false; 1609 1610 // Break text up into multiple chunks and score each 1611 while (src < srclimit) { 1612 // Updates tote_grams 1613 int len = cld::DoQuadScoreV3(quadgram_obj, 1614 src, srclimit - src, advance_by, 1615 tote_grams, gram_limit, chunk_tote); 1616 if (FlagUseWords(flags) || (*initial_word_span > 0)) { 1617 // Use word scoring in addition to quadgrams 1618 cld::DoOctaScoreV3(&kLongWord8Table_obj, 1619 src, len, chunk_tote); 1620 } 1621 chunk_tote->AddBytes(len); 1622 *initial_word_span -= len; 1623 1624 if (*tote_grams >= gram_limit) { 1625 // Remove all but top40 if asked 1626 if (FlagTop40(flags)) { 1627 cld::DemoteNotTop40(chunk_tote, cld::PackLanguage(plus_one)); 1628 } 1629 1630 // Sort, accumulate into doc total, reinit 1631 ScoreChunkIntoDoc(src, len, advance_by, 1632 scriptspan->script, chunk_tote, 1633 doc_tote, *tote_grams, lang_hint_boost); 1634 lastscored_src = src + len; 1635 *tote_grams = 0; 1636 } else { 1637 if (FLAGS_cld_html) { 1638 string temp(src, len); 1639 Language top_lang = cld::UnpackLanguage(chunk_tote->CurrentTopKey()); 1640 PrintTopLangSpeculative(top_lang); 1641 cld::PrintText(stderr, top_lang, temp); 1642 } 1643 } 1644 src += len; 1645 } 1646 } 1647 1648 1649 1650 void PrintLangs(FILE* f, const Language* language3, const int* percent3, 1651 const int* text_bytes, const bool* is_reliable) { 1652 fprintf(f, "<br> Initial_Languages "); 1653 if (language3[0] != UNKNOWN_LANGUAGE) { 1654 fprintf(f, "%s%s(%d%%) ", 1655 ExtLanguageName(language3[0]), 1656 *is_reliable ? 
"" : "*", 1657 percent3[0]); 1658 } 1659 if (language3[1] != UNKNOWN_LANGUAGE) { 1660 fprintf(f, "%s(%d%%) ", ExtLanguageName(language3[1]), percent3[1]); 1661 } 1662 if (language3[2] != UNKNOWN_LANGUAGE) { 1663 fprintf(f, "%s(%d%%) ", ExtLanguageName(language3[2]), percent3[2]); 1664 } 1665 fprintf(f, "%d bytes \n", *text_bytes); 1666 1667 fprintf(f, "<br>\n"); 1668 } 1669 1670 1671 // Start the tote with a count of one for the default language for script 1672 void InitScriptToteLang(Tote* script_tote, UnicodeLScript lscript) { 1673 Language defaultlang = cld::kDefaultLanguagePerLScript[lscript]; 1674 script_tote->Add(cld::PackLanguage(defaultlang), 1); 1675 script_tote->AddBytes(1); 1676 #if 0 1677 if (FLAGS_cld_html) { 1678 cld::PrintLang(stderr, script_tote, 1679 defaultlang, false, 1680 UNKNOWN_LANGUAGE, false); 1681 prior_lang = cur_lang; 1682 string temp("+1"); 1683 cld::PrintText(stderr, defaultlang, temp); 1684 } 1685 #endif 1686 } 1687 1688 static const char* const kToteName[4] = 1689 {"=Latn=", "=Hani=", "=Script2=", "=Script3="}; 1690 static const char* const kToteSwitch[4] = 1691 {"=Latn=", "=Hani=", "=Switch2=", "=Switch3="}; 1692 1693 1694 1695 // Upper to lower, keep digits, everything else to minus '-' (2d) 1696 static const char kCharsetToLowerTbl[256] = { 1697 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 1698 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 1699 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 1700 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37, 0x38,0x39,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 1701 1702 0x2d,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f, 1703 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x2d,0x2d,0x2d,0x2d,0x2d, 1704 0x2d,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f, 1705 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x2d,0x2d,0x2d,0x2d,0x2d, 
1706 1707 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 1708 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 1709 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 1710 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 1711 1712 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 1713 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 1714 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 1715 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 1716 }; 1717 1718 1719 static const char kIsAlpha[256] = { 1720 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1721 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1722 0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0, 1723 0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0, 1724 1725 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1726 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1727 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1728 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1729 }; 1730 1731 static const char kIsDigit[256] = { 1732 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1733 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1, 1,1,0,0,0,0,0,0, 1734 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1735 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1736 1737 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1738 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1739 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1740 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 
1741 }; 1742 1743 // Normalize ASCII string to first 4 alphabetic/digit chars 1744 // Letters are forced to lowercase ASCII 1745 // Used to normalize TLD values 1746 void MakeChar4(const char* str, char* norm) { 1747 memcpy(norm, "____", 4); // four underscores 1748 int l_ptr = 0; 1749 for (int i = 0; i < strlen(str); ++i) { 1750 uint8 uc = static_cast<uint8>(str[i]); 1751 if (kIsAlpha[uc] | kIsDigit[uc]) { 1752 if (l_ptr < 4) { // Else ignore 1753 norm[l_ptr] = kCharsetToLowerTbl[uc]; 1754 l_ptr++; 1755 } 1756 } 1757 } 1758 } 1759 1760 // Find subscript of matching key in first 4 bytes of sorted hint array, or -1 1761 static int HintBinaryLookup4(const HintEntry* hintprobs, int hintprobssize, 1762 const char* norm_key) { 1763 // Key is always in range [lo..hi) 1764 int lo = 0; 1765 int hi = hintprobssize; 1766 while (lo < hi) { 1767 int mid = (lo + hi) >> 1; 1768 int comp = memcmp(&hintprobs[mid].key[0], norm_key, 4); 1769 if (comp < 0) { 1770 lo = mid + 1; 1771 } else if (comp > 0) { 1772 hi = mid; 1773 } else { 1774 return mid; 1775 } 1776 } 1777 return -1; 1778 } 1779 1780 1781 // Increment the initial probabilities based on a per-TLD probs entry 1782 void ApplyTLDHint(uint8* lang_hint_boost, const char* tld_hint) { 1783 if (FLAGS_dbgscore) { 1784 fprintf(stderr, "TLD hint %s\n", tld_hint); 1785 } 1786 char normalized_tld[8]; 1787 MakeChar4(tld_hint, normalized_tld); 1788 int n = HintBinaryLookup4(kTLDHintProbs, kTLDHintProbsSize, 1789 normalized_tld); 1790 // TLD is four bytes, probability entry is 4 bytes 1791 if (n >= 0) { 1792 uint32 probs = kTLDHintProbs[n].probs; 1793 1794 uint8 prob123 = (probs >> 0) & 0xff; 1795 const uint8* prob123_entry = cld::LgProb2TblEntry(prob123); 1796 uint8 top1 = (probs >> 8) & 0xff; 1797 if (top1 > 0) {lang_hint_boost[top1] += cld::LgProb3(prob123_entry, 0);} 1798 uint8 top2 = (probs >> 16) & 0xff; 1799 if (top2 > 0) {lang_hint_boost[top2] += cld::LgProb3(prob123_entry, 1);} 1800 uint8 top3 = (probs >> 24) & 0xff; 1801 if 
(top3 > 0) {lang_hint_boost[top3] += cld::LgProb3(prob123_entry, 2);} 1802 } 1803 } 1804 1805 1806 // Increment the initial probabilities based on a per-encoding probs entry 1807 void ApplyEncodingHint(uint8* lang_hint_boost, int encoding_hint) { 1808 if (FLAGS_dbgscore) { 1809 Encoding tempenc = static_cast<Encoding>(encoding_hint); 1810 fprintf(stderr, "ENC hint %s\n", EncodingName(tempenc)); 1811 } 1812 if (encoding_hint < ISO_8859_1) {return;} 1813 if (encoding_hint >= NUM_ENCODINGS) {return;} 1814 uint32 probs = kEncodingHintProbs[encoding_hint]; 1815 1816 uint8 prob123 = (probs >> 0) & 0xff; 1817 const uint8* prob123_entry = cld::LgProb2TblEntry(prob123); 1818 uint8 top1 = (probs >> 8) & 0xff; 1819 if (top1 > 0) {lang_hint_boost[top1] += cld::LgProb3(prob123_entry, 0);} 1820 uint8 top2 = (probs >> 16) & 0xff; 1821 if (top2 > 0) {lang_hint_boost[top2] += cld::LgProb3(prob123_entry, 1);} 1822 uint8 top3 = (probs >> 24) & 0xff; 1823 if (top3 > 0) {lang_hint_boost[top3] += cld::LgProb3(prob123_entry, 2);} 1824 } 1825 1826 1827 // Increment the initial probability for given language by fixed amount 1828 // Does not recognize extended languages as hints 1829 void ApplyLanguageHint(uint8* lang_hint_boost, Language language_hint) { 1830 if (FLAGS_dbgscore) { 1831 fprintf(stderr, "LANG hint %s\n", ExtLanguageName(language_hint)); 1832 } 1833 if (language_hint < ENGLISH) {return;} 1834 if (language_hint >= NUM_LANGUAGES) {return;} 1835 uint32 probs = kLanguageHintProbs[language_hint]; 1836 1837 uint8 prob123 = (probs >> 0) & 0xff; 1838 const uint8* prob123_entry = cld::LgProb2TblEntry(prob123); 1839 uint8 top1 = (probs >> 8) & 0xff; 1840 if (top1 > 0) {lang_hint_boost[top1] += cld::LgProb3(prob123_entry, 0);} 1841 uint8 top2 = (probs >> 16) & 0xff; 1842 if (top2 > 0) {lang_hint_boost[top2] += cld::LgProb3(prob123_entry, 1);} 1843 uint8 top3 = (probs >> 24) & 0xff; 1844 if (top3 > 0) {lang_hint_boost[top3] += cld::LgProb3(prob123_entry, 2);} 1845 } 1846 1847 // Extract 
return values before fixups 1848 void ExtractLangEtc(ToteWithReliability* doc_tote, int total_text_bytes, 1849 int* reliable_percent3, Language* language3, int* percent3, 1850 double* normalized_score3, 1851 int* text_bytes, bool* is_reliable) { 1852 reliable_percent3[0] = 0; 1853 reliable_percent3[1] = 0; 1854 reliable_percent3[2] = 0; 1855 language3[0] = UNKNOWN_LANGUAGE; 1856 language3[1] = UNKNOWN_LANGUAGE; 1857 language3[2] = UNKNOWN_LANGUAGE; 1858 percent3[0] = 100; 1859 percent3[1] = 0; 1860 percent3[2] = 0; 1861 normalized_score3[0] = 0.0; 1862 normalized_score3[1] = 0.0; 1863 normalized_score3[2] = 0.0; 1864 1865 *text_bytes = total_text_bytes; 1866 *is_reliable = false; 1867 1868 int bytecount1 = total_text_bytes; 1869 int bytecount2 = 0; 1870 int bytecount3 = 0; 1871 1872 int lang1 = doc_tote->Key(0); 1873 if (lang1 != 0) { 1874 // We have a top language 1875 language3[0] = cld::UnpackLanguage(lang1); 1876 bytecount1 = doc_tote->Value(0); 1877 int reli1 = doc_tote->Reliability(0); 1878 reliable_percent3[0] = reli1 / (bytecount1 ? bytecount1 : 1); // avoid zdiv 1879 normalized_score3[0] = cld::GetNormalizedScore(language3[0], 1880 ULScript_Common, 1881 bytecount1, 1882 doc_tote->Score(0)); 1883 } 1884 1885 int lang2 = doc_tote->Key(1); 1886 if (lang2 != 0) { 1887 language3[1] = cld::UnpackLanguage(lang2); 1888 bytecount2 = doc_tote->Value(1); 1889 int reli2 = doc_tote->Reliability(1); 1890 reliable_percent3[1] = reli2 / (bytecount2 ? bytecount2 : 1); // avoid zdiv 1891 normalized_score3[1] = cld::GetNormalizedScore(language3[1], 1892 ULScript_Common, 1893 bytecount2, 1894 doc_tote->Score(1)); 1895 } 1896 1897 int lang3 = doc_tote->Key(2); 1898 if (lang3 != 0) { 1899 language3[2] = cld::UnpackLanguage(lang3); 1900 bytecount3 = doc_tote->Value(2); 1901 int reli3 = doc_tote->Reliability(2); 1902 reliable_percent3[2] = reli3 / (bytecount3 ? 
bytecount3 : 1); // avoid zdiv 1903 normalized_score3[2] = cld::GetNormalizedScore(language3[2], 1904 ULScript_Common, 1905 bytecount3, 1906 doc_tote->Score(2)); 1907 } 1908 1909 // Increase total bytes to sum (top 3) if low for some reason 1910 int total_bytecount12 = bytecount1 + bytecount2; 1911 int total_bytecount123 = total_bytecount12 + bytecount3; 1912 if (total_text_bytes < total_bytecount123) { 1913 total_text_bytes = total_bytecount123; 1914 *text_bytes = total_text_bytes; 1915 } 1916 1917 // Sum minus previous % gives better roundoff behavior than bytecount/total 1918 int total_text_bytes_div = cld::maxint(1, total_text_bytes); // Avoid zdiv 1919 percent3[0] = (bytecount1 * 100) / total_text_bytes_div; 1920 percent3[1] = (total_bytecount12 * 100) / total_text_bytes_div; 1921 percent3[2] = (total_bytecount123 * 100) / total_text_bytes_div; 1922 percent3[2] -= percent3[1]; 1923 percent3[1] -= percent3[0]; 1924 1925 // Roundoff, say 96% 1.6% 1.4%, will produce non-obvious 96% 1% 2% 1926 // Fix this explicitly 1927 if (percent3[1] < percent3[2]) { 1928 ++percent3[1]; 1929 --percent3[2]; 1930 } 1931 if (percent3[0] < percent3[1]) { 1932 ++percent3[0]; 1933 --percent3[1]; 1934 } 1935 1936 *text_bytes = total_text_bytes; 1937 1938 if (lang1 != 0) { 1939 // We have a top language 1940 // Its reliability is overal result reliability 1941 int bytecount = doc_tote->Value(0); 1942 int reli = doc_tote->Reliability(0); 1943 int reliable_percent = reli / (bytecount ? bytecount : 1); // avoid zdiv 1944 *is_reliable = reliable_percent >= cld::kMinReliable; 1945 } else { 1946 // No top language at all. This can happen with zero text or 100% Klingon 1947 // if extended=false. Just return all UNKNOWN_LANGUAGE, reliable. 
1948 *is_reliable = true; 1949 } 1950 } 1951 1952 bool IsFIGS(Language lang) { 1953 if (lang == FRENCH) {return true;} 1954 if (lang == ITALIAN) {return true;} 1955 if (lang == GERMAN) {return true;} 1956 if (lang == SPANISH) {return true;} 1957 return false; 1958 } 1959 1960 bool IsEFIGS(Language lang) { 1961 if (lang == ENGLISH) {return true;} 1962 if (lang == FRENCH) {return true;} 1963 if (lang == ITALIAN) {return true;} 1964 if (lang == GERMAN) {return true;} 1965 if (lang == SPANISH) {return true;} 1966 return false; 1967 } 1968 1969 static const int kNonEnBoilerplateMinPercent = 17; // <this => no second 1970 static const int kNonFIGSBoilerplateMinPercent = 20; // <this => no second 1971 static const int kGoodFirstMinPercent = 26; // <this => UNK 1972 static const int kGoodFirstReliableMinPercent = 51; // <this => unreli 1973 static const int kIgnoreMaxPercent = 95; // >this => unreli 1974 static const int kKeepMinPercent = 2; // <this => unreli 1975 1976 // For Tier3 languages, require more bytes of text to override 1977 // the first-place language 1978 static const int kGoodSecondT1T2MinBytes = 15; // <this => no second 1979 static const int kGoodSecondT3MinBytes = 128; // <this => no second 1980 // 1981 1982 // Calculate a single summary language for the document, and its reliability. 1983 // Returns language3[0] or language3[1] or ENGLISH or UNKNOWN_LANGUAGE 1984 // This is the heart of matching human-rater perception. 
1985 // reliable_percent3[] is currently unused 1986 // 1987 // Do not return Tier3 second language unless there are at least 128 bytes 1988 void CalcSummaryLang(ToteWithReliability* doc_tote, int total_text_bytes, 1989 const int* reliable_percent3, 1990 const Language* language3, 1991 const int* percent3, 1992 Language* summary_lang, bool* is_reliable) { 1993 // Vector of active languages; changes if we delete some 1994 int slot_count = 3; 1995 int active_slot[3] = {0, 1, 2}; 1996 1997 int ignore_percent = 0; 1998 int return_percent = percent3[0]; // Default to top lang 1999 *summary_lang = language3[0]; 2000 *is_reliable = true; 2001 if (percent3[0] < kKeepMinPercent) {*is_reliable = false;} 2002 2003 // If any of top 3 is IGNORE, remove it and increment ignore_percent 2004 for (int i = 0; i < 3; ++i) { 2005 if (language3[i] == TG_UNKNOWN_LANGUAGE) { 2006 ignore_percent += percent3[i]; 2007 // Move the rest up, levaing input vectors unchanged 2008 for (int j=i+1; j < 3; ++j) { 2009 active_slot[j - 1] = active_slot[j]; 2010 } 2011 -- slot_count; 2012 // Logically remove Ignore from percentage-text calculation 2013 // (extra 1 in 101 avoids zdiv, biases slightly small) 2014 return_percent = (percent3[0] * 100) / (101 - ignore_percent); 2015 *summary_lang = language3[active_slot[0]]; 2016 if (percent3[active_slot[0]] < kKeepMinPercent) {*is_reliable = false;} 2017 } 2018 } 2019 2020 2021 // If English and X, where X (not UNK) is big enough, 2022 // assume the English is boilerplate and return X. 
2023 // Logically remove English from percentage-text calculation 2024 int second_bytes = (total_text_bytes * percent3[active_slot[1]]) / 100; 2025 // Require more bytes of text for Tier3 languages 2026 int minbytesneeded = kGoodSecondT1T2MinBytes; 2027 int plang_second = cld::PackLanguage(language3[active_slot[1]]); 2028 bool is_tier3 = (cld::kIsPackedTop40[plang_second] == 0); 2029 if (is_tier3) { 2030 minbytesneeded = kGoodSecondT3MinBytes; 2031 } 2032 2033 if ((language3[active_slot[0]] == ENGLISH) && 2034 (language3[active_slot[1]] != ENGLISH) && 2035 (language3[active_slot[1]] != UNKNOWN_LANGUAGE) && 2036 (percent3[active_slot[1]] >= kNonEnBoilerplateMinPercent) && 2037 (second_bytes >= minbytesneeded)) { 2038 ignore_percent += percent3[active_slot[0]]; 2039 return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent); 2040 *summary_lang = language3[active_slot[1]]; 2041 if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;} 2042 2043 // Else If FIGS and X, where X (not UNK, EFIGS) is big enough, 2044 // assume the FIGS is boilerplate and return X. 
2045 // Logically remove FIGS from percentage-text calculation 2046 } else if (IsFIGS(language3[active_slot[0]]) && 2047 !IsEFIGS(language3[active_slot[1]]) && 2048 (language3[active_slot[1]] != UNKNOWN_LANGUAGE) && 2049 (percent3[active_slot[1]] >= kNonFIGSBoilerplateMinPercent) && 2050 (second_bytes >= minbytesneeded)) { 2051 ignore_percent += percent3[active_slot[0]]; 2052 return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent); 2053 *summary_lang = language3[active_slot[1]]; 2054 if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;} 2055 2056 // Else we are returning the first language, but want to improve its 2057 // return_percent if the second language should be ignored 2058 } else if ((language3[active_slot[1]] == ENGLISH) && 2059 (language3[active_slot[0]] != ENGLISH)) { 2060 ignore_percent += percent3[active_slot[1]]; 2061 return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent); 2062 } else if (IsFIGS(language3[active_slot[1]]) && 2063 !IsEFIGS(language3[active_slot[0]])) { 2064 ignore_percent += percent3[active_slot[1]]; 2065 return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent); 2066 } 2067 2068 // If return percent is too small (too many languages), return UNKNOWN 2069 if ((return_percent < kGoodFirstMinPercent)) { 2070 *summary_lang = UNKNOWN_LANGUAGE; 2071 *is_reliable = false; 2072 } 2073 2074 // If return percent is small, return language but set unreliable. 2075 if ((return_percent < kGoodFirstReliableMinPercent)) { 2076 *is_reliable = false; 2077 } 2078 2079 // If ignore percent is too large, set unreliable. 
2080 if ((ignore_percent > kIgnoreMaxPercent)) { 2081 *is_reliable = false; 2082 } 2083 2084 // If we removed all the active languages, return UNKNOWN 2085 if (slot_count == 0) { 2086 *summary_lang = UNKNOWN_LANGUAGE; 2087 *is_reliable = false; 2088 } 2089 } 2090 2091 2092 2093 // Result vector must be exactly three items 2094 Language CompactLangDetImpl::DetectLanguageSummaryV25( 2095 const CompactLangDet::DetectionTables* tables, 2096 const char* buffer, 2097 int buffer_length, 2098 bool is_plain_text, 2099 const char* tld_hint, // "id" boosts Indonesian 2100 int encoding_hint, // SJS boosts Japanese 2101 Language language_hint, // ITALIAN boosts it 2102 bool allow_extended_lang, 2103 int flags, 2104 Language plus_one, 2105 Language* language3, 2106 int* percent3, 2107 double* normalized_score3, 2108 int* text_bytes, 2109 bool* is_reliable) { 2110 if (!tables) { 2111 static const CompactLangDet::DetectionTables default_cld_tables = { 2112 &kQuadTable_obj, 2113 &compact_lang_det_generated_ctjkvz_b1_obj 2114 }; 2115 tables = &default_cld_tables; 2116 } 2117 language3[0] = UNKNOWN_LANGUAGE; 2118 language3[1] = UNKNOWN_LANGUAGE; 2119 language3[2] = UNKNOWN_LANGUAGE; 2120 percent3[0] = 100; 2121 percent3[1] = 0; 2122 percent3[2] = 0; 2123 normalized_score3[0] = 0.0; 2124 normalized_score3[1] = 0.0; 2125 normalized_score3[2] = 0.0; 2126 *text_bytes = 0; 2127 *is_reliable = false; 2128 2129 // Document totals 2130 ToteWithReliability doc_tote; // Reliability = 0..100 2131 2132 // Vector of packed per-language boosts (just one filled in from hints) 2133 uint8 lang_hint_boost[EXT_NUM_LANGUAGES + 1]; 2134 memset(lang_hint_boost, 0, sizeof(lang_hint_boost)); 2135 2136 // Apply hints,if any 2137 if ((tld_hint != NULL) && (tld_hint[0] != '\0')) { 2138 ApplyTLDHint(lang_hint_boost, tld_hint); 2139 } 2140 if (encoding_hint != UNKNOWN_ENCODING) { 2141 ApplyEncodingHint(lang_hint_boost, encoding_hint); 2142 } 2143 if (language_hint != UNKNOWN_LANGUAGE) { 2144 
ApplyLanguageHint(lang_hint_boost, language_hint); 2145 } 2146 2147 2148 // Four individual script totals, Latin, Han, other2, other3 2149 int next_other_tote = 2; 2150 2151 // Four totes for up to four different scripts pending at once 2152 Tote totes[4]; // [0] Latn [1] Hani [2] other [3] other 2153 bool tote_seen[4] = {false, false, false, false}; 2154 int tote_grams[4] = {0, 0, 0, 0}; // Number in partial chunk 2155 UnicodeLScript tote_script[4] = 2156 {ULScript_Latin, ULScript_HanCJK, ULScript_Common, ULScript_Common}; 2157 2158 // Loop through text spans in a single script 2159 ScriptScanner ss(buffer, buffer_length, is_plain_text); 2160 getone::LangSpan scriptspan; 2161 2162 scriptspan.text = NULL; 2163 scriptspan.text_bytes = 0; 2164 scriptspan.offset = 0; 2165 scriptspan.script = ULScript_Common; 2166 scriptspan.lang = UNKNOWN_LANGUAGE; 2167 2168 int total_text_bytes = 0; 2169 int textlimit = FLAGS_cld_textlimit << 10; // in KB 2170 if (textlimit == 0) {textlimit = 0x7fffffff;} 2171 2172 int advance_by = 2; // Advance 2 bytes 2173 int advance_limit = textlimit >> 3; // For first 1/8 of max document 2174 2175 int initial_word_span = kDefaultWordSpan; 2176 if (FLAGS_cld_forcewords) { 2177 initial_word_span = kReallyBigWordSpan; 2178 } 2179 2180 // Pick up chunk sizes 2181 // Smoothwidth is units of quadgrams, about 2.5 chars (unigrams) each 2182 // Sanity check -- force into a reasonable range 2183 int chunksizequads = FLAGS_cld_smoothwidth; 2184 chunksizequads = cld::minint(cld::maxint(chunksizequads, kMinChunkSizeQuads), 2185 kMaxChunkSizeQuads); 2186 int chunksizeunis = (chunksizequads * 5) >> 1; 2187 2188 // Varying short-span limit doesn't work well -- skips too much beyond 20KB 2189 // int spantooshortlimit = advance_by * FLAGS_cld_smoothwidth; 2190 int spantooshortlimit = kShortSpanThresh; 2191 2192 // For debugging only. 
Not thread-safe 2193 prior_lang = UNKNOWN_LANGUAGE; 2194 prior_unreliable = false; 2195 2196 // Allocate full-document prediction table for finding repeating words 2197 int hash = 0; 2198 int* predict_tbl = new int[kPredictionTableSize]; 2199 if (FlagRepeats(flags)) { 2200 memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0])); 2201 } 2202 2203 // Loop through scriptspans accumulating number of text bytes in each language 2204 while (ss.GetOneScriptSpanLower(&scriptspan)) { 2205 UnicodeLScript lscript = scriptspan.script; 2206 2207 // Echo text if asked to 2208 if (FLAGS_cld_echotext) { 2209 PrintHtmlEscapedText(stderr, scriptspan.text, scriptspan.text_bytes); 2210 } 2211 2212 // Squeeze out big chunks of text span if asked to 2213 if (FlagSqueeze(flags)) { 2214 // Remove repetitive or mostly-spaces chunks 2215 int newlen; 2216 int chunksize = 0; // Use the default 2217 newlen = CheapSqueezeInplace(scriptspan.text, scriptspan.text_bytes, 2218 chunksize); 2219 scriptspan.text_bytes = newlen; 2220 } else { 2221 // Check now and then to see if we should be squeezing 2222 if ((total_text_bytes >= kCheapSqueezeTestThresh) && 2223 !FlagFinish(flags) && 2224 ((getone::kMaxScriptBuffer >> 1) < scriptspan.text_bytes) && 2225 CheapSqueezeTriggerTest(scriptspan.text, 2226 scriptspan.text_bytes, 2227 kCheapSqueezeTestLen)) { 2228 // Recursive call with big-chunk squeezing set 2229 if (FLAGS_cld_html || FLAGS_dbgscore) { 2230 fprintf(stderr, 2231 "<br>---text_bytes[%d] Recursive(Squeeze)---<br><br>\n", 2232 total_text_bytes); 2233 } 2234 // Deallocate full-document prediction table 2235 delete[] predict_tbl; 2236 2237 return DetectLanguageSummaryV25( 2238 tables, 2239 buffer, 2240 buffer_length, 2241 is_plain_text, 2242 tld_hint, // "id" boosts Indonesian 2243 encoding_hint, // SJS boosts Japanese 2244 language_hint, // ITALIAN boosts it 2245 allow_extended_lang, 2246 flags | kCLDFlagSqueeze, 2247 plus_one, 2248 language3, 2249 percent3, 2250 normalized_score3, 
2251 text_bytes, 2252 is_reliable); 2253 } 2254 } 2255 2256 // Remove repetitive words if asked to 2257 if (FlagRepeats(flags)) { 2258 // Remove repetitive words 2259 int newlen; 2260 newlen = CheapRepWordsInplace(scriptspan.text, scriptspan.text_bytes, 2261 &hash, predict_tbl); 2262 scriptspan.text_bytes = newlen; 2263 } 2264 2265 // The real scoring 2266 // Accumulate directly into the document total, or accmulate in one of four 2267 // chunk totals. The purpose of the multiple chunk totals is to piece 2268 // together short choppy pieces of text in alternating scripts. One total is 2269 // dedicated to Latin text, one to Han text, and the other two are dynamicly 2270 // assigned. 2271 Language onlylang = cld::kOnlyLanguagePerLScript[lscript]; 2272 2273 if (onlylang != UNKNOWN_LANGUAGE) { 2274 // This entire script run is in a single language. 2275 ScoreNilgrams(&scriptspan, cld::PackLanguage(onlylang), &doc_tote, 2276 lang_hint_boost, flags, plus_one); 2277 } else if (cld::kScoreUniPerLScript[lscript] != 0) { 2278 // This entire script run's languages can be distinguished by uni-grams 2279 // Accumulate in hani_tote 2280 int tote_num = 1; 2281 if (!tote_seen[tote_num]) { 2282 tote_seen[tote_num] = true; 2283 // Default language gets 1 byte 2284 total_text_bytes += 1; 2285 InitScriptToteLang(&totes[tote_num], lscript); 2286 } 2287 ScoreUnigrams(tables->unigram_obj, 2288 &scriptspan, &tote_grams[tote_num], chunksizeunis, 2289 &totes[tote_num], 2290 &doc_tote, lang_hint_boost, 2291 advance_by, flags, &initial_word_span, plus_one); 2292 } else { 2293 // This entire script-run's languages can be distinguished by quad-grams 2294 // Accumulate in latn_tote or script0/1_tote 2295 int tote_num = -1; 2296 for (int t = 0; t < 4; ++t) { 2297 if (lscript == tote_script[t]) { 2298 tote_num = t; 2299 break; 2300 } 2301 } 2302 if (tote_num < 0) { 2303 // Need to allocate other0/1 2304 tote_num = next_other_tote; 2305 next_other_tote ^= 1; // Round-robin 2306 if 
(tote_seen[tote_num]) { 2307 // Flush previous 2308 ScoreChunkIntoDoc2(kToteSwitch[tote_num], advance_by, 2309 tote_script[tote_num], &totes[tote_num], 2310 &doc_tote, tote_grams[tote_num], lang_hint_boost); 2311 totes[tote_num].Reinit(); 2312 } 2313 tote_script[tote_num] = lscript; 2314 } 2315 2316 if (!tote_seen[tote_num]) { 2317 tote_seen[tote_num] = true; 2318 // Default language gets 1 byte 2319 total_text_bytes += 1; 2320 InitScriptToteLang(&totes[tote_num], lscript); 2321 } 2322 2323 // The actual accumulation, possibly with word scoring also 2324 ScoreQuadgrams(tables->quadgram_obj, &scriptspan, &tote_grams[tote_num], 2325 chunksizequads, 2326 &totes[tote_num], 2327 &doc_tote, lang_hint_boost, 2328 advance_by, flags, &initial_word_span, plus_one); 2329 } 2330 2331 total_text_bytes += scriptspan.text_bytes; 2332 2333 // For long documents, do less-dense samples the further along we go. 2334 // This is to keep speed sublinear in document size. 2335 if (total_text_bytes > advance_limit) { 2336 if (total_text_bytes > textlimit) { 2337 // Don't look at rest of doc 2338 if (FLAGS_cld_html || FLAGS_dbgscore) { 2339 fprintf(stderr, "<br>---text_bytes[%d] textlimit %d reached---<br>", 2340 total_text_bytes, textlimit); 2341 } 2342 break; 2343 } 2344 advance_by <<= 1; // Double advance bytes 2345 advance_limit <<= 1; // Double limit until next change 2346 spantooshortlimit <<= 1; // Double short-span size 2347 if (FLAGS_cld_html || FLAGS_dbgscore) { 2348 fprintf(stderr, "<br>---text_bytes[%d] advance_by doubled to %d---<br>", 2349 total_text_bytes, advance_by); 2350 } 2351 } 2352 } // End while (ss.GetOneScriptSpanLower()) 2353 2354 // Deallocate full-document prediction table 2355 delete[] predict_tbl; 2356 2357 // Flush pending totals 2358 for (int tote_num = 0; tote_num < 4; ++tote_num) { 2359 if (tote_seen[tote_num]) { 2360 ScoreChunkIntoDoc2(kToteName[tote_num], advance_by, 2361 tote_script[tote_num], &totes[tote_num], &doc_tote, 2362 tote_grams[tote_num], 
lang_hint_boost); 2363 } 2364 } 2365 2366 // If extended langauges are disallowed, remove them here 2367 if (!allow_extended_lang) { 2368 RemoveExtendedLanguages(&doc_tote); 2369 } 2370 2371 // Force close pairs to one or the other 2372 RefineScoredClosePairs(&doc_tote); 2373 2374 2375 // Calculate return results 2376 // Find top three byte counts in tote heap 2377 int reliable_percent3[3]; 2378 2379 2380 // Cannot use Add, etc. after sorting 2381 doc_tote.Sort(3); 2382 2383 ExtractLangEtc(&doc_tote, total_text_bytes, 2384 reliable_percent3, language3, percent3, normalized_score3, 2385 text_bytes, is_reliable); 2386 2387 bool have_good_answer = false; 2388 if (FlagFinish(flags)) { 2389 // Force a result 2390 have_good_answer = true; 2391 } else if (total_text_bytes <= kShortTextThresh) { 2392 // Don't recurse on short text -- we already did word scores 2393 have_good_answer = true; 2394 } else if (*is_reliable && 2395 (percent3[0] >= kGoodLang1Percent)) { 2396 have_good_answer = true; 2397 } else if (*is_reliable && 2398 ((percent3[0] + percent3[1]) >= kGoodLang1and2Percent)) { 2399 have_good_answer = true; 2400 } 2401 2402 2403 if (have_good_answer) { 2404 // This is the real, non-recursive return 2405 2406 // Move bytes for unreliable langs to another lang or UNKNOWN 2407 RemoveUnreliableLanguages(&doc_tote); 2408 2409 // Redo the result extraction after the removal above 2410 doc_tote.Sort(3); 2411 ExtractLangEtc(&doc_tote, total_text_bytes, 2412 reliable_percent3, language3, percent3, normalized_score3, 2413 text_bytes, is_reliable); 2414 2415 #if 0 2416 // OLD code, replaced by CalcSummaryLang 2417 // 2418 // Suppress ignore-me text, TG_UNKNOWN_LANGUAGE if 2nd or 3rd language 2419 // Force it to English if first language 2420 if (language3[2] == TG_UNKNOWN_LANGUAGE) { 2421 reliable_percent3[2] = 0; 2422 language3[2] = UNKNOWN_LANGUAGE; 2423 percent3[2] = 0; 2424 } else if (language3[1] == TG_UNKNOWN_LANGUAGE) { 2425 // Move up lower language 2426 
reliable_percent3[1] = reliable_percent3[2]; 2427 language3[1] = language3[2]; 2428 percent3[1] = percent3[2]; 2429 reliable_percent3[2] = 0; 2430 language3[2] = UNKNOWN_LANGUAGE; 2431 percent3[2] = 0; 2432 } else if (language3[0] == TG_UNKNOWN_LANGUAGE) { 2433 language3[0] = ENGLISH; 2434 } 2435 2436 if (language3[0] == UNKNOWN_LANGUAGE) { 2437 // Last-ditch test for some result, but it is UNKNOWN_LANGUAGE 2438 // Force it to English (should not happen) 2439 language3[0] = ENGLISH; 2440 percent3[0] = 100; 2441 *is_reliable = true; 2442 } 2443 #endif 2444 2445 2446 #if 0 2447 // Scaffolding to reveal subset sequence lang distribution across doc text 2448 // Track the sequence of language fragments [result currently unused] 2449 if (FLAGS_cld_html) { 2450 static const int kMaxSubsetSeq = 12; 2451 uint8 subseq[kMaxSubsetSeq]; 2452 doc_tote.ExtractSeq(kMaxSubsetSeq, subseq); 2453 2454 fprintf(stderr, "<br>\nSubset Sequence[%d]: ", kMaxSubsetSeq); 2455 for (int i = 0; i < kMaxSubsetSeq; ++i) { 2456 fprintf(stderr, "%s ", ExtLanguageCode(cld::UnpackLanguage(subseq[i]))); 2457 if ((i % 4) == 3) {fprintf(stderr, " ");} 2458 } 2459 fprintf(stderr, " "); 2460 2461 for (int i = 0; i < 3; ++i) { 2462 if (language3[i] != UNKNOWN_LANGUAGE) { 2463 fprintf(stderr, "%s.%d(%d%%) ", 2464 ExtLanguageCode(language3[i]), 2465 reliable_percent3[i], 2466 percent3[i]); 2467 } 2468 } 2469 2470 fprintf(stderr, "%d B ", total_text_bytes); 2471 fprintf(stderr, "<br>\n"); 2472 } 2473 // End Scaffolding to reveal subset sequence lang distribution 2474 #endif 2475 2476 Language summary_lang; 2477 CalcSummaryLang(&doc_tote, total_text_bytes, 2478 reliable_percent3, language3, percent3, 2479 &summary_lang, is_reliable); 2480 2481 if (FLAGS_cld_html) { 2482 for (int i = 0; i < 3; ++i) { 2483 if (language3[i] != UNKNOWN_LANGUAGE) { 2484 fprintf(stderr, "%s.%d(%d%%) ", 2485 ExtLanguageCode(language3[i]), 2486 reliable_percent3[i], 2487 percent3[i]); 2488 } 2489 } 2490 2491 fprintf(stderr, "%d B ", 
total_text_bytes); 2492 fprintf(stderr, "= %s%c ", 2493 ExtLanguageName(summary_lang), is_reliable ? ' ' : '*'); 2494 fprintf(stderr, "<br>\n"); 2495 } 2496 2497 return summary_lang; 2498 } 2499 2500 // Not a good answer -- do recursive call to refine 2501 if (FLAGS_cld_html || FLAGS_dbgscore) { 2502 // This is what we hope to improve on in the recursive call, if any 2503 PrintLangs(stderr, language3, percent3, text_bytes, is_reliable); 2504 } 2505 2506 // For restriction to Top40 + one, the one is 1st/2nd lang that is not Top40 2507 // For this purpose, we treate "Ignore" as top40 2508 Language new_plus_one = UNKNOWN_LANGUAGE; 2509 if (cld::kIsPackedTop40[cld::PackLanguage(language3[0])] == 0) { 2510 new_plus_one = language3[0]; 2511 } else if (cld::kIsPackedTop40[cld::PackLanguage(language3[1])] == 0) { 2512 new_plus_one = language3[1]; 2513 } 2514 2515 if (total_text_bytes < kShortTextThresh) { 2516 // Short text: Recursive call with top40 and short set 2517 if (FLAGS_cld_html || FLAGS_dbgscore) { 2518 fprintf(stderr, " ---text_bytes[%d] " 2519 "Recursive(Top40/Rep/Short/Words)---<br><br>\n", 2520 total_text_bytes); 2521 } 2522 return DetectLanguageSummaryV25( 2523 tables, 2524 buffer, 2525 buffer_length, 2526 is_plain_text, 2527 tld_hint, // "id" boosts Indonesian 2528 encoding_hint, // SJS boosts Japanese 2529 language_hint, // ITALIAN boosts it 2530 allow_extended_lang, 2531 flags | kCLDFlagTop40 | kCLDFlagRepeats | 2532 kCLDFlagShort | kCLDFlagUseWords | kCLDFlagFinish, 2533 new_plus_one, 2534 language3, 2535 percent3, 2536 normalized_score3, 2537 text_bytes, 2538 is_reliable); 2539 } 2540 2541 // Longer text: Recursive call with top40 set 2542 if (FLAGS_cld_html || FLAGS_dbgscore) { 2543 fprintf(stderr, 2544 " ---text_bytes[%d] Recursive(Top40/Rep)---<br><br>\n", 2545 total_text_bytes); 2546 } 2547 return DetectLanguageSummaryV25( 2548 tables, 2549 buffer, 2550 buffer_length, 2551 is_plain_text, 2552 tld_hint, // "id" boosts Indonesian 2553 encoding_hint, 
// SJS boosts Japanese 2554 language_hint, // ITALIAN boosts it 2555 allow_extended_lang, 2556 flags | kCLDFlagTop40 | kCLDFlagRepeats | 2557 kCLDFlagFinish, 2558 new_plus_one, 2559 language3, 2560 percent3, 2561 normalized_score3, 2562 text_bytes, 2563 is_reliable); 2564 } // End CompactLangDetImpl::DetectLanguageSummaryV25 2565