1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include <string> 6 #include "encodings/compact_lang_det/cldutil.h" 7 8 #include "base/basictypes.h" 9 #include "encodings/compact_lang_det/cldutil_dbg.h" 10 #include "encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h" 11 #include "encodings/compact_lang_det/utf8propletterscriptnum.h" 12 #include "encodings/compact_lang_det/win/cld_commandlineflags.h" 13 #include "encodings/compact_lang_det/win/cld_logging.h" 14 #include "encodings/compact_lang_det/win/cld_unilib.h" 15 #include "encodings/compact_lang_det/win/cld_utf.h" 16 #include "encodings/compact_lang_det/win/cld_utf8statetable.h" 17 18 // Runtime routines for hashing, looking up, and scoring 19 // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams. 20 // Unigrams and bigrams are for CJK languages only, including simplified/ 21 // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and 22 // Zhuang Han characters. Surrounding spaces are not considered. 23 // Quadgrams and octagrams for for non-CJK and include two bits indicating 24 // preceding and trailing spaces (word boundaries). 25 26 27 // Indicator bits for leading/trailing space around quad/octagram 28 // NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of 29 // 1-, 2-, or 3-bytes each. 30 static const uint32 kPreSpaceIndicator = 0x00004444; 31 static const uint32 kPostSpaceIndicator = 0x44440000; 32 33 // Little-endian masks for 0..24 bytes picked up as uint32's 34 static const uint32 kWordMask0[4] = { 35 0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF 36 }; 37 38 static const int kMinCJKUTF8CharBytes = 3; 39 40 static const int kMinGramCount = 3; 41 static const int kMaxGramCount = 16; 42 43 44 45 46 // Routines to access a hash table of <key:wordhash, value:probs> pairs 47 // Buckets have 4-byte wordhash for sizes < 32K buckets, but only 48 // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as 49 // bucket subscript. 50 // Probs is a packed: three languages plus a subscript for probability table 51 // Buckets have all the keys together, then all the values.Key array never 52 // crosses a cache-line boundary, so no-match case takes exactly one cache miss. 53 // Match case may sometimes take an additional cache miss on value access. 54 // 55 // Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64 56 // byte buckets with single cache miss. 57 // Or 2-byte key and 6-byte value, allowing 5 languages instead of three. 58 //------------------------------------------------------------------------------ 59 60 61 //------------------------------------------------------------------------------ 62 // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores 63 //------------------------------------------------------------------------------ 64 65 // Design principles for these hash functions 66 // - Few operations 67 // - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in 68 // Latin script expect 1- and 2-byte mixtures. 69 // - Last byte of each character has about 5 bits of information 70 // - Spread good bits around so they can interact in at least two ways 71 // with other characters 72 // - Use add for additional mixing thorugh carries 73 74 // CJK Three-byte bigram 75 // ....dddd..cccccc..bbbbbb....aaaa 76 // ..................ffffff..eeeeee 77 // make 78 // ....dddd..cccccc..bbbbbb....aaaa 79 // 000....dddd..cccccc..bbbbbb....a 80 // ..................ffffff..eeeeee 81 // ffffff..eeeeee000000000000000000 82 // 83 // CJK Four-byte bigram 84 // ..dddddd..cccccc....bbbb....aaaa 85 // ..hhhhhh..gggggg....ffff....eeee 86 // make 87 // ..dddddd..cccccc....bbbb....aaaa 88 // 000..dddddd..cccccc....bbbb....a 89 // ..hhhhhh..gggggg....ffff....eeee 90 // ..ffff....eeee000000000000000000 91 92 // BIGRAM 93 // Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post 94 // OVERSHOOTS up to 3 bytes 95 // For runtime use of tables 96 uint32 cld::BiHashV25(const char* word_ptr, int bytecount) { 97 if (bytecount == 0) { 98 return 0; 99 } 100 uint32 word0, word1; 101 if (bytecount <= 4) { 102 word0 = UnalignedLoad32(word_ptr) & kWordMask0[bytecount & 3]; 103 word0 = word0 ^ (word0 >> 3); 104 return word0; 105 } 106 // Else do 8 bytes 107 word0 = UnalignedLoad32(word_ptr); 108 word0 = word0 ^ (word0 >> 3); 109 word1 = UnalignedLoad32(word_ptr + 4) & kWordMask0[bytecount & 3]; 110 word1 = word1 ^ (word1 << 18); 111 return word0 + word1; 112 } 113 114 // 115 // Ascii-7 One-byte chars 116 // ...ddddd...ccccc...bbbbb...aaaaa 117 // make 118 // ...ddddd...ccccc...bbbbb...aaaaa 119 // 000...ddddd...ccccc...bbbbb...aa 120 // 121 // Latin 1- and 2-byte chars 122 // ...ddddd...ccccc...bbbbb...aaaaa 123 // ...................fffff...eeeee 124 // make 125 // ...ddddd...ccccc...bbbbb...aaaaa 126 // 000...ddddd...ccccc...bbbbb...aa 127 // ...................fffff...eeeee 128 // ...............fffff...eeeee0000 129 // 130 // Non-CJK Two-byte chars 131 // ...ddddd...........bbbbb........ 132 // ...hhhhh...........fffff........ 133 // make 134 // ...ddddd...........bbbbb........ 135 // 000...ddddd...........bbbbb..... 136 // ...hhhhh...........fffff........ 137 // hhhh...........fffff........0000 138 // 139 // Non-CJK Three-byte chars 140 // ...........ccccc................ 141 // ...................fffff........ 142 // ...lllll...................iiiii 143 // make 144 // ...........ccccc................ 145 // 000...........ccccc............. 146 // ...................fffff........ 147 // ...............fffff........0000 148 // ...lllll...................iiiii 149 // .lllll...................iiiii00 150 // 151 152 // QUADGRAM 153 // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add 154 // OVERSHOOTS up to 3 bytes 155 // For runtime use of tables 156 uint32 QuadHashV25Mix(const char* word_ptr, int bytecount, uint32 prepost) { 157 uint32 word0, word1, word2; 158 if (bytecount <= 4) { 159 word0 = UnalignedLoad32(word_ptr) & kWordMask0[bytecount & 3]; 160 word0 = word0 ^ (word0 >> 3); 161 return word0 ^ prepost; 162 } else if (bytecount <= 8) { 163 word0 = UnalignedLoad32(word_ptr); 164 word0 = word0 ^ (word0 >> 3); 165 word1 = UnalignedLoad32(word_ptr + 4) & kWordMask0[bytecount & 3]; 166 word1 = word1 ^ (word1 << 4); 167 return (word0 ^ prepost) + word1; 168 } 169 // else do 12 bytes 170 word0 = UnalignedLoad32(word_ptr); 171 word0 = word0 ^ (word0 >> 3); 172 word1 = UnalignedLoad32(word_ptr + 4); 173 word1 = word1 ^ (word1 << 4); 174 word2 = UnalignedLoad32(word_ptr + 8) & kWordMask0[bytecount & 3]; 175 word2 = word2 ^ (word2 << 2); 176 return (word0 ^ prepost) + word1 + word2; 177 } 178 179 180 // QUADGRAM wrapper with surrounding spaces 181 // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add 182 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes 183 // For runtime use of tables 184 uint32 cld::QuadHashV25(const char* word_ptr, int bytecount) { 185 if (bytecount == 0) { 186 return 0; 187 } 188 uint32 prepost = 0; 189 if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;} 190 if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;} 191 return QuadHashV25Mix(word_ptr, bytecount, prepost); 192 } 193 194 // QUADGRAM wrapper with surrounding underscores (offline use) 195 // Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add 196 // OVERSHOOTS up to 3 bytes 197 // For offline construction of tables 198 uint32 cld::QuadHashV25Underscore(const char* word_ptr, int bytecount) { 199 if (bytecount == 0) { 200 return 0; 201 } 202 const char* local_word_ptr = word_ptr; 203 int local_bytecount = bytecount; 204 uint32 prepost = 0; 205 if (local_word_ptr[0] == '_') { 206 prepost |= kPreSpaceIndicator; 207 ++local_word_ptr; 208 --local_bytecount; 209 } 210 if (local_word_ptr[local_bytecount - 1] == '_') { 211 prepost |= kPostSpaceIndicator; 212 --local_bytecount; 213 } 214 return QuadHashV25Mix(local_word_ptr, local_bytecount, prepost); 215 } 216 217 218 // OCTAGRAM 219 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add 220 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes 221 // 222 // The low 32 bits follow the pattern from above, tuned to different scripts 223 // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each 224 // For runtime use of tables V3 225 uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) { 226 uint64 word0; 227 uint64 word1; 228 uint64 sum; 229 230 if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;} 231 if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;} 232 switch ((bytecount - 1) >> 2) { 233 case 0: // 1..4 bytes 234 word0 = UnalignedLoad32(word_ptr) & kWordMask0[bytecount & 3]; 235 sum = word0; 236 word0 = word0 ^ (word0 >> 3); 237 break; 238 case 1: // 5..8 bytes 239 word0 = UnalignedLoad32(word_ptr); 240 sum = word0; 241 word0 = word0 ^ (word0 >> 3); 242 word1 = UnalignedLoad32(word_ptr + 4) & kWordMask0[bytecount & 3]; 243 sum += word1; 244 word1 = word1 ^ (word1 << 4); 245 word0 += word1; 246 break; 247 case 2: // 9..12 bytes 248 word0 = UnalignedLoad32(word_ptr); 249 sum = word0; 250 word0 = word0 ^ (word0 >> 3); 251 word1 = UnalignedLoad32(word_ptr + 4); 252 sum += word1; 253 word1 = word1 ^ (word1 << 4); 254 word0 += word1; 255 word1 = UnalignedLoad32(word_ptr + 8) & kWordMask0[bytecount & 3]; 256 sum += word1; 257 word1 = word1 ^ (word1 << 2); 258 word0 += word1; 259 break; 260 case 3: // 13..16 bytes 261 word0 = UnalignedLoad32(word_ptr); 262 sum = word0; 263 word0 = word0 ^ (word0 >> 3); 264 word1 = UnalignedLoad32(word_ptr + 4); 265 sum += word1; 266 word1 = word1 ^ (word1 << 4); 267 word0 += word1; 268 word1 = UnalignedLoad32(word_ptr + 8); 269 sum += word1; 270 word1 = word1 ^ (word1 << 2); 271 word0 += word1; 272 word1 = UnalignedLoad32(word_ptr + 12) & kWordMask0[bytecount & 3]; 273 sum += word1; 274 word1 = word1 ^ (word1 >> 8); 275 word0 += word1; 276 break; 277 case 4: // 17..20 bytes 278 word0 = UnalignedLoad32(word_ptr); 279 sum = word0; 280 word0 = word0 ^ (word0 >> 3); 281 word1 = UnalignedLoad32(word_ptr + 4); 282 sum += word1; 283 word1 = word1 ^ (word1 << 4); 284 word0 += word1; 285 word1 = UnalignedLoad32(word_ptr + 8); 286 sum += word1; 287 word1 = word1 ^ (word1 << 2); 288 word0 += word1; 289 word1 = UnalignedLoad32(word_ptr + 12); 290 sum += word1; 291 word1 = word1 ^ (word1 >> 8); 292 word0 += word1; 293 word1 = UnalignedLoad32(word_ptr + 16) & kWordMask0[bytecount & 3]; 294 sum += word1; 295 word1 = word1 ^ (word1 >> 4); 296 word0 += word1; 297 break; 298 default: // 21..24 bytes and higher (ignores beyond 24) 299 word0 = UnalignedLoad32(&word_ptr); 300 sum = word0; 301 word0 = word0 ^ (word0 >> 3); 302 word1 = UnalignedLoad32(word_ptr + 4); 303 sum += word1; 304 word1 = word1 ^ (word1 << 4); 305 word0 += word1; 306 word1 = UnalignedLoad32(word_ptr + 8); 307 sum += word1; 308 word1 = word1 ^ (word1 << 2); 309 word0 += word1; 310 word1 = UnalignedLoad32(word_ptr + 12); 311 sum += word1; 312 word1 = word1 ^ (word1 >> 8); 313 word0 += word1; 314 word1 = UnalignedLoad32(word_ptr + 16); 315 sum += word1; 316 word1 = word1 ^ (word1 >> 4); 317 word0 += word1; 318 word1 = UnalignedLoad32(word_ptr + 20) & kWordMask0[bytecount & 3]; 319 sum += word1; 320 word1 = word1 ^ (word1 >> 6); 321 word0 += word1; 322 break; 323 } 324 325 sum += (sum >> 17); // extra 1-bit shift for bytes 2 & 3 326 sum += (sum >> 9); // extra 1-bit shift for bytes 1 & 3 327 sum = (sum & 0xff) << 32; 328 return (word0 ^ prepost) + sum; 329 } 330 331 // OCTAGRAM wrapper with surrounding spaces 332 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add 333 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes 334 // 335 // The low 32 bits follow the pattern from above, tuned to different scripts 336 // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each 337 // For runtime use of tables V3 338 uint64 cld::OctaHash40(const char* word_ptr, int bytecount) { 339 if (bytecount == 0) { 340 return 0; 341 } 342 uint64 prepost = 0; 343 if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;} 344 if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;} 345 return OctaHash40Mix(word_ptr, bytecount, prepost); 346 } 347 348 349 // OCTAGRAM wrapper with surrounding underscores (offline use) 350 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add 351 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes 352 // 353 // The low 32 bits follow the pattern from above, tuned to different scripts 354 // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each 355 // For offline construction of tables 356 uint64 cld::OctaHash40underscore(const char* word_ptr, int bytecount) { 357 if (bytecount == 0) { 358 return 0; 359 } 360 const char* local_word_ptr = word_ptr; 361 int local_bytecount = bytecount; 362 uint64 prepost = 0; 363 if (local_word_ptr[0] == '_') { 364 prepost |= kPreSpaceIndicator; 365 ++local_word_ptr; 366 --local_bytecount; 367 } 368 if (local_word_ptr[local_bytecount - 1] == '_') { 369 prepost |= kPostSpaceIndicator; 370 --local_bytecount; 371 } 372 return OctaHash40Mix(local_word_ptr, local_bytecount, prepost); 373 } 374 375 376 377 378 //------------------------------------------------------------------------------ 379 // Scoring single groups of letters 380 //------------------------------------------------------------------------------ 381 382 // UNIGRAM score one => tote 383 // Input: 1-byte entry of subscript into unigram probs, plus 384 // an accumulator tote. 385 // Output: running sums in tote updated 386 void cld::ProcessProbV25UniTote(int propval, Tote* tote) { 387 tote->AddGram(); 388 const UnigramProbArray* pa = &kTargetCTJKVZProbs[propval]; 389 if (pa->probs[0] > 0) {tote->Add(cld::PackLanguage(CHINESE), pa->probs[0]);} 390 if (pa->probs[1] > 0) {tote->Add(cld::PackLanguage(CHINESE_T), pa->probs[1]);} 391 if (pa->probs[2] > 0) {tote->Add(cld::PackLanguage(JAPANESE), pa->probs[2]);} 392 if (pa->probs[3] > 0) {tote->Add(cld::PackLanguage(KOREAN), pa->probs[3]);} 393 if (pa->probs[4] > 0) {tote->Add(cld::PackLanguage(VIETNAMESE), pa->probs[4]);} 394 if (pa->probs[5] > 0) {tote->Add(cld::PackLanguage(ZHUANG), pa->probs[5]);} 395 } 396 397 // BIGRAM, QUADGRAM, OCTAGRAM score one => tote 398 // Input: 4-byte entry of 3 language numbers and one probability subscript, plus 399 // an accumulator tote. (language 0 means unused entry) 400 // Output: running sums in tote updated 401 void cld::ProcessProbV25Tote(uint32 probs, Tote* tote) { 402 tote->AddGram(); 403 uint8 prob123 = (probs >> 0) & 0xff; 404 const uint8* prob123_entry = cld::LgProb2TblEntry(prob123); 405 406 uint8 top1 = (probs >> 8) & 0xff; 407 if (top1 > 0) {tote->Add(top1, cld::LgProb3(prob123_entry, 0));} 408 uint8 top2 = (probs >> 16) & 0xff; 409 if (top2 > 0) {tote->Add(top2, cld::LgProb3(prob123_entry, 1));} 410 uint8 top3 = (probs >> 24) & 0xff; 411 if (top3 > 0) {tote->Add(top3, cld::LgProb3(prob123_entry, 2));} 412 } 413 414 415 //------------------------------------------------------------------------------ 416 // Routines to accumulate probabilities 417 //------------------------------------------------------------------------------ 418 419 420 // UNIGRAM, using UTF-8 property table, advancing by 1/2/4/8 chars 421 // Caller supplies table, such as compact_lang_det_generated_ctjkvz_b1_obj 422 // Score up to n unigrams, returning number of bytes consumed 423 // Updates tote_grams 424 int cld::DoUniScoreV3(const UTF8PropObj* unigram_obj, 425 const char* isrc, int srclen, int advance_by, 426 int* tote_grams, int gram_limit, Tote* chunk_tote) { 427 const char* src = isrc; 428 if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);} 429 430 // Property-based CJK unigram lookup 431 if (src[0] == ' ') {++src; --srclen;} 432 433 const uint8* usrc = reinterpret_cast<const uint8*>(src); 434 int usrclen = srclen; 435 436 while (usrclen > 0) { 437 int len = kAdvanceOneChar[usrc[0]]; 438 // Look up property of one UTF-8 character and advance over it 439 // Return 0 if input length is zero 440 // Return 0 and advance one byte if input is ill-formed 441 442 int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &usrclen); 443 444 if (FLAGS_dbglookup) { 445 DbgUniTermToStderr(propval, usrc, len); 446 } 447 448 if (propval > 0) { 449 ProcessProbV25UniTote(propval, chunk_tote); 450 ++(*tote_grams); 451 if (FLAGS_dbgscore) {DbgScoreRecordUni((const char*)usrc, propval, len);} 452 } 453 454 // Advance by 1/2/4/8 characters (half of quad advance) 455 if (advance_by == 2) { 456 // Already advanced by 1 457 } else if (advance_by == 4) { 458 // Advance by 2 chars total, if not at end 459 if (UTFmax <= usrclen) { 460 int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n; 461 } 462 } else if (advance_by == 8) { 463 // Advance by 4 chars total, if not at end 464 if ((UTFmax * 3) <= usrclen) { 465 int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n; 466 n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n; 467 n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n; 468 } 469 } else { 470 // Advance by 8 chars total, if not at end 471 if ((UTFmax * 7) <= usrclen) { 472 int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n; 473 n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n; 474 n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n; 475 n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n; 476 n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n; 477 n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n; 478 n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n; 479 } 480 } 481 DCHECK(usrclen >= 0); 482 483 if (*tote_grams >= gram_limit) { 484 break; 485 } 486 } 487 if (FLAGS_dbgscore) { 488 // With advance_by>2, we consume more input to get the same number of quads 489 int len = src - isrc; 490 DbgScoreTop(src, (len * 2) / advance_by, chunk_tote); 491 DbgScoreFlush(); 492 } 493 494 int consumed2 = reinterpret_cast<const char*>(usrc) - isrc; 495 return consumed2; 496 } 497 498 499 // BIGRAM, using hash table, always advancing by 1 char 500 // Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj 501 // Score all bigrams in isrc, using languages that have bigrams (CJK) 502 // Return number of bigrams that hit in the hash table 503 int cld::DoBigramScoreV3(const cld::CLDTableSummary* bigram_obj, 504 const char* isrc, int srclen, Tote* chunk_tote) { 505 int hit_count = 0; 506 const char* src = isrc; 507 508 // Hashtable-based CJK bigram lookup 509 const uint8* usrc = reinterpret_cast<const uint8*>(src); 510 const uint8* usrclimit1 = usrc + srclen - UTFmax; 511 if (FLAGS_dbgscore) { 512 fprintf(stderr, " " ); 513 } 514 515 while (usrc < usrclimit1) { 516 int len = kAdvanceOneChar[usrc[0]]; 517 int len2 = kAdvanceOneChar[usrc[len]] + len; 518 519 if ((kMinCJKUTF8CharBytes * 2) <= len2) { // Two CJK chars possible 520 // Lookup and score this bigram 521 // Always ignore pre/post spaces 522 uint32 bihash = BiHashV25(reinterpret_cast<const char*>(usrc), len2); 523 uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash); 524 // Now go indirect on the subscript 525 probs = bigram_obj->kCLDTableInd[probs & 526 ~bigram_obj->kCLDTableKeyMask]; 527 528 // Process the bigram 529 if (FLAGS_dbglookup) { 530 const char* ssrc = reinterpret_cast<const char*>(usrc); 531 DbgBiTermToStderr(bihash, probs, ssrc, len2); 532 DbgScoreRecord(NULL, probs, len2); 533 } else if (FLAGS_dbgscore && (probs != 0)) { 534 const char* ssrc = reinterpret_cast<const char*>(usrc); 535 DbgScoreRecord(NULL, probs, len2); 536 string temp(ssrc, len2); 537 fprintf(stderr, "%s ", temp.c_str()); 538 } 539 540 if (probs != 0) { 541 ProcessProbV25Tote(probs, chunk_tote); 542 ++hit_count; 543 } 544 } 545 usrc += len; // Advance by one char 546 } 547 548 if (FLAGS_dbgscore) { 549 fprintf(stderr, "[%d bigrams scored]\n", hit_count); 550 DbgScoreState(); 551 } 552 return hit_count; 553 } 554 555 556 557 // QUADGRAM, using hash table, advancing by 2/4/8/16 chars 558 // Caller supplies table, such as &kQuadTable_obj or &kGibberishTable_obj 559 // Score up to n quadgrams, returning number of bytes consumed 560 // Updates tote_grams 561 int cld::DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj, 562 const char* isrc, int srclen, int advance_by, 563 int* tote_grams, int gram_limit, Tote* chunk_tote) { 564 const char* src = isrc; 565 const char* srclimit = src + srclen; 566 // Limit is end, which has extra 20 20 20 00 past len 567 const char* srclimit7 = src + srclen - (UTFmax * 7); 568 const char* srclimit15 = src + srclen - (UTFmax * 15); 569 570 if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);} 571 572 // Run a little cache of last hits to catch overly-repetitive "text" 573 int next_prior = 0; 574 uint32 prior_quads[2] = {0, 0}; 575 576 // Visit all quadgrams 577 if (src[0] == ' ') {++src;} 578 while (src < srclimit) { 579 // Find one quadgram 580 const char* src_end = src; 581 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; 582 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; 583 const char* src_mid = src_end; 584 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; 585 src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]]; 586 int len = src_end - src; 587 588 // Lookup and score this quadgram 589 uint32 quadhash = QuadHashV25(src, len); 590 uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash); 591 // Now go indirect on the subscript 592 probs = quadgram_obj->kCLDTableInd[probs & 593 ~quadgram_obj->kCLDTableKeyMask]; 594 595 // Process the quadgram 596 if (FLAGS_dbglookup) { 597 DbgQuadTermToStderr(quadhash, probs, src, len); 598 } 599 if (probs != 0) { 600 // Filter out recent repeats. If this works out, use in the other lookups 601 if ((quadhash != prior_quads[0]) && (quadhash != prior_quads[1])) { 602 prior_quads[next_prior] = quadhash; 603 next_prior = (next_prior + 1) & 1; 604 ProcessProbV25Tote(probs, chunk_tote); 605 ++(*tote_grams); 606 if (FLAGS_dbgscore) {DbgScoreRecord(src, probs, len);} 607 } 608 } 609 610 // Advance all the way past word if at end-of-word 611 if (src_end[0] == ' ') { 612 src_mid = src_end; 613 } 614 615 // Advance by 2/4/8/16 characters 616 if (advance_by == 2) { 617 src = src_mid; 618 } else if (advance_by == 4) { 619 src = src_end; 620 } else if (advance_by == 8) { 621 // Advance by 8 chars total (4 more), if not at end 622 if (src < srclimit7) { 623 src_end += kAdvanceOneChar[(uint8)src_end[0]]; 624 src_end += kAdvanceOneChar[(uint8)src_end[0]]; 625 src_end += kAdvanceOneChar[(uint8)src_end[0]]; 626 src_end += kAdvanceOneChar[(uint8)src_end[0]]; 627 } 628 src = src_end; 629 } else { 630 // Advance by 16 chars total (12 more), if not at end 631 if (src < srclimit15) { 632 // Advance by ~16 chars by adding 3 * current bytelen 633 int fourcharlen = src_end - src; 634 src = src_end + (3 * fourcharlen); 635 // Advance a bit more if mid-character 636 src += kAdvanceOneCharSpaceVowel[(uint8)src[0]]; 637 src += kAdvanceOneCharSpaceVowel[(uint8)src[0]]; 638 } else { 639 src = src_end; 640 } 641 } 642 DCHECK(src < srclimit); 643 src += kAdvanceOneCharSpaceVowel[(uint8)src[0]]; 644 645 if (*tote_grams >= gram_limit) { 646 break; 647 } 648 } 649 650 if (FLAGS_dbgscore) { 651 // With advance_by>2, we consume more input to get the same number of quads 652 int len = src - isrc; 653 DbgScoreTop(src, (len * 2) / advance_by, chunk_tote); 654 DbgScoreFlush(); 655 } 656 657 int consumed = src - isrc; 658 659 // If advancing by more than 2, src may have overshot srclimit 660 if (consumed > srclen) { 661 consumed = srclen; 662 } 663 664 return consumed; 665 } 666 667 668 // OCTAGRAM, using hash table, always advancing by 1 word 669 // Caller supplies table, such as &kLongWord8Table_obj 670 // Score all words in isrc, using languages that have quadgrams 671 // We don't normally use this routine except on the first quadgram run, 672 // but it can be used to resolve unreliable pages. 673 // This routine does not have an optimized advance_by 674 // SOON: Uses indirect language/probability longword 675 // 676 // Return number of words that hit in the hash table 677 int cld::DoOctaScoreV3(const cld::CLDTableSummary* octagram_obj, 678 const char* isrc, int srclen, Tote* chunk_tote) { 679 int hit_count = 0; 680 const char* src = isrc; 681 const char* srclimit = src + srclen + 1; 682 // Limit is end+1, to include extra space char (0x20) off the end 683 // 684 // Score all words truncated to 8 characters 685 int charcount = 0; 686 // Skip any initial space 687 if (src[0] == ' ') {++src;} 688 const char* word_ptr = src; 689 const char* word_end = word_ptr; 690 if (FLAGS_dbgscore) { 691 fprintf(stderr, " " ); 692 } 693 while (src < srclimit) { 694 // Terminate previous word or continue current word 695 if (src[0] == ' ') { 696 int bytecount = word_end - word_ptr; 697 if (bytecount == 0) 698 break; 699 // Lookup and score this word 700 uint64 wordhash40 = OctaHash40(word_ptr, bytecount); 701 uint32 probs = OctaHashV3Lookup4(octagram_obj, wordhash40); 702 // Now go indirect on the subscript 703 probs = octagram_obj->kCLDTableInd[probs & 704 ~octagram_obj->kCLDTableKeyMask]; 705 706 // // Lookup and score this word 707 // uint32 wordhash = QuadHashV25(word_ptr, bytecount); 708 // uint32 probs = WordHashLookup4(wordhash, kLongWord8Table, 709 // kLongWord8TableSize); 710 // 711 if (FLAGS_dbglookup) { 712 DbgWordTermToStderr(wordhash40, probs, word_ptr, bytecount); 713 DbgScoreRecord(NULL, probs, bytecount); 714 } else if (FLAGS_dbgscore && (probs != 0)) { 715 DbgScoreRecord(NULL, probs, bytecount); 716 string temp(word_ptr, bytecount); 717 fprintf(stderr, "%s ", temp.c_str()); 718 } 719 720 if (probs != 0) { 721 ProcessProbV25Tote(probs, chunk_tote); 722 ++hit_count; 723 } 724 charcount = 0; 725 word_ptr = src + 1; // Over the space 726 word_end = word_ptr; 727 } else { 728 ++charcount; 729 } 730 731 // Advance to next char 732 src += cld_UniLib::OneCharLen(src); 733 if (charcount <= 8) { 734 word_end = src; 735 } 736 } 737 738 if (FLAGS_dbgscore) { 739 fprintf(stderr, "[%d words scored]\n", hit_count); 740 DbgScoreState(); 741 } 742 return hit_count; 743 } 744 745 746 747 //------------------------------------------------------------------------------ 748 // Reliability calculations, for single language and between languages 749 //------------------------------------------------------------------------------ 750 751 // Return reliablity of result 0..100 for top two scores 752 // delta==0 is 0% reliable, delta==fully_reliable_thresh is 100% reliable 753 // (on a scale where +1 is a factor of 2 ** 1.6 = 3.02) 754 // Threshold is uni/quadgram increment count, bounded above and below. 755 // 756 // Requiring a factor of 3 improvement (e.g. +1 log base 3) 757 // for each scored quadgram is too stringent, so I've backed this off to a 758 // factor of 2 (e.g. +5/8 log base 3). 759 // 760 // I also somewhat lowered the Min/MaxGramCount limits above 761 // 762 // Added: if fewer than 8 quads/unis, max reliability is 12*n percent 763 // 764 int cld::ReliabilityDelta(int value1, int value2, int gramcount) { 765 int max_reliability_percent = 100; 766 if (gramcount < 8) { 767 max_reliability_percent = 12 * gramcount; 768 } 769 int fully_reliable_thresh = (gramcount * 5) >> 3; // see note above 770 if (fully_reliable_thresh < kMinGramCount) { // Fully = 3..16 771 fully_reliable_thresh = kMinGramCount; 772 } else if (fully_reliable_thresh > kMaxGramCount) { 773 fully_reliable_thresh = kMaxGramCount; 774 } 775 776 int delta = value1 - value2; 777 if (delta >= fully_reliable_thresh) {return max_reliability_percent;} 778 if (delta <= 0) {return 0;} 779 return cld::minint(max_reliability_percent, 780 (100 * delta) / fully_reliable_thresh); 781 } 782 783 // Return reliablity of result 0..100 for top score vs. mainsteam score 784 // Values are score per 1024 bytes of input 785 // ratio = max(top/mainstream, mainstream/top) 786 // ratio > 4.0 is 0% reliable, <= 2.0 is 100% reliable 787 // Change: short-text word scoring can give unusually good results. 788 // Let top exceed mainstream by 4x at 50% reliable 789 int cld::ReliabilityMainstream(int topscore, int len, int mean_score) { 790 if (mean_score == 0) {return 100;} // No reliability data available yet 791 if (topscore == 0) {return 0;} // zero score = unreliable 792 if (len == 0) {return 0;} // zero len = unreliable 793 int top_kb = (topscore << 10) / len; 794 double ratio; 795 double ratio_cutoff; 796 if (top_kb > mean_score) { 797 ratio = (1.0 * top_kb) / mean_score; 798 ratio_cutoff = 5.0; // ramp down from 100% to 0%: 3.0-5.0 799 } else { 800 ratio = (1.0 * mean_score) / top_kb; 801 ratio_cutoff = 4.0; // ramp down from 100% to 0%: 2.0-4.0 802 } 803 if (ratio <= ratio_cutoff - 2.0) {return 100;} 804 if (ratio > ratio_cutoff) {return 0;} 805 806 int iratio = static_cast<int>(100 * (ratio_cutoff - ratio) / 2.0); 807 return iratio; 808 } 809 810 // Calculate ratio of score per 1KB vs. expected score per 1KB 811 double cld::GetNormalizedScore(Language lang, UnicodeLScript lscript, 812 int bytes, int score) { 813 // Average training-data score for this language-script combo, per 1KB 814 int expected_score = kMeanScore[lang * 4 + LScript4(lscript)]; 815 if (lscript == ULScript_Common) { 816 // We don't know the script (only happens with second-chance score) 817 // Look for first non-zero mean value 818 for (int i = 2; i >= 0; --i) { 819 if (kMeanScore[lang * 4 + i] > 0) { 820 expected_score = kMeanScore[lang * 4 + i]; 821 break; 822 } 823 } 824 } 825 if (expected_score < 100) { 826 expected_score = 1000; 827 } 828 829 // Our score per 1KB 830 double our_score = (score << 10) / (bytes ? bytes : 1); // Avoid zdiv 831 double ratio = our_score / expected_score; 832 833 // Just the raw count normalized as though each language has mean=1000; 834 ratio = (score * 1000.0) / expected_score; 835 return ratio; 836 } 837 838 // Calculate reliablity of len bytes of script lscript with chunk_tote 839 int cld::GetReliability(int len, UnicodeLScript lscript, 840 const Tote* chunk_tote) { 841 Language cur_lang = UnpackLanguage(chunk_tote->Key(0)); 842 // Average score for this language-script combo 843 int mean_score = kMeanScore[cur_lang * 4 + LScript4(lscript)]; 844 if (lscript == ULScript_Common) { 845 // We don't know the script (only happens with second-chance score) 846 // Look for first non-zero mean value 847 for (int i = 2; i >= 0; --i) { 848 if (kMeanScore[cur_lang * 4 + i] > 0) { 849 mean_score = kMeanScore[cur_lang * 4 + i]; 850 break; 851 } 852 } 853 } 854 int reliability_delta = ReliabilityDelta(chunk_tote->Value(0), 855 chunk_tote->Value(1), 856 chunk_tote->GetGramCount()); 857 858 int reliability_main = ReliabilityMainstream(chunk_tote->Value(0), 859 len, 860 mean_score); 861 862 int reliability_min = minint(reliability_delta, reliability_main); 863 864 865 if (FLAGS_dbgreli) { 866 char temp1[4]; 867 char temp2[4]; 868 cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(0)), temp1); 869 if (temp1[2] == ' ') {temp1[2] = '\0';} 870 cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(1)), temp2); 871 if (temp2[2] == ' ') {temp2[2] = '\0';} 872 int srclen = len; 873 fprintf(stderr, "CALC GetReliability gram=%d incr=%d srclen=%d, %s=%d %s=%d " 874 "top/KB=%d mean/KB=%d del=%d%% reli=%d%% " 875 "lang/lscript %d %d\n", 876 chunk_tote->GetGramCount(), 877 chunk_tote->GetIncrCount(), 878 srclen, 879 temp1, chunk_tote->Value(0), 880 temp2, chunk_tote->Value(1), 881 (chunk_tote->Value(0) << 10) / (srclen ? srclen : 1), 882 mean_score, 883 reliability_delta, 884 reliability_main, 885 cur_lang, lscript); 886 } 887 888 return reliability_min; 889 } 890 891 892 //------------------------------------------------------------------------------ 893 // Miscellaneous 894 //------------------------------------------------------------------------------ 895 896 // Demote all languages except Top40 and plus_one 897 // Do this just before sorting chunk_tote results 898 void cld::DemoteNotTop40(Tote* chunk_tote, int packed_plus_one) { 899 for (int sub = 0; sub < chunk_tote->MaxSize(); ++sub) { 900 if (chunk_tote->Key(sub) == 0) continue; 901 if (chunk_tote->Key(sub) == packed_plus_one) continue; 902 if (kIsPackedTop40[chunk_tote->Key(sub)]) continue; 903 // Quarter the score of others 904 chunk_tote->SetValue(sub, chunk_tote->Value(sub) >> 2); 905 } 906 } 907