Home | History | Annotate | Download | only in compact_lang_det
      1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include <string>
      6 #include "encodings/compact_lang_det/cldutil.h"
      7 
      8 #include "base/basictypes.h"
      9 #include "encodings/compact_lang_det/cldutil_dbg.h"
     10 #include "encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h"
     11 #include "encodings/compact_lang_det/utf8propletterscriptnum.h"
     12 #include "encodings/compact_lang_det/win/cld_commandlineflags.h"
     13 #include "encodings/compact_lang_det/win/cld_logging.h"
     14 #include "encodings/compact_lang_det/win/cld_unilib.h"
     15 #include "encodings/compact_lang_det/win/cld_utf.h"
     16 #include "encodings/compact_lang_det/win/cld_utf8statetable.h"
     17 
     18 // Runtime routines for hashing, looking up, and scoring
     19 // unigrams (CJK), bigrams (CJK), quadgrams, and octagrams.
     20 // Unigrams and bigrams are for CJK languages only, including simplified/
     21 // traditional Chinese, Japanese, Korean, Vietnamese Han characters, and
     22 // Zhuang Han characters. Surrounding spaces are not considered.
     23 // Quadgrams and octagrams for for non-CJK and include two bits indicating
     24 // preceding and trailing spaces (word boundaries).
     25 
     26 
     27 // Indicator bits for leading/trailing space around quad/octagram
     28 // NOTE: 4444 bits are chosen to flip constant bits in hash of four chars of
     29 // 1-, 2-, or 3-bytes each.
     30 static const uint32 kPreSpaceIndicator =  0x00004444;
     31 static const uint32 kPostSpaceIndicator = 0x44440000;
     32 
     33 // Little-endian masks for 0..24 bytes picked up as uint32's
     34 static const uint32 kWordMask0[4] = {
     35   0xFFFFFFFF, 0x000000FF, 0x0000FFFF, 0x00FFFFFF
     36 };
     37 
     38 static const int kMinCJKUTF8CharBytes = 3;
     39 
     40 static const int kMinGramCount = 3;
     41 static const int kMaxGramCount = 16;
     42 
     43 
     44 
     45 
     46 // Routines to access a hash table of <key:wordhash, value:probs> pairs
     47 // Buckets have 4-byte wordhash for sizes < 32K buckets, but only
     48 // 2-byte wordhash for sizes >= 32K buckets, with other wordhash bits used as
     49 // bucket subscript.
     50 // Probs is a packed: three languages plus a subscript for probability table
     51 // Buckets have all the keys together, then all the values.Key array never
     52 // crosses a cache-line boundary, so no-match case takes exactly one cache miss.
     53 // Match case may sometimes take an additional cache miss on value access.
     54 //
     55 // Other possibilites include 5 or 10 6-byte entries plus pad to make 32 or 64
     56 // byte buckets with single cache miss.
     57 // Or 2-byte key and 6-byte value, allowing 5 languages instead  of three.
     58 //------------------------------------------------------------------------------
     59 
     60 
     61 //------------------------------------------------------------------------------
     62 // Hashing groups of 1/2/4/8 letters, perhaps with spaces or underscores
     63 //------------------------------------------------------------------------------
     64 
     65 // Design principles for these hash functions
     66 // - Few operations
     67 // - Handle 1-, 2-, and 3-byte UTF-8 scripts, ignoring intermixing except in
     68 //   Latin script expect 1- and 2-byte mixtures.
     69 // - Last byte of each character has about 5 bits of information
     70 // - Spread good bits around so they can interact in at least two ways
     71 //   with other characters
     72 // - Use add for additional mixing thorugh carries
     73 
     74 // CJK Three-byte bigram
     75 //   ....dddd..cccccc..bbbbbb....aaaa
     76 //   ..................ffffff..eeeeee
     77 // make
     78 //   ....dddd..cccccc..bbbbbb....aaaa
     79 //   000....dddd..cccccc..bbbbbb....a
     80 //   ..................ffffff..eeeeee
     81 //   ffffff..eeeeee000000000000000000
     82 //
     83 // CJK Four-byte bigram
     84 //   ..dddddd..cccccc....bbbb....aaaa
     85 //   ..hhhhhh..gggggg....ffff....eeee
     86 // make
     87 //   ..dddddd..cccccc....bbbb....aaaa
     88 //   000..dddddd..cccccc....bbbb....a
     89 //   ..hhhhhh..gggggg....ffff....eeee
     90 //   ..ffff....eeee000000000000000000
     91 
     92 // BIGRAM
     93 // Pick up 1..8 bytes and hash them via mask/shift/add. NO pre/post
     94 // OVERSHOOTS up to 3 bytes
     95 // For runtime use of tables
     96 uint32 cld::BiHashV25(const char* word_ptr, int bytecount) {
     97   if (bytecount == 0) {
     98     return 0;
     99   }
    100   uint32 word0, word1;
    101   if (bytecount <= 4) {
    102     word0 = UnalignedLoad32(word_ptr) & kWordMask0[bytecount & 3];
    103     word0 = word0 ^ (word0 >> 3);
    104     return word0;
    105   }
    106   // Else do 8 bytes
    107   word0 = UnalignedLoad32(word_ptr);
    108   word0 = word0 ^ (word0 >> 3);
    109   word1 = UnalignedLoad32(word_ptr + 4) & kWordMask0[bytecount & 3];
    110   word1 = word1 ^ (word1 << 18);
    111   return word0 + word1;
    112 }
    113 
    114 //
    115 // Ascii-7 One-byte chars
    116 //   ...ddddd...ccccc...bbbbb...aaaaa
    117 // make
    118 //   ...ddddd...ccccc...bbbbb...aaaaa
    119 //   000...ddddd...ccccc...bbbbb...aa
    120 //
    121 // Latin 1- and 2-byte chars
    122 //   ...ddddd...ccccc...bbbbb...aaaaa
    123 //   ...................fffff...eeeee
    124 // make
    125 //   ...ddddd...ccccc...bbbbb...aaaaa
    126 //   000...ddddd...ccccc...bbbbb...aa
    127 //   ...................fffff...eeeee
    128 //   ...............fffff...eeeee0000
    129 //
    130 // Non-CJK Two-byte chars
    131 //   ...ddddd...........bbbbb........
    132 //   ...hhhhh...........fffff........
    133 // make
    134 //   ...ddddd...........bbbbb........
    135 //   000...ddddd...........bbbbb.....
    136 //   ...hhhhh...........fffff........
    137 //   hhhh...........fffff........0000
    138 //
    139 // Non-CJK Three-byte chars
    140 //   ...........ccccc................
    141 //   ...................fffff........
    142 //   ...lllll...................iiiii
    143 // make
    144 //   ...........ccccc................
    145 //   000...........ccccc.............
    146 //   ...................fffff........
    147 //   ...............fffff........0000
    148 //   ...lllll...................iiiii
    149 //   .lllll...................iiiii00
    150 //
    151 
    152 // QUADGRAM
    153 // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
    154 // OVERSHOOTS up to 3 bytes
    155 // For runtime use of tables
    156 uint32 QuadHashV25Mix(const char* word_ptr, int bytecount, uint32 prepost) {
    157   uint32 word0, word1, word2;
    158   if (bytecount <= 4) {
    159     word0 = UnalignedLoad32(word_ptr) & kWordMask0[bytecount & 3];
    160     word0 = word0 ^ (word0 >> 3);
    161     return word0 ^ prepost;
    162   } else if (bytecount <= 8) {
    163     word0 = UnalignedLoad32(word_ptr);
    164     word0 = word0 ^ (word0 >> 3);
    165     word1 = UnalignedLoad32(word_ptr + 4) & kWordMask0[bytecount & 3];
    166     word1 = word1 ^ (word1 << 4);
    167     return (word0 ^ prepost) + word1;
    168   }
    169   // else do 12 bytes
    170   word0 = UnalignedLoad32(word_ptr);
    171   word0 = word0 ^ (word0 >> 3);
    172   word1 = UnalignedLoad32(word_ptr + 4);
    173   word1 = word1 ^ (word1 << 4);
    174   word2 = UnalignedLoad32(word_ptr + 8) & kWordMask0[bytecount & 3];
    175   word2 = word2 ^ (word2 << 2);
    176   return (word0 ^ prepost) + word1 + word2;
    177 }
    178 
    179 
    180 // QUADGRAM wrapper with surrounding spaces
    181 // Pick up 1..12 bytes plus pre/post space and hash them via mask/shift/add
    182 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
    183 // For runtime use of tables
    184 uint32 cld::QuadHashV25(const char* word_ptr, int bytecount) {
    185   if (bytecount == 0) {
    186     return 0;
    187   }
    188   uint32 prepost = 0;
    189   if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
    190   if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
    191   return QuadHashV25Mix(word_ptr, bytecount, prepost);
    192 }
    193 
    194 // QUADGRAM wrapper with surrounding underscores (offline use)
    195 // Pick up 1..12 bytes plus pre/post '_' and hash them via mask/shift/add
    196 // OVERSHOOTS up to 3 bytes
    197 // For offline construction of tables
    198 uint32 cld::QuadHashV25Underscore(const char* word_ptr, int bytecount) {
    199   if (bytecount == 0) {
    200     return 0;
    201   }
    202   const char* local_word_ptr = word_ptr;
    203   int local_bytecount = bytecount;
    204   uint32 prepost = 0;
    205   if (local_word_ptr[0] == '_') {
    206     prepost |= kPreSpaceIndicator;
    207     ++local_word_ptr;
    208     --local_bytecount;
    209   }
    210   if (local_word_ptr[local_bytecount - 1] == '_') {
    211     prepost |= kPostSpaceIndicator;
    212     --local_bytecount;
    213   }
    214   return QuadHashV25Mix(local_word_ptr, local_bytecount, prepost);
    215 }
    216 
    217 
    218 // OCTAGRAM
    219 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
    220 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
    221 //
    222 // The low 32 bits follow the pattern from above, tuned to different scripts
    223 // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
    224 // For runtime use of tables V3
    225 uint64 OctaHash40Mix(const char* word_ptr, int bytecount, uint64 prepost) {
    226   uint64 word0;
    227   uint64 word1;
    228   uint64 sum;
    229 
    230   if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
    231   if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
    232   switch ((bytecount - 1) >> 2) {
    233   case 0:       // 1..4 bytes
    234     word0 = UnalignedLoad32(word_ptr) & kWordMask0[bytecount & 3];
    235     sum = word0;
    236     word0 = word0 ^ (word0 >> 3);
    237     break;
    238   case 1:       // 5..8 bytes
    239     word0 = UnalignedLoad32(word_ptr);
    240     sum = word0;
    241     word0 = word0 ^ (word0 >> 3);
    242     word1 = UnalignedLoad32(word_ptr + 4) & kWordMask0[bytecount & 3];
    243     sum += word1;
    244     word1 = word1 ^ (word1 << 4);
    245     word0 += word1;
    246     break;
    247   case 2:       // 9..12 bytes
    248     word0 = UnalignedLoad32(word_ptr);
    249     sum = word0;
    250     word0 = word0 ^ (word0 >> 3);
    251     word1 = UnalignedLoad32(word_ptr + 4);
    252     sum += word1;
    253     word1 = word1 ^ (word1 << 4);
    254     word0 += word1;
    255     word1 = UnalignedLoad32(word_ptr + 8) & kWordMask0[bytecount & 3];
    256     sum += word1;
    257     word1 = word1 ^ (word1 << 2);
    258     word0 += word1;
    259     break;
    260   case 3:       // 13..16 bytes
    261     word0 = UnalignedLoad32(word_ptr);
    262     sum = word0;
    263     word0 = word0 ^ (word0 >> 3);
    264     word1 = UnalignedLoad32(word_ptr + 4);
    265     sum += word1;
    266     word1 = word1 ^ (word1 << 4);
    267     word0 += word1;
    268     word1 = UnalignedLoad32(word_ptr + 8);
    269     sum += word1;
    270     word1 = word1 ^ (word1 << 2);
    271     word0 += word1;
    272     word1 = UnalignedLoad32(word_ptr + 12) & kWordMask0[bytecount & 3];
    273     sum += word1;
    274     word1 = word1 ^ (word1 >> 8);
    275     word0 += word1;
    276     break;
    277   case 4:       // 17..20 bytes
    278     word0 = UnalignedLoad32(word_ptr);
    279     sum = word0;
    280     word0 = word0 ^ (word0 >> 3);
    281     word1 = UnalignedLoad32(word_ptr + 4);
    282     sum += word1;
    283     word1 = word1 ^ (word1 << 4);
    284     word0 += word1;
    285     word1 = UnalignedLoad32(word_ptr + 8);
    286     sum += word1;
    287     word1 = word1 ^ (word1 << 2);
    288     word0 += word1;
    289     word1 = UnalignedLoad32(word_ptr + 12);
    290     sum += word1;
    291     word1 = word1 ^ (word1 >> 8);
    292     word0 += word1;
    293     word1 = UnalignedLoad32(word_ptr + 16) & kWordMask0[bytecount & 3];
    294     sum += word1;
    295     word1 = word1 ^ (word1 >> 4);
    296     word0 += word1;
    297     break;
    298   default:      // 21..24 bytes and higher (ignores beyond 24)
    299     word0 = UnalignedLoad32(&word_ptr);
    300     sum = word0;
    301     word0 = word0 ^ (word0 >> 3);
    302     word1 = UnalignedLoad32(word_ptr + 4);
    303     sum += word1;
    304     word1 = word1 ^ (word1 << 4);
    305     word0 += word1;
    306     word1 = UnalignedLoad32(word_ptr + 8);
    307     sum += word1;
    308     word1 = word1 ^ (word1 << 2);
    309     word0 += word1;
    310     word1 = UnalignedLoad32(word_ptr + 12);
    311     sum += word1;
    312     word1 = word1 ^ (word1 >> 8);
    313     word0 += word1;
    314     word1 = UnalignedLoad32(word_ptr + 16);
    315     sum += word1;
    316     word1 = word1 ^ (word1 >> 4);
    317     word0 += word1;
    318     word1 = UnalignedLoad32(word_ptr + 20) & kWordMask0[bytecount & 3];
    319     sum += word1;
    320     word1 = word1 ^ (word1 >> 6);
    321     word0 += word1;
    322     break;
    323   }
    324 
    325   sum += (sum >> 17);             // extra 1-bit shift for bytes 2 & 3
    326   sum += (sum >> 9);              // extra 1-bit shift for bytes 1 & 3
    327   sum = (sum & 0xff) << 32;
    328   return (word0 ^ prepost) + sum;
    329 }
    330 
    331 // OCTAGRAM wrapper with surrounding spaces
    332 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
    333 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
    334 //
    335 // The low 32 bits follow the pattern from above, tuned to different scripts
    336 // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
    337 // For runtime use of tables V3
    338 uint64 cld::OctaHash40(const char* word_ptr, int bytecount) {
    339   if (bytecount == 0) {
    340     return 0;
    341   }
    342   uint64 prepost = 0;
    343   if (word_ptr[-1] == ' ') {prepost |= kPreSpaceIndicator;}
    344   if (word_ptr[bytecount] == ' ') {prepost |= kPostSpaceIndicator;}
    345   return OctaHash40Mix(word_ptr, bytecount, prepost);
    346 }
    347 
    348 
    349 // OCTAGRAM wrapper with surrounding underscores (offline use)
    350 // Pick up 1..24 bytes plus pre/post space and hash them via mask/shift/add
    351 // UNDERSHOOTS 1 byte, OVERSHOOTS up to 3 bytes
    352 //
    353 // The low 32 bits follow the pattern from above, tuned to different scripts
    354 // The high 8 bits are a simple sum of all bytes, shifted by 0/1/2/3 bits each
    355 // For offline construction of tables
    356 uint64 cld::OctaHash40underscore(const char* word_ptr, int bytecount) {
    357   if (bytecount == 0) {
    358     return 0;
    359   }
    360   const char* local_word_ptr = word_ptr;
    361   int local_bytecount = bytecount;
    362   uint64 prepost = 0;
    363   if (local_word_ptr[0] == '_') {
    364     prepost |= kPreSpaceIndicator;
    365     ++local_word_ptr;
    366     --local_bytecount;
    367   }
    368   if (local_word_ptr[local_bytecount - 1] == '_') {
    369     prepost |= kPostSpaceIndicator;
    370     --local_bytecount;
    371   }
    372   return OctaHash40Mix(local_word_ptr, local_bytecount, prepost);
    373 }
    374 
    375 
    376 
    377 
    378 //------------------------------------------------------------------------------
    379 // Scoring single groups of letters
    380 //------------------------------------------------------------------------------
    381 
    382 // UNIGRAM score one => tote
    383 // Input: 1-byte entry of subscript into unigram probs, plus
    384 //  an accumulator tote.
    385 // Output: running sums in tote updated
    386 void cld::ProcessProbV25UniTote(int propval, Tote* tote) {
    387   tote->AddGram();
    388   const UnigramProbArray* pa = &kTargetCTJKVZProbs[propval];
    389   if (pa->probs[0] > 0) {tote->Add(cld::PackLanguage(CHINESE), pa->probs[0]);}
    390   if (pa->probs[1] > 0) {tote->Add(cld::PackLanguage(CHINESE_T), pa->probs[1]);}
    391   if (pa->probs[2] > 0) {tote->Add(cld::PackLanguage(JAPANESE), pa->probs[2]);}
    392   if (pa->probs[3] > 0) {tote->Add(cld::PackLanguage(KOREAN), pa->probs[3]);}
    393   if (pa->probs[4] > 0) {tote->Add(cld::PackLanguage(VIETNAMESE), pa->probs[4]);}
    394   if (pa->probs[5] > 0) {tote->Add(cld::PackLanguage(ZHUANG), pa->probs[5]);}
    395 }
    396 
    397 // BIGRAM, QUADGRAM, OCTAGRAM score one => tote
    398 // Input: 4-byte entry of 3 language numbers and one probability subscript, plus
    399 //  an accumulator tote. (language 0 means unused entry)
    400 // Output: running sums in tote updated
    401 void cld::ProcessProbV25Tote(uint32 probs, Tote* tote) {
    402   tote->AddGram();
    403   uint8 prob123 = (probs >> 0) & 0xff;
    404   const uint8* prob123_entry = cld::LgProb2TblEntry(prob123);
    405 
    406   uint8 top1 = (probs >> 8) & 0xff;
    407   if (top1 > 0) {tote->Add(top1, cld::LgProb3(prob123_entry, 0));}
    408   uint8 top2 = (probs >> 16) & 0xff;
    409   if (top2 > 0) {tote->Add(top2, cld::LgProb3(prob123_entry, 1));}
    410   uint8 top3 = (probs >> 24) & 0xff;
    411   if (top3 > 0) {tote->Add(top3, cld::LgProb3(prob123_entry, 2));}
    412 }
    413 
    414 
    415 //------------------------------------------------------------------------------
    416 // Routines to accumulate probabilities
    417 //------------------------------------------------------------------------------
    418 
    419 
    420 // UNIGRAM, using UTF-8 property table, advancing by 1/2/4/8 chars
    421 // Caller supplies table, such as compact_lang_det_generated_ctjkvz_b1_obj
    422 // Score up to n unigrams, returning number of bytes consumed
    423 // Updates tote_grams
    424 int cld::DoUniScoreV3(const UTF8PropObj* unigram_obj,
    425                       const char* isrc, int srclen, int advance_by,
    426                       int* tote_grams, int gram_limit, Tote* chunk_tote) {
    427   const char* src = isrc;
    428   if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
    429 
    430   // Property-based CJK unigram lookup
    431   if (src[0] == ' ') {++src; --srclen;}
    432 
    433   const uint8* usrc = reinterpret_cast<const uint8*>(src);
    434   int usrclen = srclen;
    435 
    436   while (usrclen > 0) {
    437     int len = kAdvanceOneChar[usrc[0]];
    438     // Look up property of one UTF-8 character and advance over it
    439     // Return 0 if input length is zero
    440     // Return 0 and advance one byte if input is ill-formed
    441 
    442     int propval = UTF8GenericPropertyBigOneByte(unigram_obj, &usrc, &usrclen);
    443 
    444     if (FLAGS_dbglookup) {
    445       DbgUniTermToStderr(propval, usrc, len);
    446     }
    447 
    448     if (propval > 0) {
    449       ProcessProbV25UniTote(propval, chunk_tote);
    450       ++(*tote_grams);
    451       if (FLAGS_dbgscore) {DbgScoreRecordUni((const char*)usrc, propval, len);}
    452     }
    453 
    454     // Advance by 1/2/4/8 characters (half of quad advance)
    455     if (advance_by == 2) {
    456       // Already advanced by 1
    457     } else if (advance_by == 4) {
    458       // Advance by 2 chars total, if not at end
    459       if (UTFmax <= usrclen) {
    460         int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
    461       }
    462     } else if (advance_by == 8) {
    463       // Advance by 4 chars total, if not at end
    464       if ((UTFmax * 3) <= usrclen) {
    465         int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
    466         n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
    467         n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
    468       }
    469     } else {
    470       // Advance by 8 chars total, if not at end
    471       if ((UTFmax * 7) <= usrclen) {
    472         int n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
    473         n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
    474         n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
    475         n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
    476         n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
    477         n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
    478         n = kAdvanceOneChar[*usrc]; usrc += n; usrclen -= n;
    479       }
    480     }
    481     DCHECK(usrclen >= 0);
    482 
    483     if (*tote_grams >= gram_limit) {
    484       break;
    485     }
    486   }
    487   if (FLAGS_dbgscore) {
    488     // With advance_by>2, we consume more input to get the same number of quads
    489     int len = src - isrc;
    490     DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
    491     DbgScoreFlush();
    492   }
    493 
    494   int consumed2 = reinterpret_cast<const char*>(usrc) - isrc;
    495   return consumed2;
    496 }
    497 
    498 
    499 // BIGRAM, using hash table, always advancing by 1 char
    500 // Caller supplies table, such as &kCjkBiTable_obj or &kGibberishTable_obj
    501 // Score all bigrams in isrc, using languages that have bigrams (CJK)
    502 // Return number of bigrams that hit in the hash table
    503 int cld::DoBigramScoreV3(const cld::CLDTableSummary* bigram_obj,
    504                          const char* isrc, int srclen, Tote* chunk_tote) {
    505   int hit_count = 0;
    506   const char* src = isrc;
    507 
    508   // Hashtable-based CJK bigram lookup
    509   const uint8* usrc = reinterpret_cast<const uint8*>(src);
    510   const uint8* usrclimit1 = usrc + srclen - UTFmax;
    511   if (FLAGS_dbgscore) {
    512     fprintf(stderr, "  " );
    513   }
    514 
    515   while (usrc < usrclimit1) {
    516     int len = kAdvanceOneChar[usrc[0]];
    517     int len2 = kAdvanceOneChar[usrc[len]] + len;
    518 
    519     if ((kMinCJKUTF8CharBytes * 2) <= len2) {      // Two CJK chars possible
    520       // Lookup and score this bigram
    521       // Always ignore pre/post spaces
    522       uint32 bihash = BiHashV25(reinterpret_cast<const char*>(usrc), len2);
    523       uint32 probs = QuadHashV3Lookup4(bigram_obj, bihash);
    524       // Now go indirect on the subscript
    525       probs = bigram_obj->kCLDTableInd[probs &
    526         ~bigram_obj->kCLDTableKeyMask];
    527 
    528       // Process the bigram
    529       if (FLAGS_dbglookup) {
    530         const char* ssrc = reinterpret_cast<const char*>(usrc);
    531         DbgBiTermToStderr(bihash, probs, ssrc, len2);
    532         DbgScoreRecord(NULL, probs, len2);
    533       } else if (FLAGS_dbgscore && (probs != 0)) {
    534         const char* ssrc = reinterpret_cast<const char*>(usrc);
    535         DbgScoreRecord(NULL, probs, len2);
    536         string temp(ssrc, len2);
    537         fprintf(stderr, "%s ", temp.c_str());
    538       }
    539 
    540       if (probs != 0) {
    541         ProcessProbV25Tote(probs, chunk_tote);
    542         ++hit_count;
    543       }
    544     }
    545     usrc += len;  // Advance by one char
    546   }
    547 
    548   if (FLAGS_dbgscore) {
    549     fprintf(stderr, "[%d bigrams scored]\n", hit_count);
    550     DbgScoreState();
    551   }
    552   return hit_count;
    553 }
    554 
    555 
    556 
    557 // QUADGRAM, using hash table, advancing by 2/4/8/16 chars
    558 // Caller supplies table, such as &kQuadTable_obj or &kGibberishTable_obj
    559 // Score up to n quadgrams, returning number of bytes consumed
    560 // Updates tote_grams
    561 int cld::DoQuadScoreV3(const cld::CLDTableSummary* quadgram_obj,
    562                        const char* isrc, int srclen, int advance_by,
    563                        int* tote_grams, int gram_limit, Tote* chunk_tote) {
    564   const char* src = isrc;
    565   const char* srclimit = src + srclen;
    566   // Limit is end, which has extra 20 20 20 00 past len
    567   const char* srclimit7 = src + srclen - (UTFmax * 7);
    568   const char* srclimit15 = src + srclen - (UTFmax * 15);
    569 
    570   if (FLAGS_dbgscore) {DbgScoreInit(src, srclen);}
    571 
    572   // Run a little cache of last hits to catch overly-repetitive "text"
    573   int next_prior = 0;
    574   uint32 prior_quads[2] = {0, 0};
    575 
    576   // Visit all quadgrams
    577   if (src[0] == ' ') {++src;}
    578   while (src < srclimit) {
    579     // Find one quadgram
    580     const char* src_end = src;
    581     src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
    582     src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
    583     const char* src_mid = src_end;
    584     src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
    585     src_end += kAdvanceOneCharButSpace[(uint8)src_end[0]];
    586     int len = src_end - src;
    587 
    588     // Lookup and score this quadgram
    589     uint32 quadhash = QuadHashV25(src, len);
    590     uint32 probs = QuadHashV3Lookup4(quadgram_obj, quadhash);
    591     // Now go indirect on the subscript
    592     probs = quadgram_obj->kCLDTableInd[probs &
    593       ~quadgram_obj->kCLDTableKeyMask];
    594 
    595     // Process the quadgram
    596     if (FLAGS_dbglookup) {
    597       DbgQuadTermToStderr(quadhash, probs, src, len);
    598     }
    599     if (probs != 0) {
    600       // Filter out recent repeats. If this works out, use in the other lookups
    601       if ((quadhash != prior_quads[0]) && (quadhash != prior_quads[1])) {
    602         prior_quads[next_prior] = quadhash;
    603         next_prior = (next_prior + 1) & 1;
    604         ProcessProbV25Tote(probs, chunk_tote);
    605         ++(*tote_grams);
    606         if (FLAGS_dbgscore) {DbgScoreRecord(src, probs, len);}
    607       }
    608     }
    609 
    610     // Advance all the way past word if at end-of-word
    611     if (src_end[0] == ' ') {
    612       src_mid = src_end;
    613     }
    614 
    615     // Advance by 2/4/8/16 characters
    616     if (advance_by == 2) {
    617       src = src_mid;
    618     } else if (advance_by == 4) {
    619       src = src_end;
    620     } else if (advance_by == 8) {
    621       // Advance by 8 chars total (4 more), if not at end
    622       if (src < srclimit7) {
    623         src_end += kAdvanceOneChar[(uint8)src_end[0]];
    624         src_end += kAdvanceOneChar[(uint8)src_end[0]];
    625         src_end += kAdvanceOneChar[(uint8)src_end[0]];
    626         src_end += kAdvanceOneChar[(uint8)src_end[0]];
    627       }
    628       src = src_end;
    629     } else {
    630       // Advance by 16 chars total (12 more), if not at end
    631       if (src < srclimit15) {
    632         // Advance by ~16 chars by adding 3 * current bytelen
    633         int fourcharlen = src_end - src;
    634         src = src_end + (3 * fourcharlen);
    635         // Advance a bit more if mid-character
    636         src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
    637         src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
    638       } else {
    639         src = src_end;
    640       }
    641     }
    642     DCHECK(src < srclimit);
    643     src += kAdvanceOneCharSpaceVowel[(uint8)src[0]];
    644 
    645     if (*tote_grams >= gram_limit) {
    646       break;
    647     }
    648   }
    649 
    650   if (FLAGS_dbgscore) {
    651     // With advance_by>2, we consume more input to get the same number of quads
    652     int len = src - isrc;
    653     DbgScoreTop(src, (len * 2) / advance_by, chunk_tote);
    654     DbgScoreFlush();
    655   }
    656 
    657   int consumed = src - isrc;
    658 
    659   // If advancing by more than 2, src may have overshot srclimit
    660   if (consumed > srclen) {
    661     consumed = srclen;
    662   }
    663 
    664   return consumed;
    665 }
    666 
    667 
    668 // OCTAGRAM, using hash table, always advancing by 1 word
    669 // Caller supplies table, such as &kLongWord8Table_obj
    670 // Score all words in isrc, using languages that have quadgrams
    671 // We don't normally use this routine except on the first quadgram run,
    672 // but it can be used to resolve unreliable pages.
    673 // This routine does not have an optimized advance_by
    674 // SOON: Uses indirect language/probability longword
    675 //
    676 // Return number of words that hit in the hash table
    677 int cld::DoOctaScoreV3(const cld::CLDTableSummary* octagram_obj,
    678                        const char* isrc, int srclen, Tote* chunk_tote) {
    679   int hit_count = 0;
    680   const char* src = isrc;
    681   const char* srclimit = src + srclen + 1;
    682   // Limit is end+1, to include extra space char (0x20) off the end
    683   //
    684   // Score all words truncated to 8 characters
    685   int charcount = 0;
    686   // Skip any initial space
    687   if (src[0] == ' ') {++src;}
    688   const char* word_ptr = src;
    689   const char* word_end = word_ptr;
    690   if (FLAGS_dbgscore) {
    691     fprintf(stderr, "  " );
    692   }
    693   while (src < srclimit) {
    694     // Terminate previous word or continue current word
    695     if (src[0] == ' ') {
    696       int bytecount = word_end - word_ptr;
    697       if (bytecount == 0)
    698         break;
    699       // Lookup and score this word
    700       uint64 wordhash40 = OctaHash40(word_ptr, bytecount);
    701       uint32 probs = OctaHashV3Lookup4(octagram_obj, wordhash40);
    702       // Now go indirect on the subscript
    703       probs = octagram_obj->kCLDTableInd[probs &
    704         ~octagram_obj->kCLDTableKeyMask];
    705 
    706       // // Lookup and score this word
    707       // uint32 wordhash = QuadHashV25(word_ptr, bytecount);
    708       // uint32 probs = WordHashLookup4(wordhash, kLongWord8Table,
    709       //                                kLongWord8TableSize);
    710       //
    711       if (FLAGS_dbglookup) {
    712         DbgWordTermToStderr(wordhash40, probs, word_ptr, bytecount);
    713         DbgScoreRecord(NULL, probs, bytecount);
    714       } else if (FLAGS_dbgscore && (probs != 0)) {
    715         DbgScoreRecord(NULL, probs, bytecount);
    716         string temp(word_ptr, bytecount);
    717         fprintf(stderr, "%s ", temp.c_str());
    718       }
    719 
    720       if (probs != 0) {
    721         ProcessProbV25Tote(probs, chunk_tote);
    722         ++hit_count;
    723       }
    724       charcount = 0;
    725       word_ptr = src + 1;   // Over the space
    726       word_end = word_ptr;
    727     } else {
    728       ++charcount;
    729     }
    730 
    731     // Advance to next char
    732     src += cld_UniLib::OneCharLen(src);
    733     if (charcount <= 8) {
    734       word_end = src;
    735     }
    736   }
    737 
    738   if (FLAGS_dbgscore) {
    739     fprintf(stderr, "[%d words scored]\n", hit_count);
    740     DbgScoreState();
    741   }
    742   return hit_count;
    743 }
    744 
    745 
    746 
    747 //------------------------------------------------------------------------------
    748 // Reliability calculations, for single language and between languages
    749 //------------------------------------------------------------------------------
    750 
    751 // Return reliablity of result 0..100 for top two scores
    752 // delta==0 is 0% reliable, delta==fully_reliable_thresh is 100% reliable
    753 // (on a scale where +1 is a factor of  2 ** 1.6 = 3.02)
    754 // Threshold is uni/quadgram increment count, bounded above and below.
    755 //
    756 // Requiring a factor of 3 improvement (e.g. +1 log base 3)
    757 // for each scored quadgram is too stringent, so I've backed this off to a
    758 // factor of 2 (e.g. +5/8 log base 3).
    759 //
    760 // I also somewhat lowered the Min/MaxGramCount limits above
    761 //
    762 // Added: if fewer than 8 quads/unis, max reliability is 12*n percent
    763 //
    764 int cld::ReliabilityDelta(int value1, int value2, int gramcount) {
    765   int max_reliability_percent = 100;
    766   if (gramcount < 8) {
    767     max_reliability_percent = 12 * gramcount;
    768   }
    769   int fully_reliable_thresh = (gramcount * 5) >> 3;     // see note above
    770   if (fully_reliable_thresh < kMinGramCount) {          // Fully = 3..16
    771     fully_reliable_thresh = kMinGramCount;
    772   } else if (fully_reliable_thresh > kMaxGramCount) {
    773     fully_reliable_thresh = kMaxGramCount;
    774   }
    775 
    776   int delta = value1 - value2;
    777   if (delta >= fully_reliable_thresh) {return max_reliability_percent;}
    778   if (delta <= 0) {return 0;}
    779   return cld::minint(max_reliability_percent,
    780                      (100 * delta) / fully_reliable_thresh);
    781 }
    782 
    783 // Return reliablity of result 0..100 for top score vs. mainsteam score
    784 // Values are score per 1024 bytes of input
    785 // ratio = max(top/mainstream, mainstream/top)
    786 // ratio > 4.0 is 0% reliable, <= 2.0 is 100% reliable
    787 // Change: short-text word scoring can give unusually good results.
    788 //  Let top exceed mainstream by 4x at 50% reliable
    789 int cld::ReliabilityMainstream(int topscore, int len, int mean_score) {
    790   if (mean_score == 0) {return 100;}    // No reliability data available yet
    791   if (topscore == 0) {return 0;}        // zero score = unreliable
    792   if (len == 0) {return 0;}             // zero len = unreliable
    793   int top_kb = (topscore << 10) / len;
    794   double ratio;
    795   double ratio_cutoff;
    796   if (top_kb > mean_score) {
    797     ratio = (1.0 * top_kb) / mean_score;
    798     ratio_cutoff = 5.0;                 // ramp down from 100% to 0%: 3.0-5.0
    799   } else {
    800     ratio = (1.0 * mean_score) / top_kb;
    801     ratio_cutoff = 4.0;                 // ramp down from 100% to 0%: 2.0-4.0
    802   }
    803   if (ratio <= ratio_cutoff - 2.0) {return 100;}
    804   if (ratio > ratio_cutoff) {return 0;}
    805 
    806   int iratio = static_cast<int>(100 * (ratio_cutoff - ratio) / 2.0);
    807   return iratio;
    808 }
    809 
    810 // Calculate ratio of score per 1KB vs. expected score per 1KB
    811 double cld::GetNormalizedScore(Language lang, UnicodeLScript lscript,
    812                           int bytes, int score) {
    813   // Average training-data score for this language-script combo, per 1KB
    814   int expected_score = kMeanScore[lang * 4 + LScript4(lscript)];
    815   if (lscript == ULScript_Common) {
    816     // We don't know the script (only happens with second-chance score)
    817     // Look for first non-zero mean value
    818     for (int i = 2; i >= 0; --i) {
    819       if (kMeanScore[lang * 4 + i] > 0) {
    820         expected_score = kMeanScore[lang * 4 + i];
    821         break;
    822       }
    823     }
    824   }
    825   if (expected_score < 100) {
    826       expected_score = 1000;
    827   }
    828 
    829   // Our score per 1KB
    830   double our_score = (score << 10) / (bytes ? bytes : 1);  // Avoid zdiv
    831   double ratio = our_score / expected_score;
    832 
    833   // Just the raw count normalized as though each language has mean=1000;
    834   ratio = (score * 1000.0) /  expected_score;
    835   return ratio;
    836 }
    837 
    838 // Calculate reliablity of len bytes of script lscript with chunk_tote
    839 int cld::GetReliability(int len, UnicodeLScript lscript,
    840                    const Tote* chunk_tote) {
    841   Language cur_lang = UnpackLanguage(chunk_tote->Key(0));
    842   // Average score for this language-script combo
    843   int mean_score = kMeanScore[cur_lang * 4 + LScript4(lscript)];
    844   if (lscript == ULScript_Common) {
    845     // We don't know the script (only happens with second-chance score)
    846     // Look for first non-zero mean value
    847     for (int i = 2; i >= 0; --i) {
    848       if (kMeanScore[cur_lang * 4 + i] > 0) {
    849         mean_score = kMeanScore[cur_lang * 4 + i];
    850         break;
    851       }
    852     }
    853   }
    854   int reliability_delta = ReliabilityDelta(chunk_tote->Value(0),
    855                                            chunk_tote->Value(1),
    856                                            chunk_tote->GetGramCount());
    857 
    858   int reliability_main = ReliabilityMainstream(chunk_tote->Value(0),
    859                                                len,
    860                                                mean_score);
    861 
    862   int reliability_min = minint(reliability_delta, reliability_main);
    863 
    864 
    865   if (FLAGS_dbgreli) {
    866     char temp1[4];
    867     char temp2[4];
    868     cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(0)), temp1);
    869     if (temp1[2] == ' ') {temp1[2] = '\0';}
    870     cld::DbgLangName3(UnpackLanguage(chunk_tote->Key(1)), temp2);
    871     if (temp2[2] == ' ') {temp2[2] = '\0';}
    872     int srclen = len;
    873     fprintf(stderr, "CALC GetReliability gram=%d incr=%d srclen=%d,  %s=%d %s=%d "
    874                    "top/KB=%d mean/KB=%d del=%d%% reli=%d%%   "
    875                    "lang/lscript %d %d\n",
    876            chunk_tote->GetGramCount(),
    877            chunk_tote->GetIncrCount(),
    878            srclen,
    879            temp1, chunk_tote->Value(0),
    880            temp2, chunk_tote->Value(1),
    881            (chunk_tote->Value(0) << 10) / (srclen ? srclen : 1),
    882            mean_score,
    883            reliability_delta,
    884            reliability_main,
    885            cur_lang, lscript);
    886   }
    887 
    888   return reliability_min;
    889 }
    890 
    891 
    892 //------------------------------------------------------------------------------
    893 // Miscellaneous
    894 //------------------------------------------------------------------------------
    895 
    896 // Demote all languages except Top40 and plus_one
    897 // Do this just before sorting chunk_tote results
    898 void cld::DemoteNotTop40(Tote* chunk_tote, int packed_plus_one) {
    899   for (int sub = 0; sub < chunk_tote->MaxSize(); ++sub) {
    900     if (chunk_tote->Key(sub) == 0) continue;
    901     if (chunk_tote->Key(sub) == packed_plus_one) continue;
    902     if (kIsPackedTop40[chunk_tote->Key(sub)]) continue;
    903     // Quarter the score of others
    904     chunk_tote->SetValue(sub, chunk_tote->Value(sub) >> 2);
    905   }
    906 }
    907