Home | History | Annotate | Download | only in compact_lang_det
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "encodings/compact_lang_det/tote.h"
      6 #include <string.h>     // memset
      7 
      8 #include "encodings/compact_lang_det/win/cld_logging.h"
      9 
     10 
     11 // Take a set of <key, value> pairs and tote them up.
     12 // After explicitly sorting, retrieve top key, value pairs
     13 Tote::Tote() {
     14   gram_count_ = 0;
     15   incr_count_ = 0;
     16   byte_count_ = 0;
     17   memset(key_, 0, sizeof(key_));
     18   // No need to initialize values
     19 }
     20 
     21 Tote::~Tote() {
     22 }
     23 
     24 void Tote::Reinit() {
     25   gram_count_ = 0;
     26   incr_count_ = 0;
     27   byte_count_ = 0;
     28   memset(key_, 0, sizeof(key_));
     29   // No need to initialize values
     30 }
     31 
     32 // Increment count of quadgrams/trigrams/unigrams scored
     33 void Tote::AddGram() {
     34   ++gram_count_;
     35 }
     36 
     37 // Three-way associative, guaranteeing that the largest two counts are always
     38 // in the data structure. kMaxSize must be a multiple of 3, and is tied to the
     39 // subscript calculations here, which are for 8 sets of 3-way associative
     40 // buckets. The subscripts for set N are [N], [N+8], and [N+16] used in a
     41 // slightly-weird way: The initial probe point is [N] or [N+8], whichever
     42 // is specified by key mod 16. In most cases (nearly *all* cases except Latin
     43 // script), this entry matches and we update/return. The second probe is
     44 // the other of [N] and [N+8]. The third probe is only used as a fallback to
     45 // these two, and is there only for the rare case that there are three or more
     46 // languages with Language enum values equal mod 8, contending within the same
     47 // bucket. This can only happen in Latin and (rarely) Cyrillic scripts, because
     48 // the other scripts have fewer than 17 languages total.
     49 // If you change kMaxSize, change the constants 7/8/15/16 below
     50 void Tote::Add(uint8 ikey, int idelta) {
     51   DCHECK(ikey != 0);
     52   ++incr_count_;
     53 
     54   // Look for existing entry
     55   int sub0 = ikey & 15;
     56   if (key_[sub0] == ikey) {
     57     value_[sub0] += idelta;
     58     return;
     59   }
     60   int sub1 = sub0 ^ 8;
     61   if (key_[sub1] == ikey) {
     62     value_[sub1] += idelta;
     63     return;
     64   }
     65   int sub2 = (ikey & 7) + 16;
     66   if (key_[sub2] == ikey) {
     67     value_[sub2] += idelta;
     68     return;
     69   }
     70 
     71   // Allocate new entry
     72   int alloc = -1;
     73   if (key_[sub0] == 0) {
     74     alloc = sub0;
     75   } else if (key_[sub1] == 0) {
     76     alloc = sub1;
     77   } else if (key_[sub2] == 0) {
     78     alloc = sub2;
     79   } else {
     80     // All choices allocated, need to replace smallest one
     81     alloc = sub0;
     82     if (value_[sub1] < value_[alloc]) {alloc = sub1;}
     83     if (value_[sub2] < value_[alloc]) {alloc = sub2;}
     84   }
     85   key_[alloc] = ikey;
     86   value_[alloc] = idelta;
     87   return;
     88 }
     89 
     90 // Return current top key
     91 int Tote::CurrentTopKey() {
     92   int top_key = 0;
     93   int top_value = -1;
     94   for (int sub = 0; sub < kMaxSize_; ++sub) {
     95     if (key_[sub] == 0) {continue;}
     96     if (top_value < value_[sub]) {
     97       top_value = value_[sub];
     98       top_key = key_[sub];
     99     }
    100   }
    101   return top_key;
    102 }
    103 
    104 
    105 // Sort first n entries by decreasing order of value
    106 // If key==0 other fields are not valid, treat value as -1
    107 void Tote::Sort(int n) {
    108   // This is n**2, but n is small
    109   for (int sub = 0; sub < n; ++sub) {
    110     if (key_[sub] == 0) {value_[sub] = -1;}
    111 
    112     // Bubble sort key[sub] and entry[sub]
    113     for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
    114       if (key_[sub2] == 0) {value_[sub2] = -1;}
    115       if (value_[sub] < value_[sub2]) {
    116         // swap
    117         uint8 tmpk = key_[sub];
    118         key_[sub] = key_[sub2];
    119         key_[sub2] = tmpk;
    120         int tmpv = value_[sub];
    121         value_[sub] = value_[sub2];
    122         value_[sub2] = tmpv;
    123       }
    124     }
    125   }
    126 }
    127 
    128 void Tote::Dump(FILE* f) {
    129   for (int sub = 0; sub < kMaxSize_; ++sub) {
    130     if (key_[sub] > 0) {
    131       fprintf(f, "[%2d] %3d %8d\n", sub, key_[sub], value_[sub]);
    132     }
    133   }
    134   fprintf(f, "%d %d %d\n", gram_count_, incr_count_, byte_count_);
    135 }
    136 
    137 
    138 
    139 
    140 // Take a set of <key, value> pairs and tote them up.
    141 // After explicitly sorting, retrieve top key, value pairs
    142 ToteWithReliability::ToteWithReliability() {
    143   // No need to initialize score_ or value_
    144   incr_count_ = 0;
    145   sorted_ = 0;
    146   memset(closepair_, 0, sizeof(closepair_));
    147   memset(key_, 0, sizeof(key_));
    148 }
    149 
    150 ToteWithReliability::~ToteWithReliability() {
    151 }
    152 
    153 void ToteWithReliability::Reinit() {
    154   // No need to initialize score_ or value_
    155   incr_count_ = 0;
    156   sorted_ = 0;
    157   memset(closepair_, 0, sizeof(closepair_));
    158   memset(key_, 0, sizeof(key_));
    159   ////ss_.Init();
    160 }
    161 
    162 // Weight reliability by ibytes
    163 // Also see three-way associative comments above for Tote
    164 void ToteWithReliability::Add(uint8 ikey, int ibytes,
    165                               int score, int ireliability) {
    166   DCHECK(ikey != 0);
    167   CHECK(sorted_ == 0);
    168   ++incr_count_;
    169 
    170   // Look for existing entry
    171   int sub0 = ikey & 15;
    172   if (key_[sub0] == ikey) {
    173     value_[sub0] += ibytes;
    174     score_[sub0] += score;
    175     reliability_[sub0] += ireliability * ibytes;
    176     return;
    177   }
    178   int sub1 = sub0 ^ 8;
    179   if (key_[sub1] == ikey) {
    180     value_[sub1] += ibytes;
    181     score_[sub1] += score;
    182     reliability_[sub1] += ireliability * ibytes;
    183     return;
    184   }
    185   int sub2 = (ikey & 7) + 16;
    186   if (key_[sub2] == ikey) {
    187     value_[sub2] += ibytes;
    188     score_[sub2] += score;
    189     reliability_[sub2] += ireliability * ibytes;
    190     return;
    191   }
    192 
    193   // Allocate new entry
    194   int alloc = -1;
    195   if (key_[sub0] == 0) {
    196     alloc = sub0;
    197   } else if (key_[sub1] == 0) {
    198     alloc = sub1;
    199   } else if (key_[sub2] == 0) {
    200     alloc = sub2;
    201   } else {
    202     // All choices allocated, need to replace smallest one
    203     alloc = sub0;
    204     if (value_[sub1] < value_[alloc]) {alloc = sub1;}
    205     if (value_[sub2] < value_[alloc]) {alloc = sub2;}
    206   }
    207   key_[alloc] = ikey;
    208   value_[alloc] = ibytes;
    209   score_[alloc] = score;
    210   reliability_[alloc] = ireliability * ibytes;
    211   return;
    212 }
    213 
    214 // Find subscript of a given packed language, or -1
    215 int ToteWithReliability::Find(uint8 ikey) {
    216   DCHECK(ikey != 0);
    217 
    218   if (sorted_) {
    219     // Linear search if sorted
    220     for (int sub = 0; sub < kMaxSize_; ++sub) {
    221       if (key_[sub] == ikey) {return sub;}
    222     }
    223     return -1;
    224   }
    225 
    226   // Look for existing entry
    227   int sub0 = ikey & 15;
    228   if (key_[sub0] == ikey) {
    229     return sub0;
    230   }
    231   int sub1 = sub0 ^ 8;
    232   if (key_[sub1] == ikey) {
    233     return sub1;
    234   }
    235   int sub2 = (ikey & 7) + 16;
    236   if (key_[sub2] == ikey) {
    237     return sub2;
    238   }
    239 
    240   return -1;
    241 }
    242 
    243 // Return current top key
    244 int ToteWithReliability::CurrentTopKey() {
    245   int top_key = 0;
    246   int top_value = -1;
    247   for (int sub = 0; sub < kMaxSize_; ++sub) {
    248     if (key_[sub] == 0) {continue;}
    249     if (top_value < value_[sub]) {
    250       top_value = value_[sub];
    251       top_key = key_[sub];
    252     }
    253   }
    254   return top_key;
    255 }
    256 
    257 
    258 // Sort first n entries by decreasing order of value
    259 // If key==0 other fields are not valid, treat value as -1
    260 void ToteWithReliability::Sort(int n) {
    261   // This is n**2, but n is small
    262   for (int sub = 0; sub < n; ++sub) {
    263     if (key_[sub] == 0) {value_[sub] = -1;}
    264 
    265     // Bubble sort key[sub] and entry[sub]
    266     for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
    267       if (key_[sub2] == 0) {value_[sub2] = -1;}
    268       if (value_[sub] < value_[sub2]) {
    269         // swap
    270         uint8 tmpk = key_[sub];
    271         key_[sub] = key_[sub2];
    272         key_[sub2] = tmpk;
    273 
    274         int tmpv = value_[sub];
    275         value_[sub] = value_[sub2];
    276         value_[sub2] = tmpv;
    277 
    278         int tmps = score_[sub];
    279         score_[sub] = score_[sub2];
    280         score_[sub2] = tmps;
    281 
    282         int tmpr = reliability_[sub];
    283         reliability_[sub] = reliability_[sub2];
    284         reliability_[sub2] = tmpr;
    285       }
    286     }
    287   }
    288   sorted_ = 1;
    289 }
    290 
    291 void ToteWithReliability::Dump(FILE* f) {
    292   for (int sub = 0; sub < kMaxSize_; ++sub) {
    293     if (key_[sub] > 0) {
    294       fprintf(f, "[%2d] %3d %6d %5d %4d\n",
    295               sub, key_[sub], value_[sub], score_[sub], reliability_[sub]);
    296     }
    297   }
    298   fprintf(f, "  %d#\n", incr_count_);
    299 }
    300