1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "encodings/compact_lang_det/tote.h" 6 #include <string.h> // memset 7 8 #include "encodings/compact_lang_det/win/cld_logging.h" 9 10 11 // Take a set of <key, value> pairs and tote them up. 12 // After explicitly sorting, retrieve top key, value pairs 13 Tote::Tote() { 14 gram_count_ = 0; 15 incr_count_ = 0; 16 byte_count_ = 0; 17 memset(key_, 0, sizeof(key_)); 18 // No need to initialize values 19 } 20 21 Tote::~Tote() { 22 } 23 24 void Tote::Reinit() { 25 gram_count_ = 0; 26 incr_count_ = 0; 27 byte_count_ = 0; 28 memset(key_, 0, sizeof(key_)); 29 // No need to initialize values 30 } 31 32 // Increment count of quadgrams/trigrams/unigrams scored 33 void Tote::AddGram() { 34 ++gram_count_; 35 } 36 37 // Three-way associative, guaranteeing that the largest two counts are always 38 // in the data structure. kMaxSize must be a multiple of 3, and is tied to the 39 // subscript calculations here, which are for 8 sets of 3-way associative 40 // buckets. The subscripts for set N are [N], [N+8], and [N+16] used in a 41 // slightly-weird way: The initial probe point is [N] or [N+8], whichever 42 // is specified by key mod 16. In most cases (nearly *all* cases except Latin 43 // script), this entry matches and we update/return. The second probe is 44 // the other of [N] and [N+8]. The third probe is only used as a fallback to 45 // these two, and is there only for the rare case that there are three or more 46 // languages with Language enum values equal mod 8, contending within the same 47 // bucket. This can only happen in Latin and (rarely) Cyrillic scripts, because 48 // the other scripts have fewer than 17 languages total. 49 // If you change kMaxSize, change the constants 7/8/15/16 below 50 void Tote::Add(uint8 ikey, int idelta) { 51 DCHECK(ikey != 0); 52 ++incr_count_; 53 54 // Look for existing entry 55 int sub0 = ikey & 15; 56 if (key_[sub0] == ikey) { 57 value_[sub0] += idelta; 58 return; 59 } 60 int sub1 = sub0 ^ 8; 61 if (key_[sub1] == ikey) { 62 value_[sub1] += idelta; 63 return; 64 } 65 int sub2 = (ikey & 7) + 16; 66 if (key_[sub2] == ikey) { 67 value_[sub2] += idelta; 68 return; 69 } 70 71 // Allocate new entry 72 int alloc = -1; 73 if (key_[sub0] == 0) { 74 alloc = sub0; 75 } else if (key_[sub1] == 0) { 76 alloc = sub1; 77 } else if (key_[sub2] == 0) { 78 alloc = sub2; 79 } else { 80 // All choices allocated, need to replace smallest one 81 alloc = sub0; 82 if (value_[sub1] < value_[alloc]) {alloc = sub1;} 83 if (value_[sub2] < value_[alloc]) {alloc = sub2;} 84 } 85 key_[alloc] = ikey; 86 value_[alloc] = idelta; 87 return; 88 } 89 90 // Return current top key 91 int Tote::CurrentTopKey() { 92 int top_key = 0; 93 int top_value = -1; 94 for (int sub = 0; sub < kMaxSize_; ++sub) { 95 if (key_[sub] == 0) {continue;} 96 if (top_value < value_[sub]) { 97 top_value = value_[sub]; 98 top_key = key_[sub]; 99 } 100 } 101 return top_key; 102 } 103 104 105 // Sort first n entries by decreasing order of value 106 // If key==0 other fields are not valid, treat value as -1 107 void Tote::Sort(int n) { 108 // This is n**2, but n is small 109 for (int sub = 0; sub < n; ++sub) { 110 if (key_[sub] == 0) {value_[sub] = -1;} 111 112 // Bubble sort key[sub] and entry[sub] 113 for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) { 114 if (key_[sub2] == 0) {value_[sub2] = -1;} 115 if (value_[sub] < value_[sub2]) { 116 // swap 117 uint8 tmpk = key_[sub]; 118 key_[sub] = key_[sub2]; 119 key_[sub2] = tmpk; 120 int tmpv = value_[sub]; 121 value_[sub] = value_[sub2]; 122 value_[sub2] = tmpv; 123 } 124 } 125 } 126 } 127 128 void Tote::Dump(FILE* f) { 129 for (int sub = 0; sub < kMaxSize_; ++sub) { 130 if (key_[sub] > 0) { 131 fprintf(f, "[%2d] %3d %8d\n", sub, key_[sub], value_[sub]); 132 } 133 } 134 fprintf(f, "%d %d %d\n", gram_count_, incr_count_, byte_count_); 135 } 136 137 138 139 140 // Take a set of <key, value> pairs and tote them up. 141 // After explicitly sorting, retrieve top key, value pairs 142 ToteWithReliability::ToteWithReliability() { 143 // No need to initialize score_ or value_ 144 incr_count_ = 0; 145 sorted_ = 0; 146 memset(closepair_, 0, sizeof(closepair_)); 147 memset(key_, 0, sizeof(key_)); 148 } 149 150 ToteWithReliability::~ToteWithReliability() { 151 } 152 153 void ToteWithReliability::Reinit() { 154 // No need to initialize score_ or value_ 155 incr_count_ = 0; 156 sorted_ = 0; 157 memset(closepair_, 0, sizeof(closepair_)); 158 memset(key_, 0, sizeof(key_)); 159 ////ss_.Init(); 160 } 161 162 // Weight reliability by ibytes 163 // Also see three-way associative comments above for Tote 164 void ToteWithReliability::Add(uint8 ikey, int ibytes, 165 int score, int ireliability) { 166 DCHECK(ikey != 0); 167 CHECK(sorted_ == 0); 168 ++incr_count_; 169 170 // Look for existing entry 171 int sub0 = ikey & 15; 172 if (key_[sub0] == ikey) { 173 value_[sub0] += ibytes; 174 score_[sub0] += score; 175 reliability_[sub0] += ireliability * ibytes; 176 return; 177 } 178 int sub1 = sub0 ^ 8; 179 if (key_[sub1] == ikey) { 180 value_[sub1] += ibytes; 181 score_[sub1] += score; 182 reliability_[sub1] += ireliability * ibytes; 183 return; 184 } 185 int sub2 = (ikey & 7) + 16; 186 if (key_[sub2] == ikey) { 187 value_[sub2] += ibytes; 188 score_[sub2] += score; 189 reliability_[sub2] += ireliability * ibytes; 190 return; 191 } 192 193 // Allocate new entry 194 int alloc = -1; 195 if (key_[sub0] == 0) { 196 alloc = sub0; 197 } else if (key_[sub1] == 0) { 198 alloc = sub1; 199 } else if (key_[sub2] == 0) { 200 alloc = sub2; 201 } else { 202 // All choices allocated, need to replace smallest one 203 alloc = sub0; 204 if (value_[sub1] < value_[alloc]) {alloc = sub1;} 205 if (value_[sub2] < value_[alloc]) {alloc = sub2;} 206 } 207 key_[alloc] = ikey; 208 value_[alloc] = ibytes; 209 score_[alloc] = score; 210 reliability_[alloc] = ireliability * ibytes; 211 return; 212 } 213 214 // Find subscript of a given packed language, or -1 215 int ToteWithReliability::Find(uint8 ikey) { 216 DCHECK(ikey != 0); 217 218 if (sorted_) { 219 // Linear search if sorted 220 for (int sub = 0; sub < kMaxSize_; ++sub) { 221 if (key_[sub] == ikey) {return sub;} 222 } 223 return -1; 224 } 225 226 // Look for existing entry 227 int sub0 = ikey & 15; 228 if (key_[sub0] == ikey) { 229 return sub0; 230 } 231 int sub1 = sub0 ^ 8; 232 if (key_[sub1] == ikey) { 233 return sub1; 234 } 235 int sub2 = (ikey & 7) + 16; 236 if (key_[sub2] == ikey) { 237 return sub2; 238 } 239 240 return -1; 241 } 242 243 // Return current top key 244 int ToteWithReliability::CurrentTopKey() { 245 int top_key = 0; 246 int top_value = -1; 247 for (int sub = 0; sub < kMaxSize_; ++sub) { 248 if (key_[sub] == 0) {continue;} 249 if (top_value < value_[sub]) { 250 top_value = value_[sub]; 251 top_key = key_[sub]; 252 } 253 } 254 return top_key; 255 } 256 257 258 // Sort first n entries by decreasing order of value 259 // If key==0 other fields are not valid, treat value as -1 260 void ToteWithReliability::Sort(int n) { 261 // This is n**2, but n is small 262 for (int sub = 0; sub < n; ++sub) { 263 if (key_[sub] == 0) {value_[sub] = -1;} 264 265 // Bubble sort key[sub] and entry[sub] 266 for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) { 267 if (key_[sub2] == 0) {value_[sub2] = -1;} 268 if (value_[sub] < value_[sub2]) { 269 // swap 270 uint8 tmpk = key_[sub]; 271 key_[sub] = key_[sub2]; 272 key_[sub2] = tmpk; 273 274 int tmpv = value_[sub]; 275 value_[sub] = value_[sub2]; 276 value_[sub2] = tmpv; 277 278 int tmps = score_[sub]; 279 score_[sub] = score_[sub2]; 280 score_[sub2] = tmps; 281 282 int tmpr = reliability_[sub]; 283 reliability_[sub] = reliability_[sub2]; 284 reliability_[sub2] = tmpr; 285 } 286 } 287 } 288 sorted_ = 1; 289 } 290 291 void ToteWithReliability::Dump(FILE* f) { 292 for (int sub = 0; sub < kMaxSize_; ++sub) { 293 if (key_[sub] > 0) { 294 fprintf(f, "[%2d] %3d %6d %5d %4d\n", 295 sub, key_[sub], value_[sub], score_[sub], reliability_[sub]); 296 } 297 } 298 fprintf(f, " %d#\n", incr_count_); 299 } 300