1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <assert.h> 18 #include <math.h> 19 #include <stdio.h> 20 #include <string.h> 21 #include <time.h> 22 #include "../include/mystdlib.h" 23 #include "../include/ngram.h" 24 25 namespace ime_pinyin { 26 27 #define ADD_COUNT 0.3 28 29 int comp_double(const void *p1, const void *p2) { 30 if (*static_cast<const double*>(p1) < *static_cast<const double*>(p2)) 31 return -1; 32 if (*static_cast<const double*>(p1) > *static_cast<const double*>(p2)) 33 return 1; 34 return 0; 35 } 36 37 inline double distance(double freq, double code) { 38 // return fabs(freq - code); 39 return freq * fabs(log(freq) - log(code)); 40 } 41 42 // Find the index of the code value which is nearest to the given freq 43 int qsearch_nearest(double code_book[], double freq, int start, int end) { 44 if (start == end) 45 return start; 46 47 if (start + 1 == end) { 48 if (distance(freq, code_book[end]) > distance(freq, code_book[start])) 49 return start; 50 return end; 51 } 52 53 int mid = (start + end) / 2; 54 55 if (code_book[mid] > freq) 56 return qsearch_nearest(code_book, freq, start, mid); 57 else 58 return qsearch_nearest(code_book, freq, mid, end); 59 } 60 61 size_t update_code_idx(double freqs[], size_t num, double code_book[], 62 CODEBOOK_TYPE *code_idx) { 63 size_t changed = 0; 64 for (size_t pos = 0; pos < num; pos++) { 65 CODEBOOK_TYPE idx; 66 idx = qsearch_nearest(code_book, freqs[pos], 0, kCodeBookSize - 1); 67 if (idx != code_idx[pos]) 68 changed++; 69 code_idx[pos] = idx; 70 } 71 return changed; 72 } 73 74 double recalculate_kernel(double freqs[], size_t num, double code_book[], 75 CODEBOOK_TYPE *code_idx) { 76 double ret = 0; 77 78 size_t *item_num = new size_t[kCodeBookSize]; 79 assert(item_num); 80 memset(item_num, 0, sizeof(size_t) * kCodeBookSize); 81 82 double *cb_new = new double[kCodeBookSize]; 83 assert(cb_new); 84 memset(cb_new, 0, sizeof(double) * kCodeBookSize); 85 86 for (size_t pos = 0; pos < num; pos++) { 87 ret += distance(freqs[pos], code_book[code_idx[pos]]); 88 89 cb_new[code_idx[pos]] += freqs[pos]; 90 item_num[code_idx[pos]] += 1; 91 } 92 93 for (size_t code = 0; code < kCodeBookSize; code++) { 94 assert(item_num[code] > 0); 95 code_book[code] = cb_new[code] / item_num[code]; 96 } 97 98 delete [] item_num; 99 delete [] cb_new; 100 101 return ret; 102 } 103 104 void iterate_codes(double freqs[], size_t num, double code_book[], 105 CODEBOOK_TYPE *code_idx) { 106 size_t iter_num = 0; 107 double delta_last = 0; 108 do { 109 size_t changed = update_code_idx(freqs, num, code_book, code_idx); 110 111 double delta = recalculate_kernel(freqs, num, code_book, code_idx); 112 113 if (kPrintDebug0) { 114 printf("---Unigram codebook iteration: %d : %d, %.9f\n", 115 iter_num, changed, delta); 116 } 117 iter_num++; 118 119 if (iter_num > 1 && 120 (delta == 0 || fabs(delta_last - delta)/fabs(delta) < 0.000000001)) 121 break; 122 delta_last = delta; 123 } while (true); 124 } 125 126 127 NGram* NGram::instance_ = NULL; 128 129 NGram::NGram() { 130 initialized_ = false; 131 idx_num_ = 0; 132 lma_freq_idx_ = NULL; 133 sys_score_compensation_ = 0; 134 135 #ifdef ___BUILD_MODEL___ 136 freq_codes_df_ = NULL; 137 #endif 138 freq_codes_ = NULL; 139 } 140 141 NGram::~NGram() { 142 if (NULL != lma_freq_idx_) 143 free(lma_freq_idx_); 144 145 #ifdef ___BUILD_MODEL___ 146 if (NULL != freq_codes_df_) 147 free(freq_codes_df_); 148 #endif 149 150 if (NULL != freq_codes_) 151 free(freq_codes_); 152 } 153 154 NGram& NGram::get_instance() { 155 if (NULL == instance_) 156 instance_ = new NGram(); 157 return *instance_; 158 } 159 160 bool NGram::save_ngram(FILE *fp) { 161 if (!initialized_ || NULL == fp) 162 return false; 163 164 if (0 == idx_num_ || NULL == freq_codes_ || NULL == lma_freq_idx_) 165 return false; 166 167 if (fwrite(&idx_num_, sizeof(size_t), 1, fp) != 1) 168 return false; 169 170 if (fwrite(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) != 171 kCodeBookSize) 172 return false; 173 174 if (fwrite(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_) 175 return false; 176 177 return true; 178 } 179 180 bool NGram::load_ngram(FILE *fp) { 181 if (NULL == fp) 182 return false; 183 184 initialized_ = false; 185 186 if (fread(&idx_num_, sizeof(size_t), 1, fp) != 1 ) 187 return false; 188 189 if (NULL != lma_freq_idx_) 190 free(lma_freq_idx_); 191 192 if (NULL != freq_codes_) 193 free(freq_codes_); 194 195 lma_freq_idx_ = static_cast<CODEBOOK_TYPE*> 196 (malloc(idx_num_ * sizeof(CODEBOOK_TYPE))); 197 freq_codes_ = static_cast<LmaScoreType*> 198 (malloc(kCodeBookSize * sizeof(LmaScoreType))); 199 200 if (NULL == lma_freq_idx_ || NULL == freq_codes_) 201 return false; 202 203 if (fread(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) != 204 kCodeBookSize) 205 return false; 206 207 if (fread(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_) 208 return false; 209 210 initialized_ = true; 211 212 total_freq_none_sys_ = 0; 213 return true; 214 } 215 216 void NGram::set_total_freq_none_sys(size_t freq_none_sys) { 217 total_freq_none_sys_ = freq_none_sys; 218 if (0 == total_freq_none_sys_) { 219 sys_score_compensation_ = 0; 220 } else { 221 double factor = static_cast<double>(kSysDictTotalFreq) / ( 222 kSysDictTotalFreq + total_freq_none_sys_); 223 sys_score_compensation_ = static_cast<float>( 224 log(factor) * kLogValueAmplifier); 225 } 226 } 227 228 // The caller makes sure this oject is initialized. 229 float NGram::get_uni_psb(LemmaIdType lma_id) { 230 return static_cast<float>(freq_codes_[lma_freq_idx_[lma_id]]) + 231 sys_score_compensation_; 232 } 233 234 float NGram::convert_psb_to_score(double psb) { 235 float score = static_cast<float>( 236 log(psb) * static_cast<double>(kLogValueAmplifier)); 237 if (score > static_cast<float>(kMaxScore)) { 238 score = static_cast<float>(kMaxScore); 239 } 240 return score; 241 } 242 243 #ifdef ___BUILD_MODEL___ 244 bool NGram::build_unigram(LemmaEntry *lemma_arr, size_t lemma_num, 245 LemmaIdType next_idx_unused) { 246 if (NULL == lemma_arr || 0 == lemma_num || next_idx_unused <= 1) 247 return false; 248 249 double total_freq = 0; 250 double *freqs = new double[next_idx_unused]; 251 if (NULL == freqs) 252 return false; 253 254 freqs[0] = ADD_COUNT; 255 total_freq += freqs[0]; 256 LemmaIdType idx_now = 0; 257 for (size_t pos = 0; pos < lemma_num; pos++) { 258 if (lemma_arr[pos].idx_by_hz == idx_now) 259 continue; 260 idx_now++; 261 262 assert(lemma_arr[pos].idx_by_hz == idx_now); 263 264 freqs[idx_now] = lemma_arr[pos].freq; 265 if (freqs[idx_now] <= 0) 266 freqs[idx_now] = 0.3; 267 268 total_freq += freqs[idx_now]; 269 } 270 271 double max_freq = 0; 272 idx_num_ = idx_now + 1; 273 assert(idx_now + 1 == next_idx_unused); 274 275 for (size_t pos = 0; pos < idx_num_; pos++) { 276 freqs[pos] = freqs[pos] / total_freq; 277 assert(freqs[pos] > 0); 278 if (freqs[pos] > max_freq) 279 max_freq = freqs[pos]; 280 } 281 282 // calculate the code book 283 if (NULL == freq_codes_df_) 284 freq_codes_df_ = new double[kCodeBookSize]; 285 assert(freq_codes_df_); 286 memset(freq_codes_df_, 0, sizeof(double) * kCodeBookSize); 287 288 if (NULL == freq_codes_) 289 freq_codes_ = new LmaScoreType[kCodeBookSize]; 290 assert(freq_codes_); 291 memset(freq_codes_, 0, sizeof(LmaScoreType) * kCodeBookSize); 292 293 size_t freq_pos = 0; 294 for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) { 295 bool found = true; 296 297 while (found) { 298 found = false; 299 double cand = freqs[freq_pos]; 300 for (size_t i = 0; i < code_pos; i++) 301 if (freq_codes_df_[i] == cand) { 302 found = true; 303 break; 304 } 305 if (found) 306 freq_pos++; 307 } 308 309 freq_codes_df_[code_pos] = freqs[freq_pos]; 310 freq_pos++; 311 } 312 313 myqsort(freq_codes_df_, kCodeBookSize, sizeof(double), comp_double); 314 315 if (NULL == lma_freq_idx_) 316 lma_freq_idx_ = new CODEBOOK_TYPE[idx_num_]; 317 assert(lma_freq_idx_); 318 319 iterate_codes(freqs, idx_num_, freq_codes_df_, lma_freq_idx_); 320 321 delete [] freqs; 322 323 if (kPrintDebug0) { 324 printf("\n------Language Model Unigram Codebook------\n"); 325 } 326 327 for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) { 328 double log_score = log(freq_codes_df_[code_pos]); 329 float final_score = convert_psb_to_score(freq_codes_df_[code_pos]); 330 if (kPrintDebug0) { 331 printf("code:%d, probability:%.9f, log score:%.3f, final score: %.3f\n", 332 code_pos, freq_codes_df_[code_pos], log_score, final_score); 333 } 334 freq_codes_[code_pos] = static_cast<LmaScoreType>(final_score); 335 } 336 337 initialized_ = true; 338 return true; 339 } 340 #endif 341 342 } // namespace ime_pinyin 343