Home | History | Annotate | Download | only in share
      1 /*
      2  * Copyright (C) 2009 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include <assert.h>
     18 #include <math.h>
     19 #include <stdio.h>
     20 #include <string.h>
     21 #include <time.h>
     22 #include "../include/mystdlib.h"
     23 #include "../include/ngram.h"
     24 
     25 namespace ime_pinyin {
     26 
     27 #define ADD_COUNT 0.3
     28 
     29 int comp_double(const void *p1, const void *p2) {
     30   if (*static_cast<const double*>(p1) < *static_cast<const double*>(p2))
     31     return -1;
     32   if (*static_cast<const double*>(p1) > *static_cast<const double*>(p2))
     33     return 1;
     34   return 0;
     35 }
     36 
     37 inline double distance(double freq, double code) {
     38   // return fabs(freq - code);
     39   return freq * fabs(log(freq) - log(code));
     40 }
     41 
     42 // Find the index of the code value which is nearest to the given freq
     43 int qsearch_nearest(double code_book[], double freq, int start, int end) {
     44   if (start == end)
     45     return start;
     46 
     47   if (start + 1 == end) {
     48     if (distance(freq, code_book[end]) > distance(freq, code_book[start]))
     49       return start;
     50     return end;
     51   }
     52 
     53   int mid = (start + end) / 2;
     54 
     55   if (code_book[mid] > freq)
     56     return qsearch_nearest(code_book, freq, start, mid);
     57   else
     58     return qsearch_nearest(code_book, freq, mid, end);
     59 }
     60 
     61 size_t update_code_idx(double freqs[], size_t num, double code_book[],
     62                        CODEBOOK_TYPE *code_idx) {
     63   size_t changed = 0;
     64   for (size_t pos = 0; pos < num; pos++) {
     65     CODEBOOK_TYPE idx;
     66     idx = qsearch_nearest(code_book, freqs[pos], 0, kCodeBookSize - 1);
     67     if (idx != code_idx[pos])
     68       changed++;
     69     code_idx[pos] = idx;
     70   }
     71   return changed;
     72 }
     73 
     74 double recalculate_kernel(double freqs[], size_t num, double code_book[],
     75                           CODEBOOK_TYPE *code_idx) {
     76   double ret = 0;
     77 
     78   size_t *item_num =  new size_t[kCodeBookSize];
     79   assert(item_num);
     80   memset(item_num, 0, sizeof(size_t) * kCodeBookSize);
     81 
     82   double *cb_new = new double[kCodeBookSize];
     83   assert(cb_new);
     84   memset(cb_new, 0, sizeof(double) * kCodeBookSize);
     85 
     86   for (size_t pos = 0; pos < num; pos++) {
     87     ret += distance(freqs[pos], code_book[code_idx[pos]]);
     88 
     89     cb_new[code_idx[pos]] += freqs[pos];
     90     item_num[code_idx[pos]] += 1;
     91   }
     92 
     93   for (size_t code = 0; code < kCodeBookSize; code++) {
     94     assert(item_num[code] > 0);
     95     code_book[code] = cb_new[code] / item_num[code];
     96   }
     97 
     98   delete [] item_num;
     99   delete [] cb_new;
    100 
    101   return ret;
    102 }
    103 
    104 void iterate_codes(double freqs[], size_t num, double code_book[],
    105                    CODEBOOK_TYPE *code_idx) {
    106   size_t iter_num = 0;
    107   double delta_last = 0;
    108   do {
    109     size_t changed = update_code_idx(freqs, num, code_book, code_idx);
    110 
    111     double delta = recalculate_kernel(freqs, num, code_book, code_idx);
    112 
    113     if (kPrintDebug0) {
    114       printf("---Unigram codebook iteration: %d : %d, %.9f\n",
    115              iter_num, changed, delta);
    116     }
    117     iter_num++;
    118 
    119     if (iter_num > 1 &&
    120         (delta == 0 || fabs(delta_last - delta)/fabs(delta) < 0.000000001))
    121       break;
    122     delta_last = delta;
    123   } while (true);
    124 }
    125 
    126 
    127 NGram* NGram::instance_ = NULL;
    128 
    129 NGram::NGram() {
    130   initialized_ = false;
    131   idx_num_ = 0;
    132   lma_freq_idx_ = NULL;
    133   sys_score_compensation_ = 0;
    134 
    135 #ifdef ___BUILD_MODEL___
    136   freq_codes_df_ = NULL;
    137 #endif
    138   freq_codes_ = NULL;
    139 }
    140 
    141 NGram::~NGram() {
    142   if (NULL != lma_freq_idx_)
    143     free(lma_freq_idx_);
    144 
    145 #ifdef ___BUILD_MODEL___
    146   if (NULL != freq_codes_df_)
    147     free(freq_codes_df_);
    148 #endif
    149 
    150   if (NULL != freq_codes_)
    151     free(freq_codes_);
    152 }
    153 
    154 NGram& NGram::get_instance() {
    155   if (NULL == instance_)
    156     instance_ = new NGram();
    157   return *instance_;
    158 }
    159 
    160 bool NGram::save_ngram(FILE *fp) {
    161   if (!initialized_ || NULL == fp)
    162     return false;
    163 
    164   if (0 == idx_num_ || NULL == freq_codes_ ||  NULL == lma_freq_idx_)
    165     return false;
    166 
    167   if (fwrite(&idx_num_, sizeof(size_t), 1, fp) != 1)
    168     return false;
    169 
    170   if (fwrite(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) !=
    171       kCodeBookSize)
    172     return false;
    173 
    174   if (fwrite(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_)
    175     return false;
    176 
    177   return true;
    178 }
    179 
    180 bool NGram::load_ngram(FILE *fp) {
    181   if (NULL == fp)
    182     return false;
    183 
    184   initialized_ = false;
    185 
    186   if (fread(&idx_num_, sizeof(size_t), 1, fp) != 1 )
    187     return false;
    188 
    189   if (NULL != lma_freq_idx_)
    190     free(lma_freq_idx_);
    191 
    192   if (NULL != freq_codes_)
    193     free(freq_codes_);
    194 
    195   lma_freq_idx_ = static_cast<CODEBOOK_TYPE*>
    196                   (malloc(idx_num_ * sizeof(CODEBOOK_TYPE)));
    197   freq_codes_ = static_cast<LmaScoreType*>
    198       (malloc(kCodeBookSize * sizeof(LmaScoreType)));
    199 
    200   if (NULL == lma_freq_idx_ || NULL == freq_codes_)
    201     return false;
    202 
    203   if (fread(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) !=
    204       kCodeBookSize)
    205     return false;
    206 
    207   if (fread(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_)
    208     return false;
    209 
    210   initialized_ = true;
    211 
    212   total_freq_none_sys_ = 0;
    213   return true;
    214 }
    215 
    216 void NGram::set_total_freq_none_sys(size_t freq_none_sys) {
    217   total_freq_none_sys_ = freq_none_sys;
    218   if (0 == total_freq_none_sys_) {
    219     sys_score_compensation_ = 0;
    220   } else {
    221     double factor = static_cast<double>(kSysDictTotalFreq) / (
    222         kSysDictTotalFreq + total_freq_none_sys_);
    223     sys_score_compensation_ = static_cast<float>(
    224         log(factor) * kLogValueAmplifier);
    225   }
    226 }
    227 
    228 // The caller makes sure this oject is initialized.
    229 float NGram::get_uni_psb(LemmaIdType lma_id) {
    230   return  static_cast<float>(freq_codes_[lma_freq_idx_[lma_id]]) +
    231       sys_score_compensation_;
    232 }
    233 
    234 float NGram::convert_psb_to_score(double psb) {
    235   float score = static_cast<float>(
    236       log(psb) * static_cast<double>(kLogValueAmplifier));
    237   if (score > static_cast<float>(kMaxScore)) {
    238     score = static_cast<float>(kMaxScore);
    239   }
    240   return score;
    241 }
    242 
    243 #ifdef ___BUILD_MODEL___
    244 bool NGram::build_unigram(LemmaEntry *lemma_arr, size_t lemma_num,
    245                           LemmaIdType next_idx_unused) {
    246   if (NULL == lemma_arr || 0 == lemma_num || next_idx_unused <= 1)
    247     return false;
    248 
    249   double total_freq = 0;
    250   double *freqs = new double[next_idx_unused];
    251   if (NULL == freqs)
    252     return false;
    253 
    254   freqs[0] = ADD_COUNT;
    255   total_freq += freqs[0];
    256   LemmaIdType idx_now = 0;
    257   for (size_t pos = 0; pos < lemma_num; pos++) {
    258     if (lemma_arr[pos].idx_by_hz == idx_now)
    259       continue;
    260     idx_now++;
    261 
    262     assert(lemma_arr[pos].idx_by_hz == idx_now);
    263 
    264     freqs[idx_now] = lemma_arr[pos].freq;
    265     if (freqs[idx_now] <= 0)
    266       freqs[idx_now] = 0.3;
    267 
    268     total_freq += freqs[idx_now];
    269   }
    270 
    271   double max_freq = 0;
    272   idx_num_ = idx_now + 1;
    273   assert(idx_now + 1 == next_idx_unused);
    274 
    275   for (size_t pos = 0; pos < idx_num_; pos++) {
    276     freqs[pos] = freqs[pos] / total_freq;
    277     assert(freqs[pos] > 0);
    278     if (freqs[pos] > max_freq)
    279       max_freq = freqs[pos];
    280   }
    281 
    282   // calculate the code book
    283   if (NULL == freq_codes_df_)
    284     freq_codes_df_ = new double[kCodeBookSize];
    285   assert(freq_codes_df_);
    286   memset(freq_codes_df_, 0, sizeof(double) * kCodeBookSize);
    287 
    288   if (NULL == freq_codes_)
    289     freq_codes_ = new LmaScoreType[kCodeBookSize];
    290   assert(freq_codes_);
    291   memset(freq_codes_, 0, sizeof(LmaScoreType) * kCodeBookSize);
    292 
    293   size_t freq_pos = 0;
    294   for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) {
    295     bool found = true;
    296 
    297     while (found) {
    298       found = false;
    299       double cand = freqs[freq_pos];
    300       for (size_t i = 0; i < code_pos; i++)
    301         if (freq_codes_df_[i] == cand) {
    302           found = true;
    303           break;
    304         }
    305       if (found)
    306         freq_pos++;
    307     }
    308 
    309     freq_codes_df_[code_pos] = freqs[freq_pos];
    310     freq_pos++;
    311   }
    312 
    313   myqsort(freq_codes_df_, kCodeBookSize, sizeof(double), comp_double);
    314 
    315   if (NULL == lma_freq_idx_)
    316     lma_freq_idx_ = new CODEBOOK_TYPE[idx_num_];
    317   assert(lma_freq_idx_);
    318 
    319   iterate_codes(freqs, idx_num_, freq_codes_df_, lma_freq_idx_);
    320 
    321   delete [] freqs;
    322 
    323   if (kPrintDebug0) {
    324     printf("\n------Language Model Unigram Codebook------\n");
    325   }
    326 
    327   for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) {
    328     double log_score = log(freq_codes_df_[code_pos]);
    329     float final_score = convert_psb_to_score(freq_codes_df_[code_pos]);
    330     if (kPrintDebug0) {
    331       printf("code:%d, probability:%.9f, log score:%.3f, final score: %.3f\n",
    332              code_pos, freq_codes_df_[code_pos], log_score, final_score);
    333     }
    334     freq_codes_[code_pos] = static_cast<LmaScoreType>(final_score);
    335   }
    336 
    337   initialized_ = true;
    338   return true;
    339 }
    340 #endif
    341 
    342 }  // namespace ime_pinyin
    343