Home | History | Annotate | Download | only in share
      1 /*
      2  * Copyright (C) 2009 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include <assert.h>
     18 #include <stdlib.h>
     19 #include <stdio.h>
     20 #include <string.h>
     21 #include <math.h>
     22 #include "../include/spellingtable.h"
     23 
     24 namespace ime_pinyin {
     25 
     26 #ifdef ___BUILD_MODEL___
     27 
     28 const char SpellingTable::
     29     kNotSupportList[kNotSupportNum][kMaxSpellingSize + 1] = {"HM", "HNG", "NG"};
     30 
     31 // "" is the biggest, so that all empty strings will be moved to the end
     32 // _eb mean empty is biggest
     33 int compare_raw_spl_eb(const void* p1, const void* p2) {
     34   if ('\0' == (static_cast<const RawSpelling*>(p1))->str[0])
     35     return 1;
     36 
     37   if ('\0' == (static_cast<const RawSpelling*>(p2))->str[0])
     38     return -1;
     39 
     40   return strcmp((static_cast<const RawSpelling*>(p1))->str,
     41                 (static_cast<const RawSpelling*>(p2))->str);
     42 }
     43 
     44 size_t get_odd_next(size_t value) {
     45   size_t v_next = value;
     46   while (true) {
     47     size_t v_next_sqrt = (size_t)sqrt(v_next);
     48 
     49     bool is_odd = true;
     50     for (size_t v_dv = 2; v_dv < v_next_sqrt + 1; v_dv++) {
     51       if (v_next % v_dv == 0) {
     52         is_odd = false;
     53         break;
     54       }
     55     }
     56 
     57     if (is_odd)
     58       return v_next;
     59 
     60     v_next++;
     61   }
     62 
     63   // never reach here
     64   return 0;
     65 }
     66 
     67 SpellingTable::SpellingTable() {
     68   need_score_ = false;
     69   raw_spellings_ = NULL;
     70   spelling_buf_ = NULL;
     71   spelling_num_ = 0;
     72   total_freq_ = 0;
     73   frozen_ = true;
     74 }
     75 
     76 SpellingTable::~SpellingTable() {
     77   free_resource();
     78 }
     79 
     80 size_t SpellingTable::get_hash_pos(const char* spelling_str) {
     81   size_t hash_pos = 0;
     82   for (size_t pos = 0; pos < spelling_size_; pos++) {
     83     if ('\0' == spelling_str[pos])
     84       break;
     85     hash_pos += (size_t)spelling_str[pos];
     86   }
     87 
     88   hash_pos = hash_pos % spelling_max_num_;
     89   return hash_pos;
     90 }
     91 
     92 size_t SpellingTable::hash_pos_next(size_t hash_pos) {
     93   hash_pos += 123;
     94   hash_pos = hash_pos % spelling_max_num_;
     95   return hash_pos;
     96 }
     97 
     98 void SpellingTable::free_resource() {
     99   if (NULL != raw_spellings_)
    100     delete [] raw_spellings_;
    101   raw_spellings_ = NULL;
    102 
    103   if (NULL != spelling_buf_)
    104     delete [] spelling_buf_;
    105   spelling_buf_ = NULL;
    106 }
    107 
    108 bool SpellingTable::init_table(size_t pure_spl_size, size_t spl_max_num,
    109                                bool need_score) {
    110   if (pure_spl_size == 0 || spl_max_num ==0)
    111     return false;
    112 
    113   need_score_ = need_score;
    114 
    115   free_resource();
    116 
    117   spelling_size_ = pure_spl_size + 1;
    118   if (need_score)
    119     spelling_size_ += 1;
    120   spelling_max_num_ = get_odd_next(spl_max_num);
    121   spelling_num_ = 0;
    122 
    123   raw_spellings_ = new RawSpelling[spelling_max_num_];
    124   spelling_buf_ = new char[spelling_max_num_ * (spelling_size_)];
    125   if (NULL == raw_spellings_ || NULL == spelling_buf_) {
    126     free_resource();
    127     return false;
    128   }
    129 
    130   memset(raw_spellings_, 0, spelling_max_num_ * sizeof(RawSpelling));
    131   memset(spelling_buf_, 0, spelling_max_num_ * (spelling_size_));
    132   frozen_ = false;
    133   total_freq_ = 0;
    134   return true;
    135 }
    136 
    137 bool SpellingTable::put_spelling(const char* spelling_str, double freq) {
    138   if (frozen_ || NULL == spelling_str)
    139     return false;
    140 
    141   for (size_t pos = 0; pos < kNotSupportNum; pos++) {
    142     if (strcmp(spelling_str, kNotSupportList[pos]) == 0) {
    143       return false;
    144     }
    145   }
    146 
    147   total_freq_ += freq;
    148 
    149   size_t hash_pos = get_hash_pos(spelling_str);
    150 
    151   raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0';
    152 
    153   if (strncmp(raw_spellings_[hash_pos].str, spelling_str,
    154               spelling_size_ - 1) == 0) {
    155     raw_spellings_[hash_pos].freq += freq;
    156     return true;
    157   }
    158 
    159   size_t hash_pos_ori = hash_pos;
    160 
    161   while (true) {
    162     if (strncmp(raw_spellings_[hash_pos].str,
    163                 spelling_str, spelling_size_ - 1) == 0) {
    164       raw_spellings_[hash_pos].freq += freq;
    165       return true;
    166     }
    167 
    168     if ('\0' == raw_spellings_[hash_pos].str[0]) {
    169       raw_spellings_[hash_pos].freq += freq;
    170       strncpy(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1);
    171       raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0';
    172       spelling_num_++;
    173       return true;
    174     }
    175 
    176     hash_pos = hash_pos_next(hash_pos);
    177     if (hash_pos_ori == hash_pos)
    178       return false;
    179   }
    180 
    181   // never reach here
    182   return false;
    183 }
    184 
    185 bool SpellingTable::contain(const char* spelling_str) {
    186   if (NULL == spelling_str || NULL == spelling_buf_ || frozen_)
    187     return false;
    188 
    189   size_t hash_pos = get_hash_pos(spelling_str);
    190 
    191   if ('\0' == raw_spellings_[hash_pos].str[0])
    192     return false;
    193 
    194   if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1)
    195       == 0)
    196     return true;
    197 
    198   size_t hash_pos_ori = hash_pos;
    199 
    200   while (true) {
    201     hash_pos = hash_pos_next(hash_pos);
    202     if (hash_pos_ori == hash_pos)
    203       return false;
    204 
    205     if ('\0' == raw_spellings_[hash_pos].str[0])
    206       return false;
    207 
    208     if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1)
    209         == 0)
    210       return true;
    211   }
    212 
    213   // never reach here
    214   return false;
    215 }
    216 
    217 const char* SpellingTable::arrange(size_t *item_size, size_t *spl_num) {
    218   if (NULL == raw_spellings_ || NULL == spelling_buf_ ||
    219       NULL == item_size || NULL == spl_num)
    220     return NULL;
    221 
    222   qsort(raw_spellings_, spelling_max_num_, sizeof(RawSpelling),
    223         compare_raw_spl_eb);
    224 
    225   // After sorting, only the first spelling_num_ items are valid.
    226   // Copy them to the destination buffer.
    227   for (size_t pos = 0; pos < spelling_num_; pos++) {
    228     strncpy(spelling_buf_ + pos * spelling_size_, raw_spellings_[pos].str,
    229             spelling_size_);
    230   }
    231 
    232   if (need_score_) {
    233     if (kPrintDebug0)
    234       printf("------------Spelling Possiblities--------------\n");
    235 
    236     double max_score = 0;
    237     double min_score = 0;
    238 
    239     // After sorting, only the first spelling_num_ items are valid.
    240     for (size_t pos = 0; pos < spelling_num_; pos++) {
    241       raw_spellings_[pos].freq /= total_freq_;
    242       if (need_score_) {
    243         if (0 == pos) {
    244           max_score = raw_spellings_[0].freq;
    245           min_score = max_score;
    246         } else {
    247           if (raw_spellings_[pos].freq > max_score)
    248             max_score = raw_spellings_[pos].freq;
    249           if (raw_spellings_[pos].freq < min_score)
    250             min_score = raw_spellings_[pos].freq;
    251         }
    252       }
    253     }
    254 
    255     if (kPrintDebug0)
    256       printf("-----max psb: %f, min psb: %f\n", max_score, min_score);
    257 
    258     max_score = log(max_score);
    259     min_score = log(min_score);
    260 
    261     if (kPrintDebug0)
    262       printf("-----max log value: %f, min log value: %f\n",
    263              max_score, min_score);
    264 
    265     // The absolute value of min_score is bigger than that of max_score because
    266     // both of them are negative after log function.
    267     score_amplifier_ = 1.0 * 255 / min_score;
    268 
    269     double average_score = 0;
    270     for (size_t pos = 0; pos < spelling_num_; pos++) {
    271       double score = log(raw_spellings_[pos].freq) * score_amplifier_;
    272       assert(score >= 0);
    273 
    274       average_score += score;
    275 
    276       // Because of calculation precision issue, score might be a little bigger
    277       // than 255 after being amplified.
    278       if (score > 255)
    279         score = 255;
    280       char *this_spl_buf = spelling_buf_ + pos * spelling_size_;
    281       this_spl_buf[spelling_size_ - 1] =
    282           static_cast<char>((unsigned char)score);
    283 
    284       if (kPrintDebug0) {
    285         printf("---pos:%d, %s, psb:%d\n", pos, this_spl_buf,
    286                (unsigned char)this_spl_buf[spelling_size_ -1]);
    287       }
    288     }
    289     average_score /= spelling_num_;
    290     assert(average_score <= 255);
    291     average_score_ = static_cast<uint8>(average_score);
    292 
    293     if (kPrintDebug0)
    294       printf("\n----Score Amplifier: %f, Average Score: %d\n", score_amplifier_,
    295              average_score_);
    296   }
    297 
    298   *item_size = spelling_size_;
    299   *spl_num = spelling_num_;
    300   frozen_ = true;
    301   return spelling_buf_;
    302 }
    303 
    304 float SpellingTable::get_score_amplifier() {
    305   return static_cast<float>(score_amplifier_);
    306 }
    307 
    308 unsigned char SpellingTable::get_average_score() {
    309   return average_score_;
    310 }
    311 
    312 #endif  // ___BUILD_MODEL___
    313 }  // namespace ime_pinyin
    314