1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <assert.h> 18 #include <stdlib.h> 19 #include <stdio.h> 20 #include <string.h> 21 #include <math.h> 22 #include "../include/spellingtable.h" 23 24 namespace ime_pinyin { 25 26 #ifdef ___BUILD_MODEL___ 27 28 const char SpellingTable:: 29 kNotSupportList[kNotSupportNum][kMaxSpellingSize + 1] = {"HM", "HNG", "NG"}; 30 31 // "" is the biggest, so that all empty strings will be moved to the end 32 // _eb mean empty is biggest 33 int compare_raw_spl_eb(const void* p1, const void* p2) { 34 if ('\0' == (static_cast<const RawSpelling*>(p1))->str[0]) 35 return 1; 36 37 if ('\0' == (static_cast<const RawSpelling*>(p2))->str[0]) 38 return -1; 39 40 return strcmp((static_cast<const RawSpelling*>(p1))->str, 41 (static_cast<const RawSpelling*>(p2))->str); 42 } 43 44 size_t get_odd_next(size_t value) { 45 size_t v_next = value; 46 while (true) { 47 size_t v_next_sqrt = (size_t)sqrt(v_next); 48 49 bool is_odd = true; 50 for (size_t v_dv = 2; v_dv < v_next_sqrt + 1; v_dv++) { 51 if (v_next % v_dv == 0) { 52 is_odd = false; 53 break; 54 } 55 } 56 57 if (is_odd) 58 return v_next; 59 60 v_next++; 61 } 62 63 // never reach here 64 return 0; 65 } 66 67 SpellingTable::SpellingTable() { 68 need_score_ = false; 69 raw_spellings_ = NULL; 70 spelling_buf_ = NULL; 71 spelling_num_ = 0; 72 total_freq_ = 0; 73 frozen_ = true; 74 } 75 76 SpellingTable::~SpellingTable() { 77 free_resource(); 78 } 79 80 size_t SpellingTable::get_hash_pos(const char* spelling_str) { 81 size_t hash_pos = 0; 82 for (size_t pos = 0; pos < spelling_size_; pos++) { 83 if ('\0' == spelling_str[pos]) 84 break; 85 hash_pos += (size_t)spelling_str[pos]; 86 } 87 88 hash_pos = hash_pos % spelling_max_num_; 89 return hash_pos; 90 } 91 92 size_t SpellingTable::hash_pos_next(size_t hash_pos) { 93 hash_pos += 123; 94 hash_pos = hash_pos % spelling_max_num_; 95 return hash_pos; 96 } 97 98 void SpellingTable::free_resource() { 99 if (NULL != raw_spellings_) 100 delete [] raw_spellings_; 101 raw_spellings_ = NULL; 102 103 if (NULL != spelling_buf_) 104 delete [] spelling_buf_; 105 spelling_buf_ = NULL; 106 } 107 108 bool SpellingTable::init_table(size_t pure_spl_size, size_t spl_max_num, 109 bool need_score) { 110 if (pure_spl_size == 0 || spl_max_num ==0) 111 return false; 112 113 need_score_ = need_score; 114 115 free_resource(); 116 117 spelling_size_ = pure_spl_size + 1; 118 if (need_score) 119 spelling_size_ += 1; 120 spelling_max_num_ = get_odd_next(spl_max_num); 121 spelling_num_ = 0; 122 123 raw_spellings_ = new RawSpelling[spelling_max_num_]; 124 spelling_buf_ = new char[spelling_max_num_ * (spelling_size_)]; 125 if (NULL == raw_spellings_ || NULL == spelling_buf_) { 126 free_resource(); 127 return false; 128 } 129 130 memset(raw_spellings_, 0, spelling_max_num_ * sizeof(RawSpelling)); 131 memset(spelling_buf_, 0, spelling_max_num_ * (spelling_size_)); 132 frozen_ = false; 133 total_freq_ = 0; 134 return true; 135 } 136 137 bool SpellingTable::put_spelling(const char* spelling_str, double freq) { 138 if (frozen_ || NULL == spelling_str) 139 return false; 140 141 for (size_t pos = 0; pos < kNotSupportNum; pos++) { 142 if (strcmp(spelling_str, kNotSupportList[pos]) == 0) { 143 return false; 144 } 145 } 146 147 total_freq_ += freq; 148 149 size_t hash_pos = get_hash_pos(spelling_str); 150 151 raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0'; 152 153 if (strncmp(raw_spellings_[hash_pos].str, spelling_str, 154 spelling_size_ - 1) == 0) { 155 raw_spellings_[hash_pos].freq += freq; 156 return true; 157 } 158 159 size_t hash_pos_ori = hash_pos; 160 161 while (true) { 162 if (strncmp(raw_spellings_[hash_pos].str, 163 spelling_str, spelling_size_ - 1) == 0) { 164 raw_spellings_[hash_pos].freq += freq; 165 return true; 166 } 167 168 if ('\0' == raw_spellings_[hash_pos].str[0]) { 169 raw_spellings_[hash_pos].freq += freq; 170 strncpy(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1); 171 raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0'; 172 spelling_num_++; 173 return true; 174 } 175 176 hash_pos = hash_pos_next(hash_pos); 177 if (hash_pos_ori == hash_pos) 178 return false; 179 } 180 181 // never reach here 182 return false; 183 } 184 185 bool SpellingTable::contain(const char* spelling_str) { 186 if (NULL == spelling_str || NULL == spelling_buf_ || frozen_) 187 return false; 188 189 size_t hash_pos = get_hash_pos(spelling_str); 190 191 if ('\0' == raw_spellings_[hash_pos].str[0]) 192 return false; 193 194 if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1) 195 == 0) 196 return true; 197 198 size_t hash_pos_ori = hash_pos; 199 200 while (true) { 201 hash_pos = hash_pos_next(hash_pos); 202 if (hash_pos_ori == hash_pos) 203 return false; 204 205 if ('\0' == raw_spellings_[hash_pos].str[0]) 206 return false; 207 208 if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1) 209 == 0) 210 return true; 211 } 212 213 // never reach here 214 return false; 215 } 216 217 const char* SpellingTable::arrange(size_t *item_size, size_t *spl_num) { 218 if (NULL == raw_spellings_ || NULL == spelling_buf_ || 219 NULL == item_size || NULL == spl_num) 220 return NULL; 221 222 qsort(raw_spellings_, spelling_max_num_, sizeof(RawSpelling), 223 compare_raw_spl_eb); 224 225 // After sorting, only the first spelling_num_ items are valid. 226 // Copy them to the destination buffer. 227 for (size_t pos = 0; pos < spelling_num_; pos++) { 228 strncpy(spelling_buf_ + pos * spelling_size_, raw_spellings_[pos].str, 229 spelling_size_); 230 } 231 232 if (need_score_) { 233 if (kPrintDebug0) 234 printf("------------Spelling Possiblities--------------\n"); 235 236 double max_score = 0; 237 double min_score = 0; 238 239 // After sorting, only the first spelling_num_ items are valid. 240 for (size_t pos = 0; pos < spelling_num_; pos++) { 241 raw_spellings_[pos].freq /= total_freq_; 242 if (need_score_) { 243 if (0 == pos) { 244 max_score = raw_spellings_[0].freq; 245 min_score = max_score; 246 } else { 247 if (raw_spellings_[pos].freq > max_score) 248 max_score = raw_spellings_[pos].freq; 249 if (raw_spellings_[pos].freq < min_score) 250 min_score = raw_spellings_[pos].freq; 251 } 252 } 253 } 254 255 if (kPrintDebug0) 256 printf("-----max psb: %f, min psb: %f\n", max_score, min_score); 257 258 max_score = log(max_score); 259 min_score = log(min_score); 260 261 if (kPrintDebug0) 262 printf("-----max log value: %f, min log value: %f\n", 263 max_score, min_score); 264 265 // The absolute value of min_score is bigger than that of max_score because 266 // both of them are negative after log function. 267 score_amplifier_ = 1.0 * 255 / min_score; 268 269 double average_score = 0; 270 for (size_t pos = 0; pos < spelling_num_; pos++) { 271 double score = log(raw_spellings_[pos].freq) * score_amplifier_; 272 assert(score >= 0); 273 274 average_score += score; 275 276 // Because of calculation precision issue, score might be a little bigger 277 // than 255 after being amplified. 278 if (score > 255) 279 score = 255; 280 char *this_spl_buf = spelling_buf_ + pos * spelling_size_; 281 this_spl_buf[spelling_size_ - 1] = 282 static_cast<char>((unsigned char)score); 283 284 if (kPrintDebug0) { 285 printf("---pos:%d, %s, psb:%d\n", pos, this_spl_buf, 286 (unsigned char)this_spl_buf[spelling_size_ -1]); 287 } 288 } 289 average_score /= spelling_num_; 290 assert(average_score <= 255); 291 average_score_ = static_cast<uint8>(average_score); 292 293 if (kPrintDebug0) 294 printf("\n----Score Amplifier: %f, Average Score: %d\n", score_amplifier_, 295 average_score_); 296 } 297 298 *item_size = spelling_size_; 299 *spl_num = spelling_num_; 300 frozen_ = true; 301 return spelling_buf_; 302 } 303 304 float SpellingTable::get_score_amplifier() { 305 return static_cast<float>(score_amplifier_); 306 } 307 308 unsigned char SpellingTable::get_average_score() { 309 return average_score_; 310 } 311 312 #endif // ___BUILD_MODEL___ 313 } // namespace ime_pinyin 314