1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "../include/userdict.h" 18 #include "../include/splparser.h" 19 #include "../include/ngram.h" 20 #include <stdio.h> 21 #include <string.h> 22 #include <stdlib.h> 23 #include <cutils/log.h> 24 #include <unistd.h> 25 #include <fcntl.h> 26 #include <sys/stat.h> 27 #include <assert.h> 28 #include <ctype.h> 29 #include <sys/types.h> 30 #include <sys/time.h> 31 #include <time.h> 32 #include <pthread.h> 33 #include <math.h> 34 35 namespace ime_pinyin { 36 37 #ifdef ___DEBUG_PERF___ 38 static uint64 _ellapse_ = 0; 39 static struct timeval _tv_start_, _tv_end_; 40 #define DEBUG_PERF_BEGIN \ 41 do { \ 42 gettimeofday(&_tv_start_, NULL); \ 43 } while(0) 44 #define DEBUG_PERF_END \ 45 do { \ 46 gettimeofday(&_tv_end_, NULL); \ 47 _ellapse_ = (_tv_end_.tv_sec - _tv_start_.tv_sec) * 1000000 + \ 48 (_tv_end_.tv_usec - _tv_start_.tv_usec); \ 49 } while(0) 50 #define LOGD_PERF(message) \ 51 LOGD("PERFORMANCE[%s] %llu usec.", message, _ellapse_); 52 #else 53 #define DEBUG_PERF_BEGIN 54 #define DEBUG_PERF_END 55 #define LOGD_PERF(message) 56 #endif 57 58 // XXX File load and write are thread-safe by g_mutex_ 59 static pthread_mutex_t g_mutex_ = PTHREAD_MUTEX_INITIALIZER; 60 static struct timeval g_last_update_ = {0, 0}; 61 62 inline uint32 UserDict::get_dict_file_size(UserDictInfo * info) { 63 return (4 + info->lemma_size + (info->lemma_count << 3) 64 #ifdef ___PREDICT_ENABLED___ 65 + (info->lemma_count << 2) 66 #endif 67 #ifdef ___SYNC_ENABLED___ 68 + (info->sync_count << 2) 69 #endif 70 + sizeof(*info)); 71 } 72 73 inline LmaScoreType UserDict::translate_score(int raw_score) { 74 // 1) ori_freq: original user frequency 75 uint32 ori_freq = extract_score_freq(raw_score); 76 // 2) lmt_off: lmt index (week offset for example) 77 uint64 lmt_off = ((raw_score & 0xffff0000) >> 16); 78 if (kUserDictLMTBitWidth < 16) { 79 uint64 mask = ~(1 << kUserDictLMTBitWidth); 80 lmt_off &= mask; 81 } 82 // 3) now_off: current time index (current week offset for example) 83 // assuming load_time_ is around current time 84 uint64 now_off = load_time_.tv_sec; 85 now_off = (now_off - kUserDictLMTSince) / kUserDictLMTGranularity; 86 now_off = (now_off << (64 - kUserDictLMTBitWidth)); 87 now_off = (now_off >> (64 - kUserDictLMTBitWidth)); 88 // 4) factor: decide expand-factor 89 int delta = now_off - lmt_off; 90 if (delta > 4) 91 delta = 4; 92 int factor = 80 - (delta << 4); 93 94 double tf = (double)(dict_info_.total_nfreq + total_other_nfreq_); 95 return (LmaScoreType)(log((double)factor * (double)ori_freq / tf) 96 * NGram::kLogValueAmplifier); 97 } 98 99 inline int UserDict::extract_score_freq(int raw_score) { 100 // Frequence stored in lowest 16 bits 101 int freq = (raw_score & 0x0000ffff); 102 return freq; 103 } 104 105 inline uint64 UserDict::extract_score_lmt(int raw_score) { 106 uint64 lmt = ((raw_score & 0xffff0000) >> 16); 107 if (kUserDictLMTBitWidth < 16) { 108 uint64 mask = ~(1 << kUserDictLMTBitWidth); 109 lmt &= mask; 110 } 111 lmt = lmt * kUserDictLMTGranularity + kUserDictLMTSince; 112 return lmt; 113 } 114 115 inline int UserDict::build_score(uint64 lmt, int freq) { 116 lmt = (lmt - kUserDictLMTSince) / kUserDictLMTGranularity; 117 lmt = (lmt << (64 - kUserDictLMTBitWidth)); 118 lmt = (lmt >> (64 - kUserDictLMTBitWidth)); 119 uint16 lmt16 = (uint16)lmt; 120 int s = freq; 121 s &= 0x0000ffff; 122 s = (lmt16 << 16) | s; 123 return s; 124 } 125 126 inline int64 UserDict::utf16le_atoll(uint16 *s, int len) { 127 int64 ret = 0; 128 if (len <= 0) 129 return ret; 130 131 int flag = 1; 132 const uint16 * endp = s + len; 133 if (*s == '-') { 134 flag = -1; 135 s++; 136 } else if (*s == '+') { 137 s++; 138 } 139 140 while (*s >= '0' && *s <= '9' && s < endp) { 141 ret += ret * 10 + (*s) - '0'; 142 s++; 143 } 144 return ret * flag; 145 } 146 147 inline int UserDict::utf16le_lltoa(int64 v, uint16 *s, int size) { 148 if (!s || size <= 0) 149 return 0; 150 uint16 *endp = s + size; 151 int ret_len = 0; 152 if (v < 0) { 153 *(s++) = '-'; 154 ++ret_len; 155 v *= -1; 156 } 157 158 uint16 *b = s; 159 while (s < endp && v != 0) { 160 *(s++) = '0' + (v % 10); 161 v = v / 10; 162 ++ret_len; 163 } 164 165 if (v != 0) 166 return 0; 167 168 --s; 169 170 while (b < s) { 171 *b = *s; 172 ++b, --s; 173 } 174 175 return ret_len; 176 } 177 178 inline void UserDict::set_lemma_flag(uint32 offset, uint8 flag) { 179 offset &= kUserDictOffsetMask; 180 lemmas_[offset] |= flag; 181 } 182 183 inline char UserDict::get_lemma_flag(uint32 offset) { 184 offset &= kUserDictOffsetMask; 185 return (char)(lemmas_[offset]); 186 } 187 188 inline char UserDict::get_lemma_nchar(uint32 offset) { 189 offset &= kUserDictOffsetMask; 190 return (char)(lemmas_[offset + 1]); 191 } 192 193 inline uint16 * UserDict::get_lemma_spell_ids(uint32 offset) { 194 offset &= kUserDictOffsetMask; 195 return (uint16 *)(lemmas_ + offset + 2); 196 } 197 198 inline uint16 * UserDict::get_lemma_word(uint32 offset) { 199 offset &= kUserDictOffsetMask; 200 uint8 nchar = get_lemma_nchar(offset); 201 return (uint16 *)(lemmas_ + offset + 2 + (nchar << 1)); 202 } 203 204 inline LemmaIdType UserDict::get_max_lemma_id() { 205 // When a lemma is deleted, we don't not claim its id back for 206 // simplicity and performance 207 return start_id_ + dict_info_.lemma_count - 1; 208 } 209 210 inline bool UserDict::is_valid_lemma_id(LemmaIdType id) { 211 if (id >= start_id_ && id <= get_max_lemma_id()) 212 return true; 213 return false; 214 } 215 216 inline bool UserDict::is_valid_state() { 217 if (state_ == USER_DICT_NONE) 218 return false; 219 return true; 220 } 221 222 UserDict::UserDict() 223 : start_id_(0), 224 version_(0), 225 lemmas_(NULL), 226 offsets_(NULL), 227 scores_(NULL), 228 ids_(NULL), 229 #ifdef ___PREDICT_ENABLED___ 230 predicts_(NULL), 231 #endif 232 #ifdef ___SYNC_ENABLED___ 233 syncs_(NULL), 234 sync_count_size_(0), 235 #endif 236 offsets_by_id_(NULL), 237 lemma_count_left_(0), 238 lemma_size_left_(0), 239 dict_file_(NULL), 240 state_(USER_DICT_NONE) { 241 memset(&dict_info_, 0, sizeof(dict_info_)); 242 memset(&load_time_, 0, sizeof(load_time_)); 243 #ifdef ___CACHE_ENABLED___ 244 cache_init(); 245 #endif 246 } 247 248 UserDict::~UserDict() { 249 close_dict(); 250 } 251 252 bool UserDict::load_dict(const char *file_name, LemmaIdType start_id, 253 LemmaIdType end_id) { 254 #ifdef ___DEBUG_PERF___ 255 DEBUG_PERF_BEGIN; 256 #endif 257 dict_file_ = strdup(file_name); 258 if (!dict_file_) 259 return false; 260 261 start_id_ = start_id; 262 263 if (false == validate(file_name) && false == reset(file_name)) { 264 goto error; 265 } 266 if (false == load(file_name, start_id)) { 267 goto error; 268 } 269 270 state_ = USER_DICT_SYNC; 271 272 gettimeofday(&load_time_, NULL); 273 274 #ifdef ___DEBUG_PERF___ 275 DEBUG_PERF_END; 276 LOGD_PERF("load_dict"); 277 #endif 278 return true; 279 error: 280 free((void*)dict_file_); 281 start_id_ = 0; 282 return false; 283 } 284 285 bool UserDict::close_dict() { 286 if (state_ == USER_DICT_NONE) 287 return true; 288 if (state_ == USER_DICT_SYNC) 289 goto out; 290 291 // If dictionary is written back by others, 292 // we can not simply write back here 293 // To do a safe flush, we have to discard all newly added 294 // lemmas and try to reload dict file. 295 pthread_mutex_lock(&g_mutex_); 296 if (load_time_.tv_sec > g_last_update_.tv_sec || 297 (load_time_.tv_sec == g_last_update_.tv_sec && 298 load_time_.tv_usec > g_last_update_.tv_usec)) { 299 write_back(); 300 gettimeofday(&g_last_update_, NULL); 301 } 302 pthread_mutex_unlock(&g_mutex_); 303 304 out: 305 free((void*)dict_file_); 306 free(lemmas_); 307 free(offsets_); 308 free(offsets_by_id_); 309 free(scores_); 310 free(ids_); 311 #ifdef ___PREDICT_ENABLED___ 312 free(predicts_); 313 #endif 314 315 version_ = 0; 316 dict_file_ = NULL; 317 lemmas_ = NULL; 318 #ifdef ___SYNC_ENABLED___ 319 syncs_ = NULL; 320 sync_count_size_ = 0; 321 #endif 322 offsets_ = NULL; 323 offsets_by_id_ = NULL; 324 scores_ = NULL; 325 ids_ = NULL; 326 #ifdef ___PREDICT_ENABLED___ 327 predicts_ = NULL; 328 #endif 329 330 memset(&dict_info_, 0, sizeof(dict_info_)); 331 lemma_count_left_ = 0; 332 lemma_size_left_ = 0; 333 state_ = USER_DICT_NONE; 334 335 return true; 336 } 337 338 size_t UserDict::number_of_lemmas() { 339 return dict_info_.lemma_count; 340 } 341 342 void UserDict::reset_milestones(uint16 from_step, MileStoneHandle from_handle) { 343 return; 344 } 345 346 MileStoneHandle UserDict::extend_dict(MileStoneHandle from_handle, 347 const DictExtPara *dep, 348 LmaPsbItem *lpi_items, 349 size_t lpi_max, size_t *lpi_num) { 350 if (is_valid_state() == false) 351 return 0; 352 353 bool need_extend = false; 354 355 #ifdef ___DEBUG_PERF___ 356 DEBUG_PERF_BEGIN; 357 #endif 358 *lpi_num = _get_lpis(dep->splids, dep->splids_extended + 1, 359 lpi_items, lpi_max, &need_extend); 360 #ifdef ___DEBUG_PERF___ 361 DEBUG_PERF_END; 362 LOGD_PERF("extend_dict"); 363 #endif 364 return ((*lpi_num > 0 || need_extend) ? 1 : 0); 365 } 366 367 int UserDict::is_fuzzy_prefix_spell_id( 368 const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) { 369 if (len1 < searchable->splids_len) 370 return 0; 371 372 SpellingTrie &spl_trie = SpellingTrie::get_instance(); 373 uint32 i = 0; 374 for (i = 0; i < searchable->splids_len; i++) { 375 const char py1 = *spl_trie.get_spelling_str(id1[i]); 376 uint16 off = 8 * (i % 4); 377 const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off); 378 if (py1 == py2) 379 continue; 380 return 0; 381 } 382 return 1; 383 } 384 385 int UserDict::fuzzy_compare_spell_id( 386 const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) { 387 if (len1 < searchable->splids_len) 388 return -1; 389 if (len1 > searchable->splids_len) 390 return 1; 391 392 SpellingTrie &spl_trie = SpellingTrie::get_instance(); 393 uint32 i = 0; 394 for (i = 0; i < len1; i++) { 395 const char py1 = *spl_trie.get_spelling_str(id1[i]); 396 uint16 off = 8 * (i % 4); 397 const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off); 398 if (py1 == py2) 399 continue; 400 if (py1 > py2) 401 return 1; 402 return -1; 403 } 404 return 0; 405 } 406 407 bool UserDict::is_prefix_spell_id( 408 const uint16 * fullids, uint16 fulllen, 409 const UserDictSearchable *searchable) { 410 if (fulllen < searchable->splids_len) 411 return false; 412 413 uint32 i = 0; 414 for (; i < searchable->splids_len; i++) { 415 uint16 start_id = searchable->splid_start[i]; 416 uint16 count = searchable->splid_count[i]; 417 if (fullids[i] >= start_id && fullids[i] < start_id + count) 418 continue; 419 else 420 return false; 421 } 422 return true; 423 } 424 425 bool UserDict::equal_spell_id( 426 const uint16 * fullids, uint16 fulllen, 427 const UserDictSearchable *searchable) { 428 if (fulllen != searchable->splids_len) 429 return false; 430 431 uint32 i = 0; 432 for (; i < fulllen; i++) { 433 uint16 start_id = searchable->splid_start[i]; 434 uint16 count = searchable->splid_count[i]; 435 if (fullids[i] >= start_id && fullids[i] < start_id + count) 436 continue; 437 else 438 return false; 439 } 440 return true; 441 } 442 443 int32 UserDict::locate_first_in_offsets(const UserDictSearchable * searchable) { 444 int32 begin = 0; 445 int32 end = dict_info_.lemma_count - 1; 446 int32 middle = -1; 447 448 int32 first_prefix = middle; 449 int32 last_matched = middle; 450 451 while (begin <= end) { 452 middle = (begin + end) >> 1; 453 uint32 offset = offsets_[middle]; 454 uint8 nchar = get_lemma_nchar(offset); 455 const uint16 * splids = get_lemma_spell_ids(offset); 456 int cmp = fuzzy_compare_spell_id(splids, nchar, searchable); 457 int pre = is_fuzzy_prefix_spell_id(splids, nchar, searchable); 458 459 if (pre) 460 first_prefix = middle; 461 462 if (cmp < 0) { 463 begin = middle + 1; 464 } else if (cmp > 0) { 465 end = middle - 1; 466 } else { 467 end = middle - 1; 468 last_matched = middle; 469 } 470 } 471 472 return first_prefix; 473 } 474 475 void UserDict::prepare_locate(UserDictSearchable *searchable, 476 const uint16 *splid_str, 477 uint16 splid_str_len) { 478 searchable->splids_len = splid_str_len; 479 memset(searchable->signature, 0, sizeof(searchable->signature)); 480 481 SpellingTrie &spl_trie = SpellingTrie::get_instance(); 482 uint32 i = 0; 483 for (; i < splid_str_len; i++) { 484 if (spl_trie.is_half_id(splid_str[i])) { 485 searchable->splid_count[i] = 486 spl_trie.half_to_full(splid_str[i], 487 &(searchable->splid_start[i])); 488 } else { 489 searchable->splid_count[i] = 1; 490 searchable->splid_start[i] = splid_str[i]; 491 } 492 const unsigned char py = *spl_trie.get_spelling_str(splid_str[i]); 493 searchable->signature[i>>2] |= (py << (8 * (i % 4))); 494 } 495 } 496 497 size_t UserDict::get_lpis(const uint16 *splid_str, uint16 splid_str_len, 498 LmaPsbItem *lpi_items, size_t lpi_max) { 499 return _get_lpis(splid_str, splid_str_len, lpi_items, lpi_max, NULL); 500 } 501 502 size_t UserDict::_get_lpis(const uint16 *splid_str, 503 uint16 splid_str_len, LmaPsbItem *lpi_items, 504 size_t lpi_max, bool * need_extend) { 505 bool tmp_extend; 506 if (!need_extend) 507 need_extend = &tmp_extend; 508 509 *need_extend = false; 510 511 if (is_valid_state() == false) 512 return 0; 513 if (lpi_max <= 0) 514 return 0; 515 516 if (0 == pthread_mutex_trylock(&g_mutex_)) { 517 if (load_time_.tv_sec < g_last_update_.tv_sec || 518 (load_time_.tv_sec == g_last_update_.tv_sec && 519 load_time_.tv_usec < g_last_update_.tv_usec)) { 520 // Others updated disk file, have to reload 521 pthread_mutex_unlock(&g_mutex_); 522 flush_cache(); 523 } else { 524 pthread_mutex_unlock(&g_mutex_); 525 } 526 } else { 527 } 528 529 UserDictSearchable searchable; 530 prepare_locate(&searchable, splid_str, splid_str_len); 531 532 uint32 max_off = dict_info_.lemma_count; 533 #ifdef ___CACHE_ENABLED___ 534 int32 middle; 535 uint32 start, count; 536 bool cached = cache_hit(&searchable, &start, &count); 537 if (cached) { 538 middle = start; 539 max_off = start + count; 540 } else { 541 middle = locate_first_in_offsets(&searchable); 542 start = middle; 543 } 544 #else 545 int32 middle = locate_first_in_offsets(&searchable); 546 #endif 547 548 if (middle == -1) { 549 #ifdef ___CACHE_ENABLED___ 550 if (!cached) 551 cache_push(USER_DICT_MISS_CACHE, &searchable, 0, 0); 552 #endif 553 return 0; 554 } 555 556 size_t lpi_current = 0; 557 558 bool fuzzy_break = false; 559 bool prefix_break = false; 560 while ((size_t)middle < max_off && !fuzzy_break && !prefix_break) { 561 if (lpi_current >= lpi_max) 562 break; 563 uint32 offset = offsets_[middle]; 564 // Ignore deleted lemmas 565 if (offset & kUserDictOffsetFlagRemove) { 566 middle++; 567 continue; 568 } 569 uint8 nchar = get_lemma_nchar(offset); 570 uint16 * splids = get_lemma_spell_ids(offset); 571 #ifdef ___CACHE_ENABLED___ 572 if (!cached && 0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) { 573 #else 574 if (0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) { 575 #endif 576 fuzzy_break = true; 577 } 578 579 if (prefix_break == false) { 580 if (is_fuzzy_prefix_spell_id(splids, nchar, &searchable)) { 581 if (*need_extend == false && 582 is_prefix_spell_id(splids, nchar, &searchable)) { 583 *need_extend = true; 584 } 585 } else { 586 prefix_break = true; 587 } 588 } 589 590 if (equal_spell_id(splids, nchar, &searchable) == true) { 591 lpi_items[lpi_current].psb = translate_score(scores_[middle]); 592 lpi_items[lpi_current].id = ids_[middle]; 593 lpi_items[lpi_current].lma_len = nchar; 594 lpi_current++; 595 } 596 middle++; 597 } 598 599 #ifdef ___CACHE_ENABLED___ 600 if (!cached) { 601 count = middle - start; 602 cache_push(USER_DICT_CACHE, &searchable, start, count); 603 } 604 #endif 605 606 return lpi_current; 607 } 608 609 uint16 UserDict::get_lemma_str(LemmaIdType id_lemma, char16* str_buf, 610 uint16 str_max) { 611 if (is_valid_state() == false) 612 return 0; 613 if (is_valid_lemma_id(id_lemma) == false) 614 return 0; 615 uint32 offset = offsets_by_id_[id_lemma - start_id_]; 616 uint8 nchar = get_lemma_nchar(offset); 617 char16 * str = get_lemma_word(offset); 618 uint16 m = nchar < str_max -1 ? nchar : str_max - 1; 619 int i = 0; 620 for (; i < m; i++) { 621 str_buf[i] = str[i]; 622 } 623 str_buf[i] = 0; 624 return m; 625 } 626 627 uint16 UserDict::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, 628 uint16 splids_max, bool arg_valid) { 629 if (is_valid_lemma_id(id_lemma) == false) 630 return 0; 631 uint32 offset = offsets_by_id_[id_lemma - start_id_]; 632 uint8 nchar = get_lemma_nchar(offset); 633 const uint16 * ids = get_lemma_spell_ids(offset); 634 int i = 0; 635 for (; i < nchar && i < splids_max; i++) 636 splids[i] = ids[i]; 637 return i; 638 } 639 640 size_t UserDict::predict(const char16 last_hzs[], uint16 hzs_len, 641 NPredictItem *npre_items, size_t npre_max, 642 size_t b4_used) { 643 uint32 new_added = 0; 644 #ifdef ___PREDICT_ENABLED___ 645 int32 end = dict_info_.lemma_count - 1; 646 int j = locate_first_in_predicts((const uint16*)last_hzs, hzs_len); 647 if (j == -1) 648 return 0; 649 650 while (j <= end) { 651 uint32 offset = predicts_[j]; 652 // Ignore deleted lemmas 653 if (offset & kUserDictOffsetFlagRemove) { 654 j++; 655 continue; 656 } 657 uint32 nchar = get_lemma_nchar(offset); 658 uint16 * words = get_lemma_word(offset); 659 uint16 * splids = get_lemma_spell_ids(offset); 660 661 if (nchar <= hzs_len) { 662 j++; 663 continue; 664 } 665 666 if (memcmp(words, last_hzs, hzs_len << 1) == 0) { 667 if (new_added >= npre_max) { 668 return new_added; 669 } 670 uint32 cpy_len = 671 (nchar < kMaxPredictSize ? (nchar << 1) : (kMaxPredictSize << 1)) 672 - (hzs_len << 1); 673 npre_items[new_added].his_len = hzs_len; 674 npre_items[new_added].psb = get_lemma_score(words, splids, nchar); 675 memcpy(npre_items[new_added].pre_hzs, words + hzs_len, cpy_len); 676 if ((cpy_len >> 1) < kMaxPredictSize) { 677 npre_items[new_added].pre_hzs[cpy_len >> 1] = 0; 678 } 679 new_added++; 680 } else { 681 break; 682 } 683 684 j++; 685 } 686 #endif 687 return new_added; 688 } 689 690 int32 UserDict::locate_in_offsets(char16 lemma_str[], uint16 splid_str[], 691 uint16 lemma_len) { 692 int32 max_off = dict_info_.lemma_count; 693 694 UserDictSearchable searchable; 695 prepare_locate(&searchable, splid_str, lemma_len); 696 #ifdef ___CACHE_ENABLED___ 697 int32 off; 698 uint32 start, count; 699 bool cached = load_cache(&searchable, &start, &count); 700 if (cached) { 701 off = start; 702 max_off = start + count; 703 } else { 704 off = locate_first_in_offsets(&searchable); 705 start = off; 706 } 707 #else 708 int32 off = locate_first_in_offsets(&searchable); 709 #endif 710 711 if (off == -1) { 712 return off; 713 } 714 715 while (off < max_off) { 716 uint32 offset = offsets_[off]; 717 if (offset & kUserDictOffsetFlagRemove) { 718 off++; 719 continue; 720 } 721 uint16 * splids = get_lemma_spell_ids(offset); 722 #ifdef ___CACHE_ENABLED___ 723 if (!cached && 0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable)) 724 break; 725 #else 726 if (0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable)) 727 break; 728 #endif 729 if (equal_spell_id(splids, lemma_len, &searchable) == true) { 730 uint16 * str = get_lemma_word(offset); 731 uint32 i = 0; 732 for (i = 0; i < lemma_len; i++) { 733 if (str[i] == lemma_str[i]) 734 continue; 735 break; 736 } 737 if (i < lemma_len) { 738 off++; 739 continue; 740 } 741 #ifdef ___CACHE_ENABLED___ 742 // No need to save_cache here, since current function is invoked by 743 // put_lemma. It's rarely possible for a user input same lemma twice. 744 // That means first time user type a new lemma, it is newly added into 745 // user dictionary, then it's possible that user type the same lemma 746 // again. 747 // Another reason save_cache can not be invoked here is this function 748 // aborts when lemma is found, and it never knows the count. 749 #endif 750 return off; 751 } 752 off++; 753 } 754 755 return -1; 756 } 757 758 #ifdef ___PREDICT_ENABLED___ 759 uint32 UserDict::locate_where_to_insert_in_predicts( 760 const uint16 * words, int lemma_len) { 761 int32 begin = 0; 762 int32 end = dict_info_.lemma_count - 1; 763 int32 middle = end; 764 765 uint32 last_matched = middle; 766 767 while (begin <= end) { 768 middle = (begin + end) >> 1; 769 uint32 offset = offsets_[middle]; 770 uint8 nchar = get_lemma_nchar(offset); 771 const uint16 * ws = get_lemma_word(offset); 772 773 uint32 minl = nchar < lemma_len ? nchar : lemma_len; 774 uint32 k = 0; 775 int cmp = 0; 776 777 for (; k < minl; k++) { 778 if (ws[k] < words[k]) { 779 cmp = -1; 780 break; 781 } else if (ws[k] > words[k]) { 782 cmp = 1; 783 break; 784 } 785 } 786 if (cmp == 0) { 787 if (nchar < lemma_len) 788 cmp = -1; 789 else if (nchar > lemma_len) 790 cmp = 1; 791 } 792 793 if (cmp < 0) { 794 begin = middle + 1; 795 last_matched = middle; 796 } else if (cmp > 0) { 797 end = middle - 1; 798 } else { 799 end = middle - 1; 800 last_matched = middle; 801 } 802 } 803 804 return last_matched; 805 } 806 807 int32 UserDict::locate_first_in_predicts(const uint16 * words, int lemma_len) { 808 int32 begin = 0; 809 int32 end = dict_info_.lemma_count - 1; 810 int32 middle = -1; 811 812 int32 last_matched = middle; 813 814 while (begin <= end) { 815 middle = (begin + end) >> 1; 816 uint32 offset = offsets_[middle]; 817 uint8 nchar = get_lemma_nchar(offset); 818 const uint16 * ws = get_lemma_word(offset); 819 820 uint32 minl = nchar < lemma_len ? nchar : lemma_len; 821 uint32 k = 0; 822 int cmp = 0; 823 824 for (; k < minl; k++) { 825 if (ws[k] < words[k]) { 826 cmp = -1; 827 break; 828 } else if (ws[k] > words[k]) { 829 cmp = 1; 830 break; 831 } 832 } 833 if (cmp == 0) { 834 if (nchar >= lemma_len) 835 last_matched = middle; 836 if (nchar < lemma_len) 837 cmp = -1; 838 else if (nchar > lemma_len) 839 cmp = 1; 840 } 841 842 if (cmp < 0) { 843 begin = middle + 1; 844 } else if (cmp > 0) { 845 end = middle - 1; 846 } else { 847 end = middle - 1; 848 } 849 } 850 851 return last_matched; 852 } 853 854 #endif 855 856 LemmaIdType UserDict::get_lemma_id(char16 lemma_str[], uint16 splids[], 857 uint16 lemma_len) { 858 int32 off = locate_in_offsets(lemma_str, splids, lemma_len); 859 if (off == -1) { 860 return 0; 861 } 862 863 return ids_[off]; 864 } 865 866 LmaScoreType UserDict::get_lemma_score(LemmaIdType lemma_id) { 867 if (is_valid_state() == false) 868 return 0; 869 if (is_valid_lemma_id(lemma_id) == false) 870 return 0; 871 872 return translate_score(_get_lemma_score(lemma_id)); 873 } 874 875 LmaScoreType UserDict::get_lemma_score(char16 lemma_str[], uint16 splids[], 876 uint16 lemma_len) { 877 if (is_valid_state() == false) 878 return 0; 879 return translate_score(_get_lemma_score(lemma_str, splids, lemma_len)); 880 } 881 882 int UserDict::_get_lemma_score(LemmaIdType lemma_id) { 883 if (is_valid_state() == false) 884 return 0; 885 if (is_valid_lemma_id(lemma_id) == false) 886 return 0; 887 888 uint32 offset = offsets_by_id_[lemma_id - start_id_]; 889 890 uint32 nchar = get_lemma_nchar(offset); 891 uint16 * spl = get_lemma_spell_ids(offset); 892 uint16 * wrd = get_lemma_word(offset); 893 894 int32 off = locate_in_offsets(wrd, spl, nchar); 895 if (off == -1) { 896 return 0; 897 } 898 899 return scores_[off]; 900 } 901 902 int UserDict::_get_lemma_score(char16 lemma_str[], uint16 splids[], 903 uint16 lemma_len) { 904 if (is_valid_state() == false) 905 return 0; 906 907 int32 off = locate_in_offsets(lemma_str, splids, lemma_len); 908 if (off == -1) { 909 return 0; 910 } 911 912 return scores_[off]; 913 } 914 915 #ifdef ___SYNC_ENABLED___ 916 void UserDict::remove_lemma_from_sync_list(uint32 offset) { 917 offset &= kUserDictOffsetMask; 918 uint32 i = 0; 919 for (; i < dict_info_.sync_count; i++) { 920 unsigned int off = (syncs_[i] & kUserDictOffsetMask); 921 if (off == offset) 922 break; 923 } 924 if (i < dict_info_.sync_count) { 925 syncs_[i] = syncs_[dict_info_.sync_count - 1]; 926 dict_info_.sync_count--; 927 } 928 } 929 #endif 930 931 #ifdef ___PREDICT_ENABLED___ 932 void UserDict::remove_lemma_from_predict_list(uint32 offset) { 933 offset &= kUserDictOffsetMask; 934 uint32 i = 0; 935 for (; i < dict_info_.lemma_count; i++) { 936 unsigned int off = (predicts_[i] & kUserDictOffsetMask); 937 if (off == offset) { 938 predicts_[i] |= kUserDictOffsetFlagRemove; 939 break; 940 } 941 } 942 } 943 #endif 944 945 bool UserDict::remove_lemma_by_offset_index(int offset_index) { 946 if (is_valid_state() == false) 947 return 0; 948 949 int32 off = offset_index; 950 if (off == -1) { 951 return false; 952 } 953 954 uint32 offset = offsets_[off]; 955 uint32 nchar = get_lemma_nchar(offset); 956 957 offsets_[off] |= kUserDictOffsetFlagRemove; 958 959 #ifdef ___SYNC_ENABLED___ 960 // Remove corresponding sync item 961 remove_lemma_from_sync_list(offset); 962 #endif 963 964 #ifdef ___PREDICT_ENABLED___ 965 remove_lemma_from_predict_list(offset); 966 #endif 967 dict_info_.free_count++; 968 dict_info_.free_size += (2 + (nchar << 2)); 969 970 if (state_ < USER_DICT_OFFSET_DIRTY) 971 state_ = USER_DICT_OFFSET_DIRTY; 972 return true; 973 } 974 975 bool UserDict::remove_lemma(LemmaIdType lemma_id) { 976 if (is_valid_state() == false) 977 return 0; 978 if (is_valid_lemma_id(lemma_id) == false) 979 return false; 980 uint32 offset = offsets_by_id_[lemma_id - start_id_]; 981 982 uint32 nchar = get_lemma_nchar(offset); 983 uint16 * spl = get_lemma_spell_ids(offset); 984 uint16 * wrd = get_lemma_word(offset); 985 986 int32 off = locate_in_offsets(wrd, spl, nchar); 987 988 return remove_lemma_by_offset_index(off); 989 } 990 991 void UserDict::flush_cache() { 992 LemmaIdType start_id = start_id_; 993 const char * file = strdup(dict_file_); 994 if (!file) 995 return; 996 close_dict(); 997 load_dict(file, start_id, kUserDictIdEnd); 998 free((void*)file); 999 #ifdef ___CACHE_ENABLED___ 1000 cache_init(); 1001 #endif 1002 return; 1003 } 1004 1005 bool UserDict::reset(const char *file) { 1006 FILE *fp = fopen(file, "w+"); 1007 if (!fp) { 1008 return false; 1009 } 1010 uint32 version = kUserDictVersion; 1011 size_t wred = fwrite(&version, 1, 4, fp); 1012 UserDictInfo info; 1013 memset(&info, 0, sizeof(info)); 1014 // By default, no limitation for lemma count and size 1015 // thereby, reclaim_ratio is never used 1016 wred += fwrite(&info, 1, sizeof(info), fp); 1017 if (wred != sizeof(info) + sizeof(version)) { 1018 fclose(fp); 1019 unlink(file); 1020 return false; 1021 } 1022 fclose(fp); 1023 return true; 1024 } 1025 1026 bool UserDict::validate(const char *file) { 1027 // b is ignored in POSIX compatible os including Linux 1028 // while b is important flag for Windows to specify binary mode 1029 FILE *fp = fopen(file, "rb"); 1030 if (!fp) { 1031 return false; 1032 } 1033 1034 size_t size; 1035 size_t readed; 1036 uint32 version; 1037 UserDictInfo dict_info; 1038 1039 // validate 1040 int err = fseek(fp, 0, SEEK_END); 1041 if (err) { 1042 goto error; 1043 } 1044 1045 size = ftell(fp); 1046 if (size < 4 + sizeof(dict_info)) { 1047 goto error; 1048 } 1049 1050 err = fseek(fp, 0, SEEK_SET); 1051 if (err) { 1052 goto error; 1053 } 1054 1055 readed = fread(&version, 1, sizeof(version), fp); 1056 if (readed < sizeof(version)) { 1057 goto error; 1058 } 1059 if (version != kUserDictVersion) { 1060 goto error; 1061 } 1062 1063 err = fseek(fp, -1 * sizeof(dict_info), SEEK_END); 1064 if (err) { 1065 goto error; 1066 } 1067 1068 readed = fread(&dict_info, 1, sizeof(dict_info), fp); 1069 if (readed != sizeof(dict_info)) { 1070 goto error; 1071 } 1072 1073 if (size != get_dict_file_size(&dict_info)) { 1074 goto error; 1075 } 1076 1077 fclose(fp); 1078 return true; 1079 1080 error: 1081 fclose(fp); 1082 return false; 1083 } 1084 1085 bool UserDict::load(const char *file, LemmaIdType start_id) { 1086 if (0 != pthread_mutex_trylock(&g_mutex_)) { 1087 return false; 1088 } 1089 // b is ignored in POSIX compatible os including Linux 1090 // while b is important flag for Windows to specify binary mode 1091 FILE *fp = fopen(file, "rb"); 1092 if (!fp) { 1093 pthread_mutex_unlock(&g_mutex_); 1094 return false; 1095 } 1096 1097 size_t readed, toread; 1098 UserDictInfo dict_info; 1099 uint8 *lemmas = NULL; 1100 uint32 *offsets = NULL; 1101 #ifdef ___SYNC_ENABLED___ 1102 uint32 *syncs = NULL; 1103 #endif 1104 uint32 *scores = NULL; 1105 uint32 *ids = NULL; 1106 uint32 *offsets_by_id = NULL; 1107 #ifdef ___PREDICT_ENABLED___ 1108 uint32 *predicts = NULL; 1109 #endif 1110 size_t i; 1111 int err; 1112 1113 err = fseek(fp, -1 * sizeof(dict_info), SEEK_END); 1114 if (err) goto error; 1115 1116 readed = fread(&dict_info, 1, sizeof(dict_info), fp); 1117 if (readed != sizeof(dict_info)) goto error; 1118 1119 lemmas = (uint8 *)malloc( 1120 dict_info.lemma_size + 1121 (kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2)))); 1122 1123 if (!lemmas) goto error; 1124 1125 offsets = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2); 1126 if (!offsets) goto error; 1127 1128 #ifdef ___PREDICT_ENABLED___ 1129 predicts = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2); 1130 if (!predicts) goto error; 1131 #endif 1132 1133 #ifdef ___SYNC_ENABLED___ 1134 syncs = (uint32 *)malloc((dict_info.sync_count + kUserDictPreAlloc) << 2); 1135 if (!syncs) goto error; 1136 #endif 1137 1138 scores = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2); 1139 if (!scores) goto error; 1140 1141 ids = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2); 1142 if (!ids) goto error; 1143 1144 offsets_by_id = (uint32 *)malloc( 1145 (dict_info.lemma_count + kUserDictPreAlloc) << 2); 1146 if (!offsets_by_id) goto error; 1147 1148 err = fseek(fp, 4, SEEK_SET); 1149 if (err) goto error; 1150 1151 readed = 0; 1152 while (readed < dict_info.lemma_size && !ferror(fp) && !feof(fp)) { 1153 readed += fread(lemmas + readed, 1, dict_info.lemma_size - readed, fp); 1154 } 1155 if (readed < dict_info.lemma_size) 1156 goto error; 1157 1158 toread = (dict_info.lemma_count << 2); 1159 readed = 0; 1160 while (readed < toread && !ferror(fp) && !feof(fp)) { 1161 readed += fread((((uint8*)offsets) + readed), 1, toread - readed, fp); 1162 } 1163 if (readed < toread) 1164 goto error; 1165 1166 #ifdef ___PREDICT_ENABLED___ 1167 toread = (dict_info.lemma_count << 2); 1168 readed = 0; 1169 while (readed < toread && !ferror(fp) && !feof(fp)) { 1170 readed += fread((((uint8*)predicts) + readed), 1, toread - readed, fp); 1171 } 1172 if (readed < toread) 1173 goto error; 1174 #endif 1175 1176 readed = 0; 1177 while (readed < toread && !ferror(fp) && !feof(fp)) { 1178 readed += fread((((uint8*)scores) + readed), 1, toread - readed, fp); 1179 } 1180 if (readed < toread) 1181 goto error; 1182 1183 #ifdef ___SYNC_ENABLED___ 1184 toread = (dict_info.sync_count << 2); 1185 readed = 0; 1186 while (readed < toread && !ferror(fp) && !feof(fp)) { 1187 readed += fread((((uint8*)syncs) + readed), 1, toread - readed, fp); 1188 } 1189 if (readed < toread) 1190 goto error; 1191 #endif 1192 1193 for (i = 0; i < dict_info.lemma_count; i++) { 1194 ids[i] = start_id + i; 1195 offsets_by_id[i] = offsets[i]; 1196 } 1197 1198 lemmas_ = lemmas; 1199 offsets_ = offsets; 1200 #ifdef ___SYNC_ENABLED___ 1201 syncs_ = syncs; 1202 sync_count_size_ = dict_info.sync_count + kUserDictPreAlloc; 1203 #endif 1204 offsets_by_id_ = offsets_by_id; 1205 scores_ = scores; 1206 ids_ = ids; 1207 #ifdef ___PREDICT_ENABLED___ 1208 predicts_ = predicts; 1209 #endif 1210 lemma_count_left_ = kUserDictPreAlloc; 1211 lemma_size_left_ = kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2)); 1212 memcpy(&dict_info_, &dict_info, sizeof(dict_info)); 1213 state_ = USER_DICT_SYNC; 1214 1215 fclose(fp); 1216 1217 pthread_mutex_unlock(&g_mutex_); 1218 return true; 1219 1220 error: 1221 if (lemmas) free(lemmas); 1222 if (offsets) free(offsets); 1223 #ifdef ___SYNC_ENABLED___ 1224 if (syncs) free(syncs); 1225 #endif 1226 if (scores) free(scores); 1227 if (ids) free(ids); 1228 if (offsets_by_id) free(offsets_by_id); 1229 #ifdef ___PREDICT_ENABLED___ 1230 if (predicts) free(predicts); 1231 #endif 1232 fclose(fp); 1233 pthread_mutex_unlock(&g_mutex_); 1234 return false; 1235 } 1236 1237 void UserDict::write_back() { 1238 // XXX write back is only allowed from close_dict due to thread-safe sake 1239 if (state_ == USER_DICT_NONE || state_ == USER_DICT_SYNC) 1240 return; 1241 int fd = open(dict_file_, O_WRONLY); 1242 if (fd == -1) 1243 return; 1244 switch (state_) { 1245 case USER_DICT_DEFRAGMENTED: 1246 write_back_all(fd); 1247 break; 1248 case USER_DICT_LEMMA_DIRTY: 1249 write_back_lemma(fd); 1250 break; 1251 case USER_DICT_OFFSET_DIRTY: 1252 write_back_offset(fd); 1253 break; 1254 case USER_DICT_SCORE_DIRTY: 1255 write_back_score(fd); 1256 break; 1257 #ifdef ___SYNC_ENABLED___ 1258 case USER_DICT_SYNC_DIRTY: 1259 write_back_sync(fd); 1260 break; 1261 #endif 1262 default: 1263 break; 1264 } 1265 // It seems truncate is not need on Linux, Windows except Mac 1266 // I am doing it here anyway for safety. 1267 off_t cur = lseek(fd, 0, SEEK_CUR); 1268 ftruncate(fd, cur); 1269 close(fd); 1270 state_ = USER_DICT_SYNC; 1271 } 1272 1273 #ifdef ___SYNC_ENABLED___ 1274 void UserDict::write_back_sync(int fd) { 1275 int err = lseek(fd, 4 + dict_info_.lemma_size 1276 + (dict_info_.lemma_count << 3) 1277 #ifdef ___PREDICT_ENABLED___ 1278 + (dict_info_.lemma_count << 2) 1279 #endif 1280 , SEEK_SET); 1281 if (err == -1) 1282 return; 1283 write(fd, syncs_, dict_info_.sync_count << 2); 1284 write(fd, &dict_info_, sizeof(dict_info_)); 1285 } 1286 #endif 1287 1288 void UserDict::write_back_offset(int fd) { 1289 int err = lseek(fd, 4 + dict_info_.lemma_size, SEEK_SET); 1290 if (err == -1) 1291 return; 1292 write(fd, offsets_, dict_info_.lemma_count << 2); 1293 #ifdef ___PREDICT_ENABLED___ 1294 write(fd, predicts_, dict_info_.lemma_count << 2); 1295 #endif 1296 write(fd, scores_, dict_info_.lemma_count << 2); 1297 #ifdef ___SYNC_ENABLED___ 1298 write(fd, syncs_, dict_info_.sync_count << 2); 1299 #endif 1300 write(fd, &dict_info_, sizeof(dict_info_)); 1301 } 1302 1303 void UserDict::write_back_score(int fd) { 1304 int err = lseek(fd, 4 + dict_info_.lemma_size 1305 + (dict_info_.lemma_count << 2) 1306 #ifdef ___PREDICT_ENABLED___ 1307 + (dict_info_.lemma_count << 2) 1308 #endif 1309 , SEEK_SET); 1310 if (err == -1) 1311 return; 1312 write(fd, scores_, dict_info_.lemma_count << 2); 1313 #ifdef ___SYNC_ENABLED___ 1314 write(fd, syncs_, dict_info_.sync_count << 2); 1315 #endif 1316 write(fd, &dict_info_, sizeof(dict_info_)); 1317 } 1318 1319 void UserDict::write_back_lemma(int fd) { 1320 int err = lseek(fd, 4, SEEK_SET); 1321 if (err == -1) 1322 return; 1323 // New lemmas are always appended, no need to write whole lemma block 1324 size_t need_write = kUserDictPreAlloc * 1325 (2 + (kUserDictAverageNchar << 2)) - lemma_size_left_; 1326 err = lseek(fd, dict_info_.lemma_size - need_write, SEEK_CUR); 1327 if (err == -1) 1328 return; 1329 write(fd, lemmas_ + dict_info_.lemma_size - need_write, need_write); 1330 1331 write(fd, offsets_, dict_info_.lemma_count << 2); 1332 #ifdef ___PREDICT_ENABLED___ 1333 write(fd, predicts_, dict_info_.lemma_count << 2); 1334 #endif 1335 write(fd, scores_, dict_info_.lemma_count << 2); 1336 #ifdef ___SYNC_ENABLED___ 1337 write(fd, syncs_, dict_info_.sync_count << 2); 1338 #endif 1339 write(fd, &dict_info_, sizeof(dict_info_)); 1340 } 1341 1342 void UserDict::write_back_all(int fd) { 1343 // XXX lemma_size is handled differently in writeall 1344 // and writelemma. I update lemma_size and lemma_count in different 1345 // places for these two cases. Should fix it to make it consistent. 1346 int err = lseek(fd, 4, SEEK_SET); 1347 if (err == -1) 1348 return; 1349 write(fd, lemmas_, dict_info_.lemma_size); 1350 write(fd, offsets_, dict_info_.lemma_count << 2); 1351 #ifdef ___PREDICT_ENABLED___ 1352 write(fd, predicts_, dict_info_.lemma_count << 2); 1353 #endif 1354 write(fd, scores_, dict_info_.lemma_count << 2); 1355 #ifdef ___SYNC_ENABLED___ 1356 write(fd, syncs_, dict_info_.sync_count << 2); 1357 #endif 1358 write(fd, &dict_info_, sizeof(dict_info_)); 1359 } 1360 1361 #ifdef ___CACHE_ENABLED___ 1362 bool UserDict::load_cache(UserDictSearchable *searchable, 1363 uint32 *offset, uint32 *length) { 1364 UserDictCache *cache = &caches_[searchable->splids_len - 1]; 1365 if (cache->head == cache->tail) 1366 return false; 1367 1368 uint16 j, sig_len = kMaxLemmaSize / 4; 1369 uint16 i = cache->head; 1370 while (1) { 1371 j = 0; 1372 for (; j < sig_len; j++) { 1373 if (cache->signatures[i][j] != searchable->signature[j]) 1374 break; 1375 } 1376 if (j < sig_len) { 1377 i++; 1378 if (i >= kUserDictCacheSize) 1379 i -= kUserDictCacheSize; 1380 if (i == cache->tail) 1381 break; 1382 continue; 1383 } 1384 *offset = cache->offsets[i]; 1385 *length = cache->lengths[i]; 1386 return true; 1387 } 1388 return false; 1389 } 1390 1391 void UserDict::save_cache(UserDictSearchable *searchable, 1392 uint32 offset, uint32 length) { 1393 UserDictCache *cache = &caches_[searchable->splids_len - 1]; 1394 uint16 next = cache->tail; 1395 1396 cache->offsets[next] = offset; 1397 cache->lengths[next] = length; 1398 uint16 sig_len = kMaxLemmaSize / 4; 1399 uint16 j = 0; 1400 for (; j < sig_len; j++) { 1401 cache->signatures[next][j] = searchable->signature[j]; 1402 } 1403 1404 if (++next >= kUserDictCacheSize) { 1405 next -= kUserDictCacheSize; 1406 } 1407 if (next == cache->head) { 1408 cache->head++; 1409 if (cache->head >= kUserDictCacheSize) { 1410 cache->head -= kUserDictCacheSize; 1411 } 1412 } 1413 cache->tail = next; 1414 } 1415 1416 void UserDict::reset_cache() { 1417 memset(caches_, 0, sizeof(caches_)); 1418 } 1419 1420 bool UserDict::load_miss_cache(UserDictSearchable *searchable) { 1421 UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1]; 1422 if (cache->head == cache->tail) 1423 return false; 1424 1425 uint16 j, sig_len = kMaxLemmaSize / 4; 1426 uint16 i = cache->head; 1427 while (1) { 1428 j = 0; 1429 for (; j < sig_len; j++) { 1430 if (cache->signatures[i][j] != searchable->signature[j]) 1431 break; 1432 } 1433 if (j < sig_len) { 1434 i++; 1435 if (i >= kUserDictMissCacheSize) 1436 i -= kUserDictMissCacheSize; 1437 if (i == cache->tail) 1438 break; 1439 continue; 1440 } 1441 return true; 1442 } 1443 return false; 1444 } 1445 1446 void UserDict::save_miss_cache(UserDictSearchable *searchable) { 1447 UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1]; 1448 uint16 next = cache->tail; 1449 1450 uint16 sig_len = kMaxLemmaSize / 4; 1451 uint16 j = 0; 1452 for (; j < sig_len; j++) { 1453 cache->signatures[next][j] = searchable->signature[j]; 1454 } 1455 1456 if (++next >= kUserDictMissCacheSize) { 1457 next -= kUserDictMissCacheSize; 1458 } 1459 if (next == cache->head) { 1460 cache->head++; 1461 if (cache->head >= kUserDictMissCacheSize) { 1462 cache->head -= kUserDictMissCacheSize; 1463 } 1464 } 1465 cache->tail = next; 1466 } 1467 1468 void UserDict::reset_miss_cache() { 1469 memset(miss_caches_, 0, sizeof(miss_caches_)); 1470 } 1471 1472 void UserDict::cache_init() { 1473 reset_cache(); 1474 reset_miss_cache(); 1475 } 1476 1477 bool UserDict::cache_hit(UserDictSearchable *searchable, 1478 uint32 *offset, uint32 *length) { 1479 bool hit = load_miss_cache(searchable); 1480 if (hit) { 1481 *offset = 0; 1482 *length = 0; 1483 return true; 1484 } 1485 hit = load_cache(searchable, offset, length); 1486 if (hit) { 1487 return true; 1488 } 1489 return false; 1490 } 1491 1492 void UserDict::cache_push(UserDictCacheType type, 1493 UserDictSearchable *searchable, 1494 uint32 offset, uint32 length) { 1495 switch (type) { 1496 case USER_DICT_MISS_CACHE: 1497 save_miss_cache(searchable); 1498 break; 1499 case USER_DICT_CACHE: 1500 save_cache(searchable, offset, length); 1501 break; 1502 default: 1503 break; 1504 } 1505 } 1506 1507 #endif 1508 1509 void UserDict::defragment(void) { 1510 #ifdef ___DEBUG_PERF___ 1511 DEBUG_PERF_BEGIN; 1512 #endif 1513 if (is_valid_state() == false) 1514 return; 1515 // Fixup offsets_, set REMOVE flag to lemma's flag if needed 1516 size_t first_freed = 0; 1517 size_t first_inuse = 0; 1518 while (first_freed < dict_info_.lemma_count) { 1519 // Find first freed offset 1520 while ((offsets_[first_freed] & kUserDictOffsetFlagRemove) == 0 && 1521 first_freed < dict_info_.lemma_count) { 1522 first_freed++; 1523 } 1524 if (first_freed < dict_info_.lemma_count) { 1525 // Save REMOVE flag to lemma flag 1526 int off = offsets_[first_freed]; 1527 set_lemma_flag(off, kUserDictLemmaFlagRemove); 1528 } else { 1529 break; 1530 } 1531 // Find first inuse offse after first_freed 1532 first_inuse = first_freed + 1; 1533 while ((offsets_[first_inuse] & kUserDictOffsetFlagRemove) && 1534 (first_inuse < dict_info_.lemma_count)) { 1535 // Save REMOVE flag to lemma flag 1536 int off = offsets_[first_inuse]; 1537 set_lemma_flag(off, kUserDictLemmaFlagRemove); 1538 first_inuse++; 1539 } 1540 if (first_inuse >= dict_info_.lemma_count) { 1541 break; 1542 } 1543 // Swap offsets_ 1544 int tmp = offsets_[first_inuse]; 1545 offsets_[first_inuse] = offsets_[first_freed]; 1546 offsets_[first_freed] = tmp; 1547 // Move scores_, no need to swap 1548 tmp = scores_[first_inuse]; 1549 scores_[first_inuse] = scores_[first_freed]; 1550 scores_[first_freed] = tmp; 1551 // Swap ids_ 1552 LemmaIdType tmpid = ids_[first_inuse]; 1553 ids_[first_inuse] = ids_[first_freed]; 1554 ids_[first_freed] = tmpid; 1555 // Go on 1556 first_freed++; 1557 } 1558 #ifdef ___PREDICT_ENABLED___ 1559 // Fixup predicts_ 1560 first_freed = 0; 1561 first_inuse = 0; 1562 while (first_freed < dict_info_.lemma_count) { 1563 // Find first freed offset 1564 while ((predicts_[first_freed] & kUserDictOffsetFlagRemove) == 0 && 1565 first_freed < dict_info_.lemma_count) { 1566 first_freed++; 1567 } 1568 if (first_freed >= dict_info_.lemma_count) 1569 break; 1570 // Find first inuse offse after first_freed 1571 first_inuse = first_freed + 1; 1572 while ((predicts_[first_inuse] & kUserDictOffsetFlagRemove) 1573 && (first_inuse < dict_info_.lemma_count)) { 1574 first_inuse++; 1575 } 1576 if (first_inuse >= dict_info_.lemma_count) { 1577 break; 1578 } 1579 // Swap offsets_ 1580 int tmp = predicts_[first_inuse]; 1581 predicts_[first_inuse] = predicts_[first_freed]; 1582 predicts_[first_freed] = tmp; 1583 // Go on 1584 first_freed++; 1585 } 1586 #endif 1587 dict_info_.lemma_count = first_freed; 1588 // Fixup lemmas_ 1589 size_t begin = 0; 1590 size_t end = 0; 1591 size_t dst = 0; 1592 int total_size = dict_info_.lemma_size + lemma_size_left_; 1593 int total_count = dict_info_.lemma_count + lemma_count_left_; 1594 size_t real_size = total_size - lemma_size_left_; 1595 while (dst < real_size) { 1596 unsigned char flag = get_lemma_flag(dst); 1597 unsigned char nchr = get_lemma_nchar(dst); 1598 if ((flag & kUserDictLemmaFlagRemove) == 0) { 1599 dst += nchr * 4 + 2; 1600 continue; 1601 } 1602 break; 1603 } 1604 if (dst >= real_size) 1605 return; 1606 1607 end = dst; 1608 while (end < real_size) { 1609 begin = end + get_lemma_nchar(end) * 4 + 2; 1610 repeat: 1611 // not used any more 1612 if (begin >= real_size) 1613 break; 1614 unsigned char flag = get_lemma_flag(begin); 1615 unsigned char nchr = get_lemma_nchar(begin); 1616 if (flag & kUserDictLemmaFlagRemove) { 1617 begin += nchr * 4 + 2; 1618 goto repeat; 1619 } 1620 end = begin + nchr * 4 + 2; 1621 while (end < real_size) { 1622 unsigned char eflag = get_lemma_flag(end); 1623 unsigned char enchr = get_lemma_nchar(end); 1624 if ((eflag & kUserDictLemmaFlagRemove) == 0) { 1625 end += enchr * 4 + 2; 1626 continue; 1627 } 1628 break; 1629 } 1630 memmove(lemmas_ + dst, lemmas_ + begin, end - begin); 1631 for (size_t j = 0; j < dict_info_.lemma_count; j++) { 1632 if (offsets_[j] >= begin && offsets_[j] < end) { 1633 offsets_[j] -= (begin - dst); 1634 offsets_by_id_[ids_[j] - start_id_] = offsets_[j]; 1635 } 1636 #ifdef ___PREDICT_ENABLED___ 1637 if (predicts_[j] >= begin && predicts_[j] < end) { 1638 predicts_[j] -= (begin - dst); 1639 } 1640 #endif 1641 } 1642 #ifdef ___SYNC_ENABLED___ 1643 for (size_t j = 0; j < dict_info_.sync_count; j++) { 1644 if (syncs_[j] >= begin && syncs_[j] < end) { 1645 syncs_[j] -= (begin - dst); 1646 } 1647 } 1648 #endif 1649 dst += (end - begin); 1650 } 1651 1652 dict_info_.free_count = 0; 1653 dict_info_.free_size = 0; 1654 dict_info_.lemma_size = dst; 1655 lemma_size_left_ = total_size - dict_info_.lemma_size; 1656 lemma_count_left_ = total_count - dict_info_.lemma_count; 1657 1658 // XXX Without following code, 1659 // offsets_by_id_ is not reordered. 1660 // That's to say, all removed lemmas' ids are not collected back. 1661 // There may not be room for addition of new lemmas due to 1662 // offsests_by_id_ reason, although lemma_size_left_ is fixed. 1663 // By default, we do want defrag as fast as possible, because 1664 // during defrag procedure, other peers can not write new lemmas 1665 // to user dictionary file. 1666 // XXX If write-back is invoked immediately after 1667 // this defragment, no need to fix up following in-mem data. 1668 for (uint32 i = 0; i < dict_info_.lemma_count; i++) { 1669 ids_[i] = start_id_ + i; 1670 offsets_by_id_[i] = offsets_[i]; 1671 } 1672 1673 state_ = USER_DICT_DEFRAGMENTED; 1674 1675 #ifdef ___DEBUG_PERF___ 1676 DEBUG_PERF_END; 1677 LOGD_PERF("defragment"); 1678 #endif 1679 } 1680 1681 #ifdef ___SYNC_ENABLED___ 1682 void UserDict::clear_sync_lemmas(unsigned int start, unsigned int end) { 1683 if (is_valid_state() == false) 1684 return; 1685 if (end > dict_info_.sync_count) 1686 end = dict_info_.sync_count; 1687 memmove(syncs_ + start, syncs_ + end, (dict_info_.sync_count - end) << 2); 1688 dict_info_.sync_count -= (end - start); 1689 if (state_ < USER_DICT_SYNC_DIRTY) 1690 state_ = USER_DICT_SYNC_DIRTY; 1691 } 1692 1693 int UserDict::get_sync_count() { 1694 if (is_valid_state() == false) 1695 return 0; 1696 return dict_info_.sync_count; 1697 } 1698 1699 LemmaIdType UserDict::put_lemma_no_sync(char16 lemma_str[], uint16 splids[], 1700 uint16 lemma_len, uint16 count, uint64 lmt) { 1701 int again = 0; 1702 begin: 1703 LemmaIdType id; 1704 uint32 * syncs_bak = syncs_; 1705 syncs_ = NULL; 1706 id = _put_lemma(lemma_str, splids, lemma_len, count, lmt); 1707 syncs_ = syncs_bak; 1708 if (id == 0 && again == 0) { 1709 if ((dict_info_.limit_lemma_count > 0 && 1710 dict_info_.lemma_count >= dict_info_.limit_lemma_count) 1711 || (dict_info_.limit_lemma_size > 0 && 1712 dict_info_.lemma_size + (2 + (lemma_len << 2)) 1713 > dict_info_.limit_lemma_size)) { 1714 // XXX Always reclaim and defrag in sync code path 1715 // sync thread is background thread and ok with heavy work 1716 reclaim(); 1717 defragment(); 1718 flush_cache(); 1719 again = 1; 1720 goto begin; 1721 } 1722 } 1723 return id; 1724 } 1725 1726 int UserDict::put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len) { 1727 int newly_added = 0; 1728 1729 SpellingParser * spl_parser = new SpellingParser(); 1730 if (!spl_parser) { 1731 return 0; 1732 } 1733 #ifdef ___DEBUG_PERF___ 1734 DEBUG_PERF_BEGIN; 1735 #endif 1736 char16 *ptr = lemmas; 1737 1738 // Extract pinyin,words,frequence,last_mod_time 1739 char16 * p = ptr, * py16 = ptr; 1740 char16 * hz16 = NULL; 1741 int py16_len = 0; 1742 uint16 splid[kMaxLemmaSize]; 1743 int splid_len = 0; 1744 int hz16_len = 0; 1745 char16 * fr16 = NULL; 1746 int fr16_len = 0; 1747 1748 while (p - ptr < len) { 1749 // Pinyin 1750 py16 = p; 1751 splid_len = 0; 1752 while (*p != 0x2c && (p - ptr) < len) { 1753 if (*p == 0x20) 1754 splid_len++; 1755 p++; 1756 } 1757 splid_len++; 1758 if (p - ptr == len) 1759 break; 1760 py16_len = p - py16; 1761 if (kMaxLemmaSize < splid_len) { 1762 break; 1763 } 1764 bool is_pre; 1765 int splidl = spl_parser->splstr16_to_idxs_f( 1766 py16, py16_len, splid, NULL, kMaxLemmaSize, is_pre); 1767 if (splidl != splid_len) 1768 break; 1769 // Phrase 1770 hz16 = ++p; 1771 while (*p != 0x2c && (p - ptr) < len) { 1772 p++; 1773 } 1774 hz16_len = p - hz16; 1775 if (hz16_len != splid_len) 1776 break; 1777 // Frequency 1778 fr16 = ++p; 1779 fr16_len = 0; 1780 while (*p != 0x2c && (p - ptr) < len) { 1781 p++; 1782 } 1783 fr16_len = p - fr16; 1784 uint32 intf = (uint32)utf16le_atoll(fr16, fr16_len); 1785 // Last modified time 1786 fr16 = ++p; 1787 fr16_len = 0; 1788 while (*p != 0x3b && (p - ptr) < len) { 1789 p++; 1790 } 1791 fr16_len = p - fr16; 1792 uint64 last_mod = utf16le_atoll(fr16, fr16_len); 1793 1794 put_lemma_no_sync(hz16, splid, splid_len, intf, last_mod); 1795 newly_added++; 1796 1797 p++; 1798 } 1799 1800 #ifdef ___DEBUG_PERF___ 1801 DEBUG_PERF_END; 1802 LOGD_PERF("put_lemmas_no_sync_from_utf16le_string"); 1803 #endif 1804 return newly_added; 1805 } 1806 1807 int UserDict::get_sync_lemmas_in_utf16le_string_from_beginning( 1808 char16 * str, int size, int * count) { 1809 int len = 0; 1810 *count = 0; 1811 1812 int left_len = size; 1813 1814 if (is_valid_state() == false) 1815 return len; 1816 1817 SpellingTrie * spl_trie = &SpellingTrie::get_instance(); 1818 if (!spl_trie) { 1819 return 0; 1820 } 1821 1822 uint32 i; 1823 for (i = 0; i < dict_info_.sync_count; i++) { 1824 int offset = syncs_[i]; 1825 uint32 nchar = get_lemma_nchar(offset); 1826 uint16 *spl = get_lemma_spell_ids(offset); 1827 uint16 *wrd = get_lemma_word(offset); 1828 int score = _get_lemma_score(wrd, spl, nchar); 1829 1830 static char score_temp[32], *pscore_temp = score_temp; 1831 static char16 temp[256], *ptemp = temp; 1832 1833 pscore_temp = score_temp; 1834 ptemp = temp; 1835 1836 uint32 j; 1837 // Add pinyin 1838 for (j = 0; j < nchar; j++) { 1839 int ret_len = spl_trie->get_spelling_str16( 1840 spl[j], ptemp, temp + sizeof(temp) - ptemp); 1841 if (ret_len <= 0) 1842 break; 1843 ptemp += ret_len; 1844 if (ptemp < temp + sizeof(temp) - 1) { 1845 *(ptemp++) = ' '; 1846 } else { 1847 j = 0; 1848 break; 1849 } 1850 } 1851 if (j < nchar) { 1852 continue; 1853 } 1854 ptemp--; 1855 if (ptemp < temp + sizeof(temp) - 1) { 1856 *(ptemp++) = ','; 1857 } else { 1858 continue; 1859 } 1860 // Add phrase 1861 for (j = 0; j < nchar; j++) { 1862 if (ptemp < temp + sizeof(temp) - 1) { 1863 *(ptemp++) = wrd[j]; 1864 } else { 1865 break; 1866 } 1867 } 1868 if (j < nchar) { 1869 continue; 1870 } 1871 if (ptemp < temp + sizeof(temp) - 1) { 1872 *(ptemp++) = ','; 1873 } else { 1874 continue; 1875 } 1876 // Add frequency 1877 uint32 intf = extract_score_freq(score); 1878 int ret_len = utf16le_lltoa(intf, ptemp, temp + sizeof(temp) - ptemp); 1879 if (ret_len <= 0) 1880 continue; 1881 ptemp += ret_len; 1882 if (ptemp < temp + sizeof(temp) - 1) { 1883 *(ptemp++) = ','; 1884 } else { 1885 continue; 1886 } 1887 // Add last modified time 1888 uint64 last_mod = extract_score_lmt(score); 1889 ret_len = utf16le_lltoa(last_mod, ptemp, temp + sizeof(temp) - ptemp); 1890 if (ret_len <= 0) 1891 continue; 1892 ptemp += ret_len; 1893 if (ptemp < temp + sizeof(temp) - 1) { 1894 *(ptemp++) = ';'; 1895 } else { 1896 continue; 1897 } 1898 1899 // Write to string 1900 int need_len = ptemp - temp; 1901 if (need_len > left_len) 1902 break; 1903 memcpy(str + len, temp, need_len * 2); 1904 left_len -= need_len; 1905 1906 len += need_len; 1907 (*count)++; 1908 } 1909 1910 if (len > 0) { 1911 if (state_ < USER_DICT_SYNC_DIRTY) 1912 state_ = USER_DICT_SYNC_DIRTY; 1913 } 1914 return len; 1915 } 1916 1917 #endif 1918 1919 bool UserDict::state(UserDictStat * stat) { 1920 if (is_valid_state() == false) 1921 return false; 1922 if (!stat) 1923 return false; 1924 stat->version = version_; 1925 stat->file_name = dict_file_; 1926 stat->load_time.tv_sec = load_time_.tv_sec; 1927 stat->load_time.tv_usec = load_time_.tv_usec; 1928 pthread_mutex_lock(&g_mutex_); 1929 stat->last_update.tv_sec = g_last_update_.tv_sec; 1930 stat->last_update.tv_usec = g_last_update_.tv_usec; 1931 pthread_mutex_unlock(&g_mutex_); 1932 stat->disk_size = get_dict_file_size(&dict_info_); 1933 stat->lemma_count = dict_info_.lemma_count; 1934 stat->lemma_size = dict_info_.lemma_size; 1935 stat->delete_count = dict_info_.free_count; 1936 stat->delete_size = dict_info_.free_size; 1937 #ifdef ___SYNC_ENABLED___ 1938 stat->sync_count = dict_info_.sync_count; 1939 #endif 1940 stat->limit_lemma_count = dict_info_.limit_lemma_count; 1941 stat->limit_lemma_size = dict_info_.limit_lemma_size; 1942 stat->reclaim_ratio = dict_info_.reclaim_ratio; 1943 return true; 1944 } 1945 1946 void UserDict::set_limit(uint32 max_lemma_count, 1947 uint32 max_lemma_size, uint32 reclaim_ratio) { 1948 dict_info_.limit_lemma_count = max_lemma_count; 1949 dict_info_.limit_lemma_size = max_lemma_size; 1950 if (reclaim_ratio > 100) 1951 reclaim_ratio = 100; 1952 dict_info_.reclaim_ratio = reclaim_ratio; 1953 } 1954 1955 void UserDict::reclaim() { 1956 if (is_valid_state() == false) 1957 return; 1958 1959 switch (dict_info_.reclaim_ratio) { 1960 case 0: 1961 return; 1962 case 100: 1963 // TODO: CLEAR to be implemented 1964 assert(false); 1965 return; 1966 default: 1967 break; 1968 } 1969 1970 // XXX Reclaim is only based on count, not size 1971 uint32 count = dict_info_.lemma_count; 1972 int rc = count * dict_info_.reclaim_ratio / 100; 1973 1974 UserDictScoreOffsetPair * score_offset_pairs = NULL; 1975 score_offset_pairs = (UserDictScoreOffsetPair *)malloc( 1976 sizeof(UserDictScoreOffsetPair) * rc); 1977 if (score_offset_pairs == NULL) { 1978 return; 1979 } 1980 1981 for (int i = 0; i < rc; i++) { 1982 int s = scores_[i]; 1983 score_offset_pairs[i].score = s; 1984 score_offset_pairs[i].offset_index = i; 1985 } 1986 1987 for (int i = (rc + 1) / 2; i >= 0; i--) 1988 shift_down(score_offset_pairs, i, rc); 1989 1990 for (uint32 i = rc; i < dict_info_.lemma_count; i++) { 1991 int s = scores_[i]; 1992 if (s < score_offset_pairs[0].score) { 1993 score_offset_pairs[0].score = s; 1994 score_offset_pairs[0].offset_index = i; 1995 shift_down(score_offset_pairs, 0, rc); 1996 } 1997 } 1998 1999 for (int i = 0; i < rc; i++) { 2000 int off = score_offset_pairs[i].offset_index; 2001 remove_lemma_by_offset_index(off); 2002 } 2003 if (rc > 0) { 2004 if (state_ < USER_DICT_OFFSET_DIRTY) 2005 state_ = USER_DICT_OFFSET_DIRTY; 2006 } 2007 2008 free(score_offset_pairs); 2009 } 2010 2011 inline void UserDict::swap(UserDictScoreOffsetPair * sop, int i, int j) { 2012 int s = sop[i].score; 2013 int p = sop[i].offset_index; 2014 sop[i].score = sop[j].score; 2015 sop[i].offset_index = sop[j].offset_index; 2016 sop[j].score = s; 2017 sop[j].offset_index = p; 2018 } 2019 2020 void UserDict::shift_down(UserDictScoreOffsetPair * sop, int i, int n) { 2021 int par = i; 2022 while (par < n) { 2023 int left = par * 2 + 1; 2024 int right = left + 1; 2025 if (left >= n && right >= n) 2026 break; 2027 if (right >= n) { 2028 if (sop[left].score > sop[par].score) { 2029 swap(sop, left, par); 2030 par = left; 2031 continue; 2032 } 2033 } else if (sop[left].score > sop[right].score && 2034 sop[left].score > sop[par].score) { 2035 swap(sop, left, par); 2036 par = left; 2037 continue; 2038 } else if (sop[right].score > sop[left].score && 2039 sop[right].score > sop[par].score) { 2040 swap(sop, right, par); 2041 par = right; 2042 continue; 2043 } 2044 break; 2045 } 2046 } 2047 2048 LemmaIdType UserDict::put_lemma(char16 lemma_str[], uint16 splids[], 2049 uint16 lemma_len, uint16 count) { 2050 return _put_lemma(lemma_str, splids, lemma_len, count, time(NULL)); 2051 } 2052 2053 LemmaIdType UserDict::_put_lemma(char16 lemma_str[], uint16 splids[], 2054 uint16 lemma_len, uint16 count, uint64 lmt) { 2055 #ifdef ___DEBUG_PERF___ 2056 DEBUG_PERF_BEGIN; 2057 #endif 2058 if (is_valid_state() == false) 2059 return 0; 2060 int32 off = locate_in_offsets(lemma_str, splids, lemma_len); 2061 if (off != -1) { 2062 int delta_score = count - scores_[off]; 2063 dict_info_.total_nfreq += delta_score; 2064 scores_[off] = build_score(lmt, count); 2065 if (state_ < USER_DICT_SCORE_DIRTY) 2066 state_ = USER_DICT_SCORE_DIRTY; 2067 #ifdef ___DEBUG_PERF___ 2068 DEBUG_PERF_END; 2069 LOGD_PERF("_put_lemma(update)"); 2070 #endif 2071 return ids_[off]; 2072 } else { 2073 if ((dict_info_.limit_lemma_count > 0 && 2074 dict_info_.lemma_count >= dict_info_.limit_lemma_count) 2075 || (dict_info_.limit_lemma_size > 0 && 2076 dict_info_.lemma_size + (2 + (lemma_len << 2)) 2077 > dict_info_.limit_lemma_size)) { 2078 // XXX Don't defragment here, it's too time-consuming. 2079 return 0; 2080 } 2081 int flushed = 0; 2082 if (lemma_count_left_ == 0 || 2083 lemma_size_left_ < (size_t)(2 + (lemma_len << 2))) { 2084 2085 // XXX When there is no space for new lemma, we flush to disk 2086 // flush_cache() may be called by upper user 2087 // and better place shoule be found instead of here 2088 flush_cache(); 2089 flushed = 1; 2090 // Or simply return and do nothing 2091 // return 0; 2092 } 2093 #ifdef ___DEBUG_PERF___ 2094 DEBUG_PERF_END; 2095 LOGD_PERF(flushed ? "_put_lemma(flush+add)" : "_put_lemma(add)"); 2096 #endif 2097 LemmaIdType id = append_a_lemma(lemma_str, splids, lemma_len, count, lmt); 2098 #ifdef ___SYNC_ENABLED___ 2099 if (syncs_ && id != 0) { 2100 queue_lemma_for_sync(id); 2101 } 2102 #endif 2103 return id; 2104 } 2105 return 0; 2106 } 2107 2108 #ifdef ___SYNC_ENABLED___ 2109 void UserDict::queue_lemma_for_sync(LemmaIdType id) { 2110 if (dict_info_.sync_count < sync_count_size_) { 2111 syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_]; 2112 } else { 2113 uint32 * syncs = (uint32*)realloc( 2114 syncs_, (sync_count_size_ + kUserDictPreAlloc) << 2); 2115 if (syncs) { 2116 sync_count_size_ += kUserDictPreAlloc; 2117 syncs_ = syncs; 2118 syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_]; 2119 } 2120 } 2121 } 2122 #endif 2123 2124 LemmaIdType UserDict::update_lemma(LemmaIdType lemma_id, int16 delta_count, 2125 bool selected) { 2126 #ifdef ___DEBUG_PERF___ 2127 DEBUG_PERF_BEGIN; 2128 #endif 2129 if (is_valid_state() == false) 2130 return 0; 2131 if (is_valid_lemma_id(lemma_id) == false) 2132 return 0; 2133 uint32 offset = offsets_by_id_[lemma_id - start_id_]; 2134 uint8 lemma_len = get_lemma_nchar(offset); 2135 char16 * lemma_str = get_lemma_word(offset); 2136 uint16 * splids = get_lemma_spell_ids(offset); 2137 2138 int32 off = locate_in_offsets(lemma_str, splids, lemma_len); 2139 if (off != -1) { 2140 int score = scores_[off]; 2141 int count = extract_score_freq(score); 2142 uint64 lmt = extract_score_lmt(score); 2143 if (count + delta_count > kUserDictMaxFrequency || 2144 count + delta_count < count) { 2145 delta_count = kUserDictMaxFrequency - count; 2146 } 2147 count += delta_count; 2148 dict_info_.total_nfreq += delta_count; 2149 if (selected) { 2150 lmt = time(NULL); 2151 } 2152 scores_[off] = build_score(lmt, count); 2153 if (state_ < USER_DICT_SCORE_DIRTY) 2154 state_ = USER_DICT_SCORE_DIRTY; 2155 #ifdef ___DEBUG_PERF___ 2156 DEBUG_PERF_END; 2157 LOGD_PERF("update_lemma"); 2158 #endif 2159 #ifdef ___SYNC_ENABLED___ 2160 queue_lemma_for_sync(ids_[off]); 2161 #endif 2162 return ids_[off]; 2163 } 2164 return 0; 2165 } 2166 2167 size_t UserDict::get_total_lemma_count() { 2168 return dict_info_.total_nfreq; 2169 } 2170 2171 void UserDict::set_total_lemma_count_of_others(size_t count) { 2172 total_other_nfreq_ = count; 2173 } 2174 2175 LemmaIdType UserDict::append_a_lemma(char16 lemma_str[], uint16 splids[], 2176 uint16 lemma_len, uint16 count, uint64 lmt) { 2177 LemmaIdType id = get_max_lemma_id() + 1; 2178 size_t offset = dict_info_.lemma_size; 2179 if (offset > kUserDictOffsetMask) 2180 return 0; 2181 2182 lemmas_[offset] = 0; 2183 lemmas_[offset + 1] = (uint8)lemma_len; 2184 for (size_t i = 0; i < lemma_len; i++) { 2185 *((uint16*)&lemmas_[offset + 2 + (i << 1)]) = splids[i]; 2186 *((char16*)&lemmas_[offset + 2 + (lemma_len << 1) + (i << 1)]) 2187 = lemma_str[i]; 2188 } 2189 uint32 off = dict_info_.lemma_count; 2190 offsets_[off] = offset; 2191 scores_[off] = build_score(lmt, count); 2192 ids_[off] = id; 2193 #ifdef ___PREDICT_ENABLED___ 2194 predicts_[off] = offset; 2195 #endif 2196 2197 offsets_by_id_[id - start_id_] = offset; 2198 2199 dict_info_.lemma_count++; 2200 dict_info_.lemma_size += (2 + (lemma_len << 2)); 2201 lemma_count_left_--; 2202 lemma_size_left_ -= (2 + (lemma_len << 2)); 2203 2204 // Sort 2205 2206 UserDictSearchable searchable; 2207 prepare_locate(&searchable, splids, lemma_len); 2208 2209 size_t i = 0; 2210 while (i < off) { 2211 offset = offsets_[i]; 2212 uint32 nchar = get_lemma_nchar(offset); 2213 uint16 * spl = get_lemma_spell_ids(offset); 2214 2215 if (0 <= fuzzy_compare_spell_id(spl, nchar, &searchable)) 2216 break; 2217 i++; 2218 } 2219 if (i != off) { 2220 uint32 temp = offsets_[off]; 2221 memmove(offsets_ + i + 1, offsets_ + i, (off - i) << 2); 2222 offsets_[i] = temp; 2223 2224 temp = scores_[off]; 2225 memmove(scores_ + i + 1, scores_ + i, (off - i) << 2); 2226 scores_[i] = temp; 2227 2228 temp = ids_[off]; 2229 memmove(ids_ + i + 1, ids_ + i, (off - i) << 2); 2230 ids_[i] = temp; 2231 } 2232 2233 #ifdef ___PREDICT_ENABLED___ 2234 uint32 j = 0; 2235 uint16 * words_new = get_lemma_word(predicts_[off]); 2236 j = locate_where_to_insert_in_predicts(words_new, lemma_len); 2237 if (j != off) { 2238 uint32 temp = predicts_[off]; 2239 memmove(predicts_ + j + 1, predicts_ + j, (off - j) << 2); 2240 predicts_[j] = temp; 2241 } 2242 #endif 2243 2244 if (state_ < USER_DICT_LEMMA_DIRTY) 2245 state_ = USER_DICT_LEMMA_DIRTY; 2246 2247 #ifdef ___CACHE_ENABLED___ 2248 cache_init(); 2249 #endif 2250 2251 dict_info_.total_nfreq += count; 2252 return id; 2253 } 2254 } 2255