Home | History | Annotate | Download | only in share
      1 /*
      2  * Copyright (C) 2009 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "../include/userdict.h"
     18 #include "../include/splparser.h"
     19 #include "../include/ngram.h"
     20 #include <stdio.h>
     21 #include <string.h>
     22 #include <stdlib.h>
     23 #include <cutils/log.h>
     24 #include <unistd.h>
     25 #include <fcntl.h>
     26 #include <sys/stat.h>
     27 #include <assert.h>
     28 #include <ctype.h>
     29 #include <sys/types.h>
     30 #include <sys/time.h>
     31 #include <time.h>
     32 #include <pthread.h>
     33 #include <math.h>
     34 
     35 namespace ime_pinyin {
     36 
     37 #ifdef ___DEBUG_PERF___
     38 static uint64 _ellapse_ = 0;
     39 static struct timeval _tv_start_, _tv_end_;
     40 #define DEBUG_PERF_BEGIN \
     41     do { \
     42       gettimeofday(&_tv_start_, NULL); \
     43     } while(0)
     44 #define DEBUG_PERF_END \
     45     do { \
     46       gettimeofday(&_tv_end_, NULL); \
     47       _ellapse_ = (_tv_end_.tv_sec - _tv_start_.tv_sec) * 1000000 + \
     48                   (_tv_end_.tv_usec - _tv_start_.tv_usec); \
     49     } while(0)
     50 #define LOGD_PERF(message) \
     51     LOGD("PERFORMANCE[%s] %llu usec.", message, _ellapse_);
     52 #else
     53 #define DEBUG_PERF_BEGIN
     54 #define DEBUG_PERF_END
     55 #define LOGD_PERF(message)
     56 #endif
     57 
     58 // XXX File load and write are thread-safe by g_mutex_
     59 static pthread_mutex_t g_mutex_ = PTHREAD_MUTEX_INITIALIZER;
     60 static struct timeval g_last_update_ = {0, 0};
     61 
     62 inline uint32 UserDict::get_dict_file_size(UserDictInfo * info) {
     63   return (4 + info->lemma_size + (info->lemma_count << 3)
     64 #ifdef ___PREDICT_ENABLED___
     65           + (info->lemma_count << 2)
     66 #endif
     67 #ifdef ___SYNC_ENABLED___
     68           + (info->sync_count << 2)
     69 #endif
     70           + sizeof(*info));
     71 }
     72 
     73 inline LmaScoreType UserDict::translate_score(int raw_score) {
     74   // 1) ori_freq: original user frequency
     75   uint32 ori_freq = extract_score_freq(raw_score);
     76   // 2) lmt_off: lmt index (week offset for example)
     77   uint64 lmt_off = ((raw_score & 0xffff0000) >> 16);
     78   if (kUserDictLMTBitWidth < 16) {
     79     uint64 mask = ~(1 << kUserDictLMTBitWidth);
     80     lmt_off &= mask;
     81   }
     82   // 3) now_off: current time index (current week offset for example)
     83   // assuming load_time_ is around current time
     84   uint64 now_off = load_time_.tv_sec;
     85   now_off = (now_off - kUserDictLMTSince) / kUserDictLMTGranularity;
     86   now_off = (now_off << (64 - kUserDictLMTBitWidth));
     87   now_off = (now_off >> (64 - kUserDictLMTBitWidth));
     88   // 4) factor: decide expand-factor
     89   int delta = now_off - lmt_off;
     90   if (delta > 4)
     91     delta = 4;
     92   int factor = 80 - (delta << 4);
     93 
     94   double tf = (double)(dict_info_.total_nfreq + total_other_nfreq_);
     95   return (LmaScoreType)(log((double)factor * (double)ori_freq / tf)
     96                         * NGram::kLogValueAmplifier);
     97 }
     98 
     99 inline int UserDict::extract_score_freq(int raw_score) {
    100   // Frequence stored in lowest 16 bits
    101   int freq = (raw_score & 0x0000ffff);
    102   return freq;
    103 }
    104 
    105 inline uint64 UserDict::extract_score_lmt(int raw_score) {
    106   uint64 lmt = ((raw_score & 0xffff0000) >> 16);
    107   if (kUserDictLMTBitWidth < 16) {
    108     uint64 mask = ~(1 << kUserDictLMTBitWidth);
    109     lmt &= mask;
    110   }
    111   lmt = lmt * kUserDictLMTGranularity + kUserDictLMTSince;
    112   return lmt;
    113 }
    114 
    115 inline int UserDict::build_score(uint64 lmt, int freq) {
    116   lmt = (lmt - kUserDictLMTSince) / kUserDictLMTGranularity;
    117   lmt = (lmt << (64 - kUserDictLMTBitWidth));
    118   lmt = (lmt >> (64 - kUserDictLMTBitWidth));
    119   uint16 lmt16 = (uint16)lmt;
    120   int s = freq;
    121   s &= 0x0000ffff;
    122   s = (lmt16 << 16) | s;
    123   return s;
    124 }
    125 
    126 inline int64 UserDict::utf16le_atoll(uint16 *s, int len) {
    127   int64 ret = 0;
    128   if (len <= 0)
    129     return ret;
    130 
    131   int flag = 1;
    132   const uint16 * endp = s + len;
    133   if (*s == '-') {
    134     flag = -1;
    135     s++;
    136   } else if (*s == '+') {
    137     s++;
    138   }
    139 
    140   while (*s >= '0' && *s <= '9' && s < endp) {
    141     ret += ret * 10 + (*s) - '0';
    142     s++;
    143   }
    144   return ret * flag;
    145 }
    146 
    147 inline int UserDict::utf16le_lltoa(int64 v, uint16 *s, int size) {
    148   if (!s || size <= 0)
    149     return 0;
    150   uint16 *endp = s + size;
    151   int ret_len = 0;
    152   if (v < 0) {
    153     *(s++) = '-';
    154     ++ret_len;
    155     v *= -1;
    156   }
    157 
    158   uint16 *b = s;
    159   while (s < endp && v != 0) {
    160     *(s++) = '0' + (v % 10);
    161     v = v / 10;
    162     ++ret_len;
    163   }
    164 
    165   if (v != 0)
    166     return 0;
    167 
    168   --s;
    169 
    170   while (b < s) {
    171     *b = *s;
    172     ++b, --s;
    173   }
    174 
    175   return ret_len;
    176 }
    177 
    178 inline void UserDict::set_lemma_flag(uint32 offset, uint8 flag) {
    179   offset &= kUserDictOffsetMask;
    180   lemmas_[offset] |= flag;
    181 }
    182 
    183 inline char UserDict::get_lemma_flag(uint32 offset) {
    184   offset &= kUserDictOffsetMask;
    185   return (char)(lemmas_[offset]);
    186 }
    187 
    188 inline char UserDict::get_lemma_nchar(uint32 offset) {
    189   offset &= kUserDictOffsetMask;
    190   return (char)(lemmas_[offset + 1]);
    191 }
    192 
    193 inline uint16 * UserDict::get_lemma_spell_ids(uint32 offset) {
    194   offset &= kUserDictOffsetMask;
    195   return (uint16 *)(lemmas_ + offset + 2);
    196 }
    197 
    198 inline uint16 * UserDict::get_lemma_word(uint32 offset) {
    199   offset &= kUserDictOffsetMask;
    200   uint8 nchar = get_lemma_nchar(offset);
    201   return (uint16 *)(lemmas_ + offset + 2 + (nchar << 1));
    202 }
    203 
    204 inline LemmaIdType UserDict::get_max_lemma_id() {
    205   // When a lemma is deleted, we don't not claim its id back for
    206   // simplicity and performance
    207   return start_id_ + dict_info_.lemma_count - 1;
    208 }
    209 
    210 inline bool UserDict::is_valid_lemma_id(LemmaIdType id) {
    211   if (id >= start_id_ && id <= get_max_lemma_id())
    212     return true;
    213   return false;
    214 }
    215 
    216 inline bool UserDict::is_valid_state() {
    217   if (state_ == USER_DICT_NONE)
    218     return false;
    219   return true;
    220 }
    221 
    222 UserDict::UserDict()
    223     : start_id_(0),
    224       version_(0),
    225       lemmas_(NULL),
    226       offsets_(NULL),
    227       scores_(NULL),
    228       ids_(NULL),
    229 #ifdef ___PREDICT_ENABLED___
    230       predicts_(NULL),
    231 #endif
    232 #ifdef ___SYNC_ENABLED___
    233       syncs_(NULL),
    234       sync_count_size_(0),
    235 #endif
    236       offsets_by_id_(NULL),
    237       lemma_count_left_(0),
    238       lemma_size_left_(0),
    239       dict_file_(NULL),
    240       state_(USER_DICT_NONE) {
    241   memset(&dict_info_, 0, sizeof(dict_info_));
    242   memset(&load_time_, 0, sizeof(load_time_));
    243 #ifdef ___CACHE_ENABLED___
    244   cache_init();
    245 #endif
    246 }
    247 
    248 UserDict::~UserDict() {
    249   close_dict();
    250 }
    251 
    252 bool UserDict::load_dict(const char *file_name, LemmaIdType start_id,
    253                          LemmaIdType end_id) {
    254 #ifdef ___DEBUG_PERF___
    255   DEBUG_PERF_BEGIN;
    256 #endif
    257   dict_file_ = strdup(file_name);
    258   if (!dict_file_)
    259     return false;
    260 
    261   start_id_ = start_id;
    262 
    263   if (false == validate(file_name) && false == reset(file_name)) {
    264     goto error;
    265   }
    266   if (false == load(file_name, start_id)) {
    267     goto error;
    268   }
    269 
    270   state_ = USER_DICT_SYNC;
    271 
    272   gettimeofday(&load_time_, NULL);
    273 
    274 #ifdef ___DEBUG_PERF___
    275   DEBUG_PERF_END;
    276   LOGD_PERF("load_dict");
    277 #endif
    278   return true;
    279  error:
    280   free((void*)dict_file_);
    281   start_id_ = 0;
    282   return false;
    283 }
    284 
    285 bool UserDict::close_dict() {
    286   if (state_ == USER_DICT_NONE)
    287     return true;
    288   if (state_ == USER_DICT_SYNC)
    289     goto out;
    290 
    291   // If dictionary is written back by others,
    292   // we can not simply write back here
    293   // To do a safe flush, we have to discard all newly added
    294   // lemmas and try to reload dict file.
    295   pthread_mutex_lock(&g_mutex_);
    296   if (load_time_.tv_sec > g_last_update_.tv_sec ||
    297     (load_time_.tv_sec == g_last_update_.tv_sec &&
    298      load_time_.tv_usec > g_last_update_.tv_usec)) {
    299     write_back();
    300     gettimeofday(&g_last_update_, NULL);
    301   }
    302   pthread_mutex_unlock(&g_mutex_);
    303 
    304  out:
    305   free((void*)dict_file_);
    306   free(lemmas_);
    307   free(offsets_);
    308   free(offsets_by_id_);
    309   free(scores_);
    310   free(ids_);
    311 #ifdef ___PREDICT_ENABLED___
    312   free(predicts_);
    313 #endif
    314 
    315   version_ = 0;
    316   dict_file_ = NULL;
    317   lemmas_ = NULL;
    318 #ifdef ___SYNC_ENABLED___
    319   syncs_ = NULL;
    320   sync_count_size_ = 0;
    321 #endif
    322   offsets_ = NULL;
    323   offsets_by_id_ = NULL;
    324   scores_ = NULL;
    325   ids_ = NULL;
    326 #ifdef ___PREDICT_ENABLED___
    327   predicts_ = NULL;
    328 #endif
    329 
    330   memset(&dict_info_, 0, sizeof(dict_info_));
    331   lemma_count_left_ = 0;
    332   lemma_size_left_ = 0;
    333   state_ = USER_DICT_NONE;
    334 
    335   return true;
    336 }
    337 
    338 size_t UserDict::number_of_lemmas() {
    339   return dict_info_.lemma_count;
    340 }
    341 
    342 void UserDict::reset_milestones(uint16 from_step, MileStoneHandle from_handle) {
    343   return;
    344 }
    345 
    346 MileStoneHandle UserDict::extend_dict(MileStoneHandle from_handle,
    347                                       const DictExtPara *dep,
    348                                       LmaPsbItem *lpi_items,
    349                                       size_t lpi_max, size_t *lpi_num) {
    350   if (is_valid_state() == false)
    351     return 0;
    352 
    353   bool need_extend = false;
    354 
    355 #ifdef ___DEBUG_PERF___
    356   DEBUG_PERF_BEGIN;
    357 #endif
    358   *lpi_num = _get_lpis(dep->splids, dep->splids_extended + 1,
    359                        lpi_items, lpi_max, &need_extend);
    360 #ifdef ___DEBUG_PERF___
    361   DEBUG_PERF_END;
    362   LOGD_PERF("extend_dict");
    363 #endif
    364   return ((*lpi_num > 0 || need_extend) ? 1 : 0);
    365 }
    366 
    367 int UserDict::is_fuzzy_prefix_spell_id(
    368     const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) {
    369   if (len1 < searchable->splids_len)
    370     return 0;
    371 
    372   SpellingTrie &spl_trie = SpellingTrie::get_instance();
    373   uint32 i = 0;
    374   for (i = 0; i < searchable->splids_len; i++) {
    375     const char py1 = *spl_trie.get_spelling_str(id1[i]);
    376     uint16 off = 8 * (i % 4);
    377     const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off);
    378     if (py1 == py2)
    379       continue;
    380     return 0;
    381   }
    382   return 1;
    383 }
    384 
    385 int UserDict::fuzzy_compare_spell_id(
    386     const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) {
    387   if (len1 < searchable->splids_len)
    388     return -1;
    389   if (len1 > searchable->splids_len)
    390     return 1;
    391 
    392   SpellingTrie &spl_trie = SpellingTrie::get_instance();
    393   uint32 i = 0;
    394   for (i = 0; i < len1; i++) {
    395     const char py1 = *spl_trie.get_spelling_str(id1[i]);
    396     uint16 off = 8 * (i % 4);
    397     const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off);
    398     if (py1 == py2)
    399       continue;
    400     if (py1 > py2)
    401       return 1;
    402     return -1;
    403   }
    404   return 0;
    405 }
    406 
    407 bool UserDict::is_prefix_spell_id(
    408     const uint16 * fullids, uint16 fulllen,
    409     const UserDictSearchable *searchable) {
    410   if (fulllen < searchable->splids_len)
    411     return false;
    412 
    413   uint32 i = 0;
    414   for (; i < searchable->splids_len; i++) {
    415     uint16 start_id = searchable->splid_start[i];
    416     uint16 count = searchable->splid_count[i];
    417     if (fullids[i] >= start_id && fullids[i] < start_id + count)
    418       continue;
    419     else
    420       return false;
    421   }
    422   return true;
    423 }
    424 
    425 bool UserDict::equal_spell_id(
    426     const uint16 * fullids, uint16 fulllen,
    427     const UserDictSearchable *searchable) {
    428   if (fulllen != searchable->splids_len)
    429     return false;
    430 
    431   uint32 i = 0;
    432   for (; i < fulllen; i++) {
    433     uint16 start_id = searchable->splid_start[i];
    434     uint16 count = searchable->splid_count[i];
    435     if (fullids[i] >= start_id && fullids[i] < start_id + count)
    436       continue;
    437     else
    438       return false;
    439   }
    440   return true;
    441 }
    442 
    443 int32 UserDict::locate_first_in_offsets(const UserDictSearchable * searchable) {
    444   int32 begin = 0;
    445   int32 end = dict_info_.lemma_count - 1;
    446   int32 middle = -1;
    447 
    448   int32 first_prefix = middle;
    449   int32 last_matched = middle;
    450 
    451   while (begin <= end) {
    452     middle = (begin + end) >> 1;
    453     uint32 offset = offsets_[middle];
    454     uint8 nchar = get_lemma_nchar(offset);
    455     const uint16 * splids = get_lemma_spell_ids(offset);
    456     int cmp = fuzzy_compare_spell_id(splids, nchar, searchable);
    457     int pre = is_fuzzy_prefix_spell_id(splids, nchar, searchable);
    458 
    459     if (pre)
    460       first_prefix = middle;
    461 
    462     if (cmp < 0) {
    463       begin = middle + 1;
    464     } else if (cmp > 0) {
    465       end = middle - 1;
    466     } else {
    467       end = middle - 1;
    468       last_matched = middle;
    469     }
    470   }
    471 
    472   return first_prefix;
    473 }
    474 
    475 void UserDict::prepare_locate(UserDictSearchable *searchable,
    476                              const uint16 *splid_str,
    477                              uint16 splid_str_len) {
    478   searchable->splids_len = splid_str_len;
    479   memset(searchable->signature, 0, sizeof(searchable->signature));
    480 
    481   SpellingTrie &spl_trie = SpellingTrie::get_instance();
    482   uint32 i = 0;
    483   for (; i < splid_str_len; i++) {
    484     if (spl_trie.is_half_id(splid_str[i])) {
    485       searchable->splid_count[i] =
    486           spl_trie.half_to_full(splid_str[i],
    487                                 &(searchable->splid_start[i]));
    488     } else {
    489       searchable->splid_count[i] = 1;
    490       searchable->splid_start[i] = splid_str[i];
    491     }
    492     const unsigned char py = *spl_trie.get_spelling_str(splid_str[i]);
    493     searchable->signature[i>>2] |= (py << (8 * (i % 4)));
    494   }
    495 }
    496 
    497 size_t UserDict::get_lpis(const uint16 *splid_str, uint16 splid_str_len,
    498                           LmaPsbItem *lpi_items, size_t lpi_max) {
    499   return _get_lpis(splid_str, splid_str_len, lpi_items, lpi_max, NULL);
    500 }
    501 
    502 size_t UserDict::_get_lpis(const uint16 *splid_str,
    503                            uint16 splid_str_len, LmaPsbItem *lpi_items,
    504                            size_t lpi_max, bool * need_extend) {
    505   bool tmp_extend;
    506   if (!need_extend)
    507     need_extend = &tmp_extend;
    508 
    509   *need_extend = false;
    510 
    511   if (is_valid_state() == false)
    512     return 0;
    513   if (lpi_max <= 0)
    514     return 0;
    515 
    516   if (0 == pthread_mutex_trylock(&g_mutex_)) {
    517     if (load_time_.tv_sec < g_last_update_.tv_sec ||
    518       (load_time_.tv_sec == g_last_update_.tv_sec &&
    519        load_time_.tv_usec < g_last_update_.tv_usec)) {
    520       // Others updated disk file, have to reload
    521       pthread_mutex_unlock(&g_mutex_);
    522       flush_cache();
    523     } else {
    524       pthread_mutex_unlock(&g_mutex_);
    525     }
    526   } else {
    527   }
    528 
    529   UserDictSearchable searchable;
    530   prepare_locate(&searchable, splid_str, splid_str_len);
    531 
    532   uint32 max_off = dict_info_.lemma_count;
    533 #ifdef ___CACHE_ENABLED___
    534   int32 middle;
    535   uint32 start, count;
    536   bool cached = cache_hit(&searchable, &start, &count);
    537   if (cached) {
    538     middle = start;
    539     max_off = start + count;
    540   } else {
    541     middle = locate_first_in_offsets(&searchable);
    542     start = middle;
    543   }
    544 #else
    545   int32 middle = locate_first_in_offsets(&searchable);
    546 #endif
    547 
    548   if (middle == -1) {
    549 #ifdef ___CACHE_ENABLED___
    550     if (!cached)
    551       cache_push(USER_DICT_MISS_CACHE, &searchable, 0, 0);
    552 #endif
    553     return 0;
    554   }
    555 
    556   size_t lpi_current = 0;
    557 
    558   bool fuzzy_break = false;
    559   bool prefix_break = false;
    560   while ((size_t)middle < max_off && !fuzzy_break && !prefix_break) {
    561     if (lpi_current >= lpi_max)
    562       break;
    563     uint32 offset = offsets_[middle];
    564     // Ignore deleted lemmas
    565     if (offset & kUserDictOffsetFlagRemove) {
    566       middle++;
    567       continue;
    568     }
    569     uint8 nchar = get_lemma_nchar(offset);
    570     uint16 * splids = get_lemma_spell_ids(offset);
    571 #ifdef ___CACHE_ENABLED___
    572     if (!cached && 0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) {
    573 #else
    574     if (0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) {
    575 #endif
    576       fuzzy_break = true;
    577     }
    578 
    579     if (prefix_break == false) {
    580       if (is_fuzzy_prefix_spell_id(splids, nchar, &searchable)) {
    581         if (*need_extend == false &&
    582             is_prefix_spell_id(splids, nchar, &searchable)) {
    583           *need_extend = true;
    584         }
    585       } else {
    586         prefix_break = true;
    587       }
    588     }
    589 
    590     if (equal_spell_id(splids, nchar, &searchable) == true) {
    591       lpi_items[lpi_current].psb = translate_score(scores_[middle]);
    592       lpi_items[lpi_current].id = ids_[middle];
    593       lpi_items[lpi_current].lma_len = nchar;
    594       lpi_current++;
    595     }
    596     middle++;
    597   }
    598 
    599 #ifdef ___CACHE_ENABLED___
    600   if (!cached) {
    601     count = middle - start;
    602     cache_push(USER_DICT_CACHE, &searchable, start, count);
    603   }
    604 #endif
    605 
    606   return lpi_current;
    607 }
    608 
    609 uint16 UserDict::get_lemma_str(LemmaIdType id_lemma, char16* str_buf,
    610                                uint16 str_max) {
    611   if (is_valid_state() == false)
    612     return 0;
    613   if (is_valid_lemma_id(id_lemma) == false)
    614     return 0;
    615   uint32 offset = offsets_by_id_[id_lemma - start_id_];
    616   uint8 nchar = get_lemma_nchar(offset);
    617   char16 * str = get_lemma_word(offset);
    618   uint16 m = nchar < str_max -1 ? nchar : str_max - 1;
    619   int i = 0;
    620   for (; i < m; i++) {
    621     str_buf[i] = str[i];
    622   }
    623   str_buf[i] = 0;
    624   return m;
    625 }
    626 
    627 uint16 UserDict::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
    628                                   uint16 splids_max, bool arg_valid) {
    629   if (is_valid_lemma_id(id_lemma) == false)
    630     return 0;
    631   uint32 offset = offsets_by_id_[id_lemma - start_id_];
    632   uint8 nchar = get_lemma_nchar(offset);
    633   const uint16 * ids = get_lemma_spell_ids(offset);
    634   int i = 0;
    635   for (; i < nchar && i < splids_max; i++)
    636     splids[i] = ids[i];
    637   return i;
    638 }
    639 
    640 size_t UserDict::predict(const char16 last_hzs[], uint16 hzs_len,
    641                          NPredictItem *npre_items, size_t npre_max,
    642                          size_t b4_used) {
    643   uint32 new_added = 0;
    644 #ifdef ___PREDICT_ENABLED___
    645   int32 end = dict_info_.lemma_count - 1;
    646   int j = locate_first_in_predicts((const uint16*)last_hzs, hzs_len);
    647   if (j == -1)
    648     return 0;
    649 
    650   while (j <= end) {
    651     uint32 offset = predicts_[j];
    652     // Ignore deleted lemmas
    653     if (offset & kUserDictOffsetFlagRemove) {
    654       j++;
    655       continue;
    656     }
    657     uint32 nchar = get_lemma_nchar(offset);
    658     uint16 * words = get_lemma_word(offset);
    659     uint16 * splids = get_lemma_spell_ids(offset);
    660 
    661     if (nchar <= hzs_len) {
    662       j++;
    663       continue;
    664     }
    665 
    666     if (memcmp(words, last_hzs, hzs_len << 1) == 0) {
    667       if (new_added >= npre_max) {
    668         return new_added;
    669       }
    670       uint32 cpy_len =
    671           (nchar < kMaxPredictSize ? (nchar << 1) : (kMaxPredictSize << 1))
    672           - (hzs_len << 1);
    673       npre_items[new_added].his_len = hzs_len;
    674       npre_items[new_added].psb = get_lemma_score(words, splids, nchar);
    675       memcpy(npre_items[new_added].pre_hzs, words + hzs_len, cpy_len);
    676       if ((cpy_len >> 1) < kMaxPredictSize) {
    677         npre_items[new_added].pre_hzs[cpy_len >> 1] = 0;
    678       }
    679       new_added++;
    680     } else {
    681       break;
    682     }
    683 
    684     j++;
    685   }
    686 #endif
    687   return new_added;
    688 }
    689 
    690 int32 UserDict::locate_in_offsets(char16 lemma_str[], uint16 splid_str[],
    691                                   uint16 lemma_len) {
    692   int32 max_off = dict_info_.lemma_count;
    693 
    694   UserDictSearchable searchable;
    695   prepare_locate(&searchable, splid_str, lemma_len);
    696 #ifdef ___CACHE_ENABLED___
    697   int32 off;
    698   uint32 start, count;
    699   bool cached = load_cache(&searchable, &start, &count);
    700   if (cached) {
    701     off = start;
    702     max_off = start + count;
    703   } else {
    704     off = locate_first_in_offsets(&searchable);
    705     start = off;
    706   }
    707 #else
    708   int32 off = locate_first_in_offsets(&searchable);
    709 #endif
    710 
    711   if (off == -1) {
    712     return off;
    713   }
    714 
    715   while (off < max_off) {
    716     uint32 offset = offsets_[off];
    717     if (offset & kUserDictOffsetFlagRemove) {
    718       off++;
    719       continue;
    720     }
    721     uint16 * splids = get_lemma_spell_ids(offset);
    722 #ifdef ___CACHE_ENABLED___
    723     if (!cached && 0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable))
    724       break;
    725 #else
    726     if (0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable))
    727       break;
    728 #endif
    729     if (equal_spell_id(splids, lemma_len, &searchable) == true) {
    730       uint16 * str = get_lemma_word(offset);
    731       uint32 i = 0;
    732       for (i = 0; i < lemma_len; i++) {
    733         if (str[i] == lemma_str[i])
    734           continue;
    735         break;
    736       }
    737       if (i < lemma_len) {
    738         off++;
    739         continue;
    740       }
    741 #ifdef ___CACHE_ENABLED___
    742       // No need to save_cache here, since current function is invoked by
    743       // put_lemma. It's rarely possible for a user input same lemma twice.
    744       // That means first time user type a new lemma, it is newly added into
    745       // user dictionary, then it's possible that user type the same lemma
    746       // again.
    747       // Another reason save_cache can not be invoked here is this function
    748       // aborts when lemma is found, and it never knows the count.
    749 #endif
    750       return off;
    751     }
    752     off++;
    753   }
    754 
    755   return -1;
    756 }
    757 
    758 #ifdef ___PREDICT_ENABLED___
    759 uint32 UserDict::locate_where_to_insert_in_predicts(
    760     const uint16 * words, int lemma_len) {
    761   int32 begin = 0;
    762   int32 end = dict_info_.lemma_count - 1;
    763   int32 middle = end;
    764 
    765   uint32 last_matched = middle;
    766 
    767   while (begin <= end) {
    768     middle = (begin + end) >> 1;
    769     uint32 offset = offsets_[middle];
    770     uint8 nchar = get_lemma_nchar(offset);
    771     const uint16 * ws = get_lemma_word(offset);
    772 
    773     uint32 minl = nchar < lemma_len ? nchar : lemma_len;
    774     uint32 k = 0;
    775     int cmp = 0;
    776 
    777     for (; k < minl; k++) {
    778       if (ws[k] < words[k]) {
    779         cmp = -1;
    780         break;
    781       } else if (ws[k] > words[k]) {
    782         cmp = 1;
    783         break;
    784       }
    785     }
    786     if (cmp == 0) {
    787       if (nchar < lemma_len)
    788         cmp = -1;
    789       else if (nchar > lemma_len)
    790         cmp = 1;
    791     }
    792 
    793     if (cmp < 0) {
    794       begin = middle + 1;
    795       last_matched = middle;
    796     } else if (cmp > 0) {
    797       end = middle - 1;
    798     } else {
    799       end = middle - 1;
    800       last_matched = middle;
    801     }
    802   }
    803 
    804   return last_matched;
    805 }
    806 
    807 int32 UserDict::locate_first_in_predicts(const uint16 * words, int lemma_len) {
    808   int32 begin = 0;
    809   int32 end = dict_info_.lemma_count - 1;
    810   int32 middle = -1;
    811 
    812   int32 last_matched = middle;
    813 
    814   while (begin <= end) {
    815     middle = (begin + end) >> 1;
    816     uint32 offset = offsets_[middle];
    817     uint8 nchar = get_lemma_nchar(offset);
    818     const uint16 * ws = get_lemma_word(offset);
    819 
    820     uint32 minl = nchar < lemma_len ? nchar : lemma_len;
    821     uint32 k = 0;
    822     int cmp = 0;
    823 
    824     for (; k < minl; k++) {
    825       if (ws[k] < words[k]) {
    826         cmp = -1;
    827         break;
    828       } else if (ws[k] > words[k]) {
    829         cmp = 1;
    830         break;
    831       }
    832     }
    833     if (cmp == 0) {
    834       if (nchar >= lemma_len)
    835         last_matched = middle;
    836       if (nchar < lemma_len)
    837         cmp = -1;
    838       else if (nchar > lemma_len)
    839         cmp = 1;
    840     }
    841 
    842     if (cmp < 0) {
    843       begin = middle + 1;
    844     } else if (cmp > 0) {
    845       end = middle - 1;
    846     } else {
    847       end = middle - 1;
    848     }
    849   }
    850 
    851   return last_matched;
    852 }
    853 
    854 #endif
    855 
    856 LemmaIdType UserDict::get_lemma_id(char16 lemma_str[], uint16 splids[],
    857                                    uint16 lemma_len) {
    858   int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
    859   if (off == -1) {
    860     return 0;
    861   }
    862 
    863   return ids_[off];
    864 }
    865 
    866 LmaScoreType UserDict::get_lemma_score(LemmaIdType lemma_id) {
    867   if (is_valid_state() == false)
    868     return 0;
    869   if (is_valid_lemma_id(lemma_id) == false)
    870     return 0;
    871 
    872   return translate_score(_get_lemma_score(lemma_id));
    873 }
    874 
    875 LmaScoreType UserDict::get_lemma_score(char16 lemma_str[], uint16 splids[],
    876                                 uint16 lemma_len) {
    877   if (is_valid_state() == false)
    878     return 0;
    879   return translate_score(_get_lemma_score(lemma_str, splids, lemma_len));
    880 }
    881 
    882 int UserDict::_get_lemma_score(LemmaIdType lemma_id) {
    883   if (is_valid_state() == false)
    884     return 0;
    885   if (is_valid_lemma_id(lemma_id) == false)
    886     return 0;
    887 
    888   uint32 offset = offsets_by_id_[lemma_id - start_id_];
    889 
    890   uint32 nchar = get_lemma_nchar(offset);
    891   uint16 * spl = get_lemma_spell_ids(offset);
    892   uint16 * wrd = get_lemma_word(offset);
    893 
    894   int32 off = locate_in_offsets(wrd, spl, nchar);
    895   if (off == -1) {
    896     return 0;
    897   }
    898 
    899   return scores_[off];
    900 }
    901 
    902 int UserDict::_get_lemma_score(char16 lemma_str[], uint16 splids[],
    903                                 uint16 lemma_len) {
    904   if (is_valid_state() == false)
    905     return 0;
    906 
    907   int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
    908   if (off == -1) {
    909     return 0;
    910   }
    911 
    912   return scores_[off];
    913 }
    914 
    915 #ifdef ___SYNC_ENABLED___
    916 void UserDict::remove_lemma_from_sync_list(uint32 offset) {
    917   offset &= kUserDictOffsetMask;
    918   uint32 i = 0;
    919   for (; i < dict_info_.sync_count; i++) {
    920     unsigned int off = (syncs_[i] & kUserDictOffsetMask);
    921     if (off == offset)
    922       break;
    923   }
    924   if (i < dict_info_.sync_count) {
    925     syncs_[i] = syncs_[dict_info_.sync_count - 1];
    926     dict_info_.sync_count--;
    927   }
    928 }
    929 #endif
    930 
    931 #ifdef ___PREDICT_ENABLED___
    932 void UserDict::remove_lemma_from_predict_list(uint32 offset) {
    933   offset &= kUserDictOffsetMask;
    934   uint32 i = 0;
    935   for (; i < dict_info_.lemma_count; i++) {
    936     unsigned int off = (predicts_[i] & kUserDictOffsetMask);
    937     if (off == offset) {
    938       predicts_[i] |= kUserDictOffsetFlagRemove;
    939       break;
    940     }
    941   }
    942 }
    943 #endif
    944 
    945 bool UserDict::remove_lemma_by_offset_index(int offset_index) {
    946   if (is_valid_state() == false)
    947     return 0;
    948 
    949   int32 off = offset_index;
    950   if (off == -1) {
    951     return false;
    952   }
    953 
    954   uint32 offset = offsets_[off];
    955   uint32 nchar = get_lemma_nchar(offset);
    956 
    957   offsets_[off] |= kUserDictOffsetFlagRemove;
    958 
    959 #ifdef ___SYNC_ENABLED___
    960   // Remove corresponding sync item
    961   remove_lemma_from_sync_list(offset);
    962 #endif
    963 
    964 #ifdef ___PREDICT_ENABLED___
    965   remove_lemma_from_predict_list(offset);
    966 #endif
    967   dict_info_.free_count++;
    968   dict_info_.free_size += (2 + (nchar << 2));
    969 
    970   if (state_ < USER_DICT_OFFSET_DIRTY)
    971     state_ = USER_DICT_OFFSET_DIRTY;
    972   return true;
    973 }
    974 
    975 bool UserDict::remove_lemma(LemmaIdType lemma_id) {
    976   if (is_valid_state() == false)
    977     return 0;
    978   if (is_valid_lemma_id(lemma_id) == false)
    979     return false;
    980   uint32 offset = offsets_by_id_[lemma_id - start_id_];
    981 
    982   uint32 nchar = get_lemma_nchar(offset);
    983   uint16 * spl = get_lemma_spell_ids(offset);
    984   uint16 * wrd = get_lemma_word(offset);
    985 
    986   int32 off = locate_in_offsets(wrd, spl, nchar);
    987 
    988   return remove_lemma_by_offset_index(off);
    989 }
    990 
    991 void UserDict::flush_cache() {
    992   LemmaIdType start_id = start_id_;
    993   const char * file = strdup(dict_file_);
    994   if (!file)
    995     return;
    996   close_dict();
    997   load_dict(file, start_id, kUserDictIdEnd);
    998   free((void*)file);
    999 #ifdef ___CACHE_ENABLED___
   1000   cache_init();
   1001 #endif
   1002   return;
   1003 }
   1004 
   1005 bool UserDict::reset(const char *file) {
   1006   FILE *fp = fopen(file, "w+");
   1007   if (!fp) {
   1008     return false;
   1009   }
   1010   uint32 version = kUserDictVersion;
   1011   size_t wred = fwrite(&version, 1, 4, fp);
   1012   UserDictInfo info;
   1013   memset(&info, 0, sizeof(info));
   1014   // By default, no limitation for lemma count and size
   1015   // thereby, reclaim_ratio is never used
   1016   wred += fwrite(&info, 1, sizeof(info), fp);
   1017   if (wred != sizeof(info) + sizeof(version)) {
   1018     fclose(fp);
   1019     unlink(file);
   1020     return false;
   1021   }
   1022   fclose(fp);
   1023   return true;
   1024 }
   1025 
   1026 bool UserDict::validate(const char *file) {
   1027   // b is ignored in POSIX compatible os including Linux
   1028   // while b is important flag for Windows to specify binary mode
   1029   FILE *fp = fopen(file, "rb");
   1030   if (!fp) {
   1031     return false;
   1032   }
   1033 
   1034   size_t size;
   1035   size_t readed;
   1036   uint32 version;
   1037   UserDictInfo dict_info;
   1038 
   1039   // validate
   1040   int err = fseek(fp, 0, SEEK_END);
   1041   if (err) {
   1042     goto error;
   1043   }
   1044 
   1045   size = ftell(fp);
   1046   if (size < 4 + sizeof(dict_info)) {
   1047     goto error;
   1048   }
   1049 
   1050   err = fseek(fp, 0, SEEK_SET);
   1051   if (err) {
   1052     goto error;
   1053   }
   1054 
   1055   readed = fread(&version, 1, sizeof(version), fp);
   1056   if (readed < sizeof(version)) {
   1057     goto error;
   1058   }
   1059   if (version != kUserDictVersion) {
   1060     goto error;
   1061   }
   1062 
   1063   err = fseek(fp, -1 * sizeof(dict_info), SEEK_END);
   1064   if (err) {
   1065     goto error;
   1066   }
   1067 
   1068   readed = fread(&dict_info, 1, sizeof(dict_info), fp);
   1069   if (readed != sizeof(dict_info)) {
   1070     goto error;
   1071   }
   1072 
   1073   if (size != get_dict_file_size(&dict_info)) {
   1074     goto error;
   1075   }
   1076 
   1077   fclose(fp);
   1078   return true;
   1079 
   1080  error:
   1081   fclose(fp);
   1082   return false;
   1083 }
   1084 
   1085 bool UserDict::load(const char *file, LemmaIdType start_id) {
   1086   if (0 != pthread_mutex_trylock(&g_mutex_)) {
   1087     return false;
   1088   }
   1089   // b is ignored in POSIX compatible os including Linux
   1090   // while b is important flag for Windows to specify binary mode
   1091   FILE *fp = fopen(file, "rb");
   1092   if (!fp) {
   1093     pthread_mutex_unlock(&g_mutex_);
   1094     return false;
   1095   }
   1096 
   1097   size_t readed, toread;
   1098   UserDictInfo dict_info;
   1099   uint8 *lemmas = NULL;
   1100   uint32 *offsets = NULL;
   1101 #ifdef ___SYNC_ENABLED___
   1102   uint32 *syncs = NULL;
   1103 #endif
   1104   uint32 *scores = NULL;
   1105   uint32 *ids = NULL;
   1106   uint32 *offsets_by_id = NULL;
   1107 #ifdef ___PREDICT_ENABLED___
   1108   uint32 *predicts = NULL;
   1109 #endif
   1110   size_t i;
   1111   int err;
   1112 
   1113   err = fseek(fp, -1 * sizeof(dict_info), SEEK_END);
   1114   if (err) goto error;
   1115 
   1116   readed = fread(&dict_info, 1, sizeof(dict_info), fp);
   1117   if (readed != sizeof(dict_info)) goto error;
   1118 
   1119   lemmas = (uint8 *)malloc(
   1120       dict_info.lemma_size +
   1121       (kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2))));
   1122 
   1123   if (!lemmas) goto error;
   1124 
   1125   offsets = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
   1126   if (!offsets) goto error;
   1127 
   1128 #ifdef ___PREDICT_ENABLED___
   1129   predicts = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
   1130   if (!predicts) goto error;
   1131 #endif
   1132 
   1133 #ifdef ___SYNC_ENABLED___
   1134   syncs = (uint32 *)malloc((dict_info.sync_count + kUserDictPreAlloc) << 2);
   1135   if (!syncs) goto error;
   1136 #endif
   1137 
   1138   scores = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
   1139   if (!scores) goto error;
   1140 
   1141   ids = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
   1142   if (!ids) goto error;
   1143 
   1144   offsets_by_id = (uint32 *)malloc(
   1145       (dict_info.lemma_count + kUserDictPreAlloc) << 2);
   1146   if (!offsets_by_id) goto error;
   1147 
   1148   err = fseek(fp, 4, SEEK_SET);
   1149   if (err) goto error;
   1150 
   1151   readed = 0;
   1152   while (readed < dict_info.lemma_size && !ferror(fp) && !feof(fp)) {
   1153     readed += fread(lemmas + readed, 1, dict_info.lemma_size - readed, fp);
   1154   }
   1155   if (readed < dict_info.lemma_size)
   1156     goto error;
   1157 
   1158   toread = (dict_info.lemma_count << 2);
   1159   readed = 0;
   1160   while (readed < toread && !ferror(fp) && !feof(fp)) {
   1161     readed += fread((((uint8*)offsets) + readed), 1, toread - readed, fp);
   1162   }
   1163   if (readed < toread)
   1164     goto error;
   1165 
   1166 #ifdef ___PREDICT_ENABLED___
   1167   toread = (dict_info.lemma_count << 2);
   1168   readed = 0;
   1169   while (readed < toread && !ferror(fp) && !feof(fp)) {
   1170     readed += fread((((uint8*)predicts) + readed), 1, toread - readed, fp);
   1171   }
   1172   if (readed < toread)
   1173     goto error;
   1174 #endif
   1175 
   1176   readed = 0;
   1177   while (readed < toread && !ferror(fp) && !feof(fp)) {
   1178     readed += fread((((uint8*)scores) + readed), 1, toread - readed, fp);
   1179   }
   1180   if (readed < toread)
   1181     goto error;
   1182 
   1183 #ifdef ___SYNC_ENABLED___
   1184   toread = (dict_info.sync_count << 2);
   1185   readed = 0;
   1186   while (readed < toread && !ferror(fp) && !feof(fp)) {
   1187     readed += fread((((uint8*)syncs) + readed), 1, toread - readed, fp);
   1188   }
   1189   if (readed < toread)
   1190     goto error;
   1191 #endif
   1192 
   1193   for (i = 0; i < dict_info.lemma_count; i++) {
   1194     ids[i] = start_id + i;
   1195     offsets_by_id[i] = offsets[i];
   1196   }
   1197 
   1198   lemmas_ = lemmas;
   1199   offsets_ = offsets;
   1200 #ifdef ___SYNC_ENABLED___
   1201   syncs_ = syncs;
   1202   sync_count_size_ = dict_info.sync_count + kUserDictPreAlloc;
   1203 #endif
   1204   offsets_by_id_ = offsets_by_id;
   1205   scores_ = scores;
   1206   ids_ = ids;
   1207 #ifdef ___PREDICT_ENABLED___
   1208   predicts_ = predicts;
   1209 #endif
   1210   lemma_count_left_ = kUserDictPreAlloc;
   1211   lemma_size_left_ = kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2));
   1212   memcpy(&dict_info_, &dict_info, sizeof(dict_info));
   1213   state_ = USER_DICT_SYNC;
   1214 
   1215   fclose(fp);
   1216 
   1217   pthread_mutex_unlock(&g_mutex_);
   1218   return true;
   1219 
   1220  error:
   1221   if (lemmas) free(lemmas);
   1222   if (offsets) free(offsets);
   1223 #ifdef ___SYNC_ENABLED___
   1224   if (syncs) free(syncs);
   1225 #endif
   1226   if (scores) free(scores);
   1227   if (ids) free(ids);
   1228   if (offsets_by_id) free(offsets_by_id);
   1229 #ifdef ___PREDICT_ENABLED___
   1230   if (predicts) free(predicts);
   1231 #endif
   1232   fclose(fp);
   1233   pthread_mutex_unlock(&g_mutex_);
   1234   return false;
   1235 }
   1236 
   1237 void UserDict::write_back() {
   1238   // XXX write back is only allowed from close_dict due to thread-safe sake
   1239   if (state_ == USER_DICT_NONE || state_ == USER_DICT_SYNC)
   1240     return;
   1241   int fd = open(dict_file_, O_WRONLY);
   1242   if (fd == -1)
   1243     return;
   1244   switch (state_) {
   1245     case USER_DICT_DEFRAGMENTED:
   1246       write_back_all(fd);
   1247       break;
   1248     case USER_DICT_LEMMA_DIRTY:
   1249       write_back_lemma(fd);
   1250       break;
   1251     case USER_DICT_OFFSET_DIRTY:
   1252       write_back_offset(fd);
   1253       break;
   1254     case USER_DICT_SCORE_DIRTY:
   1255       write_back_score(fd);
   1256       break;
   1257 #ifdef ___SYNC_ENABLED___
   1258     case USER_DICT_SYNC_DIRTY:
   1259       write_back_sync(fd);
   1260       break;
   1261 #endif
   1262     default:
   1263       break;
   1264   }
   1265   // It seems truncate is not need on Linux, Windows except Mac
   1266   // I am doing it here anyway for safety.
   1267   off_t cur = lseek(fd, 0, SEEK_CUR);
   1268   ftruncate(fd, cur);
   1269   close(fd);
   1270   state_ = USER_DICT_SYNC;
   1271 }
   1272 
   1273 #ifdef ___SYNC_ENABLED___
   1274 void UserDict::write_back_sync(int fd) {
   1275   int err = lseek(fd, 4 + dict_info_.lemma_size
   1276                   + (dict_info_.lemma_count << 3)
   1277 #ifdef ___PREDICT_ENABLED___
   1278                   + (dict_info_.lemma_count << 2)
   1279 #endif
   1280                   , SEEK_SET);
   1281   if (err == -1)
   1282     return;
   1283   write(fd, syncs_, dict_info_.sync_count << 2);
   1284   write(fd, &dict_info_, sizeof(dict_info_));
   1285 }
   1286 #endif
   1287 
   1288 void UserDict::write_back_offset(int fd) {
   1289   int err = lseek(fd, 4 + dict_info_.lemma_size, SEEK_SET);
   1290   if (err == -1)
   1291     return;
   1292   write(fd, offsets_, dict_info_.lemma_count << 2);
   1293 #ifdef ___PREDICT_ENABLED___
   1294   write(fd, predicts_, dict_info_.lemma_count << 2);
   1295 #endif
   1296   write(fd, scores_, dict_info_.lemma_count << 2);
   1297 #ifdef ___SYNC_ENABLED___
   1298   write(fd, syncs_, dict_info_.sync_count << 2);
   1299 #endif
   1300   write(fd, &dict_info_, sizeof(dict_info_));
   1301 }
   1302 
   1303 void UserDict::write_back_score(int fd) {
   1304   int err = lseek(fd, 4 + dict_info_.lemma_size
   1305                   + (dict_info_.lemma_count << 2)
   1306 #ifdef ___PREDICT_ENABLED___
   1307                   + (dict_info_.lemma_count << 2)
   1308 #endif
   1309                   , SEEK_SET);
   1310   if (err == -1)
   1311     return;
   1312   write(fd, scores_, dict_info_.lemma_count << 2);
   1313 #ifdef ___SYNC_ENABLED___
   1314   write(fd, syncs_, dict_info_.sync_count << 2);
   1315 #endif
   1316   write(fd, &dict_info_, sizeof(dict_info_));
   1317 }
   1318 
   1319 void UserDict::write_back_lemma(int fd) {
   1320   int err = lseek(fd, 4, SEEK_SET);
   1321   if (err == -1)
   1322     return;
   1323   // New lemmas are always appended, no need to write whole lemma block
   1324   size_t need_write = kUserDictPreAlloc *
   1325       (2 + (kUserDictAverageNchar << 2)) - lemma_size_left_;
   1326   err = lseek(fd, dict_info_.lemma_size - need_write, SEEK_CUR);
   1327   if (err == -1)
   1328     return;
   1329   write(fd, lemmas_ + dict_info_.lemma_size - need_write, need_write);
   1330 
   1331   write(fd, offsets_,  dict_info_.lemma_count << 2);
   1332 #ifdef ___PREDICT_ENABLED___
   1333   write(fd, predicts_,  dict_info_.lemma_count << 2);
   1334 #endif
   1335   write(fd, scores_, dict_info_.lemma_count << 2);
   1336 #ifdef ___SYNC_ENABLED___
   1337   write(fd, syncs_, dict_info_.sync_count << 2);
   1338 #endif
   1339   write(fd, &dict_info_, sizeof(dict_info_));
   1340 }
   1341 
   1342 void UserDict::write_back_all(int fd) {
   1343   // XXX lemma_size is handled differently in writeall
   1344   // and writelemma. I update lemma_size and lemma_count in different
   1345   // places for these two cases. Should fix it to make it consistent.
   1346   int err = lseek(fd, 4, SEEK_SET);
   1347   if (err == -1)
   1348     return;
   1349   write(fd, lemmas_, dict_info_.lemma_size);
   1350   write(fd, offsets_, dict_info_.lemma_count << 2);
   1351 #ifdef ___PREDICT_ENABLED___
   1352   write(fd, predicts_, dict_info_.lemma_count << 2);
   1353 #endif
   1354   write(fd, scores_, dict_info_.lemma_count << 2);
   1355 #ifdef ___SYNC_ENABLED___
   1356   write(fd, syncs_, dict_info_.sync_count << 2);
   1357 #endif
   1358   write(fd, &dict_info_, sizeof(dict_info_));
   1359 }
   1360 
   1361 #ifdef ___CACHE_ENABLED___
   1362 bool UserDict::load_cache(UserDictSearchable *searchable,
   1363                           uint32 *offset, uint32 *length) {
   1364   UserDictCache *cache = &caches_[searchable->splids_len - 1];
   1365   if (cache->head == cache->tail)
   1366     return false;
   1367 
   1368   uint16 j, sig_len = kMaxLemmaSize / 4;
   1369   uint16 i = cache->head;
   1370   while (1) {
   1371     j = 0;
   1372     for (; j < sig_len; j++) {
   1373       if (cache->signatures[i][j] != searchable->signature[j])
   1374         break;
   1375     }
   1376     if (j < sig_len) {
   1377       i++;
   1378       if (i >= kUserDictCacheSize)
   1379         i -= kUserDictCacheSize;
   1380       if (i == cache->tail)
   1381         break;
   1382       continue;
   1383     }
   1384     *offset = cache->offsets[i];
   1385     *length = cache->lengths[i];
   1386     return true;
   1387   }
   1388   return false;
   1389 }
   1390 
   1391 void UserDict::save_cache(UserDictSearchable *searchable,
   1392                           uint32 offset, uint32 length) {
   1393   UserDictCache *cache = &caches_[searchable->splids_len - 1];
   1394   uint16 next = cache->tail;
   1395 
   1396   cache->offsets[next] = offset;
   1397   cache->lengths[next] = length;
   1398   uint16 sig_len = kMaxLemmaSize / 4;
   1399   uint16 j = 0;
   1400   for (; j < sig_len; j++) {
   1401     cache->signatures[next][j] = searchable->signature[j];
   1402   }
   1403 
   1404   if (++next >= kUserDictCacheSize) {
   1405     next -= kUserDictCacheSize;
   1406   }
   1407   if (next == cache->head) {
   1408     cache->head++;
   1409     if (cache->head >= kUserDictCacheSize) {
   1410       cache->head -= kUserDictCacheSize;
   1411     }
   1412   }
   1413   cache->tail = next;
   1414 }
   1415 
   1416 void UserDict::reset_cache() {
   1417   memset(caches_, 0, sizeof(caches_));
   1418 }
   1419 
   1420 bool UserDict::load_miss_cache(UserDictSearchable *searchable) {
   1421   UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1];
   1422   if (cache->head == cache->tail)
   1423     return false;
   1424 
   1425   uint16 j, sig_len = kMaxLemmaSize / 4;
   1426   uint16 i = cache->head;
   1427   while (1) {
   1428     j = 0;
   1429     for (; j < sig_len; j++) {
   1430       if (cache->signatures[i][j] != searchable->signature[j])
   1431         break;
   1432     }
   1433     if (j < sig_len) {
   1434       i++;
   1435       if (i >= kUserDictMissCacheSize)
   1436         i -= kUserDictMissCacheSize;
   1437       if (i == cache->tail)
   1438         break;
   1439       continue;
   1440     }
   1441     return true;
   1442   }
   1443   return false;
   1444 }
   1445 
   1446 void UserDict::save_miss_cache(UserDictSearchable *searchable) {
   1447   UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1];
   1448   uint16 next = cache->tail;
   1449 
   1450   uint16 sig_len = kMaxLemmaSize / 4;
   1451   uint16 j = 0;
   1452   for (; j < sig_len; j++) {
   1453     cache->signatures[next][j] = searchable->signature[j];
   1454   }
   1455 
   1456   if (++next >= kUserDictMissCacheSize) {
   1457     next -= kUserDictMissCacheSize;
   1458   }
   1459   if (next == cache->head) {
   1460     cache->head++;
   1461     if (cache->head >= kUserDictMissCacheSize) {
   1462       cache->head -= kUserDictMissCacheSize;
   1463     }
   1464   }
   1465   cache->tail = next;
   1466 }
   1467 
   1468 void UserDict::reset_miss_cache() {
   1469   memset(miss_caches_, 0, sizeof(miss_caches_));
   1470 }
   1471 
   1472 void UserDict::cache_init() {
   1473   reset_cache();
   1474   reset_miss_cache();
   1475 }
   1476 
   1477 bool UserDict::cache_hit(UserDictSearchable *searchable,
   1478                          uint32 *offset, uint32 *length) {
   1479   bool hit = load_miss_cache(searchable);
   1480   if (hit) {
   1481     *offset = 0;
   1482     *length = 0;
   1483     return true;
   1484   }
   1485   hit = load_cache(searchable, offset, length);
   1486   if (hit) {
   1487     return true;
   1488   }
   1489   return false;
   1490 }
   1491 
   1492 void UserDict::cache_push(UserDictCacheType type,
   1493                          UserDictSearchable *searchable,
   1494                          uint32 offset, uint32 length) {
   1495   switch (type) {
   1496     case USER_DICT_MISS_CACHE:
   1497       save_miss_cache(searchable);
   1498       break;
   1499     case USER_DICT_CACHE:
   1500       save_cache(searchable, offset, length);
   1501       break;
   1502     default:
   1503       break;
   1504   }
   1505 }
   1506 
   1507 #endif
   1508 
   1509 void UserDict::defragment(void) {
   1510 #ifdef ___DEBUG_PERF___
   1511   DEBUG_PERF_BEGIN;
   1512 #endif
   1513   if (is_valid_state() == false)
   1514     return;
   1515   // Fixup offsets_, set REMOVE flag to lemma's flag if needed
   1516   size_t first_freed = 0;
   1517   size_t first_inuse = 0;
   1518   while (first_freed < dict_info_.lemma_count) {
   1519     // Find first freed offset
   1520     while ((offsets_[first_freed] & kUserDictOffsetFlagRemove) == 0 &&
   1521             first_freed < dict_info_.lemma_count) {
   1522       first_freed++;
   1523     }
   1524     if (first_freed < dict_info_.lemma_count) {
   1525       // Save REMOVE flag to lemma flag
   1526       int off = offsets_[first_freed];
   1527       set_lemma_flag(off, kUserDictLemmaFlagRemove);
   1528     } else {
   1529       break;
   1530     }
   1531     // Find first inuse offse after first_freed
   1532     first_inuse = first_freed + 1;
   1533     while ((offsets_[first_inuse] & kUserDictOffsetFlagRemove) &&
   1534            (first_inuse < dict_info_.lemma_count)) {
   1535       // Save REMOVE flag to lemma flag
   1536       int off = offsets_[first_inuse];
   1537       set_lemma_flag(off, kUserDictLemmaFlagRemove);
   1538       first_inuse++;
   1539     }
   1540     if (first_inuse >= dict_info_.lemma_count) {
   1541       break;
   1542     }
   1543     // Swap offsets_
   1544     int tmp = offsets_[first_inuse];
   1545     offsets_[first_inuse] = offsets_[first_freed];
   1546     offsets_[first_freed] = tmp;
   1547     // Move scores_, no need to swap
   1548     tmp = scores_[first_inuse];
   1549     scores_[first_inuse] = scores_[first_freed];
   1550     scores_[first_freed] = tmp;
   1551     // Swap ids_
   1552     LemmaIdType tmpid = ids_[first_inuse];
   1553     ids_[first_inuse] = ids_[first_freed];
   1554     ids_[first_freed] = tmpid;
   1555     // Go on
   1556     first_freed++;
   1557   }
   1558 #ifdef ___PREDICT_ENABLED___
   1559   // Fixup predicts_
   1560   first_freed = 0;
   1561   first_inuse = 0;
   1562   while (first_freed < dict_info_.lemma_count) {
   1563     // Find first freed offset
   1564     while ((predicts_[first_freed] & kUserDictOffsetFlagRemove) == 0 &&
   1565             first_freed < dict_info_.lemma_count) {
   1566       first_freed++;
   1567     }
   1568     if (first_freed >= dict_info_.lemma_count)
   1569       break;
   1570     // Find first inuse offse after first_freed
   1571     first_inuse = first_freed + 1;
   1572     while ((predicts_[first_inuse] & kUserDictOffsetFlagRemove)
   1573            && (first_inuse < dict_info_.lemma_count)) {
   1574       first_inuse++;
   1575     }
   1576     if (first_inuse >= dict_info_.lemma_count) {
   1577       break;
   1578     }
   1579     // Swap offsets_
   1580     int tmp = predicts_[first_inuse];
   1581     predicts_[first_inuse] = predicts_[first_freed];
   1582     predicts_[first_freed] = tmp;
   1583     // Go on
   1584     first_freed++;
   1585   }
   1586 #endif
   1587   dict_info_.lemma_count = first_freed;
   1588   // Fixup lemmas_
   1589   size_t begin = 0;
   1590   size_t end = 0;
   1591   size_t dst = 0;
   1592   int total_size = dict_info_.lemma_size + lemma_size_left_;
   1593   int total_count = dict_info_.lemma_count + lemma_count_left_;
   1594   size_t real_size = total_size - lemma_size_left_;
   1595   while (dst < real_size) {
   1596     unsigned char flag = get_lemma_flag(dst);
   1597     unsigned char nchr = get_lemma_nchar(dst);
   1598     if ((flag & kUserDictLemmaFlagRemove) == 0) {
   1599       dst += nchr * 4 + 2;
   1600       continue;
   1601     }
   1602     break;
   1603   }
   1604   if (dst >= real_size)
   1605     return;
   1606 
   1607   end = dst;
   1608   while (end < real_size) {
   1609     begin = end + get_lemma_nchar(end) * 4 + 2;
   1610  repeat:
   1611     // not used any more
   1612     if (begin >= real_size)
   1613       break;
   1614     unsigned char flag = get_lemma_flag(begin);
   1615     unsigned char nchr = get_lemma_nchar(begin);
   1616     if (flag & kUserDictLemmaFlagRemove) {
   1617       begin += nchr * 4 + 2;
   1618       goto repeat;
   1619     }
   1620     end = begin + nchr * 4 + 2;
   1621     while (end < real_size) {
   1622       unsigned char eflag = get_lemma_flag(end);
   1623       unsigned char enchr = get_lemma_nchar(end);
   1624       if ((eflag & kUserDictLemmaFlagRemove) == 0) {
   1625         end += enchr * 4 + 2;
   1626         continue;
   1627       }
   1628       break;
   1629     }
   1630     memmove(lemmas_ + dst, lemmas_ + begin, end - begin);
   1631     for (size_t j = 0; j < dict_info_.lemma_count; j++) {
   1632       if (offsets_[j] >= begin && offsets_[j] < end) {
   1633         offsets_[j] -= (begin - dst);
   1634         offsets_by_id_[ids_[j] - start_id_] = offsets_[j];
   1635       }
   1636 #ifdef ___PREDICT_ENABLED___
   1637       if (predicts_[j] >= begin && predicts_[j] < end) {
   1638         predicts_[j] -= (begin - dst);
   1639       }
   1640 #endif
   1641     }
   1642 #ifdef ___SYNC_ENABLED___
   1643     for (size_t j = 0; j < dict_info_.sync_count; j++) {
   1644       if (syncs_[j] >= begin && syncs_[j] < end) {
   1645         syncs_[j] -= (begin - dst);
   1646       }
   1647     }
   1648 #endif
   1649     dst += (end - begin);
   1650   }
   1651 
   1652   dict_info_.free_count = 0;
   1653   dict_info_.free_size = 0;
   1654   dict_info_.lemma_size = dst;
   1655   lemma_size_left_ = total_size - dict_info_.lemma_size;
   1656   lemma_count_left_ = total_count - dict_info_.lemma_count;
   1657 
   1658   // XXX Without following code,
   1659   // offsets_by_id_ is not reordered.
   1660   // That's to say, all removed lemmas' ids are not collected back.
   1661   // There may not be room for addition of new lemmas due to
   1662   // offsests_by_id_ reason, although lemma_size_left_ is fixed.
   1663   // By default, we do want defrag as fast as possible, because
   1664   // during defrag procedure, other peers can not write new lemmas
   1665   // to user dictionary file.
   1666   // XXX If write-back is invoked immediately after
   1667   // this defragment, no need to fix up following in-mem data.
   1668   for (uint32 i = 0; i < dict_info_.lemma_count; i++) {
   1669     ids_[i] = start_id_ + i;
   1670     offsets_by_id_[i] = offsets_[i];
   1671   }
   1672 
   1673   state_ = USER_DICT_DEFRAGMENTED;
   1674 
   1675 #ifdef ___DEBUG_PERF___
   1676   DEBUG_PERF_END;
   1677   LOGD_PERF("defragment");
   1678 #endif
   1679 }
   1680 
   1681 #ifdef ___SYNC_ENABLED___
   1682 void UserDict::clear_sync_lemmas(unsigned int start, unsigned int end) {
   1683   if (is_valid_state() == false)
   1684     return;
   1685   if (end > dict_info_.sync_count)
   1686     end = dict_info_.sync_count;
   1687   memmove(syncs_ + start, syncs_ + end, (dict_info_.sync_count - end) << 2);
   1688   dict_info_.sync_count -= (end - start);
   1689   if (state_ < USER_DICT_SYNC_DIRTY)
   1690     state_ = USER_DICT_SYNC_DIRTY;
   1691 }
   1692 
   1693 int UserDict::get_sync_count() {
   1694   if (is_valid_state() == false)
   1695     return 0;
   1696   return dict_info_.sync_count;
   1697 }
   1698 
   1699 LemmaIdType UserDict::put_lemma_no_sync(char16 lemma_str[], uint16 splids[],
   1700                         uint16 lemma_len, uint16 count, uint64 lmt) {
   1701   int again = 0;
   1702  begin:
   1703   LemmaIdType id;
   1704   uint32 * syncs_bak = syncs_;
   1705   syncs_ = NULL;
   1706   id = _put_lemma(lemma_str, splids, lemma_len, count, lmt);
   1707   syncs_ = syncs_bak;
   1708   if (id == 0 && again == 0) {
   1709     if ((dict_info_.limit_lemma_count > 0 &&
   1710         dict_info_.lemma_count >= dict_info_.limit_lemma_count)
   1711         || (dict_info_.limit_lemma_size > 0 &&
   1712             dict_info_.lemma_size + (2 + (lemma_len << 2))
   1713             > dict_info_.limit_lemma_size)) {
   1714       // XXX Always reclaim and defrag in sync code path
   1715       //     sync thread is background thread and ok with heavy work
   1716       reclaim();
   1717       defragment();
   1718       flush_cache();
   1719       again = 1;
   1720       goto begin;
   1721     }
   1722   }
   1723   return id;
   1724 }
   1725 
   1726 int UserDict::put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len) {
   1727   int newly_added = 0;
   1728 
   1729   SpellingParser * spl_parser = new SpellingParser();
   1730   if (!spl_parser) {
   1731     return 0;
   1732   }
   1733 #ifdef ___DEBUG_PERF___
   1734   DEBUG_PERF_BEGIN;
   1735 #endif
   1736   char16 *ptr = lemmas;
   1737 
   1738   // Extract pinyin,words,frequence,last_mod_time
   1739   char16 * p = ptr, * py16 = ptr;
   1740   char16 * hz16 = NULL;
   1741   int py16_len = 0;
   1742   uint16 splid[kMaxLemmaSize];
   1743   int splid_len = 0;
   1744   int hz16_len = 0;
   1745   char16 * fr16 = NULL;
   1746   int fr16_len = 0;
   1747 
   1748   while (p - ptr < len) {
   1749     // Pinyin
   1750     py16 = p;
   1751     splid_len = 0;
   1752     while (*p != 0x2c && (p - ptr) < len) {
   1753       if (*p == 0x20)
   1754         splid_len++;
   1755       p++;
   1756     }
   1757     splid_len++;
   1758     if (p - ptr == len)
   1759       break;
   1760     py16_len = p - py16;
   1761     if (kMaxLemmaSize < splid_len) {
   1762       break;
   1763     }
   1764     bool is_pre;
   1765     int splidl = spl_parser->splstr16_to_idxs_f(
   1766         py16, py16_len, splid, NULL, kMaxLemmaSize, is_pre);
   1767     if (splidl != splid_len)
   1768       break;
   1769     // Phrase
   1770     hz16 = ++p;
   1771     while (*p != 0x2c && (p - ptr) < len) {
   1772       p++;
   1773     }
   1774     hz16_len = p - hz16;
   1775     if (hz16_len != splid_len)
   1776       break;
   1777     // Frequency
   1778     fr16 = ++p;
   1779     fr16_len = 0;
   1780     while (*p != 0x2c && (p - ptr) < len) {
   1781       p++;
   1782     }
   1783     fr16_len = p - fr16;
   1784     uint32 intf = (uint32)utf16le_atoll(fr16, fr16_len);
   1785     // Last modified time
   1786     fr16 = ++p;
   1787     fr16_len = 0;
   1788     while (*p != 0x3b && (p - ptr) < len) {
   1789       p++;
   1790     }
   1791     fr16_len = p - fr16;
   1792     uint64 last_mod = utf16le_atoll(fr16, fr16_len);
   1793 
   1794     put_lemma_no_sync(hz16, splid, splid_len, intf, last_mod);
   1795     newly_added++;
   1796 
   1797     p++;
   1798   }
   1799 
   1800 #ifdef ___DEBUG_PERF___
   1801   DEBUG_PERF_END;
   1802   LOGD_PERF("put_lemmas_no_sync_from_utf16le_string");
   1803 #endif
   1804   return newly_added;
   1805 }
   1806 
   1807 int UserDict::get_sync_lemmas_in_utf16le_string_from_beginning(
   1808     char16 * str, int size, int * count) {
   1809   int len = 0;
   1810   *count = 0;
   1811 
   1812   int left_len = size;
   1813 
   1814   if (is_valid_state() == false)
   1815     return len;
   1816 
   1817   SpellingTrie * spl_trie = &SpellingTrie::get_instance();
   1818   if (!spl_trie) {
   1819     return 0;
   1820   }
   1821 
   1822   uint32 i;
   1823   for (i = 0; i < dict_info_.sync_count; i++) {
   1824     int offset = syncs_[i];
   1825     uint32 nchar = get_lemma_nchar(offset);
   1826     uint16 *spl = get_lemma_spell_ids(offset);
   1827     uint16 *wrd = get_lemma_word(offset);
   1828     int score = _get_lemma_score(wrd, spl, nchar);
   1829 
   1830     static char score_temp[32], *pscore_temp = score_temp;
   1831     static char16 temp[256], *ptemp = temp;
   1832 
   1833     pscore_temp = score_temp;
   1834     ptemp = temp;
   1835 
   1836     uint32 j;
   1837     // Add pinyin
   1838     for (j = 0; j < nchar; j++) {
   1839       int ret_len = spl_trie->get_spelling_str16(
   1840           spl[j], ptemp, temp + sizeof(temp) - ptemp);
   1841       if (ret_len <= 0)
   1842         break;
   1843       ptemp += ret_len;
   1844       if (ptemp < temp + sizeof(temp) - 1) {
   1845         *(ptemp++) = ' ';
   1846       } else {
   1847         j = 0;
   1848         break;
   1849       }
   1850     }
   1851     if (j < nchar) {
   1852       continue;
   1853     }
   1854     ptemp--;
   1855     if (ptemp < temp + sizeof(temp) - 1) {
   1856       *(ptemp++) = ',';
   1857     } else {
   1858       continue;
   1859     }
   1860     // Add phrase
   1861     for (j = 0; j < nchar; j++) {
   1862       if (ptemp < temp + sizeof(temp) - 1) {
   1863         *(ptemp++) = wrd[j];
   1864       } else {
   1865         break;
   1866       }
   1867     }
   1868     if (j < nchar) {
   1869       continue;
   1870     }
   1871     if (ptemp < temp + sizeof(temp) - 1) {
   1872       *(ptemp++) = ',';
   1873     } else {
   1874       continue;
   1875     }
   1876     // Add frequency
   1877     uint32 intf = extract_score_freq(score);
   1878     int ret_len = utf16le_lltoa(intf, ptemp, temp + sizeof(temp) - ptemp);
   1879     if (ret_len <= 0)
   1880       continue;
   1881     ptemp += ret_len;
   1882     if (ptemp < temp + sizeof(temp) - 1) {
   1883       *(ptemp++) = ',';
   1884     } else {
   1885       continue;
   1886     }
   1887     // Add last modified time
   1888     uint64 last_mod = extract_score_lmt(score);
   1889     ret_len = utf16le_lltoa(last_mod, ptemp, temp + sizeof(temp) - ptemp);
   1890     if (ret_len <= 0)
   1891       continue;
   1892     ptemp += ret_len;
   1893     if (ptemp < temp + sizeof(temp) - 1) {
   1894       *(ptemp++) = ';';
   1895     } else {
   1896       continue;
   1897     }
   1898 
   1899     // Write to string
   1900     int need_len = ptemp - temp;
   1901     if (need_len > left_len)
   1902       break;
   1903     memcpy(str + len, temp, need_len * 2);
   1904     left_len -= need_len;
   1905 
   1906     len += need_len;
   1907     (*count)++;
   1908   }
   1909 
   1910   if (len > 0) {
   1911     if (state_ < USER_DICT_SYNC_DIRTY)
   1912       state_ = USER_DICT_SYNC_DIRTY;
   1913   }
   1914   return len;
   1915 }
   1916 
   1917 #endif
   1918 
   1919 bool UserDict::state(UserDictStat * stat) {
   1920   if (is_valid_state() == false)
   1921     return false;
   1922   if (!stat)
   1923     return false;
   1924   stat->version = version_;
   1925   stat->file_name = dict_file_;
   1926   stat->load_time.tv_sec = load_time_.tv_sec;
   1927   stat->load_time.tv_usec = load_time_.tv_usec;
   1928   pthread_mutex_lock(&g_mutex_);
   1929   stat->last_update.tv_sec = g_last_update_.tv_sec;
   1930   stat->last_update.tv_usec = g_last_update_.tv_usec;
   1931   pthread_mutex_unlock(&g_mutex_);
   1932   stat->disk_size = get_dict_file_size(&dict_info_);
   1933   stat->lemma_count = dict_info_.lemma_count;
   1934   stat->lemma_size = dict_info_.lemma_size;
   1935   stat->delete_count = dict_info_.free_count;
   1936   stat->delete_size = dict_info_.free_size;
   1937 #ifdef ___SYNC_ENABLED___
   1938   stat->sync_count = dict_info_.sync_count;
   1939 #endif
   1940   stat->limit_lemma_count = dict_info_.limit_lemma_count;
   1941   stat->limit_lemma_size = dict_info_.limit_lemma_size;
   1942   stat->reclaim_ratio = dict_info_.reclaim_ratio;
   1943   return true;
   1944 }
   1945 
   1946 void UserDict::set_limit(uint32 max_lemma_count,
   1947                          uint32 max_lemma_size, uint32 reclaim_ratio) {
   1948   dict_info_.limit_lemma_count = max_lemma_count;
   1949   dict_info_.limit_lemma_size = max_lemma_size;
   1950   if (reclaim_ratio > 100)
   1951     reclaim_ratio = 100;
   1952   dict_info_.reclaim_ratio = reclaim_ratio;
   1953 }
   1954 
   1955 void UserDict::reclaim() {
   1956   if (is_valid_state() == false)
   1957     return;
   1958 
   1959   switch (dict_info_.reclaim_ratio) {
   1960     case 0:
   1961       return;
   1962     case 100:
   1963       // TODO: CLEAR to be implemented
   1964       assert(false);
   1965       return;
   1966     default:
   1967       break;
   1968   }
   1969 
   1970   // XXX Reclaim is only based on count, not size
   1971   uint32 count = dict_info_.lemma_count;
   1972   int rc = count * dict_info_.reclaim_ratio / 100;
   1973 
   1974   UserDictScoreOffsetPair * score_offset_pairs = NULL;
   1975   score_offset_pairs = (UserDictScoreOffsetPair *)malloc(
   1976       sizeof(UserDictScoreOffsetPair) * rc);
   1977   if (score_offset_pairs == NULL) {
   1978     return;
   1979   }
   1980 
   1981   for (int i = 0; i < rc; i++) {
   1982     int s = scores_[i];
   1983     score_offset_pairs[i].score = s;
   1984     score_offset_pairs[i].offset_index = i;
   1985   }
   1986 
   1987   for (int i = (rc + 1) / 2; i >= 0; i--)
   1988     shift_down(score_offset_pairs, i, rc);
   1989 
   1990   for (uint32 i = rc; i < dict_info_.lemma_count; i++) {
   1991     int s = scores_[i];
   1992     if (s < score_offset_pairs[0].score) {
   1993       score_offset_pairs[0].score = s;
   1994       score_offset_pairs[0].offset_index = i;
   1995       shift_down(score_offset_pairs, 0, rc);
   1996     }
   1997   }
   1998 
   1999   for (int i = 0; i < rc; i++) {
   2000     int off = score_offset_pairs[i].offset_index;
   2001     remove_lemma_by_offset_index(off);
   2002   }
   2003   if (rc > 0) {
   2004     if (state_ < USER_DICT_OFFSET_DIRTY)
   2005       state_ = USER_DICT_OFFSET_DIRTY;
   2006   }
   2007 
   2008   free(score_offset_pairs);
   2009 }
   2010 
   2011 inline void UserDict::swap(UserDictScoreOffsetPair * sop, int i, int j) {
   2012   int s = sop[i].score;
   2013   int p = sop[i].offset_index;
   2014   sop[i].score = sop[j].score;
   2015   sop[i].offset_index = sop[j].offset_index;
   2016   sop[j].score = s;
   2017   sop[j].offset_index = p;
   2018 }
   2019 
   2020 void UserDict::shift_down(UserDictScoreOffsetPair * sop, int i, int n) {
   2021   int par = i;
   2022   while (par < n) {
   2023     int left = par * 2 + 1;
   2024     int right = left + 1;
   2025     if (left >= n && right >= n)
   2026       break;
   2027     if (right >= n) {
   2028       if (sop[left].score > sop[par].score) {
   2029         swap(sop, left, par);
   2030         par = left;
   2031         continue;
   2032       }
   2033     } else if (sop[left].score > sop[right].score &&
   2034                sop[left].score > sop[par].score) {
   2035       swap(sop, left, par);
   2036       par = left;
   2037       continue;
   2038     } else if (sop[right].score > sop[left].score &&
   2039                sop[right].score > sop[par].score) {
   2040       swap(sop, right, par);
   2041       par = right;
   2042       continue;
   2043     }
   2044     break;
   2045   }
   2046 }
   2047 
   2048 LemmaIdType UserDict::put_lemma(char16 lemma_str[], uint16 splids[],
   2049                                 uint16 lemma_len, uint16 count) {
   2050   return _put_lemma(lemma_str, splids, lemma_len, count, time(NULL));
   2051 }
   2052 
   2053 LemmaIdType UserDict::_put_lemma(char16 lemma_str[], uint16 splids[],
   2054                                  uint16 lemma_len, uint16 count, uint64 lmt) {
   2055 #ifdef ___DEBUG_PERF___
   2056   DEBUG_PERF_BEGIN;
   2057 #endif
   2058   if (is_valid_state() == false)
   2059     return 0;
   2060   int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
   2061   if (off != -1) {
   2062     int delta_score = count - scores_[off];
   2063     dict_info_.total_nfreq += delta_score;
   2064     scores_[off] = build_score(lmt, count);
   2065     if (state_ < USER_DICT_SCORE_DIRTY)
   2066       state_ = USER_DICT_SCORE_DIRTY;
   2067 #ifdef ___DEBUG_PERF___
   2068     DEBUG_PERF_END;
   2069     LOGD_PERF("_put_lemma(update)");
   2070 #endif
   2071     return ids_[off];
   2072   } else {
   2073     if ((dict_info_.limit_lemma_count > 0 &&
   2074         dict_info_.lemma_count >= dict_info_.limit_lemma_count)
   2075         || (dict_info_.limit_lemma_size > 0 &&
   2076             dict_info_.lemma_size + (2 + (lemma_len << 2))
   2077             > dict_info_.limit_lemma_size)) {
   2078       // XXX Don't defragment here, it's too time-consuming.
   2079       return 0;
   2080     }
   2081     int flushed = 0;
   2082     if (lemma_count_left_ == 0 ||
   2083         lemma_size_left_ < (size_t)(2 + (lemma_len << 2))) {
   2084 
   2085       // XXX When there is no space for new lemma, we flush to disk
   2086       // flush_cache() may be called by upper user
   2087       // and better place shoule be found instead of here
   2088       flush_cache();
   2089       flushed = 1;
   2090       // Or simply return and do nothing
   2091       // return 0;
   2092     }
   2093 #ifdef ___DEBUG_PERF___
   2094     DEBUG_PERF_END;
   2095     LOGD_PERF(flushed ? "_put_lemma(flush+add)" : "_put_lemma(add)");
   2096 #endif
   2097     LemmaIdType id = append_a_lemma(lemma_str, splids, lemma_len, count, lmt);
   2098 #ifdef ___SYNC_ENABLED___
   2099     if (syncs_ && id != 0) {
   2100       queue_lemma_for_sync(id);
   2101     }
   2102 #endif
   2103     return id;
   2104   }
   2105   return 0;
   2106 }
   2107 
   2108 #ifdef ___SYNC_ENABLED___
   2109 void UserDict::queue_lemma_for_sync(LemmaIdType id) {
   2110   if (dict_info_.sync_count < sync_count_size_) {
   2111     syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_];
   2112   } else {
   2113     uint32 * syncs = (uint32*)realloc(
   2114         syncs_, (sync_count_size_ + kUserDictPreAlloc) << 2);
   2115     if (syncs) {
   2116       sync_count_size_ += kUserDictPreAlloc;
   2117       syncs_ = syncs;
   2118       syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_];
   2119     }
   2120   }
   2121 }
   2122 #endif
   2123 
   2124 LemmaIdType UserDict::update_lemma(LemmaIdType lemma_id, int16 delta_count,
   2125                                    bool selected) {
   2126 #ifdef ___DEBUG_PERF___
   2127   DEBUG_PERF_BEGIN;
   2128 #endif
   2129   if (is_valid_state() == false)
   2130     return 0;
   2131   if (is_valid_lemma_id(lemma_id) == false)
   2132     return 0;
   2133   uint32 offset = offsets_by_id_[lemma_id - start_id_];
   2134   uint8 lemma_len = get_lemma_nchar(offset);
   2135   char16 * lemma_str = get_lemma_word(offset);
   2136   uint16 * splids = get_lemma_spell_ids(offset);
   2137 
   2138   int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
   2139   if (off != -1) {
   2140     int score = scores_[off];
   2141     int count = extract_score_freq(score);
   2142     uint64 lmt = extract_score_lmt(score);
   2143     if (count + delta_count > kUserDictMaxFrequency ||
   2144         count + delta_count < count) {
   2145       delta_count = kUserDictMaxFrequency - count;
   2146     }
   2147     count += delta_count;
   2148     dict_info_.total_nfreq += delta_count;
   2149     if (selected) {
   2150       lmt = time(NULL);
   2151     }
   2152     scores_[off] = build_score(lmt, count);
   2153     if (state_ < USER_DICT_SCORE_DIRTY)
   2154       state_ = USER_DICT_SCORE_DIRTY;
   2155 #ifdef ___DEBUG_PERF___
   2156     DEBUG_PERF_END;
   2157     LOGD_PERF("update_lemma");
   2158 #endif
   2159 #ifdef ___SYNC_ENABLED___
   2160     queue_lemma_for_sync(ids_[off]);
   2161 #endif
   2162     return ids_[off];
   2163   }
   2164   return 0;
   2165 }
   2166 
   2167 size_t UserDict::get_total_lemma_count() {
   2168   return dict_info_.total_nfreq;
   2169 }
   2170 
   2171 void UserDict::set_total_lemma_count_of_others(size_t count) {
   2172   total_other_nfreq_ = count;
   2173 }
   2174 
   2175 LemmaIdType UserDict::append_a_lemma(char16 lemma_str[], uint16 splids[],
   2176                                    uint16 lemma_len, uint16 count, uint64 lmt) {
   2177   LemmaIdType id = get_max_lemma_id() + 1;
   2178   size_t offset = dict_info_.lemma_size;
   2179   if (offset > kUserDictOffsetMask)
   2180     return 0;
   2181 
   2182   lemmas_[offset] = 0;
   2183   lemmas_[offset + 1] = (uint8)lemma_len;
   2184   for (size_t i = 0; i < lemma_len; i++) {
   2185     *((uint16*)&lemmas_[offset + 2 + (i << 1)]) = splids[i];
   2186     *((char16*)&lemmas_[offset + 2 + (lemma_len << 1) + (i << 1)])
   2187         = lemma_str[i];
   2188   }
   2189   uint32 off = dict_info_.lemma_count;
   2190   offsets_[off] = offset;
   2191   scores_[off] = build_score(lmt, count);
   2192   ids_[off] = id;
   2193 #ifdef ___PREDICT_ENABLED___
   2194   predicts_[off] = offset;
   2195 #endif
   2196 
   2197   offsets_by_id_[id - start_id_] = offset;
   2198 
   2199   dict_info_.lemma_count++;
   2200   dict_info_.lemma_size += (2 + (lemma_len << 2));
   2201   lemma_count_left_--;
   2202   lemma_size_left_ -= (2 + (lemma_len << 2));
   2203 
   2204   // Sort
   2205 
   2206   UserDictSearchable searchable;
   2207   prepare_locate(&searchable, splids, lemma_len);
   2208 
   2209   size_t i = 0;
   2210   while (i < off) {
   2211     offset = offsets_[i];
   2212     uint32 nchar = get_lemma_nchar(offset);
   2213     uint16 * spl = get_lemma_spell_ids(offset);
   2214 
   2215     if (0 <= fuzzy_compare_spell_id(spl, nchar, &searchable))
   2216       break;
   2217     i++;
   2218   }
   2219   if (i != off) {
   2220     uint32 temp = offsets_[off];
   2221     memmove(offsets_ + i + 1, offsets_ + i, (off - i) << 2);
   2222     offsets_[i] = temp;
   2223 
   2224     temp = scores_[off];
   2225     memmove(scores_ + i + 1, scores_ + i, (off - i) << 2);
   2226     scores_[i] = temp;
   2227 
   2228     temp = ids_[off];
   2229     memmove(ids_ + i + 1, ids_ + i, (off - i) << 2);
   2230     ids_[i] = temp;
   2231   }
   2232 
   2233 #ifdef ___PREDICT_ENABLED___
   2234   uint32 j = 0;
   2235   uint16 * words_new = get_lemma_word(predicts_[off]);
   2236   j = locate_where_to_insert_in_predicts(words_new, lemma_len);
   2237   if (j != off) {
   2238     uint32 temp = predicts_[off];
   2239     memmove(predicts_ + j + 1, predicts_ + j, (off - j) << 2);
   2240     predicts_[j] = temp;
   2241   }
   2242 #endif
   2243 
   2244   if (state_ < USER_DICT_LEMMA_DIRTY)
   2245     state_ = USER_DICT_LEMMA_DIRTY;
   2246 
   2247 #ifdef ___CACHE_ENABLED___
   2248   cache_init();
   2249 #endif
   2250 
   2251   dict_info_.total_nfreq += count;
   2252   return id;
   2253 }
   2254 }
   2255