Home | History | Annotate | Download | only in share
      1 /*
      2  * Copyright (C) 2009 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include <assert.h>
     18 #include <math.h>
     19 #include <stdio.h>
     20 #include <string.h>
     21 #include "../include/lpicache.h"
     22 #include "../include/matrixsearch.h"
     23 #include "../include/mystdlib.h"
     24 #include "../include/ngram.h"
     25 #include "../include/userdict.h"
     26 
     27 namespace ime_pinyin {
     28 
     29 #define PRUMING_SCORE 8000.0
     30 
     31 MatrixSearch::MatrixSearch() {
     32   inited_ = false;
     33   spl_trie_ = SpellingTrie::get_cpinstance();
     34 
     35   reset_pointers_to_null();
     36 
     37   pys_decoded_len_ = 0;
     38   mtrx_nd_pool_used_ = 0;
     39   dmi_pool_used_ = 0;
     40   xi_an_enabled_ = false;
     41   dmi_c_phrase_ = false;
     42 
     43   assert(kMaxSearchSteps > 0);
     44   max_sps_len_ = kMaxSearchSteps - 1;
     45   max_hzs_len_ = kMaxSearchSteps;
     46 }
     47 
     48 MatrixSearch::~MatrixSearch() {
     49   free_resource();
     50 }
     51 
     52 void MatrixSearch::reset_pointers_to_null() {
     53   dict_trie_ = NULL;
     54   user_dict_ = NULL;
     55   spl_parser_ = NULL;
     56 
     57   share_buf_ = NULL;
     58 
     59   // The following four buffers are used for decoding, and they are based on
     60   // share_buf_, no need to delete them.
     61   mtrx_nd_pool_ = NULL;
     62   dmi_pool_ = NULL;
     63   matrix_ = NULL;
     64   dep_ = NULL;
     65 
     66   // Based on share_buf_, no need to delete them.
     67   npre_items_ = NULL;
     68 }
     69 
     70 bool MatrixSearch::alloc_resource() {
     71   free_resource();
     72 
     73   dict_trie_ = new DictTrie();
     74   user_dict_ = static_cast<AtomDictBase*>(new UserDict());
     75   spl_parser_ = new SpellingParser();
     76 
     77   size_t mtrx_nd_size = sizeof(MatrixNode) * kMtrxNdPoolSize;
     78   mtrx_nd_size = align_to_size_t(mtrx_nd_size) / sizeof(size_t);
     79   size_t dmi_size = sizeof(DictMatchInfo) * kDmiPoolSize;
     80   dmi_size = align_to_size_t(dmi_size) / sizeof(size_t);
     81   size_t matrix_size = sizeof(MatrixRow) * kMaxRowNum;
     82   matrix_size = align_to_size_t(matrix_size) / sizeof(size_t);
     83   size_t dep_size = sizeof(DictExtPara);
     84   dep_size = align_to_size_t(dep_size) / sizeof(size_t);
     85 
     86   // share_buf's size is determined by the buffers for search.
     87   share_buf_ = new size_t[mtrx_nd_size + dmi_size + matrix_size + dep_size];
     88 
     89   if (NULL == dict_trie_ || NULL == user_dict_ || NULL == spl_parser_ ||
     90       NULL == share_buf_)
     91     return false;
     92 
     93   // The buffers for search are based on the share buffer
     94   mtrx_nd_pool_ = reinterpret_cast<MatrixNode*>(share_buf_);
     95   dmi_pool_ = reinterpret_cast<DictMatchInfo*>(share_buf_ + mtrx_nd_size);
     96   matrix_ = reinterpret_cast<MatrixRow*>(share_buf_ + mtrx_nd_size + dmi_size);
     97   dep_ = reinterpret_cast<DictExtPara*>
     98       (share_buf_ + mtrx_nd_size + dmi_size + matrix_size);
     99 
    100   // The prediction buffer is also based on the share buffer.
    101   npre_items_ = reinterpret_cast<NPredictItem*>(share_buf_);
    102   npre_items_len_ = (mtrx_nd_size + dmi_size + matrix_size + dep_size) *
    103       sizeof(size_t) / sizeof(NPredictItem);
    104   return true;
    105 }
    106 
    107 void MatrixSearch::free_resource() {
    108   if (NULL != dict_trie_)
    109     delete dict_trie_;
    110 
    111   if (NULL != user_dict_)
    112     delete user_dict_;
    113 
    114   if (NULL != spl_parser_)
    115     delete spl_parser_;
    116 
    117   if (NULL != share_buf_)
    118     delete [] share_buf_;
    119 
    120   reset_pointers_to_null();
    121 }
    122 
    123 bool MatrixSearch::init(const char *fn_sys_dict, const char *fn_usr_dict) {
    124   if (NULL == fn_sys_dict || NULL == fn_usr_dict)
    125     return false;
    126 
    127   if (!alloc_resource())
    128     return false;
    129 
    130   if (!dict_trie_->load_dict(fn_sys_dict, 1, kSysDictIdEnd))
    131     return false;
    132 
    133   // If engine fails to load the user dictionary, reset the user dictionary
    134   // to NULL.
    135   if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) {
    136     delete user_dict_;
    137     user_dict_ = NULL;
    138   } else{
    139     user_dict_->set_total_lemma_count_of_others(NGram::kSysDictTotalFreq);
    140   }
    141 
    142   reset_search0();
    143 
    144   inited_ = true;
    145   return true;
    146 }
    147 
    148 bool MatrixSearch::init_fd(int sys_fd, long start_offset, long length,
    149                            const char *fn_usr_dict) {
    150   if (NULL == fn_usr_dict)
    151     return false;
    152 
    153   if (!alloc_resource())
    154     return false;
    155 
    156   if (!dict_trie_->load_dict_fd(sys_fd, start_offset, length, 1, kSysDictIdEnd))
    157     return false;
    158 
    159   if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) {
    160     delete user_dict_;
    161     user_dict_ = NULL;
    162   } else {
    163     user_dict_->set_total_lemma_count_of_others(NGram::kSysDictTotalFreq);
    164   }
    165 
    166   reset_search0();
    167 
    168   inited_ = true;
    169   return true;
    170 }
    171 
    172 void MatrixSearch::set_max_lens(size_t max_sps_len, size_t max_hzs_len) {
    173   if (0 != max_sps_len)
    174     max_sps_len_ = max_sps_len;
    175   if (0 != max_hzs_len)
    176     max_hzs_len_ = max_hzs_len;
    177 }
    178 
    179 void MatrixSearch::close() {
    180   flush_cache();
    181   free_resource();
    182   inited_ = false;
    183 }
    184 
    185 void MatrixSearch::flush_cache() {
    186   if (NULL != user_dict_)
    187     user_dict_->flush_cache();
    188 }
    189 
    190 void MatrixSearch::set_xi_an_switch(bool xi_an_enabled) {
    191   xi_an_enabled_ = xi_an_enabled;
    192 }
    193 
    194 bool MatrixSearch::get_xi_an_switch() {
    195   return xi_an_enabled_;
    196 }
    197 
    198 bool MatrixSearch::reset_search() {
    199   if (!inited_)
    200     return false;
    201   return reset_search0();
    202 }
    203 
    204 bool MatrixSearch::reset_search0() {
    205     if (!inited_)
    206         return false;
    207 
    208     pys_decoded_len_ = 0;
    209     mtrx_nd_pool_used_ = 0;
    210     dmi_pool_used_ = 0;
    211 
    212     // Get a MatrixNode from the pool
    213     matrix_[0].mtrx_nd_pos = mtrx_nd_pool_used_;
    214     matrix_[0].mtrx_nd_num = 1;
    215     mtrx_nd_pool_used_ += 1;
    216 
    217     // Update the node, and make it to be a starting node
    218     MatrixNode *node = mtrx_nd_pool_ + matrix_[0].mtrx_nd_pos;
    219     node->id = 0;
    220     node->score = 0;
    221     node->from = NULL;
    222     node->step = 0;
    223     node->dmi_fr = (PoolPosType)-1;
    224 
    225     matrix_[0].dmi_pos = 0;
    226     matrix_[0].dmi_num = 0;
    227     matrix_[0].dmi_has_full_id = 1;
    228     matrix_[0].mtrx_nd_fixed = node;
    229 
    230     lma_start_[0] = 0;
    231     fixed_lmas_ = 0;
    232     spl_start_[0] = 0;
    233     fixed_hzs_ = 0;
    234 
    235     dict_trie_->reset_milestones(0, 0);
    236     if (NULL != user_dict_)
    237       user_dict_->reset_milestones(0, 0);
    238 
    239     return true;
    240 }
    241 
    242 bool MatrixSearch::reset_search(size_t ch_pos, bool clear_fixed_this_step,
    243                                 bool clear_dmi_this_step,
    244                                 bool clear_mtrx_this_step) {
    245   if (!inited_ || ch_pos > pys_decoded_len_ || ch_pos >= kMaxRowNum)
    246     return false;
    247 
    248   if (0 == ch_pos) {
    249     reset_search0();
    250   } else {
    251     // Prepare mile stones of this step to clear.
    252     MileStoneHandle *dict_handles_to_clear = NULL;
    253     if (clear_dmi_this_step && matrix_[ch_pos].dmi_num > 0) {
    254       dict_handles_to_clear = dmi_pool_[matrix_[ch_pos].dmi_pos].dict_handles;
    255     }
    256 
    257     // If there are more steps, and this step is not allowed to clear, find
    258     // milestones of next step.
    259     if (pys_decoded_len_ > ch_pos && !clear_dmi_this_step) {
    260       dict_handles_to_clear = NULL;
    261       if (matrix_[ch_pos + 1].dmi_num > 0) {
    262         dict_handles_to_clear =
    263             dmi_pool_[matrix_[ch_pos + 1].dmi_pos].dict_handles;
    264       }
    265     }
    266 
    267     if (NULL != dict_handles_to_clear) {
    268       dict_trie_->reset_milestones(ch_pos, dict_handles_to_clear[0]);
    269       if (NULL != user_dict_)
    270         user_dict_->reset_milestones(ch_pos, dict_handles_to_clear[1]);
    271     }
    272 
    273     pys_decoded_len_ = ch_pos;
    274 
    275     if (clear_dmi_this_step) {
    276       dmi_pool_used_ = matrix_[ch_pos - 1].dmi_pos
    277                        + matrix_[ch_pos - 1].dmi_num;
    278       matrix_[ch_pos].dmi_num = 0;
    279     } else {
    280       dmi_pool_used_ = matrix_[ch_pos].dmi_pos + matrix_[ch_pos].dmi_num;
    281     }
    282 
    283     if (clear_mtrx_this_step) {
    284       mtrx_nd_pool_used_ = matrix_[ch_pos - 1].mtrx_nd_pos
    285                            + matrix_[ch_pos - 1].mtrx_nd_num;
    286       matrix_[ch_pos].mtrx_nd_num = 0;
    287     } else {
    288       mtrx_nd_pool_used_ = matrix_[ch_pos].mtrx_nd_pos
    289                            + matrix_[ch_pos].mtrx_nd_num;
    290     }
    291 
    292     // Modify fixed_hzs_
    293     if (fixed_hzs_ > 0 &&
    294         ((kLemmaIdComposing != lma_id_[0]) ||
    295          (kLemmaIdComposing == lma_id_[0] &&
    296           spl_start_[c_phrase_.length] <= ch_pos))) {
    297       size_t fixed_ch_pos = ch_pos;
    298       if (clear_fixed_this_step)
    299         fixed_ch_pos = fixed_ch_pos > 0 ? fixed_ch_pos - 1 : 0;
    300       while (NULL == matrix_[fixed_ch_pos].mtrx_nd_fixed && fixed_ch_pos > 0)
    301         fixed_ch_pos--;
    302 
    303       fixed_lmas_ = 0;
    304       fixed_hzs_ = 0;
    305       if (fixed_ch_pos > 0) {
    306         while (spl_start_[fixed_hzs_] < fixed_ch_pos)
    307           fixed_hzs_++;
    308         assert(spl_start_[fixed_hzs_] == fixed_ch_pos);
    309 
    310         while (lma_start_[fixed_lmas_] < fixed_hzs_)
    311           fixed_lmas_++;
    312         assert(lma_start_[fixed_lmas_] == fixed_hzs_);
    313       }
    314 
    315       // Re-search the Pinyin string for the unlocked lemma
    316       // which was previously fixed.
    317       //
    318       // Prepare mile stones of this step to clear.
    319       MileStoneHandle *dict_handles_to_clear = NULL;
    320       if (clear_dmi_this_step && ch_pos == fixed_ch_pos &&
    321           matrix_[fixed_ch_pos].dmi_num > 0) {
    322         dict_handles_to_clear = dmi_pool_[matrix_[fixed_ch_pos].dmi_pos].dict_handles;
    323       }
    324 
    325       // If there are more steps, and this step is not allowed to clear, find
    326       // milestones of next step.
    327       if (pys_decoded_len_ > fixed_ch_pos && !clear_dmi_this_step) {
    328         dict_handles_to_clear = NULL;
    329         if (matrix_[fixed_ch_pos + 1].dmi_num > 0) {
    330           dict_handles_to_clear =
    331               dmi_pool_[matrix_[fixed_ch_pos + 1].dmi_pos].dict_handles;
    332         }
    333       }
    334 
    335       if (NULL != dict_handles_to_clear) {
    336         dict_trie_->reset_milestones(fixed_ch_pos, dict_handles_to_clear[0]);
    337         if (NULL != user_dict_)
    338           user_dict_->reset_milestones(fixed_ch_pos, dict_handles_to_clear[1]);
    339       }
    340 
    341 
    342       pys_decoded_len_ = fixed_ch_pos;
    343 
    344       if (clear_dmi_this_step && ch_pos == fixed_ch_pos) {
    345         dmi_pool_used_ = matrix_[fixed_ch_pos - 1].dmi_pos
    346                          + matrix_[fixed_ch_pos - 1].dmi_num;
    347         matrix_[fixed_ch_pos].dmi_num = 0;
    348       } else {
    349         dmi_pool_used_ = matrix_[fixed_ch_pos].dmi_pos +
    350             matrix_[fixed_ch_pos].dmi_num;
    351       }
    352 
    353       if (clear_mtrx_this_step && ch_pos == fixed_ch_pos) {
    354         mtrx_nd_pool_used_ = matrix_[fixed_ch_pos - 1].mtrx_nd_pos
    355                              + matrix_[fixed_ch_pos - 1].mtrx_nd_num;
    356         matrix_[fixed_ch_pos].mtrx_nd_num = 0;
    357       } else {
    358         mtrx_nd_pool_used_ = matrix_[fixed_ch_pos].mtrx_nd_pos
    359                              + matrix_[fixed_ch_pos].mtrx_nd_num;
    360       }
    361 
    362       for (uint16 re_pos = fixed_ch_pos; re_pos < ch_pos; re_pos++) {
    363         add_char(pys_[re_pos]);
    364       }
    365     } else if (fixed_hzs_ > 0 && kLemmaIdComposing == lma_id_[0]) {
    366       for (uint16 subpos = 0; subpos < c_phrase_.sublma_num; subpos++) {
    367         uint16 splpos_begin = c_phrase_.sublma_start[subpos];
    368         uint16 splpos_end = c_phrase_.sublma_start[subpos + 1];
    369         for (uint16 splpos = splpos_begin; splpos < splpos_end; splpos++) {
    370           // If ch_pos is in this spelling
    371           uint16 spl_start = c_phrase_.spl_start[splpos];
    372           uint16 spl_end = c_phrase_.spl_start[splpos + 1];
    373           if (ch_pos >= spl_start && ch_pos < spl_end) {
    374             // Clear everything after this position
    375             c_phrase_.chn_str[splpos] = static_cast<char16>('\0');
    376             c_phrase_.sublma_start[subpos + 1] = splpos;
    377             c_phrase_.sublma_num = subpos + 1;
    378             c_phrase_.length = splpos;
    379 
    380             if (splpos == splpos_begin) {
    381               c_phrase_.sublma_num = subpos;
    382             }
    383           }
    384         }
    385       }
    386 
    387       // Extend the composing phrase.
    388       reset_search0();
    389       dmi_c_phrase_ = true;
    390       uint16 c_py_pos = 0;
    391       while (c_py_pos < spl_start_[c_phrase_.length]) {
    392         bool b_ac_tmp = add_char(pys_[c_py_pos]);
    393         assert(b_ac_tmp);
    394         c_py_pos++;
    395       }
    396       dmi_c_phrase_ = false;
    397 
    398       lma_id_num_ = 1;
    399       fixed_lmas_ = 1;
    400       fixed_lmas_no1_[0] = 0;  // A composing string is always modified.
    401       fixed_hzs_ = c_phrase_.length;
    402       lma_start_[1] = fixed_hzs_;
    403       lma_id_[0] = kLemmaIdComposing;
    404       matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
    405           matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
    406     }
    407   }
    408 
    409   return true;
    410 }
    411 
    412 void MatrixSearch::del_in_pys(size_t start, size_t len) {
    413   while (start < kMaxRowNum - len && '\0' != pys_[start]) {
    414     pys_[start] = pys_[start + len];
    415     start++;
    416   }
    417 }
    418 
    419 size_t MatrixSearch::search(const char *py, size_t py_len) {
    420   if (!inited_ || NULL == py)
    421     return 0;
    422 
    423   // If the search Pinyin string is too long, it will be truncated.
    424   if (py_len > kMaxRowNum - 1)
    425     py_len = kMaxRowNum - 1;
    426 
    427   // Compare the new string with the previous one. Find their prefix to
    428   // increase search efficiency.
    429   size_t ch_pos = 0;
    430   for (ch_pos = 0; ch_pos < pys_decoded_len_; ch_pos++) {
    431     if ('\0' == py[ch_pos] || py[ch_pos] != pys_[ch_pos])
    432       break;
    433   }
    434 
    435   bool clear_fix = true;
    436   if (ch_pos == pys_decoded_len_)
    437     clear_fix = false;
    438 
    439   reset_search(ch_pos, clear_fix, false, false);
    440 
    441   memcpy(pys_ + ch_pos, py + ch_pos, py_len - ch_pos);
    442   pys_[py_len] = '\0';
    443 
    444   while ('\0' != pys_[ch_pos]) {
    445     if (!add_char(py[ch_pos])) {
    446       pys_decoded_len_ = ch_pos;
    447       break;
    448     }
    449     ch_pos++;
    450   }
    451 
    452   // Get spelling ids and starting positions.
    453   get_spl_start_id();
    454 
    455   // If there are too many spellings, remove the last letter until the spelling
    456   // number is acceptable.
    457   while (spl_id_num_ > 9) {
    458     py_len--;
    459     reset_search(py_len, false, false, false);
    460     pys_[py_len] = '\0';
    461     get_spl_start_id();
    462   }
    463 
    464   prepare_candidates();
    465 
    466   if (kPrintDebug0) {
    467     printf("--Matrix Node Pool Used: %d\n", mtrx_nd_pool_used_);
    468     printf("--DMI Pool Used: %d\n", dmi_pool_used_);
    469 
    470     if (kPrintDebug1) {
    471       for (PoolPosType pos = 0; pos < dmi_pool_used_; pos++) {
    472         debug_print_dmi(pos, 1);
    473       }
    474     }
    475   }
    476 
    477   return ch_pos;
    478 }
    479 
    480 size_t MatrixSearch::delsearch(size_t pos, bool is_pos_in_splid,
    481                                bool clear_fixed_this_step) {
    482   if (!inited_)
    483     return 0;
    484 
    485   size_t reset_pos = pos;
    486 
    487   // Out of range for both Pinyin mode and Spelling id mode.
    488   if (pys_decoded_len_ <= pos) {
    489     del_in_pys(pos, 1);
    490 
    491     reset_pos = pys_decoded_len_;
    492     // Decode the string after the un-decoded position
    493     while ('\0' != pys_[reset_pos]) {
    494       if (!add_char(pys_[reset_pos])) {
    495         pys_decoded_len_ = reset_pos;
    496         break;
    497       }
    498       reset_pos++;
    499     }
    500     get_spl_start_id();
    501     prepare_candidates();
    502     return pys_decoded_len_;
    503   }
    504 
    505   // Spelling id mode, but out of range.
    506   if (is_pos_in_splid && spl_id_num_ <= pos)
    507     return pys_decoded_len_;
    508 
    509   // Begin to handle two modes respectively.
    510   // Pinyin mode by default
    511   size_t c_py_len = 0;  // The length of composing phrase's Pinyin
    512   size_t del_py_len = 1;
    513   if (!is_pos_in_splid) {
    514     // Pinyin mode is only allowed to delete beyond the fixed lemmas.
    515     if (fixed_lmas_ > 0 && pos < spl_start_[lma_start_[fixed_lmas_]])
    516       return pys_decoded_len_;
    517 
    518     del_in_pys(pos, 1);
    519 
    520     // If the deleted character is just the one after the last fixed lemma
    521     if (pos == spl_start_[lma_start_[fixed_lmas_]]) {
    522       // If all fixed lemmas have been merged, and the caller of the function
    523       // request to unlock the last fixed lemma.
    524       if (kLemmaIdComposing == lma_id_[0] && clear_fixed_this_step) {
    525         // Unlock the last sub lemma in the composing phrase. Because it is not
    526         // easy to unlock it directly. Instead, we re-decode the modified
    527         // composing phrase.
    528         c_phrase_.sublma_num--;
    529         c_phrase_.length = c_phrase_.sublma_start[c_phrase_.sublma_num];
    530         reset_pos = spl_start_[c_phrase_.length];
    531         c_py_len = reset_pos;
    532       }
    533     }
    534   } else {
    535     del_py_len = spl_start_[pos + 1] - spl_start_[pos];
    536 
    537     del_in_pys(spl_start_[pos], del_py_len);
    538 
    539     if (pos >= lma_start_[fixed_lmas_]) {
    540       c_py_len = 0;
    541       reset_pos = spl_start_[pos + 1] - del_py_len;
    542     } else {
    543       c_py_len = spl_start_[lma_start_[fixed_lmas_]] - del_py_len;
    544       reset_pos = c_py_len;
    545       if (c_py_len > 0)
    546         merge_fixed_lmas(pos);
    547     }
    548   }
    549 
    550   if (c_py_len > 0) {
    551     assert(c_phrase_.length > 0 && c_py_len ==
    552         c_phrase_.spl_start[c_phrase_.sublma_start[c_phrase_.sublma_num]]);
    553     // The composing phrase is valid, reset all search space,
    554     // and begin a new search which will only extend the composing
    555     // phrase.
    556     reset_search0();
    557 
    558     dmi_c_phrase_ = true;
    559     // Extend the composing phrase.
    560     uint16 c_py_pos = 0;
    561     while (c_py_pos < c_py_len) {
    562       bool b_ac_tmp = add_char(pys_[c_py_pos]);
    563       assert(b_ac_tmp);
    564       c_py_pos++;
    565     }
    566     dmi_c_phrase_ = false;
    567 
    568     // Fixd the composing phrase as the first choice.
    569     lma_id_num_ = 1;
    570     fixed_lmas_ = 1;
    571     fixed_lmas_no1_[0] = 0;  // A composing string is always modified.
    572     fixed_hzs_ = c_phrase_.length;
    573     lma_start_[1] = fixed_hzs_;
    574     lma_id_[0] = kLemmaIdComposing;
    575     matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
    576         matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
    577   } else {
    578     // Reseting search only clear pys_decoded_len_, but the string is kept.
    579     reset_search(reset_pos, clear_fixed_this_step, false, false);
    580   }
    581 
    582   // Decode the string after the delete position.
    583   while ('\0' != pys_[reset_pos]) {
    584     if (!add_char(pys_[reset_pos])) {
    585       pys_decoded_len_ = reset_pos;
    586       break;
    587     }
    588     reset_pos++;
    589   }
    590 
    591   get_spl_start_id();
    592   prepare_candidates();
    593   return pys_decoded_len_;
    594 }
    595 
    596 size_t MatrixSearch::get_candidate_num() {
    597   if (!inited_ || 0 == pys_decoded_len_ ||
    598       0 == matrix_[pys_decoded_len_].mtrx_nd_num)
    599     return 0;
    600 
    601   return 1 + lpi_total_;
    602 }
    603 
    604 char16* MatrixSearch::get_candidate(size_t cand_id, char16 *cand_str,
    605                                     size_t max_len) {
    606   if (!inited_ || 0 == pys_decoded_len_ || NULL == cand_str)
    607     return NULL;
    608 
    609   if (0 == cand_id) {
    610     return get_candidate0(cand_str, max_len, NULL, false);
    611   } else {
    612     cand_id--;
    613   }
    614 
    615   // For this case: the current sentence is a word only, and the user fixed it,
    616   // so the result will be fixed to the sentence space, and
    617   // lpi_total_ will be set to 0.
    618   if (0 == lpi_total_) {
    619     return get_candidate0(cand_str, max_len, NULL, false);
    620   }
    621 
    622   LemmaIdType id = lpi_items_[cand_id].id;
    623   char16 s[kMaxLemmaSize + 1];
    624 
    625   uint16 s_len = lpi_items_[cand_id].lma_len;
    626   if (s_len > 1) {
    627     s_len = get_lemma_str(id, s, kMaxLemmaSize + 1);
    628   } else {
    629     // For a single character, Hanzi is ready.
    630     s[0] = lpi_items_[cand_id].hanzi;
    631     s[1] = static_cast<char16>(0);
    632   }
    633 
    634   if (s_len > 0 &&  max_len > s_len) {
    635     utf16_strncpy(cand_str, s, s_len);
    636     cand_str[s_len] = (char16)'\0';
    637     return cand_str;
    638   }
    639 
    640   return NULL;
    641 }
    642 
    643 void MatrixSearch::update_dict_freq() {
    644   if (NULL != user_dict_) {
    645     // Update the total frequency of all lemmas, including system lemmas and
    646     // user dictionary lemmas.
    647     size_t total_freq = user_dict_->get_total_lemma_count();
    648     dict_trie_->set_total_lemma_count_of_others(total_freq);
    649   }
    650 }
    651 
    652 bool MatrixSearch::add_lma_to_userdict(uint16 lma_fr, uint16 lma_to,
    653                                        float score) {
    654   if (lma_to - lma_fr <= 1 || NULL == user_dict_)
    655     return false;
    656 
    657   char16 word_str[kMaxLemmaSize + 1];
    658   uint16 spl_ids[kMaxLemmaSize];
    659 
    660   uint16 spl_id_fr = 0;
    661 
    662   for (uint16 pos = lma_fr; pos < lma_to; pos++) {
    663     LemmaIdType lma_id = lma_id_[pos];
    664     if (is_user_lemma(lma_id)) {
    665       user_dict_->update_lemma(lma_id, 1, true);
    666     }
    667     uint16 lma_len = lma_start_[pos + 1] - lma_start_[pos];
    668     utf16_strncpy(spl_ids + spl_id_fr, spl_id_ + lma_start_[pos], lma_len);
    669 
    670     uint16 tmp = get_lemma_str(lma_id, word_str + spl_id_fr,
    671                                kMaxLemmaSize + 1 - spl_id_fr);
    672     assert(tmp == lma_len);
    673 
    674     tmp = get_lemma_splids(lma_id, spl_ids + spl_id_fr, lma_len, true);
    675     if (tmp != lma_len) {
    676       return false;
    677     }
    678 
    679     spl_id_fr += lma_len;
    680   }
    681 
    682   assert(spl_id_fr <= kMaxLemmaSize);
    683 
    684   return user_dict_->put_lemma(static_cast<char16*>(word_str), spl_ids,
    685                                  spl_id_fr, 1);
    686 }
    687 
    688 void MatrixSearch::debug_print_dmi(PoolPosType dmi_pos, uint16 nest_level) {
    689   if (dmi_pos >= dmi_pool_used_) return;
    690 
    691   DictMatchInfo *dmi = dmi_pool_ + dmi_pos;
    692 
    693   if (1 == nest_level) {
    694     printf("-----------------%d\'th DMI node begin----------->\n", dmi_pos);
    695   }
    696   if (dmi->dict_level > 1) {
    697     debug_print_dmi(dmi->dmi_fr, nest_level + 1);
    698   }
    699   printf("---%d\n", dmi->dict_level);
    700   printf(" MileStone: %x, %x\n", dmi->dict_handles[0], dmi->dict_handles[1]);
    701   printf(" Spelling : %s, %d\n", SpellingTrie::get_instance().
    702          get_spelling_str(dmi->spl_id), dmi->spl_id);
    703   printf(" Total Pinyin Len: %d\n", dmi->splstr_len);
    704   if (1 == nest_level) {
    705     printf("<----------------%d\'th DMI node end--------------\n\n", dmi_pos);
    706   }
    707 }
    708 
    709 bool MatrixSearch::try_add_cand0_to_userdict() {
    710   size_t new_cand_num = get_candidate_num();
    711   if (fixed_hzs_ > 0 && 1 == new_cand_num) {
    712     float score_from = 0;
    713     uint16 lma_id_from = 0;
    714     uint16 pos = 0;
    715     bool modified = false;
    716     while (pos < fixed_lmas_) {
    717       if (lma_start_[pos + 1] - lma_start_[lma_id_from] >
    718           static_cast<uint16>(kMaxLemmaSize)) {
    719         float score_to_add =
    720             mtrx_nd_pool_[matrix_[spl_start_[lma_start_[pos]]]
    721             .mtrx_nd_pos].score - score_from;
    722         if (modified) {
    723           score_to_add += 1.0;
    724           if (score_to_add > NGram::kMaxScore) {
    725             score_to_add = NGram::kMaxScore;
    726           }
    727           add_lma_to_userdict(lma_id_from, pos, score_to_add);
    728         }
    729         lma_id_from = pos;
    730         score_from += score_to_add;
    731 
    732         // Clear the flag for next user lemma.
    733         modified = false;
    734       }
    735 
    736       if (0 == fixed_lmas_no1_[pos]) {
    737         modified = true;
    738       }
    739       pos++;
    740     }
    741 
    742     // Single-char word is not allowed to add to userdict.
    743     if (lma_start_[pos] - lma_start_[lma_id_from] > 1) {
    744       float score_to_add =
    745           mtrx_nd_pool_[matrix_[spl_start_[lma_start_[pos]]]
    746           .mtrx_nd_pos].score - score_from;
    747       if (modified) {
    748         score_to_add += 1.0;
    749         if (score_to_add > NGram::kMaxScore) {
    750           score_to_add = NGram::kMaxScore;
    751         }
    752         add_lma_to_userdict(lma_id_from, pos, score_to_add);
    753       }
    754     }
    755   }
    756   return true;
    757 }
    758 
    759 // Choose a candidate, and give new candidates for next step.
    760 // If user finishes selection, we will try to communicate with user dictionary
    761 // to add new items or update score of some existing items.
    762 //
    763 // Basic rule:
    764 // 1. If user selects the first choice:
    765 //    1.1. If the first choice is not a sentence, instead, it is a lemma:
    766 //         1.1.1. If the first choice is a user lemma, notify the user
    767 //                dictionary that a user lemma is hit, and add occuring count
    768 //                by 1.
    769 //         1.1.2. If the first choice is a system lemma, do nothing.
    770 //    1.2. If the first choice is a sentence containing more than one lemma:
    771 //         1.2.1. The whole sentence will be added as a user lemma. If the
    772 //                sentence contains user lemmas, -> hit, and add occuring count
    773 //                by 1.
    774 size_t MatrixSearch::choose(size_t cand_id) {
    775   if (!inited_ || 0 == pys_decoded_len_)
    776     return 0;
    777 
    778   if (0 == cand_id) {
    779     fixed_hzs_ = spl_id_num_;
    780     matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ +
    781         matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos;
    782     for (size_t pos = fixed_lmas_; pos < lma_id_num_; pos++) {
    783       fixed_lmas_no1_[pos] = 1;
    784     }
    785     fixed_lmas_ = lma_id_num_;
    786     lpi_total_ = 0;  // Clean all other candidates.
    787 
    788     // 1. It is the first choice
    789     if (1 == lma_id_num_) {
    790       // 1.1. The first choice is not a sentence but a lemma
    791       if (is_user_lemma(lma_id_[0])) {
    792         // 1.1.1. The first choice is a user lemma, notify the user dictionary
    793         // that it is hit.
    794         if (NULL != user_dict_)
    795           user_dict_->update_lemma(lma_id_[0], 1, true);
    796       } else {
    797         // 1.1.2. do thing for a system lemma.
    798       }
    799     } else {
    800       // 1.2. The first choice is a sentence.
    801       // 1.2.1 Try to add the whole sentence to user dictionary, the whole
    802       // sentence may be splitted into many items.
    803       if (NULL != user_dict_) {
    804         try_add_cand0_to_userdict();
    805       }
    806     }
    807     update_dict_freq();
    808     return 1;
    809   } else {
    810     cand_id--;
    811   }
    812 
    813   // 2. It is not the full sentence candidate.
    814   // Find the length of the candidate.
    815   LemmaIdType id_chosen = lpi_items_[cand_id].id;
    816   LmaScoreType score_chosen = lpi_items_[cand_id].psb;
    817   size_t cand_len = lpi_items_[cand_id].lma_len;
    818 
    819   assert(cand_len > 0);
    820 
    821   // Notify the atom dictionary that this item is hit.
    822   if (is_user_lemma(id_chosen)) {
    823     if (NULL != user_dict_) {
    824       user_dict_->update_lemma(id_chosen, 1, true);
    825     }
    826     update_dict_freq();
    827   }
    828 
    829   // 3. Fixed the chosen item.
    830   // 3.1 Get the steps number.
    831   size_t step_fr = spl_start_[fixed_hzs_];
    832   size_t step_to = spl_start_[fixed_hzs_ + cand_len];
    833 
    834   // 3.2 Save the length of the original string.
    835   size_t pys_decoded_len = pys_decoded_len_;
    836 
    837   // 3.2 Reset the space of the fixed part.
    838   reset_search(step_to, false, false, true);
    839 
    840   // 3.3 For the last character of the fixed part, the previous DMI
    841   // information will be kept, while the MTRX information will be re-extended,
    842   // and only one node will be extended.
    843   matrix_[step_to].mtrx_nd_num = 0;
    844 
    845   LmaPsbItem lpi_item;
    846   lpi_item.psb = score_chosen;
    847   lpi_item.id = id_chosen;
    848 
    849   PoolPosType step_to_dmi_fr = match_dmi(step_to,
    850                                          spl_id_ + fixed_hzs_, cand_len);
    851   assert(step_to_dmi_fr != static_cast<PoolPosType>(-1));
    852 
    853   extend_mtrx_nd(matrix_[step_fr].mtrx_nd_fixed, &lpi_item, 1,
    854                  step_to_dmi_fr, step_to);
    855 
    856   matrix_[step_to].mtrx_nd_fixed = mtrx_nd_pool_ + matrix_[step_to].mtrx_nd_pos;
    857   mtrx_nd_pool_used_ = matrix_[step_to].mtrx_nd_pos +
    858                        matrix_[step_to].mtrx_nd_num;
    859 
    860   if (id_chosen == lma_id_[fixed_lmas_])
    861     fixed_lmas_no1_[fixed_lmas_] = 1;
    862   else
    863     fixed_lmas_no1_[fixed_lmas_] = 0;
    864   lma_id_[fixed_lmas_] = id_chosen;
    865   lma_start_[fixed_lmas_ + 1] = lma_start_[fixed_lmas_] + cand_len;
    866   fixed_lmas_++;
    867   fixed_hzs_ = fixed_hzs_ + cand_len;
    868 
    869   while (step_to != pys_decoded_len) {
    870     bool b = add_char(pys_[step_to]);
    871     assert(b);
    872     step_to++;
    873   }
    874 
    875   if (fixed_hzs_ < spl_id_num_) {
    876     prepare_candidates();
    877   } else {
    878     lpi_total_ = 0;
    879     if (NULL != user_dict_) {
    880       try_add_cand0_to_userdict();
    881     }
    882   }
    883 
    884   return get_candidate_num();
    885 }
    886 
    887 size_t MatrixSearch::cancel_last_choice() {
    888   if (!inited_ || 0 == pys_decoded_len_)
    889     return 0;
    890 
    891   size_t step_start = 0;
    892   if (fixed_hzs_ > 0) {
    893     size_t step_end = spl_start_[fixed_hzs_];
    894     MatrixNode *end_node = matrix_[step_end].mtrx_nd_fixed;
    895     assert(NULL != end_node);
    896 
    897     step_start = end_node->from->step;
    898 
    899     if (step_start > 0) {
    900       DictMatchInfo *dmi = dmi_pool_ + end_node->dmi_fr;
    901       fixed_hzs_ -= dmi->dict_level;
    902     } else {
    903       fixed_hzs_ = 0;
    904     }
    905 
    906     reset_search(step_start, false, false, false);
    907 
    908     while (pys_[step_start] != '\0') {
    909       bool b = add_char(pys_[step_start]);
    910       assert(b);
    911       step_start++;
    912     }
    913 
    914     prepare_candidates();
    915   }
    916   return get_candidate_num();
    917 }
    918 
    919 size_t MatrixSearch::get_fixedlen() {
    920   if (!inited_ || 0 == pys_decoded_len_)
    921     return 0;
    922   return fixed_hzs_;
    923 }
    924 
    925 bool MatrixSearch::prepare_add_char(char ch) {
    926   if (pys_decoded_len_ >= kMaxRowNum - 1 ||
    927       (!spl_parser_->is_valid_to_parse(ch) && ch != '\''))
    928     return false;
    929 
    930   if (dmi_pool_used_ >= kDmiPoolSize) return false;
    931 
    932   pys_[pys_decoded_len_] = ch;
    933   pys_decoded_len_++;
    934 
    935   MatrixRow *mtrx_this_row = matrix_ + pys_decoded_len_;
    936   mtrx_this_row->mtrx_nd_pos = mtrx_nd_pool_used_;
    937   mtrx_this_row->mtrx_nd_num = 0;
    938   mtrx_this_row->dmi_pos = dmi_pool_used_;
    939   mtrx_this_row->dmi_num = 0;
    940   mtrx_this_row->dmi_has_full_id = 0;
    941 
    942   return true;
    943 }
    944 
    945 bool MatrixSearch::is_split_at(uint16 pos) {
    946   return !spl_parser_->is_valid_to_parse(pys_[pos - 1]);
    947 }
    948 
    949 void MatrixSearch::fill_dmi(DictMatchInfo *dmi, MileStoneHandle *handles,
    950                             PoolPosType dmi_fr, uint16 spl_id,
    951                             uint16 node_num, unsigned char dict_level,
    952                             bool splid_end_split, unsigned char splstr_len,
    953                             unsigned char all_full_id) {
    954   dmi->dict_handles[0] = handles[0];
    955   dmi->dict_handles[1] = handles[1];
    956   dmi->dmi_fr = dmi_fr;
    957   dmi->spl_id = spl_id;
    958   dmi->dict_level = dict_level;
    959   dmi->splid_end_split = splid_end_split ? 1 : 0;
    960   dmi->splstr_len = splstr_len;
    961   dmi->all_full_id = all_full_id;
    962   dmi->c_phrase = 0;
    963 }
    964 
    965 bool MatrixSearch::add_char(char ch) {
    966   if (!prepare_add_char(ch))
    967     return false;
    968   return add_char_qwerty();
    969 }
    970 
    971 bool MatrixSearch::add_char_qwerty() {
    972   matrix_[pys_decoded_len_].mtrx_nd_num = 0;
    973 
    974   bool spl_matched = false;
    975   uint16 longest_ext = 0;
    976   // Extend the search matrix, from the oldest unfixed row. ext_len means
    977   // extending length.
    978   for (uint16 ext_len = kMaxPinyinSize + 1; ext_len > 0; ext_len--) {
    979     if (ext_len > pys_decoded_len_ - spl_start_[fixed_hzs_])
    980       continue;
    981 
    982     // Refer to the declaration of the variable dmi_has_full_id for the
    983     // explanation of this piece of code. In one word, it is used to prevent
    984     // from the unwise extending of "shoud ou" but allow the reasonable
    985     // extending of "heng ao", "lang a", etc.
    986     if (ext_len > 1 && 0 != longest_ext &&
    987         0 == matrix_[pys_decoded_len_ - ext_len].dmi_has_full_id) {
    988       if (xi_an_enabled_)
    989         continue;
    990       else
    991         break;
    992     }
    993 
    994     uint16 oldrow = pys_decoded_len_ - ext_len;
    995 
    996     // 0. If that row is before the last fixed step, ignore.
    997     if (spl_start_[fixed_hzs_] > oldrow)
    998       continue;
    999 
   1000     // 1. Check if that old row has valid MatrixNode. If no, means that row is
   1001     // not a boundary, either a word boundary or a spelling boundary.
   1002     // If it is for extending composing phrase, it's OK to ignore the 0.
   1003     if (0 == matrix_[oldrow].mtrx_nd_num && !dmi_c_phrase_)
   1004       continue;
   1005 
   1006     // 2. Get spelling id(s) for the last ext_len chars.
   1007     uint16 spl_idx;
   1008     bool is_pre = false;
   1009     spl_idx = spl_parser_->get_splid_by_str(pys_ + oldrow,
   1010                                             ext_len, &is_pre);
   1011     if (is_pre)
   1012       spl_matched = true;
   1013 
   1014     if (0 == spl_idx)
   1015       continue;
   1016 
   1017     bool splid_end_split = is_split_at(oldrow + ext_len);
   1018 
   1019     // 3. Extend the DMI nodes of that old row
   1020     // + 1 is to extend an extra node from the root
   1021     for (PoolPosType dmi_pos = matrix_[oldrow].dmi_pos;
   1022          dmi_pos < matrix_[oldrow].dmi_pos + matrix_[oldrow].dmi_num + 1;
   1023          dmi_pos++) {
   1024       DictMatchInfo *dmi = dmi_pool_ + dmi_pos;
   1025       if (dmi_pos == matrix_[oldrow].dmi_pos + matrix_[oldrow].dmi_num) {
   1026         dmi = NULL;  // The last one, NULL means extending from the root.
   1027       } else {
   1028         // If the dmi is covered by the fixed arrange, ignore it.
   1029         if (fixed_hzs_ > 0 &&
   1030             pys_decoded_len_ - ext_len - dmi->splstr_len <
   1031             spl_start_[fixed_hzs_]) {
   1032           continue;
   1033         }
   1034         // If it is not in mode for composing phrase, and the source DMI node
   1035         // is marked for composing phrase, ignore this node.
   1036         if (dmi->c_phrase != 0 && !dmi_c_phrase_) {
   1037           continue;
   1038         }
   1039       }
   1040 
   1041       // For example, if "gao" is extended, "g ao" is not allowed.
   1042       // or "zh" has been passed, "z h" is not allowed.
   1043       // Both word and word-connection will be prevented.
   1044       if (longest_ext > ext_len) {
   1045         if (NULL == dmi && 0 == matrix_[oldrow].dmi_has_full_id) {
   1046           continue;
   1047         }
   1048 
   1049         // "z h" is not allowed.
   1050         if (NULL != dmi && spl_trie_->is_half_id(dmi->spl_id)) {
   1051           continue;
   1052         }
   1053       }
   1054 
   1055       dep_->splids_extended = 0;
   1056       if (NULL != dmi) {
   1057         uint16 prev_ids_num = dmi->dict_level;
   1058         if ((!dmi_c_phrase_ && prev_ids_num >= kMaxLemmaSize) ||
   1059             (dmi_c_phrase_ && prev_ids_num >=  kMaxRowNum)) {
   1060           continue;
   1061         }
   1062 
   1063         DictMatchInfo *d = dmi;
   1064         while (d) {
   1065           dep_->splids[--prev_ids_num] = d->spl_id;
   1066           if ((PoolPosType)-1 == d->dmi_fr)
   1067             break;
   1068           d = dmi_pool_ + d->dmi_fr;
   1069         }
   1070         assert(0 == prev_ids_num);
   1071         dep_->splids_extended = dmi->dict_level;
   1072       }
   1073       dep_->splids[dep_->splids_extended] = spl_idx;
   1074       dep_->ext_len = ext_len;
   1075       dep_->splid_end_split = splid_end_split;
   1076 
   1077       dep_->id_num = 1;
   1078       dep_->id_start = spl_idx;
   1079       if (spl_trie_->is_half_id(spl_idx)) {
   1080         // Get the full id list
   1081         dep_->id_num = spl_trie_->half_to_full(spl_idx, &(dep_->id_start));
   1082         assert(dep_->id_num > 0);
   1083       }
   1084 
   1085       uint16 new_dmi_num;
   1086 
   1087       new_dmi_num = extend_dmi(dep_, dmi);
   1088 
   1089       if (new_dmi_num > 0) {
   1090         if (dmi_c_phrase_) {
   1091           dmi_pool_[dmi_pool_used_].c_phrase = 1;
   1092         }
   1093         matrix_[pys_decoded_len_].dmi_num += new_dmi_num;
   1094         dmi_pool_used_ += new_dmi_num;
   1095 
   1096         if (!spl_trie_->is_half_id(spl_idx))
   1097           matrix_[pys_decoded_len_].dmi_has_full_id = 1;
   1098       }
   1099 
   1100       // If get candiate lemmas, try to extend the path
   1101       if (lpi_total_ > 0) {
   1102         uint16 fr_row;
   1103         if (NULL == dmi) {
   1104           fr_row = oldrow;
   1105         } else {
   1106           assert(oldrow >= dmi->splstr_len);
   1107           fr_row = oldrow - dmi->splstr_len;
   1108         }
   1109         for (PoolPosType mtrx_nd_pos = matrix_[fr_row].mtrx_nd_pos;
   1110              mtrx_nd_pos < matrix_[fr_row].mtrx_nd_pos +
   1111              matrix_[fr_row].mtrx_nd_num;
   1112              mtrx_nd_pos++) {
   1113           MatrixNode *mtrx_nd = mtrx_nd_pool_ + mtrx_nd_pos;
   1114 
   1115           extend_mtrx_nd(mtrx_nd, lpi_items_, lpi_total_,
   1116                          dmi_pool_used_ - new_dmi_num, pys_decoded_len_);
   1117           if (longest_ext == 0)
   1118             longest_ext = ext_len;
   1119         }
   1120       }
   1121     }  // for dmi_pos
   1122   }  // for ext_len
   1123   mtrx_nd_pool_used_ += matrix_[pys_decoded_len_].mtrx_nd_num;
   1124 
   1125   if (dmi_c_phrase_)
   1126     return true;
   1127 
   1128   return (matrix_[pys_decoded_len_].mtrx_nd_num != 0 || spl_matched);
   1129 }
   1130 
   1131 void MatrixSearch::prepare_candidates() {
   1132   // Get candiates from the first un-fixed step.
   1133   uint16 lma_size_max = kMaxLemmaSize;
   1134   if (lma_size_max > spl_id_num_ - fixed_hzs_)
   1135     lma_size_max = spl_id_num_ - fixed_hzs_;
   1136 
   1137   uint16 lma_size = lma_size_max;
   1138 
   1139   // If the full sentense candidate's unfixed part may be the same with a normal
   1140   // lemma. Remove the lemma candidate in this case.
   1141   char16 fullsent[kMaxLemmaSize + 1];
   1142   char16 *pfullsent = NULL;
   1143   uint16 sent_len;
   1144   pfullsent = get_candidate0(fullsent, kMaxLemmaSize + 1, &sent_len, true);
   1145 
   1146   // If the unfixed part contains more than one ids, it is not necessary to
   1147   // check whether a lemma's string is the same to the unfixed part of the full
   1148   // sentence candidate, so, set it to NULL;
   1149   if (sent_len > kMaxLemmaSize)
   1150     pfullsent = NULL;
   1151 
   1152   lpi_total_ = 0;
   1153   size_t lpi_num_full_match = 0;  // Number of items which are fully-matched.
   1154   while (lma_size > 0) {
   1155     size_t lma_num;
   1156     lma_num = get_lpis(spl_id_ + fixed_hzs_, lma_size,
   1157                        lpi_items_ + lpi_total_,
   1158                        size_t(kMaxLmaPsbItems - lpi_total_),
   1159                        pfullsent, lma_size == lma_size_max);
   1160 
   1161     if (lma_num > 0) {
   1162       lpi_total_ += lma_num;
   1163       // For next lemma candidates which are not the longest, it is not
   1164       // necessary to compare with the full sentence candiate.
   1165       pfullsent = NULL;
   1166     }
   1167     if (lma_size == lma_size_max) {
   1168       lpi_num_full_match = lpi_total_;
   1169     }
   1170     lma_size--;
   1171   }
   1172 
   1173   // Sort those partially-matched items by their unified scores.
   1174   myqsort(lpi_items_ + lpi_num_full_match, lpi_total_ - lpi_num_full_match,
   1175           sizeof(LmaPsbItem), cmp_lpi_with_unified_psb);
   1176 
   1177   if (kPrintDebug0) {
   1178     printf("-----Prepare candidates, score:\n");
   1179     for (size_t a = 0; a < lpi_total_; a++) {
   1180       printf("[%03d]%d    ", a, lpi_items_[a].psb);
   1181       if ((a + 1) % 6 == 0) printf("\n");
   1182     }
   1183     printf("\n");
   1184   }
   1185 
   1186   if (kPrintDebug0) {
   1187     printf("--- lpi_total_ = %d\n", lpi_total_);
   1188   }
   1189 }
   1190 
   1191 const char* MatrixSearch::get_pystr(size_t *decoded_len) {
   1192   if (!inited_ || NULL == decoded_len)
   1193     return NULL;
   1194 
   1195   *decoded_len = pys_decoded_len_;
   1196   return pys_;
   1197 }
   1198 
   1199 void MatrixSearch::merge_fixed_lmas(size_t del_spl_pos) {
   1200   if (fixed_lmas_ == 0)
   1201     return;
   1202   // Update spelling segmentation information first.
   1203   spl_id_num_ -= 1;
   1204   uint16 del_py_len = spl_start_[del_spl_pos + 1] - spl_start_[del_spl_pos];
   1205   for (size_t pos = del_spl_pos; pos <= spl_id_num_; pos++) {
   1206     spl_start_[pos] = spl_start_[pos + 1] - del_py_len;
   1207     if (pos == spl_id_num_)
   1208       break;
   1209     spl_id_[pos] = spl_id_[pos + 1];
   1210   }
   1211 
   1212   // Begin to merge.
   1213   uint16 phrase_len = 0;
   1214 
   1215   // Update the spelling ids to the composing phrase.
   1216   // We need to convert these ids into full id in the future.
   1217   memcpy(c_phrase_.spl_ids, spl_id_, spl_id_num_ * sizeof(uint16));
   1218   memcpy(c_phrase_.spl_start, spl_start_, (spl_id_num_ + 1) * sizeof(uint16));
   1219 
   1220   // If composing phrase has not been created, first merge all fixed
   1221   //  lemmas into a composing phrase without deletion.
   1222   if (fixed_lmas_ > 1 || kLemmaIdComposing != lma_id_[0]) {
   1223     uint16 bp = 1;  // Begin position of real fixed lemmas.
   1224     // There is no existing composing phrase.
   1225     if (kLemmaIdComposing != lma_id_[0]) {
   1226       c_phrase_.sublma_num = 0;
   1227       bp = 0;
   1228     }
   1229 
   1230     uint16 sub_num = c_phrase_.sublma_num;
   1231     for (uint16 pos = bp; pos <= fixed_lmas_; pos++) {
   1232       c_phrase_.sublma_start[sub_num + pos - bp] = lma_start_[pos];
   1233       if (lma_start_[pos] > del_spl_pos) {
   1234         c_phrase_.sublma_start[sub_num + pos - bp] -= 1;
   1235       }
   1236 
   1237       if (pos == fixed_lmas_)
   1238         break;
   1239 
   1240       uint16 lma_len;
   1241       char16 *lma_str = c_phrase_.chn_str +
   1242           c_phrase_.sublma_start[sub_num] + phrase_len;
   1243 
   1244       lma_len = get_lemma_str(lma_id_[pos], lma_str, kMaxRowNum - phrase_len);
   1245       assert(lma_len == lma_start_[pos + 1] - lma_start_[pos]);
   1246       phrase_len += lma_len;
   1247     }
   1248     assert(phrase_len == lma_start_[fixed_lmas_]);
   1249     c_phrase_.length = phrase_len;  // will be deleted by 1
   1250     c_phrase_.sublma_num += fixed_lmas_ - bp;
   1251   } else {
   1252     for (uint16 pos = 0; pos <= c_phrase_.sublma_num; pos++) {
   1253       if (c_phrase_.sublma_start[pos] > del_spl_pos) {
   1254         c_phrase_.sublma_start[pos] -= 1;
   1255       }
   1256     }
   1257     phrase_len = c_phrase_.length;
   1258   }
   1259 
   1260   assert(phrase_len > 0);
   1261   if (1 == phrase_len) {
   1262     // After the only one is deleted, nothing will be left.
   1263     fixed_lmas_ = 0;
   1264     return;
   1265   }
   1266 
   1267   // Delete the Chinese character in the merged phrase.
   1268   // The corresponding elements in spl_ids and spl_start of the
   1269   // phrase have been deleted.
   1270   char16 *chn_str = c_phrase_.chn_str + del_spl_pos;
   1271   for (uint16 pos = 0;
   1272       pos < c_phrase_.sublma_start[c_phrase_.sublma_num] - del_spl_pos;
   1273       pos++) {
   1274     chn_str[pos] = chn_str[pos + 1];
   1275   }
   1276   c_phrase_.length -= 1;
   1277 
   1278   // If the deleted spelling id is in a sub lemma which contains more than
   1279   // one id, del_a_sub will be false; but if the deleted id is in a sub lemma
   1280   // which only contains 1 id, the whole sub lemma needs to be deleted, so
   1281   // del_a_sub will be true.
   1282   bool del_a_sub = false;
   1283   for (uint16 pos = 1; pos <= c_phrase_.sublma_num; pos++) {
   1284     if (c_phrase_.sublma_start[pos - 1] ==
   1285         c_phrase_.sublma_start[pos]) {
   1286       del_a_sub = true;
   1287     }
   1288     if (del_a_sub) {
   1289       c_phrase_.sublma_start[pos - 1] =
   1290           c_phrase_.sublma_start[pos];
   1291     }
   1292   }
   1293   if (del_a_sub)
   1294     c_phrase_.sublma_num -= 1;
   1295 
   1296   return;
   1297 }
   1298 
   1299 void MatrixSearch::get_spl_start_id() {
   1300   lma_id_num_ = 0;
   1301   lma_start_[0] = 0;
   1302 
   1303   spl_id_num_ = 0;
   1304   spl_start_[0] = 0;
   1305   if (!inited_ || 0 == pys_decoded_len_ ||
   1306       0 == matrix_[pys_decoded_len_].mtrx_nd_num)
   1307     return;
   1308 
   1309   // Calculate number of lemmas and spellings
   1310   // Only scan those part which is not fixed.
   1311   lma_id_num_ = fixed_lmas_;
   1312   spl_id_num_ = fixed_hzs_;
   1313 
   1314   MatrixNode *mtrx_nd = mtrx_nd_pool_ + matrix_[pys_decoded_len_].mtrx_nd_pos;
   1315   while (mtrx_nd != mtrx_nd_pool_) {
   1316     if (fixed_hzs_ > 0) {
   1317       if (mtrx_nd->step <= spl_start_[fixed_hzs_])
   1318         break;
   1319     }
   1320 
   1321     // Update the spelling segamentation information
   1322     unsigned char word_splstr_len = 0;
   1323     PoolPosType dmi_fr = mtrx_nd->dmi_fr;
   1324     if ((PoolPosType)-1 != dmi_fr)
   1325       word_splstr_len = dmi_pool_[dmi_fr].splstr_len;
   1326 
   1327     while ((PoolPosType)-1 != dmi_fr) {
   1328       spl_start_[spl_id_num_ + 1] = mtrx_nd->step -
   1329           (word_splstr_len - dmi_pool_[dmi_fr].splstr_len);
   1330       spl_id_[spl_id_num_] = dmi_pool_[dmi_fr].spl_id;
   1331       spl_id_num_++;
   1332       dmi_fr = dmi_pool_[dmi_fr].dmi_fr;
   1333     }
   1334 
   1335     // Update the lemma segmentation information
   1336     lma_start_[lma_id_num_ + 1] = spl_id_num_;
   1337     lma_id_[lma_id_num_] = mtrx_nd->id;
   1338     lma_id_num_++;
   1339 
   1340     mtrx_nd = mtrx_nd->from;
   1341   }
   1342 
   1343   // Reverse the result of spelling info
   1344   for (size_t pos = fixed_hzs_;
   1345        pos < fixed_hzs_ + (spl_id_num_ - fixed_hzs_ + 1) / 2; pos++) {
   1346     if (spl_id_num_ + fixed_hzs_ - pos != pos + 1) {
   1347       spl_start_[pos + 1] ^= spl_start_[spl_id_num_ - pos + fixed_hzs_];
   1348       spl_start_[spl_id_num_ - pos + fixed_hzs_] ^= spl_start_[pos + 1];
   1349       spl_start_[pos + 1] ^= spl_start_[spl_id_num_ - pos + fixed_hzs_];
   1350 
   1351       spl_id_[pos] ^= spl_id_[spl_id_num_ + fixed_hzs_ - pos - 1];
   1352       spl_id_[spl_id_num_ + fixed_hzs_- pos - 1] ^= spl_id_[pos];
   1353       spl_id_[pos] ^= spl_id_[spl_id_num_ + fixed_hzs_- pos - 1];
   1354     }
   1355   }
   1356 
   1357   // Reverse the result of lemma info
   1358   for (size_t pos = fixed_lmas_;
   1359        pos < fixed_lmas_ + (lma_id_num_ - fixed_lmas_ + 1) / 2; pos++) {
   1360     assert(lma_id_num_ + fixed_lmas_ - pos - 1 >= pos);
   1361 
   1362     if (lma_id_num_ + fixed_lmas_ - pos > pos + 1) {
   1363       lma_start_[pos + 1] ^= lma_start_[lma_id_num_ - pos + fixed_lmas_];
   1364       lma_start_[lma_id_num_ - pos + fixed_lmas_] ^= lma_start_[pos + 1];
   1365       lma_start_[pos + 1] ^= lma_start_[lma_id_num_ - pos + fixed_lmas_];
   1366 
   1367       lma_id_[pos] ^= lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_];
   1368       lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_] ^= lma_id_[pos];
   1369       lma_id_[pos] ^= lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_];
   1370     }
   1371   }
   1372 
   1373   for (size_t pos = fixed_lmas_ + 1; pos <= lma_id_num_; pos++) {
   1374     if (pos < lma_id_num_)
   1375       lma_start_[pos] = lma_start_[pos - 1] +
   1376           (lma_start_[pos] - lma_start_[pos + 1]);
   1377     else
   1378       lma_start_[pos] = lma_start_[pos - 1] + lma_start_[pos] -
   1379           lma_start_[fixed_lmas_];
   1380   }
   1381 
   1382   // Find the last fixed position
   1383   fixed_hzs_ = 0;
   1384   for (size_t pos = spl_id_num_; pos > 0; pos--) {
   1385     if (NULL != matrix_[spl_start_[pos]].mtrx_nd_fixed) {
   1386       fixed_hzs_ = pos;
   1387       break;
   1388     }
   1389   }
   1390 
   1391   return;
   1392 }
   1393 
   1394 size_t MatrixSearch::get_spl_start(const uint16 *&spl_start) {
   1395   get_spl_start_id();
   1396   spl_start = spl_start_;
   1397   return spl_id_num_;
   1398 }
   1399 
   1400 size_t MatrixSearch::extend_dmi(DictExtPara *dep, DictMatchInfo *dmi_s) {
   1401   if (dmi_pool_used_ >= kDmiPoolSize) return 0;
   1402 
   1403   if (dmi_c_phrase_)
   1404     return extend_dmi_c(dep, dmi_s);
   1405 
   1406   LpiCache& lpi_cache = LpiCache::get_instance();
   1407   uint16 splid = dep->splids[dep->splids_extended];
   1408 
   1409   bool cached = false;
   1410   if (0 == dep->splids_extended)
   1411     cached = lpi_cache.is_cached(splid);
   1412 
   1413   // 1. If this is a half Id, get its corresponding full starting Id and
   1414   // number of full Id.
   1415   size_t ret_val = 0;
   1416   PoolPosType mtrx_dmi_fr = (PoolPosType)-1;  // From which dmi node
   1417 
   1418   lpi_total_ = 0;
   1419 
   1420   MileStoneHandle from_h[3];
   1421   from_h[0] = 0;
   1422   from_h[1] = 0;
   1423 
   1424   if (0 != dep->splids_extended) {
   1425     from_h[0] = dmi_s->dict_handles[0];
   1426     from_h[1] = dmi_s->dict_handles[1];
   1427   }
   1428 
   1429   // 2. Begin exgtending in the system dictionary
   1430   size_t lpi_num = 0;
   1431   MileStoneHandle handles[2];
   1432   handles[0] = handles[1] = 0;
   1433   if (from_h[0] > 0 || NULL == dmi_s) {
   1434     handles[0] = dict_trie_->extend_dict(from_h[0], dep, lpi_items_,
   1435                                          kMaxLmaPsbItems, &lpi_num);
   1436   }
   1437   if (handles[0] > 0)
   1438     lpi_total_ = lpi_num;
   1439 
   1440   if (NULL == dmi_s) {  // from root
   1441     assert(0 != handles[0]);
   1442     mtrx_dmi_fr = dmi_pool_used_;
   1443   }
   1444 
   1445   // 3. Begin extending in the user dictionary
   1446   if (NULL != user_dict_ && (from_h[1] > 0 || NULL == dmi_s)) {
   1447     handles[1] = user_dict_->extend_dict(from_h[1], dep,
   1448                                          lpi_items_ + lpi_total_,
   1449                                          kMaxLmaPsbItems - lpi_total_,
   1450                                          &lpi_num);
   1451     if (handles[1] > 0) {
   1452       if (kPrintDebug0) {
   1453         for (size_t t = 0; t < lpi_num; t++) {
   1454           printf("--Extend in user dict: uid:%d uscore:%d\n", lpi_items_[lpi_total_ + t].id,
   1455                  lpi_items_[lpi_total_ + t].psb);
   1456         }
   1457       }
   1458       lpi_total_ += lpi_num;
   1459     }
   1460   }
   1461 
   1462   if (0 != handles[0] || 0 != handles[1]) {
   1463     if (dmi_pool_used_ >= kDmiPoolSize) return 0;
   1464 
   1465     DictMatchInfo *dmi_add = dmi_pool_ + dmi_pool_used_;
   1466     if (NULL == dmi_s) {
   1467       fill_dmi(dmi_add, handles,
   1468                (PoolPosType)-1, splid,
   1469                1, 1, dep->splid_end_split, dep->ext_len,
   1470                spl_trie_->is_half_id(splid) ? 0 : 1);
   1471     } else {
   1472       fill_dmi(dmi_add, handles,
   1473                dmi_s - dmi_pool_, splid, 1,
   1474                dmi_s->dict_level + 1, dep->splid_end_split,
   1475                dmi_s->splstr_len + dep->ext_len,
   1476                spl_trie_->is_half_id(splid) ? 0 : dmi_s->all_full_id);
   1477     }
   1478 
   1479     ret_val = 1;
   1480   }
   1481 
   1482   if (!cached) {
   1483     if (0 == lpi_total_)
   1484       return ret_val;
   1485 
   1486     if (kPrintDebug0) {
   1487       printf("--- lpi_total_ = %d\n", lpi_total_);
   1488     }
   1489 
   1490     myqsort(lpi_items_, lpi_total_, sizeof(LmaPsbItem), cmp_lpi_with_psb);
   1491     if (NULL == dmi_s && spl_trie_->is_half_id(splid))
   1492       lpi_total_ = lpi_cache.put_cache(splid, lpi_items_, lpi_total_);
   1493   } else {
   1494     assert(spl_trie_->is_half_id(splid));
   1495     lpi_total_ = lpi_cache.get_cache(splid, lpi_items_, kMaxLmaPsbItems);
   1496   }
   1497 
   1498   return ret_val;
   1499 }
   1500 
   1501 size_t MatrixSearch::extend_dmi_c(DictExtPara *dep, DictMatchInfo *dmi_s) {
   1502   lpi_total_ = 0;
   1503 
   1504   uint16 pos = dep->splids_extended;
   1505   assert(dmi_c_phrase_);
   1506   if (pos >= c_phrase_.length)
   1507     return 0;
   1508 
   1509   uint16 splid = dep->splids[pos];
   1510   if (splid == c_phrase_.spl_ids[pos]) {
   1511     DictMatchInfo *dmi_add = dmi_pool_ + dmi_pool_used_;
   1512     MileStoneHandle handles[2];  // Actually never used.
   1513     if (NULL == dmi_s)
   1514       fill_dmi(dmi_add, handles,
   1515                (PoolPosType)-1, splid,
   1516                1, 1, dep->splid_end_split, dep->ext_len,
   1517                spl_trie_->is_half_id(splid) ? 0 : 1);
   1518     else
   1519       fill_dmi(dmi_add, handles,
   1520                dmi_s - dmi_pool_, splid, 1,
   1521                dmi_s->dict_level + 1, dep->splid_end_split,
   1522                dmi_s->splstr_len + dep->ext_len,
   1523                spl_trie_->is_half_id(splid) ? 0 : dmi_s->all_full_id);
   1524 
   1525     if (pos == c_phrase_.length - 1) {
   1526       lpi_items_[0].id = kLemmaIdComposing;
   1527       lpi_items_[0].psb = 0;  // 0 is bigger than normal lemma score.
   1528       lpi_total_ = 1;
   1529     }
   1530     return 1;
   1531   }
   1532   return 0;
   1533 }
   1534 
   1535 size_t MatrixSearch::extend_mtrx_nd(MatrixNode *mtrx_nd, LmaPsbItem lpi_items[],
   1536                                     size_t lpi_num, PoolPosType dmi_fr,
   1537                                     size_t res_row) {
   1538   assert(NULL != mtrx_nd);
   1539   matrix_[res_row].mtrx_nd_fixed = NULL;
   1540 
   1541   if (mtrx_nd_pool_used_ >= kMtrxNdPoolSize - kMaxNodeARow)
   1542     return 0;
   1543 
   1544   if (0 == mtrx_nd->step) {
   1545     // Because the list is sorted, if the source step is 0, it is only
   1546     // necessary to pick up the first kMaxNodeARow items.
   1547     if (lpi_num > kMaxNodeARow)
   1548       lpi_num = kMaxNodeARow;
   1549   }
   1550 
   1551   MatrixNode *mtrx_nd_res_min = mtrx_nd_pool_ + matrix_[res_row].mtrx_nd_pos;
   1552   for (size_t pos = 0; pos < lpi_num; pos++) {
   1553     float score = mtrx_nd->score + lpi_items[pos].psb;
   1554     if (pos > 0 && score - PRUMING_SCORE > mtrx_nd_res_min->score)
   1555       break;
   1556 
   1557     // Try to add a new node
   1558     size_t mtrx_nd_num = matrix_[res_row].mtrx_nd_num;
   1559     MatrixNode *mtrx_nd_res = mtrx_nd_res_min + mtrx_nd_num;
   1560     bool replace = false;
   1561     // Find its position
   1562     while (mtrx_nd_res > mtrx_nd_res_min && score < (mtrx_nd_res - 1)->score) {
   1563       if (static_cast<size_t>(mtrx_nd_res - mtrx_nd_res_min) < kMaxNodeARow)
   1564         *mtrx_nd_res = *(mtrx_nd_res - 1);
   1565       mtrx_nd_res--;
   1566       replace = true;
   1567     }
   1568     if (replace || (mtrx_nd_num < kMaxNodeARow &&
   1569         matrix_[res_row].mtrx_nd_pos + mtrx_nd_num < kMtrxNdPoolSize)) {
   1570       mtrx_nd_res->id = lpi_items[pos].id;
   1571       mtrx_nd_res->score = score;
   1572       mtrx_nd_res->from = mtrx_nd;
   1573       mtrx_nd_res->dmi_fr = dmi_fr;
   1574       mtrx_nd_res->step = res_row;
   1575       if (matrix_[res_row].mtrx_nd_num < kMaxNodeARow)
   1576         matrix_[res_row].mtrx_nd_num++;
   1577     }
   1578   }
   1579   return matrix_[res_row].mtrx_nd_num;
   1580 }
   1581 
   1582 PoolPosType MatrixSearch::match_dmi(size_t step_to, uint16 spl_ids[],
   1583                                     uint16 spl_id_num) {
   1584   if (pys_decoded_len_ < step_to || 0 == matrix_[step_to].dmi_num) {
   1585     return static_cast<PoolPosType>(-1);
   1586   }
   1587 
   1588   for (PoolPosType dmi_pos = 0; dmi_pos < matrix_[step_to].dmi_num; dmi_pos++) {
   1589     DictMatchInfo *dmi = dmi_pool_ + matrix_[step_to].dmi_pos + dmi_pos;
   1590 
   1591     if (dmi->dict_level != spl_id_num)
   1592       continue;
   1593 
   1594     bool matched = true;
   1595     for (uint16 spl_pos = 0; spl_pos < spl_id_num; spl_pos++) {
   1596       if (spl_ids[spl_id_num - spl_pos - 1] != dmi->spl_id) {
   1597         matched = false;
   1598         break;
   1599       }
   1600 
   1601       dmi = dmi_pool_ + dmi->dmi_fr;
   1602     }
   1603     if (matched) {
   1604       return matrix_[step_to].dmi_pos + dmi_pos;
   1605     }
   1606   }
   1607 
   1608   return static_cast<PoolPosType>(-1);
   1609 }
   1610 
   1611 char16* MatrixSearch::get_candidate0(char16 *cand_str, size_t max_len,
   1612                                      uint16 *retstr_len,
   1613                                      bool only_unfixed) {
   1614   if (pys_decoded_len_ == 0 ||
   1615       matrix_[pys_decoded_len_].mtrx_nd_num == 0)
   1616     return NULL;
   1617 
   1618   LemmaIdType idxs[kMaxRowNum];
   1619   size_t id_num = 0;
   1620 
   1621   MatrixNode *mtrx_nd = mtrx_nd_pool_ + matrix_[pys_decoded_len_].mtrx_nd_pos;
   1622 
   1623   if (kPrintDebug0) {
   1624     printf("--- sentence score: %f\n", mtrx_nd->score);
   1625   }
   1626 
   1627   if (kPrintDebug1) {
   1628     printf("==============Sentence DMI (reverse order) begin===========>>\n");
   1629   }
   1630 
   1631   while (mtrx_nd != NULL) {
   1632     idxs[id_num] = mtrx_nd->id;
   1633     id_num++;
   1634 
   1635     if (kPrintDebug1) {
   1636        printf("---MatrixNode [step: %d, lma_idx: %d, total score:%.5f]\n",
   1637               mtrx_nd->step, mtrx_nd->id, mtrx_nd->score);
   1638        debug_print_dmi(mtrx_nd->dmi_fr, 1);
   1639     }
   1640 
   1641     mtrx_nd = mtrx_nd->from;
   1642   }
   1643 
   1644   if (kPrintDebug1) {
   1645     printf("<<==============Sentence DMI (reverse order) end=============\n");
   1646   }
   1647 
   1648   size_t ret_pos = 0;
   1649   do {
   1650     id_num--;
   1651     if (0 == idxs[id_num])
   1652       continue;
   1653 
   1654     char16 str[kMaxLemmaSize + 1];
   1655     uint16 str_len = get_lemma_str(idxs[id_num], str, kMaxLemmaSize + 1);
   1656     if (str_len > 0 && ((!only_unfixed && max_len - ret_pos > str_len) ||
   1657         (only_unfixed && max_len - ret_pos + fixed_hzs_ > str_len))) {
   1658       if (!only_unfixed)
   1659         utf16_strncpy(cand_str + ret_pos, str, str_len);
   1660       else if (ret_pos >= fixed_hzs_)
   1661         utf16_strncpy(cand_str + ret_pos - fixed_hzs_, str, str_len);
   1662 
   1663       ret_pos += str_len;
   1664     } else {
   1665       return NULL;
   1666     }
   1667   } while (id_num != 0);
   1668 
   1669   if (!only_unfixed) {
   1670     if (NULL != retstr_len)
   1671       *retstr_len = ret_pos;
   1672     cand_str[ret_pos] = (char16)'\0';
   1673   } else {
   1674     if (NULL != retstr_len)
   1675       *retstr_len = ret_pos - fixed_hzs_;
   1676     cand_str[ret_pos - fixed_hzs_] = (char16)'\0';
   1677   }
   1678   return cand_str;
   1679 }
   1680 
   1681 size_t MatrixSearch::get_lpis(const uint16* splid_str, size_t splid_str_len,
   1682                               LmaPsbItem* lma_buf, size_t max_lma_buf,
   1683                               const char16 *pfullsent, bool sort_by_psb) {
   1684   if (splid_str_len > kMaxLemmaSize)
   1685     return 0;
   1686 
   1687   size_t num1 = dict_trie_->get_lpis(splid_str, splid_str_len,
   1688                                      lma_buf, max_lma_buf);
   1689   size_t num2 = 0;
   1690   if (NULL != user_dict_) {
   1691     num2 = user_dict_->get_lpis(splid_str, splid_str_len,
   1692                          lma_buf + num1, max_lma_buf - num1);
   1693   }
   1694 
   1695   size_t num = num1 + num2;
   1696 
   1697   if (0 == num)
   1698     return 0;
   1699 
   1700   // Remove repeated items.
   1701   if (splid_str_len > 1) {
   1702     LmaPsbStrItem *lpsis = reinterpret_cast<LmaPsbStrItem*>(lma_buf + num);
   1703     size_t lpsi_num = (max_lma_buf - num) * sizeof(LmaPsbItem) /
   1704         sizeof(LmaPsbStrItem);
   1705     assert(lpsi_num > num);
   1706     if (num > lpsi_num) num = lpsi_num;
   1707     lpsi_num = num;
   1708 
   1709     for (size_t pos = 0; pos < lpsi_num; pos++) {
   1710       lpsis[pos].lpi = lma_buf[pos];
   1711       get_lemma_str(lma_buf[pos].id, lpsis[pos].str, kMaxLemmaSize + 1);
   1712     }
   1713 
   1714     myqsort(lpsis, lpsi_num, sizeof(LmaPsbStrItem), cmp_lpsi_with_str);
   1715 
   1716     size_t remain_num = 0;
   1717     for (size_t pos = 0; pos < lpsi_num; pos++) {
   1718       if (pos > 0 && utf16_strcmp(lpsis[pos].str, lpsis[pos - 1].str) == 0) {
   1719         if (lpsis[pos].lpi.psb < lpsis[pos - 1].lpi.psb) {
   1720           assert(remain_num > 0);
   1721           lma_buf[remain_num - 1] = lpsis[pos].lpi;
   1722         }
   1723         continue;
   1724       }
   1725       if (NULL != pfullsent && utf16_strcmp(lpsis[pos].str, pfullsent) == 0)
   1726         continue;
   1727 
   1728       lma_buf[remain_num] = lpsis[pos].lpi;
   1729       remain_num++;
   1730     }
   1731 
   1732     // Update the result number
   1733     num = remain_num;
   1734   } else {
   1735     // For single character, some characters have more than one spelling, for
   1736     // example, "de" and "di" are all valid for a Chinese character, so when
   1737     // the user input  "d", repeated items are generated.
   1738     // For single character lemmas, Hanzis will be gotten
   1739     for (size_t pos = 0; pos < num; pos++) {
   1740       char16 hanzis[2];
   1741       get_lemma_str(lma_buf[pos].id, hanzis, 2);
   1742       lma_buf[pos].hanzi = hanzis[0];
   1743     }
   1744 
   1745     myqsort(lma_buf, num, sizeof(LmaPsbItem), cmp_lpi_with_hanzi);
   1746 
   1747     size_t remain_num = 0;
   1748     for (size_t pos = 0; pos < num; pos++) {
   1749       if (pos > 0 && lma_buf[pos].hanzi == lma_buf[pos - 1].hanzi) {
   1750         if (NULL != pfullsent &&
   1751             static_cast<char16>(0) == pfullsent[1] &&
   1752             lma_buf[pos].hanzi == pfullsent[0])
   1753           continue;
   1754 
   1755         if (lma_buf[pos].psb < lma_buf[pos - 1].psb) {
   1756           assert(remain_num > 0);
   1757           assert(lma_buf[remain_num - 1].hanzi == lma_buf[pos].hanzi);
   1758           lma_buf[remain_num - 1] = lma_buf[pos];
   1759         }
   1760         continue;
   1761       }
   1762       if (NULL != pfullsent &&
   1763           static_cast<char16>(0) == pfullsent[1] &&
   1764           lma_buf[pos].hanzi == pfullsent[0])
   1765           continue;
   1766 
   1767       lma_buf[remain_num] = lma_buf[pos];
   1768       remain_num++;
   1769     }
   1770 
   1771     num = remain_num;
   1772   }
   1773 
   1774   if (sort_by_psb) {
   1775     myqsort(lma_buf, num, sizeof(LmaPsbItem), cmp_lpi_with_psb);
   1776   }
   1777   return num;
   1778 }
   1779 
   1780 uint16 MatrixSearch::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf,
   1781                                    uint16 str_max) {
   1782   uint16 str_len = 0;
   1783 
   1784   if (is_system_lemma(id_lemma)) {
   1785     str_len = dict_trie_->get_lemma_str(id_lemma, str_buf, str_max);
   1786   } else if (is_user_lemma(id_lemma)) {
   1787     if (NULL != user_dict_) {
   1788       str_len = user_dict_->get_lemma_str(id_lemma, str_buf, str_max);
   1789     } else {
   1790       str_len = 0;
   1791       str_buf[0] = static_cast<char16>('\0');
   1792     }
   1793   } else if (is_composing_lemma(id_lemma)) {
   1794     if (str_max <= 1)
   1795       return 0;
   1796     str_len = c_phrase_.sublma_start[c_phrase_.sublma_num];
   1797     if (str_len > str_max - 1)
   1798       str_len = str_max - 1;
   1799     utf16_strncpy(str_buf, c_phrase_.chn_str, str_len);
   1800     str_buf[str_len] = (char16)'\0';
   1801     return str_len;
   1802   }
   1803 
   1804   return str_len;
   1805 }
   1806 
   1807 uint16 MatrixSearch::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
   1808                                       uint16 splids_max, bool arg_valid) {
   1809   uint16 splid_num = 0;
   1810 
   1811   if (arg_valid) {
   1812     for (splid_num = 0; splid_num < splids_max; splid_num++) {
   1813       if (spl_trie_->is_half_id(splids[splid_num]))
   1814         break;
   1815     }
   1816     if (splid_num == splids_max)
   1817       return splid_num;
   1818   }
   1819 
   1820   if (is_system_lemma(id_lemma)) {
   1821     splid_num = dict_trie_->get_lemma_splids(id_lemma, splids, splids_max,
   1822                                               arg_valid);
   1823   } else if (is_user_lemma(id_lemma)) {
   1824     if (NULL != user_dict_) {
   1825       splid_num = user_dict_->get_lemma_splids(id_lemma, splids, splids_max,
   1826                                                arg_valid);
   1827     } else {
   1828       splid_num = 0;
   1829     }
   1830   } else if (is_composing_lemma(id_lemma)) {
   1831     if (c_phrase_.length > splids_max) {
   1832       return 0;
   1833     }
   1834     for (uint16 pos = 0; pos < c_phrase_.length; pos++) {
   1835       splids[pos] = c_phrase_.spl_ids[pos];
   1836       if (spl_trie_->is_half_id(splids[pos])) {
   1837         return 0;
   1838       }
   1839     }
   1840   }
   1841   return splid_num;
   1842 }
   1843 
   1844 size_t MatrixSearch::inner_predict(const char16 *fixed_buf, uint16 fixed_len,
   1845                                    char16 predict_buf[][kMaxPredictSize + 1],
   1846                                    size_t buf_len) {
   1847   size_t res_total = 0;
   1848   memset(npre_items_, 0, sizeof(NPredictItem) * npre_items_len_);
   1849   // In order to shorten the comments, j-character candidates predicted by
   1850   // i-character prefix are called P(i,j). All candiates predicted by
   1851   // i-character prefix are called P(i,*)
   1852   // Step 1. Get P(kMaxPredictSize, *) and sort them, here
   1853   // P(kMaxPredictSize, *) == P(kMaxPredictSize, 1)
   1854   for (size_t len = fixed_len; len >0; len--) {
   1855     // How many blank items are available
   1856     size_t this_max = npre_items_len_ - res_total;
   1857     size_t res_this;
   1858     // If the history is longer than 1, and we can not get prediction from
   1859     // lemmas longer than 2, in this case, we will add lemmas with
   1860     // highest scores as the prediction result.
   1861     if (fixed_len > 1 && 1 == len && 0 == res_total) {
   1862       // Try to find if recent n (n>1) characters can be a valid lemma in system
   1863       // dictionary.
   1864       bool nearest_n_word = false;
   1865       for (size_t nlen = 2; nlen <= fixed_len; nlen++) {
   1866         if (dict_trie_->get_lemma_id(fixed_buf + fixed_len - nlen, nlen) > 0) {
   1867           nearest_n_word = true;
   1868           break;
   1869         }
   1870       }
   1871       res_this = dict_trie_->predict_top_lmas(nearest_n_word ? len : 0,
   1872                                               npre_items_ + res_total,
   1873                                               this_max, res_total);
   1874       res_total += res_this;
   1875     }
   1876 
   1877     // How many blank items are available
   1878     this_max = npre_items_len_ - res_total;
   1879     res_this = 0;
   1880     if (!kOnlyUserDictPredict) {
   1881       res_this =
   1882           dict_trie_->predict(fixed_buf + fixed_len - len, len,
   1883                               npre_items_ + res_total, this_max,
   1884                               res_total);
   1885     }
   1886 
   1887     if (NULL != user_dict_) {
   1888       res_this = res_this +
   1889                  user_dict_->predict(fixed_buf + fixed_len - len, len,
   1890                                      npre_items_ + res_total + res_this,
   1891                                      this_max - res_this, res_total + res_this);
   1892     }
   1893 
   1894     if (kPredictLimitGt1) {
   1895       myqsort(npre_items_ + res_total, res_this, sizeof(NPredictItem),
   1896               cmp_npre_by_score);
   1897 
   1898       if (len > 3) {
   1899         if (res_this > kMaxPredictNumByGt3)
   1900           res_this = kMaxPredictNumByGt3;
   1901       } else if (3 == len) {
   1902         if (res_this > kMaxPredictNumBy3)
   1903           res_this = kMaxPredictNumBy3;
   1904       } else if (2 == len) {
   1905         if (res_this > kMaxPredictNumBy2)
   1906           res_this = kMaxPredictNumBy2;
   1907       }
   1908     }
   1909 
   1910     res_total += res_this;
   1911   }
   1912 
   1913   res_total = remove_duplicate_npre(npre_items_, res_total);
   1914 
   1915   if (kPreferLongHistoryPredict) {
   1916     myqsort(npre_items_, res_total, sizeof(NPredictItem),
   1917             cmp_npre_by_hislen_score);
   1918   } else {
   1919     myqsort(npre_items_, res_total, sizeof(NPredictItem),
   1920             cmp_npre_by_score);
   1921   }
   1922 
   1923   if (buf_len < res_total) {
   1924     res_total = buf_len;
   1925   }
   1926 
   1927   if (kPrintDebug2) {
   1928     printf("/////////////////Predicted Items Begin////////////////////>>\n");
   1929     for (size_t i = 0; i < res_total; i++) {
   1930       printf("---");
   1931       for (size_t j = 0; j < kMaxPredictSize; j++) {
   1932         printf("%d  ", npre_items_[i].pre_hzs[j]);
   1933       }
   1934       printf("\n");
   1935     }
   1936     printf("<<///////////////Predicted Items End////////////////////////\n");
   1937   }
   1938 
   1939   for (size_t i = 0; i < res_total; i++) {
   1940     utf16_strncpy(predict_buf[i], npre_items_[i].pre_hzs,
   1941                   kMaxPredictSize);
   1942     predict_buf[i][kMaxPredictSize] = '\0';
   1943   }
   1944 
   1945   return res_total;
   1946 }
   1947 
   1948 size_t MatrixSearch::get_predicts(const char16 fixed_buf[],
   1949                                   char16 predict_buf[][kMaxPredictSize + 1],
   1950                                   size_t buf_len) {
   1951   size_t fixed_len = utf16_strlen(fixed_buf);
   1952   if (0 ==fixed_len || fixed_len > kMaxPredictSize || 0 == buf_len)
   1953     return 0;
   1954 
   1955   return inner_predict(fixed_buf, fixed_len, predict_buf, buf_len);
   1956 }
   1957 
   1958 }  // namespace ime_pinyin
   1959