1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <assert.h> 18 #include <math.h> 19 #include <stdio.h> 20 #include <string.h> 21 #include "../include/lpicache.h" 22 #include "../include/matrixsearch.h" 23 #include "../include/mystdlib.h" 24 #include "../include/ngram.h" 25 #include "../include/userdict.h" 26 27 namespace ime_pinyin { 28 29 #define PRUMING_SCORE 8000.0 30 31 MatrixSearch::MatrixSearch() { 32 inited_ = false; 33 spl_trie_ = SpellingTrie::get_cpinstance(); 34 35 reset_pointers_to_null(); 36 37 pys_decoded_len_ = 0; 38 mtrx_nd_pool_used_ = 0; 39 dmi_pool_used_ = 0; 40 xi_an_enabled_ = false; 41 dmi_c_phrase_ = false; 42 43 assert(kMaxSearchSteps > 0); 44 max_sps_len_ = kMaxSearchSteps - 1; 45 max_hzs_len_ = kMaxSearchSteps; 46 } 47 48 MatrixSearch::~MatrixSearch() { 49 free_resource(); 50 } 51 52 void MatrixSearch::reset_pointers_to_null() { 53 dict_trie_ = NULL; 54 user_dict_ = NULL; 55 spl_parser_ = NULL; 56 57 share_buf_ = NULL; 58 59 // The following four buffers are used for decoding, and they are based on 60 // share_buf_, no need to delete them. 61 mtrx_nd_pool_ = NULL; 62 dmi_pool_ = NULL; 63 matrix_ = NULL; 64 dep_ = NULL; 65 66 // Based on share_buf_, no need to delete them. 67 npre_items_ = NULL; 68 } 69 70 bool MatrixSearch::alloc_resource() { 71 free_resource(); 72 73 dict_trie_ = new DictTrie(); 74 user_dict_ = static_cast<AtomDictBase*>(new UserDict()); 75 spl_parser_ = new SpellingParser(); 76 77 size_t mtrx_nd_size = sizeof(MatrixNode) * kMtrxNdPoolSize; 78 mtrx_nd_size = align_to_size_t(mtrx_nd_size) / sizeof(size_t); 79 size_t dmi_size = sizeof(DictMatchInfo) * kDmiPoolSize; 80 dmi_size = align_to_size_t(dmi_size) / sizeof(size_t); 81 size_t matrix_size = sizeof(MatrixRow) * kMaxRowNum; 82 matrix_size = align_to_size_t(matrix_size) / sizeof(size_t); 83 size_t dep_size = sizeof(DictExtPara); 84 dep_size = align_to_size_t(dep_size) / sizeof(size_t); 85 86 // share_buf's size is determined by the buffers for search. 87 share_buf_ = new size_t[mtrx_nd_size + dmi_size + matrix_size + dep_size]; 88 89 if (NULL == dict_trie_ || NULL == user_dict_ || NULL == spl_parser_ || 90 NULL == share_buf_) 91 return false; 92 93 // The buffers for search are based on the share buffer 94 mtrx_nd_pool_ = reinterpret_cast<MatrixNode*>(share_buf_); 95 dmi_pool_ = reinterpret_cast<DictMatchInfo*>(share_buf_ + mtrx_nd_size); 96 matrix_ = reinterpret_cast<MatrixRow*>(share_buf_ + mtrx_nd_size + dmi_size); 97 dep_ = reinterpret_cast<DictExtPara*> 98 (share_buf_ + mtrx_nd_size + dmi_size + matrix_size); 99 100 // The prediction buffer is also based on the share buffer. 101 npre_items_ = reinterpret_cast<NPredictItem*>(share_buf_); 102 npre_items_len_ = (mtrx_nd_size + dmi_size + matrix_size + dep_size) * 103 sizeof(size_t) / sizeof(NPredictItem); 104 return true; 105 } 106 107 void MatrixSearch::free_resource() { 108 if (NULL != dict_trie_) 109 delete dict_trie_; 110 111 if (NULL != user_dict_) 112 delete user_dict_; 113 114 if (NULL != spl_parser_) 115 delete spl_parser_; 116 117 if (NULL != share_buf_) 118 delete [] share_buf_; 119 120 reset_pointers_to_null(); 121 } 122 123 bool MatrixSearch::init(const char *fn_sys_dict, const char *fn_usr_dict) { 124 if (NULL == fn_sys_dict || NULL == fn_usr_dict) 125 return false; 126 127 if (!alloc_resource()) 128 return false; 129 130 if (!dict_trie_->load_dict(fn_sys_dict, 1, kSysDictIdEnd)) 131 return false; 132 133 // If engine fails to load the user dictionary, reset the user dictionary 134 // to NULL. 135 if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) { 136 delete user_dict_; 137 user_dict_ = NULL; 138 } else{ 139 user_dict_->set_total_lemma_count_of_others(NGram::kSysDictTotalFreq); 140 } 141 142 reset_search0(); 143 144 inited_ = true; 145 return true; 146 } 147 148 bool MatrixSearch::init_fd(int sys_fd, long start_offset, long length, 149 const char *fn_usr_dict) { 150 if (NULL == fn_usr_dict) 151 return false; 152 153 if (!alloc_resource()) 154 return false; 155 156 if (!dict_trie_->load_dict_fd(sys_fd, start_offset, length, 1, kSysDictIdEnd)) 157 return false; 158 159 if (!user_dict_->load_dict(fn_usr_dict, kUserDictIdStart, kUserDictIdEnd)) { 160 delete user_dict_; 161 user_dict_ = NULL; 162 } else { 163 user_dict_->set_total_lemma_count_of_others(NGram::kSysDictTotalFreq); 164 } 165 166 reset_search0(); 167 168 inited_ = true; 169 return true; 170 } 171 172 void MatrixSearch::set_max_lens(size_t max_sps_len, size_t max_hzs_len) { 173 if (0 != max_sps_len) 174 max_sps_len_ = max_sps_len; 175 if (0 != max_hzs_len) 176 max_hzs_len_ = max_hzs_len; 177 } 178 179 void MatrixSearch::close() { 180 flush_cache(); 181 free_resource(); 182 inited_ = false; 183 } 184 185 void MatrixSearch::flush_cache() { 186 if (NULL != user_dict_) 187 user_dict_->flush_cache(); 188 } 189 190 void MatrixSearch::set_xi_an_switch(bool xi_an_enabled) { 191 xi_an_enabled_ = xi_an_enabled; 192 } 193 194 bool MatrixSearch::get_xi_an_switch() { 195 return xi_an_enabled_; 196 } 197 198 bool MatrixSearch::reset_search() { 199 if (!inited_) 200 return false; 201 return reset_search0(); 202 } 203 204 bool MatrixSearch::reset_search0() { 205 if (!inited_) 206 return false; 207 208 pys_decoded_len_ = 0; 209 mtrx_nd_pool_used_ = 0; 210 dmi_pool_used_ = 0; 211 212 // Get a MatrixNode from the pool 213 matrix_[0].mtrx_nd_pos = mtrx_nd_pool_used_; 214 matrix_[0].mtrx_nd_num = 1; 215 mtrx_nd_pool_used_ += 1; 216 217 // Update the node, and make it to be a starting node 218 MatrixNode *node = mtrx_nd_pool_ + matrix_[0].mtrx_nd_pos; 219 node->id = 0; 220 node->score = 0; 221 node->from = NULL; 222 node->step = 0; 223 node->dmi_fr = (PoolPosType)-1; 224 225 matrix_[0].dmi_pos = 0; 226 matrix_[0].dmi_num = 0; 227 matrix_[0].dmi_has_full_id = 1; 228 matrix_[0].mtrx_nd_fixed = node; 229 230 lma_start_[0] = 0; 231 fixed_lmas_ = 0; 232 spl_start_[0] = 0; 233 fixed_hzs_ = 0; 234 235 dict_trie_->reset_milestones(0, 0); 236 if (NULL != user_dict_) 237 user_dict_->reset_milestones(0, 0); 238 239 return true; 240 } 241 242 bool MatrixSearch::reset_search(size_t ch_pos, bool clear_fixed_this_step, 243 bool clear_dmi_this_step, 244 bool clear_mtrx_this_step) { 245 if (!inited_ || ch_pos > pys_decoded_len_ || ch_pos >= kMaxRowNum) 246 return false; 247 248 if (0 == ch_pos) { 249 reset_search0(); 250 } else { 251 // Prepare mile stones of this step to clear. 252 MileStoneHandle *dict_handles_to_clear = NULL; 253 if (clear_dmi_this_step && matrix_[ch_pos].dmi_num > 0) { 254 dict_handles_to_clear = dmi_pool_[matrix_[ch_pos].dmi_pos].dict_handles; 255 } 256 257 // If there are more steps, and this step is not allowed to clear, find 258 // milestones of next step. 259 if (pys_decoded_len_ > ch_pos && !clear_dmi_this_step) { 260 dict_handles_to_clear = NULL; 261 if (matrix_[ch_pos + 1].dmi_num > 0) { 262 dict_handles_to_clear = 263 dmi_pool_[matrix_[ch_pos + 1].dmi_pos].dict_handles; 264 } 265 } 266 267 if (NULL != dict_handles_to_clear) { 268 dict_trie_->reset_milestones(ch_pos, dict_handles_to_clear[0]); 269 if (NULL != user_dict_) 270 user_dict_->reset_milestones(ch_pos, dict_handles_to_clear[1]); 271 } 272 273 pys_decoded_len_ = ch_pos; 274 275 if (clear_dmi_this_step) { 276 dmi_pool_used_ = matrix_[ch_pos - 1].dmi_pos 277 + matrix_[ch_pos - 1].dmi_num; 278 matrix_[ch_pos].dmi_num = 0; 279 } else { 280 dmi_pool_used_ = matrix_[ch_pos].dmi_pos + matrix_[ch_pos].dmi_num; 281 } 282 283 if (clear_mtrx_this_step) { 284 mtrx_nd_pool_used_ = matrix_[ch_pos - 1].mtrx_nd_pos 285 + matrix_[ch_pos - 1].mtrx_nd_num; 286 matrix_[ch_pos].mtrx_nd_num = 0; 287 } else { 288 mtrx_nd_pool_used_ = matrix_[ch_pos].mtrx_nd_pos 289 + matrix_[ch_pos].mtrx_nd_num; 290 } 291 292 // Modify fixed_hzs_ 293 if (fixed_hzs_ > 0 && 294 ((kLemmaIdComposing != lma_id_[0]) || 295 (kLemmaIdComposing == lma_id_[0] && 296 spl_start_[c_phrase_.length] <= ch_pos))) { 297 size_t fixed_ch_pos = ch_pos; 298 if (clear_fixed_this_step) 299 fixed_ch_pos = fixed_ch_pos > 0 ? fixed_ch_pos - 1 : 0; 300 while (NULL == matrix_[fixed_ch_pos].mtrx_nd_fixed && fixed_ch_pos > 0) 301 fixed_ch_pos--; 302 303 fixed_lmas_ = 0; 304 fixed_hzs_ = 0; 305 if (fixed_ch_pos > 0) { 306 while (spl_start_[fixed_hzs_] < fixed_ch_pos) 307 fixed_hzs_++; 308 assert(spl_start_[fixed_hzs_] == fixed_ch_pos); 309 310 while (lma_start_[fixed_lmas_] < fixed_hzs_) 311 fixed_lmas_++; 312 assert(lma_start_[fixed_lmas_] == fixed_hzs_); 313 } 314 315 // Re-search the Pinyin string for the unlocked lemma 316 // which was previously fixed. 317 // 318 // Prepare mile stones of this step to clear. 319 MileStoneHandle *dict_handles_to_clear = NULL; 320 if (clear_dmi_this_step && ch_pos == fixed_ch_pos && 321 matrix_[fixed_ch_pos].dmi_num > 0) { 322 dict_handles_to_clear = dmi_pool_[matrix_[fixed_ch_pos].dmi_pos].dict_handles; 323 } 324 325 // If there are more steps, and this step is not allowed to clear, find 326 // milestones of next step. 327 if (pys_decoded_len_ > fixed_ch_pos && !clear_dmi_this_step) { 328 dict_handles_to_clear = NULL; 329 if (matrix_[fixed_ch_pos + 1].dmi_num > 0) { 330 dict_handles_to_clear = 331 dmi_pool_[matrix_[fixed_ch_pos + 1].dmi_pos].dict_handles; 332 } 333 } 334 335 if (NULL != dict_handles_to_clear) { 336 dict_trie_->reset_milestones(fixed_ch_pos, dict_handles_to_clear[0]); 337 if (NULL != user_dict_) 338 user_dict_->reset_milestones(fixed_ch_pos, dict_handles_to_clear[1]); 339 } 340 341 342 pys_decoded_len_ = fixed_ch_pos; 343 344 if (clear_dmi_this_step && ch_pos == fixed_ch_pos) { 345 dmi_pool_used_ = matrix_[fixed_ch_pos - 1].dmi_pos 346 + matrix_[fixed_ch_pos - 1].dmi_num; 347 matrix_[fixed_ch_pos].dmi_num = 0; 348 } else { 349 dmi_pool_used_ = matrix_[fixed_ch_pos].dmi_pos + 350 matrix_[fixed_ch_pos].dmi_num; 351 } 352 353 if (clear_mtrx_this_step && ch_pos == fixed_ch_pos) { 354 mtrx_nd_pool_used_ = matrix_[fixed_ch_pos - 1].mtrx_nd_pos 355 + matrix_[fixed_ch_pos - 1].mtrx_nd_num; 356 matrix_[fixed_ch_pos].mtrx_nd_num = 0; 357 } else { 358 mtrx_nd_pool_used_ = matrix_[fixed_ch_pos].mtrx_nd_pos 359 + matrix_[fixed_ch_pos].mtrx_nd_num; 360 } 361 362 for (uint16 re_pos = fixed_ch_pos; re_pos < ch_pos; re_pos++) { 363 add_char(pys_[re_pos]); 364 } 365 } else if (fixed_hzs_ > 0 && kLemmaIdComposing == lma_id_[0]) { 366 for (uint16 subpos = 0; subpos < c_phrase_.sublma_num; subpos++) { 367 uint16 splpos_begin = c_phrase_.sublma_start[subpos]; 368 uint16 splpos_end = c_phrase_.sublma_start[subpos + 1]; 369 for (uint16 splpos = splpos_begin; splpos < splpos_end; splpos++) { 370 // If ch_pos is in this spelling 371 uint16 spl_start = c_phrase_.spl_start[splpos]; 372 uint16 spl_end = c_phrase_.spl_start[splpos + 1]; 373 if (ch_pos >= spl_start && ch_pos < spl_end) { 374 // Clear everything after this position 375 c_phrase_.chn_str[splpos] = static_cast<char16>('\0'); 376 c_phrase_.sublma_start[subpos + 1] = splpos; 377 c_phrase_.sublma_num = subpos + 1; 378 c_phrase_.length = splpos; 379 380 if (splpos == splpos_begin) { 381 c_phrase_.sublma_num = subpos; 382 } 383 } 384 } 385 } 386 387 // Extend the composing phrase. 388 reset_search0(); 389 dmi_c_phrase_ = true; 390 uint16 c_py_pos = 0; 391 while (c_py_pos < spl_start_[c_phrase_.length]) { 392 bool b_ac_tmp = add_char(pys_[c_py_pos]); 393 assert(b_ac_tmp); 394 c_py_pos++; 395 } 396 dmi_c_phrase_ = false; 397 398 lma_id_num_ = 1; 399 fixed_lmas_ = 1; 400 fixed_lmas_no1_[0] = 0; // A composing string is always modified. 401 fixed_hzs_ = c_phrase_.length; 402 lma_start_[1] = fixed_hzs_; 403 lma_id_[0] = kLemmaIdComposing; 404 matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ + 405 matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos; 406 } 407 } 408 409 return true; 410 } 411 412 void MatrixSearch::del_in_pys(size_t start, size_t len) { 413 while (start < kMaxRowNum - len && '\0' != pys_[start]) { 414 pys_[start] = pys_[start + len]; 415 start++; 416 } 417 } 418 419 size_t MatrixSearch::search(const char *py, size_t py_len) { 420 if (!inited_ || NULL == py) 421 return 0; 422 423 // If the search Pinyin string is too long, it will be truncated. 424 if (py_len > kMaxRowNum - 1) 425 py_len = kMaxRowNum - 1; 426 427 // Compare the new string with the previous one. Find their prefix to 428 // increase search efficiency. 429 size_t ch_pos = 0; 430 for (ch_pos = 0; ch_pos < pys_decoded_len_; ch_pos++) { 431 if ('\0' == py[ch_pos] || py[ch_pos] != pys_[ch_pos]) 432 break; 433 } 434 435 bool clear_fix = true; 436 if (ch_pos == pys_decoded_len_) 437 clear_fix = false; 438 439 reset_search(ch_pos, clear_fix, false, false); 440 441 memcpy(pys_ + ch_pos, py + ch_pos, py_len - ch_pos); 442 pys_[py_len] = '\0'; 443 444 while ('\0' != pys_[ch_pos]) { 445 if (!add_char(py[ch_pos])) { 446 pys_decoded_len_ = ch_pos; 447 break; 448 } 449 ch_pos++; 450 } 451 452 // Get spelling ids and starting positions. 453 get_spl_start_id(); 454 455 // If there are too many spellings, remove the last letter until the spelling 456 // number is acceptable. 457 while (spl_id_num_ > 9) { 458 py_len--; 459 reset_search(py_len, false, false, false); 460 pys_[py_len] = '\0'; 461 get_spl_start_id(); 462 } 463 464 prepare_candidates(); 465 466 if (kPrintDebug0) { 467 printf("--Matrix Node Pool Used: %d\n", mtrx_nd_pool_used_); 468 printf("--DMI Pool Used: %d\n", dmi_pool_used_); 469 470 if (kPrintDebug1) { 471 for (PoolPosType pos = 0; pos < dmi_pool_used_; pos++) { 472 debug_print_dmi(pos, 1); 473 } 474 } 475 } 476 477 return ch_pos; 478 } 479 480 size_t MatrixSearch::delsearch(size_t pos, bool is_pos_in_splid, 481 bool clear_fixed_this_step) { 482 if (!inited_) 483 return 0; 484 485 size_t reset_pos = pos; 486 487 // Out of range for both Pinyin mode and Spelling id mode. 488 if (pys_decoded_len_ <= pos) { 489 del_in_pys(pos, 1); 490 491 reset_pos = pys_decoded_len_; 492 // Decode the string after the un-decoded position 493 while ('\0' != pys_[reset_pos]) { 494 if (!add_char(pys_[reset_pos])) { 495 pys_decoded_len_ = reset_pos; 496 break; 497 } 498 reset_pos++; 499 } 500 get_spl_start_id(); 501 prepare_candidates(); 502 return pys_decoded_len_; 503 } 504 505 // Spelling id mode, but out of range. 506 if (is_pos_in_splid && spl_id_num_ <= pos) 507 return pys_decoded_len_; 508 509 // Begin to handle two modes respectively. 510 // Pinyin mode by default 511 size_t c_py_len = 0; // The length of composing phrase's Pinyin 512 size_t del_py_len = 1; 513 if (!is_pos_in_splid) { 514 // Pinyin mode is only allowed to delete beyond the fixed lemmas. 515 if (fixed_lmas_ > 0 && pos < spl_start_[lma_start_[fixed_lmas_]]) 516 return pys_decoded_len_; 517 518 del_in_pys(pos, 1); 519 520 // If the deleted character is just the one after the last fixed lemma 521 if (pos == spl_start_[lma_start_[fixed_lmas_]]) { 522 // If all fixed lemmas have been merged, and the caller of the function 523 // request to unlock the last fixed lemma. 524 if (kLemmaIdComposing == lma_id_[0] && clear_fixed_this_step) { 525 // Unlock the last sub lemma in the composing phrase. Because it is not 526 // easy to unlock it directly. Instead, we re-decode the modified 527 // composing phrase. 528 c_phrase_.sublma_num--; 529 c_phrase_.length = c_phrase_.sublma_start[c_phrase_.sublma_num]; 530 reset_pos = spl_start_[c_phrase_.length]; 531 c_py_len = reset_pos; 532 } 533 } 534 } else { 535 del_py_len = spl_start_[pos + 1] - spl_start_[pos]; 536 537 del_in_pys(spl_start_[pos], del_py_len); 538 539 if (pos >= lma_start_[fixed_lmas_]) { 540 c_py_len = 0; 541 reset_pos = spl_start_[pos + 1] - del_py_len; 542 } else { 543 c_py_len = spl_start_[lma_start_[fixed_lmas_]] - del_py_len; 544 reset_pos = c_py_len; 545 if (c_py_len > 0) 546 merge_fixed_lmas(pos); 547 } 548 } 549 550 if (c_py_len > 0) { 551 assert(c_phrase_.length > 0 && c_py_len == 552 c_phrase_.spl_start[c_phrase_.sublma_start[c_phrase_.sublma_num]]); 553 // The composing phrase is valid, reset all search space, 554 // and begin a new search which will only extend the composing 555 // phrase. 556 reset_search0(); 557 558 dmi_c_phrase_ = true; 559 // Extend the composing phrase. 560 uint16 c_py_pos = 0; 561 while (c_py_pos < c_py_len) { 562 bool b_ac_tmp = add_char(pys_[c_py_pos]); 563 assert(b_ac_tmp); 564 c_py_pos++; 565 } 566 dmi_c_phrase_ = false; 567 568 // Fixd the composing phrase as the first choice. 569 lma_id_num_ = 1; 570 fixed_lmas_ = 1; 571 fixed_lmas_no1_[0] = 0; // A composing string is always modified. 572 fixed_hzs_ = c_phrase_.length; 573 lma_start_[1] = fixed_hzs_; 574 lma_id_[0] = kLemmaIdComposing; 575 matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ + 576 matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos; 577 } else { 578 // Reseting search only clear pys_decoded_len_, but the string is kept. 579 reset_search(reset_pos, clear_fixed_this_step, false, false); 580 } 581 582 // Decode the string after the delete position. 583 while ('\0' != pys_[reset_pos]) { 584 if (!add_char(pys_[reset_pos])) { 585 pys_decoded_len_ = reset_pos; 586 break; 587 } 588 reset_pos++; 589 } 590 591 get_spl_start_id(); 592 prepare_candidates(); 593 return pys_decoded_len_; 594 } 595 596 size_t MatrixSearch::get_candidate_num() { 597 if (!inited_ || 0 == pys_decoded_len_ || 598 0 == matrix_[pys_decoded_len_].mtrx_nd_num) 599 return 0; 600 601 return 1 + lpi_total_; 602 } 603 604 char16* MatrixSearch::get_candidate(size_t cand_id, char16 *cand_str, 605 size_t max_len) { 606 if (!inited_ || 0 == pys_decoded_len_ || NULL == cand_str) 607 return NULL; 608 609 if (0 == cand_id) { 610 return get_candidate0(cand_str, max_len, NULL, false); 611 } else { 612 cand_id--; 613 } 614 615 // For this case: the current sentence is a word only, and the user fixed it, 616 // so the result will be fixed to the sentence space, and 617 // lpi_total_ will be set to 0. 618 if (0 == lpi_total_) { 619 return get_candidate0(cand_str, max_len, NULL, false); 620 } 621 622 LemmaIdType id = lpi_items_[cand_id].id; 623 char16 s[kMaxLemmaSize + 1]; 624 625 uint16 s_len = lpi_items_[cand_id].lma_len; 626 if (s_len > 1) { 627 s_len = get_lemma_str(id, s, kMaxLemmaSize + 1); 628 } else { 629 // For a single character, Hanzi is ready. 630 s[0] = lpi_items_[cand_id].hanzi; 631 s[1] = static_cast<char16>(0); 632 } 633 634 if (s_len > 0 && max_len > s_len) { 635 utf16_strncpy(cand_str, s, s_len); 636 cand_str[s_len] = (char16)'\0'; 637 return cand_str; 638 } 639 640 return NULL; 641 } 642 643 void MatrixSearch::update_dict_freq() { 644 if (NULL != user_dict_) { 645 // Update the total frequency of all lemmas, including system lemmas and 646 // user dictionary lemmas. 647 size_t total_freq = user_dict_->get_total_lemma_count(); 648 dict_trie_->set_total_lemma_count_of_others(total_freq); 649 } 650 } 651 652 bool MatrixSearch::add_lma_to_userdict(uint16 lma_fr, uint16 lma_to, 653 float score) { 654 if (lma_to - lma_fr <= 1 || NULL == user_dict_) 655 return false; 656 657 char16 word_str[kMaxLemmaSize + 1]; 658 uint16 spl_ids[kMaxLemmaSize]; 659 660 uint16 spl_id_fr = 0; 661 662 for (uint16 pos = lma_fr; pos < lma_to; pos++) { 663 LemmaIdType lma_id = lma_id_[pos]; 664 if (is_user_lemma(lma_id)) { 665 user_dict_->update_lemma(lma_id, 1, true); 666 } 667 uint16 lma_len = lma_start_[pos + 1] - lma_start_[pos]; 668 utf16_strncpy(spl_ids + spl_id_fr, spl_id_ + lma_start_[pos], lma_len); 669 670 uint16 tmp = get_lemma_str(lma_id, word_str + spl_id_fr, 671 kMaxLemmaSize + 1 - spl_id_fr); 672 assert(tmp == lma_len); 673 674 tmp = get_lemma_splids(lma_id, spl_ids + spl_id_fr, lma_len, true); 675 if (tmp != lma_len) { 676 return false; 677 } 678 679 spl_id_fr += lma_len; 680 } 681 682 assert(spl_id_fr <= kMaxLemmaSize); 683 684 return user_dict_->put_lemma(static_cast<char16*>(word_str), spl_ids, 685 spl_id_fr, 1); 686 } 687 688 void MatrixSearch::debug_print_dmi(PoolPosType dmi_pos, uint16 nest_level) { 689 if (dmi_pos >= dmi_pool_used_) return; 690 691 DictMatchInfo *dmi = dmi_pool_ + dmi_pos; 692 693 if (1 == nest_level) { 694 printf("-----------------%d\'th DMI node begin----------->\n", dmi_pos); 695 } 696 if (dmi->dict_level > 1) { 697 debug_print_dmi(dmi->dmi_fr, nest_level + 1); 698 } 699 printf("---%d\n", dmi->dict_level); 700 printf(" MileStone: %x, %x\n", dmi->dict_handles[0], dmi->dict_handles[1]); 701 printf(" Spelling : %s, %d\n", SpellingTrie::get_instance(). 702 get_spelling_str(dmi->spl_id), dmi->spl_id); 703 printf(" Total Pinyin Len: %d\n", dmi->splstr_len); 704 if (1 == nest_level) { 705 printf("<----------------%d\'th DMI node end--------------\n\n", dmi_pos); 706 } 707 } 708 709 bool MatrixSearch::try_add_cand0_to_userdict() { 710 size_t new_cand_num = get_candidate_num(); 711 if (fixed_hzs_ > 0 && 1 == new_cand_num) { 712 float score_from = 0; 713 uint16 lma_id_from = 0; 714 uint16 pos = 0; 715 bool modified = false; 716 while (pos < fixed_lmas_) { 717 if (lma_start_[pos + 1] - lma_start_[lma_id_from] > 718 static_cast<uint16>(kMaxLemmaSize)) { 719 float score_to_add = 720 mtrx_nd_pool_[matrix_[spl_start_[lma_start_[pos]]] 721 .mtrx_nd_pos].score - score_from; 722 if (modified) { 723 score_to_add += 1.0; 724 if (score_to_add > NGram::kMaxScore) { 725 score_to_add = NGram::kMaxScore; 726 } 727 add_lma_to_userdict(lma_id_from, pos, score_to_add); 728 } 729 lma_id_from = pos; 730 score_from += score_to_add; 731 732 // Clear the flag for next user lemma. 733 modified = false; 734 } 735 736 if (0 == fixed_lmas_no1_[pos]) { 737 modified = true; 738 } 739 pos++; 740 } 741 742 // Single-char word is not allowed to add to userdict. 743 if (lma_start_[pos] - lma_start_[lma_id_from] > 1) { 744 float score_to_add = 745 mtrx_nd_pool_[matrix_[spl_start_[lma_start_[pos]]] 746 .mtrx_nd_pos].score - score_from; 747 if (modified) { 748 score_to_add += 1.0; 749 if (score_to_add > NGram::kMaxScore) { 750 score_to_add = NGram::kMaxScore; 751 } 752 add_lma_to_userdict(lma_id_from, pos, score_to_add); 753 } 754 } 755 } 756 return true; 757 } 758 759 // Choose a candidate, and give new candidates for next step. 760 // If user finishes selection, we will try to communicate with user dictionary 761 // to add new items or update score of some existing items. 762 // 763 // Basic rule: 764 // 1. If user selects the first choice: 765 // 1.1. If the first choice is not a sentence, instead, it is a lemma: 766 // 1.1.1. If the first choice is a user lemma, notify the user 767 // dictionary that a user lemma is hit, and add occuring count 768 // by 1. 769 // 1.1.2. If the first choice is a system lemma, do nothing. 770 // 1.2. If the first choice is a sentence containing more than one lemma: 771 // 1.2.1. The whole sentence will be added as a user lemma. If the 772 // sentence contains user lemmas, -> hit, and add occuring count 773 // by 1. 774 size_t MatrixSearch::choose(size_t cand_id) { 775 if (!inited_ || 0 == pys_decoded_len_) 776 return 0; 777 778 if (0 == cand_id) { 779 fixed_hzs_ = spl_id_num_; 780 matrix_[spl_start_[fixed_hzs_]].mtrx_nd_fixed = mtrx_nd_pool_ + 781 matrix_[spl_start_[fixed_hzs_]].mtrx_nd_pos; 782 for (size_t pos = fixed_lmas_; pos < lma_id_num_; pos++) { 783 fixed_lmas_no1_[pos] = 1; 784 } 785 fixed_lmas_ = lma_id_num_; 786 lpi_total_ = 0; // Clean all other candidates. 787 788 // 1. It is the first choice 789 if (1 == lma_id_num_) { 790 // 1.1. The first choice is not a sentence but a lemma 791 if (is_user_lemma(lma_id_[0])) { 792 // 1.1.1. The first choice is a user lemma, notify the user dictionary 793 // that it is hit. 794 if (NULL != user_dict_) 795 user_dict_->update_lemma(lma_id_[0], 1, true); 796 } else { 797 // 1.1.2. do thing for a system lemma. 798 } 799 } else { 800 // 1.2. The first choice is a sentence. 801 // 1.2.1 Try to add the whole sentence to user dictionary, the whole 802 // sentence may be splitted into many items. 803 if (NULL != user_dict_) { 804 try_add_cand0_to_userdict(); 805 } 806 } 807 update_dict_freq(); 808 return 1; 809 } else { 810 cand_id--; 811 } 812 813 // 2. It is not the full sentence candidate. 814 // Find the length of the candidate. 815 LemmaIdType id_chosen = lpi_items_[cand_id].id; 816 LmaScoreType score_chosen = lpi_items_[cand_id].psb; 817 size_t cand_len = lpi_items_[cand_id].lma_len; 818 819 assert(cand_len > 0); 820 821 // Notify the atom dictionary that this item is hit. 822 if (is_user_lemma(id_chosen)) { 823 if (NULL != user_dict_) { 824 user_dict_->update_lemma(id_chosen, 1, true); 825 } 826 update_dict_freq(); 827 } 828 829 // 3. Fixed the chosen item. 830 // 3.1 Get the steps number. 831 size_t step_fr = spl_start_[fixed_hzs_]; 832 size_t step_to = spl_start_[fixed_hzs_ + cand_len]; 833 834 // 3.2 Save the length of the original string. 835 size_t pys_decoded_len = pys_decoded_len_; 836 837 // 3.2 Reset the space of the fixed part. 838 reset_search(step_to, false, false, true); 839 840 // 3.3 For the last character of the fixed part, the previous DMI 841 // information will be kept, while the MTRX information will be re-extended, 842 // and only one node will be extended. 843 matrix_[step_to].mtrx_nd_num = 0; 844 845 LmaPsbItem lpi_item; 846 lpi_item.psb = score_chosen; 847 lpi_item.id = id_chosen; 848 849 PoolPosType step_to_dmi_fr = match_dmi(step_to, 850 spl_id_ + fixed_hzs_, cand_len); 851 assert(step_to_dmi_fr != static_cast<PoolPosType>(-1)); 852 853 extend_mtrx_nd(matrix_[step_fr].mtrx_nd_fixed, &lpi_item, 1, 854 step_to_dmi_fr, step_to); 855 856 matrix_[step_to].mtrx_nd_fixed = mtrx_nd_pool_ + matrix_[step_to].mtrx_nd_pos; 857 mtrx_nd_pool_used_ = matrix_[step_to].mtrx_nd_pos + 858 matrix_[step_to].mtrx_nd_num; 859 860 if (id_chosen == lma_id_[fixed_lmas_]) 861 fixed_lmas_no1_[fixed_lmas_] = 1; 862 else 863 fixed_lmas_no1_[fixed_lmas_] = 0; 864 lma_id_[fixed_lmas_] = id_chosen; 865 lma_start_[fixed_lmas_ + 1] = lma_start_[fixed_lmas_] + cand_len; 866 fixed_lmas_++; 867 fixed_hzs_ = fixed_hzs_ + cand_len; 868 869 while (step_to != pys_decoded_len) { 870 bool b = add_char(pys_[step_to]); 871 assert(b); 872 step_to++; 873 } 874 875 if (fixed_hzs_ < spl_id_num_) { 876 prepare_candidates(); 877 } else { 878 lpi_total_ = 0; 879 if (NULL != user_dict_) { 880 try_add_cand0_to_userdict(); 881 } 882 } 883 884 return get_candidate_num(); 885 } 886 887 size_t MatrixSearch::cancel_last_choice() { 888 if (!inited_ || 0 == pys_decoded_len_) 889 return 0; 890 891 size_t step_start = 0; 892 if (fixed_hzs_ > 0) { 893 size_t step_end = spl_start_[fixed_hzs_]; 894 MatrixNode *end_node = matrix_[step_end].mtrx_nd_fixed; 895 assert(NULL != end_node); 896 897 step_start = end_node->from->step; 898 899 if (step_start > 0) { 900 DictMatchInfo *dmi = dmi_pool_ + end_node->dmi_fr; 901 fixed_hzs_ -= dmi->dict_level; 902 } else { 903 fixed_hzs_ = 0; 904 } 905 906 reset_search(step_start, false, false, false); 907 908 while (pys_[step_start] != '\0') { 909 bool b = add_char(pys_[step_start]); 910 assert(b); 911 step_start++; 912 } 913 914 prepare_candidates(); 915 } 916 return get_candidate_num(); 917 } 918 919 size_t MatrixSearch::get_fixedlen() { 920 if (!inited_ || 0 == pys_decoded_len_) 921 return 0; 922 return fixed_hzs_; 923 } 924 925 bool MatrixSearch::prepare_add_char(char ch) { 926 if (pys_decoded_len_ >= kMaxRowNum - 1 || 927 (!spl_parser_->is_valid_to_parse(ch) && ch != '\'')) 928 return false; 929 930 if (dmi_pool_used_ >= kDmiPoolSize) return false; 931 932 pys_[pys_decoded_len_] = ch; 933 pys_decoded_len_++; 934 935 MatrixRow *mtrx_this_row = matrix_ + pys_decoded_len_; 936 mtrx_this_row->mtrx_nd_pos = mtrx_nd_pool_used_; 937 mtrx_this_row->mtrx_nd_num = 0; 938 mtrx_this_row->dmi_pos = dmi_pool_used_; 939 mtrx_this_row->dmi_num = 0; 940 mtrx_this_row->dmi_has_full_id = 0; 941 942 return true; 943 } 944 945 bool MatrixSearch::is_split_at(uint16 pos) { 946 return !spl_parser_->is_valid_to_parse(pys_[pos - 1]); 947 } 948 949 void MatrixSearch::fill_dmi(DictMatchInfo *dmi, MileStoneHandle *handles, 950 PoolPosType dmi_fr, uint16 spl_id, 951 uint16 node_num, unsigned char dict_level, 952 bool splid_end_split, unsigned char splstr_len, 953 unsigned char all_full_id) { 954 dmi->dict_handles[0] = handles[0]; 955 dmi->dict_handles[1] = handles[1]; 956 dmi->dmi_fr = dmi_fr; 957 dmi->spl_id = spl_id; 958 dmi->dict_level = dict_level; 959 dmi->splid_end_split = splid_end_split ? 1 : 0; 960 dmi->splstr_len = splstr_len; 961 dmi->all_full_id = all_full_id; 962 dmi->c_phrase = 0; 963 } 964 965 bool MatrixSearch::add_char(char ch) { 966 if (!prepare_add_char(ch)) 967 return false; 968 return add_char_qwerty(); 969 } 970 971 bool MatrixSearch::add_char_qwerty() { 972 matrix_[pys_decoded_len_].mtrx_nd_num = 0; 973 974 bool spl_matched = false; 975 uint16 longest_ext = 0; 976 // Extend the search matrix, from the oldest unfixed row. ext_len means 977 // extending length. 978 for (uint16 ext_len = kMaxPinyinSize + 1; ext_len > 0; ext_len--) { 979 if (ext_len > pys_decoded_len_ - spl_start_[fixed_hzs_]) 980 continue; 981 982 // Refer to the declaration of the variable dmi_has_full_id for the 983 // explanation of this piece of code. In one word, it is used to prevent 984 // from the unwise extending of "shoud ou" but allow the reasonable 985 // extending of "heng ao", "lang a", etc. 986 if (ext_len > 1 && 0 != longest_ext && 987 0 == matrix_[pys_decoded_len_ - ext_len].dmi_has_full_id) { 988 if (xi_an_enabled_) 989 continue; 990 else 991 break; 992 } 993 994 uint16 oldrow = pys_decoded_len_ - ext_len; 995 996 // 0. If that row is before the last fixed step, ignore. 997 if (spl_start_[fixed_hzs_] > oldrow) 998 continue; 999 1000 // 1. Check if that old row has valid MatrixNode. If no, means that row is 1001 // not a boundary, either a word boundary or a spelling boundary. 1002 // If it is for extending composing phrase, it's OK to ignore the 0. 1003 if (0 == matrix_[oldrow].mtrx_nd_num && !dmi_c_phrase_) 1004 continue; 1005 1006 // 2. Get spelling id(s) for the last ext_len chars. 1007 uint16 spl_idx; 1008 bool is_pre = false; 1009 spl_idx = spl_parser_->get_splid_by_str(pys_ + oldrow, 1010 ext_len, &is_pre); 1011 if (is_pre) 1012 spl_matched = true; 1013 1014 if (0 == spl_idx) 1015 continue; 1016 1017 bool splid_end_split = is_split_at(oldrow + ext_len); 1018 1019 // 3. Extend the DMI nodes of that old row 1020 // + 1 is to extend an extra node from the root 1021 for (PoolPosType dmi_pos = matrix_[oldrow].dmi_pos; 1022 dmi_pos < matrix_[oldrow].dmi_pos + matrix_[oldrow].dmi_num + 1; 1023 dmi_pos++) { 1024 DictMatchInfo *dmi = dmi_pool_ + dmi_pos; 1025 if (dmi_pos == matrix_[oldrow].dmi_pos + matrix_[oldrow].dmi_num) { 1026 dmi = NULL; // The last one, NULL means extending from the root. 1027 } else { 1028 // If the dmi is covered by the fixed arrange, ignore it. 1029 if (fixed_hzs_ > 0 && 1030 pys_decoded_len_ - ext_len - dmi->splstr_len < 1031 spl_start_[fixed_hzs_]) { 1032 continue; 1033 } 1034 // If it is not in mode for composing phrase, and the source DMI node 1035 // is marked for composing phrase, ignore this node. 1036 if (dmi->c_phrase != 0 && !dmi_c_phrase_) { 1037 continue; 1038 } 1039 } 1040 1041 // For example, if "gao" is extended, "g ao" is not allowed. 1042 // or "zh" has been passed, "z h" is not allowed. 1043 // Both word and word-connection will be prevented. 1044 if (longest_ext > ext_len) { 1045 if (NULL == dmi && 0 == matrix_[oldrow].dmi_has_full_id) { 1046 continue; 1047 } 1048 1049 // "z h" is not allowed. 1050 if (NULL != dmi && spl_trie_->is_half_id(dmi->spl_id)) { 1051 continue; 1052 } 1053 } 1054 1055 dep_->splids_extended = 0; 1056 if (NULL != dmi) { 1057 uint16 prev_ids_num = dmi->dict_level; 1058 if ((!dmi_c_phrase_ && prev_ids_num >= kMaxLemmaSize) || 1059 (dmi_c_phrase_ && prev_ids_num >= kMaxRowNum)) { 1060 continue; 1061 } 1062 1063 DictMatchInfo *d = dmi; 1064 while (d) { 1065 dep_->splids[--prev_ids_num] = d->spl_id; 1066 if ((PoolPosType)-1 == d->dmi_fr) 1067 break; 1068 d = dmi_pool_ + d->dmi_fr; 1069 } 1070 assert(0 == prev_ids_num); 1071 dep_->splids_extended = dmi->dict_level; 1072 } 1073 dep_->splids[dep_->splids_extended] = spl_idx; 1074 dep_->ext_len = ext_len; 1075 dep_->splid_end_split = splid_end_split; 1076 1077 dep_->id_num = 1; 1078 dep_->id_start = spl_idx; 1079 if (spl_trie_->is_half_id(spl_idx)) { 1080 // Get the full id list 1081 dep_->id_num = spl_trie_->half_to_full(spl_idx, &(dep_->id_start)); 1082 assert(dep_->id_num > 0); 1083 } 1084 1085 uint16 new_dmi_num; 1086 1087 new_dmi_num = extend_dmi(dep_, dmi); 1088 1089 if (new_dmi_num > 0) { 1090 if (dmi_c_phrase_) { 1091 dmi_pool_[dmi_pool_used_].c_phrase = 1; 1092 } 1093 matrix_[pys_decoded_len_].dmi_num += new_dmi_num; 1094 dmi_pool_used_ += new_dmi_num; 1095 1096 if (!spl_trie_->is_half_id(spl_idx)) 1097 matrix_[pys_decoded_len_].dmi_has_full_id = 1; 1098 } 1099 1100 // If get candiate lemmas, try to extend the path 1101 if (lpi_total_ > 0) { 1102 uint16 fr_row; 1103 if (NULL == dmi) { 1104 fr_row = oldrow; 1105 } else { 1106 assert(oldrow >= dmi->splstr_len); 1107 fr_row = oldrow - dmi->splstr_len; 1108 } 1109 for (PoolPosType mtrx_nd_pos = matrix_[fr_row].mtrx_nd_pos; 1110 mtrx_nd_pos < matrix_[fr_row].mtrx_nd_pos + 1111 matrix_[fr_row].mtrx_nd_num; 1112 mtrx_nd_pos++) { 1113 MatrixNode *mtrx_nd = mtrx_nd_pool_ + mtrx_nd_pos; 1114 1115 extend_mtrx_nd(mtrx_nd, lpi_items_, lpi_total_, 1116 dmi_pool_used_ - new_dmi_num, pys_decoded_len_); 1117 if (longest_ext == 0) 1118 longest_ext = ext_len; 1119 } 1120 } 1121 } // for dmi_pos 1122 } // for ext_len 1123 mtrx_nd_pool_used_ += matrix_[pys_decoded_len_].mtrx_nd_num; 1124 1125 if (dmi_c_phrase_) 1126 return true; 1127 1128 return (matrix_[pys_decoded_len_].mtrx_nd_num != 0 || spl_matched); 1129 } 1130 1131 void MatrixSearch::prepare_candidates() { 1132 // Get candiates from the first un-fixed step. 1133 uint16 lma_size_max = kMaxLemmaSize; 1134 if (lma_size_max > spl_id_num_ - fixed_hzs_) 1135 lma_size_max = spl_id_num_ - fixed_hzs_; 1136 1137 uint16 lma_size = lma_size_max; 1138 1139 // If the full sentense candidate's unfixed part may be the same with a normal 1140 // lemma. Remove the lemma candidate in this case. 1141 char16 fullsent[kMaxLemmaSize + 1]; 1142 char16 *pfullsent = NULL; 1143 uint16 sent_len; 1144 pfullsent = get_candidate0(fullsent, kMaxLemmaSize + 1, &sent_len, true); 1145 1146 // If the unfixed part contains more than one ids, it is not necessary to 1147 // check whether a lemma's string is the same to the unfixed part of the full 1148 // sentence candidate, so, set it to NULL; 1149 if (sent_len > kMaxLemmaSize) 1150 pfullsent = NULL; 1151 1152 lpi_total_ = 0; 1153 size_t lpi_num_full_match = 0; // Number of items which are fully-matched. 1154 while (lma_size > 0) { 1155 size_t lma_num; 1156 lma_num = get_lpis(spl_id_ + fixed_hzs_, lma_size, 1157 lpi_items_ + lpi_total_, 1158 size_t(kMaxLmaPsbItems - lpi_total_), 1159 pfullsent, lma_size == lma_size_max); 1160 1161 if (lma_num > 0) { 1162 lpi_total_ += lma_num; 1163 // For next lemma candidates which are not the longest, it is not 1164 // necessary to compare with the full sentence candiate. 1165 pfullsent = NULL; 1166 } 1167 if (lma_size == lma_size_max) { 1168 lpi_num_full_match = lpi_total_; 1169 } 1170 lma_size--; 1171 } 1172 1173 // Sort those partially-matched items by their unified scores. 1174 myqsort(lpi_items_ + lpi_num_full_match, lpi_total_ - lpi_num_full_match, 1175 sizeof(LmaPsbItem), cmp_lpi_with_unified_psb); 1176 1177 if (kPrintDebug0) { 1178 printf("-----Prepare candidates, score:\n"); 1179 for (size_t a = 0; a < lpi_total_; a++) { 1180 printf("[%03d]%d ", a, lpi_items_[a].psb); 1181 if ((a + 1) % 6 == 0) printf("\n"); 1182 } 1183 printf("\n"); 1184 } 1185 1186 if (kPrintDebug0) { 1187 printf("--- lpi_total_ = %d\n", lpi_total_); 1188 } 1189 } 1190 1191 const char* MatrixSearch::get_pystr(size_t *decoded_len) { 1192 if (!inited_ || NULL == decoded_len) 1193 return NULL; 1194 1195 *decoded_len = pys_decoded_len_; 1196 return pys_; 1197 } 1198 1199 void MatrixSearch::merge_fixed_lmas(size_t del_spl_pos) { 1200 if (fixed_lmas_ == 0) 1201 return; 1202 // Update spelling segmentation information first. 1203 spl_id_num_ -= 1; 1204 uint16 del_py_len = spl_start_[del_spl_pos + 1] - spl_start_[del_spl_pos]; 1205 for (size_t pos = del_spl_pos; pos <= spl_id_num_; pos++) { 1206 spl_start_[pos] = spl_start_[pos + 1] - del_py_len; 1207 if (pos == spl_id_num_) 1208 break; 1209 spl_id_[pos] = spl_id_[pos + 1]; 1210 } 1211 1212 // Begin to merge. 1213 uint16 phrase_len = 0; 1214 1215 // Update the spelling ids to the composing phrase. 1216 // We need to convert these ids into full id in the future. 1217 memcpy(c_phrase_.spl_ids, spl_id_, spl_id_num_ * sizeof(uint16)); 1218 memcpy(c_phrase_.spl_start, spl_start_, (spl_id_num_ + 1) * sizeof(uint16)); 1219 1220 // If composing phrase has not been created, first merge all fixed 1221 // lemmas into a composing phrase without deletion. 1222 if (fixed_lmas_ > 1 || kLemmaIdComposing != lma_id_[0]) { 1223 uint16 bp = 1; // Begin position of real fixed lemmas. 1224 // There is no existing composing phrase. 1225 if (kLemmaIdComposing != lma_id_[0]) { 1226 c_phrase_.sublma_num = 0; 1227 bp = 0; 1228 } 1229 1230 uint16 sub_num = c_phrase_.sublma_num; 1231 for (uint16 pos = bp; pos <= fixed_lmas_; pos++) { 1232 c_phrase_.sublma_start[sub_num + pos - bp] = lma_start_[pos]; 1233 if (lma_start_[pos] > del_spl_pos) { 1234 c_phrase_.sublma_start[sub_num + pos - bp] -= 1; 1235 } 1236 1237 if (pos == fixed_lmas_) 1238 break; 1239 1240 uint16 lma_len; 1241 char16 *lma_str = c_phrase_.chn_str + 1242 c_phrase_.sublma_start[sub_num] + phrase_len; 1243 1244 lma_len = get_lemma_str(lma_id_[pos], lma_str, kMaxRowNum - phrase_len); 1245 assert(lma_len == lma_start_[pos + 1] - lma_start_[pos]); 1246 phrase_len += lma_len; 1247 } 1248 assert(phrase_len == lma_start_[fixed_lmas_]); 1249 c_phrase_.length = phrase_len; // will be deleted by 1 1250 c_phrase_.sublma_num += fixed_lmas_ - bp; 1251 } else { 1252 for (uint16 pos = 0; pos <= c_phrase_.sublma_num; pos++) { 1253 if (c_phrase_.sublma_start[pos] > del_spl_pos) { 1254 c_phrase_.sublma_start[pos] -= 1; 1255 } 1256 } 1257 phrase_len = c_phrase_.length; 1258 } 1259 1260 assert(phrase_len > 0); 1261 if (1 == phrase_len) { 1262 // After the only one is deleted, nothing will be left. 1263 fixed_lmas_ = 0; 1264 return; 1265 } 1266 1267 // Delete the Chinese character in the merged phrase. 1268 // The corresponding elements in spl_ids and spl_start of the 1269 // phrase have been deleted. 1270 char16 *chn_str = c_phrase_.chn_str + del_spl_pos; 1271 for (uint16 pos = 0; 1272 pos < c_phrase_.sublma_start[c_phrase_.sublma_num] - del_spl_pos; 1273 pos++) { 1274 chn_str[pos] = chn_str[pos + 1]; 1275 } 1276 c_phrase_.length -= 1; 1277 1278 // If the deleted spelling id is in a sub lemma which contains more than 1279 // one id, del_a_sub will be false; but if the deleted id is in a sub lemma 1280 // which only contains 1 id, the whole sub lemma needs to be deleted, so 1281 // del_a_sub will be true. 1282 bool del_a_sub = false; 1283 for (uint16 pos = 1; pos <= c_phrase_.sublma_num; pos++) { 1284 if (c_phrase_.sublma_start[pos - 1] == 1285 c_phrase_.sublma_start[pos]) { 1286 del_a_sub = true; 1287 } 1288 if (del_a_sub) { 1289 c_phrase_.sublma_start[pos - 1] = 1290 c_phrase_.sublma_start[pos]; 1291 } 1292 } 1293 if (del_a_sub) 1294 c_phrase_.sublma_num -= 1; 1295 1296 return; 1297 } 1298 1299 void MatrixSearch::get_spl_start_id() { 1300 lma_id_num_ = 0; 1301 lma_start_[0] = 0; 1302 1303 spl_id_num_ = 0; 1304 spl_start_[0] = 0; 1305 if (!inited_ || 0 == pys_decoded_len_ || 1306 0 == matrix_[pys_decoded_len_].mtrx_nd_num) 1307 return; 1308 1309 // Calculate number of lemmas and spellings 1310 // Only scan those part which is not fixed. 1311 lma_id_num_ = fixed_lmas_; 1312 spl_id_num_ = fixed_hzs_; 1313 1314 MatrixNode *mtrx_nd = mtrx_nd_pool_ + matrix_[pys_decoded_len_].mtrx_nd_pos; 1315 while (mtrx_nd != mtrx_nd_pool_) { 1316 if (fixed_hzs_ > 0) { 1317 if (mtrx_nd->step <= spl_start_[fixed_hzs_]) 1318 break; 1319 } 1320 1321 // Update the spelling segamentation information 1322 unsigned char word_splstr_len = 0; 1323 PoolPosType dmi_fr = mtrx_nd->dmi_fr; 1324 if ((PoolPosType)-1 != dmi_fr) 1325 word_splstr_len = dmi_pool_[dmi_fr].splstr_len; 1326 1327 while ((PoolPosType)-1 != dmi_fr) { 1328 spl_start_[spl_id_num_ + 1] = mtrx_nd->step - 1329 (word_splstr_len - dmi_pool_[dmi_fr].splstr_len); 1330 spl_id_[spl_id_num_] = dmi_pool_[dmi_fr].spl_id; 1331 spl_id_num_++; 1332 dmi_fr = dmi_pool_[dmi_fr].dmi_fr; 1333 } 1334 1335 // Update the lemma segmentation information 1336 lma_start_[lma_id_num_ + 1] = spl_id_num_; 1337 lma_id_[lma_id_num_] = mtrx_nd->id; 1338 lma_id_num_++; 1339 1340 mtrx_nd = mtrx_nd->from; 1341 } 1342 1343 // Reverse the result of spelling info 1344 for (size_t pos = fixed_hzs_; 1345 pos < fixed_hzs_ + (spl_id_num_ - fixed_hzs_ + 1) / 2; pos++) { 1346 if (spl_id_num_ + fixed_hzs_ - pos != pos + 1) { 1347 spl_start_[pos + 1] ^= spl_start_[spl_id_num_ - pos + fixed_hzs_]; 1348 spl_start_[spl_id_num_ - pos + fixed_hzs_] ^= spl_start_[pos + 1]; 1349 spl_start_[pos + 1] ^= spl_start_[spl_id_num_ - pos + fixed_hzs_]; 1350 1351 spl_id_[pos] ^= spl_id_[spl_id_num_ + fixed_hzs_ - pos - 1]; 1352 spl_id_[spl_id_num_ + fixed_hzs_- pos - 1] ^= spl_id_[pos]; 1353 spl_id_[pos] ^= spl_id_[spl_id_num_ + fixed_hzs_- pos - 1]; 1354 } 1355 } 1356 1357 // Reverse the result of lemma info 1358 for (size_t pos = fixed_lmas_; 1359 pos < fixed_lmas_ + (lma_id_num_ - fixed_lmas_ + 1) / 2; pos++) { 1360 assert(lma_id_num_ + fixed_lmas_ - pos - 1 >= pos); 1361 1362 if (lma_id_num_ + fixed_lmas_ - pos > pos + 1) { 1363 lma_start_[pos + 1] ^= lma_start_[lma_id_num_ - pos + fixed_lmas_]; 1364 lma_start_[lma_id_num_ - pos + fixed_lmas_] ^= lma_start_[pos + 1]; 1365 lma_start_[pos + 1] ^= lma_start_[lma_id_num_ - pos + fixed_lmas_]; 1366 1367 lma_id_[pos] ^= lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_]; 1368 lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_] ^= lma_id_[pos]; 1369 lma_id_[pos] ^= lma_id_[lma_id_num_ - 1 - pos + fixed_lmas_]; 1370 } 1371 } 1372 1373 for (size_t pos = fixed_lmas_ + 1; pos <= lma_id_num_; pos++) { 1374 if (pos < lma_id_num_) 1375 lma_start_[pos] = lma_start_[pos - 1] + 1376 (lma_start_[pos] - lma_start_[pos + 1]); 1377 else 1378 lma_start_[pos] = lma_start_[pos - 1] + lma_start_[pos] - 1379 lma_start_[fixed_lmas_]; 1380 } 1381 1382 // Find the last fixed position 1383 fixed_hzs_ = 0; 1384 for (size_t pos = spl_id_num_; pos > 0; pos--) { 1385 if (NULL != matrix_[spl_start_[pos]].mtrx_nd_fixed) { 1386 fixed_hzs_ = pos; 1387 break; 1388 } 1389 } 1390 1391 return; 1392 } 1393 1394 size_t MatrixSearch::get_spl_start(const uint16 *&spl_start) { 1395 get_spl_start_id(); 1396 spl_start = spl_start_; 1397 return spl_id_num_; 1398 } 1399 1400 size_t MatrixSearch::extend_dmi(DictExtPara *dep, DictMatchInfo *dmi_s) { 1401 if (dmi_pool_used_ >= kDmiPoolSize) return 0; 1402 1403 if (dmi_c_phrase_) 1404 return extend_dmi_c(dep, dmi_s); 1405 1406 LpiCache& lpi_cache = LpiCache::get_instance(); 1407 uint16 splid = dep->splids[dep->splids_extended]; 1408 1409 bool cached = false; 1410 if (0 == dep->splids_extended) 1411 cached = lpi_cache.is_cached(splid); 1412 1413 // 1. If this is a half Id, get its corresponding full starting Id and 1414 // number of full Id. 1415 size_t ret_val = 0; 1416 PoolPosType mtrx_dmi_fr = (PoolPosType)-1; // From which dmi node 1417 1418 lpi_total_ = 0; 1419 1420 MileStoneHandle from_h[3]; 1421 from_h[0] = 0; 1422 from_h[1] = 0; 1423 1424 if (0 != dep->splids_extended) { 1425 from_h[0] = dmi_s->dict_handles[0]; 1426 from_h[1] = dmi_s->dict_handles[1]; 1427 } 1428 1429 // 2. Begin exgtending in the system dictionary 1430 size_t lpi_num = 0; 1431 MileStoneHandle handles[2]; 1432 handles[0] = handles[1] = 0; 1433 if (from_h[0] > 0 || NULL == dmi_s) { 1434 handles[0] = dict_trie_->extend_dict(from_h[0], dep, lpi_items_, 1435 kMaxLmaPsbItems, &lpi_num); 1436 } 1437 if (handles[0] > 0) 1438 lpi_total_ = lpi_num; 1439 1440 if (NULL == dmi_s) { // from root 1441 assert(0 != handles[0]); 1442 mtrx_dmi_fr = dmi_pool_used_; 1443 } 1444 1445 // 3. Begin extending in the user dictionary 1446 if (NULL != user_dict_ && (from_h[1] > 0 || NULL == dmi_s)) { 1447 handles[1] = user_dict_->extend_dict(from_h[1], dep, 1448 lpi_items_ + lpi_total_, 1449 kMaxLmaPsbItems - lpi_total_, 1450 &lpi_num); 1451 if (handles[1] > 0) { 1452 if (kPrintDebug0) { 1453 for (size_t t = 0; t < lpi_num; t++) { 1454 printf("--Extend in user dict: uid:%d uscore:%d\n", lpi_items_[lpi_total_ + t].id, 1455 lpi_items_[lpi_total_ + t].psb); 1456 } 1457 } 1458 lpi_total_ += lpi_num; 1459 } 1460 } 1461 1462 if (0 != handles[0] || 0 != handles[1]) { 1463 if (dmi_pool_used_ >= kDmiPoolSize) return 0; 1464 1465 DictMatchInfo *dmi_add = dmi_pool_ + dmi_pool_used_; 1466 if (NULL == dmi_s) { 1467 fill_dmi(dmi_add, handles, 1468 (PoolPosType)-1, splid, 1469 1, 1, dep->splid_end_split, dep->ext_len, 1470 spl_trie_->is_half_id(splid) ? 0 : 1); 1471 } else { 1472 fill_dmi(dmi_add, handles, 1473 dmi_s - dmi_pool_, splid, 1, 1474 dmi_s->dict_level + 1, dep->splid_end_split, 1475 dmi_s->splstr_len + dep->ext_len, 1476 spl_trie_->is_half_id(splid) ? 0 : dmi_s->all_full_id); 1477 } 1478 1479 ret_val = 1; 1480 } 1481 1482 if (!cached) { 1483 if (0 == lpi_total_) 1484 return ret_val; 1485 1486 if (kPrintDebug0) { 1487 printf("--- lpi_total_ = %d\n", lpi_total_); 1488 } 1489 1490 myqsort(lpi_items_, lpi_total_, sizeof(LmaPsbItem), cmp_lpi_with_psb); 1491 if (NULL == dmi_s && spl_trie_->is_half_id(splid)) 1492 lpi_total_ = lpi_cache.put_cache(splid, lpi_items_, lpi_total_); 1493 } else { 1494 assert(spl_trie_->is_half_id(splid)); 1495 lpi_total_ = lpi_cache.get_cache(splid, lpi_items_, kMaxLmaPsbItems); 1496 } 1497 1498 return ret_val; 1499 } 1500 1501 size_t MatrixSearch::extend_dmi_c(DictExtPara *dep, DictMatchInfo *dmi_s) { 1502 lpi_total_ = 0; 1503 1504 uint16 pos = dep->splids_extended; 1505 assert(dmi_c_phrase_); 1506 if (pos >= c_phrase_.length) 1507 return 0; 1508 1509 uint16 splid = dep->splids[pos]; 1510 if (splid == c_phrase_.spl_ids[pos]) { 1511 DictMatchInfo *dmi_add = dmi_pool_ + dmi_pool_used_; 1512 MileStoneHandle handles[2]; // Actually never used. 1513 if (NULL == dmi_s) 1514 fill_dmi(dmi_add, handles, 1515 (PoolPosType)-1, splid, 1516 1, 1, dep->splid_end_split, dep->ext_len, 1517 spl_trie_->is_half_id(splid) ? 0 : 1); 1518 else 1519 fill_dmi(dmi_add, handles, 1520 dmi_s - dmi_pool_, splid, 1, 1521 dmi_s->dict_level + 1, dep->splid_end_split, 1522 dmi_s->splstr_len + dep->ext_len, 1523 spl_trie_->is_half_id(splid) ? 0 : dmi_s->all_full_id); 1524 1525 if (pos == c_phrase_.length - 1) { 1526 lpi_items_[0].id = kLemmaIdComposing; 1527 lpi_items_[0].psb = 0; // 0 is bigger than normal lemma score. 1528 lpi_total_ = 1; 1529 } 1530 return 1; 1531 } 1532 return 0; 1533 } 1534 1535 size_t MatrixSearch::extend_mtrx_nd(MatrixNode *mtrx_nd, LmaPsbItem lpi_items[], 1536 size_t lpi_num, PoolPosType dmi_fr, 1537 size_t res_row) { 1538 assert(NULL != mtrx_nd); 1539 matrix_[res_row].mtrx_nd_fixed = NULL; 1540 1541 if (mtrx_nd_pool_used_ >= kMtrxNdPoolSize - kMaxNodeARow) 1542 return 0; 1543 1544 if (0 == mtrx_nd->step) { 1545 // Because the list is sorted, if the source step is 0, it is only 1546 // necessary to pick up the first kMaxNodeARow items. 1547 if (lpi_num > kMaxNodeARow) 1548 lpi_num = kMaxNodeARow; 1549 } 1550 1551 MatrixNode *mtrx_nd_res_min = mtrx_nd_pool_ + matrix_[res_row].mtrx_nd_pos; 1552 for (size_t pos = 0; pos < lpi_num; pos++) { 1553 float score = mtrx_nd->score + lpi_items[pos].psb; 1554 if (pos > 0 && score - PRUMING_SCORE > mtrx_nd_res_min->score) 1555 break; 1556 1557 // Try to add a new node 1558 size_t mtrx_nd_num = matrix_[res_row].mtrx_nd_num; 1559 MatrixNode *mtrx_nd_res = mtrx_nd_res_min + mtrx_nd_num; 1560 bool replace = false; 1561 // Find its position 1562 while (mtrx_nd_res > mtrx_nd_res_min && score < (mtrx_nd_res - 1)->score) { 1563 if (static_cast<size_t>(mtrx_nd_res - mtrx_nd_res_min) < kMaxNodeARow) 1564 *mtrx_nd_res = *(mtrx_nd_res - 1); 1565 mtrx_nd_res--; 1566 replace = true; 1567 } 1568 if (replace || (mtrx_nd_num < kMaxNodeARow && 1569 matrix_[res_row].mtrx_nd_pos + mtrx_nd_num < kMtrxNdPoolSize)) { 1570 mtrx_nd_res->id = lpi_items[pos].id; 1571 mtrx_nd_res->score = score; 1572 mtrx_nd_res->from = mtrx_nd; 1573 mtrx_nd_res->dmi_fr = dmi_fr; 1574 mtrx_nd_res->step = res_row; 1575 if (matrix_[res_row].mtrx_nd_num < kMaxNodeARow) 1576 matrix_[res_row].mtrx_nd_num++; 1577 } 1578 } 1579 return matrix_[res_row].mtrx_nd_num; 1580 } 1581 1582 PoolPosType MatrixSearch::match_dmi(size_t step_to, uint16 spl_ids[], 1583 uint16 spl_id_num) { 1584 if (pys_decoded_len_ < step_to || 0 == matrix_[step_to].dmi_num) { 1585 return static_cast<PoolPosType>(-1); 1586 } 1587 1588 for (PoolPosType dmi_pos = 0; dmi_pos < matrix_[step_to].dmi_num; dmi_pos++) { 1589 DictMatchInfo *dmi = dmi_pool_ + matrix_[step_to].dmi_pos + dmi_pos; 1590 1591 if (dmi->dict_level != spl_id_num) 1592 continue; 1593 1594 bool matched = true; 1595 for (uint16 spl_pos = 0; spl_pos < spl_id_num; spl_pos++) { 1596 if (spl_ids[spl_id_num - spl_pos - 1] != dmi->spl_id) { 1597 matched = false; 1598 break; 1599 } 1600 1601 dmi = dmi_pool_ + dmi->dmi_fr; 1602 } 1603 if (matched) { 1604 return matrix_[step_to].dmi_pos + dmi_pos; 1605 } 1606 } 1607 1608 return static_cast<PoolPosType>(-1); 1609 } 1610 1611 char16* MatrixSearch::get_candidate0(char16 *cand_str, size_t max_len, 1612 uint16 *retstr_len, 1613 bool only_unfixed) { 1614 if (pys_decoded_len_ == 0 || 1615 matrix_[pys_decoded_len_].mtrx_nd_num == 0) 1616 return NULL; 1617 1618 LemmaIdType idxs[kMaxRowNum]; 1619 size_t id_num = 0; 1620 1621 MatrixNode *mtrx_nd = mtrx_nd_pool_ + matrix_[pys_decoded_len_].mtrx_nd_pos; 1622 1623 if (kPrintDebug0) { 1624 printf("--- sentence score: %f\n", mtrx_nd->score); 1625 } 1626 1627 if (kPrintDebug1) { 1628 printf("==============Sentence DMI (reverse order) begin===========>>\n"); 1629 } 1630 1631 while (mtrx_nd != NULL) { 1632 idxs[id_num] = mtrx_nd->id; 1633 id_num++; 1634 1635 if (kPrintDebug1) { 1636 printf("---MatrixNode [step: %d, lma_idx: %d, total score:%.5f]\n", 1637 mtrx_nd->step, mtrx_nd->id, mtrx_nd->score); 1638 debug_print_dmi(mtrx_nd->dmi_fr, 1); 1639 } 1640 1641 mtrx_nd = mtrx_nd->from; 1642 } 1643 1644 if (kPrintDebug1) { 1645 printf("<<==============Sentence DMI (reverse order) end=============\n"); 1646 } 1647 1648 size_t ret_pos = 0; 1649 do { 1650 id_num--; 1651 if (0 == idxs[id_num]) 1652 continue; 1653 1654 char16 str[kMaxLemmaSize + 1]; 1655 uint16 str_len = get_lemma_str(idxs[id_num], str, kMaxLemmaSize + 1); 1656 if (str_len > 0 && ((!only_unfixed && max_len - ret_pos > str_len) || 1657 (only_unfixed && max_len - ret_pos + fixed_hzs_ > str_len))) { 1658 if (!only_unfixed) 1659 utf16_strncpy(cand_str + ret_pos, str, str_len); 1660 else if (ret_pos >= fixed_hzs_) 1661 utf16_strncpy(cand_str + ret_pos - fixed_hzs_, str, str_len); 1662 1663 ret_pos += str_len; 1664 } else { 1665 return NULL; 1666 } 1667 } while (id_num != 0); 1668 1669 if (!only_unfixed) { 1670 if (NULL != retstr_len) 1671 *retstr_len = ret_pos; 1672 cand_str[ret_pos] = (char16)'\0'; 1673 } else { 1674 if (NULL != retstr_len) 1675 *retstr_len = ret_pos - fixed_hzs_; 1676 cand_str[ret_pos - fixed_hzs_] = (char16)'\0'; 1677 } 1678 return cand_str; 1679 } 1680 1681 size_t MatrixSearch::get_lpis(const uint16* splid_str, size_t splid_str_len, 1682 LmaPsbItem* lma_buf, size_t max_lma_buf, 1683 const char16 *pfullsent, bool sort_by_psb) { 1684 if (splid_str_len > kMaxLemmaSize) 1685 return 0; 1686 1687 size_t num1 = dict_trie_->get_lpis(splid_str, splid_str_len, 1688 lma_buf, max_lma_buf); 1689 size_t num2 = 0; 1690 if (NULL != user_dict_) { 1691 num2 = user_dict_->get_lpis(splid_str, splid_str_len, 1692 lma_buf + num1, max_lma_buf - num1); 1693 } 1694 1695 size_t num = num1 + num2; 1696 1697 if (0 == num) 1698 return 0; 1699 1700 // Remove repeated items. 1701 if (splid_str_len > 1) { 1702 LmaPsbStrItem *lpsis = reinterpret_cast<LmaPsbStrItem*>(lma_buf + num); 1703 size_t lpsi_num = (max_lma_buf - num) * sizeof(LmaPsbItem) / 1704 sizeof(LmaPsbStrItem); 1705 assert(lpsi_num > num); 1706 if (num > lpsi_num) num = lpsi_num; 1707 lpsi_num = num; 1708 1709 for (size_t pos = 0; pos < lpsi_num; pos++) { 1710 lpsis[pos].lpi = lma_buf[pos]; 1711 get_lemma_str(lma_buf[pos].id, lpsis[pos].str, kMaxLemmaSize + 1); 1712 } 1713 1714 myqsort(lpsis, lpsi_num, sizeof(LmaPsbStrItem), cmp_lpsi_with_str); 1715 1716 size_t remain_num = 0; 1717 for (size_t pos = 0; pos < lpsi_num; pos++) { 1718 if (pos > 0 && utf16_strcmp(lpsis[pos].str, lpsis[pos - 1].str) == 0) { 1719 if (lpsis[pos].lpi.psb < lpsis[pos - 1].lpi.psb) { 1720 assert(remain_num > 0); 1721 lma_buf[remain_num - 1] = lpsis[pos].lpi; 1722 } 1723 continue; 1724 } 1725 if (NULL != pfullsent && utf16_strcmp(lpsis[pos].str, pfullsent) == 0) 1726 continue; 1727 1728 lma_buf[remain_num] = lpsis[pos].lpi; 1729 remain_num++; 1730 } 1731 1732 // Update the result number 1733 num = remain_num; 1734 } else { 1735 // For single character, some characters have more than one spelling, for 1736 // example, "de" and "di" are all valid for a Chinese character, so when 1737 // the user input "d", repeated items are generated. 1738 // For single character lemmas, Hanzis will be gotten 1739 for (size_t pos = 0; pos < num; pos++) { 1740 char16 hanzis[2]; 1741 get_lemma_str(lma_buf[pos].id, hanzis, 2); 1742 lma_buf[pos].hanzi = hanzis[0]; 1743 } 1744 1745 myqsort(lma_buf, num, sizeof(LmaPsbItem), cmp_lpi_with_hanzi); 1746 1747 size_t remain_num = 0; 1748 for (size_t pos = 0; pos < num; pos++) { 1749 if (pos > 0 && lma_buf[pos].hanzi == lma_buf[pos - 1].hanzi) { 1750 if (NULL != pfullsent && 1751 static_cast<char16>(0) == pfullsent[1] && 1752 lma_buf[pos].hanzi == pfullsent[0]) 1753 continue; 1754 1755 if (lma_buf[pos].psb < lma_buf[pos - 1].psb) { 1756 assert(remain_num > 0); 1757 assert(lma_buf[remain_num - 1].hanzi == lma_buf[pos].hanzi); 1758 lma_buf[remain_num - 1] = lma_buf[pos]; 1759 } 1760 continue; 1761 } 1762 if (NULL != pfullsent && 1763 static_cast<char16>(0) == pfullsent[1] && 1764 lma_buf[pos].hanzi == pfullsent[0]) 1765 continue; 1766 1767 lma_buf[remain_num] = lma_buf[pos]; 1768 remain_num++; 1769 } 1770 1771 num = remain_num; 1772 } 1773 1774 if (sort_by_psb) { 1775 myqsort(lma_buf, num, sizeof(LmaPsbItem), cmp_lpi_with_psb); 1776 } 1777 return num; 1778 } 1779 1780 uint16 MatrixSearch::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, 1781 uint16 str_max) { 1782 uint16 str_len = 0; 1783 1784 if (is_system_lemma(id_lemma)) { 1785 str_len = dict_trie_->get_lemma_str(id_lemma, str_buf, str_max); 1786 } else if (is_user_lemma(id_lemma)) { 1787 if (NULL != user_dict_) { 1788 str_len = user_dict_->get_lemma_str(id_lemma, str_buf, str_max); 1789 } else { 1790 str_len = 0; 1791 str_buf[0] = static_cast<char16>('\0'); 1792 } 1793 } else if (is_composing_lemma(id_lemma)) { 1794 if (str_max <= 1) 1795 return 0; 1796 str_len = c_phrase_.sublma_start[c_phrase_.sublma_num]; 1797 if (str_len > str_max - 1) 1798 str_len = str_max - 1; 1799 utf16_strncpy(str_buf, c_phrase_.chn_str, str_len); 1800 str_buf[str_len] = (char16)'\0'; 1801 return str_len; 1802 } 1803 1804 return str_len; 1805 } 1806 1807 uint16 MatrixSearch::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, 1808 uint16 splids_max, bool arg_valid) { 1809 uint16 splid_num = 0; 1810 1811 if (arg_valid) { 1812 for (splid_num = 0; splid_num < splids_max; splid_num++) { 1813 if (spl_trie_->is_half_id(splids[splid_num])) 1814 break; 1815 } 1816 if (splid_num == splids_max) 1817 return splid_num; 1818 } 1819 1820 if (is_system_lemma(id_lemma)) { 1821 splid_num = dict_trie_->get_lemma_splids(id_lemma, splids, splids_max, 1822 arg_valid); 1823 } else if (is_user_lemma(id_lemma)) { 1824 if (NULL != user_dict_) { 1825 splid_num = user_dict_->get_lemma_splids(id_lemma, splids, splids_max, 1826 arg_valid); 1827 } else { 1828 splid_num = 0; 1829 } 1830 } else if (is_composing_lemma(id_lemma)) { 1831 if (c_phrase_.length > splids_max) { 1832 return 0; 1833 } 1834 for (uint16 pos = 0; pos < c_phrase_.length; pos++) { 1835 splids[pos] = c_phrase_.spl_ids[pos]; 1836 if (spl_trie_->is_half_id(splids[pos])) { 1837 return 0; 1838 } 1839 } 1840 } 1841 return splid_num; 1842 } 1843 1844 size_t MatrixSearch::inner_predict(const char16 *fixed_buf, uint16 fixed_len, 1845 char16 predict_buf[][kMaxPredictSize + 1], 1846 size_t buf_len) { 1847 size_t res_total = 0; 1848 memset(npre_items_, 0, sizeof(NPredictItem) * npre_items_len_); 1849 // In order to shorten the comments, j-character candidates predicted by 1850 // i-character prefix are called P(i,j). All candiates predicted by 1851 // i-character prefix are called P(i,*) 1852 // Step 1. Get P(kMaxPredictSize, *) and sort them, here 1853 // P(kMaxPredictSize, *) == P(kMaxPredictSize, 1) 1854 for (size_t len = fixed_len; len >0; len--) { 1855 // How many blank items are available 1856 size_t this_max = npre_items_len_ - res_total; 1857 size_t res_this; 1858 // If the history is longer than 1, and we can not get prediction from 1859 // lemmas longer than 2, in this case, we will add lemmas with 1860 // highest scores as the prediction result. 1861 if (fixed_len > 1 && 1 == len && 0 == res_total) { 1862 // Try to find if recent n (n>1) characters can be a valid lemma in system 1863 // dictionary. 1864 bool nearest_n_word = false; 1865 for (size_t nlen = 2; nlen <= fixed_len; nlen++) { 1866 if (dict_trie_->get_lemma_id(fixed_buf + fixed_len - nlen, nlen) > 0) { 1867 nearest_n_word = true; 1868 break; 1869 } 1870 } 1871 res_this = dict_trie_->predict_top_lmas(nearest_n_word ? len : 0, 1872 npre_items_ + res_total, 1873 this_max, res_total); 1874 res_total += res_this; 1875 } 1876 1877 // How many blank items are available 1878 this_max = npre_items_len_ - res_total; 1879 res_this = 0; 1880 if (!kOnlyUserDictPredict) { 1881 res_this = 1882 dict_trie_->predict(fixed_buf + fixed_len - len, len, 1883 npre_items_ + res_total, this_max, 1884 res_total); 1885 } 1886 1887 if (NULL != user_dict_) { 1888 res_this = res_this + 1889 user_dict_->predict(fixed_buf + fixed_len - len, len, 1890 npre_items_ + res_total + res_this, 1891 this_max - res_this, res_total + res_this); 1892 } 1893 1894 if (kPredictLimitGt1) { 1895 myqsort(npre_items_ + res_total, res_this, sizeof(NPredictItem), 1896 cmp_npre_by_score); 1897 1898 if (len > 3) { 1899 if (res_this > kMaxPredictNumByGt3) 1900 res_this = kMaxPredictNumByGt3; 1901 } else if (3 == len) { 1902 if (res_this > kMaxPredictNumBy3) 1903 res_this = kMaxPredictNumBy3; 1904 } else if (2 == len) { 1905 if (res_this > kMaxPredictNumBy2) 1906 res_this = kMaxPredictNumBy2; 1907 } 1908 } 1909 1910 res_total += res_this; 1911 } 1912 1913 res_total = remove_duplicate_npre(npre_items_, res_total); 1914 1915 if (kPreferLongHistoryPredict) { 1916 myqsort(npre_items_, res_total, sizeof(NPredictItem), 1917 cmp_npre_by_hislen_score); 1918 } else { 1919 myqsort(npre_items_, res_total, sizeof(NPredictItem), 1920 cmp_npre_by_score); 1921 } 1922 1923 if (buf_len < res_total) { 1924 res_total = buf_len; 1925 } 1926 1927 if (kPrintDebug2) { 1928 printf("/////////////////Predicted Items Begin////////////////////>>\n"); 1929 for (size_t i = 0; i < res_total; i++) { 1930 printf("---"); 1931 for (size_t j = 0; j < kMaxPredictSize; j++) { 1932 printf("%d ", npre_items_[i].pre_hzs[j]); 1933 } 1934 printf("\n"); 1935 } 1936 printf("<<///////////////Predicted Items End////////////////////////\n"); 1937 } 1938 1939 for (size_t i = 0; i < res_total; i++) { 1940 utf16_strncpy(predict_buf[i], npre_items_[i].pre_hzs, 1941 kMaxPredictSize); 1942 predict_buf[i][kMaxPredictSize] = '\0'; 1943 } 1944 1945 return res_total; 1946 } 1947 1948 size_t MatrixSearch::get_predicts(const char16 fixed_buf[], 1949 char16 predict_buf[][kMaxPredictSize + 1], 1950 size_t buf_len) { 1951 size_t fixed_len = utf16_strlen(fixed_buf); 1952 if (0 ==fixed_len || fixed_len > kMaxPredictSize || 0 == buf_len) 1953 return 0; 1954 1955 return inner_predict(fixed_buf, fixed_len, predict_buf, buf_len); 1956 } 1957 1958 } // namespace ime_pinyin 1959