1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <assert.h> 18 #include <stdio.h> 19 #include <string.h> 20 #include "../include/dicttrie.h" 21 #include "../include/dictbuilder.h" 22 #include "../include/lpicache.h" 23 #include "../include/mystdlib.h" 24 #include "../include/ngram.h" 25 26 namespace ime_pinyin { 27 28 DictTrie::DictTrie() { 29 spl_trie_ = SpellingTrie::get_cpinstance(); 30 31 root_ = NULL; 32 splid_le0_index_ = NULL; 33 lma_node_num_le0_ = 0; 34 nodes_ge1_ = NULL; 35 lma_node_num_ge1_ = 0; 36 lma_idx_buf_ = NULL; 37 lma_idx_buf_len_ = 0; 38 total_lma_num_ = 0; 39 top_lmas_num_ = 0; 40 dict_list_ = NULL; 41 42 parsing_marks_ = NULL; 43 mile_stones_ = NULL; 44 reset_milestones(0, kFirstValidMileStoneHandle); 45 } 46 47 DictTrie::~DictTrie() { 48 free_resource(true); 49 } 50 51 void DictTrie::free_resource(bool free_dict_list) { 52 if (NULL != root_) 53 free(root_); 54 root_ = NULL; 55 56 if (NULL != splid_le0_index_) 57 free(splid_le0_index_); 58 splid_le0_index_ = NULL; 59 60 if (NULL != nodes_ge1_) 61 free(nodes_ge1_); 62 nodes_ge1_ = NULL; 63 64 if (NULL != nodes_ge1_) 65 free(nodes_ge1_); 66 nodes_ge1_ = NULL; 67 68 if (free_dict_list) { 69 if (NULL != dict_list_) { 70 delete dict_list_; 71 } 72 dict_list_ = NULL; 73 } 74 75 if (parsing_marks_) 76 delete [] parsing_marks_; 77 parsing_marks_ = NULL; 78 79 if (mile_stones_) 80 delete [] mile_stones_; 81 mile_stones_ = NULL; 82 83 reset_milestones(0, kFirstValidMileStoneHandle); 84 } 85 86 inline size_t DictTrie::get_son_offset(const LmaNodeGE1 *node) { 87 return ((size_t)node->son_1st_off_l + ((size_t)node->son_1st_off_h << 16)); 88 } 89 90 inline size_t DictTrie::get_homo_idx_buf_offset(const LmaNodeGE1 *node) { 91 return ((size_t)node->homo_idx_buf_off_l + 92 ((size_t)node->homo_idx_buf_off_h << 16)); 93 } 94 95 inline LemmaIdType DictTrie::get_lemma_id(size_t id_offset) { 96 LemmaIdType id = 0; 97 for (uint16 pos = kLemmaIdSize - 1; pos > 0; pos--) 98 id = (id << 8) + lma_idx_buf_[id_offset * kLemmaIdSize + pos]; 99 id = (id << 8) + lma_idx_buf_[id_offset * kLemmaIdSize]; 100 return id; 101 } 102 103 #ifdef ___BUILD_MODEL___ 104 bool DictTrie::build_dict(const char* fn_raw, const char* fn_validhzs) { 105 DictBuilder* dict_builder = new DictBuilder(); 106 107 free_resource(true); 108 109 return dict_builder->build_dict(fn_raw, fn_validhzs, this); 110 } 111 112 bool DictTrie::save_dict(FILE *fp) { 113 if (NULL == fp) 114 return false; 115 116 if (fwrite(&lma_node_num_le0_, sizeof(size_t), 1, fp) != 1) 117 return false; 118 119 if (fwrite(&lma_node_num_ge1_, sizeof(size_t), 1, fp) != 1) 120 return false; 121 122 if (fwrite(&lma_idx_buf_len_, sizeof(size_t), 1, fp) != 1) 123 return false; 124 125 if (fwrite(&top_lmas_num_, sizeof(size_t), 1, fp) != 1) 126 return false; 127 128 if (fwrite(root_, sizeof(LmaNodeLE0), lma_node_num_le0_, fp) 129 != lma_node_num_le0_) 130 return false; 131 132 if (fwrite(nodes_ge1_, sizeof(LmaNodeGE1), lma_node_num_ge1_, fp) 133 != lma_node_num_ge1_) 134 return false; 135 136 if (fwrite(lma_idx_buf_, sizeof(unsigned char), lma_idx_buf_len_, fp) != 137 lma_idx_buf_len_) 138 return false; 139 140 return true; 141 } 142 143 bool DictTrie::save_dict(const char *filename) { 144 if (NULL == filename) 145 return false; 146 147 if (NULL == root_ || NULL == dict_list_) 148 return false; 149 150 SpellingTrie &spl_trie = SpellingTrie::get_instance(); 151 NGram &ngram = NGram::get_instance(); 152 153 FILE *fp = fopen(filename, "wb"); 154 if (NULL == fp) 155 return false; 156 157 if (!spl_trie.save_spl_trie(fp) || !dict_list_->save_list(fp) || 158 !save_dict(fp) || !ngram.save_ngram(fp)) { 159 fclose(fp); 160 return false; 161 } 162 163 fclose(fp); 164 return true; 165 } 166 #endif // ___BUILD_MODEL___ 167 168 bool DictTrie::load_dict(FILE *fp) { 169 if (NULL == fp) 170 return false; 171 172 if (fread(&lma_node_num_le0_, sizeof(size_t), 1, fp) != 1) 173 return false; 174 175 if (fread(&lma_node_num_ge1_, sizeof(size_t), 1, fp) != 1) 176 return false; 177 178 if (fread(&lma_idx_buf_len_, sizeof(size_t), 1, fp) != 1) 179 return false; 180 181 if (fread(&top_lmas_num_, sizeof(size_t), 1, fp) != 1 || 182 top_lmas_num_ >= lma_idx_buf_len_) 183 return false; 184 185 free_resource(false); 186 187 root_ = static_cast<LmaNodeLE0*> 188 (malloc(lma_node_num_le0_ * sizeof(LmaNodeLE0))); 189 nodes_ge1_ = static_cast<LmaNodeGE1*> 190 (malloc(lma_node_num_ge1_ * sizeof(LmaNodeGE1))); 191 lma_idx_buf_ = (unsigned char*)malloc(lma_idx_buf_len_); 192 total_lma_num_ = lma_idx_buf_len_ / kLemmaIdSize; 193 194 size_t buf_size = SpellingTrie::get_instance().get_spelling_num() + 1; 195 assert(lma_node_num_le0_ <= buf_size); 196 splid_le0_index_ = static_cast<uint16*>(malloc(buf_size * sizeof(uint16))); 197 198 // Init the space for parsing. 199 parsing_marks_ = new ParsingMark[kMaxParsingMark]; 200 mile_stones_ = new MileStone[kMaxMileStone]; 201 reset_milestones(0, kFirstValidMileStoneHandle); 202 203 if (NULL == root_ || NULL == nodes_ge1_ || NULL == lma_idx_buf_ || 204 NULL == splid_le0_index_ || NULL == parsing_marks_ || 205 NULL == mile_stones_) { 206 free_resource(false); 207 return false; 208 } 209 210 if (fread(root_, sizeof(LmaNodeLE0), lma_node_num_le0_, fp) 211 != lma_node_num_le0_) 212 return false; 213 214 if (fread(nodes_ge1_, sizeof(LmaNodeGE1), lma_node_num_ge1_, fp) 215 != lma_node_num_ge1_) 216 return false; 217 218 if (fread(lma_idx_buf_, sizeof(unsigned char), lma_idx_buf_len_, fp) != 219 lma_idx_buf_len_) 220 return false; 221 222 // The quick index for the first level sons 223 uint16 last_splid = kFullSplIdStart; 224 size_t last_pos = 0; 225 for (size_t i = 1; i < lma_node_num_le0_; i++) { 226 for (uint16 splid = last_splid; splid < root_[i].spl_idx; splid++) 227 splid_le0_index_[splid - kFullSplIdStart] = last_pos; 228 229 splid_le0_index_[root_[i].spl_idx - kFullSplIdStart] = 230 static_cast<uint16>(i); 231 last_splid = root_[i].spl_idx; 232 last_pos = i; 233 } 234 235 for (uint16 splid = last_splid + 1; 236 splid < buf_size + kFullSplIdStart; splid++) { 237 assert(static_cast<size_t>(splid - kFullSplIdStart) < buf_size); 238 splid_le0_index_[splid - kFullSplIdStart] = last_pos + 1; 239 } 240 241 return true; 242 } 243 244 bool DictTrie::load_dict(const char *filename, LemmaIdType start_id, 245 LemmaIdType end_id) { 246 if (NULL == filename || end_id <= start_id) 247 return false; 248 249 FILE *fp = fopen(filename, "rb"); 250 if (NULL == fp) 251 return false; 252 253 free_resource(true); 254 255 dict_list_ = new DictList(); 256 if (NULL == dict_list_) { 257 fclose(fp); 258 return false; 259 } 260 261 SpellingTrie &spl_trie = SpellingTrie::get_instance(); 262 NGram &ngram = NGram::get_instance(); 263 264 if (!spl_trie.load_spl_trie(fp) || !dict_list_->load_list(fp) || 265 !load_dict(fp) || !ngram.load_ngram(fp) || 266 total_lma_num_ > end_id - start_id + 1) { 267 free_resource(true); 268 fclose(fp); 269 return false; 270 } 271 272 fclose(fp); 273 return true; 274 } 275 276 bool DictTrie::load_dict_fd(int sys_fd, long start_offset, 277 long length, LemmaIdType start_id, 278 LemmaIdType end_id) { 279 if (start_offset < 0 || length <= 0 || end_id <= start_id) 280 return false; 281 282 FILE *fp = fdopen(sys_fd, "rb"); 283 if (NULL == fp) 284 return false; 285 286 if (-1 == fseek(fp, start_offset, SEEK_SET)) { 287 fclose(fp); 288 return false; 289 } 290 291 free_resource(true); 292 293 dict_list_ = new DictList(); 294 if (NULL == dict_list_) { 295 fclose(fp); 296 return false; 297 } 298 299 SpellingTrie &spl_trie = SpellingTrie::get_instance(); 300 NGram &ngram = NGram::get_instance(); 301 302 if (!spl_trie.load_spl_trie(fp) || !dict_list_->load_list(fp) || 303 !load_dict(fp) || !ngram.load_ngram(fp) || 304 ftell(fp) < start_offset + length || 305 total_lma_num_ > end_id - start_id + 1) { 306 free_resource(true); 307 fclose(fp); 308 return false; 309 } 310 311 fclose(fp); 312 return true; 313 } 314 315 size_t DictTrie::fill_lpi_buffer(LmaPsbItem lpi_items[], size_t lpi_max, 316 LmaNodeLE0 *node) { 317 size_t lpi_num = 0; 318 NGram& ngram = NGram::get_instance(); 319 for (size_t homo = 0; homo < (size_t)node->num_of_homo; homo++) { 320 lpi_items[lpi_num].id = get_lemma_id(node->homo_idx_buf_off + 321 homo); 322 lpi_items[lpi_num].lma_len = 1; 323 lpi_items[lpi_num].psb = 324 static_cast<LmaScoreType>(ngram.get_uni_psb(lpi_items[lpi_num].id)); 325 lpi_num++; 326 if (lpi_num >= lpi_max) 327 break; 328 } 329 330 return lpi_num; 331 } 332 333 size_t DictTrie::fill_lpi_buffer(LmaPsbItem lpi_items[], size_t lpi_max, 334 size_t homo_buf_off, LmaNodeGE1 *node, 335 uint16 lma_len) { 336 size_t lpi_num = 0; 337 NGram& ngram = NGram::get_instance(); 338 for (size_t homo = 0; homo < (size_t)node->num_of_homo; homo++) { 339 lpi_items[lpi_num].id = get_lemma_id(homo_buf_off + homo); 340 lpi_items[lpi_num].lma_len = lma_len; 341 lpi_items[lpi_num].psb = 342 static_cast<LmaScoreType>(ngram.get_uni_psb(lpi_items[lpi_num].id)); 343 lpi_num++; 344 if (lpi_num >= lpi_max) 345 break; 346 } 347 348 return lpi_num; 349 } 350 351 void DictTrie::reset_milestones(uint16 from_step, MileStoneHandle from_handle) { 352 if (0 == from_step) { 353 parsing_marks_pos_ = 0; 354 mile_stones_pos_ = kFirstValidMileStoneHandle; 355 } else { 356 if (from_handle > 0 && from_handle < mile_stones_pos_) { 357 mile_stones_pos_ = from_handle; 358 359 MileStone *mile_stone = mile_stones_ + from_handle; 360 parsing_marks_pos_ = mile_stone->mark_start; 361 } 362 } 363 } 364 365 MileStoneHandle DictTrie::extend_dict(MileStoneHandle from_handle, 366 const DictExtPara *dep, 367 LmaPsbItem *lpi_items, size_t lpi_max, 368 size_t *lpi_num) { 369 if (NULL == dep) 370 return 0; 371 372 // from LmaNodeLE0 (root) to LmaNodeLE0 373 if (0 == from_handle) { 374 assert(0 == dep->splids_extended); 375 return extend_dict0(from_handle, dep, lpi_items, lpi_max, lpi_num); 376 } 377 378 // from LmaNodeLE0 to LmaNodeGE1 379 if (1 == dep->splids_extended) 380 return extend_dict1(from_handle, dep, lpi_items, lpi_max, lpi_num); 381 382 // From LmaNodeGE1 to LmaNodeGE1 383 return extend_dict2(from_handle, dep, lpi_items, lpi_max, lpi_num); 384 } 385 386 MileStoneHandle DictTrie::extend_dict0(MileStoneHandle from_handle, 387 const DictExtPara *dep, 388 LmaPsbItem *lpi_items, 389 size_t lpi_max, size_t *lpi_num) { 390 assert(NULL != dep && 0 == from_handle); 391 *lpi_num = 0; 392 MileStoneHandle ret_handle = 0; 393 394 uint16 splid = dep->splids[dep->splids_extended]; 395 uint16 id_start = dep->id_start; 396 uint16 id_num = dep->id_num; 397 398 LpiCache& lpi_cache = LpiCache::get_instance(); 399 bool cached = lpi_cache.is_cached(splid); 400 401 // 2. Begin exgtending 402 // 2.1 Get the LmaPsbItem list 403 LmaNodeLE0 *node = root_; 404 size_t son_start = splid_le0_index_[id_start - kFullSplIdStart]; 405 size_t son_end = splid_le0_index_[id_start + id_num - kFullSplIdStart]; 406 for (size_t son_pos = son_start; son_pos < son_end; son_pos++) { 407 assert(1 == node->son_1st_off); 408 LmaNodeLE0 *son = root_ + son_pos; 409 assert(son->spl_idx >= id_start && son->spl_idx < id_start + id_num); 410 411 if (!cached && *lpi_num < lpi_max) { 412 bool need_lpi = true; 413 if (spl_trie_->is_half_id_yunmu(splid) && son_pos != son_start) 414 need_lpi = false; 415 416 if (need_lpi) 417 *lpi_num += fill_lpi_buffer(lpi_items + (*lpi_num), 418 lpi_max - *lpi_num, son); 419 } 420 421 // If necessary, fill in a new mile stone. 422 if (son->spl_idx == id_start) { 423 if (mile_stones_pos_ < kMaxMileStone && 424 parsing_marks_pos_ < kMaxParsingMark) { 425 parsing_marks_[parsing_marks_pos_].node_offset = son_pos; 426 parsing_marks_[parsing_marks_pos_].node_num = id_num; 427 mile_stones_[mile_stones_pos_].mark_start = parsing_marks_pos_; 428 mile_stones_[mile_stones_pos_].mark_num = 1; 429 ret_handle = mile_stones_pos_; 430 parsing_marks_pos_++; 431 mile_stones_pos_++; 432 } 433 } 434 435 if (son->spl_idx >= id_start + id_num -1) 436 break; 437 } 438 439 // printf("----- parsing marks: %d, mile stone: %d \n", parsing_marks_pos_, 440 // mile_stones_pos_); 441 return ret_handle; 442 } 443 444 MileStoneHandle DictTrie::extend_dict1(MileStoneHandle from_handle, 445 const DictExtPara *dep, 446 LmaPsbItem *lpi_items, 447 size_t lpi_max, size_t *lpi_num) { 448 assert(NULL != dep && from_handle > 0 && from_handle < mile_stones_pos_); 449 450 MileStoneHandle ret_handle = 0; 451 452 // 1. If this is a half Id, get its corresponding full starting Id and 453 // number of full Id. 454 size_t ret_val = 0; 455 456 uint16 id_start = dep->id_start; 457 uint16 id_num = dep->id_num; 458 459 // 2. Begin extending. 460 MileStone *mile_stone = mile_stones_ + from_handle; 461 462 for (uint16 h_pos = 0; h_pos < mile_stone->mark_num; h_pos++) { 463 ParsingMark p_mark = parsing_marks_[mile_stone->mark_start + h_pos]; 464 uint16 ext_num = p_mark.node_num; 465 for (uint16 ext_pos = 0; ext_pos < ext_num; ext_pos++) { 466 LmaNodeLE0 *node = root_ + p_mark.node_offset + ext_pos; 467 size_t found_start = 0; 468 size_t found_num = 0; 469 for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son; son_pos++) { 470 assert(node->son_1st_off <= lma_node_num_ge1_); 471 LmaNodeGE1 *son = nodes_ge1_ + node->son_1st_off + son_pos; 472 if (son->spl_idx >= id_start 473 && son->spl_idx < id_start + id_num) { 474 if (*lpi_num < lpi_max) { 475 size_t homo_buf_off = get_homo_idx_buf_offset(son); 476 *lpi_num += fill_lpi_buffer(lpi_items + (*lpi_num), 477 lpi_max - *lpi_num, homo_buf_off, son, 478 2); 479 } 480 481 // If necessary, fill in the new DTMI 482 if (0 == found_num) { 483 found_start = son_pos; 484 } 485 found_num++; 486 } 487 if (son->spl_idx >= id_start + id_num - 1 || son_pos == 488 (size_t)node->num_of_son - 1) { 489 if (found_num > 0) { 490 if (mile_stones_pos_ < kMaxMileStone && 491 parsing_marks_pos_ < kMaxParsingMark) { 492 parsing_marks_[parsing_marks_pos_].node_offset = 493 node->son_1st_off + found_start; 494 parsing_marks_[parsing_marks_pos_].node_num = found_num; 495 if (0 == ret_val) 496 mile_stones_[mile_stones_pos_].mark_start = 497 parsing_marks_pos_; 498 parsing_marks_pos_++; 499 } 500 501 ret_val++; 502 } 503 break; 504 } // for son_pos 505 } // for ext_pos 506 } // for h_pos 507 } 508 509 if (ret_val > 0) { 510 mile_stones_[mile_stones_pos_].mark_num = ret_val; 511 ret_handle = mile_stones_pos_; 512 mile_stones_pos_++; 513 ret_val = 1; 514 } 515 516 // printf("----- parsing marks: %d, mile stone: %d \n", parsing_marks_pos_, 517 // mile_stones_pos_); 518 return ret_handle; 519 } 520 521 MileStoneHandle DictTrie::extend_dict2(MileStoneHandle from_handle, 522 const DictExtPara *dep, 523 LmaPsbItem *lpi_items, 524 size_t lpi_max, size_t *lpi_num) { 525 assert(NULL != dep && from_handle > 0 && from_handle < mile_stones_pos_); 526 527 MileStoneHandle ret_handle = 0; 528 529 // 1. If this is a half Id, get its corresponding full starting Id and 530 // number of full Id. 531 size_t ret_val = 0; 532 533 uint16 id_start = dep->id_start; 534 uint16 id_num = dep->id_num; 535 536 // 2. Begin extending. 537 MileStone *mile_stone = mile_stones_ + from_handle; 538 539 for (uint16 h_pos = 0; h_pos < mile_stone->mark_num; h_pos++) { 540 ParsingMark p_mark = parsing_marks_[mile_stone->mark_start + h_pos]; 541 uint16 ext_num = p_mark.node_num; 542 for (uint16 ext_pos = 0; ext_pos < ext_num; ext_pos++) { 543 LmaNodeGE1 *node = nodes_ge1_ + p_mark.node_offset + ext_pos; 544 size_t found_start = 0; 545 size_t found_num = 0; 546 547 for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son; son_pos++) { 548 assert(node->son_1st_off_l > 0 || node->son_1st_off_h > 0); 549 LmaNodeGE1 *son = nodes_ge1_ + get_son_offset(node) + son_pos; 550 if (son->spl_idx >= id_start 551 && son->spl_idx < id_start + id_num) { 552 if (*lpi_num < lpi_max) { 553 size_t homo_buf_off = get_homo_idx_buf_offset(son); 554 *lpi_num += fill_lpi_buffer(lpi_items + (*lpi_num), 555 lpi_max - *lpi_num, homo_buf_off, son, 556 dep->splids_extended + 1); 557 } 558 559 // If necessary, fill in the new DTMI 560 if (0 == found_num) { 561 found_start = son_pos; 562 } 563 found_num++; 564 } 565 if (son->spl_idx >= id_start + id_num - 1 || son_pos == 566 (size_t)node->num_of_son - 1) { 567 if (found_num > 0) { 568 if (mile_stones_pos_ < kMaxMileStone && 569 parsing_marks_pos_ < kMaxParsingMark) { 570 parsing_marks_[parsing_marks_pos_].node_offset = 571 get_son_offset(node) + found_start; 572 parsing_marks_[parsing_marks_pos_].node_num = found_num; 573 if (0 == ret_val) 574 mile_stones_[mile_stones_pos_].mark_start = 575 parsing_marks_pos_; 576 parsing_marks_pos_++; 577 } 578 579 ret_val++; 580 } 581 break; 582 } 583 } // for son_pos 584 } // for ext_pos 585 } // for h_pos 586 587 if (ret_val > 0) { 588 mile_stones_[mile_stones_pos_].mark_num = ret_val; 589 ret_handle = mile_stones_pos_; 590 mile_stones_pos_++; 591 } 592 593 // printf("----- parsing marks: %d, mile stone: %d \n", parsing_marks_pos_, 594 // mile_stones_pos_); 595 return ret_handle; 596 } 597 598 bool DictTrie::try_extend(const uint16 *splids, uint16 splid_num, 599 LemmaIdType id_lemma) { 600 if (0 == splid_num || NULL == splids) 601 return false; 602 603 void *node = root_ + splid_le0_index_[splids[0] - kFullSplIdStart]; 604 605 for (uint16 pos = 1; pos < splid_num; pos++) { 606 if (1 == pos) { 607 LmaNodeLE0 *node_le0 = reinterpret_cast<LmaNodeLE0*>(node); 608 LmaNodeGE1 *node_son; 609 uint16 son_pos; 610 for (son_pos = 0; son_pos < static_cast<uint16>(node_le0->num_of_son); 611 son_pos++) { 612 assert(node_le0->son_1st_off <= lma_node_num_ge1_); 613 node_son = nodes_ge1_ + node_le0->son_1st_off 614 + son_pos; 615 if (node_son->spl_idx == splids[pos]) 616 break; 617 } 618 if (son_pos < node_le0->num_of_son) 619 node = reinterpret_cast<void*>(node_son); 620 else 621 return false; 622 } else { 623 LmaNodeGE1 *node_ge1 = reinterpret_cast<LmaNodeGE1*>(node); 624 LmaNodeGE1 *node_son; 625 uint16 son_pos; 626 for (son_pos = 0; son_pos < static_cast<uint16>(node_ge1->num_of_son); 627 son_pos++) { 628 assert(node_ge1->son_1st_off_l > 0 || node_ge1->son_1st_off_h > 0); 629 node_son = nodes_ge1_ + get_son_offset(node_ge1) + son_pos; 630 if (node_son->spl_idx == splids[pos]) 631 break; 632 } 633 if (son_pos < node_ge1->num_of_son) 634 node = reinterpret_cast<void*>(node_son); 635 else 636 return false; 637 } 638 } 639 640 if (1 == splid_num) { 641 LmaNodeLE0* node_le0 = reinterpret_cast<LmaNodeLE0*>(node); 642 size_t num_of_homo = (size_t)node_le0->num_of_homo; 643 for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) { 644 LemmaIdType id_this = get_lemma_id(node_le0->homo_idx_buf_off + homo_pos); 645 char16 str[2]; 646 get_lemma_str(id_this, str, 2); 647 if (id_this == id_lemma) 648 return true; 649 } 650 } else { 651 LmaNodeGE1* node_ge1 = reinterpret_cast<LmaNodeGE1*>(node); 652 size_t num_of_homo = (size_t)node_ge1->num_of_homo; 653 for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) { 654 size_t node_homo_off = get_homo_idx_buf_offset(node_ge1); 655 if (get_lemma_id(node_homo_off + homo_pos) == id_lemma) 656 return true; 657 } 658 } 659 660 return false; 661 } 662 663 size_t DictTrie::get_lpis(const uint16* splid_str, uint16 splid_str_len, 664 LmaPsbItem* lma_buf, size_t max_lma_buf) { 665 if (splid_str_len > kMaxLemmaSize) 666 return 0; 667 668 #define MAX_EXTENDBUF_LEN 200 669 670 size_t* node_buf1[MAX_EXTENDBUF_LEN]; // use size_t for data alignment 671 size_t* node_buf2[MAX_EXTENDBUF_LEN]; 672 LmaNodeLE0** node_fr_le0 = 673 reinterpret_cast<LmaNodeLE0**>(node_buf1); // Nodes from. 674 LmaNodeLE0** node_to_le0 = 675 reinterpret_cast<LmaNodeLE0**>(node_buf2); // Nodes to. 676 LmaNodeGE1** node_fr_ge1 = NULL; 677 LmaNodeGE1** node_to_ge1 = NULL; 678 size_t node_fr_num = 1; 679 size_t node_to_num = 0; 680 node_fr_le0[0] = root_; 681 if (NULL == node_fr_le0[0]) 682 return 0; 683 684 size_t spl_pos = 0; 685 686 while (spl_pos < splid_str_len) { 687 uint16 id_num = 1; 688 uint16 id_start = splid_str[spl_pos]; 689 // If it is a half id 690 if (spl_trie_->is_half_id(splid_str[spl_pos])) { 691 id_num = spl_trie_->half_to_full(splid_str[spl_pos], &id_start); 692 assert(id_num > 0); 693 } 694 695 // Extend the nodes 696 if (0 == spl_pos) { // From LmaNodeLE0 (root) to LmaNodeLE0 nodes 697 for (size_t node_fr_pos = 0; node_fr_pos < node_fr_num; node_fr_pos++) { 698 LmaNodeLE0 *node = node_fr_le0[node_fr_pos]; 699 assert(node == root_ && 1 == node_fr_num); 700 size_t son_start = splid_le0_index_[id_start - kFullSplIdStart]; 701 size_t son_end = 702 splid_le0_index_[id_start + id_num - kFullSplIdStart]; 703 for (size_t son_pos = son_start; son_pos < son_end; son_pos++) { 704 assert(1 == node->son_1st_off); 705 LmaNodeLE0 *node_son = root_ + son_pos; 706 assert(node_son->spl_idx >= id_start 707 && node_son->spl_idx < id_start + id_num); 708 if (node_to_num < MAX_EXTENDBUF_LEN) { 709 node_to_le0[node_to_num] = node_son; 710 node_to_num++; 711 } 712 // id_start + id_num - 1 is the last one, which has just been 713 // recorded. 714 if (node_son->spl_idx >= id_start + id_num - 1) 715 break; 716 } 717 } 718 719 spl_pos++; 720 if (spl_pos >= splid_str_len || node_to_num == 0) 721 break; 722 // Prepare the nodes for next extending 723 // next time, from LmaNodeLE0 to LmaNodeGE1 724 LmaNodeLE0** node_tmp = node_fr_le0; 725 node_fr_le0 = node_to_le0; 726 node_to_le0 = NULL; 727 node_to_ge1 = reinterpret_cast<LmaNodeGE1**>(node_tmp); 728 } else if (1 == spl_pos) { // From LmaNodeLE0 to LmaNodeGE1 nodes 729 for (size_t node_fr_pos = 0; node_fr_pos < node_fr_num; node_fr_pos++) { 730 LmaNodeLE0 *node = node_fr_le0[node_fr_pos]; 731 for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son; 732 son_pos++) { 733 assert(node->son_1st_off <= lma_node_num_ge1_); 734 LmaNodeGE1 *node_son = nodes_ge1_ + node->son_1st_off 735 + son_pos; 736 if (node_son->spl_idx >= id_start 737 && node_son->spl_idx < id_start + id_num) { 738 if (node_to_num < MAX_EXTENDBUF_LEN) { 739 node_to_ge1[node_to_num] = node_son; 740 node_to_num++; 741 } 742 } 743 // id_start + id_num - 1 is the last one, which has just been 744 // recorded. 745 if (node_son->spl_idx >= id_start + id_num - 1) 746 break; 747 } 748 } 749 750 spl_pos++; 751 if (spl_pos >= splid_str_len || node_to_num == 0) 752 break; 753 // Prepare the nodes for next extending 754 // next time, from LmaNodeGE1 to LmaNodeGE1 755 node_fr_ge1 = node_to_ge1; 756 node_to_ge1 = reinterpret_cast<LmaNodeGE1**>(node_fr_le0); 757 node_fr_le0 = NULL; 758 node_to_le0 = NULL; 759 } else { // From LmaNodeGE1 to LmaNodeGE1 nodes 760 for (size_t node_fr_pos = 0; node_fr_pos < node_fr_num; node_fr_pos++) { 761 LmaNodeGE1 *node = node_fr_ge1[node_fr_pos]; 762 for (size_t son_pos = 0; son_pos < (size_t)node->num_of_son; 763 son_pos++) { 764 assert(node->son_1st_off_l > 0 || node->son_1st_off_h > 0); 765 LmaNodeGE1 *node_son = nodes_ge1_ 766 + get_son_offset(node) + son_pos; 767 if (node_son->spl_idx >= id_start 768 && node_son->spl_idx < id_start + id_num) { 769 if (node_to_num < MAX_EXTENDBUF_LEN) { 770 node_to_ge1[node_to_num] = node_son; 771 node_to_num++; 772 } 773 } 774 // id_start + id_num - 1 is the last one, which has just been 775 // recorded. 776 if (node_son->spl_idx >= id_start + id_num - 1) 777 break; 778 } 779 } 780 781 spl_pos++; 782 if (spl_pos >= splid_str_len || node_to_num == 0) 783 break; 784 // Prepare the nodes for next extending 785 // next time, from LmaNodeGE1 to LmaNodeGE1 786 LmaNodeGE1 **node_tmp = node_fr_ge1; 787 node_fr_ge1 = node_to_ge1; 788 node_to_ge1 = node_tmp; 789 } 790 791 // The number of node for next extending 792 node_fr_num = node_to_num; 793 node_to_num = 0; 794 } // while 795 796 if (0 == node_to_num) 797 return 0; 798 799 NGram &ngram = NGram::get_instance(); 800 size_t lma_num = 0; 801 802 // If the length is 1, and the splid is a one-char Yunmu like 'a', 'o', 'e', 803 // only those candidates for the full matched one-char id will be returned. 804 if (1 == splid_str_len && spl_trie_->is_half_id_yunmu(splid_str[0])) 805 node_to_num = node_to_num > 0 ? 1 : 0; 806 807 for (size_t node_pos = 0; node_pos < node_to_num; node_pos++) { 808 size_t num_of_homo = 0; 809 if (spl_pos <= 1) { // Get from LmaNodeLE0 nodes 810 LmaNodeLE0* node_le0 = node_to_le0[node_pos]; 811 num_of_homo = (size_t)node_le0->num_of_homo; 812 for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) { 813 size_t ch_pos = lma_num + homo_pos; 814 lma_buf[ch_pos].id = 815 get_lemma_id(node_le0->homo_idx_buf_off + homo_pos); 816 lma_buf[ch_pos].lma_len = 1; 817 lma_buf[ch_pos].psb = 818 static_cast<LmaScoreType>(ngram.get_uni_psb(lma_buf[ch_pos].id)); 819 820 if (lma_num + homo_pos >= max_lma_buf - 1) 821 break; 822 } 823 } else { // Get from LmaNodeGE1 nodes 824 LmaNodeGE1* node_ge1 = node_to_ge1[node_pos]; 825 num_of_homo = (size_t)node_ge1->num_of_homo; 826 for (size_t homo_pos = 0; homo_pos < num_of_homo; homo_pos++) { 827 size_t ch_pos = lma_num + homo_pos; 828 size_t node_homo_off = get_homo_idx_buf_offset(node_ge1); 829 lma_buf[ch_pos].id = get_lemma_id(node_homo_off + homo_pos); 830 lma_buf[ch_pos].lma_len = splid_str_len; 831 lma_buf[ch_pos].psb = 832 static_cast<LmaScoreType>(ngram.get_uni_psb(lma_buf[ch_pos].id)); 833 834 if (lma_num + homo_pos >= max_lma_buf - 1) 835 break; 836 } 837 } 838 839 lma_num += num_of_homo; 840 if (lma_num >= max_lma_buf) { 841 lma_num = max_lma_buf; 842 break; 843 } 844 } 845 return lma_num; 846 } 847 848 uint16 DictTrie::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, 849 uint16 str_max) { 850 return dict_list_->get_lemma_str(id_lemma, str_buf, str_max); 851 } 852 853 uint16 DictTrie::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, 854 uint16 splids_max, bool arg_valid) { 855 char16 lma_str[kMaxLemmaSize + 1]; 856 uint16 lma_len = get_lemma_str(id_lemma, lma_str, kMaxLemmaSize + 1); 857 assert((!arg_valid && splids_max >= lma_len) || lma_len == splids_max); 858 859 uint16 spl_mtrx[kMaxLemmaSize * 5]; 860 uint16 spl_start[kMaxLemmaSize + 1]; 861 spl_start[0] = 0; 862 uint16 try_num = 1; 863 864 for (uint16 pos = 0; pos < lma_len; pos++) { 865 uint16 cand_splids_this = 0; 866 if (arg_valid && spl_trie_->is_full_id(splids[pos])) { 867 spl_mtrx[spl_start[pos]] = splids[pos]; 868 cand_splids_this = 1; 869 } else { 870 cand_splids_this = dict_list_->get_splids_for_hanzi(lma_str[pos], 871 arg_valid ? splids[pos] : 0, spl_mtrx + spl_start[pos], 872 kMaxLemmaSize * 5 - spl_start[pos]); 873 assert(cand_splids_this > 0); 874 } 875 spl_start[pos + 1] = spl_start[pos] + cand_splids_this; 876 try_num *= cand_splids_this; 877 } 878 879 for (uint16 try_pos = 0; try_pos < try_num; try_pos++) { 880 uint16 mod = 1; 881 for (uint16 pos = 0; pos < lma_len; pos++) { 882 uint16 radix = spl_start[pos + 1] - spl_start[pos]; 883 splids[pos] = spl_mtrx[ spl_start[pos] + try_pos / mod % radix]; 884 mod *= radix; 885 } 886 887 if (try_extend(splids, lma_len, id_lemma)) 888 return lma_len; 889 } 890 891 return 0; 892 } 893 894 void DictTrie::set_total_lemma_count_of_others(size_t count) { 895 NGram& ngram = NGram::get_instance(); 896 ngram.set_total_freq_none_sys(count); 897 } 898 899 void DictTrie::convert_to_hanzis(char16 *str, uint16 str_len) { 900 return dict_list_->convert_to_hanzis(str, str_len); 901 } 902 903 void DictTrie::convert_to_scis_ids(char16 *str, uint16 str_len) { 904 return dict_list_->convert_to_scis_ids(str, str_len); 905 } 906 907 LemmaIdType DictTrie::get_lemma_id(const char16 lemma_str[], uint16 lemma_len) { 908 if (NULL == lemma_str || lemma_len > kMaxLemmaSize) 909 return 0; 910 911 return dict_list_->get_lemma_id(lemma_str, lemma_len); 912 } 913 914 size_t DictTrie::predict_top_lmas(size_t his_len, NPredictItem *npre_items, 915 size_t npre_max, size_t b4_used) { 916 NGram &ngram = NGram::get_instance(); 917 918 size_t item_num = 0; 919 size_t top_lmas_id_offset = lma_idx_buf_len_ / kLemmaIdSize - top_lmas_num_; 920 size_t top_lmas_pos = 0; 921 while (item_num < npre_max && top_lmas_pos < top_lmas_num_) { 922 memset(npre_items + item_num, 0, sizeof(NPredictItem)); 923 LemmaIdType top_lma_id = get_lemma_id(top_lmas_id_offset + top_lmas_pos); 924 top_lmas_pos += 1; 925 if (dict_list_->get_lemma_str(top_lma_id, 926 npre_items[item_num].pre_hzs, 927 kMaxLemmaSize - 1) == 0) { 928 continue; 929 } 930 npre_items[item_num].psb = ngram.get_uni_psb(top_lma_id); 931 npre_items[item_num].his_len = his_len; 932 item_num++; 933 } 934 return item_num; 935 } 936 937 size_t DictTrie::predict(const char16 *last_hzs, uint16 hzs_len, 938 NPredictItem *npre_items, size_t npre_max, 939 size_t b4_used) { 940 return dict_list_->predict(last_hzs, hzs_len, npre_items, npre_max, b4_used); 941 } 942 } // namespace ime_pinyin 943