1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <assert.h> 18 #include <stdio.h> 19 #include <stdlib.h> 20 #include <string.h> 21 22 #include "../include/dictbuilder.h" 23 #include "../include/dicttrie.h" 24 #include "../include/mystdlib.h" 25 #include "../include/ngram.h" 26 #include "../include/searchutility.h" 27 #include "../include/spellingtable.h" 28 #include "../include/spellingtrie.h" 29 #include "../include/splparser.h" 30 #include "../include/utf16reader.h" 31 32 namespace ime_pinyin { 33 34 #ifdef ___BUILD_MODEL___ 35 36 static const size_t kReadBufLen = 512; 37 static const size_t kSplTableHashLen = 2000; 38 39 // Compare a SingleCharItem, first by Hanzis, then by spelling ids, then by 40 // frequencies. 41 int cmp_scis_hz_splid_freq(const void* p1, const void* p2) { 42 const SingleCharItem *s1, *s2; 43 s1 = static_cast<const SingleCharItem*>(p1); 44 s2 = static_cast<const SingleCharItem*>(p2); 45 46 if (s1->hz < s2->hz) 47 return -1; 48 if (s1->hz > s2->hz) 49 return 1; 50 51 if (s1->splid.half_splid < s2->splid.half_splid) 52 return -1; 53 if (s1->splid.half_splid > s2->splid.half_splid) 54 return 1; 55 56 if (s1->splid.full_splid < s2->splid.full_splid) 57 return -1; 58 if (s1->splid.full_splid > s2->splid.full_splid) 59 return 1; 60 61 if (s1->freq > s2->freq) 62 return -1; 63 if (s1->freq < s2->freq) 64 return 1; 65 return 0; 66 } 67 68 int cmp_scis_hz_splid(const void* p1, const void* p2) { 69 const SingleCharItem *s1, *s2; 70 s1 = static_cast<const SingleCharItem*>(p1); 71 s2 = static_cast<const SingleCharItem*>(p2); 72 73 if (s1->hz < s2->hz) 74 return -1; 75 if (s1->hz > s2->hz) 76 return 1; 77 78 if (s1->splid.half_splid < s2->splid.half_splid) 79 return -1; 80 if (s1->splid.half_splid > s2->splid.half_splid) 81 return 1; 82 83 if (s1->splid.full_splid < s2->splid.full_splid) 84 return -1; 85 if (s1->splid.full_splid > s2->splid.full_splid) 86 return 1; 87 88 return 0; 89 } 90 91 int cmp_lemma_entry_hzs(const void* p1, const void* p2) { 92 size_t size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str); 93 size_t size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str); 94 if (size1 < size2) 95 return -1; 96 else if (size1 > size2) 97 return 1; 98 99 return utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str, 100 ((const LemmaEntry*)p2)->hanzi_str); 101 } 102 103 int compare_char16(const void* p1, const void* p2) { 104 if (*((const char16*)p1) < *((const char16*)p2)) 105 return -1; 106 if (*((const char16*)p1) > *((const char16*)p2)) 107 return 1; 108 return 0; 109 } 110 111 int compare_py(const void* p1, const void* p2) { 112 int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr, 113 ((const LemmaEntry*)p2)->spl_idx_arr); 114 115 if (0 != ret) 116 return ret; 117 118 return static_cast<int>(((const LemmaEntry*)p2)->freq) - 119 static_cast<int>(((const LemmaEntry*)p1)->freq); 120 } 121 122 // First hanzi, if the same, then Pinyin 123 int cmp_lemma_entry_hzspys(const void* p1, const void* p2) { 124 size_t size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str); 125 size_t size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str); 126 if (size1 < size2) 127 return -1; 128 else if (size1 > size2) 129 return 1; 130 int ret = utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str, 131 ((const LemmaEntry*)p2)->hanzi_str); 132 133 if (0 != ret) 134 return ret; 135 136 ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr, 137 ((const LemmaEntry*)p2)->spl_idx_arr); 138 return ret; 139 } 140 141 int compare_splid2(const void* p1, const void* p2) { 142 int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr, 143 ((const LemmaEntry*)p2)->spl_idx_arr); 144 return ret; 145 } 146 147 DictBuilder::DictBuilder() { 148 lemma_arr_ = NULL; 149 lemma_num_ = 0; 150 151 scis_ = NULL; 152 scis_num_ = 0; 153 154 lma_nodes_le0_ = NULL; 155 lma_nodes_ge1_ = NULL; 156 157 lma_nds_used_num_le0_ = 0; 158 lma_nds_used_num_ge1_ = 0; 159 160 homo_idx_buf_ = NULL; 161 homo_idx_num_eq1_ = 0; 162 homo_idx_num_gt1_ = 0; 163 164 top_lmas_ = NULL; 165 top_lmas_num_ = 0; 166 167 spl_table_ = NULL; 168 spl_parser_ = NULL; 169 } 170 171 DictBuilder::~DictBuilder() { 172 free_resource(); 173 } 174 175 bool DictBuilder::alloc_resource(size_t lma_num) { 176 if (0 == lma_num) 177 return false; 178 179 free_resource(); 180 181 lemma_num_ = lma_num; 182 lemma_arr_ = new LemmaEntry[lemma_num_]; 183 184 top_lmas_num_ = 0; 185 top_lmas_ = new LemmaEntry[kTopScoreLemmaNum]; 186 187 // New the scis_ buffer to the possible maximum size. 188 scis_num_ = lemma_num_ * kMaxLemmaSize; 189 scis_ = new SingleCharItem[scis_num_]; 190 191 // The root and first level nodes is less than kMaxSpellingNum + 1 192 lma_nds_used_num_le0_ = 0; 193 lma_nodes_le0_ = new LmaNodeLE0[kMaxSpellingNum + 1]; 194 195 // Other nodes is less than lemma_num 196 lma_nds_used_num_ge1_ = 0; 197 lma_nodes_ge1_ = new LmaNodeGE1[lemma_num_]; 198 199 homo_idx_buf_ = new LemmaIdType[lemma_num_]; 200 spl_table_ = new SpellingTable(); 201 spl_parser_ = new SpellingParser(); 202 203 if (NULL == lemma_arr_ || NULL == top_lmas_ || 204 NULL == scis_ || NULL == spl_table_ || 205 NULL == spl_parser_ || NULL == lma_nodes_le0_ || 206 NULL == lma_nodes_ge1_ || NULL == homo_idx_buf_) { 207 free_resource(); 208 return false; 209 } 210 211 memset(lemma_arr_, 0, sizeof(LemmaEntry) * lemma_num_); 212 memset(scis_, 0, sizeof(SingleCharItem) * scis_num_); 213 memset(lma_nodes_le0_, 0, sizeof(LmaNodeLE0) * (kMaxSpellingNum + 1)); 214 memset(lma_nodes_ge1_, 0, sizeof(LmaNodeGE1) * lemma_num_); 215 memset(homo_idx_buf_, 0, sizeof(LemmaIdType) * lemma_num_); 216 spl_table_->init_table(kMaxPinyinSize, kSplTableHashLen, true); 217 218 return true; 219 } 220 221 char16* DictBuilder::read_valid_hanzis(const char *fn_validhzs, size_t *num) { 222 if (NULL == fn_validhzs || NULL == num) 223 return NULL; 224 225 *num = 0; 226 FILE *fp = fopen(fn_validhzs, "rb"); 227 if (NULL == fp) 228 return NULL; 229 230 char16 utf16header; 231 if (fread(&utf16header, sizeof(char16), 1, fp) != 1 || 232 0xfeff != utf16header) { 233 fclose(fp); 234 return NULL; 235 } 236 237 fseek(fp, 0, SEEK_END); 238 *num = ftell(fp) / sizeof(char16); 239 assert(*num >= 1); 240 *num -= 1; 241 242 char16 *hzs = new char16[*num]; 243 if (NULL == hzs) { 244 fclose(fp); 245 return NULL; 246 } 247 248 fseek(fp, 2, SEEK_SET); 249 250 if (fread(hzs, sizeof(char16), *num, fp) != *num) { 251 fclose(fp); 252 delete [] hzs; 253 return NULL; 254 } 255 fclose(fp); 256 257 myqsort(hzs, *num, sizeof(char16), compare_char16); 258 return hzs; 259 } 260 261 bool DictBuilder::hz_in_hanzis_list(const char16 *hzs, size_t hzs_len, 262 char16 hz) { 263 if (NULL == hzs) 264 return false; 265 266 char16 *found; 267 found = static_cast<char16*>( 268 mybsearch(&hz, hzs, hzs_len, sizeof(char16), compare_char16)); 269 if (NULL == found) 270 return false; 271 272 assert(*found == hz); 273 return true; 274 } 275 276 // The caller makes sure that the parameters are valid. 277 bool DictBuilder::str_in_hanzis_list(const char16 *hzs, size_t hzs_len, 278 const char16 *str, size_t str_len) { 279 if (NULL == hzs || NULL == str) 280 return false; 281 282 for (size_t pos = 0; pos < str_len; pos++) { 283 if (!hz_in_hanzis_list(hzs, hzs_len, str[pos])) 284 return false; 285 } 286 return true; 287 } 288 289 void DictBuilder::get_top_lemmas() { 290 top_lmas_num_ = 0; 291 if (NULL == lemma_arr_) 292 return; 293 294 for (size_t pos = 0; pos < lemma_num_; pos++) { 295 if (0 == top_lmas_num_) { 296 top_lmas_[0] = lemma_arr_[pos]; 297 top_lmas_num_ = 1; 298 continue; 299 } 300 301 if (lemma_arr_[pos].freq > top_lmas_[top_lmas_num_ - 1].freq) { 302 if (kTopScoreLemmaNum > top_lmas_num_) 303 top_lmas_num_ += 1; 304 305 size_t move_pos; 306 for (move_pos = top_lmas_num_ - 1; move_pos > 0; move_pos--) { 307 top_lmas_[move_pos] = top_lmas_[move_pos - 1]; 308 if (0 == move_pos - 1 || 309 (move_pos - 1 > 0 && 310 top_lmas_[move_pos - 2].freq > lemma_arr_[pos].freq)) { 311 break; 312 } 313 } 314 assert(move_pos > 0); 315 top_lmas_[move_pos - 1] = lemma_arr_[pos]; 316 } else if (kTopScoreLemmaNum > top_lmas_num_) { 317 top_lmas_[top_lmas_num_] = lemma_arr_[pos]; 318 top_lmas_num_ += 1; 319 } 320 } 321 322 if (kPrintDebug0) { 323 printf("\n------Top Lemmas------------------\n"); 324 for (size_t pos = 0; pos < top_lmas_num_; pos++) { 325 printf("--%d, idx:%06d, score:%.5f\n", pos, top_lmas_[pos].idx_by_hz, 326 top_lmas_[pos].freq); 327 } 328 } 329 } 330 331 void DictBuilder::free_resource() { 332 if (NULL != lemma_arr_) 333 delete [] lemma_arr_; 334 335 if (NULL != scis_) 336 delete [] scis_; 337 338 if (NULL != lma_nodes_le0_) 339 delete [] lma_nodes_le0_; 340 341 if (NULL != lma_nodes_ge1_) 342 delete [] lma_nodes_ge1_; 343 344 if (NULL != homo_idx_buf_) 345 delete [] homo_idx_buf_; 346 347 if (NULL != spl_table_) 348 delete spl_table_; 349 350 if (NULL != spl_parser_) 351 delete spl_parser_; 352 353 lemma_arr_ = NULL; 354 scis_ = NULL; 355 lma_nodes_le0_ = NULL; 356 lma_nodes_ge1_ = NULL; 357 homo_idx_buf_ = NULL; 358 spl_table_ = NULL; 359 spl_parser_ = NULL; 360 361 lemma_num_ = 0; 362 lma_nds_used_num_le0_ = 0; 363 lma_nds_used_num_ge1_ = 0; 364 homo_idx_num_eq1_ = 0; 365 homo_idx_num_gt1_ = 0; 366 } 367 368 size_t DictBuilder::read_raw_dict(const char* fn_raw, 369 const char *fn_validhzs, 370 size_t max_item) { 371 if (NULL == fn_raw) return 0; 372 373 Utf16Reader utf16_reader; 374 if (!utf16_reader.open(fn_raw, kReadBufLen * 10)) 375 return false; 376 377 char16 read_buf[kReadBufLen]; 378 379 // Read the number of lemmas in the file 380 size_t lemma_num = 240000; 381 382 // allocate resource required 383 if (!alloc_resource(lemma_num)) { 384 utf16_reader.close(); 385 } 386 387 // Read the valid Hanzi list. 388 char16 *valid_hzs = NULL; 389 size_t valid_hzs_num = 0; 390 valid_hzs = read_valid_hanzis(fn_validhzs, &valid_hzs_num); 391 392 // Begin reading the lemma entries 393 for (size_t i = 0; i < max_item; i++) { 394 // read next entry 395 if (!utf16_reader.readline(read_buf, kReadBufLen)) { 396 lemma_num = i; 397 break; 398 } 399 400 size_t token_size; 401 char16 *token; 402 char16 *to_tokenize = read_buf; 403 404 // Get the Hanzi string 405 token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); 406 if (NULL == token) { 407 free_resource(); 408 utf16_reader.close(); 409 return false; 410 } 411 412 size_t lemma_size = utf16_strlen(token); 413 414 if (lemma_size > kMaxLemmaSize) { 415 i--; 416 continue; 417 } 418 419 if (lemma_size > 4) { 420 i--; 421 continue; 422 } 423 424 // Copy to the lemma entry 425 utf16_strcpy(lemma_arr_[i].hanzi_str, token); 426 427 lemma_arr_[i].hz_str_len = token_size; 428 429 // Get the freq string 430 token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); 431 if (NULL == token) { 432 free_resource(); 433 utf16_reader.close(); 434 return false; 435 } 436 lemma_arr_[i].freq = utf16_atof(token); 437 438 if (lemma_size > 1 && lemma_arr_[i].freq < 60) { 439 i--; 440 continue; 441 } 442 443 // Get GBK mark, if no valid Hanzi list available, all items which contains 444 // GBK characters will be discarded. Otherwise, all items which contains 445 // characters outside of the valid Hanzi list will be discarded. 446 token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); 447 assert(NULL != token); 448 int gbk_flag = utf16_atoi(token); 449 if (NULL == valid_hzs || 0 == valid_hzs_num) { 450 if (0 != gbk_flag) { 451 i--; 452 continue; 453 } 454 } else { 455 if (!str_in_hanzis_list(valid_hzs, valid_hzs_num, 456 lemma_arr_[i].hanzi_str, lemma_arr_[i].hz_str_len)) { 457 i--; 458 continue; 459 } 460 } 461 462 // Get spelling String 463 bool spelling_not_support = false; 464 for (size_t hz_pos = 0; hz_pos < (size_t)lemma_arr_[i].hz_str_len; 465 hz_pos++) { 466 // Get a Pinyin 467 token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); 468 if (NULL == token) { 469 free_resource(); 470 utf16_reader.close(); 471 return false; 472 } 473 474 assert(utf16_strlen(token) <= kMaxPinyinSize); 475 476 utf16_strcpy_tochar(lemma_arr_[i].pinyin_str[hz_pos], token); 477 478 format_spelling_str(lemma_arr_[i].pinyin_str[hz_pos]); 479 480 // Put the pinyin to the spelling table 481 if (!spl_table_->put_spelling(lemma_arr_[i].pinyin_str[hz_pos], 482 lemma_arr_[i].freq)) { 483 spelling_not_support = true; 484 break; 485 } 486 } 487 488 // The whole line must have been parsed fully, otherwise discard this one. 489 token = utf16_strtok(to_tokenize, &token_size, &to_tokenize); 490 if (spelling_not_support || NULL != token) { 491 i--; 492 continue; 493 } 494 } 495 496 delete [] valid_hzs; 497 utf16_reader.close(); 498 499 printf("read succesfully, lemma num: %d\n", lemma_num); 500 501 return lemma_num; 502 } 503 504 bool DictBuilder::build_dict(const char *fn_raw, 505 const char *fn_validhzs, 506 DictTrie *dict_trie) { 507 if (NULL == fn_raw || NULL == dict_trie) 508 return false; 509 510 lemma_num_ = read_raw_dict(fn_raw, fn_validhzs, 240000); 511 if (0 == lemma_num_) 512 return false; 513 514 // Arrange the spelling table, and build a spelling tree 515 // The size of an spelling. '\0' is included. If the spelling table is 516 // initialized to calculate the spelling scores, the last char in the 517 // spelling string will be score, and it is also included in spl_item_size. 518 size_t spl_item_size; 519 size_t spl_num; 520 const char* spl_buf; 521 spl_buf = spl_table_->arrange(&spl_item_size, &spl_num); 522 if (NULL == spl_buf) { 523 free_resource(); 524 return false; 525 } 526 527 SpellingTrie &spl_trie = SpellingTrie::get_instance(); 528 529 if (!spl_trie.construct(spl_buf, spl_item_size, spl_num, 530 spl_table_->get_score_amplifier(), 531 spl_table_->get_average_score())) { 532 free_resource(); 533 return false; 534 } 535 536 printf("spelling tree construct successfully.\n"); 537 538 // Convert the spelling string to idxs 539 for (size_t i = 0; i < lemma_num_; i++) { 540 for (size_t hz_pos = 0; hz_pos < (size_t)lemma_arr_[i].hz_str_len; 541 hz_pos++) { 542 uint16 spl_idxs[2]; 543 uint16 spl_start_pos[3]; 544 bool is_pre = true; 545 int spl_idx_num = 546 spl_parser_->splstr_to_idxs(lemma_arr_[i].pinyin_str[hz_pos], 547 strlen(lemma_arr_[i].pinyin_str[hz_pos]), 548 spl_idxs, spl_start_pos, 2, is_pre); 549 assert(1 == spl_idx_num); 550 551 if (spl_trie.is_half_id(spl_idxs[0])) { 552 uint16 num = spl_trie.half_to_full(spl_idxs[0], spl_idxs); 553 assert(0 != num); 554 } 555 lemma_arr_[i].spl_idx_arr[hz_pos] = spl_idxs[0]; 556 } 557 } 558 559 // Sort the lemma items according to the hanzi, and give each unique item a 560 // id 561 sort_lemmas_by_hz(); 562 563 scis_num_ = build_scis(); 564 565 // Construct the dict list 566 dict_trie->dict_list_ = new DictList(); 567 bool dl_success = dict_trie->dict_list_->init_list(scis_, scis_num_, 568 lemma_arr_, lemma_num_); 569 assert(dl_success); 570 571 // Construct the NGram information 572 NGram& ngram = NGram::get_instance(); 573 ngram.build_unigram(lemma_arr_, lemma_num_, 574 lemma_arr_[lemma_num_ - 1].idx_by_hz + 1); 575 576 // sort the lemma items according to the spelling idx string 577 myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), compare_py); 578 579 get_top_lemmas(); 580 581 #ifdef ___DO_STATISTICS___ 582 stat_init(); 583 #endif 584 585 lma_nds_used_num_le0_ = 1; // The root node 586 bool dt_success = construct_subset(static_cast<void*>(lma_nodes_le0_), 587 lemma_arr_, 0, lemma_num_, 0); 588 if (!dt_success) { 589 free_resource(); 590 return false; 591 } 592 593 #ifdef ___DO_STATISTICS___ 594 stat_print(); 595 #endif 596 597 // Move the node data and homo data to the DictTrie 598 dict_trie->root_ = new LmaNodeLE0[lma_nds_used_num_le0_]; 599 dict_trie->nodes_ge1_ = new LmaNodeGE1[lma_nds_used_num_ge1_]; 600 size_t lma_idx_num = homo_idx_num_eq1_ + homo_idx_num_gt1_ + top_lmas_num_; 601 dict_trie->lma_idx_buf_ = new unsigned char[lma_idx_num * kLemmaIdSize]; 602 assert(NULL != dict_trie->root_); 603 assert(NULL != dict_trie->lma_idx_buf_); 604 dict_trie->lma_node_num_le0_ = lma_nds_used_num_le0_; 605 dict_trie->lma_node_num_ge1_ = lma_nds_used_num_ge1_; 606 dict_trie->lma_idx_buf_len_ = lma_idx_num * kLemmaIdSize; 607 dict_trie->top_lmas_num_ = top_lmas_num_; 608 609 memcpy(dict_trie->root_, lma_nodes_le0_, 610 sizeof(LmaNodeLE0) * lma_nds_used_num_le0_); 611 memcpy(dict_trie->nodes_ge1_, lma_nodes_ge1_, 612 sizeof(LmaNodeGE1) * lma_nds_used_num_ge1_); 613 614 for (size_t pos = 0; pos < homo_idx_num_eq1_ + homo_idx_num_gt1_; pos++) { 615 id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize, 616 homo_idx_buf_[pos]); 617 } 618 619 for (size_t pos = homo_idx_num_eq1_ + homo_idx_num_gt1_; 620 pos < lma_idx_num; pos++) { 621 LemmaIdType idx = 622 top_lmas_[pos - homo_idx_num_eq1_ - homo_idx_num_gt1_].idx_by_hz; 623 id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize, idx); 624 } 625 626 if (kPrintDebug0) { 627 printf("homo_idx_num_eq1_: %d\n", homo_idx_num_eq1_); 628 printf("homo_idx_num_gt1_: %d\n", homo_idx_num_gt1_); 629 printf("top_lmas_num_: %d\n", top_lmas_num_); 630 } 631 632 free_resource(); 633 634 if (kPrintDebug0) { 635 printf("Building dict succeds\n"); 636 } 637 return dt_success; 638 } 639 640 void DictBuilder::id_to_charbuf(unsigned char *buf, LemmaIdType id) { 641 if (NULL == buf) return; 642 for (size_t pos = 0; pos < kLemmaIdSize; pos++) { 643 (buf)[pos] = (unsigned char)(id >> (pos * 8)); 644 } 645 } 646 647 void DictBuilder::set_son_offset(LmaNodeGE1 *node, size_t offset) { 648 node->son_1st_off_l = static_cast<uint16>(offset); 649 node->son_1st_off_h = static_cast<unsigned char>(offset >> 16); 650 } 651 652 void DictBuilder:: set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset) { 653 node->homo_idx_buf_off_l = static_cast<uint16>(offset); 654 node->homo_idx_buf_off_h = static_cast<unsigned char>(offset >> 16); 655 656 } 657 658 // All spelling strings will be converted to upper case, except that 659 // spellings started with "ZH"/"CH"/"SH" will be converted to 660 // "Zh"/"Ch"/"Sh" 661 void DictBuilder::format_spelling_str(char *spl_str) { 662 if (NULL == spl_str) 663 return; 664 665 uint16 pos = 0; 666 while ('\0' != spl_str[pos]) { 667 if (spl_str[pos] >= 'a' && spl_str[pos] <= 'z') 668 spl_str[pos] = spl_str[pos] - 'a' + 'A'; 669 670 if (1 == pos && 'H' == spl_str[pos]) { 671 if ('C' == spl_str[0] || 'S' == spl_str[0] || 'Z' == spl_str[0]) { 672 spl_str[pos] = 'h'; 673 } 674 } 675 pos++; 676 } 677 } 678 679 LemmaIdType DictBuilder::sort_lemmas_by_hz() { 680 if (NULL == lemma_arr_ || 0 == lemma_num_) 681 return 0; 682 683 myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), cmp_lemma_entry_hzs); 684 685 lemma_arr_[0].idx_by_hz = 1; 686 LemmaIdType idx_max = 1; 687 for (size_t i = 1; i < lemma_num_; i++) { 688 if (utf16_strcmp(lemma_arr_[i].hanzi_str, lemma_arr_[i-1].hanzi_str)) { 689 idx_max++; 690 lemma_arr_[i].idx_by_hz = idx_max; 691 } else { 692 idx_max++; 693 lemma_arr_[i].idx_by_hz = idx_max; 694 } 695 } 696 return idx_max + 1; 697 } 698 699 size_t DictBuilder::build_scis() { 700 if (NULL == scis_ || lemma_num_ * kMaxLemmaSize > scis_num_) 701 return 0; 702 703 SpellingTrie &spl_trie = SpellingTrie::get_instance(); 704 705 // This first one is blank, because id 0 is invalid. 706 scis_[0].freq = 0; 707 scis_[0].hz = 0; 708 scis_[0].splid.full_splid = 0; 709 scis_[0].splid.half_splid = 0; 710 scis_num_ = 1; 711 712 // Copy the hanzis to the buffer 713 for (size_t pos = 0; pos < lemma_num_; pos++) { 714 size_t hz_num = lemma_arr_[pos].hz_str_len; 715 for (size_t hzpos = 0; hzpos < hz_num; hzpos++) { 716 scis_[scis_num_].hz = lemma_arr_[pos].hanzi_str[hzpos]; 717 scis_[scis_num_].splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos]; 718 scis_[scis_num_].splid.half_splid = 719 spl_trie.full_to_half(scis_[scis_num_].splid.full_splid); 720 if (1 == hz_num) 721 scis_[scis_num_].freq = lemma_arr_[pos].freq; 722 else 723 scis_[scis_num_].freq = 0.000001; 724 scis_num_++; 725 } 726 } 727 728 myqsort(scis_, scis_num_, sizeof(SingleCharItem), cmp_scis_hz_splid_freq); 729 730 // Remove repeated items 731 size_t unique_scis_num = 1; 732 for (size_t pos = 1; pos < scis_num_; pos++) { 733 if (scis_[pos].hz == scis_[pos - 1].hz && 734 scis_[pos].splid.full_splid == scis_[pos - 1].splid.full_splid) 735 continue; 736 scis_[unique_scis_num] = scis_[pos]; 737 scis_[unique_scis_num].splid.half_splid = 738 spl_trie.full_to_half(scis_[pos].splid.full_splid); 739 unique_scis_num++; 740 } 741 742 scis_num_ = unique_scis_num; 743 744 // Update the lemma list. 745 for (size_t pos = 0; pos < lemma_num_; pos++) { 746 size_t hz_num = lemma_arr_[pos].hz_str_len; 747 for (size_t hzpos = 0; hzpos < hz_num; hzpos++) { 748 SingleCharItem key; 749 key.hz = lemma_arr_[pos].hanzi_str[hzpos]; 750 key.splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos]; 751 key.splid.half_splid = spl_trie.full_to_half(key.splid.full_splid); 752 753 SingleCharItem *found; 754 found = static_cast<SingleCharItem*>(mybsearch(&key, scis_, 755 unique_scis_num, 756 sizeof(SingleCharItem), 757 cmp_scis_hz_splid)); 758 759 assert(found); 760 761 lemma_arr_[pos].hanzi_scis_ids[hzpos] = 762 static_cast<uint16>(found - scis_); 763 lemma_arr_[pos].spl_idx_arr[hzpos] = found->splid.full_splid; 764 } 765 } 766 767 return scis_num_; 768 } 769 770 bool DictBuilder::construct_subset(void* parent, LemmaEntry* lemma_arr, 771 size_t item_start, size_t item_end, 772 size_t level) { 773 if (level >= kMaxLemmaSize || item_end <= item_start) 774 return false; 775 776 // 1. Scan for how many sons 777 size_t parent_son_num = 0; 778 // LemmaNode *son_1st = NULL; 779 // parent.num_of_son = 0; 780 781 LemmaEntry *lma_last_start = lemma_arr_ + item_start; 782 uint16 spl_idx_node = lma_last_start->spl_idx_arr[level]; 783 784 // Scan for how many sons to be allocaed 785 for (size_t i = item_start + 1; i< item_end; i++) { 786 LemmaEntry *lma_current = lemma_arr + i; 787 uint16 spl_idx_current = lma_current->spl_idx_arr[level]; 788 if (spl_idx_current != spl_idx_node) { 789 parent_son_num++; 790 spl_idx_node = spl_idx_current; 791 } 792 } 793 parent_son_num++; 794 795 #ifdef ___DO_STATISTICS___ 796 // Use to indicate whether all nodes of this layer have no son. 797 bool allson_noson = true; 798 799 assert(level < kMaxLemmaSize); 800 if (parent_son_num > max_sonbuf_len_[level]) 801 max_sonbuf_len_[level] = parent_son_num; 802 803 total_son_num_[level] += parent_son_num; 804 total_sonbuf_num_[level] += 1; 805 806 if (parent_son_num == 1) 807 sonbufs_num1_++; 808 else 809 sonbufs_numgt1_++; 810 total_lma_node_num_ += parent_son_num; 811 #endif 812 813 // 2. Update the parent's information 814 // Update the parent's son list; 815 LmaNodeLE0 *son_1st_le0 = NULL; // only one of le0 or ge1 is used 816 LmaNodeGE1 *son_1st_ge1 = NULL; // only one of le0 or ge1 is used. 817 if (0 == level) { // the parent is root 818 (static_cast<LmaNodeLE0*>(parent))->son_1st_off = 819 lma_nds_used_num_le0_; 820 son_1st_le0 = lma_nodes_le0_ + lma_nds_used_num_le0_; 821 lma_nds_used_num_le0_ += parent_son_num; 822 823 assert(parent_son_num <= 65535); 824 (static_cast<LmaNodeLE0*>(parent))->num_of_son = 825 static_cast<uint16>(parent_son_num); 826 } else if (1 == level) { // the parent is a son of root 827 (static_cast<LmaNodeLE0*>(parent))->son_1st_off = 828 lma_nds_used_num_ge1_; 829 son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_; 830 lma_nds_used_num_ge1_ += parent_son_num; 831 832 assert(parent_son_num <= 65535); 833 (static_cast<LmaNodeLE0*>(parent))->num_of_son = 834 static_cast<uint16>(parent_son_num); 835 } else { 836 set_son_offset((static_cast<LmaNodeGE1*>(parent)), 837 lma_nds_used_num_ge1_); 838 son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_; 839 lma_nds_used_num_ge1_ += parent_son_num; 840 841 assert(parent_son_num <= 255); 842 (static_cast<LmaNodeGE1*>(parent))->num_of_son = 843 (unsigned char)parent_son_num; 844 } 845 846 // 3. Now begin to construct the son one by one 847 size_t son_pos = 0; 848 849 lma_last_start = lemma_arr_ + item_start; 850 spl_idx_node = lma_last_start->spl_idx_arr[level]; 851 852 size_t homo_num = 0; 853 if (lma_last_start->spl_idx_arr[level + 1] == 0) 854 homo_num = 1; 855 856 size_t item_start_next = item_start; 857 858 for (size_t i = item_start + 1; i < item_end; i++) { 859 LemmaEntry* lma_current = lemma_arr_ + i; 860 uint16 spl_idx_current = lma_current->spl_idx_arr[level]; 861 862 if (spl_idx_current == spl_idx_node) { 863 if (lma_current->spl_idx_arr[level + 1] == 0) 864 homo_num++; 865 } else { 866 // Construct a node 867 LmaNodeLE0 *node_cur_le0 = NULL; // only one of them is valid 868 LmaNodeGE1 *node_cur_ge1 = NULL; 869 if (0 == level) { 870 node_cur_le0 = son_1st_le0 + son_pos; 871 node_cur_le0->spl_idx = spl_idx_node; 872 node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_; 873 node_cur_le0->son_1st_off = 0; 874 homo_idx_num_eq1_ += homo_num; 875 } else { 876 node_cur_ge1 = son_1st_ge1 + son_pos; 877 node_cur_ge1->spl_idx = spl_idx_node; 878 879 set_homo_id_buf_offset(node_cur_ge1, 880 (homo_idx_num_eq1_ + homo_idx_num_gt1_)); 881 set_son_offset(node_cur_ge1, 0); 882 homo_idx_num_gt1_ += homo_num; 883 } 884 885 if (homo_num > 0) { 886 LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ + 887 homo_idx_num_gt1_ - homo_num; 888 if (0 == level) { 889 assert(homo_num <= 65535); 890 node_cur_le0->num_of_homo = static_cast<uint16>(homo_num); 891 } else { 892 assert(homo_num <= 255); 893 node_cur_ge1->num_of_homo = (unsigned char)homo_num; 894 } 895 896 for (size_t homo_pos = 0; homo_pos < homo_num; homo_pos++) { 897 idx_buf[homo_pos] = lemma_arr_[item_start_next + homo_pos].idx_by_hz; 898 } 899 900 #ifdef ___DO_STATISTICS___ 901 if (homo_num > max_homobuf_len_[level]) 902 max_homobuf_len_[level] = homo_num; 903 904 total_homo_num_[level] += homo_num; 905 #endif 906 } 907 908 if (i - item_start_next > homo_num) { 909 void *next_parent; 910 if (0 == level) 911 next_parent = static_cast<void*>(node_cur_le0); 912 else 913 next_parent = static_cast<void*>(node_cur_ge1); 914 construct_subset(next_parent, lemma_arr, 915 item_start_next + homo_num, i, level + 1); 916 #ifdef ___DO_STATISTICS___ 917 918 total_node_hasson_[level] += 1; 919 allson_noson = false; 920 #endif 921 } 922 923 // for the next son 924 lma_last_start = lma_current; 925 spl_idx_node = spl_idx_current; 926 item_start_next = i; 927 homo_num = 0; 928 if (lma_current->spl_idx_arr[level + 1] == 0) 929 homo_num = 1; 930 931 son_pos++; 932 } 933 } 934 935 // 4. The last one to construct 936 LmaNodeLE0 *node_cur_le0 = NULL; // only one of them is valid 937 LmaNodeGE1 *node_cur_ge1 = NULL; 938 if (0 == level) { 939 node_cur_le0 = son_1st_le0 + son_pos; 940 node_cur_le0->spl_idx = spl_idx_node; 941 node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_; 942 node_cur_le0->son_1st_off = 0; 943 homo_idx_num_eq1_ += homo_num; 944 } else { 945 node_cur_ge1 = son_1st_ge1 + son_pos; 946 node_cur_ge1->spl_idx = spl_idx_node; 947 948 set_homo_id_buf_offset(node_cur_ge1, 949 (homo_idx_num_eq1_ + homo_idx_num_gt1_)); 950 set_son_offset(node_cur_ge1, 0); 951 homo_idx_num_gt1_ += homo_num; 952 } 953 954 if (homo_num > 0) { 955 LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ + 956 homo_idx_num_gt1_ - homo_num; 957 if (0 == level) { 958 assert(homo_num <= 65535); 959 node_cur_le0->num_of_homo = static_cast<uint16>(homo_num); 960 } else { 961 assert(homo_num <= 255); 962 node_cur_ge1->num_of_homo = (unsigned char)homo_num; 963 } 964 965 for (size_t homo_pos = 0; homo_pos < homo_num; homo_pos++) { 966 idx_buf[homo_pos] = lemma_arr[item_start_next + homo_pos].idx_by_hz; 967 } 968 969 #ifdef ___DO_STATISTICS___ 970 if (homo_num > max_homobuf_len_[level]) 971 max_homobuf_len_[level] = homo_num; 972 973 total_homo_num_[level] += homo_num; 974 #endif 975 } 976 977 if (item_end - item_start_next > homo_num) { 978 void *next_parent; 979 if (0 == level) 980 next_parent = static_cast<void*>(node_cur_le0); 981 else 982 next_parent = static_cast<void*>(node_cur_ge1); 983 construct_subset(next_parent, lemma_arr, 984 item_start_next + homo_num, item_end, level + 1); 985 #ifdef ___DO_STATISTICS___ 986 987 total_node_hasson_[level] += 1; 988 allson_noson = false; 989 #endif 990 } 991 992 #ifdef ___DO_STATISTICS___ 993 if (allson_noson) { 994 total_sonbuf_allnoson_[level] += 1; 995 total_node_in_sonbuf_allnoson_[level] += parent_son_num; 996 } 997 #endif 998 999 assert(son_pos + 1 == parent_son_num); 1000 return true; 1001 } 1002 1003 #ifdef ___DO_STATISTICS___ 1004 void DictBuilder::stat_init() { 1005 memset(max_sonbuf_len_, 0, sizeof(size_t) * kMaxLemmaSize); 1006 memset(max_homobuf_len_, 0, sizeof(size_t) * kMaxLemmaSize); 1007 memset(total_son_num_, 0, sizeof(size_t) * kMaxLemmaSize); 1008 memset(total_node_hasson_, 0, sizeof(size_t) * kMaxLemmaSize); 1009 memset(total_sonbuf_num_, 0, sizeof(size_t) * kMaxLemmaSize); 1010 memset(total_sonbuf_allnoson_, 0, sizeof(size_t) * kMaxLemmaSize); 1011 memset(total_node_in_sonbuf_allnoson_, 0, sizeof(size_t) * kMaxLemmaSize); 1012 memset(total_homo_num_, 0, sizeof(size_t) * kMaxLemmaSize); 1013 1014 sonbufs_num1_ = 0; 1015 sonbufs_numgt1_ = 0; 1016 total_lma_node_num_ = 0; 1017 } 1018 1019 void DictBuilder::stat_print() { 1020 printf("\n------------STAT INFO-------------\n"); 1021 printf("[root is layer -1]\n"); 1022 printf(".. max_sonbuf_len per layer(from layer 0):\n "); 1023 for (size_t i = 0; i < kMaxLemmaSize; i++) 1024 printf("%d, ", max_sonbuf_len_[i]); 1025 printf("-, \n"); 1026 1027 printf(".. max_homobuf_len per layer:\n -, "); 1028 for (size_t i = 0; i < kMaxLemmaSize; i++) 1029 printf("%d, ", max_homobuf_len_[i]); 1030 printf("\n"); 1031 1032 printf(".. total_son_num per layer:\n "); 1033 for (size_t i = 0; i < kMaxLemmaSize; i++) 1034 printf("%d, ", total_son_num_[i]); 1035 printf("-, \n"); 1036 1037 printf(".. total_node_hasson per layer:\n 1, "); 1038 for (size_t i = 0; i < kMaxLemmaSize; i++) 1039 printf("%d, ", total_node_hasson_[i]); 1040 printf("\n"); 1041 1042 printf(".. total_sonbuf_num per layer:\n "); 1043 for (size_t i = 0; i < kMaxLemmaSize; i++) 1044 printf("%d, ", total_sonbuf_num_[i]); 1045 printf("-, \n"); 1046 1047 printf(".. total_sonbuf_allnoson per layer:\n "); 1048 for (size_t i = 0; i < kMaxLemmaSize; i++) 1049 printf("%d, ", total_sonbuf_allnoson_[i]); 1050 printf("-, \n"); 1051 1052 printf(".. total_node_in_sonbuf_allnoson per layer:\n "); 1053 for (size_t i = 0; i < kMaxLemmaSize; i++) 1054 printf("%d, ", total_node_in_sonbuf_allnoson_[i]); 1055 printf("-, \n"); 1056 1057 printf(".. total_homo_num per layer:\n 0, "); 1058 for (size_t i = 0; i < kMaxLemmaSize; i++) 1059 printf("%d, ", total_homo_num_[i]); 1060 printf("\n"); 1061 1062 printf(".. son buf allocation number with only 1 son: %d\n", sonbufs_num1_); 1063 printf(".. son buf allocation number with more than 1 son: %d\n", 1064 sonbufs_numgt1_); 1065 printf(".. total lemma node number: %d\n", total_lma_node_num_ + 1); 1066 } 1067 #endif // ___DO_STATISTICS___ 1068 1069 #endif // ___BUILD_MODEL___ 1070 } // namespace ime_pinyin 1071