Home | History | Annotate | Download | only in share
      1 /*
      2  * Copyright (C) 2009 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include <assert.h>
     18 #include <stdio.h>
     19 #include <stdlib.h>
     20 #include <string.h>
     21 
     22 #include "../include/dictbuilder.h"
     23 #include "../include/dicttrie.h"
     24 #include "../include/mystdlib.h"
     25 #include "../include/ngram.h"
     26 #include "../include/searchutility.h"
     27 #include "../include/spellingtable.h"
     28 #include "../include/spellingtrie.h"
     29 #include "../include/splparser.h"
     30 #include "../include/utf16reader.h"
     31 
     32 namespace ime_pinyin {
     33 
     34 #ifdef ___BUILD_MODEL___
     35 
     36 static const size_t kReadBufLen = 512;
     37 static const size_t kSplTableHashLen = 2000;
     38 
     39 // Compare a SingleCharItem, first by Hanzis, then by spelling ids, then by
     40 // frequencies.
     41 int cmp_scis_hz_splid_freq(const void* p1, const void* p2) {
     42   const SingleCharItem *s1, *s2;
     43   s1 = static_cast<const SingleCharItem*>(p1);
     44   s2 = static_cast<const SingleCharItem*>(p2);
     45 
     46   if (s1->hz < s2->hz)
     47     return -1;
     48   if (s1->hz > s2->hz)
     49     return 1;
     50 
     51   if (s1->splid.half_splid < s2->splid.half_splid)
     52     return -1;
     53   if (s1->splid.half_splid > s2->splid.half_splid)
     54     return 1;
     55 
     56   if (s1->splid.full_splid < s2->splid.full_splid)
     57     return -1;
     58   if (s1->splid.full_splid > s2->splid.full_splid)
     59     return 1;
     60 
     61   if (s1->freq > s2->freq)
     62     return -1;
     63   if (s1->freq < s2->freq)
     64     return 1;
     65   return 0;
     66 }
     67 
     68 int cmp_scis_hz_splid(const void* p1, const void* p2) {
     69   const SingleCharItem *s1, *s2;
     70   s1 = static_cast<const SingleCharItem*>(p1);
     71   s2 = static_cast<const SingleCharItem*>(p2);
     72 
     73   if (s1->hz < s2->hz)
     74     return -1;
     75   if (s1->hz > s2->hz)
     76     return 1;
     77 
     78   if (s1->splid.half_splid < s2->splid.half_splid)
     79     return -1;
     80   if (s1->splid.half_splid > s2->splid.half_splid)
     81     return 1;
     82 
     83   if (s1->splid.full_splid < s2->splid.full_splid)
     84     return -1;
     85   if (s1->splid.full_splid > s2->splid.full_splid)
     86     return 1;
     87 
     88   return 0;
     89 }
     90 
     91 int cmp_lemma_entry_hzs(const void* p1, const void* p2) {
     92   size_t size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str);
     93   size_t size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str);
     94   if (size1 < size2)
     95     return -1;
     96   else if (size1 > size2)
     97     return 1;
     98 
     99   return utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str,
    100                       ((const LemmaEntry*)p2)->hanzi_str);
    101 }
    102 
    103 int compare_char16(const void* p1, const void* p2) {
    104   if (*((const char16*)p1) < *((const char16*)p2))
    105     return -1;
    106   if (*((const char16*)p1) > *((const char16*)p2))
    107     return 1;
    108   return 0;
    109 }
    110 
    111 int compare_py(const void* p1, const void* p2) {
    112   int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr,
    113                          ((const LemmaEntry*)p2)->spl_idx_arr);
    114 
    115   if (0 != ret)
    116     return ret;
    117 
    118   return static_cast<int>(((const LemmaEntry*)p2)->freq) -
    119          static_cast<int>(((const LemmaEntry*)p1)->freq);
    120 }
    121 
    122 // First hanzi, if the same, then Pinyin
    123 int cmp_lemma_entry_hzspys(const void* p1, const void* p2) {
    124   size_t size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str);
    125   size_t size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str);
    126   if (size1 < size2)
    127     return -1;
    128   else if (size1 > size2)
    129     return 1;
    130   int ret = utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str,
    131                          ((const LemmaEntry*)p2)->hanzi_str);
    132 
    133   if (0 != ret)
    134     return ret;
    135 
    136   ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr,
    137                      ((const LemmaEntry*)p2)->spl_idx_arr);
    138   return ret;
    139 }
    140 
    141 int compare_splid2(const void* p1, const void* p2) {
    142   int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr,
    143                          ((const LemmaEntry*)p2)->spl_idx_arr);
    144   return ret;
    145 }
    146 
    147 DictBuilder::DictBuilder() {
    148   lemma_arr_ = NULL;
    149   lemma_num_ = 0;
    150 
    151   scis_ = NULL;
    152   scis_num_ = 0;
    153 
    154   lma_nodes_le0_ = NULL;
    155   lma_nodes_ge1_ = NULL;
    156 
    157   lma_nds_used_num_le0_ = 0;
    158   lma_nds_used_num_ge1_ = 0;
    159 
    160   homo_idx_buf_ = NULL;
    161   homo_idx_num_eq1_ = 0;
    162   homo_idx_num_gt1_ = 0;
    163 
    164   top_lmas_ = NULL;
    165   top_lmas_num_ = 0;
    166 
    167   spl_table_ = NULL;
    168   spl_parser_ = NULL;
    169 }
    170 
    171 DictBuilder::~DictBuilder() {
    172   free_resource();
    173 }
    174 
    175 bool DictBuilder::alloc_resource(size_t lma_num) {
    176   if (0 == lma_num)
    177     return false;
    178 
    179   free_resource();
    180 
    181   lemma_num_ = lma_num;
    182   lemma_arr_ = new LemmaEntry[lemma_num_];
    183 
    184   top_lmas_num_ = 0;
    185   top_lmas_ = new LemmaEntry[kTopScoreLemmaNum];
    186 
    187   // New the scis_ buffer to the possible maximum size.
    188   scis_num_ = lemma_num_ * kMaxLemmaSize;
    189   scis_ = new SingleCharItem[scis_num_];
    190 
    191   // The root and first level nodes is less than kMaxSpellingNum + 1
    192   lma_nds_used_num_le0_ = 0;
    193   lma_nodes_le0_ = new LmaNodeLE0[kMaxSpellingNum + 1];
    194 
    195   // Other nodes is less than lemma_num
    196   lma_nds_used_num_ge1_ = 0;
    197   lma_nodes_ge1_ = new LmaNodeGE1[lemma_num_];
    198 
    199   homo_idx_buf_ = new LemmaIdType[lemma_num_];
    200   spl_table_ = new SpellingTable();
    201   spl_parser_ = new SpellingParser();
    202 
    203   if (NULL == lemma_arr_ || NULL == top_lmas_ ||
    204       NULL == scis_ || NULL == spl_table_ ||
    205       NULL == spl_parser_ || NULL == lma_nodes_le0_ ||
    206       NULL == lma_nodes_ge1_ || NULL == homo_idx_buf_) {
    207     free_resource();
    208     return false;
    209   }
    210 
    211   memset(lemma_arr_, 0, sizeof(LemmaEntry) * lemma_num_);
    212   memset(scis_, 0, sizeof(SingleCharItem) * scis_num_);
    213   memset(lma_nodes_le0_, 0, sizeof(LmaNodeLE0) * (kMaxSpellingNum + 1));
    214   memset(lma_nodes_ge1_, 0, sizeof(LmaNodeGE1) * lemma_num_);
    215   memset(homo_idx_buf_, 0, sizeof(LemmaIdType) * lemma_num_);
    216   spl_table_->init_table(kMaxPinyinSize, kSplTableHashLen, true);
    217 
    218   return true;
    219 }
    220 
    221 char16* DictBuilder::read_valid_hanzis(const char *fn_validhzs, size_t *num) {
    222   if (NULL == fn_validhzs || NULL == num)
    223     return NULL;
    224 
    225   *num = 0;
    226   FILE *fp = fopen(fn_validhzs, "rb");
    227   if (NULL == fp)
    228     return NULL;
    229 
    230   char16 utf16header;
    231   if (fread(&utf16header, sizeof(char16), 1, fp) != 1 ||
    232       0xfeff != utf16header) {
    233     fclose(fp);
    234     return NULL;
    235   }
    236 
    237   fseek(fp, 0, SEEK_END);
    238   *num = ftell(fp) / sizeof(char16);
    239   assert(*num >= 1);
    240   *num -= 1;
    241 
    242   char16 *hzs = new char16[*num];
    243   if (NULL == hzs) {
    244     fclose(fp);
    245     return NULL;
    246   }
    247 
    248   fseek(fp, 2, SEEK_SET);
    249 
    250   if (fread(hzs, sizeof(char16), *num, fp) != *num) {
    251     fclose(fp);
    252     delete [] hzs;
    253     return NULL;
    254   }
    255   fclose(fp);
    256 
    257   myqsort(hzs, *num, sizeof(char16), compare_char16);
    258   return hzs;
    259 }
    260 
    261 bool DictBuilder::hz_in_hanzis_list(const char16 *hzs, size_t hzs_len,
    262                                     char16 hz) {
    263   if (NULL == hzs)
    264     return false;
    265 
    266   char16 *found;
    267   found = static_cast<char16*>(
    268       mybsearch(&hz, hzs, hzs_len, sizeof(char16), compare_char16));
    269   if (NULL == found)
    270     return false;
    271 
    272   assert(*found == hz);
    273   return true;
    274 }
    275 
    276 // The caller makes sure that the parameters are valid.
    277 bool DictBuilder::str_in_hanzis_list(const char16 *hzs, size_t hzs_len,
    278                                      const char16 *str, size_t str_len) {
    279   if (NULL == hzs || NULL == str)
    280     return false;
    281 
    282   for (size_t pos = 0; pos < str_len; pos++) {
    283     if (!hz_in_hanzis_list(hzs, hzs_len, str[pos]))
    284       return false;
    285   }
    286   return true;
    287 }
    288 
    289 void DictBuilder::get_top_lemmas() {
    290   top_lmas_num_ = 0;
    291   if (NULL == lemma_arr_)
    292     return;
    293 
    294   for (size_t pos = 0; pos < lemma_num_; pos++) {
    295     if (0 == top_lmas_num_) {
    296       top_lmas_[0] = lemma_arr_[pos];
    297       top_lmas_num_ = 1;
    298       continue;
    299     }
    300 
    301     if (lemma_arr_[pos].freq > top_lmas_[top_lmas_num_ - 1].freq) {
    302       if (kTopScoreLemmaNum > top_lmas_num_)
    303         top_lmas_num_ += 1;
    304 
    305       size_t move_pos;
    306       for (move_pos = top_lmas_num_ - 1; move_pos > 0; move_pos--) {
    307         top_lmas_[move_pos] = top_lmas_[move_pos - 1];
    308         if (0 == move_pos - 1 ||
    309             (move_pos - 1 > 0 &&
    310              top_lmas_[move_pos - 2].freq > lemma_arr_[pos].freq)) {
    311           break;
    312         }
    313       }
    314       assert(move_pos > 0);
    315       top_lmas_[move_pos - 1] = lemma_arr_[pos];
    316     } else if (kTopScoreLemmaNum > top_lmas_num_) {
    317       top_lmas_[top_lmas_num_] = lemma_arr_[pos];
    318       top_lmas_num_ += 1;
    319     }
    320   }
    321 
    322   if (kPrintDebug0) {
    323     printf("\n------Top Lemmas------------------\n");
    324     for (size_t pos = 0; pos < top_lmas_num_; pos++) {
    325       printf("--%d, idx:%06d, score:%.5f\n", pos, top_lmas_[pos].idx_by_hz,
    326              top_lmas_[pos].freq);
    327     }
    328   }
    329 }
    330 
    331 void DictBuilder::free_resource() {
    332   if (NULL != lemma_arr_)
    333     delete [] lemma_arr_;
    334 
    335   if (NULL != scis_)
    336     delete [] scis_;
    337 
    338   if (NULL != lma_nodes_le0_)
    339     delete [] lma_nodes_le0_;
    340 
    341   if (NULL != lma_nodes_ge1_)
    342     delete [] lma_nodes_ge1_;
    343 
    344   if (NULL != homo_idx_buf_)
    345     delete [] homo_idx_buf_;
    346 
    347   if (NULL != spl_table_)
    348     delete spl_table_;
    349 
    350   if (NULL != spl_parser_)
    351     delete spl_parser_;
    352 
    353   lemma_arr_ = NULL;
    354   scis_ = NULL;
    355   lma_nodes_le0_ = NULL;
    356   lma_nodes_ge1_ = NULL;
    357   homo_idx_buf_ = NULL;
    358   spl_table_ = NULL;
    359   spl_parser_ = NULL;
    360 
    361   lemma_num_ = 0;
    362   lma_nds_used_num_le0_ = 0;
    363   lma_nds_used_num_ge1_ = 0;
    364   homo_idx_num_eq1_ = 0;
    365   homo_idx_num_gt1_ = 0;
    366 }
    367 
    368 size_t DictBuilder::read_raw_dict(const char* fn_raw,
    369                                   const char *fn_validhzs,
    370                                   size_t max_item) {
    371   if (NULL == fn_raw) return 0;
    372 
    373   Utf16Reader utf16_reader;
    374   if (!utf16_reader.open(fn_raw, kReadBufLen * 10))
    375     return false;
    376 
    377   char16 read_buf[kReadBufLen];
    378 
    379   // Read the number of lemmas in the file
    380   size_t lemma_num = 240000;
    381 
    382   // allocate resource required
    383   if (!alloc_resource(lemma_num)) {
    384     utf16_reader.close();
    385   }
    386 
    387   // Read the valid Hanzi list.
    388   char16 *valid_hzs = NULL;
    389   size_t valid_hzs_num = 0;
    390   valid_hzs = read_valid_hanzis(fn_validhzs, &valid_hzs_num);
    391 
    392   // Begin reading the lemma entries
    393   for (size_t i = 0; i < max_item; i++) {
    394     // read next entry
    395     if (!utf16_reader.readline(read_buf, kReadBufLen)) {
    396       lemma_num = i;
    397       break;
    398     }
    399 
    400     size_t token_size;
    401     char16 *token;
    402     char16 *to_tokenize = read_buf;
    403 
    404     // Get the Hanzi string
    405     token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
    406     if (NULL == token) {
    407       free_resource();
    408       utf16_reader.close();
    409       return false;
    410     }
    411 
    412     size_t lemma_size = utf16_strlen(token);
    413 
    414     if (lemma_size > kMaxLemmaSize) {
    415       i--;
    416       continue;
    417     }
    418 
    419     if (lemma_size > 4) {
    420       i--;
    421       continue;
    422     }
    423 
    424     // Copy to the lemma entry
    425     utf16_strcpy(lemma_arr_[i].hanzi_str, token);
    426 
    427     lemma_arr_[i].hz_str_len = token_size;
    428 
    429     // Get the freq string
    430     token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
    431     if (NULL == token) {
    432       free_resource();
    433       utf16_reader.close();
    434       return false;
    435     }
    436     lemma_arr_[i].freq = utf16_atof(token);
    437 
    438     if (lemma_size > 1 && lemma_arr_[i].freq < 60) {
    439       i--;
    440       continue;
    441     }
    442 
    443     // Get GBK mark, if no valid Hanzi list available, all items which contains
    444     // GBK characters will be discarded. Otherwise, all items which contains
    445     // characters outside of the valid Hanzi list will be discarded.
    446     token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
    447     assert(NULL != token);
    448     int gbk_flag = utf16_atoi(token);
    449     if (NULL == valid_hzs || 0 == valid_hzs_num) {
    450       if (0 != gbk_flag) {
    451         i--;
    452         continue;
    453       }
    454     } else {
    455       if (!str_in_hanzis_list(valid_hzs, valid_hzs_num,
    456           lemma_arr_[i].hanzi_str, lemma_arr_[i].hz_str_len)) {
    457         i--;
    458         continue;
    459       }
    460     }
    461 
    462     // Get spelling String
    463     bool spelling_not_support = false;
    464     for (size_t hz_pos = 0; hz_pos < (size_t)lemma_arr_[i].hz_str_len;
    465          hz_pos++) {
    466       // Get a Pinyin
    467       token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
    468       if (NULL == token) {
    469         free_resource();
    470         utf16_reader.close();
    471         return false;
    472       }
    473 
    474       assert(utf16_strlen(token) <= kMaxPinyinSize);
    475 
    476       utf16_strcpy_tochar(lemma_arr_[i].pinyin_str[hz_pos], token);
    477 
    478       format_spelling_str(lemma_arr_[i].pinyin_str[hz_pos]);
    479 
    480       // Put the pinyin to the spelling table
    481       if (!spl_table_->put_spelling(lemma_arr_[i].pinyin_str[hz_pos],
    482                                     lemma_arr_[i].freq)) {
    483         spelling_not_support = true;
    484         break;
    485       }
    486     }
    487 
    488     // The whole line must have been parsed fully, otherwise discard this one.
    489     token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
    490     if (spelling_not_support || NULL != token) {
    491       i--;
    492       continue;
    493     }
    494   }
    495 
    496   delete [] valid_hzs;
    497   utf16_reader.close();
    498 
    499   printf("read succesfully, lemma num: %d\n", lemma_num);
    500 
    501   return lemma_num;
    502 }
    503 
    504 bool DictBuilder::build_dict(const char *fn_raw,
    505                              const char *fn_validhzs,
    506                              DictTrie *dict_trie) {
    507   if (NULL == fn_raw || NULL == dict_trie)
    508     return false;
    509 
    510   lemma_num_ = read_raw_dict(fn_raw, fn_validhzs, 240000);
    511   if (0 == lemma_num_)
    512     return false;
    513 
    514   // Arrange the spelling table, and build a spelling tree
    515   // The size of an spelling. '\0' is included. If the spelling table is
    516   // initialized to calculate the spelling scores, the last char in the
    517   // spelling string will be score, and it is also included in spl_item_size.
    518   size_t spl_item_size;
    519   size_t spl_num;
    520   const char* spl_buf;
    521   spl_buf = spl_table_->arrange(&spl_item_size, &spl_num);
    522   if (NULL == spl_buf) {
    523     free_resource();
    524     return false;
    525   }
    526 
    527   SpellingTrie &spl_trie = SpellingTrie::get_instance();
    528 
    529   if (!spl_trie.construct(spl_buf, spl_item_size, spl_num,
    530                           spl_table_->get_score_amplifier(),
    531                           spl_table_->get_average_score())) {
    532     free_resource();
    533     return false;
    534   }
    535 
    536   printf("spelling tree construct successfully.\n");
    537 
    538   // Convert the spelling string to idxs
    539   for (size_t i = 0; i < lemma_num_; i++) {
    540     for (size_t hz_pos = 0; hz_pos < (size_t)lemma_arr_[i].hz_str_len;
    541          hz_pos++) {
    542       uint16 spl_idxs[2];
    543       uint16 spl_start_pos[3];
    544       bool is_pre = true;
    545       int spl_idx_num =
    546         spl_parser_->splstr_to_idxs(lemma_arr_[i].pinyin_str[hz_pos],
    547                                     strlen(lemma_arr_[i].pinyin_str[hz_pos]),
    548                                     spl_idxs, spl_start_pos, 2, is_pre);
    549       assert(1 == spl_idx_num);
    550 
    551       if (spl_trie.is_half_id(spl_idxs[0])) {
    552         uint16 num = spl_trie.half_to_full(spl_idxs[0], spl_idxs);
    553         assert(0 != num);
    554       }
    555       lemma_arr_[i].spl_idx_arr[hz_pos] = spl_idxs[0];
    556     }
    557   }
    558 
    559   // Sort the lemma items according to the hanzi, and give each unique item a
    560   // id
    561   sort_lemmas_by_hz();
    562 
    563   scis_num_ = build_scis();
    564 
    565   // Construct the dict list
    566   dict_trie->dict_list_ = new DictList();
    567   bool dl_success = dict_trie->dict_list_->init_list(scis_, scis_num_,
    568                                                      lemma_arr_, lemma_num_);
    569   assert(dl_success);
    570 
    571   // Construct the NGram information
    572   NGram& ngram = NGram::get_instance();
    573   ngram.build_unigram(lemma_arr_, lemma_num_,
    574                       lemma_arr_[lemma_num_ - 1].idx_by_hz + 1);
    575 
    576   // sort the lemma items according to the spelling idx string
    577   myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), compare_py);
    578 
    579   get_top_lemmas();
    580 
    581 #ifdef ___DO_STATISTICS___
    582   stat_init();
    583 #endif
    584 
    585   lma_nds_used_num_le0_ = 1;  // The root node
    586   bool dt_success = construct_subset(static_cast<void*>(lma_nodes_le0_),
    587                                      lemma_arr_, 0, lemma_num_, 0);
    588   if (!dt_success) {
    589     free_resource();
    590     return false;
    591   }
    592 
    593 #ifdef ___DO_STATISTICS___
    594   stat_print();
    595 #endif
    596 
    597   // Move the node data and homo data to the DictTrie
    598   dict_trie->root_ = new LmaNodeLE0[lma_nds_used_num_le0_];
    599   dict_trie->nodes_ge1_ = new LmaNodeGE1[lma_nds_used_num_ge1_];
    600   size_t lma_idx_num = homo_idx_num_eq1_ + homo_idx_num_gt1_ + top_lmas_num_;
    601   dict_trie->lma_idx_buf_ = new unsigned char[lma_idx_num * kLemmaIdSize];
    602   assert(NULL != dict_trie->root_);
    603   assert(NULL != dict_trie->lma_idx_buf_);
    604   dict_trie->lma_node_num_le0_ = lma_nds_used_num_le0_;
    605   dict_trie->lma_node_num_ge1_ = lma_nds_used_num_ge1_;
    606   dict_trie->lma_idx_buf_len_ = lma_idx_num * kLemmaIdSize;
    607   dict_trie->top_lmas_num_ = top_lmas_num_;
    608 
    609   memcpy(dict_trie->root_, lma_nodes_le0_,
    610          sizeof(LmaNodeLE0) * lma_nds_used_num_le0_);
    611   memcpy(dict_trie->nodes_ge1_, lma_nodes_ge1_,
    612          sizeof(LmaNodeGE1) * lma_nds_used_num_ge1_);
    613 
    614   for (size_t pos = 0; pos < homo_idx_num_eq1_ + homo_idx_num_gt1_; pos++) {
    615     id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize,
    616                   homo_idx_buf_[pos]);
    617   }
    618 
    619   for (size_t pos = homo_idx_num_eq1_ + homo_idx_num_gt1_;
    620        pos < lma_idx_num; pos++) {
    621     LemmaIdType idx =
    622         top_lmas_[pos - homo_idx_num_eq1_ - homo_idx_num_gt1_].idx_by_hz;
    623     id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize, idx);
    624   }
    625 
    626   if (kPrintDebug0) {
    627     printf("homo_idx_num_eq1_: %d\n", homo_idx_num_eq1_);
    628     printf("homo_idx_num_gt1_: %d\n", homo_idx_num_gt1_);
    629     printf("top_lmas_num_: %d\n", top_lmas_num_);
    630   }
    631 
    632   free_resource();
    633 
    634   if (kPrintDebug0) {
    635     printf("Building dict succeds\n");
    636   }
    637   return dt_success;
    638 }
    639 
    640 void DictBuilder::id_to_charbuf(unsigned char *buf, LemmaIdType id) {
    641   if (NULL == buf) return;
    642   for (size_t pos = 0; pos < kLemmaIdSize; pos++) {
    643     (buf)[pos] = (unsigned char)(id >> (pos * 8));
    644   }
    645 }
    646 
    647 void DictBuilder::set_son_offset(LmaNodeGE1 *node, size_t offset) {
    648   node->son_1st_off_l = static_cast<uint16>(offset);
    649   node->son_1st_off_h = static_cast<unsigned char>(offset >> 16);
    650 }
    651 
    652 void DictBuilder:: set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset) {
    653   node->homo_idx_buf_off_l = static_cast<uint16>(offset);
    654   node->homo_idx_buf_off_h = static_cast<unsigned char>(offset >> 16);
    655 
    656 }
    657 
    658 // All spelling strings will be converted to upper case, except that
    659 // spellings started with "ZH"/"CH"/"SH" will be converted to
    660 // "Zh"/"Ch"/"Sh"
    661 void DictBuilder::format_spelling_str(char *spl_str) {
    662   if (NULL == spl_str)
    663     return;
    664 
    665   uint16 pos = 0;
    666   while ('\0' != spl_str[pos]) {
    667     if (spl_str[pos] >= 'a' && spl_str[pos] <= 'z')
    668       spl_str[pos] = spl_str[pos] - 'a' + 'A';
    669 
    670     if (1 == pos && 'H' == spl_str[pos]) {
    671       if ('C' == spl_str[0] || 'S' == spl_str[0] || 'Z' == spl_str[0]) {
    672         spl_str[pos] = 'h';
    673       }
    674     }
    675     pos++;
    676   }
    677 }
    678 
    679 LemmaIdType DictBuilder::sort_lemmas_by_hz() {
    680   if (NULL == lemma_arr_ || 0 == lemma_num_)
    681     return 0;
    682 
    683   myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), cmp_lemma_entry_hzs);
    684 
    685   lemma_arr_[0].idx_by_hz = 1;
    686   LemmaIdType idx_max = 1;
    687   for (size_t i = 1; i < lemma_num_; i++) {
    688     if (utf16_strcmp(lemma_arr_[i].hanzi_str, lemma_arr_[i-1].hanzi_str)) {
    689       idx_max++;
    690       lemma_arr_[i].idx_by_hz = idx_max;
    691     } else {
    692       idx_max++;
    693       lemma_arr_[i].idx_by_hz = idx_max;
    694     }
    695   }
    696   return idx_max + 1;
    697 }
    698 
    699 size_t DictBuilder::build_scis() {
    700   if (NULL == scis_ || lemma_num_ * kMaxLemmaSize > scis_num_)
    701     return 0;
    702 
    703   SpellingTrie &spl_trie = SpellingTrie::get_instance();
    704 
    705   // This first one is blank, because id 0 is invalid.
    706   scis_[0].freq = 0;
    707   scis_[0].hz = 0;
    708   scis_[0].splid.full_splid = 0;
    709   scis_[0].splid.half_splid = 0;
    710   scis_num_ = 1;
    711 
    712   // Copy the hanzis to the buffer
    713   for (size_t pos = 0; pos < lemma_num_; pos++) {
    714     size_t hz_num = lemma_arr_[pos].hz_str_len;
    715     for (size_t hzpos = 0; hzpos < hz_num; hzpos++) {
    716       scis_[scis_num_].hz = lemma_arr_[pos].hanzi_str[hzpos];
    717       scis_[scis_num_].splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos];
    718       scis_[scis_num_].splid.half_splid =
    719           spl_trie.full_to_half(scis_[scis_num_].splid.full_splid);
    720       if (1 == hz_num)
    721         scis_[scis_num_].freq = lemma_arr_[pos].freq;
    722       else
    723         scis_[scis_num_].freq = 0.000001;
    724       scis_num_++;
    725     }
    726   }
    727 
    728   myqsort(scis_, scis_num_, sizeof(SingleCharItem), cmp_scis_hz_splid_freq);
    729 
    730   // Remove repeated items
    731   size_t unique_scis_num = 1;
    732   for (size_t pos = 1; pos < scis_num_; pos++) {
    733     if (scis_[pos].hz == scis_[pos - 1].hz &&
    734         scis_[pos].splid.full_splid == scis_[pos - 1].splid.full_splid)
    735       continue;
    736     scis_[unique_scis_num] = scis_[pos];
    737     scis_[unique_scis_num].splid.half_splid =
    738         spl_trie.full_to_half(scis_[pos].splid.full_splid);
    739     unique_scis_num++;
    740   }
    741 
    742   scis_num_ = unique_scis_num;
    743 
    744   // Update the lemma list.
    745   for (size_t pos = 0; pos < lemma_num_; pos++) {
    746     size_t hz_num = lemma_arr_[pos].hz_str_len;
    747     for (size_t hzpos = 0; hzpos < hz_num; hzpos++) {
    748       SingleCharItem key;
    749       key.hz = lemma_arr_[pos].hanzi_str[hzpos];
    750       key.splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos];
    751       key.splid.half_splid = spl_trie.full_to_half(key.splid.full_splid);
    752 
    753       SingleCharItem *found;
    754       found = static_cast<SingleCharItem*>(mybsearch(&key, scis_,
    755                                                      unique_scis_num,
    756                                                      sizeof(SingleCharItem),
    757                                                      cmp_scis_hz_splid));
    758 
    759       assert(found);
    760 
    761       lemma_arr_[pos].hanzi_scis_ids[hzpos] =
    762           static_cast<uint16>(found - scis_);
    763       lemma_arr_[pos].spl_idx_arr[hzpos] = found->splid.full_splid;
    764     }
    765   }
    766 
    767   return scis_num_;
    768 }
    769 
    770 bool DictBuilder::construct_subset(void* parent, LemmaEntry* lemma_arr,
    771                                    size_t item_start, size_t item_end,
    772                                    size_t level) {
    773   if (level >= kMaxLemmaSize || item_end <= item_start)
    774     return false;
    775 
    776   // 1. Scan for how many sons
    777   size_t parent_son_num = 0;
    778   // LemmaNode *son_1st = NULL;
    779   // parent.num_of_son = 0;
    780 
    781   LemmaEntry *lma_last_start = lemma_arr_ + item_start;
    782   uint16 spl_idx_node = lma_last_start->spl_idx_arr[level];
    783 
    784   // Scan for how many sons to be allocaed
    785   for (size_t i = item_start + 1; i< item_end; i++) {
    786     LemmaEntry *lma_current = lemma_arr + i;
    787     uint16 spl_idx_current = lma_current->spl_idx_arr[level];
    788     if (spl_idx_current != spl_idx_node) {
    789       parent_son_num++;
    790       spl_idx_node = spl_idx_current;
    791     }
    792   }
    793   parent_son_num++;
    794 
    795 #ifdef ___DO_STATISTICS___
    796   // Use to indicate whether all nodes of this layer have no son.
    797   bool allson_noson = true;
    798 
    799   assert(level < kMaxLemmaSize);
    800   if (parent_son_num > max_sonbuf_len_[level])
    801     max_sonbuf_len_[level] = parent_son_num;
    802 
    803   total_son_num_[level] += parent_son_num;
    804   total_sonbuf_num_[level] += 1;
    805 
    806   if (parent_son_num == 1)
    807     sonbufs_num1_++;
    808   else
    809     sonbufs_numgt1_++;
    810   total_lma_node_num_ += parent_son_num;
    811 #endif
    812 
    813   // 2. Update the parent's information
    814   //    Update the parent's son list;
    815   LmaNodeLE0 *son_1st_le0 = NULL;  // only one of le0 or ge1 is used
    816   LmaNodeGE1 *son_1st_ge1 = NULL;  // only one of le0 or ge1 is used.
    817   if (0 == level) {                 // the parent is root
    818     (static_cast<LmaNodeLE0*>(parent))->son_1st_off =
    819       lma_nds_used_num_le0_;
    820     son_1st_le0 = lma_nodes_le0_ + lma_nds_used_num_le0_;
    821     lma_nds_used_num_le0_ += parent_son_num;
    822 
    823     assert(parent_son_num <= 65535);
    824     (static_cast<LmaNodeLE0*>(parent))->num_of_son =
    825       static_cast<uint16>(parent_son_num);
    826   } else if (1 == level) {  // the parent is a son of root
    827     (static_cast<LmaNodeLE0*>(parent))->son_1st_off =
    828       lma_nds_used_num_ge1_;
    829     son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_;
    830     lma_nds_used_num_ge1_ += parent_son_num;
    831 
    832     assert(parent_son_num <= 65535);
    833     (static_cast<LmaNodeLE0*>(parent))->num_of_son =
    834       static_cast<uint16>(parent_son_num);
    835   } else {
    836     set_son_offset((static_cast<LmaNodeGE1*>(parent)),
    837                    lma_nds_used_num_ge1_);
    838     son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_;
    839     lma_nds_used_num_ge1_ += parent_son_num;
    840 
    841     assert(parent_son_num <= 255);
    842     (static_cast<LmaNodeGE1*>(parent))->num_of_son =
    843       (unsigned char)parent_son_num;
    844   }
    845 
    846   // 3. Now begin to construct the son one by one
    847   size_t son_pos = 0;
    848 
    849   lma_last_start = lemma_arr_ + item_start;
    850   spl_idx_node = lma_last_start->spl_idx_arr[level];
    851 
    852   size_t homo_num = 0;
    853   if (lma_last_start->spl_idx_arr[level + 1] == 0)
    854     homo_num = 1;
    855 
    856   size_t item_start_next = item_start;
    857 
    858   for (size_t i = item_start + 1; i < item_end; i++) {
    859     LemmaEntry* lma_current = lemma_arr_ + i;
    860     uint16 spl_idx_current = lma_current->spl_idx_arr[level];
    861 
    862     if (spl_idx_current == spl_idx_node) {
    863       if (lma_current->spl_idx_arr[level + 1] == 0)
    864         homo_num++;
    865     } else {
    866       // Construct a node
    867       LmaNodeLE0 *node_cur_le0 = NULL;  // only one of them is valid
    868       LmaNodeGE1 *node_cur_ge1 = NULL;
    869       if (0 == level) {
    870         node_cur_le0 = son_1st_le0 + son_pos;
    871         node_cur_le0->spl_idx = spl_idx_node;
    872         node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_;
    873         node_cur_le0->son_1st_off = 0;
    874         homo_idx_num_eq1_ += homo_num;
    875       } else {
    876         node_cur_ge1 = son_1st_ge1 + son_pos;
    877         node_cur_ge1->spl_idx = spl_idx_node;
    878 
    879         set_homo_id_buf_offset(node_cur_ge1,
    880                                (homo_idx_num_eq1_ + homo_idx_num_gt1_));
    881         set_son_offset(node_cur_ge1, 0);
    882         homo_idx_num_gt1_ += homo_num;
    883       }
    884 
    885       if (homo_num > 0) {
    886         LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ +
    887               homo_idx_num_gt1_ - homo_num;
    888         if (0 == level) {
    889           assert(homo_num <= 65535);
    890           node_cur_le0->num_of_homo = static_cast<uint16>(homo_num);
    891         } else {
    892           assert(homo_num <= 255);
    893           node_cur_ge1->num_of_homo = (unsigned char)homo_num;
    894         }
    895 
    896         for (size_t homo_pos = 0; homo_pos < homo_num; homo_pos++) {
    897           idx_buf[homo_pos] = lemma_arr_[item_start_next + homo_pos].idx_by_hz;
    898         }
    899 
    900 #ifdef ___DO_STATISTICS___
    901         if (homo_num > max_homobuf_len_[level])
    902           max_homobuf_len_[level] = homo_num;
    903 
    904         total_homo_num_[level] += homo_num;
    905 #endif
    906       }
    907 
    908       if (i - item_start_next > homo_num) {
    909         void *next_parent;
    910         if (0 == level)
    911           next_parent = static_cast<void*>(node_cur_le0);
    912         else
    913           next_parent = static_cast<void*>(node_cur_ge1);
    914         construct_subset(next_parent, lemma_arr,
    915                          item_start_next + homo_num, i, level + 1);
    916 #ifdef ___DO_STATISTICS___
    917 
    918         total_node_hasson_[level] += 1;
    919         allson_noson = false;
    920 #endif
    921       }
    922 
    923       // for the next son
    924       lma_last_start = lma_current;
    925       spl_idx_node = spl_idx_current;
    926       item_start_next = i;
    927       homo_num = 0;
    928       if (lma_current->spl_idx_arr[level + 1] == 0)
    929         homo_num = 1;
    930 
    931       son_pos++;
    932     }
    933   }
    934 
    935   // 4. The last one to construct
    936   LmaNodeLE0 *node_cur_le0 = NULL;  // only one of them is valid
    937   LmaNodeGE1 *node_cur_ge1 = NULL;
    938   if (0 == level) {
    939     node_cur_le0 = son_1st_le0 + son_pos;
    940     node_cur_le0->spl_idx = spl_idx_node;
    941     node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_;
    942     node_cur_le0->son_1st_off = 0;
    943     homo_idx_num_eq1_ += homo_num;
    944   } else {
    945     node_cur_ge1 = son_1st_ge1 + son_pos;
    946     node_cur_ge1->spl_idx = spl_idx_node;
    947 
    948     set_homo_id_buf_offset(node_cur_ge1,
    949                            (homo_idx_num_eq1_ + homo_idx_num_gt1_));
    950     set_son_offset(node_cur_ge1, 0);
    951     homo_idx_num_gt1_ += homo_num;
    952   }
    953 
    954   if (homo_num > 0) {
    955     LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ +
    956           homo_idx_num_gt1_ - homo_num;
    957     if (0 == level) {
    958       assert(homo_num <= 65535);
    959       node_cur_le0->num_of_homo = static_cast<uint16>(homo_num);
    960     } else {
    961       assert(homo_num <= 255);
    962       node_cur_ge1->num_of_homo = (unsigned char)homo_num;
    963     }
    964 
    965     for (size_t homo_pos = 0; homo_pos < homo_num; homo_pos++) {
    966       idx_buf[homo_pos] = lemma_arr[item_start_next + homo_pos].idx_by_hz;
    967     }
    968 
    969 #ifdef ___DO_STATISTICS___
    970     if (homo_num > max_homobuf_len_[level])
    971       max_homobuf_len_[level] = homo_num;
    972 
    973     total_homo_num_[level] += homo_num;
    974 #endif
    975   }
    976 
    977   if (item_end - item_start_next > homo_num) {
    978     void *next_parent;
    979     if (0 == level)
    980       next_parent = static_cast<void*>(node_cur_le0);
    981     else
    982       next_parent = static_cast<void*>(node_cur_ge1);
    983     construct_subset(next_parent, lemma_arr,
    984                      item_start_next + homo_num, item_end, level + 1);
    985 #ifdef ___DO_STATISTICS___
    986 
    987     total_node_hasson_[level] += 1;
    988     allson_noson = false;
    989 #endif
    990   }
    991 
    992 #ifdef ___DO_STATISTICS___
    993   if (allson_noson) {
    994     total_sonbuf_allnoson_[level] += 1;
    995     total_node_in_sonbuf_allnoson_[level] += parent_son_num;
    996   }
    997 #endif
    998 
    999   assert(son_pos + 1 == parent_son_num);
   1000   return true;
   1001 }
   1002 
   1003 #ifdef ___DO_STATISTICS___
   1004 void DictBuilder::stat_init() {
   1005   memset(max_sonbuf_len_, 0, sizeof(size_t) * kMaxLemmaSize);
   1006   memset(max_homobuf_len_, 0, sizeof(size_t) * kMaxLemmaSize);
   1007   memset(total_son_num_, 0, sizeof(size_t) * kMaxLemmaSize);
   1008   memset(total_node_hasson_, 0, sizeof(size_t) * kMaxLemmaSize);
   1009   memset(total_sonbuf_num_, 0, sizeof(size_t) * kMaxLemmaSize);
   1010   memset(total_sonbuf_allnoson_, 0, sizeof(size_t) * kMaxLemmaSize);
   1011   memset(total_node_in_sonbuf_allnoson_, 0, sizeof(size_t) * kMaxLemmaSize);
   1012   memset(total_homo_num_, 0, sizeof(size_t) * kMaxLemmaSize);
   1013 
   1014   sonbufs_num1_ = 0;
   1015   sonbufs_numgt1_ = 0;
   1016   total_lma_node_num_ = 0;
   1017 }
   1018 
   1019 void DictBuilder::stat_print() {
   1020   printf("\n------------STAT INFO-------------\n");
   1021   printf("[root is layer -1]\n");
   1022   printf(".. max_sonbuf_len per layer(from layer 0):\n   ");
   1023   for (size_t i = 0; i < kMaxLemmaSize; i++)
   1024     printf("%d, ", max_sonbuf_len_[i]);
   1025   printf("-, \n");
   1026 
   1027   printf(".. max_homobuf_len per layer:\n   -, ");
   1028   for (size_t i = 0; i < kMaxLemmaSize; i++)
   1029     printf("%d, ", max_homobuf_len_[i]);
   1030   printf("\n");
   1031 
   1032   printf(".. total_son_num per layer:\n   ");
   1033   for (size_t i = 0; i < kMaxLemmaSize; i++)
   1034     printf("%d, ", total_son_num_[i]);
   1035   printf("-, \n");
   1036 
   1037   printf(".. total_node_hasson per layer:\n   1, ");
   1038   for (size_t i = 0; i < kMaxLemmaSize; i++)
   1039     printf("%d, ", total_node_hasson_[i]);
   1040   printf("\n");
   1041 
   1042   printf(".. total_sonbuf_num per layer:\n   ");
   1043   for (size_t i = 0; i < kMaxLemmaSize; i++)
   1044     printf("%d, ", total_sonbuf_num_[i]);
   1045   printf("-, \n");
   1046 
   1047   printf(".. total_sonbuf_allnoson per layer:\n   ");
   1048   for (size_t i = 0; i < kMaxLemmaSize; i++)
   1049     printf("%d, ", total_sonbuf_allnoson_[i]);
   1050   printf("-, \n");
   1051 
   1052   printf(".. total_node_in_sonbuf_allnoson per layer:\n   ");
   1053   for (size_t i = 0; i < kMaxLemmaSize; i++)
   1054     printf("%d, ", total_node_in_sonbuf_allnoson_[i]);
   1055   printf("-, \n");
   1056 
   1057   printf(".. total_homo_num per layer:\n   0, ");
   1058   for (size_t i = 0; i < kMaxLemmaSize; i++)
   1059     printf("%d, ", total_homo_num_[i]);
   1060   printf("\n");
   1061 
   1062   printf(".. son buf allocation number with only 1 son: %d\n", sonbufs_num1_);
   1063   printf(".. son buf allocation number with more than 1 son: %d\n",
   1064          sonbufs_numgt1_);
   1065   printf(".. total lemma node number: %d\n", total_lma_node_num_ + 1);
   1066 }
   1067 #endif  // ___DO_STATISTICS___
   1068 
   1069 #endif  // ___BUILD_MODEL___
   1070 }  // namespace ime_pinyin
   1071