1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef PINYINIME_INCLUDE_DICTBUILDER_H__ 18 #define PINYINIME_INCLUDE_DICTBUILDER_H__ 19 20 #include <stdlib.h> 21 #include "./utf16char.h" 22 #include "./dictdef.h" 23 #include "./dictlist.h" 24 #include "./spellingtable.h" 25 #include "./spellingtrie.h" 26 #include "./splparser.h" 27 28 namespace ime_pinyin { 29 30 #ifdef ___BUILD_MODEL___ 31 32 #define ___DO_STATISTICS___ 33 34 class DictTrie; 35 36 class DictBuilder { 37 private: 38 // The raw lemma array buffer. 39 LemmaEntry *lemma_arr_; 40 size_t lemma_num_; 41 42 // Used to store all possible single char items. 43 // Two items may have the same Hanzi while their spelling ids are different. 44 SingleCharItem *scis_; 45 size_t scis_num_; 46 47 // In the tree, root's level is -1. 48 // Lemma nodes for root, and level 0 49 LmaNodeLE0 *lma_nodes_le0_; 50 51 // Lemma nodes for layers whose levels are deeper than 0 52 LmaNodeGE1 *lma_nodes_ge1_; 53 54 // Number of used lemma nodes 55 size_t lma_nds_used_num_le0_; 56 size_t lma_nds_used_num_ge1_; 57 58 // Used to store homophonies' ids. 59 LemmaIdType *homo_idx_buf_; 60 // Number of homophonies each of which only contains one Chinese character. 61 size_t homo_idx_num_eq1_; 62 // Number of homophonies each of which contains more than one character. 63 size_t homo_idx_num_gt1_; 64 65 // The items with highest scores. 66 LemmaEntry *top_lmas_; 67 size_t top_lmas_num_; 68 69 SpellingTable *spl_table_; 70 SpellingParser *spl_parser_; 71 72 #ifdef ___DO_STATISTICS___ 73 size_t max_sonbuf_len_[kMaxLemmaSize]; 74 size_t max_homobuf_len_[kMaxLemmaSize]; 75 76 size_t total_son_num_[kMaxLemmaSize]; 77 size_t total_node_hasson_[kMaxLemmaSize]; 78 size_t total_sonbuf_num_[kMaxLemmaSize]; 79 size_t total_sonbuf_allnoson_[kMaxLemmaSize]; 80 size_t total_node_in_sonbuf_allnoson_[kMaxLemmaSize]; 81 size_t total_homo_num_[kMaxLemmaSize]; 82 83 size_t sonbufs_num1_; // Number of son buffer with only 1 son 84 size_t sonbufs_numgt1_; // Number of son buffer with more 1 son; 85 86 size_t total_lma_node_num_; 87 88 void stat_init(); 89 void stat_print(); 90 #endif 91 92 public: 93 94 DictBuilder(); 95 ~DictBuilder(); 96 97 // Build dictionary trie from the file fn_raw. File fn_validhzs provides 98 // valid chars. If fn_validhzs is NULL, only chars in GB2312 will be 99 // included. 100 bool build_dict(const char* fn_raw, const char* fn_validhzs, 101 DictTrie *dict_trie); 102 103 private: 104 // Fill in the buffer with id. The caller guarantees that the paramters are 105 // vaild. 106 void id_to_charbuf(unsigned char *buf, LemmaIdType id); 107 108 // Update the offset of sons for a node. 109 void set_son_offset(LmaNodeGE1 *node, size_t offset); 110 111 // Update the offset of homophonies' ids for a node. 112 void set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset); 113 114 // Format a speling string. 115 void format_spelling_str(char *spl_str); 116 117 // Sort the lemma_arr by the hanzi string, and give each of unique items 118 // a id. Why we need to sort the lemma list according to their Hanzi string 119 // is to find items started by a given prefix string to do prediction. 120 // Actually, the single char items are be in other order, for example, 121 // in spelling id order, etc. 122 // Return value is next un-allocated idx available. 123 LemmaIdType sort_lemmas_by_hz(); 124 125 // Build the SingleCharItem list, and fill the hanzi_scis_ids in the 126 // lemma buffer lemma_arr_. 127 // This function should be called after the lemma array is ready. 128 // Return the number of unique SingleCharItem elements. 129 size_t build_scis(); 130 131 // Construct a subtree using a subset of the spelling array (from 132 // item_star to item_end) 133 // parent is the parent node to update the necessary information 134 // parent can be a member of LmaNodeLE0 or LmaNodeGE1 135 bool construct_subset(void* parent, LemmaEntry* lemma_arr, 136 size_t item_start, size_t item_end, size_t level); 137 138 139 // Read valid Chinese Hanzis from the given file. 140 // num is used to return number of chars. 141 // The return buffer is sorted and caller needs to free the returned buffer. 142 char16* read_valid_hanzis(const char *fn_validhzs, size_t *num); 143 144 145 // Read a raw dictionary. max_item is the maximum number of items. If there 146 // are more items in the ditionary, only the first max_item will be read. 147 // Returned value is the number of items successfully read from the file. 148 size_t read_raw_dict(const char* fn_raw, const char *fn_validhzs, 149 size_t max_item); 150 151 // Try to find if a character is in hzs buffer. 152 bool hz_in_hanzis_list(const char16 *hzs, size_t hzs_len, char16 hz); 153 154 // Try to find if all characters in str are in hzs buffer. 155 bool str_in_hanzis_list(const char16 *hzs, size_t hzs_len, 156 const char16 *str, size_t str_len); 157 158 // Get these lemmas with toppest scores. 159 void get_top_lemmas(); 160 161 // Allocate resource to build dictionary. 162 // lma_num is the number of items to be loaded 163 bool alloc_resource(size_t lma_num); 164 165 // Free resource. 166 void free_resource(); 167 }; 168 #endif // ___BUILD_MODEL___ 169 } 170 171 #endif // PINYINIME_INCLUDE_DICTBUILDER_H__ 172