1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef PINYINIME_INCLUDE_DICTTRIE_H__ 18 #define PINYINIME_INCLUDE_DICTTRIE_H__ 19 20 #include <stdlib.h> 21 #include "./atomdictbase.h" 22 #include "./dictdef.h" 23 #include "./dictlist.h" 24 #include "./searchutility.h" 25 26 namespace ime_pinyin { 27 28 class DictTrie : AtomDictBase { 29 private: 30 typedef struct ParsingMark { 31 size_t node_offset:24; 32 size_t node_num:8; // Number of nodes with this spelling id given 33 // by spl_id. If spl_id is a Shengmu, for nodes 34 // in the first layer of DictTrie, it equals to 35 // SpellingTrie::shm2full_num(); but for those 36 // nodes which are not in the first layer, 37 // node_num < SpellingTrie::shm2full_num(). 38 // For a full spelling id, node_num = 1; 39 }; 40 41 // Used to indicate an extended mile stone. 42 // An extended mile stone is used to mark a partial match in the dictionary 43 // trie to speed up further potential extending. 44 // For example, when the user inputs "w", a mile stone is created to mark the 45 // partial match status, so that when user inputs another char 'm', it will be 46 // faster to extend search space based on this mile stone. 47 // 48 // For partial match status of "wm", there can be more than one sub mile 49 // stone, for example, "wm" can be matched to "wanm", "wom", ..., etc, so 50 // there may be more one parsing mark used to mark these partial matchings. 51 // A mile stone records the starting position in the mark list and number of 52 // marks. 53 struct MileStone { 54 uint16 mark_start; 55 uint16 mark_num; 56 }; 57 58 DictList* dict_list_; 59 60 const SpellingTrie *spl_trie_; 61 62 LmaNodeLE0* root_; // Nodes for root and the first layer. 63 LmaNodeGE1* nodes_ge1_; // Nodes for other layers. 64 65 // An quick index from spelling id to the LmaNodeLE0 node buffer, or 66 // to the root_ buffer. 67 // Index length: 68 // SpellingTrie::get_instance().get_spelling_num() + 1. The last one is used 69 // to get the end. 70 // All Shengmu ids are not indexed because they will be converted into 71 // corresponding full ids. 72 // So, given an id splid, the son is: 73 // root_[splid_le0_index_[splid - kFullSplIdStart]] 74 uint16 *splid_le0_index_; 75 76 size_t lma_node_num_le0_; 77 size_t lma_node_num_ge1_; 78 79 // The first part is for homophnies, and the last top_lma_num_ items are 80 // lemmas with highest scores. 81 unsigned char *lma_idx_buf_; 82 size_t lma_idx_buf_len_; // The total size of lma_idx_buf_ in byte. 83 size_t total_lma_num_; // Total number of lemmas in this dictionary. 84 size_t top_lmas_num_; // Number of lemma with highest scores. 85 86 // Parsing mark list used to mark the detailed extended statuses. 87 ParsingMark *parsing_marks_; 88 // The position for next available mark. 89 uint16 parsing_marks_pos_; 90 91 // Mile stone list used to mark the extended status. 92 MileStone *mile_stones_; 93 // The position for the next available mile stone. We use positions (except 0) 94 // as handles. 95 MileStoneHandle mile_stones_pos_; 96 97 // Get the offset of sons for a node. 98 inline size_t get_son_offset(const LmaNodeGE1 *node); 99 100 // Get the offset of homonious ids for a node. 101 inline size_t get_homo_idx_buf_offset(const LmaNodeGE1 *node); 102 103 // Get the lemma id by the offset. 104 inline LemmaIdType get_lemma_id(size_t id_offset); 105 106 void free_resource(bool free_dict_list); 107 108 bool load_dict(FILE *fp); 109 110 // Given a LmaNodeLE0 node, extract the lemmas specified by it, and fill 111 // them into the lpi_items buffer. 112 // This function is called by the search engine. 113 size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size, 114 LmaNodeLE0 *node); 115 116 // Given a LmaNodeGE1 node, extract the lemmas specified by it, and fill 117 // them into the lpi_items buffer. 118 // This function is called by inner functions extend_dict0(), extend_dict1() 119 // and extend_dict2(). 120 size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size, 121 size_t homo_buf_off, LmaNodeGE1 *node, 122 uint16 lma_len); 123 124 // Extend in the trie from level 0. 125 MileStoneHandle extend_dict0(MileStoneHandle from_handle, 126 const DictExtPara *dep, LmaPsbItem *lpi_items, 127 size_t lpi_max, size_t *lpi_num); 128 129 // Extend in the trie from level 1. 130 MileStoneHandle extend_dict1(MileStoneHandle from_handle, 131 const DictExtPara *dep, LmaPsbItem *lpi_items, 132 size_t lpi_max, size_t *lpi_num); 133 134 // Extend in the trie from level 2. 135 MileStoneHandle extend_dict2(MileStoneHandle from_handle, 136 const DictExtPara *dep, LmaPsbItem *lpi_items, 137 size_t lpi_max, size_t *lpi_num); 138 139 // Try to extend the given spelling id buffer, and if the given id_lemma can 140 // be successfully gotten, return true; 141 // The given spelling ids are all valid full ids. 142 bool try_extend(const uint16 *splids, uint16 splid_num, LemmaIdType id_lemma); 143 144 #ifdef ___BUILD_MODEL___ 145 bool save_dict(FILE *fp); 146 #endif // ___BUILD_MODEL___ 147 148 static const int kMaxMileStone = 100; 149 static const int kMaxParsingMark = 600; 150 static const MileStoneHandle kFirstValidMileStoneHandle = 1; 151 152 friend class DictParser; 153 friend class DictBuilder; 154 155 public: 156 157 DictTrie(); 158 ~DictTrie(); 159 160 #ifdef ___BUILD_MODEL___ 161 // Construct the tree from the file fn_raw. 162 // fn_validhzs provide the valid hanzi list. If fn_validhzs is 163 // NULL, only chars in GB2312 will be included. 164 bool build_dict(const char *fn_raw, const char *fn_validhzs); 165 166 // Save the binary dictionary 167 // Actually, the SpellingTrie/DictList instance will be also saved. 168 bool save_dict(const char *filename); 169 #endif // ___BUILD_MODEL___ 170 171 void convert_to_hanzis(char16 *str, uint16 str_len); 172 173 void convert_to_scis_ids(char16 *str, uint16 str_len); 174 175 // Load a binary dictionary 176 // The SpellingTrie instance/DictList will be also loaded 177 bool load_dict(const char *filename, LemmaIdType start_id, 178 LemmaIdType end_id); 179 bool load_dict_fd(int sys_fd, long start_offset, long length, 180 LemmaIdType start_id, LemmaIdType end_id); 181 bool close_dict() {return true;} 182 size_t number_of_lemmas() {return 0;} 183 184 void reset_milestones(uint16 from_step, MileStoneHandle from_handle); 185 186 MileStoneHandle extend_dict(MileStoneHandle from_handle, 187 const DictExtPara *dep, 188 LmaPsbItem *lpi_items, 189 size_t lpi_max, size_t *lpi_num); 190 191 size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len, 192 LmaPsbItem *lpi_items, size_t lpi_max); 193 194 uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max); 195 196 uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, 197 uint16 splids_max, bool arg_valid); 198 199 size_t predict(const char16 *last_hzs, uint16 hzs_len, 200 NPredictItem *npre_items, size_t npre_max, 201 size_t b4_used); 202 203 LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[], 204 uint16 lemma_len, uint16 count) {return 0;} 205 206 LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count, 207 bool selected) {return 0;} 208 209 LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[], 210 uint16 lemma_len) {return 0;} 211 212 LmaScoreType get_lemma_score(LemmaIdType lemma_id) {return 0;} 213 214 LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[], 215 uint16 lemma_len) {return 0;} 216 217 bool remove_lemma(LemmaIdType lemma_id) {return false;} 218 219 size_t get_total_lemma_count() {return 0;} 220 void set_total_lemma_count_of_others(size_t count); 221 222 void flush_cache() {} 223 224 LemmaIdType get_lemma_id(const char16 lemma_str[], uint16 lemma_len); 225 226 // Fill the lemmas with highest scores to the prediction buffer. 227 // his_len is the history length to fill in the prediction buffer. 228 size_t predict_top_lmas(size_t his_len, NPredictItem *npre_items, 229 size_t npre_max, size_t b4_used); 230 }; 231 } 232 233 #endif // PINYINIME_INCLUDE_DICTTRIE_H__ 234