1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__ 18 #define PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__ 19 20 #include <stdlib.h> 21 #include "./atomdictbase.h" 22 #include "./dicttrie.h" 23 #include "./searchutility.h" 24 #include "./spellingtrie.h" 25 #include "./splparser.h" 26 27 namespace ime_pinyin { 28 29 static const size_t kMaxRowNum = kMaxSearchSteps; 30 31 typedef struct { 32 // MileStoneHandle objects for the system and user dictionaries. 33 MileStoneHandle dict_handles[2]; 34 // From which DMI node. -1 means it's from root. 35 PoolPosType dmi_fr; 36 // The spelling id for the Pinyin string from the previous DMI to this node. 37 // If it is a half id like Shengmu, the node pointed by dict_node is the first 38 // node with this Shengmu, 39 uint16 spl_id; 40 // What's the level of the dict node. Level of root is 0, but root is never 41 // recorded by dict_node. 42 unsigned char dict_level:7; 43 // If this node is for composing phrase, this bit is 1. 44 unsigned char c_phrase:1; 45 // Whether the spl_id is parsed with a split character at the end. 46 unsigned char splid_end_split:1; 47 // What's the length of the spelling string for this match, for the whole 48 // word. 49 unsigned char splstr_len:7; 50 // Used to indicate whether all spelling ids from the root are full spelling 51 // ids. This information is useful for keymapping mode(not finished). Because 52 // in this mode, there is no clear boundaries, we prefer those results which 53 // have full spelling ids. 54 unsigned char all_full_id:1; 55 } DictMatchInfo, *PDictMatchInfo; 56 57 typedef struct MatrixNode { 58 LemmaIdType id; 59 float score; 60 MatrixNode *from; 61 // From which DMI node. Used to trace the spelling segmentation. 62 PoolPosType dmi_fr; 63 uint16 step; 64 } MatrixNode, *PMatrixNode; 65 66 typedef struct { 67 // The MatrixNode position in the matrix pool 68 PoolPosType mtrx_nd_pos; 69 // The DictMatchInfo position in the DictMatchInfo pool. 70 PoolPosType dmi_pos; 71 uint16 mtrx_nd_num; 72 uint16 dmi_num:15; 73 // Used to indicate whether there are dmi nodes in this step with full 74 // spelling id. This information is used to decide whether a substring of a 75 // valid Pinyin should be extended. 76 // 77 // Example1: shoudao 78 // When the last char 'o' is added, the parser will find "dao" is a valid 79 // Pinyin, and because all dmi nodes at location 'd' (including those for 80 // "shoud", and those for "d") have Shengmu id only, so it is not necessary 81 // to extend "ao", otherwise the result may be "shoud ao", that is not 82 // reasonable. 83 // 84 // Example2: hengao 85 // When the last 'o' is added, the parser finds "gao" is a valid Pinyin. 86 // Because some dmi nodes at 'g' has Shengmu ids (hen'g and g), but some dmi 87 // nodes at 'g' has full ids ('heng'), so it is necessary to extend "ao", thus 88 // "heng ao" can also be the result. 89 // 90 // Similarly, "ganga" is expanded to "gang a". 91 // 92 // For Pinyin string "xian", because "xian" is a valid Pinyin, because all dmi 93 // nodes at 'x' only have Shengmu ids, the parser will not try "x ian" (and it 94 // is not valid either). If the parser uses break in the loop, the result 95 // always be "xian"; but if the parser uses continue in the loop, "xi an" will 96 // also be tried. This behaviour can be set via the function 97 // set_xi_an_switch(). 98 uint16 dmi_has_full_id:1; 99 // Points to a MatrixNode of the current step to indicate which choice the 100 // user selects. 101 MatrixNode *mtrx_nd_fixed; 102 } MatrixRow, *PMatrixRow; 103 104 // When user inputs and selects candidates, the fixed lemma ids are stored in 105 // lma_id_ of class MatrixSearch, and fixed_lmas_ is used to indicate how many 106 // lemmas from the beginning are fixed. If user deletes Pinyin characters one 107 // by one from the end, these fixed lemmas can be unlocked one by one when 108 // necessary. Whenever user deletes a Chinese character and its spelling string 109 // in these fixed lemmas, all fixed lemmas will be merged together into a unit 110 // named ComposingPhrase with a lemma id kLemmaIdComposing, and this composing 111 // phrase will be the first lemma in the sentence. Because it contains some 112 // modified lemmas (by deleting a character), these merged lemmas are called 113 // sub lemmas (sublma), and each of them are represented individually, so that 114 // when user deletes Pinyin characters from the end, these sub lemmas can also 115 // be unlocked one by one. 116 typedef struct { 117 uint16 spl_ids[kMaxRowNum]; 118 uint16 spl_start[kMaxRowNum]; 119 char16 chn_str[kMaxRowNum]; // Chinese string. 120 uint16 sublma_start[kMaxRowNum]; // Counted in Chinese characters. 121 size_t sublma_num; 122 uint16 length; // Counted in Chinese characters. 123 } ComposingPhrase, *TComposingPhrase; 124 125 class MatrixSearch { 126 private: 127 // If it is true, prediction list by string whose length is greater than 1 128 // will be limited to a reasonable number. 129 static const bool kPredictLimitGt1 = false; 130 131 // If it is true, the engine will prefer long history based prediction, 132 // for example, when user inputs "BeiJing", we prefer "DaXue", etc., which are 133 // based on the two-character history. 134 static const bool kPreferLongHistoryPredict = true; 135 136 // If it is true, prediction will only be based on user dictionary. this flag 137 // is for debug purpose. 138 static const bool kOnlyUserDictPredict = false; 139 140 // The maximum buffer to store LmaPsbItems. 141 static const size_t kMaxLmaPsbItems = 1450; 142 143 // How many rows for each step. 144 static const size_t kMaxNodeARow = 5; 145 146 // The maximum length of the sentence candidates counted in chinese 147 // characters 148 static const size_t kMaxSentenceLength = 16; 149 150 // The size of the matrix node pool. 151 static const size_t kMtrxNdPoolSize = 200; 152 153 // The size of the DMI node pool. 154 static const size_t kDmiPoolSize = 800; 155 156 // Used to indicate whether this object has been initialized. 157 bool inited_; 158 159 // Spelling trie. 160 const SpellingTrie *spl_trie_; 161 162 // Used to indicate this switcher status: when "xian" is parseed, should 163 // "xi an" also be extended. Default is false. 164 // These cases include: xia, xian, xiang, zhuan, jiang..., etc. The string 165 // should be valid for a FULL spelling, or a combination of two spellings, 166 // first of which is a FULL id too. So even it is true, "da" will never be 167 // split into "d a", because "d" is not a full spelling id. 168 bool xi_an_enabled_; 169 170 // System dictionary. 171 DictTrie* dict_trie_; 172 173 // User dictionary. 174 AtomDictBase* user_dict_; 175 176 // Spelling parser. 177 SpellingParser* spl_parser_; 178 179 // The maximum allowed length of spelling string (such as a Pinyin string). 180 size_t max_sps_len_; 181 182 // The maximum allowed length of a result Chinese string. 183 size_t max_hzs_len_; 184 185 // Pinyin string. Max length: kMaxRowNum - 1 186 char pys_[kMaxRowNum]; 187 188 // The length of the string that has been decoded successfully. 189 size_t pys_decoded_len_; 190 191 // Shared buffer for multiple purposes. 192 size_t *share_buf_; 193 194 MatrixNode *mtrx_nd_pool_; 195 PoolPosType mtrx_nd_pool_used_; // How many nodes used in the pool 196 DictMatchInfo *dmi_pool_; 197 PoolPosType dmi_pool_used_; // How many items used in the pool 198 199 MatrixRow *matrix_; // The first row is for starting 200 201 DictExtPara *dep_; // Parameter used to extend DMI nodes. 202 203 NPredictItem *npre_items_; // Used to do prediction 204 size_t npre_items_len_; 205 206 // The starting positions and lemma ids for the full sentence candidate. 207 size_t lma_id_num_; 208 uint16 lma_start_[kMaxRowNum]; // Counted in spelling ids. 209 LemmaIdType lma_id_[kMaxRowNum]; 210 size_t fixed_lmas_; 211 212 // If fixed_lmas_ is bigger than i, Element i is used to indicate whether 213 // the i'th lemma id in lma_id_ is the first candidate for that step. 214 // If all candidates are the first one for that step, the whole string can be 215 // decoded by the engine automatically, so no need to add it to user 216 // dictionary. (We are considering to add it to user dictionary in the 217 // future). 218 uint8 fixed_lmas_no1_[kMaxRowNum]; 219 220 // Composing phrase 221 ComposingPhrase c_phrase_; 222 223 // If dmi_c_phrase_ is true, the decoder will try to match the 224 // composing phrase (And definitely it will match successfully). If it 225 // is false, the decoder will try to match lemmas items in dictionaries. 226 bool dmi_c_phrase_; 227 228 // The starting positions and spelling ids for the first full sentence 229 // candidate. 230 size_t spl_id_num_; // Number of splling ids 231 uint16 spl_start_[kMaxRowNum]; // Starting positions 232 uint16 spl_id_[kMaxRowNum]; // Spelling ids 233 // Used to remember the last fixed position, counted in Hanzi. 234 size_t fixed_hzs_; 235 236 // Lemma Items with possibility score, two purposes: 237 // 1. In Viterbi decoding, this buffer is used to get all possible candidates 238 // for current step; 239 // 2. When the search is done, this buffer is used to get candiates from the 240 // first un-fixed step and show them to the user. 241 LmaPsbItem lpi_items_[kMaxLmaPsbItems]; 242 size_t lpi_total_; 243 244 // Assign the pointers with NULL. The caller makes sure that all pointers are 245 // not valid before calling it. This function only will be called in the 246 // construction function and free_resource(). 247 void reset_pointers_to_null(); 248 249 bool alloc_resource(); 250 251 void free_resource(); 252 253 // Reset the search space totally. 254 bool reset_search0(); 255 256 // Reset the search space from ch_pos step. For example, if the original 257 // input Pinyin is "an", reset_search(1) will reset the search space to the 258 // result of "a". If the given position is out of range, return false. 259 // if clear_fixed_this_step is true, and the ch_pos step is a fixed step, 260 // clear its fixed status. if clear_dmi_his_step is true, clear the DMI nodes. 261 // If clear_mtrx_this_sTep is true, clear the mtrx nodes of this step. 262 // The DMI nodes will be kept. 263 // 264 // Note: this function should not destroy content of pys_. 265 bool reset_search(size_t ch_pos, bool clear_fixed_this_step, 266 bool clear_dmi_this_step, bool clear_mtrx_this_step); 267 268 // Delete a part of the content in pys_. 269 void del_in_pys(size_t start, size_t len); 270 271 // Delete a spelling id and its corresponding Chinese character, and merge 272 // the fixed lemmas into the composing phrase. 273 // del_spl_pos indicates which spelling id needs to be delete. 274 // This function will update the lemma and spelling segmentation information. 275 // The caller guarantees that fixed_lmas_ > 0 and del_spl_pos is within 276 // the fixed lemmas. 277 void merge_fixed_lmas(size_t del_spl_pos); 278 279 // Get spelling start posistions and ids. The result will be stored in 280 // spl_id_num_, spl_start_[], spl_id_[]. 281 // fixed_hzs_ will be also assigned. 282 void get_spl_start_id(); 283 284 // Get all lemma ids with match the given spelling id stream(shorter than the 285 // maximum length of a word). 286 // If pfullsent is not NULL, means the full sentence candidate may be the 287 // same with the coming lemma string, if so, remove that lemma. 288 // The result is sorted in descendant order by the frequency score. 289 size_t get_lpis(const uint16* splid_str, size_t splid_str_len, 290 LmaPsbItem* lma_buf, size_t max_lma_buf, 291 const char16 *pfullsent, bool sort_by_psb); 292 293 uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max); 294 295 uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, 296 uint16 splids_max, bool arg_valid); 297 298 299 // Extend a DMI node with a spelling id. ext_len is the length of the rows 300 // to extend, actually, it is the size of the spelling string of splid. 301 // return value can be 1 or 0. 302 // 1 means a new DMI is filled in (dmi_pool_used_ is the next blank DMI in 303 // the pool). 304 // 0 means either the dmi node can not be extended with splid, or the splid 305 // is a Shengmu id, which is only used to get lpi_items, or the result node 306 // in DictTrie has no son, it is not nccessary to keep the new DMI. 307 // 308 // This function modifies the content of lpi_items_ and lpi_total_. 309 // lpi_items_ is used to get the LmaPsbItem list, lpi_total_ returns the size. 310 // The function's returned value has no relation with the value of lpi_num. 311 // 312 // If dmi == NULL, this function will extend the root node of DictTrie 313 // 314 // This function will not change dmi_nd_pool_used_. Please change it after 315 // calling this function if necessary. 316 // 317 // The caller should guarantees that NULL != dep. 318 size_t extend_dmi(DictExtPara *dep, DictMatchInfo *dmi_s); 319 320 // Extend dmi for the composing phrase. 321 size_t extend_dmi_c(DictExtPara *dep, DictMatchInfo *dmi_s); 322 323 // Extend a MatrixNode with the give LmaPsbItem list. 324 // res_row is the destination row number. 325 // This function does not change mtrx_nd_pool_used_. Please change it after 326 // calling this function if necessary. 327 // return 0 always. 328 size_t extend_mtrx_nd(MatrixNode *mtrx_nd, LmaPsbItem lpi_items[], 329 size_t lpi_num, PoolPosType dmi_fr, size_t res_row); 330 331 332 // Try to find a dmi node at step_to position, and the found dmi node should 333 // match the given spelling id strings. 334 PoolPosType match_dmi(size_t step_to, uint16 spl_ids[], uint16 spl_id_num); 335 336 bool add_char(char ch); 337 bool prepare_add_char(char ch); 338 339 // Called after prepare_add_char, so the input char has been saved. 340 bool add_char_qwerty(); 341 342 // Prepare candidates from the last fixed hanzi position. 343 void prepare_candidates(); 344 345 // Is the character in step pos a splitter character? 346 // The caller guarantees that the position is valid. 347 bool is_split_at(uint16 pos); 348 349 void fill_dmi(DictMatchInfo *dmi, MileStoneHandle *handles, 350 PoolPosType dmi_fr, 351 uint16 spl_id, uint16 node_num, unsigned char dict_level, 352 bool splid_end_split, unsigned char splstr_len, 353 unsigned char all_full_id); 354 355 size_t inner_predict(const char16 fixed_scis_ids[], uint16 scis_num, 356 char16 predict_buf[][kMaxPredictSize + 1], 357 size_t buf_len); 358 359 // Add the first candidate to the user dictionary. 360 bool try_add_cand0_to_userdict(); 361 362 // Add a user lemma to the user dictionary. This lemma is a subset of 363 // candidate 0. lma_from is from which lemma in lma_ids_, lma_num is the 364 // number of lemmas to be combined together as a new lemma. The caller 365 // gurantees that the combined new lemma's length is less or equal to 366 // kMaxLemmaSize. 367 bool add_lma_to_userdict(uint16 lma_from, uint16 lma_num, float score); 368 369 // Update dictionary frequencies. 370 void update_dict_freq(); 371 372 void debug_print_dmi(PoolPosType dmi_pos, uint16 nest_level); 373 374 public: 375 MatrixSearch(); 376 ~MatrixSearch(); 377 378 bool init(const char *fn_sys_dict, const char *fn_usr_dict); 379 380 bool init_fd(int sys_fd, long start_offset, long length, 381 const char *fn_usr_dict); 382 383 void set_max_lens(size_t max_sps_len, size_t max_hzs_len); 384 385 void close(); 386 387 void flush_cache(); 388 389 void set_xi_an_switch(bool xi_an_enabled); 390 391 bool get_xi_an_switch(); 392 393 // Reset the search space. Equivalent to reset_search(0). 394 // If inited, always return true; 395 bool reset_search(); 396 397 // Search a Pinyin string. 398 // Return value is the position successfully parsed. 399 size_t search(const char *py, size_t py_len); 400 401 // Used to delete something in the Pinyin string kept by the engine, and do 402 // a re-search. 403 // Return value is the new length of Pinyin string kept by the engine which 404 // is parsed successfully. 405 // If is_pos_in_splid is false, pos is used to indicate that pos-th Pinyin 406 // character needs to be deleted. If is_pos_in_splid is true, all Pinyin 407 // characters for pos-th spelling id needs to be deleted. 408 // If the deleted character(s) is just after a fixed lemma or sub lemma in 409 // composing phrase, clear_fixed_this_step indicates whether we needs to 410 // unlock the last fixed lemma or sub lemma. 411 // If is_pos_in_splid is false, and pos-th character is in the range for the 412 // fixed lemmas or composing string, this function will do nothing and just 413 // return the result of the previous search. 414 size_t delsearch(size_t pos, bool is_pos_in_splid, 415 bool clear_fixed_this_step); 416 417 // Get the number of candiates, called after search(). 418 size_t get_candidate_num(); 419 420 // Get the Pinyin string stored by the engine. 421 // *decoded_len returns the length of the successfully decoded string. 422 const char* get_pystr(size_t *decoded_len); 423 424 // Get the spelling boundaries for the first sentence candidate. 425 // Number of spellings will be returned. The number of valid elements in 426 // spl_start is one more than the return value because the last one is used 427 // to indicate the beginning of the next un-input speling. 428 // For a Pinyin "women", the returned value is 2, spl_start is [0, 2, 5] . 429 size_t get_spl_start(const uint16 *&spl_start); 430 431 // Get one candiate string. If full sentence candidate is available, it will 432 // be the first one. 433 char16* get_candidate(size_t cand_id, char16 *cand_str, size_t max_len); 434 435 // Get the first candiate, which is a "full sentence". 436 // retstr_len is not NULL, it will be used to return the string length. 437 // If only_unfixed is true, only unfixed part will be fetched. 438 char16* get_candidate0(char16* cand_str, size_t max_len, 439 uint16 *retstr_len, bool only_unfixed); 440 441 // Choose a candidate. The decoder will do a search after the fixed position. 442 size_t choose(size_t cand_id); 443 444 // Cancel the last choosing operation, and return the new number of choices. 445 size_t cancel_last_choice(); 446 447 // Get the length of fixed Hanzis. 448 size_t get_fixedlen(); 449 450 size_t get_predicts(const char16 fixed_buf[], 451 char16 predict_buf[][kMaxPredictSize + 1], 452 size_t buf_len); 453 }; 454 } 455 456 #endif // PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__ 457