Home | History | Annotate | Download | only in include
      1 /*
      2  * Copyright (C) 2009 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__
     18 #define PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__
     19 
     20 #include <stdlib.h>
     21 #include "./atomdictbase.h"
     22 #include "./dicttrie.h"
     23 #include "./searchutility.h"
     24 #include "./spellingtrie.h"
     25 #include "./splparser.h"
     26 
     27 namespace ime_pinyin {
     28 
     29 static const size_t kMaxRowNum = kMaxSearchSteps;
     30 
     31 typedef struct {
     32   // MileStoneHandle objects for the system and user dictionaries.
     33   MileStoneHandle dict_handles[2];
     34   // From which DMI node. -1 means it's from root.
     35   PoolPosType dmi_fr;
     36   // The spelling id for the Pinyin string from the previous DMI to this node.
     37   // If it is a half id like Shengmu, the node pointed by dict_node is the first
     38   // node with this Shengmu,
     39   uint16 spl_id;
     40   // What's the level of the dict node. Level of root is 0, but root is never
     41   // recorded by dict_node.
     42   unsigned char dict_level:7;
     43   // If this node is for composing phrase, this bit is 1.
     44   unsigned char c_phrase:1;
     45   // Whether the spl_id is parsed with a split character at the end.
     46   unsigned char splid_end_split:1;
     47   // What's the length of the spelling string for this match, for the whole
     48   // word.
     49   unsigned char splstr_len:7;
     50   // Used to indicate whether all spelling ids from the root are full spelling
     51   // ids. This information is useful for keymapping mode(not finished). Because
     52   // in this mode, there is no clear boundaries, we prefer those results which
     53   // have full spelling ids.
     54   unsigned char all_full_id:1;
     55 } DictMatchInfo, *PDictMatchInfo;
     56 
     57 typedef struct MatrixNode {
     58   LemmaIdType id;
     59   float score;
     60   MatrixNode *from;
     61   // From which DMI node. Used to trace the spelling segmentation.
     62   PoolPosType dmi_fr;
     63   uint16 step;
     64 } MatrixNode, *PMatrixNode;
     65 
     66 typedef struct {
     67   // The MatrixNode position in the matrix pool
     68   PoolPosType mtrx_nd_pos;
     69   // The DictMatchInfo position in the DictMatchInfo pool.
     70   PoolPosType dmi_pos;
     71   uint16 mtrx_nd_num;
     72   uint16 dmi_num:15;
     73   // Used to indicate whether there are dmi nodes in this step with full
     74   // spelling id. This information is used to decide whether a substring of a
     75   // valid Pinyin should be extended.
     76   //
     77   // Example1: shoudao
     78   // When the last char 'o' is added, the parser will find "dao" is a valid
     79   // Pinyin, and because all dmi nodes at location 'd' (including those for
     80   // "shoud", and those for "d") have Shengmu id only, so it is not necessary
     81   // to extend "ao", otherwise the result may be "shoud ao", that is not
     82   // reasonable.
     83   //
     84   // Example2: hengao
     85   // When the last 'o' is added, the parser finds "gao" is a valid Pinyin.
     86   // Because some dmi nodes at 'g' has Shengmu ids (hen'g and g), but some dmi
     87   // nodes at 'g' has full ids ('heng'), so it is necessary to extend "ao", thus
     88   // "heng ao" can also be the result.
     89   //
     90   // Similarly, "ganga" is expanded to "gang a".
     91   //
     92   // For Pinyin string "xian", because "xian" is a valid Pinyin, because all dmi
     93   // nodes at 'x' only have Shengmu ids, the parser will not try "x ian" (and it
     94   // is not valid either). If the parser uses break in the loop, the result
     95   // always be "xian"; but if the parser uses continue in the loop, "xi an" will
     96   // also be tried. This behaviour can be set via the function
     97   // set_xi_an_switch().
     98   uint16 dmi_has_full_id:1;
     99   // Points to a MatrixNode of the current step to indicate which choice the
    100   // user selects.
    101   MatrixNode *mtrx_nd_fixed;
    102 } MatrixRow, *PMatrixRow;
    103 
    104 // When user inputs and selects candidates, the fixed lemma ids are stored in
    105 // lma_id_ of class MatrixSearch, and fixed_lmas_ is used to indicate how many
    106 // lemmas from the beginning are fixed. If user deletes Pinyin characters one
    107 // by one from the end, these fixed lemmas can be unlocked one by one when
    108 // necessary. Whenever user deletes a Chinese character and its spelling string
    109 // in these fixed lemmas, all fixed lemmas will be merged together into a unit
    110 // named ComposingPhrase with a lemma id kLemmaIdComposing, and this composing
    111 // phrase will be the first lemma in the sentence. Because it contains some
    112 // modified lemmas (by deleting a character), these merged lemmas are called
    113 // sub lemmas (sublma), and each of them are represented individually, so that
    114 // when user deletes Pinyin characters from the end, these sub lemmas can also
    115 // be unlocked one by one.
    116 typedef struct {
    117   uint16 spl_ids[kMaxRowNum];
    118   uint16 spl_start[kMaxRowNum];
    119   char16 chn_str[kMaxRowNum];       // Chinese string.
    120   uint16 sublma_start[kMaxRowNum];  // Counted in Chinese characters.
    121   size_t sublma_num;
    122   uint16 length;                    // Counted in Chinese characters.
    123 } ComposingPhrase, *TComposingPhrase;
    124 
    125 class MatrixSearch {
    126  private:
    127   // If it is true, prediction list by string whose length is greater than 1
    128   // will be limited to a reasonable number.
    129   static const bool kPredictLimitGt1 = false;
    130 
    131   // If it is true, the engine will prefer long history based prediction,
    132   // for example, when user inputs "BeiJing", we prefer "DaXue", etc., which are
    133   // based on the two-character history.
    134   static const bool kPreferLongHistoryPredict = true;
    135 
    136   // If it is true, prediction will only be based on user dictionary. this flag
    137   // is for debug purpose.
    138   static const bool kOnlyUserDictPredict = false;
    139 
    140   // The maximum buffer to store LmaPsbItems.
    141   static const size_t kMaxLmaPsbItems = 1450;
    142 
    143   // How many rows for each step.
    144   static const size_t kMaxNodeARow = 5;
    145 
    146   // The maximum length of the sentence candidates counted in chinese
    147   // characters
    148   static const size_t kMaxSentenceLength = 16;
    149 
    150   // The size of the matrix node pool.
    151   static const size_t kMtrxNdPoolSize = 200;
    152 
    153   // The size of the DMI node pool.
    154   static const size_t kDmiPoolSize = 800;
    155 
    156   // Used to indicate whether this object has been initialized.
    157   bool inited_;
    158 
    159   // Spelling trie.
    160   const SpellingTrie *spl_trie_;
    161 
    162   // Used to indicate this switcher status: when "xian" is parseed, should
    163   // "xi an" also be extended. Default is false.
    164   // These cases include: xia, xian, xiang, zhuan, jiang..., etc. The string
    165   // should be valid for a FULL spelling, or a combination of two spellings,
    166   // first of which is a FULL id too. So even it is true, "da" will never be
    167   // split into "d a", because "d" is not a full spelling id.
    168   bool xi_an_enabled_;
    169 
    170   // System dictionary.
    171   DictTrie* dict_trie_;
    172 
    173   // User dictionary.
    174   AtomDictBase* user_dict_;
    175 
    176   // Spelling parser.
    177   SpellingParser* spl_parser_;
    178 
    179   // The maximum allowed length of spelling string (such as a Pinyin string).
    180   size_t max_sps_len_;
    181 
    182   // The maximum allowed length of a result Chinese string.
    183   size_t max_hzs_len_;
    184 
    185   // Pinyin string. Max length: kMaxRowNum - 1
    186   char pys_[kMaxRowNum];
    187 
    188   // The length of the string that has been decoded successfully.
    189   size_t pys_decoded_len_;
    190 
    191   // Shared buffer for multiple purposes.
    192   size_t *share_buf_;
    193 
    194   MatrixNode *mtrx_nd_pool_;
    195   PoolPosType mtrx_nd_pool_used_;    // How many nodes used in the pool
    196   DictMatchInfo *dmi_pool_;
    197   PoolPosType dmi_pool_used_;        // How many items used in the pool
    198 
    199   MatrixRow *matrix_;                // The first row is for starting
    200 
    201   DictExtPara *dep_;                 // Parameter used to extend DMI nodes.
    202 
    203   NPredictItem *npre_items_;         // Used to do prediction
    204   size_t npre_items_len_;
    205 
    206   // The starting positions and lemma ids for the full sentence candidate.
    207   size_t lma_id_num_;
    208   uint16 lma_start_[kMaxRowNum];     // Counted in spelling ids.
    209   LemmaIdType lma_id_[kMaxRowNum];
    210   size_t fixed_lmas_;
    211 
    212   // If fixed_lmas_ is bigger than i,  Element i is used to indicate whether
    213   // the i'th lemma id in lma_id_ is the first candidate for that step.
    214   // If all candidates are the first one for that step, the whole string can be
    215   // decoded by the engine automatically, so no need to add it to user
    216   // dictionary. (We are considering to add it to user dictionary in the
    217   // future).
    218   uint8 fixed_lmas_no1_[kMaxRowNum];
    219 
    220   // Composing phrase
    221   ComposingPhrase c_phrase_;
    222 
    223   // If dmi_c_phrase_ is true, the decoder will try to match the
    224   // composing phrase (And definitely it will match successfully). If it
    225   // is false, the decoder will try to match lemmas items in dictionaries.
    226   bool dmi_c_phrase_;
    227 
    228   // The starting positions and spelling ids for the first full sentence
    229   // candidate.
    230   size_t spl_id_num_;                // Number of splling ids
    231   uint16 spl_start_[kMaxRowNum];     // Starting positions
    232   uint16 spl_id_[kMaxRowNum];        // Spelling ids
    233   // Used to remember the last fixed position, counted in Hanzi.
    234   size_t fixed_hzs_;
    235 
    236   // Lemma Items with possibility score, two purposes:
    237   // 1. In Viterbi decoding, this buffer is used to get all possible candidates
    238   // for current step;
    239   // 2. When the search is done, this buffer is used to get candiates from the
    240   // first un-fixed step and show them to the user.
    241   LmaPsbItem lpi_items_[kMaxLmaPsbItems];
    242   size_t lpi_total_;
    243 
    244   // Assign the pointers with NULL. The caller makes sure that all pointers are
    245   // not valid before calling it. This function only will be called in the
    246   // construction function and free_resource().
    247   void reset_pointers_to_null();
    248 
    249   bool alloc_resource();
    250 
    251   void free_resource();
    252 
    253   // Reset the search space totally.
    254   bool reset_search0();
    255 
    256   // Reset the search space from ch_pos step. For example, if the original
    257   // input Pinyin is "an", reset_search(1) will reset the search space to the
    258   // result of "a". If the given position is out of range, return false.
    259   // if clear_fixed_this_step is true, and the ch_pos step is a fixed step,
    260   // clear its fixed status. if clear_dmi_his_step is true, clear the DMI nodes.
    261   // If clear_mtrx_this_sTep is true, clear the mtrx nodes of this step.
    262   // The DMI nodes will be kept.
    263   //
    264   // Note: this function should not destroy content of pys_.
    265   bool reset_search(size_t ch_pos, bool clear_fixed_this_step,
    266                     bool clear_dmi_this_step, bool clear_mtrx_this_step);
    267 
    268   // Delete a part of the content in pys_.
    269   void del_in_pys(size_t start, size_t len);
    270 
    271   // Delete a spelling id and its corresponding Chinese character, and merge
    272   // the fixed lemmas into the composing phrase.
    273   // del_spl_pos indicates which spelling id needs to be delete.
    274   // This function will update the lemma and spelling segmentation information.
    275   // The caller guarantees that fixed_lmas_ > 0 and del_spl_pos is within
    276   // the fixed lemmas.
    277   void merge_fixed_lmas(size_t del_spl_pos);
    278 
    279   // Get spelling start posistions and ids. The result will be stored in
    280   // spl_id_num_, spl_start_[], spl_id_[].
    281   // fixed_hzs_ will be also assigned.
    282   void get_spl_start_id();
    283 
    284   // Get all lemma ids with match the given spelling id stream(shorter than the
    285   // maximum length of a word).
    286   // If pfullsent is not NULL, means the full sentence candidate may be the
    287   // same with the coming lemma string, if so, remove that lemma.
    288   // The result is sorted in descendant order by the frequency score.
    289   size_t get_lpis(const uint16* splid_str, size_t splid_str_len,
    290                   LmaPsbItem* lma_buf, size_t max_lma_buf,
    291                   const char16 *pfullsent, bool sort_by_psb);
    292 
    293   uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max);
    294 
    295   uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
    296                           uint16 splids_max, bool arg_valid);
    297 
    298 
    299   // Extend a DMI node with a spelling id. ext_len is the length of the rows
    300   // to extend, actually, it is the size of the spelling string of splid.
    301   // return value can be 1 or 0.
    302   // 1 means a new DMI is filled in (dmi_pool_used_ is the next blank DMI in
    303   // the pool).
    304   // 0 means either the dmi node can not be extended with splid, or the splid
    305   // is a Shengmu id, which is only used to get lpi_items, or the result node
    306   // in DictTrie has no son, it is not nccessary to keep the new DMI.
    307   //
    308   // This function modifies the content of lpi_items_ and lpi_total_.
    309   // lpi_items_ is used to get the LmaPsbItem list, lpi_total_ returns the size.
    310   // The function's returned value has no relation with the value of lpi_num.
    311   //
    312   // If dmi == NULL, this function will extend the root node of DictTrie
    313   //
    314   // This function will not change dmi_nd_pool_used_. Please change it after
    315   // calling this function if necessary.
    316   //
    317   // The caller should guarantees that NULL != dep.
    318   size_t extend_dmi(DictExtPara *dep, DictMatchInfo *dmi_s);
    319 
    320   // Extend dmi for the composing phrase.
    321   size_t extend_dmi_c(DictExtPara *dep, DictMatchInfo *dmi_s);
    322 
    323   // Extend a MatrixNode with the give LmaPsbItem list.
    324   // res_row is the destination row number.
    325   // This function does not change mtrx_nd_pool_used_. Please change it after
    326   // calling this function if necessary.
    327   // return 0 always.
    328   size_t extend_mtrx_nd(MatrixNode *mtrx_nd, LmaPsbItem lpi_items[],
    329                         size_t lpi_num, PoolPosType dmi_fr, size_t res_row);
    330 
    331 
    332   // Try to find a dmi node at step_to position, and the found dmi node should
    333   // match the given spelling id strings.
    334   PoolPosType match_dmi(size_t step_to, uint16 spl_ids[], uint16 spl_id_num);
    335 
    336   bool add_char(char ch);
    337   bool prepare_add_char(char ch);
    338 
    339   // Called after prepare_add_char, so the input char has been saved.
    340   bool add_char_qwerty();
    341 
    342   // Prepare candidates from the last fixed hanzi position.
    343   void prepare_candidates();
    344 
    345   // Is the character in step pos a splitter character?
    346   // The caller guarantees that the position is valid.
    347   bool is_split_at(uint16 pos);
    348 
    349   void fill_dmi(DictMatchInfo *dmi, MileStoneHandle *handles,
    350                 PoolPosType dmi_fr,
    351                 uint16 spl_id, uint16 node_num, unsigned char dict_level,
    352                 bool splid_end_split, unsigned char splstr_len,
    353                 unsigned char all_full_id);
    354 
    355   size_t inner_predict(const char16 fixed_scis_ids[], uint16 scis_num,
    356                        char16 predict_buf[][kMaxPredictSize + 1],
    357                        size_t buf_len);
    358 
    359   // Add the first candidate to the user dictionary.
    360   bool try_add_cand0_to_userdict();
    361 
    362   // Add a user lemma to the user dictionary. This lemma is a subset of
    363   // candidate 0. lma_from is from which lemma in lma_ids_, lma_num is the
    364   // number of lemmas to be combined together as a new lemma. The caller
    365   // gurantees that the combined new lemma's length is less or equal to
    366   // kMaxLemmaSize.
    367   bool add_lma_to_userdict(uint16 lma_from, uint16 lma_num, float score);
    368 
    369   // Update dictionary frequencies.
    370   void update_dict_freq();
    371 
    372   void debug_print_dmi(PoolPosType dmi_pos, uint16 nest_level);
    373 
    374  public:
    375   MatrixSearch();
    376   ~MatrixSearch();
    377 
    378   bool init(const char *fn_sys_dict, const char *fn_usr_dict);
    379 
    380   bool init_fd(int sys_fd, long start_offset, long length,
    381                const char *fn_usr_dict);
    382 
    383   void set_max_lens(size_t max_sps_len, size_t max_hzs_len);
    384 
    385   void close();
    386 
    387   void flush_cache();
    388 
    389   void set_xi_an_switch(bool xi_an_enabled);
    390 
    391   bool get_xi_an_switch();
    392 
    393   // Reset the search space. Equivalent to reset_search(0).
    394   // If inited, always return true;
    395   bool reset_search();
    396 
    397   // Search a Pinyin string.
    398   // Return value is the position successfully parsed.
    399   size_t search(const char *py, size_t py_len);
    400 
    401   // Used to delete something in the Pinyin string kept by the engine, and do
    402   // a re-search.
    403   // Return value is the new length of Pinyin string kept by the engine which
    404   // is parsed successfully.
    405   // If is_pos_in_splid is false, pos is used to indicate that pos-th Pinyin
    406   // character needs to be deleted. If is_pos_in_splid is true, all Pinyin
    407   // characters for pos-th spelling id needs to be deleted.
    408   // If the deleted character(s) is just after a fixed lemma or sub lemma in
    409   // composing phrase, clear_fixed_this_step indicates whether we needs to
    410   // unlock the last fixed lemma or sub lemma.
    411   // If is_pos_in_splid is false, and pos-th character is in the range for the
    412   // fixed lemmas or composing string, this function will do nothing and just
    413   // return the result of the previous search.
    414   size_t delsearch(size_t pos, bool is_pos_in_splid,
    415                    bool clear_fixed_this_step);
    416 
    417   // Get the number of candiates, called after search().
    418   size_t get_candidate_num();
    419 
    420   // Get the Pinyin string stored by the engine.
    421   // *decoded_len returns the length of the successfully decoded string.
    422   const char* get_pystr(size_t *decoded_len);
    423 
    424   // Get the spelling boundaries for the first sentence candidate.
    425   // Number of spellings will be returned. The number of valid elements in
    426   // spl_start is one more than the return value because the last one is used
    427   // to indicate the beginning of the next un-input speling.
    428   // For a Pinyin "women", the returned value is 2, spl_start is [0, 2, 5] .
    429   size_t get_spl_start(const uint16 *&spl_start);
    430 
    431   // Get one candiate string. If full sentence candidate is available, it will
    432   // be the first one.
    433   char16* get_candidate(size_t cand_id, char16 *cand_str, size_t max_len);
    434 
    435   // Get the first candiate, which is a "full sentence".
    436   // retstr_len is not NULL, it will be used to return the string length.
    437   // If only_unfixed is true, only unfixed part will be fetched.
    438   char16* get_candidate0(char16* cand_str, size_t max_len,
    439                          uint16 *retstr_len, bool only_unfixed);
    440 
    441   // Choose a candidate. The decoder will do a search after the fixed position.
    442   size_t choose(size_t cand_id);
    443 
    444   // Cancel the last choosing operation, and return the new number of choices.
    445   size_t cancel_last_choice();
    446 
    447   // Get the length of fixed Hanzis.
    448   size_t get_fixedlen();
    449 
    450   size_t get_predicts(const char16 fixed_buf[],
    451                       char16 predict_buf[][kMaxPredictSize + 1],
    452                       size_t buf_len);
    453 };
    454 }
    455 
    456 #endif  // PINYINIME_ANDPY_INCLUDE_MATRIXSEARCH_H__
    457