/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef PINYINIME_INCLUDE_DICTTRIE_H__
#define PINYINIME_INCLUDE_DICTTRIE_H__

#include <stdlib.h>
#include "./atomdictbase.h"
#include "./dictdef.h"
#include "./dictlist.h"
#include "./searchutility.h"

namespace ime_pinyin {

class DictTrie : AtomDictBase {
 private:
  struct ParsingMark {
    size_t node_offset:24;
    size_t node_num:8;           // Number of nodes with the spelling id given
                                 // by spl_id. If spl_id is a Shengmu, for nodes
                                 // in the first layer of DictTrie, it equals
                                 // SpellingTrie::shm2full_num(); but for nodes
                                 // which are not in the first layer,
                                 // node_num < SpellingTrie::shm2full_num().
                                 // For a full spelling id, node_num = 1.
  };

  // Used to indicate an extended milestone.
  // An extended milestone marks a partial match in the dictionary trie to
  // speed up further extension.
  // For example, when the user inputs "w", a milestone is created to record
  // the partial match status, so that when the user then inputs another
  // character, 'm', the search space can be extended faster from this
  // milestone.
  //
  // For the partial match status of "wm", there can be more than one
  // sub-milestone; for example, "wm" can be matched to "wanm", "wom", etc.,
  // so more than one parsing mark may be used to record these partial
  // matches. A milestone records the starting position in the mark list and
  // the number of marks.
  struct MileStone {
    uint16 mark_start;
    uint16 mark_num;
  };
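
  // Illustrative sketch (not part of the original header): how a milestone
  // relates to its parsing marks. `handle`, `ms` and `pm` are hypothetical
  // local names used only for illustration.
  //
  //   MileStone ms = mile_stones_[handle];  // handle returned by extend_dict()
  //   for (uint16 i = 0; i < ms.mark_num; i++) {
  //     ParsingMark pm = parsing_marks_[ms.mark_start + i];
  //     // pm.node_offset / pm.node_num describe one partially matched node
  //     // range in the trie recorded for this milestone.
  //   }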

  DictList* dict_list_;

  const SpellingTrie *spl_trie_;

  LmaNodeLE0* root_;        // Nodes for root and the first layer.
  LmaNodeGE1* nodes_ge1_;   // Nodes for other layers.

  // A quick index from a spelling id to the LmaNodeLE0 node buffer (the
  // root_ buffer).
  // Index length:
  // SpellingTrie::get_instance().get_spelling_num() + 1. The last entry is
  // used to mark the end.
  // Shengmu ids are not indexed because they are converted into the
  // corresponding full ids.
  // So, given a full id splid, its first son is:
  // root_[splid_le0_index_[splid - kFullSplIdStart]]
  uint16 *splid_le0_index_;
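
  // Illustrative sketch of the lookup described above (an assumption about
  // typical use, not taken verbatim from the sources); `splid` is a
  // hypothetical valid full spelling id:
  //
  //   uint16 first = splid_le0_index_[splid - kFullSplIdStart];
  //   uint16 last  = splid_le0_index_[splid - kFullSplIdStart + 1];
  //   // The first-layer candidate nodes are root_[first] .. root_[last - 1].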

  size_t lma_node_num_le0_;
  size_t lma_node_num_ge1_;

  // The first part is for homophones, and the last top_lmas_num_ items are
  // the lemmas with the highest scores.
  unsigned char *lma_idx_buf_;
  size_t lma_idx_buf_len_;  // The total size of lma_idx_buf_ in bytes.
  size_t total_lma_num_;    // Total number of lemmas in this dictionary.
  size_t top_lmas_num_;     // Number of lemmas with the highest scores.
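
  // Layout sketch of lma_idx_buf_ as described above (a reading aid, not an
  // authoritative specification; the packed size of one lemma id is assumed
  // to come from dictdef.h):
  //
  //   [ homophone-ordered lemma ids .......... | last top_lmas_num_ ids    ]
  //   ^ lma_idx_buf_                             ^ highest-scored lemmas
  //   |<---------------- lma_idx_buf_len_ bytes in total ----------------->|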

  // Parsing mark list used to mark the detailed extended statuses.
  ParsingMark *parsing_marks_;
  // The position of the next available mark.
  uint16 parsing_marks_pos_;

  // Milestone list used to mark the extended status.
  MileStone *mile_stones_;
  // The position of the next available milestone. We use positions (except 0)
  // as handles.
  MileStoneHandle mile_stones_pos_;

  // Get the offset of sons for a node.
  inline size_t get_son_offset(const LmaNodeGE1 *node);

  // Get the offset of homophone ids for a node.
  inline size_t get_homo_idx_buf_offset(const LmaNodeGE1 *node);

  // Get the lemma id by the offset.
  inline LemmaIdType get_lemma_id(size_t id_offset);

  void free_resource(bool free_dict_list);

  bool load_dict(FILE *fp);

  // Given a LmaNodeLE0 node, extract the lemmas specified by it, and fill
  // them into the lpi_items buffer.
  // This function is called by the search engine.
  size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size,
                         LmaNodeLE0 *node);

  // Given a LmaNodeGE1 node, extract the lemmas specified by it, and fill
  // them into the lpi_items buffer.
  // This function is called by inner functions extend_dict0(), extend_dict1()
  // and extend_dict2().
  size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size,
                         size_t homo_buf_off, LmaNodeGE1 *node,
                         uint16 lma_len);

  // Extend in the trie from level 0.
  MileStoneHandle extend_dict0(MileStoneHandle from_handle,
                               const DictExtPara *dep, LmaPsbItem *lpi_items,
                               size_t lpi_max, size_t *lpi_num);

  // Extend in the trie from level 1.
  MileStoneHandle extend_dict1(MileStoneHandle from_handle,
                               const DictExtPara *dep, LmaPsbItem *lpi_items,
                               size_t lpi_max, size_t *lpi_num);

  // Extend in the trie from level 2.
  MileStoneHandle extend_dict2(MileStoneHandle from_handle,
                               const DictExtPara *dep, LmaPsbItem *lpi_items,
                               size_t lpi_max, size_t *lpi_num);

  // Try to extend the given spelling id buffer, and return true if the given
  // id_lemma can be reached.
  // The given spelling ids are all valid full ids.
  bool try_extend(const uint16 *splids, uint16 splid_num, LemmaIdType id_lemma);

#ifdef ___BUILD_MODEL___
  bool save_dict(FILE *fp);
#endif  // ___BUILD_MODEL___

  static const int kMaxMileStone = 100;
  static const int kMaxParsingMark = 600;
  static const MileStoneHandle kFirstValidMileStoneHandle = 1;

  friend class DictParser;
  friend class DictBuilder;

 public:

  DictTrie();
  ~DictTrie();

#ifdef ___BUILD_MODEL___
  // Construct the trie from the file fn_raw.
  // fn_validhzs provides the valid hanzi list. If fn_validhzs is
  // NULL, only chars in GB2312 will be included.
  bool build_dict(const char *fn_raw, const char *fn_validhzs);

  // Save the binary dictionary.
  // Actually, the SpellingTrie/DictList instances will also be saved.
  bool save_dict(const char *filename);
#endif  // ___BUILD_MODEL___
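
  // Usage sketch for the build-time path (only available when compiled with
  // ___BUILD_MODEL___; the file names below are illustrative assumptions):
  //
  //   DictTrie trie;
  //   if (trie.build_dict("rawdict_utf16.txt", "valid_utf16.txt"))
  //     trie.save_dict("dict_pinyin.dat");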

  void convert_to_hanzis(char16 *str, uint16 str_len);

  void convert_to_scis_ids(char16 *str, uint16 str_len);

  // Load a binary dictionary.
  // The SpellingTrie instance and the DictList will also be loaded.
  bool load_dict(const char *filename, LemmaIdType start_id,
                 LemmaIdType end_id);
  bool load_dict_fd(int sys_fd, long start_offset, long length,
                    LemmaIdType start_id, LemmaIdType end_id);
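
  // Run-time loading sketch (an assumption about typical usage; the file
  // name and the id range passed here are illustrative, not prescribed by
  // this header):
  //
  //   DictTrie trie;
  //   bool ok = trie.load_dict("dict_pinyin.dat", /*start_id=*/1,
  //                            /*end_id=*/kSysDictEndId /* assumed constant */);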
  bool close_dict() {return true;}
  size_t number_of_lemmas() {return 0;}

  void reset_milestones(uint16 from_step, MileStoneHandle from_handle);

  MileStoneHandle extend_dict(MileStoneHandle from_handle,
                              const DictExtPara *dep,
                              LmaPsbItem *lpi_items,
                              size_t lpi_max, size_t *lpi_num);
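
  // Incremental-extension sketch (hypothetical names; `dep` is assumed to be
  // prepared by the caller for the newly typed spelling id, and kLpiMax is a
  // hypothetical buffer size):
  //
  //   size_t lpi_num = 0;
  //   LmaPsbItem lpi_items[kLpiMax];
  //   MileStoneHandle h = trie.extend_dict(from_handle, &dep, lpi_items,
  //                                        kLpiMax, &lpi_num);
  //   // Because 0 is never used as a handle (see mile_stones_pos_ above),
  //   // h == 0 can be read as "nothing matched"; otherwise h serves as the
  //   // from_handle for the next typed character.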

  size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len,
                  LmaPsbItem *lpi_items, size_t lpi_max);

  uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max);

  uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
                          uint16 splids_max, bool arg_valid);
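
  // Lookup sketch (an assumed usage pattern; kMaxLemmaSize is assumed to be
  // the per-lemma length bound from dictdef.h, and arg_valid is passed as
  // false on the assumption that splids carries no input hints):
  //
  //   char16 str[kMaxLemmaSize + 1];
  //   uint16 len = trie.get_lemma_str(id, str, kMaxLemmaSize + 1);
  //   uint16 splids[kMaxLemmaSize];
  //   uint16 num = trie.get_lemma_splids(id, splids, kMaxLemmaSize, false);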

  size_t predict(const char16 *last_hzs, uint16 hzs_len,
                 NPredictItem *npre_items, size_t npre_max,
                 size_t b4_used);
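
  // Prediction sketch (hypothetical names): given the most recently confirmed
  // hanzi string, fill npre_items with candidate predictions.
  //
  //   NPredictItem npre_items[kPredictMax];  // kPredictMax: assumed capacity
  //   size_t got = trie.predict(last_hzs, hzs_len, npre_items, kPredictMax,
  //                             0 /* b4_used */);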

  LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[],
                        uint16 lemma_len, uint16 count) {return 0;}

  LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count,
                           bool selected) {return 0;}

  LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[],
                           uint16 lemma_len) {return 0;}

  LmaScoreType get_lemma_score(LemmaIdType lemma_id) {return 0;}

  LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[],
                        uint16 lemma_len) {return 0;}

  bool remove_lemma(LemmaIdType lemma_id) {return false;}

  size_t get_total_lemma_count() {return 0;}
  void set_total_lemma_count_of_others(size_t count);

  void flush_cache() {}

  LemmaIdType get_lemma_id(const char16 lemma_str[], uint16 lemma_len);

  // Fill the prediction buffer with the lemmas that have the highest scores.
  // his_len is the length of the history used to fill the prediction buffer.
  size_t predict_top_lmas(size_t his_len, NPredictItem *npre_items,
                          size_t npre_max, size_t b4_used);
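
  // Sketch of combining the two prediction calls (an assumed pattern, not
  // taken from the original sources): append the globally top-scored lemmas
  // after the `used` items that an earlier predict() call already placed in
  // the buffer.
  //
  //   used += trie.predict_top_lmas(hzs_len, npre_items + used,
  //                                 npre_max - used, used);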
};
}

#endif  // PINYINIME_INCLUDE_DICTTRIE_H__