Home | History | Annotate | Download | only in include
      1 /*
      2  * Copyright (C) 2009 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef PINYINIME_INCLUDE_DICTBUILDER_H__
     18 #define PINYINIME_INCLUDE_DICTBUILDER_H__
     19 
     20 #include <stdlib.h>
     21 #include "./utf16char.h"
     22 #include "./dictdef.h"
     23 #include "./dictlist.h"
     24 #include "./spellingtable.h"
     25 #include "./spellingtrie.h"
     26 #include "./splparser.h"
     27 
     28 namespace ime_pinyin {
     29 
     30 #ifdef ___BUILD_MODEL___
     31 
     32 #define ___DO_STATISTICS___
     33 
     34 class DictTrie;
     35 
     36 class DictBuilder {
     37  private:
     38   // The raw lemma array buffer.
     39   LemmaEntry *lemma_arr_;
     40   size_t lemma_num_;
     41 
     42   // Used to store all possible single char items.
     43   // Two items may have the same Hanzi while their spelling ids are different.
     44   SingleCharItem *scis_;
     45   size_t scis_num_;
     46 
     47   // In the tree, root's level is -1.
     48   // Lemma nodes for root, and level 0
     49   LmaNodeLE0 *lma_nodes_le0_;
     50 
     51   // Lemma nodes for layers whose levels are deeper than 0
     52   LmaNodeGE1 *lma_nodes_ge1_;
     53 
     54   // Number of used lemma nodes
     55   size_t lma_nds_used_num_le0_;
     56   size_t lma_nds_used_num_ge1_;
     57 
     58   // Used to store homophonies' ids.
     59   LemmaIdType *homo_idx_buf_;
     60   // Number of homophonies each of which only contains one Chinese character.
     61   size_t homo_idx_num_eq1_;
     62   // Number of homophonies each of which contains more than one character.
     63   size_t homo_idx_num_gt1_;
     64 
     65   // The items with highest scores.
     66   LemmaEntry *top_lmas_;
     67   size_t top_lmas_num_;
     68 
     69   SpellingTable *spl_table_;
     70   SpellingParser *spl_parser_;
     71 
     72 #ifdef ___DO_STATISTICS___
     73   size_t max_sonbuf_len_[kMaxLemmaSize];
     74   size_t max_homobuf_len_[kMaxLemmaSize];
     75 
     76   size_t total_son_num_[kMaxLemmaSize];
     77   size_t total_node_hasson_[kMaxLemmaSize];
     78   size_t total_sonbuf_num_[kMaxLemmaSize];
     79   size_t total_sonbuf_allnoson_[kMaxLemmaSize];
     80   size_t total_node_in_sonbuf_allnoson_[kMaxLemmaSize];
     81   size_t total_homo_num_[kMaxLemmaSize];
     82 
     83   size_t sonbufs_num1_;     // Number of son buffer with only 1 son
     84   size_t sonbufs_numgt1_;   // Number of son buffer with more 1 son;
     85 
     86   size_t total_lma_node_num_;
     87 
     88   void stat_init();
     89   void stat_print();
     90 #endif
     91 
     92  public:
     93 
     94   DictBuilder();
     95   ~DictBuilder();
     96 
     97   // Build dictionary trie from the file fn_raw. File fn_validhzs provides
     98   // valid chars. If fn_validhzs is NULL, only chars in GB2312 will be
     99   // included.
    100   bool build_dict(const char* fn_raw, const char* fn_validhzs,
    101                   DictTrie *dict_trie);
    102 
    103  private:
    104   // Fill in the buffer with id. The caller guarantees that the paramters are
    105   // vaild.
    106   void id_to_charbuf(unsigned char *buf, LemmaIdType id);
    107 
    108   // Update the offset of sons for a node.
    109   void set_son_offset(LmaNodeGE1 *node, size_t offset);
    110 
    111   // Update the offset of homophonies' ids for a node.
    112   void set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset);
    113 
    114   // Format a speling string.
    115   void format_spelling_str(char *spl_str);
    116 
    117   // Sort the lemma_arr by the hanzi string, and give each of unique items
    118   // a id. Why we need to sort the lemma list according to their Hanzi string
    119   // is to find items started by a given prefix string to do prediction.
    120   // Actually, the single char items are be in other order, for example,
    121   // in spelling id order, etc.
    122   // Return value is next un-allocated idx available.
    123   LemmaIdType sort_lemmas_by_hz();
    124 
    125   // Build the SingleCharItem list, and fill the hanzi_scis_ids in the
    126   // lemma buffer lemma_arr_.
    127   // This function should be called after the lemma array is ready.
    128   // Return the number of unique SingleCharItem elements.
    129   size_t build_scis();
    130 
    131   // Construct a subtree using a subset of the spelling array (from
    132   // item_star to item_end)
    133   // parent is the parent node to update the necessary information
    134   // parent can be a member of LmaNodeLE0 or LmaNodeGE1
    135   bool construct_subset(void* parent, LemmaEntry* lemma_arr,
    136                         size_t item_start, size_t item_end, size_t level);
    137 
    138 
    139   // Read valid Chinese Hanzis from the given file.
    140   // num is used to return number of chars.
    141   // The return buffer is sorted and caller needs to free the returned buffer.
    142   char16* read_valid_hanzis(const char *fn_validhzs, size_t *num);
    143 
    144 
    145   // Read a raw dictionary. max_item is the maximum number of items. If there
    146   // are more items in the ditionary, only the first max_item will be read.
    147   // Returned value is the number of items successfully read from the file.
    148   size_t read_raw_dict(const char* fn_raw, const char *fn_validhzs,
    149                        size_t max_item);
    150 
    151   // Try to find if a character is in hzs buffer.
    152   bool hz_in_hanzis_list(const char16 *hzs, size_t hzs_len, char16 hz);
    153 
    154   // Try to find if all characters in str are in hzs buffer.
    155   bool str_in_hanzis_list(const char16 *hzs, size_t hzs_len,
    156                           const char16 *str, size_t str_len);
    157 
    158   // Get these lemmas with toppest scores.
    159   void get_top_lemmas();
    160 
    161   // Allocate resource to build dictionary.
    162   // lma_num is the number of items to be loaded
    163   bool alloc_resource(size_t lma_num);
    164 
    165   // Free resource.
    166   void free_resource();
    167 };
    168 #endif  // ___BUILD_MODEL___
    169 }
    170 
    171 #endif  // PINYINIME_INCLUDE_DICTBUILDER_H__
    172