Home | History | Annotate | Download | only in include
      1 /*
      2  * Copyright (C) 2009 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef PINYINIME_INCLUDE_SPELLINGTRIE_H__
     18 #define PINYINIME_INCLUDE_SPELLINGTRIE_H__
     19 
     20 #include <stdio.h>
     21 #include <stdlib.h>
     22 #include "./dictdef.h"
     23 
     24 namespace ime_pinyin {
     25 
     26 static const unsigned short kFullSplIdStart = kHalfSpellingIdNum + 1;
     27 
     28 // Node used for the trie of spellings
     29 struct SpellingNode {
     30   SpellingNode *first_son;
     31   // The spelling id for each node. If you need more bits to store
     32   // spelling id, please adjust this structure.
     33   uint16 spelling_idx:11;
     34   uint16  num_of_son:5;
     35   char char_this_node;
     36   unsigned char score;
     37 };
     38 
     39 class SpellingTrie {
     40  private:
     41   static const int kMaxYmNum = 64;
     42   static const size_t kValidSplCharNum = 26;
     43 
     44   static const uint16 kHalfIdShengmuMask = 0x01;
     45   static const uint16 kHalfIdYunmuMask = 0x02;
     46   static const uint16 kHalfIdSzmMask = 0x04;
     47 
     48   // Map from half spelling id to single char.
     49   // For half ids of Zh/Ch/Sh, map to z/c/s (low case) respectively.
     50   // For example, 1 to 'A', 2 to 'B', 3 to 'C', 4 to 'c', 5 to 'D', ...,
     51   // 28 to 'Z', 29 to 'z'.
     52   // [0] is not used to achieve better efficiency.
     53   static const char kHalfId2Sc_[kFullSplIdStart + 1];
     54 
     55   static unsigned char char_flags_[];
     56   static SpellingTrie* instance_;
     57 
     58   // The spelling table
     59   char *spelling_buf_;
     60 
     61   // The size of longest spelling string, includes '\0' and an extra char to
     62   // store score. For example, "zhuang" is the longgest item in Pinyin list,
     63   // so spelling_size_ is 8.
     64   // Structure: The string ended with '\0' + score char.
     65   // An item with a lower score has a higher probability.
     66   size_t spelling_size_;
     67 
     68   // Number of full spelling ids.
     69   size_t spelling_num_;
     70 
     71   float score_amplifier_;
     72   unsigned char average_score_;
     73 
     74   // The Yunmu id list for the spelling ids (for half ids of Shengmu,
     75   // the Yunmu id is 0).
     76   // The length of the list is spelling_num_ + kFullSplIdStart,
     77   // so that spl_ym_ids_[splid] is the Yunmu id of the splid.
     78   uint8 *spl_ym_ids_;
     79 
     80   // The Yunmu table.
     81   // Each Yunmu will be assigned with Yunmu id from 1.
     82   char *ym_buf_;
     83   size_t ym_size_;  // The size of longest Yunmu string, '\0'included.
     84   size_t ym_num_;
     85 
     86   // The spelling string just queried
     87   char *splstr_queried_;
     88 
     89   // The spelling string just queried
     90   char16 *splstr16_queried_;
     91 
     92   // The root node of the spelling tree
     93   SpellingNode* root_;
     94 
     95   // If a none qwerty key such as a fnction key like ENTER is given, this node
     96   // will be used to indicate that this is not a QWERTY node.
     97   SpellingNode* dumb_node_;
     98 
     99   // If a splitter key is pressed, this node will be used to indicate that this
    100   // is a splitter key.
    101   SpellingNode* splitter_node_;
    102 
    103   // Used to get the first level sons.
    104   SpellingNode* level1_sons_[kValidSplCharNum];
    105 
    106   // The full spl_id range for specific half id.
    107   // h2f means half to full.
    108   // A half id can be a ShouZiMu id (id to represent the first char of a full
    109   // spelling, including Shengmu and Yunmu), or id of zh/ch/sh.
    110   // [1..kFullSplIdStart-1] is the arrange of half id.
    111   uint16 h2f_start_[kFullSplIdStart];
    112   uint16 h2f_num_[kFullSplIdStart];
    113 
    114   // Map from full id to half id.
    115   uint16 *f2h_;
    116 
    117 #ifdef ___BUILD_MODEL___
    118   // How many node used to build the trie.
    119   size_t node_num_;
    120 #endif
    121 
    122   SpellingTrie();
    123 
    124   void free_son_trie(SpellingNode* node);
    125 
    126   // Construct a subtree using a subset of the spelling array (from
    127   // item_star to item_end).
    128   // Member spelliing_buf_ and spelling_size_ should be valid.
    129   // parent is used to update its num_of_son and score.
    130   SpellingNode* construct_spellings_subset(size_t item_start, size_t item_end,
    131                                            size_t level, SpellingNode *parent);
    132   bool build_f2h();
    133 
    134   // The caller should guarantee ch >= 'A' && ch <= 'Z'
    135   bool is_shengmu_char(char ch) const;
    136 
    137   // The caller should guarantee ch >= 'A' && ch <= 'Z'
    138   bool is_yunmu_char(char ch) const;
    139 
    140 #ifdef ___BUILD_MODEL___
    141   // Given a spelling string, return its Yunmu string.
    142   // The caller guaratees spl_str is valid.
    143   const char* get_ym_str(const char *spl_str);
    144 
    145   // Build the Yunmu list, and the mapping relation between the full ids and the
    146   // Yunmu ids. This functin is called after the spelling trie is built.
    147   bool build_ym_info();
    148 #endif
    149 
    150   friend class SpellingParser;
    151   friend class SmartSplParser;
    152   friend class SmartSplParser2;
    153 
    154  public:
    155   ~SpellingTrie();
    156 
    157   inline static bool is_valid_spl_char(char ch) {
    158     return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
    159   }
    160 
    161   // The caller guarantees that the two chars are valid spelling chars.
    162   inline static bool is_same_spl_char(char ch1, char ch2) {
    163     return ch1 == ch2 || ch1 - ch2 == 'a' - 'A' || ch2 - ch1 == 'a' - 'A';
    164   }
    165 
    166   // Construct the tree from the input pinyin array
    167   // The given string list should have been sorted.
    168   // score_amplifier is used to convert a possibility value into score.
    169   // average_score is the average_score of all spellings. The dumb node is
    170   // assigned with this score.
    171   bool construct(const char* spelling_arr, size_t item_size, size_t item_num,
    172                  float score_amplifier, unsigned char average_score);
    173 
    174   // Test if the given id is a valid spelling id.
    175   // If function returns true, the given splid may be updated like this:
    176   // When 'A' is not enabled in ShouZiMu mode, the parsing result for 'A' is
    177   // first given as a half id 1, but because 'A' is a one-char Yunmu and
    178   // it is a valid id, it needs to updated to its corresponding full id.
    179   bool if_valid_id_update(uint16 *splid) const;
    180 
    181   // Test if the given id is a half id.
    182   bool is_half_id(uint16 splid) const;
    183 
    184   bool is_full_id(uint16 splid) const;
    185 
    186   // Test if the given id is a one-char Yunmu id (obviously, it is also a half
    187   // id), such as 'A', 'E' and 'O'.
    188   bool is_half_id_yunmu(uint16 splid) const;
    189 
    190   // Test if this char is a ShouZiMu char. This ShouZiMu char may be not enabled.
    191   // For Pinyin, only i/u/v is not a ShouZiMu char.
    192   // The caller should guarantee that ch >= 'A' && ch <= 'Z'
    193   bool is_szm_char(char ch) const;
    194 
    195   // Test If this char is enabled in ShouZiMu mode.
    196   // The caller should guarantee that ch >= 'A' && ch <= 'Z'
    197   bool szm_is_enabled(char ch) const;
    198 
    199   // Enable/disable Shengmus in ShouZiMu mode(using the first char of a spelling
    200   // to input).
    201   void szm_enable_shm(bool enable);
    202 
    203   // Enable/disable Yunmus in ShouZiMu mode.
    204   void szm_enable_ym(bool enable);
    205 
    206   // Test if this char is enabled in ShouZiMu mode.
    207   // The caller should guarantee ch >= 'A' && ch <= 'Z'
    208   bool is_szm_enabled(char ch) const;
    209 
    210   // Return the number of full ids for the given half id.
    211   uint16 half2full_num(uint16 half_id) const;
    212 
    213   // Return the number of full ids for the given half id, and fill spl_id_start
    214   // to return the first full id.
    215   uint16 half_to_full(uint16 half_id, uint16 *spl_id_start) const;
    216 
    217   // Return the corresponding half id for the given full id.
    218   // Not frequently used, low efficient.
    219   // Return 0 if fails.
    220   uint16 full_to_half(uint16 full_id) const;
    221 
    222   // To test whether a half id is compatible with a full id.
    223   // Generally, when half_id == full_to_half(full_id), return true.
    224   // But for "Zh, Ch, Sh", if fussy mode is on, half id for 'Z' is compatible
    225   // with a full id like "Zhe". (Fussy mode is not ready).
    226   bool half_full_compatible(uint16 half_id, uint16 full_id) const;
    227 
    228   static const SpellingTrie* get_cpinstance();
    229 
    230   static SpellingTrie& get_instance();
    231 
    232   // Save to the file stream
    233   bool save_spl_trie(FILE *fp);
    234 
    235   // Load from the file stream
    236   bool load_spl_trie(FILE *fp);
    237 
    238   // Get the number of spellings
    239   size_t get_spelling_num();
    240 
    241   // Return the Yunmu id for the given Yunmu string.
    242   // If the string is not valid, return 0;
    243   uint8 get_ym_id(const char* ym_str);
    244 
    245   // Get the readonly Pinyin string for a given spelling id
    246   const char* get_spelling_str(uint16 splid);
    247 
    248   // Get the readonly Pinyin string for a given spelling id
    249   const char16* get_spelling_str16(uint16 splid);
    250 
    251   // Get Pinyin string for a given spelling id. Return the length of the
    252   // string, and fill-in '\0' at the end.
    253   size_t get_spelling_str16(uint16 splid, char16 *splstr16,
    254                             size_t splstr16_len);
    255 };
    256 }
    257 
    258 #endif  // PINYINIME_INCLUDE_SPELLINGTRIE_H__
    259