Home | History | Annotate | Download | only in include
      1 /*
      2  * Copyright (C) 2009 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef PINYINIME_INCLUDE_USERDICT_H__
     18 #define PINYINIME_INCLUDE_USERDICT_H__
     19 
     20 #define ___CACHE_ENABLED___
     21 #define ___SYNC_ENABLED___
     22 #define ___PREDICT_ENABLED___
     23 
     24 // Debug performance for operations
     25 // #define ___DEBUG_PERF___
     26 
     27 #include <pthread.h>
     28 #include "atomdictbase.h"
     29 
     30 namespace ime_pinyin {
     31 
     32 class UserDict : public AtomDictBase {
     33  public:
     34   UserDict();
     35   ~UserDict();
     36 
     37   bool load_dict(const char *file_name, LemmaIdType start_id,
     38                  LemmaIdType end_id);
     39 
     40   bool close_dict();
     41 
     42   size_t number_of_lemmas();
     43 
     44   void reset_milestones(uint16 from_step, MileStoneHandle from_handle);
     45 
     46   MileStoneHandle extend_dict(MileStoneHandle from_handle,
     47                               const DictExtPara *dep, LmaPsbItem *lpi_items,
     48                               size_t lpi_max, size_t *lpi_num);
     49 
     50   size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len,
     51                   LmaPsbItem *lpi_items, size_t lpi_max);
     52 
     53   uint16 get_lemma_str(LemmaIdType id_lemma, char16* str_buf,
     54                        uint16 str_max);
     55 
     56   uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
     57                           uint16 splids_max, bool arg_valid);
     58 
     59   size_t predict(const char16 last_hzs[], uint16 hzs_len,
     60                  NPredictItem *npre_items, size_t npre_max,
     61                  size_t b4_used);
     62 
     63   // Full spelling ids are required
     64   LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[],
     65                         uint16 lemma_len, uint16 count);
     66 
     67   LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count,
     68                            bool selected);
     69 
     70   LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[],
     71                            uint16 lemma_len);
     72 
     73   LmaScoreType get_lemma_score(LemmaIdType lemma_id);
     74 
     75   LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[],
     76                         uint16 lemma_len);
     77 
     78   bool remove_lemma(LemmaIdType lemma_id);
     79 
     80   size_t get_total_lemma_count();
     81   void set_total_lemma_count_of_others(size_t count);
     82 
     83   void flush_cache();
     84 
     85   void set_limit(uint32 max_lemma_count, uint32 max_lemma_size,
     86                  uint32 reclaim_ratio);
     87 
     88   void reclaim();
     89 
     90   void defragment();
     91 
     92 #ifdef ___SYNC_ENABLED___
     93   void clear_sync_lemmas(unsigned int start, unsigned int end);
     94 
     95   int get_sync_count();
     96 
     97   LemmaIdType put_lemma_no_sync(char16 lemma_str[], uint16 splids[],
     98                         uint16 lemma_len, uint16 count, uint64 lmt);
     99    /**
    100     * Add lemmas encoded in UTF-16LE into dictionary without adding sync flag.
    101     *
    102     * @param lemmas in format of 'wo men,WM,0.32;da jia,DJ,0.12'
    103     * @param len length of lemmas string in UTF-16LE
    104     * @return newly added lemma count
    105     */
    106   int put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len);
    107 
    108   /**
    109    * Get lemmas need sync to a UTF-16LE string of above format.
    110    * Note: input buffer (str) must not be too small. If str is too small to
    111    *       contain single one lemma, there might be a dead loop.
    112    *
    113    * @param str buffer to write lemmas
    114    * @param size buffer size in UTF-16LE
    115    * @param count output value of lemma returned
    116    * @return UTF-16LE string length
    117    */
    118   int get_sync_lemmas_in_utf16le_string_from_beginning(
    119       char16 * str, int size, int * count);
    120 
    121 #endif
    122 
    123   struct UserDictStat {
    124     uint32 version;
    125     const char * file_name;
    126     struct timeval load_time;
    127     struct timeval last_update;
    128     uint32 disk_size;
    129     uint32 lemma_count;
    130     uint32 lemma_size;
    131     uint32 delete_count;
    132     uint32 delete_size;
    133 #ifdef ___SYNC_ENABLED___
    134     uint32 sync_count;
    135 #endif
    136     uint32 reclaim_ratio;
    137     uint32 limit_lemma_count;
    138     uint32 limit_lemma_size;
    139   };
    140 
    141   bool state(UserDictStat * stat);
    142 
    143  private:
    144   uint32 total_other_nfreq_;
    145   struct timeval load_time_;
    146   LemmaIdType start_id_;
    147   uint32 version_;
    148   uint8 * lemmas_;
    149 
    150   // In-Memory-Only flag for each lemma
    151   static const uint8 kUserDictLemmaFlagRemove = 1;
    152   // Inuse lemmas' offset
    153   uint32 * offsets_;
    154   // Highest bit in offset tells whether corresponding lemma is removed
    155   static const uint32 kUserDictOffsetFlagRemove = (1 << 31);
    156   // Maximum possible for the offset
    157   static const uint32 kUserDictOffsetMask = ~(kUserDictOffsetFlagRemove);
    158   // Bit width for last modified time, from 1 to 16
    159   static const uint32 kUserDictLMTBitWidth = 16;
    160   // Granularity for last modified time in second
    161   static const uint32 kUserDictLMTGranularity = 60 * 60 * 24 * 7;
    162   // Maximum frequency count
    163   static const uint16 kUserDictMaxFrequency = 0xFFFF;
    164 
    165 #define COARSE_UTC(year, month, day, hour, minute, second) \
    166   ( \
    167     (year - 1970) * 365 * 24 * 60 * 60 + \
    168     (month - 1) * 30 * 24 * 60 * 60 + \
    169     (day - 1) * 24 * 60 * 60 + \
    170     (hour - 0) * 60 * 60 + \
    171     (minute - 0) * 60 + \
    172     (second - 0) \
    173   )
    174   static const uint64 kUserDictLMTSince = COARSE_UTC(2009, 1, 1, 0, 0, 0);
    175 
    176   // Correspond to offsets_
    177   uint32 * scores_;
    178   // Following two fields are only valid in memory
    179   uint32 * ids_;
    180 #ifdef ___PREDICT_ENABLED___
    181   uint32 * predicts_;
    182 #endif
    183 #ifdef ___SYNC_ENABLED___
    184   uint32 * syncs_;
    185   size_t sync_count_size_;
    186 #endif
    187   uint32 * offsets_by_id_;
    188 
    189   size_t lemma_count_left_;
    190   size_t lemma_size_left_;
    191 
    192   const char * dict_file_;
    193 
    194   // Be sure size is 4xN
    195   struct UserDictInfo {
    196     // When limitation reached, how much percentage will be reclaimed (1 ~ 100)
    197     uint32 reclaim_ratio;
    198     // maximum lemma count, 0 means no limitation
    199     uint32 limit_lemma_count;
    200     // Maximum lemma size, it's different from
    201     // whole disk file size or in-mem dict size
    202     // 0 means no limitation
    203     uint32 limit_lemma_size;
    204     // Total lemma count including deleted and inuse
    205     // Also indicate offsets_ size
    206     uint32 lemma_count;
    207     // Total size of lemmas including used and freed
    208     uint32 lemma_size;
    209     // Freed lemma count
    210     uint32 free_count;
    211     // Freed lemma size in byte
    212     uint32 free_size;
    213 #ifdef ___SYNC_ENABLED___
    214     uint32 sync_count;
    215 #endif
    216     int32 total_nfreq;
    217   } dict_info_;
    218 
    219   static const uint32 kUserDictVersion = 0x0ABCDEF0;
    220 
    221   static const uint32 kUserDictPreAlloc = 32;
    222   static const uint32 kUserDictAverageNchar = 8;
    223 
    224   enum UserDictState {
    225     // Keep in order
    226     USER_DICT_NONE = 0,
    227     USER_DICT_SYNC,
    228 #ifdef ___SYNC_ENABLED___
    229     USER_DICT_SYNC_DIRTY,
    230 #endif
    231     USER_DICT_SCORE_DIRTY,
    232     USER_DICT_OFFSET_DIRTY,
    233     USER_DICT_LEMMA_DIRTY,
    234 
    235     USER_DICT_DEFRAGMENTED,
    236   } state_;
    237 
    238   struct UserDictSearchable {
    239     uint16 splids_len;
    240     uint16 splid_start[kMaxLemmaSize];
    241     uint16 splid_count[kMaxLemmaSize];
    242     // Compact inital letters for both FuzzyCompareSpellId and cache system
    243     uint32 signature[kMaxLemmaSize / 4];
    244   };
    245 
    246 #ifdef ___CACHE_ENABLED___
    247   enum UserDictCacheType {
    248     USER_DICT_CACHE,
    249     USER_DICT_MISS_CACHE,
    250   };
    251 
    252   static const int kUserDictCacheSize = 4;
    253   static const int kUserDictMissCacheSize = kMaxLemmaSize - 1;
    254 
    255   struct UserDictMissCache {
    256     uint32 signatures[kUserDictMissCacheSize][kMaxLemmaSize / 4];
    257     uint16 head, tail;
    258   } miss_caches_[kMaxLemmaSize];
    259 
    260   struct UserDictCache {
    261     uint32 signatures[kUserDictCacheSize][kMaxLemmaSize / 4];
    262     uint32 offsets[kUserDictCacheSize];
    263     uint32 lengths[kUserDictCacheSize];
    264     // Ring buffer
    265     uint16 head, tail;
    266   } caches_[kMaxLemmaSize];
    267 
    268   void cache_init();
    269 
    270   void cache_push(UserDictCacheType type,
    271                  UserDictSearchable *searchable,
    272                  uint32 offset, uint32 length);
    273 
    274   bool cache_hit(UserDictSearchable *searchable,
    275                  uint32 *offset, uint32 *length);
    276 
    277   bool load_cache(UserDictSearchable *searchable,
    278                   uint32 *offset, uint32 *length);
    279 
    280   void save_cache(UserDictSearchable *searchable,
    281                   uint32 offset, uint32 length);
    282 
    283   void reset_cache();
    284 
    285   bool load_miss_cache(UserDictSearchable *searchable);
    286 
    287   void save_miss_cache(UserDictSearchable *searchable);
    288 
    289   void reset_miss_cache();
    290 #endif
    291 
    292   LmaScoreType translate_score(int f);
    293 
    294   int extract_score_freq(int raw_score);
    295 
    296   uint64 extract_score_lmt(int raw_score);
    297 
    298   inline int build_score(uint64 lmt, int freq);
    299 
    300   inline int64 utf16le_atoll(uint16 *s, int len);
    301 
    302   inline int utf16le_lltoa(int64 v, uint16 *s, int size);
    303 
    304   LemmaIdType _put_lemma(char16 lemma_str[], uint16 splids[],
    305                         uint16 lemma_len, uint16 count, uint64 lmt);
    306 
    307   size_t _get_lpis(const uint16 *splid_str, uint16 splid_str_len,
    308                    LmaPsbItem *lpi_items, size_t lpi_max, bool * need_extend);
    309 
    310   int _get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len);
    311 
    312   int _get_lemma_score(LemmaIdType lemma_id);
    313 
    314   int is_fuzzy_prefix_spell_id(const uint16 * id1, uint16 len1,
    315                                const UserDictSearchable *searchable);
    316 
    317   bool is_prefix_spell_id(const uint16 * fullids,
    318                           uint16 fulllen, const UserDictSearchable *searchable);
    319 
    320   uint32 get_dict_file_size(UserDictInfo * info);
    321 
    322   bool reset(const char *file);
    323 
    324   bool validate(const char *file);
    325 
    326   bool load(const char *file, LemmaIdType start_id);
    327 
    328   bool is_valid_state();
    329 
    330   bool is_valid_lemma_id(LemmaIdType id);
    331 
    332   LemmaIdType get_max_lemma_id();
    333 
    334   void set_lemma_flag(uint32 offset, uint8 flag);
    335 
    336   char get_lemma_flag(uint32 offset);
    337 
    338   char get_lemma_nchar(uint32 offset);
    339 
    340   uint16 * get_lemma_spell_ids(uint32 offset);
    341 
    342   uint16 * get_lemma_word(uint32 offset);
    343 
    344   // Prepare searchable to fasten locate process
    345   void prepare_locate(UserDictSearchable *searchable,
    346                       const uint16 * splids, uint16 len);
    347 
    348   // Compare initial letters only
    349   int32 fuzzy_compare_spell_id(const uint16 * id1, uint16 len1,
    350                                const UserDictSearchable *searchable);
    351 
    352   // Compare exactly two spell ids
    353   // First argument must be a full id spell id
    354   bool equal_spell_id(const uint16 * fullids,
    355                       uint16 fulllen, const UserDictSearchable *searchable);
    356 
    357   // Find first item by initial letters
    358   int32 locate_first_in_offsets(const UserDictSearchable *searchable);
    359 
    360   LemmaIdType append_a_lemma(char16 lemma_str[], uint16 splids[],
    361                            uint16 lemma_len, uint16 count, uint64 lmt);
    362 
    363   // Check if a lemma is in dictionary
    364   int32 locate_in_offsets(char16 lemma_str[],
    365                           uint16 splid_str[], uint16 lemma_len);
    366 
    367   bool remove_lemma_by_offset_index(int offset_index);
    368 #ifdef ___PREDICT_ENABLED___
    369   uint32 locate_where_to_insert_in_predicts(const uint16 * words,
    370                                             int lemma_len);
    371 
    372   int32 locate_first_in_predicts(const uint16 * words, int lemma_len);
    373 
    374   void remove_lemma_from_predict_list(uint32 offset);
    375 #endif
    376 #ifdef ___SYNC_ENABLED___
    377   void queue_lemma_for_sync(LemmaIdType id);
    378 
    379   void remove_lemma_from_sync_list(uint32 offset);
    380 
    381   void write_back_sync(int fd);
    382 #endif
    383   void write_back_score(int fd);
    384   void write_back_offset(int fd);
    385   void write_back_lemma(int fd);
    386   void write_back_all(int fd);
    387   void write_back();
    388 
    389   struct UserDictScoreOffsetPair {
    390     int score;
    391     uint32 offset_index;
    392   };
    393 
    394   inline void swap(UserDictScoreOffsetPair * sop, int i, int j);
    395 
    396   void shift_down(UserDictScoreOffsetPair * sop, int i, int n);
    397 
    398   // On-disk format for each lemma
    399   // +-------------+
    400   // | Version (4) |
    401   // +-------------+
    402   // +-----------+-----------+--------------------+-------------------+
    403   // | Spare (1) | Nchar (1) | Splids (2 x Nchar) | Lemma (2 x Nchar) |
    404   // +-----------+-----------+--------------------+-------------------+
    405   // ...
    406   // +-----------------------+     +-------------+      <---Offset of offset
    407   // | Offset1 by_splids (4) | ... | OffsetN (4) |
    408   // +-----------------------+     +-------------+
    409 #ifdef ___PREDICT_ENABLED___
    410   // +----------------------+     +-------------+
    411   // | Offset1 by_lemma (4) | ... | OffsetN (4) |
    412   // +----------------------+     +-------------+
    413 #endif
    414   // +------------+     +------------+
    415   // | Score1 (4) | ... | ScoreN (4) |
    416   // +------------+     +------------+
    417 #ifdef ___SYNC_ENABLED___
    418   // +-------------+     +-------------+
    419   // | NewAdd1 (4) | ... | NewAddN (4) |
    420   // +-------------+     +-------------+
    421 #endif
    422   // +----------------+
    423   // | Dict Info (4x) |
    424   // +----------------+
    425 };
    426 }
    427 
    428 #endif
    429