1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef PINYINIME_INCLUDE_USERDICT_H__ 18 #define PINYINIME_INCLUDE_USERDICT_H__ 19 20 #define ___CACHE_ENABLED___ 21 #define ___SYNC_ENABLED___ 22 #define ___PREDICT_ENABLED___ 23 24 // Debug performance for operations 25 // #define ___DEBUG_PERF___ 26 27 #include <pthread.h> 28 #include "atomdictbase.h" 29 30 namespace ime_pinyin { 31 32 class UserDict : public AtomDictBase { 33 public: 34 UserDict(); 35 ~UserDict(); 36 37 bool load_dict(const char *file_name, LemmaIdType start_id, 38 LemmaIdType end_id); 39 40 bool close_dict(); 41 42 size_t number_of_lemmas(); 43 44 void reset_milestones(uint16 from_step, MileStoneHandle from_handle); 45 46 MileStoneHandle extend_dict(MileStoneHandle from_handle, 47 const DictExtPara *dep, LmaPsbItem *lpi_items, 48 size_t lpi_max, size_t *lpi_num); 49 50 size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len, 51 LmaPsbItem *lpi_items, size_t lpi_max); 52 53 uint16 get_lemma_str(LemmaIdType id_lemma, char16* str_buf, 54 uint16 str_max); 55 56 uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, 57 uint16 splids_max, bool arg_valid); 58 59 size_t predict(const char16 last_hzs[], uint16 hzs_len, 60 NPredictItem *npre_items, size_t npre_max, 61 size_t b4_used); 62 63 // Full spelling ids are required 64 LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[], 65 uint16 lemma_len, uint16 count); 66 67 LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count, 68 bool selected); 69 70 LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[], 71 uint16 lemma_len); 72 73 LmaScoreType get_lemma_score(LemmaIdType lemma_id); 74 75 LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[], 76 uint16 lemma_len); 77 78 bool remove_lemma(LemmaIdType lemma_id); 79 80 size_t get_total_lemma_count(); 81 void set_total_lemma_count_of_others(size_t count); 82 83 void flush_cache(); 84 85 void set_limit(uint32 max_lemma_count, uint32 max_lemma_size, 86 uint32 reclaim_ratio); 87 88 void reclaim(); 89 90 void defragment(); 91 92 #ifdef ___SYNC_ENABLED___ 93 void clear_sync_lemmas(unsigned int start, unsigned int end); 94 95 int get_sync_count(); 96 97 LemmaIdType put_lemma_no_sync(char16 lemma_str[], uint16 splids[], 98 uint16 lemma_len, uint16 count, uint64 lmt); 99 /** 100 * Add lemmas encoded in UTF-16LE into dictionary without adding sync flag. 101 * 102 * @param lemmas in format of 'wo men,WM,0.32;da jia,DJ,0.12' 103 * @param len length of lemmas string in UTF-16LE 104 * @return newly added lemma count 105 */ 106 int put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len); 107 108 /** 109 * Get lemmas need sync to a UTF-16LE string of above format. 110 * Note: input buffer (str) must not be too small. If str is too small to 111 * contain single one lemma, there might be a dead loop. 112 * 113 * @param str buffer to write lemmas 114 * @param size buffer size in UTF-16LE 115 * @param count output value of lemma returned 116 * @return UTF-16LE string length 117 */ 118 int get_sync_lemmas_in_utf16le_string_from_beginning( 119 char16 * str, int size, int * count); 120 121 #endif 122 123 struct UserDictStat { 124 uint32 version; 125 const char * file_name; 126 struct timeval load_time; 127 struct timeval last_update; 128 uint32 disk_size; 129 uint32 lemma_count; 130 uint32 lemma_size; 131 uint32 delete_count; 132 uint32 delete_size; 133 #ifdef ___SYNC_ENABLED___ 134 uint32 sync_count; 135 #endif 136 uint32 reclaim_ratio; 137 uint32 limit_lemma_count; 138 uint32 limit_lemma_size; 139 }; 140 141 bool state(UserDictStat * stat); 142 143 private: 144 uint32 total_other_nfreq_; 145 struct timeval load_time_; 146 LemmaIdType start_id_; 147 uint32 version_; 148 uint8 * lemmas_; 149 150 // In-Memory-Only flag for each lemma 151 static const uint8 kUserDictLemmaFlagRemove = 1; 152 // Inuse lemmas' offset 153 uint32 * offsets_; 154 // Highest bit in offset tells whether corresponding lemma is removed 155 static const uint32 kUserDictOffsetFlagRemove = (1 << 31); 156 // Maximum possible for the offset 157 static const uint32 kUserDictOffsetMask = ~(kUserDictOffsetFlagRemove); 158 // Bit width for last modified time, from 1 to 16 159 static const uint32 kUserDictLMTBitWidth = 16; 160 // Granularity for last modified time in second 161 static const uint32 kUserDictLMTGranularity = 60 * 60 * 24 * 7; 162 // Maximum frequency count 163 static const uint16 kUserDictMaxFrequency = 0xFFFF; 164 165 #define COARSE_UTC(year, month, day, hour, minute, second) \ 166 ( \ 167 (year - 1970) * 365 * 24 * 60 * 60 + \ 168 (month - 1) * 30 * 24 * 60 * 60 + \ 169 (day - 1) * 24 * 60 * 60 + \ 170 (hour - 0) * 60 * 60 + \ 171 (minute - 0) * 60 + \ 172 (second - 0) \ 173 ) 174 static const uint64 kUserDictLMTSince = COARSE_UTC(2009, 1, 1, 0, 0, 0); 175 176 // Correspond to offsets_ 177 uint32 * scores_; 178 // Following two fields are only valid in memory 179 uint32 * ids_; 180 #ifdef ___PREDICT_ENABLED___ 181 uint32 * predicts_; 182 #endif 183 #ifdef ___SYNC_ENABLED___ 184 uint32 * syncs_; 185 size_t sync_count_size_; 186 #endif 187 uint32 * offsets_by_id_; 188 189 size_t lemma_count_left_; 190 size_t lemma_size_left_; 191 192 const char * dict_file_; 193 194 // Be sure size is 4xN 195 struct UserDictInfo { 196 // When limitation reached, how much percentage will be reclaimed (1 ~ 100) 197 uint32 reclaim_ratio; 198 // maximum lemma count, 0 means no limitation 199 uint32 limit_lemma_count; 200 // Maximum lemma size, it's different from 201 // whole disk file size or in-mem dict size 202 // 0 means no limitation 203 uint32 limit_lemma_size; 204 // Total lemma count including deleted and inuse 205 // Also indicate offsets_ size 206 uint32 lemma_count; 207 // Total size of lemmas including used and freed 208 uint32 lemma_size; 209 // Freed lemma count 210 uint32 free_count; 211 // Freed lemma size in byte 212 uint32 free_size; 213 #ifdef ___SYNC_ENABLED___ 214 uint32 sync_count; 215 #endif 216 int32 total_nfreq; 217 } dict_info_; 218 219 static const uint32 kUserDictVersion = 0x0ABCDEF0; 220 221 static const uint32 kUserDictPreAlloc = 32; 222 static const uint32 kUserDictAverageNchar = 8; 223 224 enum UserDictState { 225 // Keep in order 226 USER_DICT_NONE = 0, 227 USER_DICT_SYNC, 228 #ifdef ___SYNC_ENABLED___ 229 USER_DICT_SYNC_DIRTY, 230 #endif 231 USER_DICT_SCORE_DIRTY, 232 USER_DICT_OFFSET_DIRTY, 233 USER_DICT_LEMMA_DIRTY, 234 235 USER_DICT_DEFRAGMENTED, 236 } state_; 237 238 struct UserDictSearchable { 239 uint16 splids_len; 240 uint16 splid_start[kMaxLemmaSize]; 241 uint16 splid_count[kMaxLemmaSize]; 242 // Compact inital letters for both FuzzyCompareSpellId and cache system 243 uint32 signature[kMaxLemmaSize / 4]; 244 }; 245 246 #ifdef ___CACHE_ENABLED___ 247 enum UserDictCacheType { 248 USER_DICT_CACHE, 249 USER_DICT_MISS_CACHE, 250 }; 251 252 static const int kUserDictCacheSize = 4; 253 static const int kUserDictMissCacheSize = kMaxLemmaSize - 1; 254 255 struct UserDictMissCache { 256 uint32 signatures[kUserDictMissCacheSize][kMaxLemmaSize / 4]; 257 uint16 head, tail; 258 } miss_caches_[kMaxLemmaSize]; 259 260 struct UserDictCache { 261 uint32 signatures[kUserDictCacheSize][kMaxLemmaSize / 4]; 262 uint32 offsets[kUserDictCacheSize]; 263 uint32 lengths[kUserDictCacheSize]; 264 // Ring buffer 265 uint16 head, tail; 266 } caches_[kMaxLemmaSize]; 267 268 void cache_init(); 269 270 void cache_push(UserDictCacheType type, 271 UserDictSearchable *searchable, 272 uint32 offset, uint32 length); 273 274 bool cache_hit(UserDictSearchable *searchable, 275 uint32 *offset, uint32 *length); 276 277 bool load_cache(UserDictSearchable *searchable, 278 uint32 *offset, uint32 *length); 279 280 void save_cache(UserDictSearchable *searchable, 281 uint32 offset, uint32 length); 282 283 void reset_cache(); 284 285 bool load_miss_cache(UserDictSearchable *searchable); 286 287 void save_miss_cache(UserDictSearchable *searchable); 288 289 void reset_miss_cache(); 290 #endif 291 292 LmaScoreType translate_score(int f); 293 294 int extract_score_freq(int raw_score); 295 296 uint64 extract_score_lmt(int raw_score); 297 298 inline int build_score(uint64 lmt, int freq); 299 300 inline int64 utf16le_atoll(uint16 *s, int len); 301 302 inline int utf16le_lltoa(int64 v, uint16 *s, int size); 303 304 LemmaIdType _put_lemma(char16 lemma_str[], uint16 splids[], 305 uint16 lemma_len, uint16 count, uint64 lmt); 306 307 size_t _get_lpis(const uint16 *splid_str, uint16 splid_str_len, 308 LmaPsbItem *lpi_items, size_t lpi_max, bool * need_extend); 309 310 int _get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len); 311 312 int _get_lemma_score(LemmaIdType lemma_id); 313 314 int is_fuzzy_prefix_spell_id(const uint16 * id1, uint16 len1, 315 const UserDictSearchable *searchable); 316 317 bool is_prefix_spell_id(const uint16 * fullids, 318 uint16 fulllen, const UserDictSearchable *searchable); 319 320 uint32 get_dict_file_size(UserDictInfo * info); 321 322 bool reset(const char *file); 323 324 bool validate(const char *file); 325 326 bool load(const char *file, LemmaIdType start_id); 327 328 bool is_valid_state(); 329 330 bool is_valid_lemma_id(LemmaIdType id); 331 332 LemmaIdType get_max_lemma_id(); 333 334 void set_lemma_flag(uint32 offset, uint8 flag); 335 336 char get_lemma_flag(uint32 offset); 337 338 char get_lemma_nchar(uint32 offset); 339 340 uint16 * get_lemma_spell_ids(uint32 offset); 341 342 uint16 * get_lemma_word(uint32 offset); 343 344 // Prepare searchable to fasten locate process 345 void prepare_locate(UserDictSearchable *searchable, 346 const uint16 * splids, uint16 len); 347 348 // Compare initial letters only 349 int32 fuzzy_compare_spell_id(const uint16 * id1, uint16 len1, 350 const UserDictSearchable *searchable); 351 352 // Compare exactly two spell ids 353 // First argument must be a full id spell id 354 bool equal_spell_id(const uint16 * fullids, 355 uint16 fulllen, const UserDictSearchable *searchable); 356 357 // Find first item by initial letters 358 int32 locate_first_in_offsets(const UserDictSearchable *searchable); 359 360 LemmaIdType append_a_lemma(char16 lemma_str[], uint16 splids[], 361 uint16 lemma_len, uint16 count, uint64 lmt); 362 363 // Check if a lemma is in dictionary 364 int32 locate_in_offsets(char16 lemma_str[], 365 uint16 splid_str[], uint16 lemma_len); 366 367 bool remove_lemma_by_offset_index(int offset_index); 368 #ifdef ___PREDICT_ENABLED___ 369 uint32 locate_where_to_insert_in_predicts(const uint16 * words, 370 int lemma_len); 371 372 int32 locate_first_in_predicts(const uint16 * words, int lemma_len); 373 374 void remove_lemma_from_predict_list(uint32 offset); 375 #endif 376 #ifdef ___SYNC_ENABLED___ 377 void queue_lemma_for_sync(LemmaIdType id); 378 379 void remove_lemma_from_sync_list(uint32 offset); 380 381 void write_back_sync(int fd); 382 #endif 383 void write_back_score(int fd); 384 void write_back_offset(int fd); 385 void write_back_lemma(int fd); 386 void write_back_all(int fd); 387 void write_back(); 388 389 struct UserDictScoreOffsetPair { 390 int score; 391 uint32 offset_index; 392 }; 393 394 inline void swap(UserDictScoreOffsetPair * sop, int i, int j); 395 396 void shift_down(UserDictScoreOffsetPair * sop, int i, int n); 397 398 // On-disk format for each lemma 399 // +-------------+ 400 // | Version (4) | 401 // +-------------+ 402 // +-----------+-----------+--------------------+-------------------+ 403 // | Spare (1) | Nchar (1) | Splids (2 x Nchar) | Lemma (2 x Nchar) | 404 // +-----------+-----------+--------------------+-------------------+ 405 // ... 406 // +-----------------------+ +-------------+ <---Offset of offset 407 // | Offset1 by_splids (4) | ... | OffsetN (4) | 408 // +-----------------------+ +-------------+ 409 #ifdef ___PREDICT_ENABLED___ 410 // +----------------------+ +-------------+ 411 // | Offset1 by_lemma (4) | ... | OffsetN (4) | 412 // +----------------------+ +-------------+ 413 #endif 414 // +------------+ +------------+ 415 // | Score1 (4) | ... | ScoreN (4) | 416 // +------------+ +------------+ 417 #ifdef ___SYNC_ENABLED___ 418 // +-------------+ +-------------+ 419 // | NewAdd1 (4) | ... | NewAddN (4) | 420 // +-------------+ +-------------+ 421 #endif 422 // +----------------+ 423 // | Dict Info (4x) | 424 // +----------------+ 425 }; 426 } 427 428 #endif 429