1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef PINYINIME_INCLUDE_NGRAM_H__ 18 #define PINYINIME_INCLUDE_NGRAM_H__ 19 20 #include <stdio.h> 21 #include <stdlib.h> 22 #include "./dictdef.h" 23 24 namespace ime_pinyin { 25 26 typedef unsigned char CODEBOOK_TYPE; 27 28 static const size_t kCodeBookSize = 256; 29 30 class NGram { 31 public: 32 // The maximum score of a lemma item. 33 static const LmaScoreType kMaxScore = 0x3fff; 34 35 // In order to reduce the storage size, the original log value is amplified by 36 // kScoreAmplifier, and we use LmaScoreType to store. 37 // After this process, an item with a lower score has a higher frequency. 38 static const int kLogValueAmplifier = -800; 39 40 // System words' total frequency. It is not the real total frequency, instead, 41 // It is only used to adjust system lemmas' scores when the user dictionary's 42 // total frequency changes. 43 // In this version, frequencies of system lemmas are fixed. We are considering 44 // to make them changable in next version. 45 static const size_t kSysDictTotalFreq = 100000000; 46 47 private: 48 49 static NGram* instance_; 50 51 bool initialized_; 52 size_t idx_num_; 53 54 size_t total_freq_none_sys_; 55 56 // Score compensation for system dictionary lemmas. 57 // Because after user adds some user lemmas, the total frequency changes, and 58 // we use this value to normalize the score. 59 float sys_score_compensation_; 60 61 #ifdef ___BUILD_MODEL___ 62 double *freq_codes_df_; 63 #endif 64 LmaScoreType *freq_codes_; 65 CODEBOOK_TYPE *lma_freq_idx_; 66 67 public: 68 NGram(); 69 ~NGram(); 70 71 static NGram& get_instance(); 72 73 bool save_ngram(FILE *fp); 74 bool load_ngram(FILE *fp); 75 76 // Set the total frequency of all none system dictionaries. 77 void set_total_freq_none_sys(size_t freq_none_sys); 78 79 float get_uni_psb(LemmaIdType lma_id); 80 81 // Convert a probability to score. Actually, the score will be limited to 82 // kMaxScore, but at runtime, we also need float expression to get accurate 83 // value of the score. 84 // After the conversion, a lower score indicates a higher probability of the 85 // item. 86 static float convert_psb_to_score(double psb); 87 88 #ifdef ___BUILD_MODEL___ 89 // For constructing the unigram mode model. 90 bool build_unigram(LemmaEntry *lemma_arr, size_t num, 91 LemmaIdType next_idx_unused); 92 #endif 93 }; 94 } 95 96 #endif // PINYINIME_INCLUDE_NGRAM_H__ 97