Home | History | Annotate | Download | only in include
      1 /*
      2  * Copyright (C) 2009 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef PINYINIME_INCLUDE_NGRAM_H__
     18 #define PINYINIME_INCLUDE_NGRAM_H__
     19 
     20 #include <stdio.h>
     21 #include <stdlib.h>
     22 #include "./dictdef.h"
     23 
     24 namespace ime_pinyin {
     25 
      26 typedef unsigned char CODEBOOK_TYPE;  // Element type of NGram::lma_freq_idx_.
      27 
         // 256 is the number of distinct values an 8-bit CODEBOOK_TYPE can hold.
         // NOTE(review): presumably the number of entries in NGram::freq_codes_ --
         // confirm against the implementation file.
      28 static const size_t kCodeBookSize = 256;
      29 // Scoring model for lemmas. The public interface declared here offers a
         // per-lemma lookup (get_uni_psb), a probability-to-score conversion
         // (convert_psb_to_score), and saving/loading through a FILE stream.
         // NOTE(review): instance_ and get_instance() suggest singleton usage, yet the
         // constructor is public -- confirm the intended usage with callers.
      30 class NGram {
      31  public:
      32   // The maximum score of a lemma item.
      33   static const LmaScoreType kMaxScore = 0x3fff;
      34 
      35   // In order to reduce the storage size, the original log value is amplified by
      36   // kLogValueAmplifier, and we use LmaScoreType to store the result.
      37   // After this process, an item with a lower score has a higher frequency
           // (the amplifier is negative).
      38   static const int kLogValueAmplifier = -800;
      39 
      40   // System words' total frequency. It is not the real total frequency; instead,
      41   // it is only used to adjust system lemmas' scores when the user dictionary's
      42   // total frequency changes.
      43   // In this version, frequencies of system lemmas are fixed. We are considering
      44   // making them changeable in the next version.
      45   static const size_t kSysDictTotalFreq = 100000000;
      46 
      47  private:
      48 
           // NOTE(review): presumably the object handed out by get_instance() --
           // confirm in the implementation file.
      49   static NGram* instance_;
      50 
           // NOTE(review): presumably set once model data has been loaded or built --
           // confirm.
      51   bool initialized_;
           // NOTE(review): presumably the number of entries in lma_freq_idx_ -- confirm.
      52   size_t idx_num_;
      53 
           // Total frequency of all non-system dictionaries.
           // NOTE(review): presumably updated by set_total_freq_none_sys() -- confirm.
      54   size_t total_freq_none_sys_;
      55 
      56   // Score compensation for system dictionary lemmas.
      57   // After the user adds some user lemmas, the total frequency changes, and
      58   // we use this value to normalize the score.
      59   float sys_score_compensation_;
      60 
           // Only declared when ___BUILD_MODEL___ is defined.
           // NOTE(review): presumably a double-precision counterpart of freq_codes_
           // used while building the model -- confirm.
      61 #ifdef ___BUILD_MODEL___
      62   double *freq_codes_df_;
      63 #endif
           // NOTE(review): presumably freq_codes_ is the code book of scores and
           // lma_freq_idx_ holds, per lemma, an index into that code book -- confirm.
      64   LmaScoreType *freq_codes_;
      65   CODEBOOK_TYPE *lma_freq_idx_;
      66 
      67  public:
      68   NGram();
      69   ~NGram();
      70 
           // Returns a reference to an NGram object.
           // NOTE(review): presumably the shared instance stored in instance_ -- confirm.
      71   static NGram& get_instance();
      72 
           // Save/load the model through the stream fp.
           // NOTE(review): presumably return true on success; whether fp must be opened
           // in binary mode is not visible here -- confirm.
      73   bool save_ngram(FILE *fp);
      74   bool load_ngram(FILE *fp);
      75 
      76   // Set the total frequency of all non-system dictionaries.
      77   void set_total_freq_none_sys(size_t freq_none_sys);
      78 
           // NOTE(review): presumably returns the unigram value for lemma lma_id;
           // whether the result is a probability or a score is not visible here --
           // confirm before relying on it.
      79   float get_uni_psb(LemmaIdType lma_id);
      80 
      81   // Convert a probability to a score. Actually, the score will be limited to
      82   // kMaxScore, but at runtime, we also need a float expression to get an accurate
      83   // value of the score.
      84   // After the conversion, a lower score indicates a higher probability of the
      85   // item.
      86   static float convert_psb_to_score(double psb);
      87 
      88 #ifdef ___BUILD_MODEL___
      89   // For constructing the unigram model. Only declared when ___BUILD_MODEL___
           // is defined.
           // NOTE(review): presumably lemma_arr holds num entries and next_idx_unused
           // is the first lemma id not yet assigned -- confirm.
      90   bool build_unigram(LemmaEntry *lemma_arr, size_t num,
      91                      LemmaIdType next_idx_unused);
      92 #endif
      93 };
     94 }
     95 
     96 #endif  // PINYINIME_INCLUDE_NGRAM_H__
     97