Home | History | Annotate | Download | only in include
      1 /*
      2  * Copyright (C) 2009 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef PINYINIME_INCLUDE_DICTDEF_H__
     18 #define PINYINIME_INCLUDE_DICTDEF_H__
     19 
     20 #include <stdlib.h>
     21 #include "./utf16char.h"
     22 
     23 namespace ime_pinyin {
     24 
     25 // Enable the following line when building the binary dictionary model.
     26 // #define ___BUILD_MODEL___
     27 
     // Project-wide shorthand names for fixed-width integers, paired by width.
     // The widths implied by the names hold only where char / short / int /
     // long long are 8 / 16 / 32 / 64 bits wide; nothing here enforces that.
     typedef unsigned char          uint8;
     typedef signed char            int8;

     typedef unsigned short int     uint16;
     typedef signed short int       int16;

     typedef unsigned int           uint32;
     typedef signed int             int32;

     typedef unsigned long long int uint64;
     typedef signed long long int   int64;
     37 
     // Three compile-time switches, all off. Presumably they gate debug output
     // at increasing verbosity; the code that reads them is not in this header.
     const bool kPrintDebug0 = false,
                kPrintDebug1 = false,
                kPrintDebug2 = false;
     41 
     // Upper bound on the length of a lemma.
     const size_t kMaxLemmaSize = 8;

     // Upper bound on the length of one Pinyin spelling.
     const size_t kMaxPinyinSize = 6;

     // Number of half spelling ids; see SpellingTrie.h for details.
     // NOTE(review): this comment used to say that Chinese Pinyin has 30 half
     // ids while the constant is 29 -- presumably the 30 included the unused
     // id 0; confirm against SpellingTrie.h.
     const size_t kHalfSpellingIdNum = 29;

     // Upper bound on the number of full spellings (Chinese Pinyin has only
     // about 410). Out of 512 id values, kHalfSpellingIdNum are taken by the
     // half ids and one more is given up because id 0 is never used.
     // If this limit is raised so that an id needs more bits, also update the
     // structures that store a spelling id (SpellingNode, for example) so the
     // id still fits.
     const size_t kMaxSpellingNum = 512 - (kHalfSpellingIdNum + 1);

     // Upper bound on search steps (the consumers are outside this header).
     const size_t kMaxSearchSteps = 40;

     // One character predicts the characters that follow it, so a prediction
     // is at most one shorter than the longest lemma.
     const size_t kMaxPredictSize = kMaxLemmaSize - 1;
     63 
     // Type used for lemma ids. It must always be size_t.
     typedef size_t LemmaIdType;

     // In storage an id actually occupies only this many bytes.
     const size_t kLemmaIdSize = 3;

     // 0xffffff: every bit of a kLemmaIdSize-byte id set. Presumably the id
     // reserved for a composing lemma -- its use is not visible in this header.
     const size_t kLemmaIdComposing = 0xFFFFFF;
     68 
     69 typedef uint16 LmaScoreType;
     70 typedef uint16 KeyScoreType;
     71 
     // How many of the highest-scoring items are kept for prediction purposes.
     const size_t kTopScoreLemmaNum = 10;

     // Caps on the number of predictions.
     // NOTE(review): the By2 / By3 / ByGt3 suffixes presumably refer to the
     // length (2, 3, more than 3) of the text a prediction is based on --
     // confirm against the code that uses these limits.
     const size_t kMaxPredictNumBy2 = 2;
     const size_t kMaxPredictNumBy3 = 2;
     const size_t kMaxPredictNumByGt3 = 1;
     78 
     79 // The last lemma id (included) for the system dictionary. The system
     80 // dictionary's ids always start from 1.
     81 const LemmaIdType kSysDictIdEnd = 500000;
     82 
     83 // The first lemma id for the user dictionary.
     84 const LemmaIdType kUserDictIdStart = 500001;
     85 
     86 // The last lemma id (included) for the user dictionary.
     87 const LemmaIdType kUserDictIdEnd = 600000;
     88 
     89 typedef struct {
     90   uint16 half_splid:5;
     91   uint16 full_splid:11;
     92 } SpellingId, *PSpellingId;
     93 
     94 
     95 /**
     96  * We use different node types for different layers
     97  * Statistical data of the building result for a testing dictionary:
     98  *                              root,   level 0,   level 1,   level 2,   level 3
     99  * max son num of one node:     406        280         41          2          -
    100  * max homo num of one node:      0         90         23          2          2
    101  * total node num of a layer:     1        406      31766      13516        993
    102  * total homo num of a layer:     9       5674      44609      12667        995
    103  *
    104  * The node number for root and level 0 won't be larger than 500
    105  * According to the information above, two kinds of nodes can be used; one for
    106  * root and level 0, the other for these layers deeper than 0.
    107  *
    108  * LE = less and equal,
    109  * A node occupies 16 bytes. so, totallly less than 16 * 500 = 8K
    110  */
    111 struct LmaNodeLE0 {
    112   size_t son_1st_off;
    113   size_t homo_idx_buf_off;
    114   uint16 spl_idx;
    115   uint16 num_of_son;
    116   uint16 num_of_homo;
    117 };
    118 
    119 /**
    120  * GE = great and equal
    121  * A node occupies 8 bytes.
    122  */
    123 struct LmaNodeGE1 {
    124   uint16 son_1st_off_l;        // Low bits of the son_1st_off
    125   uint16 homo_idx_buf_off_l;   // Low bits of the homo_idx_buf_off_1
    126   uint16 spl_idx;
    127   unsigned char num_of_son;            // number of son nodes
    128   unsigned char num_of_homo;           // number of homo words
    129   unsigned char son_1st_off_h;         // high bits of the son_1st_off
    130   unsigned char homo_idx_buf_off_h;    // high bits of the homo_idx_buf_off
    131 };
    132 
    133 #ifdef ___BUILD_MODEL___
    134 struct SingleCharItem {
    135   float freq;
    136   char16 hz;
    137   SpellingId splid;
    138 };
    139 
    140 struct LemmaEntry {
    141   LemmaIdType idx_by_py;
    142   LemmaIdType idx_by_hz;
    143   char16 hanzi_str[kMaxLemmaSize + 1];
    144 
    145   // The SingleCharItem id for each Hanzi.
    146   uint16 hanzi_scis_ids[kMaxLemmaSize];
    147 
    148   uint16 spl_idx_arr[kMaxLemmaSize + 1];
    149   char pinyin_str[kMaxLemmaSize][kMaxPinyinSize + 1];
    150   unsigned char hz_str_len;
    151   float freq;
    152 };
    153 #endif  // ___BUILD_MODEL___
    154 
    155 }  //  namespace ime_pinyin
    156 
    157 #endif  // PINYINIME_INCLUDE_DICTDEF_H__
    158