1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef PINYINIME_INCLUDE_DICTDEF_H__ 18 #define PINYINIME_INCLUDE_DICTDEF_H__ 19 20 #include <stdlib.h> 21 #include "./utf16char.h" 22 23 namespace ime_pinyin { 24 25 // Enable the following line when building the binary dictionary model. 26 // #define ___BUILD_MODEL___ 27 28 typedef unsigned char uint8; 29 typedef unsigned short uint16; 30 typedef unsigned int uint32; 31 32 typedef signed char int8; 33 typedef short int16; 34 typedef int int32; 35 typedef long long int64; 36 typedef unsigned long long uint64; 37 38 const bool kPrintDebug0 = false; 39 const bool kPrintDebug1 = false; 40 const bool kPrintDebug2 = false; 41 42 // The max length of a lemma. 43 const size_t kMaxLemmaSize = 8; 44 45 // The max length of a Pinyin (spelling). 46 const size_t kMaxPinyinSize = 6; 47 48 // The number of half spelling ids. For Chinese Pinyin, there 30 half ids. 49 // See SpellingTrie.h for details. 50 const size_t kHalfSpellingIdNum = 29; 51 52 // The maximum number of full spellings. For Chinese Pinyin, there are only 53 // about 410 spellings. 54 // If change this value is bigger(needs more bits), please also update 55 // other structures like SpellingNode, to make sure than a spelling id can be 56 // stored. 57 // -1 is because that 0 is never used. 58 const size_t kMaxSpellingNum = 512 - kHalfSpellingIdNum - 1; 59 const size_t kMaxSearchSteps = 40; 60 61 // One character predicts its following characters. 62 const size_t kMaxPredictSize = (kMaxLemmaSize - 1); 63 64 // LemmaIdType must always be size_t. 65 typedef size_t LemmaIdType; 66 const size_t kLemmaIdSize = 3; // Actually, a Id occupies 3 bytes in storage. 67 const size_t kLemmaIdComposing = 0xffffff; 68 69 typedef uint16 LmaScoreType; 70 typedef uint16 KeyScoreType; 71 72 // Number of items with highest score are kept for prediction purpose. 73 const size_t kTopScoreLemmaNum = 10; 74 75 const size_t kMaxPredictNumByGt3 = 1; 76 const size_t kMaxPredictNumBy3 = 2; 77 const size_t kMaxPredictNumBy2 = 2; 78 79 // The last lemma id (included) for the system dictionary. The system 80 // dictionary's ids always start from 1. 81 const LemmaIdType kSysDictIdEnd = 500000; 82 83 // The first lemma id for the user dictionary. 84 const LemmaIdType kUserDictIdStart = 500001; 85 86 // The last lemma id (included) for the user dictionary. 87 const LemmaIdType kUserDictIdEnd = 600000; 88 89 typedef struct { 90 uint16 half_splid:5; 91 uint16 full_splid:11; 92 } SpellingId, *PSpellingId; 93 94 95 /** 96 * We use different node types for different layers 97 * Statistical data of the building result for a testing dictionary: 98 * root, level 0, level 1, level 2, level 3 99 * max son num of one node: 406 280 41 2 - 100 * max homo num of one node: 0 90 23 2 2 101 * total node num of a layer: 1 406 31766 13516 993 102 * total homo num of a layer: 9 5674 44609 12667 995 103 * 104 * The node number for root and level 0 won't be larger than 500 105 * According to the information above, two kinds of nodes can be used; one for 106 * root and level 0, the other for these layers deeper than 0. 107 * 108 * LE = less and equal, 109 * A node occupies 16 bytes. so, totallly less than 16 * 500 = 8K 110 */ 111 struct LmaNodeLE0 { 112 size_t son_1st_off; 113 size_t homo_idx_buf_off; 114 uint16 spl_idx; 115 uint16 num_of_son; 116 uint16 num_of_homo; 117 }; 118 119 /** 120 * GE = great and equal 121 * A node occupies 8 bytes. 122 */ 123 struct LmaNodeGE1 { 124 uint16 son_1st_off_l; // Low bits of the son_1st_off 125 uint16 homo_idx_buf_off_l; // Low bits of the homo_idx_buf_off_1 126 uint16 spl_idx; 127 unsigned char num_of_son; // number of son nodes 128 unsigned char num_of_homo; // number of homo words 129 unsigned char son_1st_off_h; // high bits of the son_1st_off 130 unsigned char homo_idx_buf_off_h; // high bits of the homo_idx_buf_off 131 }; 132 133 #ifdef ___BUILD_MODEL___ 134 struct SingleCharItem { 135 float freq; 136 char16 hz; 137 SpellingId splid; 138 }; 139 140 struct LemmaEntry { 141 LemmaIdType idx_by_py; 142 LemmaIdType idx_by_hz; 143 char16 hanzi_str[kMaxLemmaSize + 1]; 144 145 // The SingleCharItem id for each Hanzi. 146 uint16 hanzi_scis_ids[kMaxLemmaSize]; 147 148 uint16 spl_idx_arr[kMaxLemmaSize + 1]; 149 char pinyin_str[kMaxLemmaSize][kMaxPinyinSize + 1]; 150 unsigned char hz_str_len; 151 float freq; 152 }; 153 #endif // ___BUILD_MODEL___ 154 155 } // namespace ime_pinyin 156 157 #endif // PINYINIME_INCLUDE_DICTDEF_H__ 158