1 /* 2 * Copyright (C) 2010 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LATINIME_UNIGRAM_DICTIONARY_H 18 #define LATINIME_UNIGRAM_DICTIONARY_H 19 20 #include <stdint.h> 21 #include "correction.h" 22 #include "correction_state.h" 23 #include "defines.h" 24 #include "proximity_info.h" 25 26 #ifndef NULL 27 #define NULL 0 28 #endif 29 30 namespace latinime { 31 32 class UnigramDictionary { 33 34 public: 35 36 // Mask and flags for children address type selection. 37 static const int MASK_GROUP_ADDRESS_TYPE = 0xC0; 38 static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00; 39 static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40; 40 static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80; 41 static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0; 42 43 // Flag for single/multiple char group 44 static const int FLAG_HAS_MULTIPLE_CHARS = 0x20; 45 46 // Flag for terminal groups 47 static const int FLAG_IS_TERMINAL = 0x10; 48 49 // Flag for bigram presence 50 static const int FLAG_HAS_BIGRAMS = 0x04; 51 52 // Attribute (bigram/shortcut) related flags: 53 // Flag for presence of more attributes 54 static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80; 55 // Flag for sign of offset. If this flag is set, the offset value must be negated. 56 static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; 57 58 // Mask for attribute frequency, stored on 4 bits inside the flags byte. 59 static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F; 60 61 // Mask and flags for attribute address type selection. 62 static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30; 63 static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10; 64 static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; 65 static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; 66 67 UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler, 68 int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars, 69 const bool isLatestDictVersion); 70 bool isValidWord(const uint16_t* const inWord, const int length) const; 71 int getBigramPosition(int pos, unsigned short *word, int offset, int length) const; 72 int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, 73 const int *ycoordinates, const int *codes, const int codesSize, const int flags, 74 unsigned short *outWords, int *frequencies); 75 virtual ~UnigramDictionary(); 76 77 private: 78 79 void getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, 80 const int *ycoordinates, const int *codes, const int codesSize, 81 unsigned short *outWords, int *frequencies, const int flags); 82 bool isDigraph(const int* codes, const int i, const int codesSize) const; 83 void getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo, 84 const int *xcoordinates, const int* ycoordinates, const int *codesBuffer, 85 const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain, 86 const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies); 87 void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, 88 const int *ycoordinates, const int *codes, const int codesSize, 89 unsigned short *outWords, int *frequencies); 90 void getSuggestionCandidates(const bool useFullEditDistance); 91 bool addWord(unsigned short *word, int length, int frequency); 92 void getSplitTwoWordsSuggestion(const int inputLength, Correction *correction); 93 void getMissingSpaceWords(const int inputLength, const int missingSpacePos, 94 Correction *correction, const bool useFullEditDistance); 95 void getMistypedSpaceWords(const int inputLength, const int spaceProximityPos, 96 Correction *correction, const bool useFullEditDistance); 97 void onTerminal(const int freq, Correction *correction); 98 bool needsToSkipCurrentNode(const unsigned short c, 99 const int inputIndex, const int skipPos, const int depth); 100 // Process a node by considering proximity, missing and excessive character 101 bool processCurrentNode(const int initialPos, 102 Correction *correction, int *newCount, 103 int *newChildPosition, int *nextSiblingPosition); 104 int getMostFrequentWordLike(const int startInputIndex, const int inputLength, 105 unsigned short *word); 106 int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length, 107 short unsigned int* outWord); 108 109 const uint8_t* const DICT_ROOT; 110 const int MAX_WORD_LENGTH; 111 const int MAX_WORDS; 112 const int MAX_PROXIMITY_CHARS; 113 const bool IS_LATEST_DICT_VERSION; 114 const int TYPED_LETTER_MULTIPLIER; 115 const int FULL_WORD_MULTIPLIER; 116 const int ROOT_POS; 117 const unsigned int BYTES_IN_ONE_CHAR; 118 const int MAX_UMLAUT_SEARCH_DEPTH; 119 120 // Flags for special processing 121 // Those *must* match the flags in BinaryDictionary.Flags.ALL_FLAGS in BinaryDictionary.java 122 // or something very bad (like, the apocalypse) will happen. 123 // Please update both at the same time. 124 enum { 125 REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1, 126 USE_FULL_EDIT_DISTANCE = 0x2 127 }; 128 static const struct digraph_t { int first; int second; } GERMAN_UMLAUT_DIGRAPHS[]; 129 130 int *mFrequencies; 131 unsigned short *mOutputChars; 132 ProximityInfo *mProximityInfo; 133 Correction *mCorrection; 134 int mInputLength; 135 // MAX_WORD_LENGTH_INTERNAL must be bigger than MAX_WORD_LENGTH 136 unsigned short mWord[MAX_WORD_LENGTH_INTERNAL]; 137 138 int mStackChildCount[MAX_WORD_LENGTH_INTERNAL];// TODO: remove 139 int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL];// TODO: remove 140 int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL];// TODO: remove 141 }; 142 } // namespace latinime 143 144 #endif // LATINIME_UNIGRAM_DICTIONARY_H 145