1 /* 2 * Copyright (C) 2010 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LATINIME_UNIGRAM_DICTIONARY_H 18 #define LATINIME_UNIGRAM_DICTIONARY_H 19 20 #include <map> 21 #include <stdint.h> 22 #include "correction.h" 23 #include "correction_state.h" 24 #include "defines.h" 25 #include "proximity_info.h" 26 #include "words_priority_queue.h" 27 #include "words_priority_queue_pool.h" 28 29 namespace latinime { 30 31 class TerminalAttributes; 32 class UnigramDictionary { 33 typedef struct { int first; int second; int replacement; } digraph_t; 34 35 public: 36 // Mask and flags for children address type selection. 37 static const int MASK_GROUP_ADDRESS_TYPE = 0xC0; 38 static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00; 39 static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40; 40 static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80; 41 static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0; 42 43 // Flag for single/multiple char group 44 static const int FLAG_HAS_MULTIPLE_CHARS = 0x20; 45 46 // Flag for terminal groups 47 static const int FLAG_IS_TERMINAL = 0x10; 48 49 // Flag for shortcut targets presence 50 static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08; 51 // Flag for bigram presence 52 static const int FLAG_HAS_BIGRAMS = 0x04; 53 54 // Attribute (bigram/shortcut) related flags: 55 // Flag for presence of more attributes 56 static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80; 57 // Flag for sign of offset. If this flag is set, the offset value must be negated. 58 static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; 59 60 // Mask for attribute frequency, stored on 4 bits inside the flags byte. 61 static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F; 62 63 // Mask and flags for attribute address type selection. 64 static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30; 65 static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10; 66 static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; 67 static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; 68 69 // Error tolerances 70 static const int DEFAULT_MAX_ERRORS = 2; 71 static const int MAX_ERRORS_FOR_TWO_WORDS = 1; 72 73 static const int FLAG_MULTIPLE_SUGGEST_ABORT = 0; 74 static const int FLAG_MULTIPLE_SUGGEST_SKIP = 1; 75 static const int FLAG_MULTIPLE_SUGGEST_CONTINUE = 2; 76 UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler, 77 int fullWordMultiplier, int maxWordLength, int maxWords, const unsigned int flags); 78 int getFrequency(const int32_t* const inWord, const int length) const; 79 int getBigramPosition(int pos, unsigned short *word, int offset, int length) const; 80 int getSuggestions(ProximityInfo *proximityInfo, WordsPriorityQueuePool *queuePool, 81 Correction *correction, const int *xcoordinates, const int *ycoordinates, 82 const int *codes, const int codesSize, const std::map<int, int> *bigramMap, 83 const uint8_t *bigramFilter, const bool useFullEditDistance, unsigned short *outWords, 84 int *frequencies); 85 virtual ~UnigramDictionary(); 86 87 private: 88 void getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, 89 const int *ycoordinates, const int *codes, const int inputLength, 90 const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, 91 const bool useFullEditDistance, Correction *correction, 92 WordsPriorityQueuePool *queuePool); 93 int getDigraphReplacement(const int *codes, const int i, const int codesSize, 94 const digraph_t* const digraphs, const unsigned int digraphsSize) const; 95 void getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo, 96 const int *xcoordinates, const int* ycoordinates, const int *codesBuffer, 97 int *xCoordinatesBuffer, int *yCoordinatesBuffer, const int codesBufferSize, 98 const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, 99 const bool useFullEditDistance, const int* codesSrc, const int codesRemain, 100 const int currentDepth, int* codesDest, Correction *correction, 101 WordsPriorityQueuePool* queuePool, const digraph_t* const digraphs, 102 const unsigned int digraphsSize); 103 void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, 104 const int *ycoordinates, const int *codes, const int codesSize, Correction *correction); 105 void getOneWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, 106 const int *ycoordinates, const int *codes, const std::map<int, int> *bigramMap, 107 const uint8_t *bigramFilter, const bool useFullEditDistance, const int inputLength, 108 Correction *correction, WordsPriorityQueuePool* queuePool); 109 void getSuggestionCandidates( 110 const bool useFullEditDistance, const int inputLength, 111 const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, 112 Correction *correction, WordsPriorityQueuePool* queuePool, const bool doAutoCompletion, 113 const int maxErrors, const int currentWordIndex); 114 void getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo, 115 const int *xcoordinates, const int *ycoordinates, const int *codes, 116 const bool useFullEditDistance, const int inputLength, 117 Correction *correction, WordsPriorityQueuePool* queuePool, 118 const bool hasAutoCorrectionCandidate); 119 void onTerminal(const int freq, const TerminalAttributes& terminalAttributes, 120 Correction *correction, WordsPriorityQueuePool *queuePool, const bool addToMasterQueue, 121 const int currentWordIndex); 122 bool needsToSkipCurrentNode(const unsigned short c, 123 const int inputIndex, const int skipPos, const int depth); 124 // Process a node by considering proximity, missing and excessive character 125 bool processCurrentNode(const int initialPos, const std::map<int, int> *bigramMap, 126 const uint8_t *bigramFilter, Correction *correction, int *newCount, 127 int *newChildPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool, 128 const int currentWordIndex); 129 int getMostFrequentWordLike(const int startInputIndex, const int inputLength, 130 ProximityInfo *proximityInfo, unsigned short *word); 131 int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length, 132 short unsigned int *outWord); 133 int getSubStringSuggestion( 134 ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, 135 const int *codes, const bool useFullEditDistance, Correction *correction, 136 WordsPriorityQueuePool* queuePool, const int inputLength, 137 const bool hasAutoCorrectionCandidate, const int currentWordIndex, 138 const int inputWordStartPos, const int inputWordLength, 139 const int outputWordStartPos, const bool isSpaceProximity, int *freqArray, 140 int *wordLengthArray, unsigned short* outputWord, int *outputWordLength); 141 void getMultiWordsSuggestionRec(ProximityInfo *proximityInfo, 142 const int *xcoordinates, const int *ycoordinates, const int *codes, 143 const bool useFullEditDistance, const int inputLength, 144 Correction *correction, WordsPriorityQueuePool* queuePool, 145 const bool hasAutoCorrectionCandidate, const int startPos, const int startWordIndex, 146 const int outputWordLength, int *freqArray, int* wordLengthArray, 147 unsigned short* outputWord); 148 149 const uint8_t* const DICT_ROOT; 150 const int MAX_WORD_LENGTH; 151 const int MAX_WORDS; 152 const int TYPED_LETTER_MULTIPLIER; 153 const int FULL_WORD_MULTIPLIER; 154 const int ROOT_POS; 155 const unsigned int BYTES_IN_ONE_CHAR; 156 const int MAX_DIGRAPH_SEARCH_DEPTH; 157 const int FLAGS; 158 159 static const digraph_t GERMAN_UMLAUT_DIGRAPHS[]; 160 static const digraph_t FRENCH_LIGATURES_DIGRAPHS[]; 161 162 // Still bundled members 163 unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];// TODO: remove 164 int mStackChildCount[MAX_WORD_LENGTH_INTERNAL];// TODO: remove 165 int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL];// TODO: remove 166 int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL];// TODO: remove 167 }; 168 } // namespace latinime 169 170 #endif // LATINIME_UNIGRAM_DICTIONARY_H 171