Home | History | Annotate | Download | only in src
      1 /*
      2  * Copyright (C) 2010 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef LATINIME_UNIGRAM_DICTIONARY_H
     18 #define LATINIME_UNIGRAM_DICTIONARY_H
     19 
     20 #include <map>
     21 #include <stdint.h>
     22 #include "correction.h"
     23 #include "correction_state.h"
     24 #include "defines.h"
     25 #include "proximity_info.h"
     26 #include "words_priority_queue.h"
     27 #include "words_priority_queue_pool.h"
     28 
      29 namespace latinime {
      30 
         // Forward declaration only: within this header TerminalAttributes appears solely
         // as a const-reference parameter of onTerminal(), so the full definition is not
         // required here.
      31 class TerminalAttributes;
         // UnigramDictionary -- interface declaration only; no member function is defined
         // in this header. The class declares:
         //   - bit masks/flags that the in-file comments describe as character-group and
         //     attribute (bigram/shortcut) flags,
         //   - three public queries: getFrequency(), getBigramPosition(), getSuggestions(),
         //   - private helpers for the suggestion search plus per-instance scratch arrays.
         // NOTE(review): the exact binary layout these constants refer to is not visible
         // in this header -- confirm against the implementation / format description.
      32 class UnigramDictionary {
             // Plain record of three ints. It is the element type of the digraph tables
             // declared near the bottom of the class and of the `digraphs` parameters below.
             // Declared before `public:`, so the type is private to the class.
             // Presumably first/second are the two codes of a digraph and replacement is
             // the single code substituted for them -- TODO confirm in the implementation.
      33     typedef struct { int first; int second; int replacement; } digraph_t;
      34 
      35  public:
      36     // Mask and flags for children address type selection.
             // The mask covers the two top bits (0xC0); the four FLAG_GROUP_ADDRESS_TYPE_*
             // constants are exactly the four values those two bits can take.
      37     static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
      38     static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
      39     static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
      40     static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
      41     static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
      42 
             // The four single-bit flags that follow (0x20, 0x10, 0x08, 0x04) do not
             // overlap each other or MASK_GROUP_ADDRESS_TYPE.
      43     // Flag for single/multiple char group
      44     static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;
      45 
      46     // Flag for terminal groups
      47     static const int FLAG_IS_TERMINAL = 0x10;
      48 
      49     // Flag for shortcut targets presence
      50     static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
      51     // Flag for bigram presence
      52     static const int FLAG_HAS_BIGRAMS = 0x04;
      53 
      54     // Attribute (bigram/shortcut) related flags:
             // Bit budget of the attribute constants below: 0x80 (has next) | 0x40 (negative
             // offset) | 0x30 (address type) | 0x0F (frequency) == 0xFF, with no overlap.
      55     // Flag for presence of more attributes
      56     static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
      57     // Flag for sign of offset. If this flag is set, the offset value must be negated.
      58     static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
      59 
      60     // Mask for attribute frequency, stored on 4 bits inside the flags byte.
      61     static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;
      62 
      63     // Mask and flags for attribute address type selection.
             // Only the three non-zero values of the two masked bits are named here; no
             // constant is declared for 0x00.
      64     static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
      65     static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
      66     static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
      67     static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
      68 
      69     // Error tolerances
             // No use of these two constants is visible in this header; presumably they are
             // passed as the maxErrors argument of getSuggestionCandidates() -- TODO confirm.
      70     static const int DEFAULT_MAX_ERRORS = 2;
      71     static const int MAX_ERRORS_FOR_TWO_WORDS = 1;
      72 
             // Consecutive values 0/1/2, i.e. not combinable bit flags despite the FLAG_
             // prefix. Not referenced elsewhere in this header.
             // NOTE(review): looks like a tri-state outcome used by the multi-word
             // suggestion helpers -- confirm in the implementation.
      73     static const int FLAG_MULTIPLE_SUGGEST_ABORT = 0;
      74     static const int FLAG_MULTIPLE_SUGGEST_SKIP = 1;
      75     static const int FLAG_MULTIPLE_SUGGEST_CONTINUE = 2;
             // Constructor. streamStart is a const pointer to read-only bytes.
             // NOTE(review): the parameters presumably initialise the const members
             // DICT_ROOT, TYPED_LETTER_MULTIPLIER, FULL_WORD_MULTIPLIER, MAX_WORD_LENGTH,
             // MAX_WORDS and FLAGS -- confirm in the .cpp. Note that `flags` is
             // unsigned int while the FLAGS member is a signed const int, and that the
             // parameter name is spelled typedLetterMultipler (sic).
      76     UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
      77             int fullWordMultiplier, int maxWordLength, int maxWords, const unsigned int flags);
             // Const query on a word given as `length` int32_t elements; returns an int.
             // The value returned for an unknown word is not visible here -- TODO confirm.
      78     int getFrequency(const int32_t* const inWord, const int length) const;
             // Const query; `word` is a non-const unsigned short buffer. The meaning of
             // pos/offset and of the returned int is not visible here -- TODO confirm.
      79     int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
             // Public suggestion entry point. Inputs are passed as const pointers
             // (coordinates, codes, bigramMap, bigramFilter); outWords and frequencies are
             // the only non-const raw buffers, so presumably they receive the results and
             // the returned int is the number of suggestions -- TODO confirm, including the
             // capacity callers must provide (not expressed in the signature).
      80     int getSuggestions(ProximityInfo *proximityInfo, WordsPriorityQueuePool *queuePool,
      81             Correction *correction, const int *xcoordinates, const int *ycoordinates,
      82             const int *codes, const int codesSize, const std::map<int, int> *bigramMap,
      83             const uint8_t *bigramFilter, const bool useFullEditDistance, unsigned short *outWords,
      84             int *frequencies);
             // Virtual destructor; it is the only virtual member declared in this class.
      85     virtual ~UnigramDictionary();
      86 
      87  private:
             // Internal helpers. Only their declarations are visible in this header, so the
             // notes below are limited to what the signatures themselves show.
      88     void getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
      89             const int *ycoordinates, const int *codes, const int inputLength,
      90             const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
      91             const bool useFullEditDistance, Correction *correction,
      92             WordsPriorityQueuePool *queuePool);
             // Const helper taking a digraph table and its element count (digraphsSize).
      93     int getDigraphReplacement(const int *codes, const int i, const int codesSize,
      94             const digraph_t* const digraphs, const unsigned int digraphsSize) const;
             // The "Rec" suffix and the currentDepth parameter suggest a recursive search,
             // presumably bounded by MAX_DIGRAPH_SEARCH_DEPTH -- TODO confirm in the .cpp.
             // codesBuffer is read-only here while the coordinate buffers and codesDest are
             // writable.
      95     void getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,
      96         const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
      97         int *xCoordinatesBuffer, int *yCoordinatesBuffer, const int codesBufferSize,
      98         const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
      99         const bool useFullEditDistance, const int* codesSrc, const int codesRemain,
     100         const int currentDepth, int* codesDest, Correction *correction,
     101         WordsPriorityQueuePool* queuePool, const digraph_t* const digraphs,
     102         const unsigned int digraphsSize);
     103     void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
     104             const int *ycoordinates, const int *codes, const int codesSize, Correction *correction);
     105     void getOneWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
     106             const int *ycoordinates, const int *codes, const std::map<int, int> *bigramMap,
     107             const uint8_t *bigramFilter, const bool useFullEditDistance, const int inputLength,
     108             Correction *correction, WordsPriorityQueuePool* queuePool);
             // Takes the error budget (maxErrors) and the index of the word being searched
             // (currentWordIndex) explicitly; takes no ProximityInfo or coordinate arguments.
     109     void getSuggestionCandidates(
     110             const bool useFullEditDistance, const int inputLength,
     111             const std::map<int, int> *bigramMap, const uint8_t *bigramFilter,
     112             Correction *correction, WordsPriorityQueuePool* queuePool, const bool doAutoCompletion,
     113             const int maxErrors, const int currentWordIndex);
     114     void getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo,
     115             const int *xcoordinates, const int *ycoordinates, const int *codes,
     116             const bool useFullEditDistance, const int inputLength,
     117             Correction *correction, WordsPriorityQueuePool* queuePool,
     118             const bool hasAutoCorrectionCandidate);
             // Only declaration in this header that uses TerminalAttributes (by const
             // reference). Presumably invoked when a terminal node is reached -- TODO confirm.
     119     void onTerminal(const int freq, const TerminalAttributes& terminalAttributes,
     120             Correction *correction, WordsPriorityQueuePool *queuePool, const bool addToMasterQueue,
     121             const int currentWordIndex);
     122     bool needsToSkipCurrentNode(const unsigned short c,
     123             const int inputIndex, const int skipPos, const int depth);
     124     // Process a node by considering proximity, missing and excessive character
             // newCount, newChildPosition and nextSiblingPosition are non-const int
             // pointers, i.e. out-parameters; the meaning of the bool result is not visible
             // here -- TODO confirm.
     125     bool processCurrentNode(const int initialPos, const std::map<int, int> *bigramMap,
     126             const uint8_t *bigramFilter, Correction *correction, int *newCount,
     127             int *newChildPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool,
     128             const int currentWordIndex);
     129     int getMostFrequentWordLike(const int startInputIndex, const int inputLength,
     130             ProximityInfo *proximityInfo, unsigned short *word);
     131     int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length,
     132             short unsigned int *outWord);
             // freqArray, wordLengthArray, outputWord and outputWordLength are writable
             // pointers; the meaning of the returned int is not visible here -- TODO confirm.
     133     int getSubStringSuggestion(
     134             ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates,
     135             const int *codes, const bool useFullEditDistance, Correction *correction,
     136             WordsPriorityQueuePool* queuePool, const int inputLength,
     137             const bool hasAutoCorrectionCandidate, const int currentWordIndex,
     138             const int inputWordStartPos, const int inputWordLength,
     139             const int outputWordStartPos, const bool isSpaceProximity, int *freqArray,
     140             int *wordLengthArray, unsigned short* outputWord, int *outputWordLength);
     141     void getMultiWordsSuggestionRec(ProximityInfo *proximityInfo,
     142             const int *xcoordinates, const int *ycoordinates, const int *codes,
     143             const bool useFullEditDistance, const int inputLength,
     144             Correction *correction, WordsPriorityQueuePool* queuePool,
     145             const bool hasAutoCorrectionCandidate, const int startPos, const int startWordIndex,
     146             const int outputWordLength, int *freqArray, int* wordLengthArray,
     147             unsigned short* outputWord);
     148 
             // Immutable per-instance configuration. Every member below is const (DICT_ROOT
             // is a const pointer to const bytes), so each must be set in the constructor's
             // member initializer list, and the implicitly-declared copy assignment operator
             // of this class is deleted.
     149     const uint8_t* const DICT_ROOT;
     150     const int MAX_WORD_LENGTH;
     151     const int MAX_WORDS;
     152     const int TYPED_LETTER_MULTIPLIER;
     153     const int FULL_WORD_MULTIPLIER;
     154     const int ROOT_POS;
     155     const unsigned int BYTES_IN_ONE_CHAR;
     156     const int MAX_DIGRAPH_SEARCH_DEPTH;
     157     const int FLAGS;
     158 
             // Digraph tables, declared with unspecified bound; their definitions (and
             // therefore their element counts) live outside this header.
     159     static const digraph_t GERMAN_UMLAUT_DIGRAPHS[];
     160     static const digraph_t FRENCH_LIGATURES_DIGRAPHS[];
     161 
     162     // Still bundled members
             // Fixed-size per-instance arrays of MAX_WORD_LENGTH_INTERNAL elements. That
             // constant is not defined in this header (presumably it comes from defines.h --
             // confirm). The existing TODOs mark all four for removal.
     163     unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
     164     int mStackChildCount[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
     165     int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
     166     int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
     167 };
     168 } // namespace latinime
    169 
    170 #endif // LATINIME_UNIGRAM_DICTIONARY_H
    171