Home | History | Annotate | Download | only in src
      1 /*
      2  * Copyright (C) 2010 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef LATINIME_UNIGRAM_DICTIONARY_H
     18 #define LATINIME_UNIGRAM_DICTIONARY_H
     19 
     20 #include <stdint.h>
     21 #include "correction.h"
     22 #include "correction_state.h"
     23 #include "defines.h"
     24 #include "proximity_info.h"
     25 
     26 #ifndef NULL
        // Fallback: define NULL as 0 only when no previously included header has
        // already provided it.
     27 #define NULL 0
     28 #endif
     29 
     30 namespace latinime {
     31 
     32 class UnigramDictionary {
     33 
            // Declaration of the unigram dictionary class. Only declarations live in
            // this header; every member function is defined elsewhere.
            //
            // The public constants describe bit fields of dictionary flag bytes. Note
            // that the "group" flags and the "attribute" flags reuse the same numeric
            // values (e.g. 0x80, 0x40 and 0x10 appear in both families), so they
            // presumably apply to different flag bytes -- confirm against the
            // dictionary format before mixing them.
     34 public:
     35 
     36     // Mask and flags for children address type selection.
            // The mask keeps the two highest bits (0xC0) of the flags value; the four
            // FLAG_GROUP_ADDRESS_TYPE_* constants are the four possible masked values.
     37     static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
     38     static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
     39     static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
     40     static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
     41     static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
     42 
     43     // Flag for single/multiple char group
     44     static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;
     45 
     46     // Flag for terminal groups
     47     static const int FLAG_IS_TERMINAL = 0x10;
     48 
     49     // Flag for bigram presence
     50     static const int FLAG_HAS_BIGRAMS = 0x04;
     51 
     52     // Attribute (bigram/shortcut) related flags:
     53     // Flag for presence of more attributes
     54     static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
     55     // Flag for sign of offset. If this flag is set, the offset value must be negated.
     56     static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
     57 
     58     // Mask for attribute frequency, stored on 4 bits inside the flags byte.
            // These are the four lowest bits (0x0F).
     59     static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;
     60 
     61     // Mask and flags for attribute address type selection.
            // The mask keeps bits 4-5 (0x30); the three FLAG_ATTRIBUTE_ADDRESS_TYPE_*
            // constants are masked values. No constant is declared for the masked
            // value 0x00.
     62     static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
     63     static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
     64     static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
     65     static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
     66 
            // Constructor. The parameters presumably initialize the similarly named
            // const members declared in the private section (DICT_ROOT,
            // TYPED_LETTER_MULTIPLIER, FULL_WORD_MULTIPLIER, MAX_WORD_LENGTH, MAX_WORDS,
            // MAX_PROXIMITY_CHARS, IS_LATEST_DICT_VERSION) -- confirm in the definition.
            // NOTE(review): "typedLetterMultipler" is misspelled (missing 'i'); parameter
            // names in a declaration do not affect callers, so it is harmless.
     67     UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
     68             int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
     69             const bool isLatestDictVersion);
            // Takes a read-only word of 16-bit units plus its length and returns a bool;
            // does not modify the object (const member function).
     70     bool isValidWord(const uint16_t* const inWord, const int length) const;
            // Const member function; `word` is passed as a non-const pointer. The
            // meaning of the returned int is not visible from this header.
     71     int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
            // Public suggestion entry point. `outWords` and `frequencies` are non-const
            // pointers, so presumably caller-provided output buffers; required sizes
            // and the meaning of the int result are not visible here -- TODO confirm.
            // `flags` presumably carries the bit values of the private enum below
            // (REQUIRES_GERMAN_UMLAUT_PROCESSING, USE_FULL_EDIT_DISTANCE).
     72     int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
     73             const int *ycoordinates, const int *codes, const int codesSize, const int flags,
     74             unsigned short *outWords, int *frequencies);
            // Declared virtual; it is the only virtual member declared in this class.
     75     virtual ~UnigramDictionary();
     76 
     77 private:
     78 
            // Private helpers. Their behaviour is defined outside this header; only the
            // signatures can be relied on here.
     79     void getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
     80             const int *ycoordinates, const int *codes, const int codesSize,
     81             unsigned short *outWords, int *frequencies, const int flags);
     82     bool isDigraph(const int* codes, const int i, const int codesSize) const;
            // The "Rec" suffix and the currentDepth parameter suggest a recursive
            // helper -- confirm in the definition.
     83     void getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,
     84         const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
     85         const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
     86         const int currentDepth, int* codesDest, unsigned short* outWords, int* frequencies);
     87     void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
     88             const int *ycoordinates, const int *codes, const int codesSize,
     89             unsigned short *outWords, int *frequencies);
     90     void getSuggestionCandidates(const bool useFullEditDistance);
     91     bool addWord(unsigned short *word, int length, int frequency);
     92     void getSplitTwoWordsSuggestion(const int inputLength, Correction *correction);
     93     void getMissingSpaceWords(const int inputLength, const int missingSpacePos,
     94             Correction *correction, const bool useFullEditDistance);
     95     void getMistypedSpaceWords(const int inputLength, const int spaceProximityPos,
     96             Correction *correction, const bool useFullEditDistance);
     97     void onTerminal(const int freq, Correction *correction);
     98     bool needsToSkipCurrentNode(const unsigned short c,
     99             const int inputIndex, const int skipPos, const int depth);
    100     // Process a node by considering proximity, missing and excessive character
            // newCount, newChildPosition and nextSiblingPosition are non-const pointers,
            // presumably out-parameters describing where traversal continues.
    101     bool processCurrentNode(const int initialPos,
    102             Correction *correction, int *newCount,
    103             int *newChildPosition, int *nextSiblingPosition);
    104     int getMostFrequentWordLike(const int startInputIndex, const int inputLength,
    105             unsigned short *word);
            // `short unsigned int*` is the same type as `unsigned short*`.
    106     int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length,
    107             short unsigned int* outWord);
    108 
            // Per-instance constants. Because they are const they can only be set in the
            // constructor's member-initializer list, and C++ initializes members in
            // declaration order -- do not reorder these without checking the constructor.
    109     const uint8_t* const DICT_ROOT;
    110     const int MAX_WORD_LENGTH;
    111     const int MAX_WORDS;
    112     const int MAX_PROXIMITY_CHARS;
    113     const bool IS_LATEST_DICT_VERSION;
    114     const int TYPED_LETTER_MULTIPLIER;
    115     const int FULL_WORD_MULTIPLIER;
    116     const int ROOT_POS;
    117     const unsigned int BYTES_IN_ONE_CHAR;
    118     const int MAX_UMLAUT_SEARCH_DEPTH;
    119 
    120     // Flags for special processing
    121     // Those *must* match the flags in BinaryDictionary.Flags.ALL_FLAGS in BinaryDictionary.java
    122     // or something very bad (like, the apocalypse) will happen.
    123     // Please update both at the same time.
            // The two values are distinct single bits (0x1, 0x2), so they can be OR-ed
            // together in one int.
    124     enum {
    125         REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1,
    126         USE_FULL_EDIT_DISTANCE = 0x2
    127     };
            // Table of (first, second) integer pairs. Only declared here, as an array of
            // unknown bound; the definition and its contents live outside this header.
    128     static const struct digraph_t { int first; int second; } GERMAN_UMLAUT_DIGRAPHS[];
    129 
            // Mutable working state. Ownership and lifetime of the pointed-to objects are
            // not visible from this header -- verify against the definitions.
    130     int *mFrequencies;
    131     unsigned short *mOutputChars;
    132     ProximityInfo *mProximityInfo;
    133     Correction *mCorrection;
    134     int mInputLength;
    135     // MAX_WORD_LENGTH_INTERNAL must be bigger than MAX_WORD_LENGTH
            // MAX_WORD_LENGTH_INTERNAL is not defined in this header (presumably it comes
            // from defines.h); it sizes mWord and the three stack arrays below.
    136     unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];
    137 
            // Fixed-size per-depth arrays, each marked for removal by its TODO.
    138     int mStackChildCount[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
    139     int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
    140     int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
    141 };
    142 } // namespace latinime
    143 
    144 #endif // LATINIME_UNIGRAM_DICTIONARY_H
    145