Home | History | Annotate | Download | only in utils
      1 /*
      2  * Copyright (C) 2014 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.inputmethod.latin.utils;
     18 
     19 import com.android.inputmethod.latin.NgramContext;
     20 import com.android.inputmethod.latin.NgramContext.WordInfo;
     21 import com.android.inputmethod.latin.define.DecoderSpecificConstants;
     22 import com.android.inputmethod.latin.settings.SpacingAndPunctuations;
     23 
     24 import java.util.Arrays;
     25 import java.util.regex.Pattern;
     26 
     27 import javax.annotation.Nonnull;
     28 
     29 public final class NgramContextUtils {
     30     private NgramContextUtils() {
     31         // Intentional empty constructor for utility class.
     32     }
     33 
     34     private static final Pattern NEWLINE_REGEX = Pattern.compile("[\\r\\n]+");
     35     private static final Pattern SPACE_REGEX = Pattern.compile("\\s+");
     36     // Get context information from nth word before the cursor. n = 1 retrieves the words
     37     // immediately before the cursor, n = 2 retrieves the words before that, and so on. This splits
     38     // on whitespace only.
     39     // Also, it won't return words that end in a separator (if the nth word before the cursor
     40     // ends in a separator, it returns information representing beginning-of-sentence).
     41     // Example (when Constants.MAX_PREV_WORD_COUNT_FOR_N_GRAM is 2):
     42     // (n = 1) "abc def|" -> abc, def
     43     // (n = 1) "abc def |" -> abc, def
     44     // (n = 1) "abc 'def|" -> empty, 'def
     45     // (n = 1) "abc def. |" -> beginning-of-sentence
     46     // (n = 1) "abc def . |" -> beginning-of-sentence
     47     // (n = 2) "abc def|" -> beginning-of-sentence, abc
     48     // (n = 2) "abc def |" -> beginning-of-sentence, abc
     49     // (n = 2) "abc 'def|" -> empty. The context is different from "abc def", but we cannot
     50     // represent this situation using NgramContext. See TODO in the method.
     51     // TODO: The next example's result should be "abc, def". This have to be fixed before we
     52     // retrieve the prior context of Beginning-of-Sentence.
     53     // (n = 2) "abc def. |" -> beginning-of-sentence, abc
     54     // (n = 2) "abc def . |" -> abc, def
     55     // (n = 2) "abc|" -> beginning-of-sentence
     56     // (n = 2) "abc |" -> beginning-of-sentence
     57     // (n = 2) "abc. def|" -> beginning-of-sentence
     58     @Nonnull
     59     public static NgramContext getNgramContextFromNthPreviousWord(final CharSequence prev,
     60             final SpacingAndPunctuations spacingAndPunctuations, final int n) {
     61         if (prev == null) return NgramContext.EMPTY_PREV_WORDS_INFO;
     62         final String[] lines = NEWLINE_REGEX.split(prev);
     63         if (lines.length == 0) {
     64             return new NgramContext(WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO);
     65         }
     66         final String[] w = SPACE_REGEX.split(lines[lines.length - 1]);
     67         final WordInfo[] prevWordsInfo =
     68                 new WordInfo[DecoderSpecificConstants.MAX_PREV_WORD_COUNT_FOR_N_GRAM];
     69         Arrays.fill(prevWordsInfo, WordInfo.EMPTY_WORD_INFO);
     70         for (int i = 0; i < prevWordsInfo.length; i++) {
     71             final int focusedWordIndex = w.length - n - i;
     72             // Referring to the word after the focused word.
     73             if ((focusedWordIndex + 1) >= 0 && (focusedWordIndex + 1) < w.length) {
     74                 final String wordFollowingTheNthPrevWord = w[focusedWordIndex + 1];
     75                 if (!wordFollowingTheNthPrevWord.isEmpty()) {
     76                     final char firstChar = wordFollowingTheNthPrevWord.charAt(0);
     77                     if (spacingAndPunctuations.isWordConnector(firstChar)) {
     78                         // The word following the focused word is starting with a word connector.
     79                         // TODO: Return meaningful context for this case.
     80                         break;
     81                     }
     82                 }
     83             }
     84             // If we can't find (n + i) words, the context is beginning-of-sentence.
     85             if (focusedWordIndex < 0) {
     86                 prevWordsInfo[i] = WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO;
     87                 break;
     88             }
     89 
     90             final String focusedWord = w[focusedWordIndex];
     91             // If the word is empty, the context is beginning-of-sentence.
     92             final int length = focusedWord.length();
     93             if (length <= 0) {
     94                 prevWordsInfo[i] = WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO;
     95                 break;
     96             }
     97             // If the word ends in a sentence terminator, the context is beginning-of-sentence.
     98             final char lastChar = focusedWord.charAt(length - 1);
     99             if (spacingAndPunctuations.isSentenceTerminator(lastChar)) {
    100                 prevWordsInfo[i] = WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO;
    101                 break;
    102             }
    103             // If ends in a word separator or connector, the context is unclear.
    104             // TODO: Return meaningful context for this case.
    105             if (spacingAndPunctuations.isWordSeparator(lastChar)
    106                     || spacingAndPunctuations.isWordConnector(lastChar)) {
    107                 break;
    108             }
    109             prevWordsInfo[i] = new WordInfo(focusedWord);
    110         }
    111         return new NgramContext(prevWordsInfo);
    112     }
    113 }
    114