/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.inputmethod.latin.utils;

import com.android.inputmethod.latin.NgramContext;
import com.android.inputmethod.latin.NgramContext.WordInfo;
import com.android.inputmethod.latin.define.DecoderSpecificConstants;
import com.android.inputmethod.latin.settings.SpacingAndPunctuations;

import java.util.Arrays;
import java.util.regex.Pattern;

import javax.annotation.Nonnull;

// Static helpers that derive an NgramContext from the text preceding the cursor.
public final class NgramContextUtils {
    // One or more consecutive line-break characters; used to isolate the last line of the text.
    private static final Pattern NEWLINE_REGEX = Pattern.compile("[\\r\\n]+");
    // One or more consecutive whitespace characters; used to cut a line into words.
    private static final Pattern SPACE_REGEX = Pattern.compile("\\s+");

    private NgramContextUtils() {
        // Utility class: instantiation is intentionally prevented.
    }

    // Builds the n-gram context starting at the nth word before the cursor. n = 1 starts at the
    // word immediately before the cursor, n = 2 at the word before that, and so on. Only the
    // last line of the text is considered, and that line is split on whitespace only.
    //
    // Slot 0 of the resulting context holds the nth previous word, slot 1 the word before it,
    // etc. Filling stops at the first slot for which no plain word can be produced:
    //   - the slot becomes beginning-of-sentence when there is no such word, when the word is
    //     empty, or when it ends in a sentence terminator;
    //   - the slot is left empty when the word ends in a word separator or word connector, or
    //     when the word that follows it starts with a word connector.
    // Slots after the stopping point stay empty.
    //
    // Examples, listed farthest word first (when
    // DecoderSpecificConstants.MAX_PREV_WORD_COUNT_FOR_N_GRAM is 2):
    //   (n = 1) "abc def|"     -> abc, def
    //   (n = 1) "abc def |"    -> abc, def
    //   (n = 1) "abc 'def|"    -> empty, 'def
    //   (n = 1) "abc def. |"   -> beginning-of-sentence
    //   (n = 1) "abc def . |"  -> beginning-of-sentence
    //   (n = 2) "abc def|"     -> beginning-of-sentence, abc
    //   (n = 2) "abc def |"    -> beginning-of-sentence, abc
    //   (n = 2) "abc 'def|"    -> empty. The context is different from "abc def", but we cannot
    //                             represent this situation using NgramContext. See the TODO in
    //                             the method.
    // TODO: The next example's result should be "abc, def". This has to be fixed before we
    // retrieve the prior context of beginning-of-sentence.
    //   (n = 2) "abc def. |"   -> beginning-of-sentence, abc
    //   (n = 2) "abc def . |"  -> abc, def
    //   (n = 2) "abc|"         -> beginning-of-sentence
    //   (n = 2) "abc |"        -> beginning-of-sentence
    //   (n = 2) "abc. def|"    -> beginning-of-sentence
    //
    // Returns NgramContext.EMPTY_PREV_WORDS_INFO when prev is null.
    @Nonnull
    public static NgramContext getNgramContextFromNthPreviousWord(final CharSequence prev,
            final SpacingAndPunctuations spacingAndPunctuations, final int n) {
        if (null == prev) {
            return NgramContext.EMPTY_PREV_WORDS_INFO;
        }
        final String[] lines = NEWLINE_REGEX.split(prev);
        if (0 == lines.length) {
            // The split produced no line at all, so there is nothing before the cursor to use.
            return new NgramContext(WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO);
        }
        final String lastLine = lines[lines.length - 1];
        final String[] words = SPACE_REGEX.split(lastLine);

        final WordInfo[] context =
                new WordInfo[DecoderSpecificConstants.MAX_PREV_WORD_COUNT_FOR_N_GRAM];
        Arrays.fill(context, WordInfo.EMPTY_WORD_INFO);

        // wordIndex walks backwards through the words while slot walks forwards through the
        // context, so that slot 0 receives the nth previous word.
        int wordIndex = words.length - n;
        for (int slot = 0; slot < context.length; slot++, wordIndex--) {
            if (startsWithWordConnector(words, wordIndex + 1, spacingAndPunctuations)) {
                // The word following the focused word starts with a word connector.
                // TODO: Return meaningful context for this case.
                break;
            }
            if (isSentenceBoundary(words, wordIndex, spacingAndPunctuations)) {
                context[slot] = WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO;
                break;
            }
            final String word = words[wordIndex];
            final char tail = word.charAt(word.length() - 1);
            // A trailing word separator or connector makes the context unclear.
            // TODO: Return meaningful context for this case.
            if (spacingAndPunctuations.isWordSeparator(tail)
                    || spacingAndPunctuations.isWordConnector(tail)) {
                break;
            }
            context[slot] = new WordInfo(word);
        }
        return new NgramContext(context);
    }

    // Tells whether words[index] exists, is non-empty and begins with a word connector.
    private static boolean startsWithWordConnector(final String[] words, final int index,
            final SpacingAndPunctuations spacingAndPunctuations) {
        if (index < 0 || index >= words.length) {
            return false;
        }
        final String candidate = words[index];
        return !candidate.isEmpty()
                && spacingAndPunctuations.isWordConnector(candidate.charAt(0));
    }

    // Tells whether the position at index marks a beginning-of-sentence: the index lies before
    // the first word, the word there is empty, or the word ends in a sentence terminator.
    private static boolean isSentenceBoundary(final String[] words, final int index,
            final SpacingAndPunctuations spacingAndPunctuations) {
        if (index < 0) {
            // Fewer words are available than were asked for.
            return true;
        }
        final String candidate = words[index];
        if (candidate.isEmpty()) {
            return true;
        }
        return spacingAndPunctuations.isSentenceTerminator(
                candidate.charAt(candidate.length() - 1));
    }
}