diff --git a/java/res/values/strings.xml b/java/res/values/strings.xml index 35dd3e089..70a5b7e2e 100644 --- a/java/res/values/strings.xml +++ b/java/res/values/strings.xml @@ -85,6 +85,11 @@ Spacebar and punctuation automatically insert highlighted word + + Bigram Suggestions + + Use previous word to improve suggestion + None diff --git a/java/res/xml/prefs.xml b/java/res/xml/prefs.xml index 535b63f3b..c93fe0ac3 100644 --- a/java/res/xml/prefs.xml +++ b/java/res/xml/prefs.xml @@ -81,6 +81,14 @@ android:defaultValue="@bool/enable_autocorrect" android:dependency="show_suggestions" /> - + + diff --git a/java/src/com/android/inputmethod/latin/BinaryDictionary.java b/java/src/com/android/inputmethod/latin/BinaryDictionary.java index 6473f4558..8d2363012 100644 --- a/java/src/com/android/inputmethod/latin/BinaryDictionary.java +++ b/java/src/com/android/inputmethod/latin/BinaryDictionary.java @@ -33,9 +33,9 @@ import android.util.Log; public class BinaryDictionary extends Dictionary { private static final String TAG = "BinaryDictionary"; - public static final int MAX_WORD_LENGTH = 48; private static final int MAX_ALTERNATIVES = 16; private static final int MAX_WORDS = 16; + private static final int MAX_BIGRAMS = 255; // TODO Probably don't need all 255 private static final int TYPED_LETTER_MULTIPLIER = 2; private static final boolean ENABLE_MISSED_CHARACTERS = true; @@ -44,7 +44,9 @@ public class BinaryDictionary extends Dictionary { private int mDictLength; private int[] mInputCodes = new int[MAX_WORD_LENGTH * MAX_ALTERNATIVES]; private char[] mOutputChars = new char[MAX_WORD_LENGTH * MAX_WORDS]; + private char[] mOutputChars_bigrams = new char[MAX_WORD_LENGTH * MAX_BIGRAMS]; private int[] mFrequencies = new int[MAX_WORDS]; + private int[] mFrequencies_bigrams = new int[MAX_BIGRAMS]; // Keep a reference to the native dict direct buffer in Java to avoid // unexpected deallocation of the direct buffer. 
private ByteBuffer mNativeDictDirectBuffer; @@ -71,7 +73,7 @@ public class BinaryDictionary extends Dictionary { /** * Create a dictionary from a byte buffer. This is used for testing. * @param context application context for reading resources - * @param resId the resource containing the raw binary dictionary + * @param byteBuffer a ByteBuffer containing the binary dictionary */ public BinaryDictionary(Context context, ByteBuffer byteBuffer) { if (byteBuffer != null) { @@ -95,6 +97,8 @@ public class BinaryDictionary extends Dictionary { char[] outputChars, int[] frequencies, int maxWordLength, int maxWords, int maxAlternatives, int skipPos, int[] nextLettersFrequencies, int nextLettersSize); + private native int getBigramsNative(int nativeData, char[] prevWord, int prevWordLength, + char[] outputChars, int[] frequencies, int maxWordLength, int maxBigrams); private final void loadDictionary(Context context, int resId) { InputStream is = context.getResources().openRawResource(resId); @@ -121,6 +125,30 @@ public class BinaryDictionary extends Dictionary { } } + @Override + public void getBigrams(final WordComposer composer, final CharSequence previousWord, + final WordCallback callback, int[] nextLettersFrequencies) { + + char[] chars = previousWord.toString().toCharArray(); + Arrays.fill(mOutputChars_bigrams, (char) 0); + Arrays.fill(mFrequencies_bigrams, 0); + + int count = getBigramsNative(mNativeDict, chars, chars.length, mOutputChars_bigrams, + mFrequencies_bigrams, MAX_WORD_LENGTH, MAX_BIGRAMS); + for (int j = 0; j < count; j++) { + if (mFrequencies_bigrams[j] < 1) break; + int start = j * MAX_WORD_LENGTH; + int len = 0; + while (mOutputChars_bigrams[start + len] != 0) { + len++; + } + if (len > 0) { + callback.addWord(mOutputChars_bigrams, start, len, mFrequencies_bigrams[j], + DataType.BIGRAM); + } + } + } + @Override public void getWords(final WordComposer codes, final WordCallback callback, int[] nextLettersFrequencies) { @@ -166,7 +194,7 @@ public class 
BinaryDictionary extends Dictionary { len++; } if (len > 0) { - callback.addWord(mOutputChars, start, len, mFrequencies[j]); + callback.addWord(mOutputChars, start, len, mFrequencies[j], DataType.UNIGRAM); } } } diff --git a/java/src/com/android/inputmethod/latin/Dictionary.java b/java/src/com/android/inputmethod/latin/Dictionary.java index e7b526663..54317c861 100644 --- a/java/src/com/android/inputmethod/latin/Dictionary.java +++ b/java/src/com/android/inputmethod/latin/Dictionary.java @@ -21,7 +21,9 @@ package com.android.inputmethod.latin; * strokes. */ abstract public class Dictionary { - + + protected static final int MAX_WORD_LENGTH = 48; + /** * Whether or not to replicate the typed word in the suggested list, even if it's valid. */ @@ -31,7 +33,11 @@ abstract public class Dictionary { * The weight to give to a word if it's length is the same as the number of typed characters. */ protected static final int FULL_WORD_FREQ_MULTIPLIER = 2; - + + public static enum DataType { + UNIGRAM, BIGRAM + } + /** * Interface to be implemented by classes requesting words to be fetched from the dictionary. * @see #getWords(WordComposer, WordCallback) @@ -45,9 +51,11 @@ abstract public class Dictionary { * @param wordLength length of valid characters in the character array * @param frequency the frequency of occurence. 
This is normalized between 1 and 255, but * can exceed those limits + * @param dataType whether the word comes from unigram or bigram data * @return true if the word was added, false if no more words are required */ - boolean addWord(char[] word, int wordOffset, int wordLength, int frequency); + boolean addWord(char[] word, int wordOffset, int wordLength, int frequency, + DataType dataType); } /** @@ -64,6 +72,21 @@ abstract public class Dictionary { abstract public void getWords(final WordComposer composer, final WordCallback callback, int[] nextLettersFrequencies); + /** + * Looks up the words that the bigram dictionary lists as followers of the previous word and + * reports each of them through the callback object. This base implementation does nothing. + * @param composer the key sequence to match + * @param previousWord the word whose possible followers are requested + * @param callback the callback object to send possible word following previous word + * @param nextLettersFrequencies array of frequencies of next letters that could follow the + * word so far. For instance, "bracke" can be followed by "t", so array['t'] will have a + * non-zero value on returning from this method. Pass in null to skip the next-letter lookup. + */ + public void getBigrams(final WordComposer composer, final CharSequence previousWord, + final WordCallback callback, int[] nextLettersFrequencies) { + // empty base implementation + } + /** * Checks if the given word occurs in the dictionary * @param word the word to search for. The search should be case-insensitive. 
diff --git a/java/src/com/android/inputmethod/latin/EditingUtil.java b/java/src/com/android/inputmethod/latin/EditingUtil.java index 7571f1daf..5133c60ca 100644 --- a/java/src/com/android/inputmethod/latin/EditingUtil.java +++ b/java/src/com/android/inputmethod/latin/EditingUtil.java @@ -16,6 +16,8 @@ package com.android.inputmethod.latin; +import java.util.regex.Pattern; + import android.view.inputmethod.ExtractedText; import android.view.inputmethod.ExtractedTextRequest; import android.view.inputmethod.InputConnection; @@ -24,6 +26,11 @@ import android.view.inputmethod.InputConnection; * Utility methods to deal with editing text through an InputConnection. */ public class EditingUtil { + /** + * Number of characters we want to look back in order to identify the previous word + */ + public static final int LOOKBACK_CHARACTER_NUM = 15; + private EditingUtil() {}; /** @@ -175,4 +182,22 @@ public class EditingUtil { private static boolean isWhitespace(int code, String whitespace) { return whitespace.contains(String.valueOf((char) code)); } + + private static final Pattern spaceRegex = Pattern.compile("\\s+"); + + /** + * Finds the word preceding the last token of the text in front of the cursor. + * @param connection the input connection to read from; may be null + * @return the second-to-last whitespace-separated token of the last + * {@link #LOOKBACK_CHARACTER_NUM} characters before the cursor, or null if there is none + */ + public static CharSequence getPreviousWord(InputConnection connection) { + if (connection == null) return null; + //TODO: Should fix this. This could be slow! + CharSequence prev = connection.getTextBeforeCursor(LOOKBACK_CHARACTER_NUM, 0); + // The connection may return null instead of text; treat that as "no previous word" + if (prev == null) return null; + String[] w = spaceRegex.split(prev); + return (w.length >= 2) ? 
w[w.length-2] : null; + } } diff --git a/java/src/com/android/inputmethod/latin/ExpandableDictionary.java b/java/src/com/android/inputmethod/latin/ExpandableDictionary.java index 46bc41c42..6f4d925ee 100644 --- a/java/src/com/android/inputmethod/latin/ExpandableDictionary.java +++ b/java/src/com/android/inputmethod/latin/ExpandableDictionary.java @@ -267,7 +267,7 @@ public class ExpandableDictionary extends Dictionary { if (completion) { word[depth] = c; if (terminal) { - if (!callback.addWord(word, 0, depth + 1, freq * snr)) { + if (!callback.addWord(word, 0, depth + 1, freq * snr, DataType.UNIGRAM)) { return; } // Add to frequency of next letters for predictive correction @@ -305,7 +305,8 @@ public class ExpandableDictionary extends Dictionary { || !same(word, depth + 1, codes.getTypedWord())) { int finalFreq = freq * snr * addedAttenuation; if (skipPos < 0) finalFreq *= FULL_WORD_FREQ_MULTIPLIER; - callback.addWord(word, 0, depth + 1, finalFreq); + callback.addWord(word, 0, depth + 1, finalFreq, + DataType.UNIGRAM); } } if (children != null) { diff --git a/java/src/com/android/inputmethod/latin/LatinIME.java b/java/src/com/android/inputmethod/latin/LatinIME.java index b1b6d9228..51fb9d876 100644 --- a/java/src/com/android/inputmethod/latin/LatinIME.java +++ b/java/src/com/android/inputmethod/latin/LatinIME.java @@ -89,6 +89,7 @@ public class LatinIME extends InputMethodService private static final String PREF_QUICK_FIXES = "quick_fixes"; private static final String PREF_SHOW_SUGGESTIONS = "show_suggestions"; private static final String PREF_AUTO_COMPLETE = "auto_complete"; + private static final String PREF_BIGRAM_SUGGESTIONS = "bigram_suggestion"; private static final String PREF_VOICE_MODE = "voice_mode"; // Whether or not the user has used voice input before (and thus, whether to show the @@ -187,6 +188,7 @@ public class LatinIME extends InputMethodService private boolean mAutoSpace; private boolean mJustAddedAutoSpace; private boolean mAutoCorrectEnabled; + 
private boolean mBigramSuggestionEnabled; private boolean mAutoCorrectOn; private boolean mCapsLock; private boolean mPasswordText; @@ -1538,7 +1540,7 @@ public class LatinIME extends InputMethodService } private List getTypedSuggestions(WordComposer word) { - List stringList = mSuggest.getSuggestions(mInputView, word, false); + List stringList = mSuggest.getSuggestions(mInputView, word, false, null); return stringList; } @@ -1549,7 +1551,14 @@ public class LatinIME extends InputMethodService } private void showSuggestions(WordComposer word) { - List stringList = mSuggest.getSuggestions(mInputView, word, false); + //long startTime = System.currentTimeMillis(); // TIME MEASUREMENT! + // TODO Maybe need better way of retrieving previous word + CharSequence prevWord = EditingUtil.getPreviousWord(getCurrentInputConnection()); + List stringList = mSuggest.getSuggestions(mInputView, word, false, + prevWord); + //long stopTime = System.currentTimeMillis(); // TIME MEASUREMENT! + //Log.d("LatinIME","Suggest Total Time - " + (stopTime - startTime)); + int[] nextLettersFrequencies = mSuggest.getNextLettersFrequencies(); ((LatinKeyboard) mInputView.getKeyboard()).setPreferredLetters(nextLettersFrequencies); @@ -2088,6 +2097,8 @@ public class LatinIME extends InputMethodService mCorrectionMode = (mAutoCorrectOn && mAutoCorrectEnabled) ? Suggest.CORRECTION_FULL : (mAutoCorrectOn ? Suggest.CORRECTION_BASIC : Suggest.CORRECTION_NONE); + mCorrectionMode = (mBigramSuggestionEnabled && mAutoCorrectOn && mAutoCorrectEnabled) + ? 
Suggest.CORRECTION_FULL_BIGRAM : mCorrectionMode; if (mSuggest != null) { mSuggest.setCorrectionMode(mCorrectionMode); } @@ -2154,6 +2165,7 @@ public class LatinIME extends InputMethodService } mAutoCorrectEnabled = sp.getBoolean(PREF_AUTO_COMPLETE, mResources.getBoolean(R.bool.enable_autocorrect)) & mShowSuggestions; + mBigramSuggestionEnabled = sp.getBoolean(PREF_BIGRAM_SUGGESTIONS, true) & mShowSuggestions; updateCorrectionMode(); updateAutoTextEnabled(mResources.getConfiguration().locale); mLanguageSwitcher.loadLocales(sp); diff --git a/java/src/com/android/inputmethod/latin/Suggest.java b/java/src/com/android/inputmethod/latin/Suggest.java index 010913d6d..3e6090c72 100755 --- a/java/src/com/android/inputmethod/latin/Suggest.java +++ b/java/src/com/android/inputmethod/latin/Suggest.java @@ -37,6 +37,21 @@ public class Suggest implements Dictionary.WordCallback { public static final int CORRECTION_NONE = 0; public static final int CORRECTION_BASIC = 1; public static final int CORRECTION_FULL = 2; + public static final int CORRECTION_FULL_BIGRAM = 3; + + /** + * Words that appear in both bigram and unigram data get a multiplier ranging from + * BIGRAM_MULTIPLIER_MIN to BIGRAM_MULTIPLIER_MAX depending on the frequency score from + * bigram data. + */ + public static final double BIGRAM_MULTIPLIER_MIN = 1.2; + public static final double BIGRAM_MULTIPLIER_MAX = 1.5; + + /** + * Maximum possible bigram frequency. Will depend on how many bits are being used in data + * structure. Maximum bigram frequency will get the BIGRAM_MULTIPLIER_MAX as the multiplier. 
+ */ + public static final int MAXIMUM_BIGRAM_FREQUENCY = 127; static final int LARGE_DICTIONARY_THRESHOLD = 200 * 1000; @@ -49,10 +64,13 @@ public class Suggest implements Dictionary.WordCallback { private Dictionary mContactsDictionary; private int mPrefMaxSuggestions = 12; + private int mPrefMaxBigrams = 255; private boolean mAutoTextEnabled; private int[] mPriorities = new int[mPrefMaxSuggestions]; + private int[] mBigramPriorities = new int[mPrefMaxBigrams]; + // Handle predictive correction for only the first 1280 characters for performance reasons // If we support scripts that need latin characters beyond that, we should probably use some // kind of a sparse array or language specific list with a mapping lookup table. @@ -60,6 +78,7 @@ public class Suggest implements Dictionary.WordCallback { // latin characters. private int[] mNextLettersFrequencies = new int[1280]; private ArrayList mSuggestions = new ArrayList(); + private ArrayList mBigramSuggestions = new ArrayList(); private ArrayList mStringPool = new ArrayList(); private boolean mHaveCorrection; private CharSequence mOriginalWord; @@ -80,7 +99,7 @@ public class Suggest implements Dictionary.WordCallback { private void initPool() { for (int i = 0; i < mPrefMaxSuggestions; i++) { - StringBuilder sb = new StringBuilder(32); + StringBuilder sb = new StringBuilder(Dictionary.MAX_WORD_LENGTH); mStringPool.add(sb); } } @@ -132,9 +151,10 @@ public class Suggest implements Dictionary.WordCallback { } mPrefMaxSuggestions = maxSuggestions; mPriorities = new int[mPrefMaxSuggestions]; - collectGarbage(); + mBigramPriorities = new int[mPrefMaxBigrams]; + collectGarbage(mSuggestions, mPrefMaxSuggestions); while (mStringPool.size() < mPrefMaxSuggestions) { - StringBuilder sb = new StringBuilder(32); + StringBuilder sb = new StringBuilder(Dictionary.MAX_WORD_LENGTH); mStringPool.add(sb); } } @@ -169,17 +189,16 @@ public class Suggest implements Dictionary.WordCallback { /** * Returns a list of words that match the 
list of character codes passed in. * This list will be overwritten the next time this function is called. - * @param a view for retrieving the context for AutoText - * @param codes the list of codes. Each list item contains an array of character codes - * in order of probability where the character at index 0 in the array has the highest - * probability. + * @param view a view for retrieving the context for AutoText + * @param wordComposer contains what is currently being typed + * @param prevWordForBigram previous word (used only for bigram) * @return list of suggestions. */ public List getSuggestions(View view, WordComposer wordComposer, - boolean includeTypedWordIfValid) { + boolean includeTypedWordIfValid, CharSequence prevWordForBigram) { mHaveCorrection = false; mCapitalize = wordComposer.isCapitalized(); - collectGarbage(); + collectGarbage(mSuggestions, mPrefMaxSuggestions); Arrays.fill(mPriorities, 0); Arrays.fill(mNextLettersFrequencies, 0); @@ -191,8 +210,39 @@ public class Suggest implements Dictionary.WordCallback { } else { mLowerOriginalWord = ""; } - // Search the dictionary only if there are at least 2 characters - if (wordComposer.size() > 1) { + + if (wordComposer.size() == 1 && (mCorrectionMode == CORRECTION_FULL_BIGRAM + || mCorrectionMode == CORRECTION_BASIC)) { + // At first character, just get the bigrams + Arrays.fill(mBigramPriorities, 0); + collectGarbage(mBigramSuggestions, mPrefMaxBigrams); + + if (!TextUtils.isEmpty(prevWordForBigram)) { + CharSequence lowerPrevWord = prevWordForBigram.toString().toLowerCase(); + if (mMainDict.isValidWord(lowerPrevWord)) { + prevWordForBigram = lowerPrevWord; + } + mMainDict.getBigrams(wordComposer, prevWordForBigram, this, + mNextLettersFrequencies); + char currentChar = wordComposer.getTypedWord().charAt(0); + int count = 0; + int bigramSuggestionSize = mBigramSuggestions.size(); + for (int i = 0; i < bigramSuggestionSize; i++) { + if (mBigramSuggestions.get(i).charAt(0) == currentChar) { + int 
poolSize = mStringPool.size(); + StringBuilder sb = poolSize > 0 ? + (StringBuilder) mStringPool.remove(poolSize - 1) + : new StringBuilder(Dictionary.MAX_WORD_LENGTH); + sb.setLength(0); + sb.append(mBigramSuggestions.get(i)); + mSuggestions.add(count++, sb); + if (count >= mPrefMaxSuggestions) break; // stop at the configured maximum, not one past it + } + } + } + + } else if (wordComposer.size() > 1) { + // Search the dictionary only if there are at least 2 characters if (mUserDictionary != null || mContactsDictionary != null) { if (mUserDictionary != null) { mUserDictionary.getWords(wordComposer, this, mNextLettersFrequencies); @@ -202,21 +252,26 @@ public class Suggest implements Dictionary.WordCallback { } if (mSuggestions.size() > 0 && isValidWord(mOriginalWord) - && mCorrectionMode == CORRECTION_FULL) { + && (mCorrectionMode == CORRECTION_FULL + || mCorrectionMode == CORRECTION_FULL_BIGRAM)) { mHaveCorrection = true; } } mMainDict.getWords(wordComposer, this, mNextLettersFrequencies); - if (mCorrectionMode == CORRECTION_FULL && mSuggestions.size() > 0) { + if ((mCorrectionMode == CORRECTION_FULL || mCorrectionMode == CORRECTION_FULL_BIGRAM) + && mSuggestions.size() > 0) { mHaveCorrection = true; } } + if (mOriginalWord != null) { mSuggestions.add(0, mOriginalWord.toString()); } - + // Check if the first suggestion has a minimum number of characters in common - if (mCorrectionMode == CORRECTION_FULL && mSuggestions.size() > 1) { + if (wordComposer.size() > 1 && mSuggestions.size() > 1 + && (mCorrectionMode == CORRECTION_FULL + || mCorrectionMode == CORRECTION_FULL_BIGRAM)) { if (!haveSufficientCommonality(mLowerOriginalWord, mSuggestions.get(1))) { mHaveCorrection = false; } @@ -247,7 +302,6 @@ public class Suggest implements Dictionary.WordCallback { i++; } } - removeDupes(); return mSuggestions; } @@ -301,20 +355,50 @@ public class Suggest implements Dictionary.WordCallback { return false; } - public boolean addWord(final char[] word, final int offset, final int length, final int freq) { + public boolean 
addWord(final char[] word, final int offset, final int length, int freq, + final Dictionary.DataType dataType) { + ArrayList suggestions; + int[] priorities; + int prefMaxSuggestions; + if(dataType == Dictionary.DataType.BIGRAM) { + suggestions = mBigramSuggestions; + priorities = mBigramPriorities; + prefMaxSuggestions = mPrefMaxBigrams; + } else { + suggestions = mSuggestions; + priorities = mPriorities; + prefMaxSuggestions = mPrefMaxSuggestions; + } + int pos = 0; - final int[] priorities = mPriorities; - final int prefMaxSuggestions = mPrefMaxSuggestions; + // Check if it's the same word, only caps are different if (compareCaseInsensitive(mLowerOriginalWord, word, offset, length)) { pos = 0; } else { + if (dataType == Dictionary.DataType.UNIGRAM) { + // Check if the word was already added before (by bigram data) + int bigramSuggestion = searchBigramSuggestion(word,offset,length); + if(bigramSuggestion >= 0) { + // turn freq from bigram into multiplier specified above + double multiplier = (((double) mBigramPriorities[bigramSuggestion]) + / MAXIMUM_BIGRAM_FREQUENCY) + * (BIGRAM_MULTIPLIER_MAX - BIGRAM_MULTIPLIER_MIN) + + BIGRAM_MULTIPLIER_MIN; + /* Log.d("Suggest","bigram num: " + bigramSuggestion + + " wordB: " + mBigramSuggestions.get(bigramSuggestion).toString() + + " currentPriority: " + freq + " bigramPriority: " + + mBigramPriorities[bigramSuggestion] + + " multiplier: " + multiplier); */ + freq = (int)Math.round((freq * multiplier)); + } + } + // Check the last one's priority and bail if (priorities[prefMaxSuggestions - 1] >= freq) return true; while (pos < prefMaxSuggestions) { if (priorities[pos] < freq - || (priorities[pos] == freq && length < mSuggestions - .get(pos).length())) { + || (priorities[pos] == freq && length < suggestions.get(pos).length())) { break; } pos++; @@ -324,12 +408,13 @@ public class Suggest implements Dictionary.WordCallback { if (pos >= prefMaxSuggestions) { return true; } + System.arraycopy(priorities, pos, priorities, pos + 
1, prefMaxSuggestions - pos - 1); priorities[pos] = freq; int poolSize = mStringPool.size(); StringBuilder sb = poolSize > 0 ? (StringBuilder) mStringPool.remove(poolSize - 1) - : new StringBuilder(32); + : new StringBuilder(Dictionary.MAX_WORD_LENGTH); sb.setLength(0); if (mCapitalize) { sb.append(Character.toUpperCase(word[offset])); @@ -339,9 +424,9 @@ public class Suggest implements Dictionary.WordCallback { } else { sb.append(word, offset, length); } - mSuggestions.add(pos, sb); - if (mSuggestions.size() > prefMaxSuggestions) { - CharSequence garbage = mSuggestions.remove(prefMaxSuggestions); + suggestions.add(pos, sb); + if (suggestions.size() > prefMaxSuggestions) { + CharSequence garbage = suggestions.remove(prefMaxSuggestions); if (garbage instanceof StringBuilder) { mStringPool.add(garbage); } @@ -349,6 +434,26 @@ public class Suggest implements Dictionary.WordCallback { return true; } + private int searchBigramSuggestion(final char[] word, final int offset, final int length) { + // TODO This is almost O(n^2). Might need fix. 
+ // search whether the word appeared in bigram data + int bigramSuggestSize = mBigramSuggestions.size(); + for(int i = 0; i < bigramSuggestSize; i++) { + if(mBigramSuggestions.get(i).length() == length) { + boolean chk = true; + for(int j = 0; j < length; j++) { + if(mBigramSuggestions.get(i).charAt(j) != word[offset+j]) { + chk = false; + break; + } + } + if(chk) return i; + } + } + + return -1; + } + public boolean isValidWord(final CharSequence word) { if (word == null || word.length() == 0) { return false; @@ -359,21 +464,21 @@ public class Suggest implements Dictionary.WordCallback { || (mContactsDictionary != null && mContactsDictionary.isValidWord(word)); } - private void collectGarbage() { + private void collectGarbage(ArrayList suggestions, int prefMaxSuggestions) { int poolSize = mStringPool.size(); - int garbageSize = mSuggestions.size(); - while (poolSize < mPrefMaxSuggestions && garbageSize > 0) { - CharSequence garbage = mSuggestions.get(garbageSize - 1); + int garbageSize = suggestions.size(); + while (poolSize < prefMaxSuggestions && garbageSize > 0) { + CharSequence garbage = suggestions.get(garbageSize - 1); if (garbage != null && garbage instanceof StringBuilder) { mStringPool.add(garbage); poolSize++; } garbageSize--; } - if (poolSize == mPrefMaxSuggestions + 1) { + if (poolSize == prefMaxSuggestions + 1) { Log.w("Suggest", "String pool got too big: " + poolSize); } - mSuggestions.clear(); + suggestions.clear(); } public void close() { diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp index bb45cb538..4fe80da69 100644 --- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp +++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp @@ -59,8 +59,7 @@ static int latinime_BinaryDictionary_getSuggestions( jint maxAlternatives, jint skipPos, jintArray nextLettersArray, jint nextLettersSize) { Dictionary *dictionary = (Dictionary*) dict; - if 
(dictionary == NULL) - return 0; + if (dictionary == NULL) return 0; int *frequencies = env->GetIntArrayElements(frequencyArray, NULL); int *inputCodes = env->GetIntArrayElements(inputArray, NULL); @@ -81,6 +80,28 @@ static int latinime_BinaryDictionary_getSuggestions( return count; } +static int latinime_BinaryDictionary_getBigrams + (JNIEnv *env, jobject object, jint dict, jcharArray wordArray, jint wordLength, + jcharArray outputArray, jintArray frequencyArray, jint maxWordLength, jint maxBigrams) +{ + Dictionary *dictionary = (Dictionary*) dict; + if (dictionary == NULL) return 0; + + jchar *word = env->GetCharArrayElements(wordArray, NULL); + jchar *outputChars = env->GetCharArrayElements(outputArray, NULL); + int *frequencies = env->GetIntArrayElements(frequencyArray, NULL); + + int count = dictionary->getBigrams((unsigned short*) word, wordLength, + (unsigned short*) outputChars, frequencies, maxWordLength, maxBigrams); + + env->ReleaseCharArrayElements(wordArray, word, JNI_ABORT); + env->ReleaseCharArrayElements(outputArray, outputChars, 0); + env->ReleaseIntArrayElements(frequencyArray, frequencies, 0); + + return count; +} + + static jboolean latinime_BinaryDictionary_isValidWord (JNIEnv *env, jobject object, jint dict, jcharArray wordArray, jint wordLength) { @@ -108,7 +129,8 @@ static JNINativeMethod gMethods[] = { (void*)latinime_BinaryDictionary_open}, {"closeNative", "(I)V", (void*)latinime_BinaryDictionary_close}, {"getSuggestionsNative", "(I[II[C[IIIII[II)I", (void*)latinime_BinaryDictionary_getSuggestions}, - {"isValidWordNative", "(I[CI)Z", (void*)latinime_BinaryDictionary_isValidWord} + {"isValidWordNative", "(I[CI)Z", (void*)latinime_BinaryDictionary_isValidWord}, + {"getBigramsNative", "(I[CI[C[III)I", (void*)latinime_BinaryDictionary_getBigrams} }; static int registerNativeMethods(JNIEnv* env, const char* className, diff --git a/native/src/dictionary.cpp b/native/src/dictionary.cpp index e75beb5b7..a1a632faa 100644 --- 
a/native/src/dictionary.cpp +++ b/native/src/dictionary.cpp @@ -19,6 +19,7 @@ #include #include #include +//#define LOG_TAG "dictionary.cpp" //#include #define LOGI @@ -27,6 +28,9 @@ #include "char_utils.h" #define DEBUG_DICT 0 +#define DICTIONARY_VERSION_MIN 200 +#define DICTIONARY_HEADER_SIZE 2 +#define NOT_VALID_WORD -99 namespace latinime { @@ -35,6 +39,7 @@ Dictionary::Dictionary(void *dict, int typedLetterMultiplier, int fullWordMultip mDict = (unsigned char*) dict; mTypedLetterMultiplier = typedLetterMultiplier; mFullWordMultiplier = fullWordMultiplier; + getVersionNumber(); } Dictionary::~Dictionary() @@ -58,7 +63,11 @@ int Dictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWor mNextLettersFrequencies = nextLetters; mNextLettersSize = nextLettersSize; - getWordsRec(0, 0, mInputLength * 3, false, 1, 0, 0); + if (checkIfDictVersionIsLatest()) { + getWordsRec(DICTIONARY_HEADER_SIZE, 0, mInputLength * 3, false, 1, 0, 0); + } else { + getWordsRec(0, 0, mInputLength * 3, false, 1, 0, 0); + } // Get the word count suggWords = 0; @@ -85,6 +94,21 @@ Dictionary::registerNextLetter(unsigned short c) } } +void +Dictionary::getVersionNumber() +{ + mVersion = (mDict[0] & 0xFF); + mBigram = (mDict[1] & 0xFF); + LOGI("IN NATIVE SUGGEST Version: %d Bigram : %d \n", mVersion, mBigram); +} + +// Checks whether it has the latest dictionary or the old dictionary +bool +Dictionary::checkIfDictVersionIsLatest() +{ + return (mVersion >= DICTIONARY_VERSION_MIN) && (mBigram == 1 || mBigram == 0); +} + unsigned short Dictionary::getChar(int *pos) { @@ -112,6 +136,28 @@ Dictionary::getAddress(int *pos) return address; } +int +Dictionary::getFreq(int *pos) +{ + int freq = mDict[(*pos)++] & 0xFF; + + if (checkIfDictVersionIsLatest()) { + // skipping bigram + int bigramExist = (mDict[*pos] & FLAG_BIGRAM_READ); + if (bigramExist > 0) { + int nextBigramExist = 1; + while (nextBigramExist > 0) { + (*pos) += 3; + nextBigramExist = (mDict[(*pos)++] & 
FLAG_BIGRAM_CONTINUED); + } + } else { + (*pos)++; + } + } + + return freq; +} + int Dictionary::wideStrLen(unsigned short *str) { @@ -161,6 +207,46 @@ Dictionary::addWord(unsigned short *word, int length, int frequency) return false; } +bool +Dictionary::addWordBigram(unsigned short *word, int length, int frequency) +{ + word[length] = 0; + if (DEBUG_DICT) { + char s[length + 1]; + for (int i = 0; i <= length; i++) s[i] = word[i]; + LOGI("Bigram: Found word = %s, freq = %d : \n", s, frequency); + } + + // Find the right insertion point + int insertAt = 0; + while (insertAt < mMaxBigrams) { + if (frequency > mBigramFreq[insertAt] + || (mBigramFreq[insertAt] == frequency + && length < wideStrLen(mBigramChars + insertAt * mMaxWordLength))) { + break; + } + insertAt++; + } + LOGI("Bigram: InsertAt -> %d maxBigrams: %d\n", insertAt, mMaxBigrams); + if (insertAt < mMaxBigrams) { + memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]), + (char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]), + (mMaxBigrams - insertAt - 1) * sizeof(mBigramFreq[0])); + mBigramFreq[insertAt] = frequency; + memmove((char*) mBigramChars + (insertAt + 1) * mMaxWordLength * sizeof(short), + (char*) mBigramChars + (insertAt ) * mMaxWordLength * sizeof(short), + (mMaxBigrams - insertAt - 1) * sizeof(short) * mMaxWordLength); + unsigned short *dest = mBigramChars + (insertAt ) * mMaxWordLength; + while (length--) { + *dest++ = *word++; + } + *dest = 0; // NULL terminate + if (DEBUG_DICT) LOGI("Bigram: Added word at %d\n", insertAt); + return true; + } + return false; +} + unsigned short Dictionary::toLowerCase(unsigned short c) { if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) { @@ -213,12 +299,17 @@ Dictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion, int s } for (int i = 0; i < count; i++) { + // -- at char unsigned short c = getChar(&pos); + // -- at flag/add unsigned short lowerC = toLowerCase(c); bool terminal = getTerminal(&pos); int 
childrenAddress = getAddress(&pos); + // -- after address or flag int freq = 1; if (terminal) freq = getFreq(&pos); + // -- after add or freq + // If we are only doing completions, no need to look at the typed characters. if (completion) { mWord[depth] = c; @@ -232,7 +323,7 @@ Dictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion, int s getWordsRec(childrenAddress, depth + 1, maxDepth, completion, snr, inputIndex, diffs); } - } else if (c == QUOTE && currentChars[0] != QUOTE || mSkipPos == depth) { + } else if ((c == QUOTE && currentChars[0] != QUOTE) || mSkipPos == depth) { // Skip the ' or other letter and continue deeper mWord[depth] = c; if (childrenAddress != 0) { @@ -270,14 +361,185 @@ Dictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion, int s } } -bool -Dictionary::isValidWord(unsigned short *word, int length) +int +Dictionary::getBigramAddress(int *pos, bool advance) { - return isValidWordRec(0, word, 0, length); + int address = 0; + + address += (mDict[*pos] & 0x3F) << 16; + address += (mDict[*pos + 1] & 0xFF) << 8; + address += (mDict[*pos + 2] & 0xFF); + + if (advance) { + *pos += 3; + } + + return address; +} + +int +Dictionary::getBigramFreq(int *pos) +{ + int freq = mDict[(*pos)++] & FLAG_BIGRAM_FREQ; + + return freq; +} + + +int +Dictionary::getBigrams(unsigned short *prevWord, int prevWordLength, unsigned short *bigramChars, + int *bigramFreq, int maxWordLength, int maxBigrams) +{ + mBigramFreq = bigramFreq; + mBigramChars = bigramChars; + mMaxWordLength = maxWordLength; + mMaxBigrams = maxBigrams; + + if (mBigram == 1 && checkIfDictVersionIsLatest()) { + int pos = isValidWordRec(DICTIONARY_HEADER_SIZE, prevWord, 0, prevWordLength); + LOGI("Pos -> %d\n", pos); + if (pos < 0) { + return 0; + } + + int bigramCount = 0; + int bigramExist = (mDict[pos] & FLAG_BIGRAM_READ); + if (bigramExist > 0) { + int nextBigramExist = 1; + while (nextBigramExist > 0) { + int bigramAddress = getBigramAddress(&pos, true); + int 
frequency = (FLAG_BIGRAM_FREQ & mDict[pos]); + // search for all bigrams and store them + searchForTerminalNode(bigramAddress, frequency); + nextBigramExist = (mDict[pos++] & FLAG_BIGRAM_CONTINUED); + bigramCount++; + } + } + + return bigramCount; + } + return 0; +} + +void +Dictionary::searchForTerminalNode(int addressLookingFor, int frequency) +{ + // track word with such address and store it in an array + unsigned short word[mMaxWordLength]; + + int pos; + int followDownBranchAddress = DICTIONARY_HEADER_SIZE; + bool found = false; + char followingChar = ' '; + int depth = -1; + + while(!found) { + bool followDownAddressSearchStop = false; + bool firstAddress = true; + bool haveToSearchAll = true; + + if (depth >= 0) { + word[depth] = (unsigned short) followingChar; + } + pos = followDownBranchAddress; // pos start at count + int count = mDict[pos] & 0xFF; + LOGI("count - %d\n",count); + pos++; + for (int i = 0; i < count; i++) { + // pos at data + pos++; + // pos now at flag + if (!getFirstBitOfByte(&pos)) { // non-terminal + if (!followDownAddressSearchStop) { + int addr = getBigramAddress(&pos, false); + if (addr > addressLookingFor) { + followDownAddressSearchStop = true; + if (firstAddress) { + firstAddress = false; + haveToSearchAll = true; + } else if (!haveToSearchAll) { + break; + } + } else { + followDownBranchAddress = addr; + followingChar = (char)(0xFF & mDict[pos-1]); + if (firstAddress) { + firstAddress = false; + haveToSearchAll = false; + } + } + } + pos += 3; + } else if (getFirstBitOfByte(&pos)) { // terminal + if (addressLookingFor == (pos-1)) { // found !! 
+ depth++; + word[depth] = (0xFF & mDict[pos-1]); + found = true; + break; + } + if (getSecondBitOfByte(&pos)) { // address + freq (4 byte) + if (!followDownAddressSearchStop) { + int addr = getBigramAddress(&pos, false); + if (addr > addressLookingFor) { + followDownAddressSearchStop = true; + if (firstAddress) { + firstAddress = false; + haveToSearchAll = true; + } else if (!haveToSearchAll) { + break; + } + } else { + followDownBranchAddress = addr; + followingChar = (char)(0xFF & mDict[pos-1]); + if (firstAddress) { + firstAddress = false; + haveToSearchAll = true; + } + } + } + pos += 4; + } else { // freq only (2 byte) + pos += 2; + } + + // skipping bigram + int bigramExist = (mDict[pos] & FLAG_BIGRAM_READ); + if (bigramExist > 0) { + int nextBigramExist = 1; + while (nextBigramExist > 0) { + pos += 3; + nextBigramExist = (mDict[pos++] & FLAG_BIGRAM_CONTINUED); + } + } else { + pos++; + } + } + } + depth++; + if (followDownBranchAddress == 0) { + LOGI("ERROR!!! Cannot find bigram!!"); + break; + } + } + + addWordBigram(word, depth, frequency); } bool +Dictionary::isValidWord(unsigned short *word, int length) +{ + if (checkIfDictVersionIsLatest()) { + return (isValidWordRec(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD); + } else { + return (isValidWordRec(0, word, 0, length) != NOT_VALID_WORD); + } +} + +int Dictionary::isValidWordRec(int pos, unsigned short *word, int offset, int length) { + // returns address of bigram data of that word + // return -99 if not found + int count = getCount(&pos); unsigned short currentChar = (unsigned short) word[offset]; for (int j = 0; j < count; j++) { @@ -287,12 +549,13 @@ Dictionary::isValidWordRec(int pos, unsigned short *word, int offset, int length if (c == currentChar) { if (offset == length - 1) { if (terminal) { - return true; + return (pos+1); } } else { if (childPos != 0) { - if (isValidWordRec(childPos, word, offset + 1, length)) { - return true; + int t = isValidWordRec(childPos, word, offset + 1, 
length); + if (t > 0) { + return t; } } } @@ -303,7 +566,7 @@ Dictionary::isValidWordRec(int pos, unsigned short *word, int offset, int length // There could be two instances of each alphabet - upper and lower case. So continue // looking ... } - return false; + return NOT_VALID_WORD; } diff --git a/native/src/dictionary.h b/native/src/dictionary.h index 3749f3d88..2c574290f 100644 --- a/native/src/dictionary.h +++ b/native/src/dictionary.h @@ -28,12 +28,19 @@ namespace latinime { // if the word has other endings. #define FLAG_TERMINAL_MASK 0x80 +#define FLAG_BIGRAM_READ 0x80 +#define FLAG_BIGRAM_CHILDEXIST 0x40 +#define FLAG_BIGRAM_CONTINUED 0x80 +#define FLAG_BIGRAM_FREQ 0x7F + class Dictionary { public: Dictionary(void *dict, int typedLetterMultipler, int fullWordMultiplier); int getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies, int maxWordLength, int maxWords, int maxAlternatives, int skipPos, int *nextLetters, int nextLettersSize); + int getBigrams(unsigned short *word, int length, unsigned short *outWords, int *frequencies, + int maxWordLength, int maxBigrams); bool isValidWord(unsigned short *word, int length); void setAsset(void *asset) { mAsset = asset; } void *getAsset() { return mAsset; } @@ -41,28 +48,40 @@ public: private: + void getVersionNumber(); + bool checkIfDictVersionIsLatest(); int getAddress(int *pos); + int getBigramAddress(int *pos, bool advance); + int getFreq(int *pos); + int getBigramFreq(int *pos); + void searchForTerminalNode(int address, int frequency); + + bool getFirstBitOfByte(int *pos) { return (mDict[*pos] & 0x80) > 0; } + bool getSecondBitOfByte(int *pos) { return (mDict[*pos] & 0x40) > 0; } bool getTerminal(int *pos) { return (mDict[*pos] & FLAG_TERMINAL_MASK) > 0; } - int getFreq(int *pos) { return mDict[(*pos)++] & 0xFF; } int getCount(int *pos) { return mDict[(*pos)++] & 0xFF; } unsigned short getChar(int *pos); int wideStrLen(unsigned short *str); bool sameAsTyped(unsigned short *word, int 
length); bool addWord(unsigned short *word, int length, int frequency); + bool addWordBigram(unsigned short *word, int length, int frequency); unsigned short toLowerCase(unsigned short c); void getWordsRec(int pos, int depth, int maxDepth, bool completion, int frequency, int inputIndex, int diffs); - bool isValidWordRec(int pos, unsigned short *word, int offset, int length); + int isValidWordRec(int pos, unsigned short *word, int offset, int length); void registerNextLetter(unsigned short c); unsigned char *mDict; void *mAsset; int *mFrequencies; + int *mBigramFreq; int mMaxWords; + int mMaxBigrams; int mMaxWordLength; unsigned short *mOutputChars; + unsigned short *mBigramChars; int *mInputCodes; int mInputLength; int mMaxAlternatives; @@ -74,6 +93,8 @@ private: int mTypedLetterMultiplier; int *mNextLettersFrequencies; int mNextLettersSize; + int mVersion; + int mBigram; }; // ---------------------------------------------------------------------------- diff --git a/tests/data/bigramlist.xml b/tests/data/bigramlist.xml new file mode 100644 index 000000000..dd3f2916e --- /dev/null +++ b/tests/data/bigramlist.xml @@ -0,0 +1,36 @@ + + + + + + + + + + + + + + + + + + + diff --git a/tests/data/wordlist.xml b/tests/data/wordlist.xml index 22d0caa38..b870eb2a3 100644 --- a/tests/data/wordlist.xml +++ b/tests/data/wordlist.xml @@ -225,6 +225,7 @@ services niño María + car hmmm hon tty diff --git a/tests/res/raw/test.dict b/tests/res/raw/test.dict index e789aaa9a..6a5d6d794 100644 Binary files a/tests/res/raw/test.dict and b/tests/res/raw/test.dict differ diff --git a/tests/src/com/android/inputmethod/latin/tests/SuggestTests.java b/tests/src/com/android/inputmethod/latin/tests/SuggestTests.java index 9401d926a..59720640a 100644 --- a/tests/src/com/android/inputmethod/latin/tests/SuggestTests.java +++ b/tests/src/com/android/inputmethod/latin/tests/SuggestTests.java @@ -71,7 +71,7 @@ public class SuggestTests extends AndroidTestCase { Log.w(TAG, "No available size for 
binary dictionary"); } mSuggest.setAutoTextEnabled(false); - mSuggest.setCorrectionMode(Suggest.CORRECTION_FULL); + mSuggest.setCorrectionMode(Suggest.CORRECTION_FULL_BIGRAM); } /************************** Helper functions ************************/ @@ -108,19 +108,56 @@ public class SuggestTests extends AndroidTestCase { private boolean isDefaultSuggestion(CharSequence typed, CharSequence expected) { WordComposer word = createWordComposer(typed); - List suggestions = mSuggest.getSuggestions(null, word, false); + List suggestions = mSuggest.getSuggestions(null, word, false, null); + return isDefaultSuggestion(suggestions, expected); + } + + private void getBigramSuggestions(CharSequence previous, CharSequence typed) { + if(!TextUtils.isEmpty(previous) && (typed.length() > 1)) { + WordComposer firstChar = createWordComposer(typed.charAt(0) + ""); + mSuggest.getSuggestions(null, firstChar, false, previous); + } + } + + private boolean isDefaultNextSuggestion(CharSequence previous, CharSequence typed, + CharSequence expected) { + WordComposer word = createWordComposer(typed); + getBigramSuggestions(previous, typed); + List suggestions = mSuggest.getSuggestions(null, word, false, previous); return isDefaultSuggestion(suggestions, expected); } private boolean isDefaultCorrection(CharSequence typed, CharSequence expected) { WordComposer word = createWordComposer(typed); - List suggestions = mSuggest.getSuggestions(null, word, false); + List suggestions = mSuggest.getSuggestions(null, word, false, null); + return isDefaultSuggestion(suggestions, expected) && mSuggest.hasMinimalCorrection(); + } + + private boolean isDefaultNextCorrection(CharSequence previous, CharSequence typed, + CharSequence expected) { + WordComposer word = createWordComposer(typed); + getBigramSuggestions(previous, typed); + List suggestions = mSuggest.getSuggestions(null, word, false, previous); + for(int i=0;i suggestions = mSuggest.getSuggestions(null, word, false); + List suggestions = 
mSuggest.getSuggestions(null, word, false, null); + for (int i = 1; i < suggestions.size(); i++) { + if (TextUtils.equals(suggestions.get(i), expected)) return true; + } + return false; + } + + private boolean isASuggestion(CharSequence previous, CharSequence typed, + CharSequence expected) { + WordComposer word = createWordComposer(typed); + getBigramSuggestions(previous, typed); + List suggestions = mSuggest.getSuggestions(null, word, false, previous); for (int i = 1; i < suggestions.size(); i++) { if (TextUtils.equals(suggestions.get(i), expected)) return true; } @@ -241,8 +278,30 @@ public class SuggestTests extends AndroidTestCase { * Are accented forms of words suggested as corrections? */ public void testAccents() { - assertTrue(isDefaultCorrection("nino", "ni\u00F1o")); // ni–o - assertTrue(isDefaultCorrection("nimo", "ni\u00F1o")); // ni–o - assertTrue(isDefaultCorrection("maria", "Mar\u00EDa")); // Mar’a + assertTrue(isDefaultCorrection("nino", "ni\u00F1o")); // niño + assertTrue(isDefaultCorrection("nimo", "ni\u00F1o")); // niño + assertTrue(isDefaultCorrection("maria", "Mar\u00EDa")); // María + } + + /** + * Make sure bigrams are showing when first character is typed + * and don't show any when there aren't any + */ + public void testBigramsAtFirstChar() { + assertTrue(isDefaultNextCorrection("about", "p", "part")); + assertTrue(isDefaultNextCorrection("I'm", "a", "about")); + assertTrue(isDefaultNextCorrection("about", "b", "business")); + assertTrue(isASuggestion("about", "b", "being")); + assertFalse(isDefaultNextSuggestion("about", "p", "business")); + } + + /** + * Make sure bigrams score affects the original score + */ + public void testBigramsScoreEffect() { + assertTrue(isDefaultCorrection("pa", "page")); + assertTrue(isDefaultNextCorrection("about", "pa", "part")); + assertTrue(isDefaultCorrection("sa", "said")); + assertTrue(isDefaultNextCorrection("from", "sa", "same")); } }