From 16cc3992d7468ef781753df7b4227330e0834501 Mon Sep 17 00:00:00 2001 From: Keisuke Kuroyanagi Date: Thu, 23 Oct 2014 14:32:45 +0900 Subject: [PATCH] Use trigrams for personalization dict. 5Bug: 14425059 Change-Id: I73cf6904e569d60996a3b079f16ea6df0cb90f02 --- .../inputmethod/latin/BinaryDictionary.java | 18 +- .../latin/DictionaryFacilitator.java | 6 +- .../latin/ExpandableBinaryDictionary.java | 18 +- ...izationHelperForDictionaryFacilitator.java | 26 +-- .../latin/utils/LanguageModelParam.java | 166 ------------------ .../WordInputEventForPersonalization.java | 117 ++++++++++++ ...oid_inputmethod_latin_BinaryDictionary.cpp | 126 +++++-------- native/jni/src/utils/jni_data_utils.h | 3 + .../latin/BinaryDictionaryDecayingTests.java | 64 +++++++ .../latin/BinaryDictionaryTests.java | 58 ------ .../PersonalizationDictionaryTests.java | 6 +- 11 files changed, 265 insertions(+), 343 deletions(-) delete mode 100644 java/src/com/android/inputmethod/latin/utils/LanguageModelParam.java create mode 100644 java/src/com/android/inputmethod/latin/utils/WordInputEventForPersonalization.java diff --git a/java/src/com/android/inputmethod/latin/BinaryDictionary.java b/java/src/com/android/inputmethod/latin/BinaryDictionary.java index cfe1ea6a7..dce321795 100644 --- a/java/src/com/android/inputmethod/latin/BinaryDictionary.java +++ b/java/src/com/android/inputmethod/latin/BinaryDictionary.java @@ -32,8 +32,8 @@ import com.android.inputmethod.latin.settings.SettingsValuesForSuggestion; import com.android.inputmethod.latin.utils.BinaryDictionaryUtils; import com.android.inputmethod.latin.utils.FileUtils; import com.android.inputmethod.latin.utils.JniUtils; -import com.android.inputmethod.latin.utils.LanguageModelParam; import com.android.inputmethod.latin.utils.StringUtils; +import com.android.inputmethod.latin.utils.WordInputEventForPersonalization; import java.io.File; import java.util.ArrayList; @@ -205,8 +205,8 @@ public final class BinaryDictionary extends Dictionary { private static native boolean updateEntriesForWordWithNgramContextNative(long dict, int[][] prevWordCodePointArrays, boolean[] isBeginningOfSentenceArray, int[] word, boolean isValidWord, int count, int timestamp); - private static native int addMultipleDictionaryEntriesNative(long dict, - LanguageModelParam[] languageModelParams, int startIndex); + private static native int updateEntriesForInputEventsNative(long dict, + WordInputEventForPersonalization[] inputEvents, int startIndex); private static native String getPropertyNative(long dict, String query); private static native boolean isCorruptedNative(long dict); private static native boolean migrateNative(long dict, String dictFilePath, @@ -526,19 +526,19 @@ public final class BinaryDictionary extends Dictionary { } @UsedForTesting - public void addMultipleDictionaryEntries(final LanguageModelParam[] languageModelParams) { + public void updateEntriesForInputEvents(final WordInputEventForPersonalization[] inputEvents) { if (!isValidDictionary()) { return; } - int processedParamCount = 0; - while (processedParamCount < languageModelParams.length) { + int processedEventCount = 0; + while (processedEventCount < inputEvents.length) { if (needsToRunGC(true /* mindsBlockByGC */)) { flushWithGC(); } - processedParamCount = addMultipleDictionaryEntriesNative(mNativeDict, - languageModelParams, processedParamCount); + processedEventCount = updateEntriesForInputEventsNative(mNativeDict, inputEvents, + processedEventCount); mHasUpdated = true; - if (processedParamCount <= 0) { + if (processedEventCount <= 0) { return; } } diff --git a/java/src/com/android/inputmethod/latin/DictionaryFacilitator.java b/java/src/com/android/inputmethod/latin/DictionaryFacilitator.java index 25cbabfea..5bf6bf47d 100644 --- a/java/src/com/android/inputmethod/latin/DictionaryFacilitator.java +++ b/java/src/com/android/inputmethod/latin/DictionaryFacilitator.java @@ -24,7 +24,7 @@ import android.view.inputmethod.InputMethodSubtype; import com.android.inputmethod.annotations.UsedForTesting; import com.android.inputmethod.keyboard.ProximityInfo; -import com.android.inputmethod.latin.ExpandableBinaryDictionary.AddMultipleDictionaryEntriesCallback; +import com.android.inputmethod.latin.ExpandableBinaryDictionary.UpdateEntriesForInputEventsCallback; import com.android.inputmethod.latin.NgramContext.WordInfo; import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo; import com.android.inputmethod.latin.personalization.ContextualDictionary; @@ -796,8 +796,8 @@ public class DictionaryFacilitator { public void addEntriesToPersonalizationDictionary( final PersonalizationDataChunk personalizationDataChunk, final SpacingAndPunctuations spacingAndPunctuations, - final AddMultipleDictionaryEntriesCallback callback) { - mPersonalizationHelper.addEntriesToPersonalizationDictionariesToUpdate( + final UpdateEntriesForInputEventsCallback callback) { + mPersonalizationHelper.updateEntriesOfPersonalizationDictionaries( getMostProbableLocale(), personalizationDataChunk, spacingAndPunctuations, callback); } diff --git a/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java b/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java index 97af02f21..a74ffcb26 100644 --- a/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java +++ b/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java @@ -32,7 +32,7 @@ import com.android.inputmethod.latin.utils.CombinedFormatUtils; import com.android.inputmethod.latin.utils.DistracterFilter; import com.android.inputmethod.latin.utils.ExecutorUtils; import com.android.inputmethod.latin.utils.FileUtils; -import com.android.inputmethod.latin.utils.LanguageModelParam; +import com.android.inputmethod.latin.utils.WordInputEventForPersonalization; import java.io.File; import java.util.ArrayList; @@ -447,16 +447,16 @@ abstract public class ExpandableBinaryDictionary extends Dictionary { }, word, distracterFilter); } - public interface AddMultipleDictionaryEntriesCallback { + public interface UpdateEntriesForInputEventsCallback { public void onFinished(); } /** - * Dynamically add multiple entries to the dictionary. + * Dynamically update entries according to input events. */ - public void addMultipleDictionaryEntriesDynamically( - @Nonnull final ArrayList languageModelParams, - final AddMultipleDictionaryEntriesCallback callback) { + public void updateEntriesForInputEvents( + @Nonnull final ArrayList inputEvents, + final UpdateEntriesForInputEventsCallback callback) { reloadDictionaryIfRequired(); asyncExecuteTaskWithWriteLock(new Runnable() { @Override @@ -466,9 +466,9 @@ abstract public class ExpandableBinaryDictionary extends Dictionary { if (binaryDictionary == null) { return; } - binaryDictionary.addMultipleDictionaryEntries( - languageModelParams.toArray( - new LanguageModelParam[languageModelParams.size()])); + binaryDictionary.updateEntriesForInputEvents( + inputEvents.toArray( + new WordInputEventForPersonalization[inputEvents.size()])); } finally { if (callback != null) { callback.onFinished(); diff --git a/java/src/com/android/inputmethod/latin/PersonalizationHelperForDictionaryFacilitator.java b/java/src/com/android/inputmethod/latin/PersonalizationHelperForDictionaryFacilitator.java index 396d062f8..2dbab0a3f 100644 --- a/java/src/com/android/inputmethod/latin/PersonalizationHelperForDictionaryFacilitator.java +++ b/java/src/com/android/inputmethod/latin/PersonalizationHelperForDictionaryFacilitator.java @@ -26,14 +26,14 @@ import java.util.concurrent.atomic.AtomicInteger; import android.content.Context; import android.view.inputmethod.InputMethodSubtype; -import com.android.inputmethod.latin.ExpandableBinaryDictionary.AddMultipleDictionaryEntriesCallback; +import com.android.inputmethod.latin.ExpandableBinaryDictionary.UpdateEntriesForInputEventsCallback; import com.android.inputmethod.latin.personalization.PersonalizationDataChunk; import com.android.inputmethod.latin.personalization.PersonalizationDictionary; import com.android.inputmethod.latin.settings.SpacingAndPunctuations; import com.android.inputmethod.latin.utils.DistracterFilter; import com.android.inputmethod.latin.utils.DistracterFilterCheckingIsInDictionary; -import com.android.inputmethod.latin.utils.LanguageModelParam; import com.android.inputmethod.latin.utils.SubtypeLocaleUtils; +import com.android.inputmethod.latin.utils.WordInputEventForPersonalization; /** * Class for managing and updating personalization dictionaries. @@ -119,10 +119,10 @@ public class PersonalizationHelperForDictionaryFacilitator { return personalizationDict; } - private void addEntriesToPersonalizationDictionariesForLocale(final Locale locale, + private void updateEntriesOfPersonalizationDictionariesForLocale(final Locale locale, final PersonalizationDataChunk personalizationDataChunk, final SpacingAndPunctuations spacingAndPunctuations, - final AddMultipleDictionaryEntriesCallback callback) { + final UpdateEntriesForInputEventsCallback callback) { final ExpandableBinaryDictionary personalizationDict = getPersonalizationDictToUpdate(mContext, locale); if (personalizationDict == null) { @@ -131,25 +131,25 @@ public class PersonalizationHelperForDictionaryFacilitator { } return; } - final ArrayList languageModelParams = - LanguageModelParam.createLanguageModelParamsFrom( + final ArrayList inputEvents = + WordInputEventForPersonalization.createInputEventFrom( personalizationDataChunk.mTokens, personalizationDataChunk.mTimestampInSeconds, spacingAndPunctuations, locale, new DistracterFilterCheckingIsInDictionary( mDistracterFilter, personalizationDict)); - if (languageModelParams == null || languageModelParams.isEmpty()) { + if (inputEvents == null || inputEvents.isEmpty()) { if (callback != null) { callback.onFinished(); } return; } - personalizationDict.addMultipleDictionaryEntriesDynamically(languageModelParams, callback); + personalizationDict.updateEntriesForInputEvents(inputEvents, callback); } - public void addEntriesToPersonalizationDictionariesToUpdate(final Locale defaultLocale, + public void updateEntriesOfPersonalizationDictionaries(final Locale defaultLocale, final PersonalizationDataChunk personalizationDataChunk, final SpacingAndPunctuations spacingAndPunctuations, - final AddMultipleDictionaryEntriesCallback callback) { + final UpdateEntriesForInputEventsCallback callback) { final String language = personalizationDataChunk.mDetectedLanguage; final HashSet locales; if (mIsMonolingualUser && PersonalizationDataChunk.LANGUAGE_UNKNOWN.equals(language) @@ -165,8 +165,8 @@ public class PersonalizationHelperForDictionaryFacilitator { return; } final AtomicInteger remainingTaskCount = new AtomicInteger(locales.size()); - final AddMultipleDictionaryEntriesCallback callbackForLocales = - new AddMultipleDictionaryEntriesCallback() { + final UpdateEntriesForInputEventsCallback callbackForLocales = + new UpdateEntriesForInputEventsCallback() { @Override public void onFinished() { if (remainingTaskCount.decrementAndGet() == 0) { @@ -178,7 +178,7 @@ public class PersonalizationHelperForDictionaryFacilitator { } }; for (final Locale locale : locales) { - addEntriesToPersonalizationDictionariesForLocale(locale, personalizationDataChunk, + updateEntriesOfPersonalizationDictionariesForLocale(locale, personalizationDataChunk, spacingAndPunctuations, callbackForLocales); } } diff --git a/java/src/com/android/inputmethod/latin/utils/LanguageModelParam.java b/java/src/com/android/inputmethod/latin/utils/LanguageModelParam.java deleted file mode 100644 index 3e5cb33ca..000000000 --- a/java/src/com/android/inputmethod/latin/utils/LanguageModelParam.java +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (C) 2014 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.android.inputmethod.latin.utils; - -import android.util.Log; - -import com.android.inputmethod.annotations.UsedForTesting; -import com.android.inputmethod.latin.Dictionary; -import com.android.inputmethod.latin.NgramContext; -import com.android.inputmethod.latin.settings.SpacingAndPunctuations; -import com.android.inputmethod.latin.utils.DistracterFilter.HandlingType; - -import java.util.ArrayList; -import java.util.List; -import java.util.Locale; - -// Note: this class is used as a parameter type of a native method. You should be careful when you -// rename this class or field name. See BinaryDictionary#addMultipleDictionaryEntriesNative(). -public final class LanguageModelParam { - private static final String TAG = LanguageModelParam.class.getSimpleName(); - private static final boolean DEBUG = false; - private static final boolean DEBUG_TOKEN = false; - - // For now, these probability values are being referred to only when we add new entries to - // decaying dynamic binary dictionaries. When these are referred to, what matters is 0 or - // non-0. Thus, it's not meaningful to compare 10, 100, and so on. - // TODO: Revise the logic in ForgettingCurveUtils in native code. - private static final int UNIGRAM_PROBABILITY_FOR_VALID_WORD = 100; - private static final int UNIGRAM_PROBABILITY_FOR_OOV_WORD = Dictionary.NOT_A_PROBABILITY; - private static final int BIGRAM_PROBABILITY_FOR_VALID_WORD = 10; - private static final int BIGRAM_PROBABILITY_FOR_OOV_WORD = Dictionary.NOT_A_PROBABILITY; - - public final CharSequence mTargetWord; - public final int[] mWord0; - public final int[] mWord1; - // TODO: this needs to be a list of shortcuts - public final int[] mShortcutTarget; - public final int mUnigramProbability; - public final int mBigramProbability; - public final int mShortcutProbability; - public final boolean mIsNotAWord; - public final boolean mIsPossiblyOffensive; - // Time stamp in seconds. - public final int mTimestamp; - - // Constructor for unigram. TODO: support shortcuts - @UsedForTesting - public LanguageModelParam(final CharSequence word, final int unigramProbability, - final int timestamp) { - this(null /* word0 */, word, unigramProbability, Dictionary.NOT_A_PROBABILITY, timestamp); - } - - // Constructor for unigram and bigram. - @UsedForTesting - public LanguageModelParam(final CharSequence word0, final CharSequence word1, - final int unigramProbability, final int bigramProbability, - final int timestamp) { - mTargetWord = word1; - mWord0 = (word0 == null) ? null : StringUtils.toCodePointArray(word0); - mWord1 = StringUtils.toCodePointArray(word1); - mShortcutTarget = null; - mUnigramProbability = unigramProbability; - mBigramProbability = bigramProbability; - mShortcutProbability = Dictionary.NOT_A_PROBABILITY; - mIsNotAWord = false; - mIsPossiblyOffensive = false; - mTimestamp = timestamp; - } - - // Process a list of words and return a list of {@link LanguageModelParam} objects. - public static ArrayList createLanguageModelParamsFrom( - final List tokens, final int timestamp, - final SpacingAndPunctuations spacingAndPunctuations, final Locale locale, - final DistracterFilter distracterFilter) { - final ArrayList languageModelParams = new ArrayList<>(); - final int N = tokens.size(); - NgramContext ngramContext = NgramContext.EMPTY_PREV_WORDS_INFO; - for (int i = 0; i < N; ++i) { - final String tempWord = tokens.get(i); - if (StringUtils.isEmptyStringOrWhiteSpaces(tempWord)) { - // just skip this token - if (DEBUG_TOKEN) { - Log.d(TAG, "--- isEmptyStringOrWhiteSpaces: \"" + tempWord + "\""); - } - continue; - } - if (!DictionaryInfoUtils.looksValidForDictionaryInsertion( - tempWord, spacingAndPunctuations)) { - if (DEBUG_TOKEN) { - Log.d(TAG, "--- not looksValidForDictionaryInsertion: \"" - + tempWord + "\""); - } - // Sentence terminator found. Split. - ngramContext = NgramContext.EMPTY_PREV_WORDS_INFO; - continue; - } - if (DEBUG_TOKEN) { - Log.d(TAG, "--- word: \"" + tempWord + "\""); - } - final LanguageModelParam languageModelParam = - detectWhetherVaildWordOrNotAndGetLanguageModelParam( - ngramContext, tempWord, timestamp, locale, distracterFilter); - if (languageModelParam == null) { - continue; - } - languageModelParams.add(languageModelParam); - ngramContext = ngramContext.getNextNgramContext( - new NgramContext.WordInfo(tempWord)); - } - return languageModelParams; - } - - private static LanguageModelParam detectWhetherVaildWordOrNotAndGetLanguageModelParam( - final NgramContext ngramContext, final String targetWord, final int timestamp, - final Locale locale, final DistracterFilter distracterFilter) { - if (locale == null) { - return null; - } - final int wordHandlingType = distracterFilter.getWordHandlingType(ngramContext, - targetWord, locale); - final String word = HandlingType.shouldBeLowerCased(wordHandlingType) ? - targetWord.toLowerCase(locale) : targetWord; - if (distracterFilter.isDistracterToWordsInDictionaries(ngramContext, targetWord, locale)) { - // The word is a distracter. - return null; - } - return createAndGetLanguageModelParamOfWord(ngramContext, word, timestamp, - !HandlingType.shouldBeHandledAsOov(wordHandlingType)); - } - - private static LanguageModelParam createAndGetLanguageModelParamOfWord( - final NgramContext ngramContext, final String word, final int timestamp, - final boolean isValidWord) { - final int unigramProbability = isValidWord ? - UNIGRAM_PROBABILITY_FOR_VALID_WORD : UNIGRAM_PROBABILITY_FOR_OOV_WORD; - if (!ngramContext.isValid()) { - if (DEBUG) { - Log.d(TAG, "--- add unigram: current(" - + (isValidWord ? "Valid" : "OOV") + ") = " + word); - } - return new LanguageModelParam(word, unigramProbability, timestamp); - } - if (DEBUG) { - Log.d(TAG, "--- add bigram: prev = " + ngramContext + ", current(" - + (isValidWord ? "Valid" : "OOV") + ") = " + word); - } - final int bigramProbability = isValidWord ? - BIGRAM_PROBABILITY_FOR_VALID_WORD : BIGRAM_PROBABILITY_FOR_OOV_WORD; - return new LanguageModelParam(ngramContext.getNthPrevWord(1 /* n */), word, - unigramProbability, bigramProbability, timestamp); - } -} diff --git a/java/src/com/android/inputmethod/latin/utils/WordInputEventForPersonalization.java b/java/src/com/android/inputmethod/latin/utils/WordInputEventForPersonalization.java new file mode 100644 index 000000000..644fda57f --- /dev/null +++ b/java/src/com/android/inputmethod/latin/utils/WordInputEventForPersonalization.java @@ -0,0 +1,117 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin.utils; + +import android.util.Log; + +import com.android.inputmethod.annotations.UsedForTesting; +import com.android.inputmethod.latin.Constants; +import com.android.inputmethod.latin.NgramContext; +import com.android.inputmethod.latin.settings.SpacingAndPunctuations; +import com.android.inputmethod.latin.utils.DistracterFilter.HandlingType; + +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; + +// Note: this class is used as a parameter type of a native method. You should be careful when you +// rename this class or field name. See BinaryDictionary#addMultipleDictionaryEntriesNative(). +public final class WordInputEventForPersonalization { + private static final String TAG = WordInputEventForPersonalization.class.getSimpleName(); + private static final boolean DEBUG_TOKEN = false; + + public final int[] mTargetWord; + public final int mPrevWordsCount; + public final int[][] mPrevWordArray = new int[Constants.MAX_PREV_WORD_COUNT_FOR_N_GRAM][]; + public final boolean[] mIsPrevWordBeginningOfSentenceArray = + new boolean[Constants.MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + public final boolean mIsValid; + // Time stamp in seconds. + public final int mTimestamp; + + @UsedForTesting + public WordInputEventForPersonalization(final CharSequence targetWord, + final NgramContext ngramContext, final boolean isValid, final int timestamp) { + mTargetWord = StringUtils.toCodePointArray(targetWord); + mPrevWordsCount = ngramContext.getPrevWordCount(); + ngramContext.outputToArray(mPrevWordArray, mIsPrevWordBeginningOfSentenceArray); + mIsValid = isValid; + mTimestamp = timestamp; + } + + // Process a list of words and return a list of {@link WordInputEventForPersonalization} + // objects. + public static ArrayList createInputEventFrom( + final List tokens, final int timestamp, + final SpacingAndPunctuations spacingAndPunctuations, final Locale locale, + final DistracterFilter distracterFilter) { + final ArrayList inputEvents = new ArrayList<>(); + final int N = tokens.size(); + NgramContext ngramContext = NgramContext.EMPTY_PREV_WORDS_INFO; + for (int i = 0; i < N; ++i) { + final String tempWord = tokens.get(i); + if (StringUtils.isEmptyStringOrWhiteSpaces(tempWord)) { + // just skip this token + if (DEBUG_TOKEN) { + Log.d(TAG, "--- isEmptyStringOrWhiteSpaces: \"" + tempWord + "\""); + } + continue; + } + if (!DictionaryInfoUtils.looksValidForDictionaryInsertion( + tempWord, spacingAndPunctuations)) { + if (DEBUG_TOKEN) { + Log.d(TAG, "--- not looksValidForDictionaryInsertion: \"" + + tempWord + "\""); + } + // Sentence terminator found. Split. + // TODO: Detect whether the context is beginning-of-sentence. + ngramContext = NgramContext.EMPTY_PREV_WORDS_INFO; + continue; + } + if (DEBUG_TOKEN) { + Log.d(TAG, "--- word: \"" + tempWord + "\""); + } + final WordInputEventForPersonalization inputEvent = + detectWhetherVaildWordOrNotAndGetInputEvent( + ngramContext, tempWord, timestamp, locale, distracterFilter); + if (inputEvent == null) { + continue; + } + inputEvents.add(inputEvent); + ngramContext = ngramContext.getNextNgramContext(new NgramContext.WordInfo(tempWord)); + } + return inputEvents; + } + + private static WordInputEventForPersonalization detectWhetherVaildWordOrNotAndGetInputEvent( + final NgramContext ngramContext, final String targetWord, final int timestamp, + final Locale locale, final DistracterFilter distracterFilter) { + if (locale == null) { + return null; + } + final int wordHandlingType = distracterFilter.getWordHandlingType(ngramContext, + targetWord, locale); + final String word = HandlingType.shouldBeLowerCased(wordHandlingType) ? + targetWord.toLowerCase(locale) : targetWord; + if (distracterFilter.isDistracterToWordsInDictionaries(ngramContext, targetWord, locale)) { + // The word is a distracter. + return null; + } + return new WordInputEventForPersonalization(word, ngramContext, + !HandlingType.shouldBeHandledAsOov(wordHandlingType), timestamp); + } +} diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp index 9239c8400..118f600bb 100644 --- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp +++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp @@ -453,98 +453,60 @@ static bool latinime_BinaryDictionary_updateEntriesForWordWithNgramContext(JNIEn historicalInfo); } -// Returns how many language model params are processed. -static int latinime_BinaryDictionary_addMultipleDictionaryEntries(JNIEnv *env, jclass clazz, - jlong dict, jobjectArray languageModelParams, jint startIndex) { +// Returns how many input events are processed. +static int latinime_BinaryDictionary_updateEntriesForInputEvents(JNIEnv *env, jclass clazz, + jlong dict, jobjectArray inputEvents, jint startIndex) { Dictionary *dictionary = reinterpret_cast(dict); if (!dictionary) { return 0; } - jsize languageModelParamCount = env->GetArrayLength(languageModelParams); - if (languageModelParamCount == 0 || startIndex >= languageModelParamCount) { + jsize inputEventCount = env->GetArrayLength(inputEvents); + if (inputEventCount == 0 || startIndex >= inputEventCount) { return 0; } - jobject languageModelParam = env->GetObjectArrayElement(languageModelParams, 0); - jclass languageModelParamClass = env->GetObjectClass(languageModelParam); - env->DeleteLocalRef(languageModelParam); + jobject inputEvent = env->GetObjectArrayElement(inputEvents, 0); + jclass wordInputEventClass = env->GetObjectClass(inputEvent); + env->DeleteLocalRef(inputEvent); - jfieldID word0FieldId = env->GetFieldID(languageModelParamClass, "mWord0", "[I"); - jfieldID word1FieldId = env->GetFieldID(languageModelParamClass, "mWord1", "[I"); - jfieldID unigramProbabilityFieldId = - env->GetFieldID(languageModelParamClass, "mUnigramProbability", "I"); - jfieldID bigramProbabilityFieldId = - env->GetFieldID(languageModelParamClass, "mBigramProbability", "I"); - jfieldID timestampFieldId = - env->GetFieldID(languageModelParamClass, "mTimestamp", "I"); - jfieldID shortcutTargetFieldId = - env->GetFieldID(languageModelParamClass, "mShortcutTarget", "[I"); - jfieldID shortcutProbabilityFieldId = - env->GetFieldID(languageModelParamClass, "mShortcutProbability", "I"); - jfieldID isNotAWordFieldId = - env->GetFieldID(languageModelParamClass, "mIsNotAWord", "Z"); - jfieldID isPossiblyOffensiveFieldId = - env->GetFieldID(languageModelParamClass, "mIsPossiblyOffensive", "Z"); - env->DeleteLocalRef(languageModelParamClass); + jfieldID targetWordFieldId = env->GetFieldID(wordInputEventClass, "mTargetWord", "[I"); + jfieldID prevWordCountFieldId = env->GetFieldID(wordInputEventClass, "mPrevWordsCount", "I"); + jfieldID prevWordArrayFieldId = env->GetFieldID(wordInputEventClass, "mPrevWordArray", "[[I"); + jfieldID isPrevWordBoSArrayFieldId = + env->GetFieldID(wordInputEventClass, "mIsPrevWordBeginningOfSentenceArray", "[Z"); + jfieldID isValidFieldId = env->GetFieldID(wordInputEventClass, "mIsValid", "Z"); + jfieldID timestampFieldId = env->GetFieldID(wordInputEventClass, "mTimestamp", "I"); + env->DeleteLocalRef(wordInputEventClass); - for (int i = startIndex; i < languageModelParamCount; ++i) { - jobject languageModelParam = env->GetObjectArrayElement(languageModelParams, i); - // languageModelParam is a set of params for word1; thus, word1 cannot be null. On the - // other hand, word0 can be null and then it means the set of params doesn't contain bigram - // information. - jintArray word0 = static_cast( - env->GetObjectField(languageModelParam, word0FieldId)); - jsize word0Length = word0 ? env->GetArrayLength(word0) : 0; - int word0CodePoints[word0Length]; - if (word0) { - env->GetIntArrayRegion(word0, 0, word0Length, word0CodePoints); - } - jintArray word1 = static_cast( - env->GetObjectField(languageModelParam, word1FieldId)); - jsize word1Length = env->GetArrayLength(word1); - int word1CodePoints[word1Length]; - env->GetIntArrayRegion(word1, 0, word1Length, word1CodePoints); - jint unigramProbability = env->GetIntField(languageModelParam, unigramProbabilityFieldId); - jint timestamp = env->GetIntField(languageModelParam, timestampFieldId); - jboolean isNotAWord = env->GetBooleanField(languageModelParam, isNotAWordFieldId); - jboolean isPossiblyOffensive = env->GetBooleanField(languageModelParam, - isPossiblyOffensiveFieldId); - jintArray shortcutTarget = static_cast( - env->GetObjectField(languageModelParam, shortcutTargetFieldId)); - std::vector shortcuts; - { - std::vector shortcutTargetCodePoints; - JniDataUtils::jintarrayToVector(env, shortcutTarget, &shortcutTargetCodePoints); - if (!shortcutTargetCodePoints.empty()) { - jint shortcutProbability = - env->GetIntField(languageModelParam, shortcutProbabilityFieldId); - shortcuts.emplace_back(std::move(shortcutTargetCodePoints), shortcutProbability); - } - } + for (int i = startIndex; i < inputEventCount; ++i) { + jobject inputEvent = env->GetObjectArrayElement(inputEvents, i); + jintArray targetWord = static_cast( + env->GetObjectField(inputEvent, targetWordFieldId)); + jsize wordLength = env->GetArrayLength(targetWord); + int wordCodePoints[wordLength]; + env->GetIntArrayRegion(targetWord, 0, wordLength, wordCodePoints); + env->DeleteLocalRef(targetWord); + + jint prevWordCount = env->GetIntField(inputEvent, prevWordCountFieldId); + jobjectArray prevWordArray = + static_cast(env->GetObjectField(inputEvent, prevWordArrayFieldId)); + jbooleanArray isPrevWordBeginningOfSentenceArray = static_cast( + env->GetObjectField(inputEvent, isPrevWordBoSArrayFieldId)); + jboolean isValid = env->GetBooleanField(inputEvent, isValidFieldId); + jint timestamp = env->GetIntField(inputEvent, timestampFieldId); + const NgramContext ngramContext = JniDataUtils::constructNgramContext(env, + prevWordArray, isPrevWordBeginningOfSentenceArray, prevWordCount); // Use 1 for count to indicate the word has inputted. - const UnigramProperty unigramProperty(false /* isBeginningOfSentence */, isNotAWord, - isPossiblyOffensive, unigramProbability, - HistoricalInfo(timestamp, 0 /* level */, 1 /* count */), std::move(shortcuts)); - dictionary->addUnigramEntry(CodePointArrayView(word1CodePoints, word1Length), - &unigramProperty); - if (word0) { - jint bigramProbability = env->GetIntField(languageModelParam, bigramProbabilityFieldId); - // Use 1 for count to indicate the bigram has inputted. - const NgramContext ngramContext(word0CodePoints, word0Length, - false /* isBeginningOfSentence */); - const NgramProperty ngramProperty(ngramContext, - CodePointArrayView(word1CodePoints, word1Length).toVector(), - bigramProbability, HistoricalInfo(timestamp, 0 /* level */, 1 /* count */)); - dictionary->addNgramEntry(&ngramProperty); - } + dictionary->updateEntriesForWordWithNgramContext(&ngramContext, + CodePointArrayView(wordCodePoints, wordLength), isValid, + HistoricalInfo(timestamp, 0 /* level */, 1 /* count */)); if (dictionary->needsToRunGC(true /* mindsBlockByGC */)) { return i + 1; } - env->DeleteLocalRef(word0); - env->DeleteLocalRef(word1); - env->DeleteLocalRef(shortcutTarget); - env->DeleteLocalRef(languageModelParam); + env->DeleteLocalRef(prevWordArray); + env->DeleteLocalRef(isPrevWordBeginningOfSentenceArray); + env->DeleteLocalRef(inputEvent); } - return languageModelParamCount; + return inputEventCount; } static jstring latinime_BinaryDictionary_getProperty(JNIEnv *env, jclass clazz, jlong dict, @@ -754,10 +716,10 @@ static const JNINativeMethod sMethods[] = { reinterpret_cast(latinime_BinaryDictionary_updateEntriesForWordWithNgramContext) }, { - const_cast("addMultipleDictionaryEntriesNative"), + const_cast("updateEntriesForInputEventsNative"), const_cast( - "(J[Lcom/android/inputmethod/latin/utils/LanguageModelParam;I)I"), - reinterpret_cast(latinime_BinaryDictionary_addMultipleDictionaryEntries) + "(J[Lcom/android/inputmethod/latin/utils/WordInputEventForPersonalization;I)I"), + reinterpret_cast(latinime_BinaryDictionary_updateEntriesForInputEvents) }, { const_cast("getPropertyNative"), diff --git a/native/jni/src/utils/jni_data_utils.h b/native/jni/src/utils/jni_data_utils.h index 25cc41742..a259e1cd0 100644 --- a/native/jni/src/utils/jni_data_utils.h +++ b/native/jni/src/utils/jni_data_utils.h @@ -50,6 +50,7 @@ class JniDataUtils { const jsize keyUtf8Length = env->GetStringUTFLength(keyString); char keyChars[keyUtf8Length + 1]; env->GetStringUTFRegion(keyString, 0, env->GetStringLength(keyString), keyChars); + env->DeleteLocalRef(keyString); keyChars[keyUtf8Length] = '\0'; DictionaryHeaderStructurePolicy::AttributeMap::key_type key; HeaderReadWriteUtils::insertCharactersIntoVector(keyChars, &key); @@ -59,6 +60,7 @@ class JniDataUtils { const jsize valueUtf8Length = env->GetStringUTFLength(valueString); char valueChars[valueUtf8Length + 1]; env->GetStringUTFRegion(valueString, 0, env->GetStringLength(valueString), valueChars); + env->DeleteLocalRef(valueString); valueChars[valueUtf8Length] = '\0'; DictionaryHeaderStructurePolicy::AttributeMap::mapped_type value; HeaderReadWriteUtils::insertCharactersIntoVector(valueChars, &value); @@ -113,6 +115,7 @@ class JniDataUtils { continue; } env->GetIntArrayRegion(prevWord, 0, prevWordLength, prevWordCodePoints[i]); + env->DeleteLocalRef(prevWord); prevWordCodePointCount[i] = prevWordLength; jboolean isBeginningOfSentenceBoolean = JNI_FALSE; env->GetBooleanArrayRegion(isBeginningOfSentenceArray, i, 1 /* len */, diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java index 15f7568c8..8f3373cc2 100644 --- a/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java +++ b/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java @@ -32,6 +32,7 @@ import com.android.inputmethod.latin.makedict.UnsupportedFormatException; import com.android.inputmethod.latin.utils.BinaryDictionaryUtils; import com.android.inputmethod.latin.utils.FileUtils; import com.android.inputmethod.latin.utils.LocaleUtils; +import com.android.inputmethod.latin.utils.WordInputEventForPersonalization; import java.io.File; import java.io.IOException; @@ -39,6 +40,7 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.Locale; +import java.util.Map; import java.util.Random; import java.util.concurrent.TimeUnit; @@ -748,4 +750,66 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { binaryDictionary.close(); } + + public void testUpdateEntriesForInputEvents() { + for (final int formatVersion : DICT_FORMAT_VERSIONS) { + testUpdateEntriesForInputEvents(formatVersion); + } + } + + private void testUpdateEntriesForInputEvents(final int formatVersion) { + setCurrentTimeForTestMode(mCurrentTime); + final int codePointSetSize = 20; + final int EVENT_COUNT = 1000; + final double CONTINUE_RATE = 0.9; + final long seed = System.currentTimeMillis(); + final Random random = new Random(seed); + final File dictFile = createEmptyDictionaryAndGetFile(formatVersion); + + final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); + final ArrayList unigrams = new ArrayList<>(); + final ArrayList> bigrams = new ArrayList<>(); + final ArrayList, String>> trigrams = new ArrayList<>(); + + final WordInputEventForPersonalization[] inputEvents = + new WordInputEventForPersonalization[EVENT_COUNT]; + NgramContext ngramContext = NgramContext.EMPTY_PREV_WORDS_INFO; + int prevWordCount = 0; + for (int i = 0; i < inputEvents.length; i++) { + final String word = CodePointUtils.generateWord(random, codePointSet); + inputEvents[i] = new WordInputEventForPersonalization(word, ngramContext, + true /* isValid */, mCurrentTime); + unigrams.add(word); + if (prevWordCount >= 2) { + final Pair prevWordsPair = bigrams.get(bigrams.size() - 1); + trigrams.add(new Pair<>(prevWordsPair, word)); + } + if (prevWordCount >= 1) { + bigrams.add(new Pair<>(ngramContext.getNthPrevWord(1 /* n */).toString(), word)); + } + if (random.nextDouble() > CONTINUE_RATE) { + ngramContext = NgramContext.EMPTY_PREV_WORDS_INFO; + prevWordCount = 0; + } else { + ngramContext = ngramContext.getNextNgramContext(new WordInfo(word)); + prevWordCount++; + } + } + final BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); + binaryDictionary.updateEntriesForInputEvents(inputEvents); + + for (final String word : unigrams) { + assertTrue(binaryDictionary.isInDictionary(word)); + } + for (final Pair bigram : bigrams) { + assertTrue(isValidBigram(binaryDictionary, bigram.first, bigram.second)); + } + if (!supportsNgram(formatVersion)) { + return; + } + for (final Pair, String> trigram : trigrams) { + assertTrue(isValidTrigram(binaryDictionary, trigram.first.first, trigram.first.second, + trigram.second)); + } + } } diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java index 5a72e417e..0f2fa2fb1 100644 --- a/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java +++ b/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java @@ -29,7 +29,6 @@ import com.android.inputmethod.latin.makedict.WeightedString; import com.android.inputmethod.latin.makedict.WordProperty; import com.android.inputmethod.latin.utils.BinaryDictionaryUtils; import com.android.inputmethod.latin.utils.FileUtils; -import com.android.inputmethod.latin.utils.LanguageModelParam; import java.io.File; import java.io.IOException; @@ -884,63 +883,6 @@ public class BinaryDictionaryTests extends AndroidTestCase { } } - public void testAddMultipleDictionaryEntries() { - for (final int formatVersion : DICT_FORMAT_VERSIONS) { - testAddMultipleDictionaryEntries(formatVersion); - } - } - - private void testAddMultipleDictionaryEntries(final int formatVersion) { - final int codePointSetSize = 20; - final int lmParamCount = 1000; - final double bigramContinueRate = 0.9; - final long seed = System.currentTimeMillis(); - final Random random = new Random(seed); - final File dictFile = createEmptyDictionaryAndGetFile(formatVersion); - - final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); - final HashMap unigramProbabilities = new HashMap<>(); - final HashMap, Integer> bigramProbabilities = new HashMap<>(); - - final LanguageModelParam[] languageModelParams = new LanguageModelParam[lmParamCount]; - String prevWord = null; - for (int i = 0; i < languageModelParams.length; i++) { - final String word = CodePointUtils.generateWord(random, codePointSet); - final int probability = random.nextInt(0xFF); - final int bigramProbability = probability + random.nextInt(0xFF - probability); - unigramProbabilities.put(word, probability); - if (prevWord == null) { - languageModelParams[i] = new LanguageModelParam(word, probability, - BinaryDictionary.NOT_A_VALID_TIMESTAMP); - } else { - languageModelParams[i] = new LanguageModelParam(prevWord, word, probability, - bigramProbability, BinaryDictionary.NOT_A_VALID_TIMESTAMP); - bigramProbabilities.put(new Pair<>(prevWord, word), - bigramProbability); - } - prevWord = (random.nextDouble() < bigramContinueRate) ? word : null; - } - - final BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile); - binaryDictionary.addMultipleDictionaryEntries(languageModelParams); - - for (Map.Entry entry : unigramProbabilities.entrySet()) { - assertEquals((int)entry.getValue(), binaryDictionary.getFrequency(entry.getKey())); - } - - for (Map.Entry, Integer> entry : bigramProbabilities.entrySet()) { - final String word0 = entry.getKey().first; - final String word1 = entry.getKey().second; - final int bigramProbability = entry.getValue(); - assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY, - isValidBigram(binaryDictionary, word0, word1)); - if (canCheckBigramProbability(formatVersion)) { - assertEquals(bigramProbability, - getBigramProbability(binaryDictionary, word0, word1)); - } - } - } - public void testGetWordProperties() { for (final int formatVersion : DICT_FORMAT_VERSIONS) { testGetWordProperties(formatVersion); diff --git a/tests/src/com/android/inputmethod/latin/personalization/PersonalizationDictionaryTests.java b/tests/src/com/android/inputmethod/latin/personalization/PersonalizationDictionaryTests.java index 6ccb79d76..dc6fb0075 100644 --- a/tests/src/com/android/inputmethod/latin/personalization/PersonalizationDictionaryTests.java +++ b/tests/src/com/android/inputmethod/latin/personalization/PersonalizationDictionaryTests.java @@ -30,7 +30,7 @@ import com.android.inputmethod.latin.Dictionary; import com.android.inputmethod.latin.DictionaryFacilitator; import com.android.inputmethod.latin.ExpandableBinaryDictionary; import com.android.inputmethod.latin.RichInputMethodManager; -import com.android.inputmethod.latin.ExpandableBinaryDictionary.AddMultipleDictionaryEntriesCallback; +import com.android.inputmethod.latin.ExpandableBinaryDictionary.UpdateEntriesForInputEventsCallback; import com.android.inputmethod.latin.common.CodePointUtils; import com.android.inputmethod.latin.settings.SpacingAndPunctuations; @@ -96,8 +96,8 @@ public class PersonalizationDictionaryTests extends AndroidTestCase { true /* inputByUser */, tokens, timeStampInSeconds, DUMMY_PACKAGE_NAME, LOCALE_EN_US.getLanguage()); final CountDownLatch countDownLatch = new CountDownLatch(1); - final AddMultipleDictionaryEntriesCallback callback = - new AddMultipleDictionaryEntriesCallback() { + final UpdateEntriesForInputEventsCallback callback = + new UpdateEntriesForInputEventsCallback() { @Override public void onFinished() { countDownLatch.countDown();