Merge "Use trigrams for personalization dict."

This commit is contained in:
Keisuke Kuroyanagi 2014-10-23 05:42:37 +00:00 committed by Android (Google) Code Review
commit ade5ad1dae
11 changed files with 265 additions and 343 deletions

View file

@ -32,8 +32,8 @@ import com.android.inputmethod.latin.settings.SettingsValuesForSuggestion;
import com.android.inputmethod.latin.utils.BinaryDictionaryUtils; import com.android.inputmethod.latin.utils.BinaryDictionaryUtils;
import com.android.inputmethod.latin.utils.FileUtils; import com.android.inputmethod.latin.utils.FileUtils;
import com.android.inputmethod.latin.utils.JniUtils; import com.android.inputmethod.latin.utils.JniUtils;
import com.android.inputmethod.latin.utils.LanguageModelParam;
import com.android.inputmethod.latin.utils.StringUtils; import com.android.inputmethod.latin.utils.StringUtils;
import com.android.inputmethod.latin.utils.WordInputEventForPersonalization;
import java.io.File; import java.io.File;
import java.util.ArrayList; import java.util.ArrayList;
@ -205,8 +205,8 @@ public final class BinaryDictionary extends Dictionary {
private static native boolean updateEntriesForWordWithNgramContextNative(long dict, private static native boolean updateEntriesForWordWithNgramContextNative(long dict,
int[][] prevWordCodePointArrays, boolean[] isBeginningOfSentenceArray, int[][] prevWordCodePointArrays, boolean[] isBeginningOfSentenceArray,
int[] word, boolean isValidWord, int count, int timestamp); int[] word, boolean isValidWord, int count, int timestamp);
private static native int addMultipleDictionaryEntriesNative(long dict, private static native int updateEntriesForInputEventsNative(long dict,
LanguageModelParam[] languageModelParams, int startIndex); WordInputEventForPersonalization[] inputEvents, int startIndex);
private static native String getPropertyNative(long dict, String query); private static native String getPropertyNative(long dict, String query);
private static native boolean isCorruptedNative(long dict); private static native boolean isCorruptedNative(long dict);
private static native boolean migrateNative(long dict, String dictFilePath, private static native boolean migrateNative(long dict, String dictFilePath,
@ -526,19 +526,19 @@ public final class BinaryDictionary extends Dictionary {
} }
@UsedForTesting @UsedForTesting
public void addMultipleDictionaryEntries(final LanguageModelParam[] languageModelParams) { public void updateEntriesForInputEvents(final WordInputEventForPersonalization[] inputEvents) {
if (!isValidDictionary()) { if (!isValidDictionary()) {
return; return;
} }
int processedParamCount = 0; int processedEventCount = 0;
while (processedParamCount < languageModelParams.length) { while (processedEventCount < inputEvents.length) {
if (needsToRunGC(true /* mindsBlockByGC */)) { if (needsToRunGC(true /* mindsBlockByGC */)) {
flushWithGC(); flushWithGC();
} }
processedParamCount = addMultipleDictionaryEntriesNative(mNativeDict, processedEventCount = updateEntriesForInputEventsNative(mNativeDict, inputEvents,
languageModelParams, processedParamCount); processedEventCount);
mHasUpdated = true; mHasUpdated = true;
if (processedParamCount <= 0) { if (processedEventCount <= 0) {
return; return;
} }
} }

View file

@ -24,7 +24,7 @@ import android.view.inputmethod.InputMethodSubtype;
import com.android.inputmethod.annotations.UsedForTesting; import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.keyboard.ProximityInfo; import com.android.inputmethod.keyboard.ProximityInfo;
import com.android.inputmethod.latin.ExpandableBinaryDictionary.AddMultipleDictionaryEntriesCallback; import com.android.inputmethod.latin.ExpandableBinaryDictionary.UpdateEntriesForInputEventsCallback;
import com.android.inputmethod.latin.NgramContext.WordInfo; import com.android.inputmethod.latin.NgramContext.WordInfo;
import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo; import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo;
import com.android.inputmethod.latin.personalization.ContextualDictionary; import com.android.inputmethod.latin.personalization.ContextualDictionary;
@ -796,8 +796,8 @@ public class DictionaryFacilitator {
public void addEntriesToPersonalizationDictionary( public void addEntriesToPersonalizationDictionary(
final PersonalizationDataChunk personalizationDataChunk, final PersonalizationDataChunk personalizationDataChunk,
final SpacingAndPunctuations spacingAndPunctuations, final SpacingAndPunctuations spacingAndPunctuations,
final AddMultipleDictionaryEntriesCallback callback) { final UpdateEntriesForInputEventsCallback callback) {
mPersonalizationHelper.addEntriesToPersonalizationDictionariesToUpdate( mPersonalizationHelper.updateEntriesOfPersonalizationDictionaries(
getMostProbableLocale(), personalizationDataChunk, spacingAndPunctuations, getMostProbableLocale(), personalizationDataChunk, spacingAndPunctuations,
callback); callback);
} }

View file

@ -32,7 +32,7 @@ import com.android.inputmethod.latin.utils.CombinedFormatUtils;
import com.android.inputmethod.latin.utils.DistracterFilter; import com.android.inputmethod.latin.utils.DistracterFilter;
import com.android.inputmethod.latin.utils.ExecutorUtils; import com.android.inputmethod.latin.utils.ExecutorUtils;
import com.android.inputmethod.latin.utils.FileUtils; import com.android.inputmethod.latin.utils.FileUtils;
import com.android.inputmethod.latin.utils.LanguageModelParam; import com.android.inputmethod.latin.utils.WordInputEventForPersonalization;
import java.io.File; import java.io.File;
import java.util.ArrayList; import java.util.ArrayList;
@ -447,16 +447,16 @@ abstract public class ExpandableBinaryDictionary extends Dictionary {
}, word, distracterFilter); }, word, distracterFilter);
} }
public interface AddMultipleDictionaryEntriesCallback { public interface UpdateEntriesForInputEventsCallback {
public void onFinished(); public void onFinished();
} }
/** /**
* Dynamically add multiple entries to the dictionary. * Dynamically update entries according to input events.
*/ */
public void addMultipleDictionaryEntriesDynamically( public void updateEntriesForInputEvents(
@Nonnull final ArrayList<LanguageModelParam> languageModelParams, @Nonnull final ArrayList<WordInputEventForPersonalization> inputEvents,
final AddMultipleDictionaryEntriesCallback callback) { final UpdateEntriesForInputEventsCallback callback) {
reloadDictionaryIfRequired(); reloadDictionaryIfRequired();
asyncExecuteTaskWithWriteLock(new Runnable() { asyncExecuteTaskWithWriteLock(new Runnable() {
@Override @Override
@ -466,9 +466,9 @@ abstract public class ExpandableBinaryDictionary extends Dictionary {
if (binaryDictionary == null) { if (binaryDictionary == null) {
return; return;
} }
binaryDictionary.addMultipleDictionaryEntries( binaryDictionary.updateEntriesForInputEvents(
languageModelParams.toArray( inputEvents.toArray(
new LanguageModelParam[languageModelParams.size()])); new WordInputEventForPersonalization[inputEvents.size()]));
} finally { } finally {
if (callback != null) { if (callback != null) {
callback.onFinished(); callback.onFinished();

View file

@ -26,14 +26,14 @@ import java.util.concurrent.atomic.AtomicInteger;
import android.content.Context; import android.content.Context;
import android.view.inputmethod.InputMethodSubtype; import android.view.inputmethod.InputMethodSubtype;
import com.android.inputmethod.latin.ExpandableBinaryDictionary.AddMultipleDictionaryEntriesCallback; import com.android.inputmethod.latin.ExpandableBinaryDictionary.UpdateEntriesForInputEventsCallback;
import com.android.inputmethod.latin.personalization.PersonalizationDataChunk; import com.android.inputmethod.latin.personalization.PersonalizationDataChunk;
import com.android.inputmethod.latin.personalization.PersonalizationDictionary; import com.android.inputmethod.latin.personalization.PersonalizationDictionary;
import com.android.inputmethod.latin.settings.SpacingAndPunctuations; import com.android.inputmethod.latin.settings.SpacingAndPunctuations;
import com.android.inputmethod.latin.utils.DistracterFilter; import com.android.inputmethod.latin.utils.DistracterFilter;
import com.android.inputmethod.latin.utils.DistracterFilterCheckingIsInDictionary; import com.android.inputmethod.latin.utils.DistracterFilterCheckingIsInDictionary;
import com.android.inputmethod.latin.utils.LanguageModelParam;
import com.android.inputmethod.latin.utils.SubtypeLocaleUtils; import com.android.inputmethod.latin.utils.SubtypeLocaleUtils;
import com.android.inputmethod.latin.utils.WordInputEventForPersonalization;
/** /**
* Class for managing and updating personalization dictionaries. * Class for managing and updating personalization dictionaries.
@ -119,10 +119,10 @@ public class PersonalizationHelperForDictionaryFacilitator {
return personalizationDict; return personalizationDict;
} }
private void addEntriesToPersonalizationDictionariesForLocale(final Locale locale, private void updateEntriesOfPersonalizationDictionariesForLocale(final Locale locale,
final PersonalizationDataChunk personalizationDataChunk, final PersonalizationDataChunk personalizationDataChunk,
final SpacingAndPunctuations spacingAndPunctuations, final SpacingAndPunctuations spacingAndPunctuations,
final AddMultipleDictionaryEntriesCallback callback) { final UpdateEntriesForInputEventsCallback callback) {
final ExpandableBinaryDictionary personalizationDict = final ExpandableBinaryDictionary personalizationDict =
getPersonalizationDictToUpdate(mContext, locale); getPersonalizationDictToUpdate(mContext, locale);
if (personalizationDict == null) { if (personalizationDict == null) {
@ -131,25 +131,25 @@ public class PersonalizationHelperForDictionaryFacilitator {
} }
return; return;
} }
final ArrayList<LanguageModelParam> languageModelParams = final ArrayList<WordInputEventForPersonalization> inputEvents =
LanguageModelParam.createLanguageModelParamsFrom( WordInputEventForPersonalization.createInputEventFrom(
personalizationDataChunk.mTokens, personalizationDataChunk.mTokens,
personalizationDataChunk.mTimestampInSeconds, spacingAndPunctuations, personalizationDataChunk.mTimestampInSeconds, spacingAndPunctuations,
locale, new DistracterFilterCheckingIsInDictionary( locale, new DistracterFilterCheckingIsInDictionary(
mDistracterFilter, personalizationDict)); mDistracterFilter, personalizationDict));
if (languageModelParams == null || languageModelParams.isEmpty()) { if (inputEvents == null || inputEvents.isEmpty()) {
if (callback != null) { if (callback != null) {
callback.onFinished(); callback.onFinished();
} }
return; return;
} }
personalizationDict.addMultipleDictionaryEntriesDynamically(languageModelParams, callback); personalizationDict.updateEntriesForInputEvents(inputEvents, callback);
} }
public void addEntriesToPersonalizationDictionariesToUpdate(final Locale defaultLocale, public void updateEntriesOfPersonalizationDictionaries(final Locale defaultLocale,
final PersonalizationDataChunk personalizationDataChunk, final PersonalizationDataChunk personalizationDataChunk,
final SpacingAndPunctuations spacingAndPunctuations, final SpacingAndPunctuations spacingAndPunctuations,
final AddMultipleDictionaryEntriesCallback callback) { final UpdateEntriesForInputEventsCallback callback) {
final String language = personalizationDataChunk.mDetectedLanguage; final String language = personalizationDataChunk.mDetectedLanguage;
final HashSet<Locale> locales; final HashSet<Locale> locales;
if (mIsMonolingualUser && PersonalizationDataChunk.LANGUAGE_UNKNOWN.equals(language) if (mIsMonolingualUser && PersonalizationDataChunk.LANGUAGE_UNKNOWN.equals(language)
@ -165,8 +165,8 @@ public class PersonalizationHelperForDictionaryFacilitator {
return; return;
} }
final AtomicInteger remainingTaskCount = new AtomicInteger(locales.size()); final AtomicInteger remainingTaskCount = new AtomicInteger(locales.size());
final AddMultipleDictionaryEntriesCallback callbackForLocales = final UpdateEntriesForInputEventsCallback callbackForLocales =
new AddMultipleDictionaryEntriesCallback() { new UpdateEntriesForInputEventsCallback() {
@Override @Override
public void onFinished() { public void onFinished() {
if (remainingTaskCount.decrementAndGet() == 0) { if (remainingTaskCount.decrementAndGet() == 0) {
@ -178,7 +178,7 @@ public class PersonalizationHelperForDictionaryFacilitator {
} }
}; };
for (final Locale locale : locales) { for (final Locale locale : locales) {
addEntriesToPersonalizationDictionariesForLocale(locale, personalizationDataChunk, updateEntriesOfPersonalizationDictionariesForLocale(locale, personalizationDataChunk,
spacingAndPunctuations, callbackForLocales); spacingAndPunctuations, callbackForLocales);
} }
} }

View file

@ -1,166 +0,0 @@
/*
* Copyright (C) 2014 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.android.inputmethod.latin.utils;
import android.util.Log;
import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.latin.Dictionary;
import com.android.inputmethod.latin.NgramContext;
import com.android.inputmethod.latin.settings.SpacingAndPunctuations;
import com.android.inputmethod.latin.utils.DistracterFilter.HandlingType;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
// Note: this class is used as a parameter type of a native method. You should be careful when you
// rename this class or field name. See BinaryDictionary#addMultipleDictionaryEntriesNative().
/**
 * Parameters describing one unigram, and optionally the bigram leading to it, that should be
 * added to a dictionary.
 *
 * Instances are passed to native code (see BinaryDictionary#addMultipleDictionaryEntriesNative()),
 * so the public field names and types below must not be changed independently of it.
 */
public final class LanguageModelParam {
    private static final String TAG = LanguageModelParam.class.getSimpleName();
    private static final boolean DEBUG = false;
    private static final boolean DEBUG_TOKEN = false;

    // For now, these probability values are being referred to only when we add new entries to
    // decaying dynamic binary dictionaries. When these are referred to, what matters is 0 or
    // non-0. Thus, it's not meaningful to compare 10, 100, and so on.
    // TODO: Revise the logic in ForgettingCurveUtils in native code.
    private static final int UNIGRAM_PROBABILITY_FOR_VALID_WORD = 100;
    private static final int UNIGRAM_PROBABILITY_FOR_OOV_WORD = Dictionary.NOT_A_PROBABILITY;
    private static final int BIGRAM_PROBABILITY_FOR_VALID_WORD = 10;
    private static final int BIGRAM_PROBABILITY_FOR_OOV_WORD = Dictionary.NOT_A_PROBABILITY;

    // The word these parameters are about, as it was given to the constructor.
    public final CharSequence mTargetWord;
    // Code points of the previous word, or null when there is no bigram information.
    public final int[] mWord0;
    // Code points of the target word.
    public final int[] mWord1;
    // TODO: this needs to be a list of shortcuts
    public final int[] mShortcutTarget;
    public final int mUnigramProbability;
    public final int mBigramProbability;
    public final int mShortcutProbability;
    public final boolean mIsNotAWord;
    public final boolean mIsPossiblyOffensive;
    // Time stamp in seconds.
    public final int mTimestamp;

    /**
     * Creates unigram-only parameters; the bigram probability is set to
     * {@link Dictionary#NOT_A_PROBABILITY} and no previous word is recorded.
     * TODO: support shortcuts
     */
    @UsedForTesting
    public LanguageModelParam(final CharSequence word, final int unigramProbability,
            final int timestamp) {
        this(null /* word0 */, word, unigramProbability, Dictionary.NOT_A_PROBABILITY, timestamp);
    }

    /**
     * Creates parameters for a unigram and, when {@code word0} is not null, the bigram
     * {@code word0 -> word1}. Shortcut and flag fields are always set to "none"/false.
     */
    @UsedForTesting
    public LanguageModelParam(final CharSequence word0, final CharSequence word1,
            final int unigramProbability, final int bigramProbability,
            final int timestamp) {
        mTargetWord = word1;
        if (word0 == null) {
            mWord0 = null;
        } else {
            mWord0 = StringUtils.toCodePointArray(word0);
        }
        mWord1 = StringUtils.toCodePointArray(word1);
        mUnigramProbability = unigramProbability;
        mBigramProbability = bigramProbability;
        mTimestamp = timestamp;
        // Shortcuts and word flags are not supported by this constructor.
        mShortcutTarget = null;
        mShortcutProbability = Dictionary.NOT_A_PROBABILITY;
        mIsNotAWord = false;
        mIsPossiblyOffensive = false;
    }

    /**
     * Processes a list of words and returns a list of {@link LanguageModelParam} objects.
     *
     * Empty or whitespace-only tokens are skipped. A token that does not look valid for
     * dictionary insertion is skipped as well and resets the previous-word context. Tokens
     * rejected by the distracter filter (or all tokens, when {@code locale} is null) produce no
     * entry and leave the context untouched.
     */
    public static ArrayList<LanguageModelParam> createLanguageModelParamsFrom(
            final List<String> tokens, final int timestamp,
            final SpacingAndPunctuations spacingAndPunctuations, final Locale locale,
            final DistracterFilter distracterFilter) {
        final ArrayList<LanguageModelParam> result = new ArrayList<>();
        NgramContext context = NgramContext.EMPTY_PREV_WORDS_INFO;
        for (final String token : tokens) {
            if (StringUtils.isEmptyStringOrWhiteSpaces(token)) {
                // just skip this token
                if (DEBUG_TOKEN) {
                    Log.d(TAG, "--- isEmptyStringOrWhiteSpaces: \"" + token + "\"");
                }
                continue;
            }
            final boolean insertable =
                    DictionaryInfoUtils.looksValidForDictionaryInsertion(
                            token, spacingAndPunctuations);
            if (!insertable) {
                if (DEBUG_TOKEN) {
                    Log.d(TAG, "--- not looksValidForDictionaryInsertion: \""
                            + token + "\"");
                }
                // Sentence terminator found. Split.
                context = NgramContext.EMPTY_PREV_WORDS_INFO;
                continue;
            }
            if (DEBUG_TOKEN) {
                Log.d(TAG, "--- word: \"" + token + "\"");
            }
            final LanguageModelParam param =
                    classifyWordAndBuildParam(context, token, timestamp, locale, distracterFilter);
            if (param != null) {
                result.add(param);
                // The context advances with the token as typed, not the lower-cased form.
                context = context.getNextNgramContext(new NgramContext.WordInfo(token));
            }
        }
        return result;
    }

    /**
     * Asks the distracter filter how {@code targetWord} should be handled and builds the
     * matching parameters. Returns null when {@code locale} is null or the word is a distracter.
     */
    private static LanguageModelParam classifyWordAndBuildParam(
            final NgramContext ngramContext, final String targetWord, final int timestamp,
            final Locale locale, final DistracterFilter distracterFilter) {
        if (locale == null) {
            return null;
        }
        final int handlingType =
                distracterFilter.getWordHandlingType(ngramContext, targetWord, locale);
        // The distracter check is done on the word as typed, before any lower-casing.
        if (distracterFilter.isDistracterToWordsInDictionaries(ngramContext, targetWord, locale)) {
            // The word is a distracter.
            return null;
        }
        final String word;
        if (HandlingType.shouldBeLowerCased(handlingType)) {
            word = targetWord.toLowerCase(locale);
        } else {
            word = targetWord;
        }
        final boolean isValidWord = !HandlingType.shouldBeHandledAsOov(handlingType);
        return buildParamForWord(ngramContext, word, timestamp, isValidWord);
    }

    /**
     * Builds unigram parameters when the context is not valid, otherwise unigram plus bigram
     * parameters using the first previous word of the context.
     */
    private static LanguageModelParam buildParamForWord(
            final NgramContext ngramContext, final String word, final int timestamp,
            final boolean isValidWord) {
        final int unigramProbability;
        final int bigramProbability;
        if (isValidWord) {
            unigramProbability = UNIGRAM_PROBABILITY_FOR_VALID_WORD;
            bigramProbability = BIGRAM_PROBABILITY_FOR_VALID_WORD;
        } else {
            unigramProbability = UNIGRAM_PROBABILITY_FOR_OOV_WORD;
            bigramProbability = BIGRAM_PROBABILITY_FOR_OOV_WORD;
        }
        if (ngramContext.isValid()) {
            if (DEBUG) {
                Log.d(TAG, "--- add bigram: prev = " + ngramContext + ", current("
                        + (isValidWord ? "Valid" : "OOV") + ") = " + word);
            }
            return new LanguageModelParam(ngramContext.getNthPrevWord(1 /* n */), word,
                    unigramProbability, bigramProbability, timestamp);
        }
        if (DEBUG) {
            Log.d(TAG, "--- add unigram: current("
                    + (isValidWord ? "Valid" : "OOV") + ") = " + word);
        }
        return new LanguageModelParam(word, unigramProbability, timestamp);
    }
}

View file

@ -0,0 +1,117 @@
/*
* Copyright (C) 2014 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.android.inputmethod.latin.utils;
import android.util.Log;
import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.latin.Constants;
import com.android.inputmethod.latin.NgramContext;
import com.android.inputmethod.latin.settings.SpacingAndPunctuations;
import com.android.inputmethod.latin.utils.DistracterFilter.HandlingType;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
// Note: this class is used as a parameter type of a native method. You should be careful when you
// rename this class or field name. See BinaryDictionary#updateEntriesForInputEventsNative().
/**
 * One word input event used to update a personalization dictionary: the word, the previous-word
 * context it appeared in, whether it is treated as a valid word, and a timestamp.
 *
 * Instances are passed to native code (see BinaryDictionary#updateEntriesForInputEventsNative()),
 * which looks the public fields up by name, so their names and types must stay in sync with it.
 */
public final class WordInputEventForPersonalization {
    private static final String TAG = WordInputEventForPersonalization.class.getSimpleName();
    private static final boolean DEBUG_TOKEN = false;

    // Code points of the word this event is about.
    public final int[] mTargetWord;
    // Number of previous words reported by the ngram context.
    public final int mPrevWordsCount;
    // Previous words and their beginning-of-sentence flags, filled in by
    // NgramContext#outputToArray(). Both arrays have MAX_PREV_WORD_COUNT_FOR_N_GRAM slots.
    public final int[][] mPrevWordArray = new int[Constants.MAX_PREV_WORD_COUNT_FOR_N_GRAM][];
    public final boolean[] mIsPrevWordBeginningOfSentenceArray =
            new boolean[Constants.MAX_PREV_WORD_COUNT_FOR_N_GRAM];
    public final boolean mIsValid;
    // Time stamp in seconds.
    public final int mTimestamp;

    /**
     * Creates an event for {@code targetWord} typed after {@code ngramContext}.
     */
    @UsedForTesting
    public WordInputEventForPersonalization(final CharSequence targetWord,
            final NgramContext ngramContext, final boolean isValid, final int timestamp) {
        mIsValid = isValid;
        mTimestamp = timestamp;
        mTargetWord = StringUtils.toCodePointArray(targetWord);
        mPrevWordsCount = ngramContext.getPrevWordCount();
        ngramContext.outputToArray(mPrevWordArray, mIsPrevWordBeginningOfSentenceArray);
    }

    /**
     * Processes a list of words and returns a list of {@link WordInputEventForPersonalization}
     * objects.
     *
     * Empty or whitespace-only tokens are skipped. A token that does not look valid for
     * dictionary insertion is skipped as well and resets the previous-word context. Tokens
     * rejected by the distracter filter (or all tokens, when {@code locale} is null) produce no
     * event and leave the context untouched.
     */
    public static ArrayList<WordInputEventForPersonalization> createInputEventFrom(
            final List<String> tokens, final int timestamp,
            final SpacingAndPunctuations spacingAndPunctuations, final Locale locale,
            final DistracterFilter distracterFilter) {
        final ArrayList<WordInputEventForPersonalization> events = new ArrayList<>();
        NgramContext context = NgramContext.EMPTY_PREV_WORDS_INFO;
        for (final String token : tokens) {
            if (StringUtils.isEmptyStringOrWhiteSpaces(token)) {
                // just skip this token
                if (DEBUG_TOKEN) {
                    Log.d(TAG, "--- isEmptyStringOrWhiteSpaces: \"" + token + "\"");
                }
                continue;
            }
            final boolean insertable =
                    DictionaryInfoUtils.looksValidForDictionaryInsertion(
                            token, spacingAndPunctuations);
            if (!insertable) {
                if (DEBUG_TOKEN) {
                    Log.d(TAG, "--- not looksValidForDictionaryInsertion: \""
                            + token + "\"");
                }
                // Sentence terminator found. Split.
                // TODO: Detect whether the context is beginning-of-sentence.
                context = NgramContext.EMPTY_PREV_WORDS_INFO;
                continue;
            }
            if (DEBUG_TOKEN) {
                Log.d(TAG, "--- word: \"" + token + "\"");
            }
            final WordInputEventForPersonalization event =
                    classifyWordAndBuildInputEvent(
                            context, token, timestamp, locale, distracterFilter);
            if (event != null) {
                events.add(event);
                // The context advances with the token as typed, not the lower-cased form.
                context = context.getNextNgramContext(new NgramContext.WordInfo(token));
            }
        }
        return events;
    }

    /**
     * Asks the distracter filter how {@code targetWord} should be handled and builds the
     * matching event. Returns null when {@code locale} is null or the word is a distracter.
     */
    private static WordInputEventForPersonalization classifyWordAndBuildInputEvent(
            final NgramContext ngramContext, final String targetWord, final int timestamp,
            final Locale locale, final DistracterFilter distracterFilter) {
        if (locale == null) {
            return null;
        }
        final int handlingType =
                distracterFilter.getWordHandlingType(ngramContext, targetWord, locale);
        // The distracter check is done on the word as typed, before any lower-casing.
        if (distracterFilter.isDistracterToWordsInDictionaries(ngramContext, targetWord, locale)) {
            // The word is a distracter.
            return null;
        }
        final String word;
        if (HandlingType.shouldBeLowerCased(handlingType)) {
            word = targetWord.toLowerCase(locale);
        } else {
            word = targetWord;
        }
        final boolean isValid = !HandlingType.shouldBeHandledAsOov(handlingType);
        return new WordInputEventForPersonalization(word, ngramContext, isValid, timestamp);
    }
}

View file

@ -453,98 +453,60 @@ static bool latinime_BinaryDictionary_updateEntriesForWordWithNgramContext(JNIEn
historicalInfo); historicalInfo);
} }
// Returns how many language model params are processed. // Returns how many input events are processed.
static int latinime_BinaryDictionary_addMultipleDictionaryEntries(JNIEnv *env, jclass clazz, static int latinime_BinaryDictionary_updateEntriesForInputEvents(JNIEnv *env, jclass clazz,
jlong dict, jobjectArray languageModelParams, jint startIndex) { jlong dict, jobjectArray inputEvents, jint startIndex) {
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict); Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
if (!dictionary) { if (!dictionary) {
return 0; return 0;
} }
jsize languageModelParamCount = env->GetArrayLength(languageModelParams); jsize inputEventCount = env->GetArrayLength(inputEvents);
if (languageModelParamCount == 0 || startIndex >= languageModelParamCount) { if (inputEventCount == 0 || startIndex >= inputEventCount) {
return 0; return 0;
} }
jobject languageModelParam = env->GetObjectArrayElement(languageModelParams, 0); jobject inputEvent = env->GetObjectArrayElement(inputEvents, 0);
jclass languageModelParamClass = env->GetObjectClass(languageModelParam); jclass wordInputEventClass = env->GetObjectClass(inputEvent);
env->DeleteLocalRef(languageModelParam); env->DeleteLocalRef(inputEvent);
jfieldID word0FieldId = env->GetFieldID(languageModelParamClass, "mWord0", "[I"); jfieldID targetWordFieldId = env->GetFieldID(wordInputEventClass, "mTargetWord", "[I");
jfieldID word1FieldId = env->GetFieldID(languageModelParamClass, "mWord1", "[I"); jfieldID prevWordCountFieldId = env->GetFieldID(wordInputEventClass, "mPrevWordsCount", "I");
jfieldID unigramProbabilityFieldId = jfieldID prevWordArrayFieldId = env->GetFieldID(wordInputEventClass, "mPrevWordArray", "[[I");
env->GetFieldID(languageModelParamClass, "mUnigramProbability", "I"); jfieldID isPrevWordBoSArrayFieldId =
jfieldID bigramProbabilityFieldId = env->GetFieldID(wordInputEventClass, "mIsPrevWordBeginningOfSentenceArray", "[Z");
env->GetFieldID(languageModelParamClass, "mBigramProbability", "I"); jfieldID isValidFieldId = env->GetFieldID(wordInputEventClass, "mIsValid", "Z");
jfieldID timestampFieldId = jfieldID timestampFieldId = env->GetFieldID(wordInputEventClass, "mTimestamp", "I");
env->GetFieldID(languageModelParamClass, "mTimestamp", "I"); env->DeleteLocalRef(wordInputEventClass);
jfieldID shortcutTargetFieldId =
env->GetFieldID(languageModelParamClass, "mShortcutTarget", "[I");
jfieldID shortcutProbabilityFieldId =
env->GetFieldID(languageModelParamClass, "mShortcutProbability", "I");
jfieldID isNotAWordFieldId =
env->GetFieldID(languageModelParamClass, "mIsNotAWord", "Z");
jfieldID isPossiblyOffensiveFieldId =
env->GetFieldID(languageModelParamClass, "mIsPossiblyOffensive", "Z");
env->DeleteLocalRef(languageModelParamClass);
for (int i = startIndex; i < languageModelParamCount; ++i) { for (int i = startIndex; i < inputEventCount; ++i) {
jobject languageModelParam = env->GetObjectArrayElement(languageModelParams, i); jobject inputEvent = env->GetObjectArrayElement(inputEvents, i);
// languageModelParam is a set of params for word1; thus, word1 cannot be null. On the jintArray targetWord = static_cast<jintArray>(
// other hand, word0 can be null and then it means the set of params doesn't contain bigram env->GetObjectField(inputEvent, targetWordFieldId));
// information. jsize wordLength = env->GetArrayLength(targetWord);
jintArray word0 = static_cast<jintArray>( int wordCodePoints[wordLength];
env->GetObjectField(languageModelParam, word0FieldId)); env->GetIntArrayRegion(targetWord, 0, wordLength, wordCodePoints);
jsize word0Length = word0 ? env->GetArrayLength(word0) : 0; env->DeleteLocalRef(targetWord);
int word0CodePoints[word0Length];
if (word0) { jint prevWordCount = env->GetIntField(inputEvent, prevWordCountFieldId);
env->GetIntArrayRegion(word0, 0, word0Length, word0CodePoints); jobjectArray prevWordArray =
} static_cast<jobjectArray>(env->GetObjectField(inputEvent, prevWordArrayFieldId));
jintArray word1 = static_cast<jintArray>( jbooleanArray isPrevWordBeginningOfSentenceArray = static_cast<jbooleanArray>(
env->GetObjectField(languageModelParam, word1FieldId)); env->GetObjectField(inputEvent, isPrevWordBoSArrayFieldId));
jsize word1Length = env->GetArrayLength(word1); jboolean isValid = env->GetBooleanField(inputEvent, isValidFieldId);
int word1CodePoints[word1Length]; jint timestamp = env->GetIntField(inputEvent, timestampFieldId);
env->GetIntArrayRegion(word1, 0, word1Length, word1CodePoints); const NgramContext ngramContext = JniDataUtils::constructNgramContext(env,
jint unigramProbability = env->GetIntField(languageModelParam, unigramProbabilityFieldId); prevWordArray, isPrevWordBeginningOfSentenceArray, prevWordCount);
jint timestamp = env->GetIntField(languageModelParam, timestampFieldId);
jboolean isNotAWord = env->GetBooleanField(languageModelParam, isNotAWordFieldId);
jboolean isPossiblyOffensive = env->GetBooleanField(languageModelParam,
isPossiblyOffensiveFieldId);
jintArray shortcutTarget = static_cast<jintArray>(
env->GetObjectField(languageModelParam, shortcutTargetFieldId));
std::vector<UnigramProperty::ShortcutProperty> shortcuts;
{
std::vector<int> shortcutTargetCodePoints;
JniDataUtils::jintarrayToVector(env, shortcutTarget, &shortcutTargetCodePoints);
if (!shortcutTargetCodePoints.empty()) {
jint shortcutProbability =
env->GetIntField(languageModelParam, shortcutProbabilityFieldId);
shortcuts.emplace_back(std::move(shortcutTargetCodePoints), shortcutProbability);
}
}
// Use 1 for count to indicate the word has inputted. // Use 1 for count to indicate the word has inputted.
const UnigramProperty unigramProperty(false /* isBeginningOfSentence */, isNotAWord, dictionary->updateEntriesForWordWithNgramContext(&ngramContext,
isPossiblyOffensive, unigramProbability, CodePointArrayView(wordCodePoints, wordLength), isValid,
HistoricalInfo(timestamp, 0 /* level */, 1 /* count */), std::move(shortcuts)); HistoricalInfo(timestamp, 0 /* level */, 1 /* count */));
dictionary->addUnigramEntry(CodePointArrayView(word1CodePoints, word1Length),
&unigramProperty);
if (word0) {
jint bigramProbability = env->GetIntField(languageModelParam, bigramProbabilityFieldId);
// Use 1 for count to indicate the bigram has inputted.
const NgramContext ngramContext(word0CodePoints, word0Length,
false /* isBeginningOfSentence */);
const NgramProperty ngramProperty(ngramContext,
CodePointArrayView(word1CodePoints, word1Length).toVector(),
bigramProbability, HistoricalInfo(timestamp, 0 /* level */, 1 /* count */));
dictionary->addNgramEntry(&ngramProperty);
}
if (dictionary->needsToRunGC(true /* mindsBlockByGC */)) { if (dictionary->needsToRunGC(true /* mindsBlockByGC */)) {
return i + 1; return i + 1;
} }
env->DeleteLocalRef(word0); env->DeleteLocalRef(prevWordArray);
env->DeleteLocalRef(word1); env->DeleteLocalRef(isPrevWordBeginningOfSentenceArray);
env->DeleteLocalRef(shortcutTarget); env->DeleteLocalRef(inputEvent);
env->DeleteLocalRef(languageModelParam);
} }
return languageModelParamCount; return inputEventCount;
} }
static jstring latinime_BinaryDictionary_getProperty(JNIEnv *env, jclass clazz, jlong dict, static jstring latinime_BinaryDictionary_getProperty(JNIEnv *env, jclass clazz, jlong dict,
@ -754,10 +716,10 @@ static const JNINativeMethod sMethods[] = {
reinterpret_cast<void *>(latinime_BinaryDictionary_updateEntriesForWordWithNgramContext) reinterpret_cast<void *>(latinime_BinaryDictionary_updateEntriesForWordWithNgramContext)
}, },
{ {
const_cast<char *>("addMultipleDictionaryEntriesNative"), const_cast<char *>("updateEntriesForInputEventsNative"),
const_cast<char *>( const_cast<char *>(
"(J[Lcom/android/inputmethod/latin/utils/LanguageModelParam;I)I"), "(J[Lcom/android/inputmethod/latin/utils/WordInputEventForPersonalization;I)I"),
reinterpret_cast<void *>(latinime_BinaryDictionary_addMultipleDictionaryEntries) reinterpret_cast<void *>(latinime_BinaryDictionary_updateEntriesForInputEvents)
}, },
{ {
const_cast<char *>("getPropertyNative"), const_cast<char *>("getPropertyNative"),

View file

@ -50,6 +50,7 @@ class JniDataUtils {
const jsize keyUtf8Length = env->GetStringUTFLength(keyString); const jsize keyUtf8Length = env->GetStringUTFLength(keyString);
char keyChars[keyUtf8Length + 1]; char keyChars[keyUtf8Length + 1];
env->GetStringUTFRegion(keyString, 0, env->GetStringLength(keyString), keyChars); env->GetStringUTFRegion(keyString, 0, env->GetStringLength(keyString), keyChars);
env->DeleteLocalRef(keyString);
keyChars[keyUtf8Length] = '\0'; keyChars[keyUtf8Length] = '\0';
DictionaryHeaderStructurePolicy::AttributeMap::key_type key; DictionaryHeaderStructurePolicy::AttributeMap::key_type key;
HeaderReadWriteUtils::insertCharactersIntoVector(keyChars, &key); HeaderReadWriteUtils::insertCharactersIntoVector(keyChars, &key);
@ -59,6 +60,7 @@ class JniDataUtils {
const jsize valueUtf8Length = env->GetStringUTFLength(valueString); const jsize valueUtf8Length = env->GetStringUTFLength(valueString);
char valueChars[valueUtf8Length + 1]; char valueChars[valueUtf8Length + 1];
env->GetStringUTFRegion(valueString, 0, env->GetStringLength(valueString), valueChars); env->GetStringUTFRegion(valueString, 0, env->GetStringLength(valueString), valueChars);
env->DeleteLocalRef(valueString);
valueChars[valueUtf8Length] = '\0'; valueChars[valueUtf8Length] = '\0';
DictionaryHeaderStructurePolicy::AttributeMap::mapped_type value; DictionaryHeaderStructurePolicy::AttributeMap::mapped_type value;
HeaderReadWriteUtils::insertCharactersIntoVector(valueChars, &value); HeaderReadWriteUtils::insertCharactersIntoVector(valueChars, &value);
@ -113,6 +115,7 @@ class JniDataUtils {
continue; continue;
} }
env->GetIntArrayRegion(prevWord, 0, prevWordLength, prevWordCodePoints[i]); env->GetIntArrayRegion(prevWord, 0, prevWordLength, prevWordCodePoints[i]);
env->DeleteLocalRef(prevWord);
prevWordCodePointCount[i] = prevWordLength; prevWordCodePointCount[i] = prevWordLength;
jboolean isBeginningOfSentenceBoolean = JNI_FALSE; jboolean isBeginningOfSentenceBoolean = JNI_FALSE;
env->GetBooleanArrayRegion(isBeginningOfSentenceArray, i, 1 /* len */, env->GetBooleanArrayRegion(isBeginningOfSentenceArray, i, 1 /* len */,

View file

@ -32,6 +32,7 @@ import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
import com.android.inputmethod.latin.utils.BinaryDictionaryUtils; import com.android.inputmethod.latin.utils.BinaryDictionaryUtils;
import com.android.inputmethod.latin.utils.FileUtils; import com.android.inputmethod.latin.utils.FileUtils;
import com.android.inputmethod.latin.utils.LocaleUtils; import com.android.inputmethod.latin.utils.LocaleUtils;
import com.android.inputmethod.latin.utils.WordInputEventForPersonalization;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
@ -39,6 +40,7 @@ import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.Locale; import java.util.Locale;
import java.util.Map;
import java.util.Random; import java.util.Random;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
@ -748,4 +750,66 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
binaryDictionary.close(); binaryDictionary.close();
} }
/**
 * Runs the input-event update test once for every entry of {@code DICT_FORMAT_VERSIONS}.
 */
public void testUpdateEntriesForInputEvents() {
    for (final int version : DICT_FORMAT_VERSIONS) {
        testUpdateEntriesForInputEvents(version);
    }
}
/**
 * Feeds a randomly generated sequence of word input events to a newly created dictionary of
 * the given format version and checks that every n-gram implied by the sequence is present.
 *
 * <p>Each event is created with the n-gram context that was current when its word was
 * generated. After each word the context is reset to the empty context when a random draw
 * exceeds {@code CONTINUE_RATE}; otherwise the word is appended to the context. Unigrams and
 * bigrams are checked for every format version, trigrams only when
 * {@code supportsNgram(formatVersion)} returns true.
 *
 * @param formatVersion the dictionary format version to create and exercise.
 */
private void testUpdateEntriesForInputEvents(final int formatVersion) {
    setCurrentTimeForTestMode(mCurrentTime);
    final int codePointSetSize = 20;
    final int EVENT_COUNT = 1000;
    final double CONTINUE_RATE = 0.9;
    final long seed = System.currentTimeMillis();
    final Random random = new Random(seed);
    // Attached to every assertion so that a failing randomized run can be replayed.
    final String failureInfo = "formatVersion: " + formatVersion + ", seed: " + seed;
    final File dictFile = createEmptyDictionaryAndGetFile(formatVersion);
    final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);

    final ArrayList<String> unigrams = new ArrayList<>();
    final ArrayList<Pair<String, String>> bigrams = new ArrayList<>();
    final ArrayList<Pair<Pair<String, String>, String>> trigrams = new ArrayList<>();

    final WordInputEventForPersonalization[] inputEvents =
            new WordInputEventForPersonalization[EVENT_COUNT];
    NgramContext ngramContext = NgramContext.EMPTY_PREV_WORDS_INFO;
    int prevWordCount = 0;
    for (int i = 0; i < inputEvents.length; i++) {
        final String word = CodePointUtils.generateWord(random, codePointSet);
        inputEvents[i] = new WordInputEventForPersonalization(word, ngramContext,
                true /* isValid */, mCurrentTime);
        unigrams.add(word);
        if (prevWordCount >= 2) {
            // The trigram must be recorded before this iteration's bigram is appended:
            // the last bigram at this point holds the two words preceding the current one.
            final Pair<String, String> prevWordsPair = bigrams.get(bigrams.size() - 1);
            trigrams.add(new Pair<>(prevWordsPair, word));
        }
        if (prevWordCount >= 1) {
            bigrams.add(new Pair<>(ngramContext.getNthPrevWord(1 /* n */).toString(), word));
        }
        if (random.nextDouble() > CONTINUE_RATE) {
            // Start over with an empty context.
            ngramContext = NgramContext.EMPTY_PREV_WORDS_INFO;
            prevWordCount = 0;
        } else {
            ngramContext = ngramContext.getNextNgramContext(new WordInfo(word));
            prevWordCount++;
        }
    }

    final BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
    try {
        binaryDictionary.updateEntriesForInputEvents(inputEvents);

        for (final String word : unigrams) {
            assertTrue(failureInfo, binaryDictionary.isInDictionary(word));
        }
        for (final Pair<String, String> bigram : bigrams) {
            assertTrue(failureInfo,
                    isValidBigram(binaryDictionary, bigram.first, bigram.second));
        }
        if (supportsNgram(formatVersion)) {
            for (final Pair<Pair<String, String>, String> trigram : trigrams) {
                assertTrue(failureInfo, isValidTrigram(binaryDictionary,
                        trigram.first.first, trigram.first.second, trigram.second));
            }
        }
    } finally {
        // Close the dictionary on every exit path, including assertion failures.
        binaryDictionary.close();
    }
}
} }

View file

@ -29,7 +29,6 @@ import com.android.inputmethod.latin.makedict.WeightedString;
import com.android.inputmethod.latin.makedict.WordProperty; import com.android.inputmethod.latin.makedict.WordProperty;
import com.android.inputmethod.latin.utils.BinaryDictionaryUtils; import com.android.inputmethod.latin.utils.BinaryDictionaryUtils;
import com.android.inputmethod.latin.utils.FileUtils; import com.android.inputmethod.latin.utils.FileUtils;
import com.android.inputmethod.latin.utils.LanguageModelParam;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
@ -884,63 +883,6 @@ public class BinaryDictionaryTests extends AndroidTestCase {
} }
} }
/**
 * Runs the multiple-entry addition test once for every entry of {@code DICT_FORMAT_VERSIONS}.
 */
public void testAddMultipleDictionaryEntries() {
    for (final int version : DICT_FORMAT_VERSIONS) {
        testAddMultipleDictionaryEntries(version);
    }
}
/**
 * Adds a batch of randomly generated language model params to a newly created dictionary of
 * the given format version and checks the unigram and bigram data read back from it.
 *
 * <p>A param is created as a unigram-only entry when there is no previous word, and as a
 * bigram entry otherwise. After each param the current word becomes the previous word when a
 * random draw is below {@code bigramContinueRate}; otherwise the previous word is cleared.
 * Bigram probabilities are compared only when {@code canCheckBigramProbability(formatVersion)}
 * returns true.
 *
 * @param formatVersion the dictionary format version to create and exercise.
 */
private void testAddMultipleDictionaryEntries(final int formatVersion) {
    final int codePointSetSize = 20;
    final int lmParamCount = 1000;
    final double bigramContinueRate = 0.9;
    final long seed = System.currentTimeMillis();
    final Random random = new Random(seed);
    final File dictFile = createEmptyDictionaryAndGetFile(formatVersion);
    final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);

    // Expected values; a later entry for the same key overwrites the earlier one.
    final HashMap<String, Integer> unigramProbabilities = new HashMap<>();
    final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>();

    final LanguageModelParam[] languageModelParams = new LanguageModelParam[lmParamCount];
    String prevWord = null;
    for (int i = 0; i < languageModelParams.length; i++) {
        final String word = CodePointUtils.generateWord(random, codePointSet);
        final int probability = random.nextInt(0xFF);
        // Never lower than the unigram probability and always below 0xFF.
        final int bigramProbability = probability + random.nextInt(0xFF - probability);
        unigramProbabilities.put(word, probability);
        if (prevWord == null) {
            languageModelParams[i] = new LanguageModelParam(word, probability,
                    BinaryDictionary.NOT_A_VALID_TIMESTAMP);
        } else {
            languageModelParams[i] = new LanguageModelParam(prevWord, word, probability,
                    bigramProbability, BinaryDictionary.NOT_A_VALID_TIMESTAMP);
            bigramProbabilities.put(new Pair<>(prevWord, word),
                    bigramProbability);
        }
        prevWord = (random.nextDouble() < bigramContinueRate) ? word : null;
    }

    final BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
    try {
        binaryDictionary.addMultipleDictionaryEntries(languageModelParams);

        for (Map.Entry<String, Integer> entry : unigramProbabilities.entrySet()) {
            assertEquals((int)entry.getValue(), binaryDictionary.getFrequency(entry.getKey()));
        }

        for (Map.Entry<Pair<String, String>, Integer> entry : bigramProbabilities.entrySet()) {
            final String word0 = entry.getKey().first;
            final String word1 = entry.getKey().second;
            final int bigramProbability = entry.getValue();
            assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY,
                    isValidBigram(binaryDictionary, word0, word1));
            if (canCheckBigramProbability(formatVersion)) {
                assertEquals(bigramProbability,
                        getBigramProbability(binaryDictionary, word0, word1));
            }
        }
    } finally {
        // Close the dictionary on every exit path, including assertion failures.
        binaryDictionary.close();
    }
}
public void testGetWordProperties() { public void testGetWordProperties() {
for (final int formatVersion : DICT_FORMAT_VERSIONS) { for (final int formatVersion : DICT_FORMAT_VERSIONS) {
testGetWordProperties(formatVersion); testGetWordProperties(formatVersion);

View file

@ -30,7 +30,7 @@ import com.android.inputmethod.latin.Dictionary;
import com.android.inputmethod.latin.DictionaryFacilitator; import com.android.inputmethod.latin.DictionaryFacilitator;
import com.android.inputmethod.latin.ExpandableBinaryDictionary; import com.android.inputmethod.latin.ExpandableBinaryDictionary;
import com.android.inputmethod.latin.RichInputMethodManager; import com.android.inputmethod.latin.RichInputMethodManager;
import com.android.inputmethod.latin.ExpandableBinaryDictionary.AddMultipleDictionaryEntriesCallback; import com.android.inputmethod.latin.ExpandableBinaryDictionary.UpdateEntriesForInputEventsCallback;
import com.android.inputmethod.latin.common.CodePointUtils; import com.android.inputmethod.latin.common.CodePointUtils;
import com.android.inputmethod.latin.settings.SpacingAndPunctuations; import com.android.inputmethod.latin.settings.SpacingAndPunctuations;
@ -96,8 +96,8 @@ public class PersonalizationDictionaryTests extends AndroidTestCase {
true /* inputByUser */, tokens, timeStampInSeconds, DUMMY_PACKAGE_NAME, true /* inputByUser */, tokens, timeStampInSeconds, DUMMY_PACKAGE_NAME,
LOCALE_EN_US.getLanguage()); LOCALE_EN_US.getLanguage());
final CountDownLatch countDownLatch = new CountDownLatch(1); final CountDownLatch countDownLatch = new CountDownLatch(1);
final AddMultipleDictionaryEntriesCallback callback = final UpdateEntriesForInputEventsCallback callback =
new AddMultipleDictionaryEntriesCallback() { new UpdateEntriesForInputEventsCallback() {
@Override @Override
public void onFinished() { public void onFinished() {
countDownLatch.countDown(); countDownLatch.countDown();