Use trigrams for personalization dict.
Bug: 14425059 Change-Id: I73cf6904e569d60996a3b079f16ea6df0cb90f02
This commit is contained in:
parent
d3a4c51324
commit
16cc3992d7
11 changed files with 265 additions and 343 deletions
|
@ -32,8 +32,8 @@ import com.android.inputmethod.latin.settings.SettingsValuesForSuggestion;
|
||||||
import com.android.inputmethod.latin.utils.BinaryDictionaryUtils;
|
import com.android.inputmethod.latin.utils.BinaryDictionaryUtils;
|
||||||
import com.android.inputmethod.latin.utils.FileUtils;
|
import com.android.inputmethod.latin.utils.FileUtils;
|
||||||
import com.android.inputmethod.latin.utils.JniUtils;
|
import com.android.inputmethod.latin.utils.JniUtils;
|
||||||
import com.android.inputmethod.latin.utils.LanguageModelParam;
|
|
||||||
import com.android.inputmethod.latin.utils.StringUtils;
|
import com.android.inputmethod.latin.utils.StringUtils;
|
||||||
|
import com.android.inputmethod.latin.utils.WordInputEventForPersonalization;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
@ -205,8 +205,8 @@ public final class BinaryDictionary extends Dictionary {
|
||||||
private static native boolean updateEntriesForWordWithNgramContextNative(long dict,
|
private static native boolean updateEntriesForWordWithNgramContextNative(long dict,
|
||||||
int[][] prevWordCodePointArrays, boolean[] isBeginningOfSentenceArray,
|
int[][] prevWordCodePointArrays, boolean[] isBeginningOfSentenceArray,
|
||||||
int[] word, boolean isValidWord, int count, int timestamp);
|
int[] word, boolean isValidWord, int count, int timestamp);
|
||||||
private static native int addMultipleDictionaryEntriesNative(long dict,
|
private static native int updateEntriesForInputEventsNative(long dict,
|
||||||
LanguageModelParam[] languageModelParams, int startIndex);
|
WordInputEventForPersonalization[] inputEvents, int startIndex);
|
||||||
private static native String getPropertyNative(long dict, String query);
|
private static native String getPropertyNative(long dict, String query);
|
||||||
private static native boolean isCorruptedNative(long dict);
|
private static native boolean isCorruptedNative(long dict);
|
||||||
private static native boolean migrateNative(long dict, String dictFilePath,
|
private static native boolean migrateNative(long dict, String dictFilePath,
|
||||||
|
@ -526,19 +526,19 @@ public final class BinaryDictionary extends Dictionary {
|
||||||
}
|
}
|
||||||
|
|
||||||
@UsedForTesting
|
@UsedForTesting
|
||||||
public void addMultipleDictionaryEntries(final LanguageModelParam[] languageModelParams) {
|
public void updateEntriesForInputEvents(final WordInputEventForPersonalization[] inputEvents) {
|
||||||
if (!isValidDictionary()) {
|
if (!isValidDictionary()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
int processedParamCount = 0;
|
int processedEventCount = 0;
|
||||||
while (processedParamCount < languageModelParams.length) {
|
while (processedEventCount < inputEvents.length) {
|
||||||
if (needsToRunGC(true /* mindsBlockByGC */)) {
|
if (needsToRunGC(true /* mindsBlockByGC */)) {
|
||||||
flushWithGC();
|
flushWithGC();
|
||||||
}
|
}
|
||||||
processedParamCount = addMultipleDictionaryEntriesNative(mNativeDict,
|
processedEventCount = updateEntriesForInputEventsNative(mNativeDict, inputEvents,
|
||||||
languageModelParams, processedParamCount);
|
processedEventCount);
|
||||||
mHasUpdated = true;
|
mHasUpdated = true;
|
||||||
if (processedParamCount <= 0) {
|
if (processedEventCount <= 0) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -24,7 +24,7 @@ import android.view.inputmethod.InputMethodSubtype;
|
||||||
|
|
||||||
import com.android.inputmethod.annotations.UsedForTesting;
|
import com.android.inputmethod.annotations.UsedForTesting;
|
||||||
import com.android.inputmethod.keyboard.ProximityInfo;
|
import com.android.inputmethod.keyboard.ProximityInfo;
|
||||||
import com.android.inputmethod.latin.ExpandableBinaryDictionary.AddMultipleDictionaryEntriesCallback;
|
import com.android.inputmethod.latin.ExpandableBinaryDictionary.UpdateEntriesForInputEventsCallback;
|
||||||
import com.android.inputmethod.latin.NgramContext.WordInfo;
|
import com.android.inputmethod.latin.NgramContext.WordInfo;
|
||||||
import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo;
|
import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo;
|
||||||
import com.android.inputmethod.latin.personalization.ContextualDictionary;
|
import com.android.inputmethod.latin.personalization.ContextualDictionary;
|
||||||
|
@ -796,8 +796,8 @@ public class DictionaryFacilitator {
|
||||||
public void addEntriesToPersonalizationDictionary(
|
public void addEntriesToPersonalizationDictionary(
|
||||||
final PersonalizationDataChunk personalizationDataChunk,
|
final PersonalizationDataChunk personalizationDataChunk,
|
||||||
final SpacingAndPunctuations spacingAndPunctuations,
|
final SpacingAndPunctuations spacingAndPunctuations,
|
||||||
final AddMultipleDictionaryEntriesCallback callback) {
|
final UpdateEntriesForInputEventsCallback callback) {
|
||||||
mPersonalizationHelper.addEntriesToPersonalizationDictionariesToUpdate(
|
mPersonalizationHelper.updateEntriesOfPersonalizationDictionaries(
|
||||||
getMostProbableLocale(), personalizationDataChunk, spacingAndPunctuations,
|
getMostProbableLocale(), personalizationDataChunk, spacingAndPunctuations,
|
||||||
callback);
|
callback);
|
||||||
}
|
}
|
||||||
|
|
|
@ -32,7 +32,7 @@ import com.android.inputmethod.latin.utils.CombinedFormatUtils;
|
||||||
import com.android.inputmethod.latin.utils.DistracterFilter;
|
import com.android.inputmethod.latin.utils.DistracterFilter;
|
||||||
import com.android.inputmethod.latin.utils.ExecutorUtils;
|
import com.android.inputmethod.latin.utils.ExecutorUtils;
|
||||||
import com.android.inputmethod.latin.utils.FileUtils;
|
import com.android.inputmethod.latin.utils.FileUtils;
|
||||||
import com.android.inputmethod.latin.utils.LanguageModelParam;
|
import com.android.inputmethod.latin.utils.WordInputEventForPersonalization;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
@ -447,16 +447,16 @@ abstract public class ExpandableBinaryDictionary extends Dictionary {
|
||||||
}, word, distracterFilter);
|
}, word, distracterFilter);
|
||||||
}
|
}
|
||||||
|
|
||||||
public interface AddMultipleDictionaryEntriesCallback {
|
public interface UpdateEntriesForInputEventsCallback {
|
||||||
public void onFinished();
|
public void onFinished();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Dynamically add multiple entries to the dictionary.
|
* Dynamically update entries according to input events.
|
||||||
*/
|
*/
|
||||||
public void addMultipleDictionaryEntriesDynamically(
|
public void updateEntriesForInputEvents(
|
||||||
@Nonnull final ArrayList<LanguageModelParam> languageModelParams,
|
@Nonnull final ArrayList<WordInputEventForPersonalization> inputEvents,
|
||||||
final AddMultipleDictionaryEntriesCallback callback) {
|
final UpdateEntriesForInputEventsCallback callback) {
|
||||||
reloadDictionaryIfRequired();
|
reloadDictionaryIfRequired();
|
||||||
asyncExecuteTaskWithWriteLock(new Runnable() {
|
asyncExecuteTaskWithWriteLock(new Runnable() {
|
||||||
@Override
|
@Override
|
||||||
|
@ -466,9 +466,9 @@ abstract public class ExpandableBinaryDictionary extends Dictionary {
|
||||||
if (binaryDictionary == null) {
|
if (binaryDictionary == null) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
binaryDictionary.addMultipleDictionaryEntries(
|
binaryDictionary.updateEntriesForInputEvents(
|
||||||
languageModelParams.toArray(
|
inputEvents.toArray(
|
||||||
new LanguageModelParam[languageModelParams.size()]));
|
new WordInputEventForPersonalization[inputEvents.size()]));
|
||||||
} finally {
|
} finally {
|
||||||
if (callback != null) {
|
if (callback != null) {
|
||||||
callback.onFinished();
|
callback.onFinished();
|
||||||
|
|
|
@ -26,14 +26,14 @@ import java.util.concurrent.atomic.AtomicInteger;
|
||||||
import android.content.Context;
|
import android.content.Context;
|
||||||
import android.view.inputmethod.InputMethodSubtype;
|
import android.view.inputmethod.InputMethodSubtype;
|
||||||
|
|
||||||
import com.android.inputmethod.latin.ExpandableBinaryDictionary.AddMultipleDictionaryEntriesCallback;
|
import com.android.inputmethod.latin.ExpandableBinaryDictionary.UpdateEntriesForInputEventsCallback;
|
||||||
import com.android.inputmethod.latin.personalization.PersonalizationDataChunk;
|
import com.android.inputmethod.latin.personalization.PersonalizationDataChunk;
|
||||||
import com.android.inputmethod.latin.personalization.PersonalizationDictionary;
|
import com.android.inputmethod.latin.personalization.PersonalizationDictionary;
|
||||||
import com.android.inputmethod.latin.settings.SpacingAndPunctuations;
|
import com.android.inputmethod.latin.settings.SpacingAndPunctuations;
|
||||||
import com.android.inputmethod.latin.utils.DistracterFilter;
|
import com.android.inputmethod.latin.utils.DistracterFilter;
|
||||||
import com.android.inputmethod.latin.utils.DistracterFilterCheckingIsInDictionary;
|
import com.android.inputmethod.latin.utils.DistracterFilterCheckingIsInDictionary;
|
||||||
import com.android.inputmethod.latin.utils.LanguageModelParam;
|
|
||||||
import com.android.inputmethod.latin.utils.SubtypeLocaleUtils;
|
import com.android.inputmethod.latin.utils.SubtypeLocaleUtils;
|
||||||
|
import com.android.inputmethod.latin.utils.WordInputEventForPersonalization;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Class for managing and updating personalization dictionaries.
|
* Class for managing and updating personalization dictionaries.
|
||||||
|
@ -119,10 +119,10 @@ public class PersonalizationHelperForDictionaryFacilitator {
|
||||||
return personalizationDict;
|
return personalizationDict;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void addEntriesToPersonalizationDictionariesForLocale(final Locale locale,
|
private void updateEntriesOfPersonalizationDictionariesForLocale(final Locale locale,
|
||||||
final PersonalizationDataChunk personalizationDataChunk,
|
final PersonalizationDataChunk personalizationDataChunk,
|
||||||
final SpacingAndPunctuations spacingAndPunctuations,
|
final SpacingAndPunctuations spacingAndPunctuations,
|
||||||
final AddMultipleDictionaryEntriesCallback callback) {
|
final UpdateEntriesForInputEventsCallback callback) {
|
||||||
final ExpandableBinaryDictionary personalizationDict =
|
final ExpandableBinaryDictionary personalizationDict =
|
||||||
getPersonalizationDictToUpdate(mContext, locale);
|
getPersonalizationDictToUpdate(mContext, locale);
|
||||||
if (personalizationDict == null) {
|
if (personalizationDict == null) {
|
||||||
|
@ -131,25 +131,25 @@ public class PersonalizationHelperForDictionaryFacilitator {
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
final ArrayList<LanguageModelParam> languageModelParams =
|
final ArrayList<WordInputEventForPersonalization> inputEvents =
|
||||||
LanguageModelParam.createLanguageModelParamsFrom(
|
WordInputEventForPersonalization.createInputEventFrom(
|
||||||
personalizationDataChunk.mTokens,
|
personalizationDataChunk.mTokens,
|
||||||
personalizationDataChunk.mTimestampInSeconds, spacingAndPunctuations,
|
personalizationDataChunk.mTimestampInSeconds, spacingAndPunctuations,
|
||||||
locale, new DistracterFilterCheckingIsInDictionary(
|
locale, new DistracterFilterCheckingIsInDictionary(
|
||||||
mDistracterFilter, personalizationDict));
|
mDistracterFilter, personalizationDict));
|
||||||
if (languageModelParams == null || languageModelParams.isEmpty()) {
|
if (inputEvents == null || inputEvents.isEmpty()) {
|
||||||
if (callback != null) {
|
if (callback != null) {
|
||||||
callback.onFinished();
|
callback.onFinished();
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
personalizationDict.addMultipleDictionaryEntriesDynamically(languageModelParams, callback);
|
personalizationDict.updateEntriesForInputEvents(inputEvents, callback);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void addEntriesToPersonalizationDictionariesToUpdate(final Locale defaultLocale,
|
public void updateEntriesOfPersonalizationDictionaries(final Locale defaultLocale,
|
||||||
final PersonalizationDataChunk personalizationDataChunk,
|
final PersonalizationDataChunk personalizationDataChunk,
|
||||||
final SpacingAndPunctuations spacingAndPunctuations,
|
final SpacingAndPunctuations spacingAndPunctuations,
|
||||||
final AddMultipleDictionaryEntriesCallback callback) {
|
final UpdateEntriesForInputEventsCallback callback) {
|
||||||
final String language = personalizationDataChunk.mDetectedLanguage;
|
final String language = personalizationDataChunk.mDetectedLanguage;
|
||||||
final HashSet<Locale> locales;
|
final HashSet<Locale> locales;
|
||||||
if (mIsMonolingualUser && PersonalizationDataChunk.LANGUAGE_UNKNOWN.equals(language)
|
if (mIsMonolingualUser && PersonalizationDataChunk.LANGUAGE_UNKNOWN.equals(language)
|
||||||
|
@ -165,8 +165,8 @@ public class PersonalizationHelperForDictionaryFacilitator {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
final AtomicInteger remainingTaskCount = new AtomicInteger(locales.size());
|
final AtomicInteger remainingTaskCount = new AtomicInteger(locales.size());
|
||||||
final AddMultipleDictionaryEntriesCallback callbackForLocales =
|
final UpdateEntriesForInputEventsCallback callbackForLocales =
|
||||||
new AddMultipleDictionaryEntriesCallback() {
|
new UpdateEntriesForInputEventsCallback() {
|
||||||
@Override
|
@Override
|
||||||
public void onFinished() {
|
public void onFinished() {
|
||||||
if (remainingTaskCount.decrementAndGet() == 0) {
|
if (remainingTaskCount.decrementAndGet() == 0) {
|
||||||
|
@ -178,7 +178,7 @@ public class PersonalizationHelperForDictionaryFacilitator {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
for (final Locale locale : locales) {
|
for (final Locale locale : locales) {
|
||||||
addEntriesToPersonalizationDictionariesForLocale(locale, personalizationDataChunk,
|
updateEntriesOfPersonalizationDictionariesForLocale(locale, personalizationDataChunk,
|
||||||
spacingAndPunctuations, callbackForLocales);
|
spacingAndPunctuations, callbackForLocales);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,166 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (C) 2014 The Android Open Source Project
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package com.android.inputmethod.latin.utils;
|
|
||||||
|
|
||||||
import android.util.Log;
|
|
||||||
|
|
||||||
import com.android.inputmethod.annotations.UsedForTesting;
|
|
||||||
import com.android.inputmethod.latin.Dictionary;
|
|
||||||
import com.android.inputmethod.latin.NgramContext;
|
|
||||||
import com.android.inputmethod.latin.settings.SpacingAndPunctuations;
|
|
||||||
import com.android.inputmethod.latin.utils.DistracterFilter.HandlingType;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Locale;
|
|
||||||
|
|
||||||
// Note: this class is used as a parameter type of a native method. You should be careful when you
|
|
||||||
// rename this class or field name. See BinaryDictionary#addMultipleDictionaryEntriesNative().
|
|
||||||
public final class LanguageModelParam {
|
|
||||||
private static final String TAG = LanguageModelParam.class.getSimpleName();
|
|
||||||
private static final boolean DEBUG = false;
|
|
||||||
private static final boolean DEBUG_TOKEN = false;
|
|
||||||
|
|
||||||
// For now, these probability values are being referred to only when we add new entries to
|
|
||||||
// decaying dynamic binary dictionaries. When these are referred to, what matters is 0 or
|
|
||||||
// non-0. Thus, it's not meaningful to compare 10, 100, and so on.
|
|
||||||
// TODO: Revise the logic in ForgettingCurveUtils in native code.
|
|
||||||
private static final int UNIGRAM_PROBABILITY_FOR_VALID_WORD = 100;
|
|
||||||
private static final int UNIGRAM_PROBABILITY_FOR_OOV_WORD = Dictionary.NOT_A_PROBABILITY;
|
|
||||||
private static final int BIGRAM_PROBABILITY_FOR_VALID_WORD = 10;
|
|
||||||
private static final int BIGRAM_PROBABILITY_FOR_OOV_WORD = Dictionary.NOT_A_PROBABILITY;
|
|
||||||
|
|
||||||
public final CharSequence mTargetWord;
|
|
||||||
public final int[] mWord0;
|
|
||||||
public final int[] mWord1;
|
|
||||||
// TODO: this needs to be a list of shortcuts
|
|
||||||
public final int[] mShortcutTarget;
|
|
||||||
public final int mUnigramProbability;
|
|
||||||
public final int mBigramProbability;
|
|
||||||
public final int mShortcutProbability;
|
|
||||||
public final boolean mIsNotAWord;
|
|
||||||
public final boolean mIsPossiblyOffensive;
|
|
||||||
// Time stamp in seconds.
|
|
||||||
public final int mTimestamp;
|
|
||||||
|
|
||||||
// Constructor for unigram. TODO: support shortcuts
|
|
||||||
@UsedForTesting
|
|
||||||
public LanguageModelParam(final CharSequence word, final int unigramProbability,
|
|
||||||
final int timestamp) {
|
|
||||||
this(null /* word0 */, word, unigramProbability, Dictionary.NOT_A_PROBABILITY, timestamp);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Constructor for unigram and bigram.
|
|
||||||
@UsedForTesting
|
|
||||||
public LanguageModelParam(final CharSequence word0, final CharSequence word1,
|
|
||||||
final int unigramProbability, final int bigramProbability,
|
|
||||||
final int timestamp) {
|
|
||||||
mTargetWord = word1;
|
|
||||||
mWord0 = (word0 == null) ? null : StringUtils.toCodePointArray(word0);
|
|
||||||
mWord1 = StringUtils.toCodePointArray(word1);
|
|
||||||
mShortcutTarget = null;
|
|
||||||
mUnigramProbability = unigramProbability;
|
|
||||||
mBigramProbability = bigramProbability;
|
|
||||||
mShortcutProbability = Dictionary.NOT_A_PROBABILITY;
|
|
||||||
mIsNotAWord = false;
|
|
||||||
mIsPossiblyOffensive = false;
|
|
||||||
mTimestamp = timestamp;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Process a list of words and return a list of {@link LanguageModelParam} objects.
|
|
||||||
public static ArrayList<LanguageModelParam> createLanguageModelParamsFrom(
|
|
||||||
final List<String> tokens, final int timestamp,
|
|
||||||
final SpacingAndPunctuations spacingAndPunctuations, final Locale locale,
|
|
||||||
final DistracterFilter distracterFilter) {
|
|
||||||
final ArrayList<LanguageModelParam> languageModelParams = new ArrayList<>();
|
|
||||||
final int N = tokens.size();
|
|
||||||
NgramContext ngramContext = NgramContext.EMPTY_PREV_WORDS_INFO;
|
|
||||||
for (int i = 0; i < N; ++i) {
|
|
||||||
final String tempWord = tokens.get(i);
|
|
||||||
if (StringUtils.isEmptyStringOrWhiteSpaces(tempWord)) {
|
|
||||||
// just skip this token
|
|
||||||
if (DEBUG_TOKEN) {
|
|
||||||
Log.d(TAG, "--- isEmptyStringOrWhiteSpaces: \"" + tempWord + "\"");
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (!DictionaryInfoUtils.looksValidForDictionaryInsertion(
|
|
||||||
tempWord, spacingAndPunctuations)) {
|
|
||||||
if (DEBUG_TOKEN) {
|
|
||||||
Log.d(TAG, "--- not looksValidForDictionaryInsertion: \""
|
|
||||||
+ tempWord + "\"");
|
|
||||||
}
|
|
||||||
// Sentence terminator found. Split.
|
|
||||||
ngramContext = NgramContext.EMPTY_PREV_WORDS_INFO;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (DEBUG_TOKEN) {
|
|
||||||
Log.d(TAG, "--- word: \"" + tempWord + "\"");
|
|
||||||
}
|
|
||||||
final LanguageModelParam languageModelParam =
|
|
||||||
detectWhetherVaildWordOrNotAndGetLanguageModelParam(
|
|
||||||
ngramContext, tempWord, timestamp, locale, distracterFilter);
|
|
||||||
if (languageModelParam == null) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
languageModelParams.add(languageModelParam);
|
|
||||||
ngramContext = ngramContext.getNextNgramContext(
|
|
||||||
new NgramContext.WordInfo(tempWord));
|
|
||||||
}
|
|
||||||
return languageModelParams;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static LanguageModelParam detectWhetherVaildWordOrNotAndGetLanguageModelParam(
|
|
||||||
final NgramContext ngramContext, final String targetWord, final int timestamp,
|
|
||||||
final Locale locale, final DistracterFilter distracterFilter) {
|
|
||||||
if (locale == null) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
final int wordHandlingType = distracterFilter.getWordHandlingType(ngramContext,
|
|
||||||
targetWord, locale);
|
|
||||||
final String word = HandlingType.shouldBeLowerCased(wordHandlingType) ?
|
|
||||||
targetWord.toLowerCase(locale) : targetWord;
|
|
||||||
if (distracterFilter.isDistracterToWordsInDictionaries(ngramContext, targetWord, locale)) {
|
|
||||||
// The word is a distracter.
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
return createAndGetLanguageModelParamOfWord(ngramContext, word, timestamp,
|
|
||||||
!HandlingType.shouldBeHandledAsOov(wordHandlingType));
|
|
||||||
}
|
|
||||||
|
|
||||||
private static LanguageModelParam createAndGetLanguageModelParamOfWord(
|
|
||||||
final NgramContext ngramContext, final String word, final int timestamp,
|
|
||||||
final boolean isValidWord) {
|
|
||||||
final int unigramProbability = isValidWord ?
|
|
||||||
UNIGRAM_PROBABILITY_FOR_VALID_WORD : UNIGRAM_PROBABILITY_FOR_OOV_WORD;
|
|
||||||
if (!ngramContext.isValid()) {
|
|
||||||
if (DEBUG) {
|
|
||||||
Log.d(TAG, "--- add unigram: current("
|
|
||||||
+ (isValidWord ? "Valid" : "OOV") + ") = " + word);
|
|
||||||
}
|
|
||||||
return new LanguageModelParam(word, unigramProbability, timestamp);
|
|
||||||
}
|
|
||||||
if (DEBUG) {
|
|
||||||
Log.d(TAG, "--- add bigram: prev = " + ngramContext + ", current("
|
|
||||||
+ (isValidWord ? "Valid" : "OOV") + ") = " + word);
|
|
||||||
}
|
|
||||||
final int bigramProbability = isValidWord ?
|
|
||||||
BIGRAM_PROBABILITY_FOR_VALID_WORD : BIGRAM_PROBABILITY_FOR_OOV_WORD;
|
|
||||||
return new LanguageModelParam(ngramContext.getNthPrevWord(1 /* n */), word,
|
|
||||||
unigramProbability, bigramProbability, timestamp);
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -0,0 +1,117 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2014 The Android Open Source Project
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package com.android.inputmethod.latin.utils;
|
||||||
|
|
||||||
|
import android.util.Log;
|
||||||
|
|
||||||
|
import com.android.inputmethod.annotations.UsedForTesting;
|
||||||
|
import com.android.inputmethod.latin.Constants;
|
||||||
|
import com.android.inputmethod.latin.NgramContext;
|
||||||
|
import com.android.inputmethod.latin.settings.SpacingAndPunctuations;
|
||||||
|
import com.android.inputmethod.latin.utils.DistracterFilter.HandlingType;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
|
// Note: this class is used as a parameter type of a native method. You should be careful when you
|
||||||
|
// rename this class or its fields. See BinaryDictionary#updateEntriesForInputEventsNative().
|
||||||
|
public final class WordInputEventForPersonalization {
|
||||||
|
private static final String TAG = WordInputEventForPersonalization.class.getSimpleName();
|
||||||
|
private static final boolean DEBUG_TOKEN = false;
|
||||||
|
|
||||||
|
public final int[] mTargetWord;
|
||||||
|
public final int mPrevWordsCount;
|
||||||
|
public final int[][] mPrevWordArray = new int[Constants.MAX_PREV_WORD_COUNT_FOR_N_GRAM][];
|
||||||
|
public final boolean[] mIsPrevWordBeginningOfSentenceArray =
|
||||||
|
new boolean[Constants.MAX_PREV_WORD_COUNT_FOR_N_GRAM];
|
||||||
|
public final boolean mIsValid;
|
||||||
|
// Time stamp in seconds.
|
||||||
|
public final int mTimestamp;
|
||||||
|
|
||||||
|
@UsedForTesting
|
||||||
|
public WordInputEventForPersonalization(final CharSequence targetWord,
|
||||||
|
final NgramContext ngramContext, final boolean isValid, final int timestamp) {
|
||||||
|
mTargetWord = StringUtils.toCodePointArray(targetWord);
|
||||||
|
mPrevWordsCount = ngramContext.getPrevWordCount();
|
||||||
|
ngramContext.outputToArray(mPrevWordArray, mIsPrevWordBeginningOfSentenceArray);
|
||||||
|
mIsValid = isValid;
|
||||||
|
mTimestamp = timestamp;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process a list of words and return a list of {@link WordInputEventForPersonalization}
|
||||||
|
// objects.
|
||||||
|
public static ArrayList<WordInputEventForPersonalization> createInputEventFrom(
|
||||||
|
final List<String> tokens, final int timestamp,
|
||||||
|
final SpacingAndPunctuations spacingAndPunctuations, final Locale locale,
|
||||||
|
final DistracterFilter distracterFilter) {
|
||||||
|
final ArrayList<WordInputEventForPersonalization> inputEvents = new ArrayList<>();
|
||||||
|
final int N = tokens.size();
|
||||||
|
NgramContext ngramContext = NgramContext.EMPTY_PREV_WORDS_INFO;
|
||||||
|
for (int i = 0; i < N; ++i) {
|
||||||
|
final String tempWord = tokens.get(i);
|
||||||
|
if (StringUtils.isEmptyStringOrWhiteSpaces(tempWord)) {
|
||||||
|
// just skip this token
|
||||||
|
if (DEBUG_TOKEN) {
|
||||||
|
Log.d(TAG, "--- isEmptyStringOrWhiteSpaces: \"" + tempWord + "\"");
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (!DictionaryInfoUtils.looksValidForDictionaryInsertion(
|
||||||
|
tempWord, spacingAndPunctuations)) {
|
||||||
|
if (DEBUG_TOKEN) {
|
||||||
|
Log.d(TAG, "--- not looksValidForDictionaryInsertion: \""
|
||||||
|
+ tempWord + "\"");
|
||||||
|
}
|
||||||
|
// Sentence terminator found. Split.
|
||||||
|
// TODO: Detect whether the context is beginning-of-sentence.
|
||||||
|
ngramContext = NgramContext.EMPTY_PREV_WORDS_INFO;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (DEBUG_TOKEN) {
|
||||||
|
Log.d(TAG, "--- word: \"" + tempWord + "\"");
|
||||||
|
}
|
||||||
|
final WordInputEventForPersonalization inputEvent =
|
||||||
|
detectWhetherVaildWordOrNotAndGetInputEvent(
|
||||||
|
ngramContext, tempWord, timestamp, locale, distracterFilter);
|
||||||
|
if (inputEvent == null) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
inputEvents.add(inputEvent);
|
||||||
|
ngramContext = ngramContext.getNextNgramContext(new NgramContext.WordInfo(tempWord));
|
||||||
|
}
|
||||||
|
return inputEvents;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static WordInputEventForPersonalization detectWhetherVaildWordOrNotAndGetInputEvent(
|
||||||
|
final NgramContext ngramContext, final String targetWord, final int timestamp,
|
||||||
|
final Locale locale, final DistracterFilter distracterFilter) {
|
||||||
|
if (locale == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
final int wordHandlingType = distracterFilter.getWordHandlingType(ngramContext,
|
||||||
|
targetWord, locale);
|
||||||
|
final String word = HandlingType.shouldBeLowerCased(wordHandlingType) ?
|
||||||
|
targetWord.toLowerCase(locale) : targetWord;
|
||||||
|
if (distracterFilter.isDistracterToWordsInDictionaries(ngramContext, targetWord, locale)) {
|
||||||
|
// The word is a distracter.
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return new WordInputEventForPersonalization(word, ngramContext,
|
||||||
|
!HandlingType.shouldBeHandledAsOov(wordHandlingType), timestamp);
|
||||||
|
}
|
||||||
|
}
|
|
@ -453,98 +453,60 @@ static bool latinime_BinaryDictionary_updateEntriesForWordWithNgramContext(JNIEn
|
||||||
historicalInfo);
|
historicalInfo);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns how many language model params are processed.
|
// Returns how many input events are processed.
|
||||||
static int latinime_BinaryDictionary_addMultipleDictionaryEntries(JNIEnv *env, jclass clazz,
|
static int latinime_BinaryDictionary_updateEntriesForInputEvents(JNIEnv *env, jclass clazz,
|
||||||
jlong dict, jobjectArray languageModelParams, jint startIndex) {
|
jlong dict, jobjectArray inputEvents, jint startIndex) {
|
||||||
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
|
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
|
||||||
if (!dictionary) {
|
if (!dictionary) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
jsize languageModelParamCount = env->GetArrayLength(languageModelParams);
|
jsize inputEventCount = env->GetArrayLength(inputEvents);
|
||||||
if (languageModelParamCount == 0 || startIndex >= languageModelParamCount) {
|
if (inputEventCount == 0 || startIndex >= inputEventCount) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
jobject languageModelParam = env->GetObjectArrayElement(languageModelParams, 0);
|
jobject inputEvent = env->GetObjectArrayElement(inputEvents, 0);
|
||||||
jclass languageModelParamClass = env->GetObjectClass(languageModelParam);
|
jclass wordInputEventClass = env->GetObjectClass(inputEvent);
|
||||||
env->DeleteLocalRef(languageModelParam);
|
env->DeleteLocalRef(inputEvent);
|
||||||
|
|
||||||
jfieldID word0FieldId = env->GetFieldID(languageModelParamClass, "mWord0", "[I");
|
jfieldID targetWordFieldId = env->GetFieldID(wordInputEventClass, "mTargetWord", "[I");
|
||||||
jfieldID word1FieldId = env->GetFieldID(languageModelParamClass, "mWord1", "[I");
|
jfieldID prevWordCountFieldId = env->GetFieldID(wordInputEventClass, "mPrevWordsCount", "I");
|
||||||
jfieldID unigramProbabilityFieldId =
|
jfieldID prevWordArrayFieldId = env->GetFieldID(wordInputEventClass, "mPrevWordArray", "[[I");
|
||||||
env->GetFieldID(languageModelParamClass, "mUnigramProbability", "I");
|
jfieldID isPrevWordBoSArrayFieldId =
|
||||||
jfieldID bigramProbabilityFieldId =
|
env->GetFieldID(wordInputEventClass, "mIsPrevWordBeginningOfSentenceArray", "[Z");
|
||||||
env->GetFieldID(languageModelParamClass, "mBigramProbability", "I");
|
jfieldID isValidFieldId = env->GetFieldID(wordInputEventClass, "mIsValid", "Z");
|
||||||
jfieldID timestampFieldId =
|
jfieldID timestampFieldId = env->GetFieldID(wordInputEventClass, "mTimestamp", "I");
|
||||||
env->GetFieldID(languageModelParamClass, "mTimestamp", "I");
|
env->DeleteLocalRef(wordInputEventClass);
|
||||||
jfieldID shortcutTargetFieldId =
|
|
||||||
env->GetFieldID(languageModelParamClass, "mShortcutTarget", "[I");
|
|
||||||
jfieldID shortcutProbabilityFieldId =
|
|
||||||
env->GetFieldID(languageModelParamClass, "mShortcutProbability", "I");
|
|
||||||
jfieldID isNotAWordFieldId =
|
|
||||||
env->GetFieldID(languageModelParamClass, "mIsNotAWord", "Z");
|
|
||||||
jfieldID isPossiblyOffensiveFieldId =
|
|
||||||
env->GetFieldID(languageModelParamClass, "mIsPossiblyOffensive", "Z");
|
|
||||||
env->DeleteLocalRef(languageModelParamClass);
|
|
||||||
|
|
||||||
for (int i = startIndex; i < languageModelParamCount; ++i) {
|
for (int i = startIndex; i < inputEventCount; ++i) {
|
||||||
jobject languageModelParam = env->GetObjectArrayElement(languageModelParams, i);
|
jobject inputEvent = env->GetObjectArrayElement(inputEvents, i);
|
||||||
// languageModelParam is a set of params for word1; thus, word1 cannot be null. On the
|
jintArray targetWord = static_cast<jintArray>(
|
||||||
// other hand, word0 can be null and then it means the set of params doesn't contain bigram
|
env->GetObjectField(inputEvent, targetWordFieldId));
|
||||||
// information.
|
jsize wordLength = env->GetArrayLength(targetWord);
|
||||||
jintArray word0 = static_cast<jintArray>(
|
int wordCodePoints[wordLength];
|
||||||
env->GetObjectField(languageModelParam, word0FieldId));
|
env->GetIntArrayRegion(targetWord, 0, wordLength, wordCodePoints);
|
||||||
jsize word0Length = word0 ? env->GetArrayLength(word0) : 0;
|
env->DeleteLocalRef(targetWord);
|
||||||
int word0CodePoints[word0Length];
|
|
||||||
if (word0) {
|
jint prevWordCount = env->GetIntField(inputEvent, prevWordCountFieldId);
|
||||||
env->GetIntArrayRegion(word0, 0, word0Length, word0CodePoints);
|
jobjectArray prevWordArray =
|
||||||
}
|
static_cast<jobjectArray>(env->GetObjectField(inputEvent, prevWordArrayFieldId));
|
||||||
jintArray word1 = static_cast<jintArray>(
|
jbooleanArray isPrevWordBeginningOfSentenceArray = static_cast<jbooleanArray>(
|
||||||
env->GetObjectField(languageModelParam, word1FieldId));
|
env->GetObjectField(inputEvent, isPrevWordBoSArrayFieldId));
|
||||||
jsize word1Length = env->GetArrayLength(word1);
|
jboolean isValid = env->GetBooleanField(inputEvent, isValidFieldId);
|
||||||
int word1CodePoints[word1Length];
|
jint timestamp = env->GetIntField(inputEvent, timestampFieldId);
|
||||||
env->GetIntArrayRegion(word1, 0, word1Length, word1CodePoints);
|
const NgramContext ngramContext = JniDataUtils::constructNgramContext(env,
|
||||||
jint unigramProbability = env->GetIntField(languageModelParam, unigramProbabilityFieldId);
|
prevWordArray, isPrevWordBeginningOfSentenceArray, prevWordCount);
|
||||||
jint timestamp = env->GetIntField(languageModelParam, timestampFieldId);
|
|
||||||
jboolean isNotAWord = env->GetBooleanField(languageModelParam, isNotAWordFieldId);
|
|
||||||
jboolean isPossiblyOffensive = env->GetBooleanField(languageModelParam,
|
|
||||||
isPossiblyOffensiveFieldId);
|
|
||||||
jintArray shortcutTarget = static_cast<jintArray>(
|
|
||||||
env->GetObjectField(languageModelParam, shortcutTargetFieldId));
|
|
||||||
std::vector<UnigramProperty::ShortcutProperty> shortcuts;
|
|
||||||
{
|
|
||||||
std::vector<int> shortcutTargetCodePoints;
|
|
||||||
JniDataUtils::jintarrayToVector(env, shortcutTarget, &shortcutTargetCodePoints);
|
|
||||||
if (!shortcutTargetCodePoints.empty()) {
|
|
||||||
jint shortcutProbability =
|
|
||||||
env->GetIntField(languageModelParam, shortcutProbabilityFieldId);
|
|
||||||
shortcuts.emplace_back(std::move(shortcutTargetCodePoints), shortcutProbability);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Use 1 for count to indicate the word has inputted.
|
// Use 1 for count to indicate the word has inputted.
|
||||||
const UnigramProperty unigramProperty(false /* isBeginningOfSentence */, isNotAWord,
|
dictionary->updateEntriesForWordWithNgramContext(&ngramContext,
|
||||||
isPossiblyOffensive, unigramProbability,
|
CodePointArrayView(wordCodePoints, wordLength), isValid,
|
||||||
HistoricalInfo(timestamp, 0 /* level */, 1 /* count */), std::move(shortcuts));
|
HistoricalInfo(timestamp, 0 /* level */, 1 /* count */));
|
||||||
dictionary->addUnigramEntry(CodePointArrayView(word1CodePoints, word1Length),
|
|
||||||
&unigramProperty);
|
|
||||||
if (word0) {
|
|
||||||
jint bigramProbability = env->GetIntField(languageModelParam, bigramProbabilityFieldId);
|
|
||||||
// Use 1 for count to indicate the bigram has inputted.
|
|
||||||
const NgramContext ngramContext(word0CodePoints, word0Length,
|
|
||||||
false /* isBeginningOfSentence */);
|
|
||||||
const NgramProperty ngramProperty(ngramContext,
|
|
||||||
CodePointArrayView(word1CodePoints, word1Length).toVector(),
|
|
||||||
bigramProbability, HistoricalInfo(timestamp, 0 /* level */, 1 /* count */));
|
|
||||||
dictionary->addNgramEntry(&ngramProperty);
|
|
||||||
}
|
|
||||||
if (dictionary->needsToRunGC(true /* mindsBlockByGC */)) {
|
if (dictionary->needsToRunGC(true /* mindsBlockByGC */)) {
|
||||||
return i + 1;
|
return i + 1;
|
||||||
}
|
}
|
||||||
env->DeleteLocalRef(word0);
|
env->DeleteLocalRef(prevWordArray);
|
||||||
env->DeleteLocalRef(word1);
|
env->DeleteLocalRef(isPrevWordBeginningOfSentenceArray);
|
||||||
env->DeleteLocalRef(shortcutTarget);
|
env->DeleteLocalRef(inputEvent);
|
||||||
env->DeleteLocalRef(languageModelParam);
|
|
||||||
}
|
}
|
||||||
return languageModelParamCount;
|
return inputEventCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
static jstring latinime_BinaryDictionary_getProperty(JNIEnv *env, jclass clazz, jlong dict,
|
static jstring latinime_BinaryDictionary_getProperty(JNIEnv *env, jclass clazz, jlong dict,
|
||||||
|
@ -754,10 +716,10 @@ static const JNINativeMethod sMethods[] = {
|
||||||
reinterpret_cast<void *>(latinime_BinaryDictionary_updateEntriesForWordWithNgramContext)
|
reinterpret_cast<void *>(latinime_BinaryDictionary_updateEntriesForWordWithNgramContext)
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
const_cast<char *>("addMultipleDictionaryEntriesNative"),
|
const_cast<char *>("updateEntriesForInputEventsNative"),
|
||||||
const_cast<char *>(
|
const_cast<char *>(
|
||||||
"(J[Lcom/android/inputmethod/latin/utils/LanguageModelParam;I)I"),
|
"(J[Lcom/android/inputmethod/latin/utils/WordInputEventForPersonalization;I)I"),
|
||||||
reinterpret_cast<void *>(latinime_BinaryDictionary_addMultipleDictionaryEntries)
|
reinterpret_cast<void *>(latinime_BinaryDictionary_updateEntriesForInputEvents)
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
const_cast<char *>("getPropertyNative"),
|
const_cast<char *>("getPropertyNative"),
|
||||||
|
|
|
@ -50,6 +50,7 @@ class JniDataUtils {
|
||||||
const jsize keyUtf8Length = env->GetStringUTFLength(keyString);
|
const jsize keyUtf8Length = env->GetStringUTFLength(keyString);
|
||||||
char keyChars[keyUtf8Length + 1];
|
char keyChars[keyUtf8Length + 1];
|
||||||
env->GetStringUTFRegion(keyString, 0, env->GetStringLength(keyString), keyChars);
|
env->GetStringUTFRegion(keyString, 0, env->GetStringLength(keyString), keyChars);
|
||||||
|
env->DeleteLocalRef(keyString);
|
||||||
keyChars[keyUtf8Length] = '\0';
|
keyChars[keyUtf8Length] = '\0';
|
||||||
DictionaryHeaderStructurePolicy::AttributeMap::key_type key;
|
DictionaryHeaderStructurePolicy::AttributeMap::key_type key;
|
||||||
HeaderReadWriteUtils::insertCharactersIntoVector(keyChars, &key);
|
HeaderReadWriteUtils::insertCharactersIntoVector(keyChars, &key);
|
||||||
|
@ -59,6 +60,7 @@ class JniDataUtils {
|
||||||
const jsize valueUtf8Length = env->GetStringUTFLength(valueString);
|
const jsize valueUtf8Length = env->GetStringUTFLength(valueString);
|
||||||
char valueChars[valueUtf8Length + 1];
|
char valueChars[valueUtf8Length + 1];
|
||||||
env->GetStringUTFRegion(valueString, 0, env->GetStringLength(valueString), valueChars);
|
env->GetStringUTFRegion(valueString, 0, env->GetStringLength(valueString), valueChars);
|
||||||
|
env->DeleteLocalRef(valueString);
|
||||||
valueChars[valueUtf8Length] = '\0';
|
valueChars[valueUtf8Length] = '\0';
|
||||||
DictionaryHeaderStructurePolicy::AttributeMap::mapped_type value;
|
DictionaryHeaderStructurePolicy::AttributeMap::mapped_type value;
|
||||||
HeaderReadWriteUtils::insertCharactersIntoVector(valueChars, &value);
|
HeaderReadWriteUtils::insertCharactersIntoVector(valueChars, &value);
|
||||||
|
@ -113,6 +115,7 @@ class JniDataUtils {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
env->GetIntArrayRegion(prevWord, 0, prevWordLength, prevWordCodePoints[i]);
|
env->GetIntArrayRegion(prevWord, 0, prevWordLength, prevWordCodePoints[i]);
|
||||||
|
env->DeleteLocalRef(prevWord);
|
||||||
prevWordCodePointCount[i] = prevWordLength;
|
prevWordCodePointCount[i] = prevWordLength;
|
||||||
jboolean isBeginningOfSentenceBoolean = JNI_FALSE;
|
jboolean isBeginningOfSentenceBoolean = JNI_FALSE;
|
||||||
env->GetBooleanArrayRegion(isBeginningOfSentenceArray, i, 1 /* len */,
|
env->GetBooleanArrayRegion(isBeginningOfSentenceArray, i, 1 /* len */,
|
||||||
|
|
|
@ -32,6 +32,7 @@ import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
|
||||||
import com.android.inputmethod.latin.utils.BinaryDictionaryUtils;
|
import com.android.inputmethod.latin.utils.BinaryDictionaryUtils;
|
||||||
import com.android.inputmethod.latin.utils.FileUtils;
|
import com.android.inputmethod.latin.utils.FileUtils;
|
||||||
import com.android.inputmethod.latin.utils.LocaleUtils;
|
import com.android.inputmethod.latin.utils.LocaleUtils;
|
||||||
|
import com.android.inputmethod.latin.utils.WordInputEventForPersonalization;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
@ -39,6 +40,7 @@ import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
@ -748,4 +750,66 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
|
||||||
|
|
||||||
binaryDictionary.close();
|
binaryDictionary.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testUpdateEntriesForInputEvents() {
|
||||||
|
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
|
||||||
|
testUpdateEntriesForInputEvents(formatVersion);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void testUpdateEntriesForInputEvents(final int formatVersion) {
|
||||||
|
setCurrentTimeForTestMode(mCurrentTime);
|
||||||
|
final int codePointSetSize = 20;
|
||||||
|
final int EVENT_COUNT = 1000;
|
||||||
|
final double CONTINUE_RATE = 0.9;
|
||||||
|
final long seed = System.currentTimeMillis();
|
||||||
|
final Random random = new Random(seed);
|
||||||
|
final File dictFile = createEmptyDictionaryAndGetFile(formatVersion);
|
||||||
|
|
||||||
|
final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
|
||||||
|
final ArrayList<String> unigrams = new ArrayList<>();
|
||||||
|
final ArrayList<Pair<String, String>> bigrams = new ArrayList<>();
|
||||||
|
final ArrayList<Pair<Pair<String, String>, String>> trigrams = new ArrayList<>();
|
||||||
|
|
||||||
|
final WordInputEventForPersonalization[] inputEvents =
|
||||||
|
new WordInputEventForPersonalization[EVENT_COUNT];
|
||||||
|
NgramContext ngramContext = NgramContext.EMPTY_PREV_WORDS_INFO;
|
||||||
|
int prevWordCount = 0;
|
||||||
|
for (int i = 0; i < inputEvents.length; i++) {
|
||||||
|
final String word = CodePointUtils.generateWord(random, codePointSet);
|
||||||
|
inputEvents[i] = new WordInputEventForPersonalization(word, ngramContext,
|
||||||
|
true /* isValid */, mCurrentTime);
|
||||||
|
unigrams.add(word);
|
||||||
|
if (prevWordCount >= 2) {
|
||||||
|
final Pair<String, String> prevWordsPair = bigrams.get(bigrams.size() - 1);
|
||||||
|
trigrams.add(new Pair<>(prevWordsPair, word));
|
||||||
|
}
|
||||||
|
if (prevWordCount >= 1) {
|
||||||
|
bigrams.add(new Pair<>(ngramContext.getNthPrevWord(1 /* n */).toString(), word));
|
||||||
|
}
|
||||||
|
if (random.nextDouble() > CONTINUE_RATE) {
|
||||||
|
ngramContext = NgramContext.EMPTY_PREV_WORDS_INFO;
|
||||||
|
prevWordCount = 0;
|
||||||
|
} else {
|
||||||
|
ngramContext = ngramContext.getNextNgramContext(new WordInfo(word));
|
||||||
|
prevWordCount++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
final BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
|
||||||
|
binaryDictionary.updateEntriesForInputEvents(inputEvents);
|
||||||
|
|
||||||
|
for (final String word : unigrams) {
|
||||||
|
assertTrue(binaryDictionary.isInDictionary(word));
|
||||||
|
}
|
||||||
|
for (final Pair<String, String> bigram : bigrams) {
|
||||||
|
assertTrue(isValidBigram(binaryDictionary, bigram.first, bigram.second));
|
||||||
|
}
|
||||||
|
if (!supportsNgram(formatVersion)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
for (final Pair<Pair<String, String>, String> trigram : trigrams) {
|
||||||
|
assertTrue(isValidTrigram(binaryDictionary, trigram.first.first, trigram.first.second,
|
||||||
|
trigram.second));
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,7 +29,6 @@ import com.android.inputmethod.latin.makedict.WeightedString;
|
||||||
import com.android.inputmethod.latin.makedict.WordProperty;
|
import com.android.inputmethod.latin.makedict.WordProperty;
|
||||||
import com.android.inputmethod.latin.utils.BinaryDictionaryUtils;
|
import com.android.inputmethod.latin.utils.BinaryDictionaryUtils;
|
||||||
import com.android.inputmethod.latin.utils.FileUtils;
|
import com.android.inputmethod.latin.utils.FileUtils;
|
||||||
import com.android.inputmethod.latin.utils.LanguageModelParam;
|
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
@ -884,63 +883,6 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testAddMultipleDictionaryEntries() {
|
|
||||||
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
|
|
||||||
testAddMultipleDictionaryEntries(formatVersion);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void testAddMultipleDictionaryEntries(final int formatVersion) {
|
|
||||||
final int codePointSetSize = 20;
|
|
||||||
final int lmParamCount = 1000;
|
|
||||||
final double bigramContinueRate = 0.9;
|
|
||||||
final long seed = System.currentTimeMillis();
|
|
||||||
final Random random = new Random(seed);
|
|
||||||
final File dictFile = createEmptyDictionaryAndGetFile(formatVersion);
|
|
||||||
|
|
||||||
final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
|
|
||||||
final HashMap<String, Integer> unigramProbabilities = new HashMap<>();
|
|
||||||
final HashMap<Pair<String, String>, Integer> bigramProbabilities = new HashMap<>();
|
|
||||||
|
|
||||||
final LanguageModelParam[] languageModelParams = new LanguageModelParam[lmParamCount];
|
|
||||||
String prevWord = null;
|
|
||||||
for (int i = 0; i < languageModelParams.length; i++) {
|
|
||||||
final String word = CodePointUtils.generateWord(random, codePointSet);
|
|
||||||
final int probability = random.nextInt(0xFF);
|
|
||||||
final int bigramProbability = probability + random.nextInt(0xFF - probability);
|
|
||||||
unigramProbabilities.put(word, probability);
|
|
||||||
if (prevWord == null) {
|
|
||||||
languageModelParams[i] = new LanguageModelParam(word, probability,
|
|
||||||
BinaryDictionary.NOT_A_VALID_TIMESTAMP);
|
|
||||||
} else {
|
|
||||||
languageModelParams[i] = new LanguageModelParam(prevWord, word, probability,
|
|
||||||
bigramProbability, BinaryDictionary.NOT_A_VALID_TIMESTAMP);
|
|
||||||
bigramProbabilities.put(new Pair<>(prevWord, word),
|
|
||||||
bigramProbability);
|
|
||||||
}
|
|
||||||
prevWord = (random.nextDouble() < bigramContinueRate) ? word : null;
|
|
||||||
}
|
|
||||||
|
|
||||||
final BinaryDictionary binaryDictionary = getBinaryDictionary(dictFile);
|
|
||||||
binaryDictionary.addMultipleDictionaryEntries(languageModelParams);
|
|
||||||
|
|
||||||
for (Map.Entry<String, Integer> entry : unigramProbabilities.entrySet()) {
|
|
||||||
assertEquals((int)entry.getValue(), binaryDictionary.getFrequency(entry.getKey()));
|
|
||||||
}
|
|
||||||
|
|
||||||
for (Map.Entry<Pair<String, String>, Integer> entry : bigramProbabilities.entrySet()) {
|
|
||||||
final String word0 = entry.getKey().first;
|
|
||||||
final String word1 = entry.getKey().second;
|
|
||||||
final int bigramProbability = entry.getValue();
|
|
||||||
assertEquals(bigramProbability != Dictionary.NOT_A_PROBABILITY,
|
|
||||||
isValidBigram(binaryDictionary, word0, word1));
|
|
||||||
if (canCheckBigramProbability(formatVersion)) {
|
|
||||||
assertEquals(bigramProbability,
|
|
||||||
getBigramProbability(binaryDictionary, word0, word1));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testGetWordProperties() {
|
public void testGetWordProperties() {
|
||||||
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
|
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
|
||||||
testGetWordProperties(formatVersion);
|
testGetWordProperties(formatVersion);
|
||||||
|
|
|
@ -30,7 +30,7 @@ import com.android.inputmethod.latin.Dictionary;
|
||||||
import com.android.inputmethod.latin.DictionaryFacilitator;
|
import com.android.inputmethod.latin.DictionaryFacilitator;
|
||||||
import com.android.inputmethod.latin.ExpandableBinaryDictionary;
|
import com.android.inputmethod.latin.ExpandableBinaryDictionary;
|
||||||
import com.android.inputmethod.latin.RichInputMethodManager;
|
import com.android.inputmethod.latin.RichInputMethodManager;
|
||||||
import com.android.inputmethod.latin.ExpandableBinaryDictionary.AddMultipleDictionaryEntriesCallback;
|
import com.android.inputmethod.latin.ExpandableBinaryDictionary.UpdateEntriesForInputEventsCallback;
|
||||||
import com.android.inputmethod.latin.common.CodePointUtils;
|
import com.android.inputmethod.latin.common.CodePointUtils;
|
||||||
import com.android.inputmethod.latin.settings.SpacingAndPunctuations;
|
import com.android.inputmethod.latin.settings.SpacingAndPunctuations;
|
||||||
|
|
||||||
|
@ -96,8 +96,8 @@ public class PersonalizationDictionaryTests extends AndroidTestCase {
|
||||||
true /* inputByUser */, tokens, timeStampInSeconds, DUMMY_PACKAGE_NAME,
|
true /* inputByUser */, tokens, timeStampInSeconds, DUMMY_PACKAGE_NAME,
|
||||||
LOCALE_EN_US.getLanguage());
|
LOCALE_EN_US.getLanguage());
|
||||||
final CountDownLatch countDownLatch = new CountDownLatch(1);
|
final CountDownLatch countDownLatch = new CountDownLatch(1);
|
||||||
final AddMultipleDictionaryEntriesCallback callback =
|
final UpdateEntriesForInputEventsCallback callback =
|
||||||
new AddMultipleDictionaryEntriesCallback() {
|
new UpdateEntriesForInputEventsCallback() {
|
||||||
@Override
|
@Override
|
||||||
public void onFinished() {
|
public void onFinished() {
|
||||||
countDownLatch.countDown();
|
countDownLatch.countDown();
|
||||||
|
|
Loading…
Reference in a new issue