Add dictionary entries in bulk through a single JNI call.

What this patch does:
  * BinaryDictionary gains a nested LanguageModelParam (code point arrays plus
    unigram/bigram probabilities) and addMultipleDictionaryEntries(), which
    loops over addMultipleDictionaryEntriesNative() and calls flushWithGC()
    whenever needsToRunGC() reports true between native batches.
  * The native side walks the array from startIndex, adds the unigram for
    mWord1 and, when mWord0 is non-null, the bigram; it returns the number of
    processed params, stopping early once the dictionary needs GC.
  * ExpandableBinaryDictionary drops its own String-based LanguageModelParam
    and its per-entry Java loop in favour of the new bulk call;
    addBigramDynamically() loses its isValid parameter.
  * PersonalizationDictionaryUpdateSession drops
    PersonalizationLanguageModelParam and addBigramsToPersonalizationDictionary().
  * BinaryDictionaryTests gains testAddMultipleDictionaryEntries().

Review notes:
  * Type arguments inside <...> were reinstated from the surrounding code
    (casts, map/pair element types, list element types). The one on the
    removed ArrayList parameter line is inferred from the class this patch
    deletes - confirm against the base revision.
  * The two WeakReference context fields in
    PersonalizationDictionaryUpdateSession are shown without type arguments
    because they cannot be determined from this patch alone - confirm against
    the base revision before applying.

diff --git a/java/src/com/android/inputmethod/latin/BinaryDictionary.java b/java/src/com/android/inputmethod/latin/BinaryDictionary.java
index fd296988e..1abbf3084 100644
--- a/java/src/com/android/inputmethod/latin/BinaryDictionary.java
+++ b/java/src/com/android/inputmethod/latin/BinaryDictionary.java
@@ -137,6 +137,8 @@ public final class BinaryDictionary extends Dictionary {
     private static native void addBigramWordsNative(long dict, int[] word0, int[] word1,
             int probability);
     private static native void removeBigramWordsNative(long dict, int[] word0, int[] word1);
+    private static native int addMultipleDictionaryEntriesNative(long dict,
+            LanguageModelParam[] languageModelParams, int startIndex);
     private static native int calculateProbabilityNative(long dict, int unigramProbability,
             int bigramProbability);
     private static native String getPropertyNative(long dict, String query);
@@ -303,6 +305,46 @@ public final class BinaryDictionary extends Dictionary {
         removeBigramWordsNative(mNativeDict, codePoints0, codePoints1);
     }
 
+    public static class LanguageModelParam {
+        public final int[] mWord0;
+        public final int[] mWord1;
+        public final int mUnigramProbability;
+        public final int mBigramProbability;
+
+        // Constructor for unigram.
+        public LanguageModelParam(final String word, final int unigramProbability) {
+            mWord0 = null;
+            mWord1 = StringUtils.toCodePointArray(word);
+            mUnigramProbability = unigramProbability;
+            mBigramProbability = NOT_A_PROBABILITY;
+        }
+
+        // Constructor for unigram and bigram.
+        public LanguageModelParam(final String word0, final String word1,
+                final int unigramProbability, final int bigramProbability) {
+            mWord0 = StringUtils.toCodePointArray(word0);
+            mWord1 = StringUtils.toCodePointArray(word1);
+            mUnigramProbability = unigramProbability;
+            mBigramProbability = bigramProbability;
+        }
+    }
+
+    public void addMultipleDictionaryEntries(final LanguageModelParam[] languageModelParams) {
+        if (!isValidDictionary()) return;
+        int processedParamCount = 0;
+        while (processedParamCount < languageModelParams.length) {
+            if (needsToRunGC(true /* mindsBlockByGC */)) {
+                flushWithGC();
+            }
+            processedParamCount = addMultipleDictionaryEntriesNative(mNativeDict,
+                    languageModelParams, processedParamCount);
+            if (processedParamCount <= 0) {
+                return;
+            }
+        }
+
+    }
+
     private void reopen() {
         close();
         final File dictFile = new File(mDictFilePath);
diff --git a/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java b/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java
index a0e4e5472..25aa5e5d6 100644
--- a/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java
+++ b/java/src/com/android/inputmethod/latin/ExpandableBinaryDictionary.java
@@ -22,6 +22,7 @@ import android.util.Log;
 
 import com.android.inputmethod.annotations.UsedForTesting;
 import com.android.inputmethod.keyboard.ProximityInfo;
+import com.android.inputmethod.latin.BinaryDictionary.LanguageModelParam;
 import com.android.inputmethod.latin.makedict.FormatSpec;
 import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo;
 import com.android.inputmethod.latin.utils.AsyncResultHolder;
@@ -326,7 +327,7 @@ abstract public class ExpandableBinaryDictionary extends Dictionary {
      * Dynamically adds a word bigram in the dictionary. May overwrite an existing entry.
      */
     protected void addBigramDynamically(final String word0, final String word1,
-            final int frequency, final boolean isValid) {
+            final int frequency) {
         if (!mIsUpdatable) {
             Log.w(TAG, "addBigramDynamically is called for non-updatable dictionary: "
                     + mFilename);
@@ -363,22 +364,6 @@ abstract public class ExpandableBinaryDictionary extends Dictionary {
         public void onFinished();
     }
 
-    public static class LanguageModelParam {
-        public final String mWord0;
-        public final String mWord1;
-        public final boolean mIsValid;
-        public final int mFrequency;
-        public final int mBigramFrequency;
-        public LanguageModelParam(final String word0, final String word1, final boolean isValid,
-                final int frequency, final int bigramFrequency) {
-            mWord0 = word0;
-            mWord1 = word1;
-            mIsValid = isValid;
-            mFrequency = frequency;
-            mBigramFrequency = bigramFrequency;
-        }
-    }
-
     /**
      * Dynamically add multiple entries to the dictionary.
      */
@@ -395,21 +380,9 @@ abstract public class ExpandableBinaryDictionary extends Dictionary {
             public void run() {
                 final boolean locked = setProcessingLargeTaskIfNot();
                 try {
-                    for (final LanguageModelParam languageModelParam : languageModelParams) {
-                        if (languageModelParam.mWord1 == null) {
-                            continue;
-                        }
-                        if (mBinaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
-                            mBinaryDictionary.flushWithGC();
-                        }
-                        mBinaryDictionary.addUnigramWord(languageModelParam.mWord1,
-                                languageModelParam.mFrequency);
-                        if (languageModelParam.mWord0 != null
-                                && !languageModelParam.mWord0.equals(languageModelParam.mWord1)) {
-                            mBinaryDictionary.addBigramWords(languageModelParam.mWord0,
-                                    languageModelParam.mWord1, languageModelParam.mBigramFrequency);
-                        }
-                    }
+                    mBinaryDictionary.addMultipleDictionaryEntries(
+                            languageModelParams.toArray(
+                                    new LanguageModelParam[languageModelParams.size()]));
                 } finally {
                     if (callback != null) {
                         callback.onFinished();
diff --git a/java/src/com/android/inputmethod/latin/personalization/DecayingExpandableBinaryDictionaryBase.java b/java/src/com/android/inputmethod/latin/personalization/DecayingExpandableBinaryDictionaryBase.java
index 42bd7600e..bc1160160 100644
--- a/java/src/com/android/inputmethod/latin/personalization/DecayingExpandableBinaryDictionaryBase.java
+++ b/java/src/com/android/inputmethod/latin/personalization/DecayingExpandableBinaryDictionaryBase.java
@@ -21,6 +21,7 @@ import android.content.SharedPreferences;
 import android.util.Log;
 
 import com.android.inputmethod.annotations.UsedForTesting;
+import com.android.inputmethod.latin.BinaryDictionary.LanguageModelParam;
 import com.android.inputmethod.latin.Constants;
 import com.android.inputmethod.latin.Dictionary;
 import com.android.inputmethod.latin.ExpandableBinaryDictionary;
@@ -147,7 +148,7 @@ public abstract class DecayingExpandableBinaryDictionaryBase extends ExpandableB
             return;
         }
         if (null != word0) {
-            addBigramDynamically(word0, word1, frequency, isValid);
+            addBigramDynamically(word0, word1, frequency);
         }
     }
 
diff --git a/java/src/com/android/inputmethod/latin/personalization/PersonalizationDictionaryUpdateSession.java b/java/src/com/android/inputmethod/latin/personalization/PersonalizationDictionaryUpdateSession.java
index 1f46f5b1c..2ab366b8a 100644
--- a/java/src/com/android/inputmethod/latin/personalization/PersonalizationDictionaryUpdateSession.java
+++ b/java/src/com/android/inputmethod/latin/personalization/PersonalizationDictionaryUpdateSession.java
@@ -18,6 +18,7 @@ package com.android.inputmethod.latin.personalization;
 
 import android.content.Context;
 
+import com.android.inputmethod.latin.BinaryDictionary.LanguageModelParam;
 import com.android.inputmethod.latin.ExpandableBinaryDictionary;
 
 import java.lang.ref.WeakReference;
@@ -28,24 +29,6 @@ import java.util.ArrayList;
  * dictionary.
  */
 public abstract class PersonalizationDictionaryUpdateSession {
-    /**
-     * This class is a parameter for a new unigram or bigram word which will be added
-     * to the personalization dictionary.
-     */
-    public static class PersonalizationLanguageModelParam {
-        public final String mWord0;
-        public final String mWord1;
-        public final boolean mIsValid;
-        public final int mFrequency;
-        public PersonalizationLanguageModelParam(String word0, String word1, boolean isValid,
-                int frequency) {
-            mWord0 = word0;
-            mWord1 = word1;
-            mIsValid = isValid;
-            mFrequency = frequency;
-        }
-    }
-
     // TODO: Use a dynamic binary dictionary instead
     public WeakReference mDictionary;
     public WeakReference mPredictionDictionary;
@@ -117,7 +100,7 @@ public abstract class PersonalizationDictionaryUpdateSession {
 
     // TODO: Support multi locale.
     public void addMultipleDictionaryEntriesToPersonalizationDictionary(
-            final ArrayList<ExpandableBinaryDictionary.LanguageModelParam> languageModelParams,
+            final ArrayList<LanguageModelParam> languageModelParams,
             final ExpandableBinaryDictionary.AddMultipleDictionaryEntriesCallback callback) {
         final DecayingExpandableBinaryDictionaryBase dictionary = getPredictionDictionary();
         if (dictionary == null) {
@@ -128,17 +111,4 @@ public abstract class PersonalizationDictionaryUpdateSession {
         }
         dictionary.addMultipleDictionaryEntriesToDictionary(languageModelParams, callback);
     }
-
-    // Bulk import
-    // TODO: Support multi locale to add bigram
-    public void addBigramsToPersonalizationDictionary(
-            final ArrayList<PersonalizationLanguageModelParam> lmParams) {
-        final DecayingExpandableBinaryDictionaryBase dictionary = getPredictionDictionary();
-        if (dictionary == null) {
-            return;
-        }
-        for (final PersonalizationLanguageModelParam lmParam : lmParams) {
-            dictionary.addToDictionary(lmParam.mWord0, lmParam.mWord1, lmParam.mIsValid);
-        }
-    }
 }
diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp
index db0c18982..b1aa034ea 100644
--- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp
+++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp
@@ -320,6 +320,60 @@ static void latinime_BinaryDictionary_removeBigramWords(JNIEnv *env, jclass claz
             word1Length);
 }
 
+
+// Returns how many language model params are processed.
+static int latinime_BinaryDictionary_addMultipleDictionaryEntries(JNIEnv *env, jclass clazz,
+        jlong dict, jobjectArray languageModelParams, jint startIndex) {
+    Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
+    if (!dictionary) {
+        return 0;
+    }
+    jsize languageModelParamCount = env->GetArrayLength(languageModelParams);
+    if (languageModelParamCount == 0 || startIndex >= languageModelParamCount) {
+        return 0;
+    }
+    jobject languageModelParam = env->GetObjectArrayElement(languageModelParams, 0);
+    jclass languageModelParamClass = env->GetObjectClass(languageModelParam);
+    env->DeleteLocalRef(languageModelParam);
+    jfieldID word0FieldId = env->GetFieldID(languageModelParamClass, "mWord0", "[I");
+    jfieldID word1FieldId = env->GetFieldID(languageModelParamClass, "mWord1", "[I");
+    jfieldID unigramProbabilityFieldId =
+            env->GetFieldID(languageModelParamClass, "mUnigramProbability", "I");
+    jfieldID bigramProbabilityFieldId =
+            env->GetFieldID(languageModelParamClass, "mBigramProbability", "I");
+    env->DeleteLocalRef(languageModelParamClass);
+
+    for (int i = startIndex; i < languageModelParamCount; ++i) {
+        jobject languageModelParam = env->GetObjectArrayElement(languageModelParams, i);
+        jintArray word0 = static_cast<jintArray>(
+                env->GetObjectField(languageModelParam, word0FieldId));
+        jsize word0Length = word0 ? env->GetArrayLength(word0) : 0;
+        int word0CodePoints[word0Length];
+        if (word0) {
+            env->GetIntArrayRegion(word0, 0, word0Length, word0CodePoints);
+        }
+        jintArray word1 = static_cast<jintArray>(
+                env->GetObjectField(languageModelParam, word1FieldId));
+        jsize word1Length = env->GetArrayLength(word1);
+        int word1CodePoints[word1Length];
+        env->GetIntArrayRegion(word1, 0, word1Length, word1CodePoints);
+        jint unigramProbability = env->GetIntField(languageModelParam, unigramProbabilityFieldId);
+        dictionary->addUnigramWord(word1CodePoints, word1Length, unigramProbability);
+        if (word0) {
+            jint bigramProbability = env->GetIntField(languageModelParam, bigramProbabilityFieldId);
+            dictionary->addBigramWords(word0CodePoints, word0Length, word1CodePoints, word1Length,
+                    bigramProbability);
+        }
+        if (dictionary->needsToRunGC(true /* mindsBlockByGC */)) {
+            return i + 1;
+        }
+        env->DeleteLocalRef(word0);
+        env->DeleteLocalRef(word1);
+        env->DeleteLocalRef(languageModelParam);
+    }
+    return languageModelParamCount;
+}
+
 static int latinime_BinaryDictionary_calculateProbabilityNative(JNIEnv *env, jclass clazz,
         jlong dict, jint unigramProbability, jint bigramProbability) {
     Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
@@ -418,6 +472,12 @@ static const JNINativeMethod sMethods[] = {
         const_cast<char *>("(J[I[I)V"),
         reinterpret_cast<void *>(latinime_BinaryDictionary_removeBigramWords)
     },
+    {
+        const_cast<char *>("addMultipleDictionaryEntriesNative"),
+        const_cast<char *>(
+                "(J[Lcom/android/inputmethod/latin/BinaryDictionary$LanguageModelParam;I)I"),
+        reinterpret_cast<void *>(latinime_BinaryDictionary_addMultipleDictionaryEntries)
+    },
     {
         const_cast<char *>("calculateProbabilityNative"),
         const_cast<char *>("(JII)I"),
diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java
index 747bb299c..6affe233d 100644
--- a/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java
+++ b/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java
@@ -21,6 +21,7 @@ import android.test.suitebuilder.annotation.LargeTest;
 import android.text.TextUtils;
 import android.util.Pair;
 
+import com.android.inputmethod.latin.BinaryDictionary.LanguageModelParam;
 import com.android.inputmethod.latin.makedict.CodePointUtils;
 import com.android.inputmethod.latin.makedict.FormatSpec;
 
@@ -33,6 +34,7 @@ import java.util.Locale;
 import java.util.Map;
 import java.util.Random;
 
+// TODO Use the seed passed as an argument for makedict test.
 @LargeTest
 public class BinaryDictionaryTests extends AndroidTestCase {
     private static final String TEST_DICT_FILE_EXTENSION = ".testDict";
@@ -776,4 +778,66 @@ public class BinaryDictionaryTests extends AndroidTestCase {
 
         dictFile.delete();
     }
+
+    public void testAddMultipleDictionaryEntries() {
+        testAddMultipleDictionaryEntries(3 /* formatVersion */);
+        testAddMultipleDictionaryEntries(4 /* formatVersion */);
+    }
+
+    private void testAddMultipleDictionaryEntries(final int formatVersion) {
+        final int codePointSetSize = 20;
+        final int lmParamCount = 1000;
+        final double bigramContinueRate = 0.9;
+        final long seed = System.currentTimeMillis();
+        final Random random = new Random(seed);
+
+        File dictFile = null;
+        try {
+            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
+        } catch (IOException e) {
+            fail("IOException while writing an initial dictionary : " + e);
+        }
+
+        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
+        final HashMap<String, Integer> unigramProbabilities = new HashMap<String, Integer>();
+        final HashMap<Pair<String, String>, Integer> bigramProbabilities =
+                new HashMap<Pair<String, String>, Integer>();
+
+        final LanguageModelParam[] languageModelParams = new LanguageModelParam[lmParamCount];
+        String prevWord = null;
+        for (int i = 0; i < languageModelParams.length; i++) {
+            final String word = CodePointUtils.generateWord(random, codePointSet);
+            final int probability = random.nextInt(0xFF);
+            final int bigramProbability = random.nextInt(0xF);
+            unigramProbabilities.put(word, probability);
+            if (prevWord == null) {
+                languageModelParams[i] = new LanguageModelParam(word, probability);
+            } else {
+                languageModelParams[i] = new LanguageModelParam(prevWord, word, probability,
+                        bigramProbability);
+                bigramProbabilities.put(new Pair<String, String>(prevWord, word),
+                        bigramProbability);
+            }
+            prevWord = (random.nextDouble() < bigramContinueRate) ? word : null;
+        }
+
+        final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
+                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
+                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
+        binaryDictionary.addMultipleDictionaryEntries(languageModelParams);
+
+        for (Map.Entry<String, Integer> entry : unigramProbabilities.entrySet()) {
+            assertEquals((int)entry.getValue(), binaryDictionary.getFrequency(entry.getKey()));
+        }
+
+        for (Map.Entry<Pair<String, String>, Integer> entry : bigramProbabilities.entrySet()) {
+            final String word0 = entry.getKey().first;
+            final String word1 = entry.getKey().second;
+            final int unigramProbability = unigramProbabilities.get(word1);
+            final int bigramProbability = entry.getValue();
+            final int probability = binaryDictionary.calculateProbability(
+                    unigramProbability, bigramProbability);
+            assertEquals(probability, binaryDictionary.getBigramProbability(word0, word1));
+        }
+    }
 }