From 94d9a2309fbca6b1e42b6c57b9c9509182fe8a0b Mon Sep 17 00:00:00 2001 From: Keisuke Kuroyanagi Date: Fri, 31 Jan 2014 20:32:44 +0900 Subject: [PATCH] Extend jni interface to get bigrams in WordProperty. Bug: 12810574 Change-Id: Ia4b88d02ea8790a5c47d32376cc0b84c3e071ddd --- .../inputmethod/latin/BinaryDictionary.java | 37 ++++++----- .../inputmethod/latin/utils/WordProperty.java | 54 +++++++++++---- ...oid_inputmethod_latin_BinaryDictionary.cpp | 12 ++-- .../suggest/core/dictionary/word_property.cpp | 15 +++-- .../suggest/core/dictionary/word_property.h | 4 +- .../latin/BinaryDictionaryTests.java | 65 ++++++++++--------- 6 files changed, 110 insertions(+), 77 deletions(-) diff --git a/java/src/com/android/inputmethod/latin/BinaryDictionary.java b/java/src/com/android/inputmethod/latin/BinaryDictionary.java index 95823dac5..6e0cdf2b1 100644 --- a/java/src/com/android/inputmethod/latin/BinaryDictionary.java +++ b/java/src/com/android/inputmethod/latin/BinaryDictionary.java @@ -68,11 +68,12 @@ public final class BinaryDictionary extends Dictionary { private static final int FORMAT_WORD_PROPERTY_HAS_BIGRAMS_INDEX = 2; private static final int FORMAT_WORD_PROPERTY_HAS_SHORTCUTS_INDEX = 3; - // Format to get unigram historical info from native side via getWordPropertyNative(). - private static final int FORMAT_WORD_PROPERTY_OUTPUT_HISTORICAL_INFO_COUNT = 3; - private static final int FORMAT_WORD_PROPERTY_TIMESTAMP_INDEX = 0; - private static final int FORMAT_WORD_PROPERTY_LEVEL_INDEX = 1; - private static final int FORMAT_WORD_PROPERTY_COUNT_INDEX = 2; + // Format to get probability and historical info from native side via getWordPropertyNative(). 
+ public static final int FORMAT_WORD_PROPERTY_OUTPUT_PROBABILITY_INFO_COUNT = 4; + public static final int FORMAT_WORD_PROPERTY_PROBABILITY_INDEX = 0; + public static final int FORMAT_WORD_PROPERTY_TIMESTAMP_INDEX = 1; + public static final int FORMAT_WORD_PROPERTY_LEVEL_INDEX = 2; + public static final int FORMAT_WORD_PROPERTY_COUNT_INDEX = 3; private long mNativeDict; private final Locale mLocale; @@ -144,9 +145,9 @@ public final class BinaryDictionary extends Dictionary { private static native int getProbabilityNative(long dict, int[] word); private static native int getBigramProbabilityNative(long dict, int[] word0, int[] word1); private static native void getWordPropertyNative(long dict, int[] word, - int[] outCodePoints, boolean[] outFlags, int[] outProbability, - int[] outHistoricalInfo, ArrayList outShortcutTargets, - ArrayList outShortcutProbabilities); + int[] outCodePoints, boolean[] outFlags, int[] outProbabilityInfo, + ArrayList outBigramTargets, ArrayList outBigramProbabilityInfo, + ArrayList outShortcutTargets, ArrayList outShortcutProbabilities); private static native int getSuggestionsNative(long dict, long proximityInfo, long traverseSession, int[] xCoordinates, int[] yCoordinates, int[] times, int[] pointerIds, int[] inputCodePoints, int inputSize, int commitPoint, @@ -313,22 +314,22 @@ public final class BinaryDictionary extends Dictionary { final int[] codePoints = StringUtils.toCodePointArray(word); final int[] outCodePoints = new int[MAX_WORD_LENGTH]; final boolean[] outFlags = new boolean[FORMAT_WORD_PROPERTY_OUTPUT_FLAG_COUNT]; - final int[] outProbability = new int[1]; - final int[] outHistoricalInfo = - new int[FORMAT_WORD_PROPERTY_OUTPUT_HISTORICAL_INFO_COUNT]; + final int[] outProbabilityInfo = + new int[FORMAT_WORD_PROPERTY_OUTPUT_PROBABILITY_INFO_COUNT]; + final ArrayList outBigramTargets = CollectionUtils.newArrayList(); + final ArrayList outBigramProbabilityInfo = CollectionUtils.newArrayList(); final ArrayList outShortcutTargets = 
CollectionUtils.newArrayList(); final ArrayList outShortcutProbabilities = CollectionUtils.newArrayList(); - getWordPropertyNative(mNativeDict, codePoints, outCodePoints, outFlags, outProbability, - outHistoricalInfo, outShortcutTargets, outShortcutProbabilities); + getWordPropertyNative(mNativeDict, codePoints, outCodePoints, outFlags, outProbabilityInfo, + outBigramTargets, outBigramProbabilityInfo, outShortcutTargets, + outShortcutProbabilities); return new WordProperty(codePoints, outFlags[FORMAT_WORD_PROPERTY_IS_NOT_A_WORD_INDEX], outFlags[FORMAT_WORD_PROPERTY_IS_BLACKLISTED_INDEX], outFlags[FORMAT_WORD_PROPERTY_HAS_BIGRAMS_INDEX], - outFlags[FORMAT_WORD_PROPERTY_HAS_SHORTCUTS_INDEX], outProbability[0], - outHistoricalInfo[FORMAT_WORD_PROPERTY_TIMESTAMP_INDEX], - outHistoricalInfo[FORMAT_WORD_PROPERTY_LEVEL_INDEX], - outHistoricalInfo[FORMAT_WORD_PROPERTY_COUNT_INDEX], - outShortcutTargets, outShortcutProbabilities); + outFlags[FORMAT_WORD_PROPERTY_HAS_SHORTCUTS_INDEX], outProbabilityInfo, + outBigramTargets, outBigramProbabilityInfo, outShortcutTargets, + outShortcutProbabilities); } // Add a unigram entry to binary dictionary with unigram attributes in native code. diff --git a/java/src/com/android/inputmethod/latin/utils/WordProperty.java b/java/src/com/android/inputmethod/latin/utils/WordProperty.java index d6c0f900a..ba9b114b0 100644 --- a/java/src/com/android/inputmethod/latin/utils/WordProperty.java +++ b/java/src/com/android/inputmethod/latin/utils/WordProperty.java @@ -32,15 +32,30 @@ public class WordProperty { public final boolean mIsBlacklisted; public final boolean mHasBigrams; public final boolean mHasShortcuts; - public final int mProbability; - // mTimestamp, mLevel and mCount are historical info. These values are depend on the - // implementation in native code; thus, we must not use them and have any assumptions about - // them except for tests. 
- public final int mTimestamp; - public final int mLevel; - public final int mCount; + public final ProbabilityInfo mProbabilityInfo; + public final ArrayList mBigramTargets = CollectionUtils.newArrayList(); + public final ArrayList mBigramProbabilityInfo = CollectionUtils.newArrayList(); public final ArrayList mShortcutTargets = CollectionUtils.newArrayList(); + // TODO: Use this kind of Probability class for dictionary read/write code under the makedict + // package. + public static final class ProbabilityInfo { + public final int mProbability; + // mTimestamp, mLevel and mCount are historical info. These values depend on the + // implementation in native code; thus, we must not use them or make any assumptions about + // them except for tests. + public final int mTimestamp; + public final int mLevel; + public final int mCount; + + public ProbabilityInfo(final int[] probabilityInfo) { + mProbability = probabilityInfo[BinaryDictionary.FORMAT_WORD_PROPERTY_PROBABILITY_INDEX]; + mTimestamp = probabilityInfo[BinaryDictionary.FORMAT_WORD_PROPERTY_TIMESTAMP_INDEX]; + mLevel = probabilityInfo[BinaryDictionary.FORMAT_WORD_PROPERTY_LEVEL_INDEX]; + mCount = probabilityInfo[BinaryDictionary.FORMAT_WORD_PROPERTY_COUNT_INDEX]; + } + } + private static int getCodePointCount(final int[] codePoints) { for (int i = 0; i < codePoints.length; i++) { if (codePoints[i] == 0) { @@ -53,18 +68,29 @@ public class WordProperty { // This represents invalid word when the probability is BinaryDictionary.NOT_A_PROBABILITY.
public WordProperty(final int[] codePoints, final boolean isNotAWord, final boolean isBlacklisted, final boolean hasBigram, - final boolean hasShortcuts, final int probability, final int timestamp, - final int level, final int count, final ArrayList shortcutTargets, + final boolean hasShortcuts, final int[] probabilityInfo, + final ArrayList bigramTargets, final ArrayList bigramProbabilityInfo, + final ArrayList shortcutTargets, final ArrayList shortcutProbabilities) { mCodePoints = new String(codePoints, 0 /* offset */, getCodePointCount(codePoints)); mIsNotAWord = isNotAWord; mIsBlacklisted = isBlacklisted; mHasBigrams = hasBigram; mHasShortcuts = hasShortcuts; - mProbability = probability; - mTimestamp = timestamp; - mLevel = level; - mCount = count; + mProbabilityInfo = new ProbabilityInfo(probabilityInfo); + + final int bigramTargetCount = bigramTargets.size(); + for (int i = 0; i < bigramTargetCount; i++) { + final int[] bigramTargetCodePointArray = bigramTargets.get(i); + final String bigramTargetString = new String(bigramTargetCodePointArray, + 0 /* offset */, getCodePointCount(bigramTargetCodePointArray)); + final ProbabilityInfo bigramProbability = + new ProbabilityInfo(bigramProbabilityInfo.get(i)); + mBigramTargets.add( + new WeightedString(bigramTargetString, bigramProbability.mProbability)); + mBigramProbabilityInfo.add(bigramProbability); + } + final int shortcutTargetCount = shortcutTargets.size(); for (int i = 0; i < shortcutTargetCount; i++) { final int[] shortcutTargetCodePointArray = shortcutTargets.get(i); @@ -77,6 +103,6 @@ public class WordProperty { @UsedForTesting public boolean isValid() { - return mProbability != BinaryDictionary.NOT_A_PROBABILITY; + return mProbabilityInfo.mProbability != BinaryDictionary.NOT_A_PROBABILITY; } } \ No newline at end of file diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp index 16a3fe825..8f3f8e21e 100644 --- 
a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp +++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp @@ -262,16 +262,17 @@ static jint latinime_BinaryDictionary_getBigramProbability(JNIEnv *env, jclass c static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz, jlong dict, jintArray word, jintArray outCodePoints, jbooleanArray outFlags, - jintArray outProbability, jintArray outHistoricalInfo, jobject outShortcutTargets, - jobject outShortcutProbabilities) { + jintArray outProbabilityInfo, jobject outBigramTargets, jobject outBigramProbabilityInfo, + jobject outShortcutTargets, jobject outShortcutProbabilities) { Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict); if (!dictionary) return; const jsize wordLength = env->GetArrayLength(word); int wordCodePoints[wordLength]; env->GetIntArrayRegion(word, 0, wordLength, wordCodePoints); const WordProperty wordProperty = dictionary->getWordProperty(wordCodePoints, wordLength); - wordProperty.outputProperties(env, outCodePoints, outFlags, outProbability, - outHistoricalInfo, outShortcutTargets, outShortcutProbabilities); + wordProperty.outputProperties(env, outCodePoints, outFlags, outProbabilityInfo, + outBigramTargets, outBigramProbabilityInfo, outShortcutTargets, + outShortcutProbabilities); } static jfloat latinime_BinaryDictionary_calcNormalizedScore(JNIEnv *env, jclass clazz, @@ -521,7 +522,8 @@ static const JNINativeMethod sMethods[] = { }, { const_cast<char *>("getWordPropertyNative"), - const_cast<char *>("(J[I[I[Z[I[ILjava/util/ArrayList;Ljava/util/ArrayList;)V"), + const_cast<char *>("(J[I[I[Z[ILjava/util/ArrayList;Ljava/util/ArrayList;" + "Ljava/util/ArrayList;Ljava/util/ArrayList;)V"), reinterpret_cast<void *>(latinime_BinaryDictionary_getWordProperty) }, { diff --git a/native/jni/src/suggest/core/dictionary/word_property.cpp b/native/jni/src/suggest/core/dictionary/word_property.cpp index 4a260a982..d8c330bbd 100644 --- a/native/jni/src/suggest/core/dictionary/word_property.cpp +++
b/native/jni/src/suggest/core/dictionary/word_property.cpp @@ -19,20 +19,23 @@ namespace latinime { void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints, - jbooleanArray outFlags, jintArray outProbability, jintArray outHistoricalInfo, - jobject outShortcutTargets, jobject outShortcutProbabilities) const { + jbooleanArray outFlags, jintArray outProbabilityInfo, jobject outBigramTargets, + jobject outBigramProbabilities, jobject outShortcutTargets, + jobject outShortcutProbabilities) const { env->SetIntArrayRegion(outCodePoints, 0 /* start */, mCodePoints.size(), &mCodePoints[0]); jboolean flags[] = {mIsNotAWord, mIsBlacklisted, mHasBigrams, mHasShortcuts}; env->SetBooleanArrayRegion(outFlags, 0 /* start */, NELEMS(flags), flags); - env->SetIntArrayRegion(outProbability, 0 /* start */, 1 /* len */, &mProbability); - int historicalInfo[] = {mTimestamp, mLevel, mCount}; - env->SetIntArrayRegion(outHistoricalInfo, 0 /* start */, NELEMS(historicalInfo), - historicalInfo); + int probabilityInfo[] = {mProbability, mTimestamp, mLevel, mCount}; + env->SetIntArrayRegion(outProbabilityInfo, 0 /* start */, NELEMS(probabilityInfo), + probabilityInfo); jclass integerClass = env->FindClass("java/lang/Integer"); jmethodID intToIntegerConstructorId = env->GetMethodID(integerClass, "<init>", "(I)V"); jclass arrayListClass = env->FindClass("java/util/ArrayList"); jmethodID addMethodId = env->GetMethodID(arrayListClass, "add", "(Ljava/lang/Object;)Z"); + + // TODO: Output bigrams. + // Output shortcuts.
const int shortcutTargetCount = mShortcuts.size(); for (int i = 0; i < shortcutTargetCount; ++i) { const std::vector *const targetCodePoints = mShortcuts[i].getTargetCodePoints(); diff --git a/native/jni/src/suggest/core/dictionary/word_property.h b/native/jni/src/suggest/core/dictionary/word_property.h index 69c880861..cc06b1baa 100644 --- a/native/jni/src/suggest/core/dictionary/word_property.h +++ b/native/jni/src/suggest/core/dictionary/word_property.h @@ -78,8 +78,8 @@ class WordProperty { mShortcuts(*shortcuts) {} void outputProperties(JNIEnv *const env, jintArray outCodePoints, jbooleanArray outFlags, - jintArray outProbability, jintArray outHistoricalInfo, jobject outShortcutTargets, - jobject outShortcutProbabilities) const; + jintArray outProbabilityInfo, jobject outBigramTargets, jobject outBigramProbabilities, + jobject outShortcutTargets, jobject outShortcutProbabilities) const; private: DISALLOW_ASSIGNMENT_OPERATOR(WordProperty); diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java index 844fcbbd9..5294bb006 100644 --- a/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java +++ b/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java @@ -871,11 +871,11 @@ public class BinaryDictionaryTests extends AndroidTestCase { } } - public void testGetUnigramProperties() { - testGetUnigramProperties(FormatSpec.VERSION4); + public void testGetWordProperties() { + testGetWordProperties(FormatSpec.VERSION4); } - private void testGetUnigramProperties(final int formatVersion) { + private void testGetWordProperties(final int formatVersion) { final long seed = System.currentTimeMillis(); final Random random = new Random(seed); final int ITERATION_COUNT = 1000; @@ -892,8 +892,8 @@ public class BinaryDictionaryTests extends AndroidTestCase { 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, Locale.getDefault(), TEST_LOCALE, true /* 
isUpdatable */); - final WordProperty invalidUnigramProperty = binaryDictionary.getWordProperty("dummyWord"); - assertFalse(invalidUnigramProperty.isValid()); + final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord"); + assertFalse(invalidWordProperty.isValid()); for (int i = 0; i < ITERATION_COUNT; i++) { final String word = CodePointUtils.generateWord(random, codePointSet); @@ -904,15 +904,15 @@ public class BinaryDictionaryTests extends AndroidTestCase { binaryDictionary.addUnigramWord(word, unigramProbability, null /* shortcutTarget */, BinaryDictionary.NOT_A_PROBABILITY, isNotAWord, isBlacklisted, BinaryDictionary.NOT_A_VALID_TIMESTAMP); - final WordProperty unigramProperty = binaryDictionary.getWordProperty(word); - assertEquals(word, unigramProperty.mCodePoints); - assertTrue(unigramProperty.isValid()); - assertEquals(isNotAWord, unigramProperty.mIsNotAWord); - assertEquals(isBlacklisted, unigramProperty.mIsBlacklisted); - assertEquals(false, unigramProperty.mHasBigrams); - assertEquals(false, unigramProperty.mHasShortcuts); - assertEquals(unigramProbability, unigramProperty.mProbability); - assertTrue(unigramProperty.mShortcutTargets.isEmpty()); + final WordProperty wordProperty = binaryDictionary.getWordProperty(word); + assertEquals(word, wordProperty.mCodePoints); + assertTrue(wordProperty.isValid()); + assertEquals(isNotAWord, wordProperty.mIsNotAWord); + assertEquals(isBlacklisted, wordProperty.mIsBlacklisted); + assertEquals(false, wordProperty.mHasBigrams); + assertEquals(false, wordProperty.mHasShortcuts); + assertEquals(unigramProbability, wordProperty.mProbabilityInfo.mProbability); + assertTrue(wordProperty.mShortcutTargets.isEmpty()); } } @@ -936,28 +936,28 @@ public class BinaryDictionaryTests extends AndroidTestCase { binaryDictionary.addUnigramWord("aaa", unigramProbability, "zzz", shortcutProbability, false /* isNotAWord */, false /* isBlacklisted */, 0 /* timestamp */); - WordProperty unigramProperty = 
binaryDictionary.getWordProperty("aaa"); - assertEquals(1, unigramProperty.mShortcutTargets.size()); - assertEquals("zzz", unigramProperty.mShortcutTargets.get(0).mWord); - assertEquals(shortcutProbability, unigramProperty.mShortcutTargets.get(0).mFrequency); + WordProperty wordProperty = binaryDictionary.getWordProperty("aaa"); + assertEquals(1, wordProperty.mShortcutTargets.size()); + assertEquals("zzz", wordProperty.mShortcutTargets.get(0).mWord); + assertEquals(shortcutProbability, wordProperty.mShortcutTargets.get(0).mFrequency); final int updatedShortcutProbability = 2; binaryDictionary.addUnigramWord("aaa", unigramProbability, "zzz", updatedShortcutProbability, false /* isNotAWord */, false /* isBlacklisted */, 0 /* timestamp */); - unigramProperty = binaryDictionary.getWordProperty("aaa"); - assertEquals(1, unigramProperty.mShortcutTargets.size()); - assertEquals("zzz", unigramProperty.mShortcutTargets.get(0).mWord); + wordProperty = binaryDictionary.getWordProperty("aaa"); + assertEquals(1, wordProperty.mShortcutTargets.size()); + assertEquals("zzz", wordProperty.mShortcutTargets.get(0).mWord); assertEquals(updatedShortcutProbability, - unigramProperty.mShortcutTargets.get(0).mFrequency); + wordProperty.mShortcutTargets.get(0).mFrequency); binaryDictionary.addUnigramWord("aaa", unigramProbability, "yyy", shortcutProbability, false /* isNotAWord */, false /* isBlacklisted */, 0 /* timestamp */); final HashMap shortcutTargets = new HashMap(); shortcutTargets.put("zzz", updatedShortcutProbability); shortcutTargets.put("yyy", shortcutProbability); - unigramProperty = binaryDictionary.getWordProperty("aaa"); - assertEquals(2, unigramProperty.mShortcutTargets.size()); - for (WeightedString shortcutTarget : unigramProperty.mShortcutTargets) { + wordProperty = binaryDictionary.getWordProperty("aaa"); + assertEquals(2, wordProperty.mShortcutTargets.size()); + for (WeightedString shortcutTarget : wordProperty.mShortcutTargets) { 
assertTrue(shortcutTargets.containsKey(shortcutTarget.mWord)); assertEquals((int)shortcutTargets.get(shortcutTarget.mWord), shortcutTarget.mFrequency); shortcutTargets.remove(shortcutTarget.mWord); @@ -965,9 +965,9 @@ public class BinaryDictionaryTests extends AndroidTestCase { shortcutTargets.put("zzz", updatedShortcutProbability); shortcutTargets.put("yyy", shortcutProbability); binaryDictionary.flushWithGC(); - unigramProperty = binaryDictionary.getWordProperty("aaa"); - assertEquals(2, unigramProperty.mShortcutTargets.size()); - for (WeightedString shortcutTarget : unigramProperty.mShortcutTargets) { + wordProperty = binaryDictionary.getWordProperty("aaa"); + assertEquals(2, wordProperty.mShortcutTargets.size()); + for (WeightedString shortcutTarget : wordProperty.mShortcutTargets) { assertTrue(shortcutTargets.containsKey(shortcutTarget.mWord)); assertEquals((int)shortcutTargets.get(shortcutTarget.mWord), shortcutTarget.mFrequency); shortcutTargets.remove(shortcutTarget.mWord); @@ -1034,14 +1034,15 @@ public class BinaryDictionaryTests extends AndroidTestCase { } for (final String word : words) { - final WordProperty unigramProperty = binaryDictionary.getWordProperty(word); - assertEquals((int)unigramProbabilities.get(word), unigramProperty.mProbability); + final WordProperty wordProperty = binaryDictionary.getWordProperty(word); + assertEquals((int)unigramProbabilities.get(word), + wordProperty.mProbabilityInfo.mProbability); if (!shortcutTargets.containsKey(word)) { // The word does not have shortcut targets. 
continue; } - assertEquals(shortcutTargets.get(word).size(), unigramProperty.mShortcutTargets.size()); - for (final WeightedString shortcutTarget : unigramProperty.mShortcutTargets) { + assertEquals(shortcutTargets.get(word).size(), wordProperty.mShortcutTargets.size()); + for (final WeightedString shortcutTarget : wordProperty.mShortcutTargets) { final String targetCodePonts = shortcutTarget.mWord; assertEquals((int)shortcutTargets.get(word).get(targetCodePonts), shortcutTarget.mFrequency);