Merge "Extend jni interface to get bigrams in WordProperty."

This commit is contained in:
Keisuke Kuroyanagi 2014-01-31 11:53:10 +00:00 committed by Android (Google) Code Review
commit 5ecf74b148
6 changed files with 110 additions and 77 deletions

View file

@ -68,11 +68,12 @@ public final class BinaryDictionary extends Dictionary {
private static final int FORMAT_WORD_PROPERTY_HAS_BIGRAMS_INDEX = 2;
private static final int FORMAT_WORD_PROPERTY_HAS_SHORTCUTS_INDEX = 3;
// Format to get unigram historical info from native side via getWordPropertyNative().
private static final int FORMAT_WORD_PROPERTY_OUTPUT_HISTORICAL_INFO_COUNT = 3;
private static final int FORMAT_WORD_PROPERTY_TIMESTAMP_INDEX = 0;
private static final int FORMAT_WORD_PROPERTY_LEVEL_INDEX = 1;
private static final int FORMAT_WORD_PROPERTY_COUNT_INDEX = 2;
// Format to get probability and historical info from native side via getWordPropertyNative().
public static final int FORMAT_WORD_PROPERTY_OUTPUT_PROBABILITY_INFO_COUNT = 4;
public static final int FORMAT_WORD_PROPERTY_PROBABILITY_INDEX = 0;
public static final int FORMAT_WORD_PROPERTY_TIMESTAMP_INDEX = 1;
public static final int FORMAT_WORD_PROPERTY_LEVEL_INDEX = 2;
public static final int FORMAT_WORD_PROPERTY_COUNT_INDEX = 3;
private long mNativeDict;
private final Locale mLocale;
@ -144,9 +145,9 @@ public final class BinaryDictionary extends Dictionary {
private static native int getProbabilityNative(long dict, int[] word);
private static native int getBigramProbabilityNative(long dict, int[] word0, int[] word1);
private static native void getWordPropertyNative(long dict, int[] word,
int[] outCodePoints, boolean[] outFlags, int[] outProbability,
int[] outHistoricalInfo, ArrayList<int[]> outShortcutTargets,
ArrayList<Integer> outShortcutProbabilities);
int[] outCodePoints, boolean[] outFlags, int[] outProbabilityInfo,
ArrayList<int[]> outBigramTargets, ArrayList<int[]> outBigramProbabilityInfo,
ArrayList<int[]> outShortcutTargets, ArrayList<Integer> outShortcutProbabilities);
private static native int getSuggestionsNative(long dict, long proximityInfo,
long traverseSession, int[] xCoordinates, int[] yCoordinates, int[] times,
int[] pointerIds, int[] inputCodePoints, int inputSize, int commitPoint,
@ -313,22 +314,22 @@ public final class BinaryDictionary extends Dictionary {
final int[] codePoints = StringUtils.toCodePointArray(word);
final int[] outCodePoints = new int[MAX_WORD_LENGTH];
final boolean[] outFlags = new boolean[FORMAT_WORD_PROPERTY_OUTPUT_FLAG_COUNT];
final int[] outProbability = new int[1];
final int[] outHistoricalInfo =
new int[FORMAT_WORD_PROPERTY_OUTPUT_HISTORICAL_INFO_COUNT];
final int[] outProbabilityInfo =
new int[FORMAT_WORD_PROPERTY_OUTPUT_PROBABILITY_INFO_COUNT];
final ArrayList<int[]> outBigramTargets = CollectionUtils.newArrayList();
final ArrayList<int[]> outBigramProbabilityInfo = CollectionUtils.newArrayList();
final ArrayList<int[]> outShortcutTargets = CollectionUtils.newArrayList();
final ArrayList<Integer> outShortcutProbabilities = CollectionUtils.newArrayList();
getWordPropertyNative(mNativeDict, codePoints, outCodePoints, outFlags, outProbability,
outHistoricalInfo, outShortcutTargets, outShortcutProbabilities);
getWordPropertyNative(mNativeDict, codePoints, outCodePoints, outFlags, outProbabilityInfo,
outBigramTargets, outBigramProbabilityInfo, outShortcutTargets,
outShortcutProbabilities);
return new WordProperty(codePoints,
outFlags[FORMAT_WORD_PROPERTY_IS_NOT_A_WORD_INDEX],
outFlags[FORMAT_WORD_PROPERTY_IS_BLACKLISTED_INDEX],
outFlags[FORMAT_WORD_PROPERTY_HAS_BIGRAMS_INDEX],
outFlags[FORMAT_WORD_PROPERTY_HAS_SHORTCUTS_INDEX], outProbability[0],
outHistoricalInfo[FORMAT_WORD_PROPERTY_TIMESTAMP_INDEX],
outHistoricalInfo[FORMAT_WORD_PROPERTY_LEVEL_INDEX],
outHistoricalInfo[FORMAT_WORD_PROPERTY_COUNT_INDEX],
outShortcutTargets, outShortcutProbabilities);
outFlags[FORMAT_WORD_PROPERTY_HAS_SHORTCUTS_INDEX], outProbabilityInfo,
outBigramTargets, outBigramProbabilityInfo, outShortcutTargets,
outShortcutProbabilities);
}
// Add a unigram entry to binary dictionary with unigram attributes in native code.

View file

@ -32,15 +32,30 @@ public class WordProperty {
public final boolean mIsBlacklisted;
public final boolean mHasBigrams;
public final boolean mHasShortcuts;
public final int mProbability;
// mTimestamp, mLevel and mCount are historical info. These values depend on the
// implementation in native code; thus, we must not use them or make any assumptions about
// them except in tests.
public final int mTimestamp;
public final int mLevel;
public final int mCount;
public final ProbabilityInfo mProbabilityInfo;
public final ArrayList<WeightedString> mBigramTargets = CollectionUtils.newArrayList();
public final ArrayList<ProbabilityInfo> mBigramProbabilityInfo = CollectionUtils.newArrayList();
public final ArrayList<WeightedString> mShortcutTargets = CollectionUtils.newArrayList();
// TODO: Use this kind of Probability class for dictionary read/write code under the makedict
// package.
/**
 * Probability of a unigram or bigram entry together with its historical info, decoded from an
 * int array laid out according to the BinaryDictionary.FORMAT_WORD_PROPERTY_*_INDEX constants.
 */
public static final class ProbabilityInfo {
public final int mProbability;
// mTimestamp, mLevel and mCount are historical info. These values depend on the
// implementation in native code; thus, we must not use them or make any assumptions about
// them except in tests.
public final int mTimestamp;
public final int mLevel;
public final int mCount;
/**
 * Reads each field from its slot in the given array.
 *
 * @param probabilityInfo values indexed by the
 *        BinaryDictionary.FORMAT_WORD_PROPERTY_*_INDEX constants; it must be long enough to
 *        cover every index read here.
 */
public ProbabilityInfo(final int[] probabilityInfo) {
mProbability = probabilityInfo[BinaryDictionary.FORMAT_WORD_PROPERTY_PROBABILITY_INDEX];
mTimestamp = probabilityInfo[BinaryDictionary.FORMAT_WORD_PROPERTY_TIMESTAMP_INDEX];
mLevel = probabilityInfo[BinaryDictionary.FORMAT_WORD_PROPERTY_LEVEL_INDEX];
mCount = probabilityInfo[BinaryDictionary.FORMAT_WORD_PROPERTY_COUNT_INDEX];
}
}
private static int getCodePointCount(final int[] codePoints) {
for (int i = 0; i < codePoints.length; i++) {
if (codePoints[i] == 0) {
@ -53,18 +68,29 @@ public class WordProperty {
// This represents an invalid word when the probability is BinaryDictionary.NOT_A_PROBABILITY.
public WordProperty(final int[] codePoints, final boolean isNotAWord,
final boolean isBlacklisted, final boolean hasBigram,
final boolean hasShortcuts, final int probability, final int timestamp,
final int level, final int count, final ArrayList<int[]> shortcutTargets,
final boolean hasShortcuts, final int[] probabilityInfo,
final ArrayList<int[]> bigramTargets, final ArrayList<int[]> bigramProbabilityInfo,
final ArrayList<int[]> shortcutTargets,
final ArrayList<Integer> shortcutProbabilities) {
mCodePoints = new String(codePoints, 0 /* offset */, getCodePointCount(codePoints));
mIsNotAWord = isNotAWord;
mIsBlacklisted = isBlacklisted;
mHasBigrams = hasBigram;
mHasShortcuts = hasShortcuts;
mProbability = probability;
mTimestamp = timestamp;
mLevel = level;
mCount = count;
mProbabilityInfo = new ProbabilityInfo(probabilityInfo);
final int bigramTargetCount = bigramTargets.size();
for (int i = 0; i < bigramTargetCount; i++) {
final int[] bigramTargetCodePointArray = bigramTargets.get(i);
final String bigramTargetString = new String(bigramTargetCodePointArray,
0 /* offset */, getCodePointCount(bigramTargetCodePointArray));
final ProbabilityInfo bigramProbability =
new ProbabilityInfo(bigramProbabilityInfo.get(i));
mBigramTargets.add(
new WeightedString(bigramTargetString, bigramProbability.mProbability));
mBigramProbabilityInfo.add(bigramProbability);
}
final int shortcutTargetCount = shortcutTargets.size();
for (int i = 0; i < shortcutTargetCount; i++) {
final int[] shortcutTargetCodePointArray = shortcutTargets.get(i);
@ -77,6 +103,6 @@ public class WordProperty {
/**
 * Returns whether this property describes a valid word, i.e. whether its probability differs
 * from BinaryDictionary.NOT_A_PROBABILITY.
 */
@UsedForTesting
public boolean isValid() {
return mProbability != BinaryDictionary.NOT_A_PROBABILITY;
return mProbabilityInfo.mProbability != BinaryDictionary.NOT_A_PROBABILITY;
}
}

View file

@ -262,16 +262,17 @@ static jint latinime_BinaryDictionary_getBigramProbability(JNIEnv *env, jclass c
// Native implementation registered as "getWordPropertyNative": looks up the property of |word|
// in the dictionary referenced by |dict| and writes the result into the caller-supplied output
// arrays and lists. Returns without writing anything when |dict| is null.
static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz,
jlong dict, jintArray word, jintArray outCodePoints, jbooleanArray outFlags,
jintArray outProbability, jintArray outHistoricalInfo, jobject outShortcutTargets,
jobject outShortcutProbabilities) {
jintArray outProbabilityInfo, jobject outBigramTargets, jobject outBigramProbabilityInfo,
jobject outShortcutTargets, jobject outShortcutProbabilities) {
// |dict| carries the native Dictionary pointer as a jlong handle.
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
if (!dictionary) return;
// Copy the code points of the Java int[] into a native buffer for the lookup.
const jsize wordLength = env->GetArrayLength(word);
int wordCodePoints[wordLength];
env->GetIntArrayRegion(word, 0, wordLength, wordCodePoints);
const WordProperty wordProperty = dictionary->getWordProperty(wordCodePoints, wordLength);
// WordProperty itself fills the Java-side output objects.
wordProperty.outputProperties(env, outCodePoints, outFlags, outProbability,
outHistoricalInfo, outShortcutTargets, outShortcutProbabilities);
wordProperty.outputProperties(env, outCodePoints, outFlags, outProbabilityInfo,
outBigramTargets, outBigramProbabilityInfo, outShortcutTargets,
outShortcutProbabilities);
}
static jfloat latinime_BinaryDictionary_calcNormalizedScore(JNIEnv *env, jclass clazz,
@ -521,7 +522,8 @@ static const JNINativeMethod sMethods[] = {
},
{
const_cast<char *>("getWordPropertyNative"),
const_cast<char *>("(J[I[I[Z[I[ILjava/util/ArrayList;Ljava/util/ArrayList;)V"),
const_cast<char *>("(J[I[I[Z[ILjava/util/ArrayList;Ljava/util/ArrayList;"
"Ljava/util/ArrayList;Ljava/util/ArrayList;)V"),
reinterpret_cast<void *>(latinime_BinaryDictionary_getWordProperty)
},
{

View file

@ -19,20 +19,23 @@
namespace latinime {
void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints,
jbooleanArray outFlags, jintArray outProbability, jintArray outHistoricalInfo,
jobject outShortcutTargets, jobject outShortcutProbabilities) const {
jbooleanArray outFlags, jintArray outProbabilityInfo, jobject outBigramTargets,
jobject outBigramProbabilities, jobject outShortcutTargets,
jobject outShortcutProbabilities) const {
env->SetIntArrayRegion(outCodePoints, 0 /* start */, mCodePoints.size(), &mCodePoints[0]);
jboolean flags[] = {mIsNotAWord, mIsBlacklisted, mHasBigrams, mHasShortcuts};
env->SetBooleanArrayRegion(outFlags, 0 /* start */, NELEMS(flags), flags);
env->SetIntArrayRegion(outProbability, 0 /* start */, 1 /* len */, &mProbability);
int historicalInfo[] = {mTimestamp, mLevel, mCount};
env->SetIntArrayRegion(outHistoricalInfo, 0 /* start */, NELEMS(historicalInfo),
historicalInfo);
int probabilityInfo[] = {mProbability, mTimestamp, mLevel, mCount};
env->SetIntArrayRegion(outProbabilityInfo, 0 /* start */, NELEMS(probabilityInfo),
probabilityInfo);
jclass integerClass = env->FindClass("java/lang/Integer");
jmethodID intToIntegerConstructorId = env->GetMethodID(integerClass, "<init>", "(I)V");
jclass arrayListClass = env->FindClass("java/util/ArrayList");
jmethodID addMethodId = env->GetMethodID(arrayListClass, "add", "(Ljava/lang/Object;)Z");
// TODO: Output bigrams.
// Output shortcuts.
const int shortcutTargetCount = mShortcuts.size();
for (int i = 0; i < shortcutTargetCount; ++i) {
const std::vector<int> *const targetCodePoints = mShortcuts[i].getTargetCodePoints();

View file

@ -78,8 +78,8 @@ class WordProperty {
mShortcuts(*shortcuts) {}
void outputProperties(JNIEnv *const env, jintArray outCodePoints, jbooleanArray outFlags,
jintArray outProbability, jintArray outHistoricalInfo, jobject outShortcutTargets,
jobject outShortcutProbabilities) const;
jintArray outProbabilityInfo, jobject outBigramTargets, jobject outBigramProbabilities,
jobject outShortcutTargets, jobject outShortcutProbabilities) const;
private:
DISALLOW_ASSIGNMENT_OPERATOR(WordProperty);

View file

@ -871,11 +871,11 @@ public class BinaryDictionaryTests extends AndroidTestCase {
}
}
public void testGetUnigramProperties() {
testGetUnigramProperties(FormatSpec.VERSION4);
public void testGetWordProperties() {
testGetWordProperties(FormatSpec.VERSION4);
}
private void testGetUnigramProperties(final int formatVersion) {
private void testGetWordProperties(final int formatVersion) {
final long seed = System.currentTimeMillis();
final Random random = new Random(seed);
final int ITERATION_COUNT = 1000;
@ -892,8 +892,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
final WordProperty invalidUnigramProperty = binaryDictionary.getWordProperty("dummyWord");
assertFalse(invalidUnigramProperty.isValid());
final WordProperty invalidWordProperty = binaryDictionary.getWordProperty("dummyWord");
assertFalse(invalidWordProperty.isValid());
for (int i = 0; i < ITERATION_COUNT; i++) {
final String word = CodePointUtils.generateWord(random, codePointSet);
@ -904,15 +904,15 @@ public class BinaryDictionaryTests extends AndroidTestCase {
binaryDictionary.addUnigramWord(word, unigramProbability,
null /* shortcutTarget */, BinaryDictionary.NOT_A_PROBABILITY,
isNotAWord, isBlacklisted, BinaryDictionary.NOT_A_VALID_TIMESTAMP);
final WordProperty unigramProperty = binaryDictionary.getWordProperty(word);
assertEquals(word, unigramProperty.mCodePoints);
assertTrue(unigramProperty.isValid());
assertEquals(isNotAWord, unigramProperty.mIsNotAWord);
assertEquals(isBlacklisted, unigramProperty.mIsBlacklisted);
assertEquals(false, unigramProperty.mHasBigrams);
assertEquals(false, unigramProperty.mHasShortcuts);
assertEquals(unigramProbability, unigramProperty.mProbability);
assertTrue(unigramProperty.mShortcutTargets.isEmpty());
final WordProperty wordProperty = binaryDictionary.getWordProperty(word);
assertEquals(word, wordProperty.mCodePoints);
assertTrue(wordProperty.isValid());
assertEquals(isNotAWord, wordProperty.mIsNotAWord);
assertEquals(isBlacklisted, wordProperty.mIsBlacklisted);
assertEquals(false, wordProperty.mHasBigrams);
assertEquals(false, wordProperty.mHasShortcuts);
assertEquals(unigramProbability, wordProperty.mProbabilityInfo.mProbability);
assertTrue(wordProperty.mShortcutTargets.isEmpty());
}
}
@ -936,28 +936,28 @@ public class BinaryDictionaryTests extends AndroidTestCase {
binaryDictionary.addUnigramWord("aaa", unigramProbability, "zzz",
shortcutProbability, false /* isNotAWord */, false /* isBlacklisted */,
0 /* timestamp */);
WordProperty unigramProperty = binaryDictionary.getWordProperty("aaa");
assertEquals(1, unigramProperty.mShortcutTargets.size());
assertEquals("zzz", unigramProperty.mShortcutTargets.get(0).mWord);
assertEquals(shortcutProbability, unigramProperty.mShortcutTargets.get(0).mFrequency);
WordProperty wordProperty = binaryDictionary.getWordProperty("aaa");
assertEquals(1, wordProperty.mShortcutTargets.size());
assertEquals("zzz", wordProperty.mShortcutTargets.get(0).mWord);
assertEquals(shortcutProbability, wordProperty.mShortcutTargets.get(0).mFrequency);
final int updatedShortcutProbability = 2;
binaryDictionary.addUnigramWord("aaa", unigramProbability, "zzz",
updatedShortcutProbability, false /* isNotAWord */, false /* isBlacklisted */,
0 /* timestamp */);
unigramProperty = binaryDictionary.getWordProperty("aaa");
assertEquals(1, unigramProperty.mShortcutTargets.size());
assertEquals("zzz", unigramProperty.mShortcutTargets.get(0).mWord);
wordProperty = binaryDictionary.getWordProperty("aaa");
assertEquals(1, wordProperty.mShortcutTargets.size());
assertEquals("zzz", wordProperty.mShortcutTargets.get(0).mWord);
assertEquals(updatedShortcutProbability,
unigramProperty.mShortcutTargets.get(0).mFrequency);
wordProperty.mShortcutTargets.get(0).mFrequency);
binaryDictionary.addUnigramWord("aaa", unigramProbability, "yyy",
shortcutProbability, false /* isNotAWord */, false /* isBlacklisted */,
0 /* timestamp */);
final HashMap<String, Integer> shortcutTargets = new HashMap<String, Integer>();
shortcutTargets.put("zzz", updatedShortcutProbability);
shortcutTargets.put("yyy", shortcutProbability);
unigramProperty = binaryDictionary.getWordProperty("aaa");
assertEquals(2, unigramProperty.mShortcutTargets.size());
for (WeightedString shortcutTarget : unigramProperty.mShortcutTargets) {
wordProperty = binaryDictionary.getWordProperty("aaa");
assertEquals(2, wordProperty.mShortcutTargets.size());
for (WeightedString shortcutTarget : wordProperty.mShortcutTargets) {
assertTrue(shortcutTargets.containsKey(shortcutTarget.mWord));
assertEquals((int)shortcutTargets.get(shortcutTarget.mWord), shortcutTarget.mFrequency);
shortcutTargets.remove(shortcutTarget.mWord);
@ -965,9 +965,9 @@ public class BinaryDictionaryTests extends AndroidTestCase {
shortcutTargets.put("zzz", updatedShortcutProbability);
shortcutTargets.put("yyy", shortcutProbability);
binaryDictionary.flushWithGC();
unigramProperty = binaryDictionary.getWordProperty("aaa");
assertEquals(2, unigramProperty.mShortcutTargets.size());
for (WeightedString shortcutTarget : unigramProperty.mShortcutTargets) {
wordProperty = binaryDictionary.getWordProperty("aaa");
assertEquals(2, wordProperty.mShortcutTargets.size());
for (WeightedString shortcutTarget : wordProperty.mShortcutTargets) {
assertTrue(shortcutTargets.containsKey(shortcutTarget.mWord));
assertEquals((int)shortcutTargets.get(shortcutTarget.mWord), shortcutTarget.mFrequency);
shortcutTargets.remove(shortcutTarget.mWord);
@ -1034,14 +1034,15 @@ public class BinaryDictionaryTests extends AndroidTestCase {
}
for (final String word : words) {
final WordProperty unigramProperty = binaryDictionary.getWordProperty(word);
assertEquals((int)unigramProbabilities.get(word), unigramProperty.mProbability);
final WordProperty wordProperty = binaryDictionary.getWordProperty(word);
assertEquals((int)unigramProbabilities.get(word),
wordProperty.mProbabilityInfo.mProbability);
if (!shortcutTargets.containsKey(word)) {
// The word does not have shortcut targets.
continue;
}
assertEquals(shortcutTargets.get(word).size(), unigramProperty.mShortcutTargets.size());
for (final WeightedString shortcutTarget : unigramProperty.mShortcutTargets) {
assertEquals(shortcutTargets.get(word).size(), wordProperty.mShortcutTargets.size());
for (final WeightedString shortcutTarget : wordProperty.mShortcutTargets) {
final String targetCodePonts = shortcutTarget.mWord;
assertEquals((int)shortcutTargets.get(word).get(targetCodePonts),
shortcutTarget.mFrequency);