From 9fb28f78f7b67bb1ab7412e0abfd7d104004b623 Mon Sep 17 00:00:00 2001
From: Keisuke Kuroyanagi
Date: Fri, 6 Dec 2013 17:17:38 +0900
Subject: [PATCH] Add a jni method to fetch unigram information.

Bug: 11956652
Change-Id: I59f1823bbc0146f6d2f34cbed8166bb6de197208
---
 .../inputmethod/latin/BinaryDictionary.java   | 44 ++++++++++
 .../latin/utils/UnigramProperty.java          | 82 +++++++++++++++++
 native/jni/Android.mk                         |  3 +-
 ...oid_inputmethod_latin_BinaryDictionary.cpp | 22 ++++-
 .../suggest/core/dictionary/dictionary.cpp    |  7 ++
 .../src/suggest/core/dictionary/dictionary.h  |  4 +
 .../core/dictionary/unigram_property.cpp      | 52 +++++++++++
 .../core/dictionary/unigram_property.h        | 87 +++++++++++++++++++
 .../dictionary_structure_with_buffer_policy.h |  5 ++
 .../structure/v2/patricia_trie_policy.h       |  6 ++
 .../v4/ver4_patricia_trie_policy.cpp          | 41 +++++++++
 .../structure/v4/ver4_patricia_trie_policy.h  |  3 +
 .../latin/BinaryDictionaryTests.java          | 49 +++++++++++
 13 files changed, 403 insertions(+), 2 deletions(-)
 create mode 100644 java/src/com/android/inputmethod/latin/utils/UnigramProperty.java
 create mode 100644 native/jni/src/suggest/core/dictionary/unigram_property.cpp
 create mode 100644 native/jni/src/suggest/core/dictionary/unigram_property.h

diff --git a/java/src/com/android/inputmethod/latin/BinaryDictionary.java b/java/src/com/android/inputmethod/latin/BinaryDictionary.java
index aa530ffb9..49a198d1e 100644
--- a/java/src/com/android/inputmethod/latin/BinaryDictionary.java
+++ b/java/src/com/android/inputmethod/latin/BinaryDictionary.java
@@ -26,6 +26,7 @@ import com.android.inputmethod.latin.settings.NativeSuggestOptions;
 import com.android.inputmethod.latin.utils.CollectionUtils;
 import com.android.inputmethod.latin.utils.JniUtils;
 import com.android.inputmethod.latin.utils.StringUtils;
+import com.android.inputmethod.latin.utils.UnigramProperty;
 
 import java.io.File;
 import java.util.ArrayList;
@@ -59,6 +60,19 @@ public final class BinaryDictionary extends Dictionary {
 
     public static final int NOT_A_VALID_TIMESTAMP = -1;
 
+    // Format to get unigram flags from native side via getUnigramPropertyNative().
+    private static final int FORMAT_UNIGRAM_PROPERTY_OUTPUT_FLAG_COUNT = 4;
+    private static final int FORMAT_UNIGRAM_PROPERTY_IS_NOT_A_WORD_INDEX = 0;
+    private static final int FORMAT_UNIGRAM_PROPERTY_IS_BLACKLISTED_INDEX = 1;
+    private static final int FORMAT_UNIGRAM_PROPERTY_HAS_BIGRAMS_INDEX = 2;
+    private static final int FORMAT_UNIGRAM_PROPERTY_HAS_SHORTCUTS_INDEX = 3;
+
+    // Format to get unigram historical info from native side via getUnigramPropertyNative().
+    private static final int FORMAT_UNIGRAM_PROPERTY_OUTPUT_HISTORICAL_INFO_COUNT = 3;
+    private static final int FORMAT_UNIGRAM_PROPERTY_TIMESTAMP_INDEX = 0;
+    private static final int FORMAT_UNIGRAM_PROPERTY_LEVEL_INDEX = 1;
+    private static final int FORMAT_UNIGRAM_PROPERTY_COUNT_INDEX = 2;
+
     private long mNativeDict;
     private final Locale mLocale;
     private final long mDictSize;
@@ -129,6 +143,10 @@ public final class BinaryDictionary extends Dictionary {
     private static native int getFormatVersionNative(long dict);
     private static native int getProbabilityNative(long dict, int[] word);
     private static native int getBigramProbabilityNative(long dict, int[] word0, int[] word1);
+    private static native void getUnigramPropertyNative(long dict, int[] word,
+            int[] outCodePoints, boolean[] outFlags, int[] outProbability,
+            int[] outHistoricalInfo, ArrayList<int[]> outShortcutTargets,
+            ArrayList<Integer> outShortcutProbabilities);
     private static native int getSuggestionsNative(long dict, long proximityInfo,
             long traverseSession, int[] xCoordinates, int[] yCoordinates, int[] times,
             int[] pointerIds, int[] inputCodePoints, int inputSize, int commitPoint,
@@ -290,6 +308,32 @@ public final class BinaryDictionary extends Dictionary {
         return getBigramProbabilityNative(mNativeDict, codePoints0, codePoints1);
     }
 
+    @UsedForTesting
+    public UnigramProperty getUnigramProperty(final String word) {
+        if (TextUtils.isEmpty(word)) {
+            return null;
+        }
+        final int[] codePoints = StringUtils.toCodePointArray(word);
+        final int[] outCodePoints = new int[MAX_WORD_LENGTH];
+        final boolean[] outFlags = new boolean[FORMAT_UNIGRAM_PROPERTY_OUTPUT_FLAG_COUNT];
+        final int[] outProbability = new int[] { NOT_A_PROBABILITY }; // Invalid by default.
+        final int[] outHistoricalInfo =
+                new int[FORMAT_UNIGRAM_PROPERTY_OUTPUT_HISTORICAL_INFO_COUNT];
+        final ArrayList<int[]> outShortcutTargets = CollectionUtils.newArrayList();
+        final ArrayList<Integer> outShortcutProbabilities = CollectionUtils.newArrayList();
+        getUnigramPropertyNative(mNativeDict, codePoints, outCodePoints, outFlags, outProbability,
+                outHistoricalInfo, outShortcutTargets, outShortcutProbabilities);
+        return new UnigramProperty(codePoints,
+                outFlags[FORMAT_UNIGRAM_PROPERTY_IS_NOT_A_WORD_INDEX],
+                outFlags[FORMAT_UNIGRAM_PROPERTY_IS_BLACKLISTED_INDEX],
+                outFlags[FORMAT_UNIGRAM_PROPERTY_HAS_BIGRAMS_INDEX],
+                outFlags[FORMAT_UNIGRAM_PROPERTY_HAS_SHORTCUTS_INDEX], outProbability[0],
+                outHistoricalInfo[FORMAT_UNIGRAM_PROPERTY_TIMESTAMP_INDEX],
+                outHistoricalInfo[FORMAT_UNIGRAM_PROPERTY_LEVEL_INDEX],
+                outHistoricalInfo[FORMAT_UNIGRAM_PROPERTY_COUNT_INDEX],
+                outShortcutTargets, outShortcutProbabilities);
+    }
+
     // Add a unigram entry to binary dictionary with unigram attributes in native code.
     public void addUnigramWord(final String word, final int probability,
             final String shortcutTarget, final int shortcutProbability, final boolean isNotAWord,
diff --git a/java/src/com/android/inputmethod/latin/utils/UnigramProperty.java b/java/src/com/android/inputmethod/latin/utils/UnigramProperty.java
new file mode 100644
index 000000000..4feee4393
--- /dev/null
+++ b/java/src/com/android/inputmethod/latin/utils/UnigramProperty.java
@@ -0,0 +1,82 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package com.android.inputmethod.latin.utils;
+
+import com.android.inputmethod.annotations.UsedForTesting;
+import com.android.inputmethod.latin.BinaryDictionary;
+import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
+
+import java.util.ArrayList;
+
+// This has information that belongs to a unigram. This class has some detailed attributes such as
+// historical information but they have to be checked only for testing purpose.
+@UsedForTesting
+public class UnigramProperty {
+    public final String mCodePoints;
+    public final boolean mIsNotAWord;
+    public final boolean mIsBlacklisted;
+    public final boolean mHasBigrams;
+    public final boolean mHasShortcuts;
+    public final int mProbability;
+    // mTimestamp, mLevel and mCount are historical info. These values depend on the
+    // implementation in native code; thus, we must not use them and have any assumptions about
+    // them except for tests.
+    public final int mTimestamp;
+    public final int mLevel;
+    public final int mCount;
+    public final ArrayList<WeightedString> mShortcutTargets = CollectionUtils.newArrayList();
+
+    private static int getCodePointCount(final int[] codePoints) {
+        for (int i = 0; i < codePoints.length; i++) {
+            if (codePoints[i] == 0) {
+                return i;
+            }
+        }
+        return codePoints.length;
+    }
+
+    // This represents invalid unigram when the probability is BinaryDictionary.NOT_A_PROBABILITY.
+    public UnigramProperty(final int[] codePoints, final boolean isNotAWord,
+            final boolean isBlacklisted, final boolean hasBigram,
+            final boolean hasShortcuts, final int probability, final int timestamp,
+            final int level, final int count, final ArrayList<int[]> shortcutTargets,
+            final ArrayList<Integer> shortcutProbabilities) {
+        mCodePoints = new String(codePoints, 0 /* offset */, getCodePointCount(codePoints));
+        mIsNotAWord = isNotAWord;
+        mIsBlacklisted = isBlacklisted;
+        mHasBigrams = hasBigram;
+        mHasShortcuts = hasShortcuts;
+        mProbability = probability;
+        mTimestamp = timestamp;
+        mLevel = level;
+        mCount = count;
+        final int shortcutTargetCount = shortcutTargets.size();
+        for (int i = 0; i < shortcutTargetCount; i++) {
+            final int[] shortcutTargetCodePointArray = shortcutTargets.get(i);
+            final String shortcutTargetString = new String(shortcutTargetCodePointArray,
+                    0 /* offset */, getCodePointCount(shortcutTargetCodePointArray));
+            mShortcutTargets.add(
+                    new WeightedString(shortcutTargetString, shortcutProbabilities.get(i)));
+        }
+    }
+
+    @UsedForTesting
+    public boolean isValid() {
+        return mProbability != BinaryDictionary.NOT_A_PROBABILITY;
+    }
+}
\ No newline at end of file
diff --git a/native/jni/Android.mk b/native/jni/Android.mk
index 52ac333c4..f2c6d3bec 100644
--- a/native/jni/Android.mk
+++ b/native/jni/Android.mk
@@ -58,7 +58,8 @@ LATIN_IME_CORE_SRC_FILES := \
         dictionary.cpp \
         digraph_utils.cpp \
         error_type_utils.cpp \
-        multi_bigram_map.cpp) \
+        multi_bigram_map.cpp \
+        unigram_property.cpp) \
     $(addprefix suggest/core/layout/, \
         additional_proximity_chars.cpp \
         proximity_info.cpp \
diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp
index 0ef372a51..3a896dbad 100644
--- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp
+++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp
@@ -24,6 +24,7 @@
 #include "jni.h"
 #include "jni_common.h"
 #include "suggest/core/dictionary/dictionary.h"
+#include "suggest/core/dictionary/unigram_property.h"
 #include "suggest/core/suggest_options.h"
 #include "suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h"
 #include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h"
@@ -266,6 +267,21 @@ static jint latinime_BinaryDictionary_getBigramProbability(JNIEnv *env, jclass c
             word1Length);
 }
 
+static void latinime_BinaryDictionary_getUnigramProperty(JNIEnv *env, jclass clazz,
+        jlong dict, jintArray word, jintArray outCodePoints, jbooleanArray outFlags,
+        jintArray outProbability, jintArray outHistoricalInfo, jobject outShortcutTargets,
+        jobject outShortcutProbabilities) {
+    Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
+    if (!dictionary) return;
+    const jsize wordLength = env->GetArrayLength(word);
+    int wordCodePoints[wordLength];
+    env->GetIntArrayRegion(word, 0, wordLength, wordCodePoints);
+    const UnigramProperty unigramProperty = dictionary->getUnigramProperty(
+            wordCodePoints, wordLength);
+    unigramProperty.outputProperties(env, outCodePoints, outFlags, outProbability,
+            outHistoricalInfo, outShortcutTargets, outShortcutProbabilities);
+}
+
 static jfloat latinime_BinaryDictionary_calcNormalizedScore(JNIEnv *env, jclass clazz,
         jintArray before, jintArray after, jint score) {
     jsize beforeLength = env->GetArrayLength(before);
@@ -341,7 +357,6 @@ static void latinime_BinaryDictionary_removeBigramWords(JNIEnv *env, jclass claz
             word1Length);
 }
 
-
 // Returns how many language model params are processed.
 static int latinime_BinaryDictionary_addMultipleDictionaryEntries(JNIEnv *env, jclass clazz,
         jlong dict, jobjectArray languageModelParams, jint startIndex) {
@@ -506,6 +521,11 @@ static const JNINativeMethod sMethods[] = {
         const_cast<char *>("(J[I[I)I"),
         reinterpret_cast<void *>(latinime_BinaryDictionary_getBigramProbability)
     },
+    {
+        const_cast<char *>("getUnigramPropertyNative"),
+        const_cast<char *>("(J[I[I[Z[I[ILjava/util/ArrayList;Ljava/util/ArrayList;)V"),
+        reinterpret_cast<void *>(latinime_BinaryDictionary_getUnigramProperty)
+    },
     {
         const_cast<char *>("calcNormalizedScoreNative"),
         const_cast<char *>("([I[II)F"),
diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp
index 8055707b7..07da8da87 100644
--- a/native/jni/src/suggest/core/dictionary/dictionary.cpp
+++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp
@@ -143,6 +143,13 @@ void Dictionary::getProperty(const char *const query, const int queryLength, cha
             maxResultLength);
 }
 
+const UnigramProperty Dictionary::getUnigramProperty(const int *const codePoints,
+        const int codePointCount) {
+    TimeKeeper::setCurrentTime();
+    return mDictionaryStructureWithBufferPolicy.get()->getUnigramProperty(
+            codePoints, codePointCount);
+}
+
 void Dictionary::logDictionaryInfo(JNIEnv *const env) const {
     int dictionaryIdCodePointBuffer[HEADER_ATTRIBUTE_BUFFER_SIZE];
     int versionStringCodePointBuffer[HEADER_ATTRIBUTE_BUFFER_SIZE];
diff --git a/native/jni/src/suggest/core/dictionary/dictionary.h b/native/jni/src/suggest/core/dictionary/dictionary.h
index 4fef051d3..556b4c9db 100644
--- a/native/jni/src/suggest/core/dictionary/dictionary.h
+++ b/native/jni/src/suggest/core/dictionary/dictionary.h
@@ -22,6 +22,7 @@
 #include "defines.h"
 #include "jni.h"
 #include "suggest/core/dictionary/bigram_dictionary.h"
+#include "suggest/core/dictionary/unigram_property.h"
 #include "suggest/core/policy/dictionary_header_structure_policy.h"
 #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
 #include "suggest/core/suggest_interface.h"
@@ -33,6 +34,7 @@ class DictionaryStructureWithBufferPolicy;
 class DicTraverseSession;
 class ProximityInfo;
 class SuggestOptions;
+class UnigramProperty;
 
 class Dictionary {
  public:
@@ -92,6 +94,8 @@ class Dictionary {
     void getProperty(const char *const query, const int queryLength, char *const outResult,
             const int maxResultLength);
 
+    const UnigramProperty getUnigramProperty(const int *const codePoints, const int codePointCount);
+
     const DictionaryStructureWithBufferPolicy *getDictionaryStructurePolicy() const {
         return mDictionaryStructureWithBufferPolicy.get();
     }
diff --git a/native/jni/src/suggest/core/dictionary/unigram_property.cpp b/native/jni/src/suggest/core/dictionary/unigram_property.cpp
new file mode 100644
index 000000000..16bbb69d8
--- /dev/null
+++ b/native/jni/src/suggest/core/dictionary/unigram_property.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "suggest/core/dictionary/unigram_property.h"
+
+namespace latinime {
+
+void UnigramProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints,
+        jbooleanArray outFlags, jintArray outProbability, jintArray outHistoricalInfo,
+        jobject outShortcutTargets, jobject outShortcutProbabilities) const {
+    env->SetIntArrayRegion(outCodePoints, 0 /* start */, mCodePointCount, mCodePoints);
+    jboolean flags[] = {mIsNotAWord, mIsBlacklisted, mHasBigrams, mHasShortcuts};
+    env->SetBooleanArrayRegion(outFlags, 0 /* start */, NELEMS(flags), flags);
+    env->SetIntArrayRegion(outProbability, 0 /* start */, 1 /* len */, &mProbability);
+    int historicalInfo[] = {mTimestamp, mLevel, mCount};
+    env->SetIntArrayRegion(outHistoricalInfo, 0 /* start */, NELEMS(historicalInfo),
+            historicalInfo);
+
+    jclass integerClass = env->FindClass("java/lang/Integer");
+    jmethodID intToIntegerConstructorId = env->GetMethodID(integerClass, "<init>", "(I)V");
+    jclass arrayListClass = env->FindClass("java/util/ArrayList");
+    jmethodID addMethodId = env->GetMethodID(arrayListClass, "add", "(Ljava/lang/Object;)Z");
+    const int shortcutTargetCount = mShortcutTargets.size();
+    for (int i = 0; i < shortcutTargetCount; ++i) {
+        jintArray shortcutTargetCodePointArray = env->NewIntArray(mShortcutTargets[i].size());
+        env->SetIntArrayRegion(shortcutTargetCodePointArray, 0 /* start */,
+                mShortcutTargets[i].size(), &mShortcutTargets[i][0]);
+        env->CallBooleanMethod(outShortcutTargets, addMethodId, shortcutTargetCodePointArray);
+        env->DeleteLocalRef(shortcutTargetCodePointArray);
+        jobject integerProbability = env->NewObject(integerClass, intToIntegerConstructorId,
+                mShortcutProbabilities[i]);
+        env->CallBooleanMethod(outShortcutProbabilities, addMethodId, integerProbability);
+        env->DeleteLocalRef(integerProbability);
+    }
+    env->DeleteLocalRef(integerClass);
+    env->DeleteLocalRef(arrayListClass);
+}
+
+} // namespace latinime
diff --git a/native/jni/src/suggest/core/dictionary/unigram_property.h b/native/jni/src/suggest/core/dictionary/unigram_property.h
new file mode 100644
index 000000000..e93093bf4
--- /dev/null
+++ b/native/jni/src/suggest/core/dictionary/unigram_property.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_UNIGRAM_PROPERTY_H
+#define LATINIME_UNIGRAM_PROPERTY_H
+
+#include <vector>
+
+#include "defines.h"
+#include "jni.h"
+
+namespace latinime {
+
+// This class is used for returning information belonging to a unigram to java side.
+class UnigramProperty {
+ public:
+    // Invalid unigram (mProbability is NOT_A_PROBABILITY).
+    UnigramProperty()
+            : mCodePoints(), mCodePointCount(0), mIsNotAWord(false), mIsBlacklisted(false),
+              mHasBigrams(false), mHasShortcuts(false), mProbability(NOT_A_PROBABILITY),
+              mTimestamp(0), mLevel(0), mCount(0), mShortcutTargets(),
+              mShortcutProbabilities() {}
+
+    UnigramProperty(const UnigramProperty &unigramProperty)
+            : mCodePoints(), mCodePointCount(unigramProperty.mCodePointCount),
+              mIsNotAWord(unigramProperty.mIsNotAWord),
+              mIsBlacklisted(unigramProperty.mIsBlacklisted),
+              mHasBigrams(unigramProperty.mHasBigrams),
+              mHasShortcuts(unigramProperty.mHasShortcuts),
+              mProbability(unigramProperty.mProbability),
+              mTimestamp(unigramProperty.mTimestamp), mLevel(unigramProperty.mLevel),
+              mCount(unigramProperty.mCount), mShortcutTargets(unigramProperty.mShortcutTargets),
+              mShortcutProbabilities(unigramProperty.mShortcutProbabilities) {
+        memcpy(mCodePoints, unigramProperty.mCodePoints, sizeof(mCodePoints));
+    }
+
+    UnigramProperty(const int *const codePoints, const int codePointCount,
+            const bool isNotAWord, const bool isBlacklisted, const bool hasBigrams,
+            const bool hasShortcuts, const int probability, const int timestamp,
+            const int level, const int count,
+            const std::vector<std::vector<int> > *const shortcutTargets,
+            const std::vector<int> *const shortcutProbabilities)
+            : mCodePoints(), mCodePointCount(codePointCount),
+              mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), mHasBigrams(hasBigrams),
+              mHasShortcuts(hasShortcuts), mProbability(probability), mTimestamp(timestamp),
+              mLevel(level), mCount(count), mShortcutTargets(*shortcutTargets),
+              mShortcutProbabilities(*shortcutProbabilities) {
+        memcpy(mCodePoints, codePoints, sizeof(mCodePoints));
+    }
+
+    void outputProperties(JNIEnv *const env, jintArray outCodePoints, jbooleanArray outFlags,
+            jintArray outProbability, jintArray outHistoricalInfo, jobject outShortcutTargets,
+            jobject outShortcutProbabilities) const;
+
+ private:
+    DISALLOW_ASSIGNMENT_OPERATOR(UnigramProperty);
+
+    int mCodePoints[MAX_WORD_LENGTH];
+    int mCodePointCount;
+    bool mIsNotAWord;
+    bool mIsBlacklisted;
+    bool mHasBigrams;
+    bool mHasShortcuts;
+    int mProbability;
+    // Historical information
+    int mTimestamp;
+    int mLevel;
+    int mCount;
+    // Shortcut
+    std::vector<std::vector<int> > mShortcutTargets;
+    std::vector<int> mShortcutProbabilities;
+};
+} // namespace latinime
+#endif // LATINIME_UNIGRAM_PROPERTY_H
diff --git a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h
index 523aa93e6..07bc74fa4 100644
--- a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h
+++ b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h
@@ -18,6 +18,7 @@
 #define LATINIME_DICTIONARY_STRUCTURE_POLICY_H
 
 #include "defines.h"
+#include "suggest/core/dictionary/unigram_property.h"
 #include "utils/exclusive_ownership_pointer.h"
 
 namespace latinime {
@@ -90,6 +91,10 @@ class DictionaryStructureWithBufferPolicy {
     virtual void getProperty(const char *const query, const int queryLength, char *const outResult,
             const int maxResultLength) = 0;
 
+    // Used for testing.
+    virtual const UnigramProperty getUnigramProperty(const int *const codePoints,
+            const int codePointCount) const = 0;
+
  protected:
     DictionaryStructureWithBufferPolicy() {}
 
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h
index e78c82725..2adafd22b 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h
@@ -123,6 +123,12 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
         }
     }
 
+    const UnigramProperty getUnigramProperty(const int *const codePoints,
+            const int codePointCount) const {
+        // getUnigramProperty is not supported.
+        return UnigramProperty();
+    }
+
  private:
     DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTriePolicy);
 
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
index 979b8e76d..d1ba1877c 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
@@ -16,8 +16,11 @@
 
 #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h"
 
+#include <vector>
+
 #include "suggest/core/dicnode/dic_node.h"
 #include "suggest/core/dicnode/dic_node_vector.h"
+#include "suggest/core/dictionary/unigram_property.h"
 #include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_reading_helper.h"
 #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
 #include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
@@ -290,4 +293,42 @@ void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int quer
     }
 }
 
+const UnigramProperty Ver4PatriciaTriePolicy::getUnigramProperty(const int *const codePoints,
+        const int codePointCount) const {
+    const int ptNodePos = getTerminalPtNodePositionOfWord(codePoints, codePointCount,
+            false /* forceLowerCaseSearch */);
+    if (ptNodePos == NOT_A_DICT_POS) {
+        AKLOGE("getUnigramProperty is called for invalid word.");
+        return UnigramProperty();
+    }
+    const PtNodeParams ptNodeParams = mNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos);
+    const ProbabilityEntry probabilityEntry =
+            mBuffers.get()->getProbabilityDictContent()->getProbabilityEntry(
+                    ptNodeParams.getTerminalId());
+    // Fetch shortcut information.
+    std::vector<std::vector<int> > shortcutTargets;
+    std::vector<int> shortcutProbabilities;
+    if (ptNodeParams.hasShortcutTargets()) {
+        int shortcutTarget[MAX_WORD_LENGTH];
+        const ShortcutDictContent *const shortcutDictContent =
+                mBuffers.get()->getShortcutDictContent();
+        bool hasNext = true;
+        int shortcutPos = getShortcutPositionOfPtNode(ptNodePos);
+        while (hasNext) {
+            int shortcutTargetLength = 0;
+            int shortcutProbability = NOT_A_PROBABILITY;
+            shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget,
+                    &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos);
+            std::vector<int> target(shortcutTarget, shortcutTarget + shortcutTargetLength);
+            shortcutTargets.push_back(target);
+            shortcutProbabilities.push_back(shortcutProbability);
+        }
+    }
+    return UnigramProperty(ptNodeParams.getCodePoints(), ptNodeParams.getCodePointCount(),
+            ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.hasBigrams(),
+            ptNodeParams.hasShortcutTargets(), ptNodeParams.getProbability(),
+            probabilityEntry.getTimeStamp(), probabilityEntry.getLevel(),
+            probabilityEntry.getCount(), &shortcutTargets, &shortcutProbabilities);
+}
+
 } // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h
index 78f3a553d..db4e8d21c 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h
@@ -107,6 +107,9 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
     void getProperty(const char *const query, const int queryLength, char *const outResult,
             const int maxResultLength);
 
+    const UnigramProperty getUnigramProperty(const int *const codePoints,
+            const int codePointCount) const;
+
  private:
     DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTriePolicy);
 
diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java
index 9cccdedda..03a302b8f 100644
--- a/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java
+++ b/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java
@@ -24,6 +24,7 @@ import android.util.Pair;
 import com.android.inputmethod.latin.BinaryDictionary.LanguageModelParam;
 import com.android.inputmethod.latin.makedict.CodePointUtils;
 import com.android.inputmethod.latin.makedict.FormatSpec;
+import com.android.inputmethod.latin.utils.UnigramProperty;
 
 import java.io.File;
 import java.io.IOException;
@@ -824,4 +825,52 @@ public class BinaryDictionaryTests extends AndroidTestCase {
             assertEquals(probability, binaryDictionary.getBigramProbability(word0, word1));
         }
     }
+
+    public void testGetUnigramProperties() {
+        testGetUnigramProperties(4 /* formatVersion */);
+    }
+
+    private void testGetUnigramProperties(final int formatVersion) {
+        final long seed = System.currentTimeMillis();
+        final Random random = new Random(seed);
+        final int ITERATION_COUNT = 1000;
+        final int codePointSetSize = 20;
+        final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
+
+        File dictFile = null;
+        try {
+            dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
+        } catch (IOException e) {
+            fail("IOException while writing an initial dictionary : " + e);
+        }
+        final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
+                0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
+                Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
+
+        final UnigramProperty invalidUnigramProperty =
+                binaryDictionary.getUnigramProperty("dummyWord");
+        assertFalse(invalidUnigramProperty.isValid());
+
+        for (int i = 0; i < ITERATION_COUNT; i++) {
+            final String word = CodePointUtils.generateWord(random, codePointSet);
+            final int unigramProbability = random.nextInt(0xFF);
+            final boolean isNotAWord = random.nextBoolean();
+            final boolean isBlacklisted = random.nextBoolean();
+            // TODO: Add tests for shortcut.
+            // TODO: Add tests for historical info.
+            binaryDictionary.addUnigramWord(word, unigramProbability,
+                    null /* shortcutTarget */, BinaryDictionary.NOT_A_PROBABILITY,
+                    isNotAWord, isBlacklisted, BinaryDictionary.NOT_A_VALID_TIMESTAMP);
+            final UnigramProperty unigramProperty =
+                    binaryDictionary.getUnigramProperty(word);
+            assertEquals(word, unigramProperty.mCodePoints);
+            assertTrue(unigramProperty.isValid());
+            assertEquals(isNotAWord, unigramProperty.mIsNotAWord);
+            assertEquals(isBlacklisted, unigramProperty.mIsBlacklisted);
+            assertFalse(unigramProperty.mHasBigrams);
+            assertFalse(unigramProperty.mHasShortcuts);
+            assertEquals(unigramProbability, unigramProperty.mProbability);
+            assertTrue(unigramProperty.mShortcutTargets.isEmpty());
+        }
+    }
 }