From b5ef884fbb6bfd08ce793604cdf7f0941e958a84 Mon Sep 17 00:00:00 2001 From: Keisuke Kuroyanagi Date: Wed, 22 Oct 2014 18:15:53 +0900 Subject: [PATCH] Support dumping ngram entries. Bug: 14425059 Change-Id: Ib03a0c3d166ed6f1e60c67127b28006d55143b6b --- .../latin/makedict/WordProperty.java | 19 +++++--- .../latin/utils/CombinedFormatUtils.java | 18 +++++-- ...oid_inputmethod_latin_BinaryDictionary.cpp | 5 +- .../dictionary/property/word_property.cpp | 48 ++++++++++++++----- .../core/dictionary/property/word_property.h | 6 ++- .../content/language_model_dict_content.cpp | 4 +- .../v4/ver4_patricia_trie_policy.cpp | 12 ++--- 7 files changed, 77 insertions(+), 35 deletions(-) diff --git a/java/src/com/android/inputmethod/latin/makedict/WordProperty.java b/java/src/com/android/inputmethod/latin/makedict/WordProperty.java index b129c3e40..e7808e46e 100644 --- a/java/src/com/android/inputmethod/latin/makedict/WordProperty.java +++ b/java/src/com/android/inputmethod/latin/makedict/WordProperty.java @@ -87,7 +87,7 @@ public final class WordProperty implements Comparable { final boolean isPossiblyOffensive, final boolean hasBigram, final boolean hasShortcuts, final boolean isBeginningOfSentence, final int[] probabilityInfo, final ArrayList ngramPrevWordsArray, - final ArrayList outNgramPrevWordIsBeginningOfSentenceArray, + final ArrayList ngramPrevWordIsBeginningOfSentenceArray, final ArrayList ngramTargets, final ArrayList ngramProbabilityInfo, final ArrayList shortcutTargets, final ArrayList shortcutProbabilities) { @@ -102,16 +102,22 @@ public final class WordProperty implements Comparable { mHasNgrams = hasBigram; final int relatedNgramCount = ngramTargets.size(); - final WordInfo currentWordInfo = - mIsBeginningOfSentence ? WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO - : new WordInfo(mWord); - final NgramContext ngramContext = new NgramContext(currentWordInfo); for (int i = 0; i < relatedNgramCount; i++) { final String ngramTargetString = StringUtils.getStringFromNullTerminatedCodePointArray(ngramTargets.get(i)); final WeightedString ngramTarget = new WeightedString(ngramTargetString, createProbabilityInfoFromArray(ngramProbabilityInfo.get(i))); - // TODO: Support n-gram. + final int[][] prevWords = ngramPrevWordsArray.get(i); + final boolean[] isBeginningOfSentenceArray = + ngramPrevWordIsBeginningOfSentenceArray.get(i); + final WordInfo[] wordInfoArray = new WordInfo[prevWords.length]; + for (int j = 0; j < prevWords.length; j++) { + wordInfoArray[j] = isBeginningOfSentenceArray[j] + ? WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO + : new WordInfo(StringUtils.getStringFromNullTerminatedCodePointArray( + prevWords[j])); + } + final NgramContext ngramContext = new NgramContext(wordInfoArray); ngrams.add(new NgramProperty(ngramTarget, ngramContext)); } mNgrams = ngrams.isEmpty() ? null : ngrams; @@ -126,6 +132,7 @@ public final class WordProperty implements Comparable { } // TODO: Remove + @UsedForTesting public ArrayList getBigrams() { if (null == mNgrams) { return null; diff --git a/java/src/com/android/inputmethod/latin/utils/CombinedFormatUtils.java b/java/src/com/android/inputmethod/latin/utils/CombinedFormatUtils.java index 248246232..4e0f5f583 100644 --- a/java/src/com/android/inputmethod/latin/utils/CombinedFormatUtils.java +++ b/java/src/com/android/inputmethod/latin/utils/CombinedFormatUtils.java @@ -17,6 +17,7 @@ package com.android.inputmethod.latin.utils; import com.android.inputmethod.latin.makedict.DictionaryHeader; +import com.android.inputmethod.latin.makedict.NgramProperty; import com.android.inputmethod.latin.makedict.ProbabilityInfo; import com.android.inputmethod.latin.makedict.WeightedString; import com.android.inputmethod.latin.makedict.WordProperty; @@ -26,6 +27,8 @@ import java.util.HashMap; public class CombinedFormatUtils { public static final String DICTIONARY_TAG = "dictionary"; public static final String BIGRAM_TAG = "bigram"; + public static final String NGRAM_TAG = "ngram"; + public static final String NGRAM_PREV_WORD_TAG = "prev_word"; public static final String SHORTCUT_TAG = "shortcut"; public static final String PROBABILITY_TAG = "f"; public static final String HISTORICAL_INFO_TAG = "historicalInfo"; @@ -76,12 +79,19 @@ public class CombinedFormatUtils { } } if (wordProperty.mHasNgrams) { - // TODO: Support ngram. - for (final WeightedString bigram : wordProperty.getBigrams()) { - builder.append(" " + BIGRAM_TAG + "=" + bigram.mWord); + for (final NgramProperty ngramProperty : wordProperty.mNgrams) { + builder.append(" " + NGRAM_TAG + "=" + ngramProperty.mTargetWord.mWord); builder.append(","); - builder.append(formatProbabilityInfo(bigram.mProbabilityInfo)); + builder.append(formatProbabilityInfo(ngramProperty.mTargetWord.mProbabilityInfo)); builder.append("\n"); + for (int i = 0; i < ngramProperty.mNgramContext.getPrevWordCount(); i++) { + builder.append(" " + NGRAM_PREV_WORD_TAG + "[" + i + "]=" + + ngramProperty.mNgramContext.getNthPrevWord(i + 1)); + if (ngramProperty.mNgramContext.isNthPrevWordBeginningOfSontence(i + 1)) { + builder.append("," + BEGINNING_OF_SENTENCE_TAG + "=true"); + } + builder.append("\n"); + } } } return builder.toString(); diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp index 461d1d859..9239c8400 100644 --- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp +++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp @@ -327,8 +327,8 @@ static jint latinime_BinaryDictionary_getNextWord(JNIEnv *env, jclass clazz, static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz, jlong dict, jintArray word, jboolean isBeginningOfSentence, jintArray outCodePoints, - jbooleanArray outFlags, jintArray outProbabilityInfo, jobject /* outNgramPrevWordsArray */, - jobject /* outNgramPrevWordIsBeginningOfSentenceArray */, jobject outNgramTargets, + jbooleanArray outFlags, jintArray outProbabilityInfo, jobject outNgramPrevWordsArray, + jobject outNgramPrevWordIsBeginningOfSentenceArray, jobject outNgramTargets, jobject outNgramProbabilityInfo, jobject outShortcutTargets, jobject outShortcutProbabilities) { Dictionary *dictionary = reinterpret_cast(dict); @@ -352,6 +352,7 @@ static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz, const WordProperty wordProperty = dictionary->getWordProperty( CodePointArrayView(wordCodePoints, codePointCount)); wordProperty.outputProperties(env, outCodePoints, outFlags, outProbabilityInfo, + outNgramPrevWordsArray, outNgramPrevWordIsBeginningOfSentenceArray, outNgramTargets, outNgramProbabilityInfo, outShortcutTargets, outShortcutProbabilities); } diff --git a/native/jni/src/suggest/core/dictionary/property/word_property.cpp b/native/jni/src/suggest/core/dictionary/property/word_property.cpp index a707f1ba2..019f0880f 100644 --- a/native/jni/src/suggest/core/dictionary/property/word_property.cpp +++ b/native/jni/src/suggest/core/dictionary/property/word_property.cpp @@ -22,8 +22,9 @@ namespace latinime { void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints, - jbooleanArray outFlags, jintArray outProbabilityInfo, jobject outBigramTargets, - jobject outBigramProbabilities, jobject outShortcutTargets, + jbooleanArray outFlags, jintArray outProbabilityInfo, + jobject outNgramPrevWordsArray, jobject outNgramPrevWordIsBeginningOfSentenceArray, + jobject outNgramTargets, jobject outNgramProbabilities, jobject outShortcutTargets, jobject outShortcutProbabilities) const { JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */, MAX_WORD_LENGTH /* maxLength */, mCodePoints.data(), mCodePoints.size(), @@ -43,16 +44,39 @@ void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints, jclass arrayListClass = env->FindClass("java/util/ArrayList"); jmethodID addMethodId = env->GetMethodID(arrayListClass, "add", "(Ljava/lang/Object;)Z"); - // Output bigrams. - // TODO: Support n-gram + // Output ngrams. + jclass intArrayClass = env->FindClass("[I"); for (const auto &ngramProperty : mNgrams) { - const std::vector *const word1CodePoints = ngramProperty.getTargetCodePoints(); - jintArray bigramWord1CodePointArray = env->NewIntArray(word1CodePoints->size()); - JniDataUtils::outputCodePoints(env, bigramWord1CodePointArray, 0 /* start */, - word1CodePoints->size(), word1CodePoints->data(), word1CodePoints->size(), - false /* needsNullTermination */); - env->CallBooleanMethod(outBigramTargets, addMethodId, bigramWord1CodePointArray); - env->DeleteLocalRef(bigramWord1CodePointArray); + const NgramContext *const ngramContext = ngramProperty.getNgramContext(); + jobjectArray prevWordWordCodePointsArray = env->NewObjectArray( + ngramContext->getPrevWordCount(), intArrayClass, nullptr); + jbooleanArray prevWordIsBeginningOfSentenceArray = + env->NewBooleanArray(ngramContext->getPrevWordCount()); + for (size_t i = 0; i < ngramContext->getPrevWordCount(); ++i) { + const CodePointArrayView codePoints = ngramContext->getNthPrevWordCodePoints(i + 1); + jintArray prevWordCodePoints = env->NewIntArray(codePoints.size()); + JniDataUtils::outputCodePoints(env, prevWordCodePoints, 0 /* start */, + codePoints.size(), codePoints.data(), codePoints.size(), + false /* needsNullTermination */); + env->SetObjectArrayElement(prevWordWordCodePointsArray, i, prevWordCodePoints); + env->DeleteLocalRef(prevWordCodePoints); + JniDataUtils::putBooleanToArray(env, prevWordIsBeginningOfSentenceArray, i, + ngramContext->isNthPrevWordBeginningOfSentence(i + 1)); + } + env->CallBooleanMethod(outNgramPrevWordsArray, addMethodId, prevWordWordCodePointsArray); + env->CallBooleanMethod(outNgramPrevWordIsBeginningOfSentenceArray, addMethodId, + prevWordIsBeginningOfSentenceArray); + env->DeleteLocalRef(prevWordWordCodePointsArray); + env->DeleteLocalRef(prevWordIsBeginningOfSentenceArray); + + const std::vector *const targetWordCodePoints = ngramProperty.getTargetCodePoints(); + jintArray targetWordCodePointArray = env->NewIntArray(targetWordCodePoints->size()); + JniDataUtils::outputCodePoints(env, targetWordCodePointArray, 0 /* start */, + targetWordCodePoints->size(), targetWordCodePoints->data(), + targetWordCodePoints->size(), false /* needsNullTermination */); + env->CallBooleanMethod(outNgramTargets, addMethodId, targetWordCodePointArray); + env->DeleteLocalRef(targetWordCodePointArray); + const HistoricalInfo &ngramHistoricalInfo = ngramProperty.getHistoricalInfo(); int bigramProbabilityInfo[] = {ngramProperty.getProbability(), ngramHistoricalInfo.getTimestamp(), ngramHistoricalInfo.getLevel(), @@ -60,7 +84,7 @@ void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints, jintArray bigramProbabilityInfoArray = env->NewIntArray(NELEMS(bigramProbabilityInfo)); env->SetIntArrayRegion(bigramProbabilityInfoArray, 0 /* start */, NELEMS(bigramProbabilityInfo), bigramProbabilityInfo); - env->CallBooleanMethod(outBigramProbabilities, addMethodId, bigramProbabilityInfoArray); + env->CallBooleanMethod(outNgramProbabilities, addMethodId, bigramProbabilityInfoArray); env->DeleteLocalRef(bigramProbabilityInfoArray); } diff --git a/native/jni/src/suggest/core/dictionary/property/word_property.h b/native/jni/src/suggest/core/dictionary/property/word_property.h index 01b8987b5..b5314faaa 100644 --- a/native/jni/src/suggest/core/dictionary/property/word_property.h +++ b/native/jni/src/suggest/core/dictionary/property/word_property.h @@ -39,8 +39,10 @@ class WordProperty { mNgrams(*ngrams) {} void outputProperties(JNIEnv *const env, jintArray outCodePoints, jbooleanArray outFlags, - jintArray outProbabilityInfo, jobject outBigramTargets, jobject outBigramProbabilities, - jobject outShortcutTargets, jobject outShortcutProbabilities) const; + jintArray outProbabilityInfo, jobject outNgramPrevWordsArray, + jobject outNgramPrevWordIsBeginningOfSentenceArray, jobject outNgramTargets, + jobject outNgramProbabilities, jobject outShortcutTargets, + jobject outShortcutProbabilities) const; const UnigramProperty *getUnigramProperty() const { return &mUnigramProperty; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp index b96290437..509bd683b 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp @@ -90,8 +90,8 @@ const WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArr // TODO: Some flags in unigramProbabilityEntry should be overwritten by flags in // probabilityEntry. const ProbabilityEntry unigramProbabilityEntry = getProbabilityEntry(wordId); - return WordAttributes(probability, unigramProbabilityEntry.isNotAWord(), - unigramProbabilityEntry.isBlacklisted(), + return WordAttributes(probability, unigramProbabilityEntry.isBlacklisted(), + unigramProbabilityEntry.isNotAWord(), unigramProbabilityEntry.isPossiblyOffensive()); } // Cannot find the word. diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp index 193326d82..249d822b2 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp @@ -488,9 +488,6 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty( AKLOGE("getWordProperty is called for invalid word."); return WordProperty(); } - const int ptNodePos = - mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); - const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); const LanguageModelDictContent *const languageModelDictContent = mBuffers->getLanguageModelDictContent(); // Fetch ngram information. @@ -541,12 +538,13 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty( shortcutProbability); } } - const ProbabilityEntry probabilityEntry = languageModelDictContent->getProbabilityEntry( - ptNodeParams.getTerminalId()); + const WordAttributes wordAttributes = languageModelDictContent->getWordAttributes( + WordIdArrayView(), wordId, mHeaderPolicy); + const ProbabilityEntry probabilityEntry = languageModelDictContent->getProbabilityEntry(wordId); const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(), - probabilityEntry.isNotAWord(), probabilityEntry.isBlacklisted(), - probabilityEntry.isPossiblyOffensive(), probabilityEntry.getProbability(), + wordAttributes.isNotAWord(), wordAttributes.isBlacklisted(), + wordAttributes.isPossiblyOffensive(), wordAttributes.getProbability(), *historicalInfo, std::move(shortcuts)); return WordProperty(wordCodePoints.toVector(), &unigramProperty, &ngrams); }