Merge "Support dumping ngram entries."

This commit is contained in:
Keisuke Kuroyanagi 2014-10-22 09:38:17 +00:00 committed by Android (Google) Code Review
commit 010909d443
7 changed files with 77 additions and 35 deletions

View file

@ -87,7 +87,7 @@ public final class WordProperty implements Comparable<WordProperty> {
final boolean isPossiblyOffensive, final boolean hasBigram, final boolean hasShortcuts, final boolean isPossiblyOffensive, final boolean hasBigram, final boolean hasShortcuts,
final boolean isBeginningOfSentence, final int[] probabilityInfo, final boolean isBeginningOfSentence, final int[] probabilityInfo,
final ArrayList<int[][]> ngramPrevWordsArray, final ArrayList<int[][]> ngramPrevWordsArray,
final ArrayList<boolean[]> outNgramPrevWordIsBeginningOfSentenceArray, final ArrayList<boolean[]> ngramPrevWordIsBeginningOfSentenceArray,
final ArrayList<int[]> ngramTargets, final ArrayList<int[]> ngramProbabilityInfo, final ArrayList<int[]> ngramTargets, final ArrayList<int[]> ngramProbabilityInfo,
final ArrayList<int[]> shortcutTargets, final ArrayList<int[]> shortcutTargets,
final ArrayList<Integer> shortcutProbabilities) { final ArrayList<Integer> shortcutProbabilities) {
@ -102,16 +102,22 @@ public final class WordProperty implements Comparable<WordProperty> {
mHasNgrams = hasBigram; mHasNgrams = hasBigram;
final int relatedNgramCount = ngramTargets.size(); final int relatedNgramCount = ngramTargets.size();
final WordInfo currentWordInfo =
mIsBeginningOfSentence ? WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO
: new WordInfo(mWord);
final NgramContext ngramContext = new NgramContext(currentWordInfo);
for (int i = 0; i < relatedNgramCount; i++) { for (int i = 0; i < relatedNgramCount; i++) {
final String ngramTargetString = final String ngramTargetString =
StringUtils.getStringFromNullTerminatedCodePointArray(ngramTargets.get(i)); StringUtils.getStringFromNullTerminatedCodePointArray(ngramTargets.get(i));
final WeightedString ngramTarget = new WeightedString(ngramTargetString, final WeightedString ngramTarget = new WeightedString(ngramTargetString,
createProbabilityInfoFromArray(ngramProbabilityInfo.get(i))); createProbabilityInfoFromArray(ngramProbabilityInfo.get(i)));
// TODO: Support n-gram. final int[][] prevWords = ngramPrevWordsArray.get(i);
final boolean[] isBeginningOfSentenceArray =
ngramPrevWordIsBeginningOfSentenceArray.get(i);
final WordInfo[] wordInfoArray = new WordInfo[prevWords.length];
for (int j = 0; j < prevWords.length; j++) {
wordInfoArray[j] = isBeginningOfSentenceArray[j]
? WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO
: new WordInfo(StringUtils.getStringFromNullTerminatedCodePointArray(
prevWords[j]));
}
final NgramContext ngramContext = new NgramContext(wordInfoArray);
ngrams.add(new NgramProperty(ngramTarget, ngramContext)); ngrams.add(new NgramProperty(ngramTarget, ngramContext));
} }
mNgrams = ngrams.isEmpty() ? null : ngrams; mNgrams = ngrams.isEmpty() ? null : ngrams;
@ -126,6 +132,7 @@ public final class WordProperty implements Comparable<WordProperty> {
} }
// TODO: Remove // TODO: Remove
@UsedForTesting
public ArrayList<WeightedString> getBigrams() { public ArrayList<WeightedString> getBigrams() {
if (null == mNgrams) { if (null == mNgrams) {
return null; return null;

View file

@ -17,6 +17,7 @@
package com.android.inputmethod.latin.utils; package com.android.inputmethod.latin.utils;
import com.android.inputmethod.latin.makedict.DictionaryHeader; import com.android.inputmethod.latin.makedict.DictionaryHeader;
import com.android.inputmethod.latin.makedict.NgramProperty;
import com.android.inputmethod.latin.makedict.ProbabilityInfo; import com.android.inputmethod.latin.makedict.ProbabilityInfo;
import com.android.inputmethod.latin.makedict.WeightedString; import com.android.inputmethod.latin.makedict.WeightedString;
import com.android.inputmethod.latin.makedict.WordProperty; import com.android.inputmethod.latin.makedict.WordProperty;
@ -26,6 +27,8 @@ import java.util.HashMap;
public class CombinedFormatUtils { public class CombinedFormatUtils {
public static final String DICTIONARY_TAG = "dictionary"; public static final String DICTIONARY_TAG = "dictionary";
public static final String BIGRAM_TAG = "bigram"; public static final String BIGRAM_TAG = "bigram";
public static final String NGRAM_TAG = "ngram";
public static final String NGRAM_PREV_WORD_TAG = "prev_word";
public static final String SHORTCUT_TAG = "shortcut"; public static final String SHORTCUT_TAG = "shortcut";
public static final String PROBABILITY_TAG = "f"; public static final String PROBABILITY_TAG = "f";
public static final String HISTORICAL_INFO_TAG = "historicalInfo"; public static final String HISTORICAL_INFO_TAG = "historicalInfo";
@ -76,12 +79,19 @@ public class CombinedFormatUtils {
} }
} }
if (wordProperty.mHasNgrams) { if (wordProperty.mHasNgrams) {
// TODO: Support ngram. for (final NgramProperty ngramProperty : wordProperty.mNgrams) {
for (final WeightedString bigram : wordProperty.getBigrams()) { builder.append(" " + NGRAM_TAG + "=" + ngramProperty.mTargetWord.mWord);
builder.append(" " + BIGRAM_TAG + "=" + bigram.mWord);
builder.append(","); builder.append(",");
builder.append(formatProbabilityInfo(bigram.mProbabilityInfo)); builder.append(formatProbabilityInfo(ngramProperty.mTargetWord.mProbabilityInfo));
builder.append("\n"); builder.append("\n");
for (int i = 0; i < ngramProperty.mNgramContext.getPrevWordCount(); i++) {
builder.append(" " + NGRAM_PREV_WORD_TAG + "[" + i + "]="
+ ngramProperty.mNgramContext.getNthPrevWord(i + 1));
if (ngramProperty.mNgramContext.isNthPrevWordBeginningOfSontence(i + 1)) {
builder.append("," + BEGINNING_OF_SENTENCE_TAG + "=true");
}
builder.append("\n");
}
} }
} }
return builder.toString(); return builder.toString();

View file

@ -327,8 +327,8 @@ static jint latinime_BinaryDictionary_getNextWord(JNIEnv *env, jclass clazz,
static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz, static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz,
jlong dict, jintArray word, jboolean isBeginningOfSentence, jintArray outCodePoints, jlong dict, jintArray word, jboolean isBeginningOfSentence, jintArray outCodePoints,
jbooleanArray outFlags, jintArray outProbabilityInfo, jobject /* outNgramPrevWordsArray */, jbooleanArray outFlags, jintArray outProbabilityInfo, jobject outNgramPrevWordsArray,
jobject /* outNgramPrevWordIsBeginningOfSentenceArray */, jobject outNgramTargets, jobject outNgramPrevWordIsBeginningOfSentenceArray, jobject outNgramTargets,
jobject outNgramProbabilityInfo, jobject outShortcutTargets, jobject outNgramProbabilityInfo, jobject outShortcutTargets,
jobject outShortcutProbabilities) { jobject outShortcutProbabilities) {
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict); Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
@ -352,6 +352,7 @@ static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz,
const WordProperty wordProperty = dictionary->getWordProperty( const WordProperty wordProperty = dictionary->getWordProperty(
CodePointArrayView(wordCodePoints, codePointCount)); CodePointArrayView(wordCodePoints, codePointCount));
wordProperty.outputProperties(env, outCodePoints, outFlags, outProbabilityInfo, wordProperty.outputProperties(env, outCodePoints, outFlags, outProbabilityInfo,
outNgramPrevWordsArray, outNgramPrevWordIsBeginningOfSentenceArray,
outNgramTargets, outNgramProbabilityInfo, outShortcutTargets, outNgramTargets, outNgramProbabilityInfo, outShortcutTargets,
outShortcutProbabilities); outShortcutProbabilities);
} }

View file

@ -22,8 +22,9 @@
namespace latinime { namespace latinime {
void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints, void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints,
jbooleanArray outFlags, jintArray outProbabilityInfo, jobject outBigramTargets, jbooleanArray outFlags, jintArray outProbabilityInfo,
jobject outBigramProbabilities, jobject outShortcutTargets, jobject outNgramPrevWordsArray, jobject outNgramPrevWordIsBeginningOfSentenceArray,
jobject outNgramTargets, jobject outNgramProbabilities, jobject outShortcutTargets,
jobject outShortcutProbabilities) const { jobject outShortcutProbabilities) const {
JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */, JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */,
MAX_WORD_LENGTH /* maxLength */, mCodePoints.data(), mCodePoints.size(), MAX_WORD_LENGTH /* maxLength */, mCodePoints.data(), mCodePoints.size(),
@ -43,16 +44,39 @@ void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints,
jclass arrayListClass = env->FindClass("java/util/ArrayList"); jclass arrayListClass = env->FindClass("java/util/ArrayList");
jmethodID addMethodId = env->GetMethodID(arrayListClass, "add", "(Ljava/lang/Object;)Z"); jmethodID addMethodId = env->GetMethodID(arrayListClass, "add", "(Ljava/lang/Object;)Z");
// Output bigrams. // Output ngrams.
// TODO: Support n-gram jclass intArrayClass = env->FindClass("[I");
for (const auto &ngramProperty : mNgrams) { for (const auto &ngramProperty : mNgrams) {
const std::vector<int> *const word1CodePoints = ngramProperty.getTargetCodePoints(); const NgramContext *const ngramContext = ngramProperty.getNgramContext();
jintArray bigramWord1CodePointArray = env->NewIntArray(word1CodePoints->size()); jobjectArray prevWordWordCodePointsArray = env->NewObjectArray(
JniDataUtils::outputCodePoints(env, bigramWord1CodePointArray, 0 /* start */, ngramContext->getPrevWordCount(), intArrayClass, nullptr);
word1CodePoints->size(), word1CodePoints->data(), word1CodePoints->size(), jbooleanArray prevWordIsBeginningOfSentenceArray =
env->NewBooleanArray(ngramContext->getPrevWordCount());
for (size_t i = 0; i < ngramContext->getPrevWordCount(); ++i) {
const CodePointArrayView codePoints = ngramContext->getNthPrevWordCodePoints(i + 1);
jintArray prevWordCodePoints = env->NewIntArray(codePoints.size());
JniDataUtils::outputCodePoints(env, prevWordCodePoints, 0 /* start */,
codePoints.size(), codePoints.data(), codePoints.size(),
false /* needsNullTermination */); false /* needsNullTermination */);
env->CallBooleanMethod(outBigramTargets, addMethodId, bigramWord1CodePointArray); env->SetObjectArrayElement(prevWordWordCodePointsArray, i, prevWordCodePoints);
env->DeleteLocalRef(bigramWord1CodePointArray); env->DeleteLocalRef(prevWordCodePoints);
JniDataUtils::putBooleanToArray(env, prevWordIsBeginningOfSentenceArray, i,
ngramContext->isNthPrevWordBeginningOfSentence(i + 1));
}
env->CallBooleanMethod(outNgramPrevWordsArray, addMethodId, prevWordWordCodePointsArray);
env->CallBooleanMethod(outNgramPrevWordIsBeginningOfSentenceArray, addMethodId,
prevWordIsBeginningOfSentenceArray);
env->DeleteLocalRef(prevWordWordCodePointsArray);
env->DeleteLocalRef(prevWordIsBeginningOfSentenceArray);
const std::vector<int> *const targetWordCodePoints = ngramProperty.getTargetCodePoints();
jintArray targetWordCodePointArray = env->NewIntArray(targetWordCodePoints->size());
JniDataUtils::outputCodePoints(env, targetWordCodePointArray, 0 /* start */,
targetWordCodePoints->size(), targetWordCodePoints->data(),
targetWordCodePoints->size(), false /* needsNullTermination */);
env->CallBooleanMethod(outNgramTargets, addMethodId, targetWordCodePointArray);
env->DeleteLocalRef(targetWordCodePointArray);
const HistoricalInfo &ngramHistoricalInfo = ngramProperty.getHistoricalInfo(); const HistoricalInfo &ngramHistoricalInfo = ngramProperty.getHistoricalInfo();
int bigramProbabilityInfo[] = {ngramProperty.getProbability(), int bigramProbabilityInfo[] = {ngramProperty.getProbability(),
ngramHistoricalInfo.getTimestamp(), ngramHistoricalInfo.getLevel(), ngramHistoricalInfo.getTimestamp(), ngramHistoricalInfo.getLevel(),
@ -60,7 +84,7 @@ void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints,
jintArray bigramProbabilityInfoArray = env->NewIntArray(NELEMS(bigramProbabilityInfo)); jintArray bigramProbabilityInfoArray = env->NewIntArray(NELEMS(bigramProbabilityInfo));
env->SetIntArrayRegion(bigramProbabilityInfoArray, 0 /* start */, env->SetIntArrayRegion(bigramProbabilityInfoArray, 0 /* start */,
NELEMS(bigramProbabilityInfo), bigramProbabilityInfo); NELEMS(bigramProbabilityInfo), bigramProbabilityInfo);
env->CallBooleanMethod(outBigramProbabilities, addMethodId, bigramProbabilityInfoArray); env->CallBooleanMethod(outNgramProbabilities, addMethodId, bigramProbabilityInfoArray);
env->DeleteLocalRef(bigramProbabilityInfoArray); env->DeleteLocalRef(bigramProbabilityInfoArray);
} }

View file

@ -39,8 +39,10 @@ class WordProperty {
mNgrams(*ngrams) {} mNgrams(*ngrams) {}
void outputProperties(JNIEnv *const env, jintArray outCodePoints, jbooleanArray outFlags, void outputProperties(JNIEnv *const env, jintArray outCodePoints, jbooleanArray outFlags,
jintArray outProbabilityInfo, jobject outBigramTargets, jobject outBigramProbabilities, jintArray outProbabilityInfo, jobject outNgramPrevWordsArray,
jobject outShortcutTargets, jobject outShortcutProbabilities) const; jobject outNgramPrevWordIsBeginningOfSentenceArray, jobject outNgramTargets,
jobject outNgramProbabilities, jobject outShortcutTargets,
jobject outShortcutProbabilities) const;
const UnigramProperty *getUnigramProperty() const { const UnigramProperty *getUnigramProperty() const {
return &mUnigramProperty; return &mUnigramProperty;

View file

@ -90,8 +90,8 @@ const WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArr
// TODO: Some flags in unigramProbabilityEntry should be overwritten by flags in // TODO: Some flags in unigramProbabilityEntry should be overwritten by flags in
// probabilityEntry. // probabilityEntry.
const ProbabilityEntry unigramProbabilityEntry = getProbabilityEntry(wordId); const ProbabilityEntry unigramProbabilityEntry = getProbabilityEntry(wordId);
return WordAttributes(probability, unigramProbabilityEntry.isNotAWord(), return WordAttributes(probability, unigramProbabilityEntry.isBlacklisted(),
unigramProbabilityEntry.isBlacklisted(), unigramProbabilityEntry.isNotAWord(),
unigramProbabilityEntry.isPossiblyOffensive()); unigramProbabilityEntry.isPossiblyOffensive());
} }
// Cannot find the word. // Cannot find the word.

View file

@ -488,9 +488,6 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
AKLOGE("getWordProperty is called for invalid word."); AKLOGE("getWordProperty is called for invalid word.");
return WordProperty(); return WordProperty();
} }
const int ptNodePos =
mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
const LanguageModelDictContent *const languageModelDictContent = const LanguageModelDictContent *const languageModelDictContent =
mBuffers->getLanguageModelDictContent(); mBuffers->getLanguageModelDictContent();
// Fetch ngram information. // Fetch ngram information.
@ -541,12 +538,13 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
shortcutProbability); shortcutProbability);
} }
} }
const ProbabilityEntry probabilityEntry = languageModelDictContent->getProbabilityEntry( const WordAttributes wordAttributes = languageModelDictContent->getWordAttributes(
ptNodeParams.getTerminalId()); WordIdArrayView(), wordId, mHeaderPolicy);
const ProbabilityEntry probabilityEntry = languageModelDictContent->getProbabilityEntry(wordId);
const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(), const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(),
probabilityEntry.isNotAWord(), probabilityEntry.isBlacklisted(), wordAttributes.isNotAWord(), wordAttributes.isBlacklisted(),
probabilityEntry.isPossiblyOffensive(), probabilityEntry.getProbability(), wordAttributes.isPossiblyOffensive(), wordAttributes.getProbability(),
*historicalInfo, std::move(shortcuts)); *historicalInfo, std::move(shortcuts));
return WordProperty(wordCodePoints.toVector(), &unigramProperty, &ngrams); return WordProperty(wordCodePoints.toVector(), &unigramProperty, &ngrams);
} }