Support dumping ngram entries.

Bug: 14425059
Change-Id: Ib03a0c3d166ed6f1e60c67127b28006d55143b6b
This commit is contained in:
Keisuke Kuroyanagi 2014-10-22 18:15:53 +09:00
parent 1249395563
commit b5ef884fbb
7 changed files with 77 additions and 35 deletions

View file

@ -87,7 +87,7 @@ public final class WordProperty implements Comparable<WordProperty> {
final boolean isPossiblyOffensive, final boolean hasBigram, final boolean hasShortcuts,
final boolean isBeginningOfSentence, final int[] probabilityInfo,
final ArrayList<int[][]> ngramPrevWordsArray,
final ArrayList<boolean[]> outNgramPrevWordIsBeginningOfSentenceArray,
final ArrayList<boolean[]> ngramPrevWordIsBeginningOfSentenceArray,
final ArrayList<int[]> ngramTargets, final ArrayList<int[]> ngramProbabilityInfo,
final ArrayList<int[]> shortcutTargets,
final ArrayList<Integer> shortcutProbabilities) {
@ -102,16 +102,22 @@ public final class WordProperty implements Comparable<WordProperty> {
mHasNgrams = hasBigram;
final int relatedNgramCount = ngramTargets.size();
final WordInfo currentWordInfo =
mIsBeginningOfSentence ? WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO
: new WordInfo(mWord);
final NgramContext ngramContext = new NgramContext(currentWordInfo);
for (int i = 0; i < relatedNgramCount; i++) {
final String ngramTargetString =
StringUtils.getStringFromNullTerminatedCodePointArray(ngramTargets.get(i));
final WeightedString ngramTarget = new WeightedString(ngramTargetString,
createProbabilityInfoFromArray(ngramProbabilityInfo.get(i)));
// TODO: Support n-gram.
final int[][] prevWords = ngramPrevWordsArray.get(i);
final boolean[] isBeginningOfSentenceArray =
ngramPrevWordIsBeginningOfSentenceArray.get(i);
final WordInfo[] wordInfoArray = new WordInfo[prevWords.length];
for (int j = 0; j < prevWords.length; j++) {
wordInfoArray[j] = isBeginningOfSentenceArray[j]
? WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO
: new WordInfo(StringUtils.getStringFromNullTerminatedCodePointArray(
prevWords[j]));
}
final NgramContext ngramContext = new NgramContext(wordInfoArray);
ngrams.add(new NgramProperty(ngramTarget, ngramContext));
}
mNgrams = ngrams.isEmpty() ? null : ngrams;
@ -126,6 +132,7 @@ public final class WordProperty implements Comparable<WordProperty> {
}
// TODO: Remove
@UsedForTesting
public ArrayList<WeightedString> getBigrams() {
if (null == mNgrams) {
return null;

View file

@ -17,6 +17,7 @@
package com.android.inputmethod.latin.utils;
import com.android.inputmethod.latin.makedict.DictionaryHeader;
import com.android.inputmethod.latin.makedict.NgramProperty;
import com.android.inputmethod.latin.makedict.ProbabilityInfo;
import com.android.inputmethod.latin.makedict.WeightedString;
import com.android.inputmethod.latin.makedict.WordProperty;
@ -26,6 +27,8 @@ import java.util.HashMap;
public class CombinedFormatUtils {
public static final String DICTIONARY_TAG = "dictionary";
public static final String BIGRAM_TAG = "bigram";
public static final String NGRAM_TAG = "ngram";
public static final String NGRAM_PREV_WORD_TAG = "prev_word";
public static final String SHORTCUT_TAG = "shortcut";
public static final String PROBABILITY_TAG = "f";
public static final String HISTORICAL_INFO_TAG = "historicalInfo";
@ -76,12 +79,19 @@ public class CombinedFormatUtils {
}
}
if (wordProperty.mHasNgrams) {
// TODO: Support ngram.
for (final WeightedString bigram : wordProperty.getBigrams()) {
builder.append(" " + BIGRAM_TAG + "=" + bigram.mWord);
for (final NgramProperty ngramProperty : wordProperty.mNgrams) {
builder.append(" " + NGRAM_TAG + "=" + ngramProperty.mTargetWord.mWord);
builder.append(",");
builder.append(formatProbabilityInfo(bigram.mProbabilityInfo));
builder.append(formatProbabilityInfo(ngramProperty.mTargetWord.mProbabilityInfo));
builder.append("\n");
for (int i = 0; i < ngramProperty.mNgramContext.getPrevWordCount(); i++) {
builder.append(" " + NGRAM_PREV_WORD_TAG + "[" + i + "]="
+ ngramProperty.mNgramContext.getNthPrevWord(i + 1));
if (ngramProperty.mNgramContext.isNthPrevWordBeginningOfSontence(i + 1)) {
builder.append("," + BEGINNING_OF_SENTENCE_TAG + "=true");
}
builder.append("\n");
}
}
}
return builder.toString();

View file

@ -327,8 +327,8 @@ static jint latinime_BinaryDictionary_getNextWord(JNIEnv *env, jclass clazz,
static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz,
jlong dict, jintArray word, jboolean isBeginningOfSentence, jintArray outCodePoints,
jbooleanArray outFlags, jintArray outProbabilityInfo, jobject /* outNgramPrevWordsArray */,
jobject /* outNgramPrevWordIsBeginningOfSentenceArray */, jobject outNgramTargets,
jbooleanArray outFlags, jintArray outProbabilityInfo, jobject outNgramPrevWordsArray,
jobject outNgramPrevWordIsBeginningOfSentenceArray, jobject outNgramTargets,
jobject outNgramProbabilityInfo, jobject outShortcutTargets,
jobject outShortcutProbabilities) {
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
@ -352,6 +352,7 @@ static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz,
const WordProperty wordProperty = dictionary->getWordProperty(
CodePointArrayView(wordCodePoints, codePointCount));
wordProperty.outputProperties(env, outCodePoints, outFlags, outProbabilityInfo,
outNgramPrevWordsArray, outNgramPrevWordIsBeginningOfSentenceArray,
outNgramTargets, outNgramProbabilityInfo, outShortcutTargets,
outShortcutProbabilities);
}

View file

@ -22,8 +22,9 @@
namespace latinime {
void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints,
jbooleanArray outFlags, jintArray outProbabilityInfo, jobject outBigramTargets,
jobject outBigramProbabilities, jobject outShortcutTargets,
jbooleanArray outFlags, jintArray outProbabilityInfo,
jobject outNgramPrevWordsArray, jobject outNgramPrevWordIsBeginningOfSentenceArray,
jobject outNgramTargets, jobject outNgramProbabilities, jobject outShortcutTargets,
jobject outShortcutProbabilities) const {
JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */,
MAX_WORD_LENGTH /* maxLength */, mCodePoints.data(), mCodePoints.size(),
@ -43,16 +44,39 @@ void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints,
jclass arrayListClass = env->FindClass("java/util/ArrayList");
jmethodID addMethodId = env->GetMethodID(arrayListClass, "add", "(Ljava/lang/Object;)Z");
// Output bigrams.
// TODO: Support n-gram
// Output ngrams.
jclass intArrayClass = env->FindClass("[I");
for (const auto &ngramProperty : mNgrams) {
const std::vector<int> *const word1CodePoints = ngramProperty.getTargetCodePoints();
jintArray bigramWord1CodePointArray = env->NewIntArray(word1CodePoints->size());
JniDataUtils::outputCodePoints(env, bigramWord1CodePointArray, 0 /* start */,
word1CodePoints->size(), word1CodePoints->data(), word1CodePoints->size(),
false /* needsNullTermination */);
env->CallBooleanMethod(outBigramTargets, addMethodId, bigramWord1CodePointArray);
env->DeleteLocalRef(bigramWord1CodePointArray);
const NgramContext *const ngramContext = ngramProperty.getNgramContext();
jobjectArray prevWordWordCodePointsArray = env->NewObjectArray(
ngramContext->getPrevWordCount(), intArrayClass, nullptr);
jbooleanArray prevWordIsBeginningOfSentenceArray =
env->NewBooleanArray(ngramContext->getPrevWordCount());
for (size_t i = 0; i < ngramContext->getPrevWordCount(); ++i) {
const CodePointArrayView codePoints = ngramContext->getNthPrevWordCodePoints(i + 1);
jintArray prevWordCodePoints = env->NewIntArray(codePoints.size());
JniDataUtils::outputCodePoints(env, prevWordCodePoints, 0 /* start */,
codePoints.size(), codePoints.data(), codePoints.size(),
false /* needsNullTermination */);
env->SetObjectArrayElement(prevWordWordCodePointsArray, i, prevWordCodePoints);
env->DeleteLocalRef(prevWordCodePoints);
JniDataUtils::putBooleanToArray(env, prevWordIsBeginningOfSentenceArray, i,
ngramContext->isNthPrevWordBeginningOfSentence(i + 1));
}
env->CallBooleanMethod(outNgramPrevWordsArray, addMethodId, prevWordWordCodePointsArray);
env->CallBooleanMethod(outNgramPrevWordIsBeginningOfSentenceArray, addMethodId,
prevWordIsBeginningOfSentenceArray);
env->DeleteLocalRef(prevWordWordCodePointsArray);
env->DeleteLocalRef(prevWordIsBeginningOfSentenceArray);
const std::vector<int> *const targetWordCodePoints = ngramProperty.getTargetCodePoints();
jintArray targetWordCodePointArray = env->NewIntArray(targetWordCodePoints->size());
JniDataUtils::outputCodePoints(env, targetWordCodePointArray, 0 /* start */,
targetWordCodePoints->size(), targetWordCodePoints->data(),
targetWordCodePoints->size(), false /* needsNullTermination */);
env->CallBooleanMethod(outNgramTargets, addMethodId, targetWordCodePointArray);
env->DeleteLocalRef(targetWordCodePointArray);
const HistoricalInfo &ngramHistoricalInfo = ngramProperty.getHistoricalInfo();
int bigramProbabilityInfo[] = {ngramProperty.getProbability(),
ngramHistoricalInfo.getTimestamp(), ngramHistoricalInfo.getLevel(),
@ -60,7 +84,7 @@ void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints,
jintArray bigramProbabilityInfoArray = env->NewIntArray(NELEMS(bigramProbabilityInfo));
env->SetIntArrayRegion(bigramProbabilityInfoArray, 0 /* start */,
NELEMS(bigramProbabilityInfo), bigramProbabilityInfo);
env->CallBooleanMethod(outBigramProbabilities, addMethodId, bigramProbabilityInfoArray);
env->CallBooleanMethod(outNgramProbabilities, addMethodId, bigramProbabilityInfoArray);
env->DeleteLocalRef(bigramProbabilityInfoArray);
}

View file

@ -39,8 +39,10 @@ class WordProperty {
mNgrams(*ngrams) {}
void outputProperties(JNIEnv *const env, jintArray outCodePoints, jbooleanArray outFlags,
jintArray outProbabilityInfo, jobject outBigramTargets, jobject outBigramProbabilities,
jobject outShortcutTargets, jobject outShortcutProbabilities) const;
jintArray outProbabilityInfo, jobject outNgramPrevWordsArray,
jobject outNgramPrevWordIsBeginningOfSentenceArray, jobject outNgramTargets,
jobject outNgramProbabilities, jobject outShortcutTargets,
jobject outShortcutProbabilities) const;
const UnigramProperty *getUnigramProperty() const {
return &mUnigramProperty;

View file

@ -90,8 +90,8 @@ const WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArr
// TODO: Some flags in unigramProbabilityEntry should be overwritten by flags in
// probabilityEntry.
const ProbabilityEntry unigramProbabilityEntry = getProbabilityEntry(wordId);
return WordAttributes(probability, unigramProbabilityEntry.isNotAWord(),
unigramProbabilityEntry.isBlacklisted(),
return WordAttributes(probability, unigramProbabilityEntry.isBlacklisted(),
unigramProbabilityEntry.isNotAWord(),
unigramProbabilityEntry.isPossiblyOffensive());
}
// Cannot find the word.

View file

@ -488,9 +488,6 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
AKLOGE("getWordProperty is called for invalid word.");
return WordProperty();
}
const int ptNodePos =
mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
const LanguageModelDictContent *const languageModelDictContent =
mBuffers->getLanguageModelDictContent();
// Fetch ngram information.
@ -541,12 +538,13 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
shortcutProbability);
}
}
const ProbabilityEntry probabilityEntry = languageModelDictContent->getProbabilityEntry(
ptNodeParams.getTerminalId());
const WordAttributes wordAttributes = languageModelDictContent->getWordAttributes(
WordIdArrayView(), wordId, mHeaderPolicy);
const ProbabilityEntry probabilityEntry = languageModelDictContent->getProbabilityEntry(wordId);
const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(),
probabilityEntry.isNotAWord(), probabilityEntry.isBlacklisted(),
probabilityEntry.isPossiblyOffensive(), probabilityEntry.getProbability(),
wordAttributes.isNotAWord(), wordAttributes.isBlacklisted(),
wordAttributes.isPossiblyOffensive(), wordAttributes.getProbability(),
*historicalInfo, std::move(shortcuts));
return WordProperty(wordCodePoints.toVector(), &unigramProperty, &ngrams);
}