Merge "Support dumping ngram entries."
This commit is contained in:
commit
010909d443
7 changed files with 77 additions and 35 deletions
|
@ -87,7 +87,7 @@ public final class WordProperty implements Comparable<WordProperty> {
|
||||||
final boolean isPossiblyOffensive, final boolean hasBigram, final boolean hasShortcuts,
|
final boolean isPossiblyOffensive, final boolean hasBigram, final boolean hasShortcuts,
|
||||||
final boolean isBeginningOfSentence, final int[] probabilityInfo,
|
final boolean isBeginningOfSentence, final int[] probabilityInfo,
|
||||||
final ArrayList<int[][]> ngramPrevWordsArray,
|
final ArrayList<int[][]> ngramPrevWordsArray,
|
||||||
final ArrayList<boolean[]> outNgramPrevWordIsBeginningOfSentenceArray,
|
final ArrayList<boolean[]> ngramPrevWordIsBeginningOfSentenceArray,
|
||||||
final ArrayList<int[]> ngramTargets, final ArrayList<int[]> ngramProbabilityInfo,
|
final ArrayList<int[]> ngramTargets, final ArrayList<int[]> ngramProbabilityInfo,
|
||||||
final ArrayList<int[]> shortcutTargets,
|
final ArrayList<int[]> shortcutTargets,
|
||||||
final ArrayList<Integer> shortcutProbabilities) {
|
final ArrayList<Integer> shortcutProbabilities) {
|
||||||
|
@ -102,16 +102,22 @@ public final class WordProperty implements Comparable<WordProperty> {
|
||||||
mHasNgrams = hasBigram;
|
mHasNgrams = hasBigram;
|
||||||
|
|
||||||
final int relatedNgramCount = ngramTargets.size();
|
final int relatedNgramCount = ngramTargets.size();
|
||||||
final WordInfo currentWordInfo =
|
|
||||||
mIsBeginningOfSentence ? WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO
|
|
||||||
: new WordInfo(mWord);
|
|
||||||
final NgramContext ngramContext = new NgramContext(currentWordInfo);
|
|
||||||
for (int i = 0; i < relatedNgramCount; i++) {
|
for (int i = 0; i < relatedNgramCount; i++) {
|
||||||
final String ngramTargetString =
|
final String ngramTargetString =
|
||||||
StringUtils.getStringFromNullTerminatedCodePointArray(ngramTargets.get(i));
|
StringUtils.getStringFromNullTerminatedCodePointArray(ngramTargets.get(i));
|
||||||
final WeightedString ngramTarget = new WeightedString(ngramTargetString,
|
final WeightedString ngramTarget = new WeightedString(ngramTargetString,
|
||||||
createProbabilityInfoFromArray(ngramProbabilityInfo.get(i)));
|
createProbabilityInfoFromArray(ngramProbabilityInfo.get(i)));
|
||||||
// TODO: Support n-gram.
|
final int[][] prevWords = ngramPrevWordsArray.get(i);
|
||||||
|
final boolean[] isBeginningOfSentenceArray =
|
||||||
|
ngramPrevWordIsBeginningOfSentenceArray.get(i);
|
||||||
|
final WordInfo[] wordInfoArray = new WordInfo[prevWords.length];
|
||||||
|
for (int j = 0; j < prevWords.length; j++) {
|
||||||
|
wordInfoArray[j] = isBeginningOfSentenceArray[j]
|
||||||
|
? WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO
|
||||||
|
: new WordInfo(StringUtils.getStringFromNullTerminatedCodePointArray(
|
||||||
|
prevWords[j]));
|
||||||
|
}
|
||||||
|
final NgramContext ngramContext = new NgramContext(wordInfoArray);
|
||||||
ngrams.add(new NgramProperty(ngramTarget, ngramContext));
|
ngrams.add(new NgramProperty(ngramTarget, ngramContext));
|
||||||
}
|
}
|
||||||
mNgrams = ngrams.isEmpty() ? null : ngrams;
|
mNgrams = ngrams.isEmpty() ? null : ngrams;
|
||||||
|
@ -126,6 +132,7 @@ public final class WordProperty implements Comparable<WordProperty> {
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: Remove
|
// TODO: Remove
|
||||||
|
@UsedForTesting
|
||||||
public ArrayList<WeightedString> getBigrams() {
|
public ArrayList<WeightedString> getBigrams() {
|
||||||
if (null == mNgrams) {
|
if (null == mNgrams) {
|
||||||
return null;
|
return null;
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
package com.android.inputmethod.latin.utils;
|
package com.android.inputmethod.latin.utils;
|
||||||
|
|
||||||
import com.android.inputmethod.latin.makedict.DictionaryHeader;
|
import com.android.inputmethod.latin.makedict.DictionaryHeader;
|
||||||
|
import com.android.inputmethod.latin.makedict.NgramProperty;
|
||||||
import com.android.inputmethod.latin.makedict.ProbabilityInfo;
|
import com.android.inputmethod.latin.makedict.ProbabilityInfo;
|
||||||
import com.android.inputmethod.latin.makedict.WeightedString;
|
import com.android.inputmethod.latin.makedict.WeightedString;
|
||||||
import com.android.inputmethod.latin.makedict.WordProperty;
|
import com.android.inputmethod.latin.makedict.WordProperty;
|
||||||
|
@ -26,6 +27,8 @@ import java.util.HashMap;
|
||||||
public class CombinedFormatUtils {
|
public class CombinedFormatUtils {
|
||||||
public static final String DICTIONARY_TAG = "dictionary";
|
public static final String DICTIONARY_TAG = "dictionary";
|
||||||
public static final String BIGRAM_TAG = "bigram";
|
public static final String BIGRAM_TAG = "bigram";
|
||||||
|
public static final String NGRAM_TAG = "ngram";
|
||||||
|
public static final String NGRAM_PREV_WORD_TAG = "prev_word";
|
||||||
public static final String SHORTCUT_TAG = "shortcut";
|
public static final String SHORTCUT_TAG = "shortcut";
|
||||||
public static final String PROBABILITY_TAG = "f";
|
public static final String PROBABILITY_TAG = "f";
|
||||||
public static final String HISTORICAL_INFO_TAG = "historicalInfo";
|
public static final String HISTORICAL_INFO_TAG = "historicalInfo";
|
||||||
|
@ -76,12 +79,19 @@ public class CombinedFormatUtils {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (wordProperty.mHasNgrams) {
|
if (wordProperty.mHasNgrams) {
|
||||||
// TODO: Support ngram.
|
for (final NgramProperty ngramProperty : wordProperty.mNgrams) {
|
||||||
for (final WeightedString bigram : wordProperty.getBigrams()) {
|
builder.append(" " + NGRAM_TAG + "=" + ngramProperty.mTargetWord.mWord);
|
||||||
builder.append(" " + BIGRAM_TAG + "=" + bigram.mWord);
|
|
||||||
builder.append(",");
|
builder.append(",");
|
||||||
builder.append(formatProbabilityInfo(bigram.mProbabilityInfo));
|
builder.append(formatProbabilityInfo(ngramProperty.mTargetWord.mProbabilityInfo));
|
||||||
builder.append("\n");
|
builder.append("\n");
|
||||||
|
for (int i = 0; i < ngramProperty.mNgramContext.getPrevWordCount(); i++) {
|
||||||
|
builder.append(" " + NGRAM_PREV_WORD_TAG + "[" + i + "]="
|
||||||
|
+ ngramProperty.mNgramContext.getNthPrevWord(i + 1));
|
||||||
|
if (ngramProperty.mNgramContext.isNthPrevWordBeginningOfSontence(i + 1)) {
|
||||||
|
builder.append("," + BEGINNING_OF_SENTENCE_TAG + "=true");
|
||||||
|
}
|
||||||
|
builder.append("\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return builder.toString();
|
return builder.toString();
|
||||||
|
|
|
@ -327,8 +327,8 @@ static jint latinime_BinaryDictionary_getNextWord(JNIEnv *env, jclass clazz,
|
||||||
|
|
||||||
static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz,
|
static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz,
|
||||||
jlong dict, jintArray word, jboolean isBeginningOfSentence, jintArray outCodePoints,
|
jlong dict, jintArray word, jboolean isBeginningOfSentence, jintArray outCodePoints,
|
||||||
jbooleanArray outFlags, jintArray outProbabilityInfo, jobject /* outNgramPrevWordsArray */,
|
jbooleanArray outFlags, jintArray outProbabilityInfo, jobject outNgramPrevWordsArray,
|
||||||
jobject /* outNgramPrevWordIsBeginningOfSentenceArray */, jobject outNgramTargets,
|
jobject outNgramPrevWordIsBeginningOfSentenceArray, jobject outNgramTargets,
|
||||||
jobject outNgramProbabilityInfo, jobject outShortcutTargets,
|
jobject outNgramProbabilityInfo, jobject outShortcutTargets,
|
||||||
jobject outShortcutProbabilities) {
|
jobject outShortcutProbabilities) {
|
||||||
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
|
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
|
||||||
|
@ -352,6 +352,7 @@ static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz,
|
||||||
const WordProperty wordProperty = dictionary->getWordProperty(
|
const WordProperty wordProperty = dictionary->getWordProperty(
|
||||||
CodePointArrayView(wordCodePoints, codePointCount));
|
CodePointArrayView(wordCodePoints, codePointCount));
|
||||||
wordProperty.outputProperties(env, outCodePoints, outFlags, outProbabilityInfo,
|
wordProperty.outputProperties(env, outCodePoints, outFlags, outProbabilityInfo,
|
||||||
|
outNgramPrevWordsArray, outNgramPrevWordIsBeginningOfSentenceArray,
|
||||||
outNgramTargets, outNgramProbabilityInfo, outShortcutTargets,
|
outNgramTargets, outNgramProbabilityInfo, outShortcutTargets,
|
||||||
outShortcutProbabilities);
|
outShortcutProbabilities);
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,8 +22,9 @@
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints,
|
void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints,
|
||||||
jbooleanArray outFlags, jintArray outProbabilityInfo, jobject outBigramTargets,
|
jbooleanArray outFlags, jintArray outProbabilityInfo,
|
||||||
jobject outBigramProbabilities, jobject outShortcutTargets,
|
jobject outNgramPrevWordsArray, jobject outNgramPrevWordIsBeginningOfSentenceArray,
|
||||||
|
jobject outNgramTargets, jobject outNgramProbabilities, jobject outShortcutTargets,
|
||||||
jobject outShortcutProbabilities) const {
|
jobject outShortcutProbabilities) const {
|
||||||
JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */,
|
JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */,
|
||||||
MAX_WORD_LENGTH /* maxLength */, mCodePoints.data(), mCodePoints.size(),
|
MAX_WORD_LENGTH /* maxLength */, mCodePoints.data(), mCodePoints.size(),
|
||||||
|
@ -43,16 +44,39 @@ void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints,
|
||||||
jclass arrayListClass = env->FindClass("java/util/ArrayList");
|
jclass arrayListClass = env->FindClass("java/util/ArrayList");
|
||||||
jmethodID addMethodId = env->GetMethodID(arrayListClass, "add", "(Ljava/lang/Object;)Z");
|
jmethodID addMethodId = env->GetMethodID(arrayListClass, "add", "(Ljava/lang/Object;)Z");
|
||||||
|
|
||||||
// Output bigrams.
|
// Output ngrams.
|
||||||
// TODO: Support n-gram
|
jclass intArrayClass = env->FindClass("[I");
|
||||||
for (const auto &ngramProperty : mNgrams) {
|
for (const auto &ngramProperty : mNgrams) {
|
||||||
const std::vector<int> *const word1CodePoints = ngramProperty.getTargetCodePoints();
|
const NgramContext *const ngramContext = ngramProperty.getNgramContext();
|
||||||
jintArray bigramWord1CodePointArray = env->NewIntArray(word1CodePoints->size());
|
jobjectArray prevWordWordCodePointsArray = env->NewObjectArray(
|
||||||
JniDataUtils::outputCodePoints(env, bigramWord1CodePointArray, 0 /* start */,
|
ngramContext->getPrevWordCount(), intArrayClass, nullptr);
|
||||||
word1CodePoints->size(), word1CodePoints->data(), word1CodePoints->size(),
|
jbooleanArray prevWordIsBeginningOfSentenceArray =
|
||||||
|
env->NewBooleanArray(ngramContext->getPrevWordCount());
|
||||||
|
for (size_t i = 0; i < ngramContext->getPrevWordCount(); ++i) {
|
||||||
|
const CodePointArrayView codePoints = ngramContext->getNthPrevWordCodePoints(i + 1);
|
||||||
|
jintArray prevWordCodePoints = env->NewIntArray(codePoints.size());
|
||||||
|
JniDataUtils::outputCodePoints(env, prevWordCodePoints, 0 /* start */,
|
||||||
|
codePoints.size(), codePoints.data(), codePoints.size(),
|
||||||
false /* needsNullTermination */);
|
false /* needsNullTermination */);
|
||||||
env->CallBooleanMethod(outBigramTargets, addMethodId, bigramWord1CodePointArray);
|
env->SetObjectArrayElement(prevWordWordCodePointsArray, i, prevWordCodePoints);
|
||||||
env->DeleteLocalRef(bigramWord1CodePointArray);
|
env->DeleteLocalRef(prevWordCodePoints);
|
||||||
|
JniDataUtils::putBooleanToArray(env, prevWordIsBeginningOfSentenceArray, i,
|
||||||
|
ngramContext->isNthPrevWordBeginningOfSentence(i + 1));
|
||||||
|
}
|
||||||
|
env->CallBooleanMethod(outNgramPrevWordsArray, addMethodId, prevWordWordCodePointsArray);
|
||||||
|
env->CallBooleanMethod(outNgramPrevWordIsBeginningOfSentenceArray, addMethodId,
|
||||||
|
prevWordIsBeginningOfSentenceArray);
|
||||||
|
env->DeleteLocalRef(prevWordWordCodePointsArray);
|
||||||
|
env->DeleteLocalRef(prevWordIsBeginningOfSentenceArray);
|
||||||
|
|
||||||
|
const std::vector<int> *const targetWordCodePoints = ngramProperty.getTargetCodePoints();
|
||||||
|
jintArray targetWordCodePointArray = env->NewIntArray(targetWordCodePoints->size());
|
||||||
|
JniDataUtils::outputCodePoints(env, targetWordCodePointArray, 0 /* start */,
|
||||||
|
targetWordCodePoints->size(), targetWordCodePoints->data(),
|
||||||
|
targetWordCodePoints->size(), false /* needsNullTermination */);
|
||||||
|
env->CallBooleanMethod(outNgramTargets, addMethodId, targetWordCodePointArray);
|
||||||
|
env->DeleteLocalRef(targetWordCodePointArray);
|
||||||
|
|
||||||
const HistoricalInfo &ngramHistoricalInfo = ngramProperty.getHistoricalInfo();
|
const HistoricalInfo &ngramHistoricalInfo = ngramProperty.getHistoricalInfo();
|
||||||
int bigramProbabilityInfo[] = {ngramProperty.getProbability(),
|
int bigramProbabilityInfo[] = {ngramProperty.getProbability(),
|
||||||
ngramHistoricalInfo.getTimestamp(), ngramHistoricalInfo.getLevel(),
|
ngramHistoricalInfo.getTimestamp(), ngramHistoricalInfo.getLevel(),
|
||||||
|
@ -60,7 +84,7 @@ void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints,
|
||||||
jintArray bigramProbabilityInfoArray = env->NewIntArray(NELEMS(bigramProbabilityInfo));
|
jintArray bigramProbabilityInfoArray = env->NewIntArray(NELEMS(bigramProbabilityInfo));
|
||||||
env->SetIntArrayRegion(bigramProbabilityInfoArray, 0 /* start */,
|
env->SetIntArrayRegion(bigramProbabilityInfoArray, 0 /* start */,
|
||||||
NELEMS(bigramProbabilityInfo), bigramProbabilityInfo);
|
NELEMS(bigramProbabilityInfo), bigramProbabilityInfo);
|
||||||
env->CallBooleanMethod(outBigramProbabilities, addMethodId, bigramProbabilityInfoArray);
|
env->CallBooleanMethod(outNgramProbabilities, addMethodId, bigramProbabilityInfoArray);
|
||||||
env->DeleteLocalRef(bigramProbabilityInfoArray);
|
env->DeleteLocalRef(bigramProbabilityInfoArray);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -39,8 +39,10 @@ class WordProperty {
|
||||||
mNgrams(*ngrams) {}
|
mNgrams(*ngrams) {}
|
||||||
|
|
||||||
void outputProperties(JNIEnv *const env, jintArray outCodePoints, jbooleanArray outFlags,
|
void outputProperties(JNIEnv *const env, jintArray outCodePoints, jbooleanArray outFlags,
|
||||||
jintArray outProbabilityInfo, jobject outBigramTargets, jobject outBigramProbabilities,
|
jintArray outProbabilityInfo, jobject outNgramPrevWordsArray,
|
||||||
jobject outShortcutTargets, jobject outShortcutProbabilities) const;
|
jobject outNgramPrevWordIsBeginningOfSentenceArray, jobject outNgramTargets,
|
||||||
|
jobject outNgramProbabilities, jobject outShortcutTargets,
|
||||||
|
jobject outShortcutProbabilities) const;
|
||||||
|
|
||||||
const UnigramProperty *getUnigramProperty() const {
|
const UnigramProperty *getUnigramProperty() const {
|
||||||
return &mUnigramProperty;
|
return &mUnigramProperty;
|
||||||
|
|
|
@ -90,8 +90,8 @@ const WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArr
|
||||||
// TODO: Some flags in unigramProbabilityEntry should be overwritten by flags in
|
// TODO: Some flags in unigramProbabilityEntry should be overwritten by flags in
|
||||||
// probabilityEntry.
|
// probabilityEntry.
|
||||||
const ProbabilityEntry unigramProbabilityEntry = getProbabilityEntry(wordId);
|
const ProbabilityEntry unigramProbabilityEntry = getProbabilityEntry(wordId);
|
||||||
return WordAttributes(probability, unigramProbabilityEntry.isNotAWord(),
|
return WordAttributes(probability, unigramProbabilityEntry.isBlacklisted(),
|
||||||
unigramProbabilityEntry.isBlacklisted(),
|
unigramProbabilityEntry.isNotAWord(),
|
||||||
unigramProbabilityEntry.isPossiblyOffensive());
|
unigramProbabilityEntry.isPossiblyOffensive());
|
||||||
}
|
}
|
||||||
// Cannot find the word.
|
// Cannot find the word.
|
||||||
|
|
|
@ -488,9 +488,6 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
|
||||||
AKLOGE("getWordProperty is called for invalid word.");
|
AKLOGE("getWordProperty is called for invalid word.");
|
||||||
return WordProperty();
|
return WordProperty();
|
||||||
}
|
}
|
||||||
const int ptNodePos =
|
|
||||||
mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
|
|
||||||
const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
|
|
||||||
const LanguageModelDictContent *const languageModelDictContent =
|
const LanguageModelDictContent *const languageModelDictContent =
|
||||||
mBuffers->getLanguageModelDictContent();
|
mBuffers->getLanguageModelDictContent();
|
||||||
// Fetch ngram information.
|
// Fetch ngram information.
|
||||||
|
@ -541,12 +538,13 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
|
||||||
shortcutProbability);
|
shortcutProbability);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
const ProbabilityEntry probabilityEntry = languageModelDictContent->getProbabilityEntry(
|
const WordAttributes wordAttributes = languageModelDictContent->getWordAttributes(
|
||||||
ptNodeParams.getTerminalId());
|
WordIdArrayView(), wordId, mHeaderPolicy);
|
||||||
|
const ProbabilityEntry probabilityEntry = languageModelDictContent->getProbabilityEntry(wordId);
|
||||||
const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
|
const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
|
||||||
const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(),
|
const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(),
|
||||||
probabilityEntry.isNotAWord(), probabilityEntry.isBlacklisted(),
|
wordAttributes.isNotAWord(), wordAttributes.isBlacklisted(),
|
||||||
probabilityEntry.isPossiblyOffensive(), probabilityEntry.getProbability(),
|
wordAttributes.isPossiblyOffensive(), wordAttributes.getProbability(),
|
||||||
*historicalInfo, std::move(shortcuts));
|
*historicalInfo, std::move(shortcuts));
|
||||||
return WordProperty(wordCodePoints.toVector(), &unigramProperty, &ngrams);
|
return WordProperty(wordCodePoints.toVector(), &unigramProperty, &ngrams);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue