Rename BigramProperty to NgramProperty.
Remaining work is to change bigram to ngram in order to support ngram entry counting, dumping, and migration.
Bug: 14425059
Change-Id: Ifba288a1166996d62a5e57698f63537ea0a2a8ee
This commit is contained in:
parent
6c7a85cb35
commit
79bb37d499
26 changed files with 96 additions and 101 deletions
|
@ -403,10 +403,10 @@ static bool latinime_BinaryDictionary_addNgramEntry(JNIEnv *env, jclass clazz, j
|
||||||
jsize wordLength = env->GetArrayLength(word);
|
jsize wordLength = env->GetArrayLength(word);
|
||||||
int wordCodePoints[wordLength];
|
int wordCodePoints[wordLength];
|
||||||
env->GetIntArrayRegion(word, 0, wordLength, wordCodePoints);
|
env->GetIntArrayRegion(word, 0, wordLength, wordCodePoints);
|
||||||
// Use 1 for count to indicate the bigram has inputted.
|
// Use 1 for count to indicate the ngram has inputted.
|
||||||
const BigramProperty bigramProperty(CodePointArrayView(wordCodePoints, wordLength).toVector(),
|
const NgramProperty ngramProperty(CodePointArrayView(wordCodePoints, wordLength).toVector(),
|
||||||
probability, timestamp, 0 /* level */, 1 /* count */);
|
probability, timestamp, 0 /* level */, 1 /* count */);
|
||||||
return dictionary->addNgramEntry(&prevWordsInfo, &bigramProperty);
|
return dictionary->addNgramEntry(&prevWordsInfo, &ngramProperty);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool latinime_BinaryDictionary_removeNgramEntry(JNIEnv *env, jclass clazz, jlong dict,
|
static bool latinime_BinaryDictionary_removeNgramEntry(JNIEnv *env, jclass clazz, jlong dict,
|
||||||
|
@ -501,12 +501,12 @@ static int latinime_BinaryDictionary_addMultipleDictionaryEntries(JNIEnv *env, j
|
||||||
if (word0) {
|
if (word0) {
|
||||||
jint bigramProbability = env->GetIntField(languageModelParam, bigramProbabilityFieldId);
|
jint bigramProbability = env->GetIntField(languageModelParam, bigramProbabilityFieldId);
|
||||||
// Use 1 for count to indicate the bigram has inputted.
|
// Use 1 for count to indicate the bigram has inputted.
|
||||||
const BigramProperty bigramProperty(
|
const NgramProperty ngramProperty(
|
||||||
CodePointArrayView(word1CodePoints, word1Length).toVector(),
|
CodePointArrayView(word1CodePoints, word1Length).toVector(),
|
||||||
bigramProbability, timestamp, 0 /* level */, 1 /* count */);
|
bigramProbability, timestamp, 0 /* level */, 1 /* count */);
|
||||||
const PrevWordsInfo prevWordsInfo(word0CodePoints, word0Length,
|
const PrevWordsInfo prevWordsInfo(word0CodePoints, word0Length,
|
||||||
false /* isBeginningOfSentence */);
|
false /* isBeginningOfSentence */);
|
||||||
dictionary->addNgramEntry(&prevWordsInfo, &bigramProperty);
|
dictionary->addNgramEntry(&prevWordsInfo, &ngramProperty);
|
||||||
}
|
}
|
||||||
if (dictionary->needsToRunGC(true /* mindsBlockByGC */)) {
|
if (dictionary->needsToRunGC(true /* mindsBlockByGC */)) {
|
||||||
return i + 1;
|
return i + 1;
|
||||||
|
@ -603,6 +603,7 @@ static bool latinime_BinaryDictionary_migrateNative(JNIEnv *env, jclass clazz, j
|
||||||
} while (token != 0);
|
} while (token != 0);
|
||||||
|
|
||||||
// Add bigrams.
|
// Add bigrams.
|
||||||
|
// TODO: Support ngrams.
|
||||||
do {
|
do {
|
||||||
token = dictionary->getNextWordAndNextToken(token, wordCodePoints, &wordCodePointCount);
|
token = dictionary->getNextWordAndNextToken(token, wordCodePoints, &wordCodePointCount);
|
||||||
const WordProperty wordProperty = dictionary->getWordProperty(
|
const WordProperty wordProperty = dictionary->getWordProperty(
|
||||||
|
@ -617,10 +618,10 @@ static bool latinime_BinaryDictionary_migrateNative(JNIEnv *env, jclass clazz, j
|
||||||
}
|
}
|
||||||
const PrevWordsInfo prevWordsInfo(wordCodePoints, wordCodePointCount,
|
const PrevWordsInfo prevWordsInfo(wordCodePoints, wordCodePointCount,
|
||||||
wordProperty.getUnigramProperty()->representsBeginningOfSentence());
|
wordProperty.getUnigramProperty()->representsBeginningOfSentence());
|
||||||
for (const BigramProperty &bigramProperty : *wordProperty.getBigramProperties()) {
|
for (const NgramProperty &ngramProperty : *wordProperty.getNgramProperties()) {
|
||||||
if (!dictionaryStructureWithBufferPolicy->addNgramEntry(&prevWordsInfo,
|
if (!dictionaryStructureWithBufferPolicy->addNgramEntry(&prevWordsInfo,
|
||||||
&bigramProperty)) {
|
&ngramProperty)) {
|
||||||
LogUtils::logToJava(env, "Cannot add bigram to the new dict.");
|
LogUtils::logToJava(env, "Cannot add ngram to the new dict.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -144,9 +144,9 @@ bool Dictionary::removeUnigramEntry(const CodePointArrayView codePoints) {
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Dictionary::addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
bool Dictionary::addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||||
const BigramProperty *const bigramProperty) {
|
const NgramProperty *const ngramProperty) {
|
||||||
TimeKeeper::setCurrentTime();
|
TimeKeeper::setCurrentTime();
|
||||||
return mDictionaryStructureWithBufferPolicy->addNgramEntry(prevWordsInfo, bigramProperty);
|
return mDictionaryStructureWithBufferPolicy->addNgramEntry(prevWordsInfo, ngramProperty);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Dictionary::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
bool Dictionary::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||||
|
|
|
@ -85,7 +85,7 @@ class Dictionary {
|
||||||
bool removeUnigramEntry(const CodePointArrayView codePoints);
|
bool removeUnigramEntry(const CodePointArrayView codePoints);
|
||||||
|
|
||||||
bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||||
const BigramProperty *const bigramProperty);
|
const NgramProperty *const ngramProperty);
|
||||||
|
|
||||||
bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||||
const CodePointArrayView codePoints);
|
const CodePointArrayView codePoints);
|
||||||
|
|
|
@ -14,8 +14,8 @@
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef LATINIME_BIGRAM_PROPERTY_H
|
#ifndef LATINIME_NGRAM_PROPERTY_H
|
||||||
#define LATINIME_BIGRAM_PROPERTY_H
|
#define LATINIME_NGRAM_PROPERTY_H
|
||||||
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
@ -23,10 +23,9 @@
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
// TODO: Change to NgramProperty.
|
class NgramProperty {
|
||||||
class BigramProperty {
|
|
||||||
public:
|
public:
|
||||||
BigramProperty(const std::vector<int> &&targetCodePoints, const int probability,
|
NgramProperty(const std::vector<int> &&targetCodePoints, const int probability,
|
||||||
const int timestamp, const int level, const int count)
|
const int timestamp, const int level, const int count)
|
||||||
: mTargetCodePoints(std::move(targetCodePoints)), mProbability(probability),
|
: mTargetCodePoints(std::move(targetCodePoints)), mProbability(probability),
|
||||||
mTimestamp(timestamp), mLevel(level), mCount(count) {}
|
mTimestamp(timestamp), mLevel(level), mCount(count) {}
|
||||||
|
@ -53,7 +52,7 @@ class BigramProperty {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Default copy constructor and assign operator are used for using in std::vector.
|
// Default copy constructor and assign operator are used for using in std::vector.
|
||||||
DISALLOW_DEFAULT_CONSTRUCTOR(BigramProperty);
|
DISALLOW_DEFAULT_CONSTRUCTOR(NgramProperty);
|
||||||
|
|
||||||
// TODO: Make members const.
|
// TODO: Make members const.
|
||||||
std::vector<int> mTargetCodePoints;
|
std::vector<int> mTargetCodePoints;
|
||||||
|
@ -63,4 +62,4 @@ class BigramProperty {
|
||||||
int mCount;
|
int mCount;
|
||||||
};
|
};
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
#endif // LATINIME_WORD_PROPERTY_H
|
#endif // LATINIME_NGRAM_PROPERTY_H
|
|
@ -28,7 +28,7 @@ void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints,
|
||||||
MAX_WORD_LENGTH /* maxLength */, mCodePoints.data(), mCodePoints.size(),
|
MAX_WORD_LENGTH /* maxLength */, mCodePoints.data(), mCodePoints.size(),
|
||||||
false /* needsNullTermination */);
|
false /* needsNullTermination */);
|
||||||
jboolean flags[] = {mUnigramProperty.isNotAWord(), mUnigramProperty.isBlacklisted(),
|
jboolean flags[] = {mUnigramProperty.isNotAWord(), mUnigramProperty.isBlacklisted(),
|
||||||
!mBigrams.empty(), mUnigramProperty.hasShortcuts(),
|
!mNgrams.empty(), mUnigramProperty.hasShortcuts(),
|
||||||
mUnigramProperty.representsBeginningOfSentence()};
|
mUnigramProperty.representsBeginningOfSentence()};
|
||||||
env->SetBooleanArrayRegion(outFlags, 0 /* start */, NELEMS(flags), flags);
|
env->SetBooleanArrayRegion(outFlags, 0 /* start */, NELEMS(flags), flags);
|
||||||
int probabilityInfo[] = {mUnigramProperty.getProbability(), mUnigramProperty.getTimestamp(),
|
int probabilityInfo[] = {mUnigramProperty.getProbability(), mUnigramProperty.getTimestamp(),
|
||||||
|
@ -42,8 +42,9 @@ void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints,
|
||||||
jmethodID addMethodId = env->GetMethodID(arrayListClass, "add", "(Ljava/lang/Object;)Z");
|
jmethodID addMethodId = env->GetMethodID(arrayListClass, "add", "(Ljava/lang/Object;)Z");
|
||||||
|
|
||||||
// Output bigrams.
|
// Output bigrams.
|
||||||
for (const auto &bigramProperty : mBigrams) {
|
// TODO: Support n-gram
|
||||||
const std::vector<int> *const word1CodePoints = bigramProperty.getTargetCodePoints();
|
for (const auto &ngramProperty : mNgrams) {
|
||||||
|
const std::vector<int> *const word1CodePoints = ngramProperty.getTargetCodePoints();
|
||||||
jintArray bigramWord1CodePointArray = env->NewIntArray(word1CodePoints->size());
|
jintArray bigramWord1CodePointArray = env->NewIntArray(word1CodePoints->size());
|
||||||
JniDataUtils::outputCodePoints(env, bigramWord1CodePointArray, 0 /* start */,
|
JniDataUtils::outputCodePoints(env, bigramWord1CodePointArray, 0 /* start */,
|
||||||
word1CodePoints->size(), word1CodePoints->data(), word1CodePoints->size(),
|
word1CodePoints->size(), word1CodePoints->data(), word1CodePoints->size(),
|
||||||
|
@ -51,9 +52,9 @@ void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints,
|
||||||
env->CallBooleanMethod(outBigramTargets, addMethodId, bigramWord1CodePointArray);
|
env->CallBooleanMethod(outBigramTargets, addMethodId, bigramWord1CodePointArray);
|
||||||
env->DeleteLocalRef(bigramWord1CodePointArray);
|
env->DeleteLocalRef(bigramWord1CodePointArray);
|
||||||
|
|
||||||
int bigramProbabilityInfo[] = {bigramProperty.getProbability(),
|
int bigramProbabilityInfo[] = {ngramProperty.getProbability(),
|
||||||
bigramProperty.getTimestamp(), bigramProperty.getLevel(),
|
ngramProperty.getTimestamp(), ngramProperty.getLevel(),
|
||||||
bigramProperty.getCount()};
|
ngramProperty.getCount()};
|
||||||
jintArray bigramProbabilityInfoArray = env->NewIntArray(NELEMS(bigramProbabilityInfo));
|
jintArray bigramProbabilityInfoArray = env->NewIntArray(NELEMS(bigramProbabilityInfo));
|
||||||
env->SetIntArrayRegion(bigramProbabilityInfoArray, 0 /* start */,
|
env->SetIntArrayRegion(bigramProbabilityInfoArray, 0 /* start */,
|
||||||
NELEMS(bigramProbabilityInfo), bigramProbabilityInfo);
|
NELEMS(bigramProbabilityInfo), bigramProbabilityInfo);
|
||||||
|
|
|
@ -21,7 +21,7 @@
|
||||||
|
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
#include "jni.h"
|
#include "jni.h"
|
||||||
#include "suggest/core/dictionary/property/bigram_property.h"
|
#include "suggest/core/dictionary/property/ngram_property.h"
|
||||||
#include "suggest/core/dictionary/property/unigram_property.h"
|
#include "suggest/core/dictionary/property/unigram_property.h"
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
@ -31,12 +31,12 @@ class WordProperty {
|
||||||
public:
|
public:
|
||||||
// Default constructor is used to create an instance that indicates an invalid word.
|
// Default constructor is used to create an instance that indicates an invalid word.
|
||||||
WordProperty()
|
WordProperty()
|
||||||
: mCodePoints(), mUnigramProperty(), mBigrams() {}
|
: mCodePoints(), mUnigramProperty(), mNgrams() {}
|
||||||
|
|
||||||
WordProperty(const std::vector<int> &&codePoints, const UnigramProperty *const unigramProperty,
|
WordProperty(const std::vector<int> &&codePoints, const UnigramProperty *const unigramProperty,
|
||||||
const std::vector<BigramProperty> *const bigrams)
|
const std::vector<NgramProperty> *const bigrams)
|
||||||
: mCodePoints(std::move(codePoints)), mUnigramProperty(*unigramProperty),
|
: mCodePoints(std::move(codePoints)), mUnigramProperty(*unigramProperty),
|
||||||
mBigrams(*bigrams) {}
|
mNgrams(*bigrams) {}
|
||||||
|
|
||||||
void outputProperties(JNIEnv *const env, jintArray outCodePoints, jbooleanArray outFlags,
|
void outputProperties(JNIEnv *const env, jintArray outCodePoints, jbooleanArray outFlags,
|
||||||
jintArray outProbabilityInfo, jobject outBigramTargets, jobject outBigramProbabilities,
|
jintArray outProbabilityInfo, jobject outBigramTargets, jobject outBigramProbabilities,
|
||||||
|
@ -46,8 +46,8 @@ class WordProperty {
|
||||||
return &mUnigramProperty;
|
return &mUnigramProperty;
|
||||||
}
|
}
|
||||||
|
|
||||||
const std::vector<BigramProperty> *getBigramProperties() const {
|
const std::vector<NgramProperty> *getNgramProperties() const {
|
||||||
return &mBigrams;
|
return &mNgrams;
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -56,7 +56,7 @@ class WordProperty {
|
||||||
|
|
||||||
const std::vector<int> mCodePoints;
|
const std::vector<int> mCodePoints;
|
||||||
const UnigramProperty mUnigramProperty;
|
const UnigramProperty mUnigramProperty;
|
||||||
const std::vector<BigramProperty> mBigrams;
|
const std::vector<NgramProperty> mNgrams;
|
||||||
};
|
};
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
#endif // LATINIME_WORD_PROPERTY_H
|
#endif // LATINIME_WORD_PROPERTY_H
|
||||||
|
|
|
@ -81,7 +81,7 @@ class DictionaryStructureWithBufferPolicy {
|
||||||
|
|
||||||
// Returns whether the update was success or not.
|
// Returns whether the update was success or not.
|
||||||
virtual bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
virtual bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||||
const BigramProperty *const bigramProperty) = 0;
|
const NgramProperty *const ngramProperty) = 0;
|
||||||
|
|
||||||
// Returns whether the update was success or not.
|
// Returns whether the update was success or not.
|
||||||
virtual bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
virtual bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||||
|
|
|
@ -24,7 +24,7 @@
|
||||||
|
|
||||||
#include "suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h"
|
#include "suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h"
|
||||||
|
|
||||||
#include "suggest/core/dictionary/property/bigram_property.h"
|
#include "suggest/core/dictionary/property/ngram_property.h"
|
||||||
#include "suggest/policyimpl/dictionary/header/header_policy.h"
|
#include "suggest/policyimpl/dictionary/header/header_policy.h"
|
||||||
#include "suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h"
|
#include "suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h"
|
||||||
#include "suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h"
|
#include "suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h"
|
||||||
|
@ -60,7 +60,7 @@ void Ver4BigramListPolicy::getNextBigram(int *const outBigramPos, int *const out
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTargetTerminalId,
|
bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTargetTerminalId,
|
||||||
const BigramProperty *const bigramProperty, bool *const outAddedNewEntry) {
|
const NgramProperty *const ngramProperty, bool *const outAddedNewEntry) {
|
||||||
// 1. The word has no bigrams yet.
|
// 1. The word has no bigrams yet.
|
||||||
// 2. The word has bigrams, and there is the target in the list.
|
// 2. The word has bigrams, and there is the target in the list.
|
||||||
// 3. The word has bigrams, and there is an invalid entry that can be reclaimed.
|
// 3. The word has bigrams, and there is an invalid entry that can be reclaimed.
|
||||||
|
@ -79,7 +79,7 @@ bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTarget
|
||||||
const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY,
|
const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY,
|
||||||
newTargetTerminalId);
|
newTargetTerminalId);
|
||||||
const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(&newBigramEntry,
|
const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(&newBigramEntry,
|
||||||
bigramProperty);
|
ngramProperty);
|
||||||
// Write an entry.
|
// Write an entry.
|
||||||
const int writingPos = mBigramDictContent->getBigramListHeadPos(terminalId);
|
const int writingPos = mBigramDictContent->getBigramListHeadPos(terminalId);
|
||||||
if (!mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, writingPos)) {
|
if (!mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, writingPos)) {
|
||||||
|
@ -112,7 +112,7 @@ bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTarget
|
||||||
const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY,
|
const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY,
|
||||||
newTargetTerminalId);
|
newTargetTerminalId);
|
||||||
const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(
|
const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(
|
||||||
&newBigramEntry, bigramProperty);
|
&newBigramEntry, ngramProperty);
|
||||||
if (!mBigramDictContent->writeBigramEntryAtTail(&bigramEntryToWrite)) {
|
if (!mBigramDictContent->writeBigramEntryAtTail(&bigramEntryToWrite)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -138,7 +138,7 @@ bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTarget
|
||||||
const BigramEntry updatedBigramEntry =
|
const BigramEntry updatedBigramEntry =
|
||||||
originalBigramEntry.updateTargetTerminalIdAndGetEntry(newTargetTerminalId);
|
originalBigramEntry.updateTargetTerminalIdAndGetEntry(newTargetTerminalId);
|
||||||
const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(
|
const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(
|
||||||
&updatedBigramEntry, bigramProperty);
|
&updatedBigramEntry, ngramProperty);
|
||||||
return mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, entryPosToUpdate);
|
return mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, entryPosToUpdate);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -264,18 +264,18 @@ int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind,
|
||||||
|
|
||||||
const BigramEntry Ver4BigramListPolicy::createUpdatedBigramEntryFrom(
|
const BigramEntry Ver4BigramListPolicy::createUpdatedBigramEntryFrom(
|
||||||
const BigramEntry *const originalBigramEntry,
|
const BigramEntry *const originalBigramEntry,
|
||||||
const BigramProperty *const bigramProperty) const {
|
const NgramProperty *const ngramProperty) const {
|
||||||
// TODO: Consolidate historical info and probability.
|
// TODO: Consolidate historical info and probability.
|
||||||
if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
|
if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
|
||||||
const HistoricalInfo historicalInfoForUpdate(bigramProperty->getTimestamp(),
|
const HistoricalInfo historicalInfoForUpdate(ngramProperty->getTimestamp(),
|
||||||
bigramProperty->getLevel(), bigramProperty->getCount());
|
ngramProperty->getLevel(), ngramProperty->getCount());
|
||||||
const HistoricalInfo updatedHistoricalInfo =
|
const HistoricalInfo updatedHistoricalInfo =
|
||||||
ForgettingCurveUtils::createUpdatedHistoricalInfo(
|
ForgettingCurveUtils::createUpdatedHistoricalInfo(
|
||||||
originalBigramEntry->getHistoricalInfo(), bigramProperty->getProbability(),
|
originalBigramEntry->getHistoricalInfo(), ngramProperty->getProbability(),
|
||||||
&historicalInfoForUpdate, mHeaderPolicy);
|
&historicalInfoForUpdate, mHeaderPolicy);
|
||||||
return originalBigramEntry->updateHistoricalInfoAndGetEntry(&updatedHistoricalInfo);
|
return originalBigramEntry->updateHistoricalInfoAndGetEntry(&updatedHistoricalInfo);
|
||||||
} else {
|
} else {
|
||||||
return originalBigramEntry->updateProbabilityAndGetEntry(bigramProperty->getProbability());
|
return originalBigramEntry->updateProbabilityAndGetEntry(ngramProperty->getProbability());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -36,7 +36,7 @@ namespace v402 {
|
||||||
class BigramDictContent;
|
class BigramDictContent;
|
||||||
} // namespace v402
|
} // namespace v402
|
||||||
} // namespace backward
|
} // namespace backward
|
||||||
class BigramProperty;
|
class NgramProperty;
|
||||||
namespace backward {
|
namespace backward {
|
||||||
namespace v402 {
|
namespace v402 {
|
||||||
} // namespace v402
|
} // namespace v402
|
||||||
|
@ -64,7 +64,7 @@ class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy {
|
||||||
}
|
}
|
||||||
|
|
||||||
bool addNewEntry(const int terminalId, const int newTargetTerminalId,
|
bool addNewEntry(const int terminalId, const int newTargetTerminalId,
|
||||||
const BigramProperty *const bigramProperty, bool *const outAddedNewEntry);
|
const NgramProperty *const ngramProperty, bool *const outAddedNewEntry);
|
||||||
|
|
||||||
bool removeEntry(const int terminalId, const int targetTerminalId);
|
bool removeEntry(const int terminalId, const int targetTerminalId);
|
||||||
|
|
||||||
|
@ -80,7 +80,7 @@ class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy {
|
||||||
int *const outTailEntryPos) const;
|
int *const outTailEntryPos) const;
|
||||||
|
|
||||||
const BigramEntry createUpdatedBigramEntryFrom(const BigramEntry *const originalBigramEntry,
|
const BigramEntry createUpdatedBigramEntryFrom(const BigramEntry *const originalBigramEntry,
|
||||||
const BigramProperty *const bigramProperty) const;
|
const NgramProperty *const ngramProperty) const;
|
||||||
|
|
||||||
bool updateHasNextFlag(const bool hasNext, const int bigramEntryPos);
|
bool updateHasNextFlag(const bool hasNext, const int bigramEntryPos);
|
||||||
|
|
||||||
|
|
|
@ -232,8 +232,8 @@ bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition(
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Ver4PatriciaTrieNodeWriter::addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
|
bool Ver4PatriciaTrieNodeWriter::addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
|
||||||
const BigramProperty *const bigramProperty, bool *const outAddedNewEntry) {
|
const NgramProperty *const ngramProperty, bool *const outAddedNewEntry) {
|
||||||
if (!mBigramPolicy->addNewEntry(prevWordIds[0], wordId, bigramProperty, outAddedNewEntry)) {
|
if (!mBigramPolicy->addNewEntry(prevWordIds[0], wordId, ngramProperty, outAddedNewEntry)) {
|
||||||
AKLOGE("Cannot add new bigram entry. prevWordId: %d, wordId: %d",
|
AKLOGE("Cannot add new bigram entry. prevWordId: %d, wordId: %d",
|
||||||
prevWordIds[0], wordId);
|
prevWordIds[0], wordId);
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -94,7 +94,7 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
|
||||||
const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos);
|
const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos);
|
||||||
|
|
||||||
virtual bool addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
|
virtual bool addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
|
||||||
const BigramProperty *const bigramProperty, bool *const outAddedNewEntry);
|
const NgramProperty *const ngramProperty, bool *const outAddedNewEntry);
|
||||||
|
|
||||||
virtual bool removeNgramEntry(const WordIdArrayView prevWordIds, const int wordId);
|
virtual bool removeNgramEntry(const WordIdArrayView prevWordIds, const int wordId);
|
||||||
|
|
||||||
|
|
|
@ -30,7 +30,7 @@
|
||||||
#include "suggest/core/dicnode/dic_node_vector.h"
|
#include "suggest/core/dicnode/dic_node_vector.h"
|
||||||
#include "suggest/core/dictionary/multi_bigram_map.h"
|
#include "suggest/core/dictionary/multi_bigram_map.h"
|
||||||
#include "suggest/core/dictionary/ngram_listener.h"
|
#include "suggest/core/dictionary/ngram_listener.h"
|
||||||
#include "suggest/core/dictionary/property/bigram_property.h"
|
#include "suggest/core/dictionary/property/ngram_property.h"
|
||||||
#include "suggest/core/dictionary/property/unigram_property.h"
|
#include "suggest/core/dictionary/property/unigram_property.h"
|
||||||
#include "suggest/core/dictionary/property/word_property.h"
|
#include "suggest/core/dictionary/property/word_property.h"
|
||||||
#include "suggest/core/session/prev_words_info.h"
|
#include "suggest/core/session/prev_words_info.h"
|
||||||
|
@ -312,7 +312,7 @@ bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCod
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||||
const BigramProperty *const bigramProperty) {
|
const NgramProperty *const ngramProperty) {
|
||||||
if (!mBuffers->isUpdatable()) {
|
if (!mBuffers->isUpdatable()) {
|
||||||
AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
|
AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
|
||||||
return false;
|
return false;
|
||||||
|
@ -326,9 +326,9 @@ bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsI
|
||||||
AKLOGE("prev words info is not valid for adding n-gram entry to the dictionary.");
|
AKLOGE("prev words info is not valid for adding n-gram entry to the dictionary.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (bigramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
|
if (ngramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
|
||||||
AKLOGE("The word is too long to insert the ngram to the dictionary. "
|
AKLOGE("The word is too long to insert the ngram to the dictionary. "
|
||||||
"length: %zd", bigramProperty->getTargetCodePoints()->size());
|
"length: %zd", ngramProperty->getTargetCodePoints()->size());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
|
WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
|
||||||
|
@ -356,7 +356,7 @@ bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsI
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
const int wordPos = getTerminalPtNodePosFromWordId(getWordId(
|
const int wordPos = getTerminalPtNodePosFromWordId(getWordId(
|
||||||
CodePointArrayView(*bigramProperty->getTargetCodePoints()),
|
CodePointArrayView(*ngramProperty->getTargetCodePoints()),
|
||||||
false /* forceLowerCaseSearch */));
|
false /* forceLowerCaseSearch */));
|
||||||
if (wordPos == NOT_A_DICT_POS) {
|
if (wordPos == NOT_A_DICT_POS) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -364,7 +364,7 @@ bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsI
|
||||||
bool addedNewBigram = false;
|
bool addedNewBigram = false;
|
||||||
const int prevWordPtNodePos = getTerminalPtNodePosFromWordId(prevWordIds[0]);
|
const int prevWordPtNodePos = getTerminalPtNodePosFromWordId(prevWordIds[0]);
|
||||||
if (mUpdatingHelper.addNgramEntry(PtNodePosArrayView::singleElementView(&prevWordPtNodePos),
|
if (mUpdatingHelper.addNgramEntry(PtNodePosArrayView::singleElementView(&prevWordPtNodePos),
|
||||||
wordPos, bigramProperty, &addedNewBigram)) {
|
wordPos, ngramProperty, &addedNewBigram)) {
|
||||||
if (addedNewBigram) {
|
if (addedNewBigram) {
|
||||||
mBigramCount++;
|
mBigramCount++;
|
||||||
}
|
}
|
||||||
|
@ -499,7 +499,7 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
|
||||||
ptNodeParams.getTerminalId());
|
ptNodeParams.getTerminalId());
|
||||||
const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
|
const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
|
||||||
// Fetch bigram information.
|
// Fetch bigram information.
|
||||||
std::vector<BigramProperty> bigrams;
|
std::vector<NgramProperty> ngrams;
|
||||||
const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos);
|
const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos);
|
||||||
if (bigramListPos != NOT_A_DICT_POS) {
|
if (bigramListPos != NOT_A_DICT_POS) {
|
||||||
int bigramWord1CodePoints[MAX_WORD_LENGTH];
|
int bigramWord1CodePoints[MAX_WORD_LENGTH];
|
||||||
|
@ -526,7 +526,7 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
|
||||||
ForgettingCurveUtils::decodeProbability(
|
ForgettingCurveUtils::decodeProbability(
|
||||||
bigramEntry.getHistoricalInfo(), mHeaderPolicy) :
|
bigramEntry.getHistoricalInfo(), mHeaderPolicy) :
|
||||||
bigramEntry.getProbability();
|
bigramEntry.getProbability();
|
||||||
bigrams.emplace_back(
|
ngrams.emplace_back(
|
||||||
CodePointArrayView(bigramWord1CodePoints, codePointCount).toVector(),
|
CodePointArrayView(bigramWord1CodePoints, codePointCount).toVector(),
|
||||||
probability, historicalInfo->getTimeStamp(), historicalInfo->getLevel(),
|
probability, historicalInfo->getTimeStamp(), historicalInfo->getLevel(),
|
||||||
historicalInfo->getCount());
|
historicalInfo->getCount());
|
||||||
|
@ -554,7 +554,7 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
|
||||||
ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
|
ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
|
||||||
historicalInfo->getTimeStamp(), historicalInfo->getLevel(),
|
historicalInfo->getTimeStamp(), historicalInfo->getLevel(),
|
||||||
historicalInfo->getCount(), &shortcuts);
|
historicalInfo->getCount(), &shortcuts);
|
||||||
return WordProperty(wordCodePoints.toVector(), &unigramProperty, &bigrams);
|
return WordProperty(wordCodePoints.toVector(), &unigramProperty, &ngrams);
|
||||||
}
|
}
|
||||||
|
|
||||||
int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
|
int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
|
||||||
|
|
|
@ -59,6 +59,7 @@ namespace backward {
|
||||||
namespace v402 {
|
namespace v402 {
|
||||||
|
|
||||||
// Word id = Position of a PtNode that represents the word.
|
// Word id = Position of a PtNode that represents the word.
|
||||||
|
// Max supported n-gram is bigram.
|
||||||
class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
public:
|
public:
|
||||||
Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers)
|
Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers)
|
||||||
|
@ -112,7 +113,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
bool removeUnigramEntry(const CodePointArrayView wordCodePoints);
|
bool removeUnigramEntry(const CodePointArrayView wordCodePoints);
|
||||||
|
|
||||||
bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||||
const BigramProperty *const bigramProperty);
|
const NgramProperty *const ngramProperty);
|
||||||
|
|
||||||
bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||||
const CodePointArrayView wordCodePoints);
|
const CodePointArrayView wordCodePoints);
|
||||||
|
|
|
@ -76,6 +76,7 @@ class DynamicPtGcEventListeners {
|
||||||
int mValidUnigramCount;
|
int mValidUnigramCount;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// TODO: Remove when we stop supporting v402 format.
|
||||||
// Updates all bigram entries that are held by valid PtNodes. This removes useless bigram
|
// Updates all bigram entries that are held by valid PtNodes. This removes useless bigram
|
||||||
// entries.
|
// entries.
|
||||||
class TraversePolicyToUpdateBigramProbability
|
class TraversePolicyToUpdateBigramProbability
|
||||||
|
|
|
@ -82,7 +82,7 @@ bool DynamicPtUpdatingHelper::addUnigramWord(DynamicPtReadingHelper *const readi
|
||||||
}
|
}
|
||||||
|
|
||||||
bool DynamicPtUpdatingHelper::addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos,
|
bool DynamicPtUpdatingHelper::addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos,
|
||||||
const int wordPos, const BigramProperty *const bigramProperty,
|
const int wordPos, const NgramProperty *const ngramProperty,
|
||||||
bool *const outAddedNewEntry) {
|
bool *const outAddedNewEntry) {
|
||||||
if (prevWordsPtNodePos.empty()) {
|
if (prevWordsPtNodePos.empty()) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -96,7 +96,7 @@ bool DynamicPtUpdatingHelper::addNgramEntry(const PtNodePosArrayView prevWordsPt
|
||||||
const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size());
|
const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size());
|
||||||
const int wordId =
|
const int wordId =
|
||||||
mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId();
|
mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId();
|
||||||
return mPtNodeWriter->addNgramEntry(prevWordIds, wordId, bigramProperty, outAddedNewEntry);
|
return mPtNodeWriter->addNgramEntry(prevWordIds, wordId, ngramProperty, outAddedNewEntry);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool DynamicPtUpdatingHelper::removeNgramEntry(const PtNodePosArrayView prevWordsPtNodePos,
|
bool DynamicPtUpdatingHelper::removeNgramEntry(const PtNodePosArrayView prevWordsPtNodePos,
|
||||||
|
|
|
@ -23,7 +23,7 @@
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
class BigramProperty;
|
class NgramProperty;
|
||||||
class BufferWithExtendableBuffer;
|
class BufferWithExtendableBuffer;
|
||||||
class DynamicPtReadingHelper;
|
class DynamicPtReadingHelper;
|
||||||
class PtNodeReader;
|
class PtNodeReader;
|
||||||
|
@ -46,7 +46,7 @@ class DynamicPtUpdatingHelper {
|
||||||
// TODO: Remove after stopping supporting v402.
|
// TODO: Remove after stopping supporting v402.
|
||||||
// Add an n-gram entry.
|
// Add an n-gram entry.
|
||||||
bool addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, const int wordPos,
|
bool addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, const int wordPos,
|
||||||
const BigramProperty *const bigramProperty, bool *const outAddedNewEntry);
|
const NgramProperty *const ngramProperty, bool *const outAddedNewEntry);
|
||||||
|
|
||||||
// TODO: Remove after stopping supporting v402.
|
// TODO: Remove after stopping supporting v402.
|
||||||
// Remove an n-gram entry.
|
// Remove an n-gram entry.
|
||||||
|
|
|
@ -25,7 +25,7 @@
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
class BigramProperty;
|
class NgramProperty;
|
||||||
class UnigramProperty;
|
class UnigramProperty;
|
||||||
|
|
||||||
// Interface class used to write PtNode information.
|
// Interface class used to write PtNode information.
|
||||||
|
@ -72,7 +72,7 @@ class PtNodeWriter {
|
||||||
const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos) = 0;
|
const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos) = 0;
|
||||||
|
|
||||||
virtual bool addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
|
virtual bool addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
|
||||||
const BigramProperty *const bigramProperty, bool *const outAddedNewEntry) = 0;
|
const NgramProperty *const ngramProperty, bool *const outAddedNewEntry) = 0;
|
||||||
|
|
||||||
virtual bool removeNgramEntry(const WordIdArrayView prevWordIds, const int wordId) = 0;
|
virtual bool removeNgramEntry(const WordIdArrayView prevWordIds, const int wordId) = 0;
|
||||||
|
|
||||||
|
|
|
@ -436,7 +436,7 @@ const WordProperty PatriciaTriePolicy::getWordProperty(
|
||||||
const PtNodeParams ptNodeParams =
|
const PtNodeParams ptNodeParams =
|
||||||
mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
|
mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
|
||||||
// Fetch bigram information.
|
// Fetch bigram information.
|
||||||
std::vector<BigramProperty> bigrams;
|
std::vector<NgramProperty> ngrams;
|
||||||
const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos);
|
const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos);
|
||||||
int bigramWord1CodePoints[MAX_WORD_LENGTH];
|
int bigramWord1CodePoints[MAX_WORD_LENGTH];
|
||||||
BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramListPos);
|
BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramListPos);
|
||||||
|
@ -450,7 +450,7 @@ const WordProperty PatriciaTriePolicy::getWordProperty(
|
||||||
getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()), MAX_WORD_LENGTH,
|
getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()), MAX_WORD_LENGTH,
|
||||||
bigramWord1CodePoints, &word1Probability);
|
bigramWord1CodePoints, &word1Probability);
|
||||||
const int probability = getProbability(word1Probability, bigramsIt.getProbability());
|
const int probability = getProbability(word1Probability, bigramsIt.getProbability());
|
||||||
bigrams.emplace_back(
|
ngrams.emplace_back(
|
||||||
CodePointArrayView(bigramWord1CodePoints, word1CodePointCount).toVector(),
|
CodePointArrayView(bigramWord1CodePoints, word1CodePointCount).toVector(),
|
||||||
probability, NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */);
|
probability, NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */);
|
||||||
}
|
}
|
||||||
|
@ -478,7 +478,7 @@ const WordProperty PatriciaTriePolicy::getWordProperty(
|
||||||
const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
|
const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
|
||||||
ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
|
ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
|
||||||
NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */, &shortcuts);
|
NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */, &shortcuts);
|
||||||
return WordProperty(wordCodePoints.toVector(), &unigramProperty, &bigrams);
|
return WordProperty(wordCodePoints.toVector(), &unigramProperty, &ngrams);
|
||||||
}
|
}
|
||||||
|
|
||||||
int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
|
int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
|
||||||
|
|
|
@ -38,6 +38,7 @@ class DicNode;
|
||||||
class DicNodeVector;
|
class DicNodeVector;
|
||||||
|
|
||||||
// Word id = Position of a PtNode that represents the word.
|
// Word id = Position of a PtNode that represents the word.
|
||||||
|
// Max supported n-gram is bigram.
|
||||||
class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
public:
|
public:
|
||||||
PatriciaTriePolicy(MmappedBuffer::MmappedBufferPtr mmappedBuffer)
|
PatriciaTriePolicy(MmappedBuffer::MmappedBufferPtr mmappedBuffer)
|
||||||
|
@ -93,7 +94,7 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
}
|
}
|
||||||
|
|
||||||
bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||||
const BigramProperty *const bigramProperty) {
|
const NgramProperty *const ngramProperty) {
|
||||||
// This method should not be called for non-updatable dictionary.
|
// This method should not be called for non-updatable dictionary.
|
||||||
AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
|
AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -21,7 +21,7 @@
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
|
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
#include "suggest/core/dictionary/property/bigram_property.h"
|
#include "suggest/core/dictionary/property/ngram_property.h"
|
||||||
#include "suggest/core/dictionary/property/unigram_property.h"
|
#include "suggest/core/dictionary/property/unigram_property.h"
|
||||||
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
|
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
|
||||||
#include "suggest/policyimpl/dictionary/utils/historical_info.h"
|
#include "suggest/policyimpl/dictionary/utils/historical_info.h"
|
||||||
|
@ -56,12 +56,12 @@ class ProbabilityEntry {
|
||||||
mHistoricalInfo(unigramProperty->getTimestamp(), unigramProperty->getLevel(),
|
mHistoricalInfo(unigramProperty->getTimestamp(), unigramProperty->getLevel(),
|
||||||
unigramProperty->getCount()) {}
|
unigramProperty->getCount()) {}
|
||||||
|
|
||||||
// Create from bigram property.
|
// Create from ngram property.
|
||||||
// TODO: Set flags.
|
// TODO: Set flags.
|
||||||
ProbabilityEntry(const BigramProperty *const bigramProperty)
|
ProbabilityEntry(const NgramProperty *const ngramProperty)
|
||||||
: mFlags(0), mProbability(bigramProperty->getProbability()),
|
: mFlags(0), mProbability(ngramProperty->getProbability()),
|
||||||
mHistoricalInfo(bigramProperty->getTimestamp(), bigramProperty->getLevel(),
|
mHistoricalInfo(ngramProperty->getTimestamp(), ngramProperty->getLevel(),
|
||||||
bigramProperty->getCount()) {}
|
ngramProperty->getCount()) {}
|
||||||
|
|
||||||
bool isValid() const {
|
bool isValid() const {
|
||||||
return (mFlags & Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY) == 0;
|
return (mFlags & Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY) == 0;
|
||||||
|
|
|
@ -61,6 +61,7 @@ bool Ver4PatriciaTrieNodeWriter::markPtNodeAsDeleted(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: Quit using bigramLinkedNodePos.
|
||||||
bool Ver4PatriciaTrieNodeWriter::markPtNodeAsMoved(
|
bool Ver4PatriciaTrieNodeWriter::markPtNodeAsMoved(
|
||||||
const PtNodeParams *const toBeUpdatedPtNodeParams,
|
const PtNodeParams *const toBeUpdatedPtNodeParams,
|
||||||
const int movedPos, const int bigramLinkedNodePos) {
|
const int movedPos, const int bigramLinkedNodePos) {
|
||||||
|
@ -208,15 +209,16 @@ bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition(
|
||||||
terminalId, &probabilityEntryToWrite);
|
terminalId, &probabilityEntryToWrite);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: Support counting ngram entries.
|
||||||
bool Ver4PatriciaTrieNodeWriter::addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
|
bool Ver4PatriciaTrieNodeWriter::addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
|
||||||
const BigramProperty *const bigramProperty, bool *const outAddedNewBigram) {
|
const NgramProperty *const ngramProperty, bool *const outAddedNewBigram) {
|
||||||
LanguageModelDictContent *const languageModelDictContent =
|
LanguageModelDictContent *const languageModelDictContent =
|
||||||
mBuffers->getMutableLanguageModelDictContent();
|
mBuffers->getMutableLanguageModelDictContent();
|
||||||
const ProbabilityEntry probabilityEntry =
|
const ProbabilityEntry probabilityEntry =
|
||||||
languageModelDictContent->getNgramProbabilityEntry(prevWordIds, wordId);
|
languageModelDictContent->getNgramProbabilityEntry(prevWordIds, wordId);
|
||||||
const ProbabilityEntry probabilityEntryOfBigramProperty(bigramProperty);
|
const ProbabilityEntry probabilityEntryOfNgramProperty(ngramProperty);
|
||||||
const ProbabilityEntry updatedProbabilityEntry = createUpdatedEntryFrom(
|
const ProbabilityEntry updatedProbabilityEntry = createUpdatedEntryFrom(
|
||||||
&probabilityEntry, &probabilityEntryOfBigramProperty);
|
&probabilityEntry, &probabilityEntryOfNgramProperty);
|
||||||
if (!languageModelDictContent->setNgramProbabilityEntry(
|
if (!languageModelDictContent->setNgramProbabilityEntry(
|
||||||
prevWordIds, wordId, &updatedProbabilityEntry)) {
|
prevWordIds, wordId, &updatedProbabilityEntry)) {
|
||||||
AKLOGE("Cannot add new ngram entry. prevWordId[0]: %d, prevWordId.size(): %zd, wordId: %d",
|
AKLOGE("Cannot add new ngram entry. prevWordId[0]: %d, prevWordId.size(): %zd, wordId: %d",
|
||||||
|
|
|
@ -74,7 +74,7 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
|
||||||
const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos);
|
const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos);
|
||||||
|
|
||||||
virtual bool addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
|
virtual bool addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
|
||||||
const BigramProperty *const bigramProperty, bool *const outAddedNewEntry);
|
const NgramProperty *const ngramProperty, bool *const outAddedNewEntry);
|
||||||
|
|
||||||
virtual bool removeNgramEntry(const WordIdArrayView prevWordIds, const int wordId);
|
virtual bool removeNgramEntry(const WordIdArrayView prevWordIds, const int wordId);
|
||||||
|
|
||||||
|
|
|
@ -23,7 +23,7 @@
|
||||||
#include "suggest/core/dicnode/dic_node_vector.h"
|
#include "suggest/core/dicnode/dic_node_vector.h"
|
||||||
#include "suggest/core/dictionary/multi_bigram_map.h"
|
#include "suggest/core/dictionary/multi_bigram_map.h"
|
||||||
#include "suggest/core/dictionary/ngram_listener.h"
|
#include "suggest/core/dictionary/ngram_listener.h"
|
||||||
#include "suggest/core/dictionary/property/bigram_property.h"
|
#include "suggest/core/dictionary/property/ngram_property.h"
|
||||||
#include "suggest/core/dictionary/property/unigram_property.h"
|
#include "suggest/core/dictionary/property/unigram_property.h"
|
||||||
#include "suggest/core/dictionary/property/word_property.h"
|
#include "suggest/core/dictionary/property/word_property.h"
|
||||||
#include "suggest/core/session/prev_words_info.h"
|
#include "suggest/core/session/prev_words_info.h"
|
||||||
|
@ -266,7 +266,7 @@ bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCod
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||||
const BigramProperty *const bigramProperty) {
|
const NgramProperty *const ngramProperty) {
|
||||||
if (!mBuffers->isUpdatable()) {
|
if (!mBuffers->isUpdatable()) {
|
||||||
AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
|
AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
|
||||||
return false;
|
return false;
|
||||||
|
@ -280,9 +280,9 @@ bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsI
|
||||||
AKLOGE("prev words info is not valid for adding n-gram entry to the dictionary.");
|
AKLOGE("prev words info is not valid for adding n-gram entry to the dictionary.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (bigramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
|
if (ngramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
|
||||||
AKLOGE("The word is too long to insert the ngram to the dictionary. "
|
AKLOGE("The word is too long to insert the ngram to the dictionary. "
|
||||||
"length: %zd", bigramProperty->getTargetCodePoints()->size());
|
"length: %zd", ngramProperty->getTargetCodePoints()->size());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
|
WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
|
||||||
|
@ -311,13 +311,13 @@ bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsI
|
||||||
// Refresh word ids.
|
// Refresh word ids.
|
||||||
prevWordsInfo->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */);
|
prevWordsInfo->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */);
|
||||||
}
|
}
|
||||||
const int wordId = getWordId(CodePointArrayView(*bigramProperty->getTargetCodePoints()),
|
const int wordId = getWordId(CodePointArrayView(*ngramProperty->getTargetCodePoints()),
|
||||||
false /* forceLowerCaseSearch */);
|
false /* forceLowerCaseSearch */);
|
||||||
if (wordId == NOT_A_WORD_ID) {
|
if (wordId == NOT_A_WORD_ID) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
bool addedNewEntry = false;
|
bool addedNewEntry = false;
|
||||||
if (mNodeWriter.addNgramEntry(prevWordIds, wordId, bigramProperty, &addedNewEntry)) {
|
if (mNodeWriter.addNgramEntry(prevWordIds, wordId, ngramProperty, &addedNewEntry)) {
|
||||||
if (addedNewEntry) {
|
if (addedNewEntry) {
|
||||||
mBigramCount++;
|
mBigramCount++;
|
||||||
}
|
}
|
||||||
|
@ -451,7 +451,7 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
|
||||||
const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
|
const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
|
||||||
// Fetch bigram information.
|
// Fetch bigram information.
|
||||||
// TODO: Support n-gram.
|
// TODO: Support n-gram.
|
||||||
std::vector<BigramProperty> bigrams;
|
std::vector<NgramProperty> ngrams;
|
||||||
const WordIdArrayView prevWordIds = WordIdArrayView::singleElementView(&wordId);
|
const WordIdArrayView prevWordIds = WordIdArrayView::singleElementView(&wordId);
|
||||||
int bigramWord1CodePoints[MAX_WORD_LENGTH];
|
int bigramWord1CodePoints[MAX_WORD_LENGTH];
|
||||||
for (const auto entry : mBuffers->getLanguageModelDictContent()->getProbabilityEntries(
|
for (const auto entry : mBuffers->getLanguageModelDictContent()->getProbabilityEntries(
|
||||||
|
@ -463,7 +463,7 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
|
||||||
const int probability = probabilityEntry.hasHistoricalInfo() ?
|
const int probability = probabilityEntry.hasHistoricalInfo() ?
|
||||||
ForgettingCurveUtils::decodeProbability(historicalInfo, mHeaderPolicy) :
|
ForgettingCurveUtils::decodeProbability(historicalInfo, mHeaderPolicy) :
|
||||||
probabilityEntry.getProbability();
|
probabilityEntry.getProbability();
|
||||||
bigrams.emplace_back(CodePointArrayView(bigramWord1CodePoints, codePointCount).toVector(),
|
ngrams.emplace_back(CodePointArrayView(bigramWord1CodePoints, codePointCount).toVector(),
|
||||||
probability, historicalInfo->getTimeStamp(), historicalInfo->getLevel(),
|
probability, historicalInfo->getTimeStamp(), historicalInfo->getLevel(),
|
||||||
historicalInfo->getCount());
|
historicalInfo->getCount());
|
||||||
}
|
}
|
||||||
|
@ -489,7 +489,7 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
|
||||||
probabilityEntry.isNotAWord(), probabilityEntry.isBlacklisted(),
|
probabilityEntry.isNotAWord(), probabilityEntry.isBlacklisted(),
|
||||||
probabilityEntry.getProbability(), historicalInfo->getTimeStamp(),
|
probabilityEntry.getProbability(), historicalInfo->getTimeStamp(),
|
||||||
historicalInfo->getLevel(), historicalInfo->getCount(), &shortcuts);
|
historicalInfo->getLevel(), historicalInfo->getCount(), &shortcuts);
|
||||||
return WordProperty(wordCodePoints.toVector(), &unigramProperty, &bigrams);
|
return WordProperty(wordCodePoints.toVector(), &unigramProperty, &ngrams);
|
||||||
}
|
}
|
||||||
|
|
||||||
int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
|
int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
|
||||||
|
|
|
@ -37,6 +37,7 @@ namespace latinime {
|
||||||
class DicNode;
|
class DicNode;
|
||||||
class DicNodeVector;
|
class DicNodeVector;
|
||||||
|
|
||||||
|
// TODO: Support counting ngram entries.
|
||||||
// Word id = Artificial id that is stored in the PtNode looked up by the word.
|
// Word id = Artificial id that is stored in the PtNode looked up by the word.
|
||||||
class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
public:
|
public:
|
||||||
|
@ -92,7 +93,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
bool removeUnigramEntry(const CodePointArrayView wordCodePoints);
|
bool removeUnigramEntry(const CodePointArrayView wordCodePoints);
|
||||||
|
|
||||||
bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||||
const BigramProperty *const bigramProperty);
|
const NgramProperty *const ngramProperty);
|
||||||
|
|
||||||
bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||||
const CodePointArrayView wordCodePoints);
|
const CodePointArrayView wordCodePoints);
|
||||||
|
|
|
@ -114,14 +114,6 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
|
||||||
DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability
|
|
||||||
traversePolicyToUpdateBigramProbability(&ptNodeWriter);
|
|
||||||
if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
|
|
||||||
&traversePolicyToUpdateBigramProbability)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Mapping from positions in mBuffer to positions in bufferToWrite.
|
// Mapping from positions in mBuffer to positions in bufferToWrite.
|
||||||
PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap;
|
PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap;
|
||||||
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
||||||
|
|
|
@ -33,6 +33,7 @@ class Ver4PatriciaTrieWritingHelper {
|
||||||
Ver4PatriciaTrieWritingHelper(Ver4DictBuffers *const buffers)
|
Ver4PatriciaTrieWritingHelper(Ver4DictBuffers *const buffers)
|
||||||
: mBuffers(buffers) {}
|
: mBuffers(buffers) {}
|
||||||
|
|
||||||
|
// TODO: Support counting ngram entries.
|
||||||
bool writeToDictFile(const char *const dictDirPath, const int unigramCount,
|
bool writeToDictFile(const char *const dictDirPath, const int unigramCount,
|
||||||
const int bigramCount) const;
|
const int bigramCount) const;
|
||||||
|
|
||||||
|
@ -70,11 +71,6 @@ class Ver4PatriciaTrieWritingHelper {
|
||||||
Ver4DictBuffers *const buffersToWrite, int *const outUnigramCount,
|
Ver4DictBuffers *const buffersToWrite, int *const outUnigramCount,
|
||||||
int *const outBigramCount);
|
int *const outBigramCount);
|
||||||
|
|
||||||
bool truncateUnigrams(const Ver4PatriciaTrieNodeReader *const ptNodeReader,
|
|
||||||
Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount);
|
|
||||||
|
|
||||||
bool truncateBigrams(const int maxBigramCount);
|
|
||||||
|
|
||||||
Ver4DictBuffers *const mBuffers;
|
Ver4DictBuffers *const mBuffers;
|
||||||
};
|
};
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
|
|
Loading…
Reference in a new issue