Merge "Add Beginning-of-Sentence info in UnigramProperty."
This commit is contained in:
commit
d6f1c5c7fb
9 changed files with 41 additions and 16 deletions
|
@ -341,8 +341,8 @@ static void latinime_BinaryDictionary_addUnigramWord(JNIEnv *env, jclass clazz,
|
|||
shortcuts.emplace_back(&shortcutTargetCodePoints, shortcutProbability);
|
||||
}
|
||||
// Use 1 for count to indicate the word has been inputted.
|
||||
const UnigramProperty unigramProperty(isNotAWord, isBlacklisted,
|
||||
probability, timestamp, 0 /* level */, 1 /* count */, &shortcuts);
|
||||
const UnigramProperty unigramProperty(false /* isBeginningOfSentence */, isNotAWord,
|
||||
isBlacklisted, probability, timestamp, 0 /* level */, 1 /* count */, &shortcuts);
|
||||
dictionary->addUnigramEntry(codePoints, codePointCount, &unigramProperty);
|
||||
}
|
||||
|
||||
|
@ -450,8 +450,9 @@ static int latinime_BinaryDictionary_addMultipleDictionaryEntries(JNIEnv *env, j
|
|||
shortcuts.emplace_back(&shortcutTargetCodePoints, shortcutProbability);
|
||||
}
|
||||
// Use 1 for count to indicate the word has been inputted.
|
||||
const UnigramProperty unigramProperty(isNotAWord, isBlacklisted,
|
||||
unigramProbability, timestamp, 0 /* level */, 1 /* count */, &shortcuts);
|
||||
const UnigramProperty unigramProperty(false /* isBeginningOfSentence */, isNotAWord,
|
||||
isBlacklisted, unigramProbability, timestamp, 0 /* level */, 1 /* count */,
|
||||
&shortcuts);
|
||||
dictionary->addUnigramEntry(word1CodePoints, word1Length, &unigramProperty);
|
||||
if (word0) {
|
||||
jint bigramProbability = env->GetIntField(languageModelParam, bigramProbabilityFieldId);
|
||||
|
|
|
@ -82,6 +82,12 @@ int Dictionary::getBigramProbability(const PrevWordsInfo *const prevWordsInfo, c
|
|||
|
||||
// Adds a unigram entry for |word| (|length| code points) described by |unigramProperty|.
// The request is dropped with an error log when the property represents
// Beginning-of-Sentence but the header policy reports that it is not supported.
void Dictionary::addUnigramEntry(const int *const word, const int length,
|
||||
const UnigramProperty *const unigramProperty) {
|
||||
// Reject Beginning-of-Sentence entries for dictionaries whose header policy
// does not support them.
if (unigramProperty->representsBeginningOfSentence()
|
||||
&& !mDictionaryStructureWithBufferPolicy->getHeaderStructurePolicy()
|
||||
->supportsBeginningOfSentence()) {
|
||||
AKLOGE("The dictionary doesn't support Beginning-of-Sentence.");
|
||||
return;
|
||||
}
|
||||
// Update the current time before delegating to the structure policy.
TimeKeeper::setCurrentTime();
|
||||
mDictionaryStructureWithBufferPolicy->addUnigramEntry(word, length, unigramProperty);
|
||||
}
|
||||
|
|
|
@ -48,15 +48,21 @@ class UnigramProperty {
|
|||
};
|
||||
|
||||
UnigramProperty()
|
||||
: mIsNotAWord(false), mIsBlacklisted(false), mProbability(NOT_A_PROBABILITY),
|
||||
mTimestamp(NOT_A_TIMESTAMP), mLevel(0), mCount(0), mShortcuts() {}
|
||||
: mRepresentsBeginningOfSentence(false), mIsNotAWord(false), mIsBlacklisted(false),
|
||||
mProbability(NOT_A_PROBABILITY), mTimestamp(NOT_A_TIMESTAMP), mLevel(0), mCount(0),
|
||||
mShortcuts() {}
|
||||
|
||||
UnigramProperty(const bool isNotAWord, const bool isBlacklisted, const int probability,
|
||||
const int timestamp, const int level, const int count,
|
||||
const std::vector<ShortcutProperty> *const shortcuts)
|
||||
: mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), mProbability(probability),
|
||||
UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
|
||||
const bool isBlacklisted, const int probability, const int timestamp, const int level,
|
||||
const int count, const std::vector<ShortcutProperty> *const shortcuts)
|
||||
: mRepresentsBeginningOfSentence(representsBeginningOfSentence),
|
||||
mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), mProbability(probability),
|
||||
mTimestamp(timestamp), mLevel(level), mCount(count), mShortcuts(*shortcuts) {}
|
||||
|
||||
// Returns the Beginning-of-Sentence flag this property was constructed with.
bool representsBeginningOfSentence() const {
|
||||
return mRepresentsBeginningOfSentence;
|
||||
}
|
||||
|
||||
// Returns the not-a-word flag this property was constructed with.
bool isNotAWord() const {
|
||||
return mIsNotAWord;
|
||||
}
|
||||
|
@ -94,6 +100,7 @@ class UnigramProperty {
|
|||
DISALLOW_ASSIGNMENT_OPERATOR(UnigramProperty);
|
||||
|
||||
// TODO: Make members const.
|
||||
bool mRepresentsBeginningOfSentence;
|
||||
bool mIsNotAWord;
|
||||
bool mIsBlacklisted;
|
||||
int mProbability;
|
||||
|
|
|
@ -51,6 +51,8 @@ class DictionaryHeaderStructurePolicy {
|
|||
|
||||
virtual const std::vector<int> *getLocale() const = 0;
|
||||
|
||||
virtual bool supportsBeginningOfSentence() const = 0;
|
||||
|
||||
protected:
|
||||
DictionaryHeaderStructurePolicy() {}
|
||||
|
||||
|
|
|
@ -246,6 +246,10 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
|||
return &mLocale;
|
||||
}
|
||||
|
||||
// Implements DictionaryHeaderStructurePolicy::supportsBeginningOfSentence().
// Returns true only when the dictionary format version is FormatUtils::VERSION_4_DEV.
bool supportsBeginningOfSentence() const {
|
||||
return mDictFormatVersion == FormatUtils::VERSION_4_DEV;
|
||||
}
|
||||
|
||||
private:
|
||||
DISALLOW_COPY_AND_ASSIGN(HeaderPolicy);
|
||||
|
||||
|
|
|
@ -432,8 +432,8 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code
|
|||
shortcuts.emplace_back(&target, shortcutProbability);
|
||||
}
|
||||
}
|
||||
const UnigramProperty unigramProperty(ptNodeParams.isNotAWord(),
|
||||
ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
|
||||
const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */,
|
||||
ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
|
||||
historicalInfo->getTimeStamp(), historicalInfo->getLevel(),
|
||||
historicalInfo->getCount(), &shortcuts);
|
||||
return WordProperty(&codePointVector, &unigramProperty, &bigrams);
|
||||
|
|
|
@ -164,6 +164,11 @@ class PtNodeParams {
|
|||
&& isNotAWord();
|
||||
}
|
||||
|
||||
// Returns true when this node has at least one code point, its first code point is
// CODE_POINT_BEGINNING_OF_SENTENCE, and the node is marked as not-a-word.
// The result is a truth value, so it is returned as bool (it is passed to the
// bool representsBeginningOfSentence parameter of UnigramProperty's constructor).
AK_FORCE_INLINE bool representsBeginningOfSentence() const {
|
||||
return getCodePointCount() > 0 && getCodePoints()[0] == CODE_POINT_BEGINNING_OF_SENTENCE
|
||||
&& isNotAWord();
|
||||
}
|
||||
|
||||
// Parent node position
|
||||
AK_FORCE_INLINE int getParentPos() const {
|
||||
return mParentPos;
|
||||
|
|
|
@ -383,8 +383,8 @@ const WordProperty PatriciaTriePolicy::getWordProperty(const int *const codePoin
|
|||
shortcuts.emplace_back(&shortcutTarget, shortcutProbability);
|
||||
}
|
||||
}
|
||||
const UnigramProperty unigramProperty(ptNodeParams.isNotAWord(),
|
||||
ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
|
||||
const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
|
||||
ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
|
||||
NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */, &shortcuts);
|
||||
return WordProperty(&codePointVector, &unigramProperty, &bigrams);
|
||||
}
|
||||
|
|
|
@ -430,8 +430,8 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code
|
|||
shortcuts.emplace_back(&target, shortcutProbability);
|
||||
}
|
||||
}
|
||||
const UnigramProperty unigramProperty(ptNodeParams.isNotAWord(),
|
||||
ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
|
||||
const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
|
||||
ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
|
||||
historicalInfo->getTimeStamp(), historicalInfo->getLevel(),
|
||||
historicalInfo->getCount(), &shortcuts);
|
||||
return WordProperty(&codePointVector, &unigramProperty, &bigrams);
|
||||
|
|
Loading…
Reference in a new issue