Add Beginning-of-Sentence info in UnigramProperty.

Bug: 14119293
Bug: 14425059
Change-Id: I8a894352568377d32468e5563f4e89af00d22944
main
Keisuke Kuroyanagi 2014-05-23 00:07:14 +09:00
parent 9f8c9a0161
commit 2a015dcb25
9 changed files with 41 additions and 16 deletions

View File

@ -341,8 +341,8 @@ static void latinime_BinaryDictionary_addUnigramWord(JNIEnv *env, jclass clazz,
shortcuts.emplace_back(&shortcutTargetCodePoints, shortcutProbability); shortcuts.emplace_back(&shortcutTargetCodePoints, shortcutProbability);
} }
// Use 1 for count to indicate the word has been input. // Use 1 for count to indicate the word has been input.
const UnigramProperty unigramProperty(isNotAWord, isBlacklisted, const UnigramProperty unigramProperty(false /* isBeginningOfSentence */, isNotAWord,
probability, timestamp, 0 /* level */, 1 /* count */, &shortcuts); isBlacklisted, probability, timestamp, 0 /* level */, 1 /* count */, &shortcuts);
dictionary->addUnigramEntry(codePoints, codePointCount, &unigramProperty); dictionary->addUnigramEntry(codePoints, codePointCount, &unigramProperty);
} }
@ -450,8 +450,9 @@ static int latinime_BinaryDictionary_addMultipleDictionaryEntries(JNIEnv *env, j
shortcuts.emplace_back(&shortcutTargetCodePoints, shortcutProbability); shortcuts.emplace_back(&shortcutTargetCodePoints, shortcutProbability);
} }
// Use 1 for count to indicate the word has been input. // Use 1 for count to indicate the word has been input.
const UnigramProperty unigramProperty(isNotAWord, isBlacklisted, const UnigramProperty unigramProperty(false /* isBeginningOfSentence */, isNotAWord,
unigramProbability, timestamp, 0 /* level */, 1 /* count */, &shortcuts); isBlacklisted, unigramProbability, timestamp, 0 /* level */, 1 /* count */,
&shortcuts);
dictionary->addUnigramEntry(word1CodePoints, word1Length, &unigramProperty); dictionary->addUnigramEntry(word1CodePoints, word1Length, &unigramProperty);
if (word0) { if (word0) {
jint bigramProbability = env->GetIntField(languageModelParam, bigramProbabilityFieldId); jint bigramProbability = env->GetIntField(languageModelParam, bigramProbabilityFieldId);

View File

@ -82,6 +82,12 @@ int Dictionary::getBigramProbability(const PrevWordsInfo *const prevWordsInfo, c
// Adds a unigram entry for `word` (`length` elements) with the given property by delegating to
// the dictionary structure policy. A Beginning-of-Sentence property is rejected with an error
// log, and nothing is added, when the dictionary header reports that it is not supported.
void Dictionary::addUnigramEntry(const int *const word, const int length, void Dictionary::addUnigramEntry(const int *const word, const int length,
const UnigramProperty *const unigramProperty) { const UnigramProperty *const unigramProperty) {
// Guard first: on rejection we return before touching TimeKeeper or the structure policy.
if (unigramProperty->representsBeginningOfSentence()
&& !mDictionaryStructureWithBufferPolicy->getHeaderStructurePolicy()
->supportsBeginningOfSentence()) {
AKLOGE("The dictionary doesn't support Beginning-of-Sentence.");
return;
}
// The current time is refreshed before the entry is handed to the structure policy.
TimeKeeper::setCurrentTime(); TimeKeeper::setCurrentTime();
mDictionaryStructureWithBufferPolicy->addUnigramEntry(word, length, unigramProperty); mDictionaryStructureWithBufferPolicy->addUnigramEntry(word, length, unigramProperty);
} }

View File

@ -48,15 +48,21 @@ class UnigramProperty {
}; };
// Default constructor: every boolean flag (including the Beginning-of-Sentence flag) is false,
// probability is NOT_A_PROBABILITY, timestamp is NOT_A_TIMESTAMP, level and count are 0, and
// the shortcut list is empty.
UnigramProperty() UnigramProperty()
: mIsNotAWord(false), mIsBlacklisted(false), mProbability(NOT_A_PROBABILITY), : mRepresentsBeginningOfSentence(false), mIsNotAWord(false), mIsBlacklisted(false),
mTimestamp(NOT_A_TIMESTAMP), mLevel(0), mCount(0), mShortcuts() {} mProbability(NOT_A_PROBABILITY), mTimestamp(NOT_A_TIMESTAMP), mLevel(0), mCount(0),
mShortcuts() {}
// Value constructor: stores each argument in the member of the same name. The new leading
// argument `representsBeginningOfSentence` comes first, ahead of `isNotAWord`.
// `shortcuts` is dereferenced and copied into mShortcuts, so callers must pass a non-null
// pointer (an empty vector is fine).
UnigramProperty(const bool isNotAWord, const bool isBlacklisted, const int probability, UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
const int timestamp, const int level, const int count, const bool isBlacklisted, const int probability, const int timestamp, const int level,
const std::vector<ShortcutProperty> *const shortcuts) const int count, const std::vector<ShortcutProperty> *const shortcuts)
: mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), mProbability(probability), : mRepresentsBeginningOfSentence(representsBeginningOfSentence),
mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), mProbability(probability),
mTimestamp(timestamp), mLevel(level), mCount(count), mShortcuts(*shortcuts) {} mTimestamp(timestamp), mLevel(level), mCount(count), mShortcuts(*shortcuts) {}
// Returns the stored Beginning-of-Sentence flag of this property.
bool representsBeginningOfSentence() const {
    const bool isBeginningOfSentence = mRepresentsBeginningOfSentence;
    return isBeginningOfSentence;
}
bool isNotAWord() const { bool isNotAWord() const {
return mIsNotAWord; return mIsNotAWord;
} }
@ -94,6 +100,7 @@ class UnigramProperty {
DISALLOW_ASSIGNMENT_OPERATOR(UnigramProperty); DISALLOW_ASSIGNMENT_OPERATOR(UnigramProperty);
// TODO: Make members const. // TODO: Make members const.
bool mRepresentsBeginningOfSentence;
bool mIsNotAWord; bool mIsNotAWord;
bool mIsBlacklisted; bool mIsBlacklisted;
int mProbability; int mProbability;

View File

@ -51,6 +51,8 @@ class DictionaryHeaderStructurePolicy {
virtual const std::vector<int> *getLocale() const = 0; virtual const std::vector<int> *getLocale() const = 0;
virtual bool supportsBeginningOfSentence() const = 0;
protected: protected:
DictionaryHeaderStructurePolicy() {} DictionaryHeaderStructurePolicy() {}

View File

@ -246,6 +246,10 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
return &mLocale; return &mLocale;
} }
// Reports Beginning-of-Sentence support, which is granted only when the dictionary format
// version equals FormatUtils::VERSION_4_DEV.
bool supportsBeginningOfSentence() const {
    const bool isVersion4Dev = (mDictFormatVersion == FormatUtils::VERSION_4_DEV);
    return isVersion4Dev;
}
private: private:
DISALLOW_COPY_AND_ASSIGN(HeaderPolicy); DISALLOW_COPY_AND_ASSIGN(HeaderPolicy);

View File

@ -432,8 +432,8 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code
shortcuts.emplace_back(&target, shortcutProbability); shortcuts.emplace_back(&target, shortcutProbability);
} }
} }
const UnigramProperty unigramProperty(ptNodeParams.isNotAWord(), const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */,
ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
historicalInfo->getTimeStamp(), historicalInfo->getLevel(), historicalInfo->getTimeStamp(), historicalInfo->getLevel(),
historicalInfo->getCount(), &shortcuts); historicalInfo->getCount(), &shortcuts);
return WordProperty(&codePointVector, &unigramProperty, &bigrams); return WordProperty(&codePointVector, &unigramProperty, &bigrams);

View File

@ -164,6 +164,11 @@ class PtNodeParams {
&& isNotAWord(); && isNotAWord();
} }
// Returns true when this PtNode is a Beginning-of-Sentence marker: it has at least one code
// point, its first code point is CODE_POINT_BEGINNING_OF_SENTENCE, and it is flagged as
// not-a-word. The result is a pure predicate, so it is typed bool; the value still converts to
// 0/1 for any caller that treats it as an integer.
AK_FORCE_INLINE bool representsBeginningOfSentence() const {
    // The count check comes first so getCodePoints()[0] is never read for an empty node.
    return getCodePointCount() > 0 && getCodePoints()[0] == CODE_POINT_BEGINNING_OF_SENTENCE
            && isNotAWord();
}
// Parent node position // Parent node position
AK_FORCE_INLINE int getParentPos() const { AK_FORCE_INLINE int getParentPos() const {
return mParentPos; return mParentPos;

View File

@ -383,8 +383,8 @@ const WordProperty PatriciaTriePolicy::getWordProperty(const int *const codePoin
shortcuts.emplace_back(&shortcutTarget, shortcutProbability); shortcuts.emplace_back(&shortcutTarget, shortcutProbability);
} }
} }
const UnigramProperty unigramProperty(ptNodeParams.isNotAWord(), const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */, &shortcuts); NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */, &shortcuts);
return WordProperty(&codePointVector, &unigramProperty, &bigrams); return WordProperty(&codePointVector, &unigramProperty, &bigrams);
} }

View File

@ -430,8 +430,8 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code
shortcuts.emplace_back(&target, shortcutProbability); shortcuts.emplace_back(&target, shortcutProbability);
} }
} }
const UnigramProperty unigramProperty(ptNodeParams.isNotAWord(), const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
historicalInfo->getTimeStamp(), historicalInfo->getLevel(), historicalInfo->getTimeStamp(), historicalInfo->getLevel(),
historicalInfo->getCount(), &shortcuts); historicalInfo->getCount(), &shortcuts);
return WordProperty(&codePointVector, &unigramProperty, &bigrams); return WordProperty(&codePointVector, &unigramProperty, &bigrams);