am e6926663: Merge "Support n-gram for look-up."
* commit 'e69266638f978ee19f7d9819695beaad87372c3e': Support n-gram for look-up.
This commit is contained in:
commit 25d778c33d
6 changed files with 28 additions and 18 deletions
|
@ -167,6 +167,14 @@ int LanguageModelDictContent::createAndGetBitmapEntryIndex(const WordIdArrayView
|
|||
if (lastBitmapEntryIndex == TrieMap::INVALID_INDEX) {
|
||||
return TrieMap::INVALID_INDEX;
|
||||
}
|
||||
const int oldestPrevWordId = prevWordIds[prevWordIds.size() - 1];
|
||||
const TrieMap::Result result = mTrieMap.get(oldestPrevWordId, lastBitmapEntryIndex);
|
||||
if (!result.mIsValid) {
|
||||
if (!mTrieMap.put(oldestPrevWordId,
|
||||
ProbabilityEntry().encode(mHasHistoricalInfo), lastBitmapEntryIndex)) {
|
||||
return TrieMap::INVALID_INDEX;
|
||||
}
|
||||
}
|
||||
return mTrieMap.getNextLevelBitmapEntryIndex(prevWordIds[prevWordIds.size() - 1],
|
||||
lastBitmapEntryIndex);
|
||||
}
|
||||
|
|
|
@ -36,7 +36,8 @@ class ProbabilityEntry {
|
|||
|
||||
// Dummy entry
|
||||
ProbabilityEntry()
|
||||
: mFlags(0), mProbability(NOT_A_PROBABILITY), mHistoricalInfo() {}
|
||||
: mFlags(Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY), mProbability(NOT_A_PROBABILITY),
|
||||
mHistoricalInfo() {}
|
||||
|
||||
// Entry without historical information
|
||||
ProbabilityEntry(const int flags, const int probability)
|
||||
|
@ -61,7 +62,7 @@ class ProbabilityEntry {
|
|||
bigramProperty->getCount()) {}
|
||||
|
||||
bool isValid() const {
|
||||
return (mProbability != NOT_A_PROBABILITY) || hasHistoricalInfo();
|
||||
return (mFlags & Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY) == 0;
|
||||
}
|
||||
|
||||
bool hasHistoricalInfo() const {
|
||||
|
|
|
@ -53,6 +53,7 @@ const int Ver4DictConstants::WORD_LEVEL_FIELD_SIZE = 1;
|
|||
const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 1;
|
||||
|
||||
const uint8_t Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE = 0x1;
|
||||
const uint8_t Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY = 0x2;
|
||||
|
||||
const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64;
|
||||
const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE = 4;
|
||||
|
|
|
@ -51,6 +51,7 @@ class Ver4DictConstants {
|
|||
static const int WORD_COUNT_FIELD_SIZE;
|
||||
// Flags in probability entry.
|
||||
static const uint8_t FLAG_REPRESENTS_BEGINNING_OF_SENTENCE;
|
||||
static const uint8_t FLAG_NOT_A_VALID_ENTRY;
|
||||
|
||||
static const int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE;
|
||||
static const int SHORTCUT_ADDRESS_TABLE_DATA_SIZE;
|
||||
|
|
|
@ -120,16 +120,15 @@ const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext(
|
|||
const int ptNodePos =
|
||||
mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
|
||||
const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
|
||||
// TODO: Support n-gram.
|
||||
const int probability = mBuffers->getLanguageModelDictContent()->getWordProbability(
|
||||
prevWordIds.limit(1 /* maxSize */), wordId, mHeaderPolicy);
|
||||
prevWordIds, wordId, mHeaderPolicy);
|
||||
return WordAttributes(probability, ptNodeParams.isBlacklisted(), ptNodeParams.isNotAWord(),
|
||||
probability == 0);
|
||||
}
|
||||
|
||||
int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
|
||||
const int wordId) const {
|
||||
if (wordId == NOT_A_WORD_ID) {
|
||||
if (wordId == NOT_A_WORD_ID || prevWordIds.contains(NOT_A_WORD_ID)) {
|
||||
return NOT_A_PROBABILITY;
|
||||
}
|
||||
const int ptNodePos =
|
||||
|
@ -138,10 +137,8 @@ int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordI
|
|||
if (ptNodeParams.isDeleted() || ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord()) {
|
||||
return NOT_A_PROBABILITY;
|
||||
}
|
||||
// TODO: Support n-gram.
|
||||
const ProbabilityEntry probabilityEntry =
|
||||
mBuffers->getLanguageModelDictContent()->getNgramProbabilityEntry(
|
||||
prevWordIds.limit(1 /* maxSize */), wordId);
|
||||
mBuffers->getLanguageModelDictContent()->getNgramProbabilityEntry(prevWordIds, wordId);
|
||||
if (!probabilityEntry.isValid()) {
|
||||
return NOT_A_PROBABILITY;
|
||||
}
|
||||
|
@ -164,18 +161,20 @@ void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordI
|
|||
if (prevWordIds.empty()) {
|
||||
return;
|
||||
}
|
||||
// TODO: Support n-gram.
|
||||
const auto languageModelDictContent = mBuffers->getLanguageModelDictContent();
|
||||
for (size_t i = 1; i <= prevWordIds.size(); ++i) {
|
||||
for (const auto entry : languageModelDictContent->getProbabilityEntries(
|
||||
prevWordIds.limit(1 /* maxSize */))) {
|
||||
prevWordIds.limit(i))) {
|
||||
const ProbabilityEntry &probabilityEntry = entry.getProbabilityEntry();
|
||||
const int probability = probabilityEntry.hasHistoricalInfo() ?
|
||||
ForgettingCurveUtils::decodeProbability(
|
||||
probabilityEntry.getHistoricalInfo(), mHeaderPolicy) :
|
||||
probabilityEntry.getHistoricalInfo(), mHeaderPolicy)
|
||||
+ ForgettingCurveUtils::getProbabilityBiasForNgram(i + 1 /* n */) :
|
||||
probabilityEntry.getProbability();
|
||||
listener->onVisitEntry(probability, entry.getWordId());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int Ver4PatriciaTriePolicy::getShortcutPositionOfWord(const int wordId) const {
|
||||
if (wordId == NOT_A_WORD_ID) {
|
||||
|
|
|
@ -29,7 +29,7 @@ namespace {
|
|||
TEST(LanguageModelDictContentTest, TestUnigramProbability) {
|
||||
LanguageModelDictContent languageModelDictContent(false /* useHistoricalInfo */);
|
||||
|
||||
const int flag = 0xFF;
|
||||
const int flag = 0xF0;
|
||||
const int probability = 10;
|
||||
const int wordId = 100;
|
||||
const ProbabilityEntry probabilityEntry(flag, probability);
|
||||
|
|
Loading…
Reference in a new issue