Merge "Support n-gram for look-up."
commit
e69266638f
|
@ -167,6 +167,14 @@ int LanguageModelDictContent::createAndGetBitmapEntryIndex(const WordIdArrayView
|
||||||
if (lastBitmapEntryIndex == TrieMap::INVALID_INDEX) {
|
if (lastBitmapEntryIndex == TrieMap::INVALID_INDEX) {
|
||||||
return TrieMap::INVALID_INDEX;
|
return TrieMap::INVALID_INDEX;
|
||||||
}
|
}
|
||||||
|
const int oldestPrevWordId = prevWordIds[prevWordIds.size() - 1];
|
||||||
|
const TrieMap::Result result = mTrieMap.get(oldestPrevWordId, lastBitmapEntryIndex);
|
||||||
|
if (!result.mIsValid) {
|
||||||
|
if (!mTrieMap.put(oldestPrevWordId,
|
||||||
|
ProbabilityEntry().encode(mHasHistoricalInfo), lastBitmapEntryIndex)) {
|
||||||
|
return TrieMap::INVALID_INDEX;
|
||||||
|
}
|
||||||
|
}
|
||||||
return mTrieMap.getNextLevelBitmapEntryIndex(prevWordIds[prevWordIds.size() - 1],
|
return mTrieMap.getNextLevelBitmapEntryIndex(prevWordIds[prevWordIds.size() - 1],
|
||||||
lastBitmapEntryIndex);
|
lastBitmapEntryIndex);
|
||||||
}
|
}
|
||||||
|
|
|
@ -36,7 +36,8 @@ class ProbabilityEntry {
|
||||||
|
|
||||||
// Dummy entry
|
// Dummy entry
|
||||||
ProbabilityEntry()
|
ProbabilityEntry()
|
||||||
: mFlags(0), mProbability(NOT_A_PROBABILITY), mHistoricalInfo() {}
|
: mFlags(Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY), mProbability(NOT_A_PROBABILITY),
|
||||||
|
mHistoricalInfo() {}
|
||||||
|
|
||||||
// Entry without historical information
|
// Entry without historical information
|
||||||
ProbabilityEntry(const int flags, const int probability)
|
ProbabilityEntry(const int flags, const int probability)
|
||||||
|
@ -61,7 +62,7 @@ class ProbabilityEntry {
|
||||||
bigramProperty->getCount()) {}
|
bigramProperty->getCount()) {}
|
||||||
|
|
||||||
bool isValid() const {
|
bool isValid() const {
|
||||||
return (mProbability != NOT_A_PROBABILITY) || hasHistoricalInfo();
|
return (mFlags & Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY) == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool hasHistoricalInfo() const {
|
bool hasHistoricalInfo() const {
|
||||||
|
|
|
@ -53,6 +53,7 @@ const int Ver4DictConstants::WORD_LEVEL_FIELD_SIZE = 1;
|
||||||
const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 1;
|
const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 1;
|
||||||
|
|
||||||
const uint8_t Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE = 0x1;
|
const uint8_t Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE = 0x1;
|
||||||
|
const uint8_t Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY = 0x2;
|
||||||
|
|
||||||
const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64;
|
const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64;
|
||||||
const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE = 4;
|
const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE = 4;
|
||||||
|
|
|
@ -51,6 +51,7 @@ class Ver4DictConstants {
|
||||||
static const int WORD_COUNT_FIELD_SIZE;
|
static const int WORD_COUNT_FIELD_SIZE;
|
||||||
// Flags in probability entry.
|
// Flags in probability entry.
|
||||||
static const uint8_t FLAG_REPRESENTS_BEGINNING_OF_SENTENCE;
|
static const uint8_t FLAG_REPRESENTS_BEGINNING_OF_SENTENCE;
|
||||||
|
static const uint8_t FLAG_NOT_A_VALID_ENTRY;
|
||||||
|
|
||||||
static const int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE;
|
static const int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE;
|
||||||
static const int SHORTCUT_ADDRESS_TABLE_DATA_SIZE;
|
static const int SHORTCUT_ADDRESS_TABLE_DATA_SIZE;
|
||||||
|
|
|
@ -120,16 +120,15 @@ const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext(
|
||||||
const int ptNodePos =
|
const int ptNodePos =
|
||||||
mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
|
mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
|
||||||
const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
|
const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
|
||||||
// TODO: Support n-gram.
|
|
||||||
const int probability = mBuffers->getLanguageModelDictContent()->getWordProbability(
|
const int probability = mBuffers->getLanguageModelDictContent()->getWordProbability(
|
||||||
prevWordIds.limit(1 /* maxSize */), wordId, mHeaderPolicy);
|
prevWordIds, wordId, mHeaderPolicy);
|
||||||
return WordAttributes(probability, ptNodeParams.isBlacklisted(), ptNodeParams.isNotAWord(),
|
return WordAttributes(probability, ptNodeParams.isBlacklisted(), ptNodeParams.isNotAWord(),
|
||||||
probability == 0);
|
probability == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
|
int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
|
||||||
const int wordId) const {
|
const int wordId) const {
|
||||||
if (wordId == NOT_A_WORD_ID) {
|
if (wordId == NOT_A_WORD_ID || prevWordIds.contains(NOT_A_WORD_ID)) {
|
||||||
return NOT_A_PROBABILITY;
|
return NOT_A_PROBABILITY;
|
||||||
}
|
}
|
||||||
const int ptNodePos =
|
const int ptNodePos =
|
||||||
|
@ -138,10 +137,8 @@ int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordI
|
||||||
if (ptNodeParams.isDeleted() || ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord()) {
|
if (ptNodeParams.isDeleted() || ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord()) {
|
||||||
return NOT_A_PROBABILITY;
|
return NOT_A_PROBABILITY;
|
||||||
}
|
}
|
||||||
// TODO: Support n-gram.
|
|
||||||
const ProbabilityEntry probabilityEntry =
|
const ProbabilityEntry probabilityEntry =
|
||||||
mBuffers->getLanguageModelDictContent()->getNgramProbabilityEntry(
|
mBuffers->getLanguageModelDictContent()->getNgramProbabilityEntry(prevWordIds, wordId);
|
||||||
prevWordIds.limit(1 /* maxSize */), wordId);
|
|
||||||
if (!probabilityEntry.isValid()) {
|
if (!probabilityEntry.isValid()) {
|
||||||
return NOT_A_PROBABILITY;
|
return NOT_A_PROBABILITY;
|
||||||
}
|
}
|
||||||
|
@ -164,18 +161,20 @@ void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordI
|
||||||
if (prevWordIds.empty()) {
|
if (prevWordIds.empty()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// TODO: Support n-gram.
|
|
||||||
const auto languageModelDictContent = mBuffers->getLanguageModelDictContent();
|
const auto languageModelDictContent = mBuffers->getLanguageModelDictContent();
|
||||||
|
for (size_t i = 1; i <= prevWordIds.size(); ++i) {
|
||||||
for (const auto entry : languageModelDictContent->getProbabilityEntries(
|
for (const auto entry : languageModelDictContent->getProbabilityEntries(
|
||||||
prevWordIds.limit(1 /* maxSize */))) {
|
prevWordIds.limit(i))) {
|
||||||
const ProbabilityEntry &probabilityEntry = entry.getProbabilityEntry();
|
const ProbabilityEntry &probabilityEntry = entry.getProbabilityEntry();
|
||||||
const int probability = probabilityEntry.hasHistoricalInfo() ?
|
const int probability = probabilityEntry.hasHistoricalInfo() ?
|
||||||
ForgettingCurveUtils::decodeProbability(
|
ForgettingCurveUtils::decodeProbability(
|
||||||
probabilityEntry.getHistoricalInfo(), mHeaderPolicy) :
|
probabilityEntry.getHistoricalInfo(), mHeaderPolicy)
|
||||||
|
+ ForgettingCurveUtils::getProbabilityBiasForNgram(i + 1 /* n */) :
|
||||||
probabilityEntry.getProbability();
|
probabilityEntry.getProbability();
|
||||||
listener->onVisitEntry(probability, entry.getWordId());
|
listener->onVisitEntry(probability, entry.getWordId());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int Ver4PatriciaTriePolicy::getShortcutPositionOfWord(const int wordId) const {
|
int Ver4PatriciaTriePolicy::getShortcutPositionOfWord(const int wordId) const {
|
||||||
if (wordId == NOT_A_WORD_ID) {
|
if (wordId == NOT_A_WORD_ID) {
|
||||||
|
|
|
@ -29,7 +29,7 @@ namespace {
|
||||||
TEST(LanguageModelDictContentTest, TestUnigramProbability) {
|
TEST(LanguageModelDictContentTest, TestUnigramProbability) {
|
||||||
LanguageModelDictContent languageModelDictContent(false /* useHistoricalInfo */);
|
LanguageModelDictContent languageModelDictContent(false /* useHistoricalInfo */);
|
||||||
|
|
||||||
const int flag = 0xFF;
|
const int flag = 0xF0;
|
||||||
const int probability = 10;
|
const int probability = 10;
|
||||||
const int wordId = 100;
|
const int wordId = 100;
|
||||||
const ProbabilityEntry probabilityEntry(flag, probability);
|
const ProbabilityEntry probabilityEntry(flag, probability);
|
||||||
|
|
Loading…
Reference in New Issue