Support n-gram for look-up.

Bug: 14425059
Change-Id: I19523c29fb802cd65158c7540d1608e7f55c4ca7
main
Keisuke Kuroyanagi 2014-09-16 18:10:56 +09:00
parent 3676c11472
commit 4926b90ec5
6 changed files with 28 additions and 18 deletions

View File

@ -167,6 +167,14 @@ int LanguageModelDictContent::createAndGetBitmapEntryIndex(const WordIdArrayView
if (lastBitmapEntryIndex == TrieMap::INVALID_INDEX) { if (lastBitmapEntryIndex == TrieMap::INVALID_INDEX) {
return TrieMap::INVALID_INDEX; return TrieMap::INVALID_INDEX;
} }
const int oldestPrevWordId = prevWordIds[prevWordIds.size() - 1];
const TrieMap::Result result = mTrieMap.get(oldestPrevWordId, lastBitmapEntryIndex);
if (!result.mIsValid) {
if (!mTrieMap.put(oldestPrevWordId,
ProbabilityEntry().encode(mHasHistoricalInfo), lastBitmapEntryIndex)) {
return TrieMap::INVALID_INDEX;
}
}
return mTrieMap.getNextLevelBitmapEntryIndex(prevWordIds[prevWordIds.size() - 1], return mTrieMap.getNextLevelBitmapEntryIndex(prevWordIds[prevWordIds.size() - 1],
lastBitmapEntryIndex); lastBitmapEntryIndex);
} }

View File

@ -36,7 +36,8 @@ class ProbabilityEntry {
// Dummy entry // Dummy entry
ProbabilityEntry() ProbabilityEntry()
: mFlags(0), mProbability(NOT_A_PROBABILITY), mHistoricalInfo() {} : mFlags(Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY), mProbability(NOT_A_PROBABILITY),
mHistoricalInfo() {}
// Entry without historical information // Entry without historical information
ProbabilityEntry(const int flags, const int probability) ProbabilityEntry(const int flags, const int probability)
@ -61,7 +62,7 @@ class ProbabilityEntry {
bigramProperty->getCount()) {} bigramProperty->getCount()) {}
bool isValid() const { bool isValid() const {
return (mProbability != NOT_A_PROBABILITY) || hasHistoricalInfo(); return (mFlags & Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY) == 0;
} }
bool hasHistoricalInfo() const { bool hasHistoricalInfo() const {

View File

@ -53,6 +53,7 @@ const int Ver4DictConstants::WORD_LEVEL_FIELD_SIZE = 1;
const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 1; const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 1;
const uint8_t Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE = 0x1; const uint8_t Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE = 0x1;
const uint8_t Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY = 0x2;
const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64; const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64;
const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE = 4; const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE = 4;

View File

@ -51,6 +51,7 @@ class Ver4DictConstants {
static const int WORD_COUNT_FIELD_SIZE; static const int WORD_COUNT_FIELD_SIZE;
// Flags in probability entry. // Flags in probability entry.
static const uint8_t FLAG_REPRESENTS_BEGINNING_OF_SENTENCE; static const uint8_t FLAG_REPRESENTS_BEGINNING_OF_SENTENCE;
static const uint8_t FLAG_NOT_A_VALID_ENTRY;
static const int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE; static const int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE;
static const int SHORTCUT_ADDRESS_TABLE_DATA_SIZE; static const int SHORTCUT_ADDRESS_TABLE_DATA_SIZE;

View File

@ -120,16 +120,15 @@ const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext(
const int ptNodePos = const int ptNodePos =
mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
// TODO: Support n-gram.
const int probability = mBuffers->getLanguageModelDictContent()->getWordProbability( const int probability = mBuffers->getLanguageModelDictContent()->getWordProbability(
prevWordIds.limit(1 /* maxSize */), wordId, mHeaderPolicy); prevWordIds, wordId, mHeaderPolicy);
return WordAttributes(probability, ptNodeParams.isBlacklisted(), ptNodeParams.isNotAWord(), return WordAttributes(probability, ptNodeParams.isBlacklisted(), ptNodeParams.isNotAWord(),
probability == 0); probability == 0);
} }
int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds, int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
const int wordId) const { const int wordId) const {
if (wordId == NOT_A_WORD_ID) { if (wordId == NOT_A_WORD_ID || prevWordIds.contains(NOT_A_WORD_ID)) {
return NOT_A_PROBABILITY; return NOT_A_PROBABILITY;
} }
const int ptNodePos = const int ptNodePos =
@ -138,10 +137,8 @@ int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordI
if (ptNodeParams.isDeleted() || ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord()) { if (ptNodeParams.isDeleted() || ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord()) {
return NOT_A_PROBABILITY; return NOT_A_PROBABILITY;
} }
// TODO: Support n-gram.
const ProbabilityEntry probabilityEntry = const ProbabilityEntry probabilityEntry =
mBuffers->getLanguageModelDictContent()->getNgramProbabilityEntry( mBuffers->getLanguageModelDictContent()->getNgramProbabilityEntry(prevWordIds, wordId);
prevWordIds.limit(1 /* maxSize */), wordId);
if (!probabilityEntry.isValid()) { if (!probabilityEntry.isValid()) {
return NOT_A_PROBABILITY; return NOT_A_PROBABILITY;
} }
@ -164,16 +161,18 @@ void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordI
if (prevWordIds.empty()) { if (prevWordIds.empty()) {
return; return;
} }
// TODO: Support n-gram.
const auto languageModelDictContent = mBuffers->getLanguageModelDictContent(); const auto languageModelDictContent = mBuffers->getLanguageModelDictContent();
for (const auto entry : languageModelDictContent->getProbabilityEntries( for (size_t i = 1; i <= prevWordIds.size(); ++i) {
prevWordIds.limit(1 /* maxSize */))) { for (const auto entry : languageModelDictContent->getProbabilityEntries(
const ProbabilityEntry &probabilityEntry = entry.getProbabilityEntry(); prevWordIds.limit(i))) {
const int probability = probabilityEntry.hasHistoricalInfo() ? const ProbabilityEntry &probabilityEntry = entry.getProbabilityEntry();
ForgettingCurveUtils::decodeProbability( const int probability = probabilityEntry.hasHistoricalInfo() ?
probabilityEntry.getHistoricalInfo(), mHeaderPolicy) : ForgettingCurveUtils::decodeProbability(
probabilityEntry.getProbability(); probabilityEntry.getHistoricalInfo(), mHeaderPolicy)
listener->onVisitEntry(probability, entry.getWordId()); + ForgettingCurveUtils::getProbabilityBiasForNgram(i + 1 /* n */) :
probabilityEntry.getProbability();
listener->onVisitEntry(probability, entry.getWordId());
}
} }
} }

View File

@ -29,7 +29,7 @@ namespace {
TEST(LanguageModelDictContentTest, TestUnigramProbability) { TEST(LanguageModelDictContentTest, TestUnigramProbability) {
LanguageModelDictContent languageModelDictContent(false /* useHistoricalInfo */); LanguageModelDictContent languageModelDictContent(false /* useHistoricalInfo */);
const int flag = 0xFF; const int flag = 0xF0;
const int probability = 10; const int probability = 10;
const int wordId = 100; const int wordId = 100;
const ProbabilityEntry probabilityEntry(flag, probability); const ProbabilityEntry probabilityEntry(flag, probability);