Support n-gram for look-up.

Bug: 14425059
Change-Id: I19523c29fb802cd65158c7540d1608e7f55c4ca7
main
Keisuke Kuroyanagi 2014-09-16 18:10:56 +09:00
parent 3676c11472
commit 4926b90ec5
6 changed files with 28 additions and 18 deletions

View File

@ -167,6 +167,14 @@ int LanguageModelDictContent::createAndGetBitmapEntryIndex(const WordIdArrayView
if (lastBitmapEntryIndex == TrieMap::INVALID_INDEX) {
return TrieMap::INVALID_INDEX;
}
const int oldestPrevWordId = prevWordIds[prevWordIds.size() - 1];
const TrieMap::Result result = mTrieMap.get(oldestPrevWordId, lastBitmapEntryIndex);
if (!result.mIsValid) {
if (!mTrieMap.put(oldestPrevWordId,
ProbabilityEntry().encode(mHasHistoricalInfo), lastBitmapEntryIndex)) {
return TrieMap::INVALID_INDEX;
}
}
return mTrieMap.getNextLevelBitmapEntryIndex(prevWordIds[prevWordIds.size() - 1],
lastBitmapEntryIndex);
}

View File

@ -36,7 +36,8 @@ class ProbabilityEntry {
// Dummy entry: carries NOT_A_PROBABILITY and default-constructed historical info.
// NOTE(review): two initializer lists appear below; this looks like the pre-change line
// (mFlags(0)) followed by its replacement, which sets FLAG_NOT_A_VALID_ENTRY in mFlags
// -- confirm against the hunk's +/- markers.
ProbabilityEntry()
: mFlags(0), mProbability(NOT_A_PROBABILITY), mHistoricalInfo() {}
: mFlags(Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY), mProbability(NOT_A_PROBABILITY),
mHistoricalInfo() {}
// Entry without historical information
ProbabilityEntry(const int flags, const int probability)
@ -61,7 +62,7 @@ class ProbabilityEntry {
bigramProperty->getCount()) {}
// Reports whether this entry is considered valid.
// NOTE(review): two return statements appear below; the hunk header (-61,7 +62,7)
// suggests the first is the removed line and the second its replacement -- confirm.
bool isValid() const {
// Probability-based check: a real probability or historical info is present.
return (mProbability != NOT_A_PROBABILITY) || hasHistoricalInfo();
// Flag-based check: valid when FLAG_NOT_A_VALID_ENTRY is not set in mFlags.
return (mFlags & Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY) == 0;
}
bool hasHistoricalInfo() const {

View File

@ -53,6 +53,7 @@ const int Ver4DictConstants::WORD_LEVEL_FIELD_SIZE = 1;
const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 1;
// Flags in probability entry. Each value is a distinct bit of a uint8_t mask.
const uint8_t Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE = 0x1;
// Marks a probability entry as not valid.
const uint8_t Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY = 0x2;
const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64;
const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE = 4;

View File

@ -51,6 +51,7 @@ class Ver4DictConstants {
static const int WORD_COUNT_FIELD_SIZE;
// Flags in probability entry.
static const uint8_t FLAG_REPRESENTS_BEGINNING_OF_SENTENCE;
// Marks a probability entry as not valid.
static const uint8_t FLAG_NOT_A_VALID_ENTRY;
static const int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE;
static const int SHORTCUT_ADDRESS_TABLE_DATA_SIZE;

View File

@ -120,16 +120,15 @@ const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext(
const int ptNodePos =
mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
// TODO: Support n-gram.
const int probability = mBuffers->getLanguageModelDictContent()->getWordProbability(
prevWordIds.limit(1 /* maxSize */), wordId, mHeaderPolicy);
prevWordIds, wordId, mHeaderPolicy);
return WordAttributes(probability, ptNodeParams.isBlacklisted(), ptNodeParams.isNotAWord(),
probability == 0);
}
int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
const int wordId) const {
if (wordId == NOT_A_WORD_ID) {
if (wordId == NOT_A_WORD_ID || prevWordIds.contains(NOT_A_WORD_ID)) {
return NOT_A_PROBABILITY;
}
const int ptNodePos =
@ -138,10 +137,8 @@ int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordI
if (ptNodeParams.isDeleted() || ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord()) {
return NOT_A_PROBABILITY;
}
// TODO: Support n-gram.
const ProbabilityEntry probabilityEntry =
mBuffers->getLanguageModelDictContent()->getNgramProbabilityEntry(
prevWordIds.limit(1 /* maxSize */), wordId);
mBuffers->getLanguageModelDictContent()->getNgramProbabilityEntry(prevWordIds, wordId);
if (!probabilityEntry.isValid()) {
return NOT_A_PROBABILITY;
}
@ -164,16 +161,18 @@ void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordI
if (prevWordIds.empty()) {
return;
}
// TODO: Support n-gram.
const auto languageModelDictContent = mBuffers->getLanguageModelDictContent();
for (const auto entry : languageModelDictContent->getProbabilityEntries(
prevWordIds.limit(1 /* maxSize */))) {
const ProbabilityEntry &probabilityEntry = entry.getProbabilityEntry();
const int probability = probabilityEntry.hasHistoricalInfo() ?
ForgettingCurveUtils::decodeProbability(
probabilityEntry.getHistoricalInfo(), mHeaderPolicy) :
probabilityEntry.getProbability();
listener->onVisitEntry(probability, entry.getWordId());
for (size_t i = 1; i <= prevWordIds.size(); ++i) {
for (const auto entry : languageModelDictContent->getProbabilityEntries(
prevWordIds.limit(i))) {
const ProbabilityEntry &probabilityEntry = entry.getProbabilityEntry();
const int probability = probabilityEntry.hasHistoricalInfo() ?
ForgettingCurveUtils::decodeProbability(
probabilityEntry.getHistoricalInfo(), mHeaderPolicy)
+ ForgettingCurveUtils::getProbabilityBiasForNgram(i + 1 /* n */) :
probabilityEntry.getProbability();
listener->onVisitEntry(probability, entry.getWordId());
}
}
}

View File

@ -29,7 +29,7 @@ namespace {
TEST(LanguageModelDictContentTest, TestUnigramProbability) {
LanguageModelDictContent languageModelDictContent(false /* useHistoricalInfo */);
const int flag = 0xFF;
const int flag = 0xF0;
const int probability = 10;
const int wordId = 100;
const ProbabilityEntry probabilityEntry(flag, probability);