am f4928ad4: Merge "Update useless n-gram entry detection logic during GC."
* commit 'f4928ad4dd3108d427ef421c3ef56b259c7c2063': Update useless n-gram entry detection logic during GC.
This commit is contained in:
commit
083da29f2a
3 changed files with 92 additions and 22 deletions
|
@ -270,16 +270,26 @@ int LanguageModelDictContent::getBitmapEntryIndex(const WordIdArrayView prevWord
|
||||||
}
|
}
|
||||||
|
|
||||||
bool LanguageModelDictContent::updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex,
|
bool LanguageModelDictContent::updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex,
|
||||||
const int level, const HeaderPolicy *const headerPolicy, int *const outEntryCounts) {
|
const int prevWordCount, const HeaderPolicy *const headerPolicy,
|
||||||
|
int *const outEntryCounts) {
|
||||||
for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) {
|
for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) {
|
||||||
if (level > MAX_PREV_WORD_COUNT_FOR_N_GRAM) {
|
if (prevWordCount > MAX_PREV_WORD_COUNT_FOR_N_GRAM) {
|
||||||
AKLOGE("Invalid level. level: %d, MAX_PREV_WORD_COUNT_FOR_N_GRAM: %d.",
|
AKLOGE("Invalid prevWordCount. prevWordCount: %d, MAX_PREV_WORD_COUNT_FOR_N_GRAM: %d.",
|
||||||
level, MAX_PREV_WORD_COUNT_FOR_N_GRAM);
|
prevWordCount, MAX_PREV_WORD_COUNT_FOR_N_GRAM);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
const ProbabilityEntry probabilityEntry =
|
const ProbabilityEntry probabilityEntry =
|
||||||
ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo);
|
ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo);
|
||||||
if (mHasHistoricalInfo && !probabilityEntry.representsBeginningOfSentence()) {
|
if (prevWordCount > 0 && probabilityEntry.isValid()
|
||||||
|
&& !mTrieMap.getRoot(entry.key()).mIsValid) {
|
||||||
|
// The entry is related to a word that has been removed. Remove the entry.
|
||||||
|
if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (mHasHistoricalInfo && !probabilityEntry.representsBeginningOfSentence()
|
||||||
|
&& probabilityEntry.isValid()) {
|
||||||
const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave(
|
const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave(
|
||||||
probabilityEntry.getHistoricalInfo(), headerPolicy);
|
probabilityEntry.getHistoricalInfo(), headerPolicy);
|
||||||
if (ForgettingCurveUtils::needsToKeep(&historicalInfo, headerPolicy)) {
|
if (ForgettingCurveUtils::needsToKeep(&historicalInfo, headerPolicy)) {
|
||||||
|
@ -298,13 +308,13 @@ bool LanguageModelDictContent::updateAllProbabilityEntriesForGCInner(const int b
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!probabilityEntry.representsBeginningOfSentence()) {
|
if (!probabilityEntry.representsBeginningOfSentence()) {
|
||||||
outEntryCounts[level] += 1;
|
outEntryCounts[prevWordCount] += 1;
|
||||||
}
|
}
|
||||||
if (!entry.hasNextLevelMap()) {
|
if (!entry.hasNextLevelMap()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (!updateAllProbabilityEntriesForGCInner(entry.getNextLevelBitmapEntryIndex(), level + 1,
|
if (!updateAllProbabilityEntriesForGCInner(entry.getNextLevelBitmapEntryIndex(),
|
||||||
headerPolicy, outEntryCounts)) {
|
prevWordCount + 1, headerPolicy, outEntryCounts)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -332,7 +342,7 @@ bool LanguageModelDictContent::turncateEntriesInSpecifiedLevel(
|
||||||
for (int i = 0; i < entryCountToRemove; ++i) {
|
for (int i = 0; i < entryCountToRemove; ++i) {
|
||||||
const EntryInfoToTurncate &entryInfo = entryInfoVector[i];
|
const EntryInfoToTurncate &entryInfo = entryInfoVector[i];
|
||||||
if (!removeNgramProbabilityEntry(
|
if (!removeNgramProbabilityEntry(
|
||||||
WordIdArrayView(entryInfo.mPrevWordIds, entryInfo.mEntryLevel), entryInfo.mKey)) {
|
WordIdArrayView(entryInfo.mPrevWordIds, entryInfo.mPrevWordCount), entryInfo.mKey)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -342,9 +352,9 @@ bool LanguageModelDictContent::turncateEntriesInSpecifiedLevel(
|
||||||
bool LanguageModelDictContent::getEntryInfo(const HeaderPolicy *const headerPolicy,
|
bool LanguageModelDictContent::getEntryInfo(const HeaderPolicy *const headerPolicy,
|
||||||
const int targetLevel, const int bitmapEntryIndex, std::vector<int> *const prevWordIds,
|
const int targetLevel, const int bitmapEntryIndex, std::vector<int> *const prevWordIds,
|
||||||
std::vector<EntryInfoToTurncate> *const outEntryInfo) const {
|
std::vector<EntryInfoToTurncate> *const outEntryInfo) const {
|
||||||
const int currentLevel = prevWordIds->size();
|
const int prevWordCount = prevWordIds->size();
|
||||||
for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) {
|
for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) {
|
||||||
if (currentLevel < targetLevel) {
|
if (prevWordCount < targetLevel) {
|
||||||
if (!entry.hasNextLevelMap()) {
|
if (!entry.hasNextLevelMap()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -379,10 +389,10 @@ bool LanguageModelDictContent::EntryInfoToTurncate::Comparator::operator()(
|
||||||
if (left.mKey != right.mKey) {
|
if (left.mKey != right.mKey) {
|
||||||
return left.mKey < right.mKey;
|
return left.mKey < right.mKey;
|
||||||
}
|
}
|
||||||
if (left.mEntryLevel != right.mEntryLevel) {
|
if (left.mPrevWordCount != right.mPrevWordCount) {
|
||||||
return left.mEntryLevel > right.mEntryLevel;
|
return left.mPrevWordCount > right.mPrevWordCount;
|
||||||
}
|
}
|
||||||
for (int i = 0; i < left.mEntryLevel; ++i) {
|
for (int i = 0; i < left.mPrevWordCount; ++i) {
|
||||||
if (left.mPrevWordIds[i] != right.mPrevWordIds[i]) {
|
if (left.mPrevWordIds[i] != right.mPrevWordIds[i]) {
|
||||||
return left.mPrevWordIds[i] < right.mPrevWordIds[i];
|
return left.mPrevWordIds[i] < right.mPrevWordIds[i];
|
||||||
}
|
}
|
||||||
|
@ -392,9 +402,10 @@ bool LanguageModelDictContent::EntryInfoToTurncate::Comparator::operator()(
|
||||||
}
|
}
|
||||||
|
|
||||||
LanguageModelDictContent::EntryInfoToTurncate::EntryInfoToTurncate(const int probability,
|
LanguageModelDictContent::EntryInfoToTurncate::EntryInfoToTurncate(const int probability,
|
||||||
const int timestamp, const int key, const int entryLevel, const int *const prevWordIds)
|
const int timestamp, const int key, const int prevWordCount, const int *const prevWordIds)
|
||||||
: mProbability(probability), mTimestamp(timestamp), mKey(key), mEntryLevel(entryLevel) {
|
: mProbability(probability), mTimestamp(timestamp), mKey(key),
|
||||||
memmove(mPrevWordIds, prevWordIds, mEntryLevel * sizeof(mPrevWordIds[0]));
|
mPrevWordCount(prevWordCount) {
|
||||||
|
memmove(mPrevWordIds, prevWordIds, mPrevWordCount * sizeof(mPrevWordIds[0]));
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
|
|
|
@ -160,7 +160,7 @@ class LanguageModelDictContent {
|
||||||
outEntryCounts[i] = 0;
|
outEntryCounts[i] = 0;
|
||||||
}
|
}
|
||||||
return updateAllProbabilityEntriesForGCInner(mTrieMap.getRootBitmapEntryIndex(),
|
return updateAllProbabilityEntriesForGCInner(mTrieMap.getRootBitmapEntryIndex(),
|
||||||
0 /* level */, headerPolicy, outEntryCounts);
|
0 /* prevWordCount */, headerPolicy, outEntryCounts);
|
||||||
}
|
}
|
||||||
|
|
||||||
// entryCounts should be created by updateAllProbabilityEntries.
|
// entryCounts should be created by updateAllProbabilityEntries.
|
||||||
|
@ -185,12 +185,12 @@ class LanguageModelDictContent {
|
||||||
};
|
};
|
||||||
|
|
||||||
EntryInfoToTurncate(const int probability, const int timestamp, const int key,
|
EntryInfoToTurncate(const int probability, const int timestamp, const int key,
|
||||||
const int entryLevel, const int *const prevWordIds);
|
const int prevWordCount, const int *const prevWordIds);
|
||||||
|
|
||||||
int mProbability;
|
int mProbability;
|
||||||
int mTimestamp;
|
int mTimestamp;
|
||||||
int mKey;
|
int mKey;
|
||||||
int mEntryLevel;
|
int mPrevWordCount;
|
||||||
int mPrevWordIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1];
|
int mPrevWordIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1];
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -208,7 +208,7 @@ class LanguageModelDictContent {
|
||||||
int *const outNgramCount);
|
int *const outNgramCount);
|
||||||
int createAndGetBitmapEntryIndex(const WordIdArrayView prevWordIds);
|
int createAndGetBitmapEntryIndex(const WordIdArrayView prevWordIds);
|
||||||
int getBitmapEntryIndex(const WordIdArrayView prevWordIds) const;
|
int getBitmapEntryIndex(const WordIdArrayView prevWordIds) const;
|
||||||
bool updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex, const int level,
|
bool updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex, const int prevWordCount,
|
||||||
const HeaderPolicy *const headerPolicy, int *const outEntryCounts);
|
const HeaderPolicy *const headerPolicy, int *const outEntryCounts);
|
||||||
bool turncateEntriesInSpecifiedLevel(const HeaderPolicy *const headerPolicy,
|
bool turncateEntriesInSpecifiedLevel(const HeaderPolicy *const headerPolicy,
|
||||||
const int maxEntryCount, const int targetLevel, int *const outEntryCount);
|
const int maxEntryCount, const int targetLevel, int *const outEntryCount);
|
||||||
|
|
|
@ -75,6 +75,10 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
|
||||||
return formatVersion > FormatSpec.VERSION401;
|
return formatVersion > FormatSpec.VERSION401;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static boolean supportsNgram(final int formatVersion) {
|
||||||
|
return formatVersion >= FormatSpec.VERSION4_DEV;
|
||||||
|
}
|
||||||
|
|
||||||
private void onInputWord(final BinaryDictionary binaryDictionary, final String word,
|
private void onInputWord(final BinaryDictionary binaryDictionary, final String word,
|
||||||
final boolean isValidWord) {
|
final boolean isValidWord) {
|
||||||
binaryDictionary.updateEntriesForWordWithNgramContext(NgramContext.EMPTY_PREV_WORDS_INFO,
|
binaryDictionary.updateEntriesForWordWithNgramContext(NgramContext.EMPTY_PREV_WORDS_INFO,
|
||||||
|
@ -88,6 +92,14 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
|
||||||
mCurrentTime /* timestamp */);
|
mCurrentTime /* timestamp */);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void onInputWordWithPrevWords(final BinaryDictionary binaryDictionary,
|
||||||
|
final String word, final boolean isValidWord, final String prevWord,
|
||||||
|
final String prevPrevWord) {
|
||||||
|
binaryDictionary.updateEntriesForWordWithNgramContext(
|
||||||
|
new NgramContext(new WordInfo(prevWord), new WordInfo(prevPrevWord)), word,
|
||||||
|
isValidWord, 1 /* count */, mCurrentTime /* timestamp */);
|
||||||
|
}
|
||||||
|
|
||||||
private void onInputWordWithBeginningOfSentenceContext(
|
private void onInputWordWithBeginningOfSentenceContext(
|
||||||
final BinaryDictionary binaryDictionary, final String word, final boolean isValidWord) {
|
final BinaryDictionary binaryDictionary, final String word, final boolean isValidWord) {
|
||||||
binaryDictionary.updateEntriesForWordWithNgramContext(NgramContext.BEGINNING_OF_SENTENCE,
|
binaryDictionary.updateEntriesForWordWithNgramContext(NgramContext.BEGINNING_OF_SENTENCE,
|
||||||
|
@ -99,6 +111,12 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
|
||||||
return binaryDictionary.isValidNgram(new NgramContext(new WordInfo(word0)), word1);
|
return binaryDictionary.isValidNgram(new NgramContext(new WordInfo(word0)), word1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static boolean isValidTrigram(final BinaryDictionary binaryDictionary,
|
||||||
|
final String word0, final String word1, final String word2) {
|
||||||
|
return binaryDictionary.isValidNgram(
|
||||||
|
new NgramContext(new WordInfo(word1), new WordInfo(word0)), word2);
|
||||||
|
}
|
||||||
|
|
||||||
private void forcePassingShortTime(final BinaryDictionary binaryDictionary) {
|
private void forcePassingShortTime(final BinaryDictionary binaryDictionary) {
|
||||||
// 30 days.
|
// 30 days.
|
||||||
final int timeToElapse = (int)TimeUnit.SECONDS.convert(30, TimeUnit.DAYS);
|
final int timeToElapse = (int)TimeUnit.SECONDS.convert(30, TimeUnit.DAYS);
|
||||||
|
@ -256,7 +274,23 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
|
||||||
onInputWordWithPrevWord(binaryDictionary, "y", true /* isValidWord */, "x");
|
onInputWordWithPrevWord(binaryDictionary, "y", true /* isValidWord */, "x");
|
||||||
assertFalse(isValidBigram(binaryDictionary, "x", "y"));
|
assertFalse(isValidBigram(binaryDictionary, "x", "y"));
|
||||||
|
|
||||||
binaryDictionary.close();
|
if (!supportsNgram(formatVersion)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
onInputWordWithPrevWords(binaryDictionary, "c", false /* isValidWord */, "b", "a");
|
||||||
|
assertFalse(isValidTrigram(binaryDictionary, "a", "b", "c"));
|
||||||
|
assertFalse(isValidBigram(binaryDictionary, "b", "c"));
|
||||||
|
onInputWordWithPrevWords(binaryDictionary, "c", false /* isValidWord */, "b", "a");
|
||||||
|
assertTrue(isValidTrigram(binaryDictionary, "a", "b", "c"));
|
||||||
|
assertTrue(isValidBigram(binaryDictionary, "b", "c"));
|
||||||
|
|
||||||
|
onInputWordWithPrevWords(binaryDictionary, "d", true /* isValidWord */, "b", "a");
|
||||||
|
assertTrue(isValidTrigram(binaryDictionary, "a", "b", "d"));
|
||||||
|
assertTrue(isValidBigram(binaryDictionary, "b", "d"));
|
||||||
|
|
||||||
|
onInputWordWithPrevWords(binaryDictionary, "cd", true /* isValidWord */, "b", "a");
|
||||||
|
assertTrue(isValidTrigram(binaryDictionary, "a", "b", "cd"));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testDecayingProbability() {
|
public void testDecayingProbability() {
|
||||||
|
@ -301,6 +335,31 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
|
||||||
forcePassingLongTime(binaryDictionary);
|
forcePassingLongTime(binaryDictionary);
|
||||||
assertFalse(isValidBigram(binaryDictionary, "a", "b"));
|
assertFalse(isValidBigram(binaryDictionary, "a", "b"));
|
||||||
|
|
||||||
|
if (!supportsNgram(formatVersion)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
onInputWord(binaryDictionary, "ab", true /* isValidWord */);
|
||||||
|
onInputWordWithPrevWord(binaryDictionary, "bc", true /* isValidWord */, "ab");
|
||||||
|
onInputWordWithPrevWords(binaryDictionary, "cd", true /* isValidWord */, "bc", "ab");
|
||||||
|
assertTrue(isValidTrigram(binaryDictionary, "ab", "bc", "cd"));
|
||||||
|
forcePassingShortTime(binaryDictionary);
|
||||||
|
assertFalse(isValidTrigram(binaryDictionary, "ab", "bc", "cd"));
|
||||||
|
|
||||||
|
onInputWord(binaryDictionary, "ab", true /* isValidWord */);
|
||||||
|
onInputWordWithPrevWord(binaryDictionary, "bc", true /* isValidWord */, "ab");
|
||||||
|
onInputWordWithPrevWords(binaryDictionary, "cd", true /* isValidWord */, "bc", "ab");
|
||||||
|
onInputWord(binaryDictionary, "ab", true /* isValidWord */);
|
||||||
|
onInputWordWithPrevWord(binaryDictionary, "bc", true /* isValidWord */, "ab");
|
||||||
|
onInputWordWithPrevWords(binaryDictionary, "cd", true /* isValidWord */, "bc", "ab");
|
||||||
|
onInputWord(binaryDictionary, "ab", true /* isValidWord */);
|
||||||
|
onInputWordWithPrevWord(binaryDictionary, "bc", true /* isValidWord */, "ab");
|
||||||
|
onInputWordWithPrevWords(binaryDictionary, "cd", true /* isValidWord */, "bc", "ab");
|
||||||
|
forcePassingShortTime(binaryDictionary);
|
||||||
|
assertTrue(isValidTrigram(binaryDictionary, "ab", "bc", "cd"));
|
||||||
|
forcePassingLongTime(binaryDictionary);
|
||||||
|
assertFalse(isValidTrigram(binaryDictionary, "ab", "bc", "cd"));
|
||||||
|
|
||||||
binaryDictionary.close();
|
binaryDictionary.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue