Add header attributes for evaluation.

Bug: 13197276
Change-Id: Ib5247da691ff24a73e13485288237ccc51bb54f0
main
Keisuke Kuroyanagi 2014-02-28 21:06:03 +09:00
parent 472e22a326
commit 5128935ac4
10 changed files with 119 additions and 48 deletions

View File

@ -64,6 +64,9 @@ abstract public class ExpandableBinaryDictionary extends Dictionary {
private static final int TIMEOUT_FOR_READ_OPS_IN_MILLISECONDS = 100; private static final int TIMEOUT_FOR_READ_OPS_IN_MILLISECONDS = 100;
private static final int TIMEOUT_FOR_READ_OPS_FOR_TESTS_IN_MILLISECONDS = 10000; private static final int TIMEOUT_FOR_READ_OPS_FOR_TESTS_IN_MILLISECONDS = 10000;
private static final int DEFAULT_MAX_UNIGRAM_COUNT = 10000;
private static final int DEFAULT_MAX_BIGRAM_COUNT = 10000;
/** /**
* The maximum length of a word in this dictionary. * The maximum length of a word in this dictionary.
*/ */
@ -207,6 +210,10 @@ abstract public class ExpandableBinaryDictionary extends Dictionary {
attributeMap.put(DictionaryHeader.DICTIONARY_LOCALE_KEY, mLocale.toString()); attributeMap.put(DictionaryHeader.DICTIONARY_LOCALE_KEY, mLocale.toString());
attributeMap.put(DictionaryHeader.DICTIONARY_VERSION_KEY, attributeMap.put(DictionaryHeader.DICTIONARY_VERSION_KEY,
String.valueOf(TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis()))); String.valueOf(TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis())));
attributeMap.put(DictionaryHeader.MAX_UNIGRAM_COUNT_KEY,
String.valueOf(DEFAULT_MAX_UNIGRAM_COUNT));
attributeMap.put(DictionaryHeader.MAX_BIGRAM_COUNT_KEY,
String.valueOf(DEFAULT_MAX_BIGRAM_COUNT));
return attributeMap; return attributeMap;
} }

View File

@ -42,6 +42,10 @@ public final class DictionaryHeader {
"FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP"; "FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP";
public static final String FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY = public static final String FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY =
"FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID"; "FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID";
public static final String FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY =
"FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS";
public static final String MAX_UNIGRAM_COUNT_KEY = "MAX_UNIGRAM_COUNT";
public static final String MAX_BIGRAM_COUNT_KEY = "MAX_BIGRAM_COUNT";
public static final String ATTRIBUTE_VALUE_TRUE = "1"; public static final String ATTRIBUTE_VALUE_TRUE = "1";
public DictionaryHeader(final int headerSize, final DictionaryOptions dictionaryOptions, public DictionaryHeader(final int headerSize, final DictionaryOptions dictionaryOptions,

View File

@ -162,7 +162,7 @@ bool Ver4BigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(const i
} else if (bigramEntry.hasHistoricalInfo()) { } else if (bigramEntry.hasHistoricalInfo()) {
const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave( const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave(
bigramEntry.getHistoricalInfo(), mHeaderPolicy); bigramEntry.getHistoricalInfo(), mHeaderPolicy);
if (ForgettingCurveUtils::needsToKeep(&historicalInfo)) { if (ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy)) {
const BigramEntry updatedBigramEntry = const BigramEntry updatedBigramEntry =
bigramEntry.updateHistoricalInfoAndGetEntry(&historicalInfo); bigramEntry.updateHistoricalInfoAndGetEntry(&historicalInfo);
if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) { if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) {

View File

@ -37,11 +37,22 @@ const char *const HeaderPolicy::FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY =
"FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP"; "FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP";
const char *const HeaderPolicy::FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY = const char *const HeaderPolicy::FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY =
"FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID"; "FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID";
const char *const HeaderPolicy::FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY =
"FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS";
const char *const HeaderPolicy::MAX_UNIGRAM_COUNT_KEY = "MAX_UNIGRAM_COUNT";
const char *const HeaderPolicy::MAX_BIGRAM_COUNT_KEY = "MAX_BIGRAM_COUNT";
const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100; const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100;
const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f; const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f;
const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP = 4; const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP = 4;
const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID = 0; const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID = 0;
// 4 days
const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS =
4 * 24 * 60 * 60;
const int HeaderPolicy::DEFAULT_MAX_UNIGRAM_COUNT = 10000;
const int HeaderPolicy::DEFAULT_MAX_BIGRAM_COUNT = 10000;
// Used for logging. Question mark is used to indicate that the key is not found. // Used for logging. Question mark is used to indicate that the key is not found.
void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *outValue, void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *outValue,

View File

@ -58,7 +58,14 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)), DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)),
mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue( mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
&mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY, &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)) {} DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)),
mForgettingCurveDurationToLevelDown(HeaderReadWriteUtils::readIntAttributeValue(
&mAttributeMap, FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY,
DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS)),
mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(
&mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
&mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {}
// Constructs header information using an attribute map. // Constructs header information using an attribute map.
HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion, HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,
@ -83,7 +90,14 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)), DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)),
mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue( mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
&mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY, &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)) {} DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)),
mForgettingCurveDurationToLevelDown(HeaderReadWriteUtils::readIntAttributeValue(
&mAttributeMap, FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY,
DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS)),
mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(
&mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
&mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {}
// Temporary dummy header. // Temporary dummy header.
HeaderPolicy() HeaderPolicy()
@ -92,7 +106,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false), mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false),
mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0), mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0),
mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false), mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false),
mForgettingCurveOccurrencesToLevelUp(0), mForgettingCurveProbabilityValuesTableId(0) {} mForgettingCurveOccurrencesToLevelUp(0), mForgettingCurveProbabilityValuesTableId(0),
mForgettingCurveDurationToLevelDown(0), mMaxUnigramCount(0), mMaxBigramCount(0) {}
~HeaderPolicy() {} ~HeaderPolicy() {}
@ -179,6 +194,18 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
return mForgettingCurveProbabilityValuesTableId; return mForgettingCurveProbabilityValuesTableId;
} }
// Returns the forgetting-curve level-down duration stored in this header, in seconds.
// The attribute-map constructors read it via HeaderReadWriteUtils::readIntAttributeValue using
// FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY, passing
// DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS as the default argument; the
// temporary dummy header initializes it to 0.
AK_FORCE_INLINE int getForgettingCurveDurationToLevelDown() const {
return mForgettingCurveDurationToLevelDown;
}
// Returns the maximum unigram count stored in this header.
// The attribute-map constructors read it via HeaderReadWriteUtils::readIntAttributeValue using
// MAX_UNIGRAM_COUNT_KEY, passing DEFAULT_MAX_UNIGRAM_COUNT as the default argument; the
// temporary dummy header initializes it to 0.
AK_FORCE_INLINE int getMaxUnigramCount() const {
return mMaxUnigramCount;
}
// Returns the maximum bigram count stored in this header.
// The attribute-map constructors read it via HeaderReadWriteUtils::readIntAttributeValue using
// MAX_BIGRAM_COUNT_KEY, passing DEFAULT_MAX_BIGRAM_COUNT as the default argument; the
// temporary dummy header initializes it to 0.
AK_FORCE_INLINE int getMaxBigramCount() const {
return mMaxBigramCount;
}
void readHeaderValueOrQuestionMark(const char *const key, void readHeaderValueOrQuestionMark(const char *const key,
int *outValue, int outValueSize) const; int *outValue, int outValueSize) const;
@ -205,10 +232,16 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
static const char *const LOCALE_KEY; static const char *const LOCALE_KEY;
static const char *const FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY; static const char *const FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY;
static const char *const FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY; static const char *const FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY;
static const char *const FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY;
static const char *const MAX_UNIGRAM_COUNT_KEY;
static const char *const MAX_BIGRAM_COUNT_KEY;
static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE; static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE;
static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE; static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE;
static const int DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP; static const int DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP;
static const int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID; static const int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID;
static const int DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS;
static const int DEFAULT_MAX_UNIGRAM_COUNT;
static const int DEFAULT_MAX_BIGRAM_COUNT;
const FormatUtils::FORMAT_VERSION mDictFormatVersion; const FormatUtils::FORMAT_VERSION mDictFormatVersion;
const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags; const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags;
@ -226,6 +259,9 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
const bool mHasHistoricalInfoOfWords; const bool mHasHistoricalInfoOfWords;
const int mForgettingCurveOccurrencesToLevelUp; const int mForgettingCurveOccurrencesToLevelUp;
const int mForgettingCurveProbabilityValuesTableId; const int mForgettingCurveProbabilityValuesTableId;
const int mForgettingCurveDurationToLevelDown;
const int mMaxUnigramCount;
const int mMaxBigramCount;
const std::vector<int> readLocale() const; const std::vector<int> readLocale() const;
float readMultipleWordCostMultiplier() const; float readMultipleWordCostMultiplier() const;

View File

@ -168,7 +168,7 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeA
toBeUpdatedPtNodeParams->getTerminalId()); toBeUpdatedPtNodeParams->getTerminalId());
return false; return false;
} }
const bool isValid = ForgettingCurveUtils::needsToKeep(&historicalInfo); const bool isValid = ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy);
if (!isValid) { if (!isValid) {
if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) { if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) {
AKLOGE("Cannot mark PtNode as willBecomeNonTerminal."); AKLOGE("Cannot mark PtNode as willBecomeNonTerminal.");

View File

@ -329,11 +329,15 @@ void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int quer
snprintf(outResult, maxResultLength, "%d", mBigramCount); snprintf(outResult, maxResultLength, "%d", mBigramCount);
} else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) { } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) {
snprintf(outResult, maxResultLength, "%d", snprintf(outResult, maxResultLength, "%d",
mHeaderPolicy->isDecayingDict() ? ForgettingCurveUtils::MAX_UNIGRAM_COUNT : mHeaderPolicy->isDecayingDict() ?
ForgettingCurveUtils::getUnigramCountHardLimit(
mHeaderPolicy->getMaxUnigramCount()) :
static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
} else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) { } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) {
snprintf(outResult, maxResultLength, "%d", snprintf(outResult, maxResultLength, "%d",
mHeaderPolicy->isDecayingDict() ? ForgettingCurveUtils::MAX_BIGRAM_COUNT : mHeaderPolicy->isDecayingDict() ?
ForgettingCurveUtils::getBigramCountHardLimit(
mHeaderPolicy->getMaxBigramCount()) :
static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
} }
} }

View File

@ -96,12 +96,11 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
} }
const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
.getValidUnigramCount(); .getValidUnigramCount();
if (headerPolicy->isDecayingDict() const int maxUnigramCount = headerPolicy->getMaxUnigramCount();
&& unigramCount > ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC) { if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) {
if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) {
ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC)) {
AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount, AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount,
ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC); maxUnigramCount);
return false; return false;
} }
} }
@ -114,11 +113,10 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
return false; return false;
} }
const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount(); const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount();
if (headerPolicy->isDecayingDict() const int maxBigramCount = headerPolicy->getMaxBigramCount();
&& bigramCount > ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC) { if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) {
if (!truncateBigrams(ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC)) { if (!truncateBigrams(maxBigramCount)) {
AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, maxBigramCount);
ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC);
return false; return false;
} }
} }

View File

@ -25,20 +25,17 @@
namespace latinime { namespace latinime {
const int ForgettingCurveUtils::MAX_UNIGRAM_COUNT = 12000;
const int ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC = 10000;
const int ForgettingCurveUtils::MAX_BIGRAM_COUNT = 12000;
const int ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC = 10000;
const int ForgettingCurveUtils::MULTIPLIER_TWO_IN_PROBABILITY_SCALE = 8; const int ForgettingCurveUtils::MULTIPLIER_TWO_IN_PROBABILITY_SCALE = 8;
const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60; const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60;
const int ForgettingCurveUtils::MAX_LEVEL = 3; const int ForgettingCurveUtils::MAX_LEVEL = 3;
const int ForgettingCurveUtils::MIN_VALID_LEVEL = 1; const int ForgettingCurveUtils::MIN_VALID_LEVEL = 1;
const int ForgettingCurveUtils::TIME_STEP_DURATION_IN_SECONDS = 6 * 60 * 60;
const int ForgettingCurveUtils::MAX_ELAPSED_TIME_STEP_COUNT = 15; const int ForgettingCurveUtils::MAX_ELAPSED_TIME_STEP_COUNT = 15;
const int ForgettingCurveUtils::DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD = 14; const int ForgettingCurveUtils::DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD = 14;
const float ForgettingCurveUtils::UNIGRAM_COUNT_HARD_LIMIT_WEIGHT = 1.2;
const float ForgettingCurveUtils::BIGRAM_COUNT_HARD_LIMIT_WEIGHT = 1.2;
const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable; const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable;
// TODO: Revise the logic to decide the initial probability depending on the given probability. // TODO: Revise the logic to decide the initial probability depending on the given probability.
@ -71,7 +68,8 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
/* static */ int ForgettingCurveUtils::decodeProbability( /* static */ int ForgettingCurveUtils::decodeProbability(
const HistoricalInfo *const historicalInfo, const HeaderPolicy *const headerPolicy) { const HistoricalInfo *const historicalInfo, const HeaderPolicy *const headerPolicy) {
const int elapsedTimeStepCount = getElapsedTimeStepCount(historicalInfo->getTimeStamp()); const int elapsedTimeStepCount = getElapsedTimeStepCount(historicalInfo->getTimeStamp(),
headerPolicy->getForgettingCurveDurationToLevelDown());
return sProbabilityTable.getProbability( return sProbabilityTable.getProbability(
headerPolicy->getForgettingCurveProbabilityValuesTableId(), historicalInfo->getLevel(), headerPolicy->getForgettingCurveProbabilityValuesTableId(), historicalInfo->getLevel(),
min(max(elapsedTimeStepCount, 0), MAX_ELAPSED_TIME_STEP_COUNT)); min(max(elapsedTimeStepCount, 0), MAX_ELAPSED_TIME_STEP_COUNT));
@ -90,9 +88,11 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
} }
} }
/* static */ bool ForgettingCurveUtils::needsToKeep(const HistoricalInfo *const historicalInfo) { /* static */ bool ForgettingCurveUtils::needsToKeep(const HistoricalInfo *const historicalInfo,
const HeaderPolicy *const headerPolicy) {
return historicalInfo->getLevel() > 0 return historicalInfo->getLevel() > 0
|| getElapsedTimeStepCount(historicalInfo->getTimeStamp()) || getElapsedTimeStepCount(historicalInfo->getTimeStamp(),
headerPolicy->getForgettingCurveDurationToLevelDown())
< DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD; < DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD;
} }
@ -102,7 +102,9 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
if (originalHistoricalInfo->getTimeStamp() == NOT_A_TIMESTAMP) { if (originalHistoricalInfo->getTimeStamp() == NOT_A_TIMESTAMP) {
return HistoricalInfo(); return HistoricalInfo();
} }
const int elapsedTimeStep = getElapsedTimeStepCount(originalHistoricalInfo->getTimeStamp()); const int durationToLevelDownInSeconds = headerPolicy->getForgettingCurveDurationToLevelDown();
const int elapsedTimeStep = getElapsedTimeStepCount(
originalHistoricalInfo->getTimeStamp(), durationToLevelDownInSeconds);
if (elapsedTimeStep <= MAX_ELAPSED_TIME_STEP_COUNT) { if (elapsedTimeStep <= MAX_ELAPSED_TIME_STEP_COUNT) {
// No need to update historical info. // No need to update historical info.
return *originalHistoricalInfo; return *originalHistoricalInfo;
@ -111,18 +113,18 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
const int maxLevelDownAmonut = elapsedTimeStep / (MAX_ELAPSED_TIME_STEP_COUNT + 1); const int maxLevelDownAmonut = elapsedTimeStep / (MAX_ELAPSED_TIME_STEP_COUNT + 1);
const int levelDownAmount = (maxLevelDownAmonut >= originalHistoricalInfo->getLevel()) ? const int levelDownAmount = (maxLevelDownAmonut >= originalHistoricalInfo->getLevel()) ?
originalHistoricalInfo->getLevel() : maxLevelDownAmonut; originalHistoricalInfo->getLevel() : maxLevelDownAmonut;
const int adjustedTimestamp = originalHistoricalInfo->getTimeStamp() + const int adjustedTimestampInSeconds = originalHistoricalInfo->getTimeStamp() +
levelDownAmount * (MAX_ELAPSED_TIME_STEP_COUNT + 1) * TIME_STEP_DURATION_IN_SECONDS; levelDownAmount * durationToLevelDownInSeconds;
return HistoricalInfo(adjustedTimestamp, return HistoricalInfo(adjustedTimestampInSeconds,
originalHistoricalInfo->getLevel() - levelDownAmount, 0 /* count */); originalHistoricalInfo->getLevel() - levelDownAmount, 0 /* count */);
} }
/* static */ bool ForgettingCurveUtils::needsToDecay(const bool mindsBlockByDecay, /* static */ bool ForgettingCurveUtils::needsToDecay(const bool mindsBlockByDecay,
const int unigramCount, const int bigramCount, const HeaderPolicy *const headerPolicy) { const int unigramCount, const int bigramCount, const HeaderPolicy *const headerPolicy) {
if (unigramCount >= ForgettingCurveUtils::MAX_UNIGRAM_COUNT) { if (unigramCount >= getUnigramCountHardLimit(headerPolicy->getMaxUnigramCount())) {
// Unigram count exceeds the limit. // Unigram count exceeds the limit.
return true; return true;
} else if (bigramCount >= ForgettingCurveUtils::MAX_BIGRAM_COUNT) { } else if (bigramCount >= getBigramCountHardLimit(headerPolicy->getMaxBigramCount())) {
// Bigram count exceeds the limit. // Bigram count exceeds the limit.
return true; return true;
} }
@ -143,8 +145,12 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
return unigramProbability; return unigramProbability;
} }
/* static */ int ForgettingCurveUtils::getElapsedTimeStepCount(const int timestamp) { /* static */ int ForgettingCurveUtils::getElapsedTimeStepCount(const int timestamp,
return (TimeKeeper::peekCurrentTime() - timestamp) / TIME_STEP_DURATION_IN_SECONDS; const int durationToLevelDownInSeconds) {
const int elapsedTimeInSeconds = TimeKeeper::peekCurrentTime() - timestamp;
const int timeStepDurationInSeconds =
durationToLevelDownInSeconds / (MAX_ELAPSED_TIME_STEP_COUNT + 1);
return elapsedTimeInSeconds / timeStepDurationInSeconds;
} }
const int ForgettingCurveUtils::ProbabilityTable::PROBABILITY_TABLE_COUNT = 4; const int ForgettingCurveUtils::ProbabilityTable::PROBABILITY_TABLE_COUNT = 4;
@ -172,12 +178,10 @@ ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTables() {
mTables[tableId][level][timeStepCount] = NOT_A_PROBABILITY; mTables[tableId][level][timeStepCount] = NOT_A_PROBABILITY;
continue; continue;
} }
const int elapsedTime = timeStepCount * TIME_STEP_DURATION_IN_SECONDS;
const float probability = initialProbability const float probability = initialProbability
* powf(initialProbability / endProbability, * powf(initialProbability / endProbability,
-1.0f * static_cast<float>(elapsedTime) -1.0f * static_cast<float>(timeStepCount)
/ static_cast<float>(TIME_STEP_DURATION_IN_SECONDS / static_cast<float>(MAX_ELAPSED_TIME_STEP_COUNT + 1));
* (MAX_ELAPSED_TIME_STEP_COUNT + 1)));
mTables[tableId][level][timeStepCount] = mTables[tableId][level][timeStepCount] =
min(max(static_cast<int>(probability), 1), MAX_PROBABILITY); min(max(static_cast<int>(probability), 1), MAX_PROBABILITY);
} }

View File

@ -28,11 +28,6 @@ class HeaderPolicy;
class ForgettingCurveUtils { class ForgettingCurveUtils {
public: public:
static const int MAX_UNIGRAM_COUNT;
static const int MAX_UNIGRAM_COUNT_AFTER_GC;
static const int MAX_BIGRAM_COUNT;
static const int MAX_BIGRAM_COUNT_AFTER_GC;
static const HistoricalInfo createUpdatedHistoricalInfo( static const HistoricalInfo createUpdatedHistoricalInfo(
const HistoricalInfo *const originalHistoricalInfo, const int newProbability, const HistoricalInfo *const originalHistoricalInfo, const int newProbability,
const int timestamp, const HeaderPolicy *const headerPolicy); const int timestamp, const HeaderPolicy *const headerPolicy);
@ -47,11 +42,22 @@ class ForgettingCurveUtils {
static int getProbability(const int encodedUnigramProbability, static int getProbability(const int encodedUnigramProbability,
const int encodedBigramProbability); const int encodedBigramProbability);
static bool needsToKeep(const HistoricalInfo *const historicalInfo); static bool needsToKeep(const HistoricalInfo *const historicalInfo,
const HeaderPolicy *const headerPolicy);
static bool needsToDecay(const bool mindsBlockByDecay, const int unigramCount, static bool needsToDecay(const bool mindsBlockByDecay, const int unigramCount,
const int bigramCount, const HeaderPolicy *const headerPolicy); const int bigramCount, const HeaderPolicy *const headerPolicy);
// Scales the given maximum unigram count by UNIGRAM_COUNT_HARD_LIMIT_WEIGHT and truncates the
// product to an int. needsToDecay() compares the current unigram count against this value.
AK_FORCE_INLINE static int getUnigramCountHardLimit(const int maxUnigramCount) {
    const float weightedLimit =
            static_cast<float>(maxUnigramCount) * UNIGRAM_COUNT_HARD_LIMIT_WEIGHT;
    return static_cast<int>(weightedLimit);
}
// Scales the given maximum bigram count by BIGRAM_COUNT_HARD_LIMIT_WEIGHT and truncates the
// product to an int. needsToDecay() compares the current bigram count against this value.
AK_FORCE_INLINE static int getBigramCountHardLimit(const int maxBigramCount) {
    const float weightedLimit =
            static_cast<float>(maxBigramCount) * BIGRAM_COUNT_HARD_LIMIT_WEIGHT;
    return static_cast<int>(weightedLimit);
}
private: private:
DISALLOW_IMPLICIT_CONSTRUCTORS(ForgettingCurveUtils); DISALLOW_IMPLICIT_CONSTRUCTORS(ForgettingCurveUtils);
@ -88,16 +94,17 @@ class ForgettingCurveUtils {
static const int MAX_LEVEL; static const int MAX_LEVEL;
static const int MIN_VALID_LEVEL; static const int MIN_VALID_LEVEL;
static const int TIME_STEP_DURATION_IN_SECONDS;
static const int MAX_ELAPSED_TIME_STEP_COUNT; static const int MAX_ELAPSED_TIME_STEP_COUNT;
static const int DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD; static const int DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD;
static const int HALF_LIFE_TIME_IN_SECONDS;
static const float UNIGRAM_COUNT_HARD_LIMIT_WEIGHT;
static const float BIGRAM_COUNT_HARD_LIMIT_WEIGHT;
static const ProbabilityTable sProbabilityTable; static const ProbabilityTable sProbabilityTable;
static int backoff(const int unigramProbability); static int backoff(const int unigramProbability);
static int getElapsedTimeStepCount(const int timestamp); static int getElapsedTimeStepCount(const int timestamp, const int durationToLevelDown);
}; };
} // namespace latinime } // namespace latinime
#endif /* LATINIME_FORGETTING_CURVE_UTILS_H */ #endif /* LATINIME_FORGETTING_CURVE_UTILS_H */