am 5128935a: Add header attributes for evaluation.
* commit '5128935ac4d7961e3c863270b828e47a79b97235': Add header attributes for evaluation.
This commit is contained in:
commit a222053f7f
10 changed files with 119 additions and 48 deletions
|
@ -64,6 +64,9 @@ abstract public class ExpandableBinaryDictionary extends Dictionary {
|
||||||
private static final int TIMEOUT_FOR_READ_OPS_IN_MILLISECONDS = 100;
|
private static final int TIMEOUT_FOR_READ_OPS_IN_MILLISECONDS = 100;
|
||||||
private static final int TIMEOUT_FOR_READ_OPS_FOR_TESTS_IN_MILLISECONDS = 10000;
|
private static final int TIMEOUT_FOR_READ_OPS_FOR_TESTS_IN_MILLISECONDS = 10000;
|
||||||
|
|
||||||
|
private static final int DEFAULT_MAX_UNIGRAM_COUNT = 10000;
|
||||||
|
private static final int DEFAULT_MAX_BIGRAM_COUNT = 10000;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The maximum length of a word in this dictionary.
|
* The maximum length of a word in this dictionary.
|
||||||
*/
|
*/
|
||||||
|
@ -207,6 +210,10 @@ abstract public class ExpandableBinaryDictionary extends Dictionary {
|
||||||
attributeMap.put(DictionaryHeader.DICTIONARY_LOCALE_KEY, mLocale.toString());
|
attributeMap.put(DictionaryHeader.DICTIONARY_LOCALE_KEY, mLocale.toString());
|
||||||
attributeMap.put(DictionaryHeader.DICTIONARY_VERSION_KEY,
|
attributeMap.put(DictionaryHeader.DICTIONARY_VERSION_KEY,
|
||||||
String.valueOf(TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis())));
|
String.valueOf(TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis())));
|
||||||
|
attributeMap.put(DictionaryHeader.MAX_UNIGRAM_COUNT_KEY,
|
||||||
|
String.valueOf(DEFAULT_MAX_UNIGRAM_COUNT));
|
||||||
|
attributeMap.put(DictionaryHeader.MAX_BIGRAM_COUNT_KEY,
|
||||||
|
String.valueOf(DEFAULT_MAX_BIGRAM_COUNT));
|
||||||
return attributeMap;
|
return attributeMap;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -42,6 +42,10 @@ public final class DictionaryHeader {
|
||||||
"FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP";
|
"FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP";
|
||||||
public static final String FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY =
|
public static final String FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY =
|
||||||
"FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID";
|
"FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID";
|
||||||
|
public static final String FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY =
|
||||||
|
"FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS";
|
||||||
|
public static final String MAX_UNIGRAM_COUNT_KEY = "MAX_UNIGRAM_COUNT";
|
||||||
|
public static final String MAX_BIGRAM_COUNT_KEY = "MAX_BIGRAM_COUNT";
|
||||||
public static final String ATTRIBUTE_VALUE_TRUE = "1";
|
public static final String ATTRIBUTE_VALUE_TRUE = "1";
|
||||||
|
|
||||||
public DictionaryHeader(final int headerSize, final DictionaryOptions dictionaryOptions,
|
public DictionaryHeader(final int headerSize, final DictionaryOptions dictionaryOptions,
|
||||||
|
|
|
@ -162,7 +162,7 @@ bool Ver4BigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(const i
|
||||||
} else if (bigramEntry.hasHistoricalInfo()) {
|
} else if (bigramEntry.hasHistoricalInfo()) {
|
||||||
const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave(
|
const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave(
|
||||||
bigramEntry.getHistoricalInfo(), mHeaderPolicy);
|
bigramEntry.getHistoricalInfo(), mHeaderPolicy);
|
||||||
if (ForgettingCurveUtils::needsToKeep(&historicalInfo)) {
|
if (ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy)) {
|
||||||
const BigramEntry updatedBigramEntry =
|
const BigramEntry updatedBigramEntry =
|
||||||
bigramEntry.updateHistoricalInfoAndGetEntry(&historicalInfo);
|
bigramEntry.updateHistoricalInfoAndGetEntry(&historicalInfo);
|
||||||
if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) {
|
if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) {
|
||||||
|
|
|
@ -37,11 +37,22 @@ const char *const HeaderPolicy::FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY =
|
||||||
"FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP";
|
"FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP";
|
||||||
const char *const HeaderPolicy::FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY =
|
const char *const HeaderPolicy::FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY =
|
||||||
"FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID";
|
"FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID";
|
||||||
|
const char *const HeaderPolicy::FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY =
|
||||||
|
"FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS";
|
||||||
|
|
||||||
|
const char *const HeaderPolicy::MAX_UNIGRAM_COUNT_KEY = "MAX_UNIGRAM_COUNT";
|
||||||
|
const char *const HeaderPolicy::MAX_BIGRAM_COUNT_KEY = "MAX_BIGRAM_COUNT";
|
||||||
|
|
||||||
const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100;
|
const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100;
|
||||||
const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f;
|
const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f;
|
||||||
const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP = 4;
|
const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP = 4;
|
||||||
const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID = 0;
|
const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID = 0;
|
||||||
|
// 4 days
|
||||||
|
const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS =
|
||||||
|
4 * 24 * 60 * 60;
|
||||||
|
|
||||||
|
const int HeaderPolicy::DEFAULT_MAX_UNIGRAM_COUNT = 10000;
|
||||||
|
const int HeaderPolicy::DEFAULT_MAX_BIGRAM_COUNT = 10000;
|
||||||
|
|
||||||
// Used for logging. Question mark is used to indicate that the key is not found.
|
// Used for logging. Question mark is used to indicate that the key is not found.
|
||||||
void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *outValue,
|
void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *outValue,
|
||||||
|
|
|
@ -58,7 +58,14 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
||||||
DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)),
|
DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)),
|
||||||
mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
|
mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
|
||||||
&mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
|
&mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
|
||||||
DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)) {}
|
DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)),
|
||||||
|
mForgettingCurveDurationToLevelDown(HeaderReadWriteUtils::readIntAttributeValue(
|
||||||
|
&mAttributeMap, FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY,
|
||||||
|
DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS)),
|
||||||
|
mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(
|
||||||
|
&mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
|
||||||
|
mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
|
||||||
|
&mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {}
|
||||||
|
|
||||||
// Constructs header information using an attribute map.
|
// Constructs header information using an attribute map.
|
||||||
HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,
|
HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,
|
||||||
|
@ -83,7 +90,14 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
||||||
DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)),
|
DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)),
|
||||||
mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
|
mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
|
||||||
&mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
|
&mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
|
||||||
DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)) {}
|
DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)),
|
||||||
|
mForgettingCurveDurationToLevelDown(HeaderReadWriteUtils::readIntAttributeValue(
|
||||||
|
&mAttributeMap, FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY,
|
||||||
|
DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS)),
|
||||||
|
mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(
|
||||||
|
&mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
|
||||||
|
mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
|
||||||
|
&mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {}
|
||||||
|
|
||||||
// Temporary dummy header.
|
// Temporary dummy header.
|
||||||
HeaderPolicy()
|
HeaderPolicy()
|
||||||
|
@ -92,7 +106,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
||||||
mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false),
|
mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false),
|
||||||
mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0),
|
mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0),
|
||||||
mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false),
|
mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false),
|
||||||
mForgettingCurveOccurrencesToLevelUp(0), mForgettingCurveProbabilityValuesTableId(0) {}
|
mForgettingCurveOccurrencesToLevelUp(0), mForgettingCurveProbabilityValuesTableId(0),
|
||||||
|
mForgettingCurveDurationToLevelDown(0), mMaxUnigramCount(0), mMaxBigramCount(0) {}
|
||||||
|
|
||||||
~HeaderPolicy() {}
|
~HeaderPolicy() {}
|
||||||
|
|
||||||
|
@ -179,6 +194,18 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
||||||
return mForgettingCurveProbabilityValuesTableId;
|
return mForgettingCurveProbabilityValuesTableId;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
AK_FORCE_INLINE int getForgettingCurveDurationToLevelDown() const {
|
||||||
|
return mForgettingCurveDurationToLevelDown;
|
||||||
|
}
|
||||||
|
|
||||||
|
AK_FORCE_INLINE int getMaxUnigramCount() const {
|
||||||
|
return mMaxUnigramCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
AK_FORCE_INLINE int getMaxBigramCount() const {
|
||||||
|
return mMaxBigramCount;
|
||||||
|
}
|
||||||
|
|
||||||
void readHeaderValueOrQuestionMark(const char *const key,
|
void readHeaderValueOrQuestionMark(const char *const key,
|
||||||
int *outValue, int outValueSize) const;
|
int *outValue, int outValueSize) const;
|
||||||
|
|
||||||
|
@ -205,10 +232,16 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
||||||
static const char *const LOCALE_KEY;
|
static const char *const LOCALE_KEY;
|
||||||
static const char *const FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY;
|
static const char *const FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY;
|
||||||
static const char *const FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY;
|
static const char *const FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY;
|
||||||
|
static const char *const FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY;
|
||||||
|
static const char *const MAX_UNIGRAM_COUNT_KEY;
|
||||||
|
static const char *const MAX_BIGRAM_COUNT_KEY;
|
||||||
static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE;
|
static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE;
|
||||||
static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE;
|
static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE;
|
||||||
static const int DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP;
|
static const int DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP;
|
||||||
static const int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID;
|
static const int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID;
|
||||||
|
static const int DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS;
|
||||||
|
static const int DEFAULT_MAX_UNIGRAM_COUNT;
|
||||||
|
static const int DEFAULT_MAX_BIGRAM_COUNT;
|
||||||
|
|
||||||
const FormatUtils::FORMAT_VERSION mDictFormatVersion;
|
const FormatUtils::FORMAT_VERSION mDictFormatVersion;
|
||||||
const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags;
|
const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags;
|
||||||
|
@ -226,6 +259,9 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
||||||
const bool mHasHistoricalInfoOfWords;
|
const bool mHasHistoricalInfoOfWords;
|
||||||
const int mForgettingCurveOccurrencesToLevelUp;
|
const int mForgettingCurveOccurrencesToLevelUp;
|
||||||
const int mForgettingCurveProbabilityValuesTableId;
|
const int mForgettingCurveProbabilityValuesTableId;
|
||||||
|
const int mForgettingCurveDurationToLevelDown;
|
||||||
|
const int mMaxUnigramCount;
|
||||||
|
const int mMaxBigramCount;
|
||||||
|
|
||||||
const std::vector<int> readLocale() const;
|
const std::vector<int> readLocale() const;
|
||||||
float readMultipleWordCostMultiplier() const;
|
float readMultipleWordCostMultiplier() const;
|
||||||
|
|
|
@ -168,7 +168,7 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeA
|
||||||
toBeUpdatedPtNodeParams->getTerminalId());
|
toBeUpdatedPtNodeParams->getTerminalId());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
const bool isValid = ForgettingCurveUtils::needsToKeep(&historicalInfo);
|
const bool isValid = ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy);
|
||||||
if (!isValid) {
|
if (!isValid) {
|
||||||
if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) {
|
if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) {
|
||||||
AKLOGE("Cannot mark PtNode as willBecomeNonTerminal.");
|
AKLOGE("Cannot mark PtNode as willBecomeNonTerminal.");
|
||||||
|
|
|
@ -329,11 +329,15 @@ void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int quer
|
||||||
snprintf(outResult, maxResultLength, "%d", mBigramCount);
|
snprintf(outResult, maxResultLength, "%d", mBigramCount);
|
||||||
} else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) {
|
} else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) {
|
||||||
snprintf(outResult, maxResultLength, "%d",
|
snprintf(outResult, maxResultLength, "%d",
|
||||||
mHeaderPolicy->isDecayingDict() ? ForgettingCurveUtils::MAX_UNIGRAM_COUNT :
|
mHeaderPolicy->isDecayingDict() ?
|
||||||
|
ForgettingCurveUtils::getUnigramCountHardLimit(
|
||||||
|
mHeaderPolicy->getMaxUnigramCount()) :
|
||||||
static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
|
static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
|
||||||
} else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) {
|
} else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) {
|
||||||
snprintf(outResult, maxResultLength, "%d",
|
snprintf(outResult, maxResultLength, "%d",
|
||||||
mHeaderPolicy->isDecayingDict() ? ForgettingCurveUtils::MAX_BIGRAM_COUNT :
|
mHeaderPolicy->isDecayingDict() ?
|
||||||
|
ForgettingCurveUtils::getBigramCountHardLimit(
|
||||||
|
mHeaderPolicy->getMaxBigramCount()) :
|
||||||
static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
|
static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -96,12 +96,11 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
|
||||||
}
|
}
|
||||||
const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
|
const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
|
||||||
.getValidUnigramCount();
|
.getValidUnigramCount();
|
||||||
if (headerPolicy->isDecayingDict()
|
const int maxUnigramCount = headerPolicy->getMaxUnigramCount();
|
||||||
&& unigramCount > ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC) {
|
if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) {
|
||||||
if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter,
|
if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) {
|
||||||
ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC)) {
|
|
||||||
AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount,
|
AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount,
|
||||||
ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC);
|
maxUnigramCount);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -114,11 +113,10 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount();
|
const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount();
|
||||||
if (headerPolicy->isDecayingDict()
|
const int maxBigramCount = headerPolicy->getMaxBigramCount();
|
||||||
&& bigramCount > ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC) {
|
if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) {
|
||||||
if (!truncateBigrams(ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC)) {
|
if (!truncateBigrams(maxBigramCount)) {
|
||||||
AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount,
|
AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, maxBigramCount);
|
||||||
ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,20 +25,17 @@
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
const int ForgettingCurveUtils::MAX_UNIGRAM_COUNT = 12000;
|
|
||||||
const int ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC = 10000;
|
|
||||||
const int ForgettingCurveUtils::MAX_BIGRAM_COUNT = 12000;
|
|
||||||
const int ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC = 10000;
|
|
||||||
|
|
||||||
const int ForgettingCurveUtils::MULTIPLIER_TWO_IN_PROBABILITY_SCALE = 8;
|
const int ForgettingCurveUtils::MULTIPLIER_TWO_IN_PROBABILITY_SCALE = 8;
|
||||||
const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60;
|
const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60;
|
||||||
|
|
||||||
const int ForgettingCurveUtils::MAX_LEVEL = 3;
|
const int ForgettingCurveUtils::MAX_LEVEL = 3;
|
||||||
const int ForgettingCurveUtils::MIN_VALID_LEVEL = 1;
|
const int ForgettingCurveUtils::MIN_VALID_LEVEL = 1;
|
||||||
const int ForgettingCurveUtils::TIME_STEP_DURATION_IN_SECONDS = 6 * 60 * 60;
|
|
||||||
const int ForgettingCurveUtils::MAX_ELAPSED_TIME_STEP_COUNT = 15;
|
const int ForgettingCurveUtils::MAX_ELAPSED_TIME_STEP_COUNT = 15;
|
||||||
const int ForgettingCurveUtils::DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD = 14;
|
const int ForgettingCurveUtils::DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD = 14;
|
||||||
|
|
||||||
|
const float ForgettingCurveUtils::UNIGRAM_COUNT_HARD_LIMIT_WEIGHT = 1.2;
|
||||||
|
const float ForgettingCurveUtils::BIGRAM_COUNT_HARD_LIMIT_WEIGHT = 1.2;
|
||||||
|
|
||||||
const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable;
|
const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable;
|
||||||
|
|
||||||
// TODO: Revise the logic to decide the initial probability depending on the given probability.
|
// TODO: Revise the logic to decide the initial probability depending on the given probability.
|
||||||
|
@ -71,7 +68,8 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
|
||||||
|
|
||||||
/* static */ int ForgettingCurveUtils::decodeProbability(
|
/* static */ int ForgettingCurveUtils::decodeProbability(
|
||||||
const HistoricalInfo *const historicalInfo, const HeaderPolicy *const headerPolicy) {
|
const HistoricalInfo *const historicalInfo, const HeaderPolicy *const headerPolicy) {
|
||||||
const int elapsedTimeStepCount = getElapsedTimeStepCount(historicalInfo->getTimeStamp());
|
const int elapsedTimeStepCount = getElapsedTimeStepCount(historicalInfo->getTimeStamp(),
|
||||||
|
headerPolicy->getForgettingCurveDurationToLevelDown());
|
||||||
return sProbabilityTable.getProbability(
|
return sProbabilityTable.getProbability(
|
||||||
headerPolicy->getForgettingCurveProbabilityValuesTableId(), historicalInfo->getLevel(),
|
headerPolicy->getForgettingCurveProbabilityValuesTableId(), historicalInfo->getLevel(),
|
||||||
min(max(elapsedTimeStepCount, 0), MAX_ELAPSED_TIME_STEP_COUNT));
|
min(max(elapsedTimeStepCount, 0), MAX_ELAPSED_TIME_STEP_COUNT));
|
||||||
|
@ -90,10 +88,12 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */ bool ForgettingCurveUtils::needsToKeep(const HistoricalInfo *const historicalInfo) {
|
/* static */ bool ForgettingCurveUtils::needsToKeep(const HistoricalInfo *const historicalInfo,
|
||||||
|
const HeaderPolicy *const headerPolicy) {
|
||||||
return historicalInfo->getLevel() > 0
|
return historicalInfo->getLevel() > 0
|
||||||
|| getElapsedTimeStepCount(historicalInfo->getTimeStamp())
|
|| getElapsedTimeStepCount(historicalInfo->getTimeStamp(),
|
||||||
< DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD;
|
headerPolicy->getForgettingCurveDurationToLevelDown())
|
||||||
|
< DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */ const HistoricalInfo ForgettingCurveUtils::createHistoricalInfoToSave(
|
/* static */ const HistoricalInfo ForgettingCurveUtils::createHistoricalInfoToSave(
|
||||||
|
@ -102,7 +102,9 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
|
||||||
if (originalHistoricalInfo->getTimeStamp() == NOT_A_TIMESTAMP) {
|
if (originalHistoricalInfo->getTimeStamp() == NOT_A_TIMESTAMP) {
|
||||||
return HistoricalInfo();
|
return HistoricalInfo();
|
||||||
}
|
}
|
||||||
const int elapsedTimeStep = getElapsedTimeStepCount(originalHistoricalInfo->getTimeStamp());
|
const int durationToLevelDownInSeconds = headerPolicy->getForgettingCurveDurationToLevelDown();
|
||||||
|
const int elapsedTimeStep = getElapsedTimeStepCount(
|
||||||
|
originalHistoricalInfo->getTimeStamp(), durationToLevelDownInSeconds);
|
||||||
if (elapsedTimeStep <= MAX_ELAPSED_TIME_STEP_COUNT) {
|
if (elapsedTimeStep <= MAX_ELAPSED_TIME_STEP_COUNT) {
|
||||||
// No need to update historical info.
|
// No need to update historical info.
|
||||||
return *originalHistoricalInfo;
|
return *originalHistoricalInfo;
|
||||||
|
@ -111,18 +113,18 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
|
||||||
const int maxLevelDownAmonut = elapsedTimeStep / (MAX_ELAPSED_TIME_STEP_COUNT + 1);
|
const int maxLevelDownAmonut = elapsedTimeStep / (MAX_ELAPSED_TIME_STEP_COUNT + 1);
|
||||||
const int levelDownAmount = (maxLevelDownAmonut >= originalHistoricalInfo->getLevel()) ?
|
const int levelDownAmount = (maxLevelDownAmonut >= originalHistoricalInfo->getLevel()) ?
|
||||||
originalHistoricalInfo->getLevel() : maxLevelDownAmonut;
|
originalHistoricalInfo->getLevel() : maxLevelDownAmonut;
|
||||||
const int adjustedTimestamp = originalHistoricalInfo->getTimeStamp() +
|
const int adjustedTimestampInSeconds = originalHistoricalInfo->getTimeStamp() +
|
||||||
levelDownAmount * (MAX_ELAPSED_TIME_STEP_COUNT + 1) * TIME_STEP_DURATION_IN_SECONDS;
|
levelDownAmount * durationToLevelDownInSeconds;
|
||||||
return HistoricalInfo(adjustedTimestamp,
|
return HistoricalInfo(adjustedTimestampInSeconds,
|
||||||
originalHistoricalInfo->getLevel() - levelDownAmount, 0 /* count */);
|
originalHistoricalInfo->getLevel() - levelDownAmount, 0 /* count */);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */ bool ForgettingCurveUtils::needsToDecay(const bool mindsBlockByDecay,
|
/* static */ bool ForgettingCurveUtils::needsToDecay(const bool mindsBlockByDecay,
|
||||||
const int unigramCount, const int bigramCount, const HeaderPolicy *const headerPolicy) {
|
const int unigramCount, const int bigramCount, const HeaderPolicy *const headerPolicy) {
|
||||||
if (unigramCount >= ForgettingCurveUtils::MAX_UNIGRAM_COUNT) {
|
if (unigramCount >= getUnigramCountHardLimit(headerPolicy->getMaxUnigramCount())) {
|
||||||
// Unigram count exceeds the limit.
|
// Unigram count exceeds the limit.
|
||||||
return true;
|
return true;
|
||||||
} else if (bigramCount >= ForgettingCurveUtils::MAX_BIGRAM_COUNT) {
|
} else if (bigramCount >= getBigramCountHardLimit(headerPolicy->getMaxBigramCount())) {
|
||||||
// Bigram count exceeds the limit.
|
// Bigram count exceeds the limit.
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -143,8 +145,12 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
|
||||||
return unigramProbability;
|
return unigramProbability;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */ int ForgettingCurveUtils::getElapsedTimeStepCount(const int timestamp) {
|
/* static */ int ForgettingCurveUtils::getElapsedTimeStepCount(const int timestamp,
|
||||||
return (TimeKeeper::peekCurrentTime() - timestamp) / TIME_STEP_DURATION_IN_SECONDS;
|
const int durationToLevelDownInSeconds) {
|
||||||
|
const int elapsedTimeInSeconds = TimeKeeper::peekCurrentTime() - timestamp;
|
||||||
|
const int timeStepDurationInSeconds =
|
||||||
|
durationToLevelDownInSeconds / (MAX_ELAPSED_TIME_STEP_COUNT + 1);
|
||||||
|
return elapsedTimeInSeconds / timeStepDurationInSeconds;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int ForgettingCurveUtils::ProbabilityTable::PROBABILITY_TABLE_COUNT = 4;
|
const int ForgettingCurveUtils::ProbabilityTable::PROBABILITY_TABLE_COUNT = 4;
|
||||||
|
@ -172,12 +178,10 @@ ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTables() {
|
||||||
mTables[tableId][level][timeStepCount] = NOT_A_PROBABILITY;
|
mTables[tableId][level][timeStepCount] = NOT_A_PROBABILITY;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
const int elapsedTime = timeStepCount * TIME_STEP_DURATION_IN_SECONDS;
|
|
||||||
const float probability = initialProbability
|
const float probability = initialProbability
|
||||||
* powf(initialProbability / endProbability,
|
* powf(initialProbability / endProbability,
|
||||||
-1.0f * static_cast<float>(elapsedTime)
|
-1.0f * static_cast<float>(timeStepCount)
|
||||||
/ static_cast<float>(TIME_STEP_DURATION_IN_SECONDS
|
/ static_cast<float>(MAX_ELAPSED_TIME_STEP_COUNT + 1));
|
||||||
* (MAX_ELAPSED_TIME_STEP_COUNT + 1)));
|
|
||||||
mTables[tableId][level][timeStepCount] =
|
mTables[tableId][level][timeStepCount] =
|
||||||
min(max(static_cast<int>(probability), 1), MAX_PROBABILITY);
|
min(max(static_cast<int>(probability), 1), MAX_PROBABILITY);
|
||||||
}
|
}
|
||||||
|
|
|
@ -28,11 +28,6 @@ class HeaderPolicy;
|
||||||
|
|
||||||
class ForgettingCurveUtils {
|
class ForgettingCurveUtils {
|
||||||
public:
|
public:
|
||||||
static const int MAX_UNIGRAM_COUNT;
|
|
||||||
static const int MAX_UNIGRAM_COUNT_AFTER_GC;
|
|
||||||
static const int MAX_BIGRAM_COUNT;
|
|
||||||
static const int MAX_BIGRAM_COUNT_AFTER_GC;
|
|
||||||
|
|
||||||
static const HistoricalInfo createUpdatedHistoricalInfo(
|
static const HistoricalInfo createUpdatedHistoricalInfo(
|
||||||
const HistoricalInfo *const originalHistoricalInfo, const int newProbability,
|
const HistoricalInfo *const originalHistoricalInfo, const int newProbability,
|
||||||
const int timestamp, const HeaderPolicy *const headerPolicy);
|
const int timestamp, const HeaderPolicy *const headerPolicy);
|
||||||
|
@ -47,11 +42,22 @@ class ForgettingCurveUtils {
|
||||||
static int getProbability(const int encodedUnigramProbability,
|
static int getProbability(const int encodedUnigramProbability,
|
||||||
const int encodedBigramProbability);
|
const int encodedBigramProbability);
|
||||||
|
|
||||||
static bool needsToKeep(const HistoricalInfo *const historicalInfo);
|
static bool needsToKeep(const HistoricalInfo *const historicalInfo,
|
||||||
|
const HeaderPolicy *const headerPolicy);
|
||||||
|
|
||||||
static bool needsToDecay(const bool mindsBlockByDecay, const int unigramCount,
|
static bool needsToDecay(const bool mindsBlockByDecay, const int unigramCount,
|
||||||
const int bigramCount, const HeaderPolicy *const headerPolicy);
|
const int bigramCount, const HeaderPolicy *const headerPolicy);
|
||||||
|
|
||||||
|
AK_FORCE_INLINE static int getUnigramCountHardLimit(const int maxUnigramCount) {
|
||||||
|
return static_cast<int>(static_cast<float>(maxUnigramCount)
|
||||||
|
* UNIGRAM_COUNT_HARD_LIMIT_WEIGHT);
|
||||||
|
}
|
||||||
|
|
||||||
|
AK_FORCE_INLINE static int getBigramCountHardLimit(const int maxBigramCount) {
|
||||||
|
return static_cast<int>(static_cast<float>(maxBigramCount)
|
||||||
|
* BIGRAM_COUNT_HARD_LIMIT_WEIGHT);
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(ForgettingCurveUtils);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(ForgettingCurveUtils);
|
||||||
|
|
||||||
|
@ -88,16 +94,17 @@ class ForgettingCurveUtils {
|
||||||
|
|
||||||
static const int MAX_LEVEL;
|
static const int MAX_LEVEL;
|
||||||
static const int MIN_VALID_LEVEL;
|
static const int MIN_VALID_LEVEL;
|
||||||
static const int TIME_STEP_DURATION_IN_SECONDS;
|
|
||||||
static const int MAX_ELAPSED_TIME_STEP_COUNT;
|
static const int MAX_ELAPSED_TIME_STEP_COUNT;
|
||||||
static const int DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD;
|
static const int DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD;
|
||||||
static const int HALF_LIFE_TIME_IN_SECONDS;
|
|
||||||
|
static const float UNIGRAM_COUNT_HARD_LIMIT_WEIGHT;
|
||||||
|
static const float BIGRAM_COUNT_HARD_LIMIT_WEIGHT;
|
||||||
|
|
||||||
static const ProbabilityTable sProbabilityTable;
|
static const ProbabilityTable sProbabilityTable;
|
||||||
|
|
||||||
static int backoff(const int unigramProbability);
|
static int backoff(const int unigramProbability);
|
||||||
|
|
||||||
static int getElapsedTimeStepCount(const int timestamp);
|
static int getElapsedTimeStepCount(const int timestamp, const int durationToLevelDown);
|
||||||
};
|
};
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
#endif /* LATINIME_FORGETTING_CURVE_UTILS_H */
|
#endif /* LATINIME_FORGETTING_CURVE_UTILS_H */
|
||||||
|
|
Loading…
Reference in a new issue