am 2fdb62a6: Start using historical info for decaying.
* commit '2fdb62a6a28d769435fe5d48d8b997c3e9a083d5': Start using historical info for decaying.main
commit
35dff6f9f2
|
@ -163,7 +163,8 @@ int BigramDictionary::getBigramProbability(const int *word0, int length0, const
|
|||
mDictionaryStructurePolicy->getBigramsStructurePolicy(), pos);
|
||||
while (bigramsIt.hasNext()) {
|
||||
bigramsIt.next();
|
||||
if (bigramsIt.getBigramPos() == nextWordPos) {
|
||||
if (bigramsIt.getBigramPos() == nextWordPos
|
||||
&& bigramsIt.getProbability() != NOT_A_PROBABILITY) {
|
||||
return mDictionaryStructurePolicy->getProbability(
|
||||
mDictionaryStructurePolicy->getUnigramProbabilityOfPtNode(nextWordPos),
|
||||
bigramsIt.getProbability());
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h"
|
||||
|
||||
#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h"
|
||||
#include "suggest/policyimpl/dictionary/header/header_policy.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
|
||||
|
@ -34,7 +35,12 @@ void Ver4BigramListPolicy::getNextBigram(int *const outBigramPos, int *const out
|
|||
bigramEntry.getTargetTerminalId());
|
||||
}
|
||||
if (outProbability) {
|
||||
*outProbability = bigramEntry.getProbability();
|
||||
if (bigramEntry.hasHistoricalInfo()) {
|
||||
*outProbability =
|
||||
ForgettingCurveUtils::decodeProbability(bigramEntry.getHistoricalInfo());
|
||||
} else {
|
||||
*outProbability = bigramEntry.getProbability();
|
||||
}
|
||||
}
|
||||
if (outHasNext) {
|
||||
*outHasNext = bigramEntry.hasNext();
|
||||
|
@ -152,17 +158,12 @@ bool Ver4BigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(const i
|
|||
if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) {
|
||||
return false;
|
||||
}
|
||||
} else if (mNeedsToDecayWhenUpdating) {
|
||||
const int probability = ForgettingCurveUtils::getEncodedProbabilityToSave(
|
||||
bigramEntry.getProbability(), mHeaderPolicy);
|
||||
const HistoricalInfo historicalInfo =
|
||||
ForgettingCurveUtils::createHistoricalInfoToSave(
|
||||
bigramEntry.getHistoricalInfo());
|
||||
// TODO: Use ForgettingCurveUtils::needsToKeep(&historicalInfo).
|
||||
if (ForgettingCurveUtils::isValidEncodedProbability(probability)) {
|
||||
} else if (bigramEntry.hasHistoricalInfo()) {
|
||||
const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave(
|
||||
bigramEntry.getHistoricalInfo());
|
||||
if (ForgettingCurveUtils::needsToKeep(&historicalInfo)) {
|
||||
const BigramEntry updatedBigramEntry =
|
||||
bigramEntry.updateProbabilityAndGetEntry(probability)
|
||||
.updateHistoricalInfoAndGetEntry(&historicalInfo);
|
||||
bigramEntry.updateHistoricalInfoAndGetEntry(&historicalInfo);
|
||||
if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) {
|
||||
return false;
|
||||
}
|
||||
|
@ -225,14 +226,12 @@ int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind,
|
|||
const BigramEntry Ver4BigramListPolicy::createUpdatedBigramEntryFrom(
|
||||
const BigramEntry *const originalBigramEntry, const int newProbability,
|
||||
const int timestamp) const {
|
||||
if (mNeedsToDecayWhenUpdating) {
|
||||
const int probability = ForgettingCurveUtils::getUpdatedEncodedProbability(
|
||||
originalBigramEntry->getProbability(), newProbability);
|
||||
// TODO: Consolidate historical info and probability.
|
||||
if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
|
||||
const HistoricalInfo updatedHistoricalInfo =
|
||||
ForgettingCurveUtils::createUpdatedHistoricalInfo(
|
||||
originalBigramEntry->getHistoricalInfo(), newProbability, timestamp);
|
||||
return originalBigramEntry->updateProbabilityAndGetEntry(probability)
|
||||
.updateHistoricalInfoAndGetEntry(&updatedHistoricalInfo);
|
||||
return originalBigramEntry->updateHistoricalInfoAndGetEntry(&updatedHistoricalInfo);
|
||||
} else {
|
||||
return originalBigramEntry->updateProbabilityAndGetEntry(newProbability);
|
||||
}
|
||||
|
|
|
@ -24,18 +24,17 @@
|
|||
namespace latinime {
|
||||
|
||||
class BigramDictContent;
|
||||
class DictionaryHeaderStructurePolicy;
|
||||
class HeaderPolicy;
|
||||
class TerminalPositionLookupTable;
|
||||
|
||||
class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy {
|
||||
public:
|
||||
Ver4BigramListPolicy(BigramDictContent *const bigramDictContent,
|
||||
const TerminalPositionLookupTable *const terminalPositionLookupTable,
|
||||
const DictionaryHeaderStructurePolicy *const headerPolicy,
|
||||
const bool needsToDecayWhenUpdating)
|
||||
const HeaderPolicy *const headerPolicy)
|
||||
: mBigramDictContent(bigramDictContent),
|
||||
mTerminalPositionLookupTable(terminalPositionLookupTable),
|
||||
mHeaderPolicy(headerPolicy), mNeedsToDecayWhenUpdating(needsToDecayWhenUpdating) {}
|
||||
mHeaderPolicy(headerPolicy) {}
|
||||
|
||||
void getNextBigram(int *const outBigramPos, int *const outProbability,
|
||||
bool *const outHasNext, int *const bigramEntryPos) const;
|
||||
|
@ -64,8 +63,7 @@ class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy {
|
|||
|
||||
BigramDictContent *const mBigramDictContent;
|
||||
const TerminalPositionLookupTable *const mTerminalPositionLookupTable;
|
||||
const DictionaryHeaderStructurePolicy *const mHeaderPolicy;
|
||||
const bool mNeedsToDecayWhenUpdating;
|
||||
const HeaderPolicy *const mHeaderPolicy;
|
||||
};
|
||||
} // namespace latinime
|
||||
#endif /* LATINIME_VER4_BIGRAM_LIST_POLICY_H */
|
||||
|
|
|
@ -93,6 +93,18 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
|||
}
|
||||
}
|
||||
|
||||
AK_FORCE_INLINE bool isValid() const {
|
||||
// Decaying dictionary must have historical information.
|
||||
if (!mIsDecayingDict) {
|
||||
return true;
|
||||
}
|
||||
if (mHasHistoricalInfoOfWords) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
AK_FORCE_INLINE int getSize() const {
|
||||
return mSize;
|
||||
}
|
||||
|
@ -137,7 +149,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
|||
return mExtendedRegionSize;
|
||||
}
|
||||
|
||||
AK_FORCE_INLINE bool hasHistricalInfoOfWords() const {
|
||||
AK_FORCE_INLINE bool hasHistoricalInfoOfWords() const {
|
||||
return mHasHistoricalInfoOfWords;
|
||||
}
|
||||
|
||||
|
|
|
@ -20,7 +20,6 @@
|
|||
#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_writing_utils.h"
|
||||
#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
|
||||
|
||||
namespace latinime {
|
||||
|
||||
|
@ -30,8 +29,7 @@ bool DynamicPatriciaTrieGcEventListeners
|
|||
// PtNode is useless when the PtNode is not a terminal and doesn't have any not useless
|
||||
// children.
|
||||
bool isUselessPtNode = !ptNodeParams->isTerminal();
|
||||
// TODO: Quit checking mNeedsToDecayWhenUpdating.
|
||||
if (ptNodeParams->isTerminal() && mNeedsToDecayWhenUpdating) {
|
||||
if (ptNodeParams->isTerminal()) {
|
||||
bool needsToKeepPtNode = true;
|
||||
if (!mPtNodeWriter->updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC(ptNodeParams,
|
||||
&needsToKeepPtNode)) {
|
||||
|
|
|
@ -27,8 +27,6 @@
|
|||
|
||||
namespace latinime {
|
||||
|
||||
class DictionaryHeaderStructurePolicy;
|
||||
class PtNodeWriter;
|
||||
class PtNodeParams;
|
||||
|
||||
// TODO: Move to pt_common.
|
||||
|
@ -41,12 +39,9 @@ class DynamicPatriciaTrieGcEventListeners {
|
|||
: public DynamicPatriciaTrieReadingHelper::TraversingEventListener {
|
||||
public:
|
||||
TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
|
||||
const DictionaryHeaderStructurePolicy *const headerPolicy,
|
||||
PtNodeWriter *const ptNodeWriter, BufferWithExtendableBuffer *const buffer,
|
||||
const bool needsToDecayWhenUpdating)
|
||||
: mHeaderPolicy(headerPolicy), mPtNodeWriter(ptNodeWriter), mBuffer(buffer),
|
||||
mNeedsToDecayWhenUpdating(needsToDecayWhenUpdating), mValueStack(),
|
||||
mChildrenValue(0), mValidUnigramCount(0) {}
|
||||
PtNodeWriter *const ptNodeWriter)
|
||||
: mPtNodeWriter(ptNodeWriter), mValueStack(), mChildrenValue(0),
|
||||
mValidUnigramCount(0) {}
|
||||
|
||||
~TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted() {};
|
||||
|
||||
|
@ -77,10 +72,7 @@ class DynamicPatriciaTrieGcEventListeners {
|
|||
DISALLOW_IMPLICIT_CONSTRUCTORS(
|
||||
TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted);
|
||||
|
||||
const DictionaryHeaderStructurePolicy *const mHeaderPolicy;
|
||||
PtNodeWriter *const mPtNodeWriter;
|
||||
BufferWithExtendableBuffer *const mBuffer;
|
||||
const bool mNeedsToDecayWhenUpdating;
|
||||
std::vector<int> mValueStack;
|
||||
int mChildrenValue;
|
||||
int mValidUnigramCount;
|
||||
|
|
|
@ -73,6 +73,10 @@ class BigramEntry {
|
|||
return mProbability;
|
||||
}
|
||||
|
||||
bool hasHistoricalInfo() const {
|
||||
return mHistoricalInfo.isValid();
|
||||
}
|
||||
|
||||
const HistoricalInfo *getHistoricalInfo() const {
|
||||
return &mHistoricalInfo;
|
||||
}
|
||||
|
|
|
@ -51,6 +51,10 @@ class ProbabilityEntry {
|
|||
return ProbabilityEntry(mFlags, mProbability, historicalInfo);
|
||||
}
|
||||
|
||||
bool hasHistoricalInfo() const {
|
||||
return mHistoricalInfo.isValid();
|
||||
}
|
||||
|
||||
int getFlags() const {
|
||||
return mFlags;
|
||||
}
|
||||
|
|
|
@ -45,9 +45,9 @@ class Ver4DictBuffers {
|
|||
}
|
||||
|
||||
AK_FORCE_INLINE bool isValid() const {
|
||||
return mDictBuffer.get() != 0 && mProbabilityDictContent.isValid()
|
||||
&& mTerminalPositionLookupTable.isValid() && mBigramDictContent.isValid()
|
||||
&& mShortcutDictContent.isValid();
|
||||
return mDictBuffer.get() != 0 && mHeaderPolicy.isValid()
|
||||
&& mProbabilityDictContent.isValid() && mTerminalPositionLookupTable.isValid()
|
||||
&& mBigramDictContent.isValid() && mShortcutDictContent.isValid();
|
||||
}
|
||||
|
||||
AK_FORCE_INLINE bool isNearSizeLimit() const {
|
||||
|
@ -131,9 +131,10 @@ class Ver4DictBuffers {
|
|||
BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
|
||||
// TODO: Quit using header size.
|
||||
mTerminalPositionLookupTable(dictDirPath, isUpdatable, mHeaderPolicy.getSize()),
|
||||
mProbabilityDictContent(dictDirPath, mHeaderPolicy.hasHistricalInfoOfWords(),
|
||||
mProbabilityDictContent(dictDirPath, mHeaderPolicy.hasHistoricalInfoOfWords(),
|
||||
isUpdatable),
|
||||
mBigramDictContent(dictDirPath, mHeaderPolicy.hasHistoricalInfoOfWords(),
|
||||
isUpdatable),
|
||||
mBigramDictContent(dictDirPath, mHeaderPolicy.hasHistricalInfoOfWords(), isUpdatable),
|
||||
mShortcutDictContent(dictDirPath, isUpdatable),
|
||||
mIsUpdatable(isUpdatable) {}
|
||||
|
||||
|
@ -142,8 +143,8 @@ class Ver4DictBuffers {
|
|||
mExpandableHeaderBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE),
|
||||
mExpandableTrieBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE),
|
||||
mTerminalPositionLookupTable(),
|
||||
mProbabilityDictContent(headerPolicy->hasHistricalInfoOfWords()),
|
||||
mBigramDictContent(headerPolicy->hasHistricalInfoOfWords()), mShortcutDictContent(),
|
||||
mProbabilityDictContent(headerPolicy->hasHistoricalInfoOfWords()),
|
||||
mBigramDictContent(headerPolicy->hasHistoricalInfoOfWords()), mShortcutDictContent(),
|
||||
mIsUpdatable(true) {}
|
||||
|
||||
const MmappedBuffer::MmappedBufferPtr mDictBuffer;
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h"
|
||||
#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
|
||||
#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
|
||||
|
||||
namespace latinime {
|
||||
|
||||
|
@ -62,7 +63,12 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce
|
|||
terminalId = Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition(dictBuf, &pos);
|
||||
const ProbabilityEntry probabilityEntry =
|
||||
mProbabilityDictContent->getProbabilityEntry(terminalId);
|
||||
probability = probabilityEntry.getProbability();
|
||||
if (probabilityEntry.hasHistoricalInfo()) {
|
||||
probability = ForgettingCurveUtils::decodeProbability(
|
||||
probabilityEntry.getHistoricalInfo());
|
||||
} else {
|
||||
probability = probabilityEntry.getProbability();
|
||||
}
|
||||
}
|
||||
int childrenPosFieldPos = pos;
|
||||
if (usesAdditionalBuffer) {
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h"
|
||||
|
||||
#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h"
|
||||
#include "suggest/policyimpl/dictionary/header/header_policy.h"
|
||||
#include "suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h"
|
||||
|
@ -147,27 +148,21 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeA
|
|||
AKLOGE("updatePtNodeProbabilityAndGetNeedsToSaveForGC is called for non-terminal PtNode.");
|
||||
return false;
|
||||
}
|
||||
if (mBuffers->getHeaderPolicy()->isDecayingDict()) {
|
||||
const ProbabilityEntry originalProbabilityEntry =
|
||||
mBuffers->getProbabilityDictContent()->getProbabilityEntry(
|
||||
toBeUpdatedPtNodeParams->getTerminalId());
|
||||
// TODO: Remove.
|
||||
const int newProbability = ForgettingCurveUtils::getEncodedProbabilityToSave(
|
||||
originalProbabilityEntry.getProbability(), mBuffers->getHeaderPolicy());
|
||||
const HistoricalInfo historicalInfo =
|
||||
ForgettingCurveUtils::createHistoricalInfoToSave(
|
||||
originalProbabilityEntry.getHistoricalInfo());
|
||||
const ProbabilityEntry originalProbabilityEntry =
|
||||
mBuffers->getProbabilityDictContent()->getProbabilityEntry(
|
||||
toBeUpdatedPtNodeParams->getTerminalId());
|
||||
if (originalProbabilityEntry.hasHistoricalInfo()) {
|
||||
const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave(
|
||||
originalProbabilityEntry.getHistoricalInfo());
|
||||
const ProbabilityEntry probabilityEntry =
|
||||
originalProbabilityEntry.createEntryWithUpdatedProbability(newProbability)
|
||||
.createEntryWithUpdatedHistoricalInfo(&historicalInfo);
|
||||
originalProbabilityEntry.createEntryWithUpdatedHistoricalInfo(&historicalInfo);
|
||||
if (!mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry(
|
||||
toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry)) {
|
||||
AKLOGE("Cannot write updated probability entry. terminalId: %d",
|
||||
toBeUpdatedPtNodeParams->getTerminalId());
|
||||
return false;
|
||||
}
|
||||
// TODO: Use ForgettingCurveUtils::needsToKeep(&historicalInfo).
|
||||
const bool isValid = ForgettingCurveUtils::isValidEncodedProbability(newProbability);
|
||||
const bool isValid = ForgettingCurveUtils::needsToKeep(&historicalInfo);
|
||||
if (!isValid) {
|
||||
if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) {
|
||||
AKLOGE("Cannot mark PtNode as willBecomeNonTerminal.");
|
||||
|
@ -380,14 +375,13 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition(
|
|||
const ProbabilityEntry Ver4PatriciaTrieNodeWriter::createUpdatedEntryFrom(
|
||||
const ProbabilityEntry *const originalProbabilityEntry, const int newProbability,
|
||||
const int timestamp) const {
|
||||
if (mNeedsToDecayWhenUpdating) {
|
||||
const int updatedProbability = ForgettingCurveUtils::getUpdatedEncodedProbability(
|
||||
originalProbabilityEntry->getProbability(), newProbability);
|
||||
// TODO: Consolidate historical info and probability.
|
||||
if (mBuffers->getHeaderPolicy()->hasHistoricalInfoOfWords()) {
|
||||
const HistoricalInfo updatedHistoricalInfo =
|
||||
ForgettingCurveUtils::createUpdatedHistoricalInfo(
|
||||
originalProbabilityEntry->getHistoricalInfo(), newProbability, timestamp);
|
||||
return originalProbabilityEntry->createEntryWithUpdatedProbability(updatedProbability)
|
||||
.createEntryWithUpdatedHistoricalInfo(&updatedHistoricalInfo);
|
||||
return originalProbabilityEntry->createEntryWithUpdatedHistoricalInfo(
|
||||
&updatedHistoricalInfo);
|
||||
} else {
|
||||
return originalProbabilityEntry->createEntryWithUpdatedProbability(newProbability);
|
||||
}
|
||||
|
|
|
@ -40,12 +40,10 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
|
|||
public:
|
||||
Ver4PatriciaTrieNodeWriter(BufferWithExtendableBuffer *const trieBuffer,
|
||||
Ver4DictBuffers *const buffers, const Ver4PatriciaTrieNodeReader *const ptNodeReader,
|
||||
Ver4BigramListPolicy *const bigramPolicy, Ver4ShortcutListPolicy *const shortcutPolicy,
|
||||
const bool needsToDecayWhenUpdating)
|
||||
Ver4BigramListPolicy *const bigramPolicy, Ver4ShortcutListPolicy *const shortcutPolicy)
|
||||
: mTrieBuffer(trieBuffer), mBuffers(buffers), mPtNodeReader(ptNodeReader),
|
||||
mReadingHelper(mTrieBuffer, mPtNodeReader),
|
||||
mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy),
|
||||
mNeedsToDecayWhenUpdating(needsToDecayWhenUpdating) {}
|
||||
mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy) {}
|
||||
|
||||
virtual ~Ver4PatriciaTrieNodeWriter() {}
|
||||
|
||||
|
@ -120,7 +118,6 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
|
|||
DynamicPatriciaTrieReadingHelper mReadingHelper;
|
||||
Ver4BigramListPolicy *const mBigramPolicy;
|
||||
Ver4ShortcutListPolicy *const mShortcutPolicy;
|
||||
const bool mNeedsToDecayWhenUpdating;
|
||||
};
|
||||
} // namespace latinime
|
||||
#endif /* LATINIME_VER4_PATRICIA_TRIE_NODE_WRITER_H */
|
||||
|
|
|
@ -34,8 +34,6 @@ const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
|
|||
const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
|
||||
const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT";
|
||||
const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT";
|
||||
const char *const Ver4PatriciaTriePolicy::SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY =
|
||||
"SET_NEEDS_TO_DECAY_FOR_TESTING";
|
||||
const char *const Ver4PatriciaTriePolicy::SET_CURRENT_TIME_FOR_TESTING_QUERY_FORMAT =
|
||||
"SET_CURRENT_TIME_FOR_TESTING:%d";
|
||||
const char *const Ver4PatriciaTriePolicy::GET_CURRENT_TIME_QUERY = "GET_CURRENT_TIME";
|
||||
|
@ -62,8 +60,7 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d
|
|||
// A DecayingDict may have a terminal PtNode that has a terminal DicNode whose
|
||||
// probability is NOT_A_PROBABILITY. In such case, we don't want to treat it as a
|
||||
// valid terminal DicNode.
|
||||
isTerminal = getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY)
|
||||
!= NOT_A_PROBABILITY;
|
||||
isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY;
|
||||
}
|
||||
childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(),
|
||||
ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal,
|
||||
|
@ -262,11 +259,7 @@ void Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) {
|
|||
AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary.");
|
||||
return;
|
||||
}
|
||||
const bool needsToDecay = mHeaderPolicy->isDecayingDict()
|
||||
&& (mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay(
|
||||
false /* mindsBlockByDecay */, mUnigramCount, mBigramCount, mHeaderPolicy));
|
||||
mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath, needsToDecay);
|
||||
mNeedsToDecayForTesting = false;
|
||||
mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath);
|
||||
}
|
||||
|
||||
bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
|
||||
|
@ -286,8 +279,8 @@ bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
|
|||
// Needs to reduce dictionary size.
|
||||
return true;
|
||||
} else if (mHeaderPolicy->isDecayingDict()) {
|
||||
return mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay(
|
||||
mindsBlockByGC, mUnigramCount, mBigramCount, mHeaderPolicy);
|
||||
return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mUnigramCount, mBigramCount,
|
||||
mHeaderPolicy);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
@ -308,8 +301,6 @@ void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int quer
|
|||
snprintf(outResult, maxResultLength, "%d",
|
||||
mHeaderPolicy->isDecayingDict() ? ForgettingCurveUtils::MAX_BIGRAM_COUNT :
|
||||
static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
|
||||
} else if (strncmp(query, SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY, compareLength) == 0) {
|
||||
mNeedsToDecayForTesting = true;
|
||||
} else if (sscanf(query, SET_CURRENT_TIME_FOR_TESTING_QUERY_FORMAT, ×tamp) == 1) {
|
||||
TimeKeeper::startTestModeWithForceCurrentTime(timestamp);
|
||||
} else if (strncmp(query, GET_CURRENT_TIME_QUERY, compareLength) == 0) {
|
||||
|
|
|
@ -41,17 +41,16 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
|||
: mBuffers(buffers), mHeaderPolicy(mBuffers.get()->getHeaderPolicy()),
|
||||
mDictBuffer(mBuffers.get()->getWritableTrieBuffer()),
|
||||
mBigramPolicy(mBuffers.get()->getMutableBigramDictContent(),
|
||||
mBuffers.get()->getTerminalPositionLookupTable(), mHeaderPolicy,
|
||||
mHeaderPolicy->isDecayingDict()),
|
||||
mBuffers.get()->getTerminalPositionLookupTable(), mHeaderPolicy),
|
||||
mShortcutPolicy(mBuffers.get()->getMutableShortcutDictContent(),
|
||||
mBuffers.get()->getTerminalPositionLookupTable()),
|
||||
mNodeReader(mDictBuffer, mBuffers.get()->getProbabilityDictContent()),
|
||||
mNodeWriter(mDictBuffer, mBuffers.get(), &mNodeReader, &mBigramPolicy,
|
||||
&mShortcutPolicy, mHeaderPolicy->isDecayingDict()),
|
||||
&mShortcutPolicy),
|
||||
mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter),
|
||||
mWritingHelper(mBuffers.get()),
|
||||
mUnigramCount(mHeaderPolicy->getUnigramCount()),
|
||||
mBigramCount(mHeaderPolicy->getBigramCount()), mNeedsToDecayForTesting(false) {};
|
||||
mBigramCount(mHeaderPolicy->getBigramCount()) {};
|
||||
|
||||
AK_FORCE_INLINE int getRootPosition() const {
|
||||
return 0;
|
||||
|
@ -117,7 +116,6 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
|||
static const char *const BIGRAM_COUNT_QUERY;
|
||||
static const char *const MAX_UNIGRAM_COUNT_QUERY;
|
||||
static const char *const MAX_BIGRAM_COUNT_QUERY;
|
||||
static const char *const SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY;
|
||||
static const char *const SET_CURRENT_TIME_FOR_TESTING_QUERY_FORMAT;
|
||||
static const char *const GET_CURRENT_TIME_QUERY;
|
||||
static const char *const QUIT_TIMEKEEPER_TEST_MODE_QUERY;
|
||||
|
@ -137,7 +135,6 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
|||
Ver4PatriciaTrieWritingHelper mWritingHelper;
|
||||
int mUnigramCount;
|
||||
int mBigramCount;
|
||||
bool mNeedsToDecayForTesting;
|
||||
};
|
||||
} // namespace latinime
|
||||
#endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H
|
||||
|
|
|
@ -53,20 +53,20 @@ void Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const trieFilePa
|
|||
}
|
||||
|
||||
void Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos,
|
||||
const char *const trieFilePath, const bool needsToDecay) {
|
||||
const char *const trieFilePath) {
|
||||
const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
|
||||
Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers(
|
||||
Ver4DictBuffers::createVer4DictBuffers(headerPolicy));
|
||||
int unigramCount = 0;
|
||||
int bigramCount = 0;
|
||||
if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &unigramCount, &bigramCount,
|
||||
needsToDecay)) {
|
||||
if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &unigramCount, &bigramCount)) {
|
||||
return;
|
||||
}
|
||||
BufferWithExtendableBuffer headerBuffer(
|
||||
BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
|
||||
if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */,
|
||||
needsToDecay, unigramCount, bigramCount, 0 /* extendedRegionSize */)) {
|
||||
true /* updatesLastDecayedTime */, unigramCount, bigramCount,
|
||||
0 /* extendedRegionSize */)) {
|
||||
return;
|
||||
}
|
||||
const int dirPathBufSize = strlen(trieFilePath) + 1 /* terminator */;
|
||||
|
@ -77,30 +77,29 @@ void Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeAr
|
|||
|
||||
bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
|
||||
const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite,
|
||||
int *const outUnigramCount, int *const outBigramCount, const bool needsToDecay) {
|
||||
int *const outUnigramCount, int *const outBigramCount) {
|
||||
Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(),
|
||||
mBuffers->getProbabilityDictContent());
|
||||
Ver4BigramListPolicy bigramPolicy(mBuffers->getMutableBigramDictContent(),
|
||||
mBuffers->getTerminalPositionLookupTable(), headerPolicy, needsToDecay);
|
||||
mBuffers->getTerminalPositionLookupTable(), headerPolicy);
|
||||
Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(),
|
||||
mBuffers->getTerminalPositionLookupTable());
|
||||
Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(),
|
||||
mBuffers, &ptNodeReader, &bigramPolicy, &shortcutPolicy,
|
||||
false /* needsToDecayWhenUpdating */);
|
||||
mBuffers, &ptNodeReader, &bigramPolicy, &shortcutPolicy);
|
||||
|
||||
DynamicPatriciaTrieReadingHelper readingHelper(mBuffers->getTrieBuffer(), &ptNodeReader);
|
||||
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
||||
DynamicPatriciaTrieGcEventListeners
|
||||
::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
|
||||
traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
|
||||
headerPolicy, &ptNodeWriter, mBuffers->getWritableTrieBuffer(),
|
||||
needsToDecay);
|
||||
&ptNodeWriter);
|
||||
if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
|
||||
&traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) {
|
||||
return false;
|
||||
}
|
||||
if (needsToDecay && traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
|
||||
.getValidUnigramCount() > ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC) {
|
||||
if (headerPolicy->isDecayingDict()
|
||||
&& traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
|
||||
.getValidUnigramCount() > ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC) {
|
||||
// TODO: Remove more unigrams.
|
||||
}
|
||||
|
||||
|
@ -111,8 +110,9 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
|
|||
&traversePolicyToUpdateBigramProbability)) {
|
||||
return false;
|
||||
}
|
||||
if (needsToDecay && traversePolicyToUpdateBigramProbability.getValidBigramEntryCount()
|
||||
> ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC) {
|
||||
if (headerPolicy->isDecayingDict()
|
||||
&& traversePolicyToUpdateBigramProbability.getValidBigramEntryCount()
|
||||
> ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC) {
|
||||
// TODO: Remove more bigrams.
|
||||
}
|
||||
|
||||
|
@ -120,8 +120,7 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
|
|||
PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap;
|
||||
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
||||
Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(),
|
||||
buffersToWrite, &ptNodeReader, &bigramPolicy, &shortcutPolicy,
|
||||
false /* needsToDecayWhenUpdating */);
|
||||
buffersToWrite, &ptNodeReader, &bigramPolicy, &shortcutPolicy);
|
||||
DynamicPatriciaTrieGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
|
||||
traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers,
|
||||
buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap);
|
||||
|
@ -134,13 +133,11 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
|
|||
Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(),
|
||||
buffersToWrite->getProbabilityDictContent());
|
||||
Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getMutableBigramDictContent(),
|
||||
buffersToWrite->getTerminalPositionLookupTable(), headerPolicy,
|
||||
false /* needsToDecay */);
|
||||
buffersToWrite->getTerminalPositionLookupTable(), headerPolicy);
|
||||
Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(),
|
||||
buffersToWrite->getTerminalPositionLookupTable());
|
||||
Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(),
|
||||
buffersToWrite, &newPtNodeReader, &newBigramPolicy, &newShortcutPolicy,
|
||||
false /* needsToDecayWhenUpdating */);
|
||||
buffersToWrite, &newPtNodeReader, &newBigramPolicy, &newShortcutPolicy);
|
||||
// Re-assign terminal IDs for valid terminal PtNodes.
|
||||
TerminalPositionLookupTable::TerminalIdMap terminalIdMap;
|
||||
if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds(
|
||||
|
|
|
@ -36,7 +36,7 @@ class Ver4PatriciaTrieWritingHelper {
|
|||
const int bigramCount) const;
|
||||
|
||||
void writeToDictFileWithGC(const int rootPtNodeArrayPos,
|
||||
const char *const trieFilePath, const bool needsToDecay);
|
||||
const char *const trieFilePath);
|
||||
|
||||
private:
|
||||
DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTrieWritingHelper);
|
||||
|
@ -66,7 +66,7 @@ class Ver4PatriciaTrieWritingHelper {
|
|||
|
||||
bool runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy,
|
||||
Ver4DictBuffers *const buffersToWrite, int *const outUnigramCount,
|
||||
int *const outBigramCount, const bool needsToDecay);
|
||||
int *const outBigramCount);
|
||||
|
||||
Ver4DictBuffers *const mBuffers;
|
||||
};
|
||||
|
|
|
@ -31,12 +31,6 @@ const int ForgettingCurveUtils::MAX_BIGRAM_COUNT = 12000;
|
|||
const int ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC = 10000;
|
||||
|
||||
const int ForgettingCurveUtils::MAX_COMPUTED_PROBABILITY = 127;
|
||||
const int ForgettingCurveUtils::MAX_ENCODED_PROBABILITY = 15;
|
||||
const int ForgettingCurveUtils::MIN_VALID_ENCODED_PROBABILITY = 3;
|
||||
const int ForgettingCurveUtils::ENCODED_PROBABILITY_STEP = 1;
|
||||
// Currently, we try to decay each uni/bigram once every 2 hours. Accordingly, the expected
|
||||
// duration of the decay is approximately 66hours.
|
||||
const float ForgettingCurveUtils::MIN_PROBABILITY_TO_DECAY = 0.03f;
|
||||
const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60;
|
||||
|
||||
const int ForgettingCurveUtils::MAX_LEVEL = 3;
|
||||
|
@ -53,9 +47,9 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
|
|||
const int newProbability, const int timestamp) {
|
||||
if (newProbability != NOT_A_PROBABILITY && originalHistoricalInfo->getLevel() == 0) {
|
||||
return HistoricalInfo(timestamp, MIN_VALID_LEVEL /* level */, 0 /* count */);
|
||||
} else if (originalHistoricalInfo->getTimeStamp() == NOT_A_TIMESTAMP) {
|
||||
} else if (!originalHistoricalInfo->isValid()) {
|
||||
// Initial information.
|
||||
return HistoricalInfo(timestamp, 0 /* level */, 0 /* count */);
|
||||
return HistoricalInfo(timestamp, 0 /* level */, 1 /* count */);
|
||||
} else {
|
||||
const int updatedCount = originalHistoricalInfo->getCount() + 1;
|
||||
if (updatedCount > MAX_COUNT) {
|
||||
|
@ -75,85 +69,46 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
|
|||
}
|
||||
}
|
||||
|
||||
/* static */ int ForgettingCurveUtils::getProbability(const int encodedUnigramProbability,
|
||||
const int encodedBigramProbability) {
|
||||
if (encodedUnigramProbability == NOT_A_PROBABILITY) {
|
||||
/* static */ int ForgettingCurveUtils::decodeProbability(
|
||||
const HistoricalInfo *const historicalInfo) {
|
||||
const int elapsedTimeStepCount = getElapsedTimeStepCount(historicalInfo->getTimeStamp());
|
||||
return sProbabilityTable.getProbability(historicalInfo->getLevel(),
|
||||
min(max(elapsedTimeStepCount, 0), MAX_ELAPSED_TIME_STEP_COUNT));
|
||||
}
|
||||
|
||||
/* static */ int ForgettingCurveUtils::getProbability(const int unigramProbability,
|
||||
const int bigramProbability) {
|
||||
if (unigramProbability == NOT_A_PROBABILITY) {
|
||||
return NOT_A_PROBABILITY;
|
||||
} else if (encodedBigramProbability == NOT_A_PROBABILITY) {
|
||||
return backoff(decodeProbability(encodedUnigramProbability));
|
||||
} else if (bigramProbability == NOT_A_PROBABILITY) {
|
||||
return min(backoff(unigramProbability), MAX_COMPUTED_PROBABILITY);
|
||||
} else {
|
||||
const int unigramProbability = decodeProbability(encodedUnigramProbability);
|
||||
const int bigramProbability = decodeProbability(encodedBigramProbability);
|
||||
return min(max(unigramProbability, bigramProbability), MAX_COMPUTED_PROBABILITY);
|
||||
}
|
||||
}
|
||||
|
||||
// Caveat: Unlike getProbability(), this method doesn't assume special bigram probability encoding
|
||||
// (i.e. unigram probability + bigram probability delta).
|
||||
/* static */ int ForgettingCurveUtils::getUpdatedEncodedProbability(
|
||||
const int originalEncodedProbability, const int newProbability) {
|
||||
if (originalEncodedProbability == NOT_A_PROBABILITY) {
|
||||
// The bigram relation is not in this dictionary.
|
||||
if (newProbability == NOT_A_PROBABILITY) {
|
||||
// The bigram target is not in other dictionaries.
|
||||
return 0;
|
||||
} else {
|
||||
return MIN_VALID_ENCODED_PROBABILITY;
|
||||
}
|
||||
} else {
|
||||
if (newProbability != NOT_A_PROBABILITY
|
||||
&& originalEncodedProbability < MIN_VALID_ENCODED_PROBABILITY) {
|
||||
return MIN_VALID_ENCODED_PROBABILITY;
|
||||
}
|
||||
return min(originalEncodedProbability + ENCODED_PROBABILITY_STEP, MAX_ENCODED_PROBABILITY);
|
||||
}
|
||||
}
|
||||
|
||||
/* static */ int ForgettingCurveUtils::isValidEncodedProbability(const int encodedProbability) {
|
||||
return encodedProbability >= MIN_VALID_ENCODED_PROBABILITY;
|
||||
}
|
||||
|
||||
/* static */ bool ForgettingCurveUtils::needsToKeep(const HistoricalInfo *const historicalInfo) {
|
||||
return historicalInfo->getLevel() > 0
|
||||
|| getElapsedTimeStepCount(historicalInfo->getTimeStamp())
|
||||
< DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD;
|
||||
}
|
||||
|
||||
/* static */ int ForgettingCurveUtils::getEncodedProbabilityToSave(const int encodedProbability,
|
||||
const DictionaryHeaderStructurePolicy *const headerPolicy) {
|
||||
const int elapsedTime = TimeKeeper::peekCurrentTime() - headerPolicy->getLastDecayedTime();
|
||||
const int decayIterationCount = max(elapsedTime / DECAY_INTERVAL_SECONDS, 1);
|
||||
int currentEncodedProbability = max(min(encodedProbability, MAX_ENCODED_PROBABILITY), 0);
|
||||
// TODO: Implement the decay in more proper way.
|
||||
for (int i = 0; i < decayIterationCount; ++i) {
|
||||
const float currentRate = static_cast<float>(currentEncodedProbability)
|
||||
/ static_cast<float>(MAX_ENCODED_PROBABILITY);
|
||||
const float thresholdToDecay = (1.0f - MIN_PROBABILITY_TO_DECAY) * currentRate;
|
||||
const float randValue = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
|
||||
if (thresholdToDecay < randValue) {
|
||||
currentEncodedProbability = max(currentEncodedProbability - ENCODED_PROBABILITY_STEP,
|
||||
0);
|
||||
}
|
||||
}
|
||||
return currentEncodedProbability;
|
||||
}
|
||||
|
||||
/* static */ const HistoricalInfo ForgettingCurveUtils::createHistoricalInfoToSave(
|
||||
const HistoricalInfo *const originalHistoricalInfo) {
|
||||
if (originalHistoricalInfo->getTimeStamp() == NOT_A_TIMESTAMP) {
|
||||
return HistoricalInfo();
|
||||
}
|
||||
const int elapsedTimeStep = getElapsedTimeStepCount(originalHistoricalInfo->getTimeStamp());
|
||||
if (elapsedTimeStep < MAX_ELAPSED_TIME_STEP_COUNT) {
|
||||
if (elapsedTimeStep <= MAX_ELAPSED_TIME_STEP_COUNT) {
|
||||
// No need to update historical info.
|
||||
return *originalHistoricalInfo;
|
||||
}
|
||||
// Level down.
|
||||
const int maxLevelDownAmonut = elapsedTimeStep / MAX_ELAPSED_TIME_STEP_COUNT;
|
||||
const int maxLevelDownAmonut = elapsedTimeStep / (MAX_ELAPSED_TIME_STEP_COUNT + 1);
|
||||
const int levelDownAmount = (maxLevelDownAmonut >= originalHistoricalInfo->getLevel()) ?
|
||||
originalHistoricalInfo->getLevel() : maxLevelDownAmonut;
|
||||
const int adjustedTimestamp = originalHistoricalInfo->getTimeStamp() +
|
||||
levelDownAmount * MAX_ELAPSED_TIME_STEP_COUNT * TIME_STEP_DURATION_IN_SECONDS;
|
||||
levelDownAmount * (MAX_ELAPSED_TIME_STEP_COUNT + 1) * TIME_STEP_DURATION_IN_SECONDS;
|
||||
return HistoricalInfo(adjustedTimestamp,
|
||||
originalHistoricalInfo->getLevel() - levelDownAmount, 0 /* count */);
|
||||
}
|
||||
|
@ -179,14 +134,6 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
|
|||
return false;
|
||||
}
|
||||
|
||||
/* static */ int ForgettingCurveUtils::decodeProbability(const int encodedProbability) {
|
||||
if (encodedProbability < MIN_VALID_ENCODED_PROBABILITY) {
|
||||
return NOT_A_PROBABILITY;
|
||||
} else {
|
||||
return min(sProbabilityTable.getProbability(encodedProbability), MAX_ENCODED_PROBABILITY);
|
||||
}
|
||||
}
|
||||
|
||||
// See comments in ProbabilityUtils::backoff().
|
||||
/* static */ int ForgettingCurveUtils::backoff(const int unigramProbability) {
|
||||
if (unigramProbability == NOT_A_PROBABILITY) {
|
||||
|
@ -201,14 +148,24 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
|
|||
}
|
||||
|
||||
ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTable() {
|
||||
// Table entry is as follows:
|
||||
// 1, 1, 1, 2, 3, 5, 6, 9, 13, 18, 25, 34, 48, 66, 91, 127.
|
||||
// Note that first MIN_VALID_ENCODED_PROBABILITY values are not used.
|
||||
mTable.resize(MAX_ENCODED_PROBABILITY + 1);
|
||||
for (int i = 0; i <= MAX_ENCODED_PROBABILITY; ++i) {
|
||||
const int probability = static_cast<int>(powf(static_cast<float>(MAX_COMPUTED_PROBABILITY),
|
||||
static_cast<float>(i) / static_cast<float>(MAX_ENCODED_PROBABILITY)));
|
||||
mTable[i] = min(MAX_COMPUTED_PROBABILITY, max(0, probability));
|
||||
mTable.resize(MAX_LEVEL + 1);
|
||||
for (int level = 0; level <= MAX_LEVEL; ++level) {
|
||||
mTable[level].resize(MAX_ELAPSED_TIME_STEP_COUNT + 1);
|
||||
const float initialProbability =
|
||||
static_cast<float>(MAX_COMPUTED_PROBABILITY / (1 << (MAX_LEVEL - level)));
|
||||
for (int timeStepCount = 0; timeStepCount <= MAX_ELAPSED_TIME_STEP_COUNT; ++timeStepCount) {
|
||||
if (level == 0) {
|
||||
mTable[level][timeStepCount] = NOT_A_PROBABILITY;
|
||||
continue;
|
||||
}
|
||||
const int elapsedTime = timeStepCount * TIME_STEP_DURATION_IN_SECONDS;
|
||||
const float probability = initialProbability
|
||||
* powf(2.0f, -1.0f * static_cast<float>(elapsedTime)
|
||||
/ static_cast<float>(TIME_STEP_DURATION_IN_SECONDS
|
||||
* (MAX_ELAPSED_TIME_STEP_COUNT + 1)));
|
||||
mTable[level][timeStepCount] =
|
||||
min(max(static_cast<int>(probability), 1), MAX_COMPUTED_PROBABILITY);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -26,8 +26,6 @@ namespace latinime {
|
|||
|
||||
class DictionaryHeaderStructurePolicy;
|
||||
|
||||
// TODO: Check the elapsed time and decrease the probability depending on the time. Time field is
|
||||
// required to introduced to each terminal PtNode and bigram entry.
|
||||
// TODO: Quit using bigram probability to indicate the delta.
|
||||
class ForgettingCurveUtils {
|
||||
public:
|
||||
|
@ -43,22 +41,13 @@ class ForgettingCurveUtils {
|
|||
static const HistoricalInfo createHistoricalInfoToSave(
|
||||
const HistoricalInfo *const originalHistoricalInfo);
|
||||
|
||||
static int decodeProbability(const HistoricalInfo *const historicalInfo);
|
||||
|
||||
static int getProbability(const int encodedUnigramProbability,
|
||||
const int encodedBigramProbability);
|
||||
|
||||
// TODO: Remove.
|
||||
static int getUpdatedEncodedProbability(const int originalEncodedProbability,
|
||||
const int newProbability);
|
||||
|
||||
// TODO: Remove.
|
||||
static int isValidEncodedProbability(const int encodedProbability);
|
||||
|
||||
static bool needsToKeep(const HistoricalInfo *const historicalInfo);
|
||||
|
||||
// TODO: Remove.
|
||||
static int getEncodedProbabilityToSave(const int encodedProbability,
|
||||
const DictionaryHeaderStructurePolicy *const headerPolicy);
|
||||
|
||||
static bool needsToDecay(const bool mindsBlockByDecay, const int unigramCount,
|
||||
const int bigramCount, const DictionaryHeaderStructurePolicy *const headerPolicy);
|
||||
|
||||
|
@ -69,24 +58,17 @@ class ForgettingCurveUtils {
|
|||
public:
|
||||
ProbabilityTable();
|
||||
|
||||
int getProbability(const int encodedProbability) const {
|
||||
if (encodedProbability < 0 || encodedProbability > static_cast<int>(mTable.size())) {
|
||||
return NOT_A_PROBABILITY;
|
||||
}
|
||||
return mTable[encodedProbability];
|
||||
int getProbability(const int level, const int elapsedTimeStepCount) const {
|
||||
return mTable[level][elapsedTimeStepCount];
|
||||
}
|
||||
|
||||
private:
|
||||
DISALLOW_COPY_AND_ASSIGN(ProbabilityTable);
|
||||
|
||||
std::vector<int> mTable;
|
||||
std::vector<std::vector<int> > mTable;
|
||||
};
|
||||
|
||||
static const int MAX_COMPUTED_PROBABILITY;
|
||||
static const int MAX_ENCODED_PROBABILITY;
|
||||
static const int MIN_VALID_ENCODED_PROBABILITY;
|
||||
static const int ENCODED_PROBABILITY_STEP;
|
||||
static const float MIN_PROBABILITY_TO_DECAY;
|
||||
static const int DECAY_INTERVAL_SECONDS;
|
||||
|
||||
static const int MAX_LEVEL;
|
||||
|
@ -95,11 +77,10 @@ class ForgettingCurveUtils {
|
|||
static const int TIME_STEP_DURATION_IN_SECONDS;
|
||||
static const int MAX_ELAPSED_TIME_STEP_COUNT;
|
||||
static const int DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD;
|
||||
static const int HALF_LIFE_TIME_IN_SECONDS;
|
||||
|
||||
static const ProbabilityTable sProbabilityTable;
|
||||
|
||||
static int decodeProbability(const int encodedProbability);
|
||||
|
||||
static int backoff(const int unigramProbability);
|
||||
|
||||
static int getElapsedTimeStepCount(const int timestamp);
|
||||
|
|
|
@ -30,6 +30,10 @@ class HistoricalInfo {
|
|||
HistoricalInfo(const int timestamp, const int level, const int count)
|
||||
: mTimestamp(timestamp), mLevel(level), mCount(count) {}
|
||||
|
||||
bool isValid() const {
|
||||
return mTimestamp != NOT_A_TIMESTAMP;
|
||||
}
|
||||
|
||||
int getTimeStamp() const {
|
||||
return mTimestamp;
|
||||
}
|
||||
|
|
|
@ -30,6 +30,7 @@ import java.util.HashMap;
|
|||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Random;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
@LargeTest
|
||||
public class BinaryDictionaryDecayingTests extends AndroidTestCase {
|
||||
|
@ -38,8 +39,6 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
|
|||
|
||||
// Note that these are corresponding definitions in native code in
|
||||
// latinime::Ver4PatriciaTriePolicy.
|
||||
private static final String SET_NEEDS_TO_DECAY_FOR_TESTING_KEY =
|
||||
"SET_NEEDS_TO_DECAY_FOR_TESTING";
|
||||
private static final String SET_CURRENT_TIME_FOR_TESTING_QUERY =
|
||||
"SET_CURRENT_TIME_FOR_TESTING";
|
||||
private static final String GET_CURRENT_TIME_QUERY = "GET_CURRENT_TIME";
|
||||
|
@ -47,14 +46,28 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
|
|||
|
||||
private static final int DUMMY_PROBABILITY = 0;
|
||||
|
||||
private int mCurrentTime = 0;
|
||||
|
||||
@Override
|
||||
protected void setUp() throws Exception {
|
||||
super.setUp();
|
||||
mCurrentTime = 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void tearDown() throws Exception {
|
||||
super.tearDown();
|
||||
try {
|
||||
final File dictFile =
|
||||
createEmptyDictionaryAndGetFile("TestBinaryDictionary", FormatSpec.VERSION4);
|
||||
final BinaryDictionary binaryDictionary =
|
||||
new BinaryDictionary(dictFile.getAbsolutePath(), 0 /* offset */,
|
||||
dictFile.length(), true /* useFullEditDistance */, Locale.getDefault(),
|
||||
TEST_LOCALE, true /* isUpdatable */);
|
||||
binaryDictionary.getPropertyForTests(QUIT_TIMEKEEPER_TEST_MODE_QUERY);
|
||||
} catch (IOException e) {
|
||||
fail("IOException while writing an initial dictionary : " + e);
|
||||
}
|
||||
}
|
||||
|
||||
private void addUnigramWord(final BinaryDictionary binaryDictionary, final String word,
|
||||
|
@ -62,32 +75,29 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
|
|||
binaryDictionary.addUnigramWord(word, probability, "" /* shortcutTarget */,
|
||||
BinaryDictionary.NOT_A_PROBABILITY /* shortcutProbability */,
|
||||
false /* isNotAWord */, false /* isBlacklisted */,
|
||||
BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
|
||||
mCurrentTime /* timestamp */);
|
||||
}
|
||||
|
||||
private void addBigramWords(final BinaryDictionary binaryDictionary, final String word0,
|
||||
final String word1, final int probability) {
|
||||
binaryDictionary.addBigramWords(word0, word1, probability,
|
||||
BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
|
||||
mCurrentTime /* timestamp */);
|
||||
}
|
||||
|
||||
private void forcePassingShortTime(final BinaryDictionary binaryDictionary) {
|
||||
// Entries having low probability would be suppressed once in 3 GCs.
|
||||
final int count = 3;
|
||||
for (int i = 0; i < count; i++) {
|
||||
binaryDictionary.getPropertyForTests(SET_NEEDS_TO_DECAY_FOR_TESTING_KEY);
|
||||
binaryDictionary.flushWithGC();
|
||||
}
|
||||
// 4 days.
|
||||
final int timeToElapse = (int)TimeUnit.SECONDS.convert(4, TimeUnit.DAYS);
|
||||
mCurrentTime += timeToElapse;
|
||||
setCurrentTime(binaryDictionary, mCurrentTime);
|
||||
binaryDictionary.flushWithGC();
|
||||
}
|
||||
|
||||
private void forcePassingLongTime(final BinaryDictionary binaryDictionary) {
|
||||
// Currently, probabilities are decayed when GC is run. All entries that have never been
|
||||
// typed in 128 GCs would be removed.
|
||||
final int count = 128;
|
||||
for (int i = 0; i < count; i++) {
|
||||
binaryDictionary.getPropertyForTests(SET_NEEDS_TO_DECAY_FOR_TESTING_KEY);
|
||||
binaryDictionary.flushWithGC();
|
||||
}
|
||||
// 60 days.
|
||||
final int timeToElapse = (int)TimeUnit.SECONDS.convert(60, TimeUnit.DAYS);
|
||||
mCurrentTime += timeToElapse;
|
||||
setCurrentTime(binaryDictionary, mCurrentTime);
|
||||
binaryDictionary.flushWithGC();
|
||||
}
|
||||
|
||||
private File createEmptyDictionaryAndGetFile(final String dictId,
|
||||
|
@ -147,6 +157,7 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
|
|||
BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
|
||||
0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
|
||||
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
|
||||
binaryDictionary.getPropertyForTests(QUIT_TIMEKEEPER_TEST_MODE_QUERY);
|
||||
final int startTime = getCurrentTime(binaryDictionary);
|
||||
for (int i = 0; i < TEST_COUNT; i++) {
|
||||
final int currentTime = random.nextInt(Integer.MAX_VALUE);
|
||||
|
@ -233,6 +244,8 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
|
|||
addUnigramWord(binaryDictionary, "a", DUMMY_PROBABILITY);
|
||||
addUnigramWord(binaryDictionary, "a", DUMMY_PROBABILITY);
|
||||
addUnigramWord(binaryDictionary, "a", DUMMY_PROBABILITY);
|
||||
addUnigramWord(binaryDictionary, "a", DUMMY_PROBABILITY);
|
||||
assertTrue(binaryDictionary.isValidWord("a"));
|
||||
forcePassingShortTime(binaryDictionary);
|
||||
assertTrue(binaryDictionary.isValidWord("a"));
|
||||
forcePassingLongTime(binaryDictionary);
|
||||
|
@ -257,6 +270,9 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
|
|||
addUnigramWord(binaryDictionary, "a", DUMMY_PROBABILITY);
|
||||
addUnigramWord(binaryDictionary, "b", DUMMY_PROBABILITY);
|
||||
addBigramWords(binaryDictionary, "a", "b", DUMMY_PROBABILITY);
|
||||
addUnigramWord(binaryDictionary, "a", DUMMY_PROBABILITY);
|
||||
addUnigramWord(binaryDictionary, "b", DUMMY_PROBABILITY);
|
||||
addBigramWords(binaryDictionary, "a", "b", DUMMY_PROBABILITY);
|
||||
assertTrue(binaryDictionary.isValidBigram("a", "b"));
|
||||
forcePassingShortTime(binaryDictionary);
|
||||
assertTrue(binaryDictionary.isValidBigram("a", "b"));
|
||||
|
@ -307,8 +323,7 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
|
|||
Integer.parseInt(binaryDictionary.getPropertyForTests(
|
||||
BinaryDictionary.UNIGRAM_COUNT_QUERY));
|
||||
while (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
|
||||
binaryDictionary.getPropertyForTests(SET_NEEDS_TO_DECAY_FOR_TESTING_KEY);
|
||||
binaryDictionary.flushWithGC();
|
||||
forcePassingShortTime(binaryDictionary);
|
||||
}
|
||||
final int unigramCountAfterGC =
|
||||
Integer.parseInt(binaryDictionary.getPropertyForTests(
|
||||
|
@ -321,6 +336,10 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
|
|||
BinaryDictionary.UNIGRAM_COUNT_QUERY)) > 0);
|
||||
assertTrue(Integer.parseInt(binaryDictionary.getPropertyForTests(
|
||||
BinaryDictionary.UNIGRAM_COUNT_QUERY)) <= maxUnigramCount);
|
||||
forcePassingLongTime(binaryDictionary);
|
||||
assertEquals(0, Integer.parseInt(binaryDictionary.getPropertyForTests(
|
||||
BinaryDictionary.UNIGRAM_COUNT_QUERY)));
|
||||
|
||||
}
|
||||
|
||||
public void testAddManyBigramsToDecayingDict() {
|
||||
|
@ -378,8 +397,7 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
|
|||
Integer.parseInt(binaryDictionary.getPropertyForTests(
|
||||
BinaryDictionary.BIGRAM_COUNT_QUERY));
|
||||
while (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
|
||||
binaryDictionary.getPropertyForTests(SET_NEEDS_TO_DECAY_FOR_TESTING_KEY);
|
||||
binaryDictionary.flushWithGC();
|
||||
forcePassingShortTime(binaryDictionary);
|
||||
}
|
||||
final int bigramCountAfterGC =
|
||||
Integer.parseInt(binaryDictionary.getPropertyForTests(
|
||||
|
@ -392,5 +410,8 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
|
|||
BinaryDictionary.BIGRAM_COUNT_QUERY)) > 0);
|
||||
assertTrue(Integer.parseInt(binaryDictionary.getPropertyForTests(
|
||||
BinaryDictionary.BIGRAM_COUNT_QUERY)) <= maxBigramCount);
|
||||
forcePassingLongTime(binaryDictionary);
|
||||
assertEquals(0, Integer.parseInt(binaryDictionary.getPropertyForTests(
|
||||
BinaryDictionary.BIGRAM_COUNT_QUERY)));
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue