am 2fdb62a6: Start using historical info for decaying.

* commit '2fdb62a6a28d769435fe5d48d8b997c3e9a083d5':
  Start using historical info for decaying.

commit 35dff6f9f2
@@ -163,7 +163,8 @@ int BigramDictionary::getBigramProbability(const int *word0, int length0, const
             mDictionaryStructurePolicy->getBigramsStructurePolicy(), pos);
     while (bigramsIt.hasNext()) {
         bigramsIt.next();
-        if (bigramsIt.getBigramPos() == nextWordPos) {
+        if (bigramsIt.getBigramPos() == nextWordPos
+                && bigramsIt.getProbability() != NOT_A_PROBABILITY) {
             return mDictionaryStructurePolicy->getProbability(
                     mDictionaryStructurePolicy->getUnigramProbabilityOfPtNode(nextWordPos),
                     bigramsIt.getProbability());
@@ -17,6 +17,7 @@
 #include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h"
 
 #include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h"
+#include "suggest/policyimpl/dictionary/header/header_policy.h"
 #include "suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h"
 #include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h"
 #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"

@@ -34,8 +35,13 @@ void Ver4BigramListPolicy::getNextBigram(int *const outBigramPos, int *const out
                 bigramEntry.getTargetTerminalId());
     }
     if (outProbability) {
+        if (bigramEntry.hasHistoricalInfo()) {
+            *outProbability =
+                    ForgettingCurveUtils::decodeProbability(bigramEntry.getHistoricalInfo());
+        } else {
             *outProbability = bigramEntry.getProbability();
+        }
     }
     if (outHasNext) {
         *outHasNext = bigramEntry.hasNext();
     }

@@ -152,17 +158,12 @@ bool Ver4BigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(const i
             if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) {
                 return false;
             }
-        } else if (mNeedsToDecayWhenUpdating) {
-            const int probability = ForgettingCurveUtils::getEncodedProbabilityToSave(
-                    bigramEntry.getProbability(), mHeaderPolicy);
-            const HistoricalInfo historicalInfo =
-                    ForgettingCurveUtils::createHistoricalInfoToSave(
+        } else if (bigramEntry.hasHistoricalInfo()) {
+            const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave(
                     bigramEntry.getHistoricalInfo());
-            // TODO: Use ForgettingCurveUtils::needsToKeep(&historicalInfo).
-            if (ForgettingCurveUtils::isValidEncodedProbability(probability)) {
+            if (ForgettingCurveUtils::needsToKeep(&historicalInfo)) {
                 const BigramEntry updatedBigramEntry =
-                        bigramEntry.updateProbabilityAndGetEntry(probability)
-                                .updateHistoricalInfoAndGetEntry(&historicalInfo);
+                        bigramEntry.updateHistoricalInfoAndGetEntry(&historicalInfo);
                 if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) {
                     return false;
                 }

@@ -225,14 +226,12 @@ int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind,
 const BigramEntry Ver4BigramListPolicy::createUpdatedBigramEntryFrom(
         const BigramEntry *const originalBigramEntry, const int newProbability,
         const int timestamp) const {
-    if (mNeedsToDecayWhenUpdating) {
-        const int probability = ForgettingCurveUtils::getUpdatedEncodedProbability(
-                originalBigramEntry->getProbability(), newProbability);
+    // TODO: Consolidate historical info and probability.
+    if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
         const HistoricalInfo updatedHistoricalInfo =
                 ForgettingCurveUtils::createUpdatedHistoricalInfo(
                         originalBigramEntry->getHistoricalInfo(), newProbability, timestamp);
-        return originalBigramEntry->updateProbabilityAndGetEntry(probability)
-                .updateHistoricalInfoAndGetEntry(&updatedHistoricalInfo);
+        return originalBigramEntry->updateHistoricalInfoAndGetEntry(&updatedHistoricalInfo);
     } else {
         return originalBigramEntry->updateProbabilityAndGetEntry(newProbability);
     }
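Note: with this change a decaying bigram entry no longer stores a pre-stepped encoded probability; it carries a HistoricalInfo (timestamp, level, count) and the effective probability is derived at read time. A minimal sketch of the read path wired up above — the templated helper and the Entry name are illustrative only, not code from this commit:

    // Mirrors the branch added in Ver4BigramListPolicy::getNextBigram().
    // "Entry" stands in for BigramEntry or ProbabilityEntry.
    template <typename Entry>
    static int readEffectiveProbability(const Entry &entry) {
        if (entry.hasHistoricalInfo()) {
            // Decaying dictionary: derive the probability from (level, timestamp).
            return ForgettingCurveUtils::decodeProbability(entry.getHistoricalInfo());
        }
        // Non-decaying dictionary: use the stored probability as-is.
        return entry.getProbability();
    }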
@@ -24,18 +24,17 @@
 namespace latinime {
 
 class BigramDictContent;
-class DictionaryHeaderStructurePolicy;
+class HeaderPolicy;
 class TerminalPositionLookupTable;
 
 class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy {
  public:
     Ver4BigramListPolicy(BigramDictContent *const bigramDictContent,
             const TerminalPositionLookupTable *const terminalPositionLookupTable,
-            const DictionaryHeaderStructurePolicy *const headerPolicy,
-            const bool needsToDecayWhenUpdating)
+            const HeaderPolicy *const headerPolicy)
             : mBigramDictContent(bigramDictContent),
               mTerminalPositionLookupTable(terminalPositionLookupTable),
-              mHeaderPolicy(headerPolicy), mNeedsToDecayWhenUpdating(needsToDecayWhenUpdating) {}
+              mHeaderPolicy(headerPolicy) {}
 
     void getNextBigram(int *const outBigramPos, int *const outProbability,
             bool *const outHasNext, int *const bigramEntryPos) const;

@@ -64,8 +63,7 @@ class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy {
 
     BigramDictContent *const mBigramDictContent;
     const TerminalPositionLookupTable *const mTerminalPositionLookupTable;
-    const DictionaryHeaderStructurePolicy *const mHeaderPolicy;
-    const bool mNeedsToDecayWhenUpdating;
+    const HeaderPolicy *const mHeaderPolicy;
 };
 } // namespace latinime
 #endif /* LATINIME_VER4_BIGRAM_LIST_POLICY_H */
@@ -93,6 +93,18 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
         }
     }
 
+    AK_FORCE_INLINE bool isValid() const {
+        // Decaying dictionary must have historical information.
+        if (!mIsDecayingDict) {
+            return true;
+        }
+        if (mHasHistoricalInfoOfWords) {
+            return true;
+        } else {
+            return false;
+        }
+    }
+
     AK_FORCE_INLINE int getSize() const {
         return mSize;
     }

@@ -137,7 +149,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
         return mExtendedRegionSize;
     }
 
-    AK_FORCE_INLINE bool hasHistricalInfoOfWords() const {
+    AK_FORCE_INLINE bool hasHistoricalInfoOfWords() const {
         return mHasHistoricalInfoOfWords;
     }
 
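Note: the isValid() check added to HeaderPolicy above collapses to a single expression — a decaying dictionary is valid only when it actually stores historical info, and any non-decaying dictionary is valid. An equivalent sketch using the same members:

    AK_FORCE_INLINE bool isValid() const {
        // A decaying dictionary must have historical information.
        return !mIsDecayingDict || mHasHistoricalInfoOfWords;
    }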
@@ -20,7 +20,6 @@
 #include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h"
 #include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h"
 #include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_writing_utils.h"
-#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
 
 namespace latinime {
 

@@ -30,8 +29,7 @@ bool DynamicPatriciaTrieGcEventListeners
     // PtNode is useless when the PtNode is not a terminal and doesn't have any not useless
     // children.
     bool isUselessPtNode = !ptNodeParams->isTerminal();
-    // TODO: Quit checking mNeedsToDecayWhenUpdating.
-    if (ptNodeParams->isTerminal() && mNeedsToDecayWhenUpdating) {
+    if (ptNodeParams->isTerminal()) {
         bool needsToKeepPtNode = true;
         if (!mPtNodeWriter->updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC(ptNodeParams,
                 &needsToKeepPtNode)) {
@@ -27,8 +27,6 @@
 
 namespace latinime {
 
-class DictionaryHeaderStructurePolicy;
-class PtNodeWriter;
 class PtNodeParams;
 
 // TODO: Move to pt_common.

@@ -41,12 +39,9 @@ class DynamicPatriciaTrieGcEventListeners {
             : public DynamicPatriciaTrieReadingHelper::TraversingEventListener {
      public:
         TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
-                const DictionaryHeaderStructurePolicy *const headerPolicy,
-                PtNodeWriter *const ptNodeWriter, BufferWithExtendableBuffer *const buffer,
-                const bool needsToDecayWhenUpdating)
-                : mHeaderPolicy(headerPolicy), mPtNodeWriter(ptNodeWriter), mBuffer(buffer),
-                  mNeedsToDecayWhenUpdating(needsToDecayWhenUpdating), mValueStack(),
-                  mChildrenValue(0), mValidUnigramCount(0) {}
+                PtNodeWriter *const ptNodeWriter)
+                : mPtNodeWriter(ptNodeWriter), mValueStack(), mChildrenValue(0),
+                  mValidUnigramCount(0) {}
 
         ~TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted() {};
 

@@ -77,10 +72,7 @@ class DynamicPatriciaTrieGcEventListeners {
         DISALLOW_IMPLICIT_CONSTRUCTORS(
                 TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted);
 
-        const DictionaryHeaderStructurePolicy *const mHeaderPolicy;
         PtNodeWriter *const mPtNodeWriter;
-        BufferWithExtendableBuffer *const mBuffer;
-        const bool mNeedsToDecayWhenUpdating;
         std::vector<int> mValueStack;
         int mChildrenValue;
         int mValidUnigramCount;
@@ -73,6 +73,10 @@ class BigramEntry {
         return mProbability;
     }
 
+    bool hasHistoricalInfo() const {
+        return mHistoricalInfo.isValid();
+    }
+
     const HistoricalInfo *getHistoricalInfo() const {
         return &mHistoricalInfo;
     }
@@ -51,6 +51,10 @@ class ProbabilityEntry {
         return ProbabilityEntry(mFlags, mProbability, historicalInfo);
     }
 
+    bool hasHistoricalInfo() const {
+        return mHistoricalInfo.isValid();
+    }
+
     int getFlags() const {
         return mFlags;
     }
@@ -45,9 +45,9 @@ class Ver4DictBuffers {
     }
 
     AK_FORCE_INLINE bool isValid() const {
-        return mDictBuffer.get() != 0 && mProbabilityDictContent.isValid()
-                && mTerminalPositionLookupTable.isValid() && mBigramDictContent.isValid()
-                && mShortcutDictContent.isValid();
+        return mDictBuffer.get() != 0 && mHeaderPolicy.isValid()
+                && mProbabilityDictContent.isValid() && mTerminalPositionLookupTable.isValid()
+                && mBigramDictContent.isValid() && mShortcutDictContent.isValid();
     }
 
     AK_FORCE_INLINE bool isNearSizeLimit() const {

@@ -131,9 +131,10 @@ class Ver4DictBuffers {
                       BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
               // TODO: Quit using header size.
               mTerminalPositionLookupTable(dictDirPath, isUpdatable, mHeaderPolicy.getSize()),
-              mProbabilityDictContent(dictDirPath, mHeaderPolicy.hasHistricalInfoOfWords(),
+              mProbabilityDictContent(dictDirPath, mHeaderPolicy.hasHistoricalInfoOfWords(),
+                      isUpdatable),
+              mBigramDictContent(dictDirPath, mHeaderPolicy.hasHistoricalInfoOfWords(),
                       isUpdatable),
-              mBigramDictContent(dictDirPath, mHeaderPolicy.hasHistricalInfoOfWords(), isUpdatable),
               mShortcutDictContent(dictDirPath, isUpdatable),
               mIsUpdatable(isUpdatable) {}
 

@@ -142,8 +143,8 @@ class Ver4DictBuffers {
               mExpandableHeaderBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE),
               mExpandableTrieBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE),
               mTerminalPositionLookupTable(),
-              mProbabilityDictContent(headerPolicy->hasHistricalInfoOfWords()),
-              mBigramDictContent(headerPolicy->hasHistricalInfoOfWords()), mShortcutDictContent(),
+              mProbabilityDictContent(headerPolicy->hasHistoricalInfoOfWords()),
+              mBigramDictContent(headerPolicy->hasHistoricalInfoOfWords()), mShortcutDictContent(),
               mIsUpdatable(true) {}
 
     const MmappedBuffer::MmappedBufferPtr mDictBuffer;
@@ -22,6 +22,7 @@
 #include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h"
 #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h"
 #include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
 
 namespace latinime {
 

@@ -62,8 +63,13 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce
         terminalId = Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition(dictBuf, &pos);
         const ProbabilityEntry probabilityEntry =
                 mProbabilityDictContent->getProbabilityEntry(terminalId);
+        if (probabilityEntry.hasHistoricalInfo()) {
+            probability = ForgettingCurveUtils::decodeProbability(
+                    probabilityEntry.getHistoricalInfo());
+        } else {
             probability = probabilityEntry.getProbability();
         }
+    }
     int childrenPosFieldPos = pos;
     if (usesAdditionalBuffer) {
         childrenPosFieldPos += mBuffer->getOriginalBufferSize();
@@ -17,6 +17,7 @@
 #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h"
 
 #include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h"
+#include "suggest/policyimpl/dictionary/header/header_policy.h"
 #include "suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h"
 #include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.h"
 #include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h"

@@ -147,27 +148,21 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeA
         AKLOGE("updatePtNodeProbabilityAndGetNeedsToSaveForGC is called for non-terminal PtNode.");
         return false;
     }
-    if (mBuffers->getHeaderPolicy()->isDecayingDict()) {
     const ProbabilityEntry originalProbabilityEntry =
             mBuffers->getProbabilityDictContent()->getProbabilityEntry(
                     toBeUpdatedPtNodeParams->getTerminalId());
-        // TODO: Remove.
-        const int newProbability = ForgettingCurveUtils::getEncodedProbabilityToSave(
-                originalProbabilityEntry.getProbability(), mBuffers->getHeaderPolicy());
-        const HistoricalInfo historicalInfo =
-                ForgettingCurveUtils::createHistoricalInfoToSave(
+    if (originalProbabilityEntry.hasHistoricalInfo()) {
+        const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave(
                         originalProbabilityEntry.getHistoricalInfo());
         const ProbabilityEntry probabilityEntry =
-                originalProbabilityEntry.createEntryWithUpdatedProbability(newProbability)
-                        .createEntryWithUpdatedHistoricalInfo(&historicalInfo);
+                originalProbabilityEntry.createEntryWithUpdatedHistoricalInfo(&historicalInfo);
         if (!mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry(
                 toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry)) {
             AKLOGE("Cannot write updated probability entry. terminalId: %d",
                     toBeUpdatedPtNodeParams->getTerminalId());
             return false;
         }
-        // TODO: Use ForgettingCurveUtils::needsToKeep(&historicalInfo).
-        const bool isValid = ForgettingCurveUtils::isValidEncodedProbability(newProbability);
+        const bool isValid = ForgettingCurveUtils::needsToKeep(&historicalInfo);
         if (!isValid) {
             if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) {
                 AKLOGE("Cannot mark PtNode as willBecomeNonTerminal.");

@@ -380,14 +375,13 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition(
 const ProbabilityEntry Ver4PatriciaTrieNodeWriter::createUpdatedEntryFrom(
         const ProbabilityEntry *const originalProbabilityEntry, const int newProbability,
         const int timestamp) const {
-    if (mNeedsToDecayWhenUpdating) {
-        const int updatedProbability = ForgettingCurveUtils::getUpdatedEncodedProbability(
-                originalProbabilityEntry->getProbability(), newProbability);
+    // TODO: Consolidate historical info and probability.
+    if (mBuffers->getHeaderPolicy()->hasHistoricalInfoOfWords()) {
         const HistoricalInfo updatedHistoricalInfo =
                 ForgettingCurveUtils::createUpdatedHistoricalInfo(
                         originalProbabilityEntry->getHistoricalInfo(), newProbability, timestamp);
-        return originalProbabilityEntry->createEntryWithUpdatedProbability(updatedProbability)
-                .createEntryWithUpdatedHistoricalInfo(&updatedHistoricalInfo);
+        return originalProbabilityEntry->createEntryWithUpdatedHistoricalInfo(
+                &updatedHistoricalInfo);
     } else {
         return originalProbabilityEntry->createEntryWithUpdatedProbability(newProbability);
     }
@@ -40,12 +40,10 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
  public:
     Ver4PatriciaTrieNodeWriter(BufferWithExtendableBuffer *const trieBuffer,
             Ver4DictBuffers *const buffers, const Ver4PatriciaTrieNodeReader *const ptNodeReader,
-            Ver4BigramListPolicy *const bigramPolicy, Ver4ShortcutListPolicy *const shortcutPolicy,
-            const bool needsToDecayWhenUpdating)
+            Ver4BigramListPolicy *const bigramPolicy, Ver4ShortcutListPolicy *const shortcutPolicy)
             : mTrieBuffer(trieBuffer), mBuffers(buffers), mPtNodeReader(ptNodeReader),
               mReadingHelper(mTrieBuffer, mPtNodeReader),
-              mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy),
-              mNeedsToDecayWhenUpdating(needsToDecayWhenUpdating) {}
+              mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy) {}
 
     virtual ~Ver4PatriciaTrieNodeWriter() {}
 

@@ -120,7 +118,6 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
     DynamicPatriciaTrieReadingHelper mReadingHelper;
     Ver4BigramListPolicy *const mBigramPolicy;
     Ver4ShortcutListPolicy *const mShortcutPolicy;
-    const bool mNeedsToDecayWhenUpdating;
 };
 } // namespace latinime
 #endif /* LATINIME_VER4_PATRICIA_TRIE_NODE_WRITER_H */
@@ -34,8 +34,6 @@ const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
 const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
 const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT";
 const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT";
-const char *const Ver4PatriciaTriePolicy::SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY =
-        "SET_NEEDS_TO_DECAY_FOR_TESTING";
 const char *const Ver4PatriciaTriePolicy::SET_CURRENT_TIME_FOR_TESTING_QUERY_FORMAT =
         "SET_CURRENT_TIME_FOR_TESTING:%d";
 const char *const Ver4PatriciaTriePolicy::GET_CURRENT_TIME_QUERY = "GET_CURRENT_TIME";

@@ -62,8 +60,7 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d
             // A DecayingDict may have a terminal PtNode that has a terminal DicNode whose
             // probability is NOT_A_PROBABILITY. In such case, we don't want to treat it as a
             // valid terminal DicNode.
-            isTerminal = getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY)
-                    != NOT_A_PROBABILITY;
+            isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY;
         }
         childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(),
                 ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal,

@@ -262,11 +259,7 @@ void Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) {
         AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary.");
         return;
     }
-    const bool needsToDecay = mHeaderPolicy->isDecayingDict()
-            && (mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay(
-                    false /* mindsBlockByDecay */, mUnigramCount, mBigramCount, mHeaderPolicy));
-    mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath, needsToDecay);
-    mNeedsToDecayForTesting = false;
+    mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath);
 }
 
 bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {

@@ -286,8 +279,8 @@ bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
         // Needs to reduce dictionary size.
         return true;
     } else if (mHeaderPolicy->isDecayingDict()) {
-        return mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay(
-                mindsBlockByGC, mUnigramCount, mBigramCount, mHeaderPolicy);
+        return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mUnigramCount, mBigramCount,
+                mHeaderPolicy);
     }
     return false;
 }

@@ -308,8 +301,6 @@ void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int quer
         snprintf(outResult, maxResultLength, "%d",
                 mHeaderPolicy->isDecayingDict() ? ForgettingCurveUtils::MAX_BIGRAM_COUNT :
                         static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
-    } else if (strncmp(query, SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY, compareLength) == 0) {
-        mNeedsToDecayForTesting = true;
     } else if (sscanf(query, SET_CURRENT_TIME_FOR_TESTING_QUERY_FORMAT, &timestamp) == 1) {
         TimeKeeper::startTestModeWithForceCurrentTime(timestamp);
     } else if (strncmp(query, GET_CURRENT_TIME_QUERY, compareLength) == 0) {
@@ -41,17 +41,16 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
             : mBuffers(buffers), mHeaderPolicy(mBuffers.get()->getHeaderPolicy()),
               mDictBuffer(mBuffers.get()->getWritableTrieBuffer()),
               mBigramPolicy(mBuffers.get()->getMutableBigramDictContent(),
-                      mBuffers.get()->getTerminalPositionLookupTable(), mHeaderPolicy,
-                      mHeaderPolicy->isDecayingDict()),
+                      mBuffers.get()->getTerminalPositionLookupTable(), mHeaderPolicy),
               mShortcutPolicy(mBuffers.get()->getMutableShortcutDictContent(),
                       mBuffers.get()->getTerminalPositionLookupTable()),
               mNodeReader(mDictBuffer, mBuffers.get()->getProbabilityDictContent()),
               mNodeWriter(mDictBuffer, mBuffers.get(), &mNodeReader, &mBigramPolicy,
-                      &mShortcutPolicy, mHeaderPolicy->isDecayingDict()),
+                      &mShortcutPolicy),
               mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter),
               mWritingHelper(mBuffers.get()),
               mUnigramCount(mHeaderPolicy->getUnigramCount()),
-              mBigramCount(mHeaderPolicy->getBigramCount()), mNeedsToDecayForTesting(false) {};
+              mBigramCount(mHeaderPolicy->getBigramCount()) {};
 
     AK_FORCE_INLINE int getRootPosition() const {
         return 0;

@@ -117,7 +116,6 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
     static const char *const BIGRAM_COUNT_QUERY;
     static const char *const MAX_UNIGRAM_COUNT_QUERY;
     static const char *const MAX_BIGRAM_COUNT_QUERY;
-    static const char *const SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY;
     static const char *const SET_CURRENT_TIME_FOR_TESTING_QUERY_FORMAT;
     static const char *const GET_CURRENT_TIME_QUERY;
     static const char *const QUIT_TIMEKEEPER_TEST_MODE_QUERY;

@@ -137,7 +135,6 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
     Ver4PatriciaTrieWritingHelper mWritingHelper;
     int mUnigramCount;
    int mBigramCount;
-    bool mNeedsToDecayForTesting;
 };
 } // namespace latinime
 #endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H
@@ -53,20 +53,20 @@ void Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const trieFilePa
 }
 
 void Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos,
-        const char *const trieFilePath, const bool needsToDecay) {
+        const char *const trieFilePath) {
     const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
     Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers(
             Ver4DictBuffers::createVer4DictBuffers(headerPolicy));
     int unigramCount = 0;
     int bigramCount = 0;
-    if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &unigramCount, &bigramCount,
-            needsToDecay)) {
+    if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &unigramCount, &bigramCount)) {
         return;
     }
     BufferWithExtendableBuffer headerBuffer(
             BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
     if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */,
-            needsToDecay, unigramCount, bigramCount, 0 /* extendedRegionSize */)) {
+            true /* updatesLastDecayedTime */, unigramCount, bigramCount,
+            0 /* extendedRegionSize */)) {
         return;
     }
     const int dirPathBufSize = strlen(trieFilePath) + 1 /* terminator */;

@@ -77,29 +77,28 @@ void Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeAr
 
 bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
         const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite,
-        int *const outUnigramCount, int *const outBigramCount, const bool needsToDecay) {
+        int *const outUnigramCount, int *const outBigramCount) {
     Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(),
             mBuffers->getProbabilityDictContent());
     Ver4BigramListPolicy bigramPolicy(mBuffers->getMutableBigramDictContent(),
-            mBuffers->getTerminalPositionLookupTable(), headerPolicy, needsToDecay);
+            mBuffers->getTerminalPositionLookupTable(), headerPolicy);
     Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(),
             mBuffers->getTerminalPositionLookupTable());
     Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(),
-            mBuffers, &ptNodeReader, &bigramPolicy, &shortcutPolicy,
-            false /* needsToDecayWhenUpdating */);
+            mBuffers, &ptNodeReader, &bigramPolicy, &shortcutPolicy);
 
     DynamicPatriciaTrieReadingHelper readingHelper(mBuffers->getTrieBuffer(), &ptNodeReader);
     readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
     DynamicPatriciaTrieGcEventListeners
             ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
                     traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
-                            headerPolicy, &ptNodeWriter, mBuffers->getWritableTrieBuffer(),
-                            needsToDecay);
+                            &ptNodeWriter);
     if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
             &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) {
         return false;
     }
-    if (needsToDecay && traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
+    if (headerPolicy->isDecayingDict()
+            && traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
             .getValidUnigramCount() > ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC) {
         // TODO: Remove more unigrams.
     }

@@ -111,7 +110,8 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
             &traversePolicyToUpdateBigramProbability)) {
         return false;
     }
-    if (needsToDecay && traversePolicyToUpdateBigramProbability.getValidBigramEntryCount()
+    if (headerPolicy->isDecayingDict()
+            && traversePolicyToUpdateBigramProbability.getValidBigramEntryCount()
             > ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC) {
         // TODO: Remove more bigrams.
     }

@@ -120,8 +120,7 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
     PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap;
     readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
     Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(),
-            buffersToWrite, &ptNodeReader, &bigramPolicy, &shortcutPolicy,
-            false /* needsToDecayWhenUpdating */);
+            buffersToWrite, &ptNodeReader, &bigramPolicy, &shortcutPolicy);
     DynamicPatriciaTrieGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
             traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers,
                     buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap);

@@ -134,13 +133,11 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
     Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(),
             buffersToWrite->getProbabilityDictContent());
     Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getMutableBigramDictContent(),
-            buffersToWrite->getTerminalPositionLookupTable(), headerPolicy,
-            false /* needsToDecay */);
+            buffersToWrite->getTerminalPositionLookupTable(), headerPolicy);
     Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(),
             buffersToWrite->getTerminalPositionLookupTable());
     Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(),
-            buffersToWrite, &newPtNodeReader, &newBigramPolicy, &newShortcutPolicy,
-            false /* needsToDecayWhenUpdating */);
+            buffersToWrite, &newPtNodeReader, &newBigramPolicy, &newShortcutPolicy);
     // Re-assign terminal IDs for valid terminal PtNodes.
     TerminalPositionLookupTable::TerminalIdMap terminalIdMap;
     if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds(
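Note: runGC() no longer takes a needsToDecay flag; whether decaying applies is read from the header (headerPolicy->isDecayingDict()), and each terminal entry is aged and then kept or dropped through ForgettingCurveUtils. A compact sketch of that per-entry decision as the node writer performs it above — the wrapper function itself is illustrative, not part of the patch:

    // Age the entry, then decide whether GC should keep it.
    static bool keepEntryDuringGC(const HistoricalInfo *const info) {
        // Level drops by one for every full window of (MAX_ELAPSED_TIME_STEP_COUNT + 1)
        // elapsed time steps (see createHistoricalInfoToSave below).
        const HistoricalInfo aged = ForgettingCurveUtils::createHistoricalInfoToSave(info);
        // Keep while the level is still positive, or while a level-0 entry is younger than
        // DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD time steps.
        return ForgettingCurveUtils::needsToKeep(&aged);
    }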
@@ -36,7 +36,7 @@ class Ver4PatriciaTrieWritingHelper {
             const int bigramCount) const;
 
     void writeToDictFileWithGC(const int rootPtNodeArrayPos,
-            const char *const trieFilePath, const bool needsToDecay);
+            const char *const trieFilePath);
 
  private:
     DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTrieWritingHelper);

@@ -66,7 +66,7 @@ class Ver4PatriciaTrieWritingHelper {
 
     bool runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy,
             Ver4DictBuffers *const buffersToWrite, int *const outUnigramCount,
-            int *const outBigramCount, const bool needsToDecay);
+            int *const outBigramCount);
 
     Ver4DictBuffers *const mBuffers;
 };
@@ -31,12 +31,6 @@ const int ForgettingCurveUtils::MAX_BIGRAM_COUNT = 12000;
 const int ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC = 10000;
 
 const int ForgettingCurveUtils::MAX_COMPUTED_PROBABILITY = 127;
-const int ForgettingCurveUtils::MAX_ENCODED_PROBABILITY = 15;
-const int ForgettingCurveUtils::MIN_VALID_ENCODED_PROBABILITY = 3;
-const int ForgettingCurveUtils::ENCODED_PROBABILITY_STEP = 1;
-// Currently, we try to decay each uni/bigram once every 2 hours. Accordingly, the expected
-// duration of the decay is approximately 66hours.
-const float ForgettingCurveUtils::MIN_PROBABILITY_TO_DECAY = 0.03f;
 
 const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60;
 
 const int ForgettingCurveUtils::MAX_LEVEL = 3;

@@ -53,9 +47,9 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
         const int newProbability, const int timestamp) {
     if (newProbability != NOT_A_PROBABILITY && originalHistoricalInfo->getLevel() == 0) {
         return HistoricalInfo(timestamp, MIN_VALID_LEVEL /* level */, 0 /* count */);
-    } else if (originalHistoricalInfo->getTimeStamp() == NOT_A_TIMESTAMP) {
+    } else if (!originalHistoricalInfo->isValid()) {
         // Initial information.
-        return HistoricalInfo(timestamp, 0 /* level */, 0 /* count */);
+        return HistoricalInfo(timestamp, 0 /* level */, 1 /* count */);
     } else {
         const int updatedCount = originalHistoricalInfo->getCount() + 1;
         if (updatedCount > MAX_COUNT) {

@@ -75,85 +69,46 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
     }
 }
 
-/* static */ int ForgettingCurveUtils::getProbability(const int encodedUnigramProbability,
-        const int encodedBigramProbability) {
-    if (encodedUnigramProbability == NOT_A_PROBABILITY) {
+/* static */ int ForgettingCurveUtils::decodeProbability(
+        const HistoricalInfo *const historicalInfo) {
+    const int elapsedTimeStepCount = getElapsedTimeStepCount(historicalInfo->getTimeStamp());
+    return sProbabilityTable.getProbability(historicalInfo->getLevel(),
+            min(max(elapsedTimeStepCount, 0), MAX_ELAPSED_TIME_STEP_COUNT));
+}
+
+/* static */ int ForgettingCurveUtils::getProbability(const int unigramProbability,
+        const int bigramProbability) {
+    if (unigramProbability == NOT_A_PROBABILITY) {
         return NOT_A_PROBABILITY;
-    } else if (encodedBigramProbability == NOT_A_PROBABILITY) {
-        return backoff(decodeProbability(encodedUnigramProbability));
+    } else if (bigramProbability == NOT_A_PROBABILITY) {
+        return min(backoff(unigramProbability), MAX_COMPUTED_PROBABILITY);
     } else {
-        const int unigramProbability = decodeProbability(encodedUnigramProbability);
-        const int bigramProbability = decodeProbability(encodedBigramProbability);
         return min(max(unigramProbability, bigramProbability), MAX_COMPUTED_PROBABILITY);
     }
 }
 
-// Caveat: Unlike getProbability(), this method doesn't assume special bigram probability encoding
-// (i.e. unigram probability + bigram probability delta).
-/* static */ int ForgettingCurveUtils::getUpdatedEncodedProbability(
-        const int originalEncodedProbability, const int newProbability) {
-    if (originalEncodedProbability == NOT_A_PROBABILITY) {
-        // The bigram relation is not in this dictionary.
-        if (newProbability == NOT_A_PROBABILITY) {
-            // The bigram target is not in other dictionaries.
-            return 0;
-        } else {
-            return MIN_VALID_ENCODED_PROBABILITY;
-        }
-    } else {
-        if (newProbability != NOT_A_PROBABILITY
-                && originalEncodedProbability < MIN_VALID_ENCODED_PROBABILITY) {
-            return MIN_VALID_ENCODED_PROBABILITY;
-        }
-        return min(originalEncodedProbability + ENCODED_PROBABILITY_STEP, MAX_ENCODED_PROBABILITY);
-    }
-}
-
-/* static */ int ForgettingCurveUtils::isValidEncodedProbability(const int encodedProbability) {
-    return encodedProbability >= MIN_VALID_ENCODED_PROBABILITY;
-}
-
 /* static */ bool ForgettingCurveUtils::needsToKeep(const HistoricalInfo *const historicalInfo) {
     return historicalInfo->getLevel() > 0
             || getElapsedTimeStepCount(historicalInfo->getTimeStamp())
                     < DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD;
 }
 
-/* static */ int ForgettingCurveUtils::getEncodedProbabilityToSave(const int encodedProbability,
-        const DictionaryHeaderStructurePolicy *const headerPolicy) {
-    const int elapsedTime = TimeKeeper::peekCurrentTime() - headerPolicy->getLastDecayedTime();
-    const int decayIterationCount = max(elapsedTime / DECAY_INTERVAL_SECONDS, 1);
-    int currentEncodedProbability = max(min(encodedProbability, MAX_ENCODED_PROBABILITY), 0);
-    // TODO: Implement the decay in more proper way.
-    for (int i = 0; i < decayIterationCount; ++i) {
-        const float currentRate = static_cast<float>(currentEncodedProbability)
-                / static_cast<float>(MAX_ENCODED_PROBABILITY);
-        const float thresholdToDecay = (1.0f - MIN_PROBABILITY_TO_DECAY) * currentRate;
-        const float randValue = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
-        if (thresholdToDecay < randValue) {
-            currentEncodedProbability = max(currentEncodedProbability - ENCODED_PROBABILITY_STEP,
-                    0);
-        }
-    }
-    return currentEncodedProbability;
-}
-
 /* static */ const HistoricalInfo ForgettingCurveUtils::createHistoricalInfoToSave(
         const HistoricalInfo *const originalHistoricalInfo) {
     if (originalHistoricalInfo->getTimeStamp() == NOT_A_TIMESTAMP) {
         return HistoricalInfo();
     }
     const int elapsedTimeStep = getElapsedTimeStepCount(originalHistoricalInfo->getTimeStamp());
-    if (elapsedTimeStep < MAX_ELAPSED_TIME_STEP_COUNT) {
+    if (elapsedTimeStep <= MAX_ELAPSED_TIME_STEP_COUNT) {
         // No need to update historical info.
         return *originalHistoricalInfo;
     }
     // Level down.
-    const int maxLevelDownAmonut = elapsedTimeStep / MAX_ELAPSED_TIME_STEP_COUNT;
+    const int maxLevelDownAmonut = elapsedTimeStep / (MAX_ELAPSED_TIME_STEP_COUNT + 1);
     const int levelDownAmount = (maxLevelDownAmonut >= originalHistoricalInfo->getLevel()) ?
             originalHistoricalInfo->getLevel() : maxLevelDownAmonut;
     const int adjustedTimestamp = originalHistoricalInfo->getTimeStamp() +
-            levelDownAmount * MAX_ELAPSED_TIME_STEP_COUNT * TIME_STEP_DURATION_IN_SECONDS;
+            levelDownAmount * (MAX_ELAPSED_TIME_STEP_COUNT + 1) * TIME_STEP_DURATION_IN_SECONDS;
     return HistoricalInfo(adjustedTimestamp,
             originalHistoricalInfo->getLevel() - levelDownAmount, 0 /* count */);
 }

@@ -179,14 +134,6 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
     return false;
 }
 
-/* static */ int ForgettingCurveUtils::decodeProbability(const int encodedProbability) {
-    if (encodedProbability < MIN_VALID_ENCODED_PROBABILITY) {
-        return NOT_A_PROBABILITY;
-    } else {
-        return min(sProbabilityTable.getProbability(encodedProbability), MAX_ENCODED_PROBABILITY);
-    }
-}
-
 // See comments in ProbabilityUtils::backoff().
 /* static */ int ForgettingCurveUtils::backoff(const int unigramProbability) {
     if (unigramProbability == NOT_A_PROBABILITY) {

@@ -201,14 +148,24 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
 }
 
 ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTable() {
-    // Table entry is as follows:
-    // 1, 1, 1, 2, 3, 5, 6, 9, 13, 18, 25, 34, 48, 66, 91, 127.
-    // Note that first MIN_VALID_ENCODED_PROBABILITY values are not used.
-    mTable.resize(MAX_ENCODED_PROBABILITY + 1);
-    for (int i = 0; i <= MAX_ENCODED_PROBABILITY; ++i) {
-        const int probability = static_cast<int>(powf(static_cast<float>(MAX_COMPUTED_PROBABILITY),
-                static_cast<float>(i) / static_cast<float>(MAX_ENCODED_PROBABILITY)));
-        mTable[i] = min(MAX_COMPUTED_PROBABILITY, max(0, probability));
+    mTable.resize(MAX_LEVEL + 1);
+    for (int level = 0; level <= MAX_LEVEL; ++level) {
+        mTable[level].resize(MAX_ELAPSED_TIME_STEP_COUNT + 1);
+        const float initialProbability =
+                static_cast<float>(MAX_COMPUTED_PROBABILITY / (1 << (MAX_LEVEL - level)));
+        for (int timeStepCount = 0; timeStepCount <= MAX_ELAPSED_TIME_STEP_COUNT; ++timeStepCount) {
+            if (level == 0) {
+                mTable[level][timeStepCount] = NOT_A_PROBABILITY;
+                continue;
+            }
+            const int elapsedTime = timeStepCount * TIME_STEP_DURATION_IN_SECONDS;
+            const float probability = initialProbability
+                    * powf(2.0f, -1.0f * static_cast<float>(elapsedTime)
+                            / static_cast<float>(TIME_STEP_DURATION_IN_SECONDS
                                    * (MAX_ELAPSED_TIME_STEP_COUNT + 1)));
+            mTable[level][timeStepCount] =
+                    min(max(static_cast<int>(probability), 1), MAX_COMPUTED_PROBABILITY);
+        }
     }
 }
 
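Note: the rebuilt ProbabilityTable is indexed by (level, elapsed time step). Reading the constructor above, a level-L entry starts near MAX_COMPUTED_PROBABILITY / 2^(MAX_LEVEL - L) and halves once per full window of (MAX_ELAPSED_TIME_STEP_COUNT + 1) time steps, while level 0 always maps to NOT_A_PROBABILITY. A standalone sketch of the same formula; MAX_LEVEL = 3 and MAX_COMPUTED_PROBABILITY = 127 come from the hunks above, but the time-step count used here is an assumed placeholder:

    #include <algorithm>
    #include <cmath>

    // Probability for (level, elapsedTimeStepCount), same shape as the new table.
    int decayedProbability(const int level, const int elapsedTimeStepCount) {
        const int kMaxLevel = 3;
        const int kMaxComputedProbability = 127;
        const int kMaxElapsedTimeStepCount = 15;  // assumption, for illustration only
        if (level == 0) {
            return -1;  // stands in for NOT_A_PROBABILITY
        }
        const float initial =
                static_cast<float>(kMaxComputedProbability / (1 << (kMaxLevel - level)));
        const float decayed = initial * std::pow(2.0f,
                -static_cast<float>(elapsedTimeStepCount)
                        / static_cast<float>(kMaxElapsedTimeStepCount + 1));
        return std::min(std::max(static_cast<int>(decayed), 1), kMaxComputedProbability);
    }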
@ -26,8 +26,6 @@ namespace latinime {
|
||||||
|
|
||||||
class DictionaryHeaderStructurePolicy;
|
class DictionaryHeaderStructurePolicy;
|
||||||
|
|
||||||
// TODO: Check the elapsed time and decrease the probability depending on the time. Time field is
|
|
||||||
// required to introduced to each terminal PtNode and bigram entry.
|
|
||||||
// TODO: Quit using bigram probability to indicate the delta.
|
// TODO: Quit using bigram probability to indicate the delta.
|
||||||
class ForgettingCurveUtils {
|
class ForgettingCurveUtils {
|
||||||
public:
|
public:
|
||||||
|
@@ -43,22 +41,13 @@ class ForgettingCurveUtils {
     static const HistoricalInfo createHistoricalInfoToSave(
             const HistoricalInfo *const originalHistoricalInfo);
 
+    static int decodeProbability(const HistoricalInfo *const historicalInfo);
+
     static int getProbability(const int encodedUnigramProbability,
             const int encodedBigramProbability);
 
-    // TODO: Remove.
-    static int getUpdatedEncodedProbability(const int originalEncodedProbability,
-            const int newProbability);
-
-    // TODO: Remove.
-    static int isValidEncodedProbability(const int encodedProbability);
-
     static bool needsToKeep(const HistoricalInfo *const historicalInfo);
 
-    // TODO: Remove.
-    static int getEncodedProbabilityToSave(const int encodedProbability,
-            const DictionaryHeaderStructurePolicy *const headerPolicy);
-
     static bool needsToDecay(const bool mindsBlockByDecay, const int unigramCount,
             const int bigramCount, const DictionaryHeaderStructurePolicy *const headerPolicy);
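
The header hunk above swaps the old encoded-probability helpers for a decodeProbability() overload that takes a HistoricalInfo, alongside the existing getElapsedTimeStepCount() and a now two-dimensional sProbabilityTable. The body of the new decodeProbability() is not part of this excerpt, so the following is only a plausible sketch of how such a lookup could be assembled; the namespace, function name, and the clamping policy are assumptions made for illustration, not the commit's actual implementation.

    #include <algorithm>
    #include <vector>

    namespace forgetting_curve_sketch {  // hypothetical, not part of the commit

    // Sketch: turn (level, elapsed time steps) into a probability via a
    // precomputed table shaped like the new sProbabilityTable (rows = levels,
    // columns = elapsed time steps). The clamp keeps very old timestamps
    // inside the table bounds.
    int decodeProbabilitySketch(const int level, const int elapsedTimeStepCount,
            const int maxElapsedTimeStepCount,
            const std::vector<std::vector<int> > &table) {
        const int clampedStepCount =
                std::min(std::max(elapsedTimeStepCount, 0), maxElapsedTimeStepCount);
        // In the real table, level-0 rows hold NOT_A_PROBABILITY, so entries
        // that have decayed to the bottom level stop producing a usable score.
        return table[level][clampedStepCount];
    }

    }  // namespace forgetting_curve_sketch

A caller would presumably derive the column index from the stored timestamp via getElapsedTimeStepCount(historicalInfo->getTimeStamp()) and pass the entry's level as the row index.
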
@@ -69,24 +58,17 @@ class ForgettingCurveUtils {
      public:
         ProbabilityTable();
 
-        int getProbability(const int encodedProbability) const {
-            if (encodedProbability < 0 || encodedProbability > static_cast<int>(mTable.size())) {
-                return NOT_A_PROBABILITY;
-            }
-            return mTable[encodedProbability];
+        int getProbability(const int level, const int elapsedTimeStepCount) const {
+            return mTable[level][elapsedTimeStepCount];
         }
 
      private:
         DISALLOW_COPY_AND_ASSIGN(ProbabilityTable);
 
-        std::vector<int> mTable;
+        std::vector<std::vector<int> > mTable;
     };
 
     static const int MAX_COMPUTED_PROBABILITY;
-    static const int MAX_ENCODED_PROBABILITY;
-    static const int MIN_VALID_ENCODED_PROBABILITY;
-    static const int ENCODED_PROBABILITY_STEP;
-    static const float MIN_PROBABILITY_TO_DECAY;
     static const int DECAY_INTERVAL_SECONDS;
 
     static const int MAX_LEVEL;
@@ -95,11 +77,10 @@ class ForgettingCurveUtils {
     static const int TIME_STEP_DURATION_IN_SECONDS;
     static const int MAX_ELAPSED_TIME_STEP_COUNT;
     static const int DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD;
+    static const int HALF_LIFE_TIME_IN_SECONDS;
 
     static const ProbabilityTable sProbabilityTable;
 
-    static int decodeProbability(const int encodedProbability);
-
     static int backoff(const int unigramProbability);
 
     static int getElapsedTimeStepCount(const int timestamp);
@@ -30,6 +30,10 @@ class HistoricalInfo {
     HistoricalInfo(const int timestamp, const int level, const int count)
             : mTimestamp(timestamp), mLevel(level), mCount(count) {}
 
+    bool isValid() const {
+        return mTimestamp != NOT_A_TIMESTAMP;
+    }
+
     int getTimeStamp() const {
         return mTimestamp;
     }
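
The added isValid() accessor above makes the timestamp the marker for whether an entry carries usable historical information. Below is a minimal standalone sketch of that contract; the concrete NOT_A_TIMESTAMP value, the struct name, and the example values are assumptions made only for this illustration.

    #include <cassert>

    namespace historical_info_sketch {  // illustration only

    const int NOT_A_TIMESTAMP = -1;  // assumed sentinel; the real constant is defined elsewhere

    struct HistoricalInfoSketch {
        int mTimestamp;
        int mLevel;
        int mCount;
        // Mirrors the added accessor: an entry is "valid" exactly when it was
        // written with a real timestamp.
        bool isValid() const { return mTimestamp != NOT_A_TIMESTAMP; }
    };

    }  // namespace historical_info_sketch

    int main() {
        using historical_info_sketch::HistoricalInfoSketch;
        using historical_info_sketch::NOT_A_TIMESTAMP;
        const HistoricalInfoSketch untimed = { NOT_A_TIMESTAMP, 0 /* level */, 0 /* count */ };
        const HistoricalInfoSketch timed = { 1000 /* timestamp */, 3 /* level */, 1 /* count */ };
        assert(!untimed.isValid());  // no timing data recorded
        assert(timed.isValid());     // carries a timestamp, so it can be decayed over time
        return 0;
    }
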
@@ -30,6 +30,7 @@ import java.util.HashMap;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Random;
+import java.util.concurrent.TimeUnit;
 
 @LargeTest
 public class BinaryDictionaryDecayingTests extends AndroidTestCase {
@@ -38,8 +39,6 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
 
     // Note that these are corresponding definitions in native code in
     // latinime::Ver4PatriciaTriePolicy.
-    private static final String SET_NEEDS_TO_DECAY_FOR_TESTING_KEY =
-            "SET_NEEDS_TO_DECAY_FOR_TESTING";
     private static final String SET_CURRENT_TIME_FOR_TESTING_QUERY =
             "SET_CURRENT_TIME_FOR_TESTING";
     private static final String GET_CURRENT_TIME_QUERY = "GET_CURRENT_TIME";
@@ -47,14 +46,28 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
 
     private static final int DUMMY_PROBABILITY = 0;
 
+    private int mCurrentTime = 0;
+
     @Override
     protected void setUp() throws Exception {
         super.setUp();
+        mCurrentTime = 0;
     }
 
     @Override
     protected void tearDown() throws Exception {
         super.tearDown();
+        try {
+            final File dictFile =
+                    createEmptyDictionaryAndGetFile("TestBinaryDictionary", FormatSpec.VERSION4);
+            final BinaryDictionary binaryDictionary =
+                    new BinaryDictionary(dictFile.getAbsolutePath(), 0 /* offset */,
+                            dictFile.length(), true /* useFullEditDistance */, Locale.getDefault(),
+                            TEST_LOCALE, true /* isUpdatable */);
+            binaryDictionary.getPropertyForTests(QUIT_TIMEKEEPER_TEST_MODE_QUERY);
+        } catch (IOException e) {
+            fail("IOException while writing an initial dictionary : " + e);
+        }
     }
 
     private void addUnigramWord(final BinaryDictionary binaryDictionary, final String word,
@@ -62,33 +75,30 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
         binaryDictionary.addUnigramWord(word, probability, "" /* shortcutTarget */,
                 BinaryDictionary.NOT_A_PROBABILITY /* shortcutProbability */,
                 false /* isNotAWord */, false /* isBlacklisted */,
-                BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
+                mCurrentTime /* timestamp */);
     }
 
     private void addBigramWords(final BinaryDictionary binaryDictionary, final String word0,
             final String word1, final int probability) {
         binaryDictionary.addBigramWords(word0, word1, probability,
-                BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
+                mCurrentTime /* timestamp */);
     }
 
     private void forcePassingShortTime(final BinaryDictionary binaryDictionary) {
-        // Entries having low probability would be suppressed once in 3 GCs.
-        final int count = 3;
-        for (int i = 0; i < count; i++) {
-            binaryDictionary.getPropertyForTests(SET_NEEDS_TO_DECAY_FOR_TESTING_KEY);
+        // 4 days.
+        final int timeToElapse = (int)TimeUnit.SECONDS.convert(4, TimeUnit.DAYS);
+        mCurrentTime += timeToElapse;
+        setCurrentTime(binaryDictionary, mCurrentTime);
         binaryDictionary.flushWithGC();
-        }
     }
 
     private void forcePassingLongTime(final BinaryDictionary binaryDictionary) {
-        // Currently, probabilities are decayed when GC is run. All entries that have never been
-        // typed in 128 GCs would be removed.
-        final int count = 128;
-        for (int i = 0; i < count; i++) {
-            binaryDictionary.getPropertyForTests(SET_NEEDS_TO_DECAY_FOR_TESTING_KEY);
+        // 60 days.
+        final int timeToElapse = (int)TimeUnit.SECONDS.convert(60, TimeUnit.DAYS);
+        mCurrentTime += timeToElapse;
+        setCurrentTime(binaryDictionary, mCurrentTime);
         binaryDictionary.flushWithGC();
-        }
     }
 
     private File createEmptyDictionaryAndGetFile(final String dictId,
             final int formatVersion) throws IOException {
@@ -147,6 +157,7 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
         BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
                 0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
                 Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
+        binaryDictionary.getPropertyForTests(QUIT_TIMEKEEPER_TEST_MODE_QUERY);
         final int startTime = getCurrentTime(binaryDictionary);
         for (int i = 0; i < TEST_COUNT; i++) {
             final int currentTime = random.nextInt(Integer.MAX_VALUE);
@@ -233,6 +244,8 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
         addUnigramWord(binaryDictionary, "a", DUMMY_PROBABILITY);
         addUnigramWord(binaryDictionary, "a", DUMMY_PROBABILITY);
         addUnigramWord(binaryDictionary, "a", DUMMY_PROBABILITY);
+        addUnigramWord(binaryDictionary, "a", DUMMY_PROBABILITY);
+        assertTrue(binaryDictionary.isValidWord("a"));
         forcePassingShortTime(binaryDictionary);
         assertTrue(binaryDictionary.isValidWord("a"));
         forcePassingLongTime(binaryDictionary);
@@ -257,6 +270,9 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
         addUnigramWord(binaryDictionary, "a", DUMMY_PROBABILITY);
         addUnigramWord(binaryDictionary, "b", DUMMY_PROBABILITY);
         addBigramWords(binaryDictionary, "a", "b", DUMMY_PROBABILITY);
+        addUnigramWord(binaryDictionary, "a", DUMMY_PROBABILITY);
+        addUnigramWord(binaryDictionary, "b", DUMMY_PROBABILITY);
+        addBigramWords(binaryDictionary, "a", "b", DUMMY_PROBABILITY);
         assertTrue(binaryDictionary.isValidBigram("a", "b"));
         forcePassingShortTime(binaryDictionary);
         assertTrue(binaryDictionary.isValidBigram("a", "b"));
@@ -307,8 +323,7 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
                 Integer.parseInt(binaryDictionary.getPropertyForTests(
                         BinaryDictionary.UNIGRAM_COUNT_QUERY));
         while (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
-            binaryDictionary.getPropertyForTests(SET_NEEDS_TO_DECAY_FOR_TESTING_KEY);
-            binaryDictionary.flushWithGC();
+            forcePassingShortTime(binaryDictionary);
         }
         final int unigramCountAfterGC =
                 Integer.parseInt(binaryDictionary.getPropertyForTests(
@@ -321,6 +336,10 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
                 BinaryDictionary.UNIGRAM_COUNT_QUERY)) > 0);
         assertTrue(Integer.parseInt(binaryDictionary.getPropertyForTests(
                 BinaryDictionary.UNIGRAM_COUNT_QUERY)) <= maxUnigramCount);
+
+        forcePassingLongTime(binaryDictionary);
+        assertEquals(0, Integer.parseInt(binaryDictionary.getPropertyForTests(
+                BinaryDictionary.UNIGRAM_COUNT_QUERY)));
     }
 
     public void testAddManyBigramsToDecayingDict() {
@@ -378,8 +397,7 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
                 Integer.parseInt(binaryDictionary.getPropertyForTests(
                         BinaryDictionary.BIGRAM_COUNT_QUERY));
         while (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
-            binaryDictionary.getPropertyForTests(SET_NEEDS_TO_DECAY_FOR_TESTING_KEY);
-            binaryDictionary.flushWithGC();
+            forcePassingShortTime(binaryDictionary);
         }
         final int bigramCountAfterGC =
                 Integer.parseInt(binaryDictionary.getPropertyForTests(
@@ -392,5 +410,8 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
                 BinaryDictionary.BIGRAM_COUNT_QUERY)) > 0);
         assertTrue(Integer.parseInt(binaryDictionary.getPropertyForTests(
                 BinaryDictionary.BIGRAM_COUNT_QUERY)) <= maxBigramCount);
+        forcePassingLongTime(binaryDictionary);
+        assertEquals(0, Integer.parseInt(binaryDictionary.getPropertyForTests(
+                BinaryDictionary.BIGRAM_COUNT_QUERY)));
     }
 }