From 2fdb62a6a28d769435fe5d48d8b997c3e9a083d5 Mon Sep 17 00:00:00 2001 From: Keisuke Kuroyanagi Date: Tue, 10 Dec 2013 17:53:06 +0900 Subject: [PATCH] Start using historical info for decaying. Bug: 11073222 Change-Id: Iad5f42d7268939df35589c4900bf6f4425adaa3f --- .../core/dictionary/bigram_dictionary.cpp | 3 +- .../bigram/ver4_bigram_list_policy.cpp | 31 +++-- .../bigram/ver4_bigram_list_policy.h | 10 +- .../dictionary/header/header_policy.h | 14 ++- ...namic_patricia_trie_gc_event_listeners.cpp | 4 +- ...dynamic_patricia_trie_gc_event_listeners.h | 14 +-- .../structure/v4/content/bigram_entry.h | 4 + .../structure/v4/content/probability_entry.h | 4 + .../structure/v4/ver4_dict_buffers.h | 15 +-- .../v4/ver4_patricia_trie_node_reader.cpp | 8 +- .../v4/ver4_patricia_trie_node_writer.cpp | 32 ++--- .../v4/ver4_patricia_trie_node_writer.h | 7 +- .../v4/ver4_patricia_trie_policy.cpp | 17 +-- .../structure/v4/ver4_patricia_trie_policy.h | 9 +- .../v4/ver4_patricia_trie_writing_helper.cpp | 37 +++--- .../v4/ver4_patricia_trie_writing_helper.h | 4 +- .../utils/forgetting_curve_utils.cpp | 113 ++++++------------ .../dictionary/utils/forgetting_curve_utils.h | 31 +---- .../dictionary/utils/historical_info.h | 4 + .../latin/BinaryDictionaryDecayingTests.java | 63 ++++++---- 20 files changed, 189 insertions(+), 235 deletions(-) diff --git a/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp b/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp index c2a15a312..2a62b555b 100644 --- a/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp @@ -163,7 +163,8 @@ int BigramDictionary::getBigramProbability(const int *word0, int length0, const mDictionaryStructurePolicy->getBigramsStructurePolicy(), pos); while (bigramsIt.hasNext()) { bigramsIt.next(); - if (bigramsIt.getBigramPos() == nextWordPos) { + if (bigramsIt.getBigramPos() == nextWordPos + && bigramsIt.getProbability() != NOT_A_PROBABILITY) { return mDictionaryStructurePolicy->getProbability( mDictionaryStructurePolicy->getUnigramProbabilityOfPtNode(nextWordPos), bigramsIt.getProbability()); diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.cpp index 968bacee6..cd2243025 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.cpp @@ -17,6 +17,7 @@ #include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h" #include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" +#include "suggest/policyimpl/dictionary/header/header_policy.h" #include "suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h" #include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" @@ -34,7 +35,12 @@ void Ver4BigramListPolicy::getNextBigram(int *const outBigramPos, int *const out bigramEntry.getTargetTerminalId()); } if (outProbability) { - *outProbability = bigramEntry.getProbability(); + if (bigramEntry.hasHistoricalInfo()) { + *outProbability = + ForgettingCurveUtils::decodeProbability(bigramEntry.getHistoricalInfo()); + } else { + *outProbability = bigramEntry.getProbability(); + } } if (outHasNext) { *outHasNext = bigramEntry.hasNext(); @@ -152,17 +158,12 @@ bool Ver4BigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(const i if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) { return false; } - } else if (mNeedsToDecayWhenUpdating) { - const int probability = ForgettingCurveUtils::getEncodedProbabilityToSave( - bigramEntry.getProbability(), mHeaderPolicy); - const HistoricalInfo historicalInfo = - ForgettingCurveUtils::createHistoricalInfoToSave( - bigramEntry.getHistoricalInfo()); - // TODO: Use ForgettingCurveUtils::needsToKeep(&historicalInfo). - if (ForgettingCurveUtils::isValidEncodedProbability(probability)) { + } else if (bigramEntry.hasHistoricalInfo()) { + const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave( + bigramEntry.getHistoricalInfo()); + if (ForgettingCurveUtils::needsToKeep(&historicalInfo)) { const BigramEntry updatedBigramEntry = - bigramEntry.updateProbabilityAndGetEntry(probability) - .updateHistoricalInfoAndGetEntry(&historicalInfo); + bigramEntry.updateHistoricalInfoAndGetEntry(&historicalInfo); if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) { return false; } @@ -225,14 +226,12 @@ int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind, const BigramEntry Ver4BigramListPolicy::createUpdatedBigramEntryFrom( const BigramEntry *const originalBigramEntry, const int newProbability, const int timestamp) const { - if (mNeedsToDecayWhenUpdating) { - const int probability = ForgettingCurveUtils::getUpdatedEncodedProbability( - originalBigramEntry->getProbability(), newProbability); + // TODO: Consolidate historical info and probability. + if (mHeaderPolicy->hasHistoricalInfoOfWords()) { const HistoricalInfo updatedHistoricalInfo = ForgettingCurveUtils::createUpdatedHistoricalInfo( originalBigramEntry->getHistoricalInfo(), newProbability, timestamp); - return originalBigramEntry->updateProbabilityAndGetEntry(probability) - .updateHistoricalInfoAndGetEntry(&updatedHistoricalInfo); + return originalBigramEntry->updateHistoricalInfoAndGetEntry(&updatedHistoricalInfo); } else { return originalBigramEntry->updateProbabilityAndGetEntry(newProbability); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h index 972144100..5b6c5a173 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h @@ -24,18 +24,17 @@ namespace latinime { class BigramDictContent; -class DictionaryHeaderStructurePolicy; +class HeaderPolicy; class TerminalPositionLookupTable; class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy { public: Ver4BigramListPolicy(BigramDictContent *const bigramDictContent, const TerminalPositionLookupTable *const terminalPositionLookupTable, - const DictionaryHeaderStructurePolicy *const headerPolicy, - const bool needsToDecayWhenUpdating) + const HeaderPolicy *const headerPolicy) : mBigramDictContent(bigramDictContent), mTerminalPositionLookupTable(terminalPositionLookupTable), - mHeaderPolicy(headerPolicy), mNeedsToDecayWhenUpdating(needsToDecayWhenUpdating) {} + mHeaderPolicy(headerPolicy) {} void getNextBigram(int *const outBigramPos, int *const outProbability, bool *const outHasNext, int *const bigramEntryPos) const; @@ -64,8 +63,7 @@ class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy { BigramDictContent *const mBigramDictContent; const TerminalPositionLookupTable *const mTerminalPositionLookupTable; - const DictionaryHeaderStructurePolicy *const mHeaderPolicy; - const bool mNeedsToDecayWhenUpdating; + const HeaderPolicy *const mHeaderPolicy; }; } // namespace latinime #endif /* LATINIME_VER4_BIGRAM_LIST_POLICY_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h index 29e937b3a..d71e0a068 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h @@ -93,6 +93,18 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { } } + AK_FORCE_INLINE bool isValid() const { + // Decaying dictionary must have historical information. + if (!mIsDecayingDict) { + return true; + } + if (mHasHistoricalInfoOfWords) { + return true; + } else { + return false; + } + } + AK_FORCE_INLINE int getSize() const { return mSize; } @@ -137,7 +149,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { return mExtendedRegionSize; } - AK_FORCE_INLINE bool hasHistricalInfoOfWords() const { + AK_FORCE_INLINE bool hasHistoricalInfoOfWords() const { return mHasHistoricalInfoOfWords; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_gc_event_listeners.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_gc_event_listeners.cpp index d4175355e..bcba67035 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_gc_event_listeners.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_gc_event_listeners.cpp @@ -20,7 +20,6 @@ #include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" #include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h" #include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_writing_utils.h" -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" namespace latinime { @@ -30,8 +29,7 @@ bool DynamicPatriciaTrieGcEventListeners // PtNode is useless when the PtNode is not a terminal and doesn't have any not useless // children. bool isUselessPtNode = !ptNodeParams->isTerminal(); - // TODO: Quit checking mNeedsToDecayWhenUpdating. - if (ptNodeParams->isTerminal() && mNeedsToDecayWhenUpdating) { + if (ptNodeParams->isTerminal()) { bool needsToKeepPtNode = true; if (!mPtNodeWriter->updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC(ptNodeParams, &needsToKeepPtNode)) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_gc_event_listeners.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_gc_event_listeners.h index edd692898..562eb69bd 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_gc_event_listeners.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_gc_event_listeners.h @@ -27,8 +27,6 @@ namespace latinime { -class DictionaryHeaderStructurePolicy; -class PtNodeWriter; class PtNodeParams; // TODO: Move to pt_common. @@ -41,12 +39,9 @@ class DynamicPatriciaTrieGcEventListeners { : public DynamicPatriciaTrieReadingHelper::TraversingEventListener { public: TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( - const DictionaryHeaderStructurePolicy *const headerPolicy, - PtNodeWriter *const ptNodeWriter, BufferWithExtendableBuffer *const buffer, - const bool needsToDecayWhenUpdating) - : mHeaderPolicy(headerPolicy), mPtNodeWriter(ptNodeWriter), mBuffer(buffer), - mNeedsToDecayWhenUpdating(needsToDecayWhenUpdating), mValueStack(), - mChildrenValue(0), mValidUnigramCount(0) {} + PtNodeWriter *const ptNodeWriter) + : mPtNodeWriter(ptNodeWriter), mValueStack(), mChildrenValue(0), + mValidUnigramCount(0) {} ~TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted() {}; @@ -77,10 +72,7 @@ class DynamicPatriciaTrieGcEventListeners { DISALLOW_IMPLICIT_CONSTRUCTORS( TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted); - const DictionaryHeaderStructurePolicy *const mHeaderPolicy; PtNodeWriter *const mPtNodeWriter; - BufferWithExtendableBuffer *const mBuffer; - const bool mNeedsToDecayWhenUpdating; std::vector mValueStack; int mChildrenValue; int mValidUnigramCount; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_entry.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_entry.h index 805014506..2b0cbd93b 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_entry.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_entry.h @@ -73,6 +73,10 @@ class BigramEntry { return mProbability; } + bool hasHistoricalInfo() const { + return mHistoricalInfo.isValid(); + } + const HistoricalInfo *getHistoricalInfo() const { return &mHistoricalInfo; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h index d1b28cc0a..36ba82be1 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h @@ -51,6 +51,10 @@ class ProbabilityEntry { return ProbabilityEntry(mFlags, mProbability, historicalInfo); } + bool hasHistoricalInfo() const { + return mHistoricalInfo.isValid(); + } + int getFlags() const { return mFlags; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h index 7e283f437..153d8990a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h @@ -45,9 +45,9 @@ class Ver4DictBuffers { } AK_FORCE_INLINE bool isValid() const { - return mDictBuffer.get() != 0 && mProbabilityDictContent.isValid() - && mTerminalPositionLookupTable.isValid() && mBigramDictContent.isValid() - && mShortcutDictContent.isValid(); + return mDictBuffer.get() != 0 && mHeaderPolicy.isValid() + && mProbabilityDictContent.isValid() && mTerminalPositionLookupTable.isValid() + && mBigramDictContent.isValid() && mShortcutDictContent.isValid(); } AK_FORCE_INLINE bool isNearSizeLimit() const { @@ -131,9 +131,10 @@ class Ver4DictBuffers { BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), // TODO: Quit using header size. mTerminalPositionLookupTable(dictDirPath, isUpdatable, mHeaderPolicy.getSize()), - mProbabilityDictContent(dictDirPath, mHeaderPolicy.hasHistricalInfoOfWords(), + mProbabilityDictContent(dictDirPath, mHeaderPolicy.hasHistoricalInfoOfWords(), + isUpdatable), + mBigramDictContent(dictDirPath, mHeaderPolicy.hasHistoricalInfoOfWords(), isUpdatable), - mBigramDictContent(dictDirPath, mHeaderPolicy.hasHistricalInfoOfWords(), isUpdatable), mShortcutDictContent(dictDirPath, isUpdatable), mIsUpdatable(isUpdatable) {} @@ -142,8 +143,8 @@ class Ver4DictBuffers { mExpandableHeaderBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), mExpandableTrieBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), mTerminalPositionLookupTable(), - mProbabilityDictContent(headerPolicy->hasHistricalInfoOfWords()), - mBigramDictContent(headerPolicy->hasHistricalInfoOfWords()), mShortcutDictContent(), + mProbabilityDictContent(headerPolicy->hasHistoricalInfoOfWords()), + mBigramDictContent(headerPolicy->hasHistoricalInfoOfWords()), mShortcutDictContent(), mIsUpdatable(true) {} const MmappedBuffer::MmappedBufferPtr mDictBuffer; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp index 5fa657d01..8ea029076 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp @@ -22,6 +22,7 @@ #include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" namespace latinime { @@ -62,7 +63,12 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce terminalId = Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition(dictBuf, &pos); const ProbabilityEntry probabilityEntry = mProbabilityDictContent->getProbabilityEntry(terminalId); - probability = probabilityEntry.getProbability(); + if (probabilityEntry.hasHistoricalInfo()) { + probability = ForgettingCurveUtils::decodeProbability( + probabilityEntry.getHistoricalInfo()); + } else { + probability = probabilityEntry.getProbability(); + } } int childrenPosFieldPos = pos; if (usesAdditionalBuffer) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp index 07554342f..b60499e9f 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp @@ -17,6 +17,7 @@ #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h" #include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/header/header_policy.h" #include "suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h" #include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h" @@ -147,27 +148,21 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeA AKLOGE("updatePtNodeProbabilityAndGetNeedsToSaveForGC is called for non-terminal PtNode."); return false; } - if (mBuffers->getHeaderPolicy()->isDecayingDict()) { - const ProbabilityEntry originalProbabilityEntry = - mBuffers->getProbabilityDictContent()->getProbabilityEntry( - toBeUpdatedPtNodeParams->getTerminalId()); - // TODO: Remove. - const int newProbability = ForgettingCurveUtils::getEncodedProbabilityToSave( - originalProbabilityEntry.getProbability(), mBuffers->getHeaderPolicy()); - const HistoricalInfo historicalInfo = - ForgettingCurveUtils::createHistoricalInfoToSave( - originalProbabilityEntry.getHistoricalInfo()); + const ProbabilityEntry originalProbabilityEntry = + mBuffers->getProbabilityDictContent()->getProbabilityEntry( + toBeUpdatedPtNodeParams->getTerminalId()); + if (originalProbabilityEntry.hasHistoricalInfo()) { + const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave( + originalProbabilityEntry.getHistoricalInfo()); const ProbabilityEntry probabilityEntry = - originalProbabilityEntry.createEntryWithUpdatedProbability(newProbability) - .createEntryWithUpdatedHistoricalInfo(&historicalInfo); + originalProbabilityEntry.createEntryWithUpdatedHistoricalInfo(&historicalInfo); if (!mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry( toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry)) { AKLOGE("Cannot write updated probability entry. terminalId: %d", toBeUpdatedPtNodeParams->getTerminalId()); return false; } - // TODO: Use ForgettingCurveUtils::needsToKeep(&historicalInfo). - const bool isValid = ForgettingCurveUtils::isValidEncodedProbability(newProbability); + const bool isValid = ForgettingCurveUtils::needsToKeep(&historicalInfo); if (!isValid) { if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) { AKLOGE("Cannot mark PtNode as willBecomeNonTerminal."); @@ -380,14 +375,13 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition( const ProbabilityEntry Ver4PatriciaTrieNodeWriter::createUpdatedEntryFrom( const ProbabilityEntry *const originalProbabilityEntry, const int newProbability, const int timestamp) const { - if (mNeedsToDecayWhenUpdating) { - const int updatedProbability = ForgettingCurveUtils::getUpdatedEncodedProbability( - originalProbabilityEntry->getProbability(), newProbability); + // TODO: Consolidate historical info and probability. + if (mBuffers->getHeaderPolicy()->hasHistoricalInfoOfWords()) { const HistoricalInfo updatedHistoricalInfo = ForgettingCurveUtils::createUpdatedHistoricalInfo( originalProbabilityEntry->getHistoricalInfo(), newProbability, timestamp); - return originalProbabilityEntry->createEntryWithUpdatedProbability(updatedProbability) - .createEntryWithUpdatedHistoricalInfo(&updatedHistoricalInfo); + return originalProbabilityEntry->createEntryWithUpdatedHistoricalInfo( + &updatedHistoricalInfo); } else { return originalProbabilityEntry->createEntryWithUpdatedProbability(newProbability); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h index b0b5b70f2..1df853a30 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h @@ -40,12 +40,10 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter { public: Ver4PatriciaTrieNodeWriter(BufferWithExtendableBuffer *const trieBuffer, Ver4DictBuffers *const buffers, const Ver4PatriciaTrieNodeReader *const ptNodeReader, - Ver4BigramListPolicy *const bigramPolicy, Ver4ShortcutListPolicy *const shortcutPolicy, - const bool needsToDecayWhenUpdating) + Ver4BigramListPolicy *const bigramPolicy, Ver4ShortcutListPolicy *const shortcutPolicy) : mTrieBuffer(trieBuffer), mBuffers(buffers), mPtNodeReader(ptNodeReader), mReadingHelper(mTrieBuffer, mPtNodeReader), - mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy), - mNeedsToDecayWhenUpdating(needsToDecayWhenUpdating) {} + mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy) {} virtual ~Ver4PatriciaTrieNodeWriter() {} @@ -120,7 +118,6 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter { DynamicPatriciaTrieReadingHelper mReadingHelper; Ver4BigramListPolicy *const mBigramPolicy; Ver4ShortcutListPolicy *const mShortcutPolicy; - const bool mNeedsToDecayWhenUpdating; }; } // namespace latinime #endif /* LATINIME_VER4_PATRICIA_TRIE_NODE_WRITER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp index 3447dce93..7b5461d74 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp @@ -34,8 +34,6 @@ const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; -const char *const Ver4PatriciaTriePolicy::SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY = - "SET_NEEDS_TO_DECAY_FOR_TESTING"; const char *const Ver4PatriciaTriePolicy::SET_CURRENT_TIME_FOR_TESTING_QUERY_FORMAT = "SET_CURRENT_TIME_FOR_TESTING:%d"; const char *const Ver4PatriciaTriePolicy::GET_CURRENT_TIME_QUERY = "GET_CURRENT_TIME"; @@ -62,8 +60,7 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d // A DecayingDict may have a terminal PtNode that has a terminal DicNode whose // probability is NOT_A_PROBABILITY. In such case, we don't want to treat it as a // valid terminal DicNode. - isTerminal = getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY) - != NOT_A_PROBABILITY; + isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY; } childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(), ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal, @@ -262,11 +259,7 @@ void Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); return; } - const bool needsToDecay = mHeaderPolicy->isDecayingDict() - && (mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay( - false /* mindsBlockByDecay */, mUnigramCount, mBigramCount, mHeaderPolicy)); - mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath, needsToDecay); - mNeedsToDecayForTesting = false; + mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath); } bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { @@ -286,8 +279,8 @@ bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { // Needs to reduce dictionary size. return true; } else if (mHeaderPolicy->isDecayingDict()) { - return mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay( - mindsBlockByGC, mUnigramCount, mBigramCount, mHeaderPolicy); + return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mUnigramCount, mBigramCount, + mHeaderPolicy); } return false; } @@ -308,8 +301,6 @@ void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int quer snprintf(outResult, maxResultLength, "%d", mHeaderPolicy->isDecayingDict() ? ForgettingCurveUtils::MAX_BIGRAM_COUNT : static_cast(Ver4DictConstants::MAX_DICTIONARY_SIZE)); - } else if (strncmp(query, SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY, compareLength) == 0) { - mNeedsToDecayForTesting = true; } else if (sscanf(query, SET_CURRENT_TIME_FOR_TESTING_QUERY_FORMAT, ×tamp) == 1) { TimeKeeper::startTestModeWithForceCurrentTime(timestamp); } else if (strncmp(query, GET_CURRENT_TIME_QUERY, compareLength) == 0) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h index d2d2ead50..2d46c0ae2 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h @@ -41,17 +41,16 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { : mBuffers(buffers), mHeaderPolicy(mBuffers.get()->getHeaderPolicy()), mDictBuffer(mBuffers.get()->getWritableTrieBuffer()), mBigramPolicy(mBuffers.get()->getMutableBigramDictContent(), - mBuffers.get()->getTerminalPositionLookupTable(), mHeaderPolicy, - mHeaderPolicy->isDecayingDict()), + mBuffers.get()->getTerminalPositionLookupTable(), mHeaderPolicy), mShortcutPolicy(mBuffers.get()->getMutableShortcutDictContent(), mBuffers.get()->getTerminalPositionLookupTable()), mNodeReader(mDictBuffer, mBuffers.get()->getProbabilityDictContent()), mNodeWriter(mDictBuffer, mBuffers.get(), &mNodeReader, &mBigramPolicy, - &mShortcutPolicy, mHeaderPolicy->isDecayingDict()), + &mShortcutPolicy), mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter), mWritingHelper(mBuffers.get()), mUnigramCount(mHeaderPolicy->getUnigramCount()), - mBigramCount(mHeaderPolicy->getBigramCount()), mNeedsToDecayForTesting(false) {}; + mBigramCount(mHeaderPolicy->getBigramCount()) {}; AK_FORCE_INLINE int getRootPosition() const { return 0; @@ -117,7 +116,6 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { static const char *const BIGRAM_COUNT_QUERY; static const char *const MAX_UNIGRAM_COUNT_QUERY; static const char *const MAX_BIGRAM_COUNT_QUERY; - static const char *const SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY; static const char *const SET_CURRENT_TIME_FOR_TESTING_QUERY_FORMAT; static const char *const GET_CURRENT_TIME_QUERY; static const char *const QUIT_TIMEKEEPER_TEST_MODE_QUERY; @@ -137,7 +135,6 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { Ver4PatriciaTrieWritingHelper mWritingHelper; int mUnigramCount; int mBigramCount; - bool mNeedsToDecayForTesting; }; } // namespace latinime #endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp index 6ab460ff6..21d009ecb 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp @@ -53,20 +53,20 @@ void Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const trieFilePa } void Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos, - const char *const trieFilePath, const bool needsToDecay) { + const char *const trieFilePath) { const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy(); Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers( Ver4DictBuffers::createVer4DictBuffers(headerPolicy)); int unigramCount = 0; int bigramCount = 0; - if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &unigramCount, &bigramCount, - needsToDecay)) { + if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &unigramCount, &bigramCount)) { return; } BufferWithExtendableBuffer headerBuffer( BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */, - needsToDecay, unigramCount, bigramCount, 0 /* extendedRegionSize */)) { + true /* updatesLastDecayedTime */, unigramCount, bigramCount, + 0 /* extendedRegionSize */)) { return; } const int dirPathBufSize = strlen(trieFilePath) + 1 /* terminator */; @@ -77,30 +77,29 @@ void Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeAr bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite, - int *const outUnigramCount, int *const outBigramCount, const bool needsToDecay) { + int *const outUnigramCount, int *const outBigramCount) { Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(), mBuffers->getProbabilityDictContent()); Ver4BigramListPolicy bigramPolicy(mBuffers->getMutableBigramDictContent(), - mBuffers->getTerminalPositionLookupTable(), headerPolicy, needsToDecay); + mBuffers->getTerminalPositionLookupTable(), headerPolicy); Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(), mBuffers->getTerminalPositionLookupTable()); Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(), - mBuffers, &ptNodeReader, &bigramPolicy, &shortcutPolicy, - false /* needsToDecayWhenUpdating */); + mBuffers, &ptNodeReader, &bigramPolicy, &shortcutPolicy); DynamicPatriciaTrieReadingHelper readingHelper(mBuffers->getTrieBuffer(), &ptNodeReader); readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); DynamicPatriciaTrieGcEventListeners ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( - headerPolicy, &ptNodeWriter, mBuffers->getWritableTrieBuffer(), - needsToDecay); + &ptNodeWriter); if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) { return false; } - if (needsToDecay && traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted - .getValidUnigramCount() > ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC) { + if (headerPolicy->isDecayingDict() + && traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted + .getValidUnigramCount() > ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC) { // TODO: Remove more unigrams. } @@ -111,8 +110,9 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, &traversePolicyToUpdateBigramProbability)) { return false; } - if (needsToDecay && traversePolicyToUpdateBigramProbability.getValidBigramEntryCount() - > ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC) { + if (headerPolicy->isDecayingDict() + && traversePolicyToUpdateBigramProbability.getValidBigramEntryCount() + > ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC) { // TODO: Remove more bigrams. } @@ -120,8 +120,7 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap; readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(), - buffersToWrite, &ptNodeReader, &bigramPolicy, &shortcutPolicy, - false /* needsToDecayWhenUpdating */); + buffersToWrite, &ptNodeReader, &bigramPolicy, &shortcutPolicy); DynamicPatriciaTrieGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers, buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap); @@ -134,13 +133,11 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(), buffersToWrite->getProbabilityDictContent()); Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getMutableBigramDictContent(), - buffersToWrite->getTerminalPositionLookupTable(), headerPolicy, - false /* needsToDecay */); + buffersToWrite->getTerminalPositionLookupTable(), headerPolicy); Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(), buffersToWrite->getTerminalPositionLookupTable()); Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(), - buffersToWrite, &newPtNodeReader, &newBigramPolicy, &newShortcutPolicy, - false /* needsToDecayWhenUpdating */); + buffersToWrite, &newPtNodeReader, &newBigramPolicy, &newShortcutPolicy); // Re-assign terminal IDs for valid terminal PtNodes. TerminalPositionLookupTable::TerminalIdMap terminalIdMap; if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds( diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h index 505012b4f..82877fdcc 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h @@ -36,7 +36,7 @@ class Ver4PatriciaTrieWritingHelper { const int bigramCount) const; void writeToDictFileWithGC(const int rootPtNodeArrayPos, - const char *const trieFilePath, const bool needsToDecay); + const char *const trieFilePath); private: DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTrieWritingHelper); @@ -66,7 +66,7 @@ class Ver4PatriciaTrieWritingHelper { bool runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite, int *const outUnigramCount, - int *const outBigramCount, const bool needsToDecay); + int *const outBigramCount); Ver4DictBuffers *const mBuffers; }; diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp index 466af7256..4050ad363 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp @@ -31,12 +31,6 @@ const int ForgettingCurveUtils::MAX_BIGRAM_COUNT = 12000; const int ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC = 10000; const int ForgettingCurveUtils::MAX_COMPUTED_PROBABILITY = 127; -const int ForgettingCurveUtils::MAX_ENCODED_PROBABILITY = 15; -const int ForgettingCurveUtils::MIN_VALID_ENCODED_PROBABILITY = 3; -const int ForgettingCurveUtils::ENCODED_PROBABILITY_STEP = 1; -// Currently, we try to decay each uni/bigram once every 2 hours. Accordingly, the expected -// duration of the decay is approximately 66hours. -const float ForgettingCurveUtils::MIN_PROBABILITY_TO_DECAY = 0.03f; const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60; const int ForgettingCurveUtils::MAX_LEVEL = 3; @@ -53,9 +47,9 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT const int newProbability, const int timestamp) { if (newProbability != NOT_A_PROBABILITY && originalHistoricalInfo->getLevel() == 0) { return HistoricalInfo(timestamp, MIN_VALID_LEVEL /* level */, 0 /* count */); - } else if (originalHistoricalInfo->getTimeStamp() == NOT_A_TIMESTAMP) { + } else if (!originalHistoricalInfo->isValid()) { // Initial information. - return HistoricalInfo(timestamp, 0 /* level */, 0 /* count */); + return HistoricalInfo(timestamp, 0 /* level */, 1 /* count */); } else { const int updatedCount = originalHistoricalInfo->getCount() + 1; if (updatedCount > MAX_COUNT) { @@ -75,85 +69,46 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT } } -/* static */ int ForgettingCurveUtils::getProbability(const int encodedUnigramProbability, - const int encodedBigramProbability) { - if (encodedUnigramProbability == NOT_A_PROBABILITY) { +/* static */ int ForgettingCurveUtils::decodeProbability( + const HistoricalInfo *const historicalInfo) { + const int elapsedTimeStepCount = getElapsedTimeStepCount(historicalInfo->getTimeStamp()); + return sProbabilityTable.getProbability(historicalInfo->getLevel(), + min(max(elapsedTimeStepCount, 0), MAX_ELAPSED_TIME_STEP_COUNT)); +} + +/* static */ int ForgettingCurveUtils::getProbability(const int unigramProbability, + const int bigramProbability) { + if (unigramProbability == NOT_A_PROBABILITY) { return NOT_A_PROBABILITY; - } else if (encodedBigramProbability == NOT_A_PROBABILITY) { - return backoff(decodeProbability(encodedUnigramProbability)); + } else if (bigramProbability == NOT_A_PROBABILITY) { + return min(backoff(unigramProbability), MAX_COMPUTED_PROBABILITY); } else { - const int unigramProbability = decodeProbability(encodedUnigramProbability); - const int bigramProbability = decodeProbability(encodedBigramProbability); return min(max(unigramProbability, bigramProbability), MAX_COMPUTED_PROBABILITY); } } -// Caveat: Unlike getProbability(), this method doesn't assume special bigram probability encoding -// (i.e. unigram probability + bigram probability delta). -/* static */ int ForgettingCurveUtils::getUpdatedEncodedProbability( - const int originalEncodedProbability, const int newProbability) { - if (originalEncodedProbability == NOT_A_PROBABILITY) { - // The bigram relation is not in this dictionary. - if (newProbability == NOT_A_PROBABILITY) { - // The bigram target is not in other dictionaries. - return 0; - } else { - return MIN_VALID_ENCODED_PROBABILITY; - } - } else { - if (newProbability != NOT_A_PROBABILITY - && originalEncodedProbability < MIN_VALID_ENCODED_PROBABILITY) { - return MIN_VALID_ENCODED_PROBABILITY; - } - return min(originalEncodedProbability + ENCODED_PROBABILITY_STEP, MAX_ENCODED_PROBABILITY); - } -} - -/* static */ int ForgettingCurveUtils::isValidEncodedProbability(const int encodedProbability) { - return encodedProbability >= MIN_VALID_ENCODED_PROBABILITY; -} - /* static */ bool ForgettingCurveUtils::needsToKeep(const HistoricalInfo *const historicalInfo) { return historicalInfo->getLevel() > 0 || getElapsedTimeStepCount(historicalInfo->getTimeStamp()) < DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD; } -/* static */ int ForgettingCurveUtils::getEncodedProbabilityToSave(const int encodedProbability, - const DictionaryHeaderStructurePolicy *const headerPolicy) { - const int elapsedTime = TimeKeeper::peekCurrentTime() - headerPolicy->getLastDecayedTime(); - const int decayIterationCount = max(elapsedTime / DECAY_INTERVAL_SECONDS, 1); - int currentEncodedProbability = max(min(encodedProbability, MAX_ENCODED_PROBABILITY), 0); - // TODO: Implement the decay in more proper way. - for (int i = 0; i < decayIterationCount; ++i) { - const float currentRate = static_cast(currentEncodedProbability) - / static_cast(MAX_ENCODED_PROBABILITY); - const float thresholdToDecay = (1.0f - MIN_PROBABILITY_TO_DECAY) * currentRate; - const float randValue = static_cast(rand()) / static_cast(RAND_MAX); - if (thresholdToDecay < randValue) { - currentEncodedProbability = max(currentEncodedProbability - ENCODED_PROBABILITY_STEP, - 0); - } - } - return currentEncodedProbability; -} - /* static */ const HistoricalInfo ForgettingCurveUtils::createHistoricalInfoToSave( const HistoricalInfo *const originalHistoricalInfo) { if (originalHistoricalInfo->getTimeStamp() == NOT_A_TIMESTAMP) { return HistoricalInfo(); } const int elapsedTimeStep = getElapsedTimeStepCount(originalHistoricalInfo->getTimeStamp()); - if (elapsedTimeStep < MAX_ELAPSED_TIME_STEP_COUNT) { + if (elapsedTimeStep <= MAX_ELAPSED_TIME_STEP_COUNT) { // No need to update historical info. return *originalHistoricalInfo; } // Level down. - const int maxLevelDownAmonut = elapsedTimeStep / MAX_ELAPSED_TIME_STEP_COUNT; + const int maxLevelDownAmonut = elapsedTimeStep / (MAX_ELAPSED_TIME_STEP_COUNT + 1); const int levelDownAmount = (maxLevelDownAmonut >= originalHistoricalInfo->getLevel()) ? originalHistoricalInfo->getLevel() : maxLevelDownAmonut; const int adjustedTimestamp = originalHistoricalInfo->getTimeStamp() + - levelDownAmount * MAX_ELAPSED_TIME_STEP_COUNT * TIME_STEP_DURATION_IN_SECONDS; + levelDownAmount * (MAX_ELAPSED_TIME_STEP_COUNT + 1) * TIME_STEP_DURATION_IN_SECONDS; return HistoricalInfo(adjustedTimestamp, originalHistoricalInfo->getLevel() - levelDownAmount, 0 /* count */); } @@ -179,14 +134,6 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT return false; } -/* static */ int ForgettingCurveUtils::decodeProbability(const int encodedProbability) { - if (encodedProbability < MIN_VALID_ENCODED_PROBABILITY) { - return NOT_A_PROBABILITY; - } else { - return min(sProbabilityTable.getProbability(encodedProbability), MAX_ENCODED_PROBABILITY); - } -} - // See comments in ProbabilityUtils::backoff(). /* static */ int ForgettingCurveUtils::backoff(const int unigramProbability) { if (unigramProbability == NOT_A_PROBABILITY) { @@ -201,14 +148,24 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT } ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTable() { - // Table entry is as follows: - // 1, 1, 1, 2, 3, 5, 6, 9, 13, 18, 25, 34, 48, 66, 91, 127. - // Note that first MIN_VALID_ENCODED_PROBABILITY values are not used. - mTable.resize(MAX_ENCODED_PROBABILITY + 1); - for (int i = 0; i <= MAX_ENCODED_PROBABILITY; ++i) { - const int probability = static_cast(powf(static_cast(MAX_COMPUTED_PROBABILITY), - static_cast(i) / static_cast(MAX_ENCODED_PROBABILITY))); - mTable[i] = min(MAX_COMPUTED_PROBABILITY, max(0, probability)); + mTable.resize(MAX_LEVEL + 1); + for (int level = 0; level <= MAX_LEVEL; ++level) { + mTable[level].resize(MAX_ELAPSED_TIME_STEP_COUNT + 1); + const float initialProbability = + static_cast(MAX_COMPUTED_PROBABILITY / (1 << (MAX_LEVEL - level))); + for (int timeStepCount = 0; timeStepCount <= MAX_ELAPSED_TIME_STEP_COUNT; ++timeStepCount) { + if (level == 0) { + mTable[level][timeStepCount] = NOT_A_PROBABILITY; + continue; + } + const int elapsedTime = timeStepCount * TIME_STEP_DURATION_IN_SECONDS; + const float probability = initialProbability + * powf(2.0f, -1.0f * static_cast(elapsedTime) + / static_cast(TIME_STEP_DURATION_IN_SECONDS + * (MAX_ELAPSED_TIME_STEP_COUNT + 1))); + mTable[level][timeStepCount] = + min(max(static_cast(probability), 1), MAX_COMPUTED_PROBABILITY); + } } } diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h index 76d172e0b..6ac8dc528 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h @@ -26,8 +26,6 @@ namespace latinime { class DictionaryHeaderStructurePolicy; -// TODO: Check the elapsed time and decrease the probability depending on the time. Time field is -// required to introduced to each terminal PtNode and bigram entry. // TODO: Quit using bigram probability to indicate the delta. class ForgettingCurveUtils { public: @@ -43,22 +41,13 @@ class ForgettingCurveUtils { static const HistoricalInfo createHistoricalInfoToSave( const HistoricalInfo *const originalHistoricalInfo); + static int decodeProbability(const HistoricalInfo *const historicalInfo); + static int getProbability(const int encodedUnigramProbability, const int encodedBigramProbability); - // TODO: Remove. - static int getUpdatedEncodedProbability(const int originalEncodedProbability, - const int newProbability); - - // TODO: Remove. - static int isValidEncodedProbability(const int encodedProbability); - static bool needsToKeep(const HistoricalInfo *const historicalInfo); - // TODO: Remove. - static int getEncodedProbabilityToSave(const int encodedProbability, - const DictionaryHeaderStructurePolicy *const headerPolicy); - static bool needsToDecay(const bool mindsBlockByDecay, const int unigramCount, const int bigramCount, const DictionaryHeaderStructurePolicy *const headerPolicy); @@ -69,24 +58,17 @@ class ForgettingCurveUtils { public: ProbabilityTable(); - int getProbability(const int encodedProbability) const { - if (encodedProbability < 0 || encodedProbability > static_cast(mTable.size())) { - return NOT_A_PROBABILITY; - } - return mTable[encodedProbability]; + int getProbability(const int level, const int elapsedTimeStepCount) const { + return mTable[level][elapsedTimeStepCount]; } private: DISALLOW_COPY_AND_ASSIGN(ProbabilityTable); - std::vector mTable; + std::vector > mTable; }; static const int MAX_COMPUTED_PROBABILITY; - static const int MAX_ENCODED_PROBABILITY; - static const int MIN_VALID_ENCODED_PROBABILITY; - static const int ENCODED_PROBABILITY_STEP; - static const float MIN_PROBABILITY_TO_DECAY; static const int DECAY_INTERVAL_SECONDS; static const int MAX_LEVEL; @@ -95,11 +77,10 @@ class ForgettingCurveUtils { static const int TIME_STEP_DURATION_IN_SECONDS; static const int MAX_ELAPSED_TIME_STEP_COUNT; static const int DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD; + static const int HALF_LIFE_TIME_IN_SECONDS; static const ProbabilityTable sProbabilityTable; - static int decodeProbability(const int encodedProbability); - static int backoff(const int unigramProbability); static int getElapsedTimeStepCount(const int timestamp); diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/historical_info.h b/native/jni/src/suggest/policyimpl/dictionary/utils/historical_info.h index 64c0136a6..428ca8626 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/historical_info.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/historical_info.h @@ -30,6 +30,10 @@ class HistoricalInfo { HistoricalInfo(const int timestamp, const int level, const int count) : mTimestamp(timestamp), mLevel(level), mCount(count) {} + bool isValid() const { + return mTimestamp != NOT_A_TIMESTAMP; + } + int getTimeStamp() const { return mTimestamp; } diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java index 1ea54eae5..d77a11f01 100644 --- a/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java +++ b/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java @@ -30,6 +30,7 @@ import java.util.HashMap; import java.util.Locale; import java.util.Map; import java.util.Random; +import java.util.concurrent.TimeUnit; @LargeTest public class BinaryDictionaryDecayingTests extends AndroidTestCase { @@ -38,8 +39,6 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { // Note that these are corresponding definitions in native code in // latinime::Ver4PatriciaTriePolicy. - private static final String SET_NEEDS_TO_DECAY_FOR_TESTING_KEY = - "SET_NEEDS_TO_DECAY_FOR_TESTING"; private static final String SET_CURRENT_TIME_FOR_TESTING_QUERY = "SET_CURRENT_TIME_FOR_TESTING"; private static final String GET_CURRENT_TIME_QUERY = "GET_CURRENT_TIME"; @@ -47,14 +46,28 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { private static final int DUMMY_PROBABILITY = 0; + private int mCurrentTime = 0; + @Override protected void setUp() throws Exception { super.setUp(); + mCurrentTime = 0; } @Override protected void tearDown() throws Exception { super.tearDown(); + try { + final File dictFile = + createEmptyDictionaryAndGetFile("TestBinaryDictionary", FormatSpec.VERSION4); + final BinaryDictionary binaryDictionary = + new BinaryDictionary(dictFile.getAbsolutePath(), 0 /* offset */, + dictFile.length(), true /* useFullEditDistance */, Locale.getDefault(), + TEST_LOCALE, true /* isUpdatable */); + binaryDictionary.getPropertyForTests(QUIT_TIMEKEEPER_TEST_MODE_QUERY); + } catch (IOException e) { + fail("IOException while writing an initial dictionary : " + e); + } } private void addUnigramWord(final BinaryDictionary binaryDictionary, final String word, @@ -62,32 +75,29 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { binaryDictionary.addUnigramWord(word, probability, "" /* shortcutTarget */, BinaryDictionary.NOT_A_PROBABILITY /* shortcutProbability */, false /* isNotAWord */, false /* isBlacklisted */, - BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); + mCurrentTime /* timestamp */); } private void addBigramWords(final BinaryDictionary binaryDictionary, final String word0, final String word1, final int probability) { binaryDictionary.addBigramWords(word0, word1, probability, - BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */); + mCurrentTime /* timestamp */); } private void forcePassingShortTime(final BinaryDictionary binaryDictionary) { - // Entries having low probability would be suppressed once in 3 GCs. - final int count = 3; - for (int i = 0; i < count; i++) { - binaryDictionary.getPropertyForTests(SET_NEEDS_TO_DECAY_FOR_TESTING_KEY); - binaryDictionary.flushWithGC(); - } + // 4 days. + final int timeToElapse = (int)TimeUnit.SECONDS.convert(4, TimeUnit.DAYS); + mCurrentTime += timeToElapse; + setCurrentTime(binaryDictionary, mCurrentTime); + binaryDictionary.flushWithGC(); } private void forcePassingLongTime(final BinaryDictionary binaryDictionary) { - // Currently, probabilities are decayed when GC is run. All entries that have never been - // typed in 128 GCs would be removed. - final int count = 128; - for (int i = 0; i < count; i++) { - binaryDictionary.getPropertyForTests(SET_NEEDS_TO_DECAY_FOR_TESTING_KEY); - binaryDictionary.flushWithGC(); - } + // 60 days. + final int timeToElapse = (int)TimeUnit.SECONDS.convert(60, TimeUnit.DAYS); + mCurrentTime += timeToElapse; + setCurrentTime(binaryDictionary, mCurrentTime); + binaryDictionary.flushWithGC(); } private File createEmptyDictionaryAndGetFile(final String dictId, @@ -147,6 +157,7 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); + binaryDictionary.getPropertyForTests(QUIT_TIMEKEEPER_TEST_MODE_QUERY); final int startTime = getCurrentTime(binaryDictionary); for (int i = 0; i < TEST_COUNT; i++) { final int currentTime = random.nextInt(Integer.MAX_VALUE); @@ -233,6 +244,8 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { addUnigramWord(binaryDictionary, "a", DUMMY_PROBABILITY); addUnigramWord(binaryDictionary, "a", DUMMY_PROBABILITY); addUnigramWord(binaryDictionary, "a", DUMMY_PROBABILITY); + addUnigramWord(binaryDictionary, "a", DUMMY_PROBABILITY); + assertTrue(binaryDictionary.isValidWord("a")); forcePassingShortTime(binaryDictionary); assertTrue(binaryDictionary.isValidWord("a")); forcePassingLongTime(binaryDictionary); @@ -257,6 +270,9 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { addUnigramWord(binaryDictionary, "a", DUMMY_PROBABILITY); addUnigramWord(binaryDictionary, "b", DUMMY_PROBABILITY); addBigramWords(binaryDictionary, "a", "b", DUMMY_PROBABILITY); + addUnigramWord(binaryDictionary, "a", DUMMY_PROBABILITY); + addUnigramWord(binaryDictionary, "b", DUMMY_PROBABILITY); + addBigramWords(binaryDictionary, "a", "b", DUMMY_PROBABILITY); assertTrue(binaryDictionary.isValidBigram("a", "b")); forcePassingShortTime(binaryDictionary); assertTrue(binaryDictionary.isValidBigram("a", "b")); @@ -307,8 +323,7 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { Integer.parseInt(binaryDictionary.getPropertyForTests( BinaryDictionary.UNIGRAM_COUNT_QUERY)); while (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) { - binaryDictionary.getPropertyForTests(SET_NEEDS_TO_DECAY_FOR_TESTING_KEY); - binaryDictionary.flushWithGC(); + forcePassingShortTime(binaryDictionary); } final int unigramCountAfterGC = Integer.parseInt(binaryDictionary.getPropertyForTests( @@ -321,6 +336,10 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { BinaryDictionary.UNIGRAM_COUNT_QUERY)) > 0); assertTrue(Integer.parseInt(binaryDictionary.getPropertyForTests( BinaryDictionary.UNIGRAM_COUNT_QUERY)) <= maxUnigramCount); + forcePassingLongTime(binaryDictionary); + assertEquals(0, Integer.parseInt(binaryDictionary.getPropertyForTests( + BinaryDictionary.UNIGRAM_COUNT_QUERY))); + } public void testAddManyBigramsToDecayingDict() { @@ -378,8 +397,7 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { Integer.parseInt(binaryDictionary.getPropertyForTests( BinaryDictionary.BIGRAM_COUNT_QUERY)); while (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) { - binaryDictionary.getPropertyForTests(SET_NEEDS_TO_DECAY_FOR_TESTING_KEY); - binaryDictionary.flushWithGC(); + forcePassingShortTime(binaryDictionary); } final int bigramCountAfterGC = Integer.parseInt(binaryDictionary.getPropertyForTests( @@ -392,5 +410,8 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { BinaryDictionary.BIGRAM_COUNT_QUERY)) > 0); assertTrue(Integer.parseInt(binaryDictionary.getPropertyForTests( BinaryDictionary.BIGRAM_COUNT_QUERY)) <= maxBigramCount); + forcePassingLongTime(binaryDictionary); + assertEquals(0, Integer.parseInt(binaryDictionary.getPropertyForTests( + BinaryDictionary.BIGRAM_COUNT_QUERY))); } }