Merge "Check the elapsed time for decaying."

This commit is contained in:
Keisuke Kuroyanagi 2013-10-04 07:16:16 +00:00 committed by Android (Google) Code Review
commit 3e5f4b53b0
12 changed files with 113 additions and 59 deletions

View file

@ -37,6 +37,8 @@ class DictionaryHeaderStructurePolicy {
virtual float getMultiWordCostMultiplier() const = 0;
virtual int getLastDecayedTime() const = 0;
virtual void readHeaderValueOrQuestionMark(const char *const key, int *outValue,
int outValueSize) const = 0;

View file

@ -360,13 +360,13 @@ int DynamicBigramListPolicy::followBigramLinkAndGetCurrentBigramPtNodePos(
}
bool DynamicBigramListPolicy::updateProbabilityForDecay(
BigramListReadWriteUtils::BigramFlags bigramFlags, const int targetPtNodePos,
const BigramListReadWriteUtils::BigramFlags bigramFlags, const int targetPtNodePos,
int *const bigramEntryPos, bool *const outRemoved) const {
*outRemoved = false;
if (mIsDecayingDict) {
// Update bigram probability for decaying.
const int newProbability = ForgettingCurveUtils::getEncodedProbabilityToSave(
BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags));
BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags), mHeaderPolicy);
if (ForgettingCurveUtils::isValidEncodedProbability(newProbability)) {
// Write new probability.
const BigramListReadWriteUtils::BigramFlags updatedBigramFlags =

View file

@ -27,6 +27,7 @@
namespace latinime {
class BufferWithExtendableBuffer;
class DictionaryHeaderStructurePolicy;
class DictionaryShortcutsStructurePolicy;
/*
@ -34,10 +35,12 @@ class DictionaryShortcutsStructurePolicy;
*/
class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy {
public:
DynamicBigramListPolicy(BufferWithExtendableBuffer *const buffer,
DynamicBigramListPolicy(const DictionaryHeaderStructurePolicy *const headerPolicy,
BufferWithExtendableBuffer *const buffer,
const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
const bool isDecayingDict)
: mBuffer(buffer), mShortcutPolicy(shortcutPolicy), mIsDecayingDict(isDecayingDict) {}
: mHeaderPolicy(headerPolicy), mBuffer(buffer), mShortcutPolicy(shortcutPolicy),
mIsDecayingDict(isDecayingDict) {}
~DynamicBigramListPolicy() {}
@ -74,6 +77,7 @@ class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy {
static const int CONTINUING_BIGRAM_LINK_COUNT_LIMIT;
static const int BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT;
const DictionaryHeaderStructurePolicy *const mHeaderPolicy;
BufferWithExtendableBuffer *const mBuffer;
const DictionaryShortcutsStructurePolicy *const mShortcutPolicy;
const bool mIsDecayingDict;
@ -81,7 +85,7 @@ class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy {
// Follow bigram link and return the position of bigram target PtNode that is currently valid.
int followBigramLinkAndGetCurrentBigramPtNodePos(const int originalBigramPos) const;
bool updateProbabilityForDecay(BigramListReadWriteUtils::BigramFlags bigramFlags,
bool updateProbabilityForDecay(const BigramListReadWriteUtils::BigramFlags bigramFlags,
const int targetPtNodePos, int *const bigramEntryPos, bool *const outRemoved) const;
};
} // namespace latinime

View file

@ -16,6 +16,7 @@
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h"
#include "suggest/core/policy/dictionary_header_structure_policy.h"
#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
namespace latinime {
@ -29,7 +30,8 @@ bool DynamicPatriciaTrieGcEventListeners
bool isUselessPtNode = !node->isTerminal();
if (node->isTerminal() && mIsDecayingDict) {
const int newProbability =
ForgettingCurveUtils::getEncodedProbabilityToSave(node->getProbability());
ForgettingCurveUtils::getEncodedProbabilityToSave(node->getProbability(),
mHeaderPolicy);
int writingPos = node->getProbabilityFieldPos();
// Update probability.
if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition(

View file

@ -29,6 +29,8 @@
namespace latinime {
class DictionaryHeaderStructurePolicy;
class DynamicPatriciaTrieGcEventListeners {
public:
// Updates all PtNodes that can be reached from the root. Checks if each PtNode is useless or
@ -38,10 +40,12 @@ class DynamicPatriciaTrieGcEventListeners {
: public DynamicPatriciaTrieReadingHelper::TraversingEventListener {
public:
TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
const DictionaryHeaderStructurePolicy *const headerPolicy,
DynamicPatriciaTrieWritingHelper *const writingHelper,
BufferWithExtendableBuffer *const buffer, const bool isDecayingDict)
: mWritingHelper(writingHelper), mBuffer(buffer), mIsDecayingDict(isDecayingDict),
mValueStack(), mChildrenValue(0), mValidUnigramCount(0) {}
: mHeaderPolicy(headerPolicy), mWritingHelper(writingHelper), mBuffer(buffer),
mIsDecayingDict(isDecayingDict), mValueStack(), mChildrenValue(0),
mValidUnigramCount(0) {}
~TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted() {};
@ -72,9 +76,10 @@ class DynamicPatriciaTrieGcEventListeners {
DISALLOW_IMPLICIT_CONSTRUCTORS(
TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted);
const DictionaryHeaderStructurePolicy *const mHeaderPolicy;
DynamicPatriciaTrieWritingHelper *const mWritingHelper;
BufferWithExtendableBuffer *const mBuffer;
const int mIsDecayingDict;
const bool mIsDecayingDict;
std::vector<int> mValueStack;
int mChildrenValue;
int mValidUnigramCount;
@ -85,7 +90,8 @@ class DynamicPatriciaTrieGcEventListeners {
class TraversePolicyToUpdateBigramProbability
: public DynamicPatriciaTrieReadingHelper::TraversingEventListener {
public:
TraversePolicyToUpdateBigramProbability(DynamicBigramListPolicy *const bigramPolicy)
TraversePolicyToUpdateBigramProbability(
DynamicBigramListPolicy *const bigramPolicy)
: mBigramPolicy(bigramPolicy), mValidBigramEntryCount(0) {}
bool onAscend() { return true; }

View file

@ -42,7 +42,6 @@ const char *const DynamicPatriciaTriePolicy::SET_NEEDS_TO_DECAY_FOR_TESTING_QUER
const int DynamicPatriciaTriePolicy::MAX_DICT_EXTENDED_REGION_SIZE = 1024 * 1024;
const int DynamicPatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS =
DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE - 1024;
const int DynamicPatriciaTriePolicy::DECAY_INTERVAL_FOR_DECAYING_DICTS = 2 * 60 * 60;
void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode,
DicNodeVector *const childDicNodes) const {
@ -314,15 +313,15 @@ void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) {
AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary.");
return;
}
const bool runGCwithDecay = needsToDecay();
DynamicBigramListPolicy bigramListPolicyForGC(&mBufferWithExtendableBuffer,
&mShortcutListPolicy, runGCwithDecay);
const bool needsToDecay = mHeaderPolicy.isDecayingDict()
&& (mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay(
false /* mindsBlockByDecay */, mUnigramCount, mBigramCount, &mHeaderPolicy));
DynamicBigramListPolicy bigramListPolicyForGC(&mHeaderPolicy, &mBufferWithExtendableBuffer,
&mShortcutListPolicy, needsToDecay);
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
&bigramListPolicyForGC, &mShortcutListPolicy, runGCwithDecay);
&bigramListPolicyForGC, &mShortcutListPolicy, needsToDecay);
writingHelper.writeToDictFileWithGC(getRootPosition(), filePath, &mHeaderPolicy);
if (runGCwithDecay) {
mNeedsToDecayForTesting = false;
}
mNeedsToDecayForTesting = false;
}
bool DynamicPatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
@ -344,16 +343,8 @@ bool DynamicPatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
// Needs to reduce dictionary size.
return true;
} else if (mHeaderPolicy.isDecayingDict()) {
if (mUnigramCount >= ForgettingCurveUtils::MAX_UNIGRAM_COUNT) {
// Unigram count exceeds the limit.
return true;
} else if (mBigramCount >= ForgettingCurveUtils::MAX_BIGRAM_COUNT) {
// Bigram count exceeds the limit.
return true;
} else if (mindsBlockByGC && needsToDecay()) {
// Time to update probabilities for decaying.
return true;
}
return mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay(
mindsBlockByGC, mUnigramCount, mBigramCount, &mHeaderPolicy);
}
return false;
}
@ -369,9 +360,4 @@ void DynamicPatriciaTriePolicy::getProperty(const char *const query, char *const
}
}
bool DynamicPatriciaTriePolicy::needsToDecay() const {
return mHeaderPolicy.isDecayingDict() && (mNeedsToDecayForTesting
|| mHeaderPolicy.getLastDecayedTime() + DECAY_INTERVAL_FOR_DECAYING_DICTS < time(0));
}
} // namespace latinime

View file

@ -37,7 +37,7 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
mBufferWithExtendableBuffer(mBuffer->getBuffer() + mHeaderPolicy.getSize(),
mBuffer->getBufferSize() - mHeaderPolicy.getSize()),
mShortcutListPolicy(&mBufferWithExtendableBuffer),
mBigramListPolicy(&mBufferWithExtendableBuffer, &mShortcutListPolicy,
mBigramListPolicy(&mHeaderPolicy, &mBufferWithExtendableBuffer, &mShortcutListPolicy,
mHeaderPolicy.isDecayingDict()),
mUnigramCount(mHeaderPolicy.getUnigramCount()),
mBigramCount(mHeaderPolicy.getBigramCount()), mNeedsToDecayForTesting(false) {}
@ -105,7 +105,6 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
static const char *const SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY;
static const int MAX_DICT_EXTENDED_REGION_SIZE;
static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS;
static const int DECAY_INTERVAL_FOR_DECAYING_DICTS;
const MmappedBuffer *const mBuffer;
const HeaderPolicy mHeaderPolicy;
@ -115,8 +114,6 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
int mUnigramCount;
int mBigramCount;
int mNeedsToDecayForTesting;
bool needsToDecay() const;
};
} // namespace latinime
#endif // LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H

View file

@ -165,7 +165,10 @@ void DynamicPatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNod
MAX_DICTIONARY_SIZE);
int unigramCount = 0;
int bigramCount = 0;
if (!runGC(rootPtNodeArrayPos, &newDictBuffer, &unigramCount, &bigramCount)) {
if (mNeedsToDecay) {
ForgettingCurveUtils::sTimeKeeper.setCurrentTime();
}
if (!runGC(rootPtNodeArrayPos, headerPolicy, &newDictBuffer, &unigramCount, &bigramCount)) {
return;
}
BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
@ -481,14 +484,14 @@ bool DynamicPatriciaTrieWritingHelper::reallocatePtNodeAndAddNewPtNodes(
}
bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
BufferWithExtendableBuffer *const bufferToWrite, int *const outUnigramCount,
int *const outBigramCount) {
const HeaderPolicy *const headerPolicy, BufferWithExtendableBuffer *const bufferToWrite,
int *const outUnigramCount, int *const outBigramCount) {
DynamicPatriciaTrieReadingHelper readingHelper(mBuffer, mBigramPolicy, mShortcutPolicy);
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
DynamicPatriciaTrieGcEventListeners
::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
this, mBuffer, mNeedsToDecay);
headerPolicy, this, mBuffer, mNeedsToDecay);
if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
&traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) {
return false;
@ -505,7 +508,6 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
&traversePolicyToUpdateBigramProbability)) {
return false;
}
if (mNeedsToDecay && traversePolicyToUpdateBigramProbability.getValidBigramEntryCount()
> ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC) {
// TODO: Remove more bigrams.
@ -524,7 +526,7 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
// Create policy instance for the GCed dictionary.
DynamicShortcutListPolicy newDictShortcutPolicy(bufferToWrite);
DynamicBigramListPolicy newDictBigramPolicy(bufferToWrite, &newDictShortcutPolicy,
DynamicBigramListPolicy newDictBigramPolicy(headerPolicy, bufferToWrite, &newDictShortcutPolicy,
mNeedsToDecay);
// Create reading helper for the GCed dictionary.
DynamicPatriciaTrieReadingHelper newDictReadingHelper(bufferToWrite, &newDictBigramPolicy,

View file

@ -128,8 +128,9 @@ class DynamicPatriciaTrieWritingHelper {
const int probabilityOfNewPtNode, const int *const newNodeCodePoints,
const int newNodeCodePointCount);
bool runGC(const int rootPtNodeArrayPos, BufferWithExtendableBuffer *const bufferToWrite,
int *const outUnigramCount, int *const outBigramCount);
bool runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy,
BufferWithExtendableBuffer *const bufferToWrite, int *const outUnigramCount,
int *const outBigramCount);
int getUpdatedProbability(const int originalProbability, const int newProbability);
};

View file

@ -15,10 +15,12 @@
*/
#include <cmath>
#include <ctime>
#include <stdlib.h>
#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
#include "suggest/core/policy/dictionary_header_structure_policy.h"
#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
namespace latinime {
@ -35,8 +37,14 @@ const int ForgettingCurveUtils::ENCODED_PROBABILITY_STEP = 1;
// Currently, we try to decay each uni/bigram once every 2 hours. Accordingly, the expected
// duration of the decay is approximately 66hours.
const float ForgettingCurveUtils::MIN_PROBABILITY_TO_DECAY = 0.03f;
const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60;
const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable;
ForgettingCurveUtils::TimeKeeper ForgettingCurveUtils::sTimeKeeper;
void ForgettingCurveUtils::TimeKeeper::setCurrentTime() {
mCurrentTime = time(0);
}
/* static */ int ForgettingCurveUtils::getProbability(const int encodedUnigramProbability,
const int encodedBigramProbability) {
@ -76,19 +84,44 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
return encodedProbability >= MIN_VALID_ENCODED_PROBABILITY;
}
/* static */ int ForgettingCurveUtils::getEncodedProbabilityToSave(const int encodedProbability) {
const int currentEncodedProbability = max(min(encodedProbability, MAX_ENCODED_PROBABILITY), 0);
/* static */ int ForgettingCurveUtils::getEncodedProbabilityToSave(const int encodedProbability,
const DictionaryHeaderStructurePolicy *const headerPolicy) {
const int elapsedTime = sTimeKeeper.peekCurrentTime() - headerPolicy->getLastDecayedTime();
const int decayIterationCount = max(elapsedTime / DECAY_INTERVAL_SECONDS, 1);
int currentEncodedProbability = max(min(encodedProbability, MAX_ENCODED_PROBABILITY), 0);
// TODO: Implement the decay in more proper way.
const float currentRate = static_cast<float>(currentEncodedProbability)
/ static_cast<float>(MAX_ENCODED_PROBABILITY);
const float thresholdToDecay = MIN_PROBABILITY_TO_DECAY
+ (1.0f - MIN_PROBABILITY_TO_DECAY) * (1.0f - currentRate);
const float randValue = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
if (thresholdToDecay < randValue) {
return max(currentEncodedProbability - ENCODED_PROBABILITY_STEP, 0);
} else {
return currentEncodedProbability;
for (int i = 0; i < decayIterationCount; ++i) {
const float currentRate = static_cast<float>(currentEncodedProbability)
/ static_cast<float>(MAX_ENCODED_PROBABILITY);
const float thresholdToDecay = MIN_PROBABILITY_TO_DECAY
+ (1.0f - MIN_PROBABILITY_TO_DECAY) * currentRate;
const float randValue = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
if (thresholdToDecay < randValue) {
currentEncodedProbability = max(currentEncodedProbability - ENCODED_PROBABILITY_STEP,
0);
}
}
return currentEncodedProbability;
}
/* static */ bool ForgettingCurveUtils::needsToDecay(const bool mindsBlockByDecay,
const int unigramCount, const int bigramCount,
const DictionaryHeaderStructurePolicy *const headerPolicy) {
if (unigramCount >= ForgettingCurveUtils::MAX_UNIGRAM_COUNT) {
// Unigram count exceeds the limit.
return true;
} else if (bigramCount >= ForgettingCurveUtils::MAX_BIGRAM_COUNT) {
// Bigram count exceeds the limit.
return true;
}
if (mindsBlockByDecay) {
return false;
}
if (headerPolicy->getLastDecayedTime() + DECAY_INTERVAL_SECONDS < time(0)) {
// Time to decay.
return true;
}
return false;
}
/* static */ int ForgettingCurveUtils::decodeProbability(const int encodedProbability) {

View file

@ -23,16 +23,32 @@
namespace latinime {
class DictionaryHeaderStructurePolicy;
// TODO: Check the elapsed time and decrease the probability depending on the time. Time field is
// required to introduced to each terminal PtNode and bigram entry.
// TODO: Quit using bigram probability to indicate the delta.
class ForgettingCurveUtils {
public:
class TimeKeeper {
public:
TimeKeeper() : mCurrentTime(0) {}
void setCurrentTime();
int peekCurrentTime() const { return mCurrentTime; };
private:
DISALLOW_COPY_AND_ASSIGN(TimeKeeper);
int mCurrentTime;
};
static const int MAX_UNIGRAM_COUNT;
static const int MAX_UNIGRAM_COUNT_AFTER_GC;
static const int MAX_BIGRAM_COUNT;
static const int MAX_BIGRAM_COUNT_AFTER_GC;
static TimeKeeper sTimeKeeper;
static int getProbability(const int encodedUnigramProbability,
const int encodedBigramProbability);
@ -41,7 +57,11 @@ class ForgettingCurveUtils {
static int isValidEncodedProbability(const int encodedProbability);
static int getEncodedProbabilityToSave(const int encodedProbability);
static int getEncodedProbabilityToSave(const int encodedProbability,
const DictionaryHeaderStructurePolicy *const headerPolicy);
static bool needsToDecay(const bool mindsBlockByDecay, const int unigramCount,
const int bigramCount, const DictionaryHeaderStructurePolicy *const headerPolicy);
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(ForgettingCurveUtils);
@ -68,6 +88,7 @@ class ForgettingCurveUtils {
static const int MIN_VALID_ENCODED_PROBABILITY;
static const int ENCODED_PROBABILITY_STEP;
static const float MIN_PROBABILITY_TO_DECAY;
static const int DECAY_INTERVAL_SECONDS;
static const ProbabilityTable sProbabilityTable;

View file

@ -50,8 +50,8 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
}
private void forcePassingShortTime(final BinaryDictionary binaryDictionary) {
// Entries having low probability would be suppressed once in 2 GCs.
final int count = 2;
// Entries having low probability would be suppressed once in 3 GCs.
final int count = 3;
for (int i = 0; i < count; i++) {
binaryDictionary.getPropertyForTests(SET_NEEDS_TO_DECAY_FOR_TESTING_KEY);
binaryDictionary.flushWithGC();