am 21ce9c4a
: Merge "Stochastic decay."
* commit '21ce9c4a89f90593e54ae29670ebd09a14533665': Stochastic decay.
This commit is contained in:
commit
879ae3aa92
6 changed files with 77 additions and 95 deletions
|
@ -43,7 +43,7 @@ void DynamicBigramListPolicy::getNextBigram(int *const outBigramPos, int *const
|
|||
}
|
||||
*outProbability = BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags);
|
||||
*outHasNext = BigramListReadWriteUtils::hasNext(bigramFlags);
|
||||
if (mIsDecayingDict && !ForgettingCurveUtils::isValidBigram(*outProbability)) {
|
||||
if (mIsDecayingDict && !ForgettingCurveUtils::isValidEncodedProbability(*outProbability)) {
|
||||
// This bigram is too weak to output.
|
||||
*outBigramPos = NOT_A_DICT_POS;
|
||||
} else {
|
||||
|
@ -261,8 +261,8 @@ bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTarg
|
|||
const int originalProbability = BigramListReadWriteUtils::getProbabilityFromFlags(
|
||||
bigramFlags);
|
||||
const int probabilityToWrite = mIsDecayingDict ?
|
||||
ForgettingCurveUtils::getUpdatedBigramProbabilityDelta(
|
||||
originalProbability, probability) : probability;
|
||||
ForgettingCurveUtils::getUpdatedEncodedProbability(originalProbability,
|
||||
probability) : probability;
|
||||
const BigramListReadWriteUtils::BigramFlags updatedFlags =
|
||||
BigramListReadWriteUtils::setProbabilityInFlags(bigramFlags,
|
||||
probabilityToWrite);
|
||||
|
@ -294,7 +294,7 @@ bool DynamicBigramListPolicy::writeNewBigramEntry(const int bigramTargetPos, con
|
|||
int *const writingPos) {
|
||||
// hasNext is false because we are adding a new bigram entry at the end of the bigram list.
|
||||
const int probabilityToWrite = mIsDecayingDict ?
|
||||
ForgettingCurveUtils::getUpdatedBigramProbabilityDelta(NOT_A_PROBABILITY, probability) :
|
||||
ForgettingCurveUtils::getUpdatedEncodedProbability(NOT_A_PROBABILITY, probability) :
|
||||
probability;
|
||||
return BigramListReadWriteUtils::createAndWriteBigramEntry(mBuffer, bigramTargetPos,
|
||||
probabilityToWrite, false /* hasNext */, writingPos);
|
||||
|
@ -365,9 +365,9 @@ bool DynamicBigramListPolicy::updateProbabilityForDecay(
|
|||
*outRemoved = false;
|
||||
if (mIsDecayingDict) {
|
||||
// Update bigram probability for decaying.
|
||||
const int newProbability = ForgettingCurveUtils::getBigramProbabilityDeltaToSave(
|
||||
const int newProbability = ForgettingCurveUtils::getEncodedProbabilityToSave(
|
||||
BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags));
|
||||
if (ForgettingCurveUtils::isValidBigram(newProbability)) {
|
||||
if (ForgettingCurveUtils::isValidEncodedProbability(newProbability)) {
|
||||
// Write new probability.
|
||||
const BigramListReadWriteUtils::BigramFlags updatedBigramFlags =
|
||||
BigramListReadWriteUtils::setProbabilityInFlags(
|
||||
|
|
|
@ -29,14 +29,14 @@ bool DynamicPatriciaTrieGcEventListeners
|
|||
bool isUselessPtNode = !node->isTerminal();
|
||||
if (node->isTerminal() && mIsDecayingDict) {
|
||||
const int newProbability =
|
||||
ForgettingCurveUtils::getUnigramProbabilityToSave(node->getProbability());
|
||||
ForgettingCurveUtils::getEncodedProbabilityToSave(node->getProbability());
|
||||
int writingPos = node->getProbabilityFieldPos();
|
||||
// Update probability.
|
||||
if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition(
|
||||
mBuffer, newProbability, &writingPos)) {
|
||||
return false;
|
||||
}
|
||||
if (!ForgettingCurveUtils::isValidUnigram(newProbability)) {
|
||||
if (!ForgettingCurveUtils::isValidEncodedProbability(newProbability)) {
|
||||
isUselessPtNode = false;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -545,7 +545,7 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
|
|||
int DynamicPatriciaTrieWritingHelper::getUpdatedProbability(const int originalProbability,
|
||||
const int newProbability) {
|
||||
if (mNeedsToDecay) {
|
||||
return ForgettingCurveUtils::getUpdatedUnigramProbability(originalProbability,
|
||||
return ForgettingCurveUtils::getUpdatedEncodedProbability(originalProbability,
|
||||
newProbability);
|
||||
} else {
|
||||
return newProbability;
|
||||
|
|
|
@ -14,6 +14,8 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
|
||||
|
||||
#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
|
||||
|
@ -26,106 +28,91 @@ const int ForgettingCurveUtils::MAX_BIGRAM_COUNT = 12000;
|
|||
const int ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC = 10000;
|
||||
|
||||
const int ForgettingCurveUtils::MAX_COMPUTED_PROBABILITY = 127;
|
||||
const int ForgettingCurveUtils::MAX_UNIGRAM_PROBABILITY = 120;
|
||||
const int ForgettingCurveUtils::MIN_VALID_UNIGRAM_PROBABILITY = 24;
|
||||
const int ForgettingCurveUtils::UNIGRAM_PROBABILITY_STEP = 8;
|
||||
const int ForgettingCurveUtils::MAX_BIGRAM_PROBABILITY_DELTA = 15;
|
||||
const int ForgettingCurveUtils::MIN_VALID_BIGRAM_PROBABILITY_DELTA = 3;
|
||||
const int ForgettingCurveUtils::BIGRAM_PROBABILITY_DELTA_STEP = 1;
|
||||
const int ForgettingCurveUtils::MAX_ENCODED_PROBABILITY = 15;
|
||||
const int ForgettingCurveUtils::MIN_VALID_ENCODED_PROBABILITY = 3;
|
||||
const int ForgettingCurveUtils::ENCODED_PROBABILITY_STEP = 1;
|
||||
// Currently, we try to decay each uni/bigram once every 2 hours. Accordingly, the expected
|
||||
// duration of the decay is approximately 66hours.
|
||||
const float ForgettingCurveUtils::MIN_PROBABILITY_TO_DECAY = 0.03f;
|
||||
|
||||
/* static */ int ForgettingCurveUtils::getProbability(const int encodedUnigramProbability,
|
||||
const int encodedBigramProbabilityDelta) {
|
||||
const int encodedBigramProbability) {
|
||||
if (encodedUnigramProbability == NOT_A_PROBABILITY) {
|
||||
return NOT_A_PROBABILITY;
|
||||
} else if (encodedBigramProbabilityDelta == NOT_A_PROBABILITY) {
|
||||
const int rawProbability = ProbabilityUtils::backoff(decodeUnigramProbability(
|
||||
encodedUnigramProbability));
|
||||
return min(getDecayedProbability(rawProbability), MAX_COMPUTED_PROBABILITY);
|
||||
} else if (encodedBigramProbability == NOT_A_PROBABILITY) {
|
||||
return backoff(decodeUnigramProbability(encodedUnigramProbability));
|
||||
} else {
|
||||
const int rawProbability = ProbabilityUtils::computeProbabilityForBigram(
|
||||
decodeUnigramProbability(encodedUnigramProbability),
|
||||
decodeBigramProbabilityDelta(encodedBigramProbabilityDelta));
|
||||
return min(getDecayedProbability(rawProbability), MAX_COMPUTED_PROBABILITY);
|
||||
const int unigramProbability = decodeUnigramProbability(encodedUnigramProbability);
|
||||
const int bigramProbability = decodeBigramProbability(encodedBigramProbability);
|
||||
return min(max(unigramProbability, bigramProbability), MAX_COMPUTED_PROBABILITY);
|
||||
}
|
||||
}
|
||||
|
||||
/* static */ int ForgettingCurveUtils::getUpdatedUnigramProbability(
|
||||
// Caveat: Unlike getProbability(), this method doesn't assume special bigram probability encoding
|
||||
// (i.e. unigram probability + bigram probability delta).
|
||||
/* static */ int ForgettingCurveUtils::getUpdatedEncodedProbability(
|
||||
const int originalEncodedProbability, const int newProbability) {
|
||||
if (originalEncodedProbability == NOT_A_PROBABILITY) {
|
||||
// The unigram is not in this dictionary.
|
||||
if (newProbability == NOT_A_PROBABILITY) {
|
||||
// The unigram is not in other dictionaries.
|
||||
return 0;
|
||||
} else {
|
||||
return MIN_VALID_UNIGRAM_PROBABILITY;
|
||||
}
|
||||
} else {
|
||||
if (newProbability != NOT_A_PROBABILITY
|
||||
&& originalEncodedProbability < MIN_VALID_UNIGRAM_PROBABILITY) {
|
||||
return MIN_VALID_UNIGRAM_PROBABILITY;
|
||||
}
|
||||
return min(originalEncodedProbability + UNIGRAM_PROBABILITY_STEP, MAX_UNIGRAM_PROBABILITY);
|
||||
}
|
||||
}
|
||||
|
||||
/* static */ int ForgettingCurveUtils::getUnigramProbabilityToSave(const int encodedProbability) {
|
||||
return max(encodedProbability - UNIGRAM_PROBABILITY_STEP, 0);
|
||||
}
|
||||
|
||||
/* static */ int ForgettingCurveUtils::getBigramProbabilityDeltaToSave(
|
||||
const int encodedProbabilityDelta) {
|
||||
return max(encodedProbabilityDelta - BIGRAM_PROBABILITY_DELTA_STEP, 0);
|
||||
}
|
||||
|
||||
/* static */ int ForgettingCurveUtils::getUpdatedBigramProbabilityDelta(
|
||||
const int originalEncodedProbabilityDelta, const int newProbability) {
|
||||
if (originalEncodedProbabilityDelta == NOT_A_PROBABILITY) {
|
||||
// The bigram relation is not in this dictionary.
|
||||
if (newProbability == NOT_A_PROBABILITY) {
|
||||
// The bigram target is not in other dictionaries.
|
||||
return 0;
|
||||
} else {
|
||||
return MIN_VALID_BIGRAM_PROBABILITY_DELTA;
|
||||
return MIN_VALID_ENCODED_PROBABILITY;
|
||||
}
|
||||
} else {
|
||||
if (newProbability != NOT_A_PROBABILITY
|
||||
&& originalEncodedProbabilityDelta < MIN_VALID_BIGRAM_PROBABILITY_DELTA) {
|
||||
return MIN_VALID_BIGRAM_PROBABILITY_DELTA;
|
||||
&& originalEncodedProbability < MIN_VALID_ENCODED_PROBABILITY) {
|
||||
return MIN_VALID_ENCODED_PROBABILITY;
|
||||
}
|
||||
return min(originalEncodedProbabilityDelta + BIGRAM_PROBABILITY_DELTA_STEP,
|
||||
MAX_BIGRAM_PROBABILITY_DELTA);
|
||||
return min(originalEncodedProbability + ENCODED_PROBABILITY_STEP, MAX_ENCODED_PROBABILITY);
|
||||
}
|
||||
}
|
||||
|
||||
/* static */ int ForgettingCurveUtils::isValidUnigram(const int encodedUnigramProbability) {
|
||||
return encodedUnigramProbability >= MIN_VALID_UNIGRAM_PROBABILITY;
|
||||
/* static */ int ForgettingCurveUtils::isValidEncodedProbability(const int encodedProbability) {
|
||||
return encodedProbability >= MIN_VALID_ENCODED_PROBABILITY;
|
||||
}
|
||||
|
||||
/* static */ int ForgettingCurveUtils::isValidBigram(const int encodedBigramProbabilityDelta) {
|
||||
return encodedBigramProbabilityDelta >= MIN_VALID_BIGRAM_PROBABILITY_DELTA;
|
||||
/* static */ int ForgettingCurveUtils::getEncodedProbabilityToSave(const int encodedProbability) {
|
||||
const int currentEncodedProbability = max(min(encodedProbability, MAX_ENCODED_PROBABILITY), 0);
|
||||
// TODO: Implement the decay in more proper way.
|
||||
const float currentRate = static_cast<float>(currentEncodedProbability)
|
||||
/ static_cast<float>(MAX_ENCODED_PROBABILITY);
|
||||
const float thresholdToDecay = MIN_PROBABILITY_TO_DECAY
|
||||
+ (1.0f - MIN_PROBABILITY_TO_DECAY) * (1.0f - currentRate);
|
||||
const float randValue = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
|
||||
if (thresholdToDecay < randValue) {
|
||||
return max(currentEncodedProbability - ENCODED_PROBABILITY_STEP, 0);
|
||||
} else {
|
||||
return currentEncodedProbability;
|
||||
}
|
||||
}
|
||||
|
||||
/* static */ int ForgettingCurveUtils::decodeUnigramProbability(const int encodedProbability) {
|
||||
const int probability = encodedProbability - MIN_VALID_UNIGRAM_PROBABILITY;
|
||||
const int probability = encodedProbability - MIN_VALID_ENCODED_PROBABILITY;
|
||||
if (probability < 0) {
|
||||
return NOT_A_PROBABILITY;
|
||||
} else {
|
||||
return min(probability, MAX_UNIGRAM_PROBABILITY);
|
||||
return min(probability, MAX_ENCODED_PROBABILITY) * 8;
|
||||
}
|
||||
}
|
||||
|
||||
/* static */ int ForgettingCurveUtils::decodeBigramProbabilityDelta(
|
||||
const int encodedProbabilityDelta) {
|
||||
const int probabilityDelta = encodedProbabilityDelta - MIN_VALID_BIGRAM_PROBABILITY_DELTA;
|
||||
if (probabilityDelta < 0) {
|
||||
/* static */ int ForgettingCurveUtils::decodeBigramProbability(const int encodedProbability) {
|
||||
const int probability = encodedProbability - MIN_VALID_ENCODED_PROBABILITY;
|
||||
if (probability < 0) {
|
||||
return NOT_A_PROBABILITY;
|
||||
} else {
|
||||
return min(probabilityDelta, MAX_BIGRAM_PROBABILITY_DELTA);
|
||||
return min(probability, MAX_ENCODED_PROBABILITY) * 8;
|
||||
}
|
||||
}
|
||||
|
||||
/* static */ int ForgettingCurveUtils::getDecayedProbability(const int rawProbability) {
|
||||
return rawProbability;
|
||||
// See comments in ProbabilityUtils::backoff().
|
||||
/* static */ int ForgettingCurveUtils::backoff(const int unigramProbability) {
|
||||
if (unigramProbability == NOT_A_PROBABILITY) {
|
||||
return NOT_A_PROBABILITY;
|
||||
} else {
|
||||
return max(unigramProbability - 8, 0);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace latinime
|
||||
|
|
|
@ -24,7 +24,6 @@ namespace latinime {
|
|||
// TODO: Check the elapsed time and decrease the probability depending on the time. Time field is
|
||||
// required to introduced to each terminal PtNode and bigram entry.
|
||||
// TODO: Quit using bigram probability to indicate the delta.
|
||||
// TODO: Quit using bigram probability delta.
|
||||
class ForgettingCurveUtils {
|
||||
public:
|
||||
static const int MAX_UNIGRAM_COUNT;
|
||||
|
@ -33,38 +32,30 @@ class ForgettingCurveUtils {
|
|||
static const int MAX_BIGRAM_COUNT_AFTER_GC;
|
||||
|
||||
static int getProbability(const int encodedUnigramProbability,
|
||||
const int encodedBigramProbabilityDelta);
|
||||
const int encodedBigramProbability);
|
||||
|
||||
static int getUpdatedUnigramProbability(const int originalEncodedProbability,
|
||||
static int getUpdatedEncodedProbability(const int originalEncodedProbability,
|
||||
const int newProbability);
|
||||
|
||||
static int getUpdatedBigramProbabilityDelta(const int originalEncodedProbabilityDelta,
|
||||
const int newProbability);
|
||||
static int isValidEncodedProbability(const int encodedProbability);
|
||||
|
||||
static int isValidUnigram(const int encodedUnigramProbability);
|
||||
|
||||
static int isValidBigram(const int encodedProbabilityDelta);
|
||||
|
||||
static int getUnigramProbabilityToSave(const int encodedProbability);
|
||||
|
||||
static int getBigramProbabilityDeltaToSave(const int encodedProbabilityDelta);
|
||||
static int getEncodedProbabilityToSave(const int encodedProbability);
|
||||
|
||||
private:
|
||||
DISALLOW_IMPLICIT_CONSTRUCTORS(ForgettingCurveUtils);
|
||||
|
||||
static const int MAX_COMPUTED_PROBABILITY;
|
||||
static const int MAX_UNIGRAM_PROBABILITY;
|
||||
static const int MIN_VALID_UNIGRAM_PROBABILITY;
|
||||
static const int UNIGRAM_PROBABILITY_STEP;
|
||||
static const int MAX_BIGRAM_PROBABILITY_DELTA;
|
||||
static const int MIN_VALID_BIGRAM_PROBABILITY_DELTA;
|
||||
static const int BIGRAM_PROBABILITY_DELTA_STEP;
|
||||
static const int MAX_ENCODED_PROBABILITY;
|
||||
static const int MIN_VALID_ENCODED_PROBABILITY;
|
||||
static const int ENCODED_PROBABILITY_STEP;
|
||||
|
||||
static const float MIN_PROBABILITY_TO_DECAY;
|
||||
|
||||
static int decodeUnigramProbability(const int encodedProbability);
|
||||
|
||||
static int decodeBigramProbabilityDelta(const int encodedProbability);
|
||||
static int decodeBigramProbability(const int encodedProbability);
|
||||
|
||||
static int getDecayedProbability(const int rawProbability);
|
||||
static int backoff(const int unigramProbability);
|
||||
};
|
||||
} // namespace latinime
|
||||
#endif /* LATINIME_FORGETTING_CURVE_UTILS_H */
|
||||
|
|
|
@ -50,14 +50,18 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
|
|||
}
|
||||
|
||||
private void forcePassingShortTime(final BinaryDictionary binaryDictionary) {
|
||||
// Entries having low probability would be suppressed once in 2 GCs.
|
||||
final int count = 2;
|
||||
for (int i = 0; i < count; i++) {
|
||||
binaryDictionary.getPropertyForTests(SET_NEEDS_TO_DECAY_FOR_TESTING_KEY);
|
||||
binaryDictionary.flushWithGC();
|
||||
}
|
||||
}
|
||||
|
||||
private void forcePassingLongTime(final BinaryDictionary binaryDictionary) {
|
||||
// Currently, probabilities are decayed when GC is run. All entries that have never been
|
||||
// typed in 32 GCs are removed.
|
||||
final int count = 32;
|
||||
// typed in 128 GCs would be removed.
|
||||
final int count = 128;
|
||||
for (int i = 0; i < count; i++) {
|
||||
binaryDictionary.getPropertyForTests(SET_NEEDS_TO_DECAY_FOR_TESTING_KEY);
|
||||
binaryDictionary.flushWithGC();
|
||||
|
|
Loading…
Reference in a new issue