From fd02b2d6ee55d4aee7faab89a7a2b72764eafc47 Mon Sep 17 00:00:00 2001 From: Keisuke Kuroyanagi Date: Mon, 30 Sep 2013 20:53:35 +0900 Subject: [PATCH] Implement simple dictionary decay. Groundwork and implement simple decay. Increment probability when typed and decrement probability at GC. Bug: 6669677 Change-Id: Ib12caead0cbeef4ce7808fe8ac0b00ee331523fe --- ...ecayingExpandableBinaryDictionaryBase.java | 11 +- native/jni/Android.mk | 1 + .../bigram/dynamic_bigram_list_policy.cpp | 63 ++++++- .../bigram/dynamic_bigram_list_policy.h | 10 +- ...namic_patricia_trie_gc_event_listeners.cpp | 15 ++ ...dynamic_patricia_trie_gc_event_listeners.h | 7 +- .../dynamic_patricia_trie_policy.cpp | 77 ++++++-- .../dictionary/dynamic_patricia_trie_policy.h | 6 +- .../dynamic_patricia_trie_writing_helper.cpp | 40 +++- .../dynamic_patricia_trie_writing_helper.h | 11 +- .../dictionary/utils/decaying_utils.cpp | 129 +++++++++++++ .../dictionary/utils/decaying_utils.h | 70 +++++++ .../latin/BinaryDictionaryDecayingTests.java | 171 ++++++++++++++++++ 13 files changed, 572 insertions(+), 39 deletions(-) create mode 100644 native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.cpp create mode 100644 native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.h create mode 100644 tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java diff --git a/java/src/com/android/inputmethod/latin/personalization/DecayingExpandableBinaryDictionaryBase.java b/java/src/com/android/inputmethod/latin/personalization/DecayingExpandableBinaryDictionaryBase.java index 1ca91a0fd..7cf4f0c88 100644 --- a/java/src/com/android/inputmethod/latin/personalization/DecayingExpandableBinaryDictionaryBase.java +++ b/java/src/com/android/inputmethod/latin/personalization/DecayingExpandableBinaryDictionaryBase.java @@ -22,6 +22,7 @@ import android.util.Log; import com.android.inputmethod.annotations.UsedForTesting; import com.android.inputmethod.latin.Constants; +import com.android.inputmethod.latin.Dictionary; import com.android.inputmethod.latin.ExpandableBinaryDictionary; import com.android.inputmethod.latin.LatinImeLogger; import com.android.inputmethod.latin.makedict.DictDecoder; @@ -50,6 +51,9 @@ public abstract class DecayingExpandableBinaryDictionaryBase extends ExpandableB /** Any pair being typed or picked */ public static final int FREQUENCY_FOR_TYPED = 2; + public static final int FREQUENCY_FOR_WORDS_IN_DICTS = FREQUENCY_FOR_TYPED; + public static final int FREQUENCY_FOR_WORDS_NOT_IN_DICTS = Dictionary.NOT_A_PROBABILITY; + /** Locale for which this user history dictionary is storing words */ private final String mLocale; @@ -131,14 +135,17 @@ public abstract class DecayingExpandableBinaryDictionaryBase extends ExpandableB (word0 != null && word0.length() >= Constants.DICTIONARY_MAX_WORD_LENGTH)) { return; } - addWordDynamically(word1, null /* the "shortcut" parameter is null */, FREQUENCY_FOR_TYPED, + final int frequency = ENABLE_BINARY_DICTIONARY_DYNAMIC_UPDATE ? + (isValid ? FREQUENCY_FOR_WORDS_IN_DICTS : FREQUENCY_FOR_WORDS_NOT_IN_DICTS) : + FREQUENCY_FOR_TYPED; + addWordDynamically(word1, null /* the "shortcut" parameter is null */, frequency, false /* isNotAWord */); // Do not insert a word as a bigram of itself if (word1.equals(word0)) { return; } if (null != word0) { - addBigramDynamically(word0, word1, FREQUENCY_FOR_TYPED, isValid); + addBigramDynamically(word0, word1, frequency, isValid); } } diff --git a/native/jni/Android.mk b/native/jni/Android.mk index 0594ddff0..36afea54b 100644 --- a/native/jni/Android.mk +++ b/native/jni/Android.mk @@ -85,6 +85,7 @@ LATIN_IME_CORE_SRC_FILES := \ $(addprefix suggest/policyimpl/dictionary/utils/, \ buffer_with_extendable_buffer.cpp \ byte_array_utils.cpp \ + decaying_utils.cpp \ dict_file_writing_utils.cpp \ format_utils.cpp) \ suggest/policyimpl/gesture/gesture_suggest_policy_factory.cpp \ diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp index e02f4cbf1..67a085de3 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp @@ -17,10 +17,10 @@ #include "suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h" #include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" -#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" #include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/decaying_utils.h" namespace latinime { @@ -41,9 +41,14 @@ void DynamicBigramListPolicy::getNextBigram(int *const outBigramPos, int *const if (usesAdditionalBuffer && originalBigramPos != NOT_A_DICT_POS) { originalBigramPos += mBuffer->getOriginalBufferSize(); } - *outBigramPos = followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos); *outProbability = BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags); *outHasNext = BigramListReadWriteUtils::hasNext(bigramFlags); + if (mIsDecayingDict && !DecayingUtils::isValidBigram(*outProbability)) { + // This bigram is too weak to output. + *outBigramPos = NOT_A_DICT_POS; + } else { + *outBigramPos = followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos); + } if (usesAdditionalBuffer) { *bigramEntryPos += mBuffer->getOriginalBufferSize(); } @@ -153,15 +158,21 @@ bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries( const int bigramTargetNodePos = followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos); nodeReader.fetchNodeInfoInBufferFromPtNodePos(bigramTargetNodePos); - // TODO: Update probability for supporting probability decaying. if (nodeReader.isDeleted() || !nodeReader.isTerminal() || bigramTargetNodePos == NOT_A_DICT_POS) { // The target is no longer valid terminal. Invalidate the current bigram entry. if (!BigramListReadWriteUtils::writeBigramEntry(mBuffer, bigramFlags, - NOT_A_DICT_POS /* targetOffset */, &bigramEntryPos)) { + NOT_A_DICT_POS /* targetPtNodePos */, &bigramEntryPos)) { return false; } - } else { + continue; + } + bool isRemoved = false; + if (!updateProbabilityForDecay(bigramFlags, bigramTargetNodePos, &bigramEntryPos, + &isRemoved)) { + return false; + } + if (!isRemoved) { (*outValidBigramEntryCount) += 1; } } while(BigramListReadWriteUtils::hasNext(bigramFlags)); @@ -247,8 +258,14 @@ bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTarg if (followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos) == bigramTargetPos) { // Update this bigram entry. *outAddedNewBigram = false; + const int originalProbability = BigramListReadWriteUtils::getProbabilityFromFlags( + bigramFlags); + const int probabilityToWrite = mIsDecayingDict ? + DecayingUtils::getUpdatedBigramProbabilityDelta( + originalProbability, probability) : probability; const BigramListReadWriteUtils::BigramFlags updatedFlags = - BigramListReadWriteUtils::setProbabilityInFlags(bigramFlags, probability); + BigramListReadWriteUtils::setProbabilityInFlags(bigramFlags, + probabilityToWrite); return BigramListReadWriteUtils::writeBigramEntry(mBuffer, updatedFlags, originalBigramPos, &entryPos); } @@ -276,8 +293,11 @@ bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTarg bool DynamicBigramListPolicy::writeNewBigramEntry(const int bigramTargetPos, const int probability, int *const writingPos) { // hasNext is false because we are adding a new bigram entry at the end of the bigram list. + const int probabilityToWrite = mIsDecayingDict ? + DecayingUtils::getUpdatedBigramProbabilityDelta(NOT_A_PROBABILITY, probability) : + probability; return BigramListReadWriteUtils::createAndWriteBigramEntry(mBuffer, bigramTargetPos, - probability, false /* hasNext */, writingPos); + probabilityToWrite, false /* hasNext */, writingPos); } bool DynamicBigramListPolicy::removeBigram(const int bigramListPos, const int bigramTargetPos) { @@ -339,4 +359,33 @@ int DynamicBigramListPolicy::followBigramLinkAndGetCurrentBigramPtNodePos( return currentPos; } +bool DynamicBigramListPolicy::updateProbabilityForDecay( + BigramListReadWriteUtils::BigramFlags bigramFlags, const int targetPtNodePos, + int *const bigramEntryPos, bool *const outRemoved) const { + *outRemoved = false; + if (mIsDecayingDict) { + // Update bigram probability for decaying. + const int newProbability = DecayingUtils::getBigramProbabilityDeltaToSave( + BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags)); + if (DecayingUtils::isValidBigram(newProbability)) { + // Write new probability. + const BigramListReadWriteUtils::BigramFlags updatedBigramFlags = + BigramListReadWriteUtils::setProbabilityInFlags( + bigramFlags, newProbability); + if (!BigramListReadWriteUtils::writeBigramEntry(mBuffer, updatedBigramFlags, + targetPtNodePos, bigramEntryPos)) { + return false; + } + } else { + // Remove current bigram entry. + *outRemoved = true; + if (!BigramListReadWriteUtils::writeBigramEntry(mBuffer, bigramFlags, + NOT_A_DICT_POS /* targetPtNodePos */, bigramEntryPos)) { + return false; + } + } + } + return true; +} + } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h index 3ebf69946..b358b4ed5 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h @@ -21,6 +21,7 @@ #include "defines.h" #include "suggest/core/policy/dictionary_bigrams_structure_policy.h" +#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" namespace latinime { @@ -34,8 +35,9 @@ class DictionaryShortcutsStructurePolicy; class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy { public: DynamicBigramListPolicy(BufferWithExtendableBuffer *const buffer, - const DictionaryShortcutsStructurePolicy *const shortcutPolicy) - : mBuffer(buffer), mShortcutPolicy(shortcutPolicy) {} + const DictionaryShortcutsStructurePolicy *const shortcutPolicy, + const bool isDecayingDict) + : mBuffer(buffer), mShortcutPolicy(shortcutPolicy), mIsDecayingDict(isDecayingDict) {} ~DynamicBigramListPolicy() {} @@ -74,9 +76,13 @@ class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy { BufferWithExtendableBuffer *const mBuffer; const DictionaryShortcutsStructurePolicy *const mShortcutPolicy; + const bool mIsDecayingDict; // Follow bigram link and return the position of bigram target PtNode that is currently valid. int followBigramLinkAndGetCurrentBigramPtNodePos(const int originalBigramPos) const; + + bool updateProbabilityForDecay(BigramListReadWriteUtils::BigramFlags bigramFlags, + const int targetPtNodePos, int *const bigramEntryPos, bool *const outRemoved) const; }; } // namespace latinime #endif // LATINIME_DYNAMIC_BIGRAM_LIST_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp index 5f755c302..081163a4d 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp @@ -16,6 +16,8 @@ #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h" +#include "suggest/policyimpl/dictionary/utils/decaying_utils.h" + namespace latinime { bool DynamicPatriciaTrieGcEventListeners @@ -25,6 +27,19 @@ bool DynamicPatriciaTrieGcEventListeners // PtNode is useless when the PtNode is not a terminal and doesn't have any not useless // children. bool isUselessPtNode = !node->isTerminal(); + if (node->isTerminal() && mIsDecayingDict) { + const int newProbability = + DecayingUtils::getUnigramProbabilityToSave(node->getProbability()); + int writingPos = node->getProbabilityFieldPos(); + // Update probability. + if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition( + mBuffer, newProbability, &writingPos)) { + return false; + } + if (!DecayingUtils::isValidUnigram(newProbability)) { + isUselessPtNode = false; + } + } if (mChildrenValue > 0) { isUselessPtNode = false; } else if (node->isTerminal()) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h index 301998882..463715af5 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h @@ -39,9 +39,9 @@ class DynamicPatriciaTrieGcEventListeners { public: TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( DynamicPatriciaTrieWritingHelper *const writingHelper, - BufferWithExtendableBuffer *const buffer) - : mWritingHelper(writingHelper), mBuffer(buffer), mValueStack(), - mChildrenValue(0), mValidUnigramCount(0) {} + BufferWithExtendableBuffer *const buffer, const bool isDecayingDict) + : mWritingHelper(writingHelper), mBuffer(buffer), mIsDecayingDict(isDecayingDict), + mValueStack(), mChildrenValue(0), mValidUnigramCount(0) {} ~TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted() {}; @@ -74,6 +74,7 @@ class DynamicPatriciaTrieGcEventListeners { DynamicPatriciaTrieWritingHelper *const mWritingHelper; BufferWithExtendableBuffer *const mBuffer; + const int mIsDecayingDict; std::vector mValueStack; int mChildrenValue; int mValidUnigramCount; diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp index 8c0890e2e..0d8c92768 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp @@ -18,6 +18,7 @@ #include #include +#include #include "defines.h" #include "suggest/core/dicnode/dic_node.h" @@ -27,12 +28,17 @@ #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" #include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/utils/decaying_utils.h" #include "suggest/policyimpl/dictionary/utils/probability_utils.h" namespace latinime { const char *const DynamicPatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; const char *const DynamicPatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; +const int DynamicPatriciaTriePolicy::MAX_DICT_EXTENDED_REGION_SIZE = 1024 * 1024; +const int DynamicPatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS = + DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE - 1024; +const int DynamicPatriciaTriePolicy::MIN_SECONDS_TO_REQUIRE_GC_WHEN_WRITING = 2 * 60 * 60; void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode, DicNodeVector *const childDicNodes) const { @@ -143,14 +149,17 @@ int DynamicPatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const in int DynamicPatriciaTriePolicy::getProbability(const int unigramProbability, const int bigramProbability) const { - // TODO: check mHeaderPolicy.usesForgettingCurve(); - if (unigramProbability == NOT_A_PROBABILITY) { - return NOT_A_PROBABILITY; - } else if (bigramProbability == NOT_A_PROBABILITY) { - return ProbabilityUtils::backoff(unigramProbability); + if (mHeaderPolicy.isDecayingDict()) { + return DecayingUtils::getProbability(unigramProbability, bigramProbability); } else { - return ProbabilityUtils::computeProbabilityForBigram(unigramProbability, - bigramProbability); + if (unigramProbability == NOT_A_PROBABILITY) { + return NOT_A_PROBABILITY; + } else if (bigramProbability == NOT_A_PROBABILITY) { + return ProbabilityUtils::backoff(unigramProbability); + } else { + return ProbabilityUtils::computeProbabilityForBigram(unigramProbability, + bigramProbability); + } } } @@ -199,11 +208,16 @@ bool DynamicPatriciaTriePolicy::addUnigramWord(const int *const word, const int AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary."); return false; } + if (mBufferWithExtendableBuffer.getTailPosition() + >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update."); + return false; + } DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); readingHelper.initWithPtNodeArrayPos(getRootPosition()); DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, - &mBigramListPolicy, &mShortcutListPolicy); + &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict()); bool addedNewUnigram = false; if (writingHelper.addUnigramWord(&readingHelper, word, length, probability, &addedNewUnigram)) { @@ -222,6 +236,11 @@ bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary."); return false; } + if (mBufferWithExtendableBuffer.getTailPosition() + >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update."); + return false; + } const int word0Pos = getTerminalNodePositionOfWord(word0, length0, false /* forceLowerCaseSearch */); if (word0Pos == NOT_A_DICT_POS) { @@ -233,7 +252,7 @@ bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int return false; } DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, - &mBigramListPolicy, &mShortcutListPolicy); + &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict()); bool addedNewBigram = false; if (writingHelper.addBigramWords(word0Pos, word1Pos, probability, &addedNewBigram)) { if (addedNewBigram) { @@ -251,6 +270,11 @@ bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const AKLOGI("Warning: removeBigramWords() is called for non-updatable dictionary."); return false; } + if (mBufferWithExtendableBuffer.getTailPosition() + >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update."); + return false; + } const int word0Pos = getTerminalNodePositionOfWord(word0, length0, false /* forceLowerCaseSearch */); if (word0Pos == NOT_A_DICT_POS) { @@ -262,7 +286,7 @@ bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const return false; } DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, - &mBigramListPolicy, &mShortcutListPolicy); + &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict()); if (writingHelper.removeBigramWords(word0Pos, word1Pos)) { mBigramCount--; return true; @@ -277,7 +301,7 @@ void DynamicPatriciaTriePolicy::flush(const char *const filePath) { return; } DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, - &mBigramListPolicy, &mShortcutListPolicy); + &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict()); writingHelper.writeToDictFile(filePath, &mHeaderPolicy, mUnigramCount, mBigramCount); } @@ -287,7 +311,7 @@ void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) { return; } DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, - &mBigramListPolicy, &mShortcutListPolicy); + &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict()); writingHelper.writeToDictFileWithGC(getRootPosition(), filePath, &mHeaderPolicy); } @@ -296,8 +320,33 @@ bool DynamicPatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); return false; } - // TODO: Implement more properly. - return mBufferWithExtendableBuffer.isNearSizeLimit(); + if (mBufferWithExtendableBuffer.isNearSizeLimit()) { + // Additional buffer size is near the limit. + return true; + } else if (mHeaderPolicy.getExtendedRegionSize() + + mBufferWithExtendableBuffer.getUsedAdditionalBufferSize() + > MAX_DICT_EXTENDED_REGION_SIZE) { + // Total extended region size exceeds the limit. + return true; + } else if (mBufferWithExtendableBuffer.getTailPosition() + >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS + && mBufferWithExtendableBuffer.getUsedAdditionalBufferSize() > 0) { + // Needs to reduce dictionary size. + return true; + } else if (mHeaderPolicy.isDecayingDict()) { + if (mUnigramCount >= DecayingUtils::MAX_UNIGRAM_COUNT) { + // Unigram count exceeds the limit. + return true; + } else if (mBigramCount >= DecayingUtils::MAX_BIGRAM_COUNT) { + // Bigram count exceeds the limit. + return true; + } else if (mindsBlockByGC && mHeaderPolicy.getLastUpdatedTime() + + MIN_SECONDS_TO_REQUIRE_GC_WHEN_WRITING < time(0)) { + // Time to update probabilities for decaying. + return true; + } + } + return false; } void DynamicPatriciaTriePolicy::getProperty(const char *const query, char *const outResult, diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h index bdb436c8e..d3150c6fc 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h @@ -37,7 +37,8 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { mBufferWithExtendableBuffer(mBuffer->getBuffer() + mHeaderPolicy.getSize(), mBuffer->getBufferSize() - mHeaderPolicy.getSize()), mShortcutListPolicy(&mBufferWithExtendableBuffer), - mBigramListPolicy(&mBufferWithExtendableBuffer, &mShortcutListPolicy), + mBigramListPolicy(&mBufferWithExtendableBuffer, &mShortcutListPolicy, + mHeaderPolicy.isDecayingDict()), mUnigramCount(mHeaderPolicy.getUnigramCount()), mBigramCount(mHeaderPolicy.getBigramCount()) {} @@ -101,6 +102,9 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { static const char*const UNIGRAM_COUNT_QUERY; static const char*const BIGRAM_COUNT_QUERY; + static const int MAX_DICT_EXTENDED_REGION_SIZE; + static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS; + static const int MIN_SECONDS_TO_REQUIRE_GC_WHEN_WRITING; const MmappedBuffer *const mBuffer; const HeaderPolicy mHeaderPolicy; diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp index 2a2e9bcbe..28124d251 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp @@ -25,6 +25,7 @@ #include "suggest/policyimpl/dictionary/header/header_policy.h" #include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h" +#include "suggest/policyimpl/dictionary/utils/decaying_utils.h" #include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" #include "utils/hash_map_compat.h" @@ -57,7 +58,9 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord( wordCodePoints[matchedCodePointCount + j])) { *outAddedNewUnigram = true; return reallocatePtNodeAndAddNewPtNodes(nodeReader, - readingHelper->getMergedNodeCodePoints(), j, probability, + readingHelper->getMergedNodeCodePoints(), j, + getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */, + probability), wordCodePoints + matchedCodePointCount, codePointCount - matchedCodePointCount); } @@ -69,7 +72,8 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord( } if (!nodeReader->hasChildren()) { *outAddedNewUnigram = true; - return createChildrenPtNodeArrayAndAChildPtNode(nodeReader, probability, + return createChildrenPtNodeArrayAndAChildPtNode(nodeReader, + getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */, probability), wordCodePoints + readingHelper->getTotalCodePointCount(), codePointCount - readingHelper->getTotalCodePointCount()); } @@ -86,7 +90,7 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord( return createAndInsertNodeIntoPtNodeArray(parentPos, wordCodePoints + readingHelper->getPrevTotalCodePointCount(), codePointCount - readingHelper->getPrevTotalCodePointCount(), - probability, &pos); + getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */, probability), &pos); } bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const int word1Pos, @@ -351,9 +355,11 @@ bool DynamicPatriciaTrieWritingHelper::setPtNodeProbability( if (originalPtNode->isTerminal()) { // Overwrites the probability. *outAddedNewUnigram = false; + const int probabilityToWrite = getUpdatedProbability(originalPtNode->getProbability(), + probability); int probabilityFieldPos = originalPtNode->getProbabilityFieldPos(); if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition(mBuffer, - probability, &probabilityFieldPos)) { + probabilityToWrite, &probabilityFieldPos)) { return false; } } else { @@ -365,7 +371,8 @@ bool DynamicPatriciaTrieWritingHelper::setPtNodeProbability( } if (!writePtNodeToBufferByCopyingPtNodeInfo(mBuffer, originalPtNode, originalPtNode->getParentPos(), codePoints, originalPtNode->getCodePointCount(), - probability, &movedPos)) { + getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */, probability), + &movedPos)) { return false; } } @@ -481,11 +488,15 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, DynamicPatriciaTrieGcEventListeners ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( - this, mBuffer); + this, mBuffer, mIsDecayingDict); if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) { return false; } + if (mIsDecayingDict && traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted + .getValidUnigramCount() > DecayingUtils::MAX_UNIGRAM_COUNT_AFTER_GC) { + // TODO: Remove more unigrams. + } readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateBigramProbability @@ -495,6 +506,11 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, return false; } + if (mIsDecayingDict && traversePolicyToUpdateBigramProbability.getValidBigramEntryCount() + > DecayingUtils::MAX_BIGRAM_COUNT_AFTER_GC) { + // TODO: Remove more bigrams. + } + // Mapping from positions in mBuffer to positions in bufferToWrite. DictPositionRelocationMap dictPositionRelocationMap; readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); @@ -508,7 +524,8 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, // Create policy instance for the GCed dictionary. DynamicShortcutListPolicy newDictShortcutPolicy(bufferToWrite); - DynamicBigramListPolicy newDictBigramPolicy(bufferToWrite, &newDictShortcutPolicy); + DynamicBigramListPolicy newDictBigramPolicy(bufferToWrite, &newDictShortcutPolicy, + mIsDecayingDict); // Create reading helper for the GCed dictionary. DynamicPatriciaTrieReadingHelper newDictReadingHelper(bufferToWrite, &newDictBigramPolicy, &newDictShortcutPolicy); @@ -525,4 +542,13 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, return true; } +int DynamicPatriciaTrieWritingHelper::getUpdatedProbability(const int originalProbability, + const int newProbability) { + if (mIsDecayingDict) { + return DecayingUtils::getUpdatedUnigramProbability(originalProbability, newProbability); + } else { + return newProbability; + } +} + } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h index 827b6097f..ecee2cdbf 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h @@ -47,10 +47,13 @@ class DynamicPatriciaTrieWritingHelper { DISALLOW_COPY_AND_ASSIGN(DictPositionRelocationMap); }; + static const size_t MAX_DICTIONARY_SIZE; + DynamicPatriciaTrieWritingHelper(BufferWithExtendableBuffer *const buffer, DynamicBigramListPolicy *const bigramPolicy, - DynamicShortcutListPolicy *const shortcutPolicy) - : mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy) {} + DynamicShortcutListPolicy *const shortcutPolicy, const bool isDecayingDict) + : mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy), + mIsDecayingDict(isDecayingDict) {} ~DynamicPatriciaTrieWritingHelper() {} @@ -87,11 +90,11 @@ class DynamicPatriciaTrieWritingHelper { DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieWritingHelper); static const int CHILDREN_POSITION_FIELD_SIZE; - static const size_t MAX_DICTIONARY_SIZE; BufferWithExtendableBuffer *const mBuffer; DynamicBigramListPolicy *const mBigramPolicy; DynamicShortcutListPolicy *const mShortcutPolicy; + const bool mIsDecayingDict; bool markNodeAsMovedAndSetPosition(const DynamicPatriciaTrieNodeReader *const nodeToUpdate, const int movedPos, const int bigramLinkedNodePos); @@ -127,6 +130,8 @@ class DynamicPatriciaTrieWritingHelper { bool runGC(const int rootPtNodeArrayPos, BufferWithExtendableBuffer *const bufferToWrite, int *const outUnigramCount, int *const outBigramCount); + + int getUpdatedProbability(const int originalProbability, const int newProbability); }; } // namespace latinime #endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.cpp new file mode 100644 index 000000000..942a74238 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.cpp @@ -0,0 +1,129 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/utils/decaying_utils.h" + +#include "suggest/policyimpl/dictionary/utils/probability_utils.h" + +namespace latinime { + +const int DecayingUtils::MAX_UNIGRAM_COUNT = 12000; +const int DecayingUtils::MAX_UNIGRAM_COUNT_AFTER_GC = 10000; +const int DecayingUtils::MAX_BIGRAM_COUNT = 12000; +const int DecayingUtils::MAX_BIGRAM_COUNT_AFTER_GC = 10000; + +const int DecayingUtils::MAX_COMPUTED_PROBABILITY = 127; +const int DecayingUtils::MAX_UNIGRAM_PROBABILITY = 120; +const int DecayingUtils::MIN_VALID_UNIGRAM_PROBABILITY = 24; +const int DecayingUtils::UNIGRAM_PROBABILITY_STEP = 8; +const int DecayingUtils::MAX_BIGRAM_PROBABILITY_DELTA = 15; +const int DecayingUtils::MIN_VALID_BIGRAM_PROBABILITY_DELTA = 3; +const int DecayingUtils::BIGRAM_PROBABILITY_DELTA_STEP = 1; + +/* static */ int DecayingUtils::getProbability(const int encodedUnigramProbability, + const int encodedBigramProbabilityDelta) { + if (encodedUnigramProbability == NOT_A_PROBABILITY) { + return NOT_A_PROBABILITY; + } else if (encodedBigramProbabilityDelta == NOT_A_PROBABILITY) { + const int rawProbability = ProbabilityUtils::backoff(decodeUnigramProbability( + encodedUnigramProbability)); + return min(getDecayedProbability(rawProbability), MAX_COMPUTED_PROBABILITY); + } else { + const int rawProbability = ProbabilityUtils::computeProbabilityForBigram( + decodeUnigramProbability(encodedUnigramProbability), + decodeBigramProbabilityDelta(encodedBigramProbabilityDelta)); + return min(getDecayedProbability(rawProbability), MAX_COMPUTED_PROBABILITY); + } +} + +/* static */ int DecayingUtils::getUpdatedUnigramProbability(const int originalEncodedProbability, + const int newProbability) { + if (originalEncodedProbability == NOT_A_PROBABILITY) { + // The unigram is not in this dictionary. + if (newProbability == NOT_A_PROBABILITY) { + // The unigram is not in other dictionaries. + return 0; + } else { + return MIN_VALID_UNIGRAM_PROBABILITY; + } + } else { + if (newProbability != NOT_A_PROBABILITY + && originalEncodedProbability < MIN_VALID_UNIGRAM_PROBABILITY) { + return MIN_VALID_UNIGRAM_PROBABILITY; + } + return min(originalEncodedProbability + UNIGRAM_PROBABILITY_STEP, MAX_UNIGRAM_PROBABILITY); + } +} + +/* static */ int DecayingUtils::getUnigramProbabilityToSave(const int encodedProbability) { + return max(encodedProbability - UNIGRAM_PROBABILITY_STEP, 0); +} + +/* static */ int DecayingUtils::getBigramProbabilityDeltaToSave(const int encodedProbabilityDelta) { + return max(encodedProbabilityDelta - BIGRAM_PROBABILITY_DELTA_STEP, 0); +} + +/* static */ int DecayingUtils::getUpdatedBigramProbabilityDelta( + const int originalEncodedProbabilityDelta, const int newProbability) { + if (originalEncodedProbabilityDelta == NOT_A_PROBABILITY) { + // The bigram relation is not in this dictionary. + if (newProbability == NOT_A_PROBABILITY) { + // The bigram target is not in other dictionaries. + return 0; + } else { + return MIN_VALID_BIGRAM_PROBABILITY_DELTA; + } + } else { + if (newProbability != NOT_A_PROBABILITY + && originalEncodedProbabilityDelta < MIN_VALID_BIGRAM_PROBABILITY_DELTA) { + return MIN_VALID_BIGRAM_PROBABILITY_DELTA; + } + return min(originalEncodedProbabilityDelta + BIGRAM_PROBABILITY_DELTA_STEP, + MAX_BIGRAM_PROBABILITY_DELTA); + } +} + +/* static */ int DecayingUtils::isValidUnigram(const int encodedUnigramProbability) { + return encodedUnigramProbability >= MIN_VALID_UNIGRAM_PROBABILITY; +} + +/* static */ int DecayingUtils::isValidBigram(const int encodedBigramProbabilityDelta) { + return encodedBigramProbabilityDelta >= MIN_VALID_BIGRAM_PROBABILITY_DELTA; +} + +/* static */ int DecayingUtils::decodeUnigramProbability(const int encodedProbability) { + const int probability = encodedProbability - MIN_VALID_UNIGRAM_PROBABILITY; + if (probability < 0) { + return NOT_A_PROBABILITY; + } else { + return min(probability, MAX_UNIGRAM_PROBABILITY); + } +} + +/* static */ int DecayingUtils::decodeBigramProbabilityDelta(const int encodedProbabilityDelta) { + const int probabilityDelta = encodedProbabilityDelta - MIN_VALID_BIGRAM_PROBABILITY_DELTA; + if (probabilityDelta < 0) { + return NOT_A_PROBABILITY; + } else { + return min(probabilityDelta, MAX_BIGRAM_PROBABILITY_DELTA); + } +} + +/* static */ int DecayingUtils::getDecayedProbability(const int rawProbability) { + return rawProbability; +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.h new file mode 100644 index 000000000..1ca03918f --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/decaying_utils.h @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DECAYING_UTILS_H +#define LATINIME_DECAYING_UTILS_H + +#include "defines.h" + +namespace latinime { + +// TODO: Check the elapsed time and decrease the probability depending on the time. Time field is +// required to introduced to each terminal PtNode and bigram entry. +// TODO: Quit using bigram probability to indicate the delta. +// TODO: Quit using bigram probability delta. +class DecayingUtils { + public: + static const int MAX_UNIGRAM_COUNT; + static const int MAX_UNIGRAM_COUNT_AFTER_GC; + static const int MAX_BIGRAM_COUNT; + static const int MAX_BIGRAM_COUNT_AFTER_GC; + + static int getProbability(const int encodedUnigramProbability, + const int encodedBigramProbabilityDelta); + + static int getUpdatedUnigramProbability(const int originalEncodedProbability, + const int newProbability); + + static int getUpdatedBigramProbabilityDelta(const int originalEncodedProbabilityDelta, + const int newProbability); + + static int isValidUnigram(const int encodedUnigramProbability); + + static int isValidBigram(const int encodedProbabilityDelta); + + static int getUnigramProbabilityToSave(const int encodedProbability); + + static int getBigramProbabilityDeltaToSave(const int encodedProbabilityDelta); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DecayingUtils); + + static const int MAX_COMPUTED_PROBABILITY; + static const int MAX_UNIGRAM_PROBABILITY; + static const int MIN_VALID_UNIGRAM_PROBABILITY; + static const int UNIGRAM_PROBABILITY_STEP; + static const int MAX_BIGRAM_PROBABILITY_DELTA; + static const int MIN_VALID_BIGRAM_PROBABILITY_DELTA; + static const int BIGRAM_PROBABILITY_DELTA_STEP; + + static int decodeUnigramProbability(const int encodedProbability); + + static int decodeBigramProbabilityDelta(const int encodedProbability); + + static int getDecayedProbability(const int rawProbability); +}; +} // namespace latinime +#endif /* LATINIME_DECAYING_UTILS_H */ diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java new file mode 100644 index 000000000..cf85d97a0 --- /dev/null +++ b/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java @@ -0,0 +1,171 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.android.inputmethod.latin; + +import android.test.AndroidTestCase; +import android.test.suitebuilder.annotation.LargeTest; + +import com.android.inputmethod.latin.makedict.FormatSpec; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; + +@LargeTest +public class BinaryDictionaryDecayingTests extends AndroidTestCase { + private static final String TEST_DICT_FILE_EXTENSION = ".testDict"; + private static final String TEST_LOCALE = "test"; + + private static final int DUMMY_PROBABILITY = 0; + + @Override + protected void setUp() throws Exception { + super.setUp(); + } + + @Override + protected void tearDown() throws Exception { + super.tearDown(); + } + + private void forcePassingShortTime(final BinaryDictionary binaryDictionary) { + binaryDictionary.flushWithGC(); + } + + private void forcePassingLongTime(final BinaryDictionary binaryDictionary) { + // Currently, probabilities are decayed when GC is run. All entries that have never been + // typed in 32 GCs are removed. + final int count = 32; + for (int i = 0; i < count; i++) { + binaryDictionary.flushWithGC(); + } + } + + private File createEmptyDictionaryAndGetFile(final String filename) throws IOException { + final File file = File.createTempFile(filename, TEST_DICT_FILE_EXTENSION, + getContext().getCacheDir()); + Map attributeMap = new HashMap(); + attributeMap.put(FormatSpec.FileHeader.SUPPORTS_DYNAMIC_UPDATE_ATTRIBUTE, + FormatSpec.FileHeader.ATTRIBUTE_VALUE_TRUE); + attributeMap.put(FormatSpec.FileHeader.USES_FORGETTING_CURVE_ATTRIBUTE, + FormatSpec.FileHeader.ATTRIBUTE_VALUE_TRUE); + if (BinaryDictionary.createEmptyDictFile(file.getAbsolutePath(), + 3 /* dictVersion */, attributeMap)) { + return file; + } else { + throw new IOException("Empty dictionary cannot be created."); + } + } + + public void testAddValidAndInvalidWords() { + File dictFile = null; + try { + dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary"); + } catch (IOException e) { + fail("IOException while writing an initial dictionary : " + e); + } + BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), + 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, + Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); + + binaryDictionary.addUnigramWord("a", Dictionary.NOT_A_PROBABILITY); + assertFalse(binaryDictionary.isValidWord("a")); + binaryDictionary.addUnigramWord("a", Dictionary.NOT_A_PROBABILITY); + assertFalse(binaryDictionary.isValidWord("a")); + binaryDictionary.addUnigramWord("a", Dictionary.NOT_A_PROBABILITY); + assertFalse(binaryDictionary.isValidWord("a")); + binaryDictionary.addUnigramWord("a", Dictionary.NOT_A_PROBABILITY); + assertTrue(binaryDictionary.isValidWord("a")); + + binaryDictionary.addUnigramWord("b", DUMMY_PROBABILITY); + assertTrue(binaryDictionary.isValidWord("b")); + + final int unigramProbability = binaryDictionary.getFrequency("a"); + binaryDictionary.addBigramWords("a", "b", Dictionary.NOT_A_PROBABILITY); + assertFalse(binaryDictionary.isValidBigram("a", "b")); + binaryDictionary.addBigramWords("a", "b", Dictionary.NOT_A_PROBABILITY); + assertFalse(binaryDictionary.isValidBigram("a", "b")); + binaryDictionary.addBigramWords("a", "b", Dictionary.NOT_A_PROBABILITY); + assertFalse(binaryDictionary.isValidBigram("a", "b")); + binaryDictionary.addBigramWords("a", "b", Dictionary.NOT_A_PROBABILITY); + assertTrue(binaryDictionary.isValidBigram("a", "b")); + + binaryDictionary.addUnigramWord("c", DUMMY_PROBABILITY); + binaryDictionary.addBigramWords("a", "c", DUMMY_PROBABILITY); + assertTrue(binaryDictionary.isValidBigram("a", "c")); + + binaryDictionary.close(); + dictFile.delete(); + } + + // TODO: Add large tests. + public void testDecayingProbability() { + File dictFile = null; + try { + dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary"); + } catch (IOException e) { + fail("IOException while writing an initial dictionary : " + e); + } + BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), + 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, + Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); + + binaryDictionary.addUnigramWord("a", DUMMY_PROBABILITY); + assertTrue(binaryDictionary.isValidWord("a")); + forcePassingShortTime(binaryDictionary); + assertFalse(binaryDictionary.isValidWord("a")); + + binaryDictionary.addUnigramWord("a", DUMMY_PROBABILITY); + binaryDictionary.addUnigramWord("a", DUMMY_PROBABILITY); + binaryDictionary.addUnigramWord("a", DUMMY_PROBABILITY); + binaryDictionary.addUnigramWord("a", DUMMY_PROBABILITY); + forcePassingShortTime(binaryDictionary); + assertTrue(binaryDictionary.isValidWord("a")); + forcePassingLongTime(binaryDictionary); + assertFalse(binaryDictionary.isValidWord("a")); + + binaryDictionary.addUnigramWord("a", DUMMY_PROBABILITY); + binaryDictionary.addUnigramWord("b", DUMMY_PROBABILITY); + binaryDictionary.addBigramWords("a", "b", DUMMY_PROBABILITY); + assertTrue(binaryDictionary.isValidBigram("a", "b")); + forcePassingShortTime(binaryDictionary); + assertFalse(binaryDictionary.isValidBigram("a", "b")); + + binaryDictionary.addUnigramWord("a", DUMMY_PROBABILITY); + binaryDictionary.addUnigramWord("b", DUMMY_PROBABILITY); + binaryDictionary.addBigramWords("a", "b", DUMMY_PROBABILITY); + binaryDictionary.addUnigramWord("a", DUMMY_PROBABILITY); + binaryDictionary.addUnigramWord("b", DUMMY_PROBABILITY); + binaryDictionary.addBigramWords("a", "b", DUMMY_PROBABILITY); + binaryDictionary.addUnigramWord("a", DUMMY_PROBABILITY); + binaryDictionary.addUnigramWord("b", DUMMY_PROBABILITY); + binaryDictionary.addBigramWords("a", "b", DUMMY_PROBABILITY); + binaryDictionary.addUnigramWord("a", DUMMY_PROBABILITY); + binaryDictionary.addUnigramWord("b", DUMMY_PROBABILITY); + binaryDictionary.addBigramWords("a", "b", DUMMY_PROBABILITY); + assertTrue(binaryDictionary.isValidBigram("a", "b")); + forcePassingShortTime(binaryDictionary); + assertTrue(binaryDictionary.isValidBigram("a", "b")); + forcePassingLongTime(binaryDictionary); + assertFalse(binaryDictionary.isValidBigram("a", "b")); + + binaryDictionary.close(); + dictFile.delete(); + } +}