Fix bugs and add a large test for the decaying dictionary.

- GC fails when the dictionary becomes empty.
- Useless unigrams are sometimes not removed.

Bug: 10197478
Change-Id: I8d1479c01efba61a81f03bc077da6bcb4797a940
main
Keisuke Kuroyanagi 2013-10-07 17:05:24 +09:00
parent 32cf6f85a2
commit cfb018ba6d
9 changed files with 82 additions and 4 deletions

View File

@ -52,6 +52,10 @@ public final class BinaryDictionary extends Dictionary {
public static final String UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; public static final String UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
@UsedForTesting @UsedForTesting
public static final String BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; public static final String BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
@UsedForTesting
public static final String MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT";
@UsedForTesting
public static final String MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT";
private long mNativeDict; private long mNativeDict;
private final Locale mLocale; private final Locale mLocale;

View File

@ -39,7 +39,7 @@ bool DynamicPatriciaTrieGcEventListeners
return false; return false;
} }
if (!ForgettingCurveUtils::isValidEncodedProbability(newProbability)) { if (!ForgettingCurveUtils::isValidEncodedProbability(newProbability)) {
isUselessPtNode = false; isUselessPtNode = true;
} }
} }
if (mChildrenValue > 0) { if (mChildrenValue > 0) {

View File

@ -60,6 +60,7 @@ class DynamicPatriciaTrieGcEventListeners {
bool onDescend(const int ptNodeArrayPos) { bool onDescend(const int ptNodeArrayPos) {
mValueStack.push_back(0); mValueStack.push_back(0);
mChildrenValue = 0;
return true; return true;
} }

View File

@ -37,6 +37,8 @@ namespace latinime {
// BinaryDictionaryDecayingTests. // BinaryDictionaryDecayingTests.
const char *const DynamicPatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; const char *const DynamicPatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
const char *const DynamicPatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; const char *const DynamicPatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
const char *const DynamicPatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT";
const char *const DynamicPatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT";
const char *const DynamicPatriciaTriePolicy::SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY = const char *const DynamicPatriciaTriePolicy::SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY =
"SET_NEEDS_TO_DECAY_FOR_TESTING"; "SET_NEEDS_TO_DECAY_FOR_TESTING";
const int DynamicPatriciaTriePolicy::MAX_DICT_EXTENDED_REGION_SIZE = 1024 * 1024; const int DynamicPatriciaTriePolicy::MAX_DICT_EXTENDED_REGION_SIZE = 1024 * 1024;
@ -355,6 +357,14 @@ void DynamicPatriciaTriePolicy::getProperty(const char *const query, char *const
snprintf(outResult, maxResultLength, "%d", mUnigramCount); snprintf(outResult, maxResultLength, "%d", mUnigramCount);
} else if (strncmp(query, BIGRAM_COUNT_QUERY, maxResultLength) == 0) { } else if (strncmp(query, BIGRAM_COUNT_QUERY, maxResultLength) == 0) {
snprintf(outResult, maxResultLength, "%d", mBigramCount); snprintf(outResult, maxResultLength, "%d", mBigramCount);
} else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, maxResultLength) == 0) {
snprintf(outResult, maxResultLength, "%d",
mHeaderPolicy.isDecayingDict() ? ForgettingCurveUtils::MAX_UNIGRAM_COUNT :
DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE);
} else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, maxResultLength) == 0) {
snprintf(outResult, maxResultLength, "%d",
mHeaderPolicy.isDecayingDict() ? ForgettingCurveUtils::MAX_BIGRAM_COUNT :
DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE);
} else if (strncmp(query, SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY, maxResultLength) == 0) { } else if (strncmp(query, SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY, maxResultLength) == 0) {
mNeedsToDecayForTesting = true; mNeedsToDecayForTesting = true;
} }

View File

@ -102,6 +102,8 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
static const char *const UNIGRAM_COUNT_QUERY; static const char *const UNIGRAM_COUNT_QUERY;
static const char *const BIGRAM_COUNT_QUERY; static const char *const BIGRAM_COUNT_QUERY;
static const char *const MAX_UNIGRAM_COUNT_QUERY;
static const char *const MAX_BIGRAM_COUNT_QUERY;
static const char *const SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY; static const char *const SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY;
static const int MAX_DICT_EXTENDED_REGION_SIZE; static const int MAX_DICT_EXTENDED_REGION_SIZE;
static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS; static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS;

View File

@ -93,6 +93,12 @@ bool DynamicPatriciaTrieReadingHelper::traverseAllPtNodesInPtNodeArrayLevelPreor
if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) { if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) {
return false; return false;
} }
if (isEnd()) {
// Empty dictionary. Needs to notify the listener of the tail of empty PtNode array.
if (!listener->onReadingPtNodeArrayTail()) {
return false;
}
}
pushReadingStateToStack(); pushReadingStateToStack();
while (!isEnd()) { while (!isEnd()) {
if (alreadyVisitedAllPtNodesInArray) { if (alreadyVisitedAllPtNodesInArray) {

View File

@ -279,9 +279,11 @@ class DynamicPatriciaTrieReadingHelper {
} else { } else {
mReadingState = mReadingStateStack.back(); mReadingState = mReadingStateStack.back();
mReadingStateStack.pop_back(); mReadingStateStack.pop_back();
if (!isEnd()) {
fetchPtNodeInfo(); fetchPtNodeInfo();
} }
} }
}
}; };
} // namespace latinime } // namespace latinime
#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H */ #endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H */

View File

@ -93,8 +93,7 @@ void ForgettingCurveUtils::TimeKeeper::setCurrentTime() {
for (int i = 0; i < decayIterationCount; ++i) { for (int i = 0; i < decayIterationCount; ++i) {
const float currentRate = static_cast<float>(currentEncodedProbability) const float currentRate = static_cast<float>(currentEncodedProbability)
/ static_cast<float>(MAX_ENCODED_PROBABILITY); / static_cast<float>(MAX_ENCODED_PROBABILITY);
const float thresholdToDecay = MIN_PROBABILITY_TO_DECAY const float thresholdToDecay = (1.0f - MIN_PROBABILITY_TO_DECAY) * currentRate;
+ (1.0f - MIN_PROBABILITY_TO_DECAY) * currentRate;
const float randValue = static_cast<float>(rand()) / static_cast<float>(RAND_MAX); const float randValue = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
if (thresholdToDecay < randValue) { if (thresholdToDecay < randValue) {
currentEncodedProbability = max(currentEncodedProbability - ENCODED_PROBABILITY_STEP, currentEncodedProbability = max(currentEncodedProbability - ENCODED_PROBABILITY_STEP,

View File

@ -19,13 +19,16 @@ package com.android.inputmethod.latin;
import android.test.AndroidTestCase; import android.test.AndroidTestCase;
import android.test.suitebuilder.annotation.LargeTest; import android.test.suitebuilder.annotation.LargeTest;
import com.android.inputmethod.latin.makedict.CodePointUtils;
import com.android.inputmethod.latin.makedict.FormatSpec; import com.android.inputmethod.latin.makedict.FormatSpec;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.Locale; import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.Random;
@LargeTest @LargeTest
public class BinaryDictionaryDecayingTests extends AndroidTestCase { public class BinaryDictionaryDecayingTests extends AndroidTestCase {
@ -179,4 +182,55 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
binaryDictionary.close(); binaryDictionary.close();
dictFile.delete(); dictFile.delete();
} }
/**
 * Adds many randomly chosen unigrams to an updatable dictionary and verifies that each
 * round of GC reduces the unigram count, and that the final unigram count is positive and
 * does not exceed the maximum reported by the dictionary.
 *
 * The random seed is derived from the current time, so it is included in every assertion
 * message to make a failing run reproducible.
 */
public void testAddManyUnigramsToDecayingDict() {
    final int unigramCount = 30000;
    final int unigramTypedCount = 100000;
    final int codePointSetSize = 50;
    final long seed = System.currentTimeMillis();
    final Random random = new Random(seed);
    final String seedInfo = " (seed: " + seed + ")";

    File dictFile = null;
    try {
        dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary");
    } catch (IOException e) {
        fail("IOException while writing an initial dictionary : " + e);
    }
    final BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
            0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
            Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
    try {
        final int[] codePointSet =
                CodePointUtils.generateCodePointSet(codePointSetSize, random);
        final ArrayList<String> words = new ArrayList<String>();
        for (int i = 0; i < unigramCount; i++) {
            final String word = CodePointUtils.generateWord(random, codePointSet);
            words.add(word);
        }

        final int maxUnigramCount = Integer.parseInt(
                binaryDictionary.getPropertyForTests(BinaryDictionary.MAX_UNIGRAM_COUNT_QUERY));
        for (int i = 0; i < unigramTypedCount; i++) {
            final String word = words.get(random.nextInt(words.size()));
            binaryDictionary.addUnigramWord(word, DUMMY_PROBABILITY);

            if (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
                final int unigramCountBeforeGC =
                        Integer.parseInt(binaryDictionary.getPropertyForTests(
                                BinaryDictionary.UNIGRAM_COUNT_QUERY));
                // Keep collecting until the dictionary no longer requests GC.
                while (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
                    binaryDictionary.flushWithGC();
                }
                final int unigramCountAfterGC =
                        Integer.parseInt(binaryDictionary.getPropertyForTests(
                                BinaryDictionary.UNIGRAM_COUNT_QUERY));
                assertTrue("GC did not reduce the unigram count: before="
                        + unigramCountBeforeGC + ", after=" + unigramCountAfterGC + seedInfo,
                        unigramCountBeforeGC > unigramCountAfterGC);
            }
        }

        final int finalUnigramCount = Integer.parseInt(
                binaryDictionary.getPropertyForTests(BinaryDictionary.UNIGRAM_COUNT_QUERY));
        assertTrue("Dictionary has no unigrams" + seedInfo, finalUnigramCount > 0);
        assertTrue("Unigram count " + finalUnigramCount + " exceeds the maximum "
                + maxUnigramCount + seedInfo, finalUnigramCount <= maxUnigramCount);
    } finally {
        // Release the dictionary and remove the temporary file even when an assertion
        // fails, matching the cleanup done by the other tests in this class.
        binaryDictionary.close();
        dictFile.delete();
    }
}
} }