From cfb018ba6db78f2b33b54d4811f0bf166db29792 Mon Sep 17 00:00:00 2001 From: Keisuke Kuroyanagi Date: Mon, 7 Oct 2013 17:05:24 +0900 Subject: [PATCH] Fix bug and Add large test for decaying dictionary. - GC gets failure when the dictionary become empty. - Useless unigrams are sometimes not removed. Bug: 10197478 Change-Id: I8d1479c01efba61a81f03bc077da6bcb4797a940 --- .../inputmethod/latin/BinaryDictionary.java | 4 ++ ...namic_patricia_trie_gc_event_listeners.cpp | 2 +- ...dynamic_patricia_trie_gc_event_listeners.h | 1 + .../dynamic_patricia_trie_policy.cpp | 10 ++++ .../dictionary/dynamic_patricia_trie_policy.h | 2 + .../dynamic_patricia_trie_reading_helper.cpp | 6 +++ .../dynamic_patricia_trie_reading_helper.h | 4 +- .../utils/forgetting_curve_utils.cpp | 3 +- .../latin/BinaryDictionaryDecayingTests.java | 54 +++++++++++++++++++ 9 files changed, 82 insertions(+), 4 deletions(-) diff --git a/java/src/com/android/inputmethod/latin/BinaryDictionary.java b/java/src/com/android/inputmethod/latin/BinaryDictionary.java index 541e69788..fd296988e 100644 --- a/java/src/com/android/inputmethod/latin/BinaryDictionary.java +++ b/java/src/com/android/inputmethod/latin/BinaryDictionary.java @@ -52,6 +52,10 @@ public final class BinaryDictionary extends Dictionary { public static final String UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; @UsedForTesting public static final String BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; + @UsedForTesting + public static final String MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; + @UsedForTesting + public static final String MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; private long mNativeDict; private final Locale mLocale; diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp index a17a0acf6..5724c5d88 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp @@ -39,7 +39,7 @@ bool DynamicPatriciaTrieGcEventListeners return false; } if (!ForgettingCurveUtils::isValidEncodedProbability(newProbability)) { - isUselessPtNode = false; + isUselessPtNode = true; } } if (mChildrenValue > 0) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h index 3ca2f2a01..9755120b0 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h @@ -60,6 +60,7 @@ class DynamicPatriciaTrieGcEventListeners { bool onDescend(const int ptNodeArrayPos) { mValueStack.push_back(0); + mChildrenValue = 0; return true; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp index 31e3fb42f..3d07c9d6c 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp @@ -37,6 +37,8 @@ namespace latinime { // BinaryDictionaryDecayingTests. const char *const DynamicPatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; const char *const DynamicPatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; +const char *const DynamicPatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; +const char *const DynamicPatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; const char *const DynamicPatriciaTriePolicy::SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY = "SET_NEEDS_TO_DECAY_FOR_TESTING"; const int DynamicPatriciaTriePolicy::MAX_DICT_EXTENDED_REGION_SIZE = 1024 * 1024; @@ -355,6 +357,14 @@ void DynamicPatriciaTriePolicy::getProperty(const char *const query, char *const snprintf(outResult, maxResultLength, "%d", mUnigramCount); } else if (strncmp(query, BIGRAM_COUNT_QUERY, maxResultLength) == 0) { snprintf(outResult, maxResultLength, "%d", mBigramCount); + } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, maxResultLength) == 0) { + snprintf(outResult, maxResultLength, "%d", + mHeaderPolicy.isDecayingDict() ? ForgettingCurveUtils::MAX_UNIGRAM_COUNT : + DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE); + } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, maxResultLength) == 0) { + snprintf(outResult, maxResultLength, "%d", + mHeaderPolicy.isDecayingDict() ? ForgettingCurveUtils::MAX_BIGRAM_COUNT : + DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE); } else if (strncmp(query, SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY, maxResultLength) == 0) { mNeedsToDecayForTesting = true; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h index 903f65e8e..be97ee1a5 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h @@ -102,6 +102,8 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { static const char *const UNIGRAM_COUNT_QUERY; static const char *const BIGRAM_COUNT_QUERY; + static const char *const MAX_UNIGRAM_COUNT_QUERY; + static const char *const MAX_BIGRAM_COUNT_QUERY; static const char *const SET_NEEDS_TO_DECAY_FOR_TESTING_QUERY; static const int MAX_DICT_EXTENDED_REGION_SIZE; static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS; diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp index 601ee663b..f108c219f 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp @@ -93,6 +93,12 @@ bool DynamicPatriciaTrieReadingHelper::traverseAllPtNodesInPtNodeArrayLevelPreor if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) { return false; } + if (isEnd()) { + // Empty dictionary. Needs to notify the listener of the tail of empty PtNode array. + if (!listener->onReadingPtNodeArrayTail()) { + return false; + } + } pushReadingStateToStack(); while (!isEnd()) { if (alreadyVisitedAllPtNodesInArray) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h index 512a4d818..a71c06971 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h @@ -279,7 +279,9 @@ class DynamicPatriciaTrieReadingHelper { } else { mReadingState = mReadingStateStack.back(); mReadingStateStack.pop_back(); - fetchPtNodeInfo(); + if (!isEnd()) { + fetchPtNodeInfo(); + } } } }; diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp index 19ca35481..1632fd072 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp @@ -93,8 +93,7 @@ void ForgettingCurveUtils::TimeKeeper::setCurrentTime() { for (int i = 0; i < decayIterationCount; ++i) { const float currentRate = static_cast(currentEncodedProbability) / static_cast(MAX_ENCODED_PROBABILITY); - const float thresholdToDecay = MIN_PROBABILITY_TO_DECAY - + (1.0f - MIN_PROBABILITY_TO_DECAY) * currentRate; + const float thresholdToDecay = (1.0f - MIN_PROBABILITY_TO_DECAY) * currentRate; const float randValue = static_cast(rand()) / static_cast(RAND_MAX); if (thresholdToDecay < randValue) { currentEncodedProbability = max(currentEncodedProbability - ENCODED_PROBABILITY_STEP, diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java index ded8eaa97..cecdd2ffb 100644 --- a/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java +++ b/tests/src/com/android/inputmethod/latin/BinaryDictionaryDecayingTests.java @@ -19,13 +19,16 @@ package com.android.inputmethod.latin; import android.test.AndroidTestCase; import android.test.suitebuilder.annotation.LargeTest; +import com.android.inputmethod.latin.makedict.CodePointUtils; import com.android.inputmethod.latin.makedict.FormatSpec; import java.io.File; import java.io.IOException; +import java.util.ArrayList; import java.util.HashMap; import java.util.Locale; import java.util.Map; +import java.util.Random; @LargeTest public class BinaryDictionaryDecayingTests extends AndroidTestCase { @@ -179,4 +182,55 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase { binaryDictionary.close(); dictFile.delete(); } + + public void testAddManyUnigramsToDecayingDict() { + final int unigramCount = 30000; + final int unigramTypedCount = 100000; + final int codePointSetSize = 50; + final long seed = System.currentTimeMillis(); + final Random random = new Random(seed); + + File dictFile = null; + try { + dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary"); + } catch (IOException e) { + fail("IOException while writing an initial dictionary : " + e); + } + BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), + 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, + Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); + + final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); + final ArrayList words = new ArrayList(); + + for (int i = 0; i < unigramCount; i++) { + final String word = CodePointUtils.generateWord(random, codePointSet); + words.add(word); + } + + final int maxUnigramCount = Integer.parseInt( + binaryDictionary.getPropertyForTests(BinaryDictionary.MAX_UNIGRAM_COUNT_QUERY)); + for (int i = 0; i < unigramTypedCount; i++) { + final String word = words.get(random.nextInt(words.size())); + binaryDictionary.addUnigramWord(word, DUMMY_PROBABILITY); + + if (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) { + final int unigramCountBeforeGC = + Integer.parseInt(binaryDictionary.getPropertyForTests( + BinaryDictionary.UNIGRAM_COUNT_QUERY)); + while (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) { + binaryDictionary.flushWithGC(); + } + final int unigramCountAfterGC = + Integer.parseInt(binaryDictionary.getPropertyForTests( + BinaryDictionary.UNIGRAM_COUNT_QUERY)); + assertTrue(unigramCountBeforeGC > unigramCountAfterGC); + } + } + + assertTrue(Integer.parseInt(binaryDictionary.getPropertyForTests( + BinaryDictionary.UNIGRAM_COUNT_QUERY)) > 0); + assertTrue(Integer.parseInt(binaryDictionary.getPropertyForTests( + BinaryDictionary.UNIGRAM_COUNT_QUERY)) <= maxUnigramCount); + } }