am 60388956: Discard useless bigrams when overflowing.

* commit '603889561127457f553ca328d6b5bb392d921681':
  Discard useless bigrams when overflowing.
main
Keisuke Kuroyanagi 2013-12-10 01:52:57 -08:00 committed by Android Git Automerger
commit 03fdb22e7c
3 changed files with 137 additions and 8 deletions

View File

@ -102,7 +102,7 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
.getValidUnigramCount(); .getValidUnigramCount();
if (headerPolicy->isDecayingDict() if (headerPolicy->isDecayingDict()
&& unigramCount > ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC) { && unigramCount > ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC) {
if (!turncateUnigrams(&ptNodeReader, &ptNodeWriter, if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter,
ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC)) { ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC)) {
AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount, AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount,
ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC); ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC);
@ -117,10 +117,14 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
&traversePolicyToUpdateBigramProbability)) { &traversePolicyToUpdateBigramProbability)) {
return false; return false;
} }
const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount();
if (headerPolicy->isDecayingDict() if (headerPolicy->isDecayingDict()
&& traversePolicyToUpdateBigramProbability.getValidBigramEntryCount() && bigramCount > ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC) {
> ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC) { if (!truncateBigrams(ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC)) {
// TODO: Remove more bigrams. AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount,
ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC);
return false;
}
} }
// Mapping from positions in mBuffer to positions in bufferToWrite. // Mapping from positions in mBuffer to positions in bufferToWrite.
@ -186,7 +190,7 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
return true; return true;
} }
bool Ver4PatriciaTrieWritingHelper::turncateUnigrams( bool Ver4PatriciaTrieWritingHelper::truncateUnigrams(
const Ver4PatriciaTrieNodeReader *const ptNodeReader, const Ver4PatriciaTrieNodeReader *const ptNodeReader,
Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount) { Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount) {
const TerminalPositionLookupTable *const terminalPosLookupTable = const TerminalPositionLookupTable *const terminalPosLookupTable =
@ -222,6 +226,50 @@ bool Ver4PatriciaTrieWritingHelper::turncateUnigrams(
return true; return true;
} }
bool Ver4PatriciaTrieWritingHelper::truncateBigrams(const int maxBigramCount) {
const TerminalPositionLookupTable *const terminalPosLookupTable =
mBuffers->getTerminalPositionLookupTable();
const int nextTerminalId = terminalPosLookupTable->getNextTerminalId();
std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator>
priorityQueue;
BigramDictContent *const bigramDictContent = mBuffers->getMutableBigramDictContent();
for (int i = 0; i < nextTerminalId; ++i) {
const int bigramListPos = bigramDictContent->getBigramListHeadPos(i);
if (bigramListPos == NOT_A_DICT_POS) {
continue;
}
bool hasNext = true;
int readingPos = bigramListPos;
while (hasNext) {
const int entryPos = readingPos;
const BigramEntry bigramEntry =
bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
hasNext = bigramEntry.hasNext();
if (!bigramEntry.isValid()) {
continue;
}
const int probability = bigramEntry.hasHistoricalInfo() ?
ForgettingCurveUtils::decodeProbability(bigramEntry.getHistoricalInfo()) :
bigramEntry.getProbability();
priorityQueue.push(DictProbability(entryPos, probability,
bigramEntry.getHistoricalInfo()->getTimeStamp()));
}
}
// Delete bigrams.
while (static_cast<int>(priorityQueue.size()) > maxBigramCount) {
const int entryPos = priorityQueue.top().getDictPos();
const BigramEntry bigramEntry = bigramDictContent->getBigramEntry(entryPos);
const BigramEntry invalidatedBigramEntry = bigramEntry.getInvalidatedEntry();
if (!bigramDictContent->writeBigramEntry(&invalidatedBigramEntry, entryPos)) {
AKLOGE("Cannot write bigram entry to remove. pos: %d", entryPos);
return false;
}
priorityQueue.pop();
}
return true;
}
bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
if (!ptNodeParams->isTerminal()) { if (!ptNodeParams->isTerminal()) {

View File

@ -65,7 +65,7 @@ class Ver4PatriciaTrieWritingHelper {
const TerminalPositionLookupTable::TerminalIdMap *const mTerminalIdMap; const TerminalPositionLookupTable::TerminalIdMap *const mTerminalIdMap;
}; };
// For truncateUnigrams(). // For truncateUnigrams() and truncateBigrams().
class DictProbability { class DictProbability {
public: public:
DictProbability(const int dictPos, const int probability, const int timestamp) DictProbability(const int dictPos, const int probability, const int timestamp)
@ -91,7 +91,7 @@ class Ver4PatriciaTrieWritingHelper {
int mTimestamp; int mTimestamp;
}; };
// For truncateUnigrams(). // For truncateUnigrams() and truncateBigrams().
class DictProbabilityComparator { class DictProbabilityComparator {
public: public:
bool operator()(const DictProbability &left, const DictProbability &right) { bool operator()(const DictProbability &left, const DictProbability &right) {
@ -112,9 +112,11 @@ class Ver4PatriciaTrieWritingHelper {
Ver4DictBuffers *const buffersToWrite, int *const outUnigramCount, Ver4DictBuffers *const buffersToWrite, int *const outUnigramCount,
int *const outBigramCount); int *const outBigramCount);
bool turncateUnigrams(const Ver4PatriciaTrieNodeReader *const ptNodeReader, bool truncateUnigrams(const Ver4PatriciaTrieNodeReader *const ptNodeReader,
Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount); Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount);
bool truncateBigrams(const int maxBigramCount);
Ver4DictBuffers *const mBuffers; Ver4DictBuffers *const mBuffers;
}; };
} // namespace latinime } // namespace latinime

View File

@ -474,4 +474,83 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
assertEquals(0, Integer.parseInt(binaryDictionary.getPropertyForTests( assertEquals(0, Integer.parseInt(binaryDictionary.getPropertyForTests(
BinaryDictionary.BIGRAM_COUNT_QUERY))); BinaryDictionary.BIGRAM_COUNT_QUERY)));
} }
// Runs the bigram overflow scenario using the FormatSpec.VERSION4 dictionary format.
public void testOverflowBigrams() {
testOverflowBigrams(FormatSpec.VERSION4);
}
/**
 * Adds randomly generated bigrams to a fresh test dictionary until it reports that GC is
 * needed, then checks that GC reduced the bigram count, kept the frequently typed
 * "strong target" bigram and dropped the rarely typed "weak target" bigram.
 *
 * The words are generated from a time-based random seed; the seed is appended to every
 * assertion message so that a failing run can be reproduced.
 *
 * NOTE(review): if the dictionary never reports that GC is needed, the loop finishes
 * without running the post-GC assertions.
 *
 * @param formatVersion the dictionary format version used to create the test dictionary.
 */
private void testOverflowBigrams(final int formatVersion) {
    final int bigramCount = 20000;
    final int unigramCount = 1000;
    final int unigramTypedCount = 20;
    final int eachBigramTypedCount = 5;
    final int strongBigramTypedCount = 20;
    final int weakBigramTypedCount = 1;
    final int codePointSetSize = 50;
    final long seed = System.currentTimeMillis();
    final Random random = new Random(seed);
    // Suffix for assertion messages; without it a random failure cannot be replayed.
    final String seedInfo = " (seed: " + seed + ")";

    File dictFile = null;
    try {
        dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
    } catch (IOException e) {
        fail("IOException while writing an initial dictionary : " + e);
    }
    BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
            0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
            Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
    setCurrentTime(binaryDictionary, mCurrentTime);

    // Register the random vocabulary that the filler bigrams are built from.
    final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
    final ArrayList<String> words = new ArrayList<String>();
    for (int i = 0; i < unigramCount; i++) {
        final String word = CodePointUtils.generateWord(random, codePointSet);
        words.add(word);
        for (int j = 0; j < unigramTypedCount; j++) {
            addUnigramWord(binaryDictionary, word, DUMMY_PROBABILITY);
        }
    }

    // Register the three fixed words used by the bigrams under observation.
    final String strong = "strong";
    final String weak = "weak";
    final String target = "target";
    for (int j = 0; j < unigramTypedCount; j++) {
        addUnigramWord(binaryDictionary, strong, DUMMY_PROBABILITY);
        addUnigramWord(binaryDictionary, weak, DUMMY_PROBABILITY);
        addUnigramWord(binaryDictionary, target, DUMMY_PROBABILITY);
    }
    binaryDictionary.flushWithGC();

    // "strong target" is typed many times, "weak target" only once.
    for (int j = 0; j < strongBigramTypedCount; j++) {
        addBigramWords(binaryDictionary, strong, target, DUMMY_PROBABILITY);
    }
    for (int j = 0; j < weakBigramTypedCount; j++) {
        addBigramWords(binaryDictionary, weak, target, DUMMY_PROBABILITY);
    }
    assertTrue("Strong bigram must be valid before overflow" + seedInfo,
            binaryDictionary.isValidBigram(strong, target));
    assertTrue("Weak bigram must be valid before overflow" + seedInfo,
            binaryDictionary.isValidBigram(weak, target));

    for (int i = 0; i < bigramCount; i++) {
        final int word0Index = random.nextInt(words.size());
        final String word0 = words.get(word0Index);
        // Pick a second index that is guaranteed to differ from word0Index.
        final int index = random.nextInt(words.size() - 1);
        final int word1Index = (index >= word0Index) ? index + 1 : index;
        final String word1 = words.get(word1Index);

        for (int j = 0; j < eachBigramTypedCount; j++) {
            addBigramWords(binaryDictionary, word0, word1, DUMMY_PROBABILITY);
        }
        if (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
            final int bigramCountBeforeGC =
                    Integer.parseInt(binaryDictionary.getPropertyForTests(
                            BinaryDictionary.BIGRAM_COUNT_QUERY));
            binaryDictionary.flushWithGC();
            final int bigramCountAfterGC =
                    Integer.parseInt(binaryDictionary.getPropertyForTests(
                            BinaryDictionary.BIGRAM_COUNT_QUERY));
            assertTrue("GC must reduce the bigram count. before: " + bigramCountBeforeGC
                    + ", after: " + bigramCountAfterGC + seedInfo,
                    bigramCountBeforeGC > bigramCountAfterGC);
            assertTrue("Strong bigram must survive GC" + seedInfo,
                    binaryDictionary.isValidBigram(strong, target));
            assertFalse("Weak bigram must be discarded by GC" + seedInfo,
                    binaryDictionary.isValidBigram(weak, target));
            break;
        }
    }
}
} }