am 60388956: Discard useless bigrams when overflowing.
* commit '603889561127457f553ca328d6b5bb392d921681': Discard useless bigrams when overflowing.
main
commit
03fdb22e7c
|
@ -102,7 +102,7 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
|
||||||
.getValidUnigramCount();
|
.getValidUnigramCount();
|
||||||
if (headerPolicy->isDecayingDict()
|
if (headerPolicy->isDecayingDict()
|
||||||
&& unigramCount > ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC) {
|
&& unigramCount > ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC) {
|
||||||
if (!turncateUnigrams(&ptNodeReader, &ptNodeWriter,
|
if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter,
|
||||||
ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC)) {
|
ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC)) {
|
||||||
AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount,
|
AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount,
|
||||||
ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC);
|
ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC);
|
||||||
|
@ -117,10 +117,14 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
|
||||||
&traversePolicyToUpdateBigramProbability)) {
|
&traversePolicyToUpdateBigramProbability)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount();
|
||||||
if (headerPolicy->isDecayingDict()
|
if (headerPolicy->isDecayingDict()
|
||||||
&& traversePolicyToUpdateBigramProbability.getValidBigramEntryCount()
|
&& bigramCount > ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC) {
|
||||||
> ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC) {
|
if (!truncateBigrams(ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC)) {
|
||||||
// TODO: Remove more bigrams.
|
AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount,
|
||||||
|
ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Mapping from positions in mBuffer to positions in bufferToWrite.
|
// Mapping from positions in mBuffer to positions in bufferToWrite.
|
||||||
|
@ -186,7 +190,7 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Ver4PatriciaTrieWritingHelper::turncateUnigrams(
|
bool Ver4PatriciaTrieWritingHelper::truncateUnigrams(
|
||||||
const Ver4PatriciaTrieNodeReader *const ptNodeReader,
|
const Ver4PatriciaTrieNodeReader *const ptNodeReader,
|
||||||
Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount) {
|
Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount) {
|
||||||
const TerminalPositionLookupTable *const terminalPosLookupTable =
|
const TerminalPositionLookupTable *const terminalPosLookupTable =
|
||||||
|
@ -222,6 +226,50 @@ bool Ver4PatriciaTrieWritingHelper::turncateUnigrams(
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool Ver4PatriciaTrieWritingHelper::truncateBigrams(const int maxBigramCount) {
|
||||||
|
const TerminalPositionLookupTable *const terminalPosLookupTable =
|
||||||
|
mBuffers->getTerminalPositionLookupTable();
|
||||||
|
const int nextTerminalId = terminalPosLookupTable->getNextTerminalId();
|
||||||
|
std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator>
|
||||||
|
priorityQueue;
|
||||||
|
BigramDictContent *const bigramDictContent = mBuffers->getMutableBigramDictContent();
|
||||||
|
for (int i = 0; i < nextTerminalId; ++i) {
|
||||||
|
const int bigramListPos = bigramDictContent->getBigramListHeadPos(i);
|
||||||
|
if (bigramListPos == NOT_A_DICT_POS) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
bool hasNext = true;
|
||||||
|
int readingPos = bigramListPos;
|
||||||
|
while (hasNext) {
|
||||||
|
const int entryPos = readingPos;
|
||||||
|
const BigramEntry bigramEntry =
|
||||||
|
bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
|
||||||
|
hasNext = bigramEntry.hasNext();
|
||||||
|
if (!bigramEntry.isValid()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const int probability = bigramEntry.hasHistoricalInfo() ?
|
||||||
|
ForgettingCurveUtils::decodeProbability(bigramEntry.getHistoricalInfo()) :
|
||||||
|
bigramEntry.getProbability();
|
||||||
|
priorityQueue.push(DictProbability(entryPos, probability,
|
||||||
|
bigramEntry.getHistoricalInfo()->getTimeStamp()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Delete bigrams.
|
||||||
|
while (static_cast<int>(priorityQueue.size()) > maxBigramCount) {
|
||||||
|
const int entryPos = priorityQueue.top().getDictPos();
|
||||||
|
const BigramEntry bigramEntry = bigramDictContent->getBigramEntry(entryPos);
|
||||||
|
const BigramEntry invalidatedBigramEntry = bigramEntry.getInvalidatedEntry();
|
||||||
|
if (!bigramDictContent->writeBigramEntry(&invalidatedBigramEntry, entryPos)) {
|
||||||
|
AKLOGE("Cannot write bigram entry to remove. pos: %d", entryPos);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
priorityQueue.pop();
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
|
bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
|
||||||
::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
|
::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
|
||||||
if (!ptNodeParams->isTerminal()) {
|
if (!ptNodeParams->isTerminal()) {
|
||||||
|
|
|
@ -65,7 +65,7 @@ class Ver4PatriciaTrieWritingHelper {
|
||||||
const TerminalPositionLookupTable::TerminalIdMap *const mTerminalIdMap;
|
const TerminalPositionLookupTable::TerminalIdMap *const mTerminalIdMap;
|
||||||
};
|
};
|
||||||
|
|
||||||
// For truncateUnigrams().
|
// For truncateUnigrams() and truncateBigrams().
|
||||||
class DictProbability {
|
class DictProbability {
|
||||||
public:
|
public:
|
||||||
DictProbability(const int dictPos, const int probability, const int timestamp)
|
DictProbability(const int dictPos, const int probability, const int timestamp)
|
||||||
|
@ -91,7 +91,7 @@ class Ver4PatriciaTrieWritingHelper {
|
||||||
int mTimestamp;
|
int mTimestamp;
|
||||||
};
|
};
|
||||||
|
|
||||||
// For truncateUnigrams().
|
// For truncateUnigrams() and truncateBigrams().
|
||||||
class DictProbabilityComparator {
|
class DictProbabilityComparator {
|
||||||
public:
|
public:
|
||||||
bool operator()(const DictProbability &left, const DictProbability &right) {
|
bool operator()(const DictProbability &left, const DictProbability &right) {
|
||||||
|
@ -112,9 +112,11 @@ class Ver4PatriciaTrieWritingHelper {
|
||||||
Ver4DictBuffers *const buffersToWrite, int *const outUnigramCount,
|
Ver4DictBuffers *const buffersToWrite, int *const outUnigramCount,
|
||||||
int *const outBigramCount);
|
int *const outBigramCount);
|
||||||
|
|
||||||
bool turncateUnigrams(const Ver4PatriciaTrieNodeReader *const ptNodeReader,
|
bool truncateUnigrams(const Ver4PatriciaTrieNodeReader *const ptNodeReader,
|
||||||
Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount);
|
Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount);
|
||||||
|
|
||||||
|
bool truncateBigrams(const int maxBigramCount);
|
||||||
|
|
||||||
Ver4DictBuffers *const mBuffers;
|
Ver4DictBuffers *const mBuffers;
|
||||||
};
|
};
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
|
|
|
@ -474,4 +474,83 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
|
||||||
assertEquals(0, Integer.parseInt(binaryDictionary.getPropertyForTests(
|
assertEquals(0, Integer.parseInt(binaryDictionary.getPropertyForTests(
|
||||||
BinaryDictionary.BIGRAM_COUNT_QUERY)));
|
BinaryDictionary.BIGRAM_COUNT_QUERY)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testOverflowBigrams() {
|
||||||
|
testOverflowBigrams(FormatSpec.VERSION4);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void testOverflowBigrams(final int formatVersion) {
|
||||||
|
final int bigramCount = 20000;
|
||||||
|
final int unigramCount = 1000;
|
||||||
|
final int unigramTypedCount = 20;
|
||||||
|
final int eachBigramTypedCount = 5;
|
||||||
|
final int strongBigramTypedCount = 20;
|
||||||
|
final int weakBigramTypedCount = 1;
|
||||||
|
final int codePointSetSize = 50;
|
||||||
|
final long seed = System.currentTimeMillis();
|
||||||
|
final Random random = new Random(seed);
|
||||||
|
|
||||||
|
File dictFile = null;
|
||||||
|
try {
|
||||||
|
dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary", formatVersion);
|
||||||
|
} catch (IOException e) {
|
||||||
|
fail("IOException while writing an initial dictionary : " + e);
|
||||||
|
}
|
||||||
|
BinaryDictionary binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
|
||||||
|
0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
|
||||||
|
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
|
||||||
|
setCurrentTime(binaryDictionary, mCurrentTime);
|
||||||
|
final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
|
||||||
|
|
||||||
|
final ArrayList<String> words = new ArrayList<String>();
|
||||||
|
for (int i = 0; i < unigramCount; i++) {
|
||||||
|
final String word = CodePointUtils.generateWord(random, codePointSet);
|
||||||
|
words.add(word);
|
||||||
|
for (int j = 0; j < unigramTypedCount; j++) {
|
||||||
|
addUnigramWord(binaryDictionary, word, DUMMY_PROBABILITY);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
final String strong = "strong";
|
||||||
|
final String weak = "weak";
|
||||||
|
final String target = "target";
|
||||||
|
for (int j = 0; j < unigramTypedCount; j++) {
|
||||||
|
addUnigramWord(binaryDictionary, strong, DUMMY_PROBABILITY);
|
||||||
|
addUnigramWord(binaryDictionary, weak, DUMMY_PROBABILITY);
|
||||||
|
addUnigramWord(binaryDictionary, target, DUMMY_PROBABILITY);
|
||||||
|
}
|
||||||
|
binaryDictionary.flushWithGC();
|
||||||
|
for (int j = 0; j < strongBigramTypedCount; j++) {
|
||||||
|
addBigramWords(binaryDictionary, strong, target, DUMMY_PROBABILITY);
|
||||||
|
}
|
||||||
|
for (int j = 0; j < weakBigramTypedCount; j++) {
|
||||||
|
addBigramWords(binaryDictionary, weak, target, DUMMY_PROBABILITY);
|
||||||
|
}
|
||||||
|
assertTrue(binaryDictionary.isValidBigram(strong, target));
|
||||||
|
assertTrue(binaryDictionary.isValidBigram(weak, target));
|
||||||
|
|
||||||
|
for (int i = 0; i < bigramCount; i++) {
|
||||||
|
final int word0Index = random.nextInt(words.size());
|
||||||
|
final String word0 = words.get(word0Index);
|
||||||
|
final int index = random.nextInt(words.size() - 1);
|
||||||
|
final int word1Index = (index >= word0Index) ? index + 1 : index;
|
||||||
|
final String word1 = words.get(word1Index);
|
||||||
|
|
||||||
|
for (int j = 0; j < eachBigramTypedCount; j++) {
|
||||||
|
addBigramWords(binaryDictionary, word0, word1, DUMMY_PROBABILITY);
|
||||||
|
}
|
||||||
|
if (binaryDictionary.needsToRunGC(true /* mindsBlockByGC */)) {
|
||||||
|
final int bigramCountBeforeGC =
|
||||||
|
Integer.parseInt(binaryDictionary.getPropertyForTests(
|
||||||
|
BinaryDictionary.BIGRAM_COUNT_QUERY));
|
||||||
|
binaryDictionary.flushWithGC();
|
||||||
|
final int bigramCountAfterGC =
|
||||||
|
Integer.parseInt(binaryDictionary.getPropertyForTests(
|
||||||
|
BinaryDictionary.BIGRAM_COUNT_QUERY));
|
||||||
|
assertTrue(bigramCountBeforeGC > bigramCountAfterGC);
|
||||||
|
assertTrue(binaryDictionary.isValidBigram(strong, target));
|
||||||
|
assertFalse(binaryDictionary.isValidBigram(weak, target));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue