Merge "Implement ver4 dictionary GC."

This commit is contained in:
Keisuke Kuroyanagi 2013-11-22 10:53:31 +00:00 committed by Android (Google) Code Review
commit d541d282a4
19 changed files with 329 additions and 46 deletions

View file

@ -94,7 +94,7 @@ bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTarget
bool Ver4BigramListPolicy::removeEntry(const int terminalId, const int targetTerminalId) { bool Ver4BigramListPolicy::removeEntry(const int terminalId, const int targetTerminalId) {
const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId); const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId);
if (bigramListPos == NOT_A_DICT_POS) { if (bigramListPos == NOT_A_DICT_POS) {
// Bigram list does't exist. // Bigram list doesn't exist.
return false; return false;
} }
const int entryPosToUpdate = getEntryPosToUpdate(targetTerminalId, bigramListPos); const int entryPosToUpdate = getEntryPosToUpdate(targetTerminalId, bigramListPos);
@ -118,12 +118,62 @@ bool Ver4BigramListPolicy::removeEntry(const int terminalId, const int targetTer
Ver4DictConstants::NOT_A_TERMINAL_ID /* targetTerminalId */, &writingPos); Ver4DictConstants::NOT_A_TERMINAL_ID /* targetTerminalId */, &writingPos);
} }
// Walks the whole bigram list that belongs to terminalId. Every entry whose target terminal no
// longer has a PtNode position in the terminal position lookup table is invalidated in place by
// rewriting it with NOT_A_TERMINAL_ID as its target. Entries that are already invalid are skipped.
//
// terminalId:     terminal whose bigram list is processed.
// outBigramCount: may be NULL. When non-NULL, the number of entries that are still valid after
//                 this call is ADDED to *outBigramCount, so the caller must initialize it.
// Returns false as soon as rewriting an entry fails; returns true otherwise, including when the
// terminal has no bigram list at all.
bool Ver4BigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(const int terminalId,
        int *const outBigramCount) {
    const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId);
    if (bigramListPos == NOT_A_DICT_POS) {
        // Bigram list doesn't exist.
        return true;
    }
    int validEntryCount = 0;
    bool hasNext = true;
    int readingPos = bigramListPos;
    while (hasNext) {
        // Remember where this entry starts; readingPos is advanced past it by the read below.
        const int entryPos = readingPos;
        int probability = NOT_A_PROBABILITY;
        int targetTerminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
        mBigramDictContent->getBigramEntryAndAdvancePosition(&probability, &hasNext,
                &targetTerminalId, &readingPos);
        if (targetTerminalId == Ver4DictConstants::NOT_A_TERMINAL_ID) {
            // Already invalidated entry.
            continue;
        }
        const int targetPtNodePos = mTerminalPositionLookupTable->getTerminalPtNodePosition(
                targetTerminalId);
        if (targetPtNodePos == NOT_A_DICT_POS) {
            // Invalidate bigram entry. Keep iterating afterwards: the remaining entries of the
            // list must be checked too, so only a write failure ends the loop early.
            int writingPos = entryPos;
            if (!mBigramDictContent->writeBigramEntryAndAdvancePosition(probability, hasNext,
                    Ver4DictConstants::NOT_A_TERMINAL_ID /* targetTerminalId */, &writingPos)) {
                return false;
            }
            continue;
        }
        ++validEntryCount;
    }
    if (outBigramCount) {
        *outBigramCount += validEntryCount;
    }
    return true;
}
// Counts the entries in the bigram list of terminalId whose target terminal id is not
// NOT_A_TERMINAL_ID. Returns 0 when the terminal has no bigram list.
int Ver4BigramListPolicy::getBigramEntryConut(const int terminalId) {
    const int listHeadPos = mBigramDictContent->getBigramListHeadPos(terminalId);
    if (listHeadPos == NOT_A_DICT_POS) {
        // There is no bigram list for this terminal.
        return 0;
    }
    int validEntryCount = 0;
    int pos = listHeadPos;
    bool hasMoreEntries = true;
    // At least one entry is always read; the read itself says whether another one follows.
    do {
        int targetId = Ver4DictConstants::NOT_A_TERMINAL_ID;
        mBigramDictContent->getBigramEntryAndAdvancePosition(0 /* probability */,
                &hasMoreEntries, &targetId, &pos);
        if (targetId == Ver4DictConstants::NOT_A_TERMINAL_ID) {
            // Invalid entry; not counted.
            continue;
        }
        ++validEntryCount;
    } while (hasMoreEntries);
    return validEntryCount;
}
int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind, int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind,
const int bigramListPos) const { const int bigramListPos) const {
bool hasNext = true; bool hasNext = true;
int invalidEntryPos = NOT_A_DICT_POS; int invalidEntryPos = NOT_A_DICT_POS;
int readingPos = bigramListPos; int readingPos = bigramListPos;
while(hasNext) { while (hasNext) {
const int entryPos = readingPos; const int entryPos = readingPos;
int targetTerminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; int targetTerminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
mBigramDictContent->getBigramEntryAndAdvancePosition(0 /* probability */, &hasNext, mBigramDictContent->getBigramEntryAndAdvancePosition(0 /* probability */, &hasNext,

View file

@ -44,6 +44,11 @@ class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy {
bool removeEntry(const int terminalId, const int targetTerminalId); bool removeEntry(const int terminalId, const int targetTerminalId);
bool updateAllBigramEntriesAndDeleteUselessEntries(const int terminalId,
int *const outBigramCount);
int getBigramEntryConut(const int terminalId);
private: private:
DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4BigramListPolicy); DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4BigramListPolicy);

View file

@ -148,6 +148,14 @@ class PtNodeParams {
return PatriciaTrieReadingUtils::isNotAWord(mFlags); return PatriciaTrieReadingUtils::isNotAWord(mFlags);
} }
// Reports whether this PtNode's flags (mFlags) are marked as having bigrams, as decided by
// PatriciaTrieReadingUtils::hasBigrams().
AK_FORCE_INLINE bool hasBigrams() const {
    const bool bigramsFlagged = PatriciaTrieReadingUtils::hasBigrams(mFlags);
    return bigramsFlagged;
}
// Reports whether this PtNode's flags (mFlags) are marked as having shortcut targets, as decided
// by PatriciaTrieReadingUtils::hasShortcutTargets().
AK_FORCE_INLINE bool hasShortcutTargets() const {
    const bool shortcutsFlagged = PatriciaTrieReadingUtils::hasShortcutTargets(mFlags);
    return shortcutsFlagged;
}
// Parent node position // Parent node position
AK_FORCE_INLINE int getParentPos() const { AK_FORCE_INLINE int getParentPos() const {
return mParentPos; return mParentPos;

View file

@ -67,16 +67,13 @@ bool DynamicPatriciaTrieGcEventListeners
bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateBigramProbability bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateBigramProbability
::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
if (!ptNodeParams->isDeleted()) { if (!ptNodeParams->isDeleted() && ptNodeParams->hasBigrams()) {
int pos = ptNodeParams->getBigramsPos(); int bigramEntryCount = 0;
if (pos != NOT_A_DICT_POS) { if (!mPtNodeWriter->updateAllBigramEntriesAndDeleteUselessEntries(ptNodeParams,
int bigramEntryCount = 0; &bigramEntryCount)) {
if (!mPtNodeWriter->updateAllBigramEntriesAndDeleteUselessEntries(ptNodeParams, return false;
&bigramEntryCount)) {
return false;
}
mValidBigramEntryCount += bigramEntryCount;
} }
mValidBigramEntryCount += bigramEntryCount;
} }
return true; return true;
} }

View file

@ -258,8 +258,7 @@ const PtNodeParams DynamicPatriciaTrieUpdatingHelper::getUpdatedPtNodeParams(
const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags( const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
originalPtNodeParams->isBlacklisted(), originalPtNodeParams->isNotAWord(), originalPtNodeParams->isBlacklisted(), originalPtNodeParams->isNotAWord(),
probability != NOT_A_PROBABILITY /* isTerminal */, probability != NOT_A_PROBABILITY /* isTerminal */,
originalPtNodeParams->getShortcutPos() != NOT_A_DICT_POS /* hasShortcutTargets */, originalPtNodeParams->hasShortcutTargets(), originalPtNodeParams->hasBigrams(),
originalPtNodeParams->getBigramsPos() != NOT_A_DICT_POS /* hasBigrams */,
codePointCount > 1 /* hasMultipleChars */, CHILDREN_POSITION_FIELD_SIZE); codePointCount > 1 /* hasMultipleChars */, CHILDREN_POSITION_FIELD_SIZE);
return PtNodeParams(originalPtNodeParams, flags, parentPos, codePointCount, codePoints, return PtNodeParams(originalPtNodeParams, flags, parentPos, codePointCount, codePoints,
probability); probability);

View file

@ -59,7 +59,7 @@ bool BigramDictContent::copyBigramList(const int bigramListPos, const int toPos)
bool hasNext = true; bool hasNext = true;
int readingPos = bigramListPos; int readingPos = bigramListPos;
int writingPos = toPos; int writingPos = toPos;
while(hasNext) { while (hasNext) {
int probability = NOT_A_PROBABILITY; int probability = NOT_A_PROBABILITY;
int targetTerminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; int targetTerminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
getBigramEntryAndAdvancePosition(&probability, &hasNext, &targetTerminalId, getBigramEntryAndAdvancePosition(&probability, &hasNext, &targetTerminalId,

View file

@ -41,25 +41,29 @@ class ProbabilityDictContent : public SingleDictContent {
} }
bool setProbability(const int terminalId, const int probability) { bool setProbability(const int terminalId, const int probability) {
if (terminalId < 0 || terminalId > getSize()) { if (terminalId < 0) {
return false; return false;
} }
if (terminalId == getSize()) { if (terminalId >= getSize()) {
// Write new entry. // Write new entry.
int flagWritingPos = terminalId * (Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE int writingPos = getBuffer()->getTailPosition();
+ Ver4DictConstants::PROBABILITY_SIZE); while (writingPos <= getEntryPos(terminalId)) {
const int dummyFlags = 0; const int dummyFlags = 0;
// Write dummy flags. if (!getWritableBuffer()->writeUintAndAdvancePosition(dummyFlags,
if (!getWritableBuffer()->writeUintAndAdvancePosition(dummyFlags, Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, &writingPos)) {
Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, &flagWritingPos)) { return false;
return false; }
const int dummyProbability = 0;
if (!getWritableBuffer()->writeUintAndAdvancePosition(dummyProbability,
Ver4DictConstants::PROBABILITY_SIZE, &writingPos)) {
return false;
}
} }
} }
int probabilityWritingPos = terminalId * (Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE const int probabilityWritingPos = getEntryPos(terminalId)
+ Ver4DictConstants::PROBABILITY_SIZE) + Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE;
+ Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE; return getWritableBuffer()->writeUint(probability,
return getWritableBuffer()->writeUintAndAdvancePosition(probability, Ver4DictConstants::PROBABILITY_SIZE, probabilityWritingPos);
Ver4DictConstants::PROBABILITY_SIZE, &probabilityWritingPos);
} }
bool flushToFile(const char *const dictDirPath) const { bool flushToFile(const char *const dictDirPath) const {
@ -69,6 +73,11 @@ class ProbabilityDictContent : public SingleDictContent {
private: private:
DISALLOW_COPY_AND_ASSIGN(ProbabilityDictContent); DISALLOW_COPY_AND_ASSIGN(ProbabilityDictContent);
// Position of the entry for terminalId: the terminal id multiplied by the size of one entry,
// where one entry is a flags field followed by a probability field.
int getEntryPos(const int terminalId) const {
    const int entrySize = Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE
            + Ver4DictConstants::PROBABILITY_SIZE;
    return terminalId * entrySize;
}
int getSize() const { int getSize() const {
return getBuffer()->getTailPosition() / (Ver4DictConstants::PROBABILITY_SIZE return getBuffer()->getTailPosition() / (Ver4DictConstants::PROBABILITY_SIZE
+ Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE); + Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE);

View file

@ -18,6 +18,22 @@
namespace latinime { namespace latinime {
// Copies the three buffers of sparseTableDictContent into this object's buffers, in the order
// lookup table, address table, content. Copying stops at the first buffer whose copy() fails.
// Returns true only when all three copies succeed.
bool SparseTableDictContent::copyContent(
        const SparseTableDictContent *const sparseTableDictContent) {
    // && short-circuits, so a later buffer is not touched once an earlier copy has failed.
    return mExpandableLookupTableBuffer.copy(
                    &sparseTableDictContent->mExpandableLookupTableBuffer)
            && mExpandableAddressTableBuffer.copy(
                    &sparseTableDictContent->mExpandableAddressTableBuffer)
            && mExpandableContentBuffer.copy(
                    &sparseTableDictContent->mExpandableContentBuffer);
}
bool SparseTableDictContent::flush(const char *const dictDirPath, bool SparseTableDictContent::flush(const char *const dictDirPath,
const char *const lookupTableFileName, const char *const addressTableFileName, const char *const lookupTableFileName, const char *const addressTableFileName,
const char *const contentFileName) const { const char *const contentFileName) const {

View file

@ -75,6 +75,8 @@ class SparseTableDictContent : public DictContent {
|| mExpandableContentBuffer.isNearSizeLimit(); || mExpandableContentBuffer.isNearSizeLimit();
} }
bool copyContent(const SparseTableDictContent *const sparseTableDictContent);
protected: protected:
SparseTable *getUpdatableAddressLookupTable() { SparseTable *getUpdatableAddressLookupTable() {
return &mAddressLookupTable; return &mAddressLookupTable;

View file

@ -44,23 +44,27 @@ class TerminalPositionLookupTable : public SingleDictContent {
if (terminalId < 0 || terminalId >= mSize) { if (terminalId < 0 || terminalId >= mSize) {
return NOT_A_DICT_POS; return NOT_A_DICT_POS;
} }
const int readingPos = terminalId * Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE;
return getBuffer()->readUint(Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, return getBuffer()->readUint(Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE,
readingPos) - mHeaderRegionSize; getEntryPos(terminalId)) - mHeaderRegionSize;
} }
bool setTerminalPtNodePosition(const int terminalId, const int terminalPtNodePos) { bool setTerminalPtNodePosition(const int terminalId, const int terminalPtNodePos) {
if (terminalId < 0 || terminalId > mSize) { if (terminalId < 0) {
return NOT_A_DICT_POS; return NOT_A_DICT_POS;
} }
if (terminalId == mSize) { if (terminalId >= mSize) {
// Use new terminal id. int writingPos = getBuffer()->getTailPosition();
mSize += 1; while(writingPos <= getEntryPos(terminalId)) {
// Write new entry.
getWritableBuffer()->writeUintAndAdvancePosition(
Ver4DictConstants::NOT_A_TERMINAL_ADDRESS,
Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, &writingPos);
}
mSize = getBuffer()->getTailPosition()
/ Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE;
} }
int writingPos = terminalId * Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE; return getWritableBuffer()->writeUint(terminalPtNodePos + mHeaderRegionSize,
return getWritableBuffer()->writeUintAndAdvancePosition( Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(terminalId));
terminalPtNodePos + mHeaderRegionSize,
Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, &writingPos);
} }
int getNextTerminalId() const { int getNextTerminalId() const {
@ -94,6 +98,10 @@ class TerminalPositionLookupTable : public SingleDictContent {
private: private:
DISALLOW_COPY_AND_ASSIGN(TerminalPositionLookupTable); DISALLOW_COPY_AND_ASSIGN(TerminalPositionLookupTable);
// Position of the entry for terminalId: the terminal id multiplied by the size of one address
// field (TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE).
int getEntryPos(const int terminalId) const {
    const int entrySize = Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE;
    return terminalId * entrySize;
}
int mSize; int mSize;
const int mHeaderRegionSize; const int mHeaderRegionSize;
}; };

View file

@ -93,6 +93,10 @@ class Ver4DictBuffers {
return &mBigramDictContent; return &mBigramDictContent;
} }
// Non-const access to the shortcut dict content owned by this object (mShortcutDictContent).
AK_FORCE_INLINE ShortcutDictContent *getUpdatableShortcutDictContent() {
    ShortcutDictContent *const shortcutDictContent = &mShortcutDictContent;
    return shortcutDictContent;
}
AK_FORCE_INLINE const ShortcutDictContent *getShortcutDictContent() const { AK_FORCE_INLINE const ShortcutDictContent *getShortcutDictContent() const {
return &mShortcutDictContent; return &mShortcutDictContent;
} }

View file

@ -41,6 +41,7 @@ const int Ver4DictConstants::NOT_A_TERMINAL_ID = -1;
const int Ver4DictConstants::PROBABILITY_SIZE = 1; const int Ver4DictConstants::PROBABILITY_SIZE = 1;
const int Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE = 1; const int Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE = 1;
const int Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3; const int Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3;
const int Ver4DictConstants::NOT_A_TERMINAL_ADDRESS = 0;
const int Ver4DictConstants::TERMINAL_ID_FIELD_SIZE = 4; const int Ver4DictConstants::TERMINAL_ID_FIELD_SIZE = 4;
const int Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4; const int Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4;

View file

@ -41,6 +41,7 @@ class Ver4DictConstants {
static const int PROBABILITY_SIZE; static const int PROBABILITY_SIZE;
static const int FLAGS_IN_PROBABILITY_FILE_SIZE; static const int FLAGS_IN_PROBABILITY_FILE_SIZE;
static const int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE; static const int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE;
static const int NOT_A_TERMINAL_ADDRESS;
static const int TERMINAL_ID_FIELD_SIZE; static const int TERMINAL_ID_FIELD_SIZE;
static const int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE; static const int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE;

View file

@ -45,8 +45,17 @@ bool Ver4PatriciaTrieNodeWriter::markPtNodeAsDeleted(
true /* isDeleted */); true /* isDeleted */);
int writingPos = toBeUpdatedPtNodeParams->getHeadPos(); int writingPos = toBeUpdatedPtNodeParams->getHeadPos();
// Update flags. // Update flags.
return DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags, if (!DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
&writingPos); &writingPos)) {
return false;
}
if (toBeUpdatedPtNodeParams->getTerminalId() != NOT_A_DICT_POS) {
// The PtNode is a terminal. Delete entry from the terminal position lookup table.
return mBuffers->getUpdatableTerminalPositionLookupTable()->setTerminalPtNodePosition(
toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */);
} else {
return true;
}
} }
bool Ver4PatriciaTrieNodeWriter::markPtNodeAsMoved( bool Ver4PatriciaTrieNodeWriter::markPtNodeAsMoved(
@ -171,7 +180,7 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition(
PatriciaTrieReadingUtils::NodeFlags nodeFlags = PatriciaTrieReadingUtils::NodeFlags nodeFlags =
PatriciaTrieReadingUtils::createAndGetFlags(ptNodeParams->isBlacklisted(), PatriciaTrieReadingUtils::createAndGetFlags(ptNodeParams->isBlacklisted(),
ptNodeParams->isNotAWord(), isTerminal, ptNodeParams->isNotAWord(), isTerminal,
false /* hasShortcutTargets */, false /* hasBigrams */, ptNodeParams->hasShortcutTargets(), ptNodeParams->hasBigrams(),
ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */, ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */,
CHILDREN_POSITION_FIELD_SIZE); CHILDREN_POSITION_FIELD_SIZE);
int flagsFieldPos = nodePos; int flagsFieldPos = nodePos;
@ -198,16 +207,49 @@ bool Ver4PatriciaTrieNodeWriter::removeBigramEntry(
bool Ver4PatriciaTrieNodeWriter::updateAllBigramEntriesAndDeleteUselessEntries( bool Ver4PatriciaTrieNodeWriter::updateAllBigramEntriesAndDeleteUselessEntries(
const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount) { const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount) {
// TODO: Implement. return mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries(
return false; sourcePtNodeParams->getTerminalId(), outBigramEntryCount);
} }
bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields( bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields(
const PtNodeParams *const toBeUpdatedPtNodeParams, const PtNodeParams *const toBeUpdatedPtNodeParams,
const DictPositionRelocationMap *const dictPositionRelocationMap, const DictPositionRelocationMap *const dictPositionRelocationMap,
int *const outBigramEntryCount) { int *const outBigramEntryCount) {
// TODO: Implement. int parentPos = toBeUpdatedPtNodeParams->getParentPos();
return false; if (parentPos != NOT_A_DICT_POS) {
PtNodeWriter::PtNodePositionRelocationMap::const_iterator it =
dictPositionRelocationMap->mPtNodePositionRelocationMap.find(parentPos);
if (it != dictPositionRelocationMap->mPtNodePositionRelocationMap.end()) {
parentPos = it->second;
}
}
int writingPos = toBeUpdatedPtNodeParams->getHeadPos()
+ DynamicPatriciaTrieWritingUtils::NODE_FLAG_FIELD_SIZE;
// Write updated parent offset.
if (!DynamicPatriciaTrieWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer,
parentPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) {
return false;
}
// Updates children position.
int childrenPos = toBeUpdatedPtNodeParams->getChildrenPos();
if (childrenPos != NOT_A_DICT_POS) {
PtNodeWriter::PtNodeArrayPositionRelocationMap::const_iterator it =
dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.find(childrenPos);
if (it != dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.end()) {
childrenPos = it->second;
}
}
if (!updateChildrenPosition(toBeUpdatedPtNodeParams, childrenPos)) {
return false;
}
// Counts bigram entries.
if (outBigramEntryCount) {
*outBigramEntryCount = mBigramPolicy->getBigramEntryConut(
toBeUpdatedPtNodeParams->getTerminalId());
}
return true;
} }
} }

View file

@ -223,7 +223,15 @@ void Ver4PatriciaTriePolicy::flush(const char *const filePath) {
} }
void Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { void Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) {
// TODO: Implement. if (!mBuffers.get()->isUpdatable()) {
AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary.");
return;
}
const bool needsToDecay = mHeaderPolicy.isDecayingDict()
&& (mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay(
false /* mindsBlockByDecay */, mUnigramCount, mBigramCount, &mHeaderPolicy));
mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath, &mHeaderPolicy, needsToDecay);
mNeedsToDecayForTesting = false;
} }
bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {

View file

@ -20,6 +20,7 @@
#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h" #include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h"
#include "suggest/policyimpl/dictionary/header/header_policy.h" #include "suggest/policyimpl/dictionary/header/header_policy.h"
#include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_gc_event_listeners.h"
#include "suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h" #include "suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h"
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h"
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
@ -91,7 +92,78 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
DynamicPatriciaTrieReadingHelper readingHelper(mBuffers->getTrieBuffer(), &ptNodeReader); DynamicPatriciaTrieReadingHelper readingHelper(mBuffers->getTrieBuffer(), &ptNodeReader);
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
DynamicPatriciaTrieGcEventListeners
::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
headerPolicy, &ptNodeWriter, mBuffers->getWritableTrieBuffer(),
needsToDecay);
if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
&traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) {
return false;
}
if (needsToDecay && traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
.getValidUnigramCount() > ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC) {
// TODO: Remove more unigrams.
}
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateBigramProbability
traversePolicyToUpdateBigramProbability(&ptNodeWriter);
if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
&traversePolicyToUpdateBigramProbability)) {
return false;
}
if (needsToDecay && traversePolicyToUpdateBigramProbability.getValidBigramEntryCount()
> ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC) {
// TODO: Remove more bigrams.
}
// Mapping from positions in mBuffer to positions in bufferToWrite.
PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap;
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(),
buffersToWrite, &ptNodeReader, &bigramPolicy, &shortcutPolicy);
DynamicPatriciaTrieGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers,
buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap);
if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
&traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) {
return false;
}
// Create policy instances for the GCed dictionary.
Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(),
buffersToWrite->getProbabilityDictContent());
Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getUpdatableBigramDictContent(),
buffersToWrite->getTerminalPositionLookupTable());
Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getShortcutDictContent(),
buffersToWrite->getTerminalPositionLookupTable());
Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(),
buffersToWrite, &newPtNodeReader, &newBigramPolicy, &newShortcutPolicy);
if(!buffersToWrite->getUpdatableBigramDictContent()->copyContent(
mBuffers->getBigramDictContent())) {
return false;
}
if(!buffersToWrite->getUpdatableShortcutDictContent()->copyContent(
mBuffers->getShortcutDictContent())) {
return false;
}
DynamicPatriciaTrieReadingHelper newDictReadingHelper(buffersToWrite->getTrieBuffer(),
&newPtNodeReader);
newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateAllPositionFields
traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap);
if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
&traversePolicyToUpdateAllPositionFields)) {
return false;
}
// TODO: GC for dict contents.
*outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount();
*outBigramCount = traversePolicyToUpdateAllPositionFields.getBigramCount();
return true; return true;
} }

View file

@ -131,4 +131,21 @@ bool BufferWithExtendableBuffer::checkAndPrepareWriting(const int pos, const int
return true; return true;
} }
// Copies the bytes [0, sourceBuffer->getTailPosition()) of sourceBuffer into this buffer at the
// same positions. Data is moved in chunks of at most sizeof(uint32_t) bytes via readUint() /
// writeUint(); the final chunk may be shorter. Returns false as soon as a write fails.
bool BufferWithExtendableBuffer::copy(const BufferWithExtendableBuffer *const sourceBuffer) {
    // The source tail position is sampled once, before any byte is written.
    const int sourceTailPos = sourceBuffer->getTailPosition();
    const int maxChunkSize = sizeof(uint32_t);
    for (int pos = 0; pos < sourceTailPos; ) {
        const int bytesLeft = sourceTailPos - pos;
        int chunkSize = maxChunkSize;
        if (bytesLeft < maxChunkSize) {
            // Last, partial chunk.
            chunkSize = bytesLeft;
        }
        const uint32_t chunk = sourceBuffer->readUint(chunkSize, pos);
        if (!writeUint(chunk, chunkSize, pos)) {
            return false;
        }
        pos += chunkSize;
    }
    return true;
}
} }

View file

@ -100,6 +100,8 @@ class BufferWithExtendableBuffer {
bool writeCodePointsAndAdvancePosition(const int *const codePoints, const int codePointCount, bool writeCodePointsAndAdvancePosition(const int *const codePoints, const int codePointCount,
const bool writesTerminator, int *const pos); const bool writesTerminator, int *const pos);
bool copy(const BufferWithExtendableBuffer *const sourceBuffer);
private: private:
DISALLOW_COPY_AND_ASSIGN(BufferWithExtendableBuffer); DISALLOW_COPY_AND_ASSIGN(BufferWithExtendableBuffer);

View file

@ -297,4 +297,46 @@ public class Ver4BinaryDictionaryTests extends AndroidTestCase {
binaryDictionary.close(); binaryDictionary.close();
} }
/**
 * Adds three unigrams and four bigrams to an empty updatable dictionary, flushes it with GC,
 * reopens it from the same file and checks that the unigram frequencies and bigram
 * probabilities are still reported, that three bigrams which were never added are not valid,
 * and that a second flushWithGC() can be issued on the reopened dictionary.
 */
public void testFlushWithGCDictionary() {
    final String version = Long.toString(System.currentTimeMillis());
    File dictFile = null;
    try {
        dictFile = createEmptyDictionaryAndGetTrieFile(version);
    } catch (IOException e) {
        fail("IOException while writing an initial dictionary : " + e);
    }

    final int unigramProbability = 100;
    final int bigramProbability = 10;
    final String[] words = { "aaa", "abb", "bcc" };
    // Each pair is { previous word, following word }.
    final String[][] addedBigrams = {
        { "aaa", "abb" }, { "aaa", "bcc" }, { "abb", "aaa" }, { "abb", "bcc" }
    };
    final String[][] neverAddedBigrams = {
        { "bcc", "aaa" }, { "bcc", "bbc" }, { "aaa", "aaa" }
    };

    // Populate the dictionary, then write it out with GC.
    BinaryDictionary dict = new BinaryDictionary(dictFile.getAbsolutePath(),
            0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
            Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
    for (final String word : words) {
        dict.addUnigramWord(word, unigramProbability);
    }
    for (final String[] pair : addedBigrams) {
        dict.addBigramWords(pair[0], pair[1], bigramProbability);
    }
    dict.flushWithGC();
    dict.close();

    // Reopen the flushed file and verify its content.
    dict = new BinaryDictionary(dictFile.getAbsolutePath(),
            0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
            Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
    final int expectedBigramProbability = dict.calculateProbability(unigramProbability,
            bigramProbability);
    for (final String word : words) {
        assertEquals(unigramProbability, dict.getFrequency(word));
    }
    for (final String[] pair : addedBigrams) {
        assertEquals(expectedBigramProbability, dict.getBigramProbability(pair[0], pair[1]));
    }
    for (final String[] pair : neverAddedBigrams) {
        assertEquals(false, dict.isValidBigram(pair[0], pair[1]));
    }
    dict.flushWithGC();
    dict.close();
}
} }