Merge "Implement ver4 dictionary GC."
This commit is contained in:
commit
d541d282a4
19 changed files with 329 additions and 46 deletions
|
@ -94,7 +94,7 @@ bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTarget
|
|||
bool Ver4BigramListPolicy::removeEntry(const int terminalId, const int targetTerminalId) {
|
||||
const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId);
|
||||
if (bigramListPos == NOT_A_DICT_POS) {
|
||||
// Bigram list does't exist.
|
||||
// Bigram list doesn't exist.
|
||||
return false;
|
||||
}
|
||||
const int entryPosToUpdate = getEntryPosToUpdate(targetTerminalId, bigramListPos);
|
||||
|
@ -118,12 +118,62 @@ bool Ver4BigramListPolicy::removeEntry(const int terminalId, const int targetTer
|
|||
Ver4DictConstants::NOT_A_TERMINAL_ID /* targetTerminalId */, &writingPos);
|
||||
}
|
||||
|
||||
bool Ver4BigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(const int terminalId,
|
||||
int *const outBigramCount) {
|
||||
const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId);
|
||||
if (bigramListPos == NOT_A_DICT_POS) {
|
||||
// Bigram list doesn't exist.
|
||||
return true;
|
||||
}
|
||||
bool hasNext = true;
|
||||
int readingPos = bigramListPos;
|
||||
while (hasNext) {
|
||||
const int entryPos = readingPos;
|
||||
int probability = NOT_A_PROBABILITY;
|
||||
int targetTerminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
|
||||
mBigramDictContent->getBigramEntryAndAdvancePosition(&probability, &hasNext,
|
||||
&targetTerminalId, &readingPos);
|
||||
if (targetTerminalId == Ver4DictConstants::NOT_A_TERMINAL_ID) {
|
||||
continue;
|
||||
}
|
||||
const int targetPtNodePos = mTerminalPositionLookupTable->getTerminalPtNodePosition(
|
||||
targetTerminalId);
|
||||
if (targetPtNodePos == NOT_A_DICT_POS) {
|
||||
// Invalidate bigram entry.
|
||||
int writingPos = entryPos;
|
||||
return mBigramDictContent->writeBigramEntryAndAdvancePosition(probability, hasNext,
|
||||
Ver4DictConstants::NOT_A_TERMINAL_ID /* targetTerminalId */, &writingPos);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int Ver4BigramListPolicy::getBigramEntryConut(const int terminalId) {
|
||||
const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId);
|
||||
if (bigramListPos == NOT_A_DICT_POS) {
|
||||
// Bigram list doesn't exist.
|
||||
return 0;
|
||||
}
|
||||
int bigramCount = 0;
|
||||
bool hasNext = true;
|
||||
int readingPos = bigramListPos;
|
||||
while (hasNext) {
|
||||
int targetTerminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
|
||||
mBigramDictContent->getBigramEntryAndAdvancePosition(0 /* probability */, &hasNext,
|
||||
&targetTerminalId, &readingPos);
|
||||
if (targetTerminalId != Ver4DictConstants::NOT_A_TERMINAL_ID) {
|
||||
bigramCount++;
|
||||
}
|
||||
}
|
||||
return bigramCount;
|
||||
}
|
||||
|
||||
int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind,
|
||||
const int bigramListPos) const {
|
||||
bool hasNext = true;
|
||||
int invalidEntryPos = NOT_A_DICT_POS;
|
||||
int readingPos = bigramListPos;
|
||||
while(hasNext) {
|
||||
while (hasNext) {
|
||||
const int entryPos = readingPos;
|
||||
int targetTerminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
|
||||
mBigramDictContent->getBigramEntryAndAdvancePosition(0 /* probability */, &hasNext,
|
||||
|
|
|
@ -44,6 +44,11 @@ class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy {
|
|||
|
||||
bool removeEntry(const int terminalId, const int targetTerminalId);
|
||||
|
||||
bool updateAllBigramEntriesAndDeleteUselessEntries(const int terminalId,
|
||||
int *const outBigramCount);
|
||||
|
||||
int getBigramEntryConut(const int terminalId);
|
||||
|
||||
private:
|
||||
DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4BigramListPolicy);
|
||||
|
||||
|
|
|
@ -148,6 +148,14 @@ class PtNodeParams {
|
|||
return PatriciaTrieReadingUtils::isNotAWord(mFlags);
|
||||
}
|
||||
|
||||
// Returns the result of PatriciaTrieReadingUtils::hasBigrams() for this node's flags (mFlags).
AK_FORCE_INLINE bool hasBigrams() const {
|
||||
return PatriciaTrieReadingUtils::hasBigrams(mFlags);
|
||||
}
|
||||
|
||||
// Returns the result of PatriciaTrieReadingUtils::hasShortcutTargets() for this node's flags
// (mFlags).
AK_FORCE_INLINE bool hasShortcutTargets() const {
|
||||
return PatriciaTrieReadingUtils::hasShortcutTargets(mFlags);
|
||||
}
|
||||
|
||||
// Parent node position
|
||||
AK_FORCE_INLINE int getParentPos() const {
|
||||
return mParentPos;
|
||||
|
|
|
@ -67,16 +67,13 @@ bool DynamicPatriciaTrieGcEventListeners
|
|||
|
||||
bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateBigramProbability
|
||||
::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
|
||||
if (!ptNodeParams->isDeleted()) {
|
||||
int pos = ptNodeParams->getBigramsPos();
|
||||
if (pos != NOT_A_DICT_POS) {
|
||||
int bigramEntryCount = 0;
|
||||
if (!mPtNodeWriter->updateAllBigramEntriesAndDeleteUselessEntries(ptNodeParams,
|
||||
&bigramEntryCount)) {
|
||||
return false;
|
||||
}
|
||||
mValidBigramEntryCount += bigramEntryCount;
|
||||
if (!ptNodeParams->isDeleted() && ptNodeParams->hasBigrams()) {
|
||||
int bigramEntryCount = 0;
|
||||
if (!mPtNodeWriter->updateAllBigramEntriesAndDeleteUselessEntries(ptNodeParams,
|
||||
&bigramEntryCount)) {
|
||||
return false;
|
||||
}
|
||||
mValidBigramEntryCount += bigramEntryCount;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -258,8 +258,7 @@ const PtNodeParams DynamicPatriciaTrieUpdatingHelper::getUpdatedPtNodeParams(
|
|||
const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
|
||||
originalPtNodeParams->isBlacklisted(), originalPtNodeParams->isNotAWord(),
|
||||
probability != NOT_A_PROBABILITY /* isTerminal */,
|
||||
originalPtNodeParams->getShortcutPos() != NOT_A_DICT_POS /* hasShortcutTargets */,
|
||||
originalPtNodeParams->getBigramsPos() != NOT_A_DICT_POS /* hasBigrams */,
|
||||
originalPtNodeParams->hasShortcutTargets(), originalPtNodeParams->hasBigrams(),
|
||||
codePointCount > 1 /* hasMultipleChars */, CHILDREN_POSITION_FIELD_SIZE);
|
||||
return PtNodeParams(originalPtNodeParams, flags, parentPos, codePointCount, codePoints,
|
||||
probability);
|
||||
|
|
|
@ -59,7 +59,7 @@ bool BigramDictContent::copyBigramList(const int bigramListPos, const int toPos)
|
|||
bool hasNext = true;
|
||||
int readingPos = bigramListPos;
|
||||
int writingPos = toPos;
|
||||
while(hasNext) {
|
||||
while (hasNext) {
|
||||
int probability = NOT_A_PROBABILITY;
|
||||
int targetTerminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
|
||||
getBigramEntryAndAdvancePosition(&probability, &hasNext, &targetTerminalId,
|
||||
|
|
|
@ -41,25 +41,29 @@ class ProbabilityDictContent : public SingleDictContent {
|
|||
}
|
||||
|
||||
bool setProbability(const int terminalId, const int probability) {
|
||||
if (terminalId < 0 || terminalId > getSize()) {
|
||||
if (terminalId < 0) {
|
||||
return false;
|
||||
}
|
||||
if (terminalId == getSize()) {
|
||||
if (terminalId >= getSize()) {
|
||||
// Write new entry.
|
||||
int flagWritingPos = terminalId * (Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE
|
||||
+ Ver4DictConstants::PROBABILITY_SIZE);
|
||||
const int dummyFlags = 0;
|
||||
// Write dummy flags.
|
||||
if (!getWritableBuffer()->writeUintAndAdvancePosition(dummyFlags,
|
||||
Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, &flagWritingPos)) {
|
||||
return false;
|
||||
int writingPos = getBuffer()->getTailPosition();
|
||||
while (writingPos <= getEntryPos(terminalId)) {
|
||||
const int dummyFlags = 0;
|
||||
if (!getWritableBuffer()->writeUintAndAdvancePosition(dummyFlags,
|
||||
Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, &writingPos)) {
|
||||
return false;
|
||||
}
|
||||
const int dummyProbability = 0;
|
||||
if (!getWritableBuffer()->writeUintAndAdvancePosition(dummyProbability,
|
||||
Ver4DictConstants::PROBABILITY_SIZE, &writingPos)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
int probabilityWritingPos = terminalId * (Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE
|
||||
+ Ver4DictConstants::PROBABILITY_SIZE)
|
||||
+ Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE;
|
||||
return getWritableBuffer()->writeUintAndAdvancePosition(probability,
|
||||
Ver4DictConstants::PROBABILITY_SIZE, &probabilityWritingPos);
|
||||
const int probabilityWritingPos = getEntryPos(terminalId)
|
||||
+ Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE;
|
||||
return getWritableBuffer()->writeUint(probability,
|
||||
Ver4DictConstants::PROBABILITY_SIZE, probabilityWritingPos);
|
||||
}
|
||||
|
||||
bool flushToFile(const char *const dictDirPath) const {
|
||||
|
@ -69,6 +73,11 @@ class ProbabilityDictContent : public SingleDictContent {
|
|||
private:
|
||||
DISALLOW_COPY_AND_ASSIGN(ProbabilityDictContent);
|
||||
|
||||
// Returns the buffer position at which the entry for terminalId starts. Each entry spans
// FLAGS_IN_PROBABILITY_FILE_SIZE + PROBABILITY_SIZE positions, so the start is terminalId
// times that entry size.
int getEntryPos(const int terminalId) const {
|
||||
return terminalId * (Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE
|
||||
+ Ver4DictConstants::PROBABILITY_SIZE);
|
||||
}
|
||||
|
||||
int getSize() const {
|
||||
return getBuffer()->getTailPosition() / (Ver4DictConstants::PROBABILITY_SIZE
|
||||
+ Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE);
|
||||
|
|
|
@ -18,6 +18,22 @@
|
|||
|
||||
namespace latinime {
|
||||
|
||||
bool SparseTableDictContent::copyContent(
|
||||
const SparseTableDictContent *const sparseTableDictContent) {
|
||||
if (!mExpandableLookupTableBuffer.copy(
|
||||
&sparseTableDictContent->mExpandableLookupTableBuffer)) {
|
||||
return false;
|
||||
}
|
||||
if (!mExpandableAddressTableBuffer.copy(
|
||||
&sparseTableDictContent->mExpandableAddressTableBuffer)) {
|
||||
return false;
|
||||
}
|
||||
if (!mExpandableContentBuffer.copy(&sparseTableDictContent->mExpandableContentBuffer)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool SparseTableDictContent::flush(const char *const dictDirPath,
|
||||
const char *const lookupTableFileName, const char *const addressTableFileName,
|
||||
const char *const contentFileName) const {
|
||||
|
|
|
@ -75,6 +75,8 @@ class SparseTableDictContent : public DictContent {
|
|||
|| mExpandableContentBuffer.isNearSizeLimit();
|
||||
}
|
||||
|
||||
bool copyContent(const SparseTableDictContent *const sparseTableDictContent);
|
||||
|
||||
protected:
|
||||
SparseTable *getUpdatableAddressLookupTable() {
|
||||
return &mAddressLookupTable;
|
||||
|
|
|
@ -44,23 +44,27 @@ class TerminalPositionLookupTable : public SingleDictContent {
|
|||
if (terminalId < 0 || terminalId >= mSize) {
|
||||
return NOT_A_DICT_POS;
|
||||
}
|
||||
const int readingPos = terminalId * Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE;
|
||||
return getBuffer()->readUint(Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE,
|
||||
readingPos) - mHeaderRegionSize;
|
||||
getEntryPos(terminalId)) - mHeaderRegionSize;
|
||||
}
|
||||
|
||||
bool setTerminalPtNodePosition(const int terminalId, const int terminalPtNodePos) {
|
||||
if (terminalId < 0 || terminalId > mSize) {
|
||||
if (terminalId < 0) {
|
||||
return NOT_A_DICT_POS;
|
||||
}
|
||||
if (terminalId == mSize) {
|
||||
// Use new terminal id.
|
||||
mSize += 1;
|
||||
if (terminalId >= mSize) {
|
||||
int writingPos = getBuffer()->getTailPosition();
|
||||
while(writingPos <= getEntryPos(terminalId)) {
|
||||
// Write new entry.
|
||||
getWritableBuffer()->writeUintAndAdvancePosition(
|
||||
Ver4DictConstants::NOT_A_TERMINAL_ADDRESS,
|
||||
Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, &writingPos);
|
||||
}
|
||||
mSize = getBuffer()->getTailPosition()
|
||||
/ Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE;
|
||||
}
|
||||
int writingPos = terminalId * Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE;
|
||||
return getWritableBuffer()->writeUintAndAdvancePosition(
|
||||
terminalPtNodePos + mHeaderRegionSize,
|
||||
Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, &writingPos);
|
||||
return getWritableBuffer()->writeUint(terminalPtNodePos + mHeaderRegionSize,
|
||||
Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(terminalId));
|
||||
}
|
||||
|
||||
int getNextTerminalId() const {
|
||||
|
@ -94,6 +98,10 @@ class TerminalPositionLookupTable : public SingleDictContent {
|
|||
private:
|
||||
DISALLOW_COPY_AND_ASSIGN(TerminalPositionLookupTable);
|
||||
|
||||
// Returns the buffer position at which the table entry for terminalId starts, i.e. terminalId
// times TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE.
int getEntryPos(const int terminalId) const {
|
||||
return terminalId * Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE;
|
||||
}
|
||||
|
||||
int mSize;
|
||||
const int mHeaderRegionSize;
|
||||
};
|
||||
|
|
|
@ -93,6 +93,10 @@ class Ver4DictBuffers {
|
|||
return &mBigramDictContent;
|
||||
}
|
||||
|
||||
// Returns a non-const pointer to the shortcut content member (mShortcutDictContent) so that
// callers can modify it.
AK_FORCE_INLINE ShortcutDictContent *getUpdatableShortcutDictContent() {
|
||||
return &mShortcutDictContent;
|
||||
}
|
||||
|
||||
AK_FORCE_INLINE const ShortcutDictContent *getShortcutDictContent() const {
|
||||
return &mShortcutDictContent;
|
||||
}
|
||||
|
|
|
@ -41,6 +41,7 @@ const int Ver4DictConstants::NOT_A_TERMINAL_ID = -1;
|
|||
const int Ver4DictConstants::PROBABILITY_SIZE = 1;
|
||||
const int Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE = 1;
|
||||
const int Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3;
|
||||
const int Ver4DictConstants::NOT_A_TERMINAL_ADDRESS = 0;
|
||||
const int Ver4DictConstants::TERMINAL_ID_FIELD_SIZE = 4;
|
||||
|
||||
const int Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4;
|
||||
|
|
|
@ -41,6 +41,7 @@ class Ver4DictConstants {
|
|||
static const int PROBABILITY_SIZE;
|
||||
static const int FLAGS_IN_PROBABILITY_FILE_SIZE;
|
||||
static const int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE;
|
||||
static const int NOT_A_TERMINAL_ADDRESS;
|
||||
static const int TERMINAL_ID_FIELD_SIZE;
|
||||
|
||||
static const int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE;
|
||||
|
|
|
@ -45,8 +45,17 @@ bool Ver4PatriciaTrieNodeWriter::markPtNodeAsDeleted(
|
|||
true /* isDeleted */);
|
||||
int writingPos = toBeUpdatedPtNodeParams->getHeadPos();
|
||||
// Update flags.
|
||||
return DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
|
||||
&writingPos);
|
||||
if (!DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
|
||||
&writingPos)) {
|
||||
return false;
|
||||
}
|
||||
if (toBeUpdatedPtNodeParams->getTerminalId() != NOT_A_DICT_POS) {
|
||||
// The PtNode is a terminal. Delete entry from the terminal position lookup table.
|
||||
return mBuffers->getUpdatableTerminalPositionLookupTable()->setTerminalPtNodePosition(
|
||||
toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */);
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
bool Ver4PatriciaTrieNodeWriter::markPtNodeAsMoved(
|
||||
|
@ -171,7 +180,7 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition(
|
|||
PatriciaTrieReadingUtils::NodeFlags nodeFlags =
|
||||
PatriciaTrieReadingUtils::createAndGetFlags(ptNodeParams->isBlacklisted(),
|
||||
ptNodeParams->isNotAWord(), isTerminal,
|
||||
false /* hasShortcutTargets */, false /* hasBigrams */,
|
||||
ptNodeParams->hasShortcutTargets(), ptNodeParams->hasBigrams(),
|
||||
ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */,
|
||||
CHILDREN_POSITION_FIELD_SIZE);
|
||||
int flagsFieldPos = nodePos;
|
||||
|
@ -198,16 +207,49 @@ bool Ver4PatriciaTrieNodeWriter::removeBigramEntry(
|
|||
|
||||
bool Ver4PatriciaTrieNodeWriter::updateAllBigramEntriesAndDeleteUselessEntries(
|
||||
const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount) {
|
||||
// TODO: Implement.
|
||||
return false;
|
||||
return mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries(
|
||||
sourcePtNodeParams->getTerminalId(), outBigramEntryCount);
|
||||
}
|
||||
|
||||
bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields(
|
||||
const PtNodeParams *const toBeUpdatedPtNodeParams,
|
||||
const DictPositionRelocationMap *const dictPositionRelocationMap,
|
||||
int *const outBigramEntryCount) {
|
||||
// TODO: Implement.
|
||||
return false;
|
||||
int parentPos = toBeUpdatedPtNodeParams->getParentPos();
|
||||
if (parentPos != NOT_A_DICT_POS) {
|
||||
PtNodeWriter::PtNodePositionRelocationMap::const_iterator it =
|
||||
dictPositionRelocationMap->mPtNodePositionRelocationMap.find(parentPos);
|
||||
if (it != dictPositionRelocationMap->mPtNodePositionRelocationMap.end()) {
|
||||
parentPos = it->second;
|
||||
}
|
||||
}
|
||||
int writingPos = toBeUpdatedPtNodeParams->getHeadPos()
|
||||
+ DynamicPatriciaTrieWritingUtils::NODE_FLAG_FIELD_SIZE;
|
||||
// Write updated parent offset.
|
||||
if (!DynamicPatriciaTrieWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer,
|
||||
parentPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Updates children position.
|
||||
int childrenPos = toBeUpdatedPtNodeParams->getChildrenPos();
|
||||
if (childrenPos != NOT_A_DICT_POS) {
|
||||
PtNodeWriter::PtNodeArrayPositionRelocationMap::const_iterator it =
|
||||
dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.find(childrenPos);
|
||||
if (it != dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.end()) {
|
||||
childrenPos = it->second;
|
||||
}
|
||||
}
|
||||
if (!updateChildrenPosition(toBeUpdatedPtNodeParams, childrenPos)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Counts bigram entries.
|
||||
if (outBigramEntryCount) {
|
||||
*outBigramEntryCount = mBigramPolicy->getBigramEntryConut(
|
||||
toBeUpdatedPtNodeParams->getTerminalId());
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -223,7 +223,15 @@ void Ver4PatriciaTriePolicy::flush(const char *const filePath) {
|
|||
}
|
||||
|
||||
void Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) {
|
||||
// TODO: Implement.
|
||||
if (!mBuffers.get()->isUpdatable()) {
|
||||
AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary.");
|
||||
return;
|
||||
}
|
||||
const bool needsToDecay = mHeaderPolicy.isDecayingDict()
|
||||
&& (mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay(
|
||||
false /* mindsBlockByDecay */, mUnigramCount, mBigramCount, &mHeaderPolicy));
|
||||
mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath, &mHeaderPolicy, needsToDecay);
|
||||
mNeedsToDecayForTesting = false;
|
||||
}
|
||||
|
||||
bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
|
||||
#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h"
|
||||
#include "suggest/policyimpl/dictionary/header/header_policy.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_gc_event_listeners.h"
|
||||
#include "suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
|
||||
|
@ -91,7 +92,78 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
|
|||
|
||||
DynamicPatriciaTrieReadingHelper readingHelper(mBuffers->getTrieBuffer(), &ptNodeReader);
|
||||
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
||||
DynamicPatriciaTrieGcEventListeners
|
||||
::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
|
||||
traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
|
||||
headerPolicy, &ptNodeWriter, mBuffers->getWritableTrieBuffer(),
|
||||
needsToDecay);
|
||||
if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
|
||||
&traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) {
|
||||
return false;
|
||||
}
|
||||
if (needsToDecay && traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
|
||||
.getValidUnigramCount() > ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC) {
|
||||
// TODO: Remove more unigrams.
|
||||
}
|
||||
|
||||
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
||||
DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateBigramProbability
|
||||
traversePolicyToUpdateBigramProbability(&ptNodeWriter);
|
||||
if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
|
||||
&traversePolicyToUpdateBigramProbability)) {
|
||||
return false;
|
||||
}
|
||||
if (needsToDecay && traversePolicyToUpdateBigramProbability.getValidBigramEntryCount()
|
||||
> ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC) {
|
||||
// TODO: Remove more bigrams.
|
||||
}
|
||||
|
||||
// Mapping from positions in mBuffer to positions in bufferToWrite.
|
||||
PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap;
|
||||
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
||||
Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(),
|
||||
buffersToWrite, &ptNodeReader, &bigramPolicy, &shortcutPolicy);
|
||||
DynamicPatriciaTrieGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
|
||||
traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers,
|
||||
buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap);
|
||||
if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
|
||||
&traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Create policy instances for the GCed dictionary.
|
||||
Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(),
|
||||
buffersToWrite->getProbabilityDictContent());
|
||||
Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getUpdatableBigramDictContent(),
|
||||
buffersToWrite->getTerminalPositionLookupTable());
|
||||
Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getShortcutDictContent(),
|
||||
buffersToWrite->getTerminalPositionLookupTable());
|
||||
Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(),
|
||||
buffersToWrite, &newPtNodeReader, &newBigramPolicy, &newShortcutPolicy);
|
||||
|
||||
if(!buffersToWrite->getUpdatableBigramDictContent()->copyContent(
|
||||
mBuffers->getBigramDictContent())) {
|
||||
return false;
|
||||
}
|
||||
if(!buffersToWrite->getUpdatableShortcutDictContent()->copyContent(
|
||||
mBuffers->getShortcutDictContent())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
DynamicPatriciaTrieReadingHelper newDictReadingHelper(buffersToWrite->getTrieBuffer(),
|
||||
&newPtNodeReader);
|
||||
newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
||||
DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateAllPositionFields
|
||||
traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap);
|
||||
if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
|
||||
&traversePolicyToUpdateAllPositionFields)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// TODO: GC for dict contents.
|
||||
|
||||
*outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount();
|
||||
*outBigramCount = traversePolicyToUpdateAllPositionFields.getBigramCount();
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -131,4 +131,21 @@ bool BufferWithExtendableBuffer::checkAndPrepareWriting(const int pos, const int
|
|||
return true;
|
||||
}
|
||||
|
||||
bool BufferWithExtendableBuffer::copy(const BufferWithExtendableBuffer *const sourceBuffer) {
|
||||
int copyingPos = 0;
|
||||
const int tailPos = sourceBuffer->getTailPosition();
|
||||
const int maxDataChunkSize = sizeof(uint32_t);
|
||||
while (copyingPos < tailPos) {
|
||||
const int remainingSize = tailPos - copyingPos;
|
||||
const int copyingSize = (remainingSize >= maxDataChunkSize) ?
|
||||
maxDataChunkSize : remainingSize;
|
||||
const uint32_t data = sourceBuffer->readUint(copyingSize, copyingPos);
|
||||
if (!writeUint(data, copyingSize, copyingPos)) {
|
||||
return false;
|
||||
}
|
||||
copyingPos += copyingSize;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -100,6 +100,8 @@ class BufferWithExtendableBuffer {
|
|||
bool writeCodePointsAndAdvancePosition(const int *const codePoints, const int codePointCount,
|
||||
const bool writesTerminator, int *const pos);
|
||||
|
||||
bool copy(const BufferWithExtendableBuffer *const sourceBuffer);
|
||||
|
||||
private:
|
||||
DISALLOW_COPY_AND_ASSIGN(BufferWithExtendableBuffer);
|
||||
|
||||
|
|
|
@ -297,4 +297,46 @@ public class Ver4BinaryDictionaryTests extends AndroidTestCase {
|
|||
binaryDictionary.close();
|
||||
}
|
||||
|
||||
public void testFlushWithGCDictionary() {
|
||||
final String dictVersion = Long.toString(System.currentTimeMillis());
|
||||
File trieFile = null;
|
||||
try {
|
||||
trieFile = createEmptyDictionaryAndGetTrieFile(dictVersion);
|
||||
} catch (IOException e) {
|
||||
fail("IOException while writing an initial dictionary : " + e);
|
||||
}
|
||||
BinaryDictionary binaryDictionary = new BinaryDictionary(trieFile.getAbsolutePath(),
|
||||
0 /* offset */, trieFile.length(), true /* useFullEditDistance */,
|
||||
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
|
||||
|
||||
final int unigramProbability = 100;
|
||||
final int bigramProbability = 10;
|
||||
binaryDictionary.addUnigramWord("aaa", unigramProbability);
|
||||
binaryDictionary.addUnigramWord("abb", unigramProbability);
|
||||
binaryDictionary.addUnigramWord("bcc", unigramProbability);
|
||||
binaryDictionary.addBigramWords("aaa", "abb", bigramProbability);
|
||||
binaryDictionary.addBigramWords("aaa", "bcc", bigramProbability);
|
||||
binaryDictionary.addBigramWords("abb", "aaa", bigramProbability);
|
||||
binaryDictionary.addBigramWords("abb", "bcc", bigramProbability);
|
||||
binaryDictionary.flushWithGC();
|
||||
binaryDictionary.close();
|
||||
|
||||
binaryDictionary = new BinaryDictionary(trieFile.getAbsolutePath(),
|
||||
0 /* offset */, trieFile.length(), true /* useFullEditDistance */,
|
||||
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
|
||||
final int probability = binaryDictionary.calculateProbability(unigramProbability,
|
||||
bigramProbability);
|
||||
assertEquals(unigramProbability, binaryDictionary.getFrequency("aaa"));
|
||||
assertEquals(unigramProbability, binaryDictionary.getFrequency("abb"));
|
||||
assertEquals(unigramProbability, binaryDictionary.getFrequency("bcc"));
|
||||
assertEquals(probability, binaryDictionary.getBigramProbability("aaa", "abb"));
|
||||
assertEquals(probability, binaryDictionary.getBigramProbability("aaa", "bcc"));
|
||||
assertEquals(probability, binaryDictionary.getBigramProbability("abb", "aaa"));
|
||||
assertEquals(probability, binaryDictionary.getBigramProbability("abb", "bcc"));
|
||||
assertEquals(false, binaryDictionary.isValidBigram("bcc", "aaa"));
|
||||
assertEquals(false, binaryDictionary.isValidBigram("bcc", "bbc"));
|
||||
assertEquals(false, binaryDictionary.isValidBigram("aaa", "aaa"));
|
||||
binaryDictionary.flushWithGC();
|
||||
binaryDictionary.close();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue