Merge "Implement ver4 dictionary GC."
This commit is contained in:
commit
d541d282a4
19 changed files with 329 additions and 46 deletions
|
@ -94,7 +94,7 @@ bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTarget
|
||||||
bool Ver4BigramListPolicy::removeEntry(const int terminalId, const int targetTerminalId) {
|
bool Ver4BigramListPolicy::removeEntry(const int terminalId, const int targetTerminalId) {
|
||||||
const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId);
|
const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId);
|
||||||
if (bigramListPos == NOT_A_DICT_POS) {
|
if (bigramListPos == NOT_A_DICT_POS) {
|
||||||
// Bigram list does't exist.
|
// Bigram list doesn't exist.
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
const int entryPosToUpdate = getEntryPosToUpdate(targetTerminalId, bigramListPos);
|
const int entryPosToUpdate = getEntryPosToUpdate(targetTerminalId, bigramListPos);
|
||||||
|
@ -118,12 +118,62 @@ bool Ver4BigramListPolicy::removeEntry(const int terminalId, const int targetTer
|
||||||
Ver4DictConstants::NOT_A_TERMINAL_ID /* targetTerminalId */, &writingPos);
|
Ver4DictConstants::NOT_A_TERMINAL_ID /* targetTerminalId */, &writingPos);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool Ver4BigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(const int terminalId,
|
||||||
|
int *const outBigramCount) {
|
||||||
|
const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId);
|
||||||
|
if (bigramListPos == NOT_A_DICT_POS) {
|
||||||
|
// Bigram list doesn't exist.
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
bool hasNext = true;
|
||||||
|
int readingPos = bigramListPos;
|
||||||
|
while (hasNext) {
|
||||||
|
const int entryPos = readingPos;
|
||||||
|
int probability = NOT_A_PROBABILITY;
|
||||||
|
int targetTerminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
|
||||||
|
mBigramDictContent->getBigramEntryAndAdvancePosition(&probability, &hasNext,
|
||||||
|
&targetTerminalId, &readingPos);
|
||||||
|
if (targetTerminalId == Ver4DictConstants::NOT_A_TERMINAL_ID) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const int targetPtNodePos = mTerminalPositionLookupTable->getTerminalPtNodePosition(
|
||||||
|
targetTerminalId);
|
||||||
|
if (targetPtNodePos == NOT_A_DICT_POS) {
|
||||||
|
// Invalidate bigram entry.
|
||||||
|
int writingPos = entryPos;
|
||||||
|
return mBigramDictContent->writeBigramEntryAndAdvancePosition(probability, hasNext,
|
||||||
|
Ver4DictConstants::NOT_A_TERMINAL_ID /* targetTerminalId */, &writingPos);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
int Ver4BigramListPolicy::getBigramEntryConut(const int terminalId) {
|
||||||
|
const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId);
|
||||||
|
if (bigramListPos == NOT_A_DICT_POS) {
|
||||||
|
// Bigram list doesn't exist.
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
int bigramCount = 0;
|
||||||
|
bool hasNext = true;
|
||||||
|
int readingPos = bigramListPos;
|
||||||
|
while (hasNext) {
|
||||||
|
int targetTerminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
|
||||||
|
mBigramDictContent->getBigramEntryAndAdvancePosition(0 /* probability */, &hasNext,
|
||||||
|
&targetTerminalId, &readingPos);
|
||||||
|
if (targetTerminalId != Ver4DictConstants::NOT_A_TERMINAL_ID) {
|
||||||
|
bigramCount++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return bigramCount;
|
||||||
|
}
|
||||||
|
|
||||||
int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind,
|
int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind,
|
||||||
const int bigramListPos) const {
|
const int bigramListPos) const {
|
||||||
bool hasNext = true;
|
bool hasNext = true;
|
||||||
int invalidEntryPos = NOT_A_DICT_POS;
|
int invalidEntryPos = NOT_A_DICT_POS;
|
||||||
int readingPos = bigramListPos;
|
int readingPos = bigramListPos;
|
||||||
while(hasNext) {
|
while (hasNext) {
|
||||||
const int entryPos = readingPos;
|
const int entryPos = readingPos;
|
||||||
int targetTerminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
|
int targetTerminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
|
||||||
mBigramDictContent->getBigramEntryAndAdvancePosition(0 /* probability */, &hasNext,
|
mBigramDictContent->getBigramEntryAndAdvancePosition(0 /* probability */, &hasNext,
|
||||||
|
|
|
@ -44,6 +44,11 @@ class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy {
|
||||||
|
|
||||||
bool removeEntry(const int terminalId, const int targetTerminalId);
|
bool removeEntry(const int terminalId, const int targetTerminalId);
|
||||||
|
|
||||||
|
bool updateAllBigramEntriesAndDeleteUselessEntries(const int terminalId,
|
||||||
|
int *const outBigramCount);
|
||||||
|
|
||||||
|
int getBigramEntryConut(const int terminalId);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4BigramListPolicy);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4BigramListPolicy);
|
||||||
|
|
||||||
|
|
|
@ -148,6 +148,14 @@ class PtNodeParams {
|
||||||
return PatriciaTrieReadingUtils::isNotAWord(mFlags);
|
return PatriciaTrieReadingUtils::isNotAWord(mFlags);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns what PatriciaTrieReadingUtils::hasBigrams() reports for this node's flags (mFlags).
AK_FORCE_INLINE bool hasBigrams() const {
|
||||||
|
return PatriciaTrieReadingUtils::hasBigrams(mFlags);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns what PatriciaTrieReadingUtils::hasShortcutTargets() reports for this node's flags
// (mFlags).
AK_FORCE_INLINE bool hasShortcutTargets() const {
|
||||||
|
return PatriciaTrieReadingUtils::hasShortcutTargets(mFlags);
|
||||||
|
}
|
||||||
|
|
||||||
// Parent node position
|
// Parent node position
|
||||||
AK_FORCE_INLINE int getParentPos() const {
|
AK_FORCE_INLINE int getParentPos() const {
|
||||||
return mParentPos;
|
return mParentPos;
|
||||||
|
|
|
@ -67,16 +67,13 @@ bool DynamicPatriciaTrieGcEventListeners
|
||||||
|
|
||||||
bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateBigramProbability
|
bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateBigramProbability
|
||||||
::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
|
::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
|
||||||
if (!ptNodeParams->isDeleted()) {
|
if (!ptNodeParams->isDeleted() && ptNodeParams->hasBigrams()) {
|
||||||
int pos = ptNodeParams->getBigramsPos();
|
int bigramEntryCount = 0;
|
||||||
if (pos != NOT_A_DICT_POS) {
|
if (!mPtNodeWriter->updateAllBigramEntriesAndDeleteUselessEntries(ptNodeParams,
|
||||||
int bigramEntryCount = 0;
|
&bigramEntryCount)) {
|
||||||
if (!mPtNodeWriter->updateAllBigramEntriesAndDeleteUselessEntries(ptNodeParams,
|
return false;
|
||||||
&bigramEntryCount)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
mValidBigramEntryCount += bigramEntryCount;
|
|
||||||
}
|
}
|
||||||
|
mValidBigramEntryCount += bigramEntryCount;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
|
@ -258,8 +258,7 @@ const PtNodeParams DynamicPatriciaTrieUpdatingHelper::getUpdatedPtNodeParams(
|
||||||
const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
|
const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
|
||||||
originalPtNodeParams->isBlacklisted(), originalPtNodeParams->isNotAWord(),
|
originalPtNodeParams->isBlacklisted(), originalPtNodeParams->isNotAWord(),
|
||||||
probability != NOT_A_PROBABILITY /* isTerminal */,
|
probability != NOT_A_PROBABILITY /* isTerminal */,
|
||||||
originalPtNodeParams->getShortcutPos() != NOT_A_DICT_POS /* hasShortcutTargets */,
|
originalPtNodeParams->hasShortcutTargets(), originalPtNodeParams->hasBigrams(),
|
||||||
originalPtNodeParams->getBigramsPos() != NOT_A_DICT_POS /* hasBigrams */,
|
|
||||||
codePointCount > 1 /* hasMultipleChars */, CHILDREN_POSITION_FIELD_SIZE);
|
codePointCount > 1 /* hasMultipleChars */, CHILDREN_POSITION_FIELD_SIZE);
|
||||||
return PtNodeParams(originalPtNodeParams, flags, parentPos, codePointCount, codePoints,
|
return PtNodeParams(originalPtNodeParams, flags, parentPos, codePointCount, codePoints,
|
||||||
probability);
|
probability);
|
||||||
|
|
|
@ -59,7 +59,7 @@ bool BigramDictContent::copyBigramList(const int bigramListPos, const int toPos)
|
||||||
bool hasNext = true;
|
bool hasNext = true;
|
||||||
int readingPos = bigramListPos;
|
int readingPos = bigramListPos;
|
||||||
int writingPos = toPos;
|
int writingPos = toPos;
|
||||||
while(hasNext) {
|
while (hasNext) {
|
||||||
int probability = NOT_A_PROBABILITY;
|
int probability = NOT_A_PROBABILITY;
|
||||||
int targetTerminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
|
int targetTerminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
|
||||||
getBigramEntryAndAdvancePosition(&probability, &hasNext, &targetTerminalId,
|
getBigramEntryAndAdvancePosition(&probability, &hasNext, &targetTerminalId,
|
||||||
|
|
|
@ -41,25 +41,29 @@ class ProbabilityDictContent : public SingleDictContent {
|
||||||
}
|
}
|
||||||
|
|
||||||
bool setProbability(const int terminalId, const int probability) {
|
bool setProbability(const int terminalId, const int probability) {
|
||||||
if (terminalId < 0 || terminalId > getSize()) {
|
if (terminalId < 0) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (terminalId == getSize()) {
|
if (terminalId >= getSize()) {
|
||||||
// Write new entry.
|
// Write new entry.
|
||||||
int flagWritingPos = terminalId * (Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE
|
int writingPos = getBuffer()->getTailPosition();
|
||||||
+ Ver4DictConstants::PROBABILITY_SIZE);
|
while (writingPos <= getEntryPos(terminalId)) {
|
||||||
const int dummyFlags = 0;
|
const int dummyFlags = 0;
|
||||||
// Write dummy flags.
|
if (!getWritableBuffer()->writeUintAndAdvancePosition(dummyFlags,
|
||||||
if (!getWritableBuffer()->writeUintAndAdvancePosition(dummyFlags,
|
Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, &writingPos)) {
|
||||||
Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, &flagWritingPos)) {
|
return false;
|
||||||
return false;
|
}
|
||||||
|
const int dummyProbability = 0;
|
||||||
|
if (!getWritableBuffer()->writeUintAndAdvancePosition(dummyProbability,
|
||||||
|
Ver4DictConstants::PROBABILITY_SIZE, &writingPos)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
int probabilityWritingPos = terminalId * (Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE
|
const int probabilityWritingPos = getEntryPos(terminalId)
|
||||||
+ Ver4DictConstants::PROBABILITY_SIZE)
|
+ Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE;
|
||||||
+ Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE;
|
return getWritableBuffer()->writeUint(probability,
|
||||||
return getWritableBuffer()->writeUintAndAdvancePosition(probability,
|
Ver4DictConstants::PROBABILITY_SIZE, probabilityWritingPos);
|
||||||
Ver4DictConstants::PROBABILITY_SIZE, &probabilityWritingPos);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool flushToFile(const char *const dictDirPath) const {
|
bool flushToFile(const char *const dictDirPath) const {
|
||||||
|
@ -69,6 +73,11 @@ class ProbabilityDictContent : public SingleDictContent {
|
||||||
private:
|
private:
|
||||||
DISALLOW_COPY_AND_ASSIGN(ProbabilityDictContent);
|
DISALLOW_COPY_AND_ASSIGN(ProbabilityDictContent);
|
||||||
|
|
||||||
|
int getEntryPos(const int terminalId) const {
|
||||||
|
return terminalId * (Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE
|
||||||
|
+ Ver4DictConstants::PROBABILITY_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
int getSize() const {
|
int getSize() const {
|
||||||
return getBuffer()->getTailPosition() / (Ver4DictConstants::PROBABILITY_SIZE
|
return getBuffer()->getTailPosition() / (Ver4DictConstants::PROBABILITY_SIZE
|
||||||
+ Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE);
|
+ Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE);
|
||||||
|
|
|
@ -18,6 +18,22 @@
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
|
bool SparseTableDictContent::copyContent(
|
||||||
|
const SparseTableDictContent *const sparseTableDictContent) {
|
||||||
|
if (!mExpandableLookupTableBuffer.copy(
|
||||||
|
&sparseTableDictContent->mExpandableLookupTableBuffer)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (!mExpandableAddressTableBuffer.copy(
|
||||||
|
&sparseTableDictContent->mExpandableAddressTableBuffer)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (!mExpandableContentBuffer.copy(&sparseTableDictContent->mExpandableContentBuffer)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
bool SparseTableDictContent::flush(const char *const dictDirPath,
|
bool SparseTableDictContent::flush(const char *const dictDirPath,
|
||||||
const char *const lookupTableFileName, const char *const addressTableFileName,
|
const char *const lookupTableFileName, const char *const addressTableFileName,
|
||||||
const char *const contentFileName) const {
|
const char *const contentFileName) const {
|
||||||
|
|
|
@ -75,6 +75,8 @@ class SparseTableDictContent : public DictContent {
|
||||||
|| mExpandableContentBuffer.isNearSizeLimit();
|
|| mExpandableContentBuffer.isNearSizeLimit();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool copyContent(const SparseTableDictContent *const sparseTableDictContent);
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
SparseTable *getUpdatableAddressLookupTable() {
|
SparseTable *getUpdatableAddressLookupTable() {
|
||||||
return &mAddressLookupTable;
|
return &mAddressLookupTable;
|
||||||
|
|
|
@ -44,23 +44,27 @@ class TerminalPositionLookupTable : public SingleDictContent {
|
||||||
if (terminalId < 0 || terminalId >= mSize) {
|
if (terminalId < 0 || terminalId >= mSize) {
|
||||||
return NOT_A_DICT_POS;
|
return NOT_A_DICT_POS;
|
||||||
}
|
}
|
||||||
const int readingPos = terminalId * Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE;
|
|
||||||
return getBuffer()->readUint(Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE,
|
return getBuffer()->readUint(Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE,
|
||||||
readingPos) - mHeaderRegionSize;
|
getEntryPos(terminalId)) - mHeaderRegionSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool setTerminalPtNodePosition(const int terminalId, const int terminalPtNodePos) {
|
bool setTerminalPtNodePosition(const int terminalId, const int terminalPtNodePos) {
|
||||||
if (terminalId < 0 || terminalId > mSize) {
|
if (terminalId < 0) {
|
||||||
return NOT_A_DICT_POS;
|
return NOT_A_DICT_POS;
|
||||||
}
|
}
|
||||||
if (terminalId == mSize) {
|
if (terminalId >= mSize) {
|
||||||
// Use new terminal id.
|
int writingPos = getBuffer()->getTailPosition();
|
||||||
mSize += 1;
|
while(writingPos <= getEntryPos(terminalId)) {
|
||||||
|
// Write new entry.
|
||||||
|
getWritableBuffer()->writeUintAndAdvancePosition(
|
||||||
|
Ver4DictConstants::NOT_A_TERMINAL_ADDRESS,
|
||||||
|
Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, &writingPos);
|
||||||
|
}
|
||||||
|
mSize = getBuffer()->getTailPosition()
|
||||||
|
/ Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE;
|
||||||
}
|
}
|
||||||
int writingPos = terminalId * Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE;
|
return getWritableBuffer()->writeUint(terminalPtNodePos + mHeaderRegionSize,
|
||||||
return getWritableBuffer()->writeUintAndAdvancePosition(
|
Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, getEntryPos(terminalId));
|
||||||
terminalPtNodePos + mHeaderRegionSize,
|
|
||||||
Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE, &writingPos);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int getNextTerminalId() const {
|
int getNextTerminalId() const {
|
||||||
|
@ -94,6 +98,10 @@ class TerminalPositionLookupTable : public SingleDictContent {
|
||||||
private:
|
private:
|
||||||
DISALLOW_COPY_AND_ASSIGN(TerminalPositionLookupTable);
|
DISALLOW_COPY_AND_ASSIGN(TerminalPositionLookupTable);
|
||||||
|
|
||||||
|
int getEntryPos(const int terminalId) const {
|
||||||
|
return terminalId * Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
int mSize;
|
int mSize;
|
||||||
const int mHeaderRegionSize;
|
const int mHeaderRegionSize;
|
||||||
};
|
};
|
||||||
|
|
|
@ -93,6 +93,10 @@ class Ver4DictBuffers {
|
||||||
return &mBigramDictContent;
|
return &mBigramDictContent;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns a non-const pointer to the shortcut dict content member (mShortcutDictContent).
AK_FORCE_INLINE ShortcutDictContent *getUpdatableShortcutDictContent() {
|
||||||
|
return &mShortcutDictContent;
|
||||||
|
}
|
||||||
|
|
||||||
AK_FORCE_INLINE const ShortcutDictContent *getShortcutDictContent() const {
|
AK_FORCE_INLINE const ShortcutDictContent *getShortcutDictContent() const {
|
||||||
return &mShortcutDictContent;
|
return &mShortcutDictContent;
|
||||||
}
|
}
|
||||||
|
|
|
@ -41,6 +41,7 @@ const int Ver4DictConstants::NOT_A_TERMINAL_ID = -1;
|
||||||
const int Ver4DictConstants::PROBABILITY_SIZE = 1;
|
const int Ver4DictConstants::PROBABILITY_SIZE = 1;
|
||||||
const int Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE = 1;
|
const int Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE = 1;
|
||||||
const int Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3;
|
const int Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3;
|
||||||
|
const int Ver4DictConstants::NOT_A_TERMINAL_ADDRESS = 0;
|
||||||
const int Ver4DictConstants::TERMINAL_ID_FIELD_SIZE = 4;
|
const int Ver4DictConstants::TERMINAL_ID_FIELD_SIZE = 4;
|
||||||
|
|
||||||
const int Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4;
|
const int Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 4;
|
||||||
|
|
|
@ -41,6 +41,7 @@ class Ver4DictConstants {
|
||||||
static const int PROBABILITY_SIZE;
|
static const int PROBABILITY_SIZE;
|
||||||
static const int FLAGS_IN_PROBABILITY_FILE_SIZE;
|
static const int FLAGS_IN_PROBABILITY_FILE_SIZE;
|
||||||
static const int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE;
|
static const int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE;
|
||||||
|
static const int NOT_A_TERMINAL_ADDRESS;
|
||||||
static const int TERMINAL_ID_FIELD_SIZE;
|
static const int TERMINAL_ID_FIELD_SIZE;
|
||||||
|
|
||||||
static const int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE;
|
static const int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE;
|
||||||
|
|
|
@ -45,8 +45,17 @@ bool Ver4PatriciaTrieNodeWriter::markPtNodeAsDeleted(
|
||||||
true /* isDeleted */);
|
true /* isDeleted */);
|
||||||
int writingPos = toBeUpdatedPtNodeParams->getHeadPos();
|
int writingPos = toBeUpdatedPtNodeParams->getHeadPos();
|
||||||
// Update flags.
|
// Update flags.
|
||||||
return DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
|
if (!DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
|
||||||
&writingPos);
|
&writingPos)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (toBeUpdatedPtNodeParams->getTerminalId() != NOT_A_DICT_POS) {
|
||||||
|
// The PtNode is a terminal. Delete entry from the terminal position lookup table.
|
||||||
|
return mBuffers->getUpdatableTerminalPositionLookupTable()->setTerminalPtNodePosition(
|
||||||
|
toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */);
|
||||||
|
} else {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Ver4PatriciaTrieNodeWriter::markPtNodeAsMoved(
|
bool Ver4PatriciaTrieNodeWriter::markPtNodeAsMoved(
|
||||||
|
@ -171,7 +180,7 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition(
|
||||||
PatriciaTrieReadingUtils::NodeFlags nodeFlags =
|
PatriciaTrieReadingUtils::NodeFlags nodeFlags =
|
||||||
PatriciaTrieReadingUtils::createAndGetFlags(ptNodeParams->isBlacklisted(),
|
PatriciaTrieReadingUtils::createAndGetFlags(ptNodeParams->isBlacklisted(),
|
||||||
ptNodeParams->isNotAWord(), isTerminal,
|
ptNodeParams->isNotAWord(), isTerminal,
|
||||||
false /* hasShortcutTargets */, false /* hasBigrams */,
|
ptNodeParams->hasShortcutTargets(), ptNodeParams->hasBigrams(),
|
||||||
ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */,
|
ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */,
|
||||||
CHILDREN_POSITION_FIELD_SIZE);
|
CHILDREN_POSITION_FIELD_SIZE);
|
||||||
int flagsFieldPos = nodePos;
|
int flagsFieldPos = nodePos;
|
||||||
|
@ -198,16 +207,49 @@ bool Ver4PatriciaTrieNodeWriter::removeBigramEntry(
|
||||||
|
|
||||||
bool Ver4PatriciaTrieNodeWriter::updateAllBigramEntriesAndDeleteUselessEntries(
|
bool Ver4PatriciaTrieNodeWriter::updateAllBigramEntriesAndDeleteUselessEntries(
|
||||||
const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount) {
|
const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount) {
|
||||||
// TODO: Implement.
|
return mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries(
|
||||||
return false;
|
sourcePtNodeParams->getTerminalId(), outBigramEntryCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields(
|
bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields(
|
||||||
const PtNodeParams *const toBeUpdatedPtNodeParams,
|
const PtNodeParams *const toBeUpdatedPtNodeParams,
|
||||||
const DictPositionRelocationMap *const dictPositionRelocationMap,
|
const DictPositionRelocationMap *const dictPositionRelocationMap,
|
||||||
int *const outBigramEntryCount) {
|
int *const outBigramEntryCount) {
|
||||||
// TODO: Implement.
|
int parentPos = toBeUpdatedPtNodeParams->getParentPos();
|
||||||
return false;
|
if (parentPos != NOT_A_DICT_POS) {
|
||||||
|
PtNodeWriter::PtNodePositionRelocationMap::const_iterator it =
|
||||||
|
dictPositionRelocationMap->mPtNodePositionRelocationMap.find(parentPos);
|
||||||
|
if (it != dictPositionRelocationMap->mPtNodePositionRelocationMap.end()) {
|
||||||
|
parentPos = it->second;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int writingPos = toBeUpdatedPtNodeParams->getHeadPos()
|
||||||
|
+ DynamicPatriciaTrieWritingUtils::NODE_FLAG_FIELD_SIZE;
|
||||||
|
// Write updated parent offset.
|
||||||
|
if (!DynamicPatriciaTrieWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer,
|
||||||
|
parentPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Updates children position.
|
||||||
|
int childrenPos = toBeUpdatedPtNodeParams->getChildrenPos();
|
||||||
|
if (childrenPos != NOT_A_DICT_POS) {
|
||||||
|
PtNodeWriter::PtNodeArrayPositionRelocationMap::const_iterator it =
|
||||||
|
dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.find(childrenPos);
|
||||||
|
if (it != dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.end()) {
|
||||||
|
childrenPos = it->second;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!updateChildrenPosition(toBeUpdatedPtNodeParams, childrenPos)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Counts bigram entries.
|
||||||
|
if (outBigramEntryCount) {
|
||||||
|
*outBigramEntryCount = mBigramPolicy->getBigramEntryConut(
|
||||||
|
toBeUpdatedPtNodeParams->getTerminalId());
|
||||||
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -223,7 +223,15 @@ void Ver4PatriciaTriePolicy::flush(const char *const filePath) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) {
|
void Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) {
|
||||||
// TODO: Implement.
|
if (!mBuffers.get()->isUpdatable()) {
|
||||||
|
AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary.");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const bool needsToDecay = mHeaderPolicy.isDecayingDict()
|
||||||
|
&& (mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay(
|
||||||
|
false /* mindsBlockByDecay */, mUnigramCount, mBigramCount, &mHeaderPolicy));
|
||||||
|
mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath, &mHeaderPolicy, needsToDecay);
|
||||||
|
mNeedsToDecayForTesting = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
|
bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
|
||||||
|
|
|
@ -20,6 +20,7 @@
|
||||||
|
|
||||||
#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h"
|
#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h"
|
||||||
#include "suggest/policyimpl/dictionary/header/header_policy.h"
|
#include "suggest/policyimpl/dictionary/header/header_policy.h"
|
||||||
|
#include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_gc_event_listeners.h"
|
||||||
#include "suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h"
|
#include "suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h"
|
||||||
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h"
|
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h"
|
||||||
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
|
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
|
||||||
|
@ -91,7 +92,78 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
|
||||||
|
|
||||||
DynamicPatriciaTrieReadingHelper readingHelper(mBuffers->getTrieBuffer(), &ptNodeReader);
|
DynamicPatriciaTrieReadingHelper readingHelper(mBuffers->getTrieBuffer(), &ptNodeReader);
|
||||||
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
||||||
|
DynamicPatriciaTrieGcEventListeners
|
||||||
|
::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
|
||||||
|
traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
|
||||||
|
headerPolicy, &ptNodeWriter, mBuffers->getWritableTrieBuffer(),
|
||||||
|
needsToDecay);
|
||||||
|
if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
|
||||||
|
&traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (needsToDecay && traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
|
||||||
|
.getValidUnigramCount() > ForgettingCurveUtils::MAX_UNIGRAM_COUNT_AFTER_GC) {
|
||||||
|
// TODO: Remove more unigrams.
|
||||||
|
}
|
||||||
|
|
||||||
|
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
||||||
|
DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateBigramProbability
|
||||||
|
traversePolicyToUpdateBigramProbability(&ptNodeWriter);
|
||||||
|
if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
|
||||||
|
&traversePolicyToUpdateBigramProbability)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (needsToDecay && traversePolicyToUpdateBigramProbability.getValidBigramEntryCount()
|
||||||
|
> ForgettingCurveUtils::MAX_BIGRAM_COUNT_AFTER_GC) {
|
||||||
|
// TODO: Remove more bigrams.
|
||||||
|
}
|
||||||
|
|
||||||
|
// Mapping from positions in mBuffer to positions in bufferToWrite.
|
||||||
|
PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap;
|
||||||
|
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
||||||
|
Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(),
|
||||||
|
buffersToWrite, &ptNodeReader, &bigramPolicy, &shortcutPolicy);
|
||||||
|
DynamicPatriciaTrieGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
|
||||||
|
traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers,
|
||||||
|
buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap);
|
||||||
|
if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
|
||||||
|
&traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create policy instances for the GCed dictionary.
|
||||||
|
Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(),
|
||||||
|
buffersToWrite->getProbabilityDictContent());
|
||||||
|
Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getUpdatableBigramDictContent(),
|
||||||
|
buffersToWrite->getTerminalPositionLookupTable());
|
||||||
|
Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getShortcutDictContent(),
|
||||||
|
buffersToWrite->getTerminalPositionLookupTable());
|
||||||
|
Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(),
|
||||||
|
buffersToWrite, &newPtNodeReader, &newBigramPolicy, &newShortcutPolicy);
|
||||||
|
|
||||||
|
if(!buffersToWrite->getUpdatableBigramDictContent()->copyContent(
|
||||||
|
mBuffers->getBigramDictContent())) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if(!buffersToWrite->getUpdatableShortcutDictContent()->copyContent(
|
||||||
|
mBuffers->getShortcutDictContent())) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
DynamicPatriciaTrieReadingHelper newDictReadingHelper(buffersToWrite->getTrieBuffer(),
|
||||||
|
&newPtNodeReader);
|
||||||
|
newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
||||||
|
DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateAllPositionFields
|
||||||
|
traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap);
|
||||||
|
if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
|
||||||
|
&traversePolicyToUpdateAllPositionFields)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: GC for dict contents.
|
||||||
|
|
||||||
|
*outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount();
|
||||||
|
*outBigramCount = traversePolicyToUpdateAllPositionFields.getBigramCount();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -131,4 +131,21 @@ bool BufferWithExtendableBuffer::checkAndPrepareWriting(const int pos, const int
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool BufferWithExtendableBuffer::copy(const BufferWithExtendableBuffer *const sourceBuffer) {
|
||||||
|
int copyingPos = 0;
|
||||||
|
const int tailPos = sourceBuffer->getTailPosition();
|
||||||
|
const int maxDataChunkSize = sizeof(uint32_t);
|
||||||
|
while (copyingPos < tailPos) {
|
||||||
|
const int remainingSize = tailPos - copyingPos;
|
||||||
|
const int copyingSize = (remainingSize >= maxDataChunkSize) ?
|
||||||
|
maxDataChunkSize : remainingSize;
|
||||||
|
const uint32_t data = sourceBuffer->readUint(copyingSize, copyingPos);
|
||||||
|
if (!writeUint(data, copyingSize, copyingPos)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
copyingPos += copyingSize;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -100,6 +100,8 @@ class BufferWithExtendableBuffer {
|
||||||
bool writeCodePointsAndAdvancePosition(const int *const codePoints, const int codePointCount,
|
bool writeCodePointsAndAdvancePosition(const int *const codePoints, const int codePointCount,
|
||||||
const bool writesTerminator, int *const pos);
|
const bool writesTerminator, int *const pos);
|
||||||
|
|
||||||
|
bool copy(const BufferWithExtendableBuffer *const sourceBuffer);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_COPY_AND_ASSIGN(BufferWithExtendableBuffer);
|
DISALLOW_COPY_AND_ASSIGN(BufferWithExtendableBuffer);
|
||||||
|
|
||||||
|
|
|
@ -297,4 +297,46 @@ public class Ver4BinaryDictionaryTests extends AndroidTestCase {
|
||||||
binaryDictionary.close();
|
binaryDictionary.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testFlushWithGCDictionary() {
|
||||||
|
final String dictVersion = Long.toString(System.currentTimeMillis());
|
||||||
|
File trieFile = null;
|
||||||
|
try {
|
||||||
|
trieFile = createEmptyDictionaryAndGetTrieFile(dictVersion);
|
||||||
|
} catch (IOException e) {
|
||||||
|
fail("IOException while writing an initial dictionary : " + e);
|
||||||
|
}
|
||||||
|
BinaryDictionary binaryDictionary = new BinaryDictionary(trieFile.getAbsolutePath(),
|
||||||
|
0 /* offset */, trieFile.length(), true /* useFullEditDistance */,
|
||||||
|
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
|
||||||
|
|
||||||
|
final int unigramProbability = 100;
|
||||||
|
final int bigramProbability = 10;
|
||||||
|
binaryDictionary.addUnigramWord("aaa", unigramProbability);
|
||||||
|
binaryDictionary.addUnigramWord("abb", unigramProbability);
|
||||||
|
binaryDictionary.addUnigramWord("bcc", unigramProbability);
|
||||||
|
binaryDictionary.addBigramWords("aaa", "abb", bigramProbability);
|
||||||
|
binaryDictionary.addBigramWords("aaa", "bcc", bigramProbability);
|
||||||
|
binaryDictionary.addBigramWords("abb", "aaa", bigramProbability);
|
||||||
|
binaryDictionary.addBigramWords("abb", "bcc", bigramProbability);
|
||||||
|
binaryDictionary.flushWithGC();
|
||||||
|
binaryDictionary.close();
|
||||||
|
|
||||||
|
binaryDictionary = new BinaryDictionary(trieFile.getAbsolutePath(),
|
||||||
|
0 /* offset */, trieFile.length(), true /* useFullEditDistance */,
|
||||||
|
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
|
||||||
|
final int probability = binaryDictionary.calculateProbability(unigramProbability,
|
||||||
|
bigramProbability);
|
||||||
|
assertEquals(unigramProbability, binaryDictionary.getFrequency("aaa"));
|
||||||
|
assertEquals(unigramProbability, binaryDictionary.getFrequency("abb"));
|
||||||
|
assertEquals(unigramProbability, binaryDictionary.getFrequency("bcc"));
|
||||||
|
assertEquals(probability, binaryDictionary.getBigramProbability("aaa", "abb"));
|
||||||
|
assertEquals(probability, binaryDictionary.getBigramProbability("aaa", "bcc"));
|
||||||
|
assertEquals(probability, binaryDictionary.getBigramProbability("abb", "aaa"));
|
||||||
|
assertEquals(probability, binaryDictionary.getBigramProbability("abb", "bcc"));
|
||||||
|
assertEquals(false, binaryDictionary.isValidBigram("bcc", "aaa"));
|
||||||
|
assertEquals(false, binaryDictionary.isValidBigram("bcc", "bbc"));
|
||||||
|
assertEquals(false, binaryDictionary.isValidBigram("aaa", "aaa"));
|
||||||
|
binaryDictionary.flushWithGC();
|
||||||
|
binaryDictionary.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue