Merge "Implement GC for ver4 dict contents."
This commit is contained in:
commit
0e4a113c64
11 changed files with 210 additions and 41 deletions
|
@ -72,4 +72,66 @@ bool BigramDictContent::copyBigramList(const int bigramListPos, const int toPos)
|
|||
return true;
|
||||
}
|
||||
|
||||
// Rebuilds this bigram content from originalBigramDictContent.
//
// For every (first, second) pair in terminalIdMap, the bigram list that the original content
// holds for terminal id `first` is copied (with GC) to the tail of this content's buffer and
// registered in this content's address lookup table under terminal id `second`.
// The number of copied bigram entries is added to *outBigramEntryCount (the value is
// accumulated, not reset). Returns false as soon as copying or table registration fails.
bool BigramDictContent::runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
        const BigramDictContent *const originalBigramDictContent,
        int *const outBigramEntryCount) {
    typedef TerminalPositionLookupTable::TerminalIdMap::const_iterator TerminalIdIterator;
    for (TerminalIdIterator entry = terminalIdMap->begin(); entry != terminalIdMap->end();
            ++entry) {
        const int sourceListPos =
                originalBigramDictContent->getBigramListHeadPos(entry->first);
        if (sourceListPos == NOT_A_DICT_POS) {
            // Nothing to copy: the original content has no bigram list for this terminal.
            continue;
        }
        // The new list is appended at the current tail of this content's buffer.
        const int destinationListPos = getContentBuffer()->getTailPosition();
        int copiedEntryCount = 0;
        const bool isCopied = runGCBigramList(sourceListPos, originalBigramDictContent,
                destinationListPos, terminalIdMap, &copiedEntryCount);
        if (!isCopied) {
            return false;
        }
        if (copiedEntryCount == 0) {
            // Every entry was dropped, so no list is registered for this terminal.
            continue;
        }
        *outBigramEntryCount += copiedEntryCount;
        // Register the head position of the copied list in the lookup table.
        const bool isRegistered =
                getUpdatableAddressLookupTable()->set(entry->second, destinationListPos);
        if (!isRegistered) {
            return false;
        }
    }
    return true;
}
|
||||
|
||||
bool BigramDictContent::runGCBigramList(const int bigramListPos,
|
||||
const BigramDictContent *const sourceBigramDictContent, const int toPos,
|
||||
const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
|
||||
int *const outEntrycount) {
|
||||
bool hasNext = true;
|
||||
int readingPos = bigramListPos;
|
||||
int writingPos = toPos;
|
||||
while (hasNext) {
|
||||
int probability = NOT_A_PROBABILITY;
|
||||
int targetTerminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
|
||||
sourceBigramDictContent->getBigramEntryAndAdvancePosition(&probability, &hasNext,
|
||||
&targetTerminalId, &readingPos);
|
||||
if (targetTerminalId == Ver4DictConstants::NOT_A_TERMINAL_ID) {
|
||||
continue;
|
||||
}
|
||||
TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
|
||||
terminalIdMap->find(targetTerminalId);
|
||||
if (it == terminalIdMap->end()) {
|
||||
AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd",
|
||||
targetTerminalId, terminalIdMap->size());
|
||||
return false;
|
||||
}
|
||||
if (!writeBigramEntryAndAdvancePosition(probability, hasNext, it->second,
|
||||
&writingPos)) {
|
||||
return false;
|
||||
}
|
||||
*outEntrycount += 1;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace latinime
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
|
||||
#include "defines.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
|
||||
|
||||
namespace latinime {
|
||||
|
@ -65,6 +66,10 @@ class BigramDictContent : public SparseTableDictContent {
|
|||
Ver4DictConstants::BIGRAM_FILE_EXTENSION);
|
||||
}
|
||||
|
||||
bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
|
||||
const BigramDictContent *const originalBigramDictContent,
|
||||
int *const outBigramEntryCount);
|
||||
|
||||
private:
|
||||
DISALLOW_COPY_AND_ASSIGN(BigramDictContent);
|
||||
|
||||
|
@ -72,6 +77,11 @@ class BigramDictContent : public SparseTableDictContent {
|
|||
return (probability & Ver4DictConstants::BIGRAM_PROBABILITY_MASK)
|
||||
| (hasNext ? Ver4DictConstants::BIGRAM_HAS_NEXT_MASK : 0);
|
||||
}
|
||||
|
||||
bool runGCBigramList(const int bigramListPos,
|
||||
const BigramDictContent *const sourceBigramDictContent, const int toPos,
|
||||
const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
|
||||
int *const outEntryCount);
|
||||
};
|
||||
} // namespace latinime
|
||||
#endif /* LATINIME_BIGRAM_DICT_CONTENT_H */
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
|
||||
#include "defines.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h"
|
||||
#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
|
||||
|
@ -28,13 +29,14 @@ namespace latinime {
|
|||
class ProbabilityDictContent : public SingleDictContent {
|
||||
public:
|
||||
ProbabilityDictContent(const char *const dictDirPath, const bool isUpdatable)
|
||||
: SingleDictContent(dictDirPath, Ver4DictConstants::FREQ_FILE_EXTENSION,
|
||||
isUpdatable) {}
|
||||
: SingleDictContent(dictDirPath, Ver4DictConstants::FREQ_FILE_EXTENSION, isUpdatable),
|
||||
mSize(getBuffer()->getTailPosition() / (Ver4DictConstants::PROBABILITY_SIZE
|
||||
+ Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE)) {}
|
||||
|
||||
ProbabilityDictContent() {}
|
||||
ProbabilityDictContent() : mSize(0) {}
|
||||
|
||||
int getProbability(const int terminalId) const {
|
||||
if (terminalId < 0 || terminalId >= getSize()) {
|
||||
if (terminalId < 0 || terminalId >= mSize) {
|
||||
return NOT_A_PROBABILITY;
|
||||
}
|
||||
return Ver4PatriciaTrieReadingUtils::getProbability(getBuffer(), terminalId);
|
||||
|
@ -44,7 +46,7 @@ class ProbabilityDictContent : public SingleDictContent {
|
|||
if (terminalId < 0) {
|
||||
return false;
|
||||
}
|
||||
if (terminalId >= getSize()) {
|
||||
if (terminalId >= mSize) {
|
||||
// Write new entry.
|
||||
int writingPos = getBuffer()->getTailPosition();
|
||||
while (writingPos <= getEntryPos(terminalId)) {
|
||||
|
@ -58,6 +60,7 @@ class ProbabilityDictContent : public SingleDictContent {
|
|||
Ver4DictConstants::PROBABILITY_SIZE, &writingPos)) {
|
||||
return false;
|
||||
}
|
||||
mSize++;
|
||||
}
|
||||
}
|
||||
const int probabilityWritingPos = getEntryPos(terminalId)
|
||||
|
@ -67,7 +70,32 @@ class ProbabilityDictContent : public SingleDictContent {
|
|||
}
|
||||
|
||||
bool flushToFile(const char *const dictDirPath) const {
|
||||
return flush(dictDirPath, Ver4DictConstants::FREQ_FILE_EXTENSION);
|
||||
if (getEntryPos(mSize) < getBuffer()->getTailPosition()) {
|
||||
ProbabilityDictContent probabilityDictContentToWrite;
|
||||
for (int i = 0; i < mSize; ++i) {
|
||||
if (!probabilityDictContentToWrite.setProbability(i, getProbability(i))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return probabilityDictContentToWrite.flush(dictDirPath,
|
||||
Ver4DictConstants::FREQ_FILE_EXTENSION);
|
||||
} else {
|
||||
return flush(dictDirPath, Ver4DictConstants::FREQ_FILE_EXTENSION);
|
||||
}
|
||||
}
|
||||
|
||||
// Rebuilds this probability content from originalProbabilityDictContent.
//
// For every (first, second) pair in terminalIdMap, the probability stored for terminal id
// `first` in the original content is written to this content under terminal id `second`.
// Returns false as soon as a probability cannot be written.
bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
        const ProbabilityDictContent *const originalProbabilityDictContent) {
    mSize = 0;
    for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin();
            it != terminalIdMap->end(); ++it) {
        // setProbability() already increments mSize for every entry it appends, so mSize
        // must not be incremented again here. Counting twice makes mSize larger than the
        // number of stored entries, which lets a later call skip the append path and write
        // beyond the tail of the buffer.
        if (!setProbability(it->second,
                originalProbabilityDictContent->getProbability(it->first))) {
            return false;
        }
    }
    return true;
}
|
||||
|
||||
private:
|
||||
|
@ -78,10 +106,7 @@ class ProbabilityDictContent : public SingleDictContent {
|
|||
+ Ver4DictConstants::PROBABILITY_SIZE);
|
||||
}
|
||||
|
||||
int getSize() const {
|
||||
return getBuffer()->getTailPosition() / (Ver4DictConstants::PROBABILITY_SIZE
|
||||
+ Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE);
|
||||
}
|
||||
int mSize;
|
||||
};
|
||||
} // namespace latinime
|
||||
#endif /* LATINIME_PROBABILITY_DICT_CONTENT_H */
|
||||
|
|
|
@ -48,6 +48,30 @@ bool ShortcutDictContent::flushToFile(const char *const dictDirPath) const {
|
|||
Ver4DictConstants::SHORTCUT_FILE_EXTENSION);
|
||||
}
|
||||
|
||||
bool ShortcutDictContent::runGC(
|
||||
const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
|
||||
const ShortcutDictContent *const originalShortcutDictContent) {
|
||||
for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin();
|
||||
it != terminalIdMap->end(); ++it) {
|
||||
const int originalShortcutListPos =
|
||||
originalShortcutDictContent->getShortcutListHeadPos(it->first);
|
||||
if (originalShortcutListPos == NOT_A_DICT_POS) {
|
||||
continue;
|
||||
}
|
||||
const int shortcutListPos = getContentBuffer()->getTailPosition();
|
||||
// Copy shortcut list with GC from original content.
|
||||
if (!copyShortcutList(originalShortcutListPos, originalShortcutDictContent,
|
||||
shortcutListPos)) {
|
||||
return false;
|
||||
}
|
||||
// Set shortcut list position to the lookup table.
|
||||
if (!getUpdatableAddressLookupTable()->set(it->second, shortcutListPos)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ShortcutDictContent::copyShortcutList(const int shortcutListPos,
|
||||
const ShortcutDictContent *const sourceShortcutDictContent, const int toPos) {
|
||||
bool hasNext = true;
|
||||
|
|
|
@ -47,6 +47,9 @@ class ShortcutDictContent : public SparseTableDictContent {
|
|||
|
||||
bool flushToFile(const char *const dictDirPath) const;
|
||||
|
||||
bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
|
||||
const ShortcutDictContent *const originalShortcutDictContent);
|
||||
|
||||
private:
|
||||
DISALLOW_COPY_AND_ASSIGN(ShortcutDictContent);
|
||||
|
||||
|
|
|
@ -18,22 +18,6 @@
|
|||
|
||||
namespace latinime {
|
||||
|
||||
bool SparseTableDictContent::copyContent(
|
||||
const SparseTableDictContent *const sparseTableDictContent) {
|
||||
if (!mExpandableLookupTableBuffer.copy(
|
||||
&sparseTableDictContent->mExpandableLookupTableBuffer)) {
|
||||
return false;
|
||||
}
|
||||
if (!mExpandableAddressTableBuffer.copy(
|
||||
&sparseTableDictContent->mExpandableAddressTableBuffer)) {
|
||||
return false;
|
||||
}
|
||||
if (!mExpandableContentBuffer.copy(&sparseTableDictContent->mExpandableContentBuffer)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool SparseTableDictContent::flush(const char *const dictDirPath,
|
||||
const char *const lookupTableFileName, const char *const addressTableFileName,
|
||||
const char *const contentFileName) const {
|
||||
|
|
|
@ -75,8 +75,6 @@ class SparseTableDictContent : public DictContent {
|
|||
|| mExpandableContentBuffer.isNearSizeLimit();
|
||||
}
|
||||
|
||||
bool copyContent(const SparseTableDictContent *const sparseTableDictContent);
|
||||
|
||||
protected:
|
||||
SparseTable *getUpdatableAddressLookupTable() {
|
||||
return &mAddressLookupTable;
|
||||
|
|
|
@ -126,6 +126,12 @@ bool Ver4PatriciaTrieNodeWriter::updateChildrenPosition(
|
|||
newChildrenPosition, &childrenPosFieldPos);
|
||||
}
|
||||
|
||||
bool Ver4PatriciaTrieNodeWriter::updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams,
|
||||
const int newTerminalId) {
|
||||
return mTrieBuffer->writeUint(newTerminalId, Ver4DictConstants::TERMINAL_ID_FIELD_SIZE,
|
||||
toBeUpdatedPtNodeParams->getTerminalIdFieldPos());
|
||||
}
|
||||
|
||||
bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition(
|
||||
const PtNodeParams *const ptNodeParams, int *const ptNodeWritingPos) {
|
||||
const int nodePos = *ptNodeWritingPos;
|
||||
|
|
|
@ -57,6 +57,9 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
|
|||
virtual bool updateChildrenPosition(const PtNodeParams *const toBeUpdatedPtNodeParams,
|
||||
const int newChildrenPosition);
|
||||
|
||||
bool updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams,
|
||||
const int newTerminalId);
|
||||
|
||||
virtual bool writePtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams,
|
||||
int *const ptNodeWritingPos);
|
||||
|
||||
|
|
|
@ -20,7 +20,6 @@
|
|||
|
||||
#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h"
|
||||
#include "suggest/policyimpl/dictionary/header/header_policy.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_gc_event_listeners.h"
|
||||
#include "suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
|
||||
|
@ -141,15 +140,6 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
|
|||
Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(),
|
||||
buffersToWrite, &newPtNodeReader, &newBigramPolicy, &newShortcutPolicy);
|
||||
|
||||
if(!buffersToWrite->getUpdatableBigramDictContent()->copyContent(
|
||||
mBuffers->getBigramDictContent())) {
|
||||
return false;
|
||||
}
|
||||
if(!buffersToWrite->getUpdatableShortcutDictContent()->copyContent(
|
||||
mBuffers->getShortcutDictContent())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
DynamicPatriciaTrieReadingHelper newDictReadingHelper(buffersToWrite->getTrieBuffer(),
|
||||
&newPtNodeReader);
|
||||
newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
||||
|
@ -160,11 +150,50 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
|
|||
return false;
|
||||
}
|
||||
|
||||
// TODO: GC for dict contents.
|
||||
|
||||
// Re-assign terminal IDs for valid terminal PtNodes.
|
||||
TerminalPositionLookupTable::TerminalIdMap terminalIdMap;
|
||||
if(!buffersToWrite->getUpdatableTerminalPositionLookupTable()->runGCTerminalIds(
|
||||
&terminalIdMap)) {
|
||||
return false;
|
||||
}
|
||||
TraversePolicyToUpdateAllTerminalIds traversePolicyToUpdateAllTerminalIds(&newPtNodeWriter,
|
||||
&terminalIdMap);
|
||||
if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
|
||||
&traversePolicyToUpdateAllTerminalIds)) {
|
||||
return false;
|
||||
}
|
||||
// Run GC for probability dict content.
|
||||
if (!buffersToWrite->getUpdatableProbabilityDictContent()->runGC(&terminalIdMap,
|
||||
mBuffers->getProbabilityDictContent())) {
|
||||
return false;
|
||||
}
|
||||
// Run GC for bigram dict content.
|
||||
if(!buffersToWrite->getUpdatableBigramDictContent()->runGC(&terminalIdMap,
|
||||
mBuffers->getBigramDictContent(), outBigramCount)) {
|
||||
return false;
|
||||
}
|
||||
// Run GC for shortcut dict content.
|
||||
if(!buffersToWrite->getUpdatableShortcutDictContent()->runGC(&terminalIdMap,
|
||||
mBuffers->getShortcutDictContent())) {
|
||||
return false;
|
||||
}
|
||||
*outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount();
|
||||
*outBigramCount = traversePolicyToUpdateAllPositionFields.getBigramCount();
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllTerminalIds::onVisitingPtNode(
|
||||
const PtNodeParams *const ptNodeParams) {
|
||||
if (!ptNodeParams->isTerminal()) {
|
||||
return true;
|
||||
}
|
||||
TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
|
||||
mTerminalIdMap->find(ptNodeParams->getTerminalId());
|
||||
if (it == mTerminalIdMap->end()) {
|
||||
AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd",
|
||||
ptNodeParams->getTerminalId(), mTerminalIdMap->size());
|
||||
return false;
|
||||
}
|
||||
return mPtNodeWriter->updateTerminalId(ptNodeParams, it->second);
|
||||
}
|
||||
|
||||
} // namespace latinime
|
||||
|
|
|
@ -18,11 +18,14 @@
|
|||
#define LATINIME_VER4_PATRICIA_TRIE_WRITING_HELPER_H
|
||||
|
||||
#include "defines.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_gc_event_listeners.h"
|
||||
#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h"
|
||||
|
||||
namespace latinime {
|
||||
|
||||
class HeaderPolicy;
|
||||
class Ver4DictBuffers;
|
||||
class Ver4PatriciaTrieNodeWriter;
|
||||
|
||||
class Ver4PatriciaTrieWritingHelper {
|
||||
public:
|
||||
|
@ -39,6 +42,28 @@ class Ver4PatriciaTrieWritingHelper {
|
|||
private:
|
||||
DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTrieWritingHelper);
|
||||
|
||||
class TraversePolicyToUpdateAllTerminalIds
|
||||
: public DynamicPatriciaTrieReadingHelper::TraversingEventListener {
|
||||
public:
|
||||
TraversePolicyToUpdateAllTerminalIds(Ver4PatriciaTrieNodeWriter *const ptNodeWriter,
|
||||
const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap)
|
||||
: mPtNodeWriter(ptNodeWriter), mTerminalIdMap(terminalIdMap) {};
|
||||
|
||||
bool onAscend() { return true; }
|
||||
|
||||
bool onDescend(const int ptNodeArrayPos) { return true; }
|
||||
|
||||
bool onReadingPtNodeArrayTail() { return true; }
|
||||
|
||||
bool onVisitingPtNode(const PtNodeParams *const ptNodeParams);
|
||||
|
||||
private:
|
||||
DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllTerminalIds);
|
||||
|
||||
Ver4PatriciaTrieNodeWriter *const mPtNodeWriter;
|
||||
const TerminalPositionLookupTable::TerminalIdMap *const mTerminalIdMap;
|
||||
};
|
||||
|
||||
bool runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy,
|
||||
Ver4DictBuffers *const buffersToWrite, int *const outUnigramCount,
|
||||
int *const outBigramCount, const bool needsToDecay);
|
||||
|
|
Loading…
Reference in a new issue