Implement GC for ver4 dict contents.

Bug: 11073222
Change-Id: Ia4dff07c7f4576897e0d3fc4840aed0ccb535299
main
Keisuke Kuroyanagi 2013-11-25 11:59:43 +09:00
parent 4095177c4f
commit 065a427f5d
11 changed files with 210 additions and 41 deletions

View File

@ -72,4 +72,66 @@ bool BigramDictContent::copyBigramList(const int bigramListPos, const int toPos)
return true;
}
bool BigramDictContent::runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
const BigramDictContent *const originalBigramDictContent,
int *const outBigramEntryCount) {
for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin();
it != terminalIdMap->end(); ++it) {
const int originalBigramListPos =
originalBigramDictContent->getBigramListHeadPos(it->first);
if (originalBigramListPos == NOT_A_DICT_POS) {
// This terminal does not have a bigram list.
continue;
}
const int bigramListPos = getContentBuffer()->getTailPosition();
int bigramEntryCount = 0;
// Copy bigram list with GC from original content.
if (!runGCBigramList(originalBigramListPos, originalBigramDictContent, bigramListPos,
terminalIdMap, &bigramEntryCount)) {
return false;
}
if (bigramEntryCount == 0) {
// All bigram entries are useless. This terminal does not have a bigram list.
continue;
}
*outBigramEntryCount += bigramEntryCount;
// Set bigram list position to the lookup table.
if (!getUpdatableAddressLookupTable()->set(it->second, bigramListPos)) {
return false;
}
}
return true;
}
bool BigramDictContent::runGCBigramList(const int bigramListPos,
const BigramDictContent *const sourceBigramDictContent, const int toPos,
const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
int *const outEntrycount) {
bool hasNext = true;
int readingPos = bigramListPos;
int writingPos = toPos;
while (hasNext) {
int probability = NOT_A_PROBABILITY;
int targetTerminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
sourceBigramDictContent->getBigramEntryAndAdvancePosition(&probability, &hasNext,
&targetTerminalId, &readingPos);
if (targetTerminalId == Ver4DictConstants::NOT_A_TERMINAL_ID) {
continue;
}
TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
terminalIdMap->find(targetTerminalId);
if (it == terminalIdMap->end()) {
AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd",
targetTerminalId, terminalIdMap->size());
return false;
}
if (!writeBigramEntryAndAdvancePosition(probability, hasNext, it->second,
&writingPos)) {
return false;
}
*outEntrycount += 1;
}
return true;
}
} // namespace latinime

View File

@ -19,6 +19,7 @@
#include "defines.h"
#include "suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h"
#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h"
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
namespace latinime {
@ -65,6 +66,10 @@ class BigramDictContent : public SparseTableDictContent {
Ver4DictConstants::BIGRAM_FILE_EXTENSION);
}
bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
const BigramDictContent *const originalBigramDictContent,
int *const outBigramEntryCount);
private:
DISALLOW_COPY_AND_ASSIGN(BigramDictContent);
@ -72,6 +77,11 @@ class BigramDictContent : public SparseTableDictContent {
return (probability & Ver4DictConstants::BIGRAM_PROBABILITY_MASK)
| (hasNext ? Ver4DictConstants::BIGRAM_HAS_NEXT_MASK : 0);
}
bool runGCBigramList(const int bigramListPos,
const BigramDictContent *const sourceBigramDictContent, const int toPos,
const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
int *const outEntryCount);
};
} // namespace latinime
#endif /* LATINIME_BIGRAM_DICT_CONTENT_H */

View File

@ -19,6 +19,7 @@
#include "defines.h"
#include "suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h"
#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h"
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
@ -28,13 +29,14 @@ namespace latinime {
class ProbabilityDictContent : public SingleDictContent {
public:
ProbabilityDictContent(const char *const dictDirPath, const bool isUpdatable)
: SingleDictContent(dictDirPath, Ver4DictConstants::FREQ_FILE_EXTENSION,
isUpdatable) {}
: SingleDictContent(dictDirPath, Ver4DictConstants::FREQ_FILE_EXTENSION, isUpdatable),
mSize(getBuffer()->getTailPosition() / (Ver4DictConstants::PROBABILITY_SIZE
+ Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE)) {}
ProbabilityDictContent() {}
ProbabilityDictContent() : mSize(0) {}
int getProbability(const int terminalId) const {
if (terminalId < 0 || terminalId >= getSize()) {
if (terminalId < 0 || terminalId >= mSize) {
return NOT_A_PROBABILITY;
}
return Ver4PatriciaTrieReadingUtils::getProbability(getBuffer(), terminalId);
@ -44,7 +46,7 @@ class ProbabilityDictContent : public SingleDictContent {
if (terminalId < 0) {
return false;
}
if (terminalId >= getSize()) {
if (terminalId >= mSize) {
// Write new entry.
int writingPos = getBuffer()->getTailPosition();
while (writingPos <= getEntryPos(terminalId)) {
@ -58,6 +60,7 @@ class ProbabilityDictContent : public SingleDictContent {
Ver4DictConstants::PROBABILITY_SIZE, &writingPos)) {
return false;
}
mSize++;
}
}
const int probabilityWritingPos = getEntryPos(terminalId)
@ -67,7 +70,32 @@ class ProbabilityDictContent : public SingleDictContent {
}
bool flushToFile(const char *const dictDirPath) const {
return flush(dictDirPath, Ver4DictConstants::FREQ_FILE_EXTENSION);
if (getEntryPos(mSize) < getBuffer()->getTailPosition()) {
ProbabilityDictContent probabilityDictContentToWrite;
for (int i = 0; i < mSize; ++i) {
if (!probabilityDictContentToWrite.setProbability(i, getProbability(i))) {
return false;
}
}
return probabilityDictContentToWrite.flush(dictDirPath,
Ver4DictConstants::FREQ_FILE_EXTENSION);
} else {
return flush(dictDirPath, Ver4DictConstants::FREQ_FILE_EXTENSION);
}
}
bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
const ProbabilityDictContent *const originalProbabilityDictContent) {
mSize = 0;
for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin();
it != terminalIdMap->end(); ++it) {
if (!setProbability(it->second,
originalProbabilityDictContent->getProbability(it->first))) {
return false;
}
mSize++;
}
return true;
}
private:
@ -78,10 +106,7 @@ class ProbabilityDictContent : public SingleDictContent {
+ Ver4DictConstants::PROBABILITY_SIZE);
}
int getSize() const {
return getBuffer()->getTailPosition() / (Ver4DictConstants::PROBABILITY_SIZE
+ Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE);
}
int mSize;
};
} // namespace latinime
#endif /* LATINIME_PROBABILITY_DICT_CONTENT_H */

View File

@ -48,6 +48,30 @@ bool ShortcutDictContent::flushToFile(const char *const dictDirPath) const {
Ver4DictConstants::SHORTCUT_FILE_EXTENSION);
}
bool ShortcutDictContent::runGC(
const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
const ShortcutDictContent *const originalShortcutDictContent) {
for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin();
it != terminalIdMap->end(); ++it) {
const int originalShortcutListPos =
originalShortcutDictContent->getShortcutListHeadPos(it->first);
if (originalShortcutListPos == NOT_A_DICT_POS) {
continue;
}
const int shortcutListPos = getContentBuffer()->getTailPosition();
// Copy shortcut list with GC from original content.
if (!copyShortcutList(originalShortcutListPos, originalShortcutDictContent,
shortcutListPos)) {
return false;
}
// Set shortcut list position to the lookup table.
if (!getUpdatableAddressLookupTable()->set(it->second, shortcutListPos)) {
return false;
}
}
return true;
}
bool ShortcutDictContent::copyShortcutList(const int shortcutListPos,
const ShortcutDictContent *const sourceShortcutDictContent, const int toPos) {
bool hasNext = true;

View File

@ -47,6 +47,9 @@ class ShortcutDictContent : public SparseTableDictContent {
bool flushToFile(const char *const dictDirPath) const;
bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
const ShortcutDictContent *const originalShortcutDictContent);
private:
DISALLOW_COPY_AND_ASSIGN(ShortcutDictContent);

View File

@ -18,22 +18,6 @@
namespace latinime {
bool SparseTableDictContent::copyContent(
const SparseTableDictContent *const sparseTableDictContent) {
if (!mExpandableLookupTableBuffer.copy(
&sparseTableDictContent->mExpandableLookupTableBuffer)) {
return false;
}
if (!mExpandableAddressTableBuffer.copy(
&sparseTableDictContent->mExpandableAddressTableBuffer)) {
return false;
}
if (!mExpandableContentBuffer.copy(&sparseTableDictContent->mExpandableContentBuffer)) {
return false;
}
return true;
}
bool SparseTableDictContent::flush(const char *const dictDirPath,
const char *const lookupTableFileName, const char *const addressTableFileName,
const char *const contentFileName) const {

View File

@ -75,8 +75,6 @@ class SparseTableDictContent : public DictContent {
|| mExpandableContentBuffer.isNearSizeLimit();
}
bool copyContent(const SparseTableDictContent *const sparseTableDictContent);
protected:
SparseTable *getUpdatableAddressLookupTable() {
return &mAddressLookupTable;

View File

@ -126,6 +126,12 @@ bool Ver4PatriciaTrieNodeWriter::updateChildrenPosition(
newChildrenPosition, &childrenPosFieldPos);
}
bool Ver4PatriciaTrieNodeWriter::updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams,
const int newTerminalId) {
return mTrieBuffer->writeUint(newTerminalId, Ver4DictConstants::TERMINAL_ID_FIELD_SIZE,
toBeUpdatedPtNodeParams->getTerminalIdFieldPos());
}
bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition(
const PtNodeParams *const ptNodeParams, int *const ptNodeWritingPos) {
const int nodePos = *ptNodeWritingPos;

View File

@ -57,6 +57,9 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
virtual bool updateChildrenPosition(const PtNodeParams *const toBeUpdatedPtNodeParams,
const int newChildrenPosition);
bool updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams,
const int newTerminalId);
virtual bool writePtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams,
int *const ptNodeWritingPos);

View File

@ -20,7 +20,6 @@
#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h"
#include "suggest/policyimpl/dictionary/header/header_policy.h"
#include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_gc_event_listeners.h"
#include "suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h"
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h"
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
@ -141,15 +140,6 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(),
buffersToWrite, &newPtNodeReader, &newBigramPolicy, &newShortcutPolicy);
if(!buffersToWrite->getUpdatableBigramDictContent()->copyContent(
mBuffers->getBigramDictContent())) {
return false;
}
if(!buffersToWrite->getUpdatableShortcutDictContent()->copyContent(
mBuffers->getShortcutDictContent())) {
return false;
}
DynamicPatriciaTrieReadingHelper newDictReadingHelper(buffersToWrite->getTrieBuffer(),
&newPtNodeReader);
newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
@ -160,11 +150,50 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
return false;
}
// TODO: GC for dict contents.
// Re-assign terminal IDs for valid terminal PtNodes.
TerminalPositionLookupTable::TerminalIdMap terminalIdMap;
if(!buffersToWrite->getUpdatableTerminalPositionLookupTable()->runGCTerminalIds(
&terminalIdMap)) {
return false;
}
TraversePolicyToUpdateAllTerminalIds traversePolicyToUpdateAllTerminalIds(&newPtNodeWriter,
&terminalIdMap);
if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
&traversePolicyToUpdateAllTerminalIds)) {
return false;
}
// Run GC for probability dict content.
if (!buffersToWrite->getUpdatableProbabilityDictContent()->runGC(&terminalIdMap,
mBuffers->getProbabilityDictContent())) {
return false;
}
// Run GC for bigram dict content.
if(!buffersToWrite->getUpdatableBigramDictContent()->runGC(&terminalIdMap,
mBuffers->getBigramDictContent(), outBigramCount)) {
return false;
}
// Run GC for shortcut dict content.
if(!buffersToWrite->getUpdatableShortcutDictContent()->runGC(&terminalIdMap,
mBuffers->getShortcutDictContent())) {
return false;
}
*outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount();
*outBigramCount = traversePolicyToUpdateAllPositionFields.getBigramCount();
return true;
}
bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllTerminalIds::onVisitingPtNode(
const PtNodeParams *const ptNodeParams) {
if (!ptNodeParams->isTerminal()) {
return true;
}
TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
mTerminalIdMap->find(ptNodeParams->getTerminalId());
if (it == mTerminalIdMap->end()) {
AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd",
ptNodeParams->getTerminalId(), mTerminalIdMap->size());
return false;
}
return mPtNodeWriter->updateTerminalId(ptNodeParams, it->second);
}
} // namespace latinime

View File

@ -18,11 +18,14 @@
#define LATINIME_VER4_PATRICIA_TRIE_WRITING_HELPER_H
#include "defines.h"
#include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_gc_event_listeners.h"
#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h"
namespace latinime {
class HeaderPolicy;
class Ver4DictBuffers;
class Ver4PatriciaTrieNodeWriter;
class Ver4PatriciaTrieWritingHelper {
public:
@ -39,6 +42,28 @@ class Ver4PatriciaTrieWritingHelper {
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTrieWritingHelper);
class TraversePolicyToUpdateAllTerminalIds
: public DynamicPatriciaTrieReadingHelper::TraversingEventListener {
public:
TraversePolicyToUpdateAllTerminalIds(Ver4PatriciaTrieNodeWriter *const ptNodeWriter,
const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap)
: mPtNodeWriter(ptNodeWriter), mTerminalIdMap(terminalIdMap) {};
bool onAscend() { return true; }
bool onDescend(const int ptNodeArrayPos) { return true; }
bool onReadingPtNodeArrayTail() { return true; }
bool onVisitingPtNode(const PtNodeParams *const ptNodeParams);
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllTerminalIds);
Ver4PatriciaTrieNodeWriter *const mPtNodeWriter;
const TerminalPositionLookupTable::TerminalIdMap *const mTerminalIdMap;
};
bool runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy,
Ver4DictBuffers *const buffersToWrite, int *const outUnigramCount,
int *const outBigramCount, const bool needsToDecay);