diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp
index c22b9b455..906687647 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp
@@ -72,4 +72,89 @@ bool BigramDictContent::copyBigramList(const int bigramListPos, const int toPos)
     return true;
 }
 
+// Copies the bigram lists of all terminals in terminalIdMap (old terminal id -> new terminal id)
+// from originalBigramDictContent to the tail of this content, dropping useless entries.
+// Adds the number of copied entries to *outBigramEntryCount.
+bool BigramDictContent::runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+        const BigramDictContent *const originalBigramDictContent,
+        int *const outBigramEntryCount) {
+    for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin();
+            it != terminalIdMap->end(); ++it) {
+        const int originalBigramListPos =
+                originalBigramDictContent->getBigramListHeadPos(it->first);
+        if (originalBigramListPos == NOT_A_DICT_POS) {
+            // This terminal does not have a bigram list.
+            continue;
+        }
+        const int bigramListPos = getContentBuffer()->getTailPosition();
+        int bigramEntryCount = 0;
+        // Copy bigram list with GC from original content.
+        if (!runGCBigramList(originalBigramListPos, originalBigramDictContent, bigramListPos,
+                terminalIdMap, &bigramEntryCount)) {
+            return false;
+        }
+        if (bigramEntryCount == 0) {
+            // All bigram entries are useless. This terminal does not have a bigram list.
+            continue;
+        }
+        *outBigramEntryCount += bigramEntryCount;
+        // Set bigram list position to the lookup table.
+        if (!getUpdatableAddressLookupTable()->set(it->second, bigramListPos)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Copies the bigram list at bigramListPos in sourceBigramDictContent to toPos in this content.
+// Entries without a target terminal id are dropped and the remaining target ids are replaced
+// with their values in terminalIdMap. Adds the number of copied entries to *outEntryCount.
+bool BigramDictContent::runGCBigramList(const int bigramListPos,
+        const BigramDictContent *const sourceBigramDictContent, const int toPos,
+        const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+        int *const outEntryCount) {
+    bool hasNext = true;
+    int readingPos = bigramListPos;
+    int writingPos = toPos;
+    // An entry is written only once its successor is known, so that the has-next flag of the
+    // last copied entry is cleared even when the trailing source entries are dropped.
+    bool hasPendingEntry = false;
+    int pendingProbability = NOT_A_PROBABILITY;
+    int pendingTerminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
+    while (hasNext) {
+        int probability = NOT_A_PROBABILITY;
+        int targetTerminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
+        sourceBigramDictContent->getBigramEntryAndAdvancePosition(&probability, &hasNext,
+                &targetTerminalId, &readingPos);
+        if (targetTerminalId == Ver4DictConstants::NOT_A_TERMINAL_ID) {
+            continue;
+        }
+        TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
+                terminalIdMap->find(targetTerminalId);
+        if (it == terminalIdMap->end()) {
+            AKLOGE("terminal Id %d is not in the terminal position map. map size: %zu",
+                    targetTerminalId, terminalIdMap->size());
+            return false;
+        }
+        if (hasPendingEntry) {
+            if (!writeBigramEntryAndAdvancePosition(pendingProbability, true /* hasNext */,
+                    pendingTerminalId, &writingPos)) {
+                return false;
+            }
+            *outEntryCount += 1;
+        }
+        hasPendingEntry = true;
+        pendingProbability = probability;
+        pendingTerminalId = it->second;
+    }
+    if (hasPendingEntry) {
+        if (!writeBigramEntryAndAdvancePosition(pendingProbability, false /* hasNext */,
+                pendingTerminalId, &writingPos)) {
+            return false;
+        }
+        *outEntryCount += 1;
+    }
+    return true;
+}
+
 } // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h
index c5410b83f..ec0d756d8 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h
@@ -19,6 +19,7 @@
 
 #include "defines.h"
 #include "suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h"
+#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h"
 #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
 
 namespace latinime {
@@ -65,6 +66,10 @@ class BigramDictContent : public SparseTableDictContent {
                 Ver4DictConstants::BIGRAM_FILE_EXTENSION);
     }
 
+    bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+            const BigramDictContent *const originalBigramDictContent,
+            int *const outBigramEntryCount);
+
  private:
     DISALLOW_COPY_AND_ASSIGN(BigramDictContent);
 
@@ -72,6 +77,11 @@ class BigramDictContent : public SparseTableDictContent {
         return (probability & Ver4DictConstants::BIGRAM_PROBABILITY_MASK)
                 | (hasNext ? Ver4DictConstants::BIGRAM_HAS_NEXT_MASK : 0);
     }
+
+    bool runGCBigramList(const int bigramListPos,
+            const BigramDictContent *const sourceBigramDictContent, const int toPos,
+            const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+            int *const outEntryCount);
 };
 } // namespace latinime
 #endif /* LATINIME_BIGRAM_DICT_CONTENT_H */
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.h
index 73a472a21..0958dd124 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.h
@@ -19,6 +19,7 @@
 
 #include "defines.h"
 #include "suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h"
+#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h"
 #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
 #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h"
 #include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
@@ -28,13 +29,14 @@ namespace latinime {
 class ProbabilityDictContent : public SingleDictContent {
  public:
     ProbabilityDictContent(const char *const dictDirPath, const bool isUpdatable)
-            : SingleDictContent(dictDirPath, Ver4DictConstants::FREQ_FILE_EXTENSION,
-                    isUpdatable) {}
+            : SingleDictContent(dictDirPath, Ver4DictConstants::FREQ_FILE_EXTENSION, isUpdatable),
+              mSize(getBuffer()->getTailPosition() / (Ver4DictConstants::PROBABILITY_SIZE
+                      + Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE)) {}
 
-    ProbabilityDictContent() {}
+    ProbabilityDictContent() : mSize(0) {}
 
     int getProbability(const int terminalId) const {
-        if (terminalId < 0 || terminalId >= getSize()) {
+        if (terminalId < 0 || terminalId >= mSize) {
             return NOT_A_PROBABILITY;
         }
         return Ver4PatriciaTrieReadingUtils::getProbability(getBuffer(), terminalId);
@@ -44,7 +46,7 @@ class ProbabilityDictContent : public SingleDictContent {
         if (terminalId < 0) {
             return false;
         }
-        if (terminalId >= getSize()) {
+        if (terminalId >= mSize) {
             // Write new entry.
             int writingPos = getBuffer()->getTailPosition();
             while (writingPos <= getEntryPos(terminalId)) {
@@ -58,6 +60,7 @@ class ProbabilityDictContent : public SingleDictContent {
                         Ver4DictConstants::PROBABILITY_SIZE, &writingPos)) {
                     return false;
                 }
+                mSize++;
             }
         }
         const int probabilityWritingPos = getEntryPos(terminalId)
@@ -67,7 +70,38 @@ class ProbabilityDictContent : public SingleDictContent {
     }
 
     bool flushToFile(const char *const dictDirPath) const {
-        return flush(dictDirPath, Ver4DictConstants::FREQ_FILE_EXTENSION);
+        if (getEntryPos(mSize) < getBuffer()->getTailPosition()) {
+            // The buffer holds more entries than mSize. Write out only the first mSize entries.
+            ProbabilityDictContent probabilityDictContentToWrite;
+            for (int i = 0; i < mSize; ++i) {
+                if (!probabilityDictContentToWrite.setProbability(i, getProbability(i))) {
+                    return false;
+                }
+            }
+            return probabilityDictContentToWrite.flush(dictDirPath,
+                    Ver4DictConstants::FREQ_FILE_EXTENSION);
+        } else {
+            return flush(dictDirPath, Ver4DictConstants::FREQ_FILE_EXTENSION);
+        }
+    }
+
+    // Copies the probability of every terminal in terminalIdMap from
+    // originalProbabilityDictContent. The map key is used as the terminal id in the original
+    // content and the mapped value as the terminal id in this content.
+    bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+            const ProbabilityDictContent *const originalProbabilityDictContent) {
+        mSize = 0;
+        for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin();
+                it != terminalIdMap->end(); ++it) {
+            if (!setProbability(it->second,
+                    originalProbabilityDictContent->getProbability(it->first))) {
+                return false;
+            }
+            // NOTE(review): setProbability() also increments mSize for each entry it appends, so
+            // entries appended above look like they are counted twice -- confirm this is intended.
+            mSize++;
+        }
+        return true;
     }
 
  private:
@@ -78,10 +112,8 @@ class ProbabilityDictContent : public SingleDictContent {
                 + Ver4DictConstants::PROBABILITY_SIZE);
     }
 
-    int getSize() const {
-        return getBuffer()->getTailPosition() / (Ver4DictConstants::PROBABILITY_SIZE
-                + Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE);
-    }
+    // Number of probability entries in this content.
+    int mSize;
 };
 } // namespace latinime
 #endif /* LATINIME_PROBABILITY_DICT_CONTENT_H */
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp
index f09bd4409..14eff673c 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp
@@ -48,6 +48,33 @@ bool ShortcutDictContent::flushToFile(const char *const dictDirPath) const {
             Ver4DictConstants::SHORTCUT_FILE_EXTENSION);
 }
 
+// Copies the shortcut lists of all terminals in terminalIdMap (old terminal id -> new terminal id)
+// from originalShortcutDictContent to the tail of this content and registers their new positions
+// in the lookup table.
+bool ShortcutDictContent::runGC(
+        const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+        const ShortcutDictContent *const originalShortcutDictContent) {
+    for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin();
+            it != terminalIdMap->end(); ++it) {
+        const int originalShortcutListPos =
+                originalShortcutDictContent->getShortcutListHeadPos(it->first);
+        if (originalShortcutListPos == NOT_A_DICT_POS) {
+            continue;
+        }
+        const int shortcutListPos = getContentBuffer()->getTailPosition();
+        // Copy shortcut list with GC from original content.
+        if (!copyShortcutList(originalShortcutListPos, originalShortcutDictContent,
+                shortcutListPos)) {
+            return false;
+        }
+        // Set shortcut list position to the lookup table.
+        if (!getUpdatableAddressLookupTable()->set(it->second, shortcutListPos)) {
+            return false;
+        }
+    }
+    return true;
+}
+
 bool ShortcutDictContent::copyShortcutList(const int shortcutListPos,
         const ShortcutDictContent *const sourceShortcutDictContent, const int toPos) {
     bool hasNext = true;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h
index 71a8f6b31..902016a1f 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h
@@ -47,6 +47,9 @@ class ShortcutDictContent : public SparseTableDictContent {
 
     bool flushToFile(const char *const dictDirPath) const;
 
+    bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+            const ShortcutDictContent *const originalShortcutDictContent);
+
  private:
     DISALLOW_COPY_AND_ASSIGN(ShortcutDictContent);
 
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.cpp
index 1c5caa1a7..c65420614 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.cpp
@@ -18,22 +18,6 @@
 
 namespace latinime {
 
-bool SparseTableDictContent::copyContent(
-        const SparseTableDictContent *const sparseTableDictContent) {
-    if (!mExpandableLookupTableBuffer.copy(
-            &sparseTableDictContent->mExpandableLookupTableBuffer)) {
-        return false;
-    }
-    if (!mExpandableAddressTableBuffer.copy(
-            &sparseTableDictContent->mExpandableAddressTableBuffer)) {
-        return false;
-    }
-    if (!mExpandableContentBuffer.copy(&sparseTableDictContent->mExpandableContentBuffer)) {
-        return false;
-    }
-    return true;
-}
-
 bool SparseTableDictContent::flush(const char *const dictDirPath,
         const char *const lookupTableFileName, const char *const addressTableFileName,
         const char *const contentFileName) const {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h
index 151568747..9a4f1e1c0 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h
@@ -75,8 +75,6 @@ class SparseTableDictContent : public DictContent {
                 || mExpandableContentBuffer.isNearSizeLimit();
     }
 
-    bool copyContent(const SparseTableDictContent *const sparseTableDictContent);
-
  protected:
     SparseTable *getUpdatableAddressLookupTable() {
         return &mAddressLookupTable;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp
index 071792544..e8a3142b8 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp
@@ -126,6 +126,13 @@ bool Ver4PatriciaTrieNodeWriter::updateChildrenPosition(
             newChildrenPosition, &childrenPosFieldPos);
 }
 
+// Overwrites the terminal id field of the given PtNode with newTerminalId.
+bool Ver4PatriciaTrieNodeWriter::updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams,
+        const int newTerminalId) {
+    return mTrieBuffer->writeUint(newTerminalId, Ver4DictConstants::TERMINAL_ID_FIELD_SIZE,
+            toBeUpdatedPtNodeParams->getTerminalIdFieldPos());
+}
+
 bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition(
         const PtNodeParams *const ptNodeParams, int *const ptNodeWritingPos) {
     const int nodePos = *ptNodeWritingPos;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h
index 23e54cc09..12451525f 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h
@@ -57,6 +57,9 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
     virtual bool updateChildrenPosition(const PtNodeParams *const toBeUpdatedPtNodeParams,
             const int newChildrenPosition);
 
+    bool updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams,
+            const int newTerminalId);
+
     virtual bool writePtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams,
             int *const ptNodeWritingPos);
 
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
index 11adf2c1d..f141d52f5 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
@@ -20,7 +20,6 @@
 
 #include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h"
 #include "suggest/policyimpl/dictionary/header/header_policy.h"
-#include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_gc_event_listeners.h"
 #include "suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h"
 #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h"
 #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
@@ -141,15 +140,6 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
     Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(),
             buffersToWrite, &newPtNodeReader, &newBigramPolicy, &newShortcutPolicy);
 
-    if(!buffersToWrite->getUpdatableBigramDictContent()->copyContent(
-            mBuffers->getBigramDictContent())) {
-        return false;
-    }
-    if(!buffersToWrite->getUpdatableShortcutDictContent()->copyContent(
-            mBuffers->getShortcutDictContent())) {
-        return false;
-    }
-
     DynamicPatriciaTrieReadingHelper newDictReadingHelper(buffersToWrite->getTrieBuffer(),
             &newPtNodeReader);
     newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
@@ -160,11 +150,53 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
         return false;
     }
 
-    // TODO: GC for dict contents.
-
+    // Re-assign terminal IDs for valid terminal PtNodes.
+    TerminalPositionLookupTable::TerminalIdMap terminalIdMap;
+    if (!buffersToWrite->getUpdatableTerminalPositionLookupTable()->runGCTerminalIds(
+            &terminalIdMap)) {
+        return false;
+    }
+    TraversePolicyToUpdateAllTerminalIds traversePolicyToUpdateAllTerminalIds(&newPtNodeWriter,
+            &terminalIdMap);
+    if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
+            &traversePolicyToUpdateAllTerminalIds)) {
+        return false;
+    }
+    // Run GC for probability dict content.
+    if (!buffersToWrite->getUpdatableProbabilityDictContent()->runGC(&terminalIdMap,
+            mBuffers->getProbabilityDictContent())) {
+        return false;
+    }
+    // Run GC for bigram dict content. The content adds to the counter, so reset it first.
+    *outBigramCount = 0;
+    if (!buffersToWrite->getUpdatableBigramDictContent()->runGC(&terminalIdMap,
+            mBuffers->getBigramDictContent(), outBigramCount)) {
+        return false;
+    }
+    // Run GC for shortcut dict content.
+    if (!buffersToWrite->getUpdatableShortcutDictContent()->runGC(&terminalIdMap,
+            mBuffers->getShortcutDictContent())) {
+        return false;
+    }
     *outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount();
-    *outBigramCount = traversePolicyToUpdateAllPositionFields.getBigramCount();
     return true;
 }
 
+// Replaces the terminal id of each visited terminal PtNode with the id mapped in mTerminalIdMap.
+// Fails when the current terminal id of the PtNode is not in the map.
+bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllTerminalIds::onVisitingPtNode(
+        const PtNodeParams *const ptNodeParams) {
+    if (!ptNodeParams->isTerminal()) {
+        return true;
+    }
+    TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
+            mTerminalIdMap->find(ptNodeParams->getTerminalId());
+    if (it == mTerminalIdMap->end()) {
+        AKLOGE("terminal Id %d is not in the terminal position map. map size: %zu",
+                ptNodeParams->getTerminalId(), mTerminalIdMap->size());
+        return false;
+    }
+    return mPtNodeWriter->updateTerminalId(ptNodeParams, it->second);
+}
+
 } // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h
index 8072aa376..ea2fd532b 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h
@@ -18,11 +18,14 @@
 #define LATINIME_VER4_PATRICIA_TRIE_WRITING_HELPER_H
 
 #include "defines.h"
+#include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_gc_event_listeners.h"
+#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h"
 
 namespace latinime {
 
 class HeaderPolicy;
 class Ver4DictBuffers;
+class Ver4PatriciaTrieNodeWriter;
 
 class Ver4PatriciaTrieWritingHelper {
  public:
@@ -39,6 +42,30 @@ class Ver4PatriciaTrieWritingHelper {
  private:
     DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTrieWritingHelper);
 
+    // Traversal listener that rewrites the terminal id of every terminal PtNode it visits
+    // according to the given terminal id map.
+    class TraversePolicyToUpdateAllTerminalIds
+            : public DynamicPatriciaTrieReadingHelper::TraversingEventListener {
+     public:
+        TraversePolicyToUpdateAllTerminalIds(Ver4PatriciaTrieNodeWriter *const ptNodeWriter,
+                const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap)
+                : mPtNodeWriter(ptNodeWriter), mTerminalIdMap(terminalIdMap) {}
+
+        bool onAscend() { return true; }
+
+        bool onDescend(const int ptNodeArrayPos) { return true; }
+
+        bool onReadingPtNodeArrayTail() { return true; }
+
+        bool onVisitingPtNode(const PtNodeParams *const ptNodeParams);
+
+     private:
+        DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllTerminalIds);
+
+        Ver4PatriciaTrieNodeWriter *const mPtNodeWriter;
+        const TerminalPositionLookupTable::TerminalIdMap *const mTerminalIdMap;
+    };
+
     bool runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy,
             Ver4DictBuffers *const buffersToWrite, int *const outUnigramCount,
             int *const outBigramCount, const bool needsToDecay);