diff --git a/native/jni/Android.mk b/native/jni/Android.mk index 99a479e22..35fa352d9 100644 --- a/native/jni/Android.mk +++ b/native/jni/Android.mk @@ -84,6 +84,7 @@ LATIN_IME_CORE_SRC_FILES := \ dynamic_patricia_trie_policy.cpp \ dynamic_patricia_trie_reading_helper.cpp \ dynamic_patricia_trie_reading_utils.cpp \ + dynamic_patricia_trie_updating_helper.cpp \ dynamic_patricia_trie_writing_helper.cpp \ dynamic_patricia_trie_writing_utils.cpp) \ $(addprefix suggest/policyimpl/dictionary/structure/v4/, \ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h index c9eff3d80..0f1e635bc 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h @@ -46,6 +46,9 @@ class PtNodeWriter { const PtNodeParams *const targetPtNodeParam, const int probability, bool *const outAddedNewBigram) = 0; + virtual bool removeBigramEntry(const PtNodeParams *const sourcePtNodeParams, + const PtNodeParams *const targetPtNodeParam) = 0; + protected: PtNodeWriter() {}; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_node_writer.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_node_writer.cpp index 890606f89..3b1e16cc7 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_node_writer.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_node_writer.cpp @@ -224,4 +224,13 @@ bool DynamicPatriciaTrieNodeWriter::addNewBigramEntry( } } +bool DynamicPatriciaTrieNodeWriter::removeBigramEntry( + const PtNodeParams *const sourcePtNodeParams, const PtNodeParams *const targetPtNodeParam) { + if (sourcePtNodeParams->getBigramsPos() == NOT_A_DICT_POS) { + return false; + } + return mBigramPolicy->removeBigram(sourcePtNodeParams->getBigramsPos(), + targetPtNodeParam->getHeadPos()); +} + } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_node_writer.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_node_writer.h index e3a00e94e..87ed1b159 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_node_writer.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_node_writer.h @@ -63,6 +63,9 @@ class DynamicPatriciaTrieNodeWriter : public PtNodeWriter { const PtNodeParams *const targetPtNodeParam, const int probability, bool *const outAddedNewBigram); + virtual bool removeBigramEntry(const PtNodeParams *const sourcePtNodeParams, + const PtNodeParams *const targetPtNodeParam); + private: DISALLOW_COPY_AND_ASSIGN(DynamicPatriciaTrieNodeWriter); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_policy.cpp index 0500f23d1..6b58f5fb7 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_policy.cpp @@ -27,6 +27,7 @@ #include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_node_reader.h" #include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_reading_helper.h" #include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_updating_helper.h" #include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_writing_helper.h" #include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" #include "suggest/policyimpl/dictionary/utils/probability_utils.h" @@ -152,10 +153,8 @@ bool DynamicPatriciaTriePolicy::addUnigramWord(const int *const word, const int } DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, &mNodeReader); readingHelper.initWithPtNodeArrayPos(getRootPosition()); - DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, &mNodeReader, - &mNodeWriter, &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict()); bool addedNewUnigram = false; - if (writingHelper.addUnigramWord(&readingHelper, word, length, probability, + if (mUpdatingHelper.addUnigramWord(&readingHelper, word, length, probability, &addedNewUnigram)) { if (addedNewUnigram) { mUnigramCount++; @@ -187,10 +186,8 @@ bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int if (word1Pos == NOT_A_DICT_POS) { return false; } - DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, &mNodeReader, - &mNodeWriter, &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict()); bool addedNewBigram = false; - if (writingHelper.addBigramWords(word0Pos, word1Pos, probability, &addedNewBigram)) { + if (mUpdatingHelper.addBigramWords(word0Pos, word1Pos, probability, &addedNewBigram)) { if (addedNewBigram) { mBigramCount++; } @@ -221,9 +218,7 @@ bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const if (word1Pos == NOT_A_DICT_POS) { return false; } - DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, &mNodeReader, - &mNodeWriter, &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict()); - if (writingHelper.removeBigramWords(word0Pos, word1Pos)) { + if (mUpdatingHelper.removeBigramWords(word0Pos, word1Pos)) { mBigramCount--; return true; } else { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_policy.h index 1c490ceb9..454698430 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_policy.h @@ -24,6 +24,7 @@ #include "suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h" #include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_node_reader.h" #include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_node_writer.h" +#include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_updating_helper.h" #include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" #include "suggest/policyimpl/dictionary/utils/format_utils.h" #include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" @@ -49,6 +50,8 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { mNodeReader(&mBufferWithExtendableBuffer, &mBigramListPolicy, &mShortcutListPolicy), mNodeWriter(&mBufferWithExtendableBuffer, &mNodeReader, &mBigramListPolicy, &mShortcutListPolicy), + mUpdatingHelper(&mBufferWithExtendableBuffer, &mNodeReader, &mNodeWriter, + mHeaderPolicy.isDecayingDict()), mUnigramCount(mHeaderPolicy.getUnigramCount()), mBigramCount(mHeaderPolicy.getBigramCount()), mNeedsToDecayForTesting(false) {} @@ -121,6 +124,7 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { DynamicBigramListPolicy mBigramListPolicy; DynamicPatriciaTrieNodeReader mNodeReader; DynamicPatriciaTrieNodeWriter mNodeWriter; + DynamicPatriciaTrieUpdatingHelper mUpdatingHelper; int mUnigramCount; int mBigramCount; int mNeedsToDecayForTesting; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_updating_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_updating_helper.cpp new file mode 100644 index 000000000..cbc0d2304 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_updating_helper.cpp @@ -0,0 +1,279 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_updating_helper.h" + +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h" +#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_reading_helper.h" +#include "suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_writing_utils.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" + +namespace latinime { + +const int DynamicPatriciaTrieUpdatingHelper::CHILDREN_POSITION_FIELD_SIZE = 3; + +bool DynamicPatriciaTrieUpdatingHelper::addUnigramWord( + DynamicPatriciaTrieReadingHelper *const readingHelper, + const int *const wordCodePoints, const int codePointCount, const int probability, + bool *const outAddedNewUnigram) { + int parentPos = NOT_A_DICT_POS; + while (!readingHelper->isEnd()) { + const PtNodeParams ptNodeParams(readingHelper->getPtNodeParams()); + if (!ptNodeParams.isValid()) { + break; + } + const int matchedCodePointCount = readingHelper->getPrevTotalCodePointCount(); + if (!readingHelper->isMatchedCodePoint(ptNodeParams, 0 /* index */, + wordCodePoints[matchedCodePointCount])) { + // The first code point is different from target code point. Skip this node and read + // the next sibling node. + readingHelper->readNextSiblingNode(ptNodeParams); + continue; + } + // Check following merged node code points. + const int nodeCodePointCount = ptNodeParams.getCodePointCount(); + for (int j = 1; j < nodeCodePointCount; ++j) { + const int nextIndex = matchedCodePointCount + j; + if (nextIndex >= codePointCount || !readingHelper->isMatchedCodePoint(ptNodeParams, j, + wordCodePoints[matchedCodePointCount + j])) { + *outAddedNewUnigram = true; + return reallocatePtNodeAndAddNewPtNodes(&ptNodeParams, j, + getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */, + probability), + wordCodePoints + matchedCodePointCount, + codePointCount - matchedCodePointCount); + } + } + // All characters are matched. + if (codePointCount == readingHelper->getTotalCodePointCount(ptNodeParams)) { + return setPtNodeProbability(&ptNodeParams, probability, outAddedNewUnigram); + } + if (!ptNodeParams.hasChildren()) { + *outAddedNewUnigram = true; + return createChildrenPtNodeArrayAndAChildPtNode(&ptNodeParams, + getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */, probability), + wordCodePoints + readingHelper->getTotalCodePointCount(ptNodeParams), + codePointCount - readingHelper->getTotalCodePointCount(ptNodeParams)); + } + // Advance to the children nodes. + parentPos = ptNodeParams.getHeadPos(); + readingHelper->readChildNode(ptNodeParams); + } + if (readingHelper->isError()) { + // The dictionary is invalid. + return false; + } + int pos = readingHelper->getPosOfLastForwardLinkField(); + *outAddedNewUnigram = true; + return createAndInsertNodeIntoPtNodeArray(parentPos, + wordCodePoints + readingHelper->getPrevTotalCodePointCount(), + codePointCount - readingHelper->getPrevTotalCodePointCount(), + getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */, probability), &pos); +} + +bool DynamicPatriciaTrieUpdatingHelper::addBigramWords(const int word0Pos, const int word1Pos, + const int probability, bool *const outAddedNewBigram) { + const PtNodeParams sourcePtNodeParams( + mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(word0Pos)); + const PtNodeParams targetPtNodeParams( + mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(word1Pos)); + return mPtNodeWriter->addNewBigramEntry(&sourcePtNodeParams, &targetPtNodeParams, probability, + outAddedNewBigram); +} + +// Remove a bigram relation from word0Pos to word1Pos. +bool DynamicPatriciaTrieUpdatingHelper::removeBigramWords(const int word0Pos, const int word1Pos) { + const PtNodeParams sourcePtNodeParams( + mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(word0Pos)); + const PtNodeParams targetPtNodeParams( + mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(word1Pos)); + return mPtNodeWriter->removeBigramEntry(&sourcePtNodeParams, &targetPtNodeParams); +} + +bool DynamicPatriciaTrieUpdatingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos, + const int *const nodeCodePoints, const int nodeCodePointCount, const int probability, + int *const forwardLinkFieldPos) { + const int newPtNodeArrayPos = mBuffer->getTailPosition(); + if (!DynamicPatriciaTrieWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, + newPtNodeArrayPos, forwardLinkFieldPos)) { + return false; + } + return createNewPtNodeArrayWithAChildPtNode(parentPos, nodeCodePoints, nodeCodePointCount, + probability); +} + +bool DynamicPatriciaTrieUpdatingHelper::setPtNodeProbability( + const PtNodeParams *const originalPtNodeParams, const int probability, + bool *const outAddedNewUnigram) { + if (originalPtNodeParams->isTerminal()) { + // Overwrites the probability. + *outAddedNewUnigram = false; + const int probabilityToWrite = getUpdatedProbability( + originalPtNodeParams->getProbability(), probability); + return mPtNodeWriter->updatePtNodeProbability(originalPtNodeParams, probabilityToWrite); + } else { + // Make the node terminal and write the probability. + *outAddedNewUnigram = true; + const int movedPos = mBuffer->getTailPosition(); + int writingPos = movedPos; + const PtNodeParams ptNodeParamsToWrite(getUpdatedPtNodeParams(originalPtNodeParams, + originalPtNodeParams->getParentPos(), originalPtNodeParams->getCodePointCount(), + originalPtNodeParams->getCodePoints(), + getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */, probability))); + if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&ptNodeParamsToWrite, &writingPos)) { + return false; + } + if (!mPtNodeWriter->markPtNodeAsMoved(originalPtNodeParams, movedPos, movedPos)) { + return false; + } + } + return true; +} + +bool DynamicPatriciaTrieUpdatingHelper::createChildrenPtNodeArrayAndAChildPtNode( + const PtNodeParams *const parentPtNodeParams, const int probability, + const int *const codePoints, const int codePointCount) { + const int newPtNodeArrayPos = mBuffer->getTailPosition(); + if (!mPtNodeWriter->updateChildrenPosition(parentPtNodeParams, newPtNodeArrayPos)) { + return false; + } + return createNewPtNodeArrayWithAChildPtNode(parentPtNodeParams->getHeadPos(), codePoints, + codePointCount, probability); +} + +bool DynamicPatriciaTrieUpdatingHelper::createNewPtNodeArrayWithAChildPtNode( + const int parentPtNodePos, const int *const nodeCodePoints, const int nodeCodePointCount, + const int probability) { + int writingPos = mBuffer->getTailPosition(); + if (!DynamicPatriciaTrieWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, + 1 /* arraySize */, &writingPos)) { + return false; + } + const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( + parentPtNodePos, nodeCodePointCount, nodeCodePoints, probability)); + if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&ptNodeParamsToWrite, &writingPos)) { + return false; + } + if (!DynamicPatriciaTrieWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, + NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { + return false; + } + return true; +} + +// Returns whether the dictionary updating was succeeded or not. +bool DynamicPatriciaTrieUpdatingHelper::reallocatePtNodeAndAddNewPtNodes( + const PtNodeParams *const reallocatingPtNodeParams, const int overlappingCodePointCount, + const int probabilityOfNewPtNode, const int *const newNodeCodePoints, + const int newNodeCodePointCount) { + // When addsExtraChild is true, split the reallocating PtNode and add new child. + // Reallocating PtNode: abcde, newNode: abcxy. + // abc (1st, not terminal) __ de (2nd) + // \_ xy (extra child, terminal) + // Otherwise, this method makes 1st part terminal and write probabilityOfNewPtNode. + // Reallocating PtNode: abcde, newNode: abc. + // abc (1st, terminal) __ de (2nd) + const bool addsExtraChild = newNodeCodePointCount > overlappingCodePointCount; + const int firstPartOfReallocatedPtNodePos = mBuffer->getTailPosition(); + int writingPos = firstPartOfReallocatedPtNodePos; + // Write the 1st part of the reallocating node. The children position will be updated later + // with actual children position. + const int newProbability = addsExtraChild ? NOT_A_PROBABILITY : probabilityOfNewPtNode; + const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( + reallocatingPtNodeParams->getParentPos(), overlappingCodePointCount, + reallocatingPtNodeParams->getCodePoints(), newProbability)); + if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&ptNodeParamsToWrite, &writingPos)) { + return false; + } + const int actualChildrenPos = writingPos; + // Create new children PtNode array. + const size_t newPtNodeCount = addsExtraChild ? 2 : 1; + if (!DynamicPatriciaTrieWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, + newPtNodeCount, &writingPos)) { + return false; + } + // Write the 2nd part of the reallocating node. + const int secondPartOfReallocatedPtNodePos = writingPos; + const PtNodeParams childPartPtNodeParams(getUpdatedPtNodeParams(reallocatingPtNodeParams, + firstPartOfReallocatedPtNodePos, + reallocatingPtNodeParams->getCodePointCount() - overlappingCodePointCount, + reallocatingPtNodeParams->getCodePoints() + overlappingCodePointCount, + reallocatingPtNodeParams->getProbability())); + if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&childPartPtNodeParams, &writingPos)) { + return false; + } + if (addsExtraChild) { + const PtNodeParams extraChildPtNodeParams(getPtNodeParamsForNewPtNode( + firstPartOfReallocatedPtNodePos, newNodeCodePointCount - overlappingCodePointCount, + newNodeCodePoints + overlappingCodePointCount, probabilityOfNewPtNode)); + if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&extraChildPtNodeParams, &writingPos)) { + return false; + } + } + if (!DynamicPatriciaTrieWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, + NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { + return false; + } + // Update original reallocating PtNode as moved. + if (!mPtNodeWriter->markPtNodeAsMoved(reallocatingPtNodeParams, firstPartOfReallocatedPtNodePos, + secondPartOfReallocatedPtNodePos)) { + return false; + } + // Load node info. Information of the 1st part will be fetched. + const PtNodeParams ptNodeParams( + mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(firstPartOfReallocatedPtNodePos)); + // Update children position. + return mPtNodeWriter->updateChildrenPosition(&ptNodeParams, actualChildrenPos); +} + +int DynamicPatriciaTrieUpdatingHelper::getUpdatedProbability(const int originalProbability, + const int newProbability) const { + if (mNeedsToDecay) { + return ForgettingCurveUtils::getUpdatedEncodedProbability(originalProbability, + newProbability); + } else { + return newProbability; + } +} + +const PtNodeParams DynamicPatriciaTrieUpdatingHelper::getUpdatedPtNodeParams( + const PtNodeParams *const originalPtNodeParams, const int parentPos, + const int codePointCount, const int *const codePoints, const int probability) const { + const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags( + originalPtNodeParams->isBlacklisted(), originalPtNodeParams->isNotAWord(), + probability != NOT_A_PROBABILITY /* isTerminal */, + originalPtNodeParams->getShortcutPos() != NOT_A_DICT_POS /* hasShortcutTargets */, + originalPtNodeParams->getBigramsPos() != NOT_A_DICT_POS /* hasBigrams */, + codePointCount > 1 /* hasMultipleChars */, CHILDREN_POSITION_FIELD_SIZE); + return PtNodeParams(originalPtNodeParams, flags, parentPos, codePointCount, codePoints, + probability); +} + +const PtNodeParams DynamicPatriciaTrieUpdatingHelper::getPtNodeParamsForNewPtNode( + const int parentPos, const int codePointCount, const int *const codePoints, + const int probability) const { + const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags( + false /* isBlacklisted */, false /* isNotAWord */, + probability != NOT_A_PROBABILITY /* isTerminal */, + false /* hasShortcutTargets */, false /* hasBigrams */, + codePointCount > 1 /* hasMultipleChars */, CHILDREN_POSITION_FIELD_SIZE); + return PtNodeParams(flags, parentPos, codePointCount, codePoints, probability); +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_updating_helper.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_updating_helper.h new file mode 100644 index 000000000..b9800cd80 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_updating_helper.h @@ -0,0 +1,93 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_UPDATING_HELPER_H +#define LATINIME_DYNAMIC_PATRICIA_TRIE_UPDATING_HELPER_H + +#include + +#include "defines.h" +#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" +#include "utils/hash_map_compat.h" + +namespace latinime { + +class BufferWithExtendableBuffer; +class DynamicPatriciaTrieReadingHelper; +class PtNodeReader; +class PtNodeWriter; + +// TODO: Move to pt_common. +class DynamicPatriciaTrieUpdatingHelper { + public: + DynamicPatriciaTrieUpdatingHelper(BufferWithExtendableBuffer *const buffer, + const PtNodeReader *const ptNodeReader, PtNodeWriter *const ptNodeWriter, + const bool needsToDecay) + : mBuffer(buffer), mPtNodeReader(ptNodeReader), mPtNodeWriter(ptNodeWriter), + mNeedsToDecay(needsToDecay) {} + + ~DynamicPatriciaTrieUpdatingHelper() {} + + // Add a word to the dictionary. If the word already exists, update the probability. + bool addUnigramWord(DynamicPatriciaTrieReadingHelper *const readingHelper, + const int *const wordCodePoints, const int codePointCount, const int probability, + bool *const outAddedNewUnigram); + + // Add a bigram relation from word0Pos to word1Pos. + bool addBigramWords(const int word0Pos, const int word1Pos, const int probability, + bool *const outAddedNewBigram); + + // Remove a bigram relation from word0Pos to word1Pos. + bool removeBigramWords(const int word0Pos, const int word1Pos); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieUpdatingHelper); + + static const int CHILDREN_POSITION_FIELD_SIZE; + + BufferWithExtendableBuffer *const mBuffer; + const PtNodeReader *const mPtNodeReader; + PtNodeWriter *const mPtNodeWriter; + const bool mNeedsToDecay; + + bool createAndInsertNodeIntoPtNodeArray(const int parentPos, const int *const nodeCodePoints, + const int nodeCodePointCount, const int probability, int *const forwardLinkFieldPos); + + bool setPtNodeProbability(const PtNodeParams *const originalPtNodeParams, const int probability, + bool *const outAddedNewUnigram); + + bool createChildrenPtNodeArrayAndAChildPtNode(const PtNodeParams *const parentPtNodeParams, + const int probability, const int *const codePoints, const int codePointCount); + + bool createNewPtNodeArrayWithAChildPtNode(const int parentPos, const int *const nodeCodePoints, + const int nodeCodePointCount, const int probability); + + bool reallocatePtNodeAndAddNewPtNodes( + const PtNodeParams *const reallocatingPtNodeParams, const int overlappingCodePointCount, + const int probabilityOfNewPtNode, const int *const newNodeCodePoints, + const int newNodeCodePointCount); + + int getUpdatedProbability(const int originalProbability, const int newProbability) const; + + const PtNodeParams getUpdatedPtNodeParams(const PtNodeParams *const originalPtNodeParams, + const int parentPos, const int codePointCount, const int *const codePoints, + const int probability) const; + + const PtNodeParams getPtNodeParamsForNewPtNode(const int parentPos, const int codePointCount, + const int *const codePoints, const int probability) const; +}; +} // namespace latinime +#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_UPDATING_HELPER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_writing_helper.cpp index 6a550a1af..d97cec53b 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_writing_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_writing_helper.cpp @@ -34,88 +34,9 @@ namespace latinime { -const int DynamicPatriciaTrieWritingHelper::CHILDREN_POSITION_FIELD_SIZE = 3; // TODO: Make MAX_DICTIONARY_SIZE 8MB. const size_t DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE = 2 * 1024 * 1024; -bool DynamicPatriciaTrieWritingHelper::addUnigramWord( - DynamicPatriciaTrieReadingHelper *const readingHelper, - const int *const wordCodePoints, const int codePointCount, const int probability, - bool *const outAddedNewUnigram) { - int parentPos = NOT_A_DICT_POS; - while (!readingHelper->isEnd()) { - const PtNodeParams ptNodeParams(readingHelper->getPtNodeParams()); - if (!ptNodeParams.isValid()) { - break; - } - const int matchedCodePointCount = readingHelper->getPrevTotalCodePointCount(); - if (!readingHelper->isMatchedCodePoint(ptNodeParams, 0 /* index */, - wordCodePoints[matchedCodePointCount])) { - // The first code point is different from target code point. Skip this node and read - // the next sibling node. - readingHelper->readNextSiblingNode(ptNodeParams); - continue; - } - // Check following merged node code points. - const int nodeCodePointCount = ptNodeParams.getCodePointCount(); - for (int j = 1; j < nodeCodePointCount; ++j) { - const int nextIndex = matchedCodePointCount + j; - if (nextIndex >= codePointCount || !readingHelper->isMatchedCodePoint(ptNodeParams, j, - wordCodePoints[matchedCodePointCount + j])) { - *outAddedNewUnigram = true; - return reallocatePtNodeAndAddNewPtNodes(&ptNodeParams, j, - getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */, - probability), - wordCodePoints + matchedCodePointCount, - codePointCount - matchedCodePointCount); - } - } - // All characters are matched. - if (codePointCount == readingHelper->getTotalCodePointCount(ptNodeParams)) { - return setPtNodeProbability(&ptNodeParams, probability, outAddedNewUnigram); - } - if (!ptNodeParams.hasChildren()) { - *outAddedNewUnigram = true; - return createChildrenPtNodeArrayAndAChildPtNode(&ptNodeParams, - getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */, probability), - wordCodePoints + readingHelper->getTotalCodePointCount(ptNodeParams), - codePointCount - readingHelper->getTotalCodePointCount(ptNodeParams)); - } - // Advance to the children nodes. - parentPos = ptNodeParams.getHeadPos(); - readingHelper->readChildNode(ptNodeParams); - } - if (readingHelper->isError()) { - // The dictionary is invalid. - return false; - } - int pos = readingHelper->getPosOfLastForwardLinkField(); - *outAddedNewUnigram = true; - return createAndInsertNodeIntoPtNodeArray(parentPos, - wordCodePoints + readingHelper->getPrevTotalCodePointCount(), - codePointCount - readingHelper->getPrevTotalCodePointCount(), - getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */, probability), &pos); -} - -bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const int word1Pos, - const int probability, bool *const outAddedNewBigram) { - const PtNodeParams sourcePtNodeParams( - mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(word0Pos)); - const PtNodeParams targetPtNodeParams( - mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(word1Pos)); - return mPtNodeWriter->addNewBigramEntry(&sourcePtNodeParams, &targetPtNodeParams, probability, - outAddedNewBigram); -} - -// Remove a bigram relation from word0Pos to word1Pos. -bool DynamicPatriciaTrieWritingHelper::removeBigramWords(const int word0Pos, const int word1Pos) { - const PtNodeParams ptNodeParams(mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(word0Pos)); - if (ptNodeParams.getBigramsPos() == NOT_A_DICT_POS) { - return false; - } - return mBigramPolicy->removeBigram(ptNodeParams.getBigramsPos(), word1Pos); -} - void DynamicPatriciaTrieWritingHelper::writeToDictFile(const char *const fileName, const HeaderPolicy *const headerPolicy, const int unigramCount, const int bigramCount) { BufferWithExtendableBuffer headerBuffer( @@ -149,142 +70,6 @@ void DynamicPatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNod DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, &newDictBuffer); } -bool DynamicPatriciaTrieWritingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos, - const int *const nodeCodePoints, const int nodeCodePointCount, const int probability, - int *const forwardLinkFieldPos) { - const int newPtNodeArrayPos = mBuffer->getTailPosition(); - if (!DynamicPatriciaTrieWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, - newPtNodeArrayPos, forwardLinkFieldPos)) { - return false; - } - return createNewPtNodeArrayWithAChildPtNode(parentPos, nodeCodePoints, nodeCodePointCount, - probability); -} - -bool DynamicPatriciaTrieWritingHelper::setPtNodeProbability( - const PtNodeParams *const originalPtNodeParams, const int probability, - bool *const outAddedNewUnigram) { - if (originalPtNodeParams->isTerminal()) { - // Overwrites the probability. - *outAddedNewUnigram = false; - const int probabilityToWrite = getUpdatedProbability( - originalPtNodeParams->getProbability(), probability); - return mPtNodeWriter->updatePtNodeProbability(originalPtNodeParams, probabilityToWrite); - } else { - // Make the node terminal and write the probability. - *outAddedNewUnigram = true; - const int movedPos = mBuffer->getTailPosition(); - int writingPos = movedPos; - const PtNodeParams ptNodeParamsToWrite(getUpdatedPtNodeParams(originalPtNodeParams, - originalPtNodeParams->getParentPos(), originalPtNodeParams->getCodePointCount(), - originalPtNodeParams->getCodePoints(), - getUpdatedProbability(NOT_A_PROBABILITY /* originalProbability */, probability))); - if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&ptNodeParamsToWrite, &writingPos)) { - return false; - } - if (!mPtNodeWriter->markPtNodeAsMoved(originalPtNodeParams, movedPos, movedPos)) { - return false; - } - } - return true; -} - -bool DynamicPatriciaTrieWritingHelper::createChildrenPtNodeArrayAndAChildPtNode( - const PtNodeParams *const parentPtNodeParams, const int probability, - const int *const codePoints, const int codePointCount) { - const int newPtNodeArrayPos = mBuffer->getTailPosition(); - if (!mPtNodeWriter->updateChildrenPosition(parentPtNodeParams, newPtNodeArrayPos)) { - return false; - } - return createNewPtNodeArrayWithAChildPtNode(parentPtNodeParams->getHeadPos(), codePoints, - codePointCount, probability); -} - -bool DynamicPatriciaTrieWritingHelper::createNewPtNodeArrayWithAChildPtNode( - const int parentPtNodePos, const int *const nodeCodePoints, const int nodeCodePointCount, - const int probability) { - int writingPos = mBuffer->getTailPosition(); - if (!DynamicPatriciaTrieWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, - 1 /* arraySize */, &writingPos)) { - return false; - } - const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( - parentPtNodePos, nodeCodePointCount, nodeCodePoints, probability)); - if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&ptNodeParamsToWrite, &writingPos)) { - return false; - } - if (!DynamicPatriciaTrieWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, - NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { - return false; - } - return true; -} - -// Returns whether the dictionary updating was succeeded or not. -bool DynamicPatriciaTrieWritingHelper::reallocatePtNodeAndAddNewPtNodes( - const PtNodeParams *const reallocatingPtNodeParams, const int overlappingCodePointCount, - const int probabilityOfNewPtNode, const int *const newNodeCodePoints, - const int newNodeCodePointCount) { - // When addsExtraChild is true, split the reallocating PtNode and add new child. - // Reallocating PtNode: abcde, newNode: abcxy. - // abc (1st, not terminal) __ de (2nd) - // \_ xy (extra child, terminal) - // Otherwise, this method makes 1st part terminal and write probabilityOfNewPtNode. - // Reallocating PtNode: abcde, newNode: abc. - // abc (1st, terminal) __ de (2nd) - const bool addsExtraChild = newNodeCodePointCount > overlappingCodePointCount; - const int firstPartOfReallocatedPtNodePos = mBuffer->getTailPosition(); - int writingPos = firstPartOfReallocatedPtNodePos; - // Write the 1st part of the reallocating node. The children position will be updated later - // with actual children position. - const int newProbability = addsExtraChild ? NOT_A_PROBABILITY : probabilityOfNewPtNode; - const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( - reallocatingPtNodeParams->getParentPos(), overlappingCodePointCount, - reallocatingPtNodeParams->getCodePoints(), newProbability)); - if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&ptNodeParamsToWrite, &writingPos)) { - return false; - } - const int actualChildrenPos = writingPos; - // Create new children PtNode array. - const size_t newPtNodeCount = addsExtraChild ? 2 : 1; - if (!DynamicPatriciaTrieWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, - newPtNodeCount, &writingPos)) { - return false; - } - // Write the 2nd part of the reallocating node. - const int secondPartOfReallocatedPtNodePos = writingPos; - const PtNodeParams childPartPtNodeParams(getUpdatedPtNodeParams(reallocatingPtNodeParams, - firstPartOfReallocatedPtNodePos, - reallocatingPtNodeParams->getCodePointCount() - overlappingCodePointCount, - reallocatingPtNodeParams->getCodePoints() + overlappingCodePointCount, - reallocatingPtNodeParams->getProbability())); - if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&childPartPtNodeParams, &writingPos)) { - return false; - } - if (addsExtraChild) { - const PtNodeParams extraChildPtNodeParams(getPtNodeParamsForNewPtNode( - firstPartOfReallocatedPtNodePos, newNodeCodePointCount - overlappingCodePointCount, - newNodeCodePoints + overlappingCodePointCount, probabilityOfNewPtNode)); - if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&extraChildPtNodeParams, &writingPos)) { - return false; - } - } - if (!DynamicPatriciaTrieWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, - NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { - return false; - } - // Update original reallocating PtNode as moved. - if (!mPtNodeWriter->markPtNodeAsMoved(reallocatingPtNodeParams, firstPartOfReallocatedPtNodePos, - secondPartOfReallocatedPtNodePos)) { - return false; - } - // Load node info. Information of the 1st part will be fetched. - const PtNodeParams ptNodeParams( - mPtNodeReader->fetchNodeInfoInBufferFromPtNodePos(firstPartOfReallocatedPtNodePos)); - // Update children position. - return mPtNodeWriter->updateChildrenPosition(&ptNodeParams, actualChildrenPos); -} - // TODO: Make this method version independent. bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy, BufferWithExtendableBuffer *const bufferToWrite, @@ -351,38 +136,4 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, return true; } -int DynamicPatriciaTrieWritingHelper::getUpdatedProbability(const int originalProbability, - const int newProbability) const { - if (mNeedsToDecay) { - return ForgettingCurveUtils::getUpdatedEncodedProbability(originalProbability, - newProbability); - } else { - return newProbability; - } -} - -const PtNodeParams DynamicPatriciaTrieWritingHelper::getUpdatedPtNodeParams( - const PtNodeParams *const originalPtNodeParams, const int parentPos, - const int codePointCount, const int *const codePoints, const int probability) const { - const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags( - originalPtNodeParams->isBlacklisted(), originalPtNodeParams->isNotAWord(), - probability != NOT_A_PROBABILITY /* isTerminal */, - originalPtNodeParams->getShortcutPos() != NOT_A_DICT_POS /* hasShortcutTargets */, - originalPtNodeParams->getBigramsPos() != NOT_A_DICT_POS /* hasBigrams */, - codePointCount > 1 /* hasMultipleChars */, CHILDREN_POSITION_FIELD_SIZE); - return PtNodeParams(originalPtNodeParams, flags, parentPos, codePointCount, codePoints, - probability); -} - -const PtNodeParams DynamicPatriciaTrieWritingHelper::getPtNodeParamsForNewPtNode( - const int parentPos, const int codePointCount, const int *const codePoints, - const int probability) const { - const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags( - false /* isBlacklisted */, false /* isNotAWord */, - probability != NOT_A_PROBABILITY /* isTerminal */, - false /* hasShortcutTargets */, false /* hasBigrams */, - codePointCount > 1 /* hasMultipleChars */, CHILDREN_POSITION_FIELD_SIZE); - return PtNodeParams(flags, parentPos, codePointCount, codePoints, probability); -} - } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_writing_helper.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_writing_helper.h index 8135f5054..7223dea8b 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_writing_helper.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v3/dynamic_patricia_trie_writing_helper.h @@ -20,7 +20,6 @@ #include #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" #include "utils/hash_map_compat.h" namespace latinime { @@ -62,18 +61,6 @@ class DynamicPatriciaTrieWritingHelper { ~DynamicPatriciaTrieWritingHelper() {} - // Add a word to the dictionary. If the word already exists, update the probability. - bool addUnigramWord(DynamicPatriciaTrieReadingHelper *const readingHelper, - const int *const wordCodePoints, const int codePointCount, const int probability, - bool *const outAddedNewUnigram); - - // Add a bigram relation from word0Pos to word1Pos. - bool addBigramWords(const int word0Pos, const int word1Pos, const int probability, - bool *const outAddedNewBigram); - - // Remove a bigram relation from word0Pos to word1Pos. - bool removeBigramWords(const int word0Pos, const int word1Pos); - void writeToDictFile(const char *const fileName, const HeaderPolicy *const headerPolicy, const int unigramCount, const int bigramCount); @@ -83,8 +70,6 @@ class DynamicPatriciaTrieWritingHelper { private: DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieWritingHelper); - static const int CHILDREN_POSITION_FIELD_SIZE; - BufferWithExtendableBuffer *const mBuffer; const PtNodeReader *const mPtNodeReader; PtNodeWriter *const mPtNodeWriter; @@ -92,35 +77,9 @@ class DynamicPatriciaTrieWritingHelper { DynamicShortcutListPolicy *const mShortcutPolicy; const bool mNeedsToDecay; - bool createAndInsertNodeIntoPtNodeArray(const int parentPos, const int *const nodeCodePoints, - const int nodeCodePointCount, const int probability, int *const forwardLinkFieldPos); - - bool setPtNodeProbability(const PtNodeParams *const originalPtNodeParams, const int probability, - bool *const outAddedNewUnigram); - - bool createChildrenPtNodeArrayAndAChildPtNode(const PtNodeParams *const parentPtNodeParams, - const int probability, const int *const codePoints, const int codePointCount); - - bool createNewPtNodeArrayWithAChildPtNode(const int parentPos, const int *const nodeCodePoints, - const int nodeCodePointCount, const int probability); - - bool reallocatePtNodeAndAddNewPtNodes( - const PtNodeParams *const reallocatingPtNodeParams, const int overlappingCodePointCount, - const int probabilityOfNewPtNode, const int *const newNodeCodePoints, - const int newNodeCodePointCount); - bool runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy, BufferWithExtendableBuffer *const bufferToWrite, int *const outUnigramCount, int *const outBigramCount); - - int getUpdatedProbability(const int originalProbability, const int newProbability) const; - - const PtNodeParams getUpdatedPtNodeParams(const PtNodeParams *const originalPtNodeParams, - const int parentPos, const int codePointCount, const int *const codePoints, - const int probability) const; - - const PtNodeParams getPtNodeParamsForNewPtNode(const int parentPos, const int codePointCount, - const int *const codePoints, const int probability) const; }; } // namespace latinime #endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H */