/* * Copyright (C) 2013, The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "dictionary/structure/pt_common/dynamic_pt_updating_helper.h" #include "dictionary/property/unigram_property.h" #include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h" #include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h" #include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" #include "dictionary/structure/pt_common/pt_node_reader.h" #include "dictionary/structure/pt_common/pt_node_writer.h" #include "dictionary/utils/buffer_with_extendable_buffer.h" namespace latinime { const int DynamicPtUpdatingHelper::CHILDREN_POSITION_FIELD_SIZE = 3; bool DynamicPtUpdatingHelper::addUnigramWord(DynamicPtReadingHelper *const readingHelper, const CodePointArrayView wordCodePoints, const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram) { int parentPos = NOT_A_DICT_POS; while (!readingHelper->isEnd()) { const PtNodeParams ptNodeParams(readingHelper->getPtNodeParams()); if (!ptNodeParams.isValid()) { break; } const size_t matchedCodePointCount = readingHelper->getPrevTotalCodePointCount(); if (!readingHelper->isMatchedCodePoint(ptNodeParams, 0 /* index */, wordCodePoints[matchedCodePointCount])) { // The first code point is different from target code point. Skip this node and read // the next sibling node. readingHelper->readNextSiblingNode(ptNodeParams); continue; } // Check following merged node code points. const size_t nodeCodePointCount = ptNodeParams.getCodePointArrayView().size(); for (size_t j = 1; j < nodeCodePointCount; ++j) { const size_t nextIndex = matchedCodePointCount + j; if (nextIndex >= wordCodePoints.size() || !readingHelper->isMatchedCodePoint(ptNodeParams, j, wordCodePoints[matchedCodePointCount + j])) { *outAddedNewUnigram = true; return reallocatePtNodeAndAddNewPtNodes(&ptNodeParams, j, unigramProperty, wordCodePoints.skip(matchedCodePointCount)); } } // All characters are matched. if (wordCodePoints.size() == readingHelper->getTotalCodePointCount(ptNodeParams)) { return setPtNodeProbability(&ptNodeParams, unigramProperty, outAddedNewUnigram); } if (!ptNodeParams.hasChildren()) { *outAddedNewUnigram = true; return createChildrenPtNodeArrayAndAChildPtNode(&ptNodeParams, unigramProperty, wordCodePoints.skip(readingHelper->getTotalCodePointCount(ptNodeParams))); } // Advance to the children nodes. parentPos = ptNodeParams.getHeadPos(); readingHelper->readChildNode(ptNodeParams); } if (readingHelper->isError()) { // The dictionary is invalid. return false; } int pos = readingHelper->getPosOfLastForwardLinkField(); *outAddedNewUnigram = true; return createAndInsertNodeIntoPtNodeArray(parentPos, wordCodePoints.skip(readingHelper->getPrevTotalCodePointCount()), unigramProperty, &pos); } bool DynamicPtUpdatingHelper::addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, const int wordPos, const NgramProperty *const ngramProperty, bool *const outAddedNewEntry) { if (prevWordsPtNodePos.empty()) { return false; } ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM); int prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) { prevWordTerminalIds[i] = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos( prevWordsPtNodePos[i]).getTerminalId(); } const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size()); const int wordId = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId(); return mPtNodeWriter->addNgramEntry(prevWordIds, wordId, ngramProperty, outAddedNewEntry); } bool DynamicPtUpdatingHelper::removeNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, const int wordPos) { if (prevWordsPtNodePos.empty()) { return false; } ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM); int prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) { prevWordTerminalIds[i] = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos( prevWordsPtNodePos[i]).getTerminalId(); } const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size()); const int wordId = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId(); return mPtNodeWriter->removeNgramEntry(prevWordIds, wordId); } bool DynamicPtUpdatingHelper::addShortcutTarget(const int wordPos, const CodePointArrayView targetCodePoints, const int shortcutProbability) { const PtNodeParams ptNodeParams(mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos)); return mPtNodeWriter->addShortcutTarget(&ptNodeParams, targetCodePoints.data(), targetCodePoints.size(), shortcutProbability); } bool DynamicPtUpdatingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos, const CodePointArrayView ptNodeCodePoints, const UnigramProperty *const unigramProperty, int *const forwardLinkFieldPos) { const int newPtNodeArrayPos = mBuffer->getTailPosition(); if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, newPtNodeArrayPos, forwardLinkFieldPos)) { return false; } return createNewPtNodeArrayWithAChildPtNode(parentPos, ptNodeCodePoints, unigramProperty); } bool DynamicPtUpdatingHelper::setPtNodeProbability(const PtNodeParams *const originalPtNodeParams, const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram) { if (originalPtNodeParams->isTerminal() && !originalPtNodeParams->isDeleted()) { // Overwrites the probability. *outAddedNewUnigram = false; return mPtNodeWriter->updatePtNodeUnigramProperty(originalPtNodeParams, unigramProperty); } else { // Make the node terminal and write the probability. *outAddedNewUnigram = true; const int movedPos = mBuffer->getTailPosition(); int writingPos = movedPos; const PtNodeParams ptNodeParamsToWrite(getUpdatedPtNodeParams(originalPtNodeParams, unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(), true /* isTerminal */, originalPtNodeParams->getParentPos(), originalPtNodeParams->getCodePointArrayView(), unigramProperty->getProbability())); if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, unigramProperty, &writingPos)) { return false; } if (!mPtNodeWriter->markPtNodeAsMoved(originalPtNodeParams, movedPos, movedPos)) { return false; } } return true; } bool DynamicPtUpdatingHelper::createChildrenPtNodeArrayAndAChildPtNode( const PtNodeParams *const parentPtNodeParams, const UnigramProperty *const unigramProperty, const CodePointArrayView codePoints) { const int newPtNodeArrayPos = mBuffer->getTailPosition(); if (!mPtNodeWriter->updateChildrenPosition(parentPtNodeParams, newPtNodeArrayPos)) { return false; } return createNewPtNodeArrayWithAChildPtNode(parentPtNodeParams->getHeadPos(), codePoints, unigramProperty); } bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode( const int parentPtNodePos, const CodePointArrayView ptNodeCodePoints, const UnigramProperty *const unigramProperty) { int writingPos = mBuffer->getTailPosition(); if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, 1 /* arraySize */, &writingPos)) { return false; } const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(), true /* isTerminal */, parentPtNodePos, ptNodeCodePoints, unigramProperty->getProbability())); if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, unigramProperty, &writingPos)) { return false; } if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { return false; } return true; } // Returns whether the dictionary updating was succeeded or not. bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes( const PtNodeParams *const reallocatingPtNodeParams, const size_t overlappingCodePointCount, const UnigramProperty *const unigramProperty, const CodePointArrayView newPtNodeCodePoints) { // When addsExtraChild is true, split the reallocating PtNode and add new child. // Reallocating PtNode: abcde, newNode: abcxy. // abc (1st, not terminal) __ de (2nd) // \_ xy (extra child, terminal) // Otherwise, this method makes 1st part terminal and write information in unigramProperty. // Reallocating PtNode: abcde, newNode: abc. // abc (1st, terminal) __ de (2nd) const bool addsExtraChild = newPtNodeCodePoints.size() > overlappingCodePointCount; const int firstPartOfReallocatedPtNodePos = mBuffer->getTailPosition(); int writingPos = firstPartOfReallocatedPtNodePos; // Write the 1st part of the reallocating node. The children position will be updated later // with actual children position. const CodePointArrayView firstPtNodeCodePoints = reallocatingPtNodeParams->getCodePointArrayView().limit(overlappingCodePointCount); if (addsExtraChild) { const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( false /* isNotAWord */, false /* isPossiblyOffensive */, false /* isTerminal */, reallocatingPtNodeParams->getParentPos(), firstPtNodeCodePoints, NOT_A_PROBABILITY)); if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&ptNodeParamsToWrite, &writingPos)) { return false; } } else { const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(), true /* isTerminal */, reallocatingPtNodeParams->getParentPos(), firstPtNodeCodePoints, unigramProperty->getProbability())); if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, unigramProperty, &writingPos)) { return false; } } const int actualChildrenPos = writingPos; // Create new children PtNode array. const size_t newPtNodeCount = addsExtraChild ? 2 : 1; if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, newPtNodeCount, &writingPos)) { return false; } // Write the 2nd part of the reallocating node. const int secondPartOfReallocatedPtNodePos = writingPos; const PtNodeParams childPartPtNodeParams(getUpdatedPtNodeParams(reallocatingPtNodeParams, reallocatingPtNodeParams->isNotAWord(), reallocatingPtNodeParams->isPossiblyOffensive(), reallocatingPtNodeParams->isTerminal(), firstPartOfReallocatedPtNodePos, reallocatingPtNodeParams->getCodePointArrayView().skip(overlappingCodePointCount), reallocatingPtNodeParams->getProbability())); if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&childPartPtNodeParams, &writingPos)) { return false; } if (addsExtraChild) { const PtNodeParams extraChildPtNodeParams(getPtNodeParamsForNewPtNode( unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(), true /* isTerminal */, firstPartOfReallocatedPtNodePos, newPtNodeCodePoints.skip(overlappingCodePointCount), unigramProperty->getProbability())); if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&extraChildPtNodeParams, unigramProperty, &writingPos)) { return false; } } if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { return false; } // Update original reallocating PtNode as moved. if (!mPtNodeWriter->markPtNodeAsMoved(reallocatingPtNodeParams, firstPartOfReallocatedPtNodePos, secondPartOfReallocatedPtNodePos)) { return false; } // Load node info. Information of the 1st part will be fetched. const PtNodeParams ptNodeParams( mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(firstPartOfReallocatedPtNodePos)); // Update children position. return mPtNodeWriter->updateChildrenPosition(&ptNodeParams, actualChildrenPos); } const PtNodeParams DynamicPtUpdatingHelper::getUpdatedPtNodeParams( const PtNodeParams *const originalPtNodeParams, const bool isNotAWord, const bool isPossiblyOffensive, const bool isTerminal, const int parentPos, const CodePointArrayView codePoints, const int probability) const { const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags( isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */, false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */, CHILDREN_POSITION_FIELD_SIZE); return PtNodeParams(originalPtNodeParams, flags, parentPos, codePoints, probability); } const PtNodeParams DynamicPtUpdatingHelper::getPtNodeParamsForNewPtNode(const bool isNotAWord, const bool isPossiblyOffensive, const bool isTerminal, const int parentPos, const CodePointArrayView codePoints, const int probability) const { const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags( isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */, false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */, CHILDREN_POSITION_FIELD_SIZE); return PtNodeParams(flags, parentPos, codePoints, probability); } } // namespace latinime