300 lines
15 KiB
C++
300 lines
15 KiB
C++
/*
|
|
* Copyright (C) 2013, The Android Open Source Project
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include "dictionary/structure/pt_common/dynamic_pt_updating_helper.h"
|
|
|
|
#include "dictionary/property/unigram_property.h"
|
|
#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
|
|
#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
|
|
#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
|
|
#include "dictionary/structure/pt_common/pt_node_reader.h"
|
|
#include "dictionary/structure/pt_common/pt_node_writer.h"
|
|
#include "dictionary/utils/buffer_with_extendable_buffer.h"
|
|
|
|
namespace latinime {
|
|
|
|
const int DynamicPtUpdatingHelper::CHILDREN_POSITION_FIELD_SIZE = 3;
|
|
|
|
bool DynamicPtUpdatingHelper::addUnigramWord(DynamicPtReadingHelper *const readingHelper,
|
|
const CodePointArrayView wordCodePoints, const UnigramProperty *const unigramProperty,
|
|
bool *const outAddedNewUnigram) {
|
|
int parentPos = NOT_A_DICT_POS;
|
|
while (!readingHelper->isEnd()) {
|
|
const PtNodeParams ptNodeParams(readingHelper->getPtNodeParams());
|
|
if (!ptNodeParams.isValid()) {
|
|
break;
|
|
}
|
|
const size_t matchedCodePointCount = readingHelper->getPrevTotalCodePointCount();
|
|
if (!readingHelper->isMatchedCodePoint(ptNodeParams, 0 /* index */,
|
|
wordCodePoints[matchedCodePointCount])) {
|
|
// The first code point is different from target code point. Skip this node and read
|
|
// the next sibling node.
|
|
readingHelper->readNextSiblingNode(ptNodeParams);
|
|
continue;
|
|
}
|
|
// Check following merged node code points.
|
|
const size_t nodeCodePointCount = ptNodeParams.getCodePointArrayView().size();
|
|
for (size_t j = 1; j < nodeCodePointCount; ++j) {
|
|
const size_t nextIndex = matchedCodePointCount + j;
|
|
if (nextIndex >= wordCodePoints.size()
|
|
|| !readingHelper->isMatchedCodePoint(ptNodeParams, j,
|
|
wordCodePoints[matchedCodePointCount + j])) {
|
|
*outAddedNewUnigram = true;
|
|
return reallocatePtNodeAndAddNewPtNodes(&ptNodeParams, j, unigramProperty,
|
|
wordCodePoints.skip(matchedCodePointCount));
|
|
}
|
|
}
|
|
// All characters are matched.
|
|
if (wordCodePoints.size() == readingHelper->getTotalCodePointCount(ptNodeParams)) {
|
|
return setPtNodeProbability(&ptNodeParams, unigramProperty, outAddedNewUnigram);
|
|
}
|
|
if (!ptNodeParams.hasChildren()) {
|
|
*outAddedNewUnigram = true;
|
|
return createChildrenPtNodeArrayAndAChildPtNode(&ptNodeParams, unigramProperty,
|
|
wordCodePoints.skip(readingHelper->getTotalCodePointCount(ptNodeParams)));
|
|
}
|
|
// Advance to the children nodes.
|
|
parentPos = ptNodeParams.getHeadPos();
|
|
readingHelper->readChildNode(ptNodeParams);
|
|
}
|
|
if (readingHelper->isError()) {
|
|
// The dictionary is invalid.
|
|
return false;
|
|
}
|
|
int pos = readingHelper->getPosOfLastForwardLinkField();
|
|
*outAddedNewUnigram = true;
|
|
return createAndInsertNodeIntoPtNodeArray(parentPos,
|
|
wordCodePoints.skip(readingHelper->getPrevTotalCodePointCount()), unigramProperty,
|
|
&pos);
|
|
}
|
|
|
|
bool DynamicPtUpdatingHelper::addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos,
|
|
const int wordPos, const NgramProperty *const ngramProperty,
|
|
bool *const outAddedNewEntry) {
|
|
if (prevWordsPtNodePos.empty()) {
|
|
return false;
|
|
}
|
|
ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM);
|
|
int prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
|
|
for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) {
|
|
prevWordTerminalIds[i] = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(
|
|
prevWordsPtNodePos[i]).getTerminalId();
|
|
}
|
|
const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size());
|
|
const int wordId =
|
|
mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId();
|
|
return mPtNodeWriter->addNgramEntry(prevWordIds, wordId, ngramProperty, outAddedNewEntry);
|
|
}
|
|
|
|
bool DynamicPtUpdatingHelper::removeNgramEntry(const PtNodePosArrayView prevWordsPtNodePos,
|
|
const int wordPos) {
|
|
if (prevWordsPtNodePos.empty()) {
|
|
return false;
|
|
}
|
|
ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM);
|
|
int prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
|
|
for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) {
|
|
prevWordTerminalIds[i] = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(
|
|
prevWordsPtNodePos[i]).getTerminalId();
|
|
}
|
|
const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size());
|
|
const int wordId =
|
|
mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId();
|
|
return mPtNodeWriter->removeNgramEntry(prevWordIds, wordId);
|
|
}
|
|
|
|
bool DynamicPtUpdatingHelper::addShortcutTarget(const int wordPos,
|
|
const CodePointArrayView targetCodePoints, const int shortcutProbability) {
|
|
const PtNodeParams ptNodeParams(mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos));
|
|
return mPtNodeWriter->addShortcutTarget(&ptNodeParams, targetCodePoints.data(),
|
|
targetCodePoints.size(), shortcutProbability);
|
|
}
|
|
|
|
bool DynamicPtUpdatingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos,
|
|
const CodePointArrayView ptNodeCodePoints, const UnigramProperty *const unigramProperty,
|
|
int *const forwardLinkFieldPos) {
|
|
const int newPtNodeArrayPos = mBuffer->getTailPosition();
|
|
if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer,
|
|
newPtNodeArrayPos, forwardLinkFieldPos)) {
|
|
return false;
|
|
}
|
|
return createNewPtNodeArrayWithAChildPtNode(parentPos, ptNodeCodePoints, unigramProperty);
|
|
}
|
|
|
|
bool DynamicPtUpdatingHelper::setPtNodeProbability(const PtNodeParams *const originalPtNodeParams,
|
|
const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram) {
|
|
if (originalPtNodeParams->isTerminal() && !originalPtNodeParams->isDeleted()) {
|
|
// Overwrites the probability.
|
|
*outAddedNewUnigram = false;
|
|
return mPtNodeWriter->updatePtNodeUnigramProperty(originalPtNodeParams, unigramProperty);
|
|
} else {
|
|
// Make the node terminal and write the probability.
|
|
*outAddedNewUnigram = true;
|
|
const int movedPos = mBuffer->getTailPosition();
|
|
int writingPos = movedPos;
|
|
const PtNodeParams ptNodeParamsToWrite(getUpdatedPtNodeParams(originalPtNodeParams,
|
|
unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
|
|
true /* isTerminal */, originalPtNodeParams->getParentPos(),
|
|
originalPtNodeParams->getCodePointArrayView(), unigramProperty->getProbability()));
|
|
if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
|
|
unigramProperty, &writingPos)) {
|
|
return false;
|
|
}
|
|
if (!mPtNodeWriter->markPtNodeAsMoved(originalPtNodeParams, movedPos, movedPos)) {
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool DynamicPtUpdatingHelper::createChildrenPtNodeArrayAndAChildPtNode(
|
|
const PtNodeParams *const parentPtNodeParams, const UnigramProperty *const unigramProperty,
|
|
const CodePointArrayView codePoints) {
|
|
const int newPtNodeArrayPos = mBuffer->getTailPosition();
|
|
if (!mPtNodeWriter->updateChildrenPosition(parentPtNodeParams, newPtNodeArrayPos)) {
|
|
return false;
|
|
}
|
|
return createNewPtNodeArrayWithAChildPtNode(parentPtNodeParams->getHeadPos(), codePoints,
|
|
unigramProperty);
|
|
}
|
|
|
|
bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode(
|
|
const int parentPtNodePos, const CodePointArrayView ptNodeCodePoints,
|
|
const UnigramProperty *const unigramProperty) {
|
|
int writingPos = mBuffer->getTailPosition();
|
|
if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer,
|
|
1 /* arraySize */, &writingPos)) {
|
|
return false;
|
|
}
|
|
const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
|
|
unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
|
|
true /* isTerminal */, parentPtNodePos, ptNodeCodePoints,
|
|
unigramProperty->getProbability()));
|
|
if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
|
|
unigramProperty, &writingPos)) {
|
|
return false;
|
|
}
|
|
if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer,
|
|
NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) {
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Returns whether the dictionary updating was succeeded or not.
|
|
bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes(
|
|
const PtNodeParams *const reallocatingPtNodeParams, const size_t overlappingCodePointCount,
|
|
const UnigramProperty *const unigramProperty,
|
|
const CodePointArrayView newPtNodeCodePoints) {
|
|
// When addsExtraChild is true, split the reallocating PtNode and add new child.
|
|
// Reallocating PtNode: abcde, newNode: abcxy.
|
|
// abc (1st, not terminal) __ de (2nd)
|
|
// \_ xy (extra child, terminal)
|
|
// Otherwise, this method makes 1st part terminal and write information in unigramProperty.
|
|
// Reallocating PtNode: abcde, newNode: abc.
|
|
// abc (1st, terminal) __ de (2nd)
|
|
const bool addsExtraChild = newPtNodeCodePoints.size() > overlappingCodePointCount;
|
|
const int firstPartOfReallocatedPtNodePos = mBuffer->getTailPosition();
|
|
int writingPos = firstPartOfReallocatedPtNodePos;
|
|
// Write the 1st part of the reallocating node. The children position will be updated later
|
|
// with actual children position.
|
|
const CodePointArrayView firstPtNodeCodePoints =
|
|
reallocatingPtNodeParams->getCodePointArrayView().limit(overlappingCodePointCount);
|
|
if (addsExtraChild) {
|
|
const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
|
|
false /* isNotAWord */, false /* isPossiblyOffensive */, false /* isTerminal */,
|
|
reallocatingPtNodeParams->getParentPos(), firstPtNodeCodePoints,
|
|
NOT_A_PROBABILITY));
|
|
if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&ptNodeParamsToWrite, &writingPos)) {
|
|
return false;
|
|
}
|
|
} else {
|
|
const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
|
|
unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
|
|
true /* isTerminal */, reallocatingPtNodeParams->getParentPos(),
|
|
firstPtNodeCodePoints, unigramProperty->getProbability()));
|
|
if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
|
|
unigramProperty, &writingPos)) {
|
|
return false;
|
|
}
|
|
}
|
|
const int actualChildrenPos = writingPos;
|
|
// Create new children PtNode array.
|
|
const size_t newPtNodeCount = addsExtraChild ? 2 : 1;
|
|
if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer,
|
|
newPtNodeCount, &writingPos)) {
|
|
return false;
|
|
}
|
|
// Write the 2nd part of the reallocating node.
|
|
const int secondPartOfReallocatedPtNodePos = writingPos;
|
|
const PtNodeParams childPartPtNodeParams(getUpdatedPtNodeParams(reallocatingPtNodeParams,
|
|
reallocatingPtNodeParams->isNotAWord(), reallocatingPtNodeParams->isPossiblyOffensive(),
|
|
reallocatingPtNodeParams->isTerminal(), firstPartOfReallocatedPtNodePos,
|
|
reallocatingPtNodeParams->getCodePointArrayView().skip(overlappingCodePointCount),
|
|
reallocatingPtNodeParams->getProbability()));
|
|
if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&childPartPtNodeParams, &writingPos)) {
|
|
return false;
|
|
}
|
|
if (addsExtraChild) {
|
|
const PtNodeParams extraChildPtNodeParams(getPtNodeParamsForNewPtNode(
|
|
unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
|
|
true /* isTerminal */, firstPartOfReallocatedPtNodePos,
|
|
newPtNodeCodePoints.skip(overlappingCodePointCount),
|
|
unigramProperty->getProbability()));
|
|
if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&extraChildPtNodeParams,
|
|
unigramProperty, &writingPos)) {
|
|
return false;
|
|
}
|
|
}
|
|
if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer,
|
|
NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) {
|
|
return false;
|
|
}
|
|
// Update original reallocating PtNode as moved.
|
|
if (!mPtNodeWriter->markPtNodeAsMoved(reallocatingPtNodeParams, firstPartOfReallocatedPtNodePos,
|
|
secondPartOfReallocatedPtNodePos)) {
|
|
return false;
|
|
}
|
|
// Load node info. Information of the 1st part will be fetched.
|
|
const PtNodeParams ptNodeParams(
|
|
mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(firstPartOfReallocatedPtNodePos));
|
|
// Update children position.
|
|
return mPtNodeWriter->updateChildrenPosition(&ptNodeParams, actualChildrenPos);
|
|
}
|
|
|
|
const PtNodeParams DynamicPtUpdatingHelper::getUpdatedPtNodeParams(
|
|
const PtNodeParams *const originalPtNodeParams, const bool isNotAWord,
|
|
const bool isPossiblyOffensive, const bool isTerminal, const int parentPos,
|
|
const CodePointArrayView codePoints, const int probability) const {
|
|
const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
|
|
isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */,
|
|
false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */,
|
|
CHILDREN_POSITION_FIELD_SIZE);
|
|
return PtNodeParams(originalPtNodeParams, flags, parentPos, codePoints, probability);
|
|
}
|
|
|
|
const PtNodeParams DynamicPtUpdatingHelper::getPtNodeParamsForNewPtNode(const bool isNotAWord,
|
|
const bool isPossiblyOffensive, const bool isTerminal, const int parentPos,
|
|
const CodePointArrayView codePoints, const int probability) const {
|
|
const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
|
|
isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */,
|
|
false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */,
|
|
CHILDREN_POSITION_FIELD_SIZE);
|
|
return PtNodeParams(flags, parentPos, codePoints, probability);
|
|
}
|
|
|
|
} // namespace latinime
|