Refactoring: Use UnigramProperty to add/update unigram.

Bug: 13406708
Change-Id: I26fd541fb465d3543faa5f155becc455ddbb6c9c
main
Keisuke Kuroyanagi 2014-05-09 17:22:17 +09:00
parent eaa347bc1a
commit b636e25e95
6 changed files with 81 additions and 80 deletions

View File

@ -16,6 +16,7 @@
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h"
#include "suggest/core/dictionary/property/unigram_property.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
@ -29,9 +30,8 @@ const int DynamicPtUpdatingHelper::CHILDREN_POSITION_FIELD_SIZE = 3;
bool DynamicPtUpdatingHelper::addUnigramWord(
DynamicPtReadingHelper *const readingHelper,
const int *const wordCodePoints, const int codePointCount, const int probability,
const bool isNotAWord, const bool isBlacklisted, const int timestamp,
bool *const outAddedNewUnigram) {
const int *const wordCodePoints, const int codePointCount,
const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram) {
int parentPos = NOT_A_DICT_POS;
while (!readingHelper->isEnd()) {
const PtNodeParams ptNodeParams(readingHelper->getPtNodeParams());
@ -53,20 +53,18 @@ bool DynamicPtUpdatingHelper::addUnigramWord(
if (nextIndex >= codePointCount || !readingHelper->isMatchedCodePoint(ptNodeParams, j,
wordCodePoints[matchedCodePointCount + j])) {
*outAddedNewUnigram = true;
return reallocatePtNodeAndAddNewPtNodes(&ptNodeParams, j, isNotAWord, isBlacklisted,
probability, timestamp, wordCodePoints + matchedCodePointCount,
return reallocatePtNodeAndAddNewPtNodes(&ptNodeParams, j, unigramProperty,
wordCodePoints + matchedCodePointCount,
codePointCount - matchedCodePointCount);
}
}
// All characters are matched.
if (codePointCount == readingHelper->getTotalCodePointCount(ptNodeParams)) {
return setPtNodeProbability(&ptNodeParams, isNotAWord, isBlacklisted, probability,
timestamp, outAddedNewUnigram);
return setPtNodeProbability(&ptNodeParams, unigramProperty, outAddedNewUnigram);
}
if (!ptNodeParams.hasChildren()) {
*outAddedNewUnigram = true;
return createChildrenPtNodeArrayAndAChildPtNode(&ptNodeParams,
isNotAWord, isBlacklisted, probability, timestamp,
return createChildrenPtNodeArrayAndAChildPtNode(&ptNodeParams, unigramProperty,
wordCodePoints + readingHelper->getTotalCodePointCount(ptNodeParams),
codePointCount - readingHelper->getTotalCodePointCount(ptNodeParams));
}
@ -83,7 +81,7 @@ bool DynamicPtUpdatingHelper::addUnigramWord(
return createAndInsertNodeIntoPtNodeArray(parentPos,
wordCodePoints + readingHelper->getPrevTotalCodePointCount(),
codePointCount - readingHelper->getPrevTotalCodePointCount(),
isNotAWord, isBlacklisted, probability, timestamp, &pos);
unigramProperty, &pos);
}
bool DynamicPtUpdatingHelper::addBigramWords(const int word0Pos, const int word1Pos,
@ -115,36 +113,34 @@ bool DynamicPtUpdatingHelper::addShortcutTarget(const int wordPos,
bool DynamicPtUpdatingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos,
const int *const nodeCodePoints, const int nodeCodePointCount,
const bool isNotAWord, const bool isBlacklisted, const int probability,
const int timestamp, int *const forwardLinkFieldPos) {
const UnigramProperty *const unigramProperty, int *const forwardLinkFieldPos) {
const int newPtNodeArrayPos = mBuffer->getTailPosition();
if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer,
newPtNodeArrayPos, forwardLinkFieldPos)) {
return false;
}
return createNewPtNodeArrayWithAChildPtNode(parentPos, nodeCodePoints, nodeCodePointCount,
isNotAWord, isBlacklisted, probability, timestamp);
unigramProperty);
}
bool DynamicPtUpdatingHelper::setPtNodeProbability(
const PtNodeParams *const originalPtNodeParams, const bool isNotAWord,
const bool isBlacklisted, const int probability, const int timestamp,
bool *const outAddedNewUnigram) {
bool DynamicPtUpdatingHelper::setPtNodeProbability(const PtNodeParams *const originalPtNodeParams,
const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram) {
if (originalPtNodeParams->isTerminal()) {
// Overwrites the probability.
*outAddedNewUnigram = false;
return mPtNodeWriter->updatePtNodeProbability(originalPtNodeParams, probability, timestamp);
return mPtNodeWriter->updatePtNodeUnigramProperty(originalPtNodeParams, unigramProperty);
} else {
// Make the node terminal and write the probability.
*outAddedNewUnigram = true;
const int movedPos = mBuffer->getTailPosition();
int writingPos = movedPos;
const PtNodeParams ptNodeParamsToWrite(getUpdatedPtNodeParams(originalPtNodeParams,
isNotAWord, isBlacklisted, true /* isTerminal */,
originalPtNodeParams->getParentPos(), originalPtNodeParams->getCodePointCount(),
originalPtNodeParams->getCodePoints(), probability));
unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(),
true /* isTerminal */, originalPtNodeParams->getParentPos(),
originalPtNodeParams->getCodePointCount(), originalPtNodeParams->getCodePoints(),
unigramProperty->getProbability()));
if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
timestamp, &writingPos)) {
unigramProperty, &writingPos)) {
return false;
}
if (!mPtNodeWriter->markPtNodeAsMoved(originalPtNodeParams, movedPos, movedPos)) {
@ -155,31 +151,30 @@ bool DynamicPtUpdatingHelper::setPtNodeProbability(
}
bool DynamicPtUpdatingHelper::createChildrenPtNodeArrayAndAChildPtNode(
const PtNodeParams *const parentPtNodeParams, const bool isNotAWord,
const bool isBlacklisted, const int probability, const int timestamp,
const PtNodeParams *const parentPtNodeParams, const UnigramProperty *const unigramProperty,
const int *const codePoints, const int codePointCount) {
const int newPtNodeArrayPos = mBuffer->getTailPosition();
if (!mPtNodeWriter->updateChildrenPosition(parentPtNodeParams, newPtNodeArrayPos)) {
return false;
}
return createNewPtNodeArrayWithAChildPtNode(parentPtNodeParams->getHeadPos(), codePoints,
codePointCount, isNotAWord, isBlacklisted, probability, timestamp);
codePointCount, unigramProperty);
}
bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode(
const int parentPtNodePos, const int *const nodeCodePoints, const int nodeCodePointCount,
const bool isNotAWord, const bool isBlacklisted, const int probability,
const int timestamp) {
const UnigramProperty *const unigramProperty) {
int writingPos = mBuffer->getTailPosition();
if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer,
1 /* arraySize */, &writingPos)) {
return false;
}
const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
isNotAWord, isBlacklisted, true /* isTerminal */,
parentPtNodePos, nodeCodePointCount, nodeCodePoints, probability));
if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, timestamp,
&writingPos)) {
unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), true /* isTerminal */,
parentPtNodePos, nodeCodePointCount, nodeCodePoints,
unigramProperty->getProbability()));
if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
unigramProperty, &writingPos)) {
return false;
}
if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer,
@ -192,13 +187,13 @@ bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode(
// Returns whether the dictionary updating was succeeded or not.
bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes(
const PtNodeParams *const reallocatingPtNodeParams, const int overlappingCodePointCount,
const bool isNotAWord, const bool isBlacklisted, const int probabilityOfNewPtNode,
const int timestamp, const int *const newNodeCodePoints, const int newNodeCodePointCount) {
const UnigramProperty *const unigramProperty, const int *const newNodeCodePoints,
const int newNodeCodePointCount) {
// When addsExtraChild is true, split the reallocating PtNode and add new child.
// Reallocating PtNode: abcde, newNode: abcxy.
// abc (1st, not terminal) __ de (2nd)
// \_ xy (extra child, terminal)
// Otherwise, this method makes 1st part terminal and write probabilityOfNewPtNode.
// Otherwise, this method makes 1st part terminal and write information in unigramProperty.
// Reallocating PtNode: abcde, newNode: abc.
// abc (1st, terminal) __ de (2nd)
const bool addsExtraChild = newNodeCodePointCount > overlappingCodePointCount;
@ -216,11 +211,12 @@ bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes(
}
} else {
const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
isNotAWord, isBlacklisted, true /* isTerminal */,
reallocatingPtNodeParams->getParentPos(), overlappingCodePointCount,
reallocatingPtNodeParams->getCodePoints(), probabilityOfNewPtNode));
unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(),
true /* isTerminal */, reallocatingPtNodeParams->getParentPos(),
overlappingCodePointCount, reallocatingPtNodeParams->getCodePoints(),
unigramProperty->getProbability()));
if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
timestamp, &writingPos)) {
unigramProperty, &writingPos)) {
return false;
}
}
@ -244,11 +240,12 @@ bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes(
}
if (addsExtraChild) {
const PtNodeParams extraChildPtNodeParams(getPtNodeParamsForNewPtNode(
isNotAWord, isBlacklisted, true /* isTerminal */,
firstPartOfReallocatedPtNodePos, newNodeCodePointCount - overlappingCodePointCount,
newNodeCodePoints + overlappingCodePointCount, probabilityOfNewPtNode));
unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(),
true /* isTerminal */, firstPartOfReallocatedPtNodePos,
newNodeCodePointCount - overlappingCodePointCount,
newNodeCodePoints + overlappingCodePointCount, unigramProperty->getProbability()));
if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&extraChildPtNodeParams,
timestamp, &writingPos)) {
unigramProperty, &writingPos)) {
return false;
}
}
@ -269,8 +266,8 @@ bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes(
}
const PtNodeParams DynamicPtUpdatingHelper::getUpdatedPtNodeParams(
const PtNodeParams *const originalPtNodeParams, const bool isNotAWord,
const bool isBlacklisted, const bool isTerminal, const int parentPos,
const PtNodeParams *const originalPtNodeParams,
const bool isNotAWord, const bool isBlacklisted, const bool isTerminal, const int parentPos,
const int codePointCount, const int *const codePoints, const int probability) const {
const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
isBlacklisted, isNotAWord, isTerminal, originalPtNodeParams->hasShortcutTargets(),

View File

@ -26,6 +26,7 @@ class BufferWithExtendableBuffer;
class DynamicPtReadingHelper;
class PtNodeReader;
class PtNodeWriter;
class UnigramProperty;
class DynamicPtUpdatingHelper {
public:
@ -37,9 +38,8 @@ class DynamicPtUpdatingHelper {
// Add a word to the dictionary. If the word already exists, update the probability.
bool addUnigramWord(DynamicPtReadingHelper *const readingHelper,
const int *const wordCodePoints, const int codePointCount, const int probability,
const bool isNotAWord, const bool isBlacklisted, const int timestamp,
bool *const outAddedNewUnigram);
const int *const wordCodePoints, const int codePointCount,
const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram);
// Add a bigram relation from word0Pos to word1Pos.
bool addBigramWords(const int word0Pos, const int word1Pos, const int probability,
@ -62,25 +62,22 @@ class DynamicPtUpdatingHelper {
PtNodeWriter *const mPtNodeWriter;
bool createAndInsertNodeIntoPtNodeArray(const int parentPos, const int *const nodeCodePoints,
const int nodeCodePointCount, const bool isNotAWord, const bool isBlacklisted,
const int probability, const int timestamp, int *const forwardLinkFieldPos);
const int nodeCodePointCount, const UnigramProperty *const unigramProperty,
int *const forwardLinkFieldPos);
bool setPtNodeProbability(const PtNodeParams *const originalPtNodeParams, const bool isNotAWord,
const bool isBlacklisted, const int probability, const int timestamp,
bool *const outAddedNewUnigram);
bool setPtNodeProbability(const PtNodeParams *const originalPtNodeParams,
const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram);
bool createChildrenPtNodeArrayAndAChildPtNode(const PtNodeParams *const parentPtNodeParams,
const bool isNotAWord, const bool isBlacklisted, const int probability,
const int timestamp, const int *const codePoints, const int codePointCount);
const UnigramProperty *const unigramProperty, const int *const codePoints,
const int codePointCount);
bool createNewPtNodeArrayWithAChildPtNode(const int parentPos, const int *const nodeCodePoints,
const int nodeCodePointCount, const bool isNotAWord, const bool isBlacklisted,
const int probability, const int timestamp);
const int nodeCodePointCount, const UnigramProperty *const unigramProperty);
bool reallocatePtNodeAndAddNewPtNodes(
const PtNodeParams *const reallocatingPtNodeParams, const int overlappingCodePointCount,
const bool isNotAWord, const bool isBlacklisted, const int probabilityOfNewPtNode,
const int timestamp, const int *const newNodeCodePoints,
const UnigramProperty *const unigramProperty, const int *const newNodeCodePoints,
const int newNodeCodePointCount);
const PtNodeParams getUpdatedPtNodeParams(const PtNodeParams *const originalPtNodeParams,

View File

@ -24,6 +24,8 @@
namespace latinime {
class UnigramProperty;
// Interface class used to write PtNode information.
class PtNodeWriter {
public:
@ -51,8 +53,8 @@ class PtNodeWriter {
virtual bool markPtNodeAsWillBecomeNonTerminal(
const PtNodeParams *const toBeUpdatedPtNodeParams) = 0;
virtual bool updatePtNodeProbability(const PtNodeParams *const toBeUpdatedPtNodeParams,
const int probability, const int timestamp) = 0;
virtual bool updatePtNodeUnigramProperty(const PtNodeParams *const toBeUpdatedPtNodeParams,
const UnigramProperty *const unigramProperty) = 0;
virtual bool updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC(
const PtNodeParams *const toBeUpdatedPtNodeParams,
@ -65,7 +67,7 @@ class PtNodeWriter {
int *const ptNodeWritingPos) = 0;
virtual bool writeNewTerminalPtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams,
const int timestamp, int *const ptNodeWritingPos) = 0;
const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos) = 0;
virtual bool addNewBigramEntry(const PtNodeParams *const sourcePtNodeParams,
const PtNodeParams *const targetPtNodeParam, const int probability, const int timestamp,

View File

@ -16,6 +16,7 @@
#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h"
#include "suggest/core/dictionary/property/unigram_property.h"
#include "suggest/policyimpl/dictionary/bigram/ver4_bigram_list_policy.h"
#include "suggest/policyimpl/dictionary/header/header_policy.h"
#include "suggest/policyimpl/dictionary/shortcut/ver4_shortcut_list_policy.h"
@ -133,9 +134,11 @@ bool Ver4PatriciaTrieNodeWriter::markPtNodeAsWillBecomeNonTerminal(
&writingPos);
}
bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbability(
const PtNodeParams *const toBeUpdatedPtNodeParams, const int newProbability,
const int timestamp) {
bool Ver4PatriciaTrieNodeWriter::updatePtNodeUnigramProperty(
const PtNodeParams *const toBeUpdatedPtNodeParams,
const UnigramProperty *const unigramProperty) {
// Update probability and historical information.
// TODO: Update other information in the unigram property.
if (!toBeUpdatedPtNodeParams->isTerminal()) {
return false;
}
@ -143,7 +146,7 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbability(
mBuffers->getProbabilityDictContent()->getProbabilityEntry(
toBeUpdatedPtNodeParams->getTerminalId());
const ProbabilityEntry probabilityEntry = createUpdatedEntryFrom(&originalProbabilityEntry,
newProbability, timestamp);
unigramProperty);
return mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry(
toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry);
}
@ -204,7 +207,8 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition(
bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition(
const PtNodeParams *const ptNodeParams, const int timestamp, int *const ptNodeWritingPos) {
const PtNodeParams *const ptNodeParams, const UnigramProperty *const unigramProperty,
int *const ptNodeWritingPos) {
int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
if (!writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, &terminalId,
ptNodeWritingPos)) {
@ -213,7 +217,7 @@ bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition(
// Write probability.
ProbabilityEntry newProbabilityEntry;
const ProbabilityEntry probabilityEntryToWrite = createUpdatedEntryFrom(
&newProbabilityEntry, ptNodeParams->getProbability(), timestamp);
&newProbabilityEntry, unigramProperty);
return mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry(terminalId,
&probabilityEntryToWrite);
}
@ -379,18 +383,20 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition(
}
const ProbabilityEntry Ver4PatriciaTrieNodeWriter::createUpdatedEntryFrom(
const ProbabilityEntry *const originalProbabilityEntry, const int newProbability,
const int timestamp) const {
const ProbabilityEntry *const originalProbabilityEntry,
const UnigramProperty *const unigramProperty) const {
// TODO: Consolidate historical info and probability.
if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
const HistoricalInfo updatedHistoricalInfo =
ForgettingCurveUtils::createUpdatedHistoricalInfo(
originalProbabilityEntry->getHistoricalInfo(), newProbability, timestamp,
originalProbabilityEntry->getHistoricalInfo(),
unigramProperty->getProbability(), unigramProperty->getTimestamp(),
mHeaderPolicy);
return originalProbabilityEntry->createEntryWithUpdatedHistoricalInfo(
&updatedHistoricalInfo);
} else {
return originalProbabilityEntry->createEntryWithUpdatedProbability(newProbability);
return originalProbabilityEntry->createEntryWithUpdatedProbability(
unigramProperty->getProbability());
}
}

View File

@ -57,8 +57,8 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
virtual bool markPtNodeAsWillBecomeNonTerminal(
const PtNodeParams *const toBeUpdatedPtNodeParams);
virtual bool updatePtNodeProbability(const PtNodeParams *const toBeUpdatedPtNodeParams,
const int newProbability, const int timestamp);
virtual bool updatePtNodeUnigramProperty(const PtNodeParams *const toBeUpdatedPtNodeParams,
const UnigramProperty *const unigramProperty);
virtual bool updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC(
const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode);
@ -73,7 +73,7 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
int *const ptNodeWritingPos);
virtual bool writeNewTerminalPtNodeAndAdvancePosition(const PtNodeParams *const ptNodeParams,
const int timestamp, int *const ptNodeWritingPos);
const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos);
virtual bool addNewBigramEntry(const PtNodeParams *const sourcePtNodeParams,
const PtNodeParams *const targetPtNodeParam, const int probability, const int timestamp,
@ -102,11 +102,12 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
const PtNodeParams *const ptNodeParams, int *const outTerminalId,
int *const ptNodeWritingPos);
// Create updated probability entry using given probability and timestamp. In addition to the
// Create updated probability entry using given unigram property. In addition to the
// probability, this method updates historical information if needed.
// TODO: Update flags belonging to the unigram property.
const ProbabilityEntry createUpdatedEntryFrom(
const ProbabilityEntry *const originalProbabilityEntry, const int newProbability,
const int timestamp) const;
const ProbabilityEntry *const originalProbabilityEntry,
const UnigramProperty *const unigramProperty) const;
bool updatePtNodeFlags(const int ptNodePos, const bool isBlacklisted, const bool isNotAWord,
const bool isTerminal, const bool hasShortcutTargets, const bool hasBigrams,

View File

@ -179,9 +179,7 @@ bool Ver4PatriciaTriePolicy::addUnigramWord(const int *const word, const int len
readingHelper.initWithPtNodeArrayPos(getRootPosition());
bool addedNewUnigram = false;
if (mUpdatingHelper.addUnigramWord(&readingHelper, word, length,
unigramProperty->getProbability(), unigramProperty->isNotAWord(),
unigramProperty->isBlacklisted(), unigramProperty->getTimestamp(),
&addedNewUnigram)) {
unigramProperty, &addedNewUnigram)) {
if (addedNewUnigram) {
mUnigramCount++;
}