Quit reading unigram probability in Ver4PatriciaTrieNodeReader.

Bug: 14425059
Change-Id: I4fc7b0e236151a2c64e7131772264024c6597633
main
Keisuke Kuroyanagi 2014-09-25 11:41:50 +09:00
parent 2842e50c4b
commit cb4f544198
6 changed files with 20 additions and 38 deletions

View File

@ -63,9 +63,14 @@ const WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArr
int probability = NOT_A_PROBABILITY; int probability = NOT_A_PROBABILITY;
if (mHasHistoricalInfo) { if (mHasHistoricalInfo) {
const int rawProbability = ForgettingCurveUtils::decodeProbability( const int rawProbability = ForgettingCurveUtils::decodeProbability(
probabilityEntry.getHistoricalInfo(), headerPolicy) probabilityEntry.getHistoricalInfo(), headerPolicy);
+ ForgettingCurveUtils::getProbabilityBiasForNgram(i + 1 /* n */); if (rawProbability == NOT_A_PROBABILITY) {
probability = std::min(rawProbability, MAX_PROBABILITY); // The entry should not be treated as a valid entry.
continue;
}
probability = std::min(rawProbability
+ ForgettingCurveUtils::getProbabilityBiasForNgram(i + 1 /* n */),
MAX_PROBABILITY);
} else { } else {
probability = probabilityEntry.getProbability(); probability = probabilityEntry.getProbability();
} }

View File

@ -51,26 +51,17 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce
const int parentPos = const int parentPos =
DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos); DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos);
int codePoints[MAX_WORD_LENGTH]; int codePoints[MAX_WORD_LENGTH];
const int codePonitCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition( // Code point table is not used for ver4 dictionaries.
dictBuf, flags, MAX_WORD_LENGTH, mHeaderPolicy->getCodePointTable(), codePoints, &pos); const int codePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition(
dictBuf, flags, MAX_WORD_LENGTH, nullptr /* codePointTable */, codePoints, &pos);
int terminalIdFieldPos = NOT_A_DICT_POS; int terminalIdFieldPos = NOT_A_DICT_POS;
int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
int probability = NOT_A_PROBABILITY;
if (PatriciaTrieReadingUtils::isTerminal(flags)) { if (PatriciaTrieReadingUtils::isTerminal(flags)) {
terminalIdFieldPos = pos; terminalIdFieldPos = pos;
if (usesAdditionalBuffer) { if (usesAdditionalBuffer) {
terminalIdFieldPos += mBuffer->getOriginalBufferSize(); terminalIdFieldPos += mBuffer->getOriginalBufferSize();
} }
terminalId = Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition(dictBuf, &pos); terminalId = Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition(dictBuf, &pos);
// TODO: Quit reading probability here.
const ProbabilityEntry probabilityEntry =
mLanguageModelDictContent->getProbabilityEntry(terminalId);
if (probabilityEntry.hasHistoricalInfo()) {
probability = ForgettingCurveUtils::decodeProbability(
probabilityEntry.getHistoricalInfo(), mHeaderPolicy);
} else {
probability = probabilityEntry.getProbability();
}
} }
int childrenPosFieldPos = pos; int childrenPosFieldPos = pos;
if (usesAdditionalBuffer) { if (usesAdditionalBuffer) {
@ -91,8 +82,8 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce
// The destination position is stored at the same place as the parent position. // The destination position is stored at the same place as the parent position.
return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(parentPos, newSiblingNodePos); return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(parentPos, newSiblingNodePos);
} else { } else {
return PtNodeParams(headPos, flags, parentPos, codePonitCount, codePoints, return PtNodeParams(headPos, flags, parentPos, codePointCount, codePoints,
terminalIdFieldPos, terminalId, probability, childrenPosFieldPos, childrenPos, terminalIdFieldPos, terminalId, NOT_A_PROBABILITY, childrenPosFieldPos, childrenPos,
newSiblingNodePos); newSiblingNodePos);
} }
} }

View File

@ -29,15 +29,12 @@ class LanguageModelDictContent;
/* /*
* This class is used for helping to read nodes of ver4 patricia trie. This class handles moved * This class is used for helping to read nodes of ver4 patricia trie. This class handles moved
* node and reads node attributes including probability form language model. * node and reads node attributes.
*/ */
class Ver4PatriciaTrieNodeReader : public PtNodeReader { class Ver4PatriciaTrieNodeReader : public PtNodeReader {
public: public:
Ver4PatriciaTrieNodeReader(const BufferWithExtendableBuffer *const buffer, explicit Ver4PatriciaTrieNodeReader(const BufferWithExtendableBuffer *const buffer)
const LanguageModelDictContent *const languageModelDictContent, : mBuffer(buffer) {}
const HeaderPolicy *const headerPolicy)
: mBuffer(buffer), mLanguageModelDictContent(languageModelDictContent),
mHeaderPolicy(headerPolicy) {}
~Ver4PatriciaTrieNodeReader() {} ~Ver4PatriciaTrieNodeReader() {}
@ -50,8 +47,6 @@ class Ver4PatriciaTrieNodeReader : public PtNodeReader {
DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeReader); DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeReader);
const BufferWithExtendableBuffer *const mBuffer; const BufferWithExtendableBuffer *const mBuffer;
const LanguageModelDictContent *const mLanguageModelDictContent;
const HeaderPolicy *const mHeaderPolicy;
const PtNodeParams fetchPtNodeInfoFromBufferAndProcessMovedPtNode(const int ptNodePos, const PtNodeParams fetchPtNodeInfoFromBufferAndProcessMovedPtNode(const int ptNodePos,
const int siblingNodePos) const; const int siblingNodePos) const;

View File

@ -56,13 +56,7 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d
if (!ptNodeParams.isValid()) { if (!ptNodeParams.isValid()) {
break; break;
} }
bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted(); const bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted();
if (isTerminal && mHeaderPolicy->isDecayingDict()) {
// A DecayingDict may have a terminal PtNode that has a terminal DicNode whose
// probability is NOT_A_PROBABILITY. In such case, we don't want to treat it as a
// valid terminal DicNode.
isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY;
}
const int wordId = isTerminal ? ptNodeParams.getTerminalId() : NOT_A_WORD_ID; const int wordId = isTerminal ? ptNodeParams.getTerminalId() : NOT_A_WORD_ID;
childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getChildrenPos(), childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getChildrenPos(),
wordId, ptNodeParams.getCodePointArrayView()); wordId, ptNodeParams.getCodePointArrayView());

View File

@ -45,8 +45,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
mDictBuffer(mBuffers->getWritableTrieBuffer()), mDictBuffer(mBuffers->getWritableTrieBuffer()),
mShortcutPolicy(mBuffers->getMutableShortcutDictContent(), mShortcutPolicy(mBuffers->getMutableShortcutDictContent(),
mBuffers->getTerminalPositionLookupTable()), mBuffers->getTerminalPositionLookupTable()),
mNodeReader(mDictBuffer, mBuffers->getLanguageModelDictContent(), mHeaderPolicy), mNodeReader(mDictBuffer), mPtNodeArrayReader(mDictBuffer),
mPtNodeArrayReader(mDictBuffer),
mNodeWriter(mDictBuffer, mBuffers.get(), mHeaderPolicy, &mNodeReader, mNodeWriter(mDictBuffer, mBuffers.get(), mHeaderPolicy, &mNodeReader,
&mPtNodeArrayReader, &mShortcutPolicy), &mPtNodeArrayReader, &mShortcutPolicy),
mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter), mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter),

View File

@ -73,8 +73,7 @@ bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeAr
bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite, const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite,
int *const outUnigramCount, int *const outBigramCount) { int *const outUnigramCount, int *const outBigramCount) {
Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(), Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer());
mBuffers->getLanguageModelDictContent(), headerPolicy);
Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer()); Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer());
Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(), Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(),
mBuffers->getTerminalPositionLookupTable()); mBuffers->getTerminalPositionLookupTable());
@ -137,8 +136,7 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
} }
// Create policy instances for the GCed dictionary. // Create policy instances for the GCed dictionary.
Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(), Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer());
buffersToWrite->getLanguageModelDictContent(), headerPolicy);
Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer()); Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer());
Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(), Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(),
buffersToWrite->getTerminalPositionLookupTable()); buffersToWrite->getTerminalPositionLookupTable());