308 lines
15 KiB
C++
308 lines
15 KiB
C++
/*
|
|
* Copyright (C) 2013, The Android Open Source Project
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
/*
|
|
* !!!!! DO NOT EDIT THIS FILE !!!!!
|
|
*
|
|
* This file was generated from
|
|
* dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
|
|
*/
|
|
|
|
#include "dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h"
|
|
|
|
#include <cstring>
|
|
#include <queue>
|
|
|
|
#include "dictionary/header/header_policy.h"
|
|
#include "dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h"
|
|
#include "dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h"
|
|
#include "dictionary/structure/backward/v402/ver4_dict_buffers.h"
|
|
#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
|
|
#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
|
|
#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h"
|
|
#include "dictionary/structure/backward/v402/ver4_pt_node_array_reader.h"
|
|
#include "dictionary/utils/buffer_with_extendable_buffer.h"
|
|
#include "dictionary/utils/file_utils.h"
|
|
#include "dictionary/utils/forgetting_curve_utils.h"
|
|
|
|
namespace latinime {
|
|
namespace backward {
|
|
namespace v402 {
|
|
|
|
bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath,
|
|
const EntryCounts &entryCounts) const {
|
|
const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
|
|
BufferWithExtendableBuffer headerBuffer(
|
|
BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
|
|
const int extendedRegionSize = headerPolicy->getExtendedRegionSize()
|
|
+ mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize();
|
|
if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */,
|
|
entryCounts, extendedRegionSize, &headerBuffer)) {
|
|
AKLOGE("Cannot write header structure to buffer. "
|
|
"updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, "
|
|
"extendedRegionSize: %d", false, entryCounts.getNgramCount(NgramType::Unigram),
|
|
entryCounts.getNgramCount(NgramType::Bigram), extendedRegionSize);
|
|
return false;
|
|
}
|
|
return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
|
|
}
|
|
|
|
bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos,
|
|
const char *const dictDirPath) {
|
|
const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
|
|
Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers(
|
|
Ver4DictBuffers::createVer4DictBuffers(headerPolicy,
|
|
Ver4DictConstants::MAX_DICTIONARY_SIZE));
|
|
int unigramCount = 0;
|
|
int bigramCount = 0;
|
|
if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &unigramCount, &bigramCount)) {
|
|
return false;
|
|
}
|
|
BufferWithExtendableBuffer headerBuffer(
|
|
BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
|
|
MutableEntryCounters entryCounters;
|
|
entryCounters.setNgramCount(NgramType::Unigram, unigramCount);
|
|
entryCounters.setNgramCount(NgramType::Bigram, bigramCount);
|
|
if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */,
|
|
entryCounters.getEntryCounts(), 0 /* extendedRegionSize */, &headerBuffer)) {
|
|
return false;
|
|
}
|
|
return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
|
|
}
|
|
|
|
bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
|
|
const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite,
|
|
int *const outUnigramCount, int *const outBigramCount) {
|
|
Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(),
|
|
mBuffers->getProbabilityDictContent(), headerPolicy);
|
|
Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer());
|
|
Ver4BigramListPolicy bigramPolicy(mBuffers->getMutableBigramDictContent(),
|
|
mBuffers->getTerminalPositionLookupTable(), headerPolicy);
|
|
Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(),
|
|
mBuffers->getTerminalPositionLookupTable());
|
|
Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(),
|
|
mBuffers, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy,
|
|
&shortcutPolicy);
|
|
|
|
DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader);
|
|
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
|
DynamicPtGcEventListeners
|
|
::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
|
|
traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
|
|
&ptNodeWriter);
|
|
if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
|
|
&traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) {
|
|
return false;
|
|
}
|
|
const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
|
|
.getValidUnigramCount();
|
|
const int maxUnigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Unigram);
|
|
if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) {
|
|
if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) {
|
|
AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount,
|
|
maxUnigramCount);
|
|
return false;
|
|
}
|
|
}
|
|
|
|
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
|
DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability
|
|
traversePolicyToUpdateBigramProbability(&ptNodeWriter);
|
|
if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
|
|
&traversePolicyToUpdateBigramProbability)) {
|
|
return false;
|
|
}
|
|
const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount();
|
|
const int maxBigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Bigram);
|
|
if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) {
|
|
if (!truncateBigrams(maxBigramCount)) {
|
|
AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, maxBigramCount);
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Mapping from positions in mBuffer to positions in bufferToWrite.
|
|
PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap;
|
|
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
|
Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(),
|
|
buffersToWrite, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy,
|
|
&shortcutPolicy);
|
|
DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
|
|
traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers,
|
|
buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap);
|
|
if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
|
|
&traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) {
|
|
return false;
|
|
}
|
|
|
|
// Create policy instances for the GCed dictionary.
|
|
Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(),
|
|
buffersToWrite->getProbabilityDictContent(), headerPolicy);
|
|
Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer());
|
|
Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getMutableBigramDictContent(),
|
|
buffersToWrite->getTerminalPositionLookupTable(), headerPolicy);
|
|
Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(),
|
|
buffersToWrite->getTerminalPositionLookupTable());
|
|
Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(),
|
|
buffersToWrite, headerPolicy, &newPtNodeReader, &newPtNodeArrayreader, &newBigramPolicy,
|
|
&newShortcutPolicy);
|
|
// Re-assign terminal IDs for valid terminal PtNodes.
|
|
TerminalPositionLookupTable::TerminalIdMap terminalIdMap;
|
|
if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds(
|
|
&terminalIdMap)) {
|
|
return false;
|
|
}
|
|
// Run GC for probability dict content.
|
|
if (!buffersToWrite->getMutableProbabilityDictContent()->runGC(&terminalIdMap,
|
|
mBuffers->getProbabilityDictContent())) {
|
|
return false;
|
|
}
|
|
// Run GC for bigram dict content.
|
|
if(!buffersToWrite->getMutableBigramDictContent()->runGC(&terminalIdMap,
|
|
mBuffers->getBigramDictContent(), outBigramCount)) {
|
|
return false;
|
|
}
|
|
// Run GC for shortcut dict content.
|
|
if(!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap,
|
|
mBuffers->getShortcutDictContent())) {
|
|
return false;
|
|
}
|
|
DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader);
|
|
newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
|
DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields
|
|
traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap);
|
|
if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
|
|
&traversePolicyToUpdateAllPositionFields)) {
|
|
return false;
|
|
}
|
|
newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
|
TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
|
|
traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap);
|
|
if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
|
|
&traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) {
|
|
return false;
|
|
}
|
|
*outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount();
|
|
return true;
|
|
}
|
|
|
|
bool Ver4PatriciaTrieWritingHelper::truncateUnigrams(
|
|
const Ver4PatriciaTrieNodeReader *const ptNodeReader,
|
|
Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount) {
|
|
const TerminalPositionLookupTable *const terminalPosLookupTable =
|
|
mBuffers->getTerminalPositionLookupTable();
|
|
const int nextTerminalId = terminalPosLookupTable->getNextTerminalId();
|
|
std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator>
|
|
priorityQueue;
|
|
for (int i = 0; i < nextTerminalId; ++i) {
|
|
const int terminalPos = terminalPosLookupTable->getTerminalPtNodePosition(i);
|
|
if (terminalPos == NOT_A_DICT_POS) {
|
|
continue;
|
|
}
|
|
const ProbabilityEntry probabilityEntry =
|
|
mBuffers->getProbabilityDictContent()->getProbabilityEntry(i);
|
|
const int probability = probabilityEntry.hasHistoricalInfo() ?
|
|
ForgettingCurveUtils::decodeProbability(
|
|
probabilityEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
|
|
probabilityEntry.getProbability();
|
|
priorityQueue.push(DictProbability(terminalPos, probability,
|
|
probabilityEntry.getHistoricalInfo()->getTimestamp()));
|
|
}
|
|
|
|
// Delete unigrams.
|
|
while (static_cast<int>(priorityQueue.size()) > maxUnigramCount) {
|
|
const int ptNodePos = priorityQueue.top().getDictPos();
|
|
priorityQueue.pop();
|
|
const PtNodeParams ptNodeParams =
|
|
ptNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
|
|
if (ptNodeParams.representsNonWordInfo()) {
|
|
continue;
|
|
}
|
|
if (!ptNodeWriter->markPtNodeAsWillBecomeNonTerminal(&ptNodeParams)) {
|
|
AKLOGE("Cannot mark PtNode as willBecomeNonterminal. PtNode pos: %d", ptNodePos);
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool Ver4PatriciaTrieWritingHelper::truncateBigrams(const int maxBigramCount) {
|
|
const TerminalPositionLookupTable *const terminalPosLookupTable =
|
|
mBuffers->getTerminalPositionLookupTable();
|
|
const int nextTerminalId = terminalPosLookupTable->getNextTerminalId();
|
|
std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator>
|
|
priorityQueue;
|
|
BigramDictContent *const bigramDictContent = mBuffers->getMutableBigramDictContent();
|
|
for (int i = 0; i < nextTerminalId; ++i) {
|
|
const int bigramListPos = bigramDictContent->getBigramListHeadPos(i);
|
|
if (bigramListPos == NOT_A_DICT_POS) {
|
|
continue;
|
|
}
|
|
bool hasNext = true;
|
|
int readingPos = bigramListPos;
|
|
while (hasNext) {
|
|
const int entryPos = readingPos;
|
|
const BigramEntry bigramEntry =
|
|
bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
|
|
hasNext = bigramEntry.hasNext();
|
|
if (!bigramEntry.isValid()) {
|
|
continue;
|
|
}
|
|
const int probability = bigramEntry.hasHistoricalInfo() ?
|
|
ForgettingCurveUtils::decodeProbability(
|
|
bigramEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
|
|
bigramEntry.getProbability();
|
|
priorityQueue.push(DictProbability(entryPos, probability,
|
|
bigramEntry.getHistoricalInfo()->getTimestamp()));
|
|
}
|
|
}
|
|
|
|
// Delete bigrams.
|
|
while (static_cast<int>(priorityQueue.size()) > maxBigramCount) {
|
|
const int entryPos = priorityQueue.top().getDictPos();
|
|
const BigramEntry bigramEntry = bigramDictContent->getBigramEntry(entryPos);
|
|
const BigramEntry invalidatedBigramEntry = bigramEntry.getInvalidatedEntry();
|
|
if (!bigramDictContent->writeBigramEntry(&invalidatedBigramEntry, entryPos)) {
|
|
AKLOGE("Cannot write bigram entry to remove. pos: %d", entryPos);
|
|
return false;
|
|
}
|
|
priorityQueue.pop();
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
|
|
::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
|
|
if (!ptNodeParams->isTerminal()) {
|
|
return true;
|
|
}
|
|
TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
|
|
mTerminalIdMap->find(ptNodeParams->getTerminalId());
|
|
if (it == mTerminalIdMap->end()) {
|
|
AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd",
|
|
ptNodeParams->getTerminalId(), mTerminalIdMap->size());
|
|
return false;
|
|
}
|
|
if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) {
|
|
AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second);
|
|
}
|
|
return mPtNodeWriter->updatePtNodeHasBigramsAndShortcutTargetsFlags(ptNodeParams);
|
|
}
|
|
|
|
} // namespace v402
|
|
} // namespace backward
|
|
} // namespace latinime
|