From e8750d970eed61b9239d8b2fa19648b8457696c1 Mon Sep 17 00:00:00 2001 From: Keisuke Kuroyanagi Date: Tue, 21 Oct 2014 15:46:14 +0900 Subject: [PATCH] Introduce EntryCounters to count entries in a dictionary. Bug: 14425059 Change-Id: Ic13ba827d96fa4a147485ba92fdb37e23e04e8e8 --- .../dictionary/header/header_policy.cpp | 20 +-- .../dictionary/header/header_policy.h | 34 +++-- .../v402/ver4_patricia_trie_policy.cpp | 18 +-- .../backward/v402/ver4_patricia_trie_policy.h | 8 +- .../ver4_patricia_trie_writing_helper.cpp | 11 +- .../v402/ver4_patricia_trie_writing_helper.h | 4 +- .../content/language_model_dict_content.cpp | 9 +- .../v4/content/language_model_dict_content.h | 4 +- .../v4/ver4_patricia_trie_policy.cpp | 25 ++-- .../structure/v4/ver4_patricia_trie_policy.h | 10 +- .../v4/ver4_patricia_trie_writing_helper.cpp | 12 +- .../v4/ver4_patricia_trie_writing_helper.h | 5 +- .../utils/dict_file_writing_utils.cpp | 4 +- .../dictionary/utils/entry_counters.h | 119 ++++++++++++++++++ .../utils/forgetting_curve_utils.cpp | 17 ++- .../dictionary/utils/forgetting_curve_utils.h | 19 ++- 16 files changed, 230 insertions(+), 89 deletions(-) create mode 100644 native/jni/src/suggest/policyimpl/dictionary/utils/entry_counters.h diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp index 8fb256c54..300e96c4e 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp @@ -30,6 +30,7 @@ const char *const HeaderPolicy::DATE_KEY = "date"; const char *const HeaderPolicy::LAST_DECAYED_TIME_KEY = "LAST_DECAYED_TIME"; const char *const HeaderPolicy::UNIGRAM_COUNT_KEY = "UNIGRAM_COUNT"; const char *const HeaderPolicy::BIGRAM_COUNT_KEY = "BIGRAM_COUNT"; +const char *const HeaderPolicy::TRIGRAM_COUNT_KEY = "TRIGRAM_COUNT"; const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE"; // Historical info is information that is needed to support decaying such as timestamp, level and // count. @@ -94,12 +95,11 @@ bool HeaderPolicy::readRequiresGermanUmlautProcessing() const { } bool HeaderPolicy::fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime, - const int unigramCount, const int bigramCount, - const int extendedRegionSize, BufferWithExtendableBuffer *const outBuffer) const { + const EntryCounts &entryCounts, const int extendedRegionSize, + BufferWithExtendableBuffer *const outBuffer) const { int writingPos = 0; DictionaryHeaderStructurePolicy::AttributeMap attributeMapToWrite(mAttributeMap); - fillInHeader(updatesLastDecayedTime, unigramCount, bigramCount, - extendedRegionSize, &attributeMapToWrite); + fillInHeader(updatesLastDecayedTime, entryCounts, extendedRegionSize, &attributeMapToWrite); if (!HeaderReadWriteUtils::writeDictionaryVersion(outBuffer, mDictFormatVersion, &writingPos)) { return false; @@ -126,11 +126,15 @@ bool HeaderPolicy::fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTim return true; } -void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime, const int unigramCount, - const int bigramCount, const int extendedRegionSize, +void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime, + const EntryCounts &entryCounts, const int extendedRegionSize, DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const { - HeaderReadWriteUtils::setIntAttribute(outAttributeMap, UNIGRAM_COUNT_KEY, unigramCount); - HeaderReadWriteUtils::setIntAttribute(outAttributeMap, BIGRAM_COUNT_KEY, bigramCount); + HeaderReadWriteUtils::setIntAttribute(outAttributeMap, UNIGRAM_COUNT_KEY, + entryCounts.getUnigramCount()); + HeaderReadWriteUtils::setIntAttribute(outAttributeMap, BIGRAM_COUNT_KEY, + entryCounts.getBigramCount()); + HeaderReadWriteUtils::setIntAttribute(outAttributeMap, TRIGRAM_COUNT_KEY, + entryCounts.getTrigramCount()); HeaderReadWriteUtils::setIntAttribute(outAttributeMap, EXTENDED_REGION_SIZE_KEY, extendedRegionSize); // Set the current time as the generation time. diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h index 836bbe5a1..44c2f443f 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h @@ -22,6 +22,7 @@ #include "defines.h" #include "suggest/core/policy/dictionary_header_structure_policy.h" #include "suggest/policyimpl/dictionary/header/header_read_write_utils.h" +#include "suggest/policyimpl/dictionary/utils/entry_counters.h" #include "suggest/policyimpl/dictionary/utils/format_utils.h" #include "utils/char_utils.h" #include "utils/time_keeper.h" @@ -49,6 +50,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { UNIGRAM_COUNT_KEY, 0 /* defaultValue */)), mBigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, BIGRAM_COUNT_KEY, 0 /* defaultValue */)), + mTrigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + TRIGRAM_COUNT_KEY, 0 /* defaultValue */)), mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)), mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( @@ -60,6 +63,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)), mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue( &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)), + mMaxTrigramCount(HeaderReadWriteUtils::readIntAttributeValue( + &mAttributeMap, MAX_TRIGRAM_COUNT_KEY, DEFAULT_MAX_TRIGRAM_COUNT)), mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {} // Constructs header information using an attribute map. @@ -77,7 +82,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), - mUnigramCount(0), mBigramCount(0), mExtendedRegionSize(0), + mUnigramCount(0), mBigramCount(0), mTrigramCount(0), mExtendedRegionSize(0), mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)), mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue( @@ -87,6 +92,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)), mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue( &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)), + mMaxTrigramCount(HeaderReadWriteUtils::readIntAttributeValue( + &mAttributeMap, MAX_TRIGRAM_COUNT_KEY, DEFAULT_MAX_TRIGRAM_COUNT)), mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {} // Copy header information @@ -99,12 +106,14 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { mIsDecayingDict(headerPolicy->mIsDecayingDict), mDate(headerPolicy->mDate), mLastDecayedTime(headerPolicy->mLastDecayedTime), mUnigramCount(headerPolicy->mUnigramCount), mBigramCount(headerPolicy->mBigramCount), + mTrigramCount(headerPolicy->mTrigramCount), mExtendedRegionSize(headerPolicy->mExtendedRegionSize), mHasHistoricalInfoOfWords(headerPolicy->mHasHistoricalInfoOfWords), mForgettingCurveProbabilityValuesTableId( headerPolicy->mForgettingCurveProbabilityValuesTableId), mMaxUnigramCount(headerPolicy->mMaxUnigramCount), mMaxBigramCount(headerPolicy->mMaxBigramCount), + mMaxTrigramCount(headerPolicy->mMaxTrigramCount), mCodePointTable(headerPolicy->mCodePointTable) {} // Temporary dummy header. @@ -112,10 +121,10 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { : mDictFormatVersion(FormatUtils::UNKNOWN_VERSION), mDictionaryFlags(0), mSize(0), mAttributeMap(), mLocale(CharUtils::EMPTY_STRING), mMultiWordCostMultiplier(0.0f), mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false), - mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0), + mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0), mTrigramCount(0), mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false), mForgettingCurveProbabilityValuesTableId(0), mMaxUnigramCount(0), mMaxBigramCount(0), - mCodePointTable(nullptr) {} + mMaxTrigramCount(0), mCodePointTable(nullptr) {} ~HeaderPolicy() {} @@ -183,6 +192,10 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { return mBigramCount; } + AK_FORCE_INLINE int getTrigramCount() const { + return mTrigramCount; + } + AK_FORCE_INLINE int getExtendedRegionSize() const { return mExtendedRegionSize; } @@ -212,15 +225,19 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { return mMaxBigramCount; } + AK_FORCE_INLINE int getMaxTrigramCount() const { + return mMaxTrigramCount; + } + void readHeaderValueOrQuestionMark(const char *const key, int *outValue, int outValueSize) const; bool fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime, - const int unigramCount, const int bigramCount, - const int extendedRegionSize, BufferWithExtendableBuffer *const outBuffer) const; + const EntryCounts &entryCounts, const int extendedRegionSize, + BufferWithExtendableBuffer *const outBuffer) const; - void fillInHeader(const bool updatesLastDecayedTime, - const int unigramCount, const int bigramCount, const int extendedRegionSize, + void fillInHeader(const bool updatesLastDecayedTime, const EntryCounts &entryCounts, + const int extendedRegionSize, DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const; AK_FORCE_INLINE const std::vector *getLocale() const { @@ -245,6 +262,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { static const char *const LAST_DECAYED_TIME_KEY; static const char *const UNIGRAM_COUNT_KEY; static const char *const BIGRAM_COUNT_KEY; + static const char *const TRIGRAM_COUNT_KEY; static const char *const EXTENDED_REGION_SIZE_KEY; static const char *const HAS_HISTORICAL_INFO_KEY; static const char *const LOCALE_KEY; @@ -273,11 +291,13 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { const int mLastDecayedTime; const int mUnigramCount; const int mBigramCount; + const int mTrigramCount; const int mExtendedRegionSize; const bool mHasHistoricalInfoOfWords; const int mForgettingCurveProbabilityValuesTableId; const int mMaxUnigramCount; const int mMaxBigramCount; + const int mMaxTrigramCount; const int *const mCodePointTable; const std::vector readLocale() const; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp index 0eae934ae..d0dccc3be 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp @@ -303,7 +303,7 @@ bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePo if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty, &addedNewUnigram)) { if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) { - mUnigramCount++; + mEntryCounters.incrementUnigramCount(); } if (unigramProperty->getShortcuts().size() > 0) { // Add shortcut target. @@ -397,7 +397,7 @@ bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramContext *const ngramContex if (mUpdatingHelper.addNgramEntry(PtNodePosArrayView::singleElementView(&prevWordPtNodePos), wordPos, ngramProperty, &addedNewBigram)) { if (addedNewBigram) { - mBigramCount++; + mEntryCounters.incrementBigramCount(); } return true; } else { @@ -438,7 +438,7 @@ bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramCon const int prevWordPtNodePos = getTerminalPtNodePosFromWordId(prevWordIds[0]); if (mUpdatingHelper.removeNgramEntry( PtNodePosArrayView::singleElementView(&prevWordPtNodePos), wordPos)) { - mBigramCount--; + mEntryCounters.decrementBigramCount(); return true; } else { return false; @@ -477,7 +477,7 @@ bool Ver4PatriciaTriePolicy::flush(const char *const filePath) { AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath); return false; } - if (!mWritingHelper.writeToDictFile(filePath, mUnigramCount, mBigramCount)) { + if (!mWritingHelper.writeToDictFile(filePath, mEntryCounters.getEntryCounts())) { AKLOGE("Cannot flush the dictionary to file."); mIsCorrupted = true; return false; @@ -515,7 +515,7 @@ bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { // Needs to reduce dictionary size. return true; } else if (mHeaderPolicy->isDecayingDict()) { - return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mUnigramCount, mBigramCount, + return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mEntryCounters.getEntryCounts(), mHeaderPolicy); } return false; @@ -525,19 +525,19 @@ void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int quer char *const outResult, const int maxResultLength) { const int compareLength = queryLength + 1 /* terminator */; if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) { - snprintf(outResult, maxResultLength, "%d", mUnigramCount); + snprintf(outResult, maxResultLength, "%d", mEntryCounters.getUnigramCount()); } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) { - snprintf(outResult, maxResultLength, "%d", mBigramCount); + snprintf(outResult, maxResultLength, "%d", mEntryCounters.getBigramCount()); } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) { snprintf(outResult, maxResultLength, "%d", mHeaderPolicy->isDecayingDict() ? - ForgettingCurveUtils::getUnigramCountHardLimit( + ForgettingCurveUtils::getEntryCountHardLimit( mHeaderPolicy->getMaxUnigramCount()) : static_cast(Ver4DictConstants::MAX_DICTIONARY_SIZE)); } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) { snprintf(outResult, maxResultLength, "%d", mHeaderPolicy->isDecayingDict() ? - ForgettingCurveUtils::getBigramCountHardLimit( + ForgettingCurveUtils::getEntryCountHardLimit( mHeaderPolicy->getMaxBigramCount()) : static_cast(Ver4DictConstants::MAX_DICTIONARY_SIZE)); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h index 1ad5e7e36..2cda0d3fa 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h @@ -41,6 +41,7 @@ #include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h" #include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h" #include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/entry_counters.h" #include "utils/int_array_view.h" namespace latinime { @@ -75,8 +76,8 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { &mPtNodeArrayReader, &mBigramPolicy, &mShortcutPolicy), mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter), mWritingHelper(mBuffers.get()), - mUnigramCount(mHeaderPolicy->getUnigramCount()), - mBigramCount(mHeaderPolicy->getBigramCount()), + mEntryCounters(mHeaderPolicy->getUnigramCount(), mHeaderPolicy->getBigramCount(), + mHeaderPolicy->getTrigramCount()), mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {}; virtual int getRootPosition() const { @@ -163,8 +164,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { Ver4PatriciaTrieNodeWriter mNodeWriter; DynamicPtUpdatingHelper mUpdatingHelper; Ver4PatriciaTrieWritingHelper mWritingHelper; - int mUnigramCount; - int mBigramCount; + MutableEntryCounters mEntryCounters; std::vector mTerminalPtNodePositionsForIteratingWords; mutable bool mIsCorrupted; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp index 2887dc6b1..5f440819a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp @@ -43,18 +43,18 @@ namespace backward { namespace v402 { bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath, - const int unigramCount, const int bigramCount) const { + const EntryCounts &entryCounts) const { const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy(); BufferWithExtendableBuffer headerBuffer( BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); const int extendedRegionSize = headerPolicy->getExtendedRegionSize() + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize(); if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */, - unigramCount, bigramCount, extendedRegionSize, &headerBuffer)) { + entryCounts, extendedRegionSize, &headerBuffer)) { AKLOGE("Cannot write header structure to buffer. " "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, " - "extendedRegionSize: %d", false, unigramCount, bigramCount, - extendedRegionSize); + "extendedRegionSize: %d", false, entryCounters.getUnigramCount(), + entryCounters.getBigramCount(), extendedRegionSize); return false; } return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); @@ -74,7 +74,8 @@ bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeAr BufferWithExtendableBuffer headerBuffer( BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */, - unigramCount, bigramCount, 0 /* extendedRegionSize */, &headerBuffer)) { + EntryCounts(unigramCount, bigramCount, 0 /* trigramCount */), + 0 /* extendedRegionSize */, &headerBuffer)) { return false; } return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h index 9034ee656..1aad33e38 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h @@ -27,6 +27,7 @@ #include "defines.h" #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h" #include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" +#include "suggest/policyimpl/dictionary/utils/entry_counters.h" namespace latinime { namespace backward { @@ -46,8 +47,7 @@ class Ver4PatriciaTrieWritingHelper { Ver4PatriciaTrieWritingHelper(Ver4DictBuffers *const buffers) : mBuffers(buffers) {} - bool writeToDictFile(const char *const dictDirPath, const int unigramCount, - const int bigramCount) const; + bool writeToDictFile(const char *const dictDirPath, const EntryCounts &entryCounts) const; // This method cannot be const because the original dictionary buffer will be updated to detect // useless PtNodes during GC. diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp index c4297f5d6..934c4f470 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp @@ -161,10 +161,7 @@ bool LanguageModelDictContent::truncateEntries(const int *const entryCounts, bool LanguageModelDictContent::updateAllEntriesOnInputWord(const WordIdArrayView prevWordIds, const int wordId, const bool isValid, const HistoricalInfo historicalInfo, - const HeaderPolicy *const headerPolicy, int *const outAddedNewNgramEntryCount) { - if (outAddedNewNgramEntryCount) { - *outAddedNewNgramEntryCount = 0; - } + const HeaderPolicy *const headerPolicy, MutableEntryCounters *const entryCountersToUpdate) { if (!mHasHistoricalInfo) { AKLOGE("updateAllEntriesOnInputWord is called for dictionary without historical info."); return false; @@ -188,8 +185,8 @@ bool LanguageModelDictContent::updateAllEntriesOnInputWord(const WordIdArrayView if (!setNgramProbabilityEntry(limitedPrevWordIds, wordId, &updatedNgramProbabilityEntry)) { return false; } - if (!originalNgramProbabilityEntry.isValid() && outAddedNewNgramEntryCount) { - *outAddedNewNgramEntryCount += 1; + if (!originalNgramProbabilityEntry.isValid()) { + entryCountersToUpdate->incrementNgramCount(i + 2); } } return true; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h index 51ef090e1..9a5f87741 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h @@ -25,6 +25,7 @@ #include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h" #include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" +#include "suggest/policyimpl/dictionary/utils/entry_counters.h" #include "suggest/policyimpl/dictionary/utils/trie_map.h" #include "utils/byte_array_view.h" #include "utils/int_array_view.h" @@ -169,7 +170,8 @@ class LanguageModelDictContent { bool updateAllEntriesOnInputWord(const WordIdArrayView prevWordIds, const int wordId, const bool isValid, const HistoricalInfo historicalInfo, - const HeaderPolicy *const headerPolicy, int *const outAddedNewNgramEntryCount); + const HeaderPolicy *const headerPolicy, + MutableEntryCounters *const entryCountersToUpdate); private: DISALLOW_COPY_AND_ASSIGN(LanguageModelDictContent); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp index ea8c0dc22..ead1bde50 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp @@ -211,7 +211,7 @@ bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePo if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty, &addedNewUnigram)) { if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) { - mUnigramCount++; + mEntryCounters.incrementUnigramCount(); } if (unigramProperty->getShortcuts().size() > 0) { // Add shortcut target. @@ -259,7 +259,7 @@ bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCod return false; } if (!ptNodeParams.representsNonWordInfo()) { - mUnigramCount--; + mEntryCounters.decrementUnigramCount(); } return true; } @@ -316,7 +316,7 @@ bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramContext *const ngramContex bool addedNewEntry = false; if (mNodeWriter.addNgramEntry(prevWordIds, wordId, ngramProperty, &addedNewEntry)) { if (addedNewEntry) { - mBigramCount++; + mEntryCounters.incrementNgramCount(prevWordIds.size() + 1); } return true; } else { @@ -354,7 +354,7 @@ bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramCon return false; } if (mNodeWriter.removeNgramEntry(prevWordIds, wordId)) { - mBigramCount--; + mEntryCounters.decrementNgramCount(prevWordIds.size()); return true; } else { return false; @@ -401,12 +401,10 @@ bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext( // Refresh word ids. ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); } - int addedNewNgramEntryCount = 0; if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord(prevWordIds, - wordId, updateAsAValidWord, historicalInfo, mHeaderPolicy, &addedNewNgramEntryCount)) { + wordId, updateAsAValidWord, historicalInfo, mHeaderPolicy, &mEntryCounters)) { return false; } - mBigramCount += addedNewNgramEntryCount; return true; } @@ -415,7 +413,7 @@ bool Ver4PatriciaTriePolicy::flush(const char *const filePath) { AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath); return false; } - if (!mWritingHelper.writeToDictFile(filePath, mUnigramCount, mBigramCount)) { + if (!mWritingHelper.writeToDictFile(filePath, mEntryCounters.getEntryCounts())) { AKLOGE("Cannot flush the dictionary to file."); mIsCorrupted = true; return false; @@ -453,8 +451,7 @@ bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { // Needs to reduce dictionary size. return true; } else if (mHeaderPolicy->isDecayingDict()) { - return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mUnigramCount, mBigramCount, - mHeaderPolicy); + return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mEntryCounters.getEntryCounts(), mHeaderPolicy); } return false; } @@ -463,19 +460,19 @@ void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int quer char *const outResult, const int maxResultLength) { const int compareLength = queryLength + 1 /* terminator */; if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) { - snprintf(outResult, maxResultLength, "%d", mUnigramCount); + snprintf(outResult, maxResultLength, "%d", mEntryCounters.getUnigramCount()); } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) { - snprintf(outResult, maxResultLength, "%d", mBigramCount); + snprintf(outResult, maxResultLength, "%d", mEntryCounters.getBigramCount()); } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) { snprintf(outResult, maxResultLength, "%d", mHeaderPolicy->isDecayingDict() ? - ForgettingCurveUtils::getUnigramCountHardLimit( + ForgettingCurveUtils::getEntryCountHardLimit( mHeaderPolicy->getMaxUnigramCount()) : static_cast(Ver4DictConstants::MAX_DICTIONARY_SIZE)); } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) { snprintf(outResult, maxResultLength, "%d", mHeaderPolicy->isDecayingDict() ? - ForgettingCurveUtils::getBigramCountHardLimit( + ForgettingCurveUtils::getEntryCountHardLimit( mHeaderPolicy->getMaxBigramCount()) : static_cast(Ver4DictConstants::MAX_DICTIONARY_SIZE)); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h index c0532815c..e3611cb32 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h @@ -30,6 +30,7 @@ #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h" #include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/entry_counters.h" #include "utils/int_array_view.h" namespace latinime { @@ -37,7 +38,6 @@ namespace latinime { class DicNode; class DicNodeVector; -// TODO: Support counting ngram entries. // Word id = Artificial id that is stored in the PtNode looked up by the word. class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { public: @@ -51,8 +51,8 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { &mShortcutPolicy), mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter), mWritingHelper(mBuffers.get()), - mUnigramCount(mHeaderPolicy->getUnigramCount()), - mBigramCount(mHeaderPolicy->getBigramCount()), + mEntryCounters(mHeaderPolicy->getUnigramCount(), mHeaderPolicy->getBigramCount(), + mHeaderPolicy->getTrigramCount()), mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {}; AK_FORCE_INLINE int getRootPosition() const { @@ -141,9 +141,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { Ver4PatriciaTrieNodeWriter mNodeWriter; DynamicPtUpdatingHelper mUpdatingHelper; Ver4PatriciaTrieWritingHelper mWritingHelper; - int mUnigramCount; - // TODO: Support counting ngram entries. - int mBigramCount; + MutableEntryCounters mEntryCounters; std::vector mTerminalPtNodePositionsForIteratingWords; mutable bool mIsCorrupted; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp index f0d59c150..e49d0308e 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp @@ -33,17 +33,18 @@ namespace latinime { bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath, - const int unigramCount, const int bigramCount) const { + const EntryCounts &entryCounts) const { const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy(); BufferWithExtendableBuffer headerBuffer( BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); const int extendedRegionSize = headerPolicy->getExtendedRegionSize() + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize(); if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */, - unigramCount, bigramCount, extendedRegionSize, &headerBuffer)) { + entryCounts, extendedRegionSize, &headerBuffer)) { AKLOGE("Cannot write header structure to buffer. " - "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, " - "extendedRegionSize: %d", false, unigramCount, bigramCount, + "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, trigramCount: %d," + "extendedRegionSize: %d", false, entryCounters.getUnigramCount(), + entryCounters.getBigramCount(), entryCounters.getTrigramCount(), extendedRegionSize); return false; } @@ -64,7 +65,8 @@ bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeAr BufferWithExtendableBuffer headerBuffer( BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */, - unigramCount, bigramCount, 0 /* extendedRegionSize */, &headerBuffer)) { + EntryCounts(unigramCount, bigramCount, 0 /* trigramCount */), + 0 /* extendedRegionSize */, &headerBuffer)) { return false; } return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h index 3569d0576..57a1f7bb1 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h @@ -20,6 +20,7 @@ #include "defines.h" #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h" #include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" +#include "suggest/policyimpl/dictionary/utils/entry_counters.h" namespace latinime { @@ -33,9 +34,7 @@ class Ver4PatriciaTrieWritingHelper { Ver4PatriciaTrieWritingHelper(Ver4DictBuffers *const buffers) : mBuffers(buffers) {} - // TODO: Support counting ngram entries. - bool writeToDictFile(const char *const dictDirPath, const int unigramCount, - const int bigramCount) const; + bool writeToDictFile(const char *const dictDirPath, const EntryCounts &entryCounts) const; // This method cannot be const because the original dictionary buffer will be updated to detect // useless PtNodes during GC. diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp index b7e2a7278..9d8e86675 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp @@ -27,6 +27,7 @@ #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" #include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/entry_counters.h" #include "suggest/policyimpl/dictionary/utils/file_utils.h" #include "suggest/policyimpl/dictionary/utils/format_utils.h" #include "utils/time_keeper.h" @@ -69,8 +70,7 @@ template DictBuffersPtr dictBuffers = DictBuffers::createVer4DictBuffers(&headerPolicy, DictConstants::MAX_DICT_EXTENDED_REGION_SIZE); headerPolicy.fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */, - 0 /* unigramCount */, 0 /* bigramCount */, - 0 /* extendedRegionSize */, dictBuffers->getWritableHeaderBuffer()); + EntryCounts(), 0 /* extendedRegionSize */, dictBuffers->getWritableHeaderBuffer()); if (!DynamicPtWritingUtils::writeEmptyDictionary( dictBuffers->getWritableTrieBuffer(), 0 /* rootPos */)) { AKLOGE("Empty ver4 dictionary structure cannot be created on memory."); diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/entry_counters.h b/native/jni/src/suggest/policyimpl/dictionary/utils/entry_counters.h new file mode 100644 index 000000000..b8fa5aa9e --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/entry_counters.h @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_ENTRY_COUNTERS_H +#define LATINIME_ENTRY_COUNTERS_H + +#include + +#include "defines.h" + +namespace latinime { + +// Copyable but immutable +class EntryCounts final { + public: + EntryCounts() : mEntryCounts({{0, 0, 0}}) {} + + EntryCounts(const int unigramCount, const int bigramCount, const int trigramCount) + : mEntryCounts({{unigramCount, bigramCount, trigramCount}}) {} + + explicit EntryCounts(const std::array &counters) + : mEntryCounts(counters) {} + + int getUnigramCount() const { + return mEntryCounts[0]; + } + + int getBigramCount() const { + return mEntryCounts[1]; + } + + int getTrigramCount() const { + return mEntryCounts[2]; + } + + private: + DISALLOW_ASSIGNMENT_OPERATOR(EntryCounts); + + const std::array mEntryCounts; +}; + +class MutableEntryCounters final { + public: + MutableEntryCounters() { + mEntryCounters.fill(0); + } + + MutableEntryCounters(const int unigramCount, const int bigramCount, const int trigramCount) + : mEntryCounters({{unigramCount, bigramCount, trigramCount}}) {} + + const EntryCounts getEntryCounts() const { + return EntryCounts(mEntryCounters); + } + + int getUnigramCount() const { + return mEntryCounters[0]; + } + + int getBigramCount() const { + return mEntryCounters[1]; + } + + int getTrigramCount() const { + return mEntryCounters[2]; + } + + void incrementUnigramCount() { + ++mEntryCounters[0]; + } + + void decrementUnigramCount() { + ASSERT(mEntryCounters[0] != 0); + --mEntryCounters[0]; + } + + void incrementBigramCount() { + ++mEntryCounters[1]; + } + + void decrementBigramCount() { + ASSERT(mEntryCounters[1] != 0); + --mEntryCounters[1]; + } + + void incrementNgramCount(const size_t n) { + if (n < 1 || n > mEntryCounters.size()) { + return; + } + ++mEntryCounters[n - 1]; + } + + void decrementNgramCount(const size_t n) { + if (n < 1 || n > mEntryCounters.size()) { + return; + } + ASSERT(mEntryCounters[n - 1] != 0); + --mEntryCounters[n - 1]; + } + + private: + DISALLOW_COPY_AND_ASSIGN(MutableEntryCounters); + + std::array mEntryCounters; +}; +} // namespace latinime +#endif /* LATINIME_ENTRY_COUNTERS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp index e5ef2abf8..9055f7bfc 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp @@ -38,8 +38,7 @@ const int ForgettingCurveUtils::OCCURRENCES_TO_RAISE_THE_LEVEL = 1; // 15 days const int ForgettingCurveUtils::DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS = 15 * 24 * 60 * 60; -const float ForgettingCurveUtils::UNIGRAM_COUNT_HARD_LIMIT_WEIGHT = 1.2; -const float ForgettingCurveUtils::BIGRAM_COUNT_HARD_LIMIT_WEIGHT = 1.2; +const float ForgettingCurveUtils::ENTRY_COUNT_HARD_LIMIT_WEIGHT = 1.2; const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable; @@ -126,14 +125,22 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT } /* static */ bool ForgettingCurveUtils::needsToDecay(const bool mindsBlockByDecay, - const int unigramCount, const int bigramCount, const HeaderPolicy *const headerPolicy) { - if (unigramCount >= getUnigramCountHardLimit(headerPolicy->getMaxUnigramCount())) { + const EntryCounts &entryCounts, const HeaderPolicy *const headerPolicy) { + if (entryCounts.getUnigramCount() + >= getEntryCountHardLimit(headerPolicy->getMaxUnigramCount())) { // Unigram count exceeds the limit. return true; - } else if (bigramCount >= getBigramCountHardLimit(headerPolicy->getMaxBigramCount())) { + } + if (entryCounts.getBigramCount() + >= getEntryCountHardLimit(headerPolicy->getMaxBigramCount())) { // Bigram count exceeds the limit. return true; } + if (entryCounts.getTrigramCount() + >= getEntryCountHardLimit(headerPolicy->getMaxTrigramCount())) { + // Trigram count exceeds the limit. + return true; + } if (mindsBlockByDecay) { return false; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h index ccbc4a98d..06dcae8a1 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h @@ -21,6 +21,7 @@ #include "defines.h" #include "suggest/core/dictionary/property/historical_info.h" +#include "suggest/policyimpl/dictionary/utils/entry_counters.h" namespace latinime { @@ -42,22 +43,17 @@ class ForgettingCurveUtils { static bool needsToKeep(const HistoricalInfo *const historicalInfo, const HeaderPolicy *const headerPolicy); - static bool needsToDecay(const bool mindsBlockByDecay, const int unigramCount, - const int bigramCount, const HeaderPolicy *const headerPolicy); + static bool needsToDecay(const bool mindsBlockByDecay, const EntryCounts &entryCounters, + const HeaderPolicy *const headerPolicy); // TODO: Improve probability computation method and remove this. static int getProbabilityBiasForNgram(const int n) { return (n - 1) * MULTIPLIER_TWO_IN_PROBABILITY_SCALE; } - AK_FORCE_INLINE static int getUnigramCountHardLimit(const int maxUnigramCount) { - return static_cast(static_cast(maxUnigramCount) - * UNIGRAM_COUNT_HARD_LIMIT_WEIGHT); - } - - AK_FORCE_INLINE static int getBigramCountHardLimit(const int maxBigramCount) { - return static_cast(static_cast(maxBigramCount) - * BIGRAM_COUNT_HARD_LIMIT_WEIGHT); + AK_FORCE_INLINE static int getEntryCountHardLimit(const int maxEntryCount) { + return static_cast(static_cast(maxEntryCount) + * ENTRY_COUNT_HARD_LIMIT_WEIGHT); } private: @@ -101,8 +97,7 @@ class ForgettingCurveUtils { static const int OCCURRENCES_TO_RAISE_THE_LEVEL; static const int DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS; - static const float UNIGRAM_COUNT_HARD_LIMIT_WEIGHT; - static const float BIGRAM_COUNT_HARD_LIMIT_WEIGHT; + static const float ENTRY_COUNT_HARD_LIMIT_WEIGHT; static const ProbabilityTable sProbabilityTable;