From 31097a57cc6f8022abc0ea56f27147399f41b630 Mon Sep 17 00:00:00 2001 From: Keisuke Kuroyanagi Date: Fri, 27 Sep 2013 23:12:12 +0900 Subject: [PATCH] Add unigram/bigram counting. Bug: 6669677 Change-Id: I05ea2201d822dddf062b08c8467daa336760e16c --- .../inputmethod/latin/BinaryDictionary.java | 12 +++++ ...oid_inputmethod_latin_BinaryDictionary.cpp | 23 ++++++++ .../suggest/core/dictionary/dictionary.cpp | 5 ++ .../src/suggest/core/dictionary/dictionary.h | 3 ++ .../dictionary_structure_with_buffer_policy.h | 3 ++ .../bigram/dynamic_bigram_list_policy.cpp | 12 +++-- .../bigram/dynamic_bigram_list_policy.h | 9 ++-- ...namic_patricia_trie_gc_event_listeners.cpp | 10 +++- ...dynamic_patricia_trie_gc_event_listeners.h | 32 +++++++++-- .../dynamic_patricia_trie_policy.cpp | 45 ++++++++++++++-- .../dictionary/dynamic_patricia_trie_policy.h | 12 ++++- .../dynamic_patricia_trie_writing_helper.cpp | 43 ++++++++++----- .../dynamic_patricia_trie_writing_helper.h | 14 +++-- .../dictionary/header/header_policy.cpp | 54 ++++++------------- .../dictionary/header/header_policy.h | 47 ++++++++++++---- .../header/header_read_write_utils.cpp | 37 ++++++++----- .../header/header_read_write_utils.h | 14 +++-- .../dictionary/patricia_trie_policy.h | 8 +++ .../utils/dict_file_writing_utils.cpp | 3 +- .../latin/BinaryDictionaryTests.java | 54 +++++++++++++++++++ 20 files changed, 339 insertions(+), 101 deletions(-) diff --git a/java/src/com/android/inputmethod/latin/BinaryDictionary.java b/java/src/com/android/inputmethod/latin/BinaryDictionary.java index a463651d5..6ec7aeec3 100644 --- a/java/src/com/android/inputmethod/latin/BinaryDictionary.java +++ b/java/src/com/android/inputmethod/latin/BinaryDictionary.java @@ -48,6 +48,11 @@ public final class BinaryDictionary extends Dictionary { // TODO: Remove this heuristic. private static final int SPACE_COUNT_FOR_AUTO_COMMIT = 3; + @UsedForTesting + public static final String UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; + @UsedForTesting + public static final String BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; + private long mNativeDict; private final Locale mLocale; private final long mDictSize; @@ -129,6 +134,7 @@ public final class BinaryDictionary extends Dictionary { private static native void removeBigramWordsNative(long dict, int[] word0, int[] word1); private static native int calculateProbabilityNative(long dict, int unigramProbability, int bigramProbability); + private static native String getPropertyNative(long dict, String query); @UsedForTesting public static boolean createEmptyDictFile(final String filePath, final long dictVersion, @@ -331,6 +337,12 @@ public final class BinaryDictionary extends Dictionary { return calculateProbabilityNative(mNativeDict, unigramProbability, bigramProbability); } + @UsedForTesting + public String getPropertyForTests(String query) { + if (!isValidDictionary()) return ""; + return getPropertyNative(mNativeDict, query); + } + @Override public boolean shouldAutoCommit(final SuggestedWordInfo candidate) { // TODO: actually use the confidence rather than use this completely broken heuristic diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp index 7761ec4d5..85e100e33 100644 --- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp +++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp @@ -323,6 +323,24 @@ static int latinime_BinaryDictionary_calculateProbabilityNative(JNIEnv *env, jcl bigramProbability); } +static jstring latinime_BinaryDictionary_getProperty(JNIEnv *env, jclass clazz, jlong dict, + jstring query) { + Dictionary *dictionary = reinterpret_cast(dict); + if (!dictionary) { + return env->NewStringUTF(""); + } + const jsize queryUtf8Length = env->GetStringUTFLength(query); + char queryChars[queryUtf8Length + 1]; + env->GetStringUTFRegion(query, 0, env->GetStringLength(query), queryChars); + queryChars[queryUtf8Length] = '\0'; + static const int GET_PROPERTY_RESULT_LENGTH = 100; + char resultChars[GET_PROPERTY_RESULT_LENGTH]; + resultChars[0] = '\0'; + dictionary->getDictionaryStructurePolicy()->getProperty(queryChars, resultChars, + GET_PROPERTY_RESULT_LENGTH); + return env->NewStringUTF(resultChars); +} + static const JNINativeMethod sMethods[] = { { const_cast("createEmptyDictFileNative"), @@ -398,6 +416,11 @@ static const JNINativeMethod sMethods[] = { const_cast("calculateProbabilityNative"), const_cast("(JII)I"), reinterpret_cast(latinime_BinaryDictionary_calculateProbabilityNative) + }, + { + const_cast("getPropertyNative"), + const_cast("(JLjava/lang/String;)Ljava/lang/String;"), + reinterpret_cast(latinime_BinaryDictionary_getProperty) } }; diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp index ec1b63a12..214df1bbf 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp @@ -125,6 +125,11 @@ bool Dictionary::needsToRunGC() { return mDictionaryStructureWithBufferPolicy->needsToRunGC(); } +void Dictionary::getProperty(const char *const query, char *const outResult, + const int maxResultLength) const { + return mDictionaryStructureWithBufferPolicy->getProperty(query, outResult, maxResultLength); +} + void Dictionary::logDictionaryInfo(JNIEnv *const env) const { const int BUFFER_SIZE = 16; int dictionaryIdCodePointBuffer[BUFFER_SIZE]; diff --git a/native/jni/src/suggest/core/dictionary/dictionary.h b/native/jni/src/suggest/core/dictionary/dictionary.h index 974447468..800751745 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.h +++ b/native/jni/src/suggest/core/dictionary/dictionary.h @@ -83,6 +83,9 @@ class Dictionary { bool needsToRunGC(); + void getProperty(const char *const query, char *const outResult, + const int maxResultLength) const; + const DictionaryStructureWithBufferPolicy *getDictionaryStructurePolicy() const { return mDictionaryStructureWithBufferPolicy; } diff --git a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h index b95488ebd..2434287b1 100644 --- a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h +++ b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h @@ -80,6 +80,9 @@ class DictionaryStructureWithBufferPolicy { virtual bool needsToRunGC() const = 0; + virtual void getProperty(const char *const query, char *const outResult, + const int maxResultLength) const = 0; + protected: DictionaryStructureWithBufferPolicy() {} diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp index 29307b56a..e02f4cbf1 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp @@ -119,7 +119,7 @@ bool DynamicBigramListPolicy::copyAllBigrams(BufferWithExtendableBuffer *const b // Finding useless bigram entries and remove them. Bigram entry is useless when the target PtNode // has been deleted or is not a valid terminal. bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries( - int *const bigramListPos) { + int *const bigramListPos, int *const outValidBigramEntryCount) { const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos); if (usesAdditionalBuffer) { *bigramListPos -= mBuffer->getOriginalBufferSize(); @@ -161,6 +161,8 @@ bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries( NOT_A_DICT_POS /* targetOffset */, &bigramEntryPos)) { return false; } + } else { + (*outValidBigramEntryCount) += 1; } } while(BigramListReadWriteUtils::hasNext(bigramFlags)); return true; @@ -169,7 +171,7 @@ bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries( // Updates bigram target PtNode positions in the list after the placing step in GC. bool DynamicBigramListPolicy::updateAllBigramTargetPtNodePositions(int *const bigramListPos, const DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap *const - ptNodePositionRelocationMap) { + ptNodePositionRelocationMap, int *const outBigramEntryCount) { const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos); if (usesAdditionalBuffer) { *bigramListPos -= mBuffer->getOriginalBufferSize(); @@ -211,11 +213,12 @@ bool DynamicBigramListPolicy::updateAllBigramTargetPtNodePositions(int *const bi return false; } } while(BigramListReadWriteUtils::hasNext(bigramFlags)); + (*outBigramEntryCount) = bigramEntryCount; return true; } bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTargetPos, - const int probability, int *const bigramListPos) { + const int probability, int *const bigramListPos, bool *const outAddedNewBigram) { const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos); if (usesAdditionalBuffer) { *bigramListPos -= mBuffer->getOriginalBufferSize(); @@ -243,6 +246,7 @@ bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTarg } if (followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos) == bigramTargetPos) { // Update this bigram entry. + *outAddedNewBigram = false; const BigramListReadWriteUtils::BigramFlags updatedFlags = BigramListReadWriteUtils::setProbabilityInFlags(bigramFlags, probability); return BigramListReadWriteUtils::writeBigramEntry(mBuffer, updatedFlags, @@ -254,12 +258,14 @@ bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTarg // The current last entry is found. // First, update the flags of the last entry. if (!BigramListReadWriteUtils::setHasNextFlag(mBuffer, true /* hasNext */, entryPos)) { + *outAddedNewBigram = false; return false; } if (usesAdditionalBuffer) { *bigramListPos += mBuffer->getOriginalBufferSize(); } // Then, add a new entry after the last entry. + *outAddedNewBigram = true; return writeNewBigramEntry(bigramTargetPos, probability, bigramListPos); } while(BigramListReadWriteUtils::hasNext(bigramFlags)); // We return directly from the while loop. diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h index 8ea318a41..3ebf69946 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h @@ -50,19 +50,20 @@ class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy { bool copyAllBigrams(BufferWithExtendableBuffer *const bufferToWrite, int *const fromPos, int *const toPos, int *const outBigramsCount) const; - bool updateAllBigramEntriesAndDeleteUselessEntries(int *const bigramListPos); + bool updateAllBigramEntriesAndDeleteUselessEntries(int *const bigramListPos, + int *const outBigramEntryCount); bool updateAllBigramTargetPtNodePositions(int *const bigramListPos, const DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap *const - ptNodePositionRelocationMap); + ptNodePositionRelocationMap, int *const outValidBigramEntryCount); bool addNewBigramEntryToBigramList(const int bigramTargetPos, const int probability, - int *const bigramListPos); + int *const bigramListPos, bool *const outAddedNewBigram); bool writeNewBigramEntry(const int bigramTargetPos, const int probability, int *const writingPos); - // Return if targetBigramPos is found or not. + // Return whether or not targetBigramPos is found. bool removeBigram(const int bigramListPos, const int bigramTargetPos); private: diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp index c60e45819..5eb473300 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp @@ -42,6 +42,9 @@ bool DynamicPatriciaTrieGcEventListeners } } else { valueStack.back() += 1; + if (node->isTerminal()) { + mValidUnigramCount += 1; + } } return true; } @@ -137,10 +140,15 @@ bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateAllPositionField // Updates bigram target PtNode positions in the bigram list. int bigramsPos = node->getBigramsPos(); if (bigramsPos != NOT_A_DICT_POS) { + int bigramEntryCount; if (!mBigramPolicy->updateAllBigramTargetPtNodePositions(&bigramsPos, - &mDictPositionRelocationMap->mPtNodePositionRelocationMap)) { + &mDictPositionRelocationMap->mPtNodePositionRelocationMap, &bigramEntryCount)) { return false; } + mBigramCount += bigramEntryCount; + } + if (node->isTerminal()) { + mUnigramCount++; } return true; diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h index 4256f22fb..aa6e60959 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h @@ -41,7 +41,7 @@ class DynamicPatriciaTrieGcEventListeners { DynamicPatriciaTrieWritingHelper *const writingHelper, BufferWithExtendableBuffer *const buffer) : mWritingHelper(writingHelper), mBuffer(buffer), valueStack(), - mChildrenValue(0) {} + mChildrenValue(0), mValidUnigramCount(0) {} ~TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted() {}; @@ -64,6 +64,10 @@ class DynamicPatriciaTrieGcEventListeners { bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, const int *const nodeCodePoints); + int getValidUnigramCount() const { + return mValidUnigramCount; + } + private: DISALLOW_IMPLICIT_CONSTRUCTORS( TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted); @@ -72,6 +76,7 @@ class DynamicPatriciaTrieGcEventListeners { BufferWithExtendableBuffer *const mBuffer; std::vector valueStack; int mChildrenValue; + int mValidUnigramCount; }; // Updates all bigram entries that are held by valid PtNodes. This removes useless bigram @@ -80,7 +85,7 @@ class DynamicPatriciaTrieGcEventListeners { : public DynamicPatriciaTrieReadingHelper::TraversingEventListener { public: TraversePolicyToUpdateBigramProbability(DynamicBigramListPolicy *const bigramPolicy) - : mBigramPolicy(bigramPolicy) {} + : mBigramPolicy(bigramPolicy), mValidBigramEntryCount(0) {} bool onAscend() { return true; } @@ -93,18 +98,26 @@ class DynamicPatriciaTrieGcEventListeners { if (!node->isDeleted()) { int pos = node->getBigramsPos(); if (pos != NOT_A_DICT_POS) { - if (!mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries(&pos)) { + int bigramEntryCount = 0; + if (!mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries(&pos, + &bigramEntryCount)) { return false; } + mValidBigramEntryCount += bigramEntryCount; } } return true; } + int getValidBigramEntryCount() const { + return mValidBigramEntryCount; + } + private: DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateBigramProbability); DynamicBigramListPolicy *const mBigramPolicy; + int mValidBigramEntryCount; }; class TraversePolicyToPlaceAndWriteValidPtNodesToBuffer @@ -150,7 +163,8 @@ class DynamicPatriciaTrieGcEventListeners { dictPositionRelocationMap) : mWritingHelper(writingHelper), mBigramPolicy(bigramPolicy), mBufferToWrite(bufferToWrite), - mDictPositionRelocationMap(dictPositionRelocationMap) {}; + mDictPositionRelocationMap(dictPositionRelocationMap), mUnigramCount(0), + mBigramCount(0) {}; bool onAscend() { return true; } @@ -161,6 +175,14 @@ class DynamicPatriciaTrieGcEventListeners { bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, const int *const nodeCodePoints); + int getUnigramCount() const { + return mUnigramCount; + } + + int getBigramCount() const { + return mBigramCount; + } + private: DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllPositionFields); @@ -169,6 +191,8 @@ class DynamicPatriciaTrieGcEventListeners { BufferWithExtendableBuffer *const mBufferToWrite; const DynamicPatriciaTrieWritingHelper::DictPositionRelocationMap *const mDictPositionRelocationMap; + int mUnigramCount; + int mBigramCount; }; private: diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp index 42397c19e..4581ec093 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp @@ -16,6 +16,9 @@ #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h" +#include +#include + #include "defines.h" #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_vector.h" @@ -28,6 +31,9 @@ namespace latinime { +const char *const DynamicPatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; +const char *const DynamicPatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; + void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode, DicNodeVector *const childDicNodes) const { if (!dicNode->hasChildren()) { @@ -198,7 +204,16 @@ bool DynamicPatriciaTriePolicy::addUnigramWord(const int *const word, const int readingHelper.initWithPtNodeArrayPos(getRootPosition()); DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, &mBigramListPolicy, &mShortcutListPolicy); - return writingHelper.addUnigramWord(&readingHelper, word, length, probability); + bool addedNewUnigram = false; + if (writingHelper.addUnigramWord(&readingHelper, word, length, probability, + &addedNewUnigram)) { + if (addedNewUnigram) { + mUnigramCount++; + } + return true; + } else { + return false; + } } bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int length0, @@ -219,7 +234,15 @@ bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int } DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, &mBigramListPolicy, &mShortcutListPolicy); - return writingHelper.addBigramWords(word0Pos, word1Pos, probability); + bool addedNewBigram = false; + if (writingHelper.addBigramWords(word0Pos, word1Pos, probability, &addedNewBigram)) { + if (addedNewBigram) { + mBigramCount++; + } + return true; + } else { + return false; + } } bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const int length0, @@ -240,7 +263,12 @@ bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const } DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, &mBigramListPolicy, &mShortcutListPolicy); - return writingHelper.removeBigramWords(word0Pos, word1Pos); + if (writingHelper.removeBigramWords(word0Pos, word1Pos)) { + mBigramCount--; + return true; + } else { + return false; + } } void DynamicPatriciaTriePolicy::flush(const char *const filePath) { @@ -250,7 +278,7 @@ void DynamicPatriciaTriePolicy::flush(const char *const filePath) { } DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, &mBigramListPolicy, &mShortcutListPolicy); - writingHelper.writeToDictFile(filePath, &mHeaderPolicy); + writingHelper.writeToDictFile(filePath, &mHeaderPolicy, mUnigramCount, mBigramCount); } void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) { @@ -272,4 +300,13 @@ bool DynamicPatriciaTriePolicy::needsToRunGC() const { return mBufferWithExtendableBuffer.isNearSizeLimit(); } +void DynamicPatriciaTriePolicy::getProperty(const char *const query, char *const outResult, + const int maxResultLength) const { + if (strncmp(query, UNIGRAM_COUNT_QUERY, maxResultLength) == 0) { + snprintf(outResult, maxResultLength, "%d", mUnigramCount); + } else if (strncmp(query, BIGRAM_COUNT_QUERY, maxResultLength) == 0) { + snprintf(outResult, maxResultLength, "%d", mBigramCount); + } +} + } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h index 06d8095d8..7f9d4d924 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h @@ -37,7 +37,9 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { mBufferWithExtendableBuffer(mBuffer->getBuffer() + mHeaderPolicy.getSize(), mBuffer->getBufferSize() - mHeaderPolicy.getSize()), mShortcutListPolicy(&mBufferWithExtendableBuffer), - mBigramListPolicy(&mBufferWithExtendableBuffer, &mShortcutListPolicy) {} + mBigramListPolicy(&mBufferWithExtendableBuffer, &mShortcutListPolicy), + mUnigramCount(mHeaderPolicy.getUnigramCount()), + mBigramCount(mHeaderPolicy.getBigramCount()) {} ~DynamicPatriciaTriePolicy() { delete mBuffer; @@ -91,14 +93,22 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { bool needsToRunGC() const; + void getProperty(const char *const query, char *const outResult, + const int maxResultLength) const; + private: DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTriePolicy); + static const char*const UNIGRAM_COUNT_QUERY; + static const char*const BIGRAM_COUNT_QUERY; + const MmappedBuffer *const mBuffer; const HeaderPolicy mHeaderPolicy; BufferWithExtendableBuffer mBufferWithExtendableBuffer; DynamicShortcutListPolicy mShortcutListPolicy; DynamicBigramListPolicy mBigramListPolicy; + int mUnigramCount; + int mBigramCount; }; } // namespace latinime #endif // LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp index 578645cd5..bae5e8cad 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp @@ -36,7 +36,8 @@ const size_t DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE = 2 * 1024 * bool DynamicPatriciaTrieWritingHelper::addUnigramWord( DynamicPatriciaTrieReadingHelper *const readingHelper, - const int *const wordCodePoints, const int codePointCount, const int probability) { + const int *const wordCodePoints, const int codePointCount, const int probability, + bool *const outAddedNewUnigram) { int parentPos = NOT_A_DICT_POS; while (!readingHelper->isEnd()) { const int matchedCodePointCount = readingHelper->getPrevTotalCodePointCount(); @@ -54,6 +55,7 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord( const int nextIndex = matchedCodePointCount + j; if (nextIndex >= codePointCount || !readingHelper->isMatchedCodePoint(j, wordCodePoints[matchedCodePointCount + j])) { + *outAddedNewUnigram = true; return reallocatePtNodeAndAddNewPtNodes(nodeReader, readingHelper->getMergedNodeCodePoints(), j, probability, wordCodePoints + matchedCodePointCount, @@ -63,9 +65,10 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord( // All characters are matched. if (codePointCount == readingHelper->getTotalCodePointCount()) { return setPtNodeProbability(nodeReader, probability, - readingHelper->getMergedNodeCodePoints()); + readingHelper->getMergedNodeCodePoints(), outAddedNewUnigram); } if (!nodeReader->hasChildren()) { + *outAddedNewUnigram = true; return createChildrenPtNodeArrayAndAChildPtNode(nodeReader, probability, wordCodePoints + readingHelper->getTotalCodePointCount(), codePointCount - readingHelper->getTotalCodePointCount()); @@ -79,6 +82,7 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord( return false; } int pos = readingHelper->getPosOfLastForwardLinkField(); + *outAddedNewUnigram = true; return createAndInsertNodeIntoPtNodeArray(parentPos, wordCodePoints + readingHelper->getPrevTotalCodePointCount(), codePointCount - readingHelper->getPrevTotalCodePointCount(), @@ -86,7 +90,7 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord( } bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const int word1Pos, - const int probability) { + const int probability, bool *const outAddedNewBigram) { int mMergedNodeCodePoints[MAX_WORD_LENGTH]; DynamicPatriciaTrieNodeReader nodeReader(mBuffer, mBigramPolicy, mShortcutPolicy); nodeReader.fetchNodeInfoInBufferFromPtNodePosAndGetNodeCodePoints(word0Pos, MAX_WORD_LENGTH, @@ -107,9 +111,11 @@ bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const if (nodeReader.getBigramsPos() != NOT_A_DICT_POS) { // Insert a new bigram entry into the existing bigram list. int bigramListPos = nodeReader.getBigramsPos(); - return mBigramPolicy->addNewBigramEntryToBigramList(word1Pos, probability, &bigramListPos); + return mBigramPolicy->addNewBigramEntryToBigramList(word1Pos, probability, &bigramListPos, + outAddedNewBigram); } else { // The PtNode doesn't have a bigram list. + *outAddedNewBigram = true; // First, Write a bigram entry at the tail position of the PtNode. if (!mBigramPolicy->writeNewBigramEntry(word1Pos, probability, &writingPos)) { return false; @@ -138,9 +144,12 @@ bool DynamicPatriciaTrieWritingHelper::removeBigramWords(const int word0Pos, con } void DynamicPatriciaTrieWritingHelper::writeToDictFile(const char *const fileName, - const HeaderPolicy *const headerPolicy) { + const HeaderPolicy *const headerPolicy, const int unigramCount, const int bigramCount) { BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); - if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, false /* updatesLastUpdatedTime */)) { + const int extendedRegionSize = headerPolicy->getExtendedRegionSize() + + mBuffer->getTailPosition() - mBuffer->getOriginalBufferSize(); + if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, false /* updatesLastUpdatedTime */, + unigramCount, bigramCount, extendedRegionSize)) { return; } DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, mBuffer); @@ -148,13 +157,16 @@ void DynamicPatriciaTrieWritingHelper::writeToDictFile(const char *const fileNam void DynamicPatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos, const char *const fileName, const HeaderPolicy *const headerPolicy) { - BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); - if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */)) { - return; - } BufferWithExtendableBuffer newDictBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */, MAX_DICTIONARY_SIZE); - if (!runGC(rootPtNodeArrayPos, &newDictBuffer)) { + int unigramCount = 0; + int bigramCount = 0; + if (!runGC(rootPtNodeArrayPos, &newDictBuffer, &unigramCount, &bigramCount)) { + return; + } + BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); + if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */, + unigramCount, bigramCount, 0 /* extendedRegionSize */)) { return; } DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, &newDictBuffer); @@ -335,9 +347,10 @@ bool DynamicPatriciaTrieWritingHelper::createAndInsertNodeIntoPtNodeArray(const bool DynamicPatriciaTrieWritingHelper::setPtNodeProbability( const DynamicPatriciaTrieNodeReader *const originalPtNode, const int probability, - const int *const codePoints) { + const int *const codePoints, bool *const outAddedNewUnigram) { if (originalPtNode->isTerminal()) { // Overwrites the probability. + *outAddedNewUnigram = false; int probabilityFieldPos = originalPtNode->getProbabilityFieldPos(); if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition(mBuffer, probability, &probabilityFieldPos)) { @@ -345,6 +358,7 @@ bool DynamicPatriciaTrieWritingHelper::setPtNodeProbability( } } else { // Make the node terminal and write the probability. + *outAddedNewUnigram = true; int movedPos = mBuffer->getTailPosition(); if (!markNodeAsMovedAndSetPosition(originalPtNode, movedPos, movedPos)) { return false; @@ -460,7 +474,8 @@ bool DynamicPatriciaTrieWritingHelper::reallocatePtNodeAndAddNewPtNodes( } bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, - BufferWithExtendableBuffer *const bufferToWrite) { + BufferWithExtendableBuffer *const bufferToWrite, int *const outUnigramCount, + int *const outBigramCount) { DynamicPatriciaTrieReadingHelper readingHelper(mBuffer, mBigramPolicy, mShortcutPolicy); readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); DynamicPatriciaTrieGcEventListeners @@ -505,6 +520,8 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, &traversePolicyToUpdateAllPositionFields)) { return false; } + *outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount(); + *outBigramCount = traversePolicyToUpdateAllPositionFields.getBigramCount(); return true; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h index fe1b2437a..827b6097f 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h @@ -56,15 +56,18 @@ class DynamicPatriciaTrieWritingHelper { // Add a word to the dictionary. If the word already exists, update the probability. bool addUnigramWord(DynamicPatriciaTrieReadingHelper *const readingHelper, - const int *const wordCodePoints, const int codePointCount, const int probability); + const int *const wordCodePoints, const int codePointCount, const int probability, + bool *const outAddedNewUnigram); // Add a bigram relation from word0Pos to word1Pos. - bool addBigramWords(const int word0Pos, const int word1Pos, const int probability); + bool addBigramWords(const int word0Pos, const int word1Pos, const int probability, + bool *const outAddedNewBigram); // Remove a bigram relation from word0Pos to word1Pos. bool removeBigramWords(const int word0Pos, const int word1Pos); - void writeToDictFile(const char *const fileName, const HeaderPolicy *const headerPolicy); + void writeToDictFile(const char *const fileName, const HeaderPolicy *const headerPolicy, + const int unigramCount, const int bigramCount); void writeToDictFileWithGC(const int rootPtNodeArrayPos, const char *const fileName, const HeaderPolicy *const headerPolicy); @@ -107,7 +110,7 @@ class DynamicPatriciaTrieWritingHelper { const int nodeCodePointCount, const int probability, int *const forwardLinkFieldPos); bool setPtNodeProbability(const DynamicPatriciaTrieNodeReader *const originalNode, - const int probability, const int *const codePoints); + const int probability, const int *const codePoints, bool *const outAddedNewUnigram); bool createChildrenPtNodeArrayAndAChildPtNode( const DynamicPatriciaTrieNodeReader *const parentNode, const int probability, @@ -122,7 +125,8 @@ class DynamicPatriciaTrieWritingHelper { const int probabilityOfNewPtNode, const int *const newNodeCodePoints, const int newNodeCodePointCount); - bool runGC(const int rootPtNodeArrayPos, BufferWithExtendableBuffer *const bufferToWrite); + bool runGC(const int rootPtNodeArrayPos, BufferWithExtendableBuffer *const bufferToWrite, + int *const outUnigramCount, int *const outBigramCount); }; } // namespace latinime #endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp index 7bbeacaa0..78c6c042f 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp @@ -16,17 +16,15 @@ #include "suggest/policyimpl/dictionary/header/header_policy.h" -#include -#include -#include - namespace latinime { - // Note that these are corresponding definitions in Java side in FormatSpec.FileHeader. const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = "MULTIPLE_WORDS_DEMOTION_RATE"; const char *const HeaderPolicy::USES_FORGETTING_CURVE_KEY = "USES_FORGETTING_CURVE"; const char *const HeaderPolicy::LAST_UPDATED_TIME_KEY = "date"; +const char *const HeaderPolicy::UNIGRAM_COUNT_KEY = "UNIGRAM_COUNT"; +const char *const HeaderPolicy::BIGRAM_COUNT_KEY = "BIGRAM_COUNT"; +const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE"; const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100; const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f; @@ -55,33 +53,17 @@ void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *out } float HeaderPolicy::readMultipleWordCostMultiplier() const { - std::vector keyVector; - HeaderReadWriteUtils::insertCharactersIntoVector(MULTIPLE_WORDS_DEMOTION_RATE_KEY, &keyVector); const int demotionRate = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, - &keyVector, DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE); + MULTIPLE_WORDS_DEMOTION_RATE_KEY, DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE); if (demotionRate <= 0) { return static_cast(MAX_VALUE_FOR_WEIGHTING); } return MULTIPLE_WORD_COST_MULTIPLIER_SCALE / static_cast(demotionRate); } -bool HeaderPolicy::readUsesForgettingCurveFlag() const { - std::vector keyVector; - HeaderReadWriteUtils::insertCharactersIntoVector(USES_FORGETTING_CURVE_KEY, &keyVector); - return HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, &keyVector, - false /* defaultValue */); -} - -// Returns current time when the key is not found or the value is invalid. -int HeaderPolicy::readLastUpdatedTime() const { - std::vector keyVector; - HeaderReadWriteUtils::insertCharactersIntoVector(LAST_UPDATED_TIME_KEY, &keyVector); - return HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, &keyVector, - time(0) /* defaultValue */); -} - bool HeaderPolicy::writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite, - const bool updatesLastUpdatedTime) const { + const bool updatesLastUpdatedTime, const int unigramCount, const int bigramCount, + const int extendedRegionSize) const { int writingPos = 0; if (!HeaderReadWriteUtils::writeDictionaryVersion(bufferToWrite, mDictFormatVersion, &writingPos)) { @@ -97,21 +79,19 @@ bool HeaderPolicy::writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferT &writingPos)) { return false; } + HeaderReadWriteUtils::AttributeMap attributeMapTowrite(mAttributeMap); + HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, UNIGRAM_COUNT_KEY, unigramCount); + HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, BIGRAM_COUNT_KEY, bigramCount); + HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, EXTENDED_REGION_SIZE_KEY, + extendedRegionSize); if (updatesLastUpdatedTime) { // Set current time as a last updated time. - HeaderReadWriteUtils::AttributeMap attributeMapTowrite(mAttributeMap); - std::vector updatedTimekey; - HeaderReadWriteUtils::insertCharactersIntoVector(LAST_UPDATED_TIME_KEY, &updatedTimekey); - HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, &updatedTimekey, time(0)); - if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &attributeMapTowrite, - &writingPos)) { - return false; - } - } else { - if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &mAttributeMap, - &writingPos)) { - return false; - } + HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, LAST_UPDATED_TIME_KEY, + time(0)); + } + if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &attributeMapTowrite, + &writingPos)) { + return false; } // Writes an actual header size. if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(bufferToWrite, writingPos, diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h index e97c08ca4..93b9c6fcb 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h @@ -17,6 +17,7 @@ #ifndef LATINIME_HEADER_POLICY_H #define LATINIME_HEADER_POLICY_H +#include #include #include "defines.h" @@ -35,8 +36,16 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)), mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)), mMultiWordCostMultiplier(readMultipleWordCostMultiplier()), - mUsesForgettingCurve(readUsesForgettingCurveFlag()), - mLastUpdatedTime(readLastUpdatedTime()) {} + mUsesForgettingCurve(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap, + USES_FORGETTING_CURVE_KEY, false /* defaultValue */)), + mLastUpdatedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + LAST_UPDATED_TIME_KEY, time(0) /* defaultValue */)), + mUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + UNIGRAM_COUNT_KEY, 0 /* defaultValue */)), + mBigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + BIGRAM_COUNT_KEY, 0 /* defaultValue */)), + mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)) {} // Constructs header information using an attribute map. HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion, @@ -44,9 +53,12 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { : mDictFormatVersion(dictFormatVersion), mDictionaryFlags(HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap( attributeMap)), mSize(0), mAttributeMap(*attributeMap), - mMultiWordCostMultiplier(readUsesForgettingCurveFlag()), - mUsesForgettingCurve(readUsesForgettingCurveFlag()), - mLastUpdatedTime(readLastUpdatedTime()) {} + mMultiWordCostMultiplier(readMultipleWordCostMultiplier()), + mUsesForgettingCurve(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap, + USES_FORGETTING_CURVE_KEY, false /* defaultValue */)), + mLastUpdatedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + LAST_UPDATED_TIME_KEY, time(0) /* defaultValue */)), + mUnigramCount(0), mBigramCount(0), mExtendedRegionSize(0) {} ~HeaderPolicy() {} @@ -78,11 +90,24 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { return mLastUpdatedTime; } + AK_FORCE_INLINE int getUnigramCount() const { + return mUnigramCount; + } + + AK_FORCE_INLINE int getBigramCount() const { + return mBigramCount; + } + + AK_FORCE_INLINE int getExtendedRegionSize() const { + return mExtendedRegionSize; + } + void readHeaderValueOrQuestionMark(const char *const key, int *outValue, int outValueSize) const; bool writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite, - const bool updatesLastUpdatedTime) const; + const bool updatesLastUpdatedTime, const int unigramCount, + const int bigramCount, const int extendedRegionSize) const; private: DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderPolicy); @@ -90,6 +115,9 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY; static const char *const USES_FORGETTING_CURVE_KEY; static const char *const LAST_UPDATED_TIME_KEY; + static const char *const UNIGRAM_COUNT_KEY; + static const char *const BIGRAM_COUNT_KEY; + static const char *const EXTENDED_REGION_SIZE_KEY; static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE; static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE; @@ -100,13 +128,12 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { const float mMultiWordCostMultiplier; const bool mUsesForgettingCurve; const int mLastUpdatedTime; + const int mUnigramCount; + const int mBigramCount; + const int mExtendedRegionSize; float readMultipleWordCostMultiplier() const; - bool readUsesForgettingCurveFlag() const; - - int readLastUpdatedTime() const; - static HeaderReadWriteUtils::AttributeMap createAttributeMapAndReadAllAttributes( const uint8_t *const dictBuf); }; diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp index 3b1c78085..2694ce8d5 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp @@ -68,18 +68,12 @@ const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY /* static */ HeaderReadWriteUtils::DictionaryFlags HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap( const HeaderReadWriteUtils::AttributeMap *const attributeMap) { - AttributeMap::key_type key; - insertCharactersIntoVector(REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, &key); - const bool requiresGermanUmlautProcessing = readBoolAttributeValue(attributeMap, &key, - false /* defaultValue */); - key.clear(); - insertCharactersIntoVector(REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY, &key); - const bool requiresFrenchLigatureProcessing = readBoolAttributeValue(attributeMap, &key, - false /* defaultValue */); - key.clear(); - insertCharactersIntoVector(SUPPORTS_DYNAMIC_UPDATE_KEY, &key); - const bool supportsDynamicUpdate = readBoolAttributeValue(attributeMap, &key, - false /* defaultValue */); + const bool requiresGermanUmlautProcessing = readBoolAttributeValue(attributeMap, + REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, false /* defaultValue */); + const bool requiresFrenchLigatureProcessing = readBoolAttributeValue(attributeMap, + REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY, false /* defaultValue */); + const bool supportsDynamicUpdate = readBoolAttributeValue(attributeMap, + SUPPORTS_DYNAMIC_UPDATE_KEY, false /* defaultValue */); DictionaryFlags dictflags = NO_FLAGS; dictflags |= requiresGermanUmlautProcessing ? GERMAN_UMLAUT_PROCESSING_FLAG : 0; dictflags |= requiresFrenchLigatureProcessing ? FRENCH_LIGATURE_PROCESSING_FLAG : 0; @@ -160,11 +154,18 @@ const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY } /* static */ void HeaderReadWriteUtils::setBoolAttribute(AttributeMap *const headerAttributes, - const AttributeMap::key_type *const key, const bool value) { + const char *const key, const bool value) { setIntAttribute(headerAttributes, key, value ? 1 : 0); } /* static */ void HeaderReadWriteUtils::setIntAttribute(AttributeMap *const headerAttributes, + const char *const key, const int value) { + AttributeMap::key_type keyVector; + insertCharactersIntoVector(key, &keyVector); + setIntAttributeInner(headerAttributes, &keyVector, value); +} + +/* static */ void HeaderReadWriteUtils::setIntAttributeInner(AttributeMap *const headerAttributes, const AttributeMap::key_type *const key, const int value) { AttributeMap::mapped_type valueVector; char charBuf[LARGEST_INT_DIGIT_COUNT + 1]; @@ -174,7 +175,7 @@ const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY } /* static */ bool HeaderReadWriteUtils::readBoolAttributeValue( - const AttributeMap *const headerAttributes, const AttributeMap::key_type *const key, + const AttributeMap *const headerAttributes, const char *const key, const bool defaultValue) { const int intDefaultValue = defaultValue ? 1 : 0; const int intValue = readIntAttributeValue(headerAttributes, key, intDefaultValue); @@ -182,6 +183,14 @@ const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY } /* static */ int HeaderReadWriteUtils::readIntAttributeValue( + const AttributeMap *const headerAttributes, const char *const key, + const int defaultValue) { + AttributeMap::key_type keyVector; + insertCharactersIntoVector(key, &keyVector); + return readIntAttributeValueInner(headerAttributes, &keyVector, defaultValue); +} + +/* static */ int HeaderReadWriteUtils::readIntAttributeValueInner( const AttributeMap *const headerAttributes, const AttributeMap::key_type *const key, const int defaultValue) { AttributeMap::const_iterator it = headerAttributes->find(*key); diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h index caa5097f6..225968323 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h @@ -76,16 +76,16 @@ class HeaderReadWriteUtils { * Methods for header attributes. */ static void setBoolAttribute(AttributeMap *const headerAttributes, - const AttributeMap::key_type *const key, const bool value); + const char *const key, const bool value); static void setIntAttribute(AttributeMap *const headerAttributes, - const AttributeMap::key_type *const key, const int value); + const char *const key, const int value); static bool readBoolAttributeValue(const AttributeMap *const headerAttributes, - const AttributeMap::key_type *const key, const bool defaultValue); + const char *const key, const bool defaultValue); static int readIntAttributeValue(const AttributeMap *const headerAttributes, - const AttributeMap::key_type *const key, const int defaultValue); + const char *const key, const int defaultValue); static void insertCharactersIntoVector(const char *const characters, AttributeMap::key_type *const key); @@ -112,6 +112,12 @@ class HeaderReadWriteUtils { static const char *const SUPPORTS_DYNAMIC_UPDATE_KEY; static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY; static const char *const REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY; + + static void setIntAttributeInner(AttributeMap *const headerAttributes, + const AttributeMap::key_type *const key, const int value); + + static int readIntAttributeValueInner(const AttributeMap *const headerAttributes, + const AttributeMap::key_type *const key, const int defaultValue); }; } #endif /* LATINIME_HEADER_READ_WRITE_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h index f1de914cb..4277ff5d7 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h @@ -113,6 +113,14 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { return false; } + void getProperty(const char *const query, char *const outResult, + const int maxResultLength) const { + // getProperty is not supported for this class. + if (maxResultLength > 0) { + outResult[0] = '\0'; + } + } + private: DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTriePolicy); diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp index 2e4ec2e1d..4fae91936 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp @@ -43,7 +43,8 @@ const char *const DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE = const HeaderReadWriteUtils::AttributeMap *const attributeMap) { BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); HeaderPolicy headerPolicy(FormatUtils::VERSION_3, attributeMap); - headerPolicy.writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */); + headerPolicy.writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */, + 0 /* unigramCount */, 0 /* bigramCount */, 0 /* extendedRegionSize */); BufferWithExtendableBuffer bodyBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); if (!DynamicPatriciaTrieWritingUtils::writeEmptyDictionary(&bodyBuffer, 0 /* rootPos */)) { return false; diff --git a/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java b/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java index 7ed3ee180..826c0f7b2 100644 --- a/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java +++ b/tests/src/com/android/inputmethod/latin/BinaryDictionaryTests.java @@ -27,6 +27,7 @@ import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.Locale; import java.util.Map; import java.util.Random; @@ -625,4 +626,57 @@ public class BinaryDictionaryTests extends AndroidTestCase { dictFile.delete(); } + + public void testUnigramAndBigramCount() { + final int flashWithGCIterationCount = 10; + final int codePointSetSize = 50; + final int unigramCountPerIteration = 1000; + final int bigramCountPerIteration = 2000; + final int seed = 1123581321; + + final Random random = new Random(seed); + + File dictFile = null; + try { + dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary"); + } catch (IOException e) { + fail("IOException while writing an initial dictionary : " + e); + } + + final ArrayList words = new ArrayList(); + final HashSet> bigrams = new HashSet>(); + final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random); + + BinaryDictionary binaryDictionary; + for (int i = 0; i < flashWithGCIterationCount; i++) { + binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(), + 0 /* offset */, dictFile.length(), true /* useFullEditDistance */, + Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */); + for (int j = 0; j < unigramCountPerIteration; j++) { + final String word = CodePointUtils.generateWord(random, codePointSet); + words.add(word); + final int unigramProbability = random.nextInt(0xFF); + binaryDictionary.addUnigramWord(word, unigramProbability); + } + for (int j = 0; j < bigramCountPerIteration; j++) { + final String word0 = words.get(random.nextInt(words.size())); + final String word1 = words.get(random.nextInt(words.size())); + bigrams.add(new Pair(word0, word1)); + final int bigramProbability = random.nextInt(0xF); + binaryDictionary.addBigramWords(word0, word1, bigramProbability); + } + assertEquals(new HashSet(words).size(), Integer.parseInt( + binaryDictionary.getPropertyForTests(BinaryDictionary.UNIGRAM_COUNT_QUERY))); + assertEquals(new HashSet>(bigrams).size(), Integer.parseInt( + binaryDictionary.getPropertyForTests(BinaryDictionary.BIGRAM_COUNT_QUERY))); + binaryDictionary.flushWithGC(); + assertEquals(new HashSet(words).size(), Integer.parseInt( + binaryDictionary.getPropertyForTests(BinaryDictionary.UNIGRAM_COUNT_QUERY))); + assertEquals(new HashSet>(bigrams).size(), Integer.parseInt( + binaryDictionary.getPropertyForTests(BinaryDictionary.BIGRAM_COUNT_QUERY))); + binaryDictionary.close(); + } + + dictFile.delete(); + } }