Add unigram/bigram counting.

Bug: 6669677
Change-Id: I05ea2201d822dddf062b08c8467daa336760e16c
main
Keisuke Kuroyanagi 2013-09-27 23:12:12 +09:00
parent 5ef6209656
commit 31097a57cc
20 changed files with 339 additions and 101 deletions

View File

@ -48,6 +48,11 @@ public final class BinaryDictionary extends Dictionary {
// TODO: Remove this heuristic. // TODO: Remove this heuristic.
private static final int SPACE_COUNT_FOR_AUTO_COMMIT = 3; private static final int SPACE_COUNT_FOR_AUTO_COMMIT = 3;
@UsedForTesting
public static final String UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
@UsedForTesting
public static final String BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
private long mNativeDict; private long mNativeDict;
private final Locale mLocale; private final Locale mLocale;
private final long mDictSize; private final long mDictSize;
@ -129,6 +134,7 @@ public final class BinaryDictionary extends Dictionary {
private static native void removeBigramWordsNative(long dict, int[] word0, int[] word1); private static native void removeBigramWordsNative(long dict, int[] word0, int[] word1);
private static native int calculateProbabilityNative(long dict, int unigramProbability, private static native int calculateProbabilityNative(long dict, int unigramProbability,
int bigramProbability); int bigramProbability);
private static native String getPropertyNative(long dict, String query);
@UsedForTesting @UsedForTesting
public static boolean createEmptyDictFile(final String filePath, final long dictVersion, public static boolean createEmptyDictFile(final String filePath, final long dictVersion,
@ -331,6 +337,12 @@ public final class BinaryDictionary extends Dictionary {
return calculateProbabilityNative(mNativeDict, unigramProbability, bigramProbability); return calculateProbabilityNative(mNativeDict, unigramProbability, bigramProbability);
} }
/**
 * Test-only accessor that forwards a property query (for example
 * {@link #UNIGRAM_COUNT_QUERY} or {@link #BIGRAM_COUNT_QUERY}) to the native dictionary.
 *
 * @param query the name of the property to look up.
 * @return the string produced by the native layer, or an empty string when this dictionary
 *         is not valid.
 */
@UsedForTesting
public String getPropertyForTests(String query) {
    return isValidDictionary() ? getPropertyNative(mNativeDict, query) : "";
}
@Override @Override
public boolean shouldAutoCommit(final SuggestedWordInfo candidate) { public boolean shouldAutoCommit(final SuggestedWordInfo candidate) {
// TODO: actually use the confidence rather than use this completely broken heuristic // TODO: actually use the confidence rather than use this completely broken heuristic

View File

@ -323,6 +323,24 @@ static int latinime_BinaryDictionary_calculateProbabilityNative(JNIEnv *env, jcl
bigramProbability); bigramProbability);
} }
// JNI entry point registered as "getPropertyNative" with signature
// (JLjava/lang/String;)Ljava/lang/String;.
//
// Converts `query` to a NUL-terminated modified-UTF-8 C string, asks the dictionary structure
// policy to answer it, and returns the answer as a new Java string.
//
// Returns an empty Java string when `dict` is 0 or `query` is null, and also when the policy
// leaves the result buffer untouched (the buffer starts out as an empty string).
static jstring latinime_BinaryDictionary_getProperty(JNIEnv *env, jclass clazz, jlong dict,
        jstring query) {
    Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
    // The Java wrapper forwards its argument unchecked, so a null query can reach this point.
    // The JNI string accessors below must not be called with a null reference; answer it the
    // same way as a missing dictionary.
    if (!dictionary || !query) {
        return env->NewStringUTF("");
    }
    // The destination is sized by the modified-UTF-8 byte length, while GetStringUTFRegion
    // takes the number of UTF-16 units to convert, hence the two different length calls.
    const jsize queryUtf8Length = env->GetStringUTFLength(query);
    char queryChars[queryUtf8Length + 1];
    env->GetStringUTFRegion(query, 0, env->GetStringLength(query), queryChars);
    queryChars[queryUtf8Length] = '\0';
    static const int GET_PROPERTY_RESULT_LENGTH = 100;
    char resultChars[GET_PROPERTY_RESULT_LENGTH];
    // Start with an empty answer so that an unrecognized query yields "".
    resultChars[0] = '\0';
    dictionary->getDictionaryStructurePolicy()->getProperty(queryChars, resultChars,
            GET_PROPERTY_RESULT_LENGTH);
    return env->NewStringUTF(resultChars);
}
static const JNINativeMethod sMethods[] = { static const JNINativeMethod sMethods[] = {
{ {
const_cast<char *>("createEmptyDictFileNative"), const_cast<char *>("createEmptyDictFileNative"),
@ -398,6 +416,11 @@ static const JNINativeMethod sMethods[] = {
const_cast<char *>("calculateProbabilityNative"), const_cast<char *>("calculateProbabilityNative"),
const_cast<char *>("(JII)I"), const_cast<char *>("(JII)I"),
reinterpret_cast<void *>(latinime_BinaryDictionary_calculateProbabilityNative) reinterpret_cast<void *>(latinime_BinaryDictionary_calculateProbabilityNative)
},
{
const_cast<char *>("getPropertyNative"),
const_cast<char *>("(JLjava/lang/String;)Ljava/lang/String;"),
reinterpret_cast<void *>(latinime_BinaryDictionary_getProperty)
} }
}; };

View File

@ -125,6 +125,11 @@ bool Dictionary::needsToRunGC() {
return mDictionaryStructureWithBufferPolicy->needsToRunGC(); return mDictionaryStructureWithBufferPolicy->needsToRunGC();
} }
// Hands a property query over to the dictionary structure policy, which writes its answer
// into outResult (at most maxResultLength bytes are passed along as the limit).
void Dictionary::getProperty(const char *const query, char *const outResult,
        const int maxResultLength) const {
    mDictionaryStructureWithBufferPolicy->getProperty(query, outResult, maxResultLength);
}
void Dictionary::logDictionaryInfo(JNIEnv *const env) const { void Dictionary::logDictionaryInfo(JNIEnv *const env) const {
const int BUFFER_SIZE = 16; const int BUFFER_SIZE = 16;
int dictionaryIdCodePointBuffer[BUFFER_SIZE]; int dictionaryIdCodePointBuffer[BUFFER_SIZE];

View File

@ -83,6 +83,9 @@ class Dictionary {
bool needsToRunGC(); bool needsToRunGC();
void getProperty(const char *const query, char *const outResult,
const int maxResultLength) const;
const DictionaryStructureWithBufferPolicy *getDictionaryStructurePolicy() const { const DictionaryStructureWithBufferPolicy *getDictionaryStructurePolicy() const {
return mDictionaryStructureWithBufferPolicy; return mDictionaryStructureWithBufferPolicy;
} }

View File

@ -80,6 +80,9 @@ class DictionaryStructureWithBufferPolicy {
virtual bool needsToRunGC() const = 0; virtual bool needsToRunGC() const = 0;
// Answers a textual property query about the dictionary.
//   query           - NUL-terminated name of the requested property.
//   outResult       - caller-owned buffer that receives the answer as a C string.
//   maxResultLength - capacity of outResult in bytes.
// NOTE(review): presumably an implementation leaves outResult untouched for a query it does
// not recognize, so callers should pre-initialize the buffer; confirm for each implementation.
virtual void getProperty(const char *const query, char *const outResult,
const int maxResultLength) const = 0;
protected: protected:
DictionaryStructureWithBufferPolicy() {} DictionaryStructureWithBufferPolicy() {}

View File

@ -119,7 +119,7 @@ bool DynamicBigramListPolicy::copyAllBigrams(BufferWithExtendableBuffer *const b
// Finding useless bigram entries and remove them. Bigram entry is useless when the target PtNode // Finding useless bigram entries and remove them. Bigram entry is useless when the target PtNode
// has been deleted or is not a valid terminal. // has been deleted or is not a valid terminal.
bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries( bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(
int *const bigramListPos) { int *const bigramListPos, int *const outValidBigramEntryCount) {
const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos); const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos);
if (usesAdditionalBuffer) { if (usesAdditionalBuffer) {
*bigramListPos -= mBuffer->getOriginalBufferSize(); *bigramListPos -= mBuffer->getOriginalBufferSize();
@ -161,6 +161,8 @@ bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(
NOT_A_DICT_POS /* targetOffset */, &bigramEntryPos)) { NOT_A_DICT_POS /* targetOffset */, &bigramEntryPos)) {
return false; return false;
} }
} else {
(*outValidBigramEntryCount) += 1;
} }
} while(BigramListReadWriteUtils::hasNext(bigramFlags)); } while(BigramListReadWriteUtils::hasNext(bigramFlags));
return true; return true;
@ -169,7 +171,7 @@ bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(
// Updates bigram target PtNode positions in the list after the placing step in GC. // Updates bigram target PtNode positions in the list after the placing step in GC.
bool DynamicBigramListPolicy::updateAllBigramTargetPtNodePositions(int *const bigramListPos, bool DynamicBigramListPolicy::updateAllBigramTargetPtNodePositions(int *const bigramListPos,
const DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap *const const DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap *const
ptNodePositionRelocationMap) { ptNodePositionRelocationMap, int *const outBigramEntryCount) {
const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos); const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos);
if (usesAdditionalBuffer) { if (usesAdditionalBuffer) {
*bigramListPos -= mBuffer->getOriginalBufferSize(); *bigramListPos -= mBuffer->getOriginalBufferSize();
@ -211,11 +213,12 @@ bool DynamicBigramListPolicy::updateAllBigramTargetPtNodePositions(int *const bi
return false; return false;
} }
} while(BigramListReadWriteUtils::hasNext(bigramFlags)); } while(BigramListReadWriteUtils::hasNext(bigramFlags));
(*outBigramEntryCount) = bigramEntryCount;
return true; return true;
} }
bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTargetPos, bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTargetPos,
const int probability, int *const bigramListPos) { const int probability, int *const bigramListPos, bool *const outAddedNewBigram) {
const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos); const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos);
if (usesAdditionalBuffer) { if (usesAdditionalBuffer) {
*bigramListPos -= mBuffer->getOriginalBufferSize(); *bigramListPos -= mBuffer->getOriginalBufferSize();
@ -243,6 +246,7 @@ bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTarg
} }
if (followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos) == bigramTargetPos) { if (followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos) == bigramTargetPos) {
// Update this bigram entry. // Update this bigram entry.
*outAddedNewBigram = false;
const BigramListReadWriteUtils::BigramFlags updatedFlags = const BigramListReadWriteUtils::BigramFlags updatedFlags =
BigramListReadWriteUtils::setProbabilityInFlags(bigramFlags, probability); BigramListReadWriteUtils::setProbabilityInFlags(bigramFlags, probability);
return BigramListReadWriteUtils::writeBigramEntry(mBuffer, updatedFlags, return BigramListReadWriteUtils::writeBigramEntry(mBuffer, updatedFlags,
@ -254,12 +258,14 @@ bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTarg
// The current last entry is found. // The current last entry is found.
// First, update the flags of the last entry. // First, update the flags of the last entry.
if (!BigramListReadWriteUtils::setHasNextFlag(mBuffer, true /* hasNext */, entryPos)) { if (!BigramListReadWriteUtils::setHasNextFlag(mBuffer, true /* hasNext */, entryPos)) {
*outAddedNewBigram = false;
return false; return false;
} }
if (usesAdditionalBuffer) { if (usesAdditionalBuffer) {
*bigramListPos += mBuffer->getOriginalBufferSize(); *bigramListPos += mBuffer->getOriginalBufferSize();
} }
// Then, add a new entry after the last entry. // Then, add a new entry after the last entry.
*outAddedNewBigram = true;
return writeNewBigramEntry(bigramTargetPos, probability, bigramListPos); return writeNewBigramEntry(bigramTargetPos, probability, bigramListPos);
} while(BigramListReadWriteUtils::hasNext(bigramFlags)); } while(BigramListReadWriteUtils::hasNext(bigramFlags));
// We return directly from the while loop. // We return directly from the while loop.

View File

@ -50,19 +50,20 @@ class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy {
bool copyAllBigrams(BufferWithExtendableBuffer *const bufferToWrite, int *const fromPos, bool copyAllBigrams(BufferWithExtendableBuffer *const bufferToWrite, int *const fromPos,
int *const toPos, int *const outBigramsCount) const; int *const toPos, int *const outBigramsCount) const;
// Out-parameter names follow the definitions: updateAllBigramEntriesAndDeleteUselessEntries
// reports how many entries were kept as valid, while updateAllBigramTargetPtNodePositions
// reports the total number of entries in the list it walked.
bool updateAllBigramEntriesAndDeleteUselessEntries(int *const bigramListPos); bool updateAllBigramEntriesAndDeleteUselessEntries(int *const bigramListPos,
int *const outValidBigramEntryCount);
bool updateAllBigramTargetPtNodePositions(int *const bigramListPos, bool updateAllBigramTargetPtNodePositions(int *const bigramListPos,
const DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap *const const DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap *const
ptNodePositionRelocationMap); ptNodePositionRelocationMap, int *const outBigramEntryCount);
bool addNewBigramEntryToBigramList(const int bigramTargetPos, const int probability, bool addNewBigramEntryToBigramList(const int bigramTargetPos, const int probability,
int *const bigramListPos); int *const bigramListPos, bool *const outAddedNewBigram);
bool writeNewBigramEntry(const int bigramTargetPos, const int probability, bool writeNewBigramEntry(const int bigramTargetPos, const int probability,
int *const writingPos); int *const writingPos);
// Return if targetBigramPos is found or not. // Return whether or not targetBigramPos is found.
bool removeBigram(const int bigramListPos, const int bigramTargetPos); bool removeBigram(const int bigramListPos, const int bigramTargetPos);
private: private:

View File

@ -42,6 +42,9 @@ bool DynamicPatriciaTrieGcEventListeners
} }
} else { } else {
valueStack.back() += 1; valueStack.back() += 1;
if (node->isTerminal()) {
mValidUnigramCount += 1;
}
} }
return true; return true;
} }
@ -137,10 +140,15 @@ bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateAllPositionField
// Updates bigram target PtNode positions in the bigram list. // Updates bigram target PtNode positions in the bigram list.
int bigramsPos = node->getBigramsPos(); int bigramsPos = node->getBigramsPos();
if (bigramsPos != NOT_A_DICT_POS) { if (bigramsPos != NOT_A_DICT_POS) {
int bigramEntryCount;
if (!mBigramPolicy->updateAllBigramTargetPtNodePositions(&bigramsPos, if (!mBigramPolicy->updateAllBigramTargetPtNodePositions(&bigramsPos,
&mDictPositionRelocationMap->mPtNodePositionRelocationMap)) { &mDictPositionRelocationMap->mPtNodePositionRelocationMap, &bigramEntryCount)) {
return false; return false;
} }
mBigramCount += bigramEntryCount;
}
if (node->isTerminal()) {
mUnigramCount++;
} }
return true; return true;

View File

@ -41,7 +41,7 @@ class DynamicPatriciaTrieGcEventListeners {
DynamicPatriciaTrieWritingHelper *const writingHelper, DynamicPatriciaTrieWritingHelper *const writingHelper,
BufferWithExtendableBuffer *const buffer) BufferWithExtendableBuffer *const buffer)
: mWritingHelper(writingHelper), mBuffer(buffer), valueStack(), : mWritingHelper(writingHelper), mBuffer(buffer), valueStack(),
mChildrenValue(0) {} mChildrenValue(0), mValidUnigramCount(0) {}
~TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted() {}; ~TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted() {};
@ -64,6 +64,10 @@ class DynamicPatriciaTrieGcEventListeners {
bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node,
const int *const nodeCodePoints); const int *const nodeCodePoints);
// Returns the tally kept in mValidUnigramCount. The counter starts at 0 and is bumped by
// this listener for terminal PtNodes it visits and keeps.
int getValidUnigramCount() const {
return mValidUnigramCount;
}
private: private:
DISALLOW_IMPLICIT_CONSTRUCTORS( DISALLOW_IMPLICIT_CONSTRUCTORS(
TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted); TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted);
@ -72,6 +76,7 @@ class DynamicPatriciaTrieGcEventListeners {
BufferWithExtendableBuffer *const mBuffer; BufferWithExtendableBuffer *const mBuffer;
std::vector<int> valueStack; std::vector<int> valueStack;
int mChildrenValue; int mChildrenValue;
int mValidUnigramCount;
}; };
// Updates all bigram entries that are held by valid PtNodes. This removes useless bigram // Updates all bigram entries that are held by valid PtNodes. This removes useless bigram
@ -80,7 +85,7 @@ class DynamicPatriciaTrieGcEventListeners {
: public DynamicPatriciaTrieReadingHelper::TraversingEventListener { : public DynamicPatriciaTrieReadingHelper::TraversingEventListener {
public: public:
TraversePolicyToUpdateBigramProbability(DynamicBigramListPolicy *const bigramPolicy) TraversePolicyToUpdateBigramProbability(DynamicBigramListPolicy *const bigramPolicy)
: mBigramPolicy(bigramPolicy) {} : mBigramPolicy(bigramPolicy), mValidBigramEntryCount(0) {}
bool onAscend() { return true; } bool onAscend() { return true; }
@ -93,18 +98,26 @@ class DynamicPatriciaTrieGcEventListeners {
if (!node->isDeleted()) { if (!node->isDeleted()) {
int pos = node->getBigramsPos(); int pos = node->getBigramsPos();
if (pos != NOT_A_DICT_POS) { if (pos != NOT_A_DICT_POS) {
if (!mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries(&pos)) { int bigramEntryCount = 0;
if (!mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries(&pos,
&bigramEntryCount)) {
return false; return false;
} }
mValidBigramEntryCount += bigramEntryCount;
} }
} }
return true; return true;
} }
// Returns the sum of the per-list entry counts reported by
// updateAllBigramEntriesAndDeleteUselessEntries for every non-deleted PtNode visited so far
// that has a bigram list. Starts at 0.
int getValidBigramEntryCount() const {
return mValidBigramEntryCount;
}
private: private:
DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateBigramProbability); DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateBigramProbability);
DynamicBigramListPolicy *const mBigramPolicy; DynamicBigramListPolicy *const mBigramPolicy;
int mValidBigramEntryCount;
}; };
class TraversePolicyToPlaceAndWriteValidPtNodesToBuffer class TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
@ -150,7 +163,8 @@ class DynamicPatriciaTrieGcEventListeners {
dictPositionRelocationMap) dictPositionRelocationMap)
: mWritingHelper(writingHelper), mBigramPolicy(bigramPolicy), : mWritingHelper(writingHelper), mBigramPolicy(bigramPolicy),
mBufferToWrite(bufferToWrite), mBufferToWrite(bufferToWrite),
mDictPositionRelocationMap(dictPositionRelocationMap) {}; mDictPositionRelocationMap(dictPositionRelocationMap), mUnigramCount(0),
mBigramCount(0) {};
bool onAscend() { return true; } bool onAscend() { return true; }
@ -161,6 +175,14 @@ class DynamicPatriciaTrieGcEventListeners {
bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node,
const int *const nodeCodePoints); const int *const nodeCodePoints);
// Returns how many terminal PtNodes this listener has counted so far (starts at 0).
int getUnigramCount() const {
return mUnigramCount;
}
// Returns the accumulated number of bigram entries reported by
// updateAllBigramTargetPtNodePositions for the PtNodes visited so far (starts at 0).
int getBigramCount() const {
return mBigramCount;
}
private: private:
DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllPositionFields); DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllPositionFields);
@ -169,6 +191,8 @@ class DynamicPatriciaTrieGcEventListeners {
BufferWithExtendableBuffer *const mBufferToWrite; BufferWithExtendableBuffer *const mBufferToWrite;
const DynamicPatriciaTrieWritingHelper::DictPositionRelocationMap *const const DynamicPatriciaTrieWritingHelper::DictPositionRelocationMap *const
mDictPositionRelocationMap; mDictPositionRelocationMap;
int mUnigramCount;
int mBigramCount;
}; };
private: private:

View File

@ -16,6 +16,9 @@
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h"
#include <cstdio>
#include <cstring>
#include "defines.h" #include "defines.h"
#include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node.h"
#include "suggest/core/dicnode/dic_node_vector.h" #include "suggest/core/dicnode/dic_node_vector.h"
@ -28,6 +31,9 @@
namespace latinime { namespace latinime {
const char *const DynamicPatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
const char *const DynamicPatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode, void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode,
DicNodeVector *const childDicNodes) const { DicNodeVector *const childDicNodes) const {
if (!dicNode->hasChildren()) { if (!dicNode->hasChildren()) {
@ -198,7 +204,16 @@ bool DynamicPatriciaTriePolicy::addUnigramWord(const int *const word, const int
readingHelper.initWithPtNodeArrayPos(getRootPosition()); readingHelper.initWithPtNodeArrayPos(getRootPosition());
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
&mBigramListPolicy, &mShortcutListPolicy); &mBigramListPolicy, &mShortcutListPolicy);
return writingHelper.addUnigramWord(&readingHelper, word, length, probability); bool addedNewUnigram = false;
if (writingHelper.addUnigramWord(&readingHelper, word, length, probability,
&addedNewUnigram)) {
if (addedNewUnigram) {
mUnigramCount++;
}
return true;
} else {
return false;
}
} }
bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int length0, bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int length0,
@ -219,7 +234,15 @@ bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int
} }
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
&mBigramListPolicy, &mShortcutListPolicy); &mBigramListPolicy, &mShortcutListPolicy);
return writingHelper.addBigramWords(word0Pos, word1Pos, probability); bool addedNewBigram = false;
if (writingHelper.addBigramWords(word0Pos, word1Pos, probability, &addedNewBigram)) {
if (addedNewBigram) {
mBigramCount++;
}
return true;
} else {
return false;
}
} }
bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const int length0, bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const int length0,
@ -240,7 +263,12 @@ bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const
} }
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
&mBigramListPolicy, &mShortcutListPolicy); &mBigramListPolicy, &mShortcutListPolicy);
return writingHelper.removeBigramWords(word0Pos, word1Pos); if (writingHelper.removeBigramWords(word0Pos, word1Pos)) {
mBigramCount--;
return true;
} else {
return false;
}
} }
void DynamicPatriciaTriePolicy::flush(const char *const filePath) { void DynamicPatriciaTriePolicy::flush(const char *const filePath) {
@ -250,7 +278,7 @@ void DynamicPatriciaTriePolicy::flush(const char *const filePath) {
} }
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
&mBigramListPolicy, &mShortcutListPolicy); &mBigramListPolicy, &mShortcutListPolicy);
writingHelper.writeToDictFile(filePath, &mHeaderPolicy); writingHelper.writeToDictFile(filePath, &mHeaderPolicy, mUnigramCount, mBigramCount);
} }
void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) { void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) {
@ -272,4 +300,13 @@ bool DynamicPatriciaTriePolicy::needsToRunGC() const {
return mBufferWithExtendableBuffer.isNearSizeLimit(); return mBufferWithExtendableBuffer.isNearSizeLimit();
} }
// Answers a textual property query about this dictionary.
//   query           - NUL-terminated property name; UNIGRAM_COUNT_QUERY and BIGRAM_COUNT_QUERY
//                     are the recognized names.
//   outResult       - caller-owned buffer that receives the value formatted as a decimal
//                     string.
//   maxResultLength - capacity of outResult in bytes; snprintf truncates to fit.
// For an unrecognized query, or for null / non-positive arguments, outResult is left untouched.
void DynamicPatriciaTriePolicy::getProperty(const char *const query, char *const outResult,
        const int maxResultLength) const {
    if (!query || !outResult || maxResultLength <= 0) {
        return;
    }
    // Compare through the terminating NUL of each known key so that only an exact match is
    // accepted. The bound comes from the key itself, not from the capacity of the output
    // buffer, which has nothing to do with how long a query may be.
    if (strncmp(query, UNIGRAM_COUNT_QUERY, strlen(UNIGRAM_COUNT_QUERY) + 1) == 0) {
        snprintf(outResult, maxResultLength, "%d", mUnigramCount);
    } else if (strncmp(query, BIGRAM_COUNT_QUERY, strlen(BIGRAM_COUNT_QUERY) + 1) == 0) {
        snprintf(outResult, maxResultLength, "%d", mBigramCount);
    }
}
} // namespace latinime } // namespace latinime

View File

@ -37,7 +37,9 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
mBufferWithExtendableBuffer(mBuffer->getBuffer() + mHeaderPolicy.getSize(), mBufferWithExtendableBuffer(mBuffer->getBuffer() + mHeaderPolicy.getSize(),
mBuffer->getBufferSize() - mHeaderPolicy.getSize()), mBuffer->getBufferSize() - mHeaderPolicy.getSize()),
mShortcutListPolicy(&mBufferWithExtendableBuffer), mShortcutListPolicy(&mBufferWithExtendableBuffer),
mBigramListPolicy(&mBufferWithExtendableBuffer, &mShortcutListPolicy) {} mBigramListPolicy(&mBufferWithExtendableBuffer, &mShortcutListPolicy),
mUnigramCount(mHeaderPolicy.getUnigramCount()),
mBigramCount(mHeaderPolicy.getBigramCount()) {}
~DynamicPatriciaTriePolicy() { ~DynamicPatriciaTriePolicy() {
delete mBuffer; delete mBuffer;
@ -91,14 +93,22 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
bool needsToRunGC() const; bool needsToRunGC() const;
void getProperty(const char *const query, char *const outResult,
const int maxResultLength) const;
private: private:
DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTriePolicy); DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTriePolicy);
static const char*const UNIGRAM_COUNT_QUERY;
static const char*const BIGRAM_COUNT_QUERY;
const MmappedBuffer *const mBuffer; const MmappedBuffer *const mBuffer;
const HeaderPolicy mHeaderPolicy; const HeaderPolicy mHeaderPolicy;
BufferWithExtendableBuffer mBufferWithExtendableBuffer; BufferWithExtendableBuffer mBufferWithExtendableBuffer;
DynamicShortcutListPolicy mShortcutListPolicy; DynamicShortcutListPolicy mShortcutListPolicy;
DynamicBigramListPolicy mBigramListPolicy; DynamicBigramListPolicy mBigramListPolicy;
int mUnigramCount;
int mBigramCount;
}; };
} // namespace latinime } // namespace latinime
#endif // LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H #endif // LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H

View File

@ -36,7 +36,8 @@ const size_t DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE = 2 * 1024 *
bool DynamicPatriciaTrieWritingHelper::addUnigramWord( bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
DynamicPatriciaTrieReadingHelper *const readingHelper, DynamicPatriciaTrieReadingHelper *const readingHelper,
const int *const wordCodePoints, const int codePointCount, const int probability) { const int *const wordCodePoints, const int codePointCount, const int probability,
bool *const outAddedNewUnigram) {
int parentPos = NOT_A_DICT_POS; int parentPos = NOT_A_DICT_POS;
while (!readingHelper->isEnd()) { while (!readingHelper->isEnd()) {
const int matchedCodePointCount = readingHelper->getPrevTotalCodePointCount(); const int matchedCodePointCount = readingHelper->getPrevTotalCodePointCount();
@ -54,6 +55,7 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
const int nextIndex = matchedCodePointCount + j; const int nextIndex = matchedCodePointCount + j;
if (nextIndex >= codePointCount || !readingHelper->isMatchedCodePoint(j, if (nextIndex >= codePointCount || !readingHelper->isMatchedCodePoint(j,
wordCodePoints[matchedCodePointCount + j])) { wordCodePoints[matchedCodePointCount + j])) {
*outAddedNewUnigram = true;
return reallocatePtNodeAndAddNewPtNodes(nodeReader, return reallocatePtNodeAndAddNewPtNodes(nodeReader,
readingHelper->getMergedNodeCodePoints(), j, probability, readingHelper->getMergedNodeCodePoints(), j, probability,
wordCodePoints + matchedCodePointCount, wordCodePoints + matchedCodePointCount,
@ -63,9 +65,10 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
// All characters are matched. // All characters are matched.
if (codePointCount == readingHelper->getTotalCodePointCount()) { if (codePointCount == readingHelper->getTotalCodePointCount()) {
return setPtNodeProbability(nodeReader, probability, return setPtNodeProbability(nodeReader, probability,
readingHelper->getMergedNodeCodePoints()); readingHelper->getMergedNodeCodePoints(), outAddedNewUnigram);
} }
if (!nodeReader->hasChildren()) { if (!nodeReader->hasChildren()) {
*outAddedNewUnigram = true;
return createChildrenPtNodeArrayAndAChildPtNode(nodeReader, probability, return createChildrenPtNodeArrayAndAChildPtNode(nodeReader, probability,
wordCodePoints + readingHelper->getTotalCodePointCount(), wordCodePoints + readingHelper->getTotalCodePointCount(),
codePointCount - readingHelper->getTotalCodePointCount()); codePointCount - readingHelper->getTotalCodePointCount());
@ -79,6 +82,7 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
return false; return false;
} }
int pos = readingHelper->getPosOfLastForwardLinkField(); int pos = readingHelper->getPosOfLastForwardLinkField();
*outAddedNewUnigram = true;
return createAndInsertNodeIntoPtNodeArray(parentPos, return createAndInsertNodeIntoPtNodeArray(parentPos,
wordCodePoints + readingHelper->getPrevTotalCodePointCount(), wordCodePoints + readingHelper->getPrevTotalCodePointCount(),
codePointCount - readingHelper->getPrevTotalCodePointCount(), codePointCount - readingHelper->getPrevTotalCodePointCount(),
@ -86,7 +90,7 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
} }
bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const int word1Pos, bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const int word1Pos,
const int probability) { const int probability, bool *const outAddedNewBigram) {
int mMergedNodeCodePoints[MAX_WORD_LENGTH]; int mMergedNodeCodePoints[MAX_WORD_LENGTH];
DynamicPatriciaTrieNodeReader nodeReader(mBuffer, mBigramPolicy, mShortcutPolicy); DynamicPatriciaTrieNodeReader nodeReader(mBuffer, mBigramPolicy, mShortcutPolicy);
nodeReader.fetchNodeInfoInBufferFromPtNodePosAndGetNodeCodePoints(word0Pos, MAX_WORD_LENGTH, nodeReader.fetchNodeInfoInBufferFromPtNodePosAndGetNodeCodePoints(word0Pos, MAX_WORD_LENGTH,
@ -107,9 +111,11 @@ bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const
if (nodeReader.getBigramsPos() != NOT_A_DICT_POS) { if (nodeReader.getBigramsPos() != NOT_A_DICT_POS) {
// Insert a new bigram entry into the existing bigram list. // Insert a new bigram entry into the existing bigram list.
int bigramListPos = nodeReader.getBigramsPos(); int bigramListPos = nodeReader.getBigramsPos();
return mBigramPolicy->addNewBigramEntryToBigramList(word1Pos, probability, &bigramListPos); return mBigramPolicy->addNewBigramEntryToBigramList(word1Pos, probability, &bigramListPos,
outAddedNewBigram);
} else { } else {
// The PtNode doesn't have a bigram list. // The PtNode doesn't have a bigram list.
*outAddedNewBigram = true;
// First, Write a bigram entry at the tail position of the PtNode. // First, Write a bigram entry at the tail position of the PtNode.
if (!mBigramPolicy->writeNewBigramEntry(word1Pos, probability, &writingPos)) { if (!mBigramPolicy->writeNewBigramEntry(word1Pos, probability, &writingPos)) {
return false; return false;
@ -138,9 +144,12 @@ bool DynamicPatriciaTrieWritingHelper::removeBigramWords(const int word0Pos, con
} }
void DynamicPatriciaTrieWritingHelper::writeToDictFile(const char *const fileName, void DynamicPatriciaTrieWritingHelper::writeToDictFile(const char *const fileName,
const HeaderPolicy *const headerPolicy) { const HeaderPolicy *const headerPolicy, const int unigramCount, const int bigramCount) {
BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, false /* updatesLastUpdatedTime */)) { const int extendedRegionSize = headerPolicy->getExtendedRegionSize() +
mBuffer->getTailPosition() - mBuffer->getOriginalBufferSize();
if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, false /* updatesLastUpdatedTime */,
unigramCount, bigramCount, extendedRegionSize)) {
return; return;
} }
DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, mBuffer); DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, mBuffer);
@ -148,13 +157,16 @@ void DynamicPatriciaTrieWritingHelper::writeToDictFile(const char *const fileNam
void DynamicPatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos, void DynamicPatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos,
const char *const fileName, const HeaderPolicy *const headerPolicy) { const char *const fileName, const HeaderPolicy *const headerPolicy) {
BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */)) {
return;
}
BufferWithExtendableBuffer newDictBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */, BufferWithExtendableBuffer newDictBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */,
MAX_DICTIONARY_SIZE); MAX_DICTIONARY_SIZE);
if (!runGC(rootPtNodeArrayPos, &newDictBuffer)) { int unigramCount = 0;
int bigramCount = 0;
if (!runGC(rootPtNodeArrayPos, &newDictBuffer, &unigramCount, &bigramCount)) {
return;
}
BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */,
unigramCount, bigramCount, 0 /* extendedRegionSize */)) {
return; return;
} }
DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, &newDictBuffer); DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, &newDictBuffer);
@ -335,9 +347,10 @@ bool DynamicPatriciaTrieWritingHelper::createAndInsertNodeIntoPtNodeArray(const
bool DynamicPatriciaTrieWritingHelper::setPtNodeProbability( bool DynamicPatriciaTrieWritingHelper::setPtNodeProbability(
const DynamicPatriciaTrieNodeReader *const originalPtNode, const int probability, const DynamicPatriciaTrieNodeReader *const originalPtNode, const int probability,
const int *const codePoints) { const int *const codePoints, bool *const outAddedNewUnigram) {
if (originalPtNode->isTerminal()) { if (originalPtNode->isTerminal()) {
// Overwrites the probability. // Overwrites the probability.
*outAddedNewUnigram = false;
int probabilityFieldPos = originalPtNode->getProbabilityFieldPos(); int probabilityFieldPos = originalPtNode->getProbabilityFieldPos();
if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition(mBuffer, if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition(mBuffer,
probability, &probabilityFieldPos)) { probability, &probabilityFieldPos)) {
@ -345,6 +358,7 @@ bool DynamicPatriciaTrieWritingHelper::setPtNodeProbability(
} }
} else { } else {
// Make the node terminal and write the probability. // Make the node terminal and write the probability.
*outAddedNewUnigram = true;
int movedPos = mBuffer->getTailPosition(); int movedPos = mBuffer->getTailPosition();
if (!markNodeAsMovedAndSetPosition(originalPtNode, movedPos, movedPos)) { if (!markNodeAsMovedAndSetPosition(originalPtNode, movedPos, movedPos)) {
return false; return false;
@ -460,7 +474,8 @@ bool DynamicPatriciaTrieWritingHelper::reallocatePtNodeAndAddNewPtNodes(
} }
bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
BufferWithExtendableBuffer *const bufferToWrite) { BufferWithExtendableBuffer *const bufferToWrite, int *const outUnigramCount,
int *const outBigramCount) {
DynamicPatriciaTrieReadingHelper readingHelper(mBuffer, mBigramPolicy, mShortcutPolicy); DynamicPatriciaTrieReadingHelper readingHelper(mBuffer, mBigramPolicy, mShortcutPolicy);
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
DynamicPatriciaTrieGcEventListeners DynamicPatriciaTrieGcEventListeners
@ -505,6 +520,8 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
&traversePolicyToUpdateAllPositionFields)) { &traversePolicyToUpdateAllPositionFields)) {
return false; return false;
} }
*outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount();
*outBigramCount = traversePolicyToUpdateAllPositionFields.getBigramCount();
return true; return true;
} }

View File

@ -56,15 +56,18 @@ class DynamicPatriciaTrieWritingHelper {
// Add a word to the dictionary. If the word already exists, update the probability. // Add a word to the dictionary. If the word already exists, update the probability.
bool addUnigramWord(DynamicPatriciaTrieReadingHelper *const readingHelper, bool addUnigramWord(DynamicPatriciaTrieReadingHelper *const readingHelper,
const int *const wordCodePoints, const int codePointCount, const int probability); const int *const wordCodePoints, const int codePointCount, const int probability,
bool *const outAddedNewUnigram);
// Add a bigram relation from word0Pos to word1Pos. // Add a bigram relation from word0Pos to word1Pos.
bool addBigramWords(const int word0Pos, const int word1Pos, const int probability); bool addBigramWords(const int word0Pos, const int word1Pos, const int probability,
bool *const outAddedNewBigram);
// Remove a bigram relation from word0Pos to word1Pos. // Remove a bigram relation from word0Pos to word1Pos.
bool removeBigramWords(const int word0Pos, const int word1Pos); bool removeBigramWords(const int word0Pos, const int word1Pos);
void writeToDictFile(const char *const fileName, const HeaderPolicy *const headerPolicy); void writeToDictFile(const char *const fileName, const HeaderPolicy *const headerPolicy,
const int unigramCount, const int bigramCount);
void writeToDictFileWithGC(const int rootPtNodeArrayPos, const char *const fileName, void writeToDictFileWithGC(const int rootPtNodeArrayPos, const char *const fileName,
const HeaderPolicy *const headerPolicy); const HeaderPolicy *const headerPolicy);
@ -107,7 +110,7 @@ class DynamicPatriciaTrieWritingHelper {
const int nodeCodePointCount, const int probability, int *const forwardLinkFieldPos); const int nodeCodePointCount, const int probability, int *const forwardLinkFieldPos);
bool setPtNodeProbability(const DynamicPatriciaTrieNodeReader *const originalNode, bool setPtNodeProbability(const DynamicPatriciaTrieNodeReader *const originalNode,
const int probability, const int *const codePoints); const int probability, const int *const codePoints, bool *const outAddedNewUnigram);
bool createChildrenPtNodeArrayAndAChildPtNode( bool createChildrenPtNodeArrayAndAChildPtNode(
const DynamicPatriciaTrieNodeReader *const parentNode, const int probability, const DynamicPatriciaTrieNodeReader *const parentNode, const int probability,
@ -122,7 +125,8 @@ class DynamicPatriciaTrieWritingHelper {
const int probabilityOfNewPtNode, const int *const newNodeCodePoints, const int probabilityOfNewPtNode, const int *const newNodeCodePoints,
const int newNodeCodePointCount); const int newNodeCodePointCount);
bool runGC(const int rootPtNodeArrayPos, BufferWithExtendableBuffer *const bufferToWrite); bool runGC(const int rootPtNodeArrayPos, BufferWithExtendableBuffer *const bufferToWrite,
int *const outUnigramCount, int *const outBigramCount);
}; };
} // namespace latinime } // namespace latinime
#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H */ #endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H */

View File

@ -16,17 +16,15 @@
#include "suggest/policyimpl/dictionary/header/header_policy.h" #include "suggest/policyimpl/dictionary/header/header_policy.h"
#include <cstddef>
#include <cstdio>
#include <ctime>
namespace latinime { namespace latinime {
// Note that these are corresponding definitions in Java side in FormatSpec.FileHeader. // Note that these are corresponding definitions in Java side in FormatSpec.FileHeader.
const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = "MULTIPLE_WORDS_DEMOTION_RATE"; const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = "MULTIPLE_WORDS_DEMOTION_RATE";
const char *const HeaderPolicy::USES_FORGETTING_CURVE_KEY = "USES_FORGETTING_CURVE"; const char *const HeaderPolicy::USES_FORGETTING_CURVE_KEY = "USES_FORGETTING_CURVE";
const char *const HeaderPolicy::LAST_UPDATED_TIME_KEY = "date"; const char *const HeaderPolicy::LAST_UPDATED_TIME_KEY = "date";
const char *const HeaderPolicy::UNIGRAM_COUNT_KEY = "UNIGRAM_COUNT";
const char *const HeaderPolicy::BIGRAM_COUNT_KEY = "BIGRAM_COUNT";
const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE";
const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100; const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100;
const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f; const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f;
@ -55,33 +53,17 @@ void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *out
} }
float HeaderPolicy::readMultipleWordCostMultiplier() const { float HeaderPolicy::readMultipleWordCostMultiplier() const {
std::vector<int> keyVector;
HeaderReadWriteUtils::insertCharactersIntoVector(MULTIPLE_WORDS_DEMOTION_RATE_KEY, &keyVector);
const int demotionRate = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, const int demotionRate = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
&keyVector, DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE); MULTIPLE_WORDS_DEMOTION_RATE_KEY, DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE);
if (demotionRate <= 0) { if (demotionRate <= 0) {
return static_cast<float>(MAX_VALUE_FOR_WEIGHTING); return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
} }
return MULTIPLE_WORD_COST_MULTIPLIER_SCALE / static_cast<float>(demotionRate); return MULTIPLE_WORD_COST_MULTIPLIER_SCALE / static_cast<float>(demotionRate);
} }
bool HeaderPolicy::readUsesForgettingCurveFlag() const {
std::vector<int> keyVector;
HeaderReadWriteUtils::insertCharactersIntoVector(USES_FORGETTING_CURVE_KEY, &keyVector);
return HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, &keyVector,
false /* defaultValue */);
}
// Returns current time when the key is not found or the value is invalid.
int HeaderPolicy::readLastUpdatedTime() const {
std::vector<int> keyVector;
HeaderReadWriteUtils::insertCharactersIntoVector(LAST_UPDATED_TIME_KEY, &keyVector);
return HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, &keyVector,
time(0) /* defaultValue */);
}
bool HeaderPolicy::writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite, bool HeaderPolicy::writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite,
const bool updatesLastUpdatedTime) const { const bool updatesLastUpdatedTime, const int unigramCount, const int bigramCount,
const int extendedRegionSize) const {
int writingPos = 0; int writingPos = 0;
if (!HeaderReadWriteUtils::writeDictionaryVersion(bufferToWrite, mDictFormatVersion, if (!HeaderReadWriteUtils::writeDictionaryVersion(bufferToWrite, mDictFormatVersion,
&writingPos)) { &writingPos)) {
@ -97,21 +79,19 @@ bool HeaderPolicy::writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferT
&writingPos)) { &writingPos)) {
return false; return false;
} }
HeaderReadWriteUtils::AttributeMap attributeMapTowrite(mAttributeMap);
HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, UNIGRAM_COUNT_KEY, unigramCount);
HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, BIGRAM_COUNT_KEY, bigramCount);
HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, EXTENDED_REGION_SIZE_KEY,
extendedRegionSize);
if (updatesLastUpdatedTime) { if (updatesLastUpdatedTime) {
// Set current time as a last updated time. // Set current time as a last updated time.
HeaderReadWriteUtils::AttributeMap attributeMapTowrite(mAttributeMap); HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, LAST_UPDATED_TIME_KEY,
std::vector<int> updatedTimekey; time(0));
HeaderReadWriteUtils::insertCharactersIntoVector(LAST_UPDATED_TIME_KEY, &updatedTimekey); }
HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, &updatedTimekey, time(0)); if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &attributeMapTowrite,
if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &attributeMapTowrite, &writingPos)) {
&writingPos)) { return false;
return false;
}
} else {
if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &mAttributeMap,
&writingPos)) {
return false;
}
} }
// Writes an actual header size. // Writes an actual header size.
if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(bufferToWrite, writingPos, if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(bufferToWrite, writingPos,

View File

@ -17,6 +17,7 @@
#ifndef LATINIME_HEADER_POLICY_H #ifndef LATINIME_HEADER_POLICY_H
#define LATINIME_HEADER_POLICY_H #define LATINIME_HEADER_POLICY_H
#include <ctime>
#include <stdint.h> #include <stdint.h>
#include "defines.h" #include "defines.h"
@ -35,8 +36,16 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)), mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)),
mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)), mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)),
mMultiWordCostMultiplier(readMultipleWordCostMultiplier()), mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
mUsesForgettingCurve(readUsesForgettingCurveFlag()), mUsesForgettingCurve(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
mLastUpdatedTime(readLastUpdatedTime()) {} USES_FORGETTING_CURVE_KEY, false /* defaultValue */)),
mLastUpdatedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
LAST_UPDATED_TIME_KEY, time(0) /* defaultValue */)),
mUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
UNIGRAM_COUNT_KEY, 0 /* defaultValue */)),
mBigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
BIGRAM_COUNT_KEY, 0 /* defaultValue */)),
mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)) {}
// Constructs header information using an attribute map. // Constructs header information using an attribute map.
HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion, HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,
@ -44,9 +53,12 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
: mDictFormatVersion(dictFormatVersion), : mDictFormatVersion(dictFormatVersion),
mDictionaryFlags(HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap( mDictionaryFlags(HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap(
attributeMap)), mSize(0), mAttributeMap(*attributeMap), attributeMap)), mSize(0), mAttributeMap(*attributeMap),
mMultiWordCostMultiplier(readUsesForgettingCurveFlag()), mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
mUsesForgettingCurve(readUsesForgettingCurveFlag()), mUsesForgettingCurve(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
mLastUpdatedTime(readLastUpdatedTime()) {} USES_FORGETTING_CURVE_KEY, false /* defaultValue */)),
mLastUpdatedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
LAST_UPDATED_TIME_KEY, time(0) /* defaultValue */)),
mUnigramCount(0), mBigramCount(0), mExtendedRegionSize(0) {}
~HeaderPolicy() {} ~HeaderPolicy() {}
@ -78,11 +90,24 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
return mLastUpdatedTime; return mLastUpdatedTime;
} }
AK_FORCE_INLINE int getUnigramCount() const {
return mUnigramCount;
}
AK_FORCE_INLINE int getBigramCount() const {
return mBigramCount;
}
AK_FORCE_INLINE int getExtendedRegionSize() const {
return mExtendedRegionSize;
}
void readHeaderValueOrQuestionMark(const char *const key, void readHeaderValueOrQuestionMark(const char *const key,
int *outValue, int outValueSize) const; int *outValue, int outValueSize) const;
bool writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite, bool writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite,
const bool updatesLastUpdatedTime) const; const bool updatesLastUpdatedTime, const int unigramCount,
const int bigramCount, const int extendedRegionSize) const;
private: private:
DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderPolicy); DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderPolicy);
@ -90,6 +115,9 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY; static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY;
static const char *const USES_FORGETTING_CURVE_KEY; static const char *const USES_FORGETTING_CURVE_KEY;
static const char *const LAST_UPDATED_TIME_KEY; static const char *const LAST_UPDATED_TIME_KEY;
static const char *const UNIGRAM_COUNT_KEY;
static const char *const BIGRAM_COUNT_KEY;
static const char *const EXTENDED_REGION_SIZE_KEY;
static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE; static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE;
static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE; static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE;
@ -100,13 +128,12 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
const float mMultiWordCostMultiplier; const float mMultiWordCostMultiplier;
const bool mUsesForgettingCurve; const bool mUsesForgettingCurve;
const int mLastUpdatedTime; const int mLastUpdatedTime;
const int mUnigramCount;
const int mBigramCount;
const int mExtendedRegionSize;
float readMultipleWordCostMultiplier() const; float readMultipleWordCostMultiplier() const;
bool readUsesForgettingCurveFlag() const;
int readLastUpdatedTime() const;
static HeaderReadWriteUtils::AttributeMap createAttributeMapAndReadAllAttributes( static HeaderReadWriteUtils::AttributeMap createAttributeMapAndReadAllAttributes(
const uint8_t *const dictBuf); const uint8_t *const dictBuf);
}; };

View File

@ -68,18 +68,12 @@ const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY
/* static */ HeaderReadWriteUtils::DictionaryFlags /* static */ HeaderReadWriteUtils::DictionaryFlags
HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap( HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap(
const HeaderReadWriteUtils::AttributeMap *const attributeMap) { const HeaderReadWriteUtils::AttributeMap *const attributeMap) {
AttributeMap::key_type key; const bool requiresGermanUmlautProcessing = readBoolAttributeValue(attributeMap,
insertCharactersIntoVector(REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, &key); REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, false /* defaultValue */);
const bool requiresGermanUmlautProcessing = readBoolAttributeValue(attributeMap, &key, const bool requiresFrenchLigatureProcessing = readBoolAttributeValue(attributeMap,
false /* defaultValue */); REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY, false /* defaultValue */);
key.clear(); const bool supportsDynamicUpdate = readBoolAttributeValue(attributeMap,
insertCharactersIntoVector(REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY, &key); SUPPORTS_DYNAMIC_UPDATE_KEY, false /* defaultValue */);
const bool requiresFrenchLigatureProcessing = readBoolAttributeValue(attributeMap, &key,
false /* defaultValue */);
key.clear();
insertCharactersIntoVector(SUPPORTS_DYNAMIC_UPDATE_KEY, &key);
const bool supportsDynamicUpdate = readBoolAttributeValue(attributeMap, &key,
false /* defaultValue */);
DictionaryFlags dictflags = NO_FLAGS; DictionaryFlags dictflags = NO_FLAGS;
dictflags |= requiresGermanUmlautProcessing ? GERMAN_UMLAUT_PROCESSING_FLAG : 0; dictflags |= requiresGermanUmlautProcessing ? GERMAN_UMLAUT_PROCESSING_FLAG : 0;
dictflags |= requiresFrenchLigatureProcessing ? FRENCH_LIGATURE_PROCESSING_FLAG : 0; dictflags |= requiresFrenchLigatureProcessing ? FRENCH_LIGATURE_PROCESSING_FLAG : 0;
@ -160,11 +154,18 @@ const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY
} }
/* static */ void HeaderReadWriteUtils::setBoolAttribute(AttributeMap *const headerAttributes, /* static */ void HeaderReadWriteUtils::setBoolAttribute(AttributeMap *const headerAttributes,
const AttributeMap::key_type *const key, const bool value) { const char *const key, const bool value) {
setIntAttribute(headerAttributes, key, value ? 1 : 0); setIntAttribute(headerAttributes, key, value ? 1 : 0);
} }
/* static */ void HeaderReadWriteUtils::setIntAttribute(AttributeMap *const headerAttributes, /* static */ void HeaderReadWriteUtils::setIntAttribute(AttributeMap *const headerAttributes,
const char *const key, const int value) {
AttributeMap::key_type keyVector;
insertCharactersIntoVector(key, &keyVector);
setIntAttributeInner(headerAttributes, &keyVector, value);
}
/* static */ void HeaderReadWriteUtils::setIntAttributeInner(AttributeMap *const headerAttributes,
const AttributeMap::key_type *const key, const int value) { const AttributeMap::key_type *const key, const int value) {
AttributeMap::mapped_type valueVector; AttributeMap::mapped_type valueVector;
char charBuf[LARGEST_INT_DIGIT_COUNT + 1]; char charBuf[LARGEST_INT_DIGIT_COUNT + 1];
@ -174,7 +175,7 @@ const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY
} }
/* static */ bool HeaderReadWriteUtils::readBoolAttributeValue( /* static */ bool HeaderReadWriteUtils::readBoolAttributeValue(
const AttributeMap *const headerAttributes, const AttributeMap::key_type *const key, const AttributeMap *const headerAttributes, const char *const key,
const bool defaultValue) { const bool defaultValue) {
const int intDefaultValue = defaultValue ? 1 : 0; const int intDefaultValue = defaultValue ? 1 : 0;
const int intValue = readIntAttributeValue(headerAttributes, key, intDefaultValue); const int intValue = readIntAttributeValue(headerAttributes, key, intDefaultValue);
@ -182,6 +183,14 @@ const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY
} }
/* static */ int HeaderReadWriteUtils::readIntAttributeValue( /* static */ int HeaderReadWriteUtils::readIntAttributeValue(
const AttributeMap *const headerAttributes, const char *const key,
const int defaultValue) {
AttributeMap::key_type keyVector;
insertCharactersIntoVector(key, &keyVector);
return readIntAttributeValueInner(headerAttributes, &keyVector, defaultValue);
}
/* static */ int HeaderReadWriteUtils::readIntAttributeValueInner(
const AttributeMap *const headerAttributes, const AttributeMap::key_type *const key, const AttributeMap *const headerAttributes, const AttributeMap::key_type *const key,
const int defaultValue) { const int defaultValue) {
AttributeMap::const_iterator it = headerAttributes->find(*key); AttributeMap::const_iterator it = headerAttributes->find(*key);

View File

@ -76,16 +76,16 @@ class HeaderReadWriteUtils {
* Methods for header attributes. * Methods for header attributes.
*/ */
static void setBoolAttribute(AttributeMap *const headerAttributes, static void setBoolAttribute(AttributeMap *const headerAttributes,
const AttributeMap::key_type *const key, const bool value); const char *const key, const bool value);
static void setIntAttribute(AttributeMap *const headerAttributes, static void setIntAttribute(AttributeMap *const headerAttributes,
const AttributeMap::key_type *const key, const int value); const char *const key, const int value);
static bool readBoolAttributeValue(const AttributeMap *const headerAttributes, static bool readBoolAttributeValue(const AttributeMap *const headerAttributes,
const AttributeMap::key_type *const key, const bool defaultValue); const char *const key, const bool defaultValue);
static int readIntAttributeValue(const AttributeMap *const headerAttributes, static int readIntAttributeValue(const AttributeMap *const headerAttributes,
const AttributeMap::key_type *const key, const int defaultValue); const char *const key, const int defaultValue);
static void insertCharactersIntoVector(const char *const characters, static void insertCharactersIntoVector(const char *const characters,
AttributeMap::key_type *const key); AttributeMap::key_type *const key);
@ -112,6 +112,12 @@ class HeaderReadWriteUtils {
static const char *const SUPPORTS_DYNAMIC_UPDATE_KEY; static const char *const SUPPORTS_DYNAMIC_UPDATE_KEY;
static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY; static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY;
static const char *const REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY; static const char *const REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY;
static void setIntAttributeInner(AttributeMap *const headerAttributes,
const AttributeMap::key_type *const key, const int value);
static int readIntAttributeValueInner(const AttributeMap *const headerAttributes,
const AttributeMap::key_type *const key, const int defaultValue);
}; };
} }
#endif /* LATINIME_HEADER_READ_WRITE_UTILS_H */ #endif /* LATINIME_HEADER_READ_WRITE_UTILS_H */

View File

@ -113,6 +113,14 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
return false; return false;
} }
void getProperty(const char *const query, char *const outResult,
const int maxResultLength) const {
// getProperty is not supported for this class.
if (maxResultLength > 0) {
outResult[0] = '\0';
}
}
private: private:
DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTriePolicy); DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTriePolicy);

View File

@ -43,7 +43,8 @@ const char *const DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE =
const HeaderReadWriteUtils::AttributeMap *const attributeMap) { const HeaderReadWriteUtils::AttributeMap *const attributeMap) {
BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
HeaderPolicy headerPolicy(FormatUtils::VERSION_3, attributeMap); HeaderPolicy headerPolicy(FormatUtils::VERSION_3, attributeMap);
headerPolicy.writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */); headerPolicy.writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */,
0 /* unigramCount */, 0 /* bigramCount */, 0 /* extendedRegionSize */);
BufferWithExtendableBuffer bodyBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); BufferWithExtendableBuffer bodyBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
if (!DynamicPatriciaTrieWritingUtils::writeEmptyDictionary(&bodyBuffer, 0 /* rootPos */)) { if (!DynamicPatriciaTrieWritingUtils::writeEmptyDictionary(&bodyBuffer, 0 /* rootPos */)) {
return false; return false;

View File

@ -27,6 +27,7 @@ import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale; import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.Random; import java.util.Random;
@ -625,4 +626,57 @@ public class BinaryDictionaryTests extends AndroidTestCase {
dictFile.delete(); dictFile.delete();
} }
public void testUnigramAndBigramCount() {
final int flashWithGCIterationCount = 10;
final int codePointSetSize = 50;
final int unigramCountPerIteration = 1000;
final int bigramCountPerIteration = 2000;
final int seed = 1123581321;
final Random random = new Random(seed);
File dictFile = null;
try {
dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary");
} catch (IOException e) {
fail("IOException while writing an initial dictionary : " + e);
}
final ArrayList<String> words = new ArrayList<String>();
final HashSet<Pair<String, String>> bigrams = new HashSet<Pair<String, String>>();
final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
BinaryDictionary binaryDictionary;
for (int i = 0; i < flashWithGCIterationCount; i++) {
binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
for (int j = 0; j < unigramCountPerIteration; j++) {
final String word = CodePointUtils.generateWord(random, codePointSet);
words.add(word);
final int unigramProbability = random.nextInt(0xFF);
binaryDictionary.addUnigramWord(word, unigramProbability);
}
for (int j = 0; j < bigramCountPerIteration; j++) {
final String word0 = words.get(random.nextInt(words.size()));
final String word1 = words.get(random.nextInt(words.size()));
bigrams.add(new Pair<String, String>(word0, word1));
final int bigramProbability = random.nextInt(0xF);
binaryDictionary.addBigramWords(word0, word1, bigramProbability);
}
assertEquals(new HashSet<String>(words).size(), Integer.parseInt(
binaryDictionary.getPropertyForTests(BinaryDictionary.UNIGRAM_COUNT_QUERY)));
assertEquals(new HashSet<Pair<String, String>>(bigrams).size(), Integer.parseInt(
binaryDictionary.getPropertyForTests(BinaryDictionary.BIGRAM_COUNT_QUERY)));
binaryDictionary.flushWithGC();
assertEquals(new HashSet<String>(words).size(), Integer.parseInt(
binaryDictionary.getPropertyForTests(BinaryDictionary.UNIGRAM_COUNT_QUERY)));
assertEquals(new HashSet<Pair<String, String>>(bigrams).size(), Integer.parseInt(
binaryDictionary.getPropertyForTests(BinaryDictionary.BIGRAM_COUNT_QUERY)));
binaryDictionary.close();
}
dictFile.delete();
}
} }