* commit 'de12fcb0f22eec6ff0650a45e937da168bbb6fbc': Add unigram/bigram counting.
This commit is contained in:
commit
6540132d94
20 changed files with 339 additions and 101 deletions
|
@ -48,6 +48,11 @@ public final class BinaryDictionary extends Dictionary {
|
||||||
// TODO: Remove this heuristic.
|
// TODO: Remove this heuristic.
|
||||||
private static final int SPACE_COUNT_FOR_AUTO_COMMIT = 3;
|
private static final int SPACE_COUNT_FOR_AUTO_COMMIT = 3;
|
||||||
|
|
||||||
|
@UsedForTesting
|
||||||
|
public static final String UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
|
||||||
|
@UsedForTesting
|
||||||
|
public static final String BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
|
||||||
|
|
||||||
private long mNativeDict;
|
private long mNativeDict;
|
||||||
private final Locale mLocale;
|
private final Locale mLocale;
|
||||||
private final long mDictSize;
|
private final long mDictSize;
|
||||||
|
@ -129,6 +134,7 @@ public final class BinaryDictionary extends Dictionary {
|
||||||
private static native void removeBigramWordsNative(long dict, int[] word0, int[] word1);
|
private static native void removeBigramWordsNative(long dict, int[] word0, int[] word1);
|
||||||
private static native int calculateProbabilityNative(long dict, int unigramProbability,
|
private static native int calculateProbabilityNative(long dict, int unigramProbability,
|
||||||
int bigramProbability);
|
int bigramProbability);
|
||||||
|
private static native String getPropertyNative(long dict, String query);
|
||||||
|
|
||||||
@UsedForTesting
|
@UsedForTesting
|
||||||
public static boolean createEmptyDictFile(final String filePath, final long dictVersion,
|
public static boolean createEmptyDictFile(final String filePath, final long dictVersion,
|
||||||
|
@ -331,6 +337,12 @@ public final class BinaryDictionary extends Dictionary {
|
||||||
return calculateProbabilityNative(mNativeDict, unigramProbability, bigramProbability);
|
return calculateProbabilityNative(mNativeDict, unigramProbability, bigramProbability);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@UsedForTesting
|
||||||
|
public String getPropertyForTests(String query) {
|
||||||
|
if (!isValidDictionary()) return "";
|
||||||
|
return getPropertyNative(mNativeDict, query);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean shouldAutoCommit(final SuggestedWordInfo candidate) {
|
public boolean shouldAutoCommit(final SuggestedWordInfo candidate) {
|
||||||
// TODO: actually use the confidence rather than use this completely broken heuristic
|
// TODO: actually use the confidence rather than use this completely broken heuristic
|
||||||
|
|
|
@ -323,6 +323,24 @@ static int latinime_BinaryDictionary_calculateProbabilityNative(JNIEnv *env, jcl
|
||||||
bigramProbability);
|
bigramProbability);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// JNI entry point registered as getPropertyNative: answers a string-keyed property query.
//
// dict:  native handle that is reinterpreted as a Dictionary pointer.
// query: Java string naming the property; handed to the structure policy as a NUL-terminated
//        modified UTF-8 string.
// Returns a new Java string holding whatever the structure policy wrote for the query. An empty
// string is returned when the handle or the query is null, or when the policy writes nothing.
static jstring latinime_BinaryDictionary_getProperty(JNIEnv *env, jclass clazz, jlong dict,
        jstring query) {
    Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
    // A null query must never reach GetStringUTFLength()/GetStringUTFRegion(); treat it the
    // same way as a missing dictionary and report "no value".
    if (!dictionary || !query) {
        return env->NewStringUTF("");
    }
    // Copy the query out of the VM. The region length is expressed in UTF-16 units, while the
    // buffer is sized in encoded bytes plus one for the terminator written below.
    const jsize queryUtf8Length = env->GetStringUTFLength(query);
    char queryChars[queryUtf8Length + 1];
    env->GetStringUTFRegion(query, 0, env->GetStringLength(query), queryChars);
    queryChars[queryUtf8Length] = '\0';
    static const int GET_PROPERTY_RESULT_LENGTH = 100;
    char resultChars[GET_PROPERTY_RESULT_LENGTH];
    // Pre-terminate the result so that a query the policy does not answer yields "".
    resultChars[0] = '\0';
    dictionary->getDictionaryStructurePolicy()->getProperty(queryChars, resultChars,
            GET_PROPERTY_RESULT_LENGTH);
    return env->NewStringUTF(resultChars);
}
|
||||||
|
|
||||||
static const JNINativeMethod sMethods[] = {
|
static const JNINativeMethod sMethods[] = {
|
||||||
{
|
{
|
||||||
const_cast<char *>("createEmptyDictFileNative"),
|
const_cast<char *>("createEmptyDictFileNative"),
|
||||||
|
@ -398,6 +416,11 @@ static const JNINativeMethod sMethods[] = {
|
||||||
const_cast<char *>("calculateProbabilityNative"),
|
const_cast<char *>("calculateProbabilityNative"),
|
||||||
const_cast<char *>("(JII)I"),
|
const_cast<char *>("(JII)I"),
|
||||||
reinterpret_cast<void *>(latinime_BinaryDictionary_calculateProbabilityNative)
|
reinterpret_cast<void *>(latinime_BinaryDictionary_calculateProbabilityNative)
|
||||||
|
},
|
||||||
|
{
|
||||||
|
const_cast<char *>("getPropertyNative"),
|
||||||
|
const_cast<char *>("(JLjava/lang/String;)Ljava/lang/String;"),
|
||||||
|
reinterpret_cast<void *>(latinime_BinaryDictionary_getProperty)
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -125,6 +125,11 @@ bool Dictionary::needsToRunGC() {
|
||||||
return mDictionaryStructureWithBufferPolicy->needsToRunGC();
|
return mDictionaryStructureWithBufferPolicy->needsToRunGC();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void Dictionary::getProperty(const char *const query, char *const outResult,
|
||||||
|
const int maxResultLength) const {
|
||||||
|
return mDictionaryStructureWithBufferPolicy->getProperty(query, outResult, maxResultLength);
|
||||||
|
}
|
||||||
|
|
||||||
void Dictionary::logDictionaryInfo(JNIEnv *const env) const {
|
void Dictionary::logDictionaryInfo(JNIEnv *const env) const {
|
||||||
const int BUFFER_SIZE = 16;
|
const int BUFFER_SIZE = 16;
|
||||||
int dictionaryIdCodePointBuffer[BUFFER_SIZE];
|
int dictionaryIdCodePointBuffer[BUFFER_SIZE];
|
||||||
|
|
|
@ -83,6 +83,9 @@ class Dictionary {
|
||||||
|
|
||||||
bool needsToRunGC();
|
bool needsToRunGC();
|
||||||
|
|
||||||
|
void getProperty(const char *const query, char *const outResult,
|
||||||
|
const int maxResultLength) const;
|
||||||
|
|
||||||
const DictionaryStructureWithBufferPolicy *getDictionaryStructurePolicy() const {
|
const DictionaryStructureWithBufferPolicy *getDictionaryStructurePolicy() const {
|
||||||
return mDictionaryStructureWithBufferPolicy;
|
return mDictionaryStructureWithBufferPolicy;
|
||||||
}
|
}
|
||||||
|
|
|
@ -80,6 +80,9 @@ class DictionaryStructureWithBufferPolicy {
|
||||||
|
|
||||||
virtual bool needsToRunGC() const = 0;
|
virtual bool needsToRunGC() const = 0;
|
||||||
|
|
||||||
|
virtual void getProperty(const char *const query, char *const outResult,
|
||||||
|
const int maxResultLength) const = 0;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
DictionaryStructureWithBufferPolicy() {}
|
DictionaryStructureWithBufferPolicy() {}
|
||||||
|
|
||||||
|
|
|
@ -119,7 +119,7 @@ bool DynamicBigramListPolicy::copyAllBigrams(BufferWithExtendableBuffer *const b
|
||||||
// Finding useless bigram entries and remove them. Bigram entry is useless when the target PtNode
|
// Finding useless bigram entries and remove them. Bigram entry is useless when the target PtNode
|
||||||
// has been deleted or is not a valid terminal.
|
// has been deleted or is not a valid terminal.
|
||||||
bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(
|
bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(
|
||||||
int *const bigramListPos) {
|
int *const bigramListPos, int *const outValidBigramEntryCount) {
|
||||||
const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos);
|
const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos);
|
||||||
if (usesAdditionalBuffer) {
|
if (usesAdditionalBuffer) {
|
||||||
*bigramListPos -= mBuffer->getOriginalBufferSize();
|
*bigramListPos -= mBuffer->getOriginalBufferSize();
|
||||||
|
@ -161,6 +161,8 @@ bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(
|
||||||
NOT_A_DICT_POS /* targetOffset */, &bigramEntryPos)) {
|
NOT_A_DICT_POS /* targetOffset */, &bigramEntryPos)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
(*outValidBigramEntryCount) += 1;
|
||||||
}
|
}
|
||||||
} while(BigramListReadWriteUtils::hasNext(bigramFlags));
|
} while(BigramListReadWriteUtils::hasNext(bigramFlags));
|
||||||
return true;
|
return true;
|
||||||
|
@ -169,7 +171,7 @@ bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(
|
||||||
// Updates bigram target PtNode positions in the list after the placing step in GC.
|
// Updates bigram target PtNode positions in the list after the placing step in GC.
|
||||||
bool DynamicBigramListPolicy::updateAllBigramTargetPtNodePositions(int *const bigramListPos,
|
bool DynamicBigramListPolicy::updateAllBigramTargetPtNodePositions(int *const bigramListPos,
|
||||||
const DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap *const
|
const DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap *const
|
||||||
ptNodePositionRelocationMap) {
|
ptNodePositionRelocationMap, int *const outBigramEntryCount) {
|
||||||
const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos);
|
const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos);
|
||||||
if (usesAdditionalBuffer) {
|
if (usesAdditionalBuffer) {
|
||||||
*bigramListPos -= mBuffer->getOriginalBufferSize();
|
*bigramListPos -= mBuffer->getOriginalBufferSize();
|
||||||
|
@ -211,11 +213,12 @@ bool DynamicBigramListPolicy::updateAllBigramTargetPtNodePositions(int *const bi
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
} while(BigramListReadWriteUtils::hasNext(bigramFlags));
|
} while(BigramListReadWriteUtils::hasNext(bigramFlags));
|
||||||
|
(*outBigramEntryCount) = bigramEntryCount;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTargetPos,
|
bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTargetPos,
|
||||||
const int probability, int *const bigramListPos) {
|
const int probability, int *const bigramListPos, bool *const outAddedNewBigram) {
|
||||||
const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos);
|
const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos);
|
||||||
if (usesAdditionalBuffer) {
|
if (usesAdditionalBuffer) {
|
||||||
*bigramListPos -= mBuffer->getOriginalBufferSize();
|
*bigramListPos -= mBuffer->getOriginalBufferSize();
|
||||||
|
@ -243,6 +246,7 @@ bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTarg
|
||||||
}
|
}
|
||||||
if (followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos) == bigramTargetPos) {
|
if (followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos) == bigramTargetPos) {
|
||||||
// Update this bigram entry.
|
// Update this bigram entry.
|
||||||
|
*outAddedNewBigram = false;
|
||||||
const BigramListReadWriteUtils::BigramFlags updatedFlags =
|
const BigramListReadWriteUtils::BigramFlags updatedFlags =
|
||||||
BigramListReadWriteUtils::setProbabilityInFlags(bigramFlags, probability);
|
BigramListReadWriteUtils::setProbabilityInFlags(bigramFlags, probability);
|
||||||
return BigramListReadWriteUtils::writeBigramEntry(mBuffer, updatedFlags,
|
return BigramListReadWriteUtils::writeBigramEntry(mBuffer, updatedFlags,
|
||||||
|
@ -254,12 +258,14 @@ bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTarg
|
||||||
// The current last entry is found.
|
// The current last entry is found.
|
||||||
// First, update the flags of the last entry.
|
// First, update the flags of the last entry.
|
||||||
if (!BigramListReadWriteUtils::setHasNextFlag(mBuffer, true /* hasNext */, entryPos)) {
|
if (!BigramListReadWriteUtils::setHasNextFlag(mBuffer, true /* hasNext */, entryPos)) {
|
||||||
|
*outAddedNewBigram = false;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (usesAdditionalBuffer) {
|
if (usesAdditionalBuffer) {
|
||||||
*bigramListPos += mBuffer->getOriginalBufferSize();
|
*bigramListPos += mBuffer->getOriginalBufferSize();
|
||||||
}
|
}
|
||||||
// Then, add a new entry after the last entry.
|
// Then, add a new entry after the last entry.
|
||||||
|
*outAddedNewBigram = true;
|
||||||
return writeNewBigramEntry(bigramTargetPos, probability, bigramListPos);
|
return writeNewBigramEntry(bigramTargetPos, probability, bigramListPos);
|
||||||
} while(BigramListReadWriteUtils::hasNext(bigramFlags));
|
} while(BigramListReadWriteUtils::hasNext(bigramFlags));
|
||||||
// We return directly from the while loop.
|
// We return directly from the while loop.
|
||||||
|
|
|
@ -50,19 +50,20 @@ class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy {
|
||||||
bool copyAllBigrams(BufferWithExtendableBuffer *const bufferToWrite, int *const fromPos,
|
bool copyAllBigrams(BufferWithExtendableBuffer *const bufferToWrite, int *const fromPos,
|
||||||
int *const toPos, int *const outBigramsCount) const;
|
int *const toPos, int *const outBigramsCount) const;
|
||||||
|
|
||||||
bool updateAllBigramEntriesAndDeleteUselessEntries(int *const bigramListPos);
|
// Updates the bigram list at *bigramListPos and removes useless entries. The definition
// increments *outValidBigramEntryCount rather than resetting it, so the caller must
// initialize the counter before the call. The parameter name matches the definition.
bool updateAllBigramEntriesAndDeleteUselessEntries(int *const bigramListPos,
        int *const outValidBigramEntryCount);
|
||||||
|
|
||||||
bool updateAllBigramTargetPtNodePositions(int *const bigramListPos,
|
bool updateAllBigramTargetPtNodePositions(int *const bigramListPos,
|
||||||
const DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap *const
|
const DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap *const
|
||||||
ptNodePositionRelocationMap);
|
ptNodePositionRelocationMap, int *const outValidBigramEntryCount);
|
||||||
|
|
||||||
bool addNewBigramEntryToBigramList(const int bigramTargetPos, const int probability,
|
bool addNewBigramEntryToBigramList(const int bigramTargetPos, const int probability,
|
||||||
int *const bigramListPos);
|
int *const bigramListPos, bool *const outAddedNewBigram);
|
||||||
|
|
||||||
bool writeNewBigramEntry(const int bigramTargetPos, const int probability,
|
bool writeNewBigramEntry(const int bigramTargetPos, const int probability,
|
||||||
int *const writingPos);
|
int *const writingPos);
|
||||||
|
|
||||||
// Return if targetBigramPos is found or not.
|
// Return whether or not targetBigramPos is found.
|
||||||
bool removeBigram(const int bigramListPos, const int bigramTargetPos);
|
bool removeBigram(const int bigramListPos, const int bigramTargetPos);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
|
@ -42,6 +42,9 @@ bool DynamicPatriciaTrieGcEventListeners
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
valueStack.back() += 1;
|
valueStack.back() += 1;
|
||||||
|
if (node->isTerminal()) {
|
||||||
|
mValidUnigramCount += 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -137,10 +140,15 @@ bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateAllPositionField
|
||||||
// Updates bigram target PtNode positions in the bigram list.
|
// Updates bigram target PtNode positions in the bigram list.
|
||||||
int bigramsPos = node->getBigramsPos();
|
int bigramsPos = node->getBigramsPos();
|
||||||
if (bigramsPos != NOT_A_DICT_POS) {
|
if (bigramsPos != NOT_A_DICT_POS) {
|
||||||
|
int bigramEntryCount;
|
||||||
if (!mBigramPolicy->updateAllBigramTargetPtNodePositions(&bigramsPos,
|
if (!mBigramPolicy->updateAllBigramTargetPtNodePositions(&bigramsPos,
|
||||||
&mDictPositionRelocationMap->mPtNodePositionRelocationMap)) {
|
&mDictPositionRelocationMap->mPtNodePositionRelocationMap, &bigramEntryCount)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
mBigramCount += bigramEntryCount;
|
||||||
|
}
|
||||||
|
if (node->isTerminal()) {
|
||||||
|
mUnigramCount++;
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
|
|
@ -41,7 +41,7 @@ class DynamicPatriciaTrieGcEventListeners {
|
||||||
DynamicPatriciaTrieWritingHelper *const writingHelper,
|
DynamicPatriciaTrieWritingHelper *const writingHelper,
|
||||||
BufferWithExtendableBuffer *const buffer)
|
BufferWithExtendableBuffer *const buffer)
|
||||||
: mWritingHelper(writingHelper), mBuffer(buffer), valueStack(),
|
: mWritingHelper(writingHelper), mBuffer(buffer), valueStack(),
|
||||||
mChildrenValue(0) {}
|
mChildrenValue(0), mValidUnigramCount(0) {}
|
||||||
|
|
||||||
~TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted() {};
|
~TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted() {};
|
||||||
|
|
||||||
|
@ -64,6 +64,10 @@ class DynamicPatriciaTrieGcEventListeners {
|
||||||
bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node,
|
bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node,
|
||||||
const int *const nodeCodePoints);
|
const int *const nodeCodePoints);
|
||||||
|
|
||||||
|
int getValidUnigramCount() const {
|
||||||
|
return mValidUnigramCount;
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(
|
DISALLOW_IMPLICIT_CONSTRUCTORS(
|
||||||
TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted);
|
TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted);
|
||||||
|
@ -72,6 +76,7 @@ class DynamicPatriciaTrieGcEventListeners {
|
||||||
BufferWithExtendableBuffer *const mBuffer;
|
BufferWithExtendableBuffer *const mBuffer;
|
||||||
std::vector<int> valueStack;
|
std::vector<int> valueStack;
|
||||||
int mChildrenValue;
|
int mChildrenValue;
|
||||||
|
int mValidUnigramCount;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Updates all bigram entries that are held by valid PtNodes. This removes useless bigram
|
// Updates all bigram entries that are held by valid PtNodes. This removes useless bigram
|
||||||
|
@ -80,7 +85,7 @@ class DynamicPatriciaTrieGcEventListeners {
|
||||||
: public DynamicPatriciaTrieReadingHelper::TraversingEventListener {
|
: public DynamicPatriciaTrieReadingHelper::TraversingEventListener {
|
||||||
public:
|
public:
|
||||||
TraversePolicyToUpdateBigramProbability(DynamicBigramListPolicy *const bigramPolicy)
|
TraversePolicyToUpdateBigramProbability(DynamicBigramListPolicy *const bigramPolicy)
|
||||||
: mBigramPolicy(bigramPolicy) {}
|
: mBigramPolicy(bigramPolicy), mValidBigramEntryCount(0) {}
|
||||||
|
|
||||||
bool onAscend() { return true; }
|
bool onAscend() { return true; }
|
||||||
|
|
||||||
|
@ -93,18 +98,26 @@ class DynamicPatriciaTrieGcEventListeners {
|
||||||
if (!node->isDeleted()) {
|
if (!node->isDeleted()) {
|
||||||
int pos = node->getBigramsPos();
|
int pos = node->getBigramsPos();
|
||||||
if (pos != NOT_A_DICT_POS) {
|
if (pos != NOT_A_DICT_POS) {
|
||||||
if (!mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries(&pos)) {
|
int bigramEntryCount = 0;
|
||||||
|
if (!mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries(&pos,
|
||||||
|
&bigramEntryCount)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
mValidBigramEntryCount += bigramEntryCount;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int getValidBigramEntryCount() const {
|
||||||
|
return mValidBigramEntryCount;
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateBigramProbability);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateBigramProbability);
|
||||||
|
|
||||||
DynamicBigramListPolicy *const mBigramPolicy;
|
DynamicBigramListPolicy *const mBigramPolicy;
|
||||||
|
int mValidBigramEntryCount;
|
||||||
};
|
};
|
||||||
|
|
||||||
class TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
|
class TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
|
||||||
|
@ -150,7 +163,8 @@ class DynamicPatriciaTrieGcEventListeners {
|
||||||
dictPositionRelocationMap)
|
dictPositionRelocationMap)
|
||||||
: mWritingHelper(writingHelper), mBigramPolicy(bigramPolicy),
|
: mWritingHelper(writingHelper), mBigramPolicy(bigramPolicy),
|
||||||
mBufferToWrite(bufferToWrite),
|
mBufferToWrite(bufferToWrite),
|
||||||
mDictPositionRelocationMap(dictPositionRelocationMap) {};
|
mDictPositionRelocationMap(dictPositionRelocationMap), mUnigramCount(0),
|
||||||
|
mBigramCount(0) {};
|
||||||
|
|
||||||
bool onAscend() { return true; }
|
bool onAscend() { return true; }
|
||||||
|
|
||||||
|
@ -161,6 +175,14 @@ class DynamicPatriciaTrieGcEventListeners {
|
||||||
bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node,
|
bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node,
|
||||||
const int *const nodeCodePoints);
|
const int *const nodeCodePoints);
|
||||||
|
|
||||||
|
int getUnigramCount() const {
|
||||||
|
return mUnigramCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
int getBigramCount() const {
|
||||||
|
return mBigramCount;
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllPositionFields);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllPositionFields);
|
||||||
|
|
||||||
|
@ -169,6 +191,8 @@ class DynamicPatriciaTrieGcEventListeners {
|
||||||
BufferWithExtendableBuffer *const mBufferToWrite;
|
BufferWithExtendableBuffer *const mBufferToWrite;
|
||||||
const DynamicPatriciaTrieWritingHelper::DictPositionRelocationMap *const
|
const DynamicPatriciaTrieWritingHelper::DictPositionRelocationMap *const
|
||||||
mDictPositionRelocationMap;
|
mDictPositionRelocationMap;
|
||||||
|
int mUnigramCount;
|
||||||
|
int mBigramCount;
|
||||||
};
|
};
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
|
@ -16,6 +16,9 @@
|
||||||
|
|
||||||
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h"
|
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h"
|
||||||
|
|
||||||
|
#include <cstdio>
|
||||||
|
#include <cstring>
|
||||||
|
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
#include "suggest/core/dicnode/dic_node.h"
|
#include "suggest/core/dicnode/dic_node.h"
|
||||||
#include "suggest/core/dicnode/dic_node_vector.h"
|
#include "suggest/core/dicnode/dic_node_vector.h"
|
||||||
|
@ -28,6 +31,9 @@
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
|
const char *const DynamicPatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
|
||||||
|
const char *const DynamicPatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
|
||||||
|
|
||||||
void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode,
|
void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode,
|
||||||
DicNodeVector *const childDicNodes) const {
|
DicNodeVector *const childDicNodes) const {
|
||||||
if (!dicNode->hasChildren()) {
|
if (!dicNode->hasChildren()) {
|
||||||
|
@ -198,7 +204,16 @@ bool DynamicPatriciaTriePolicy::addUnigramWord(const int *const word, const int
|
||||||
readingHelper.initWithPtNodeArrayPos(getRootPosition());
|
readingHelper.initWithPtNodeArrayPos(getRootPosition());
|
||||||
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
|
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
|
||||||
&mBigramListPolicy, &mShortcutListPolicy);
|
&mBigramListPolicy, &mShortcutListPolicy);
|
||||||
return writingHelper.addUnigramWord(&readingHelper, word, length, probability);
|
bool addedNewUnigram = false;
|
||||||
|
if (writingHelper.addUnigramWord(&readingHelper, word, length, probability,
|
||||||
|
&addedNewUnigram)) {
|
||||||
|
if (addedNewUnigram) {
|
||||||
|
mUnigramCount++;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int length0,
|
bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int length0,
|
||||||
|
@ -219,7 +234,15 @@ bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int
|
||||||
}
|
}
|
||||||
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
|
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
|
||||||
&mBigramListPolicy, &mShortcutListPolicy);
|
&mBigramListPolicy, &mShortcutListPolicy);
|
||||||
return writingHelper.addBigramWords(word0Pos, word1Pos, probability);
|
bool addedNewBigram = false;
|
||||||
|
if (writingHelper.addBigramWords(word0Pos, word1Pos, probability, &addedNewBigram)) {
|
||||||
|
if (addedNewBigram) {
|
||||||
|
mBigramCount++;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const int length0,
|
bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const int length0,
|
||||||
|
@ -240,7 +263,12 @@ bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const
|
||||||
}
|
}
|
||||||
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
|
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
|
||||||
&mBigramListPolicy, &mShortcutListPolicy);
|
&mBigramListPolicy, &mShortcutListPolicy);
|
||||||
return writingHelper.removeBigramWords(word0Pos, word1Pos);
|
if (writingHelper.removeBigramWords(word0Pos, word1Pos)) {
|
||||||
|
mBigramCount--;
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void DynamicPatriciaTriePolicy::flush(const char *const filePath) {
|
void DynamicPatriciaTriePolicy::flush(const char *const filePath) {
|
||||||
|
@ -250,7 +278,7 @@ void DynamicPatriciaTriePolicy::flush(const char *const filePath) {
|
||||||
}
|
}
|
||||||
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
|
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
|
||||||
&mBigramListPolicy, &mShortcutListPolicy);
|
&mBigramListPolicy, &mShortcutListPolicy);
|
||||||
writingHelper.writeToDictFile(filePath, &mHeaderPolicy);
|
writingHelper.writeToDictFile(filePath, &mHeaderPolicy, mUnigramCount, mBigramCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) {
|
void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) {
|
||||||
|
@ -272,4 +300,13 @@ bool DynamicPatriciaTriePolicy::needsToRunGC() const {
|
||||||
return mBufferWithExtendableBuffer.isNearSizeLimit();
|
return mBufferWithExtendableBuffer.isNearSizeLimit();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void DynamicPatriciaTriePolicy::getProperty(const char *const query, char *const outResult,
|
||||||
|
const int maxResultLength) const {
|
||||||
|
if (strncmp(query, UNIGRAM_COUNT_QUERY, maxResultLength) == 0) {
|
||||||
|
snprintf(outResult, maxResultLength, "%d", mUnigramCount);
|
||||||
|
} else if (strncmp(query, BIGRAM_COUNT_QUERY, maxResultLength) == 0) {
|
||||||
|
snprintf(outResult, maxResultLength, "%d", mBigramCount);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
|
|
|
@ -37,7 +37,9 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
mBufferWithExtendableBuffer(mBuffer->getBuffer() + mHeaderPolicy.getSize(),
|
mBufferWithExtendableBuffer(mBuffer->getBuffer() + mHeaderPolicy.getSize(),
|
||||||
mBuffer->getBufferSize() - mHeaderPolicy.getSize()),
|
mBuffer->getBufferSize() - mHeaderPolicy.getSize()),
|
||||||
mShortcutListPolicy(&mBufferWithExtendableBuffer),
|
mShortcutListPolicy(&mBufferWithExtendableBuffer),
|
||||||
mBigramListPolicy(&mBufferWithExtendableBuffer, &mShortcutListPolicy) {}
|
mBigramListPolicy(&mBufferWithExtendableBuffer, &mShortcutListPolicy),
|
||||||
|
mUnigramCount(mHeaderPolicy.getUnigramCount()),
|
||||||
|
mBigramCount(mHeaderPolicy.getBigramCount()) {}
|
||||||
|
|
||||||
~DynamicPatriciaTriePolicy() {
|
~DynamicPatriciaTriePolicy() {
|
||||||
delete mBuffer;
|
delete mBuffer;
|
||||||
|
@ -91,14 +93,22 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
|
|
||||||
bool needsToRunGC() const;
|
bool needsToRunGC() const;
|
||||||
|
|
||||||
|
void getProperty(const char *const query, char *const outResult,
|
||||||
|
const int maxResultLength) const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTriePolicy);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTriePolicy);
|
||||||
|
|
||||||
|
static const char*const UNIGRAM_COUNT_QUERY;
|
||||||
|
static const char*const BIGRAM_COUNT_QUERY;
|
||||||
|
|
||||||
const MmappedBuffer *const mBuffer;
|
const MmappedBuffer *const mBuffer;
|
||||||
const HeaderPolicy mHeaderPolicy;
|
const HeaderPolicy mHeaderPolicy;
|
||||||
BufferWithExtendableBuffer mBufferWithExtendableBuffer;
|
BufferWithExtendableBuffer mBufferWithExtendableBuffer;
|
||||||
DynamicShortcutListPolicy mShortcutListPolicy;
|
DynamicShortcutListPolicy mShortcutListPolicy;
|
||||||
DynamicBigramListPolicy mBigramListPolicy;
|
DynamicBigramListPolicy mBigramListPolicy;
|
||||||
|
int mUnigramCount;
|
||||||
|
int mBigramCount;
|
||||||
};
|
};
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
#endif // LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H
|
#endif // LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H
|
||||||
|
|
|
@ -36,7 +36,8 @@ const size_t DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE = 2 * 1024 *
|
||||||
|
|
||||||
bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
|
bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
|
||||||
DynamicPatriciaTrieReadingHelper *const readingHelper,
|
DynamicPatriciaTrieReadingHelper *const readingHelper,
|
||||||
const int *const wordCodePoints, const int codePointCount, const int probability) {
|
const int *const wordCodePoints, const int codePointCount, const int probability,
|
||||||
|
bool *const outAddedNewUnigram) {
|
||||||
int parentPos = NOT_A_DICT_POS;
|
int parentPos = NOT_A_DICT_POS;
|
||||||
while (!readingHelper->isEnd()) {
|
while (!readingHelper->isEnd()) {
|
||||||
const int matchedCodePointCount = readingHelper->getPrevTotalCodePointCount();
|
const int matchedCodePointCount = readingHelper->getPrevTotalCodePointCount();
|
||||||
|
@ -54,6 +55,7 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
|
||||||
const int nextIndex = matchedCodePointCount + j;
|
const int nextIndex = matchedCodePointCount + j;
|
||||||
if (nextIndex >= codePointCount || !readingHelper->isMatchedCodePoint(j,
|
if (nextIndex >= codePointCount || !readingHelper->isMatchedCodePoint(j,
|
||||||
wordCodePoints[matchedCodePointCount + j])) {
|
wordCodePoints[matchedCodePointCount + j])) {
|
||||||
|
*outAddedNewUnigram = true;
|
||||||
return reallocatePtNodeAndAddNewPtNodes(nodeReader,
|
return reallocatePtNodeAndAddNewPtNodes(nodeReader,
|
||||||
readingHelper->getMergedNodeCodePoints(), j, probability,
|
readingHelper->getMergedNodeCodePoints(), j, probability,
|
||||||
wordCodePoints + matchedCodePointCount,
|
wordCodePoints + matchedCodePointCount,
|
||||||
|
@ -63,9 +65,10 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
|
||||||
// All characters are matched.
|
// All characters are matched.
|
||||||
if (codePointCount == readingHelper->getTotalCodePointCount()) {
|
if (codePointCount == readingHelper->getTotalCodePointCount()) {
|
||||||
return setPtNodeProbability(nodeReader, probability,
|
return setPtNodeProbability(nodeReader, probability,
|
||||||
readingHelper->getMergedNodeCodePoints());
|
readingHelper->getMergedNodeCodePoints(), outAddedNewUnigram);
|
||||||
}
|
}
|
||||||
if (!nodeReader->hasChildren()) {
|
if (!nodeReader->hasChildren()) {
|
||||||
|
*outAddedNewUnigram = true;
|
||||||
return createChildrenPtNodeArrayAndAChildPtNode(nodeReader, probability,
|
return createChildrenPtNodeArrayAndAChildPtNode(nodeReader, probability,
|
||||||
wordCodePoints + readingHelper->getTotalCodePointCount(),
|
wordCodePoints + readingHelper->getTotalCodePointCount(),
|
||||||
codePointCount - readingHelper->getTotalCodePointCount());
|
codePointCount - readingHelper->getTotalCodePointCount());
|
||||||
|
@ -79,6 +82,7 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
int pos = readingHelper->getPosOfLastForwardLinkField();
|
int pos = readingHelper->getPosOfLastForwardLinkField();
|
||||||
|
*outAddedNewUnigram = true;
|
||||||
return createAndInsertNodeIntoPtNodeArray(parentPos,
|
return createAndInsertNodeIntoPtNodeArray(parentPos,
|
||||||
wordCodePoints + readingHelper->getPrevTotalCodePointCount(),
|
wordCodePoints + readingHelper->getPrevTotalCodePointCount(),
|
||||||
codePointCount - readingHelper->getPrevTotalCodePointCount(),
|
codePointCount - readingHelper->getPrevTotalCodePointCount(),
|
||||||
|
@ -86,7 +90,7 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
|
||||||
}
|
}
|
||||||
|
|
||||||
bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const int word1Pos,
|
bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const int word1Pos,
|
||||||
const int probability) {
|
const int probability, bool *const outAddedNewBigram) {
|
||||||
int mMergedNodeCodePoints[MAX_WORD_LENGTH];
|
int mMergedNodeCodePoints[MAX_WORD_LENGTH];
|
||||||
DynamicPatriciaTrieNodeReader nodeReader(mBuffer, mBigramPolicy, mShortcutPolicy);
|
DynamicPatriciaTrieNodeReader nodeReader(mBuffer, mBigramPolicy, mShortcutPolicy);
|
||||||
nodeReader.fetchNodeInfoInBufferFromPtNodePosAndGetNodeCodePoints(word0Pos, MAX_WORD_LENGTH,
|
nodeReader.fetchNodeInfoInBufferFromPtNodePosAndGetNodeCodePoints(word0Pos, MAX_WORD_LENGTH,
|
||||||
|
@ -107,9 +111,11 @@ bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const
|
||||||
if (nodeReader.getBigramsPos() != NOT_A_DICT_POS) {
|
if (nodeReader.getBigramsPos() != NOT_A_DICT_POS) {
|
||||||
// Insert a new bigram entry into the existing bigram list.
|
// Insert a new bigram entry into the existing bigram list.
|
||||||
int bigramListPos = nodeReader.getBigramsPos();
|
int bigramListPos = nodeReader.getBigramsPos();
|
||||||
return mBigramPolicy->addNewBigramEntryToBigramList(word1Pos, probability, &bigramListPos);
|
return mBigramPolicy->addNewBigramEntryToBigramList(word1Pos, probability, &bigramListPos,
|
||||||
|
outAddedNewBigram);
|
||||||
} else {
|
} else {
|
||||||
// The PtNode doesn't have a bigram list.
|
// The PtNode doesn't have a bigram list.
|
||||||
|
*outAddedNewBigram = true;
|
||||||
// First, Write a bigram entry at the tail position of the PtNode.
|
// First, Write a bigram entry at the tail position of the PtNode.
|
||||||
if (!mBigramPolicy->writeNewBigramEntry(word1Pos, probability, &writingPos)) {
|
if (!mBigramPolicy->writeNewBigramEntry(word1Pos, probability, &writingPos)) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -138,9 +144,12 @@ bool DynamicPatriciaTrieWritingHelper::removeBigramWords(const int word0Pos, con
|
||||||
}
|
}
|
||||||
|
|
||||||
void DynamicPatriciaTrieWritingHelper::writeToDictFile(const char *const fileName,
|
void DynamicPatriciaTrieWritingHelper::writeToDictFile(const char *const fileName,
|
||||||
const HeaderPolicy *const headerPolicy) {
|
const HeaderPolicy *const headerPolicy, const int unigramCount, const int bigramCount) {
|
||||||
BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
|
BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
|
||||||
if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, false /* updatesLastUpdatedTime */)) {
|
const int extendedRegionSize = headerPolicy->getExtendedRegionSize() +
|
||||||
|
mBuffer->getTailPosition() - mBuffer->getOriginalBufferSize();
|
||||||
|
if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, false /* updatesLastUpdatedTime */,
|
||||||
|
unigramCount, bigramCount, extendedRegionSize)) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, mBuffer);
|
DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, mBuffer);
|
||||||
|
@ -148,13 +157,16 @@ void DynamicPatriciaTrieWritingHelper::writeToDictFile(const char *const fileNam
|
||||||
|
|
||||||
void DynamicPatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos,
|
void DynamicPatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos,
|
||||||
const char *const fileName, const HeaderPolicy *const headerPolicy) {
|
const char *const fileName, const HeaderPolicy *const headerPolicy) {
|
||||||
BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
|
|
||||||
if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */)) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
BufferWithExtendableBuffer newDictBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */,
|
BufferWithExtendableBuffer newDictBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */,
|
||||||
MAX_DICTIONARY_SIZE);
|
MAX_DICTIONARY_SIZE);
|
||||||
if (!runGC(rootPtNodeArrayPos, &newDictBuffer)) {
|
int unigramCount = 0;
|
||||||
|
int bigramCount = 0;
|
||||||
|
if (!runGC(rootPtNodeArrayPos, &newDictBuffer, &unigramCount, &bigramCount)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
|
||||||
|
if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */,
|
||||||
|
unigramCount, bigramCount, 0 /* extendedRegionSize */)) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, &newDictBuffer);
|
DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, &newDictBuffer);
|
||||||
|
@ -335,9 +347,10 @@ bool DynamicPatriciaTrieWritingHelper::createAndInsertNodeIntoPtNodeArray(const
|
||||||
|
|
||||||
bool DynamicPatriciaTrieWritingHelper::setPtNodeProbability(
|
bool DynamicPatriciaTrieWritingHelper::setPtNodeProbability(
|
||||||
const DynamicPatriciaTrieNodeReader *const originalPtNode, const int probability,
|
const DynamicPatriciaTrieNodeReader *const originalPtNode, const int probability,
|
||||||
const int *const codePoints) {
|
const int *const codePoints, bool *const outAddedNewUnigram) {
|
||||||
if (originalPtNode->isTerminal()) {
|
if (originalPtNode->isTerminal()) {
|
||||||
// Overwrites the probability.
|
// Overwrites the probability.
|
||||||
|
*outAddedNewUnigram = false;
|
||||||
int probabilityFieldPos = originalPtNode->getProbabilityFieldPos();
|
int probabilityFieldPos = originalPtNode->getProbabilityFieldPos();
|
||||||
if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition(mBuffer,
|
if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition(mBuffer,
|
||||||
probability, &probabilityFieldPos)) {
|
probability, &probabilityFieldPos)) {
|
||||||
|
@ -345,6 +358,7 @@ bool DynamicPatriciaTrieWritingHelper::setPtNodeProbability(
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Make the node terminal and write the probability.
|
// Make the node terminal and write the probability.
|
||||||
|
*outAddedNewUnigram = true;
|
||||||
int movedPos = mBuffer->getTailPosition();
|
int movedPos = mBuffer->getTailPosition();
|
||||||
if (!markNodeAsMovedAndSetPosition(originalPtNode, movedPos, movedPos)) {
|
if (!markNodeAsMovedAndSetPosition(originalPtNode, movedPos, movedPos)) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -460,7 +474,8 @@ bool DynamicPatriciaTrieWritingHelper::reallocatePtNodeAndAddNewPtNodes(
|
||||||
}
|
}
|
||||||
|
|
||||||
bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
|
bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
|
||||||
BufferWithExtendableBuffer *const bufferToWrite) {
|
BufferWithExtendableBuffer *const bufferToWrite, int *const outUnigramCount,
|
||||||
|
int *const outBigramCount) {
|
||||||
DynamicPatriciaTrieReadingHelper readingHelper(mBuffer, mBigramPolicy, mShortcutPolicy);
|
DynamicPatriciaTrieReadingHelper readingHelper(mBuffer, mBigramPolicy, mShortcutPolicy);
|
||||||
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
||||||
DynamicPatriciaTrieGcEventListeners
|
DynamicPatriciaTrieGcEventListeners
|
||||||
|
@ -505,6 +520,8 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
|
||||||
&traversePolicyToUpdateAllPositionFields)) {
|
&traversePolicyToUpdateAllPositionFields)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
*outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount();
|
||||||
|
*outBigramCount = traversePolicyToUpdateAllPositionFields.getBigramCount();
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -56,15 +56,18 @@ class DynamicPatriciaTrieWritingHelper {
|
||||||
|
|
||||||
// Add a word to the dictionary. If the word already exists, update the probability.
|
// Add a word to the dictionary. If the word already exists, update the probability.
|
||||||
bool addUnigramWord(DynamicPatriciaTrieReadingHelper *const readingHelper,
|
bool addUnigramWord(DynamicPatriciaTrieReadingHelper *const readingHelper,
|
||||||
const int *const wordCodePoints, const int codePointCount, const int probability);
|
const int *const wordCodePoints, const int codePointCount, const int probability,
|
||||||
|
bool *const outAddedNewUnigram);
|
||||||
|
|
||||||
// Add a bigram relation from word0Pos to word1Pos.
|
// Add a bigram relation from word0Pos to word1Pos.
|
||||||
bool addBigramWords(const int word0Pos, const int word1Pos, const int probability);
|
bool addBigramWords(const int word0Pos, const int word1Pos, const int probability,
|
||||||
|
bool *const outAddedNewBigram);
|
||||||
|
|
||||||
// Remove a bigram relation from word0Pos to word1Pos.
|
// Remove a bigram relation from word0Pos to word1Pos.
|
||||||
bool removeBigramWords(const int word0Pos, const int word1Pos);
|
bool removeBigramWords(const int word0Pos, const int word1Pos);
|
||||||
|
|
||||||
void writeToDictFile(const char *const fileName, const HeaderPolicy *const headerPolicy);
|
void writeToDictFile(const char *const fileName, const HeaderPolicy *const headerPolicy,
|
||||||
|
const int unigramCount, const int bigramCount);
|
||||||
|
|
||||||
void writeToDictFileWithGC(const int rootPtNodeArrayPos, const char *const fileName,
|
void writeToDictFileWithGC(const int rootPtNodeArrayPos, const char *const fileName,
|
||||||
const HeaderPolicy *const headerPolicy);
|
const HeaderPolicy *const headerPolicy);
|
||||||
|
@ -107,7 +110,7 @@ class DynamicPatriciaTrieWritingHelper {
|
||||||
const int nodeCodePointCount, const int probability, int *const forwardLinkFieldPos);
|
const int nodeCodePointCount, const int probability, int *const forwardLinkFieldPos);
|
||||||
|
|
||||||
bool setPtNodeProbability(const DynamicPatriciaTrieNodeReader *const originalNode,
|
bool setPtNodeProbability(const DynamicPatriciaTrieNodeReader *const originalNode,
|
||||||
const int probability, const int *const codePoints);
|
const int probability, const int *const codePoints, bool *const outAddedNewUnigram);
|
||||||
|
|
||||||
bool createChildrenPtNodeArrayAndAChildPtNode(
|
bool createChildrenPtNodeArrayAndAChildPtNode(
|
||||||
const DynamicPatriciaTrieNodeReader *const parentNode, const int probability,
|
const DynamicPatriciaTrieNodeReader *const parentNode, const int probability,
|
||||||
|
@ -122,7 +125,8 @@ class DynamicPatriciaTrieWritingHelper {
|
||||||
const int probabilityOfNewPtNode, const int *const newNodeCodePoints,
|
const int probabilityOfNewPtNode, const int *const newNodeCodePoints,
|
||||||
const int newNodeCodePointCount);
|
const int newNodeCodePointCount);
|
||||||
|
|
||||||
bool runGC(const int rootPtNodeArrayPos, BufferWithExtendableBuffer *const bufferToWrite);
|
bool runGC(const int rootPtNodeArrayPos, BufferWithExtendableBuffer *const bufferToWrite,
|
||||||
|
int *const outUnigramCount, int *const outBigramCount);
|
||||||
};
|
};
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H */
|
#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H */
|
||||||
|
|
|
@ -16,17 +16,15 @@
|
||||||
|
|
||||||
#include "suggest/policyimpl/dictionary/header/header_policy.h"
|
#include "suggest/policyimpl/dictionary/header/header_policy.h"
|
||||||
|
|
||||||
#include <cstddef>
|
|
||||||
#include <cstdio>
|
|
||||||
#include <ctime>
|
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
|
|
||||||
// Note that these are corresponding definitions in Java side in FormatSpec.FileHeader.
|
// Note that these are corresponding definitions in Java side in FormatSpec.FileHeader.
|
||||||
const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = "MULTIPLE_WORDS_DEMOTION_RATE";
|
const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = "MULTIPLE_WORDS_DEMOTION_RATE";
|
||||||
const char *const HeaderPolicy::USES_FORGETTING_CURVE_KEY = "USES_FORGETTING_CURVE";
|
const char *const HeaderPolicy::USES_FORGETTING_CURVE_KEY = "USES_FORGETTING_CURVE";
|
||||||
const char *const HeaderPolicy::LAST_UPDATED_TIME_KEY = "date";
|
const char *const HeaderPolicy::LAST_UPDATED_TIME_KEY = "date";
|
||||||
|
const char *const HeaderPolicy::UNIGRAM_COUNT_KEY = "UNIGRAM_COUNT";
|
||||||
|
const char *const HeaderPolicy::BIGRAM_COUNT_KEY = "BIGRAM_COUNT";
|
||||||
|
const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE";
|
||||||
const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100;
|
const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100;
|
||||||
const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f;
|
const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f;
|
||||||
|
|
||||||
|
@ -55,33 +53,17 @@ void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *out
|
||||||
}
|
}
|
||||||
|
|
||||||
float HeaderPolicy::readMultipleWordCostMultiplier() const {
|
float HeaderPolicy::readMultipleWordCostMultiplier() const {
|
||||||
std::vector<int> keyVector;
|
|
||||||
HeaderReadWriteUtils::insertCharactersIntoVector(MULTIPLE_WORDS_DEMOTION_RATE_KEY, &keyVector);
|
|
||||||
const int demotionRate = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
|
const int demotionRate = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
|
||||||
&keyVector, DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE);
|
MULTIPLE_WORDS_DEMOTION_RATE_KEY, DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE);
|
||||||
if (demotionRate <= 0) {
|
if (demotionRate <= 0) {
|
||||||
return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
|
return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
|
||||||
}
|
}
|
||||||
return MULTIPLE_WORD_COST_MULTIPLIER_SCALE / static_cast<float>(demotionRate);
|
return MULTIPLE_WORD_COST_MULTIPLIER_SCALE / static_cast<float>(demotionRate);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool HeaderPolicy::readUsesForgettingCurveFlag() const {
|
|
||||||
std::vector<int> keyVector;
|
|
||||||
HeaderReadWriteUtils::insertCharactersIntoVector(USES_FORGETTING_CURVE_KEY, &keyVector);
|
|
||||||
return HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, &keyVector,
|
|
||||||
false /* defaultValue */);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Returns current time when the key is not found or the value is invalid.
|
|
||||||
int HeaderPolicy::readLastUpdatedTime() const {
|
|
||||||
std::vector<int> keyVector;
|
|
||||||
HeaderReadWriteUtils::insertCharactersIntoVector(LAST_UPDATED_TIME_KEY, &keyVector);
|
|
||||||
return HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, &keyVector,
|
|
||||||
time(0) /* defaultValue */);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool HeaderPolicy::writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite,
|
bool HeaderPolicy::writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite,
|
||||||
const bool updatesLastUpdatedTime) const {
|
const bool updatesLastUpdatedTime, const int unigramCount, const int bigramCount,
|
||||||
|
const int extendedRegionSize) const {
|
||||||
int writingPos = 0;
|
int writingPos = 0;
|
||||||
if (!HeaderReadWriteUtils::writeDictionaryVersion(bufferToWrite, mDictFormatVersion,
|
if (!HeaderReadWriteUtils::writeDictionaryVersion(bufferToWrite, mDictFormatVersion,
|
||||||
&writingPos)) {
|
&writingPos)) {
|
||||||
|
@ -97,21 +79,19 @@ bool HeaderPolicy::writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferT
|
||||||
&writingPos)) {
|
&writingPos)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
HeaderReadWriteUtils::AttributeMap attributeMapTowrite(mAttributeMap);
|
||||||
|
HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, UNIGRAM_COUNT_KEY, unigramCount);
|
||||||
|
HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, BIGRAM_COUNT_KEY, bigramCount);
|
||||||
|
HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, EXTENDED_REGION_SIZE_KEY,
|
||||||
|
extendedRegionSize);
|
||||||
if (updatesLastUpdatedTime) {
|
if (updatesLastUpdatedTime) {
|
||||||
// Set current time as a last updated time.
|
// Set current time as a last updated time.
|
||||||
HeaderReadWriteUtils::AttributeMap attributeMapTowrite(mAttributeMap);
|
HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, LAST_UPDATED_TIME_KEY,
|
||||||
std::vector<int> updatedTimekey;
|
time(0));
|
||||||
HeaderReadWriteUtils::insertCharactersIntoVector(LAST_UPDATED_TIME_KEY, &updatedTimekey);
|
}
|
||||||
HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, &updatedTimekey, time(0));
|
if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &attributeMapTowrite,
|
||||||
if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &attributeMapTowrite,
|
&writingPos)) {
|
||||||
&writingPos)) {
|
return false;
|
||||||
return false;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &mAttributeMap,
|
|
||||||
&writingPos)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
// Writes an actual header size.
|
// Writes an actual header size.
|
||||||
if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(bufferToWrite, writingPos,
|
if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(bufferToWrite, writingPos,
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
#ifndef LATINIME_HEADER_POLICY_H
|
#ifndef LATINIME_HEADER_POLICY_H
|
||||||
#define LATINIME_HEADER_POLICY_H
|
#define LATINIME_HEADER_POLICY_H
|
||||||
|
|
||||||
|
#include <ctime>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
|
@ -35,8 +36,16 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
||||||
mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)),
|
mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)),
|
||||||
mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)),
|
mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)),
|
||||||
mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
|
mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
|
||||||
mUsesForgettingCurve(readUsesForgettingCurveFlag()),
|
mUsesForgettingCurve(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
|
||||||
mLastUpdatedTime(readLastUpdatedTime()) {}
|
USES_FORGETTING_CURVE_KEY, false /* defaultValue */)),
|
||||||
|
mLastUpdatedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
|
||||||
|
LAST_UPDATED_TIME_KEY, time(0) /* defaultValue */)),
|
||||||
|
mUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
|
||||||
|
UNIGRAM_COUNT_KEY, 0 /* defaultValue */)),
|
||||||
|
mBigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
|
||||||
|
BIGRAM_COUNT_KEY, 0 /* defaultValue */)),
|
||||||
|
mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
|
||||||
|
EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)) {}
|
||||||
|
|
||||||
// Constructs header information using an attribute map.
|
// Constructs header information using an attribute map.
|
||||||
HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,
|
HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,
|
||||||
|
@ -44,9 +53,12 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
||||||
: mDictFormatVersion(dictFormatVersion),
|
: mDictFormatVersion(dictFormatVersion),
|
||||||
mDictionaryFlags(HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap(
|
mDictionaryFlags(HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap(
|
||||||
attributeMap)), mSize(0), mAttributeMap(*attributeMap),
|
attributeMap)), mSize(0), mAttributeMap(*attributeMap),
|
||||||
mMultiWordCostMultiplier(readUsesForgettingCurveFlag()),
|
mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
|
||||||
mUsesForgettingCurve(readUsesForgettingCurveFlag()),
|
mUsesForgettingCurve(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
|
||||||
mLastUpdatedTime(readLastUpdatedTime()) {}
|
USES_FORGETTING_CURVE_KEY, false /* defaultValue */)),
|
||||||
|
mLastUpdatedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
|
||||||
|
LAST_UPDATED_TIME_KEY, time(0) /* defaultValue */)),
|
||||||
|
mUnigramCount(0), mBigramCount(0), mExtendedRegionSize(0) {}
|
||||||
|
|
||||||
~HeaderPolicy() {}
|
~HeaderPolicy() {}
|
||||||
|
|
||||||
|
@ -78,11 +90,24 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
||||||
return mLastUpdatedTime;
|
return mLastUpdatedTime;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
AK_FORCE_INLINE int getUnigramCount() const {
|
||||||
|
return mUnigramCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
AK_FORCE_INLINE int getBigramCount() const {
|
||||||
|
return mBigramCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
AK_FORCE_INLINE int getExtendedRegionSize() const {
|
||||||
|
return mExtendedRegionSize;
|
||||||
|
}
|
||||||
|
|
||||||
void readHeaderValueOrQuestionMark(const char *const key,
|
void readHeaderValueOrQuestionMark(const char *const key,
|
||||||
int *outValue, int outValueSize) const;
|
int *outValue, int outValueSize) const;
|
||||||
|
|
||||||
bool writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite,
|
bool writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite,
|
||||||
const bool updatesLastUpdatedTime) const;
|
const bool updatesLastUpdatedTime, const int unigramCount,
|
||||||
|
const int bigramCount, const int extendedRegionSize) const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderPolicy);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderPolicy);
|
||||||
|
@ -90,6 +115,9 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
||||||
static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY;
|
static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY;
|
||||||
static const char *const USES_FORGETTING_CURVE_KEY;
|
static const char *const USES_FORGETTING_CURVE_KEY;
|
||||||
static const char *const LAST_UPDATED_TIME_KEY;
|
static const char *const LAST_UPDATED_TIME_KEY;
|
||||||
|
static const char *const UNIGRAM_COUNT_KEY;
|
||||||
|
static const char *const BIGRAM_COUNT_KEY;
|
||||||
|
static const char *const EXTENDED_REGION_SIZE_KEY;
|
||||||
static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE;
|
static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE;
|
||||||
static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE;
|
static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE;
|
||||||
|
|
||||||
|
@ -100,13 +128,12 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
||||||
const float mMultiWordCostMultiplier;
|
const float mMultiWordCostMultiplier;
|
||||||
const bool mUsesForgettingCurve;
|
const bool mUsesForgettingCurve;
|
||||||
const int mLastUpdatedTime;
|
const int mLastUpdatedTime;
|
||||||
|
const int mUnigramCount;
|
||||||
|
const int mBigramCount;
|
||||||
|
const int mExtendedRegionSize;
|
||||||
|
|
||||||
float readMultipleWordCostMultiplier() const;
|
float readMultipleWordCostMultiplier() const;
|
||||||
|
|
||||||
bool readUsesForgettingCurveFlag() const;
|
|
||||||
|
|
||||||
int readLastUpdatedTime() const;
|
|
||||||
|
|
||||||
static HeaderReadWriteUtils::AttributeMap createAttributeMapAndReadAllAttributes(
|
static HeaderReadWriteUtils::AttributeMap createAttributeMapAndReadAllAttributes(
|
||||||
const uint8_t *const dictBuf);
|
const uint8_t *const dictBuf);
|
||||||
};
|
};
|
||||||
|
|
|
@ -68,18 +68,12 @@ const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY
|
||||||
/* static */ HeaderReadWriteUtils::DictionaryFlags
|
/* static */ HeaderReadWriteUtils::DictionaryFlags
|
||||||
HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap(
|
HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap(
|
||||||
const HeaderReadWriteUtils::AttributeMap *const attributeMap) {
|
const HeaderReadWriteUtils::AttributeMap *const attributeMap) {
|
||||||
AttributeMap::key_type key;
|
const bool requiresGermanUmlautProcessing = readBoolAttributeValue(attributeMap,
|
||||||
insertCharactersIntoVector(REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, &key);
|
REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, false /* defaultValue */);
|
||||||
const bool requiresGermanUmlautProcessing = readBoolAttributeValue(attributeMap, &key,
|
const bool requiresFrenchLigatureProcessing = readBoolAttributeValue(attributeMap,
|
||||||
false /* defaultValue */);
|
REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY, false /* defaultValue */);
|
||||||
key.clear();
|
const bool supportsDynamicUpdate = readBoolAttributeValue(attributeMap,
|
||||||
insertCharactersIntoVector(REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY, &key);
|
SUPPORTS_DYNAMIC_UPDATE_KEY, false /* defaultValue */);
|
||||||
const bool requiresFrenchLigatureProcessing = readBoolAttributeValue(attributeMap, &key,
|
|
||||||
false /* defaultValue */);
|
|
||||||
key.clear();
|
|
||||||
insertCharactersIntoVector(SUPPORTS_DYNAMIC_UPDATE_KEY, &key);
|
|
||||||
const bool supportsDynamicUpdate = readBoolAttributeValue(attributeMap, &key,
|
|
||||||
false /* defaultValue */);
|
|
||||||
DictionaryFlags dictflags = NO_FLAGS;
|
DictionaryFlags dictflags = NO_FLAGS;
|
||||||
dictflags |= requiresGermanUmlautProcessing ? GERMAN_UMLAUT_PROCESSING_FLAG : 0;
|
dictflags |= requiresGermanUmlautProcessing ? GERMAN_UMLAUT_PROCESSING_FLAG : 0;
|
||||||
dictflags |= requiresFrenchLigatureProcessing ? FRENCH_LIGATURE_PROCESSING_FLAG : 0;
|
dictflags |= requiresFrenchLigatureProcessing ? FRENCH_LIGATURE_PROCESSING_FLAG : 0;
|
||||||
|
@ -160,11 +154,18 @@ const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */ void HeaderReadWriteUtils::setBoolAttribute(AttributeMap *const headerAttributes,
|
/* static */ void HeaderReadWriteUtils::setBoolAttribute(AttributeMap *const headerAttributes,
|
||||||
const AttributeMap::key_type *const key, const bool value) {
|
const char *const key, const bool value) {
|
||||||
setIntAttribute(headerAttributes, key, value ? 1 : 0);
|
setIntAttribute(headerAttributes, key, value ? 1 : 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */ void HeaderReadWriteUtils::setIntAttribute(AttributeMap *const headerAttributes,
|
/* static */ void HeaderReadWriteUtils::setIntAttribute(AttributeMap *const headerAttributes,
|
||||||
|
const char *const key, const int value) {
|
||||||
|
AttributeMap::key_type keyVector;
|
||||||
|
insertCharactersIntoVector(key, &keyVector);
|
||||||
|
setIntAttributeInner(headerAttributes, &keyVector, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* static */ void HeaderReadWriteUtils::setIntAttributeInner(AttributeMap *const headerAttributes,
|
||||||
const AttributeMap::key_type *const key, const int value) {
|
const AttributeMap::key_type *const key, const int value) {
|
||||||
AttributeMap::mapped_type valueVector;
|
AttributeMap::mapped_type valueVector;
|
||||||
char charBuf[LARGEST_INT_DIGIT_COUNT + 1];
|
char charBuf[LARGEST_INT_DIGIT_COUNT + 1];
|
||||||
|
@ -174,7 +175,7 @@ const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */ bool HeaderReadWriteUtils::readBoolAttributeValue(
|
/* static */ bool HeaderReadWriteUtils::readBoolAttributeValue(
|
||||||
const AttributeMap *const headerAttributes, const AttributeMap::key_type *const key,
|
const AttributeMap *const headerAttributes, const char *const key,
|
||||||
const bool defaultValue) {
|
const bool defaultValue) {
|
||||||
const int intDefaultValue = defaultValue ? 1 : 0;
|
const int intDefaultValue = defaultValue ? 1 : 0;
|
||||||
const int intValue = readIntAttributeValue(headerAttributes, key, intDefaultValue);
|
const int intValue = readIntAttributeValue(headerAttributes, key, intDefaultValue);
|
||||||
|
@ -182,6 +183,14 @@ const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */ int HeaderReadWriteUtils::readIntAttributeValue(
|
/* static */ int HeaderReadWriteUtils::readIntAttributeValue(
|
||||||
|
const AttributeMap *const headerAttributes, const char *const key,
|
||||||
|
const int defaultValue) {
|
||||||
|
AttributeMap::key_type keyVector;
|
||||||
|
insertCharactersIntoVector(key, &keyVector);
|
||||||
|
return readIntAttributeValueInner(headerAttributes, &keyVector, defaultValue);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* static */ int HeaderReadWriteUtils::readIntAttributeValueInner(
|
||||||
const AttributeMap *const headerAttributes, const AttributeMap::key_type *const key,
|
const AttributeMap *const headerAttributes, const AttributeMap::key_type *const key,
|
||||||
const int defaultValue) {
|
const int defaultValue) {
|
||||||
AttributeMap::const_iterator it = headerAttributes->find(*key);
|
AttributeMap::const_iterator it = headerAttributes->find(*key);
|
||||||
|
|
|
@ -76,16 +76,16 @@ class HeaderReadWriteUtils {
|
||||||
* Methods for header attributes.
|
* Methods for header attributes.
|
||||||
*/
|
*/
|
||||||
static void setBoolAttribute(AttributeMap *const headerAttributes,
|
static void setBoolAttribute(AttributeMap *const headerAttributes,
|
||||||
const AttributeMap::key_type *const key, const bool value);
|
const char *const key, const bool value);
|
||||||
|
|
||||||
static void setIntAttribute(AttributeMap *const headerAttributes,
|
static void setIntAttribute(AttributeMap *const headerAttributes,
|
||||||
const AttributeMap::key_type *const key, const int value);
|
const char *const key, const int value);
|
||||||
|
|
||||||
static bool readBoolAttributeValue(const AttributeMap *const headerAttributes,
|
static bool readBoolAttributeValue(const AttributeMap *const headerAttributes,
|
||||||
const AttributeMap::key_type *const key, const bool defaultValue);
|
const char *const key, const bool defaultValue);
|
||||||
|
|
||||||
static int readIntAttributeValue(const AttributeMap *const headerAttributes,
|
static int readIntAttributeValue(const AttributeMap *const headerAttributes,
|
||||||
const AttributeMap::key_type *const key, const int defaultValue);
|
const char *const key, const int defaultValue);
|
||||||
|
|
||||||
static void insertCharactersIntoVector(const char *const characters,
|
static void insertCharactersIntoVector(const char *const characters,
|
||||||
AttributeMap::key_type *const key);
|
AttributeMap::key_type *const key);
|
||||||
|
@ -112,6 +112,12 @@ class HeaderReadWriteUtils {
|
||||||
static const char *const SUPPORTS_DYNAMIC_UPDATE_KEY;
|
static const char *const SUPPORTS_DYNAMIC_UPDATE_KEY;
|
||||||
static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY;
|
static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY;
|
||||||
static const char *const REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY;
|
static const char *const REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY;
|
||||||
|
|
||||||
|
static void setIntAttributeInner(AttributeMap *const headerAttributes,
|
||||||
|
const AttributeMap::key_type *const key, const int value);
|
||||||
|
|
||||||
|
static int readIntAttributeValueInner(const AttributeMap *const headerAttributes,
|
||||||
|
const AttributeMap::key_type *const key, const int defaultValue);
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
#endif /* LATINIME_HEADER_READ_WRITE_UTILS_H */
|
#endif /* LATINIME_HEADER_READ_WRITE_UTILS_H */
|
||||||
|
|
|
@ -113,6 +113,14 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void getProperty(const char *const query, char *const outResult,
|
||||||
|
const int maxResultLength) const {
|
||||||
|
// getProperty is not supported for this class.
|
||||||
|
if (maxResultLength > 0) {
|
||||||
|
outResult[0] = '\0';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTriePolicy);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTriePolicy);
|
||||||
|
|
||||||
|
|
|
@ -43,7 +43,8 @@ const char *const DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE =
|
||||||
const HeaderReadWriteUtils::AttributeMap *const attributeMap) {
|
const HeaderReadWriteUtils::AttributeMap *const attributeMap) {
|
||||||
BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
|
BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
|
||||||
HeaderPolicy headerPolicy(FormatUtils::VERSION_3, attributeMap);
|
HeaderPolicy headerPolicy(FormatUtils::VERSION_3, attributeMap);
|
||||||
headerPolicy.writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */);
|
headerPolicy.writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */,
|
||||||
|
0 /* unigramCount */, 0 /* bigramCount */, 0 /* extendedRegionSize */);
|
||||||
BufferWithExtendableBuffer bodyBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
|
BufferWithExtendableBuffer bodyBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
|
||||||
if (!DynamicPatriciaTrieWritingUtils::writeEmptyDictionary(&bodyBuffer, 0 /* rootPos */)) {
|
if (!DynamicPatriciaTrieWritingUtils::writeEmptyDictionary(&bodyBuffer, 0 /* rootPos */)) {
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -27,6 +27,7 @@ import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
|
@ -625,4 +626,57 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
|
|
||||||
dictFile.delete();
|
dictFile.delete();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testUnigramAndBigramCount() {
|
||||||
|
final int flashWithGCIterationCount = 10;
|
||||||
|
final int codePointSetSize = 50;
|
||||||
|
final int unigramCountPerIteration = 1000;
|
||||||
|
final int bigramCountPerIteration = 2000;
|
||||||
|
final int seed = 1123581321;
|
||||||
|
|
||||||
|
final Random random = new Random(seed);
|
||||||
|
|
||||||
|
File dictFile = null;
|
||||||
|
try {
|
||||||
|
dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary");
|
||||||
|
} catch (IOException e) {
|
||||||
|
fail("IOException while writing an initial dictionary : " + e);
|
||||||
|
}
|
||||||
|
|
||||||
|
final ArrayList<String> words = new ArrayList<String>();
|
||||||
|
final HashSet<Pair<String, String>> bigrams = new HashSet<Pair<String, String>>();
|
||||||
|
final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
|
||||||
|
|
||||||
|
BinaryDictionary binaryDictionary;
|
||||||
|
for (int i = 0; i < flashWithGCIterationCount; i++) {
|
||||||
|
binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
|
||||||
|
0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
|
||||||
|
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
|
||||||
|
for (int j = 0; j < unigramCountPerIteration; j++) {
|
||||||
|
final String word = CodePointUtils.generateWord(random, codePointSet);
|
||||||
|
words.add(word);
|
||||||
|
final int unigramProbability = random.nextInt(0xFF);
|
||||||
|
binaryDictionary.addUnigramWord(word, unigramProbability);
|
||||||
|
}
|
||||||
|
for (int j = 0; j < bigramCountPerIteration; j++) {
|
||||||
|
final String word0 = words.get(random.nextInt(words.size()));
|
||||||
|
final String word1 = words.get(random.nextInt(words.size()));
|
||||||
|
bigrams.add(new Pair<String, String>(word0, word1));
|
||||||
|
final int bigramProbability = random.nextInt(0xF);
|
||||||
|
binaryDictionary.addBigramWords(word0, word1, bigramProbability);
|
||||||
|
}
|
||||||
|
assertEquals(new HashSet<String>(words).size(), Integer.parseInt(
|
||||||
|
binaryDictionary.getPropertyForTests(BinaryDictionary.UNIGRAM_COUNT_QUERY)));
|
||||||
|
assertEquals(new HashSet<Pair<String, String>>(bigrams).size(), Integer.parseInt(
|
||||||
|
binaryDictionary.getPropertyForTests(BinaryDictionary.BIGRAM_COUNT_QUERY)));
|
||||||
|
binaryDictionary.flushWithGC();
|
||||||
|
assertEquals(new HashSet<String>(words).size(), Integer.parseInt(
|
||||||
|
binaryDictionary.getPropertyForTests(BinaryDictionary.UNIGRAM_COUNT_QUERY)));
|
||||||
|
assertEquals(new HashSet<Pair<String, String>>(bigrams).size(), Integer.parseInt(
|
||||||
|
binaryDictionary.getPropertyForTests(BinaryDictionary.BIGRAM_COUNT_QUERY)));
|
||||||
|
binaryDictionary.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
dictFile.delete();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue