* commit 'de12fcb0f22eec6ff0650a45e937da168bbb6fbc': Add unigram/bigram counting.
This commit is contained in:
commit
6540132d94
20 changed files with 339 additions and 101 deletions
|
@ -48,6 +48,11 @@ public final class BinaryDictionary extends Dictionary {
|
|||
// TODO: Remove this heuristic.
|
||||
private static final int SPACE_COUNT_FOR_AUTO_COMMIT = 3;
|
||||
|
||||
@UsedForTesting
|
||||
public static final String UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
|
||||
@UsedForTesting
|
||||
public static final String BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
|
||||
|
||||
private long mNativeDict;
|
||||
private final Locale mLocale;
|
||||
private final long mDictSize;
|
||||
|
@ -129,6 +134,7 @@ public final class BinaryDictionary extends Dictionary {
|
|||
private static native void removeBigramWordsNative(long dict, int[] word0, int[] word1);
|
||||
private static native int calculateProbabilityNative(long dict, int unigramProbability,
|
||||
int bigramProbability);
|
||||
private static native String getPropertyNative(long dict, String query);
|
||||
|
||||
@UsedForTesting
|
||||
public static boolean createEmptyDictFile(final String filePath, final long dictVersion,
|
||||
|
@ -331,6 +337,12 @@ public final class BinaryDictionary extends Dictionary {
|
|||
return calculateProbabilityNative(mNativeDict, unigramProbability, bigramProbability);
|
||||
}
|
||||
|
||||
// Test-only accessor for a property of the native dictionary.
// Returns an empty string when isValidDictionary() is false; otherwise forwards the query
// to getPropertyNative() together with the mNativeDict handle and returns its result.
// NOTE(review): the query is presumably one of the *_COUNT_QUERY constants declared in this
// class; a null query is passed to native code unchecked — confirm callers never pass null.
@UsedForTesting
|
||||
public String getPropertyForTests(String query) {
|
||||
if (!isValidDictionary()) return "";
|
||||
return getPropertyNative(mNativeDict, query);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean shouldAutoCommit(final SuggestedWordInfo candidate) {
|
||||
// TODO: actually use the confidence rather than use this completely broken heuristic
|
||||
|
|
|
@ -323,6 +323,24 @@ static int latinime_BinaryDictionary_calculateProbabilityNative(JNIEnv *env, jcl
|
|||
bigramProbability);
|
||||
}
|
||||
|
||||
// JNI implementation of BinaryDictionary.getPropertyNative().
//   dict  - native Dictionary pointer passed from Java as a jlong handle.
//   query - property name as a Java string.
// Returns a new Java string holding the property value. An empty string is returned when the
// dictionary handle or the query is null, or when the policy writes nothing for the query.
static jstring latinime_BinaryDictionary_getProperty(JNIEnv *env, jclass clazz, jlong dict,
        jstring query) {
    Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
    // The JNI string accessors below must not be handed a null jstring, so a null query is
    // answered the same way as a null dictionary handle.
    if (!dictionary || !query) {
        return env->NewStringUTF("");
    }
    // Copy the query into a local NUL-terminated modified-UTF-8 buffer. GetStringUTFRegion takes
    // its length in UTF-16 units, while the buffer is sized from the UTF-8 length.
    const jsize queryUtf8Length = env->GetStringUTFLength(query);
    char queryChars[queryUtf8Length + 1];
    env->GetStringUTFRegion(query, 0, env->GetStringLength(query), queryChars);
    queryChars[queryUtf8Length] = '\0';
    static const int GET_PROPERTY_RESULT_LENGTH = 100;
    char resultChars[GET_PROPERTY_RESULT_LENGTH];
    // Start from an empty result so that a query the policy does not answer yields "".
    resultChars[0] = '\0';
    dictionary->getDictionaryStructurePolicy()->getProperty(queryChars, resultChars,
            GET_PROPERTY_RESULT_LENGTH);
    return env->NewStringUTF(resultChars);
}
|
||||
|
||||
static const JNINativeMethod sMethods[] = {
|
||||
{
|
||||
const_cast<char *>("createEmptyDictFileNative"),
|
||||
|
@ -398,6 +416,11 @@ static const JNINativeMethod sMethods[] = {
|
|||
const_cast<char *>("calculateProbabilityNative"),
|
||||
const_cast<char *>("(JII)I"),
|
||||
reinterpret_cast<void *>(latinime_BinaryDictionary_calculateProbabilityNative)
|
||||
},
|
||||
{
|
||||
const_cast<char *>("getPropertyNative"),
|
||||
const_cast<char *>("(JLjava/lang/String;)Ljava/lang/String;"),
|
||||
reinterpret_cast<void *>(latinime_BinaryDictionary_getProperty)
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -125,6 +125,11 @@ bool Dictionary::needsToRunGC() {
|
|||
return mDictionaryStructureWithBufferPolicy->needsToRunGC();
|
||||
}
|
||||
|
||||
// Forwards a property query to the dictionary structure policy.
//   query           - property name, passed through unchanged.
//   outResult       - buffer handed to the policy to receive the result.
//   maxResultLength - capacity of outResult, passed through unchanged.
void Dictionary::getProperty(const char *const query, char *const outResult,
|
||||
const int maxResultLength) const {
|
||||
// Both functions return void; "return <void expression>" is simply a forwarding call.
return mDictionaryStructureWithBufferPolicy->getProperty(query, outResult, maxResultLength);
|
||||
}
|
||||
|
||||
void Dictionary::logDictionaryInfo(JNIEnv *const env) const {
|
||||
const int BUFFER_SIZE = 16;
|
||||
int dictionaryIdCodePointBuffer[BUFFER_SIZE];
|
||||
|
|
|
@ -83,6 +83,9 @@ class Dictionary {
|
|||
|
||||
bool needsToRunGC();
|
||||
|
||||
void getProperty(const char *const query, char *const outResult,
|
||||
const int maxResultLength) const;
|
||||
|
||||
// Exposes the mDictionaryStructureWithBufferPolicy pointer to callers as a pointer-to-const.
const DictionaryStructureWithBufferPolicy *getDictionaryStructurePolicy() const {
|
||||
return mDictionaryStructureWithBufferPolicy;
|
||||
}
|
||||
|
|
|
@ -80,6 +80,9 @@ class DictionaryStructureWithBufferPolicy {
|
|||
|
||||
virtual bool needsToRunGC() const = 0;
|
||||
|
||||
virtual void getProperty(const char *const query, char *const outResult,
|
||||
const int maxResultLength) const = 0;
|
||||
|
||||
protected:
|
||||
DictionaryStructureWithBufferPolicy() {}
|
||||
|
||||
|
|
|
@ -119,7 +119,7 @@ bool DynamicBigramListPolicy::copyAllBigrams(BufferWithExtendableBuffer *const b
|
|||
// Finds useless bigram entries and removes them. A bigram entry is useless when the target PtNode
|
||||
// has been deleted or is not a valid terminal.
|
||||
bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(
|
||||
int *const bigramListPos) {
|
||||
int *const bigramListPos, int *const outValidBigramEntryCount) {
|
||||
const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos);
|
||||
if (usesAdditionalBuffer) {
|
||||
*bigramListPos -= mBuffer->getOriginalBufferSize();
|
||||
|
@ -161,6 +161,8 @@ bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(
|
|||
NOT_A_DICT_POS /* targetOffset */, &bigramEntryPos)) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
(*outValidBigramEntryCount) += 1;
|
||||
}
|
||||
} while(BigramListReadWriteUtils::hasNext(bigramFlags));
|
||||
return true;
|
||||
|
@ -169,7 +171,7 @@ bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(
|
|||
// Updates bigram target PtNode positions in the list after the placing step in GC.
|
||||
bool DynamicBigramListPolicy::updateAllBigramTargetPtNodePositions(int *const bigramListPos,
|
||||
const DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap *const
|
||||
ptNodePositionRelocationMap) {
|
||||
ptNodePositionRelocationMap, int *const outBigramEntryCount) {
|
||||
const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos);
|
||||
if (usesAdditionalBuffer) {
|
||||
*bigramListPos -= mBuffer->getOriginalBufferSize();
|
||||
|
@ -211,11 +213,12 @@ bool DynamicBigramListPolicy::updateAllBigramTargetPtNodePositions(int *const bi
|
|||
return false;
|
||||
}
|
||||
} while(BigramListReadWriteUtils::hasNext(bigramFlags));
|
||||
(*outBigramEntryCount) = bigramEntryCount;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTargetPos,
|
||||
const int probability, int *const bigramListPos) {
|
||||
const int probability, int *const bigramListPos, bool *const outAddedNewBigram) {
|
||||
const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos);
|
||||
if (usesAdditionalBuffer) {
|
||||
*bigramListPos -= mBuffer->getOriginalBufferSize();
|
||||
|
@ -243,6 +246,7 @@ bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTarg
|
|||
}
|
||||
if (followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos) == bigramTargetPos) {
|
||||
// Update this bigram entry.
|
||||
*outAddedNewBigram = false;
|
||||
const BigramListReadWriteUtils::BigramFlags updatedFlags =
|
||||
BigramListReadWriteUtils::setProbabilityInFlags(bigramFlags, probability);
|
||||
return BigramListReadWriteUtils::writeBigramEntry(mBuffer, updatedFlags,
|
||||
|
@ -254,12 +258,14 @@ bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTarg
|
|||
// The current last entry is found.
|
||||
// First, update the flags of the last entry.
|
||||
if (!BigramListReadWriteUtils::setHasNextFlag(mBuffer, true /* hasNext */, entryPos)) {
|
||||
*outAddedNewBigram = false;
|
||||
return false;
|
||||
}
|
||||
if (usesAdditionalBuffer) {
|
||||
*bigramListPos += mBuffer->getOriginalBufferSize();
|
||||
}
|
||||
// Then, add a new entry after the last entry.
|
||||
*outAddedNewBigram = true;
|
||||
return writeNewBigramEntry(bigramTargetPos, probability, bigramListPos);
|
||||
} while(BigramListReadWriteUtils::hasNext(bigramFlags));
|
||||
// We return directly from the while loop.
|
||||
|
|
|
@ -50,19 +50,20 @@ class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy {
|
|||
bool copyAllBigrams(BufferWithExtendableBuffer *const bufferToWrite, int *const fromPos,
|
||||
int *const toPos, int *const outBigramsCount) const;
|
||||
|
||||
bool updateAllBigramEntriesAndDeleteUselessEntries(int *const bigramListPos);
|
||||
bool updateAllBigramEntriesAndDeleteUselessEntries(int *const bigramListPos,
|
||||
int *const outBigramEntryCount);
|
||||
|
||||
bool updateAllBigramTargetPtNodePositions(int *const bigramListPos,
|
||||
const DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap *const
|
||||
ptNodePositionRelocationMap);
|
||||
ptNodePositionRelocationMap, int *const outValidBigramEntryCount);
|
||||
|
||||
bool addNewBigramEntryToBigramList(const int bigramTargetPos, const int probability,
|
||||
int *const bigramListPos);
|
||||
int *const bigramListPos, bool *const outAddedNewBigram);
|
||||
|
||||
bool writeNewBigramEntry(const int bigramTargetPos, const int probability,
|
||||
int *const writingPos);
|
||||
|
||||
// Return if targetBigramPos is found or not.
|
||||
// Return whether or not targetBigramPos is found.
|
||||
bool removeBigram(const int bigramListPos, const int bigramTargetPos);
|
||||
|
||||
private:
|
||||
|
|
|
@ -42,6 +42,9 @@ bool DynamicPatriciaTrieGcEventListeners
|
|||
}
|
||||
} else {
|
||||
valueStack.back() += 1;
|
||||
if (node->isTerminal()) {
|
||||
mValidUnigramCount += 1;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
@ -137,10 +140,15 @@ bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateAllPositionField
|
|||
// Updates bigram target PtNode positions in the bigram list.
|
||||
int bigramsPos = node->getBigramsPos();
|
||||
if (bigramsPos != NOT_A_DICT_POS) {
|
||||
int bigramEntryCount;
|
||||
if (!mBigramPolicy->updateAllBigramTargetPtNodePositions(&bigramsPos,
|
||||
&mDictPositionRelocationMap->mPtNodePositionRelocationMap)) {
|
||||
&mDictPositionRelocationMap->mPtNodePositionRelocationMap, &bigramEntryCount)) {
|
||||
return false;
|
||||
}
|
||||
mBigramCount += bigramEntryCount;
|
||||
}
|
||||
if (node->isTerminal()) {
|
||||
mUnigramCount++;
|
||||
}
|
||||
|
||||
return true;
|
||||
|
|
|
@ -41,7 +41,7 @@ class DynamicPatriciaTrieGcEventListeners {
|
|||
DynamicPatriciaTrieWritingHelper *const writingHelper,
|
||||
BufferWithExtendableBuffer *const buffer)
|
||||
: mWritingHelper(writingHelper), mBuffer(buffer), valueStack(),
|
||||
mChildrenValue(0) {}
|
||||
mChildrenValue(0), mValidUnigramCount(0) {}
|
||||
|
||||
~TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted() {};
|
||||
|
||||
|
@ -64,6 +64,10 @@ class DynamicPatriciaTrieGcEventListeners {
|
|||
bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node,
|
||||
const int *const nodeCodePoints);
|
||||
|
||||
// Returns the running total kept in mValidUnigramCount (initialized to 0 by the constructor).
// NOTE(review): it appears to be incremented once per terminal PtNode that the node-visiting
// callback keeps — confirm against onVisitingPtNode.
int getValidUnigramCount() const {
|
||||
return mValidUnigramCount;
|
||||
}
|
||||
|
||||
private:
|
||||
DISALLOW_IMPLICIT_CONSTRUCTORS(
|
||||
TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted);
|
||||
|
@ -72,6 +76,7 @@ class DynamicPatriciaTrieGcEventListeners {
|
|||
BufferWithExtendableBuffer *const mBuffer;
|
||||
std::vector<int> valueStack;
|
||||
int mChildrenValue;
|
||||
int mValidUnigramCount;
|
||||
};
|
||||
|
||||
// Updates all bigram entries that are held by valid PtNodes. This removes useless bigram
|
||||
|
@ -80,7 +85,7 @@ class DynamicPatriciaTrieGcEventListeners {
|
|||
: public DynamicPatriciaTrieReadingHelper::TraversingEventListener {
|
||||
public:
|
||||
TraversePolicyToUpdateBigramProbability(DynamicBigramListPolicy *const bigramPolicy)
|
||||
: mBigramPolicy(bigramPolicy) {}
|
||||
: mBigramPolicy(bigramPolicy), mValidBigramEntryCount(0) {}
|
||||
|
||||
bool onAscend() { return true; }
|
||||
|
||||
|
@ -93,18 +98,26 @@ class DynamicPatriciaTrieGcEventListeners {
|
|||
if (!node->isDeleted()) {
|
||||
int pos = node->getBigramsPos();
|
||||
if (pos != NOT_A_DICT_POS) {
|
||||
if (!mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries(&pos)) {
|
||||
int bigramEntryCount = 0;
|
||||
if (!mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries(&pos,
|
||||
&bigramEntryCount)) {
|
||||
return false;
|
||||
}
|
||||
mValidBigramEntryCount += bigramEntryCount;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns mValidBigramEntryCount: the sum of the per-node entry counts reported by
// updateAllBigramEntriesAndDeleteUselessEntries() for every non-deleted PtNode visited so far.
int getValidBigramEntryCount() const {
|
||||
return mValidBigramEntryCount;
|
||||
}
|
||||
|
||||
private:
|
||||
DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateBigramProbability);
|
||||
|
||||
DynamicBigramListPolicy *const mBigramPolicy;
|
||||
int mValidBigramEntryCount;
|
||||
};
|
||||
|
||||
class TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
|
||||
|
@ -150,7 +163,8 @@ class DynamicPatriciaTrieGcEventListeners {
|
|||
dictPositionRelocationMap)
|
||||
: mWritingHelper(writingHelper), mBigramPolicy(bigramPolicy),
|
||||
mBufferToWrite(bufferToWrite),
|
||||
mDictPositionRelocationMap(dictPositionRelocationMap) {};
|
||||
mDictPositionRelocationMap(dictPositionRelocationMap), mUnigramCount(0),
|
||||
mBigramCount(0) {};
|
||||
|
||||
bool onAscend() { return true; }
|
||||
|
||||
|
@ -161,6 +175,14 @@ class DynamicPatriciaTrieGcEventListeners {
|
|||
bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node,
|
||||
const int *const nodeCodePoints);
|
||||
|
||||
// Returns mUnigramCount, which starts at 0 and is incremented for each terminal PtNode
// handled by this listener's onVisitingPtNode().
int getUnigramCount() const {
|
||||
return mUnigramCount;
|
||||
}
|
||||
|
||||
// Returns mBigramCount, which starts at 0 and accumulates the entry counts reported by
// updateAllBigramTargetPtNodePositions() for each visited PtNode that has a bigram list.
int getBigramCount() const {
|
||||
return mBigramCount;
|
||||
}
|
||||
|
||||
private:
|
||||
DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllPositionFields);
|
||||
|
||||
|
@ -169,6 +191,8 @@ class DynamicPatriciaTrieGcEventListeners {
|
|||
BufferWithExtendableBuffer *const mBufferToWrite;
|
||||
const DynamicPatriciaTrieWritingHelper::DictPositionRelocationMap *const
|
||||
mDictPositionRelocationMap;
|
||||
int mUnigramCount;
|
||||
int mBigramCount;
|
||||
};
|
||||
|
||||
private:
|
||||
|
|
|
@ -16,6 +16,9 @@
|
|||
|
||||
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
|
||||
#include "defines.h"
|
||||
#include "suggest/core/dicnode/dic_node.h"
|
||||
#include "suggest/core/dicnode/dic_node_vector.h"
|
||||
|
@ -28,6 +31,9 @@
|
|||
|
||||
namespace latinime {
|
||||
|
||||
const char *const DynamicPatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
|
||||
const char *const DynamicPatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
|
||||
|
||||
void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode,
|
||||
DicNodeVector *const childDicNodes) const {
|
||||
if (!dicNode->hasChildren()) {
|
||||
|
@ -198,7 +204,16 @@ bool DynamicPatriciaTriePolicy::addUnigramWord(const int *const word, const int
|
|||
readingHelper.initWithPtNodeArrayPos(getRootPosition());
|
||||
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
|
||||
&mBigramListPolicy, &mShortcutListPolicy);
|
||||
return writingHelper.addUnigramWord(&readingHelper, word, length, probability);
|
||||
bool addedNewUnigram = false;
|
||||
if (writingHelper.addUnigramWord(&readingHelper, word, length, probability,
|
||||
&addedNewUnigram)) {
|
||||
if (addedNewUnigram) {
|
||||
mUnigramCount++;
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int length0,
|
||||
|
@ -219,7 +234,15 @@ bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int
|
|||
}
|
||||
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
|
||||
&mBigramListPolicy, &mShortcutListPolicy);
|
||||
return writingHelper.addBigramWords(word0Pos, word1Pos, probability);
|
||||
bool addedNewBigram = false;
|
||||
if (writingHelper.addBigramWords(word0Pos, word1Pos, probability, &addedNewBigram)) {
|
||||
if (addedNewBigram) {
|
||||
mBigramCount++;
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const int length0,
|
||||
|
@ -240,7 +263,12 @@ bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const
|
|||
}
|
||||
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
|
||||
&mBigramListPolicy, &mShortcutListPolicy);
|
||||
return writingHelper.removeBigramWords(word0Pos, word1Pos);
|
||||
if (writingHelper.removeBigramWords(word0Pos, word1Pos)) {
|
||||
mBigramCount--;
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
void DynamicPatriciaTriePolicy::flush(const char *const filePath) {
|
||||
|
@ -250,7 +278,7 @@ void DynamicPatriciaTriePolicy::flush(const char *const filePath) {
|
|||
}
|
||||
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
|
||||
&mBigramListPolicy, &mShortcutListPolicy);
|
||||
writingHelper.writeToDictFile(filePath, &mHeaderPolicy);
|
||||
writingHelper.writeToDictFile(filePath, &mHeaderPolicy, mUnigramCount, mBigramCount);
|
||||
}
|
||||
|
||||
void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) {
|
||||
|
@ -272,4 +300,13 @@ bool DynamicPatriciaTriePolicy::needsToRunGC() const {
|
|||
return mBufferWithExtendableBuffer.isNearSizeLimit();
|
||||
}
|
||||
|
||||
void DynamicPatriciaTriePolicy::getProperty(const char *const query, char *const outResult,
|
||||
const int maxResultLength) const {
|
||||
if (strncmp(query, UNIGRAM_COUNT_QUERY, maxResultLength) == 0) {
|
||||
snprintf(outResult, maxResultLength, "%d", mUnigramCount);
|
||||
} else if (strncmp(query, BIGRAM_COUNT_QUERY, maxResultLength) == 0) {
|
||||
snprintf(outResult, maxResultLength, "%d", mBigramCount);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace latinime
|
||||
|
|
|
@ -37,7 +37,9 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
|||
mBufferWithExtendableBuffer(mBuffer->getBuffer() + mHeaderPolicy.getSize(),
|
||||
mBuffer->getBufferSize() - mHeaderPolicy.getSize()),
|
||||
mShortcutListPolicy(&mBufferWithExtendableBuffer),
|
||||
mBigramListPolicy(&mBufferWithExtendableBuffer, &mShortcutListPolicy) {}
|
||||
mBigramListPolicy(&mBufferWithExtendableBuffer, &mShortcutListPolicy),
|
||||
mUnigramCount(mHeaderPolicy.getUnigramCount()),
|
||||
mBigramCount(mHeaderPolicy.getBigramCount()) {}
|
||||
|
||||
~DynamicPatriciaTriePolicy() {
|
||||
delete mBuffer;
|
||||
|
@ -91,14 +93,22 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
|||
|
||||
bool needsToRunGC() const;
|
||||
|
||||
void getProperty(const char *const query, char *const outResult,
|
||||
const int maxResultLength) const;
|
||||
|
||||
private:
|
||||
DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTriePolicy);
|
||||
|
||||
static const char*const UNIGRAM_COUNT_QUERY;
|
||||
static const char*const BIGRAM_COUNT_QUERY;
|
||||
|
||||
const MmappedBuffer *const mBuffer;
|
||||
const HeaderPolicy mHeaderPolicy;
|
||||
BufferWithExtendableBuffer mBufferWithExtendableBuffer;
|
||||
DynamicShortcutListPolicy mShortcutListPolicy;
|
||||
DynamicBigramListPolicy mBigramListPolicy;
|
||||
int mUnigramCount;
|
||||
int mBigramCount;
|
||||
};
|
||||
} // namespace latinime
|
||||
#endif // LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H
|
||||
|
|
|
@ -36,7 +36,8 @@ const size_t DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE = 2 * 1024 *
|
|||
|
||||
bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
|
||||
DynamicPatriciaTrieReadingHelper *const readingHelper,
|
||||
const int *const wordCodePoints, const int codePointCount, const int probability) {
|
||||
const int *const wordCodePoints, const int codePointCount, const int probability,
|
||||
bool *const outAddedNewUnigram) {
|
||||
int parentPos = NOT_A_DICT_POS;
|
||||
while (!readingHelper->isEnd()) {
|
||||
const int matchedCodePointCount = readingHelper->getPrevTotalCodePointCount();
|
||||
|
@ -54,6 +55,7 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
|
|||
const int nextIndex = matchedCodePointCount + j;
|
||||
if (nextIndex >= codePointCount || !readingHelper->isMatchedCodePoint(j,
|
||||
wordCodePoints[matchedCodePointCount + j])) {
|
||||
*outAddedNewUnigram = true;
|
||||
return reallocatePtNodeAndAddNewPtNodes(nodeReader,
|
||||
readingHelper->getMergedNodeCodePoints(), j, probability,
|
||||
wordCodePoints + matchedCodePointCount,
|
||||
|
@ -63,9 +65,10 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
|
|||
// All characters are matched.
|
||||
if (codePointCount == readingHelper->getTotalCodePointCount()) {
|
||||
return setPtNodeProbability(nodeReader, probability,
|
||||
readingHelper->getMergedNodeCodePoints());
|
||||
readingHelper->getMergedNodeCodePoints(), outAddedNewUnigram);
|
||||
}
|
||||
if (!nodeReader->hasChildren()) {
|
||||
*outAddedNewUnigram = true;
|
||||
return createChildrenPtNodeArrayAndAChildPtNode(nodeReader, probability,
|
||||
wordCodePoints + readingHelper->getTotalCodePointCount(),
|
||||
codePointCount - readingHelper->getTotalCodePointCount());
|
||||
|
@ -79,6 +82,7 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
|
|||
return false;
|
||||
}
|
||||
int pos = readingHelper->getPosOfLastForwardLinkField();
|
||||
*outAddedNewUnigram = true;
|
||||
return createAndInsertNodeIntoPtNodeArray(parentPos,
|
||||
wordCodePoints + readingHelper->getPrevTotalCodePointCount(),
|
||||
codePointCount - readingHelper->getPrevTotalCodePointCount(),
|
||||
|
@ -86,7 +90,7 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
|
|||
}
|
||||
|
||||
bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const int word1Pos,
|
||||
const int probability) {
|
||||
const int probability, bool *const outAddedNewBigram) {
|
||||
int mMergedNodeCodePoints[MAX_WORD_LENGTH];
|
||||
DynamicPatriciaTrieNodeReader nodeReader(mBuffer, mBigramPolicy, mShortcutPolicy);
|
||||
nodeReader.fetchNodeInfoInBufferFromPtNodePosAndGetNodeCodePoints(word0Pos, MAX_WORD_LENGTH,
|
||||
|
@ -107,9 +111,11 @@ bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const
|
|||
if (nodeReader.getBigramsPos() != NOT_A_DICT_POS) {
|
||||
// Insert a new bigram entry into the existing bigram list.
|
||||
int bigramListPos = nodeReader.getBigramsPos();
|
||||
return mBigramPolicy->addNewBigramEntryToBigramList(word1Pos, probability, &bigramListPos);
|
||||
return mBigramPolicy->addNewBigramEntryToBigramList(word1Pos, probability, &bigramListPos,
|
||||
outAddedNewBigram);
|
||||
} else {
|
||||
// The PtNode doesn't have a bigram list.
|
||||
*outAddedNewBigram = true;
|
||||
// First, Write a bigram entry at the tail position of the PtNode.
|
||||
if (!mBigramPolicy->writeNewBigramEntry(word1Pos, probability, &writingPos)) {
|
||||
return false;
|
||||
|
@ -138,9 +144,12 @@ bool DynamicPatriciaTrieWritingHelper::removeBigramWords(const int word0Pos, con
|
|||
}
|
||||
|
||||
void DynamicPatriciaTrieWritingHelper::writeToDictFile(const char *const fileName,
|
||||
const HeaderPolicy *const headerPolicy) {
|
||||
const HeaderPolicy *const headerPolicy, const int unigramCount, const int bigramCount) {
|
||||
BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
|
||||
if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, false /* updatesLastUpdatedTime */)) {
|
||||
const int extendedRegionSize = headerPolicy->getExtendedRegionSize() +
|
||||
mBuffer->getTailPosition() - mBuffer->getOriginalBufferSize();
|
||||
if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, false /* updatesLastUpdatedTime */,
|
||||
unigramCount, bigramCount, extendedRegionSize)) {
|
||||
return;
|
||||
}
|
||||
DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, mBuffer);
|
||||
|
@ -148,13 +157,16 @@ void DynamicPatriciaTrieWritingHelper::writeToDictFile(const char *const fileNam
|
|||
|
||||
void DynamicPatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos,
|
||||
const char *const fileName, const HeaderPolicy *const headerPolicy) {
|
||||
BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
|
||||
if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */)) {
|
||||
return;
|
||||
}
|
||||
BufferWithExtendableBuffer newDictBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */,
|
||||
MAX_DICTIONARY_SIZE);
|
||||
if (!runGC(rootPtNodeArrayPos, &newDictBuffer)) {
|
||||
int unigramCount = 0;
|
||||
int bigramCount = 0;
|
||||
if (!runGC(rootPtNodeArrayPos, &newDictBuffer, &unigramCount, &bigramCount)) {
|
||||
return;
|
||||
}
|
||||
BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
|
||||
if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */,
|
||||
unigramCount, bigramCount, 0 /* extendedRegionSize */)) {
|
||||
return;
|
||||
}
|
||||
DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, &newDictBuffer);
|
||||
|
@ -335,9 +347,10 @@ bool DynamicPatriciaTrieWritingHelper::createAndInsertNodeIntoPtNodeArray(const
|
|||
|
||||
bool DynamicPatriciaTrieWritingHelper::setPtNodeProbability(
|
||||
const DynamicPatriciaTrieNodeReader *const originalPtNode, const int probability,
|
||||
const int *const codePoints) {
|
||||
const int *const codePoints, bool *const outAddedNewUnigram) {
|
||||
if (originalPtNode->isTerminal()) {
|
||||
// Overwrites the probability.
|
||||
*outAddedNewUnigram = false;
|
||||
int probabilityFieldPos = originalPtNode->getProbabilityFieldPos();
|
||||
if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition(mBuffer,
|
||||
probability, &probabilityFieldPos)) {
|
||||
|
@ -345,6 +358,7 @@ bool DynamicPatriciaTrieWritingHelper::setPtNodeProbability(
|
|||
}
|
||||
} else {
|
||||
// Make the node terminal and write the probability.
|
||||
*outAddedNewUnigram = true;
|
||||
int movedPos = mBuffer->getTailPosition();
|
||||
if (!markNodeAsMovedAndSetPosition(originalPtNode, movedPos, movedPos)) {
|
||||
return false;
|
||||
|
@ -460,7 +474,8 @@ bool DynamicPatriciaTrieWritingHelper::reallocatePtNodeAndAddNewPtNodes(
|
|||
}
|
||||
|
||||
bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
|
||||
BufferWithExtendableBuffer *const bufferToWrite) {
|
||||
BufferWithExtendableBuffer *const bufferToWrite, int *const outUnigramCount,
|
||||
int *const outBigramCount) {
|
||||
DynamicPatriciaTrieReadingHelper readingHelper(mBuffer, mBigramPolicy, mShortcutPolicy);
|
||||
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
||||
DynamicPatriciaTrieGcEventListeners
|
||||
|
@ -505,6 +520,8 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
|
|||
&traversePolicyToUpdateAllPositionFields)) {
|
||||
return false;
|
||||
}
|
||||
*outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount();
|
||||
*outBigramCount = traversePolicyToUpdateAllPositionFields.getBigramCount();
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -56,15 +56,18 @@ class DynamicPatriciaTrieWritingHelper {
|
|||
|
||||
// Add a word to the dictionary. If the word already exists, update the probability.
|
||||
bool addUnigramWord(DynamicPatriciaTrieReadingHelper *const readingHelper,
|
||||
const int *const wordCodePoints, const int codePointCount, const int probability);
|
||||
const int *const wordCodePoints, const int codePointCount, const int probability,
|
||||
bool *const outAddedNewUnigram);
|
||||
|
||||
// Add a bigram relation from word0Pos to word1Pos.
|
||||
bool addBigramWords(const int word0Pos, const int word1Pos, const int probability);
|
||||
bool addBigramWords(const int word0Pos, const int word1Pos, const int probability,
|
||||
bool *const outAddedNewBigram);
|
||||
|
||||
// Remove a bigram relation from word0Pos to word1Pos.
|
||||
bool removeBigramWords(const int word0Pos, const int word1Pos);
|
||||
|
||||
void writeToDictFile(const char *const fileName, const HeaderPolicy *const headerPolicy);
|
||||
void writeToDictFile(const char *const fileName, const HeaderPolicy *const headerPolicy,
|
||||
const int unigramCount, const int bigramCount);
|
||||
|
||||
void writeToDictFileWithGC(const int rootPtNodeArrayPos, const char *const fileName,
|
||||
const HeaderPolicy *const headerPolicy);
|
||||
|
@ -107,7 +110,7 @@ class DynamicPatriciaTrieWritingHelper {
|
|||
const int nodeCodePointCount, const int probability, int *const forwardLinkFieldPos);
|
||||
|
||||
bool setPtNodeProbability(const DynamicPatriciaTrieNodeReader *const originalNode,
|
||||
const int probability, const int *const codePoints);
|
||||
const int probability, const int *const codePoints, bool *const outAddedNewUnigram);
|
||||
|
||||
bool createChildrenPtNodeArrayAndAChildPtNode(
|
||||
const DynamicPatriciaTrieNodeReader *const parentNode, const int probability,
|
||||
|
@ -122,7 +125,8 @@ class DynamicPatriciaTrieWritingHelper {
|
|||
const int probabilityOfNewPtNode, const int *const newNodeCodePoints,
|
||||
const int newNodeCodePointCount);
|
||||
|
||||
bool runGC(const int rootPtNodeArrayPos, BufferWithExtendableBuffer *const bufferToWrite);
|
||||
bool runGC(const int rootPtNodeArrayPos, BufferWithExtendableBuffer *const bufferToWrite,
|
||||
int *const outUnigramCount, int *const outBigramCount);
|
||||
};
|
||||
} // namespace latinime
|
||||
#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H */
|
||||
|
|
|
@ -16,17 +16,15 @@
|
|||
|
||||
#include "suggest/policyimpl/dictionary/header/header_policy.h"
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdio>
|
||||
#include <ctime>
|
||||
|
||||
namespace latinime {
|
||||
|
||||
|
||||
// Note that there are corresponding definitions on the Java side in FormatSpec.FileHeader.
|
||||
const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = "MULTIPLE_WORDS_DEMOTION_RATE";
|
||||
const char *const HeaderPolicy::USES_FORGETTING_CURVE_KEY = "USES_FORGETTING_CURVE";
|
||||
const char *const HeaderPolicy::LAST_UPDATED_TIME_KEY = "date";
|
||||
const char *const HeaderPolicy::UNIGRAM_COUNT_KEY = "UNIGRAM_COUNT";
|
||||
const char *const HeaderPolicy::BIGRAM_COUNT_KEY = "BIGRAM_COUNT";
|
||||
const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE";
|
||||
const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100;
|
||||
const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f;
|
||||
|
||||
|
@ -55,33 +53,17 @@ void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *out
|
|||
}
|
||||
|
||||
// Converts the header's multiple-words demotion rate attribute into a cost multiplier.
// The attribute is read with DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE as the default value.
// A rate <= 0 yields MAX_VALUE_FOR_WEIGHTING (as float); otherwise the result is
// MULTIPLE_WORD_COST_MULTIPLIER_SCALE / rate.
// NOTE(review): this listing shows two argument forms for readIntAttributeValue (a keyVector
// built from the key, and the key passed directly) — confirm which one is current.
float HeaderPolicy::readMultipleWordCostMultiplier() const {
|
||||
std::vector<int> keyVector;
|
||||
HeaderReadWriteUtils::insertCharactersIntoVector(MULTIPLE_WORDS_DEMOTION_RATE_KEY, &keyVector);
|
||||
const int demotionRate = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
|
||||
&keyVector, DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE);
|
||||
MULTIPLE_WORDS_DEMOTION_RATE_KEY, DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE);
|
||||
// A non-positive rate cannot be used as a divisor, so the maximum weighting value is returned.
if (demotionRate <= 0) {
|
||||
return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
|
||||
}
|
||||
return MULTIPLE_WORD_COST_MULTIPLIER_SCALE / static_cast<float>(demotionRate);
|
||||
}
|
||||
|
||||
// Reads the USES_FORGETTING_CURVE_KEY attribute from mAttributeMap as an int and returns it
// converted to bool; false is passed as the default value to the attribute reader.
bool HeaderPolicy::readUsesForgettingCurveFlag() const {
|
||||
std::vector<int> keyVector;
|
||||
HeaderReadWriteUtils::insertCharactersIntoVector(USES_FORGETTING_CURVE_KEY, &keyVector);
|
||||
return HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, &keyVector,
|
||||
false /* defaultValue */);
|
||||
}
|
||||
|
||||
// Returns current time when the key is not found or the value is invalid.
|
||||
int HeaderPolicy::readLastUpdatedTime() const {
|
||||
std::vector<int> keyVector;
|
||||
HeaderReadWriteUtils::insertCharactersIntoVector(LAST_UPDATED_TIME_KEY, &keyVector);
|
||||
return HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, &keyVector,
|
||||
time(0) /* defaultValue */);
|
||||
}
|
||||
|
||||
bool HeaderPolicy::writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite,
|
||||
const bool updatesLastUpdatedTime) const {
|
||||
const bool updatesLastUpdatedTime, const int unigramCount, const int bigramCount,
|
||||
const int extendedRegionSize) const {
|
||||
int writingPos = 0;
|
||||
if (!HeaderReadWriteUtils::writeDictionaryVersion(bufferToWrite, mDictFormatVersion,
|
||||
&writingPos)) {
|
||||
|
@ -97,21 +79,19 @@ bool HeaderPolicy::writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferT
|
|||
&writingPos)) {
|
||||
return false;
|
||||
}
|
||||
HeaderReadWriteUtils::AttributeMap attributeMapTowrite(mAttributeMap);
|
||||
HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, UNIGRAM_COUNT_KEY, unigramCount);
|
||||
HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, BIGRAM_COUNT_KEY, bigramCount);
|
||||
HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, EXTENDED_REGION_SIZE_KEY,
|
||||
extendedRegionSize);
|
||||
if (updatesLastUpdatedTime) {
|
||||
// Set current time as a last updated time.
|
||||
HeaderReadWriteUtils::AttributeMap attributeMapTowrite(mAttributeMap);
|
||||
std::vector<int> updatedTimekey;
|
||||
HeaderReadWriteUtils::insertCharactersIntoVector(LAST_UPDATED_TIME_KEY, &updatedTimekey);
|
||||
HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, &updatedTimekey, time(0));
|
||||
if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &attributeMapTowrite,
|
||||
&writingPos)) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &mAttributeMap,
|
||||
&writingPos)) {
|
||||
return false;
|
||||
}
|
||||
HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, LAST_UPDATED_TIME_KEY,
|
||||
time(0));
|
||||
}
|
||||
if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &attributeMapTowrite,
|
||||
&writingPos)) {
|
||||
return false;
|
||||
}
|
||||
// Writes an actual header size.
|
||||
if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(bufferToWrite, writingPos,
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
#ifndef LATINIME_HEADER_POLICY_H
|
||||
#define LATINIME_HEADER_POLICY_H
|
||||
|
||||
#include <ctime>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "defines.h"
|
||||
|
@ -35,8 +36,16 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
|||
mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)),
|
||||
mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)),
|
||||
mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
|
||||
mUsesForgettingCurve(readUsesForgettingCurveFlag()),
|
||||
mLastUpdatedTime(readLastUpdatedTime()) {}
|
||||
mUsesForgettingCurve(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
|
||||
USES_FORGETTING_CURVE_KEY, false /* defaultValue */)),
|
||||
mLastUpdatedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
|
||||
LAST_UPDATED_TIME_KEY, time(0) /* defaultValue */)),
|
||||
mUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
|
||||
UNIGRAM_COUNT_KEY, 0 /* defaultValue */)),
|
||||
mBigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
|
||||
BIGRAM_COUNT_KEY, 0 /* defaultValue */)),
|
||||
mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
|
||||
EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)) {}
|
||||
|
||||
// Constructs header information using an attribute map.
|
||||
HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,
|
||||
|
@ -44,9 +53,12 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
|||
: mDictFormatVersion(dictFormatVersion),
|
||||
mDictionaryFlags(HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap(
|
||||
attributeMap)), mSize(0), mAttributeMap(*attributeMap),
|
||||
mMultiWordCostMultiplier(readUsesForgettingCurveFlag()),
|
||||
mUsesForgettingCurve(readUsesForgettingCurveFlag()),
|
||||
mLastUpdatedTime(readLastUpdatedTime()) {}
|
||||
mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
|
||||
mUsesForgettingCurve(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
|
||||
USES_FORGETTING_CURVE_KEY, false /* defaultValue */)),
|
||||
mLastUpdatedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
|
||||
LAST_UPDATED_TIME_KEY, time(0) /* defaultValue */)),
|
||||
mUnigramCount(0), mBigramCount(0), mExtendedRegionSize(0) {}
|
||||
|
||||
~HeaderPolicy() {}
|
||||
|
||||
|
@ -78,11 +90,24 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
|||
return mLastUpdatedTime;
|
||||
}
|
||||
|
||||
// Returns the unigram count captured at construction: the UNIGRAM_COUNT header attribute
// (default 0) when built from a dictionary buffer, or 0 when built from an attribute map.
AK_FORCE_INLINE int getUnigramCount() const {
    return mUnigramCount;
}
|
||||
|
||||
// Returns the bigram count captured at construction: the BIGRAM_COUNT header attribute
// (default 0) when built from a dictionary buffer, or 0 when built from an attribute map.
AK_FORCE_INLINE int getBigramCount() const {
    return mBigramCount;
}
|
||||
|
||||
// Returns the extended region size captured at construction: the EXTENDED_REGION_SIZE header
// attribute (default 0) when built from a dictionary buffer, or 0 when built from an
// attribute map.
AK_FORCE_INLINE int getExtendedRegionSize() const {
    return mExtendedRegionSize;
}
|
||||
|
||||
void readHeaderValueOrQuestionMark(const char *const key,
|
||||
int *outValue, int outValueSize) const;
|
||||
|
||||
bool writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite,
|
||||
const bool updatesLastUpdatedTime) const;
|
||||
const bool updatesLastUpdatedTime, const int unigramCount,
|
||||
const int bigramCount, const int extendedRegionSize) const;
|
||||
|
||||
private:
|
||||
DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderPolicy);
|
||||
|
@ -90,6 +115,9 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
|||
static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY;
|
||||
static const char *const USES_FORGETTING_CURVE_KEY;
|
||||
static const char *const LAST_UPDATED_TIME_KEY;
|
||||
static const char *const UNIGRAM_COUNT_KEY;
|
||||
static const char *const BIGRAM_COUNT_KEY;
|
||||
static const char *const EXTENDED_REGION_SIZE_KEY;
|
||||
static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE;
|
||||
static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE;
|
||||
|
||||
|
@ -100,13 +128,12 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
|
|||
const float mMultiWordCostMultiplier;
|
||||
const bool mUsesForgettingCurve;
|
||||
const int mLastUpdatedTime;
|
||||
const int mUnigramCount;
|
||||
const int mBigramCount;
|
||||
const int mExtendedRegionSize;
|
||||
|
||||
float readMultipleWordCostMultiplier() const;
|
||||
|
||||
bool readUsesForgettingCurveFlag() const;
|
||||
|
||||
int readLastUpdatedTime() const;
|
||||
|
||||
static HeaderReadWriteUtils::AttributeMap createAttributeMapAndReadAllAttributes(
|
||||
const uint8_t *const dictBuf);
|
||||
};
|
||||
|
|
|
@ -68,18 +68,12 @@ const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY
|
|||
/* static */ HeaderReadWriteUtils::DictionaryFlags
|
||||
HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap(
|
||||
const HeaderReadWriteUtils::AttributeMap *const attributeMap) {
|
||||
AttributeMap::key_type key;
|
||||
insertCharactersIntoVector(REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, &key);
|
||||
const bool requiresGermanUmlautProcessing = readBoolAttributeValue(attributeMap, &key,
|
||||
false /* defaultValue */);
|
||||
key.clear();
|
||||
insertCharactersIntoVector(REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY, &key);
|
||||
const bool requiresFrenchLigatureProcessing = readBoolAttributeValue(attributeMap, &key,
|
||||
false /* defaultValue */);
|
||||
key.clear();
|
||||
insertCharactersIntoVector(SUPPORTS_DYNAMIC_UPDATE_KEY, &key);
|
||||
const bool supportsDynamicUpdate = readBoolAttributeValue(attributeMap, &key,
|
||||
false /* defaultValue */);
|
||||
const bool requiresGermanUmlautProcessing = readBoolAttributeValue(attributeMap,
|
||||
REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, false /* defaultValue */);
|
||||
const bool requiresFrenchLigatureProcessing = readBoolAttributeValue(attributeMap,
|
||||
REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY, false /* defaultValue */);
|
||||
const bool supportsDynamicUpdate = readBoolAttributeValue(attributeMap,
|
||||
SUPPORTS_DYNAMIC_UPDATE_KEY, false /* defaultValue */);
|
||||
DictionaryFlags dictflags = NO_FLAGS;
|
||||
dictflags |= requiresGermanUmlautProcessing ? GERMAN_UMLAUT_PROCESSING_FLAG : 0;
|
||||
dictflags |= requiresFrenchLigatureProcessing ? FRENCH_LIGATURE_PROCESSING_FLAG : 0;
|
||||
|
@ -160,11 +154,18 @@ const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY
|
|||
}
|
||||
|
||||
/* static */ void HeaderReadWriteUtils::setBoolAttribute(AttributeMap *const headerAttributes,
|
||||
const AttributeMap::key_type *const key, const bool value) {
|
||||
const char *const key, const bool value) {
|
||||
setIntAttribute(headerAttributes, key, value ? 1 : 0);
|
||||
}
|
||||
|
||||
// Stores an integer attribute under a key given as a C string. The key is first converted
// into the map's key type; the write itself is delegated to setIntAttributeInner().
/* static */ void HeaderReadWriteUtils::setIntAttribute(AttributeMap *const headerAttributes,
        const char *const key, const int value) {
    AttributeMap::key_type attributeKey;
    insertCharactersIntoVector(key, &attributeKey);
    setIntAttributeInner(headerAttributes, &attributeKey, value);
}
|
||||
|
||||
/* static */ void HeaderReadWriteUtils::setIntAttributeInner(AttributeMap *const headerAttributes,
|
||||
const AttributeMap::key_type *const key, const int value) {
|
||||
AttributeMap::mapped_type valueVector;
|
||||
char charBuf[LARGEST_INT_DIGIT_COUNT + 1];
|
||||
|
@ -174,7 +175,7 @@ const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY
|
|||
}
|
||||
|
||||
/* static */ bool HeaderReadWriteUtils::readBoolAttributeValue(
|
||||
const AttributeMap *const headerAttributes, const AttributeMap::key_type *const key,
|
||||
const AttributeMap *const headerAttributes, const char *const key,
|
||||
const bool defaultValue) {
|
||||
const int intDefaultValue = defaultValue ? 1 : 0;
|
||||
const int intValue = readIntAttributeValue(headerAttributes, key, intDefaultValue);
|
||||
|
@ -182,6 +183,14 @@ const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY
|
|||
}
|
||||
|
||||
/* static */ int HeaderReadWriteUtils::readIntAttributeValue(
|
||||
const AttributeMap *const headerAttributes, const char *const key,
|
||||
const int defaultValue) {
|
||||
AttributeMap::key_type keyVector;
|
||||
insertCharactersIntoVector(key, &keyVector);
|
||||
return readIntAttributeValueInner(headerAttributes, &keyVector, defaultValue);
|
||||
}
|
||||
|
||||
/* static */ int HeaderReadWriteUtils::readIntAttributeValueInner(
|
||||
const AttributeMap *const headerAttributes, const AttributeMap::key_type *const key,
|
||||
const int defaultValue) {
|
||||
AttributeMap::const_iterator it = headerAttributes->find(*key);
|
||||
|
|
|
@ -76,16 +76,16 @@ class HeaderReadWriteUtils {
|
|||
* Methods for header attributes.
|
||||
*/
|
||||
static void setBoolAttribute(AttributeMap *const headerAttributes,
|
||||
const AttributeMap::key_type *const key, const bool value);
|
||||
const char *const key, const bool value);
|
||||
|
||||
static void setIntAttribute(AttributeMap *const headerAttributes,
|
||||
const AttributeMap::key_type *const key, const int value);
|
||||
const char *const key, const int value);
|
||||
|
||||
static bool readBoolAttributeValue(const AttributeMap *const headerAttributes,
|
||||
const AttributeMap::key_type *const key, const bool defaultValue);
|
||||
const char *const key, const bool defaultValue);
|
||||
|
||||
static int readIntAttributeValue(const AttributeMap *const headerAttributes,
|
||||
const AttributeMap::key_type *const key, const int defaultValue);
|
||||
const char *const key, const int defaultValue);
|
||||
|
||||
static void insertCharactersIntoVector(const char *const characters,
|
||||
AttributeMap::key_type *const key);
|
||||
|
@ -112,6 +112,12 @@ class HeaderReadWriteUtils {
|
|||
static const char *const SUPPORTS_DYNAMIC_UPDATE_KEY;
|
||||
static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY;
|
||||
static const char *const REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY;
|
||||
|
||||
static void setIntAttributeInner(AttributeMap *const headerAttributes,
|
||||
const AttributeMap::key_type *const key, const int value);
|
||||
|
||||
static int readIntAttributeValueInner(const AttributeMap *const headerAttributes,
|
||||
const AttributeMap::key_type *const key, const int defaultValue);
|
||||
};
|
||||
}
|
||||
#endif /* LATINIME_HEADER_READ_WRITE_UTILS_H */
|
||||
|
|
|
@ -113,6 +113,14 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
|||
return false;
|
||||
}
|
||||
|
||||
// getProperty is not supported for this class: the query is ignored and, when the output
// buffer has room for at least one character, an empty string is written to it.
void getProperty(const char *const query, char *const outResult,
        const int maxResultLength) const {
    if (maxResultLength <= 0) {
        // No room for even the terminating character; leave the buffer untouched.
        return;
    }
    outResult[0] = '\0';
}
|
||||
|
||||
private:
|
||||
DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTriePolicy);
|
||||
|
||||
|
|
|
@ -43,7 +43,8 @@ const char *const DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE =
|
|||
const HeaderReadWriteUtils::AttributeMap *const attributeMap) {
|
||||
BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
|
||||
HeaderPolicy headerPolicy(FormatUtils::VERSION_3, attributeMap);
|
||||
headerPolicy.writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */);
|
||||
headerPolicy.writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */,
|
||||
0 /* unigramCount */, 0 /* bigramCount */, 0 /* extendedRegionSize */);
|
||||
BufferWithExtendableBuffer bodyBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
|
||||
if (!DynamicPatriciaTrieWritingUtils::writeEmptyDictionary(&bodyBuffer, 0 /* rootPos */)) {
|
||||
return false;
|
||||
|
|
|
@ -27,6 +27,7 @@ import java.io.File;
|
|||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Random;
|
||||
|
@ -625,4 +626,57 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
|
||||
dictFile.delete();
|
||||
}
|
||||
|
||||
public void testUnigramAndBigramCount() {
|
||||
final int flashWithGCIterationCount = 10;
|
||||
final int codePointSetSize = 50;
|
||||
final int unigramCountPerIteration = 1000;
|
||||
final int bigramCountPerIteration = 2000;
|
||||
final int seed = 1123581321;
|
||||
|
||||
final Random random = new Random(seed);
|
||||
|
||||
File dictFile = null;
|
||||
try {
|
||||
dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary");
|
||||
} catch (IOException e) {
|
||||
fail("IOException while writing an initial dictionary : " + e);
|
||||
}
|
||||
|
||||
final ArrayList<String> words = new ArrayList<String>();
|
||||
final HashSet<Pair<String, String>> bigrams = new HashSet<Pair<String, String>>();
|
||||
final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
|
||||
|
||||
BinaryDictionary binaryDictionary;
|
||||
for (int i = 0; i < flashWithGCIterationCount; i++) {
|
||||
binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
|
||||
0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
|
||||
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
|
||||
for (int j = 0; j < unigramCountPerIteration; j++) {
|
||||
final String word = CodePointUtils.generateWord(random, codePointSet);
|
||||
words.add(word);
|
||||
final int unigramProbability = random.nextInt(0xFF);
|
||||
binaryDictionary.addUnigramWord(word, unigramProbability);
|
||||
}
|
||||
for (int j = 0; j < bigramCountPerIteration; j++) {
|
||||
final String word0 = words.get(random.nextInt(words.size()));
|
||||
final String word1 = words.get(random.nextInt(words.size()));
|
||||
bigrams.add(new Pair<String, String>(word0, word1));
|
||||
final int bigramProbability = random.nextInt(0xF);
|
||||
binaryDictionary.addBigramWords(word0, word1, bigramProbability);
|
||||
}
|
||||
assertEquals(new HashSet<String>(words).size(), Integer.parseInt(
|
||||
binaryDictionary.getPropertyForTests(BinaryDictionary.UNIGRAM_COUNT_QUERY)));
|
||||
assertEquals(new HashSet<Pair<String, String>>(bigrams).size(), Integer.parseInt(
|
||||
binaryDictionary.getPropertyForTests(BinaryDictionary.BIGRAM_COUNT_QUERY)));
|
||||
binaryDictionary.flushWithGC();
|
||||
assertEquals(new HashSet<String>(words).size(), Integer.parseInt(
|
||||
binaryDictionary.getPropertyForTests(BinaryDictionary.UNIGRAM_COUNT_QUERY)));
|
||||
assertEquals(new HashSet<Pair<String, String>>(bigrams).size(), Integer.parseInt(
|
||||
binaryDictionary.getPropertyForTests(BinaryDictionary.BIGRAM_COUNT_QUERY)));
|
||||
binaryDictionary.close();
|
||||
}
|
||||
|
||||
dictFile.delete();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue