Add unigram/bigram counting.

Bug: 6669677
Change-Id: I05ea2201d822dddf062b08c8467daa336760e16c
main
Keisuke Kuroyanagi 2013-09-27 23:12:12 +09:00
parent 5ef6209656
commit 31097a57cc
20 changed files with 339 additions and 101 deletions

View File

@ -48,6 +48,11 @@ public final class BinaryDictionary extends Dictionary {
// TODO: Remove this heuristic.
private static final int SPACE_COUNT_FOR_AUTO_COMMIT = 3;
@UsedForTesting
public static final String UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
@UsedForTesting
public static final String BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
private long mNativeDict;
private final Locale mLocale;
private final long mDictSize;
@ -129,6 +134,7 @@ public final class BinaryDictionary extends Dictionary {
private static native void removeBigramWordsNative(long dict, int[] word0, int[] word1);
private static native int calculateProbabilityNative(long dict, int unigramProbability,
int bigramProbability);
private static native String getPropertyNative(long dict, String query);
@UsedForTesting
public static boolean createEmptyDictFile(final String filePath, final long dictVersion,
@ -331,6 +337,12 @@ public final class BinaryDictionary extends Dictionary {
return calculateProbabilityNative(mNativeDict, unigramProbability, bigramProbability);
}
@UsedForTesting
public String getPropertyForTests(String query) {
if (!isValidDictionary()) return "";
return getPropertyNative(mNativeDict, query);
}
@Override
public boolean shouldAutoCommit(final SuggestedWordInfo candidate) {
// TODO: actually use the confidence rather than use this completely broken heuristic

View File

@ -323,6 +323,24 @@ static int latinime_BinaryDictionary_calculateProbabilityNative(JNIEnv *env, jcl
bigramProbability);
}
static jstring latinime_BinaryDictionary_getProperty(JNIEnv *env, jclass clazz, jlong dict,
jstring query) {
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
if (!dictionary) {
return env->NewStringUTF("");
}
const jsize queryUtf8Length = env->GetStringUTFLength(query);
char queryChars[queryUtf8Length + 1];
env->GetStringUTFRegion(query, 0, env->GetStringLength(query), queryChars);
queryChars[queryUtf8Length] = '\0';
static const int GET_PROPERTY_RESULT_LENGTH = 100;
char resultChars[GET_PROPERTY_RESULT_LENGTH];
resultChars[0] = '\0';
dictionary->getDictionaryStructurePolicy()->getProperty(queryChars, resultChars,
GET_PROPERTY_RESULT_LENGTH);
return env->NewStringUTF(resultChars);
}
static const JNINativeMethod sMethods[] = {
{
const_cast<char *>("createEmptyDictFileNative"),
@ -398,6 +416,11 @@ static const JNINativeMethod sMethods[] = {
const_cast<char *>("calculateProbabilityNative"),
const_cast<char *>("(JII)I"),
reinterpret_cast<void *>(latinime_BinaryDictionary_calculateProbabilityNative)
},
{
const_cast<char *>("getPropertyNative"),
const_cast<char *>("(JLjava/lang/String;)Ljava/lang/String;"),
reinterpret_cast<void *>(latinime_BinaryDictionary_getProperty)
}
};

View File

@ -125,6 +125,11 @@ bool Dictionary::needsToRunGC() {
return mDictionaryStructureWithBufferPolicy->needsToRunGC();
}
void Dictionary::getProperty(const char *const query, char *const outResult,
const int maxResultLength) const {
return mDictionaryStructureWithBufferPolicy->getProperty(query, outResult, maxResultLength);
}
void Dictionary::logDictionaryInfo(JNIEnv *const env) const {
const int BUFFER_SIZE = 16;
int dictionaryIdCodePointBuffer[BUFFER_SIZE];

View File

@ -83,6 +83,9 @@ class Dictionary {
bool needsToRunGC();
void getProperty(const char *const query, char *const outResult,
const int maxResultLength) const;
const DictionaryStructureWithBufferPolicy *getDictionaryStructurePolicy() const {
return mDictionaryStructureWithBufferPolicy;
}

View File

@ -80,6 +80,9 @@ class DictionaryStructureWithBufferPolicy {
virtual bool needsToRunGC() const = 0;
virtual void getProperty(const char *const query, char *const outResult,
const int maxResultLength) const = 0;
protected:
DictionaryStructureWithBufferPolicy() {}

View File

@ -119,7 +119,7 @@ bool DynamicBigramListPolicy::copyAllBigrams(BufferWithExtendableBuffer *const b
// Finding useless bigram entries and remove them. Bigram entry is useless when the target PtNode
// has been deleted or is not a valid terminal.
bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(
int *const bigramListPos) {
int *const bigramListPos, int *const outValidBigramEntryCount) {
const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos);
if (usesAdditionalBuffer) {
*bigramListPos -= mBuffer->getOriginalBufferSize();
@ -161,6 +161,8 @@ bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(
NOT_A_DICT_POS /* targetOffset */, &bigramEntryPos)) {
return false;
}
} else {
(*outValidBigramEntryCount) += 1;
}
} while(BigramListReadWriteUtils::hasNext(bigramFlags));
return true;
@ -169,7 +171,7 @@ bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(
// Updates bigram target PtNode positions in the list after the placing step in GC.
bool DynamicBigramListPolicy::updateAllBigramTargetPtNodePositions(int *const bigramListPos,
const DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap *const
ptNodePositionRelocationMap) {
ptNodePositionRelocationMap, int *const outBigramEntryCount) {
const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos);
if (usesAdditionalBuffer) {
*bigramListPos -= mBuffer->getOriginalBufferSize();
@ -211,11 +213,12 @@ bool DynamicBigramListPolicy::updateAllBigramTargetPtNodePositions(int *const bi
return false;
}
} while(BigramListReadWriteUtils::hasNext(bigramFlags));
(*outBigramEntryCount) = bigramEntryCount;
return true;
}
bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTargetPos,
const int probability, int *const bigramListPos) {
const int probability, int *const bigramListPos, bool *const outAddedNewBigram) {
const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos);
if (usesAdditionalBuffer) {
*bigramListPos -= mBuffer->getOriginalBufferSize();
@ -243,6 +246,7 @@ bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTarg
}
if (followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos) == bigramTargetPos) {
// Update this bigram entry.
*outAddedNewBigram = false;
const BigramListReadWriteUtils::BigramFlags updatedFlags =
BigramListReadWriteUtils::setProbabilityInFlags(bigramFlags, probability);
return BigramListReadWriteUtils::writeBigramEntry(mBuffer, updatedFlags,
@ -254,12 +258,14 @@ bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTarg
// The current last entry is found.
// First, update the flags of the last entry.
if (!BigramListReadWriteUtils::setHasNextFlag(mBuffer, true /* hasNext */, entryPos)) {
*outAddedNewBigram = false;
return false;
}
if (usesAdditionalBuffer) {
*bigramListPos += mBuffer->getOriginalBufferSize();
}
// Then, add a new entry after the last entry.
*outAddedNewBigram = true;
return writeNewBigramEntry(bigramTargetPos, probability, bigramListPos);
} while(BigramListReadWriteUtils::hasNext(bigramFlags));
// We return directly from the while loop.

View File

@ -50,19 +50,20 @@ class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy {
bool copyAllBigrams(BufferWithExtendableBuffer *const bufferToWrite, int *const fromPos,
int *const toPos, int *const outBigramsCount) const;
bool updateAllBigramEntriesAndDeleteUselessEntries(int *const bigramListPos);
bool updateAllBigramEntriesAndDeleteUselessEntries(int *const bigramListPos,
int *const outBigramEntryCount);
bool updateAllBigramTargetPtNodePositions(int *const bigramListPos,
const DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap *const
ptNodePositionRelocationMap);
ptNodePositionRelocationMap, int *const outValidBigramEntryCount);
bool addNewBigramEntryToBigramList(const int bigramTargetPos, const int probability,
int *const bigramListPos);
int *const bigramListPos, bool *const outAddedNewBigram);
bool writeNewBigramEntry(const int bigramTargetPos, const int probability,
int *const writingPos);
// Return if targetBigramPos is found or not.
// Return whether or not targetBigramPos is found.
bool removeBigram(const int bigramListPos, const int bigramTargetPos);
private:

View File

@ -42,6 +42,9 @@ bool DynamicPatriciaTrieGcEventListeners
}
} else {
valueStack.back() += 1;
if (node->isTerminal()) {
mValidUnigramCount += 1;
}
}
return true;
}
@ -137,10 +140,15 @@ bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateAllPositionField
// Updates bigram target PtNode positions in the bigram list.
int bigramsPos = node->getBigramsPos();
if (bigramsPos != NOT_A_DICT_POS) {
int bigramEntryCount;
if (!mBigramPolicy->updateAllBigramTargetPtNodePositions(&bigramsPos,
&mDictPositionRelocationMap->mPtNodePositionRelocationMap)) {
&mDictPositionRelocationMap->mPtNodePositionRelocationMap, &bigramEntryCount)) {
return false;
}
mBigramCount += bigramEntryCount;
}
if (node->isTerminal()) {
mUnigramCount++;
}
return true;

View File

@ -41,7 +41,7 @@ class DynamicPatriciaTrieGcEventListeners {
DynamicPatriciaTrieWritingHelper *const writingHelper,
BufferWithExtendableBuffer *const buffer)
: mWritingHelper(writingHelper), mBuffer(buffer), valueStack(),
mChildrenValue(0) {}
mChildrenValue(0), mValidUnigramCount(0) {}
~TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted() {};
@ -64,6 +64,10 @@ class DynamicPatriciaTrieGcEventListeners {
bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node,
const int *const nodeCodePoints);
int getValidUnigramCount() const {
return mValidUnigramCount;
}
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(
TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted);
@ -72,6 +76,7 @@ class DynamicPatriciaTrieGcEventListeners {
BufferWithExtendableBuffer *const mBuffer;
std::vector<int> valueStack;
int mChildrenValue;
int mValidUnigramCount;
};
// Updates all bigram entries that are held by valid PtNodes. This removes useless bigram
@ -80,7 +85,7 @@ class DynamicPatriciaTrieGcEventListeners {
: public DynamicPatriciaTrieReadingHelper::TraversingEventListener {
public:
TraversePolicyToUpdateBigramProbability(DynamicBigramListPolicy *const bigramPolicy)
: mBigramPolicy(bigramPolicy) {}
: mBigramPolicy(bigramPolicy), mValidBigramEntryCount(0) {}
bool onAscend() { return true; }
@ -93,18 +98,26 @@ class DynamicPatriciaTrieGcEventListeners {
if (!node->isDeleted()) {
int pos = node->getBigramsPos();
if (pos != NOT_A_DICT_POS) {
if (!mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries(&pos)) {
int bigramEntryCount = 0;
if (!mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries(&pos,
&bigramEntryCount)) {
return false;
}
mValidBigramEntryCount += bigramEntryCount;
}
}
return true;
}
int getValidBigramEntryCount() const {
return mValidBigramEntryCount;
}
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateBigramProbability);
DynamicBigramListPolicy *const mBigramPolicy;
int mValidBigramEntryCount;
};
class TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
@ -150,7 +163,8 @@ class DynamicPatriciaTrieGcEventListeners {
dictPositionRelocationMap)
: mWritingHelper(writingHelper), mBigramPolicy(bigramPolicy),
mBufferToWrite(bufferToWrite),
mDictPositionRelocationMap(dictPositionRelocationMap) {};
mDictPositionRelocationMap(dictPositionRelocationMap), mUnigramCount(0),
mBigramCount(0) {};
bool onAscend() { return true; }
@ -161,6 +175,14 @@ class DynamicPatriciaTrieGcEventListeners {
bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node,
const int *const nodeCodePoints);
int getUnigramCount() const {
return mUnigramCount;
}
int getBigramCount() const {
return mBigramCount;
}
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllPositionFields);
@ -169,6 +191,8 @@ class DynamicPatriciaTrieGcEventListeners {
BufferWithExtendableBuffer *const mBufferToWrite;
const DynamicPatriciaTrieWritingHelper::DictPositionRelocationMap *const
mDictPositionRelocationMap;
int mUnigramCount;
int mBigramCount;
};
private:

View File

@ -16,6 +16,9 @@
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h"
#include <cstdio>
#include <cstring>
#include "defines.h"
#include "suggest/core/dicnode/dic_node.h"
#include "suggest/core/dicnode/dic_node_vector.h"
@ -28,6 +31,9 @@
namespace latinime {
const char *const DynamicPatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
const char *const DynamicPatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode,
DicNodeVector *const childDicNodes) const {
if (!dicNode->hasChildren()) {
@ -198,7 +204,16 @@ bool DynamicPatriciaTriePolicy::addUnigramWord(const int *const word, const int
readingHelper.initWithPtNodeArrayPos(getRootPosition());
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
&mBigramListPolicy, &mShortcutListPolicy);
return writingHelper.addUnigramWord(&readingHelper, word, length, probability);
bool addedNewUnigram = false;
if (writingHelper.addUnigramWord(&readingHelper, word, length, probability,
&addedNewUnigram)) {
if (addedNewUnigram) {
mUnigramCount++;
}
return true;
} else {
return false;
}
}
bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int length0,
@ -219,7 +234,15 @@ bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int
}
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
&mBigramListPolicy, &mShortcutListPolicy);
return writingHelper.addBigramWords(word0Pos, word1Pos, probability);
bool addedNewBigram = false;
if (writingHelper.addBigramWords(word0Pos, word1Pos, probability, &addedNewBigram)) {
if (addedNewBigram) {
mBigramCount++;
}
return true;
} else {
return false;
}
}
bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const int length0,
@ -240,7 +263,12 @@ bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const
}
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
&mBigramListPolicy, &mShortcutListPolicy);
return writingHelper.removeBigramWords(word0Pos, word1Pos);
if (writingHelper.removeBigramWords(word0Pos, word1Pos)) {
mBigramCount--;
return true;
} else {
return false;
}
}
void DynamicPatriciaTriePolicy::flush(const char *const filePath) {
@ -250,7 +278,7 @@ void DynamicPatriciaTriePolicy::flush(const char *const filePath) {
}
DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
&mBigramListPolicy, &mShortcutListPolicy);
writingHelper.writeToDictFile(filePath, &mHeaderPolicy);
writingHelper.writeToDictFile(filePath, &mHeaderPolicy, mUnigramCount, mBigramCount);
}
void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) {
@ -272,4 +300,13 @@ bool DynamicPatriciaTriePolicy::needsToRunGC() const {
return mBufferWithExtendableBuffer.isNearSizeLimit();
}
void DynamicPatriciaTriePolicy::getProperty(const char *const query, char *const outResult,
const int maxResultLength) const {
if (strncmp(query, UNIGRAM_COUNT_QUERY, maxResultLength) == 0) {
snprintf(outResult, maxResultLength, "%d", mUnigramCount);
} else if (strncmp(query, BIGRAM_COUNT_QUERY, maxResultLength) == 0) {
snprintf(outResult, maxResultLength, "%d", mBigramCount);
}
}
} // namespace latinime

View File

@ -37,7 +37,9 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
mBufferWithExtendableBuffer(mBuffer->getBuffer() + mHeaderPolicy.getSize(),
mBuffer->getBufferSize() - mHeaderPolicy.getSize()),
mShortcutListPolicy(&mBufferWithExtendableBuffer),
mBigramListPolicy(&mBufferWithExtendableBuffer, &mShortcutListPolicy) {}
mBigramListPolicy(&mBufferWithExtendableBuffer, &mShortcutListPolicy),
mUnigramCount(mHeaderPolicy.getUnigramCount()),
mBigramCount(mHeaderPolicy.getBigramCount()) {}
~DynamicPatriciaTriePolicy() {
delete mBuffer;
@ -91,14 +93,22 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
bool needsToRunGC() const;
void getProperty(const char *const query, char *const outResult,
const int maxResultLength) const;
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTriePolicy);
static const char*const UNIGRAM_COUNT_QUERY;
static const char*const BIGRAM_COUNT_QUERY;
const MmappedBuffer *const mBuffer;
const HeaderPolicy mHeaderPolicy;
BufferWithExtendableBuffer mBufferWithExtendableBuffer;
DynamicShortcutListPolicy mShortcutListPolicy;
DynamicBigramListPolicy mBigramListPolicy;
int mUnigramCount;
int mBigramCount;
};
} // namespace latinime
#endif // LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H

View File

@ -36,7 +36,8 @@ const size_t DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE = 2 * 1024 *
bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
DynamicPatriciaTrieReadingHelper *const readingHelper,
const int *const wordCodePoints, const int codePointCount, const int probability) {
const int *const wordCodePoints, const int codePointCount, const int probability,
bool *const outAddedNewUnigram) {
int parentPos = NOT_A_DICT_POS;
while (!readingHelper->isEnd()) {
const int matchedCodePointCount = readingHelper->getPrevTotalCodePointCount();
@ -54,6 +55,7 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
const int nextIndex = matchedCodePointCount + j;
if (nextIndex >= codePointCount || !readingHelper->isMatchedCodePoint(j,
wordCodePoints[matchedCodePointCount + j])) {
*outAddedNewUnigram = true;
return reallocatePtNodeAndAddNewPtNodes(nodeReader,
readingHelper->getMergedNodeCodePoints(), j, probability,
wordCodePoints + matchedCodePointCount,
@ -63,9 +65,10 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
// All characters are matched.
if (codePointCount == readingHelper->getTotalCodePointCount()) {
return setPtNodeProbability(nodeReader, probability,
readingHelper->getMergedNodeCodePoints());
readingHelper->getMergedNodeCodePoints(), outAddedNewUnigram);
}
if (!nodeReader->hasChildren()) {
*outAddedNewUnigram = true;
return createChildrenPtNodeArrayAndAChildPtNode(nodeReader, probability,
wordCodePoints + readingHelper->getTotalCodePointCount(),
codePointCount - readingHelper->getTotalCodePointCount());
@ -79,6 +82,7 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
return false;
}
int pos = readingHelper->getPosOfLastForwardLinkField();
*outAddedNewUnigram = true;
return createAndInsertNodeIntoPtNodeArray(parentPos,
wordCodePoints + readingHelper->getPrevTotalCodePointCount(),
codePointCount - readingHelper->getPrevTotalCodePointCount(),
@ -86,7 +90,7 @@ bool DynamicPatriciaTrieWritingHelper::addUnigramWord(
}
bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const int word1Pos,
const int probability) {
const int probability, bool *const outAddedNewBigram) {
int mMergedNodeCodePoints[MAX_WORD_LENGTH];
DynamicPatriciaTrieNodeReader nodeReader(mBuffer, mBigramPolicy, mShortcutPolicy);
nodeReader.fetchNodeInfoInBufferFromPtNodePosAndGetNodeCodePoints(word0Pos, MAX_WORD_LENGTH,
@ -107,9 +111,11 @@ bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const
if (nodeReader.getBigramsPos() != NOT_A_DICT_POS) {
// Insert a new bigram entry into the existing bigram list.
int bigramListPos = nodeReader.getBigramsPos();
return mBigramPolicy->addNewBigramEntryToBigramList(word1Pos, probability, &bigramListPos);
return mBigramPolicy->addNewBigramEntryToBigramList(word1Pos, probability, &bigramListPos,
outAddedNewBigram);
} else {
// The PtNode doesn't have a bigram list.
*outAddedNewBigram = true;
// First, Write a bigram entry at the tail position of the PtNode.
if (!mBigramPolicy->writeNewBigramEntry(word1Pos, probability, &writingPos)) {
return false;
@ -138,9 +144,12 @@ bool DynamicPatriciaTrieWritingHelper::removeBigramWords(const int word0Pos, con
}
void DynamicPatriciaTrieWritingHelper::writeToDictFile(const char *const fileName,
const HeaderPolicy *const headerPolicy) {
const HeaderPolicy *const headerPolicy, const int unigramCount, const int bigramCount) {
BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, false /* updatesLastUpdatedTime */)) {
const int extendedRegionSize = headerPolicy->getExtendedRegionSize() +
mBuffer->getTailPosition() - mBuffer->getOriginalBufferSize();
if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, false /* updatesLastUpdatedTime */,
unigramCount, bigramCount, extendedRegionSize)) {
return;
}
DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, mBuffer);
@ -148,13 +157,16 @@ void DynamicPatriciaTrieWritingHelper::writeToDictFile(const char *const fileNam
void DynamicPatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos,
const char *const fileName, const HeaderPolicy *const headerPolicy) {
BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */)) {
return;
}
BufferWithExtendableBuffer newDictBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */,
MAX_DICTIONARY_SIZE);
if (!runGC(rootPtNodeArrayPos, &newDictBuffer)) {
int unigramCount = 0;
int bigramCount = 0;
if (!runGC(rootPtNodeArrayPos, &newDictBuffer, &unigramCount, &bigramCount)) {
return;
}
BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */,
unigramCount, bigramCount, 0 /* extendedRegionSize */)) {
return;
}
DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, &newDictBuffer);
@ -335,9 +347,10 @@ bool DynamicPatriciaTrieWritingHelper::createAndInsertNodeIntoPtNodeArray(const
bool DynamicPatriciaTrieWritingHelper::setPtNodeProbability(
const DynamicPatriciaTrieNodeReader *const originalPtNode, const int probability,
const int *const codePoints) {
const int *const codePoints, bool *const outAddedNewUnigram) {
if (originalPtNode->isTerminal()) {
// Overwrites the probability.
*outAddedNewUnigram = false;
int probabilityFieldPos = originalPtNode->getProbabilityFieldPos();
if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition(mBuffer,
probability, &probabilityFieldPos)) {
@ -345,6 +358,7 @@ bool DynamicPatriciaTrieWritingHelper::setPtNodeProbability(
}
} else {
// Make the node terminal and write the probability.
*outAddedNewUnigram = true;
int movedPos = mBuffer->getTailPosition();
if (!markNodeAsMovedAndSetPosition(originalPtNode, movedPos, movedPos)) {
return false;
@ -460,7 +474,8 @@ bool DynamicPatriciaTrieWritingHelper::reallocatePtNodeAndAddNewPtNodes(
}
bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
BufferWithExtendableBuffer *const bufferToWrite) {
BufferWithExtendableBuffer *const bufferToWrite, int *const outUnigramCount,
int *const outBigramCount) {
DynamicPatriciaTrieReadingHelper readingHelper(mBuffer, mBigramPolicy, mShortcutPolicy);
readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
DynamicPatriciaTrieGcEventListeners
@ -505,6 +520,8 @@ bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
&traversePolicyToUpdateAllPositionFields)) {
return false;
}
*outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount();
*outBigramCount = traversePolicyToUpdateAllPositionFields.getBigramCount();
return true;
}

View File

@ -56,15 +56,18 @@ class DynamicPatriciaTrieWritingHelper {
// Add a word to the dictionary. If the word already exists, update the probability.
bool addUnigramWord(DynamicPatriciaTrieReadingHelper *const readingHelper,
const int *const wordCodePoints, const int codePointCount, const int probability);
const int *const wordCodePoints, const int codePointCount, const int probability,
bool *const outAddedNewUnigram);
// Add a bigram relation from word0Pos to word1Pos.
bool addBigramWords(const int word0Pos, const int word1Pos, const int probability);
bool addBigramWords(const int word0Pos, const int word1Pos, const int probability,
bool *const outAddedNewBigram);
// Remove a bigram relation from word0Pos to word1Pos.
bool removeBigramWords(const int word0Pos, const int word1Pos);
void writeToDictFile(const char *const fileName, const HeaderPolicy *const headerPolicy);
void writeToDictFile(const char *const fileName, const HeaderPolicy *const headerPolicy,
const int unigramCount, const int bigramCount);
void writeToDictFileWithGC(const int rootPtNodeArrayPos, const char *const fileName,
const HeaderPolicy *const headerPolicy);
@ -107,7 +110,7 @@ class DynamicPatriciaTrieWritingHelper {
const int nodeCodePointCount, const int probability, int *const forwardLinkFieldPos);
bool setPtNodeProbability(const DynamicPatriciaTrieNodeReader *const originalNode,
const int probability, const int *const codePoints);
const int probability, const int *const codePoints, bool *const outAddedNewUnigram);
bool createChildrenPtNodeArrayAndAChildPtNode(
const DynamicPatriciaTrieNodeReader *const parentNode, const int probability,
@ -122,7 +125,8 @@ class DynamicPatriciaTrieWritingHelper {
const int probabilityOfNewPtNode, const int *const newNodeCodePoints,
const int newNodeCodePointCount);
bool runGC(const int rootPtNodeArrayPos, BufferWithExtendableBuffer *const bufferToWrite);
bool runGC(const int rootPtNodeArrayPos, BufferWithExtendableBuffer *const bufferToWrite,
int *const outUnigramCount, int *const outBigramCount);
};
} // namespace latinime
#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H */

View File

@ -16,17 +16,15 @@
#include "suggest/policyimpl/dictionary/header/header_policy.h"
#include <cstddef>
#include <cstdio>
#include <ctime>
namespace latinime {
// Note that these are corresponding definitions in Java side in FormatSpec.FileHeader.
const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = "MULTIPLE_WORDS_DEMOTION_RATE";
const char *const HeaderPolicy::USES_FORGETTING_CURVE_KEY = "USES_FORGETTING_CURVE";
const char *const HeaderPolicy::LAST_UPDATED_TIME_KEY = "date";
const char *const HeaderPolicy::UNIGRAM_COUNT_KEY = "UNIGRAM_COUNT";
const char *const HeaderPolicy::BIGRAM_COUNT_KEY = "BIGRAM_COUNT";
const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE";
const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100;
const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f;
@ -55,33 +53,17 @@ void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *out
}
float HeaderPolicy::readMultipleWordCostMultiplier() const {
std::vector<int> keyVector;
HeaderReadWriteUtils::insertCharactersIntoVector(MULTIPLE_WORDS_DEMOTION_RATE_KEY, &keyVector);
const int demotionRate = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
&keyVector, DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE);
MULTIPLE_WORDS_DEMOTION_RATE_KEY, DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE);
if (demotionRate <= 0) {
return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
}
return MULTIPLE_WORD_COST_MULTIPLIER_SCALE / static_cast<float>(demotionRate);
}
bool HeaderPolicy::readUsesForgettingCurveFlag() const {
std::vector<int> keyVector;
HeaderReadWriteUtils::insertCharactersIntoVector(USES_FORGETTING_CURVE_KEY, &keyVector);
return HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, &keyVector,
false /* defaultValue */);
}
// Returns current time when the key is not found or the value is invalid.
int HeaderPolicy::readLastUpdatedTime() const {
std::vector<int> keyVector;
HeaderReadWriteUtils::insertCharactersIntoVector(LAST_UPDATED_TIME_KEY, &keyVector);
return HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, &keyVector,
time(0) /* defaultValue */);
}
bool HeaderPolicy::writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite,
const bool updatesLastUpdatedTime) const {
const bool updatesLastUpdatedTime, const int unigramCount, const int bigramCount,
const int extendedRegionSize) const {
int writingPos = 0;
if (!HeaderReadWriteUtils::writeDictionaryVersion(bufferToWrite, mDictFormatVersion,
&writingPos)) {
@ -97,22 +79,20 @@ bool HeaderPolicy::writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferT
&writingPos)) {
return false;
}
HeaderReadWriteUtils::AttributeMap attributeMapTowrite(mAttributeMap);
HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, UNIGRAM_COUNT_KEY, unigramCount);
HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, BIGRAM_COUNT_KEY, bigramCount);
HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, EXTENDED_REGION_SIZE_KEY,
extendedRegionSize);
if (updatesLastUpdatedTime) {
// Set current time as a last updated time.
HeaderReadWriteUtils::AttributeMap attributeMapTowrite(mAttributeMap);
std::vector<int> updatedTimekey;
HeaderReadWriteUtils::insertCharactersIntoVector(LAST_UPDATED_TIME_KEY, &updatedTimekey);
HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, &updatedTimekey, time(0));
HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, LAST_UPDATED_TIME_KEY,
time(0));
}
if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &attributeMapTowrite,
&writingPos)) {
return false;
}
} else {
if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &mAttributeMap,
&writingPos)) {
return false;
}
}
// Writes an actual header size.
if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(bufferToWrite, writingPos,
&headerSizeFieldPos)) {

View File

@ -17,6 +17,7 @@
#ifndef LATINIME_HEADER_POLICY_H
#define LATINIME_HEADER_POLICY_H
#include <ctime>
#include <stdint.h>
#include "defines.h"
@ -35,8 +36,16 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)),
mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)),
mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
mUsesForgettingCurve(readUsesForgettingCurveFlag()),
mLastUpdatedTime(readLastUpdatedTime()) {}
mUsesForgettingCurve(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
USES_FORGETTING_CURVE_KEY, false /* defaultValue */)),
mLastUpdatedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
LAST_UPDATED_TIME_KEY, time(0) /* defaultValue */)),
mUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
UNIGRAM_COUNT_KEY, 0 /* defaultValue */)),
mBigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
BIGRAM_COUNT_KEY, 0 /* defaultValue */)),
mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)) {}
// Constructs header information using an attribute map.
HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,
@ -44,9 +53,12 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
: mDictFormatVersion(dictFormatVersion),
mDictionaryFlags(HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap(
attributeMap)), mSize(0), mAttributeMap(*attributeMap),
mMultiWordCostMultiplier(readUsesForgettingCurveFlag()),
mUsesForgettingCurve(readUsesForgettingCurveFlag()),
mLastUpdatedTime(readLastUpdatedTime()) {}
mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
mUsesForgettingCurve(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
USES_FORGETTING_CURVE_KEY, false /* defaultValue */)),
mLastUpdatedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
LAST_UPDATED_TIME_KEY, time(0) /* defaultValue */)),
mUnigramCount(0), mBigramCount(0), mExtendedRegionSize(0) {}
~HeaderPolicy() {}
@ -78,11 +90,24 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
return mLastUpdatedTime;
}
AK_FORCE_INLINE int getUnigramCount() const {
return mUnigramCount;
}
AK_FORCE_INLINE int getBigramCount() const {
return mBigramCount;
}
AK_FORCE_INLINE int getExtendedRegionSize() const {
return mExtendedRegionSize;
}
void readHeaderValueOrQuestionMark(const char *const key,
int *outValue, int outValueSize) const;
bool writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite,
const bool updatesLastUpdatedTime) const;
const bool updatesLastUpdatedTime, const int unigramCount,
const int bigramCount, const int extendedRegionSize) const;
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderPolicy);
@ -90,6 +115,9 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY;
static const char *const USES_FORGETTING_CURVE_KEY;
static const char *const LAST_UPDATED_TIME_KEY;
static const char *const UNIGRAM_COUNT_KEY;
static const char *const BIGRAM_COUNT_KEY;
static const char *const EXTENDED_REGION_SIZE_KEY;
static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE;
static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE;
@ -100,13 +128,12 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
const float mMultiWordCostMultiplier;
const bool mUsesForgettingCurve;
const int mLastUpdatedTime;
const int mUnigramCount;
const int mBigramCount;
const int mExtendedRegionSize;
float readMultipleWordCostMultiplier() const;
bool readUsesForgettingCurveFlag() const;
int readLastUpdatedTime() const;
static HeaderReadWriteUtils::AttributeMap createAttributeMapAndReadAllAttributes(
const uint8_t *const dictBuf);
};

View File

@ -68,18 +68,12 @@ const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY
/* static */ HeaderReadWriteUtils::DictionaryFlags
HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap(
const HeaderReadWriteUtils::AttributeMap *const attributeMap) {
AttributeMap::key_type key;
insertCharactersIntoVector(REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, &key);
const bool requiresGermanUmlautProcessing = readBoolAttributeValue(attributeMap, &key,
false /* defaultValue */);
key.clear();
insertCharactersIntoVector(REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY, &key);
const bool requiresFrenchLigatureProcessing = readBoolAttributeValue(attributeMap, &key,
false /* defaultValue */);
key.clear();
insertCharactersIntoVector(SUPPORTS_DYNAMIC_UPDATE_KEY, &key);
const bool supportsDynamicUpdate = readBoolAttributeValue(attributeMap, &key,
false /* defaultValue */);
const bool requiresGermanUmlautProcessing = readBoolAttributeValue(attributeMap,
REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, false /* defaultValue */);
const bool requiresFrenchLigatureProcessing = readBoolAttributeValue(attributeMap,
REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY, false /* defaultValue */);
const bool supportsDynamicUpdate = readBoolAttributeValue(attributeMap,
SUPPORTS_DYNAMIC_UPDATE_KEY, false /* defaultValue */);
DictionaryFlags dictflags = NO_FLAGS;
dictflags |= requiresGermanUmlautProcessing ? GERMAN_UMLAUT_PROCESSING_FLAG : 0;
dictflags |= requiresFrenchLigatureProcessing ? FRENCH_LIGATURE_PROCESSING_FLAG : 0;
@ -160,11 +154,18 @@ const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY
}
/* static */ void HeaderReadWriteUtils::setBoolAttribute(AttributeMap *const headerAttributes,
const AttributeMap::key_type *const key, const bool value) {
const char *const key, const bool value) {
setIntAttribute(headerAttributes, key, value ? 1 : 0);
}
/* static */ void HeaderReadWriteUtils::setIntAttribute(AttributeMap *const headerAttributes,
const char *const key, const int value) {
AttributeMap::key_type keyVector;
insertCharactersIntoVector(key, &keyVector);
setIntAttributeInner(headerAttributes, &keyVector, value);
}
/* static */ void HeaderReadWriteUtils::setIntAttributeInner(AttributeMap *const headerAttributes,
const AttributeMap::key_type *const key, const int value) {
AttributeMap::mapped_type valueVector;
char charBuf[LARGEST_INT_DIGIT_COUNT + 1];
@ -174,7 +175,7 @@ const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY
}
/* static */ bool HeaderReadWriteUtils::readBoolAttributeValue(
const AttributeMap *const headerAttributes, const AttributeMap::key_type *const key,
const AttributeMap *const headerAttributes, const char *const key,
const bool defaultValue) {
const int intDefaultValue = defaultValue ? 1 : 0;
const int intValue = readIntAttributeValue(headerAttributes, key, intDefaultValue);
@ -182,6 +183,14 @@ const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY
}
/* static */ int HeaderReadWriteUtils::readIntAttributeValue(
const AttributeMap *const headerAttributes, const char *const key,
const int defaultValue) {
AttributeMap::key_type keyVector;
insertCharactersIntoVector(key, &keyVector);
return readIntAttributeValueInner(headerAttributes, &keyVector, defaultValue);
}
/* static */ int HeaderReadWriteUtils::readIntAttributeValueInner(
const AttributeMap *const headerAttributes, const AttributeMap::key_type *const key,
const int defaultValue) {
AttributeMap::const_iterator it = headerAttributes->find(*key);

View File

@ -76,16 +76,16 @@ class HeaderReadWriteUtils {
* Methods for header attributes.
*/
static void setBoolAttribute(AttributeMap *const headerAttributes,
const AttributeMap::key_type *const key, const bool value);
const char *const key, const bool value);
static void setIntAttribute(AttributeMap *const headerAttributes,
const AttributeMap::key_type *const key, const int value);
const char *const key, const int value);
static bool readBoolAttributeValue(const AttributeMap *const headerAttributes,
const AttributeMap::key_type *const key, const bool defaultValue);
const char *const key, const bool defaultValue);
static int readIntAttributeValue(const AttributeMap *const headerAttributes,
const AttributeMap::key_type *const key, const int defaultValue);
const char *const key, const int defaultValue);
static void insertCharactersIntoVector(const char *const characters,
AttributeMap::key_type *const key);
@ -112,6 +112,12 @@ class HeaderReadWriteUtils {
static const char *const SUPPORTS_DYNAMIC_UPDATE_KEY;
static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY;
static const char *const REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY;
static void setIntAttributeInner(AttributeMap *const headerAttributes,
const AttributeMap::key_type *const key, const int value);
static int readIntAttributeValueInner(const AttributeMap *const headerAttributes,
const AttributeMap::key_type *const key, const int defaultValue);
};
}
#endif /* LATINIME_HEADER_READ_WRITE_UTILS_H */

View File

@ -113,6 +113,14 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
return false;
}
void getProperty(const char *const query, char *const outResult,
const int maxResultLength) const {
// getProperty is not supported for this class.
if (maxResultLength > 0) {
outResult[0] = '\0';
}
}
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTriePolicy);

View File

@ -43,7 +43,8 @@ const char *const DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE =
const HeaderReadWriteUtils::AttributeMap *const attributeMap) {
BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
HeaderPolicy headerPolicy(FormatUtils::VERSION_3, attributeMap);
headerPolicy.writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */);
headerPolicy.writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */,
0 /* unigramCount */, 0 /* bigramCount */, 0 /* extendedRegionSize */);
BufferWithExtendableBuffer bodyBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */);
if (!DynamicPatriciaTrieWritingUtils::writeEmptyDictionary(&bodyBuffer, 0 /* rootPos */)) {
return false;

View File

@ -27,6 +27,7 @@ import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Random;
@ -625,4 +626,57 @@ public class BinaryDictionaryTests extends AndroidTestCase {
dictFile.delete();
}
public void testUnigramAndBigramCount() {
final int flashWithGCIterationCount = 10;
final int codePointSetSize = 50;
final int unigramCountPerIteration = 1000;
final int bigramCountPerIteration = 2000;
final int seed = 1123581321;
final Random random = new Random(seed);
File dictFile = null;
try {
dictFile = createEmptyDictionaryAndGetFile("TestBinaryDictionary");
} catch (IOException e) {
fail("IOException while writing an initial dictionary : " + e);
}
final ArrayList<String> words = new ArrayList<String>();
final HashSet<Pair<String, String>> bigrams = new HashSet<Pair<String, String>>();
final int[] codePointSet = CodePointUtils.generateCodePointSet(codePointSetSize, random);
BinaryDictionary binaryDictionary;
for (int i = 0; i < flashWithGCIterationCount; i++) {
binaryDictionary = new BinaryDictionary(dictFile.getAbsolutePath(),
0 /* offset */, dictFile.length(), true /* useFullEditDistance */,
Locale.getDefault(), TEST_LOCALE, true /* isUpdatable */);
for (int j = 0; j < unigramCountPerIteration; j++) {
final String word = CodePointUtils.generateWord(random, codePointSet);
words.add(word);
final int unigramProbability = random.nextInt(0xFF);
binaryDictionary.addUnigramWord(word, unigramProbability);
}
for (int j = 0; j < bigramCountPerIteration; j++) {
final String word0 = words.get(random.nextInt(words.size()));
final String word1 = words.get(random.nextInt(words.size()));
bigrams.add(new Pair<String, String>(word0, word1));
final int bigramProbability = random.nextInt(0xF);
binaryDictionary.addBigramWords(word0, word1, bigramProbability);
}
assertEquals(new HashSet<String>(words).size(), Integer.parseInt(
binaryDictionary.getPropertyForTests(BinaryDictionary.UNIGRAM_COUNT_QUERY)));
assertEquals(new HashSet<Pair<String, String>>(bigrams).size(), Integer.parseInt(
binaryDictionary.getPropertyForTests(BinaryDictionary.BIGRAM_COUNT_QUERY)));
binaryDictionary.flushWithGC();
assertEquals(new HashSet<String>(words).size(), Integer.parseInt(
binaryDictionary.getPropertyForTests(BinaryDictionary.UNIGRAM_COUNT_QUERY)));
assertEquals(new HashSet<Pair<String, String>>(bigrams).size(), Integer.parseInt(
binaryDictionary.getPropertyForTests(BinaryDictionary.BIGRAM_COUNT_QUERY)));
binaryDictionary.close();
}
dictFile.delete();
}
}