Merge "Implement updateCounter() by using existing entry adding methods."
This commit is contained in:
commit
0afad267c5
13 changed files with 129 additions and 21 deletions
|
@ -374,7 +374,7 @@ static bool latinime_BinaryDictionary_addUnigramEntry(JNIEnv *env, jclass clazz,
|
|||
// Use 1 for count to indicate the word has been inputted.
|
||||
const UnigramProperty unigramProperty(isBeginningOfSentence, isNotAWord,
|
||||
isBlacklisted, probability, HistoricalInfo(timestamp, 0 /* level */, 1 /* count */),
|
||||
&shortcuts);
|
||||
std::move(shortcuts));
|
||||
return dictionary->addUnigramEntry(CodePointArrayView(codePoints, codePointCount),
|
||||
&unigramProperty);
|
||||
}
|
||||
|
@ -434,10 +434,16 @@ static bool latinime_BinaryDictionary_updateCounter(JNIEnv *env, jclass clazz, j
|
|||
if (!dictionary) {
|
||||
return false;
|
||||
}
|
||||
jsize wordLength = env->GetArrayLength(word);
|
||||
int wordCodePoints[wordLength];
|
||||
env->GetIntArrayRegion(word, 0, wordLength, wordCodePoints);
|
||||
return false;
|
||||
const PrevWordsInfo prevWordsInfo = JniDataUtils::constructPrevWordsInfo(env,
|
||||
prevWordCodePointArrays, isBeginningOfSentenceArray,
|
||||
env->GetArrayLength(prevWordCodePointArrays));
|
||||
jsize codePointCount = env->GetArrayLength(word);
|
||||
int wordCodePoints[codePointCount];
|
||||
env->GetIntArrayRegion(word, 0, codePointCount, wordCodePoints);
|
||||
const HistoricalInfo historicalInfo(timestamp, 0 /* level */, count);
|
||||
return dictionary->updateCounter(&prevWordsInfo,
|
||||
CodePointArrayView(wordCodePoints, codePointCount), isValidWord == JNI_TRUE,
|
||||
historicalInfo);
|
||||
}
|
||||
|
||||
// Returns how many language model params are processed.
|
||||
|
@ -509,7 +515,7 @@ static int latinime_BinaryDictionary_addMultipleDictionaryEntries(JNIEnv *env, j
|
|||
// Use 1 for count to indicate the word has been inputted.
|
||||
const UnigramProperty unigramProperty(false /* isBeginningOfSentence */, isNotAWord,
|
||||
isBlacklisted, unigramProbability,
|
||||
HistoricalInfo(timestamp, 0 /* level */, 1 /* count */), &shortcuts);
|
||||
HistoricalInfo(timestamp, 0 /* level */, 1 /* count */), std::move(shortcuts));
|
||||
dictionary->addUnigramEntry(CodePointArrayView(word1CodePoints, word1Length),
|
||||
&unigramProperty);
|
||||
if (word0) {
|
||||
|
|
|
@ -155,6 +155,14 @@ bool Dictionary::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
|||
return mDictionaryStructureWithBufferPolicy->removeNgramEntry(prevWordsInfo, codePoints);
|
||||
}
|
||||
|
||||
bool Dictionary::updateCounter(const PrevWordsInfo *const prevWordsInfo,
|
||||
const CodePointArrayView codePoints, const bool isValidWord,
|
||||
const HistoricalInfo historicalInfo) {
|
||||
TimeKeeper::setCurrentTime();
|
||||
return mDictionaryStructureWithBufferPolicy->updateCounter(prevWordsInfo, codePoints,
|
||||
isValidWord, historicalInfo);
|
||||
}
|
||||
|
||||
bool Dictionary::flush(const char *const filePath) {
|
||||
TimeKeeper::setCurrentTime();
|
||||
return mDictionaryStructureWithBufferPolicy->flush(filePath);
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include "defines.h"
|
||||
#include "jni.h"
|
||||
#include "suggest/core/dictionary/ngram_listener.h"
|
||||
#include "suggest/core/dictionary/property/historical_info.h"
|
||||
#include "suggest/core/dictionary/property/word_property.h"
|
||||
#include "suggest/core/policy/dictionary_header_structure_policy.h"
|
||||
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
|
||||
|
@ -90,6 +91,10 @@ class Dictionary {
|
|||
bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||
const CodePointArrayView codePoints);
|
||||
|
||||
bool updateCounter(const PrevWordsInfo *const prevWordsInfo,
|
||||
const CodePointArrayView codePoints, const bool isValidWord,
|
||||
const HistoricalInfo historicalInfo);
|
||||
|
||||
bool flush(const char *const filePath);
|
||||
|
||||
bool flushWithGC(const char *const filePath);
|
||||
|
|
|
@ -27,7 +27,7 @@ namespace latinime {
|
|||
class NgramProperty {
|
||||
public:
|
||||
NgramProperty(const std::vector<int> &&targetCodePoints, const int probability,
|
||||
const HistoricalInfo &historicalInfo)
|
||||
const HistoricalInfo historicalInfo)
|
||||
: mTargetCodePoints(std::move(targetCodePoints)), mProbability(probability),
|
||||
mHistoricalInfo(historicalInfo) {}
|
||||
|
||||
|
|
|
@ -54,11 +54,18 @@ class UnigramProperty {
|
|||
mProbability(NOT_A_PROBABILITY), mHistoricalInfo(), mShortcuts() {}
|
||||
|
||||
UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
|
||||
const bool isBlacklisted, const int probability, const HistoricalInfo &historicalInfo,
|
||||
const std::vector<ShortcutProperty> *const shortcuts)
|
||||
const bool isBlacklisted, const int probability, const HistoricalInfo historicalInfo,
|
||||
const std::vector<ShortcutProperty> &&shortcuts)
|
||||
: mRepresentsBeginningOfSentence(representsBeginningOfSentence),
|
||||
mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), mProbability(probability),
|
||||
mHistoricalInfo(historicalInfo), mShortcuts(*shortcuts) {}
|
||||
mHistoricalInfo(historicalInfo), mShortcuts(std::move(shortcuts)) {}
|
||||
|
||||
// Without shortcuts.
|
||||
UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
|
||||
const bool isBlacklisted, const int probability, const HistoricalInfo historicalInfo)
|
||||
: mRepresentsBeginningOfSentence(representsBeginningOfSentence),
|
||||
mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), mProbability(probability),
|
||||
mHistoricalInfo(historicalInfo), mShortcuts() {}
|
||||
|
||||
bool representsBeginningOfSentence() const {
|
||||
return mRepresentsBeginningOfSentence;
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
|
||||
#include "defines.h"
|
||||
#include "suggest/core/dictionary/binary_dictionary_shortcut_iterator.h"
|
||||
#include "suggest/core/dictionary/property/historical_info.h"
|
||||
#include "suggest/core/dictionary/property/word_property.h"
|
||||
#include "suggest/core/dictionary/word_attributes.h"
|
||||
#include "utils/int_array_view.h"
|
||||
|
@ -87,6 +88,11 @@ class DictionaryStructureWithBufferPolicy {
|
|||
virtual bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||
const CodePointArrayView wordCodePoints) = 0;
|
||||
|
||||
// Returns whether the update was successful or not.
|
||||
virtual bool updateCounter(const PrevWordsInfo *const prevWordsInfo,
|
||||
const CodePointArrayView wordCodePoints, const bool isValidWord,
|
||||
const HistoricalInfo historicalInfo) = 0;
|
||||
|
||||
// Returns whether the flush was successful or not.
|
||||
virtual bool flush(const char *const filePath) = 0;
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@ class PrevWordsInfo {
|
|||
clear();
|
||||
}
|
||||
|
||||
PrevWordsInfo(PrevWordsInfo &&prevWordsInfo)
|
||||
PrevWordsInfo(const PrevWordsInfo &prevWordsInfo)
|
||||
: mPrevWordCount(prevWordsInfo.mPrevWordCount) {
|
||||
for (size_t i = 0; i < mPrevWordCount; ++i) {
|
||||
mPrevWordCodePointCount[i] = prevWordsInfo.mPrevWordCodePointCount[i];
|
||||
|
@ -73,6 +73,16 @@ class PrevWordsInfo {
|
|||
mIsBeginningOfSentence[0] = isBeginningOfSentence;
|
||||
}
|
||||
|
||||
// Returns the stored previous-word count (mPrevWordCount).
size_t getPrevWordCount() const {
|
||||
return mPrevWordCount;
|
||||
}
|
||||
|
||||
// TODO: Remove.
|
||||
const PrevWordsInfo getTrimmedPrevWordsInfo(const size_t maxPrevWordCount) const {
|
||||
return PrevWordsInfo(mPrevWordCodePoints, mPrevWordCodePointCount, mIsBeginningOfSentence,
|
||||
std::min(mPrevWordCount, maxPrevWordCount));
|
||||
}
|
||||
|
||||
bool isValid() const {
|
||||
if (mPrevWordCodePointCount[0] > 0) {
|
||||
return true;
|
||||
|
@ -112,7 +122,7 @@ class PrevWordsInfo {
|
|||
}
|
||||
|
||||
private:
|
||||
DISALLOW_COPY_AND_ASSIGN(PrevWordsInfo);
|
||||
DISALLOW_ASSIGNMENT_OPERATOR(PrevWordsInfo);
|
||||
|
||||
static int getWordId(const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
|
||||
const int *const wordCodePoints, const int wordCodePointCount,
|
||||
|
|
|
@ -52,6 +52,7 @@ const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_C
|
|||
const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024;
|
||||
const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS =
|
||||
Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS;
|
||||
const int Ver4PatriciaTriePolicy::DUMMY_PROBABILITY_FOR_VALID_WORDS = 1;
|
||||
|
||||
void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode,
|
||||
DicNodeVector *const childDicNodes) const {
|
||||
|
@ -339,11 +340,9 @@ bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsI
|
|||
}
|
||||
if (prevWordIds[0] == NOT_A_WORD_ID) {
|
||||
if (prevWordsInfo->isNthPrevWordBeginningOfSentence(1 /* n */)) {
|
||||
const std::vector<UnigramProperty::ShortcutProperty> shortcuts;
|
||||
const UnigramProperty beginningOfSentenceUnigramProperty(
|
||||
true /* representsBeginningOfSentence */, true /* isNotAWord */,
|
||||
false /* isBlacklisted */, MAX_PROBABILITY /* probability */,
|
||||
HistoricalInfo(), &shortcuts);
|
||||
false /* isBlacklisted */, MAX_PROBABILITY /* probability */, HistoricalInfo());
|
||||
if (!addUnigramEntry(prevWordsInfo->getNthPrevWordCodePoints(1 /* n */),
|
||||
&beginningOfSentenceUnigramProperty)) {
|
||||
AKLOGE("Cannot add unigram entry for the beginning-of-sentence.");
|
||||
|
@ -414,6 +413,29 @@ bool Ver4PatriciaTriePolicy::removeNgramEntry(const PrevWordsInfo *const prevWor
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
bool Ver4PatriciaTriePolicy::updateCounter(const PrevWordsInfo *const prevWordsInfo,
|
||||
const CodePointArrayView wordCodePoints, const bool isValidWord,
|
||||
const HistoricalInfo historicalInfo) {
|
||||
if (!mBuffers->isUpdatable()) {
|
||||
AKLOGI("Warning: updateCounter() is called for non-updatable dictionary.");
|
||||
return false;
|
||||
}
|
||||
const int probability = isValidWord ? DUMMY_PROBABILITY_FOR_VALID_WORDS : NOT_A_PROBABILITY;
|
||||
const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */,
|
||||
false /* isNotAWord */, false /*isBlacklisted*/, probability, historicalInfo);
|
||||
if (!addUnigramEntry(wordCodePoints, &unigramProperty)) {
|
||||
AKLOGE("Cannot update unigarm entry in updateCounter().");
|
||||
return false;
|
||||
}
|
||||
const NgramProperty ngramProperty(wordCodePoints.toVector(), probability, historicalInfo);
|
||||
if (!addNgramEntry(prevWordsInfo, &ngramProperty)) {
|
||||
AKLOGE("Cannot update unigarm entry in updateCounter().");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Ver4PatriciaTriePolicy::flush(const char *const filePath) {
|
||||
if (!mBuffers->isUpdatable()) {
|
||||
AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath);
|
||||
|
@ -551,7 +573,7 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
|
|||
}
|
||||
const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
|
||||
ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
|
||||
*historicalInfo, &shortcuts);
|
||||
*historicalInfo, std::move(shortcuts));
|
||||
return WordProperty(wordCodePoints.toVector(), &unigramProperty, &ngrams);
|
||||
}
|
||||
|
||||
|
|
|
@ -118,6 +118,10 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
|||
bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||
const CodePointArrayView wordCodePoints);
|
||||
|
||||
bool updateCounter(const PrevWordsInfo *const prevWordsInfo,
|
||||
const CodePointArrayView wordCodePoints, const bool isValidWord,
|
||||
const HistoricalInfo historicalInfo);
|
||||
|
||||
bool flush(const char *const filePath);
|
||||
|
||||
bool flushWithGC(const char *const filePath);
|
||||
|
@ -147,6 +151,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
|||
// prevent the dictionary from overflowing.
|
||||
static const int MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS;
|
||||
static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS;
|
||||
static const int DUMMY_PROBABILITY_FOR_VALID_WORDS;
|
||||
|
||||
const Ver4DictBuffers::Ver4DictBuffersPtr mBuffers;
|
||||
const HeaderPolicy *const mHeaderPolicy;
|
||||
|
|
|
@ -477,7 +477,7 @@ const WordProperty PatriciaTriePolicy::getWordProperty(
|
|||
}
|
||||
const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
|
||||
ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
|
||||
HistoricalInfo(), &shortcuts);
|
||||
HistoricalInfo(), std::move(shortcuts));
|
||||
return WordProperty(wordCodePoints.toVector(), &unigramProperty, &ngrams);
|
||||
}
|
||||
|
||||
|
|
|
@ -107,6 +107,14 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
|||
return false;
|
||||
}
|
||||
|
||||
// Stub implementation of updateCounter(): ignores all of its arguments, logs a warning
// and always returns false.
bool updateCounter(const PrevWordsInfo *const prevWordsInfo,
|
||||
const CodePointArrayView wordCodePoints, const bool isValidWord,
|
||||
const HistoricalInfo historicalInfo) {
|
||||
// This method should not be called for non-updatable dictionary.
|
||||
AKLOGI("Warning: updateCounter() is called for non-updatable dictionary.");
|
||||
return false;
|
||||
}
|
||||
|
||||
bool flush(const char *const filePath) {
|
||||
// This method should not be called for non-updatable dictionary.
|
||||
AKLOGI("Warning: flush() is called for non-updatable dictionary.");
|
||||
|
|
|
@ -43,6 +43,7 @@ const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_C
|
|||
const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024;
|
||||
const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS =
|
||||
Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS;
|
||||
const int Ver4PatriciaTriePolicy::DUMMY_PROBABILITY_FOR_VALID_WORDS = 1;
|
||||
|
||||
void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode,
|
||||
DicNodeVector *const childDicNodes) const {
|
||||
|
@ -298,11 +299,9 @@ bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsI
|
|||
if (!prevWordsInfo->isNthPrevWordBeginningOfSentence(i + 1 /* n */)) {
|
||||
return false;
|
||||
}
|
||||
const std::vector<UnigramProperty::ShortcutProperty> shortcuts;
|
||||
const UnigramProperty beginningOfSentenceUnigramProperty(
|
||||
true /* representsBeginningOfSentence */, true /* isNotAWord */,
|
||||
false /* isBlacklisted */, MAX_PROBABILITY /* probability */,
|
||||
HistoricalInfo(), &shortcuts);
|
||||
false /* isBlacklisted */, MAX_PROBABILITY /* probability */, HistoricalInfo());
|
||||
if (!addUnigramEntry(prevWordsInfo->getNthPrevWordCodePoints(1 /* n */),
|
||||
&beginningOfSentenceUnigramProperty)) {
|
||||
AKLOGE("Cannot add unigram entry for the beginning-of-sentence.");
|
||||
|
@ -364,6 +363,32 @@ bool Ver4PatriciaTriePolicy::removeNgramEntry(const PrevWordsInfo *const prevWor
|
|||
}
|
||||
}
|
||||
|
||||
bool Ver4PatriciaTriePolicy::updateCounter(const PrevWordsInfo *const prevWordsInfo,
|
||||
const CodePointArrayView wordCodePoints, const bool isValidWord,
|
||||
const HistoricalInfo historicalInfo) {
|
||||
if (!mBuffers->isUpdatable()) {
|
||||
AKLOGI("Warning: updateCounter() is called for non-updatable dictionary.");
|
||||
return false;
|
||||
}
|
||||
// TODO: Have count up method in language model dict content.
|
||||
const int probability = isValidWord ? DUMMY_PROBABILITY_FOR_VALID_WORDS : NOT_A_PROBABILITY;
|
||||
const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */,
|
||||
false /* isNotAWord */, false /*isBlacklisted*/, probability, historicalInfo);
|
||||
if (!addUnigramEntry(wordCodePoints, &unigramProperty)) {
|
||||
AKLOGE("Cannot update unigarm entry in updateCounter().");
|
||||
return false;
|
||||
}
|
||||
const NgramProperty ngramProperty(wordCodePoints.toVector(), probability, historicalInfo);
|
||||
for (size_t i = 1; i <= prevWordsInfo->getPrevWordCount(); ++i) {
|
||||
const PrevWordsInfo trimmedPrevWordsInfo(prevWordsInfo->getTrimmedPrevWordsInfo(i));
|
||||
if (!addNgramEntry(&trimmedPrevWordsInfo, &ngramProperty)) {
|
||||
AKLOGE("Cannot update ngram entry in updateCounter().");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Ver4PatriciaTriePolicy::flush(const char *const filePath) {
|
||||
if (!mBuffers->isUpdatable()) {
|
||||
AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath);
|
||||
|
@ -486,7 +511,7 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
|
|||
}
|
||||
const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(),
|
||||
probabilityEntry.isNotAWord(), probabilityEntry.isBlacklisted(),
|
||||
probabilityEntry.getProbability(), *historicalInfo, &shortcuts);
|
||||
probabilityEntry.getProbability(), *historicalInfo, std::move(shortcuts));
|
||||
return WordProperty(wordCodePoints.toVector(), &unigramProperty, &ngrams);
|
||||
}
|
||||
|
||||
|
|
|
@ -98,6 +98,10 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
|||
bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||
const CodePointArrayView wordCodePoints);
|
||||
|
||||
bool updateCounter(const PrevWordsInfo *const prevWordsInfo,
|
||||
const CodePointArrayView wordCodePoints, const bool isValidWord,
|
||||
const HistoricalInfo historicalInfo);
|
||||
|
||||
bool flush(const char *const filePath);
|
||||
|
||||
bool flushWithGC(const char *const filePath);
|
||||
|
@ -127,6 +131,8 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
|||
// prevent the dictionary from overflowing.
|
||||
static const int MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS;
|
||||
static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS;
|
||||
// TODO: Remove
|
||||
static const int DUMMY_PROBABILITY_FOR_VALID_WORDS;
|
||||
|
||||
const Ver4DictBuffers::Ver4DictBuffersPtr mBuffers;
|
||||
const HeaderPolicy *const mHeaderPolicy;
|
||||
|
|
Loading…
Reference in a new issue