Use PrevWordsInfo to add/remove n-gram (currently bigram) entries in native code.
Bug: 14119293 Bug: 14425059 Change-Id: I4b9a46bfd670b35195418eaee51456d44fb91b6d
This commit is contained in:
parent
c18b1c42f3
commit
9f8c9a0161
14 changed files with 135 additions and 81 deletions
|
@ -343,7 +343,7 @@ static void latinime_BinaryDictionary_addUnigramWord(JNIEnv *env, jclass clazz,
|
||||||
// Use 1 for count to indicate the word has inputted.
|
// Use 1 for count to indicate the word has inputted.
|
||||||
const UnigramProperty unigramProperty(isNotAWord, isBlacklisted,
|
const UnigramProperty unigramProperty(isNotAWord, isBlacklisted,
|
||||||
probability, timestamp, 0 /* level */, 1 /* count */, &shortcuts);
|
probability, timestamp, 0 /* level */, 1 /* count */, &shortcuts);
|
||||||
dictionary->addUnigramWord(codePoints, codePointCount, &unigramProperty);
|
dictionary->addUnigramEntry(codePoints, codePointCount, &unigramProperty);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void latinime_BinaryDictionary_addBigramWords(JNIEnv *env, jclass clazz, jlong dict,
|
static void latinime_BinaryDictionary_addBigramWords(JNIEnv *env, jclass clazz, jlong dict,
|
||||||
|
@ -363,7 +363,9 @@ static void latinime_BinaryDictionary_addBigramWords(JNIEnv *env, jclass clazz,
|
||||||
// Use 1 for count to indicate the bigram has inputted.
|
// Use 1 for count to indicate the bigram has inputted.
|
||||||
const BigramProperty bigramProperty(&bigramTargetCodePoints, probability,
|
const BigramProperty bigramProperty(&bigramTargetCodePoints, probability,
|
||||||
timestamp, 0 /* level */, 1 /* count */);
|
timestamp, 0 /* level */, 1 /* count */);
|
||||||
dictionary->addBigramWords(word0CodePoints, word0Length, &bigramProperty);
|
const PrevWordsInfo prevWordsInfo(word0CodePoints, word0Length,
|
||||||
|
false /* isBeginningOfSentence */);
|
||||||
|
dictionary->addNgramEntry(&prevWordsInfo, &bigramProperty);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void latinime_BinaryDictionary_removeBigramWords(JNIEnv *env, jclass clazz, jlong dict,
|
static void latinime_BinaryDictionary_removeBigramWords(JNIEnv *env, jclass clazz, jlong dict,
|
||||||
|
@ -378,8 +380,9 @@ static void latinime_BinaryDictionary_removeBigramWords(JNIEnv *env, jclass claz
|
||||||
jsize word1Length = env->GetArrayLength(word1);
|
jsize word1Length = env->GetArrayLength(word1);
|
||||||
int word1CodePoints[word1Length];
|
int word1CodePoints[word1Length];
|
||||||
env->GetIntArrayRegion(word1, 0, word1Length, word1CodePoints);
|
env->GetIntArrayRegion(word1, 0, word1Length, word1CodePoints);
|
||||||
dictionary->removeBigramWords(word0CodePoints, word0Length, word1CodePoints,
|
const PrevWordsInfo prevWordsInfo(word0CodePoints, word0Length,
|
||||||
word1Length);
|
false /* isBeginningOfSentence */);
|
||||||
|
dictionary->removeNgramEntry(&prevWordsInfo, word1CodePoints, word1Length);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns how many language model params are processed.
|
// Returns how many language model params are processed.
|
||||||
|
@ -449,7 +452,7 @@ static int latinime_BinaryDictionary_addMultipleDictionaryEntries(JNIEnv *env, j
|
||||||
// Use 1 for count to indicate the word has inputted.
|
// Use 1 for count to indicate the word has inputted.
|
||||||
const UnigramProperty unigramProperty(isNotAWord, isBlacklisted,
|
const UnigramProperty unigramProperty(isNotAWord, isBlacklisted,
|
||||||
unigramProbability, timestamp, 0 /* level */, 1 /* count */, &shortcuts);
|
unigramProbability, timestamp, 0 /* level */, 1 /* count */, &shortcuts);
|
||||||
dictionary->addUnigramWord(word1CodePoints, word1Length, &unigramProperty);
|
dictionary->addUnigramEntry(word1CodePoints, word1Length, &unigramProperty);
|
||||||
if (word0) {
|
if (word0) {
|
||||||
jint bigramProbability = env->GetIntField(languageModelParam, bigramProbabilityFieldId);
|
jint bigramProbability = env->GetIntField(languageModelParam, bigramProbabilityFieldId);
|
||||||
const std::vector<int> bigramTargetCodePoints(
|
const std::vector<int> bigramTargetCodePoints(
|
||||||
|
@ -457,7 +460,9 @@ static int latinime_BinaryDictionary_addMultipleDictionaryEntries(JNIEnv *env, j
|
||||||
// Use 1 for count to indicate the bigram has inputted.
|
// Use 1 for count to indicate the bigram has inputted.
|
||||||
const BigramProperty bigramProperty(&bigramTargetCodePoints, bigramProbability,
|
const BigramProperty bigramProperty(&bigramTargetCodePoints, bigramProbability,
|
||||||
timestamp, 0 /* level */, 1 /* count */);
|
timestamp, 0 /* level */, 1 /* count */);
|
||||||
dictionary->addBigramWords(word0CodePoints, word0Length, &bigramProperty);
|
const PrevWordsInfo prevWordsInfo(word0CodePoints, word0Length,
|
||||||
|
false /* isBeginningOfSentence */);
|
||||||
|
dictionary->addNgramEntry(&prevWordsInfo, &bigramProperty);
|
||||||
}
|
}
|
||||||
if (dictionary->needsToRunGC(true /* mindsBlockByGC */)) {
|
if (dictionary->needsToRunGC(true /* mindsBlockByGC */)) {
|
||||||
return i + 1;
|
return i + 1;
|
||||||
|
@ -541,7 +546,7 @@ static bool latinime_BinaryDictionary_migrateNative(JNIEnv *env, jclass clazz, j
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!dictionaryStructureWithBufferPolicy->addUnigramWord(wordCodePoints, wordLength,
|
if (!dictionaryStructureWithBufferPolicy->addUnigramEntry(wordCodePoints, wordLength,
|
||||||
wordProperty.getUnigramProperty())) {
|
wordProperty.getUnigramProperty())) {
|
||||||
LogUtils::logToJava(env, "Cannot add unigram to the new dict.");
|
LogUtils::logToJava(env, "Cannot add unigram to the new dict.");
|
||||||
return false;
|
return false;
|
||||||
|
@ -561,8 +566,10 @@ static bool latinime_BinaryDictionary_migrateNative(JNIEnv *env, jclass clazz, j
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
const PrevWordsInfo prevWordsInfo(wordCodePoints, wordLength,
|
||||||
|
false /* isBeginningOfSentence */);
|
||||||
for (const BigramProperty &bigramProperty : *wordProperty.getBigramProperties()) {
|
for (const BigramProperty &bigramProperty : *wordProperty.getBigramProperties()) {
|
||||||
if (!dictionaryStructureWithBufferPolicy->addBigramWords(wordCodePoints, wordLength,
|
if (!dictionaryStructureWithBufferPolicy->addNgramEntry(&prevWordsInfo,
|
||||||
&bigramProperty)) {
|
&bigramProperty)) {
|
||||||
LogUtils::logToJava(env, "Cannot add bigram to the new dict.");
|
LogUtils::logToJava(env, "Cannot add bigram to the new dict.");
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -203,12 +203,12 @@ class DicNode {
|
||||||
return mDicNodeState.mDicNodeStateInput.getInputIndex(0) < inputSize - 1;
|
return mDicNodeState.mDicNodeStateInput.getInputIndex(0) < inputSize - 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Used to get n-gram probability in DicNodeUtils
|
// Used to get n-gram probability in DicNodeUtils.
|
||||||
int getPtNodePos() const {
|
int getPtNodePos() const {
|
||||||
return mDicNodeProperties.getPtNodePos();
|
return mDicNodeProperties.getPtNodePos();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Used to get n-gram probability in DicNodeUtils
|
// Used to get n-gram probability in DicNodeUtils. n is 1-indexed.
|
||||||
int getNthPrevWordTerminalPtNodePos(const int n) const {
|
int getNthPrevWordTerminalPtNodePos(const int n) const {
|
||||||
if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) {
|
if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) {
|
||||||
return NOT_A_DICT_POS;
|
return NOT_A_DICT_POS;
|
||||||
|
|
|
@ -74,28 +74,28 @@ int Dictionary::getProbability(const int *word, int length) const {
|
||||||
return getDictionaryStructurePolicy()->getUnigramProbabilityOfPtNode(pos);
|
return getDictionaryStructurePolicy()->getUnigramProbabilityOfPtNode(pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
int Dictionary::getBigramProbability(const PrevWordsInfo *const prevWordsInfo, const int *word1,
|
int Dictionary::getBigramProbability(const PrevWordsInfo *const prevWordsInfo, const int *word,
|
||||||
int length1) const {
|
int length) const {
|
||||||
TimeKeeper::setCurrentTime();
|
TimeKeeper::setCurrentTime();
|
||||||
return mBigramDictionary.getBigramProbability(prevWordsInfo, word1, length1);
|
return mBigramDictionary.getBigramProbability(prevWordsInfo, word, length);
|
||||||
}
|
}
|
||||||
|
|
||||||
void Dictionary::addUnigramWord(const int *const word, const int length,
|
void Dictionary::addUnigramEntry(const int *const word, const int length,
|
||||||
const UnigramProperty *const unigramProperty) {
|
const UnigramProperty *const unigramProperty) {
|
||||||
TimeKeeper::setCurrentTime();
|
TimeKeeper::setCurrentTime();
|
||||||
mDictionaryStructureWithBufferPolicy->addUnigramWord(word, length, unigramProperty);
|
mDictionaryStructureWithBufferPolicy->addUnigramEntry(word, length, unigramProperty);
|
||||||
}
|
}
|
||||||
|
|
||||||
void Dictionary::addBigramWords(const int *const word0, const int length0,
|
void Dictionary::addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||||
const BigramProperty *const bigramProperty) {
|
const BigramProperty *const bigramProperty) {
|
||||||
TimeKeeper::setCurrentTime();
|
TimeKeeper::setCurrentTime();
|
||||||
mDictionaryStructureWithBufferPolicy->addBigramWords(word0, length0, bigramProperty);
|
mDictionaryStructureWithBufferPolicy->addNgramEntry(prevWordsInfo, bigramProperty);
|
||||||
}
|
}
|
||||||
|
|
||||||
void Dictionary::removeBigramWords(const int *const word0, const int length0,
|
void Dictionary::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||||
const int *const word1, const int length1) {
|
const int *const word, const int length) {
|
||||||
TimeKeeper::setCurrentTime();
|
TimeKeeper::setCurrentTime();
|
||||||
mDictionaryStructureWithBufferPolicy->removeBigramWords(word0, length0, word1, length1);
|
mDictionaryStructureWithBufferPolicy->removeNgramEntry(prevWordsInfo, word, length);
|
||||||
}
|
}
|
||||||
|
|
||||||
void Dictionary::flush(const char *const filePath) {
|
void Dictionary::flush(const char *const filePath) {
|
||||||
|
|
|
@ -73,16 +73,16 @@ class Dictionary {
|
||||||
int getProbability(const int *word, int length) const;
|
int getProbability(const int *word, int length) const;
|
||||||
|
|
||||||
int getBigramProbability(const PrevWordsInfo *const prevWordsInfo,
|
int getBigramProbability(const PrevWordsInfo *const prevWordsInfo,
|
||||||
const int *word1, int length1) const;
|
const int *word, int length) const;
|
||||||
|
|
||||||
void addUnigramWord(const int *const codePoints, const int codePointCount,
|
void addUnigramEntry(const int *const codePoints, const int codePointCount,
|
||||||
const UnigramProperty *const unigramProperty);
|
const UnigramProperty *const unigramProperty);
|
||||||
|
|
||||||
void addBigramWords(const int *const word0, const int length0,
|
void addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||||
const BigramProperty *const bigramProperty);
|
const BigramProperty *const bigramProperty);
|
||||||
|
|
||||||
void removeBigramWords(const int *const word0, const int length0, const int *const word1,
|
void removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word,
|
||||||
const int length1);
|
const int length);
|
||||||
|
|
||||||
void flush(const char *const filePath);
|
void flush(const char *const filePath);
|
||||||
|
|
||||||
|
|
|
@ -23,6 +23,7 @@
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
|
// TODO: Change to NgramProperty.
|
||||||
class BigramProperty {
|
class BigramProperty {
|
||||||
public:
|
public:
|
||||||
BigramProperty(const std::vector<int> *const targetCodePoints,
|
BigramProperty(const std::vector<int> *const targetCodePoints,
|
||||||
|
|
|
@ -29,6 +29,7 @@ class DicNodeVector;
|
||||||
class DictionaryBigramsStructurePolicy;
|
class DictionaryBigramsStructurePolicy;
|
||||||
class DictionaryHeaderStructurePolicy;
|
class DictionaryHeaderStructurePolicy;
|
||||||
class DictionaryShortcutsStructurePolicy;
|
class DictionaryShortcutsStructurePolicy;
|
||||||
|
class PrevWordsInfo;
|
||||||
class UnigramProperty;
|
class UnigramProperty;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -69,16 +70,16 @@ class DictionaryStructureWithBufferPolicy {
|
||||||
virtual const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const = 0;
|
virtual const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const = 0;
|
||||||
|
|
||||||
// Returns whether the update was success or not.
|
// Returns whether the update was success or not.
|
||||||
virtual bool addUnigramWord(const int *const word, const int length,
|
virtual bool addUnigramEntry(const int *const word, const int length,
|
||||||
const UnigramProperty *const unigramProperty) = 0;
|
const UnigramProperty *const unigramProperty) = 0;
|
||||||
|
|
||||||
// Returns whether the update was success or not.
|
// Returns whether the update was success or not.
|
||||||
virtual bool addBigramWords(const int *const word0, const int length0,
|
virtual bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||||
const BigramProperty *const bigramProperty) = 0;
|
const BigramProperty *const bigramProperty) = 0;
|
||||||
|
|
||||||
// Returns whether the update was success or not.
|
// Returns whether the update was success or not.
|
||||||
virtual bool removeBigramWords(const int *const word0, const int length0,
|
virtual bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||||
const int *const word1, const int length1) = 0;
|
const int *const word, const int length) = 0;
|
||||||
|
|
||||||
virtual void flush(const char *const filePath) = 0;
|
virtual void flush(const char *const filePath) = 0;
|
||||||
|
|
||||||
|
|
|
@ -36,7 +36,7 @@ void DicTraverseSession::init(const Dictionary *const dictionary,
|
||||||
->getMultiWordCostMultiplier();
|
->getMultiWordCostMultiplier();
|
||||||
mSuggestOptions = suggestOptions;
|
mSuggestOptions = suggestOptions;
|
||||||
prevWordsInfo->getPrevWordsTerminalPtNodePos(
|
prevWordsInfo->getPrevWordsTerminalPtNodePos(
|
||||||
getDictionaryStructurePolicy(), mPrevWordsPtNodePos);
|
getDictionaryStructurePolicy(), mPrevWordsPtNodePos, true /* tryLowerCaseSearch */);
|
||||||
}
|
}
|
||||||
|
|
||||||
void DicTraverseSession::setupForGetSuggestions(const ProximityInfo *pInfo,
|
void DicTraverseSession::setupForGetSuggestions(const ProximityInfo *pInfo,
|
||||||
|
|
|
@ -41,13 +41,23 @@ class PrevWordsInfo {
|
||||||
mIsBeginningOfSentence[0] = isBeginningOfSentence;
|
mIsBeginningOfSentence[0] = isBeginningOfSentence;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool isValid() const {
|
||||||
|
for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
|
||||||
|
if (mPrevWordCodePointCount[i] > MAX_WORD_LENGTH) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
void getPrevWordsTerminalPtNodePos(
|
void getPrevWordsTerminalPtNodePos(
|
||||||
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
|
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
|
||||||
int *const outPrevWordsTerminalPtNodePos) const {
|
int *const outPrevWordsTerminalPtNodePos,
|
||||||
|
const bool tryLowerCaseSearch) const {
|
||||||
for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
|
for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
|
||||||
outPrevWordsTerminalPtNodePos[i] = getTerminalPtNodePosOfWord(dictStructurePolicy,
|
outPrevWordsTerminalPtNodePos[i] = getTerminalPtNodePosOfWord(dictStructurePolicy,
|
||||||
mPrevWordCodePoints[i], mPrevWordCodePointCount[i],
|
mPrevWordCodePoints[i], mPrevWordCodePointCount[i],
|
||||||
mIsBeginningOfSentence[i]);
|
mIsBeginningOfSentence[i], tryLowerCaseSearch);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -66,19 +76,37 @@ class PrevWordsInfo {
|
||||||
dictStructurePolicy->getBigramsStructurePolicy(), pos);
|
dictStructurePolicy->getBigramsStructurePolicy(), pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// n is 1-indexed.
|
||||||
|
const int *getNthPrevWordCodePoints(const int n) const {
|
||||||
|
if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
return mPrevWordCodePoints[n - 1];
|
||||||
|
}
|
||||||
|
|
||||||
|
// n is 1-indexed.
|
||||||
|
int getNthPrevWordCodePointCount(const int n) const {
|
||||||
|
if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return mPrevWordCodePointCount[n - 1];
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_COPY_AND_ASSIGN(PrevWordsInfo);
|
DISALLOW_COPY_AND_ASSIGN(PrevWordsInfo);
|
||||||
|
|
||||||
static int getTerminalPtNodePosOfWord(
|
static int getTerminalPtNodePosOfWord(
|
||||||
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
|
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
|
||||||
const int *const wordCodePoints, const int wordCodePointCount,
|
const int *const wordCodePoints, const int wordCodePointCount,
|
||||||
const bool isBeginningOfSentence) {
|
const bool isBeginningOfSentence, const bool tryLowerCaseSearch) {
|
||||||
if (!dictStructurePolicy || !wordCodePoints) {
|
if (!dictStructurePolicy || !wordCodePoints) {
|
||||||
return NOT_A_DICT_POS;
|
return NOT_A_DICT_POS;
|
||||||
}
|
}
|
||||||
const int wordPtNodePos = dictStructurePolicy->getTerminalPtNodePositionOfWord(
|
const int wordPtNodePos = dictStructurePolicy->getTerminalPtNodePositionOfWord(
|
||||||
wordCodePoints, wordCodePointCount, false /* forceLowerCaseSearch */);
|
wordCodePoints, wordCodePointCount, false /* forceLowerCaseSearch */);
|
||||||
if (wordPtNodePos != NOT_A_DICT_POS) {
|
if (wordPtNodePos != NOT_A_DICT_POS || !tryLowerCaseSearch) {
|
||||||
|
// Return the position when the word was found or when we don't try lower case
|
||||||
|
// search.
|
||||||
return wordPtNodePos;
|
return wordPtNodePos;
|
||||||
}
|
}
|
||||||
// Check bigrams for lower-cased previous word if original was not found. Useful for
|
// Check bigrams for lower-cased previous word if original was not found. Useful for
|
||||||
|
|
|
@ -31,6 +31,7 @@
|
||||||
#include "suggest/core/dictionary/property/bigram_property.h"
|
#include "suggest/core/dictionary/property/bigram_property.h"
|
||||||
#include "suggest/core/dictionary/property/unigram_property.h"
|
#include "suggest/core/dictionary/property/unigram_property.h"
|
||||||
#include "suggest/core/dictionary/property/word_property.h"
|
#include "suggest/core/dictionary/property/word_property.h"
|
||||||
|
#include "suggest/core/session/prev_words_info.h"
|
||||||
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
|
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
|
||||||
#include "suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_node_reader.h"
|
#include "suggest/policyimpl/dictionary/structure/backward/v401/ver4_patricia_trie_node_reader.h"
|
||||||
#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
|
#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
|
||||||
|
@ -163,10 +164,10 @@ int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) cons
|
||||||
ptNodeParams.getTerminalId());
|
ptNodeParams.getTerminalId());
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Ver4PatriciaTriePolicy::addUnigramWord(const int *const word, const int length,
|
bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int length,
|
||||||
const UnigramProperty *const unigramProperty) {
|
const UnigramProperty *const unigramProperty) {
|
||||||
if (!mBuffers->isUpdatable()) {
|
if (!mBuffers->isUpdatable()) {
|
||||||
AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary.");
|
AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
|
if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
|
||||||
|
@ -218,10 +219,12 @@ bool Ver4PatriciaTriePolicy::addUnigramWord(const int *const word, const int len
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Ver4PatriciaTriePolicy::addBigramWords(const int *const word0, const int length0,
|
bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||||
const BigramProperty *const bigramProperty) {
|
const BigramProperty *const bigramProperty) {
|
||||||
|
const int length0 = prevWordsInfo->getNthPrevWordCodePointCount(1);
|
||||||
|
const int *word0 = prevWordsInfo->getNthPrevWordCodePoints(1);
|
||||||
if (!mBuffers->isUpdatable()) {
|
if (!mBuffers->isUpdatable()) {
|
||||||
AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary.");
|
AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
|
if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
|
||||||
|
@ -257,8 +260,10 @@ bool Ver4PatriciaTriePolicy::addBigramWords(const int *const word0, const int le
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Ver4PatriciaTriePolicy::removeBigramWords(const int *const word0, const int length0,
|
bool Ver4PatriciaTriePolicy::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||||
const int *const word1, const int length1) {
|
const int *const word1, const int length1) {
|
||||||
|
const int length0 = prevWordsInfo->getNthPrevWordCodePointCount(1);
|
||||||
|
const int *word0 = prevWordsInfo->getNthPrevWordCodePoints(1);
|
||||||
if (!mBuffers->isUpdatable()) {
|
if (!mBuffers->isUpdatable()) {
|
||||||
AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary.");
|
AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary.");
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -108,14 +108,14 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
return &mShortcutPolicy;
|
return &mShortcutPolicy;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool addUnigramWord(const int *const word, const int length,
|
bool addUnigramEntry(const int *const word, const int length,
|
||||||
const UnigramProperty *const unigramProperty);
|
const UnigramProperty *const unigramProperty);
|
||||||
|
|
||||||
bool addBigramWords(const int *const word0, const int length0,
|
bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||||
const BigramProperty *const bigramProperty);
|
const BigramProperty *const bigramProperty);
|
||||||
|
|
||||||
bool removeBigramWords(const int *const word0, const int length0, const int *const word1,
|
bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word,
|
||||||
const int length1);
|
const int length);
|
||||||
|
|
||||||
void flush(const char *const filePath);
|
void flush(const char *const filePath);
|
||||||
|
|
||||||
|
|
|
@ -145,7 +145,8 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str
|
||||||
char dictPath[dictDirPathBufSize];
|
char dictPath[dictDirPathBufSize];
|
||||||
if (!FileUtils::getFilePathWithoutSuffix(headerFilePath,
|
if (!FileUtils::getFilePathWithoutSuffix(headerFilePath,
|
||||||
DictConstants::HEADER_FILE_EXTENSION, dictDirPathBufSize, dictPath)) {
|
DictConstants::HEADER_FILE_EXTENSION, dictDirPathBufSize, dictPath)) {
|
||||||
AKLOGE("Dictionary file name is not valid as a ver4 dictionary. path: %s", path);
|
AKLOGE("Dictionary file name is not valid as a ver4 dictionary. header path: %s",
|
||||||
|
headerFilePath);
|
||||||
ASSERT(false);
|
ASSERT(false);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
@ -153,7 +154,7 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str
|
||||||
DictBuffers::openVer4DictBuffers(dictPath, std::move(mmappedBuffer), formatVersion);
|
DictBuffers::openVer4DictBuffers(dictPath, std::move(mmappedBuffer), formatVersion);
|
||||||
if (!dictBuffers || !dictBuffers->isValid()) {
|
if (!dictBuffers || !dictBuffers->isValid()) {
|
||||||
AKLOGE("DICT: The dictionary doesn't satisfy ver4 format requirements. path: %s",
|
AKLOGE("DICT: The dictionary doesn't satisfy ver4 format requirements. path: %s",
|
||||||
path);
|
dictPath);
|
||||||
ASSERT(false);
|
ASSERT(false);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
|
@ -81,24 +81,24 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
return &mShortcutListPolicy;
|
return &mShortcutListPolicy;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool addUnigramWord(const int *const word, const int length,
|
bool addUnigramEntry(const int *const word, const int length,
|
||||||
const UnigramProperty *const unigramProperty) {
|
const UnigramProperty *const unigramProperty) {
|
||||||
// This method should not be called for non-updatable dictionary.
|
// This method should not be called for non-updatable dictionary.
|
||||||
AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary.");
|
AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool addBigramWords(const int *const word0, const int length0,
|
bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||||
const BigramProperty *const bigramProperty) {
|
const BigramProperty *const bigramProperty) {
|
||||||
// This method should not be called for non-updatable dictionary.
|
// This method should not be called for non-updatable dictionary.
|
||||||
AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary.");
|
AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool removeBigramWords(const int *const word0, const int length0, const int *const word1,
|
bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word,
|
||||||
const int length1) {
|
const int length) {
|
||||||
// This method should not be called for non-updatable dictionary.
|
// This method should not be called for non-updatable dictionary.
|
||||||
AKLOGI("Warning: removeBigramWords() is called for non-updatable dictionary.");
|
AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -23,6 +23,7 @@
|
||||||
#include "suggest/core/dictionary/property/bigram_property.h"
|
#include "suggest/core/dictionary/property/bigram_property.h"
|
||||||
#include "suggest/core/dictionary/property/unigram_property.h"
|
#include "suggest/core/dictionary/property/unigram_property.h"
|
||||||
#include "suggest/core/dictionary/property/word_property.h"
|
#include "suggest/core/dictionary/property/word_property.h"
|
||||||
|
#include "suggest/core/session/prev_words_info.h"
|
||||||
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
|
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
|
||||||
#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
|
#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
|
||||||
#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
|
#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
|
||||||
|
@ -155,10 +156,10 @@ int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) cons
|
||||||
ptNodeParams.getTerminalId());
|
ptNodeParams.getTerminalId());
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Ver4PatriciaTriePolicy::addUnigramWord(const int *const word, const int length,
|
bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int length,
|
||||||
const UnigramProperty *const unigramProperty) {
|
const UnigramProperty *const unigramProperty) {
|
||||||
if (!mBuffers->isUpdatable()) {
|
if (!mBuffers->isUpdatable()) {
|
||||||
AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary.");
|
AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
|
if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
|
||||||
|
@ -210,10 +211,10 @@ bool Ver4PatriciaTriePolicy::addUnigramWord(const int *const word, const int len
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Ver4PatriciaTriePolicy::addBigramWords(const int *const word0, const int length0,
|
bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||||
const BigramProperty *const bigramProperty) {
|
const BigramProperty *const bigramProperty) {
|
||||||
if (!mBuffers->isUpdatable()) {
|
if (!mBuffers->isUpdatable()) {
|
||||||
AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary.");
|
AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
|
if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
|
||||||
|
@ -221,15 +222,20 @@ bool Ver4PatriciaTriePolicy::addBigramWords(const int *const word0, const int le
|
||||||
mDictBuffer->getTailPosition());
|
mDictBuffer->getTailPosition());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (length0 > MAX_WORD_LENGTH
|
if (!prevWordsInfo->isValid()) {
|
||||||
|| bigramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
|
AKLOGE("prev words info is not valid for adding n-gram entry to the dictionary.");
|
||||||
AKLOGE("Either src word or target word is too long to insert the bigram to the dictionary. "
|
|
||||||
"length0: %d, length1: %d", length0, bigramProperty->getTargetCodePoints()->size());
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
const int word0Pos = getTerminalPtNodePositionOfWord(word0, length0,
|
if (bigramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
|
||||||
false /* forceLowerCaseSearch */);
|
AKLOGE("The word is too long to insert the ngram to the dictionary. "
|
||||||
if (word0Pos == NOT_A_DICT_POS) {
|
"length: %d", bigramProperty->getTargetCodePoints()->size());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
|
||||||
|
prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos,
|
||||||
|
false /* tryLowerCaseSearch */);
|
||||||
|
// TODO: Support N-gram.
|
||||||
|
if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
const int word1Pos = getTerminalPtNodePositionOfWord(
|
const int word1Pos = getTerminalPtNodePositionOfWord(
|
||||||
|
@ -239,7 +245,8 @@ bool Ver4PatriciaTriePolicy::addBigramWords(const int *const word0, const int le
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
bool addedNewBigram = false;
|
bool addedNewBigram = false;
|
||||||
if (mUpdatingHelper.addBigramWords(word0Pos, word1Pos, bigramProperty, &addedNewBigram)) {
|
if (mUpdatingHelper.addBigramWords(prevWordsPtNodePos[0], word1Pos, bigramProperty,
|
||||||
|
&addedNewBigram)) {
|
||||||
if (addedNewBigram) {
|
if (addedNewBigram) {
|
||||||
mBigramCount++;
|
mBigramCount++;
|
||||||
}
|
}
|
||||||
|
@ -249,10 +256,10 @@ bool Ver4PatriciaTriePolicy::addBigramWords(const int *const word0, const int le
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Ver4PatriciaTriePolicy::removeBigramWords(const int *const word0, const int length0,
|
bool Ver4PatriciaTriePolicy::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||||
const int *const word1, const int length1) {
|
const int *const word, const int length) {
|
||||||
if (!mBuffers->isUpdatable()) {
|
if (!mBuffers->isUpdatable()) {
|
||||||
AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary.");
|
AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
|
if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
|
||||||
|
@ -260,22 +267,26 @@ bool Ver4PatriciaTriePolicy::removeBigramWords(const int *const word0, const int
|
||||||
mDictBuffer->getTailPosition());
|
mDictBuffer->getTailPosition());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (length0 > MAX_WORD_LENGTH || length1 > MAX_WORD_LENGTH) {
|
if (!prevWordsInfo->isValid()) {
|
||||||
AKLOGE("Either src word or target word is too long to remove the bigram to from the "
|
AKLOGE("prev words info is not valid for removing n-gram entry form the dictionary.");
|
||||||
"dictionary. length0: %d, length1: %d", length0, length1);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
const int word0Pos = getTerminalPtNodePositionOfWord(word0, length0,
|
if (length > MAX_WORD_LENGTH) {
|
||||||
|
AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %d", length);
|
||||||
|
}
|
||||||
|
int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
|
||||||
|
prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos,
|
||||||
|
false /* tryLowerCaseSearch */);
|
||||||
|
// TODO: Support N-gram.
|
||||||
|
if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
const int wordPos = getTerminalPtNodePositionOfWord(word, length,
|
||||||
false /* forceLowerCaseSearch */);
|
false /* forceLowerCaseSearch */);
|
||||||
if (word0Pos == NOT_A_DICT_POS) {
|
if (wordPos == NOT_A_DICT_POS) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
const int word1Pos = getTerminalPtNodePositionOfWord(word1, length1,
|
if (mUpdatingHelper.removeBigramWords(prevWordsPtNodePos[0], wordPos)) {
|
||||||
false /* forceLowerCaseSearch */);
|
|
||||||
if (word1Pos == NOT_A_DICT_POS) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (mUpdatingHelper.removeBigramWords(word0Pos, word1Pos)) {
|
|
||||||
mBigramCount--;
|
mBigramCount--;
|
||||||
return true;
|
return true;
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -90,13 +90,13 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
return &mShortcutPolicy;
|
return &mShortcutPolicy;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool addUnigramWord(const int *const word, const int length,
|
bool addUnigramEntry(const int *const word, const int length,
|
||||||
const UnigramProperty *const unigramProperty);
|
const UnigramProperty *const unigramProperty);
|
||||||
|
|
||||||
bool addBigramWords(const int *const word0, const int length0,
|
bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
|
||||||
const BigramProperty *const bigramProperty);
|
const BigramProperty *const bigramProperty);
|
||||||
|
|
||||||
bool removeBigramWords(const int *const word0, const int length0, const int *const word1,
|
bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word1,
|
||||||
const int length1);
|
const int length1);
|
||||||
|
|
||||||
void flush(const char *const filePath);
|
void flush(const char *const filePath);
|
||||||
|
|
Loading…
Reference in a new issue