Merge changes I210acb81,Ie9508788
* changes: Make NgramProperty have NgramContext. Create .cpp file for NgramContext.
This commit is contained in:
commit
dfc82fa366
15 changed files with 188 additions and 150 deletions
|
@ -40,6 +40,7 @@ LATIN_IME_CORE_SRC_FILES := \
|
||||||
proximity_info_state_utils.cpp) \
|
proximity_info_state_utils.cpp) \
|
||||||
suggest/core/policy/weighting.cpp \
|
suggest/core/policy/weighting.cpp \
|
||||||
suggest/core/session/dic_traverse_session.cpp \
|
suggest/core/session/dic_traverse_session.cpp \
|
||||||
|
suggest/core/session/ngram_context.cpp \
|
||||||
$(addprefix suggest/core/result/, \
|
$(addprefix suggest/core/result/, \
|
||||||
suggestion_results.cpp \
|
suggestion_results.cpp \
|
||||||
suggestions_output_utils.cpp) \
|
suggestions_output_utils.cpp) \
|
||||||
|
|
|
@ -409,9 +409,10 @@ static bool latinime_BinaryDictionary_addNgramEntry(JNIEnv *env, jclass clazz, j
|
||||||
int wordCodePoints[wordLength];
|
int wordCodePoints[wordLength];
|
||||||
env->GetIntArrayRegion(word, 0, wordLength, wordCodePoints);
|
env->GetIntArrayRegion(word, 0, wordLength, wordCodePoints);
|
||||||
// Use 1 for count to indicate the ngram has inputted.
|
// Use 1 for count to indicate the ngram has inputted.
|
||||||
const NgramProperty ngramProperty(CodePointArrayView(wordCodePoints, wordLength).toVector(),
|
const NgramProperty ngramProperty(ngramContext,
|
||||||
|
CodePointArrayView(wordCodePoints, wordLength).toVector(),
|
||||||
probability, HistoricalInfo(timestamp, 0 /* level */, 1 /* count */));
|
probability, HistoricalInfo(timestamp, 0 /* level */, 1 /* count */));
|
||||||
return dictionary->addNgramEntry(&ngramContext, &ngramProperty);
|
return dictionary->addNgramEntry(&ngramProperty);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool latinime_BinaryDictionary_removeNgramEntry(JNIEnv *env, jclass clazz, jlong dict,
|
static bool latinime_BinaryDictionary_removeNgramEntry(JNIEnv *env, jclass clazz, jlong dict,
|
||||||
|
@ -527,12 +528,12 @@ static int latinime_BinaryDictionary_addMultipleDictionaryEntries(JNIEnv *env, j
|
||||||
if (word0) {
|
if (word0) {
|
||||||
jint bigramProbability = env->GetIntField(languageModelParam, bigramProbabilityFieldId);
|
jint bigramProbability = env->GetIntField(languageModelParam, bigramProbabilityFieldId);
|
||||||
// Use 1 for count to indicate the bigram has inputted.
|
// Use 1 for count to indicate the bigram has inputted.
|
||||||
const NgramProperty ngramProperty(
|
|
||||||
CodePointArrayView(word1CodePoints, word1Length).toVector(),
|
|
||||||
bigramProbability, HistoricalInfo(timestamp, 0 /* level */, 1 /* count */));
|
|
||||||
const NgramContext ngramContext(word0CodePoints, word0Length,
|
const NgramContext ngramContext(word0CodePoints, word0Length,
|
||||||
false /* isBeginningOfSentence */);
|
false /* isBeginningOfSentence */);
|
||||||
dictionary->addNgramEntry(&ngramContext, &ngramProperty);
|
const NgramProperty ngramProperty(ngramContext,
|
||||||
|
CodePointArrayView(word1CodePoints, word1Length).toVector(),
|
||||||
|
bigramProbability, HistoricalInfo(timestamp, 0 /* level */, 1 /* count */));
|
||||||
|
dictionary->addNgramEntry(&ngramProperty);
|
||||||
}
|
}
|
||||||
if (dictionary->needsToRunGC(true /* mindsBlockByGC */)) {
|
if (dictionary->needsToRunGC(true /* mindsBlockByGC */)) {
|
||||||
return i + 1;
|
return i + 1;
|
||||||
|
@ -642,11 +643,8 @@ static bool latinime_BinaryDictionary_migrateNative(JNIEnv *env, jclass clazz, j
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
const NgramContext ngramContext(wordCodePoints, wordCodePointCount,
|
|
||||||
wordProperty.getUnigramProperty()->representsBeginningOfSentence());
|
|
||||||
for (const NgramProperty &ngramProperty : *wordProperty.getNgramProperties()) {
|
for (const NgramProperty &ngramProperty : *wordProperty.getNgramProperties()) {
|
||||||
if (!dictionaryStructureWithBufferPolicy->addNgramEntry(&ngramContext,
|
if (!dictionaryStructureWithBufferPolicy->addNgramEntry(&ngramProperty)) {
|
||||||
&ngramProperty)) {
|
|
||||||
LogUtils::logToJava(env, "Cannot add ngram to the new dict.");
|
LogUtils::logToJava(env, "Cannot add ngram to the new dict.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
|
@ -140,10 +140,9 @@ bool Dictionary::removeUnigramEntry(const CodePointArrayView codePoints) {
|
||||||
return mDictionaryStructureWithBufferPolicy->removeUnigramEntry(codePoints);
|
return mDictionaryStructureWithBufferPolicy->removeUnigramEntry(codePoints);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Dictionary::addNgramEntry(const NgramContext *const ngramContext,
|
bool Dictionary::addNgramEntry(const NgramProperty *const ngramProperty) {
|
||||||
const NgramProperty *const ngramProperty) {
|
|
||||||
TimeKeeper::setCurrentTime();
|
TimeKeeper::setCurrentTime();
|
||||||
return mDictionaryStructureWithBufferPolicy->addNgramEntry(ngramContext, ngramProperty);
|
return mDictionaryStructureWithBufferPolicy->addNgramEntry(ngramProperty);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Dictionary::removeNgramEntry(const NgramContext *const ngramContext,
|
bool Dictionary::removeNgramEntry(const NgramContext *const ngramContext,
|
||||||
|
|
|
@ -85,8 +85,7 @@ class Dictionary {
|
||||||
|
|
||||||
bool removeUnigramEntry(const CodePointArrayView codePoints);
|
bool removeUnigramEntry(const CodePointArrayView codePoints);
|
||||||
|
|
||||||
bool addNgramEntry(const NgramContext *const ngramContext,
|
bool addNgramEntry(const NgramProperty *const ngramProperty);
|
||||||
const NgramProperty *const ngramProperty);
|
|
||||||
|
|
||||||
bool removeNgramEntry(const NgramContext *const ngramContext,
|
bool removeNgramEntry(const NgramContext *const ngramContext,
|
||||||
const CodePointArrayView codePoints);
|
const CodePointArrayView codePoints);
|
||||||
|
|
|
@ -21,15 +21,20 @@
|
||||||
|
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
#include "suggest/core/dictionary/property/historical_info.h"
|
#include "suggest/core/dictionary/property/historical_info.h"
|
||||||
|
#include "suggest/core/session/ngram_context.h"
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
class NgramProperty {
|
class NgramProperty {
|
||||||
public:
|
public:
|
||||||
NgramProperty(const std::vector<int> &&targetCodePoints, const int probability,
|
NgramProperty(const NgramContext &ngramContext, const std::vector<int> &&targetCodePoints,
|
||||||
const HistoricalInfo historicalInfo)
|
const int probability, const HistoricalInfo historicalInfo)
|
||||||
: mTargetCodePoints(std::move(targetCodePoints)), mProbability(probability),
|
: mNgramContext(ngramContext), mTargetCodePoints(std::move(targetCodePoints)),
|
||||||
mHistoricalInfo(historicalInfo) {}
|
mProbability(probability), mHistoricalInfo(historicalInfo) {}
|
||||||
|
|
||||||
|
const NgramContext *getNgramContext() const {
|
||||||
|
return &mNgramContext;
|
||||||
|
}
|
||||||
|
|
||||||
const std::vector<int> *getTargetCodePoints() const {
|
const std::vector<int> *getTargetCodePoints() const {
|
||||||
return &mTargetCodePoints;
|
return &mTargetCodePoints;
|
||||||
|
@ -48,6 +53,7 @@ class NgramProperty {
|
||||||
DISALLOW_DEFAULT_CONSTRUCTOR(NgramProperty);
|
DISALLOW_DEFAULT_CONSTRUCTOR(NgramProperty);
|
||||||
DISALLOW_ASSIGNMENT_OPERATOR(NgramProperty);
|
DISALLOW_ASSIGNMENT_OPERATOR(NgramProperty);
|
||||||
|
|
||||||
|
const NgramContext mNgramContext;
|
||||||
const std::vector<int> mTargetCodePoints;
|
const std::vector<int> mTargetCodePoints;
|
||||||
const int mProbability;
|
const int mProbability;
|
||||||
const HistoricalInfo mHistoricalInfo;
|
const HistoricalInfo mHistoricalInfo;
|
||||||
|
|
|
@ -34,9 +34,9 @@ class WordProperty {
|
||||||
: mCodePoints(), mUnigramProperty(), mNgrams() {}
|
: mCodePoints(), mUnigramProperty(), mNgrams() {}
|
||||||
|
|
||||||
WordProperty(const std::vector<int> &&codePoints, const UnigramProperty *const unigramProperty,
|
WordProperty(const std::vector<int> &&codePoints, const UnigramProperty *const unigramProperty,
|
||||||
const std::vector<NgramProperty> *const bigrams)
|
const std::vector<NgramProperty> *const ngrams)
|
||||||
: mCodePoints(std::move(codePoints)), mUnigramProperty(*unigramProperty),
|
: mCodePoints(std::move(codePoints)), mUnigramProperty(*unigramProperty),
|
||||||
mNgrams(*bigrams) {}
|
mNgrams(*ngrams) {}
|
||||||
|
|
||||||
void outputProperties(JNIEnv *const env, jintArray outCodePoints, jbooleanArray outFlags,
|
void outputProperties(JNIEnv *const env, jintArray outCodePoints, jbooleanArray outFlags,
|
||||||
jintArray outProbabilityInfo, jobject outBigramTargets, jobject outBigramProbabilities,
|
jintArray outProbabilityInfo, jobject outBigramTargets, jobject outBigramProbabilities,
|
||||||
|
|
|
@ -40,7 +40,6 @@ class UnigramProperty;
|
||||||
* This class abstracts the structure of dictionaries.
|
* This class abstracts the structure of dictionaries.
|
||||||
* Implement this policy to support additional dictionaries.
|
* Implement this policy to support additional dictionaries.
|
||||||
*/
|
*/
|
||||||
// TODO: Use word id instead of terminal PtNode position.
|
|
||||||
class DictionaryStructureWithBufferPolicy {
|
class DictionaryStructureWithBufferPolicy {
|
||||||
public:
|
public:
|
||||||
typedef std::unique_ptr<DictionaryStructureWithBufferPolicy> StructurePolicyPtr;
|
typedef std::unique_ptr<DictionaryStructureWithBufferPolicy> StructurePolicyPtr;
|
||||||
|
@ -81,8 +80,7 @@ class DictionaryStructureWithBufferPolicy {
|
||||||
virtual bool removeUnigramEntry(const CodePointArrayView wordCodePoints) = 0;
|
virtual bool removeUnigramEntry(const CodePointArrayView wordCodePoints) = 0;
|
||||||
|
|
||||||
// Returns whether the update was success or not.
|
// Returns whether the update was success or not.
|
||||||
virtual bool addNgramEntry(const NgramContext *const ngramContext,
|
virtual bool addNgramEntry(const NgramProperty *const ngramProperty) = 0;
|
||||||
const NgramProperty *const ngramProperty) = 0;
|
|
||||||
|
|
||||||
// Returns whether the update was success or not.
|
// Returns whether the update was success or not.
|
||||||
virtual bool removeNgramEntry(const NgramContext *const ngramContext,
|
virtual bool removeNgramEntry(const NgramContext *const ngramContext,
|
||||||
|
@ -106,7 +104,6 @@ class DictionaryStructureWithBufferPolicy {
|
||||||
virtual void getProperty(const char *const query, const int queryLength, char *const outResult,
|
virtual void getProperty(const char *const query, const int queryLength, char *const outResult,
|
||||||
const int maxResultLength) = 0;
|
const int maxResultLength) = 0;
|
||||||
|
|
||||||
// Used for testing.
|
|
||||||
virtual const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const = 0;
|
virtual const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const = 0;
|
||||||
|
|
||||||
// Method to iterate all words in the dictionary.
|
// Method to iterate all words in the dictionary.
|
||||||
|
|
123
native/jni/src/suggest/core/session/ngram_context.cpp
Normal file
123
native/jni/src/suggest/core/session/ngram_context.cpp
Normal file
|
@ -0,0 +1,123 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2014 The Android Open Source Project
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "suggest/core/session/ngram_context.h"
|
||||||
|
|
||||||
|
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
|
||||||
|
#include "utils/char_utils.h"
|
||||||
|
|
||||||
|
namespace latinime {
|
||||||
|
|
||||||
|
// Constructs a context that carries no previous-word information.
NgramContext::NgramContext() : mPrevWordCount(0) {
    // Reset the per-word bookkeeping arrays. isValid() reads slot 0 unconditionally, so leaving
    // them untouched would make an empty context report an indeterminate validity.
    clear();
}
|
||||||
|
|
||||||
|
// Copy constructor. Copies the first mPrevWordCount previous words (code points, code point
// count and beginning-of-sentence flag) from ngramContext.
NgramContext::NgramContext(const NgramContext &ngramContext)
        : mPrevWordCount(ngramContext.mPrevWordCount) {
    // Start from a cleared state: slots that are not copied below (all of them when the source
    // holds no previous word) would otherwise stay indeterminate, and isValid() reads slot 0
    // regardless of mPrevWordCount.
    clear();
    for (size_t i = 0; i < mPrevWordCount; ++i) {
        mPrevWordCodePointCount[i] = ngramContext.mPrevWordCodePointCount[i];
        // Only the used prefix of each word buffer is copied.
        memmove(mPrevWordCodePoints[i], ngramContext.mPrevWordCodePoints[i],
                sizeof(mPrevWordCodePoints[i][0]) * mPrevWordCodePointCount[i]);
        mIsBeginningOfSentence[i] = ngramContext.mIsBeginningOfSentence[i];
    }
}
|
||||||
|
|
||||||
|
// Builds a context from several previous words. At most NELEMS(mPrevWordCodePoints) words are
// kept. A word whose code point count is negative or larger than MAX_WORD_LENGTH is not copied;
// its slot stays in the cleared state (count 0, not beginning-of-sentence).
NgramContext::NgramContext(const int prevWordCodePoints[][MAX_WORD_LENGTH],
        const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence,
        const size_t prevWordCount)
        : mPrevWordCount(std::min(NELEMS(mPrevWordCodePoints), prevWordCount)) {
    clear();
    for (size_t wordIndex = 0; wordIndex < mPrevWordCount; ++wordIndex) {
        const int codePointCount = prevWordCodePointCount[wordIndex];
        const bool hasUsableLength = codePointCount >= 0 && codePointCount <= MAX_WORD_LENGTH;
        if (hasUsableLength) {
            memmove(mPrevWordCodePoints[wordIndex], prevWordCodePoints[wordIndex],
                    sizeof(mPrevWordCodePoints[wordIndex][0]) * codePointCount);
            mPrevWordCodePointCount[wordIndex] = codePointCount;
            mIsBeginningOfSentence[wordIndex] = isBeginningOfSentence[wordIndex];
        }
    }
}
|
||||||
|
|
||||||
|
// Builds a context from a single previous word. When the word is missing or its length is out
// of range, the context keeps one cleared slot (count 0, not beginning-of-sentence).
NgramContext::NgramContext(const int *const prevWordCodePoints, const int prevWordCodePointCount,
        const bool isBeginningOfSentence) : mPrevWordCount(1) {
    clear();
    // Reject negative lengths as well as oversized ones, matching the multi-word constructor:
    // a negative count would be converted to a huge size_t in the memmove() below.
    if (!prevWordCodePoints || prevWordCodePointCount < 0
            || prevWordCodePointCount > MAX_WORD_LENGTH) {
        return;
    }
    memmove(mPrevWordCodePoints[0], prevWordCodePoints,
            sizeof(mPrevWordCodePoints[0][0]) * prevWordCodePointCount);
    mPrevWordCodePointCount[0] = prevWordCodePointCount;
    mIsBeginningOfSentence[0] = isBeginningOfSentence;
}
|
||||||
|
|
||||||
|
bool NgramContext::isValid() const {
|
||||||
|
if (mPrevWordCodePointCount[0] > 0) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (mIsBeginningOfSentence[0]) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns a view over the code points of the n-th previous word. n is 1-indexed; an empty view
// is returned when n is 0 or greater than the number of previous words.
const CodePointArrayView NgramContext::getNthPrevWordCodePoints(const size_t n) const {
    const bool isInRange = n >= 1 && n <= mPrevWordCount;
    if (isInRange) {
        const size_t slot = n - 1;
        return CodePointArrayView(mPrevWordCodePoints[slot], mPrevWordCodePointCount[slot]);
    }
    return CodePointArrayView();
}
|
||||||
|
|
||||||
|
bool NgramContext::isNthPrevWordBeginningOfSentence(const size_t n) const {
|
||||||
|
if (n <= 0 || n > mPrevWordCount) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return mIsBeginningOfSentence[n - 1];
|
||||||
|
}
|
||||||
|
|
||||||
|
/* static */ int NgramContext::getWordId(
|
||||||
|
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
|
||||||
|
const int *const wordCodePoints, const int wordCodePointCount,
|
||||||
|
const bool isBeginningOfSentence, const bool tryLowerCaseSearch) {
|
||||||
|
if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) {
|
||||||
|
return NOT_A_WORD_ID;
|
||||||
|
}
|
||||||
|
int codePoints[MAX_WORD_LENGTH];
|
||||||
|
int codePointCount = wordCodePointCount;
|
||||||
|
memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount);
|
||||||
|
if (isBeginningOfSentence) {
|
||||||
|
codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints, codePointCount,
|
||||||
|
MAX_WORD_LENGTH);
|
||||||
|
if (codePointCount <= 0) {
|
||||||
|
return NOT_A_WORD_ID;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const CodePointArrayView codePointArrayView(codePoints, codePointCount);
|
||||||
|
const int wordId = dictStructurePolicy->getWordId(codePointArrayView,
|
||||||
|
false /* forceLowerCaseSearch */);
|
||||||
|
if (wordId != NOT_A_WORD_ID || !tryLowerCaseSearch) {
|
||||||
|
// Return the id when when the word was found or doesn't try lower case search.
|
||||||
|
return wordId;
|
||||||
|
}
|
||||||
|
// Check bigrams for lower-cased previous word if original was not found. Useful for
|
||||||
|
// auto-capitalized words like "The [current_word]".
|
||||||
|
return dictStructurePolicy->getWordId(codePointArrayView, true /* forceLowerCaseSearch */);
|
||||||
|
}
|
||||||
|
|
||||||
|
void NgramContext::clear() {
|
||||||
|
for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
|
||||||
|
mPrevWordCodePointCount[i] = 0;
|
||||||
|
mIsBeginningOfSentence[i] = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} // namespace latinime
|
|
@ -20,145 +20,54 @@
|
||||||
#include <array>
|
#include <array>
|
||||||
|
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
|
|
||||||
#include "utils/char_utils.h"
|
|
||||||
#include "utils/int_array_view.h"
|
#include "utils/int_array_view.h"
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
// Rename to NgramContext.
|
class DictionaryStructureWithBufferPolicy;
|
||||||
|
|
||||||
class NgramContext {
|
class NgramContext {
|
||||||
public:
|
public:
|
||||||
// No prev word information.
|
// No prev word information.
|
||||||
NgramContext() : mPrevWordCount(0) {
|
NgramContext();
|
||||||
clear();
|
// Copy constructor to use this class with std::vector and use this class as a return value.
|
||||||
}
|
NgramContext(const NgramContext &ngramContext);
|
||||||
|
|
||||||
NgramContext(const NgramContext &ngramContext)
|
|
||||||
: mPrevWordCount(ngramContext.mPrevWordCount) {
|
|
||||||
for (size_t i = 0; i < mPrevWordCount; ++i) {
|
|
||||||
mPrevWordCodePointCount[i] = ngramContext.mPrevWordCodePointCount[i];
|
|
||||||
memmove(mPrevWordCodePoints[i], ngramContext.mPrevWordCodePoints[i],
|
|
||||||
sizeof(mPrevWordCodePoints[i][0]) * mPrevWordCodePointCount[i]);
|
|
||||||
mIsBeginningOfSentence[i] = ngramContext.mIsBeginningOfSentence[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Construct from previous words.
|
// Construct from previous words.
|
||||||
NgramContext(const int prevWordCodePoints[][MAX_WORD_LENGTH],
|
NgramContext(const int prevWordCodePoints[][MAX_WORD_LENGTH],
|
||||||
const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence,
|
const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence,
|
||||||
const size_t prevWordCount)
|
const size_t prevWordCount);
|
||||||
: mPrevWordCount(std::min(NELEMS(mPrevWordCodePoints), prevWordCount)) {
|
|
||||||
clear();
|
|
||||||
for (size_t i = 0; i < mPrevWordCount; ++i) {
|
|
||||||
if (prevWordCodePointCount[i] < 0 || prevWordCodePointCount[i] > MAX_WORD_LENGTH) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
memmove(mPrevWordCodePoints[i], prevWordCodePoints[i],
|
|
||||||
sizeof(mPrevWordCodePoints[i][0]) * prevWordCodePointCount[i]);
|
|
||||||
mPrevWordCodePointCount[i] = prevWordCodePointCount[i];
|
|
||||||
mIsBeginningOfSentence[i] = isBeginningOfSentence[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Construct from a previous word.
|
// Construct from a previous word.
|
||||||
NgramContext(const int *const prevWordCodePoints, const int prevWordCodePointCount,
|
NgramContext(const int *const prevWordCodePoints, const int prevWordCodePointCount,
|
||||||
const bool isBeginningOfSentence) : mPrevWordCount(1) {
|
const bool isBeginningOfSentence);
|
||||||
clear();
|
|
||||||
if (prevWordCodePointCount > MAX_WORD_LENGTH || !prevWordCodePoints) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
memmove(mPrevWordCodePoints[0], prevWordCodePoints,
|
|
||||||
sizeof(mPrevWordCodePoints[0][0]) * prevWordCodePointCount);
|
|
||||||
mPrevWordCodePointCount[0] = prevWordCodePointCount;
|
|
||||||
mIsBeginningOfSentence[0] = isBeginningOfSentence;
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t getPrevWordCount() const {
|
size_t getPrevWordCount() const {
|
||||||
return mPrevWordCount;
|
return mPrevWordCount;
|
||||||
}
|
}
|
||||||
|
bool isValid() const;
|
||||||
// TODO: Remove.
|
|
||||||
const NgramContext getTrimmedNgramContext(const size_t maxPrevWordCount) const {
|
|
||||||
return NgramContext(mPrevWordCodePoints, mPrevWordCodePointCount, mIsBeginningOfSentence,
|
|
||||||
std::min(mPrevWordCount, maxPrevWordCount));
|
|
||||||
}
|
|
||||||
|
|
||||||
bool isValid() const {
|
|
||||||
if (mPrevWordCodePointCount[0] > 0) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
if (mIsBeginningOfSentence[0]) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
template<size_t N>
|
template<size_t N>
|
||||||
const WordIdArrayView getPrevWordIds(
|
const WordIdArrayView getPrevWordIds(
|
||||||
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
|
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
|
||||||
std::array<int, N> *const prevWordIdBuffer, const bool tryLowerCaseSearch) const {
|
WordIdArray<N> *const prevWordIdBuffer, const bool tryLowerCaseSearch) const {
|
||||||
for (size_t i = 0; i < std::min(mPrevWordCount, N); ++i) {
|
for (size_t i = 0; i < std::min(mPrevWordCount, N); ++i) {
|
||||||
prevWordIdBuffer->at(i) = getWordId(dictStructurePolicy,
|
prevWordIdBuffer->at(i) = getWordId(dictStructurePolicy, mPrevWordCodePoints[i],
|
||||||
mPrevWordCodePoints[i], mPrevWordCodePointCount[i],
|
mPrevWordCodePointCount[i], mIsBeginningOfSentence[i], tryLowerCaseSearch);
|
||||||
mIsBeginningOfSentence[i], tryLowerCaseSearch);
|
|
||||||
}
|
}
|
||||||
return WordIdArrayView::fromArray(*prevWordIdBuffer).limit(mPrevWordCount);
|
return WordIdArrayView::fromArray(*prevWordIdBuffer).limit(mPrevWordCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
// n is 1-indexed.
|
// n is 1-indexed.
|
||||||
const CodePointArrayView getNthPrevWordCodePoints(const size_t n) const {
|
const CodePointArrayView getNthPrevWordCodePoints(const size_t n) const;
|
||||||
if (n <= 0 || n > mPrevWordCount) {
|
|
||||||
return CodePointArrayView();
|
|
||||||
}
|
|
||||||
return CodePointArrayView(mPrevWordCodePoints[n - 1], mPrevWordCodePointCount[n - 1]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// n is 1-indexed.
|
// n is 1-indexed.
|
||||||
bool isNthPrevWordBeginningOfSentence(const size_t n) const {
|
bool isNthPrevWordBeginningOfSentence(const size_t n) const;
|
||||||
if (n <= 0 || n > mPrevWordCount) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return mIsBeginningOfSentence[n - 1];
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_ASSIGNMENT_OPERATOR(NgramContext);
|
DISALLOW_ASSIGNMENT_OPERATOR(NgramContext);
|
||||||
|
|
||||||
static int getWordId(const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
|
static int getWordId(const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
|
||||||
const int *const wordCodePoints, const int wordCodePointCount,
|
const int *const wordCodePoints, const int wordCodePointCount,
|
||||||
const bool isBeginningOfSentence, const bool tryLowerCaseSearch) {
|
const bool isBeginningOfSentence, const bool tryLowerCaseSearch);
|
||||||
if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) {
|
void clear();
|
||||||
return NOT_A_WORD_ID;
|
|
||||||
}
|
|
||||||
int codePoints[MAX_WORD_LENGTH];
|
|
||||||
int codePointCount = wordCodePointCount;
|
|
||||||
memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount);
|
|
||||||
if (isBeginningOfSentence) {
|
|
||||||
codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints,
|
|
||||||
codePointCount, MAX_WORD_LENGTH);
|
|
||||||
if (codePointCount <= 0) {
|
|
||||||
return NOT_A_WORD_ID;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
const CodePointArrayView codePointArrayView(codePoints, codePointCount);
|
|
||||||
const int wordId = dictStructurePolicy->getWordId(
|
|
||||||
codePointArrayView, false /* forceLowerCaseSearch */);
|
|
||||||
if (wordId != NOT_A_WORD_ID || !tryLowerCaseSearch) {
|
|
||||||
// Return the id when when the word was found or doesn't try lower case search.
|
|
||||||
return wordId;
|
|
||||||
}
|
|
||||||
// Check bigrams for lower-cased previous word if original was not found. Useful for
|
|
||||||
// auto-capitalized words like "The [current_word]".
|
|
||||||
return dictStructurePolicy->getWordId(codePointArrayView, true /* forceLowerCaseSearch */);
|
|
||||||
}
|
|
||||||
|
|
||||||
void clear() {
|
|
||||||
for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
|
|
||||||
mPrevWordCodePointCount[i] = 0;
|
|
||||||
mIsBeginningOfSentence[i] = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const size_t mPrevWordCount;
|
const size_t mPrevWordCount;
|
||||||
int mPrevWordCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH];
|
int mPrevWordCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH];
|
||||||
|
|
|
@ -344,8 +344,7 @@ bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCod
|
||||||
return mNodeWriter.suppressUnigramEntry(&ptNodeParams);
|
return mNodeWriter.suppressUnigramEntry(&ptNodeParams);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramContext *const ngramContext,
|
bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramProperty *const ngramProperty) {
|
||||||
const NgramProperty *const ngramProperty) {
|
|
||||||
if (!mBuffers->isUpdatable()) {
|
if (!mBuffers->isUpdatable()) {
|
||||||
AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
|
AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
|
||||||
return false;
|
return false;
|
||||||
|
@ -355,6 +354,7 @@ bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramContext *const ngramContex
|
||||||
mDictBuffer->getTailPosition());
|
mDictBuffer->getTailPosition());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
const NgramContext *const ngramContext = ngramProperty->getNgramContext();
|
||||||
if (!ngramContext->isValid()) {
|
if (!ngramContext->isValid()) {
|
||||||
AKLOGE("Ngram context is not valid for adding n-gram entry to the dictionary.");
|
AKLOGE("Ngram context is not valid for adding n-gram entry to the dictionary.");
|
||||||
return false;
|
return false;
|
||||||
|
@ -463,9 +463,9 @@ bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext(
|
||||||
}
|
}
|
||||||
const int probabilityForNgram = ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)
|
const int probabilityForNgram = ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)
|
||||||
? NOT_A_PROBABILITY : probability;
|
? NOT_A_PROBABILITY : probability;
|
||||||
const NgramProperty ngramProperty(wordCodePoints.toVector(), probabilityForNgram,
|
const NgramProperty ngramProperty(*ngramContext, wordCodePoints.toVector(), probabilityForNgram,
|
||||||
historicalInfo);
|
historicalInfo);
|
||||||
if (!addNgramEntry(ngramContext, &ngramProperty)) {
|
if (!addNgramEntry(&ngramProperty)) {
|
||||||
AKLOGE("Cannot update unigarm entry in updateEntriesForWordWithNgramContext().");
|
AKLOGE("Cannot update unigarm entry in updateEntriesForWordWithNgramContext().");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -585,6 +585,8 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
|
||||||
bigramEntry.getHistoricalInfo(), mHeaderPolicy) :
|
bigramEntry.getHistoricalInfo(), mHeaderPolicy) :
|
||||||
bigramEntry.getProbability();
|
bigramEntry.getProbability();
|
||||||
ngrams.emplace_back(
|
ngrams.emplace_back(
|
||||||
|
NgramContext(wordCodePoints.data(), wordCodePoints.size(),
|
||||||
|
ptNodeParams.representsBeginningOfSentence()),
|
||||||
CodePointArrayView(bigramWord1CodePoints, codePointCount).toVector(),
|
CodePointArrayView(bigramWord1CodePoints, codePointCount).toVector(),
|
||||||
probability, *historicalInfo);
|
probability, *historicalInfo);
|
||||||
}
|
}
|
||||||
|
|
|
@ -113,8 +113,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
|
|
||||||
bool removeUnigramEntry(const CodePointArrayView wordCodePoints);
|
bool removeUnigramEntry(const CodePointArrayView wordCodePoints);
|
||||||
|
|
||||||
bool addNgramEntry(const NgramContext *const ngramContext,
|
bool addNgramEntry(const NgramProperty *const ngramProperty);
|
||||||
const NgramProperty *const ngramProperty);
|
|
||||||
|
|
||||||
bool removeNgramEntry(const NgramContext *const ngramContext,
|
bool removeNgramEntry(const NgramContext *const ngramContext,
|
||||||
const CodePointArrayView wordCodePoints);
|
const CodePointArrayView wordCodePoints);
|
||||||
|
|
|
@ -451,6 +451,8 @@ const WordProperty PatriciaTriePolicy::getWordProperty(
|
||||||
bigramWord1CodePoints, &word1Probability);
|
bigramWord1CodePoints, &word1Probability);
|
||||||
const int probability = getProbability(word1Probability, bigramsIt.getProbability());
|
const int probability = getProbability(word1Probability, bigramsIt.getProbability());
|
||||||
ngrams.emplace_back(
|
ngrams.emplace_back(
|
||||||
|
NgramContext(wordCodePoints.data(), wordCodePoints.size(),
|
||||||
|
ptNodeParams.representsBeginningOfSentence()),
|
||||||
CodePointArrayView(bigramWord1CodePoints, word1CodePointCount).toVector(),
|
CodePointArrayView(bigramWord1CodePoints, word1CodePointCount).toVector(),
|
||||||
probability, HistoricalInfo());
|
probability, HistoricalInfo());
|
||||||
}
|
}
|
||||||
|
|
|
@ -93,8 +93,7 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool addNgramEntry(const NgramContext *const ngramContext,
|
bool addNgramEntry(const NgramProperty *const ngramProperty) {
|
||||||
const NgramProperty *const ngramProperty) {
|
|
||||||
// This method should not be called for non-updatable dictionary.
|
// This method should not be called for non-updatable dictionary.
|
||||||
AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
|
AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -264,8 +264,7 @@ bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCod
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramContext *const ngramContext,
|
bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramProperty *const ngramProperty) {
|
||||||
const NgramProperty *const ngramProperty) {
|
|
||||||
if (!mBuffers->isUpdatable()) {
|
if (!mBuffers->isUpdatable()) {
|
||||||
AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
|
AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
|
||||||
return false;
|
return false;
|
||||||
|
@ -275,6 +274,7 @@ bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramContext *const ngramContex
|
||||||
mDictBuffer->getTailPosition());
|
mDictBuffer->getTailPosition());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
const NgramContext *const ngramContext = ngramProperty->getNgramContext();
|
||||||
if (!ngramContext->isValid()) {
|
if (!ngramContext->isValid()) {
|
||||||
AKLOGE("Ngram context is not valid for adding n-gram entry to the dictionary.");
|
AKLOGE("Ngram context is not valid for adding n-gram entry to the dictionary.");
|
||||||
return false;
|
return false;
|
||||||
|
@ -453,7 +453,8 @@ bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
|
||||||
// Needs to reduce dictionary size.
|
// Needs to reduce dictionary size.
|
||||||
return true;
|
return true;
|
||||||
} else if (mHeaderPolicy->isDecayingDict()) {
|
} else if (mHeaderPolicy->isDecayingDict()) {
|
||||||
return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mEntryCounters.getEntryCounts(), mHeaderPolicy);
|
return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mEntryCounters.getEntryCounts(),
|
||||||
|
mHeaderPolicy);
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -503,12 +504,16 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
|
||||||
prevWordIds)) {
|
prevWordIds)) {
|
||||||
const int codePointCount = getCodePointsAndReturnCodePointCount(entry.getWordId(),
|
const int codePointCount = getCodePointsAndReturnCodePointCount(entry.getWordId(),
|
||||||
MAX_WORD_LENGTH, bigramWord1CodePoints);
|
MAX_WORD_LENGTH, bigramWord1CodePoints);
|
||||||
const ProbabilityEntry probabilityEntry = entry.getProbabilityEntry();
|
const ProbabilityEntry ngramProbabilityEntry = entry.getProbabilityEntry();
|
||||||
const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
|
const HistoricalInfo *const historicalInfo = ngramProbabilityEntry.getHistoricalInfo();
|
||||||
const int probability = probabilityEntry.hasHistoricalInfo() ?
|
const int probability = ngramProbabilityEntry.hasHistoricalInfo() ?
|
||||||
ForgettingCurveUtils::decodeProbability(historicalInfo, mHeaderPolicy) :
|
ForgettingCurveUtils::decodeProbability(historicalInfo, mHeaderPolicy) :
|
||||||
probabilityEntry.getProbability();
|
ngramProbabilityEntry.getProbability();
|
||||||
ngrams.emplace_back(CodePointArrayView(bigramWord1CodePoints, codePointCount).toVector(),
|
ngrams.emplace_back(
|
||||||
|
NgramContext(
|
||||||
|
wordCodePoints.data(), wordCodePoints.size(),
|
||||||
|
probabilityEntry.representsBeginningOfSentence()),
|
||||||
|
CodePointArrayView(bigramWord1CodePoints, codePointCount).toVector(),
|
||||||
probability, *historicalInfo);
|
probability, *historicalInfo);
|
||||||
}
|
}
|
||||||
// Fetch shortcut information.
|
// Fetch shortcut information.
|
||||||
|
|
|
@ -92,8 +92,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
|
|
||||||
bool removeUnigramEntry(const CodePointArrayView wordCodePoints);
|
bool removeUnigramEntry(const CodePointArrayView wordCodePoints);
|
||||||
|
|
||||||
bool addNgramEntry(const NgramContext *const ngramContext,
|
bool addNgramEntry(const NgramProperty *const ngramProperty);
|
||||||
const NgramProperty *const ngramProperty);
|
|
||||||
|
|
||||||
bool removeNgramEntry(const NgramContext *const ngramContext,
|
bool removeNgramEntry(const NgramContext *const ngramContext,
|
||||||
const CodePointArrayView wordCodePoints);
|
const CodePointArrayView wordCodePoints);
|
||||||
|
|
Loading…
Reference in a new issue