Use passed previous word count in PrevWordsInfo.

Bug: 14425059

Change-Id: I04007bdacf0176a05be7a27ef1c20c5b851d8bed
main
Keisuke Kuroyanagi 2014-09-14 17:29:38 +09:00
parent 537f6eea8a
commit c43b6664fa
7 changed files with 67 additions and 48 deletions

View File

@ -93,13 +93,13 @@ void Dictionary::NgramListenerForPrediction::onVisitEntry(const int ngramProbabi
void Dictionary::getPredictions(const PrevWordsInfo *const prevWordsInfo, void Dictionary::getPredictions(const PrevWordsInfo *const prevWordsInfo,
SuggestionResults *const outSuggestionResults) const { SuggestionResults *const outSuggestionResults) const {
TimeKeeper::setCurrentTime(); TimeKeeper::setCurrentTime();
WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIds; WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
prevWordsInfo->getPrevWordIds(mDictionaryStructureWithBufferPolicy.get(), prevWordIds.data(), const WordIdArrayView prevWordIds = prevWordsInfo->getPrevWordIds(
mDictionaryStructureWithBufferPolicy.get(), &prevWordIdArray,
true /* tryLowerCaseSearch */); true /* tryLowerCaseSearch */);
const WordIdArrayView prevWordIdArrayView = WordIdArrayView::fromArray(prevWordIds); NgramListenerForPrediction listener(prevWordsInfo, prevWordIds, outSuggestionResults,
NgramListenerForPrediction listener(prevWordsInfo, prevWordIdArrayView, outSuggestionResults,
mDictionaryStructureWithBufferPolicy.get()); mDictionaryStructureWithBufferPolicy.get());
mDictionaryStructureWithBufferPolicy->iterateNgramEntries(prevWordIdArrayView, &listener); mDictionaryStructureWithBufferPolicy->iterateNgramEntries(prevWordIds, &listener);
} }
int Dictionary::getProbability(const int *word, int length) const { int Dictionary::getProbability(const int *word, int length) const {
@ -121,11 +121,11 @@ int Dictionary::getNgramProbability(const PrevWordsInfo *const prevWordsInfo, co
if (!prevWordsInfo) { if (!prevWordsInfo) {
return getDictionaryStructurePolicy()->getProbabilityOfWord(WordIdArrayView(), wordId); return getDictionaryStructurePolicy()->getProbabilityOfWord(WordIdArrayView(), wordId);
} }
WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIds; WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
prevWordsInfo->getPrevWordIds(mDictionaryStructureWithBufferPolicy.get(), prevWordIds.data(), const WordIdArrayView prevWordIds = prevWordsInfo->getPrevWordIds
(mDictionaryStructureWithBufferPolicy.get(), &prevWordIdArray,
true /* tryLowerCaseSearch */); true /* tryLowerCaseSearch */);
return getDictionaryStructurePolicy()->getProbabilityOfWord( return getDictionaryStructurePolicy()->getProbabilityOfWord(prevWordIds, wordId);
IntArrayView::fromArray(prevWordIds), wordId);
} }
bool Dictionary::addUnigramEntry(const int *const word, const int length, bool Dictionary::addUnigramEntry(const int *const word, const int length,

View File

@ -35,12 +35,11 @@ namespace latinime {
// No prev words information. // No prev words information.
PrevWordsInfo emptyPrevWordsInfo; PrevWordsInfo emptyPrevWordsInfo;
WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIds; WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
emptyPrevWordsInfo.getPrevWordIds(dictionaryStructurePolicy, prevWordIds.data(), const WordIdArrayView prevWordIds = emptyPrevWordsInfo.getPrevWordIds(
false /* tryLowerCaseSearch */); dictionaryStructurePolicy, &prevWordIdArray, false /* tryLowerCaseSearch */);
current.emplace_back(); current.emplace_back();
DicNodeUtils::initAsRoot(dictionaryStructurePolicy, DicNodeUtils::initAsRoot(dictionaryStructurePolicy, prevWordIds, &current.front());
IntArrayView::fromArray(prevWordIds), &current.front());
for (int i = 0; i < codePointCount; ++i) { for (int i = 0; i < codePointCount; ++i) {
// The base-lower input is used to ignore case errors and accent errors. // The base-lower input is used to ignore case errors and accent errors.
const int codePoint = CharUtils::toBaseLowerCase(codePoints[i]); const int codePoint = CharUtils::toBaseLowerCase(codePoints[i]);

View File

@ -35,8 +35,8 @@ void DicTraverseSession::init(const Dictionary *const dictionary,
mMultiWordCostMultiplier = getDictionaryStructurePolicy()->getHeaderStructurePolicy() mMultiWordCostMultiplier = getDictionaryStructurePolicy()->getHeaderStructurePolicy()
->getMultiWordCostMultiplier(); ->getMultiWordCostMultiplier();
mSuggestOptions = suggestOptions; mSuggestOptions = suggestOptions;
prevWordsInfo->getPrevWordIds(getDictionaryStructurePolicy(), mPrevWordsIds.data(), mPrevWordIdCount = prevWordsInfo->getPrevWordIds(getDictionaryStructurePolicy(),
true /* tryLowerCaseSearch */); &mPrevWordIdArray, true /* tryLowerCaseSearch */).size();
} }
void DicTraverseSession::setupForGetSuggestions(const ProximityInfo *pInfo, void DicTraverseSession::setupForGetSuggestions(const ProximityInfo *pInfo,

View File

@ -51,12 +51,11 @@ class DicTraverseSession {
} }
AK_FORCE_INLINE DicTraverseSession(JNIEnv *env, jstring localeStr, bool usesLargeCache) AK_FORCE_INLINE DicTraverseSession(JNIEnv *env, jstring localeStr, bool usesLargeCache)
: mProximityInfo(nullptr), mDictionary(nullptr), mSuggestOptions(nullptr), : mPrevWordIdCount(0), mProximityInfo(nullptr), mDictionary(nullptr),
mDicNodesCache(usesLargeCache), mMultiBigramMap(), mInputSize(0), mMaxPointerCount(1), mSuggestOptions(nullptr), mDicNodesCache(usesLargeCache), mMultiBigramMap(),
mMultiWordCostMultiplier(1.0f) { mInputSize(0), mMaxPointerCount(1), mMultiWordCostMultiplier(1.0f) {
// NOTE: mProximityInfoStates is an array of instances. // NOTE: mProximityInfoStates is an array of instances.
// No need to initialize it explicitly here. // No need to initialize it explicitly here.
mPrevWordsIds.fill(NOT_A_DICT_POS);
} }
// Non virtual inline destructor -- never inherit this class // Non virtual inline destructor -- never inherit this class
@ -78,7 +77,9 @@ class DicTraverseSession {
//-------------------- //--------------------
const ProximityInfo *getProximityInfo() const { return mProximityInfo; } const ProximityInfo *getProximityInfo() const { return mProximityInfo; }
const SuggestOptions *getSuggestOptions() const { return mSuggestOptions; } const SuggestOptions *getSuggestOptions() const { return mSuggestOptions; }
const WordIdArrayView getPrevWordIds() const { return IntArrayView::fromArray(mPrevWordsIds); } const WordIdArrayView getPrevWordIds() const {
return WordIdArrayView::fromArray(mPrevWordIdArray).limit(mPrevWordIdCount);
}
DicNodesCache *getDicTraverseCache() { return &mDicNodesCache; } DicNodesCache *getDicTraverseCache() { return &mDicNodesCache; }
MultiBigramMap *getMultiBigramMap() { return &mMultiBigramMap; } MultiBigramMap *getMultiBigramMap() { return &mMultiBigramMap; }
const ProximityInfoState *getProximityInfoState(int id) const { const ProximityInfoState *getProximityInfoState(int id) const {
@ -165,7 +166,8 @@ class DicTraverseSession {
const int *const inputYs, const int *const times, const int *const pointerIds, const int *const inputYs, const int *const times, const int *const pointerIds,
const int inputSize, const float maxSpatialDistance, const int maxPointerCount); const int inputSize, const float maxSpatialDistance, const int maxPointerCount);
WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> mPrevWordsIds; WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> mPrevWordIdArray;
size_t mPrevWordIdCount;
const ProximityInfo *mProximityInfo; const ProximityInfo *mProximityInfo;
const Dictionary *mDictionary; const Dictionary *mDictionary;
const SuggestOptions *mSuggestOptions; const SuggestOptions *mSuggestOptions;

View File

@ -17,6 +17,8 @@
#ifndef LATINIME_PREV_WORDS_INFO_H #ifndef LATINIME_PREV_WORDS_INFO_H
#define LATINIME_PREV_WORDS_INFO_H #define LATINIME_PREV_WORDS_INFO_H
#include <array>
#include "defines.h" #include "defines.h"
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
#include "utils/char_utils.h" #include "utils/char_utils.h"
@ -27,12 +29,13 @@ namespace latinime {
class PrevWordsInfo { class PrevWordsInfo {
public: public:
// No prev word information. // No prev word information.
PrevWordsInfo() { PrevWordsInfo() : mPrevWordCount(0) {
clear(); clear();
} }
PrevWordsInfo(PrevWordsInfo &&prevWordsInfo) { PrevWordsInfo(PrevWordsInfo &&prevWordsInfo)
for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { : mPrevWordCount(prevWordsInfo.mPrevWordCount) {
for (size_t i = 0; i < mPrevWordCount; ++i) {
mPrevWordCodePointCount[i] = prevWordsInfo.mPrevWordCodePointCount[i]; mPrevWordCodePointCount[i] = prevWordsInfo.mPrevWordCodePointCount[i];
memmove(mPrevWordCodePoints[i], prevWordsInfo.mPrevWordCodePoints[i], memmove(mPrevWordCodePoints[i], prevWordsInfo.mPrevWordCodePoints[i],
sizeof(mPrevWordCodePoints[i][0]) * mPrevWordCodePointCount[i]); sizeof(mPrevWordCodePoints[i][0]) * mPrevWordCodePointCount[i]);
@ -43,9 +46,10 @@ class PrevWordsInfo {
// Construct from previous words. // Construct from previous words.
PrevWordsInfo(const int prevWordCodePoints[][MAX_WORD_LENGTH], PrevWordsInfo(const int prevWordCodePoints[][MAX_WORD_LENGTH],
const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence, const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence,
const size_t prevWordCount) { const size_t prevWordCount)
: mPrevWordCount(std::min(NELEMS(mPrevWordCodePoints), prevWordCount)) {
clear(); clear();
for (size_t i = 0; i < std::min(NELEMS(mPrevWordCodePoints), prevWordCount); ++i) { for (size_t i = 0; i < mPrevWordCount; ++i) {
if (prevWordCodePointCount[i] < 0 || prevWordCodePointCount[i] > MAX_WORD_LENGTH) { if (prevWordCodePointCount[i] < 0 || prevWordCodePointCount[i] > MAX_WORD_LENGTH) {
continue; continue;
} }
@ -58,7 +62,7 @@ class PrevWordsInfo {
// Construct from a previous word. // Construct from a previous word.
PrevWordsInfo(const int *const prevWordCodePoints, const int prevWordCodePointCount, PrevWordsInfo(const int *const prevWordCodePoints, const int prevWordCodePointCount,
const bool isBeginningOfSentence) { const bool isBeginningOfSentence) : mPrevWordCount(1) {
clear(); clear();
if (prevWordCodePointCount > MAX_WORD_LENGTH || !prevWordCodePoints) { if (prevWordCodePointCount > MAX_WORD_LENGTH || !prevWordCodePoints) {
return; return;
@ -79,26 +83,29 @@ class PrevWordsInfo {
return false; return false;
} }
void getPrevWordIds(const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, template<size_t N>
int *const outPrevWordIds, const bool tryLowerCaseSearch) const { const WordIdArrayView getPrevWordIds(
for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
outPrevWordIds[i] = getWordId(dictStructurePolicy, std::array<int, N> *const prevWordIdBuffer, const bool tryLowerCaseSearch) const {
for (size_t i = 0; i < std::min(mPrevWordCount, N); ++i) {
prevWordIdBuffer->at(i) = getWordId(dictStructurePolicy,
mPrevWordCodePoints[i], mPrevWordCodePointCount[i], mPrevWordCodePoints[i], mPrevWordCodePointCount[i],
mIsBeginningOfSentence[i], tryLowerCaseSearch); mIsBeginningOfSentence[i], tryLowerCaseSearch);
} }
return WordIdArrayView::fromArray(*prevWordIdBuffer).limit(mPrevWordCount);
} }
// n is 1-indexed. // n is 1-indexed.
const CodePointArrayView getNthPrevWordCodePoints(const int n) const { const CodePointArrayView getNthPrevWordCodePoints(const size_t n) const {
if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) { if (n <= 0 || n > mPrevWordCount) {
return CodePointArrayView(); return CodePointArrayView();
} }
return CodePointArrayView(mPrevWordCodePoints[n - 1], mPrevWordCodePointCount[n - 1]); return CodePointArrayView(mPrevWordCodePoints[n - 1], mPrevWordCodePointCount[n - 1]);
} }
// n is 1-indexed. // n is 1-indexed.
bool isNthPrevWordBeginningOfSentence(const int n) const { bool isNthPrevWordBeginningOfSentence(const size_t n) const {
if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) { if (n <= 0 || n > mPrevWordCount) {
return false; return false;
} }
return mIsBeginningOfSentence[n - 1]; return mIsBeginningOfSentence[n - 1];
@ -142,6 +149,7 @@ class PrevWordsInfo {
} }
} }
const size_t mPrevWordCount;
int mPrevWordCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH]; int mPrevWordCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH];
int mPrevWordCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; int mPrevWordCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
bool mIsBeginningOfSentence[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; bool mIsBeginningOfSentence[MAX_PREV_WORD_COUNT_FOR_N_GRAM];

View File

@ -332,8 +332,12 @@ bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsI
"length: %zd", bigramProperty->getTargetCodePoints()->size()); "length: %zd", bigramProperty->getTargetCodePoints()->size());
return false; return false;
} }
int prevWordIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
prevWordsInfo->getPrevWordIds(this, prevWordIds, false /* tryLowerCaseSearch */); const WordIdArrayView prevWordIds = prevWordsInfo->getPrevWordIds(this, &prevWordIdArray,
false /* tryLowerCaseSearch */);
if (prevWordIds.empty()) {
return false;
}
if (prevWordIds[0] == NOT_A_WORD_ID) { if (prevWordIds[0] == NOT_A_WORD_ID) {
if (prevWordsInfo->isNthPrevWordBeginningOfSentence(1 /* n */)) { if (prevWordsInfo->isNthPrevWordBeginningOfSentence(1 /* n */)) {
const std::vector<UnigramProperty::ShortcutProperty> shortcuts; const std::vector<UnigramProperty::ShortcutProperty> shortcuts;
@ -347,7 +351,7 @@ bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsI
return false; return false;
} }
// Refresh word ids. // Refresh word ids.
prevWordsInfo->getPrevWordIds(this, prevWordIds, false /* tryLowerCaseSearch */); prevWordsInfo->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */);
} else { } else {
return false; return false;
} }
@ -390,9 +394,10 @@ bool Ver4PatriciaTriePolicy::removeNgramEntry(const PrevWordsInfo *const prevWor
AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %zd", AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %zd",
wordCodePoints.size()); wordCodePoints.size());
} }
int prevWordIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
prevWordsInfo->getPrevWordIds(this, prevWordIds, false /* tryLowerCaseSerch */); const WordIdArrayView prevWordIds = prevWordsInfo->getPrevWordIds(this, &prevWordIdArray,
if (prevWordIds[0] == NOT_A_WORD_ID) { false /* tryLowerCaseSerch */);
if (prevWordIds.empty() || prevWordIds[0] == NOT_A_WORD_ID) {
return false; return false;
} }
const int wordPos = getTerminalPtNodePosFromWordId(getWordId(wordCodePoints, const int wordPos = getTerminalPtNodePosFromWordId(getWordId(wordCodePoints,

View File

@ -303,8 +303,12 @@ bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsI
"length: %zd", bigramProperty->getTargetCodePoints()->size()); "length: %zd", bigramProperty->getTargetCodePoints()->size());
return false; return false;
} }
WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIds; WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
prevWordsInfo->getPrevWordIds(this, prevWordIds.data(), false /* tryLowerCaseSearch */); const WordIdArrayView prevWordIds = prevWordsInfo->getPrevWordIds(this, &prevWordIdArray,
false /* tryLowerCaseSearch */);
if (prevWordIds.empty()) {
return false;
}
// TODO: Support N-gram. // TODO: Support N-gram.
if (prevWordIds[0] == NOT_A_WORD_ID) { if (prevWordIds[0] == NOT_A_WORD_ID) {
if (prevWordsInfo->isNthPrevWordBeginningOfSentence(1 /* n */)) { if (prevWordsInfo->isNthPrevWordBeginningOfSentence(1 /* n */)) {
@ -319,7 +323,7 @@ bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsI
return false; return false;
} }
// Refresh word ids. // Refresh word ids.
prevWordsInfo->getPrevWordIds(this, prevWordIds.data(), false /* tryLowerCaseSearch */); prevWordsInfo->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */);
} else { } else {
return false; return false;
} }
@ -367,10 +371,11 @@ bool Ver4PatriciaTriePolicy::removeNgramEntry(const PrevWordsInfo *const prevWor
AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %zd", AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %zd",
wordCodePoints.size()); wordCodePoints.size());
} }
WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIds; WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
prevWordsInfo->getPrevWordIds(this, prevWordIds.data(), false /* tryLowerCaseSerch */); const WordIdArrayView prevWordIds = prevWordsInfo->getPrevWordIds(this, &prevWordIdArray,
false /* tryLowerCaseSerch */);
// TODO: Support N-gram. // TODO: Support N-gram.
if (prevWordIds[0] == NOT_A_WORD_ID) { if (prevWordIds.empty() || prevWordIds[0] == NOT_A_WORD_ID) {
return false; return false;
} }
const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */); const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);