Merge "Get bigrams iterator via dict structure policy."
commit
42a64e24b8
|
@ -24,6 +24,11 @@ namespace latinime {
|
||||||
|
|
||||||
class BinaryDictionaryBigramsIterator {
|
class BinaryDictionaryBigramsIterator {
|
||||||
public:
|
public:
|
||||||
|
// Empty iterator.
|
||||||
|
BinaryDictionaryBigramsIterator()
|
||||||
|
: mBigramsStructurePolicy(nullptr), mPos(NOT_A_DICT_POS),
|
||||||
|
mBigramPos(NOT_A_DICT_POS), mProbability(NOT_A_PROBABILITY), mHasNext(false) {}
|
||||||
|
|
||||||
BinaryDictionaryBigramsIterator(
|
BinaryDictionaryBigramsIterator(
|
||||||
const DictionaryBigramsStructurePolicy *const bigramsStructurePolicy, const int pos)
|
const DictionaryBigramsStructurePolicy *const bigramsStructurePolicy, const int pos)
|
||||||
: mBigramsStructurePolicy(bigramsStructurePolicy), mPos(pos),
|
: mBigramsStructurePolicy(bigramsStructurePolicy), mPos(pos),
|
||||||
|
|
|
@ -53,9 +53,8 @@ int MultiBigramMap::getBigramProbability(
|
||||||
|
|
||||||
void MultiBigramMap::BigramMap::init(
|
void MultiBigramMap::BigramMap::init(
|
||||||
const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos) {
|
const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos) {
|
||||||
const int bigramsListPos = structurePolicy->getBigramsPositionOfPtNode(nodePos);
|
BinaryDictionaryBigramsIterator bigramsIt =
|
||||||
BinaryDictionaryBigramsIterator bigramsIt(structurePolicy->getBigramsStructurePolicy(),
|
structurePolicy->getBigramsIteratorOfPtNode(nodePos);
|
||||||
bigramsListPos);
|
|
||||||
while (bigramsIt.hasNext()) {
|
while (bigramsIt.hasNext()) {
|
||||||
bigramsIt.next();
|
bigramsIt.next();
|
||||||
if (bigramsIt.getBigramPos() == NOT_A_DICT_POS) {
|
if (bigramsIt.getBigramPos() == NOT_A_DICT_POS) {
|
||||||
|
@ -89,9 +88,8 @@ int MultiBigramMap::readBigramProbabilityFromBinaryDictionary(
|
||||||
const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos,
|
const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos,
|
||||||
const int nextWordPosition, const int unigramProbability) {
|
const int nextWordPosition, const int unigramProbability) {
|
||||||
int bigramProbability = NOT_A_PROBABILITY;
|
int bigramProbability = NOT_A_PROBABILITY;
|
||||||
const int bigramsListPos = structurePolicy->getBigramsPositionOfPtNode(nodePos);
|
BinaryDictionaryBigramsIterator bigramsIt =
|
||||||
BinaryDictionaryBigramsIterator bigramsIt(structurePolicy->getBigramsStructurePolicy(),
|
structurePolicy->getBigramsIteratorOfPtNode(nodePos);
|
||||||
bigramsListPos);
|
|
||||||
while (bigramsIt.hasNext()) {
|
while (bigramsIt.hasNext()) {
|
||||||
bigramsIt.next();
|
bigramsIt.next();
|
||||||
if (bigramsIt.getBigramPos() == nextWordPosition) {
|
if (bigramsIt.getBigramPos() == nextWordPosition) {
|
||||||
|
|
|
@ -20,6 +20,7 @@
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
|
#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
|
||||||
#include "suggest/core/dictionary/property/word_property.h"
|
#include "suggest/core/dictionary/property/word_property.h"
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
@ -61,12 +62,10 @@ class DictionaryStructureWithBufferPolicy {
|
||||||
|
|
||||||
virtual int getShortcutPositionOfPtNode(const int nodePos) const = 0;
|
virtual int getShortcutPositionOfPtNode(const int nodePos) const = 0;
|
||||||
|
|
||||||
virtual int getBigramsPositionOfPtNode(const int nodePos) const = 0;
|
virtual BinaryDictionaryBigramsIterator getBigramsIteratorOfPtNode(const int nodePos) const = 0;
|
||||||
|
|
||||||
virtual const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const = 0;
|
virtual const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const = 0;
|
||||||
|
|
||||||
virtual const DictionaryBigramsStructurePolicy *getBigramsStructurePolicy() const = 0;
|
|
||||||
|
|
||||||
virtual const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const = 0;
|
virtual const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const = 0;
|
||||||
|
|
||||||
// Returns whether the update was success or not.
|
// Returns whether the update was success or not.
|
||||||
|
|
|
@ -92,11 +92,9 @@ class PrevWordsInfo {
|
||||||
|
|
||||||
BinaryDictionaryBigramsIterator getBigramsIteratorForPrediction(
|
BinaryDictionaryBigramsIterator getBigramsIteratorForPrediction(
|
||||||
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy) const {
|
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy) const {
|
||||||
const int bigramListPos = getBigramListPositionForWordWithTryingLowerCaseSearch(
|
return getBigramsIteratorForWordWithTryingLowerCaseSearch(
|
||||||
dictStructurePolicy, mPrevWordCodePoints[0], mPrevWordCodePointCount[0],
|
dictStructurePolicy, mPrevWordCodePoints[0], mPrevWordCodePointCount[0],
|
||||||
mIsBeginningOfSentence[0]);
|
mIsBeginningOfSentence[0]);
|
||||||
return BinaryDictionaryBigramsIterator(dictStructurePolicy->getBigramsStructurePolicy(),
|
|
||||||
bigramListPos);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// n is 1-indexed.
|
// n is 1-indexed.
|
||||||
|
@ -156,12 +154,12 @@ class PrevWordsInfo {
|
||||||
codePoints, codePointCount, true /* forceLowerCaseSearch */);
|
codePoints, codePointCount, true /* forceLowerCaseSearch */);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int getBigramListPositionForWordWithTryingLowerCaseSearch(
|
static BinaryDictionaryBigramsIterator getBigramsIteratorForWordWithTryingLowerCaseSearch(
|
||||||
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
|
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
|
||||||
const int *const wordCodePoints, const int wordCodePointCount,
|
const int *const wordCodePoints, const int wordCodePointCount,
|
||||||
const bool isBeginningOfSentence) {
|
const bool isBeginningOfSentence) {
|
||||||
if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) {
|
if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) {
|
||||||
return NOT_A_DICT_POS;
|
return BinaryDictionaryBigramsIterator();
|
||||||
}
|
}
|
||||||
int codePoints[MAX_WORD_LENGTH];
|
int codePoints[MAX_WORD_LENGTH];
|
||||||
int codePointCount = wordCodePointCount;
|
int codePointCount = wordCodePointCount;
|
||||||
|
@ -170,30 +168,30 @@ class PrevWordsInfo {
|
||||||
codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints,
|
codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints,
|
||||||
codePointCount, MAX_WORD_LENGTH);
|
codePointCount, MAX_WORD_LENGTH);
|
||||||
if (codePointCount <= 0) {
|
if (codePointCount <= 0) {
|
||||||
return NOT_A_DICT_POS;
|
return BinaryDictionaryBigramsIterator();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
int pos = getBigramListPositionForWord(dictStructurePolicy, codePoints,
|
BinaryDictionaryBigramsIterator bigramsIt = getBigramsIteratorForWord(dictStructurePolicy,
|
||||||
codePointCount, false /* forceLowerCaseSearch */);
|
codePoints, codePointCount, false /* forceLowerCaseSearch */);
|
||||||
// getBigramListPositionForWord returns NOT_A_DICT_POS if this word isn't in the
|
// getBigramsIteratorForWord returns an empty iterator if this word isn't in the dictionary
|
||||||
// dictionary or has no bigrams
|
// or has no bigrams.
|
||||||
if (NOT_A_DICT_POS == pos) {
|
if (bigramsIt.hasNext()) {
|
||||||
// If no bigrams for this exact word, search again in lower case.
|
return bigramsIt;
|
||||||
pos = getBigramListPositionForWord(dictStructurePolicy, codePoints,
|
|
||||||
codePointCount, true /* forceLowerCaseSearch */);
|
|
||||||
}
|
}
|
||||||
return pos;
|
// If no bigrams for this exact word, search again in lower case.
|
||||||
|
return getBigramsIteratorForWord(dictStructurePolicy, codePoints,
|
||||||
|
codePointCount, true /* forceLowerCaseSearch */);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int getBigramListPositionForWord(
|
static BinaryDictionaryBigramsIterator getBigramsIteratorForWord(
|
||||||
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
|
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
|
||||||
const int *wordCodePoints, const int wordCodePointCount,
|
const int *wordCodePoints, const int wordCodePointCount,
|
||||||
const bool forceLowerCaseSearch) {
|
const bool forceLowerCaseSearch) {
|
||||||
if (!wordCodePoints || wordCodePointCount <= 0) return NOT_A_DICT_POS;
|
if (!wordCodePoints || wordCodePointCount <= 0) return BinaryDictionaryBigramsIterator();
|
||||||
const int terminalPtNodePos = dictStructurePolicy->getTerminalPtNodePositionOfWord(
|
const int terminalPtNodePos = dictStructurePolicy->getTerminalPtNodePositionOfWord(
|
||||||
wordCodePoints, wordCodePointCount, forceLowerCaseSearch);
|
wordCodePoints, wordCodePointCount, forceLowerCaseSearch);
|
||||||
if (NOT_A_DICT_POS == terminalPtNodePos) return NOT_A_DICT_POS;
|
if (NOT_A_DICT_POS == terminalPtNodePos) return BinaryDictionaryBigramsIterator();
|
||||||
return dictStructurePolicy->getBigramsPositionOfPtNode(terminalPtNodePos);
|
return dictStructurePolicy->getBigramsIteratorOfPtNode(terminalPtNodePos);
|
||||||
}
|
}
|
||||||
|
|
||||||
void clear() {
|
void clear() {
|
||||||
|
|
|
@ -154,6 +154,12 @@ int Ver4PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) con
|
||||||
ptNodeParams.getTerminalId());
|
ptNodeParams.getTerminalId());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
BinaryDictionaryBigramsIterator Ver4PatriciaTriePolicy::getBigramsIteratorOfPtNode(
|
||||||
|
const int ptNodePos) const {
|
||||||
|
const int bigramsPosition = getBigramsPositionOfPtNode(ptNodePos);
|
||||||
|
return BinaryDictionaryBigramsIterator(&mBigramPolicy, bigramsPosition);
|
||||||
|
}
|
||||||
|
|
||||||
int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {
|
int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {
|
||||||
if (ptNodePos == NOT_A_DICT_POS) {
|
if (ptNodePos == NOT_A_DICT_POS) {
|
||||||
return NOT_A_DICT_POS;
|
return NOT_A_DICT_POS;
|
||||||
|
|
|
@ -94,16 +94,12 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
|
|
||||||
int getShortcutPositionOfPtNode(const int ptNodePos) const;
|
int getShortcutPositionOfPtNode(const int ptNodePos) const;
|
||||||
|
|
||||||
int getBigramsPositionOfPtNode(const int ptNodePos) const;
|
BinaryDictionaryBigramsIterator getBigramsIteratorOfPtNode(const int ptNodePos) const;
|
||||||
|
|
||||||
const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const {
|
const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const {
|
||||||
return mHeaderPolicy;
|
return mHeaderPolicy;
|
||||||
}
|
}
|
||||||
|
|
||||||
const DictionaryBigramsStructurePolicy *getBigramsStructurePolicy() const {
|
|
||||||
return &mBigramPolicy;
|
|
||||||
}
|
|
||||||
|
|
||||||
const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const {
|
const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const {
|
||||||
return &mShortcutPolicy;
|
return &mShortcutPolicy;
|
||||||
}
|
}
|
||||||
|
@ -167,6 +163,8 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
int mBigramCount;
|
int mBigramCount;
|
||||||
std::vector<int> mTerminalPtNodePositionsForIteratingWords;
|
std::vector<int> mTerminalPtNodePositionsForIteratingWords;
|
||||||
mutable bool mIsCorrupted;
|
mutable bool mIsCorrupted;
|
||||||
|
|
||||||
|
int getBigramsPositionOfPtNode(const int ptNodePos) const;
|
||||||
};
|
};
|
||||||
} // namespace v402
|
} // namespace v402
|
||||||
} // namespace backward
|
} // namespace backward
|
||||||
|
|
|
@ -304,6 +304,12 @@ int PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const {
|
||||||
return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos).getShortcutPos();
|
return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos).getShortcutPos();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
BinaryDictionaryBigramsIterator PatriciaTriePolicy::getBigramsIteratorOfPtNode(
|
||||||
|
const int ptNodePos) const {
|
||||||
|
const int bigramsPosition = getBigramsPositionOfPtNode(ptNodePos);
|
||||||
|
return BinaryDictionaryBigramsIterator(&mBigramListPolicy, bigramsPosition);
|
||||||
|
}
|
||||||
|
|
||||||
int PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {
|
int PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {
|
||||||
if (ptNodePos == NOT_A_DICT_POS) {
|
if (ptNodePos == NOT_A_DICT_POS) {
|
||||||
return NOT_A_DICT_POS;
|
return NOT_A_DICT_POS;
|
||||||
|
@ -322,7 +328,7 @@ int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNod
|
||||||
int bigramPos = NOT_A_DICT_POS;
|
int bigramPos = NOT_A_DICT_POS;
|
||||||
int siblingPos = NOT_A_DICT_POS;
|
int siblingPos = NOT_A_DICT_POS;
|
||||||
PatriciaTrieReadingUtils::readPtNodeInfo(mDictRoot, ptNodePos, getShortcutsStructurePolicy(),
|
PatriciaTrieReadingUtils::readPtNodeInfo(mDictRoot, ptNodePos, getShortcutsStructurePolicy(),
|
||||||
getBigramsStructurePolicy(), &flags, &mergedNodeCodePointCount, mergedNodeCodePoints,
|
&mBigramListPolicy, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints,
|
||||||
&probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
|
&probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
|
||||||
// Skip PtNodes don't start with Unicode code point because they represent non-word information.
|
// Skip PtNodes don't start with Unicode code point because they represent non-word information.
|
||||||
if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
|
if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
|
||||||
|
@ -352,7 +358,7 @@ const WordProperty PatriciaTriePolicy::getWordProperty(const int *const codePoin
|
||||||
std::vector<BigramProperty> bigrams;
|
std::vector<BigramProperty> bigrams;
|
||||||
const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos);
|
const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos);
|
||||||
int bigramWord1CodePoints[MAX_WORD_LENGTH];
|
int bigramWord1CodePoints[MAX_WORD_LENGTH];
|
||||||
BinaryDictionaryBigramsIterator bigramsIt(getBigramsStructurePolicy(), bigramListPos);
|
BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramListPos);
|
||||||
while (bigramsIt.hasNext()) {
|
while (bigramsIt.hasNext()) {
|
||||||
// Fetch the next bigram information and forward the iterator.
|
// Fetch the next bigram information and forward the iterator.
|
||||||
bigramsIt.next();
|
bigramsIt.next();
|
||||||
|
|
|
@ -67,16 +67,12 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
|
|
||||||
int getShortcutPositionOfPtNode(const int ptNodePos) const;
|
int getShortcutPositionOfPtNode(const int ptNodePos) const;
|
||||||
|
|
||||||
int getBigramsPositionOfPtNode(const int ptNodePos) const;
|
BinaryDictionaryBigramsIterator getBigramsIteratorOfPtNode(const int ptNodePos) const;
|
||||||
|
|
||||||
const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const {
|
const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const {
|
||||||
return &mHeaderPolicy;
|
return &mHeaderPolicy;
|
||||||
}
|
}
|
||||||
|
|
||||||
const DictionaryBigramsStructurePolicy *getBigramsStructurePolicy() const {
|
|
||||||
return &mBigramListPolicy;
|
|
||||||
}
|
|
||||||
|
|
||||||
const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const {
|
const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const {
|
||||||
return &mShortcutListPolicy;
|
return &mShortcutListPolicy;
|
||||||
}
|
}
|
||||||
|
@ -158,6 +154,7 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
std::vector<int> mTerminalPtNodePositionsForIteratingWords;
|
std::vector<int> mTerminalPtNodePositionsForIteratingWords;
|
||||||
mutable bool mIsCorrupted;
|
mutable bool mIsCorrupted;
|
||||||
|
|
||||||
|
int getBigramsPositionOfPtNode(const int ptNodePos) const;
|
||||||
int createAndGetLeavingChildNode(const DicNode *const dicNode, const int ptNodePos,
|
int createAndGetLeavingChildNode(const DicNode *const dicNode, const int ptNodePos,
|
||||||
DicNodeVector *const childDicNodes) const;
|
DicNodeVector *const childDicNodes) const;
|
||||||
};
|
};
|
||||||
|
|
|
@ -144,6 +144,12 @@ int Ver4PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) con
|
||||||
ptNodeParams.getTerminalId());
|
ptNodeParams.getTerminalId());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
BinaryDictionaryBigramsIterator Ver4PatriciaTriePolicy::getBigramsIteratorOfPtNode(
|
||||||
|
const int ptNodePos) const {
|
||||||
|
const int bigramsPosition = getBigramsPositionOfPtNode(ptNodePos);
|
||||||
|
return BinaryDictionaryBigramsIterator(&mBigramPolicy, bigramsPosition);
|
||||||
|
}
|
||||||
|
|
||||||
int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {
|
int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {
|
||||||
if (ptNodePos == NOT_A_DICT_POS) {
|
if (ptNodePos == NOT_A_DICT_POS) {
|
||||||
return NOT_A_DICT_POS;
|
return NOT_A_DICT_POS;
|
||||||
|
|
|
@ -76,16 +76,12 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
|
|
||||||
int getShortcutPositionOfPtNode(const int ptNodePos) const;
|
int getShortcutPositionOfPtNode(const int ptNodePos) const;
|
||||||
|
|
||||||
int getBigramsPositionOfPtNode(const int ptNodePos) const;
|
BinaryDictionaryBigramsIterator getBigramsIteratorOfPtNode(const int ptNodePos) const;
|
||||||
|
|
||||||
const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const {
|
const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const {
|
||||||
return mHeaderPolicy;
|
return mHeaderPolicy;
|
||||||
}
|
}
|
||||||
|
|
||||||
const DictionaryBigramsStructurePolicy *getBigramsStructurePolicy() const {
|
|
||||||
return &mBigramPolicy;
|
|
||||||
}
|
|
||||||
|
|
||||||
const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const {
|
const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const {
|
||||||
return &mShortcutPolicy;
|
return &mShortcutPolicy;
|
||||||
}
|
}
|
||||||
|
@ -146,6 +142,8 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
int mBigramCount;
|
int mBigramCount;
|
||||||
std::vector<int> mTerminalPtNodePositionsForIteratingWords;
|
std::vector<int> mTerminalPtNodePositionsForIteratingWords;
|
||||||
mutable bool mIsCorrupted;
|
mutable bool mIsCorrupted;
|
||||||
|
|
||||||
|
int getBigramsPositionOfPtNode(const int ptNodePos) const;
|
||||||
};
|
};
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
#endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H
|
#endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H
|
||||||
|
|
Loading…
Reference in New Issue