Merge "Get bigrams iterator via dict structure policy."
This commit is contained in:
commit
42a64e24b8
10 changed files with 56 additions and 45 deletions
|
@ -24,6 +24,11 @@ namespace latinime {
|
|||
|
||||
class BinaryDictionaryBigramsIterator {
|
||||
public:
|
||||
// Empty iterator.
|
||||
BinaryDictionaryBigramsIterator()
|
||||
: mBigramsStructurePolicy(nullptr), mPos(NOT_A_DICT_POS),
|
||||
mBigramPos(NOT_A_DICT_POS), mProbability(NOT_A_PROBABILITY), mHasNext(false) {}
|
||||
|
||||
BinaryDictionaryBigramsIterator(
|
||||
const DictionaryBigramsStructurePolicy *const bigramsStructurePolicy, const int pos)
|
||||
: mBigramsStructurePolicy(bigramsStructurePolicy), mPos(pos),
|
||||
|
|
|
@ -53,9 +53,8 @@ int MultiBigramMap::getBigramProbability(
|
|||
|
||||
void MultiBigramMap::BigramMap::init(
|
||||
const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos) {
|
||||
const int bigramsListPos = structurePolicy->getBigramsPositionOfPtNode(nodePos);
|
||||
BinaryDictionaryBigramsIterator bigramsIt(structurePolicy->getBigramsStructurePolicy(),
|
||||
bigramsListPos);
|
||||
BinaryDictionaryBigramsIterator bigramsIt =
|
||||
structurePolicy->getBigramsIteratorOfPtNode(nodePos);
|
||||
while (bigramsIt.hasNext()) {
|
||||
bigramsIt.next();
|
||||
if (bigramsIt.getBigramPos() == NOT_A_DICT_POS) {
|
||||
|
@ -89,9 +88,8 @@ int MultiBigramMap::readBigramProbabilityFromBinaryDictionary(
|
|||
const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos,
|
||||
const int nextWordPosition, const int unigramProbability) {
|
||||
int bigramProbability = NOT_A_PROBABILITY;
|
||||
const int bigramsListPos = structurePolicy->getBigramsPositionOfPtNode(nodePos);
|
||||
BinaryDictionaryBigramsIterator bigramsIt(structurePolicy->getBigramsStructurePolicy(),
|
||||
bigramsListPos);
|
||||
BinaryDictionaryBigramsIterator bigramsIt =
|
||||
structurePolicy->getBigramsIteratorOfPtNode(nodePos);
|
||||
while (bigramsIt.hasNext()) {
|
||||
bigramsIt.next();
|
||||
if (bigramsIt.getBigramPos() == nextWordPosition) {
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
#include <memory>
|
||||
|
||||
#include "defines.h"
|
||||
#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
|
||||
#include "suggest/core/dictionary/property/word_property.h"
|
||||
|
||||
namespace latinime {
|
||||
|
@ -61,12 +62,10 @@ class DictionaryStructureWithBufferPolicy {
|
|||
|
||||
virtual int getShortcutPositionOfPtNode(const int nodePos) const = 0;
|
||||
|
||||
virtual int getBigramsPositionOfPtNode(const int nodePos) const = 0;
|
||||
virtual BinaryDictionaryBigramsIterator getBigramsIteratorOfPtNode(const int nodePos) const = 0;
|
||||
|
||||
virtual const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const = 0;
|
||||
|
||||
virtual const DictionaryBigramsStructurePolicy *getBigramsStructurePolicy() const = 0;
|
||||
|
||||
virtual const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const = 0;
|
||||
|
||||
// Returns whether the update was successful or not.
|
||||
|
|
|
@ -92,11 +92,9 @@ class PrevWordsInfo {
|
|||
|
||||
BinaryDictionaryBigramsIterator getBigramsIteratorForPrediction(
|
||||
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy) const {
|
||||
const int bigramListPos = getBigramListPositionForWordWithTryingLowerCaseSearch(
|
||||
return getBigramsIteratorForWordWithTryingLowerCaseSearch(
|
||||
dictStructurePolicy, mPrevWordCodePoints[0], mPrevWordCodePointCount[0],
|
||||
mIsBeginningOfSentence[0]);
|
||||
return BinaryDictionaryBigramsIterator(dictStructurePolicy->getBigramsStructurePolicy(),
|
||||
bigramListPos);
|
||||
}
|
||||
|
||||
// n is 1-indexed.
|
||||
|
@ -156,12 +154,12 @@ class PrevWordsInfo {
|
|||
codePoints, codePointCount, true /* forceLowerCaseSearch */);
|
||||
}
|
||||
|
||||
static int getBigramListPositionForWordWithTryingLowerCaseSearch(
|
||||
static BinaryDictionaryBigramsIterator getBigramsIteratorForWordWithTryingLowerCaseSearch(
|
||||
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
|
||||
const int *const wordCodePoints, const int wordCodePointCount,
|
||||
const bool isBeginningOfSentence) {
|
||||
if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) {
|
||||
return NOT_A_DICT_POS;
|
||||
return BinaryDictionaryBigramsIterator();
|
||||
}
|
||||
int codePoints[MAX_WORD_LENGTH];
|
||||
int codePointCount = wordCodePointCount;
|
||||
|
@ -170,30 +168,30 @@ class PrevWordsInfo {
|
|||
codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints,
|
||||
codePointCount, MAX_WORD_LENGTH);
|
||||
if (codePointCount <= 0) {
|
||||
return NOT_A_DICT_POS;
|
||||
return BinaryDictionaryBigramsIterator();
|
||||
}
|
||||
}
|
||||
int pos = getBigramListPositionForWord(dictStructurePolicy, codePoints,
|
||||
codePointCount, false /* forceLowerCaseSearch */);
|
||||
// getBigramListPositionForWord returns NOT_A_DICT_POS if this word isn't in the
|
||||
// dictionary or has no bigrams
|
||||
if (NOT_A_DICT_POS == pos) {
|
||||
// If no bigrams for this exact word, search again in lower case.
|
||||
pos = getBigramListPositionForWord(dictStructurePolicy, codePoints,
|
||||
codePointCount, true /* forceLowerCaseSearch */);
|
||||
BinaryDictionaryBigramsIterator bigramsIt = getBigramsIteratorForWord(dictStructurePolicy,
|
||||
codePoints, codePointCount, false /* forceLowerCaseSearch */);
|
||||
// getBigramsIteratorForWord returns an empty iterator if this word isn't in the dictionary
|
||||
// or has no bigrams.
|
||||
if (bigramsIt.hasNext()) {
|
||||
return bigramsIt;
|
||||
}
|
||||
return pos;
|
||||
// If no bigrams for this exact word, search again in lower case.
|
||||
return getBigramsIteratorForWord(dictStructurePolicy, codePoints,
|
||||
codePointCount, true /* forceLowerCaseSearch */);
|
||||
}
|
||||
|
||||
static int getBigramListPositionForWord(
|
||||
static BinaryDictionaryBigramsIterator getBigramsIteratorForWord(
|
||||
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
|
||||
const int *wordCodePoints, const int wordCodePointCount,
|
||||
const bool forceLowerCaseSearch) {
|
||||
if (!wordCodePoints || wordCodePointCount <= 0) return NOT_A_DICT_POS;
|
||||
if (!wordCodePoints || wordCodePointCount <= 0) return BinaryDictionaryBigramsIterator();
|
||||
const int terminalPtNodePos = dictStructurePolicy->getTerminalPtNodePositionOfWord(
|
||||
wordCodePoints, wordCodePointCount, forceLowerCaseSearch);
|
||||
if (NOT_A_DICT_POS == terminalPtNodePos) return NOT_A_DICT_POS;
|
||||
return dictStructurePolicy->getBigramsPositionOfPtNode(terminalPtNodePos);
|
||||
if (NOT_A_DICT_POS == terminalPtNodePos) return BinaryDictionaryBigramsIterator();
|
||||
return dictStructurePolicy->getBigramsIteratorOfPtNode(terminalPtNodePos);
|
||||
}
|
||||
|
||||
void clear() {
|
||||
|
|
|
@ -154,6 +154,12 @@ int Ver4PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) con
|
|||
ptNodeParams.getTerminalId());
|
||||
}
|
||||
|
||||
// Builds a bigrams iterator for the PtNode located at ptNodePos.
// The bigram list position is obtained from getBigramsPositionOfPtNode() and is paired with this
// policy's mBigramPolicy to construct the returned BinaryDictionaryBigramsIterator.
// NOTE(review): getBigramsPositionOfPtNode() returns NOT_A_DICT_POS for a NOT_A_DICT_POS node;
// presumably the resulting iterator then has no entries -- confirm against the iterator ctor.
BinaryDictionaryBigramsIterator Ver4PatriciaTriePolicy::getBigramsIteratorOfPtNode(
|
||||
const int ptNodePos) const {
|
||||
const int bigramsPosition = getBigramsPositionOfPtNode(ptNodePos);
|
||||
// The iterator is returned by value and keeps a pointer to mBigramPolicy.
return BinaryDictionaryBigramsIterator(&mBigramPolicy, bigramsPosition);
|
||||
}
|
||||
|
||||
int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {
|
||||
if (ptNodePos == NOT_A_DICT_POS) {
|
||||
return NOT_A_DICT_POS;
|
||||
|
|
|
@ -94,16 +94,12 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
|||
|
||||
int getShortcutPositionOfPtNode(const int ptNodePos) const;
|
||||
|
||||
int getBigramsPositionOfPtNode(const int ptNodePos) const;
|
||||
BinaryDictionaryBigramsIterator getBigramsIteratorOfPtNode(const int ptNodePos) const;
|
||||
|
||||
// Returns the header structure policy pointer stored in mHeaderPolicy.
const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const {
|
||||
return mHeaderPolicy;
|
||||
}
|
||||
|
||||
// Returns the address of this policy's mBigramPolicy member.
const DictionaryBigramsStructurePolicy *getBigramsStructurePolicy() const {
|
||||
return &mBigramPolicy;
|
||||
}
|
||||
|
||||
// Returns the address of this policy's mShortcutPolicy member.
const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const {
|
||||
return &mShortcutPolicy;
|
||||
}
|
||||
|
@ -167,6 +163,8 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
|||
int mBigramCount;
|
||||
std::vector<int> mTerminalPtNodePositionsForIteratingWords;
|
||||
mutable bool mIsCorrupted;
|
||||
|
||||
int getBigramsPositionOfPtNode(const int ptNodePos) const;
|
||||
};
|
||||
} // namespace v402
|
||||
} // namespace backward
|
||||
|
|
|
@ -304,6 +304,12 @@ int PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const {
|
|||
return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos).getShortcutPos();
|
||||
}
|
||||
|
||||
// Builds a bigrams iterator for the PtNode located at ptNodePos.
// The bigram list position is obtained from getBigramsPositionOfPtNode() and is paired with this
// policy's mBigramListPolicy to construct the returned BinaryDictionaryBigramsIterator.
// NOTE(review): getBigramsPositionOfPtNode() returns NOT_A_DICT_POS for a NOT_A_DICT_POS node;
// presumably the resulting iterator then has no entries -- confirm against the iterator ctor.
BinaryDictionaryBigramsIterator PatriciaTriePolicy::getBigramsIteratorOfPtNode(
|
||||
const int ptNodePos) const {
|
||||
const int bigramsPosition = getBigramsPositionOfPtNode(ptNodePos);
|
||||
// The iterator is returned by value and keeps a pointer to mBigramListPolicy.
return BinaryDictionaryBigramsIterator(&mBigramListPolicy, bigramsPosition);
|
||||
}
|
||||
|
||||
int PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {
|
||||
if (ptNodePos == NOT_A_DICT_POS) {
|
||||
return NOT_A_DICT_POS;
|
||||
|
@ -322,7 +328,7 @@ int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNod
|
|||
int bigramPos = NOT_A_DICT_POS;
|
||||
int siblingPos = NOT_A_DICT_POS;
|
||||
PatriciaTrieReadingUtils::readPtNodeInfo(mDictRoot, ptNodePos, getShortcutsStructurePolicy(),
|
||||
getBigramsStructurePolicy(), &flags, &mergedNodeCodePointCount, mergedNodeCodePoints,
|
||||
&mBigramListPolicy, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints,
|
||||
&probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
|
||||
// Skip PtNodes that don't start with a Unicode code point because they represent non-word information.
|
||||
if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
|
||||
|
@ -352,7 +358,7 @@ const WordProperty PatriciaTriePolicy::getWordProperty(const int *const codePoin
|
|||
std::vector<BigramProperty> bigrams;
|
||||
const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos);
|
||||
int bigramWord1CodePoints[MAX_WORD_LENGTH];
|
||||
BinaryDictionaryBigramsIterator bigramsIt(getBigramsStructurePolicy(), bigramListPos);
|
||||
BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramListPos);
|
||||
while (bigramsIt.hasNext()) {
|
||||
// Fetch the next bigram information and forward the iterator.
|
||||
bigramsIt.next();
|
||||
|
|
|
@ -67,16 +67,12 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
|||
|
||||
int getShortcutPositionOfPtNode(const int ptNodePos) const;
|
||||
|
||||
int getBigramsPositionOfPtNode(const int ptNodePos) const;
|
||||
BinaryDictionaryBigramsIterator getBigramsIteratorOfPtNode(const int ptNodePos) const;
|
||||
|
||||
// Returns the address of this policy's mHeaderPolicy member.
const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const {
|
||||
return &mHeaderPolicy;
|
||||
}
|
||||
|
||||
// Returns the address of this policy's mBigramListPolicy member.
const DictionaryBigramsStructurePolicy *getBigramsStructurePolicy() const {
|
||||
return &mBigramListPolicy;
|
||||
}
|
||||
|
||||
// Returns the address of this policy's mShortcutListPolicy member.
const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const {
|
||||
return &mShortcutListPolicy;
|
||||
}
|
||||
|
@ -158,6 +154,7 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
|||
std::vector<int> mTerminalPtNodePositionsForIteratingWords;
|
||||
mutable bool mIsCorrupted;
|
||||
|
||||
int getBigramsPositionOfPtNode(const int ptNodePos) const;
|
||||
int createAndGetLeavingChildNode(const DicNode *const dicNode, const int ptNodePos,
|
||||
DicNodeVector *const childDicNodes) const;
|
||||
};
|
||||
|
|
|
@ -144,6 +144,12 @@ int Ver4PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) con
|
|||
ptNodeParams.getTerminalId());
|
||||
}
|
||||
|
||||
// Builds a bigrams iterator for the PtNode located at ptNodePos.
// The bigram list position is obtained from getBigramsPositionOfPtNode() and is paired with this
// policy's mBigramPolicy to construct the returned BinaryDictionaryBigramsIterator.
// NOTE(review): getBigramsPositionOfPtNode() returns NOT_A_DICT_POS for a NOT_A_DICT_POS node;
// presumably the resulting iterator then has no entries -- confirm against the iterator ctor.
BinaryDictionaryBigramsIterator Ver4PatriciaTriePolicy::getBigramsIteratorOfPtNode(
|
||||
const int ptNodePos) const {
|
||||
const int bigramsPosition = getBigramsPositionOfPtNode(ptNodePos);
|
||||
// The iterator is returned by value and keeps a pointer to mBigramPolicy.
return BinaryDictionaryBigramsIterator(&mBigramPolicy, bigramsPosition);
|
||||
}
|
||||
|
||||
int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {
|
||||
if (ptNodePos == NOT_A_DICT_POS) {
|
||||
return NOT_A_DICT_POS;
|
||||
|
|
|
@ -76,16 +76,12 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
|||
|
||||
int getShortcutPositionOfPtNode(const int ptNodePos) const;
|
||||
|
||||
int getBigramsPositionOfPtNode(const int ptNodePos) const;
|
||||
BinaryDictionaryBigramsIterator getBigramsIteratorOfPtNode(const int ptNodePos) const;
|
||||
|
||||
// Returns the header structure policy pointer stored in mHeaderPolicy.
const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const {
|
||||
return mHeaderPolicy;
|
||||
}
|
||||
|
||||
// Returns the address of this policy's mBigramPolicy member.
const DictionaryBigramsStructurePolicy *getBigramsStructurePolicy() const {
|
||||
return &mBigramPolicy;
|
||||
}
|
||||
|
||||
// Returns the address of this policy's mShortcutPolicy member.
const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const {
|
||||
return &mShortcutPolicy;
|
||||
}
|
||||
|
@ -146,6 +142,8 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
|||
int mBigramCount;
|
||||
std::vector<int> mTerminalPtNodePositionsForIteratingWords;
|
||||
mutable bool mIsCorrupted;
|
||||
|
||||
int getBigramsPositionOfPtNode(const int ptNodePos) const;
|
||||
};
|
||||
} // namespace latinime
|
||||
#endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H
|
||||
|
|
Loading…
Reference in a new issue