Use NgramListener in MultiBigramMap.

Bug: 14425059
Change-Id: I425536290111f2a8172f31370706f858a1e07f6e
main
Keisuke Kuroyanagi 2014-08-01 11:00:03 +09:00
parent da5ccd9f18
commit 35c62b2cc9
4 changed files with 55 additions and 56 deletions

View File

@@ -117,7 +117,7 @@ class DicNode {
int newPrevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; int newPrevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
newPrevWordsPtNodePos[0] = dicNode->mDicNodeProperties.getPtNodePos(); newPrevWordsPtNodePos[0] = dicNode->mDicNodeProperties.getPtNodePos();
for (size_t i = 1; i < NELEMS(newPrevWordsPtNodePos); ++i) { for (size_t i = 1; i < NELEMS(newPrevWordsPtNodePos); ++i) {
newPrevWordsPtNodePos[i] = dicNode->getNthPrevWordTerminalPtNodePos(i); newPrevWordsPtNodePos[i] = dicNode->getPrevWordsTerminalPtNodePos()[i - 1];
} }
mDicNodeProperties.init(rootPtNodeArrayPos, newPrevWordsPtNodePos); mDicNodeProperties.init(rootPtNodeArrayPos, newPrevWordsPtNodePos);
mDicNodeState.initAsRootWithPreviousWord(&dicNode->mDicNodeState, mDicNodeState.initAsRootWithPreviousWord(&dicNode->mDicNodeState,
@@ -208,12 +208,9 @@ class DicNode {
return mDicNodeProperties.getPtNodePos(); return mDicNodeProperties.getPtNodePos();
} }
// Used to get n-gram probability in DicNodeUtils. n is 1-indexed. // TODO: Use view class to return PtNodePos array.
int getNthPrevWordTerminalPtNodePos(const int n) const { const int *getPrevWordsTerminalPtNodePos() const {
if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) { return mDicNodeProperties.getPrevWordsTerminalPtNodePos();
return NOT_A_DICT_POS;
}
return mDicNodeProperties.getPrevWordsTerminalPtNodePos()[n - 1];
} }
// Used in DicNodeUtils // Used in DicNodeUtils

View File

@@ -85,17 +85,10 @@ namespace latinime {
const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy,
const DicNode *const dicNode, MultiBigramMap *const multiBigramMap) { const DicNode *const dicNode, MultiBigramMap *const multiBigramMap) {
const int unigramProbability = dicNode->getProbability(); const int unigramProbability = dicNode->getProbability();
const int ptNodePos = dicNode->getPtNodePos();
const int prevWordTerminalPtNodePos = dicNode->getNthPrevWordTerminalPtNodePos(1 /* n */);
if (NOT_A_DICT_POS == ptNodePos || NOT_A_DICT_POS == prevWordTerminalPtNodePos) {
// Note: Normally wordPos comes from the dictionary and should never equal
// NOT_A_VALID_WORD_POS.
return dictionaryStructurePolicy->getProbability(unigramProbability,
NOT_A_PROBABILITY);
}
if (multiBigramMap) { if (multiBigramMap) {
const int *const prevWordsPtNodePos = dicNode->getPrevWordsTerminalPtNodePos();
return multiBigramMap->getBigramProbability(dictionaryStructurePolicy, return multiBigramMap->getBigramProbability(dictionaryStructurePolicy,
prevWordTerminalPtNodePos, ptNodePos, unigramProbability); prevWordsPtNodePos, dicNode->getPtNodePos(), unigramProbability);
} }
return dictionaryStructurePolicy->getProbability(unigramProbability, return dictionaryStructurePolicy->getProbability(unigramProbability,
NOT_A_PROBABILITY); NOT_A_PROBABILITY);

View File

@@ -35,34 +35,30 @@ const int MultiBigramMap::BigramMap::DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP =
// Also caches the bigrams if there is space remaining and they have not been cached already. // Also caches the bigrams if there is space remaining and they have not been cached already.
int MultiBigramMap::getBigramProbability( int MultiBigramMap::getBigramProbability(
const DictionaryStructureWithBufferPolicy *const structurePolicy, const DictionaryStructureWithBufferPolicy *const structurePolicy,
const int wordPosition, const int nextWordPosition, const int unigramProbability) { const int *const prevWordsPtNodePos, const int nextWordPosition,
const int unigramProbability) {
if (!prevWordsPtNodePos || prevWordsPtNodePos[0] == NOT_A_DICT_POS) {
return structurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY);
}
std::unordered_map<int, BigramMap>::const_iterator mapPosition = std::unordered_map<int, BigramMap>::const_iterator mapPosition =
mBigramMaps.find(wordPosition); mBigramMaps.find(prevWordsPtNodePos[0]);
if (mapPosition != mBigramMaps.end()) { if (mapPosition != mBigramMaps.end()) {
return mapPosition->second.getBigramProbability(structurePolicy, nextWordPosition, return mapPosition->second.getBigramProbability(structurePolicy, nextWordPosition,
unigramProbability); unigramProbability);
} }
if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) { if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) {
addBigramsForWordPosition(structurePolicy, wordPosition); addBigramsForWordPosition(structurePolicy, prevWordsPtNodePos);
return mBigramMaps[wordPosition].getBigramProbability(structurePolicy, return mBigramMaps[prevWordsPtNodePos[0]].getBigramProbability(structurePolicy,
nextWordPosition, unigramProbability); nextWordPosition, unigramProbability);
} }
return readBigramProbabilityFromBinaryDictionary(structurePolicy, wordPosition, return readBigramProbabilityFromBinaryDictionary(structurePolicy, prevWordsPtNodePos,
nextWordPosition, unigramProbability); nextWordPosition, unigramProbability);
} }
void MultiBigramMap::BigramMap::init( void MultiBigramMap::BigramMap::init(
const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos) { const DictionaryStructureWithBufferPolicy *const structurePolicy,
BinaryDictionaryBigramsIterator bigramsIt = const int *const prevWordsPtNodePos) {
structurePolicy->getBigramsIteratorOfPtNode(nodePos); structurePolicy->iterateNgramEntries(prevWordsPtNodePos, this /* listener */);
while (bigramsIt.hasNext()) {
bigramsIt.next();
if (bigramsIt.getBigramPos() == NOT_A_DICT_POS) {
continue;
}
mBigramMap[bigramsIt.getBigramPos()] = bigramsIt.getProbability();
mBloomFilter.setInFilter(bigramsIt.getBigramPos());
}
} }
int MultiBigramMap::BigramMap::getBigramProbability( int MultiBigramMap::BigramMap::getBigramProbability(
@@ -79,25 +75,33 @@ int MultiBigramMap::BigramMap::getBigramProbability(
return structurePolicy->getProbability(unigramProbability, bigramProbability); return structurePolicy->getProbability(unigramProbability, bigramProbability);
} }
void MultiBigramMap::BigramMap::onVisitEntry(const int ngramProbability,
const int targetPtNodePos) {
if (targetPtNodePos == NOT_A_DICT_POS) {
return;
}
mBigramMap[targetPtNodePos] = ngramProbability;
mBloomFilter.setInFilter(targetPtNodePos);
}
void MultiBigramMap::addBigramsForWordPosition( void MultiBigramMap::addBigramsForWordPosition(
const DictionaryStructureWithBufferPolicy *const structurePolicy, const int position) { const DictionaryStructureWithBufferPolicy *const structurePolicy,
mBigramMaps[position].init(structurePolicy, position); const int *const prevWordsPtNodePos) {
if (prevWordsPtNodePos) {
mBigramMaps[prevWordsPtNodePos[0]].init(structurePolicy, prevWordsPtNodePos);
}
} }
int MultiBigramMap::readBigramProbabilityFromBinaryDictionary( int MultiBigramMap::readBigramProbabilityFromBinaryDictionary(
const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos, const DictionaryStructureWithBufferPolicy *const structurePolicy,
const int nextWordPosition, const int unigramProbability) { const int *const prevWordsPtNodePos, const int nextWordPosition,
int bigramProbability = NOT_A_PROBABILITY; const int unigramProbability) {
BinaryDictionaryBigramsIterator bigramsIt = const int bigramProbability = structurePolicy->getProbabilityOfPtNode(prevWordsPtNodePos,
structurePolicy->getBigramsIteratorOfPtNode(nodePos); nextWordPosition);
while (bigramsIt.hasNext()) { if (bigramProbability != NOT_A_PROBABILITY) {
bigramsIt.next(); return bigramProbability;
if (bigramsIt.getBigramPos() == nextWordPosition) {
bigramProbability = bigramsIt.getProbability();
break;
}
} }
return structurePolicy->getProbability(unigramProbability, bigramProbability); return structurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY);
} }
} // namespace latinime } // namespace latinime

View File

@@ -23,6 +23,7 @@
#include "defines.h" #include "defines.h"
#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" #include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
#include "suggest/core/dictionary/bloom_filter.h" #include "suggest/core/dictionary/bloom_filter.h"
#include "suggest/core/dictionary/ngram_listener.h"
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
namespace latinime { namespace latinime {
@@ -38,7 +39,8 @@ class MultiBigramMap {
// Look up the bigram probability for the given word pair from the cached bigram maps. // Look up the bigram probability for the given word pair from the cached bigram maps.
// Also caches the bigrams if there is space remaining and they have not been cached already. // Also caches the bigrams if there is space remaining and they have not been cached already.
int getBigramProbability(const DictionaryStructureWithBufferPolicy *const structurePolicy, int getBigramProbability(const DictionaryStructureWithBufferPolicy *const structurePolicy,
const int wordPosition, const int nextWordPosition, const int unigramProbability); const int *const prevWordsPtNodePos, const int nextWordPosition,
const int unigramProbability);
void clear() { void clear() {
mBigramMaps.clear(); mBigramMaps.clear();
@@ -47,32 +49,35 @@ class MultiBigramMap {
private: private:
DISALLOW_COPY_AND_ASSIGN(MultiBigramMap); DISALLOW_COPY_AND_ASSIGN(MultiBigramMap);
class BigramMap { class BigramMap : public NgramListener {
public: public:
BigramMap() : mBigramMap(DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP), mBloomFilter() {} BigramMap() : mBigramMap(DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP), mBloomFilter() {}
~BigramMap() {} // Copy constructor needed for std::unordered_map.
BigramMap(const BigramMap &bigramMap)
: mBigramMap(bigramMap.mBigramMap), mBloomFilter(bigramMap.mBloomFilter) {}
virtual ~BigramMap() {}
void init(const DictionaryStructureWithBufferPolicy *const structurePolicy, void init(const DictionaryStructureWithBufferPolicy *const structurePolicy,
const int nodePos); const int *const prevWordsPtNodePos);
int getBigramProbability( int getBigramProbability(
const DictionaryStructureWithBufferPolicy *const structurePolicy, const DictionaryStructureWithBufferPolicy *const structurePolicy,
const int nextWordPosition, const int unigramProbability) const; const int nextWordPosition, const int unigramProbability) const;
virtual void onVisitEntry(const int ngramProbability, const int targetPtNodePos);
private: private:
// NOTE: The BigramMap class doesn't use DISALLOW_COPY_AND_ASSIGN() because its default
// copy constructor is needed for use in hash_map.
static const int DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP; static const int DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP;
std::unordered_map<int, int> mBigramMap; std::unordered_map<int, int> mBigramMap;
BloomFilter mBloomFilter; BloomFilter mBloomFilter;
}; };
void addBigramsForWordPosition( void addBigramsForWordPosition(
const DictionaryStructureWithBufferPolicy *const structurePolicy, const int position); const DictionaryStructureWithBufferPolicy *const structurePolicy,
const int *const prevWordsPtNodePos);
int readBigramProbabilityFromBinaryDictionary( int readBigramProbabilityFromBinaryDictionary(
const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos, const DictionaryStructureWithBufferPolicy *const structurePolicy,
const int nextWordPosition, const int unigramProbability); const int *const prevWordsPtNodePos, const int nextWordPosition,
const int unigramProbability);
static const size_t MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP; static const size_t MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP;
std::unordered_map<int, BigramMap> mBigramMaps; std::unordered_map<int, BigramMap> mBigramMaps;