Compute probability using structure policy.

Bug: 6669677
Change-Id: Icd50fc30274cce65ebeb2e7cc8368b59e5cda05a
This commit is contained in:
Keisuke Kuroyanagi 2013-09-06 16:50:09 +09:00
parent b690c03927
commit 65d19946be
12 changed files with 62 additions and 28 deletions

View file

@ -21,7 +21,6 @@
#include "suggest/core/dicnode/dic_node.h"
#include "suggest/core/dicnode/dic_node_vector.h"
#include "suggest/core/dictionary/multi_bigram_map.h"
#include "suggest/core/dictionary/probability_utils.h"
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
#include "utils/char_utils.h"
@ -93,13 +92,15 @@ namespace latinime {
if (NOT_A_VALID_WORD_POS == wordPos || NOT_A_VALID_WORD_POS == prevWordPos) {
// Note: Normally wordPos comes from the dictionary and should never equal
// NOT_A_VALID_WORD_POS.
return ProbabilityUtils::backoff(unigramProbability);
return dictionaryStructurePolicy->getProbability(unigramProbability,
NOT_A_PROBABILITY);
}
if (multiBigramMap) {
return multiBigramMap->getBigramProbability(dictionaryStructurePolicy, prevWordPos,
wordPos, unigramProbability);
}
return ProbabilityUtils::backoff(unigramProbability);
return dictionaryStructurePolicy->getProbability(unigramProbability,
NOT_A_PROBABILITY);
}
////////////////

View file

@ -116,10 +116,6 @@ class DicNodeStatePrevWord {
return mPrevWordStart;
}
int16_t getPrevWordProbability() const {
return mPrevWordProbability;
}
int getPrevWordNodePos() const {
return mPrevWordNodePos;
}

View file

@ -23,7 +23,6 @@
#include "defines.h"
#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
#include "suggest/core/dictionary/dictionary.h"
#include "suggest/core/dictionary/probability_utils.h"
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
#include "utils/char_utils.h"
@ -131,7 +130,7 @@ int BigramDictionary::getPredictions(const int *prevWord, const int prevWordLeng
// resulting probability is 8 - although in the practice it's never bigger than 3 or 4
// in very bad cases. This means that sometimes, we'll see some bigrams interverted
// here, but it can't get too bad.
const int probability = ProbabilityUtils::computeProbabilityForBigram(
const int probability = mDictionaryStructurePolicy->getProbability(
unigramProbability, bigramsIt.getProbability());
addWordBigram(bigramBuffer, codePointCount, probability, outBigramProbability,
outBigramCodePoints, outputTypes);

View file

@ -90,7 +90,7 @@ int Dictionary::getProbability(const int *word, int length) const {
if (NOT_A_VALID_WORD_POS == pos) {
return NOT_A_PROBABILITY;
}
return getDictionaryStructurePolicy()->getUnigramProbability(pos);
return getDictionaryStructurePolicy()->getUnigramProbabilityOfPtNode(pos);
}
bool Dictionary::isValidBigram(const int *word0, int length0, const int *word1, int length1) const {

View file

@ -22,7 +22,6 @@
#include "defines.h"
#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
#include "suggest/core/dictionary/bloom_filter.h"
#include "suggest/core/dictionary/probability_utils.h"
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
#include "utils/hash_map_compat.h"
@ -43,11 +42,12 @@ class MultiBigramMap {
hash_map_compat<int, BigramMap>::const_iterator mapPosition =
mBigramMaps.find(wordPosition);
if (mapPosition != mBigramMaps.end()) {
return mapPosition->second.getBigramProbability(nextWordPosition, unigramProbability);
return mapPosition->second.getBigramProbability(structurePolicy, nextWordPosition,
unigramProbability);
}
if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) {
addBigramsForWordPosition(structurePolicy, wordPosition);
return mBigramMaps[wordPosition].getBigramProbability(
return mBigramMaps[wordPosition].getBigramProbability(structurePolicy,
nextWordPosition, unigramProbability);
}
return readBigramProbabilityFromBinaryDictionary(structurePolicy, wordPosition,
@ -82,17 +82,17 @@ class MultiBigramMap {
}
AK_FORCE_INLINE int getBigramProbability(
const DictionaryStructureWithBufferPolicy *const structurePolicy,
const int nextWordPosition, const int unigramProbability) const {
int bigramProbability = NOT_A_PROBABILITY;
if (mBloomFilter.isInFilter(nextWordPosition)) {
const hash_map_compat<int, int>::const_iterator bigramProbabilityIt =
mBigramMap.find(nextWordPosition);
if (bigramProbabilityIt != mBigramMap.end()) {
const int bigramProbability = bigramProbabilityIt->second;
return ProbabilityUtils::computeProbabilityForBigram(
unigramProbability, bigramProbability);
bigramProbability = bigramProbabilityIt->second;
}
}
return ProbabilityUtils::backoff(unigramProbability);
return structurePolicy->getProbability(unigramProbability, bigramProbability);
}
private:
@ -111,17 +111,18 @@ class MultiBigramMap {
AK_FORCE_INLINE int readBigramProbabilityFromBinaryDictionary(
const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos,
const int nextWordPosition, const int unigramProbability) {
int bigramProbability = NOT_A_PROBABILITY;
const int bigramsListPos = structurePolicy->getBigramsPositionOfNode(nodePos);
BinaryDictionaryBigramsIterator bigramsIt(structurePolicy->getBigramsStructurePolicy(),
bigramsListPos);
while (bigramsIt.hasNext()) {
bigramsIt.next();
if (bigramsIt.getBigramPos() == nextWordPosition) {
return ProbabilityUtils::computeProbabilityForBigram(
unigramProbability, bigramsIt.getProbability());
bigramProbability = bigramsIt.getProbability();
break;
}
}
return ProbabilityUtils::backoff(unigramProbability);
return structurePolicy->getProbability(unigramProbability, bigramProbability);
}
static const size_t MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP;

View file

@ -47,7 +47,10 @@ class DictionaryStructureWithBufferPolicy {
virtual int getTerminalNodePositionOfWord(const int *const inWord,
const int length, const bool forceLowerCaseSearch) const = 0;
virtual int getUnigramProbability(const int nodePos) const = 0;
virtual int getProbability(const int unigramProbability,
const int bigramProbability) const = 0;
virtual int getUnigramProbabilityOfPtNode(const int nodePos) const = 0;
virtual int getShortcutPositionOfNode(const int nodePos) const = 0;

View file

@ -171,7 +171,9 @@ int Suggest::outputSuggestions(DicTraverseSession *traverseSession, int *frequen
terminalIndex, doubleLetterTerminalIndex, doubleLetterLevel);
const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight)
+ doubleLetterCost;
const bool isPossiblyOffensiveWord = terminalDicNode->getProbability() <= 0;
const bool isPossiblyOffensiveWord =
traverseSession->getDictionaryStructurePolicy()->getProbability(
terminalDicNode->getProbability(), NOT_A_PROBABILITY) <= 0;
const bool isExactMatch = terminalDicNode->isExactMatch();
const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase();
// Heuristic: We exclude freq=0 first-char-uppercase words from exact match.

View file

@ -24,6 +24,7 @@
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h"
#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
namespace latinime {
@ -134,7 +135,20 @@ int DynamicPatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const in
return NOT_A_VALID_WORD_POS;
}
int DynamicPatriciaTriePolicy::getUnigramProbability(const int nodePos) const {
int DynamicPatriciaTriePolicy::getProbability(const int unigramProbability,
const int bigramProbability) const {
// TODO: check mHeaderPolicy.usesForgettingCurve();
if (unigramProbability == NOT_A_PROBABILITY) {
return NOT_A_PROBABILITY;
} else if (bigramProbability == NOT_A_PROBABILITY) {
return ProbabilityUtils::backoff(unigramProbability);
} else {
return ProbabilityUtils::computeProbabilityForBigram(unigramProbability,
bigramProbability);
}
}
int DynamicPatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int nodePos) const {
if (nodePos == NOT_A_VALID_WORD_POS) {
return NOT_A_PROBABILITY;
}
@ -144,7 +158,7 @@ int DynamicPatriciaTriePolicy::getUnigramProbability(const int nodePos) const {
if (nodeReader.isDeleted() || nodeReader.isBlacklisted() || nodeReader.isNotAWord()) {
return NOT_A_PROBABILITY;
}
return nodeReader.getProbability();
return getProbability(nodeReader.getProbability(), NOT_A_PROBABILITY);
}
int DynamicPatriciaTriePolicy::getShortcutPositionOfNode(const int nodePos) const {

View file

@ -57,7 +57,9 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
int getTerminalNodePositionOfWord(const int *const inWord,
const int length, const bool forceLowerCaseSearch) const;
int getUnigramProbability(const int nodePos) const;
int getProbability(const int unigramProbability, const int bigramProbability) const;
int getUnigramProbabilityOfPtNode(const int nodePos) const;
int getShortcutPositionOfNode(const int nodePos) const;

View file

@ -21,6 +21,7 @@
#include "suggest/core/dicnode/dic_node.h"
#include "suggest/core/dicnode/dic_node_vector.h"
#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
namespace latinime {
@ -306,7 +307,19 @@ int PatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const inWord,
}
}
int PatriciaTriePolicy::getUnigramProbability(const int nodePos) const {
int PatriciaTriePolicy::getProbability(const int unigramProbability,
const int bigramProbability) const {
if (unigramProbability == NOT_A_PROBABILITY) {
return NOT_A_PROBABILITY;
} else if (bigramProbability == NOT_A_PROBABILITY) {
return ProbabilityUtils::backoff(unigramProbability);
} else {
return ProbabilityUtils::computeProbabilityForBigram(unigramProbability,
bigramProbability);
}
}
int PatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int nodePos) const {
if (nodePos == NOT_A_VALID_WORD_POS) {
return NOT_A_PROBABILITY;
}
@ -324,7 +337,8 @@ int PatriciaTriePolicy::getUnigramProbability(const int nodePos) const {
return NOT_A_PROBABILITY;
}
PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos);
return PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos);
return getProbability(PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(
mDictRoot, &pos), NOT_A_PROBABILITY);
}
int PatriciaTriePolicy::getShortcutPositionOfNode(const int nodePos) const {

View file

@ -56,7 +56,9 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
int getTerminalNodePositionOfWord(const int *const inWord,
const int length, const bool forceLowerCaseSearch) const;
int getUnigramProbability(const int nodePos) const;
int getProbability(const int unigramProbability, const int bigramProbability) const;
int getUnigramProbabilityOfPtNode(const int nodePos) const;
int getShortcutPositionOfNode(const int nodePos) const;