am 40182c73
: Merge "Compute probability using structure policy."
* commit '40182c737de462a37c90ba1f85b4d67095d893f7': Compute probability using structure policy.
This commit is contained in:
commit
ae7e7e0e4e
12 changed files with 62 additions and 28 deletions
|
@ -21,7 +21,6 @@
|
|||
#include "suggest/core/dicnode/dic_node.h"
|
||||
#include "suggest/core/dicnode/dic_node_vector.h"
|
||||
#include "suggest/core/dictionary/multi_bigram_map.h"
|
||||
#include "suggest/core/dictionary/probability_utils.h"
|
||||
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
|
||||
#include "utils/char_utils.h"
|
||||
|
||||
|
@ -93,13 +92,15 @@ namespace latinime {
|
|||
if (NOT_A_VALID_WORD_POS == wordPos || NOT_A_VALID_WORD_POS == prevWordPos) {
|
||||
// Note: Normally wordPos comes from the dictionary and should never equal
|
||||
// NOT_A_VALID_WORD_POS.
|
||||
return ProbabilityUtils::backoff(unigramProbability);
|
||||
return dictionaryStructurePolicy->getProbability(unigramProbability,
|
||||
NOT_A_PROBABILITY);
|
||||
}
|
||||
if (multiBigramMap) {
|
||||
return multiBigramMap->getBigramProbability(dictionaryStructurePolicy, prevWordPos,
|
||||
wordPos, unigramProbability);
|
||||
}
|
||||
return ProbabilityUtils::backoff(unigramProbability);
|
||||
return dictionaryStructurePolicy->getProbability(unigramProbability,
|
||||
NOT_A_PROBABILITY);
|
||||
}
|
||||
|
||||
////////////////
|
||||
|
|
|
@ -116,10 +116,6 @@ class DicNodeStatePrevWord {
|
|||
return mPrevWordStart;
|
||||
}
|
||||
|
||||
int16_t getPrevWordProbability() const {
|
||||
return mPrevWordProbability;
|
||||
}
|
||||
|
||||
int getPrevWordNodePos() const {
|
||||
return mPrevWordNodePos;
|
||||
}
|
||||
|
|
|
@ -23,7 +23,6 @@
|
|||
#include "defines.h"
|
||||
#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
|
||||
#include "suggest/core/dictionary/dictionary.h"
|
||||
#include "suggest/core/dictionary/probability_utils.h"
|
||||
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
|
||||
#include "utils/char_utils.h"
|
||||
|
||||
|
@ -131,7 +130,7 @@ int BigramDictionary::getPredictions(const int *prevWord, const int prevWordLeng
|
|||
// resulting probability is 8 - although in the practice it's never bigger than 3 or 4
|
||||
// in very bad cases. This means that sometimes, we'll see some bigrams interverted
|
||||
// here, but it can't get too bad.
|
||||
const int probability = ProbabilityUtils::computeProbabilityForBigram(
|
||||
const int probability = mDictionaryStructurePolicy->getProbability(
|
||||
unigramProbability, bigramsIt.getProbability());
|
||||
addWordBigram(bigramBuffer, codePointCount, probability, outBigramProbability,
|
||||
outBigramCodePoints, outputTypes);
|
||||
|
|
|
@ -90,7 +90,7 @@ int Dictionary::getProbability(const int *word, int length) const {
|
|||
if (NOT_A_VALID_WORD_POS == pos) {
|
||||
return NOT_A_PROBABILITY;
|
||||
}
|
||||
return getDictionaryStructurePolicy()->getUnigramProbability(pos);
|
||||
return getDictionaryStructurePolicy()->getUnigramProbabilityOfPtNode(pos);
|
||||
}
|
||||
|
||||
bool Dictionary::isValidBigram(const int *word0, int length0, const int *word1, int length1) const {
|
||||
|
|
|
@ -22,7 +22,6 @@
|
|||
#include "defines.h"
|
||||
#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
|
||||
#include "suggest/core/dictionary/bloom_filter.h"
|
||||
#include "suggest/core/dictionary/probability_utils.h"
|
||||
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
|
||||
#include "utils/hash_map_compat.h"
|
||||
|
||||
|
@ -43,11 +42,12 @@ class MultiBigramMap {
|
|||
hash_map_compat<int, BigramMap>::const_iterator mapPosition =
|
||||
mBigramMaps.find(wordPosition);
|
||||
if (mapPosition != mBigramMaps.end()) {
|
||||
return mapPosition->second.getBigramProbability(nextWordPosition, unigramProbability);
|
||||
return mapPosition->second.getBigramProbability(structurePolicy, nextWordPosition,
|
||||
unigramProbability);
|
||||
}
|
||||
if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) {
|
||||
addBigramsForWordPosition(structurePolicy, wordPosition);
|
||||
return mBigramMaps[wordPosition].getBigramProbability(
|
||||
return mBigramMaps[wordPosition].getBigramProbability(structurePolicy,
|
||||
nextWordPosition, unigramProbability);
|
||||
}
|
||||
return readBigramProbabilityFromBinaryDictionary(structurePolicy, wordPosition,
|
||||
|
@ -82,17 +82,17 @@ class MultiBigramMap {
|
|||
}
|
||||
|
||||
AK_FORCE_INLINE int getBigramProbability(
|
||||
const DictionaryStructureWithBufferPolicy *const structurePolicy,
|
||||
const int nextWordPosition, const int unigramProbability) const {
|
||||
int bigramProbability = NOT_A_PROBABILITY;
|
||||
if (mBloomFilter.isInFilter(nextWordPosition)) {
|
||||
const hash_map_compat<int, int>::const_iterator bigramProbabilityIt =
|
||||
mBigramMap.find(nextWordPosition);
|
||||
if (bigramProbabilityIt != mBigramMap.end()) {
|
||||
const int bigramProbability = bigramProbabilityIt->second;
|
||||
return ProbabilityUtils::computeProbabilityForBigram(
|
||||
unigramProbability, bigramProbability);
|
||||
bigramProbability = bigramProbabilityIt->second;
|
||||
}
|
||||
}
|
||||
return ProbabilityUtils::backoff(unigramProbability);
|
||||
return structurePolicy->getProbability(unigramProbability, bigramProbability);
|
||||
}
|
||||
|
||||
private:
|
||||
|
@ -111,17 +111,18 @@ class MultiBigramMap {
|
|||
AK_FORCE_INLINE int readBigramProbabilityFromBinaryDictionary(
|
||||
const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos,
|
||||
const int nextWordPosition, const int unigramProbability) {
|
||||
int bigramProbability = NOT_A_PROBABILITY;
|
||||
const int bigramsListPos = structurePolicy->getBigramsPositionOfNode(nodePos);
|
||||
BinaryDictionaryBigramsIterator bigramsIt(structurePolicy->getBigramsStructurePolicy(),
|
||||
bigramsListPos);
|
||||
while (bigramsIt.hasNext()) {
|
||||
bigramsIt.next();
|
||||
if (bigramsIt.getBigramPos() == nextWordPosition) {
|
||||
return ProbabilityUtils::computeProbabilityForBigram(
|
||||
unigramProbability, bigramsIt.getProbability());
|
||||
bigramProbability = bigramsIt.getProbability();
|
||||
break;
|
||||
}
|
||||
}
|
||||
return ProbabilityUtils::backoff(unigramProbability);
|
||||
return structurePolicy->getProbability(unigramProbability, bigramProbability);
|
||||
}
|
||||
|
||||
static const size_t MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP;
|
||||
|
|
|
@ -47,7 +47,10 @@ class DictionaryStructureWithBufferPolicy {
|
|||
virtual int getTerminalNodePositionOfWord(const int *const inWord,
|
||||
const int length, const bool forceLowerCaseSearch) const = 0;
|
||||
|
||||
virtual int getUnigramProbability(const int nodePos) const = 0;
|
||||
virtual int getProbability(const int unigramProbability,
|
||||
const int bigramProbability) const = 0;
|
||||
|
||||
virtual int getUnigramProbabilityOfPtNode(const int nodePos) const = 0;
|
||||
|
||||
virtual int getShortcutPositionOfNode(const int nodePos) const = 0;
|
||||
|
||||
|
|
|
@ -171,7 +171,9 @@ int Suggest::outputSuggestions(DicTraverseSession *traverseSession, int *frequen
|
|||
terminalIndex, doubleLetterTerminalIndex, doubleLetterLevel);
|
||||
const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight)
|
||||
+ doubleLetterCost;
|
||||
const bool isPossiblyOffensiveWord = terminalDicNode->getProbability() <= 0;
|
||||
const bool isPossiblyOffensiveWord =
|
||||
traverseSession->getDictionaryStructurePolicy()->getProbability(
|
||||
terminalDicNode->getProbability(), NOT_A_PROBABILITY) <= 0;
|
||||
const bool isExactMatch = terminalDicNode->isExactMatch();
|
||||
const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase();
|
||||
// Heuristic: We exclude freq=0 first-char-uppercase words from exact match.
|
||||
|
|
|
@ -24,6 +24,7 @@
|
|||
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h"
|
||||
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h"
|
||||
#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
|
||||
#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
|
||||
|
||||
namespace latinime {
|
||||
|
||||
|
@ -134,7 +135,20 @@ int DynamicPatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const in
|
|||
return NOT_A_VALID_WORD_POS;
|
||||
}
|
||||
|
||||
int DynamicPatriciaTriePolicy::getUnigramProbability(const int nodePos) const {
|
||||
int DynamicPatriciaTriePolicy::getProbability(const int unigramProbability,
|
||||
const int bigramProbability) const {
|
||||
// TODO: check mHeaderPolicy.usesForgettingCurve();
|
||||
if (unigramProbability == NOT_A_PROBABILITY) {
|
||||
return NOT_A_PROBABILITY;
|
||||
} else if (bigramProbability == NOT_A_PROBABILITY) {
|
||||
return ProbabilityUtils::backoff(unigramProbability);
|
||||
} else {
|
||||
return ProbabilityUtils::computeProbabilityForBigram(unigramProbability,
|
||||
bigramProbability);
|
||||
}
|
||||
}
|
||||
|
||||
int DynamicPatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int nodePos) const {
|
||||
if (nodePos == NOT_A_VALID_WORD_POS) {
|
||||
return NOT_A_PROBABILITY;
|
||||
}
|
||||
|
@ -144,7 +158,7 @@ int DynamicPatriciaTriePolicy::getUnigramProbability(const int nodePos) const {
|
|||
if (nodeReader.isDeleted() || nodeReader.isBlacklisted() || nodeReader.isNotAWord()) {
|
||||
return NOT_A_PROBABILITY;
|
||||
}
|
||||
return nodeReader.getProbability();
|
||||
return getProbability(nodeReader.getProbability(), NOT_A_PROBABILITY);
|
||||
}
|
||||
|
||||
int DynamicPatriciaTriePolicy::getShortcutPositionOfNode(const int nodePos) const {
|
||||
|
|
|
@ -57,7 +57,9 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
|||
int getTerminalNodePositionOfWord(const int *const inWord,
|
||||
const int length, const bool forceLowerCaseSearch) const;
|
||||
|
||||
int getUnigramProbability(const int nodePos) const;
|
||||
int getProbability(const int unigramProbability, const int bigramProbability) const;
|
||||
|
||||
int getUnigramProbabilityOfPtNode(const int nodePos) const;
|
||||
|
||||
int getShortcutPositionOfNode(const int nodePos) const;
|
||||
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
#include "suggest/core/dicnode/dic_node.h"
|
||||
#include "suggest/core/dicnode/dic_node_vector.h"
|
||||
#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
|
||||
#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
|
||||
|
||||
namespace latinime {
|
||||
|
||||
|
@ -306,7 +307,19 @@ int PatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const inWord,
|
|||
}
|
||||
}
|
||||
|
||||
int PatriciaTriePolicy::getUnigramProbability(const int nodePos) const {
|
||||
int PatriciaTriePolicy::getProbability(const int unigramProbability,
|
||||
const int bigramProbability) const {
|
||||
if (unigramProbability == NOT_A_PROBABILITY) {
|
||||
return NOT_A_PROBABILITY;
|
||||
} else if (bigramProbability == NOT_A_PROBABILITY) {
|
||||
return ProbabilityUtils::backoff(unigramProbability);
|
||||
} else {
|
||||
return ProbabilityUtils::computeProbabilityForBigram(unigramProbability,
|
||||
bigramProbability);
|
||||
}
|
||||
}
|
||||
|
||||
int PatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int nodePos) const {
|
||||
if (nodePos == NOT_A_VALID_WORD_POS) {
|
||||
return NOT_A_PROBABILITY;
|
||||
}
|
||||
|
@ -324,7 +337,8 @@ int PatriciaTriePolicy::getUnigramProbability(const int nodePos) const {
|
|||
return NOT_A_PROBABILITY;
|
||||
}
|
||||
PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos);
|
||||
return PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos);
|
||||
return getProbability(PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(
|
||||
mDictRoot, &pos), NOT_A_PROBABILITY);
|
||||
}
|
||||
|
||||
int PatriciaTriePolicy::getShortcutPositionOfNode(const int nodePos) const {
|
||||
|
|
|
@ -56,7 +56,9 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
|||
int getTerminalNodePositionOfWord(const int *const inWord,
|
||||
const int length, const bool forceLowerCaseSearch) const;
|
||||
|
||||
int getUnigramProbability(const int nodePos) const;
|
||||
int getProbability(const int unigramProbability, const int bigramProbability) const;
|
||||
|
||||
int getUnigramProbabilityOfPtNode(const int nodePos) const;
|
||||
|
||||
int getShortcutPositionOfNode(const int nodePos) const;
|
||||
|
||||
|
|
Loading…
Reference in a new issue