am 40182c73: Merge "Compute probability using structure policy."
* commit '40182c737de462a37c90ba1f85b4d67095d893f7': Compute probability using structure policy.
main
commit
ae7e7e0e4e
|
@ -21,7 +21,6 @@
|
||||||
#include "suggest/core/dicnode/dic_node.h"
|
#include "suggest/core/dicnode/dic_node.h"
|
||||||
#include "suggest/core/dicnode/dic_node_vector.h"
|
#include "suggest/core/dicnode/dic_node_vector.h"
|
||||||
#include "suggest/core/dictionary/multi_bigram_map.h"
|
#include "suggest/core/dictionary/multi_bigram_map.h"
|
||||||
#include "suggest/core/dictionary/probability_utils.h"
|
|
||||||
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
|
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
|
||||||
#include "utils/char_utils.h"
|
#include "utils/char_utils.h"
|
||||||
|
|
||||||
|
@ -93,13 +92,15 @@ namespace latinime {
|
||||||
if (NOT_A_VALID_WORD_POS == wordPos || NOT_A_VALID_WORD_POS == prevWordPos) {
|
if (NOT_A_VALID_WORD_POS == wordPos || NOT_A_VALID_WORD_POS == prevWordPos) {
|
||||||
// Note: Normally wordPos comes from the dictionary and should never equal
|
// Note: Normally wordPos comes from the dictionary and should never equal
|
||||||
// NOT_A_VALID_WORD_POS.
|
// NOT_A_VALID_WORD_POS.
|
||||||
return ProbabilityUtils::backoff(unigramProbability);
|
return dictionaryStructurePolicy->getProbability(unigramProbability,
|
||||||
|
NOT_A_PROBABILITY);
|
||||||
}
|
}
|
||||||
if (multiBigramMap) {
|
if (multiBigramMap) {
|
||||||
return multiBigramMap->getBigramProbability(dictionaryStructurePolicy, prevWordPos,
|
return multiBigramMap->getBigramProbability(dictionaryStructurePolicy, prevWordPos,
|
||||||
wordPos, unigramProbability);
|
wordPos, unigramProbability);
|
||||||
}
|
}
|
||||||
return ProbabilityUtils::backoff(unigramProbability);
|
return dictionaryStructurePolicy->getProbability(unigramProbability,
|
||||||
|
NOT_A_PROBABILITY);
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////
|
////////////////
|
||||||
|
|
|
@ -116,10 +116,6 @@ class DicNodeStatePrevWord {
|
||||||
return mPrevWordStart;
|
return mPrevWordStart;
|
||||||
}
|
}
|
||||||
|
|
||||||
int16_t getPrevWordProbability() const {
|
|
||||||
return mPrevWordProbability;
|
|
||||||
}
|
|
||||||
|
|
||||||
int getPrevWordNodePos() const {
|
int getPrevWordNodePos() const {
|
||||||
return mPrevWordNodePos;
|
return mPrevWordNodePos;
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,7 +23,6 @@
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
|
#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
|
||||||
#include "suggest/core/dictionary/dictionary.h"
|
#include "suggest/core/dictionary/dictionary.h"
|
||||||
#include "suggest/core/dictionary/probability_utils.h"
|
|
||||||
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
|
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
|
||||||
#include "utils/char_utils.h"
|
#include "utils/char_utils.h"
|
||||||
|
|
||||||
|
@ -131,7 +130,7 @@ int BigramDictionary::getPredictions(const int *prevWord, const int prevWordLeng
|
||||||
// resulting probability is 8 - although in the practice it's never bigger than 3 or 4
|
// resulting probability is 8 - although in the practice it's never bigger than 3 or 4
|
||||||
// in very bad cases. This means that sometimes, we'll see some bigrams interverted
|
// in very bad cases. This means that sometimes, we'll see some bigrams interverted
|
||||||
// here, but it can't get too bad.
|
// here, but it can't get too bad.
|
||||||
const int probability = ProbabilityUtils::computeProbabilityForBigram(
|
const int probability = mDictionaryStructurePolicy->getProbability(
|
||||||
unigramProbability, bigramsIt.getProbability());
|
unigramProbability, bigramsIt.getProbability());
|
||||||
addWordBigram(bigramBuffer, codePointCount, probability, outBigramProbability,
|
addWordBigram(bigramBuffer, codePointCount, probability, outBigramProbability,
|
||||||
outBigramCodePoints, outputTypes);
|
outBigramCodePoints, outputTypes);
|
||||||
|
|
|
@ -90,7 +90,7 @@ int Dictionary::getProbability(const int *word, int length) const {
|
||||||
if (NOT_A_VALID_WORD_POS == pos) {
|
if (NOT_A_VALID_WORD_POS == pos) {
|
||||||
return NOT_A_PROBABILITY;
|
return NOT_A_PROBABILITY;
|
||||||
}
|
}
|
||||||
return getDictionaryStructurePolicy()->getUnigramProbability(pos);
|
return getDictionaryStructurePolicy()->getUnigramProbabilityOfPtNode(pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Dictionary::isValidBigram(const int *word0, int length0, const int *word1, int length1) const {
|
bool Dictionary::isValidBigram(const int *word0, int length0, const int *word1, int length1) const {
|
||||||
|
|
|
@ -22,7 +22,6 @@
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
|
#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
|
||||||
#include "suggest/core/dictionary/bloom_filter.h"
|
#include "suggest/core/dictionary/bloom_filter.h"
|
||||||
#include "suggest/core/dictionary/probability_utils.h"
|
|
||||||
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
|
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
|
||||||
#include "utils/hash_map_compat.h"
|
#include "utils/hash_map_compat.h"
|
||||||
|
|
||||||
|
@ -43,11 +42,12 @@ class MultiBigramMap {
|
||||||
hash_map_compat<int, BigramMap>::const_iterator mapPosition =
|
hash_map_compat<int, BigramMap>::const_iterator mapPosition =
|
||||||
mBigramMaps.find(wordPosition);
|
mBigramMaps.find(wordPosition);
|
||||||
if (mapPosition != mBigramMaps.end()) {
|
if (mapPosition != mBigramMaps.end()) {
|
||||||
return mapPosition->second.getBigramProbability(nextWordPosition, unigramProbability);
|
return mapPosition->second.getBigramProbability(structurePolicy, nextWordPosition,
|
||||||
|
unigramProbability);
|
||||||
}
|
}
|
||||||
if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) {
|
if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) {
|
||||||
addBigramsForWordPosition(structurePolicy, wordPosition);
|
addBigramsForWordPosition(structurePolicy, wordPosition);
|
||||||
return mBigramMaps[wordPosition].getBigramProbability(
|
return mBigramMaps[wordPosition].getBigramProbability(structurePolicy,
|
||||||
nextWordPosition, unigramProbability);
|
nextWordPosition, unigramProbability);
|
||||||
}
|
}
|
||||||
return readBigramProbabilityFromBinaryDictionary(structurePolicy, wordPosition,
|
return readBigramProbabilityFromBinaryDictionary(structurePolicy, wordPosition,
|
||||||
|
@ -82,17 +82,17 @@ class MultiBigramMap {
|
||||||
}
|
}
|
||||||
|
|
||||||
AK_FORCE_INLINE int getBigramProbability(
|
AK_FORCE_INLINE int getBigramProbability(
|
||||||
|
const DictionaryStructureWithBufferPolicy *const structurePolicy,
|
||||||
const int nextWordPosition, const int unigramProbability) const {
|
const int nextWordPosition, const int unigramProbability) const {
|
||||||
|
int bigramProbability = NOT_A_PROBABILITY;
|
||||||
if (mBloomFilter.isInFilter(nextWordPosition)) {
|
if (mBloomFilter.isInFilter(nextWordPosition)) {
|
||||||
const hash_map_compat<int, int>::const_iterator bigramProbabilityIt =
|
const hash_map_compat<int, int>::const_iterator bigramProbabilityIt =
|
||||||
mBigramMap.find(nextWordPosition);
|
mBigramMap.find(nextWordPosition);
|
||||||
if (bigramProbabilityIt != mBigramMap.end()) {
|
if (bigramProbabilityIt != mBigramMap.end()) {
|
||||||
const int bigramProbability = bigramProbabilityIt->second;
|
bigramProbability = bigramProbabilityIt->second;
|
||||||
return ProbabilityUtils::computeProbabilityForBigram(
|
|
||||||
unigramProbability, bigramProbability);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return ProbabilityUtils::backoff(unigramProbability);
|
return structurePolicy->getProbability(unigramProbability, bigramProbability);
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -111,17 +111,18 @@ class MultiBigramMap {
|
||||||
AK_FORCE_INLINE int readBigramProbabilityFromBinaryDictionary(
|
AK_FORCE_INLINE int readBigramProbabilityFromBinaryDictionary(
|
||||||
const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos,
|
const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos,
|
||||||
const int nextWordPosition, const int unigramProbability) {
|
const int nextWordPosition, const int unigramProbability) {
|
||||||
|
int bigramProbability = NOT_A_PROBABILITY;
|
||||||
const int bigramsListPos = structurePolicy->getBigramsPositionOfNode(nodePos);
|
const int bigramsListPos = structurePolicy->getBigramsPositionOfNode(nodePos);
|
||||||
BinaryDictionaryBigramsIterator bigramsIt(structurePolicy->getBigramsStructurePolicy(),
|
BinaryDictionaryBigramsIterator bigramsIt(structurePolicy->getBigramsStructurePolicy(),
|
||||||
bigramsListPos);
|
bigramsListPos);
|
||||||
while (bigramsIt.hasNext()) {
|
while (bigramsIt.hasNext()) {
|
||||||
bigramsIt.next();
|
bigramsIt.next();
|
||||||
if (bigramsIt.getBigramPos() == nextWordPosition) {
|
if (bigramsIt.getBigramPos() == nextWordPosition) {
|
||||||
return ProbabilityUtils::computeProbabilityForBigram(
|
bigramProbability = bigramsIt.getProbability();
|
||||||
unigramProbability, bigramsIt.getProbability());
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return ProbabilityUtils::backoff(unigramProbability);
|
return structurePolicy->getProbability(unigramProbability, bigramProbability);
|
||||||
}
|
}
|
||||||
|
|
||||||
static const size_t MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP;
|
static const size_t MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP;
|
||||||
|
|
|
@ -47,7 +47,10 @@ class DictionaryStructureWithBufferPolicy {
|
||||||
virtual int getTerminalNodePositionOfWord(const int *const inWord,
|
virtual int getTerminalNodePositionOfWord(const int *const inWord,
|
||||||
const int length, const bool forceLowerCaseSearch) const = 0;
|
const int length, const bool forceLowerCaseSearch) const = 0;
|
||||||
|
|
||||||
virtual int getUnigramProbability(const int nodePos) const = 0;
|
virtual int getProbability(const int unigramProbability,
|
||||||
|
const int bigramProbability) const = 0;
|
||||||
|
|
||||||
|
virtual int getUnigramProbabilityOfPtNode(const int nodePos) const = 0;
|
||||||
|
|
||||||
virtual int getShortcutPositionOfNode(const int nodePos) const = 0;
|
virtual int getShortcutPositionOfNode(const int nodePos) const = 0;
|
||||||
|
|
||||||
|
|
|
@ -171,7 +171,9 @@ int Suggest::outputSuggestions(DicTraverseSession *traverseSession, int *frequen
|
||||||
terminalIndex, doubleLetterTerminalIndex, doubleLetterLevel);
|
terminalIndex, doubleLetterTerminalIndex, doubleLetterLevel);
|
||||||
const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight)
|
const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight)
|
||||||
+ doubleLetterCost;
|
+ doubleLetterCost;
|
||||||
const bool isPossiblyOffensiveWord = terminalDicNode->getProbability() <= 0;
|
const bool isPossiblyOffensiveWord =
|
||||||
|
traverseSession->getDictionaryStructurePolicy()->getProbability(
|
||||||
|
terminalDicNode->getProbability(), NOT_A_PROBABILITY) <= 0;
|
||||||
const bool isExactMatch = terminalDicNode->isExactMatch();
|
const bool isExactMatch = terminalDicNode->isExactMatch();
|
||||||
const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase();
|
const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase();
|
||||||
// Heuristic: We exclude freq=0 first-char-uppercase words from exact match.
|
// Heuristic: We exclude freq=0 first-char-uppercase words from exact match.
|
||||||
|
|
|
@ -24,6 +24,7 @@
|
||||||
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h"
|
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h"
|
||||||
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h"
|
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h"
|
||||||
#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
|
#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
|
||||||
|
#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
|
@ -134,7 +135,20 @@ int DynamicPatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const in
|
||||||
return NOT_A_VALID_WORD_POS;
|
return NOT_A_VALID_WORD_POS;
|
||||||
}
|
}
|
||||||
|
|
||||||
int DynamicPatriciaTriePolicy::getUnigramProbability(const int nodePos) const {
|
int DynamicPatriciaTriePolicy::getProbability(const int unigramProbability,
|
||||||
|
const int bigramProbability) const {
|
||||||
|
// TODO: check mHeaderPolicy.usesForgettingCurve();
|
||||||
|
if (unigramProbability == NOT_A_PROBABILITY) {
|
||||||
|
return NOT_A_PROBABILITY;
|
||||||
|
} else if (bigramProbability == NOT_A_PROBABILITY) {
|
||||||
|
return ProbabilityUtils::backoff(unigramProbability);
|
||||||
|
} else {
|
||||||
|
return ProbabilityUtils::computeProbabilityForBigram(unigramProbability,
|
||||||
|
bigramProbability);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int DynamicPatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int nodePos) const {
|
||||||
if (nodePos == NOT_A_VALID_WORD_POS) {
|
if (nodePos == NOT_A_VALID_WORD_POS) {
|
||||||
return NOT_A_PROBABILITY;
|
return NOT_A_PROBABILITY;
|
||||||
}
|
}
|
||||||
|
@ -144,7 +158,7 @@ int DynamicPatriciaTriePolicy::getUnigramProbability(const int nodePos) const {
|
||||||
if (nodeReader.isDeleted() || nodeReader.isBlacklisted() || nodeReader.isNotAWord()) {
|
if (nodeReader.isDeleted() || nodeReader.isBlacklisted() || nodeReader.isNotAWord()) {
|
||||||
return NOT_A_PROBABILITY;
|
return NOT_A_PROBABILITY;
|
||||||
}
|
}
|
||||||
return nodeReader.getProbability();
|
return getProbability(nodeReader.getProbability(), NOT_A_PROBABILITY);
|
||||||
}
|
}
|
||||||
|
|
||||||
int DynamicPatriciaTriePolicy::getShortcutPositionOfNode(const int nodePos) const {
|
int DynamicPatriciaTriePolicy::getShortcutPositionOfNode(const int nodePos) const {
|
||||||
|
|
|
@ -57,7 +57,9 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
int getTerminalNodePositionOfWord(const int *const inWord,
|
int getTerminalNodePositionOfWord(const int *const inWord,
|
||||||
const int length, const bool forceLowerCaseSearch) const;
|
const int length, const bool forceLowerCaseSearch) const;
|
||||||
|
|
||||||
int getUnigramProbability(const int nodePos) const;
|
int getProbability(const int unigramProbability, const int bigramProbability) const;
|
||||||
|
|
||||||
|
int getUnigramProbabilityOfPtNode(const int nodePos) const;
|
||||||
|
|
||||||
int getShortcutPositionOfNode(const int nodePos) const;
|
int getShortcutPositionOfNode(const int nodePos) const;
|
||||||
|
|
||||||
|
|
|
@ -21,6 +21,7 @@
|
||||||
#include "suggest/core/dicnode/dic_node.h"
|
#include "suggest/core/dicnode/dic_node.h"
|
||||||
#include "suggest/core/dicnode/dic_node_vector.h"
|
#include "suggest/core/dicnode/dic_node_vector.h"
|
||||||
#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
|
#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
|
||||||
|
#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
|
@ -306,7 +307,19 @@ int PatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const inWord,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int PatriciaTriePolicy::getUnigramProbability(const int nodePos) const {
|
int PatriciaTriePolicy::getProbability(const int unigramProbability,
|
||||||
|
const int bigramProbability) const {
|
||||||
|
if (unigramProbability == NOT_A_PROBABILITY) {
|
||||||
|
return NOT_A_PROBABILITY;
|
||||||
|
} else if (bigramProbability == NOT_A_PROBABILITY) {
|
||||||
|
return ProbabilityUtils::backoff(unigramProbability);
|
||||||
|
} else {
|
||||||
|
return ProbabilityUtils::computeProbabilityForBigram(unigramProbability,
|
||||||
|
bigramProbability);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int PatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int nodePos) const {
|
||||||
if (nodePos == NOT_A_VALID_WORD_POS) {
|
if (nodePos == NOT_A_VALID_WORD_POS) {
|
||||||
return NOT_A_PROBABILITY;
|
return NOT_A_PROBABILITY;
|
||||||
}
|
}
|
||||||
|
@ -324,7 +337,8 @@ int PatriciaTriePolicy::getUnigramProbability(const int nodePos) const {
|
||||||
return NOT_A_PROBABILITY;
|
return NOT_A_PROBABILITY;
|
||||||
}
|
}
|
||||||
PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos);
|
PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos);
|
||||||
return PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos);
|
return getProbability(PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(
|
||||||
|
mDictRoot, &pos), NOT_A_PROBABILITY);
|
||||||
}
|
}
|
||||||
|
|
||||||
int PatriciaTriePolicy::getShortcutPositionOfNode(const int nodePos) const {
|
int PatriciaTriePolicy::getShortcutPositionOfNode(const int nodePos) const {
|
||||||
|
|
|
@ -56,7 +56,9 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
int getTerminalNodePositionOfWord(const int *const inWord,
|
int getTerminalNodePositionOfWord(const int *const inWord,
|
||||||
const int length, const bool forceLowerCaseSearch) const;
|
const int length, const bool forceLowerCaseSearch) const;
|
||||||
|
|
||||||
int getUnigramProbability(const int nodePos) const;
|
int getProbability(const int unigramProbability, const int bigramProbability) const;
|
||||||
|
|
||||||
|
int getUnigramProbabilityOfPtNode(const int nodePos) const;
|
||||||
|
|
||||||
int getShortcutPositionOfNode(const int nodePos) const;
|
int getShortcutPositionOfNode(const int nodePos) const;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue