am 40182c73: Merge "Compute probability using structure policy."

* commit '40182c737de462a37c90ba1f85b4d67095d893f7':
  Compute probability using structure policy.
This commit is contained in:
Keisuke Kuroyanagi 2013-09-06 01:35:14 -07:00 committed by Android Git Automerger
commit ae7e7e0e4e
12 changed files with 62 additions and 28 deletions

View file

@ -21,7 +21,6 @@
#include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node.h"
#include "suggest/core/dicnode/dic_node_vector.h" #include "suggest/core/dicnode/dic_node_vector.h"
#include "suggest/core/dictionary/multi_bigram_map.h" #include "suggest/core/dictionary/multi_bigram_map.h"
#include "suggest/core/dictionary/probability_utils.h"
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
#include "utils/char_utils.h" #include "utils/char_utils.h"
@ -93,13 +92,15 @@ namespace latinime {
if (NOT_A_VALID_WORD_POS == wordPos || NOT_A_VALID_WORD_POS == prevWordPos) { if (NOT_A_VALID_WORD_POS == wordPos || NOT_A_VALID_WORD_POS == prevWordPos) {
// Note: Normally wordPos comes from the dictionary and should never equal // Note: Normally wordPos comes from the dictionary and should never equal
// NOT_A_VALID_WORD_POS. // NOT_A_VALID_WORD_POS.
return ProbabilityUtils::backoff(unigramProbability); return dictionaryStructurePolicy->getProbability(unigramProbability,
NOT_A_PROBABILITY);
} }
if (multiBigramMap) { if (multiBigramMap) {
return multiBigramMap->getBigramProbability(dictionaryStructurePolicy, prevWordPos, return multiBigramMap->getBigramProbability(dictionaryStructurePolicy, prevWordPos,
wordPos, unigramProbability); wordPos, unigramProbability);
} }
return ProbabilityUtils::backoff(unigramProbability); return dictionaryStructurePolicy->getProbability(unigramProbability,
NOT_A_PROBABILITY);
} }
//////////////// ////////////////

View file

@ -116,10 +116,6 @@ class DicNodeStatePrevWord {
return mPrevWordStart; return mPrevWordStart;
} }
int16_t getPrevWordProbability() const {
return mPrevWordProbability;
}
int getPrevWordNodePos() const { int getPrevWordNodePos() const {
return mPrevWordNodePos; return mPrevWordNodePos;
} }

View file

@ -23,7 +23,6 @@
#include "defines.h" #include "defines.h"
#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" #include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
#include "suggest/core/dictionary/dictionary.h" #include "suggest/core/dictionary/dictionary.h"
#include "suggest/core/dictionary/probability_utils.h"
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
#include "utils/char_utils.h" #include "utils/char_utils.h"
@ -131,7 +130,7 @@ int BigramDictionary::getPredictions(const int *prevWord, const int prevWordLeng
// resulting probability is 8 - although in practice it's never bigger than 3 or 4 // resulting probability is 8 - although in practice it's never bigger than 3 or 4
// in very bad cases. This means that sometimes, we'll see some bigrams swapped // in very bad cases. This means that sometimes, we'll see some bigrams swapped
// here, but it can't get too bad. // here, but it can't get too bad.
const int probability = ProbabilityUtils::computeProbabilityForBigram( const int probability = mDictionaryStructurePolicy->getProbability(
unigramProbability, bigramsIt.getProbability()); unigramProbability, bigramsIt.getProbability());
addWordBigram(bigramBuffer, codePointCount, probability, outBigramProbability, addWordBigram(bigramBuffer, codePointCount, probability, outBigramProbability,
outBigramCodePoints, outputTypes); outBigramCodePoints, outputTypes);

View file

@ -90,7 +90,7 @@ int Dictionary::getProbability(const int *word, int length) const {
if (NOT_A_VALID_WORD_POS == pos) { if (NOT_A_VALID_WORD_POS == pos) {
return NOT_A_PROBABILITY; return NOT_A_PROBABILITY;
} }
return getDictionaryStructurePolicy()->getUnigramProbability(pos); return getDictionaryStructurePolicy()->getUnigramProbabilityOfPtNode(pos);
} }
bool Dictionary::isValidBigram(const int *word0, int length0, const int *word1, int length1) const { bool Dictionary::isValidBigram(const int *word0, int length0, const int *word1, int length1) const {

View file

@ -22,7 +22,6 @@
#include "defines.h" #include "defines.h"
#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" #include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
#include "suggest/core/dictionary/bloom_filter.h" #include "suggest/core/dictionary/bloom_filter.h"
#include "suggest/core/dictionary/probability_utils.h"
#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
#include "utils/hash_map_compat.h" #include "utils/hash_map_compat.h"
@ -43,11 +42,12 @@ class MultiBigramMap {
hash_map_compat<int, BigramMap>::const_iterator mapPosition = hash_map_compat<int, BigramMap>::const_iterator mapPosition =
mBigramMaps.find(wordPosition); mBigramMaps.find(wordPosition);
if (mapPosition != mBigramMaps.end()) { if (mapPosition != mBigramMaps.end()) {
return mapPosition->second.getBigramProbability(nextWordPosition, unigramProbability); return mapPosition->second.getBigramProbability(structurePolicy, nextWordPosition,
unigramProbability);
} }
if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) { if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) {
addBigramsForWordPosition(structurePolicy, wordPosition); addBigramsForWordPosition(structurePolicy, wordPosition);
return mBigramMaps[wordPosition].getBigramProbability( return mBigramMaps[wordPosition].getBigramProbability(structurePolicy,
nextWordPosition, unigramProbability); nextWordPosition, unigramProbability);
} }
return readBigramProbabilityFromBinaryDictionary(structurePolicy, wordPosition, return readBigramProbabilityFromBinaryDictionary(structurePolicy, wordPosition,
@ -82,17 +82,17 @@ class MultiBigramMap {
} }
AK_FORCE_INLINE int getBigramProbability( AK_FORCE_INLINE int getBigramProbability(
const DictionaryStructureWithBufferPolicy *const structurePolicy,
const int nextWordPosition, const int unigramProbability) const { const int nextWordPosition, const int unigramProbability) const {
int bigramProbability = NOT_A_PROBABILITY;
if (mBloomFilter.isInFilter(nextWordPosition)) { if (mBloomFilter.isInFilter(nextWordPosition)) {
const hash_map_compat<int, int>::const_iterator bigramProbabilityIt = const hash_map_compat<int, int>::const_iterator bigramProbabilityIt =
mBigramMap.find(nextWordPosition); mBigramMap.find(nextWordPosition);
if (bigramProbabilityIt != mBigramMap.end()) { if (bigramProbabilityIt != mBigramMap.end()) {
const int bigramProbability = bigramProbabilityIt->second; bigramProbability = bigramProbabilityIt->second;
return ProbabilityUtils::computeProbabilityForBigram(
unigramProbability, bigramProbability);
} }
} }
return ProbabilityUtils::backoff(unigramProbability); return structurePolicy->getProbability(unigramProbability, bigramProbability);
} }
private: private:
@ -111,17 +111,18 @@ class MultiBigramMap {
AK_FORCE_INLINE int readBigramProbabilityFromBinaryDictionary( AK_FORCE_INLINE int readBigramProbabilityFromBinaryDictionary(
const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos, const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos,
const int nextWordPosition, const int unigramProbability) { const int nextWordPosition, const int unigramProbability) {
int bigramProbability = NOT_A_PROBABILITY;
const int bigramsListPos = structurePolicy->getBigramsPositionOfNode(nodePos); const int bigramsListPos = structurePolicy->getBigramsPositionOfNode(nodePos);
BinaryDictionaryBigramsIterator bigramsIt(structurePolicy->getBigramsStructurePolicy(), BinaryDictionaryBigramsIterator bigramsIt(structurePolicy->getBigramsStructurePolicy(),
bigramsListPos); bigramsListPos);
while (bigramsIt.hasNext()) { while (bigramsIt.hasNext()) {
bigramsIt.next(); bigramsIt.next();
if (bigramsIt.getBigramPos() == nextWordPosition) { if (bigramsIt.getBigramPos() == nextWordPosition) {
return ProbabilityUtils::computeProbabilityForBigram( bigramProbability = bigramsIt.getProbability();
unigramProbability, bigramsIt.getProbability()); break;
} }
} }
return ProbabilityUtils::backoff(unigramProbability); return structurePolicy->getProbability(unigramProbability, bigramProbability);
} }
static const size_t MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP; static const size_t MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP;

View file

@ -47,7 +47,10 @@ class DictionaryStructureWithBufferPolicy {
virtual int getTerminalNodePositionOfWord(const int *const inWord, virtual int getTerminalNodePositionOfWord(const int *const inWord,
const int length, const bool forceLowerCaseSearch) const = 0; const int length, const bool forceLowerCaseSearch) const = 0;
virtual int getUnigramProbability(const int nodePos) const = 0; virtual int getProbability(const int unigramProbability,
const int bigramProbability) const = 0;
virtual int getUnigramProbabilityOfPtNode(const int nodePos) const = 0;
virtual int getShortcutPositionOfNode(const int nodePos) const = 0; virtual int getShortcutPositionOfNode(const int nodePos) const = 0;

View file

@ -171,7 +171,9 @@ int Suggest::outputSuggestions(DicTraverseSession *traverseSession, int *frequen
terminalIndex, doubleLetterTerminalIndex, doubleLetterLevel); terminalIndex, doubleLetterTerminalIndex, doubleLetterLevel);
const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight) const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight)
+ doubleLetterCost; + doubleLetterCost;
const bool isPossiblyOffensiveWord = terminalDicNode->getProbability() <= 0; const bool isPossiblyOffensiveWord =
traverseSession->getDictionaryStructurePolicy()->getProbability(
terminalDicNode->getProbability(), NOT_A_PROBABILITY) <= 0;
const bool isExactMatch = terminalDicNode->isExactMatch(); const bool isExactMatch = terminalDicNode->isExactMatch();
const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase(); const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase();
// Heuristic: We exclude freq=0 first-char-uppercase words from exact match. // Heuristic: We exclude freq=0 first-char-uppercase words from exact match.

View file

@ -24,6 +24,7 @@
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h"
#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
namespace latinime { namespace latinime {
@ -134,7 +135,20 @@ int DynamicPatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const in
return NOT_A_VALID_WORD_POS; return NOT_A_VALID_WORD_POS;
} }
int DynamicPatriciaTriePolicy::getUnigramProbability(const int nodePos) const { int DynamicPatriciaTriePolicy::getProbability(const int unigramProbability,
const int bigramProbability) const {
// TODO: check mHeaderPolicy.usesForgettingCurve();
if (unigramProbability == NOT_A_PROBABILITY) {
return NOT_A_PROBABILITY;
} else if (bigramProbability == NOT_A_PROBABILITY) {
return ProbabilityUtils::backoff(unigramProbability);
} else {
return ProbabilityUtils::computeProbabilityForBigram(unigramProbability,
bigramProbability);
}
}
int DynamicPatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int nodePos) const {
if (nodePos == NOT_A_VALID_WORD_POS) { if (nodePos == NOT_A_VALID_WORD_POS) {
return NOT_A_PROBABILITY; return NOT_A_PROBABILITY;
} }
@ -144,7 +158,7 @@ int DynamicPatriciaTriePolicy::getUnigramProbability(const int nodePos) const {
if (nodeReader.isDeleted() || nodeReader.isBlacklisted() || nodeReader.isNotAWord()) { if (nodeReader.isDeleted() || nodeReader.isBlacklisted() || nodeReader.isNotAWord()) {
return NOT_A_PROBABILITY; return NOT_A_PROBABILITY;
} }
return nodeReader.getProbability(); return getProbability(nodeReader.getProbability(), NOT_A_PROBABILITY);
} }
int DynamicPatriciaTriePolicy::getShortcutPositionOfNode(const int nodePos) const { int DynamicPatriciaTriePolicy::getShortcutPositionOfNode(const int nodePos) const {

View file

@ -57,7 +57,9 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
int getTerminalNodePositionOfWord(const int *const inWord, int getTerminalNodePositionOfWord(const int *const inWord,
const int length, const bool forceLowerCaseSearch) const; const int length, const bool forceLowerCaseSearch) const;
int getUnigramProbability(const int nodePos) const; int getProbability(const int unigramProbability, const int bigramProbability) const;
int getUnigramProbabilityOfPtNode(const int nodePos) const;
int getShortcutPositionOfNode(const int nodePos) const; int getShortcutPositionOfNode(const int nodePos) const;

View file

@ -21,6 +21,7 @@
#include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node.h"
#include "suggest/core/dicnode/dic_node_vector.h" #include "suggest/core/dicnode/dic_node_vector.h"
#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
namespace latinime { namespace latinime {
@ -306,7 +307,19 @@ int PatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const inWord,
} }
} }
int PatriciaTriePolicy::getUnigramProbability(const int nodePos) const { int PatriciaTriePolicy::getProbability(const int unigramProbability,
const int bigramProbability) const {
if (unigramProbability == NOT_A_PROBABILITY) {
return NOT_A_PROBABILITY;
} else if (bigramProbability == NOT_A_PROBABILITY) {
return ProbabilityUtils::backoff(unigramProbability);
} else {
return ProbabilityUtils::computeProbabilityForBigram(unigramProbability,
bigramProbability);
}
}
int PatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int nodePos) const {
if (nodePos == NOT_A_VALID_WORD_POS) { if (nodePos == NOT_A_VALID_WORD_POS) {
return NOT_A_PROBABILITY; return NOT_A_PROBABILITY;
} }
@ -324,7 +337,8 @@ int PatriciaTriePolicy::getUnigramProbability(const int nodePos) const {
return NOT_A_PROBABILITY; return NOT_A_PROBABILITY;
} }
PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos); PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos);
return PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); return getProbability(PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(
mDictRoot, &pos), NOT_A_PROBABILITY);
} }
int PatriciaTriePolicy::getShortcutPositionOfNode(const int nodePos) const { int PatriciaTriePolicy::getShortcutPositionOfNode(const int nodePos) const {

View file

@ -56,7 +56,9 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
int getTerminalNodePositionOfWord(const int *const inWord, int getTerminalNodePositionOfWord(const int *const inWord,
const int length, const bool forceLowerCaseSearch) const; const int length, const bool forceLowerCaseSearch) const;
int getUnigramProbability(const int nodePos) const; int getProbability(const int unigramProbability, const int bigramProbability) const;
int getUnigramProbabilityOfPtNode(const int nodePos) const;
int getShortcutPositionOfNode(const int nodePos) const; int getShortcutPositionOfNode(const int nodePos) const;