Fill in the bloom filter for bigram lookup.
Bug: 6313806 Change-Id: Ib79e14f6f8b241f053da6069c15f19c71084317e
This commit is contained in:
parent
165725aba8
commit
f1634c872c
4 changed files with 34 additions and 6 deletions
|
@ -153,8 +153,14 @@ int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord,
|
||||||
return pos;
|
return pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
void BigramDictionary::fillBigramAddressToFrequencyMap(const int32_t *prevWord,
|
static inline void setInFilter(uint8_t *filter, const int position) {
|
||||||
const int prevWordLength, std::map<int, int> *map) {
|
const unsigned int bucket = position % BIGRAM_FILTER_MODULO;
|
||||||
|
filter[bucket >> 3] |= (1 << (bucket & 0x7));
|
||||||
|
}
|
||||||
|
|
||||||
|
void BigramDictionary::fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord,
|
||||||
|
const int prevWordLength, std::map<int, int> *map, uint8_t *filter) {
|
||||||
|
memset(filter, 0, BIGRAM_FILTER_BYTE_SIZE);
|
||||||
const uint8_t* const root = DICT;
|
const uint8_t* const root = DICT;
|
||||||
int pos = getBigramListPositionForWord(prevWord, prevWordLength);
|
int pos = getBigramListPositionForWord(prevWord, prevWordLength);
|
||||||
if (0 == pos) return;
|
if (0 == pos) return;
|
||||||
|
@ -166,6 +172,7 @@ void BigramDictionary::fillBigramAddressToFrequencyMap(const int32_t *prevWord,
|
||||||
const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
|
const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
|
||||||
&pos);
|
&pos);
|
||||||
(*map)[bigramPos] = frequency;
|
(*map)[bigramPos] = frequency;
|
||||||
|
setInFilter(filter, bigramPos);
|
||||||
} while (0 != (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags));
|
} while (0 != (UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -20,6 +20,8 @@
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#include "defines.h"
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
class Dictionary;
|
class Dictionary;
|
||||||
|
@ -29,8 +31,8 @@ class BigramDictionary {
|
||||||
int getBigrams(const int32_t *word, int length, int *codes, int codesSize,
|
int getBigrams(const int32_t *word, int length, int *codes, int codesSize,
|
||||||
unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams);
|
unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams);
|
||||||
int getBigramListPositionForWord(const int32_t *prevWord, const int prevWordLength);
|
int getBigramListPositionForWord(const int32_t *prevWord, const int prevWordLength);
|
||||||
void fillBigramAddressToFrequencyMap(const int32_t *prevWord, const int prevWordLength,
|
void fillBigramAddressToFrequencyMapAndFilter(const int32_t *prevWord, const int prevWordLength,
|
||||||
std::map<int, int> *map);
|
std::map<int, int> *map, uint8_t *filter);
|
||||||
~BigramDictionary();
|
~BigramDictionary();
|
||||||
private:
|
private:
|
||||||
bool addWordBigram(unsigned short *word, int length, int frequency);
|
bool addWordBigram(unsigned short *word, int length, int frequency);
|
||||||
|
|
|
@ -241,6 +241,24 @@ static inline void prof_out(void) {
|
||||||
#define MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION 3
|
#define MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION 3
|
||||||
#define MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION 3
|
#define MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION 3
|
||||||
|
|
||||||
|
// Size, in bytes, of the bloom filter index for bigrams
|
||||||
|
// 128 gives us 1024 buckets. The probability of false positive is (1 - e ** (-kn/m))**k,
|
||||||
|
// where k is the number of hash functions, n the number of bigrams, and m the number of
|
||||||
|
// bits we can test.
|
||||||
|
// At the moment 100 is the maximum number of bigrams for a word with the current
|
||||||
|
// dictionaries, so n = 100. 1024 buckets give us m = 1024.
|
||||||
|
// With 1 hash function, our false positive rate is about 9.3%, which should be low enough for
|
||||||
|
// our uses since we are only using this to increase average performance. For the record,
|
||||||
|
// k = 2 gives 3.1% and k = 3 gives 1.6%. With k = 1, making m = 2048 gives 4.8%,
|
||||||
|
// and m = 4096 gives 2.4%.
|
||||||
|
#define BIGRAM_FILTER_BYTE_SIZE 128
|
||||||
|
// Must not exceed BIGRAM_FILTER_BYTE_SIZE * 8, and preferably prime. 1021 is the largest
|
||||||
|
// prime under 128 * 8.
|
||||||
|
#define BIGRAM_FILTER_MODULO 1021
|
||||||
|
#if BIGRAM_FILTER_BYTE_SIZE * 8 < BIGRAM_FILTER_MODULO
|
||||||
|
#error "BIGRAM_FILTER_MODULO is larger than BIGRAM_FILTER_BYTE_SIZE"
|
||||||
|
#endif
|
||||||
|
|
||||||
template<typename T> inline T min(T a, T b) { return a < b ? a : b; }
|
template<typename T> inline T min(T a, T b) { return a < b ? a : b; }
|
||||||
template<typename T> inline T max(T a, T b) { return a > b ? a : b; }
|
template<typename T> inline T max(T a, T b) { return a > b ? a : b; }
|
||||||
|
|
||||||
|
|
|
@ -42,8 +42,9 @@ class Dictionary {
|
||||||
const int bigramListPosition = !prevWordChars ? 0
|
const int bigramListPosition = !prevWordChars ? 0
|
||||||
: mBigramDictionary->getBigramListPositionForWord(prevWordChars, prevWordLength);
|
: mBigramDictionary->getBigramListPositionForWord(prevWordChars, prevWordLength);
|
||||||
std::map<int, int> bigramMap;
|
std::map<int, int> bigramMap;
|
||||||
mBigramDictionary->fillBigramAddressToFrequencyMap(prevWordChars, prevWordLength,
|
uint8_t bigramFilter[BIGRAM_FILTER_BYTE_SIZE];
|
||||||
&bigramMap);
|
mBigramDictionary->fillBigramAddressToFrequencyMapAndFilter(prevWordChars,
|
||||||
|
prevWordLength, &bigramMap, bigramFilter);
|
||||||
return mUnigramDictionary->getSuggestions(proximityInfo, mWordsPriorityQueuePool,
|
return mUnigramDictionary->getSuggestions(proximityInfo, mWordsPriorityQueuePool,
|
||||||
mCorrection, xcoordinates, ycoordinates, codes, codesSize, bigramListPosition,
|
mCorrection, xcoordinates, ycoordinates, codes, codesSize, bigramListPosition,
|
||||||
useFullEditDistance, outWords, frequencies);
|
useFullEditDistance, outWords, frequencies);
|
||||||
|
|
Loading…
Reference in a new issue