From 1ff81e889045d35ff8420b266398e73239bd15c9 Mon Sep 17 00:00:00 2001 From: Keisuke Kuroynagi Date: Fri, 14 Jun 2013 20:35:41 +0900 Subject: [PATCH] Use bloom filter in multi bigram map. Evaluated with previous word "this". without bloom filter (use only hash_map): Total 147792.34 (sum of others 147771.57) with bloom filter: Total 145900.64 (sum of others 145874.30) always read binary dictionary: Total 148603.14 (sum of others 148579.90) Bug: 8592527 Change-Id: I821dc39454543826adb73b9eeeef6408fad8ae28 --- native/jni/Android.mk | 4 +- native/jni/src/defines.h | 27 ---------- .../suggest/core/dictionary/bloom_filter.cpp | 25 +++++++++ .../suggest/core/dictionary/bloom_filter.h | 52 +++++++++++++++---- .../core/dictionary/multi_bigram_map.cpp | 33 ++++++++++++ .../core/dictionary/multi_bigram_map.h | 26 +++++++--- 6 files changed, 121 insertions(+), 46 deletions(-) create mode 100644 native/jni/src/suggest/core/dictionary/bloom_filter.cpp create mode 100644 native/jni/src/suggest/core/dictionary/multi_bigram_map.cpp diff --git a/native/jni/Android.mk b/native/jni/Android.mk index 9db50473d..fb60139d3 100644 --- a/native/jni/Android.mk +++ b/native/jni/Android.mk @@ -57,9 +57,11 @@ LATIN_IME_CORE_SRC_FILES := \ binary_dictionary_format_utils.cpp \ binary_dictionary_header.cpp \ binary_dictionary_header_reading_utils.cpp \ + bloom_filter.cpp \ byte_array_utils.cpp \ dictionary.cpp \ - digraph_utils.cpp) \ + digraph_utils.cpp \ + multi_bigram_map.cpp) \ $(addprefix suggest/core/layout/, \ additional_proximity_chars.cpp \ proximity_info.cpp \ diff --git a/native/jni/src/defines.h b/native/jni/src/defines.h index a3cf6a4b4..e349aedb1 100644 --- a/native/jni/src/defines.h +++ b/native/jni/src/defines.h @@ -300,33 +300,6 @@ static inline void prof_out(void) { #define DIC_NODES_CACHE_INITIAL_QUEUE_ID_CACHE_FOR_CONTINUOUS_SUGGESTION 3 #define DIC_NODES_CACHE_PRIORITY_QUEUES_SIZE 4 -// Size, in bytes, of the bloom filter index for bigrams -// 128 gives us 1024 buckets. The probability of false positive is (1 - e ** (-kn/m))**k, -// where k is the number of hash functions, n the number of bigrams, and m the number of -// bits we can test. -// At the moment 100 is the maximum number of bigrams for a word with the current -// dictionaries, so n = 100. 1024 buckets give us m = 1024. -// With 1 hash function, our false positive rate is about 9.3%, which should be enough for -// our uses since we are only using this to increase average performance. For the record, -// k = 2 gives 3.1% and k = 3 gives 1.6%. With k = 1, making m = 2048 gives 4.8%, -// and m = 4096 gives 2.4%. -#define BIGRAM_FILTER_BYTE_SIZE 128 -// Must be smaller than BIGRAM_FILTER_BYTE_SIZE * 8, and preferably prime. 1021 is the largest -// prime under 128 * 8. -#define BIGRAM_FILTER_MODULO 1021 -#if BIGRAM_FILTER_BYTE_SIZE * 8 < BIGRAM_FILTER_MODULO -#error "BIGRAM_FILTER_MODULO is larger than BIGRAM_FILTER_BYTE_SIZE" -#endif - -// Max number of bigram maps (previous word contexts) to be cached. Increasing this number could -// improve bigram lookup speed for multi-word suggestions, but at the cost of more memory usage. -// Also, there are diminishing returns since the most frequently used bigrams are typically near -// the beginning of the input and are thus the first ones to be cached. Note that these bigrams -// are reset for each new composing word. -#define MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP 25 -// Most common previous word contexts currently have 100 bigrams -#define DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP 100 - template AK_FORCE_INLINE const T &min(const T &a, const T &b) { return a < b ? a : b; } template AK_FORCE_INLINE const T &max(const T &a, const T &b) { return a > b ? a : b; } diff --git a/native/jni/src/suggest/core/dictionary/bloom_filter.cpp b/native/jni/src/suggest/core/dictionary/bloom_filter.cpp new file mode 100644 index 000000000..4ae474e0c --- /dev/null +++ b/native/jni/src/suggest/core/dictionary/bloom_filter.cpp @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/dictionary/bloom_filter.h" + +namespace latinime { + +// Must be smaller than BIGRAM_FILTER_BYTE_SIZE * 8, and preferably prime. 1021 is the largest +// prime under 128 * 8. +const int BloomFilter::BIGRAM_FILTER_MODULO = 1021; + +} // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/bloom_filter.h b/native/jni/src/suggest/core/dictionary/bloom_filter.h index bcce1f7ea..5205456a8 100644 --- a/native/jni/src/suggest/core/dictionary/bloom_filter.h +++ b/native/jni/src/suggest/core/dictionary/bloom_filter.h @@ -23,16 +23,48 @@ namespace latinime { -// TODO: uint32_t position -static inline void setInFilter(uint8_t *filter, const int32_t position) { - const uint32_t bucket = static_cast(position % BIGRAM_FILTER_MODULO); - filter[bucket >> 3] |= static_cast(1 << (bucket & 0x7)); -} +// This bloom filter is used for optimizing bigram retrieval. +// Execution times with previous word "this" are as follows: +// without bloom filter (use only hash_map): +// Total 147792.34 (sum of others 147771.57) +// with bloom filter: +// Total 145900.64 (sum of others 145874.30) +// always read binary dictionary: +// Total 148603.14 (sum of others 148579.90) +class BloomFilter { + public: + BloomFilter() { + ASSERT(BIGRAM_FILTER_BYTE_SIZE * 8 >= BIGRAM_FILTER_MODULO); + } -// TODO: uint32_t position -static inline bool isInFilter(const uint8_t *filter, const int32_t position) { - const uint32_t bucket = static_cast(position % BIGRAM_FILTER_MODULO); - return filter[bucket >> 3] & static_cast(1 << (bucket & 0x7)); -} + // TODO: uint32_t position + AK_FORCE_INLINE void setInFilter(const int32_t position) { + const uint32_t bucket = static_cast(position % BIGRAM_FILTER_MODULO); + mFilter[bucket >> 3] |= static_cast(1 << (bucket & 0x7)); + } + + // TODO: uint32_t position + AK_FORCE_INLINE bool isInFilter(const int32_t position) const { + const uint32_t bucket = static_cast(position % BIGRAM_FILTER_MODULO); + return (mFilter[bucket >> 3] & static_cast(1 << (bucket & 0x7))) != 0; + } + + private: + // Size, in bytes, of the bloom filter index for bigrams + // 128 gives us 1024 buckets. The probability of false positive is (1 - e ** (-kn/m))**k, + // where k is the number of hash functions, n the number of bigrams, and m the number of + // bits we can test. + // At the moment 100 is the maximum number of bigrams for a word with the current + // dictionaries, so n = 100. 1024 buckets give us m = 1024. + // With 1 hash function, our false positive rate is about 9.3%, which should be enough for + // our uses since we are only using this to increase average performance. For the record, + // k = 2 gives 3.1% and k = 3 gives 1.6%. With k = 1, making m = 2048 gives 4.8%, + // and m = 4096 gives 2.4%. + // This is assigned here because it is used for array size. + static const int BIGRAM_FILTER_BYTE_SIZE = 128; + static const int BIGRAM_FILTER_MODULO; + + uint8_t mFilter[BIGRAM_FILTER_BYTE_SIZE]; +}; } // namespace latinime #endif // LATINIME_BLOOM_FILTER_H diff --git a/native/jni/src/suggest/core/dictionary/multi_bigram_map.cpp b/native/jni/src/suggest/core/dictionary/multi_bigram_map.cpp new file mode 100644 index 000000000..b1d2f4b4d --- /dev/null +++ b/native/jni/src/suggest/core/dictionary/multi_bigram_map.cpp @@ -0,0 +1,33 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/dictionary/multi_bigram_map.h" + +#include + +namespace latinime { + +// Max number of bigram maps (previous word contexts) to be cached. Increasing this number +// could improve bigram lookup speed for multi-word suggestions, but at the cost of more memory +// usage. Also, there are diminishing returns since the most frequently used bigrams are +// typically near the beginning of the input and are thus the first ones to be cached. Note +// that these bigrams are reset for each new composing word. +const size_t MultiBigramMap::MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP = 25; + +// Most common previous word contexts currently have 100 bigrams +const int MultiBigramMap::BigramMap::DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP = 100; + +} // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/multi_bigram_map.h b/native/jni/src/suggest/core/dictionary/multi_bigram_map.h index b380e9727..60169f80e 100644 --- a/native/jni/src/suggest/core/dictionary/multi_bigram_map.h +++ b/native/jni/src/suggest/core/dictionary/multi_bigram_map.h @@ -17,10 +17,13 @@ #ifndef LATINIME_MULTI_BIGRAM_MAP_H #define LATINIME_MULTI_BIGRAM_MAP_H +#include + #include "defines.h" #include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" #include "suggest/core/dictionary/binary_dictionary_info.h" #include "suggest/core/dictionary/binary_format.h" +#include "suggest/core/dictionary/bloom_filter.h" #include "utils/hash_map_compat.h" namespace latinime { @@ -60,7 +63,7 @@ class MultiBigramMap { class BigramMap { public: - BigramMap() : mBigramMap(DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP) {} + BigramMap() : mBigramMap(DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP), mBloomFilter() {} ~BigramMap() {} void init(const BinaryDictionaryInfo *const binaryDictionaryInfo, const int nodePos) { @@ -73,24 +76,30 @@ class MultiBigramMap { bigramsIt.hasNext(); /* no-op */) { bigramsIt.next(); mBigramMap[bigramsIt.getBigramPos()] = bigramsIt.getProbability(); + mBloomFilter.setInFilter(bigramsIt.getBigramPos()); } } AK_FORCE_INLINE int getBigramProbability( const int nextWordPosition, const int unigramProbability) const { - const hash_map_compat::const_iterator bigramProbabilityIt = - mBigramMap.find(nextWordPosition); - if (bigramProbabilityIt != mBigramMap.end()) { - const int bigramProbability = bigramProbabilityIt->second; - return ProbabilityUtils::computeProbabilityForBigram( - unigramProbability, bigramProbability); + if (mBloomFilter.isInFilter(nextWordPosition)) { + const hash_map_compat::const_iterator bigramProbabilityIt = + mBigramMap.find(nextWordPosition); + if (bigramProbabilityIt != mBigramMap.end()) { + const int bigramProbability = bigramProbabilityIt->second; + return ProbabilityUtils::computeProbabilityForBigram( + unigramProbability, bigramProbability); + } } return ProbabilityUtils::backoff(unigramProbability); } private: - // Note: Default copy constructor needed for use in hash_map. + // NOTE: The BigramMap class doesn't use DISALLOW_COPY_AND_ASSIGN() because its default + // copy constructor is needed for use in hash_map. + static const int DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP; hash_map_compat mBigramMap; + BloomFilter mBloomFilter; }; AK_FORCE_INLINE void addBigramsForWordPosition( @@ -117,6 +126,7 @@ class MultiBigramMap { return ProbabilityUtils::backoff(unigramProbability); } + static const size_t MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP; hash_map_compat mBigramMaps; }; } // namespace latinime