LatinIME/native/jni/src/dictionary/utils/multi_bigram_map.cpp

/*
 * Copyright (C) 2013, The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dictionary/utils/multi_bigram_map.h"

#include <cstddef>
#include <unordered_map>

namespace latinime {

// Maximum number of bigram maps (one per previous-word context) kept in the cache.
// getBigramProbability() only adds a new context while fewer than this many are cached;
// otherwise it reads the probability from the dictionary directly. Increasing this number
// could improve bigram lookup speed for multi-word suggestions, but at the cost of more memory
// usage. There are also diminishing returns, since the most frequently used bigrams are
// typically near the beginning of the input and are thus the first ones to be cached. Note
// that these bigrams are reset for each new composing word.
const size_t MultiBigramMap::MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP = 25;

// Default hash map size for each bigram map. Most common previous word contexts currently
// have 100 bigrams.
// NOTE(review): the use of this constant is not visible in this file; presumably it sizes
// BigramMap's hash map at construction -- confirm in the header.
const int MultiBigramMap::BigramMap::DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP = 100;

// Returns the probability of nextWordId given the previous-word context in prevWordIds.
//
// Resolution order:
//   1. No usable previous word (empty view or NOT_A_WORD_ID at index 0): the result comes from
//      structurePolicy->getProbability() with NOT_A_PROBABILITY as the bigram probability.
//   2. The context keyed by prevWordIds[0] is already cached: answer from that cached map.
//   3. Not cached and fewer than MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP contexts are cached: load
//      the bigrams for this context into the cache, then answer from the new map.
//   4. Cache is full: read the probability directly from the dictionary.
int MultiBigramMap::getBigramProbability(
        const DictionaryStructureWithBufferPolicy *const structurePolicy,
        const WordIdArrayView prevWordIds, const int nextWordId,
        const int unigramProbability) {
    const bool hasPrevWord = !prevWordIds.empty() && prevWordIds[0] != NOT_A_WORD_ID;
    if (!hasPrevWord) {
        return structurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY);
    }
    // Cached maps are keyed by the first previous word id only.
    const int prevWordId = prevWordIds[0];
    const auto cachedMapIt = mBigramMaps.find(prevWordId);
    if (cachedMapIt == mBigramMaps.end()) {
        if (mBigramMaps.size() >= MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) {
            // No room for another context; bypass the cache for this lookup.
            return readBigramProbabilityFromBinaryDictionary(structurePolicy, prevWordIds,
                    nextWordId, unigramProbability);
        }
        addBigramsForWord(structurePolicy, prevWordIds);
        return mBigramMaps[prevWordId].getBigramProbability(structurePolicy, nextWordId,
                unigramProbability);
    }
    return cachedMapIt->second.getBigramProbability(structurePolicy, nextWordId,
            unigramProbability);
}

// Asks the dictionary policy to iterate the n-gram entries for the given previous-word context,
// registering this object as the listener. Entries are presumably delivered back through
// onVisitEntry() -- the callback wiring lives in the policy, outside this file.
void MultiBigramMap::BigramMap::init(
        const DictionaryStructureWithBufferPolicy *const structurePolicy,
        const WordIdArrayView prevWordIds) {
    auto *const entryListener = this;
    structurePolicy->iterateNgramEntries(prevWordIds, entryListener);
}

// Looks up nextWordId in this cached map and returns the result of
// structurePolicy->getProbability(unigramProbability, <bigram probability>), where the bigram
// probability is the cached value, or NOT_A_PROBABILITY when the word is not cached.
int MultiBigramMap::BigramMap::getBigramProbability(
        const DictionaryStructureWithBufferPolicy *const structurePolicy,
        const int nextWordId, const int unigramProbability) const {
    // The filter is consulted first; when it rejects the id the hash map is not touched.
    if (!mBloomFilter.isInFilter(nextWordId)) {
        return structurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY);
    }
    // Passing the filter does not guarantee an entry, so absence is still checked here.
    const auto cachedEntry = mBigramMap.find(nextWordId);
    const int bigramProbability =
            (cachedEntry == mBigramMap.end()) ? NOT_A_PROBABILITY : cachedEntry->second;
    return structurePolicy->getProbability(unigramProbability, bigramProbability);
}

// Records one visited n-gram entry: stores ngramProbability under targetWordId in the hash map
// and marks targetWordId in the filter. Entries whose target is NOT_A_WORD_ID are ignored.
void MultiBigramMap::BigramMap::onVisitEntry(const int ngramProbability, const int targetWordId) {
    if (targetWordId != NOT_A_WORD_ID) {
        mBigramMap[targetWordId] = ngramProbability;
        mBloomFilter.setInFilter(targetWordId);
    }
}

// Creates (or reuses) the cached bigram map keyed by prevWordIds[0] and initializes it from the
// dictionary for the given previous-word context. The caller must pass a non-empty view, since
// index 0 is read unconditionally.
void MultiBigramMap::addBigramsForWord(
        const DictionaryStructureWithBufferPolicy *const structurePolicy,
        const WordIdArrayView prevWordIds) {
    // operator[] default-constructs the entry when the key is not present yet.
    auto &bigramMapForPrevWord = mBigramMaps[prevWordIds[0]];
    bigramMapForPrevWord.init(structurePolicy, prevWordIds);
}

// Uncached lookup: asks the dictionary policy for the probability of nextWordId in the given
// context. A valid result is returned as is; otherwise the result comes from
// structurePolicy->getProbability() with NOT_A_PROBABILITY as the bigram probability.
int MultiBigramMap::readBigramProbabilityFromBinaryDictionary(
        const DictionaryStructureWithBufferPolicy *const structurePolicy,
        const WordIdArrayView prevWordIds, const int nextWordId, const int unigramProbability) {
    const int probabilityFromDict = structurePolicy->getProbabilityOfWord(prevWordIds, nextWordId);
    if (probabilityFromDict == NOT_A_PROBABILITY) {
        return structurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY);
    }
    return probabilityFromDict;
}

} // namespace latinime
Use bloom filter in multi bigram map. Evaluated with previous word "this". without bloom filter (use only hash_map): Total 147792.34 (sum of others 147771.57) with bloom filter: Total 145900.64 (sum of others 145874.30) always read binary dictionary: Total 148603.14 (sum of others 148579.90) Bug: 8592527 Change-Id: I821dc39454543826adb73b9eeeef6408fad8ae28 2013-06-14 11:35:41 +00:00			`/*`
			`* Copyright (C) 2013, The Android Open Source Project`
			`*`
			`* Licensed under the Apache License, Version 2.0 (the "License");`
			`* you may not use this file except in compliance with the License.`
			`* You may obtain a copy of the License at`
			`*`
			`* http://www.apache.org/licenses/LICENSE-2.0`
			`*`
			`* Unless required by applicable law or agreed to in writing, software`
			`* distributed under the License is distributed on an "AS IS" BASIS,`
			`* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`* See the License for the specific language governing permissions and`
			`* limitations under the License.`
			`*/`

Move dictionary code to top level dictionary dir. Bug: 18725954 Change-Id: Ia442ba4b5d84311057d83edf6e7aeb151d6a820b 2014-12-17 07:02:09 +00:00			`#include "dictionary/utils/multi_bigram_map.h"`
Use bloom filter in multi bigram map. Evaluated with previous word "this". without bloom filter (use only hash_map): Total 147792.34 (sum of others 147771.57) with bloom filter: Total 145900.64 (sum of others 145874.30) always read binary dictionary: Total 148603.14 (sum of others 148579.90) Bug: 8592527 Change-Id: I821dc39454543826adb73b9eeeef6408fad8ae28 2013-06-14 11:35:41 +00:00
			`#include <cstddef>`
s/hash_map_compat/unordered_map/ Change-Id: Icce5f9a12b04bdd7540c52750d303a585d71f28a 2014-04-11 09:07:59 +00:00			`#include <unordered_map>`
Use bloom filter in multi bigram map. Evaluated with previous word "this". without bloom filter (use only hash_map): Total 147792.34 (sum of others 147771.57) with bloom filter: Total 145900.64 (sum of others 145874.30) always read binary dictionary: Total 148603.14 (sum of others 148579.90) Bug: 8592527 Change-Id: I821dc39454543826adb73b9eeeef6408fad8ae28 2013-06-14 11:35:41 +00:00
			`namespace latinime {`

			`// Max number of bigram maps (previous word contexts) to be cached. Increasing this number`
			`// could improve bigram lookup speed for multi-word suggestions, but at the cost of more memory`
			`// usage. Also, there are diminishing returns since the most frequently used bigrams are`
			`// typically near the beginning of the input and are thus the first ones to be cached. Note`
			`// that these bigrams are reset for each new composing word.`
			`const size_t MultiBigramMap::MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP = 25;`

			`// Most common previous word contexts currently have 100 bigrams`
			`const int MultiBigramMap::BigramMap::DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP = 100;`

Reset to 9bd6dac4708ad94fd0257c53e977df62b152e20c The bulk merge from -bayo to klp-dev should not have been merged to master. Change-Id: I527a03a76f5247e4939a672f27c314dc11cbb854 2013-12-13 08:09:16 +00:00			`// Look up the bigram probability for the given word pair from the cached bigram maps.`
			`// Also caches the bigrams if there is space remaining and they have not been cached already.`
			`int MultiBigramMap::getBigramProbability(`
			`const DictionaryStructureWithBufferPolicy *const structurePolicy,`
Use WordIdArrayView for prevWordIds. Bug: 14425059 Change-Id: Ia84fb997d89564e60111b46ca83bbfa3b187f316 2014-09-11 10:36:22 +00:00			`const WordIdArrayView prevWordIds, const int nextWordId,`
Use NgramListener in MultiBigramMap. Bug: 14425059 Change-Id: I425536290111f2a8172f31370706f858a1e07f6e 2014-08-01 02:00:03 +00:00			`const int unigramProbability) {`
Use WordIdArrayView for prevWordIds. Bug: 14425059 Change-Id: Ia84fb997d89564e60111b46ca83bbfa3b187f316 2014-09-11 10:36:22 +00:00			`if (prevWordIds.empty() \|\| prevWordIds[0] == NOT_A_WORD_ID) {`
Use NgramListener in MultiBigramMap. Bug: 14425059 Change-Id: I425536290111f2a8172f31370706f858a1e07f6e 2014-08-01 02:00:03 +00:00			`return structurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY);`
			`}`
Use word id for methods related to n-grams. Bug: 14425059 Change-Id: I81e5d3793527776d3c9faa5594005ddbd4a71354 2014-09-03 07:32:43 +00:00			`const auto mapPosition = mBigramMaps.find(prevWordIds[0]);`
Reset to 9bd6dac4708ad94fd0257c53e977df62b152e20c The bulk merge from -bayo to klp-dev should not have been merged to master. Change-Id: I527a03a76f5247e4939a672f27c314dc11cbb854 2013-12-13 08:09:16 +00:00			`if (mapPosition != mBigramMaps.end()) {`
Use word id for methods related to n-grams. Bug: 14425059 Change-Id: I81e5d3793527776d3c9faa5594005ddbd4a71354 2014-09-03 07:32:43 +00:00			`return mapPosition->second.getBigramProbability(structurePolicy, nextWordId,`
Reset to 9bd6dac4708ad94fd0257c53e977df62b152e20c The bulk merge from -bayo to klp-dev should not have been merged to master. Change-Id: I527a03a76f5247e4939a672f27c314dc11cbb854 2013-12-13 08:09:16 +00:00			`unigramProbability);`
			`}`
			`if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) {`
Use word id for methods related to n-grams. Bug: 14425059 Change-Id: I81e5d3793527776d3c9faa5594005ddbd4a71354 2014-09-03 07:32:43 +00:00			`addBigramsForWord(structurePolicy, prevWordIds);`
			`return mBigramMaps[prevWordIds[0]].getBigramProbability(structurePolicy,`
			`nextWordId, unigramProbability);`
Reset to 9bd6dac4708ad94fd0257c53e977df62b152e20c The bulk merge from -bayo to klp-dev should not have been merged to master. Change-Id: I527a03a76f5247e4939a672f27c314dc11cbb854 2013-12-13 08:09:16 +00:00			`}`
Use word id for methods related to n-grams. Bug: 14425059 Change-Id: I81e5d3793527776d3c9faa5594005ddbd4a71354 2014-09-03 07:32:43 +00:00			`return readBigramProbabilityFromBinaryDictionary(structurePolicy, prevWordIds,`
			`nextWordId, unigramProbability);`
Reset to 9bd6dac4708ad94fd0257c53e977df62b152e20c The bulk merge from -bayo to klp-dev should not have been merged to master. Change-Id: I527a03a76f5247e4939a672f27c314dc11cbb854 2013-12-13 08:09:16 +00:00			`}`

			`void MultiBigramMap::BigramMap::init(`
Use NgramListener in MultiBigramMap. Bug: 14425059 Change-Id: I425536290111f2a8172f31370706f858a1e07f6e 2014-08-01 02:00:03 +00:00			`const DictionaryStructureWithBufferPolicy *const structurePolicy,`
Use WordIdArrayView for prevWordIds. Bug: 14425059 Change-Id: Ia84fb997d89564e60111b46ca83bbfa3b187f316 2014-09-11 10:36:22 +00:00			`const WordIdArrayView prevWordIds) {`
Use word id for methods related to n-grams. Bug: 14425059 Change-Id: I81e5d3793527776d3c9faa5594005ddbd4a71354 2014-09-03 07:32:43 +00:00			`structurePolicy->iterateNgramEntries(prevWordIds, this /* listener */);`
Reset to 9bd6dac4708ad94fd0257c53e977df62b152e20c The bulk merge from -bayo to klp-dev should not have been merged to master. Change-Id: I527a03a76f5247e4939a672f27c314dc11cbb854 2013-12-13 08:09:16 +00:00			`}`

			`int MultiBigramMap::BigramMap::getBigramProbability(`
			`const DictionaryStructureWithBufferPolicy *const structurePolicy,`
Use word id for methods related to n-grams. Bug: 14425059 Change-Id: I81e5d3793527776d3c9faa5594005ddbd4a71354 2014-09-03 07:32:43 +00:00			`const int nextWordId, const int unigramProbability) const {`
Reset to 9bd6dac4708ad94fd0257c53e977df62b152e20c The bulk merge from -bayo to klp-dev should not have been merged to master. Change-Id: I527a03a76f5247e4939a672f27c314dc11cbb854 2013-12-13 08:09:16 +00:00			`int bigramProbability = NOT_A_PROBABILITY;`
Use word id for methods related to n-grams. Bug: 14425059 Change-Id: I81e5d3793527776d3c9faa5594005ddbd4a71354 2014-09-03 07:32:43 +00:00			`if (mBloomFilter.isInFilter(nextWordId)) {`
			`const auto bigramProbabilityIt = mBigramMap.find(nextWordId);`
Reset to 9bd6dac4708ad94fd0257c53e977df62b152e20c The bulk merge from -bayo to klp-dev should not have been merged to master. Change-Id: I527a03a76f5247e4939a672f27c314dc11cbb854 2013-12-13 08:09:16 +00:00			`if (bigramProbabilityIt != mBigramMap.end()) {`
			`bigramProbability = bigramProbabilityIt->second;`
			`}`
			`}`
			`return structurePolicy->getProbability(unigramProbability, bigramProbability);`
			`}`

Use word id for methods related to n-grams. Bug: 14425059 Change-Id: I81e5d3793527776d3c9faa5594005ddbd4a71354 2014-09-03 07:32:43 +00:00			`void MultiBigramMap::BigramMap::onVisitEntry(const int ngramProbability, const int targetWordId) {`
			`if (targetWordId == NOT_A_WORD_ID) {`
Use NgramListener in MultiBigramMap. Bug: 14425059 Change-Id: I425536290111f2a8172f31370706f858a1e07f6e 2014-08-01 02:00:03 +00:00			`return;`
			`}`
Use word id for methods related to n-grams. Bug: 14425059 Change-Id: I81e5d3793527776d3c9faa5594005ddbd4a71354 2014-09-03 07:32:43 +00:00			`mBigramMap[targetWordId] = ngramProbability;`
			`mBloomFilter.setInFilter(targetWordId);`
Use NgramListener in MultiBigramMap. Bug: 14425059 Change-Id: I425536290111f2a8172f31370706f858a1e07f6e 2014-08-01 02:00:03 +00:00			`}`

Use word id for methods related to n-grams. Bug: 14425059 Change-Id: I81e5d3793527776d3c9faa5594005ddbd4a71354 2014-09-03 07:32:43 +00:00			`void MultiBigramMap::addBigramsForWord(`
Use NgramListener in MultiBigramMap. Bug: 14425059 Change-Id: I425536290111f2a8172f31370706f858a1e07f6e 2014-08-01 02:00:03 +00:00			`const DictionaryStructureWithBufferPolicy *const structurePolicy,`
Use WordIdArrayView for prevWordIds. Bug: 14425059 Change-Id: Ia84fb997d89564e60111b46ca83bbfa3b187f316 2014-09-11 10:36:22 +00:00			`const WordIdArrayView prevWordIds) {`
			`mBigramMaps[prevWordIds[0]].init(structurePolicy, prevWordIds);`
Reset to 9bd6dac4708ad94fd0257c53e977df62b152e20c The bulk merge from -bayo to klp-dev should not have been merged to master. Change-Id: I527a03a76f5247e4939a672f27c314dc11cbb854 2013-12-13 08:09:16 +00:00			`}`

			`int MultiBigramMap::readBigramProbabilityFromBinaryDictionary(`
Use NgramListener in MultiBigramMap. Bug: 14425059 Change-Id: I425536290111f2a8172f31370706f858a1e07f6e 2014-08-01 02:00:03 +00:00			`const DictionaryStructureWithBufferPolicy *const structurePolicy,`
Use WordIdArrayView for prevWordIds. Bug: 14425059 Change-Id: Ia84fb997d89564e60111b46ca83bbfa3b187f316 2014-09-11 10:36:22 +00:00			`const WordIdArrayView prevWordIds, const int nextWordId, const int unigramProbability) {`
Use word id for methods related to n-grams. Bug: 14425059 Change-Id: I81e5d3793527776d3c9faa5594005ddbd4a71354 2014-09-03 07:32:43 +00:00			`const int bigramProbability = structurePolicy->getProbabilityOfWord(prevWordIds, nextWordId);`
Use NgramListener in MultiBigramMap. Bug: 14425059 Change-Id: I425536290111f2a8172f31370706f858a1e07f6e 2014-08-01 02:00:03 +00:00			`if (bigramProbability != NOT_A_PROBABILITY) {`
			`return bigramProbability;`
Reset to 9bd6dac4708ad94fd0257c53e977df62b152e20c The bulk merge from -bayo to klp-dev should not have been merged to master. Change-Id: I527a03a76f5247e4939a672f27c314dc11cbb854 2013-12-13 08:09:16 +00:00			`}`
Use NgramListener in MultiBigramMap. Bug: 14425059 Change-Id: I425536290111f2a8172f31370706f858a1e07f6e 2014-08-01 02:00:03 +00:00			`return structurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY);`
Reset to 9bd6dac4708ad94fd0257c53e977df62b152e20c The bulk merge from -bayo to klp-dev should not have been merged to master. Change-Id: I527a03a76f5247e4939a672f27c314dc11cbb854 2013-12-13 08:09:16 +00:00			`}`

Use bloom filter in multi bigram map. Evaluated with previous word "this". without bloom filter (use only hash_map): Total 147792.34 (sum of others 147771.57) with bloom filter: Total 145900.64 (sum of others 145874.30) always read binary dictionary: Total 148603.14 (sum of others 148579.90) Bug: 8592527 Change-Id: I821dc39454543826adb73b9eeeef6408fad8ae28 2013-06-14 11:35:41 +00:00			`} // namespace latinime`