Merge "Introduce BinaryDictionaryBigramsIterator to access bigrams attributes in binary dictionaries."

main
Keisuke Kuroynagi 2013-06-14 11:24:18 +00:00 committed by Android (Google) Code Review
commit 4f19193560
10 changed files with 299 additions and 101 deletions

View File

@ -53,6 +53,7 @@ LATIN_IME_CORE_SRC_FILES := \
dic_nodes_cache.cpp) \ dic_nodes_cache.cpp) \
$(addprefix suggest/core/dictionary/, \ $(addprefix suggest/core/dictionary/, \
bigram_dictionary.cpp \ bigram_dictionary.cpp \
binary_dictionary_bigrams_reading_utils.cpp \
binary_dictionary_format_utils.cpp \ binary_dictionary_format_utils.cpp \
binary_dictionary_header.cpp \ binary_dictionary_header.cpp \
binary_dictionary_header_reading_utils.cpp \ binary_dictionary_header_reading_utils.cpp \

View File

@ -233,8 +233,7 @@ namespace latinime {
return multiBigramMap->getBigramProbability( return multiBigramMap->getBigramProbability(
binaryDictionaryInfo, prevWordPos, wordPos, unigramProbability); binaryDictionaryInfo, prevWordPos, wordPos, unigramProbability);
} }
return BinaryFormat::getBigramProbability( return ProbabilityUtils::backoff(unigramProbability);
binaryDictionaryInfo->getDictRoot(), prevWordPos, wordPos, unigramProbability);
} }
/////////////////////////////////////// ///////////////////////////////////////

View File

@ -21,6 +21,7 @@
#include "bigram_dictionary.h" #include "bigram_dictionary.h"
#include "defines.h" #include "defines.h"
#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
#include "suggest/core/dictionary/binary_dictionary_info.h" #include "suggest/core/dictionary/binary_dictionary_info.h"
#include "suggest/core/dictionary/binary_format.h" #include "suggest/core/dictionary/binary_format.h"
#include "suggest/core/dictionary/dictionary.h" #include "suggest/core/dictionary/dictionary.h"
@ -100,12 +101,11 @@ void BigramDictionary::addWordBigram(int *word, int length, int probability, int
* and the bigrams are used to boost unigram result scores, it makes little sense to * and the bigrams are used to boost unigram result scores, it makes little sense to
* reduce their scope to the ones that match the first letter. * reduce their scope to the ones that match the first letter.
*/ */
int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *inputCodePoints, int BigramDictionary::getPredictions(const int *prevWord, int prevWordLength, int *inputCodePoints,
int inputSize, int *bigramCodePoints, int *bigramProbability, int *outputTypes) const { int inputSize, int *bigramCodePoints, int *bigramProbability, int *outputTypes) const {
// TODO: remove unused arguments, and refrain from storing stuff in members of this class // TODO: remove unused arguments, and refrain from storing stuff in members of this class
// TODO: have "in" arguments before "out" ones, and make out args explicit in the name // TODO: have "in" arguments before "out" ones, and make out args explicit in the name
const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot();
int pos = getBigramListPositionForWord(prevWord, prevWordLength, int pos = getBigramListPositionForWord(prevWord, prevWordLength,
false /* forceLowerCaseSearch */); false /* forceLowerCaseSearch */);
// getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
@ -116,21 +116,20 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i
} }
// If still no bigrams, we really don't have them! // If still no bigrams, we really don't have them!
if (0 == pos) return 0; if (0 == pos) return 0;
uint8_t bigramFlags;
int bigramCount = 0; int bigramCount = 0;
do {
bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
int bigramBuffer[MAX_WORD_LENGTH];
int unigramProbability = 0; int unigramProbability = 0;
const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, int bigramBuffer[MAX_WORD_LENGTH];
&pos); for (BinaryDictionaryBigramsIterator bigramsIt(mBinaryDictionaryInfo, pos);
const int length = BinaryFormat::getWordAtAddress(root, bigramPos, MAX_WORD_LENGTH, bigramsIt.hasNext(); /* no-op */) {
bigramBuffer, &unigramProbability); bigramsIt.next();
const int length = BinaryFormat::getWordAtAddress(
mBinaryDictionaryInfo->getDictRoot(), bigramsIt.getBigramPos(),
MAX_WORD_LENGTH, bigramBuffer, &unigramProbability);
// inputSize == 0 means we are trying to find bigram predictions. // inputSize == 0 means we are trying to find bigram predictions.
if (inputSize < 1 || checkFirstCharacter(bigramBuffer, inputCodePoints)) { if (inputSize < 1 || checkFirstCharacter(bigramBuffer, inputCodePoints)) {
const int bigramProbabilityTemp = const int bigramProbabilityTemp = bigramsIt.getProbability();
BinaryFormat::MASK_ATTRIBUTE_PROBABILITY & bigramFlags;
// Due to space constraints, the probability for bigrams is approximate - the lower the // Due to space constraints, the probability for bigrams is approximate - the lower the
// unigram probability, the worse the precision. The theoritical maximum error in // unigram probability, the worse the precision. The theoritical maximum error in
// resulting probability is 8 - although in the practice it's never bigger than 3 or 4 // resulting probability is 8 - although in the practice it's never bigger than 3 or 4
@ -142,7 +141,7 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i
outputTypes); outputTypes);
++bigramCount; ++bigramCount;
} }
} while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); }
return min(bigramCount, MAX_RESULTS); return min(bigramCount, MAX_RESULTS);
} }
@ -187,22 +186,20 @@ bool BigramDictionary::checkFirstCharacter(int *word, int *inputCodePoints) cons
bool BigramDictionary::isValidBigram(const int *word1, int length1, const int *word2, bool BigramDictionary::isValidBigram(const int *word1, int length1, const int *word2,
int length2) const { int length2) const {
const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot();
int pos = getBigramListPositionForWord(word1, length1, false /* forceLowerCaseSearch */); int pos = getBigramListPositionForWord(word1, length1, false /* forceLowerCaseSearch */);
// getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
if (0 == pos) return false; if (0 == pos) return false;
int nextWordPos = BinaryFormat::getTerminalPosition(root, word2, length2, int nextWordPos = BinaryFormat::getTerminalPosition(mBinaryDictionaryInfo->getDictRoot(),
false /* forceLowerCaseSearch */); word2, length2, false /* forceLowerCaseSearch */);
if (NOT_VALID_WORD == nextWordPos) return false; if (NOT_VALID_WORD == nextWordPos) return false;
uint8_t bigramFlags;
do { for (BinaryDictionaryBigramsIterator bigramsIt(mBinaryDictionaryInfo, pos);
bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); bigramsIt.hasNext(); /* no-op */) {
const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, bigramsIt.next();
&pos); if (bigramsIt.getBigramPos() == nextWordPos) {
if (bigramPos == nextWordPos) {
return true; return true;
} }
} while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); }
return false; return false;
} }

View File

@ -27,8 +27,8 @@ class BigramDictionary {
public: public:
BigramDictionary(const BinaryDictionaryInfo *const binaryDictionaryInfo); BigramDictionary(const BinaryDictionaryInfo *const binaryDictionaryInfo);
int getBigrams(const int *word, int length, int *inputCodePoints, int inputSize, int *outWords, int getPredictions(const int *word, int length, int *inputCodePoints, int inputSize,
int *frequencies, int *outputTypes) const; int *outWords, int *frequencies, int *outputTypes) const;
bool isValidBigram(const int *word1, int length1, const int *word2, int length2) const; bool isValidBigram(const int *word1, int length1, const int *word2, int length2) const;
~BigramDictionary(); ~BigramDictionary();

View File

@ -0,0 +1,67 @@
/*
* Copyright (C) 2013 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H
#define LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H
#include "defines.h"
#include "suggest/core/dictionary/binary_dictionary_bigrams_reading_utils.h"
#include "suggest/core/dictionary/binary_dictionary_info.h"
namespace latinime {
/**
 * Forward-only reader for one bigram list of a binary dictionary.
 *
 * Construct it with the position of the first entry of the list, then call next() to decode an
 * entry before querying it through getBigramPos() / getProbability() / getFlags().
 * hasNext() is already true right after construction, so the position handed to the
 * constructor is presumably expected to designate a list holding at least one entry.
 *
 * Typical use:
 *     for (BinaryDictionaryBigramsIterator it(info, listPos); it.hasNext(); ) {
 *         it.next();
 *         // use it.getBigramPos() and it.getProbability()
 *     }
 */
class BinaryDictionaryBigramsIterator {
 public:
    // binaryDictionaryInfo: dictionary to read from.
    // pos: position of the first entry (its flags byte) of the bigram list.
    BinaryDictionaryBigramsIterator(
            const BinaryDictionaryInfo *const binaryDictionaryInfo, const int pos)
            : mBinaryDictionaryInfo(binaryDictionaryInfo), mPos(pos), mBigramFlags(0),
              mBigramPos(0), mHasNext(true) {}

    // True until an entry that does not carry the "has next" flag has been decoded.
    AK_FORCE_INLINE bool hasNext() const {
        return mHasNext;
    }

    // Decodes the entry at the current read position and moves the read position past it.
    AK_FORCE_INLINE void next() {
        const BinaryDictionaryBigramsReadingUtils::BigramFlags entryFlags =
                BinaryDictionaryBigramsReadingUtils::getFlagsAndForwardPointer(
                        mBinaryDictionaryInfo, &mPos);
        mBigramFlags = entryFlags;
        // The address field directly follows the flags byte; mPos already points at it.
        mBigramPos = BinaryDictionaryBigramsReadingUtils::getBigramAddressAndForwardPointer(
                mBinaryDictionaryInfo, entryFlags, &mPos);
        mHasNext = BinaryDictionaryBigramsReadingUtils::hasNext(entryFlags);
    }

    // Probability bits stored in the flags of the entry last decoded by next().
    AK_FORCE_INLINE int getProbability() const {
        return BinaryDictionaryBigramsReadingUtils::getBigramProbability(mBigramFlags);
    }

    // Position resolved from the address field of the entry last decoded by next().
    AK_FORCE_INLINE int getBigramPos() const {
        return mBigramPos;
    }

    // Raw flags byte of the entry last decoded by next() (0 before the first call).
    AK_FORCE_INLINE int getFlags() const {
        return mBigramFlags;
    }

 private:
    DISALLOW_COPY_AND_ASSIGN(BinaryDictionaryBigramsIterator);

    const BinaryDictionaryInfo *const mBinaryDictionaryInfo;
    // Read position: where the next entry to decode starts.
    int mPos;
    BinaryDictionaryBigramsReadingUtils::BigramFlags mBigramFlags;
    int mBigramPos;
    bool mHasNext;
};
} // namespace latinime
#endif // LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H

View File

@ -0,0 +1,68 @@
/*
* Copyright (C) 2013 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "suggest/core/dictionary/binary_dictionary_bigrams_reading_utils.h"
#include "suggest/core/dictionary/binary_dictionary_info.h"
#include "suggest/core/dictionary/byte_array_utils.h"
namespace latinime {
// Layout of the flags byte (BigramFlags) that starts every bigram entry:
//   0x80 - another entry follows this one in the list
//   0x40 - the address offset is subtracted from, rather than added to, the field position
//   0x30 - width of the address field that follows the flags byte
//   0x0F - bigram probability
// Mask selecting the address-width bits of the flags byte.
const BinaryDictionaryBigramsReadingUtils::BigramFlags
BinaryDictionaryBigramsReadingUtils::MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
// Address-width values: the address field is read as an 8, 16 or 24 bit unsigned integer.
const BinaryDictionaryBigramsReadingUtils::BigramFlags
BinaryDictionaryBigramsReadingUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
const BinaryDictionaryBigramsReadingUtils::BigramFlags
BinaryDictionaryBigramsReadingUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
const BinaryDictionaryBigramsReadingUtils::BigramFlags
BinaryDictionaryBigramsReadingUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
// Set when the decoded offset has to be subtracted from the address field position.
const BinaryDictionaryBigramsReadingUtils::BigramFlags
BinaryDictionaryBigramsReadingUtils::FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
// Flag for presence of more attributes
const BinaryDictionaryBigramsReadingUtils::BigramFlags
BinaryDictionaryBigramsReadingUtils::FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
// Mask for attribute probability, stored on 4 bits inside the flags byte.
const BinaryDictionaryBigramsReadingUtils::BigramFlags
BinaryDictionaryBigramsReadingUtils::MASK_ATTRIBUTE_PROBABILITY = 0x0F;
// Right shift that turns the masked address-width bits (0x10 / 0x20 / 0x30) into a byte
// count (1 / 2 / 3).
const int BinaryDictionaryBigramsReadingUtils::ATTRIBUTE_ADDRESS_SHIFT = 4;
/**
 * Decodes the address field of a bigram entry and resolves it to a position.
 *
 * binaryDictionaryInfo: dictionary whose root buffer is read.
 * flags: flags byte of the entry; it selects the width of the address field and the sign of
 *         the offset stored in it.
 * pos: in/out. On entry, the position of the address field; on return it has been advanced
 *         past the field. It is left untouched when no address-width bit is set in flags.
 *
 * Returns the position the field was read from, plus or minus the decoded offset (the offset
 * is 0 when no address-width bit is set).
 */
/* static */ int BinaryDictionaryBigramsReadingUtils::getBigramAddressAndForwardPointer(
        const BinaryDictionaryInfo *const binaryDictionaryInfo, const BigramFlags flags,
        int *const pos) {
    // The offset is relative to where the address field itself starts, so remember that
    // position before the reads below move *pos.
    const int fieldPos = *pos;
    const int addressType = MASK_ATTRIBUTE_ADDRESS_TYPE & flags;
    int distance = 0;
    if (addressType == FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE) {
        distance = ByteArrayUtils::readUint8andAdvancePosition(
                binaryDictionaryInfo->getDictRoot(), pos);
    } else if (addressType == FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES) {
        distance = ByteArrayUtils::readUint16andAdvancePosition(
                binaryDictionaryInfo->getDictRoot(), pos);
    } else if (addressType == FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES) {
        distance = ByteArrayUtils::readUint24andAdvancePosition(
                binaryDictionaryInfo->getDictRoot(), pos);
    }
    return isOffsetNegative(flags) ? fieldPos - distance : fieldPos + distance;
}
} // namespace latinime

View File

@ -0,0 +1,90 @@
/*
* Copyright (C) 2013 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LATINIME_BINARY_DICTIONARY_BIGRAM_READING_UTILS_H
#define LATINIME_BINARY_DICTIONARY_BIGRAM_READING_UTILS_H
#include <stdint.h>
#include "defines.h"
#include "suggest/core/dictionary/binary_dictionary_info.h"
#include "suggest/core/dictionary/byte_array_utils.h"
namespace latinime {
/**
 * Stateless helpers that decode bigram list entries of a binary dictionary.
 *
 * An entry is one flags byte followed by an address field of 0 to 3 bytes. The flags byte
 * carries the probability, the width of the address field, the sign of the offset stored in
 * that field and whether another entry follows.
 */
class BinaryDictionaryBigramsReadingUtils {
 public:
    typedef uint8_t BigramFlags;

    // Moves *pos from the first entry of a bigram list to just past its last entry.
    static AK_FORCE_INLINE void skipExistingBigrams(
            const BinaryDictionaryInfo *const binaryDictionaryInfo, int *const pos) {
        BigramFlags entryFlags;
        do {
            entryFlags = getFlagsAndForwardPointer(binaryDictionaryInfo, pos);
            // Step over the address field without decoding it.
            *pos += attributeAddressSize(entryFlags);
        } while (hasNext(entryFlags));
    }

    // Reads the flags byte at *pos and advances *pos past it.
    static AK_FORCE_INLINE BigramFlags getFlagsAndForwardPointer(
            const BinaryDictionaryInfo *const binaryDictionaryInfo, int *const pos) {
        return ByteArrayUtils::readUint8andAdvancePosition(
                binaryDictionaryInfo->getDictRoot(), pos);
    }

    // Extracts the probability bits of a flags byte.
    static AK_FORCE_INLINE int getBigramProbability(const BigramFlags flags) {
        return flags & MASK_ATTRIBUTE_PROBABILITY;
    }

    // Whether the offset stored in the address field is to be subtracted.
    static AK_FORCE_INLINE bool isOffsetNegative(const BigramFlags flags) {
        return 0 != (flags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE);
    }

    // Whether another entry follows the one these flags belong to.
    static AK_FORCE_INLINE bool hasNext(const BigramFlags flags) {
        return 0 != (flags & FLAG_ATTRIBUTE_HAS_NEXT);
    }

    // Decodes the address field at *pos, advances *pos past it and returns the resolved
    // position.
    static int getBigramAddressAndForwardPointer(
            const BinaryDictionaryInfo *const binaryDictionaryInfo,
            const BigramFlags flags, int *const pos);

 private:
    DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryDictionaryBigramsReadingUtils);

    static const BigramFlags MASK_ATTRIBUTE_ADDRESS_TYPE;
    static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE;
    static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES;
    static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES;
    static const BigramFlags FLAG_ATTRIBUTE_OFFSET_NEGATIVE;
    static const BigramFlags FLAG_ATTRIBUTE_HAS_NEXT;
    static const BigramFlags MASK_ATTRIBUTE_PROBABILITY;
    static const int ATTRIBUTE_ADDRESS_SHIFT;

    // Size in bytes of the address field announced by a flags byte.
    //
    // Note: this is a value-dependent optimization. It relies on the address type values
    // becoming 1, 2 and 3 once shifted right by ATTRIBUTE_ADDRESS_SHIFT. It may probably be
    // more readably written this way:
    //     switch (flags & MASK_ATTRIBUTE_ADDRESS_TYPE) {
    //         case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
    //         case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
    //         case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: return 3;
    //         default: return 0;
    //     }
    static AK_FORCE_INLINE int attributeAddressSize(const BigramFlags flags) {
        return (flags & MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
    }
};
}
#endif /* LATINIME_BINARY_DICTIONARY_BIGRAM_READING_UTILS_H */

View File

@ -21,7 +21,6 @@
#include "suggest/core/dictionary/probability_utils.h" #include "suggest/core/dictionary/probability_utils.h"
#include "utils/char_utils.h" #include "utils/char_utils.h"
#include "utils/hash_map_compat.h"
namespace latinime { namespace latinime {
@ -81,16 +80,10 @@ class BinaryFormat {
const int length, const bool forceLowerCaseSearch); const int length, const bool forceLowerCaseSearch);
static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth, static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth,
int *outWord, int *outUnigramProbability); int *outWord, int *outUnigramProbability);
static int getBigramProbabilityFromHashMap(const int position, static int getBigramListPositionForWordPosition(const uint8_t *const root, int position);
const hash_map_compat<int, int> *bigramMap, const int unigramProbability);
static void fillBigramProbabilityToHashMap(const uint8_t *const root, int position,
hash_map_compat<int, int> *bigramMap);
static int getBigramProbability(const uint8_t *const root, int position,
const int nextPosition, const int unigramProbability);
private: private:
DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat); DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat);
static int getBigramListPositionForWordPosition(const uint8_t *const root, int position);
static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00; static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40; static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
@ -516,57 +509,6 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co
return 0; return 0;
} }
// This returns a probability in log space.
inline int BinaryFormat::getBigramProbabilityFromHashMap(const int position,
const hash_map_compat<int, int> *bigramMap, const int unigramProbability) {
if (!bigramMap) {
return ProbabilityUtils::backoff(unigramProbability);
}
const hash_map_compat<int, int>::const_iterator bigramProbabilityIt = bigramMap->find(position);
if (bigramProbabilityIt != bigramMap->end()) {
const int bigramProbability = bigramProbabilityIt->second;
return ProbabilityUtils::computeProbabilityForBigram(unigramProbability, bigramProbability);
}
return ProbabilityUtils::backoff(unigramProbability);
}
AK_FORCE_INLINE void BinaryFormat::fillBigramProbabilityToHashMap(
const uint8_t *const root, int position, hash_map_compat<int, int> *bigramMap) {
position = getBigramListPositionForWordPosition(root, position);
if (0 == position) return;
uint8_t bigramFlags;
do {
bigramFlags = getFlagsAndForwardPointer(root, &position);
const int probability = MASK_ATTRIBUTE_PROBABILITY & bigramFlags;
const int bigramPos = getAttributeAddressAndForwardPointer(root, bigramFlags,
&position);
(*bigramMap)[bigramPos] = probability;
} while (FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
}
AK_FORCE_INLINE int BinaryFormat::getBigramProbability(const uint8_t *const root, int position,
const int nextPosition, const int unigramProbability) {
position = getBigramListPositionForWordPosition(root, position);
if (0 == position) {
return ProbabilityUtils::backoff(unigramProbability);
}
uint8_t bigramFlags;
do {
bigramFlags = getFlagsAndForwardPointer(root, &position);
const int bigramPos = getAttributeAddressAndForwardPointer(
root, bigramFlags, &position);
if (bigramPos == nextPosition) {
const int bigramProbability = MASK_ATTRIBUTE_PROBABILITY & bigramFlags;
return ProbabilityUtils::computeProbabilityForBigram(
unigramProbability, bigramProbability);
}
} while (FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
return ProbabilityUtils::backoff(unigramProbability);
}
// Returns a pointer to the start of the bigram list.
AK_FORCE_INLINE int BinaryFormat::getBigramListPositionForWordPosition( AK_FORCE_INLINE int BinaryFormat::getBigramListPositionForWordPosition(
const uint8_t *const root, int position) { const uint8_t *const root, int position) {
if (NOT_VALID_WORD == position) return 0; if (NOT_VALID_WORD == position) return 0;

View File

@ -79,7 +79,7 @@ int Dictionary::getSuggestions(ProximityInfo *proximityInfo, DicTraverseSession
int Dictionary::getBigrams(const int *word, int length, int *inputCodePoints, int inputSize, int Dictionary::getBigrams(const int *word, int length, int *inputCodePoints, int inputSize,
int *outWords, int *frequencies, int *outputTypes) const { int *outWords, int *frequencies, int *outputTypes) const {
if (length <= 0) return 0; if (length <= 0) return 0;
return mBigramDictionary->getBigrams(word, length, inputCodePoints, inputSize, outWords, return mBigramDictionary->getPredictions(word, length, inputCodePoints, inputSize, outWords,
frequencies, outputTypes); frequencies, outputTypes);
} }

View File

@ -18,6 +18,7 @@
#define LATINIME_MULTI_BIGRAM_MAP_H #define LATINIME_MULTI_BIGRAM_MAP_H
#include "defines.h" #include "defines.h"
#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
#include "suggest/core/dictionary/binary_dictionary_info.h" #include "suggest/core/dictionary/binary_dictionary_info.h"
#include "suggest/core/dictionary/binary_format.h" #include "suggest/core/dictionary/binary_format.h"
#include "utils/hash_map_compat.h" #include "utils/hash_map_compat.h"
@ -34,7 +35,7 @@ class MultiBigramMap {
// Look up the bigram probability for the given word pair from the cached bigram maps. // Look up the bigram probability for the given word pair from the cached bigram maps.
// Also caches the bigrams if there is space remaining and they have not been cached already. // Also caches the bigrams if there is space remaining and they have not been cached already.
int getBigramProbability(const BinaryDictionaryInfo *const binaryDicitonaryInfo, int getBigramProbability(const BinaryDictionaryInfo *const binaryDictionaryInfo,
const int wordPosition, const int nextWordPosition, const int unigramProbability) { const int wordPosition, const int nextWordPosition, const int unigramProbability) {
hash_map_compat<int, BigramMap>::const_iterator mapPosition = hash_map_compat<int, BigramMap>::const_iterator mapPosition =
mBigramMaps.find(wordPosition); mBigramMaps.find(wordPosition);
@ -42,11 +43,11 @@ class MultiBigramMap {
return mapPosition->second.getBigramProbability(nextWordPosition, unigramProbability); return mapPosition->second.getBigramProbability(nextWordPosition, unigramProbability);
} }
if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) { if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) {
addBigramsForWordPosition(binaryDicitonaryInfo, wordPosition); addBigramsForWordPosition(binaryDictionaryInfo, wordPosition);
return mBigramMaps[wordPosition].getBigramProbability( return mBigramMaps[wordPosition].getBigramProbability(
nextWordPosition, unigramProbability); nextWordPosition, unigramProbability);
} }
return BinaryFormat::getBigramProbability(binaryDicitonaryInfo->getDictRoot(), return readBigramProbabilityFromBinaryDictionary(binaryDictionaryInfo,
wordPosition, nextWordPosition, unigramProbability); wordPosition, nextWordPosition, unigramProbability);
} }
@ -62,15 +63,29 @@ class MultiBigramMap {
BigramMap() : mBigramMap(DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP) {} BigramMap() : mBigramMap(DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP) {}
~BigramMap() {} ~BigramMap() {}
void init(const BinaryDictionaryInfo *const binaryDicitonaryInfo, const int position) { void init(const BinaryDictionaryInfo *const binaryDictionaryInfo, const int nodePos) {
BinaryFormat::fillBigramProbabilityToHashMap( const int bigramsListPos = BinaryFormat::getBigramListPositionForWordPosition(
binaryDicitonaryInfo->getDictRoot(), position, &mBigramMap); binaryDictionaryInfo->getDictRoot(), nodePos);
if (0 == bigramsListPos) {
return;
}
for (BinaryDictionaryBigramsIterator bigramsIt(binaryDictionaryInfo, bigramsListPos);
bigramsIt.hasNext(); /* no-op */) {
bigramsIt.next();
mBigramMap[bigramsIt.getBigramPos()] = bigramsIt.getProbability();
}
} }
inline int getBigramProbability(const int nextWordPosition, const int unigramProbability) AK_FORCE_INLINE int getBigramProbability(
const { const int nextWordPosition, const int unigramProbability) const {
return BinaryFormat::getBigramProbabilityFromHashMap( const hash_map_compat<int, int>::const_iterator bigramProbabilityIt =
nextWordPosition, &mBigramMap, unigramProbability); mBigramMap.find(nextWordPosition);
if (bigramProbabilityIt != mBigramMap.end()) {
const int bigramProbability = bigramProbabilityIt->second;
return ProbabilityUtils::computeProbabilityForBigram(
unigramProbability, bigramProbability);
}
return ProbabilityUtils::backoff(unigramProbability);
} }
private: private:
@ -78,9 +93,28 @@ class MultiBigramMap {
hash_map_compat<int, int> mBigramMap; hash_map_compat<int, int> mBigramMap;
}; };
void addBigramsForWordPosition(const BinaryDictionaryInfo *const binaryDicitonaryInfo, AK_FORCE_INLINE void addBigramsForWordPosition(
const int position) { const BinaryDictionaryInfo *const binaryDictionaryInfo, const int position) {
mBigramMaps[position].init(binaryDicitonaryInfo, position); mBigramMaps[position].init(binaryDictionaryInfo, position);
}
// Looks up the probability of the bigram (word at nodePos, word at nextWordPosition) by
// walking the bigram list straight from the binary dictionary, without touching the cached
// maps. getBigramProbability() falls back to this when the cache already holds
// MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP previous words.
// Returns ProbabilityUtils::computeProbabilityForBigram() of the unigram probability and the
// stored bigram probability when the pair is listed, and
// ProbabilityUtils::backoff(unigramProbability) when the word has no bigram list or the pair
// is not in it.
AK_FORCE_INLINE int readBigramProbabilityFromBinaryDictionary(
const BinaryDictionaryInfo *const binaryDictionaryInfo, const int nodePos,
const int nextWordPosition, const int unigramProbability) {
const int bigramsListPos = BinaryFormat::getBigramListPositionForWordPosition(
binaryDictionaryInfo->getDictRoot(), nodePos);
// A list position of 0 is treated as "no bigrams for this word".
if (0 == bigramsListPos) {
return ProbabilityUtils::backoff(unigramProbability);
}
for (BinaryDictionaryBigramsIterator bigramsIt(binaryDictionaryInfo, bigramsListPos);
bigramsIt.hasNext(); /* no-op */) {
bigramsIt.next();
// Entries are matched by the position of the second word of the pair.
if (bigramsIt.getBigramPos() == nextWordPosition) {
return ProbabilityUtils::computeProbabilityForBigram(
unigramProbability, bigramsIt.getProbability());
}
}
return ProbabilityUtils::backoff(unigramProbability);
} }
hash_map_compat<int, BigramMap> mBigramMaps; hash_map_compat<int, BigramMap> mBigramMaps;