Merge "Introduce BinaryDictionaryBigramsIterator to access bigrams attributes in binary dictionaries."
This commit is contained in:
commit
4f19193560
10 changed files with 299 additions and 101 deletions
|
@ -53,6 +53,7 @@ LATIN_IME_CORE_SRC_FILES := \
|
|||
dic_nodes_cache.cpp) \
|
||||
$(addprefix suggest/core/dictionary/, \
|
||||
bigram_dictionary.cpp \
|
||||
binary_dictionary_bigrams_reading_utils.cpp \
|
||||
binary_dictionary_format_utils.cpp \
|
||||
binary_dictionary_header.cpp \
|
||||
binary_dictionary_header_reading_utils.cpp \
|
||||
|
|
|
@ -233,8 +233,7 @@ namespace latinime {
|
|||
return multiBigramMap->getBigramProbability(
|
||||
binaryDictionaryInfo, prevWordPos, wordPos, unigramProbability);
|
||||
}
|
||||
return BinaryFormat::getBigramProbability(
|
||||
binaryDictionaryInfo->getDictRoot(), prevWordPos, wordPos, unigramProbability);
|
||||
return ProbabilityUtils::backoff(unigramProbability);
|
||||
}
|
||||
|
||||
///////////////////////////////////////
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
#include "bigram_dictionary.h"
|
||||
|
||||
#include "defines.h"
|
||||
#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
|
||||
#include "suggest/core/dictionary/binary_dictionary_info.h"
|
||||
#include "suggest/core/dictionary/binary_format.h"
|
||||
#include "suggest/core/dictionary/dictionary.h"
|
||||
|
@ -100,12 +101,11 @@ void BigramDictionary::addWordBigram(int *word, int length, int probability, int
|
|||
* and the bigrams are used to boost unigram result scores, it makes little sense to
|
||||
* reduce their scope to the ones that match the first letter.
|
||||
*/
|
||||
int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *inputCodePoints,
|
||||
int BigramDictionary::getPredictions(const int *prevWord, int prevWordLength, int *inputCodePoints,
|
||||
int inputSize, int *bigramCodePoints, int *bigramProbability, int *outputTypes) const {
|
||||
// TODO: remove unused arguments, and refrain from storing stuff in members of this class
|
||||
// TODO: have "in" arguments before "out" ones, and make out args explicit in the name
|
||||
|
||||
const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot();
|
||||
int pos = getBigramListPositionForWord(prevWord, prevWordLength,
|
||||
false /* forceLowerCaseSearch */);
|
||||
// getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
|
||||
|
@ -116,21 +116,20 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i
|
|||
}
|
||||
// If still no bigrams, we really don't have them!
|
||||
if (0 == pos) return 0;
|
||||
uint8_t bigramFlags;
|
||||
|
||||
int bigramCount = 0;
|
||||
do {
|
||||
bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
||||
int bigramBuffer[MAX_WORD_LENGTH];
|
||||
int unigramProbability = 0;
|
||||
const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
|
||||
&pos);
|
||||
const int length = BinaryFormat::getWordAtAddress(root, bigramPos, MAX_WORD_LENGTH,
|
||||
bigramBuffer, &unigramProbability);
|
||||
int unigramProbability = 0;
|
||||
int bigramBuffer[MAX_WORD_LENGTH];
|
||||
for (BinaryDictionaryBigramsIterator bigramsIt(mBinaryDictionaryInfo, pos);
|
||||
bigramsIt.hasNext(); /* no-op */) {
|
||||
bigramsIt.next();
|
||||
const int length = BinaryFormat::getWordAtAddress(
|
||||
mBinaryDictionaryInfo->getDictRoot(), bigramsIt.getBigramPos(),
|
||||
MAX_WORD_LENGTH, bigramBuffer, &unigramProbability);
|
||||
|
||||
// inputSize == 0 means we are trying to find bigram predictions.
|
||||
if (inputSize < 1 || checkFirstCharacter(bigramBuffer, inputCodePoints)) {
|
||||
const int bigramProbabilityTemp =
|
||||
BinaryFormat::MASK_ATTRIBUTE_PROBABILITY & bigramFlags;
|
||||
const int bigramProbabilityTemp = bigramsIt.getProbability();
|
||||
// Due to space constraints, the probability for bigrams is approximate - the lower the
|
||||
// unigram probability, the worse the precision. The theoretical maximum error in
|
||||
// resulting probability is 8 - although in practice it's never bigger than 3 or 4
|
||||
|
@ -142,7 +141,7 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i
|
|||
outputTypes);
|
||||
++bigramCount;
|
||||
}
|
||||
} while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
|
||||
}
|
||||
return min(bigramCount, MAX_RESULTS);
|
||||
}
|
||||
|
||||
|
@ -187,22 +186,20 @@ bool BigramDictionary::checkFirstCharacter(int *word, int *inputCodePoints) cons
|
|||
|
||||
bool BigramDictionary::isValidBigram(const int *word1, int length1, const int *word2,
|
||||
int length2) const {
|
||||
const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot();
|
||||
int pos = getBigramListPositionForWord(word1, length1, false /* forceLowerCaseSearch */);
|
||||
// getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
|
||||
if (0 == pos) return false;
|
||||
int nextWordPos = BinaryFormat::getTerminalPosition(root, word2, length2,
|
||||
false /* forceLowerCaseSearch */);
|
||||
int nextWordPos = BinaryFormat::getTerminalPosition(mBinaryDictionaryInfo->getDictRoot(),
|
||||
word2, length2, false /* forceLowerCaseSearch */);
|
||||
if (NOT_VALID_WORD == nextWordPos) return false;
|
||||
uint8_t bigramFlags;
|
||||
do {
|
||||
bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
||||
const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
|
||||
&pos);
|
||||
if (bigramPos == nextWordPos) {
|
||||
|
||||
for (BinaryDictionaryBigramsIterator bigramsIt(mBinaryDictionaryInfo, pos);
|
||||
bigramsIt.hasNext(); /* no-op */) {
|
||||
bigramsIt.next();
|
||||
if (bigramsIt.getBigramPos() == nextWordPos) {
|
||||
return true;
|
||||
}
|
||||
} while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
@ -27,8 +27,8 @@ class BigramDictionary {
|
|||
public:
|
||||
BigramDictionary(const BinaryDictionaryInfo *const binaryDictionaryInfo);
|
||||
|
||||
int getBigrams(const int *word, int length, int *inputCodePoints, int inputSize, int *outWords,
|
||||
int *frequencies, int *outputTypes) const;
|
||||
int getPredictions(const int *word, int length, int *inputCodePoints, int inputSize,
|
||||
int *outWords, int *frequencies, int *outputTypes) const;
|
||||
bool isValidBigram(const int *word1, int length1, const int *word2, int length2) const;
|
||||
~BigramDictionary();
|
||||
|
||||
|
|
|
@ -0,0 +1,67 @@
|
|||
/*
|
||||
* Copyright (C) 2013 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H
|
||||
#define LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H
|
||||
|
||||
#include "defines.h"
|
||||
#include "suggest/core/dictionary/binary_dictionary_bigrams_reading_utils.h"
|
||||
#include "suggest/core/dictionary/binary_dictionary_info.h"
|
||||
|
||||
namespace latinime {
|
||||
|
||||
class BinaryDictionaryBigramsIterator {
|
||||
public:
|
||||
BinaryDictionaryBigramsIterator(
|
||||
const BinaryDictionaryInfo *const binaryDictionaryInfo, const int pos)
|
||||
: mBinaryDictionaryInfo(binaryDictionaryInfo), mPos(pos), mBigramFlags(0),
|
||||
mBigramPos(0), mHasNext(true) {}
|
||||
|
||||
AK_FORCE_INLINE bool hasNext() const {
|
||||
return mHasNext;
|
||||
}
|
||||
|
||||
AK_FORCE_INLINE void next() {
|
||||
mBigramFlags = BinaryDictionaryBigramsReadingUtils::getFlagsAndForwardPointer(
|
||||
mBinaryDictionaryInfo, &mPos);
|
||||
mBigramPos = BinaryDictionaryBigramsReadingUtils::getBigramAddressAndForwardPointer(
|
||||
mBinaryDictionaryInfo, mBigramFlags, &mPos);
|
||||
mHasNext = BinaryDictionaryBigramsReadingUtils::hasNext(mBigramFlags);
|
||||
}
|
||||
|
||||
AK_FORCE_INLINE int getProbability() const {
|
||||
return BinaryDictionaryBigramsReadingUtils::getBigramProbability(mBigramFlags);
|
||||
}
|
||||
|
||||
AK_FORCE_INLINE int getBigramPos() const {
|
||||
return mBigramPos;
|
||||
}
|
||||
|
||||
AK_FORCE_INLINE int getFlags() const {
|
||||
return mBigramFlags;
|
||||
}
|
||||
|
||||
private:
|
||||
DISALLOW_COPY_AND_ASSIGN(BinaryDictionaryBigramsIterator);
|
||||
|
||||
const BinaryDictionaryInfo *const mBinaryDictionaryInfo;
|
||||
int mPos;
|
||||
BinaryDictionaryBigramsReadingUtils::BigramFlags mBigramFlags;
|
||||
int mBigramPos;
|
||||
bool mHasNext;
|
||||
};
|
||||
} // namespace latinime
|
||||
#endif // LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H
|
|
@ -0,0 +1,68 @@
|
|||
/*
|
||||
* Copyright (C) 2013 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "suggest/core/dictionary/binary_dictionary_bigrams_reading_utils.h"
|
||||
|
||||
#include "suggest/core/dictionary/binary_dictionary_info.h"
|
||||
#include "suggest/core/dictionary/byte_array_utils.h"
|
||||
|
||||
namespace latinime {
|
||||
|
||||
const BinaryDictionaryBigramsReadingUtils::BigramFlags
|
||||
BinaryDictionaryBigramsReadingUtils::MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
|
||||
const BinaryDictionaryBigramsReadingUtils::BigramFlags
|
||||
BinaryDictionaryBigramsReadingUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
|
||||
const BinaryDictionaryBigramsReadingUtils::BigramFlags
|
||||
BinaryDictionaryBigramsReadingUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
|
||||
const BinaryDictionaryBigramsReadingUtils::BigramFlags
|
||||
BinaryDictionaryBigramsReadingUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
|
||||
const BinaryDictionaryBigramsReadingUtils::BigramFlags
|
||||
BinaryDictionaryBigramsReadingUtils::FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
|
||||
// Flag for presence of more attributes
|
||||
const BinaryDictionaryBigramsReadingUtils::BigramFlags
|
||||
BinaryDictionaryBigramsReadingUtils::FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
|
||||
// Mask for attribute probability, stored on 4 bits inside the flags byte.
|
||||
const BinaryDictionaryBigramsReadingUtils::BigramFlags
|
||||
BinaryDictionaryBigramsReadingUtils::MASK_ATTRIBUTE_PROBABILITY = 0x0F;
|
||||
const int BinaryDictionaryBigramsReadingUtils::ATTRIBUTE_ADDRESS_SHIFT = 4;
|
||||
|
||||
/* static */ int BinaryDictionaryBigramsReadingUtils::getBigramAddressAndForwardPointer(
|
||||
const BinaryDictionaryInfo *const binaryDictionaryInfo, const BigramFlags flags,
|
||||
int *const pos) {
|
||||
int offset = 0;
|
||||
const int origin = *pos;
|
||||
switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
|
||||
case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
|
||||
offset = ByteArrayUtils::readUint8andAdvancePosition(
|
||||
binaryDictionaryInfo->getDictRoot(), pos);
|
||||
break;
|
||||
case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
|
||||
offset = ByteArrayUtils::readUint16andAdvancePosition(
|
||||
binaryDictionaryInfo->getDictRoot(), pos);
|
||||
break;
|
||||
case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
|
||||
offset = ByteArrayUtils::readUint24andAdvancePosition(
|
||||
binaryDictionaryInfo->getDictRoot(), pos);
|
||||
break;
|
||||
}
|
||||
if (isOffsetNegative(flags)) {
|
||||
return origin - offset;
|
||||
} else {
|
||||
return origin + offset;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace latinime
|
|
@ -0,0 +1,90 @@
|
|||
/*
|
||||
* Copyright (C) 2013 The Android Open Source Project
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef LATINIME_BINARY_DICTIONARY_BIGRAM_READING_UTILS_H
|
||||
#define LATINIME_BINARY_DICTIONARY_BIGRAM_READING_UTILS_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "defines.h"
|
||||
#include "suggest/core/dictionary/binary_dictionary_info.h"
|
||||
#include "suggest/core/dictionary/byte_array_utils.h"
|
||||
|
||||
namespace latinime {
|
||||
|
||||
class BinaryDictionaryBigramsReadingUtils {
|
||||
public:
|
||||
typedef uint8_t BigramFlags;
|
||||
|
||||
static AK_FORCE_INLINE void skipExistingBigrams(
|
||||
const BinaryDictionaryInfo *const binaryDictionaryInfo, int *const pos) {
|
||||
BigramFlags flags = getFlagsAndForwardPointer(binaryDictionaryInfo, pos);
|
||||
while (hasNext(flags)) {
|
||||
*pos += attributeAddressSize(flags);
|
||||
flags = getFlagsAndForwardPointer(binaryDictionaryInfo, pos);
|
||||
}
|
||||
*pos += attributeAddressSize(flags);
|
||||
}
|
||||
|
||||
static AK_FORCE_INLINE BigramFlags getFlagsAndForwardPointer(
|
||||
const BinaryDictionaryInfo *const binaryDictionaryInfo, int *const pos) {
|
||||
return ByteArrayUtils::readUint8andAdvancePosition(
|
||||
binaryDictionaryInfo->getDictRoot(), pos);
|
||||
}
|
||||
|
||||
static AK_FORCE_INLINE int getBigramProbability(const BigramFlags flags) {
|
||||
return flags & MASK_ATTRIBUTE_PROBABILITY;
|
||||
}
|
||||
|
||||
static AK_FORCE_INLINE bool isOffsetNegative(const BigramFlags flags) {
|
||||
return (flags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) != 0;
|
||||
}
|
||||
|
||||
static AK_FORCE_INLINE bool hasNext(const BigramFlags flags) {
|
||||
return (flags & FLAG_ATTRIBUTE_HAS_NEXT) != 0;
|
||||
}
|
||||
|
||||
static int getBigramAddressAndForwardPointer(
|
||||
const BinaryDictionaryInfo *const binaryDictionaryInfo,
|
||||
const BigramFlags flags, int *const pos);
|
||||
|
||||
private:
|
||||
DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryDictionaryBigramsReadingUtils);
|
||||
|
||||
static const BigramFlags MASK_ATTRIBUTE_ADDRESS_TYPE;
|
||||
static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE;
|
||||
static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES;
|
||||
static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES;
|
||||
static const BigramFlags FLAG_ATTRIBUTE_OFFSET_NEGATIVE;
|
||||
static const BigramFlags FLAG_ATTRIBUTE_HAS_NEXT;
|
||||
static const BigramFlags MASK_ATTRIBUTE_PROBABILITY;
|
||||
static const int ATTRIBUTE_ADDRESS_SHIFT;
|
||||
|
||||
static AK_FORCE_INLINE int attributeAddressSize(const BigramFlags flags) {
|
||||
return (flags & MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
|
||||
/* Note: this is a value-dependant optimization of what may probably be
|
||||
more readably written this way:
|
||||
switch (flags * BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) {
|
||||
case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
|
||||
case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
|
||||
case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3;
|
||||
default: return 0;
|
||||
}
|
||||
*/
|
||||
}
|
||||
};
|
||||
}
|
||||
#endif /* LATINIME_BINARY_DICTIONARY_BIGRAM_READING_UTILS_H */
|
|
@ -21,7 +21,6 @@
|
|||
|
||||
#include "suggest/core/dictionary/probability_utils.h"
|
||||
#include "utils/char_utils.h"
|
||||
#include "utils/hash_map_compat.h"
|
||||
|
||||
namespace latinime {
|
||||
|
||||
|
@ -81,16 +80,10 @@ class BinaryFormat {
|
|||
const int length, const bool forceLowerCaseSearch);
|
||||
static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth,
|
||||
int *outWord, int *outUnigramProbability);
|
||||
static int getBigramProbabilityFromHashMap(const int position,
|
||||
const hash_map_compat<int, int> *bigramMap, const int unigramProbability);
|
||||
static void fillBigramProbabilityToHashMap(const uint8_t *const root, int position,
|
||||
hash_map_compat<int, int> *bigramMap);
|
||||
static int getBigramProbability(const uint8_t *const root, int position,
|
||||
const int nextPosition, const int unigramProbability);
|
||||
static int getBigramListPositionForWordPosition(const uint8_t *const root, int position);
|
||||
|
||||
private:
|
||||
DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat);
|
||||
static int getBigramListPositionForWordPosition(const uint8_t *const root, int position);
|
||||
|
||||
static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
|
||||
static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
|
||||
|
@ -516,57 +509,6 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co
|
|||
return 0;
|
||||
}
|
||||
|
||||
// This returns a probability in log space.
|
||||
inline int BinaryFormat::getBigramProbabilityFromHashMap(const int position,
|
||||
const hash_map_compat<int, int> *bigramMap, const int unigramProbability) {
|
||||
if (!bigramMap) {
|
||||
return ProbabilityUtils::backoff(unigramProbability);
|
||||
}
|
||||
const hash_map_compat<int, int>::const_iterator bigramProbabilityIt = bigramMap->find(position);
|
||||
if (bigramProbabilityIt != bigramMap->end()) {
|
||||
const int bigramProbability = bigramProbabilityIt->second;
|
||||
return ProbabilityUtils::computeProbabilityForBigram(unigramProbability, bigramProbability);
|
||||
}
|
||||
return ProbabilityUtils::backoff(unigramProbability);
|
||||
}
|
||||
|
||||
AK_FORCE_INLINE void BinaryFormat::fillBigramProbabilityToHashMap(
|
||||
const uint8_t *const root, int position, hash_map_compat<int, int> *bigramMap) {
|
||||
position = getBigramListPositionForWordPosition(root, position);
|
||||
if (0 == position) return;
|
||||
|
||||
uint8_t bigramFlags;
|
||||
do {
|
||||
bigramFlags = getFlagsAndForwardPointer(root, &position);
|
||||
const int probability = MASK_ATTRIBUTE_PROBABILITY & bigramFlags;
|
||||
const int bigramPos = getAttributeAddressAndForwardPointer(root, bigramFlags,
|
||||
&position);
|
||||
(*bigramMap)[bigramPos] = probability;
|
||||
} while (FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
|
||||
}
|
||||
|
||||
AK_FORCE_INLINE int BinaryFormat::getBigramProbability(const uint8_t *const root, int position,
|
||||
const int nextPosition, const int unigramProbability) {
|
||||
position = getBigramListPositionForWordPosition(root, position);
|
||||
if (0 == position) {
|
||||
return ProbabilityUtils::backoff(unigramProbability);
|
||||
}
|
||||
|
||||
uint8_t bigramFlags;
|
||||
do {
|
||||
bigramFlags = getFlagsAndForwardPointer(root, &position);
|
||||
const int bigramPos = getAttributeAddressAndForwardPointer(
|
||||
root, bigramFlags, &position);
|
||||
if (bigramPos == nextPosition) {
|
||||
const int bigramProbability = MASK_ATTRIBUTE_PROBABILITY & bigramFlags;
|
||||
return ProbabilityUtils::computeProbabilityForBigram(
|
||||
unigramProbability, bigramProbability);
|
||||
}
|
||||
} while (FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
|
||||
return ProbabilityUtils::backoff(unigramProbability);
|
||||
}
|
||||
|
||||
// Returns a pointer to the start of the bigram list.
|
||||
AK_FORCE_INLINE int BinaryFormat::getBigramListPositionForWordPosition(
|
||||
const uint8_t *const root, int position) {
|
||||
if (NOT_VALID_WORD == position) return 0;
|
||||
|
|
|
@ -79,7 +79,7 @@ int Dictionary::getSuggestions(ProximityInfo *proximityInfo, DicTraverseSession
|
|||
int Dictionary::getBigrams(const int *word, int length, int *inputCodePoints, int inputSize,
|
||||
int *outWords, int *frequencies, int *outputTypes) const {
|
||||
if (length <= 0) return 0;
|
||||
return mBigramDictionary->getBigrams(word, length, inputCodePoints, inputSize, outWords,
|
||||
return mBigramDictionary->getPredictions(word, length, inputCodePoints, inputSize, outWords,
|
||||
frequencies, outputTypes);
|
||||
}
|
||||
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#define LATINIME_MULTI_BIGRAM_MAP_H
|
||||
|
||||
#include "defines.h"
|
||||
#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
|
||||
#include "suggest/core/dictionary/binary_dictionary_info.h"
|
||||
#include "suggest/core/dictionary/binary_format.h"
|
||||
#include "utils/hash_map_compat.h"
|
||||
|
@ -34,7 +35,7 @@ class MultiBigramMap {
|
|||
|
||||
// Look up the bigram probability for the given word pair from the cached bigram maps.
|
||||
// Also caches the bigrams if there is space remaining and they have not been cached already.
|
||||
int getBigramProbability(const BinaryDictionaryInfo *const binaryDicitonaryInfo,
|
||||
int getBigramProbability(const BinaryDictionaryInfo *const binaryDictionaryInfo,
|
||||
const int wordPosition, const int nextWordPosition, const int unigramProbability) {
|
||||
hash_map_compat<int, BigramMap>::const_iterator mapPosition =
|
||||
mBigramMaps.find(wordPosition);
|
||||
|
@ -42,11 +43,11 @@ class MultiBigramMap {
|
|||
return mapPosition->second.getBigramProbability(nextWordPosition, unigramProbability);
|
||||
}
|
||||
if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) {
|
||||
addBigramsForWordPosition(binaryDicitonaryInfo, wordPosition);
|
||||
addBigramsForWordPosition(binaryDictionaryInfo, wordPosition);
|
||||
return mBigramMaps[wordPosition].getBigramProbability(
|
||||
nextWordPosition, unigramProbability);
|
||||
}
|
||||
return BinaryFormat::getBigramProbability(binaryDicitonaryInfo->getDictRoot(),
|
||||
return readBigramProbabilityFromBinaryDictionary(binaryDictionaryInfo,
|
||||
wordPosition, nextWordPosition, unigramProbability);
|
||||
}
|
||||
|
||||
|
@ -62,15 +63,29 @@ class MultiBigramMap {
|
|||
BigramMap() : mBigramMap(DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP) {}
|
||||
~BigramMap() {}
|
||||
|
||||
void init(const BinaryDictionaryInfo *const binaryDicitonaryInfo, const int position) {
|
||||
BinaryFormat::fillBigramProbabilityToHashMap(
|
||||
binaryDicitonaryInfo->getDictRoot(), position, &mBigramMap);
|
||||
void init(const BinaryDictionaryInfo *const binaryDictionaryInfo, const int nodePos) {
|
||||
const int bigramsListPos = BinaryFormat::getBigramListPositionForWordPosition(
|
||||
binaryDictionaryInfo->getDictRoot(), nodePos);
|
||||
if (0 == bigramsListPos) {
|
||||
return;
|
||||
}
|
||||
for (BinaryDictionaryBigramsIterator bigramsIt(binaryDictionaryInfo, bigramsListPos);
|
||||
bigramsIt.hasNext(); /* no-op */) {
|
||||
bigramsIt.next();
|
||||
mBigramMap[bigramsIt.getBigramPos()] = bigramsIt.getProbability();
|
||||
}
|
||||
}
|
||||
|
||||
inline int getBigramProbability(const int nextWordPosition, const int unigramProbability)
|
||||
const {
|
||||
return BinaryFormat::getBigramProbabilityFromHashMap(
|
||||
nextWordPosition, &mBigramMap, unigramProbability);
|
||||
AK_FORCE_INLINE int getBigramProbability(
|
||||
const int nextWordPosition, const int unigramProbability) const {
|
||||
const hash_map_compat<int, int>::const_iterator bigramProbabilityIt =
|
||||
mBigramMap.find(nextWordPosition);
|
||||
if (bigramProbabilityIt != mBigramMap.end()) {
|
||||
const int bigramProbability = bigramProbabilityIt->second;
|
||||
return ProbabilityUtils::computeProbabilityForBigram(
|
||||
unigramProbability, bigramProbability);
|
||||
}
|
||||
return ProbabilityUtils::backoff(unigramProbability);
|
||||
}
|
||||
|
||||
private:
|
||||
|
@ -78,9 +93,28 @@ class MultiBigramMap {
|
|||
hash_map_compat<int, int> mBigramMap;
|
||||
};
|
||||
|
||||
void addBigramsForWordPosition(const BinaryDictionaryInfo *const binaryDicitonaryInfo,
|
||||
const int position) {
|
||||
mBigramMaps[position].init(binaryDicitonaryInfo, position);
|
||||
AK_FORCE_INLINE void addBigramsForWordPosition(
|
||||
const BinaryDictionaryInfo *const binaryDictionaryInfo, const int position) {
|
||||
mBigramMaps[position].init(binaryDictionaryInfo, position);
|
||||
}
|
||||
|
||||
// Looks up the bigram (nodePos -> nextWordPosition) by walking the bigram list of nodePos
// directly in the binary dictionary, without touching the cached maps.
// Returns the bigram-adjusted probability when the list contains an entry pointing to
// nextWordPosition. Otherwise - including when the list position comes back as 0, which is
// treated here as "no bigram list" - returns the backed-off unigram probability.
AK_FORCE_INLINE int readBigramProbabilityFromBinaryDictionary(
        const BinaryDictionaryInfo *const binaryDictionaryInfo, const int nodePos,
        const int nextWordPosition, const int unigramProbability) {
    const int listPos = BinaryFormat::getBigramListPositionForWordPosition(
            binaryDictionaryInfo->getDictRoot(), nodePos);
    if (0 != listPos) {
        BinaryDictionaryBigramsIterator entryIt(binaryDictionaryInfo, listPos);
        while (entryIt.hasNext()) {
            entryIt.next();
            if (nextWordPosition == entryIt.getBigramPos()) {
                return ProbabilityUtils::computeProbabilityForBigram(
                        unigramProbability, entryIt.getProbability());
            }
        }
    }
    // Either there is no list or no entry matched.
    return ProbabilityUtils::backoff(unigramProbability);
}
|
||||
|
||||
hash_map_compat<int, BigramMap> mBigramMaps;
|
||||
|
|
Loading…
Reference in a new issue