am 4f191935
: Merge "Introduce BinaryDictionaryBigramsIterator to access bigrams attributes in binary dictionaries."
* commit '4f19193560c2eb4ecc9111b6c6daaae83352e649': Introduce BinaryDictionaryBigramsIterator to access bigrams attributes in binary dictionaries.
This commit is contained in:
commit
98c1139b32
10 changed files with 299 additions and 101 deletions
|
@ -53,6 +53,7 @@ LATIN_IME_CORE_SRC_FILES := \
|
||||||
dic_nodes_cache.cpp) \
|
dic_nodes_cache.cpp) \
|
||||||
$(addprefix suggest/core/dictionary/, \
|
$(addprefix suggest/core/dictionary/, \
|
||||||
bigram_dictionary.cpp \
|
bigram_dictionary.cpp \
|
||||||
|
binary_dictionary_bigrams_reading_utils.cpp \
|
||||||
binary_dictionary_format_utils.cpp \
|
binary_dictionary_format_utils.cpp \
|
||||||
binary_dictionary_header.cpp \
|
binary_dictionary_header.cpp \
|
||||||
binary_dictionary_header_reading_utils.cpp \
|
binary_dictionary_header_reading_utils.cpp \
|
||||||
|
|
|
@ -233,8 +233,7 @@ namespace latinime {
|
||||||
return multiBigramMap->getBigramProbability(
|
return multiBigramMap->getBigramProbability(
|
||||||
binaryDictionaryInfo, prevWordPos, wordPos, unigramProbability);
|
binaryDictionaryInfo, prevWordPos, wordPos, unigramProbability);
|
||||||
}
|
}
|
||||||
return BinaryFormat::getBigramProbability(
|
return ProbabilityUtils::backoff(unigramProbability);
|
||||||
binaryDictionaryInfo->getDictRoot(), prevWordPos, wordPos, unigramProbability);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
///////////////////////////////////////
|
///////////////////////////////////////
|
||||||
|
|
|
@ -21,6 +21,7 @@
|
||||||
#include "bigram_dictionary.h"
|
#include "bigram_dictionary.h"
|
||||||
|
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
|
#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
|
||||||
#include "suggest/core/dictionary/binary_dictionary_info.h"
|
#include "suggest/core/dictionary/binary_dictionary_info.h"
|
||||||
#include "suggest/core/dictionary/binary_format.h"
|
#include "suggest/core/dictionary/binary_format.h"
|
||||||
#include "suggest/core/dictionary/dictionary.h"
|
#include "suggest/core/dictionary/dictionary.h"
|
||||||
|
@ -100,12 +101,11 @@ void BigramDictionary::addWordBigram(int *word, int length, int probability, int
|
||||||
* and the bigrams are used to boost unigram result scores, it makes little sense to
|
* and the bigrams are used to boost unigram result scores, it makes little sense to
|
||||||
* reduce their scope to the ones that match the first letter.
|
* reduce their scope to the ones that match the first letter.
|
||||||
*/
|
*/
|
||||||
int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *inputCodePoints,
|
int BigramDictionary::getPredictions(const int *prevWord, int prevWordLength, int *inputCodePoints,
|
||||||
int inputSize, int *bigramCodePoints, int *bigramProbability, int *outputTypes) const {
|
int inputSize, int *bigramCodePoints, int *bigramProbability, int *outputTypes) const {
|
||||||
// TODO: remove unused arguments, and refrain from storing stuff in members of this class
|
// TODO: remove unused arguments, and refrain from storing stuff in members of this class
|
||||||
// TODO: have "in" arguments before "out" ones, and make out args explicit in the name
|
// TODO: have "in" arguments before "out" ones, and make out args explicit in the name
|
||||||
|
|
||||||
const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot();
|
|
||||||
int pos = getBigramListPositionForWord(prevWord, prevWordLength,
|
int pos = getBigramListPositionForWord(prevWord, prevWordLength,
|
||||||
false /* forceLowerCaseSearch */);
|
false /* forceLowerCaseSearch */);
|
||||||
// getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
|
// getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
|
||||||
|
@ -116,21 +116,20 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i
|
||||||
}
|
}
|
||||||
// If still no bigrams, we really don't have them!
|
// If still no bigrams, we really don't have them!
|
||||||
if (0 == pos) return 0;
|
if (0 == pos) return 0;
|
||||||
uint8_t bigramFlags;
|
|
||||||
int bigramCount = 0;
|
int bigramCount = 0;
|
||||||
do {
|
|
||||||
bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
|
||||||
int bigramBuffer[MAX_WORD_LENGTH];
|
|
||||||
int unigramProbability = 0;
|
int unigramProbability = 0;
|
||||||
const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
|
int bigramBuffer[MAX_WORD_LENGTH];
|
||||||
&pos);
|
for (BinaryDictionaryBigramsIterator bigramsIt(mBinaryDictionaryInfo, pos);
|
||||||
const int length = BinaryFormat::getWordAtAddress(root, bigramPos, MAX_WORD_LENGTH,
|
bigramsIt.hasNext(); /* no-op */) {
|
||||||
bigramBuffer, &unigramProbability);
|
bigramsIt.next();
|
||||||
|
const int length = BinaryFormat::getWordAtAddress(
|
||||||
|
mBinaryDictionaryInfo->getDictRoot(), bigramsIt.getBigramPos(),
|
||||||
|
MAX_WORD_LENGTH, bigramBuffer, &unigramProbability);
|
||||||
|
|
||||||
// inputSize == 0 means we are trying to find bigram predictions.
|
// inputSize == 0 means we are trying to find bigram predictions.
|
||||||
if (inputSize < 1 || checkFirstCharacter(bigramBuffer, inputCodePoints)) {
|
if (inputSize < 1 || checkFirstCharacter(bigramBuffer, inputCodePoints)) {
|
||||||
const int bigramProbabilityTemp =
|
const int bigramProbabilityTemp = bigramsIt.getProbability();
|
||||||
BinaryFormat::MASK_ATTRIBUTE_PROBABILITY & bigramFlags;
|
|
||||||
// Due to space constraints, the probability for bigrams is approximate - the lower the
|
// Due to space constraints, the probability for bigrams is approximate - the lower the
|
||||||
// unigram probability, the worse the precision. The theoretical maximum error in
|
// unigram probability, the worse the precision. The theoretical maximum error in
|
||||||
// resulting probability is 8 - although in practice it's never bigger than 3 or 4
|
// resulting probability is 8 - although in practice it's never bigger than 3 or 4
|
||||||
|
@ -142,7 +141,7 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i
|
||||||
outputTypes);
|
outputTypes);
|
||||||
++bigramCount;
|
++bigramCount;
|
||||||
}
|
}
|
||||||
} while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
|
}
|
||||||
return min(bigramCount, MAX_RESULTS);
|
return min(bigramCount, MAX_RESULTS);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -187,22 +186,20 @@ bool BigramDictionary::checkFirstCharacter(int *word, int *inputCodePoints) cons
|
||||||
|
|
||||||
bool BigramDictionary::isValidBigram(const int *word1, int length1, const int *word2,
|
bool BigramDictionary::isValidBigram(const int *word1, int length1, const int *word2,
|
||||||
int length2) const {
|
int length2) const {
|
||||||
const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot();
|
|
||||||
int pos = getBigramListPositionForWord(word1, length1, false /* forceLowerCaseSearch */);
|
int pos = getBigramListPositionForWord(word1, length1, false /* forceLowerCaseSearch */);
|
||||||
// getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
|
// getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
|
||||||
if (0 == pos) return false;
|
if (0 == pos) return false;
|
||||||
int nextWordPos = BinaryFormat::getTerminalPosition(root, word2, length2,
|
int nextWordPos = BinaryFormat::getTerminalPosition(mBinaryDictionaryInfo->getDictRoot(),
|
||||||
false /* forceLowerCaseSearch */);
|
word2, length2, false /* forceLowerCaseSearch */);
|
||||||
if (NOT_VALID_WORD == nextWordPos) return false;
|
if (NOT_VALID_WORD == nextWordPos) return false;
|
||||||
uint8_t bigramFlags;
|
|
||||||
do {
|
for (BinaryDictionaryBigramsIterator bigramsIt(mBinaryDictionaryInfo, pos);
|
||||||
bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
bigramsIt.hasNext(); /* no-op */) {
|
||||||
const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags,
|
bigramsIt.next();
|
||||||
&pos);
|
if (bigramsIt.getBigramPos() == nextWordPos) {
|
||||||
if (bigramPos == nextWordPos) {
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
} while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -27,8 +27,8 @@ class BigramDictionary {
|
||||||
public:
|
public:
|
||||||
BigramDictionary(const BinaryDictionaryInfo *const binaryDictionaryInfo);
|
BigramDictionary(const BinaryDictionaryInfo *const binaryDictionaryInfo);
|
||||||
|
|
||||||
int getBigrams(const int *word, int length, int *inputCodePoints, int inputSize, int *outWords,
|
int getPredictions(const int *word, int length, int *inputCodePoints, int inputSize,
|
||||||
int *frequencies, int *outputTypes) const;
|
int *outWords, int *frequencies, int *outputTypes) const;
|
||||||
bool isValidBigram(const int *word1, int length1, const int *word2, int length2) const;
|
bool isValidBigram(const int *word1, int length1, const int *word2, int length2) const;
|
||||||
~BigramDictionary();
|
~BigramDictionary();
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,67 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2013 The Android Open Source Project
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H
|
||||||
|
#define LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H
|
||||||
|
|
||||||
|
#include "defines.h"
|
||||||
|
#include "suggest/core/dictionary/binary_dictionary_bigrams_reading_utils.h"
|
||||||
|
#include "suggest/core/dictionary/binary_dictionary_info.h"
|
||||||
|
|
||||||
|
namespace latinime {
|
||||||
|
|
||||||
|
class BinaryDictionaryBigramsIterator {
|
||||||
|
public:
|
||||||
|
BinaryDictionaryBigramsIterator(
|
||||||
|
const BinaryDictionaryInfo *const binaryDictionaryInfo, const int pos)
|
||||||
|
: mBinaryDictionaryInfo(binaryDictionaryInfo), mPos(pos), mBigramFlags(0),
|
||||||
|
mBigramPos(0), mHasNext(true) {}
|
||||||
|
|
||||||
|
AK_FORCE_INLINE bool hasNext() const {
|
||||||
|
return mHasNext;
|
||||||
|
}
|
||||||
|
|
||||||
|
AK_FORCE_INLINE void next() {
|
||||||
|
mBigramFlags = BinaryDictionaryBigramsReadingUtils::getFlagsAndForwardPointer(
|
||||||
|
mBinaryDictionaryInfo, &mPos);
|
||||||
|
mBigramPos = BinaryDictionaryBigramsReadingUtils::getBigramAddressAndForwardPointer(
|
||||||
|
mBinaryDictionaryInfo, mBigramFlags, &mPos);
|
||||||
|
mHasNext = BinaryDictionaryBigramsReadingUtils::hasNext(mBigramFlags);
|
||||||
|
}
|
||||||
|
|
||||||
|
AK_FORCE_INLINE int getProbability() const {
|
||||||
|
return BinaryDictionaryBigramsReadingUtils::getBigramProbability(mBigramFlags);
|
||||||
|
}
|
||||||
|
|
||||||
|
AK_FORCE_INLINE int getBigramPos() const {
|
||||||
|
return mBigramPos;
|
||||||
|
}
|
||||||
|
|
||||||
|
AK_FORCE_INLINE int getFlags() const {
|
||||||
|
return mBigramFlags;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
DISALLOW_COPY_AND_ASSIGN(BinaryDictionaryBigramsIterator);
|
||||||
|
|
||||||
|
const BinaryDictionaryInfo *const mBinaryDictionaryInfo;
|
||||||
|
int mPos;
|
||||||
|
BinaryDictionaryBigramsReadingUtils::BigramFlags mBigramFlags;
|
||||||
|
int mBigramPos;
|
||||||
|
bool mHasNext;
|
||||||
|
};
|
||||||
|
} // namespace latinime
|
||||||
|
#endif // LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H
|
|
@ -0,0 +1,68 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2013 The Android Open Source Project
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "suggest/core/dictionary/binary_dictionary_bigrams_reading_utils.h"
|
||||||
|
|
||||||
|
#include "suggest/core/dictionary/binary_dictionary_info.h"
|
||||||
|
#include "suggest/core/dictionary/byte_array_utils.h"
|
||||||
|
|
||||||
|
namespace latinime {
|
||||||
|
|
||||||
|
const BinaryDictionaryBigramsReadingUtils::BigramFlags
|
||||||
|
BinaryDictionaryBigramsReadingUtils::MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
|
||||||
|
const BinaryDictionaryBigramsReadingUtils::BigramFlags
|
||||||
|
BinaryDictionaryBigramsReadingUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
|
||||||
|
const BinaryDictionaryBigramsReadingUtils::BigramFlags
|
||||||
|
BinaryDictionaryBigramsReadingUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
|
||||||
|
const BinaryDictionaryBigramsReadingUtils::BigramFlags
|
||||||
|
BinaryDictionaryBigramsReadingUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
|
||||||
|
const BinaryDictionaryBigramsReadingUtils::BigramFlags
|
||||||
|
BinaryDictionaryBigramsReadingUtils::FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
|
||||||
|
// Flag for presence of more attributes
|
||||||
|
const BinaryDictionaryBigramsReadingUtils::BigramFlags
|
||||||
|
BinaryDictionaryBigramsReadingUtils::FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
|
||||||
|
// Mask for attribute probability, stored on 4 bits inside the flags byte.
|
||||||
|
const BinaryDictionaryBigramsReadingUtils::BigramFlags
|
||||||
|
BinaryDictionaryBigramsReadingUtils::MASK_ATTRIBUTE_PROBABILITY = 0x0F;
|
||||||
|
const int BinaryDictionaryBigramsReadingUtils::ATTRIBUTE_ADDRESS_SHIFT = 4;
|
||||||
|
|
||||||
|
/* static */ int BinaryDictionaryBigramsReadingUtils::getBigramAddressAndForwardPointer(
|
||||||
|
const BinaryDictionaryInfo *const binaryDictionaryInfo, const BigramFlags flags,
|
||||||
|
int *const pos) {
|
||||||
|
int offset = 0;
|
||||||
|
const int origin = *pos;
|
||||||
|
switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
|
||||||
|
case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
|
||||||
|
offset = ByteArrayUtils::readUint8andAdvancePosition(
|
||||||
|
binaryDictionaryInfo->getDictRoot(), pos);
|
||||||
|
break;
|
||||||
|
case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
|
||||||
|
offset = ByteArrayUtils::readUint16andAdvancePosition(
|
||||||
|
binaryDictionaryInfo->getDictRoot(), pos);
|
||||||
|
break;
|
||||||
|
case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
|
||||||
|
offset = ByteArrayUtils::readUint24andAdvancePosition(
|
||||||
|
binaryDictionaryInfo->getDictRoot(), pos);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (isOffsetNegative(flags)) {
|
||||||
|
return origin - offset;
|
||||||
|
} else {
|
||||||
|
return origin + offset;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace latinime
|
|
@ -0,0 +1,90 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2013 The Android Open Source Project
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef LATINIME_BINARY_DICTIONARY_BIGRAM_READING_UTILS_H
|
||||||
|
#define LATINIME_BINARY_DICTIONARY_BIGRAM_READING_UTILS_H
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#include "defines.h"
|
||||||
|
#include "suggest/core/dictionary/binary_dictionary_info.h"
|
||||||
|
#include "suggest/core/dictionary/byte_array_utils.h"
|
||||||
|
|
||||||
|
namespace latinime {
|
||||||
|
|
||||||
|
class BinaryDictionaryBigramsReadingUtils {
|
||||||
|
public:
|
||||||
|
typedef uint8_t BigramFlags;
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE void skipExistingBigrams(
|
||||||
|
const BinaryDictionaryInfo *const binaryDictionaryInfo, int *const pos) {
|
||||||
|
BigramFlags flags = getFlagsAndForwardPointer(binaryDictionaryInfo, pos);
|
||||||
|
while (hasNext(flags)) {
|
||||||
|
*pos += attributeAddressSize(flags);
|
||||||
|
flags = getFlagsAndForwardPointer(binaryDictionaryInfo, pos);
|
||||||
|
}
|
||||||
|
*pos += attributeAddressSize(flags);
|
||||||
|
}
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE BigramFlags getFlagsAndForwardPointer(
|
||||||
|
const BinaryDictionaryInfo *const binaryDictionaryInfo, int *const pos) {
|
||||||
|
return ByteArrayUtils::readUint8andAdvancePosition(
|
||||||
|
binaryDictionaryInfo->getDictRoot(), pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE int getBigramProbability(const BigramFlags flags) {
|
||||||
|
return flags & MASK_ATTRIBUTE_PROBABILITY;
|
||||||
|
}
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE bool isOffsetNegative(const BigramFlags flags) {
|
||||||
|
return (flags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE bool hasNext(const BigramFlags flags) {
|
||||||
|
return (flags & FLAG_ATTRIBUTE_HAS_NEXT) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int getBigramAddressAndForwardPointer(
|
||||||
|
const BinaryDictionaryInfo *const binaryDictionaryInfo,
|
||||||
|
const BigramFlags flags, int *const pos);
|
||||||
|
|
||||||
|
private:
|
||||||
|
DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryDictionaryBigramsReadingUtils);
|
||||||
|
|
||||||
|
static const BigramFlags MASK_ATTRIBUTE_ADDRESS_TYPE;
|
||||||
|
static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE;
|
||||||
|
static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES;
|
||||||
|
static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES;
|
||||||
|
static const BigramFlags FLAG_ATTRIBUTE_OFFSET_NEGATIVE;
|
||||||
|
static const BigramFlags FLAG_ATTRIBUTE_HAS_NEXT;
|
||||||
|
static const BigramFlags MASK_ATTRIBUTE_PROBABILITY;
|
||||||
|
static const int ATTRIBUTE_ADDRESS_SHIFT;
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE int attributeAddressSize(const BigramFlags flags) {
|
||||||
|
return (flags & MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT;
|
||||||
|
/* Note: this is a value-dependent optimization of what may probably be
|
||||||
|
more readably written this way:
|
||||||
|
switch (flags & MASK_ATTRIBUTE_ADDRESS_TYPE) {
|
||||||
|
case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1;
|
||||||
|
case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2;
|
||||||
|
case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: return 3;
|
||||||
|
default: return 0;
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
#endif /* LATINIME_BINARY_DICTIONARY_BIGRAM_READING_UTILS_H */
|
|
@ -21,7 +21,6 @@
|
||||||
|
|
||||||
#include "suggest/core/dictionary/probability_utils.h"
|
#include "suggest/core/dictionary/probability_utils.h"
|
||||||
#include "utils/char_utils.h"
|
#include "utils/char_utils.h"
|
||||||
#include "utils/hash_map_compat.h"
|
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
|
@ -81,16 +80,10 @@ class BinaryFormat {
|
||||||
const int length, const bool forceLowerCaseSearch);
|
const int length, const bool forceLowerCaseSearch);
|
||||||
static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth,
|
static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth,
|
||||||
int *outWord, int *outUnigramProbability);
|
int *outWord, int *outUnigramProbability);
|
||||||
static int getBigramProbabilityFromHashMap(const int position,
|
static int getBigramListPositionForWordPosition(const uint8_t *const root, int position);
|
||||||
const hash_map_compat<int, int> *bigramMap, const int unigramProbability);
|
|
||||||
static void fillBigramProbabilityToHashMap(const uint8_t *const root, int position,
|
|
||||||
hash_map_compat<int, int> *bigramMap);
|
|
||||||
static int getBigramProbability(const uint8_t *const root, int position,
|
|
||||||
const int nextPosition, const int unigramProbability);
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat);
|
||||||
static int getBigramListPositionForWordPosition(const uint8_t *const root, int position);
|
|
||||||
|
|
||||||
static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
|
static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
|
||||||
static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
|
static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
|
||||||
|
@ -516,57 +509,6 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// This returns a probability in log space.
|
|
||||||
inline int BinaryFormat::getBigramProbabilityFromHashMap(const int position,
|
|
||||||
const hash_map_compat<int, int> *bigramMap, const int unigramProbability) {
|
|
||||||
if (!bigramMap) {
|
|
||||||
return ProbabilityUtils::backoff(unigramProbability);
|
|
||||||
}
|
|
||||||
const hash_map_compat<int, int>::const_iterator bigramProbabilityIt = bigramMap->find(position);
|
|
||||||
if (bigramProbabilityIt != bigramMap->end()) {
|
|
||||||
const int bigramProbability = bigramProbabilityIt->second;
|
|
||||||
return ProbabilityUtils::computeProbabilityForBigram(unigramProbability, bigramProbability);
|
|
||||||
}
|
|
||||||
return ProbabilityUtils::backoff(unigramProbability);
|
|
||||||
}
|
|
||||||
|
|
||||||
AK_FORCE_INLINE void BinaryFormat::fillBigramProbabilityToHashMap(
|
|
||||||
const uint8_t *const root, int position, hash_map_compat<int, int> *bigramMap) {
|
|
||||||
position = getBigramListPositionForWordPosition(root, position);
|
|
||||||
if (0 == position) return;
|
|
||||||
|
|
||||||
uint8_t bigramFlags;
|
|
||||||
do {
|
|
||||||
bigramFlags = getFlagsAndForwardPointer(root, &position);
|
|
||||||
const int probability = MASK_ATTRIBUTE_PROBABILITY & bigramFlags;
|
|
||||||
const int bigramPos = getAttributeAddressAndForwardPointer(root, bigramFlags,
|
|
||||||
&position);
|
|
||||||
(*bigramMap)[bigramPos] = probability;
|
|
||||||
} while (FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
|
|
||||||
}
|
|
||||||
|
|
||||||
AK_FORCE_INLINE int BinaryFormat::getBigramProbability(const uint8_t *const root, int position,
|
|
||||||
const int nextPosition, const int unigramProbability) {
|
|
||||||
position = getBigramListPositionForWordPosition(root, position);
|
|
||||||
if (0 == position) {
|
|
||||||
return ProbabilityUtils::backoff(unigramProbability);
|
|
||||||
}
|
|
||||||
|
|
||||||
uint8_t bigramFlags;
|
|
||||||
do {
|
|
||||||
bigramFlags = getFlagsAndForwardPointer(root, &position);
|
|
||||||
const int bigramPos = getAttributeAddressAndForwardPointer(
|
|
||||||
root, bigramFlags, &position);
|
|
||||||
if (bigramPos == nextPosition) {
|
|
||||||
const int bigramProbability = MASK_ATTRIBUTE_PROBABILITY & bigramFlags;
|
|
||||||
return ProbabilityUtils::computeProbabilityForBigram(
|
|
||||||
unigramProbability, bigramProbability);
|
|
||||||
}
|
|
||||||
} while (FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags);
|
|
||||||
return ProbabilityUtils::backoff(unigramProbability);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Returns a pointer to the start of the bigram list.
|
|
||||||
AK_FORCE_INLINE int BinaryFormat::getBigramListPositionForWordPosition(
|
AK_FORCE_INLINE int BinaryFormat::getBigramListPositionForWordPosition(
|
||||||
const uint8_t *const root, int position) {
|
const uint8_t *const root, int position) {
|
||||||
if (NOT_VALID_WORD == position) return 0;
|
if (NOT_VALID_WORD == position) return 0;
|
||||||
|
|
|
@ -79,7 +79,7 @@ int Dictionary::getSuggestions(ProximityInfo *proximityInfo, DicTraverseSession
|
||||||
int Dictionary::getBigrams(const int *word, int length, int *inputCodePoints, int inputSize,
|
int Dictionary::getBigrams(const int *word, int length, int *inputCodePoints, int inputSize,
|
||||||
int *outWords, int *frequencies, int *outputTypes) const {
|
int *outWords, int *frequencies, int *outputTypes) const {
|
||||||
if (length <= 0) return 0;
|
if (length <= 0) return 0;
|
||||||
return mBigramDictionary->getBigrams(word, length, inputCodePoints, inputSize, outWords,
|
return mBigramDictionary->getPredictions(word, length, inputCodePoints, inputSize, outWords,
|
||||||
frequencies, outputTypes);
|
frequencies, outputTypes);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -18,6 +18,7 @@
|
||||||
#define LATINIME_MULTI_BIGRAM_MAP_H
|
#define LATINIME_MULTI_BIGRAM_MAP_H
|
||||||
|
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
|
#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
|
||||||
#include "suggest/core/dictionary/binary_dictionary_info.h"
|
#include "suggest/core/dictionary/binary_dictionary_info.h"
|
||||||
#include "suggest/core/dictionary/binary_format.h"
|
#include "suggest/core/dictionary/binary_format.h"
|
||||||
#include "utils/hash_map_compat.h"
|
#include "utils/hash_map_compat.h"
|
||||||
|
@ -34,7 +35,7 @@ class MultiBigramMap {
|
||||||
|
|
||||||
// Look up the bigram probability for the given word pair from the cached bigram maps.
|
// Look up the bigram probability for the given word pair from the cached bigram maps.
|
||||||
// Also caches the bigrams if there is space remaining and they have not been cached already.
|
// Also caches the bigrams if there is space remaining and they have not been cached already.
|
||||||
int getBigramProbability(const BinaryDictionaryInfo *const binaryDicitonaryInfo,
|
int getBigramProbability(const BinaryDictionaryInfo *const binaryDictionaryInfo,
|
||||||
const int wordPosition, const int nextWordPosition, const int unigramProbability) {
|
const int wordPosition, const int nextWordPosition, const int unigramProbability) {
|
||||||
hash_map_compat<int, BigramMap>::const_iterator mapPosition =
|
hash_map_compat<int, BigramMap>::const_iterator mapPosition =
|
||||||
mBigramMaps.find(wordPosition);
|
mBigramMaps.find(wordPosition);
|
||||||
|
@ -42,11 +43,11 @@ class MultiBigramMap {
|
||||||
return mapPosition->second.getBigramProbability(nextWordPosition, unigramProbability);
|
return mapPosition->second.getBigramProbability(nextWordPosition, unigramProbability);
|
||||||
}
|
}
|
||||||
if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) {
|
if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) {
|
||||||
addBigramsForWordPosition(binaryDicitonaryInfo, wordPosition);
|
addBigramsForWordPosition(binaryDictionaryInfo, wordPosition);
|
||||||
return mBigramMaps[wordPosition].getBigramProbability(
|
return mBigramMaps[wordPosition].getBigramProbability(
|
||||||
nextWordPosition, unigramProbability);
|
nextWordPosition, unigramProbability);
|
||||||
}
|
}
|
||||||
return BinaryFormat::getBigramProbability(binaryDicitonaryInfo->getDictRoot(),
|
return readBigramProbabilityFromBinaryDictionary(binaryDictionaryInfo,
|
||||||
wordPosition, nextWordPosition, unigramProbability);
|
wordPosition, nextWordPosition, unigramProbability);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -62,15 +63,29 @@ class MultiBigramMap {
|
||||||
BigramMap() : mBigramMap(DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP) {}
|
BigramMap() : mBigramMap(DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP) {}
|
||||||
~BigramMap() {}
|
~BigramMap() {}
|
||||||
|
|
||||||
void init(const BinaryDictionaryInfo *const binaryDicitonaryInfo, const int position) {
|
void init(const BinaryDictionaryInfo *const binaryDictionaryInfo, const int nodePos) {
|
||||||
BinaryFormat::fillBigramProbabilityToHashMap(
|
const int bigramsListPos = BinaryFormat::getBigramListPositionForWordPosition(
|
||||||
binaryDicitonaryInfo->getDictRoot(), position, &mBigramMap);
|
binaryDictionaryInfo->getDictRoot(), nodePos);
|
||||||
|
if (0 == bigramsListPos) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
for (BinaryDictionaryBigramsIterator bigramsIt(binaryDictionaryInfo, bigramsListPos);
|
||||||
|
bigramsIt.hasNext(); /* no-op */) {
|
||||||
|
bigramsIt.next();
|
||||||
|
mBigramMap[bigramsIt.getBigramPos()] = bigramsIt.getProbability();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
inline int getBigramProbability(const int nextWordPosition, const int unigramProbability)
|
AK_FORCE_INLINE int getBigramProbability(
|
||||||
const {
|
const int nextWordPosition, const int unigramProbability) const {
|
||||||
return BinaryFormat::getBigramProbabilityFromHashMap(
|
const hash_map_compat<int, int>::const_iterator bigramProbabilityIt =
|
||||||
nextWordPosition, &mBigramMap, unigramProbability);
|
mBigramMap.find(nextWordPosition);
|
||||||
|
if (bigramProbabilityIt != mBigramMap.end()) {
|
||||||
|
const int bigramProbability = bigramProbabilityIt->second;
|
||||||
|
return ProbabilityUtils::computeProbabilityForBigram(
|
||||||
|
unigramProbability, bigramProbability);
|
||||||
|
}
|
||||||
|
return ProbabilityUtils::backoff(unigramProbability);
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -78,9 +93,28 @@ class MultiBigramMap {
|
||||||
hash_map_compat<int, int> mBigramMap;
|
hash_map_compat<int, int> mBigramMap;
|
||||||
};
|
};
|
||||||
|
|
||||||
void addBigramsForWordPosition(const BinaryDictionaryInfo *const binaryDicitonaryInfo,
|
AK_FORCE_INLINE void addBigramsForWordPosition(
|
||||||
const int position) {
|
const BinaryDictionaryInfo *const binaryDictionaryInfo, const int position) {
|
||||||
mBigramMaps[position].init(binaryDicitonaryInfo, position);
|
mBigramMaps[position].init(binaryDictionaryInfo, position);
|
||||||
|
}
|
||||||
|
|
||||||
|
AK_FORCE_INLINE int readBigramProbabilityFromBinaryDictionary(
|
||||||
|
const BinaryDictionaryInfo *const binaryDictionaryInfo, const int nodePos,
|
||||||
|
const int nextWordPosition, const int unigramProbability) {
|
||||||
|
const int bigramsListPos = BinaryFormat::getBigramListPositionForWordPosition(
|
||||||
|
binaryDictionaryInfo->getDictRoot(), nodePos);
|
||||||
|
if (0 == bigramsListPos) {
|
||||||
|
return ProbabilityUtils::backoff(unigramProbability);
|
||||||
|
}
|
||||||
|
for (BinaryDictionaryBigramsIterator bigramsIt(binaryDictionaryInfo, bigramsListPos);
|
||||||
|
bigramsIt.hasNext(); /* no-op */) {
|
||||||
|
bigramsIt.next();
|
||||||
|
if (bigramsIt.getBigramPos() == nextWordPosition) {
|
||||||
|
return ProbabilityUtils::computeProbabilityForBigram(
|
||||||
|
unigramProbability, bigramsIt.getProbability());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ProbabilityUtils::backoff(unigramProbability);
|
||||||
}
|
}
|
||||||
|
|
||||||
hash_map_compat<int, BigramMap> mBigramMaps;
|
hash_map_compat<int, BigramMap> mBigramMaps;
|
||||||
|
|
Loading…
Reference in a new issue