From a71ed8caa27c4a0174f25750171282980bc26880 Mon Sep 17 00:00:00 2001 From: Keisuke Kuroynagi Date: Fri, 14 Jun 2013 18:12:56 +0900 Subject: [PATCH] Introduce BinaryDictionaryBigramsIterator to access bigrams attributes in binary dictionaries. Bug: 6669677 Change-Id: Ifb1adebc5305a930c80396f6b4ec31d84400a9dc --- native/jni/Android.mk | 1 + .../suggest/core/dicnode/dic_node_utils.cpp | 3 +- .../core/dictionary/bigram_dictionary.cpp | 45 +++++----- .../core/dictionary/bigram_dictionary.h | 4 +- .../binary_dictionary_bigrams_iterator.h | 67 ++++++++++++++ ...inary_dictionary_bigrams_reading_utils.cpp | 68 ++++++++++++++ .../binary_dictionary_bigrams_reading_utils.h | 90 +++++++++++++++++++ .../suggest/core/dictionary/binary_format.h | 60 +------------ .../suggest/core/dictionary/dictionary.cpp | 2 +- .../core/dictionary/multi_bigram_map.h | 60 ++++++++++--- 10 files changed, 299 insertions(+), 101 deletions(-) create mode 100644 native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_iterator.h create mode 100644 native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_reading_utils.cpp create mode 100644 native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_reading_utils.h diff --git a/native/jni/Android.mk b/native/jni/Android.mk index 1cdfbe4d1..9db50473d 100644 --- a/native/jni/Android.mk +++ b/native/jni/Android.mk @@ -53,6 +53,7 @@ LATIN_IME_CORE_SRC_FILES := \ dic_nodes_cache.cpp) \ $(addprefix suggest/core/dictionary/, \ bigram_dictionary.cpp \ + binary_dictionary_bigrams_reading_utils.cpp \ binary_dictionary_format_utils.cpp \ binary_dictionary_header.cpp \ binary_dictionary_header_reading_utils.cpp \ diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp index 3deee1a42..f0f26c72b 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp +++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp @@ -233,8 +233,7 @@ namespace latinime { return 
multiBigramMap->getBigramProbability( binaryDictionaryInfo, prevWordPos, wordPos, unigramProbability); } - return BinaryFormat::getBigramProbability( - binaryDictionaryInfo->getDictRoot(), prevWordPos, wordPos, unigramProbability); + return ProbabilityUtils::backoff(unigramProbability); } /////////////////////////////////////// diff --git a/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp b/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp index 53e2df62d..6e02100fc 100644 --- a/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp @@ -21,6 +21,7 @@ #include "bigram_dictionary.h" #include "defines.h" +#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" #include "suggest/core/dictionary/binary_dictionary_info.h" #include "suggest/core/dictionary/binary_format.h" #include "suggest/core/dictionary/dictionary.h" @@ -100,12 +101,11 @@ void BigramDictionary::addWordBigram(int *word, int length, int probability, int * and the bigrams are used to boost unigram result scores, it makes little sense to * reduce their scope to the ones that match the first letter. 
*/ -int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *inputCodePoints, +int BigramDictionary::getPredictions(const int *prevWord, int prevWordLength, int *inputCodePoints, int inputSize, int *bigramCodePoints, int *bigramProbability, int *outputTypes) const { // TODO: remove unused arguments, and refrain from storing stuff in members of this class // TODO: have "in" arguments before "out" ones, and make out args explicit in the name - const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot(); int pos = getBigramListPositionForWord(prevWord, prevWordLength, false /* forceLowerCaseSearch */); // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams @@ -116,21 +116,20 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i } // If still no bigrams, we really don't have them! if (0 == pos) return 0; - uint8_t bigramFlags; + int bigramCount = 0; - do { - bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); - int bigramBuffer[MAX_WORD_LENGTH]; - int unigramProbability = 0; - const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, - &pos); - const int length = BinaryFormat::getWordAtAddress(root, bigramPos, MAX_WORD_LENGTH, - bigramBuffer, &unigramProbability); + int unigramProbability = 0; + int bigramBuffer[MAX_WORD_LENGTH]; + for (BinaryDictionaryBigramsIterator bigramsIt(mBinaryDictionaryInfo, pos); + bigramsIt.hasNext(); /* no-op */) { + bigramsIt.next(); + const int length = BinaryFormat::getWordAtAddress( + mBinaryDictionaryInfo->getDictRoot(), bigramsIt.getBigramPos(), + MAX_WORD_LENGTH, bigramBuffer, &unigramProbability); // inputSize == 0 means we are trying to find bigram predictions. 
if (inputSize < 1 || checkFirstCharacter(bigramBuffer, inputCodePoints)) { - const int bigramProbabilityTemp = - BinaryFormat::MASK_ATTRIBUTE_PROBABILITY & bigramFlags; + const int bigramProbabilityTemp = bigramsIt.getProbability(); // Due to space constraints, the probability for bigrams is approximate - the lower the // unigram probability, the worse the precision. The theoritical maximum error in // resulting probability is 8 - although in the practice it's never bigger than 3 or 4 @@ -142,7 +141,7 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i outputTypes); ++bigramCount; } - } while (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); + } return min(bigramCount, MAX_RESULTS); } @@ -187,22 +186,20 @@ bool BigramDictionary::checkFirstCharacter(int *word, int *inputCodePoints) cons bool BigramDictionary::isValidBigram(const int *word1, int length1, const int *word2, int length2) const { - const uint8_t *const root = mBinaryDictionaryInfo->getDictRoot(); int pos = getBigramListPositionForWord(word1, length1, false /* forceLowerCaseSearch */); // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams if (0 == pos) return false; - int nextWordPos = BinaryFormat::getTerminalPosition(root, word2, length2, - false /* forceLowerCaseSearch */); + int nextWordPos = BinaryFormat::getTerminalPosition(mBinaryDictionaryInfo->getDictRoot(), + word2, length2, false /* forceLowerCaseSearch */); if (NOT_VALID_WORD == nextWordPos) return false; - uint8_t bigramFlags; - do { - bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); - const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, - &pos); - if (bigramPos == nextWordPos) { + + for (BinaryDictionaryBigramsIterator bigramsIt(mBinaryDictionaryInfo, pos); + bigramsIt.hasNext(); /* no-op */) { + bigramsIt.next(); + if (bigramsIt.getBigramPos() == nextWordPos) { return true; } - } while 
(BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); + } return false; } diff --git a/native/jni/src/suggest/core/dictionary/bigram_dictionary.h b/native/jni/src/suggest/core/dictionary/bigram_dictionary.h index 06d0e9da3..7706a2c22 100644 --- a/native/jni/src/suggest/core/dictionary/bigram_dictionary.h +++ b/native/jni/src/suggest/core/dictionary/bigram_dictionary.h @@ -27,8 +27,8 @@ class BigramDictionary { public: BigramDictionary(const BinaryDictionaryInfo *const binaryDictionaryInfo); - int getBigrams(const int *word, int length, int *inputCodePoints, int inputSize, int *outWords, - int *frequencies, int *outputTypes) const; + int getPredictions(const int *word, int length, int *inputCodePoints, int inputSize, + int *outWords, int *frequencies, int *outputTypes) const; bool isValidBigram(const int *word1, int length1, const int *word2, int length2) const; ~BigramDictionary(); diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_iterator.h b/native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_iterator.h new file mode 100644 index 000000000..0856840b2 --- /dev/null +++ b/native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_iterator.h @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H +#define LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H + +#include "defines.h" +#include "suggest/core/dictionary/binary_dictionary_bigrams_reading_utils.h" +#include "suggest/core/dictionary/binary_dictionary_info.h" + +namespace latinime { + +class BinaryDictionaryBigramsIterator { + public: + BinaryDictionaryBigramsIterator( + const BinaryDictionaryInfo *const binaryDictionaryInfo, const int pos) + : mBinaryDictionaryInfo(binaryDictionaryInfo), mPos(pos), mBigramFlags(0), + mBigramPos(0), mHasNext(true) {} + + AK_FORCE_INLINE bool hasNext() const { + return mHasNext; + } + + AK_FORCE_INLINE void next() { + mBigramFlags = BinaryDictionaryBigramsReadingUtils::getFlagsAndForwardPointer( + mBinaryDictionaryInfo, &mPos); + mBigramPos = BinaryDictionaryBigramsReadingUtils::getBigramAddressAndForwardPointer( + mBinaryDictionaryInfo, mBigramFlags, &mPos); + mHasNext = BinaryDictionaryBigramsReadingUtils::hasNext(mBigramFlags); + } + + AK_FORCE_INLINE int getProbability() const { + return BinaryDictionaryBigramsReadingUtils::getBigramProbability(mBigramFlags); + } + + AK_FORCE_INLINE int getBigramPos() const { + return mBigramPos; + } + + AK_FORCE_INLINE int getFlags() const { + return mBigramFlags; + } + + private: + DISALLOW_COPY_AND_ASSIGN(BinaryDictionaryBigramsIterator); + + const BinaryDictionaryInfo *const mBinaryDictionaryInfo; + int mPos; + BinaryDictionaryBigramsReadingUtils::BigramFlags mBigramFlags; + int mBigramPos; + bool mHasNext; +}; +} // namespace latinime +#endif // LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_reading_utils.cpp b/native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_reading_utils.cpp new file mode 100644 index 000000000..78a54b141 --- /dev/null +++ b/native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_reading_utils.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2013 The Android 
Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/dictionary/binary_dictionary_bigrams_reading_utils.h" + +#include "suggest/core/dictionary/binary_dictionary_info.h" +#include "suggest/core/dictionary/byte_array_utils.h" + +namespace latinime { + +const BinaryDictionaryBigramsReadingUtils::BigramFlags + BinaryDictionaryBigramsReadingUtils::MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30; +const BinaryDictionaryBigramsReadingUtils::BigramFlags + BinaryDictionaryBigramsReadingUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10; +const BinaryDictionaryBigramsReadingUtils::BigramFlags + BinaryDictionaryBigramsReadingUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; +const BinaryDictionaryBigramsReadingUtils::BigramFlags + BinaryDictionaryBigramsReadingUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; +const BinaryDictionaryBigramsReadingUtils::BigramFlags + BinaryDictionaryBigramsReadingUtils::FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; +// Flag for presence of more attributes +const BinaryDictionaryBigramsReadingUtils::BigramFlags + BinaryDictionaryBigramsReadingUtils::FLAG_ATTRIBUTE_HAS_NEXT = 0x80; +// Mask for attribute probability, stored on 4 bits inside the flags byte. 
+const BinaryDictionaryBigramsReadingUtils::BigramFlags + BinaryDictionaryBigramsReadingUtils::MASK_ATTRIBUTE_PROBABILITY = 0x0F; +const int BinaryDictionaryBigramsReadingUtils::ATTRIBUTE_ADDRESS_SHIFT = 4; + +/* static */ int BinaryDictionaryBigramsReadingUtils::getBigramAddressAndForwardPointer( + const BinaryDictionaryInfo *const binaryDictionaryInfo, const BigramFlags flags, + int *const pos) { + int offset = 0; + const int origin = *pos; + switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) { + case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: + offset = ByteArrayUtils::readUint8andAdvancePosition( + binaryDictionaryInfo->getDictRoot(), pos); + break; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: + offset = ByteArrayUtils::readUint16andAdvancePosition( + binaryDictionaryInfo->getDictRoot(), pos); + break; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: + offset = ByteArrayUtils::readUint24andAdvancePosition( + binaryDictionaryInfo->getDictRoot(), pos); + break; + } + if (isOffsetNegative(flags)) { + return origin - offset; + } else { + return origin + offset; + } +} + +} // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_reading_utils.h b/native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_reading_utils.h new file mode 100644 index 000000000..e71f2a17a --- /dev/null +++ b/native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_reading_utils.h @@ -0,0 +1,90 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BINARY_DICTIONARY_BIGRAM_READING_UTILS_H +#define LATINIME_BINARY_DICTIONARY_BIGRAM_READING_UTILS_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/core/dictionary/binary_dictionary_info.h" +#include "suggest/core/dictionary/byte_array_utils.h" + +namespace latinime { + +class BinaryDictionaryBigramsReadingUtils { + public: + typedef uint8_t BigramFlags; + + static AK_FORCE_INLINE void skipExistingBigrams( + const BinaryDictionaryInfo *const binaryDictionaryInfo, int *const pos) { + BigramFlags flags = getFlagsAndForwardPointer(binaryDictionaryInfo, pos); + while (hasNext(flags)) { + *pos += attributeAddressSize(flags); + flags = getFlagsAndForwardPointer(binaryDictionaryInfo, pos); + } + *pos += attributeAddressSize(flags); + } + + static AK_FORCE_INLINE BigramFlags getFlagsAndForwardPointer( + const BinaryDictionaryInfo *const binaryDictionaryInfo, int *const pos) { + return ByteArrayUtils::readUint8andAdvancePosition( + binaryDictionaryInfo->getDictRoot(), pos); + } + + static AK_FORCE_INLINE int getBigramProbability(const BigramFlags flags) { + return flags & MASK_ATTRIBUTE_PROBABILITY; + } + + static AK_FORCE_INLINE bool isOffsetNegative(const BigramFlags flags) { + return (flags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) != 0; + } + + static AK_FORCE_INLINE bool hasNext(const BigramFlags flags) { + return (flags & FLAG_ATTRIBUTE_HAS_NEXT) != 0; + } + + static int getBigramAddressAndForwardPointer( + const BinaryDictionaryInfo *const binaryDictionaryInfo, + const BigramFlags flags, int *const pos); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryDictionaryBigramsReadingUtils); + + static const BigramFlags MASK_ATTRIBUTE_ADDRESS_TYPE; + static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE; + static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES; + static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; +
static const BigramFlags FLAG_ATTRIBUTE_OFFSET_NEGATIVE; + static const BigramFlags FLAG_ATTRIBUTE_HAS_NEXT; + static const BigramFlags MASK_ATTRIBUTE_PROBABILITY; + static const int ATTRIBUTE_ADDRESS_SHIFT; + + static AK_FORCE_INLINE int attributeAddressSize(const BigramFlags flags) { + return (flags & MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT; + /* Note: this is a value-dependent optimization of what may probably be + more readably written this way: + switch (flags & BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) { + case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: return 3; + default: return 0; + } + */ + } +}; +} +#endif /* LATINIME_BINARY_DICTIONARY_BIGRAM_READING_UTILS_H */ diff --git a/native/jni/src/suggest/core/dictionary/binary_format.h b/native/jni/src/suggest/core/dictionary/binary_format.h index 0a290d80a..df0ec480d 100644 --- a/native/jni/src/suggest/core/dictionary/binary_format.h +++ b/native/jni/src/suggest/core/dictionary/binary_format.h @@ -21,7 +21,6 @@ #include "suggest/core/dictionary/probability_utils.h" #include "utils/char_utils.h" -#include "utils/hash_map_compat.h" namespace latinime { @@ -81,16 +80,10 @@ class BinaryFormat { const int length, const bool forceLowerCaseSearch); static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth, int *outWord, int *outUnigramProbability); - static int getBigramProbabilityFromHashMap(const int position, - const hash_map_compat<int, int> *bigramMap, const int unigramProbability); - static void fillBigramProbabilityToHashMap(const uint8_t *const root, int position, - hash_map_compat<int, int> *bigramMap); - static int getBigramProbability(const uint8_t *const root, int position, - const int nextPosition, const int unigramProbability); + static int getBigramListPositionForWordPosition(const uint8_t *const root, int position); private:
DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat); - static int getBigramListPositionForWordPosition(const uint8_t *const root, int position); static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00; static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40; @@ -516,57 +509,6 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co return 0; } -// This returns a probability in log space. -inline int BinaryFormat::getBigramProbabilityFromHashMap(const int position, - const hash_map_compat<int, int> *bigramMap, const int unigramProbability) { - if (!bigramMap) { - return ProbabilityUtils::backoff(unigramProbability); - } - const hash_map_compat<int, int>::const_iterator bigramProbabilityIt = bigramMap->find(position); - if (bigramProbabilityIt != bigramMap->end()) { - const int bigramProbability = bigramProbabilityIt->second; - return ProbabilityUtils::computeProbabilityForBigram(unigramProbability, bigramProbability); - } - return ProbabilityUtils::backoff(unigramProbability); -} - -AK_FORCE_INLINE void BinaryFormat::fillBigramProbabilityToHashMap( - const uint8_t *const root, int position, hash_map_compat<int, int> *bigramMap) { - position = getBigramListPositionForWordPosition(root, position); - if (0 == position) return; - - uint8_t bigramFlags; - do { - bigramFlags = getFlagsAndForwardPointer(root, &position); - const int probability = MASK_ATTRIBUTE_PROBABILITY & bigramFlags; - const int bigramPos = getAttributeAddressAndForwardPointer(root, bigramFlags, - &position); - (*bigramMap)[bigramPos] = probability; - } while (FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); -} - -AK_FORCE_INLINE int BinaryFormat::getBigramProbability(const uint8_t *const root, int position, - const int nextPosition, const int unigramProbability) { - position = getBigramListPositionForWordPosition(root, position); - if (0 == position) { - return ProbabilityUtils::backoff(unigramProbability); - } - - uint8_t bigramFlags; - do { - bigramFlags = getFlagsAndForwardPointer(root, &position); - const int
bigramPos = getAttributeAddressAndForwardPointer( - root, bigramFlags, &position); - if (bigramPos == nextPosition) { - const int bigramProbability = MASK_ATTRIBUTE_PROBABILITY & bigramFlags; - return ProbabilityUtils::computeProbabilityForBigram( - unigramProbability, bigramProbability); - } - } while (FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); - return ProbabilityUtils::backoff(unigramProbability); -} - -// Returns a pointer to the start of the bigram list. AK_FORCE_INLINE int BinaryFormat::getBigramListPositionForWordPosition( const uint8_t *const root, int position) { if (NOT_VALID_WORD == position) return 0; diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp index 561e22d2d..27b052b7e 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp @@ -79,7 +79,7 @@ int Dictionary::getSuggestions(ProximityInfo *proximityInfo, DicTraverseSession int Dictionary::getBigrams(const int *word, int length, int *inputCodePoints, int inputSize, int *outWords, int *frequencies, int *outputTypes) const { if (length <= 0) return 0; - return mBigramDictionary->getBigrams(word, length, inputCodePoints, inputSize, outWords, + return mBigramDictionary->getPredictions(word, length, inputCodePoints, inputSize, outWords, frequencies, outputTypes); } diff --git a/native/jni/src/suggest/core/dictionary/multi_bigram_map.h b/native/jni/src/suggest/core/dictionary/multi_bigram_map.h index ba97e5842..b380e9727 100644 --- a/native/jni/src/suggest/core/dictionary/multi_bigram_map.h +++ b/native/jni/src/suggest/core/dictionary/multi_bigram_map.h @@ -18,6 +18,7 @@ #define LATINIME_MULTI_BIGRAM_MAP_H #include "defines.h" +#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" #include "suggest/core/dictionary/binary_dictionary_info.h" #include "suggest/core/dictionary/binary_format.h" #include "utils/hash_map_compat.h" @@ -34,7 +35,7 @@ class 
MultiBigramMap { // Look up the bigram probability for the given word pair from the cached bigram maps. // Also caches the bigrams if there is space remaining and they have not been cached already. - int getBigramProbability(const BinaryDictionaryInfo *const binaryDicitonaryInfo, + int getBigramProbability(const BinaryDictionaryInfo *const binaryDictionaryInfo, const int wordPosition, const int nextWordPosition, const int unigramProbability) { hash_map_compat<int, BigramMap>::const_iterator mapPosition = mBigramMaps.find(wordPosition); @@ -42,11 +43,11 @@ class MultiBigramMap { return mapPosition->second.getBigramProbability(nextWordPosition, unigramProbability); } if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) { - addBigramsForWordPosition(binaryDicitonaryInfo, wordPosition); + addBigramsForWordPosition(binaryDictionaryInfo, wordPosition); return mBigramMaps[wordPosition].getBigramProbability( nextWordPosition, unigramProbability); } - return BinaryFormat::getBigramProbability(binaryDicitonaryInfo->getDictRoot(), + return readBigramProbabilityFromBinaryDictionary(binaryDictionaryInfo, wordPosition, nextWordPosition, unigramProbability); } @@ -62,15 +63,29 @@ class MultiBigramMap { BigramMap() : mBigramMap(DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP) {} ~BigramMap() {} - void init(const BinaryDictionaryInfo *const binaryDicitonaryInfo, const int position) { - BinaryFormat::fillBigramProbabilityToHashMap( - binaryDicitonaryInfo->getDictRoot(), position, &mBigramMap); + void init(const BinaryDictionaryInfo *const binaryDictionaryInfo, const int nodePos) { + const int bigramsListPos = BinaryFormat::getBigramListPositionForWordPosition( + binaryDictionaryInfo->getDictRoot(), nodePos); + if (0 == bigramsListPos) { + return; + } + for (BinaryDictionaryBigramsIterator bigramsIt(binaryDictionaryInfo, bigramsListPos); + bigramsIt.hasNext(); /* no-op */) { + bigramsIt.next(); + mBigramMap[bigramsIt.getBigramPos()] = bigramsIt.getProbability(); + } } - inline int
getBigramProbability(const int nextWordPosition, const int unigramProbability) - const { - return BinaryFormat::getBigramProbabilityFromHashMap( - nextWordPosition, &mBigramMap, unigramProbability); + AK_FORCE_INLINE int getBigramProbability( + const int nextWordPosition, const int unigramProbability) const { + const hash_map_compat<int, int>::const_iterator bigramProbabilityIt = + mBigramMap.find(nextWordPosition); + if (bigramProbabilityIt != mBigramMap.end()) { + const int bigramProbability = bigramProbabilityIt->second; + return ProbabilityUtils::computeProbabilityForBigram( + unigramProbability, bigramProbability); + } + return ProbabilityUtils::backoff(unigramProbability); } private: @@ -78,9 +93,28 @@ class MultiBigramMap { hash_map_compat<int, int> mBigramMap; }; - void addBigramsForWordPosition(const BinaryDictionaryInfo *const binaryDicitonaryInfo, - const int position) { - mBigramMaps[position].init(binaryDicitonaryInfo, position); + AK_FORCE_INLINE void addBigramsForWordPosition( + const BinaryDictionaryInfo *const binaryDictionaryInfo, const int position) { + mBigramMaps[position].init(binaryDictionaryInfo, position); + } + + AK_FORCE_INLINE int readBigramProbabilityFromBinaryDictionary( + const BinaryDictionaryInfo *const binaryDictionaryInfo, const int nodePos, + const int nextWordPosition, const int unigramProbability) { + const int bigramsListPos = BinaryFormat::getBigramListPositionForWordPosition( + binaryDictionaryInfo->getDictRoot(), nodePos); + if (0 == bigramsListPos) { + return ProbabilityUtils::backoff(unigramProbability); + } + for (BinaryDictionaryBigramsIterator bigramsIt(binaryDictionaryInfo, bigramsListPos); + bigramsIt.hasNext(); /* no-op */) { + bigramsIt.next(); + if (bigramsIt.getBigramPos() == nextWordPosition) { + return ProbabilityUtils::computeProbabilityForBigram( + unigramProbability, bigramsIt.getProbability()); + } + } + return ProbabilityUtils::backoff(unigramProbability); } hash_map_compat<int, BigramMap> mBigramMaps;