From 88ad30f40b05128d891fd412bb684bdbdc514446 Mon Sep 17 00:00:00 2001 From: Keisuke Kuroyanagi Date: Sat, 1 Jun 2013 14:08:20 -0700 Subject: [PATCH] Move dictionary format independent probability calculation methods to ProbabilityUtils. Bug: 6669677 Change-Id: Idc09a2fbb04e4d843e11313011178022177616aa --- native/jni/src/bigram_dictionary.cpp | 3 +- .../suggest/core/dicnode/dic_node_utils.cpp | 3 +- .../suggest/core/dictionary/binary_format.h | 58 +++------------ .../core/dictionary/probability_utils.h | 74 +++++++++++++++++++ native/jni/src/unigram_dictionary.cpp | 5 +- 5 files changed, 93 insertions(+), 50 deletions(-) create mode 100644 native/jni/src/suggest/core/dictionary/probability_utils.h diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp index ebe27994f..e04d8cfc1 100644 --- a/native/jni/src/bigram_dictionary.cpp +++ b/native/jni/src/bigram_dictionary.cpp @@ -25,6 +25,7 @@ #include "suggest/core/dictionary/bloom_filter.h" #include "suggest/core/dictionary/char_utils.h" #include "suggest/core/dictionary/dictionary.h" +#include "suggest/core/dictionary/probability_utils.h" namespace latinime { @@ -134,7 +135,7 @@ int BigramDictionary::getBigrams(const int *prevWord, int prevWordLength, int *i // resulting probability is 8 - although in the practice it's never bigger than 3 or 4 // in very bad cases. This means that sometimes, we'll see some bigrams interverted // here, but it can't get too bad. 
- const int probability = BinaryFormat::computeProbabilityForBigram( + const int probability = ProbabilityUtils::computeProbabilityForBigram( unigramProbability, bigramProbabilityTemp); addWordBigram(bigramBuffer, length, probability, bigramProbability, bigramCodePoints, outputTypes); diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp index c754a5ec2..6787efff5 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp +++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp @@ -23,6 +23,7 @@ #include "suggest/core/dictionary/binary_format.h" #include "suggest/core/dictionary/char_utils.h" #include "suggest/core/dictionary/multi_bigram_map.h" +#include "suggest/core/dictionary/probability_utils.h" #include "suggest/core/layout/proximity_info.h" #include "suggest/core/layout/proximity_info_state.h" @@ -211,7 +212,7 @@ namespace latinime { const int prevWordPos = node->getPrevWordPos(); if (NOT_VALID_WORD == wordPos || NOT_VALID_WORD == prevWordPos) { // Note: Normally wordPos comes from the dictionary and should never equal NOT_VALID_WORD. 
- return backoff(unigramProbability); + return ProbabilityUtils::backoff(unigramProbability); } if (multiBigramMap) { return multiBigramMap->getBigramProbability( diff --git a/native/jni/src/suggest/core/dictionary/binary_format.h b/native/jni/src/suggest/core/dictionary/binary_format.h index ef9fd3785..3bacc0982 100644 --- a/native/jni/src/suggest/core/dictionary/binary_format.h +++ b/native/jni/src/suggest/core/dictionary/binary_format.h @@ -18,12 +18,12 @@ #define LATINIME_BINARY_FORMAT_H #include <cstdlib> -#include <map> #include <stdint.h> #include "hash_map_compat.h" #include "suggest/core/dictionary/bloom_filter.h" #include "suggest/core/dictionary/char_utils.h" +#include "suggest/core/dictionary/probability_utils.h" namespace latinime { @@ -91,10 +91,6 @@ class BinaryFormat { const int length, const bool forceLowerCaseSearch); static int getWordAtAddress(const uint8_t *const root, const int address, const int maxDepth, int *outWord, int *outUnigramProbability); - static int computeProbabilityForBigram( - const int unigramProbability, const int bigramProbability); - static int getProbability(const int position, const std::map<int, int> *bigramMap, - const uint8_t *bigramFilter, const int unigramProbability); static int getBigramProbabilityFromHashMap(const int position, const hash_map_compat<int, int> *bigramMap, const int unigramProbability); static float getMultiWordCostMultiplier(const uint8_t *const dict, const int dictSize); @@ -678,51 +674,18 @@ AK_FORCE_INLINE int BinaryFormat::getWordAtAddress(const uint8_t *const root, co return 0; } -static inline int backoff(const int unigramProbability) { - return unigramProbability; - // For some reason, applying the backoff weight gives bad results in tests. To apply the - // backoff weight, we divide the probability by 2, which in our storing format means - // decreasing the score by 8. - // TODO: figure out what's wrong with this. - // return unigramProbability > 8 ? unigramProbability - 8 : (0 == unigramProbability ? 
0 : 8); -} - -inline int BinaryFormat::computeProbabilityForBigram( - const int unigramProbability, const int bigramProbability) { - // We divide the range [unigramProbability..255] in 16.5 steps - in other words, we want the - // unigram probability to be the median value of the 17th step from the top. A value of - // 0 for the bigram probability represents the middle of the 16th step from the top, - // while a value of 15 represents the middle of the top step. - // See makedict.BinaryDictInputOutput for details. - const float stepSize = static_cast<float>(MAX_PROBABILITY - unigramProbability) - / (1.5f + MAX_BIGRAM_ENCODED_PROBABILITY); - return unigramProbability - + static_cast<int>(static_cast<float>(bigramProbability + 1) * stepSize); -} - -// This returns a probability in log space. -inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap, - const uint8_t *bigramFilter, const int unigramProbability) { - if (!bigramMap || !bigramFilter) return backoff(unigramProbability); - if (!isInFilter(bigramFilter, position)) return backoff(unigramProbability); - const std::map<int, int>::const_iterator bigramProbabilityIt = bigramMap->find(position); - if (bigramProbabilityIt != bigramMap->end()) { - const int bigramProbability = bigramProbabilityIt->second; - return computeProbabilityForBigram(unigramProbability, bigramProbability); - } - return backoff(unigramProbability); -} - // This returns a probability in log space. 
inline int BinaryFormat::getBigramProbabilityFromHashMap(const int position, const hash_map_compat<int, int> *bigramMap, const int unigramProbability) { - if (!bigramMap) return backoff(unigramProbability); + if (!bigramMap) { + return ProbabilityUtils::backoff(unigramProbability); + } const hash_map_compat<int, int>::const_iterator bigramProbabilityIt = bigramMap->find(position); if (bigramProbabilityIt != bigramMap->end()) { const int bigramProbability = bigramProbabilityIt->second; - return computeProbabilityForBigram(unigramProbability, bigramProbability); + return ProbabilityUtils::computeProbabilityForBigram(unigramProbability, bigramProbability); } - return backoff(unigramProbability); + return ProbabilityUtils::backoff(unigramProbability); } AK_FORCE_INLINE void BinaryFormat::fillBigramProbabilityToHashMap( @@ -743,7 +706,9 @@ AK_FORCE_INLINE void BinaryFormat::fillBigramProbabilityToHashMap( AK_FORCE_INLINE int BinaryFormat::getBigramProbability(const uint8_t *const root, int position, const int nextPosition, const int unigramProbability) { position = getBigramListPositionForWordPosition(root, position); - if (0 == position) return backoff(unigramProbability); + if (0 == position) { + return ProbabilityUtils::backoff(unigramProbability); + } uint8_t bigramFlags; do { @@ -752,10 +717,11 @@ AK_FORCE_INLINE int BinaryFormat::getBigramProbability(const uint8_t *const root root, bigramFlags, &position); if (bigramPos == nextPosition) { const int bigramProbability = MASK_ATTRIBUTE_PROBABILITY & bigramFlags; - return computeProbabilityForBigram(unigramProbability, bigramProbability); + return ProbabilityUtils::computeProbabilityForBigram( + unigramProbability, bigramProbability); } } while (FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags); - return backoff(unigramProbability); + return ProbabilityUtils::backoff(unigramProbability); } // Returns a pointer to the start of the bigram list. 
diff --git a/native/jni/src/suggest/core/dictionary/probability_utils.h b/native/jni/src/suggest/core/dictionary/probability_utils.h new file mode 100644 index 000000000..14d2f8436 --- /dev/null +++ b/native/jni/src/suggest/core/dictionary/probability_utils.h @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PROBABILITY_UTILS_H +#define LATINIME_PROBABILITY_UTILS_H + +#include <map> +#include <stdint.h> + +#include "defines.h" + +namespace latinime { + +class ProbabilityUtils { + public: + static AK_FORCE_INLINE int backoff(const int unigramProbability) { + return unigramProbability; + // For some reason, applying the backoff weight gives bad results in tests. To apply the + // backoff weight, we divide the probability by 2, which in our storing format means + // decreasing the score by 8. + // TODO: figure out what's wrong with this. + // return unigramProbability > 8 ? + // unigramProbability - 8 : (0 == unigramProbability ? 0 : 8); + } + + static AK_FORCE_INLINE int computeProbabilityForBigram( + const int unigramProbability, const int bigramProbability) { + // We divide the range [unigramProbability..255] in 16.5 steps - in other words, we want + // the unigram probability to be the median value of the 17th step from the top. 
A value of + // 0 for the bigram probability represents the middle of the 16th step from the top, + // while a value of 15 represents the middle of the top step. + // See makedict.BinaryDictInputOutput for details. + const float stepSize = static_cast<float>(MAX_PROBABILITY - unigramProbability) + / (1.5f + MAX_BIGRAM_ENCODED_PROBABILITY); + return unigramProbability + + static_cast<int>(static_cast<float>(bigramProbability + 1) * stepSize); + } + + // This returns a probability in log space. + static AK_FORCE_INLINE int getProbability(const int position, + const std::map<int, int> *const bigramMap, + const uint8_t *bigramFilter, const int unigramProbability) { + if (!bigramMap || !bigramFilter) { + return backoff(unigramProbability); + } + if (!isInFilter(bigramFilter, position)){ + return backoff(unigramProbability); + } + const std::map<int, int>::const_iterator bigramProbabilityIt = bigramMap->find(position); + if (bigramProbabilityIt != bigramMap->end()) { + const int bigramProbability = bigramProbabilityIt->second; + return computeProbabilityForBigram(unigramProbability, bigramProbability); + } + return backoff(unigramProbability); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ProbabilityUtils); +}; +} +#endif /* LATINIME_PROBABILITY_UTILS_H */ diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp index 66a8b8542..1d979dec0 100644 --- a/native/jni/src/unigram_dictionary.cpp +++ b/native/jni/src/unigram_dictionary.cpp @@ -23,6 +23,7 @@ #include "suggest/core/dictionary/char_utils.h" #include "suggest/core/dictionary/dictionary.h" #include "suggest/core/dictionary/digraph_utils.h" +#include "suggest/core/dictionary/probability_utils.h" #include "suggest/core/dictionary/terminal_attributes.h" #include "suggest/core/layout/proximity_info.h" #include "unigram_dictionary.h" @@ -935,8 +936,8 @@ bool UnigramDictionary::processCurrentNode(const int initialPos, TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos); // bigramMap contains the bigram 
frequencies indexed by addresses for fast lookup. // bigramFilter is a bloom filter of said frequencies for even faster rejection. - const int probability = BinaryFormat::getProbability(initialPos, bigramMap, bigramFilter, - unigramProbability); + const int probability = ProbabilityUtils::getProbability( + initialPos, bigramMap, bigramFilter, unigramProbability); onTerminal(probability, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal, currentWordIndex);