Merge "Return the bigram frequency if available." into jb-dev
commit
bc77adefbb
|
@ -74,9 +74,9 @@
|
||||||
<item></item>
|
<item></item>
|
||||||
<!-- Modest : Suggestion whose normalized score is greater than this value
|
<!-- Modest : Suggestion whose normalized score is greater than this value
|
||||||
will be subject to auto-correction. -->
|
will be subject to auto-correction. -->
|
||||||
<item>0.22</item>
|
<item>0.185</item>
|
||||||
<!-- Aggressive -->
|
<!-- Aggressive -->
|
||||||
<item>0.08</item>
|
<item>0.067</item>
|
||||||
<!-- Very Aggressive : Suggestion whose normalized score is greater than this value
|
<!-- Very Aggressive : Suggestion whose normalized score is greater than this value
|
||||||
will be subject to auto-correction. -->
|
will be subject to auto-correction. -->
|
||||||
<item>0</item>
|
<item>0</item>
|
||||||
|
|
|
@ -520,19 +520,33 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// This should probably return a probability in log space.
|
static inline int backoff(const int unigramFreq) {
|
||||||
|
return unigramFreq;
|
||||||
|
// For some reason, applying the backoff weight gives bad results in tests. To apply the
|
||||||
|
// backoff weight, we divide the probability by 2, which in our storing format means
|
||||||
|
// decreasing the score by 8.
|
||||||
|
// TODO: figure out what's wrong with this.
|
||||||
|
// return unigramFreq > 8 ? unigramFreq - 8 : (0 == unigramFreq ? 0 : 8);
|
||||||
|
}
|
||||||
|
|
||||||
|
// This returns a probability in log space.
|
||||||
inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap,
|
inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap,
|
||||||
const uint8_t *bigramFilter, const int unigramFreq) {
|
const uint8_t *bigramFilter, const int unigramFreq) {
|
||||||
if (!bigramMap || !bigramFilter) return unigramFreq;
|
if (!bigramMap || !bigramFilter) return backoff(unigramFreq);
|
||||||
if (!isInFilter(bigramFilter, position)) return unigramFreq;
|
if (!isInFilter(bigramFilter, position)) return backoff(unigramFreq);
|
||||||
const std::map<int, int>::const_iterator bigramFreq = bigramMap->find(position);
|
const std::map<int, int>::const_iterator bigramFreqIt = bigramMap->find(position);
|
||||||
if (bigramFreq != bigramMap->end()) {
|
if (bigramFreqIt != bigramMap->end()) {
|
||||||
// TODO: return the frequency in bigramFreq->second
|
const int bigramFreq = bigramFreqIt->second;
|
||||||
return unigramFreq;
|
// We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the
|
||||||
|
// unigram frequency to be the median value of the 17th step from the top. A value of
|
||||||
|
// 0 for the bigram frequency represents the middle of the 16th step from the top,
|
||||||
|
// while a value of 15 represents the middle of the top step.
|
||||||
|
// See makedict.BinaryDictInputOutput for details.
|
||||||
|
const float stepSize = ((float)MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ);
|
||||||
|
return (int)(unigramFreq + bigramFreq * stepSize);
|
||||||
} else {
|
} else {
|
||||||
return unigramFreq;
|
return backoff(unigramFreq);
|
||||||
}
|
}
|
||||||
// TODO: if the unigram frequency is used, compute the actual probability
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
|
|
|
@ -207,6 +207,7 @@ static inline void prof_out(void) {
|
||||||
#define NEUTRAL_SCORE_SQUARED_RADIUS 8.0f
|
#define NEUTRAL_SCORE_SQUARED_RADIUS 8.0f
|
||||||
#define HALF_SCORE_SQUARED_RADIUS 32.0f
|
#define HALF_SCORE_SQUARED_RADIUS 32.0f
|
||||||
#define MAX_FREQ 255
|
#define MAX_FREQ 255
|
||||||
|
#define MAX_BIGRAM_FREQ 15
|
||||||
|
|
||||||
// This must be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
|
// This must be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
|
||||||
// This is only used for the size of array. Not to be used in c functions.
|
// This is only used for the size of array. Not to be used in c functions.
|
||||||
|
@ -225,8 +226,8 @@ static inline void prof_out(void) {
|
||||||
#define MULTIPLE_WORDS_DEMOTION_RATE 80
|
#define MULTIPLE_WORDS_DEMOTION_RATE 80
|
||||||
#define MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION 6
|
#define MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION 6
|
||||||
|
|
||||||
#define TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD 0.39
|
#define TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD 0.35
|
||||||
#define START_TWO_WORDS_CORRECTION_THRESHOLD 0.22
|
#define START_TWO_WORDS_CORRECTION_THRESHOLD 0.185
|
||||||
|
|
||||||
#define MAX_DEPTH_MULTIPLIER 3
|
#define MAX_DEPTH_MULTIPLIER 3
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue