* commit 'c941bc9bd3eb5b564f8b5f63ff07a44097262671': Split a method to reconstruct freq from uni/bi freq
This commit is contained in:
commit
7a4a1a0c5c
1 changed file with 12 additions and 7 deletions
|
@ -67,6 +67,7 @@ class BinaryFormat {
|
||||||
const int length);
|
const int length);
|
||||||
static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth,
|
static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth,
|
||||||
uint16_t* outWord);
|
uint16_t* outWord);
|
||||||
|
// Reconstructs a single frequency from a unigram frequency and a bigram frequency.
// The result starts at unigramFreq and grows with bigramFreq; see the definition for the
// exact scaling.
static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq);
|
||||||
static int getProbability(const int position, const std::map<int, int> *bigramMap,
|
static int getProbability(const int position, const std::map<int, int> *bigramMap,
|
||||||
const uint8_t *bigramFilter, const int unigramFreq);
|
const uint8_t *bigramFilter, const int unigramFreq);
|
||||||
|
|
||||||
|
@ -529,6 +530,16 @@ static inline int backoff(const int unigramFreq) {
|
||||||
// return unigramFreq > 8 ? unigramFreq - 8 : (0 == unigramFreq ? 0 : 8);
|
// return unigramFreq > 8 ? unigramFreq - 8 : (0 == unigramFreq ? 0 : 8);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline int BinaryFormat::computeFrequencyForBigram(const int unigramFreq, const int bigramFreq) {
|
||||||
|
// We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the
|
||||||
|
// unigram frequency to be the median value of the 17th step from the top. A value of
|
||||||
|
// 0 for the bigram frequency represents the middle of the 16th step from the top,
|
||||||
|
// while a value of 15 represents the middle of the top step.
|
||||||
|
// See makedict.BinaryDictInputOutput for details.
|
||||||
|
const float stepSize = ((float)MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ);
|
||||||
|
return (int)(unigramFreq + bigramFreq * stepSize);
|
||||||
|
}
|
||||||
|
|
||||||
// This returns a probability in log space.
|
// This returns a probability in log space.
|
||||||
inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap,
|
inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap,
|
||||||
const uint8_t *bigramFilter, const int unigramFreq) {
|
const uint8_t *bigramFilter, const int unigramFreq) {
|
||||||
|
@ -537,13 +548,7 @@ inline int BinaryFormat::getProbability(const int position, const std::map<int,
|
||||||
const std::map<int, int>::const_iterator bigramFreqIt = bigramMap->find(position);
|
const std::map<int, int>::const_iterator bigramFreqIt = bigramMap->find(position);
|
||||||
if (bigramFreqIt != bigramMap->end()) {
|
if (bigramFreqIt != bigramMap->end()) {
|
||||||
const int bigramFreq = bigramFreqIt->second;
|
const int bigramFreq = bigramFreqIt->second;
|
||||||
// We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the
|
return computeFrequencyForBigram(unigramFreq, bigramFreq);
|
||||||
// unigram frequency to be the median value of the 17th step from the top. A value of
|
|
||||||
// 0 for the bigram frequency represents the middle of the 16th step from the top,
|
|
||||||
// while a value of 15 represents the middle of the top step.
|
|
||||||
// See makedict.BinaryDictInputOutput for details.
|
|
||||||
const float stepSize = ((float)MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ);
|
|
||||||
return (int)(unigramFreq + bigramFreq * stepSize);
|
|
||||||
} else {
|
} else {
|
||||||
return backoff(unigramFreq);
|
return backoff(unigramFreq);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue