am 22f951c8
: Merge "Remove checkFirstCharacter from BigramDictionary."
* commit '22f951c8819b40e674a0354f96249927a64f8b6f': Remove checkFirstCharacter from BigramDictionary.
This commit is contained in:
commit
be81b05fcc
5 changed files with 23 additions and 55 deletions
|
@ -186,7 +186,7 @@ static int latinime_BinaryDictionary_getSuggestions(JNIEnv *env, jclass clazz, j
|
||||||
scores, spaceIndices, outputTypes);
|
scores, spaceIndices, outputTypes);
|
||||||
} else {
|
} else {
|
||||||
count = dictionary->getBigrams(prevWordCodePoints, prevWordCodePointsLength,
|
count = dictionary->getBigrams(prevWordCodePoints, prevWordCodePointsLength,
|
||||||
inputCodePoints, inputSize, outputCodePoints, scores, outputTypes);
|
outputCodePoints, scores, outputTypes);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Copy back the output values
|
// Copy back the output values
|
||||||
|
|
|
@ -87,21 +87,14 @@ void BigramDictionary::addWordBigram(int *word, int length, int probability, int
|
||||||
/* Parameters :
|
/* Parameters :
|
||||||
* prevWord: the word before, the one for which we need to look up bigrams.
|
* prevWord: the word before, the one for which we need to look up bigrams.
|
||||||
* prevWordLength: its length.
|
* prevWordLength: its length.
|
||||||
* inputCodePoints: what user typed, in the same format as for UnigramDictionary::getSuggestions.
|
* outBigramCodePoints: an array for output, in the same format as outWords for getSuggestions.
|
||||||
* inputSize: the size of the codes array.
|
* outBigramProbability: an array to output frequencies.
|
||||||
* bigramCodePoints: an array for output, in the same format as outWords for getSuggestions.
|
|
||||||
* bigramProbability: an array to output frequencies.
|
|
||||||
* outputTypes: an array to output types.
|
* outputTypes: an array to output types.
|
||||||
* This method returns the number of bigrams this word has, for backward compatibility.
|
* This method returns the number of bigrams this word has, for backward compatibility.
|
||||||
* Note: this is not the number of bigrams output in the array, which is the number of
|
|
||||||
* bigrams this word has WHOSE first letter also matches the letter the user typed.
|
|
||||||
* TODO: this may not be a sensible thing to do. It makes sense when the bigrams are
|
|
||||||
* used to match the first letter of the second word, but once the user has typed more
|
|
||||||
* and the bigrams are used to boost unigram result scores, it makes little sense to
|
|
||||||
* reduce their scope to the ones that match the first letter.
|
|
||||||
*/
|
*/
|
||||||
int BigramDictionary::getPredictions(const int *prevWord, int prevWordLength, int *inputCodePoints,
|
int BigramDictionary::getPredictions(const int *prevWord, const int prevWordLength,
|
||||||
int inputSize, int *bigramCodePoints, int *bigramProbability, int *outputTypes) const {
|
int *const outBigramCodePoints, int *const outBigramProbability,
|
||||||
|
int *const outputTypes) const {
|
||||||
// TODO: remove unused arguments, and refrain from storing stuff in members of this class
|
// TODO: remove unused arguments, and refrain from storing stuff in members of this class
|
||||||
// TODO: have "in" arguments before "out" ones, and make out args explicit in the name
|
// TODO: have "in" arguments before "out" ones, and make out args explicit in the name
|
||||||
|
|
||||||
|
@ -126,22 +119,17 @@ int BigramDictionary::getPredictions(const int *prevWord, int prevWordLength, in
|
||||||
getCodePointsAndProbabilityAndReturnCodePointCount(
|
getCodePointsAndProbabilityAndReturnCodePointCount(
|
||||||
mBinaryDictionaryInfo, bigramsIt.getBigramPos(), MAX_WORD_LENGTH,
|
mBinaryDictionaryInfo, bigramsIt.getBigramPos(), MAX_WORD_LENGTH,
|
||||||
bigramBuffer, &unigramProbability);
|
bigramBuffer, &unigramProbability);
|
||||||
|
|
||||||
// inputSize == 0 means we are trying to find bigram predictions.
|
|
||||||
if (inputSize < 1 || checkFirstCharacter(bigramBuffer, inputCodePoints)) {
|
|
||||||
const int bigramProbabilityTemp = bigramsIt.getProbability();
|
|
||||||
// Due to space constraints, the probability for bigrams is approximate - the lower the
|
// Due to space constraints, the probability for bigrams is approximate - the lower the
|
||||||
// unigram probability, the worse the precision. The theoretical maximum error in
|
// unigram probability, the worse the precision. The theoretical maximum error in
|
||||||
// resulting probability is 8 - although in practice it's never bigger than 3 or 4
|
// resulting probability is 8 - although in practice it's never bigger than 3 or 4
|
||||||
// in very bad cases. This means that sometimes, we'll see some bigrams inverted
|
// in very bad cases. This means that sometimes, we'll see some bigrams inverted
|
||||||
// here, but it can't get too bad.
|
// here, but it can't get too bad.
|
||||||
const int probability = ProbabilityUtils::computeProbabilityForBigram(
|
const int probability = ProbabilityUtils::computeProbabilityForBigram(
|
||||||
unigramProbability, bigramProbabilityTemp);
|
unigramProbability, bigramsIt.getProbability());
|
||||||
addWordBigram(bigramBuffer, length, probability, bigramProbability, bigramCodePoints,
|
addWordBigram(bigramBuffer, length, probability, outBigramProbability, outBigramCodePoints,
|
||||||
outputTypes);
|
outputTypes);
|
||||||
++bigramCount;
|
++bigramCount;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
return min(bigramCount, MAX_RESULTS);
|
return min(bigramCount, MAX_RESULTS);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -157,22 +145,6 @@ int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const in
|
||||||
mBinaryDictionaryInfo, pos);
|
mBinaryDictionaryInfo, pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool BigramDictionary::checkFirstCharacter(int *word, int *inputCodePoints) const {
|
|
||||||
// Checks whether this word starts with same character or neighboring characters of
|
|
||||||
// what user typed.
|
|
||||||
|
|
||||||
int maxAlt = MAX_ALTERNATIVES;
|
|
||||||
const int firstBaseLowerCodePoint = CharUtils::toBaseLowerCase(*word);
|
|
||||||
while (maxAlt > 0) {
|
|
||||||
if (CharUtils::toBaseLowerCase(*inputCodePoints) == firstBaseLowerCodePoint) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
inputCodePoints++;
|
|
||||||
maxAlt--;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool BigramDictionary::isValidBigram(const int *word0, int length0, const int *word1,
|
bool BigramDictionary::isValidBigram(const int *word0, int length0, const int *word1,
|
||||||
int length1) const {
|
int length1) const {
|
||||||
int pos = getBigramListPositionForWord(word0, length0, false /* forceLowerCaseSearch */);
|
int pos = getBigramListPositionForWord(word0, length0, false /* forceLowerCaseSearch */);
|
||||||
|
|
|
@ -27,8 +27,8 @@ class BigramDictionary {
|
||||||
public:
|
public:
|
||||||
BigramDictionary(const BinaryDictionaryInfo *const binaryDictionaryInfo);
|
BigramDictionary(const BinaryDictionaryInfo *const binaryDictionaryInfo);
|
||||||
|
|
||||||
int getPredictions(const int *word, int length, int *inputCodePoints, int inputSize,
|
int getPredictions(const int *word, int length, int *outBigramCodePoints,
|
||||||
int *outWords, int *frequencies, int *outputTypes) const;
|
int *outBigramProbability, int *outputTypes) const;
|
||||||
bool isValidBigram(const int *word1, int length1, const int *word2, int length2) const;
|
bool isValidBigram(const int *word1, int length1, const int *word2, int length2) const;
|
||||||
~BigramDictionary();
|
~BigramDictionary();
|
||||||
|
|
||||||
|
@ -37,13 +37,10 @@ class BigramDictionary {
|
||||||
|
|
||||||
void addWordBigram(int *word, int length, int probability, int *bigramProbability,
|
void addWordBigram(int *word, int length, int probability, int *bigramProbability,
|
||||||
int *bigramCodePoints, int *outputTypes) const;
|
int *bigramCodePoints, int *outputTypes) const;
|
||||||
bool checkFirstCharacter(int *word, int *inputCodePoints) const;
|
|
||||||
int getBigramListPositionForWord(const int *prevWord, const int prevWordLength,
|
int getBigramListPositionForWord(const int *prevWord, const int prevWordLength,
|
||||||
const bool forceLowerCaseSearch) const;
|
const bool forceLowerCaseSearch) const;
|
||||||
|
|
||||||
const BinaryDictionaryInfo *const mBinaryDictionaryInfo;
|
const BinaryDictionaryInfo *const mBinaryDictionaryInfo;
|
||||||
// TODO: Re-implement proximity correction for bigram correction
|
|
||||||
static const int MAX_ALTERNATIVES = 1;
|
|
||||||
};
|
};
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
#endif // LATINIME_BIGRAM_DICTIONARY_H
|
#endif // LATINIME_BIGRAM_DICTIONARY_H
|
||||||
|
|
|
@ -76,11 +76,10 @@ int Dictionary::getSuggestions(ProximityInfo *proximityInfo, DicTraverseSession
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int Dictionary::getBigrams(const int *word, int length, int *inputCodePoints, int inputSize,
|
int Dictionary::getBigrams(const int *word, int length, int *outWords, int *frequencies,
|
||||||
int *outWords, int *frequencies, int *outputTypes) const {
|
int *outputTypes) const {
|
||||||
if (length <= 0) return 0;
|
if (length <= 0) return 0;
|
||||||
return mBigramDictionary->getPredictions(word, length, inputCodePoints, inputSize, outWords,
|
return mBigramDictionary->getPredictions(word, length, outWords, frequencies, outputTypes);
|
||||||
frequencies, outputTypes);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int Dictionary::getProbability(const int *word, int length) const {
|
int Dictionary::getProbability(const int *word, int length) const {
|
||||||
|
|
|
@ -62,8 +62,8 @@ class Dictionary {
|
||||||
const SuggestOptions *const suggestOptions, int *outWords, int *frequencies,
|
const SuggestOptions *const suggestOptions, int *outWords, int *frequencies,
|
||||||
int *spaceIndices, int *outputTypes) const;
|
int *spaceIndices, int *outputTypes) const;
|
||||||
|
|
||||||
int getBigrams(const int *word, int length, int *inputCodePoints, int inputSize, int *outWords,
|
int getBigrams(const int *word, int length, int *outWords, int *frequencies,
|
||||||
int *frequencies, int *outputTypes) const;
|
int *outputTypes) const;
|
||||||
|
|
||||||
int getProbability(const int *word, int length) const;
|
int getProbability(const int *word, int length) const;
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue