Merge "Improve fat finger correction."
commit
1d66cc1be6
|
@ -95,10 +95,8 @@ int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLen
|
||||||
}
|
}
|
||||||
|
|
||||||
*word = mWord;
|
*word = mWord;
|
||||||
const bool sameLength = (mExcessivePos == mInputLength - 1) ? (mInputLength == inputIndex + 2)
|
|
||||||
: (mInputLength == inputIndex + 1);
|
|
||||||
return Correction::RankingAlgorithm::calculateFinalFreq(
|
return Correction::RankingAlgorithm::calculateFinalFreq(
|
||||||
inputIndex, outputIndex, freq, sameLength, mEditDistanceTable, this);
|
inputIndex, outputIndex, freq, mEditDistanceTable, this);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Correction::initProcessState(const int outputIndex) {
|
bool Correction::initProcessState(const int outputIndex) {
|
||||||
|
@ -205,20 +203,6 @@ Correction::CorrectionType Correction::processCharAndCalcState(
|
||||||
}
|
}
|
||||||
|
|
||||||
if (mNeedsToTraverseAllNodes || isQuote(c)) {
|
if (mNeedsToTraverseAllNodes || isQuote(c)) {
|
||||||
const bool checkProximityChars =
|
|
||||||
!(mSkippedCount > 0 || mExcessivePos >= 0 || mTransposedPos >= 0);
|
|
||||||
// Note: This logic tries saving cases like contrst --> contrast -- "a" is one of
|
|
||||||
// proximity chars of "s", but it should rather be handled as a skipped char.
|
|
||||||
if (checkProximityChars
|
|
||||||
&& mInputIndex > 0
|
|
||||||
&& mCorrectionStates[mOutputIndex].mProximityMatching
|
|
||||||
&& mCorrectionStates[mOutputIndex].mSkipping
|
|
||||||
&& mProximityInfo->getMatchedProximityId(
|
|
||||||
mInputIndex - 1, c, false)
|
|
||||||
== ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR) {
|
|
||||||
++mSkippedCount;
|
|
||||||
--mProximityCount;
|
|
||||||
}
|
|
||||||
return processSkipChar(c, isTerminal);
|
return processSkipChar(c, isTerminal);
|
||||||
} else {
|
} else {
|
||||||
int inputIndexForProximity = mInputIndex;
|
int inputIndexForProximity = mInputIndex;
|
||||||
|
@ -250,6 +234,8 @@ Correction::CorrectionType Correction::processCharAndCalcState(
|
||||||
&& mProximityInfo->getMatchedProximityId(
|
&& mProximityInfo->getMatchedProximityId(
|
||||||
inputIndexForProximity - 1, c, false)
|
inputIndexForProximity - 1, c, false)
|
||||||
== ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR) {
|
== ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR) {
|
||||||
|
// Note: This logic tries saving cases like contrst --> contrast -- "a" is one of
|
||||||
|
// proximity chars of "s", but it should rather be handled as a skipped char.
|
||||||
++mSkippedCount;
|
++mSkippedCount;
|
||||||
--mProximityCount;
|
--mProximityCount;
|
||||||
return processSkipChar(c, isTerminal);
|
return processSkipChar(c, isTerminal);
|
||||||
|
@ -344,6 +330,16 @@ inline static void multiplyRate(const int rate, int *freq) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline static int getQuoteCount(const unsigned short* word, const int length) {
|
||||||
|
int quoteCount = 0;
|
||||||
|
for (int i = 0; i < length; ++i) {
|
||||||
|
if(word[i] == '\'') {
|
||||||
|
++quoteCount;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return quoteCount;
|
||||||
|
}
|
||||||
|
|
||||||
/* static */
|
/* static */
|
||||||
inline static int editDistance(
|
inline static int editDistance(
|
||||||
int* editDistanceTable, const unsigned short* input,
|
int* editDistanceTable, const unsigned short* input,
|
||||||
|
@ -392,8 +388,7 @@ inline static int editDistance(
|
||||||
|
|
||||||
/* static */
|
/* static */
|
||||||
int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const int outputIndex,
|
int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const int outputIndex,
|
||||||
const int freq, const bool sameLength, int* editDistanceTable,
|
const int freq, int* editDistanceTable, const Correction* correction) {
|
||||||
const Correction* correction) {
|
|
||||||
const int excessivePos = correction->getExcessivePos();
|
const int excessivePos = correction->getExcessivePos();
|
||||||
const int transposedPos = correction->getTransposedPos();
|
const int transposedPos = correction->getTransposedPos();
|
||||||
const int inputLength = correction->mInputLength;
|
const int inputLength = correction->mInputLength;
|
||||||
|
@ -402,6 +397,12 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
|
||||||
const ProximityInfo *proximityInfo = correction->mProximityInfo;
|
const ProximityInfo *proximityInfo = correction->mProximityInfo;
|
||||||
const int skipCount = correction->mSkippedCount;
|
const int skipCount = correction->mSkippedCount;
|
||||||
const int proximityMatchedCount = correction->mProximityCount;
|
const int proximityMatchedCount = correction->mProximityCount;
|
||||||
|
if (skipCount >= inputLength || inputLength == 0) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
const bool sameLength = (excessivePos == inputLength - 1) ? (inputLength == inputIndex + 2)
|
||||||
|
: (inputLength == inputIndex + 1);
|
||||||
|
|
||||||
|
|
||||||
// TODO: use mExcessiveCount
|
// TODO: use mExcessiveCount
|
||||||
int matchCount = inputLength - correction->mProximityCount - (excessivePos >= 0 ? 1 : 0);
|
int matchCount = inputLength - correction->mProximityCount - (excessivePos >= 0 ? 1 : 0);
|
||||||
|
@ -409,67 +410,52 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
|
||||||
const unsigned short* word = correction->mWord;
|
const unsigned short* word = correction->mWord;
|
||||||
const bool skipped = skipCount > 0;
|
const bool skipped = skipCount > 0;
|
||||||
|
|
||||||
// ----- TODO: use edit distance here as follows? ---------------------- /
|
const int quoteDiffCount = max(0, getQuoteCount(word, outputIndex + 1)
|
||||||
//if (!skipped && excessivePos < 0 && transposedPos < 0) {
|
- getQuoteCount(proximityInfo->getPrimaryInputWord(), inputLength));
|
||||||
// const int ed = editDistance(dp, proximityInfo->getInputWord(),
|
|
||||||
// inputLength, word, outputIndex + 1);
|
// TODO: Calculate edit distance for transposed and excessive
|
||||||
// matchCount = outputIndex + 1 - ed;
|
int matchWeight;
|
||||||
// if (ed == 1 && !sameLength) ++matchCount;
|
int ed = 0;
|
||||||
//}
|
int adJustedProximityMatchedCount = proximityMatchedCount;
|
||||||
// const int ed = editDistance(dp, proximityInfo->getInputWord(),
|
if (excessivePos < 0 && transposedPos < 0 && (proximityMatchedCount > 0 || skipped)) {
|
||||||
// inputLength, word, outputIndex + 1);
|
const unsigned short* primaryInputWord = proximityInfo->getPrimaryInputWord();
|
||||||
// if (ed == 1 && !sameLength) ++matchCount; ------------------------ /
|
ed = editDistance(editDistanceTable, primaryInputWord,
|
||||||
int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount);
|
inputLength, word, outputIndex + 1);
|
||||||
|
matchWeight = powerIntCapped(typedLetterMultiplier, outputIndex + 1 - ed);
|
||||||
|
if (ed == 1 && inputLength == outputIndex) {
|
||||||
|
// Promote a word with just one skipped char
|
||||||
|
multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE, &matchWeight);
|
||||||
|
}
|
||||||
|
ed = max(0, ed - quoteDiffCount);
|
||||||
|
adJustedProximityMatchedCount = min(max(0, ed - (outputIndex + 1 - inputLength)),
|
||||||
|
proximityMatchedCount);
|
||||||
|
} else {
|
||||||
|
matchWeight = powerIntCapped(typedLetterMultiplier, matchCount);
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: Demote by edit distance
|
// TODO: Demote by edit distance
|
||||||
int finalFreq = freq * matchWeight;
|
int finalFreq = freq * matchWeight;
|
||||||
// +1 +11/-12
|
|
||||||
/*if (inputLength == outputIndex && !skipped && excessivePos < 0 && transposedPos < 0) {
|
|
||||||
const int ed = editDistance(dp, proximityInfo->getInputWord(),
|
|
||||||
inputLength, word, outputIndex + 1);
|
|
||||||
if (ed == 1) {
|
|
||||||
multiplyRate(160, &finalFreq);
|
|
||||||
}
|
|
||||||
}*/
|
|
||||||
if (inputLength == outputIndex && excessivePos < 0 && transposedPos < 0
|
|
||||||
&& (proximityMatchedCount > 0 || skipped)) {
|
|
||||||
const int ed = editDistance(editDistanceTable, proximityInfo->getPrimaryInputWord(),
|
|
||||||
inputLength, word, outputIndex + 1);
|
|
||||||
if (ed == 1) {
|
|
||||||
multiplyRate(160, &finalFreq);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: Promote properly?
|
///////////////////////////////////////////////
|
||||||
//if (skipCount == 1 && excessivePos < 0 && transposedPos < 0 && inputLength == outputIndex
|
// Promotion and Demotion for each correction
|
||||||
// && !sameLength) {
|
|
||||||
// multiplyRate(150, &finalFreq);
|
|
||||||
//}
|
|
||||||
//if (skipCount == 0 && excessivePos < 0 && transposedPos < 0 && inputLength == outputIndex
|
|
||||||
// && !sameLength) {
|
|
||||||
// multiplyRate(150, &finalFreq);
|
|
||||||
//}
|
|
||||||
//if (skipCount == 0 && excessivePos < 0 && transposedPos < 0
|
|
||||||
// && inputLength == outputIndex + 1) {
|
|
||||||
// multiplyRate(150, &finalFreq);
|
|
||||||
//}
|
|
||||||
|
|
||||||
|
// Demotion for a word with missing character
|
||||||
if (skipped) {
|
if (skipped) {
|
||||||
if (inputLength >= 2) {
|
const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE
|
||||||
const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE
|
* (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)
|
||||||
* (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)
|
/ (10 * inputLength
|
||||||
/ (10 * inputLength
|
- WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);
|
||||||
- WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);
|
if (DEBUG_DICT_FULL) {
|
||||||
if (DEBUG_DICT_FULL) {
|
LOGI("Demotion rate for missing character is %d.", demotionRate);
|
||||||
LOGI("Demotion rate for missing character is %d.", demotionRate);
|
|
||||||
}
|
|
||||||
multiplyRate(demotionRate, &finalFreq);
|
|
||||||
} else {
|
|
||||||
finalFreq = 0;
|
|
||||||
}
|
}
|
||||||
|
multiplyRate(demotionRate, &finalFreq);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Demotion for a word with transposed character
|
||||||
if (transposedPos >= 0) multiplyRate(
|
if (transposedPos >= 0) multiplyRate(
|
||||||
WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
|
WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
|
||||||
|
|
||||||
|
// Demotion for a word with excessive character
|
||||||
if (excessivePos >= 0) {
|
if (excessivePos >= 0) {
|
||||||
multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);
|
multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);
|
||||||
if (!proximityInfo->existsAdjacentProximityChars(inputIndex)) {
|
if (!proximityInfo->existsAdjacentProximityChars(inputIndex)) {
|
||||||
|
@ -478,52 +464,62 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
|
||||||
multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);
|
multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
int lengthFreq = typedLetterMultiplier;
|
|
||||||
multiplyIntCapped(powerIntCapped(typedLetterMultiplier, outputIndex), &lengthFreq);
|
// Promotion for a word with proximity characters
|
||||||
if ((outputIndex + 1) == matchCount) {
|
for (int i = 0; i < adJustedProximityMatchedCount; ++i) {
|
||||||
// Full exact match
|
|
||||||
if (outputIndex > 1) {
|
|
||||||
if (DEBUG_DICT) {
|
|
||||||
LOGI("Found full matched word.");
|
|
||||||
}
|
|
||||||
multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
|
|
||||||
}
|
|
||||||
if (sameLength && transposedPos < 0 && !skipped && excessivePos < 0) {
|
|
||||||
finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
|
|
||||||
}
|
|
||||||
} else if (sameLength && transposedPos < 0 && !skipped && excessivePos < 0
|
|
||||||
&& outputIndex > 0) {
|
|
||||||
// A word with proximity corrections
|
// A word with proximity corrections
|
||||||
if (DEBUG_DICT) {
|
if (DEBUG_DICT_FULL) {
|
||||||
LOGI("Found one proximity correction.");
|
LOGI("Found a proximity correction.");
|
||||||
}
|
}
|
||||||
multiplyIntCapped(typedLetterMultiplier, &finalFreq);
|
multiplyIntCapped(typedLetterMultiplier, &finalFreq);
|
||||||
multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
|
multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
|
||||||
}
|
}
|
||||||
if (DEBUG_DICT_FULL) {
|
|
||||||
LOGI("calc: %d, %d", outputIndex, sameLength);
|
|
||||||
}
|
|
||||||
if (sameLength) multiplyIntCapped(fullWordMultiplier, &finalFreq);
|
|
||||||
|
|
||||||
// TODO: check excessive count and transposed count
|
const int errorCount = proximityMatchedCount + skipCount;
|
||||||
|
multiplyRate(
|
||||||
|
100 - CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE * errorCount / inputLength, &finalFreq);
|
||||||
|
|
||||||
|
// Promotion for an exactly matched word
|
||||||
|
if (matchCount == outputIndex + 1) {
|
||||||
|
// Full exact match
|
||||||
|
if (sameLength && transposedPos < 0 && !skipped && excessivePos < 0) {
|
||||||
|
finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Promote a word with no correction
|
||||||
|
if (proximityMatchedCount == 0 && transposedPos < 0 && !skipped && excessivePos < 0) {
|
||||||
|
multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: Check excessive count and transposed count
|
||||||
|
// TODO: Remove this if possible
|
||||||
/*
|
/*
|
||||||
If the last character of the user input word is the same as the next character
|
If the last character of the user input word is the same as the next character
|
||||||
of the output word, and also all of characters of the user input are matched
|
of the output word, and also all of characters of the user input are matched
|
||||||
to the output word, we'll promote that word a bit because
|
to the output word, we'll promote that word a bit because
|
||||||
that word can be considered the combination of skipped and matched characters.
|
that word can be considered the combination of skipped and matched characters.
|
||||||
This means that the 'sm' pattern wins over the 'ma' pattern.
|
This means that the 'sm' pattern wins over the 'ma' pattern.
|
||||||
e.g.)
|
e.g.)
|
||||||
shel -> shell [mmmma] or [mmmsm]
|
shel -> shell [mmmma] or [mmmsm]
|
||||||
hel -> hello [mmmaa] or [mmsma]
|
hel -> hello [mmmaa] or [mmsma]
|
||||||
m ... matching
|
m ... matching
|
||||||
s ... skipping
|
s ... skipping
|
||||||
a ... traversing all
|
a ... traversing all
|
||||||
*/
|
*/
|
||||||
if (matchCount == inputLength && matchCount >= 2 && !skipped
|
if (matchCount == inputLength && matchCount >= 2 && !skipped
|
||||||
&& word[matchCount] == word[matchCount - 1]) {
|
&& word[matchCount] == word[matchCount - 1]) {
|
||||||
multiplyRate(WORDS_WITH_MATCH_SKIP_PROMOTION_RATE, &finalFreq);
|
multiplyRate(WORDS_WITH_MATCH_SKIP_PROMOTION_RATE, &finalFreq);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (sameLength) {
|
||||||
|
multiplyIntCapped(fullWordMultiplier, &finalFreq);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (DEBUG_DICT_FULL) {
|
||||||
|
LOGI("calc: %d, %d", outputIndex, sameLength);
|
||||||
|
}
|
||||||
|
|
||||||
return finalFreq;
|
return finalFreq;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -139,8 +139,7 @@ private:
|
||||||
class RankingAlgorithm {
|
class RankingAlgorithm {
|
||||||
public:
|
public:
|
||||||
static int calculateFinalFreq(const int inputIndex, const int depth,
|
static int calculateFinalFreq(const int inputIndex, const int depth,
|
||||||
const int freq, const bool sameLength, int *editDistanceTable,
|
const int freq, int *editDistanceTable, const Correction* correction);
|
||||||
const Correction* correction);
|
|
||||||
static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
|
static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
|
||||||
const Correction* correction);
|
const Correction* correction);
|
||||||
};
|
};
|
||||||
|
|
|
@ -177,6 +177,8 @@ static void dumpWord(const unsigned short* word, const int length) {
|
||||||
#define FULL_MATCHED_WORDS_PROMOTION_RATE 120
|
#define FULL_MATCHED_WORDS_PROMOTION_RATE 120
|
||||||
#define WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE 90
|
#define WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE 90
|
||||||
#define WORDS_WITH_MATCH_SKIP_PROMOTION_RATE 105
|
#define WORDS_WITH_MATCH_SKIP_PROMOTION_RATE 105
|
||||||
|
#define WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE 160
|
||||||
|
#define CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE 42
|
||||||
|
|
||||||
// This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
|
// This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
|
||||||
// This is only used for the size of array. Not to be used in c functions.
|
// This is only used for the size of array. Not to be used in c functions.
|
||||||
|
@ -194,5 +196,6 @@ static void dumpWord(const unsigned short* word, const int length) {
|
||||||
#define MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION 3
|
#define MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION 3
|
||||||
|
|
||||||
#define min(a,b) ((a)<(b)?(a):(b))
|
#define min(a,b) ((a)<(b)?(a):(b))
|
||||||
|
#define max(a,b) ((a)>(b)?(a):(b))
|
||||||
|
|
||||||
#endif // LATINIME_DEFINES_H
|
#endif // LATINIME_DEFINES_H
|
||||||
|
|
Loading…
Reference in New Issue