Merge "Improve fat finger correction."

main
satok 2011-08-16 09:21:46 -07:00 committed by Android (Google) Code Review
commit 1d66cc1be6
3 changed files with 103 additions and 105 deletions

View File

@ -95,10 +95,8 @@ int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLen
} }
*word = mWord; *word = mWord;
const bool sameLength = (mExcessivePos == mInputLength - 1) ? (mInputLength == inputIndex + 2)
: (mInputLength == inputIndex + 1);
return Correction::RankingAlgorithm::calculateFinalFreq( return Correction::RankingAlgorithm::calculateFinalFreq(
inputIndex, outputIndex, freq, sameLength, mEditDistanceTable, this); inputIndex, outputIndex, freq, mEditDistanceTable, this);
} }
bool Correction::initProcessState(const int outputIndex) { bool Correction::initProcessState(const int outputIndex) {
@ -205,20 +203,6 @@ Correction::CorrectionType Correction::processCharAndCalcState(
} }
if (mNeedsToTraverseAllNodes || isQuote(c)) { if (mNeedsToTraverseAllNodes || isQuote(c)) {
const bool checkProximityChars =
!(mSkippedCount > 0 || mExcessivePos >= 0 || mTransposedPos >= 0);
// Note: This logic tries saving cases like contrst --> contrast -- "a" is one of
// proximity chars of "s", but it should rather be handled as a skipped char.
if (checkProximityChars
&& mInputIndex > 0
&& mCorrectionStates[mOutputIndex].mProximityMatching
&& mCorrectionStates[mOutputIndex].mSkipping
&& mProximityInfo->getMatchedProximityId(
mInputIndex - 1, c, false)
== ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR) {
++mSkippedCount;
--mProximityCount;
}
return processSkipChar(c, isTerminal); return processSkipChar(c, isTerminal);
} else { } else {
int inputIndexForProximity = mInputIndex; int inputIndexForProximity = mInputIndex;
@ -250,6 +234,8 @@ Correction::CorrectionType Correction::processCharAndCalcState(
&& mProximityInfo->getMatchedProximityId( && mProximityInfo->getMatchedProximityId(
inputIndexForProximity - 1, c, false) inputIndexForProximity - 1, c, false)
== ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR) { == ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR) {
// Note: This logic tries saving cases like contrst --> contrast -- "a" is one of
// proximity chars of "s", but it should rather be handled as a skipped char.
++mSkippedCount; ++mSkippedCount;
--mProximityCount; --mProximityCount;
return processSkipChar(c, isTerminal); return processSkipChar(c, isTerminal);
@ -344,6 +330,16 @@ inline static void multiplyRate(const int rate, int *freq) {
} }
} }
// Counts the single-quote (apostrophe) code units found in the first
// `length` entries of `word`. A non-positive `length` yields 0.
inline static int getQuoteCount(const unsigned short* word, const int length) {
    int total = 0;
    // Walk the buffer with a countdown so no index variable is needed.
    for (int remaining = length; remaining > 0; --remaining, ++word) {
        total += (*word == '\'') ? 1 : 0;
    }
    return total;
}
/* static */ /* static */
inline static int editDistance( inline static int editDistance(
int* editDistanceTable, const unsigned short* input, int* editDistanceTable, const unsigned short* input,
@ -392,8 +388,7 @@ inline static int editDistance(
/* static */ /* static */
int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const int outputIndex, int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const int outputIndex,
const int freq, const bool sameLength, int* editDistanceTable, const int freq, int* editDistanceTable, const Correction* correction) {
const Correction* correction) {
const int excessivePos = correction->getExcessivePos(); const int excessivePos = correction->getExcessivePos();
const int transposedPos = correction->getTransposedPos(); const int transposedPos = correction->getTransposedPos();
const int inputLength = correction->mInputLength; const int inputLength = correction->mInputLength;
@ -402,6 +397,12 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
const ProximityInfo *proximityInfo = correction->mProximityInfo; const ProximityInfo *proximityInfo = correction->mProximityInfo;
const int skipCount = correction->mSkippedCount; const int skipCount = correction->mSkippedCount;
const int proximityMatchedCount = correction->mProximityCount; const int proximityMatchedCount = correction->mProximityCount;
if (skipCount >= inputLength || inputLength == 0) {
return -1;
}
const bool sameLength = (excessivePos == inputLength - 1) ? (inputLength == inputIndex + 2)
: (inputLength == inputIndex + 1);
// TODO: use mExcessiveCount // TODO: use mExcessiveCount
int matchCount = inputLength - correction->mProximityCount - (excessivePos >= 0 ? 1 : 0); int matchCount = inputLength - correction->mProximityCount - (excessivePos >= 0 ? 1 : 0);
@ -409,67 +410,52 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
const unsigned short* word = correction->mWord; const unsigned short* word = correction->mWord;
const bool skipped = skipCount > 0; const bool skipped = skipCount > 0;
// ----- TODO: use edit distance here as follows? ---------------------- / const int quoteDiffCount = max(0, getQuoteCount(word, outputIndex + 1)
//if (!skipped && excessivePos < 0 && transposedPos < 0) { - getQuoteCount(proximityInfo->getPrimaryInputWord(), inputLength));
// const int ed = editDistance(dp, proximityInfo->getInputWord(),
// inputLength, word, outputIndex + 1); // TODO: Calculate edit distance for transposed and excessive
// matchCount = outputIndex + 1 - ed; int matchWeight;
// if (ed == 1 && !sameLength) ++matchCount; int ed = 0;
//} int adJustedProximityMatchedCount = proximityMatchedCount;
// const int ed = editDistance(dp, proximityInfo->getInputWord(), if (excessivePos < 0 && transposedPos < 0 && (proximityMatchedCount > 0 || skipped)) {
// inputLength, word, outputIndex + 1); const unsigned short* primaryInputWord = proximityInfo->getPrimaryInputWord();
// if (ed == 1 && !sameLength) ++matchCount; ------------------------ / ed = editDistance(editDistanceTable, primaryInputWord,
int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount); inputLength, word, outputIndex + 1);
matchWeight = powerIntCapped(typedLetterMultiplier, outputIndex + 1 - ed);
if (ed == 1 && inputLength == outputIndex) {
// Promote a word with just one skipped char
multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE, &matchWeight);
}
ed = max(0, ed - quoteDiffCount);
adJustedProximityMatchedCount = min(max(0, ed - (outputIndex + 1 - inputLength)),
proximityMatchedCount);
} else {
matchWeight = powerIntCapped(typedLetterMultiplier, matchCount);
}
// TODO: Demote by edit distance // TODO: Demote by edit distance
int finalFreq = freq * matchWeight; int finalFreq = freq * matchWeight;
// +1 +11/-12
/*if (inputLength == outputIndex && !skipped && excessivePos < 0 && transposedPos < 0) {
const int ed = editDistance(dp, proximityInfo->getInputWord(),
inputLength, word, outputIndex + 1);
if (ed == 1) {
multiplyRate(160, &finalFreq);
}
}*/
if (inputLength == outputIndex && excessivePos < 0 && transposedPos < 0
&& (proximityMatchedCount > 0 || skipped)) {
const int ed = editDistance(editDistanceTable, proximityInfo->getPrimaryInputWord(),
inputLength, word, outputIndex + 1);
if (ed == 1) {
multiplyRate(160, &finalFreq);
}
}
// TODO: Promote properly? ///////////////////////////////////////////////
//if (skipCount == 1 && excessivePos < 0 && transposedPos < 0 && inputLength == outputIndex // Promotion and Demotion for each correction
// && !sameLength) {
// multiplyRate(150, &finalFreq);
//}
//if (skipCount == 0 && excessivePos < 0 && transposedPos < 0 && inputLength == outputIndex
// && !sameLength) {
// multiplyRate(150, &finalFreq);
//}
//if (skipCount == 0 && excessivePos < 0 && transposedPos < 0
// && inputLength == outputIndex + 1) {
// multiplyRate(150, &finalFreq);
//}
// Demotion for a word with missing character
if (skipped) { if (skipped) {
if (inputLength >= 2) { const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE
const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE * (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)
* (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X) / (10 * inputLength
/ (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);
- WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10); if (DEBUG_DICT_FULL) {
if (DEBUG_DICT_FULL) { LOGI("Demotion rate for missing character is %d.", demotionRate);
LOGI("Demotion rate for missing character is %d.", demotionRate);
}
multiplyRate(demotionRate, &finalFreq);
} else {
finalFreq = 0;
} }
multiplyRate(demotionRate, &finalFreq);
} }
// Demotion for a word with transposed character
if (transposedPos >= 0) multiplyRate( if (transposedPos >= 0) multiplyRate(
WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq); WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
// Demotion for a word with excessive character
if (excessivePos >= 0) { if (excessivePos >= 0) {
multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq); multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);
if (!proximityInfo->existsAdjacentProximityChars(inputIndex)) { if (!proximityInfo->existsAdjacentProximityChars(inputIndex)) {
@ -478,52 +464,62 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq); multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);
} }
} }
int lengthFreq = typedLetterMultiplier;
multiplyIntCapped(powerIntCapped(typedLetterMultiplier, outputIndex), &lengthFreq); // Promotion for a word with proximity characters
if ((outputIndex + 1) == matchCount) { for (int i = 0; i < adJustedProximityMatchedCount; ++i) {
// Full exact match
if (outputIndex > 1) {
if (DEBUG_DICT) {
LOGI("Found full matched word.");
}
multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
}
if (sameLength && transposedPos < 0 && !skipped && excessivePos < 0) {
finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
}
} else if (sameLength && transposedPos < 0 && !skipped && excessivePos < 0
&& outputIndex > 0) {
// A word with proximity corrections // A word with proximity corrections
if (DEBUG_DICT) { if (DEBUG_DICT_FULL) {
LOGI("Found one proximity correction."); LOGI("Found a proximity correction.");
} }
multiplyIntCapped(typedLetterMultiplier, &finalFreq); multiplyIntCapped(typedLetterMultiplier, &finalFreq);
multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
} }
if (DEBUG_DICT_FULL) {
LOGI("calc: %d, %d", outputIndex, sameLength);
}
if (sameLength) multiplyIntCapped(fullWordMultiplier, &finalFreq);
// TODO: check excessive count and transposed count const int errorCount = proximityMatchedCount + skipCount;
multiplyRate(
100 - CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE * errorCount / inputLength, &finalFreq);
// Promotion for an exactly matched word
if (matchCount == outputIndex + 1) {
// Full exact match
if (sameLength && transposedPos < 0 && !skipped && excessivePos < 0) {
finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
}
}
// Promote a word with no correction
if (proximityMatchedCount == 0 && transposedPos < 0 && !skipped && excessivePos < 0) {
multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
}
// TODO: Check excessive count and transposed count
// TODO: Remove this if possible
/* /*
If the last character of the user input word is the same as the next character If the last character of the user input word is the same as the next character
of the output word, and also all of characters of the user input are matched of the output word, and also all of characters of the user input are matched
to the output word, we'll promote that word a bit because to the output word, we'll promote that word a bit because
that word can be considered the combination of skipped and matched characters. that word can be considered the combination of skipped and matched characters.
This means that the 'sm' pattern wins over the 'ma' pattern. This means that the 'sm' pattern wins over the 'ma' pattern.
e.g.) e.g.)
shel -> shell [mmmma] or [mmmsm] shel -> shell [mmmma] or [mmmsm]
hel -> hello [mmmaa] or [mmsma] hel -> hello [mmmaa] or [mmsma]
m ... matching m ... matching
s ... skipping s ... skipping
a ... traversing all a ... traversing all
*/ */
if (matchCount == inputLength && matchCount >= 2 && !skipped if (matchCount == inputLength && matchCount >= 2 && !skipped
&& word[matchCount] == word[matchCount - 1]) { && word[matchCount] == word[matchCount - 1]) {
multiplyRate(WORDS_WITH_MATCH_SKIP_PROMOTION_RATE, &finalFreq); multiplyRate(WORDS_WITH_MATCH_SKIP_PROMOTION_RATE, &finalFreq);
} }
if (sameLength) {
multiplyIntCapped(fullWordMultiplier, &finalFreq);
}
if (DEBUG_DICT_FULL) {
LOGI("calc: %d, %d", outputIndex, sameLength);
}
return finalFreq; return finalFreq;
} }

View File

@ -139,8 +139,7 @@ private:
class RankingAlgorithm { class RankingAlgorithm {
public: public:
static int calculateFinalFreq(const int inputIndex, const int depth, static int calculateFinalFreq(const int inputIndex, const int depth,
const int freq, const bool sameLength, int *editDistanceTable, const int freq, int *editDistanceTable, const Correction* correction);
const Correction* correction);
static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq, static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
const Correction* correction); const Correction* correction);
}; };

View File

@ -177,6 +177,8 @@ static void dumpWord(const unsigned short* word, const int length) {
#define FULL_MATCHED_WORDS_PROMOTION_RATE 120 #define FULL_MATCHED_WORDS_PROMOTION_RATE 120
#define WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE 90 #define WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE 90
#define WORDS_WITH_MATCH_SKIP_PROMOTION_RATE 105 #define WORDS_WITH_MATCH_SKIP_PROMOTION_RATE 105
// Rate passed to multiplyRate() on the match weight in calculateFinalFreq when the
// edit distance is exactly 1 and inputLength == outputIndex (one skipped char).
#define WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE 160
// Base factor of the per-error demotion in calculateFinalFreq: the rate applied is
// 100 - BASE * (proximityMatchedCount + skipCount) / inputLength.
#define CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE 42
// This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java // This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
// This is only used for the size of array. Not to be used in c functions. // This is only used for the size of array. Not to be used in c functions.
@ -194,5 +196,6 @@ static void dumpWord(const unsigned short* word, const int length) {
#define MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION 3 #define MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION 3
#define min(a,b) ((a)<(b)?(a):(b)) #define min(a,b) ((a)<(b)?(a):(b))
// Function-like macro yielding the larger of a and b. Each argument appears twice
// in the expansion, so an argument with side effects may be evaluated more than once.
#define max(a,b) ((a)>(b)?(a):(b))
#endif // LATINIME_DEFINES_H #endif // LATINIME_DEFINES_H