Improve fat finger correction.

Change-Id: I37ed0dc6956f7e6fab5dcfd0483ab5691cd819d4
This commit is contained in:
satok 2011-08-15 22:30:33 +09:00
parent 77838275ca
commit bcac0e9e23
3 changed files with 103 additions and 105 deletions

View file

@ -95,10 +95,8 @@ int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLen
}
*word = mWord;
const bool sameLength = (mExcessivePos == mInputLength - 1) ? (mInputLength == inputIndex + 2)
: (mInputLength == inputIndex + 1);
return Correction::RankingAlgorithm::calculateFinalFreq(
inputIndex, outputIndex, freq, sameLength, mEditDistanceTable, this);
inputIndex, outputIndex, freq, mEditDistanceTable, this);
}
bool Correction::initProcessState(const int outputIndex) {
@ -205,20 +203,6 @@ Correction::CorrectionType Correction::processCharAndCalcState(
}
if (mNeedsToTraverseAllNodes || isQuote(c)) {
const bool checkProximityChars =
!(mSkippedCount > 0 || mExcessivePos >= 0 || mTransposedPos >= 0);
// Note: This logic tries saving cases like contrst --> contrast -- "a" is one of
// proximity chars of "s", but it should rather be handled as a skipped char.
if (checkProximityChars
&& mInputIndex > 0
&& mCorrectionStates[mOutputIndex].mProximityMatching
&& mCorrectionStates[mOutputIndex].mSkipping
&& mProximityInfo->getMatchedProximityId(
mInputIndex - 1, c, false)
== ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR) {
++mSkippedCount;
--mProximityCount;
}
return processSkipChar(c, isTerminal);
} else {
int inputIndexForProximity = mInputIndex;
@ -250,6 +234,8 @@ Correction::CorrectionType Correction::processCharAndCalcState(
&& mProximityInfo->getMatchedProximityId(
inputIndexForProximity - 1, c, false)
== ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR) {
// Note: This logic tries saving cases like contrst --> contrast -- "a" is one of
// proximity chars of "s", but it should rather be handled as a skipped char.
++mSkippedCount;
--mProximityCount;
return processSkipChar(c, isTerminal);
@ -344,6 +330,16 @@ inline static void multiplyRate(const int rate, int *freq) {
}
}
inline static int getQuoteCount(const unsigned short* word, const int length) {
int quoteCount = 0;
for (int i = 0; i < length; ++i) {
if(word[i] == '\'') {
++quoteCount;
}
}
return quoteCount;
}
/* static */
inline static int editDistance(
int* editDistanceTable, const unsigned short* input,
@ -392,8 +388,7 @@ inline static int editDistance(
/* static */
int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const int outputIndex,
const int freq, const bool sameLength, int* editDistanceTable,
const Correction* correction) {
const int freq, int* editDistanceTable, const Correction* correction) {
const int excessivePos = correction->getExcessivePos();
const int transposedPos = correction->getTransposedPos();
const int inputLength = correction->mInputLength;
@ -402,6 +397,12 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
const ProximityInfo *proximityInfo = correction->mProximityInfo;
const int skipCount = correction->mSkippedCount;
const int proximityMatchedCount = correction->mProximityCount;
if (skipCount >= inputLength || inputLength == 0) {
return -1;
}
const bool sameLength = (excessivePos == inputLength - 1) ? (inputLength == inputIndex + 2)
: (inputLength == inputIndex + 1);
// TODO: use mExcessiveCount
int matchCount = inputLength - correction->mProximityCount - (excessivePos >= 0 ? 1 : 0);
@ -409,67 +410,52 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
const unsigned short* word = correction->mWord;
const bool skipped = skipCount > 0;
// ----- TODO: use edit distance here as follows? ---------------------- /
//if (!skipped && excessivePos < 0 && transposedPos < 0) {
// const int ed = editDistance(dp, proximityInfo->getInputWord(),
// inputLength, word, outputIndex + 1);
// matchCount = outputIndex + 1 - ed;
// if (ed == 1 && !sameLength) ++matchCount;
//}
// const int ed = editDistance(dp, proximityInfo->getInputWord(),
// inputLength, word, outputIndex + 1);
// if (ed == 1 && !sameLength) ++matchCount; ------------------------ /
int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount);
const int quoteDiffCount = max(0, getQuoteCount(word, outputIndex + 1)
- getQuoteCount(proximityInfo->getPrimaryInputWord(), inputLength));
// TODO: Calculate edit distance for transposed and excessive
int matchWeight;
int ed = 0;
int adJustedProximityMatchedCount = proximityMatchedCount;
if (excessivePos < 0 && transposedPos < 0 && (proximityMatchedCount > 0 || skipped)) {
const unsigned short* primaryInputWord = proximityInfo->getPrimaryInputWord();
ed = editDistance(editDistanceTable, primaryInputWord,
inputLength, word, outputIndex + 1);
matchWeight = powerIntCapped(typedLetterMultiplier, outputIndex + 1 - ed);
if (ed == 1 && inputLength == outputIndex) {
// Promote a word with just one skipped char
multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE, &matchWeight);
}
ed = max(0, ed - quoteDiffCount);
adJustedProximityMatchedCount = min(max(0, ed - (outputIndex + 1 - inputLength)),
proximityMatchedCount);
} else {
matchWeight = powerIntCapped(typedLetterMultiplier, matchCount);
}
// TODO: Demote by edit distance
int finalFreq = freq * matchWeight;
// +1 +11/-12
/*if (inputLength == outputIndex && !skipped && excessivePos < 0 && transposedPos < 0) {
const int ed = editDistance(dp, proximityInfo->getInputWord(),
inputLength, word, outputIndex + 1);
if (ed == 1) {
multiplyRate(160, &finalFreq);
}
}*/
if (inputLength == outputIndex && excessivePos < 0 && transposedPos < 0
&& (proximityMatchedCount > 0 || skipped)) {
const int ed = editDistance(editDistanceTable, proximityInfo->getPrimaryInputWord(),
inputLength, word, outputIndex + 1);
if (ed == 1) {
multiplyRate(160, &finalFreq);
}
}
// TODO: Promote properly?
//if (skipCount == 1 && excessivePos < 0 && transposedPos < 0 && inputLength == outputIndex
// && !sameLength) {
// multiplyRate(150, &finalFreq);
//}
//if (skipCount == 0 && excessivePos < 0 && transposedPos < 0 && inputLength == outputIndex
// && !sameLength) {
// multiplyRate(150, &finalFreq);
//}
//if (skipCount == 0 && excessivePos < 0 && transposedPos < 0
// && inputLength == outputIndex + 1) {
// multiplyRate(150, &finalFreq);
//}
///////////////////////////////////////////////
// Promotion and Demotion for each correction
// Demotion for a word with missing character
if (skipped) {
if (inputLength >= 2) {
const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE
* (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)
/ (10 * inputLength
- WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);
if (DEBUG_DICT_FULL) {
LOGI("Demotion rate for missing character is %d.", demotionRate);
}
multiplyRate(demotionRate, &finalFreq);
} else {
finalFreq = 0;
const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE
* (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)
/ (10 * inputLength
- WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);
if (DEBUG_DICT_FULL) {
LOGI("Demotion rate for missing character is %d.", demotionRate);
}
multiplyRate(demotionRate, &finalFreq);
}
// Demotion for a word with transposed character
if (transposedPos >= 0) multiplyRate(
WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
// Demotion for a word with excessive character
if (excessivePos >= 0) {
multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);
if (!proximityInfo->existsAdjacentProximityChars(inputIndex)) {
@ -478,52 +464,62 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);
}
}
int lengthFreq = typedLetterMultiplier;
multiplyIntCapped(powerIntCapped(typedLetterMultiplier, outputIndex), &lengthFreq);
if ((outputIndex + 1) == matchCount) {
// Full exact match
if (outputIndex > 1) {
if (DEBUG_DICT) {
LOGI("Found full matched word.");
}
multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
}
if (sameLength && transposedPos < 0 && !skipped && excessivePos < 0) {
finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
}
} else if (sameLength && transposedPos < 0 && !skipped && excessivePos < 0
&& outputIndex > 0) {
// Promotion for a word with proximity characters
for (int i = 0; i < adJustedProximityMatchedCount; ++i) {
// A word with proximity corrections
if (DEBUG_DICT) {
LOGI("Found one proximity correction.");
if (DEBUG_DICT_FULL) {
LOGI("Found a proximity correction.");
}
multiplyIntCapped(typedLetterMultiplier, &finalFreq);
multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
}
if (DEBUG_DICT_FULL) {
LOGI("calc: %d, %d", outputIndex, sameLength);
}
if (sameLength) multiplyIntCapped(fullWordMultiplier, &finalFreq);
// TODO: check excessive count and transposed count
const int errorCount = proximityMatchedCount + skipCount;
multiplyRate(
100 - CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE * errorCount / inputLength, &finalFreq);
// Promotion for an exactly matched word
if (matchCount == outputIndex + 1) {
// Full exact match
if (sameLength && transposedPos < 0 && !skipped && excessivePos < 0) {
finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
}
}
// Promote a word with no correction
if (proximityMatchedCount == 0 && transposedPos < 0 && !skipped && excessivePos < 0) {
multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
}
// TODO: Check excessive count and transposed count
// TODO: Remove this if possible
/*
If the last character of the user input word is the same as the next character
of the output word, and also all of characters of the user input are matched
to the output word, we'll promote that word a bit because
that word can be considered the combination of skipped and matched characters.
This means that the 'sm' pattern wins over the 'ma' pattern.
e.g.)
shel -> shell [mmmma] or [mmmsm]
hel -> hello [mmmaa] or [mmsma]
m ... matching
s ... skipping
a ... traversing all
If the last character of the user input word is the same as the next character
of the output word, and also all of characters of the user input are matched
to the output word, we'll promote that word a bit because
that word can be considered the combination of skipped and matched characters.
This means that the 'sm' pattern wins over the 'ma' pattern.
e.g.)
shel -> shell [mmmma] or [mmmsm]
hel -> hello [mmmaa] or [mmsma]
m ... matching
s ... skipping
a ... traversing all
*/
if (matchCount == inputLength && matchCount >= 2 && !skipped
&& word[matchCount] == word[matchCount - 1]) {
multiplyRate(WORDS_WITH_MATCH_SKIP_PROMOTION_RATE, &finalFreq);
}
if (sameLength) {
multiplyIntCapped(fullWordMultiplier, &finalFreq);
}
if (DEBUG_DICT_FULL) {
LOGI("calc: %d, %d", outputIndex, sameLength);
}
return finalFreq;
}

View file

@ -139,8 +139,7 @@ private:
class RankingAlgorithm {
public:
static int calculateFinalFreq(const int inputIndex, const int depth,
const int freq, const bool sameLength, int *editDistanceTable,
const Correction* correction);
const int freq, int *editDistanceTable, const Correction* correction);
static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
const Correction* correction);
};

View file

@ -177,6 +177,8 @@ static void dumpWord(const unsigned short* word, const int length) {
#define FULL_MATCHED_WORDS_PROMOTION_RATE 120
#define WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE 90
#define WORDS_WITH_MATCH_SKIP_PROMOTION_RATE 105
#define WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE 160
#define CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE 42
// This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
// This is only used for the size of array. Not to be used in c functions.
@ -194,5 +196,6 @@ static void dumpWord(const unsigned short* word, const int length) {
#define MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION 3
#define min(a,b) ((a)<(b)?(a):(b))
#define max(a,b) ((a)>(b)?(a):(b))
#endif // LATINIME_DEFINES_H