From bcac0e9e23853891a5a45fd19b6f8917ffc705f7 Mon Sep 17 00:00:00 2001 From: satok Date: Mon, 15 Aug 2011 22:30:33 +0900 Subject: [PATCH] Improve fat finger correction. Change-Id: I37ed0dc6956f7e6fab5dcfd0483ab5691cd819d4 --- native/src/correction.cpp | 202 +++++++++++++++++++------------------- native/src/correction.h | 3 +- native/src/defines.h | 3 + 3 files changed, 103 insertions(+), 105 deletions(-) diff --git a/native/src/correction.cpp b/native/src/correction.cpp index a4090a966..99412b211 100644 --- a/native/src/correction.cpp +++ b/native/src/correction.cpp @@ -95,10 +95,8 @@ int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLen } *word = mWord; - const bool sameLength = (mExcessivePos == mInputLength - 1) ? (mInputLength == inputIndex + 2) - : (mInputLength == inputIndex + 1); return Correction::RankingAlgorithm::calculateFinalFreq( - inputIndex, outputIndex, freq, sameLength, mEditDistanceTable, this); + inputIndex, outputIndex, freq, mEditDistanceTable, this); } bool Correction::initProcessState(const int outputIndex) { @@ -205,20 +203,6 @@ Correction::CorrectionType Correction::processCharAndCalcState( } if (mNeedsToTraverseAllNodes || isQuote(c)) { - const bool checkProximityChars = - !(mSkippedCount > 0 || mExcessivePos >= 0 || mTransposedPos >= 0); - // Note: This logic tries saving cases like contrst --> contrast -- "a" is one of - // proximity chars of "s", but it should rather be handled as a skipped char. - if (checkProximityChars - && mInputIndex > 0 - && mCorrectionStates[mOutputIndex].mProximityMatching - && mCorrectionStates[mOutputIndex].mSkipping - && mProximityInfo->getMatchedProximityId( - mInputIndex - 1, c, false) - == ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR) { - ++mSkippedCount; - --mProximityCount; - } return processSkipChar(c, isTerminal); } else { int inputIndexForProximity = mInputIndex; @@ -250,6 +234,8 @@ Correction::CorrectionType Correction::processCharAndCalcState( && mProximityInfo->getMatchedProximityId( inputIndexForProximity - 1, c, false) == ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR) { + // Note: This logic tries saving cases like contrst --> contrast -- "a" is one of + // proximity chars of "s", but it should rather be handled as a skipped char. ++mSkippedCount; --mProximityCount; return processSkipChar(c, isTerminal); @@ -344,6 +330,16 @@ inline static void multiplyRate(const int rate, int *freq) { } } +inline static int getQuoteCount(const unsigned short* word, const int length) { + int quoteCount = 0; + for (int i = 0; i < length; ++i) { + if(word[i] == '\'') { + ++quoteCount; + } + } + return quoteCount; +} + /* static */ inline static int editDistance( int* editDistanceTable, const unsigned short* input, @@ -392,8 +388,7 @@ inline static int editDistance( /* static */ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const int outputIndex, - const int freq, const bool sameLength, int* editDistanceTable, - const Correction* correction) { + const int freq, int* editDistanceTable, const Correction* correction) { const int excessivePos = correction->getExcessivePos(); const int transposedPos = correction->getTransposedPos(); const int inputLength = correction->mInputLength; @@ -402,6 +397,12 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const const ProximityInfo *proximityInfo = correction->mProximityInfo; const int skipCount = correction->mSkippedCount; const int proximityMatchedCount = correction->mProximityCount; + if (skipCount >= inputLength || inputLength == 0) { + return -1; + } + const bool sameLength = (excessivePos == inputLength - 1) ? (inputLength == inputIndex + 2) + : (inputLength == inputIndex + 1); + // TODO: use mExcessiveCount int matchCount = inputLength - correction->mProximityCount - (excessivePos >= 0 ? 1 : 0); @@ -409,67 +410,52 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const const unsigned short* word = correction->mWord; const bool skipped = skipCount > 0; - // ----- TODO: use edit distance here as follows? ---------------------- / - //if (!skipped && excessivePos < 0 && transposedPos < 0) { - // const int ed = editDistance(dp, proximityInfo->getInputWord(), - // inputLength, word, outputIndex + 1); - // matchCount = outputIndex + 1 - ed; - // if (ed == 1 && !sameLength) ++matchCount; - //} - // const int ed = editDistance(dp, proximityInfo->getInputWord(), - // inputLength, word, outputIndex + 1); - // if (ed == 1 && !sameLength) ++matchCount; ------------------------ / - int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount); + const int quoteDiffCount = max(0, getQuoteCount(word, outputIndex + 1) + - getQuoteCount(proximityInfo->getPrimaryInputWord(), inputLength)); + + // TODO: Calculate edit distance for transposed and excessive + int matchWeight; + int ed = 0; + int adJustedProximityMatchedCount = proximityMatchedCount; + if (excessivePos < 0 && transposedPos < 0 && (proximityMatchedCount > 0 || skipped)) { + const unsigned short* primaryInputWord = proximityInfo->getPrimaryInputWord(); + ed = editDistance(editDistanceTable, primaryInputWord, + inputLength, word, outputIndex + 1); + matchWeight = powerIntCapped(typedLetterMultiplier, outputIndex + 1 - ed); + if (ed == 1 && inputLength == outputIndex) { + // Promote a word with just one skipped char + multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE, &matchWeight); + } + ed = max(0, ed - quoteDiffCount); + adJustedProximityMatchedCount = min(max(0, ed - (outputIndex + 1 - inputLength)), + proximityMatchedCount); + } else { + matchWeight = powerIntCapped(typedLetterMultiplier, matchCount); + } // TODO: Demote by edit distance int finalFreq = freq * matchWeight; - // +1 +11/-12 - /*if (inputLength == outputIndex && !skipped && excessivePos < 0 && transposedPos < 0) { - const int ed = editDistance(dp, proximityInfo->getInputWord(), - inputLength, word, outputIndex + 1); - if (ed == 1) { - multiplyRate(160, &finalFreq); - } - }*/ - if (inputLength == outputIndex && excessivePos < 0 && transposedPos < 0 - && (proximityMatchedCount > 0 || skipped)) { - const int ed = editDistance(editDistanceTable, proximityInfo->getPrimaryInputWord(), - inputLength, word, outputIndex + 1); - if (ed == 1) { - multiplyRate(160, &finalFreq); - } - } - // TODO: Promote properly? - //if (skipCount == 1 && excessivePos < 0 && transposedPos < 0 && inputLength == outputIndex - // && !sameLength) { - // multiplyRate(150, &finalFreq); - //} - //if (skipCount == 0 && excessivePos < 0 && transposedPos < 0 && inputLength == outputIndex - // && !sameLength) { - // multiplyRate(150, &finalFreq); - //} - //if (skipCount == 0 && excessivePos < 0 && transposedPos < 0 - // && inputLength == outputIndex + 1) { - // multiplyRate(150, &finalFreq); - //} + /////////////////////////////////////////////// + // Promotion and Demotion for each correction + // Demotion for a word with missing character if (skipped) { - if (inputLength >= 2) { - const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE - * (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X) - / (10 * inputLength - - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10); - if (DEBUG_DICT_FULL) { - LOGI("Demotion rate for missing character is %d.", demotionRate); - } - multiplyRate(demotionRate, &finalFreq); - } else { - finalFreq = 0; + const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE + * (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X) + / (10 * inputLength + - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10); + if (DEBUG_DICT_FULL) { + LOGI("Demotion rate for missing character is %d.", demotionRate); } + multiplyRate(demotionRate, &finalFreq); } + + // Demotion for a word with transposed character if (transposedPos >= 0) multiplyRate( WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq); + + // Demotion for a word with excessive character if (excessivePos >= 0) { multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq); if (!proximityInfo->existsAdjacentProximityChars(inputIndex)) { @@ -478,52 +464,62 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq); } } - int lengthFreq = typedLetterMultiplier; - multiplyIntCapped(powerIntCapped(typedLetterMultiplier, outputIndex), &lengthFreq); - if ((outputIndex + 1) == matchCount) { - // Full exact match - if (outputIndex > 1) { - if (DEBUG_DICT) { - LOGI("Found full matched word."); - } - multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq); - } - if (sameLength && transposedPos < 0 && !skipped && excessivePos < 0) { - finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq); - } - } else if (sameLength && transposedPos < 0 && !skipped && excessivePos < 0 - && outputIndex > 0) { + + // Promotion for a word with proximity characters + for (int i = 0; i < adJustedProximityMatchedCount; ++i) { // A word with proximity corrections - if (DEBUG_DICT) { - LOGI("Found one proximity correction."); + if (DEBUG_DICT_FULL) { + LOGI("Found a proximity correction."); } multiplyIntCapped(typedLetterMultiplier, &finalFreq); multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); } - if (DEBUG_DICT_FULL) { - LOGI("calc: %d, %d", outputIndex, sameLength); - } - if (sameLength) multiplyIntCapped(fullWordMultiplier, &finalFreq); - // TODO: check excessive count and transposed count + const int errorCount = proximityMatchedCount + skipCount; + multiplyRate( + 100 - CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE * errorCount / inputLength, &finalFreq); + + // Promotion for an exactly matched word + if (matchCount == outputIndex + 1) { + // Full exact match + if (sameLength && transposedPos < 0 && !skipped && excessivePos < 0) { + finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq); + } + } + + // Promote a word with no correction + if (proximityMatchedCount == 0 && transposedPos < 0 && !skipped && excessivePos < 0) { + multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq); + } + + // TODO: Check excessive count and transposed count + // TODO: Remove this if possible /* - If the last character of the user input word is the same as the next character - of the output word, and also all of characters of the user input are matched - to the output word, we'll promote that word a bit because - that word can be considered the combination of skipped and matched characters. - This means that the 'sm' pattern wins over the 'ma' pattern. - e.g.) - shel -> shell [mmmma] or [mmmsm] - hel -> hello [mmmaa] or [mmsma] - m ... matching - s ... skipping - a ... traversing all + If the last character of the user input word is the same as the next character + of the output word, and also all of characters of the user input are matched + to the output word, we'll promote that word a bit because + that word can be considered the combination of skipped and matched characters. + This means that the 'sm' pattern wins over the 'ma' pattern. + e.g.) + shel -> shell [mmmma] or [mmmsm] + hel -> hello [mmmaa] or [mmsma] + m ... matching + s ... skipping + a ... traversing all */ if (matchCount == inputLength && matchCount >= 2 && !skipped && word[matchCount] == word[matchCount - 1]) { multiplyRate(WORDS_WITH_MATCH_SKIP_PROMOTION_RATE, &finalFreq); } + if (sameLength) { + multiplyIntCapped(fullWordMultiplier, &finalFreq); + } + + if (DEBUG_DICT_FULL) { + LOGI("calc: %d, %d", outputIndex, sameLength); + } + return finalFreq; } diff --git a/native/src/correction.h b/native/src/correction.h index 9d385a44e..871a04251 100644 --- a/native/src/correction.h +++ b/native/src/correction.h @@ -139,8 +139,7 @@ private: class RankingAlgorithm { public: static int calculateFinalFreq(const int inputIndex, const int depth, - const int freq, const bool sameLength, int *editDistanceTable, - const Correction* correction); + const int freq, int *editDistanceTable, const Correction* correction); static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq, const Correction* correction); }; diff --git a/native/src/defines.h b/native/src/defines.h index c1d08e695..a29fb7e5b 100644 --- a/native/src/defines.h +++ b/native/src/defines.h @@ -177,6 +177,8 @@ static void dumpWord(const unsigned short* word, const int length) { #define FULL_MATCHED_WORDS_PROMOTION_RATE 120 #define WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE 90 #define WORDS_WITH_MATCH_SKIP_PROMOTION_RATE 105 +#define WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE 160 +#define CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE 42 // This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java // This is only used for the size of array. Not to be used in c functions. @@ -194,5 +196,6 @@ static void dumpWord(const unsigned short* word, const int length) { #define MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION 3 #define min(a,b) ((a)<(b)?(a):(b)) +#define max(a,b) ((a)>(b)?(a):(b)) #endif // LATINIME_DEFINES_H