Merge "Improve fat finger correction."

2011-08-16 09:21:46 -07:00 · 2011-08-16 09:21:46 -07:00 · 1d66cc1be6
commit 1d66cc1be6
parent c160373b6a bcac0e9e23
3 changed files with 103 additions and 105 deletions
--- a/native/src/correction.cpp
+++ b/native/src/correction.cpp
@@ -95,10 +95,8 @@ int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLen
    }

    *word = mWord;
-    const bool sameLength = (mExcessivePos == mInputLength - 1) ? (mInputLength == inputIndex + 2)
-            : (mInputLength == inputIndex + 1);
    return Correction::RankingAlgorithm::calculateFinalFreq(
-            inputIndex, outputIndex, freq, sameLength, mEditDistanceTable, this);
+            inputIndex, outputIndex, freq, mEditDistanceTable, this);
 }

 bool Correction::initProcessState(const int outputIndex) {
@@ -205,20 +203,6 @@ Correction::CorrectionType Correction::processCharAndCalcState(
    }

    if (mNeedsToTraverseAllNodes || isQuote(c)) {
-        const bool checkProximityChars =
-                !(mSkippedCount > 0 || mExcessivePos >= 0 || mTransposedPos >= 0);
-        // Note: This logic tries saving cases like contrst --> contrast -- "a" is one of
-        // proximity chars of "s", but it should rather be handled as a skipped char.
-        if (checkProximityChars
-                && mInputIndex > 0
-                && mCorrectionStates[mOutputIndex].mProximityMatching
-                && mCorrectionStates[mOutputIndex].mSkipping
-                && mProximityInfo->getMatchedProximityId(
-                        mInputIndex - 1, c, false)
-                        == ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR) {
-            ++mSkippedCount;
-            --mProximityCount;
-        }
        return processSkipChar(c, isTerminal);
    } else {
        int inputIndexForProximity = mInputIndex;
@@ -250,6 +234,8 @@ Correction::CorrectionType Correction::processCharAndCalcState(
                    && mProximityInfo->getMatchedProximityId(
                            inputIndexForProximity - 1, c, false)
                                    == ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR) {
+                // Note: This logic tries to save cases like contrst --> contrast -- "a" is one of
+                // the proximity chars of "s", but it should rather be handled as a skipped char.
                ++mSkippedCount;
                --mProximityCount;
                return processSkipChar(c, isTerminal);
@@ -344,6 +330,16 @@ inline static void multiplyRate(const int rate, int *freq) {
    }
 }

+inline static int getQuoteCount(const unsigned short* word, const int length) {
+    int quoteCount = 0;  // Result: how many apostrophe characters occur in word.
+    for (int i = 0; i < length; ++i) {  // Scans only the first length elements of word.
+        if(word[i] == '\'') {
+            ++quoteCount;
+        }
+    }
+    return quoteCount;  // 0 when length <= 0, since the loop body never runs.
+}
+
 /* static */
 inline static int editDistance(
        int* editDistanceTable, const unsigned short* input,
@@ -392,8 +388,7 @@ inline static int editDistance(

 /* static */
 int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const int outputIndex,
-        const int freq, const bool sameLength, int* editDistanceTable,
-        const Correction* correction) {
+        const int freq, int* editDistanceTable, const Correction* correction) {
    const int excessivePos = correction->getExcessivePos();
    const int transposedPos = correction->getTransposedPos();
    const int inputLength = correction->mInputLength;
@@ -402,6 +397,12 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
    const ProximityInfo *proximityInfo = correction->mProximityInfo;
    const int skipCount = correction->mSkippedCount;
    const int proximityMatchedCount = correction->mProximityCount;
+    if (skipCount >= inputLength || inputLength == 0) {
+        return -1;
+    }
+    const bool sameLength = (excessivePos == inputLength - 1) ? (inputLength == inputIndex + 2)
+            : (inputLength == inputIndex + 1);
+

    // TODO: use mExcessiveCount
    int matchCount = inputLength - correction->mProximityCount - (excessivePos >= 0 ? 1 : 0);
@@ -409,67 +410,52 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
    const unsigned short* word = correction->mWord;
    const bool skipped = skipCount > 0;

-    // ----- TODO: use edit distance here as follows? ---------------------- /
-    //if (!skipped && excessivePos < 0 && transposedPos < 0) {
-    //    const int ed = editDistance(dp, proximityInfo->getInputWord(),
-    //            inputLength, word, outputIndex + 1);
-    //    matchCount = outputIndex + 1 - ed;
-    //    if (ed == 1 && !sameLength) ++matchCount;
-    //}
-    //    const int ed = editDistance(dp, proximityInfo->getInputWord(),
-    //    inputLength, word, outputIndex + 1);
-    //    if (ed == 1 && !sameLength) ++matchCount; ------------------------ /
-    int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount);
+    const int quoteDiffCount = max(0, getQuoteCount(word, outputIndex + 1)
+            - getQuoteCount(proximityInfo->getPrimaryInputWord(), inputLength));
+
+    // TODO: Calculate edit distance for transposed and excessive
+    int matchWeight;
+    int ed = 0;
+    int adJustedProximityMatchedCount = proximityMatchedCount;
+    if (excessivePos < 0 && transposedPos < 0 && (proximityMatchedCount > 0 || skipped)) {
+        const unsigned short* primaryInputWord = proximityInfo->getPrimaryInputWord();
+        ed = editDistance(editDistanceTable, primaryInputWord,
+                inputLength, word, outputIndex + 1);
+        matchWeight = powerIntCapped(typedLetterMultiplier, outputIndex + 1 - ed);
+        if (ed == 1 && inputLength == outputIndex) {
+            // Promote a word with just one skipped char
+            multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE, &matchWeight);
+        }
+        ed = max(0, ed - quoteDiffCount);
+        adJustedProximityMatchedCount = min(max(0, ed - (outputIndex + 1 - inputLength)),
+                proximityMatchedCount);
+    } else {
+        matchWeight = powerIntCapped(typedLetterMultiplier, matchCount);
+    }

    // TODO: Demote by edit distance
    int finalFreq = freq * matchWeight;
-    // +1 +11/-12
-    /*if (inputLength == outputIndex && !skipped && excessivePos < 0 && transposedPos < 0) {
-        const int ed = editDistance(dp, proximityInfo->getInputWord(),
-                inputLength, word, outputIndex + 1);
-        if (ed == 1) {
-            multiplyRate(160, &finalFreq);
-        }
-    }*/
-    if (inputLength == outputIndex && excessivePos < 0 && transposedPos < 0
-            && (proximityMatchedCount > 0 || skipped)) {
-        const int ed = editDistance(editDistanceTable, proximityInfo->getPrimaryInputWord(),
-                inputLength, word, outputIndex + 1);
-        if (ed == 1) {
-            multiplyRate(160, &finalFreq);
-        }
-    }

-    // TODO: Promote properly?
-    //if (skipCount == 1 && excessivePos < 0 && transposedPos < 0 && inputLength == outputIndex
-    //        && !sameLength) {
-    //    multiplyRate(150, &finalFreq);
-    //}
-    //if (skipCount == 0 && excessivePos < 0 && transposedPos < 0 && inputLength == outputIndex
-    //        && !sameLength) {
-    //    multiplyRate(150, &finalFreq);
-    //}
-    //if (skipCount == 0 && excessivePos < 0 && transposedPos < 0
-    //        && inputLength == outputIndex + 1) {
-    //    multiplyRate(150, &finalFreq);
-    //}
+    ///////////////////////////////////////////////
+    // Promotion and Demotion for each correction

+    // Demotion for a word with missing character
    if (skipped) {
-        if (inputLength >= 2) {
-            const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE
-                    * (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)
-                    / (10 * inputLength
-                            - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);
-            if (DEBUG_DICT_FULL) {
-                LOGI("Demotion rate for missing character is %d.", demotionRate);
-            }
-            multiplyRate(demotionRate, &finalFreq);
-        } else {
-            finalFreq = 0;
+        const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE
+                * (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)
+                / (10 * inputLength
+                        - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);
+        if (DEBUG_DICT_FULL) {
+            LOGI("Demotion rate for missing character is %d.", demotionRate);
        }
+        multiplyRate(demotionRate, &finalFreq);
    }
+
+    // Demotion for a word with transposed character
    if (transposedPos >= 0) multiplyRate(
            WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
+
+    // Demotion for a word with excessive character
    if (excessivePos >= 0) {
        multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);
        if (!proximityInfo->existsAdjacentProximityChars(inputIndex)) {
@@ -478,52 +464,62 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
            multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);
        }
    }
-    int lengthFreq = typedLetterMultiplier;
-    multiplyIntCapped(powerIntCapped(typedLetterMultiplier, outputIndex), &lengthFreq);
-    if ((outputIndex + 1) == matchCount) {
-        // Full exact match
-        if (outputIndex > 1) {
-            if (DEBUG_DICT) {
-                LOGI("Found full matched word.");
-            }
-            multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
-        }
-        if (sameLength && transposedPos < 0 && !skipped && excessivePos < 0) {
-            finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
-        }
-    } else if (sameLength && transposedPos < 0 && !skipped && excessivePos < 0
-            && outputIndex > 0) {
+
+    // Promotion for a word with proximity characters
+    for (int i = 0; i < adJustedProximityMatchedCount; ++i) {
        // A word with proximity corrections
-        if (DEBUG_DICT) {
-            LOGI("Found one proximity correction.");
+        if (DEBUG_DICT_FULL) {
+            LOGI("Found a proximity correction.");
        }
        multiplyIntCapped(typedLetterMultiplier, &finalFreq);
        multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
    }
-    if (DEBUG_DICT_FULL) {
-        LOGI("calc: %d, %d", outputIndex, sameLength);
-    }
-    if (sameLength) multiplyIntCapped(fullWordMultiplier, &finalFreq);

-    // TODO: check excessive count and transposed count
+    const int errorCount = proximityMatchedCount + skipCount;
+    multiplyRate(
+            100 - CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE * errorCount / inputLength, &finalFreq);
+
+    // Promotion for an exactly matched word
+    if (matchCount == outputIndex + 1) {
+        // Full exact match
+        if (sameLength && transposedPos < 0 && !skipped && excessivePos < 0) {
+            finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
+        }
+    }
+
+    // Promote a word with no correction
+    if (proximityMatchedCount == 0 && transposedPos < 0 && !skipped && excessivePos < 0) {
+        multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
+    }
+
+    // TODO: Check excessive count and transposed count
+    // TODO: Remove this if possible
    /*
-     If the last character of the user input word is the same as the next character
-     of the output word, and also all of characters of the user input are matched
-     to the output word, we'll promote that word a bit because
-     that word can be considered the combination of skipped and matched characters.
-     This means that the 'sm' pattern wins over the 'ma' pattern.
-     e.g.)
-     shel -> shell [mmmma] or [mmmsm]
-     hel -> hello [mmmaa] or [mmsma]
-     m ... matching
-     s ... skipping
-     a ... traversing all
+         If the last character of the user input word is the same as the next character
+         of the output word, and also all of the characters of the user input are matched
+         to the output word, we'll promote that word a bit because
+         that word can be considered the combination of skipped and matched characters.
+         This means that the 'sm' pattern wins over the 'ma' pattern.
+         e.g.)
+         shel -> shell [mmmma] or [mmmsm]
+         hel -> hello [mmmaa] or [mmsma]
+         m ... matching
+         s ... skipping
+         a ... traversing all
     */
    if (matchCount == inputLength && matchCount >= 2 && !skipped
            && word[matchCount] == word[matchCount - 1]) {
        multiplyRate(WORDS_WITH_MATCH_SKIP_PROMOTION_RATE, &finalFreq);
    }

+    if (sameLength) {
+        multiplyIntCapped(fullWordMultiplier, &finalFreq);
+    }
+
+    if (DEBUG_DICT_FULL) {
+        LOGI("calc: %d, %d", outputIndex, sameLength);
+    }
+
    return finalFreq;
 }

--- a/native/src/correction.h
+++ b/native/src/correction.h
@@ -139,8 +139,7 @@ private:
    class RankingAlgorithm {
    public:
        static int calculateFinalFreq(const int inputIndex, const int depth,
-                const int freq, const bool sameLength, int *editDistanceTable,
-                const Correction* correction);
+                const int freq, int *editDistanceTable, const Correction* correction);
        static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
                const Correction* correction);
    };
--- a/native/src/defines.h
+++ b/native/src/defines.h
@@ -177,6 +177,8 @@ static void dumpWord(const unsigned short* word, const int length) {
 #define FULL_MATCHED_WORDS_PROMOTION_RATE 120
 #define WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE 90
 #define WORDS_WITH_MATCH_SKIP_PROMOTION_RATE 105
+#define WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE 160
+#define CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE 42

 // This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
 // This is only used for the size of array. Not to be used in c functions.
@@ -194,5 +196,6 @@ static void dumpWord(const unsigned short* word, const int length) {
 #define MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION 3

 #define min(a,b) ((a)<(b)?(a):(b))
+#define max(a,b) ((a)>(b)?(a):(b))

 #endif // LATINIME_DEFINES_H