Promote full matches with differing accents.

Stop treating accented characters as different from their base character for proximity scoring. Also give a huge boost (basically overriding frequency) to a word that fully matches the typed input and differs only in accents.

Bug: 2550587
Change-Id: I2da7a71229fb3868d9e4a53703ccf8caeb6fcf10
2011-01-27 14:20:22 +09:00 · 2011-01-27 14:20:22 +09:00 · 8dc754a411
parent 588d2a525c
commit 8dc754a411
3 changed files with 39 additions and 21 deletions
--- a/native/src/defines.h
+++ b/native/src/defines.h
@ -129,6 +129,7 @@ static void prof_out(void) {
 #define SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER true
 #define SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS true

+// The following "rate"s are percentages: each is used as a multiplier before dividing by 100.
 #define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 75
 #define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 80
 #define WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE 75
@ -136,6 +137,9 @@ static void prof_out(void) {
 #define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 60
 #define FULL_MATCHED_WORDS_PROMOTION_RATE 120

+// Unlike the rates above, this is used as a bare multiplier (the result is not divided by 100).
+#define FULL_MATCH_ACCENTS_OR_CAPITALIZATION_DIFFER_MULTIPLIER 2
+
 // This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
 // This is only used for the size of array. Not to be used in c functions.
 #define MAX_WORD_LENGTH_INTERNAL 48
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@ -363,10 +363,15 @@ inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int
    }
    int lengthFreq = TYPED_LETTER_MULTIPLIER;
    for (int i = 0; i < depth; ++i) lengthFreq *= TYPED_LETTER_MULTIPLIER;
-    if (depth > 1 && lengthFreq == snr) {
+    if (lengthFreq == snr) {
+        if (depth > 1) {
            if (DEBUG_DICT) LOGI("Found full matched word.");
            multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
        }
+        if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {
+            finalFreq *= FULL_MATCH_ACCENTS_OR_CAPITALIZATION_DIFFER_MULTIPLIER;
+        }
+    }
    if (sameLength && skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;
    return finalFreq;
 }
@ -385,10 +390,9 @@ inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLe

 inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsSameAsInputLength(
        unsigned short *word, const int inputIndex, const int depth, const int snr,
-        const int skipPos, const int excessivePos, const int transposedPos, const int freq,
-        const int addedWeight) {
+        const int skipPos, const int excessivePos, const int transposedPos, const int freq) {
    if (sameAsTyped(word, depth + 1)) return;
-    const int finalFreq = calculateFinalFreq(inputIndex, depth, snr * addedWeight, skipPos,
+    const int finalFreq = calculateFinalFreq(inputIndex, depth, snr, skipPos,
            excessivePos, transposedPos, freq, true);
    // Proximity collection will promote a word of the same length as what user typed.
    if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);
@ -424,9 +428,9 @@ inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex
    return false;
 }

-inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,
-        const unsigned short c, const int skipPos, const int excessivePos,
-        const int transposedPos) {
+inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId(
+        const int *currentChars, const unsigned short c, const int skipPos,
+        const int excessivePos, const int transposedPos) {
    const unsigned short lowerC = toLowerCase(c);
    int j = 0;
    while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {
@ -434,18 +438,19 @@ inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,
        // If skipPos is defined, not to search proximity collections.
        // First char is what user  typed.
        if (matched) {
-            return j;
+            if (j > 0) return NEAR_PROXIMITY_CHAR;
+            return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
        } else if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) {
            // Not to check proximity characters
-            return -1;
+            return UNRELATED_CHAR;
        }
        ++j;
    }
-    return -1;
+    return UNRELATED_CHAR;
 }

 inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,
-        const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,
+        const int maxDepth, const bool traverseAllNodes, int snr, int inputIndex,
        const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
        int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
        bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,
@ -492,22 +497,24 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth

        int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,
                transposedPos);
-        if (matchedProximityCharId < 0) return false;
+        if (UNRELATED_CHAR == matchedProximityCharId) return false;
        mWord[depth] = c;
        // If inputIndex is greater than mInputLength, that means there is no
        // proximity chars. So, we don't need to check proximity.
-        const int addedWeight = matchedProximityCharId == 0 ? TYPED_LETTER_MULTIPLIER : 1;
+        if (SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
+            snr = snr * TYPED_LETTER_MULTIPLIER;
+        }
        bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
                || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
        if (isSameAsUserTypedLength && terminal) {
            onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, inputIndex, depth, snr,
-                    skipPos, excessivePos, transposedPos, freq, addedWeight);
+                    skipPos, excessivePos, transposedPos, freq);
        }
        if (!needsToTraverseChildrenNodes) return false;
        // Start traversing all nodes after the index exceeds the user typed length
        *newTraverseAllNodes = isSameAsUserTypedLength;
-        *newSnr = snr * addedWeight;
-        *newDiffs = diffs + ((matchedProximityCharId > 0) ? 1 : 0);
+        *newSnr = snr;
+        *newDiffs = diffs + ((NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
        *newInputIndex = inputIndex + 1;
    }
    // Optimization: Prune out words that are too long compared to how much was typed.
--- a/native/src/unigram_dictionary.h
+++ b/native/src/unigram_dictionary.h
@ -22,6 +22,13 @@
 namespace latinime {

 class UnigramDictionary {
+
+    typedef enum {                             // Result of getMatchedProximityId() char comparison
+        SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR,  // Same char, possibly with different case or accent
+        NEAR_PROXIMITY_CHAR,                   // A different char located nearby on the keyboard
+        UNRELATED_CHAR                         // No match, or proximity chars were not checked
+    } ProximityType;
+
 public:
    UnigramDictionary(const unsigned char *dict, int typedLetterMultipler, int fullWordMultiplier,
            int maxWordLength, int maxWords, int maxProximityChars, const bool isLatestDictVersion);
@ -60,11 +67,11 @@ private:
            const int transposedPos, const int freq);
    void onTerminalWhenUserTypedLengthIsSameAsInputLength(unsigned short *word,
            const int inputIndex, const int depth, const int snr, const int skipPos,
-            const int excessivePos, const int transposedPos, const int freq, const int addedWeight);
+            const int excessivePos, const int transposedPos, const int freq);
    bool needsToSkipCurrentNode(const unsigned short c,
            const int inputIndex, const int skipPos, const int depth);
-    int getMatchedProximityId(const int *currentChars, const unsigned short c, const int skipPos,
-            const int excessivePos, const int transposedPos);
+    ProximityType getMatchedProximityId(const int *currentChars, const unsigned short c,
+            const int skipPos, const int excessivePos, const int transposedPos);
    // Process a node by considering proximity, missing and excessive character
    bool processCurrentNode(const int pos, const int depth,
            const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,