From 07a8406bc184a354ea47fb6352e48df39e35310e Mon Sep 17 00:00:00 2001
From: Jean Chalard <jchalard@google.com>
Date: Thu, 3 Mar 2011 10:22:10 +0900
Subject: [PATCH] Demote skipped characters matched words with respect to
 length.

Words that matched user input with skipped characters used to be demoted
in BinaryDictionary by a constant factor and not at all in those dictionaries
implemented in java code. To represent the fact that the impact of a skipped
character gets larger as the word is shorter, this change will implement a
demotion that gets larger as the typed word is shorter. The demotion rate
is (n - 2) / (n - 1) where n is the length of the typed word for n >= 2.
It implements it for both BinaryDictionary and java dictionaries.

Bug: 3340731
Change-Id: I3a18be80a9708981d56a950dc25fe08f018b5b89
---
 .../latin/ExpandableDictionary.java           | 41 +++++++++++++++----
 native/src/defines.h                          |  2 +-
 native/src/unigram_dictionary.cpp             | 13 ++++--
 native/src/unigram_dictionary.h               |  7 ++--
 4 files changed, 47 insertions(+), 16 deletions(-)

diff --git a/java/src/com/android/inputmethod/latin/ExpandableDictionary.java b/java/src/com/android/inputmethod/latin/ExpandableDictionary.java
index b10e7a61e..f2305f18c 100644
--- a/java/src/com/android/inputmethod/latin/ExpandableDictionary.java
+++ b/java/src/com/android/inputmethod/latin/ExpandableDictionary.java
@@ -230,6 +230,16 @@ public class ExpandableDictionary extends Dictionary {
         return (node == null) ? -1 : node.mFrequency;
     }
 
+    private static int computeSkippedWordFinalFreq(int freq, int snr, int inputLength) {
+        // The computation itself makes sense for >= 2, but the == 2 case returns 0
+        // anyway so we may as well test against 3 instead and return the constant
+        if (inputLength >= 3) {
+            return (freq * snr * (inputLength - 2)) / (inputLength - 1);
+        } else {
+            return 0;
+        }
+    }
+
     /**
      * Recursively traverse the tree for words that match the input. Input consists of
      * a list of arrays. Each item in the list is one input character position. An input
@@ -243,13 +253,14 @@ public class ExpandableDictionary extends Dictionary {
      * @param completion whether the traversal is now in completion mode - meaning that we've
      * exhausted the input and we're looking for all possible suffixes.
      * @param snr current weight of the word being formed
-     * @param inputIndex position in the input characters. This can be off from the depth in 
+     * @param inputIndex position in the input characters. This can be off from the depth in
      * case we skip over some punctuations such as apostrophe in the traversal. That is, if you type
      * "wouldve", it could be matching "would've", so the depth will be one more than the
      * inputIndex
      * @param callback the callback class for adding a word
      */
-    protected void getWordsRec(NodeArray roots, final WordComposer codes, final char[] word, 
+    // TODO: Share this routine with the native code for BinaryDictionary
+    protected void getWordsRec(NodeArray roots, final WordComposer codes, final char[] word,
             final int depth, boolean completion, int snr, int inputIndex, int skipPos,
             WordCallback callback) {
         final int count = roots.mLength;
@@ -275,8 +286,14 @@ public class ExpandableDictionary extends Dictionary {
             if (completion) {
                 word[depth] = c;
                 if (terminal) {
-                    if (!callback.addWord(word, 0, depth + 1, freq * snr, mDicTypeId,
-                                DataType.UNIGRAM)) {
+                    final int finalFreq;
+                    if (skipPos < 0) {
+                        finalFreq = freq * snr;
+                    } else {
+                        finalFreq = computeSkippedWordFinalFreq(freq, snr, mInputLength);
+                    }
+                    if (!callback.addWord(word, 0, depth + 1, finalFreq, mDicTypeId,
+                            DataType.UNIGRAM)) {
                         return;
                     }
                 }
@@ -288,7 +305,7 @@ public class ExpandableDictionary extends Dictionary {
                 // Skip the ' and continue deeper
                 word[depth] = c;
                 if (children != null) {
-                    getWordsRec(children, codes, word, depth + 1, completion, snr, inputIndex, 
+                    getWordsRec(children, codes, word, depth + 1, completion, snr, inputIndex,
                             skipPos, callback);
                 }
             } else {
@@ -305,10 +322,16 @@ public class ExpandableDictionary extends Dictionary {
 
                         if (codeSize == inputIndex + 1) {
                             if (terminal) {
-                                if (INCLUDE_TYPED_WORD_IF_VALID 
+                                if (INCLUDE_TYPED_WORD_IF_VALID
                                         || !same(word, depth + 1, codes.getTypedWord())) {
-                                    int finalFreq = freq * snr * addedAttenuation;
-                                    if (skipPos < 0) finalFreq *= FULL_WORD_FREQ_MULTIPLIER;
+                                    final int finalFreq;
+                                    if (skipPos < 0) {
+                                        finalFreq = freq * snr * addedAttenuation
+                                                * FULL_WORD_FREQ_MULTIPLIER;
+                                    } else {
+                                        finalFreq = computeSkippedWordFinalFreq(freq,
+                                                snr * addedAttenuation, mInputLength);
+                                    }
                                     callback.addWord(word, 0, depth + 1, finalFreq, mDicTypeId,
                                             DataType.UNIGRAM);
                                 }
@@ -319,7 +342,7 @@ public class ExpandableDictionary extends Dictionary {
                                         skipPos, callback);
                             }
                         } else if (children != null) {
-                            getWordsRec(children, codes, word, depth + 1, 
+                            getWordsRec(children, codes, word, depth + 1,
                                     false, snr * addedAttenuation, inputIndex + 1,
                                     skipPos, callback);
                         }
diff --git a/native/src/defines.h b/native/src/defines.h
index ddd65c9fa..0d1f037e1 100644
--- a/native/src/defines.h
+++ b/native/src/defines.h
@@ -135,7 +135,7 @@ static void prof_out(void) {
 #define SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS true
 
 // The following "rate"s are used as a multiplier before dividing by 100, so they are in percent.
-#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 75
+#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 100
 #define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 80
 #define WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE 75
 #define WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE 75
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index 9aa36b064..17a87a708 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -457,10 +457,17 @@ static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(con
 }
 inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,
         const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos,
-        const int freq, const bool sameLength) {
+        const int freq, const bool sameLength) const {
     // TODO: Demote by edit distance
     int finalFreq = freq * matchWeight;
-    if (skipPos >= 0) multiplyRate(WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE, &finalFreq);
+    if (skipPos >= 0) {
+        if (mInputLength >= 3) {
+            multiplyRate(WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE *
+                    (mInputLength - 2) / (mInputLength - 1), &finalFreq);
+        } else {
+            finalFreq = 0;
+        }
+    }
     if (transposedPos >= 0) multiplyRate(
             WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
     if (excessivePos >= 0) {
@@ -514,7 +521,7 @@ inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,
 }
 
 inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex,
-        const int inputLength) {
+        const int inputLength) const {
     if (inputIndex < 0 || inputIndex >= inputLength) return false;
     const int currentChar = *getInputCharsAt(inputIndex);
     const int leftIndex = inputIndex - 1;
diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h
index a95984520..2912cb0b7 100644
--- a/native/src/unigram_dictionary.h
+++ b/native/src/unigram_dictionary.h
@@ -71,7 +71,8 @@ private:
             const int nextLettersSize);
     void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize);
     int calculateFinalFreq(const int inputIndex, const int depth, const int snr, const int skipPos,
-            const int excessivePos, const int transposedPos, const int freq, const bool sameLength);
+            const int excessivePos, const int transposedPos, const int freq,
+            const bool sameLength) const;
     void onTerminalWhenUserTypedLengthIsGreaterThanInputLength(unsigned short *word,
             const int inputIndex, const int depth, const int snr, int *nextLetters,
             const int nextLettersSize, const int skipPos, const int excessivePos,
@@ -95,8 +96,8 @@ private:
     bool processCurrentNodeForExactMatch(const int firstChildPos,
             const int startInputIndex, const int depth, unsigned short *word,
             int *newChildPosition, int *newCount, bool *newTerminal, int *newFreq, int *siblingPos);
-    bool existsAdjacentProximityChars(const int inputIndex, const int inputLength);
-    inline const int* getInputCharsAt(const int index) {
+    bool existsAdjacentProximityChars(const int inputIndex, const int inputLength) const;
+    inline const int* getInputCharsAt(const int index) const {
         return mInputCodes + (index * MAX_PROXIMITY_CHARS);
     }
     const unsigned char *DICT;