Force autocorrection of matching words with different accents.

When entering a word without accents the user expects the system to add accents automatically if there is no other matching word. This patch ensures the accented version is promoted accordingly and autocorrection really takes place. Issue: 3400015 Change-Id: I8cd3db5bf131ec6844b26abecc1ecbd1d6269df4
2011-02-18 17:50:58 +09:00 · 2011-02-18 17:50:58 +09:00 · a5d5849701
commit a5d5849701
parent 050c0462dc
4 changed files with 113 additions and 20 deletions
--- a/java/src/com/android/inputmethod/latin/Utils.java
+++ b/java/src/com/android/inputmethod/latin/Utils.java
@ -285,13 +285,22 @@ public class Utils {
    //         (the number of matched characters between typed word and suggested word))
    //     * (individual word's score which defined in the unigram dictionary,
    //         and this score is defined in range [0, 255].)
-    //     * (when before.length() == after.length(),
-    //         mFullWordMultiplier (this is defined 2))
-    // So, maximum original score is pow(2, before.length()) * 255 * 2
-    // So, we can normalize original score by dividing this value.
+    // Then, the following processing is applied.
+    //     - If the dictionary word is matched up to the point of the user entry
+    //       (full match up to min(before.length(), after.length()))
+    //       => Then multiply by FULL_MATCHED_WORDS_PROMOTION_RATE (this is defined as 1.2)
+    //     - If the word is a true full match except for differences in accents or
+    //       capitalization, then treat it as if the frequency was 255.
+    //     - If before.length() == after.length()
+    //       => multiply by mFullWordMultiplier (this is defined as 2)
+    // So, maximum original score is pow(2, min(before.length(), after.length())) * 255 * 2 * 1.2
+    // For historical reasons we ignore the 1.2 modifier (because the measure for a good
+    // autocorrection threshold was done at a time when it didn't exist). This doesn't change
+    // the result.
+    // So, we can normalize original score by dividing pow(2, min(b.l(),a.l())) * 255 * 2.
    private static final int MAX_INITIAL_SCORE = 255;
    private static final int TYPED_LETTER_MULTIPLIER = 2;
-    private static final int FULL_WORD_MULTIPLYER = 2;
+    private static final int FULL_WORD_MULTIPLIER = 2;
    public static double calcNormalizedScore(CharSequence before, CharSequence after, int score) {
        final int beforeLength = before.length();
        final int afterLength = after.length();
@ -301,7 +310,7 @@ public class Utils {
        // correction.
        final double maximumScore = MAX_INITIAL_SCORE
                * Math.pow(TYPED_LETTER_MULTIPLIER, Math.min(beforeLength, afterLength))
-                * FULL_WORD_MULTIPLYER;
+                * FULL_WORD_MULTIPLIER;
        // add a weight based on edit distance.
        // distance <= max(afterLength, beforeLength) == afterLength,
        // so, 0 <= distance / afterLength <= 1
--- a/native/src/debug.h
+++ b/native/src/debug.h
@ -0,0 +1,58 @@
+/*
+**
+** Copyright 2011, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#ifndef LATINIME_DEBUG_H
+#define LATINIME_DEBUG_H
+
+#include "defines.h"
+
+// Narrows a 16-bit string into `output` by keeping only the low byte of each
+// character. Copying stops at the first zero character, or once the character
+// at index `length` has been copied, whichever comes first. The result is
+// always zero-terminated, so `output` needs room for up to length + 2 bytes.
+// Returns `output`.
+static inline unsigned char* convertToUnibyteString(unsigned short* input, unsigned char* output,
+        const unsigned int length) {
+    unsigned int count = 0;
+    while (count <= length && input[count] != 0) {
+        output[count] = input[count] & 0xFF;
+        ++count;
+    }
+    output[count] = 0;
+    return output;
+}
+// Narrows a 16-bit string into `output` by keeping only the low byte of each
+// character, then overwrites the last copied character with `c`. Copying stops
+// at the first zero character, or once the character at index `length` has
+// been copied. The result is always zero-terminated, so `output` needs room
+// for up to length + 2 bytes. If `input` is empty nothing is replaced and
+// `output` becomes the empty string. Returns `output`.
+static inline unsigned char* convertToUnibyteStringAndReplaceLastChar(unsigned short* input,
+        unsigned char* output, const unsigned int length, unsigned char c) {
+    unsigned int i = 0;
+    for (; i <= length && input[i] != 0; ++i)
+        output[i] = input[i] & 0xFF;
+    // When nothing was copied there is no last character to replace, and
+    // output[i - 1] would be one byte before the start of the buffer.
+    if (i > 0) output[i - 1] = c;
+    output[i] = 0;
+    return output;
+}
+// Logs a 16-bit string through LOGI after narrowing it to single bytes with
+// convertToUnibyteString. At most the characters at indices 0..length are
+// logged.
+static inline void LOGI_S16(unsigned short* string, const unsigned int length) {
+    // convertToUnibyteString may copy length + 1 characters and then appends a
+    // terminator, so the scratch buffer must hold length + 2 bytes.
+    unsigned char tmp_buffer[length + 2];
+    convertToUnibyteString(string, tmp_buffer, length);
+    LOGI(">> %s", tmp_buffer);
+    // The log facility is throwing out log that comes too fast. The following
+    // is a dirty way of slowing down processing so that we can see all log.
+    // TODO : refactor this in a blocking log or something.
+    // usleep(10);
+}
+// Logs a 16-bit string through LOGI after narrowing it to single bytes and
+// replacing its last copied character with `c`
+// (see convertToUnibyteStringAndReplaceLastChar).
+static inline void LOGI_S16_PLUS(unsigned short* string, const unsigned int length,
+        unsigned char c) {
+    // The conversion may copy length + 1 characters and then appends a
+    // terminator, so the scratch buffer must hold length + 2 bytes.
+    unsigned char tmp_buffer[length + 2];
+    convertToUnibyteStringAndReplaceLastChar(string, tmp_buffer, length, c);
+    LOGI(">> %s", tmp_buffer);
+    // Likewise
+    // usleep(10);
+}
+
+#endif // LATINIME_DEBUG_H
--- a/native/src/defines.h
+++ b/native/src/defines.h
@ -100,6 +100,9 @@ static void prof_out(void) {
 #ifndef U_SHORT_MAX
 #define U_SHORT_MAX 1 << 16
 #endif
+#ifndef S_INT_MAX
+// Largest 32-bit signed int. Written as a literal because evaluating
+// ((1 << 31) - 1) in int arithmetic overflows.
+#define S_INT_MAX 2147483647
+#endif

 // Define this to use mmap() for dictionary loading.  Undefine to use malloc() instead of mmap().
 // We measured and compared performance of both, and found mmap() is fairly good in terms of
@ -137,9 +140,6 @@ static void prof_out(void) {
 #define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 60
 #define FULL_MATCHED_WORDS_PROMOTION_RATE 120

-// This is used as a bare multiplier (not subject to /100)
-#define FULL_MATCH_ACCENTS_OR_CAPITALIZATION_DIFFER_MULTIPLIER 2
-
 // This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
 // This is only used for the size of array. Not to be used in c functions.
 #define MAX_WORD_LENGTH_INTERNAL 48
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@ -347,6 +347,10 @@ void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, cons
    }
 }

+// (2^31 - 1) / 255. The numerator is written as a literal because evaluating
+// ((1 << 31) - 1) in int arithmetic overflows.
+static const int TWO_31ST_DIV_255 = 0x7FFFFFFF / 255;
+// Returns 255 * num, capped at S_INT_MAX when num is not below
+// TWO_31ST_DIV_255, so the multiplication cannot exceed the range of int for
+// non-negative num.
+static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {
+    return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);
+}
 inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,
        const int snr, const int skipPos, const int excessivePos, const int transposedPos,
        const int freq, const bool sameLength) {
@ -369,7 +373,7 @@ inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int
            multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
        }
        if (sameLength && transposedPos < 0 && skipPos < 0 && excessivePos < 0) {
-            finalFreq *= FULL_MATCH_ACCENTS_OR_CAPITALIZATION_DIFFER_MULTIPLIER;
+            finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
        }
    }
    if (sameLength && skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;
@ -428,24 +432,46 @@ inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex
    return false;
 }

+
+// In the following function, c is the current character of the dictionary word
+// currently examined.
+// currentChars is an array containing the keys close to the character the
+// user actually typed at the same position. We want to see if c is in it: if so,
+// then the word contains at that position a character close to what the user
+// typed.
+// What the user typed is actually the first character of the array.
+// Note: accented characters do not have a proximity list, so they are alone
+// in their list. The non-accented version of the character should be considered
+// "close", but the other keys close to the non-accented version should not.
 inline UnigramDictionary::ProximityType UnigramDictionary::getMatchedProximityId(
        const int *currentChars, const unsigned short c, const int skipPos,
        const int excessivePos, const int transposedPos) {
    const unsigned short lowerC = toLowerCase(c);
-    int j = 0;
+
+    // The first char in the array is what user typed. If it matches right away,
+    // that means the user typed that same char for this pos.
+    if (currentChars[0] == lowerC || currentChars[0] == c)
+        return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
+
+    // If one of those is true, we should not check for close characters at all.
+    if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0)
+        return UNRELATED_CHAR;
+
+    // If the non-accented, lowercased version of that first character matches c,
+    // then we have a non-accented version of the accented character the user
+    // typed. Treat it as a close char.
+    if (toLowerCase(currentChars[0]) == lowerC)
+        return NEAR_PROXIMITY_CHAR;
+
+    // Not an exact nor an accent-alike match: search the list of close keys
+    // NOTE(review): the loop condition reads currentChars[j] before testing
+    // j < MAX_PROXIMITY_CHARS - confirm the array is always zero-terminated or
+    // large enough for that read.
+    int j = 1;
    while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {
        const bool matched = (currentChars[j] == lowerC || currentChars[j] == c);
-        // If skipPos is defined, not to search proximity collections.
-        // First char is what user  typed.
-        if (matched) {
-            if (j > 0) return NEAR_PROXIMITY_CHAR;
-            return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
-        } else if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) {
-            // Not to check proximity characters
-            return UNRELATED_CHAR;
-        }
+        if (matched) return NEAR_PROXIMITY_CHAR;
        ++j;
    }
+
+    // Was not included, signal this as an unrelated character.
    return UNRELATED_CHAR;
 }