From be0cf72253f15bff6abdeaa79f60a56f06ab7b86 Mon Sep 17 00:00:00 2001 From: satok Date: Thu, 12 Jan 2012 18:44:40 +0900 Subject: [PATCH] Move auto correction thresthold to the native code bug: 5858137 Change-Id: Ic4b6270c6e51ef4ed25a6a1d8ddd7fdfa70fd78d --- .../inputmethod/latin/AutoCorrection.java | 5 +- .../inputmethod/latin/BinaryDictionary.java | 14 +++ .../com/android/inputmethod/latin/Utils.java | 95 +------------------ .../AndroidSpellCheckerService.java | 9 +- ...oid_inputmethod_latin_BinaryDictionary.cpp | 29 +++++- native/src/correction.cpp | 82 +++++++++++++--- native/src/correction.h | 24 +++-- .../inputmethod/latin/EditDistanceTests.java | 18 ++-- 8 files changed, 148 insertions(+), 128 deletions(-) diff --git a/java/src/com/android/inputmethod/latin/AutoCorrection.java b/java/src/com/android/inputmethod/latin/AutoCorrection.java index cd066a3d1..e5eb15938 100644 --- a/java/src/com/android/inputmethod/latin/AutoCorrection.java +++ b/java/src/com/android/inputmethod/latin/AutoCorrection.java @@ -118,8 +118,9 @@ public class AutoCorrection { final int autoCorrectionSuggestionScore = sortedScores[0]; // TODO: when the normalized score of the first suggestion is nearly equals to // the normalized score of the second suggestion, behave less aggressive. - mNormalizedScore = Utils.calcNormalizedScore( - typedWord,autoCorrectionSuggestion, autoCorrectionSuggestionScore); + mNormalizedScore = BinaryDictionary.calcNormalizedScore( + typedWord.toString(), autoCorrectionSuggestion.toString(), + autoCorrectionSuggestionScore); if (DBG) { Log.d(TAG, "Normalized " + typedWord + "," + autoCorrectionSuggestion + "," + autoCorrectionSuggestionScore + ", " + mNormalizedScore diff --git a/java/src/com/android/inputmethod/latin/BinaryDictionary.java b/java/src/com/android/inputmethod/latin/BinaryDictionary.java index f0e56d346..3692ac179 100644 --- a/java/src/com/android/inputmethod/latin/BinaryDictionary.java +++ b/java/src/com/android/inputmethod/latin/BinaryDictionary.java @@ -118,6 +118,10 @@ public class BinaryDictionary extends Dictionary { private native int getBigramsNative(long dict, char[] prevWord, int prevWordLength, int[] inputCodes, int inputCodesLength, char[] outputChars, int[] scores, int maxWordLength, int maxBigrams, int maxAlternatives); + private static native double calcNormalizedScoreNative( + char[] before, int beforeLength, char[] after, int afterLength, int score); + private static native int editDistanceNative( + char[] before, int beforeLength, char[] after, int afterLength); private final void loadDictionary(String path, long startOffset, long length) { mNativeDict = openNative(path, startOffset, length, @@ -211,6 +215,16 @@ public class BinaryDictionary extends Dictionary { mFlags, outputChars, scores); } + public static double calcNormalizedScore(String before, String after, int score) { + return calcNormalizedScoreNative(before.toCharArray(), before.length(), + after.toCharArray(), after.length(), score); + } + + public static int editDistance(String before, String after) { + return editDistanceNative( + before.toCharArray(), before.length(), after.toCharArray(), after.length()); + } + @Override public boolean isValidWord(CharSequence word) { if (word == null) return false; diff --git a/java/src/com/android/inputmethod/latin/Utils.java b/java/src/com/android/inputmethod/latin/Utils.java index bfa51897e..38148438b 100644 --- a/java/src/com/android/inputmethod/latin/Utils.java +++ b/java/src/com/android/inputmethod/latin/Utils.java @@ -191,7 +191,8 @@ public class Utils { final int typedWordLength = typedWord.length(); final int maxEditDistanceOfNativeDictionary = (typedWordLength < 5 ? 2 : typedWordLength / 2) + 1; - final int distance = Utils.editDistance(typedWord, suggestionWord); + final int distance = BinaryDictionary.editDistance( + typedWord.toString(), suggestionWord.toString()); if (DBG) { Log.d(TAG, "Autocorrected edit distance = " + distance + ", " + maxEditDistanceOfNativeDictionary); @@ -323,49 +324,6 @@ public class Utils { } } - - /* Damerau-Levenshtein distance */ - public static int editDistance(CharSequence s, CharSequence t) { - if (s == null || t == null) { - throw new IllegalArgumentException("editDistance: Arguments should not be null."); - } - final int sl = s.length(); - final int tl = t.length(); - int[][] dp = new int [sl + 1][tl + 1]; - for (int i = 0; i <= sl; i++) { - dp[i][0] = i; - } - for (int j = 0; j <= tl; j++) { - dp[0][j] = j; - } - for (int i = 0; i < sl; ++i) { - for (int j = 0; j < tl; ++j) { - final char sc = Character.toLowerCase(s.charAt(i)); - final char tc = Character.toLowerCase(t.charAt(j)); - final int cost = sc == tc ? 0 : 1; - dp[i + 1][j + 1] = Math.min( - dp[i][j + 1] + 1, Math.min(dp[i + 1][j] + 1, dp[i][j] + cost)); - // Overwrite for transposition cases - if (i > 0 && j > 0 - && sc == Character.toLowerCase(t.charAt(j - 1)) - && tc == Character.toLowerCase(s.charAt(i - 1))) { - dp[i + 1][j + 1] = Math.min(dp[i + 1][j + 1], dp[i - 1][j - 1] + cost); - } - } - } - if (DBG_EDIT_DISTANCE) { - Log.d(TAG, "editDistance:" + s + "," + t); - for (int i = 0; i < dp.length; ++i) { - StringBuffer sb = new StringBuffer(); - for (int j = 0; j < dp[i].length; ++j) { - sb.append(dp[i][j]).append(','); - } - Log.d(TAG, i + ":" + sb.toString()); - } - } - return dp[sl][tl]; - } - // Get the current stack trace public static String getStackTrace() { StringBuilder sb = new StringBuilder(); @@ -379,55 +337,6 @@ public class Utils { return sb.toString(); } - // In dictionary.cpp, getSuggestion() method, - // suggestion scores are computed using the below formula. - // original score - // := pow(mTypedLetterMultiplier (this is defined 2), - // (the number of matched characters between typed word and suggested word)) - // * (individual word's score which defined in the unigram dictionary, - // and this score is defined in range [0, 255].) - // Then, the following processing is applied. - // - If the dictionary word is matched up to the point of the user entry - // (full match up to min(before.length(), after.length()) - // => Then multiply by FULL_MATCHED_WORDS_PROMOTION_RATE (this is defined 1.2) - // - If the word is a true full match except for differences in accents or - // capitalization, then treat it as if the score was 255. - // - If before.length() == after.length() - // => multiply by mFullWordMultiplier (this is defined 2)) - // So, maximum original score is pow(2, min(before.length(), after.length())) * 255 * 2 * 1.2 - // For historical reasons we ignore the 1.2 modifier (because the measure for a good - // autocorrection threshold was done at a time when it didn't exist). This doesn't change - // the result. - // So, we can normalize original score by dividing pow(2, min(b.l(),a.l())) * 255 * 2. - private static final int MAX_INITIAL_SCORE = 255; - private static final int TYPED_LETTER_MULTIPLIER = 2; - private static final int FULL_WORD_MULTIPLIER = 2; - private static final int S_INT_MAX = 2147483647; - public static double calcNormalizedScore(CharSequence before, CharSequence after, int score) { - final int beforeLength = before.length(); - final int afterLength = after.length(); - if (beforeLength == 0 || afterLength == 0) return 0; - final int distance = editDistance(before, after); - // If afterLength < beforeLength, the algorithm is suggesting a word by excessive character - // correction. - int spaceCount = 0; - for (int i = 0; i < afterLength; ++i) { - if (after.charAt(i) == Keyboard.CODE_SPACE) { - ++spaceCount; - } - } - if (spaceCount == afterLength) return 0; - final double maximumScore = score == S_INT_MAX ? S_INT_MAX : MAX_INITIAL_SCORE - * Math.pow( - TYPED_LETTER_MULTIPLIER, Math.min(beforeLength, afterLength - spaceCount)) - * FULL_WORD_MULTIPLIER; - // add a weight based on edit distance. - // distance <= max(afterLength, beforeLength) == afterLength, - // so, 0 <= distance / afterLength <= 1 - final double weight = 1.0 - (double) distance / afterLength; - return (score / maximumScore) * weight; - } - public static class UsabilityStudyLogUtils { private static final String USABILITY_TAG = UsabilityStudyLogUtils.class.getSimpleName(); private static final String FILENAME = "log.txt"; diff --git a/java/src/com/android/inputmethod/latin/spellcheck/AndroidSpellCheckerService.java b/java/src/com/android/inputmethod/latin/spellcheck/AndroidSpellCheckerService.java index 1ffee0de0..39e47f661 100644 --- a/java/src/com/android/inputmethod/latin/spellcheck/AndroidSpellCheckerService.java +++ b/java/src/com/android/inputmethod/latin/spellcheck/AndroidSpellCheckerService.java @@ -270,7 +270,7 @@ public class AndroidSpellCheckerService extends SpellCheckerService // make the threshold. final String wordString = new String(word, wordOffset, wordLength); final double normalizedScore = - Utils.calcNormalizedScore(mOriginalText, wordString, score); + BinaryDictionary.calcNormalizedScore(mOriginalText, wordString, score); if (normalizedScore < mSuggestionThreshold) { if (DBG) Log.i(TAG, wordString + " does not make the score threshold"); return true; @@ -303,8 +303,8 @@ public class AndroidSpellCheckerService extends SpellCheckerService hasRecommendedSuggestions = false; } else { gatheredSuggestions = EMPTY_STRING_ARRAY; - final double normalizedScore = - Utils.calcNormalizedScore(mOriginalText, mBestSuggestion, mBestScore); + final double normalizedScore = BinaryDictionary.calcNormalizedScore( + mOriginalText, mBestSuggestion, mBestScore); hasRecommendedSuggestions = (normalizedScore > mRecommendedThreshold); } } else { @@ -338,7 +338,8 @@ public class AndroidSpellCheckerService extends SpellCheckerService final int bestScore = mScores[mLength - 1]; final CharSequence bestSuggestion = mSuggestions.get(0); final double normalizedScore = - Utils.calcNormalizedScore(mOriginalText, bestSuggestion, bestScore); + BinaryDictionary.calcNormalizedScore( + mOriginalText, bestSuggestion.toString(), bestScore); hasRecommendedSuggestions = (normalizedScore > mRecommendedThreshold); if (DBG) { Log.i(TAG, "Best suggestion : " + bestSuggestion + ", score " + bestScore); diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp index 42d0e3207..71a893ca7 100644 --- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp +++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp @@ -18,6 +18,7 @@ #define LOG_TAG "LatinIME: jni: BinaryDictionary" #include "binary_format.h" +#include "correction.h" #include "com_android_inputmethod_latin_BinaryDictionary.h" #include "dictionary.h" #include "jni.h" @@ -188,6 +189,29 @@ static jboolean latinime_BinaryDictionary_isValidWord(JNIEnv *env, jobject objec return result; } +static jdouble latinime_BinaryDictionary_calcNormalizedScore(JNIEnv *env, jobject object, + jcharArray before, jint beforeLength, jcharArray after, jint afterLength, jint score) { + jchar *beforeChars = env->GetCharArrayElements(before, 0); + jchar *afterChars = env->GetCharArrayElements(after, 0); + jdouble result = Correction::RankingAlgorithm::calcNormalizedScore( + (unsigned short*)beforeChars, beforeLength, (unsigned short*)afterChars, afterLength, + score); + env->ReleaseCharArrayElements(before, beforeChars, JNI_ABORT); + env->ReleaseCharArrayElements(after, afterChars, JNI_ABORT); + return result; +} + +static jint latinime_BinaryDictionary_editDistance(JNIEnv *env, jobject object, + jcharArray before, jint beforeLength, jcharArray after, jint afterLength) { + jchar *beforeChars = env->GetCharArrayElements(before, 0); + jchar *afterChars = env->GetCharArrayElements(after, 0); + jint result = Correction::RankingAlgorithm::editDistance( + (unsigned short*)beforeChars, beforeLength, (unsigned short*)afterChars, afterLength); + env->ReleaseCharArrayElements(before, beforeChars, JNI_ABORT); + env->ReleaseCharArrayElements(after, afterChars, JNI_ABORT); + return result; +} + static void latinime_BinaryDictionary_close(JNIEnv *env, jobject object, jlong dict) { Dictionary *dictionary = (Dictionary*)dict; if (!dictionary) return; @@ -222,7 +246,10 @@ static JNINativeMethod sMethods[] = { {"closeNative", "(J)V", (void*)latinime_BinaryDictionary_close}, {"getSuggestionsNative", "(JJ[I[I[III[C[I)I", (void*)latinime_BinaryDictionary_getSuggestions}, {"isValidWordNative", "(J[CI)Z", (void*)latinime_BinaryDictionary_isValidWord}, - {"getBigramsNative", "(J[CI[II[C[IIII)I", (void*)latinime_BinaryDictionary_getBigrams} + {"getBigramsNative", "(J[CI[II[C[IIII)I", (void*)latinime_BinaryDictionary_getBigrams}, + {"calcNormalizedScoreNative", "([CI[CII)D", + (void*)latinime_BinaryDictionary_calcNormalizedScore}, + {"editDistanceNative", "([CI[CI)I", (void*)latinime_BinaryDictionary_editDistance} }; int register_BinaryDictionary(JNIEnv *env) { diff --git a/native/src/correction.cpp b/native/src/correction.cpp index 503fb7c53..dc31bfae7 100644 --- a/native/src/correction.cpp +++ b/native/src/correction.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -933,14 +934,14 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords( return totalFreq; } -#if 0 /* no longer used. keep just for reference */ -inline static int editDistance( - int* editDistanceTable, const unsigned short* input, - const int inputLength, const unsigned short* output, const int outputLength) { +/* Damerau-Levenshtein distance */ +inline static int editDistanceInternal( + int* editDistanceTable, const unsigned short* before, + const int beforeLength, const unsigned short* after, const int afterLength) { // dp[li][lo] dp[a][b] = dp[ a * lo + b] int* dp = editDistanceTable; - const int li = inputLength + 1; - const int lo = outputLength + 1; + const int li = beforeLength + 1; + const int lo = afterLength + 1; for (int i = 0; i < li; ++i) { dp[lo * i] = i; } @@ -950,13 +951,13 @@ inline static int editDistance( for (int i = 0; i < li - 1; ++i) { for (int j = 0; j < lo - 1; ++j) { - const uint32_t ci = toBaseLowerCase(input[i]); - const uint32_t co = toBaseLowerCase(output[j]); + const uint32_t ci = toBaseLowerCase(before[i]); + const uint32_t co = toBaseLowerCase(after[j]); const uint16_t cost = (ci == co) ? 0 : 1; dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1, min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost)); - if (i > 0 && j > 0 && ci == toBaseLowerCase(output[j - 1]) - && co == toBaseLowerCase(input[i - 1])) { + if (i > 0 && j > 0 && ci == toBaseLowerCase(after[j - 1]) + && co == toBaseLowerCase(before[i - 1])) { dp[(i + 1) * lo + (j + 1)] = min( dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost); } @@ -964,7 +965,7 @@ inline static int editDistance( } if (DEBUG_EDIT_DISTANCE) { - LOGI("IN = %d, OUT = %d", inputLength, outputLength); + LOGI("IN = %d, OUT = %d", beforeLength, afterLength); for (int i = 0; i < li; ++i) { for (int j = 0; j < lo; ++j) { LOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]); @@ -973,6 +974,63 @@ inline static int editDistance( } return dp[li * lo - 1]; } -#endif + +int Correction::RankingAlgorithm::editDistance(const unsigned short* before, + const int beforeLength, const unsigned short* after, const int afterLength) { + int table[(beforeLength + 1) * (afterLength + 1)]; + return editDistanceInternal(table, before, beforeLength, after, afterLength); +} + + +// In dictionary.cpp, getSuggestion() method, +// suggestion scores are computed using the below formula. +// original score +// := pow(mTypedLetterMultiplier (this is defined 2), +// (the number of matched characters between typed word and suggested word)) +// * (individual word's score which defined in the unigram dictionary, +// and this score is defined in range [0, 255].) +// Then, the following processing is applied. +// - If the dictionary word is matched up to the point of the user entry +// (full match up to min(before.length(), after.length()) +// => Then multiply by FULL_MATCHED_WORDS_PROMOTION_RATE (this is defined 1.2) +// - If the word is a true full match except for differences in accents or +// capitalization, then treat it as if the score was 255. +// - If before.length() == after.length() +// => multiply by mFullWordMultiplier (this is defined 2)) +// So, maximum original score is pow(2, min(before.length(), after.length())) * 255 * 2 * 1.2 +// For historical reasons we ignore the 1.2 modifier (because the measure for a good +// autocorrection threshold was done at a time when it didn't exist). This doesn't change +// the result. +// So, we can normalize original score by dividing pow(2, min(b.l(),a.l())) * 255 * 2. + +/* static */ +double Correction::RankingAlgorithm::calcNormalizedScore(const unsigned short* before, + const int beforeLength, const unsigned short* after, const int afterLength, + const int score) { + if (0 == beforeLength || 0 == afterLength) { + return 0; + } + const int distance = editDistance(before, beforeLength, after, afterLength); + int spaceCount = 0; + for (int i = 0; i < afterLength; ++i) { + if (after[i] == CODE_SPACE) { + ++spaceCount; + } + } + + if (spaceCount == afterLength) { + return 0; + } + + const double maxScore = score >= S_INT_MAX ? S_INT_MAX : MAX_INITIAL_SCORE + * pow((double)TYPED_LETTER_MULTIPLIER, + (double)min(beforeLength, afterLength - spaceCount)) * FULL_WORD_MULTIPLIER; + + // add a weight based on edit distance. + // distance <= max(afterLength, beforeLength) == afterLength, + // so, 0 <= distance / afterLength <= 1 + const double weight = 1.0 - (double) distance / afterLength; + return (score / maxScore) * weight; +} } // namespace latinime diff --git a/native/src/correction.h b/native/src/correction.h index 9ba472955..4012e7e82 100644 --- a/native/src/correction.h +++ b/native/src/correction.h @@ -95,6 +95,23 @@ class Correction { return mCorrectionStates[index].mParentIndex; } + class RankingAlgorithm { + public: + static int calculateFinalFreq(const int inputIndex, const int depth, + const int freq, int *editDistanceTable, const Correction* correction); + static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq, + const Correction* correction, const unsigned short *word); + static double calcNormalizedScore(const unsigned short* before, const int beforeLength, + const unsigned short* after, const int afterLength, const int score); + static int editDistance(const unsigned short* before, + const int beforeLength, const unsigned short* after, const int afterLength); + private: + static const int CODE_SPACE = ' '; + static const int MAX_INITIAL_SCORE = 255; + static const int TYPED_LETTER_MULTIPLIER = 2; + static const int FULL_WORD_MULTIPLIER = 2; + }; + private: inline void incrementInputIndex(); inline void incrementOutputIndex(); @@ -153,13 +170,6 @@ class Correction { bool mTransposing; bool mSkipping; - class RankingAlgorithm { - public: - static int calculateFinalFreq(const int inputIndex, const int depth, - const int freq, int *editDistanceTable, const Correction* correction); - static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq, - const Correction* correction, const unsigned short *word); - }; }; } // namespace latinime #endif // LATINIME_CORRECTION_H diff --git a/tests/src/com/android/inputmethod/latin/EditDistanceTests.java b/tests/src/com/android/inputmethod/latin/EditDistanceTests.java index 75bd04938..c053a49e5 100644 --- a/tests/src/com/android/inputmethod/latin/EditDistanceTests.java +++ b/tests/src/com/android/inputmethod/latin/EditDistanceTests.java @@ -37,7 +37,7 @@ public class EditDistanceTests extends AndroidTestCase { * sitting */ public void testExample1() { - final int dist = Utils.editDistance("kitten", "sitting"); + final int dist = BinaryDictionary.editDistance("kitten", "sitting"); assertEquals("edit distance between 'kitten' and 'sitting' is 3", 3, dist); } @@ -50,26 +50,26 @@ public class EditDistanceTests extends AndroidTestCase { * S--unday */ public void testExample2() { - final int dist = Utils.editDistance("Saturday", "Sunday"); + final int dist = BinaryDictionary.editDistance("Saturday", "Sunday"); assertEquals("edit distance between 'Saturday' and 'Sunday' is 3", 3, dist); } public void testBothEmpty() { - final int dist = Utils.editDistance("", ""); + final int dist = BinaryDictionary.editDistance("", ""); assertEquals("when both string are empty, no edits are needed", 0, dist); } public void testFirstArgIsEmpty() { - final int dist = Utils.editDistance("", "aaaa"); + final int dist = BinaryDictionary.editDistance("", "aaaa"); assertEquals("when only one string of the arguments is empty," + " the edit distance is the length of the other.", 4, dist); } public void testSecoondArgIsEmpty() { - final int dist = Utils.editDistance("aaaa", ""); + final int dist = BinaryDictionary.editDistance("aaaa", ""); assertEquals("when only one string of the arguments is empty," + " the edit distance is the length of the other.", 4, dist); @@ -78,27 +78,27 @@ public class EditDistanceTests extends AndroidTestCase { public void testSameStrings() { final String arg1 = "The quick brown fox jumps over the lazy dog."; final String arg2 = "The quick brown fox jumps over the lazy dog."; - final int dist = Utils.editDistance(arg1, arg2); + final int dist = BinaryDictionary.editDistance(arg1, arg2); assertEquals("when same strings are passed, distance equals 0.", 0, dist); } public void testSameReference() { final String arg = "The quick brown fox jumps over the lazy dog."; - final int dist = Utils.editDistance(arg, arg); + final int dist = BinaryDictionary.editDistance(arg, arg); assertEquals("when same string references are passed, the distance equals 0.", 0, dist); } public void testNullArg() { try { - Utils.editDistance(null, "aaa"); + BinaryDictionary.editDistance(null, "aaa"); fail("IllegalArgumentException should be thrown."); } catch (Exception e) { assertTrue(e instanceof IllegalArgumentException); } try { - Utils.editDistance("aaa", null); + BinaryDictionary.editDistance("aaa", null); fail("IllegalArgumentException should be thrown."); } catch (Exception e) { assertTrue(e instanceof IllegalArgumentException);