From 07a8406bc184a354ea47fb6352e48df39e35310e Mon Sep 17 00:00:00 2001 From: Jean Chalard Date: Thu, 3 Mar 2011 10:22:10 +0900 Subject: [PATCH] Demote skipped characters matched words with respect to length. Words that matched user input with skipped characters used to be demoted in BinaryDictionary by a constant factor and not at all in those dictionaries implemented in java code. To represent the fact that the impact of a skipped character gets larger as the word is shorter, this change will implement a demotion that gets larger as the typed word is shorter. The demotion rate is (n - 2) / (n - 1) where n is the length of the typed word for n >= 2. It implements it for both BinaryDictionary and java dictionaries. Bug: 3340731 Change-Id: I3a18be80a9708981d56a950dc25fe08f018b5b89 --- .../latin/ExpandableDictionary.java | 41 +++++++++++++++---- native/src/defines.h | 2 +- native/src/unigram_dictionary.cpp | 13 ++++-- native/src/unigram_dictionary.h | 7 ++-- 4 files changed, 47 insertions(+), 16 deletions(-) diff --git a/java/src/com/android/inputmethod/latin/ExpandableDictionary.java b/java/src/com/android/inputmethod/latin/ExpandableDictionary.java index b10e7a61e..f2305f18c 100644 --- a/java/src/com/android/inputmethod/latin/ExpandableDictionary.java +++ b/java/src/com/android/inputmethod/latin/ExpandableDictionary.java @@ -230,6 +230,16 @@ public class ExpandableDictionary extends Dictionary { return (node == null) ? -1 : node.mFrequency; } + private static int computeSkippedWordFinalFreq(int freq, int snr, int inputLength) { + // The computation itself makes sense for >= 2, but the == 2 case returns 0 + // anyway so we may as well test against 3 instead and return the constant + if (inputLength >= 3) { + return (freq * snr * (inputLength - 2)) / (inputLength - 1); + } else { + return 0; + } + } + /** * Recursively traverse the tree for words that match the input. Input consists of * a list of arrays. Each item in the list is one input character position. An input @@ -243,13 +253,14 @@ public class ExpandableDictionary extends Dictionary { * @param completion whether the traversal is now in completion mode - meaning that we've * exhausted the input and we're looking for all possible suffixes. * @param snr current weight of the word being formed - * @param inputIndex position in the input characters. This can be off from the depth in + * @param inputIndex position in the input characters. This can be off from the depth in * case we skip over some punctuations such as apostrophe in the traversal. That is, if you type * "wouldve", it could be matching "would've", so the depth will be one more than the * inputIndex * @param callback the callback class for adding a word */ - protected void getWordsRec(NodeArray roots, final WordComposer codes, final char[] word, + // TODO: Share this routine with the native code for BinaryDictionary + protected void getWordsRec(NodeArray roots, final WordComposer codes, final char[] word, final int depth, boolean completion, int snr, int inputIndex, int skipPos, WordCallback callback) { final int count = roots.mLength; @@ -275,8 +286,14 @@ public class ExpandableDictionary extends Dictionary { if (completion) { word[depth] = c; if (terminal) { - if (!callback.addWord(word, 0, depth + 1, freq * snr, mDicTypeId, - DataType.UNIGRAM)) { + final int finalFreq; + if (skipPos < 0) { + finalFreq = freq * snr; + } else { + finalFreq = computeSkippedWordFinalFreq(freq, snr, mInputLength); + } + if (!callback.addWord(word, 0, depth + 1, finalFreq, mDicTypeId, + DataType.UNIGRAM)) { return; } } @@ -288,7 +305,7 @@ public class ExpandableDictionary extends Dictionary { // Skip the ' and continue deeper word[depth] = c; if (children != null) { - getWordsRec(children, codes, word, depth + 1, completion, snr, inputIndex, + getWordsRec(children, codes, word, depth + 1, completion, snr, inputIndex, skipPos, callback); } } else { @@ -305,10 +322,16 @@ public class ExpandableDictionary extends Dictionary { if (codeSize == inputIndex + 1) { if (terminal) { - if (INCLUDE_TYPED_WORD_IF_VALID + if (INCLUDE_TYPED_WORD_IF_VALID || !same(word, depth + 1, codes.getTypedWord())) { - int finalFreq = freq * snr * addedAttenuation; - if (skipPos < 0) finalFreq *= FULL_WORD_FREQ_MULTIPLIER; + final int finalFreq; + if (skipPos < 0) { + finalFreq = freq * snr * addedAttenuation + * FULL_WORD_FREQ_MULTIPLIER; + } else { + finalFreq = computeSkippedWordFinalFreq(freq, + snr * addedAttenuation, mInputLength); + } callback.addWord(word, 0, depth + 1, finalFreq, mDicTypeId, DataType.UNIGRAM); } @@ -319,7 +342,7 @@ public class ExpandableDictionary extends Dictionary { skipPos, callback); } } else if (children != null) { - getWordsRec(children, codes, word, depth + 1, + getWordsRec(children, codes, word, depth + 1, false, snr * addedAttenuation, inputIndex + 1, skipPos, callback); } diff --git a/native/src/defines.h b/native/src/defines.h index ddd65c9fa..0d1f037e1 100644 --- a/native/src/defines.h +++ b/native/src/defines.h @@ -135,7 +135,7 @@ static void prof_out(void) { #define SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS true // The following "rate"s are used as a multiplier before dividing by 100, so they are in percent. -#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 75 +#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 100 #define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 80 #define WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE 75 #define WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE 75 diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index 9aa36b064..17a87a708 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -457,10 +457,17 @@ static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(con } inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth, const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos, - const int freq, const bool sameLength) { + const int freq, const bool sameLength) const { // TODO: Demote by edit distance int finalFreq = freq * matchWeight; - if (skipPos >= 0) multiplyRate(WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE, &finalFreq); + if (skipPos >= 0) { + if (mInputLength >= 3) { + multiplyRate(WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE * + (mInputLength - 2) / (mInputLength - 1), &finalFreq); + } else { + finalFreq = 0; + } + } if (transposedPos >= 0) multiplyRate( WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq); if (excessivePos >= 0) { @@ -514,7 +521,7 @@ inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c, } inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex, - const int inputLength) { + const int inputLength) const { if (inputIndex < 0 || inputIndex >= inputLength) return false; const int currentChar = *getInputCharsAt(inputIndex); const int leftIndex = inputIndex - 1; diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h index a95984520..2912cb0b7 100644 --- a/native/src/unigram_dictionary.h +++ b/native/src/unigram_dictionary.h @@ -71,7 +71,8 @@ private: const int nextLettersSize); void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize); int calculateFinalFreq(const int inputIndex, const int depth, const int snr, const int skipPos, - const int excessivePos, const int transposedPos, const int freq, const bool sameLength); + const int excessivePos, const int transposedPos, const int freq, + const bool sameLength) const; void onTerminalWhenUserTypedLengthIsGreaterThanInputLength(unsigned short *word, const int inputIndex, const int depth, const int snr, int *nextLetters, const int nextLettersSize, const int skipPos, const int excessivePos, @@ -95,8 +96,8 @@ private: bool processCurrentNodeForExactMatch(const int firstChildPos, const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition, int *newCount, bool *newTerminal, int *newFreq, int *siblingPos); - bool existsAdjacentProximityChars(const int inputIndex, const int inputLength); - inline const int* getInputCharsAt(const int index) { + bool existsAdjacentProximityChars(const int inputIndex, const int inputLength) const; + inline const int* getInputCharsAt(const int index) const { return mInputCodes + (index * MAX_PROXIMITY_CHARS); } const unsigned char *DICT;