Suggest excessive characters

bug: 3193883 Change-Id: Iea7a0fce7ce62d8779a7c7e4613d50db30d82b07
2010-12-08 16:04:16 +09:00 · 2010-12-08 16:04:16 +09:00 · cdbbea735f
commit cdbbea735f
parent e1c216a1a3
4 changed files with 65 additions and 67 deletions
--- a/java/src/com/android/inputmethod/latin/Suggest.java
+++ b/java/src/com/android/inputmethod/latin/Suggest.java
@ -34,6 +34,8 @@ import java.util.List;
 */
 public class Suggest implements Dictionary.WordCallback {

+    public static final String TAG = "Suggest";
+
    public static final int APPROX_MAX_WORD_LENGTH = 32;

    public static final int CORRECTION_NONE = 0;
@ -188,33 +190,6 @@ public class Suggest implements Dictionary.WordCallback {
        }
    }

-    private boolean haveSufficientCommonality(String original, CharSequence suggestion) {
-        final int originalLength = original.length();
-        final int suggestionLength = suggestion.length();
-        final int minLength = Math.min(originalLength, suggestionLength);
-        if (minLength <= 2) return true;
-        int matching = 0;
-        int lessMatching = 0; // Count matches if we skip one character
-        int i;
-        for (i = 0; i < minLength; i++) {
-            final char origChar = ExpandableDictionary.toLowerCase(original.charAt(i));
-            if (origChar == ExpandableDictionary.toLowerCase(suggestion.charAt(i))) {
-                matching++;
-                lessMatching++;
-            } else if (i + 1 < suggestionLength
-                    && origChar == ExpandableDictionary.toLowerCase(suggestion.charAt(i + 1))) {
-                lessMatching++;
-            }
-        }
-        matching = Math.max(matching, lessMatching);
-
-        if (minLength <= 4) {
-            return matching >= 2;
-        } else {
-            return matching > minLength / 2;
-        }
-    }
-
    /**
     * Returns a list of words that match the list of character codes passed in.
     * This list will be overwritten the next time this function is called.
@ -311,6 +286,11 @@ public class Suggest implements Dictionary.WordCallback {
                //       the normalized score of the second suggestion, behave less aggressive.
                final double normalizedScore = LatinIMEUtil.calcNormalizedScore(
                        mOriginalWord, mSuggestions.get(0), mPriorities[0]);
+                if (LatinImeLogger.sDBG) {
+                    Log.d(TAG, "Normalized " + mOriginalWord + "," + mSuggestions.get(0) + ","
+                            + mPriorities[0] + normalizedScore
+                            + "(" + mAutoCompleteThreshold + ")");
+                }
                if (normalizedScore >= mAutoCompleteThreshold) {
                    mHaveCorrection = true;
                }
@ -319,15 +299,6 @@ public class Suggest implements Dictionary.WordCallback {
        if (mOriginalWord != null) {
            mSuggestions.add(0, mOriginalWord.toString());
        }
-
-        // Check if the first suggestion has a minimum number of characters in common
-        if (wordComposer.size() > 1 && mSuggestions.size() > 1
-                && (mCorrectionMode == CORRECTION_FULL
-                || mCorrectionMode == CORRECTION_FULL_BIGRAM)) {
-            if (!haveSufficientCommonality(mLowerOriginalWord, mSuggestions.get(1))) {
-                mHaveCorrection = false;
-            }
-        }
        if (mAutoTextEnabled) {
            int i = 0;
            int max = 6;
--- a/native/src/defines.h
+++ b/native/src/defines.h
@ -23,10 +23,12 @@
 #ifndef LOG_TAG
 #define LOG_TAG "LatinIME: "
 #endif
-#define DEBUG_DICT 1
+#define DEBUG_DICT true
+#define DEBUG_SHOW_FOUND_WORD false
 #else // FLAG_DBG
 #define LOGI
-#define DEBUG_DICT 0
+#define DEBUG_DICT false
+#define DEBUG_SHOW_FOUND_WORD false
 #endif // FLAG_DBG

 // 22-bit address = ~4MB dictionary size limit, which on average would be about 200k-300k words
@ -48,7 +50,8 @@
 #define NOT_VALID_WORD -99

 #define SUGGEST_MISSING_CHARACTERS true
-#define SUGGEST_MISSING_CHARACTERS_THRESHOLD 5
+
+#define SUGGEST_EXCESSIVE_CHARACTERS true

 // This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
 // This is only used for the size of array. Not to be used in c functions.
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@ -46,19 +46,30 @@ int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short

    initSuggestions(codes, codesSize, outWords, frequencies);

-    int suggestedWordsCount = getSuggestionCandidates(codesSize, -1, nextLetters,
+    int suggestedWordsCount = getSuggestionCandidates(codesSize, -1, -1, nextLetters,
            nextLettersSize);

    // If there aren't sufficient suggestions, search for words by allowing wild cards at
    // the different character positions. This feature is not ready for prime-time as we need
    // to figure out the best ranking for such words compared to proximity corrections and
    // completions.
-    if (SUGGEST_MISSING_CHARACTERS && suggestedWordsCount < SUGGEST_MISSING_CHARACTERS_THRESHOLD) {
+    if (SUGGEST_MISSING_CHARACTERS) {
        for (int i = 0; i < codesSize; ++i) {
-            int tempCount = getSuggestionCandidates(codesSize, i, NULL, 0);
+            if (DEBUG_DICT) LOGI("--- Suggest missing characters %d", i);
+            const int tempCount = getSuggestionCandidates(codesSize, i, -1, NULL, 0);
+            if (tempCount > suggestedWordsCount) {
+                suggestedWordsCount = tempCount;
+            }
+        }
+    }
+
+    // Suggest excessive characters
+    if (SUGGEST_EXCESSIVE_CHARACTERS) {
+        for (int i = 0; i < codesSize; ++i) {
+            if (DEBUG_DICT) LOGI("--- Suggest excessive characters %d", i);
+            const int tempCount = getSuggestionCandidates(codesSize, -1, i, NULL, 0);
            if (tempCount > suggestedWordsCount) {
                suggestedWordsCount = tempCount;
-                break;
            }
        }
    }
@ -86,14 +97,14 @@ void UnigramDictionary::initSuggestions(int *codes, int codesSize, unsigned shor
    mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
 }

-int UnigramDictionary::getSuggestionCandidates(int inputLength, int skipPos,
+int UnigramDictionary::getSuggestionCandidates(int inputLength, int skipPos, int excessivePos,
        int *nextLetters, int nextLettersSize) {
    if (DEBUG_DICT) LOGI("getSuggestionCandidates");
    int initialPos = 0;
    if (IS_LATEST_DICT_VERSION) {
        initialPos = DICTIONARY_HEADER_SIZE;
    }
-    getWords(initialPos, inputLength, skipPos, nextLetters, nextLettersSize);
+    getWords(initialPos, inputLength, skipPos, excessivePos, nextLetters, nextLettersSize);

    // Get the word count
    int suggestedWordsCount = 0;
@ -115,7 +126,7 @@ bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency)
    if (DEBUG_DICT) {
        char s[length + 1];
        for (int i = 0; i <= length; i++) s[i] = word[i];
-        LOGI("Found word = %s, freq = %d : \n", s, frequency);
+        if (DEBUG_SHOW_FOUND_WORD) LOGI("Found word = %s, freq = %d :  \n", s, frequency);
    }
    if (length > MAX_WORD_LENGTH) {
        if (DEBUG_DICT) LOGI("Exceeded max word length.");
@ -132,6 +143,11 @@ bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency)
        insertAt++;
    }
    if (insertAt < MAX_WORDS) {
+        if (DEBUG_DICT) {
+            char s[length + 1];
+            for (int i = 0; i <= length; i++) s[i] = word[i];
+            LOGI("Added word = %s, freq = %d :  \n", s, frequency);
+        }
        memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),
               (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),
               (MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));
@ -181,16 +197,16 @@ static const char QUOTE = '\'';

 // Keep this for comparing spec to new getWords
 void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,
-        int *nextLetters, const int nextLettersSize) {
+        const int excessivePos, int *nextLetters, const int nextLettersSize) {
    int initialPosition = initialPos;
    const int count = Dictionary::getCount(DICT, &initialPosition);
    getWordsRec(count, initialPosition, 0,
            min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),
-            mInputLength <= 0, 1, 0, 0, skipPos, nextLetters, nextLettersSize);
+            mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, nextLetters, nextLettersSize);
 }

 void UnigramDictionary::getWords(const int rootPos, const int inputLength, const int skipPos,
-        int *nextLetters, const int nextLettersSize) {
+        const int excessivePos, int *nextLetters, const int nextLettersSize) {
    int rootPosition = rootPos;
    const int MAX_DEPTH = min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
    // Get the number of child of root, then increment the position
@ -216,9 +232,9 @@ void UnigramDictionary::getWords(const int rootPos, const int inputLength, const
            // depth will never be greater than MAX_DEPTH because in that case,
            // needsToTraverseChildrenNodes should be false
            const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth,
-                    MAX_DEPTH, traverseAllNodes, snr, inputIndex, diffs, skipPos, nextLetters,
-                    nextLettersSize, &childCount, &firstChildPos, &traverseAllNodes, &snr,
-                    &inputIndex, &diffs, &siblingPos);
+                    MAX_DEPTH, traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos,
+                    nextLetters, nextLettersSize, &childCount, &firstChildPos, &traverseAllNodes,
+                    &snr, &inputIndex, &diffs, &siblingPos);
            // Next sibling pos
            mStackSiblingPos[depth] = siblingPos;
            if (needsToTraverseChildrenNodes) {
@ -232,7 +248,7 @@ void UnigramDictionary::getWords(const int rootPos, const int inputLength, const
                mStackSiblingPos[depth] = firstChildPos;
            }
        } else {
-            // Goes to parent node
+            // Goes to parent sibling node
            --depth;
        }
    }
@ -241,7 +257,8 @@ void UnigramDictionary::getWords(const int rootPos, const int inputLength, const
 // snr : frequency?
 void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,
        const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,
-        const int diffs, const int skipPos, int *nextLetters, const int nextLettersSize) {
+        const int diffs, const int skipPos, const int excessivePos, int *nextLetters,
+        const int nextLettersSize) {
    int siblingPos = pos;
    for (int i = 0; i < childrenCount; ++i) {
        int newCount;
@ -253,14 +270,16 @@ void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, cons
        int newDiffs;
        int newSiblingPos;
        const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,
-                traverseAllNodes, snr, inputIndex, diffs, skipPos, nextLetters, nextLettersSize,
+                traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos, nextLetters,
+                nextLettersSize,
                &newCount, &newChildPosition, &newTraverseAllNodes, &newSnr,
                &newInputIndex, &newDiffs, &newSiblingPos);
        siblingPos = newSiblingPos;

        if (needsToTraverseChildrenNodes) {
            getWordsRec(newCount, newChildPosition, newDepth, maxDepth, newTraverseAllNodes,
-                    newSnr, newInputIndex, newDiffs, skipPos, nextLetters, nextLettersSize);
+                    newSnr, newInputIndex, newDiffs, skipPos, excessivePos, nextLetters,
+                    nextLettersSize);
        }
    }
 }
@ -312,14 +331,18 @@ inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,
 }

 inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,
-        const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,
-        const int diffs, const int skipPos, int *nextLetters, const int nextLettersSize,
-        int *newCount, int *newChildPosition, bool *newTraverseAllNodes,
+        const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,
+        const int diffs, const int skipPos, const int excessivePos, int *nextLetters,
+        const int nextLettersSize, int *newCount, int *newChildPosition, bool *newTraverseAllNodes,
        int *newSnr, int*newInputIndex, int *newDiffs, int *nextSiblingPosition) {
+    if (DEBUG_DICT) assert(skipPos < 0 || excessivePos < 0);
    unsigned short c;
    int childPosition;
    bool terminal;
    int freq;
+
+    if (excessivePos == depth) ++inputIndex;
+
    *nextSiblingPosition = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, pos, &c,
            &childPosition, &terminal, &freq);

--- a/native/src/unigram_dictionary.h
+++ b/native/src/unigram_dictionary.h
@ -31,7 +31,7 @@ public:

 private:
    void initSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies);
-    int getSuggestionCandidates(int inputLength, int skipPos, int *nextLetters,
+    int getSuggestionCandidates(int inputLength, int skipPos, int excessivePos, int *nextLetters,
            int nextLettersSize);
    void getVersionNumber();
    bool checkIfDictVersionIsLatest();
@ -43,12 +43,12 @@ private:
    unsigned short toLowerCase(unsigned short c);
    void getWordsRec(const int childrenCount, const int pos, const int depth, const int maxDepth,
            const bool traverseAllNodes, const int snr, const int inputIndex, const int diffs,
-            const int skipPos, int *nextLetters, const int nextLettersSize);
+            const int skipPos, const int excessivePos, int *nextLetters, const int nextLettersSize);
    void getWords(const int rootPos, const int inputLength, const int skipPos,
-            int *nextLetters, const int nextLettersSize);
+            const int excessivePos, int *nextLetters, const int nextLettersSize);
    // Keep getWordsOld for comparing performance between getWords and getWordsOld
    void getWordsOld(const int initialPos, const int inputLength, const int skipPos,
-            int *nextLetters, const int nextLettersSize);
+            const int excessivePos, int *nextLetters, const int nextLettersSize);
    void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize);
    void onTerminalWhenUserTypedLengthIsGreaterThanInputLength(unsigned short *word,
            const int mInputLength, const int depth, const int snr, int *nextLetters,
@ -59,10 +59,11 @@ private:
            const int inputIndex, const int skipPos, const int depth);
    int getMatchedProximityId(const int *currentChars, const unsigned short c, const int skipPos);
    bool processCurrentNode(const int pos, const int depth,
-            const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,
-            const int diffs, const int skipPos, int *nextLetters, const int nextLettersSize,
-            int *newCount, int *newChildPosition, bool *newTraverseAllNodes,
-            int *newSnr, int*newInputIndex, int *newDiffs, int *nextSiblingPosition);
+            const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,
+            const int diffs, const int skipPos, const int excessivePos, int *nextLetters,
+            const int nextLettersSize, int *newCount, int *newChildPosition,
+            bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,
+            int *nextSiblingPosition);
    const unsigned char *DICT;
    const int MAX_WORDS;
    const int MAX_WORD_LENGTH;