Suggest words with transposed chars

Bug: 3193883 Change-Id: I884b669258bfc522bc04e14f22a7646164a4cac5
2010-12-09 22:08:33 +09:00 · 2010-12-09 22:08:33 +09:00 · a3d78f606e
parent 3f7eac0d2c
commit a3d78f606e
3 changed files with 110 additions and 46 deletions
--- a/native/src/defines.h
+++ b/native/src/defines.h
@ -24,13 +24,17 @@
 #define LOG_TAG "LatinIME: "
 #endif
 #define DEBUG_DICT true
-#define DEBUG_SHOW_FOUND_WORD false
-#define DEBUG_NODE true
+#define DEBUG_DICT_FULL true
+#define DEBUG_SHOW_FOUND_WORD DEBUG_DICT_FULL
+#define DEBUG_NODE DEBUG_DICT_FULL
+#define DEBUG_TRACE DEBUG_DICT_FULL
 #else // FLAG_DBG
 #define LOGI
 #define DEBUG_DICT false
+#define DEBUG_DICT_FULL false
 #define DEBUG_SHOW_FOUND_WORD false
 #define DEBUG_NODE false
+#define DEBUG_TRACE false
 #endif // FLAG_DBG

 #ifndef U_SHORT_MAX
@ -58,6 +62,12 @@
 #define SUGGEST_WORDS_WITH_MISSING_CHARACTER true
 #define SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER true
 #define SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER true
+#define SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS true
+
+#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 75
+#define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 80
+#define WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE 75
+#define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 60

 // This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
 // This is only used for the size of array. Not to be used in c functions.
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@ -36,7 +36,7 @@ UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterM
    MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
    TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
    ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0) {
-    LOGI("UnigramDictionary - constructor");
+    if (DEBUG_DICT) LOGI("UnigramDictionary - constructor");
 }

 UnigramDictionary::~UnigramDictionary() {}
@ -45,26 +45,36 @@ int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short
        int *frequencies, int *nextLetters, int nextLettersSize)
 {
    initSuggestions(codes, codesSize, outWords, frequencies);
-    getSuggestionCandidates(codesSize, -1, -1, nextLetters, nextLettersSize);
+    const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
+    getSuggestionCandidates(codesSize, -1, -1, -1, nextLetters, nextLettersSize, MAX_DEPTH);

    // Suggestion with missing character
    if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {
        for (int i = 0; i < codesSize; ++i) {
            if (DEBUG_DICT) LOGI("--- Suggest missing characters %d", i);
-            getSuggestionCandidates(codesSize, i, -1, NULL, 0);
+            getSuggestionCandidates(codesSize, i, -1, -1, NULL, 0, MAX_DEPTH);
        }
    }

    // Suggestion with excessive character
-    if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER) {
+    if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER && mInputLength > MIN_SUGGEST_DEPTH) {
        for (int i = 0; i < codesSize; ++i) {
            if (existsAdjacentProximityChars(i, codesSize)) {
                if (DEBUG_DICT) LOGI("--- Suggest excessive characters %d", i);
-                getSuggestionCandidates(codesSize, -1, i, NULL, 0);
+                getSuggestionCandidates(codesSize, -1, i, -1, NULL, 0, MAX_DEPTH);
            }
        }
    }

+    // Suggestion with transposed characters
+    // Only suggest words that length is mInputLength
+    if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {
+        for (int i = 0; i < codesSize; ++i) {
+            if (DEBUG_DICT) LOGI("--- Suggest transposed characters %d", i);
+            getSuggestionCandidates(codesSize, -1, -1, i, NULL, 0, mInputLength - 1);
+        }
+    }
+
    // Suggestions with missing space
    if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER && mInputLength > MIN_SUGGEST_DEPTH) {
        for (int i = 1; i < codesSize; ++i) {
@ -187,10 +197,13 @@ static const char QUOTE = '\'';
 static const char SPACE = ' ';

 void UnigramDictionary::getSuggestionCandidates(const int inputLength, const int skipPos,
-        const int excessivePos, int *nextLetters, const int nextLettersSize) {
-    if (DEBUG_DICT) LOGI("getSuggestionCandidates");
+        const int excessivePos, const int transposedPos, int *nextLetters,
+        const int nextLettersSize, const int maxDepth) {
+    if (DEBUG_DICT) LOGI("getSuggestionCandidates %d", maxDepth);
+    if (DEBUG_DICT) assert(transposedPos + 1 < inputLength);
+    if (DEBUG_DICT) assert(excessivePos < inputLength);
+    if (DEBUG_DICT) assert(missingPos < inputLength);
    int rootPosition = ROOT_POS;
-    const int MAX_DEPTH = min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
    // Get the number of child of root, then increment the position
    int childCount = Dictionary::getCount(DICT, &rootPosition);
    int depth = 0;
@ -212,12 +225,12 @@ void UnigramDictionary::getSuggestionCandidates(const int inputLength, const int
            int diffs = mStackDiffs[depth];
            int siblingPos = mStackSiblingPos[depth];
            int firstChildPos;
-            // depth will never be greater than MAX_DEPTH because in that case,
+            // depth will never be greater than maxDepth because in that case,
            // needsToTraverseChildrenNodes should be false
            const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth,
-                    MAX_DEPTH, traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos,
-                    nextLetters, nextLettersSize, &childCount, &firstChildPos, &traverseAllNodes,
-                    &snr, &inputIndex, &diffs, &siblingPos);
+                    maxDepth, traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos,
+                    transposedPos, nextLetters, nextLettersSize, &childCount, &firstChildPos,
+                    &traverseAllNodes, &snr, &inputIndex, &diffs, &siblingPos);
            // Update next sibling pos
            mStackSiblingPos[depth] = siblingPos;
            if (needsToTraverseChildrenNodes) {
@ -252,7 +265,7 @@ bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int mi
    }

    const int secondFreq = getBestWordFreq(missingSpacePos, inputLength - missingSpacePos, mWord);
-    if (DEBUG_DICT) LOGI("Second freq:  %d", secondFreq);
+    if (DEBUG_DICT) LOGI("Second  freq:  %d", secondFreq);
    if (secondFreq <= 0) return false;

    word[missingSpacePos] = SPACE;
@ -262,24 +275,27 @@ bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int mi

    int pairFreq = ((firstFreq + secondFreq) / 2);
    for (int i = 0; i < inputLength; ++i) pairFreq *= TYPED_LETTER_MULTIPLIER;
+    pairFreq = pairFreq * WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE / 100;
    addWord(word, newWordLength, pairFreq);
    return true;
 }

 // Keep this for comparing spec to new getWords
 void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,
-        const int excessivePos, int *nextLetters, const int nextLettersSize) {
+        const int excessivePos, const int transposedPos,int *nextLetters,
+        const int nextLettersSize) {
    int initialPosition = initialPos;
    const int count = Dictionary::getCount(DICT, &initialPosition);
    getWordsRec(count, initialPosition, 0,
            min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),
-            mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, nextLetters, nextLettersSize);
+            mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,
+            nextLettersSize);
 }

 void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,
        const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,
-        const int diffs, const int skipPos, const int excessivePos, int *nextLetters,
-        const int nextLettersSize) {
+        const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
+        int *nextLetters, const int nextLettersSize) {
    int siblingPos = pos;
    for (int i = 0; i < childrenCount; ++i) {
        int newCount;
@ -291,34 +307,50 @@ void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, cons
        int newDiffs;
        int newSiblingPos;
        const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,
-                traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos, nextLetters,
-                nextLettersSize,
+                traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos, transposedPos,
+                nextLetters, nextLettersSize,
                &newCount, &newChildPosition, &newTraverseAllNodes, &newSnr,
                &newInputIndex, &newDiffs, &newSiblingPos);
        siblingPos = newSiblingPos;

        if (needsToTraverseChildrenNodes) {
            getWordsRec(newCount, newChildPosition, newDepth, maxDepth, newTraverseAllNodes,
-                    newSnr, newInputIndex, newDiffs, skipPos, excessivePos, nextLetters,
-                    nextLettersSize);
+                    newSnr, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,
+                    nextLetters, nextLettersSize);
        }
    }
 }

 inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLength(
        unsigned short *word, const int inputLength, const int depth, const int snr,
-        int *nextLetters, const int nextLettersSize, const int skipPos, const int freq) {
-    if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, freq * snr);
+        int *nextLetters, const int nextLettersSize, const int skipPos, const int excessivePos,
+        const int transposedPos, const int freq) {
+    int finalFreq = freq * snr;
+    // TODO: Demote by edit distance
+    if (skipPos >= 0) finalFreq = finalFreq * WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE / 100;
+    if (excessivePos >= 0) finalFreq = finalFreq
+            * WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE / 100;
+    if (transposedPos >= 0) finalFreq = finalFreq
+            * WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE / 100;
+
+    if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);
    if (depth >= inputLength && skipPos < 0) {
        registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);
    }
 }

 inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsSameAsInputLength(
-        unsigned short *word, const int depth, const int snr, const int skipPos, const int freq,
-        const int addedWeight) {
+        unsigned short *word, const int depth, const int snr, const int skipPos,
+        const int excessivePos, const int transposedPos, const int freq, const int addedWeight) {
    if (!sameAsTyped(word, depth + 1)) {
        int finalFreq = freq * snr * addedWeight;
+        // TODO: Demote by edit distance
+        if (skipPos >= 0) finalFreq = finalFreq * WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE / 100;
+        if (excessivePos >= 0) finalFreq = finalFreq
+                * WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE / 100;
+        if (transposedPos >= 0) finalFreq = finalFreq
+                * WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE / 100;
+
        // Proximity collection will promote a word of the same length as
        // what user typed.
        if (skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;
@ -357,16 +389,18 @@ inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex
 }

 inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,
-        const unsigned short c, const int skipPos) {
+        const unsigned short c, const int skipPos, const int excessivePos,
+        const int transposedPos) {
    const unsigned short lowerC = toLowerCase(c);
    int j = 0;
    while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {
        const bool matched = (currentChars[j] == lowerC || currentChars[j] == c);
        // If skipPos is defined, not to search proximity collections.
-        // First char is what user typed.
+        // First char is what user  typed.
        if (matched) {
            return j;
-        } else if (skipPos >= 0) {
+        } else if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) {
+            // Not to check proximity characters
            return -1;
        }
        ++j;
@ -376,10 +410,17 @@ inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,

 inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,
        const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,
-        const int diffs, const int skipPos, const int excessivePos, int *nextLetters,
-        const int nextLettersSize, int *newCount, int *newChildPosition, bool *newTraverseAllNodes,
-        int *newSnr, int*newInputIndex, int *newDiffs, int *nextSiblingPosition) {
-    if (DEBUG_DICT) assert(skipPos < 0 || excessivePos < 0);
+        const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
+        int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
+        bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,
+        int *nextSiblingPosition) {
+    if (DEBUG_DICT) {
+        int inputCount = 0;
+        if (skipPos >= 0) ++inputCount;
+        if (excessivePos >= 0) ++inputCount;
+        if (transposedPos >= 0) ++inputCount;
+        assert(inputCount <= 1);
+    }
    unsigned short c;
    int childPosition;
    bool terminal;
@ -397,7 +438,7 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
        mWord[depth] = c;
        if (traverseAllNodes && terminal) {
            onTerminalWhenUserTypedLengthIsGreaterThanInputLength(mWord, mInputLength, depth,
-                    snr, nextLetters, nextLettersSize, skipPos, freq);
+                    snr, nextLetters, nextLettersSize, skipPos, excessivePos, transposedPos, freq);
        }
        if (!needsToTraverseChildrenNodes) return false;
        *newTraverseAllNodes = traverseAllNodes;
@ -406,7 +447,14 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
        *newInputIndex = inputIndex;
    } else {
        int *currentChars = mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS);
-        int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos);
+
+        if (transposedPos >= 0) {
+            if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;
+            if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;
+        }
+
+        int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,
+                transposedPos);
        if (matchedProximityCharId < 0) return false;
        mWord[depth] = c;
        // If inputIndex is greater than mInputLength, that means there is no
@ -415,13 +463,13 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
        const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1;
        if (isSameAsUserTypedLength && terminal) {
            onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, depth, snr,
-                    skipPos, freq, addedWeight);
+                    skipPos, excessivePos, transposedPos, freq, addedWeight);
        }
        if (!needsToTraverseChildrenNodes) return false;
        // Start traversing all nodes after the index exceeds the user typed length
        *newTraverseAllNodes = isSameAsUserTypedLength;
        *newSnr = snr * addedWeight;
-        *newDiffs = diffs + (matchedProximityCharId > 0);
+        *newDiffs = diffs + ((matchedProximityCharId > 0) ? 1 : 0);
        *newInputIndex = inputIndex + 1;
    }
    // Optimization: Prune out words that are too long compared to how much was typed.
--- a/native/src/unigram_dictionary.h
+++ b/native/src/unigram_dictionary.h
@ -32,7 +32,8 @@ public:
 private:
    void initSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies);
    void getSuggestionCandidates(const int inputLength, const int skipPos, const int excessivePos,
-            int *nextLetters, const int nextLettersSize);
+            const int transposedPos, int *nextLetters, const int nextLettersSize,
+            const int maxDepth);
    void getVersionNumber();
    bool checkIfDictVersionIsLatest();
    int getAddress(int *pos);
@ -43,25 +44,30 @@ private:
    unsigned short toLowerCase(unsigned short c);
    void getWordsRec(const int childrenCount, const int pos, const int depth, const int maxDepth,
            const bool traverseAllNodes, const int snr, const int inputIndex, const int diffs,
-            const int skipPos, const int excessivePos, int *nextLetters, const int nextLettersSize);
+            const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters,
+            const int nextLettersSize);
    bool getMissingSpaceWords(const int inputLength, const int missingSpacePos);
    // Keep getWordsOld for comparing performance between getWords and getWordsOld
    void getWordsOld(const int initialPos, const int inputLength, const int skipPos,
-            const int excessivePos, int *nextLetters, const int nextLettersSize);
+            const int excessivePos, const int transposedPos, int *nextLetters,
+            const int nextLettersSize);
    void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize);
    void onTerminalWhenUserTypedLengthIsGreaterThanInputLength(unsigned short *word,
            const int mInputLength, const int depth, const int snr, int *nextLetters,
-            const int nextLettersSize, const int skipPos, const int freq);
+            const int nextLettersSize, const int skipPos, const int excessivePos,
+            const int transposedPos, const int freq);
    void onTerminalWhenUserTypedLengthIsSameAsInputLength(unsigned short *word, const int depth,
-            const int snr, const int skipPos, const int freq, const int addedWeight);
+            const int snr, const int skipPos, const int excessivePos, const int transposedPos,
+            const int freq, const int addedWeight);
    bool needsToSkipCurrentNode(const unsigned short c,
            const int inputIndex, const int skipPos, const int depth);
-    int getMatchedProximityId(const int *currentChars, const unsigned short c, const int skipPos);
+    int getMatchedProximityId(const int *currentChars, const unsigned short c, const int skipPos,
+            const int excessivePos, const int transposedPos);
    // Process a node by considering proximity, missing and excessive character
    bool processCurrentNode(const int pos, const int depth,
            const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,
-            const int diffs, const int skipPos, const int excessivePos, int *nextLetters,
-            const int nextLettersSize, int *newCount, int *newChildPosition,
+            const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
+            int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
            bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,
            int *nextSiblingPosition);
    int getBestWordFreq(const int startInputIndex, const int inputLength, unsigned short *word);