Support multi-word suggestions

result: I4d097612db2f2a93522 Change-Id: Iedbb24f431dac43e52b6dcce8cb610a75e0ca46e
2012-01-30 18:18:30 +09:00 · 2012-01-30 18:18:30 +09:00 · a85f4929cd
commit a85f4929cd
parent fd6a52c8d5
5 changed files with 125 additions and 64 deletions
--- a/native/src/correction.cpp
+++ b/native/src/correction.cpp
@ -159,10 +159,10 @@ void Correction::checkState() {
    }
 }

-int Correction::getFreqForSplitTwoWords(const int *freqArray, const int *wordLengthArray,
-        const bool isSpaceProximity, const unsigned short *word) {
-    return Correction::RankingAlgorithm::calcFreqForSplitTwoWords(freqArray, wordLengthArray, this,
-            isSpaceProximity, word);
+int Correction::getFreqForSplitMultipleWords(const int *freqArray, const int *wordLengthArray,
+        const int wordCount, const bool isSpaceProximity, const unsigned short *word) {
+    return Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(freqArray, wordLengthArray,
+            wordCount, this, isSpaceProximity, word);
 }

 int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLength) {
@ -911,45 +911,85 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
 }

 /* static */
-int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
-        const int *freqArray, const int *wordLengthArray, const Correction* correction,
-        const bool isSpaceProximity, const unsigned short *word) {
-    const int firstFreq = freqArray[0];
-    const int secondFreq = freqArray[1];
-    const int firstWordLength = wordLengthArray[0];
-    const int secondWordLength = wordLengthArray[1];
+int Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(
+        const int *freqArray, const int *wordLengthArray, const int wordCount,
+        const Correction* correction, const bool isSpaceProximity, const unsigned short *word) {
    const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;

    bool firstCapitalizedWordDemotion = false;
-    if (firstWordLength >= 2) {
-        firstCapitalizedWordDemotion = isUpperCase(word[0]);
+    bool secondCapitalizedWordDemotion = false;
+
+    {
+        // TODO: Handle multiple capitalized word demotion properly
+        const int firstWordLength = wordLengthArray[0];
+        const int secondWordLength = wordLengthArray[1];
+        if (firstWordLength >= 2) {
+            firstCapitalizedWordDemotion = isUpperCase(word[0]);
+        }
+
+        if (secondWordLength >= 2) {
+            // FIXME: word[firstWordLength + 1] is incorrect.
+            secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]);
+        }
    }

-    bool secondCapitalizedWordDemotion = false;
-    if (secondWordLength >= 2) {
-        secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]);
-    }

    const bool capitalizedWordDemotion =
            firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion;

-    if (firstWordLength == 0 || secondWordLength == 0) {
+    int totalLength = 0;
+    int totalFreq = 0;
+    for (int i = 0; i < wordCount; ++i){
+        const int wordLength = wordLengthArray[i];
+        if (wordLength <= 0) {
+            return 0;
+        }
+        totalLength += wordLength;
+        const int demotionRate = 100 - TWO_WORDS_CORRECTION_DEMOTION_BASE / (wordLength + 1);
+        int tempFirstFreq = freqArray[i];
+        multiplyRate(demotionRate, &tempFirstFreq);
+        totalFreq += tempFirstFreq;
+    }
+
+    if (totalLength <= 0 || totalFreq <= 0) {
        return 0;
    }
-    const int firstDemotionRate = 100 - TWO_WORDS_CORRECTION_DEMOTION_BASE / (firstWordLength + 1);
-    int tempFirstFreq = firstFreq;
-    multiplyRate(firstDemotionRate, &tempFirstFreq);
-
-    const int secondDemotionRate = 100
-            - TWO_WORDS_CORRECTION_DEMOTION_BASE / (secondWordLength + 1);
-    int tempSecondFreq = secondFreq;
-    multiplyRate(secondDemotionRate, &tempSecondFreq);
-
-    const int totalLength = firstWordLength + secondWordLength;

+    // TODO: Currently totalFreq is adjusted to two-word metrics.
    // Promote pairFreq with multiplying by 2, because the word length is the same as the typed
    // length.
-    int totalFreq = tempFirstFreq + tempSecondFreq;
+    totalFreq = totalFreq * 2 / wordCount;
+    if (wordCount > 2) {
+        // Safety net for 3+ words -- Caveats: many heuristics and workarounds here.
+        int oneLengthCounter = 0;
+        int twoLengthCounter = 0;
+        for (int i = 0; i < wordCount; ++i) {
+            const int wordLength = wordLengthArray[i];
+            // TODO: Use bigram instead of this safety net
+            if (i < wordCount - 1) {
+                const int nextWordLength = wordLengthArray[i + 1];
+                if (wordLength == 1 && nextWordLength == 2) {
+                    // Safety net to filter 1 length and 2 length sequential words
+                    return 0;
+                }
+            }
+            const int freq = freqArray[i];
+            // Demote too short weak words
+            if (wordLength <= 4 && freq <= MAX_FREQ * 2 / 3 /* heuristic... */) {
+                multiplyRate(100 * freq / MAX_FREQ, &totalFreq);
+            }
+            if (wordLength == 1) {
+                ++oneLengthCounter;
+            } else if (wordLength == 2) {
+                ++twoLengthCounter;
+            }
+            if (oneLengthCounter >= 2 || (oneLengthCounter + twoLengthCounter) >= 4) {
+                // Safety net to filter too many short words
+                return 0;
+            }
+        }
+        multiplyRate(MULTIPLE_WORDS_DEMOTION_RATE, &totalFreq);
+    }

    // This is a workaround to try offsetting the not-enough-demotion which will be done in
    // calcNormalizedScore in Utils.java.
@ -993,9 +1033,9 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
    }

    if (DEBUG_CORRECTION_FREQ) {
-        AKLOGI("Two words (%d, %d) (%d, %d) %d, %d", firstFreq, secondFreq, firstWordLength,
-                secondWordLength, capitalizedWordDemotion, totalFreq);
-        DUMP_WORD(word, firstWordLength);
+        AKLOGI("Multiple words (%d, %d) (%d, %d) %d, %d", freqArray[0], freqArray[1],
+                wordLengthArray[0], wordLengthArray[1], capitalizedWordDemotion, totalFreq);
+        DUMP_WORD(word, wordLengthArray[0]);
    }

    return totalFreq;
--- a/native/src/correction.h
+++ b/native/src/correction.h
@ -121,9 +121,9 @@ class Correction {

    bool needsToPrune() const;

-    int getFreqForSplitTwoWords(
-            const int *freqArray, const int *wordLengthArray, const bool isSpaceProximity,
-            const unsigned short *word);
+    int getFreqForSplitMultipleWords(
+            const int *freqArray, const int *wordLengthArray, const int wordCount,
+            const bool isSpaceProximity, const unsigned short *word);
    int getFinalFreq(const int freq, unsigned short **word, int* wordLength);
    int getFinalFreqForSubQueue(const int freq, unsigned short **word, int* wordLength,
            const int inputLength);
@ -151,8 +151,8 @@ class Correction {
        static int calculateFinalFreq(const int inputIndex, const int depth,
                const int freq, int *editDistanceTable, const Correction* correction,
                const int inputLength);
-        static int calcFreqForSplitTwoWords(const int *freqArray, const int *wordLengthArray,
-                const Correction* correction, const bool isSpaceProximity,
+        static int calcFreqForSplitMultipleWords(const int *freqArray, const int *wordLengthArray,
+                const int wordCount, const Correction* correction, const bool isSpaceProximity,
                const unsigned short *word);
        static double calcNormalizedScore(const unsigned short* before, const int beforeLength,
                const unsigned short* after, const int afterLength, const int score);
--- a/native/src/defines.h
+++ b/native/src/defines.h
@ -208,6 +208,7 @@ static void prof_out(void) {
 #define ZERO_DISTANCE_PROMOTION_RATE 110
 #define NEUTRAL_SCORE_SQUARED_RADIUS 8.0f
 #define HALF_SCORE_SQUARED_RADIUS 32.0f
+#define MAX_FREQ 255

 // This must be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
 // This is only used for the size of array. Not to be used in c functions.
@ -222,7 +223,9 @@ static void prof_out(void) {
 #define SUB_QUEUE_MAX_WORDS 1
 #define SUB_QUEUE_MAX_COUNT 10
 #define SUB_QUEUE_MIN_WORD_LENGTH 4
-#define MULTIPLE_WORDS_SUGGESTION_MAX_WORDS 2
+#define MULTIPLE_WORDS_SUGGESTION_MAX_WORDS 10
+#define MULTIPLE_WORDS_DEMOTION_RATE 80
+#define MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION 6

 #define TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD 0.39
 #define START_TWO_WORDS_CORRECTION_THRESHOLD 0.22
@ -230,7 +233,6 @@ static void prof_out(void) {
 #define MAX_DEPTH_MULTIPLIER 3

 #define FIRST_WORD_INDEX 0
-#define SECOND_WORD_INDEX 1

 // TODO: Reduce this constant if possible; check the maximum number of umlauts in the same German
 // word in the dictionary
@ -248,7 +250,7 @@ template<typename T> inline T max(T a, T b) { return a > b ? a : b; }
 #define NEUTRAL_AREA_RADIUS_RATIO 1.3f

 // DEBUG
-#define INPUTLENGTH_FOR_DEBUG 10
+#define INPUTLENGTH_FOR_DEBUG -1
 #define MIN_OUTPUT_INDEX_FOR_DEBUG -1

 #endif // LATINIME_DEFINES_H
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@ -224,7 +224,7 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
    // Multiple word suggestions
    if (SUGGEST_MULTIPLE_WORDS
            && inputLength >= MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION) {
-        getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
+        getSplitMultipleWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
                useFullEditDistance, inputLength, correction, queuePool,
                hasAutoCorrectionCandidate);
    }
@ -445,17 +445,18 @@ bool UnigramDictionary::getSubStringSuggestion(
        if (outputWordStartPos + nextWordLength >= MAX_WORD_LENGTH) {
            return false;
        }
-        outputWord[outputWordStartPos + tempOutputWordLength] = SPACE;
+        outputWord[tempOutputWordLength] = SPACE;
        if (outputWordLength) {
            ++*outputWordLength;
        }
    } else if (currentWordIndex >= 1) {
        // TODO: Handle 3 or more words
-        const int pairFreq = correction->getFreqForSplitTwoWords(
-                freqArray, wordLengthArray, isSpaceProximity, outputWord);
+        const int pairFreq = correction->getFreqForSplitMultipleWords(
+                freqArray, wordLengthArray, currentWordIndex + 1, isSpaceProximity, outputWord);
        if (DEBUG_DICT) {
-            AKLOGI("Split two words: %d, %d, %d, %d, (%d)", freqArray[0], freqArray[1], pairFreq,
-                    inputLength, wordLengthArray[0]);
+            DUMP_WORD(outputWord, tempOutputWordLength);
+            AKLOGI("Split two words: %d, %d, %d, %d, (%d) %d", freqArray[0], freqArray[1], pairFreq,
+                    inputLength, wordLengthArray[0], tempOutputWordLength);
        }
        addWord(outputWord, tempOutputWordLength, pairFreq, queuePool->getMasterQueue());
    }
@ -473,30 +474,46 @@ void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo,
        // Return if the last word index
        return;
    }
-    for (int i = 1; i < inputLength; ++i) {
-        int tempOutputWordLength = 0;
-        // First word
-        int inputWordStartPos = 0;
-        int inputWordLength = i;
+    if (startWordIndex >= 1
+            && (hasAutoCorrectionCandidate
+                    || inputLength < MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION)) {
+        // Skip 3+ words if an auto correction candidate exists or the input is too short
+        return;
+    }
+    for (int i = startInputPos + 1; i < inputLength; ++i) {
        if (DEBUG_CORRECTION_FREQ) {
-            AKLOGI("Two words, %d", inputWordLength);
+            AKLOGI("Multi words(%d), start in %d sep %d start out %d",
+                    startWordIndex, startInputPos, i, outputWordLength);
+            DUMP_WORD(outputWord, outputWordLength);
        }
+        int tempOutputWordLength = 0;
+        // Current word
+        int inputWordStartPos = startInputPos;
+        int inputWordLength = i - startInputPos;
        if (!getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
                useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
-                FIRST_WORD_INDEX, inputWordStartPos, inputWordLength, 0, true /* not used */,
-                freqArray, wordLengthArray, outputWord, &tempOutputWordLength)) {
+                startWordIndex, inputWordStartPos, inputWordLength, outputWordLength,
+                true /* not used */, freqArray, wordLengthArray, outputWord,
+                &tempOutputWordLength)) {
            continue;
        }

-        // Second word
+        if (DEBUG_CORRECTION_FREQ) {
+            AKLOGI("Do missing space correction");
+        }
+        // Next word
        // Missing space
        inputWordStartPos = i;
        inputWordLength = inputLength - i;
-        getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
+        if(!getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
                useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
-                SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, tempOutputWordLength,
-                false /* missing space */, freqArray, wordLengthArray, outputWord,
-                0);
+                startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength,
+                false /* missing space */, freqArray, wordLengthArray, outputWord, 0)) {
+            getMultiWordsSuggestionRec(proximityInfo, xcoordinates, ycoordinates, codes,
+                    useFullEditDistance, inputLength, correction, queuePool,
+                    hasAutoCorrectionCandidate, inputWordStartPos, startWordIndex + 1,
+                    tempOutputWordLength, freqArray, wordLengthArray, outputWord);
+        }

        // Mistyped space
        ++inputWordStartPos;
@ -512,15 +529,17 @@ void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo,
            continue;
        }

+        if (DEBUG_CORRECTION_FREQ) {
+            AKLOGI("Do mistyped space correction");
+        }
        getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
                useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
-                SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, tempOutputWordLength,
-                true /* mistyped space */, freqArray, wordLengthArray, outputWord,
-                0);
+                startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength,
+                true /* mistyped space */, freqArray, wordLengthArray, outputWord, 0);
    }
 }

-void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo,
+void UnigramDictionary::getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo,
        const int *xcoordinates, const int *ycoordinates, const int *codes,
        const bool useFullEditDistance, const int inputLength,
        Correction *correction, WordsPriorityQueuePool* queuePool,
--- a/native/src/unigram_dictionary.h
+++ b/native/src/unigram_dictionary.h
@ -101,7 +101,7 @@ class UnigramDictionary {
            const bool useFullEditDistance, const int inputLength, Correction *correction,
            WordsPriorityQueuePool* queuePool, const bool doAutoCompletion, const int maxErrors,
            const int currentWordIndex);
-    void getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo,
+    void getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo,
            const int *xcoordinates, const int *ycoordinates, const int *codes,
            const bool useFullEditDistance, const int inputLength,
            Correction *correction, WordsPriorityQueuePool* queuePool,