From 29dc80614bc529ca2c0b96e1a731ebb7a5433090 Mon Sep 17 00:00:00 2001
From: satok <satok@google.com>
Date: Tue, 17 Jan 2012 15:59:15 +0900
Subject: [PATCH] Prepare for advanced two words error correction

Change-Id: I4c8a21f0f6e349ddafd9b402583321a60855cfe8
---
 native/src/correction.cpp         | 96 ++++++++++++++++++++++++++++++-
 native/src/correction.h           |  2 +
 native/src/defines.h              |  6 +-
 native/src/unigram_dictionary.cpp | 92 ++++++++++++++++++++++++++++-
 native/src/unigram_dictionary.h   |  4 ++
 native/src/words_priority_queue.h |  5 +-
 6 files changed, 198 insertions(+), 7 deletions(-)

diff --git a/native/src/correction.cpp b/native/src/correction.cpp
index 5dc6f8737..6a129d4e3 100644
--- a/native/src/correction.cpp
+++ b/native/src/correction.cpp
@@ -83,7 +83,7 @@ inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigne
 
 inline static int getCurrentEditDistance(
         int *editDistanceTable, const int inputLength, const int outputLength) {
-    if (DEBUG_DICT) {
+    if (DEBUG_EDIT_DISTANCE) {
         AKLOGI("getCurrentEditDistance %d, %d", inputLength, outputLength);
     }
     return editDistanceTable[(inputLength + 1) * (outputLength + 1) - 1];
@@ -935,6 +935,100 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
     return totalFreq;
 }
 
+/* static */ // Combines firstFreq/secondFreq into one score for a two-word split candidate.
+int Correction::RankingAlgorithm::calcFreqForSplitTwoWordsOld(
+        const int firstFreq, const int secondFreq, const Correction* correction,
+        const unsigned short *word) {
+    const int spaceProximityPos = correction->mSpaceProximityPos;
+    const int missingSpacePos = correction->mMissingSpacePos;
+    if (DEBUG_DICT) {
+        int inputCount = 0;
+        if (spaceProximityPos >= 0) ++inputCount;
+        if (missingSpacePos >= 0) ++inputCount;
+        assert(inputCount <= 1);  // at most one of the two positions may be set
+    }
+    const bool isSpaceProximity = spaceProximityPos >= 0;
+    const int inputLength = correction->mInputLength;
+    const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
+    const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1)
+            : (inputLength - missingSpacePos);
+    const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
+
+    bool firstCapitalizedWordDemotion = false;
+    if (firstWordLength >= 2) {
+        firstCapitalizedWordDemotion = isUpperCase(word[0]);
+    }
+
+    bool secondCapitalizedWordDemotion = false;
+    if (secondWordLength >= 2) {
+        secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]);
+    }
+    // Demote only when exactly one of the two flags above is set (XOR).
+    const bool capitalizedWordDemotion =
+            firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion;
+
+    if (DEBUG_DICT_FULL) {
+        AKLOGI("Two words: %c, %c, %d",
+                word[0], word[firstWordLength + 1], capitalizedWordDemotion);
+    }
+    // A split that leaves either side empty is not a candidate; give it a zero score.
+    if (firstWordLength == 0 || secondWordLength == 0) {
+        return 0;
+    }
+    const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);
+    int tempFirstFreq = firstFreq;
+    multiplyRate(firstDemotionRate, &tempFirstFreq);
+
+    const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);
+    int tempSecondFreq = secondFreq;
+    multiplyRate(secondDemotionRate, &tempSecondFreq);
+
+    const int totalLength = firstWordLength + secondWordLength;
+
+    // Base score is the plain sum of the two demoted frequencies; no additional multiplier is
+    // applied at this step.
+    int totalFreq = tempFirstFreq + tempSecondFreq;
+
+    // This is a workaround to try offsetting the not-enough-demotion which will be done in
+    // calcNormalizedScore in Utils.java.
+    // In calcNormalizedScore the score will be demoted by (1 - 1 / length)
+    // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by
+    // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))
+    const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);
+    multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);
+
+    // At this moment, totalFreq is calculated by the following formula:
+    // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))
+    //        * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))
+
+    multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);
+
+    // This is another workaround to offset the demotion which will be done in
+    // calcNormalizedScore in Utils.java.
+    // In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote
+    // the same amount because we already have adjusted the synthetic freq of this "missing or
+    // mistyped space" suggestion candidate above in this method.
+    const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);
+    multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);
+
+    if (isSpaceProximity) {
+        // A word pair with one space proximity correction
+        if (DEBUG_DICT) {
+            AKLOGI("Found a word pair with space proximity correction.");
+        }
+        multiplyIntCapped(typedLetterMultiplier, &totalFreq);
+        multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);
+    }
+    // This demotion is applied to both kinds of split, not only to the missing space case.
+    multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);
+
+    if (capitalizedWordDemotion) {
+        multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq);
+    }
+
+    return totalFreq;
+}
+
 /* Damerau-Levenshtein distance */
 inline static int editDistanceInternal(
         int* editDistanceTable, const unsigned short* before,
diff --git a/native/src/correction.h b/native/src/correction.h
index a0fd55fd9..22a424f5c 100644
--- a/native/src/correction.h
+++ b/native/src/correction.h
@@ -100,6 +100,8 @@ class Correction {
                 const int freq, int *editDistanceTable, const Correction* correction);
         static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
                 const Correction* correction, const unsigned short *word);
+        static int calcFreqForSplitTwoWordsOld(const int firstFreq, const int secondFreq,
+                const Correction* correction, const unsigned short *word);
         static double calcNormalizedScore(const unsigned short* before, const int beforeLength,
                 const unsigned short* after, const int afterLength, const int score);
         static int editDistance(const unsigned short* before,
diff --git a/native/src/defines.h b/native/src/defines.h
index 31175c369..d739043a4 100644
--- a/native/src/defines.h
+++ b/native/src/defines.h
@@ -117,8 +117,8 @@ static void prof_out(void) {
 #define DEBUG_TRACE DEBUG_DICT_FULL
 #define DEBUG_PROXIMITY_INFO false
 #define DEBUG_CORRECTION false
-#define DEBUG_CORRECTION_FREQ true
-#define DEBUG_WORDS_PRIORITY_QUEUE true
+#define DEBUG_CORRECTION_FREQ false
+#define DEBUG_WORDS_PRIORITY_QUEUE false
 
 #else // FLAG_DBG
 
@@ -213,6 +213,8 @@ static void prof_out(void) {
 #define SUB_QUEUE_MAX_WORDS 1
 #define SUB_QUEUE_MAX_COUNT 10
 
+#define TWO_WORDS_CORRECTION_THRESHOLD 0.22f // compared against calcNormalizedScore() results
+
 #define MAX_DEPTH_MULTIPLIER 3
 
 // TODO: Reduce this constant if possible; check the maximum number of umlauts in the same German
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index 69e3200fc..8be95bc40 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -241,8 +241,24 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
         }
     }
     PROF_END(6);
-    if (DEBUG_WORDS_PRIORITY_QUEUE) {
+    if (DEBUG_DICT) {
         queuePool->dumpSubQueue1TopSuggestions();
+        for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
+            WordsPriorityQueue* queue = queuePool->getSubQueue1(i);
+            if (queue->size() > 0) {
+                WordsPriorityQueue::SuggestedWord* sw = queue->top();
+                const int score = sw->mScore;
+                const unsigned short* word = sw->mWord;
+                const int wordLength = sw->mWordLength;
+                double ns = Correction::RankingAlgorithm::calcNormalizedScore(
+                        proximityInfo->getPrimaryInputWord(), i, word, wordLength, score);
+                ns += 0;
+                AKLOGI("--- TOP SUB WORDS for %d --- %d %f [%d]", i, score, ns,
+                        (ns > TWO_WORDS_CORRECTION_THRESHOLD));
+                DUMP_WORD(proximityInfo->getPrimaryInputWord(), i);
+                DUMP_WORD(word, wordLength);
+            }
+        }
     }
 }
 
@@ -441,6 +457,80 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo
     return;
 }
 
+void UnigramDictionary::getSplitTwoWordsSuggestionsOld(ProximityInfo *proximityInfo,
+        const int *xcoordinates, const int *ycoordinates, const int *codes,
+        const bool useFullEditDistance, const int inputLength, const int missingSpacePos,
+        const int  spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool) {
+    WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
+    // Adds a "first second" candidate to the master queue when both lookups below succeed.
+    if (DEBUG_DICT) {
+        int inputCount = 0;
+        if (spaceProximityPos >= 0) ++inputCount;
+        if (missingSpacePos >= 0) ++inputCount;
+        assert(inputCount <= 1);
+    }
+    const bool isSpaceProximity = spaceProximityPos >= 0;
+    const int firstWordStartPos = 0;
+    const int secondWordStartPos = isSpaceProximity ? (spaceProximityPos + 1) : missingSpacePos;
+    const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
+    const int secondWordLength = isSpaceProximity
+            ? (inputLength - spaceProximityPos - 1)
+            : (inputLength - missingSpacePos);
+
+    if (inputLength >= MAX_WORD_LENGTH) return;
+    if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos
+            || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength)
+        return;
+
+    const int newWordLength = firstWordLength + secondWordLength + 1;
+    // newWordLength covers both words plus the single SPACE written between them.
+
+    // Space proximity preparation
+    //WordsPriorityQueue *subQueue = queuePool->getSubQueue1();
+    //initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, firstWordLength, subQueue,
+    //correction);
+    //getSuggestionCandidates(useFullEditDistance, firstWordLength, correction, subQueue, false,
+    //MAX_ERRORS_FOR_TWO_WORDS);
+    // Variable length array on the stack (a compiler extension in C++). Its size is bounded
+    // because inputLength < MAX_WORD_LENGTH was checked above.
+    unsigned short word[newWordLength];
+    const int firstFreq = getMostFrequentWordLike(
+            firstWordStartPos, firstWordLength, proximityInfo, mWord);
+    if (DEBUG_DICT) {
+        AKLOGI("First freq: %d", firstFreq);
+    }
+    if (firstFreq <= 0) return;
+    // mWord presumably holds the word found above; copy it before the second lookup reuses it.
+    for (int i = 0; i < firstWordLength; ++i) {
+        word[i] = mWord[i];
+    }
+
+    const int secondFreq = getMostFrequentWordLike(
+            secondWordStartPos, secondWordLength, proximityInfo, mWord);
+    if (DEBUG_DICT) {
+        AKLOGI("Second  freq:  %d", secondFreq);
+    }
+    if (secondFreq <= 0) return;
+    // Append the separator, then the second word, which is read from the start of mWord.
+    word[firstWordLength] = SPACE;
+    for (int i = (firstWordLength + 1); i < newWordLength; ++i) {
+        word[i] = mWord[i - firstWordLength - 1];
+    }
+
+    // TODO: Remove initSuggestions and correction->setCorrectionParams
+    initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction);
+
+    correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */,
+            -1 /* transposedPos */, spaceProximityPos, missingSpacePos,
+            useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS);
+    const int pairFreq = correction->getFreqForSplitTwoWords(firstFreq, secondFreq, word);
+    if (DEBUG_DICT) {
+        AKLOGI("Split two words:  %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength);
+    }
+    addWord(word, newWordLength, pairFreq, masterQueue);
+    return;
+}
+
 // Wrapper for getMostFrequentWordLikeInner, which matches it to the previous
 // interface.
 inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h
index 5e7a7580f..b950971bb 100644
--- a/native/src/unigram_dictionary.h
+++ b/native/src/unigram_dictionary.h
@@ -104,6 +104,10 @@ class UnigramDictionary {
             const int *xcoordinates, const int *ycoordinates, const int *codes,
             const bool useFullEditDistance, const int inputLength, const int spaceProximityPos,
             const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool);
+    void getSplitTwoWordsSuggestionsOld(ProximityInfo *proximityInfo,
+            const int *xcoordinates, const int *ycoordinates, const int *codes,
+            const bool useFullEditDistance, const int inputLength, const int missingSpacePos,
+            const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool);
     void getMissingSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates,
             const int *ycoordinates, const int *codes, const bool useFullEditDistance,
             const int inputLength, const int missingSpacePos, Correction *correction,
diff --git a/native/src/words_priority_queue.h b/native/src/words_priority_queue.h
index 54bf27a59..6262439b5 100644
--- a/native/src/words_priority_queue.h
+++ b/native/src/words_priority_queue.h
@@ -81,10 +81,9 @@ class WordsPriorityQueue {
         mSuggestions.push(sw);
     }
 
-    SuggestedWord* topAndPop() {
+    SuggestedWord* top() {  // Returns the top entry without removing it, or 0 when empty.
         if (mSuggestions.empty()) return 0;
         SuggestedWord* sw = mSuggestions.top();
-        mSuggestions.pop();
         return sw;
     }
 
@@ -112,7 +111,7 @@ class WordsPriorityQueue {
         return size;
     }
 
-    int size() {
+    int size() const {
         return mSuggestions.size();
     }