am c812d0b8: Merge "Demote words with a capitalized char"

* commit 'c812d0b816c6e3ac4a2df23b5831b17acdc7c414': Demote words with a capitalized char
2011-10-03 04:29:01 -07:00 · 2011-10-03 04:29:01 -07:00 · fb5b1b2eaa
parent 35988e0858 c812d0b816
commit fb5b1b2eaa
4 changed files with 45 additions and 8 deletions
--- a/native/src/correction.cpp
+++ b/native/src/correction.cpp
@ -15,6 +15,7 @@
 */
 #include <assert.h>
 #include <ctype.h>
 #include <stdio.h>
 #include <string.h>
@ -89,8 +90,10 @@ void Correction::checkState() {
    }
 }
-int Correction::getFreqForSplitTwoWords(const int firstFreq, const int secondFreq) {
+int Correction::getFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
-    return Correction::RankingAlgorithm::calcFreqForSplitTwoWords(firstFreq, secondFreq, this);
+        const unsigned short *word) {
    return Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
            firstFreq, secondFreq, this, word);
 }
 int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLength) {
@ -498,6 +501,16 @@ inline static int getQuoteCount(const unsigned short* word, const int length) {
    return quoteCount;
 }
 /**
  * Returns true when c counts as an upper-case letter.
  *
  * A code unit that indexes into BASE_CHARS is first replaced by its table
  * entry (presumably the accent-stripped base letter -- confirm against the
  * table definition); the result is then classified with isupper().
  *
  * @param c UTF-16 style code unit to classify.
  * @return true if the (possibly remapped) value is upper case per isupper().
  */
 inline static bool isUpperCase(unsigned short c) {
     if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
         c = BASE_CHARS[c];
     }
     // isupper() is only defined for EOF and for values representable as
     // unsigned char. Wider values are reported as "not upper case" rather
     // than triggering undefined behavior (out-of-range table lookups or a
     // silent truncation that yields false positives).
     if (c > 0xFF) {
         return false;
     }
     return isupper(c) != 0;
 }
 /* static */
 inline static int editDistance(
        int* editDistanceTable, const unsigned short* input,
@ -749,7 +762,8 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
 /* static */
 int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
-        const int firstFreq, const int secondFreq, const Correction* correction) {
+        const int firstFreq, const int secondFreq, const Correction* correction,
        const unsigned short *word) {
    const int spaceProximityPos = correction->mSpaceProximityPos;
    const int missingSpacePos = correction->mMissingSpacePos;
    if (DEBUG_DICT) {
@ -761,11 +775,27 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
    const bool isSpaceProximity = spaceProximityPos >= 0;
    const int inputLength = correction->mInputLength;
    const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
-    const int secondWordLength = isSpaceProximity
+    const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1)
            ? (inputLength - spaceProximityPos - 1)
            : (inputLength - missingSpacePos);
    const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
    bool firstCapitalizedWordDemotion = false;
    if (firstWordLength >= 2) {
        firstCapitalizedWordDemotion = isUpperCase(word[0]);
    }
    bool secondCapitalizedWordDemotion = false;
    if (secondWordLength >= 2) {
        secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]);
    }
    const bool capitalizedWordDemotion =
            firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion;
    if (DEBUG_DICT_FULL) {
        LOGI("Two words: %c, %c, %d", word[0], word[firstWordLength + 1], capitalizedWordDemotion);
    }
    if (firstWordLength == 0 || secondWordLength == 0) {
        return 0;
    }
@ -815,6 +845,11 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
    }
    multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);
    if (capitalizedWordDemotion) {
        multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq);
    }
    return totalFreq;
 }
--- a/native/src/correction.h
+++ b/native/src/correction.h
@ -73,7 +73,8 @@ public:
    bool needsToPrune() const;
-    int getFreqForSplitTwoWords(const int firstFreq, const int secondFreq);
+    int getFreqForSplitTwoWords(
            const int firstFreq, const int secondFreq, const unsigned short *word);
    int getFinalFreq(const int freq, unsigned short **word, int* wordLength);
    CorrectionType processCharAndCalcState(const int32_t c, const bool isTerminal);
@ -151,7 +152,7 @@ private:
        static int calculateFinalFreq(const int inputIndex, const int depth,
                const int freq, int *editDistanceTable, const Correction* correction);
        static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
-                const Correction* correction);
+                const Correction* correction, const unsigned short *word);
    };
 };
 } // namespace latinime
--- a/native/src/defines.h
+++ b/native/src/defines.h
@ -189,6 +189,7 @@ static void dumpWord(const unsigned short* word, const int length) {
 #define CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE 45
 #define INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE 70
 #define FIRST_CHAR_DIFFERENT_DEMOTION_RATE 96
 #define TWO_WORDS_CAPITALIZED_DEMOTION_RATE 50
 // This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
 // This is only used for the size of array. Not to be used in c functions.
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@ -431,7 +431,7 @@ void UnigramDictionary::getSplitTwoWordsSuggestion(
        word[i] = mWord[i - firstWordLength - 1];
    }
-    const int pairFreq = mCorrection->getFreqForSplitTwoWords(firstFreq, secondFreq);
+    const int pairFreq = mCorrection->getFreqForSplitTwoWords(firstFreq, secondFreq, word);
    if (DEBUG_DICT) {
        LOGI("Split two words:  %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength);
    }