From eb050fc2dc97a7e6ddcaf254c110dc16279dfd0d Mon Sep 17 00:00:00 2001 From: satok Date: Mon, 3 Oct 2011 19:21:13 +0900 Subject: [PATCH] Demote words with a capitalized char Bug: 5371514 +1 4 -1 2 +2 0 -2 0 +3 0 -3 0 +4 1 -4 3 +5 0 -5 12 +6 3 -6 3 +7 12 -7 0 Change-Id: I6b46e43f9059f1e8a1cc02a626ea6eb8f1f9924f --- native/src/correction.cpp | 45 +++++++++++++++++++++++++++---- native/src/correction.h | 5 ++-- native/src/defines.h | 1 + native/src/unigram_dictionary.cpp | 2 +- 4 files changed, 45 insertions(+), 8 deletions(-) diff --git a/native/src/correction.cpp b/native/src/correction.cpp index 5128c2e5c..9e75ffc3e 100644 --- a/native/src/correction.cpp +++ b/native/src/correction.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -89,8 +90,10 @@ void Correction::checkState() { } } -int Correction::getFreqForSplitTwoWords(const int firstFreq, const int secondFreq) { - return Correction::RankingAlgorithm::calcFreqForSplitTwoWords(firstFreq, secondFreq, this); +int Correction::getFreqForSplitTwoWords(const int firstFreq, const int secondFreq, + const unsigned short *word) { + return Correction::RankingAlgorithm::calcFreqForSplitTwoWords( + firstFreq, secondFreq, this, word); } int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLength) { @@ -498,6 +501,16 @@ inline static int getQuoteCount(const unsigned short* word, const int length) { return quoteCount; } +inline static bool isUpperCase(unsigned short c) { + if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) { + c = BASE_CHARS[c]; + } + if (isupper(c)) { + return true; + } + return false; +} + /* static */ inline static int editDistance( int* editDistanceTable, const unsigned short* input, @@ -749,7 +762,8 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const /* static */ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords( - const int firstFreq, const int secondFreq, const Correction* correction) { + const int firstFreq, const int secondFreq, const Correction* correction, + const unsigned short *word) { const int spaceProximityPos = correction->mSpaceProximityPos; const int missingSpacePos = correction->mMissingSpacePos; if (DEBUG_DICT) { @@ -761,11 +775,27 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords( const bool isSpaceProximity = spaceProximityPos >= 0; const int inputLength = correction->mInputLength; const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos; - const int secondWordLength = isSpaceProximity - ? (inputLength - spaceProximityPos - 1) + const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1) : (inputLength - missingSpacePos); const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; + bool firstCapitalizedWordDemotion = false; + if (firstWordLength >= 2) { + firstCapitalizedWordDemotion = isUpperCase(word[0]); + } + + bool secondCapitalizedWordDemotion = false; + if (secondWordLength >= 2) { + secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]); + } + + const bool capitalizedWordDemotion = + firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion; + + if (DEBUG_DICT_FULL) { + LOGI("Two words: %c, %c, %d", word[0], word[firstWordLength + 1], capitalizedWordDemotion); + } + if (firstWordLength == 0 || secondWordLength == 0) { return 0; } @@ -815,6 +845,11 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords( } multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq); + + if (capitalizedWordDemotion) { + multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq); + } + return totalFreq; } diff --git a/native/src/correction.h b/native/src/correction.h index 84e075266..a630646c1 100644 --- a/native/src/correction.h +++ b/native/src/correction.h @@ -73,7 +73,8 @@ public: bool needsToPrune() const; - int getFreqForSplitTwoWords(const int firstFreq, const int secondFreq); + int getFreqForSplitTwoWords( + const int firstFreq, const int secondFreq, const unsigned short *word); int getFinalFreq(const int freq, unsigned short **word, int* wordLength); CorrectionType processCharAndCalcState(const int32_t c, const bool isTerminal); @@ -151,7 +152,7 @@ private: static int calculateFinalFreq(const int inputIndex, const int depth, const int freq, int *editDistanceTable, const Correction* correction); static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq, - const Correction* correction); + const Correction* correction, const unsigned short *word); }; }; } // namespace latinime diff --git a/native/src/defines.h b/native/src/defines.h index dab862924..57bd9f763 100644 --- a/native/src/defines.h +++ b/native/src/defines.h @@ -189,6 +189,7 @@ static void dumpWord(const unsigned short* word, const int length) { #define CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE 45 #define INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE 70 #define FIRST_CHAR_DIFFERENT_DEMOTION_RATE 96 +#define TWO_WORDS_CAPITALIZED_DEMOTION_RATE 50 // This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java // This is only used for the size of array. Not to be used in c functions. diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index f23bd3208..8eb5a9700 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -431,7 +431,7 @@ void UnigramDictionary::getSplitTwoWordsSuggestion( word[i] = mWord[i - firstWordLength - 1]; } - const int pairFreq = mCorrection->getFreqForSplitTwoWords(firstFreq, secondFreq); + const int pairFreq = mCorrection->getFreqForSplitTwoWords(firstFreq, secondFreq, word); if (DEBUG_DICT) { LOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength); }