am c812d0b8: Merge "Demote words with a capitalized char"

* commit 'c812d0b816c6e3ac4a2df23b5831b17acdc7c414':
  Demote words with a capitalized char
main
satok 2011-10-03 04:29:01 -07:00 committed by Android Git Automerger
commit fb5b1b2eaa
4 changed files with 45 additions and 8 deletions

View File

@ -15,6 +15,7 @@
*/ */
#include <assert.h> #include <assert.h>
#include <ctype.h>
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
@ -89,8 +90,10 @@ void Correction::checkState() {
} }
} }
int Correction::getFreqForSplitTwoWords(const int firstFreq, const int secondFreq) { int Correction::getFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
return Correction::RankingAlgorithm::calcFreqForSplitTwoWords(firstFreq, secondFreq, this); const unsigned short *word) {
return Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
firstFreq, secondFreq, this, word);
} }
int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLength) { int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLength) {
@ -498,6 +501,16 @@ inline static int getQuoteCount(const unsigned short* word, const int length) {
return quoteCount; return quoteCount;
} }
// Returns true when c, after the optional BASE_CHARS lookup below, is an
// ASCII upper-case letter ('A'..'Z').
//
// A code unit that indexes inside the BASE_CHARS table is first replaced by
// its table entry (presumably the accent-stripped base character -- confirm
// against the BASE_CHARS definition); anything past the end of the table is
// tested unchanged.
//
// The check is an explicit range comparison instead of isupper(): c is a
// 16-bit code unit, and handing isupper() a value that is neither EOF nor
// representable as unsigned char is undefined behavior. For values that
// isupper() does accept, the result is the same in the "C" locale.
inline static bool isUpperCase(unsigned short c) {
    if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
        c = BASE_CHARS[c];
    }
    return c >= 'A' && c <= 'Z';
}
/* static */ /* static */
inline static int editDistance( inline static int editDistance(
int* editDistanceTable, const unsigned short* input, int* editDistanceTable, const unsigned short* input,
@ -749,7 +762,8 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
/* static */ /* static */
int Correction::RankingAlgorithm::calcFreqForSplitTwoWords( int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
const int firstFreq, const int secondFreq, const Correction* correction) { const int firstFreq, const int secondFreq, const Correction* correction,
const unsigned short *word) {
const int spaceProximityPos = correction->mSpaceProximityPos; const int spaceProximityPos = correction->mSpaceProximityPos;
const int missingSpacePos = correction->mMissingSpacePos; const int missingSpacePos = correction->mMissingSpacePos;
if (DEBUG_DICT) { if (DEBUG_DICT) {
@ -761,11 +775,27 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
const bool isSpaceProximity = spaceProximityPos >= 0; const bool isSpaceProximity = spaceProximityPos >= 0;
const int inputLength = correction->mInputLength; const int inputLength = correction->mInputLength;
const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos; const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
const int secondWordLength = isSpaceProximity const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1)
? (inputLength - spaceProximityPos - 1)
: (inputLength - missingSpacePos); : (inputLength - missingSpacePos);
const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
bool firstCapitalizedWordDemotion = false;
if (firstWordLength >= 2) {
firstCapitalizedWordDemotion = isUpperCase(word[0]);
}
bool secondCapitalizedWordDemotion = false;
if (secondWordLength >= 2) {
secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]);
}
const bool capitalizedWordDemotion =
firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion;
if (DEBUG_DICT_FULL) {
LOGI("Two words: %c, %c, %d", word[0], word[firstWordLength + 1], capitalizedWordDemotion);
}
if (firstWordLength == 0 || secondWordLength == 0) { if (firstWordLength == 0 || secondWordLength == 0) {
return 0; return 0;
} }
@ -815,6 +845,11 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
} }
multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq); multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);
if (capitalizedWordDemotion) {
multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq);
}
return totalFreq; return totalFreq;
} }

View File

@ -73,7 +73,8 @@ public:
bool needsToPrune() const; bool needsToPrune() const;
int getFreqForSplitTwoWords(const int firstFreq, const int secondFreq); int getFreqForSplitTwoWords(
const int firstFreq, const int secondFreq, const unsigned short *word);
int getFinalFreq(const int freq, unsigned short **word, int* wordLength); int getFinalFreq(const int freq, unsigned short **word, int* wordLength);
CorrectionType processCharAndCalcState(const int32_t c, const bool isTerminal); CorrectionType processCharAndCalcState(const int32_t c, const bool isTerminal);
@ -151,7 +152,7 @@ private:
static int calculateFinalFreq(const int inputIndex, const int depth, static int calculateFinalFreq(const int inputIndex, const int depth,
const int freq, int *editDistanceTable, const Correction* correction); const int freq, int *editDistanceTable, const Correction* correction);
static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq, static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
const Correction* correction); const Correction* correction, const unsigned short *word);
}; };
}; };
} // namespace latinime } // namespace latinime

View File

@ -189,6 +189,7 @@ static void dumpWord(const unsigned short* word, const int length) {
#define CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE 45 #define CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE 45
#define INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE 70 #define INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE 70
#define FIRST_CHAR_DIFFERENT_DEMOTION_RATE 96 #define FIRST_CHAR_DIFFERENT_DEMOTION_RATE 96
#define TWO_WORDS_CAPITALIZED_DEMOTION_RATE 50
// This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java // This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
// This is only used for the size of array. Not to be used in c functions. // This is only used for the size of array. Not to be used in c functions.

View File

@ -431,7 +431,7 @@ void UnigramDictionary::getSplitTwoWordsSuggestion(
word[i] = mWord[i - firstWordLength - 1]; word[i] = mWord[i - firstWordLength - 1];
} }
const int pairFreq = mCorrection->getFreqForSplitTwoWords(firstFreq, secondFreq); const int pairFreq = mCorrection->getFreqForSplitTwoWords(firstFreq, secondFreq, word);
if (DEBUG_DICT) { if (DEBUG_DICT) {
LOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength); LOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength);
} }