Merge "Demote words with a capitalized char"
commit
c812d0b816
|
@ -15,6 +15,7 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
#include <ctype.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
|
@ -89,8 +90,10 @@ void Correction::checkState() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int Correction::getFreqForSplitTwoWords(const int firstFreq, const int secondFreq) {
|
int Correction::getFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
|
||||||
return Correction::RankingAlgorithm::calcFreqForSplitTwoWords(firstFreq, secondFreq, this);
|
const unsigned short *word) {
|
||||||
|
return Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
|
||||||
|
firstFreq, secondFreq, this, word);
|
||||||
}
|
}
|
||||||
|
|
||||||
int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLength) {
|
int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLength) {
|
||||||
|
@ -498,6 +501,16 @@ inline static int getQuoteCount(const unsigned short* word, const int length) {
|
||||||
return quoteCount;
|
return quoteCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline static bool isUpperCase(unsigned short c) {
|
||||||
|
if (c < sizeof(BASE_CHARS) / sizeof(BASE_CHARS[0])) {
|
||||||
|
c = BASE_CHARS[c];
|
||||||
|
}
|
||||||
|
if (isupper(c)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
/* static */
|
/* static */
|
||||||
inline static int editDistance(
|
inline static int editDistance(
|
||||||
int* editDistanceTable, const unsigned short* input,
|
int* editDistanceTable, const unsigned short* input,
|
||||||
|
@ -749,7 +762,8 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
|
||||||
|
|
||||||
/* static */
|
/* static */
|
||||||
int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
|
int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
|
||||||
const int firstFreq, const int secondFreq, const Correction* correction) {
|
const int firstFreq, const int secondFreq, const Correction* correction,
|
||||||
|
const unsigned short *word) {
|
||||||
const int spaceProximityPos = correction->mSpaceProximityPos;
|
const int spaceProximityPos = correction->mSpaceProximityPos;
|
||||||
const int missingSpacePos = correction->mMissingSpacePos;
|
const int missingSpacePos = correction->mMissingSpacePos;
|
||||||
if (DEBUG_DICT) {
|
if (DEBUG_DICT) {
|
||||||
|
@ -761,11 +775,27 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
|
||||||
const bool isSpaceProximity = spaceProximityPos >= 0;
|
const bool isSpaceProximity = spaceProximityPos >= 0;
|
||||||
const int inputLength = correction->mInputLength;
|
const int inputLength = correction->mInputLength;
|
||||||
const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
|
const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
|
||||||
const int secondWordLength = isSpaceProximity
|
const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1)
|
||||||
? (inputLength - spaceProximityPos - 1)
|
|
||||||
: (inputLength - missingSpacePos);
|
: (inputLength - missingSpacePos);
|
||||||
const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
|
const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
|
||||||
|
|
||||||
|
bool firstCapitalizedWordDemotion = false;
|
||||||
|
if (firstWordLength >= 2) {
|
||||||
|
firstCapitalizedWordDemotion = isUpperCase(word[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool secondCapitalizedWordDemotion = false;
|
||||||
|
if (secondWordLength >= 2) {
|
||||||
|
secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
const bool capitalizedWordDemotion =
|
||||||
|
firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion;
|
||||||
|
|
||||||
|
if (DEBUG_DICT_FULL) {
|
||||||
|
LOGI("Two words: %c, %c, %d", word[0], word[firstWordLength + 1], capitalizedWordDemotion);
|
||||||
|
}
|
||||||
|
|
||||||
if (firstWordLength == 0 || secondWordLength == 0) {
|
if (firstWordLength == 0 || secondWordLength == 0) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -815,6 +845,11 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
|
||||||
}
|
}
|
||||||
|
|
||||||
multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);
|
multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);
|
||||||
|
|
||||||
|
if (capitalizedWordDemotion) {
|
||||||
|
multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq);
|
||||||
|
}
|
||||||
|
|
||||||
return totalFreq;
|
return totalFreq;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -73,7 +73,8 @@ public:
|
||||||
|
|
||||||
bool needsToPrune() const;
|
bool needsToPrune() const;
|
||||||
|
|
||||||
int getFreqForSplitTwoWords(const int firstFreq, const int secondFreq);
|
int getFreqForSplitTwoWords(
|
||||||
|
const int firstFreq, const int secondFreq, const unsigned short *word);
|
||||||
int getFinalFreq(const int freq, unsigned short **word, int* wordLength);
|
int getFinalFreq(const int freq, unsigned short **word, int* wordLength);
|
||||||
|
|
||||||
CorrectionType processCharAndCalcState(const int32_t c, const bool isTerminal);
|
CorrectionType processCharAndCalcState(const int32_t c, const bool isTerminal);
|
||||||
|
@ -151,7 +152,7 @@ private:
|
||||||
static int calculateFinalFreq(const int inputIndex, const int depth,
|
static int calculateFinalFreq(const int inputIndex, const int depth,
|
||||||
const int freq, int *editDistanceTable, const Correction* correction);
|
const int freq, int *editDistanceTable, const Correction* correction);
|
||||||
static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
|
static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
|
||||||
const Correction* correction);
|
const Correction* correction, const unsigned short *word);
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
|
|
|
@ -189,6 +189,7 @@ static void dumpWord(const unsigned short* word, const int length) {
|
||||||
#define CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE 45
|
#define CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE 45
|
||||||
#define INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE 70
|
#define INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE 70
|
||||||
#define FIRST_CHAR_DIFFERENT_DEMOTION_RATE 96
|
#define FIRST_CHAR_DIFFERENT_DEMOTION_RATE 96
|
||||||
|
#define TWO_WORDS_CAPITALIZED_DEMOTION_RATE 50
|
||||||
|
|
||||||
// This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
|
// This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
|
||||||
// This is only used for the size of array. Not to be used in c functions.
|
// This is only used for the size of array. Not to be used in c functions.
|
||||||
|
|
|
@ -431,7 +431,7 @@ void UnigramDictionary::getSplitTwoWordsSuggestion(
|
||||||
word[i] = mWord[i - firstWordLength - 1];
|
word[i] = mWord[i - firstWordLength - 1];
|
||||||
}
|
}
|
||||||
|
|
||||||
const int pairFreq = mCorrection->getFreqForSplitTwoWords(firstFreq, secondFreq);
|
const int pairFreq = mCorrection->getFreqForSplitTwoWords(firstFreq, secondFreq, word);
|
||||||
if (DEBUG_DICT) {
|
if (DEBUG_DICT) {
|
||||||
LOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength);
|
LOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue