Merge "Support multi words suggestion"
This commit is contained in:
commit
bb7a39b4c5
5 changed files with 125 additions and 64 deletions
|
@ -159,10 +159,10 @@ void Correction::checkState() {
|
|||
}
|
||||
}
|
||||
|
||||
int Correction::getFreqForSplitTwoWords(const int *freqArray, const int *wordLengthArray,
|
||||
const bool isSpaceProximity, const unsigned short *word) {
|
||||
return Correction::RankingAlgorithm::calcFreqForSplitTwoWords(freqArray, wordLengthArray, this,
|
||||
isSpaceProximity, word);
|
||||
int Correction::getFreqForSplitMultipleWords(const int *freqArray, const int *wordLengthArray,
|
||||
const int wordCount, const bool isSpaceProximity, const unsigned short *word) {
|
||||
return Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(freqArray, wordLengthArray,
|
||||
wordCount, this, isSpaceProximity, word);
|
||||
}
|
||||
|
||||
int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLength) {
|
||||
|
@ -911,45 +911,85 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
|
|||
}
|
||||
|
||||
/* static */
|
||||
int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
|
||||
const int *freqArray, const int *wordLengthArray, const Correction* correction,
|
||||
const bool isSpaceProximity, const unsigned short *word) {
|
||||
const int firstFreq = freqArray[0];
|
||||
const int secondFreq = freqArray[1];
|
||||
const int firstWordLength = wordLengthArray[0];
|
||||
const int secondWordLength = wordLengthArray[1];
|
||||
int Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(
|
||||
const int *freqArray, const int *wordLengthArray, const int wordCount,
|
||||
const Correction* correction, const bool isSpaceProximity, const unsigned short *word) {
|
||||
const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
|
||||
|
||||
bool firstCapitalizedWordDemotion = false;
|
||||
if (firstWordLength >= 2) {
|
||||
firstCapitalizedWordDemotion = isUpperCase(word[0]);
|
||||
bool secondCapitalizedWordDemotion = false;
|
||||
|
||||
{
|
||||
// TODO: Handle multiple capitalized word demotion properly
|
||||
const int firstWordLength = wordLengthArray[0];
|
||||
const int secondWordLength = wordLengthArray[1];
|
||||
if (firstWordLength >= 2) {
|
||||
firstCapitalizedWordDemotion = isUpperCase(word[0]);
|
||||
}
|
||||
|
||||
if (secondWordLength >= 2) {
|
||||
// FIXME: word[firstWordLength + 1] is incorrect.
|
||||
secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]);
|
||||
}
|
||||
}
|
||||
|
||||
bool secondCapitalizedWordDemotion = false;
|
||||
if (secondWordLength >= 2) {
|
||||
secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]);
|
||||
}
|
||||
|
||||
const bool capitalizedWordDemotion =
|
||||
firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion;
|
||||
|
||||
if (firstWordLength == 0 || secondWordLength == 0) {
|
||||
int totalLength = 0;
|
||||
int totalFreq = 0;
|
||||
for (int i = 0; i < wordCount; ++i){
|
||||
const int wordLength = wordLengthArray[i];
|
||||
if (wordLength <= 0) {
|
||||
return 0;
|
||||
}
|
||||
totalLength += wordLength;
|
||||
const int demotionRate = 100 - TWO_WORDS_CORRECTION_DEMOTION_BASE / (wordLength + 1);
|
||||
int tempFirstFreq = freqArray[i];
|
||||
multiplyRate(demotionRate, &tempFirstFreq);
|
||||
totalFreq += tempFirstFreq;
|
||||
}
|
||||
|
||||
if (totalLength <= 0 || totalFreq <= 0) {
|
||||
return 0;
|
||||
}
|
||||
const int firstDemotionRate = 100 - TWO_WORDS_CORRECTION_DEMOTION_BASE / (firstWordLength + 1);
|
||||
int tempFirstFreq = firstFreq;
|
||||
multiplyRate(firstDemotionRate, &tempFirstFreq);
|
||||
|
||||
const int secondDemotionRate = 100
|
||||
- TWO_WORDS_CORRECTION_DEMOTION_BASE / (secondWordLength + 1);
|
||||
int tempSecondFreq = secondFreq;
|
||||
multiplyRate(secondDemotionRate, &tempSecondFreq);
|
||||
|
||||
const int totalLength = firstWordLength + secondWordLength;
|
||||
|
||||
// TODO: Currently totalFreq is adjusted to the two-word metric.
|
||||
// Promote pairFreq with multiplying by 2, because the word length is the same as the typed
|
||||
// length.
|
||||
int totalFreq = tempFirstFreq + tempSecondFreq;
|
||||
totalFreq = totalFreq * 2 / wordCount;
|
||||
if (wordCount > 2) {
|
||||
// Safety net for 3+ words -- Caveats: many heuristics and workarounds here.
|
||||
int oneLengthCounter = 0;
|
||||
int twoLengthCounter = 0;
|
||||
for (int i = 0; i < wordCount; ++i) {
|
||||
const int wordLength = wordLengthArray[i];
|
||||
// TODO: Use bigram instead of this safety net
|
||||
if (i < wordCount - 1) {
|
||||
const int nextWordLength = wordLengthArray[i + 1];
|
||||
if (wordLength == 1 && nextWordLength == 2) {
|
||||
// Safety net to filter 1 length and 2 length sequential words
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
const int freq = freqArray[i];
|
||||
// Demote too short weak words
|
||||
if (wordLength <= 4 && freq <= MAX_FREQ * 2 / 3 /* heuristic... */) {
|
||||
multiplyRate(100 * freq / MAX_FREQ, &totalFreq);
|
||||
}
|
||||
if (wordLength == 1) {
|
||||
++oneLengthCounter;
|
||||
} else if (wordLength == 2) {
|
||||
++twoLengthCounter;
|
||||
}
|
||||
if (oneLengthCounter >= 2 || (oneLengthCounter + twoLengthCounter) >= 4) {
|
||||
// Safety net to filter too many short words
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
multiplyRate(MULTIPLE_WORDS_DEMOTION_RATE, &totalFreq);
|
||||
}
|
||||
|
||||
// This is a workaround to try offsetting the not-enough-demotion which will be done in
|
||||
// calcNormalizedScore in Utils.java.
|
||||
|
@ -993,9 +1033,9 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
|
|||
}
|
||||
|
||||
if (DEBUG_CORRECTION_FREQ) {
|
||||
AKLOGI("Two words (%d, %d) (%d, %d) %d, %d", firstFreq, secondFreq, firstWordLength,
|
||||
secondWordLength, capitalizedWordDemotion, totalFreq);
|
||||
DUMP_WORD(word, firstWordLength);
|
||||
AKLOGI("Multiple words (%d, %d) (%d, %d) %d, %d", freqArray[0], freqArray[1],
|
||||
wordLengthArray[0], wordLengthArray[1], capitalizedWordDemotion, totalFreq);
|
||||
DUMP_WORD(word, wordLengthArray[0]);
|
||||
}
|
||||
|
||||
return totalFreq;
|
||||
|
|
|
@ -121,9 +121,9 @@ class Correction {
|
|||
|
||||
bool needsToPrune() const;
|
||||
|
||||
int getFreqForSplitTwoWords(
|
||||
const int *freqArray, const int *wordLengthArray, const bool isSpaceProximity,
|
||||
const unsigned short *word);
|
||||
int getFreqForSplitMultipleWords(
|
||||
const int *freqArray, const int *wordLengthArray, const int wordCount,
|
||||
const bool isSpaceProximity, const unsigned short *word);
|
||||
int getFinalFreq(const int freq, unsigned short **word, int* wordLength);
|
||||
int getFinalFreqForSubQueue(const int freq, unsigned short **word, int* wordLength,
|
||||
const int inputLength);
|
||||
|
@ -151,8 +151,8 @@ class Correction {
|
|||
static int calculateFinalFreq(const int inputIndex, const int depth,
|
||||
const int freq, int *editDistanceTable, const Correction* correction,
|
||||
const int inputLength);
|
||||
static int calcFreqForSplitTwoWords(const int *freqArray, const int *wordLengthArray,
|
||||
const Correction* correction, const bool isSpaceProximity,
|
||||
static int calcFreqForSplitMultipleWords(const int *freqArray, const int *wordLengthArray,
|
||||
const int wordCount, const Correction* correction, const bool isSpaceProximity,
|
||||
const unsigned short *word);
|
||||
static double calcNormalizedScore(const unsigned short* before, const int beforeLength,
|
||||
const unsigned short* after, const int afterLength, const int score);
|
||||
|
|
|
@ -208,6 +208,7 @@ static void prof_out(void) {
|
|||
#define ZERO_DISTANCE_PROMOTION_RATE 110
|
||||
#define NEUTRAL_SCORE_SQUARED_RADIUS 8.0f
|
||||
#define HALF_SCORE_SQUARED_RADIUS 32.0f
|
||||
#define MAX_FREQ 255
|
||||
|
||||
// This must be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
|
||||
// This is only used for the size of an array. Not to be used in C functions.
|
||||
|
@ -222,7 +223,9 @@ static void prof_out(void) {
|
|||
#define SUB_QUEUE_MAX_WORDS 1
|
||||
#define SUB_QUEUE_MAX_COUNT 10
|
||||
#define SUB_QUEUE_MIN_WORD_LENGTH 4
|
||||
#define MULTIPLE_WORDS_SUGGESTION_MAX_WORDS 2
|
||||
#define MULTIPLE_WORDS_SUGGESTION_MAX_WORDS 10
|
||||
#define MULTIPLE_WORDS_DEMOTION_RATE 80
|
||||
#define MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION 6
|
||||
|
||||
#define TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD 0.39
|
||||
#define START_TWO_WORDS_CORRECTION_THRESHOLD 0.22
|
||||
|
@ -230,7 +233,6 @@ static void prof_out(void) {
|
|||
#define MAX_DEPTH_MULTIPLIER 3
|
||||
|
||||
#define FIRST_WORD_INDEX 0
|
||||
#define SECOND_WORD_INDEX 1
|
||||
|
||||
// TODO: Reduce this constant if possible; check the maximum number of umlauts in the same German
|
||||
// word in the dictionary
|
||||
|
@ -248,7 +250,7 @@ template<typename T> inline T max(T a, T b) { return a > b ? a : b; }
|
|||
#define NEUTRAL_AREA_RADIUS_RATIO 1.3f
|
||||
|
||||
// DEBUG
|
||||
#define INPUTLENGTH_FOR_DEBUG 10
|
||||
#define INPUTLENGTH_FOR_DEBUG -1
|
||||
#define MIN_OUTPUT_INDEX_FOR_DEBUG -1
|
||||
|
||||
#endif // LATINIME_DEFINES_H
|
||||
|
|
|
@ -224,7 +224,7 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
|
|||
// Multiple word suggestions
|
||||
if (SUGGEST_MULTIPLE_WORDS
|
||||
&& inputLength >= MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION) {
|
||||
getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
|
||||
getSplitMultipleWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
|
||||
useFullEditDistance, inputLength, correction, queuePool,
|
||||
hasAutoCorrectionCandidate);
|
||||
}
|
||||
|
@ -445,17 +445,18 @@ bool UnigramDictionary::getSubStringSuggestion(
|
|||
if (outputWordStartPos + nextWordLength >= MAX_WORD_LENGTH) {
|
||||
return false;
|
||||
}
|
||||
outputWord[outputWordStartPos + tempOutputWordLength] = SPACE;
|
||||
outputWord[tempOutputWordLength] = SPACE;
|
||||
if (outputWordLength) {
|
||||
++*outputWordLength;
|
||||
}
|
||||
} else if (currentWordIndex >= 1) {
|
||||
// TODO: Handle 3 or more words
|
||||
const int pairFreq = correction->getFreqForSplitTwoWords(
|
||||
freqArray, wordLengthArray, isSpaceProximity, outputWord);
|
||||
const int pairFreq = correction->getFreqForSplitMultipleWords(
|
||||
freqArray, wordLengthArray, currentWordIndex + 1, isSpaceProximity, outputWord);
|
||||
if (DEBUG_DICT) {
|
||||
AKLOGI("Split two words: %d, %d, %d, %d, (%d)", freqArray[0], freqArray[1], pairFreq,
|
||||
inputLength, wordLengthArray[0]);
|
||||
DUMP_WORD(outputWord, tempOutputWordLength);
|
||||
AKLOGI("Split two words: %d, %d, %d, %d, (%d) %d", freqArray[0], freqArray[1], pairFreq,
|
||||
inputLength, wordLengthArray[0], tempOutputWordLength);
|
||||
}
|
||||
addWord(outputWord, tempOutputWordLength, pairFreq, queuePool->getMasterQueue());
|
||||
}
|
||||
|
@ -473,30 +474,46 @@ void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo,
|
|||
// Return if this is the last word index
|
||||
return;
|
||||
}
|
||||
for (int i = 1; i < inputLength; ++i) {
|
||||
int tempOutputWordLength = 0;
|
||||
// First word
|
||||
int inputWordStartPos = 0;
|
||||
int inputWordLength = i;
|
||||
if (startWordIndex >= 1
|
||||
&& (hasAutoCorrectionCandidate
|
||||
|| inputLength < MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION)) {
|
||||
// Do not suggest 3+ words if there is already an auto correction candidate or the input is too short
|
||||
return;
|
||||
}
|
||||
for (int i = startInputPos + 1; i < inputLength; ++i) {
|
||||
if (DEBUG_CORRECTION_FREQ) {
|
||||
AKLOGI("Two words, %d", inputWordLength);
|
||||
AKLOGI("Multi words(%d), start in %d sep %d start out %d",
|
||||
startWordIndex, startInputPos, i, outputWordLength);
|
||||
DUMP_WORD(outputWord, outputWordLength);
|
||||
}
|
||||
int tempOutputWordLength = 0;
|
||||
// Current word
|
||||
int inputWordStartPos = startInputPos;
|
||||
int inputWordLength = i - startInputPos;
|
||||
if (!getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
|
||||
useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
|
||||
FIRST_WORD_INDEX, inputWordStartPos, inputWordLength, 0, true /* not used */,
|
||||
freqArray, wordLengthArray, outputWord, &tempOutputWordLength)) {
|
||||
startWordIndex, inputWordStartPos, inputWordLength, outputWordLength,
|
||||
true /* not used */, freqArray, wordLengthArray, outputWord,
|
||||
&tempOutputWordLength)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Second word
|
||||
if (DEBUG_CORRECTION_FREQ) {
|
||||
AKLOGI("Do missing space correction");
|
||||
}
|
||||
// Next word
|
||||
// Missing space
|
||||
inputWordStartPos = i;
|
||||
inputWordLength = inputLength - i;
|
||||
getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
|
||||
if(!getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
|
||||
useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
|
||||
SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, tempOutputWordLength,
|
||||
false /* missing space */, freqArray, wordLengthArray, outputWord,
|
||||
0);
|
||||
startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength,
|
||||
false /* missing space */, freqArray, wordLengthArray, outputWord, 0)) {
|
||||
getMultiWordsSuggestionRec(proximityInfo, xcoordinates, ycoordinates, codes,
|
||||
useFullEditDistance, inputLength, correction, queuePool,
|
||||
hasAutoCorrectionCandidate, inputWordStartPos, startWordIndex + 1,
|
||||
tempOutputWordLength, freqArray, wordLengthArray, outputWord);
|
||||
}
|
||||
|
||||
// Mistyped space
|
||||
++inputWordStartPos;
|
||||
|
@ -512,15 +529,17 @@ void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo,
|
|||
continue;
|
||||
}
|
||||
|
||||
if (DEBUG_CORRECTION_FREQ) {
|
||||
AKLOGI("Do mistyped space correction");
|
||||
}
|
||||
getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
|
||||
useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
|
||||
SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, tempOutputWordLength,
|
||||
true /* mistyped space */, freqArray, wordLengthArray, outputWord,
|
||||
0);
|
||||
startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength,
|
||||
true /* mistyped space */, freqArray, wordLengthArray, outputWord, 0);
|
||||
}
|
||||
}
|
||||
|
||||
void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo,
|
||||
void UnigramDictionary::getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo,
|
||||
const int *xcoordinates, const int *ycoordinates, const int *codes,
|
||||
const bool useFullEditDistance, const int inputLength,
|
||||
Correction *correction, WordsPriorityQueuePool* queuePool,
|
||||
|
|
|
@ -101,7 +101,7 @@ class UnigramDictionary {
|
|||
const bool useFullEditDistance, const int inputLength, Correction *correction,
|
||||
WordsPriorityQueuePool* queuePool, const bool doAutoCompletion, const int maxErrors,
|
||||
const int currentWordIndex);
|
||||
void getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo,
|
||||
void getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo,
|
||||
const int *xcoordinates, const int *ycoordinates, const int *codes,
|
||||
const bool useFullEditDistance, const int inputLength,
|
||||
Correction *correction, WordsPriorityQueuePool* queuePool,
|
||||
|
|
Loading…
Reference in a new issue