Merge "Merge missing space and mistyped space correction algorithm"
This commit is contained in:
commit
23f486f770
5 changed files with 93 additions and 124 deletions
|
@ -158,10 +158,10 @@ void Correction::checkState() {
|
|||
}
|
||||
}
|
||||
|
||||
// NOTE(review): two versions of Correction::getFreqForSplitTwoWords are interleaved in this hunk
// without +/- markers. The (firstFreq, secondFreq, word) form directly below has no closing brace
// of its own, so it is presumably the version being replaced -- confirm against the real diff.
int Correction::getFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
|
||||
const unsigned short *word) {
|
||||
return Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
|
||||
firstFreq, secondFreq, this, word);
|
||||
// Array-based form: forwards the frequency array, the word-length array, this Correction and the
// caller-supplied isSpaceProximity flag to RankingAlgorithm::calcFreqForSplitTwoWords and returns
// its result unchanged.
int Correction::getFreqForSplitTwoWords(const int *freqArray, const int *wordLengthArray,
|
||||
const bool isSpaceProximity, const unsigned short *word) {
|
||||
return Correction::RankingAlgorithm::calcFreqForSplitTwoWords(freqArray, wordLengthArray, this,
|
||||
isSpaceProximity, word);
|
||||
}
|
||||
|
||||
int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLength) {
|
||||
|
@ -806,21 +806,12 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
|
|||
|
||||
/* static */
|
||||
int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
|
||||
const int firstFreq, const int secondFreq, const Correction* correction,
|
||||
const unsigned short *word) {
|
||||
const int spaceProximityPos = correction->mSpaceProximityPos;
|
||||
const int missingSpacePos = correction->mMissingSpacePos;
|
||||
if (DEBUG_DICT) {
|
||||
int inputCount = 0;
|
||||
if (spaceProximityPos >= 0) ++inputCount;
|
||||
if (missingSpacePos >= 0) ++inputCount;
|
||||
assert(inputCount <= 1);
|
||||
}
|
||||
const bool isSpaceProximity = spaceProximityPos >= 0;
|
||||
const int inputLength = correction->mInputLength;
|
||||
const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
|
||||
const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1)
|
||||
: (inputLength - missingSpacePos);
|
||||
const int *freqArray, const int *wordLengthArray, const Correction* correction,
|
||||
const bool isSpaceProximity, const unsigned short *word) {
|
||||
const int firstFreq = freqArray[0];
|
||||
const int secondFreq = freqArray[1];
|
||||
const int firstWordLength = wordLengthArray[0];
|
||||
const int secondWordLength = wordLengthArray[1];
|
||||
const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
|
||||
|
||||
bool firstCapitalizedWordDemotion = false;
|
||||
|
|
|
@ -122,7 +122,8 @@ class Correction {
|
|||
bool needsToPrune() const;
|
||||
|
||||
int getFreqForSplitTwoWords(
|
||||
const int firstFreq, const int secondFreq, const unsigned short *word);
|
||||
const int *freqArray, const int *wordLengthArray, const bool isSpaceProximity,
|
||||
const unsigned short *word);
|
||||
int getFinalFreq(const int freq, unsigned short **word, int* wordLength);
|
||||
int getFinalFreqForSubQueue(const int freq, unsigned short **word, int* wordLength,
|
||||
const int inputLength);
|
||||
|
@ -150,8 +151,9 @@ class Correction {
|
|||
static int calculateFinalFreq(const int inputIndex, const int depth,
|
||||
const int freq, int *editDistanceTable, const Correction* correction,
|
||||
const int inputLength);
|
||||
static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
|
||||
const Correction* correction, const unsigned short *word);
|
||||
static int calcFreqForSplitTwoWords(const int *freqArray, const int *wordLengthArray,
|
||||
const Correction* correction, const bool isSpaceProximity,
|
||||
const unsigned short *word);
|
||||
static double calcNormalizedScore(const unsigned short* before, const int beforeLength,
|
||||
const unsigned short* after, const int afterLength, const int score);
|
||||
static int editDistance(const unsigned short* before,
|
||||
|
|
|
@ -180,10 +180,9 @@ static void prof_out(void) {
|
|||
#define CALIBRATE_SCORE_BY_TOUCH_COORDINATES true
|
||||
|
||||
#define SUGGEST_WORDS_WITH_MISSING_CHARACTER true
|
||||
#define SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER true
|
||||
#define SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER true
|
||||
#define SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS true
|
||||
#define SUGGEST_WORDS_WITH_SPACE_PROXIMITY true
|
||||
#define SUGGEST_MULTIPLE_WORDS true
|
||||
|
||||
// The following "rate"s are used as a multiplier before dividing by 100, so they are in percent.
|
||||
#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 80
|
||||
|
@ -233,7 +232,7 @@ static void prof_out(void) {
|
|||
|
||||
// Minimum suggest depth for one word for all cases except for missing space suggestions.
|
||||
#define MIN_SUGGEST_DEPTH 1
|
||||
#define MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION 3
|
||||
#define MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION 3
|
||||
#define MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION 3
|
||||
|
||||
#define min(a,b) ((a)<(b)?(a):(b))
|
||||
|
|
|
@ -211,7 +211,6 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
|
|||
PROF_END(3);
|
||||
|
||||
PROF_START(4);
|
||||
// Note: This line is intentionally left blank
|
||||
bool hasAutoCorrectionCandidate = false;
|
||||
WordsPriorityQueue* masterQueue = queuePool->getMasterQueue();
|
||||
if (masterQueue->size() > 0) {
|
||||
|
@ -222,14 +221,14 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
|
|||
PROF_END(4);
|
||||
|
||||
PROF_START(5);
|
||||
// Suggestions with missing space
|
||||
if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER
|
||||
&& inputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) {
|
||||
// Multiple word suggestions
|
||||
if (SUGGEST_MULTIPLE_WORDS
|
||||
&& inputLength >= MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION) {
|
||||
for (int i = 1; i < inputLength; ++i) {
|
||||
if (DEBUG_DICT) {
|
||||
AKLOGI("--- Suggest missing space characters %d", i);
|
||||
AKLOGI("--- Suggest multiple words %d", i);
|
||||
}
|
||||
getMissingSpaceWords(proximityInfo, xcoordinates, ycoordinates, codes,
|
||||
getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
|
||||
useFullEditDistance, inputLength, i, correction, queuePool,
|
||||
hasAutoCorrectionCandidate);
|
||||
}
|
||||
|
@ -237,26 +236,9 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
|
|||
PROF_END(5);
|
||||
|
||||
PROF_START(6);
|
||||
if (SUGGEST_WORDS_WITH_SPACE_PROXIMITY && proximityInfo) {
|
||||
// The first and last "mistyped spaces" are taken care of by excessive character handling
|
||||
for (int i = 1; i < inputLength - 1; ++i) {
|
||||
if (DEBUG_DICT) {
|
||||
AKLOGI("--- Suggest words with proximity space %d", i);
|
||||
}
|
||||
const int x = xcoordinates[i];
|
||||
const int y = ycoordinates[i];
|
||||
if (DEBUG_PROXIMITY_INFO) {
|
||||
AKLOGI("Input[%d] x = %d, y = %d, has space proximity = %d",
|
||||
i, x, y, proximityInfo->hasSpaceProximity(x, y));
|
||||
}
|
||||
if (proximityInfo->hasSpaceProximity(x, y)) {
|
||||
getMistypedSpaceWords(proximityInfo, xcoordinates, ycoordinates, codes,
|
||||
useFullEditDistance, inputLength, i, correction, queuePool,
|
||||
hasAutoCorrectionCandidate);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Note: This line is intentionally left blank
|
||||
PROF_END(6);
|
||||
|
||||
if (DEBUG_DICT) {
|
||||
queuePool->dumpSubQueue1TopSuggestions();
|
||||
for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
|
||||
|
@ -337,24 +319,6 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance,
|
|||
}
|
||||
}
|
||||
|
||||
// Thin wrapper: forwards every argument to getSplitTwoWordsSuggestions, supplying
// missingSpacePos as the split position and -1 in the spaceProximityPos slot.
// NOTE(review): the enclosing hunk shrinks from 24 lines to 6, so this wrapper is presumably
// deleted by the commit -- confirm against the real diff.
void UnigramDictionary::getMissingSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates,
|
||||
const int *ycoordinates, const int *codes, const bool useFullEditDistance,
|
||||
const int inputLength, const int missingSpacePos, Correction *correction,
|
||||
WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate) {
|
||||
getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
|
||||
useFullEditDistance, inputLength, missingSpacePos, -1/* spaceProximityPos */,
|
||||
correction, queuePool, hasAutoCorrectionCandidate);
|
||||
}
|
||||
|
||||
// Thin wrapper: forwards every argument to getSplitTwoWordsSuggestions, supplying -1 in the
// missingSpacePos slot and spaceProximityPos as the position of the suspected mistyped space.
// NOTE(review): the enclosing hunk shrinks from 24 lines to 6, so this wrapper is presumably
// deleted by the commit -- confirm against the real diff.
void UnigramDictionary::getMistypedSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates,
|
||||
const int *ycoordinates, const int *codes, const bool useFullEditDistance,
|
||||
const int inputLength, const int spaceProximityPos, Correction *correction,
|
||||
WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate) {
|
||||
getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
|
||||
useFullEditDistance, inputLength, -1 /* missingSpacePos */, spaceProximityPos,
|
||||
correction, queuePool, hasAutoCorrectionCandidate);
|
||||
}
|
||||
|
||||
inline void UnigramDictionary::onTerminal(const int freq,
|
||||
const TerminalAttributes& terminalAttributes, Correction *correction,
|
||||
WordsPriorityQueuePool *queuePool, const bool addToMasterQueue,
|
||||
|
@ -405,15 +369,23 @@ inline void UnigramDictionary::onTerminal(const int freq,
|
|||
}
|
||||
}
|
||||
|
||||
int UnigramDictionary::getSubStringSuggestion(
|
||||
bool UnigramDictionary::getSubStringSuggestion(
|
||||
ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates,
|
||||
const int *codes, const bool useFullEditDistance, Correction *correction,
|
||||
WordsPriorityQueuePool* queuePool, const int inputLength,
|
||||
const bool hasAutoCorrectionCandidate, const int currentWordIndex,
|
||||
const int inputWordStartPos, const int inputWordLength,
|
||||
const int outputWordStartPos, unsigned short* outputWord, int *outputWordLength) {
|
||||
const int outputWordStartPos, const bool isSpaceProximity, int *freqArray,
|
||||
int*wordLengthArray, unsigned short* outputWord, int *outputWordLength) {
|
||||
if (DEBUG_DICT) {
|
||||
assert(currentWordIndex >= 1);
|
||||
}
|
||||
unsigned short* tempOutputWord = 0;
|
||||
int tempOutputWordLength = 0;
|
||||
// TODO: Optimize init suggestion
|
||||
initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
|
||||
inputLength, correction);
|
||||
|
||||
int freq = getMostFrequentWordLike(
|
||||
inputWordStartPos, inputWordLength, proximityInfo, mWord);
|
||||
if (freq > 0) {
|
||||
|
@ -438,7 +410,7 @@ int UnigramDictionary::getSubStringSuggestion(
|
|||
}
|
||||
WordsPriorityQueue* queue = queuePool->getSubQueue(currentWordIndex, inputWordLength);
|
||||
if (!queue || queue->size() < 1) {
|
||||
return 0;
|
||||
return false;
|
||||
}
|
||||
int score = 0;
|
||||
const double ns = queue->getHighestNormalizedScore(
|
||||
|
@ -451,93 +423,105 @@ int UnigramDictionary::getSubStringSuggestion(
|
|||
// threshold.
|
||||
if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD
|
||||
|| tempOutputWordLength < SUB_QUEUE_MIN_WORD_LENGTH) {
|
||||
return 0;
|
||||
return false;
|
||||
}
|
||||
freq = score >> (tempOutputWordLength
|
||||
+ TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER);
|
||||
}
|
||||
if (DEBUG_DICT) {
|
||||
AKLOGI("Freq(%d): %d", currentWordIndex, freq);
|
||||
AKLOGI("Freq(%d): %d, length: %d, input length: %d, input start: %d"
|
||||
, currentWordIndex, freq, tempOutputWordLength, inputWordLength, inputWordStartPos);
|
||||
}
|
||||
if (freq <= 0 || tempOutputWordLength <= 0
|
||||
|| MAX_WORD_LENGTH <= (outputWordStartPos + tempOutputWordLength)) {
|
||||
return 0;
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < tempOutputWordLength; ++i) {
|
||||
outputWord[outputWordStartPos + i] = tempOutputWord[i];
|
||||
}
|
||||
|
||||
// Put output values
|
||||
freqArray[currentWordIndex - 1] = freq;
|
||||
// TODO: put output length instead of input length
|
||||
wordLengthArray[currentWordIndex - 1] = inputWordLength;
|
||||
*outputWordLength = outputWordStartPos + tempOutputWordLength;
|
||||
|
||||
if ((inputWordStartPos + inputWordLength) < inputLength) {
|
||||
if (outputWordStartPos + tempOutputWordLength >= MAX_WORD_LENGTH) {
|
||||
return 0;
|
||||
return false;
|
||||
}
|
||||
outputWord[outputWordStartPos + tempOutputWordLength] = SPACE;
|
||||
++tempOutputWordLength;
|
||||
++*outputWordLength;
|
||||
} else if (currentWordIndex >= 2) {
|
||||
// TODO: Handle 3 or more words
|
||||
const int pairFreq = correction->getFreqForSplitTwoWords(
|
||||
freqArray, wordLengthArray, isSpaceProximity, outputWord);
|
||||
if (DEBUG_DICT) {
|
||||
AKLOGI("Split two words: %d, %d, %d, %d", freqArray[0], freqArray[1], pairFreq,
|
||||
inputLength);
|
||||
}
|
||||
*outputWordLength = outputWordStartPos + tempOutputWordLength;
|
||||
return freq;
|
||||
addWord(outputWord, *outputWordLength, pairFreq, queuePool->getMasterQueue());
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Tries to explain the typed input as two words split at one index and offers the pair as a
// suggestion.
// NOTE(review): the previous and the new body of this function are interleaved below without
// +/- markers. Lines that use missingSpacePos / spaceProximityPos / firstFreq / secondFreq appear
// to belong to the previous version; lines that use wordDivideIndex / freqArray /
// wordLengthArray appear to belong to the new one -- confirm against the real diff.
void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo,
|
||||
const int *xcoordinates, const int *ycoordinates, const int *codes,
|
||||
const bool useFullEditDistance, const int inputLength, const int missingSpacePos,
|
||||
const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool,
|
||||
const bool useFullEditDistance, const int inputLength, const int wordDivideIndex,
|
||||
Correction *correction, WordsPriorityQueuePool* queuePool,
|
||||
const bool hasAutoCorrectionCandidate) {
|
||||
// Nothing is suggested once inputLength reaches MAX_WORD_LENGTH.
if (inputLength >= MAX_WORD_LENGTH) return;
|
||||
if (DEBUG_DICT) {
|
||||
int inputCount = 0;
|
||||
if (spaceProximityPos >= 0) ++inputCount;
|
||||
if (missingSpacePos >= 0) ++inputCount;
|
||||
assert(inputCount <= 1);
|
||||
// MAX_PROXIMITY_CHARS_SIZE in ProximityInfo.java should be 16
|
||||
assert(MAX_PROXIMITY_CHARS == 16);
|
||||
}
|
||||
|
||||
initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
|
||||
inputLength, correction);
|
||||
|
||||
// Allocating fixed length array on stack
|
||||
unsigned short outputWord[MAX_WORD_LENGTH];
|
||||
// Per-word outputs: getSubStringSuggestion stores each word's frequency and length at
// index (currentWordIndex - 1) of these arrays.
int freqArray[SUB_QUEUE_MAX_WORD_INDEX];
|
||||
int wordLengthArray[SUB_QUEUE_MAX_WORD_INDEX];
|
||||
int outputWordLength = 0;
|
||||
|
||||
WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
|
||||
const bool isSpaceProximity = spaceProximityPos >= 0;
|
||||
|
||||
// First word
|
||||
int inputWordStartPos = 0;
|
||||
int inputWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
|
||||
const int firstFreq = getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
|
||||
// The first word covers the input from index 0 up to (not including) wordDivideIndex; the
// function gives up when getSubStringSuggestion reports failure for it.
int inputWordLength = wordDivideIndex;
|
||||
if (!getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
|
||||
useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
|
||||
FIRST_WORD_INDEX, inputWordStartPos, inputWordLength, 0, outputWord, &outputWordLength);
|
||||
if (firstFreq <= 0) {
|
||||
FIRST_WORD_INDEX, inputWordStartPos, inputWordLength, 0, true /* not used */,
|
||||
freqArray, wordLengthArray, outputWord, &outputWordLength)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Remember the output length after the first word so that both second-word attempts below
// start writing at the same offset in outputWord.
const int tempOutputWordLength = outputWordLength;
|
||||
// Second word
|
||||
inputWordStartPos = isSpaceProximity ? (spaceProximityPos + 1) : missingSpacePos;
|
||||
inputWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1)
|
||||
: (inputLength - missingSpacePos);
|
||||
const int secondFreq = getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
|
||||
// Missing space
|
||||
// The second word starts exactly at wordDivideIndex, so no typed character is dropped.
inputWordStartPos = wordDivideIndex;
|
||||
inputWordLength = inputLength - wordDivideIndex;
|
||||
getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
|
||||
useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
|
||||
SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, outputWordLength, outputWord,
|
||||
&outputWordLength);
|
||||
if (secondFreq <= 0) {
|
||||
SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, tempOutputWordLength,
|
||||
false /* missing space */, freqArray, wordLengthArray, outputWord, &outputWordLength);
|
||||
|
||||
// Mistyped space
|
||||
// Skip the character at the divide index (it is treated as a space typed by mistake), which
// shortens the second word by one.
++inputWordStartPos;
|
||||
--inputWordLength;
|
||||
|
||||
// Nothing would remain for the second word.
if (inputWordLength <= 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO: Remove initSuggestions and correction->setCorrectionParams
|
||||
initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction);
|
||||
|
||||
correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */,
|
||||
-1 /* transposedPos */, spaceProximityPos, missingSpacePos,
|
||||
useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS);
|
||||
const int pairFreq = correction->getFreqForSplitTwoWords(firstFreq, secondFreq, outputWord);
|
||||
if (DEBUG_DICT) {
|
||||
AKLOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength);
|
||||
}
|
||||
addWord(outputWord, outputWordLength, pairFreq, masterQueue);
|
||||
// Touch coordinates of the skipped character (index inputWordStartPos - 1 after the increment
// above); the mistyped-space split is attempted only when that point has space proximity.
// NOTE(review): proximityInfo is dereferenced here without a null check, whereas the loop
// removed from getWordSuggestions tested `proximityInfo` before calling hasSpaceProximity --
// confirm it cannot be null on this path.
const int x = xcoordinates[inputWordStartPos - 1];
|
||||
const int y = ycoordinates[inputWordStartPos - 1];
|
||||
if (!proximityInfo->hasSpaceProximity(x, y)) {
|
||||
return;
|
||||
}
|
||||
|
||||
getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
|
||||
useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
|
||||
SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, tempOutputWordLength,
|
||||
true /* mistyped space */, freqArray, wordLengthArray, outputWord, &outputWordLength);
|
||||
}
|
||||
|
||||
// Wrapper for getMostFrequentWordLikeInner, which matches it to the previous
|
||||
// interface.
|
||||
inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
|
||||
|
|
|
@ -103,17 +103,9 @@ class UnigramDictionary {
|
|||
const int currentWordIndex);
|
||||
void getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo,
|
||||
const int *xcoordinates, const int *ycoordinates, const int *codes,
|
||||
const bool useFullEditDistance, const int inputLength, const int spaceProximityPos,
|
||||
const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool,
|
||||
const bool useFullEditDistance, const int inputLength, const int wordDivideIndex,
|
||||
Correction *correction, WordsPriorityQueuePool* queuePool,
|
||||
const bool hasAutoCorrectionCandidate);
|
||||
void getMissingSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates,
|
||||
const int *ycoordinates, const int *codes, const bool useFullEditDistance,
|
||||
const int inputLength, const int missingSpacePos, Correction *correction,
|
||||
WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate);
|
||||
void getMistypedSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates,
|
||||
const int *ycoordinates, const int *codes, const bool useFullEditDistance,
|
||||
const int inputLength, const int spaceProximityPos, Correction *correction,
|
||||
WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate);
|
||||
void onTerminal(const int freq, const TerminalAttributes& terminalAttributes,
|
||||
Correction *correction, WordsPriorityQueuePool *queuePool, const bool addToMasterQueue,
|
||||
const int currentWordIndex);
|
||||
|
@ -127,13 +119,14 @@ class UnigramDictionary {
|
|||
ProximityInfo *proximityInfo, unsigned short *word);
|
||||
int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length,
|
||||
short unsigned int *outWord);
|
||||
int getSubStringSuggestion(
|
||||
bool getSubStringSuggestion(
|
||||
ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates,
|
||||
const int *codes, const bool useFullEditDistance, Correction *correction,
|
||||
WordsPriorityQueuePool* queuePool, const int inputLength,
|
||||
const bool hasAutoCorrectionCandidate, const int currentWordIndex,
|
||||
const int inputWordStartPos, const int inputWordLength,
|
||||
const int outputWordStartPos, unsigned short* outputWord, int *outputWordLength);
|
||||
const int outputWordStartPos, const bool isSpaceProximity, int *freqArray,
|
||||
int *wordLengthArray, unsigned short* outputWord, int *outputWordLength);
|
||||
|
||||
const uint8_t* const DICT_ROOT;
|
||||
const int MAX_WORD_LENGTH;
|
||||
|
|
Loading…
Reference in a new issue