Merge "Do other error correction for the second word of two word correction"
This commit is contained in:
commit
61b31a646e
7 changed files with 181 additions and 101 deletions
|
@ -269,7 +269,7 @@ bool Correction::needsToPrune() const {
|
||||||
// TODO: use edit distance here
|
// TODO: use edit distance here
|
||||||
return mOutputIndex - 1 >= mMaxDepth || mProximityCount > mMaxEditDistance
|
return mOutputIndex - 1 >= mMaxDepth || mProximityCount > mMaxEditDistance
|
||||||
// Allow one char longer word for missing character
|
// Allow one char longer word for missing character
|
||||||
|| (!mDoAutoCompletion && (mOutputIndex + 1 >= mInputLength));
|
|| (!mDoAutoCompletion && (mOutputIndex > mInputLength));
|
||||||
}
|
}
|
||||||
|
|
||||||
void Correction::addCharToCurrentWord(const int32_t c) {
|
void Correction::addCharToCurrentWord(const int32_t c) {
|
||||||
|
@ -555,55 +555,6 @@ Correction::CorrectionType Correction::processCharAndCalcState(
|
||||||
Correction::~Correction() {
|
Correction::~Correction() {
|
||||||
}
|
}
|
||||||
|
|
||||||
/////////////////////////
|
|
||||||
// static inline utils //
|
|
||||||
/////////////////////////
|
|
||||||
|
|
||||||
static const int TWO_31ST_DIV_255 = S_INT_MAX / 255;
|
|
||||||
static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {
|
|
||||||
return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);
|
|
||||||
}
|
|
||||||
|
|
||||||
static const int TWO_31ST_DIV_2 = S_INT_MAX / 2;
|
|
||||||
inline static void multiplyIntCapped(const int multiplier, int *base) {
|
|
||||||
const int temp = *base;
|
|
||||||
if (temp != S_INT_MAX) {
|
|
||||||
// Branch if multiplier == 2 for the optimization
|
|
||||||
if (multiplier == 2) {
|
|
||||||
*base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX;
|
|
||||||
} else {
|
|
||||||
// TODO: This overflow check gives a wrong answer when, for example,
|
|
||||||
// temp = 2^16 + 1 and multiplier = 2^17 + 1.
|
|
||||||
// Fix this behavior.
|
|
||||||
const int tempRetval = temp * multiplier;
|
|
||||||
*base = tempRetval >= temp ? tempRetval : S_INT_MAX;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static int powerIntCapped(const int base, const int n) {
|
|
||||||
if (n <= 0) return 1;
|
|
||||||
if (base == 2) {
|
|
||||||
return n < 31 ? 1 << n : S_INT_MAX;
|
|
||||||
} else {
|
|
||||||
int ret = base;
|
|
||||||
for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static void multiplyRate(const int rate, int *freq) {
|
|
||||||
if (*freq != S_INT_MAX) {
|
|
||||||
if (*freq > 1000000) {
|
|
||||||
*freq /= 100;
|
|
||||||
multiplyIntCapped(rate, freq);
|
|
||||||
} else {
|
|
||||||
multiplyIntCapped(rate, freq);
|
|
||||||
*freq /= 100;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static int getQuoteCount(const unsigned short* word, const int length) {
|
inline static int getQuoteCount(const unsigned short* word, const int length) {
|
||||||
int quoteCount = 0;
|
int quoteCount = 0;
|
||||||
for (int i = 0; i < length; ++i) {
|
for (int i = 0; i < length; ++i) {
|
||||||
|
@ -939,7 +890,11 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
|
||||||
multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);
|
multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);
|
||||||
}
|
}
|
||||||
|
|
||||||
multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);
|
if (isSpaceProximity) {
|
||||||
|
multiplyRate(WORDS_WITH_MISTYPED_SPACE_DEMOTION_RATE, &totalFreq);
|
||||||
|
} else {
|
||||||
|
multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);
|
||||||
|
}
|
||||||
|
|
||||||
if (capitalizedWordDemotion) {
|
if (capitalizedWordDemotion) {
|
||||||
multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq);
|
multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq);
|
||||||
|
|
|
@ -36,6 +36,55 @@ class Correction {
|
||||||
NOT_ON_TERMINAL
|
NOT_ON_TERMINAL
|
||||||
} CorrectionType;
|
} CorrectionType;
|
||||||
|
|
||||||
|
/////////////////////////
|
||||||
|
// static inline utils //
|
||||||
|
/////////////////////////
|
||||||
|
|
||||||
|
// Threshold at or above which multiplying by 255 is replaced by S_INT_MAX.
static const int TWO_31ST_DIV_255 = S_INT_MAX / 255;

// Returns 255 * num, capped: any num that is TWO_31ST_DIV_255 or larger yields S_INT_MAX.
static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {
    if (num >= TWO_31ST_DIV_255) {
        return S_INT_MAX;
    }
    return 255 * num;
}
|
||||||
|
|
||||||
|
// Largest value that can be doubled without exceeding S_INT_MAX.
static const int TWO_31ST_DIV_2 = S_INT_MAX / 2;

// Multiplies *base by multiplier in place, saturating at S_INT_MAX.
//
// - A value that is already S_INT_MAX is left untouched.
// - multiplier == 2 keeps its dedicated shift-based fast path.
// - For any other multiplier the product is computed in a wider type, so an
//   overflow is detected exactly instead of being guessed from a wrapped int
//   result (the previous check could miss overflows such as
//   (2^16 + 1) * (2^17 + 1) and relied on signed overflow, which is undefined).
// - As before, a product that ends up smaller than the original value (for
//   example with a zero or negative multiplier) is also reported as S_INT_MAX.
inline static void multiplyIntCapped(const int multiplier, int *base) {
    const int temp = *base;
    if (temp == S_INT_MAX) {
        // Already saturated; nothing to do.
        return;
    }
    // Branch if multiplier == 2 for the optimization
    if (multiplier == 2) {
        *base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX;
        return;
    }
    // long long is at least 64 bits wide, so the product of two ints cannot overflow here.
    const long long product = static_cast<long long>(temp) * multiplier;
    if (product >= temp && product <= S_INT_MAX) {
        *base = static_cast<int>(product);
    } else {
        *base = S_INT_MAX;
    }
}
|
||||||
|
|
||||||
|
inline static int powerIntCapped(const int base, const int n) {
|
||||||
|
if (n <= 0) return 1;
|
||||||
|
if (base == 2) {
|
||||||
|
return n < 31 ? 1 << n : S_INT_MAX;
|
||||||
|
} else {
|
||||||
|
int ret = base;
|
||||||
|
for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline static void multiplyRate(const int rate, int *freq) {
|
||||||
|
if (*freq != S_INT_MAX) {
|
||||||
|
if (*freq > 1000000) {
|
||||||
|
*freq /= 100;
|
||||||
|
multiplyIntCapped(rate, freq);
|
||||||
|
} else {
|
||||||
|
multiplyIntCapped(rate, freq);
|
||||||
|
*freq /= 100;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Correction(const int typedLetterMultiplier, const int fullWordMultiplier);
|
Correction(const int typedLetterMultiplier, const int fullWordMultiplier);
|
||||||
void initCorrection(
|
void initCorrection(
|
||||||
const ProximityInfo *pi, const int inputLength, const int maxWordLength);
|
const ProximityInfo *pi, const int inputLength, const int maxWordLength);
|
||||||
|
|
|
@ -189,6 +189,7 @@ static void prof_out(void) {
|
||||||
#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 80
|
#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 80
|
||||||
#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X 12
|
#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X 12
|
||||||
#define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 58
|
#define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 58
|
||||||
|
#define WORDS_WITH_MISTYPED_SPACE_DEMOTION_RATE 50
|
||||||
#define WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE 75
|
#define WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE 75
|
||||||
#define WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE 75
|
#define WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE 75
|
||||||
#define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 70
|
#define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 70
|
||||||
|
@ -222,6 +223,9 @@ static void prof_out(void) {
|
||||||
|
|
||||||
#define MAX_DEPTH_MULTIPLIER 3
|
#define MAX_DEPTH_MULTIPLIER 3
|
||||||
|
|
||||||
|
#define FIRST_WORD_INDEX 1
|
||||||
|
#define SECOND_WORD_INDEX 2
|
||||||
|
|
||||||
// TODO: Reduce this constant if possible; check the maximum number of umlauts in the same German
|
// TODO: Reduce this constant if possible; check the maximum number of umlauts in the same German
|
||||||
// word in the dictionary
|
// word in the dictionary
|
||||||
#define DEFAULT_MAX_UMLAUT_SEARCH_DEPTH 5
|
#define DEFAULT_MAX_UMLAUT_SEARCH_DEPTH 5
|
||||||
|
|
|
@ -159,19 +159,26 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo,
|
||||||
}
|
}
|
||||||
|
|
||||||
PROF_START(20);
|
PROF_START(20);
|
||||||
|
if (DEBUG_DICT) {
|
||||||
|
double ns = queuePool->getMasterQueue()->getHighestNormalizedScore(
|
||||||
|
proximityInfo->getPrimaryInputWord(), codesSize, 0, 0, 0);
|
||||||
|
ns += 0;
|
||||||
|
AKLOGI("Max normalized score = %f", ns);
|
||||||
|
}
|
||||||
const int suggestedWordsCount =
|
const int suggestedWordsCount =
|
||||||
queuePool->getMasterQueue()->outputSuggestions(frequencies, outWords);
|
queuePool->getMasterQueue()->outputSuggestions(frequencies, outWords);
|
||||||
|
|
||||||
if (DEBUG_DICT) {
|
if (DEBUG_DICT) {
|
||||||
|
double ns = queuePool->getMasterQueue()->getHighestNormalizedScore(
|
||||||
|
proximityInfo->getPrimaryInputWord(), codesSize, 0, 0, 0);
|
||||||
|
ns += 0;
|
||||||
AKLOGI("Returning %d words", suggestedWordsCount);
|
AKLOGI("Returning %d words", suggestedWordsCount);
|
||||||
/// Print the returned words
|
/// Print the returned words
|
||||||
for (int j = 0; j < suggestedWordsCount; ++j) {
|
for (int j = 0; j < suggestedWordsCount; ++j) {
|
||||||
#ifdef FLAG_DBG
|
|
||||||
short unsigned int* w = outWords + j * MAX_WORD_LENGTH;
|
short unsigned int* w = outWords + j * MAX_WORD_LENGTH;
|
||||||
char s[MAX_WORD_LENGTH];
|
char s[MAX_WORD_LENGTH];
|
||||||
for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i];
|
for (int i = 0; i <= MAX_WORD_LENGTH; i++) s[i] = w[i];
|
||||||
AKLOGI("%s %i", s, frequencies[j]);
|
AKLOGI("%s %i", s, frequencies[j]);
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
PROF_END(20);
|
PROF_END(20);
|
||||||
|
@ -205,6 +212,13 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
|
||||||
|
|
||||||
PROF_START(4);
|
PROF_START(4);
|
||||||
// Note: This line is intentionally left blank
|
// Note: This line is intentionally left blank
|
||||||
|
bool hasAutoCorrectionCandidate = false;
|
||||||
|
WordsPriorityQueue* masterQueue = queuePool->getMasterQueue();
|
||||||
|
if (masterQueue->size() > 0) {
|
||||||
|
double nsForMaster = masterQueue->getHighestNormalizedScore(
|
||||||
|
proximityInfo->getPrimaryInputWord(), inputLength, 0, 0, 0);
|
||||||
|
hasAutoCorrectionCandidate = (nsForMaster > START_TWO_WORDS_CORRECTION_THRESHOLD);
|
||||||
|
}
|
||||||
PROF_END(4);
|
PROF_END(4);
|
||||||
|
|
||||||
PROF_START(5);
|
PROF_START(5);
|
||||||
|
@ -216,7 +230,8 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
|
||||||
AKLOGI("--- Suggest missing space characters %d", i);
|
AKLOGI("--- Suggest missing space characters %d", i);
|
||||||
}
|
}
|
||||||
getMissingSpaceWords(proximityInfo, xcoordinates, ycoordinates, codes,
|
getMissingSpaceWords(proximityInfo, xcoordinates, ycoordinates, codes,
|
||||||
useFullEditDistance, inputLength, i, correction, queuePool);
|
useFullEditDistance, inputLength, i, correction, queuePool,
|
||||||
|
hasAutoCorrectionCandidate);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
PROF_END(5);
|
PROF_END(5);
|
||||||
|
@ -236,7 +251,8 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
|
||||||
}
|
}
|
||||||
if (proximityInfo->hasSpaceProximity(x, y)) {
|
if (proximityInfo->hasSpaceProximity(x, y)) {
|
||||||
getMistypedSpaceWords(proximityInfo, xcoordinates, ycoordinates, codes,
|
getMistypedSpaceWords(proximityInfo, xcoordinates, ycoordinates, codes,
|
||||||
useFullEditDistance, inputLength, i, correction, queuePool);
|
useFullEditDistance, inputLength, i, correction, queuePool,
|
||||||
|
hasAutoCorrectionCandidate);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -281,12 +297,12 @@ void UnigramDictionary::getOneWordSuggestions(ProximityInfo *proximityInfo,
|
||||||
WordsPriorityQueuePool *queuePool) {
|
WordsPriorityQueuePool *queuePool) {
|
||||||
initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction);
|
initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction);
|
||||||
getSuggestionCandidates(useFullEditDistance, inputLength, correction, queuePool,
|
getSuggestionCandidates(useFullEditDistance, inputLength, correction, queuePool,
|
||||||
true /* doAutoCompletion */, DEFAULT_MAX_ERRORS);
|
true /* doAutoCompletion */, DEFAULT_MAX_ERRORS, FIRST_WORD_INDEX);
|
||||||
}
|
}
|
||||||
|
|
||||||
void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance,
|
void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance,
|
||||||
const int inputLength, Correction *correction, WordsPriorityQueuePool *queuePool,
|
const int inputLength, Correction *correction, WordsPriorityQueuePool *queuePool,
|
||||||
const bool doAutoCompletion, const int maxErrors) {
|
const bool doAutoCompletion, const int maxErrors, const int currentWordIndex) {
|
||||||
// TODO: Remove setCorrectionParams
|
// TODO: Remove setCorrectionParams
|
||||||
correction->setCorrectionParams(0, 0, 0,
|
correction->setCorrectionParams(0, 0, 0,
|
||||||
-1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance,
|
-1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance,
|
||||||
|
@ -305,7 +321,8 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance,
|
||||||
int firstChildPos;
|
int firstChildPos;
|
||||||
|
|
||||||
const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos,
|
const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos,
|
||||||
correction, &childCount, &firstChildPos, &siblingPos, queuePool);
|
correction, &childCount, &firstChildPos, &siblingPos, queuePool,
|
||||||
|
currentWordIndex);
|
||||||
// Update next sibling pos
|
// Update next sibling pos
|
||||||
correction->setTreeSiblingPos(outputIndex, siblingPos);
|
correction->setTreeSiblingPos(outputIndex, siblingPos);
|
||||||
|
|
||||||
|
@ -323,31 +340,32 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance,
|
||||||
void UnigramDictionary::getMissingSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates,
|
void UnigramDictionary::getMissingSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates,
|
||||||
const int *ycoordinates, const int *codes, const bool useFullEditDistance,
|
const int *ycoordinates, const int *codes, const bool useFullEditDistance,
|
||||||
const int inputLength, const int missingSpacePos, Correction *correction,
|
const int inputLength, const int missingSpacePos, Correction *correction,
|
||||||
WordsPriorityQueuePool* queuePool) {
|
WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate) {
|
||||||
getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
|
getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
|
||||||
useFullEditDistance, inputLength, missingSpacePos, -1/* spaceProximityPos */,
|
useFullEditDistance, inputLength, missingSpacePos, -1/* spaceProximityPos */,
|
||||||
correction, queuePool);
|
correction, queuePool, hasAutoCorrectionCandidate);
|
||||||
}
|
}
|
||||||
|
|
||||||
void UnigramDictionary::getMistypedSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates,
|
void UnigramDictionary::getMistypedSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates,
|
||||||
const int *ycoordinates, const int *codes, const bool useFullEditDistance,
|
const int *ycoordinates, const int *codes, const bool useFullEditDistance,
|
||||||
const int inputLength, const int spaceProximityPos, Correction *correction,
|
const int inputLength, const int spaceProximityPos, Correction *correction,
|
||||||
WordsPriorityQueuePool* queuePool) {
|
WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate) {
|
||||||
getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
|
getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
|
||||||
useFullEditDistance, inputLength, -1 /* missingSpacePos */, spaceProximityPos,
|
useFullEditDistance, inputLength, -1 /* missingSpacePos */, spaceProximityPos,
|
||||||
correction, queuePool);
|
correction, queuePool, hasAutoCorrectionCandidate);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void UnigramDictionary::onTerminal(const int freq,
|
inline void UnigramDictionary::onTerminal(const int freq,
|
||||||
const TerminalAttributes& terminalAttributes, Correction *correction,
|
const TerminalAttributes& terminalAttributes, Correction *correction,
|
||||||
WordsPriorityQueuePool *queuePool, const bool addToMasterQueue) {
|
WordsPriorityQueuePool *queuePool, const bool addToMasterQueue,
|
||||||
|
const int currentWordIndex) {
|
||||||
const int inputIndex = correction->getInputIndex();
|
const int inputIndex = correction->getInputIndex();
|
||||||
const bool addToSubQueue = inputIndex < SUB_QUEUE_MAX_COUNT;
|
const bool addToSubQueue = inputIndex < SUB_QUEUE_MAX_COUNT;
|
||||||
|
|
||||||
int wordLength;
|
int wordLength;
|
||||||
unsigned short* wordPointer;
|
unsigned short* wordPointer;
|
||||||
|
|
||||||
if (addToMasterQueue) {
|
if ((currentWordIndex == 1) && addToMasterQueue) {
|
||||||
WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
|
WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
|
||||||
const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength);
|
const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength);
|
||||||
if (finalFreq != NOT_A_FREQUENCY) {
|
if (finalFreq != NOT_A_FREQUENCY) {
|
||||||
|
@ -376,9 +394,14 @@ inline void UnigramDictionary::onTerminal(const int freq,
|
||||||
// We only allow two words + other error correction for words with SUB_QUEUE_MIN_WORD_LENGTH
|
// We only allow two words + other error correction for words with SUB_QUEUE_MIN_WORD_LENGTH
|
||||||
// or more length.
|
// or more length.
|
||||||
if (inputIndex >= SUB_QUEUE_MIN_WORD_LENGTH && addToSubQueue) {
|
if (inputIndex >= SUB_QUEUE_MIN_WORD_LENGTH && addToSubQueue) {
|
||||||
// TODO: Check the validity of "inputIndex == wordLength"
|
WordsPriorityQueue *subQueue;
|
||||||
//if (addToSubQueue && inputIndex == wordLength) {
|
if (currentWordIndex == 1) {
|
||||||
WordsPriorityQueue *subQueue = queuePool->getSubQueue1(inputIndex);
|
subQueue = queuePool->getSubQueue1(inputIndex);
|
||||||
|
} else if (currentWordIndex == 2) {
|
||||||
|
subQueue = queuePool->getSubQueue2(inputIndex);
|
||||||
|
} else {
|
||||||
|
return;
|
||||||
|
}
|
||||||
const int finalFreq = correction->getFinalFreqForSubQueue(freq, &wordPointer, &wordLength,
|
const int finalFreq = correction->getFinalFreqForSubQueue(freq, &wordPointer, &wordLength,
|
||||||
inputIndex);
|
inputIndex);
|
||||||
addWord(wordPointer, wordLength, finalFreq, subQueue);
|
addWord(wordPointer, wordLength, finalFreq, subQueue);
|
||||||
|
@ -388,17 +411,21 @@ inline void UnigramDictionary::onTerminal(const int freq,
|
||||||
void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo,
|
void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo,
|
||||||
const int *xcoordinates, const int *ycoordinates, const int *codes,
|
const int *xcoordinates, const int *ycoordinates, const int *codes,
|
||||||
const bool useFullEditDistance, const int inputLength, const int missingSpacePos,
|
const bool useFullEditDistance, const int inputLength, const int missingSpacePos,
|
||||||
const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool) {
|
const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool,
|
||||||
|
const bool hasAutoCorrectionCandidate) {
|
||||||
if (inputLength >= MAX_WORD_LENGTH) return;
|
if (inputLength >= MAX_WORD_LENGTH) return;
|
||||||
if (DEBUG_DICT) {
|
if (DEBUG_DICT) {
|
||||||
int inputCount = 0;
|
int inputCount = 0;
|
||||||
if (spaceProximityPos >= 0) ++inputCount;
|
if (spaceProximityPos >= 0) ++inputCount;
|
||||||
if (missingSpacePos >= 0) ++inputCount;
|
if (missingSpacePos >= 0) ++inputCount;
|
||||||
assert(inputCount <= 1);
|
assert(inputCount <= 1);
|
||||||
|
// MAX_PROXIMITY_CHARS_SIZE in ProximityInfo.java should be 16
|
||||||
|
assert(MAX_PROXIMITY_CHARS == 16);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
|
||||||
|
inputLength, correction);
|
||||||
WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
|
WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
|
||||||
|
|
||||||
const bool isSpaceProximity = spaceProximityPos >= 0;
|
const bool isSpaceProximity = spaceProximityPos >= 0;
|
||||||
|
|
||||||
// First word
|
// First word
|
||||||
|
@ -411,26 +438,22 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo
|
||||||
if (firstFreq > 0) {
|
if (firstFreq > 0) {
|
||||||
firstOutputWordLength = firstInputWordLength;
|
firstOutputWordLength = firstInputWordLength;
|
||||||
firstOutputWord = mWord;
|
firstOutputWord = mWord;
|
||||||
} else {
|
} else if (!hasAutoCorrectionCandidate) {
|
||||||
if (masterQueue->size() > 0) {
|
|
||||||
double nsForMaster = masterQueue->getHighestNormalizedScore(
|
|
||||||
proximityInfo->getPrimaryInputWord(), inputLength, 0, 0, 0);
|
|
||||||
if (nsForMaster > START_TWO_WORDS_CORRECTION_THRESHOLD) {
|
|
||||||
// Do nothing if the highest suggestion exceeds the threshold.
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
WordsPriorityQueue* firstWordQueue = queuePool->getSubQueue1(firstInputWordLength);
|
WordsPriorityQueue* firstWordQueue = queuePool->getSubQueue1(firstInputWordLength);
|
||||||
if (firstWordQueue->size() < 1) {
|
if (!firstWordQueue || firstWordQueue->size() < 1) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
int score = 0;
|
int score = 0;
|
||||||
const double ns = firstWordQueue->getHighestNormalizedScore(
|
const double ns = firstWordQueue->getHighestNormalizedScore(
|
||||||
proximityInfo->getPrimaryInputWord(), firstInputWordLength,
|
proximityInfo->getPrimaryInputWord(), firstInputWordLength,
|
||||||
&firstOutputWord, &score, &firstOutputWordLength);
|
&firstOutputWord, &score, &firstOutputWordLength);
|
||||||
|
if (DEBUG_DICT) {
|
||||||
|
AKLOGI("NS1 = %f, Score = %d", ns, score);
|
||||||
|
}
|
||||||
// Two words correction won't be done if the score of the first word doesn't exceed the
|
// Two words correction won't be done if the score of the first word doesn't exceed the
|
||||||
// threshold.
|
// threshold.
|
||||||
if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD) {
|
if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD
|
||||||
|
|| firstOutputWordLength < SUB_QUEUE_MIN_WORD_LENGTH) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
firstFreq = score >> (firstOutputWordLength
|
firstFreq = score >> (firstOutputWordLength
|
||||||
|
@ -456,14 +479,6 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo
|
||||||
outputWord[firstOutputWordLength] = SPACE;
|
outputWord[firstOutputWordLength] = SPACE;
|
||||||
outputWordLength = firstOutputWordLength + 1;
|
outputWordLength = firstOutputWordLength + 1;
|
||||||
|
|
||||||
//const int outputWordLength = firstOutputWordLength + secondWordLength + 1;
|
|
||||||
// Space proximity preparation
|
|
||||||
//WordsPriorityQueue *subQueue = queuePool->getSubQueue1();
|
|
||||||
//initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, firstOutputWordLength,
|
|
||||||
//subQueue, correction);
|
|
||||||
//getSuggestionCandidates(useFullEditDistance, firstOutputWordLength, correction, subQueue,
|
|
||||||
//false, MAX_ERRORS_FOR_TWO_WORDS);
|
|
||||||
|
|
||||||
// Second word
|
// Second word
|
||||||
const int secondInputWordLength = isSpaceProximity
|
const int secondInputWordLength = isSpaceProximity
|
||||||
? (inputLength - spaceProximityPos - 1)
|
? (inputLength - spaceProximityPos - 1)
|
||||||
|
@ -478,9 +493,42 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo
|
||||||
if (secondFreq > 0) {
|
if (secondFreq > 0) {
|
||||||
secondOutputWordLength = secondInputWordLength;
|
secondOutputWordLength = secondInputWordLength;
|
||||||
secondOutputWord = mWord;
|
secondOutputWord = mWord;
|
||||||
|
} else if (!hasAutoCorrectionCandidate) {
|
||||||
|
const int offset = secondInputWordStartPos;
|
||||||
|
initSuggestions(proximityInfo, &xcoordinates[offset], &ycoordinates[offset],
|
||||||
|
codes + offset * MAX_PROXIMITY_CHARS, secondInputWordLength, correction);
|
||||||
|
queuePool->clearSubQueue2();
|
||||||
|
getSuggestionCandidates(useFullEditDistance, secondInputWordLength, correction,
|
||||||
|
queuePool, false, MAX_ERRORS_FOR_TWO_WORDS, SECOND_WORD_INDEX);
|
||||||
|
if (DEBUG_DICT) {
|
||||||
|
AKLOGI("Dump second word candidates %d", secondInputWordLength);
|
||||||
|
for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
|
||||||
|
queuePool->getSubQueue2(i)->dumpTopWord();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
WordsPriorityQueue* secondWordQueue = queuePool->getSubQueue2(secondInputWordLength);
|
||||||
|
if (!secondWordQueue || secondWordQueue->size() < 1) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
int score = 0;
|
||||||
|
const double ns = secondWordQueue->getHighestNormalizedScore(
|
||||||
|
proximityInfo->getPrimaryInputWord(), secondInputWordLength,
|
||||||
|
&secondOutputWord, &score, &secondOutputWordLength);
|
||||||
|
if (DEBUG_DICT) {
|
||||||
|
AKLOGI("NS2 = %f, Score = %d", ns, score);
|
||||||
|
}
|
||||||
|
// Two words correction won't be done if the score of the second word doesn't exceed the
|
||||||
|
// threshold.
|
||||||
|
if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD
|
||||||
|
|| secondOutputWordLength < SUB_QUEUE_MIN_WORD_LENGTH) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
secondFreq = score >> (secondOutputWordLength
|
||||||
|
+ TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (DEBUG_DICT) {
|
if (DEBUG_DICT) {
|
||||||
|
DUMP_WORD(secondOutputWord, secondOutputWordLength);
|
||||||
AKLOGI("Second freq: %d", secondFreq);
|
AKLOGI("Second freq: %d", secondFreq);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -742,7 +790,8 @@ int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offs
|
||||||
// given level, as output into newCount when traversing this level's parent.
|
// given level, as output into newCount when traversing this level's parent.
|
||||||
inline bool UnigramDictionary::processCurrentNode(const int initialPos,
|
inline bool UnigramDictionary::processCurrentNode(const int initialPos,
|
||||||
Correction *correction, int *newCount,
|
Correction *correction, int *newCount,
|
||||||
int *newChildrenPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool) {
|
int *newChildrenPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool,
|
||||||
|
const int currentWordIndex) {
|
||||||
if (DEBUG_DICT) {
|
if (DEBUG_DICT) {
|
||||||
correction->checkState();
|
correction->checkState();
|
||||||
}
|
}
|
||||||
|
@ -823,7 +872,8 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos,
|
||||||
const int childrenAddressPos = BinaryFormat::skipFrequency(flags, pos);
|
const int childrenAddressPos = BinaryFormat::skipFrequency(flags, pos);
|
||||||
const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos);
|
const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos);
|
||||||
TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos);
|
TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos);
|
||||||
onTerminal(freq, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal);
|
onTerminal(freq, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal,
|
||||||
|
currentWordIndex);
|
||||||
|
|
||||||
// If there are more chars in this node, then this virtual node has children.
|
// If there are more chars in this node, then this virtual node has children.
|
||||||
// If we are on the last char, this virtual node has children if this node has.
|
// If we are on the last char, this virtual node has children if this node has.
|
||||||
|
|
|
@ -99,11 +99,13 @@ class UnigramDictionary {
|
||||||
const int inputLength, Correction *correction, WordsPriorityQueuePool* queuePool);
|
const int inputLength, Correction *correction, WordsPriorityQueuePool* queuePool);
|
||||||
void getSuggestionCandidates(
|
void getSuggestionCandidates(
|
||||||
const bool useFullEditDistance, const int inputLength, Correction *correction,
|
const bool useFullEditDistance, const int inputLength, Correction *correction,
|
||||||
WordsPriorityQueuePool* queuePool, const bool doAutoCompletion, const int maxErrors);
|
WordsPriorityQueuePool* queuePool, const bool doAutoCompletion, const int maxErrors,
|
||||||
|
const int currentWordIndex);
|
||||||
void getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo,
|
void getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo,
|
||||||
const int *xcoordinates, const int *ycoordinates, const int *codes,
|
const int *xcoordinates, const int *ycoordinates, const int *codes,
|
||||||
const bool useFullEditDistance, const int inputLength, const int spaceProximityPos,
|
const bool useFullEditDistance, const int inputLength, const int spaceProximityPos,
|
||||||
const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool);
|
const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool,
|
||||||
|
const bool hasAutoCorrectionCandidate);
|
||||||
void getSplitTwoWordsSuggestionsOld(ProximityInfo *proximityInfo,
|
void getSplitTwoWordsSuggestionsOld(ProximityInfo *proximityInfo,
|
||||||
const int *xcoordinates, const int *ycoordinates, const int *codes,
|
const int *xcoordinates, const int *ycoordinates, const int *codes,
|
||||||
const bool useFullEditDistance, const int inputLength, const int spaceProximityPos,
|
const bool useFullEditDistance, const int inputLength, const int spaceProximityPos,
|
||||||
|
@ -111,18 +113,20 @@ class UnigramDictionary {
|
||||||
void getMissingSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates,
|
void getMissingSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates,
|
||||||
const int *ycoordinates, const int *codes, const bool useFullEditDistance,
|
const int *ycoordinates, const int *codes, const bool useFullEditDistance,
|
||||||
const int inputLength, const int missingSpacePos, Correction *correction,
|
const int inputLength, const int missingSpacePos, Correction *correction,
|
||||||
WordsPriorityQueuePool* queuePool);
|
WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate);
|
||||||
void getMistypedSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates,
|
void getMistypedSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates,
|
||||||
const int *ycoordinates, const int *codes, const bool useFullEditDistance,
|
const int *ycoordinates, const int *codes, const bool useFullEditDistance,
|
||||||
const int inputLength, const int spaceProximityPos, Correction *correction,
|
const int inputLength, const int spaceProximityPos, Correction *correction,
|
||||||
WordsPriorityQueuePool* queuePool);
|
WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate);
|
||||||
void onTerminal(const int freq, const TerminalAttributes& terminalAttributes,
|
void onTerminal(const int freq, const TerminalAttributes& terminalAttributes,
|
||||||
Correction *correction, WordsPriorityQueuePool *queuePool, const bool addToMasterQueue);
|
Correction *correction, WordsPriorityQueuePool *queuePool, const bool addToMasterQueue,
|
||||||
|
const int currentWordIndex);
|
||||||
bool needsToSkipCurrentNode(const unsigned short c,
|
bool needsToSkipCurrentNode(const unsigned short c,
|
||||||
const int inputIndex, const int skipPos, const int depth);
|
const int inputIndex, const int skipPos, const int depth);
|
||||||
// Process a node by considering proximity, missing and excessive character
|
// Process a node by considering proximity, missing and excessive character
|
||||||
bool processCurrentNode(const int initialPos, Correction *correction, int *newCount,
|
bool processCurrentNode(const int initialPos, Correction *correction, int *newCount,
|
||||||
int *newChildPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool);
|
int *newChildPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool,
|
||||||
|
const int currentWordIndex);
|
||||||
int getMostFrequentWordLike(const int startInputIndex, const int inputLength,
|
int getMostFrequentWordLike(const int startInputIndex, const int inputLength,
|
||||||
ProximityInfo *proximityInfo, unsigned short *word);
|
ProximityInfo *proximityInfo, unsigned short *word);
|
||||||
int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length,
|
int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length,
|
||||||
|
|
|
@ -137,7 +137,7 @@ class WordsPriorityQueue {
|
||||||
if (size() <= 0) {
|
if (size() <= 0) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
DUMP_WORD(mSuggestions.top()->mWord, mSuggestions.top()->mWordLength);
|
DUMP_WORD(mHighestSuggestedWord->mWord, mHighestSuggestedWord->mWordLength);
|
||||||
}
|
}
|
||||||
|
|
||||||
double getHighestNormalizedScore(const unsigned short* before, const int beforeLength,
|
double getHighestNormalizedScore(const unsigned short* before, const int beforeLength,
|
||||||
|
|
|
@ -45,15 +45,21 @@ class WordsPriorityQueuePool {
|
||||||
|
|
||||||
// TODO: Come up with more generic pool
|
// TODO: Come up with more generic pool
|
||||||
WordsPriorityQueue* getSubQueue1(const int id) {
|
WordsPriorityQueue* getSubQueue1(const int id) {
|
||||||
if (DEBUG_WORDS_PRIORITY_QUEUE) {
|
if (id < 0 || id >= SUB_QUEUE_MAX_COUNT) {
|
||||||
assert(id >= 0 && id < SUB_QUEUE_MAX_COUNT);
|
if (DEBUG_WORDS_PRIORITY_QUEUE) {
|
||||||
|
assert(false);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
return mSubQueues1[id];
|
return mSubQueues1[id];
|
||||||
}
|
}
|
||||||
|
|
||||||
WordsPriorityQueue* getSubQueue2(const int id) {
|
WordsPriorityQueue* getSubQueue2(const int id) {
|
||||||
if (DEBUG_WORDS_PRIORITY_QUEUE) {
|
if (id < 0 || id >= SUB_QUEUE_MAX_COUNT) {
|
||||||
assert(id >= 0 && id < SUB_QUEUE_MAX_COUNT);
|
if (DEBUG_WORDS_PRIORITY_QUEUE) {
|
||||||
|
assert(false);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
return mSubQueues2[id];
|
return mSubQueues2[id];
|
||||||
}
|
}
|
||||||
|
@ -66,6 +72,18 @@ class WordsPriorityQueuePool {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline void clearSubQueue1() {
|
||||||
|
for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
|
||||||
|
mSubQueues1[i]->clear();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void clearSubQueue2() {
|
||||||
|
for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
|
||||||
|
mSubQueues2[i]->clear();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void dumpSubQueue1TopSuggestions() {
|
void dumpSubQueue1TopSuggestions() {
|
||||||
AKLOGI("DUMP SUBQUEUE1 TOP SUGGESTIONS");
|
AKLOGI("DUMP SUBQUEUE1 TOP SUGGESTIONS");
|
||||||
for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
|
for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
|
||||||
|
|
Loading…
Reference in a new issue