Merge multiple words suggestions algorithm

Change-Id: I70d85b90ddaa28a41e9679f445bc14ef9ff50f16
This commit is contained in:
satok 2012-01-26 18:36:19 +09:00
parent 2692a87007
commit 3c09bb18d9
2 changed files with 90 additions and 126 deletions

View file

@ -407,21 +407,74 @@ inline void UnigramDictionary::onTerminal(const int freq,
int UnigramDictionary::getSubStringSuggestion( int UnigramDictionary::getSubStringSuggestion(
ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates,
const int *codes, const bool useFullEditDistance, const Correction *correction, const int *codes, const bool useFullEditDistance, Correction *correction,
WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate, WordsPriorityQueuePool* queuePool, const int inputLength,
const int currentWordIndex, const int inputWordStartPos, const int inputWordLength, const bool hasAutoCorrectionCandidate, const int currentWordIndex,
const int inputWordStartPos, const int inputWordLength,
const int outputWordStartPos, unsigned short* outputWord, int *outputWordLength) { const int outputWordStartPos, unsigned short* outputWord, int *outputWordLength) {
// under constructiong unsigned short* tempOutputWord = 0;
// unsigned short* tempOutputWord = 0; int tempOutputWordLength = 0;
// int tempOutputWordLength = 0; int freq = getMostFrequentWordLike(
// int freq = getMostFrequentWordLike( inputWordStartPos, inputWordLength, proximityInfo, mWord);
// inputWordStartPos, inputWordLength, proximityInfo, mWord); if (freq > 0) {
// if (freq > 0) { tempOutputWordLength = inputWordLength;
// tempOutputWordLength = inputWordLength; tempOutputWord = mWord;
// tempOutputWord = mWord; } else if (!hasAutoCorrectionCandidate) {
// } else if (!hasAutoCorrectionCandidate) { if (inputWordStartPos > 0) {
// } const int offset = inputWordStartPos;
initSuggestions(proximityInfo, &xcoordinates[offset], &ycoordinates[offset],
codes + offset * MAX_PROXIMITY_CHARS, inputWordLength, correction);
queuePool->clearSubQueue(currentWordIndex);
getSuggestionCandidates(useFullEditDistance, inputWordLength, correction,
queuePool, false, MAX_ERRORS_FOR_TWO_WORDS, currentWordIndex);
if (DEBUG_DICT) {
if (currentWordIndex <= SUB_QUEUE_MAX_WORD_INDEX) {
AKLOGI("Dump word candidates(%d) %d", currentWordIndex, inputWordLength);
for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
queuePool->getSubQueue(currentWordIndex, i)->dumpTopWord();
}
}
}
}
WordsPriorityQueue* queue = queuePool->getSubQueue(currentWordIndex, inputWordLength);
if (!queue || queue->size() < 1) {
return 0; return 0;
}
int score = 0;
const double ns = queue->getHighestNormalizedScore(
proximityInfo->getPrimaryInputWord(), inputWordLength,
&tempOutputWord, &score, &tempOutputWordLength);
if (DEBUG_DICT) {
AKLOGI("NS(%d) = %f, Score = %d", currentWordIndex, ns, score);
}
// Two words correction won't be done if the score of the first word doesn't exceed the
// threshold.
if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD
|| tempOutputWordLength < SUB_QUEUE_MIN_WORD_LENGTH) {
return 0;
}
freq = score >> (tempOutputWordLength
+ TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER);
}
if (DEBUG_DICT) {
AKLOGI("Freq(%d): %d", currentWordIndex, freq);
}
if (freq <= 0 || tempOutputWordLength <= 0
|| MAX_WORD_LENGTH <= (outputWordStartPos + tempOutputWordLength)) {
return 0;
}
for (int i = 0; i < tempOutputWordLength; ++i) {
outputWord[outputWordStartPos + i] = tempOutputWord[i];
}
if ((inputWordStartPos + inputWordLength) < inputLength) {
if (outputWordStartPos + tempOutputWordLength >= MAX_WORD_LENGTH) {
return 0;
}
outputWord[outputWordStartPos + tempOutputWordLength] = SPACE;
++tempOutputWordLength;
}
*outputWordLength = outputWordStartPos + tempOutputWordLength;
return freq;
} }
void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo, void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo,
@ -441,125 +494,35 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo
initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
inputLength, correction); inputLength, correction);
WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
const bool isSpaceProximity = spaceProximityPos >= 0;
// First word
const int firstInputWordStartPos = 0;
const int firstInputWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
int firstFreq = getMostFrequentWordLike(
firstInputWordStartPos, firstInputWordLength, proximityInfo, mWord);
unsigned short* firstOutputWord = 0;
int firstOutputWordLength = 0;
if (firstFreq > 0) {
firstOutputWordLength = firstInputWordLength;
firstOutputWord = mWord;
} else if (!hasAutoCorrectionCandidate) {
WordsPriorityQueue* firstWordQueue = queuePool->getSubQueue(
FIRST_WORD_INDEX, firstInputWordLength);
if (!firstWordQueue || firstWordQueue->size() < 1) {
return;
}
int score = 0;
const double ns = firstWordQueue->getHighestNormalizedScore(
proximityInfo->getPrimaryInputWord(), firstInputWordLength,
&firstOutputWord, &score, &firstOutputWordLength);
if (DEBUG_DICT) {
AKLOGI("NS1 = %f, Score = %d", ns, score);
}
// Two words correction won't be done if the score of the first word doesn't exceed the
// threshold.
if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD
|| firstOutputWordLength < SUB_QUEUE_MIN_WORD_LENGTH) {
return;
}
firstFreq = score >> (firstOutputWordLength
+ TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER);
}
if (DEBUG_DICT) {
AKLOGI("First freq: %d", firstFreq);
}
if (firstFreq <= 0 || firstOutputWordLength <= 0 || MAX_WORD_LENGTH <= firstOutputWordLength) {
return;
}
// Allocating fixed length array on stack // Allocating fixed length array on stack
unsigned short outputWord[MAX_WORD_LENGTH]; unsigned short outputWord[MAX_WORD_LENGTH];
int outputWordLength = 0; int outputWordLength = 0;
for (int i = 0; i < firstOutputWordLength; ++i) { WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
outputWord[i] = firstOutputWord[i]; const bool isSpaceProximity = spaceProximityPos >= 0;
}
outputWord[firstOutputWordLength] = SPACE; // First word
outputWordLength = firstOutputWordLength + 1; int inputWordStartPos = 0;
int inputWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
const int firstFreq = getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
FIRST_WORD_INDEX, inputWordStartPos, inputWordLength, 0, outputWord, &outputWordLength);
if (firstFreq <= 0) {
return;
}
// Second word // Second word
const int secondInputWordLength = isSpaceProximity inputWordStartPos = isSpaceProximity ? (spaceProximityPos + 1) : missingSpacePos;
? (inputLength - spaceProximityPos - 1) inputWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1)
: (inputLength - missingSpacePos); : (inputLength - missingSpacePos);
const int secondInputWordStartPos = const int secondFreq = getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
isSpaceProximity ? (spaceProximityPos + 1) : missingSpacePos; useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
int secondFreq = getMostFrequentWordLike( SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, outputWordLength, outputWord,
secondInputWordStartPos, secondInputWordLength, proximityInfo, mWord); &outputWordLength);
unsigned short* secondOutputWord = 0; if (secondFreq <= 0) {
int secondOutputWordLength = 0;
if (secondFreq > 0) {
secondOutputWordLength = secondInputWordLength;
secondOutputWord = mWord;
} else if (!hasAutoCorrectionCandidate) {
const int offset = secondInputWordStartPos;
initSuggestions(proximityInfo, &xcoordinates[offset], &ycoordinates[offset],
codes + offset * MAX_PROXIMITY_CHARS, secondInputWordLength, correction);
queuePool->clearSubQueue(SECOND_WORD_INDEX);
getSuggestionCandidates(useFullEditDistance, secondInputWordLength, correction,
queuePool, false, MAX_ERRORS_FOR_TWO_WORDS, SECOND_WORD_INDEX);
if (DEBUG_DICT) {
AKLOGI("Dump second word candidates %d", secondInputWordLength);
for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
queuePool->getSubQueue(SECOND_WORD_INDEX, i)->dumpTopWord();
}
}
WordsPriorityQueue* secondWordQueue = queuePool->getSubQueue(
SECOND_WORD_INDEX, secondInputWordLength);
if (!secondWordQueue || secondWordQueue->size() < 1) {
return; return;
} }
int score = 0;
const double ns = secondWordQueue->getHighestNormalizedScore(
proximityInfo->getPrimaryInputWord(), secondInputWordLength,
&secondOutputWord, &score, &secondOutputWordLength);
if (DEBUG_DICT) {
AKLOGI("NS2 = %f, Score = %d", ns, score);
}
// Two words correction won't be done if the score of the first word doesn't exceed the
// threshold.
if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD
|| secondOutputWordLength < SUB_QUEUE_MIN_WORD_LENGTH) {
return;
}
secondFreq = score >> (secondOutputWordLength
+ TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER);
}
if (DEBUG_DICT) {
DUMP_WORD(secondOutputWord, secondOutputWordLength);
AKLOGI("Second freq: %d", secondFreq);
}
if (secondFreq <= 0 || secondOutputWordLength <= 0
|| MAX_WORD_LENGTH <= (firstOutputWordLength + 1 + secondOutputWordLength)) {
return;
}
for (int i = 0; i < secondOutputWordLength; ++i) {
outputWord[firstOutputWordLength + 1 + i] = secondOutputWord[i];
}
outputWordLength += secondOutputWordLength;
// TODO: Remove initSuggestions and correction->setCorrectionParams // TODO: Remove initSuggestions and correction->setCorrectionParams
initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction);

View file

@ -129,9 +129,10 @@ class UnigramDictionary {
short unsigned int *outWord); short unsigned int *outWord);
int getSubStringSuggestion( int getSubStringSuggestion(
ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates,
const int *codes, const bool useFullEditDistance, const Correction *correction, const int *codes, const bool useFullEditDistance, Correction *correction,
WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate, WordsPriorityQueuePool* queuePool, const int inputLength,
const int currentWordIndex, const int inputWordStartPos, const int inputWordLength, const bool hasAutoCorrectionCandidate, const int currentWordIndex,
const int inputWordStartPos, const int inputWordLength,
const int outputWordStartPos, unsigned short* outputWord, int *outputWordLength); const int outputWordStartPos, unsigned short* outputWord, int *outputWordLength);
const uint8_t* const DICT_ROOT; const uint8_t* const DICT_ROOT;