Merge "Implement multi words suggestions step1"

This commit is contained in:
satok 2012-01-30 01:04:42 -08:00 committed by Android (Google) Code Review
commit a191afb78d
5 changed files with 132 additions and 110 deletions

View file

@ -827,11 +827,6 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
const bool capitalizedWordDemotion =
firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion;
if (DEBUG_DICT_FULL) {
AKLOGI("Two words: %c, %c, %d",
word[0], word[firstWordLength + 1], capitalizedWordDemotion);
}
if (firstWordLength == 0 || secondWordLength == 0) {
return 0;
}
@ -891,6 +886,12 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq);
}
if (DEBUG_CORRECTION_FREQ) {
AKLOGI("Two words (%d, %d) (%d, %d) %d, %d", firstFreq, secondFreq, firstWordLength,
secondWordLength, capitalizedWordDemotion, totalFreq);
DUMP_WORD(word, firstWordLength);
}
return totalFreq;
}

View file

@ -216,15 +216,15 @@ static void prof_out(void) {
#define SUB_QUEUE_MAX_WORDS 1
#define SUB_QUEUE_MAX_COUNT 10
#define SUB_QUEUE_MIN_WORD_LENGTH 4
#define SUB_QUEUE_MAX_WORD_INDEX 2
#define MULTIPLE_WORDS_SUGGESTION_MAX_WORDS 2
#define TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD 0.39
#define START_TWO_WORDS_CORRECTION_THRESHOLD 0.22
#define MAX_DEPTH_MULTIPLIER 3
#define FIRST_WORD_INDEX 1
#define SECOND_WORD_INDEX 2
#define FIRST_WORD_INDEX 0
#define SECOND_WORD_INDEX 1
// TODO: Reduce this constant if possible; check the maximum number of umlauts in the same German
// word in the dictionary

View file

@ -224,14 +224,9 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
// Multiple word suggestions
if (SUGGEST_MULTIPLE_WORDS
&& inputLength >= MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION) {
for (int i = 1; i < inputLength; ++i) {
if (DEBUG_DICT) {
AKLOGI("--- Suggest multiple words %d", i);
}
getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
useFullEditDistance, inputLength, i, correction, queuePool,
hasAutoCorrectionCandidate);
}
getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
useFullEditDistance, inputLength, correction, queuePool,
hasAutoCorrectionCandidate);
}
PROF_END(5);
@ -329,7 +324,7 @@ inline void UnigramDictionary::onTerminal(const int freq,
int wordLength;
unsigned short* wordPointer;
if ((currentWordIndex == 1) && addToMasterQueue) {
if ((currentWordIndex == FIRST_WORD_INDEX) && addToMasterQueue) {
WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength);
if (finalFreq != NOT_A_FREQUENCY) {
@ -377,11 +372,8 @@ bool UnigramDictionary::getSubStringSuggestion(
const int inputWordStartPos, const int inputWordLength,
const int outputWordStartPos, const bool isSpaceProximity, int *freqArray,
int*wordLengthArray, unsigned short* outputWord, int *outputWordLength) {
if (DEBUG_DICT) {
assert(currentWordIndex >= 1);
}
unsigned short* tempOutputWord = 0;
int tempOutputWordLength = 0;
int nextWordLength = 0;
// TODO: Optimize init suggestion
initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
inputLength, correction);
@ -389,7 +381,7 @@ bool UnigramDictionary::getSubStringSuggestion(
int freq = getMostFrequentWordLike(
inputWordStartPos, inputWordLength, proximityInfo, mWord);
if (freq > 0) {
tempOutputWordLength = inputWordLength;
nextWordLength = inputWordLength;
tempOutputWord = mWord;
} else if (!hasAutoCorrectionCandidate) {
if (inputWordStartPos > 0) {
@ -400,7 +392,7 @@ bool UnigramDictionary::getSubStringSuggestion(
getSuggestionCandidates(useFullEditDistance, inputWordLength, correction,
queuePool, false, MAX_ERRORS_FOR_TWO_WORDS, currentWordIndex);
if (DEBUG_DICT) {
if (currentWordIndex <= SUB_QUEUE_MAX_WORD_INDEX) {
if (currentWordIndex < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS) {
AKLOGI("Dump word candidates(%d) %d", currentWordIndex, inputWordLength);
for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
queuePool->getSubQueue(currentWordIndex, i)->dumpTopWord();
@ -415,59 +407,122 @@ bool UnigramDictionary::getSubStringSuggestion(
int score = 0;
const double ns = queue->getHighestNormalizedScore(
proximityInfo->getPrimaryInputWord(), inputWordLength,
&tempOutputWord, &score, &tempOutputWordLength);
&tempOutputWord, &score, &nextWordLength);
if (DEBUG_DICT) {
AKLOGI("NS(%d) = %f, Score = %d", currentWordIndex, ns, score);
}
// Two words correction won't be done if the score of the first word doesn't exceed the
// threshold.
if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD
|| tempOutputWordLength < SUB_QUEUE_MIN_WORD_LENGTH) {
|| nextWordLength < SUB_QUEUE_MIN_WORD_LENGTH) {
return false;
}
freq = score >> (tempOutputWordLength
+ TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER);
freq = score >> (nextWordLength + TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER);
}
if (DEBUG_DICT) {
AKLOGI("Freq(%d): %d, length: %d, input length: %d, input start: %d"
, currentWordIndex, freq, tempOutputWordLength, inputWordLength, inputWordStartPos);
AKLOGI("Freq(%d): %d, length: %d, input length: %d, input start: %d (%d)"
, currentWordIndex, freq, nextWordLength, inputWordLength, inputWordStartPos,
wordLengthArray[0]);
}
if (freq <= 0 || tempOutputWordLength <= 0
|| MAX_WORD_LENGTH <= (outputWordStartPos + tempOutputWordLength)) {
if (freq <= 0 || nextWordLength <= 0
|| MAX_WORD_LENGTH <= (outputWordStartPos + nextWordLength)) {
return false;
}
for (int i = 0; i < tempOutputWordLength; ++i) {
for (int i = 0; i < nextWordLength; ++i) {
outputWord[outputWordStartPos + i] = tempOutputWord[i];
}
// Put output values
freqArray[currentWordIndex - 1] = freq;
freqArray[currentWordIndex] = freq;
// TODO: put output length instead of input length
wordLengthArray[currentWordIndex - 1] = inputWordLength;
*outputWordLength = outputWordStartPos + tempOutputWordLength;
wordLengthArray[currentWordIndex] = inputWordLength;
const int tempOutputWordLength = outputWordStartPos + nextWordLength;
if (outputWordLength) {
*outputWordLength = tempOutputWordLength;
}
if ((inputWordStartPos + inputWordLength) < inputLength) {
if (outputWordStartPos + tempOutputWordLength >= MAX_WORD_LENGTH) {
if (outputWordStartPos + nextWordLength >= MAX_WORD_LENGTH) {
return false;
}
outputWord[outputWordStartPos + tempOutputWordLength] = SPACE;
++*outputWordLength;
} else if (currentWordIndex >= 2) {
if (outputWordLength) {
++*outputWordLength;
}
} else if (currentWordIndex >= 1) {
// TODO: Handle 3 or more words
const int pairFreq = correction->getFreqForSplitTwoWords(
freqArray, wordLengthArray, isSpaceProximity, outputWord);
if (DEBUG_DICT) {
AKLOGI("Split two words: %d, %d, %d, %d", freqArray[0], freqArray[1], pairFreq,
inputLength);
AKLOGI("Split two words: %d, %d, %d, %d, (%d)", freqArray[0], freqArray[1], pairFreq,
inputLength, wordLengthArray[0]);
}
addWord(outputWord, *outputWordLength, pairFreq, queuePool->getMasterQueue());
addWord(outputWord, tempOutputWordLength, pairFreq, queuePool->getMasterQueue());
}
return true;
}
void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo,
const int *xcoordinates, const int *ycoordinates, const int *codes,
const bool useFullEditDistance, const int inputLength,
Correction *correction, WordsPriorityQueuePool* queuePool,
const bool hasAutoCorrectionCandidate, const int startInputPos, const int startWordIndex,
const int outputWordLength, int *freqArray, int* wordLengthArray,
unsigned short* outputWord) {
if (startWordIndex >= (MULTIPLE_WORDS_SUGGESTION_MAX_WORDS - 1)) {
// Return if the last word index
return;
}
for (int i = 1; i < inputLength; ++i) {
int tempOutputWordLength = 0;
// First word
int inputWordStartPos = 0;
int inputWordLength = i;
if (DEBUG_CORRECTION_FREQ) {
AKLOGI("Two words, %d", inputWordLength);
}
if (!getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
FIRST_WORD_INDEX, inputWordStartPos, inputWordLength, 0, true /* not used */,
freqArray, wordLengthArray, outputWord, &tempOutputWordLength)) {
continue;
}
// Second word
// Missing space
inputWordStartPos = i;
inputWordLength = inputLength - i;
getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, tempOutputWordLength,
false /* missing space */, freqArray, wordLengthArray, outputWord,
0);
// Mistyped space
++inputWordStartPos;
--inputWordLength;
if (inputWordLength <= 0) {
continue;
}
const int x = xcoordinates[inputWordStartPos - 1];
const int y = ycoordinates[inputWordStartPos - 1];
if (!proximityInfo->hasSpaceProximity(x, y)) {
continue;
}
getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, tempOutputWordLength,
true /* mistyped space */, freqArray, wordLengthArray, outputWord,
0);
}
}
void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo,
const int *xcoordinates, const int *ycoordinates, const int *codes,
const bool useFullEditDistance, const int inputLength, const int wordDivideIndex,
const bool useFullEditDistance, const int inputLength,
Correction *correction, WordsPriorityQueuePool* queuePool,
const bool hasAutoCorrectionCandidate) {
if (inputLength >= MAX_WORD_LENGTH) return;
@ -475,51 +530,21 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo
// MAX_PROXIMITY_CHARS_SIZE in ProximityInfo.java should be 16
assert(MAX_PROXIMITY_CHARS == 16);
}
if (DEBUG_DICT) {
AKLOGI("--- Suggest multiple words");
}
// Allocating fixed length array on stack
unsigned short outputWord[MAX_WORD_LENGTH];
int freqArray[SUB_QUEUE_MAX_WORD_INDEX];
int wordLengthArray[SUB_QUEUE_MAX_WORD_INDEX];
int outputWordLength = 0;
// First word
int inputWordStartPos = 0;
int inputWordLength = wordDivideIndex;
if (!getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
FIRST_WORD_INDEX, inputWordStartPos, inputWordLength, 0, true /* not used */,
freqArray, wordLengthArray, outputWord, &outputWordLength)) {
return;
}
const int tempOutputWordLength = outputWordLength;
// Second word
// Missing space
inputWordStartPos = wordDivideIndex;
inputWordLength = inputLength - wordDivideIndex;
getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, tempOutputWordLength,
false /* missing space */, freqArray, wordLengthArray, outputWord, &outputWordLength);
// Mistyped space
++inputWordStartPos;
--inputWordLength;
if (inputWordLength <= 0) {
return;
}
const int x = xcoordinates[inputWordStartPos - 1];
const int y = ycoordinates[inputWordStartPos - 1];
if (!proximityInfo->hasSpaceProximity(x, y)) {
return;
}
getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, tempOutputWordLength,
true /* mistyped space */, freqArray, wordLengthArray, outputWord, &outputWordLength);
int freqArray[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS];
int wordLengthArray[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS];
const int outputWordLength = 0;
const int startInputPos = 0;
const int startWordIndex = 0;
getMultiWordsSuggestionRec(proximityInfo, xcoordinates, ycoordinates, codes,
useFullEditDistance, inputLength, correction, queuePool, hasAutoCorrectionCandidate,
startInputPos, startWordIndex, outputWordLength, freqArray, wordLengthArray,
outputWord);
}
// Wrapper for getMostFrequentWordLikeInner, which matches it to the previous

View file

@ -103,7 +103,7 @@ class UnigramDictionary {
const int currentWordIndex);
void getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo,
const int *xcoordinates, const int *ycoordinates, const int *codes,
const bool useFullEditDistance, const int inputLength, const int wordDivideIndex,
const bool useFullEditDistance, const int inputLength,
Correction *correction, WordsPriorityQueuePool* queuePool,
const bool hasAutoCorrectionCandidate);
void onTerminal(const int freq, const TerminalAttributes& terminalAttributes,
@ -127,6 +127,13 @@ class UnigramDictionary {
const int inputWordStartPos, const int inputWordLength,
const int outputWordStartPos, const bool isSpaceProximity, int *freqArray,
int *wordLengthArray, unsigned short* outputWord, int *outputWordLength);
void getMultiWordsSuggestionRec(ProximityInfo *proximityInfo,
const int *xcoordinates, const int *ycoordinates, const int *codes,
const bool useFullEditDistance, const int inputLength,
Correction *correction, WordsPriorityQueuePool* queuePool,
const bool hasAutoCorrectionCandidate, const int startPos, const int startWordIndex,
const int outputWordLength, int *freqArray, int* wordLengthArray,
unsigned short* outputWord);
const uint8_t* const DICT_ROOT;
const int MAX_WORD_LENGTH;

View file

@ -27,11 +27,10 @@ class WordsPriorityQueuePool {
public:
WordsPriorityQueuePool(int mainQueueMaxWords, int subQueueMaxWords, int maxWordLength) {
mMasterQueue = new(mMasterQueueBuf) WordsPriorityQueue(mainQueueMaxWords, maxWordLength);
for (int i = 0, subQueueBufOffset = 0; i < SUB_QUEUE_MAX_COUNT;
for (int i = 0, subQueueBufOffset = 0;
i < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS * SUB_QUEUE_MAX_COUNT;
++i, subQueueBufOffset += sizeof(WordsPriorityQueue)) {
mSubQueues1[i] = new(mSubQueueBuf1 + subQueueBufOffset)
WordsPriorityQueue(subQueueMaxWords, maxWordLength);
mSubQueues2[i] = new(mSubQueueBuf2 + subQueueBufOffset)
mSubQueues[i] = new(mSubQueueBuf + subQueueBufOffset)
WordsPriorityQueue(subQueueMaxWords, maxWordLength);
}
}
@ -44,7 +43,7 @@ class WordsPriorityQueuePool {
}
WordsPriorityQueue* getSubQueue(const int wordIndex, const int inputWordLength) {
if (wordIndex > SUB_QUEUE_MAX_WORD_INDEX) {
if (wordIndex >= MULTIPLE_WORDS_SUGGESTION_MAX_WORDS) {
return 0;
}
if (inputWordLength < 0 || inputWordLength >= SUB_QUEUE_MAX_COUNT) {
@ -53,30 +52,21 @@ class WordsPriorityQueuePool {
}
return 0;
}
// TODO: Come up with more generic pool
if (wordIndex == 1) {
return mSubQueues1[inputWordLength];
} else if (wordIndex == 2) {
return mSubQueues2[inputWordLength];
} else {
return 0;
}
return mSubQueues[wordIndex * SUB_QUEUE_MAX_COUNT + inputWordLength];
}
inline void clearAll() {
mMasterQueue->clear();
for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
mSubQueues1[i]->clear();
mSubQueues2[i]->clear();
for (int i = 0; i < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS; ++i) {
clearSubQueue(i);
}
}
inline void clearSubQueue(const int wordIndex) {
for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
if (wordIndex == 1) {
mSubQueues1[i]->clear();
} else if (wordIndex == 2) {
mSubQueues2[i]->clear();
WordsPriorityQueue* queue = getSubQueue(wordIndex, i);
if (queue) {
queue->clear();
}
}
}
@ -84,17 +74,16 @@ class WordsPriorityQueuePool {
void dumpSubQueue1TopSuggestions() {
AKLOGI("DUMP SUBQUEUE1 TOP SUGGESTIONS");
for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
mSubQueues1[i]->dumpTopWord();
getSubQueue(0, i)->dumpTopWord();
}
}
private:
WordsPriorityQueue* mMasterQueue;
WordsPriorityQueue* mSubQueues1[SUB_QUEUE_MAX_COUNT];
WordsPriorityQueue* mSubQueues2[SUB_QUEUE_MAX_COUNT];
WordsPriorityQueue* mSubQueues[SUB_QUEUE_MAX_COUNT * MULTIPLE_WORDS_SUGGESTION_MAX_WORDS];
char mMasterQueueBuf[sizeof(WordsPriorityQueue)];
char mSubQueueBuf1[SUB_QUEUE_MAX_COUNT * sizeof(WordsPriorityQueue)];
char mSubQueueBuf2[SUB_QUEUE_MAX_COUNT * sizeof(WordsPriorityQueue)];
char mSubQueueBuf[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS
* SUB_QUEUE_MAX_COUNT * sizeof(WordsPriorityQueue)];
};
}