From 6ad15fcd158de5bec18f6529b961a55e7db9007f Mon Sep 17 00:00:00 2001 From: satok Date: Mon, 16 Jan 2012 16:21:21 +0900 Subject: [PATCH] Store suggestions for each input length for missing space algorithm etc. Change-Id: Ief8f6ddd29e043744863e5b9be3a51a70987291c --- native/src/correction.cpp | 32 ++++++------ native/src/correction.h | 3 +- native/src/defines.h | 30 ++++++------ native/src/unigram_dictionary.cpp | 68 ++++++++++++++++---------- native/src/unigram_dictionary.h | 9 ++-- native/src/words_priority_queue.h | 7 +++ native/src/words_priority_queue_pool.h | 15 ++++++ 7 files changed, 99 insertions(+), 65 deletions(-) diff --git a/native/src/correction.cpp b/native/src/correction.cpp index c40f94b3f..43ab3f59c 100644 --- a/native/src/correction.cpp +++ b/native/src/correction.cpp @@ -214,21 +214,11 @@ int Correction::goDownTree( return mOutputIndex; } -// TODO: remove -int Correction::getOutputIndex() { - return mOutputIndex; -} - // TODO: remove int Correction::getInputIndex() { return mInputIndex; } -// TODO: remove -bool Correction::needsToTraverseAllNodes() { - return mNeedsToTraverseAllNodes; -} - void Correction::incrementInputIndex() { ++mInputIndex; } @@ -278,13 +268,12 @@ void Correction::addCharToCurrentWord(const int32_t c) { mWord, mOutputIndex + 1); } -// TODO: inline? Correction::CorrectionType Correction::processSkipChar( const int32_t c, const bool isTerminal, const bool inputIndexIncremented) { addCharToCurrentWord(c); - if (needsToTraverseAllNodes() && isTerminal) { - mTerminalInputIndex = mInputIndex - (inputIndexIncremented ? 1 : 0); - mTerminalOutputIndex = mOutputIndex; + mTerminalInputIndex = mInputIndex - (inputIndexIncremented ? 1 : 0); + mTerminalOutputIndex = mOutputIndex; + if (mNeedsToTraverseAllNodes && isTerminal) { incrementOutputIndex(); return TRAVERSE_ALL_ON_TERMINAL; } else { @@ -293,6 +282,13 @@ Correction::CorrectionType Correction::processSkipChar( } } +Correction::CorrectionType Correction::processUnrelatedCorrectionType() { + // Needs to set mTerminalInputIndex and mTerminalOutputIndex before returning any CorrectionType + mTerminalInputIndex = mInputIndex; + mTerminalOutputIndex = mOutputIndex; + return UNRELATED; +} + inline bool isEquivalentChar(ProximityInfo::ProximityType type) { return type == ProximityInfo::EQUIVALENT_CHAR; } @@ -301,7 +297,7 @@ Correction::CorrectionType Correction::processCharAndCalcState( const int32_t c, const bool isTerminal) { const int correctionCount = (mSkippedCount + mExcessiveCount + mTransposedCount); if (correctionCount > mMaxErrors) { - return UNRELATED; + return processUnrelatedCorrectionType(); } // TODO: Change the limit if we'll allow two or more corrections @@ -381,7 +377,7 @@ Correction::CorrectionType Correction::processCharAndCalcState( AKLOGI("UNRELATED(0): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, mTransposedCount, mExcessiveCount, c); } - return UNRELATED; + return processUnrelatedCorrectionType(); } } @@ -484,7 +480,7 @@ Correction::CorrectionType Correction::processCharAndCalcState( AKLOGI("UNRELATED(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount, mTransposedCount, mExcessiveCount, c); } - return UNRELATED; + return processUnrelatedCorrectionType(); } } else if (secondTransposing) { // If inputIndex is greater than mInputLength, that means there is no @@ -539,6 +535,8 @@ Correction::CorrectionType Correction::processCharAndCalcState( } return ON_TERMINAL; } else { + mTerminalInputIndex = mInputIndex - 1; + mTerminalOutputIndex = mOutputIndex - 1; return NOT_ON_TERMINAL; } } diff --git a/native/src/correction.h b/native/src/correction.h index 4012e7e82..a0fd55fd9 100644 --- a/native/src/correction.h +++ b/native/src/correction.h @@ -48,7 +48,6 @@ class Correction { void checkState(); bool initProcessState(const int index); - int getOutputIndex(); int getInputIndex(); virtual ~Correction(); @@ -115,11 +114,11 @@ class Correction { private: inline void incrementInputIndex(); inline void incrementOutputIndex(); - inline bool needsToTraverseAllNodes(); inline void startToTraverseAllNodes(); inline bool isQuote(const unsigned short c); inline CorrectionType processSkipChar( const int32_t c, const bool isTerminal, const bool inputIndexIncremented); + inline CorrectionType processUnrelatedCorrectionType(); inline void addCharToCurrentWord(const int32_t c); const int TYPED_LETTER_MULTIPLIER; diff --git a/native/src/defines.h b/native/src/defines.h index 01ef65678..48f92d86c 100644 --- a/native/src/defines.h +++ b/native/src/defines.h @@ -22,9 +22,23 @@ #include #define AKLOGE ALOGE #define AKLOGI ALOGI + +#define DUMP_WORD(word, length) do { dumpWord(word, length); } while(0) + +static char charBuf[50]; + +static void dumpWord(const unsigned short* word, const int length) { + for (int i = 0; i < length; ++i) { + charBuf[i] = word[i]; + } + charBuf[length] = 0; + AKLOGI("[ %s ]", charBuf); +} + #else #define AKLOGE(fmt, ...) #define AKLOGI(fmt, ...) +#define DUMP_WORD(word, length) #endif #ifdef FLAG_DO_PROFILE @@ -106,18 +120,6 @@ static void prof_out(void) { #define DEBUG_CORRECTION_FREQ true #define DEBUG_WORDS_PRIORITY_QUEUE true -#define DUMP_WORD(word, length) do { dumpWord(word, length); } while(0) - -static char charBuf[50]; - -static void dumpWord(const unsigned short* word, const int length) { - for (int i = 0; i < length; ++i) { - charBuf[i] = word[i]; - } - charBuf[length] = 0; - AKLOGI("[ %s ]", charBuf); -} - #else // FLAG_DBG #define DEBUG_DICT false @@ -131,7 +133,6 @@ static void dumpWord(const unsigned short* word, const int length) { #define DEBUG_CORRECTION_FREQ false #define DEBUG_WORDS_PRIORITY_QUEUE false -#define DUMP_WORD(word, length) #endif // FLAG_DBG @@ -207,7 +208,8 @@ static void dumpWord(const unsigned short* word, const int length) { // Word limit for sub queues used in WordsPriorityQueuePool. Sub queues are temporary queues used // for better performance. -#define SUB_QUEUE_MAX_WORDS 5 +// Holds up to 1 candidate for each word +#define SUB_QUEUE_MAX_WORDS 1 #define SUB_QUEUE_MAX_COUNT 10 #define MAX_DEPTH_MULTIPLIER 3 diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index bbae5a864..1cff4d838 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -186,7 +186,7 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, PROF_OPEN; PROF_START(0); - // Note: This line is intentionally left blank + queuePool->clearAll(); PROF_END(0); PROF_START(1); @@ -241,18 +241,17 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, } } PROF_END(6); + if (DEBUG_WORDS_PRIORITY_QUEUE) { + queuePool->dumpSubQueue1TopSuggestions(); + } } void UnigramDictionary::initSuggestions(ProximityInfo *proximityInfo, const int *xCoordinates, - const int *yCoordinates, const int *codes, const int inputLength, - WordsPriorityQueue *queue, Correction *correction) { + const int *yCoordinates, const int *codes, const int inputLength, Correction *correction) { if (DEBUG_DICT) { AKLOGI("initSuggest"); } proximityInfo->setInputParams(codes, inputLength, xCoordinates, yCoordinates); - if (queue) { - queue->clear(); - } const int maxDepth = min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH); correction->initCorrection(proximityInfo, inputLength, maxDepth); } @@ -264,15 +263,13 @@ void UnigramDictionary::getOneWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, Correction *correction, WordsPriorityQueuePool *queuePool) { - WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); - initSuggestions( - proximityInfo, xcoordinates, ycoordinates, codes, inputLength, masterQueue, correction); - getSuggestionCandidates(useFullEditDistance, inputLength, correction, masterQueue, + initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); + getSuggestionCandidates(useFullEditDistance, inputLength, correction, queuePool, true /* doAutoCompletion */, DEFAULT_MAX_ERRORS); } void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, - const int inputLength, Correction *correction, WordsPriorityQueue *queue, + const int inputLength, Correction *correction, WordsPriorityQueuePool *queuePool, const bool doAutoCompletion, const int maxErrors) { // TODO: Remove setCorrectionParams correction->setCorrectionParams(0, 0, 0, @@ -292,7 +289,7 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, int firstChildPos; const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, - correction, &childCount, &firstChildPos, &siblingPos, queue); + correction, &childCount, &firstChildPos, &siblingPos, queuePool); // Update next sibling pos correction->setTreeSiblingPos(outputIndex, siblingPos); @@ -327,14 +324,34 @@ void UnigramDictionary::getMistypedSpaceWords(ProximityInfo *proximityInfo, cons inline void UnigramDictionary::onTerminal(const int freq, const TerminalAttributes& terminalAttributes, Correction *correction, - WordsPriorityQueue *queue) { + WordsPriorityQueuePool *queuePool, const bool addToMasterQueue) { + const int inputIndex = correction->getInputIndex(); + const bool addToSubQueue = inputIndex < SUB_QUEUE_MAX_COUNT; + if (!addToMasterQueue && !addToSubQueue) { + return; + } + WordsPriorityQueue *masterQueue = queuePool->getMasterQueue(); + WordsPriorityQueue *subQueue = queuePool->getSubQueue1(inputIndex); int wordLength; unsigned short* wordPointer; const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength); if (finalFreq >= 0) { if (!terminalAttributes.isShortcutOnly()) { - addWord(wordPointer, wordLength, finalFreq, queue); + if (addToMasterQueue) { + addWord(wordPointer, wordLength, finalFreq, masterQueue); + } + // TODO: Check the validity of "inputIndex == wordLength" + //if (addToSubQueue && inputIndex == wordLength) { + if (addToSubQueue) { + addWord(wordPointer, wordLength, finalFreq, subQueue); + } } + // Please note that the shortcut candidates will be added to the master queue only. + if (!addToMasterQueue) { + return; + } + + // From here, below is the code to add shortcut candidates. TerminalAttributes::ShortcutIterator iterator = terminalAttributes.getShortcutIterator(); while (iterator.hasNextShortcutTarget()) { // TODO: addWord only supports weak ordering, meaning we have no means to control the @@ -345,7 +362,7 @@ inline void UnigramDictionary::onTerminal(const int freq, uint16_t shortcutTarget[MAX_WORD_LENGTH_INTERNAL]; const int shortcutTargetStringLength = iterator.getNextShortcutTarget( MAX_WORD_LENGTH_INTERNAL, shortcutTarget); - addWord(shortcutTarget, shortcutTargetStringLength, finalFreq, queue); + addWord(shortcutTarget, shortcutTargetStringLength, finalFreq, masterQueue); } } } @@ -411,8 +428,7 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo } // TODO: Remove initSuggestions and correction->setCorrectionParams - initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, - 0 /* do not clear queue */, correction); + initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */, -1 /* transposedPos */, spaceProximityPos, missingSpacePos, @@ -583,7 +599,7 @@ int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offs // given level, as output into newCount when traversing this level's parent. inline bool UnigramDictionary::processCurrentNode(const int initialPos, Correction *correction, int *newCount, - int *newChildrenPosition, int *nextSiblingPosition, WordsPriorityQueue *queue) { + int *newChildrenPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool) { if (DEBUG_DICT) { correction->checkState(); } @@ -658,15 +674,13 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, } while (NOT_A_CHARACTER != c); if (isTerminalNode) { - if (needsToInvokeOnTerminal) { - // The frequency should be here, because we come here only if this is actually - // a terminal node, and we are on its last char. - const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos); - const int childrenAddressPos = BinaryFormat::skipFrequency(flags, pos); - const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos); - TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos); - onTerminal(freq, terminalAttributes, correction, queue); - } + // The frequency should be here, because we come here only if this is actually + // a terminal node, and we are on its last char. + const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos); + const int childrenAddressPos = BinaryFormat::skipFrequency(flags, pos); + const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos); + TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos); + onTerminal(freq, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal); // If there are more chars in this node, then this virtual node has children. // If we are on the last char, this virtual node has children if this node has. diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h index 23581425a..5e7a7580f 100644 --- a/native/src/unigram_dictionary.h +++ b/native/src/unigram_dictionary.h @@ -93,14 +93,13 @@ class UnigramDictionary { const int codesRemain, const int currentDepth, int* codesDest, Correction *correction, WordsPriorityQueuePool* queuePool); void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, - const int *ycoordinates, const int *codes, const int codesSize, - WordsPriorityQueue *queue, Correction *correction); + const int *ycoordinates, const int *codes, const int codesSize, Correction *correction); void getOneWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, Correction *correction, WordsPriorityQueuePool* queuePool); void getSuggestionCandidates( const bool useFullEditDistance, const int inputLength, Correction *correction, - WordsPriorityQueue* queue, const bool doAutoCompletion, const int maxErrors); + WordsPriorityQueuePool* queuePool, const bool doAutoCompletion, const int maxErrors); void getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int spaceProximityPos, @@ -114,12 +113,12 @@ class UnigramDictionary { const int inputLength, const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool); void onTerminal(const int freq, const TerminalAttributes& terminalAttributes, - Correction *correction, WordsPriorityQueue *queue); + Correction *correction, WordsPriorityQueuePool *queuePool, const bool addToMasterQueue); bool needsToSkipCurrentNode(const unsigned short c, const int inputIndex, const int skipPos, const int depth); // Process a node by considering proximity, missing and excessive character bool processCurrentNode(const int initialPos, Correction *correction, int *newCount, - int *newChildPosition, int *nextSiblingPosition, WordsPriorityQueue *queue); + int *newChildPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool); int getMostFrequentWordLike(const int startInputIndex, const int inputLength, ProximityInfo *proximityInfo, unsigned short *word); int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length, diff --git a/native/src/words_priority_queue.h b/native/src/words_priority_queue.h index ce5d2ce51..54bf27a59 100644 --- a/native/src/words_priority_queue.h +++ b/native/src/words_priority_queue.h @@ -128,6 +128,13 @@ class WordsPriorityQueue { } } + void dumpTopWord() { + if (size() <= 0) { + return; + } + DUMP_WORD(mSuggestions.top()->mWord, mSuggestions.top()->mWordLength); + } + private: struct wordComparator { bool operator ()(SuggestedWord * left, SuggestedWord * right) { diff --git a/native/src/words_priority_queue_pool.h b/native/src/words_priority_queue_pool.h index bf9619e19..5fa254852 100644 --- a/native/src/words_priority_queue_pool.h +++ b/native/src/words_priority_queue_pool.h @@ -58,6 +58,21 @@ class WordsPriorityQueuePool { return mSubQueues2[id]; } + inline void clearAll() { + mMasterQueue->clear(); + for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { + mSubQueues1[i]->clear(); + mSubQueues2[i]->clear(); + } + } + + void dumpSubQueue1TopSuggestions() { + AKLOGI("DUMP SUBQUEUE1 TOP SUGGESTIONS"); + for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { + mSubQueues1[i]->dumpTopWord(); + } + } + private: WordsPriorityQueue* mMasterQueue; WordsPriorityQueue* mSubQueues1[SUB_QUEUE_MAX_COUNT];