From 9db2097f7bbfce0b4679d80cf8a4f6127616f1ae Mon Sep 17 00:00:00 2001 From: satok Date: Wed, 17 Aug 2011 17:55:16 +0900 Subject: [PATCH] Do the transposed correction and the excessive correction by one loop Change-Id: Idc7a3451a65f7b980e5c499e9083f67646b3a199 --- native/src/correction.cpp | 151 ++++++++++++++++++++++-------- native/src/correction.h | 17 +++- native/src/correction_state.h | 36 ++++++- native/src/unigram_dictionary.cpp | 25 ++--- 4 files changed, 165 insertions(+), 64 deletions(-) diff --git a/native/src/correction.cpp b/native/src/correction.cpp index 99412b211..0a2211826 100644 --- a/native/src/correction.cpp +++ b/native/src/correction.cpp @@ -56,17 +56,22 @@ void Correction::initCorrectionState( const int rootPos, const int childCount, const bool traverseAll) { latinime::initCorrectionState(mCorrectionStates, rootPos, childCount, traverseAll); // TODO: remove + mCorrectionStates[0].mTransposedPos = mTransposedPos; + mCorrectionStates[0].mExcessivePos = mExcessivePos; mCorrectionStates[0].mSkipPos = mSkipPos; } void Correction::setCorrectionParams(const int skipPos, const int excessivePos, const int transposedPos, const int spaceProximityPos, const int missingSpacePos) { // TODO: remove + mTransposedPos = transposedPos; + mExcessivePos = excessivePos; mSkipPos = skipPos; // TODO: remove + mCorrectionStates[0].mTransposedPos = transposedPos; + mCorrectionStates[0].mExcessivePos = excessivePos; mCorrectionStates[0].mSkipPos = skipPos; - mExcessivePos = excessivePos; - mTransposedPos = transposedPos; + mSpaceProximityPos = spaceProximityPos; mMissingSpacePos = missingSpacePos; } @@ -107,12 +112,23 @@ bool Correction::initProcessState(const int outputIndex) { --(mCorrectionStates[outputIndex].mChildCount); mInputIndex = mCorrectionStates[outputIndex].mInputIndex; mNeedsToTraverseAllNodes = mCorrectionStates[outputIndex].mNeedsToTraverseAllNodes; + mProximityCount = mCorrectionStates[outputIndex].mProximityCount; + mTransposedCount = mCorrectionStates[outputIndex].mTransposedCount; + mExcessiveCount = mCorrectionStates[outputIndex].mExcessiveCount; mSkippedCount = mCorrectionStates[outputIndex].mSkippedCount; + mLastCharExceeded = mCorrectionStates[outputIndex].mLastCharExceeded; + + mTransposedPos = mCorrectionStates[outputIndex].mTransposedPos; + mExcessivePos = mCorrectionStates[outputIndex].mExcessivePos; mSkipPos = mCorrectionStates[outputIndex].mSkipPos; - mSkipping = false; - mProximityMatching = false; + mMatching = false; + mProximityMatching = false; + mTransposing = false; + mExceeding = false; + mSkipping = false; + return true; } @@ -150,12 +166,23 @@ void Correction::incrementOutputIndex() { mCorrectionStates[mOutputIndex].mSiblingPos = mCorrectionStates[mOutputIndex - 1].mSiblingPos; mCorrectionStates[mOutputIndex].mInputIndex = mInputIndex; mCorrectionStates[mOutputIndex].mNeedsToTraverseAllNodes = mNeedsToTraverseAllNodes; + mCorrectionStates[mOutputIndex].mProximityCount = mProximityCount; + mCorrectionStates[mOutputIndex].mTransposedCount = mTransposedCount; + mCorrectionStates[mOutputIndex].mExcessiveCount = mExcessiveCount; mCorrectionStates[mOutputIndex].mSkippedCount = mSkippedCount; - mCorrectionStates[mOutputIndex].mSkipping = mSkipping; + mCorrectionStates[mOutputIndex].mSkipPos = mSkipPos; + mCorrectionStates[mOutputIndex].mTransposedPos = mTransposedPos; + mCorrectionStates[mOutputIndex].mExcessivePos = mExcessivePos; + + mCorrectionStates[mOutputIndex].mLastCharExceeded = mLastCharExceeded; + mCorrectionStates[mOutputIndex].mMatching = mMatching; mCorrectionStates[mOutputIndex].mProximityMatching = mProximityMatching; + mCorrectionStates[mOutputIndex].mTransposing = mTransposing; + mCorrectionStates[mOutputIndex].mExceeding = mExceeding; + mCorrectionStates[mOutputIndex].mSkipping = mSkipping; } void Correction::startToTraverseAllNodes() { @@ -184,13 +211,16 @@ Correction::CorrectionType Correction::processSkipChar( Correction::CorrectionType Correction::processCharAndCalcState( const int32_t c, const bool isTerminal) { CorrectionType currentStateType = NOT_ON_TERMINAL; - // This has to be done for each virtual char (this forwards the "inputIndex" which - // is the index in the user-inputted chars, as read by proximity chars. - if (mExcessivePos == mOutputIndex && mInputIndex < mInputLength - 1) { - incrementInputIndex(); + + if (mExcessivePos >= 0) { + if (mExcessiveCount == 0 && mExcessivePos < mOutputIndex) { + ++mExcessivePos; + } + if (mExcessivePos < mInputLength - 1) { + mExceeding = mExcessivePos == mInputIndex; + } } - bool skip = false; if (mSkipPos >= 0) { if (mSkippedCount == 0 && mSkipPos < mOutputIndex) { if (DEBUG_DICT) { @@ -198,42 +228,64 @@ Correction::CorrectionType Correction::processCharAndCalcState( } ++mSkipPos; } - skip = mSkipPos == mOutputIndex; - mSkipping = true; + mSkipping = mSkipPos == mOutputIndex; + } + + if (mTransposedPos >= 0) { + if (mTransposedCount == 0 && mTransposedPos < mOutputIndex) { + ++mTransposedPos; + } + if (mTransposedPos < mInputLength - 1) { + mTransposing = mInputIndex == mTransposedPos; + } } if (mNeedsToTraverseAllNodes || isQuote(c)) { return processSkipChar(c, isTerminal); } else { - int inputIndexForProximity = mInputIndex; - - if (mTransposedPos >= 0) { - if (mInputIndex == mTransposedPos) { - ++inputIndexForProximity; - } - if (mInputIndex == (mTransposedPos + 1)) { - --inputIndexForProximity; + bool secondTransposing = false; + if (mTransposedCount % 2 == 1) { + if (mProximityInfo->getMatchedProximityId(mInputIndex - 1, c, false) + == ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR) { + ++mTransposedCount; + secondTransposing = true; + } else if (mCorrectionStates[mOutputIndex].mExceeding) { + --mTransposedCount; + ++mExcessiveCount; + incrementInputIndex(); + } else { + --mTransposedCount; + return UNRELATED; } } // TODO: sum counters const bool checkProximityChars = !(mSkippedCount > 0 || mExcessivePos >= 0 || mTransposedPos >= 0); - int matchedProximityCharId = mProximityInfo->getMatchedProximityId( - inputIndexForProximity, c, checkProximityChars); + // TODO: do not check if second transposing + const int matchedProximityCharId = mProximityInfo->getMatchedProximityId( + mInputIndex, c, checkProximityChars); - if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) { - if (skip && mProximityCount == 0) { + if (!secondTransposing && ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) { + if (mInputIndex - 1 < mInputLength && (mExceeding || mTransposing) + && mProximityInfo->getMatchedProximityId(mInputIndex + 1, c, false) + == ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR) { + if (mTransposing) { + ++mTransposedCount; + } else { + ++mExcessiveCount; + incrementInputIndex(); + } + } else if (mSkipping && mProximityCount == 0) { // Skip this letter and continue deeper ++mSkippedCount; return processSkipChar(c, isTerminal); } else if (checkProximityChars - && inputIndexForProximity > 0 + && mInputIndex > 0 && mCorrectionStates[mOutputIndex].mProximityMatching && mCorrectionStates[mOutputIndex].mSkipping - && mProximityInfo->getMatchedProximityId( - inputIndexForProximity - 1, c, false) - == ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR) { + && mProximityInfo->getMatchedProximityId(mInputIndex - 1, c, false) + == ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR) { // Note: This logic tries saving cases like contrst --> contrast -- "a" is one of // proximity chars of "s", but it should rather be handled as a skipped char. ++mSkippedCount; @@ -242,7 +294,8 @@ Correction::CorrectionType Correction::processCharAndCalcState( } else { return UNRELATED; } - } else if (ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) { + } else if (secondTransposing + || ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) { // If inputIndex is greater than mInputLength, that means there is no // proximity chars. So, we don't need to check proximity. mMatching = true; @@ -253,10 +306,15 @@ Correction::CorrectionType Correction::processCharAndCalcState( mWord[mOutputIndex] = c; - const bool isSameAsUserTypedLength = mInputLength - == getInputIndex() + 1 - || (mExcessivePos == mInputLength - 1 - && getInputIndex() == mInputLength - 2); + mLastCharExceeded = mExcessiveCount == 0 && mSkippedCount == 0 + && mProximityCount == 0 && mTransposedCount == 0 + // TODO: remove this line once excessive correction is conmibned to others. + && mExcessivePos >= 0 && (mInputIndex == mInputLength - 2); + const bool isSameAsUserTypedLength = (mInputLength == mInputIndex + 1) || mLastCharExceeded; + if (mLastCharExceeded) { + // TODO: Decrement mExcessiveCount if next char is matched word. + ++mExcessiveCount; + } if (isSameAsUserTypedLength && isTerminal) { mTerminalInputIndex = mInputIndex; mTerminalOutputIndex = mOutputIndex; @@ -395,20 +453,33 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; const int fullWordMultiplier = correction->FULL_WORD_MULTIPLIER; const ProximityInfo *proximityInfo = correction->mProximityInfo; - const int skipCount = correction->mSkippedCount; + const int skippedCount = correction->mSkippedCount; + const int transposedCount = correction->mTransposedCount; + const int excessiveCount = correction->mExcessiveCount; const int proximityMatchedCount = correction->mProximityCount; - if (skipCount >= inputLength || inputLength == 0) { + const bool lastCharExceeded = correction->mLastCharExceeded; + if (skippedCount >= inputLength || inputLength == 0) { return -1; } - const bool sameLength = (excessivePos == inputLength - 1) ? (inputLength == inputIndex + 2) - : (inputLength == inputIndex + 1); + // TODO: remove + if (transposedPos >= 0 && transposedCount == 0) { + return -1; + } + + // TODO: remove + if (excessivePos >= 0 && excessiveCount == 0) { + return -1; + } + + const bool sameLength = lastCharExceeded ? (inputLength == inputIndex + 2) + : (inputLength == inputIndex + 1); // TODO: use mExcessiveCount int matchCount = inputLength - correction->mProximityCount - (excessivePos >= 0 ? 1 : 0); const unsigned short* word = correction->mWord; - const bool skipped = skipCount > 0; + const bool skipped = skippedCount > 0; const int quoteDiffCount = max(0, getQuoteCount(word, outputIndex + 1) - getQuoteCount(proximityInfo->getPrimaryInputWord(), inputLength)); @@ -417,6 +488,8 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const int matchWeight; int ed = 0; int adJustedProximityMatchedCount = proximityMatchedCount; + + // TODO: Optimize this. if (excessivePos < 0 && transposedPos < 0 && (proximityMatchedCount > 0 || skipped)) { const unsigned short* primaryInputWord = proximityInfo->getPrimaryInputWord(); ed = editDistance(editDistanceTable, primaryInputWord, @@ -475,7 +548,7 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); } - const int errorCount = proximityMatchedCount + skipCount; + const int errorCount = proximityMatchedCount + skippedCount; multiplyRate( 100 - CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE * errorCount / inputLength, &finalFreq); diff --git a/native/src/correction.h b/native/src/correction.h index 871a04251..3cd600cf0 100644 --- a/native/src/correction.h +++ b/native/src/correction.h @@ -113,8 +113,6 @@ private: int mMaxEditDistance; int mMaxDepth; int mInputLength; - int mExcessivePos; - int mTransposedPos; int mSpaceProximityPos; int mMissingSpacePos; int mTerminalInputIndex; @@ -126,15 +124,26 @@ private: CorrectionState mCorrectionStates[MAX_WORD_LENGTH_INTERNAL]; // The following member variables are being used as cache values of the correction state. + bool mNeedsToTraverseAllNodes; int mOutputIndex; int mInputIndex; + int mProximityCount; + int mExcessiveCount; + int mTransposedCount; int mSkippedCount; + + int mTransposedPos; + int mExcessivePos; int mSkipPos; - bool mNeedsToTraverseAllNodes; + + bool mLastCharExceeded; + bool mMatching; - bool mSkipping; bool mProximityMatching; + bool mExceeding; + bool mTransposing; + bool mSkipping; class RankingAlgorithm { public: diff --git a/native/src/correction_state.h b/native/src/correction_state.h index 267deda9b..93f8a8aab 100644 --- a/native/src/correction_state.h +++ b/native/src/correction_state.h @@ -28,12 +28,25 @@ struct CorrectionState { int mSiblingPos; uint16_t mChildCount; uint8_t mInputIndex; + uint8_t mProximityCount; + uint8_t mTransposedCount; + uint8_t mExcessiveCount; uint8_t mSkippedCount; + + int8_t mTransposedPos; + int8_t mExcessivePos; int8_t mSkipPos; // should be signed + + // TODO: int? + bool mLastCharExceeded; + bool mMatching; + bool mTransposing; + bool mExceeding; bool mSkipping; bool mProximityMatching; + bool mNeedsToTraverseAllNodes; }; @@ -43,14 +56,27 @@ inline static void initCorrectionState(CorrectionState *state, const int rootPos state->mParentIndex = -1; state->mChildCount = childCount; state->mInputIndex = 0; - state->mProximityCount = 0; state->mSiblingPos = rootPos; - state->mSkippedCount = 0; - state->mMatching = false; - state->mSkipping = false; - state->mProximityMatching = false; state->mNeedsToTraverseAllNodes = traverseAll; + + state->mTransposedPos = -1; + state->mExcessivePos = -1; state->mSkipPos = -1; + + + state->mProximityCount = 0; + state->mTransposedCount = 0; + state->mExcessiveCount = 0; + state->mSkippedCount = 0; + + state->mLastCharExceeded = false; + + state->mMatching = false; + state->mProximityMatching = false; + state->mTransposing = false; + state->mExceeding = false; + state->mSkipping = false; + } } // namespace latinime diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index 6bc350505..805e1cbb7 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -194,34 +194,27 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, PROF_START(2); // Suggestion with missing character - LOGI("--- Suggest missing characters"); + if (DEBUG_DICT) { + LOGI("--- Suggest missing characters"); + } getSuggestionCandidates(0, -1, -1); PROF_END(2); PROF_START(3); // Suggestion with excessive character - if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER - && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION) { - for (int i = 0; i < codesSize; ++i) { - if (DEBUG_DICT) { - LOGI("--- Suggest excessive characters %d", i); - } - getSuggestionCandidates(-1, i, -1); - } + if (DEBUG_DICT) { + LOGI("--- Suggest excessive characters"); } + getSuggestionCandidates(-1, 0, -1); PROF_END(3); PROF_START(4); // Suggestion with transposed characters // Only suggest words that length is mInputLength - if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) { - for (int i = 0; i < codesSize; ++i) { - if (DEBUG_DICT) { - LOGI("--- Suggest transposed characters %d", i); - } - getSuggestionCandidates(-1, -1, i); - } + if (DEBUG_DICT) { + LOGI("--- Suggest transposed characters"); } + getSuggestionCandidates(-1, -1, 0); PROF_END(4); PROF_START(5);