From 985312e88f11e3ce61f35191df59c6bdf9e80e79 Mon Sep 17 00:00:00 2001 From: satok Date: Fri, 5 Aug 2011 21:21:01 +0900 Subject: [PATCH] Refactor the correction algorithm related to missing character correction Change-Id: If68f2aaea7df48d013aea5401cee4ec0df32111a --- native/src/correction_state.cpp | 73 ++++++++++++++++++++++++--------- native/src/correction_state.h | 7 +++- native/src/proximity_info.cpp | 9 +--- native/src/proximity_info.h | 2 +- 4 files changed, 61 insertions(+), 30 deletions(-) diff --git a/native/src/correction_state.cpp b/native/src/correction_state.cpp index 9000e9e9c..0de11ce19 100644 --- a/native/src/correction_state.cpp +++ b/native/src/correction_state.cpp @@ -30,10 +30,9 @@ namespace latinime { ////////////////////// static const char QUOTE = '\''; -inline bool CorrectionState::needsToSkipCurrentNode(const unsigned short c) { +inline bool CorrectionState::isQuote(const unsigned short c) { const unsigned short userTypedChar = mProximityInfo->getPrimaryCharAt(mInputIndex); - // Skip the ' or other letter and continue deeper - return (c == QUOTE && userTypedChar != QUOTE) || mSkipPos == mOutputIndex; + return (c == QUOTE && userTypedChar != QUOTE); } ///////////////////// @@ -50,6 +49,7 @@ void CorrectionState::initCorrectionState(const ProximityInfo *pi, const int inp mInputLength = inputLength; mMaxDepth = maxDepth; mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2; + mSkippedOutputIndex = -1; } void CorrectionState::setCorrectionParams(const int skipPos, const int excessivePos, @@ -77,9 +77,8 @@ int CorrectionState::getFreqForSplitTwoWords(const int firstFreq, const int seco } int CorrectionState::getFinalFreq(const int freq, unsigned short **word, int *wordLength) { - const int outputIndex = mOutputIndex - 1; - const int inputIndex = (mCurrentStateType == TRAVERSE_ALL_ON_TERMINAL - || mCurrentStateType == TRAVERSE_ALL_NOT_ON_TERMINAL) ? mInputIndex : mInputIndex - 1; + const int outputIndex = mTerminalOutputIndex; + const int inputIndex = mTerminalInputIndex; *wordLength = outputIndex + 1; if (mProximityInfo->sameAsTyped(mWord, outputIndex + 1) || outputIndex < MIN_SUGGEST_DEPTH) { return -1; @@ -145,22 +144,36 @@ bool CorrectionState::needsToPrune() const { || mDiffs > mMaxEditDistance); } +CorrectionState::CorrectionStateType CorrectionState::processSkipChar( + const int32_t c, const bool isTerminal) { + mWord[mOutputIndex] = c; + if (needsToTraverseAll() && isTerminal) { + mTerminalInputIndex = mInputIndex; + mTerminalOutputIndex = mOutputIndex; + incrementOutputIndex(); + return TRAVERSE_ALL_ON_TERMINAL; + } else { + incrementOutputIndex(); + return TRAVERSE_ALL_NOT_ON_TERMINAL; + } +} + CorrectionState::CorrectionStateType CorrectionState::processCharAndCalcState( const int32_t c, const bool isTerminal) { - mCurrentStateType = NOT_ON_TERMINAL; + CorrectionStateType currentStateType = NOT_ON_TERMINAL; // This has to be done for each virtual char (this forwards the "inputIndex" which // is the index in the user-inputted chars, as read by proximity chars. if (mExcessivePos == mOutputIndex && mInputIndex < mInputLength - 1) { incrementInputIndex(); } - if (mTraverseAllNodes || needsToSkipCurrentNode(c)) { - mWord[mOutputIndex] = c; - if (needsToTraverseAll() && isTerminal) { - mCurrentStateType = TRAVERSE_ALL_ON_TERMINAL; - } else { - mCurrentStateType = TRAVERSE_ALL_NOT_ON_TERMINAL; - } + bool skip = false; + if (mSkipPos >= 0) { + skip = mSkipPos == mOutputIndex; + } + + if (mTraverseAllNodes || isQuote(c)) { + return processSkipChar(c, isTerminal); } else { int inputIndexForProximity = mInputIndex; @@ -173,12 +186,30 @@ CorrectionState::CorrectionStateType CorrectionState::processCharAndCalcState( } } + const bool checkProximityChars = + !(mSkipPos >= 0 || mExcessivePos >= 0 || mTransposedPos >= 0); int matchedProximityCharId = mProximityInfo->getMatchedProximityId( - inputIndexForProximity, c, this); - if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) { - mCurrentStateType = UNRELATED; - return mCurrentStateType; + inputIndexForProximity, c, checkProximityChars); + + const bool unrelated = ProximityInfo::UNRELATED_CHAR == matchedProximityCharId; + if (unrelated) { + if (skip) { + // Skip this letter and continue deeper + mSkippedOutputIndex = mOutputIndex; + return processSkipChar(c, isTerminal); + } else { + return UNRELATED; + } } + + // No need to skip. Finish traversing and increment skipPos. + // TODO: Remove this? + if (skip) { + mWord[mOutputIndex] = c; + incrementOutputIndex(); + return TRAVERSE_ALL_NOT_ON_TERMINAL; + } + mWord[mOutputIndex] = c; // If inputIndex is greater than mInputLength, that means there is no // proximity chars. So, we don't need to check proximity. @@ -195,7 +226,9 @@ CorrectionState::CorrectionStateType CorrectionState::processCharAndCalcState( || (mExcessivePos == mInputLength - 1 && getInputIndex() == mInputLength - 2); if (isSameAsUserTypedLength && isTerminal) { - mCurrentStateType = ON_TERMINAL; + mTerminalInputIndex = mInputIndex; + mTerminalOutputIndex = mOutputIndex; + currentStateType = ON_TERMINAL; } // Start traversing all nodes after the index exceeds the user typed length if (isSameAsUserTypedLength) { @@ -213,7 +246,7 @@ CorrectionState::CorrectionStateType CorrectionState::processCharAndCalcState( // Also, the next char is one "virtual node" depth more than this char. incrementOutputIndex(); - return mCurrentStateType; + return currentStateType; } CorrectionState::~CorrectionState() { diff --git a/native/src/correction_state.h b/native/src/correction_state.h index a548bcb68..7ea5aa37d 100644 --- a/native/src/correction_state.h +++ b/native/src/correction_state.h @@ -101,6 +101,7 @@ private: int mMaxDepth; int mInputLength; int mSkipPos; + int mSkippedOutputIndex; int mExcessivePos; int mTransposedPos; int mSpaceProximityPos; @@ -109,12 +110,14 @@ private: int mMatchedCharCount; int mInputIndex; int mOutputIndex; + int mTerminalInputIndex; + int mTerminalOutputIndex; int mDiffs; bool mTraverseAllNodes; - CorrectionStateType mCurrentStateType; unsigned short mWord[MAX_WORD_LENGTH_INTERNAL]; - inline bool needsToSkipCurrentNode(const unsigned short c); + inline bool isQuote(const unsigned short c); + inline CorrectionStateType processSkipChar(const int32_t c, const bool isTerminal); class RankingAlgorithm { public: diff --git a/native/src/proximity_info.cpp b/native/src/proximity_info.cpp index bed92cf9e..d437e251a 100644 --- a/native/src/proximity_info.cpp +++ b/native/src/proximity_info.cpp @@ -114,10 +114,7 @@ bool ProximityInfo::existsAdjacentProximityChars(const int index) const { // in their list. The non-accented version of the character should be considered // "close", but not the other keys close to the non-accented version. ProximityInfo::ProximityType ProximityInfo::getMatchedProximityId( - const int index, const unsigned short c, CorrectionState *correctionState) const { - const int skipPos = correctionState->getSkipPos(); - const int excessivePos = correctionState->getExcessivePos(); - const int transposedPos = correctionState->getTransposedPos(); + const int index, const unsigned short c, const bool checkProximityChars) const { const int *currentChars = getProximityCharsAt(index); const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c); @@ -126,9 +123,7 @@ ProximityInfo::ProximityType ProximityInfo::getMatchedProximityId( if (currentChars[0] == baseLowerC || currentChars[0] == c) return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR; - // If one of those is true, we should not check for close characters at all. - if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) - return UNRELATED_CHAR; + if (!checkProximityChars) return UNRELATED_CHAR; // If the non-accented, lowercased version of that first character matches c, // then we have a non-accented version of the accented character the user diff --git a/native/src/proximity_info.h b/native/src/proximity_info.h index b28191d01..a9477e41a 100644 --- a/native/src/proximity_info.h +++ b/native/src/proximity_info.h @@ -44,7 +44,7 @@ public: bool existsCharInProximityAt(const int index, const int c) const; bool existsAdjacentProximityChars(const int index) const; ProximityType getMatchedProximityId( - const int index, const unsigned short c, CorrectionState *correctionState) const; + const int index, const unsigned short c, const bool checkProximityChars) const; bool sameAsTyped(const unsigned short *word, int length) const; private: int getStartIndexFromCoordinates(const int x, const int y) const;