Refactor the correction algorithm related to missing character correction

Change-Id: If68f2aaea7df48d013aea5401cee4ec0df32111a
main
satok 2011-08-05 21:21:01 +09:00
parent 44beeab4a4
commit 985312e88f
4 changed files with 61 additions and 30 deletions

View File

@ -30,10 +30,9 @@ namespace latinime {
////////////////////// //////////////////////
static const char QUOTE = '\''; static const char QUOTE = '\'';
inline bool CorrectionState::needsToSkipCurrentNode(const unsigned short c) { inline bool CorrectionState::isQuote(const unsigned short c) {
const unsigned short userTypedChar = mProximityInfo->getPrimaryCharAt(mInputIndex); const unsigned short userTypedChar = mProximityInfo->getPrimaryCharAt(mInputIndex);
// Skip the ' or other letter and continue deeper return (c == QUOTE && userTypedChar != QUOTE);
return (c == QUOTE && userTypedChar != QUOTE) || mSkipPos == mOutputIndex;
} }
///////////////////// /////////////////////
@ -50,6 +49,7 @@ void CorrectionState::initCorrectionState(const ProximityInfo *pi, const int inp
mInputLength = inputLength; mInputLength = inputLength;
mMaxDepth = maxDepth; mMaxDepth = maxDepth;
mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2; mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
mSkippedOutputIndex = -1;
} }
void CorrectionState::setCorrectionParams(const int skipPos, const int excessivePos, void CorrectionState::setCorrectionParams(const int skipPos, const int excessivePos,
@ -77,9 +77,8 @@ int CorrectionState::getFreqForSplitTwoWords(const int firstFreq, const int seco
} }
int CorrectionState::getFinalFreq(const int freq, unsigned short **word, int *wordLength) { int CorrectionState::getFinalFreq(const int freq, unsigned short **word, int *wordLength) {
const int outputIndex = mOutputIndex - 1; const int outputIndex = mTerminalOutputIndex;
const int inputIndex = (mCurrentStateType == TRAVERSE_ALL_ON_TERMINAL const int inputIndex = mTerminalInputIndex;
|| mCurrentStateType == TRAVERSE_ALL_NOT_ON_TERMINAL) ? mInputIndex : mInputIndex - 1;
*wordLength = outputIndex + 1; *wordLength = outputIndex + 1;
if (mProximityInfo->sameAsTyped(mWord, outputIndex + 1) || outputIndex < MIN_SUGGEST_DEPTH) { if (mProximityInfo->sameAsTyped(mWord, outputIndex + 1) || outputIndex < MIN_SUGGEST_DEPTH) {
return -1; return -1;
@ -145,22 +144,36 @@ bool CorrectionState::needsToPrune() const {
|| mDiffs > mMaxEditDistance); || mDiffs > mMaxEditDistance);
} }
CorrectionState::CorrectionStateType CorrectionState::processSkipChar(
const int32_t c, const bool isTerminal) {
mWord[mOutputIndex] = c;
if (needsToTraverseAll() && isTerminal) {
mTerminalInputIndex = mInputIndex;
mTerminalOutputIndex = mOutputIndex;
incrementOutputIndex();
return TRAVERSE_ALL_ON_TERMINAL;
} else {
incrementOutputIndex();
return TRAVERSE_ALL_NOT_ON_TERMINAL;
}
}
CorrectionState::CorrectionStateType CorrectionState::processCharAndCalcState( CorrectionState::CorrectionStateType CorrectionState::processCharAndCalcState(
const int32_t c, const bool isTerminal) { const int32_t c, const bool isTerminal) {
mCurrentStateType = NOT_ON_TERMINAL; CorrectionStateType currentStateType = NOT_ON_TERMINAL;
// This has to be done for each virtual char (this forwards the "inputIndex" which // This has to be done for each virtual char (this forwards the "inputIndex" which
// is the index in the user-inputted chars, as read by proximity chars. // is the index in the user-inputted chars, as read by proximity chars.
if (mExcessivePos == mOutputIndex && mInputIndex < mInputLength - 1) { if (mExcessivePos == mOutputIndex && mInputIndex < mInputLength - 1) {
incrementInputIndex(); incrementInputIndex();
} }
if (mTraverseAllNodes || needsToSkipCurrentNode(c)) { bool skip = false;
mWord[mOutputIndex] = c; if (mSkipPos >= 0) {
if (needsToTraverseAll() && isTerminal) { skip = mSkipPos == mOutputIndex;
mCurrentStateType = TRAVERSE_ALL_ON_TERMINAL; }
} else {
mCurrentStateType = TRAVERSE_ALL_NOT_ON_TERMINAL; if (mTraverseAllNodes || isQuote(c)) {
} return processSkipChar(c, isTerminal);
} else { } else {
int inputIndexForProximity = mInputIndex; int inputIndexForProximity = mInputIndex;
@ -173,12 +186,30 @@ CorrectionState::CorrectionStateType CorrectionState::processCharAndCalcState(
} }
} }
const bool checkProximityChars =
!(mSkipPos >= 0 || mExcessivePos >= 0 || mTransposedPos >= 0);
int matchedProximityCharId = mProximityInfo->getMatchedProximityId( int matchedProximityCharId = mProximityInfo->getMatchedProximityId(
inputIndexForProximity, c, this); inputIndexForProximity, c, checkProximityChars);
if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) {
mCurrentStateType = UNRELATED; const bool unrelated = ProximityInfo::UNRELATED_CHAR == matchedProximityCharId;
return mCurrentStateType; if (unrelated) {
if (skip) {
// Skip this letter and continue deeper
mSkippedOutputIndex = mOutputIndex;
return processSkipChar(c, isTerminal);
} else {
return UNRELATED;
}
} }
// No need to skip. Finish traversing and increment skipPos.
// TODO: Remove this?
if (skip) {
mWord[mOutputIndex] = c;
incrementOutputIndex();
return TRAVERSE_ALL_NOT_ON_TERMINAL;
}
mWord[mOutputIndex] = c; mWord[mOutputIndex] = c;
// If inputIndex is greater than mInputLength, that means there is no // If inputIndex is greater than mInputLength, that means there is no
// proximity chars. So, we don't need to check proximity. // proximity chars. So, we don't need to check proximity.
@ -195,7 +226,9 @@ CorrectionState::CorrectionStateType CorrectionState::processCharAndCalcState(
|| (mExcessivePos == mInputLength - 1 || (mExcessivePos == mInputLength - 1
&& getInputIndex() == mInputLength - 2); && getInputIndex() == mInputLength - 2);
if (isSameAsUserTypedLength && isTerminal) { if (isSameAsUserTypedLength && isTerminal) {
mCurrentStateType = ON_TERMINAL; mTerminalInputIndex = mInputIndex;
mTerminalOutputIndex = mOutputIndex;
currentStateType = ON_TERMINAL;
} }
// Start traversing all nodes after the index exceeds the user typed length // Start traversing all nodes after the index exceeds the user typed length
if (isSameAsUserTypedLength) { if (isSameAsUserTypedLength) {
@ -213,7 +246,7 @@ CorrectionState::CorrectionStateType CorrectionState::processCharAndCalcState(
// Also, the next char is one "virtual node" depth more than this char. // Also, the next char is one "virtual node" depth more than this char.
incrementOutputIndex(); incrementOutputIndex();
return mCurrentStateType; return currentStateType;
} }
CorrectionState::~CorrectionState() { CorrectionState::~CorrectionState() {

View File

@ -101,6 +101,7 @@ private:
int mMaxDepth; int mMaxDepth;
int mInputLength; int mInputLength;
int mSkipPos; int mSkipPos;
int mSkippedOutputIndex;
int mExcessivePos; int mExcessivePos;
int mTransposedPos; int mTransposedPos;
int mSpaceProximityPos; int mSpaceProximityPos;
@ -109,12 +110,14 @@ private:
int mMatchedCharCount; int mMatchedCharCount;
int mInputIndex; int mInputIndex;
int mOutputIndex; int mOutputIndex;
int mTerminalInputIndex;
int mTerminalOutputIndex;
int mDiffs; int mDiffs;
bool mTraverseAllNodes; bool mTraverseAllNodes;
CorrectionStateType mCurrentStateType;
unsigned short mWord[MAX_WORD_LENGTH_INTERNAL]; unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];
inline bool needsToSkipCurrentNode(const unsigned short c); inline bool isQuote(const unsigned short c);
inline CorrectionStateType processSkipChar(const int32_t c, const bool isTerminal);
class RankingAlgorithm { class RankingAlgorithm {
public: public:

View File

@ -114,10 +114,7 @@ bool ProximityInfo::existsAdjacentProximityChars(const int index) const {
// in their list. The non-accented version of the character should be considered // in their list. The non-accented version of the character should be considered
// "close", but not the other keys close to the non-accented version. // "close", but not the other keys close to the non-accented version.
ProximityInfo::ProximityType ProximityInfo::getMatchedProximityId( ProximityInfo::ProximityType ProximityInfo::getMatchedProximityId(
const int index, const unsigned short c, CorrectionState *correctionState) const { const int index, const unsigned short c, const bool checkProximityChars) const {
const int skipPos = correctionState->getSkipPos();
const int excessivePos = correctionState->getExcessivePos();
const int transposedPos = correctionState->getTransposedPos();
const int *currentChars = getProximityCharsAt(index); const int *currentChars = getProximityCharsAt(index);
const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c); const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c);
@ -126,9 +123,7 @@ ProximityInfo::ProximityType ProximityInfo::getMatchedProximityId(
if (currentChars[0] == baseLowerC || currentChars[0] == c) if (currentChars[0] == baseLowerC || currentChars[0] == c)
return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR; return SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR;
// If one of those is true, we should not check for close characters at all. if (!checkProximityChars) return UNRELATED_CHAR;
if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0)
return UNRELATED_CHAR;
// If the non-accented, lowercased version of that first character matches c, // If the non-accented, lowercased version of that first character matches c,
// then we have a non-accented version of the accented character the user // then we have a non-accented version of the accented character the user

View File

@ -44,7 +44,7 @@ public:
bool existsCharInProximityAt(const int index, const int c) const; bool existsCharInProximityAt(const int index, const int c) const;
bool existsAdjacentProximityChars(const int index) const; bool existsAdjacentProximityChars(const int index) const;
ProximityType getMatchedProximityId( ProximityType getMatchedProximityId(
const int index, const unsigned short c, CorrectionState *correctionState) const; const int index, const unsigned short c, const bool checkProximityChars) const;
bool sameAsTyped(const unsigned short *word, int length) const; bool sameAsTyped(const unsigned short *word, int length) const;
private: private:
int getStartIndexFromCoordinates(const int x, const int y) const; int getStartIndexFromCoordinates(const int x, const int y) const;