Do the transposed correction and the excessive correction by one loop

Change-Id: Idc7a3451a65f7b980e5c499e9083f67646b3a199
main
satok 2011-08-17 17:55:16 +09:00
parent 961453c3b3
commit 9db2097f7b
4 changed files with 165 additions and 64 deletions

View File

@ -56,17 +56,22 @@ void Correction::initCorrectionState(
const int rootPos, const int childCount, const bool traverseAll) { const int rootPos, const int childCount, const bool traverseAll) {
latinime::initCorrectionState(mCorrectionStates, rootPos, childCount, traverseAll); latinime::initCorrectionState(mCorrectionStates, rootPos, childCount, traverseAll);
// TODO: remove // TODO: remove
mCorrectionStates[0].mTransposedPos = mTransposedPos;
mCorrectionStates[0].mExcessivePos = mExcessivePos;
mCorrectionStates[0].mSkipPos = mSkipPos; mCorrectionStates[0].mSkipPos = mSkipPos;
} }
void Correction::setCorrectionParams(const int skipPos, const int excessivePos, void Correction::setCorrectionParams(const int skipPos, const int excessivePos,
const int transposedPos, const int spaceProximityPos, const int missingSpacePos) { const int transposedPos, const int spaceProximityPos, const int missingSpacePos) {
// TODO: remove // TODO: remove
mTransposedPos = transposedPos;
mExcessivePos = excessivePos;
mSkipPos = skipPos; mSkipPos = skipPos;
// TODO: remove // TODO: remove
mCorrectionStates[0].mTransposedPos = transposedPos;
mCorrectionStates[0].mExcessivePos = excessivePos;
mCorrectionStates[0].mSkipPos = skipPos; mCorrectionStates[0].mSkipPos = skipPos;
mExcessivePos = excessivePos;
mTransposedPos = transposedPos;
mSpaceProximityPos = spaceProximityPos; mSpaceProximityPos = spaceProximityPos;
mMissingSpacePos = missingSpacePos; mMissingSpacePos = missingSpacePos;
} }
@ -107,12 +112,23 @@ bool Correction::initProcessState(const int outputIndex) {
--(mCorrectionStates[outputIndex].mChildCount); --(mCorrectionStates[outputIndex].mChildCount);
mInputIndex = mCorrectionStates[outputIndex].mInputIndex; mInputIndex = mCorrectionStates[outputIndex].mInputIndex;
mNeedsToTraverseAllNodes = mCorrectionStates[outputIndex].mNeedsToTraverseAllNodes; mNeedsToTraverseAllNodes = mCorrectionStates[outputIndex].mNeedsToTraverseAllNodes;
mProximityCount = mCorrectionStates[outputIndex].mProximityCount; mProximityCount = mCorrectionStates[outputIndex].mProximityCount;
mTransposedCount = mCorrectionStates[outputIndex].mTransposedCount;
mExcessiveCount = mCorrectionStates[outputIndex].mExcessiveCount;
mSkippedCount = mCorrectionStates[outputIndex].mSkippedCount; mSkippedCount = mCorrectionStates[outputIndex].mSkippedCount;
mLastCharExceeded = mCorrectionStates[outputIndex].mLastCharExceeded;
mTransposedPos = mCorrectionStates[outputIndex].mTransposedPos;
mExcessivePos = mCorrectionStates[outputIndex].mExcessivePos;
mSkipPos = mCorrectionStates[outputIndex].mSkipPos; mSkipPos = mCorrectionStates[outputIndex].mSkipPos;
mSkipping = false;
mProximityMatching = false;
mMatching = false; mMatching = false;
mProximityMatching = false;
mTransposing = false;
mExceeding = false;
mSkipping = false;
return true; return true;
} }
@ -150,12 +166,23 @@ void Correction::incrementOutputIndex() {
mCorrectionStates[mOutputIndex].mSiblingPos = mCorrectionStates[mOutputIndex - 1].mSiblingPos; mCorrectionStates[mOutputIndex].mSiblingPos = mCorrectionStates[mOutputIndex - 1].mSiblingPos;
mCorrectionStates[mOutputIndex].mInputIndex = mInputIndex; mCorrectionStates[mOutputIndex].mInputIndex = mInputIndex;
mCorrectionStates[mOutputIndex].mNeedsToTraverseAllNodes = mNeedsToTraverseAllNodes; mCorrectionStates[mOutputIndex].mNeedsToTraverseAllNodes = mNeedsToTraverseAllNodes;
mCorrectionStates[mOutputIndex].mProximityCount = mProximityCount; mCorrectionStates[mOutputIndex].mProximityCount = mProximityCount;
mCorrectionStates[mOutputIndex].mTransposedCount = mTransposedCount;
mCorrectionStates[mOutputIndex].mExcessiveCount = mExcessiveCount;
mCorrectionStates[mOutputIndex].mSkippedCount = mSkippedCount; mCorrectionStates[mOutputIndex].mSkippedCount = mSkippedCount;
mCorrectionStates[mOutputIndex].mSkipping = mSkipping;
mCorrectionStates[mOutputIndex].mSkipPos = mSkipPos; mCorrectionStates[mOutputIndex].mSkipPos = mSkipPos;
mCorrectionStates[mOutputIndex].mTransposedPos = mTransposedPos;
mCorrectionStates[mOutputIndex].mExcessivePos = mExcessivePos;
mCorrectionStates[mOutputIndex].mLastCharExceeded = mLastCharExceeded;
mCorrectionStates[mOutputIndex].mMatching = mMatching; mCorrectionStates[mOutputIndex].mMatching = mMatching;
mCorrectionStates[mOutputIndex].mProximityMatching = mProximityMatching; mCorrectionStates[mOutputIndex].mProximityMatching = mProximityMatching;
mCorrectionStates[mOutputIndex].mTransposing = mTransposing;
mCorrectionStates[mOutputIndex].mExceeding = mExceeding;
mCorrectionStates[mOutputIndex].mSkipping = mSkipping;
} }
void Correction::startToTraverseAllNodes() { void Correction::startToTraverseAllNodes() {
@ -184,13 +211,16 @@ Correction::CorrectionType Correction::processSkipChar(
Correction::CorrectionType Correction::processCharAndCalcState( Correction::CorrectionType Correction::processCharAndCalcState(
const int32_t c, const bool isTerminal) { const int32_t c, const bool isTerminal) {
CorrectionType currentStateType = NOT_ON_TERMINAL; CorrectionType currentStateType = NOT_ON_TERMINAL;
// This has to be done for each virtual char (this forwards the "inputIndex" which
// is the index in the user-inputted chars, as read by proximity chars. if (mExcessivePos >= 0) {
if (mExcessivePos == mOutputIndex && mInputIndex < mInputLength - 1) { if (mExcessiveCount == 0 && mExcessivePos < mOutputIndex) {
incrementInputIndex(); ++mExcessivePos;
}
if (mExcessivePos < mInputLength - 1) {
mExceeding = mExcessivePos == mInputIndex;
}
} }
bool skip = false;
if (mSkipPos >= 0) { if (mSkipPos >= 0) {
if (mSkippedCount == 0 && mSkipPos < mOutputIndex) { if (mSkippedCount == 0 && mSkipPos < mOutputIndex) {
if (DEBUG_DICT) { if (DEBUG_DICT) {
@ -198,42 +228,64 @@ Correction::CorrectionType Correction::processCharAndCalcState(
} }
++mSkipPos; ++mSkipPos;
} }
skip = mSkipPos == mOutputIndex; mSkipping = mSkipPos == mOutputIndex;
mSkipping = true; }
if (mTransposedPos >= 0) {
if (mTransposedCount == 0 && mTransposedPos < mOutputIndex) {
++mTransposedPos;
}
if (mTransposedPos < mInputLength - 1) {
mTransposing = mInputIndex == mTransposedPos;
}
} }
if (mNeedsToTraverseAllNodes || isQuote(c)) { if (mNeedsToTraverseAllNodes || isQuote(c)) {
return processSkipChar(c, isTerminal); return processSkipChar(c, isTerminal);
} else { } else {
int inputIndexForProximity = mInputIndex; bool secondTransposing = false;
if (mTransposedCount % 2 == 1) {
if (mTransposedPos >= 0) { if (mProximityInfo->getMatchedProximityId(mInputIndex - 1, c, false)
if (mInputIndex == mTransposedPos) { == ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR) {
++inputIndexForProximity; ++mTransposedCount;
} secondTransposing = true;
if (mInputIndex == (mTransposedPos + 1)) { } else if (mCorrectionStates[mOutputIndex].mExceeding) {
--inputIndexForProximity; --mTransposedCount;
++mExcessiveCount;
incrementInputIndex();
} else {
--mTransposedCount;
return UNRELATED;
} }
} }
// TODO: sum counters // TODO: sum counters
const bool checkProximityChars = const bool checkProximityChars =
!(mSkippedCount > 0 || mExcessivePos >= 0 || mTransposedPos >= 0); !(mSkippedCount > 0 || mExcessivePos >= 0 || mTransposedPos >= 0);
int matchedProximityCharId = mProximityInfo->getMatchedProximityId( // TODO: do not check if second transposing
inputIndexForProximity, c, checkProximityChars); const int matchedProximityCharId = mProximityInfo->getMatchedProximityId(
mInputIndex, c, checkProximityChars);
if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) { if (!secondTransposing && ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) {
if (skip && mProximityCount == 0) { if (mInputIndex - 1 < mInputLength && (mExceeding || mTransposing)
&& mProximityInfo->getMatchedProximityId(mInputIndex + 1, c, false)
== ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR) {
if (mTransposing) {
++mTransposedCount;
} else {
++mExcessiveCount;
incrementInputIndex();
}
} else if (mSkipping && mProximityCount == 0) {
// Skip this letter and continue deeper // Skip this letter and continue deeper
++mSkippedCount; ++mSkippedCount;
return processSkipChar(c, isTerminal); return processSkipChar(c, isTerminal);
} else if (checkProximityChars } else if (checkProximityChars
&& inputIndexForProximity > 0 && mInputIndex > 0
&& mCorrectionStates[mOutputIndex].mProximityMatching && mCorrectionStates[mOutputIndex].mProximityMatching
&& mCorrectionStates[mOutputIndex].mSkipping && mCorrectionStates[mOutputIndex].mSkipping
&& mProximityInfo->getMatchedProximityId( && mProximityInfo->getMatchedProximityId(mInputIndex - 1, c, false)
inputIndexForProximity - 1, c, false) == ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR) {
== ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR) {
// Note: This logic tries saving cases like contrst --> contrast -- "a" is one of // Note: This logic tries saving cases like contrst --> contrast -- "a" is one of
// proximity chars of "s", but it should rather be handled as a skipped char. // proximity chars of "s", but it should rather be handled as a skipped char.
++mSkippedCount; ++mSkippedCount;
@ -242,7 +294,8 @@ Correction::CorrectionType Correction::processCharAndCalcState(
} else { } else {
return UNRELATED; return UNRELATED;
} }
} else if (ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) { } else if (secondTransposing
|| ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
// If inputIndex is greater than mInputLength, that means there is no // If inputIndex is greater than mInputLength, that means there is no
// proximity chars. So, we don't need to check proximity. // proximity chars. So, we don't need to check proximity.
mMatching = true; mMatching = true;
@ -253,10 +306,15 @@ Correction::CorrectionType Correction::processCharAndCalcState(
mWord[mOutputIndex] = c; mWord[mOutputIndex] = c;
const bool isSameAsUserTypedLength = mInputLength mLastCharExceeded = mExcessiveCount == 0 && mSkippedCount == 0
== getInputIndex() + 1 && mProximityCount == 0 && mTransposedCount == 0
|| (mExcessivePos == mInputLength - 1 // TODO: remove this line once excessive correction is combined to others.
&& getInputIndex() == mInputLength - 2); && mExcessivePos >= 0 && (mInputIndex == mInputLength - 2);
const bool isSameAsUserTypedLength = (mInputLength == mInputIndex + 1) || mLastCharExceeded;
if (mLastCharExceeded) {
// TODO: Decrement mExcessiveCount if next char is matched word.
++mExcessiveCount;
}
if (isSameAsUserTypedLength && isTerminal) { if (isSameAsUserTypedLength && isTerminal) {
mTerminalInputIndex = mInputIndex; mTerminalInputIndex = mInputIndex;
mTerminalOutputIndex = mOutputIndex; mTerminalOutputIndex = mOutputIndex;
@ -395,20 +453,33 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER; const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
const int fullWordMultiplier = correction->FULL_WORD_MULTIPLIER; const int fullWordMultiplier = correction->FULL_WORD_MULTIPLIER;
const ProximityInfo *proximityInfo = correction->mProximityInfo; const ProximityInfo *proximityInfo = correction->mProximityInfo;
const int skipCount = correction->mSkippedCount; const int skippedCount = correction->mSkippedCount;
const int transposedCount = correction->mTransposedCount;
const int excessiveCount = correction->mExcessiveCount;
const int proximityMatchedCount = correction->mProximityCount; const int proximityMatchedCount = correction->mProximityCount;
if (skipCount >= inputLength || inputLength == 0) { const bool lastCharExceeded = correction->mLastCharExceeded;
if (skippedCount >= inputLength || inputLength == 0) {
return -1; return -1;
} }
const bool sameLength = (excessivePos == inputLength - 1) ? (inputLength == inputIndex + 2)
: (inputLength == inputIndex + 1);
// TODO: remove
if (transposedPos >= 0 && transposedCount == 0) {
return -1;
}
// TODO: remove
if (excessivePos >= 0 && excessiveCount == 0) {
return -1;
}
const bool sameLength = lastCharExceeded ? (inputLength == inputIndex + 2)
: (inputLength == inputIndex + 1);
// TODO: use mExcessiveCount // TODO: use mExcessiveCount
int matchCount = inputLength - correction->mProximityCount - (excessivePos >= 0 ? 1 : 0); int matchCount = inputLength - correction->mProximityCount - (excessivePos >= 0 ? 1 : 0);
const unsigned short* word = correction->mWord; const unsigned short* word = correction->mWord;
const bool skipped = skipCount > 0; const bool skipped = skippedCount > 0;
const int quoteDiffCount = max(0, getQuoteCount(word, outputIndex + 1) const int quoteDiffCount = max(0, getQuoteCount(word, outputIndex + 1)
- getQuoteCount(proximityInfo->getPrimaryInputWord(), inputLength)); - getQuoteCount(proximityInfo->getPrimaryInputWord(), inputLength));
@ -417,6 +488,8 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
int matchWeight; int matchWeight;
int ed = 0; int ed = 0;
int adJustedProximityMatchedCount = proximityMatchedCount; int adJustedProximityMatchedCount = proximityMatchedCount;
// TODO: Optimize this.
if (excessivePos < 0 && transposedPos < 0 && (proximityMatchedCount > 0 || skipped)) { if (excessivePos < 0 && transposedPos < 0 && (proximityMatchedCount > 0 || skipped)) {
const unsigned short* primaryInputWord = proximityInfo->getPrimaryInputWord(); const unsigned short* primaryInputWord = proximityInfo->getPrimaryInputWord();
ed = editDistance(editDistanceTable, primaryInputWord, ed = editDistance(editDistanceTable, primaryInputWord,
@ -475,7 +548,7 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq); multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
} }
const int errorCount = proximityMatchedCount + skipCount; const int errorCount = proximityMatchedCount + skippedCount;
multiplyRate( multiplyRate(
100 - CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE * errorCount / inputLength, &finalFreq); 100 - CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE * errorCount / inputLength, &finalFreq);

View File

@ -113,8 +113,6 @@ private:
int mMaxEditDistance; int mMaxEditDistance;
int mMaxDepth; int mMaxDepth;
int mInputLength; int mInputLength;
int mExcessivePos;
int mTransposedPos;
int mSpaceProximityPos; int mSpaceProximityPos;
int mMissingSpacePos; int mMissingSpacePos;
int mTerminalInputIndex; int mTerminalInputIndex;
@ -126,15 +124,26 @@ private:
CorrectionState mCorrectionStates[MAX_WORD_LENGTH_INTERNAL]; CorrectionState mCorrectionStates[MAX_WORD_LENGTH_INTERNAL];
// The following member variables are being used as cache values of the correction state. // The following member variables are being used as cache values of the correction state.
bool mNeedsToTraverseAllNodes;
int mOutputIndex; int mOutputIndex;
int mInputIndex; int mInputIndex;
int mProximityCount; int mProximityCount;
int mExcessiveCount;
int mTransposedCount;
int mSkippedCount; int mSkippedCount;
int mTransposedPos;
int mExcessivePos;
int mSkipPos; int mSkipPos;
bool mNeedsToTraverseAllNodes;
bool mLastCharExceeded;
bool mMatching; bool mMatching;
bool mSkipping;
bool mProximityMatching; bool mProximityMatching;
bool mExceeding;
bool mTransposing;
bool mSkipping;
class RankingAlgorithm { class RankingAlgorithm {
public: public:

View File

@ -28,12 +28,25 @@ struct CorrectionState {
int mSiblingPos; int mSiblingPos;
uint16_t mChildCount; uint16_t mChildCount;
uint8_t mInputIndex; uint8_t mInputIndex;
uint8_t mProximityCount; uint8_t mProximityCount;
uint8_t mTransposedCount;
uint8_t mExcessiveCount;
uint8_t mSkippedCount; uint8_t mSkippedCount;
int8_t mTransposedPos;
int8_t mExcessivePos;
int8_t mSkipPos; // should be signed int8_t mSkipPos; // should be signed
// TODO: int?
bool mLastCharExceeded;
bool mMatching; bool mMatching;
bool mTransposing;
bool mExceeding;
bool mSkipping; bool mSkipping;
bool mProximityMatching; bool mProximityMatching;
bool mNeedsToTraverseAllNodes; bool mNeedsToTraverseAllNodes;
}; };
@ -43,14 +56,27 @@ inline static void initCorrectionState(CorrectionState *state, const int rootPos
state->mParentIndex = -1; state->mParentIndex = -1;
state->mChildCount = childCount; state->mChildCount = childCount;
state->mInputIndex = 0; state->mInputIndex = 0;
state->mProximityCount = 0;
state->mSiblingPos = rootPos; state->mSiblingPos = rootPos;
state->mSkippedCount = 0;
state->mMatching = false;
state->mSkipping = false;
state->mProximityMatching = false;
state->mNeedsToTraverseAllNodes = traverseAll; state->mNeedsToTraverseAllNodes = traverseAll;
state->mTransposedPos = -1;
state->mExcessivePos = -1;
state->mSkipPos = -1; state->mSkipPos = -1;
state->mProximityCount = 0;
state->mTransposedCount = 0;
state->mExcessiveCount = 0;
state->mSkippedCount = 0;
state->mLastCharExceeded = false;
state->mMatching = false;
state->mProximityMatching = false;
state->mTransposing = false;
state->mExceeding = false;
state->mSkipping = false;
} }
} // namespace latinime } // namespace latinime

View File

@ -194,34 +194,27 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
PROF_START(2); PROF_START(2);
// Suggestion with missing character // Suggestion with missing character
LOGI("--- Suggest missing characters"); if (DEBUG_DICT) {
LOGI("--- Suggest missing characters");
}
getSuggestionCandidates(0, -1, -1); getSuggestionCandidates(0, -1, -1);
PROF_END(2); PROF_END(2);
PROF_START(3); PROF_START(3);
// Suggestion with excessive character // Suggestion with excessive character
if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER if (DEBUG_DICT) {
&& mInputLength >= MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION) { LOGI("--- Suggest excessive characters");
for (int i = 0; i < codesSize; ++i) {
if (DEBUG_DICT) {
LOGI("--- Suggest excessive characters %d", i);
}
getSuggestionCandidates(-1, i, -1);
}
} }
getSuggestionCandidates(-1, 0, -1);
PROF_END(3); PROF_END(3);
PROF_START(4); PROF_START(4);
// Suggestion with transposed characters // Suggestion with transposed characters
// Only suggest words whose length is mInputLength // Only suggest words whose length is mInputLength
if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) { if (DEBUG_DICT) {
for (int i = 0; i < codesSize; ++i) { LOGI("--- Suggest transposed characters");
if (DEBUG_DICT) {
LOGI("--- Suggest transposed characters %d", i);
}
getSuggestionCandidates(-1, -1, i);
}
} }
getSuggestionCandidates(-1, -1, 0);
PROF_END(4); PROF_END(4);
PROF_START(5); PROF_START(5);