From 4d355989bd972ba792ba546a55c67e5b6fc2527a Mon Sep 17 00:00:00 2001 From: satok Date: Thu, 15 Dec 2011 14:53:19 +0900 Subject: [PATCH] Add a functionality to limit the max correction errors Before ==== test finished, terminate logcat ===== (0) 121.97 (0.28%) (1) 42032.07 (95.46%) (2) 11.03 (0.03%) (3) 12.19 (0.03%) (4) 10.02 (0.02%) (5) 1417.41 (3.22%) (6) 258.43 (0.59%) (20) 50.20 (0.11%) Total 44033.07 (sum of others 43913.32) After ==== test finished, terminate logcat ===== (0) 110.81 (0.29%) (1) 36416.11 (94.47%) (2) 10.06 (0.03%) (3) 9.45 (0.02%) (4) 9.83 (0.03%) (5) 1535.52 (3.98%) (6) 290.25 (0.75%) (20) 40.57 (0.11%) Total 38546.83 (sum of others 38422.60) Change-Id: Iffd24ce0b2dc422c8c6085d5be5f6bfdaf59ca7d --- native/src/correction.cpp | 100 +++++++++++++++++------------- native/src/correction.h | 3 +- native/src/unigram_dictionary.cpp | 6 +- native/src/unigram_dictionary.h | 4 ++ 4 files changed, 65 insertions(+), 48 deletions(-) diff --git a/native/src/correction.cpp b/native/src/correction.cpp index 364913f38..2da82dc3d 100644 --- a/native/src/correction.cpp +++ b/native/src/correction.cpp @@ -32,48 +32,6 @@ namespace latinime { // edit distance funcitons // ///////////////////////////// -#if 0 /* no longer used */ -inline static int editDistance( - int* editDistanceTable, const unsigned short* input, - const int inputLength, const unsigned short* output, const int outputLength) { - // dp[li][lo] dp[a][b] = dp[ a * lo + b] - int* dp = editDistanceTable; - const int li = inputLength + 1; - const int lo = outputLength + 1; - for (int i = 0; i < li; ++i) { - dp[lo * i] = i; - } - for (int i = 0; i < lo; ++i) { - dp[i] = i; - } - - for (int i = 0; i < li - 1; ++i) { - for (int j = 0; j < lo - 1; ++j) { - const uint32_t ci = toBaseLowerCase(input[i]); - const uint32_t co = toBaseLowerCase(output[j]); - const uint16_t cost = (ci == co) ? 0 : 1; - dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1, - min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost)); - if (i > 0 && j > 0 && ci == toBaseLowerCase(output[j - 1]) - && co == toBaseLowerCase(input[i - 1])) { - dp[(i + 1) * lo + (j + 1)] = min( - dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost); - } - } - } - - if (DEBUG_EDIT_DISTANCE) { - LOGI("IN = %d, OUT = %d", inputLength, outputLength); - for (int i = 0; i < li; ++i) { - for (int j = 0; j < lo; ++j) { - LOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]); - } - } - } - return dp[li * lo - 1]; -} -#endif - inline static void initEditDistance(int *editDistanceTable) { for (int i = 0; i <= MAX_WORD_LENGTH_INTERNAL; ++i) { editDistanceTable[i] = i; @@ -145,7 +103,7 @@ void Correction::initCorrectionState( void Correction::setCorrectionParams(const int skipPos, const int excessivePos, const int transposedPos, const int spaceProximityPos, const int missingSpacePos, - const bool useFullEditDistance, const bool doAutoCompletion) { + const bool useFullEditDistance, const bool doAutoCompletion, const int maxErrors) { // TODO: remove mTransposedPos = transposedPos; mExcessivePos = excessivePos; @@ -159,6 +117,7 @@ void Correction::setCorrectionParams(const int skipPos, const int excessivePos, mMissingSpacePos = missingSpacePos; mUseFullEditDistance = useFullEditDistance; mDoAutoCompletion = doAutoCompletion; + mMaxErrors = maxErrors; } void Correction::checkState() { @@ -314,12 +273,17 @@ inline bool isEquivalentChar(ProximityInfo::ProximityType type) { Correction::CorrectionType Correction::processCharAndCalcState( const int32_t c, const bool isTerminal) { const int correctionCount = (mSkippedCount + mExcessiveCount + mTransposedCount); + if (correctionCount > mMaxErrors) { + return UNRELATED; + } + // TODO: Change the limit if we'll allow two or more corrections const bool noCorrectionsHappenedSoFar = correctionCount == 0; const bool canTryCorrection = noCorrectionsHappenedSoFar; int proximityIndex = 0; mDistances[mOutputIndex] = NOT_A_DISTANCE; + // Skip checking this node if (mNeedsToTraverseAllNodes || isQuote(c)) { bool incremented = false; if (mLastCharExceeded && mInputIndex == mInputLength - 1) { @@ -344,6 +308,7 @@ Correction::CorrectionType Correction::processCharAndCalcState( return processSkipChar(c, isTerminal, incremented); } + // Check possible corrections. if (mExcessivePos >= 0) { if (mExcessiveCount == 0 && mExcessivePos < mOutputIndex) { mExcessivePos = mOutputIndex; @@ -394,7 +359,12 @@ Correction::CorrectionType Correction::processCharAndCalcState( } // TODO: Change the limit if we'll allow two or more proximity chars with corrections - const bool checkProximityChars = noCorrectionsHappenedSoFar || mProximityCount == 0; + // Work around: When the mMaxErrors is 1, we only allow just one error + // including proximity correction. + const bool checkProximityChars = (mMaxErrors > 1) + ? (noCorrectionsHappenedSoFar || mProximityCount == 0) + : (noCorrectionsHappenedSoFar && mProximityCount == 0); + ProximityInfo::ProximityType matchedProximityCharId = secondTransposing ? ProximityInfo::EQUIVALENT_CHAR : mProximityInfo->getMatchedProximityId( @@ -934,4 +904,46 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords( return totalFreq; } +#if 0 /* no longer used. keep just for reference */ +inline static int editDistance( + int* editDistanceTable, const unsigned short* input, + const int inputLength, const unsigned short* output, const int outputLength) { + // dp[li][lo] dp[a][b] = dp[ a * lo + b] + int* dp = editDistanceTable; + const int li = inputLength + 1; + const int lo = outputLength + 1; + for (int i = 0; i < li; ++i) { + dp[lo * i] = i; + } + for (int i = 0; i < lo; ++i) { + dp[i] = i; + } + + for (int i = 0; i < li - 1; ++i) { + for (int j = 0; j < lo - 1; ++j) { + const uint32_t ci = toBaseLowerCase(input[i]); + const uint32_t co = toBaseLowerCase(output[j]); + const uint16_t cost = (ci == co) ? 0 : 1; + dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1, + min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost)); + if (i > 0 && j > 0 && ci == toBaseLowerCase(output[j - 1]) + && co == toBaseLowerCase(input[i - 1])) { + dp[(i + 1) * lo + (j + 1)] = min( + dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost); + } + } + } + + if (DEBUG_EDIT_DISTANCE) { + LOGI("IN = %d, OUT = %d", inputLength, outputLength); + for (int i = 0; i < li; ++i) { + for (int j = 0; j < lo; ++j) { + LOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]); + } + } + } + return dp[li * lo - 1]; +} +#endif + } // namespace latinime diff --git a/native/src/correction.h b/native/src/correction.h index 4a8d1fab7..e55be8dd6 100644 --- a/native/src/correction.h +++ b/native/src/correction.h @@ -45,7 +45,7 @@ public: // TODO: remove void setCorrectionParams(const int skipPos, const int excessivePos, const int transposedPos, const int spaceProximityPos, const int missingSpacePos, const bool useFullEditDistance, - const bool doAutoCompletion); + const bool doAutoCompletion, const int maxErrors); void checkState(); bool initProcessState(const int index); @@ -118,6 +118,7 @@ private: int mMissingSpacePos; int mTerminalInputIndex; int mTerminalOutputIndex; + int mMaxErrors; // The following arrays are state buffer. unsigned short mWord[MAX_WORD_LENGTH_INTERNAL]; diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index 7cf191970..ac9f53ed2 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -261,7 +261,7 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, // TODO: Remove setCorrectionParams correction->setCorrectionParams(0, 0, 0, -1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance, - true /* doAutoCompletion */); + true /* doAutoCompletion */, DEFAULT_MAX_ERRORS); int rootPosition = ROOT_POS; // Get the number of children of root, then increment the position int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition); @@ -296,7 +296,7 @@ void UnigramDictionary::getMissingSpaceWords( Correction *correction, const bool useFullEditDistance, WordsPriorityQueue *queue) { correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */, -1 /* transposedPos */, -1 /* spaceProximityPos */, missingSpacePos, - useFullEditDistance, true /* doAutoCompletion */); + useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS); getSplitTwoWordsSuggestion(inputLength, proximityInfo, correction, queue); } @@ -305,7 +305,7 @@ void UnigramDictionary::getMistypedSpaceWords( Correction *correction, const bool useFullEditDistance, WordsPriorityQueue *queue) { correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */, -1 /* transposedPos */, spaceProximityPos, -1 /* missingSpacePos */, - useFullEditDistance, true /* doAutoCompletion */); + useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS); getSplitTwoWordsSuggestion(inputLength, proximityInfo, correction, queue); } diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h index 0b0126524..f5cb43860 100644 --- a/native/src/unigram_dictionary.h +++ b/native/src/unigram_dictionary.h @@ -61,6 +61,10 @@ public: static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; + // Error tolerances + static const int DEFAULT_MAX_ERRORS = 2; + static const int MAX_ERRORS_FOR_TWO_WORDS = 1; + UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler, int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars, const bool isLatestDictVersion);