Add functionality to limit the maximum number of correction errors

Before
==== test finished, terminate logcat =====
(0)  121.97 (0.28%)
(1)  42032.07 (95.46%)
(2)  11.03 (0.03%)
(3)  12.19 (0.03%)
(4)  10.02 (0.02%)
(5)  1417.41 (3.22%)
(6)  258.43 (0.59%)
(20) 50.20 (0.11%)
Total 44033.07 (sum of others 43913.32)

After
==== test finished, terminate logcat =====
(0)  110.81 (0.29%)
(1)  36416.11 (94.47%)
(2)  10.06 (0.03%)
(3)  9.45 (0.02%)
(4)  9.83 (0.03%)
(5)  1535.52 (3.98%)
(6)  290.25 (0.75%)
(20) 40.57 (0.11%)
Total 38546.83 (sum of others 38422.60)

Change-Id: Iffd24ce0b2dc422c8c6085d5be5f6bfdaf59ca7d
This commit is contained in:
satok 2011-12-15 14:53:19 +09:00
parent d03317c4be
commit 4d355989bd
4 changed files with 65 additions and 48 deletions

View file

@ -32,48 +32,6 @@ namespace latinime {
// edit distance functions // // edit distance functions //
///////////////////////////// /////////////////////////////
#if 0 /* no longer used */
// Computes the edit distance between input and output. Characters are compared
// after being passed through toBaseLowerCase(). Inserting, deleting or replacing
// a character costs 1, and a swap of two adjacent characters is also considered.
//
// editDistanceTable is used as scratch space, addressed as a row-major matrix of
// (inputLength + 1) rows by (outputLength + 1) columns, so it must be able to
// hold that many ints.
//
// Returns the value stored in the last cell of that matrix.
inline static int editDistance(
        int* editDistanceTable, const unsigned short* input,
        const int inputLength, const unsigned short* output, const int outputLength) {
    // table[r][c] is stored at table[r * colCount + c]
    int* const table = editDistanceTable;
    const int rowCount = inputLength + 1;
    const int colCount = outputLength + 1;

    // Boundary conditions: first column, then first row.
    for (int row = 0; row < rowCount; ++row) {
        table[row * colCount] = row;
    }
    for (int col = 0; col < colCount; ++col) {
        table[col] = col;
    }

    // Fill the remaining cells; (row, col) is the cell being computed, which
    // corresponds to input[row - 1] and output[col - 1].
    for (int row = 1; row < rowCount; ++row) {
        for (int col = 1; col < colCount; ++col) {
            const uint32_t inChar = toBaseLowerCase(input[row - 1]);
            const uint32_t outChar = toBaseLowerCase(output[col - 1]);
            const uint16_t cost = (inChar == outChar) ? 0 : 1;

            const int above = table[(row - 1) * colCount + col] + 1;
            const int left = table[row * colCount + (col - 1)] + 1;
            const int diagonal = table[(row - 1) * colCount + (col - 1)] + cost;
            int best = min(above, min(left, diagonal));

            // Adjacent characters swapped between input and output.
            const bool hasPrevious = (row > 1) && (col > 1);
            if (hasPrevious && inChar == toBaseLowerCase(output[col - 2])
                    && outChar == toBaseLowerCase(input[row - 2])) {
                const int swapped = table[(row - 2) * colCount + (col - 2)] + cost;
                best = min(best, swapped);
            }
            table[row * colCount + col] = best;
        }
    }

    if (DEBUG_EDIT_DISTANCE) {
        LOGI("IN = %d, OUT = %d", inputLength, outputLength);
        for (int row = 0; row < rowCount; ++row) {
            for (int col = 0; col < colCount; ++col) {
                LOGI("EDIT[%d][%d], %d", row, col, table[row * colCount + col]);
            }
        }
    }
    return table[rowCount * colCount - 1];
}
#endif
inline static void initEditDistance(int *editDistanceTable) { inline static void initEditDistance(int *editDistanceTable) {
for (int i = 0; i <= MAX_WORD_LENGTH_INTERNAL; ++i) { for (int i = 0; i <= MAX_WORD_LENGTH_INTERNAL; ++i) {
editDistanceTable[i] = i; editDistanceTable[i] = i;
@ -145,7 +103,7 @@ void Correction::initCorrectionState(
void Correction::setCorrectionParams(const int skipPos, const int excessivePos, void Correction::setCorrectionParams(const int skipPos, const int excessivePos,
const int transposedPos, const int spaceProximityPos, const int missingSpacePos, const int transposedPos, const int spaceProximityPos, const int missingSpacePos,
const bool useFullEditDistance, const bool doAutoCompletion) { const bool useFullEditDistance, const bool doAutoCompletion, const int maxErrors) {
// TODO: remove // TODO: remove
mTransposedPos = transposedPos; mTransposedPos = transposedPos;
mExcessivePos = excessivePos; mExcessivePos = excessivePos;
@ -159,6 +117,7 @@ void Correction::setCorrectionParams(const int skipPos, const int excessivePos,
mMissingSpacePos = missingSpacePos; mMissingSpacePos = missingSpacePos;
mUseFullEditDistance = useFullEditDistance; mUseFullEditDistance = useFullEditDistance;
mDoAutoCompletion = doAutoCompletion; mDoAutoCompletion = doAutoCompletion;
mMaxErrors = maxErrors;
} }
void Correction::checkState() { void Correction::checkState() {
@ -314,12 +273,17 @@ inline bool isEquivalentChar(ProximityInfo::ProximityType type) {
Correction::CorrectionType Correction::processCharAndCalcState( Correction::CorrectionType Correction::processCharAndCalcState(
const int32_t c, const bool isTerminal) { const int32_t c, const bool isTerminal) {
const int correctionCount = (mSkippedCount + mExcessiveCount + mTransposedCount); const int correctionCount = (mSkippedCount + mExcessiveCount + mTransposedCount);
if (correctionCount > mMaxErrors) {
return UNRELATED;
}
// TODO: Change the limit if we'll allow two or more corrections // TODO: Change the limit if we'll allow two or more corrections
const bool noCorrectionsHappenedSoFar = correctionCount == 0; const bool noCorrectionsHappenedSoFar = correctionCount == 0;
const bool canTryCorrection = noCorrectionsHappenedSoFar; const bool canTryCorrection = noCorrectionsHappenedSoFar;
int proximityIndex = 0; int proximityIndex = 0;
mDistances[mOutputIndex] = NOT_A_DISTANCE; mDistances[mOutputIndex] = NOT_A_DISTANCE;
// Skip checking this node
if (mNeedsToTraverseAllNodes || isQuote(c)) { if (mNeedsToTraverseAllNodes || isQuote(c)) {
bool incremented = false; bool incremented = false;
if (mLastCharExceeded && mInputIndex == mInputLength - 1) { if (mLastCharExceeded && mInputIndex == mInputLength - 1) {
@ -344,6 +308,7 @@ Correction::CorrectionType Correction::processCharAndCalcState(
return processSkipChar(c, isTerminal, incremented); return processSkipChar(c, isTerminal, incremented);
} }
// Check possible corrections.
if (mExcessivePos >= 0) { if (mExcessivePos >= 0) {
if (mExcessiveCount == 0 && mExcessivePos < mOutputIndex) { if (mExcessiveCount == 0 && mExcessivePos < mOutputIndex) {
mExcessivePos = mOutputIndex; mExcessivePos = mOutputIndex;
@ -394,7 +359,12 @@ Correction::CorrectionType Correction::processCharAndCalcState(
} }
// TODO: Change the limit if we'll allow two or more proximity chars with corrections // TODO: Change the limit if we'll allow two or more proximity chars with corrections
const bool checkProximityChars = noCorrectionsHappenedSoFar || mProximityCount == 0; // Work around: When the mMaxErrors is 1, we only allow just one error
// including proximity correction.
const bool checkProximityChars = (mMaxErrors > 1)
? (noCorrectionsHappenedSoFar || mProximityCount == 0)
: (noCorrectionsHappenedSoFar && mProximityCount == 0);
ProximityInfo::ProximityType matchedProximityCharId = secondTransposing ProximityInfo::ProximityType matchedProximityCharId = secondTransposing
? ProximityInfo::EQUIVALENT_CHAR ? ProximityInfo::EQUIVALENT_CHAR
: mProximityInfo->getMatchedProximityId( : mProximityInfo->getMatchedProximityId(
@ -934,4 +904,46 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
return totalFreq; return totalFreq;
} }
#if 0 /* no longer used. keep just for reference */
// Computes the edit distance between input and output. Characters are compared
// after being passed through toBaseLowerCase(). Inserting, deleting or replacing
// a character costs 1, and a swap of two adjacent characters is also considered.
//
// editDistanceTable is used as scratch space, addressed as a row-major matrix of
// (inputLength + 1) rows by (outputLength + 1) columns, so it must be able to
// hold that many ints.
//
// Returns the value stored in the last cell of that matrix.
inline static int editDistance(
        int* editDistanceTable, const unsigned short* input,
        const int inputLength, const unsigned short* output, const int outputLength) {
    // table[r][c] is stored at table[r * colCount + c]
    int* const table = editDistanceTable;
    const int rowCount = inputLength + 1;
    const int colCount = outputLength + 1;

    // Boundary conditions: first column, then first row.
    for (int row = 0; row < rowCount; ++row) {
        table[row * colCount] = row;
    }
    for (int col = 0; col < colCount; ++col) {
        table[col] = col;
    }

    // Fill the remaining cells; (row, col) is the cell being computed, which
    // corresponds to input[row - 1] and output[col - 1].
    for (int row = 1; row < rowCount; ++row) {
        for (int col = 1; col < colCount; ++col) {
            const uint32_t inChar = toBaseLowerCase(input[row - 1]);
            const uint32_t outChar = toBaseLowerCase(output[col - 1]);
            const uint16_t cost = (inChar == outChar) ? 0 : 1;

            const int above = table[(row - 1) * colCount + col] + 1;
            const int left = table[row * colCount + (col - 1)] + 1;
            const int diagonal = table[(row - 1) * colCount + (col - 1)] + cost;
            int best = min(above, min(left, diagonal));

            // Adjacent characters swapped between input and output.
            const bool hasPrevious = (row > 1) && (col > 1);
            if (hasPrevious && inChar == toBaseLowerCase(output[col - 2])
                    && outChar == toBaseLowerCase(input[row - 2])) {
                const int swapped = table[(row - 2) * colCount + (col - 2)] + cost;
                best = min(best, swapped);
            }
            table[row * colCount + col] = best;
        }
    }

    if (DEBUG_EDIT_DISTANCE) {
        LOGI("IN = %d, OUT = %d", inputLength, outputLength);
        for (int row = 0; row < rowCount; ++row) {
            for (int col = 0; col < colCount; ++col) {
                LOGI("EDIT[%d][%d], %d", row, col, table[row * colCount + col]);
            }
        }
    }
    return table[rowCount * colCount - 1];
}
#endif
} // namespace latinime } // namespace latinime

View file

@ -45,7 +45,7 @@ public:
// TODO: remove // TODO: remove
void setCorrectionParams(const int skipPos, const int excessivePos, const int transposedPos, void setCorrectionParams(const int skipPos, const int excessivePos, const int transposedPos,
const int spaceProximityPos, const int missingSpacePos, const bool useFullEditDistance, const int spaceProximityPos, const int missingSpacePos, const bool useFullEditDistance,
const bool doAutoCompletion); const bool doAutoCompletion, const int maxErrors);
void checkState(); void checkState();
bool initProcessState(const int index); bool initProcessState(const int index);
@ -118,6 +118,7 @@ private:
int mMissingSpacePos; int mMissingSpacePos;
int mTerminalInputIndex; int mTerminalInputIndex;
int mTerminalOutputIndex; int mTerminalOutputIndex;
int mMaxErrors;
// The following arrays are state buffer. // The following arrays are state buffer.
unsigned short mWord[MAX_WORD_LENGTH_INTERNAL]; unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];

View file

@ -261,7 +261,7 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance,
// TODO: Remove setCorrectionParams // TODO: Remove setCorrectionParams
correction->setCorrectionParams(0, 0, 0, correction->setCorrectionParams(0, 0, 0,
-1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance, -1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance,
true /* doAutoCompletion */); true /* doAutoCompletion */, DEFAULT_MAX_ERRORS);
int rootPosition = ROOT_POS; int rootPosition = ROOT_POS;
// Get the number of children of root, then increment the position // Get the number of children of root, then increment the position
int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition); int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);
@ -296,7 +296,7 @@ void UnigramDictionary::getMissingSpaceWords(
Correction *correction, const bool useFullEditDistance, WordsPriorityQueue *queue) { Correction *correction, const bool useFullEditDistance, WordsPriorityQueue *queue) {
correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */, correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */,
-1 /* transposedPos */, -1 /* spaceProximityPos */, missingSpacePos, -1 /* transposedPos */, -1 /* spaceProximityPos */, missingSpacePos,
useFullEditDistance, true /* doAutoCompletion */); useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS);
getSplitTwoWordsSuggestion(inputLength, proximityInfo, correction, queue); getSplitTwoWordsSuggestion(inputLength, proximityInfo, correction, queue);
} }
@ -305,7 +305,7 @@ void UnigramDictionary::getMistypedSpaceWords(
Correction *correction, const bool useFullEditDistance, WordsPriorityQueue *queue) { Correction *correction, const bool useFullEditDistance, WordsPriorityQueue *queue) {
correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */, correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */,
-1 /* transposedPos */, spaceProximityPos, -1 /* missingSpacePos */, -1 /* transposedPos */, spaceProximityPos, -1 /* missingSpacePos */,
useFullEditDistance, true /* doAutoCompletion */); useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS);
getSplitTwoWordsSuggestion(inputLength, proximityInfo, correction, queue); getSplitTwoWordsSuggestion(inputLength, proximityInfo, correction, queue);
} }

View file

@ -61,6 +61,10 @@ public:
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
// Error tolerances
static const int DEFAULT_MAX_ERRORS = 2;
static const int MAX_ERRORS_FOR_TWO_WORDS = 1;
UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler, UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars, int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
const bool isLatestDictVersion); const bool isLatestDictVersion);