am 4d355989: Add a functionality to limit the max correction errors

* commit '4d355989bd972ba792ba546a55c67e5b6fc2527a':
  Add a functionality to limit the max correction errors
main
satok 2011-12-15 10:10:59 -08:00 committed by Android Git Automerger
commit 2d651760c4
4 changed files with 65 additions and 48 deletions

View File

@ -32,48 +32,6 @@ namespace latinime {
// edit distance funcitons // // edit distance funcitons //
///////////////////////////// /////////////////////////////
#if 0 /* no longer used */
inline static int editDistance(
int* editDistanceTable, const unsigned short* input,
const int inputLength, const unsigned short* output, const int outputLength) {
// dp[li][lo] dp[a][b] = dp[ a * lo + b]
int* dp = editDistanceTable;
const int li = inputLength + 1;
const int lo = outputLength + 1;
for (int i = 0; i < li; ++i) {
dp[lo * i] = i;
}
for (int i = 0; i < lo; ++i) {
dp[i] = i;
}
for (int i = 0; i < li - 1; ++i) {
for (int j = 0; j < lo - 1; ++j) {
const uint32_t ci = toBaseLowerCase(input[i]);
const uint32_t co = toBaseLowerCase(output[j]);
const uint16_t cost = (ci == co) ? 0 : 1;
dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1,
min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost));
if (i > 0 && j > 0 && ci == toBaseLowerCase(output[j - 1])
&& co == toBaseLowerCase(input[i - 1])) {
dp[(i + 1) * lo + (j + 1)] = min(
dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost);
}
}
}
if (DEBUG_EDIT_DISTANCE) {
LOGI("IN = %d, OUT = %d", inputLength, outputLength);
for (int i = 0; i < li; ++i) {
for (int j = 0; j < lo; ++j) {
LOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]);
}
}
}
return dp[li * lo - 1];
}
#endif
inline static void initEditDistance(int *editDistanceTable) { inline static void initEditDistance(int *editDistanceTable) {
for (int i = 0; i <= MAX_WORD_LENGTH_INTERNAL; ++i) { for (int i = 0; i <= MAX_WORD_LENGTH_INTERNAL; ++i) {
editDistanceTable[i] = i; editDistanceTable[i] = i;
@ -145,7 +103,7 @@ void Correction::initCorrectionState(
void Correction::setCorrectionParams(const int skipPos, const int excessivePos, void Correction::setCorrectionParams(const int skipPos, const int excessivePos,
const int transposedPos, const int spaceProximityPos, const int missingSpacePos, const int transposedPos, const int spaceProximityPos, const int missingSpacePos,
const bool useFullEditDistance, const bool doAutoCompletion) { const bool useFullEditDistance, const bool doAutoCompletion, const int maxErrors) {
// TODO: remove // TODO: remove
mTransposedPos = transposedPos; mTransposedPos = transposedPos;
mExcessivePos = excessivePos; mExcessivePos = excessivePos;
@ -159,6 +117,7 @@ void Correction::setCorrectionParams(const int skipPos, const int excessivePos,
mMissingSpacePos = missingSpacePos; mMissingSpacePos = missingSpacePos;
mUseFullEditDistance = useFullEditDistance; mUseFullEditDistance = useFullEditDistance;
mDoAutoCompletion = doAutoCompletion; mDoAutoCompletion = doAutoCompletion;
mMaxErrors = maxErrors;
} }
void Correction::checkState() { void Correction::checkState() {
@ -314,12 +273,17 @@ inline bool isEquivalentChar(ProximityInfo::ProximityType type) {
Correction::CorrectionType Correction::processCharAndCalcState( Correction::CorrectionType Correction::processCharAndCalcState(
const int32_t c, const bool isTerminal) { const int32_t c, const bool isTerminal) {
const int correctionCount = (mSkippedCount + mExcessiveCount + mTransposedCount); const int correctionCount = (mSkippedCount + mExcessiveCount + mTransposedCount);
if (correctionCount > mMaxErrors) {
return UNRELATED;
}
// TODO: Change the limit if we'll allow two or more corrections // TODO: Change the limit if we'll allow two or more corrections
const bool noCorrectionsHappenedSoFar = correctionCount == 0; const bool noCorrectionsHappenedSoFar = correctionCount == 0;
const bool canTryCorrection = noCorrectionsHappenedSoFar; const bool canTryCorrection = noCorrectionsHappenedSoFar;
int proximityIndex = 0; int proximityIndex = 0;
mDistances[mOutputIndex] = NOT_A_DISTANCE; mDistances[mOutputIndex] = NOT_A_DISTANCE;
// Skip checking this node
if (mNeedsToTraverseAllNodes || isQuote(c)) { if (mNeedsToTraverseAllNodes || isQuote(c)) {
bool incremented = false; bool incremented = false;
if (mLastCharExceeded && mInputIndex == mInputLength - 1) { if (mLastCharExceeded && mInputIndex == mInputLength - 1) {
@ -344,6 +308,7 @@ Correction::CorrectionType Correction::processCharAndCalcState(
return processSkipChar(c, isTerminal, incremented); return processSkipChar(c, isTerminal, incremented);
} }
// Check possible corrections.
if (mExcessivePos >= 0) { if (mExcessivePos >= 0) {
if (mExcessiveCount == 0 && mExcessivePos < mOutputIndex) { if (mExcessiveCount == 0 && mExcessivePos < mOutputIndex) {
mExcessivePos = mOutputIndex; mExcessivePos = mOutputIndex;
@ -394,7 +359,12 @@ Correction::CorrectionType Correction::processCharAndCalcState(
} }
// TODO: Change the limit if we'll allow two or more proximity chars with corrections // TODO: Change the limit if we'll allow two or more proximity chars with corrections
const bool checkProximityChars = noCorrectionsHappenedSoFar || mProximityCount == 0; // Work around: When the mMaxErrors is 1, we only allow just one error
// including proximity correction.
const bool checkProximityChars = (mMaxErrors > 1)
? (noCorrectionsHappenedSoFar || mProximityCount == 0)
: (noCorrectionsHappenedSoFar && mProximityCount == 0);
ProximityInfo::ProximityType matchedProximityCharId = secondTransposing ProximityInfo::ProximityType matchedProximityCharId = secondTransposing
? ProximityInfo::EQUIVALENT_CHAR ? ProximityInfo::EQUIVALENT_CHAR
: mProximityInfo->getMatchedProximityId( : mProximityInfo->getMatchedProximityId(
@ -934,4 +904,46 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
return totalFreq; return totalFreq;
} }
#if 0 /* no longer used. keep just for reference */
inline static int editDistance(
int* editDistanceTable, const unsigned short* input,
const int inputLength, const unsigned short* output, const int outputLength) {
// dp[li][lo] dp[a][b] = dp[ a * lo + b]
int* dp = editDistanceTable;
const int li = inputLength + 1;
const int lo = outputLength + 1;
for (int i = 0; i < li; ++i) {
dp[lo * i] = i;
}
for (int i = 0; i < lo; ++i) {
dp[i] = i;
}
for (int i = 0; i < li - 1; ++i) {
for (int j = 0; j < lo - 1; ++j) {
const uint32_t ci = toBaseLowerCase(input[i]);
const uint32_t co = toBaseLowerCase(output[j]);
const uint16_t cost = (ci == co) ? 0 : 1;
dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1,
min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost));
if (i > 0 && j > 0 && ci == toBaseLowerCase(output[j - 1])
&& co == toBaseLowerCase(input[i - 1])) {
dp[(i + 1) * lo + (j + 1)] = min(
dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost);
}
}
}
if (DEBUG_EDIT_DISTANCE) {
LOGI("IN = %d, OUT = %d", inputLength, outputLength);
for (int i = 0; i < li; ++i) {
for (int j = 0; j < lo; ++j) {
LOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]);
}
}
}
return dp[li * lo - 1];
}
#endif
} // namespace latinime } // namespace latinime

View File

@ -45,7 +45,7 @@ public:
// TODO: remove // TODO: remove
void setCorrectionParams(const int skipPos, const int excessivePos, const int transposedPos, void setCorrectionParams(const int skipPos, const int excessivePos, const int transposedPos,
const int spaceProximityPos, const int missingSpacePos, const bool useFullEditDistance, const int spaceProximityPos, const int missingSpacePos, const bool useFullEditDistance,
const bool doAutoCompletion); const bool doAutoCompletion, const int maxErrors);
void checkState(); void checkState();
bool initProcessState(const int index); bool initProcessState(const int index);
@ -118,6 +118,7 @@ private:
int mMissingSpacePos; int mMissingSpacePos;
int mTerminalInputIndex; int mTerminalInputIndex;
int mTerminalOutputIndex; int mTerminalOutputIndex;
int mMaxErrors;
// The following arrays are state buffer. // The following arrays are state buffer.
unsigned short mWord[MAX_WORD_LENGTH_INTERNAL]; unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];

View File

@ -261,7 +261,7 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance,
// TODO: Remove setCorrectionParams // TODO: Remove setCorrectionParams
correction->setCorrectionParams(0, 0, 0, correction->setCorrectionParams(0, 0, 0,
-1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance, -1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance,
true /* doAutoCompletion */); true /* doAutoCompletion */, DEFAULT_MAX_ERRORS);
int rootPosition = ROOT_POS; int rootPosition = ROOT_POS;
// Get the number of children of root, then increment the position // Get the number of children of root, then increment the position
int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition); int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);
@ -296,7 +296,7 @@ void UnigramDictionary::getMissingSpaceWords(
Correction *correction, const bool useFullEditDistance, WordsPriorityQueue *queue) { Correction *correction, const bool useFullEditDistance, WordsPriorityQueue *queue) {
correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */, correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */,
-1 /* transposedPos */, -1 /* spaceProximityPos */, missingSpacePos, -1 /* transposedPos */, -1 /* spaceProximityPos */, missingSpacePos,
useFullEditDistance, true /* doAutoCompletion */); useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS);
getSplitTwoWordsSuggestion(inputLength, proximityInfo, correction, queue); getSplitTwoWordsSuggestion(inputLength, proximityInfo, correction, queue);
} }
@ -305,7 +305,7 @@ void UnigramDictionary::getMistypedSpaceWords(
Correction *correction, const bool useFullEditDistance, WordsPriorityQueue *queue) { Correction *correction, const bool useFullEditDistance, WordsPriorityQueue *queue) {
correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */, correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */,
-1 /* transposedPos */, spaceProximityPos, -1 /* missingSpacePos */, -1 /* transposedPos */, spaceProximityPos, -1 /* missingSpacePos */,
useFullEditDistance, true /* doAutoCompletion */); useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS);
getSplitTwoWordsSuggestion(inputLength, proximityInfo, correction, queue); getSplitTwoWordsSuggestion(inputLength, proximityInfo, correction, queue);
} }

View File

@ -61,6 +61,10 @@ public:
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
// Error tolerances
static const int DEFAULT_MAX_ERRORS = 2;
static const int MAX_ERRORS_FOR_TWO_WORDS = 1;
UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler, UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars, int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
const bool isLatestDictVersion); const bool isLatestDictVersion);