am 4d355989: Add a functionality to limit the max correction errors
* commit '4d355989bd972ba792ba546a55c67e5b6fc2527a': Add a functionality to limit the max correction errors
main
commit
2d651760c4
|
@ -32,48 +32,6 @@ namespace latinime {
|
||||||
// edit distance functions //
|
// edit distance functions //
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
|
|
||||||
#if 0 /* no longer used */
|
|
||||||
inline static int editDistance(
|
|
||||||
int* editDistanceTable, const unsigned short* input,
|
|
||||||
const int inputLength, const unsigned short* output, const int outputLength) {
|
|
||||||
// dp[li][lo] dp[a][b] = dp[ a * lo + b]
|
|
||||||
int* dp = editDistanceTable;
|
|
||||||
const int li = inputLength + 1;
|
|
||||||
const int lo = outputLength + 1;
|
|
||||||
for (int i = 0; i < li; ++i) {
|
|
||||||
dp[lo * i] = i;
|
|
||||||
}
|
|
||||||
for (int i = 0; i < lo; ++i) {
|
|
||||||
dp[i] = i;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < li - 1; ++i) {
|
|
||||||
for (int j = 0; j < lo - 1; ++j) {
|
|
||||||
const uint32_t ci = toBaseLowerCase(input[i]);
|
|
||||||
const uint32_t co = toBaseLowerCase(output[j]);
|
|
||||||
const uint16_t cost = (ci == co) ? 0 : 1;
|
|
||||||
dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1,
|
|
||||||
min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost));
|
|
||||||
if (i > 0 && j > 0 && ci == toBaseLowerCase(output[j - 1])
|
|
||||||
&& co == toBaseLowerCase(input[i - 1])) {
|
|
||||||
dp[(i + 1) * lo + (j + 1)] = min(
|
|
||||||
dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (DEBUG_EDIT_DISTANCE) {
|
|
||||||
LOGI("IN = %d, OUT = %d", inputLength, outputLength);
|
|
||||||
for (int i = 0; i < li; ++i) {
|
|
||||||
for (int j = 0; j < lo; ++j) {
|
|
||||||
LOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return dp[li * lo - 1];
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
inline static void initEditDistance(int *editDistanceTable) {
|
inline static void initEditDistance(int *editDistanceTable) {
|
||||||
for (int i = 0; i <= MAX_WORD_LENGTH_INTERNAL; ++i) {
|
for (int i = 0; i <= MAX_WORD_LENGTH_INTERNAL; ++i) {
|
||||||
editDistanceTable[i] = i;
|
editDistanceTable[i] = i;
|
||||||
|
@ -145,7 +103,7 @@ void Correction::initCorrectionState(
|
||||||
|
|
||||||
void Correction::setCorrectionParams(const int skipPos, const int excessivePos,
|
void Correction::setCorrectionParams(const int skipPos, const int excessivePos,
|
||||||
const int transposedPos, const int spaceProximityPos, const int missingSpacePos,
|
const int transposedPos, const int spaceProximityPos, const int missingSpacePos,
|
||||||
const bool useFullEditDistance, const bool doAutoCompletion) {
|
const bool useFullEditDistance, const bool doAutoCompletion, const int maxErrors) {
|
||||||
// TODO: remove
|
// TODO: remove
|
||||||
mTransposedPos = transposedPos;
|
mTransposedPos = transposedPos;
|
||||||
mExcessivePos = excessivePos;
|
mExcessivePos = excessivePos;
|
||||||
|
@ -159,6 +117,7 @@ void Correction::setCorrectionParams(const int skipPos, const int excessivePos,
|
||||||
mMissingSpacePos = missingSpacePos;
|
mMissingSpacePos = missingSpacePos;
|
||||||
mUseFullEditDistance = useFullEditDistance;
|
mUseFullEditDistance = useFullEditDistance;
|
||||||
mDoAutoCompletion = doAutoCompletion;
|
mDoAutoCompletion = doAutoCompletion;
|
||||||
|
mMaxErrors = maxErrors;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Correction::checkState() {
|
void Correction::checkState() {
|
||||||
|
@ -314,12 +273,17 @@ inline bool isEquivalentChar(ProximityInfo::ProximityType type) {
|
||||||
Correction::CorrectionType Correction::processCharAndCalcState(
|
Correction::CorrectionType Correction::processCharAndCalcState(
|
||||||
const int32_t c, const bool isTerminal) {
|
const int32_t c, const bool isTerminal) {
|
||||||
const int correctionCount = (mSkippedCount + mExcessiveCount + mTransposedCount);
|
const int correctionCount = (mSkippedCount + mExcessiveCount + mTransposedCount);
|
||||||
|
if (correctionCount > mMaxErrors) {
|
||||||
|
return UNRELATED;
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: Change the limit if we'll allow two or more corrections
|
// TODO: Change the limit if we'll allow two or more corrections
|
||||||
const bool noCorrectionsHappenedSoFar = correctionCount == 0;
|
const bool noCorrectionsHappenedSoFar = correctionCount == 0;
|
||||||
const bool canTryCorrection = noCorrectionsHappenedSoFar;
|
const bool canTryCorrection = noCorrectionsHappenedSoFar;
|
||||||
int proximityIndex = 0;
|
int proximityIndex = 0;
|
||||||
mDistances[mOutputIndex] = NOT_A_DISTANCE;
|
mDistances[mOutputIndex] = NOT_A_DISTANCE;
|
||||||
|
|
||||||
|
// Skip checking this node
|
||||||
if (mNeedsToTraverseAllNodes || isQuote(c)) {
|
if (mNeedsToTraverseAllNodes || isQuote(c)) {
|
||||||
bool incremented = false;
|
bool incremented = false;
|
||||||
if (mLastCharExceeded && mInputIndex == mInputLength - 1) {
|
if (mLastCharExceeded && mInputIndex == mInputLength - 1) {
|
||||||
|
@ -344,6 +308,7 @@ Correction::CorrectionType Correction::processCharAndCalcState(
|
||||||
return processSkipChar(c, isTerminal, incremented);
|
return processSkipChar(c, isTerminal, incremented);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check possible corrections.
|
||||||
if (mExcessivePos >= 0) {
|
if (mExcessivePos >= 0) {
|
||||||
if (mExcessiveCount == 0 && mExcessivePos < mOutputIndex) {
|
if (mExcessiveCount == 0 && mExcessivePos < mOutputIndex) {
|
||||||
mExcessivePos = mOutputIndex;
|
mExcessivePos = mOutputIndex;
|
||||||
|
@ -394,7 +359,12 @@ Correction::CorrectionType Correction::processCharAndCalcState(
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: Change the limit if we'll allow two or more proximity chars with corrections
|
// TODO: Change the limit if we'll allow two or more proximity chars with corrections
|
||||||
const bool checkProximityChars = noCorrectionsHappenedSoFar || mProximityCount == 0;
|
// Workaround: when mMaxErrors is 1, we allow only one error
|
||||||
|
// including proximity correction.
|
||||||
|
const bool checkProximityChars = (mMaxErrors > 1)
|
||||||
|
? (noCorrectionsHappenedSoFar || mProximityCount == 0)
|
||||||
|
: (noCorrectionsHappenedSoFar && mProximityCount == 0);
|
||||||
|
|
||||||
ProximityInfo::ProximityType matchedProximityCharId = secondTransposing
|
ProximityInfo::ProximityType matchedProximityCharId = secondTransposing
|
||||||
? ProximityInfo::EQUIVALENT_CHAR
|
? ProximityInfo::EQUIVALENT_CHAR
|
||||||
: mProximityInfo->getMatchedProximityId(
|
: mProximityInfo->getMatchedProximityId(
|
||||||
|
@ -934,4 +904,46 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
|
||||||
return totalFreq;
|
return totalFreq;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if 0 /* no longer used. keep just for reference */
|
||||||
|
inline static int editDistance(
|
||||||
|
int* editDistanceTable, const unsigned short* input,
|
||||||
|
const int inputLength, const unsigned short* output, const int outputLength) {
|
||||||
|
// dp[li][lo] dp[a][b] = dp[ a * lo + b]
|
||||||
|
int* dp = editDistanceTable;
|
||||||
|
const int li = inputLength + 1;
|
||||||
|
const int lo = outputLength + 1;
|
||||||
|
for (int i = 0; i < li; ++i) {
|
||||||
|
dp[lo * i] = i;
|
||||||
|
}
|
||||||
|
for (int i = 0; i < lo; ++i) {
|
||||||
|
dp[i] = i;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < li - 1; ++i) {
|
||||||
|
for (int j = 0; j < lo - 1; ++j) {
|
||||||
|
const uint32_t ci = toBaseLowerCase(input[i]);
|
||||||
|
const uint32_t co = toBaseLowerCase(output[j]);
|
||||||
|
const uint16_t cost = (ci == co) ? 0 : 1;
|
||||||
|
dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1,
|
||||||
|
min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost));
|
||||||
|
if (i > 0 && j > 0 && ci == toBaseLowerCase(output[j - 1])
|
||||||
|
&& co == toBaseLowerCase(input[i - 1])) {
|
||||||
|
dp[(i + 1) * lo + (j + 1)] = min(
|
||||||
|
dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (DEBUG_EDIT_DISTANCE) {
|
||||||
|
LOGI("IN = %d, OUT = %d", inputLength, outputLength);
|
||||||
|
for (int i = 0; i < li; ++i) {
|
||||||
|
for (int j = 0; j < lo; ++j) {
|
||||||
|
LOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return dp[li * lo - 1];
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
|
|
|
@ -45,7 +45,7 @@ public:
|
||||||
// TODO: remove
|
// TODO: remove
|
||||||
void setCorrectionParams(const int skipPos, const int excessivePos, const int transposedPos,
|
void setCorrectionParams(const int skipPos, const int excessivePos, const int transposedPos,
|
||||||
const int spaceProximityPos, const int missingSpacePos, const bool useFullEditDistance,
|
const int spaceProximityPos, const int missingSpacePos, const bool useFullEditDistance,
|
||||||
const bool doAutoCompletion);
|
const bool doAutoCompletion, const int maxErrors);
|
||||||
void checkState();
|
void checkState();
|
||||||
bool initProcessState(const int index);
|
bool initProcessState(const int index);
|
||||||
|
|
||||||
|
@ -118,6 +118,7 @@ private:
|
||||||
int mMissingSpacePos;
|
int mMissingSpacePos;
|
||||||
int mTerminalInputIndex;
|
int mTerminalInputIndex;
|
||||||
int mTerminalOutputIndex;
|
int mTerminalOutputIndex;
|
||||||
|
int mMaxErrors;
|
||||||
|
|
||||||
// The following arrays are state buffer.
|
// The following arrays are state buffer.
|
||||||
unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];
|
unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];
|
||||||
|
|
|
@ -261,7 +261,7 @@ void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance,
|
||||||
// TODO: Remove setCorrectionParams
|
// TODO: Remove setCorrectionParams
|
||||||
correction->setCorrectionParams(0, 0, 0,
|
correction->setCorrectionParams(0, 0, 0,
|
||||||
-1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance,
|
-1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance,
|
||||||
true /* doAutoCompletion */);
|
true /* doAutoCompletion */, DEFAULT_MAX_ERRORS);
|
||||||
int rootPosition = ROOT_POS;
|
int rootPosition = ROOT_POS;
|
||||||
// Get the number of children of root, then increment the position
|
// Get the number of children of root, then increment the position
|
||||||
int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);
|
int childCount = Dictionary::getCount(DICT_ROOT, &rootPosition);
|
||||||
|
@ -296,7 +296,7 @@ void UnigramDictionary::getMissingSpaceWords(
|
||||||
Correction *correction, const bool useFullEditDistance, WordsPriorityQueue *queue) {
|
Correction *correction, const bool useFullEditDistance, WordsPriorityQueue *queue) {
|
||||||
correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */,
|
correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */,
|
||||||
-1 /* transposedPos */, -1 /* spaceProximityPos */, missingSpacePos,
|
-1 /* transposedPos */, -1 /* spaceProximityPos */, missingSpacePos,
|
||||||
useFullEditDistance, true /* doAutoCompletion */);
|
useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS);
|
||||||
getSplitTwoWordsSuggestion(inputLength, proximityInfo, correction, queue);
|
getSplitTwoWordsSuggestion(inputLength, proximityInfo, correction, queue);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -305,7 +305,7 @@ void UnigramDictionary::getMistypedSpaceWords(
|
||||||
Correction *correction, const bool useFullEditDistance, WordsPriorityQueue *queue) {
|
Correction *correction, const bool useFullEditDistance, WordsPriorityQueue *queue) {
|
||||||
correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */,
|
correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */,
|
||||||
-1 /* transposedPos */, spaceProximityPos, -1 /* missingSpacePos */,
|
-1 /* transposedPos */, spaceProximityPos, -1 /* missingSpacePos */,
|
||||||
useFullEditDistance, true /* doAutoCompletion */);
|
useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS);
|
||||||
getSplitTwoWordsSuggestion(inputLength, proximityInfo, correction, queue);
|
getSplitTwoWordsSuggestion(inputLength, proximityInfo, correction, queue);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -61,6 +61,10 @@ public:
|
||||||
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
|
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
|
||||||
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
|
static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
|
||||||
|
|
||||||
|
// Error tolerances
|
||||||
|
static const int DEFAULT_MAX_ERRORS = 2;
|
||||||
|
static const int MAX_ERRORS_FOR_TWO_WORDS = 1;
|
||||||
|
|
||||||
UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
|
UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
|
||||||
int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
|
int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
|
||||||
const bool isLatestDictVersion);
|
const bool isLatestDictVersion);
|
||||||
|
|
Loading…
Reference in New Issue