LatinIME/native/jni/src/correction.cpp

1146 lines
48 KiB
C++
Raw Normal View History

/*
* Copyright (C) 2011 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <assert.h>
#include <ctype.h>
#include <math.h>
#include <stdio.h>
#include <string.h>
#define LOG_TAG "LatinIME: correction.cpp"
#include "char_utils.h"
#include "correction.h"
#include "defines.h"
#include "dictionary.h"
#include "proximity_info.h"
namespace latinime {
/////////////////////////////
// edit distance funcitons //
/////////////////////////////
inline static void initEditDistance(int *editDistanceTable) {
for (int i = 0; i <= MAX_WORD_LENGTH_INTERNAL; ++i) {
editDistanceTable[i] = i;
}
}
inline static void dumpEditDistance10ForDebug(int *editDistanceTable,
const int editDistanceTableWidth, const int outputLength) {
if (DEBUG_DICT) {
AKLOGI("EditDistanceTable");
for (int i = 0; i <= 10; ++i) {
int c[11];
for (int j = 0; j <= 10; ++j) {
if (j < editDistanceTableWidth + 1 && i < outputLength + 1) {
c[j] = (editDistanceTable + i * (editDistanceTableWidth + 1))[j];
} else {
c[j] = -1;
}
}
AKLOGI("[ %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d ]",
c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10]);
}
}
}
inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigned short *input,
const int inputLength, const unsigned short *output, const int outputLength) {
// TODO: Make sure that editDistance[0 ~ MAX_WORD_LENGTH_INTERNAL] is not touched.
// Let dp[i][j] be editDistanceTable[i * (inputLength + 1) + j].
// Assuming that dp[0][0] ... dp[outputLength - 1][inputLength] are already calculated,
// and calculate dp[ouputLength][0] ... dp[outputLength][inputLength].
int *const current = editDistanceTable + outputLength * (inputLength + 1);
const int *const prev = editDistanceTable + (outputLength - 1) * (inputLength + 1);
const int *const prevprev =
outputLength >= 2 ? editDistanceTable + (outputLength - 2) * (inputLength + 1) : 0;
current[0] = outputLength;
const uint32_t co = toBaseLowerCase(output[outputLength - 1]);
const uint32_t prevCO = outputLength >= 2 ? toBaseLowerCase(output[outputLength - 2]) : 0;
for (int i = 1; i <= inputLength; ++i) {
const uint32_t ci = toBaseLowerCase(input[i - 1]);
const uint16_t cost = (ci == co) ? 0 : 1;
current[i] = min(current[i - 1] + 1, min(prev[i] + 1, prev[i - 1] + cost));
if (i >= 2 && prevprev && ci == prevCO && co == toBaseLowerCase(input[i - 2])) {
current[i] = min(current[i], prevprev[i - 2] + 1);
}
}
}
inline static int getCurrentEditDistance(int *editDistanceTable, const int editDistanceTableWidth,
const int outputLength, const int inputLength) {
if (DEBUG_EDIT_DISTANCE) {
AKLOGI("getCurrentEditDistance %d, %d", inputLength, outputLength);
}
return editDistanceTable[(editDistanceTableWidth + 1) * (outputLength) + inputLength];
}
//////////////////////
// inline functions //
//////////////////////
static const char QUOTE = '\'';
inline bool Correction::isQuote(const unsigned short c) {
const unsigned short userTypedChar = mProximityInfo->getPrimaryCharAt(mInputIndex);
return (c == QUOTE && userTypedChar != QUOTE);
}
////////////////
// Correction //
////////////////
Correction::Correction(const int typedLetterMultiplier, const int fullWordMultiplier)
: TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier) {
initEditDistance(mEditDistanceTable);
}
void Correction::initCorrection(const ProximityInfo *pi, const int inputLength,
const int maxDepth) {
mProximityInfo = pi;
mInputLength = inputLength;
mMaxDepth = maxDepth;
mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
// TODO: This is not supposed to be required. Check what's going wrong with
// editDistance[0 ~ MAX_WORD_LENGTH_INTERNAL]
initEditDistance(mEditDistanceTable);
}
void Correction::initCorrectionState(
const int rootPos, const int childCount, const bool traverseAll) {
latinime::initCorrectionState(mCorrectionStates, rootPos, childCount, traverseAll);
// TODO: remove
mCorrectionStates[0].mTransposedPos = mTransposedPos;
mCorrectionStates[0].mExcessivePos = mExcessivePos;
mCorrectionStates[0].mSkipPos = mSkipPos;
}
void Correction::setCorrectionParams(const int skipPos, const int excessivePos,
const int transposedPos, const int spaceProximityPos, const int missingSpacePos,
const bool useFullEditDistance, const bool doAutoCompletion, const int maxErrors) {
// TODO: remove
mTransposedPos = transposedPos;
mExcessivePos = excessivePos;
mSkipPos = skipPos;
// TODO: remove
mCorrectionStates[0].mTransposedPos = transposedPos;
mCorrectionStates[0].mExcessivePos = excessivePos;
mCorrectionStates[0].mSkipPos = skipPos;
mSpaceProximityPos = spaceProximityPos;
mMissingSpacePos = missingSpacePos;
mUseFullEditDistance = useFullEditDistance;
mDoAutoCompletion = doAutoCompletion;
mMaxErrors = maxErrors;
}
void Correction::checkState() {
if (DEBUG_DICT) {
int inputCount = 0;
if (mSkipPos >= 0) ++inputCount;
if (mExcessivePos >= 0) ++inputCount;
if (mTransposedPos >= 0) ++inputCount;
// TODO: remove this assert
assert(inputCount <= 1);
}
}
int Correction::getFreqForSplitMultipleWords(const int *freqArray, const int *wordLengthArray,
const int wordCount, const bool isSpaceProximity, const unsigned short *word) {
return Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(freqArray, wordLengthArray,
wordCount, this, isSpaceProximity, word);
}
int Correction::getFinalProbability(const int probability, unsigned short **word, int *wordLength) {
return getFinalProbabilityInternal(probability, word, wordLength, mInputLength);
}
int Correction::getFinalProbabilityForSubQueue(const int probability, unsigned short **word,
int *wordLength, const int inputLength) {
return getFinalProbabilityInternal(probability, word, wordLength, inputLength);
}
int Correction::getFinalProbabilityInternal(const int probability, unsigned short **word,
int *wordLength, const int inputLength) {
const int outputIndex = mTerminalOutputIndex;
const int inputIndex = mTerminalInputIndex;
*wordLength = outputIndex + 1;
if (outputIndex < MIN_SUGGEST_DEPTH) {
return NOT_A_PROBABILITY;
}
*word = mWord;
int finalProbability= Correction::RankingAlgorithm::calculateFinalProbability(
inputIndex, outputIndex, probability, mEditDistanceTable, this, inputLength);
return finalProbability;
}
bool Correction::initProcessState(const int outputIndex) {
if (mCorrectionStates[outputIndex].mChildCount <= 0) {
return false;
}
mOutputIndex = outputIndex;
--(mCorrectionStates[outputIndex].mChildCount);
mInputIndex = mCorrectionStates[outputIndex].mInputIndex;
mNeedsToTraverseAllNodes = mCorrectionStates[outputIndex].mNeedsToTraverseAllNodes;
mEquivalentCharCount = mCorrectionStates[outputIndex].mEquivalentCharCount;
mProximityCount = mCorrectionStates[outputIndex].mProximityCount;
mTransposedCount = mCorrectionStates[outputIndex].mTransposedCount;
mExcessiveCount = mCorrectionStates[outputIndex].mExcessiveCount;
mSkippedCount = mCorrectionStates[outputIndex].mSkippedCount;
mLastCharExceeded = mCorrectionStates[outputIndex].mLastCharExceeded;
mTransposedPos = mCorrectionStates[outputIndex].mTransposedPos;
mExcessivePos = mCorrectionStates[outputIndex].mExcessivePos;
mSkipPos = mCorrectionStates[outputIndex].mSkipPos;
mMatching = false;
mProximityMatching = false;
mAdditionalProximityMatching = false;
mTransposing = false;
mExceeding = false;
mSkipping = false;
return true;
}
int Correction::goDownTree(
const int parentIndex, const int childCount, const int firstChildPos) {
mCorrectionStates[mOutputIndex].mParentIndex = parentIndex;
mCorrectionStates[mOutputIndex].mChildCount = childCount;
mCorrectionStates[mOutputIndex].mSiblingPos = firstChildPos;
return mOutputIndex;
}
// TODO: remove
int Correction::getInputIndex() {
return mInputIndex;
}
void Correction::incrementInputIndex() {
++mInputIndex;
}
void Correction::incrementOutputIndex() {
++mOutputIndex;
mCorrectionStates[mOutputIndex].mParentIndex = mCorrectionStates[mOutputIndex - 1].mParentIndex;
mCorrectionStates[mOutputIndex].mChildCount = mCorrectionStates[mOutputIndex - 1].mChildCount;
mCorrectionStates[mOutputIndex].mSiblingPos = mCorrectionStates[mOutputIndex - 1].mSiblingPos;
mCorrectionStates[mOutputIndex].mInputIndex = mInputIndex;
mCorrectionStates[mOutputIndex].mNeedsToTraverseAllNodes = mNeedsToTraverseAllNodes;
mCorrectionStates[mOutputIndex].mEquivalentCharCount = mEquivalentCharCount;
mCorrectionStates[mOutputIndex].mProximityCount = mProximityCount;
mCorrectionStates[mOutputIndex].mTransposedCount = mTransposedCount;
mCorrectionStates[mOutputIndex].mExcessiveCount = mExcessiveCount;
mCorrectionStates[mOutputIndex].mSkippedCount = mSkippedCount;
mCorrectionStates[mOutputIndex].mSkipPos = mSkipPos;
mCorrectionStates[mOutputIndex].mTransposedPos = mTransposedPos;
mCorrectionStates[mOutputIndex].mExcessivePos = mExcessivePos;
mCorrectionStates[mOutputIndex].mLastCharExceeded = mLastCharExceeded;
mCorrectionStates[mOutputIndex].mMatching = mMatching;
mCorrectionStates[mOutputIndex].mProximityMatching = mProximityMatching;
mCorrectionStates[mOutputIndex].mAdditionalProximityMatching = mAdditionalProximityMatching;
mCorrectionStates[mOutputIndex].mTransposing = mTransposing;
mCorrectionStates[mOutputIndex].mExceeding = mExceeding;
mCorrectionStates[mOutputIndex].mSkipping = mSkipping;
}
void Correction::startToTraverseAllNodes() {
mNeedsToTraverseAllNodes = true;
}
bool Correction::needsToPrune() const {
// TODO: use edit distance here
return mOutputIndex - 1 >= mMaxDepth || mProximityCount > mMaxEditDistance
// Allow one char longer word for missing character
|| (!mDoAutoCompletion && (mOutputIndex > mInputLength));
}
void Correction::addCharToCurrentWord(const int32_t c) {
mWord[mOutputIndex] = c;
const unsigned short *primaryInputWord = mProximityInfo->getPrimaryInputWord();
calcEditDistanceOneStep(mEditDistanceTable, primaryInputWord, mInputLength,
mWord, mOutputIndex + 1);
}
Correction::CorrectionType Correction::processSkipChar(
const int32_t c, const bool isTerminal, const bool inputIndexIncremented) {
addCharToCurrentWord(c);
mTerminalInputIndex = mInputIndex - (inputIndexIncremented ? 1 : 0);
mTerminalOutputIndex = mOutputIndex;
if (mNeedsToTraverseAllNodes && isTerminal) {
incrementOutputIndex();
return TRAVERSE_ALL_ON_TERMINAL;
} else {
incrementOutputIndex();
return TRAVERSE_ALL_NOT_ON_TERMINAL;
}
}
Correction::CorrectionType Correction::processUnrelatedCorrectionType() {
// Needs to set mTerminalInputIndex and mTerminalOutputIndex before returning any CorrectionType
mTerminalInputIndex = mInputIndex;
mTerminalOutputIndex = mOutputIndex;
return UNRELATED;
}
inline bool isEquivalentChar(ProximityInfo::ProximityType type) {
return type == ProximityInfo::EQUIVALENT_CHAR;
}
inline bool isProximityCharOrEquivalentChar(ProximityInfo::ProximityType type) {
return type == ProximityInfo::EQUIVALENT_CHAR
|| type == ProximityInfo::NEAR_PROXIMITY_CHAR;
}
Correction::CorrectionType Correction::processCharAndCalcState(
const int32_t c, const bool isTerminal) {
const int correctionCount = (mSkippedCount + mExcessiveCount + mTransposedCount);
if (correctionCount > mMaxErrors) {
return processUnrelatedCorrectionType();
}
// TODO: Change the limit if we'll allow two or more corrections
const bool noCorrectionsHappenedSoFar = correctionCount == 0;
const bool canTryCorrection = noCorrectionsHappenedSoFar;
int proximityIndex = 0;
mDistances[mOutputIndex] = NOT_A_DISTANCE;
// Skip checking this node
if (mNeedsToTraverseAllNodes || isQuote(c)) {
bool incremented = false;
if (mLastCharExceeded && mInputIndex == mInputLength - 1) {
// TODO: Do not check the proximity if EditDistance exceeds the threshold
const ProximityInfo::ProximityType matchId =
mProximityInfo->getMatchedProximityId(mInputIndex, c, true, &proximityIndex);
if (isEquivalentChar(matchId)) {
mLastCharExceeded = false;
--mExcessiveCount;
mDistances[mOutputIndex] =
mProximityInfo->getNormalizedSquaredDistance(mInputIndex, 0);
} else if (matchId == ProximityInfo::NEAR_PROXIMITY_CHAR) {
mLastCharExceeded = false;
--mExcessiveCount;
++mProximityCount;
mDistances[mOutputIndex] =
mProximityInfo->getNormalizedSquaredDistance(mInputIndex, proximityIndex);
}
if (!isQuote(c)) {
incrementInputIndex();
incremented = true;
}
}
return processSkipChar(c, isTerminal, incremented);
}
// Check possible corrections.
if (mExcessivePos >= 0) {
if (mExcessiveCount == 0 && mExcessivePos < mOutputIndex) {
mExcessivePos = mOutputIndex;
}
if (mExcessivePos < mInputLength - 1) {
mExceeding = mExcessivePos == mInputIndex && canTryCorrection;
}
}
if (mSkipPos >= 0) {
if (mSkippedCount == 0 && mSkipPos < mOutputIndex) {
if (DEBUG_DICT) {
assert(mSkipPos == mOutputIndex - 1);
}
mSkipPos = mOutputIndex;
}
mSkipping = mSkipPos == mOutputIndex && canTryCorrection;
}
if (mTransposedPos >= 0) {
if (mTransposedCount == 0 && mTransposedPos < mOutputIndex) {
mTransposedPos = mOutputIndex;
}
if (mTransposedPos < mInputLength - 1) {
mTransposing = mInputIndex == mTransposedPos && canTryCorrection;
}
}
bool secondTransposing = false;
if (mTransposedCount % 2 == 1) {
if (isEquivalentChar(mProximityInfo->getMatchedProximityId(mInputIndex - 1, c, false))) {
++mTransposedCount;
secondTransposing = true;
} else if (mCorrectionStates[mOutputIndex].mExceeding) {
--mTransposedCount;
++mExcessiveCount;
--mExcessivePos;
incrementInputIndex();
} else {
--mTransposedCount;
if (DEBUG_CORRECTION
&& (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength)
&& (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
|| MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
DUMP_WORD(mWord, mOutputIndex);
AKLOGI("UNRELATED(0): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
mTransposedCount, mExcessiveCount, c);
}
return processUnrelatedCorrectionType();
}
}
// TODO: Change the limit if we'll allow two or more proximity chars with corrections
// Work around: When the mMaxErrors is 1, we only allow just one error
// including proximity correction.
const bool checkProximityChars = (mMaxErrors > 1)
? (noCorrectionsHappenedSoFar || mProximityCount == 0)
: (noCorrectionsHappenedSoFar && mProximityCount == 0);
ProximityInfo::ProximityType matchedProximityCharId = secondTransposing
? ProximityInfo::EQUIVALENT_CHAR
: mProximityInfo->getMatchedProximityId(
mInputIndex, c, checkProximityChars, &proximityIndex);
if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId
|| ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) {
if (canTryCorrection && mOutputIndex > 0
&& mCorrectionStates[mOutputIndex].mProximityMatching
&& mCorrectionStates[mOutputIndex].mExceeding
&& isEquivalentChar(mProximityInfo->getMatchedProximityId(
mInputIndex, mWord[mOutputIndex - 1], false))) {
if (DEBUG_CORRECTION
&& (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength)
&& (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
|| MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
AKLOGI("CONVERSION p->e %c", mWord[mOutputIndex - 1]);
}
// Conversion p->e
// Example:
// wearth -> earth
// px -> (E)mmmmm
++mExcessiveCount;
--mProximityCount;
mExcessivePos = mOutputIndex - 1;
++mInputIndex;
// Here, we are doing something equivalent to matchedProximityCharId,
// but we already know that "excessive char correction" just happened
// so that we just need to check "mProximityCount == 0".
matchedProximityCharId = mProximityInfo->getMatchedProximityId(
mInputIndex, c, mProximityCount == 0, &proximityIndex);
}
}
if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId
|| ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) {
if (ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) {
mAdditionalProximityMatching = true;
}
// TODO: Optimize
// As the current char turned out to be an unrelated char,
// we will try other correction-types. Please note that mCorrectionStates[mOutputIndex]
// here refers to the previous state.
if (mInputIndex < mInputLength - 1 && mOutputIndex > 0 && mTransposedCount > 0
&& !mCorrectionStates[mOutputIndex].mTransposing
&& mCorrectionStates[mOutputIndex - 1].mTransposing
&& isEquivalentChar(mProximityInfo->getMatchedProximityId(
mInputIndex, mWord[mOutputIndex - 1], false))
&& isEquivalentChar(
mProximityInfo->getMatchedProximityId(mInputIndex + 1, c, false))) {
// Conversion t->e
// Example:
// occaisional -> occa sional
// mmmmttx -> mmmm(E)mmmmmm
mTransposedCount -= 2;
++mExcessiveCount;
++mInputIndex;
} else if (mOutputIndex > 0 && mInputIndex > 0 && mTransposedCount > 0
&& !mCorrectionStates[mOutputIndex].mTransposing
&& mCorrectionStates[mOutputIndex - 1].mTransposing
&& isEquivalentChar(
mProximityInfo->getMatchedProximityId(mInputIndex - 1, c, false))) {
// Conversion t->s
// Example:
// chcolate -> chocolate
// mmttx -> mmsmmmmmm
mTransposedCount -= 2;
++mSkippedCount;
--mInputIndex;
} else if (canTryCorrection && mInputIndex > 0
&& mCorrectionStates[mOutputIndex].mProximityMatching
&& mCorrectionStates[mOutputIndex].mSkipping
&& isEquivalentChar(
mProximityInfo->getMatchedProximityId(mInputIndex - 1, c, false))) {
// Conversion p->s
// Note: This logic tries saving cases like contrst --> contrast -- "a" is one of
// proximity chars of "s", but it should rather be handled as a skipped char.
++mSkippedCount;
--mProximityCount;
return processSkipChar(c, isTerminal, false);
} else if (mInputIndex - 1 < mInputLength
&& mSkippedCount > 0
&& mCorrectionStates[mOutputIndex].mSkipping
&& mCorrectionStates[mOutputIndex].mAdditionalProximityMatching
&& isProximityCharOrEquivalentChar(
mProximityInfo->getMatchedProximityId(mInputIndex + 1, c, false))) {
// Conversion s->a
incrementInputIndex();
--mSkippedCount;
mProximityMatching = true;
++mProximityCount;
mDistances[mOutputIndex] = ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO;
} else if ((mExceeding || mTransposing) && mInputIndex - 1 < mInputLength
&& isEquivalentChar(
mProximityInfo->getMatchedProximityId(mInputIndex + 1, c, false))) {
// 1.2. Excessive or transpose correction
if (mTransposing) {
++mTransposedCount;
} else {
++mExcessiveCount;
incrementInputIndex();
}
if (DEBUG_CORRECTION
&& (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength)
&& (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
|| MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
DUMP_WORD(mWord, mOutputIndex);
if (mTransposing) {
AKLOGI("TRANSPOSE: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
mTransposedCount, mExcessiveCount, c);
} else {
AKLOGI("EXCEED: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
mTransposedCount, mExcessiveCount, c);
}
}
} else if (mSkipping) {
// 3. Skip correction
++mSkippedCount;
if (DEBUG_CORRECTION
&& (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength)
&& (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
|| MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
AKLOGI("SKIP: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
mTransposedCount, mExcessiveCount, c);
}
return processSkipChar(c, isTerminal, false);
} else if (ProximityInfo::ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) {
// As a last resort, use additional proximity characters
mProximityMatching = true;
++mProximityCount;
mDistances[mOutputIndex] = ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO;
if (DEBUG_CORRECTION
&& (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength)
&& (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
|| MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
AKLOGI("ADDITIONALPROX: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
mTransposedCount, mExcessiveCount, c);
}
} else {
if (DEBUG_CORRECTION
&& (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength)
&& (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
|| MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
DUMP_WORD(mWord, mOutputIndex);
AKLOGI("UNRELATED(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
mTransposedCount, mExcessiveCount, c);
}
return processUnrelatedCorrectionType();
}
} else if (secondTransposing) {
// If inputIndex is greater than mInputLength, that means there is no
// proximity chars. So, we don't need to check proximity.
mMatching = true;
} else if (isEquivalentChar(matchedProximityCharId)) {
mMatching = true;
++mEquivalentCharCount;
mDistances[mOutputIndex] = mProximityInfo->getNormalizedSquaredDistance(mInputIndex, 0);
} else if (ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) {
mProximityMatching = true;
++mProximityCount;
mDistances[mOutputIndex] =
mProximityInfo->getNormalizedSquaredDistance(mInputIndex, proximityIndex);
if (DEBUG_CORRECTION
&& (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength)
&& (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
|| MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
AKLOGI("PROX: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
mTransposedCount, mExcessiveCount, c);
}
}
addCharToCurrentWord(c);
// 4. Last char excessive correction
mLastCharExceeded = mExcessiveCount == 0 && mSkippedCount == 0 && mTransposedCount == 0
&& mProximityCount == 0 && (mInputIndex == mInputLength - 2);
const bool isSameAsUserTypedLength = (mInputLength == mInputIndex + 1) || mLastCharExceeded;
if (mLastCharExceeded) {
++mExcessiveCount;
}
// Start traversing all nodes after the index exceeds the user typed length
if (isSameAsUserTypedLength) {
startToTraverseAllNodes();
}
const bool needsToTryOnTerminalForTheLastPossibleExcessiveChar =
mExceeding && mInputIndex == mInputLength - 2;
// Finally, we are ready to go to the next character, the next "virtual node".
// We should advance the input index.
// We do this in this branch of the 'if traverseAllNodes' because we are still matching
// characters to input; the other branch is not matching them but searching for
// completions, this is why it does not have to do it.
incrementInputIndex();
// Also, the next char is one "virtual node" depth more than this char.
incrementOutputIndex();
if ((needsToTryOnTerminalForTheLastPossibleExcessiveChar
|| isSameAsUserTypedLength) && isTerminal) {
mTerminalInputIndex = mInputIndex - 1;
mTerminalOutputIndex = mOutputIndex - 1;
if (DEBUG_CORRECTION
&& (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength)
&& (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
DUMP_WORD(mWord, mOutputIndex);
AKLOGI("ONTERMINAL(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
mTransposedCount, mExcessiveCount, c);
}
return ON_TERMINAL;
} else {
mTerminalInputIndex = mInputIndex - 1;
mTerminalOutputIndex = mOutputIndex - 1;
return NOT_ON_TERMINAL;
}
}
Correction::~Correction() {
}
inline static int getQuoteCount(const unsigned short* word, const int length) {
int quoteCount = 0;
for (int i = 0; i < length; ++i) {
if(word[i] == '\'') {
++quoteCount;
}
}
return quoteCount;
}
inline static bool isUpperCase(unsigned short c) {
return isAsciiUpper(toBaseChar(c));
}
//////////////////////
// RankingAlgorithm //
//////////////////////
/* static */
int Correction::RankingAlgorithm::calculateFinalProbability(const int inputIndex,
const int outputIndex, const int freq, int* editDistanceTable, const Correction* correction,
const int inputLength) {
const int excessivePos = correction->getExcessivePos();
const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
const int fullWordMultiplier = correction->FULL_WORD_MULTIPLIER;
const ProximityInfo *proximityInfo = correction->mProximityInfo;
const int skippedCount = correction->mSkippedCount;
const int transposedCount = correction->mTransposedCount / 2;
const int excessiveCount = correction->mExcessiveCount + correction->mTransposedCount % 2;
const int proximityMatchedCount = correction->mProximityCount;
const bool lastCharExceeded = correction->mLastCharExceeded;
const bool useFullEditDistance = correction->mUseFullEditDistance;
const int outputLength = outputIndex + 1;
if (skippedCount >= inputLength || inputLength == 0) {
return -1;
}
// TODO: find more robust way
bool sameLength = lastCharExceeded ? (inputLength == inputIndex + 2)
: (inputLength == inputIndex + 1);
// TODO: use mExcessiveCount
const int matchCount = inputLength - correction->mProximityCount - excessiveCount;
const unsigned short* word = correction->mWord;
const bool skipped = skippedCount > 0;
const int quoteDiffCount = max(0, getQuoteCount(word, outputLength)
- getQuoteCount(proximityInfo->getPrimaryInputWord(), inputLength));
// TODO: Calculate edit distance for transposed and excessive
int ed = 0;
if (DEBUG_DICT_FULL) {
dumpEditDistance10ForDebug(editDistanceTable, correction->mInputLength, outputLength);
}
int adjustedProximityMatchedCount = proximityMatchedCount;
int finalFreq = freq;
if (DEBUG_CORRECTION_FREQ
&& (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == inputLength)) {
AKLOGI("FinalFreq0: %d", finalFreq);
}
// TODO: Optimize this.
if (transposedCount > 0 || proximityMatchedCount > 0 || skipped || excessiveCount > 0) {
ed = getCurrentEditDistance(editDistanceTable, correction->mInputLength, outputLength,
inputLength) - transposedCount;
const int matchWeight = powerIntCapped(typedLetterMultiplier,
max(inputLength, outputLength) - ed);
multiplyIntCapped(matchWeight, &finalFreq);
// TODO: Demote further if there are two or more excessive chars with longer user input?
if (inputLength > outputLength) {
multiplyRate(INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE, &finalFreq);
}
ed = max(0, ed - quoteDiffCount);
adjustedProximityMatchedCount = min(max(0, ed - (outputLength - inputLength)),
proximityMatchedCount);
if (transposedCount <= 0) {
if (ed == 1 && (inputLength == outputLength - 1 || inputLength == outputLength + 1)) {
// Promote a word with just one skipped or excessive char
if (sameLength) {
multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE
+ WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_MULTIPLIER * outputLength,
&finalFreq);
} else {
multiplyIntCapped(typedLetterMultiplier, &finalFreq);
}
} else if (ed == 0) {
multiplyIntCapped(typedLetterMultiplier, &finalFreq);
sameLength = true;
}
}
} else {
const int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount);
multiplyIntCapped(matchWeight, &finalFreq);
}
if (proximityInfo->getMatchedProximityId(0, word[0], true)
== ProximityInfo::UNRELATED_CHAR) {
multiplyRate(FIRST_CHAR_DIFFERENT_DEMOTION_RATE, &finalFreq);
}
///////////////////////////////////////////////
// Promotion and Demotion for each correction
// Demotion for a word with missing character
if (skipped) {
const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE
* (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)
/ (10 * inputLength
- WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);
if (DEBUG_DICT_FULL) {
AKLOGI("Demotion rate for missing character is %d.", demotionRate);
}
multiplyRate(demotionRate, &finalFreq);
}
// Demotion for a word with transposed character
if (transposedCount > 0) multiplyRate(
WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
// Demotion for a word with excessive character
if (excessiveCount > 0) {
multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);
if (!lastCharExceeded && !proximityInfo->existsAdjacentProximityChars(excessivePos)) {
if (DEBUG_DICT_FULL) {
AKLOGI("Double excessive demotion");
}
// If an excessive character is not adjacent to the left char or the right char,
// we will demote this word.
multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);
}
}
const bool performTouchPositionCorrection =
CALIBRATE_SCORE_BY_TOUCH_COORDINATES && proximityInfo->touchPositionCorrectionEnabled()
&& skippedCount == 0 && excessiveCount == 0 && transposedCount == 0;
// Score calibration by touch coordinates is being done only for pure-fat finger typing error
// cases.
int additionalProximityCount = 0;
// TODO: Remove this constraint.
if (performTouchPositionCorrection) {
for (int i = 0; i < outputLength; ++i) {
const int squaredDistance = correction->mDistances[i];
if (i < adjustedProximityMatchedCount) {
multiplyIntCapped(typedLetterMultiplier, &finalFreq);
}
if (squaredDistance >= 0) {
// Promote or demote the score according to the distance from the sweet spot
static const float A = ZERO_DISTANCE_PROMOTION_RATE / 100.0f;
static const float B = 1.0f;
static const float C = 0.5f;
static const float MIN = 0.3f;
static const float R1 = NEUTRAL_SCORE_SQUARED_RADIUS;
static const float R2 = HALF_SCORE_SQUARED_RADIUS;
const float x = (float)squaredDistance
/ ProximityInfo::NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR;
const float factor = max((x < R1)
? (A * (R1 - x) + B * x) / R1
: (B * (R2 - x) + C * (x - R1)) / (R2 - R1), MIN);
// factor is piecewise linear function like:
// A -_ .
// ^-_ .
// B \ .
// \_ .
// C ------------.
// .
// 0 R1 R2 .
multiplyRate((int)(factor * 100), &finalFreq);
} else if (squaredDistance == PROXIMITY_CHAR_WITHOUT_DISTANCE_INFO) {
multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
} else if (squaredDistance == ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO) {
++additionalProximityCount;
multiplyRate(WORDS_WITH_ADDITIONAL_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
}
}
} else {
// Demote additional proximity characters
for (int i = 0; i < outputLength; ++i) {
const int squaredDistance = correction->mDistances[i];
if (squaredDistance == ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO) {
++additionalProximityCount;
}
}
// Promotion for a word with proximity characters
for (int i = 0; i < adjustedProximityMatchedCount; ++i) {
// A word with proximity corrections
if (DEBUG_DICT_FULL) {
AKLOGI("Found a proximity correction.");
}
multiplyIntCapped(typedLetterMultiplier, &finalFreq);
if (i < additionalProximityCount) {
multiplyRate(WORDS_WITH_ADDITIONAL_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
} else {
multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
}
}
}
// If the user types too many(three or more) proximity characters with additional proximity
// character,do not treat as the same length word.
if (sameLength && additionalProximityCount > 0 && (adjustedProximityMatchedCount >= 3
|| transposedCount > 0 || skipped || excessiveCount > 0)) {
sameLength = false;
}
const int errorCount = adjustedProximityMatchedCount > 0
? adjustedProximityMatchedCount
: (proximityMatchedCount + transposedCount);
multiplyRate(
100 - CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE * errorCount / inputLength, &finalFreq);
// Promotion for an exactly matched word
if (ed == 0) {
// Full exact match
if (sameLength && transposedCount == 0 && !skipped && excessiveCount == 0
&& quoteDiffCount == 0 && additionalProximityCount == 0) {
finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
}
}
// Promote a word with no correction
if (proximityMatchedCount == 0 && transposedCount == 0 && !skipped && excessiveCount == 0
&& additionalProximityCount == 0) {
multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
}
// TODO: Check excessive count and transposed count
// TODO: Remove this if possible
/*
If the last character of the user input word is the same as the next character
of the output word, and also all of characters of the user input are matched
to the output word, we'll promote that word a bit because
that word can be considered the combination of skipped and matched characters.
This means that the 'sm' pattern wins over the 'ma' pattern.
e.g.)
shel -> shell [mmmma] or [mmmsm]
hel -> hello [mmmaa] or [mmsma]
m ... matching
s ... skipping
a ... traversing all
t ... transposing
e ... exceeding
p ... proximity matching
*/
if (matchCount == inputLength && matchCount >= 2 && !skipped
&& word[matchCount] == word[matchCount - 1]) {
multiplyRate(WORDS_WITH_MATCH_SKIP_PROMOTION_RATE, &finalFreq);
}
// TODO: Do not use sameLength?
if (sameLength) {
multiplyIntCapped(fullWordMultiplier, &finalFreq);
}
if (useFullEditDistance && outputLength > inputLength + 1) {
const int diff = outputLength - inputLength - 1;
const int divider = diff < 31 ? 1 << diff : S_INT_MAX;
finalFreq = divider > finalFreq ? 1 : finalFreq / divider;
}
if (DEBUG_DICT_FULL) {
AKLOGI("calc: %d, %d", outputLength, sameLength);
}
if (DEBUG_CORRECTION_FREQ
&& (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == inputLength)) {
DUMP_WORD(proximityInfo->getPrimaryInputWord(), inputLength);
DUMP_WORD(correction->mWord, outputLength);
AKLOGI("FinalFreq: [P%d, S%d, T%d, E%d, A%d] %d, %d, %d, %d, %d, %d", proximityMatchedCount,
skippedCount, transposedCount, excessiveCount, additionalProximityCount,
outputLength, lastCharExceeded, sameLength, quoteDiffCount, ed, finalFreq);
}
return finalFreq;
}
/* static */
int Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(
const int *freqArray, const int *wordLengthArray, const int wordCount,
const Correction* correction, const bool isSpaceProximity, const unsigned short *word) {
const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
bool firstCapitalizedWordDemotion = false;
bool secondCapitalizedWordDemotion = false;
{
// TODO: Handle multiple capitalized word demotion properly
const int firstWordLength = wordLengthArray[0];
const int secondWordLength = wordLengthArray[1];
if (firstWordLength >= 2) {
firstCapitalizedWordDemotion = isUpperCase(word[0]);
}
if (secondWordLength >= 2) {
// FIXME: word[firstWordLength + 1] is incorrect.
secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]);
}
}
const bool capitalizedWordDemotion =
firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion;
int totalLength = 0;
int totalFreq = 0;
for (int i = 0; i < wordCount; ++i){
const int wordLength = wordLengthArray[i];
if (wordLength <= 0) {
return 0;
}
totalLength += wordLength;
const int demotionRate = 100 - TWO_WORDS_CORRECTION_DEMOTION_BASE / (wordLength + 1);
int tempFirstFreq = freqArray[i];
multiplyRate(demotionRate, &tempFirstFreq);
totalFreq += tempFirstFreq;
}
if (totalLength <= 0 || totalFreq <= 0) {
return 0;
}
// TODO: Currently totalFreq is adjusted to two word metrix.
// Promote pairFreq with multiplying by 2, because the word length is the same as the typed
// length.
totalFreq = totalFreq * 2 / wordCount;
if (wordCount > 2) {
// Safety net for 3+ words -- Caveats: many heuristics and workarounds here.
int oneLengthCounter = 0;
int twoLengthCounter = 0;
for (int i = 0; i < wordCount; ++i) {
const int wordLength = wordLengthArray[i];
// TODO: Use bigram instead of this safety net
if (i < wordCount - 1) {
const int nextWordLength = wordLengthArray[i + 1];
if (wordLength == 1 && nextWordLength == 2) {
// Safety net to filter 1 length and 2 length sequential words
return 0;
}
}
const int freq = freqArray[i];
// Demote too short weak words
if (wordLength <= 4 && freq <= MAX_FREQ * 2 / 3 /* heuristic... */) {
multiplyRate(100 * freq / MAX_FREQ, &totalFreq);
}
if (wordLength == 1) {
++oneLengthCounter;
} else if (wordLength == 2) {
++twoLengthCounter;
}
if (oneLengthCounter >= 2 || (oneLengthCounter + twoLengthCounter) >= 4) {
// Safety net to filter too many short words
return 0;
}
}
multiplyRate(MULTIPLE_WORDS_DEMOTION_RATE, &totalFreq);
}
// This is a workaround to try offsetting the not-enough-demotion which will be done in
// calcNormalizedScore in Utils.java.
// In calcNormalizedScore the score will be demoted by (1 - 1 / length)
// but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by
// (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))
const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);
multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);
// At this moment, totalFreq is calculated by the following formula:
// (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))
// * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))
multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);
// This is another workaround to offset the demotion which will be done in
// calcNormalizedScore in Utils.java.
// In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote
// the same amount because we already have adjusted the synthetic freq of this "missing or
// mistyped space" suggestion candidate above in this method.
const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);
multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);
if (isSpaceProximity) {
// A word pair with one space proximity correction
if (DEBUG_DICT) {
AKLOGI("Found a word pair with space proximity correction.");
}
multiplyIntCapped(typedLetterMultiplier, &totalFreq);
multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);
}
if (isSpaceProximity) {
multiplyRate(WORDS_WITH_MISTYPED_SPACE_DEMOTION_RATE, &totalFreq);
} else {
multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);
}
if (capitalizedWordDemotion) {
multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq);
}
if (DEBUG_CORRECTION_FREQ) {
AKLOGI("Multiple words (%d, %d) (%d, %d) %d, %d", freqArray[0], freqArray[1],
wordLengthArray[0], wordLengthArray[1], capitalizedWordDemotion, totalFreq);
DUMP_WORD(word, wordLengthArray[0]);
}
return totalFreq;
}
/* Damerau-Levenshtein distance */
inline static int editDistanceInternal(
int* editDistanceTable, const unsigned short* before,
const int beforeLength, const unsigned short* after, const int afterLength) {
// dp[li][lo] dp[a][b] = dp[ a * lo + b]
int* dp = editDistanceTable;
const int li = beforeLength + 1;
const int lo = afterLength + 1;
for (int i = 0; i < li; ++i) {
dp[lo * i] = i;
}
for (int i = 0; i < lo; ++i) {
dp[i] = i;
}
for (int i = 0; i < li - 1; ++i) {
for (int j = 0; j < lo - 1; ++j) {
const uint32_t ci = toBaseLowerCase(before[i]);
const uint32_t co = toBaseLowerCase(after[j]);
const uint16_t cost = (ci == co) ? 0 : 1;
dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1,
min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost));
if (i > 0 && j > 0 && ci == toBaseLowerCase(after[j - 1])
&& co == toBaseLowerCase(before[i - 1])) {
dp[(i + 1) * lo + (j + 1)] = min(
dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost);
}
}
}
if (DEBUG_EDIT_DISTANCE) {
AKLOGI("IN = %d, OUT = %d", beforeLength, afterLength);
for (int i = 0; i < li; ++i) {
for (int j = 0; j < lo; ++j) {
AKLOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]);
}
}
}
return dp[li * lo - 1];
}
int Correction::RankingAlgorithm::editDistance(const unsigned short* before,
const int beforeLength, const unsigned short* after, const int afterLength) {
int table[(beforeLength + 1) * (afterLength + 1)];
return editDistanceInternal(table, before, beforeLength, after, afterLength);
}
// In dictionary.cpp, getSuggestion() method,
// suggestion scores are computed using the below formula.
// original score
// := pow(mTypedLetterMultiplier (this is defined 2),
// (the number of matched characters between typed word and suggested word))
// * (individual word's score which defined in the unigram dictionary,
// and this score is defined in range [0, 255].)
// Then, the following processing is applied.
// - If the dictionary word is matched up to the point of the user entry
// (full match up to min(before.length(), after.length())
// => Then multiply by FULL_MATCHED_WORDS_PROMOTION_RATE (this is defined 1.2)
// - If the word is a true full match except for differences in accents or
// capitalization, then treat it as if the score was 255.
// - If before.length() == after.length()
// => multiply by mFullWordMultiplier (this is defined 2))
// So, maximum original score is pow(2, min(before.length(), after.length())) * 255 * 2 * 1.2
// For historical reasons we ignore the 1.2 modifier (because the measure for a good
// autocorrection threshold was done at a time when it didn't exist). This doesn't change
// the result.
// So, we can normalize original score by dividing pow(2, min(b.l(),a.l())) * 255 * 2.
/* static */
double Correction::RankingAlgorithm::calcNormalizedScore(const unsigned short* before,
const int beforeLength, const unsigned short* after, const int afterLength,
const int score) {
if (0 == beforeLength || 0 == afterLength) {
return 0;
}
const int distance = editDistance(before, beforeLength, after, afterLength);
int spaceCount = 0;
for (int i = 0; i < afterLength; ++i) {
if (after[i] == CODE_SPACE) {
++spaceCount;
}
}
if (spaceCount == afterLength) {
return 0;
}
const double maxScore = score >= S_INT_MAX ? S_INT_MAX : MAX_INITIAL_SCORE
* pow((double)TYPED_LETTER_MULTIPLIER,
(double)min(beforeLength, afterLength - spaceCount)) * FULL_WORD_MULTIPLIER;
// add a weight based on edit distance.
// distance <= max(afterLength, beforeLength) == afterLength,
// so, 0 <= distance / afterLength <= 1
const double weight = 1.0 - (double) distance / afterLength;
return (score / maxScore) * weight;
}
} // namespace latinime