1144 lines
48 KiB
C++
1144 lines
48 KiB
C++
/*
|
|
* Copyright (C) 2011 The Android Open Source Project
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include <cassert>
|
|
#include <cctype>
|
|
#include <cmath>
|
|
#include <cstring>
|
|
|
|
#define LOG_TAG "LatinIME: correction.cpp"
|
|
|
|
#include "char_utils.h"
|
|
#include "correction.h"
|
|
#include "defines.h"
|
|
#include "proximity_info_state.h"
|
|
|
|
namespace latinime {
|
|
|
|
class ProximityInfo;
|
|
|
|
/////////////////////////////
|
|
// edit distance funcitons //
|
|
/////////////////////////////
|
|
|
|
inline static void initEditDistance(int *editDistanceTable) {
|
|
for (int i = 0; i <= MAX_WORD_LENGTH_INTERNAL; ++i) {
|
|
editDistanceTable[i] = i;
|
|
}
|
|
}
|
|
|
|
inline static void dumpEditDistance10ForDebug(int *editDistanceTable,
|
|
const int editDistanceTableWidth, const int outputLength) {
|
|
if (DEBUG_DICT) {
|
|
AKLOGI("EditDistanceTable");
|
|
for (int i = 0; i <= 10; ++i) {
|
|
int c[11];
|
|
for (int j = 0; j <= 10; ++j) {
|
|
if (j < editDistanceTableWidth + 1 && i < outputLength + 1) {
|
|
c[j] = (editDistanceTable + i * (editDistanceTableWidth + 1))[j];
|
|
} else {
|
|
c[j] = -1;
|
|
}
|
|
}
|
|
AKLOGI("[ %d, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d ]",
|
|
c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10]);
|
|
(void)c;
|
|
}
|
|
}
|
|
}
|
|
|
|
inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigned short *input,
|
|
const int inputLength, const unsigned short *output, const int outputLength) {
|
|
// TODO: Make sure that editDistance[0 ~ MAX_WORD_LENGTH_INTERNAL] is not touched.
|
|
// Let dp[i][j] be editDistanceTable[i * (inputLength + 1) + j].
|
|
// Assuming that dp[0][0] ... dp[outputLength - 1][inputLength] are already calculated,
|
|
// and calculate dp[ouputLength][0] ... dp[outputLength][inputLength].
|
|
int *const current = editDistanceTable + outputLength * (inputLength + 1);
|
|
const int *const prev = editDistanceTable + (outputLength - 1) * (inputLength + 1);
|
|
const int *const prevprev =
|
|
outputLength >= 2 ? editDistanceTable + (outputLength - 2) * (inputLength + 1) : 0;
|
|
current[0] = outputLength;
|
|
const uint32_t co = toBaseLowerCase(output[outputLength - 1]);
|
|
const uint32_t prevCO = outputLength >= 2 ? toBaseLowerCase(output[outputLength - 2]) : 0;
|
|
for (int i = 1; i <= inputLength; ++i) {
|
|
const uint32_t ci = toBaseLowerCase(input[i - 1]);
|
|
const uint16_t cost = (ci == co) ? 0 : 1;
|
|
current[i] = min(current[i - 1] + 1, min(prev[i] + 1, prev[i - 1] + cost));
|
|
if (i >= 2 && prevprev && ci == prevCO && co == toBaseLowerCase(input[i - 2])) {
|
|
current[i] = min(current[i], prevprev[i - 2] + 1);
|
|
}
|
|
}
|
|
}
|
|
|
|
inline static int getCurrentEditDistance(int *editDistanceTable, const int editDistanceTableWidth,
|
|
const int outputLength, const int inputLength) {
|
|
if (DEBUG_EDIT_DISTANCE) {
|
|
AKLOGI("getCurrentEditDistance %d, %d", inputLength, outputLength);
|
|
}
|
|
return editDistanceTable[(editDistanceTableWidth + 1) * (outputLength) + inputLength];
|
|
}
|
|
|
|
//////////////////////
|
|
// inline functions //
|
|
//////////////////////
|
|
static const char SINGLE_QUOTE = '\'';
|
|
|
|
inline bool Correction::isSingleQuote(const unsigned short c) {
|
|
const unsigned short userTypedChar = mProximityInfoState.getPrimaryCharAt(mInputIndex);
|
|
return (c == SINGLE_QUOTE && userTypedChar != SINGLE_QUOTE);
|
|
}
|
|
|
|
////////////////
|
|
// Correction //
|
|
////////////////
|
|
|
|
void Correction::resetCorrection() {
|
|
mTotalTraverseCount = 0;
|
|
}
|
|
|
|
void Correction::initCorrection(const ProximityInfo *pi, const int inputLength,
|
|
const int maxDepth) {
|
|
mProximityInfo = pi;
|
|
mInputLength = inputLength;
|
|
mMaxDepth = maxDepth;
|
|
mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
|
|
// TODO: This is not supposed to be required. Check what's going wrong with
|
|
// editDistance[0 ~ MAX_WORD_LENGTH_INTERNAL]
|
|
initEditDistance(mEditDistanceTable);
|
|
}
|
|
|
|
void Correction::initCorrectionState(
|
|
const int rootPos, const int childCount, const bool traverseAll) {
|
|
latinime::initCorrectionState(mCorrectionStates, rootPos, childCount, traverseAll);
|
|
// TODO: remove
|
|
mCorrectionStates[0].mTransposedPos = mTransposedPos;
|
|
mCorrectionStates[0].mExcessivePos = mExcessivePos;
|
|
mCorrectionStates[0].mSkipPos = mSkipPos;
|
|
}
|
|
|
|
void Correction::setCorrectionParams(const int skipPos, const int excessivePos,
|
|
const int transposedPos, const int spaceProximityPos, const int missingSpacePos,
|
|
const bool useFullEditDistance, const bool doAutoCompletion, const int maxErrors) {
|
|
// TODO: remove
|
|
mTransposedPos = transposedPos;
|
|
mExcessivePos = excessivePos;
|
|
mSkipPos = skipPos;
|
|
// TODO: remove
|
|
mCorrectionStates[0].mTransposedPos = transposedPos;
|
|
mCorrectionStates[0].mExcessivePos = excessivePos;
|
|
mCorrectionStates[0].mSkipPos = skipPos;
|
|
|
|
mSpaceProximityPos = spaceProximityPos;
|
|
mMissingSpacePos = missingSpacePos;
|
|
mUseFullEditDistance = useFullEditDistance;
|
|
mDoAutoCompletion = doAutoCompletion;
|
|
mMaxErrors = maxErrors;
|
|
}
|
|
|
|
void Correction::checkState() {
|
|
if (DEBUG_DICT) {
|
|
int inputCount = 0;
|
|
if (mSkipPos >= 0) ++inputCount;
|
|
if (mExcessivePos >= 0) ++inputCount;
|
|
if (mTransposedPos >= 0) ++inputCount;
|
|
}
|
|
}
|
|
|
|
bool Correction::sameAsTyped() {
|
|
return mProximityInfoState.sameAsTyped(mWord, mOutputIndex);
|
|
}
|
|
|
|
int Correction::getFreqForSplitMultipleWords(const int *freqArray, const int *wordLengthArray,
|
|
const int wordCount, const bool isSpaceProximity, const unsigned short *word) {
|
|
return Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(freqArray, wordLengthArray,
|
|
wordCount, this, isSpaceProximity, word);
|
|
}
|
|
|
|
int Correction::getFinalProbability(const int probability, unsigned short **word, int *wordLength) {
|
|
return getFinalProbabilityInternal(probability, word, wordLength, mInputLength);
|
|
}
|
|
|
|
int Correction::getFinalProbabilityForSubQueue(const int probability, unsigned short **word,
|
|
int *wordLength, const int inputLength) {
|
|
return getFinalProbabilityInternal(probability, word, wordLength, inputLength);
|
|
}
|
|
|
|
int Correction::getFinalProbabilityInternal(const int probability, unsigned short **word,
|
|
int *wordLength, const int inputLength) {
|
|
const int outputIndex = mTerminalOutputIndex;
|
|
const int inputIndex = mTerminalInputIndex;
|
|
*wordLength = outputIndex + 1;
|
|
*word = mWord;
|
|
int finalProbability= Correction::RankingAlgorithm::calculateFinalProbability(
|
|
inputIndex, outputIndex, probability, mEditDistanceTable, this, inputLength);
|
|
return finalProbability;
|
|
}
|
|
|
|
bool Correction::initProcessState(const int outputIndex) {
|
|
if (mCorrectionStates[outputIndex].mChildCount <= 0) {
|
|
return false;
|
|
}
|
|
mOutputIndex = outputIndex;
|
|
--(mCorrectionStates[outputIndex].mChildCount);
|
|
mInputIndex = mCorrectionStates[outputIndex].mInputIndex;
|
|
mNeedsToTraverseAllNodes = mCorrectionStates[outputIndex].mNeedsToTraverseAllNodes;
|
|
|
|
mEquivalentCharCount = mCorrectionStates[outputIndex].mEquivalentCharCount;
|
|
mProximityCount = mCorrectionStates[outputIndex].mProximityCount;
|
|
mTransposedCount = mCorrectionStates[outputIndex].mTransposedCount;
|
|
mExcessiveCount = mCorrectionStates[outputIndex].mExcessiveCount;
|
|
mSkippedCount = mCorrectionStates[outputIndex].mSkippedCount;
|
|
mLastCharExceeded = mCorrectionStates[outputIndex].mLastCharExceeded;
|
|
|
|
mTransposedPos = mCorrectionStates[outputIndex].mTransposedPos;
|
|
mExcessivePos = mCorrectionStates[outputIndex].mExcessivePos;
|
|
mSkipPos = mCorrectionStates[outputIndex].mSkipPos;
|
|
|
|
mMatching = false;
|
|
mProximityMatching = false;
|
|
mAdditionalProximityMatching = false;
|
|
mTransposing = false;
|
|
mExceeding = false;
|
|
mSkipping = false;
|
|
|
|
return true;
|
|
}
|
|
|
|
int Correction::goDownTree(
|
|
const int parentIndex, const int childCount, const int firstChildPos) {
|
|
mCorrectionStates[mOutputIndex].mParentIndex = parentIndex;
|
|
mCorrectionStates[mOutputIndex].mChildCount = childCount;
|
|
mCorrectionStates[mOutputIndex].mSiblingPos = firstChildPos;
|
|
return mOutputIndex;
|
|
}
|
|
|
|
// TODO: remove
|
|
int Correction::getInputIndex() const {
|
|
return mInputIndex;
|
|
}
|
|
|
|
void Correction::incrementInputIndex() {
|
|
++mInputIndex;
|
|
}
|
|
|
|
void Correction::incrementOutputIndex() {
|
|
++mOutputIndex;
|
|
mCorrectionStates[mOutputIndex].mParentIndex = mCorrectionStates[mOutputIndex - 1].mParentIndex;
|
|
mCorrectionStates[mOutputIndex].mChildCount = mCorrectionStates[mOutputIndex - 1].mChildCount;
|
|
mCorrectionStates[mOutputIndex].mSiblingPos = mCorrectionStates[mOutputIndex - 1].mSiblingPos;
|
|
mCorrectionStates[mOutputIndex].mInputIndex = mInputIndex;
|
|
mCorrectionStates[mOutputIndex].mNeedsToTraverseAllNodes = mNeedsToTraverseAllNodes;
|
|
|
|
mCorrectionStates[mOutputIndex].mEquivalentCharCount = mEquivalentCharCount;
|
|
mCorrectionStates[mOutputIndex].mProximityCount = mProximityCount;
|
|
mCorrectionStates[mOutputIndex].mTransposedCount = mTransposedCount;
|
|
mCorrectionStates[mOutputIndex].mExcessiveCount = mExcessiveCount;
|
|
mCorrectionStates[mOutputIndex].mSkippedCount = mSkippedCount;
|
|
|
|
mCorrectionStates[mOutputIndex].mSkipPos = mSkipPos;
|
|
mCorrectionStates[mOutputIndex].mTransposedPos = mTransposedPos;
|
|
mCorrectionStates[mOutputIndex].mExcessivePos = mExcessivePos;
|
|
|
|
mCorrectionStates[mOutputIndex].mLastCharExceeded = mLastCharExceeded;
|
|
|
|
mCorrectionStates[mOutputIndex].mMatching = mMatching;
|
|
mCorrectionStates[mOutputIndex].mProximityMatching = mProximityMatching;
|
|
mCorrectionStates[mOutputIndex].mAdditionalProximityMatching = mAdditionalProximityMatching;
|
|
mCorrectionStates[mOutputIndex].mTransposing = mTransposing;
|
|
mCorrectionStates[mOutputIndex].mExceeding = mExceeding;
|
|
mCorrectionStates[mOutputIndex].mSkipping = mSkipping;
|
|
}
|
|
|
|
void Correction::startToTraverseAllNodes() {
|
|
mNeedsToTraverseAllNodes = true;
|
|
}
|
|
|
|
bool Correction::needsToPrune() const {
|
|
// TODO: use edit distance here
|
|
return mOutputIndex - 1 >= mMaxDepth || mProximityCount > mMaxEditDistance
|
|
// Allow one char longer word for missing character
|
|
|| (!mDoAutoCompletion && (mOutputIndex > mInputLength));
|
|
}
|
|
|
|
void Correction::addCharToCurrentWord(const int32_t c) {
|
|
mWord[mOutputIndex] = c;
|
|
const unsigned short *primaryInputWord = mProximityInfoState.getPrimaryInputWord();
|
|
calcEditDistanceOneStep(mEditDistanceTable, primaryInputWord, mInputLength,
|
|
mWord, mOutputIndex + 1);
|
|
}
|
|
|
|
Correction::CorrectionType Correction::processSkipChar(
|
|
const int32_t c, const bool isTerminal, const bool inputIndexIncremented) {
|
|
addCharToCurrentWord(c);
|
|
mTerminalInputIndex = mInputIndex - (inputIndexIncremented ? 1 : 0);
|
|
mTerminalOutputIndex = mOutputIndex;
|
|
if (mNeedsToTraverseAllNodes && isTerminal) {
|
|
incrementOutputIndex();
|
|
return TRAVERSE_ALL_ON_TERMINAL;
|
|
} else {
|
|
incrementOutputIndex();
|
|
return TRAVERSE_ALL_NOT_ON_TERMINAL;
|
|
}
|
|
}
|
|
|
|
Correction::CorrectionType Correction::processUnrelatedCorrectionType() {
|
|
// Needs to set mTerminalInputIndex and mTerminalOutputIndex before returning any CorrectionType
|
|
mTerminalInputIndex = mInputIndex;
|
|
mTerminalOutputIndex = mOutputIndex;
|
|
return UNRELATED;
|
|
}
|
|
|
|
inline bool isEquivalentChar(ProximityType type) {
|
|
return type == EQUIVALENT_CHAR;
|
|
}
|
|
|
|
inline bool isProximityCharOrEquivalentChar(ProximityType type) {
|
|
return type == EQUIVALENT_CHAR || type == NEAR_PROXIMITY_CHAR;
|
|
}
|
|
|
|
Correction::CorrectionType Correction::processCharAndCalcState(
|
|
const int32_t c, const bool isTerminal) {
|
|
const int correctionCount = (mSkippedCount + mExcessiveCount + mTransposedCount);
|
|
if (correctionCount > mMaxErrors) {
|
|
return processUnrelatedCorrectionType();
|
|
}
|
|
|
|
// TODO: Change the limit if we'll allow two or more corrections
|
|
const bool noCorrectionsHappenedSoFar = correctionCount == 0;
|
|
const bool canTryCorrection = noCorrectionsHappenedSoFar;
|
|
int proximityIndex = 0;
|
|
mDistances[mOutputIndex] = NOT_A_DISTANCE;
|
|
|
|
// Skip checking this node
|
|
if (mNeedsToTraverseAllNodes || isSingleQuote(c)) {
|
|
bool incremented = false;
|
|
if (mLastCharExceeded && mInputIndex == mInputLength - 1) {
|
|
// TODO: Do not check the proximity if EditDistance exceeds the threshold
|
|
const ProximityType matchId = mProximityInfoState.getMatchedProximityId(
|
|
mInputIndex, c, true, &proximityIndex);
|
|
if (isEquivalentChar(matchId)) {
|
|
mLastCharExceeded = false;
|
|
--mExcessiveCount;
|
|
mDistances[mOutputIndex] =
|
|
mProximityInfoState.getNormalizedSquaredDistance(mInputIndex, 0);
|
|
} else if (matchId == NEAR_PROXIMITY_CHAR) {
|
|
mLastCharExceeded = false;
|
|
--mExcessiveCount;
|
|
++mProximityCount;
|
|
mDistances[mOutputIndex] = mProximityInfoState.getNormalizedSquaredDistance(
|
|
mInputIndex, proximityIndex);
|
|
}
|
|
if (!isSingleQuote(c)) {
|
|
incrementInputIndex();
|
|
incremented = true;
|
|
}
|
|
}
|
|
return processSkipChar(c, isTerminal, incremented);
|
|
}
|
|
|
|
// Check possible corrections.
|
|
if (mExcessivePos >= 0) {
|
|
if (mExcessiveCount == 0 && mExcessivePos < mOutputIndex) {
|
|
mExcessivePos = mOutputIndex;
|
|
}
|
|
if (mExcessivePos < mInputLength - 1) {
|
|
mExceeding = mExcessivePos == mInputIndex && canTryCorrection;
|
|
}
|
|
}
|
|
|
|
if (mSkipPos >= 0) {
|
|
if (mSkippedCount == 0 && mSkipPos < mOutputIndex) {
|
|
if (DEBUG_DICT) {
|
|
assert(mSkipPos == mOutputIndex - 1);
|
|
}
|
|
mSkipPos = mOutputIndex;
|
|
}
|
|
mSkipping = mSkipPos == mOutputIndex && canTryCorrection;
|
|
}
|
|
|
|
if (mTransposedPos >= 0) {
|
|
if (mTransposedCount == 0 && mTransposedPos < mOutputIndex) {
|
|
mTransposedPos = mOutputIndex;
|
|
}
|
|
if (mTransposedPos < mInputLength - 1) {
|
|
mTransposing = mInputIndex == mTransposedPos && canTryCorrection;
|
|
}
|
|
}
|
|
|
|
bool secondTransposing = false;
|
|
if (mTransposedCount % 2 == 1) {
|
|
if (isEquivalentChar(mProximityInfoState.getMatchedProximityId(
|
|
mInputIndex - 1, c, false))) {
|
|
++mTransposedCount;
|
|
secondTransposing = true;
|
|
} else if (mCorrectionStates[mOutputIndex].mExceeding) {
|
|
--mTransposedCount;
|
|
++mExcessiveCount;
|
|
--mExcessivePos;
|
|
incrementInputIndex();
|
|
} else {
|
|
--mTransposedCount;
|
|
if (DEBUG_CORRECTION
|
|
&& (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength)
|
|
&& (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
|
|
|| MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
|
|
DUMP_WORD(mWord, mOutputIndex);
|
|
AKLOGI("UNRELATED(0): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
|
|
mTransposedCount, mExcessiveCount, c);
|
|
}
|
|
return processUnrelatedCorrectionType();
|
|
}
|
|
}
|
|
|
|
// TODO: Change the limit if we'll allow two or more proximity chars with corrections
|
|
// Work around: When the mMaxErrors is 1, we only allow just one error
|
|
// including proximity correction.
|
|
const bool checkProximityChars = (mMaxErrors > 1)
|
|
? (noCorrectionsHappenedSoFar || mProximityCount == 0)
|
|
: (noCorrectionsHappenedSoFar && mProximityCount == 0);
|
|
|
|
ProximityType matchedProximityCharId = secondTransposing
|
|
? EQUIVALENT_CHAR
|
|
: mProximityInfoState.getMatchedProximityId(
|
|
mInputIndex, c, checkProximityChars, &proximityIndex);
|
|
|
|
if (UNRELATED_CHAR == matchedProximityCharId
|
|
|| ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) {
|
|
if (canTryCorrection && mOutputIndex > 0
|
|
&& mCorrectionStates[mOutputIndex].mProximityMatching
|
|
&& mCorrectionStates[mOutputIndex].mExceeding
|
|
&& isEquivalentChar(mProximityInfoState.getMatchedProximityId(
|
|
mInputIndex, mWord[mOutputIndex - 1], false))) {
|
|
if (DEBUG_CORRECTION
|
|
&& (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength)
|
|
&& (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
|
|
|| MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
|
|
AKLOGI("CONVERSION p->e %c", mWord[mOutputIndex - 1]);
|
|
}
|
|
// Conversion p->e
|
|
// Example:
|
|
// wearth -> earth
|
|
// px -> (E)mmmmm
|
|
++mExcessiveCount;
|
|
--mProximityCount;
|
|
mExcessivePos = mOutputIndex - 1;
|
|
++mInputIndex;
|
|
// Here, we are doing something equivalent to matchedProximityCharId,
|
|
// but we already know that "excessive char correction" just happened
|
|
// so that we just need to check "mProximityCount == 0".
|
|
matchedProximityCharId = mProximityInfoState.getMatchedProximityId(
|
|
mInputIndex, c, mProximityCount == 0, &proximityIndex);
|
|
}
|
|
}
|
|
|
|
if (UNRELATED_CHAR == matchedProximityCharId
|
|
|| ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) {
|
|
if (ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) {
|
|
mAdditionalProximityMatching = true;
|
|
}
|
|
// TODO: Optimize
|
|
// As the current char turned out to be an unrelated char,
|
|
// we will try other correction-types. Please note that mCorrectionStates[mOutputIndex]
|
|
// here refers to the previous state.
|
|
if (mInputIndex < mInputLength - 1 && mOutputIndex > 0 && mTransposedCount > 0
|
|
&& !mCorrectionStates[mOutputIndex].mTransposing
|
|
&& mCorrectionStates[mOutputIndex - 1].mTransposing
|
|
&& isEquivalentChar(mProximityInfoState.getMatchedProximityId(
|
|
mInputIndex, mWord[mOutputIndex - 1], false))
|
|
&& isEquivalentChar(
|
|
mProximityInfoState.getMatchedProximityId(mInputIndex + 1, c, false))) {
|
|
// Conversion t->e
|
|
// Example:
|
|
// occaisional -> occa sional
|
|
// mmmmttx -> mmmm(E)mmmmmm
|
|
mTransposedCount -= 2;
|
|
++mExcessiveCount;
|
|
++mInputIndex;
|
|
} else if (mOutputIndex > 0 && mInputIndex > 0 && mTransposedCount > 0
|
|
&& !mCorrectionStates[mOutputIndex].mTransposing
|
|
&& mCorrectionStates[mOutputIndex - 1].mTransposing
|
|
&& isEquivalentChar(
|
|
mProximityInfoState.getMatchedProximityId(mInputIndex - 1, c, false))) {
|
|
// Conversion t->s
|
|
// Example:
|
|
// chcolate -> chocolate
|
|
// mmttx -> mmsmmmmmm
|
|
mTransposedCount -= 2;
|
|
++mSkippedCount;
|
|
--mInputIndex;
|
|
} else if (canTryCorrection && mInputIndex > 0
|
|
&& mCorrectionStates[mOutputIndex].mProximityMatching
|
|
&& mCorrectionStates[mOutputIndex].mSkipping
|
|
&& isEquivalentChar(
|
|
mProximityInfoState.getMatchedProximityId(mInputIndex - 1, c, false))) {
|
|
// Conversion p->s
|
|
// Note: This logic tries saving cases like contrst --> contrast -- "a" is one of
|
|
// proximity chars of "s", but it should rather be handled as a skipped char.
|
|
++mSkippedCount;
|
|
--mProximityCount;
|
|
return processSkipChar(c, isTerminal, false);
|
|
} else if (mInputIndex - 1 < mInputLength
|
|
&& mSkippedCount > 0
|
|
&& mCorrectionStates[mOutputIndex].mSkipping
|
|
&& mCorrectionStates[mOutputIndex].mAdditionalProximityMatching
|
|
&& isProximityCharOrEquivalentChar(
|
|
mProximityInfoState.getMatchedProximityId(mInputIndex + 1, c, false))) {
|
|
// Conversion s->a
|
|
incrementInputIndex();
|
|
--mSkippedCount;
|
|
mProximityMatching = true;
|
|
++mProximityCount;
|
|
mDistances[mOutputIndex] = ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO;
|
|
} else if ((mExceeding || mTransposing) && mInputIndex - 1 < mInputLength
|
|
&& isEquivalentChar(
|
|
mProximityInfoState.getMatchedProximityId(mInputIndex + 1, c, false))) {
|
|
// 1.2. Excessive or transpose correction
|
|
if (mTransposing) {
|
|
++mTransposedCount;
|
|
} else {
|
|
++mExcessiveCount;
|
|
incrementInputIndex();
|
|
}
|
|
if (DEBUG_CORRECTION
|
|
&& (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength)
|
|
&& (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
|
|
|| MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
|
|
DUMP_WORD(mWord, mOutputIndex);
|
|
if (mTransposing) {
|
|
AKLOGI("TRANSPOSE: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
|
|
mTransposedCount, mExcessiveCount, c);
|
|
} else {
|
|
AKLOGI("EXCEED: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
|
|
mTransposedCount, mExcessiveCount, c);
|
|
}
|
|
}
|
|
} else if (mSkipping) {
|
|
// 3. Skip correction
|
|
++mSkippedCount;
|
|
if (DEBUG_CORRECTION
|
|
&& (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength)
|
|
&& (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
|
|
|| MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
|
|
AKLOGI("SKIP: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
|
|
mTransposedCount, mExcessiveCount, c);
|
|
}
|
|
return processSkipChar(c, isTerminal, false);
|
|
} else if (ADDITIONAL_PROXIMITY_CHAR == matchedProximityCharId) {
|
|
// As a last resort, use additional proximity characters
|
|
mProximityMatching = true;
|
|
++mProximityCount;
|
|
mDistances[mOutputIndex] = ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO;
|
|
if (DEBUG_CORRECTION
|
|
&& (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength)
|
|
&& (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
|
|
|| MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
|
|
AKLOGI("ADDITIONALPROX: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
|
|
mTransposedCount, mExcessiveCount, c);
|
|
}
|
|
} else {
|
|
if (DEBUG_CORRECTION
|
|
&& (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength)
|
|
&& (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
|
|
|| MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
|
|
DUMP_WORD(mWord, mOutputIndex);
|
|
AKLOGI("UNRELATED(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
|
|
mTransposedCount, mExcessiveCount, c);
|
|
}
|
|
return processUnrelatedCorrectionType();
|
|
}
|
|
} else if (secondTransposing) {
|
|
// If inputIndex is greater than mInputLength, that means there is no
|
|
// proximity chars. So, we don't need to check proximity.
|
|
mMatching = true;
|
|
} else if (isEquivalentChar(matchedProximityCharId)) {
|
|
mMatching = true;
|
|
++mEquivalentCharCount;
|
|
mDistances[mOutputIndex] = mProximityInfoState.getNormalizedSquaredDistance(mInputIndex, 0);
|
|
} else if (NEAR_PROXIMITY_CHAR == matchedProximityCharId) {
|
|
mProximityMatching = true;
|
|
++mProximityCount;
|
|
mDistances[mOutputIndex] =
|
|
mProximityInfoState.getNormalizedSquaredDistance(mInputIndex, proximityIndex);
|
|
if (DEBUG_CORRECTION
|
|
&& (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength)
|
|
&& (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0
|
|
|| MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
|
|
AKLOGI("PROX: %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
|
|
mTransposedCount, mExcessiveCount, c);
|
|
}
|
|
}
|
|
|
|
addCharToCurrentWord(c);
|
|
|
|
// 4. Last char excessive correction
|
|
mLastCharExceeded = mExcessiveCount == 0 && mSkippedCount == 0 && mTransposedCount == 0
|
|
&& mProximityCount == 0 && (mInputIndex == mInputLength - 2);
|
|
const bool isSameAsUserTypedLength = (mInputLength == mInputIndex + 1) || mLastCharExceeded;
|
|
if (mLastCharExceeded) {
|
|
++mExcessiveCount;
|
|
}
|
|
|
|
// Start traversing all nodes after the index exceeds the user typed length
|
|
if (isSameAsUserTypedLength) {
|
|
startToTraverseAllNodes();
|
|
}
|
|
|
|
const bool needsToTryOnTerminalForTheLastPossibleExcessiveChar =
|
|
mExceeding && mInputIndex == mInputLength - 2;
|
|
|
|
// Finally, we are ready to go to the next character, the next "virtual node".
|
|
// We should advance the input index.
|
|
// We do this in this branch of the 'if traverseAllNodes' because we are still matching
|
|
// characters to input; the other branch is not matching them but searching for
|
|
// completions, this is why it does not have to do it.
|
|
incrementInputIndex();
|
|
// Also, the next char is one "virtual node" depth more than this char.
|
|
incrementOutputIndex();
|
|
|
|
if ((needsToTryOnTerminalForTheLastPossibleExcessiveChar
|
|
|| isSameAsUserTypedLength) && isTerminal) {
|
|
mTerminalInputIndex = mInputIndex - 1;
|
|
mTerminalOutputIndex = mOutputIndex - 1;
|
|
if (DEBUG_CORRECTION
|
|
&& (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == mInputLength)
|
|
&& (MIN_OUTPUT_INDEX_FOR_DEBUG <= 0 || MIN_OUTPUT_INDEX_FOR_DEBUG < mOutputIndex)) {
|
|
DUMP_WORD(mWord, mOutputIndex);
|
|
AKLOGI("ONTERMINAL(1): %d, %d, %d, %d, %c", mProximityCount, mSkippedCount,
|
|
mTransposedCount, mExcessiveCount, c);
|
|
}
|
|
return ON_TERMINAL;
|
|
} else {
|
|
mTerminalInputIndex = mInputIndex - 1;
|
|
mTerminalOutputIndex = mOutputIndex - 1;
|
|
return NOT_ON_TERMINAL;
|
|
}
|
|
}
|
|
|
|
Correction::~Correction() {
|
|
}
|
|
|
|
inline static int getQuoteCount(const unsigned short *word, const int length) {
|
|
int quoteCount = 0;
|
|
for (int i = 0; i < length; ++i) {
|
|
if(word[i] == '\'') {
|
|
++quoteCount;
|
|
}
|
|
}
|
|
return quoteCount;
|
|
}
|
|
|
|
inline static bool isUpperCase(unsigned short c) {
|
|
return isAsciiUpper(toBaseChar(c));
|
|
}
|
|
|
|
//////////////////////
|
|
// RankingAlgorithm //
|
|
//////////////////////
|
|
|
|
/* static */
|
|
int Correction::RankingAlgorithm::calculateFinalProbability(const int inputIndex,
|
|
const int outputIndex, const int freq, int *editDistanceTable, const Correction *correction,
|
|
const int inputLength) {
|
|
const int excessivePos = correction->getExcessivePos();
|
|
const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
|
|
const int fullWordMultiplier = correction->FULL_WORD_MULTIPLIER;
|
|
const ProximityInfoState *proximityInfoState = &correction->mProximityInfoState;
|
|
const int skippedCount = correction->mSkippedCount;
|
|
const int transposedCount = correction->mTransposedCount / 2;
|
|
const int excessiveCount = correction->mExcessiveCount + correction->mTransposedCount % 2;
|
|
const int proximityMatchedCount = correction->mProximityCount;
|
|
const bool lastCharExceeded = correction->mLastCharExceeded;
|
|
const bool useFullEditDistance = correction->mUseFullEditDistance;
|
|
const int outputLength = outputIndex + 1;
|
|
if (skippedCount >= inputLength || inputLength == 0) {
|
|
return -1;
|
|
}
|
|
|
|
// TODO: find more robust way
|
|
bool sameLength = lastCharExceeded ? (inputLength == inputIndex + 2)
|
|
: (inputLength == inputIndex + 1);
|
|
|
|
// TODO: use mExcessiveCount
|
|
const int matchCount = inputLength - correction->mProximityCount - excessiveCount;
|
|
|
|
const unsigned short *word = correction->mWord;
|
|
const bool skipped = skippedCount > 0;
|
|
|
|
const int quoteDiffCount = max(0, getQuoteCount(word, outputLength)
|
|
- getQuoteCount(proximityInfoState->getPrimaryInputWord(), inputLength));
|
|
|
|
// TODO: Calculate edit distance for transposed and excessive
|
|
int ed = 0;
|
|
if (DEBUG_DICT_FULL) {
|
|
dumpEditDistance10ForDebug(editDistanceTable, correction->mInputLength, outputLength);
|
|
}
|
|
int adjustedProximityMatchedCount = proximityMatchedCount;
|
|
|
|
int finalFreq = freq;
|
|
|
|
if (DEBUG_CORRECTION_FREQ
|
|
&& (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == inputLength)) {
|
|
AKLOGI("FinalFreq0: %d", finalFreq);
|
|
}
|
|
// TODO: Optimize this.
|
|
if (transposedCount > 0 || proximityMatchedCount > 0 || skipped || excessiveCount > 0) {
|
|
ed = getCurrentEditDistance(editDistanceTable, correction->mInputLength, outputLength,
|
|
inputLength) - transposedCount;
|
|
|
|
const int matchWeight = powerIntCapped(typedLetterMultiplier,
|
|
max(inputLength, outputLength) - ed);
|
|
multiplyIntCapped(matchWeight, &finalFreq);
|
|
|
|
// TODO: Demote further if there are two or more excessive chars with longer user input?
|
|
if (inputLength > outputLength) {
|
|
multiplyRate(INPUT_EXCEEDS_OUTPUT_DEMOTION_RATE, &finalFreq);
|
|
}
|
|
|
|
ed = max(0, ed - quoteDiffCount);
|
|
adjustedProximityMatchedCount = min(max(0, ed - (outputLength - inputLength)),
|
|
proximityMatchedCount);
|
|
if (transposedCount <= 0) {
|
|
if (ed == 1 && (inputLength == outputLength - 1 || inputLength == outputLength + 1)) {
|
|
// Promote a word with just one skipped or excessive char
|
|
if (sameLength) {
|
|
multiplyRate(WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_RATE
|
|
+ WORDS_WITH_JUST_ONE_CORRECTION_PROMOTION_MULTIPLIER * outputLength,
|
|
&finalFreq);
|
|
} else {
|
|
multiplyIntCapped(typedLetterMultiplier, &finalFreq);
|
|
}
|
|
} else if (ed == 0) {
|
|
multiplyIntCapped(typedLetterMultiplier, &finalFreq);
|
|
sameLength = true;
|
|
}
|
|
}
|
|
} else {
|
|
const int matchWeight = powerIntCapped(typedLetterMultiplier, matchCount);
|
|
multiplyIntCapped(matchWeight, &finalFreq);
|
|
}
|
|
|
|
if (proximityInfoState->getMatchedProximityId(0, word[0], true) == UNRELATED_CHAR) {
|
|
multiplyRate(FIRST_CHAR_DIFFERENT_DEMOTION_RATE, &finalFreq);
|
|
}
|
|
|
|
///////////////////////////////////////////////
|
|
// Promotion and Demotion for each correction
|
|
|
|
// Demotion for a word with missing character
|
|
if (skipped) {
|
|
const int demotionRate = WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE
|
|
* (10 * inputLength - WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X)
|
|
/ (10 * inputLength
|
|
- WORDS_WITH_MISSING_CHARACTER_DEMOTION_START_POS_10X + 10);
|
|
if (DEBUG_DICT_FULL) {
|
|
AKLOGI("Demotion rate for missing character is %d.", demotionRate);
|
|
}
|
|
multiplyRate(demotionRate, &finalFreq);
|
|
}
|
|
|
|
// Demotion for a word with transposed character
|
|
if (transposedCount > 0) multiplyRate(
|
|
WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
|
|
|
|
// Demotion for a word with excessive character
|
|
if (excessiveCount > 0) {
|
|
multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);
|
|
if (!lastCharExceeded && !proximityInfoState->existsAdjacentProximityChars(excessivePos)) {
|
|
if (DEBUG_DICT_FULL) {
|
|
AKLOGI("Double excessive demotion");
|
|
}
|
|
// If an excessive character is not adjacent to the left char or the right char,
|
|
// we will demote this word.
|
|
multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);
|
|
}
|
|
}
|
|
|
|
const bool performTouchPositionCorrection =
|
|
CALIBRATE_SCORE_BY_TOUCH_COORDINATES
|
|
&& proximityInfoState->touchPositionCorrectionEnabled()
|
|
&& skippedCount == 0 && excessiveCount == 0 && transposedCount == 0;
|
|
// Score calibration by touch coordinates is being done only for pure-fat finger typing error
|
|
// cases.
|
|
int additionalProximityCount = 0;
|
|
// TODO: Remove this constraint.
|
|
if (performTouchPositionCorrection) {
|
|
for (int i = 0; i < outputLength; ++i) {
|
|
const int squaredDistance = correction->mDistances[i];
|
|
if (i < adjustedProximityMatchedCount) {
|
|
multiplyIntCapped(typedLetterMultiplier, &finalFreq);
|
|
}
|
|
if (squaredDistance >= 0) {
|
|
// Promote or demote the score according to the distance from the sweet spot
|
|
static const float A = ZERO_DISTANCE_PROMOTION_RATE / 100.0f;
|
|
static const float B = 1.0f;
|
|
static const float C = 0.5f;
|
|
static const float MIN = 0.3f;
|
|
static const float R1 = NEUTRAL_SCORE_SQUARED_RADIUS;
|
|
static const float R2 = HALF_SCORE_SQUARED_RADIUS;
|
|
const float x = static_cast<float>(squaredDistance)
|
|
/ ProximityInfoState::NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR;
|
|
const float factor = max((x < R1)
|
|
? (A * (R1 - x) + B * x) / R1
|
|
: (B * (R2 - x) + C * (x - R1)) / (R2 - R1), MIN);
|
|
// factor is piecewise linear function like:
|
|
// A -_ .
|
|
// ^-_ .
|
|
// B \ .
|
|
// \_ .
|
|
// C ------------.
|
|
// .
|
|
// 0 R1 R2 .
|
|
multiplyRate((int)(factor * 100), &finalFreq);
|
|
} else if (squaredDistance == PROXIMITY_CHAR_WITHOUT_DISTANCE_INFO) {
|
|
multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
|
|
} else if (squaredDistance == ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO) {
|
|
++additionalProximityCount;
|
|
multiplyRate(WORDS_WITH_ADDITIONAL_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
|
|
}
|
|
}
|
|
} else {
|
|
// Demote additional proximity characters
|
|
for (int i = 0; i < outputLength; ++i) {
|
|
const int squaredDistance = correction->mDistances[i];
|
|
if (squaredDistance == ADDITIONAL_PROXIMITY_CHAR_DISTANCE_INFO) {
|
|
++additionalProximityCount;
|
|
}
|
|
}
|
|
// Promotion for a word with proximity characters
|
|
for (int i = 0; i < adjustedProximityMatchedCount; ++i) {
|
|
// A word with proximity corrections
|
|
if (DEBUG_DICT_FULL) {
|
|
AKLOGI("Found a proximity correction.");
|
|
}
|
|
multiplyIntCapped(typedLetterMultiplier, &finalFreq);
|
|
if (i < additionalProximityCount) {
|
|
multiplyRate(WORDS_WITH_ADDITIONAL_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
|
|
} else {
|
|
multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &finalFreq);
|
|
}
|
|
}
|
|
}
|
|
|
|
// If the user types too many(three or more) proximity characters with additional proximity
|
|
// character,do not treat as the same length word.
|
|
if (sameLength && additionalProximityCount > 0 && (adjustedProximityMatchedCount >= 3
|
|
|| transposedCount > 0 || skipped || excessiveCount > 0)) {
|
|
sameLength = false;
|
|
}
|
|
|
|
const int errorCount = adjustedProximityMatchedCount > 0
|
|
? adjustedProximityMatchedCount
|
|
: (proximityMatchedCount + transposedCount);
|
|
multiplyRate(
|
|
100 - CORRECTION_COUNT_RATE_DEMOTION_RATE_BASE * errorCount / inputLength, &finalFreq);
|
|
|
|
// Promotion for an exactly matched word
|
|
if (ed == 0) {
|
|
// Full exact match
|
|
if (sameLength && transposedCount == 0 && !skipped && excessiveCount == 0
|
|
&& quoteDiffCount == 0 && additionalProximityCount == 0) {
|
|
finalFreq = capped255MultForFullMatchAccentsOrCapitalizationDifference(finalFreq);
|
|
}
|
|
}
|
|
|
|
// Promote a word with no correction
|
|
if (proximityMatchedCount == 0 && transposedCount == 0 && !skipped && excessiveCount == 0
|
|
&& additionalProximityCount == 0) {
|
|
multiplyRate(FULL_MATCHED_WORDS_PROMOTION_RATE, &finalFreq);
|
|
}
|
|
|
|
// TODO: Check excessive count and transposed count
|
|
// TODO: Remove this if possible
|
|
/*
|
|
If the last character of the user input word is the same as the next character
|
|
of the output word, and also all of characters of the user input are matched
|
|
to the output word, we'll promote that word a bit because
|
|
that word can be considered the combination of skipped and matched characters.
|
|
This means that the 'sm' pattern wins over the 'ma' pattern.
|
|
e.g.)
|
|
shel -> shell [mmmma] or [mmmsm]
|
|
hel -> hello [mmmaa] or [mmsma]
|
|
m ... matching
|
|
s ... skipping
|
|
a ... traversing all
|
|
t ... transposing
|
|
e ... exceeding
|
|
p ... proximity matching
|
|
*/
|
|
if (matchCount == inputLength && matchCount >= 2 && !skipped
|
|
&& word[matchCount] == word[matchCount - 1]) {
|
|
multiplyRate(WORDS_WITH_MATCH_SKIP_PROMOTION_RATE, &finalFreq);
|
|
}
|
|
|
|
// TODO: Do not use sameLength?
|
|
if (sameLength) {
|
|
multiplyIntCapped(fullWordMultiplier, &finalFreq);
|
|
}
|
|
|
|
if (useFullEditDistance && outputLength > inputLength + 1) {
|
|
const int diff = outputLength - inputLength - 1;
|
|
const int divider = diff < 31 ? 1 << diff : S_INT_MAX;
|
|
finalFreq = divider > finalFreq ? 1 : finalFreq / divider;
|
|
}
|
|
|
|
if (DEBUG_DICT_FULL) {
|
|
AKLOGI("calc: %d, %d", outputLength, sameLength);
|
|
}
|
|
|
|
if (DEBUG_CORRECTION_FREQ
|
|
&& (INPUTLENGTH_FOR_DEBUG <= 0 || INPUTLENGTH_FOR_DEBUG == inputLength)) {
|
|
DUMP_WORD(correction->getPrimaryInputWord(), inputLength);
|
|
DUMP_WORD(correction->mWord, outputLength);
|
|
AKLOGI("FinalFreq: [P%d, S%d, T%d, E%d, A%d] %d, %d, %d, %d, %d, %d", proximityMatchedCount,
|
|
skippedCount, transposedCount, excessiveCount, additionalProximityCount,
|
|
outputLength, lastCharExceeded, sameLength, quoteDiffCount, ed, finalFreq);
|
|
}
|
|
|
|
return finalFreq;
|
|
}
|
|
|
|
/* static */
|
|
int Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(
|
|
const int *freqArray, const int *wordLengthArray, const int wordCount,
|
|
const Correction *correction, const bool isSpaceProximity, const unsigned short *word) {
|
|
const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
|
|
|
|
bool firstCapitalizedWordDemotion = false;
|
|
bool secondCapitalizedWordDemotion = false;
|
|
|
|
{
|
|
// TODO: Handle multiple capitalized word demotion properly
|
|
const int firstWordLength = wordLengthArray[0];
|
|
const int secondWordLength = wordLengthArray[1];
|
|
if (firstWordLength >= 2) {
|
|
firstCapitalizedWordDemotion = isUpperCase(word[0]);
|
|
}
|
|
|
|
if (secondWordLength >= 2) {
|
|
// FIXME: word[firstWordLength + 1] is incorrect.
|
|
secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]);
|
|
}
|
|
}
|
|
|
|
|
|
const bool capitalizedWordDemotion =
|
|
firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion;
|
|
|
|
int totalLength = 0;
|
|
int totalFreq = 0;
|
|
for (int i = 0; i < wordCount; ++i){
|
|
const int wordLength = wordLengthArray[i];
|
|
if (wordLength <= 0) {
|
|
return 0;
|
|
}
|
|
totalLength += wordLength;
|
|
const int demotionRate = 100 - TWO_WORDS_CORRECTION_DEMOTION_BASE / (wordLength + 1);
|
|
int tempFirstFreq = freqArray[i];
|
|
multiplyRate(demotionRate, &tempFirstFreq);
|
|
totalFreq += tempFirstFreq;
|
|
}
|
|
|
|
if (totalLength <= 0 || totalFreq <= 0) {
|
|
return 0;
|
|
}
|
|
|
|
// TODO: Currently totalFreq is adjusted to two word metrix.
|
|
// Promote pairFreq with multiplying by 2, because the word length is the same as the typed
|
|
// length.
|
|
totalFreq = totalFreq * 2 / wordCount;
|
|
if (wordCount > 2) {
|
|
// Safety net for 3+ words -- Caveats: many heuristics and workarounds here.
|
|
int oneLengthCounter = 0;
|
|
int twoLengthCounter = 0;
|
|
for (int i = 0; i < wordCount; ++i) {
|
|
const int wordLength = wordLengthArray[i];
|
|
// TODO: Use bigram instead of this safety net
|
|
if (i < wordCount - 1) {
|
|
const int nextWordLength = wordLengthArray[i + 1];
|
|
if (wordLength == 1 && nextWordLength == 2) {
|
|
// Safety net to filter 1 length and 2 length sequential words
|
|
return 0;
|
|
}
|
|
}
|
|
const int freq = freqArray[i];
|
|
// Demote too short weak words
|
|
if (wordLength <= 4 && freq <= SUPPRESS_SHORT_MULTIPLE_WORDS_THRESHOLD_FREQ) {
|
|
multiplyRate(100 * freq / MAX_FREQ, &totalFreq);
|
|
}
|
|
if (wordLength == 1) {
|
|
++oneLengthCounter;
|
|
} else if (wordLength == 2) {
|
|
++twoLengthCounter;
|
|
}
|
|
if (oneLengthCounter >= 2 || (oneLengthCounter + twoLengthCounter) >= 4) {
|
|
// Safety net to filter too many short words
|
|
return 0;
|
|
}
|
|
}
|
|
multiplyRate(MULTIPLE_WORDS_DEMOTION_RATE, &totalFreq);
|
|
}
|
|
|
|
// This is a workaround to try offsetting the not-enough-demotion which will be done in
|
|
// calcNormalizedScore in Utils.java.
|
|
// In calcNormalizedScore the score will be demoted by (1 - 1 / length)
|
|
// but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by
|
|
// (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))
|
|
const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);
|
|
multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);
|
|
|
|
// At this moment, totalFreq is calculated by the following formula:
|
|
// (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))
|
|
// * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))
|
|
|
|
multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);
|
|
|
|
// This is another workaround to offset the demotion which will be done in
|
|
// calcNormalizedScore in Utils.java.
|
|
// In calcNormalizedScore the score will be demoted by (1 - 1 / length) so we have to promote
|
|
// the same amount because we already have adjusted the synthetic freq of this "missing or
|
|
// mistyped space" suggestion candidate above in this method.
|
|
const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);
|
|
multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);
|
|
|
|
if (isSpaceProximity) {
|
|
// A word pair with one space proximity correction
|
|
if (DEBUG_DICT) {
|
|
AKLOGI("Found a word pair with space proximity correction.");
|
|
}
|
|
multiplyIntCapped(typedLetterMultiplier, &totalFreq);
|
|
multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);
|
|
}
|
|
|
|
if (isSpaceProximity) {
|
|
multiplyRate(WORDS_WITH_MISTYPED_SPACE_DEMOTION_RATE, &totalFreq);
|
|
} else {
|
|
multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);
|
|
}
|
|
|
|
if (capitalizedWordDemotion) {
|
|
multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq);
|
|
}
|
|
|
|
if (DEBUG_CORRECTION_FREQ) {
|
|
AKLOGI("Multiple words (%d, %d) (%d, %d) %d, %d", freqArray[0], freqArray[1],
|
|
wordLengthArray[0], wordLengthArray[1], capitalizedWordDemotion, totalFreq);
|
|
DUMP_WORD(word, wordLengthArray[0]);
|
|
}
|
|
|
|
return totalFreq;
|
|
}
|
|
|
|
/* Damerau-Levenshtein distance */
|
|
inline static int editDistanceInternal(
|
|
int *editDistanceTable, const unsigned short *before,
|
|
const int beforeLength, const unsigned short *after, const int afterLength) {
|
|
// dp[li][lo] dp[a][b] = dp[ a * lo + b]
|
|
int *dp = editDistanceTable;
|
|
const int li = beforeLength + 1;
|
|
const int lo = afterLength + 1;
|
|
for (int i = 0; i < li; ++i) {
|
|
dp[lo * i] = i;
|
|
}
|
|
for (int i = 0; i < lo; ++i) {
|
|
dp[i] = i;
|
|
}
|
|
|
|
for (int i = 0; i < li - 1; ++i) {
|
|
for (int j = 0; j < lo - 1; ++j) {
|
|
const uint32_t ci = toBaseLowerCase(before[i]);
|
|
const uint32_t co = toBaseLowerCase(after[j]);
|
|
const uint16_t cost = (ci == co) ? 0 : 1;
|
|
dp[(i + 1) * lo + (j + 1)] = min(dp[i * lo + (j + 1)] + 1,
|
|
min(dp[(i + 1) * lo + j] + 1, dp[i * lo + j] + cost));
|
|
if (i > 0 && j > 0 && ci == toBaseLowerCase(after[j - 1])
|
|
&& co == toBaseLowerCase(before[i - 1])) {
|
|
dp[(i + 1) * lo + (j + 1)] = min(
|
|
dp[(i + 1) * lo + (j + 1)], dp[(i - 1) * lo + (j - 1)] + cost);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (DEBUG_EDIT_DISTANCE) {
|
|
AKLOGI("IN = %d, OUT = %d", beforeLength, afterLength);
|
|
for (int i = 0; i < li; ++i) {
|
|
for (int j = 0; j < lo; ++j) {
|
|
AKLOGI("EDIT[%d][%d], %d", i, j, dp[i * lo + j]);
|
|
}
|
|
}
|
|
}
|
|
return dp[li * lo - 1];
|
|
}
|
|
|
|
int Correction::RankingAlgorithm::editDistance(const unsigned short *before,
|
|
const int beforeLength, const unsigned short *after, const int afterLength) {
|
|
int table[(beforeLength + 1) * (afterLength + 1)];
|
|
return editDistanceInternal(table, before, beforeLength, after, afterLength);
|
|
}
|
|
|
|
|
|
// In dictionary.cpp, getSuggestion() method,
|
|
// suggestion scores are computed using the below formula.
|
|
// original score
|
|
// := powf(mTypedLetterMultiplier (this is defined 2),
|
|
// (the number of matched characters between typed word and suggested word))
|
|
// * (individual word's score which defined in the unigram dictionary,
|
|
// and this score is defined in range [0, 255].)
|
|
// Then, the following processing is applied.
|
|
// - If the dictionary word is matched up to the point of the user entry
|
|
// (full match up to min(before.length(), after.length())
|
|
// => Then multiply by FULL_MATCHED_WORDS_PROMOTION_RATE (this is defined 1.2)
|
|
// - If the word is a true full match except for differences in accents or
|
|
// capitalization, then treat it as if the score was 255.
|
|
// - If before.length() == after.length()
|
|
// => multiply by mFullWordMultiplier (this is defined 2))
|
|
// So, maximum original score is powf(2, min(before.length(), after.length())) * 255 * 2 * 1.2
|
|
// For historical reasons we ignore the 1.2 modifier (because the measure for a good
|
|
// autocorrection threshold was done at a time when it didn't exist). This doesn't change
|
|
// the result.
|
|
// So, we can normalize original score by dividing powf(2, min(b.l(),a.l())) * 255 * 2.
|
|
|
|
/* static */
|
|
float Correction::RankingAlgorithm::calcNormalizedScore(const unsigned short *before,
|
|
const int beforeLength, const unsigned short *after, const int afterLength,
|
|
const int score) {
|
|
if (0 == beforeLength || 0 == afterLength) {
|
|
return 0;
|
|
}
|
|
const int distance = editDistance(before, beforeLength, after, afterLength);
|
|
int spaceCount = 0;
|
|
for (int i = 0; i < afterLength; ++i) {
|
|
if (after[i] == CODE_SPACE) {
|
|
++spaceCount;
|
|
}
|
|
}
|
|
|
|
if (spaceCount == afterLength) {
|
|
return 0;
|
|
}
|
|
|
|
const float maxScore = score >= S_INT_MAX ? S_INT_MAX : MAX_INITIAL_SCORE
|
|
* powf(static_cast<float>(TYPED_LETTER_MULTIPLIER),
|
|
static_cast<float>(min(beforeLength, afterLength - spaceCount)))
|
|
* FULL_WORD_MULTIPLIER;
|
|
|
|
// add a weight based on edit distance.
|
|
// distance <= max(afterLength, beforeLength) == afterLength,
|
|
// so, 0 <= distance / afterLength <= 1
|
|
const float weight = 1.0f - static_cast<float>(distance) / static_cast<float>(afterLength);
|
|
return (score / maxScore) * weight;
|
|
}
|
|
} // namespace latinime
|