2011-07-15 04:49:00 +00:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2011 The Android Open Source Project
|
|
|
|
*
|
|
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
* you may not use this file except in compliance with the License.
|
|
|
|
* You may obtain a copy of the License at
|
|
|
|
*
|
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
*
|
|
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
* See the License for the specific language governing permissions and
|
|
|
|
* limitations under the License.
|
|
|
|
*/
|
|
|
|
|
2011-08-10 05:30:10 +00:00
|
|
|
#ifndef LATINIME_CORRECTION_H
|
|
|
|
#define LATINIME_CORRECTION_H
|
2011-07-15 04:49:00 +00:00
|
|
|
|
|
|
|
#include <stdint.h>
|
2011-08-10 06:44:08 +00:00
|
|
|
#include "correction_state.h"
|
2011-07-15 04:49:00 +00:00
|
|
|
|
|
|
|
#include "defines.h"
|
|
|
|
|
|
|
|
namespace latinime {
|
|
|
|
|
|
|
|
// Forward declaration: this header only uses ProximityInfo through pointers
// (Correction::initCorrection() argument and the mProximityInfo member).
class ProximityInfo;
|
|
|
|
|
2011-08-10 05:30:10 +00:00
|
|
|
class Correction {
|
2012-01-06 03:24:38 +00:00
|
|
|
public:
|
2011-08-03 14:27:32 +00:00
|
|
|
typedef enum {
|
2011-08-04 09:31:57 +00:00
|
|
|
TRAVERSE_ALL_ON_TERMINAL,
|
|
|
|
TRAVERSE_ALL_NOT_ON_TERMINAL,
|
2011-08-03 14:27:32 +00:00
|
|
|
UNRELATED,
|
2011-08-04 09:31:57 +00:00
|
|
|
ON_TERMINAL,
|
|
|
|
NOT_ON_TERMINAL
|
2011-08-10 05:30:10 +00:00
|
|
|
} CorrectionType;
|
2011-08-03 14:27:32 +00:00
|
|
|
|
2012-01-23 07:52:37 +00:00
|
|
|
/////////////////////////
|
|
|
|
// static inline utils //
|
|
|
|
/////////////////////////
|
|
|
|
|
|
|
|
static const int TWO_31ST_DIV_255 = S_INT_MAX / 255;
|
|
|
|
static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(const int num) {
|
|
|
|
return (num < TWO_31ST_DIV_255 ? 255 * num : S_INT_MAX);
|
|
|
|
}
|
|
|
|
|
|
|
|
static const int TWO_31ST_DIV_2 = S_INT_MAX / 2;
|
|
|
|
inline static void multiplyIntCapped(const int multiplier, int *base) {
|
|
|
|
const int temp = *base;
|
|
|
|
if (temp != S_INT_MAX) {
|
|
|
|
// Branch if multiplier == 2 for the optimization
|
|
|
|
if (multiplier == 2) {
|
|
|
|
*base = TWO_31ST_DIV_2 >= temp ? temp << 1 : S_INT_MAX;
|
|
|
|
} else {
|
|
|
|
// TODO: This overflow check gives a wrong answer when, for example,
|
|
|
|
// temp = 2^16 + 1 and multiplier = 2^17 + 1.
|
|
|
|
// Fix this behavior.
|
|
|
|
const int tempRetval = temp * multiplier;
|
|
|
|
*base = tempRetval >= temp ? tempRetval : S_INT_MAX;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
inline static int powerIntCapped(const int base, const int n) {
|
|
|
|
if (n <= 0) return 1;
|
|
|
|
if (base == 2) {
|
|
|
|
return n < 31 ? 1 << n : S_INT_MAX;
|
|
|
|
} else {
|
|
|
|
int ret = base;
|
|
|
|
for (int i = 1; i < n; ++i) multiplyIntCapped(base, &ret);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
inline static void multiplyRate(const int rate, int *freq) {
|
|
|
|
if (*freq != S_INT_MAX) {
|
|
|
|
if (*freq > 1000000) {
|
|
|
|
*freq /= 100;
|
|
|
|
multiplyIntCapped(rate, freq);
|
|
|
|
} else {
|
|
|
|
multiplyIntCapped(rate, freq);
|
|
|
|
*freq /= 100;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-02-06 12:51:31 +00:00
|
|
|
Correction(const int typedLetterMultiplier, const int fullWordMultiplier);
|
2011-08-10 05:30:10 +00:00
|
|
|
void initCorrection(
|
2011-08-04 09:31:57 +00:00
|
|
|
const ProximityInfo *pi, const int inputLength, const int maxWordLength);
|
2011-08-10 06:44:08 +00:00
|
|
|
void initCorrectionState(const int rootPos, const int childCount, const bool traverseAll);
|
|
|
|
|
|
|
|
// TODO: remove
|
2011-08-01 10:35:27 +00:00
|
|
|
void setCorrectionParams(const int skipPos, const int excessivePos, const int transposedPos,
|
2011-12-14 12:38:11 +00:00
|
|
|
const int spaceProximityPos, const int missingSpacePos, const bool useFullEditDistance,
|
2011-12-15 05:53:19 +00:00
|
|
|
const bool doAutoCompletion, const int maxErrors);
|
2011-07-15 04:49:00 +00:00
|
|
|
void checkState();
|
2011-08-10 06:44:08 +00:00
|
|
|
bool initProcessState(const int index);
|
|
|
|
|
2011-08-03 14:27:32 +00:00
|
|
|
int getInputIndex();
|
|
|
|
|
2011-08-10 05:30:10 +00:00
|
|
|
virtual ~Correction();
|
2011-08-04 09:31:57 +00:00
|
|
|
int getSpaceProximityPos() const {
|
|
|
|
return mSpaceProximityPos;
|
|
|
|
}
|
|
|
|
int getMissingSpacePos() const {
|
|
|
|
return mMissingSpacePos;
|
|
|
|
}
|
|
|
|
|
2011-07-15 04:49:00 +00:00
|
|
|
int getSkipPos() const {
|
|
|
|
return mSkipPos;
|
|
|
|
}
|
2011-08-04 09:31:57 +00:00
|
|
|
|
2011-07-15 04:49:00 +00:00
|
|
|
int getExcessivePos() const {
|
|
|
|
return mExcessivePos;
|
|
|
|
}
|
2011-08-04 09:31:57 +00:00
|
|
|
|
2011-07-15 04:49:00 +00:00
|
|
|
int getTransposedPos() const {
|
|
|
|
return mTransposedPos;
|
|
|
|
}
|
2011-08-04 09:31:57 +00:00
|
|
|
|
|
|
|
bool needsToPrune() const;
|
|
|
|
|
2011-10-03 10:21:13 +00:00
|
|
|
int getFreqForSplitTwoWords(
|
2012-01-26 13:49:13 +00:00
|
|
|
const int *freqArray, const int *wordLengthArray, const bool isSpaceProximity,
|
|
|
|
const unsigned short *word);
|
2011-08-04 09:31:57 +00:00
|
|
|
int getFinalFreq(const int freq, unsigned short **word, int* wordLength);
|
2012-01-17 06:58:23 +00:00
|
|
|
int getFinalFreqForSubQueue(const int freq, unsigned short **word, int* wordLength,
|
|
|
|
const int inputLength);
|
2011-08-04 09:31:57 +00:00
|
|
|
|
2011-08-10 05:30:10 +00:00
|
|
|
CorrectionType processCharAndCalcState(const int32_t c, const bool isTerminal);
|
2011-08-01 10:35:27 +00:00
|
|
|
|
2011-08-10 06:44:08 +00:00
|
|
|
/////////////////////////
|
|
|
|
// Tree helper methods
|
|
|
|
int goDownTree(const int parentIndex, const int childCount, const int firstChildPos);
|
|
|
|
|
|
|
|
inline int getTreeSiblingPos(const int index) const {
|
|
|
|
return mCorrectionStates[index].mSiblingPos;
|
|
|
|
}
|
|
|
|
|
|
|
|
inline void setTreeSiblingPos(const int index, const int pos) {
|
|
|
|
mCorrectionStates[index].mSiblingPos = pos;
|
|
|
|
}
|
|
|
|
|
|
|
|
inline int getTreeParentIndex(const int index) const {
|
|
|
|
return mCorrectionStates[index].mParentIndex;
|
|
|
|
}
|
2012-01-06 03:24:38 +00:00
|
|
|
|
2012-01-12 09:44:40 +00:00
|
|
|
class RankingAlgorithm {
|
|
|
|
public:
|
|
|
|
static int calculateFinalFreq(const int inputIndex, const int depth,
|
2012-01-17 06:58:23 +00:00
|
|
|
const int freq, int *editDistanceTable, const Correction* correction,
|
|
|
|
const int inputLength);
|
2012-01-26 13:49:13 +00:00
|
|
|
static int calcFreqForSplitTwoWords(const int *freqArray, const int *wordLengthArray,
|
|
|
|
const Correction* correction, const bool isSpaceProximity,
|
|
|
|
const unsigned short *word);
|
2012-01-12 09:44:40 +00:00
|
|
|
static double calcNormalizedScore(const unsigned short* before, const int beforeLength,
|
|
|
|
const unsigned short* after, const int afterLength, const int score);
|
|
|
|
static int editDistance(const unsigned short* before,
|
|
|
|
const int beforeLength, const unsigned short* after, const int afterLength);
|
|
|
|
private:
|
|
|
|
static const int CODE_SPACE = ' ';
|
|
|
|
static const int MAX_INITIAL_SCORE = 255;
|
|
|
|
static const int TYPED_LETTER_MULTIPLIER = 2;
|
|
|
|
static const int FULL_WORD_MULTIPLIER = 2;
|
|
|
|
};
|
|
|
|
|
2012-01-06 03:24:38 +00:00
|
|
|
private:
|
2011-08-10 13:19:33 +00:00
|
|
|
inline void incrementInputIndex();
|
|
|
|
inline void incrementOutputIndex();
|
|
|
|
inline void startToTraverseAllNodes();
|
|
|
|
inline bool isQuote(const unsigned short c);
|
2011-08-19 13:05:59 +00:00
|
|
|
inline CorrectionType processSkipChar(
|
|
|
|
const int32_t c, const bool isTerminal, const bool inputIndexIncremented);
|
2012-01-16 07:21:21 +00:00
|
|
|
inline CorrectionType processUnrelatedCorrectionType();
|
2011-10-13 06:26:45 +00:00
|
|
|
inline void addCharToCurrentWord(const int32_t c);
|
2012-01-17 06:58:23 +00:00
|
|
|
inline int getFinalFreqInternal(const int freq, unsigned short **word, int* wordLength,
|
|
|
|
const int inputLength);
|
2011-08-04 09:31:57 +00:00
|
|
|
|
2011-08-01 10:35:27 +00:00
|
|
|
const int TYPED_LETTER_MULTIPLIER;
|
|
|
|
const int FULL_WORD_MULTIPLIER;
|
2011-07-15 04:49:00 +00:00
|
|
|
const ProximityInfo *mProximityInfo;
|
2011-08-04 09:31:57 +00:00
|
|
|
|
2011-09-29 09:36:56 +00:00
|
|
|
bool mUseFullEditDistance;
|
2011-12-14 12:38:11 +00:00
|
|
|
bool mDoAutoCompletion;
|
2011-08-04 09:31:57 +00:00
|
|
|
int mMaxEditDistance;
|
|
|
|
int mMaxDepth;
|
2011-07-15 04:49:00 +00:00
|
|
|
int mInputLength;
|
2011-08-01 10:35:27 +00:00
|
|
|
int mSpaceProximityPos;
|
|
|
|
int mMissingSpacePos;
|
2011-08-05 12:21:01 +00:00
|
|
|
int mTerminalInputIndex;
|
|
|
|
int mTerminalOutputIndex;
|
2011-12-15 05:53:19 +00:00
|
|
|
int mMaxErrors;
|
2011-10-06 10:12:20 +00:00
|
|
|
|
|
|
|
// The following arrays are state buffer.
|
2011-08-04 09:31:57 +00:00
|
|
|
unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];
|
2011-10-06 10:12:20 +00:00
|
|
|
int mDistances[MAX_WORD_LENGTH_INTERNAL];
|
|
|
|
|
2011-09-16 02:28:54 +00:00
|
|
|
// Edit distance calculation requires a buffer with (N+1)^2 length for the input length N.
|
2011-08-11 16:05:27 +00:00
|
|
|
// Caveat: Do not create multiple tables per thread as this table eats up RAM a lot.
|
2011-09-16 02:28:54 +00:00
|
|
|
int mEditDistanceTable[(MAX_WORD_LENGTH_INTERNAL + 1) * (MAX_WORD_LENGTH_INTERNAL + 1)];
|
2011-08-04 09:31:57 +00:00
|
|
|
|
2011-08-10 06:44:08 +00:00
|
|
|
CorrectionState mCorrectionStates[MAX_WORD_LENGTH_INTERNAL];
|
|
|
|
|
2011-08-10 13:19:33 +00:00
|
|
|
// The following member variables are being used as cache values of the correction state.
|
2011-08-17 08:55:16 +00:00
|
|
|
bool mNeedsToTraverseAllNodes;
|
2011-08-10 13:19:33 +00:00
|
|
|
int mOutputIndex;
|
|
|
|
int mInputIndex;
|
2011-08-17 08:55:16 +00:00
|
|
|
|
2011-10-05 05:55:07 +00:00
|
|
|
int mEquivalentCharCount;
|
2011-08-11 12:25:39 +00:00
|
|
|
int mProximityCount;
|
2011-08-17 08:55:16 +00:00
|
|
|
int mExcessiveCount;
|
|
|
|
int mTransposedCount;
|
2011-08-10 13:19:33 +00:00
|
|
|
int mSkippedCount;
|
2011-08-17 08:55:16 +00:00
|
|
|
|
|
|
|
int mTransposedPos;
|
|
|
|
int mExcessivePos;
|
2011-08-11 07:27:28 +00:00
|
|
|
int mSkipPos;
|
2011-08-17 08:55:16 +00:00
|
|
|
|
|
|
|
bool mLastCharExceeded;
|
|
|
|
|
2011-08-10 13:19:33 +00:00
|
|
|
bool mMatching;
|
2011-08-11 16:05:27 +00:00
|
|
|
bool mProximityMatching;
|
2012-02-02 09:49:22 +00:00
|
|
|
bool mAdditionalProximityMatching;
|
2011-08-17 08:55:16 +00:00
|
|
|
bool mExceeding;
|
|
|
|
bool mTransposing;
|
|
|
|
bool mSkipping;
|
2011-08-02 17:19:44 +00:00
|
|
|
|
2011-07-15 04:49:00 +00:00
|
|
|
};
|
|
|
|
} // namespace latinime
|
2011-08-10 05:30:10 +00:00
|
|
|
#endif // LATINIME_CORRECTION_H
|