Merge "Suggest words with transposed chars"
This commit is contained in:
commit
f6a6429e10
3 changed files with 110 additions and 46 deletions
|
@ -24,13 +24,17 @@
|
|||
#define LOG_TAG "LatinIME: "
|
||||
#endif
|
||||
#define DEBUG_DICT true
|
||||
#define DEBUG_SHOW_FOUND_WORD false
|
||||
#define DEBUG_NODE true
|
||||
#define DEBUG_DICT_FULL true
|
||||
#define DEBUG_SHOW_FOUND_WORD DEBUG_DICT_FULL
|
||||
#define DEBUG_NODE DEBUG_DICT_FULL
|
||||
#define DEBUG_TRACE DEBUG_DICT_FULL
|
||||
#else // FLAG_DBG
|
||||
#define LOGI
|
||||
#define DEBUG_DICT false
|
||||
#define DEBUG_DICT_FULL false
|
||||
#define DEBUG_SHOW_FOUND_WORD false
|
||||
#define DEBUG_NODE false
|
||||
#define DEBUG_TRACE false
|
||||
#endif // FLAG_DBG
|
||||
|
||||
#ifndef U_SHORT_MAX
|
||||
|
@ -58,6 +62,12 @@
|
|||
#define SUGGEST_WORDS_WITH_MISSING_CHARACTER true
|
||||
#define SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER true
|
||||
#define SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER true
|
||||
#define SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS true
|
||||
|
||||
#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 75
|
||||
#define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 80
|
||||
#define WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE 75
|
||||
#define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 60
|
||||
|
||||
// This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
|
||||
// This is only used for the size of arrays. It is not to be used in C functions.
|
||||
|
|
|
@ -36,7 +36,7 @@ UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterM
|
|||
MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
|
||||
TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
|
||||
ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0) {
|
||||
LOGI("UnigramDictionary - constructor");
|
||||
if (DEBUG_DICT) LOGI("UnigramDictionary - constructor");
|
||||
}
|
||||
|
||||
UnigramDictionary::~UnigramDictionary() {}
|
||||
|
@ -45,26 +45,36 @@ int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short
|
|||
int *frequencies, int *nextLetters, int nextLettersSize)
|
||||
{
|
||||
initSuggestions(codes, codesSize, outWords, frequencies);
|
||||
getSuggestionCandidates(codesSize, -1, -1, nextLetters, nextLettersSize);
|
||||
const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
|
||||
getSuggestionCandidates(codesSize, -1, -1, -1, nextLetters, nextLettersSize, MAX_DEPTH);
|
||||
|
||||
// Suggestion with missing character
|
||||
if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {
|
||||
for (int i = 0; i < codesSize; ++i) {
|
||||
if (DEBUG_DICT) LOGI("--- Suggest missing characters %d", i);
|
||||
getSuggestionCandidates(codesSize, i, -1, NULL, 0);
|
||||
getSuggestionCandidates(codesSize, i, -1, -1, NULL, 0, MAX_DEPTH);
|
||||
}
|
||||
}
|
||||
|
||||
// Suggestion with excessive character
|
||||
if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER) {
|
||||
if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER && mInputLength > MIN_SUGGEST_DEPTH) {
|
||||
for (int i = 0; i < codesSize; ++i) {
|
||||
if (existsAdjacentProximityChars(i, codesSize)) {
|
||||
if (DEBUG_DICT) LOGI("--- Suggest excessive characters %d", i);
|
||||
getSuggestionCandidates(codesSize, -1, i, NULL, 0);
|
||||
getSuggestionCandidates(codesSize, -1, i, -1, NULL, 0, MAX_DEPTH);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Suggestion with transposed characters
|
||||
// Only suggest words whose length is mInputLength
|
||||
if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {
|
||||
for (int i = 0; i < codesSize; ++i) {
|
||||
if (DEBUG_DICT) LOGI("--- Suggest transposed characters %d", i);
|
||||
getSuggestionCandidates(codesSize, -1, -1, i, NULL, 0, mInputLength - 1);
|
||||
}
|
||||
}
|
||||
|
||||
// Suggestions with missing space
|
||||
if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER && mInputLength > MIN_SUGGEST_DEPTH) {
|
||||
for (int i = 1; i < codesSize; ++i) {
|
||||
|
@ -187,10 +197,13 @@ static const char QUOTE = '\'';
|
|||
static const char SPACE = ' ';
|
||||
|
||||
void UnigramDictionary::getSuggestionCandidates(const int inputLength, const int skipPos,
|
||||
const int excessivePos, int *nextLetters, const int nextLettersSize) {
|
||||
if (DEBUG_DICT) LOGI("getSuggestionCandidates");
|
||||
const int excessivePos, const int transposedPos, int *nextLetters,
|
||||
const int nextLettersSize, const int maxDepth) {
|
||||
if (DEBUG_DICT) LOGI("getSuggestionCandidates %d", maxDepth);
|
||||
if (DEBUG_DICT) assert(transposedPos + 1 < inputLength);
|
||||
if (DEBUG_DICT) assert(excessivePos < inputLength);
|
||||
if (DEBUG_DICT) assert(missingPos < inputLength);
|
||||
int rootPosition = ROOT_POS;
|
||||
const int MAX_DEPTH = min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
|
||||
// Get the number of children of the root, then increment the position
|
||||
int childCount = Dictionary::getCount(DICT, &rootPosition);
|
||||
int depth = 0;
|
||||
|
@ -212,12 +225,12 @@ void UnigramDictionary::getSuggestionCandidates(const int inputLength, const int
|
|||
int diffs = mStackDiffs[depth];
|
||||
int siblingPos = mStackSiblingPos[depth];
|
||||
int firstChildPos;
|
||||
// depth will never be greater than MAX_DEPTH because in that case,
|
||||
// depth will never be greater than maxDepth because in that case,
|
||||
// needsToTraverseChildrenNodes should be false
|
||||
const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth,
|
||||
MAX_DEPTH, traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos,
|
||||
nextLetters, nextLettersSize, &childCount, &firstChildPos, &traverseAllNodes,
|
||||
&snr, &inputIndex, &diffs, &siblingPos);
|
||||
maxDepth, traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos,
|
||||
transposedPos, nextLetters, nextLettersSize, &childCount, &firstChildPos,
|
||||
&traverseAllNodes, &snr, &inputIndex, &diffs, &siblingPos);
|
||||
// Update next sibling pos
|
||||
mStackSiblingPos[depth] = siblingPos;
|
||||
if (needsToTraverseChildrenNodes) {
|
||||
|
@ -252,7 +265,7 @@ bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int mi
|
|||
}
|
||||
|
||||
const int secondFreq = getBestWordFreq(missingSpacePos, inputLength - missingSpacePos, mWord);
|
||||
if (DEBUG_DICT) LOGI("Second freq: %d", secondFreq);
|
||||
if (DEBUG_DICT) LOGI("Second freq: %d", secondFreq);
|
||||
if (secondFreq <= 0) return false;
|
||||
|
||||
word[missingSpacePos] = SPACE;
|
||||
|
@ -262,24 +275,27 @@ bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int mi
|
|||
|
||||
int pairFreq = ((firstFreq + secondFreq) / 2);
|
||||
for (int i = 0; i < inputLength; ++i) pairFreq *= TYPED_LETTER_MULTIPLIER;
|
||||
pairFreq = pairFreq * WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE / 100;
|
||||
addWord(word, newWordLength, pairFreq);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Keep this for comparing spec to new getWords
|
||||
void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,
|
||||
const int excessivePos, int *nextLetters, const int nextLettersSize) {
|
||||
const int excessivePos, const int transposedPos,int *nextLetters,
|
||||
const int nextLettersSize) {
|
||||
int initialPosition = initialPos;
|
||||
const int count = Dictionary::getCount(DICT, &initialPosition);
|
||||
getWordsRec(count, initialPosition, 0,
|
||||
min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),
|
||||
mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, nextLetters, nextLettersSize);
|
||||
mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,
|
||||
nextLettersSize);
|
||||
}
|
||||
|
||||
void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,
|
||||
const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,
|
||||
const int diffs, const int skipPos, const int excessivePos, int *nextLetters,
|
||||
const int nextLettersSize) {
|
||||
const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
|
||||
int *nextLetters, const int nextLettersSize) {
|
||||
int siblingPos = pos;
|
||||
for (int i = 0; i < childrenCount; ++i) {
|
||||
int newCount;
|
||||
|
@ -291,34 +307,50 @@ void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, cons
|
|||
int newDiffs;
|
||||
int newSiblingPos;
|
||||
const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,
|
||||
traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos, nextLetters,
|
||||
nextLettersSize,
|
||||
traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos, transposedPos,
|
||||
nextLetters, nextLettersSize,
|
||||
&newCount, &newChildPosition, &newTraverseAllNodes, &newSnr,
|
||||
&newInputIndex, &newDiffs, &newSiblingPos);
|
||||
siblingPos = newSiblingPos;
|
||||
|
||||
if (needsToTraverseChildrenNodes) {
|
||||
getWordsRec(newCount, newChildPosition, newDepth, maxDepth, newTraverseAllNodes,
|
||||
newSnr, newInputIndex, newDiffs, skipPos, excessivePos, nextLetters,
|
||||
nextLettersSize);
|
||||
newSnr, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,
|
||||
nextLetters, nextLettersSize);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLength(
|
||||
unsigned short *word, const int inputLength, const int depth, const int snr,
|
||||
int *nextLetters, const int nextLettersSize, const int skipPos, const int freq) {
|
||||
if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, freq * snr);
|
||||
int *nextLetters, const int nextLettersSize, const int skipPos, const int excessivePos,
|
||||
const int transposedPos, const int freq) {
|
||||
int finalFreq = freq * snr;
|
||||
// TODO: Demote by edit distance
|
||||
if (skipPos >= 0) finalFreq = finalFreq * WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE / 100;
|
||||
if (excessivePos >= 0) finalFreq = finalFreq
|
||||
* WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE / 100;
|
||||
if (transposedPos >= 0) finalFreq = finalFreq
|
||||
* WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE / 100;
|
||||
|
||||
if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);
|
||||
if (depth >= inputLength && skipPos < 0) {
|
||||
registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);
|
||||
}
|
||||
}
|
||||
|
||||
inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsSameAsInputLength(
|
||||
unsigned short *word, const int depth, const int snr, const int skipPos, const int freq,
|
||||
const int addedWeight) {
|
||||
unsigned short *word, const int depth, const int snr, const int skipPos,
|
||||
const int excessivePos, const int transposedPos, const int freq, const int addedWeight) {
|
||||
if (!sameAsTyped(word, depth + 1)) {
|
||||
int finalFreq = freq * snr * addedWeight;
|
||||
// TODO: Demote by edit distance
|
||||
if (skipPos >= 0) finalFreq = finalFreq * WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE / 100;
|
||||
if (excessivePos >= 0) finalFreq = finalFreq
|
||||
* WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE / 100;
|
||||
if (transposedPos >= 0) finalFreq = finalFreq
|
||||
* WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE / 100;
|
||||
|
||||
// Proximity collection will promote a word of the same length as
|
||||
// what user typed.
|
||||
if (skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;
|
||||
|
@ -357,16 +389,18 @@ inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex
|
|||
}
|
||||
|
||||
inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,
|
||||
const unsigned short c, const int skipPos) {
|
||||
const unsigned short c, const int skipPos, const int excessivePos,
|
||||
const int transposedPos) {
|
||||
const unsigned short lowerC = toLowerCase(c);
|
||||
int j = 0;
|
||||
while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {
|
||||
const bool matched = (currentChars[j] == lowerC || currentChars[j] == c);
|
||||
// If skipPos, excessivePos or transposedPos is defined, do not search proximity collections.
|
||||
// First char is what user typed.
|
||||
// First char is what user typed.
|
||||
if (matched) {
|
||||
return j;
|
||||
} else if (skipPos >= 0) {
|
||||
} else if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) {
|
||||
// Do not check proximity characters
|
||||
return -1;
|
||||
}
|
||||
++j;
|
||||
|
@ -376,10 +410,17 @@ inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,
|
|||
|
||||
inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,
|
||||
const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,
|
||||
const int diffs, const int skipPos, const int excessivePos, int *nextLetters,
|
||||
const int nextLettersSize, int *newCount, int *newChildPosition, bool *newTraverseAllNodes,
|
||||
int *newSnr, int*newInputIndex, int *newDiffs, int *nextSiblingPosition) {
|
||||
if (DEBUG_DICT) assert(skipPos < 0 || excessivePos < 0);
|
||||
const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
|
||||
int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
|
||||
bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,
|
||||
int *nextSiblingPosition) {
|
||||
if (DEBUG_DICT) {
|
||||
int inputCount = 0;
|
||||
if (skipPos >= 0) ++inputCount;
|
||||
if (excessivePos >= 0) ++inputCount;
|
||||
if (transposedPos >= 0) ++inputCount;
|
||||
assert(inputCount <= 1);
|
||||
}
|
||||
unsigned short c;
|
||||
int childPosition;
|
||||
bool terminal;
|
||||
|
@ -397,7 +438,7 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
|
|||
mWord[depth] = c;
|
||||
if (traverseAllNodes && terminal) {
|
||||
onTerminalWhenUserTypedLengthIsGreaterThanInputLength(mWord, mInputLength, depth,
|
||||
snr, nextLetters, nextLettersSize, skipPos, freq);
|
||||
snr, nextLetters, nextLettersSize, skipPos, excessivePos, transposedPos, freq);
|
||||
}
|
||||
if (!needsToTraverseChildrenNodes) return false;
|
||||
*newTraverseAllNodes = traverseAllNodes;
|
||||
|
@ -406,7 +447,14 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
|
|||
*newInputIndex = inputIndex;
|
||||
} else {
|
||||
int *currentChars = mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS);
|
||||
int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos);
|
||||
|
||||
if (transposedPos >= 0) {
|
||||
if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;
|
||||
if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;
|
||||
}
|
||||
|
||||
int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,
|
||||
transposedPos);
|
||||
if (matchedProximityCharId < 0) return false;
|
||||
mWord[depth] = c;
|
||||
// If inputIndex is greater than mInputLength, that means there is no
|
||||
|
@ -415,13 +463,13 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
|
|||
const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1;
|
||||
if (isSameAsUserTypedLength && terminal) {
|
||||
onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, depth, snr,
|
||||
skipPos, freq, addedWeight);
|
||||
skipPos, excessivePos, transposedPos, freq, addedWeight);
|
||||
}
|
||||
if (!needsToTraverseChildrenNodes) return false;
|
||||
// Start traversing all nodes after the index exceeds the user typed length
|
||||
*newTraverseAllNodes = isSameAsUserTypedLength;
|
||||
*newSnr = snr * addedWeight;
|
||||
*newDiffs = diffs + (matchedProximityCharId > 0);
|
||||
*newDiffs = diffs + ((matchedProximityCharId > 0) ? 1 : 0);
|
||||
*newInputIndex = inputIndex + 1;
|
||||
}
|
||||
// Optimization: Prune out words that are too long compared to how much was typed.
|
||||
|
|
|
@ -32,7 +32,8 @@ public:
|
|||
private:
|
||||
void initSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies);
|
||||
void getSuggestionCandidates(const int inputLength, const int skipPos, const int excessivePos,
|
||||
int *nextLetters, const int nextLettersSize);
|
||||
const int transposedPos, int *nextLetters, const int nextLettersSize,
|
||||
const int maxDepth);
|
||||
void getVersionNumber();
|
||||
bool checkIfDictVersionIsLatest();
|
||||
int getAddress(int *pos);
|
||||
|
@ -43,25 +44,30 @@ private:
|
|||
unsigned short toLowerCase(unsigned short c);
|
||||
void getWordsRec(const int childrenCount, const int pos, const int depth, const int maxDepth,
|
||||
const bool traverseAllNodes, const int snr, const int inputIndex, const int diffs,
|
||||
const int skipPos, const int excessivePos, int *nextLetters, const int nextLettersSize);
|
||||
const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters,
|
||||
const int nextLettersSize);
|
||||
bool getMissingSpaceWords(const int inputLength, const int missingSpacePos);
|
||||
// Keep getWordsOld for comparing performance between getWords and getWordsOld
|
||||
void getWordsOld(const int initialPos, const int inputLength, const int skipPos,
|
||||
const int excessivePos, int *nextLetters, const int nextLettersSize);
|
||||
const int excessivePos, const int transposedPos, int *nextLetters,
|
||||
const int nextLettersSize);
|
||||
void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize);
|
||||
void onTerminalWhenUserTypedLengthIsGreaterThanInputLength(unsigned short *word,
|
||||
const int mInputLength, const int depth, const int snr, int *nextLetters,
|
||||
const int nextLettersSize, const int skipPos, const int freq);
|
||||
const int nextLettersSize, const int skipPos, const int excessivePos,
|
||||
const int transposedPos, const int freq);
|
||||
void onTerminalWhenUserTypedLengthIsSameAsInputLength(unsigned short *word, const int depth,
|
||||
const int snr, const int skipPos, const int freq, const int addedWeight);
|
||||
const int snr, const int skipPos, const int excessivePos, const int transposedPos,
|
||||
const int freq, const int addedWeight);
|
||||
bool needsToSkipCurrentNode(const unsigned short c,
|
||||
const int inputIndex, const int skipPos, const int depth);
|
||||
int getMatchedProximityId(const int *currentChars, const unsigned short c, const int skipPos);
|
||||
int getMatchedProximityId(const int *currentChars, const unsigned short c, const int skipPos,
|
||||
const int excessivePos, const int transposedPos);
|
||||
// Process a node by considering proximity, missing, excessive and transposed characters
|
||||
bool processCurrentNode(const int pos, const int depth,
|
||||
const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,
|
||||
const int diffs, const int skipPos, const int excessivePos, int *nextLetters,
|
||||
const int nextLettersSize, int *newCount, int *newChildPosition,
|
||||
const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
|
||||
int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
|
||||
bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,
|
||||
int *nextSiblingPosition);
|
||||
int getBestWordFreq(const int startInputIndex, const int inputLength, unsigned short *word);
|
||||
|
|
Loading…
Reference in a new issue