Merge "Suggest words with transposed chars"

This commit is contained in:
satok 2010-12-10 01:38:51 -08:00 committed by Android (Google) Code Review
commit f6a6429e10
3 changed files with 110 additions and 46 deletions

View file

@ -24,13 +24,17 @@
#define LOG_TAG "LatinIME: "
#endif
#define DEBUG_DICT true
#define DEBUG_SHOW_FOUND_WORD false
#define DEBUG_NODE true
#define DEBUG_DICT_FULL true
#define DEBUG_SHOW_FOUND_WORD DEBUG_DICT_FULL
#define DEBUG_NODE DEBUG_DICT_FULL
#define DEBUG_TRACE DEBUG_DICT_FULL
#else // FLAG_DBG
#define LOGI
#define DEBUG_DICT false
#define DEBUG_DICT_FULL false
#define DEBUG_SHOW_FOUND_WORD false
#define DEBUG_NODE false
#define DEBUG_TRACE false
#endif // FLAG_DBG
#ifndef U_SHORT_MAX
@ -58,6 +62,12 @@
#define SUGGEST_WORDS_WITH_MISSING_CHARACTER true
#define SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER true
#define SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER true
#define SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS true
#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 75
#define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 80
#define WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE 75
#define WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE 60
// This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
// This is only used for the size of array. Not to be used in c functions.

View file

@ -36,7 +36,7 @@ UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterM
MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0) {
LOGI("UnigramDictionary - constructor");
if (DEBUG_DICT) LOGI("UnigramDictionary - constructor");
}
UnigramDictionary::~UnigramDictionary() {}
@ -45,26 +45,36 @@ int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short
int *frequencies, int *nextLetters, int nextLettersSize)
{
initSuggestions(codes, codesSize, outWords, frequencies);
getSuggestionCandidates(codesSize, -1, -1, nextLetters, nextLettersSize);
const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
getSuggestionCandidates(codesSize, -1, -1, -1, nextLetters, nextLettersSize, MAX_DEPTH);
// Suggestion with missing character
if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {
for (int i = 0; i < codesSize; ++i) {
if (DEBUG_DICT) LOGI("--- Suggest missing characters %d", i);
getSuggestionCandidates(codesSize, i, -1, NULL, 0);
getSuggestionCandidates(codesSize, i, -1, -1, NULL, 0, MAX_DEPTH);
}
}
// Suggestion with excessive character
if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER) {
if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER && mInputLength > MIN_SUGGEST_DEPTH) {
for (int i = 0; i < codesSize; ++i) {
if (existsAdjacentProximityChars(i, codesSize)) {
if (DEBUG_DICT) LOGI("--- Suggest excessive characters %d", i);
getSuggestionCandidates(codesSize, -1, i, NULL, 0);
getSuggestionCandidates(codesSize, -1, i, -1, NULL, 0, MAX_DEPTH);
}
}
}
// Suggestion with transposed characters
// Only suggest words that length is mInputLength
if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {
for (int i = 0; i < codesSize; ++i) {
if (DEBUG_DICT) LOGI("--- Suggest transposed characters %d", i);
getSuggestionCandidates(codesSize, -1, -1, i, NULL, 0, mInputLength - 1);
}
}
// Suggestions with missing space
if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER && mInputLength > MIN_SUGGEST_DEPTH) {
for (int i = 1; i < codesSize; ++i) {
@ -187,10 +197,13 @@ static const char QUOTE = '\'';
static const char SPACE = ' ';
void UnigramDictionary::getSuggestionCandidates(const int inputLength, const int skipPos,
const int excessivePos, int *nextLetters, const int nextLettersSize) {
if (DEBUG_DICT) LOGI("getSuggestionCandidates");
const int excessivePos, const int transposedPos, int *nextLetters,
const int nextLettersSize, const int maxDepth) {
if (DEBUG_DICT) LOGI("getSuggestionCandidates %d", maxDepth);
if (DEBUG_DICT) assert(transposedPos + 1 < inputLength);
if (DEBUG_DICT) assert(excessivePos < inputLength);
if (DEBUG_DICT) assert(missingPos < inputLength);
int rootPosition = ROOT_POS;
const int MAX_DEPTH = min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
// Get the number of child of root, then increment the position
int childCount = Dictionary::getCount(DICT, &rootPosition);
int depth = 0;
@ -212,12 +225,12 @@ void UnigramDictionary::getSuggestionCandidates(const int inputLength, const int
int diffs = mStackDiffs[depth];
int siblingPos = mStackSiblingPos[depth];
int firstChildPos;
// depth will never be greater than MAX_DEPTH because in that case,
// depth will never be greater than maxDepth because in that case,
// needsToTraverseChildrenNodes should be false
const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth,
MAX_DEPTH, traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos,
nextLetters, nextLettersSize, &childCount, &firstChildPos, &traverseAllNodes,
&snr, &inputIndex, &diffs, &siblingPos);
maxDepth, traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos,
transposedPos, nextLetters, nextLettersSize, &childCount, &firstChildPos,
&traverseAllNodes, &snr, &inputIndex, &diffs, &siblingPos);
// Update next sibling pos
mStackSiblingPos[depth] = siblingPos;
if (needsToTraverseChildrenNodes) {
@ -252,7 +265,7 @@ bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int mi
}
const int secondFreq = getBestWordFreq(missingSpacePos, inputLength - missingSpacePos, mWord);
if (DEBUG_DICT) LOGI("Second freq: %d", secondFreq);
if (DEBUG_DICT) LOGI("Second freq: %d", secondFreq);
if (secondFreq <= 0) return false;
word[missingSpacePos] = SPACE;
@ -262,24 +275,27 @@ bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int mi
int pairFreq = ((firstFreq + secondFreq) / 2);
for (int i = 0; i < inputLength; ++i) pairFreq *= TYPED_LETTER_MULTIPLIER;
pairFreq = pairFreq * WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE / 100;
addWord(word, newWordLength, pairFreq);
return true;
}
// Keep this for comparing spec to new getWords
void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,
const int excessivePos, int *nextLetters, const int nextLettersSize) {
const int excessivePos, const int transposedPos,int *nextLetters,
const int nextLettersSize) {
int initialPosition = initialPos;
const int count = Dictionary::getCount(DICT, &initialPosition);
getWordsRec(count, initialPosition, 0,
min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),
mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, nextLetters, nextLettersSize);
mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, transposedPos, nextLetters,
nextLettersSize);
}
void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,
const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,
const int diffs, const int skipPos, const int excessivePos, int *nextLetters,
const int nextLettersSize) {
const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
int *nextLetters, const int nextLettersSize) {
int siblingPos = pos;
for (int i = 0; i < childrenCount; ++i) {
int newCount;
@ -291,34 +307,50 @@ void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, cons
int newDiffs;
int newSiblingPos;
const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,
traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos, nextLetters,
nextLettersSize,
traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos, transposedPos,
nextLetters, nextLettersSize,
&newCount, &newChildPosition, &newTraverseAllNodes, &newSnr,
&newInputIndex, &newDiffs, &newSiblingPos);
siblingPos = newSiblingPos;
if (needsToTraverseChildrenNodes) {
getWordsRec(newCount, newChildPosition, newDepth, maxDepth, newTraverseAllNodes,
newSnr, newInputIndex, newDiffs, skipPos, excessivePos, nextLetters,
nextLettersSize);
newSnr, newInputIndex, newDiffs, skipPos, excessivePos, transposedPos,
nextLetters, nextLettersSize);
}
}
}
inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLength(
unsigned short *word, const int inputLength, const int depth, const int snr,
int *nextLetters, const int nextLettersSize, const int skipPos, const int freq) {
if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, freq * snr);
int *nextLetters, const int nextLettersSize, const int skipPos, const int excessivePos,
const int transposedPos, const int freq) {
int finalFreq = freq * snr;
// TODO: Demote by edit distance
if (skipPos >= 0) finalFreq = finalFreq * WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE / 100;
if (excessivePos >= 0) finalFreq = finalFreq
* WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE / 100;
if (transposedPos >= 0) finalFreq = finalFreq
* WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE / 100;
if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);
if (depth >= inputLength && skipPos < 0) {
registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);
}
}
inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsSameAsInputLength(
unsigned short *word, const int depth, const int snr, const int skipPos, const int freq,
const int addedWeight) {
unsigned short *word, const int depth, const int snr, const int skipPos,
const int excessivePos, const int transposedPos, const int freq, const int addedWeight) {
if (!sameAsTyped(word, depth + 1)) {
int finalFreq = freq * snr * addedWeight;
// TODO: Demote by edit distance
if (skipPos >= 0) finalFreq = finalFreq * WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE / 100;
if (excessivePos >= 0) finalFreq = finalFreq
* WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE / 100;
if (transposedPos >= 0) finalFreq = finalFreq
* WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE / 100;
// Proximity collection will promote a word of the same length as
// what user typed.
if (skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;
@ -357,16 +389,18 @@ inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex
}
inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,
const unsigned short c, const int skipPos) {
const unsigned short c, const int skipPos, const int excessivePos,
const int transposedPos) {
const unsigned short lowerC = toLowerCase(c);
int j = 0;
while (currentChars[j] > 0 && j < MAX_PROXIMITY_CHARS) {
const bool matched = (currentChars[j] == lowerC || currentChars[j] == c);
// If skipPos is defined, not to search proximity collections.
// First char is what user typed.
// First char is what user typed.
if (matched) {
return j;
} else if (skipPos >= 0) {
} else if (skipPos >= 0 || excessivePos >= 0 || transposedPos >= 0) {
// Not to check proximity characters
return -1;
}
++j;
@ -376,10 +410,17 @@ inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,
inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,
const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,
const int diffs, const int skipPos, const int excessivePos, int *nextLetters,
const int nextLettersSize, int *newCount, int *newChildPosition, bool *newTraverseAllNodes,
int *newSnr, int*newInputIndex, int *newDiffs, int *nextSiblingPosition) {
if (DEBUG_DICT) assert(skipPos < 0 || excessivePos < 0);
const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,
int *nextSiblingPosition) {
if (DEBUG_DICT) {
int inputCount = 0;
if (skipPos >= 0) ++inputCount;
if (excessivePos >= 0) ++inputCount;
if (transposedPos >= 0) ++inputCount;
assert(inputCount <= 1);
}
unsigned short c;
int childPosition;
bool terminal;
@ -397,7 +438,7 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
mWord[depth] = c;
if (traverseAllNodes && terminal) {
onTerminalWhenUserTypedLengthIsGreaterThanInputLength(mWord, mInputLength, depth,
snr, nextLetters, nextLettersSize, skipPos, freq);
snr, nextLetters, nextLettersSize, skipPos, excessivePos, transposedPos, freq);
}
if (!needsToTraverseChildrenNodes) return false;
*newTraverseAllNodes = traverseAllNodes;
@ -406,7 +447,14 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
*newInputIndex = inputIndex;
} else {
int *currentChars = mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS);
int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos);
if (transposedPos >= 0) {
if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;
if (inputIndex == (transposedPos + 1)) currentChars -= MAX_PROXIMITY_CHARS;
}
int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos, excessivePos,
transposedPos);
if (matchedProximityCharId < 0) return false;
mWord[depth] = c;
// If inputIndex is greater than mInputLength, that means there is no
@ -415,13 +463,13 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1;
if (isSameAsUserTypedLength && terminal) {
onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, depth, snr,
skipPos, freq, addedWeight);
skipPos, excessivePos, transposedPos, freq, addedWeight);
}
if (!needsToTraverseChildrenNodes) return false;
// Start traversing all nodes after the index exceeds the user typed length
*newTraverseAllNodes = isSameAsUserTypedLength;
*newSnr = snr * addedWeight;
*newDiffs = diffs + (matchedProximityCharId > 0);
*newDiffs = diffs + ((matchedProximityCharId > 0) ? 1 : 0);
*newInputIndex = inputIndex + 1;
}
// Optimization: Prune out words that are too long compared to how much was typed.

View file

@ -32,7 +32,8 @@ public:
private:
void initSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies);
void getSuggestionCandidates(const int inputLength, const int skipPos, const int excessivePos,
int *nextLetters, const int nextLettersSize);
const int transposedPos, int *nextLetters, const int nextLettersSize,
const int maxDepth);
void getVersionNumber();
bool checkIfDictVersionIsLatest();
int getAddress(int *pos);
@ -43,25 +44,30 @@ private:
unsigned short toLowerCase(unsigned short c);
void getWordsRec(const int childrenCount, const int pos, const int depth, const int maxDepth,
const bool traverseAllNodes, const int snr, const int inputIndex, const int diffs,
const int skipPos, const int excessivePos, int *nextLetters, const int nextLettersSize);
const int skipPos, const int excessivePos, const int transposedPos, int *nextLetters,
const int nextLettersSize);
bool getMissingSpaceWords(const int inputLength, const int missingSpacePos);
// Keep getWordsOld for comparing performance between getWords and getWordsOld
void getWordsOld(const int initialPos, const int inputLength, const int skipPos,
const int excessivePos, int *nextLetters, const int nextLettersSize);
const int excessivePos, const int transposedPos, int *nextLetters,
const int nextLettersSize);
void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize);
void onTerminalWhenUserTypedLengthIsGreaterThanInputLength(unsigned short *word,
const int mInputLength, const int depth, const int snr, int *nextLetters,
const int nextLettersSize, const int skipPos, const int freq);
const int nextLettersSize, const int skipPos, const int excessivePos,
const int transposedPos, const int freq);
void onTerminalWhenUserTypedLengthIsSameAsInputLength(unsigned short *word, const int depth,
const int snr, const int skipPos, const int freq, const int addedWeight);
const int snr, const int skipPos, const int excessivePos, const int transposedPos,
const int freq, const int addedWeight);
bool needsToSkipCurrentNode(const unsigned short c,
const int inputIndex, const int skipPos, const int depth);
int getMatchedProximityId(const int *currentChars, const unsigned short c, const int skipPos);
int getMatchedProximityId(const int *currentChars, const unsigned short c, const int skipPos,
const int excessivePos, const int transposedPos);
// Process a node by considering proximity, missing and excessive character
bool processCurrentNode(const int pos, const int depth,
const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,
const int diffs, const int skipPos, const int excessivePos, int *nextLetters,
const int nextLettersSize, int *newCount, int *newChildPosition,
const int diffs, const int skipPos, const int excessivePos, const int transposedPos,
int *nextLetters, const int nextLettersSize, int *newCount, int *newChildPosition,
bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,
int *nextSiblingPosition);
int getBestWordFreq(const int startInputIndex, const int inputLength, unsigned short *word);