Demote words matched with skipped characters according to typed-word length.

Words that matched user input with skipped characters used to be demoted
in BinaryDictionary by a constant factor, and not at all in the dictionaries
implemented in Java code. Because the impact of a skipped character grows as
the word gets shorter, this change implements a demotion that gets stronger
as the typed word gets shorter. The demotion rate (the multiplier applied to
the word's frequency) is (n - 2) / (n - 1), where n is the length of the typed
word, for n >= 2; shorter inputs get a final frequency of 0.
This is implemented for both BinaryDictionary and the Java dictionaries.

Bug: 3340731
Change-Id: I3a18be80a9708981d56a950dc25fe08f018b5b89
main
Jean Chalard 2011-03-03 10:22:10 +09:00
parent 50a4cb403f
commit 07a8406bc1
4 changed files with 47 additions and 16 deletions

View File

@ -230,6 +230,16 @@ public class ExpandableDictionary extends Dictionary {
return (node == null) ? -1 : node.mFrequency; return (node == null) ? -1 : node.mFrequency;
} }
/**
 * Computes the final frequency of a word that matched the input with a skipped character.
 *
 * The base score {@code freq * snr} is scaled by {@code (inputLength - 2) / (inputLength - 1)}
 * using integer arithmetic, so the demotion is stronger for shorter inputs.
 *
 * @param freq the frequency of the matched word
 * @param snr the multiplier applied to the frequency
 * @param inputLength the length of the typed input
 * @return the demoted frequency, or 0 when {@code inputLength} is less than 3
 */
private static int computeSkippedWordFinalFreq(int freq, int snr, int inputLength) {
    // The formula is meaningful from length 2 upward, but it yields 0 at exactly 2,
    // so every input shorter than 3 is short-circuited to the constant.
    if (inputLength < 3) {
        return 0;
    }
    final int baseScore = freq * snr;
    final int numerator = inputLength - 2;
    final int denominator = inputLength - 1;
    return (baseScore * numerator) / denominator;
}
/** /**
* Recursively traverse the tree for words that match the input. Input consists of * Recursively traverse the tree for words that match the input. Input consists of
* a list of arrays. Each item in the list is one input character position. An input * a list of arrays. Each item in the list is one input character position. An input
@ -249,6 +259,7 @@ public class ExpandableDictionary extends Dictionary {
* inputIndex * inputIndex
* @param callback the callback class for adding a word * @param callback the callback class for adding a word
*/ */
// TODO: Share this routine with the native code for BinaryDictionary
protected void getWordsRec(NodeArray roots, final WordComposer codes, final char[] word, protected void getWordsRec(NodeArray roots, final WordComposer codes, final char[] word,
final int depth, boolean completion, int snr, int inputIndex, int skipPos, final int depth, boolean completion, int snr, int inputIndex, int skipPos,
WordCallback callback) { WordCallback callback) {
@ -275,8 +286,14 @@ public class ExpandableDictionary extends Dictionary {
if (completion) { if (completion) {
word[depth] = c; word[depth] = c;
if (terminal) { if (terminal) {
if (!callback.addWord(word, 0, depth + 1, freq * snr, mDicTypeId, final int finalFreq;
DataType.UNIGRAM)) { if (skipPos < 0) {
finalFreq = freq * snr;
} else {
finalFreq = computeSkippedWordFinalFreq(freq, snr, mInputLength);
}
if (!callback.addWord(word, 0, depth + 1, finalFreq, mDicTypeId,
DataType.UNIGRAM)) {
return; return;
} }
} }
@ -307,8 +324,14 @@ public class ExpandableDictionary extends Dictionary {
if (terminal) { if (terminal) {
if (INCLUDE_TYPED_WORD_IF_VALID if (INCLUDE_TYPED_WORD_IF_VALID
|| !same(word, depth + 1, codes.getTypedWord())) { || !same(word, depth + 1, codes.getTypedWord())) {
int finalFreq = freq * snr * addedAttenuation; final int finalFreq;
if (skipPos < 0) finalFreq *= FULL_WORD_FREQ_MULTIPLIER; if (skipPos < 0) {
finalFreq = freq * snr * addedAttenuation
* FULL_WORD_FREQ_MULTIPLIER;
} else {
finalFreq = computeSkippedWordFinalFreq(freq,
snr * addedAttenuation, mInputLength);
}
callback.addWord(word, 0, depth + 1, finalFreq, mDicTypeId, callback.addWord(word, 0, depth + 1, finalFreq, mDicTypeId,
DataType.UNIGRAM); DataType.UNIGRAM);
} }

View File

@ -135,7 +135,7 @@ static void prof_out(void) {
#define SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS true #define SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS true
// The following "rate"s are used as a multiplier before dividing by 100, so they are in percent. // The following "rate"s are used as a multiplier before dividing by 100, so they are in percent.
#define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 75 #define WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE 100
#define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 80 #define WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE 80
#define WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE 75 #define WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE 75
#define WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE 75 #define WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE 75

View File

@ -457,10 +457,17 @@ static inline int capped255MultForFullMatchAccentsOrCapitalizationDifference(con
} }
inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth, inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int depth,
const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos, const int matchWeight, const int skipPos, const int excessivePos, const int transposedPos,
const int freq, const bool sameLength) { const int freq, const bool sameLength) const {
// TODO: Demote by edit distance // TODO: Demote by edit distance
int finalFreq = freq * matchWeight; int finalFreq = freq * matchWeight;
if (skipPos >= 0) multiplyRate(WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE, &finalFreq); if (skipPos >= 0) {
if (mInputLength >= 3) {
multiplyRate(WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE *
(mInputLength - 2) / (mInputLength - 1), &finalFreq);
} else {
finalFreq = 0;
}
}
if (transposedPos >= 0) multiplyRate( if (transposedPos >= 0) multiplyRate(
WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq); WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
if (excessivePos >= 0) { if (excessivePos >= 0) {
@ -514,7 +521,7 @@ inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,
} }
inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex, inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex,
const int inputLength) { const int inputLength) const {
if (inputIndex < 0 || inputIndex >= inputLength) return false; if (inputIndex < 0 || inputIndex >= inputLength) return false;
const int currentChar = *getInputCharsAt(inputIndex); const int currentChar = *getInputCharsAt(inputIndex);
const int leftIndex = inputIndex - 1; const int leftIndex = inputIndex - 1;

View File

@ -71,7 +71,8 @@ private:
const int nextLettersSize); const int nextLettersSize);
void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize); void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize);
int calculateFinalFreq(const int inputIndex, const int depth, const int snr, const int skipPos, int calculateFinalFreq(const int inputIndex, const int depth, const int snr, const int skipPos,
const int excessivePos, const int transposedPos, const int freq, const bool sameLength); const int excessivePos, const int transposedPos, const int freq,
const bool sameLength) const;
void onTerminalWhenUserTypedLengthIsGreaterThanInputLength(unsigned short *word, void onTerminalWhenUserTypedLengthIsGreaterThanInputLength(unsigned short *word,
const int inputIndex, const int depth, const int snr, int *nextLetters, const int inputIndex, const int depth, const int snr, int *nextLetters,
const int nextLettersSize, const int skipPos, const int excessivePos, const int nextLettersSize, const int skipPos, const int excessivePos,
@ -95,8 +96,8 @@ private:
bool processCurrentNodeForExactMatch(const int firstChildPos, bool processCurrentNodeForExactMatch(const int firstChildPos,
const int startInputIndex, const int depth, unsigned short *word, const int startInputIndex, const int depth, unsigned short *word,
int *newChildPosition, int *newCount, bool *newTerminal, int *newFreq, int *siblingPos); int *newChildPosition, int *newCount, bool *newTerminal, int *newFreq, int *siblingPos);
bool existsAdjacentProximityChars(const int inputIndex, const int inputLength); bool existsAdjacentProximityChars(const int inputIndex, const int inputLength) const;
inline const int* getInputCharsAt(const int index) { inline const int* getInputCharsAt(const int index) const {
return mInputCodes + (index * MAX_PROXIMITY_CHARS); return mInputCodes + (index * MAX_PROXIMITY_CHARS);
} }
const unsigned char *DICT; const unsigned char *DICT;