Merge "Fix a bug where we can't suggest words with a missing space if one of the words starts with a capital letter."

main
satok 2010-12-09 04:46:00 -08:00 committed by Android (Google) Code Review
commit 8db866aaa2
3 changed files with 94 additions and 54 deletions

View File

@@ -25,10 +25,12 @@
#endif #endif
#define DEBUG_DICT true #define DEBUG_DICT true
#define DEBUG_SHOW_FOUND_WORD false #define DEBUG_SHOW_FOUND_WORD false
#define DEBUG_NODE true
#else // FLAG_DBG #else // FLAG_DBG
#define LOGI #define LOGI
#define DEBUG_DICT false #define DEBUG_DICT false
#define DEBUG_SHOW_FOUND_WORD false #define DEBUG_SHOW_FOUND_WORD false
#define DEBUG_NODE false
#endif // FLAG_DBG #endif // FLAG_DBG
#ifndef U_SHORT_MAX #ifndef U_SHORT_MAX

View File

@@ -64,7 +64,7 @@ int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short
} }
// Suggestions with missing space // Suggestions with missing space
if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER) { if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER && mInputLength > MIN_SUGGEST_DEPTH) {
for (int i = 1; i < codesSize; ++i) { for (int i = 1; i < codesSize; ++i) {
if (DEBUG_DICT) LOGI("--- Suggest missing space characters %d", i); if (DEBUG_DICT) LOGI("--- Suggest missing space characters %d", i);
getMissingSpaceWords(mInputLength, i); getMissingSpaceWords(mInputLength, i);
@@ -236,33 +236,30 @@ void UnigramDictionary::getSuggestionCandidates(const int inputLength, const int
} }
bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) { bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {
if (missingSpacePos <= 0 || missingSpacePos >= inputLength) return false; if (missingSpacePos <= 0 || missingSpacePos >= inputLength
const int firstFreq = getWordFreq(0, missingSpacePos); || inputLength >= MAX_WORD_LENGTH) return false;
const int secondFreq = getWordFreq(missingSpacePos, inputLength - missingSpacePos);
if (DEBUG_DICT) LOGI("First freq: %d, Second freq: %d", firstFreq, secondFreq);
if (firstFreq <= 0 || secondFreq <= 0) return false;
int pairFreq = (firstFreq + secondFreq) / 2;
for (int i = 0; i < inputLength; ++i) pairFreq *= TYPED_LETTER_MULTIPLIER;
const int newWordLength = inputLength + 1; const int newWordLength = inputLength + 1;
// Allocating variable length array on stack // Allocating variable length array on stack
unsigned short word[newWordLength]; unsigned short word[newWordLength];
int j = 0; const int firstFreq = getBestWordFreq(0, missingSpacePos, mWord);
if (DEBUG_DICT) LOGI("First freq: %d", firstFreq);
if (firstFreq <= 0) return false;
for (int i = 0; i < missingSpacePos; ++i) { for (int i = 0; i < missingSpacePos; ++i) {
// Down-casting word[i] = mWord[i];
if (DEBUG_DICT) {
assert((*(mInputCodes + i * MAX_PROXIMITY_CHARS)) <= U_SHORT_MAX);
}
word[i] = (unsigned short) *(mInputCodes + i * MAX_PROXIMITY_CHARS);
} }
const int secondFreq = getBestWordFreq(missingSpacePos, inputLength - missingSpacePos, mWord);
if (DEBUG_DICT) LOGI("Second freq: %d", secondFreq);
if (secondFreq <= 0) return false;
word[missingSpacePos] = SPACE; word[missingSpacePos] = SPACE;
for (int i = (missingSpacePos + 1); i < newWordLength; ++i) { for (int i = (missingSpacePos + 1); i < newWordLength; ++i) {
// Down-casting word[i] = mWord[i - missingSpacePos - 1];
if (DEBUG_DICT) {
assert((*(mInputCodes + (i - 1) * MAX_PROXIMITY_CHARS)) <= U_SHORT_MAX);
}
word[i] = (unsigned short) *(mInputCodes + (i - 1) * MAX_PROXIMITY_CHARS);
} }
int pairFreq = ((firstFreq + secondFreq) / 2);
for (int i = 0; i < inputLength; ++i) pairFreq *= TYPED_LETTER_MULTIPLIER;
addWord(word, newWordLength, pairFreq); addWord(word, newWordLength, pairFreq);
return true; return true;
} }
@@ -418,48 +415,89 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
return needsToTraverseChildrenNodes; return needsToTraverseChildrenNodes;
} }
inline int UnigramDictionary::getWordFreq(const int startInputIndex, const int inputLength) { inline int UnigramDictionary::getBestWordFreq(const int startInputIndex, const int inputLength,
unsigned short *word) {
int pos = ROOT_POS; int pos = ROOT_POS;
int count = Dictionary::getCount(DICT, &pos); int count = Dictionary::getCount(DICT, &pos);
int freq = 0; int maxFreq = 0;
int depth = 0;
unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];
bool terminal = false; bool terminal = false;
for (int i = 0; i < inputLength; ++i) { mStackChildCount[0] = count;
bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(pos, count, mStackSiblingPos[0] = pos;
startInputIndex + i, &pos, &count, &terminal, &freq);
if (!needsToTraverseChildrenNodes && (i < inputLength - 1)) { while (depth >= 0) {
return 0; if (mStackChildCount[depth] > 0) {
} --mStackChildCount[depth];
} int firstChildPos;
int newFreq;
int siblingPos = mStackSiblingPos[depth];
const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,
startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,
&siblingPos);
mStackSiblingPos[depth] = siblingPos;
if (depth == (inputLength - 1)) {
// Traverse sibling node
if (terminal) { if (terminal) {
return freq; if (newFreq > maxFreq) {
} else { for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];
return 0; if (DEBUG_DICT && DEBUG_NODE) {
char s[inputLength + 1];
for (int i = 0; i < inputLength; ++i) s[i] = word[i];
s[inputLength] = 0;
LOGI("New missing space word found: %d > %d (%s), %d, %d",
newFreq, maxFreq, s, inputLength, depth);
} }
maxFreq = newFreq;
}
}
} else if (needsToTraverseChildrenNodes) {
// Traverse children nodes
++depth;
mStackChildCount[depth] = count;
mStackSiblingPos[depth] = firstChildPos;
}
} else {
// Traverse parent node
--depth;
}
}
word[inputLength] = 0;
return maxFreq;
} }
inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos, inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,
const int count, const int inputIndex, int *newChildPosition, int *newCount, const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,
bool *newTerminal, int *newFreq) { int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {
const int inputIndex = startInputIndex + depth;
const int *currentChars = mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS); const int *currentChars = mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS);
int pos = firstChildPos;
unsigned short c; unsigned short c;
for (int i = 0; i < count; ++i) { *siblingPos = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, firstChildPos, &c,
pos = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, pos, &c,
newChildPosition, newTerminal, newFreq); newChildPosition, newTerminal, newFreq);
const unsigned int inputC = currentChars[0]; const unsigned int inputC = currentChars[0];
if (DEBUG_DICT) assert(inputC <= U_SHORT_MAX);
const unsigned short lowerC = toLowerCase(c); const unsigned short lowerC = toLowerCase(c);
const bool matched = (inputC == lowerC || inputC == c); const bool matched = (inputC == lowerC || inputC == c);
const bool hasChild = *newChildPosition != 0; const bool hasChild = *newChildPosition != 0;
if (matched) { if (matched) {
word[depth] = c;
if (DEBUG_DICT && DEBUG_NODE) {
LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);
if (*newTerminal) LOGI("Terminal %d", *newFreq);
}
if (hasChild) { if (hasChild) {
*newCount = Dictionary::getCount(DICT, newChildPosition); *newCount = Dictionary::getCount(DICT, newChildPosition);
return true; return true;
} else { } else {
return false; return false;
} }
} } else {
} // If this node is not user typed character, this method treats this word as unmatched.
// Thus newTerminal shouldn't be true.
*newTerminal = false;
return false; return false;
} }
}
} // namespace latinime } // namespace latinime

View File

@@ -64,11 +64,11 @@ private:
const int nextLettersSize, int *newCount, int *newChildPosition, const int nextLettersSize, int *newCount, int *newChildPosition,
bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs, bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,
int *nextSiblingPosition); int *nextSiblingPosition);
int getWordFreq(const int startInputIndex, const int inputLength); int getBestWordFreq(const int startInputIndex, const int inputLength, unsigned short *word);
// Process a node by considering missing space // Process a node by considering missing space
bool processCurrentNodeForExactMatch(const int firstChildPos, const int count, bool processCurrentNodeForExactMatch(const int firstChildPos,
const int inputIndex, int *newChildPosition, int *newCount, bool *newTerminal, const int startInputIndex, const int depth, unsigned short *word,
int *newFreq); int *newChildPosition, int *newCount, bool *newTerminal, int *newFreq, int *siblingPos);
const unsigned char *DICT; const unsigned char *DICT;
const int MAX_WORDS; const int MAX_WORDS;