Merge "Fix a bug that We can't suggest words with missing space if one of the words starts with a capitalized character."
commit
8db866aaa2
|
@ -25,10 +25,12 @@
|
||||||
#endif
|
#endif
|
||||||
#define DEBUG_DICT true
|
#define DEBUG_DICT true
|
||||||
#define DEBUG_SHOW_FOUND_WORD false
|
#define DEBUG_SHOW_FOUND_WORD false
|
||||||
|
#define DEBUG_NODE true
|
||||||
#else // FLAG_DBG
|
#else // FLAG_DBG
|
||||||
#define LOGI
|
#define LOGI
|
||||||
#define DEBUG_DICT false
|
#define DEBUG_DICT false
|
||||||
#define DEBUG_SHOW_FOUND_WORD false
|
#define DEBUG_SHOW_FOUND_WORD false
|
||||||
|
#define DEBUG_NODE false
|
||||||
#endif // FLAG_DBG
|
#endif // FLAG_DBG
|
||||||
|
|
||||||
#ifndef U_SHORT_MAX
|
#ifndef U_SHORT_MAX
|
||||||
|
|
|
@ -64,7 +64,7 @@ int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short
|
||||||
}
|
}
|
||||||
|
|
||||||
// Suggestions with missing space
|
// Suggestions with missing space
|
||||||
if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER) {
|
if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER && mInputLength > MIN_SUGGEST_DEPTH) {
|
||||||
for (int i = 1; i < codesSize; ++i) {
|
for (int i = 1; i < codesSize; ++i) {
|
||||||
if (DEBUG_DICT) LOGI("--- Suggest missing space characters %d", i);
|
if (DEBUG_DICT) LOGI("--- Suggest missing space characters %d", i);
|
||||||
getMissingSpaceWords(mInputLength, i);
|
getMissingSpaceWords(mInputLength, i);
|
||||||
|
@ -236,33 +236,30 @@ void UnigramDictionary::getSuggestionCandidates(const int inputLength, const int
|
||||||
}
|
}
|
||||||
|
|
||||||
bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {
|
bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {
|
||||||
if (missingSpacePos <= 0 || missingSpacePos >= inputLength) return false;
|
if (missingSpacePos <= 0 || missingSpacePos >= inputLength
|
||||||
const int firstFreq = getWordFreq(0, missingSpacePos);
|
|| inputLength >= MAX_WORD_LENGTH) return false;
|
||||||
const int secondFreq = getWordFreq(missingSpacePos, inputLength - missingSpacePos);
|
|
||||||
if (DEBUG_DICT) LOGI("First freq: %d, Second freq: %d", firstFreq, secondFreq);
|
|
||||||
|
|
||||||
if (firstFreq <= 0 || secondFreq <= 0) return false;
|
|
||||||
int pairFreq = (firstFreq + secondFreq) / 2;
|
|
||||||
for (int i = 0; i < inputLength; ++i) pairFreq *= TYPED_LETTER_MULTIPLIER;
|
|
||||||
const int newWordLength = inputLength + 1;
|
const int newWordLength = inputLength + 1;
|
||||||
// Allocating variable length array on stack
|
// Allocating variable length array on stack
|
||||||
unsigned short word[newWordLength];
|
unsigned short word[newWordLength];
|
||||||
int j = 0;
|
const int firstFreq = getBestWordFreq(0, missingSpacePos, mWord);
|
||||||
|
if (DEBUG_DICT) LOGI("First freq: %d", firstFreq);
|
||||||
|
if (firstFreq <= 0) return false;
|
||||||
|
|
||||||
for (int i = 0; i < missingSpacePos; ++i) {
|
for (int i = 0; i < missingSpacePos; ++i) {
|
||||||
// Down-casting
|
word[i] = mWord[i];
|
||||||
if (DEBUG_DICT) {
|
|
||||||
assert((*(mInputCodes + i * MAX_PROXIMITY_CHARS)) <= U_SHORT_MAX);
|
|
||||||
}
|
|
||||||
word[i] = (unsigned short) *(mInputCodes + i * MAX_PROXIMITY_CHARS);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const int secondFreq = getBestWordFreq(missingSpacePos, inputLength - missingSpacePos, mWord);
|
||||||
|
if (DEBUG_DICT) LOGI("Second freq: %d", secondFreq);
|
||||||
|
if (secondFreq <= 0) return false;
|
||||||
|
|
||||||
word[missingSpacePos] = SPACE;
|
word[missingSpacePos] = SPACE;
|
||||||
for (int i = (missingSpacePos + 1); i < newWordLength; ++i) {
|
for (int i = (missingSpacePos + 1); i < newWordLength; ++i) {
|
||||||
// Down-casting
|
word[i] = mWord[i - missingSpacePos - 1];
|
||||||
if (DEBUG_DICT) {
|
|
||||||
assert((*(mInputCodes + (i - 1) * MAX_PROXIMITY_CHARS)) <= U_SHORT_MAX);
|
|
||||||
}
|
|
||||||
word[i] = (unsigned short) *(mInputCodes + (i - 1) * MAX_PROXIMITY_CHARS);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int pairFreq = ((firstFreq + secondFreq) / 2);
|
||||||
|
for (int i = 0; i < inputLength; ++i) pairFreq *= TYPED_LETTER_MULTIPLIER;
|
||||||
addWord(word, newWordLength, pairFreq);
|
addWord(word, newWordLength, pairFreq);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -418,48 +415,89 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
|
||||||
return needsToTraverseChildrenNodes;
|
return needsToTraverseChildrenNodes;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline int UnigramDictionary::getWordFreq(const int startInputIndex, const int inputLength) {
|
inline int UnigramDictionary::getBestWordFreq(const int startInputIndex, const int inputLength,
|
||||||
|
unsigned short *word) {
|
||||||
int pos = ROOT_POS;
|
int pos = ROOT_POS;
|
||||||
int count = Dictionary::getCount(DICT, &pos);
|
int count = Dictionary::getCount(DICT, &pos);
|
||||||
int freq = 0;
|
int maxFreq = 0;
|
||||||
|
int depth = 0;
|
||||||
|
unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];
|
||||||
bool terminal = false;
|
bool terminal = false;
|
||||||
|
|
||||||
for (int i = 0; i < inputLength; ++i) {
|
mStackChildCount[0] = count;
|
||||||
bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(pos, count,
|
mStackSiblingPos[0] = pos;
|
||||||
startInputIndex + i, &pos, &count, &terminal, &freq);
|
|
||||||
if (!needsToTraverseChildrenNodes && (i < inputLength - 1)) {
|
while (depth >= 0) {
|
||||||
return 0;
|
if (mStackChildCount[depth] > 0) {
|
||||||
|
--mStackChildCount[depth];
|
||||||
|
int firstChildPos;
|
||||||
|
int newFreq;
|
||||||
|
int siblingPos = mStackSiblingPos[depth];
|
||||||
|
const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,
|
||||||
|
startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,
|
||||||
|
&siblingPos);
|
||||||
|
mStackSiblingPos[depth] = siblingPos;
|
||||||
|
if (depth == (inputLength - 1)) {
|
||||||
|
// Traverse sibling node
|
||||||
|
if (terminal) {
|
||||||
|
if (newFreq > maxFreq) {
|
||||||
|
for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];
|
||||||
|
if (DEBUG_DICT && DEBUG_NODE) {
|
||||||
|
char s[inputLength + 1];
|
||||||
|
for (int i = 0; i < inputLength; ++i) s[i] = word[i];
|
||||||
|
s[inputLength] = 0;
|
||||||
|
LOGI("New missing space word found: %d > %d (%s), %d, %d",
|
||||||
|
newFreq, maxFreq, s, inputLength, depth);
|
||||||
|
}
|
||||||
|
maxFreq = newFreq;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (needsToTraverseChildrenNodes) {
|
||||||
|
// Traverse children nodes
|
||||||
|
++depth;
|
||||||
|
mStackChildCount[depth] = count;
|
||||||
|
mStackSiblingPos[depth] = firstChildPos;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Traverse parent node
|
||||||
|
--depth;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (terminal) {
|
|
||||||
return freq;
|
word[inputLength] = 0;
|
||||||
} else {
|
return maxFreq;
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,
|
inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,
|
||||||
const int count, const int inputIndex, int *newChildPosition, int *newCount,
|
const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,
|
||||||
bool *newTerminal, int *newFreq) {
|
int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {
|
||||||
|
const int inputIndex = startInputIndex + depth;
|
||||||
const int *currentChars = mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS);
|
const int *currentChars = mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS);
|
||||||
int pos = firstChildPos;
|
|
||||||
unsigned short c;
|
unsigned short c;
|
||||||
for (int i = 0; i < count; ++i) {
|
*siblingPos = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, firstChildPos, &c,
|
||||||
pos = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, pos, &c,
|
newChildPosition, newTerminal, newFreq);
|
||||||
newChildPosition, newTerminal, newFreq);
|
const unsigned int inputC = currentChars[0];
|
||||||
const unsigned int inputC = currentChars[0];
|
if (DEBUG_DICT) assert(inputC <= U_SHORT_MAX);
|
||||||
const unsigned short lowerC = toLowerCase(c);
|
const unsigned short lowerC = toLowerCase(c);
|
||||||
const bool matched = (inputC == lowerC || inputC == c);
|
const bool matched = (inputC == lowerC || inputC == c);
|
||||||
const bool hasChild = *newChildPosition != 0;
|
const bool hasChild = *newChildPosition != 0;
|
||||||
if (matched) {
|
if (matched) {
|
||||||
if (hasChild) {
|
word[depth] = c;
|
||||||
*newCount = Dictionary::getCount(DICT, newChildPosition);
|
if (DEBUG_DICT && DEBUG_NODE) {
|
||||||
return true;
|
LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);
|
||||||
} else {
|
if (*newTerminal) LOGI("Terminal %d", *newFreq);
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
if (hasChild) {
|
||||||
|
*newCount = Dictionary::getCount(DICT, newChildPosition);
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// If this node is not user typed character, this method treats this word as unmatched.
|
||||||
|
// Thus newTerminal shouldn't be true.
|
||||||
|
*newTerminal = false;
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
|
|
|
@ -64,11 +64,11 @@ private:
|
||||||
const int nextLettersSize, int *newCount, int *newChildPosition,
|
const int nextLettersSize, int *newCount, int *newChildPosition,
|
||||||
bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,
|
bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,
|
||||||
int *nextSiblingPosition);
|
int *nextSiblingPosition);
|
||||||
int getWordFreq(const int startInputIndex, const int inputLength);
|
int getBestWordFreq(const int startInputIndex, const int inputLength, unsigned short *word);
|
||||||
// Process a node by considering missing space
|
// Process a node by considering missing space
|
||||||
bool processCurrentNodeForExactMatch(const int firstChildPos, const int count,
|
bool processCurrentNodeForExactMatch(const int firstChildPos,
|
||||||
const int inputIndex, int *newChildPosition, int *newCount, bool *newTerminal,
|
const int startInputIndex, const int depth, unsigned short *word,
|
||||||
int *newFreq);
|
int *newChildPosition, int *newCount, bool *newTerminal, int *newFreq, int *siblingPos);
|
||||||
|
|
||||||
const unsigned char *DICT;
|
const unsigned char *DICT;
|
||||||
const int MAX_WORDS;
|
const int MAX_WORDS;
|
||||||
|
|
Loading…
Reference in New Issue