Merge "Suggest words with missing space"

This commit is contained in:
satok 2010-12-08 21:31:59 -08:00 committed by Android (Google) Code Review
commit 71890a78b0
5 changed files with 164 additions and 84 deletions

View file

@ -30,8 +30,8 @@ BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength,
: DICT(dict), MAX_WORD_LENGTH(maxWordLength), : DICT(dict), MAX_WORD_LENGTH(maxWordLength),
MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion), MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion),
HAS_BIGRAM(hasBigram), mParentDictionary(parentDictionary) { HAS_BIGRAM(hasBigram), mParentDictionary(parentDictionary) {
LOGI("BigramDictionary - constructor"); if (DEBUG_DICT) LOGI("BigramDictionary - constructor");
LOGI("Has Bigram : %d \n", hasBigram); if (DEBUG_DICT) LOGI("Has Bigram : %d \n", hasBigram);
} }
BigramDictionary::~BigramDictionary() { BigramDictionary::~BigramDictionary() {
@ -54,7 +54,7 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ
} }
insertAt++; insertAt++;
} }
LOGI("Bigram: InsertAt -> %d maxBigrams: %d\n", insertAt, mMaxBigrams); if (DEBUG_DICT) LOGI("Bigram: InsertAt -> %d maxBigrams: %d\n", insertAt, mMaxBigrams);
if (insertAt < mMaxBigrams) { if (insertAt < mMaxBigrams) {
memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]), memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]),
(char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]), (char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]),
@ -107,7 +107,7 @@ int BigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, i
if (HAS_BIGRAM && IS_LATEST_DICT_VERSION) { if (HAS_BIGRAM && IS_LATEST_DICT_VERSION) {
int pos = mParentDictionary->isValidWordRec( int pos = mParentDictionary->isValidWordRec(
DICTIONARY_HEADER_SIZE, prevWord, 0, prevWordLength); DICTIONARY_HEADER_SIZE, prevWord, 0, prevWordLength);
LOGI("Pos -> %d\n", pos); if (DEBUG_DICT) LOGI("Pos -> %d\n", pos);
if (pos < 0) { if (pos < 0) {
return 0; return 0;
} }
@ -151,7 +151,7 @@ void BigramDictionary::searchForTerminalNode(int addressLookingFor, int frequenc
} }
pos = followDownBranchAddress; // pos start at count pos = followDownBranchAddress; // pos start at count
int count = DICT[pos] & 0xFF; int count = DICT[pos] & 0xFF;
LOGI("count - %d\n",count); if (DEBUG_DICT) LOGI("count - %d\n",count);
pos++; pos++;
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
// pos at data // pos at data
@ -225,7 +225,7 @@ void BigramDictionary::searchForTerminalNode(int addressLookingFor, int frequenc
} }
depth++; depth++;
if (followDownBranchAddress == 0) { if (followDownBranchAddress == 0) {
LOGI("ERROR!!! Cannot find bigram!!"); if (DEBUG_DICT) LOGI("ERROR!!! Cannot find bigram!!");
break; break;
} }
} }

View file

@ -31,6 +31,10 @@
#define DEBUG_SHOW_FOUND_WORD false #define DEBUG_SHOW_FOUND_WORD false
#endif // FLAG_DBG #endif // FLAG_DBG
#ifndef U_SHORT_MAX
// Largest value that fits in 16 bits (0xFFFF). Used as the upper bound when an
// int character code is narrowed to unsigned short. Fully parenthesized so the
// macro expands safely inside any surrounding expression.
#define U_SHORT_MAX ((1 << 16) - 1)
#endif
// 22-bit address = ~4MB dictionary size limit, which on average would be about 200k-300k words // 22-bit address = ~4MB dictionary size limit, which on average would be about 200k-300k words
#define ADDRESS_MASK 0x3FFFFF #define ADDRESS_MASK 0x3FFFFF
@ -49,9 +53,9 @@
#define DICTIONARY_HEADER_SIZE 2 #define DICTIONARY_HEADER_SIZE 2
#define NOT_VALID_WORD -99 #define NOT_VALID_WORD -99
#define SUGGEST_MISSING_CHARACTERS true #define SUGGEST_WORDS_WITH_MISSING_CHARACTER true
#define SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER true
#define SUGGEST_EXCESSIVE_CHARACTERS true #define SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER true
// This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java // This should be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
// This is only used for the size of array. Not to be used in c functions. // This is only used for the size of array. Not to be used in c functions.
@ -59,6 +63,8 @@
#define MAX_DEPTH_MULTIPLIER 3 #define MAX_DEPTH_MULTIPLIER 3
#define MIN_SUGGEST_DEPTH 2
#define min(a,b) ((a)<(b)?(a):(b)) #define min(a,b) ((a)<(b)?(a):(b))
#endif // LATINIME_DEFINES_H #endif // LATINIME_DEFINES_H

View file

@ -27,20 +27,21 @@ Dictionary::Dictionary(void *dict, int typedLetterMultiplier, int fullWordMultip
int maxWordLength, int maxWords, int maxAlternatives) int maxWordLength, int maxWords, int maxAlternatives)
: DICT((unsigned char*) dict), : DICT((unsigned char*) dict),
// Checks whether it has the latest dictionary or the old dictionary // Checks whether it has the latest dictionary or the old dictionary
IS_LATEST_DICT_VERSION((((unsigned char*) dict)[0] & 0xFF) >= DICTIONARY_VERSION_MIN) IS_LATEST_DICT_VERSION((((unsigned char*) dict)[0] & 0xFF) >= DICTIONARY_VERSION_MIN) {
{ if (DEBUG_DICT) {
if (MAX_WORD_LENGTH_INTERNAL < maxWordLength) { if (MAX_WORD_LENGTH_INTERNAL < maxWordLength) {
LOGI("Max word length (%d) is greater than %d", maxWordLength, MAX_WORD_LENGTH_INTERNAL); LOGI("Max word length (%d) is greater than %d",
} maxWordLength, MAX_WORD_LENGTH_INTERNAL);
LOGI("IN NATIVE SUGGEST Version: %d \n", (DICT[0] & 0xFF)); LOGI("IN NATIVE SUGGEST Version: %d \n", (DICT[0] & 0xFF));
}
}
mUnigramDictionary = new UnigramDictionary(DICT, typedLetterMultiplier, fullWordMultiplier, mUnigramDictionary = new UnigramDictionary(DICT, typedLetterMultiplier, fullWordMultiplier,
maxWordLength, maxWords, maxAlternatives, IS_LATEST_DICT_VERSION); maxWordLength, maxWords, maxAlternatives, IS_LATEST_DICT_VERSION);
mBigramDictionary = new BigramDictionary(DICT, maxWordLength, maxAlternatives, mBigramDictionary = new BigramDictionary(DICT, maxWordLength, maxAlternatives,
IS_LATEST_DICT_VERSION, hasBigram(), this); IS_LATEST_DICT_VERSION, hasBigram(), this);
} }
Dictionary::~Dictionary() Dictionary::~Dictionary() {
{
delete mUnigramDictionary; delete mUnigramDictionary;
delete mBigramDictionary; delete mBigramDictionary;
} }

View file

@ -30,11 +30,12 @@
namespace latinime { namespace latinime {
UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier, UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier,
int fullWordMultiplier, int maxWordLength, int maxWords, int maxAlternatives, int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
const bool isLatestDictVersion) const bool isLatestDictVersion)
: DICT(dict), MAX_WORD_LENGTH(maxWordLength),MAX_WORDS(maxWords), : DICT(dict), MAX_WORD_LENGTH(maxWordLength),MAX_WORDS(maxWords),
MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion), MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier) { TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0) {
LOGI("UnigramDictionary - constructor"); LOGI("UnigramDictionary - constructor");
} }
@ -43,35 +44,37 @@ UnigramDictionary::~UnigramDictionary() {}
int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords, int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords,
int *frequencies, int *nextLetters, int nextLettersSize) int *frequencies, int *nextLetters, int nextLettersSize)
{ {
initSuggestions(codes, codesSize, outWords, frequencies); initSuggestions(codes, codesSize, outWords, frequencies);
getSuggestionCandidates(codesSize, -1, -1, nextLetters, nextLettersSize);
int suggestedWordsCount = getSuggestionCandidates(codesSize, -1, -1, nextLetters, // Suggestion with missing character
nextLettersSize); if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {
// If there aren't sufficient suggestions, search for words by allowing wild cards at
// the different character positions. This feature is not ready for prime-time as we need
// to figure out the best ranking for such words compared to proximity corrections and
// completions.
if (SUGGEST_MISSING_CHARACTERS) {
for (int i = 0; i < codesSize; ++i) { for (int i = 0; i < codesSize; ++i) {
if (DEBUG_DICT) LOGI("--- Suggest missing characters %d", i); if (DEBUG_DICT) LOGI("--- Suggest missing characters %d", i);
const int tempCount = getSuggestionCandidates(codesSize, i, -1, NULL, 0); getSuggestionCandidates(codesSize, i, -1, NULL, 0);
if (tempCount > suggestedWordsCount) {
suggestedWordsCount = tempCount;
}
} }
} }
// Suggest excessive characters // Suggestion with excessive character
if (SUGGEST_EXCESSIVE_CHARACTERS) { if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER) {
for (int i = 0; i < codesSize; ++i) { for (int i = 0; i < codesSize; ++i) {
if (DEBUG_DICT) LOGI("--- Suggest excessive characters %d", i); if (DEBUG_DICT) LOGI("--- Suggest excessive characters %d", i);
const int tempCount = getSuggestionCandidates(codesSize, -1, i, NULL, 0); getSuggestionCandidates(codesSize, -1, i, NULL, 0);
if (tempCount > suggestedWordsCount) {
suggestedWordsCount = tempCount;
} }
} }
// Suggestions with missing space
if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER) {
for (int i = 1; i < codesSize; ++i) {
if (DEBUG_DICT) LOGI("--- Suggest missing space characters %d", i);
getMissingSpaceWords(mInputLength, i);
}
}
// Get the word count
int suggestedWordsCount = 0;
while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
suggestedWordsCount++;
} }
if (DEBUG_DICT) { if (DEBUG_DICT) {
@ -84,6 +87,7 @@ int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short
} }
LOGI("\n"); LOGI("\n");
} }
return suggestedWordsCount; return suggestedWordsCount;
} }
@ -97,23 +101,6 @@ void UnigramDictionary::initSuggestions(int *codes, int codesSize, unsigned shor
mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2; mMaxEditDistance = mInputLength < 5 ? 2 : mInputLength / 2;
} }
int UnigramDictionary::getSuggestionCandidates(int inputLength, int skipPos, int excessivePos,
int *nextLetters, int nextLettersSize) {
if (DEBUG_DICT) LOGI("getSuggestionCandidates");
int initialPos = 0;
if (IS_LATEST_DICT_VERSION) {
initialPos = DICTIONARY_HEADER_SIZE;
}
getWords(initialPos, inputLength, skipPos, excessivePos, nextLetters, nextLettersSize);
// Get the word count
int suggestedWordsCount = 0;
while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
suggestedWordsCount++;
}
return suggestedWordsCount;
}
void UnigramDictionary::registerNextLetter( void UnigramDictionary::registerNextLetter(
unsigned short c, int *nextLetters, int nextLettersSize) { unsigned short c, int *nextLetters, int nextLettersSize) {
if (c < nextLettersSize) { if (c < nextLettersSize) {
@ -121,12 +108,13 @@ void UnigramDictionary::registerNextLetter(
} }
} }
// TODO: We need to optimize addWord by using STL or something
bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) { bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency) {
word[length] = 0; word[length] = 0;
if (DEBUG_DICT) { if (DEBUG_DICT && DEBUG_SHOW_FOUND_WORD) {
char s[length + 1]; char s[length + 1];
for (int i = 0; i <= length; i++) s[i] = word[i]; for (int i = 0; i <= length; i++) s[i] = word[i];
if (DEBUG_SHOW_FOUND_WORD) LOGI("Found word = %s, freq = %d : \n", s, frequency); LOGI("Found word = %s, freq = %d", s, frequency);
} }
if (length > MAX_WORD_LENGTH) { if (length > MAX_WORD_LENGTH) {
if (DEBUG_DICT) LOGI("Exceeded max word length."); if (DEBUG_DICT) LOGI("Exceeded max word length.");
@ -146,7 +134,7 @@ bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency)
if (DEBUG_DICT) { if (DEBUG_DICT) {
char s[length + 1]; char s[length + 1];
for (int i = 0; i <= length; i++) s[i] = word[i]; for (int i = 0; i <= length; i++) s[i] = word[i];
LOGI("Added word = %s, freq = %d : \n", s, frequency); LOGI("Added word = %s, freq = %d", s, frequency);
} }
memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]), memmove((char*) mFrequencies + (insertAt + 1) * sizeof(mFrequencies[0]),
(char*) mFrequencies + insertAt * sizeof(mFrequencies[0]), (char*) mFrequencies + insertAt * sizeof(mFrequencies[0]),
@ -160,7 +148,7 @@ bool UnigramDictionary::addWord(unsigned short *word, int length, int frequency)
*dest++ = *word++; *dest++ = *word++;
} }
*dest = 0; // NULL terminate *dest = 0; // NULL terminate
if (DEBUG_DICT) LOGI("Added word at %d\n", insertAt); if (DEBUG_DICT) LOGI("Added word at %d", insertAt);
return true; return true;
} }
return false; return false;
@ -187,27 +175,19 @@ bool UnigramDictionary::sameAsTyped(unsigned short *word, int length) {
if ((unsigned int) *inputCodes != (unsigned int) *word) { if ((unsigned int) *inputCodes != (unsigned int) *word) {
return false; return false;
} }
inputCodes += MAX_ALTERNATIVES; inputCodes += MAX_PROXIMITY_CHARS;
word++; word++;
} }
return true; return true;
} }
static const char QUOTE = '\''; static const char QUOTE = '\'';
static const char SPACE = ' ';
// Keep this for comparing spec to new getWords void UnigramDictionary::getSuggestionCandidates(const int inputLength, const int skipPos,
void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,
const int excessivePos, int *nextLetters, const int nextLettersSize) { const int excessivePos, int *nextLetters, const int nextLettersSize) {
int initialPosition = initialPos; if (DEBUG_DICT) LOGI("getSuggestionCandidates");
const int count = Dictionary::getCount(DICT, &initialPosition); int rootPosition = ROOT_POS;
getWordsRec(count, initialPosition, 0,
min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH),
mInputLength <= 0, 1, 0, 0, skipPos, excessivePos, nextLetters, nextLettersSize);
}
void UnigramDictionary::getWords(const int rootPos, const int inputLength, const int skipPos,
const int excessivePos, int *nextLetters, const int nextLettersSize) {
int rootPosition = rootPos;
const int MAX_DEPTH = min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH); const int MAX_DEPTH = min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
// Get the number of child of root, then increment the position // Get the number of child of root, then increment the position
int childCount = Dictionary::getCount(DICT, &rootPosition); int childCount = Dictionary::getCount(DICT, &rootPosition);
@ -220,6 +200,7 @@ void UnigramDictionary::getWords(const int rootPos, const int inputLength, const
mStackDiffs[0] = 0; mStackDiffs[0] = 0;
mStackSiblingPos[0] = rootPosition; mStackSiblingPos[0] = rootPosition;
// Depth first search
while (depth >= 0) { while (depth >= 0) {
if (mStackChildCount[depth] > 0) { if (mStackChildCount[depth] > 0) {
--mStackChildCount[depth]; --mStackChildCount[depth];
@ -235,7 +216,7 @@ void UnigramDictionary::getWords(const int rootPos, const int inputLength, const
MAX_DEPTH, traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos, MAX_DEPTH, traverseAllNodes, snr, inputIndex, diffs, skipPos, excessivePos,
nextLetters, nextLettersSize, &childCount, &firstChildPos, &traverseAllNodes, nextLetters, nextLettersSize, &childCount, &firstChildPos, &traverseAllNodes,
&snr, &inputIndex, &diffs, &siblingPos); &snr, &inputIndex, &diffs, &siblingPos);
// Next sibling pos // Update next sibling pos
mStackSiblingPos[depth] = siblingPos; mStackSiblingPos[depth] = siblingPos;
if (needsToTraverseChildrenNodes) { if (needsToTraverseChildrenNodes) {
// Goes to child node // Goes to child node
@ -254,7 +235,48 @@ void UnigramDictionary::getWords(const int rootPos, const int inputLength, const
} }
} }
// snr : frequency? bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {
if (missingSpacePos <= 0 || missingSpacePos >= inputLength) return false;
const int firstFreq = getWordFreq(0, missingSpacePos);
const int secondFreq = getWordFreq(missingSpacePos, inputLength - missingSpacePos);
if (DEBUG_DICT) LOGI("First freq: %d, Second freq: %d", firstFreq, secondFreq);
if (firstFreq <= 0 || secondFreq <= 0) return false;
int pairFreq = (firstFreq + secondFreq) / 2;
for (int i = 0; i < inputLength; ++i) pairFreq *= TYPED_LETTER_MULTIPLIER;
const int newWordLength = inputLength + 1;
// Allocating variable length array on stack
unsigned short word[newWordLength];
int j = 0;
for (int i = 0; i < missingSpacePos; ++i) {
// Down-casting
if (DEBUG_DICT) {
assert((*(mInputCodes + i * MAX_PROXIMITY_CHARS)) <= U_SHORT_MAX);
}
word[i] = (unsigned short) *(mInputCodes + i * MAX_PROXIMITY_CHARS);
}
word[missingSpacePos] = SPACE;
for (int i = (missingSpacePos + 1); i < newWordLength; ++i) {
// Down-casting
if (DEBUG_DICT) {
assert((*(mInputCodes + (i - 1) * MAX_PROXIMITY_CHARS)) <= U_SHORT_MAX);
}
word[i] = (unsigned short) *(mInputCodes + (i - 1) * MAX_PROXIMITY_CHARS);
}
addWord(word, newWordLength, pairFreq);
return true;
}
// Legacy recursive entry point. Kept only so that its results can be compared
// against the newer getWords implementation.
//
// Reads the child count stored at initialPos (Dictionary::getCount also moves
// the position forward), then starts getWordsRec at depth 0 with snr 1, input
// index 0 and 0 diffs. The search depth is capped at
// min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH); every node is
// traversed when no input has been typed (mInputLength <= 0).
void UnigramDictionary::getWordsOld(const int initialPos, const int inputLength, const int skipPos,
        const int excessivePos, int *nextLetters, const int nextLettersSize) {
    int childrenPos = initialPos;
    const int childrenCount = Dictionary::getCount(DICT, &childrenPos);
    const int maxDepth = min(inputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
    const bool traverseAllNodes = mInputLength <= 0;
    getWordsRec(childrenCount, childrenPos, 0, maxDepth, traverseAllNodes,
            1, 0, 0, skipPos, excessivePos, nextLetters, nextLettersSize);
}
void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth, void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,
const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex, const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,
const int diffs, const int skipPos, const int excessivePos, int *nextLetters, const int diffs, const int skipPos, const int excessivePos, int *nextLetters,
@ -287,7 +309,7 @@ void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, cons
inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLength( inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLength(
unsigned short *word, const int inputLength, const int depth, const int snr, unsigned short *word, const int inputLength, const int depth, const int snr,
int *nextLetters, const int nextLettersSize, const int skipPos, const int freq) { int *nextLetters, const int nextLettersSize, const int skipPos, const int freq) {
addWord(word, depth + 1, freq * snr); if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, freq * snr);
if (depth >= inputLength && skipPos < 0) { if (depth >= inputLength && skipPos < 0) {
registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize); registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);
} }
@ -301,13 +323,13 @@ inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsSameAsInputLength(
// Proximity collection will promote a word of the same length as // Proximity collection will promote a word of the same length as
// what user typed. // what user typed.
if (skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER; if (skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;
addWord(word, depth + 1, finalFreq); if (depth >= MIN_SUGGEST_DEPTH) addWord(word, depth + 1, finalFreq);
} }
} }
inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c, inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,
const int inputIndex, const int skipPos, const int depth) { const int inputIndex, const int skipPos, const int depth) {
const unsigned short userTypedChar = (mInputCodes + (inputIndex * MAX_ALTERNATIVES))[0]; const unsigned short userTypedChar = (mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS))[0];
// Skip the ' or other letter and continue deeper // Skip the ' or other letter and continue deeper
return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth; return (c == QUOTE && userTypedChar != QUOTE) || skipPos == depth;
} }
@ -361,7 +383,7 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
*newDiffs = diffs; *newDiffs = diffs;
*newInputIndex = inputIndex; *newInputIndex = inputIndex;
} else { } else {
int *currentChars = mInputCodes + (inputIndex * MAX_ALTERNATIVES); int *currentChars = mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS);
int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos); int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos);
if (matchedProximityCharId < 0) return false; if (matchedProximityCharId < 0) return false;
mWord[depth] = c; mWord[depth] = c;
@ -396,4 +418,48 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
return needsToTraverseChildrenNodes; return needsToTraverseChildrenNodes;
} }
// Walks the dictionary from ROOT_POS, consuming inputLength typed characters
// starting at startInputIndex, one processCurrentNodeForExactMatch() step per
// character.
//
// Returns the frequency reported by the final step when that step flagged a
// terminal node, and 0 otherwise. Also returns 0 as soon as a step before the
// last character reports that the walk cannot go deeper.
//
// NOTE(review): when the final step returns false, the terminal flag and
// frequency are whatever the callee last stored in its out-parameters.
inline int UnigramDictionary::getWordFreq(const int startInputIndex, const int inputLength) {
    int nodePos = ROOT_POS;
    int siblingCount = Dictionary::getCount(DICT, &nodePos);
    int wordFreq = 0;
    bool isTerminal = false;
    const int lastOffset = inputLength - 1;
    for (int offset = 0; offset <= lastOffset; ++offset) {
        const bool canDescend = processCurrentNodeForExactMatch(nodePos, siblingCount,
                startInputIndex + offset, &nodePos, &siblingCount, &isTerminal, &wordFreq);
        if (canDescend) continue;
        // A dead end is only acceptable on the last character.
        if (offset < lastOffset) return 0;
    }
    return isTerminal ? wordFreq : 0;
}
// Scans `count` sibling nodes starting at firstChildPos for one whose character
// equals the primary typed character at inputIndex (compared against both the
// node character and its lower-cased form).
//
// firstChildPos    : position of the first sibling to examine.
// count            : number of siblings to examine.
// inputIndex       : index of the typed character; its primary code is the
//                    first entry of its MAX_PROXIMITY_CHARS-sized group.
// newChildPosition : out; on a match with children, the position just past the
//                    child count of the matched node.
// newCount         : out; written only on a match with children (child count).
// newTerminal      : out; terminal flag of the matched node, false on a miss.
// newFreq          : out; frequency of the matched node, 0 on a miss.
//
// Returns true only when a sibling matched and has children to descend into.
// Returns false when the matched node is a leaf (newTerminal/newFreq describe
// that node) or when nothing matched.
inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,
        const int count, const int inputIndex, int *newChildPosition, int *newCount,
        bool *newTerminal, int *newFreq) {
    const int *currentChars = mInputCodes + (inputIndex * MAX_PROXIMITY_CHARS);
    // The typed character does not change while scanning siblings.
    const unsigned int inputC = currentChars[0];
    int pos = firstChildPos;
    unsigned short c;
    for (int i = 0; i < count; ++i) {
        pos = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, pos, &c,
                newChildPosition, newTerminal, newFreq);
        const unsigned short lowerC = toLowerCase(c);
        const bool matched = (inputC == lowerC || inputC == c);
        if (!matched) continue;
        const bool hasChild = *newChildPosition != 0;
        if (!hasChild) return false;
        *newCount = Dictionary::getCount(DICT, newChildPosition);
        return true;
    }
    // No sibling matched. The out-parameters were passed to setDictionaryValues
    // for every sibling examined, so they may still describe an unrelated node;
    // clear them so a caller cannot mistake the miss for a matched terminal.
    *newTerminal = false;
    *newFreq = 0;
    return false;
}
} // namespace latinime } // namespace latinime

View file

@ -24,15 +24,15 @@ namespace latinime {
class UnigramDictionary { class UnigramDictionary {
public: public:
UnigramDictionary(const unsigned char *dict, int typedLetterMultipler, int fullWordMultiplier, UnigramDictionary(const unsigned char *dict, int typedLetterMultipler, int fullWordMultiplier,
int maxWordLength, int maxWords, int maxAlternatives, const bool isLatestDictVersion); int maxWordLength, int maxWords, int maxProximityChars, const bool isLatestDictVersion);
int getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies, int getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies,
int *nextLetters, int nextLettersSize); int *nextLetters, int nextLettersSize);
~UnigramDictionary(); ~UnigramDictionary();
private: private:
void initSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies); void initSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies);
int getSuggestionCandidates(int inputLength, int skipPos, int excessivePos, int *nextLetters, void getSuggestionCandidates(const int inputLength, const int skipPos, const int excessivePos,
int nextLettersSize); int *nextLetters, const int nextLettersSize);
void getVersionNumber(); void getVersionNumber();
bool checkIfDictVersionIsLatest(); bool checkIfDictVersionIsLatest();
int getAddress(int *pos); int getAddress(int *pos);
@ -44,8 +44,7 @@ private:
void getWordsRec(const int childrenCount, const int pos, const int depth, const int maxDepth, void getWordsRec(const int childrenCount, const int pos, const int depth, const int maxDepth,
const bool traverseAllNodes, const int snr, const int inputIndex, const int diffs, const bool traverseAllNodes, const int snr, const int inputIndex, const int diffs,
const int skipPos, const int excessivePos, int *nextLetters, const int nextLettersSize); const int skipPos, const int excessivePos, int *nextLetters, const int nextLettersSize);
void getWords(const int rootPos, const int inputLength, const int skipPos, bool getMissingSpaceWords(const int inputLength, const int missingSpacePos);
const int excessivePos, int *nextLetters, const int nextLettersSize);
// Keep getWordsOld for comparing performance between getWords and getWordsOld // Keep getWordsOld for comparing performance between getWords and getWordsOld
void getWordsOld(const int initialPos, const int inputLength, const int skipPos, void getWordsOld(const int initialPos, const int inputLength, const int skipPos,
const int excessivePos, int *nextLetters, const int nextLettersSize); const int excessivePos, int *nextLetters, const int nextLettersSize);
@ -58,17 +57,25 @@ private:
bool needsToSkipCurrentNode(const unsigned short c, bool needsToSkipCurrentNode(const unsigned short c,
const int inputIndex, const int skipPos, const int depth); const int inputIndex, const int skipPos, const int depth);
int getMatchedProximityId(const int *currentChars, const unsigned short c, const int skipPos); int getMatchedProximityId(const int *currentChars, const unsigned short c, const int skipPos);
// Process a node by considering proximity, missing and excessive character
bool processCurrentNode(const int pos, const int depth, bool processCurrentNode(const int pos, const int depth,
const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex, const int maxDepth, const bool traverseAllNodes, const int snr, int inputIndex,
const int diffs, const int skipPos, const int excessivePos, int *nextLetters, const int diffs, const int skipPos, const int excessivePos, int *nextLetters,
const int nextLettersSize, int *newCount, int *newChildPosition, const int nextLettersSize, int *newCount, int *newChildPosition,
bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs, bool *newTraverseAllNodes, int *newSnr, int*newInputIndex, int *newDiffs,
int *nextSiblingPosition); int *nextSiblingPosition);
int getWordFreq(const int startInputIndex, const int inputLength);
// Process a node by considering missing space
bool processCurrentNodeForExactMatch(const int firstChildPos, const int count,
const int inputIndex, int *newChildPosition, int *newCount, bool *newTerminal,
int *newFreq);
const unsigned char *DICT; const unsigned char *DICT;
const int MAX_WORDS; const int MAX_WORDS;
const int MAX_WORD_LENGTH; const int MAX_WORD_LENGTH;
const int MAX_ALTERNATIVES; const int MAX_PROXIMITY_CHARS;
const bool IS_LATEST_DICT_VERSION; const bool IS_LATEST_DICT_VERSION;
const int ROOT_POS;
const int TYPED_LETTER_MULTIPLIER; const int TYPED_LETTER_MULTIPLIER;
const int FULL_WORD_MULTIPLIER; const int FULL_WORD_MULTIPLIER;