diff --git a/native/src/defines.h b/native/src/defines.h
index aaaf3483d..a3edaab59 100644
--- a/native/src/defines.h
+++ b/native/src/defines.h
@@ -50,4 +50,6 @@
 #define SUGGEST_MISSING_CHARACTERS true
 #define SUGGEST_MISSING_CHARACTERS_THRESHOLD 5
 
+#define MAX_WORD_LENGTH_INTERNAL 64
+
 #endif // LATINIME_DEFINES_H
diff --git a/native/src/dictionary.cpp b/native/src/dictionary.cpp
index 05692f7ef..6936dc928 100644
--- a/native/src/dictionary.cpp
+++ b/native/src/dictionary.cpp
@@ -29,6 +29,9 @@ Dictionary::Dictionary(void *dict, int typedLetterMultiplier, int fullWordMultip
     // Checks whether it has the latest dictionary or the old dictionary
     IS_LATEST_DICT_VERSION((((unsigned char*) dict)[0] & 0xFF) >= DICTIONARY_VERSION_MIN)
 {
+    if (MAX_WORD_LENGTH_INTERNAL < maxWordLength) {
+        LOGI("Max word length (%d) is greater than %d", maxWordLength, MAX_WORD_LENGTH_INTERNAL);
+    }
     LOGI("IN NATIVE SUGGEST Version: %d \n", (DICT[0] & 0xFF));
     mUnigramDictionary = new UnigramDictionary(DICT, typedLetterMultiplier, fullWordMultiplier,
             maxWordLength, maxWords, maxAlternatives, IS_LATEST_DICT_VERSION);
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index d0c903e81..e4edc5ab6 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -102,7 +102,8 @@ int UnigramDictionary::getSuggestionCandidates(int inputLength, int skipPos,
     return suggestedWordsCount;
 }
 
-void UnigramDictionary::registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize) {
+void UnigramDictionary::registerNextLetter(
+        unsigned short c, int *nextLetters, int nextLettersSize) {
     if (c < nextLettersSize) {
         nextLetters[c]++;
     }
@@ -121,9 +122,8 @@ UnigramDictionary::addWord(unsigned short *word, int length, int frequency)
     // Find the right insertion point
     int insertAt = 0;
     while (insertAt < MAX_WORDS) {
-        if (frequency > mFrequencies[insertAt]
-                 || (mFrequencies[insertAt] == frequency
-                         && length < Dictionary::wideStrLen(mOutputChars + insertAt * MAX_WORD_LENGTH))) {
+        if (frequency > mFrequencies[insertAt] || (mFrequencies[insertAt] == frequency
+                && length < Dictionary::wideStrLen(mOutputChars + insertAt * MAX_WORD_LENGTH))) {
             break;
         }
         insertAt++;
@@ -134,9 +134,9 @@ UnigramDictionary::addWord(unsigned short *word, int length, int frequency)
                (MAX_WORDS - insertAt - 1) * sizeof(mFrequencies[0]));
         mFrequencies[insertAt] = frequency;
         memmove((char*) mOutputChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short),
-               (char*) mOutputChars + (insertAt ) * MAX_WORD_LENGTH * sizeof(short),
+               (char*) mOutputChars + insertAt * MAX_WORD_LENGTH * sizeof(short),
                (MAX_WORDS - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH);
-        unsigned short *dest = mOutputChars + (insertAt ) * MAX_WORD_LENGTH;
+        unsigned short *dest = mOutputChars + insertAt * MAX_WORD_LENGTH;
         while (length--) {
             *dest++ = *word++;
         }
@@ -177,8 +177,9 @@ UnigramDictionary::sameAsTyped(unsigned short *word, int length)
     return true;
 }
 
-static char QUOTE = '\'';
+static const char QUOTE = '\'';
 
+// snr : frequency?
 void
 UnigramDictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion, int snr,
         int inputIndex, int diffs, int skipPos, int *nextLetters, int nextLettersSize)
@@ -190,8 +191,10 @@ UnigramDictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion
     if (diffs > mMaxEditDistance) {
         return;
     }
+    // get the count of nodes and increment pos.
     int count = Dictionary::getCount(DICT, &pos);
     int *currentChars = NULL;
+    // Once inputIndex reaches mInputLength (mInputLength <= inputIndex), switch to completion.
     if (mInputLength <= inputIndex) {
         completion = true;
     } else {
@@ -205,8 +208,10 @@ UnigramDictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion
         unsigned short lowerC = toLowerCase(c);
         bool terminal = Dictionary::getTerminal(DICT, &pos);
         int childrenAddress = Dictionary::getAddress(DICT, &pos);
+        const bool needsToContinue = childrenAddress != 0;
         // -- after address or flag
         int freq = 1;
+        // If terminal, increment pos
         if (terminal) freq = Dictionary::getFreq(DICT, IS_LATEST_DICT_VERSION, &pos);
         // -- after add or freq
 
@@ -214,53 +219,74 @@ UnigramDictionary::getWordsRec(int pos, int depth, int maxDepth, bool completion
         if (completion) {
             mWord[depth] = c;
             if (terminal) {
-                addWord(mWord, depth + 1, freq * snr);
-                if (depth >= mInputLength && skipPos < 0) {
-                    registerNextLetter(mWord[mInputLength], nextLetters, nextLettersSize);
-                }
+                onTerminalWhenUserTypedLengthIsGreaterThanInputLength(mWord, mInputLength, depth,
+                        snr, nextLetters, nextLettersSize, skipPos, freq);
             }
-            if (childrenAddress != 0) {
-                getWordsRec(childrenAddress, depth + 1, maxDepth, completion, snr, inputIndex,
+            if (needsToContinue) {
+                // No need to do proximity suggest any more.
+                getWordsRec(childrenAddress, depth + 1, maxDepth, true, snr, inputIndex,
                         diffs, skipPos, nextLetters, nextLettersSize);
             }
         } else if ((c == QUOTE && currentChars[0] != QUOTE) || skipPos == depth) {
             // Skip the ' or other letter and continue deeper
             mWord[depth] = c;
-            if (childrenAddress != 0) {
-                getWordsRec(childrenAddress, depth + 1, maxDepth, false, snr, inputIndex, diffs,
-                        skipPos, nextLetters, nextLettersSize);
+            if (needsToContinue) {
+                getWordsRec(childrenAddress, depth + 1, maxDepth, false, snr, inputIndex,
+                        diffs, skipPos, nextLetters, nextLettersSize);
             }
         } else {
             int j = 0;
             while (currentChars[j] > 0) {
+                // Move to child node
                 if (currentChars[j] == lowerC || currentChars[j] == c) {
-                    int addedWeight = j == 0 ? TYPED_LETTER_MULTIPLIER : 1;
                     mWord[depth] = c;
-                    if (mInputLength == inputIndex + 1) {
+                    const int addedWeight = j == 0 ? TYPED_LETTER_MULTIPLIER : 1;
+                    const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1;
+                    // This match consumes the last typed char when isSameAsUserTypedLength is
+                    // true; deeper nodes are then visited in completion mode.
+                    if (isSameAsUserTypedLength) {
                         if (terminal) {
-                            if (//INCLUDE_TYPED_WORD_IF_VALID ||
-                                !sameAsTyped(mWord, depth + 1)) {
-                                int finalFreq = freq * snr * addedWeight;
-                                if (skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;
-                                addWord(mWord, depth + 1, finalFreq);
-                            }
+                            onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, depth, snr,
+                                    skipPos, freq, addedWeight);
                         }
-                        if (childrenAddress != 0) {
-                            getWordsRec(childrenAddress, depth + 1,
-                                    maxDepth, true, snr * addedWeight, inputIndex + 1,
-                                    diffs + (j > 0), skipPos, nextLetters, nextLettersSize);
-                        }
-                    } else if (childrenAddress != 0) {
+                    }
+                    if (needsToContinue) {
                         getWordsRec(childrenAddress, depth + 1, maxDepth,
-                                false, snr * addedWeight, inputIndex + 1, diffs + (j > 0),
-                                skipPos, nextLetters, nextLettersSize);
+                                isSameAsUserTypedLength, snr * addedWeight, inputIndex + 1,
+                                diffs + (j > 0), skipPos, nextLetters, nextLettersSize);
                     }
                 }
-                j++;
+                ++j;
+                // If skipPos is defined, do not try the other proximity chars:
+                // only the first char, which is what the user typed, is checked.
                 if (skipPos >= 0) break;
             }
         }
     }
 }
+// Handles a terminal node reached in completion mode: adds the word with frequency
+// freq * snr and, if depth >= inputLength and skipPos < 0, registers word[inputLength].
+inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsGreaterThanInputLength(
+        unsigned short *word, const int inputLength, const int depth, const int snr,
+        int *nextLetters, const int nextLettersSize, const int skipPos, const int freq) {
+    addWord(word, depth + 1, freq * snr);
+    if (depth >= inputLength && skipPos < 0) {
+        registerNextLetter(word[inputLength], nextLetters, nextLettersSize);
+    }
+}
+
+// Handles a terminal node matched on the last typed char: unless sameAsTyped(word, depth + 1)
+// is true, adds it with frequency freq * snr * addedWeight (boosted when skipPos < 0).
+inline void UnigramDictionary::onTerminalWhenUserTypedLengthIsSameAsInputLength(
+        unsigned short *word, const int depth, const int snr, const int skipPos, const int freq,
+        const int addedWeight) {
+    if (!sameAsTyped(word, depth + 1)) {
+        int finalFreq = freq * snr * addedWeight;
+        // Proximity collection will promote a word of the same length as
+        // what user typed.
+        if (skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;
+        addWord(word, depth + 1, finalFreq);
+    }
+}
 
 } // namespace latinime
diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h
index 557e54cb7..118d7dc29 100644
--- a/native/src/unigram_dictionary.h
+++ b/native/src/unigram_dictionary.h
@@ -42,10 +42,17 @@ private:
     bool sameAsTyped(unsigned short *word, int length);
     bool addWord(unsigned short *word, int length, int frequency);
     unsigned short toLowerCase(unsigned short c);
-    void getWordsRec(int pos, int depth, int maxDepth, bool completion, int frequency,
+    void getWordsRec(int pos, int depth, int maxDepth, bool completion, int snr,
             int inputIndex, int diffs, int skipPos, int *nextLetters, int nextLettersSize);
     void registerNextLetter(unsigned short c, int *nextLetters, int nextLettersSize);
 
+    void onTerminalWhenUserTypedLengthIsGreaterThanInputLength(unsigned short *word,
+            const int inputLength, const int depth, const int snr, int *nextLetters,
+            const int nextLettersSize, const int skipPos, const int freq);
+
+    void onTerminalWhenUserTypedLengthIsSameAsInputLength(unsigned short *word, const int depth,
+            const int snr, const int skipPos, const int freq, const int addedWeight);
+
     const unsigned char *DICT;
     const int MAX_WORDS;
     const int MAX_WORD_LENGTH;
@@ -58,7 +65,8 @@ private:
     unsigned short *mOutputChars;
     int *mInputCodes;
     int mInputLength;
-    unsigned short mWord[128];
+    // MAX_WORD_LENGTH_INTERNAL must be bigger than MAX_WORD_LENGTH
+    unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];
     int mMaxEditDistance;
 };
 