From 48e432ceb830c1932bd4f52d5fd2780b94685bf7 Mon Sep 17 00:00:00 2001 From: satok Date: Mon, 6 Dec 2010 17:38:58 +0900 Subject: [PATCH] Breakdown getWordRec Change-Id: I4fef02c227fb858334dbe2eabf2762d5b6e1d919 --- native/src/dictionary.h | 19 ++++ native/src/unigram_dictionary.cpp | 143 +++++++++++++++++------------- native/src/unigram_dictionary.h | 9 +- 3 files changed, 108 insertions(+), 63 deletions(-) diff --git a/native/src/dictionary.h b/native/src/dictionary.h index 1cd517cfb..da876242d 100644 --- a/native/src/dictionary.h +++ b/native/src/dictionary.h @@ -54,6 +54,10 @@ public: static int getAddress(const unsigned char *dict, int *pos); static int getFreq(const unsigned char *dict, const bool isLatestDictVersion, int *pos); static int wideStrLen(unsigned short *str); + // returns next sibling's position + static int setDictionaryValues(const unsigned char *dict, const bool isLatestDictVersion, + const int pos, unsigned short *c, int *childrenPosition, + bool *terminal, int *freq); private: bool hasBigram(); @@ -127,5 +131,20 @@ inline int Dictionary::wideStrLen(unsigned short *str) { return end - str; } +inline int Dictionary::setDictionaryValues(const unsigned char *dict, + const bool isLatestDictVersion, const int pos, unsigned short *c,int *childrenPosition, + bool *terminal, int *freq) { + int position = pos; + // -- at char + *c = Dictionary::getChar(dict, &position); + // -- at flag/add + *terminal = Dictionary::getTerminal(dict, &position); + *childrenPosition = Dictionary::getAddress(dict, &position); + // -- after address or flag + *freq = (*terminal) ? Dictionary::getFreq(dict, isLatestDictVersion, &position) : 1; + // returns next sibling's position + return position; +} + }; // namespace latinime #endif // LATINIME_DICTIONARY_H diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index 8a9742bbf..fa4e29632 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -15,9 +15,9 @@ ** limitations under the License. */ +#include #include #include -#include #include #define LOG_TAG "LatinIME: unigram_dictionary.cpp" @@ -185,66 +185,24 @@ void UnigramDictionary::getWords(const int initialPos, const int inputLength, co void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth, const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex, const int diffs, const int skipPos, int *nextLetters, const int nextLettersSize) { - int position = pos; - // If inputIndex is greater than mInputLength, that means there are no proximity chars. + int siblingPos = pos; for (int i = 0; i < childrenCount; ++i) { - // -- at char - const unsigned short c = Dictionary::getChar(DICT, &position); - // -- at flag/add - const unsigned short lowerC = toLowerCase(c); - const bool terminal = Dictionary::getTerminal(DICT, &position); - int childrenPosition = Dictionary::getAddress(DICT, &position); - int matchedProximityCharId = -1; - const bool needsToTraverseNextNode = childrenPosition != 0; - // -- after address or flag - int freq = 1; - // If terminal, increment pos - if (terminal) freq = Dictionary::getFreq(DICT, IS_LATEST_DICT_VERSION, &position); - // -- after add or freq - bool newTraverseAllNodes = traverseAllNodes; - int newSnr = snr; - int newDiffs = diffs; - int newInputIndex = inputIndex; - const int newDepth = depth + 1; + int newCount; + int newChildPosition; + int newDepth; + bool newTraverseAllNodes; + int newSnr; + int newInputIndex; + int newDiffs; + int newSiblingPos; + const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth, + traverseAllNodes, snr, inputIndex, diffs, skipPos, nextLetters, nextLettersSize, + &newCount, &newChildPosition, &newDepth, &newTraverseAllNodes, &newSnr, + &newInputIndex, &newDiffs, &newSiblingPos); + siblingPos = newSiblingPos; - // If we are only doing traverseAllNodes, no need to look at the typed characters. - if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) { - mWord[depth] = c; - if (traverseAllNodes && terminal) { - onTerminalWhenUserTypedLengthIsGreaterThanInputLength(mWord, mInputLength, depth, - snr, nextLetters, nextLettersSize, skipPos, freq); - } - } else { - int *currentChars = mInputCodes + (inputIndex * MAX_ALTERNATIVES); - matchedProximityCharId = getMatchedProximityId(currentChars, lowerC, c, skipPos); - if (matchedProximityCharId < 0) continue; - mWord[depth] = c; - // If inputIndex is greater than mInputLength, that means there is no - // proximity chars. So, we don't need to check proximity. - const int addedWeight = matchedProximityCharId == 0 ? TYPED_LETTER_MULTIPLIER : 1; - const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1; - if (isSameAsUserTypedLength && terminal) { - onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, depth, snr, - skipPos, freq, addedWeight); - } - if (!needsToTraverseNextNode) continue; - // Start traversing all nodes after the index exceeds the user typed length - newTraverseAllNodes = isSameAsUserTypedLength; - newSnr *= addedWeight; - newDiffs += (matchedProximityCharId > 0); - ++newInputIndex; - } - // Optimization: Prune out words that are too long compared to how much was typed. - if (newDepth > maxDepth || newDiffs > mMaxEditDistance) { - continue; - } - if (mInputLength <= newInputIndex) { - newTraverseAllNodes = true; - } - if (needsToTraverseNextNode) { - // get the count of nodes and increment childAddress. - const int count = Dictionary::getCount(DICT, &childrenPosition); - getWordsRec(count, childrenPosition, newDepth, maxDepth, newTraverseAllNodes, + if (needsToTraverseChildrenNodes) { + getWordsRec(newCount, newChildPosition, newDepth, maxDepth, newTraverseAllNodes, newSnr, newInputIndex, newDiffs, skipPos, nextLetters, nextLettersSize); } } @@ -279,7 +237,8 @@ inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c, } inline int UnigramDictionary::getMatchedProximityId(const int *currentChars, - const unsigned short lowerC, const unsigned short c, const int skipPos) { + const unsigned short c, const int skipPos) { + const unsigned short lowerC = toLowerCase(c); int j = 0; while (currentChars[j] > 0) { const bool matched = (currentChars[j] == lowerC || currentChars[j] == c); @@ -295,4 +254,68 @@ inline int UnigramDictionary::getMatchedProximityId(const int *currentChars, return -1; } +inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth, + const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex, + const int diffs, const int skipPos, int *nextLetters, const int nextLettersSize, + int *newCount, int *newChildPosition, int *newDepth, bool *newTraverseAllNodes, + int *newSnr, int*newInputIndex, int *newDiffs, int *nextSiblingPosition) { + unsigned short c; + int childPosition; + bool terminal; + int freq; + *nextSiblingPosition = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, pos, &c, + &childPosition, &terminal, &freq); + + const bool needsToTraverseChildrenNodes = childPosition != 0; + + // If we are only doing traverseAllNodes, no need to look at the typed characters. + if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) { + mWord[depth] = c; + if (traverseAllNodes && terminal) { + onTerminalWhenUserTypedLengthIsGreaterThanInputLength(mWord, mInputLength, depth, + snr, nextLetters, nextLettersSize, skipPos, freq); + } + if (!needsToTraverseChildrenNodes) return false; + *newTraverseAllNodes = traverseAllNodes; + *newSnr = snr; + *newDiffs = diffs; + *newInputIndex = inputIndex; + *newDepth = depth + 1; + } else { + int *currentChars = mInputCodes + (inputIndex * MAX_ALTERNATIVES); + int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos); + if (matchedProximityCharId < 0) return false; + mWord[depth] = c; + // If inputIndex is greater than mInputLength, that means there is no + // proximity chars. So, we don't need to check proximity. + const int addedWeight = matchedProximityCharId == 0 ? TYPED_LETTER_MULTIPLIER : 1; + const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1; + if (isSameAsUserTypedLength && terminal) { + onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, depth, snr, + skipPos, freq, addedWeight); + } + if (!needsToTraverseChildrenNodes) return false; + // Start traversing all nodes after the index exceeds the user typed length + *newTraverseAllNodes = isSameAsUserTypedLength; + *newSnr = snr * addedWeight; + *newDiffs = diffs + (matchedProximityCharId > 0); + *newInputIndex = inputIndex + 1; + *newDepth = depth + 1; + } + // Optimization: Prune out words that are too long compared to how much was typed. + if (*newDepth > maxDepth || *newDiffs > mMaxEditDistance) { + return false; + } + + // If inputIndex is greater than mInputLength, that means there are no proximity chars. + if (mInputLength <= *newInputIndex) { + *newTraverseAllNodes = true; + } + // get the count of nodes and increment childAddress. + *newCount = Dictionary::getCount(DICT, &childPosition); + *newChildPosition = childPosition; + if (DEBUG_DICT) assert(needsToTraverseChildrenNodes); + return needsToTraverseChildrenNodes; +} + } // namespace latinime diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h index 733b80c79..c02d366de 100644 --- a/native/src/unigram_dictionary.h +++ b/native/src/unigram_dictionary.h @@ -54,9 +54,12 @@ private: const int snr, const int skipPos, const int freq, const int addedWeight); bool needsToSkipCurrentNode(const unsigned short c, const int inputIndex, const int skipPos, const int depth); - int getMatchedProximityId(const int *currentChars, const unsigned short lowerC, - const unsigned short c, const int skipPos); - + int getMatchedProximityId(const int *currentChars, const unsigned short c, const int skipPos); + bool processCurrentNode(const int pos, const int depth, + const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex, + const int diffs, const int skipPos, int *nextLetters, const int nextLettersSize, + int *newCount, int *newChildPosition, int *newDepth, bool *newTraverseAllNodes, + int *newSnr, int*newInputIndex, int *newDiffs, int *nextSiblingPosition); const unsigned char *DICT; const int MAX_WORDS; const int MAX_WORD_LENGTH;