Break down getWordsRec

Change-Id: I4fef02c227fb858334dbe2eabf2762d5b6e1d919
main
satok 2010-12-06 17:38:58 +09:00
parent 4e24668a75
commit 48e432ceb8
3 changed files with 108 additions and 63 deletions

View File

@ -54,6 +54,10 @@ public:
static int getAddress(const unsigned char *dict, int *pos); static int getAddress(const unsigned char *dict, int *pos);
static int getFreq(const unsigned char *dict, const bool isLatestDictVersion, int *pos); static int getFreq(const unsigned char *dict, const bool isLatestDictVersion, int *pos);
static int wideStrLen(unsigned short *str); static int wideStrLen(unsigned short *str);
// Reads the node located at pos in dict and reports its contents through the out-parameters:
// c receives the node's character, childrenPosition the address read for its children,
// terminal whether the node is flagged as terminal, and freq the node's frequency (set to 1
// when the node is not terminal).
// returns next sibling's position
static int setDictionaryValues(const unsigned char *dict, const bool isLatestDictVersion,
const int pos, unsigned short *c, int *childrenPosition,
bool *terminal, int *freq);
private: private:
bool hasBigram(); bool hasBigram();
@ -127,5 +131,20 @@ inline int Dictionary::wideStrLen(unsigned short *str) {
return end - str; return end - str;
} }
// Decodes the node stored at |pos| in |dict| and hands its fields back through the
// out-parameters. Each getter advances the cursor, so the fields must be read in exactly this
// order: character, terminal flag, children address, then (terminal nodes only) frequency.
// Non-terminal nodes report a frequency of 1.
// The return value is the cursor after the node, i.e. the next sibling's position.
inline int Dictionary::setDictionaryValues(const unsigned char *dict,
        const bool isLatestDictVersion, const int pos, unsigned short *c, int *childrenPosition,
        bool *terminal, int *freq) {
    int cursor = pos;

    // Cursor is at the character.
    *c = Dictionary::getChar(dict, &cursor);

    // Cursor is at the flag/address.
    *terminal = Dictionary::getTerminal(dict, &cursor);
    *childrenPosition = Dictionary::getAddress(dict, &cursor);

    // Cursor is past the address or flag; a frequency follows only for terminal nodes.
    if (*terminal) {
        *freq = Dictionary::getFreq(dict, isLatestDictVersion, &cursor);
    } else {
        *freq = 1;
    }

    return cursor;
}
}; // namespace latinime }; // namespace latinime
#endif // LATINIME_DICTIONARY_H #endif // LATINIME_DICTIONARY_H

View File

@ -15,9 +15,9 @@
** limitations under the License. ** limitations under the License.
*/ */
#include <assert.h>
#include <stdio.h> #include <stdio.h>
#include <fcntl.h> #include <fcntl.h>
#include <sys/mman.h>
#include <string.h> #include <string.h>
#define LOG_TAG "LatinIME: unigram_dictionary.cpp" #define LOG_TAG "LatinIME: unigram_dictionary.cpp"
@ -185,66 +185,24 @@ void UnigramDictionary::getWords(const int initialPos, const int inputLength, co
void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth, void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,
const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex, const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,
const int diffs, const int skipPos, int *nextLetters, const int nextLettersSize) { const int diffs, const int skipPos, int *nextLetters, const int nextLettersSize) {
int position = pos; int siblingPos = pos;
// If inputIndex is greater than mInputLength, that means there are no proximity chars.
for (int i = 0; i < childrenCount; ++i) { for (int i = 0; i < childrenCount; ++i) {
// -- at char int newCount;
const unsigned short c = Dictionary::getChar(DICT, &position); int newChildPosition;
// -- at flag/add int newDepth;
const unsigned short lowerC = toLowerCase(c); bool newTraverseAllNodes;
const bool terminal = Dictionary::getTerminal(DICT, &position); int newSnr;
int childrenPosition = Dictionary::getAddress(DICT, &position); int newInputIndex;
int matchedProximityCharId = -1; int newDiffs;
const bool needsToTraverseNextNode = childrenPosition != 0; int newSiblingPos;
// -- after address or flag const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,
int freq = 1; traverseAllNodes, snr, inputIndex, diffs, skipPos, nextLetters, nextLettersSize,
// If terminal, increment pos &newCount, &newChildPosition, &newDepth, &newTraverseAllNodes, &newSnr,
if (terminal) freq = Dictionary::getFreq(DICT, IS_LATEST_DICT_VERSION, &position); &newInputIndex, &newDiffs, &newSiblingPos);
// -- after add or freq siblingPos = newSiblingPos;
bool newTraverseAllNodes = traverseAllNodes;
int newSnr = snr;
int newDiffs = diffs;
int newInputIndex = inputIndex;
const int newDepth = depth + 1;
// If we are only doing traverseAllNodes, no need to look at the typed characters. if (needsToTraverseChildrenNodes) {
if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) { getWordsRec(newCount, newChildPosition, newDepth, maxDepth, newTraverseAllNodes,
mWord[depth] = c;
if (traverseAllNodes && terminal) {
onTerminalWhenUserTypedLengthIsGreaterThanInputLength(mWord, mInputLength, depth,
snr, nextLetters, nextLettersSize, skipPos, freq);
}
} else {
int *currentChars = mInputCodes + (inputIndex * MAX_ALTERNATIVES);
matchedProximityCharId = getMatchedProximityId(currentChars, lowerC, c, skipPos);
if (matchedProximityCharId < 0) continue;
mWord[depth] = c;
// If inputIndex is greater than mInputLength, that means there is no
// proximity chars. So, we don't need to check proximity.
const int addedWeight = matchedProximityCharId == 0 ? TYPED_LETTER_MULTIPLIER : 1;
const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1;
if (isSameAsUserTypedLength && terminal) {
onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, depth, snr,
skipPos, freq, addedWeight);
}
if (!needsToTraverseNextNode) continue;
// Start traversing all nodes after the index exceeds the user typed length
newTraverseAllNodes = isSameAsUserTypedLength;
newSnr *= addedWeight;
newDiffs += (matchedProximityCharId > 0);
++newInputIndex;
}
// Optimization: Prune out words that are too long compared to how much was typed.
if (newDepth > maxDepth || newDiffs > mMaxEditDistance) {
continue;
}
if (mInputLength <= newInputIndex) {
newTraverseAllNodes = true;
}
if (needsToTraverseNextNode) {
// get the count of nodes and increment childAddress.
const int count = Dictionary::getCount(DICT, &childrenPosition);
getWordsRec(count, childrenPosition, newDepth, maxDepth, newTraverseAllNodes,
newSnr, newInputIndex, newDiffs, skipPos, nextLetters, nextLettersSize); newSnr, newInputIndex, newDiffs, skipPos, nextLetters, nextLettersSize);
} }
} }
@ -279,7 +237,8 @@ inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,
} }
inline int UnigramDictionary::getMatchedProximityId(const int *currentChars, inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,
const unsigned short lowerC, const unsigned short c, const int skipPos) { const unsigned short c, const int skipPos) {
const unsigned short lowerC = toLowerCase(c);
int j = 0; int j = 0;
while (currentChars[j] > 0) { while (currentChars[j] > 0) {
const bool matched = (currentChars[j] == lowerC || currentChars[j] == c); const bool matched = (currentChars[j] == lowerC || currentChars[j] == c);
@ -295,4 +254,68 @@ inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,
return -1; return -1;
} }
// Processes the single dictionary node at |pos| for the current traversal state and decides
// whether the caller should descend into that node's children.
//
// |nextSiblingPosition| is always filled in with the position following this node.
// The remaining outputs (|newCount|, |newChildPosition|, |newDepth|, |newTraverseAllNodes|,
// |newSnr|, |newInputIndex|, |newDiffs|) describe the state to use for the children; they are
// only guaranteed to be filled in when this function returns true.
//
// Returns true when the children of this node should be traversed, false when this branch is
// finished (no match, no children, or pruned).
inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,
        const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,
        const int diffs, const int skipPos, int *nextLetters, const int nextLettersSize,
        int *newCount, int *newChildPosition, int *newDepth, bool *newTraverseAllNodes,
        int *newSnr, int *newInputIndex, int *newDiffs, int *nextSiblingPosition) {
    unsigned short nodeChar;
    int childrenPos;
    bool isTerminal;
    int frequency;
    *nextSiblingPosition = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, pos,
            &nodeChar, &childrenPos, &isTerminal, &frequency);
    const bool needsToTraverseChildrenNodes = (childrenPos != 0);

    // When traversing all nodes (or skipping this node) the typed characters are not consulted.
    const bool ignoreTypedChar =
            traverseAllNodes || needsToSkipCurrentNode(nodeChar, inputIndex, skipPos, depth);

    if (!ignoreTypedChar) {
        // Compare this node's character against the candidates for the current input index.
        const int *candidates = mInputCodes + (inputIndex * MAX_ALTERNATIVES);
        const int proximityId = getMatchedProximityId(candidates, nodeChar, skipPos);
        if (proximityId < 0) {
            return false;
        }
        mWord[depth] = nodeChar;

        // A match with id 0 gets the typed-letter multiplier; other matches get no extra weight.
        const int addedWeight = (proximityId != 0) ? 1 : TYPED_LETTER_MULTIPLIER;
        const bool reachedTypedLength = (inputIndex + 1 == mInputLength);
        if (isTerminal && reachedTypedLength) {
            onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, depth, snr,
                    skipPos, frequency, addedWeight);
        }
        if (!needsToTraverseChildrenNodes) {
            return false;
        }
        // Start traversing all nodes once the index reaches the user typed length.
        *newTraverseAllNodes = reachedTypedLength;
        *newSnr = snr * addedWeight;
        *newDiffs = (proximityId > 0) ? diffs + 1 : diffs;
        *newInputIndex = inputIndex + 1;
        *newDepth = depth + 1;
    } else {
        mWord[depth] = nodeChar;
        if (traverseAllNodes && isTerminal) {
            onTerminalWhenUserTypedLengthIsGreaterThanInputLength(mWord, mInputLength, depth,
                    snr, nextLetters, nextLettersSize, skipPos, frequency);
        }
        if (!needsToTraverseChildrenNodes) {
            return false;
        }
        // Nothing was consumed from the input: carry the state over, one level deeper.
        *newTraverseAllNodes = traverseAllNodes;
        *newSnr = snr;
        *newDiffs = diffs;
        *newInputIndex = inputIndex;
        *newDepth = depth + 1;
    }

    // Optimization: prune words that are too long or too different from what was typed.
    if (maxDepth < *newDepth || mMaxEditDistance < *newDiffs) {
        return false;
    }

    // Once the input index reaches the input length there are no proximity chars left to
    // compare against, so every remaining node is traversed.
    if (*newInputIndex >= mInputLength) {
        *newTraverseAllNodes = true;
    }

    // Read the number of children; getCount also advances childrenPos to the first child.
    *newCount = Dictionary::getCount(DICT, &childrenPos);
    *newChildPosition = childrenPos;
    if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);
    return needsToTraverseChildrenNodes;
}
} // namespace latinime } // namespace latinime

View File

@ -54,9 +54,12 @@ private:
const int snr, const int skipPos, const int freq, const int addedWeight); const int snr, const int skipPos, const int freq, const int addedWeight);
bool needsToSkipCurrentNode(const unsigned short c, bool needsToSkipCurrentNode(const unsigned short c,
const int inputIndex, const int skipPos, const int depth); const int inputIndex, const int skipPos, const int depth);
int getMatchedProximityId(const int *currentChars, const unsigned short lowerC, int getMatchedProximityId(const int *currentChars, const unsigned short c, const int skipPos);
const unsigned short c, const int skipPos); bool processCurrentNode(const int pos, const int depth,
const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,
const int diffs, const int skipPos, int *nextLetters, const int nextLettersSize,
int *newCount, int *newChildPosition, int *newDepth, bool *newTraverseAllNodes,
int *newSnr, int*newInputIndex, int *newDiffs, int *nextSiblingPosition);
const unsigned char *DICT; const unsigned char *DICT;
const int MAX_WORDS; const int MAX_WORDS;
const int MAX_WORD_LENGTH; const int MAX_WORD_LENGTH;