parent
4e24668a75
commit
48e432ceb8
|
@ -54,6 +54,10 @@ public:
|
||||||
static int getAddress(const unsigned char *dict, int *pos);
|
static int getAddress(const unsigned char *dict, int *pos);
|
||||||
static int getFreq(const unsigned char *dict, const bool isLatestDictVersion, int *pos);
|
static int getFreq(const unsigned char *dict, const bool isLatestDictVersion, int *pos);
|
||||||
static int wideStrLen(unsigned short *str);
|
static int wideStrLen(unsigned short *str);
|
||||||
|
// returns next sibling's position
|
||||||
|
static int setDictionaryValues(const unsigned char *dict, const bool isLatestDictVersion,
|
||||||
|
const int pos, unsigned short *c, int *childrenPosition,
|
||||||
|
bool *terminal, int *freq);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool hasBigram();
|
bool hasBigram();
|
||||||
|
@ -127,5 +131,20 @@ inline int Dictionary::wideStrLen(unsigned short *str) {
|
||||||
return end - str;
|
return end - str;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Decodes the dictionary node that starts at `pos` in `dict` and hands its fields back through
// the out-parameters:
//   c                - the value read by getChar()
//   terminal         - the value read by getTerminal()
//   childrenPosition - the value read by getAddress()
//   freq             - the value read by getFreq() when *terminal is true, otherwise 1
// The three/four reads are performed in that order, each one advancing a local cursor.
// Returns the cursor after the last read, i.e. the next sibling's position.
inline int Dictionary::setDictionaryValues(const unsigned char *dict,
        const bool isLatestDictVersion, const int pos, unsigned short *c,int *childrenPosition,
        bool *terminal, int *freq) {
    int cursor = pos;

    // -- at char
    *c = Dictionary::getChar(dict, &cursor);

    // -- at flag/add
    *terminal = Dictionary::getTerminal(dict, &cursor);
    *childrenPosition = Dictionary::getAddress(dict, &cursor);

    // -- after address or flag
    // A frequency is only read for terminal nodes; non-terminal nodes report 1.
    if (*terminal) {
        *freq = Dictionary::getFreq(dict, isLatestDictVersion, &cursor);
    } else {
        *freq = 1;
    }

    // The cursor now sits on the next sibling.
    return cursor;
}
|
||||||
|
|
||||||
}; // namespace latinime
|
}; // namespace latinime
|
||||||
#endif // LATINIME_DICTIONARY_H
|
#endif // LATINIME_DICTIONARY_H
|
||||||
|
|
|
@ -15,9 +15,9 @@
|
||||||
** limitations under the License.
|
** limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include <assert.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <fcntl.h>
|
#include <fcntl.h>
|
||||||
#include <sys/mman.h>
|
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
|
#define LOG_TAG "LatinIME: unigram_dictionary.cpp"
|
||||||
|
@ -185,66 +185,24 @@ void UnigramDictionary::getWords(const int initialPos, const int inputLength, co
|
||||||
void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,
|
void UnigramDictionary::getWordsRec(const int childrenCount, const int pos, const int depth,
|
||||||
const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,
|
const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,
|
||||||
const int diffs, const int skipPos, int *nextLetters, const int nextLettersSize) {
|
const int diffs, const int skipPos, int *nextLetters, const int nextLettersSize) {
|
||||||
int position = pos;
|
int siblingPos = pos;
|
||||||
// If inputIndex is greater than mInputLength, that means there are no proximity chars.
|
|
||||||
for (int i = 0; i < childrenCount; ++i) {
|
for (int i = 0; i < childrenCount; ++i) {
|
||||||
// -- at char
|
int newCount;
|
||||||
const unsigned short c = Dictionary::getChar(DICT, &position);
|
int newChildPosition;
|
||||||
// -- at flag/add
|
int newDepth;
|
||||||
const unsigned short lowerC = toLowerCase(c);
|
bool newTraverseAllNodes;
|
||||||
const bool terminal = Dictionary::getTerminal(DICT, &position);
|
int newSnr;
|
||||||
int childrenPosition = Dictionary::getAddress(DICT, &position);
|
int newInputIndex;
|
||||||
int matchedProximityCharId = -1;
|
int newDiffs;
|
||||||
const bool needsToTraverseNextNode = childrenPosition != 0;
|
int newSiblingPos;
|
||||||
// -- after address or flag
|
const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, depth, maxDepth,
|
||||||
int freq = 1;
|
traverseAllNodes, snr, inputIndex, diffs, skipPos, nextLetters, nextLettersSize,
|
||||||
// If terminal, increment pos
|
&newCount, &newChildPosition, &newDepth, &newTraverseAllNodes, &newSnr,
|
||||||
if (terminal) freq = Dictionary::getFreq(DICT, IS_LATEST_DICT_VERSION, &position);
|
&newInputIndex, &newDiffs, &newSiblingPos);
|
||||||
// -- after add or freq
|
siblingPos = newSiblingPos;
|
||||||
bool newTraverseAllNodes = traverseAllNodes;
|
|
||||||
int newSnr = snr;
|
|
||||||
int newDiffs = diffs;
|
|
||||||
int newInputIndex = inputIndex;
|
|
||||||
const int newDepth = depth + 1;
|
|
||||||
|
|
||||||
// If we are only doing traverseAllNodes, no need to look at the typed characters.
|
if (needsToTraverseChildrenNodes) {
|
||||||
if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {
|
getWordsRec(newCount, newChildPosition, newDepth, maxDepth, newTraverseAllNodes,
|
||||||
mWord[depth] = c;
|
|
||||||
if (traverseAllNodes && terminal) {
|
|
||||||
onTerminalWhenUserTypedLengthIsGreaterThanInputLength(mWord, mInputLength, depth,
|
|
||||||
snr, nextLetters, nextLettersSize, skipPos, freq);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
int *currentChars = mInputCodes + (inputIndex * MAX_ALTERNATIVES);
|
|
||||||
matchedProximityCharId = getMatchedProximityId(currentChars, lowerC, c, skipPos);
|
|
||||||
if (matchedProximityCharId < 0) continue;
|
|
||||||
mWord[depth] = c;
|
|
||||||
// If inputIndex is greater than mInputLength, that means there is no
|
|
||||||
// proximity chars. So, we don't need to check proximity.
|
|
||||||
const int addedWeight = matchedProximityCharId == 0 ? TYPED_LETTER_MULTIPLIER : 1;
|
|
||||||
const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1;
|
|
||||||
if (isSameAsUserTypedLength && terminal) {
|
|
||||||
onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, depth, snr,
|
|
||||||
skipPos, freq, addedWeight);
|
|
||||||
}
|
|
||||||
if (!needsToTraverseNextNode) continue;
|
|
||||||
// Start traversing all nodes after the index exceeds the user typed length
|
|
||||||
newTraverseAllNodes = isSameAsUserTypedLength;
|
|
||||||
newSnr *= addedWeight;
|
|
||||||
newDiffs += (matchedProximityCharId > 0);
|
|
||||||
++newInputIndex;
|
|
||||||
}
|
|
||||||
// Optimization: Prune out words that are too long compared to how much was typed.
|
|
||||||
if (newDepth > maxDepth || newDiffs > mMaxEditDistance) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (mInputLength <= newInputIndex) {
|
|
||||||
newTraverseAllNodes = true;
|
|
||||||
}
|
|
||||||
if (needsToTraverseNextNode) {
|
|
||||||
// get the count of nodes and increment childAddress.
|
|
||||||
const int count = Dictionary::getCount(DICT, &childrenPosition);
|
|
||||||
getWordsRec(count, childrenPosition, newDepth, maxDepth, newTraverseAllNodes,
|
|
||||||
newSnr, newInputIndex, newDiffs, skipPos, nextLetters, nextLettersSize);
|
newSnr, newInputIndex, newDiffs, skipPos, nextLetters, nextLettersSize);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -279,7 +237,8 @@ inline bool UnigramDictionary::needsToSkipCurrentNode(const unsigned short c,
|
||||||
}
|
}
|
||||||
|
|
||||||
inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,
|
inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,
|
||||||
const unsigned short lowerC, const unsigned short c, const int skipPos) {
|
const unsigned short c, const int skipPos) {
|
||||||
|
const unsigned short lowerC = toLowerCase(c);
|
||||||
int j = 0;
|
int j = 0;
|
||||||
while (currentChars[j] > 0) {
|
while (currentChars[j] > 0) {
|
||||||
const bool matched = (currentChars[j] == lowerC || currentChars[j] == c);
|
const bool matched = (currentChars[j] == lowerC || currentChars[j] == c);
|
||||||
|
@ -295,4 +254,68 @@ inline int UnigramDictionary::getMatchedProximityId(const int *currentChars,
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth,
|
||||||
|
const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,
|
||||||
|
const int diffs, const int skipPos, int *nextLetters, const int nextLettersSize,
|
||||||
|
int *newCount, int *newChildPosition, int *newDepth, bool *newTraverseAllNodes,
|
||||||
|
int *newSnr, int*newInputIndex, int *newDiffs, int *nextSiblingPosition) {
|
||||||
|
unsigned short c;
|
||||||
|
int childPosition;
|
||||||
|
bool terminal;
|
||||||
|
int freq;
|
||||||
|
*nextSiblingPosition = Dictionary::setDictionaryValues(DICT, IS_LATEST_DICT_VERSION, pos, &c,
|
||||||
|
&childPosition, &terminal, &freq);
|
||||||
|
|
||||||
|
const bool needsToTraverseChildrenNodes = childPosition != 0;
|
||||||
|
|
||||||
|
// If we are only doing traverseAllNodes, no need to look at the typed characters.
|
||||||
|
if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {
|
||||||
|
mWord[depth] = c;
|
||||||
|
if (traverseAllNodes && terminal) {
|
||||||
|
onTerminalWhenUserTypedLengthIsGreaterThanInputLength(mWord, mInputLength, depth,
|
||||||
|
snr, nextLetters, nextLettersSize, skipPos, freq);
|
||||||
|
}
|
||||||
|
if (!needsToTraverseChildrenNodes) return false;
|
||||||
|
*newTraverseAllNodes = traverseAllNodes;
|
||||||
|
*newSnr = snr;
|
||||||
|
*newDiffs = diffs;
|
||||||
|
*newInputIndex = inputIndex;
|
||||||
|
*newDepth = depth + 1;
|
||||||
|
} else {
|
||||||
|
int *currentChars = mInputCodes + (inputIndex * MAX_ALTERNATIVES);
|
||||||
|
int matchedProximityCharId = getMatchedProximityId(currentChars, c, skipPos);
|
||||||
|
if (matchedProximityCharId < 0) return false;
|
||||||
|
mWord[depth] = c;
|
||||||
|
// If inputIndex is greater than mInputLength, that means there is no
|
||||||
|
// proximity chars. So, we don't need to check proximity.
|
||||||
|
const int addedWeight = matchedProximityCharId == 0 ? TYPED_LETTER_MULTIPLIER : 1;
|
||||||
|
const bool isSameAsUserTypedLength = mInputLength == inputIndex + 1;
|
||||||
|
if (isSameAsUserTypedLength && terminal) {
|
||||||
|
onTerminalWhenUserTypedLengthIsSameAsInputLength(mWord, depth, snr,
|
||||||
|
skipPos, freq, addedWeight);
|
||||||
|
}
|
||||||
|
if (!needsToTraverseChildrenNodes) return false;
|
||||||
|
// Start traversing all nodes after the index exceeds the user typed length
|
||||||
|
*newTraverseAllNodes = isSameAsUserTypedLength;
|
||||||
|
*newSnr = snr * addedWeight;
|
||||||
|
*newDiffs = diffs + (matchedProximityCharId > 0);
|
||||||
|
*newInputIndex = inputIndex + 1;
|
||||||
|
*newDepth = depth + 1;
|
||||||
|
}
|
||||||
|
// Optimization: Prune out words that are too long compared to how much was typed.
|
||||||
|
if (*newDepth > maxDepth || *newDiffs > mMaxEditDistance) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If inputIndex is greater than mInputLength, that means there are no proximity chars.
|
||||||
|
if (mInputLength <= *newInputIndex) {
|
||||||
|
*newTraverseAllNodes = true;
|
||||||
|
}
|
||||||
|
// get the count of nodes and increment childAddress.
|
||||||
|
*newCount = Dictionary::getCount(DICT, &childPosition);
|
||||||
|
*newChildPosition = childPosition;
|
||||||
|
if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);
|
||||||
|
return needsToTraverseChildrenNodes;
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
|
|
|
@ -54,9 +54,12 @@ private:
|
||||||
const int snr, const int skipPos, const int freq, const int addedWeight);
|
const int snr, const int skipPos, const int freq, const int addedWeight);
|
||||||
bool needsToSkipCurrentNode(const unsigned short c,
|
bool needsToSkipCurrentNode(const unsigned short c,
|
||||||
const int inputIndex, const int skipPos, const int depth);
|
const int inputIndex, const int skipPos, const int depth);
|
||||||
int getMatchedProximityId(const int *currentChars, const unsigned short lowerC,
|
int getMatchedProximityId(const int *currentChars, const unsigned short c, const int skipPos);
|
||||||
const unsigned short c, const int skipPos);
|
bool processCurrentNode(const int pos, const int depth,
|
||||||
|
const int maxDepth, const bool traverseAllNodes, const int snr, const int inputIndex,
|
||||||
|
const int diffs, const int skipPos, int *nextLetters, const int nextLettersSize,
|
||||||
|
int *newCount, int *newChildPosition, int *newDepth, bool *newTraverseAllNodes,
|
||||||
|
int *newSnr, int*newInputIndex, int *newDiffs, int *nextSiblingPosition);
|
||||||
const unsigned char *DICT;
|
const unsigned char *DICT;
|
||||||
const int MAX_WORDS;
|
const int MAX_WORDS;
|
||||||
const int MAX_WORD_LENGTH;
|
const int MAX_WORD_LENGTH;
|
||||||
|
|
Loading…
Reference in New Issue