Remove old dictionary format code

Change-Id: Ic4b9e069c9bd5c088769519f44d0a9ea45acb833

branch main
parent ccec49793c
commit db2c0919cf
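
In short, as read from the hunks below: the -DNEW_DICTIONARY_FORMAT define is dropped from Android.mk, and every NEW_DICTIONARY_FORMAT preprocessor guard in the UnigramDictionary implementation and header is collapsed. The old-format branches (DICT_ROOT starting at the very beginning of the stream, ROOT_POS derived from DICTIONARY_HEADER_SIZE, and the unsigned short* based isValidWord, getBigramPosition, getMostFrequentWordLike and processCurrentNode implementations) are deleted, leaving the new-format code paths compiled unconditionally.
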
@@ -8,9 +8,6 @@ LOCAL_CFLAGS += -Werror -Wall
 # To suppress compiler warnings for unused variables/functions used for debug features etc.
 LOCAL_CFLAGS += -Wno-unused-parameter -Wno-unused-function
 
-# Use the new dictionary format
-LOCAL_CFLAGS += -DNEW_DICTIONARY_FORMAT
-
 LOCAL_SRC_FILES := \
     jni/com_android_inputmethod_keyboard_ProximityInfo.cpp \
     jni/com_android_inputmethod_latin_BinaryDictionary.cpp \
@@ -24,9 +24,7 @@
 #include "dictionary.h"
 #include "unigram_dictionary.h"
 
-#ifdef NEW_DICTIONARY_FORMAT
 #include "binary_format.h"
-#endif // NEW_DICTIONARY_FORMAT
 
 namespace latinime {
 
@@ -39,20 +37,12 @@ const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =
 UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,
         int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
         const bool isLatestDictVersion)
-#ifndef NEW_DICTIONARY_FORMAT
-    : DICT_ROOT(streamStart),
-#else // NEW_DICTIONARY_FORMAT
     : DICT_ROOT(streamStart + NEW_DICTIONARY_HEADER_SIZE),
-#endif // NEW_DICTIONARY_FORMAT
       MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
       MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
       TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
-#ifndef NEW_DICTIONARY_FORMAT
-      ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0),
-#else // NEW_DICTIONARY_FORMAT
       // TODO : remove this variable.
       ROOT_POS(0),
-#endif // NEW_DICTIONARY_FORMAT
       BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(int)),
       MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) {
     if (DEBUG_DICT) {
@@ -656,243 +646,6 @@ bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength,
     return true;
 }
 
-#ifndef NEW_DICTIONARY_FORMAT
-inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
-        const int inputLength, unsigned short *word) {
-    int pos = ROOT_POS;
-    int count = Dictionary::getCount(DICT_ROOT, &pos);
-    int maxFreq = 0;
-    int depth = 0;
-    unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];
-    bool terminal = false;
-
-    mStackChildCount[0] = count;
-    mStackSiblingPos[0] = pos;
-
-    while (depth >= 0) {
-        if (mStackChildCount[depth] > 0) {
-            --mStackChildCount[depth];
-            int firstChildPos;
-            int newFreq;
-            int siblingPos = mStackSiblingPos[depth];
-            const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,
-                    startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,
-                    &siblingPos);
-            mStackSiblingPos[depth] = siblingPos;
-            if (depth == (inputLength - 1)) {
-                // Traverse sibling node
-                if (terminal) {
-                    if (newFreq > maxFreq) {
-                        for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];
-                        if (DEBUG_DICT && DEBUG_NODE) {
-#ifdef FLAG_DBG
-                            char s[inputLength + 1];
-                            for (int i = 0; i < inputLength; ++i) s[i] = word[i];
-                            s[inputLength] = 0;
-                            LOGI("New missing space word found: %d > %d (%s), %d, %d",
-                                    newFreq, maxFreq, s, inputLength, depth);
-#endif
-                        }
-                        maxFreq = newFreq;
-                    }
-                }
-            } else if (needsToTraverseChildrenNodes) {
-                // Traverse children nodes
-                ++depth;
-                mStackChildCount[depth] = count;
-                mStackSiblingPos[depth] = firstChildPos;
-            }
-        } else {
-            // Traverse parent node
-            --depth;
-        }
-    }
-
-    word[inputLength] = 0;
-    return maxFreq;
-}
-
-inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,
-        const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,
-        int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {
-    const int inputIndex = startInputIndex + depth;
-    unsigned short c;
-    *siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos,
-            &c, newChildPosition, newTerminal, newFreq);
-    const unsigned int inputC = mProximityInfo->getPrimaryCharAt(inputIndex);
-    if (DEBUG_DICT) {
-        assert(inputC <= U_SHORT_MAX);
-    }
-    const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c);
-    const bool matched = (inputC == baseLowerC || inputC == c);
-    const bool hasChild = *newChildPosition != 0;
-    if (matched) {
-        word[depth] = c;
-        if (DEBUG_DICT && DEBUG_NODE) {
-            LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);
-            if (*newTerminal) {
-                LOGI("Terminal %d", *newFreq);
-            }
-        }
-        if (hasChild) {
-            *newCount = Dictionary::getCount(DICT_ROOT, newChildPosition);
-            return true;
-        } else {
-            return false;
-        }
-    } else {
-        // If this node is not user typed character, this method treats this word as unmatched.
-        // Thus newTerminal shouldn't be true.
-        *newTerminal = false;
-        return false;
-    }
-}
-
-// TODO: use uint32_t instead of unsigned short
-bool UnigramDictionary::isValidWord(unsigned short *word, int length) {
-    if (IS_LATEST_DICT_VERSION) {
-        return (getBigramPosition(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);
-    } else {
-        return (getBigramPosition(0, word, 0, length) != NOT_VALID_WORD);
-    }
-}
-
-
-// Require strict exact match.
-int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,
-        int length) const {
-    // returns address of bigram data of that word
-    // return -99 if not found
-
-    int count = Dictionary::getCount(DICT_ROOT, &pos);
-    unsigned short currentChar = (unsigned short) word[offset];
-    for (int j = 0; j < count; j++) {
-        unsigned short c = Dictionary::getChar(DICT_ROOT, &pos);
-        int terminal = Dictionary::getTerminal(DICT_ROOT, &pos);
-        int childPos = Dictionary::getAddress(DICT_ROOT, &pos);
-        if (c == currentChar) {
-            if (offset == length - 1) {
-                if (terminal) {
-                    return (pos+1);
-                }
-            } else {
-                if (childPos != 0) {
-                    int t = getBigramPosition(childPos, word, offset + 1, length);
-                    if (t > 0) {
-                        return t;
-                    }
-                }
-            }
-        }
-        if (terminal) {
-            Dictionary::getFreq(DICT_ROOT, IS_LATEST_DICT_VERSION, &pos);
-        }
-        // There could be two instances of each alphabet - upper and lower case. So continue
-        // looking ...
-    }
-    return NOT_VALID_WORD;
-}
-
-// The following functions will be modified.
-inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,
-        const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,
-        const int initialDiffs, int *nextLetters, const int nextLettersSize,
-        CorrectionState *correctionState, int *newCount, int *newChildPosition,
-        bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,
-        int *nextSiblingPosition, int *nextOutputIndex) {
-    const int skipPos = correctionState->getSkipPos();
-    const int excessivePos = correctionState->getExcessivePos();
-    const int transposedPos = correctionState->getTransposedPos();
-    if (DEBUG_DICT) {
-        int inputCount = 0;
-        if (skipPos >= 0) ++inputCount;
-        if (excessivePos >= 0) ++inputCount;
-        if (transposedPos >= 0) ++inputCount;
-        assert(inputCount <= 1);
-    }
-    unsigned short c;
-    int childPosition;
-    bool terminal;
-    int freq;
-    bool isSameAsUserTypedLength = false;
-
-    const int pos = initialPos;
-    const int depth = initialDepth;
-    const int traverseAllNodes = initialTraverseAllNodes;
-    const int diffs = initialDiffs;
-
-    const uint8_t flags = 0; // No flags for now
-
-    if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;
-
-    *nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos,
-            &c, &childPosition, &terminal, &freq);
-    *nextOutputIndex = depth + 1;
-
-    const bool needsToTraverseChildrenNodes = childPosition != 0;
-
-    // If we are only doing traverseAllNodes, no need to look at the typed characters.
-    if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {
-        mWord[depth] = c;
-        if (traverseAllNodes && terminal) {
-            onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight,
-                    freq, false, nextLetters, nextLettersSize, mCorrectionState);
-        }
-        if (!needsToTraverseChildrenNodes) return false;
-        *newTraverseAllNodes = traverseAllNodes;
-        *newMatchRate = matchWeight;
-        *newDiffs = diffs;
-        *newInputIndex = inputIndex;
-    } else {
-        int inputIndexForProximity = inputIndex;
-
-        if (transposedPos >= 0) {
-            if (inputIndex == transposedPos) ++inputIndexForProximity;
-            if (inputIndex == (transposedPos + 1)) --inputIndexForProximity;
-        }
-
-        ProximityInfo::ProximityType matchedProximityCharId = mProximityInfo->getMatchedProximityId(
-                inputIndexForProximity, c, mCorrectionState);
-        if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) return false;
-        mWord[depth] = c;
-        // If inputIndex is greater than mInputLength, that means there is no
-        // proximity chars. So, we don't need to check proximity.
-        if (ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
-            multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);
-        }
-        bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
-                || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
-        if (isSameAsUserTypedLength && terminal) {
-            onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight,
-                    freq, true, nextLetters, nextLettersSize, mCorrectionState);
-        }
-        if (!needsToTraverseChildrenNodes) return false;
-        // Start traversing all nodes after the index exceeds the user typed length
-        *newTraverseAllNodes = isSameAsUserTypedLength;
-        *newMatchRate = matchWeight;
-        *newDiffs = diffs
-                + ((ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
-        *newInputIndex = inputIndex + 1;
-    }
-    // Optimization: Prune out words that are too long compared to how much was typed.
-    if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {
-        return false;
-    }
-
-    // If inputIndex is greater than mInputLength, that means there are no proximity chars.
-    // TODO: Check if this can be isSameAsUserTypedLength only.
-    if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) {
-        *newTraverseAllNodes = true;
-    }
-    // get the count of nodes and increment childAddress.
-    *newCount = Dictionary::getCount(DICT_ROOT, &childPosition);
-    *newChildPosition = childPosition;
-    if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);
-    return needsToTraverseChildrenNodes;
-}
-
-#else // NEW_DICTIONARY_FORMAT
-
 // Wrapper for getMostFrequentWordLikeInner, which matches it to the previous
 // interface.
 inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
@@ -1245,6 +998,4 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in
     return true;
 }
 
-#endif // NEW_DICTIONARY_FORMAT
-
 } // namespace latinime
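
The bulk of the deletion above is the old-format lookup in the UnigramDictionary implementation: isValidWord delegated to getBigramPosition, which walked the character-group trie recursively, required an exact character match at every depth, and kept scanning siblings because upper- and lower-case variants can coexist in a group. A condensed, self-contained model of that removed traversal is sketched below; Node, lookupExact and isValidWordOldFormat are illustrative stand-ins for the byte-oriented Dictionary::getCount/getChar/getTerminal/getAddress readers, not identifiers from the tree.

#include <cstdint>
#include <vector>

// Simplified in-memory model of the old-format trie walk removed above. The
// real code reads character groups out of a flat byte array through
// Dictionary::getCount/getChar/getTerminal/getAddress; a small Node tree
// stands in for that layout here.
struct Node {
    uint16_t ch;                 // character stored in this group entry
    bool terminal;               // true if a word may end on this node
    std::vector<Node> children;  // child character group (possibly empty)
};

static const int NOT_VALID_WORD = -99;

// Mirrors the removed getBigramPosition() contract: a positive value when the
// word ends on a terminal node, NOT_VALID_WORD otherwise.
static int lookupExact(const std::vector<Node>& group, const uint16_t* word,
        int offset, int length) {
    for (const Node& node : group) {
        if (node.ch == word[offset]) {        // strict exact match, as in the old code
            if (offset == length - 1) {
                if (node.terminal) return 1;  // the real code returns pos + 1 here
            } else if (!node.children.empty()) {
                const int t = lookupExact(node.children, word, offset + 1, length);
                if (t > 0) return t;
            }
        }
        // Upper- and lower-case variants of a letter may both be present in a
        // group, so keep scanning the remaining siblings either way.
    }
    return NOT_VALID_WORD;
}

// Equivalent of the removed old-format isValidWord().
static bool isValidWordOldFormat(const std::vector<Node>& root,
        const uint16_t* word, int length) {
    return lookupExact(root, word, 0, length) != NOT_VALID_WORD;
}

The surviving code keeps only the const uint16_t* based isValidWord declared in the header hunks that follow.
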
@@ -31,7 +31,6 @@ namespace latinime {
 class UnigramDictionary {
 
 public:
-#ifdef NEW_DICTIONARY_FORMAT
 
     // Mask and flags for children address type selection.
     static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
@@ -63,16 +62,11 @@ public:
     static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
     static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
     static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
-#endif // NEW_DICTIONARY_FORMAT
 
     UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
             int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
             const bool isLatestDictVersion);
-#ifndef NEW_DICTIONARY_FORMAT
-    bool isValidWord(unsigned short *word, int length);
-#else // NEW_DICTIONARY_FORMAT
     bool isValidWord(const uint16_t* const inWord, const int length) const;
-#endif // NEW_DICTIONARY_FORMAT
     int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
     int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
             const int *ycoordinates, const int *codes, const int codesSize, const int flags,
@@ -117,15 +111,8 @@ private:
             int *nextSiblingPosition, int *nextOutputIndex);
     int getMostFrequentWordLike(const int startInputIndex, const int inputLength,
             unsigned short *word);
-#ifndef NEW_DICTIONARY_FORMAT
-    // Process a node by considering missing space
-    bool processCurrentNodeForExactMatch(const int firstChildPos,
-            const int startInputIndex, const int depth, unsigned short *word,
-            int *newChildPosition, int *newCount, bool *newTerminal, int *newFreq, int *siblingPos);
-#else // NEW_DICTIONARY_FORMAT
     int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length,
             short unsigned int* outWord);
-#endif // NEW_DICTIONARY_FORMAT
 
     const uint8_t* const DICT_ROOT;
     const int MAX_WORD_LENGTH;