Remove old dictionary format code

Change-Id: Ic4b9e069c9bd5c088769519f44d0a9ea45acb833

Branch: main
Parent: ccec49793c
Commit: db2c0919cf
@@ -8,9 +8,6 @@ LOCAL_CFLAGS += -Werror -Wall
# To suppress compiler warnings for unused variables/functions used for debug features etc.
LOCAL_CFLAGS += -Wno-unused-parameter -Wno-unused-function

# Use the new dictionary format
LOCAL_CFLAGS += -DNEW_DICTIONARY_FORMAT

LOCAL_SRC_FILES := \
    jni/com_android_inputmethod_keyboard_ProximityInfo.cpp \
    jni/com_android_inputmethod_latin_BinaryDictionary.cpp \
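For context, a minimal sketch (not taken from this repository) of what the -DNEW_DICTIONARY_FORMAT define controlled: a compile-time switch between two code paths. With the define removed from the makefile, the #ifdef branches in the sources below collapse to the new-format path only.

#include <cstdio>

#ifdef NEW_DICTIONARY_FORMAT
static const char* dictionaryFormat() { return "new (header + node array)"; }
#else
static const char* dictionaryFormat() { return "old (raw node array)"; }
#endif

int main() {
    // Which branch was compiled in depends solely on the -D flag above.
    std::printf("Built for the %s dictionary format\n", dictionaryFormat());
    return 0;
}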
@@ -24,9 +24,6 @@
#include "dictionary.h"
#include "unigram_dictionary.h"

#ifdef NEW_DICTIONARY_FORMAT
#include "binary_format.h"
#endif // NEW_DICTIONARY_FORMAT

namespace latinime {

@@ -39,20 +37,12 @@ const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =
UnigramDictionary::UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultiplier,
        int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
        const bool isLatestDictVersion)
#ifndef NEW_DICTIONARY_FORMAT
    : DICT_ROOT(streamStart),
#else // NEW_DICTIONARY_FORMAT
    : DICT_ROOT(streamStart + NEW_DICTIONARY_HEADER_SIZE),
#endif // NEW_DICTIONARY_FORMAT
      MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
      MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
      TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
#ifndef NEW_DICTIONARY_FORMAT
      ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0),
#else // NEW_DICTIONARY_FORMAT
      // TODO : remove this variable.
      ROOT_POS(0),
#endif // NEW_DICTIONARY_FORMAT
      BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(int)),
      MAX_UMLAUT_SEARCH_DEPTH(DEFAULT_MAX_UMLAUT_SEARCH_DEPTH) {
    if (DEBUG_DICT) {
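A hedged sketch of what the DICT_ROOT initializer above implies: in the new format a fixed-size file header precedes the root node array, so the root pointer is offset from the start of the stream. The real NEW_DICTIONARY_HEADER_SIZE lives in the format headers; the value below is illustrative only.

#include <cstdint>
#include <cstdio>

static const int NEW_DICTIONARY_HEADER_SIZE = 2;  // illustrative, not the real value

struct DictView {
    const uint8_t* const streamStart;  // start of the mmapped dictionary file
    const uint8_t* const dictRoot;     // first node group, just past the header

    explicit DictView(const uint8_t* const start)
            : streamStart(start), dictRoot(start + NEW_DICTIONARY_HEADER_SIZE) {}
};

int main() {
    static const uint8_t buffer[16] = {0};
    DictView view(buffer);
    std::printf("root is %d bytes past the stream start\n",
            static_cast<int>(view.dictRoot - view.streamStart));
    return 0;
}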
@@ -656,243 +646,6 @@ bool UnigramDictionary::getSplitTwoWordsSuggestion(const int inputLength,
    return true;
}

#ifndef NEW_DICTIONARY_FORMAT
inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
        const int inputLength, unsigned short *word) {
    int pos = ROOT_POS;
    int count = Dictionary::getCount(DICT_ROOT, &pos);
    int maxFreq = 0;
    int depth = 0;
    unsigned short newWord[MAX_WORD_LENGTH_INTERNAL];
    bool terminal = false;

    mStackChildCount[0] = count;
    mStackSiblingPos[0] = pos;

    while (depth >= 0) {
        if (mStackChildCount[depth] > 0) {
            --mStackChildCount[depth];
            int firstChildPos;
            int newFreq;
            int siblingPos = mStackSiblingPos[depth];
            const bool needsToTraverseChildrenNodes = processCurrentNodeForExactMatch(siblingPos,
                    startInputIndex, depth, newWord, &firstChildPos, &count, &terminal, &newFreq,
                    &siblingPos);
            mStackSiblingPos[depth] = siblingPos;
            if (depth == (inputLength - 1)) {
                // Traverse sibling node
                if (terminal) {
                    if (newFreq > maxFreq) {
                        for (int i = 0; i < inputLength; ++i) word[i] = newWord[i];
                        if (DEBUG_DICT && DEBUG_NODE) {
#ifdef FLAG_DBG
                            char s[inputLength + 1];
                            for (int i = 0; i < inputLength; ++i) s[i] = word[i];
                            s[inputLength] = 0;
                            LOGI("New missing space word found: %d > %d (%s), %d, %d",
                                    newFreq, maxFreq, s, inputLength, depth);
#endif
                        }
                        maxFreq = newFreq;
                    }
                }
            } else if (needsToTraverseChildrenNodes) {
                // Traverse children nodes
                ++depth;
                mStackChildCount[depth] = count;
                mStackSiblingPos[depth] = firstChildPos;
            }
        } else {
            // Traverse parent node
            --depth;
        }
    }

    word[inputLength] = 0;
    return maxFreq;
}

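The removed getMostFrequentWordLike above walks the trie iteratively rather than recursively, using per-depth stacks (mStackChildCount, mStackSiblingPos) to remember how many siblings remain and where the next one starts. A minimal, self-contained sketch of that traversal pattern, using assumed toy structures rather than the binary dictionary layout:

#include <cstdio>
#include <vector>

struct Node {
    char label;
    std::vector<int> children;  // indices into the node pool
};

static void depthFirstWalk(const std::vector<Node>& pool, int rootIndex) {
    std::vector<int> childCountStack;  // children left to visit at each depth
    std::vector<int> nextChildStack;   // which child to visit next at each depth
    std::vector<int> nodeStack;        // node index at each depth

    childCountStack.push_back(static_cast<int>(pool[rootIndex].children.size()));
    nextChildStack.push_back(0);
    nodeStack.push_back(rootIndex);

    while (!nodeStack.empty()) {
        const int depth = static_cast<int>(nodeStack.size()) - 1;
        if (childCountStack[depth] > 0) {
            --childCountStack[depth];
            const int child = pool[nodeStack[depth]].children[nextChildStack[depth]++];
            std::printf("visit %c at depth %d\n", pool[child].label, depth + 1);
            // Descend: push the child's bookkeeping onto the stacks.
            childCountStack.push_back(static_cast<int>(pool[child].children.size()));
            nextChildStack.push_back(0);
            nodeStack.push_back(child);
        } else {
            // No children left at this depth: backtrack to the parent.
            childCountStack.pop_back();
            nextChildStack.pop_back();
            nodeStack.pop_back();
        }
    }
}

int main() {
    // root 'a' with children 'b' and 'c'; 'b' has child 'd'.
    std::vector<Node> pool = {{'a', {1, 2}}, {'b', {3}}, {'c', {}}, {'d', {}}};
    depthFirstWalk(pool, 0);
    return 0;
}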
inline bool UnigramDictionary::processCurrentNodeForExactMatch(const int firstChildPos,
        const int startInputIndex, const int depth, unsigned short *word, int *newChildPosition,
        int *newCount, bool *newTerminal, int *newFreq, int *siblingPos) {
    const int inputIndex = startInputIndex + depth;
    unsigned short c;
    *siblingPos = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, firstChildPos,
            &c, newChildPosition, newTerminal, newFreq);
    const unsigned int inputC = mProximityInfo->getPrimaryCharAt(inputIndex);
    if (DEBUG_DICT) {
        assert(inputC <= U_SHORT_MAX);
    }
    const unsigned short baseLowerC = Dictionary::toBaseLowerCase(c);
    const bool matched = (inputC == baseLowerC || inputC == c);
    const bool hasChild = *newChildPosition != 0;
    if (matched) {
        word[depth] = c;
        if (DEBUG_DICT && DEBUG_NODE) {
            LOGI("Node(%c, %c)<%d>, %d, %d", inputC, c, matched, hasChild, *newFreq);
            if (*newTerminal) {
                LOGI("Terminal %d", *newFreq);
            }
        }
        if (hasChild) {
            *newCount = Dictionary::getCount(DICT_ROOT, newChildPosition);
            return true;
        } else {
            return false;
        }
    } else {
        // If this node is not user typed character, this method treats this word as unmatched.
        // Thus newTerminal shouldn't be true.
        *newTerminal = false;
        return false;
    }
}

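A small illustration (assumptions noted in the comments) of the match test in the removed processCurrentNodeForExactMatch: a typed character is accepted if it equals the node's character exactly or equals its base-lowercase form, so lowercase input still matches capitalized dictionary entries.

#include <cassert>
#include <cctype>

// Simplified stand-in for Dictionary::toBaseLowerCase: the real routine also
// strips accents via a lookup table; here we only lowercase ASCII.
static unsigned short toBaseLowerCaseAsciiOnly(unsigned short c) {
    return c < 128 ? static_cast<unsigned short>(std::tolower(c)) : c;
}

static bool matchesNodeChar(unsigned short inputC, unsigned short nodeC) {
    return inputC == nodeC || inputC == toBaseLowerCaseAsciiOnly(nodeC);
}

int main() {
    assert(matchesNodeChar('a', 'A'));   // case-folded match
    assert(matchesNodeChar('a', 'a'));   // exact match
    assert(!matchesNodeChar('b', 'A'));  // unrelated character
    return 0;
}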
// TODO: use uint32_t instead of unsigned short
bool UnigramDictionary::isValidWord(unsigned short *word, int length) {
    if (IS_LATEST_DICT_VERSION) {
        return (getBigramPosition(DICTIONARY_HEADER_SIZE, word, 0, length) != NOT_VALID_WORD);
    } else {
        return (getBigramPosition(0, word, 0, length) != NOT_VALID_WORD);
    }
}


// Require strict exact match.
int UnigramDictionary::getBigramPosition(int pos, unsigned short *word, int offset,
        int length) const {
    // returns address of bigram data of that word
    // return -99 if not found

    int count = Dictionary::getCount(DICT_ROOT, &pos);
    unsigned short currentChar = (unsigned short) word[offset];
    for (int j = 0; j < count; j++) {
        unsigned short c = Dictionary::getChar(DICT_ROOT, &pos);
        int terminal = Dictionary::getTerminal(DICT_ROOT, &pos);
        int childPos = Dictionary::getAddress(DICT_ROOT, &pos);
        if (c == currentChar) {
            if (offset == length - 1) {
                if (terminal) {
                    return (pos+1);
                }
            } else {
                if (childPos != 0) {
                    int t = getBigramPosition(childPos, word, offset + 1, length);
                    if (t > 0) {
                        return t;
                    }
                }
            }
        }
        if (terminal) {
            Dictionary::getFreq(DICT_ROOT, IS_LATEST_DICT_VERSION, &pos);
        }
        // There could be two instances of each alphabet - upper and lower case. So continue
        // looking ...
    }
    return NOT_VALID_WORD;
}

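The lookup shape of the removed getBigramPosition, sketched with assumed in-memory structures instead of the serialized byte array: scan the sibling character groups at each level, match on the current character, accept at a terminal node once the word is consumed, and otherwise recurse into the children. The continued scan after a match mirrors the comment above about upper- and lower-case instances.

#include <cstdio>
#include <string>
#include <vector>

struct TrieNode {
    char c;
    bool terminal;
    std::vector<TrieNode> children;
};

static bool lookup(const std::vector<TrieNode>& siblings, const std::string& word, size_t offset) {
    for (const TrieNode& node : siblings) {        // linear scan of siblings, like the byte-array walk
        if (node.c != word[offset]) continue;
        if (offset == word.size() - 1) {
            if (node.terminal) return true;        // whole word consumed at a terminal node
        } else if (!node.children.empty()) {
            if (lookup(node.children, word, offset + 1)) return true;
        }
        // Keep scanning: another sibling may carry the same character.
    }
    return false;
}

int main() {
    // 't' -> { 'o' (terminal), 'e' -> 'a' (terminal) }
    std::vector<TrieNode> root = {
        {'t', false, {{'o', true, {}}, {'e', false, {{'a', true, {}}}}}},
    };
    std::printf("to:  %d\n", lookup(root, "to", 0));   // 1
    std::printf("tea: %d\n", lookup(root, "tea", 0));  // 1
    std::printf("ten: %d\n", lookup(root, "ten", 0));  // 0
    return 0;
}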
// The following functions will be modified.
inline bool UnigramDictionary::processCurrentNode(const int initialPos, const int initialDepth,
        const int maxDepth, const bool initialTraverseAllNodes, int matchWeight, int inputIndex,
        const int initialDiffs, int *nextLetters, const int nextLettersSize,
        CorrectionState *correctionState, int *newCount, int *newChildPosition,
        bool *newTraverseAllNodes, int *newMatchRate, int *newInputIndex, int *newDiffs,
        int *nextSiblingPosition, int *nextOutputIndex) {
    const int skipPos = correctionState->getSkipPos();
    const int excessivePos = correctionState->getExcessivePos();
    const int transposedPos = correctionState->getTransposedPos();
    if (DEBUG_DICT) {
        int inputCount = 0;
        if (skipPos >= 0) ++inputCount;
        if (excessivePos >= 0) ++inputCount;
        if (transposedPos >= 0) ++inputCount;
        assert(inputCount <= 1);
    }
    unsigned short c;
    int childPosition;
    bool terminal;
    int freq;
    bool isSameAsUserTypedLength = false;

    const int pos = initialPos;
    const int depth = initialDepth;
    const int traverseAllNodes = initialTraverseAllNodes;
    const int diffs = initialDiffs;

    const uint8_t flags = 0; // No flags for now

    if (excessivePos == depth && inputIndex < mInputLength - 1) ++inputIndex;

    *nextSiblingPosition = Dictionary::setDictionaryValues(DICT_ROOT, IS_LATEST_DICT_VERSION, pos,
            &c, &childPosition, &terminal, &freq);
    *nextOutputIndex = depth + 1;

    const bool needsToTraverseChildrenNodes = childPosition != 0;

    // If we are only doing traverseAllNodes, no need to look at the typed characters.
    if (traverseAllNodes || needsToSkipCurrentNode(c, inputIndex, skipPos, depth)) {
        mWord[depth] = c;
        if (traverseAllNodes && terminal) {
            onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight,
                    freq, false, nextLetters, nextLettersSize, mCorrectionState);
        }
        if (!needsToTraverseChildrenNodes) return false;
        *newTraverseAllNodes = traverseAllNodes;
        *newMatchRate = matchWeight;
        *newDiffs = diffs;
        *newInputIndex = inputIndex;
    } else {
        int inputIndexForProximity = inputIndex;

        if (transposedPos >= 0) {
            if (inputIndex == transposedPos) ++inputIndexForProximity;
            if (inputIndex == (transposedPos + 1)) --inputIndexForProximity;
        }

        ProximityInfo::ProximityType matchedProximityCharId = mProximityInfo->getMatchedProximityId(
                inputIndexForProximity, c, mCorrectionState);
        if (ProximityInfo::UNRELATED_CHAR == matchedProximityCharId) return false;
        mWord[depth] = c;
        // If inputIndex is greater than mInputLength, that means there is no
        // proximity chars. So, we don't need to check proximity.
        if (ProximityInfo::SAME_OR_ACCENTED_OR_CAPITALIZED_CHAR == matchedProximityCharId) {
            multiplyIntCapped(TYPED_LETTER_MULTIPLIER, &matchWeight);
        }
        bool isSameAsUserTypedLength = mInputLength == inputIndex + 1
                || (excessivePos == mInputLength - 1 && inputIndex == mInputLength - 2);
        if (isSameAsUserTypedLength && terminal) {
            onTerminal(mWord, depth, DICT_ROOT, flags, pos, inputIndex, matchWeight,
                    freq, true, nextLetters, nextLettersSize, mCorrectionState);
        }
        if (!needsToTraverseChildrenNodes) return false;
        // Start traversing all nodes after the index exceeds the user typed length
        *newTraverseAllNodes = isSameAsUserTypedLength;
        *newMatchRate = matchWeight;
        *newDiffs = diffs
                + ((ProximityInfo::NEAR_PROXIMITY_CHAR == matchedProximityCharId) ? 1 : 0);
        *newInputIndex = inputIndex + 1;
    }
    // Optimization: Prune out words that are too long compared to how much was typed.
    if (depth >= maxDepth || *newDiffs > mMaxEditDistance) {
        return false;
    }

    // If inputIndex is greater than mInputLength, that means there are no proximity chars.
    // TODO: Check if this can be isSameAsUserTypedLength only.
    if (isSameAsUserTypedLength || mInputLength <= *newInputIndex) {
        *newTraverseAllNodes = true;
    }
    // get the count of nodes and increment childAddress.
    *newCount = Dictionary::getCount(DICT_ROOT, &childPosition);
    *newChildPosition = childPosition;
    if (DEBUG_DICT) assert(needsToTraverseChildrenNodes);
    return needsToTraverseChildrenNodes;
}

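A hedged sketch of the weighting and pruning idea in the removed processCurrentNode (constants and names below are illustrative, not the project's values): an exact or case/accent-folded hit multiplies the running match weight, a neighboring-key hit spends edit-distance budget, and the branch is abandoned once the accumulated differences exceed that budget.

#include <cstdio>

enum ProximityType { SAME_CHAR, NEAR_PROXIMITY_CHAR, UNRELATED_CHAR };

static const int TYPED_LETTER_MULTIPLIER = 2;  // illustrative weight
static const int MAX_EDIT_DISTANCE = 2;        // illustrative budget

// Returns false when the candidate branch should be pruned.
static bool scoreStep(ProximityType match, int* matchWeight, int* diffs) {
    if (match == UNRELATED_CHAR) return false;           // dead branch in the trie
    if (match == SAME_CHAR) *matchWeight *= TYPED_LETTER_MULTIPLIER;
    if (match == NEAR_PROXIMITY_CHAR) ++*diffs;           // spend edit-distance budget
    return *diffs <= MAX_EDIT_DISTANCE;
}

int main() {
    int weight = 1, diffs = 0;
    const ProximityType typedVsNode[] = {SAME_CHAR, NEAR_PROXIMITY_CHAR, SAME_CHAR};
    for (ProximityType t : typedVsNode) {
        if (!scoreStep(t, &weight, &diffs)) { std::printf("pruned\n"); return 0; }
    }
    std::printf("weight=%d diffs=%d\n", weight, diffs);  // weight=4 diffs=1
    return 0;
}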
#else // NEW_DICTIONARY_FORMAT

// Wrapper for getMostFrequentWordLikeInner, which matches it to the previous
// interface.
inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
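The wrapper mentioned above adapts the new-format implementation, which operates on const uint16_t*, to the previous unsigned short* interface. A minimal sketch of that adapter idea with stand-in bodies (none of this is the project's actual code):

#include <cstdint>
#include <cstdio>
#include <vector>

// Stand-in for the new-format inner routine: takes uint16_t code points.
static int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length,
        uint16_t* outWord) {
    for (int i = 0; i < length; ++i) outWord[i] = inWord[i];  // echo the input
    return length;  // pretend frequency
}

// Wrapper keeping the previous unsigned short* interface for existing callers.
static int getMostFrequentWordLike(const unsigned short* word, const int length,
        unsigned short* outWord) {
    std::vector<uint16_t> in(word, word + length);
    std::vector<uint16_t> out(length);
    const int freq = getMostFrequentWordLikeInner(in.data(), length, out.data());
    for (int i = 0; i < length; ++i) outWord[i] = out[i];
    return freq;
}

int main() {
    const unsigned short in[] = {'t', 'o'};
    unsigned short out[2];
    const int freq = getMostFrequentWordLike(in, 2, out);
    std::printf("freq=%d out[0]=%c\n", freq, out[0]);
    return 0;
}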
@@ -1245,6 +998,4 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, const in
    return true;
}

#endif // NEW_DICTIONARY_FORMAT

} // namespace latinime
@@ -31,7 +31,6 @@ namespace latinime {
class UnigramDictionary {

public:
#ifdef NEW_DICTIONARY_FORMAT

    // Mask and flags for children address type selection.
    static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
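A hedged sketch of how a group-address mask like the one above is typically applied: the top two bits of a node group's flags byte say how many bytes encode the children address. The per-value meanings below are assumptions for illustration; the authoritative constants live in the new-format headers (binary_format.h).

#include <cstdint>
#include <cstdio>

static const uint8_t MASK_GROUP_ADDRESS_TYPE = 0xC0;

static int childrenAddressSize(uint8_t flags) {
    switch (flags & MASK_GROUP_ADDRESS_TYPE) {
        case 0x00: return 0;  // no children (assumed meaning)
        case 0x40: return 1;  // one-byte address (assumed meaning)
        case 0x80: return 2;  // two-byte address (assumed meaning)
        default:   return 3;  // three-byte address, 0xC0 (assumed meaning)
    }
}

int main() {
    std::printf("%d %d\n", childrenAddressSize(0x47), childrenAddressSize(0xC1));  // 1 3
    return 0;
}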
@@ -63,16 +62,11 @@ public:
    static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
    static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
    static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
#endif // NEW_DICTIONARY_FORMAT

    UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
            int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
            const bool isLatestDictVersion);
#ifndef NEW_DICTIONARY_FORMAT
    bool isValidWord(unsigned short *word, int length);
#else // NEW_DICTIONARY_FORMAT
    bool isValidWord(const uint16_t* const inWord, const int length) const;
#endif // NEW_DICTIONARY_FORMAT
    int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
    int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
            const int *ycoordinates, const int *codes, const int codesSize, const int flags,
@@ -117,15 +111,8 @@ private:
            int *nextSiblingPosition, int *nextOutputIndex);
    int getMostFrequentWordLike(const int startInputIndex, const int inputLength,
            unsigned short *word);
#ifndef NEW_DICTIONARY_FORMAT
    // Process a node by considering missing space
    bool processCurrentNodeForExactMatch(const int firstChildPos,
            const int startInputIndex, const int depth, unsigned short *word,
            int *newChildPosition, int *newCount, bool *newTerminal, int *newFreq, int *siblingPos);
#else // NEW_DICTIONARY_FORMAT
    int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length,
            short unsigned int* outWord);
#endif // NEW_DICTIONARY_FORMAT

    const uint8_t* const DICT_ROOT;
    const int MAX_WORD_LENGTH;