Merge "Read shortcuts as strings in the dictionary."
commit
fb64d0cd03
|
@ -123,6 +123,7 @@ int BigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, i
|
|||
}
|
||||
pos = BinaryFormat::skipChildrenPosition(flags, pos);
|
||||
pos = BinaryFormat::skipFrequency(flags, pos);
|
||||
pos = BinaryFormat::skipShortcuts(root, flags, pos);
|
||||
int bigramFlags;
|
||||
int bigramCount = 0;
|
||||
do {
|
||||
|
|
|
@ -40,6 +40,9 @@ class BinaryFormat {
|
|||
// implementations. On this occasion, we made the magic number 32 bits long.
|
||||
const static uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;
|
||||
|
||||
const static int CHARACTER_ARRAY_TERMINATOR_SIZE = 1;
|
||||
const static int SHORTCUT_LIST_SIZE_SIZE = 2;
|
||||
|
||||
static int detectFormat(const uint8_t* const dict);
|
||||
static unsigned int getHeaderSize(const uint8_t* const dict);
|
||||
static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos);
|
||||
|
@ -47,9 +50,10 @@ class BinaryFormat {
|
|||
static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos);
|
||||
static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos);
|
||||
static int skipOtherCharacters(const uint8_t* const dict, const int pos);
|
||||
static int skipAttributes(const uint8_t* const dict, const int pos);
|
||||
static int skipChildrenPosition(const uint8_t flags, const int pos);
|
||||
static int skipFrequency(const uint8_t flags, const int pos);
|
||||
static int skipShortcuts(const uint8_t* const dict, const uint8_t flags, const int pos);
|
||||
static int skipBigrams(const uint8_t* const dict, const uint8_t flags, const int pos);
|
||||
static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos);
|
||||
static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags,
|
||||
const int pos);
|
||||
|
@ -157,12 +161,12 @@ static inline int attributeAddressSize(const uint8_t flags) {
|
|||
*/
|
||||
}
|
||||
|
||||
inline int BinaryFormat::skipAttributes(const uint8_t* const dict, const int pos) {
|
||||
static inline int skipExistingBigrams(const uint8_t* const dict, const int pos) {
|
||||
int currentPos = pos;
|
||||
uint8_t flags = getFlagsAndForwardPointer(dict, &currentPos);
|
||||
uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
|
||||
while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) {
|
||||
currentPos += attributeAddressSize(flags);
|
||||
flags = getFlagsAndForwardPointer(dict, &currentPos);
|
||||
flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
|
||||
}
|
||||
currentPos += attributeAddressSize(flags);
|
||||
return currentPos;
|
||||
|
@ -174,6 +178,10 @@ static inline int childrenAddressSize(const uint8_t flags) {
|
|||
/* See the note in attributeAddressSize. The same applies here */
|
||||
}
|
||||
|
||||
static inline int shortcutByteSize(const uint8_t* const dict, const int pos) {
|
||||
return ((int)(dict[pos] << 8)) + (dict[pos + 1]);
|
||||
}
|
||||
|
||||
inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) {
|
||||
return pos + childrenAddressSize(flags);
|
||||
}
|
||||
|
@ -182,16 +190,30 @@ inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
|
|||
return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
|
||||
}
|
||||
|
||||
inline int BinaryFormat::skipShortcuts(const uint8_t* const dict, const uint8_t flags,
|
||||
const int pos) {
|
||||
if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) {
|
||||
return pos + shortcutByteSize(dict, pos);
|
||||
} else {
|
||||
return pos;
|
||||
}
|
||||
}
|
||||
|
||||
inline int BinaryFormat::skipBigrams(const uint8_t* const dict, const uint8_t flags,
|
||||
const int pos) {
|
||||
if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
|
||||
return skipExistingBigrams(dict, pos);
|
||||
} else {
|
||||
return pos;
|
||||
}
|
||||
}
|
||||
|
||||
inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags,
|
||||
const int pos) {
|
||||
// This function skips all attributes: shortcuts and bigrams.
|
||||
int newPos = pos;
|
||||
if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) {
|
||||
newPos = skipAttributes(dict, newPos);
|
||||
}
|
||||
if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
|
||||
newPos = skipAttributes(dict, newPos);
|
||||
}
|
||||
newPos = skipShortcuts(dict, flags, newPos);
|
||||
newPos = skipBigrams(dict, flags, newPos);
|
||||
return newPos;
|
||||
}
|
||||
|
||||
|
|
|
@ -45,13 +45,19 @@ class TerminalAttributes {
|
|||
|
||||
// Gets the shortcut target itself as a uint16_t string. For parameters and return value
|
||||
// see BinaryFormat::getWordAtAddress.
|
||||
// TODO: make the output an uint32_t* to handle the whole unicode range.
|
||||
inline int getNextShortcutTarget(const int maxDepth, uint16_t* outWord) {
|
||||
const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer(mDict, &mPos);
|
||||
mHasNextShortcutTarget =
|
||||
0 != (shortcutFlags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT);
|
||||
int shortcutAddress =
|
||||
BinaryFormat::getAttributeAddressAndForwardPointer(mDict, shortcutFlags, &mPos);
|
||||
return BinaryFormat::getWordAtAddress(mDict, shortcutAddress, maxDepth, outWord);
|
||||
unsigned int i;
|
||||
for (i = 0; i < MAX_WORD_LENGTH_INTERNAL; ++i) {
|
||||
const int charCode = BinaryFormat::getCharCodeAndForwardPointer(mDict, &mPos);
|
||||
if (NOT_A_CHARACTER == charCode) break;
|
||||
outWord[i] = (uint16_t)charCode;
|
||||
}
|
||||
mPos += BinaryFormat::CHARACTER_ARRAY_TERMINATOR_SIZE;
|
||||
return i;
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -65,12 +71,10 @@ class TerminalAttributes {
|
|||
mDict(dict), mFlags(flags), mStartPos(pos) {
|
||||
}
|
||||
|
||||
inline bool isShortcutOnly() const {
|
||||
return 0 != (mFlags & UnigramDictionary::FLAG_IS_SHORTCUT_ONLY);
|
||||
}
|
||||
|
||||
inline ShortcutIterator getShortcutIterator() const {
|
||||
return ShortcutIterator(mDict, mStartPos, mFlags);
|
||||
// The size of the shortcuts is stored here so that the whole shortcut chunk can be
|
||||
// skipped quickly, so we ignore it.
|
||||
return ShortcutIterator(mDict, mStartPos + BinaryFormat::SHORTCUT_LIST_SIZE_SIZE, mFlags);
|
||||
}
|
||||
};
|
||||
} // namespace latinime
|
||||
|
|
|
@ -366,10 +366,9 @@ inline void UnigramDictionary::onTerminal(const int freq,
|
|||
WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
|
||||
const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength);
|
||||
if (finalFreq != NOT_A_FREQUENCY) {
|
||||
if (!terminalAttributes.isShortcutOnly()) {
|
||||
addWord(wordPointer, wordLength, finalFreq, masterQueue);
|
||||
}
|
||||
addWord(wordPointer, wordLength, finalFreq, masterQueue);
|
||||
|
||||
const int shortcutFreq = finalFreq > 0 ? finalFreq - 1 : 0;
|
||||
// Please note that the shortcut candidates will be added to the master queue only.
|
||||
TerminalAttributes::ShortcutIterator iterator =
|
||||
terminalAttributes.getShortcutIterator();
|
||||
|
@ -379,11 +378,12 @@ inline void UnigramDictionary::onTerminal(const int freq,
|
|||
// We need to either modulate the frequency of each shortcut according
|
||||
// to its own shortcut frequency or to make the queue
|
||||
// so that the insert order is protected inside the queue for words
|
||||
// with the same score.
|
||||
// with the same score. For the moment we use -1 to make sure the shortcut will
|
||||
// never be in front of the word.
|
||||
uint16_t shortcutTarget[MAX_WORD_LENGTH_INTERNAL];
|
||||
const int shortcutTargetStringLength = iterator.getNextShortcutTarget(
|
||||
MAX_WORD_LENGTH_INTERNAL, shortcutTarget);
|
||||
addWord(shortcutTarget, shortcutTargetStringLength, finalFreq, masterQueue);
|
||||
addWord(shortcutTarget, shortcutTargetStringLength, shortcutFreq, masterQueue);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -49,10 +49,6 @@ class UnigramDictionary {
|
|||
static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
|
||||
// Flag for bigram presence
|
||||
static const int FLAG_HAS_BIGRAMS = 0x04;
|
||||
// Flag for shortcut-only words. Some words are shortcut-only, which means they match when
|
||||
// the user types them but they don't pop in the suggestion strip, only the words they are
|
||||
// shortcuts for do.
|
||||
static const int FLAG_IS_SHORTCUT_ONLY = 0x02;
|
||||
|
||||
// Attribute (bigram/shortcut) related flags:
|
||||
// Flag for presence of more attributes
|
||||
|
|
Loading…
Reference in New Issue