Merge "Read shortcuts as strings in the dictionary."
commit
fb64d0cd03
|
@ -123,6 +123,7 @@ int BigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, i
|
||||||
}
|
}
|
||||||
pos = BinaryFormat::skipChildrenPosition(flags, pos);
|
pos = BinaryFormat::skipChildrenPosition(flags, pos);
|
||||||
pos = BinaryFormat::skipFrequency(flags, pos);
|
pos = BinaryFormat::skipFrequency(flags, pos);
|
||||||
|
pos = BinaryFormat::skipShortcuts(root, flags, pos);
|
||||||
int bigramFlags;
|
int bigramFlags;
|
||||||
int bigramCount = 0;
|
int bigramCount = 0;
|
||||||
do {
|
do {
|
||||||
|
|
|
@ -40,6 +40,9 @@ class BinaryFormat {
|
||||||
// implementations. On this occasion, we made the magic number 32 bits long.
|
// implementations. On this occasion, we made the magic number 32 bits long.
|
||||||
const static uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;
|
const static uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;
|
||||||
|
|
||||||
|
const static int CHARACTER_ARRAY_TERMINATOR_SIZE = 1;
|
||||||
|
const static int SHORTCUT_LIST_SIZE_SIZE = 2;
|
||||||
|
|
||||||
static int detectFormat(const uint8_t* const dict);
|
static int detectFormat(const uint8_t* const dict);
|
||||||
static unsigned int getHeaderSize(const uint8_t* const dict);
|
static unsigned int getHeaderSize(const uint8_t* const dict);
|
||||||
static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos);
|
static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos);
|
||||||
|
@ -47,9 +50,10 @@ class BinaryFormat {
|
||||||
static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos);
|
static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos);
|
||||||
static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos);
|
static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos);
|
||||||
static int skipOtherCharacters(const uint8_t* const dict, const int pos);
|
static int skipOtherCharacters(const uint8_t* const dict, const int pos);
|
||||||
static int skipAttributes(const uint8_t* const dict, const int pos);
|
|
||||||
static int skipChildrenPosition(const uint8_t flags, const int pos);
|
static int skipChildrenPosition(const uint8_t flags, const int pos);
|
||||||
static int skipFrequency(const uint8_t flags, const int pos);
|
static int skipFrequency(const uint8_t flags, const int pos);
|
||||||
|
static int skipShortcuts(const uint8_t* const dict, const uint8_t flags, const int pos);
|
||||||
|
static int skipBigrams(const uint8_t* const dict, const uint8_t flags, const int pos);
|
||||||
static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos);
|
static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos);
|
||||||
static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags,
|
static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags,
|
||||||
const int pos);
|
const int pos);
|
||||||
|
@ -157,12 +161,12 @@ static inline int attributeAddressSize(const uint8_t flags) {
|
||||||
*/
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
inline int BinaryFormat::skipAttributes(const uint8_t* const dict, const int pos) {
|
static inline int skipExistingBigrams(const uint8_t* const dict, const int pos) {
|
||||||
int currentPos = pos;
|
int currentPos = pos;
|
||||||
uint8_t flags = getFlagsAndForwardPointer(dict, &currentPos);
|
uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
|
||||||
while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) {
|
while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) {
|
||||||
currentPos += attributeAddressSize(flags);
|
currentPos += attributeAddressSize(flags);
|
||||||
flags = getFlagsAndForwardPointer(dict, &currentPos);
|
flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
|
||||||
}
|
}
|
||||||
currentPos += attributeAddressSize(flags);
|
currentPos += attributeAddressSize(flags);
|
||||||
return currentPos;
|
return currentPos;
|
||||||
|
@ -174,6 +178,10 @@ static inline int childrenAddressSize(const uint8_t flags) {
|
||||||
/* See the note in attributeAddressSize. The same applies here */
|
/* See the note in attributeAddressSize. The same applies here */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline int shortcutByteSize(const uint8_t* const dict, const int pos) {
|
||||||
|
return ((int)(dict[pos] << 8)) + (dict[pos + 1]);
|
||||||
|
}
|
||||||
|
|
||||||
inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) {
|
inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) {
|
||||||
return pos + childrenAddressSize(flags);
|
return pos + childrenAddressSize(flags);
|
||||||
}
|
}
|
||||||
|
@ -182,16 +190,30 @@ inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
|
||||||
return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
|
return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline int BinaryFormat::skipShortcuts(const uint8_t* const dict, const uint8_t flags,
|
||||||
|
const int pos) {
|
||||||
|
if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) {
|
||||||
|
return pos + shortcutByteSize(dict, pos);
|
||||||
|
} else {
|
||||||
|
return pos;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline int BinaryFormat::skipBigrams(const uint8_t* const dict, const uint8_t flags,
|
||||||
|
const int pos) {
|
||||||
|
if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
|
||||||
|
return skipExistingBigrams(dict, pos);
|
||||||
|
} else {
|
||||||
|
return pos;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags,
|
inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags,
|
||||||
const int pos) {
|
const int pos) {
|
||||||
// This function skips all attributes: shortcuts and bigrams.
|
// This function skips all attributes: shortcuts and bigrams.
|
||||||
int newPos = pos;
|
int newPos = pos;
|
||||||
if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) {
|
newPos = skipShortcuts(dict, flags, newPos);
|
||||||
newPos = skipAttributes(dict, newPos);
|
newPos = skipBigrams(dict, flags, newPos);
|
||||||
}
|
|
||||||
if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
|
|
||||||
newPos = skipAttributes(dict, newPos);
|
|
||||||
}
|
|
||||||
return newPos;
|
return newPos;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -45,13 +45,19 @@ class TerminalAttributes {
|
||||||
|
|
||||||
// Gets the shortcut target itself as a uint16_t string. For parameters and return value
|
// Gets the shortcut target itself as a uint16_t string. For parameters and return value
|
||||||
// see BinaryFormat::getWordAtAddress.
|
// see BinaryFormat::getWordAtAddress.
|
||||||
|
// TODO: make the output an uint32_t* to handle the whole unicode range.
|
||||||
inline int getNextShortcutTarget(const int maxDepth, uint16_t* outWord) {
|
inline int getNextShortcutTarget(const int maxDepth, uint16_t* outWord) {
|
||||||
const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer(mDict, &mPos);
|
const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer(mDict, &mPos);
|
||||||
mHasNextShortcutTarget =
|
mHasNextShortcutTarget =
|
||||||
0 != (shortcutFlags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT);
|
0 != (shortcutFlags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT);
|
||||||
int shortcutAddress =
|
unsigned int i;
|
||||||
BinaryFormat::getAttributeAddressAndForwardPointer(mDict, shortcutFlags, &mPos);
|
for (i = 0; i < MAX_WORD_LENGTH_INTERNAL; ++i) {
|
||||||
return BinaryFormat::getWordAtAddress(mDict, shortcutAddress, maxDepth, outWord);
|
const int charCode = BinaryFormat::getCharCodeAndForwardPointer(mDict, &mPos);
|
||||||
|
if (NOT_A_CHARACTER == charCode) break;
|
||||||
|
outWord[i] = (uint16_t)charCode;
|
||||||
|
}
|
||||||
|
mPos += BinaryFormat::CHARACTER_ARRAY_TERMINATOR_SIZE;
|
||||||
|
return i;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -65,12 +71,10 @@ class TerminalAttributes {
|
||||||
mDict(dict), mFlags(flags), mStartPos(pos) {
|
mDict(dict), mFlags(flags), mStartPos(pos) {
|
||||||
}
|
}
|
||||||
|
|
||||||
inline bool isShortcutOnly() const {
|
|
||||||
return 0 != (mFlags & UnigramDictionary::FLAG_IS_SHORTCUT_ONLY);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline ShortcutIterator getShortcutIterator() const {
|
inline ShortcutIterator getShortcutIterator() const {
|
||||||
return ShortcutIterator(mDict, mStartPos, mFlags);
|
// The size of the shortcuts is stored here so that the whole shortcut chunk can be
|
||||||
|
// skipped quickly, so we ignore it.
|
||||||
|
return ShortcutIterator(mDict, mStartPos + BinaryFormat::SHORTCUT_LIST_SIZE_SIZE, mFlags);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
|
|
|
@ -366,10 +366,9 @@ inline void UnigramDictionary::onTerminal(const int freq,
|
||||||
WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
|
WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
|
||||||
const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength);
|
const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength);
|
||||||
if (finalFreq != NOT_A_FREQUENCY) {
|
if (finalFreq != NOT_A_FREQUENCY) {
|
||||||
if (!terminalAttributes.isShortcutOnly()) {
|
addWord(wordPointer, wordLength, finalFreq, masterQueue);
|
||||||
addWord(wordPointer, wordLength, finalFreq, masterQueue);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
const int shortcutFreq = finalFreq > 0 ? finalFreq - 1 : 0;
|
||||||
// Please note that the shortcut candidates will be added to the master queue only.
|
// Please note that the shortcut candidates will be added to the master queue only.
|
||||||
TerminalAttributes::ShortcutIterator iterator =
|
TerminalAttributes::ShortcutIterator iterator =
|
||||||
terminalAttributes.getShortcutIterator();
|
terminalAttributes.getShortcutIterator();
|
||||||
|
@ -379,11 +378,12 @@ inline void UnigramDictionary::onTerminal(const int freq,
|
||||||
// We need to either modulate the frequency of each shortcut according
|
// We need to either modulate the frequency of each shortcut according
|
||||||
// to its own shortcut frequency or to make the queue
|
// to its own shortcut frequency or to make the queue
|
||||||
// so that the insert order is protected inside the queue for words
|
// so that the insert order is protected inside the queue for words
|
||||||
// with the same score.
|
// with the same score. For the moment we use -1 to make sure the shortcut will
|
||||||
|
// never be in front of the word.
|
||||||
uint16_t shortcutTarget[MAX_WORD_LENGTH_INTERNAL];
|
uint16_t shortcutTarget[MAX_WORD_LENGTH_INTERNAL];
|
||||||
const int shortcutTargetStringLength = iterator.getNextShortcutTarget(
|
const int shortcutTargetStringLength = iterator.getNextShortcutTarget(
|
||||||
MAX_WORD_LENGTH_INTERNAL, shortcutTarget);
|
MAX_WORD_LENGTH_INTERNAL, shortcutTarget);
|
||||||
addWord(shortcutTarget, shortcutTargetStringLength, finalFreq, masterQueue);
|
addWord(shortcutTarget, shortcutTargetStringLength, shortcutFreq, masterQueue);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -49,10 +49,6 @@ class UnigramDictionary {
|
||||||
static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
|
static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
|
||||||
// Flag for bigram presence
|
// Flag for bigram presence
|
||||||
static const int FLAG_HAS_BIGRAMS = 0x04;
|
static const int FLAG_HAS_BIGRAMS = 0x04;
|
||||||
// Flag for shortcut-only words. Some words are shortcut-only, which means they match when
|
|
||||||
// the user types them but they don't pop in the suggestion strip, only the words they are
|
|
||||||
// shortcuts for do.
|
|
||||||
static const int FLAG_IS_SHORTCUT_ONLY = 0x02;
|
|
||||||
|
|
||||||
// Attribute (bigram/shortcut) related flags:
|
// Attribute (bigram/shortcut) related flags:
|
||||||
// Flag for presence of more attributes
|
// Flag for presence of more attributes
|
||||||
|
|
Loading…
Reference in New Issue