Reviewer notes (free text ahead of the first "diff --git" header; patch tools skip it).

* binary_format.h: BinaryFormat::skipAttributes() is replaced by skipShortcuts() and
  skipBigrams(); the old address-walking loop survives as the file-local
  skipExistingBigrams(). skipShortcuts() reads a 16-bit big-endian byte count at `pos`
  and returns `pos + count`, so the stored count presumably includes its own
  SHORTCUT_LIST_SIZE_SIZE (2) bytes -- confirm against the dictionary writer.
* terminal_attributes.h: getNextShortcutTarget() now decodes the target in place as a
  character array instead of following an attribute address. NOTE(review): `maxDepth`
  is no longer used, the loop is bounded only by MAX_WORD_LENGTH_INTERNAL, and the
  unconditional `mPos += CHARACTER_ARRAY_TERMINATOR_SIZE` assumes that
  getCharCodeAndForwardPointer() does not consume the terminator itself -- confirm.
  getShortcutIterator() starts SHORTCUT_LIST_SIZE_SIZE bytes past mStartPos to step
  over the size field.
* unigram_dictionary.{h,cpp}: FLAG_IS_SHORTCUT_ONLY and isShortcutOnly() are removed;
  onTerminal() always adds the typed word and adds each shortcut with
  `finalFreq - 1` (floored at 0).
* bigram_dictionary.cpp: getBigrams() skips the shortcut list before reading bigrams.

diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp
index f7a3d3e60..8d6c3d179 100644
--- a/native/jni/src/bigram_dictionary.cpp
+++ b/native/jni/src/bigram_dictionary.cpp
@@ -123,6 +123,7 @@ int BigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, i
     }
     pos = BinaryFormat::skipChildrenPosition(flags, pos);
     pos = BinaryFormat::skipFrequency(flags, pos);
+    pos = BinaryFormat::skipShortcuts(root, flags, pos);
     int bigramFlags;
     int bigramCount = 0;
     do {
diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h
index ab033ad90..2ac6e053f 100644
--- a/native/jni/src/binary_format.h
+++ b/native/jni/src/binary_format.h
@@ -40,6 +40,9 @@ class BinaryFormat {
     // implementations. On this occasion, we made the magic number 32 bits long.
     const static uint32_t FORMAT_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;
 
+    const static int CHARACTER_ARRAY_TERMINATOR_SIZE = 1;
+    const static int SHORTCUT_LIST_SIZE_SIZE = 2;
+
     static int detectFormat(const uint8_t* const dict);
     static unsigned int getHeaderSize(const uint8_t* const dict);
     static int getGroupCountAndForwardPointer(const uint8_t* const dict, int* pos);
@@ -47,9 +50,10 @@ class BinaryFormat {
     static int32_t getCharCodeAndForwardPointer(const uint8_t* const dict, int* pos);
     static int readFrequencyWithoutMovingPointer(const uint8_t* const dict, const int pos);
     static int skipOtherCharacters(const uint8_t* const dict, const int pos);
-    static int skipAttributes(const uint8_t* const dict, const int pos);
     static int skipChildrenPosition(const uint8_t flags, const int pos);
     static int skipFrequency(const uint8_t flags, const int pos);
+    static int skipShortcuts(const uint8_t* const dict, const uint8_t flags, const int pos);
+    static int skipBigrams(const uint8_t* const dict, const uint8_t flags, const int pos);
     static int skipAllAttributes(const uint8_t* const dict, const uint8_t flags, const int pos);
     static int skipChildrenPosAndAttributes(const uint8_t* const dict, const uint8_t flags,
             const int pos);
@@ -157,12 +161,12 @@ static inline int attributeAddressSize(const uint8_t flags) {
     */
 }
 
-inline int BinaryFormat::skipAttributes(const uint8_t* const dict, const int pos) {
+static inline int skipExistingBigrams(const uint8_t* const dict, const int pos) {
     int currentPos = pos;
-    uint8_t flags = getFlagsAndForwardPointer(dict, &currentPos);
+    uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
     while (flags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT) {
         currentPos += attributeAddressSize(flags);
-        flags = getFlagsAndForwardPointer(dict, &currentPos);
+        flags = BinaryFormat::getFlagsAndForwardPointer(dict, &currentPos);
     }
     currentPos += attributeAddressSize(flags);
     return currentPos;
@@ -174,6 +178,10 @@ static inline int childrenAddressSize(const uint8_t flags) {
     /* See the note in attributeAddressSize. The same applies here */
 }
 
+static inline int shortcutByteSize(const uint8_t* const dict, const int pos) {
+    return ((int)(dict[pos] << 8)) + (dict[pos + 1]);
+}
+
 inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) {
     return pos + childrenAddressSize(flags);
 }
@@ -182,16 +190,30 @@ inline int BinaryFormat::skipFrequency(const uint8_t flags, const int pos) {
     return UnigramDictionary::FLAG_IS_TERMINAL & flags ? pos + 1 : pos;
 }
 
+inline int BinaryFormat::skipShortcuts(const uint8_t* const dict, const uint8_t flags,
+        const int pos) {
+    if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) {
+        return pos + shortcutByteSize(dict, pos);
+    } else {
+        return pos;
+    }
+}
+
+inline int BinaryFormat::skipBigrams(const uint8_t* const dict, const uint8_t flags,
+        const int pos) {
+    if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
+        return skipExistingBigrams(dict, pos);
+    } else {
+        return pos;
+    }
+}
+
 inline int BinaryFormat::skipAllAttributes(const uint8_t* const dict, const uint8_t flags,
         const int pos) {
     // This function skips all attributes: shortcuts and bigrams.
     int newPos = pos;
-    if (UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS & flags) {
-        newPos = skipAttributes(dict, newPos);
-    }
-    if (UnigramDictionary::FLAG_HAS_BIGRAMS & flags) {
-        newPos = skipAttributes(dict, newPos);
-    }
+    newPos = skipShortcuts(dict, flags, newPos);
+    newPos = skipBigrams(dict, flags, newPos);
     return newPos;
 }
 
diff --git a/native/jni/src/terminal_attributes.h b/native/jni/src/terminal_attributes.h
index 1f9815936..9a803cca1 100644
--- a/native/jni/src/terminal_attributes.h
+++ b/native/jni/src/terminal_attributes.h
@@ -45,13 +45,19 @@ class TerminalAttributes {
 
         // Gets the shortcut target itself as a uint16_t string. For parameters and return value
         // see BinaryFormat::getWordAtAddress.
+        // TODO: make the output an uint32_t* to handle the whole unicode range.
         inline int getNextShortcutTarget(const int maxDepth, uint16_t* outWord) {
             const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer(mDict, &mPos);
             mHasNextShortcutTarget =
                     0 != (shortcutFlags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT);
-            int shortcutAddress =
-                    BinaryFormat::getAttributeAddressAndForwardPointer(mDict, shortcutFlags, &mPos);
-            return BinaryFormat::getWordAtAddress(mDict, shortcutAddress, maxDepth, outWord);
+            unsigned int i;
+            for (i = 0; i < MAX_WORD_LENGTH_INTERNAL; ++i) {
+                const int charCode = BinaryFormat::getCharCodeAndForwardPointer(mDict, &mPos);
+                if (NOT_A_CHARACTER == charCode) break;
+                outWord[i] = (uint16_t)charCode;
+            }
+            mPos += BinaryFormat::CHARACTER_ARRAY_TERMINATOR_SIZE;
+            return i;
         }
     };
 
@@ -65,12 +71,10 @@ class TerminalAttributes {
             mDict(dict), mFlags(flags), mStartPos(pos) {
     }
 
-    inline bool isShortcutOnly() const {
-        return 0 != (mFlags & UnigramDictionary::FLAG_IS_SHORTCUT_ONLY);
-    }
-
     inline ShortcutIterator getShortcutIterator() const {
-        return ShortcutIterator(mDict, mStartPos, mFlags);
+        // The size of the shortcuts is stored here so that the whole shortcut chunk can be
+        // skipped quickly, so we ignore it.
+        return ShortcutIterator(mDict, mStartPos + BinaryFormat::SHORTCUT_LIST_SIZE_SIZE, mFlags);
     }
 };
 } // namespace latinime
diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp
index ed4c066f3..50805ad87 100644
--- a/native/jni/src/unigram_dictionary.cpp
+++ b/native/jni/src/unigram_dictionary.cpp
@@ -366,10 +366,9 @@ inline void UnigramDictionary::onTerminal(const int freq,
     WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
     const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength);
     if (finalFreq != NOT_A_FREQUENCY) {
-        if (!terminalAttributes.isShortcutOnly()) {
-            addWord(wordPointer, wordLength, finalFreq, masterQueue);
-        }
+        addWord(wordPointer, wordLength, finalFreq, masterQueue);
 
+        const int shortcutFreq = finalFreq > 0 ? finalFreq - 1 : 0;
         // Please note that the shortcut candidates will be added to the master queue only.
         TerminalAttributes::ShortcutIterator iterator =
                 terminalAttributes.getShortcutIterator();
@@ -379,11 +378,12 @@ inline void UnigramDictionary::onTerminal(const int freq,
             // We need to either modulate the frequency of each shortcut according
             // to its own shortcut frequency or to make the queue
             // so that the insert order is protected inside the queue for words
-            // with the same score.
+            // with the same score. For the moment we use -1 to make sure the shortcut will
+            // never be in front of the word.
             uint16_t shortcutTarget[MAX_WORD_LENGTH_INTERNAL];
             const int shortcutTargetStringLength = iterator.getNextShortcutTarget(
                     MAX_WORD_LENGTH_INTERNAL, shortcutTarget);
-            addWord(shortcutTarget, shortcutTargetStringLength, finalFreq, masterQueue);
+            addWord(shortcutTarget, shortcutTargetStringLength, shortcutFreq, masterQueue);
         }
     }
 }
diff --git a/native/jni/src/unigram_dictionary.h b/native/jni/src/unigram_dictionary.h
index c8f15566c..d501d5019 100644
--- a/native/jni/src/unigram_dictionary.h
+++ b/native/jni/src/unigram_dictionary.h
@@ -49,10 +49,6 @@ class UnigramDictionary {
     static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
     // Flag for bigram presence
     static const int FLAG_HAS_BIGRAMS = 0x04;
-    // Flag for shortcut-only words. Some words are shortcut-only, which means they match when
-    // the user types them but they don't pop in the suggestion strip, only the words they are
-    // shortcuts for do.
-    static const int FLAG_IS_SHORTCUT_ONLY = 0x02;
 
     // Attribute (bigram/shortcut) related flags:
     // Flag for presence of more attributes