diff --git a/native/src/binary_format.h b/native/src/binary_format.h
index 7deec27d3..a946b1ee3 100644
--- a/native/src/binary_format.h
+++ b/native/src/binary_format.h
@@ -48,6 +48,8 @@ public:
     static bool hasChildrenInFlags(const uint8_t flags);
     static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags,
             int *pos);
+    static int getTerminalPosition(const uint8_t* const root, const uint16_t* const inWord,
+            const int length);
 };
 
 inline int BinaryFormat::detectFormat(const uint8_t* const dict) {
@@ -217,6 +219,80 @@ inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* con
     }
 }
 
+// Gets the byte position of the last chargroup of the exact matching word in the dictionary.
+// root: the dictionary bytes to search; traversal starts at offset 0.
+// inWord: the characters of the searched word; only indices in [0, length) are read.
+// length: the number of characters in inWord.
+// Returns the position of the matching chargroup, or NOT_VALID_WORD if no match is found.
+inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
+        const uint16_t* const inWord, const int length) {
+    int pos = 0;
+    int wordPos = 0;
+
+    while (true) {
+        // If we have consumed the whole word without returning, there is no exact match.
+        // Checking this before reading inWord[wordPos] keeps the read within the word.
+        if (wordPos >= length) return NOT_VALID_WORD;
+        int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
+        const uint16_t wChar = inWord[wordPos];
+        while (true) {
+            // If there are no more character groups in this node, it means we could not
+            // find a matching character for this depth, therefore there is no match.
+            if (0 >= charGroupCount) return NOT_VALID_WORD;
+            const int charGroupPos = pos;
+            const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
+            int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
+            if (character == wChar) {
+                // This is the correct node. Only one character group may start with the same
+                // char within a node, so either we found our match in this node, or there is
+                // no match and we can return NOT_VALID_WORD. So we check that all the remaining
+                // characters in this character group do indeed match.
+                if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
+                    character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
+                    while (NOT_A_CHARACTER != character) {
+                        ++wordPos;
+                        // If we reach the end of the searched word, or if we find a single
+                        // character that does not match, as explained above, it means the word is
+                        // not in the dictionary (by virtue of this chargroup being the only one to
+                        // match the word on the first character, but not matching the whole word).
+                        if (wordPos >= length) return NOT_VALID_WORD;
+                        if (inWord[wordPos] != character) return NOT_VALID_WORD;
+                        character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
+                    }
+                }
+                // If we come here we know that so far, we do match. Either we are on a terminal
+                // and we match the length, in which case we found it, or we traverse children.
+                // If we don't match the length AND don't have children, then a word in the
+                // dictionary fully matches a prefix of the searched word but not the full word.
+                ++wordPos;
+                if (UnigramDictionary::FLAG_IS_TERMINAL & flags) {
+                    if (wordPos == length) {
+                        return charGroupPos;
+                    }
+                    pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos);
+                }
+                if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
+                        == (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) {
+                    return NOT_VALID_WORD;
+                }
+                // We have children and we are still shorter than the word we are searching for, so
+                // we need to traverse children. Put the pointer on the children position, and
+                // break
+                pos = BinaryFormat::readChildrenPosition(root, flags, pos);
+                break;
+            } else {
+                // This chargroup does not match, so skip the remaining part and go to the next.
+                if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
+                    pos = BinaryFormat::skipOtherCharacters(root, pos);
+                }
+                pos = BinaryFormat::skipFrequency(flags, pos);
+                pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
+            }
+            --charGroupCount;
+        }
+    }
+}
+
 } // namespace latinime
 
 #endif // LATINIME_BINARY_FORMAT_H
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index 3cfed6f46..bccd37a61 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -1055,86 +1055,8 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWor
     return maxFreq;
 }
 
-// This function gets the byte position of the last chargroup of the exact matching word in the
-// dictionary. If no match is found, it returns NOT_VALID_WORD.
-static inline int getTerminalPosition(const uint8_t* const root, const uint16_t* const inWord,
-        const int length) {
-    int pos = 0;
-    int wordPos = 0;
-
-    while (true) {
-        // If we already traversed the tree further than the word is long, there means
-        // there was no match (or we would have found it).
-        if (wordPos > length) return NOT_VALID_WORD;
-        int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
-        const uint16_t wChar = inWord[wordPos];
-        while (true) {
-            // If there are no more character groups in this node, it means we could not
-            // find a matching character for this depth, therefore there is no match.
-            if (0 >= charGroupCount) return NOT_VALID_WORD;
-            const int charGroupPos = pos;
-            const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
-            int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
-            if (character == wChar) {
-                // This is the correct node. Only one character group may start with the same
-                // char within a node, so either we found our match in this node, or there is
-                // no match and we can return NOT_VALID_WORD. So we will check all the characters
-                // in this character group indeed does match.
-                if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
-                    character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
-                    while (NOT_A_CHARACTER != character) {
-                        ++wordPos;
-                        // If we shoot the length of the word we search for, or if we find a single
-                        // character that does not match, as explained above, it means the word is
-                        // not in the dictionary (by virtue of this chargroup being the only one to
-                        // match the word on the first character, but not matching the whole word).
-                        if (wordPos > length) return NOT_VALID_WORD;
-                        if (inWord[wordPos] != character) return NOT_VALID_WORD;
-                        character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
-                    }
-                }
-                // If we come here we know that so far, we do match. Either we are on a terminal
-                // and we match the length, in which case we found it, or we traverse children.
-                // If we don't match the length AND don't have children, then a word in the
-                // dictionary fully matches a prefix of the searched word but not the full word.
-                ++wordPos;
-                if (UnigramDictionary::FLAG_IS_TERMINAL & flags) {
-                    if (wordPos == length) {
-                        return charGroupPos;
-                    }
-                    pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos);
-                }
-                if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
-                        == (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) {
-                    return NOT_VALID_WORD;
-                }
-                // We have children and we are still shorter than the word we are searching for, so
-                // we need to traverse children. Put the pointer on the children position, and
-                // break
-                pos = BinaryFormat::readChildrenPosition(root, flags, pos);
-                break;
-            } else {
-                // This chargroup does not match, so skip the remaining part and go to the next.
-                if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
-                    pos = BinaryFormat::skipOtherCharacters(root, pos);
-                }
-                pos = BinaryFormat::skipFrequency(flags, pos);
-                pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
-            }
-            --charGroupCount;
-        }
-    }
-}
-
 bool UnigramDictionary::isValidWord(const uint16_t* const inWord, const int length) const {
-    return NOT_VALID_WORD != getTerminalPosition(DICT_ROOT, inWord, length);
-}
-
-int UnigramDictionary::getBigrams(unsigned short *word, int length, int *codes, int codesSize,
-        unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams,
-        int maxAlternatives) {
-    // TODO: add implementation.
-    return 0;
+    return NOT_VALID_WORD != BinaryFormat::getTerminalPosition(DICT_ROOT, inWord, length);
 }
 
 // TODO: remove this function.
diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h
index 55771eeb8..97198ef13 100644
--- a/native/src/unigram_dictionary.h
+++ b/native/src/unigram_dictionary.h
@@ -71,9 +71,6 @@ public:
     bool isValidWord(unsigned short *word, int length);
 #else // NEW_DICTIONARY_FORMAT
     bool isValidWord(const uint16_t* const inWord, const int length) const;
-    int getBigrams(unsigned short *word, int length, int *codes, int codesSize,
-            unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams,
-            int maxAlternatives);
 #endif // NEW_DICTIONARY_FORMAT
     int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
     int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,