Merge "Small native refactoring."

2011-07-26 00:49:56 -07:00 · 2011-07-26 00:49:56 -07:00 · f24eb69d3f
commit f24eb69d3f
parent f50aa19337 6a0e9642a8
3 changed files with 74 additions and 82 deletions
--- a/native/src/binary_format.h
+++ b/native/src/binary_format.h
@ -48,6 +48,8 @@ public:
    static bool hasChildrenInFlags(const uint8_t flags);
    static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags,
            int *pos);
    static int getTerminalPosition(const uint8_t* const root, const uint16_t* const inWord,
            const int length);
 };
 inline int BinaryFormat::detectFormat(const uint8_t* const dict) {
@ -217,6 +219,77 @@ inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* con
    }
 }
 // This function gets the byte position of the last chargroup of the exact matching word in the
 // dictionary. If no match is found, it returns NOT_VALID_WORD.
 inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
        const uint16_t* const inWord, const int length) {
    int pos = 0;
    int wordPos = 0;
    while (true) {
        // If we already traversed the tree further than the word is long, there means
        // there was no match (or we would have found it).
        if (wordPos > length) return NOT_VALID_WORD;
        int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
        const uint16_t wChar = inWord[wordPos];
        while (true) {
            // If there are no more character groups in this node, it means we could not
            // find a matching character for this depth, therefore there is no match.
            if (0 >= charGroupCount) return NOT_VALID_WORD;
            const int charGroupPos = pos;
            const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
            int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
            if (character == wChar) {
                // This is the correct node. Only one character group may start with the same
                // char within a node, so either we found our match in this node, or there is
                // no match and we can return NOT_VALID_WORD. So we will check all the characters
                // in this character group indeed does match.
                if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
                    character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
                    while (NOT_A_CHARACTER != character) {
                        ++wordPos;
                        // If we shoot the length of the word we search for, or if we find a single
                        // character that does not match, as explained above, it means the word is
                        // not in the dictionary (by virtue of this chargroup being the only one to
                        // match the word on the first character, but not matching the whole word).
                        if (wordPos > length) return NOT_VALID_WORD;
                        if (inWord[wordPos] != character) return NOT_VALID_WORD;
                        character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
                    }
                }
                // If we come here we know that so far, we do match. Either we are on a terminal
                // and we match the length, in which case we found it, or we traverse children.
                // If we don't match the length AND don't have children, then a word in the
                // dictionary fully matches a prefix of the searched word but not the full word.
                ++wordPos;
                if (UnigramDictionary::FLAG_IS_TERMINAL & flags) {
                    if (wordPos == length) {
                        return charGroupPos;
                    }
                    pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos);
                }
                if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
                        == (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) {
                    return NOT_VALID_WORD;
                }
                // We have children and we are still shorter than the word we are searching for, so
                // we need to traverse children. Put the pointer on the children position, and
                // break
                pos = BinaryFormat::readChildrenPosition(root, flags, pos);
                break;
            } else {
                // This chargroup does not match, so skip the remaining part and go to the next.
                if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
                    pos = BinaryFormat::skipOtherCharacters(root, pos);
                }
                pos = BinaryFormat::skipFrequency(flags, pos);
                pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
            }
            --charGroupCount;
        }
    }
 }
 } // namespace latinime
 #endif // LATINIME_BINARY_FORMAT_H
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@ -1055,86 +1055,8 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWor
    return maxFreq;
 }
 // This function gets the byte position of the last chargroup of the exact matching word in the
 // dictionary. If no match is found, it returns NOT_VALID_WORD.
 static inline int getTerminalPosition(const uint8_t* const root, const uint16_t* const inWord,
        const int length) {
    int pos = 0;
    int wordPos = 0;
    while (true) {
        // If we already traversed the tree further than the word is long, there means
        // there was no match (or we would have found it).
        if (wordPos > length) return NOT_VALID_WORD;
        int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
        const uint16_t wChar = inWord[wordPos];
        while (true) {
            // If there are no more character groups in this node, it means we could not
            // find a matching character for this depth, therefore there is no match.
            if (0 >= charGroupCount) return NOT_VALID_WORD;
            const int charGroupPos = pos;
            const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
            int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
            if (character == wChar) {
                // This is the correct node. Only one character group may start with the same
                // char within a node, so either we found our match in this node, or there is
                // no match and we can return NOT_VALID_WORD. So we will check all the characters
                // in this character group indeed does match.
                if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
                    character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
                    while (NOT_A_CHARACTER != character) {
                        ++wordPos;
                        // If we shoot the length of the word we search for, or if we find a single
                        // character that does not match, as explained above, it means the word is
                        // not in the dictionary (by virtue of this chargroup being the only one to
                        // match the word on the first character, but not matching the whole word).
                        if (wordPos > length) return NOT_VALID_WORD;
                        if (inWord[wordPos] != character) return NOT_VALID_WORD;
                        character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
                    }
                }
                // If we come here we know that so far, we do match. Either we are on a terminal
                // and we match the length, in which case we found it, or we traverse children.
                // If we don't match the length AND don't have children, then a word in the
                // dictionary fully matches a prefix of the searched word but not the full word.
                ++wordPos;
                if (UnigramDictionary::FLAG_IS_TERMINAL & flags) {
                    if (wordPos == length) {
                        return charGroupPos;
                    }
                    pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos);
                }
                if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
                        == (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) {
                    return NOT_VALID_WORD;
                }
                // We have children and we are still shorter than the word we are searching for, so
                // we need to traverse children. Put the pointer on the children position, and
                // break
                pos = BinaryFormat::readChildrenPosition(root, flags, pos);
                break;
            } else {
                // This chargroup does not match, so skip the remaining part and go to the next.
                if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
                    pos = BinaryFormat::skipOtherCharacters(root, pos);
                }
                pos = BinaryFormat::skipFrequency(flags, pos);
                pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
            }
            --charGroupCount;
        }
    }
 }
 bool UnigramDictionary::isValidWord(const uint16_t* const inWord, const int length) const {
-    return NOT_VALID_WORD != getTerminalPosition(DICT_ROOT, inWord, length);
+    return NOT_VALID_WORD != BinaryFormat::getTerminalPosition(DICT_ROOT, inWord, length);
 }
 int UnigramDictionary::getBigrams(unsigned short *word, int length, int *codes, int codesSize,
        unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams,
        int maxAlternatives) {
    // TODO: add implementation.
    return 0;
 }
 // TODO: remove this function.
--- a/native/src/unigram_dictionary.h
+++ b/native/src/unigram_dictionary.h
@ -71,9 +71,6 @@ public:
    bool isValidWord(unsigned short *word, int length);
 #else // NEW_DICTIONARY_FORMAT
    bool isValidWord(const uint16_t* const inWord, const int length) const;
    int getBigrams(unsigned short *word, int length, int *codes, int codesSize,
            unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams,
            int maxAlternatives);
 #endif // NEW_DICTIONARY_FORMAT
    int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
    int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,