Merge "Small native refactoring."
This commit is contained in:
commit
f24eb69d3f
3 changed files with 74 additions and 82 deletions
|
@ -48,6 +48,8 @@ public:
|
||||||
static bool hasChildrenInFlags(const uint8_t flags);
|
static bool hasChildrenInFlags(const uint8_t flags);
|
||||||
static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags,
|
static int getAttributeAddressAndForwardPointer(const uint8_t* const dict, const uint8_t flags,
|
||||||
int *pos);
|
int *pos);
|
||||||
|
static int getTerminalPosition(const uint8_t* const root, const uint16_t* const inWord,
|
||||||
|
const int length);
|
||||||
};
|
};
|
||||||
|
|
||||||
inline int BinaryFormat::detectFormat(const uint8_t* const dict) {
|
inline int BinaryFormat::detectFormat(const uint8_t* const dict) {
|
||||||
|
@ -217,6 +219,77 @@ inline int BinaryFormat::getAttributeAddressAndForwardPointer(const uint8_t* con
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// This function gets the byte position of the last chargroup of the exact matching word in the
|
||||||
|
// dictionary. If no match is found, it returns NOT_VALID_WORD.
|
||||||
|
inline int BinaryFormat::getTerminalPosition(const uint8_t* const root,
|
||||||
|
const uint16_t* const inWord, const int length) {
|
||||||
|
int pos = 0;
|
||||||
|
int wordPos = 0;
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
// If we already traversed the tree further than the word is long, there means
|
||||||
|
// there was no match (or we would have found it).
|
||||||
|
if (wordPos > length) return NOT_VALID_WORD;
|
||||||
|
int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
|
||||||
|
const uint16_t wChar = inWord[wordPos];
|
||||||
|
while (true) {
|
||||||
|
// If there are no more character groups in this node, it means we could not
|
||||||
|
// find a matching character for this depth, therefore there is no match.
|
||||||
|
if (0 >= charGroupCount) return NOT_VALID_WORD;
|
||||||
|
const int charGroupPos = pos;
|
||||||
|
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
||||||
|
int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
|
||||||
|
if (character == wChar) {
|
||||||
|
// This is the correct node. Only one character group may start with the same
|
||||||
|
// char within a node, so either we found our match in this node, or there is
|
||||||
|
// no match and we can return NOT_VALID_WORD. So we will check all the characters
|
||||||
|
// in this character group indeed does match.
|
||||||
|
if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
|
||||||
|
character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
|
||||||
|
while (NOT_A_CHARACTER != character) {
|
||||||
|
++wordPos;
|
||||||
|
// If we shoot the length of the word we search for, or if we find a single
|
||||||
|
// character that does not match, as explained above, it means the word is
|
||||||
|
// not in the dictionary (by virtue of this chargroup being the only one to
|
||||||
|
// match the word on the first character, but not matching the whole word).
|
||||||
|
if (wordPos > length) return NOT_VALID_WORD;
|
||||||
|
if (inWord[wordPos] != character) return NOT_VALID_WORD;
|
||||||
|
character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// If we come here we know that so far, we do match. Either we are on a terminal
|
||||||
|
// and we match the length, in which case we found it, or we traverse children.
|
||||||
|
// If we don't match the length AND don't have children, then a word in the
|
||||||
|
// dictionary fully matches a prefix of the searched word but not the full word.
|
||||||
|
++wordPos;
|
||||||
|
if (UnigramDictionary::FLAG_IS_TERMINAL & flags) {
|
||||||
|
if (wordPos == length) {
|
||||||
|
return charGroupPos;
|
||||||
|
}
|
||||||
|
pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos);
|
||||||
|
}
|
||||||
|
if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
|
||||||
|
== (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) {
|
||||||
|
return NOT_VALID_WORD;
|
||||||
|
}
|
||||||
|
// We have children and we are still shorter than the word we are searching for, so
|
||||||
|
// we need to traverse children. Put the pointer on the children position, and
|
||||||
|
// break
|
||||||
|
pos = BinaryFormat::readChildrenPosition(root, flags, pos);
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
// This chargroup does not match, so skip the remaining part and go to the next.
|
||||||
|
if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
|
||||||
|
pos = BinaryFormat::skipOtherCharacters(root, pos);
|
||||||
|
}
|
||||||
|
pos = BinaryFormat::skipFrequency(flags, pos);
|
||||||
|
pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
|
||||||
|
}
|
||||||
|
--charGroupCount;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
|
|
||||||
#endif // LATINIME_BINARY_FORMAT_H
|
#endif // LATINIME_BINARY_FORMAT_H
|
||||||
|
|
|
@ -1055,86 +1055,8 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWor
|
||||||
return maxFreq;
|
return maxFreq;
|
||||||
}
|
}
|
||||||
|
|
||||||
// This function gets the byte position of the last chargroup of the exact matching word in the
|
|
||||||
// dictionary. If no match is found, it returns NOT_VALID_WORD.
|
|
||||||
static inline int getTerminalPosition(const uint8_t* const root, const uint16_t* const inWord,
|
|
||||||
const int length) {
|
|
||||||
int pos = 0;
|
|
||||||
int wordPos = 0;
|
|
||||||
|
|
||||||
while (true) {
|
|
||||||
// If we already traversed the tree further than the word is long, there means
|
|
||||||
// there was no match (or we would have found it).
|
|
||||||
if (wordPos > length) return NOT_VALID_WORD;
|
|
||||||
int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos);
|
|
||||||
const uint16_t wChar = inWord[wordPos];
|
|
||||||
while (true) {
|
|
||||||
// If there are no more character groups in this node, it means we could not
|
|
||||||
// find a matching character for this depth, therefore there is no match.
|
|
||||||
if (0 >= charGroupCount) return NOT_VALID_WORD;
|
|
||||||
const int charGroupPos = pos;
|
|
||||||
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
|
||||||
int32_t character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
|
|
||||||
if (character == wChar) {
|
|
||||||
// This is the correct node. Only one character group may start with the same
|
|
||||||
// char within a node, so either we found our match in this node, or there is
|
|
||||||
// no match and we can return NOT_VALID_WORD. So we will check all the characters
|
|
||||||
// in this character group indeed does match.
|
|
||||||
if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
|
|
||||||
character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
|
|
||||||
while (NOT_A_CHARACTER != character) {
|
|
||||||
++wordPos;
|
|
||||||
// If we shoot the length of the word we search for, or if we find a single
|
|
||||||
// character that does not match, as explained above, it means the word is
|
|
||||||
// not in the dictionary (by virtue of this chargroup being the only one to
|
|
||||||
// match the word on the first character, but not matching the whole word).
|
|
||||||
if (wordPos > length) return NOT_VALID_WORD;
|
|
||||||
if (inWord[wordPos] != character) return NOT_VALID_WORD;
|
|
||||||
character = BinaryFormat::getCharCodeAndForwardPointer(root, &pos);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// If we come here we know that so far, we do match. Either we are on a terminal
|
|
||||||
// and we match the length, in which case we found it, or we traverse children.
|
|
||||||
// If we don't match the length AND don't have children, then a word in the
|
|
||||||
// dictionary fully matches a prefix of the searched word but not the full word.
|
|
||||||
++wordPos;
|
|
||||||
if (UnigramDictionary::FLAG_IS_TERMINAL & flags) {
|
|
||||||
if (wordPos == length) {
|
|
||||||
return charGroupPos;
|
|
||||||
}
|
|
||||||
pos = BinaryFormat::skipFrequency(UnigramDictionary::FLAG_IS_TERMINAL, pos);
|
|
||||||
}
|
|
||||||
if (UnigramDictionary::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
|
|
||||||
== (UnigramDictionary::MASK_GROUP_ADDRESS_TYPE & flags)) {
|
|
||||||
return NOT_VALID_WORD;
|
|
||||||
}
|
|
||||||
// We have children and we are still shorter than the word we are searching for, so
|
|
||||||
// we need to traverse children. Put the pointer on the children position, and
|
|
||||||
// break
|
|
||||||
pos = BinaryFormat::readChildrenPosition(root, flags, pos);
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
// This chargroup does not match, so skip the remaining part and go to the next.
|
|
||||||
if (UnigramDictionary::FLAG_HAS_MULTIPLE_CHARS & flags) {
|
|
||||||
pos = BinaryFormat::skipOtherCharacters(root, pos);
|
|
||||||
}
|
|
||||||
pos = BinaryFormat::skipFrequency(flags, pos);
|
|
||||||
pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos);
|
|
||||||
}
|
|
||||||
--charGroupCount;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
bool UnigramDictionary::isValidWord(const uint16_t* const inWord, const int length) const {
|
bool UnigramDictionary::isValidWord(const uint16_t* const inWord, const int length) const {
|
||||||
return NOT_VALID_WORD != getTerminalPosition(DICT_ROOT, inWord, length);
|
return NOT_VALID_WORD != BinaryFormat::getTerminalPosition(DICT_ROOT, inWord, length);
|
||||||
}
|
|
||||||
|
|
||||||
int UnigramDictionary::getBigrams(unsigned short *word, int length, int *codes, int codesSize,
|
|
||||||
unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams,
|
|
||||||
int maxAlternatives) {
|
|
||||||
// TODO: add implementation.
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: remove this function.
|
// TODO: remove this function.
|
||||||
|
|
|
@ -71,9 +71,6 @@ public:
|
||||||
bool isValidWord(unsigned short *word, int length);
|
bool isValidWord(unsigned short *word, int length);
|
||||||
#else // NEW_DICTIONARY_FORMAT
|
#else // NEW_DICTIONARY_FORMAT
|
||||||
bool isValidWord(const uint16_t* const inWord, const int length) const;
|
bool isValidWord(const uint16_t* const inWord, const int length) const;
|
||||||
int getBigrams(unsigned short *word, int length, int *codes, int codesSize,
|
|
||||||
unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams,
|
|
||||||
int maxAlternatives);
|
|
||||||
#endif // NEW_DICTIONARY_FORMAT
|
#endif // NEW_DICTIONARY_FORMAT
|
||||||
int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
|
int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
|
||||||
int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
|
int getSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
|
||||||
|
|
Loading…
Reference in a new issue