Merge "Give PatriciaTrieReadingUtils methods for reading nodes."
This commit is contained in:
commit
80c9b829bd
9 changed files with 306 additions and 113 deletions
|
@ -72,7 +72,8 @@ LATIN_IME_CORE_SRC_FILES := \
|
||||||
suggest/core/session/dic_traverse_session.cpp \
|
suggest/core/session/dic_traverse_session.cpp \
|
||||||
$(addprefix suggest/policyimpl/dictionary/, \
|
$(addprefix suggest/policyimpl/dictionary/, \
|
||||||
dynamic_patricia_trie_policy.cpp \
|
dynamic_patricia_trie_policy.cpp \
|
||||||
patricia_trie_policy.cpp) \
|
patricia_trie_policy.cpp \
|
||||||
|
patricia_trie_reading_utils.cpp) \
|
||||||
suggest/policyimpl/gesture/gesture_suggest_policy_factory.cpp \
|
suggest/policyimpl/gesture/gesture_suggest_policy_factory.cpp \
|
||||||
$(addprefix suggest/policyimpl/typing/, \
|
$(addprefix suggest/policyimpl/typing/, \
|
||||||
scoring_params.cpp \
|
scoring_params.cpp \
|
||||||
|
|
|
@ -44,15 +44,15 @@ const int TaUtils::WHITELIST_SHORTCUT_PROBABILITY = 15;
|
||||||
const int origin = *pos;
|
const int origin = *pos;
|
||||||
switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
|
switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
|
||||||
case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
|
case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
|
||||||
offset = ByteArrayUtils::readUint8andAdvancePosition(
|
offset = ByteArrayUtils::readUint8AndAdvancePosition(
|
||||||
binaryDictionaryInfo->getDictRoot(), pos);
|
binaryDictionaryInfo->getDictRoot(), pos);
|
||||||
break;
|
break;
|
||||||
case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
|
case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
|
||||||
offset = ByteArrayUtils::readUint16andAdvancePosition(
|
offset = ByteArrayUtils::readUint16AndAdvancePosition(
|
||||||
binaryDictionaryInfo->getDictRoot(), pos);
|
binaryDictionaryInfo->getDictRoot(), pos);
|
||||||
break;
|
break;
|
||||||
case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
|
case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
|
||||||
offset = ByteArrayUtils::readUint24andAdvancePosition(
|
offset = ByteArrayUtils::readUint24AndAdvancePosition(
|
||||||
binaryDictionaryInfo->getDictRoot(), pos);
|
binaryDictionaryInfo->getDictRoot(), pos);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
|
@ -33,7 +33,7 @@ class BinaryDictionaryTerminalAttributesReadingUtils {
|
||||||
|
|
||||||
static AK_FORCE_INLINE TerminalAttributeFlags getFlagsAndForwardPointer(
|
static AK_FORCE_INLINE TerminalAttributeFlags getFlagsAndForwardPointer(
|
||||||
const BinaryDictionaryInfo *const binaryDictionaryInfo, int *const pos) {
|
const BinaryDictionaryInfo *const binaryDictionaryInfo, int *const pos) {
|
||||||
return ByteArrayUtils::readUint8andAdvancePosition(
|
return ByteArrayUtils::readUint8AndAdvancePosition(
|
||||||
binaryDictionaryInfo->getDictRoot(), pos);
|
binaryDictionaryInfo->getDictRoot(), pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -66,7 +66,7 @@ class BinaryDictionaryTerminalAttributesReadingUtils {
|
||||||
static AK_FORCE_INLINE int getShortcutListSizeAndForwardPointer(
|
static AK_FORCE_INLINE int getShortcutListSizeAndForwardPointer(
|
||||||
const BinaryDictionaryInfo *const binaryDictionaryInfo, int *const pos) {
|
const BinaryDictionaryInfo *const binaryDictionaryInfo, int *const pos) {
|
||||||
// readUint16andAdvancePosition() returns an offset *including* the uint16 field itself.
|
// readUint16AndAdvancePosition() returns an offset *including* the uint16 field itself.
|
||||||
return ByteArrayUtils::readUint16andAdvancePosition(
|
return ByteArrayUtils::readUint16AndAdvancePosition(
|
||||||
binaryDictionaryInfo->getDictRoot(), pos) - SHORTCUT_LIST_SIZE_FIELD_SIZE;
|
binaryDictionaryInfo->getDictRoot(), pos) - SHORTCUT_LIST_SIZE_FIELD_SIZE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -50,39 +50,39 @@ class ByteArrayUtils {
|
||||||
return buffer[pos];
|
return buffer[pos];
|
||||||
}
|
}
|
||||||
|
|
||||||
static AK_FORCE_INLINE uint32_t readUint32andAdvancePosition(
|
static AK_FORCE_INLINE uint32_t readUint32AndAdvancePosition(
|
||||||
const uint8_t *const buffer, int *const pos) {
|
const uint8_t *const buffer, int *const pos) {
|
||||||
const uint32_t value = readUint32(buffer, *pos);
|
const uint32_t value = readUint32(buffer, *pos);
|
||||||
*pos += 4;
|
*pos += 4;
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
static AK_FORCE_INLINE int readSint24andAdvancePosition(
|
static AK_FORCE_INLINE int readSint24AndAdvancePosition(
|
||||||
const uint8_t *const buffer, int *const pos) {
|
const uint8_t *const buffer, int *const pos) {
|
||||||
const uint8_t value = readUint8(buffer, *pos);
|
const uint8_t value = readUint8(buffer, *pos);
|
||||||
if (value < 0x80) {
|
if (value < 0x80) {
|
||||||
return readUint24andAdvancePosition(buffer, pos);
|
return readUint24AndAdvancePosition(buffer, pos);
|
||||||
} else {
|
} else {
|
||||||
(*pos)++;
|
(*pos)++;
|
||||||
return -(((value & 0x7F) << 16) ^ readUint16andAdvancePosition(buffer, pos));
|
return -(((value & 0x7F) << 16) ^ readUint16AndAdvancePosition(buffer, pos));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static AK_FORCE_INLINE uint32_t readUint24andAdvancePosition(
|
static AK_FORCE_INLINE uint32_t readUint24AndAdvancePosition(
|
||||||
const uint8_t *const buffer, int *const pos) {
|
const uint8_t *const buffer, int *const pos) {
|
||||||
const uint32_t value = readUint24(buffer, *pos);
|
const uint32_t value = readUint24(buffer, *pos);
|
||||||
*pos += 3;
|
*pos += 3;
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
static AK_FORCE_INLINE uint16_t readUint16andAdvancePosition(
|
static AK_FORCE_INLINE uint16_t readUint16AndAdvancePosition(
|
||||||
const uint8_t *const buffer, int *const pos) {
|
const uint8_t *const buffer, int *const pos) {
|
||||||
const uint16_t value = readUint16(buffer, *pos);
|
const uint16_t value = readUint16(buffer, *pos);
|
||||||
*pos += 2;
|
*pos += 2;
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
static AK_FORCE_INLINE uint8_t readUint8andAdvancePosition(
|
static AK_FORCE_INLINE uint8_t readUint8AndAdvancePosition(
|
||||||
const uint8_t *const buffer, int *const pos) {
|
const uint8_t *const buffer, int *const pos) {
|
||||||
return buffer[(*pos)++];
|
return buffer[(*pos)++];
|
||||||
}
|
}
|
||||||
|
@ -113,7 +113,7 @@ class ByteArrayUtils {
|
||||||
*pos += 1;
|
*pos += 1;
|
||||||
return NOT_A_CODE_POINT;
|
return NOT_A_CODE_POINT;
|
||||||
} else {
|
} else {
|
||||||
return readUint24andAdvancePosition(buffer, pos);
|
return readUint24AndAdvancePosition(buffer, pos);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
*pos += 1;
|
*pos += 1;
|
||||||
|
|
|
@ -56,7 +56,6 @@ class BinaryFormat {
|
||||||
// Mask and flags for attribute address type selection.
|
// Mask and flags for attribute address type selection.
|
||||||
static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
|
static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
|
||||||
|
|
||||||
static bool hasBlacklistedOrNotAWordFlag(const int flags);
|
|
||||||
static int getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos);
|
static int getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos);
|
||||||
static uint8_t getFlagsAndForwardPointer(const uint8_t *const dict, int *pos);
|
static uint8_t getFlagsAndForwardPointer(const uint8_t *const dict, int *pos);
|
||||||
static int getCodePointAndForwardPointer(const uint8_t *const dict, int *pos);
|
static int getCodePointAndForwardPointer(const uint8_t *const dict, int *pos);
|
||||||
|
@ -74,10 +73,6 @@ class BinaryFormat {
|
||||||
static int getCodePointsAndProbabilityAndReturnCodePointCount(
|
static int getCodePointsAndProbabilityAndReturnCodePointCount(
|
||||||
const uint8_t *const root, const int nodePos, const int maxCodePointCount,
|
const uint8_t *const root, const int nodePos, const int maxCodePointCount,
|
||||||
int *const outCodePoints, int *const outUnigramProbability);
|
int *const outCodePoints, int *const outUnigramProbability);
|
||||||
static int getBigramListPositionForWordPosition(const uint8_t *const root,
|
|
||||||
const int nodePosition);
|
|
||||||
static int getShortcutListPositionForWordPosition(const uint8_t *const root,
|
|
||||||
const int nodePosition);
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat);
|
||||||
|
@ -99,10 +94,6 @@ class BinaryFormat {
|
||||||
static int skipBigrams(const uint8_t *const dict, const uint8_t flags, const int pos);
|
static int skipBigrams(const uint8_t *const dict, const uint8_t flags, const int pos);
|
||||||
};
|
};
|
||||||
|
|
||||||
inline bool BinaryFormat::hasBlacklistedOrNotAWordFlag(const int flags) {
|
|
||||||
return (flags & (FLAG_IS_BLACKLISTED | FLAG_IS_NOT_A_WORD)) != 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
AK_FORCE_INLINE int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t *const dict,
|
AK_FORCE_INLINE int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t *const dict,
|
||||||
int *pos) {
|
int *pos) {
|
||||||
const int msb = dict[(*pos)++];
|
const int msb = dict[(*pos)++];
|
||||||
|
@ -475,38 +466,5 @@ AK_FORCE_INLINE int BinaryFormat::getCodePointsAndProbabilityAndReturnCodePointC
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
AK_FORCE_INLINE int BinaryFormat::getBigramListPositionForWordPosition(
|
|
||||||
const uint8_t *const root, const int nodePosition) {
|
|
||||||
if (NOT_A_VALID_WORD_POS == nodePosition) return NOT_A_DICT_POS;
|
|
||||||
int position = nodePosition;
|
|
||||||
const uint8_t flags = getFlagsAndForwardPointer(root, &position);
|
|
||||||
if (!(flags & FLAG_HAS_BIGRAMS)) return NOT_A_DICT_POS;
|
|
||||||
if (flags & FLAG_HAS_MULTIPLE_CHARS) {
|
|
||||||
position = skipOtherCharacters(root, position);
|
|
||||||
} else {
|
|
||||||
getCodePointAndForwardPointer(root, &position);
|
|
||||||
}
|
|
||||||
position = skipProbability(flags, position);
|
|
||||||
position = skipChildrenPosition(flags, position);
|
|
||||||
position = skipShortcuts(root, flags, position);
|
|
||||||
return position;
|
|
||||||
}
|
|
||||||
|
|
||||||
AK_FORCE_INLINE int BinaryFormat::getShortcutListPositionForWordPosition(
|
|
||||||
const uint8_t *const root, const int nodePosition) {
|
|
||||||
if (NOT_A_VALID_WORD_POS == nodePosition) return NOT_A_DICT_POS;
|
|
||||||
int position = nodePosition;
|
|
||||||
const uint8_t flags = getFlagsAndForwardPointer(root, &position);
|
|
||||||
if (!(flags & FLAG_HAS_SHORTCUT_TARGETS)) return NOT_A_DICT_POS;
|
|
||||||
if (flags & FLAG_HAS_MULTIPLE_CHARS) {
|
|
||||||
position = skipOtherCharacters(root, position);
|
|
||||||
} else {
|
|
||||||
getCodePointAndForwardPointer(root, &position);
|
|
||||||
}
|
|
||||||
position = skipProbability(flags, position);
|
|
||||||
position = skipChildrenPosition(flags, position);
|
|
||||||
return position;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
#endif // LATINIME_BINARY_FORMAT_H
|
#endif // LATINIME_BINARY_FORMAT_H
|
||||||
|
|
|
@ -21,7 +21,9 @@
|
||||||
#include "suggest/core/dicnode/dic_node.h"
|
#include "suggest/core/dicnode/dic_node.h"
|
||||||
#include "suggest/core/dicnode/dic_node_vector.h"
|
#include "suggest/core/dicnode/dic_node_vector.h"
|
||||||
#include "suggest/core/dictionary/binary_dictionary_info.h"
|
#include "suggest/core/dictionary/binary_dictionary_info.h"
|
||||||
|
#include "suggest/core/dictionary/binary_dictionary_terminal_attributes_reading_utils.h"
|
||||||
#include "suggest/policyimpl/dictionary/binary_format.h"
|
#include "suggest/policyimpl/dictionary/binary_format.h"
|
||||||
|
#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
|
@ -34,7 +36,7 @@ void PatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode,
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
int nextPos = dicNode->getChildrenPos();
|
int nextPos = dicNode->getChildrenPos();
|
||||||
const int childCount = BinaryFormat::getGroupCountAndForwardPointer(
|
const int childCount = PatriciaTrieReadingUtils::getGroupCountAndAdvancePosition(
|
||||||
binaryDictionaryInfo->getDictRoot(), &nextPos);
|
binaryDictionaryInfo->getDictRoot(), &nextPos);
|
||||||
for (int i = 0; i < childCount; i++) {
|
for (int i = 0; i < childCount; i++) {
|
||||||
nextPos = createAndGetLeavingChildNode(dicNode, nextPos, binaryDictionaryInfo,
|
nextPos = createAndGetLeavingChildNode(dicNode, nextPos, binaryDictionaryInfo,
|
||||||
|
@ -60,82 +62,108 @@ int PatriciaTriePolicy::getTerminalNodePositionOfWord(
|
||||||
|
|
||||||
int PatriciaTriePolicy::getUnigramProbability(
|
int PatriciaTriePolicy::getUnigramProbability(
|
||||||
const BinaryDictionaryInfo *const binaryDictionaryInfo, const int nodePos) const {
|
const BinaryDictionaryInfo *const binaryDictionaryInfo, const int nodePos) const {
|
||||||
const uint8_t *const root = binaryDictionaryInfo->getDictRoot();
|
if (nodePos == NOT_A_VALID_WORD_POS) {
|
||||||
|
return NOT_A_PROBABILITY;
|
||||||
|
}
|
||||||
|
const uint8_t *const dictRoot = binaryDictionaryInfo->getDictRoot();
|
||||||
int pos = nodePos;
|
int pos = nodePos;
|
||||||
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos);
|
const PatriciaTrieReadingUtils::NodeFlags flags =
|
||||||
if (flags & (BinaryFormat::FLAG_IS_BLACKLISTED | BinaryFormat::FLAG_IS_NOT_A_WORD)) {
|
PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictRoot, &pos);
|
||||||
|
if (!PatriciaTrieReadingUtils::isTerminal(flags)) {
|
||||||
|
return NOT_A_PROBABILITY;
|
||||||
|
}
|
||||||
|
if (PatriciaTrieReadingUtils::isNotAWord(flags)
|
||||||
|
|| PatriciaTrieReadingUtils::isBlacklisted(flags)) {
|
||||||
// If this is not a word, or if it's a blacklisted entry, it should behave as
|
// If this is not a word, or if it's a blacklisted entry, it should behave as
|
||||||
// having no probability outside of the suggestion process (where it should be used
|
// having no probability outside of the suggestion process (where it should be used
|
||||||
// for shortcuts).
|
// for shortcuts).
|
||||||
return NOT_A_PROBABILITY;
|
return NOT_A_PROBABILITY;
|
||||||
}
|
}
|
||||||
const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
|
PatriciaTrieReadingUtils::skipCharacters(dictRoot, flags, MAX_WORD_LENGTH, &pos);
|
||||||
if (hasMultipleChars) {
|
return PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(dictRoot, &pos);
|
||||||
pos = BinaryFormat::skipOtherCharacters(root, pos);
|
|
||||||
} else {
|
|
||||||
BinaryFormat::getCodePointAndForwardPointer(root, &pos);
|
|
||||||
}
|
|
||||||
return BinaryFormat::readProbabilityWithoutMovingPointer(root, pos);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int PatriciaTriePolicy::getShortcutPositionOfNode(
|
int PatriciaTriePolicy::getShortcutPositionOfNode(
|
||||||
const BinaryDictionaryInfo *const binaryDictionaryInfo,
|
const BinaryDictionaryInfo *const binaryDictionaryInfo,
|
||||||
const int nodePos) const {
|
const int nodePos) const {
|
||||||
return BinaryFormat::getShortcutListPositionForWordPosition(
|
if (nodePos == NOT_A_VALID_WORD_POS) {
|
||||||
binaryDictionaryInfo->getDictRoot(), nodePos);
|
return NOT_A_DICT_POS;
|
||||||
|
}
|
||||||
|
const uint8_t *const dictRoot = binaryDictionaryInfo->getDictRoot();
|
||||||
|
int pos = nodePos;
|
||||||
|
const PatriciaTrieReadingUtils::NodeFlags flags =
|
||||||
|
PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictRoot, &pos);
|
||||||
|
if (!PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
|
||||||
|
return NOT_A_DICT_POS;
|
||||||
|
}
|
||||||
|
PatriciaTrieReadingUtils::skipCharacters(dictRoot, flags, MAX_WORD_LENGTH, &pos);
|
||||||
|
if (PatriciaTrieReadingUtils::isTerminal(flags)) {
|
||||||
|
PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(dictRoot, &pos);
|
||||||
|
}
|
||||||
|
if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
|
||||||
|
PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(dictRoot, flags, &pos);
|
||||||
|
}
|
||||||
|
return pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
int PatriciaTriePolicy::getBigramsPositionOfNode(
|
int PatriciaTriePolicy::getBigramsPositionOfNode(
|
||||||
const BinaryDictionaryInfo *const binaryDictionaryInfo,
|
const BinaryDictionaryInfo *const binaryDictionaryInfo,
|
||||||
const int nodePos) const {
|
const int nodePos) const {
|
||||||
return BinaryFormat::getBigramListPositionForWordPosition(
|
if (nodePos == NOT_A_VALID_WORD_POS) {
|
||||||
binaryDictionaryInfo->getDictRoot(), nodePos);
|
return NOT_A_DICT_POS;
|
||||||
|
}
|
||||||
|
const uint8_t *const dictRoot = binaryDictionaryInfo->getDictRoot();
|
||||||
|
int pos = nodePos;
|
||||||
|
const PatriciaTrieReadingUtils::NodeFlags flags =
|
||||||
|
PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictRoot, &pos);
|
||||||
|
if (!PatriciaTrieReadingUtils::hasBigrams(flags)) {
|
||||||
|
return NOT_A_DICT_POS;
|
||||||
|
}
|
||||||
|
PatriciaTrieReadingUtils::skipCharacters(dictRoot, flags, MAX_WORD_LENGTH, &pos);
|
||||||
|
if (PatriciaTrieReadingUtils::isTerminal(flags)) {
|
||||||
|
PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(dictRoot, &pos);
|
||||||
|
}
|
||||||
|
if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
|
||||||
|
PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(dictRoot, flags, &pos);
|
||||||
|
}
|
||||||
|
if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
|
||||||
|
BinaryDictionaryTerminalAttributesReadingUtils::skipShortcuts(binaryDictionaryInfo, &pos);
|
||||||
|
}
|
||||||
|
return pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNode, int pos,
|
int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNode,
|
||||||
const BinaryDictionaryInfo *const binaryDictionaryInfo,
|
const int nodePos, const BinaryDictionaryInfo *const binaryDictionaryInfo,
|
||||||
const NodeFilter *const childrenFilter, DicNodeVector *childDicNodes) const {
|
const NodeFilter *const childrenFilter, DicNodeVector *childDicNodes) const {
|
||||||
const int nextPos = pos;
|
const uint8_t *const dictRoot = binaryDictionaryInfo->getDictRoot();
|
||||||
const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(
|
int pos = nodePos;
|
||||||
binaryDictionaryInfo->getDictRoot(), &pos);
|
const PatriciaTrieReadingUtils::NodeFlags flags =
|
||||||
const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
|
PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictRoot, &pos);
|
||||||
const bool isTerminal = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags));
|
|
||||||
const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags);
|
|
||||||
const bool isBlacklistedOrNotAWord = BinaryFormat::hasBlacklistedOrNotAWordFlag(flags);
|
|
||||||
|
|
||||||
int codePoint = BinaryFormat::getCodePointAndForwardPointer(
|
|
||||||
binaryDictionaryInfo->getDictRoot(), &pos);
|
|
||||||
ASSERT(NOT_A_CODE_POINT != codePoint);
|
|
||||||
// TODO: optimize this
|
|
||||||
int mergedNodeCodePoints[MAX_WORD_LENGTH];
|
int mergedNodeCodePoints[MAX_WORD_LENGTH];
|
||||||
uint16_t mergedNodeCodePointCount = 0;
|
const int mergedNodeCodePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition(
|
||||||
mergedNodeCodePoints[mergedNodeCodePointCount++] = codePoint;
|
dictRoot, flags, MAX_WORD_LENGTH, mergedNodeCodePoints, &pos);
|
||||||
|
const int probability = (PatriciaTrieReadingUtils::isTerminal(flags))?
|
||||||
do {
|
PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(dictRoot, &pos)
|
||||||
const int nextCodePoint = hasMultipleChars
|
: NOT_A_PROBABILITY;
|
||||||
? BinaryFormat::getCodePointAndForwardPointer(
|
const int childrenPos = PatriciaTrieReadingUtils::hasChildrenInFlags(flags) ?
|
||||||
binaryDictionaryInfo->getDictRoot(), &pos) : NOT_A_CODE_POINT;
|
PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
|
||||||
const bool isLastChar = (NOT_A_CODE_POINT == nextCodePoint);
|
dictRoot, flags, &pos) : NOT_A_DICT_POS;
|
||||||
if (!isLastChar) {
|
if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
|
||||||
mergedNodeCodePoints[mergedNodeCodePointCount++] = nextCodePoint;
|
BinaryDictionaryTerminalAttributesReadingUtils::skipShortcuts(binaryDictionaryInfo, &pos);
|
||||||
}
|
}
|
||||||
codePoint = nextCodePoint;
|
if (PatriciaTrieReadingUtils::hasBigrams(flags)) {
|
||||||
} while (NOT_A_CODE_POINT != codePoint);
|
BinaryDictionaryTerminalAttributesReadingUtils::skipExistingBigrams(
|
||||||
|
binaryDictionaryInfo, &pos);
|
||||||
const int probability = isTerminal ? BinaryFormat::readProbabilityWithoutMovingPointer(
|
|
||||||
binaryDictionaryInfo->getDictRoot(), pos) : NOT_A_PROBABILITY;
|
|
||||||
pos = BinaryFormat::skipProbability(flags, pos);
|
|
||||||
int childrenPos = hasChildren ? BinaryFormat::readChildrenPosition(
|
|
||||||
binaryDictionaryInfo->getDictRoot(), flags, pos) : NOT_A_DICT_POS;
|
|
||||||
const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(
|
|
||||||
binaryDictionaryInfo->getDictRoot(), flags, pos);
|
|
||||||
|
|
||||||
if (childrenFilter->isFilteredOut(mergedNodeCodePoints[0])) {
|
|
||||||
return siblingPos;
|
|
||||||
}
|
}
|
||||||
childDicNodes->pushLeavingChild(dicNode, nextPos, childrenPos, probability, isTerminal,
|
if (!childrenFilter->isFilteredOut(mergedNodeCodePoints[0])) {
|
||||||
hasChildren, isBlacklistedOrNotAWord, mergedNodeCodePointCount, mergedNodeCodePoints);
|
childDicNodes->pushLeavingChild(dicNode, nodePos, childrenPos, probability,
|
||||||
return siblingPos;
|
PatriciaTrieReadingUtils::isTerminal(flags),
|
||||||
|
PatriciaTrieReadingUtils::hasChildrenInFlags(flags),
|
||||||
|
PatriciaTrieReadingUtils::isBlacklisted(flags) ||
|
||||||
|
PatriciaTrieReadingUtils::isNotAWord(flags),
|
||||||
|
mergedNodeCodePointCount, mergedNodeCodePoints);
|
||||||
|
}
|
||||||
|
return pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
|
|
|
@ -61,7 +61,7 @@ class PatriciaTriePolicy : public DictionaryStructurePolicy {
|
||||||
PatriciaTriePolicy() {}
|
PatriciaTriePolicy() {}
|
||||||
~PatriciaTriePolicy() {}
|
~PatriciaTriePolicy() {}
|
||||||
|
|
||||||
int createAndGetLeavingChildNode(const DicNode *const dicNode, int pos,
|
int createAndGetLeavingChildNode(const DicNode *const dicNode, const int nodePos,
|
||||||
const BinaryDictionaryInfo *const binaryDictionaryInfo,
|
const BinaryDictionaryInfo *const binaryDictionaryInfo,
|
||||||
const NodeFilter *const nodeFilter, DicNodeVector *const childDicNodes) const;
|
const NodeFilter *const nodeFilter, DicNodeVector *const childDicNodes) const;
|
||||||
};
|
};
|
||||||
|
|
|
@ -0,0 +1,67 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2013, The Android Open Source Project
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
|
||||||
|
|
||||||
|
#include "defines.h"
|
||||||
|
#include "suggest/core/dictionary/byte_array_utils.h"
|
||||||
|
|
||||||
|
namespace latinime {
|
||||||
|
|
||||||
|
typedef PatriciaTrieReadingUtils PtReadingUtils;
|
||||||
|
|
||||||
|
const PtReadingUtils::NodeFlags PtReadingUtils::MASK_GROUP_ADDRESS_TYPE = 0xC0;
|
||||||
|
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
|
||||||
|
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
|
||||||
|
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
|
||||||
|
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
|
||||||
|
|
||||||
|
// Flag for single/multiple char group
|
||||||
|
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_MULTIPLE_CHARS = 0x20;
|
||||||
|
// Flag for terminal groups
|
||||||
|
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_TERMINAL = 0x10;
|
||||||
|
// Flag for shortcut targets presence
|
||||||
|
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_SHORTCUT_TARGETS = 0x08;
|
||||||
|
// Flag for bigram presence
|
||||||
|
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_BIGRAMS = 0x04;
|
||||||
|
// Flag for non-words (typically, shortcut only entries)
|
||||||
|
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_NOT_A_WORD = 0x02;
|
||||||
|
// Flag for blacklist
|
||||||
|
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
|
||||||
|
|
||||||
|
/* static */ int PtReadingUtils::readChildrenPositionAndAdvancePosition(
|
||||||
|
const uint8_t *const buffer, const NodeFlags flags, int *const pos) {
|
||||||
|
const int base = *pos;
|
||||||
|
int offset = 0;
|
||||||
|
switch (MASK_GROUP_ADDRESS_TYPE & flags) {
|
||||||
|
case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
|
||||||
|
offset = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
|
||||||
|
break;
|
||||||
|
case FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
|
||||||
|
offset = ByteArrayUtils::readUint16AndAdvancePosition(buffer, pos);
|
||||||
|
break;
|
||||||
|
case FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
|
||||||
|
offset = ByteArrayUtils::readUint24AndAdvancePosition(buffer, pos);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
// If we come here, it means we asked for the children of a word with
|
||||||
|
// no children.
|
||||||
|
return NOT_A_DICT_POS;
|
||||||
|
}
|
||||||
|
return base + offset;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace latinime
|
|
@ -0,0 +1,139 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2013, The Android Open Source Project
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef LATINIME_PATRICIA_TRIE_READING_UTILS_H
|
||||||
|
#define LATINIME_PATRICIA_TRIE_READING_UTILS_H
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#include "defines.h"
|
||||||
|
#include "suggest/core/dictionary/byte_array_utils.h"
|
||||||
|
|
||||||
|
namespace latinime {
|
||||||
|
|
||||||
|
class PatriciaTrieReadingUtils {
|
||||||
|
public:
|
||||||
|
typedef uint8_t NodeFlags;
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE int getGroupCountAndAdvancePosition(
|
||||||
|
const uint8_t *const buffer, int *const pos) {
|
||||||
|
const uint8_t firstByte = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
|
||||||
|
if (firstByte < 0x80) {
|
||||||
|
return firstByte;
|
||||||
|
} else {
|
||||||
|
return ((firstByte & 0x7F) << 8) ^ ByteArrayUtils::readUint8AndAdvancePosition(
|
||||||
|
buffer, pos);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE NodeFlags getFlagsAndAdvancePosition(const uint8_t *const buffer,
|
||||||
|
int *const pos) {
|
||||||
|
return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE int getCodePointAndAdvancePosition(const uint8_t *const buffer,
|
||||||
|
int *const pos) {
|
||||||
|
return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the number of read characters.
|
||||||
|
static AK_FORCE_INLINE int getCharsAndAdvancePosition(const uint8_t *const buffer,
|
||||||
|
const NodeFlags flags, const int maxLength, int *const outBuffer, int *const pos) {
|
||||||
|
int length = 0;
|
||||||
|
if (hasMultipleChars(flags)) {
|
||||||
|
length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, outBuffer,
|
||||||
|
pos);
|
||||||
|
} else {
|
||||||
|
if (maxLength > 0) {
|
||||||
|
outBuffer[0] = getCodePointAndAdvancePosition(buffer, pos);
|
||||||
|
length = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return length;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the number of skipped characters.
|
||||||
|
static AK_FORCE_INLINE int skipCharacters(const uint8_t *const buffer, const NodeFlags flags,
|
||||||
|
const int maxLength, int *const pos) {
|
||||||
|
if (hasMultipleChars(flags)) {
|
||||||
|
return ByteArrayUtils::advancePositionToBehindString(buffer, maxLength, pos);
|
||||||
|
} else {
|
||||||
|
if (maxLength > 0) {
|
||||||
|
getCodePointAndAdvancePosition(buffer, pos);
|
||||||
|
return 1;
|
||||||
|
} else {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE int readProbabilityAndAdvancePosition(const uint8_t *const buffer,
|
||||||
|
int *const pos) {
|
||||||
|
return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int readChildrenPositionAndAdvancePosition(const uint8_t *const buffer,
|
||||||
|
const NodeFlags flags, int *const pos);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Node Flags
|
||||||
|
*/
|
||||||
|
static AK_FORCE_INLINE bool isBlacklisted(const NodeFlags flags) {
|
||||||
|
return (flags & FLAG_IS_BLACKLISTED) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE bool isNotAWord(const NodeFlags flags) {
|
||||||
|
return (flags & FLAG_IS_NOT_A_WORD) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE bool isTerminal(const NodeFlags flags) {
|
||||||
|
return (flags & FLAG_IS_TERMINAL) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE bool hasShortcutTargets(const NodeFlags flags) {
|
||||||
|
return (flags & FLAG_HAS_SHORTCUT_TARGETS) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE bool hasBigrams(const NodeFlags flags) {
|
||||||
|
return (flags & FLAG_HAS_BIGRAMS) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE bool hasMultipleChars(const NodeFlags flags) {
|
||||||
|
return (flags & FLAG_HAS_MULTIPLE_CHARS) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static AK_FORCE_INLINE bool hasChildrenInFlags(const NodeFlags flags) {
|
||||||
|
return FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != (MASK_GROUP_ADDRESS_TYPE & flags);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTrieReadingUtils);
|
||||||
|
|
||||||
|
static const NodeFlags MASK_GROUP_ADDRESS_TYPE;
|
||||||
|
static const NodeFlags FLAG_GROUP_ADDRESS_TYPE_NOADDRESS;
|
||||||
|
static const NodeFlags FLAG_GROUP_ADDRESS_TYPE_ONEBYTE;
|
||||||
|
static const NodeFlags FLAG_GROUP_ADDRESS_TYPE_TWOBYTES;
|
||||||
|
static const NodeFlags FLAG_GROUP_ADDRESS_TYPE_THREEBYTES;
|
||||||
|
|
||||||
|
static const NodeFlags FLAG_HAS_MULTIPLE_CHARS;
|
||||||
|
static const NodeFlags FLAG_IS_TERMINAL;
|
||||||
|
static const NodeFlags FLAG_HAS_SHORTCUT_TARGETS;
|
||||||
|
static const NodeFlags FLAG_HAS_BIGRAMS;
|
||||||
|
static const NodeFlags FLAG_IS_NOT_A_WORD;
|
||||||
|
static const NodeFlags FLAG_IS_BLACKLISTED;
|
||||||
|
};
|
||||||
|
} // namespace latinime
|
||||||
|
#endif /* LATINIME_PATRICIA_TRIE_READING_UTILS_H */
|
Loading…
Reference in a new issue