Merge "Introduce DynamicPatriciaTrieReadingHelper."

This commit is contained in:
Keisuke Kuroyanagi 2013-09-02 02:05:32 +00:00 committed by Android (Google) Code Review
commit be76dbfef9
5 changed files with 350 additions and 127 deletions

View file

@ -74,6 +74,7 @@ LATIN_IME_CORE_SRC_FILES := \
dictionary_structure_with_buffer_policy_factory.cpp \
dynamic_patricia_trie_node_reader.cpp \
dynamic_patricia_trie_policy.cpp \
dynamic_patricia_trie_reading_helper.cpp \
dynamic_patricia_trie_reading_utils.cpp \
patricia_trie_policy.cpp \
patricia_trie_reading_utils.cpp) \

View file

@ -20,95 +20,68 @@
#include "suggest/core/dicnode/dic_node.h"
#include "suggest/core/dicnode/dic_node_vector.h"
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h"
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h"
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
namespace latinime {
// To avoid infinite loop caused by invalid or malicious forward links.
const int DynamicPatriciaTriePolicy::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 100000;
void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode,
DicNodeVector *const childDicNodes) const {
if (!dicNode->hasChildren()) {
return;
}
DynamicPatriciaTrieNodeReader nodeReader(mDictRoot, mOriginalDictSize, &mExtendableBuffer,
getBigramsStructurePolicy(), getShortcutsStructurePolicy());
int mergedNodeCodePoints[MAX_WORD_LENGTH];
int nextPos = dicNode->getChildrenPos();
int totalChildCount = 0;
do {
const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
mDictRoot, &nextPos);
totalChildCount += childCount;
if (childCount <= 0 || totalChildCount > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP) {
// Invalid dictionary.
AKLOGI("Invalid dictionary. childCount: %d, totalChildCount: %d, MAX: %d",
childCount, totalChildCount, MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP);
ASSERT(false);
return;
}
for (int i = 0; i < childCount; i++) {
nodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(nextPos, MAX_WORD_LENGTH,
mergedNodeCodePoints);
if (!nodeReader.isDeleted()) {
// Push child node when the node is not a deleted node.
childDicNodes->pushLeavingChild(dicNode, nodeReader.getNodePos(),
nodeReader.getChildrenPos(), nodeReader.getProbability(),
nodeReader.isTerminal(), nodeReader.hasChildren(),
nodeReader.isBlacklisted() || nodeReader.isNotAWord(),
nodeReader.getCodePointCount(), mergedNodeCodePoints);
}
nextPos = nodeReader.getSiblingNodePos();
}
nextPos = DynamicPatriciaTrieReadingUtils::getForwardLinkPosition(mDictRoot, nextPos);
} while (DynamicPatriciaTrieReadingUtils::isValidForwardLinkPosition(nextPos));
DynamicPatriciaTrieReadingHelper readingHelper(mDictRoot, mOriginalDictSize,
&mExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy());
readingHelper.initWithNodeArrayPos(dicNode->getChildrenPos());
const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader();
while (!readingHelper.isEnd()) {
childDicNodes->pushLeavingChild(dicNode, nodeReader->getNodePos(),
nodeReader->getChildrenPos(), nodeReader->getProbability(),
nodeReader->isTerminal() && !nodeReader->isDeleted(),
nodeReader->hasChildren(), nodeReader->isBlacklisted() || nodeReader->isNotAWord(),
nodeReader->getCodePointCount(), readingHelper.getMergedNodeCodePoints());
readingHelper.readNextSiblingNode();
}
}
int DynamicPatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
const int nodePos, const int maxCodePointCount, int *const outCodePoints,
int *const outUnigramProbability) const {
if (nodePos == NOT_A_VALID_WORD_POS) {
*outUnigramProbability = NOT_A_PROBABILITY;
return 0;
}
// This method traverses parent nodes from the terminal by following parent pointers; thus,
// node code points are stored in the buffer in the reverse order.
int reverseCodePoints[maxCodePointCount];
int mergedNodeCodePoints[maxCodePointCount];
int codePointCount = 0;
DynamicPatriciaTrieNodeReader nodeReader(mDictRoot, mOriginalDictSize, &mExtendableBuffer,
getBigramsStructurePolicy(), getShortcutsStructurePolicy());
// First, read terminal node and get its probability.
nodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(nodePos, maxCodePointCount,
mergedNodeCodePoints);
// Store terminal node probability.
*outUnigramProbability = nodeReader.getProbability();
// Store terminal node code points to buffer in the reverse order.
for (int i = nodeReader.getCodePointCount() - 1; i >= 0; --i) {
reverseCodePoints[codePointCount++] = mergedNodeCodePoints[i];
DynamicPatriciaTrieReadingHelper readingHelper(mDictRoot, mOriginalDictSize,
&mExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy());
// First, read the terminal node and get its probability.
readingHelper.initWithNodePos(nodePos);
if (!readingHelper.isValidTerminalNode()) {
// Node at the nodePos is not a valid terminal node.
*outUnigramProbability = NOT_A_PROBABILITY;
return 0;
}
// Then, follow parent pos toward the root node.
while (nodeReader.getParentPos() != NOT_A_DICT_POS) {
// codePointCount must be incremented at least once in each iteration to ensure preventing
// infinite loop.
if (nodeReader.isDeleted() || codePointCount > maxCodePointCount
|| nodeReader.getCodePointCount() <= 0) {
// Store terminal node probability.
*outUnigramProbability = readingHelper.getNodeReader()->getProbability();
// Then, following parent node link to the dictionary root and fetch node code points.
while (!readingHelper.isEnd()) {
if (readingHelper.getTotalCodePointCount() > maxCodePointCount) {
// The nodePos is not a valid terminal node position in the dictionary.
*outUnigramProbability = NOT_A_PROBABILITY;
return 0;
}
// Read parent node.
nodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(nodeReader.getParentPos(),
maxCodePointCount, mergedNodeCodePoints);
// Store node code points to buffer in the reverse order.
for (int i = nodeReader.getCodePointCount() - 1; i >= 0; --i) {
reverseCodePoints[codePointCount++] = mergedNodeCodePoints[i];
}
readingHelper.fetchMergedNodeCodePointsInReverseOrder(
readingHelper.getPrevTotalCodePointCount(), reverseCodePoints);
// Follow parent node toward the root node.
readingHelper.readParentNode();
}
if (readingHelper.isError()) {
// The node position or the dictionary is invalid.
*outUnigramProbability = NOT_A_PROBABILITY;
return 0;
}
// Reverse the stored code points to output them.
const int codePointCount = readingHelper.getTotalCodePointCount();
for (int i = 0; i < codePointCount; ++i) {
outCodePoints[i] = reverseCodePoints[codePointCount - i - 1];
}
@ -121,73 +94,39 @@ int DynamicPatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const in
for (int i = 0; i < length; ++i) {
searchCodePoints[i] = forceLowerCaseSearch ? CharUtils::toLowerCase(inWord[i]) : inWord[i];
}
int mergedNodeCodePoints[MAX_WORD_LENGTH];
int currentLength = 0;
int pos = getRootPosition();
DynamicPatriciaTrieNodeReader nodeReader(mDictRoot, mOriginalDictSize, &mExtendableBuffer,
getBigramsStructurePolicy(), getShortcutsStructurePolicy());
while (currentLength < length) {
// When foundMatchedNode becomes true, currentLength is increased at least once.
bool foundMatchedNode = false;
int totalChildCount = 0;
do {
const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
mDictRoot, &pos);
totalChildCount += childCount;
if (childCount <= 0 || totalChildCount > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP) {
// Invalid dictionary.
AKLOGI("Invalid dictionary. childCount: %d, totalChildCount: %d, MAX: %d",
childCount, totalChildCount, MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP);
ASSERT(false);
DynamicPatriciaTrieReadingHelper readingHelper(mDictRoot, mOriginalDictSize,
&mExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy());
readingHelper.initWithNodeArrayPos(getRootPosition());
const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader();
while (!readingHelper.isEnd()) {
const int matchedCodePointCount = readingHelper.getPrevTotalCodePointCount();
if (readingHelper.getTotalCodePointCount() > length
|| !readingHelper.isMatchedCodePoint(0 /* index */,
searchCodePoints[matchedCodePointCount])) {
// Current node has too many code points or its first code point is different from
// target code point. Skip this node and read the next sibling node.
readingHelper.readNextSiblingNode();
continue;
}
// Check following merged node code points.
const int nodeCodePointCount = nodeReader->getCodePointCount();
for (int j = 1; j < nodeCodePointCount; ++j) {
if (!readingHelper.isMatchedCodePoint(
j, searchCodePoints[matchedCodePointCount + j])) {
// Different code point is found. The given word is not included in the dictionary.
return NOT_A_VALID_WORD_POS;
}
for (int i = 0; i < childCount; i++) {
nodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(pos, MAX_WORD_LENGTH,
mergedNodeCodePoints);
const int nodeCodePointCount = nodeReader.getCodePointCount();
if (nodeReader.isDeleted() || nodeCodePointCount <= 0
|| currentLength + nodeCodePointCount > length) {
// Skip deleted or empty node.
pos = nodeReader.getSiblingNodePos();
continue;
}
bool matched = true;
for (int j = 0; j < nodeCodePointCount; ++j) {
if (mergedNodeCodePoints[j] != searchCodePoints[currentLength + j]) {
// Different code point is found.
matched = false;
break;
}
}
if (matched) {
currentLength += nodeCodePointCount;
if (length == currentLength) {
// Terminal position is found.
return nodeReader.getNodePos();
}
if (!nodeReader.hasChildren()) {
return NOT_A_VALID_WORD_POS;
}
foundMatchedNode = true;
// Advance to the children nodes.
pos = nodeReader.getChildrenPos();
break;
}
// Try next sibling node.
pos = nodeReader.getSiblingNodePos();
}
if (foundMatchedNode) {
break;
}
// If the matched node is not found in the current PtNode array, try to follow the
// forward link.
pos = DynamicPatriciaTrieReadingUtils::getForwardLinkPosition(
mDictRoot, pos);
} while (DynamicPatriciaTrieReadingUtils::isValidForwardLinkPosition(pos));
if (!foundMatchedNode) {
// Matched node is not found.
}
// All characters are matched.
if (length == readingHelper.getTotalCodePointCount()) {
// Terminal position is found.
return nodeReader->getNodePos();
}
if (!nodeReader->hasChildren()) {
return NOT_A_VALID_WORD_POS;
}
// Advance to the children nodes.
readingHelper.readChildNode();
}
// If we already traversed the tree further than the word is long, there means
// there was no match (or we would have found it).

View file

@ -87,7 +87,6 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTriePolicy);
static const int MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP;
const MmappedBuffer *const mBuffer;
const ExtendableBuffer mExtendableBuffer;

View file

@ -0,0 +1,83 @@
/*
* Copyright (C) 2013, The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h"
namespace latinime {
// To avoid infinite loop caused by invalid or malicious forward links.
const int DynamicPatriciaTrieReadingHelper::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 100000;
const int DynamicPatriciaTrieReadingHelper::MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP = 100000;
// Read node array size and process empty node arrays. Nodes and arrays are counted up in this
// method to avoid an infinite loop.
void DynamicPatriciaTrieReadingHelper::nextNodeArray() {
const bool usesAdditionalBuffer = mPos >= mOriginalDictSize;
const uint8_t *const dictBuf = (usesAdditionalBuffer)
? mExtendableBuffer->getBuffer() : mDictRoot;
if (usesAdditionalBuffer) {
mPos -= mOriginalDictSize;
}
mNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(dictBuf,
&mPos);
if (usesAdditionalBuffer) {
mPos += mOriginalDictSize;
}
// Count up nodes and node arrays to avoid infinite loop.
mTotalNodeCount += mNodeCount;
mNodeArrayCount++;
if (mNodeCount < 0 || mTotalNodeCount > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP
|| mNodeArrayCount > MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP) {
// Invalid dictionary.
AKLOGI("Invalid dictionary. nodeCount: %d, totalNodeCount: %d, MAX_CHILD_COUNT: %d"
"nodeArrayCount: %d, MAX_NODE_ARRAY_COUNT: %d",
mNodeCount, mTotalNodeCount, MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP,
mNodeArrayCount, MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP);
ASSERT(false);
mIsError = true;
mPos = NOT_A_DICT_POS;
return;
}
if (mNodeCount == 0) {
// Empty node array. Try following forward link.
followForwardLink();
}
}
// Follow the forward link and read the next node array if exists.
void DynamicPatriciaTrieReadingHelper::followForwardLink() {
const bool usesAdditionalBuffer = mPos >= mOriginalDictSize;
const uint8_t *const dictBuf = (usesAdditionalBuffer)
? mExtendableBuffer->getBuffer() : mDictRoot;
if (usesAdditionalBuffer) {
mPos -= mOriginalDictSize;
}
const int forwardLinkPosition =
DynamicPatriciaTrieReadingUtils::getForwardLinkPosition(dictBuf, mPos);
if (usesAdditionalBuffer) {
mPos += mOriginalDictSize;
}
if (DynamicPatriciaTrieReadingUtils::isValidForwardLinkPosition(forwardLinkPosition)) {
// Follow the forward link.
mPos = forwardLinkPosition;
nextNodeArray();
} else {
// All node arrays have been read.
mPos = NOT_A_DICT_POS;
}
}
} // namespace latinime

View file

@ -0,0 +1,201 @@
/*
* Copyright (C) 2013, The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H
#define LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H
#include "defines.h"
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h"
#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/utils/extendable_buffer.h"
namespace latinime {
/*
* This class is used for traversing dynamic patricia trie. This class supports iterating nodes and
* dealing with additional buffer. This class counts nodes and node arrays to avoid infinite loop.
*/
class DynamicPatriciaTrieReadingHelper {
public:
DynamicPatriciaTrieReadingHelper(const uint8_t *const dictRoot, const int originalDictSize,
const ExtendableBuffer *const extendableBuffer,
const DictionaryBigramsStructurePolicy *const bigramsPolicy,
const DictionaryShortcutsStructurePolicy *const shortcutsPolicy)
: mIsError(false), mPos(NOT_A_DICT_POS), mNodeCount(0), mPrevTotalCodePointCount(0),
mTotalNodeCount(0), mNodeArrayCount(0), mDictRoot(dictRoot),
mOriginalDictSize(originalDictSize), mExtendableBuffer(extendableBuffer),
mNodeReader(mDictRoot, mOriginalDictSize, mExtendableBuffer, bigramsPolicy,
shortcutsPolicy) {}
~DynamicPatriciaTrieReadingHelper() {}
AK_FORCE_INLINE bool isError() const {
return mIsError;
}
AK_FORCE_INLINE bool isEnd() const {
return mPos == NOT_A_DICT_POS;
}
// Initialize reading state with the head position of a node array.
AK_FORCE_INLINE void initWithNodeArrayPos(const int nodeArrayPos) {
if (nodeArrayPos == NOT_A_DICT_POS) {
mPos = NOT_A_DICT_POS;
} else {
mIsError = false;
mPos = nodeArrayPos;
mNodeCount = 0;
mPrevTotalCodePointCount = 0;
mTotalNodeCount = 0;
mNodeArrayCount = 0;
nextNodeArray();
if (!isEnd()) {
fetchNodeInfo();
}
}
}
// Initialize reading state with the head position of a node.
AK_FORCE_INLINE void initWithNodePos(const int nodePos) {
// TODO: Consolidate NOT_A_VALID_WORD_POS and NOT_A_DICT_POS
if (nodePos == NOT_A_VALID_WORD_POS || nodePos == NOT_A_DICT_POS) {
mPos = NOT_A_DICT_POS;
} else {
mIsError = false;
mPos = nodePos;
mNodeCount = 1;
mPrevTotalCodePointCount = 0;
mTotalNodeCount = 1;
mNodeArrayCount = 1;
fetchNodeInfo();
}
}
AK_FORCE_INLINE const DynamicPatriciaTrieNodeReader* getNodeReader() const {
return &mNodeReader;
}
AK_FORCE_INLINE bool isValidTerminalNode() const {
return !isEnd() && !mNodeReader.isDeleted() && mNodeReader.isTerminal();
}
AK_FORCE_INLINE bool isMatchedCodePoint(const int index, const int codePoint) const {
return mMergedNodeCodePoints[index] == codePoint;
}
// Return code point count exclude the last read node's code points.
AK_FORCE_INLINE int getPrevTotalCodePointCount() const {
return mPrevTotalCodePointCount;
}
// Return code point count include the last read node's code points.
AK_FORCE_INLINE int getTotalCodePointCount() const {
return mPrevTotalCodePointCount + mNodeReader.getCodePointCount();
}
AK_FORCE_INLINE void fetchMergedNodeCodePointsInReverseOrder(
const int index, int *const outCodePoints) const {
const int nodeCodePointCount = mNodeReader.getCodePointCount();
for (int i = 0; i < nodeCodePointCount; ++i) {
outCodePoints[index + i] = mMergedNodeCodePoints[nodeCodePointCount - 1 - i];
}
}
AK_FORCE_INLINE const int *getMergedNodeCodePoints() const {
return mMergedNodeCodePoints;
}
AK_FORCE_INLINE void readNextSiblingNode() {
mNodeCount -= 1;
mPos = mNodeReader.getSiblingNodePos();
if (mNodeCount <= 0) {
// All nodes in the current node array have been read.
followForwardLink();
if (!isEnd()) {
fetchNodeInfo();
}
} else {
fetchNodeInfo();
}
}
// Read the first child node of the current node.
AK_FORCE_INLINE void readChildNode() {
if (mNodeReader.hasChildren()) {
mPrevTotalCodePointCount += mNodeReader.getCodePointCount();
mTotalNodeCount = 0;
mNodeArrayCount = 0;
mPos = mNodeReader.getChildrenPos();
// Read children node array.
nextNodeArray();
if (!isEnd()) {
fetchNodeInfo();
}
} else {
mPos = NOT_A_DICT_POS;
}
}
// Read the parent node of the current node.
AK_FORCE_INLINE void readParentNode() {
if (mNodeReader.getParentPos() != NOT_A_DICT_POS) {
mPrevTotalCodePointCount += mNodeReader.getCodePointCount();
mTotalNodeCount = 1;
mNodeArrayCount = 1;
mNodeCount = 1;
mPos = mNodeReader.getParentPos();
fetchNodeInfo();
} else {
mPos = NOT_A_DICT_POS;
}
}
private:
DISALLOW_COPY_AND_ASSIGN(DynamicPatriciaTrieReadingHelper);
static const int MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP;
static const int MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP;
bool mIsError;
int mPos;
// Node count of a node array.
int mNodeCount;
int mPrevTotalCodePointCount;
int mTotalNodeCount;
int mNodeArrayCount;
const uint8_t *const mDictRoot;
const int mOriginalDictSize;
const ExtendableBuffer *const mExtendableBuffer;
DynamicPatriciaTrieNodeReader mNodeReader;
int mMergedNodeCodePoints[MAX_WORD_LENGTH];
void nextNodeArray();
void followForwardLink();
AK_FORCE_INLINE void fetchNodeInfo() {
mNodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(mPos, MAX_WORD_LENGTH,
mMergedNodeCodePoints);
if (mNodeReader.getCodePointCount() <= 0) {
// Empty node is not allowed.
mIsError = true;
mPos = NOT_A_DICT_POS;
}
}
};
} // namespace latinime
#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H */