// LatinIME/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp
//
// 322 lines
// 13 KiB
// C++

/*
* Copyright (C) 2013, The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
#include "dictionary/structure/pt_common/pt_node_array_reader.h"
#include "utils/char_utils.h"
namespace latinime {
// To avoid infinite loop caused by invalid or malicious forward links.
// Upper bound on the accumulated PtNode count of one chain of linked PtNode arrays; exceeding
// it makes nextPtNodeArray() flag the dictionary as invalid.
const int DynamicPtReadingHelper::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 100000;
// Upper bound on the number of PtNode arrays read along one chain; exceeding it makes
// nextPtNodeArray() flag the dictionary as invalid.
const int DynamicPtReadingHelper::MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP = 100000;
// Size limit for the reading state stack, tied to MAX_WORD_LENGTH.
// NOTE(review): the code that enforces this limit is not in this file (presumably in the
// header, next to pushReadingStateToStack()) - confirm.
const size_t DynamicPtReadingHelper::MAX_READING_STATE_STACK_SIZE = MAX_WORD_LENGTH;
// Listener callback invoked for each visited PtNode. Records the head position of every PtNode
// that is a terminal and is not marked as deleted. Always returns true, so this policy never
// asks the traversal to stop.
bool DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions::onVisitingPtNode(
        const PtNodeParams *const ptNodeParams) {
    // isDeleted() is only consulted for terminals, exactly as a short-circuit && would do.
    const bool isLiveTerminal = ptNodeParams->isTerminal() && !ptNodeParams->isDeleted();
    if (isLiveTerminal) {
        const int terminalHeadPos = ptNodeParams->getHeadPos();
        mTerminalPositions->push_back(terminalHeadPos);
    }
    return true;
}
// Walks every PtNode reachable from the current reading position in post-order, depth first:
// the children of a PtNode are always reported before the PtNode itself.
// With the dictionary below, the reporting order is c -> b -> y -> x -> a:
//   a _ b _ c
//    \ x _ y
//
// Listener notifications:
//   onDescend(pos)             - before entering a PtNode array (the root array included).
//   onVisitingPtNode(params)   - for each PtNode, after its children have been handled.
//   onReadingPtNodeArrayTail() - when no sibling is left in the current linked PtNode arrays.
//   onAscend()                 - when going back to the parent (and finally to the root).
// The traversal stops and returns false as soon as any callback returns false. Otherwise the
// result is !isError().
bool DynamicPtReadingHelper::traverseAllPtNodesInPostorderDepthFirstManner(
        TraversingEventListener *const listener) {
    // Entering the root PtNode array counts as the first descent.
    if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) {
        return false;
    }
    // True once everything below the current PtNode has been reported.
    bool subtreeDone = false;
    while (!isEnd()) {
        const PtNodeParams currentNode(getPtNodeParams());
        if (!currentNode.isValid()) {
            break;
        }
        if (!subtreeDone) {
            if (!currentNode.hasChildren()) {
                // Leaf: nothing below it, so it gets reported on the next iteration.
                subtreeDone = true;
                continue;
            }
            // Go down to the first child, saving the state needed to come back here.
            if (!listener->onDescend(currentNode.getChildrenPos())) {
                return false;
            }
            pushReadingStateToStack();
            readChildNode(currentNode);
            continue;
        }
        // Everything below this PtNode is done: report it, then step to its sibling.
        if (!listener->onVisitingPtNode(&currentNode)) {
            return false;
        }
        readNextSiblingNode(currentNode);
        if (!isEnd()) {
            // A sibling exists and its subtree has not been walked yet.
            subtreeDone = false;
            continue;
        }
        // No sibling left in the current linked PtNode arrays.
        if (!listener->onReadingPtNodeArrayTail()) {
            return false;
        }
        if (mReadingStateStack.size() == 0) {
            // Nothing to return to: this was the root PtNode array.
            break;
        }
        // Go back up to the parent, which can now be reported itself.
        if (!listener->onAscend()) {
            return false;
        }
        popReadingStateFromStack();
        subtreeDone = true;
    }
    // Leaving the root PtNode array mirrors the initial descent.
    if (!listener->onAscend()) {
        return false;
    }
    return !isError();
}
// Visits all PtNodes in PtNode array level pre-order depth first manner, which is the same order
// that PtNodes are written in the dictionary buffer.
// For example, visits a -> b -> x -> c -> y for the following dictionary:
//   a _ b _ c
//    \ x _ y
//
// Each PtNode array is walked twice: a first pass reports every PtNode of the array, then a
// second pass over the same array descends into the children of each PtNode in turn.
//
// Listener notifications:
//   onDescend(pos)             - before entering a PtNode array (the root array included).
//   onVisitingPtNode(params)   - for each PtNode during the first pass over its array.
//   onReadingPtNodeArrayTail() - when the first pass runs out of PtNodes (also sent once for
//                                an empty dictionary).
//   onAscend()                 - exactly once for every onDescend() on a completed traversal.
// The traversal stops and returns false as soon as any callback returns false. Otherwise the
// result is !isError().
bool DynamicPtReadingHelper::traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
        TraversingEventListener *const listener) {
    // Pass indicator. false: first pass, reporting the PtNodes of the current array.
    // true: second pass over the same array, descending into children.
    bool alreadyVisitedAllPtNodesInArray = false;
    // Only consulted during the second pass: true once the children of the current PtNode have
    // been handled (or it has none), so the walk may move on to the next sibling.
    bool alreadyVisitedChildren = false;
    // Descend from the root to the root PtNode array.
    if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) {
        return false;
    }
    if (isEnd()) {
        // Empty dictionary. Needs to notify the listener of the tail of empty PtNode array.
        if (!listener->onReadingPtNodeArrayTail()) {
            return false;
        }
    }
    // Save the state at the head of the root PtNode array; the first pass pops back to it.
    pushReadingStateToStack();
    while (!isEnd()) {
        const PtNodeParams ptNodeParams(getPtNodeParams());
        if (!ptNodeParams.isValid()) {
            break;
        }
        if (alreadyVisitedAllPtNodesInArray) {
            if (alreadyVisitedChildren) {
                // Move to next sibling PtNode's children.
                readNextSiblingNode(ptNodeParams);
                if (isEnd()) {
                    if (mReadingStateStack.size() <= 0) {
                        // Tail of the root PtNode array. The matching onAscend() is sent once,
                        // after the loop. Notifying here as well would report the root ascend
                        // twice, which the empty-dictionary path and the post-order traversal
                        // never do.
                        break;
                    }
                    // Return to the parent PtNode.
                    if (!listener->onAscend()) {
                        return false;
                    }
                    popReadingStateFromStack();
                    alreadyVisitedChildren = true;
                    alreadyVisitedAllPtNodesInArray = true;
                } else {
                    alreadyVisitedChildren = false;
                }
            } else {
                if (ptNodeParams.hasChildren()) {
                    // Move to the first child.
                    if (!listener->onDescend(ptNodeParams.getChildrenPos())) {
                        return false;
                    }
                    // Push state to come back to this PtNode once its children are done.
                    pushReadingStateToStack();
                    readChildNode(ptNodeParams);
                    // Push state to return the head of PtNode array.
                    pushReadingStateToStack();
                    alreadyVisitedAllPtNodesInArray = false;
                    alreadyVisitedChildren = false;
                } else {
                    alreadyVisitedChildren = true;
                }
            }
        } else {
            if (!listener->onVisitingPtNode(&ptNodeParams)) {
                return false;
            }
            readNextSiblingNode(ptNodeParams);
            if (isEnd()) {
                if (!listener->onReadingPtNodeArrayTail()) {
                    return false;
                }
                // Return to the head of current PtNode array.
                popReadingStateFromStack();
                alreadyVisitedAllPtNodesInArray = true;
            }
        }
    }
    // Counterpart of the push made before the loop (it is still on the stack when the loop
    // never ran or stopped early on an invalid PtNode).
    popReadingStateFromStack();
    // Ascend from the root PtNode array to the root.
    if (!listener->onAscend()) {
        return false;
    }
    return !isError();
}
// Reconstructs the word that ends at the PtNode the helper currently points to.
//
// maxCodePointCount: capacity of outCodePoints, in code points.
// outCodePoints:     receives the code points of the word in reading order.
// Returns the number of code points written, or 0 when maxCodePointCount is not positive, the
// current PtNode is not a valid terminal, the word does not fit in maxCodePointCount, an
// invalid PtNode is met on the way to the root, or the helper ends up in the error state.
int DynamicPtReadingHelper::getCodePointsAndReturnCodePointCount(const int maxCodePointCount,
        int *const outCodePoints) {
    if (maxCodePointCount <= 0) {
        // Nothing can be output. Returning here also keeps the scratch array below from being
        // declared with a zero or negative size, which is undefined.
        return 0;
    }
    // This method traverses parent nodes from the terminal by following parent pointers; thus,
    // node code points are stored in the buffer in the reverse order.
    int reverseCodePoints[maxCodePointCount];
    const PtNodeParams terminalPtNodeParams(getPtNodeParams());
    // First, make sure the starting node is a valid terminal.
    if (!isValidTerminalNode(terminalPtNodeParams)) {
        // Node at the ptNodePos is not a valid terminal node.
        return 0;
    }
    // Then, following parent node link to the dictionary root and fetch node code points.
    int totalCodePointCount = 0;
    while (!isEnd()) {
        const PtNodeParams ptNodeParams(getPtNodeParams());
        totalCodePointCount = getTotalCodePointCount(ptNodeParams);
        if (!ptNodeParams.isValid() || totalCodePointCount > maxCodePointCount) {
            // The ptNodePos is not a valid terminal node position in the dictionary.
            return 0;
        }
        // Store node code points to buffer in the reverse order.
        fetchMergedNodeCodePointsInReverseOrder(ptNodeParams, getPrevTotalCodePointCount(),
                reverseCodePoints);
        // Follow parent node toward the root node.
        readParentNode(ptNodeParams);
    }
    if (isError()) {
        // The node position or the dictionary is invalid.
        return 0;
    }
    // Reverse the stored code points to output them.
    for (int i = 0; i < totalCodePointCount; ++i) {
        outCodePoints[i] = reverseCodePoints[totalCodePointCount - i - 1];
    }
    return totalCodePointCount;
}
// Searches for a word starting from the helper's current reading position.
//
// inWord:               code points of the word to look up.
// length:               number of code points in inWord.
// forceLowerCaseSearch: when true, every code point of inWord is converted with
//                       CharUtils::toLowerCase() before matching.
// Returns the head position of the terminal PtNode that spells the word, or NOT_A_DICT_POS when
// the word is empty, is not found, ends on a non-terminal PtNode, or an invalid PtNode is met.
int DynamicPtReadingHelper::getTerminalPtNodePositionOfWord(const int *const inWord,
        const size_t length, const bool forceLowerCaseSearch) {
    if (length == 0) {
        // An empty word cannot match any PtNode. Returning here also avoids declaring a
        // zero-length array below and indexing it at [0].
        return NOT_A_DICT_POS;
    }
    int searchCodePoints[length];
    for (size_t i = 0; i < length; ++i) {
        searchCodePoints[i] = forceLowerCaseSearch ? CharUtils::toLowerCase(inWord[i]) : inWord[i];
    }
    while (!isEnd()) {
        const PtNodeParams ptNodeParams(getPtNodeParams());
        if (!ptNodeParams.isValid()) {
            // Same policy as the other loops in this file: stop on an invalid PtNode instead of
            // matching against it.
            break;
        }
        const int matchedCodePointCount = getPrevTotalCodePointCount();
        if (getTotalCodePointCount(ptNodeParams) > length
                || !isMatchedCodePoint(ptNodeParams, 0 /* index */,
                        searchCodePoints[matchedCodePointCount])) {
            // Current node has too many code points or its first code point is different from
            // target code point. Skip this node and read the next sibling node.
            readNextSiblingNode(ptNodeParams);
            continue;
        }
        // Check following merged node code points. The length check above guarantees that
        // matchedCodePointCount + j stays inside searchCodePoints.
        const int nodeCodePointCount = ptNodeParams.getCodePointCount();
        for (int j = 1; j < nodeCodePointCount; ++j) {
            if (!isMatchedCodePoint(ptNodeParams, j, searchCodePoints[matchedCodePointCount + j])) {
                // Different code point is found. The given word is not included in the dictionary.
                return NOT_A_DICT_POS;
            }
        }
        // All characters are matched.
        if (length == getTotalCodePointCount(ptNodeParams)) {
            if (!ptNodeParams.isTerminal()) {
                return NOT_A_DICT_POS;
            }
            // Terminal position is found.
            return ptNodeParams.getHeadPos();
        }
        if (!ptNodeParams.hasChildren()) {
            return NOT_A_DICT_POS;
        }
        // Advance to the children nodes.
        readChildNode(ptNodeParams);
    }
    // Every candidate was exhausted (or an invalid PtNode was met) without reaching a terminal
    // that spells the whole word, so there is no match.
    return NOT_A_DICT_POS;
}
// Read node array size and process empty node arrays. Nodes and arrays are counted up in this
// method to avoid an infinite loop.
//
// Reads the PtNode array located at mReadingState.mPos and moves the reading position to its
// first PtNode. An empty array is skipped by following its forward link. If the array cannot be
// read, or one of the loop-guard limits is exceeded, mIsError is set and the position becomes
// NOT_A_DICT_POS.
void DynamicPtReadingHelper::nextPtNodeArray() {
    int ptNodeCountInArray = 0;
    int firstPtNodePos = NOT_A_DICT_POS;
    if (!mPtNodeArrayReader->readPtNodeArrayInfoAndReturnIfValid(
            mReadingState.mPos, &ptNodeCountInArray, &firstPtNodePos)) {
        mIsError = true;
        mReadingState.mPos = NOT_A_DICT_POS;
        return;
    }
    mReadingState.mPosOfThisPtNodeArrayHead = mReadingState.mPos;
    mReadingState.mRemainingPtNodeCountInThisArray = ptNodeCountInArray;
    mReadingState.mPos = firstPtNodePos;
    // Count up nodes and node arrays to avoid infinite loop.
    mReadingState.mTotalPtNodeIndexInThisArrayChain +=
            mReadingState.mRemainingPtNodeCountInThisArray;
    mReadingState.mPtNodeArrayIndexInThisArrayChain++;
    if (mReadingState.mRemainingPtNodeCountInThisArray < 0
            || mReadingState.mTotalPtNodeIndexInThisArrayChain
                    > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP
            || mReadingState.mPtNodeArrayIndexInThisArrayChain
                    > MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP) {
        // Invalid dictionary.
        // The two literals below are concatenated by the compiler, so the first one has to end
        // with the ", " separator to keep the fields apart in the log line.
        AKLOGI("Invalid dictionary. nodeCount: %d, totalNodeCount: %d, MAX_CHILD_COUNT: %d, "
                "nodeArrayCount: %d, MAX_NODE_ARRAY_COUNT: %d",
                mReadingState.mRemainingPtNodeCountInThisArray,
                mReadingState.mTotalPtNodeIndexInThisArrayChain,
                MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP,
                mReadingState.mPtNodeArrayIndexInThisArrayChain,
                MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP);
        ASSERT(false);
        mIsError = true;
        mReadingState.mPos = NOT_A_DICT_POS;
        return;
    }
    if (mReadingState.mRemainingPtNodeCountInThisArray == 0) {
        // Empty node array. Try following forward link.
        followForwardLink();
    }
}
// Follow the forward link and read the next node array if exists.
void DynamicPtReadingHelper::followForwardLink() {
int nextPtNodeArrayPos = NOT_A_DICT_POS;
if (!mPtNodeArrayReader->readForwardLinkAndReturnIfValid(
mReadingState.mPos, &nextPtNodeArrayPos)) {
mIsError = true;
mReadingState.mPos = NOT_A_DICT_POS;
return;
}
mReadingState.mPosOfLastForwardLinkField = mReadingState.mPos;
if (nextPtNodeArrayPos != NOT_A_DICT_POS) {
// Follow the forward link.
mReadingState.mPos = nextPtNodeArrayPos;
nextPtNodeArray();
} else {
// All node arrays have been read.
mReadingState.mPos = NOT_A_DICT_POS;
}
}
} // namespace latinime