Merge "Use ReadOnlyByteArrayView in PatriciaTriePolicy."

main
Keisuke Kuroyanagi 2014-09-17 12:50:15 +00:00 committed by Android (Google) Code Review
commit 5849feeee1
3 changed files with 59 additions and 44 deletions

View File

@ -37,19 +37,19 @@ void PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNo
return; return;
} }
int nextPos = dicNode->getChildrenPtNodeArrayPos(); int nextPos = dicNode->getChildrenPtNodeArrayPos();
if (nextPos < 0 || nextPos >= mDictBufferSize) { if (!isValidPos(nextPos)) {
AKLOGE("Children PtNode array position is invalid. pos: %d, dict size: %d", AKLOGE("Children PtNode array position is invalid. pos: %d, dict size: %zd",
nextPos, mDictBufferSize); nextPos, mBuffer.size());
mIsCorrupted = true; mIsCorrupted = true;
ASSERT(false); ASSERT(false);
return; return;
} }
const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
mDictRoot, &nextPos); mBuffer.data(), &nextPos);
for (int i = 0; i < childCount; i++) { for (int i = 0; i < childCount; i++) {
if (nextPos < 0 || nextPos >= mDictBufferSize) { if (!isValidPos(nextPos)) {
AKLOGE("Child PtNode position is invalid. pos: %d, dict size: %d, childCount: %d / %d", AKLOGE("Child PtNode position is invalid. pos: %d, dict size: %zd, childCount: %d / %d",
nextPos, mDictBufferSize, i, childCount); nextPos, mBuffer.size(), i, childCount);
mIsCorrupted = true; mIsCorrupted = true;
ASSERT(false); ASSERT(false);
return; return;
@ -91,56 +91,57 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
int lastCandidatePtNodePos = 0; int lastCandidatePtNodePos = 0;
// Let's loop through PtNodes in this PtNode array searching for either the terminal // Let's loop through PtNodes in this PtNode array searching for either the terminal
// or one of its ascendants. // or one of its ascendants.
if (pos < 0 || pos >= mDictBufferSize) { if (!isValidPos(pos)) {
AKLOGE("PtNode array position is invalid. pos: %d, dict size: %d", AKLOGE("PtNode array position is invalid. pos: %d, dict size: %zd",
pos, mDictBufferSize); pos, mBuffer.size());
mIsCorrupted = true; mIsCorrupted = true;
ASSERT(false); ASSERT(false);
*outUnigramProbability = NOT_A_PROBABILITY; *outUnigramProbability = NOT_A_PROBABILITY;
return 0; return 0;
} }
for (int ptNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( for (int ptNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
mDictRoot, &pos); ptNodeCount > 0; --ptNodeCount) { mBuffer.data(), &pos); ptNodeCount > 0; --ptNodeCount) {
const int startPos = pos; const int startPos = pos;
if (pos < 0 || pos >= mDictBufferSize) { if (!isValidPos(pos)) {
AKLOGE("PtNode position is invalid. pos: %d, dict size: %d", pos, mDictBufferSize); AKLOGE("PtNode position is invalid. pos: %d, dict size: %zd", pos, mBuffer.size());
mIsCorrupted = true; mIsCorrupted = true;
ASSERT(false); ASSERT(false);
*outUnigramProbability = NOT_A_PROBABILITY; *outUnigramProbability = NOT_A_PROBABILITY;
return 0; return 0;
} }
const PatriciaTrieReadingUtils::NodeFlags flags = const PatriciaTrieReadingUtils::NodeFlags flags =
PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mBuffer.data(), &pos);
const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
mDictRoot, &pos); mBuffer.data(), &pos);
if (ptNodePos == startPos) { if (ptNodePos == startPos) {
// We found the position. Copy the rest of the code points in the buffer and return // We found the position. Copy the rest of the code points in the buffer and return
// the length. // the length.
outCodePoints[wordPos] = character; outCodePoints[wordPos] = character;
if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
mDictRoot, &pos); mBuffer.data(), &pos);
// We count code points in order to avoid infinite loops if the file is broken // We count code points in order to avoid infinite loops if the file is broken
// or if there is some other bug // or if there is some other bug
int charCount = maxCodePointCount; int charCount = maxCodePointCount;
while (NOT_A_CODE_POINT != nextChar && --charCount > 0) { while (NOT_A_CODE_POINT != nextChar && --charCount > 0) {
outCodePoints[++wordPos] = nextChar; outCodePoints[++wordPos] = nextChar;
nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
mDictRoot, &pos); mBuffer.data(), &pos);
} }
} }
*outUnigramProbability = *outUnigramProbability =
PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(),
&pos); &pos);
return ++wordPos; return ++wordPos;
} }
// We need to skip past this PtNode, so skip any remaining code points after the // We need to skip past this PtNode, so skip any remaining code points after the
// first and possibly the probability. // first and possibly the probability.
if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos); PatriciaTrieReadingUtils::skipCharacters(mBuffer.data(), flags, MAX_WORD_LENGTH,
&pos);
} }
if (PatriciaTrieReadingUtils::isTerminal(flags)) { if (PatriciaTrieReadingUtils::isTerminal(flags)) {
PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(), &pos);
} }
// The fact that this PtNode has children is very important. Since we already know // The fact that this PtNode has children is very important. Since we already know
// that this PtNode does not match, if it has no children we know it is irrelevant // that this PtNode does not match, if it has no children we know it is irrelevant
@ -155,7 +156,8 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
int currentPos = pos; int currentPos = pos;
// Here comes the tricky part. First, read the children position. // Here comes the tricky part. First, read the children position.
const int childrenPos = PatriciaTrieReadingUtils const int childrenPos = PatriciaTrieReadingUtils
::readChildrenPositionAndAdvancePosition(mDictRoot, flags, &currentPos); ::readChildrenPositionAndAdvancePosition(mBuffer.data(), flags,
&currentPos);
if (childrenPos > ptNodePos) { if (childrenPos > ptNodePos) {
// If the children pos is greater than the position, it means the previous // If the children pos is greater than the position, it means the previous
// PtNode, which position is stored in lastCandidatePtNodePos, was the right // PtNode, which position is stored in lastCandidatePtNodePos, was the right
@ -185,30 +187,30 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
if (0 != lastCandidatePtNodePos) { if (0 != lastCandidatePtNodePos) {
const PatriciaTrieReadingUtils::NodeFlags lastFlags = const PatriciaTrieReadingUtils::NodeFlags lastFlags =
PatriciaTrieReadingUtils::getFlagsAndAdvancePosition( PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(
mDictRoot, &lastCandidatePtNodePos); mBuffer.data(), &lastCandidatePtNodePos);
const int lastChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( const int lastChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
mDictRoot, &lastCandidatePtNodePos); mBuffer.data(), &lastCandidatePtNodePos);
// We copy all the characters in this PtNode to the buffer // We copy all the characters in this PtNode to the buffer
outCodePoints[wordPos] = lastChar; outCodePoints[wordPos] = lastChar;
if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) { if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) {
int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
mDictRoot, &lastCandidatePtNodePos); mBuffer.data(), &lastCandidatePtNodePos);
int charCount = maxCodePointCount; int charCount = maxCodePointCount;
while (-1 != nextChar && --charCount > 0) { while (-1 != nextChar && --charCount > 0) {
outCodePoints[++wordPos] = nextChar; outCodePoints[++wordPos] = nextChar;
nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
mDictRoot, &lastCandidatePtNodePos); mBuffer.data(), &lastCandidatePtNodePos);
} }
} }
++wordPos; ++wordPos;
// Now we only need to branch to the children address. Skip the probability if // Now we only need to branch to the children address. Skip the probability if
// it's there, read pos, and break to resume the search at pos. // it's there, read pos, and break to resume the search at pos.
if (PatriciaTrieReadingUtils::isTerminal(lastFlags)) { if (PatriciaTrieReadingUtils::isTerminal(lastFlags)) {
PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(),
&lastCandidatePtNodePos); &lastCandidatePtNodePos);
} }
pos = PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( pos = PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
mDictRoot, lastFlags, &lastCandidatePtNodePos); mBuffer.data(), lastFlags, &lastCandidatePtNodePos);
break; break;
} else { } else {
// Here is a little tricky part: we come here if we found out that all children // Here is a little tricky part: we come here if we found out that all children
@ -220,14 +222,14 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
// ready to start the next one. // ready to start the next one.
if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
mDictRoot, flags, &pos); mBuffer.data(), flags, &pos);
} }
if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
mShortcutListPolicy.skipAllShortcuts(&pos); mShortcutListPolicy.skipAllShortcuts(&pos);
} }
if (PatriciaTrieReadingUtils::hasBigrams(flags)) { if (PatriciaTrieReadingUtils::hasBigrams(flags)) {
if (!mBigramListPolicy.skipAllBigrams(&pos)) { if (!mBigramListPolicy.skipAllBigrams(&pos)) {
AKLOGE("Cannot skip bigrams. BufSize: %d, pos: %d.", mDictBufferSize, AKLOGE("Cannot skip bigrams. BufSize: %zd, pos: %d.", mBuffer.size(),
pos); pos);
mIsCorrupted = true; mIsCorrupted = true;
ASSERT(false); ASSERT(false);
@ -244,14 +246,14 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
// our pos is after the end of this PtNode, at the start of the next one. // our pos is after the end of this PtNode, at the start of the next one.
if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
mDictRoot, flags, &pos); mBuffer.data(), flags, &pos);
} }
if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
mShortcutListPolicy.skipAllShortcuts(&pos); mShortcutListPolicy.skipAllShortcuts(&pos);
} }
if (PatriciaTrieReadingUtils::hasBigrams(flags)) { if (PatriciaTrieReadingUtils::hasBigrams(flags)) {
if (!mBigramListPolicy.skipAllBigrams(&pos)) { if (!mBigramListPolicy.skipAllBigrams(&pos)) {
AKLOGE("Cannot skip bigrams. BufSize: %d, pos: %d.", mDictBufferSize, pos); AKLOGE("Cannot skip bigrams. BufSize: %zd, pos: %d.", mBuffer.size(), pos);
mIsCorrupted = true; mIsCorrupted = true;
ASSERT(false); ASSERT(false);
*outUnigramProbability = NOT_A_PROBABILITY; *outUnigramProbability = NOT_A_PROBABILITY;
@ -402,7 +404,7 @@ int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNod
int shortcutPos = NOT_A_DICT_POS; int shortcutPos = NOT_A_DICT_POS;
int bigramPos = NOT_A_DICT_POS; int bigramPos = NOT_A_DICT_POS;
int siblingPos = NOT_A_DICT_POS; int siblingPos = NOT_A_DICT_POS;
PatriciaTrieReadingUtils::readPtNodeInfo(mDictRoot, ptNodePos, &mShortcutListPolicy, PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, &mShortcutListPolicy,
&mBigramListPolicy, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints, &mBigramListPolicy, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints,
&probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos); &probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
// Skip PtNodes don't start with Unicode code point because they represent non-word information. // Skip PtNodes don't start with Unicode code point because they represent non-word information.
@ -452,14 +454,16 @@ const WordProperty PatriciaTriePolicy::getWordProperty(
int shortcutPos = getShortcutPositionOfPtNode(ptNodePos); int shortcutPos = getShortcutPositionOfPtNode(ptNodePos);
if (shortcutPos != NOT_A_DICT_POS) { if (shortcutPos != NOT_A_DICT_POS) {
int shortcutTargetCodePoints[MAX_WORD_LENGTH]; int shortcutTargetCodePoints[MAX_WORD_LENGTH];
ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mDictRoot, &shortcutPos); ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mBuffer.data(),
&shortcutPos);
bool hasNext = true; bool hasNext = true;
while (hasNext) { while (hasNext) {
const ShortcutListReadingUtils::ShortcutFlags shortcutFlags = const ShortcutListReadingUtils::ShortcutFlags shortcutFlags =
ShortcutListReadingUtils::getFlagsAndForwardPointer(mDictRoot, &shortcutPos); ShortcutListReadingUtils::getFlagsAndForwardPointer(mBuffer.data(),
&shortcutPos);
hasNext = ShortcutListReadingUtils::hasNext(shortcutFlags); hasNext = ShortcutListReadingUtils::hasNext(shortcutFlags);
const int shortcutTargetLength = ShortcutListReadingUtils::readShortcutTarget( const int shortcutTargetLength = ShortcutListReadingUtils::readShortcutTarget(
mDictRoot, MAX_WORD_LENGTH, shortcutTargetCodePoints, &shortcutPos); mBuffer.data(), MAX_WORD_LENGTH, shortcutTargetCodePoints, &shortcutPos);
const std::vector<int> shortcutTarget(shortcutTargetCodePoints, const std::vector<int> shortcutTarget(shortcutTargetCodePoints,
shortcutTargetCodePoints + shortcutTargetLength); shortcutTargetCodePoints + shortcutTargetLength);
const int shortcutProbability = const int shortcutProbability =
@ -512,4 +516,9 @@ int PatriciaTriePolicy::getWordIdFromTerminalPtNodePos(const int ptNodePos) cons
int PatriciaTriePolicy::getTerminalPtNodePosFromWordId(const int wordId) const { int PatriciaTriePolicy::getTerminalPtNodePosFromWordId(const int wordId) const {
return wordId == NOT_A_WORD_ID ? NOT_A_DICT_POS : wordId; return wordId == NOT_A_WORD_ID ? NOT_A_DICT_POS : wordId;
} }
// Returns true when pos can be used as an offset into mBuffer, i.e. 0 <= pos < mBuffer.size().
// The upper-bound comparison is done in size_t: narrowing mBuffer.size() to int would yield an
// out-of-range (typically negative) bound for buffers larger than INT_MAX and reject every
// position. pos is known to be non-negative before it is widened, so the cast is lossless.
bool PatriciaTriePolicy::isValidPos(const int pos) const {
    return pos >= 0 && static_cast<size_t>(pos) < mBuffer.size();
}
} // namespace latinime } // namespace latinime

View File

@ -44,13 +44,12 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
: mMmappedBuffer(std::move(mmappedBuffer)), : mMmappedBuffer(std::move(mmappedBuffer)),
mHeaderPolicy(mMmappedBuffer->getReadOnlyByteArrayView().data(), mHeaderPolicy(mMmappedBuffer->getReadOnlyByteArrayView().data(),
FormatUtils::VERSION_2), FormatUtils::VERSION_2),
mDictRoot(mMmappedBuffer->getReadOnlyByteArrayView().data() mBuffer(mMmappedBuffer->getReadOnlyByteArrayView().skip(mHeaderPolicy.getSize())),
+ mHeaderPolicy.getSize()), mBigramListPolicy(mBuffer.data(), mBuffer.size()),
mDictBufferSize(mMmappedBuffer->getReadOnlyByteArrayView().size() mShortcutListPolicy(mBuffer.data()),
- mHeaderPolicy.getSize()), mPtNodeReader(mBuffer.data(), mBuffer.size(), &mBigramListPolicy,
mBigramListPolicy(mDictRoot, mDictBufferSize), mShortcutListPolicy(mDictRoot), &mShortcutListPolicy),
mPtNodeReader(mDictRoot, mDictBufferSize, &mBigramListPolicy, &mShortcutListPolicy), mPtNodeArrayReader(mBuffer.data(), mBuffer.size()),
mPtNodeArrayReader(mDictRoot, mDictBufferSize),
mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {} mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {}
AK_FORCE_INLINE int getRootPosition() const { AK_FORCE_INLINE int getRootPosition() const {
@ -149,8 +148,7 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
const MmappedBuffer::MmappedBufferPtr mMmappedBuffer; const MmappedBuffer::MmappedBufferPtr mMmappedBuffer;
const HeaderPolicy mHeaderPolicy; const HeaderPolicy mHeaderPolicy;
const uint8_t *const mDictRoot; const ReadOnlyByteArrayView mBuffer;
const int mDictBufferSize;
const BigramListPolicy mBigramListPolicy; const BigramListPolicy mBigramListPolicy;
const ShortcutListPolicy mShortcutListPolicy; const ShortcutListPolicy mShortcutListPolicy;
const Ver2ParticiaTrieNodeReader mPtNodeReader; const Ver2ParticiaTrieNodeReader mPtNodeReader;
@ -166,6 +164,7 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
int getTerminalPtNodePosFromWordId(const int wordId) const; int getTerminalPtNodePosFromWordId(const int wordId) const;
const WordAttributes getWordAttributes(const int probability, const WordAttributes getWordAttributes(const int probability,
const PtNodeParams &ptNodeParams) const; const PtNodeParams &ptNodeParams) const;
bool isValidPos(const int pos) const;
}; };
} // namespace latinime } // namespace latinime
#endif // LATINIME_PATRICIA_TRIE_POLICY_H #endif // LATINIME_PATRICIA_TRIE_POLICY_H

View File

@ -42,6 +42,13 @@ class ReadOnlyByteArrayView {
return mPtr; return mPtr;
} }
// Returns a view over what is left after dropping the first n bytes of this view.
// If n is greater than or equal to the current size, a default-constructed view is returned
// and no pointer arithmetic is performed.
AK_FORCE_INLINE const ReadOnlyByteArrayView skip(const size_t n) const {
    const bool hasRemainder = n < mSize;
    return hasRemainder ? ReadOnlyByteArrayView(mPtr + n, mSize - n) : ReadOnlyByteArrayView();
}
private: private:
DISALLOW_ASSIGNMENT_OPERATOR(ReadOnlyByteArrayView); DISALLOW_ASSIGNMENT_OPERATOR(ReadOnlyByteArrayView);