Merge "Create Ver2ParticiaTrieNodeReader."

This commit is contained in:
Keisuke Kuroyanagi 2014-02-10 12:11:01 +00:00 committed by Android (Google) Code Review
commit 7c73e0f2a8
8 changed files with 181 additions and 69 deletions

View file

@ -57,7 +57,8 @@ LATIN_IME_CORE_SRC_FILES := \
dynamic_pt_writing_utils.cpp) \ dynamic_pt_writing_utils.cpp) \
$(addprefix suggest/policyimpl/dictionary/structure/v2/, \ $(addprefix suggest/policyimpl/dictionary/structure/v2/, \
patricia_trie_policy.cpp \ patricia_trie_policy.cpp \
patricia_trie_reading_utils.cpp) \ patricia_trie_reading_utils.cpp \
ver2_patricia_trie_node_reader.cpp) \
$(addprefix suggest/policyimpl/dictionary/structure/v4/, \ $(addprefix suggest/policyimpl/dictionary/structure/v4/, \
ver4_dict_buffers.cpp \ ver4_dict_buffers.cpp \
ver4_dict_constants.cpp \ ver4_dict_constants.cpp \

View file

@ -53,6 +53,21 @@ class PtNodeParams {
memcpy(mCodePoints, ptNodeParams.getCodePoints(), sizeof(int) * mCodePointCount); memcpy(mCodePoints, ptNodeParams.getCodePoints(), sizeof(int) * mCodePointCount);
} }
// Constructs the parameters of a PtNode read from a version 2 dictionary.
// Only the head position, flags, code points, probability, children position, shortcut
// position, bigram position and sibling position are taken from the arguments. The remaining
// members (parent position, terminal id and its field position, probability field position,
// children position field position, bigram linked node position) are initialized to
// NOT_A_DICT_POS or Ver4DictConstants::NOT_A_TERMINAL_ID.
PtNodeParams(const int headPos, const PatriciaTrieReadingUtils::NodeFlags flags,
const int codePointCount, const int *const codePoints, const int probability,
const int childrenPos, const int shortcutPos, const int bigramPos,
const int siblingPos)
: mHeadPos(headPos), mFlags(flags), mParentPos(NOT_A_DICT_POS),
mCodePointCount(codePointCount), mCodePoints(), mTerminalIdFieldPos(NOT_A_DICT_POS),
mTerminalId(Ver4DictConstants::NOT_A_TERMINAL_ID),
mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(probability),
mChildrenPosFieldPos(NOT_A_DICT_POS), mChildrenPos(childrenPos),
mBigramLinkedNodePos(NOT_A_DICT_POS), mShortcutPos(shortcutPos),
mBigramPos(bigramPos), mSiblingPos(siblingPos) {
// mCodePoints was value-initialized above; copy the first mCodePointCount ints over it.
// NOTE(review): the count is not range-checked here. Presumably callers guarantee that it is
// non-negative and does not exceed the capacity of mCodePoints -- confirm.
memcpy(mCodePoints, codePoints, sizeof(int) * mCodePointCount);
}
// PtNode with a terminal id. // PtNode with a terminal id.
PtNodeParams(const int headPos, const PatriciaTrieReadingUtils::NodeFlags flags, PtNodeParams(const int headPos, const PatriciaTrieReadingUtils::NodeFlags flags,
const int parentPos, const int codePointCount, const int *const codePoints, const int parentPos, const int codePointCount, const int *const codePoints,

View file

@ -336,99 +336,50 @@ int PatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int ptNodePos) const
if (ptNodePos == NOT_A_DICT_POS) { if (ptNodePos == NOT_A_DICT_POS) {
return NOT_A_PROBABILITY; return NOT_A_PROBABILITY;
} }
int pos = ptNodePos; const PtNodeParams ptNodeParams = mPtNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos);
const PatriciaTrieReadingUtils::NodeFlags flags = if (ptNodeParams.isNotAWord() || ptNodeParams.isBlacklisted()) {
PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos);
if (!PatriciaTrieReadingUtils::isTerminal(flags)) {
return NOT_A_PROBABILITY;
}
if (PatriciaTrieReadingUtils::isNotAWord(flags)
|| PatriciaTrieReadingUtils::isBlacklisted(flags)) {
// If this is not a word, or if it's a blacklisted entry, it should behave as // If this is not a word, or if it's a blacklisted entry, it should behave as
// having no probability outside of the suggestion process (where it should be used // having no probability outside of the suggestion process (where it should be used
// for shortcuts). // for shortcuts).
return NOT_A_PROBABILITY; return NOT_A_PROBABILITY;
} }
PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos); return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);
return getProbability(PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(
mDictRoot, &pos), NOT_A_PROBABILITY);
} }
int PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { int PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const {
if (ptNodePos == NOT_A_DICT_POS) { if (ptNodePos == NOT_A_DICT_POS) {
return NOT_A_DICT_POS; return NOT_A_DICT_POS;
} }
int pos = ptNodePos; return mPtNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos).getShortcutPos();
const PatriciaTrieReadingUtils::NodeFlags flags =
PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos);
if (!PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
return NOT_A_DICT_POS;
}
PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos);
if (PatriciaTrieReadingUtils::isTerminal(flags)) {
PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos);
}
if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(mDictRoot, flags, &pos);
}
return pos;
} }
int PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { int PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {
if (ptNodePos == NOT_A_DICT_POS) { if (ptNodePos == NOT_A_DICT_POS) {
return NOT_A_DICT_POS; return NOT_A_DICT_POS;
} }
int pos = ptNodePos; return mPtNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos).getBigramsPos();
const PatriciaTrieReadingUtils::NodeFlags flags =
PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos);
if (!PatriciaTrieReadingUtils::hasBigrams(flags)) {
return NOT_A_DICT_POS;
}
PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos);
if (PatriciaTrieReadingUtils::isTerminal(flags)) {
PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos);
}
if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(mDictRoot, flags, &pos);
}
if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
mShortcutListPolicy.skipAllShortcuts(&pos);;
}
return pos;
} }
int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNode, int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNode,
const int ptNodePos, DicNodeVector *childDicNodes) const { const int ptNodePos, DicNodeVector *childDicNodes) const {
int pos = ptNodePos; PatriciaTrieReadingUtils::NodeFlags flags;
const PatriciaTrieReadingUtils::NodeFlags flags = int mergedNodeCodePointCount = 0;
PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos);
int mergedNodeCodePoints[MAX_WORD_LENGTH]; int mergedNodeCodePoints[MAX_WORD_LENGTH];
const int mergedNodeCodePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition( int probability = NOT_A_PROBABILITY;
mDictRoot, flags, MAX_WORD_LENGTH, mergedNodeCodePoints, &pos); int childrenPos = NOT_A_DICT_POS;
const int probability = (PatriciaTrieReadingUtils::isTerminal(flags))? int shortcutPos = NOT_A_DICT_POS;
PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos) int bigramPos = NOT_A_DICT_POS;
: NOT_A_PROBABILITY; int siblingPos = NOT_A_DICT_POS;
const int childrenPos = PatriciaTrieReadingUtils::hasChildrenInFlags(flags) ? PatriciaTrieReadingUtils::readPtNodeInfo(mDictRoot, ptNodePos, getShortcutsStructurePolicy(),
PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( getBigramsStructurePolicy(), &flags, &mergedNodeCodePointCount, mergedNodeCodePoints,
mDictRoot, flags, &pos) : NOT_A_DICT_POS; &probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
getShortcutsStructurePolicy()->skipAllShortcuts(&pos);
}
if (PatriciaTrieReadingUtils::hasBigrams(flags)) {
getBigramsStructurePolicy()->skipAllBigrams(&pos);
}
if (mergedNodeCodePointCount <= 0) {
AKLOGE("Empty PtNode is not allowed. Code point count: %d", mergedNodeCodePointCount);
ASSERT(false);
return pos;
}
childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability, childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability,
PatriciaTrieReadingUtils::isTerminal(flags), PatriciaTrieReadingUtils::isTerminal(flags),
PatriciaTrieReadingUtils::hasChildrenInFlags(flags), PatriciaTrieReadingUtils::hasChildrenInFlags(flags),
PatriciaTrieReadingUtils::isBlacklisted(flags) || PatriciaTrieReadingUtils::isBlacklisted(flags)
PatriciaTrieReadingUtils::isNotAWord(flags), || PatriciaTrieReadingUtils::isNotAWord(flags),
mergedNodeCodePointCount, mergedNodeCodePoints); mergedNodeCodePointCount, mergedNodeCodePoints);
return pos; return siblingPos;
} }
} // namespace latinime } // namespace latinime

View file

@ -24,6 +24,7 @@
#include "suggest/policyimpl/dictionary/bigram/bigram_list_policy.h" #include "suggest/policyimpl/dictionary/bigram/bigram_list_policy.h"
#include "suggest/policyimpl/dictionary/header/header_policy.h" #include "suggest/policyimpl/dictionary/header/header_policy.h"
#include "suggest/policyimpl/dictionary/shortcut/shortcut_list_policy.h" #include "suggest/policyimpl/dictionary/shortcut/shortcut_list_policy.h"
#include "suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h"
#include "suggest/policyimpl/dictionary/utils/format_utils.h" #include "suggest/policyimpl/dictionary/utils/format_utils.h"
#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" #include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h"
@ -40,7 +41,8 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
mDictRoot(mMmappedBuffer.get()->getBuffer() + mHeaderPolicy.getSize()), mDictRoot(mMmappedBuffer.get()->getBuffer() + mHeaderPolicy.getSize()),
mDictBufferSize(mMmappedBuffer.get()->getBufferSize() mDictBufferSize(mMmappedBuffer.get()->getBufferSize()
- mHeaderPolicy.getSize()), - mHeaderPolicy.getSize()),
mBigramListPolicy(mDictRoot), mShortcutListPolicy(mDictRoot) {} mBigramListPolicy(mDictRoot), mShortcutListPolicy(mDictRoot),
mPtNodeReader(mDictRoot, mDictBufferSize, &mBigramListPolicy, &mShortcutListPolicy) {}
AK_FORCE_INLINE int getRootPosition() const { AK_FORCE_INLINE int getRootPosition() const {
return 0; return 0;
@ -143,6 +145,7 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
const int mDictBufferSize; const int mDictBufferSize;
const BigramListPolicy mBigramListPolicy; const BigramListPolicy mBigramListPolicy;
const ShortcutListPolicy mShortcutListPolicy; const ShortcutListPolicy mShortcutListPolicy;
const Ver2ParticiaTrieNodeReader mPtNodeReader;
int createAndGetLeavingChildNode(const DicNode *const dicNode, const int ptNodePos, int createAndGetLeavingChildNode(const DicNode *const dicNode, const int ptNodePos,
DicNodeVector *const childDicNodes) const; DicNodeVector *const childDicNodes) const;

View file

@ -17,6 +17,8 @@
#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.h"
#include "defines.h" #include "defines.h"
#include "suggest/core/policy/dictionary_bigrams_structure_policy.h"
#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h"
#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" #include "suggest/policyimpl/dictionary/utils/byte_array_utils.h"
namespace latinime { namespace latinime {
@ -130,4 +132,32 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
return base + offset; return base + offset;
} }
/* static */ void PtReadingUtils::readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos,
const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
const DictionaryBigramsStructurePolicy *const bigramPolicy,
NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint,
int *const outProbability, int *const outChildrenPos, int *const outShortcutPos,
int *const outBigramPos, int *const outSiblingPos) {
int readingPos = ptNodePos;
const NodeFlags flags = getFlagsAndAdvancePosition(dictBuf, &readingPos);
*outFlags = flags;
*outCodePointCount = getCharsAndAdvancePosition(
dictBuf, flags, MAX_WORD_LENGTH, outCodePoint, &readingPos);
*outProbability = isTerminal(flags) ?
readProbabilityAndAdvancePosition(dictBuf, &readingPos) : NOT_A_PROBABILITY;
*outChildrenPos = hasChildrenInFlags(flags) ?
readChildrenPositionAndAdvancePosition(dictBuf, flags, &readingPos) : NOT_A_DICT_POS;
*outShortcutPos = NOT_A_DICT_POS;
if (hasShortcutTargets(flags)) {
*outShortcutPos = readingPos;
shortcutPolicy->skipAllShortcuts(&readingPos);
}
*outBigramPos = NOT_A_DICT_POS;
if (hasBigrams(flags)) {
*outBigramPos = readingPos;
bigramPolicy->skipAllBigrams(&readingPos);
}
*outSiblingPos = readingPos;
}
} // namespace latinime } // namespace latinime

View file

@ -23,6 +23,9 @@
namespace latinime { namespace latinime {
class DictionaryShortcutsStructurePolicy;
class DictionaryBigramsStructurePolicy;
// TODO: Move to pt_common // TODO: Move to pt_common
class PatriciaTrieReadingUtils { class PatriciaTrieReadingUtils {
public: public:
@ -101,6 +104,13 @@ class PatriciaTrieReadingUtils {
return nodeFlags; return nodeFlags;
} }
// Reads the PtNode starting at ptNodePos in dictBuf and writes each decoded field to the
// corresponding output pointer:
//   outFlags           - the node flags.
//   outCodePointCount  - number of code points written into outCodePoint.
//   outCodePoint       - buffer receiving the code points (read with a MAX_WORD_LENGTH limit).
//   outProbability     - probability for terminal nodes, NOT_A_PROBABILITY otherwise.
//   outChildrenPos     - children position, or NOT_A_DICT_POS when the flags have no children.
//   outShortcutPos     - start of the shortcut list, or NOT_A_DICT_POS when there is none.
//   outBigramPos       - start of the bigram list, or NOT_A_DICT_POS when there is none.
//   outSiblingPos      - reading position after all fields of this node were consumed.
// shortcutPolicy and bigramPolicy are used to skip over the shortcut and bigram lists.
static void readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos,
const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
const DictionaryBigramsStructurePolicy *const bigramPolicy,
NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint,
int *const outProbability, int *const outChildrenPos, int *const outShortcutPos,
int *const outBigramPos, int *const outSiblingPos);
private: private:
DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTrieReadingUtils); DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTrieReadingUtils);

View file

@ -0,0 +1,52 @@
/*
* Copyright (C) 2014, The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h"
#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_reading_utils.h"
namespace latinime {
// Reads the PtNode located at ptNodePos in the dictionary buffer and returns its parameters.
// A default-constructed PtNodeParams is returned (after logging and asserting) when ptNodePos
// lies outside [0, mDictSize) or when the node turns out to have no code points.
const PtNodeParams Ver2ParticiaTrieNodeReader::fetchNodeInfoInBufferFromPtNodePos(
        const int ptNodePos) const {
    const bool isPosInsideBuffer = (0 <= ptNodePos) && (ptNodePos < mDictSize);
    if (!isPosInsideBuffer) {
        // Reading invalid position because of bug or broken dictionary.
        AKLOGE("Fetching PtNode info from invalid dictionary position: %d, dictionary size: %d",
                ptNodePos, mDictSize);
        ASSERT(false);
        return PtNodeParams();
    }

    // Receivers for the decoded fields; the defaults stand for "field not present".
    PatriciaTrieReadingUtils::NodeFlags nodeFlags;
    int codePoints[MAX_WORD_LENGTH];
    int codePointCount = 0;
    int unigramProbability = NOT_A_PROBABILITY;
    int posOfChildren = NOT_A_DICT_POS;
    int posOfShortcuts = NOT_A_DICT_POS;
    int posOfBigrams = NOT_A_DICT_POS;
    int posOfSibling = NOT_A_DICT_POS;
    PatriciaTrieReadingUtils::readPtNodeInfo(mDictBuffer, ptNodePos, mShortuctPolicy,
            mBigramPolicy, &nodeFlags, &codePointCount, codePoints, &unigramProbability,
            &posOfChildren, &posOfShortcuts, &posOfBigrams, &posOfSibling);

    if (codePointCount > 0) {
        return PtNodeParams(ptNodePos, nodeFlags, codePointCount, codePoints,
                unigramProbability, posOfChildren, posOfShortcuts, posOfBigrams, posOfSibling);
    }
    // A PtNode must carry at least one code point.
    AKLOGE("Empty PtNode is not allowed. Code point count: %d", codePointCount);
    ASSERT(false);
    return PtNodeParams();
}
}

View file

@ -0,0 +1,50 @@
/*
* Copyright (C) 2014, The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H
#define LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H
#include <stdint.h>
#include "defines.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h"
namespace latinime {
class DictionaryBigramsStructurePolicy;
class DictionaryShortcutsStructurePolicy;
// PtNodeReader implementation that fetches PtNode parameters from a dictionary buffer using
// the given bigram and shortcut structure policies.
// The buffer and policy pointers are stored as given; this class declares no destructor and
// never releases them.
// NOTE(review): presumably the caller must keep the buffer and both policies alive for as long
// as this reader is used -- confirm.
class Ver2ParticiaTrieNodeReader : public PtNodeReader {
public:
// dictBuffer:     buffer that PtNode positions are read from.
// dictSize:       size used to range-check PtNode positions before reading.
// bigramPolicy:   policy used while reading a PtNode that has bigrams.
// shortcutPolicy: policy used while reading a PtNode that has shortcut targets.
Ver2ParticiaTrieNodeReader(const uint8_t *const dictBuffer, const int dictSize,
const DictionaryBigramsStructurePolicy *const bigramPolicy,
const DictionaryShortcutsStructurePolicy *const shortcutPolicy)
: mDictBuffer(dictBuffer), mDictSize(dictSize), mBigramPolicy(bigramPolicy),
mShortuctPolicy(shortcutPolicy) {}
// Returns the parameters of the PtNode located at ptNodePos.
virtual const PtNodeParams fetchNodeInfoInBufferFromPtNodePos(const int ptNodePos) const;
private:
// NOTE(review): macro defined elsewhere; by its name it suppresses implicitly generated
// constructors -- confirm against defines.h.
DISALLOW_IMPLICIT_CONSTRUCTORS(Ver2ParticiaTrieNodeReader);
// Buffer that PtNodes are read from.
const uint8_t *const mDictBuffer;
// Upper bound (exclusive) for valid PtNode positions.
const int mDictSize;
// Policy for the bigram list of a PtNode.
const DictionaryBigramsStructurePolicy *const mBigramPolicy;
// Policy for the shortcut list of a PtNode.
const DictionaryShortcutsStructurePolicy *const mShortuctPolicy;
};
} // namespace latinime
#endif /* LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H */