Merge "Use word id to construct DicNode instead of isTerminal flag."

This commit is contained in:
Keisuke Kuroyanagi 2014-08-27 11:21:40 +00:00 committed by Android (Google) Code Review
commit b85bf4ebb8
11 changed files with 23 additions and 16 deletions

View file

@ -299,6 +299,7 @@ static inline void prof_out(void) {
#define NOT_AN_INDEX (-1) #define NOT_AN_INDEX (-1)
#define NOT_A_PROBABILITY (-1) #define NOT_A_PROBABILITY (-1)
#define NOT_A_DICT_POS (S_INT_MIN) #define NOT_A_DICT_POS (S_INT_MIN)
#define NOT_A_WORD_ID (S_INT_MIN)
#define NOT_A_TIMESTAMP (-1) #define NOT_A_TIMESTAMP (-1)
#define NOT_A_LANGUAGE_WEIGHT (-1.0f) #define NOT_A_LANGUAGE_WEIGHT (-1.0f)

View file

@ -136,7 +136,7 @@ class DicNode {
} }
void initAsChild(const DicNode *const dicNode, const int ptNodePos, void initAsChild(const DicNode *const dicNode, const int ptNodePos,
const int childrenPtNodeArrayPos, const int probability, const bool isTerminal, const int childrenPtNodeArrayPos, const int probability, const int wordId,
const bool hasChildren, const bool isBlacklistedOrNotAWord, const bool hasChildren, const bool isBlacklistedOrNotAWord,
const uint16_t mergedNodeCodePointCount, const int *const mergedNodeCodePoints) { const uint16_t mergedNodeCodePointCount, const int *const mergedNodeCodePoints) {
uint16_t newDepth = static_cast<uint16_t>(dicNode->getNodeCodePointCount() + 1); uint16_t newDepth = static_cast<uint16_t>(dicNode->getNodeCodePointCount() + 1);
@ -144,7 +144,7 @@ class DicNode {
const uint16_t newLeavingDepth = static_cast<uint16_t>( const uint16_t newLeavingDepth = static_cast<uint16_t>(
dicNode->mDicNodeProperties.getLeavingDepth() + mergedNodeCodePointCount); dicNode->mDicNodeProperties.getLeavingDepth() + mergedNodeCodePointCount);
mDicNodeProperties.init(ptNodePos, childrenPtNodeArrayPos, mergedNodeCodePoints[0], mDicNodeProperties.init(ptNodePos, childrenPtNodeArrayPos, mergedNodeCodePoints[0],
probability, isTerminal, hasChildren, isBlacklistedOrNotAWord, newDepth, probability, wordId, hasChildren, isBlacklistedOrNotAWord, newDepth,
newLeavingDepth, dicNode->mDicNodeProperties.getPrevWordsTerminalPtNodePos()); newLeavingDepth, dicNode->mDicNodeProperties.getPrevWordsTerminalPtNodePos());
mDicNodeState.init(&dicNode->mDicNodeState, mergedNodeCodePointCount, mDicNodeState.init(&dicNode->mDicNodeState, mergedNodeCodePointCount,
mergedNodeCodePoints); mergedNodeCodePoints);

View file

@ -59,13 +59,13 @@ class DicNodeVector {
} }
void pushLeavingChild(const DicNode *const dicNode, const int ptNodePos, void pushLeavingChild(const DicNode *const dicNode, const int ptNodePos,
const int childrenPtNodeArrayPos, const int probability, const bool isTerminal, const int childrenPtNodeArrayPos, const int probability, const int wordId,
const bool hasChildren, const bool isBlacklistedOrNotAWord, const bool hasChildren, const bool isBlacklistedOrNotAWord,
const uint16_t mergedNodeCodePointCount, const int *const mergedNodeCodePoints) { const uint16_t mergedNodeCodePointCount, const int *const mergedNodeCodePoints) {
ASSERT(!mLock); ASSERT(!mLock);
mDicNodes.emplace_back(); mDicNodes.emplace_back();
mDicNodes.back().initAsChild(dicNode, ptNodePos, childrenPtNodeArrayPos, probability, mDicNodes.back().initAsChild(dicNode, ptNodePos, childrenPtNodeArrayPos, probability,
isTerminal, hasChildren, isBlacklistedOrNotAWord, mergedNodeCodePointCount, wordId, hasChildren, isBlacklistedOrNotAWord, mergedNodeCodePointCount,
mergedNodeCodePoints); mergedNodeCodePoints);
} }

View file

@ -31,20 +31,20 @@ class DicNodeProperties {
AK_FORCE_INLINE DicNodeProperties() AK_FORCE_INLINE DicNodeProperties()
: mPtNodePos(NOT_A_DICT_POS), mChildrenPtNodeArrayPos(NOT_A_DICT_POS), : mPtNodePos(NOT_A_DICT_POS), mChildrenPtNodeArrayPos(NOT_A_DICT_POS),
mProbability(NOT_A_PROBABILITY), mDicNodeCodePoint(NOT_A_CODE_POINT), mProbability(NOT_A_PROBABILITY), mDicNodeCodePoint(NOT_A_CODE_POINT),
mIsTerminal(false), mHasChildrenPtNodes(false), mWordId(NOT_A_WORD_ID), mHasChildrenPtNodes(false),
mIsBlacklistedOrNotAWord(false), mDepth(0), mLeavingDepth(0) {} mIsBlacklistedOrNotAWord(false), mDepth(0), mLeavingDepth(0) {}
~DicNodeProperties() {} ~DicNodeProperties() {}
// Should be called only once per DicNode initialization. // Should be called only once per DicNode initialization.
void init(const int pos, const int childrenPos, const int nodeCodePoint, const int probability, void init(const int pos, const int childrenPos, const int nodeCodePoint, const int probability,
const bool isTerminal, const bool hasChildren, const bool isBlacklistedOrNotAWord, const int wordId, const bool hasChildren, const bool isBlacklistedOrNotAWord,
const uint16_t depth, const uint16_t leavingDepth, const int *const prevWordsNodePos) { const uint16_t depth, const uint16_t leavingDepth, const int *const prevWordsNodePos) {
mPtNodePos = pos; mPtNodePos = pos;
mChildrenPtNodeArrayPos = childrenPos; mChildrenPtNodeArrayPos = childrenPos;
mDicNodeCodePoint = nodeCodePoint; mDicNodeCodePoint = nodeCodePoint;
mProbability = probability; mProbability = probability;
mIsTerminal = isTerminal; mWordId = wordId;
mHasChildrenPtNodes = hasChildren; mHasChildrenPtNodes = hasChildren;
mIsBlacklistedOrNotAWord = isBlacklistedOrNotAWord; mIsBlacklistedOrNotAWord = isBlacklistedOrNotAWord;
mDepth = depth; mDepth = depth;
@ -58,7 +58,7 @@ class DicNodeProperties {
mChildrenPtNodeArrayPos = rootPtNodeArrayPos; mChildrenPtNodeArrayPos = rootPtNodeArrayPos;
mDicNodeCodePoint = NOT_A_CODE_POINT; mDicNodeCodePoint = NOT_A_CODE_POINT;
mProbability = NOT_A_PROBABILITY; mProbability = NOT_A_PROBABILITY;
mIsTerminal = false; mWordId = NOT_A_WORD_ID;
mHasChildrenPtNodes = true; mHasChildrenPtNodes = true;
mIsBlacklistedOrNotAWord = false; mIsBlacklistedOrNotAWord = false;
mDepth = 0; mDepth = 0;
@ -71,7 +71,7 @@ class DicNodeProperties {
mChildrenPtNodeArrayPos = dicNodeProp->mChildrenPtNodeArrayPos; mChildrenPtNodeArrayPos = dicNodeProp->mChildrenPtNodeArrayPos;
mDicNodeCodePoint = dicNodeProp->mDicNodeCodePoint; mDicNodeCodePoint = dicNodeProp->mDicNodeCodePoint;
mProbability = dicNodeProp->mProbability; mProbability = dicNodeProp->mProbability;
mIsTerminal = dicNodeProp->mIsTerminal; mWordId = dicNodeProp->mWordId;
mHasChildrenPtNodes = dicNodeProp->mHasChildrenPtNodes; mHasChildrenPtNodes = dicNodeProp->mHasChildrenPtNodes;
mIsBlacklistedOrNotAWord = dicNodeProp->mIsBlacklistedOrNotAWord; mIsBlacklistedOrNotAWord = dicNodeProp->mIsBlacklistedOrNotAWord;
mDepth = dicNodeProp->mDepth; mDepth = dicNodeProp->mDepth;
@ -86,7 +86,7 @@ class DicNodeProperties {
mChildrenPtNodeArrayPos = dicNodeProp->mChildrenPtNodeArrayPos; mChildrenPtNodeArrayPos = dicNodeProp->mChildrenPtNodeArrayPos;
mDicNodeCodePoint = codePoint; // Overwrite the node char of a passing child mDicNodeCodePoint = codePoint; // Overwrite the node char of a passing child
mProbability = dicNodeProp->mProbability; mProbability = dicNodeProp->mProbability;
mIsTerminal = dicNodeProp->mIsTerminal; mWordId = dicNodeProp->mWordId;
mHasChildrenPtNodes = dicNodeProp->mHasChildrenPtNodes; mHasChildrenPtNodes = dicNodeProp->mHasChildrenPtNodes;
mIsBlacklistedOrNotAWord = dicNodeProp->mIsBlacklistedOrNotAWord; mIsBlacklistedOrNotAWord = dicNodeProp->mIsBlacklistedOrNotAWord;
mDepth = dicNodeProp->mDepth + 1; // Increment the depth of a passing child mDepth = dicNodeProp->mDepth + 1; // Increment the depth of a passing child
@ -121,7 +121,7 @@ class DicNodeProperties {
} }
bool isTerminal() const { bool isTerminal() const {
return mIsTerminal; return mWordId != NOT_A_WORD_ID;
} }
bool hasChildren() const { bool hasChildren() const {
@ -144,7 +144,7 @@ class DicNodeProperties {
int mChildrenPtNodeArrayPos; int mChildrenPtNodeArrayPos;
int mProbability; int mProbability;
int mDicNodeCodePoint; int mDicNodeCodePoint;
bool mIsTerminal; int mWordId;
bool mHasChildrenPtNodes; bool mHasChildrenPtNodes;
bool mIsBlacklistedOrNotAWord; bool mIsBlacklistedOrNotAWord;
uint16_t mDepth; uint16_t mDepth;

View file

@ -36,6 +36,7 @@ class UnigramProperty;
* This class abstracts the structure of dictionaries. * This class abstracts the structure of dictionaries.
* Implement this policy to support additional dictionaries. * Implement this policy to support additional dictionaries.
*/ */
// TODO: Use word id instead of terminal PtNode position.
class DictionaryStructureWithBufferPolicy { class DictionaryStructureWithBufferPolicy {
public: public:
typedef std::unique_ptr<DictionaryStructureWithBufferPolicy> StructurePolicyPtr; typedef std::unique_ptr<DictionaryStructureWithBufferPolicy> StructurePolicyPtr;

View file

@ -76,8 +76,9 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d
// Skip PtNodes that represent non-word information. // Skip PtNodes that represent non-word information.
continue; continue;
} }
const int wordId = isTerminal ? ptNodeParams.getHeadPos() : NOT_A_WORD_ID;
childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(), childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(),
ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal, ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), wordId,
ptNodeParams.hasChildren(), ptNodeParams.hasChildren(),
ptNodeParams.isBlacklisted() ptNodeParams.isBlacklisted()
|| ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */, || ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */,

View file

@ -55,6 +55,7 @@ class DicNodeVector;
namespace backward { namespace backward {
namespace v402 { namespace v402 {
// Word id = Position of a PtNode that represents the word.
class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
public: public:
Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers) Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers)

View file

@ -367,8 +367,8 @@ int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNod
&probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos); &probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
// Skip PtNodes that don't start with a Unicode code point because they represent non-word information. // Skip PtNodes that don't start with a Unicode code point because they represent non-word information.
if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) { if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability, const int wordId = PatriciaTrieReadingUtils::isTerminal(flags) ? ptNodePos : NOT_A_WORD_ID;
PatriciaTrieReadingUtils::isTerminal(flags), childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability, wordId,
PatriciaTrieReadingUtils::hasChildrenInFlags(flags), PatriciaTrieReadingUtils::hasChildrenInFlags(flags),
PatriciaTrieReadingUtils::isBlacklisted(flags) PatriciaTrieReadingUtils::isBlacklisted(flags)
|| PatriciaTrieReadingUtils::isNotAWord(flags), || PatriciaTrieReadingUtils::isNotAWord(flags),

View file

@ -36,6 +36,7 @@ namespace latinime {
class DicNode; class DicNode;
class DicNodeVector; class DicNodeVector;
// Word id = Position of a PtNode that represents the word.
class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
public: public:
PatriciaTriePolicy(MmappedBuffer::MmappedBufferPtr mmappedBuffer) PatriciaTriePolicy(MmappedBuffer::MmappedBufferPtr mmappedBuffer)

View file

@ -66,8 +66,9 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d
// Skip PtNodes that represent non-word information. // Skip PtNodes that represent non-word information.
continue; continue;
} }
const int wordId = isTerminal ? ptNodeParams.getTerminalId() : NOT_A_WORD_ID;
childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(), childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(),
ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal, ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), wordId,
ptNodeParams.hasChildren(), ptNodeParams.hasChildren(),
ptNodeParams.isBlacklisted() ptNodeParams.isBlacklisted()
|| ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */, || ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */,

View file

@ -36,6 +36,7 @@ namespace latinime {
class DicNode; class DicNode;
class DicNodeVector; class DicNodeVector;
// Word id = Artificial id that is stored in the PtNode looked up by the word.
class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
public: public:
Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers) Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers)