am b85bf4eb: Merge "Use word id to construct DicNode instead of isTerminal flag."
* commit 'b85bf4ebb80ad471b1f61ea92d971bbeffb77872': Use word id to construct DicNode instead of isTerminal flag.
This commit is contained in:
commit
6302f53797
11 changed files with 23 additions and 16 deletions
|
@ -299,6 +299,7 @@ static inline void prof_out(void) {
|
|||
#define NOT_AN_INDEX (-1)
|
||||
#define NOT_A_PROBABILITY (-1)
|
||||
#define NOT_A_DICT_POS (S_INT_MIN)
|
||||
#define NOT_A_WORD_ID (S_INT_MIN)
|
||||
#define NOT_A_TIMESTAMP (-1)
|
||||
#define NOT_A_LANGUAGE_WEIGHT (-1.0f)
|
||||
|
||||
|
|
|
@ -136,7 +136,7 @@ class DicNode {
|
|||
}
|
||||
|
||||
void initAsChild(const DicNode *const dicNode, const int ptNodePos,
|
||||
const int childrenPtNodeArrayPos, const int probability, const bool isTerminal,
|
||||
const int childrenPtNodeArrayPos, const int probability, const int wordId,
|
||||
const bool hasChildren, const bool isBlacklistedOrNotAWord,
|
||||
const uint16_t mergedNodeCodePointCount, const int *const mergedNodeCodePoints) {
|
||||
uint16_t newDepth = static_cast<uint16_t>(dicNode->getNodeCodePointCount() + 1);
|
||||
|
@ -144,7 +144,7 @@ class DicNode {
|
|||
const uint16_t newLeavingDepth = static_cast<uint16_t>(
|
||||
dicNode->mDicNodeProperties.getLeavingDepth() + mergedNodeCodePointCount);
|
||||
mDicNodeProperties.init(ptNodePos, childrenPtNodeArrayPos, mergedNodeCodePoints[0],
|
||||
probability, isTerminal, hasChildren, isBlacklistedOrNotAWord, newDepth,
|
||||
probability, wordId, hasChildren, isBlacklistedOrNotAWord, newDepth,
|
||||
newLeavingDepth, dicNode->mDicNodeProperties.getPrevWordsTerminalPtNodePos());
|
||||
mDicNodeState.init(&dicNode->mDicNodeState, mergedNodeCodePointCount,
|
||||
mergedNodeCodePoints);
|
||||
|
|
|
@ -59,13 +59,13 @@ class DicNodeVector {
|
|||
}
|
||||
|
||||
void pushLeavingChild(const DicNode *const dicNode, const int ptNodePos,
|
||||
const int childrenPtNodeArrayPos, const int probability, const bool isTerminal,
|
||||
const int childrenPtNodeArrayPos, const int probability, const int wordId,
|
||||
const bool hasChildren, const bool isBlacklistedOrNotAWord,
|
||||
const uint16_t mergedNodeCodePointCount, const int *const mergedNodeCodePoints) {
|
||||
ASSERT(!mLock);
|
||||
mDicNodes.emplace_back();
|
||||
mDicNodes.back().initAsChild(dicNode, ptNodePos, childrenPtNodeArrayPos, probability,
|
||||
isTerminal, hasChildren, isBlacklistedOrNotAWord, mergedNodeCodePointCount,
|
||||
wordId, hasChildren, isBlacklistedOrNotAWord, mergedNodeCodePointCount,
|
||||
mergedNodeCodePoints);
|
||||
}
|
||||
|
||||
|
|
|
@ -31,20 +31,20 @@ class DicNodeProperties {
|
|||
AK_FORCE_INLINE DicNodeProperties()
|
||||
: mPtNodePos(NOT_A_DICT_POS), mChildrenPtNodeArrayPos(NOT_A_DICT_POS),
|
||||
mProbability(NOT_A_PROBABILITY), mDicNodeCodePoint(NOT_A_CODE_POINT),
|
||||
mIsTerminal(false), mHasChildrenPtNodes(false),
|
||||
mWordId(NOT_A_WORD_ID), mHasChildrenPtNodes(false),
|
||||
mIsBlacklistedOrNotAWord(false), mDepth(0), mLeavingDepth(0) {}
|
||||
|
||||
~DicNodeProperties() {}
|
||||
|
||||
// Should be called only once per DicNode initialization.
|
||||
void init(const int pos, const int childrenPos, const int nodeCodePoint, const int probability,
|
||||
const bool isTerminal, const bool hasChildren, const bool isBlacklistedOrNotAWord,
|
||||
const int wordId, const bool hasChildren, const bool isBlacklistedOrNotAWord,
|
||||
const uint16_t depth, const uint16_t leavingDepth, const int *const prevWordsNodePos) {
|
||||
mPtNodePos = pos;
|
||||
mChildrenPtNodeArrayPos = childrenPos;
|
||||
mDicNodeCodePoint = nodeCodePoint;
|
||||
mProbability = probability;
|
||||
mIsTerminal = isTerminal;
|
||||
mWordId = wordId;
|
||||
mHasChildrenPtNodes = hasChildren;
|
||||
mIsBlacklistedOrNotAWord = isBlacklistedOrNotAWord;
|
||||
mDepth = depth;
|
||||
|
@ -58,7 +58,7 @@ class DicNodeProperties {
|
|||
mChildrenPtNodeArrayPos = rootPtNodeArrayPos;
|
||||
mDicNodeCodePoint = NOT_A_CODE_POINT;
|
||||
mProbability = NOT_A_PROBABILITY;
|
||||
mIsTerminal = false;
|
||||
mWordId = NOT_A_WORD_ID;
|
||||
mHasChildrenPtNodes = true;
|
||||
mIsBlacklistedOrNotAWord = false;
|
||||
mDepth = 0;
|
||||
|
@ -71,7 +71,7 @@ class DicNodeProperties {
|
|||
mChildrenPtNodeArrayPos = dicNodeProp->mChildrenPtNodeArrayPos;
|
||||
mDicNodeCodePoint = dicNodeProp->mDicNodeCodePoint;
|
||||
mProbability = dicNodeProp->mProbability;
|
||||
mIsTerminal = dicNodeProp->mIsTerminal;
|
||||
mWordId = dicNodeProp->mWordId;
|
||||
mHasChildrenPtNodes = dicNodeProp->mHasChildrenPtNodes;
|
||||
mIsBlacklistedOrNotAWord = dicNodeProp->mIsBlacklistedOrNotAWord;
|
||||
mDepth = dicNodeProp->mDepth;
|
||||
|
@ -86,7 +86,7 @@ class DicNodeProperties {
|
|||
mChildrenPtNodeArrayPos = dicNodeProp->mChildrenPtNodeArrayPos;
|
||||
mDicNodeCodePoint = codePoint; // Overwrite the node char of a passing child
|
||||
mProbability = dicNodeProp->mProbability;
|
||||
mIsTerminal = dicNodeProp->mIsTerminal;
|
||||
mWordId = dicNodeProp->mWordId;
|
||||
mHasChildrenPtNodes = dicNodeProp->mHasChildrenPtNodes;
|
||||
mIsBlacklistedOrNotAWord = dicNodeProp->mIsBlacklistedOrNotAWord;
|
||||
mDepth = dicNodeProp->mDepth + 1; // Increment the depth of a passing child
|
||||
|
@ -121,7 +121,7 @@ class DicNodeProperties {
|
|||
}
|
||||
|
||||
bool isTerminal() const {
|
||||
return mIsTerminal;
|
||||
return mWordId != NOT_A_WORD_ID;
|
||||
}
|
||||
|
||||
bool hasChildren() const {
|
||||
|
@ -144,7 +144,7 @@ class DicNodeProperties {
|
|||
int mChildrenPtNodeArrayPos;
|
||||
int mProbability;
|
||||
int mDicNodeCodePoint;
|
||||
bool mIsTerminal;
|
||||
int mWordId;
|
||||
bool mHasChildrenPtNodes;
|
||||
bool mIsBlacklistedOrNotAWord;
|
||||
uint16_t mDepth;
|
||||
|
|
|
@ -36,6 +36,7 @@ class UnigramProperty;
|
|||
* This class abstracts the structure of dictionaries.
|
||||
* Implement this policy to support additional dictionaries.
|
||||
*/
|
||||
// TODO: Use word id instead of terminal PtNode position.
|
||||
class DictionaryStructureWithBufferPolicy {
|
||||
public:
|
||||
typedef std::unique_ptr<DictionaryStructureWithBufferPolicy> StructurePolicyPtr;
|
||||
|
|
|
@ -76,8 +76,9 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d
|
|||
// Skip PtNodes that represent non-word information.
|
||||
continue;
|
||||
}
|
||||
const int wordId = isTerminal ? ptNodeParams.getHeadPos() : NOT_A_WORD_ID;
|
||||
childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(),
|
||||
ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal,
|
||||
ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), wordId,
|
||||
ptNodeParams.hasChildren(),
|
||||
ptNodeParams.isBlacklisted()
|
||||
|| ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */,
|
||||
|
|
|
@ -55,6 +55,7 @@ class DicNodeVector;
|
|||
namespace backward {
|
||||
namespace v402 {
|
||||
|
||||
// Word id = Position of a PtNode that represents the word.
|
||||
class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||
public:
|
||||
Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers)
|
||||
|
|
|
@ -367,8 +367,8 @@ int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNod
|
|||
&probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
|
||||
// Skip PtNodes that don't start with a Unicode code point because they represent non-word information.
|
||||
if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
|
||||
childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability,
|
||||
PatriciaTrieReadingUtils::isTerminal(flags),
|
||||
const int wordId = PatriciaTrieReadingUtils::isTerminal(flags) ? ptNodePos : NOT_A_WORD_ID;
|
||||
childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability, wordId,
|
||||
PatriciaTrieReadingUtils::hasChildrenInFlags(flags),
|
||||
PatriciaTrieReadingUtils::isBlacklisted(flags)
|
||||
|| PatriciaTrieReadingUtils::isNotAWord(flags),
|
||||
|
|
|
@ -36,6 +36,7 @@ namespace latinime {
|
|||
class DicNode;
|
||||
class DicNodeVector;
|
||||
|
||||
// Word id = Position of a PtNode that represents the word.
|
||||
class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||
public:
|
||||
PatriciaTriePolicy(MmappedBuffer::MmappedBufferPtr mmappedBuffer)
|
||||
|
|
|
@ -66,8 +66,9 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d
|
|||
// Skip PtNodes that represent non-word information.
|
||||
continue;
|
||||
}
|
||||
const int wordId = isTerminal ? ptNodeParams.getTerminalId() : NOT_A_WORD_ID;
|
||||
childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(),
|
||||
ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal,
|
||||
ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), wordId,
|
||||
ptNodeParams.hasChildren(),
|
||||
ptNodeParams.isBlacklisted()
|
||||
|| ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */,
|
||||
|
|
|
@ -36,6 +36,7 @@ namespace latinime {
|
|||
class DicNode;
|
||||
class DicNodeVector;
|
||||
|
||||
// Word id = Artificial id that is stored in the PtNode looked up by the word.
|
||||
class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||
public:
|
||||
Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers)
|
||||
|
|
Loading…
Reference in a new issue