am b85bf4eb: Merge "Use word id to construct DicNode instead of isTerminal flag."
* commit 'b85bf4ebb80ad471b1f61ea92d971bbeffb77872': Use word id to construct DicNode instead of isTerminal flag.
main
commit
6302f53797
|
@ -299,6 +299,7 @@ static inline void prof_out(void) {
|
||||||
#define NOT_AN_INDEX (-1)
|
#define NOT_AN_INDEX (-1)
|
||||||
#define NOT_A_PROBABILITY (-1)
|
#define NOT_A_PROBABILITY (-1)
|
||||||
#define NOT_A_DICT_POS (S_INT_MIN)
|
#define NOT_A_DICT_POS (S_INT_MIN)
|
||||||
|
#define NOT_A_WORD_ID (S_INT_MIN)
|
||||||
#define NOT_A_TIMESTAMP (-1)
|
#define NOT_A_TIMESTAMP (-1)
|
||||||
#define NOT_A_LANGUAGE_WEIGHT (-1.0f)
|
#define NOT_A_LANGUAGE_WEIGHT (-1.0f)
|
||||||
|
|
||||||
|
|
|
@ -136,7 +136,7 @@ class DicNode {
|
||||||
}
|
}
|
||||||
|
|
||||||
void initAsChild(const DicNode *const dicNode, const int ptNodePos,
|
void initAsChild(const DicNode *const dicNode, const int ptNodePos,
|
||||||
const int childrenPtNodeArrayPos, const int probability, const bool isTerminal,
|
const int childrenPtNodeArrayPos, const int probability, const int wordId,
|
||||||
const bool hasChildren, const bool isBlacklistedOrNotAWord,
|
const bool hasChildren, const bool isBlacklistedOrNotAWord,
|
||||||
const uint16_t mergedNodeCodePointCount, const int *const mergedNodeCodePoints) {
|
const uint16_t mergedNodeCodePointCount, const int *const mergedNodeCodePoints) {
|
||||||
uint16_t newDepth = static_cast<uint16_t>(dicNode->getNodeCodePointCount() + 1);
|
uint16_t newDepth = static_cast<uint16_t>(dicNode->getNodeCodePointCount() + 1);
|
||||||
|
@ -144,7 +144,7 @@ class DicNode {
|
||||||
const uint16_t newLeavingDepth = static_cast<uint16_t>(
|
const uint16_t newLeavingDepth = static_cast<uint16_t>(
|
||||||
dicNode->mDicNodeProperties.getLeavingDepth() + mergedNodeCodePointCount);
|
dicNode->mDicNodeProperties.getLeavingDepth() + mergedNodeCodePointCount);
|
||||||
mDicNodeProperties.init(ptNodePos, childrenPtNodeArrayPos, mergedNodeCodePoints[0],
|
mDicNodeProperties.init(ptNodePos, childrenPtNodeArrayPos, mergedNodeCodePoints[0],
|
||||||
probability, isTerminal, hasChildren, isBlacklistedOrNotAWord, newDepth,
|
probability, wordId, hasChildren, isBlacklistedOrNotAWord, newDepth,
|
||||||
newLeavingDepth, dicNode->mDicNodeProperties.getPrevWordsTerminalPtNodePos());
|
newLeavingDepth, dicNode->mDicNodeProperties.getPrevWordsTerminalPtNodePos());
|
||||||
mDicNodeState.init(&dicNode->mDicNodeState, mergedNodeCodePointCount,
|
mDicNodeState.init(&dicNode->mDicNodeState, mergedNodeCodePointCount,
|
||||||
mergedNodeCodePoints);
|
mergedNodeCodePoints);
|
||||||
|
|
|
@ -59,13 +59,13 @@ class DicNodeVector {
|
||||||
}
|
}
|
||||||
|
|
||||||
void pushLeavingChild(const DicNode *const dicNode, const int ptNodePos,
|
void pushLeavingChild(const DicNode *const dicNode, const int ptNodePos,
|
||||||
const int childrenPtNodeArrayPos, const int probability, const bool isTerminal,
|
const int childrenPtNodeArrayPos, const int probability, const int wordId,
|
||||||
const bool hasChildren, const bool isBlacklistedOrNotAWord,
|
const bool hasChildren, const bool isBlacklistedOrNotAWord,
|
||||||
const uint16_t mergedNodeCodePointCount, const int *const mergedNodeCodePoints) {
|
const uint16_t mergedNodeCodePointCount, const int *const mergedNodeCodePoints) {
|
||||||
ASSERT(!mLock);
|
ASSERT(!mLock);
|
||||||
mDicNodes.emplace_back();
|
mDicNodes.emplace_back();
|
||||||
mDicNodes.back().initAsChild(dicNode, ptNodePos, childrenPtNodeArrayPos, probability,
|
mDicNodes.back().initAsChild(dicNode, ptNodePos, childrenPtNodeArrayPos, probability,
|
||||||
isTerminal, hasChildren, isBlacklistedOrNotAWord, mergedNodeCodePointCount,
|
wordId, hasChildren, isBlacklistedOrNotAWord, mergedNodeCodePointCount,
|
||||||
mergedNodeCodePoints);
|
mergedNodeCodePoints);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -31,20 +31,20 @@ class DicNodeProperties {
|
||||||
AK_FORCE_INLINE DicNodeProperties()
|
AK_FORCE_INLINE DicNodeProperties()
|
||||||
: mPtNodePos(NOT_A_DICT_POS), mChildrenPtNodeArrayPos(NOT_A_DICT_POS),
|
: mPtNodePos(NOT_A_DICT_POS), mChildrenPtNodeArrayPos(NOT_A_DICT_POS),
|
||||||
mProbability(NOT_A_PROBABILITY), mDicNodeCodePoint(NOT_A_CODE_POINT),
|
mProbability(NOT_A_PROBABILITY), mDicNodeCodePoint(NOT_A_CODE_POINT),
|
||||||
mIsTerminal(false), mHasChildrenPtNodes(false),
|
mWordId(NOT_A_WORD_ID), mHasChildrenPtNodes(false),
|
||||||
mIsBlacklistedOrNotAWord(false), mDepth(0), mLeavingDepth(0) {}
|
mIsBlacklistedOrNotAWord(false), mDepth(0), mLeavingDepth(0) {}
|
||||||
|
|
||||||
~DicNodeProperties() {}
|
~DicNodeProperties() {}
|
||||||
|
|
||||||
// Should be called only once per DicNode initialization.
|
// Should be called only once per DicNode initialization.
|
||||||
void init(const int pos, const int childrenPos, const int nodeCodePoint, const int probability,
|
void init(const int pos, const int childrenPos, const int nodeCodePoint, const int probability,
|
||||||
const bool isTerminal, const bool hasChildren, const bool isBlacklistedOrNotAWord,
|
const int wordId, const bool hasChildren, const bool isBlacklistedOrNotAWord,
|
||||||
const uint16_t depth, const uint16_t leavingDepth, const int *const prevWordsNodePos) {
|
const uint16_t depth, const uint16_t leavingDepth, const int *const prevWordsNodePos) {
|
||||||
mPtNodePos = pos;
|
mPtNodePos = pos;
|
||||||
mChildrenPtNodeArrayPos = childrenPos;
|
mChildrenPtNodeArrayPos = childrenPos;
|
||||||
mDicNodeCodePoint = nodeCodePoint;
|
mDicNodeCodePoint = nodeCodePoint;
|
||||||
mProbability = probability;
|
mProbability = probability;
|
||||||
mIsTerminal = isTerminal;
|
mWordId = wordId;
|
||||||
mHasChildrenPtNodes = hasChildren;
|
mHasChildrenPtNodes = hasChildren;
|
||||||
mIsBlacklistedOrNotAWord = isBlacklistedOrNotAWord;
|
mIsBlacklistedOrNotAWord = isBlacklistedOrNotAWord;
|
||||||
mDepth = depth;
|
mDepth = depth;
|
||||||
|
@ -58,7 +58,7 @@ class DicNodeProperties {
|
||||||
mChildrenPtNodeArrayPos = rootPtNodeArrayPos;
|
mChildrenPtNodeArrayPos = rootPtNodeArrayPos;
|
||||||
mDicNodeCodePoint = NOT_A_CODE_POINT;
|
mDicNodeCodePoint = NOT_A_CODE_POINT;
|
||||||
mProbability = NOT_A_PROBABILITY;
|
mProbability = NOT_A_PROBABILITY;
|
||||||
mIsTerminal = false;
|
mWordId = NOT_A_WORD_ID;
|
||||||
mHasChildrenPtNodes = true;
|
mHasChildrenPtNodes = true;
|
||||||
mIsBlacklistedOrNotAWord = false;
|
mIsBlacklistedOrNotAWord = false;
|
||||||
mDepth = 0;
|
mDepth = 0;
|
||||||
|
@ -71,7 +71,7 @@ class DicNodeProperties {
|
||||||
mChildrenPtNodeArrayPos = dicNodeProp->mChildrenPtNodeArrayPos;
|
mChildrenPtNodeArrayPos = dicNodeProp->mChildrenPtNodeArrayPos;
|
||||||
mDicNodeCodePoint = dicNodeProp->mDicNodeCodePoint;
|
mDicNodeCodePoint = dicNodeProp->mDicNodeCodePoint;
|
||||||
mProbability = dicNodeProp->mProbability;
|
mProbability = dicNodeProp->mProbability;
|
||||||
mIsTerminal = dicNodeProp->mIsTerminal;
|
mWordId = dicNodeProp->mWordId;
|
||||||
mHasChildrenPtNodes = dicNodeProp->mHasChildrenPtNodes;
|
mHasChildrenPtNodes = dicNodeProp->mHasChildrenPtNodes;
|
||||||
mIsBlacklistedOrNotAWord = dicNodeProp->mIsBlacklistedOrNotAWord;
|
mIsBlacklistedOrNotAWord = dicNodeProp->mIsBlacklistedOrNotAWord;
|
||||||
mDepth = dicNodeProp->mDepth;
|
mDepth = dicNodeProp->mDepth;
|
||||||
|
@ -86,7 +86,7 @@ class DicNodeProperties {
|
||||||
mChildrenPtNodeArrayPos = dicNodeProp->mChildrenPtNodeArrayPos;
|
mChildrenPtNodeArrayPos = dicNodeProp->mChildrenPtNodeArrayPos;
|
||||||
mDicNodeCodePoint = codePoint; // Overwrite the node char of a passing child
|
mDicNodeCodePoint = codePoint; // Overwrite the node char of a passing child
|
||||||
mProbability = dicNodeProp->mProbability;
|
mProbability = dicNodeProp->mProbability;
|
||||||
mIsTerminal = dicNodeProp->mIsTerminal;
|
mWordId = dicNodeProp->mWordId;
|
||||||
mHasChildrenPtNodes = dicNodeProp->mHasChildrenPtNodes;
|
mHasChildrenPtNodes = dicNodeProp->mHasChildrenPtNodes;
|
||||||
mIsBlacklistedOrNotAWord = dicNodeProp->mIsBlacklistedOrNotAWord;
|
mIsBlacklistedOrNotAWord = dicNodeProp->mIsBlacklistedOrNotAWord;
|
||||||
mDepth = dicNodeProp->mDepth + 1; // Increment the depth of a passing child
|
mDepth = dicNodeProp->mDepth + 1; // Increment the depth of a passing child
|
||||||
|
@ -121,7 +121,7 @@ class DicNodeProperties {
|
||||||
}
|
}
|
||||||
|
|
||||||
bool isTerminal() const {
|
bool isTerminal() const {
|
||||||
return mIsTerminal;
|
return mWordId != NOT_A_WORD_ID;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool hasChildren() const {
|
bool hasChildren() const {
|
||||||
|
@ -144,7 +144,7 @@ class DicNodeProperties {
|
||||||
int mChildrenPtNodeArrayPos;
|
int mChildrenPtNodeArrayPos;
|
||||||
int mProbability;
|
int mProbability;
|
||||||
int mDicNodeCodePoint;
|
int mDicNodeCodePoint;
|
||||||
bool mIsTerminal;
|
int mWordId;
|
||||||
bool mHasChildrenPtNodes;
|
bool mHasChildrenPtNodes;
|
||||||
bool mIsBlacklistedOrNotAWord;
|
bool mIsBlacklistedOrNotAWord;
|
||||||
uint16_t mDepth;
|
uint16_t mDepth;
|
||||||
|
|
|
@ -36,6 +36,7 @@ class UnigramProperty;
|
||||||
* This class abstracts the structure of dictionaries.
|
* This class abstracts the structure of dictionaries.
|
||||||
* Implement this policy to support additional dictionaries.
|
* Implement this policy to support additional dictionaries.
|
||||||
*/
|
*/
|
||||||
|
// TODO: Use word id instead of terminal PtNode position.
|
||||||
class DictionaryStructureWithBufferPolicy {
|
class DictionaryStructureWithBufferPolicy {
|
||||||
public:
|
public:
|
||||||
typedef std::unique_ptr<DictionaryStructureWithBufferPolicy> StructurePolicyPtr;
|
typedef std::unique_ptr<DictionaryStructureWithBufferPolicy> StructurePolicyPtr;
|
||||||
|
|
|
@ -76,8 +76,9 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d
|
||||||
// Skip PtNodes that represent non-word information.
|
// Skip PtNodes that represent non-word information.
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
const int wordId = isTerminal ? ptNodeParams.getHeadPos() : NOT_A_WORD_ID;
|
||||||
childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(),
|
childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(),
|
||||||
ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal,
|
ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), wordId,
|
||||||
ptNodeParams.hasChildren(),
|
ptNodeParams.hasChildren(),
|
||||||
ptNodeParams.isBlacklisted()
|
ptNodeParams.isBlacklisted()
|
||||||
|| ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */,
|
|| ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */,
|
||||||
|
|
|
@ -55,6 +55,7 @@ class DicNodeVector;
|
||||||
namespace backward {
|
namespace backward {
|
||||||
namespace v402 {
|
namespace v402 {
|
||||||
|
|
||||||
|
// Word id = Position of a PtNode that represents the word.
|
||||||
class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
public:
|
public:
|
||||||
Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers)
|
Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers)
|
||||||
|
|
|
@ -367,8 +367,8 @@ int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNod
|
||||||
&probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
|
&probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
|
||||||
// Skip PtNodes that don't start with a Unicode code point because they represent non-word information.
|
// Skip PtNodes that don't start with a Unicode code point because they represent non-word information.
|
||||||
if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
|
if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
|
||||||
childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability,
|
const int wordId = PatriciaTrieReadingUtils::isTerminal(flags) ? ptNodePos : NOT_A_WORD_ID;
|
||||||
PatriciaTrieReadingUtils::isTerminal(flags),
|
childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability, wordId,
|
||||||
PatriciaTrieReadingUtils::hasChildrenInFlags(flags),
|
PatriciaTrieReadingUtils::hasChildrenInFlags(flags),
|
||||||
PatriciaTrieReadingUtils::isBlacklisted(flags)
|
PatriciaTrieReadingUtils::isBlacklisted(flags)
|
||||||
|| PatriciaTrieReadingUtils::isNotAWord(flags),
|
|| PatriciaTrieReadingUtils::isNotAWord(flags),
|
||||||
|
|
|
@ -36,6 +36,7 @@ namespace latinime {
|
||||||
class DicNode;
|
class DicNode;
|
||||||
class DicNodeVector;
|
class DicNodeVector;
|
||||||
|
|
||||||
|
// Word id = Position of a PtNode that represents the word.
|
||||||
class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
public:
|
public:
|
||||||
PatriciaTriePolicy(MmappedBuffer::MmappedBufferPtr mmappedBuffer)
|
PatriciaTriePolicy(MmappedBuffer::MmappedBufferPtr mmappedBuffer)
|
||||||
|
|
|
@ -66,8 +66,9 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d
|
||||||
// Skip PtNodes that represent non-word information.
|
// Skip PtNodes that represent non-word information.
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
const int wordId = isTerminal ? ptNodeParams.getTerminalId() : NOT_A_WORD_ID;
|
||||||
childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(),
|
childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(),
|
||||||
ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal,
|
ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), wordId,
|
||||||
ptNodeParams.hasChildren(),
|
ptNodeParams.hasChildren(),
|
||||||
ptNodeParams.isBlacklisted()
|
ptNodeParams.isBlacklisted()
|
||||||
|| ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */,
|
|| ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */,
|
||||||
|
|
|
@ -36,6 +36,7 @@ namespace latinime {
|
||||||
class DicNode;
|
class DicNode;
|
||||||
class DicNodeVector;
|
class DicNodeVector;
|
||||||
|
|
||||||
|
// Word id = Artificial id that is stored in the PtNode looked up by the word.
|
||||||
class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
|
||||||
public:
|
public:
|
||||||
Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers)
|
Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers)
|
||||||
|
|
Loading…
Reference in New Issue