Use WordAttributes for checking flags.

Bug: 14425059
Change-Id: Idee84478a482a0e7b5cc53e5dbd4e2484584ba79
Parent: 2111e3abc9
Commit: 87a5c76906
8 changed files with 22 additions and 41 deletions
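In short, this change stops threading a per-node isBlacklistedOrNotAWord boolean through DicNode, DicNodeVector, and DicNodeProperties. Code that needs those flags now asks the dictionary structure policy for a WordAttributes object via getWordAttributesInContext(prevWordIds, wordId, multiBigramMap) and reads the flags from it. As a rough mental model, a minimal sketch of such a flags bundle might look like the following; this is an illustration of the idea, not the actual AOSP WordAttributes class.

#include <iostream>

// Minimal sketch of a WordAttributes-style value type, limited to the four
// accessors this diff relies on. Illustrative only; not the real AOSP class.
class WordAttributes {
 public:
    WordAttributes(const int probability, const bool isBlacklisted,
            const bool isNotAWord, const bool isPossiblyOffensive)
            : mProbability(probability), mIsBlacklisted(isBlacklisted),
              mIsNotAWord(isNotAWord), mIsPossiblyOffensive(isPossiblyOffensive) {}

    int getProbability() const { return mProbability; }
    bool isBlacklisted() const { return mIsBlacklisted; }
    bool isNotAWord() const { return mIsNotAWord; }
    bool isPossiblyOffensive() const { return mIsPossiblyOffensive; }

 private:
    int mProbability;
    bool mIsBlacklisted;
    bool mIsNotAWord;
    bool mIsPossiblyOffensive;
};

int main() {
    const WordAttributes attrs(42, false /* blacklisted */, true /* notAWord */,
            false /* possiblyOffensive */);
    // A word may only be suggested when it is neither blacklisted nor a non-word.
    std::cout << std::boolalpha
              << !(attrs.isBlacklisted() || attrs.isNotAWord()) << std::endl;
    return 0;
}
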
@@ -136,14 +136,14 @@ class DicNode {
     }

     void initAsChild(const DicNode *const dicNode, const int childrenPtNodeArrayPos,
-            const int unigramProbability, const int wordId, const bool isBlacklistedOrNotAWord,
-            const uint16_t mergedNodeCodePointCount, const int *const mergedNodeCodePoints) {
+            const int unigramProbability, const int wordId, const uint16_t mergedNodeCodePointCount,
+            const int *const mergedNodeCodePoints) {
         uint16_t newDepth = static_cast<uint16_t>(dicNode->getNodeCodePointCount() + 1);
         mIsCachedForNextSuggestion = dicNode->mIsCachedForNextSuggestion;
         const uint16_t newLeavingDepth = static_cast<uint16_t>(
                 dicNode->mDicNodeProperties.getLeavingDepth() + mergedNodeCodePointCount);
         mDicNodeProperties.init(childrenPtNodeArrayPos, mergedNodeCodePoints[0],
-                unigramProbability, wordId, isBlacklistedOrNotAWord, newDepth, newLeavingDepth,
+                unigramProbability, wordId, newDepth, newLeavingDepth,
                 dicNode->mDicNodeProperties.getPrevWordIds());
         mDicNodeState.init(&dicNode->mDicNodeState, mergedNodeCodePointCount,
                 mergedNodeCodePoints);
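Note that initAsChild() loses the isBlacklistedOrNotAWord parameter entirely; the flag no longer has to be plumbed from the traversal code into every child node it creates.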

@@ -178,9 +178,6 @@ class DicNode {
     // Check if the current word and the previous word can be considered as a valid multiple word
     // suggestion.
     bool isValidMultipleWordSuggestion() const {
-        if (isBlacklistedOrNotAWord()) {
-            return false;
-        }
         // Treat suggestion as invalid if the current and the previous word are single character
         // words.
         const int prevWordLen = mDicNodeState.mDicNodeStateOutput.getPrevWordsLength()

@@ -404,10 +401,6 @@ class DicNode {
         return mDicNodeState.mDicNodeStateScoring.getContainedErrorTypes();
     }

-    bool isBlacklistedOrNotAWord() const {
-        return mDicNodeProperties.isBlacklistedOrNotAWord();
-    }
-
     inline uint16_t getNodeCodePointCount() const {
         return mDicNodeProperties.getDepth();
     }

@@ -74,6 +74,10 @@ namespace latinime {
     }
     const WordAttributes wordAttributes = dictionaryStructurePolicy->getWordAttributesInContext(
             dicNode->getPrevWordIds(), dicNode->getWordId(), multiBigramMap);
+    if (dicNode->hasMultipleWords()
+            && (wordAttributes.isBlacklisted() || wordAttributes.isNotAWord())) {
+        return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
+    }
     // TODO: This equation to calculate the improbability looks unreasonable. Investigate this.
     const float cost = static_cast<float>(MAX_PROBABILITY - wordAttributes.getProbability())
             / static_cast<float>(MAX_PROBABILITY);
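The four added lines make the weighting helper reject any multi-word path whose current word is blacklisted or not a real word by charging it the maximum weighting cost; single words are still filtered later, at output time. The sketch below isolates that rule. MAX_PROBABILITY and MAX_VALUE_FOR_WEIGHTING are real constants in these sources, but the values used here are assumptions chosen so the example runs.

#include <cstdio>

// Stand-ins for the real constants; the values are assumptions for the sake
// of a runnable example, not the values from the LatinIME headers.
static const int MAX_PROBABILITY = 255;
static const float MAX_VALUE_FOR_WEIGHTING = 10000000.0f;

// Sketch of the new rule: blocked multi-word paths pay a prohibitive cost,
// everything else pays its "improbability" in [0.0, 1.0].
static float getWordCost(const bool hasMultipleWords,
        const bool isBlacklistedOrNotAWord, const int probability) {
    if (hasMultipleWords && isBlacklistedOrNotAWord) {
        return MAX_VALUE_FOR_WEIGHTING;
    }
    return static_cast<float>(MAX_PROBABILITY - probability)
            / static_cast<float>(MAX_PROBABILITY);
}

int main() {
    std::printf("%f\n", getWordCost(false, true, 200));  // single word: ~0.22
    std::printf("%f\n", getWordCost(true, true, 200));   // multi-word: blocked
    return 0;
}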

@@ -59,12 +59,12 @@ class DicNodeVector {
     }

     void pushLeavingChild(const DicNode *const dicNode, const int childrenPtNodeArrayPos,
-            const int unigramProbability, const int wordId, const bool isBlacklistedOrNotAWord,
+            const int unigramProbability, const int wordId,
             const uint16_t mergedNodeCodePointCount, const int *const mergedNodeCodePoints) {
         ASSERT(!mLock);
         mDicNodes.emplace_back();
         mDicNodes.back().initAsChild(dicNode, childrenPtNodeArrayPos, unigramProbability,
-                wordId, isBlacklistedOrNotAWord, mergedNodeCodePointCount, mergedNodeCodePoints);
+                wordId, mergedNodeCodePointCount, mergedNodeCodePoints);
     }

     DicNode *operator[](const int id) {
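DicNodeVector::pushLeavingChild() mirrors the DicNode change: it simply forwards one argument fewer, which is why each dictionary-policy call site later in this diff shrinks by two lines.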

@@ -30,20 +30,19 @@ class DicNodeProperties {
 public:
     AK_FORCE_INLINE DicNodeProperties()
             : mChildrenPtNodeArrayPos(NOT_A_DICT_POS), mUnigramProbability(NOT_A_PROBABILITY),
-              mDicNodeCodePoint(NOT_A_CODE_POINT), mWordId(NOT_A_WORD_ID),
-              mIsBlacklistedOrNotAWord(false), mDepth(0), mLeavingDepth(0) {}
+              mDicNodeCodePoint(NOT_A_CODE_POINT), mWordId(NOT_A_WORD_ID), mDepth(0),
+              mLeavingDepth(0) {}

     ~DicNodeProperties() {}

     // Should be called only once per DicNode is initialized.
     void init(const int childrenPos, const int nodeCodePoint, const int unigramProbability,
-            const int wordId, const bool isBlacklistedOrNotAWord, const uint16_t depth,
-            const uint16_t leavingDepth, const int *const prevWordIds) {
+            const int wordId, const uint16_t depth, const uint16_t leavingDepth,
+            const int *const prevWordIds) {
         mChildrenPtNodeArrayPos = childrenPos;
         mDicNodeCodePoint = nodeCodePoint;
         mUnigramProbability = unigramProbability;
         mWordId = wordId;
-        mIsBlacklistedOrNotAWord = isBlacklistedOrNotAWord;
         mDepth = depth;
         mLeavingDepth = leavingDepth;
         memmove(mPrevWordIds, prevWordIds, sizeof(mPrevWordIds));

@@ -55,7 +54,6 @@ class DicNodeProperties {
         mDicNodeCodePoint = NOT_A_CODE_POINT;
         mUnigramProbability = NOT_A_PROBABILITY;
         mWordId = NOT_A_WORD_ID;
-        mIsBlacklistedOrNotAWord = false;
         mDepth = 0;
         mLeavingDepth = 0;
         memmove(mPrevWordIds, prevWordIds, sizeof(mPrevWordIds));

@@ -66,7 +64,6 @@ class DicNodeProperties {
         mDicNodeCodePoint = dicNodeProp->mDicNodeCodePoint;
         mUnigramProbability = dicNodeProp->mUnigramProbability;
         mWordId = dicNodeProp->mWordId;
-        mIsBlacklistedOrNotAWord = dicNodeProp->mIsBlacklistedOrNotAWord;
         mDepth = dicNodeProp->mDepth;
         mLeavingDepth = dicNodeProp->mLeavingDepth;
         memmove(mPrevWordIds, dicNodeProp->mPrevWordIds, sizeof(mPrevWordIds));

@@ -78,7 +75,6 @@ class DicNodeProperties {
         mDicNodeCodePoint = codePoint; // Overwrite the node char of a passing child
         mUnigramProbability = dicNodeProp->mUnigramProbability;
         mWordId = dicNodeProp->mWordId;
-        mIsBlacklistedOrNotAWord = dicNodeProp->mIsBlacklistedOrNotAWord;
         mDepth = dicNodeProp->mDepth + 1; // Increment the depth of a passing child
         mLeavingDepth = dicNodeProp->mLeavingDepth;
         memmove(mPrevWordIds, dicNodeProp->mPrevWordIds, sizeof(mPrevWordIds));

@@ -113,10 +109,6 @@ class DicNodeProperties {
         return (mChildrenPtNodeArrayPos != NOT_A_DICT_POS) || mDepth != mLeavingDepth;
     }

-    bool isBlacklistedOrNotAWord() const {
-        return mIsBlacklistedOrNotAWord;
-    }
-
     const int *getPrevWordIds() const {
         return mPrevWordIds;
     }

@@ -134,8 +126,6 @@ class DicNodeProperties {
     int mUnigramProbability;
     int mDicNodeCodePoint;
     int mWordId;
-    // TODO: Remove
-    bool mIsBlacklistedOrNotAWord;
     uint16_t mDepth;
     uint16_t mLeavingDepth;
     int mPrevWordIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM];

@@ -85,9 +85,9 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
             scoringPolicy->getDoubleLetterDemotionDistanceCost(terminalDicNode);
     const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight)
             + doubleLetterCost;
-    const bool isPossiblyOffensiveWord =
-            traverseSession->getDictionaryStructurePolicy()->getProbability(
-                    terminalDicNode->getUnigramProbability(), NOT_A_PROBABILITY) <= 0;
+    const WordAttributes wordAttributes = traverseSession->getDictionaryStructurePolicy()
+            ->getWordAttributesInContext(terminalDicNode->getPrevWordIds(),
+                    terminalDicNode->getWordId(), nullptr /* multiBigramMap */);
     const bool isExactMatch =
             ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes());
     const bool isExactMatchWithIntentionalOmission =

@@ -97,19 +97,19 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
     // Heuristic: We exclude probability=0 first-char-uppercase words from exact match.
     // (e.g. "AMD" and "and")
     const bool isSafeExactMatch = isExactMatch
-            && !(isPossiblyOffensiveWord && isFirstCharUppercase);
+            && !(wordAttributes.isPossiblyOffensive() && isFirstCharUppercase);
     const int outputTypeFlags =
-            (isPossiblyOffensiveWord ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0)
+            (wordAttributes.isPossiblyOffensive() ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0)
             | ((isSafeExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0)
             | (isExactMatchWithIntentionalOmission ?
                     Dictionary::KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION : 0);

     // Entries that are blacklisted or do not represent a word should not be output.
-    const bool isValidWord = !terminalDicNode->isBlacklistedOrNotAWord();
+    const bool isValidWord = !(wordAttributes.isBlacklisted() || wordAttributes.isNotAWord());
     // When we have to block offensive words, non-exact matched offensive words should not be
     // output.
     const bool blockOffensiveWords = traverseSession->getSuggestOptions()->blockOffensiveWords();
-    const bool isBlockedOffensiveWord = blockOffensiveWords && isPossiblyOffensiveWord
+    const bool isBlockedOffensiveWord = blockOffensiveWords && wordAttributes.isPossiblyOffensive()
             && !isSafeExactMatch;

     // Increase output score of top typing suggestion to ensure autocorrection.
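After this hunk, the output stage derives all three decisions (offensiveness flagging, exact-match safety, and validity) from the same WordAttributes lookup instead of mixing a probability heuristic with a DicNode flag. Below is a condensed sketch of that decision, with a plain struct standing in for WordAttributes and made-up flag values standing in for the real Dictionary::KIND_FLAG_* constants.

#include <cstdio>

// Plain stand-in for WordAttributes; the real object comes from the
// dictionary structure policy.
struct WordFlags {
    bool blacklisted;
    bool notAWord;
    bool possiblyOffensive;
};

// Made-up values; the real constants are Dictionary::KIND_FLAG_*.
static const int KIND_FLAG_POSSIBLY_OFFENSIVE = 1 << 30;
static const int KIND_FLAG_EXACT_MATCH = 1 << 29;

// Condensed sketch of the post-change logic from the hunk above.
static bool shouldOutputWord(const WordFlags &wa, const bool isSafeExactMatch,
        const bool boostExactMatches, const bool blockOffensiveWords,
        int *const outTypeFlags) {
    *outTypeFlags = (wa.possiblyOffensive ? KIND_FLAG_POSSIBLY_OFFENSIVE : 0)
            | ((isSafeExactMatch && boostExactMatches) ? KIND_FLAG_EXACT_MATCH : 0);
    // Blacklisted entries and non-words are never output.
    const bool isValidWord = !(wa.blacklisted || wa.notAWord);
    // Offensive words are suppressed unless they are safe exact matches.
    const bool isBlockedOffensiveWord =
            blockOffensiveWords && wa.possiblyOffensive && !isSafeExactMatch;
    return isValidWord && !isBlockedOffensiveWord;
}

int main() {
    int typeFlags = 0;
    const WordFlags offensive = {false, false, true};
    // Not an exact match while offensive words are blocked: suppressed.
    std::printf("%d\n", shouldOutputWord(offensive, false, false, true, &typeFlags));
    return 0;
}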

@@ -79,10 +79,8 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d
         }
         const int wordId = isTerminal ? ptNodeParams.getHeadPos() : NOT_A_WORD_ID;
         childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getChildrenPos(),
-                ptNodeParams.getProbability(), wordId,
-                ptNodeParams.isBlacklisted()
-                        || ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */,
-                ptNodeParams.getCodePointCount(), ptNodeParams.getCodePoints());
+                ptNodeParams.getProbability(), wordId, ptNodeParams.getCodePointCount(),
+                ptNodeParams.getCodePoints());
     }
     if (readingHelper.isError()) {
         mIsCorrupted = true;

@@ -408,8 +408,6 @@ int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNod
     if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
         const int wordId = PatriciaTrieReadingUtils::isTerminal(flags) ? ptNodePos : NOT_A_WORD_ID;
         childDicNodes->pushLeavingChild(dicNode, childrenPos, probability, wordId,
-                PatriciaTrieReadingUtils::isBlacklisted(flags)
-                        || PatriciaTrieReadingUtils::isNotAWord(flags),
                 mergedNodeCodePointCount, mergedNodeCodePoints);
     }
     return siblingPos;

@@ -70,8 +70,6 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d
         const int wordId = isTerminal ? ptNodeParams.getTerminalId() : NOT_A_WORD_ID;
         childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getChildrenPos(),
                 ptNodeParams.getProbability(), wordId,
-                ptNodeParams.isBlacklisted()
-                        || ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */,
                 ptNodeParams.getCodePointCount(), ptNodeParams.getCodePoints());
     }
     if (readingHelper.isError()) {
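
With all three trie policies (the two Ver4PatriciaTriePolicy variants and PatriciaTriePolicy) no longer computing isBlacklisted() || isNotAWord() for every child node they create, the flags are evaluated only where they matter: once per candidate word, at scoring and output time, through the WordAttributes lookup.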