am c1e86d3f: Merge "Skip PtNodes with non-Unicode code points for suggestion."
* commit 'c1e86d3f491e65ae11b9dc8d932c3775c50394e4': Skip PtNodes with non-Unicode code points for suggestion.main
commit
0b6451599d
|
@ -23,6 +23,7 @@
|
||||||
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
|
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
|
||||||
#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
|
#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
|
||||||
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
|
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
|
||||||
|
#include "utils/char_utils.h"
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
|
@ -158,6 +159,10 @@ class PtNodeParams {
|
||||||
return PatriciaTrieReadingUtils::hasShortcutTargets(mFlags);
|
return PatriciaTrieReadingUtils::hasShortcutTargets(mFlags);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NOTE(review): the name and the returned value disagree. This evaluates to true when the
// PtNode has at least one code point AND its first code point is accepted by
// CharUtils::isInUnicodeSpace(); it is false for a node with no code points or whose first
// code point is outside that range. The Ver4 caller shown in this diff skips a node when this
// returns false, so the combined behaviour still skips non-Unicode nodes. Inverting or
// renaming this method must be done together with that call site.
AK_FORCE_INLINE bool representsNonWordInfo() const {
|
||||||
|
return getCodePointCount() > 0 && CharUtils::isInUnicodeSpace(getCodePoints()[0]);
|
||||||
|
}
|
||||||
|
|
||||||
// Parent node position
|
// Parent node position
|
||||||
AK_FORCE_INLINE int getParentPos() const {
|
AK_FORCE_INLINE int getParentPos() const {
|
||||||
return mParentPos;
|
return mParentPos;
|
||||||
|
|
|
@ -24,6 +24,7 @@
|
||||||
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
|
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
|
||||||
#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
|
#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
|
||||||
#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
|
#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
|
||||||
|
#include "utils/char_utils.h"
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
|
@ -318,12 +319,15 @@ int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNod
|
||||||
PatriciaTrieReadingUtils::readPtNodeInfo(mDictRoot, ptNodePos, getShortcutsStructurePolicy(),
|
PatriciaTrieReadingUtils::readPtNodeInfo(mDictRoot, ptNodePos, getShortcutsStructurePolicy(),
|
||||||
getBigramsStructurePolicy(), &flags, &mergedNodeCodePointCount, mergedNodeCodePoints,
|
getBigramsStructurePolicy(), &flags, &mergedNodeCodePointCount, mergedNodeCodePoints,
|
||||||
&probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
|
&probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
|
||||||
childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability,
|
// Skip PtNodes that don't start with a Unicode code point because they represent non-word information.
|
||||||
PatriciaTrieReadingUtils::isTerminal(flags),
|
if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
|
||||||
PatriciaTrieReadingUtils::hasChildrenInFlags(flags),
|
childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability,
|
||||||
PatriciaTrieReadingUtils::isBlacklisted(flags)
|
PatriciaTrieReadingUtils::isTerminal(flags),
|
||||||
|| PatriciaTrieReadingUtils::isNotAWord(flags),
|
PatriciaTrieReadingUtils::hasChildrenInFlags(flags),
|
||||||
mergedNodeCodePointCount, mergedNodeCodePoints);
|
PatriciaTrieReadingUtils::isBlacklisted(flags)
|
||||||
|
|| PatriciaTrieReadingUtils::isNotAWord(flags),
|
||||||
|
mergedNodeCodePointCount, mergedNodeCodePoints);
|
||||||
|
}
|
||||||
return siblingPos;
|
return siblingPos;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -59,13 +59,17 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d
|
||||||
// valid terminal DicNode.
|
// valid terminal DicNode.
|
||||||
isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY;
|
isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY;
|
||||||
}
|
}
|
||||||
|
readingHelper.readNextSiblingNode(ptNodeParams);
|
||||||
|
if (!ptNodeParams.representsNonWordInfo()) {
|
||||||
|
// Skip PtNodes that represent non-word information.
|
||||||
|
continue;
|
||||||
|
}
|
||||||
childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(),
|
childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(),
|
||||||
ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal,
|
ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal,
|
||||||
ptNodeParams.hasChildren(),
|
ptNodeParams.hasChildren(),
|
||||||
ptNodeParams.isBlacklisted()
|
ptNodeParams.isBlacklisted()
|
||||||
|| ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */,
|
|| ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */,
|
||||||
ptNodeParams.getCodePointCount(), ptNodeParams.getCodePoints());
|
ptNodeParams.getCodePointCount(), ptNodeParams.getCodePoints());
|
||||||
readingHelper.readNextSiblingNode(ptNodeParams);
|
|
||||||
}
|
}
|
||||||
if (readingHelper.isError()) {
|
if (readingHelper.isError()) {
|
||||||
mIsCorrupted = true;
|
mIsCorrupted = true;
|
||||||
|
|
|
@ -22,6 +22,9 @@
|
||||||
|
|
||||||
namespace latinime {
|
namespace latinime {
|
||||||
|
|
||||||
|
// Inclusive lower bound of the range tested by CharUtils::isInUnicodeSpace().
const int CharUtils::MIN_UNICODE_CODE_POINT = 0;
|
||||||
|
// Inclusive upper bound of the range tested by CharUtils::isInUnicodeSpace().
const int CharUtils::MAX_UNICODE_CODE_POINT = 0x10FFFF;
|
||||||
|
|
||||||
struct LatinCapitalSmallPair {
|
struct LatinCapitalSmallPair {
|
||||||
unsigned short capital;
|
unsigned short capital;
|
||||||
unsigned short small;
|
unsigned short small;
|
||||||
|
|
|
@ -86,12 +86,19 @@ class CharUtils {
|
||||||
return spaceCount;
|
return spaceCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns true when codePoint lies in the inclusive range
// [MIN_UNICODE_CODE_POINT, MAX_UNICODE_CODE_POINT], false otherwise.
// The return type is bool rather than int because the result is a pure predicate; callers
// that use it in a condition or store it in an int keep working unchanged.
static AK_FORCE_INLINE bool isInUnicodeSpace(const int codePoint) {
    return codePoint >= MIN_UNICODE_CODE_POINT && codePoint <= MAX_UNICODE_CODE_POINT;
}
|
||||||
|
|
||||||
static unsigned short latin_tolower(const unsigned short c);
|
static unsigned short latin_tolower(const unsigned short c);
|
||||||
static const std::vector<int> EMPTY_STRING;
|
static const std::vector<int> EMPTY_STRING;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils);
|
||||||
|
|
||||||
|
static const int MIN_UNICODE_CODE_POINT;
|
||||||
|
static const int MAX_UNICODE_CODE_POINT;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Table mapping most combined Latin, Greek, and Cyrillic characters
|
* Table mapping most combined Latin, Greek, and Cyrillic characters
|
||||||
* to their base characters. If c is in range, BASE_CHARS[c] == c
|
* to their base characters. If c is in range, BASE_CHARS[c] == c
|
||||||
|
|
Loading…
Reference in New Issue