diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h index 91192fc57..bef401f87 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h @@ -23,6 +23,7 @@ #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" #include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" +#include "utils/char_utils.h" namespace latinime { @@ -158,6 +159,10 @@ class PtNodeParams { return PatriciaTrieReadingUtils::hasShortcutTargets(mFlags); } + AK_FORCE_INLINE bool representsNonWordInfo() const { + return getCodePointCount() > 0 && CharUtils::isInUnicodeSpace(getCodePoints()[0]); + } + // Parent node position AK_FORCE_INLINE int getParentPos() const { return mParentPos; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp index b3af1f47a..30dcfba37 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp @@ -24,6 +24,7 @@ #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" #include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/utils/probability_utils.h" +#include "utils/char_utils.h" namespace latinime { @@ -318,12 +319,15 @@ int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNod PatriciaTrieReadingUtils::readPtNodeInfo(mDictRoot, ptNodePos, getShortcutsStructurePolicy(), getBigramsStructurePolicy(), &flags, &mergedNodeCodePointCount, mergedNodeCodePoints, &probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos); - childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability, - PatriciaTrieReadingUtils::isTerminal(flags), - PatriciaTrieReadingUtils::hasChildrenInFlags(flags), - PatriciaTrieReadingUtils::isBlacklisted(flags) - || PatriciaTrieReadingUtils::isNotAWord(flags), - mergedNodeCodePointCount, mergedNodeCodePoints); + // Skip PtNodes don't start with Unicode code point because they represent non-word information. + if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) { + childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability, + PatriciaTrieReadingUtils::isTerminal(flags), + PatriciaTrieReadingUtils::hasChildrenInFlags(flags), + PatriciaTrieReadingUtils::isBlacklisted(flags) + || PatriciaTrieReadingUtils::isNotAWord(flags), + mergedNodeCodePointCount, mergedNodeCodePoints); + } return siblingPos; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp index 2fb3decee..8373dc549 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp @@ -59,13 +59,17 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d // valid terminal DicNode. isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY; } + readingHelper.readNextSiblingNode(ptNodeParams); + if (!ptNodeParams.representsNonWordInfo()) { + // Skip PtNodes that represent non-word information. + continue; + } childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(), ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal, ptNodeParams.hasChildren(), ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */, ptNodeParams.getCodePointCount(), ptNodeParams.getCodePoints()); - readingHelper.readNextSiblingNode(ptNodeParams); } if (readingHelper.isError()) { mIsCorrupted = true; diff --git a/native/jni/src/utils/char_utils.cpp b/native/jni/src/utils/char_utils.cpp index adc474b4c..b17e0847d 100644 --- a/native/jni/src/utils/char_utils.cpp +++ b/native/jni/src/utils/char_utils.cpp @@ -22,6 +22,9 @@ namespace latinime { +const int CharUtils::MIN_UNICODE_CODE_POINT = 0; +const int CharUtils::MAX_UNICODE_CODE_POINT = 0x10FFFF; + struct LatinCapitalSmallPair { unsigned short capital; unsigned short small; diff --git a/native/jni/src/utils/char_utils.h b/native/jni/src/utils/char_utils.h index 239419d5b..634c45b04 100644 --- a/native/jni/src/utils/char_utils.h +++ b/native/jni/src/utils/char_utils.h @@ -86,12 +86,19 @@ class CharUtils { return spaceCount; } + static AK_FORCE_INLINE int isInUnicodeSpace(const int codePoint) { + return codePoint >= MIN_UNICODE_CODE_POINT && codePoint <= MAX_UNICODE_CODE_POINT; + } + static unsigned short latin_tolower(const unsigned short c); static const std::vector EMPTY_STRING; private: DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils); + static const int MIN_UNICODE_CODE_POINT; + static const int MAX_UNICODE_CODE_POINT; + /** * Table mapping most combined Latin, Greek, and Cyrillic characters * to their base characters. If c is in range, BASE_CHARS[c] == c