Skip PtNodes with non-Unicode code points for suggestion.

Bug: 14119293
Change-Id: Id1d3b789b5f18757070878dba35a7980bfb44591
This commit is contained in:
Keisuke Kuroyanagi 2014-05-12 19:38:26 +09:00
parent 4162cfdc59
commit 79ba633402
5 changed files with 30 additions and 7 deletions

View file

@ -23,6 +23,7 @@
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
#include "utils/char_utils.h"
namespace latinime { namespace latinime {
@ -158,6 +159,10 @@ class PtNodeParams {
return PatriciaTrieReadingUtils::hasShortcutTargets(mFlags); return PatriciaTrieReadingUtils::hasShortcutTargets(mFlags);
} }
// Returns true when this PtNode has at least one code point and its first code point is
// accepted by CharUtils::isInUnicodeSpace(); returns false otherwise.
// NOTE(review): the name suggests the opposite polarity (a "non-word info" node would be
// expected to start outside the Unicode range) -- confirm against callers before relying on it.
AK_FORCE_INLINE bool representsNonWordInfo() const {
    if (getCodePointCount() <= 0) {
        // No code points to inspect.
        return false;
    }
    const int firstCodePoint = getCodePoints()[0];
    return CharUtils::isInUnicodeSpace(firstCodePoint);
}
// Parent node position // Parent node position
AK_FORCE_INLINE int getParentPos() const { AK_FORCE_INLINE int getParentPos() const {
return mParentPos; return mParentPos;

View file

@ -24,6 +24,7 @@
#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
#include "suggest/policyimpl/dictionary/utils/probability_utils.h" #include "suggest/policyimpl/dictionary/utils/probability_utils.h"
#include "utils/char_utils.h"
namespace latinime { namespace latinime {
@ -318,12 +319,15 @@ int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNod
PatriciaTrieReadingUtils::readPtNodeInfo(mDictRoot, ptNodePos, getShortcutsStructurePolicy(), PatriciaTrieReadingUtils::readPtNodeInfo(mDictRoot, ptNodePos, getShortcutsStructurePolicy(),
getBigramsStructurePolicy(), &flags, &mergedNodeCodePointCount, mergedNodeCodePoints, getBigramsStructurePolicy(), &flags, &mergedNodeCodePointCount, mergedNodeCodePoints,
&probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos); &probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability, // Skip PtNodes that don't start with a Unicode code point because they represent non-word information.
PatriciaTrieReadingUtils::isTerminal(flags), if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
PatriciaTrieReadingUtils::hasChildrenInFlags(flags), childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability,
PatriciaTrieReadingUtils::isBlacklisted(flags) PatriciaTrieReadingUtils::isTerminal(flags),
|| PatriciaTrieReadingUtils::isNotAWord(flags), PatriciaTrieReadingUtils::hasChildrenInFlags(flags),
mergedNodeCodePointCount, mergedNodeCodePoints); PatriciaTrieReadingUtils::isBlacklisted(flags)
|| PatriciaTrieReadingUtils::isNotAWord(flags),
mergedNodeCodePointCount, mergedNodeCodePoints);
}
return siblingPos; return siblingPos;
} }

View file

@ -59,13 +59,17 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d
// valid terminal DicNode. // valid terminal DicNode.
isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY; isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY;
} }
readingHelper.readNextSiblingNode(ptNodeParams);
if (!ptNodeParams.representsNonWordInfo()) {
// Skip PtNodes that represent non-word information.
continue;
}
childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(), childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(),
ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal, ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal,
ptNodeParams.hasChildren(), ptNodeParams.hasChildren(),
ptNodeParams.isBlacklisted() ptNodeParams.isBlacklisted()
|| ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */, || ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */,
ptNodeParams.getCodePointCount(), ptNodeParams.getCodePoints()); ptNodeParams.getCodePointCount(), ptNodeParams.getCodePoints());
readingHelper.readNextSiblingNode(ptNodeParams);
} }
if (readingHelper.isError()) { if (readingHelper.isError()) {
mIsCorrupted = true; mIsCorrupted = true;

View file

@ -22,6 +22,9 @@
namespace latinime { namespace latinime {
// Lowest code point value accepted by CharUtils::isInUnicodeSpace().
const int CharUtils::MIN_UNICODE_CODE_POINT = 0;
// Highest code point value accepted by CharUtils::isInUnicodeSpace(); 0x10FFFF is the last
// code point defined by the Unicode standard.
const int CharUtils::MAX_UNICODE_CODE_POINT = 0x10FFFF;
struct LatinCapitalSmallPair { struct LatinCapitalSmallPair {
unsigned short capital; unsigned short capital;
unsigned short small; unsigned short small;

View file

@ -86,12 +86,19 @@ class CharUtils {
return spaceCount; return spaceCount;
} }
// Returns whether codePoint lies within the inclusive range
// [MIN_UNICODE_CODE_POINT, MAX_UNICODE_CODE_POINT].
// The result is a pure predicate, so it is typed as bool; it still converts implicitly to
// int (0 or 1) for any caller that stored the previous int result.
static AK_FORCE_INLINE bool isInUnicodeSpace(const int codePoint) {
    return codePoint >= MIN_UNICODE_CODE_POINT && codePoint <= MAX_UNICODE_CODE_POINT;
}
static unsigned short latin_tolower(const unsigned short c); static unsigned short latin_tolower(const unsigned short c);
static const std::vector<int> EMPTY_STRING; static const std::vector<int> EMPTY_STRING;
private: private:
DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils); DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils);
// Inclusive lower and upper bounds of the range tested by isInUnicodeSpace().
static const int MIN_UNICODE_CODE_POINT;
static const int MAX_UNICODE_CODE_POINT;
/** /**
* Table mapping most combined Latin, Greek, and Cyrillic characters * Table mapping most combined Latin, Greek, and Cyrillic characters
* to their base characters. If c is in range, BASE_CHARS[c] == c * to their base characters. If c is in range, BASE_CHARS[c] == c