From 2111e3abc9c9c0ea0350b8470532bf636b78cdd7 Mon Sep 17 00:00:00 2001 From: Keisuke Kuroyanagi Date: Tue, 9 Sep 2014 18:33:55 +0900 Subject: [PATCH] Introduce WordAttributes to get word probability and flags. Bug: 14425059 Change-Id: Iee11d038e0893d7ddd6c52447907f8c55fecb6a5 --- .../suggest/core/dicnode/dic_node_utils.cpp | 4 +- .../suggest/core/dictionary/dictionary.cpp | 7 ++- .../suggest/core/dictionary/word_attributes.h | 60 +++++++++++++++++++ .../dictionary_structure_with_buffer_policy.h | 5 +- .../v402/ver4_patricia_trie_policy.cpp | 23 ++++--- .../backward/v402/ver4_patricia_trie_policy.h | 4 +- .../structure/v2/patricia_trie_policy.cpp | 20 +++++-- .../structure/v2/patricia_trie_policy.h | 4 +- .../v4/ver4_patricia_trie_policy.cpp | 15 +++-- .../structure/v4/ver4_patricia_trie_policy.h | 2 +- 10 files changed, 116 insertions(+), 28 deletions(-) create mode 100644 native/jni/src/suggest/core/dictionary/word_attributes.h diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp index 19f92cc0b..26c7e3357 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp +++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp @@ -72,10 +72,10 @@ namespace latinime { if (dicNode->hasMultipleWords() && !dicNode->isValidMultipleWordSuggestion()) { return static_cast<float>(MAX_VALUE_FOR_WEIGHTING); } - const int probability = dictionaryStructurePolicy->getProbabilityOfWordInContext( + const WordAttributes wordAttributes = dictionaryStructurePolicy->getWordAttributesInContext( dicNode->getPrevWordIds(), dicNode->getWordId(), multiBigramMap); // TODO: This equation to calculate the improbability looks unreasonable. Investigate this. 
- const float cost = static_cast<float>(MAX_PROBABILITY - probability) + const float cost = static_cast<float>(MAX_PROBABILITY - wordAttributes.getProbability()) / static_cast<float>(MAX_PROBABILITY); return cost; } diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp index c9725d1b0..1de405104 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp @@ -84,9 +84,10 @@ void Dictionary::NgramListenerForPrediction::onVisitEntry(const int ngramProbabi if (codePointCount <= 0) { return; } - const int probability = mDictStructurePolicy->getProbabilityOfWordInContext(mPrevWordIds.data(), - targetWordId, nullptr /* multiBigramMap */); - mSuggestionResults->addPrediction(targetWordCodePoints, codePointCount, probability); + const WordAttributes wordAttributes = mDictStructurePolicy->getWordAttributesInContext( + mPrevWordIds.data(), targetWordId, nullptr /* multiBigramMap */); + mSuggestionResults->addPrediction(targetWordCodePoints, codePointCount, + wordAttributes.getProbability()); } void Dictionary::getPredictions(const PrevWordsInfo *const prevWordsInfo, diff --git a/native/jni/src/suggest/core/dictionary/word_attributes.h b/native/jni/src/suggest/core/dictionary/word_attributes.h new file mode 100644 index 000000000..6e9da3570 --- /dev/null +++ b/native/jni/src/suggest/core/dictionary/word_attributes.h @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_WORD_ATTRIBUTES_H
+#define LATINIME_WORD_ATTRIBUTES_H
+
+#include "defines.h"
+
+namespace latinime {
+
+// Value object bundling a word's probability with its blacklisted, not-a-word and
+// possibly-offensive flags. It exposes no setters and disallows assignment, so the values
+// are fixed at construction.
+class WordAttributes {
+ public:
+    // Invalid word attributes: NOT_A_PROBABILITY with every flag cleared.
+    WordAttributes()
+            : mProbability(NOT_A_PROBABILITY), mIsBlacklisted(false), mIsNotAWord(false),
+              mIsPossiblyOffensive(false) {}
+
+    // Word attributes carrying exactly the given probability and flags.
+    WordAttributes(const int probability, const bool isBlacklisted, const bool isNotAWord,
+            const bool isPossiblyOffensive)
+            : mProbability(probability), mIsBlacklisted(isBlacklisted), mIsNotAWord(isNotAWord),
+              mIsPossiblyOffensive(isPossiblyOffensive) {}
+
+    // Probability given at construction; NOT_A_PROBABILITY for invalid attributes.
+    int getProbability() const { return mProbability; }
+
+    // Flags given at construction; all false for invalid attributes.
+    bool isBlacklisted() const { return mIsBlacklisted; }
+    bool isNotAWord() const { return mIsNotAWord; }
+    bool isPossiblyOffensive() const { return mIsPossiblyOffensive; }
+
+ private:
+    // No operator=: attributes are only ever constructed or copied, never reassigned.
+    DISALLOW_ASSIGNMENT_OPERATOR(WordAttributes);
+
+    int mProbability;
+    bool mIsBlacklisted;
+    bool mIsNotAWord;
+    bool mIsPossiblyOffensive;
+};
+
+} // namespace latinime
+
+#endif /* LATINIME_WORD_ATTRIBUTES_H */
diff --git a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h
index 4e55418ae..7414f696c 100644
--- a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h
+++ b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h
@@ -22,6 +22,7 @@
 #include "defines.h"
 #include "suggest/core/dictionary/binary_dictionary_shortcut_iterator.h"
 #include "suggest/core/dictionary/property/word_property.h"
+#include "suggest/core/dictionary/word_attributes.h"
 #include "utils/int_array_view.h"
 
 namespace latinime {
@@ -57,8 +58,8 @@ class DictionaryStructureWithBufferPolicy {
     virtual int getWordId(const CodePointArrayView wordCodePoints,
             const bool forceLowerCaseSearch) const = 0;
 
-    virtual int 
getProbabilityOfWordInContext(const int *const prevWordIds, const int wordId, - MultiBigramMap *const multiBigramMap) const = 0; + virtual const WordAttributes getWordAttributesInContext(const int *const prevWordIds, + const int wordId, MultiBigramMap *const multiBigramMap) const = 0; // TODO: Remove virtual int getProbability(const int unigramProbability, const int bigramProbability) const = 0; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp index df3daa816..547cc997c 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp @@ -118,24 +118,33 @@ int Ver4PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints, return getWordIdFromTerminalPtNodePos(ptNodePos); } -int Ver4PatriciaTriePolicy::getProbabilityOfWordInContext(const int *const prevWordIds, - const int wordId, MultiBigramMap *const multiBigramMap) const { +const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext( + const int *const prevWordIds, const int wordId, + MultiBigramMap *const multiBigramMap) const { if (wordId == NOT_A_WORD_ID) { - return NOT_A_PROBABILITY; + return WordAttributes(); } const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); if (multiBigramMap) { - return multiBigramMap->getBigramProbability(this /* structurePolicy */, prevWordIds, - wordId, ptNodeParams.getProbability()); + const int probability = multiBigramMap->getBigramProbability(this /* structurePolicy */, + prevWordIds, wordId, ptNodeParams.getProbability()); + return getWordAttributes(probability, ptNodeParams); } if (prevWordIds) { const int probability = 
getProbabilityOfWord(prevWordIds, wordId); if (probability != NOT_A_PROBABILITY) { - return probability; + return getWordAttributes(probability, ptNodeParams); } } - return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY); + return getWordAttributes(getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY), + ptNodeParams); +} + +const WordAttributes Ver4PatriciaTriePolicy::getWordAttributes(const int probability, + const PtNodeParams &ptNodeParams) const { + return WordAttributes(probability, ptNodeParams.isBlacklisted(), ptNodeParams.isNotAWord(), + ptNodeParams.getProbability() == 0); } int Ver4PatriciaTriePolicy::getProbability(const int unigramProbability, diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h index 06d704174..871b556e1 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h @@ -91,7 +91,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const; - int getProbabilityOfWordInContext(const int *const prevWordIds, const int wordId, + const WordAttributes getWordAttributesInContext(const int *const prevWordIds, const int wordId, MultiBigramMap *const multiBigramMap) const; int getProbability(const int unigramProbability, const int bigramProbability) const; @@ -166,6 +166,8 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { int getShortcutPositionOfPtNode(const int ptNodePos) const; int getWordIdFromTerminalPtNodePos(const int ptNodePos) const; int getTerminalPtNodePosFromWordId(const int wordId) const; + const WordAttributes getWordAttributes(const int probability, + const 
PtNodeParams &ptNodeParams) const; }; } // namespace v402 } // namespace backward diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp index 80bbf47c0..44148e817 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp @@ -282,25 +282,33 @@ int PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints, return getWordIdFromTerminalPtNodePos(ptNodePos); } -int PatriciaTriePolicy::getProbabilityOfWordInContext(const int *const prevWordIds, +const WordAttributes PatriciaTriePolicy::getWordAttributesInContext(const int *const prevWordIds, const int wordId, MultiBigramMap *const multiBigramMap) const { if (wordId == NOT_A_WORD_ID) { - return NOT_A_PROBABILITY; + return WordAttributes(); } const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); const PtNodeParams ptNodeParams = mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); if (multiBigramMap) { - return multiBigramMap->getBigramProbability(this /* structurePolicy */, prevWordIds, - wordId, ptNodeParams.getProbability()); + const int probability = multiBigramMap->getBigramProbability(this /* structurePolicy */, + prevWordIds, wordId, ptNodeParams.getProbability()); + return getWordAttributes(probability, ptNodeParams); } if (prevWordIds) { const int bigramProbability = getProbabilityOfWord(prevWordIds, wordId); if (bigramProbability != NOT_A_PROBABILITY) { - return bigramProbability; + return getWordAttributes(bigramProbability, ptNodeParams); } } - return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY); + return getWordAttributes(getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY), + ptNodeParams); +} + +const WordAttributes PatriciaTriePolicy::getWordAttributes(const int probability, + const PtNodeParams 
&ptNodeParams) const { + return WordAttributes(probability, ptNodeParams.isBlacklisted(), ptNodeParams.isNotAWord(), + ptNodeParams.getProbability() == 0); } int PatriciaTriePolicy::getProbability(const int unigramProbability, diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h index a2d6b6fa6..8c1665d7d 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h @@ -66,7 +66,7 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const; - int getProbabilityOfWordInContext(const int *const prevWordIds, const int wordId, + const WordAttributes getWordAttributesInContext(const int *const prevWordIds, const int wordId, MultiBigramMap *const multiBigramMap) const; int getProbability(const int unigramProbability, const int bigramProbability) const; @@ -163,6 +163,8 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { DicNodeVector *const childDicNodes) const; int getWordIdFromTerminalPtNodePos(const int ptNodePos) const; int getTerminalPtNodePosFromWordId(const int wordId) const; + const WordAttributes getWordAttributes(const int probability, + const PtNodeParams &ptNodeParams) const; }; } // namespace latinime #endif // LATINIME_PATRICIA_TRIE_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp index 308c35585..e4462550e 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp @@ -113,14 +113,19 @@ int 
Ver4PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints, return ptNodeParams.getTerminalId(); } -int Ver4PatriciaTriePolicy::getProbabilityOfWordInContext(const int *const prevWordIds, - const int wordId, MultiBigramMap *const multiBigramMap) const { +const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext( + const int *const prevWordIds, const int wordId, + MultiBigramMap *const multiBigramMap) const { if (wordId == NOT_A_WORD_ID) { - return NOT_A_PROBABILITY; + return WordAttributes(); } + const int ptNodePos = + mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); + const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); // TODO: Support n-gram. - return mBuffers->getLanguageModelDictContent()->getWordProbability( - WordIdArrayView::singleElementView(prevWordIds), wordId); + return WordAttributes(mBuffers->getLanguageModelDictContent()->getWordProbability( + WordIdArrayView::singleElementView(prevWordIds), wordId), ptNodeParams.isBlacklisted(), + ptNodeParams.isNotAWord(), ptNodeParams.getProbability() == 0); } int Ver4PatriciaTriePolicy::getProbability(const int unigramProbability, diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h index c9df9df4b..980c16e4a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h @@ -68,7 +68,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const; - int getProbabilityOfWordInContext(const int *const prevWordIds, const int wordId, + const WordAttributes getWordAttributesInContext(const int *const prevWordIds, const int wordId, MultiBigramMap 
*const multiBigramMap) const; int getProbability(const int unigramProbability, const int bigramProbability) const;