From 7d911d6f91af56586fbca40672bfb77b494ee871 Mon Sep 17 00:00:00 2001 From: Keisuke Kuroyanagi Date: Wed, 24 Sep 2014 14:15:34 +0900 Subject: [PATCH] Move word flags to language model dict content. Bug: 14425059 Change-Id: I64712e5c83d0bc241e6f0f16117ab47b5d75bd4b --- .../dictionary/property/unigram_property.h | 5 +++ .../content/language_model_dict_content.cpp | 17 +++++++--- .../v4/content/language_model_dict_content.h | 3 +- .../structure/v4/content/probability_entry.h | 30 +++++++++++++++-- .../structure/v4/ver4_dict_constants.cpp | 3 ++ .../structure/v4/ver4_dict_constants.h | 3 ++ .../v4/ver4_patricia_trie_node_writer.cpp | 14 ++++---- .../v4/ver4_patricia_trie_node_writer.h | 3 +- .../v4/ver4_patricia_trie_policy.cpp | 32 ++++++------------- 9 files changed, 68 insertions(+), 42 deletions(-) diff --git a/native/jni/src/suggest/core/dictionary/property/unigram_property.h b/native/jni/src/suggest/core/dictionary/property/unigram_property.h index 902eb000f..65c8333bb 100644 --- a/native/jni/src/suggest/core/dictionary/property/unigram_property.h +++ b/native/jni/src/suggest/core/dictionary/property/unigram_property.h @@ -71,6 +71,11 @@ class UnigramProperty { return mIsBlacklisted; } + bool isPossiblyOffensive() const { + // TODO: Have dedicated flag. + return mProbability == 0; + } + bool hasShortcuts() const { return !mShortcuts.empty(); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp index 35f0f768f..89094c83a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp @@ -38,7 +38,7 @@ bool LanguageModelDictContent::runGC( 0 /* nextLevelBitmapEntryIndex */, outNgramCount); } -int LanguageModelDictContent::getWordProbability(const WordIdArrayView prevWordIds, +const WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArrayView prevWordIds, const int wordId, const HeaderPolicy *const headerPolicy) const { int bitmapEntryIndices[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1]; bitmapEntryIndices[0] = mTrieMap.getRootBitmapEntryIndex(); @@ -60,17 +60,24 @@ int LanguageModelDictContent::getWordProbability(const WordIdArrayView prevWordI } const ProbabilityEntry probabilityEntry = ProbabilityEntry::decode(result.mValue, mHasHistoricalInfo); + int probability = NOT_A_PROBABILITY; if (mHasHistoricalInfo) { - const int probability = ForgettingCurveUtils::decodeProbability( + const int rawProbability = ForgettingCurveUtils::decodeProbability( probabilityEntry.getHistoricalInfo(), headerPolicy) + ForgettingCurveUtils::getProbabilityBiasForNgram(i + 1 /* n */); - return std::min(probability, MAX_PROBABILITY); + probability = std::min(rawProbability, MAX_PROBABILITY); } else { - return probabilityEntry.getProbability(); + probability = probabilityEntry.getProbability(); } + // TODO: Some flags in unigramProbabilityEntry should be overwritten by flags in + // probabilityEntry. + const ProbabilityEntry unigramProbabilityEntry = getProbabilityEntry(wordId); + return WordAttributes(probability, unigramProbabilityEntry.isNotAWord(), + unigramProbabilityEntry.isBlacklisted(), + unigramProbabilityEntry.isPossiblyOffensive()); } // Cannot find the word. - return NOT_A_PROBABILITY; + return WordAttributes(); } ProbabilityEntry LanguageModelDictContent::getNgramProbabilityEntry( diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h index a793af4be..b7e4af977 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h @@ -21,6 +21,7 @@ #include #include "defines.h" +#include "suggest/core/dictionary/word_attributes.h" #include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h" #include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" @@ -128,7 +129,7 @@ class LanguageModelDictContent { const LanguageModelDictContent *const originalContent, int *const outNgramCount); - int getWordProbability(const WordIdArrayView prevWordIds, const int wordId, + const WordAttributes getWordAttributes(const WordIdArrayView prevWordIds, const int wordId, const HeaderPolicy *const headerPolicy) const; ProbabilityEntry getProbabilityEntry(const int wordId) const { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h index f1bf12cb2..e1e10ca17 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h @@ -49,7 +49,9 @@ class ProbabilityEntry { // Create from unigram property. ProbabilityEntry(const UnigramProperty *const unigramProperty) - : mFlags(createFlags(unigramProperty->representsBeginningOfSentence())), + : mFlags(createFlags(unigramProperty->representsBeginningOfSentence(), + unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), + unigramProperty->isPossiblyOffensive())), mProbability(unigramProperty->getProbability()), mHistoricalInfo(unigramProperty->getTimestamp(), unigramProperty->getLevel(), unigramProperty->getCount()) {} @@ -85,6 +87,18 @@ class ProbabilityEntry { return (mFlags & Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE) != 0; } + bool isNotAWord() const { + return (mFlags & Ver4DictConstants::FLAG_NOT_A_WORD) != 0; + } + + bool isBlacklisted() const { + return (mFlags & Ver4DictConstants::FLAG_BLACKLISTED) != 0; + } + + bool isPossiblyOffensive() const { + return (mFlags & Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE) != 0; + } + uint64_t encode(const bool hasHistoricalInfo) const { uint64_t encodedEntry = static_cast(mFlags); if (hasHistoricalInfo) { @@ -142,10 +156,20 @@ class ProbabilityEntry { (encodedEntry >> (pos * CHAR_BIT)) & ((1ull << (size * CHAR_BIT)) - 1)); } - static uint8_t createFlags(const bool representsBeginningOfSentence) { + static uint8_t createFlags(const bool representsBeginningOfSentence, + const bool isNotAWord, const bool isBlacklisted, const bool isPossiblyOffensive) { uint8_t flags = 0; if (representsBeginningOfSentence) { - flags ^= Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE; + flags |= Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE; + } + if (isNotAWord) { + flags |= Ver4DictConstants::FLAG_NOT_A_WORD; + } + if (isBlacklisted) { + flags |= Ver4DictConstants::FLAG_BLACKLISTED; + } + if (isPossiblyOffensive) { + flags |= Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE; } return flags; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp index 39822b94a..8e6cb974b 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp @@ -54,6 +54,9 @@ const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 1; const uint8_t Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE = 0x1; const uint8_t Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY = 0x2; +const uint8_t Ver4DictConstants::FLAG_NOT_A_WORD = 0x4; +const uint8_t Ver4DictConstants::FLAG_BLACKLISTED = 0x8; +const uint8_t Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE = 0x10; const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64; const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE = 4; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h index dfcdd4d6f..600b5ffe4 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h @@ -52,6 +52,9 @@ class Ver4DictConstants { // Flags in probability entry. static const uint8_t FLAG_REPRESENTS_BEGINNING_OF_SENTENCE; static const uint8_t FLAG_NOT_A_VALID_ENTRY; + static const uint8_t FLAG_NOT_A_WORD; + static const uint8_t FLAG_BLACKLISTED; + static const uint8_t FLAG_POSSIBLY_OFFENSIVE; static const int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE; static const int SHORTCUT_ADDRESS_TABLE_DATA_SIZE; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp index 75ec16912..a1a33d27a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp @@ -191,7 +191,6 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition( ptNodeWritingPos); } - bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition( const PtNodeParams *const ptNodeParams, const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos) { @@ -341,8 +340,8 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition( ptNodeParams->getChildrenPos(), ptNodeWritingPos)) { return false; } - return updatePtNodeFlags(nodePos, ptNodeParams->isBlacklisted(), ptNodeParams->isNotAWord(), - isTerminal, ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); + return updatePtNodeFlags(nodePos, isTerminal, + ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); } // TODO: Move probability handling code to LanguageModelDictContent. @@ -361,14 +360,13 @@ const ProbabilityEntry Ver4PatriciaTrieNodeWriter::createUpdatedEntryFrom( } } -bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos, - const bool isBlacklisted, const bool isNotAWord, const bool isTerminal, +bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos, const bool isTerminal, const bool hasMultipleChars) { // Create node flags and write them. PatriciaTrieReadingUtils::NodeFlags nodeFlags = - PatriciaTrieReadingUtils::createAndGetFlags(isBlacklisted, isNotAWord, isTerminal, - false /* hasShortcutTargets */, false /* hasBigrams */, hasMultipleChars, - CHILDREN_POSITION_FIELD_SIZE); + PatriciaTrieReadingUtils::createAndGetFlags(false /* isNotAWord */, + false /* isBlacklisted */, isTerminal, false /* hasShortcutTargets */, + false /* hasBigrams */, hasMultipleChars, CHILDREN_POSITION_FIELD_SIZE); if (!DynamicPtWritingUtils::writeFlags(mTrieBuffer, nodeFlags, ptNodePos)) { AKLOGE("Cannot write PtNode flags. flags: %x, pos: %d", nodeFlags, ptNodePos); return false; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h index 08b7d3825..17915273b 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h @@ -103,8 +103,7 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter { const ProbabilityEntry *const originalProbabilityEntry, const ProbabilityEntry *const probabilityEntry) const; - bool updatePtNodeFlags(const int ptNodePos, const bool isBlacklisted, const bool isNotAWord, - const bool isTerminal, const bool hasMultipleChars); + bool updatePtNodeFlags(const int ptNodePos, const bool isTerminal, const bool hasMultipleChars); static const int CHILDREN_POSITION_FIELD_SIZE; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp index cca0c2924..16ff6c609 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp @@ -63,14 +63,10 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d // valid terminal DicNode. isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY; } - readingHelper.readNextSiblingNode(ptNodeParams); - if (ptNodeParams.representsNonWordInfo()) { - // Skip PtNodes that represent non-word information. - continue; - } const int wordId = isTerminal ? ptNodeParams.getTerminalId() : NOT_A_WORD_ID; childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getChildrenPos(), wordId, ptNodeParams.getCodePointArrayView()); + readingHelper.readNextSiblingNode(ptNodeParams); } if (readingHelper.isError()) { mIsCorrupted = true; @@ -117,13 +113,8 @@ const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext( if (wordId == NOT_A_WORD_ID) { return WordAttributes(); } - const int ptNodePos = - mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); - const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); - const int probability = mBuffers->getLanguageModelDictContent()->getWordProbability( - prevWordIds, wordId, mHeaderPolicy); - return WordAttributes(probability, ptNodeParams.isBlacklisted(), ptNodeParams.isNotAWord(), - probability == 0); + return mBuffers->getLanguageModelDictContent()->getWordAttributes(prevWordIds, wordId, + mHeaderPolicy); } int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds, @@ -131,15 +122,10 @@ int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordI if (wordId == NOT_A_WORD_ID || prevWordIds.contains(NOT_A_WORD_ID)) { return NOT_A_PROBABILITY; } - const int ptNodePos = - mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); - const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); - if (ptNodeParams.isDeleted() || ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord()) { - return NOT_A_PROBABILITY; - } const ProbabilityEntry probabilityEntry = mBuffers->getLanguageModelDictContent()->getNgramProbabilityEntry(prevWordIds, wordId); - if (!probabilityEntry.isValid()) { + if (!probabilityEntry.isValid() || probabilityEntry.isBlacklisted() + || probabilityEntry.isNotAWord()) { return NOT_A_PROBABILITY; } if (mHeaderPolicy->hasHistoricalInfoOfWords()) { @@ -511,10 +497,10 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty( shortcuts.emplace_back(&target, shortcutProbability); } } - const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(), - ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), - historicalInfo->getTimeStamp(), historicalInfo->getLevel(), - historicalInfo->getCount(), &shortcuts); + const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(), + probabilityEntry.isNotAWord(), probabilityEntry.isBlacklisted(), + probabilityEntry.getProbability(), historicalInfo->getTimeStamp(), + historicalInfo->getLevel(), historicalInfo->getCount(), &shortcuts); return WordProperty(&codePointVector, &unigramProperty, &bigrams); }