am c912957f: Merge "Move word flags to language model dict content."
* commit 'c912957f3ffd4425b6b9ecd594c8e273a38a6227': Move word flags to language model dict content.
main
commit
f40f2916a5
|
@ -71,6 +71,11 @@ class UnigramProperty {
|
||||||
return mIsBlacklisted;
|
return mIsBlacklisted;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool isPossiblyOffensive() const {
|
||||||
|
// TODO: Have dedicated flag.
|
||||||
|
return mProbability == 0;
|
||||||
|
}
|
||||||
|
|
||||||
bool hasShortcuts() const {
|
bool hasShortcuts() const {
|
||||||
return !mShortcuts.empty();
|
return !mShortcuts.empty();
|
||||||
}
|
}
|
||||||
|
|
|
@ -38,7 +38,7 @@ bool LanguageModelDictContent::runGC(
|
||||||
0 /* nextLevelBitmapEntryIndex */, outNgramCount);
|
0 /* nextLevelBitmapEntryIndex */, outNgramCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
int LanguageModelDictContent::getWordProbability(const WordIdArrayView prevWordIds,
|
const WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArrayView prevWordIds,
|
||||||
const int wordId, const HeaderPolicy *const headerPolicy) const {
|
const int wordId, const HeaderPolicy *const headerPolicy) const {
|
||||||
int bitmapEntryIndices[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1];
|
int bitmapEntryIndices[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1];
|
||||||
bitmapEntryIndices[0] = mTrieMap.getRootBitmapEntryIndex();
|
bitmapEntryIndices[0] = mTrieMap.getRootBitmapEntryIndex();
|
||||||
|
@ -60,17 +60,24 @@ int LanguageModelDictContent::getWordProbability(const WordIdArrayView prevWordI
|
||||||
}
|
}
|
||||||
const ProbabilityEntry probabilityEntry =
|
const ProbabilityEntry probabilityEntry =
|
||||||
ProbabilityEntry::decode(result.mValue, mHasHistoricalInfo);
|
ProbabilityEntry::decode(result.mValue, mHasHistoricalInfo);
|
||||||
|
int probability = NOT_A_PROBABILITY;
|
||||||
if (mHasHistoricalInfo) {
|
if (mHasHistoricalInfo) {
|
||||||
const int probability = ForgettingCurveUtils::decodeProbability(
|
const int rawProbability = ForgettingCurveUtils::decodeProbability(
|
||||||
probabilityEntry.getHistoricalInfo(), headerPolicy)
|
probabilityEntry.getHistoricalInfo(), headerPolicy)
|
||||||
+ ForgettingCurveUtils::getProbabilityBiasForNgram(i + 1 /* n */);
|
+ ForgettingCurveUtils::getProbabilityBiasForNgram(i + 1 /* n */);
|
||||||
return std::min(probability, MAX_PROBABILITY);
|
probability = std::min(rawProbability, MAX_PROBABILITY);
|
||||||
} else {
|
} else {
|
||||||
return probabilityEntry.getProbability();
|
probability = probabilityEntry.getProbability();
|
||||||
}
|
}
|
||||||
|
// TODO: Some flags in unigramProbabilityEntry should be overwritten by flags in
|
||||||
|
// probabilityEntry.
|
||||||
|
const ProbabilityEntry unigramProbabilityEntry = getProbabilityEntry(wordId);
|
||||||
|
return WordAttributes(probability, unigramProbabilityEntry.isNotAWord(),
|
||||||
|
unigramProbabilityEntry.isBlacklisted(),
|
||||||
|
unigramProbabilityEntry.isPossiblyOffensive());
|
||||||
}
|
}
|
||||||
// Cannot find the word.
|
// Cannot find the word.
|
||||||
return NOT_A_PROBABILITY;
|
return WordAttributes();
|
||||||
}
|
}
|
||||||
|
|
||||||
ProbabilityEntry LanguageModelDictContent::getNgramProbabilityEntry(
|
ProbabilityEntry LanguageModelDictContent::getNgramProbabilityEntry(
|
||||||
|
|
|
@ -21,6 +21,7 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "defines.h"
|
#include "defines.h"
|
||||||
|
#include "suggest/core/dictionary/word_attributes.h"
|
||||||
#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h"
|
#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h"
|
||||||
#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h"
|
#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h"
|
||||||
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
|
#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
|
||||||
|
@ -128,7 +129,7 @@ class LanguageModelDictContent {
|
||||||
const LanguageModelDictContent *const originalContent,
|
const LanguageModelDictContent *const originalContent,
|
||||||
int *const outNgramCount);
|
int *const outNgramCount);
|
||||||
|
|
||||||
int getWordProbability(const WordIdArrayView prevWordIds, const int wordId,
|
const WordAttributes getWordAttributes(const WordIdArrayView prevWordIds, const int wordId,
|
||||||
const HeaderPolicy *const headerPolicy) const;
|
const HeaderPolicy *const headerPolicy) const;
|
||||||
|
|
||||||
ProbabilityEntry getProbabilityEntry(const int wordId) const {
|
ProbabilityEntry getProbabilityEntry(const int wordId) const {
|
||||||
|
|
|
@ -49,7 +49,9 @@ class ProbabilityEntry {
|
||||||
|
|
||||||
// Create from unigram property.
|
// Create from unigram property.
|
||||||
ProbabilityEntry(const UnigramProperty *const unigramProperty)
|
ProbabilityEntry(const UnigramProperty *const unigramProperty)
|
||||||
: mFlags(createFlags(unigramProperty->representsBeginningOfSentence())),
|
: mFlags(createFlags(unigramProperty->representsBeginningOfSentence(),
|
||||||
|
unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(),
|
||||||
|
unigramProperty->isPossiblyOffensive())),
|
||||||
mProbability(unigramProperty->getProbability()),
|
mProbability(unigramProperty->getProbability()),
|
||||||
mHistoricalInfo(unigramProperty->getTimestamp(), unigramProperty->getLevel(),
|
mHistoricalInfo(unigramProperty->getTimestamp(), unigramProperty->getLevel(),
|
||||||
unigramProperty->getCount()) {}
|
unigramProperty->getCount()) {}
|
||||||
|
@ -85,6 +87,18 @@ class ProbabilityEntry {
|
||||||
return (mFlags & Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE) != 0;
|
return (mFlags & Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE) != 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool isNotAWord() const {
|
||||||
|
return (mFlags & Ver4DictConstants::FLAG_NOT_A_WORD) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool isBlacklisted() const {
|
||||||
|
return (mFlags & Ver4DictConstants::FLAG_BLACKLISTED) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool isPossiblyOffensive() const {
|
||||||
|
return (mFlags & Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
uint64_t encode(const bool hasHistoricalInfo) const {
|
uint64_t encode(const bool hasHistoricalInfo) const {
|
||||||
uint64_t encodedEntry = static_cast<uint64_t>(mFlags);
|
uint64_t encodedEntry = static_cast<uint64_t>(mFlags);
|
||||||
if (hasHistoricalInfo) {
|
if (hasHistoricalInfo) {
|
||||||
|
@ -142,10 +156,20 @@ class ProbabilityEntry {
|
||||||
(encodedEntry >> (pos * CHAR_BIT)) & ((1ull << (size * CHAR_BIT)) - 1));
|
(encodedEntry >> (pos * CHAR_BIT)) & ((1ull << (size * CHAR_BIT)) - 1));
|
||||||
}
|
}
|
||||||
|
|
||||||
static uint8_t createFlags(const bool representsBeginningOfSentence) {
|
static uint8_t createFlags(const bool representsBeginningOfSentence,
|
||||||
|
const bool isNotAWord, const bool isBlacklisted, const bool isPossiblyOffensive) {
|
||||||
uint8_t flags = 0;
|
uint8_t flags = 0;
|
||||||
if (representsBeginningOfSentence) {
|
if (representsBeginningOfSentence) {
|
||||||
flags ^= Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE;
|
flags |= Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE;
|
||||||
|
}
|
||||||
|
if (isNotAWord) {
|
||||||
|
flags |= Ver4DictConstants::FLAG_NOT_A_WORD;
|
||||||
|
}
|
||||||
|
if (isBlacklisted) {
|
||||||
|
flags |= Ver4DictConstants::FLAG_BLACKLISTED;
|
||||||
|
}
|
||||||
|
if (isPossiblyOffensive) {
|
||||||
|
flags |= Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE;
|
||||||
}
|
}
|
||||||
return flags;
|
return flags;
|
||||||
}
|
}
|
||||||
|
|
|
@ -54,6 +54,9 @@ const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 1;
|
||||||
|
|
||||||
const uint8_t Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE = 0x1;
|
const uint8_t Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE = 0x1;
|
||||||
const uint8_t Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY = 0x2;
|
const uint8_t Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY = 0x2;
|
||||||
|
const uint8_t Ver4DictConstants::FLAG_NOT_A_WORD = 0x4;
|
||||||
|
const uint8_t Ver4DictConstants::FLAG_BLACKLISTED = 0x8;
|
||||||
|
const uint8_t Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE = 0x10;
|
||||||
|
|
||||||
const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64;
|
const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64;
|
||||||
const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE = 4;
|
const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE = 4;
|
||||||
|
|
|
@ -52,6 +52,9 @@ class Ver4DictConstants {
|
||||||
// Flags in probability entry.
|
// Flags in probability entry.
|
||||||
static const uint8_t FLAG_REPRESENTS_BEGINNING_OF_SENTENCE;
|
static const uint8_t FLAG_REPRESENTS_BEGINNING_OF_SENTENCE;
|
||||||
static const uint8_t FLAG_NOT_A_VALID_ENTRY;
|
static const uint8_t FLAG_NOT_A_VALID_ENTRY;
|
||||||
|
static const uint8_t FLAG_NOT_A_WORD;
|
||||||
|
static const uint8_t FLAG_BLACKLISTED;
|
||||||
|
static const uint8_t FLAG_POSSIBLY_OFFENSIVE;
|
||||||
|
|
||||||
static const int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE;
|
static const int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE;
|
||||||
static const int SHORTCUT_ADDRESS_TABLE_DATA_SIZE;
|
static const int SHORTCUT_ADDRESS_TABLE_DATA_SIZE;
|
||||||
|
|
|
@ -191,7 +191,6 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition(
|
||||||
ptNodeWritingPos);
|
ptNodeWritingPos);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition(
|
bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition(
|
||||||
const PtNodeParams *const ptNodeParams, const UnigramProperty *const unigramProperty,
|
const PtNodeParams *const ptNodeParams, const UnigramProperty *const unigramProperty,
|
||||||
int *const ptNodeWritingPos) {
|
int *const ptNodeWritingPos) {
|
||||||
|
@ -341,8 +340,8 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition(
|
||||||
ptNodeParams->getChildrenPos(), ptNodeWritingPos)) {
|
ptNodeParams->getChildrenPos(), ptNodeWritingPos)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return updatePtNodeFlags(nodePos, ptNodeParams->isBlacklisted(), ptNodeParams->isNotAWord(),
|
return updatePtNodeFlags(nodePos, isTerminal,
|
||||||
isTerminal, ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
|
ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: Move probability handling code to LanguageModelDictContent.
|
// TODO: Move probability handling code to LanguageModelDictContent.
|
||||||
|
@ -361,14 +360,13 @@ const ProbabilityEntry Ver4PatriciaTrieNodeWriter::createUpdatedEntryFrom(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos,
|
bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos, const bool isTerminal,
|
||||||
const bool isBlacklisted, const bool isNotAWord, const bool isTerminal,
|
|
||||||
const bool hasMultipleChars) {
|
const bool hasMultipleChars) {
|
||||||
// Create node flags and write them.
|
// Create node flags and write them.
|
||||||
PatriciaTrieReadingUtils::NodeFlags nodeFlags =
|
PatriciaTrieReadingUtils::NodeFlags nodeFlags =
|
||||||
PatriciaTrieReadingUtils::createAndGetFlags(isBlacklisted, isNotAWord, isTerminal,
|
PatriciaTrieReadingUtils::createAndGetFlags(false /* isNotAWord */,
|
||||||
false /* hasShortcutTargets */, false /* hasBigrams */, hasMultipleChars,
|
false /* isBlacklisted */, isTerminal, false /* hasShortcutTargets */,
|
||||||
CHILDREN_POSITION_FIELD_SIZE);
|
false /* hasBigrams */, hasMultipleChars, CHILDREN_POSITION_FIELD_SIZE);
|
||||||
if (!DynamicPtWritingUtils::writeFlags(mTrieBuffer, nodeFlags, ptNodePos)) {
|
if (!DynamicPtWritingUtils::writeFlags(mTrieBuffer, nodeFlags, ptNodePos)) {
|
||||||
AKLOGE("Cannot write PtNode flags. flags: %x, pos: %d", nodeFlags, ptNodePos);
|
AKLOGE("Cannot write PtNode flags. flags: %x, pos: %d", nodeFlags, ptNodePos);
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -103,8 +103,7 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
|
||||||
const ProbabilityEntry *const originalProbabilityEntry,
|
const ProbabilityEntry *const originalProbabilityEntry,
|
||||||
const ProbabilityEntry *const probabilityEntry) const;
|
const ProbabilityEntry *const probabilityEntry) const;
|
||||||
|
|
||||||
bool updatePtNodeFlags(const int ptNodePos, const bool isBlacklisted, const bool isNotAWord,
|
bool updatePtNodeFlags(const int ptNodePos, const bool isTerminal, const bool hasMultipleChars);
|
||||||
const bool isTerminal, const bool hasMultipleChars);
|
|
||||||
|
|
||||||
static const int CHILDREN_POSITION_FIELD_SIZE;
|
static const int CHILDREN_POSITION_FIELD_SIZE;
|
||||||
|
|
||||||
|
|
|
@ -63,14 +63,10 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d
|
||||||
// valid terminal DicNode.
|
// valid terminal DicNode.
|
||||||
isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY;
|
isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY;
|
||||||
}
|
}
|
||||||
readingHelper.readNextSiblingNode(ptNodeParams);
|
|
||||||
if (ptNodeParams.representsNonWordInfo()) {
|
|
||||||
// Skip PtNodes that represent non-word information.
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
const int wordId = isTerminal ? ptNodeParams.getTerminalId() : NOT_A_WORD_ID;
|
const int wordId = isTerminal ? ptNodeParams.getTerminalId() : NOT_A_WORD_ID;
|
||||||
childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getChildrenPos(),
|
childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getChildrenPos(),
|
||||||
wordId, ptNodeParams.getCodePointArrayView());
|
wordId, ptNodeParams.getCodePointArrayView());
|
||||||
|
readingHelper.readNextSiblingNode(ptNodeParams);
|
||||||
}
|
}
|
||||||
if (readingHelper.isError()) {
|
if (readingHelper.isError()) {
|
||||||
mIsCorrupted = true;
|
mIsCorrupted = true;
|
||||||
|
@ -117,13 +113,8 @@ const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext(
|
||||||
if (wordId == NOT_A_WORD_ID) {
|
if (wordId == NOT_A_WORD_ID) {
|
||||||
return WordAttributes();
|
return WordAttributes();
|
||||||
}
|
}
|
||||||
const int ptNodePos =
|
return mBuffers->getLanguageModelDictContent()->getWordAttributes(prevWordIds, wordId,
|
||||||
mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
|
mHeaderPolicy);
|
||||||
const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
|
|
||||||
const int probability = mBuffers->getLanguageModelDictContent()->getWordProbability(
|
|
||||||
prevWordIds, wordId, mHeaderPolicy);
|
|
||||||
return WordAttributes(probability, ptNodeParams.isBlacklisted(), ptNodeParams.isNotAWord(),
|
|
||||||
probability == 0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
|
int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
|
||||||
|
@ -131,15 +122,10 @@ int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordI
|
||||||
if (wordId == NOT_A_WORD_ID || prevWordIds.contains(NOT_A_WORD_ID)) {
|
if (wordId == NOT_A_WORD_ID || prevWordIds.contains(NOT_A_WORD_ID)) {
|
||||||
return NOT_A_PROBABILITY;
|
return NOT_A_PROBABILITY;
|
||||||
}
|
}
|
||||||
const int ptNodePos =
|
|
||||||
mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
|
|
||||||
const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
|
|
||||||
if (ptNodeParams.isDeleted() || ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord()) {
|
|
||||||
return NOT_A_PROBABILITY;
|
|
||||||
}
|
|
||||||
const ProbabilityEntry probabilityEntry =
|
const ProbabilityEntry probabilityEntry =
|
||||||
mBuffers->getLanguageModelDictContent()->getNgramProbabilityEntry(prevWordIds, wordId);
|
mBuffers->getLanguageModelDictContent()->getNgramProbabilityEntry(prevWordIds, wordId);
|
||||||
if (!probabilityEntry.isValid()) {
|
if (!probabilityEntry.isValid() || probabilityEntry.isBlacklisted()
|
||||||
|
|| probabilityEntry.isNotAWord()) {
|
||||||
return NOT_A_PROBABILITY;
|
return NOT_A_PROBABILITY;
|
||||||
}
|
}
|
||||||
if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
|
if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
|
||||||
|
@ -511,10 +497,10 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
|
||||||
shortcuts.emplace_back(&target, shortcutProbability);
|
shortcuts.emplace_back(&target, shortcutProbability);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
|
const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(),
|
||||||
ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
|
probabilityEntry.isNotAWord(), probabilityEntry.isBlacklisted(),
|
||||||
historicalInfo->getTimeStamp(), historicalInfo->getLevel(),
|
probabilityEntry.getProbability(), historicalInfo->getTimeStamp(),
|
||||||
historicalInfo->getCount(), &shortcuts);
|
historicalInfo->getLevel(), historicalInfo->getCount(), &shortcuts);
|
||||||
return WordProperty(&codePointVector, &unigramProperty, &bigrams);
|
return WordProperty(&codePointVector, &unigramProperty, &bigrams);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue