Merge "Renaming "blacklist" flag to "possibly offensive""
commit
c51b9b5b3f
|
@ -69,7 +69,7 @@ public final class BinaryDictionary extends Dictionary {
|
|||
// Format to get unigram flags from native side via getWordPropertyNative().
|
||||
private static final int FORMAT_WORD_PROPERTY_OUTPUT_FLAG_COUNT = 5;
|
||||
private static final int FORMAT_WORD_PROPERTY_IS_NOT_A_WORD_INDEX = 0;
|
||||
private static final int FORMAT_WORD_PROPERTY_IS_BLACKLISTED_INDEX = 1;
|
||||
private static final int FORMAT_WORD_PROPERTY_IS_POSSIBLY_OFFENSIVE_INDEX = 1;
|
||||
private static final int FORMAT_WORD_PROPERTY_HAS_NGRAMS_INDEX = 2;
|
||||
private static final int FORMAT_WORD_PROPERTY_HAS_SHORTCUTS_INDEX = 3;
|
||||
private static final int FORMAT_WORD_PROPERTY_IS_BEGINNING_OF_SENTENCE_INDEX = 4;
|
||||
|
@ -195,7 +195,7 @@ public final class BinaryDictionary extends Dictionary {
|
|||
float[] inOutWeightOfLangModelVsSpatialModel);
|
||||
private static native boolean addUnigramEntryNative(long dict, int[] word, int probability,
|
||||
int[] shortcutTarget, int shortcutProbability, boolean isBeginningOfSentence,
|
||||
boolean isNotAWord, boolean isBlacklisted, int timestamp);
|
||||
boolean isNotAWord, boolean isPossiblyOffensive, int timestamp);
|
||||
private static native boolean removeUnigramEntryNative(long dict, int[] word);
|
||||
private static native boolean addNgramEntryNative(long dict,
|
||||
int[][] prevWordCodePointArrays, boolean[] isBeginningOfSentenceArray,
|
||||
|
@ -402,7 +402,7 @@ public final class BinaryDictionary extends Dictionary {
|
|||
outNgramProbabilityInfo, outShortcutTargets, outShortcutProbabilities);
|
||||
return new WordProperty(codePoints,
|
||||
outFlags[FORMAT_WORD_PROPERTY_IS_NOT_A_WORD_INDEX],
|
||||
outFlags[FORMAT_WORD_PROPERTY_IS_BLACKLISTED_INDEX],
|
||||
outFlags[FORMAT_WORD_PROPERTY_IS_POSSIBLY_OFFENSIVE_INDEX],
|
||||
outFlags[FORMAT_WORD_PROPERTY_HAS_NGRAMS_INDEX],
|
||||
outFlags[FORMAT_WORD_PROPERTY_HAS_SHORTCUTS_INDEX],
|
||||
outFlags[FORMAT_WORD_PROPERTY_IS_BEGINNING_OF_SENTENCE_INDEX], outProbabilityInfo,
|
||||
|
@ -439,7 +439,7 @@ public final class BinaryDictionary extends Dictionary {
|
|||
public boolean addUnigramEntry(final String word, final int probability,
|
||||
final String shortcutTarget, final int shortcutProbability,
|
||||
final boolean isBeginningOfSentence, final boolean isNotAWord,
|
||||
final boolean isBlacklisted, final int timestamp) {
|
||||
final boolean isPossiblyOffensive, final int timestamp) {
|
||||
if (word == null || (word.isEmpty() && !isBeginningOfSentence)) {
|
||||
return false;
|
||||
}
|
||||
|
@ -447,7 +447,8 @@ public final class BinaryDictionary extends Dictionary {
|
|||
final int[] shortcutTargetCodePoints = (shortcutTarget != null) ?
|
||||
StringUtils.toCodePointArray(shortcutTarget) : null;
|
||||
if (!addUnigramEntryNative(mNativeDict, codePoints, probability, shortcutTargetCodePoints,
|
||||
shortcutProbability, isBeginningOfSentence, isNotAWord, isBlacklisted, timestamp)) {
|
||||
shortcutProbability, isBeginningOfSentence, isNotAWord, isPossiblyOffensive,
|
||||
timestamp)) {
|
||||
return false;
|
||||
}
|
||||
mHasUpdated = true;
|
||||
|
|
|
@ -137,7 +137,7 @@ public class ContactsBinaryDictionary extends ExpandableBinaryDictionary {
|
|||
}
|
||||
runGCIfRequiredLocked(true /* mindsBlockByGC */);
|
||||
addUnigramLocked(word, FREQUENCY_FOR_CONTACTS, null /* shortcut */,
|
||||
0 /* shortcutFreq */, false /* isNotAWord */, false /* isBlacklisted */,
|
||||
0 /* shortcutFreq */, false /* isNotAWord */, false /* isPossiblyOffensive */,
|
||||
BinaryDictionary.NOT_A_VALID_TIMESTAMP);
|
||||
}
|
||||
}
|
||||
|
@ -238,7 +238,8 @@ public class ContactsBinaryDictionary extends ExpandableBinaryDictionary {
|
|||
runGCIfRequiredLocked(true /* mindsBlockByGC */);
|
||||
addUnigramLocked(word, FREQUENCY_FOR_CONTACTS,
|
||||
null /* shortcut */, 0 /* shortcutFreq */, false /* isNotAWord */,
|
||||
false /* isBlacklisted */, BinaryDictionary.NOT_A_VALID_TIMESTAMP);
|
||||
false /* isPossiblyOffensive */,
|
||||
BinaryDictionary.NOT_A_VALID_TIMESTAMP);
|
||||
if (!ngramContext.isValid() && mUseFirstLastBigrams) {
|
||||
runGCIfRequiredLocked(true /* mindsBlockByGC */);
|
||||
addNgramEntryLocked(ngramContext, word, FREQUENCY_FOR_CONTACTS_BIGRAM,
|
||||
|
|
|
@ -809,7 +809,7 @@ public class DictionaryFacilitator {
|
|||
contextualDict.addUnigramEntryWithCheckingDistracter(
|
||||
subPhraseStr, probability, null /* shortcutTarget */,
|
||||
Dictionary.NOT_A_PROBABILITY /* shortcutFreq */,
|
||||
false /* isNotAWord */, false /* isBlacklisted */,
|
||||
false /* isNotAWord */, false /* isPossiblyOffensive */,
|
||||
BinaryDictionary.NOT_A_VALID_TIMESTAMP,
|
||||
DistracterFilter.EMPTY_DISTRACTER_FILTER);
|
||||
contextualDict.addNgramEntry(ngramContext, subPhraseStr,
|
||||
|
@ -819,7 +819,7 @@ public class DictionaryFacilitator {
|
|||
contextualDict.addUnigramEntryWithCheckingDistracter(
|
||||
phrase[i], probability, null /* shortcutTarget */,
|
||||
Dictionary.NOT_A_PROBABILITY /* shortcutFreq */,
|
||||
false /* isNotAWord */, false /* isBlacklisted */,
|
||||
false /* isNotAWord */, false /* isPossiblyOffensive */,
|
||||
BinaryDictionary.NOT_A_VALID_TIMESTAMP,
|
||||
DistracterFilter.EMPTY_DISTRACTER_FILTER);
|
||||
contextualDict.addNgramEntry(ngramContext, phrase[i],
|
||||
|
|
|
@ -316,22 +316,22 @@ abstract public class ExpandableBinaryDictionary extends Dictionary {
|
|||
*/
|
||||
public void addUnigramEntryWithCheckingDistracter(final String word, final int frequency,
|
||||
final String shortcutTarget, final int shortcutFreq, final boolean isNotAWord,
|
||||
final boolean isBlacklisted, final int timestamp,
|
||||
final boolean isPossiblyOffensive, final int timestamp,
|
||||
@Nonnull final DistracterFilter distracterFilter) {
|
||||
updateDictionaryWithWriteLockIfWordIsNotADistracter(new Runnable() {
|
||||
@Override
|
||||
public void run() {
|
||||
addUnigramLocked(word, frequency, shortcutTarget, shortcutFreq,
|
||||
isNotAWord, isBlacklisted, timestamp);
|
||||
isNotAWord, isPossiblyOffensive, timestamp);
|
||||
}
|
||||
}, word, distracterFilter);
|
||||
}
|
||||
|
||||
protected void addUnigramLocked(final String word, final int frequency,
|
||||
final String shortcutTarget, final int shortcutFreq, final boolean isNotAWord,
|
||||
final boolean isBlacklisted, final int timestamp) {
|
||||
final boolean isPossiblyOffensive, final int timestamp) {
|
||||
if (!mBinaryDictionary.addUnigramEntry(word, frequency, shortcutTarget, shortcutFreq,
|
||||
false /* isBeginningOfSentence */, isNotAWord, isBlacklisted, timestamp)) {
|
||||
false /* isBeginningOfSentence */, isNotAWord, isPossiblyOffensive, timestamp)) {
|
||||
Log.e(TAG, "Cannot add unigram entry. word: " + word);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -257,12 +257,14 @@ public class UserBinaryDictionary extends ExpandableBinaryDictionary {
|
|||
runGCIfRequiredLocked(true /* mindsBlockByGC */);
|
||||
addUnigramLocked(word, adjustedFrequency, null /* shortcutTarget */,
|
||||
0 /* shortcutFreq */, false /* isNotAWord */,
|
||||
false /* isBlacklisted */, BinaryDictionary.NOT_A_VALID_TIMESTAMP);
|
||||
false /* isPossiblyOffensive */,
|
||||
BinaryDictionary.NOT_A_VALID_TIMESTAMP);
|
||||
if (null != shortcut && shortcut.length() <= MAX_WORD_LENGTH) {
|
||||
runGCIfRequiredLocked(true /* mindsBlockByGC */);
|
||||
addUnigramLocked(shortcut, adjustedFrequency, word,
|
||||
USER_DICT_SHORTCUT_FREQUENCY, true /* isNotAWord */,
|
||||
false /* isBlacklisted */, BinaryDictionary.NOT_A_VALID_TIMESTAMP);
|
||||
false /* isPossiblyOffensive */,
|
||||
BinaryDictionary.NOT_A_VALID_TIMESTAMP);
|
||||
}
|
||||
}
|
||||
cursor.moveToNext();
|
||||
|
|
|
@ -93,7 +93,7 @@ public final class FormatSpec {
|
|||
* s | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS
|
||||
* | has bigrams ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_BIGRAMS
|
||||
* | is not a word ? 1 bit, 1 = yes, 0 = no : FLAG_IS_NOT_A_WORD
|
||||
* | is blacklisted ? 1 bit, 1 = yes, 0 = no : FLAG_IS_BLACKLISTED
|
||||
* | is possibly offensive ? 1 bit, 1 = yes, 0 = no : FLAG_IS_POSSIBLY_OFFENSIVE
|
||||
*
|
||||
* c | IF FLAG_HAS_MULTIPLE_CHARS
|
||||
* h | char, char, char, char n * (1 or 3 bytes) : use PtNodeInfo for i/o helpers
|
||||
|
@ -197,7 +197,7 @@ public final class FormatSpec {
|
|||
static final int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
|
||||
static final int FLAG_HAS_BIGRAMS = 0x04;
|
||||
static final int FLAG_IS_NOT_A_WORD = 0x02;
|
||||
static final int FLAG_IS_BLACKLISTED = 0x01;
|
||||
static final int FLAG_IS_POSSIBLY_OFFENSIVE = 0x01;
|
||||
|
||||
static final int FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT = 0x80;
|
||||
static final int FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE = 0x40;
|
||||
|
|
|
@ -41,7 +41,7 @@ public final class WordProperty implements Comparable<WordProperty> {
|
|||
// TODO: Support mIsBeginningOfSentence.
|
||||
public final boolean mIsBeginningOfSentence;
|
||||
public final boolean mIsNotAWord;
|
||||
public final boolean mIsBlacklistEntry;
|
||||
public final boolean mIsPossiblyOffensive;
|
||||
public final boolean mHasShortcuts;
|
||||
public final boolean mHasNgrams;
|
||||
|
||||
|
@ -52,7 +52,7 @@ public final class WordProperty implements Comparable<WordProperty> {
|
|||
public WordProperty(final String word, final ProbabilityInfo probabilityInfo,
|
||||
final ArrayList<WeightedString> shortcutTargets,
|
||||
@Nullable final ArrayList<WeightedString> bigrams,
|
||||
final boolean isNotAWord, final boolean isBlacklistEntry) {
|
||||
final boolean isNotAWord, final boolean isPossiblyOffensive) {
|
||||
mWord = word;
|
||||
mProbabilityInfo = probabilityInfo;
|
||||
mShortcutTargets = shortcutTargets;
|
||||
|
@ -69,7 +69,7 @@ public final class WordProperty implements Comparable<WordProperty> {
|
|||
}
|
||||
mIsBeginningOfSentence = false;
|
||||
mIsNotAWord = isNotAWord;
|
||||
mIsBlacklistEntry = isBlacklistEntry;
|
||||
mIsPossiblyOffensive = isPossiblyOffensive;
|
||||
mHasNgrams = bigrams != null && !bigrams.isEmpty();
|
||||
mHasShortcuts = shortcutTargets != null && !shortcutTargets.isEmpty();
|
||||
}
|
||||
|
@ -85,7 +85,7 @@ public final class WordProperty implements Comparable<WordProperty> {
|
|||
// Construct word property using information from native code.
|
||||
// This represents invalid word when the probability is BinaryDictionary.NOT_A_PROBABILITY.
|
||||
public WordProperty(final int[] codePoints, final boolean isNotAWord,
|
||||
final boolean isBlacklisted, final boolean hasBigram, final boolean hasShortcuts,
|
||||
final boolean isPossiblyOffensive, final boolean hasBigram, final boolean hasShortcuts,
|
||||
final boolean isBeginningOfSentence, final int[] probabilityInfo,
|
||||
final ArrayList<int[][]> ngramPrevWordsArray,
|
||||
final ArrayList<boolean[]> outNgramPrevWordIsBeginningOfSentenceArray,
|
||||
|
@ -98,7 +98,7 @@ public final class WordProperty implements Comparable<WordProperty> {
|
|||
final ArrayList<NgramProperty> ngrams = new ArrayList<>();
|
||||
mIsBeginningOfSentence = isBeginningOfSentence;
|
||||
mIsNotAWord = isNotAWord;
|
||||
mIsBlacklistEntry = isBlacklisted;
|
||||
mIsPossiblyOffensive = isPossiblyOffensive;
|
||||
mHasShortcuts = hasShortcuts;
|
||||
mHasNgrams = hasBigram;
|
||||
|
||||
|
@ -150,7 +150,7 @@ public final class WordProperty implements Comparable<WordProperty> {
|
|||
word.mShortcutTargets,
|
||||
word.mNgrams,
|
||||
word.mIsNotAWord,
|
||||
word.mIsBlacklistEntry
|
||||
word.mIsPossiblyOffensive
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -180,7 +180,7 @@ public final class WordProperty implements Comparable<WordProperty> {
|
|||
WordProperty w = (WordProperty)o;
|
||||
return mProbabilityInfo.equals(w.mProbabilityInfo) && mWord.equals(w.mWord)
|
||||
&& mShortcutTargets.equals(w.mShortcutTargets) && equals(mNgrams, w.mNgrams)
|
||||
&& mIsNotAWord == w.mIsNotAWord && mIsBlacklistEntry == w.mIsBlacklistEntry
|
||||
&& mIsNotAWord == w.mIsNotAWord && mIsPossiblyOffensive == w.mIsPossiblyOffensive
|
||||
// The shortcut flag must match on both sides, exactly like the n-gram flag.
&& mHasNgrams == w.mHasNgrams && mHasShortcuts == w.mHasShortcuts;
|
||||
}
|
||||
|
||||
|
|
|
@ -63,7 +63,7 @@ public class CombinedFormatUtils {
|
|||
if (wordProperty.mIsNotAWord) {
|
||||
builder.append("," + NOT_A_WORD_TAG + "=true");
|
||||
}
|
||||
if (wordProperty.mIsBlacklistEntry) {
|
||||
if (wordProperty.mIsPossiblyOffensive) {
|
||||
builder.append("," + BLACKLISTED_TAG + "=true");
|
||||
}
|
||||
builder.append("\n");
|
||||
|
|
|
@ -54,7 +54,7 @@ public final class LanguageModelParam {
|
|||
public final int mBigramProbability;
|
||||
public final int mShortcutProbability;
|
||||
public final boolean mIsNotAWord;
|
||||
public final boolean mIsBlacklisted;
|
||||
public final boolean mIsPossiblyOffensive;
|
||||
// Time stamp in seconds.
|
||||
public final int mTimestamp;
|
||||
|
||||
|
@ -78,7 +78,7 @@ public final class LanguageModelParam {
|
|||
mBigramProbability = bigramProbability;
|
||||
mShortcutProbability = Dictionary.NOT_A_PROBABILITY;
|
||||
mIsNotAWord = false;
|
||||
mIsBlacklisted = false;
|
||||
mIsPossiblyOffensive = false;
|
||||
mTimestamp = timestamp;
|
||||
}
|
||||
|
||||
|
|
|
@ -358,7 +358,7 @@ static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz,
|
|||
|
||||
static bool latinime_BinaryDictionary_addUnigramEntry(JNIEnv *env, jclass clazz, jlong dict,
|
||||
jintArray word, jint probability, jintArray shortcutTarget, jint shortcutProbability,
|
||||
jboolean isBeginningOfSentence, jboolean isNotAWord, jboolean isBlacklisted,
|
||||
jboolean isBeginningOfSentence, jboolean isNotAWord, jboolean isPossiblyOffensive,
|
||||
jint timestamp) {
|
||||
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
|
||||
if (!dictionary) {
|
||||
|
@ -377,8 +377,8 @@ static bool latinime_BinaryDictionary_addUnigramEntry(JNIEnv *env, jclass clazz,
|
|||
}
|
||||
// Use 1 for count to indicate the word has been input.
|
||||
const UnigramProperty unigramProperty(isBeginningOfSentence, isNotAWord,
|
||||
isBlacklisted, probability, HistoricalInfo(timestamp, 0 /* level */, 1 /* count */),
|
||||
std::move(shortcuts));
|
||||
isPossiblyOffensive, probability, HistoricalInfo(timestamp, 0 /* level */,
|
||||
1 /* count */), std::move(shortcuts));
|
||||
return dictionary->addUnigramEntry(CodePointArrayView(codePoints, codePointCount),
|
||||
&unigramProperty);
|
||||
}
|
||||
|
@ -480,8 +480,8 @@ static int latinime_BinaryDictionary_addMultipleDictionaryEntries(JNIEnv *env, j
|
|||
env->GetFieldID(languageModelParamClass, "mShortcutProbability", "I");
|
||||
jfieldID isNotAWordFieldId =
|
||||
env->GetFieldID(languageModelParamClass, "mIsNotAWord", "Z");
|
||||
jfieldID isBlacklistedFieldId =
|
||||
env->GetFieldID(languageModelParamClass, "mIsBlacklisted", "Z");
|
||||
jfieldID isPossiblyOffensiveFieldId =
|
||||
env->GetFieldID(languageModelParamClass, "mIsPossiblyOffensive", "Z");
|
||||
env->DeleteLocalRef(languageModelParamClass);
|
||||
|
||||
for (int i = startIndex; i < languageModelParamCount; ++i) {
|
||||
|
@ -504,7 +504,8 @@ static int latinime_BinaryDictionary_addMultipleDictionaryEntries(JNIEnv *env, j
|
|||
jint unigramProbability = env->GetIntField(languageModelParam, unigramProbabilityFieldId);
|
||||
jint timestamp = env->GetIntField(languageModelParam, timestampFieldId);
|
||||
jboolean isNotAWord = env->GetBooleanField(languageModelParam, isNotAWordFieldId);
|
||||
jboolean isBlacklisted = env->GetBooleanField(languageModelParam, isBlacklistedFieldId);
|
||||
jboolean isPossiblyOffensive = env->GetBooleanField(languageModelParam,
|
||||
isPossiblyOffensiveFieldId);
|
||||
jintArray shortcutTarget = static_cast<jintArray>(
|
||||
env->GetObjectField(languageModelParam, shortcutTargetFieldId));
|
||||
std::vector<UnigramProperty::ShortcutProperty> shortcuts;
|
||||
|
@ -519,7 +520,7 @@ static int latinime_BinaryDictionary_addMultipleDictionaryEntries(JNIEnv *env, j
|
|||
}
|
||||
// Use 1 for count to indicate the word has been input.
|
||||
const UnigramProperty unigramProperty(false /* isBeginningOfSentence */, isNotAWord,
|
||||
isBlacklisted, unigramProbability,
|
||||
isPossiblyOffensive, unigramProbability,
|
||||
HistoricalInfo(timestamp, 0 /* level */, 1 /* count */), std::move(shortcuts));
|
||||
dictionary->addUnigramEntry(CodePointArrayView(word1CodePoints, word1Length),
|
||||
&unigramProperty);
|
||||
|
|
|
@ -49,21 +49,44 @@ class UnigramProperty {
|
|||
};
|
||||
|
||||
UnigramProperty()
|
||||
: mRepresentsBeginningOfSentence(false), mIsNotAWord(false), mIsBlacklisted(false),
|
||||
mProbability(NOT_A_PROBABILITY), mHistoricalInfo(), mShortcuts() {}
|
||||
: mRepresentsBeginningOfSentence(false), mIsNotAWord(false),
|
||||
mIsBlacklisted(false), mIsPossiblyOffensive(false), mProbability(NOT_A_PROBABILITY),
|
||||
mHistoricalInfo(), mShortcuts() {}
|
||||
|
||||
// In contexts which do not support the Blacklisted flag (v2, v4<403)
|
||||
UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
|
||||
const bool isBlacklisted, const int probability, const HistoricalInfo historicalInfo,
|
||||
const std::vector<ShortcutProperty> &&shortcuts)
|
||||
const bool isPossiblyOffensive, const int probability,
|
||||
const HistoricalInfo historicalInfo, const std::vector<ShortcutProperty> &&shortcuts)
|
||||
: mRepresentsBeginningOfSentence(representsBeginningOfSentence),
|
||||
mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), mProbability(probability),
|
||||
mIsNotAWord(isNotAWord), mIsBlacklisted(false),
|
||||
mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability),
|
||||
mHistoricalInfo(historicalInfo), mShortcuts(std::move(shortcuts)) {}
|
||||
|
||||
// Without shortcuts.
|
||||
// Without shortcuts, in contexts which do not support the Blacklisted flag (v2, v4<403)
|
||||
UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
|
||||
const bool isBlacklisted, const int probability, const HistoricalInfo historicalInfo)
|
||||
const bool isPossiblyOffensive, const int probability,
|
||||
const HistoricalInfo historicalInfo)
|
||||
: mRepresentsBeginningOfSentence(representsBeginningOfSentence),
|
||||
mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), mProbability(probability),
|
||||
mIsNotAWord(isNotAWord), mIsBlacklisted(false),
|
||||
mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability),
|
||||
mHistoricalInfo(historicalInfo), mShortcuts() {}
|
||||
|
||||
// In contexts which DO support the Blacklisted flag (v403)
|
||||
UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
|
||||
const bool isBlacklisted, const bool isPossiblyOffensive, const int probability,
|
||||
const HistoricalInfo historicalInfo, const std::vector<ShortcutProperty> &&shortcuts)
|
||||
: mRepresentsBeginningOfSentence(representsBeginningOfSentence),
|
||||
mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted),
|
||||
mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability),
|
||||
mHistoricalInfo(historicalInfo), mShortcuts(std::move(shortcuts)) {}
|
||||
|
||||
// Without shortcuts, in contexts which DO support the Blacklisted flag (v403)
|
||||
UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
|
||||
const bool isBlacklisted, const bool isPossiblyOffensive, const int probability,
|
||||
const HistoricalInfo historicalInfo)
|
||||
: mRepresentsBeginningOfSentence(representsBeginningOfSentence),
|
||||
mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted),
|
||||
mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability),
|
||||
mHistoricalInfo(historicalInfo), mShortcuts() {}
|
||||
|
||||
bool representsBeginningOfSentence() const {
|
||||
|
@ -74,13 +97,12 @@ class UnigramProperty {
|
|||
return mIsNotAWord;
|
||||
}
|
||||
|
||||
bool isBlacklisted() const {
|
||||
return mIsBlacklisted;
|
||||
bool isPossiblyOffensive() const {
|
||||
return mIsPossiblyOffensive;
|
||||
}
|
||||
|
||||
bool isPossiblyOffensive() const {
|
||||
// TODO: Have dedicated flag.
|
||||
return mProbability == 0;
|
||||
bool isBlacklisted() const {
|
||||
return mIsBlacklisted;
|
||||
}
|
||||
|
||||
bool hasShortcuts() const {
|
||||
|
@ -106,6 +128,7 @@ class UnigramProperty {
|
|||
const bool mRepresentsBeginningOfSentence;
|
||||
const bool mIsNotAWord;
|
||||
const bool mIsBlacklisted;
|
||||
const bool mIsPossiblyOffensive;
|
||||
const int mProbability;
|
||||
const HistoricalInfo mHistoricalInfo;
|
||||
const std::vector<ShortcutProperty> mShortcuts;
|
||||
|
|
|
@ -28,7 +28,7 @@ void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints,
|
|||
JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */,
|
||||
MAX_WORD_LENGTH /* maxLength */, mCodePoints.data(), mCodePoints.size(),
|
||||
false /* needsNullTermination */);
|
||||
jboolean flags[] = {mUnigramProperty.isNotAWord(), mUnigramProperty.isBlacklisted(),
|
||||
jboolean flags[] = {mUnigramProperty.isNotAWord(), mUnigramProperty.isPossiblyOffensive(),
|
||||
!mNgrams.empty(), mUnigramProperty.hasShortcuts(),
|
||||
mUnigramProperty.representsBeginningOfSentence()};
|
||||
env->SetBooleanArrayRegion(outFlags, 0 /* start */, NELEMS(flags), flags);
|
||||
|
|
|
@ -43,6 +43,14 @@ class WordAttributes {
|
|||
return mIsNotAWord;
|
||||
}
|
||||
|
||||
// Whether or not a word is possibly offensive.
|
||||
// * Static dictionaries <=v202, as well as dynamic dictionaries <v403, will set this based on
|
||||
// whether or not the probability of the word is zero.
|
||||
// * Static dictionaries >=v203 will set this based on the IS_POSSIBLY_OFFENSIVE PtNode flag.
|
||||
// * Dynamic dictionaries >=v403 will set this based on the IS_POSSIBLY_OFFENSIVE language model
|
||||
// flag (the PtNode flag IS_BLACKLISTED is ignored and kept as zero)
|
||||
//
|
||||
// See the ::getWordAttributes function for each of these dictionary policies for more details.
|
||||
bool isPossiblyOffensive() const {
|
||||
return mIsPossiblyOffensive;
|
||||
}
|
||||
|
|
|
@ -245,7 +245,7 @@ bool Ver4PatriciaTrieNodeWriter::addNgramEntry(const WordIdArrayView prevWordIds
|
|||
if (!sourcePtNodeParams.hasBigrams()) {
|
||||
// Update has bigrams flag.
|
||||
return updatePtNodeFlags(sourcePtNodeParams.getHeadPos(),
|
||||
sourcePtNodeParams.isBlacklisted(), sourcePtNodeParams.isNotAWord(),
|
||||
sourcePtNodeParams.isPossiblyOffensive(), sourcePtNodeParams.isNotAWord(),
|
||||
sourcePtNodeParams.isTerminal(), sourcePtNodeParams.hasShortcutTargets(),
|
||||
true /* hasBigrams */,
|
||||
sourcePtNodeParams.getCodePointCount() > 1 /* hasMultipleChars */);
|
||||
|
@ -316,7 +316,7 @@ bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptN
|
|||
if (!ptNodeParams->hasShortcutTargets()) {
|
||||
// Update has shortcut targets flag.
|
||||
return updatePtNodeFlags(ptNodeParams->getHeadPos(),
|
||||
ptNodeParams->isBlacklisted(), ptNodeParams->isNotAWord(),
|
||||
ptNodeParams->isPossiblyOffensive(), ptNodeParams->isNotAWord(),
|
||||
ptNodeParams->isTerminal(), true /* hasShortcutTargets */,
|
||||
ptNodeParams->hasBigrams(),
|
||||
ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
|
||||
|
@ -330,7 +330,7 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeHasBigramsAndShortcutTargetsFlags(
|
|||
ptNodeParams->getTerminalId()) != NOT_A_DICT_POS;
|
||||
const bool hasShortcutTargets = mBuffers->getShortcutDictContent()->getShortcutListHeadPos(
|
||||
ptNodeParams->getTerminalId()) != NOT_A_DICT_POS;
|
||||
return updatePtNodeFlags(ptNodeParams->getHeadPos(), ptNodeParams->isBlacklisted(),
|
||||
return updatePtNodeFlags(ptNodeParams->getHeadPos(), ptNodeParams->isPossiblyOffensive(),
|
||||
ptNodeParams->isNotAWord(), ptNodeParams->isTerminal(), hasShortcutTargets,
|
||||
hasBigrams, ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
|
||||
}
|
||||
|
@ -386,8 +386,9 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition(
|
|||
ptNodeParams->getChildrenPos(), ptNodeWritingPos)) {
|
||||
return false;
|
||||
}
|
||||
return updatePtNodeFlags(nodePos, ptNodeParams->isBlacklisted(), ptNodeParams->isNotAWord(),
|
||||
isTerminal, ptNodeParams->hasShortcutTargets(), ptNodeParams->hasBigrams(),
|
||||
return updatePtNodeFlags(nodePos, ptNodeParams->isPossiblyOffensive(),
|
||||
ptNodeParams->isNotAWord(), isTerminal, ptNodeParams->hasShortcutTargets(),
|
||||
ptNodeParams->hasBigrams(),
|
||||
ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
|
||||
}
|
||||
|
||||
|
|
|
@ -608,8 +608,8 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
|
|||
}
|
||||
}
|
||||
const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
|
||||
ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
|
||||
*historicalInfo, std::move(shortcuts));
|
||||
ptNodeParams.isNotAWord(), ptNodeParams.isPossiblyOffensive(),
|
||||
ptNodeParams.getProbability(), *historicalInfo, std::move(shortcuts));
|
||||
return WordProperty(wordCodePoints.toVector(), &unigramProperty, &ngrams);
|
||||
}
|
||||
|
||||
|
|
|
@ -146,7 +146,7 @@ bool DynamicPtUpdatingHelper::setPtNodeProbability(const PtNodeParams *const ori
|
|||
const int movedPos = mBuffer->getTailPosition();
|
||||
int writingPos = movedPos;
|
||||
const PtNodeParams ptNodeParamsToWrite(getUpdatedPtNodeParams(originalPtNodeParams,
|
||||
unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(),
|
||||
unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
|
||||
true /* isTerminal */, originalPtNodeParams->getParentPos(),
|
||||
originalPtNodeParams->getCodePointArrayView(), unigramProperty->getProbability()));
|
||||
if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
|
||||
|
@ -180,8 +180,9 @@ bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode(
|
|||
return false;
|
||||
}
|
||||
const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
|
||||
unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), true /* isTerminal */,
|
||||
parentPtNodePos, ptNodeCodePoints, unigramProperty->getProbability()));
|
||||
unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
|
||||
true /* isTerminal */, parentPtNodePos, ptNodeCodePoints,
|
||||
unigramProperty->getProbability()));
|
||||
if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
|
||||
unigramProperty, &writingPos)) {
|
||||
return false;
|
||||
|
@ -214,7 +215,7 @@ bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes(
|
|||
reallocatingPtNodeParams->getCodePointArrayView().limit(overlappingCodePointCount);
|
||||
if (addsExtraChild) {
|
||||
const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
|
||||
false /* isNotAWord */, false /* isBlacklisted */, false /* isTerminal */,
|
||||
false /* isNotAWord */, false /* isPossiblyOffensive */, false /* isTerminal */,
|
||||
reallocatingPtNodeParams->getParentPos(), firstPtNodeCodePoints,
|
||||
NOT_A_PROBABILITY));
|
||||
if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&ptNodeParamsToWrite, &writingPos)) {
|
||||
|
@ -222,7 +223,7 @@ bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes(
|
|||
}
|
||||
} else {
|
||||
const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
|
||||
unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(),
|
||||
unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
|
||||
true /* isTerminal */, reallocatingPtNodeParams->getParentPos(),
|
||||
firstPtNodeCodePoints, unigramProperty->getProbability()));
|
||||
if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
|
||||
|
@ -240,7 +241,7 @@ bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes(
|
|||
// Write the 2nd part of the reallocating node.
|
||||
const int secondPartOfReallocatedPtNodePos = writingPos;
|
||||
const PtNodeParams childPartPtNodeParams(getUpdatedPtNodeParams(reallocatingPtNodeParams,
|
||||
reallocatingPtNodeParams->isNotAWord(), reallocatingPtNodeParams->isBlacklisted(),
|
||||
reallocatingPtNodeParams->isNotAWord(), reallocatingPtNodeParams->isPossiblyOffensive(),
|
||||
reallocatingPtNodeParams->isTerminal(), firstPartOfReallocatedPtNodePos,
|
||||
reallocatingPtNodeParams->getCodePointArrayView().skip(overlappingCodePointCount),
|
||||
reallocatingPtNodeParams->getProbability()));
|
||||
|
@ -249,7 +250,7 @@ bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes(
|
|||
}
|
||||
if (addsExtraChild) {
|
||||
const PtNodeParams extraChildPtNodeParams(getPtNodeParamsForNewPtNode(
|
||||
unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(),
|
||||
unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
|
||||
true /* isTerminal */, firstPartOfReallocatedPtNodePos,
|
||||
newPtNodeCodePoints.skip(overlappingCodePointCount),
|
||||
unigramProperty->getProbability()));
|
||||
|
@ -276,20 +277,20 @@ bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes(
|
|||
|
||||
const PtNodeParams DynamicPtUpdatingHelper::getUpdatedPtNodeParams(
|
||||
const PtNodeParams *const originalPtNodeParams, const bool isNotAWord,
|
||||
const bool isBlacklisted, const bool isTerminal, const int parentPos,
|
||||
const bool isPossiblyOffensive, const bool isTerminal, const int parentPos,
|
||||
const CodePointArrayView codePoints, const int probability) const {
|
||||
const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
|
||||
isBlacklisted, isNotAWord, isTerminal, false /* hasShortcutTargets */,
|
||||
isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */,
|
||||
false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */,
|
||||
CHILDREN_POSITION_FIELD_SIZE);
|
||||
return PtNodeParams(originalPtNodeParams, flags, parentPos, codePoints, probability);
|
||||
}
|
||||
|
||||
const PtNodeParams DynamicPtUpdatingHelper::getPtNodeParamsForNewPtNode(const bool isNotAWord,
|
||||
const bool isBlacklisted, const bool isTerminal, const int parentPos,
|
||||
const bool isPossiblyOffensive, const bool isTerminal, const int parentPos,
|
||||
const CodePointArrayView codePoints, const int probability) const {
|
||||
const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
|
||||
isBlacklisted, isNotAWord, isTerminal, false /* hasShortcutTargets */,
|
||||
isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */,
|
||||
false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */,
|
||||
CHILDREN_POSITION_FIELD_SIZE);
|
||||
return PtNodeParams(flags, parentPos, codePoints, probability);
|
||||
|
|
|
@ -85,12 +85,12 @@ class DynamicPtUpdatingHelper {
|
|||
const CodePointArrayView newPtNodeCodePoints);
|
||||
|
||||
const PtNodeParams getUpdatedPtNodeParams(const PtNodeParams *const originalPtNodeParams,
|
||||
const bool isNotAWord, const bool isBlacklisted, const bool isTerminal,
|
||||
const bool isNotAWord, const bool isPossiblyOffensive, const bool isTerminal,
|
||||
const int parentPos, const CodePointArrayView codePoints, const int probability) const;
|
||||
|
||||
const PtNodeParams getPtNodeParamsForNewPtNode(const bool isNotAWord, const bool isBlacklisted,
|
||||
const bool isTerminal, const int parentPos, const CodePointArrayView codePoints,
|
||||
const int probability) const;
|
||||
const PtNodeParams getPtNodeParamsForNewPtNode(const bool isNotAWord,
|
||||
const bool isPossiblyOffensive, const bool isTerminal, const int parentPos,
|
||||
const CodePointArrayView codePoints, const int probability) const;
|
||||
};
|
||||
} // namespace latinime
|
||||
#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_UPDATING_HELPER_H */
|
||||
|
|
|
@ -41,8 +41,8 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_SHORTCUT_TARGETS = 0x08
|
|||
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_BIGRAMS = 0x04;
|
||||
// Flag for non-words (typically, shortcut only entries)
|
||||
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_NOT_A_WORD = 0x02;
|
||||
// Flag for blacklist
|
||||
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
|
||||
// Flag for possibly offensive words
|
||||
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_POSSIBLY_OFFENSIVE = 0x01;
|
||||
|
||||
/* static */ int PtReadingUtils::getPtNodeArraySizeAndAdvancePosition(
|
||||
const uint8_t *const buffer, int *const pos) {
|
||||
|
|
|
@ -54,8 +54,8 @@ class PatriciaTrieReadingUtils {
|
|||
/**
|
||||
* Node Flags
|
||||
*/
|
||||
static AK_FORCE_INLINE bool isBlacklisted(const NodeFlags flags) {
|
||||
return (flags & FLAG_IS_BLACKLISTED) != 0;
|
||||
static AK_FORCE_INLINE bool isPossiblyOffensive(const NodeFlags flags) {
|
||||
return (flags & FLAG_IS_POSSIBLY_OFFENSIVE) != 0;
|
||||
}
|
||||
|
||||
static AK_FORCE_INLINE bool isNotAWord(const NodeFlags flags) {
|
||||
|
@ -82,12 +82,12 @@ class PatriciaTrieReadingUtils {
|
|||
return FLAG_CHILDREN_POSITION_TYPE_NOPOSITION != (MASK_CHILDREN_POSITION_TYPE & flags);
|
||||
}
|
||||
|
||||
static AK_FORCE_INLINE NodeFlags createAndGetFlags(const bool isBlacklisted,
|
||||
static AK_FORCE_INLINE NodeFlags createAndGetFlags(const bool isPossiblyOffensive,
|
||||
const bool isNotAWord, const bool isTerminal, const bool hasShortcutTargets,
|
||||
const bool hasBigrams, const bool hasMultipleChars,
|
||||
const int childrenPositionFieldSize) {
|
||||
NodeFlags nodeFlags = 0;
|
||||
nodeFlags = isBlacklisted ? (nodeFlags | FLAG_IS_BLACKLISTED) : nodeFlags;
|
||||
nodeFlags = isPossiblyOffensive ? (nodeFlags | FLAG_IS_POSSIBLY_OFFENSIVE) : nodeFlags;
|
||||
nodeFlags = isNotAWord ? (nodeFlags | FLAG_IS_NOT_A_WORD) : nodeFlags;
|
||||
nodeFlags = isTerminal ? (nodeFlags | FLAG_IS_TERMINAL) : nodeFlags;
|
||||
nodeFlags = hasShortcutTargets ? (nodeFlags | FLAG_HAS_SHORTCUT_TARGETS) : nodeFlags;
|
||||
|
@ -127,7 +127,7 @@ class PatriciaTrieReadingUtils {
|
|||
static const NodeFlags FLAG_HAS_SHORTCUT_TARGETS;
|
||||
static const NodeFlags FLAG_HAS_BIGRAMS;
|
||||
static const NodeFlags FLAG_IS_NOT_A_WORD;
|
||||
static const NodeFlags FLAG_IS_BLACKLISTED;
|
||||
static const NodeFlags FLAG_IS_POSSIBLY_OFFENSIVE;
|
||||
};
|
||||
} // namespace latinime
|
||||
#endif /* LATINIME_PATRICIA_TRIE_NODE_READING_UTILS_H */
|
||||
|
|
|
@ -145,7 +145,18 @@ class PtNodeParams {
|
|||
}
|
||||
|
||||
AK_FORCE_INLINE bool isBlacklisted() const {
|
||||
return PatriciaTrieReadingUtils::isBlacklisted(mFlags);
|
||||
// Note: this method will be removed in the next change.
|
||||
// It is used in getProbabilityOfWord and getWordAttributes for both v402 and v403.
|
||||
// * getProbabilityOfWord will be changed to no longer return NOT_A_PROBABILITY
|
||||
// when isBlacklisted (i.e. to only check if isNotAWord or isDeleted)
|
||||
// * getWordAttributes will be changed to always return blacklisted=false and
|
||||
// isPossiblyOffensive according to the function below (instead of the current
|
||||
// behaviour of checking if the probability is zero)
|
||||
return PatriciaTrieReadingUtils::isPossiblyOffensive(mFlags);
|
||||
}
|
||||
|
||||
AK_FORCE_INLINE bool isPossiblyOffensive() const {
|
||||
return PatriciaTrieReadingUtils::isPossiblyOffensive(mFlags);
|
||||
}
|
||||
|
||||
AK_FORCE_INLINE bool isNotAWord() const {
|
||||
|
|
|
@ -476,8 +476,8 @@ const WordProperty PatriciaTriePolicy::getWordProperty(
|
|||
}
|
||||
}
|
||||
const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
|
||||
ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
|
||||
HistoricalInfo(), std::move(shortcuts));
|
||||
ptNodeParams.isNotAWord(), ptNodeParams.isPossiblyOffensive(),
|
||||
ptNodeParams.getProbability(), HistoricalInfo(), std::move(shortcuts));
|
||||
return WordProperty(wordCodePoints.toVector(), &unigramProperty, &ngrams);
|
||||
}
|
||||
|
||||
|
|
|
@ -342,7 +342,7 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos, const bo
|
|||
// Create node flags and write them.
|
||||
PatriciaTrieReadingUtils::NodeFlags nodeFlags =
|
||||
PatriciaTrieReadingUtils::createAndGetFlags(false /* isNotAWord */,
|
||||
false /* isBlacklisted */, isTerminal, false /* hasShortcutTargets */,
|
||||
false /* isPossiblyOffensive */, isTerminal, false /* hasShortcutTargets */,
|
||||
false /* hasBigrams */, hasMultipleChars, CHILDREN_POSITION_FIELD_SIZE);
|
||||
if (!DynamicPtWritingUtils::writeFlags(mTrieBuffer, nodeFlags, ptNodePos)) {
|
||||
AKLOGE("Cannot write PtNode flags. flags: %x, pos: %d", nodeFlags, ptNodePos);
|
||||
|
|
|
@ -299,7 +299,8 @@ bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramContext *const ngramContex
|
|||
}
|
||||
const UnigramProperty beginningOfSentenceUnigramProperty(
|
||||
true /* representsBeginningOfSentence */, true /* isNotAWord */,
|
||||
false /* isBlacklisted */, MAX_PROBABILITY /* probability */, HistoricalInfo());
|
||||
false /* isBlacklisted */, false /* isPossiblyOffensive */,
|
||||
MAX_PROBABILITY /* probability */, HistoricalInfo());
|
||||
if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */),
|
||||
&beginningOfSentenceUnigramProperty)) {
|
||||
AKLOGE("Cannot add unigram entry for the beginning-of-sentence.");
|
||||
|
@ -375,8 +376,9 @@ bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext(
|
|||
if (wordId == NOT_A_WORD_ID) {
|
||||
// The word is not in the dictionary.
|
||||
const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */,
|
||||
false /* isNotAWord */, false /* isBlacklisted */, NOT_A_PROBABILITY,
|
||||
HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */));
|
||||
false /* isNotAWord */, false /* isBlacklisted */, false /* isPossiblyOffensive */,
|
||||
NOT_A_PROBABILITY, HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */,
|
||||
0 /* count */));
|
||||
if (!addUnigramEntry(wordCodePoints, &unigramProperty)) {
|
||||
AKLOGE("Cannot add unigarm entry in updateEntriesForWordWithNgramContext().");
|
||||
return false;
|
||||
|
@ -391,7 +393,7 @@ bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext(
|
|||
&& ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) {
|
||||
const UnigramProperty beginningOfSentenceUnigramProperty(
|
||||
true /* representsBeginningOfSentence */,
|
||||
true /* isNotAWord */, false /* isBlacklisted */, NOT_A_PROBABILITY,
|
||||
true /* isNotAWord */, false /* isPossiblyOffensive */, NOT_A_PROBABILITY,
|
||||
HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */));
|
||||
if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */),
|
||||
&beginningOfSentenceUnigramProperty)) {
|
||||
|
@ -529,7 +531,8 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
|
|||
}
|
||||
const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(),
|
||||
probabilityEntry.isNotAWord(), probabilityEntry.isBlacklisted(),
|
||||
probabilityEntry.getProbability(), *historicalInfo, std::move(shortcuts));
|
||||
probabilityEntry.isPossiblyOffensive(), probabilityEntry.getProbability(),
|
||||
*historicalInfo, std::move(shortcuts));
|
||||
return WordProperty(wordCodePoints.toVector(), &unigramProperty, &ngrams);
|
||||
}
|
||||
|
||||
|
|
|
@ -684,8 +684,8 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
|
|||
|
||||
binaryDictionary.addUnigramEntry("", DUMMY_PROBABILITY, "" /* shortcutTarget */,
|
||||
BinaryDictionary.NOT_A_PROBABILITY /* shortcutProbability */,
|
||||
true /* isBeginningOfSentence */, true /* isNotAWord */, false /* isBlacklisted */,
|
||||
mCurrentTime);
|
||||
true /* isBeginningOfSentence */, true /* isNotAWord */,
|
||||
false /* isPossiblyOffensive */, mCurrentTime);
|
||||
final NgramContext beginningOfSentenceContext = NgramContext.BEGINNING_OF_SENTENCE;
|
||||
onInputWordWithBeginningOfSentenceContext(binaryDictionary, "aaa", true /* isValidWord */);
|
||||
assertFalse(binaryDictionary.isValidNgram(beginningOfSentenceContext, "aaa"));
|
||||
|
|
|
@ -200,7 +200,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
// Too long short cut.
|
||||
binaryDictionary.addUnigramEntry("a", probability, invalidLongWord,
|
||||
10 /* shortcutProbability */, false /* isBeginningOfSentence */,
|
||||
false /* isNotAWord */, false /* isBlacklisted */,
|
||||
false /* isNotAWord */, false /* isPossiblyOffensive */,
|
||||
BinaryDictionary.NOT_A_VALID_TIMESTAMP);
|
||||
addUnigramWord(binaryDictionary, "abc", probability);
|
||||
final int updatedProbability = 200;
|
||||
|
@ -221,7 +221,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
binaryDictionary.addUnigramEntry(word, probability, "" /* shortcutTarget */,
|
||||
BinaryDictionary.NOT_A_PROBABILITY /* shortcutProbability */,
|
||||
false /* isBeginningOfSentence */, false /* isNotAWord */,
|
||||
false /* isBlacklisted */, BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
|
||||
false /* isPossiblyOffensive */,
|
||||
BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
|
||||
}
|
||||
|
||||
private static void addBigramWords(final BinaryDictionary binaryDictionary, final String word0,
|
||||
|
@ -971,11 +972,11 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
final String word = CodePointUtils.generateWord(random, codePointSet);
|
||||
final int unigramProbability = random.nextInt(0xFF);
|
||||
final boolean isNotAWord = random.nextBoolean();
|
||||
final boolean isBlacklisted = random.nextBoolean();
|
||||
final boolean isPossiblyOffensive = random.nextBoolean();
|
||||
// TODO: Add tests for historical info.
|
||||
binaryDictionary.addUnigramEntry(word, unigramProbability,
|
||||
null /* shortcutTarget */, BinaryDictionary.NOT_A_PROBABILITY,
|
||||
false /* isBeginningOfSentence */, isNotAWord, isBlacklisted,
|
||||
false /* isBeginningOfSentence */, isNotAWord, isPossiblyOffensive,
|
||||
BinaryDictionary.NOT_A_VALID_TIMESTAMP);
|
||||
if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
|
||||
binaryDictionary.flushWithGC();
|
||||
|
@ -987,7 +988,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
assertEquals(word, wordProperty.mWord);
|
||||
assertTrue(wordProperty.isValid());
|
||||
assertEquals(isNotAWord, wordProperty.mIsNotAWord);
|
||||
assertEquals(isBlacklisted, wordProperty.mIsBlacklistEntry);
|
||||
assertEquals(isPossiblyOffensive, wordProperty.mIsPossiblyOffensive);
|
||||
assertEquals(false, wordProperty.mHasNgrams);
|
||||
assertEquals(false, wordProperty.mHasShortcuts);
|
||||
assertEquals(unigramProbability, wordProperty.mProbabilityInfo.mProbability);
|
||||
|
@ -1142,7 +1143,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
final int shortcutProbability = 10;
|
||||
binaryDictionary.addUnigramEntry("aaa", unigramProbability, "zzz",
|
||||
shortcutProbability, false /* isBeginningOfSentence */,
|
||||
false /* isNotAWord */, false /* isBlacklisted */, 0 /* timestamp */);
|
||||
false /* isNotAWord */, false /* isPossiblyOffensive */, 0 /* timestamp */);
|
||||
WordProperty wordProperty = binaryDictionary.getWordProperty("aaa",
|
||||
false /* isBeginningOfSentence */);
|
||||
assertEquals(1, wordProperty.mShortcutTargets.size());
|
||||
|
@ -1151,7 +1152,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
final int updatedShortcutProbability = 2;
|
||||
binaryDictionary.addUnigramEntry("aaa", unigramProbability, "zzz",
|
||||
updatedShortcutProbability, false /* isBeginningOfSentence */,
|
||||
false /* isNotAWord */, false /* isBlacklisted */, 0 /* timestamp */);
|
||||
false /* isNotAWord */, false /* isPossiblyOffensive */, 0 /* timestamp */);
|
||||
wordProperty = binaryDictionary.getWordProperty("aaa",
|
||||
false /* isBeginningOfSentence */);
|
||||
assertEquals(1, wordProperty.mShortcutTargets.size());
|
||||
|
@ -1160,7 +1161,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
wordProperty.mShortcutTargets.get(0).getProbability());
|
||||
binaryDictionary.addUnigramEntry("aaa", unigramProbability, "yyy",
|
||||
shortcutProbability, false /* isBeginningOfSentence */, false /* isNotAWord */,
|
||||
false /* isBlacklisted */, 0 /* timestamp */);
|
||||
false /* isPossiblyOffensive */, 0 /* timestamp */);
|
||||
final HashMap<String, Integer> shortcutTargets = new HashMap<>();
|
||||
shortcutTargets.put("zzz", updatedShortcutProbability);
|
||||
shortcutTargets.put("yyy", shortcutProbability);
|
||||
|
@ -1223,7 +1224,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
final int unigramProbability = unigramProbabilities.get(word);
|
||||
binaryDictionary.addUnigramEntry(word, unigramProbability, shortcutTarget,
|
||||
shortcutProbability, false /* isBeginningOfSentence */, false /* isNotAWord */,
|
||||
false /* isBlacklisted */, 0 /* timestamp */);
|
||||
false /* isPossiblyOffensive */, 0 /* timestamp */);
|
||||
if (shortcutTargets.containsKey(word)) {
|
||||
final HashMap<String, Integer> shortcutTargetsOfWord = shortcutTargets.get(word);
|
||||
shortcutTargetsOfWord.put(shortcutTarget, shortcutProbability);
|
||||
|
@ -1255,6 +1256,15 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
public void testPossiblyOffensiveAttributeMaintained() {
|
||||
final BinaryDictionary binaryDictionary =
|
||||
getEmptyBinaryDictionary(FormatSpec.VERSION4_DEV);
|
||||
binaryDictionary.addUnigramEntry("ddd", 100, null, Dictionary.NOT_A_PROBABILITY,
|
||||
false, true, true, 0);
|
||||
WordProperty wordProperty = binaryDictionary.getWordProperty("ddd", false);
|
||||
assertEquals(true, wordProperty.mIsPossiblyOffensive);
|
||||
}
|
||||
|
||||
public void testDictMigration() {
|
||||
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
|
||||
testDictMigration(FormatSpec.VERSION4_ONLY_FOR_TESTING, formatVersion);
|
||||
|
@ -1271,10 +1281,10 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
final int shortcutProbability = 10;
|
||||
binaryDictionary.addUnigramEntry("ccc", unigramProbability, "xxx", shortcutProbability,
|
||||
false /* isBeginningOfSentence */, false /* isNotAWord */,
|
||||
false /* isBlacklisted */, 0 /* timestamp */);
|
||||
false /* isPossiblyOffensive */, 0 /* timestamp */);
|
||||
binaryDictionary.addUnigramEntry("ddd", unigramProbability, null /* shortcutTarget */,
|
||||
Dictionary.NOT_A_PROBABILITY, false /* isBeginningOfSentence */,
|
||||
true /* isNotAWord */, true /* isBlacklisted */, 0 /* timestamp */);
|
||||
true /* isNotAWord */, true /* isPossiblyOffensive */, 0 /* timestamp */);
|
||||
binaryDictionary.addNgramEntry(NgramContext.BEGINNING_OF_SENTENCE,
|
||||
"aaa", bigramProbability, 0 /* timestamp */);
|
||||
assertEquals(unigramProbability, binaryDictionary.getFrequency("aaa"));
|
||||
|
@ -1298,7 +1308,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
|||
assertEquals("xxx", wordProperty.mShortcutTargets.get(0).mWord);
|
||||
wordProperty = binaryDictionary.getWordProperty("ddd",
|
||||
false /* isBeginningOfSentence */);
|
||||
assertTrue(wordProperty.mIsBlacklistEntry);
|
||||
assertTrue(wordProperty.mIsPossiblyOffensive);
|
||||
assertTrue(wordProperty.mIsNotAWord);
|
||||
}
|
||||
|
||||
|
|
|
@ -35,16 +35,20 @@ public class FusionDictionaryTests extends AndroidTestCase {
|
|||
FusionDictionary dict = new FusionDictionary(new PtNodeArray(),
|
||||
new DictionaryOptions(new HashMap<String,String>()));
|
||||
|
||||
dict.add("abc", new ProbabilityInfo(10), null, false /* isNotAWord */);
|
||||
dict.add("abc", new ProbabilityInfo(10), null, false /* isNotAWord */,
|
||||
false /* isPossiblyOffensive */);
|
||||
assertNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "aaa"));
|
||||
assertNotNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "abc"));
|
||||
|
||||
dict.add("aa", new ProbabilityInfo(10), null, false /* isNotAWord */);
|
||||
dict.add("aa", new ProbabilityInfo(10), null, false /* isNotAWord */,
|
||||
false /* isPossiblyOffensive */);
|
||||
assertNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "aaa"));
|
||||
assertNotNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "aa"));
|
||||
|
||||
dict.add("babcd", new ProbabilityInfo(10), null, false /* isNotAWord */);
|
||||
dict.add("bacde", new ProbabilityInfo(10), null, false /* isNotAWord */);
|
||||
dict.add("babcd", new ProbabilityInfo(10), null, false /* isNotAWord */,
|
||||
false /* isPossiblyOffensive */);
|
||||
dict.add("bacde", new ProbabilityInfo(10), null, false /* isNotAWord */,
|
||||
false /* isPossiblyOffensive */);
|
||||
assertNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "ba"));
|
||||
assertNotNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "babcd"));
|
||||
assertNotNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "bacde"));
|
||||
|
|
|
@ -149,7 +149,8 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
|
|||
}
|
||||
}
|
||||
dict.add(word, new ProbabilityInfo(UNIGRAM_FREQ),
|
||||
(shortcutMap == null) ? null : shortcuts, false /* isNotAWord */);
|
||||
(shortcutMap == null) ? null : shortcuts, false /* isNotAWord */,
|
||||
false /* isPossiblyOffensive */);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -572,12 +572,12 @@ public class BinaryDictEncoderUtils {
|
|||
* @param hasShortcuts whether the PtNode has shortcuts.
|
||||
* @param hasBigrams whether the PtNode has bigrams.
|
||||
* @param isNotAWord whether the PtNode is not a word.
|
||||
* @param isBlackListEntry whether the PtNode is a blacklist entry.
|
||||
* @param isPossiblyOffensive whether the PtNode is a possibly offensive entry.
|
||||
* @return the flags
|
||||
*/
|
||||
static int makePtNodeFlags(final boolean hasMultipleChars, final boolean isTerminal,
|
||||
final int childrenAddressSize, final boolean hasShortcuts, final boolean hasBigrams,
|
||||
final boolean isNotAWord, final boolean isBlackListEntry) {
|
||||
final boolean isNotAWord, final boolean isPossiblyOffensive) {
|
||||
byte flags = 0;
|
||||
if (hasMultipleChars) flags |= FormatSpec.FLAG_HAS_MULTIPLE_CHARS;
|
||||
if (isTerminal) flags |= FormatSpec.FLAG_IS_TERMINAL;
|
||||
|
@ -600,7 +600,7 @@ public class BinaryDictEncoderUtils {
|
|||
if (hasShortcuts) flags |= FormatSpec.FLAG_HAS_SHORTCUT_TARGETS;
|
||||
if (hasBigrams) flags |= FormatSpec.FLAG_HAS_BIGRAMS;
|
||||
if (isNotAWord) flags |= FormatSpec.FLAG_IS_NOT_A_WORD;
|
||||
if (isBlackListEntry) flags |= FormatSpec.FLAG_IS_BLACKLISTED;
|
||||
if (isPossiblyOffensive) flags |= FormatSpec.FLAG_IS_POSSIBLY_OFFENSIVE;
|
||||
return flags;
|
||||
}
|
||||
|
||||
|
@ -609,7 +609,7 @@ public class BinaryDictEncoderUtils {
|
|||
getByteSize(childrenOffset),
|
||||
node.mShortcutTargets != null && !node.mShortcutTargets.isEmpty(),
|
||||
node.mBigrams != null && !node.mBigrams.isEmpty(),
|
||||
node.mIsNotAWord, node.mIsBlacklistEntry);
|
||||
node.mIsNotAWord, node.mIsPossiblyOffensive);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -89,7 +89,7 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
|||
int mTerminalId; // NOT_A_TERMINAL == mTerminalId indicates this is not a terminal.
|
||||
PtNodeArray mChildren;
|
||||
boolean mIsNotAWord; // Only a shortcut
|
||||
boolean mIsBlacklistEntry;
|
||||
boolean mIsPossiblyOffensive;
|
||||
// mCachedSize and mCachedAddressBefore/AfterUpdate are helpers for binary dictionary
|
||||
// generation. Before and After always hold the same value except during dictionary
|
||||
// address compression, where the update process needs to know about both values at the
|
||||
|
@ -102,7 +102,7 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
|||
|
||||
public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
|
||||
final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo,
|
||||
final boolean isNotAWord, final boolean isBlacklistEntry) {
|
||||
final boolean isNotAWord, final boolean isPossiblyOffensive) {
|
||||
mChars = chars;
|
||||
mProbabilityInfo = probabilityInfo;
|
||||
mTerminalId = probabilityInfo == null ? NOT_A_TERMINAL : probabilityInfo.mProbability;
|
||||
|
@ -110,12 +110,12 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
|||
mBigrams = bigrams;
|
||||
mChildren = null;
|
||||
mIsNotAWord = isNotAWord;
|
||||
mIsBlacklistEntry = isBlacklistEntry;
|
||||
mIsPossiblyOffensive = isPossiblyOffensive;
|
||||
}
|
||||
|
||||
public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
|
||||
final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo,
|
||||
final boolean isNotAWord, final boolean isBlacklistEntry,
|
||||
final boolean isNotAWord, final boolean isPossiblyOffensive,
|
||||
final PtNodeArray children) {
|
||||
mChars = chars;
|
||||
mProbabilityInfo = probabilityInfo;
|
||||
|
@ -123,7 +123,7 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
|||
mBigrams = bigrams;
|
||||
mChildren = children;
|
||||
mIsNotAWord = isNotAWord;
|
||||
mIsBlacklistEntry = isBlacklistEntry;
|
||||
mIsPossiblyOffensive = isPossiblyOffensive;
|
||||
}
|
||||
|
||||
public void addChild(PtNode n) {
|
||||
|
@ -153,8 +153,8 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
|||
return mIsNotAWord;
|
||||
}
|
||||
|
||||
public boolean getIsBlacklistEntry() {
|
||||
return mIsBlacklistEntry;
|
||||
public boolean getIsPossiblyOffensive() {
|
||||
return mIsPossiblyOffensive;
|
||||
}
|
||||
|
||||
public ArrayList<WeightedString> getShortcutTargets() {
|
||||
|
@ -238,7 +238,7 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
|||
private void update(final ProbabilityInfo probabilityInfo,
|
||||
final ArrayList<WeightedString> shortcutTargets,
|
||||
final ArrayList<WeightedString> bigrams,
|
||||
final boolean isNotAWord, final boolean isBlacklistEntry) {
|
||||
final boolean isNotAWord, final boolean isPossiblyOffensive) {
|
||||
mProbabilityInfo = ProbabilityInfo.max(mProbabilityInfo, probabilityInfo);
|
||||
if (shortcutTargets != null) {
|
||||
if (mShortcutTargets == null) {
|
||||
|
@ -275,7 +275,7 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
|||
}
|
||||
}
|
||||
mIsNotAWord = isNotAWord;
|
||||
mIsBlacklistEntry = isBlacklistEntry;
|
||||
mIsPossiblyOffensive = isPossiblyOffensive;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -323,24 +323,12 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
|||
* @param probabilityInfo probability information of the word.
|
||||
* @param shortcutTargets a list of shortcut targets for this word, or null.
|
||||
* @param isNotAWord true if this should not be considered a word (e.g. shortcut only)
|
||||
* @param isPossiblyOffensive true if this word is possibly offensive
|
||||
*/
|
||||
public void add(final String word, final ProbabilityInfo probabilityInfo,
|
||||
final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) {
|
||||
add(getCodePoints(word), probabilityInfo, shortcutTargets, isNotAWord,
|
||||
false /* isBlacklistEntry */);
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper method to add a blacklist entry as a string.
|
||||
*
|
||||
* @param word the word to add as a blacklist entry.
|
||||
* @param shortcutTargets a list of shortcut targets for this word, or null.
|
||||
* @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so)
|
||||
*/
|
||||
public void addBlacklistEntry(final String word,
|
||||
final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) {
|
||||
add(getCodePoints(word), new ProbabilityInfo(0), shortcutTargets, isNotAWord,
|
||||
true /* isBlacklistEntry */);
|
||||
final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord,
|
||||
final boolean isPossiblyOffensive) {
|
||||
add(getCodePoints(word), probabilityInfo, shortcutTargets, isNotAWord, isPossiblyOffensive);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -375,7 +363,7 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
|||
final PtNode ptNode1 = findWordInTree(mRootNodeArray, word1);
|
||||
if (ptNode1 == null) {
|
||||
add(getCodePoints(word1), new ProbabilityInfo(0), null, false /* isNotAWord */,
|
||||
false /* isBlacklistEntry */);
|
||||
false /* isPossiblyOffensive */);
|
||||
// The PtNode for the first word may have moved by the above insertion,
|
||||
// if word1 and word2 share a common stem that happens not to have been
|
||||
// a cutting point until now. In this case, we need to refresh ptNode.
|
||||
|
@ -397,11 +385,11 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
|||
* @param probabilityInfo the probability information of the word.
|
||||
* @param shortcutTargets an optional list of shortcut targets for this word (null if none).
|
||||
* @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so)
|
||||
* @param isBlacklistEntry true if this is a blacklisted word, false otherwise
|
||||
* @param isPossiblyOffensive true if this word is possibly offensive
|
||||
*/
|
||||
private void add(final int[] word, final ProbabilityInfo probabilityInfo,
|
||||
final ArrayList<WeightedString> shortcutTargets,
|
||||
final boolean isNotAWord, final boolean isBlacklistEntry) {
|
||||
final boolean isNotAWord, final boolean isPossiblyOffensive) {
|
||||
assert(probabilityInfo.mProbability <= FormatSpec.MAX_TERMINAL_FREQUENCY);
|
||||
if (word.length >= Constants.DICTIONARY_MAX_WORD_LENGTH) {
|
||||
MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length);
|
||||
|
@ -431,7 +419,7 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
|||
final int insertionIndex = findInsertionIndex(currentNodeArray, word[charIndex]);
|
||||
final PtNode newPtNode = new PtNode(Arrays.copyOfRange(word, charIndex, word.length),
|
||||
shortcutTargets, null /* bigrams */, probabilityInfo, isNotAWord,
|
||||
isBlacklistEntry);
|
||||
isPossiblyOffensive);
|
||||
currentNodeArray.mData.add(insertionIndex, newPtNode);
|
||||
if (DBG) checkStack(currentNodeArray);
|
||||
} else {
|
||||
|
@ -442,14 +430,14 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
|||
// should end already exists as is. Since the old PtNode was not a terminal,
|
||||
// make it one by filling in its frequency and other attributes
|
||||
currentPtNode.update(probabilityInfo, shortcutTargets, null, isNotAWord,
|
||||
isBlacklistEntry);
|
||||
isPossiblyOffensive);
|
||||
} else {
|
||||
// The new word matches the full old word and extends past it.
|
||||
// We only have to create a new node and add it to the end of this.
|
||||
final PtNode newNode = new PtNode(
|
||||
Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
|
||||
shortcutTargets, null /* bigrams */, probabilityInfo,
|
||||
isNotAWord, isBlacklistEntry);
|
||||
isNotAWord, isPossiblyOffensive);
|
||||
currentPtNode.mChildren = new PtNodeArray();
|
||||
currentPtNode.mChildren.mData.add(newNode);
|
||||
}
|
||||
|
@ -459,7 +447,7 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
|||
// new shortcuts to the existing shortcut list if it already exists.
|
||||
currentPtNode.update(probabilityInfo, shortcutTargets, null,
|
||||
currentPtNode.mIsNotAWord && isNotAWord,
|
||||
currentPtNode.mIsBlacklistEntry || isBlacklistEntry);
|
||||
currentPtNode.mIsPossiblyOffensive || isPossiblyOffensive);
|
||||
} else {
|
||||
// Partial prefix match only. We have to replace the current node with a node
|
||||
// containing the current prefix and create two new ones for the tails.
|
||||
|
@ -468,7 +456,7 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
|||
Arrays.copyOfRange(currentPtNode.mChars, differentCharIndex,
|
||||
currentPtNode.mChars.length), currentPtNode.mShortcutTargets,
|
||||
currentPtNode.mBigrams, currentPtNode.mProbabilityInfo,
|
||||
currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry,
|
||||
currentPtNode.mIsNotAWord, currentPtNode.mIsPossiblyOffensive,
|
||||
currentPtNode.mChildren);
|
||||
newChildren.mData.add(newOldWord);
|
||||
|
||||
|
@ -477,17 +465,17 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
|||
newParent = new PtNode(
|
||||
Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex),
|
||||
shortcutTargets, null /* bigrams */, probabilityInfo,
|
||||
isNotAWord, isBlacklistEntry, newChildren);
|
||||
isNotAWord, isPossiblyOffensive, newChildren);
|
||||
} else {
|
||||
newParent = new PtNode(
|
||||
Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex),
|
||||
null /* shortcutTargets */, null /* bigrams */,
|
||||
null /* probabilityInfo */, false /* isNotAWord */,
|
||||
false /* isBlacklistEntry */, newChildren);
|
||||
false /* isPossiblyOffensive */, newChildren);
|
||||
final PtNode newWord = new PtNode(Arrays.copyOfRange(word,
|
||||
charIndex + differentCharIndex, word.length),
|
||||
shortcutTargets, null /* bigrams */, probabilityInfo,
|
||||
isNotAWord, isBlacklistEntry);
|
||||
isNotAWord, isPossiblyOffensive);
|
||||
final int addIndex = word[charIndex + differentCharIndex]
|
||||
> currentPtNode.mChars[differentCharIndex] ? 1 : 0;
|
||||
newChildren.mData.add(addIndex, newWord);
|
||||
|
@ -549,7 +537,7 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
|||
final ArrayList<PtNode> data = nodeArray.mData;
|
||||
final PtNode reference = new PtNode(new int[] { character },
|
||||
null /* shortcutTargets */, null /* bigrams */, null /* probabilityInfo */,
|
||||
false /* isNotAWord */, false /* isBlacklistEntry */);
|
||||
false /* isNotAWord */, false /* isPossiblyOffensive */);
|
||||
int result = Collections.binarySearch(data, reference, PTNODE_COMPARATOR);
|
||||
return result >= 0 ? result : -result - 1;
|
||||
}
|
||||
|
@ -686,7 +674,7 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
|||
return new WordProperty(mCurrentString.toString(),
|
||||
currentPtNode.mProbabilityInfo,
|
||||
currentPtNode.mShortcutTargets, currentPtNode.mBigrams,
|
||||
currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry);
|
||||
currentPtNode.mIsNotAWord, currentPtNode.mIsPossiblyOffensive);
|
||||
}
|
||||
} else {
|
||||
mPositions.removeLast();
|
||||
|
|
|
@ -283,13 +283,9 @@ public class Ver2DictDecoder extends AbstractDictDecoder {
|
|||
|
||||
// Insert unigrams into the fusion dictionary.
|
||||
for (final WordProperty wordProperty : wordProperties) {
|
||||
if (wordProperty.mIsBlacklistEntry) {
|
||||
fusionDict.addBlacklistEntry(wordProperty.mWord, wordProperty.mShortcutTargets,
|
||||
wordProperty.mIsNotAWord);
|
||||
} else {
|
||||
fusionDict.add(wordProperty.mWord, wordProperty.mProbabilityInfo,
|
||||
wordProperty.mShortcutTargets, wordProperty.mIsNotAWord);
|
||||
}
|
||||
fusionDict.add(wordProperty.mWord, wordProperty.mProbabilityInfo,
|
||||
wordProperty.mShortcutTargets, wordProperty.mIsNotAWord,
|
||||
wordProperty.mIsPossiblyOffensive);
|
||||
}
|
||||
// Insert bigrams into the fusion dictionary.
|
||||
for (final WordProperty wordProperty : wordProperties) {
|
||||
|
|
|
@ -85,7 +85,8 @@ public class Ver2DictEncoderTests extends AndroidTestCase {
|
|||
}
|
||||
}
|
||||
dict.add(word, new ProbabilityInfo(UNIGRAM_FREQ),
|
||||
(shortcutMap == null) ? null : shortcuts, false /* isNotAWord */);
|
||||
(shortcutMap == null) ? null : shortcuts, false /* isNotAWord */,
|
||||
false /* isPossiblyOffensive */);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -88,13 +88,9 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
|
|||
|
||||
// Insert unigrams into the fusion dictionary.
|
||||
for (final WordProperty wordProperty : wordProperties) {
|
||||
if (wordProperty.mIsBlacklistEntry) {
|
||||
fusionDict.addBlacklistEntry(wordProperty.mWord, wordProperty.mShortcutTargets,
|
||||
wordProperty.mIsNotAWord);
|
||||
} else {
|
||||
fusionDict.add(wordProperty.mWord, wordProperty.mProbabilityInfo,
|
||||
wordProperty.mShortcutTargets, wordProperty.mIsNotAWord);
|
||||
}
|
||||
fusionDict.add(wordProperty.mWord, wordProperty.mProbabilityInfo,
|
||||
wordProperty.mShortcutTargets, wordProperty.mIsNotAWord,
|
||||
wordProperty.mIsPossiblyOffensive);
|
||||
}
|
||||
// Insert bigrams into the fusion dictionary.
|
||||
// TODO: Support ngrams.
|
||||
|
|
|
@ -79,7 +79,7 @@ public class Ver4DictEncoder implements DictEncoder {
|
|||
if (!binaryDict.addUnigramEntry(wordProperty.mWord, wordProperty.getProbability(),
|
||||
null /* shortcutTarget */, 0 /* shortcutProbability */,
|
||||
wordProperty.mIsBeginningOfSentence, wordProperty.mIsNotAWord,
|
||||
wordProperty.mIsBlacklistEntry, 0 /* timestamp */)) {
|
||||
wordProperty.mIsPossiblyOffensive, 0 /* timestamp */)) {
|
||||
MakedictLog.e("Cannot add unigram entry for " + wordProperty.mWord);
|
||||
}
|
||||
} else {
|
||||
|
@ -88,7 +88,7 @@ public class Ver4DictEncoder implements DictEncoder {
|
|||
wordProperty.getProbability(),
|
||||
shortcutTarget.mWord, shortcutTarget.getProbability(),
|
||||
wordProperty.mIsBeginningOfSentence, wordProperty.mIsNotAWord,
|
||||
wordProperty.mIsBlacklistEntry, 0 /* timestamp */)) {
|
||||
wordProperty.mIsPossiblyOffensive, 0 /* timestamp */)) {
|
||||
MakedictLog.e("Cannot add unigram entry for " + wordProperty.mWord
|
||||
+ ", shortcutTarget: " + shortcutTarget.mWord);
|
||||
return;
|
||||
|
|
|
@ -106,7 +106,7 @@ public class CombinedInputOutput {
|
|||
if (args[0].matches(CombinedFormatUtils.WORD_TAG + "=.*")) {
|
||||
if (null != word) {
|
||||
dict.add(word, probabilityInfo, shortcuts.isEmpty() ? null : shortcuts,
|
||||
isNotAWord);
|
||||
isNotAWord, false /* isPossiblyOffensive */);
|
||||
for (WeightedString s : bigrams) {
|
||||
dict.setBigram(word, s.mWord, s.mProbabilityInfo);
|
||||
}
|
||||
|
@ -189,7 +189,8 @@ public class CombinedInputOutput {
|
|||
}
|
||||
}
|
||||
if (null != word) {
|
||||
dict.add(word, probabilityInfo, shortcuts.isEmpty() ? null : shortcuts, isNotAWord);
|
||||
dict.add(word, probabilityInfo, shortcuts.isEmpty() ? null : shortcuts, isNotAWord,
|
||||
false /* isPossiblyOffensive */);
|
||||
for (WeightedString s : bigrams) {
|
||||
dict.setBigram(word, s.mWord, s.mProbabilityInfo);
|
||||
}
|
||||
|
|
|
@ -128,10 +128,10 @@ public class Diff extends Dicttool.Command {
|
|||
+ word0Property.mIsNotAWord + " -> " + word1PtNode.getIsNotAWord());
|
||||
hasDifferences = true;
|
||||
}
|
||||
if (word0Property.mIsBlacklistEntry != word1PtNode.getIsBlacklistEntry()) {
|
||||
System.out.println("Blacklist: " + word0Property.mWord + " "
|
||||
+ word0Property.mIsBlacklistEntry + " -> "
|
||||
+ word1PtNode.getIsBlacklistEntry());
|
||||
if (word0Property.mIsPossiblyOffensive != word1PtNode.getIsPossiblyOffensive()) {
|
||||
System.out.println("Possibly-offensive: " + word0Property.mWord + " "
|
||||
+ word0Property.mIsPossiblyOffensive + " -> "
|
||||
+ word1PtNode.getIsPossiblyOffensive());
|
||||
hasDifferences = true;
|
||||
}
|
||||
hasDifferences |= hasAttributesDifferencesAndPrintThemIfAny(word0Property.mWord,
|
||||
|
|
|
@ -76,8 +76,8 @@ public class Info extends Dicttool.Command {
|
|||
if (ptNode.getIsNotAWord()) {
|
||||
System.out.println(" Is not a word");
|
||||
}
|
||||
if (ptNode.getIsBlacklistEntry()) {
|
||||
System.out.println(" Is a blacklist entry");
|
||||
if (ptNode.getIsPossiblyOffensive()) {
|
||||
System.out.println(" Is possibly offensive");
|
||||
}
|
||||
final ArrayList<WeightedString> shortcutTargets = ptNode.getShortcutTargets();
|
||||
if (null == shortcutTargets || shortcutTargets.isEmpty()) {
|
||||
|
|
|
@ -90,7 +90,8 @@ public class XmlDictInputOutput {
|
|||
for (final String shortcutOnly : mShortcutsMap.keySet()) {
|
||||
if (dict.hasWord(shortcutOnly)) continue;
|
||||
dict.add(shortcutOnly, new ProbabilityInfo(SHORTCUT_ONLY_WORD_PROBABILITY),
|
||||
mShortcutsMap.get(shortcutOnly), true /* isNotAWord */);
|
||||
mShortcutsMap.get(shortcutOnly), true /* isNotAWord */,
|
||||
false /* isPossiblyOffensive */);
|
||||
}
|
||||
mDictionary = null;
|
||||
mShortcutsMap.clear();
|
||||
|
@ -138,7 +139,7 @@ public class XmlDictInputOutput {
|
|||
public void endElement(String uri, String localName, String qName) {
|
||||
if (WORD == mState) {
|
||||
mDictionary.add(mWord, new ProbabilityInfo(mFreq), mShortcutsMap.get(mWord),
|
||||
false /* isNotAWord */);
|
||||
false /* isNotAWord */, false /* isPossiblyOffensive */);
|
||||
mState = START;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -54,11 +54,16 @@ public class BinaryDictOffdeviceUtilsTests extends TestCase {
|
|||
testOptions.mAttributes.put(DictionaryHeader.DICTIONARY_LOCALE_KEY, LOCALE);
|
||||
testOptions.mAttributes.put(DictionaryHeader.DICTIONARY_ID_KEY, ID);
|
||||
final FusionDictionary dict = new FusionDictionary(new PtNodeArray(), testOptions);
|
||||
dict.add("foo", new ProbabilityInfo(TEST_FREQ), null, false /* isNotAWord */);
|
||||
dict.add("fta", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
||||
dict.add("ftb", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
||||
dict.add("bar", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
||||
dict.add("fool", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
||||
dict.add("foo", new ProbabilityInfo(TEST_FREQ), null, false /* isNotAWord */,
|
||||
false /* isPossiblyOffensive */);
|
||||
dict.add("fta", new ProbabilityInfo(1), null, false /* isNotAWord */,
|
||||
false /* isPossiblyOffensive */);
|
||||
dict.add("ftb", new ProbabilityInfo(1), null, false /* isNotAWord */,
|
||||
false /* isPossiblyOffensive */);
|
||||
dict.add("bar", new ProbabilityInfo(1), null, false /* isNotAWord */,
|
||||
false /* isPossiblyOffensive */);
|
||||
dict.add("fool", new ProbabilityInfo(1), null, false /* isNotAWord */,
|
||||
false /* isPossiblyOffensive */);
|
||||
|
||||
final File dst = File.createTempFile("testGetRawDict", ".tmp");
|
||||
dst.deleteOnExit();
|
||||
|
|
|
@ -33,11 +33,16 @@ public class BinaryDictEncoderFlattenTreeTests extends TestCase {
|
|||
public void testFlattenNodes() {
|
||||
final FusionDictionary dict = new FusionDictionary(new PtNodeArray(),
|
||||
new DictionaryOptions(new HashMap<String, String>()));
|
||||
dict.add("foo", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
||||
dict.add("fta", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
||||
dict.add("ftb", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
||||
dict.add("bar", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
||||
dict.add("fool", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
||||
dict.add("foo", new ProbabilityInfo(1), null, false /* isNotAWord */,
|
||||
false /* isPossiblyOffensive */);
|
||||
dict.add("fta", new ProbabilityInfo(1), null, false /* isNotAWord */,
|
||||
false /* isPossiblyOffensive */);
|
||||
dict.add("ftb", new ProbabilityInfo(1), null, false /* isNotAWord */,
|
||||
false /* isPossiblyOffensive */);
|
||||
dict.add("bar", new ProbabilityInfo(1), null, false /* isNotAWord */,
|
||||
false /* isPossiblyOffensive */);
|
||||
dict.add("fool", new ProbabilityInfo(1), null, false /* isNotAWord */,
|
||||
false /* isPossiblyOffensive */);
|
||||
final ArrayList<PtNodeArray> result =
|
||||
BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray);
|
||||
assertEquals(4, result.size());
|
||||
|
|
|
@ -101,7 +101,8 @@ public class FusionDictionaryTest extends TestCase {
|
|||
prepare(time);
|
||||
for (int i = 0; i < sWords.size(); ++i) {
|
||||
System.out.println("Adding in pos " + i + " : " + dumpWord(sWords.get(i)));
|
||||
dict.add(sWords.get(i), new ProbabilityInfo(180), null, false);
|
||||
dict.add(sWords.get(i), new ProbabilityInfo(180), null, false,
|
||||
false /* isPossiblyOffensive */);
|
||||
dumpDict(dict);
|
||||
checkDictionary(dict, sWords, i);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue