am c51b9b5b: Merge "Renaming "blacklist" flag to "possibly offensive""
* commit 'c51b9b5b3f9b9b80d07186691ddfa09502fd4659': Renaming "blacklist" flag to "possibly offensive"
main
commit
e5e722210e
|
@ -69,7 +69,7 @@ public final class BinaryDictionary extends Dictionary {
|
||||||
// Format to get unigram flags from native side via getWordPropertyNative().
|
// Format to get unigram flags from native side via getWordPropertyNative().
|
||||||
private static final int FORMAT_WORD_PROPERTY_OUTPUT_FLAG_COUNT = 5;
|
private static final int FORMAT_WORD_PROPERTY_OUTPUT_FLAG_COUNT = 5;
|
||||||
private static final int FORMAT_WORD_PROPERTY_IS_NOT_A_WORD_INDEX = 0;
|
private static final int FORMAT_WORD_PROPERTY_IS_NOT_A_WORD_INDEX = 0;
|
||||||
private static final int FORMAT_WORD_PROPERTY_IS_BLACKLISTED_INDEX = 1;
|
private static final int FORMAT_WORD_PROPERTY_IS_POSSIBLY_OFFENSIVE_INDEX = 1;
|
||||||
private static final int FORMAT_WORD_PROPERTY_HAS_NGRAMS_INDEX = 2;
|
private static final int FORMAT_WORD_PROPERTY_HAS_NGRAMS_INDEX = 2;
|
||||||
private static final int FORMAT_WORD_PROPERTY_HAS_SHORTCUTS_INDEX = 3;
|
private static final int FORMAT_WORD_PROPERTY_HAS_SHORTCUTS_INDEX = 3;
|
||||||
private static final int FORMAT_WORD_PROPERTY_IS_BEGINNING_OF_SENTENCE_INDEX = 4;
|
private static final int FORMAT_WORD_PROPERTY_IS_BEGINNING_OF_SENTENCE_INDEX = 4;
|
||||||
|
@ -195,7 +195,7 @@ public final class BinaryDictionary extends Dictionary {
|
||||||
float[] inOutWeightOfLangModelVsSpatialModel);
|
float[] inOutWeightOfLangModelVsSpatialModel);
|
||||||
private static native boolean addUnigramEntryNative(long dict, int[] word, int probability,
|
private static native boolean addUnigramEntryNative(long dict, int[] word, int probability,
|
||||||
int[] shortcutTarget, int shortcutProbability, boolean isBeginningOfSentence,
|
int[] shortcutTarget, int shortcutProbability, boolean isBeginningOfSentence,
|
||||||
boolean isNotAWord, boolean isBlacklisted, int timestamp);
|
boolean isNotAWord, boolean isPossiblyOffensive, int timestamp);
|
||||||
private static native boolean removeUnigramEntryNative(long dict, int[] word);
|
private static native boolean removeUnigramEntryNative(long dict, int[] word);
|
||||||
private static native boolean addNgramEntryNative(long dict,
|
private static native boolean addNgramEntryNative(long dict,
|
||||||
int[][] prevWordCodePointArrays, boolean[] isBeginningOfSentenceArray,
|
int[][] prevWordCodePointArrays, boolean[] isBeginningOfSentenceArray,
|
||||||
|
@ -402,7 +402,7 @@ public final class BinaryDictionary extends Dictionary {
|
||||||
outNgramProbabilityInfo, outShortcutTargets, outShortcutProbabilities);
|
outNgramProbabilityInfo, outShortcutTargets, outShortcutProbabilities);
|
||||||
return new WordProperty(codePoints,
|
return new WordProperty(codePoints,
|
||||||
outFlags[FORMAT_WORD_PROPERTY_IS_NOT_A_WORD_INDEX],
|
outFlags[FORMAT_WORD_PROPERTY_IS_NOT_A_WORD_INDEX],
|
||||||
outFlags[FORMAT_WORD_PROPERTY_IS_BLACKLISTED_INDEX],
|
outFlags[FORMAT_WORD_PROPERTY_IS_POSSIBLY_OFFENSIVE_INDEX],
|
||||||
outFlags[FORMAT_WORD_PROPERTY_HAS_NGRAMS_INDEX],
|
outFlags[FORMAT_WORD_PROPERTY_HAS_NGRAMS_INDEX],
|
||||||
outFlags[FORMAT_WORD_PROPERTY_HAS_SHORTCUTS_INDEX],
|
outFlags[FORMAT_WORD_PROPERTY_HAS_SHORTCUTS_INDEX],
|
||||||
outFlags[FORMAT_WORD_PROPERTY_IS_BEGINNING_OF_SENTENCE_INDEX], outProbabilityInfo,
|
outFlags[FORMAT_WORD_PROPERTY_IS_BEGINNING_OF_SENTENCE_INDEX], outProbabilityInfo,
|
||||||
|
@ -439,7 +439,7 @@ public final class BinaryDictionary extends Dictionary {
|
||||||
public boolean addUnigramEntry(final String word, final int probability,
|
public boolean addUnigramEntry(final String word, final int probability,
|
||||||
final String shortcutTarget, final int shortcutProbability,
|
final String shortcutTarget, final int shortcutProbability,
|
||||||
final boolean isBeginningOfSentence, final boolean isNotAWord,
|
final boolean isBeginningOfSentence, final boolean isNotAWord,
|
||||||
final boolean isBlacklisted, final int timestamp) {
|
final boolean isPossiblyOffensive, final int timestamp) {
|
||||||
if (word == null || (word.isEmpty() && !isBeginningOfSentence)) {
|
if (word == null || (word.isEmpty() && !isBeginningOfSentence)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -447,7 +447,8 @@ public final class BinaryDictionary extends Dictionary {
|
||||||
final int[] shortcutTargetCodePoints = (shortcutTarget != null) ?
|
final int[] shortcutTargetCodePoints = (shortcutTarget != null) ?
|
||||||
StringUtils.toCodePointArray(shortcutTarget) : null;
|
StringUtils.toCodePointArray(shortcutTarget) : null;
|
||||||
if (!addUnigramEntryNative(mNativeDict, codePoints, probability, shortcutTargetCodePoints,
|
if (!addUnigramEntryNative(mNativeDict, codePoints, probability, shortcutTargetCodePoints,
|
||||||
shortcutProbability, isBeginningOfSentence, isNotAWord, isBlacklisted, timestamp)) {
|
shortcutProbability, isBeginningOfSentence, isNotAWord, isPossiblyOffensive,
|
||||||
|
timestamp)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
mHasUpdated = true;
|
mHasUpdated = true;
|
||||||
|
|
|
@ -137,7 +137,7 @@ public class ContactsBinaryDictionary extends ExpandableBinaryDictionary {
|
||||||
}
|
}
|
||||||
runGCIfRequiredLocked(true /* mindsBlockByGC */);
|
runGCIfRequiredLocked(true /* mindsBlockByGC */);
|
||||||
addUnigramLocked(word, FREQUENCY_FOR_CONTACTS, null /* shortcut */,
|
addUnigramLocked(word, FREQUENCY_FOR_CONTACTS, null /* shortcut */,
|
||||||
0 /* shortcutFreq */, false /* isNotAWord */, false /* isBlacklisted */,
|
0 /* shortcutFreq */, false /* isNotAWord */, false /* isPossiblyOffensive */,
|
||||||
BinaryDictionary.NOT_A_VALID_TIMESTAMP);
|
BinaryDictionary.NOT_A_VALID_TIMESTAMP);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -238,7 +238,8 @@ public class ContactsBinaryDictionary extends ExpandableBinaryDictionary {
|
||||||
runGCIfRequiredLocked(true /* mindsBlockByGC */);
|
runGCIfRequiredLocked(true /* mindsBlockByGC */);
|
||||||
addUnigramLocked(word, FREQUENCY_FOR_CONTACTS,
|
addUnigramLocked(word, FREQUENCY_FOR_CONTACTS,
|
||||||
null /* shortcut */, 0 /* shortcutFreq */, false /* isNotAWord */,
|
null /* shortcut */, 0 /* shortcutFreq */, false /* isNotAWord */,
|
||||||
false /* isBlacklisted */, BinaryDictionary.NOT_A_VALID_TIMESTAMP);
|
false /* isPossiblyOffensive */,
|
||||||
|
BinaryDictionary.NOT_A_VALID_TIMESTAMP);
|
||||||
if (!ngramContext.isValid() && mUseFirstLastBigrams) {
|
if (!ngramContext.isValid() && mUseFirstLastBigrams) {
|
||||||
runGCIfRequiredLocked(true /* mindsBlockByGC */);
|
runGCIfRequiredLocked(true /* mindsBlockByGC */);
|
||||||
addNgramEntryLocked(ngramContext, word, FREQUENCY_FOR_CONTACTS_BIGRAM,
|
addNgramEntryLocked(ngramContext, word, FREQUENCY_FOR_CONTACTS_BIGRAM,
|
||||||
|
|
|
@ -809,7 +809,7 @@ public class DictionaryFacilitator {
|
||||||
contextualDict.addUnigramEntryWithCheckingDistracter(
|
contextualDict.addUnigramEntryWithCheckingDistracter(
|
||||||
subPhraseStr, probability, null /* shortcutTarget */,
|
subPhraseStr, probability, null /* shortcutTarget */,
|
||||||
Dictionary.NOT_A_PROBABILITY /* shortcutFreq */,
|
Dictionary.NOT_A_PROBABILITY /* shortcutFreq */,
|
||||||
false /* isNotAWord */, false /* isBlacklisted */,
|
false /* isNotAWord */, false /* isPossiblyOffensive */,
|
||||||
BinaryDictionary.NOT_A_VALID_TIMESTAMP,
|
BinaryDictionary.NOT_A_VALID_TIMESTAMP,
|
||||||
DistracterFilter.EMPTY_DISTRACTER_FILTER);
|
DistracterFilter.EMPTY_DISTRACTER_FILTER);
|
||||||
contextualDict.addNgramEntry(ngramContext, subPhraseStr,
|
contextualDict.addNgramEntry(ngramContext, subPhraseStr,
|
||||||
|
@ -819,7 +819,7 @@ public class DictionaryFacilitator {
|
||||||
contextualDict.addUnigramEntryWithCheckingDistracter(
|
contextualDict.addUnigramEntryWithCheckingDistracter(
|
||||||
phrase[i], probability, null /* shortcutTarget */,
|
phrase[i], probability, null /* shortcutTarget */,
|
||||||
Dictionary.NOT_A_PROBABILITY /* shortcutFreq */,
|
Dictionary.NOT_A_PROBABILITY /* shortcutFreq */,
|
||||||
false /* isNotAWord */, false /* isBlacklisted */,
|
false /* isNotAWord */, false /* isPossiblyOffensive */,
|
||||||
BinaryDictionary.NOT_A_VALID_TIMESTAMP,
|
BinaryDictionary.NOT_A_VALID_TIMESTAMP,
|
||||||
DistracterFilter.EMPTY_DISTRACTER_FILTER);
|
DistracterFilter.EMPTY_DISTRACTER_FILTER);
|
||||||
contextualDict.addNgramEntry(ngramContext, phrase[i],
|
contextualDict.addNgramEntry(ngramContext, phrase[i],
|
||||||
|
|
|
@ -316,22 +316,22 @@ abstract public class ExpandableBinaryDictionary extends Dictionary {
|
||||||
*/
|
*/
|
||||||
public void addUnigramEntryWithCheckingDistracter(final String word, final int frequency,
|
public void addUnigramEntryWithCheckingDistracter(final String word, final int frequency,
|
||||||
final String shortcutTarget, final int shortcutFreq, final boolean isNotAWord,
|
final String shortcutTarget, final int shortcutFreq, final boolean isNotAWord,
|
||||||
final boolean isBlacklisted, final int timestamp,
|
final boolean isPossiblyOffensive, final int timestamp,
|
||||||
@Nonnull final DistracterFilter distracterFilter) {
|
@Nonnull final DistracterFilter distracterFilter) {
|
||||||
updateDictionaryWithWriteLockIfWordIsNotADistracter(new Runnable() {
|
updateDictionaryWithWriteLockIfWordIsNotADistracter(new Runnable() {
|
||||||
@Override
|
@Override
|
||||||
public void run() {
|
public void run() {
|
||||||
addUnigramLocked(word, frequency, shortcutTarget, shortcutFreq,
|
addUnigramLocked(word, frequency, shortcutTarget, shortcutFreq,
|
||||||
isNotAWord, isBlacklisted, timestamp);
|
isNotAWord, isPossiblyOffensive, timestamp);
|
||||||
}
|
}
|
||||||
}, word, distracterFilter);
|
}, word, distracterFilter);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void addUnigramLocked(final String word, final int frequency,
|
protected void addUnigramLocked(final String word, final int frequency,
|
||||||
final String shortcutTarget, final int shortcutFreq, final boolean isNotAWord,
|
final String shortcutTarget, final int shortcutFreq, final boolean isNotAWord,
|
||||||
final boolean isBlacklisted, final int timestamp) {
|
final boolean isPossiblyOffensive, final int timestamp) {
|
||||||
if (!mBinaryDictionary.addUnigramEntry(word, frequency, shortcutTarget, shortcutFreq,
|
if (!mBinaryDictionary.addUnigramEntry(word, frequency, shortcutTarget, shortcutFreq,
|
||||||
false /* isBeginningOfSentence */, isNotAWord, isBlacklisted, timestamp)) {
|
false /* isBeginningOfSentence */, isNotAWord, isPossiblyOffensive, timestamp)) {
|
||||||
Log.e(TAG, "Cannot add unigram entry. word: " + word);
|
Log.e(TAG, "Cannot add unigram entry. word: " + word);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -257,12 +257,14 @@ public class UserBinaryDictionary extends ExpandableBinaryDictionary {
|
||||||
runGCIfRequiredLocked(true /* mindsBlockByGC */);
|
runGCIfRequiredLocked(true /* mindsBlockByGC */);
|
||||||
addUnigramLocked(word, adjustedFrequency, null /* shortcutTarget */,
|
addUnigramLocked(word, adjustedFrequency, null /* shortcutTarget */,
|
||||||
0 /* shortcutFreq */, false /* isNotAWord */,
|
0 /* shortcutFreq */, false /* isNotAWord */,
|
||||||
false /* isBlacklisted */, BinaryDictionary.NOT_A_VALID_TIMESTAMP);
|
false /* isPossiblyOffensive */,
|
||||||
|
BinaryDictionary.NOT_A_VALID_TIMESTAMP);
|
||||||
if (null != shortcut && shortcut.length() <= MAX_WORD_LENGTH) {
|
if (null != shortcut && shortcut.length() <= MAX_WORD_LENGTH) {
|
||||||
runGCIfRequiredLocked(true /* mindsBlockByGC */);
|
runGCIfRequiredLocked(true /* mindsBlockByGC */);
|
||||||
addUnigramLocked(shortcut, adjustedFrequency, word,
|
addUnigramLocked(shortcut, adjustedFrequency, word,
|
||||||
USER_DICT_SHORTCUT_FREQUENCY, true /* isNotAWord */,
|
USER_DICT_SHORTCUT_FREQUENCY, true /* isNotAWord */,
|
||||||
false /* isBlacklisted */, BinaryDictionary.NOT_A_VALID_TIMESTAMP);
|
false /* isPossiblyOffensive */,
|
||||||
|
BinaryDictionary.NOT_A_VALID_TIMESTAMP);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
cursor.moveToNext();
|
cursor.moveToNext();
|
||||||
|
|
|
@ -93,7 +93,7 @@ public final class FormatSpec {
|
||||||
* s | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS
|
* s | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS
|
||||||
* | has bigrams ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_BIGRAMS
|
* | has bigrams ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_BIGRAMS
|
||||||
* | is not a word ? 1 bit, 1 = yes, 0 = no : FLAG_IS_NOT_A_WORD
|
* | is not a word ? 1 bit, 1 = yes, 0 = no : FLAG_IS_NOT_A_WORD
|
||||||
* | is blacklisted ? 1 bit, 1 = yes, 0 = no : FLAG_IS_BLACKLISTED
|
* | is possibly offensive ? 1 bit, 1 = yes, 0 = no : FLAG_IS_POSSIBLY_OFFENSIVE
|
||||||
*
|
*
|
||||||
* c | IF FLAG_HAS_MULTIPLE_CHARS
|
* c | IF FLAG_HAS_MULTIPLE_CHARS
|
||||||
* h | char, char, char, char n * (1 or 3 bytes) : use PtNodeInfo for i/o helpers
|
* h | char, char, char, char n * (1 or 3 bytes) : use PtNodeInfo for i/o helpers
|
||||||
|
@ -197,7 +197,7 @@ public final class FormatSpec {
|
||||||
static final int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
|
static final int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
|
||||||
static final int FLAG_HAS_BIGRAMS = 0x04;
|
static final int FLAG_HAS_BIGRAMS = 0x04;
|
||||||
static final int FLAG_IS_NOT_A_WORD = 0x02;
|
static final int FLAG_IS_NOT_A_WORD = 0x02;
|
||||||
static final int FLAG_IS_BLACKLISTED = 0x01;
|
static final int FLAG_IS_POSSIBLY_OFFENSIVE = 0x01;
|
||||||
|
|
||||||
static final int FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT = 0x80;
|
static final int FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT = 0x80;
|
||||||
static final int FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE = 0x40;
|
static final int FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE = 0x40;
|
||||||
|
|
|
@ -41,7 +41,7 @@ public final class WordProperty implements Comparable<WordProperty> {
|
||||||
// TODO: Support mIsBeginningOfSentence.
|
// TODO: Support mIsBeginningOfSentence.
|
||||||
public final boolean mIsBeginningOfSentence;
|
public final boolean mIsBeginningOfSentence;
|
||||||
public final boolean mIsNotAWord;
|
public final boolean mIsNotAWord;
|
||||||
public final boolean mIsBlacklistEntry;
|
public final boolean mIsPossiblyOffensive;
|
||||||
public final boolean mHasShortcuts;
|
public final boolean mHasShortcuts;
|
||||||
public final boolean mHasNgrams;
|
public final boolean mHasNgrams;
|
||||||
|
|
||||||
|
@ -52,7 +52,7 @@ public final class WordProperty implements Comparable<WordProperty> {
|
||||||
public WordProperty(final String word, final ProbabilityInfo probabilityInfo,
|
public WordProperty(final String word, final ProbabilityInfo probabilityInfo,
|
||||||
final ArrayList<WeightedString> shortcutTargets,
|
final ArrayList<WeightedString> shortcutTargets,
|
||||||
@Nullable final ArrayList<WeightedString> bigrams,
|
@Nullable final ArrayList<WeightedString> bigrams,
|
||||||
final boolean isNotAWord, final boolean isBlacklistEntry) {
|
final boolean isNotAWord, final boolean isPossiblyOffensive) {
|
||||||
mWord = word;
|
mWord = word;
|
||||||
mProbabilityInfo = probabilityInfo;
|
mProbabilityInfo = probabilityInfo;
|
||||||
mShortcutTargets = shortcutTargets;
|
mShortcutTargets = shortcutTargets;
|
||||||
|
@ -69,7 +69,7 @@ public final class WordProperty implements Comparable<WordProperty> {
|
||||||
}
|
}
|
||||||
mIsBeginningOfSentence = false;
|
mIsBeginningOfSentence = false;
|
||||||
mIsNotAWord = isNotAWord;
|
mIsNotAWord = isNotAWord;
|
||||||
mIsBlacklistEntry = isBlacklistEntry;
|
mIsPossiblyOffensive = isPossiblyOffensive;
|
||||||
mHasNgrams = bigrams != null && !bigrams.isEmpty();
|
mHasNgrams = bigrams != null && !bigrams.isEmpty();
|
||||||
mHasShortcuts = shortcutTargets != null && !shortcutTargets.isEmpty();
|
mHasShortcuts = shortcutTargets != null && !shortcutTargets.isEmpty();
|
||||||
}
|
}
|
||||||
|
@ -85,7 +85,7 @@ public final class WordProperty implements Comparable<WordProperty> {
|
||||||
// Construct word property using information from native code.
|
// Construct word property using information from native code.
|
||||||
// This represents invalid word when the probability is BinaryDictionary.NOT_A_PROBABILITY.
|
// This represents invalid word when the probability is BinaryDictionary.NOT_A_PROBABILITY.
|
||||||
public WordProperty(final int[] codePoints, final boolean isNotAWord,
|
public WordProperty(final int[] codePoints, final boolean isNotAWord,
|
||||||
final boolean isBlacklisted, final boolean hasBigram, final boolean hasShortcuts,
|
final boolean isPossiblyOffensive, final boolean hasBigram, final boolean hasShortcuts,
|
||||||
final boolean isBeginningOfSentence, final int[] probabilityInfo,
|
final boolean isBeginningOfSentence, final int[] probabilityInfo,
|
||||||
final ArrayList<int[][]> ngramPrevWordsArray,
|
final ArrayList<int[][]> ngramPrevWordsArray,
|
||||||
final ArrayList<boolean[]> outNgramPrevWordIsBeginningOfSentenceArray,
|
final ArrayList<boolean[]> outNgramPrevWordIsBeginningOfSentenceArray,
|
||||||
|
@ -98,7 +98,7 @@ public final class WordProperty implements Comparable<WordProperty> {
|
||||||
final ArrayList<NgramProperty> ngrams = new ArrayList<>();
|
final ArrayList<NgramProperty> ngrams = new ArrayList<>();
|
||||||
mIsBeginningOfSentence = isBeginningOfSentence;
|
mIsBeginningOfSentence = isBeginningOfSentence;
|
||||||
mIsNotAWord = isNotAWord;
|
mIsNotAWord = isNotAWord;
|
||||||
mIsBlacklistEntry = isBlacklisted;
|
mIsPossiblyOffensive = isPossiblyOffensive;
|
||||||
mHasShortcuts = hasShortcuts;
|
mHasShortcuts = hasShortcuts;
|
||||||
mHasNgrams = hasBigram;
|
mHasNgrams = hasBigram;
|
||||||
|
|
||||||
|
@ -150,7 +150,7 @@ public final class WordProperty implements Comparable<WordProperty> {
|
||||||
word.mShortcutTargets,
|
word.mShortcutTargets,
|
||||||
word.mNgrams,
|
word.mNgrams,
|
||||||
word.mIsNotAWord,
|
word.mIsNotAWord,
|
||||||
word.mIsBlacklistEntry
|
word.mIsPossiblyOffensive
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -180,7 +180,7 @@ public final class WordProperty implements Comparable<WordProperty> {
|
||||||
WordProperty w = (WordProperty)o;
|
WordProperty w = (WordProperty)o;
|
||||||
return mProbabilityInfo.equals(w.mProbabilityInfo) && mWord.equals(w.mWord)
|
return mProbabilityInfo.equals(w.mProbabilityInfo) && mWord.equals(w.mWord)
|
||||||
&& mShortcutTargets.equals(w.mShortcutTargets) && equals(mNgrams, w.mNgrams)
|
&& mShortcutTargets.equals(w.mShortcutTargets) && equals(mNgrams, w.mNgrams)
|
||||||
&& mIsNotAWord == w.mIsNotAWord && mIsBlacklistEntry == w.mIsBlacklistEntry
|
&& mIsNotAWord == w.mIsNotAWord && mIsPossiblyOffensive == w.mIsPossiblyOffensive
|
||||||
&& mHasNgrams == w.mHasNgrams && mHasShortcuts && w.mHasNgrams;
|
&& mHasNgrams == w.mHasNgrams && mHasShortcuts && w.mHasNgrams;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -63,7 +63,7 @@ public class CombinedFormatUtils {
|
||||||
if (wordProperty.mIsNotAWord) {
|
if (wordProperty.mIsNotAWord) {
|
||||||
builder.append("," + NOT_A_WORD_TAG + "=true");
|
builder.append("," + NOT_A_WORD_TAG + "=true");
|
||||||
}
|
}
|
||||||
if (wordProperty.mIsBlacklistEntry) {
|
if (wordProperty.mIsPossiblyOffensive) {
|
||||||
builder.append("," + BLACKLISTED_TAG + "=true");
|
builder.append("," + BLACKLISTED_TAG + "=true");
|
||||||
}
|
}
|
||||||
builder.append("\n");
|
builder.append("\n");
|
||||||
|
|
|
@ -54,7 +54,7 @@ public final class LanguageModelParam {
|
||||||
public final int mBigramProbability;
|
public final int mBigramProbability;
|
||||||
public final int mShortcutProbability;
|
public final int mShortcutProbability;
|
||||||
public final boolean mIsNotAWord;
|
public final boolean mIsNotAWord;
|
||||||
public final boolean mIsBlacklisted;
|
public final boolean mIsPossiblyOffensive;
|
||||||
// Time stamp in seconds.
|
// Time stamp in seconds.
|
||||||
public final int mTimestamp;
|
public final int mTimestamp;
|
||||||
|
|
||||||
|
@ -78,7 +78,7 @@ public final class LanguageModelParam {
|
||||||
mBigramProbability = bigramProbability;
|
mBigramProbability = bigramProbability;
|
||||||
mShortcutProbability = Dictionary.NOT_A_PROBABILITY;
|
mShortcutProbability = Dictionary.NOT_A_PROBABILITY;
|
||||||
mIsNotAWord = false;
|
mIsNotAWord = false;
|
||||||
mIsBlacklisted = false;
|
mIsPossiblyOffensive = false;
|
||||||
mTimestamp = timestamp;
|
mTimestamp = timestamp;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -358,7 +358,7 @@ static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz,
|
||||||
|
|
||||||
static bool latinime_BinaryDictionary_addUnigramEntry(JNIEnv *env, jclass clazz, jlong dict,
|
static bool latinime_BinaryDictionary_addUnigramEntry(JNIEnv *env, jclass clazz, jlong dict,
|
||||||
jintArray word, jint probability, jintArray shortcutTarget, jint shortcutProbability,
|
jintArray word, jint probability, jintArray shortcutTarget, jint shortcutProbability,
|
||||||
jboolean isBeginningOfSentence, jboolean isNotAWord, jboolean isBlacklisted,
|
jboolean isBeginningOfSentence, jboolean isNotAWord, jboolean isPossiblyOffensive,
|
||||||
jint timestamp) {
|
jint timestamp) {
|
||||||
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
|
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
|
||||||
if (!dictionary) {
|
if (!dictionary) {
|
||||||
|
@ -377,8 +377,8 @@ static bool latinime_BinaryDictionary_addUnigramEntry(JNIEnv *env, jclass clazz,
|
||||||
}
|
}
|
||||||
// Use 1 for count to indicate the word has been input.
|
// Use 1 for count to indicate the word has been input.
|
||||||
const UnigramProperty unigramProperty(isBeginningOfSentence, isNotAWord,
|
const UnigramProperty unigramProperty(isBeginningOfSentence, isNotAWord,
|
||||||
isBlacklisted, probability, HistoricalInfo(timestamp, 0 /* level */, 1 /* count */),
|
isPossiblyOffensive, probability, HistoricalInfo(timestamp, 0 /* level */,
|
||||||
std::move(shortcuts));
|
1 /* count */), std::move(shortcuts));
|
||||||
return dictionary->addUnigramEntry(CodePointArrayView(codePoints, codePointCount),
|
return dictionary->addUnigramEntry(CodePointArrayView(codePoints, codePointCount),
|
||||||
&unigramProperty);
|
&unigramProperty);
|
||||||
}
|
}
|
||||||
|
@ -480,8 +480,8 @@ static int latinime_BinaryDictionary_addMultipleDictionaryEntries(JNIEnv *env, j
|
||||||
env->GetFieldID(languageModelParamClass, "mShortcutProbability", "I");
|
env->GetFieldID(languageModelParamClass, "mShortcutProbability", "I");
|
||||||
jfieldID isNotAWordFieldId =
|
jfieldID isNotAWordFieldId =
|
||||||
env->GetFieldID(languageModelParamClass, "mIsNotAWord", "Z");
|
env->GetFieldID(languageModelParamClass, "mIsNotAWord", "Z");
|
||||||
jfieldID isBlacklistedFieldId =
|
jfieldID isPossiblyOffensiveFieldId =
|
||||||
env->GetFieldID(languageModelParamClass, "mIsBlacklisted", "Z");
|
env->GetFieldID(languageModelParamClass, "mIsPossiblyOffensive", "Z");
|
||||||
env->DeleteLocalRef(languageModelParamClass);
|
env->DeleteLocalRef(languageModelParamClass);
|
||||||
|
|
||||||
for (int i = startIndex; i < languageModelParamCount; ++i) {
|
for (int i = startIndex; i < languageModelParamCount; ++i) {
|
||||||
|
@ -504,7 +504,8 @@ static int latinime_BinaryDictionary_addMultipleDictionaryEntries(JNIEnv *env, j
|
||||||
jint unigramProbability = env->GetIntField(languageModelParam, unigramProbabilityFieldId);
|
jint unigramProbability = env->GetIntField(languageModelParam, unigramProbabilityFieldId);
|
||||||
jint timestamp = env->GetIntField(languageModelParam, timestampFieldId);
|
jint timestamp = env->GetIntField(languageModelParam, timestampFieldId);
|
||||||
jboolean isNotAWord = env->GetBooleanField(languageModelParam, isNotAWordFieldId);
|
jboolean isNotAWord = env->GetBooleanField(languageModelParam, isNotAWordFieldId);
|
||||||
jboolean isBlacklisted = env->GetBooleanField(languageModelParam, isBlacklistedFieldId);
|
jboolean isPossiblyOffensive = env->GetBooleanField(languageModelParam,
|
||||||
|
isPossiblyOffensiveFieldId);
|
||||||
jintArray shortcutTarget = static_cast<jintArray>(
|
jintArray shortcutTarget = static_cast<jintArray>(
|
||||||
env->GetObjectField(languageModelParam, shortcutTargetFieldId));
|
env->GetObjectField(languageModelParam, shortcutTargetFieldId));
|
||||||
std::vector<UnigramProperty::ShortcutProperty> shortcuts;
|
std::vector<UnigramProperty::ShortcutProperty> shortcuts;
|
||||||
|
@ -519,7 +520,7 @@ static int latinime_BinaryDictionary_addMultipleDictionaryEntries(JNIEnv *env, j
|
||||||
}
|
}
|
||||||
// Use 1 for count to indicate the word has been input.
|
// Use 1 for count to indicate the word has been input.
|
||||||
const UnigramProperty unigramProperty(false /* isBeginningOfSentence */, isNotAWord,
|
const UnigramProperty unigramProperty(false /* isBeginningOfSentence */, isNotAWord,
|
||||||
isBlacklisted, unigramProbability,
|
isPossiblyOffensive, unigramProbability,
|
||||||
HistoricalInfo(timestamp, 0 /* level */, 1 /* count */), std::move(shortcuts));
|
HistoricalInfo(timestamp, 0 /* level */, 1 /* count */), std::move(shortcuts));
|
||||||
dictionary->addUnigramEntry(CodePointArrayView(word1CodePoints, word1Length),
|
dictionary->addUnigramEntry(CodePointArrayView(word1CodePoints, word1Length),
|
||||||
&unigramProperty);
|
&unigramProperty);
|
||||||
|
|
|
@ -49,21 +49,44 @@ class UnigramProperty {
|
||||||
};
|
};
|
||||||
|
|
||||||
UnigramProperty()
|
UnigramProperty()
|
||||||
: mRepresentsBeginningOfSentence(false), mIsNotAWord(false), mIsBlacklisted(false),
|
: mRepresentsBeginningOfSentence(false), mIsNotAWord(false),
|
||||||
mProbability(NOT_A_PROBABILITY), mHistoricalInfo(), mShortcuts() {}
|
mIsBlacklisted(false), mIsPossiblyOffensive(false), mProbability(NOT_A_PROBABILITY),
|
||||||
|
mHistoricalInfo(), mShortcuts() {}
|
||||||
|
|
||||||
|
// In contexts which do not support the Blacklisted flag (v2, v4<403)
|
||||||
UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
|
UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
|
||||||
const bool isBlacklisted, const int probability, const HistoricalInfo historicalInfo,
|
const bool isPossiblyOffensive, const int probability,
|
||||||
const std::vector<ShortcutProperty> &&shortcuts)
|
const HistoricalInfo historicalInfo, const std::vector<ShortcutProperty> &&shortcuts)
|
||||||
: mRepresentsBeginningOfSentence(representsBeginningOfSentence),
|
: mRepresentsBeginningOfSentence(representsBeginningOfSentence),
|
||||||
mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), mProbability(probability),
|
mIsNotAWord(isNotAWord), mIsBlacklisted(false),
|
||||||
|
mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability),
|
||||||
mHistoricalInfo(historicalInfo), mShortcuts(std::move(shortcuts)) {}
|
mHistoricalInfo(historicalInfo), mShortcuts(std::move(shortcuts)) {}
|
||||||
|
|
||||||
// Without shortcuts.
|
// Without shortcuts, in contexts which do not support the Blacklisted flag (v2, v4<403)
|
||||||
UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
|
UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
|
||||||
const bool isBlacklisted, const int probability, const HistoricalInfo historicalInfo)
|
const bool isPossiblyOffensive, const int probability,
|
||||||
|
const HistoricalInfo historicalInfo)
|
||||||
: mRepresentsBeginningOfSentence(representsBeginningOfSentence),
|
: mRepresentsBeginningOfSentence(representsBeginningOfSentence),
|
||||||
mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), mProbability(probability),
|
mIsNotAWord(isNotAWord), mIsBlacklisted(false),
|
||||||
|
mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability),
|
||||||
|
mHistoricalInfo(historicalInfo), mShortcuts() {}
|
||||||
|
|
||||||
|
// In contexts which DO support the Blacklisted flag (v403)
|
||||||
|
UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
|
||||||
|
const bool isBlacklisted, const bool isPossiblyOffensive, const int probability,
|
||||||
|
const HistoricalInfo historicalInfo, const std::vector<ShortcutProperty> &&shortcuts)
|
||||||
|
: mRepresentsBeginningOfSentence(representsBeginningOfSentence),
|
||||||
|
mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted),
|
||||||
|
mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability),
|
||||||
|
mHistoricalInfo(historicalInfo), mShortcuts(std::move(shortcuts)) {}
|
||||||
|
|
||||||
|
// Without shortcuts, in contexts which DO support the Blacklisted flag (v403)
|
||||||
|
UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
|
||||||
|
const bool isBlacklisted, const bool isPossiblyOffensive, const int probability,
|
||||||
|
const HistoricalInfo historicalInfo)
|
||||||
|
: mRepresentsBeginningOfSentence(representsBeginningOfSentence),
|
||||||
|
mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted),
|
||||||
|
mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability),
|
||||||
mHistoricalInfo(historicalInfo), mShortcuts() {}
|
mHistoricalInfo(historicalInfo), mShortcuts() {}
|
||||||
|
|
||||||
bool representsBeginningOfSentence() const {
|
bool representsBeginningOfSentence() const {
|
||||||
|
@ -74,13 +97,12 @@ class UnigramProperty {
|
||||||
return mIsNotAWord;
|
return mIsNotAWord;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool isBlacklisted() const {
|
bool isPossiblyOffensive() const {
|
||||||
return mIsBlacklisted;
|
return mIsPossiblyOffensive;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool isPossiblyOffensive() const {
|
bool isBlacklisted() const {
|
||||||
// TODO: Have dedicated flag.
|
return mIsBlacklisted;
|
||||||
return mProbability == 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool hasShortcuts() const {
|
bool hasShortcuts() const {
|
||||||
|
@ -106,6 +128,7 @@ class UnigramProperty {
|
||||||
const bool mRepresentsBeginningOfSentence;
|
const bool mRepresentsBeginningOfSentence;
|
||||||
const bool mIsNotAWord;
|
const bool mIsNotAWord;
|
||||||
const bool mIsBlacklisted;
|
const bool mIsBlacklisted;
|
||||||
|
const bool mIsPossiblyOffensive;
|
||||||
const int mProbability;
|
const int mProbability;
|
||||||
const HistoricalInfo mHistoricalInfo;
|
const HistoricalInfo mHistoricalInfo;
|
||||||
const std::vector<ShortcutProperty> mShortcuts;
|
const std::vector<ShortcutProperty> mShortcuts;
|
||||||
|
|
|
@ -28,7 +28,7 @@ void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints,
|
||||||
JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */,
|
JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */,
|
||||||
MAX_WORD_LENGTH /* maxLength */, mCodePoints.data(), mCodePoints.size(),
|
MAX_WORD_LENGTH /* maxLength */, mCodePoints.data(), mCodePoints.size(),
|
||||||
false /* needsNullTermination */);
|
false /* needsNullTermination */);
|
||||||
jboolean flags[] = {mUnigramProperty.isNotAWord(), mUnigramProperty.isBlacklisted(),
|
jboolean flags[] = {mUnigramProperty.isNotAWord(), mUnigramProperty.isPossiblyOffensive(),
|
||||||
!mNgrams.empty(), mUnigramProperty.hasShortcuts(),
|
!mNgrams.empty(), mUnigramProperty.hasShortcuts(),
|
||||||
mUnigramProperty.representsBeginningOfSentence()};
|
mUnigramProperty.representsBeginningOfSentence()};
|
||||||
env->SetBooleanArrayRegion(outFlags, 0 /* start */, NELEMS(flags), flags);
|
env->SetBooleanArrayRegion(outFlags, 0 /* start */, NELEMS(flags), flags);
|
||||||
|
|
|
@ -43,6 +43,14 @@ class WordAttributes {
|
||||||
return mIsNotAWord;
|
return mIsNotAWord;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Whether or not a word is possibly offensive.
|
||||||
|
// * Static dictionaries <v202, as well as dynamic dictionaries <v403, will set this based on
|
||||||
|
// whether or not the probability of the word is zero.
|
||||||
|
// * Static dictionaries >=v203 will set this based on the IS_POSSIBLY_OFFENSIVE PtNode flag.
|
||||||
|
// * Dynamic dictionaries >=v403 will set this based on the IS_POSSIBLY_OFFENSIVE language model
|
||||||
|
// flag (the PtNode flag IS_BLACKLISTED is ignored and kept as zero)
|
||||||
|
//
|
||||||
|
// See the ::getWordAttributes function for each of these dictionary policies for more details.
|
||||||
bool isPossiblyOffensive() const {
|
bool isPossiblyOffensive() const {
|
||||||
return mIsPossiblyOffensive;
|
return mIsPossiblyOffensive;
|
||||||
}
|
}
|
||||||
|
|
|
@ -245,7 +245,7 @@ bool Ver4PatriciaTrieNodeWriter::addNgramEntry(const WordIdArrayView prevWordIds
|
||||||
if (!sourcePtNodeParams.hasBigrams()) {
|
if (!sourcePtNodeParams.hasBigrams()) {
|
||||||
// Update has bigrams flag.
|
// Update has bigrams flag.
|
||||||
return updatePtNodeFlags(sourcePtNodeParams.getHeadPos(),
|
return updatePtNodeFlags(sourcePtNodeParams.getHeadPos(),
|
||||||
sourcePtNodeParams.isBlacklisted(), sourcePtNodeParams.isNotAWord(),
|
sourcePtNodeParams.isPossiblyOffensive(), sourcePtNodeParams.isNotAWord(),
|
||||||
sourcePtNodeParams.isTerminal(), sourcePtNodeParams.hasShortcutTargets(),
|
sourcePtNodeParams.isTerminal(), sourcePtNodeParams.hasShortcutTargets(),
|
||||||
true /* hasBigrams */,
|
true /* hasBigrams */,
|
||||||
sourcePtNodeParams.getCodePointCount() > 1 /* hasMultipleChars */);
|
sourcePtNodeParams.getCodePointCount() > 1 /* hasMultipleChars */);
|
||||||
|
@ -316,7 +316,7 @@ bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptN
|
||||||
if (!ptNodeParams->hasShortcutTargets()) {
|
if (!ptNodeParams->hasShortcutTargets()) {
|
||||||
// Update has shortcut targets flag.
|
// Update has shortcut targets flag.
|
||||||
return updatePtNodeFlags(ptNodeParams->getHeadPos(),
|
return updatePtNodeFlags(ptNodeParams->getHeadPos(),
|
||||||
ptNodeParams->isBlacklisted(), ptNodeParams->isNotAWord(),
|
ptNodeParams->isPossiblyOffensive(), ptNodeParams->isNotAWord(),
|
||||||
ptNodeParams->isTerminal(), true /* hasShortcutTargets */,
|
ptNodeParams->isTerminal(), true /* hasShortcutTargets */,
|
||||||
ptNodeParams->hasBigrams(),
|
ptNodeParams->hasBigrams(),
|
||||||
ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
|
ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
|
||||||
|
@ -330,7 +330,7 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeHasBigramsAndShortcutTargetsFlags(
|
||||||
ptNodeParams->getTerminalId()) != NOT_A_DICT_POS;
|
ptNodeParams->getTerminalId()) != NOT_A_DICT_POS;
|
||||||
const bool hasShortcutTargets = mBuffers->getShortcutDictContent()->getShortcutListHeadPos(
|
const bool hasShortcutTargets = mBuffers->getShortcutDictContent()->getShortcutListHeadPos(
|
||||||
ptNodeParams->getTerminalId()) != NOT_A_DICT_POS;
|
ptNodeParams->getTerminalId()) != NOT_A_DICT_POS;
|
||||||
return updatePtNodeFlags(ptNodeParams->getHeadPos(), ptNodeParams->isBlacklisted(),
|
return updatePtNodeFlags(ptNodeParams->getHeadPos(), ptNodeParams->isPossiblyOffensive(),
|
||||||
ptNodeParams->isNotAWord(), ptNodeParams->isTerminal(), hasShortcutTargets,
|
ptNodeParams->isNotAWord(), ptNodeParams->isTerminal(), hasShortcutTargets,
|
||||||
hasBigrams, ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
|
hasBigrams, ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
|
||||||
}
|
}
|
||||||
|
@ -386,8 +386,9 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition(
|
||||||
ptNodeParams->getChildrenPos(), ptNodeWritingPos)) {
|
ptNodeParams->getChildrenPos(), ptNodeWritingPos)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return updatePtNodeFlags(nodePos, ptNodeParams->isBlacklisted(), ptNodeParams->isNotAWord(),
|
return updatePtNodeFlags(nodePos, ptNodeParams->isPossiblyOffensive(),
|
||||||
isTerminal, ptNodeParams->hasShortcutTargets(), ptNodeParams->hasBigrams(),
|
ptNodeParams->isNotAWord(), isTerminal, ptNodeParams->hasShortcutTargets(),
|
||||||
|
ptNodeParams->hasBigrams(),
|
||||||
ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
|
ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -608,8 +608,8 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
|
const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
|
||||||
ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
|
ptNodeParams.isNotAWord(), ptNodeParams.isPossiblyOffensive(),
|
||||||
*historicalInfo, std::move(shortcuts));
|
ptNodeParams.getProbability(), *historicalInfo, std::move(shortcuts));
|
||||||
return WordProperty(wordCodePoints.toVector(), &unigramProperty, &ngrams);
|
return WordProperty(wordCodePoints.toVector(), &unigramProperty, &ngrams);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -146,7 +146,7 @@ bool DynamicPtUpdatingHelper::setPtNodeProbability(const PtNodeParams *const ori
|
||||||
const int movedPos = mBuffer->getTailPosition();
|
const int movedPos = mBuffer->getTailPosition();
|
||||||
int writingPos = movedPos;
|
int writingPos = movedPos;
|
||||||
const PtNodeParams ptNodeParamsToWrite(getUpdatedPtNodeParams(originalPtNodeParams,
|
const PtNodeParams ptNodeParamsToWrite(getUpdatedPtNodeParams(originalPtNodeParams,
|
||||||
unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(),
|
unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
|
||||||
true /* isTerminal */, originalPtNodeParams->getParentPos(),
|
true /* isTerminal */, originalPtNodeParams->getParentPos(),
|
||||||
originalPtNodeParams->getCodePointArrayView(), unigramProperty->getProbability()));
|
originalPtNodeParams->getCodePointArrayView(), unigramProperty->getProbability()));
|
||||||
if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
|
if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
|
||||||
|
@ -180,8 +180,9 @@ bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode(
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
|
const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
|
||||||
unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), true /* isTerminal */,
|
unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
|
||||||
parentPtNodePos, ptNodeCodePoints, unigramProperty->getProbability()));
|
true /* isTerminal */, parentPtNodePos, ptNodeCodePoints,
|
||||||
|
unigramProperty->getProbability()));
|
||||||
if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
|
if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
|
||||||
unigramProperty, &writingPos)) {
|
unigramProperty, &writingPos)) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -214,7 +215,7 @@ bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes(
|
||||||
reallocatingPtNodeParams->getCodePointArrayView().limit(overlappingCodePointCount);
|
reallocatingPtNodeParams->getCodePointArrayView().limit(overlappingCodePointCount);
|
||||||
if (addsExtraChild) {
|
if (addsExtraChild) {
|
||||||
const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
|
const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
|
||||||
false /* isNotAWord */, false /* isBlacklisted */, false /* isTerminal */,
|
false /* isNotAWord */, false /* isPossiblyOffensive */, false /* isTerminal */,
|
||||||
reallocatingPtNodeParams->getParentPos(), firstPtNodeCodePoints,
|
reallocatingPtNodeParams->getParentPos(), firstPtNodeCodePoints,
|
||||||
NOT_A_PROBABILITY));
|
NOT_A_PROBABILITY));
|
||||||
if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&ptNodeParamsToWrite, &writingPos)) {
|
if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&ptNodeParamsToWrite, &writingPos)) {
|
||||||
|
@ -222,7 +223,7 @@ bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes(
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
|
const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
|
||||||
unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(),
|
unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
|
||||||
true /* isTerminal */, reallocatingPtNodeParams->getParentPos(),
|
true /* isTerminal */, reallocatingPtNodeParams->getParentPos(),
|
||||||
firstPtNodeCodePoints, unigramProperty->getProbability()));
|
firstPtNodeCodePoints, unigramProperty->getProbability()));
|
||||||
if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
|
if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
|
||||||
|
@ -240,7 +241,7 @@ bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes(
|
||||||
// Write the 2nd part of the reallocating node.
|
// Write the 2nd part of the reallocating node.
|
||||||
const int secondPartOfReallocatedPtNodePos = writingPos;
|
const int secondPartOfReallocatedPtNodePos = writingPos;
|
||||||
const PtNodeParams childPartPtNodeParams(getUpdatedPtNodeParams(reallocatingPtNodeParams,
|
const PtNodeParams childPartPtNodeParams(getUpdatedPtNodeParams(reallocatingPtNodeParams,
|
||||||
reallocatingPtNodeParams->isNotAWord(), reallocatingPtNodeParams->isBlacklisted(),
|
reallocatingPtNodeParams->isNotAWord(), reallocatingPtNodeParams->isPossiblyOffensive(),
|
||||||
reallocatingPtNodeParams->isTerminal(), firstPartOfReallocatedPtNodePos,
|
reallocatingPtNodeParams->isTerminal(), firstPartOfReallocatedPtNodePos,
|
||||||
reallocatingPtNodeParams->getCodePointArrayView().skip(overlappingCodePointCount),
|
reallocatingPtNodeParams->getCodePointArrayView().skip(overlappingCodePointCount),
|
||||||
reallocatingPtNodeParams->getProbability()));
|
reallocatingPtNodeParams->getProbability()));
|
||||||
|
@ -249,7 +250,7 @@ bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes(
|
||||||
}
|
}
|
||||||
if (addsExtraChild) {
|
if (addsExtraChild) {
|
||||||
const PtNodeParams extraChildPtNodeParams(getPtNodeParamsForNewPtNode(
|
const PtNodeParams extraChildPtNodeParams(getPtNodeParamsForNewPtNode(
|
||||||
unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(),
|
unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
|
||||||
true /* isTerminal */, firstPartOfReallocatedPtNodePos,
|
true /* isTerminal */, firstPartOfReallocatedPtNodePos,
|
||||||
newPtNodeCodePoints.skip(overlappingCodePointCount),
|
newPtNodeCodePoints.skip(overlappingCodePointCount),
|
||||||
unigramProperty->getProbability()));
|
unigramProperty->getProbability()));
|
||||||
|
@ -276,20 +277,20 @@ bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes(
|
||||||
|
|
||||||
const PtNodeParams DynamicPtUpdatingHelper::getUpdatedPtNodeParams(
|
const PtNodeParams DynamicPtUpdatingHelper::getUpdatedPtNodeParams(
|
||||||
const PtNodeParams *const originalPtNodeParams, const bool isNotAWord,
|
const PtNodeParams *const originalPtNodeParams, const bool isNotAWord,
|
||||||
const bool isBlacklisted, const bool isTerminal, const int parentPos,
|
const bool isPossiblyOffensive, const bool isTerminal, const int parentPos,
|
||||||
const CodePointArrayView codePoints, const int probability) const {
|
const CodePointArrayView codePoints, const int probability) const {
|
||||||
const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
|
const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
|
||||||
isBlacklisted, isNotAWord, isTerminal, false /* hasShortcutTargets */,
|
isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */,
|
||||||
false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */,
|
false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */,
|
||||||
CHILDREN_POSITION_FIELD_SIZE);
|
CHILDREN_POSITION_FIELD_SIZE);
|
||||||
return PtNodeParams(originalPtNodeParams, flags, parentPos, codePoints, probability);
|
return PtNodeParams(originalPtNodeParams, flags, parentPos, codePoints, probability);
|
||||||
}
|
}
|
||||||
|
|
||||||
const PtNodeParams DynamicPtUpdatingHelper::getPtNodeParamsForNewPtNode(const bool isNotAWord,
|
const PtNodeParams DynamicPtUpdatingHelper::getPtNodeParamsForNewPtNode(const bool isNotAWord,
|
||||||
const bool isBlacklisted, const bool isTerminal, const int parentPos,
|
const bool isPossiblyOffensive, const bool isTerminal, const int parentPos,
|
||||||
const CodePointArrayView codePoints, const int probability) const {
|
const CodePointArrayView codePoints, const int probability) const {
|
||||||
const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
|
const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
|
||||||
isBlacklisted, isNotAWord, isTerminal, false /* hasShortcutTargets */,
|
isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */,
|
||||||
false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */,
|
false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */,
|
||||||
CHILDREN_POSITION_FIELD_SIZE);
|
CHILDREN_POSITION_FIELD_SIZE);
|
||||||
return PtNodeParams(flags, parentPos, codePoints, probability);
|
return PtNodeParams(flags, parentPos, codePoints, probability);
|
||||||
|
|
|
@ -85,12 +85,12 @@ class DynamicPtUpdatingHelper {
|
||||||
const CodePointArrayView newPtNodeCodePoints);
|
const CodePointArrayView newPtNodeCodePoints);
|
||||||
|
|
||||||
const PtNodeParams getUpdatedPtNodeParams(const PtNodeParams *const originalPtNodeParams,
|
const PtNodeParams getUpdatedPtNodeParams(const PtNodeParams *const originalPtNodeParams,
|
||||||
const bool isNotAWord, const bool isBlacklisted, const bool isTerminal,
|
const bool isNotAWord, const bool isPossiblyOffensive, const bool isTerminal,
|
||||||
const int parentPos, const CodePointArrayView codePoints, const int probability) const;
|
const int parentPos, const CodePointArrayView codePoints, const int probability) const;
|
||||||
|
|
||||||
const PtNodeParams getPtNodeParamsForNewPtNode(const bool isNotAWord, const bool isBlacklisted,
|
const PtNodeParams getPtNodeParamsForNewPtNode(const bool isNotAWord,
|
||||||
const bool isTerminal, const int parentPos, const CodePointArrayView codePoints,
|
const bool isPossiblyOffensive, const bool isTerminal, const int parentPos,
|
||||||
const int probability) const;
|
const CodePointArrayView codePoints, const int probability) const;
|
||||||
};
|
};
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_UPDATING_HELPER_H */
|
#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_UPDATING_HELPER_H */
|
||||||
|
|
|
@ -41,8 +41,8 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_SHORTCUT_TARGETS = 0x08
|
||||||
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_BIGRAMS = 0x04;
|
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_BIGRAMS = 0x04;
|
||||||
// Flag for non-words (typically, shortcut only entries)
|
// Flag for non-words (typically, shortcut only entries)
|
||||||
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_NOT_A_WORD = 0x02;
|
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_NOT_A_WORD = 0x02;
|
||||||
// Flag for blacklist
|
// Flag for possibly offensive words
|
||||||
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
|
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_POSSIBLY_OFFENSIVE = 0x01;
|
||||||
|
|
||||||
/* static */ int PtReadingUtils::getPtNodeArraySizeAndAdvancePosition(
|
/* static */ int PtReadingUtils::getPtNodeArraySizeAndAdvancePosition(
|
||||||
const uint8_t *const buffer, int *const pos) {
|
const uint8_t *const buffer, int *const pos) {
|
||||||
|
|
|
@ -54,8 +54,8 @@ class PatriciaTrieReadingUtils {
|
||||||
/**
|
/**
|
||||||
* Node Flags
|
* Node Flags
|
||||||
*/
|
*/
|
||||||
static AK_FORCE_INLINE bool isBlacklisted(const NodeFlags flags) {
|
static AK_FORCE_INLINE bool isPossiblyOffensive(const NodeFlags flags) {
|
||||||
return (flags & FLAG_IS_BLACKLISTED) != 0;
|
return (flags & FLAG_IS_POSSIBLY_OFFENSIVE) != 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static AK_FORCE_INLINE bool isNotAWord(const NodeFlags flags) {
|
static AK_FORCE_INLINE bool isNotAWord(const NodeFlags flags) {
|
||||||
|
@ -82,12 +82,12 @@ class PatriciaTrieReadingUtils {
|
||||||
return FLAG_CHILDREN_POSITION_TYPE_NOPOSITION != (MASK_CHILDREN_POSITION_TYPE & flags);
|
return FLAG_CHILDREN_POSITION_TYPE_NOPOSITION != (MASK_CHILDREN_POSITION_TYPE & flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
static AK_FORCE_INLINE NodeFlags createAndGetFlags(const bool isBlacklisted,
|
static AK_FORCE_INLINE NodeFlags createAndGetFlags(const bool isPossiblyOffensive,
|
||||||
const bool isNotAWord, const bool isTerminal, const bool hasShortcutTargets,
|
const bool isNotAWord, const bool isTerminal, const bool hasShortcutTargets,
|
||||||
const bool hasBigrams, const bool hasMultipleChars,
|
const bool hasBigrams, const bool hasMultipleChars,
|
||||||
const int childrenPositionFieldSize) {
|
const int childrenPositionFieldSize) {
|
||||||
NodeFlags nodeFlags = 0;
|
NodeFlags nodeFlags = 0;
|
||||||
nodeFlags = isBlacklisted ? (nodeFlags | FLAG_IS_BLACKLISTED) : nodeFlags;
|
nodeFlags = isPossiblyOffensive ? (nodeFlags | FLAG_IS_POSSIBLY_OFFENSIVE) : nodeFlags;
|
||||||
nodeFlags = isNotAWord ? (nodeFlags | FLAG_IS_NOT_A_WORD) : nodeFlags;
|
nodeFlags = isNotAWord ? (nodeFlags | FLAG_IS_NOT_A_WORD) : nodeFlags;
|
||||||
nodeFlags = isTerminal ? (nodeFlags | FLAG_IS_TERMINAL) : nodeFlags;
|
nodeFlags = isTerminal ? (nodeFlags | FLAG_IS_TERMINAL) : nodeFlags;
|
||||||
nodeFlags = hasShortcutTargets ? (nodeFlags | FLAG_HAS_SHORTCUT_TARGETS) : nodeFlags;
|
nodeFlags = hasShortcutTargets ? (nodeFlags | FLAG_HAS_SHORTCUT_TARGETS) : nodeFlags;
|
||||||
|
@ -127,7 +127,7 @@ class PatriciaTrieReadingUtils {
|
||||||
static const NodeFlags FLAG_HAS_SHORTCUT_TARGETS;
|
static const NodeFlags FLAG_HAS_SHORTCUT_TARGETS;
|
||||||
static const NodeFlags FLAG_HAS_BIGRAMS;
|
static const NodeFlags FLAG_HAS_BIGRAMS;
|
||||||
static const NodeFlags FLAG_IS_NOT_A_WORD;
|
static const NodeFlags FLAG_IS_NOT_A_WORD;
|
||||||
static const NodeFlags FLAG_IS_BLACKLISTED;
|
static const NodeFlags FLAG_IS_POSSIBLY_OFFENSIVE;
|
||||||
};
|
};
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
#endif /* LATINIME_PATRICIA_TRIE_NODE_READING_UTILS_H */
|
#endif /* LATINIME_PATRICIA_TRIE_NODE_READING_UTILS_H */
|
||||||
|
|
|
@ -145,7 +145,18 @@ class PtNodeParams {
|
||||||
}
|
}
|
||||||
|
|
||||||
AK_FORCE_INLINE bool isBlacklisted() const {
|
AK_FORCE_INLINE bool isBlacklisted() const {
|
||||||
return PatriciaTrieReadingUtils::isBlacklisted(mFlags);
|
// Note: this method will be removed in the next change.
|
||||||
|
// It is used in getProbabilityOfWord and getWordAttributes for both v402 and v403.
|
||||||
|
// * getProbabilityOfWord will be changed to no longer return NOT_A_PROBABILITY
|
||||||
|
// when isBlacklisted (i.e. to only check if isNotAWord or isDeleted)
|
||||||
|
// * getWordAttributes will be changed to always return blacklisted=false and
|
||||||
|
// isPossiblyOffensive according to the function below (instead of the current
|
||||||
|
// behaviour of checking if the probability is zero)
|
||||||
|
return PatriciaTrieReadingUtils::isPossiblyOffensive(mFlags);
|
||||||
|
}
|
||||||
|
|
||||||
|
AK_FORCE_INLINE bool isPossiblyOffensive() const {
|
||||||
|
return PatriciaTrieReadingUtils::isPossiblyOffensive(mFlags);
|
||||||
}
|
}
|
||||||
|
|
||||||
AK_FORCE_INLINE bool isNotAWord() const {
|
AK_FORCE_INLINE bool isNotAWord() const {
|
||||||
|
|
|
@ -476,8 +476,8 @@ const WordProperty PatriciaTriePolicy::getWordProperty(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
|
const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
|
||||||
ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
|
ptNodeParams.isNotAWord(), ptNodeParams.isPossiblyOffensive(),
|
||||||
HistoricalInfo(), std::move(shortcuts));
|
ptNodeParams.getProbability(), HistoricalInfo(), std::move(shortcuts));
|
||||||
return WordProperty(wordCodePoints.toVector(), &unigramProperty, &ngrams);
|
return WordProperty(wordCodePoints.toVector(), &unigramProperty, &ngrams);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -342,7 +342,7 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos, const bo
|
||||||
// Create node flags and write them.
|
// Create node flags and write them.
|
||||||
PatriciaTrieReadingUtils::NodeFlags nodeFlags =
|
PatriciaTrieReadingUtils::NodeFlags nodeFlags =
|
||||||
PatriciaTrieReadingUtils::createAndGetFlags(false /* isNotAWord */,
|
PatriciaTrieReadingUtils::createAndGetFlags(false /* isNotAWord */,
|
||||||
false /* isBlacklisted */, isTerminal, false /* hasShortcutTargets */,
|
false /* isPossiblyOffensive */, isTerminal, false /* hasShortcutTargets */,
|
||||||
false /* hasBigrams */, hasMultipleChars, CHILDREN_POSITION_FIELD_SIZE);
|
false /* hasBigrams */, hasMultipleChars, CHILDREN_POSITION_FIELD_SIZE);
|
||||||
if (!DynamicPtWritingUtils::writeFlags(mTrieBuffer, nodeFlags, ptNodePos)) {
|
if (!DynamicPtWritingUtils::writeFlags(mTrieBuffer, nodeFlags, ptNodePos)) {
|
||||||
AKLOGE("Cannot write PtNode flags. flags: %x, pos: %d", nodeFlags, ptNodePos);
|
AKLOGE("Cannot write PtNode flags. flags: %x, pos: %d", nodeFlags, ptNodePos);
|
||||||
|
|
|
@ -299,7 +299,8 @@ bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramContext *const ngramContex
|
||||||
}
|
}
|
||||||
const UnigramProperty beginningOfSentenceUnigramProperty(
|
const UnigramProperty beginningOfSentenceUnigramProperty(
|
||||||
true /* representsBeginningOfSentence */, true /* isNotAWord */,
|
true /* representsBeginningOfSentence */, true /* isNotAWord */,
|
||||||
false /* isBlacklisted */, MAX_PROBABILITY /* probability */, HistoricalInfo());
|
false /* isBlacklisted */, false /* isPossiblyOffensive */,
|
||||||
|
MAX_PROBABILITY /* probability */, HistoricalInfo());
|
||||||
if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */),
|
if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */),
|
||||||
&beginningOfSentenceUnigramProperty)) {
|
&beginningOfSentenceUnigramProperty)) {
|
||||||
AKLOGE("Cannot add unigram entry for the beginning-of-sentence.");
|
AKLOGE("Cannot add unigram entry for the beginning-of-sentence.");
|
||||||
|
@ -375,8 +376,9 @@ bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext(
|
||||||
if (wordId == NOT_A_WORD_ID) {
|
if (wordId == NOT_A_WORD_ID) {
|
||||||
// The word is not in the dictionary.
|
// The word is not in the dictionary.
|
||||||
const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */,
|
const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */,
|
||||||
false /* isNotAWord */, false /* isBlacklisted */, NOT_A_PROBABILITY,
|
false /* isNotAWord */, false /* isBlacklisted */, false /* isPossiblyOffensive */,
|
||||||
HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */));
|
NOT_A_PROBABILITY, HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */,
|
||||||
|
0 /* count */));
|
||||||
if (!addUnigramEntry(wordCodePoints, &unigramProperty)) {
|
if (!addUnigramEntry(wordCodePoints, &unigramProperty)) {
|
||||||
AKLOGE("Cannot add unigarm entry in updateEntriesForWordWithNgramContext().");
|
AKLOGE("Cannot add unigarm entry in updateEntriesForWordWithNgramContext().");
|
||||||
return false;
|
return false;
|
||||||
|
@ -391,7 +393,7 @@ bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext(
|
||||||
&& ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) {
|
&& ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) {
|
||||||
const UnigramProperty beginningOfSentenceUnigramProperty(
|
const UnigramProperty beginningOfSentenceUnigramProperty(
|
||||||
true /* representsBeginningOfSentence */,
|
true /* representsBeginningOfSentence */,
|
||||||
true /* isNotAWord */, false /* isBlacklisted */, NOT_A_PROBABILITY,
|
true /* isNotAWord */, false /* isPossiblyOffensive */, NOT_A_PROBABILITY,
|
||||||
HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */));
|
HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */));
|
||||||
if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */),
|
if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */),
|
||||||
&beginningOfSentenceUnigramProperty)) {
|
&beginningOfSentenceUnigramProperty)) {
|
||||||
|
@ -529,7 +531,8 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
|
||||||
}
|
}
|
||||||
const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(),
|
const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(),
|
||||||
probabilityEntry.isNotAWord(), probabilityEntry.isBlacklisted(),
|
probabilityEntry.isNotAWord(), probabilityEntry.isBlacklisted(),
|
||||||
probabilityEntry.getProbability(), *historicalInfo, std::move(shortcuts));
|
probabilityEntry.isPossiblyOffensive(), probabilityEntry.getProbability(),
|
||||||
|
*historicalInfo, std::move(shortcuts));
|
||||||
return WordProperty(wordCodePoints.toVector(), &unigramProperty, &ngrams);
|
return WordProperty(wordCodePoints.toVector(), &unigramProperty, &ngrams);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -684,8 +684,8 @@ public class BinaryDictionaryDecayingTests extends AndroidTestCase {
|
||||||
|
|
||||||
binaryDictionary.addUnigramEntry("", DUMMY_PROBABILITY, "" /* shortcutTarget */,
|
binaryDictionary.addUnigramEntry("", DUMMY_PROBABILITY, "" /* shortcutTarget */,
|
||||||
BinaryDictionary.NOT_A_PROBABILITY /* shortcutProbability */,
|
BinaryDictionary.NOT_A_PROBABILITY /* shortcutProbability */,
|
||||||
true /* isBeginningOfSentence */, true /* isNotAWord */, false /* isBlacklisted */,
|
true /* isBeginningOfSentence */, true /* isNotAWord */,
|
||||||
mCurrentTime);
|
false /* isPossiblyOffensive */, mCurrentTime);
|
||||||
final NgramContext beginningOfSentenceContext = NgramContext.BEGINNING_OF_SENTENCE;
|
final NgramContext beginningOfSentenceContext = NgramContext.BEGINNING_OF_SENTENCE;
|
||||||
onInputWordWithBeginningOfSentenceContext(binaryDictionary, "aaa", true /* isValidWord */);
|
onInputWordWithBeginningOfSentenceContext(binaryDictionary, "aaa", true /* isValidWord */);
|
||||||
assertFalse(binaryDictionary.isValidNgram(beginningOfSentenceContext, "aaa"));
|
assertFalse(binaryDictionary.isValidNgram(beginningOfSentenceContext, "aaa"));
|
||||||
|
|
|
@ -200,7 +200,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
// Too long short cut.
|
// Too long short cut.
|
||||||
binaryDictionary.addUnigramEntry("a", probability, invalidLongWord,
|
binaryDictionary.addUnigramEntry("a", probability, invalidLongWord,
|
||||||
10 /* shortcutProbability */, false /* isBeginningOfSentence */,
|
10 /* shortcutProbability */, false /* isBeginningOfSentence */,
|
||||||
false /* isNotAWord */, false /* isBlacklisted */,
|
false /* isNotAWord */, false /* isPossiblyOffensive */,
|
||||||
BinaryDictionary.NOT_A_VALID_TIMESTAMP);
|
BinaryDictionary.NOT_A_VALID_TIMESTAMP);
|
||||||
addUnigramWord(binaryDictionary, "abc", probability);
|
addUnigramWord(binaryDictionary, "abc", probability);
|
||||||
final int updatedProbability = 200;
|
final int updatedProbability = 200;
|
||||||
|
@ -221,7 +221,8 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
binaryDictionary.addUnigramEntry(word, probability, "" /* shortcutTarget */,
|
binaryDictionary.addUnigramEntry(word, probability, "" /* shortcutTarget */,
|
||||||
BinaryDictionary.NOT_A_PROBABILITY /* shortcutProbability */,
|
BinaryDictionary.NOT_A_PROBABILITY /* shortcutProbability */,
|
||||||
false /* isBeginningOfSentence */, false /* isNotAWord */,
|
false /* isBeginningOfSentence */, false /* isNotAWord */,
|
||||||
false /* isBlacklisted */, BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
|
false /* isPossiblyOffensive */,
|
||||||
|
BinaryDictionary.NOT_A_VALID_TIMESTAMP /* timestamp */);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void addBigramWords(final BinaryDictionary binaryDictionary, final String word0,
|
private static void addBigramWords(final BinaryDictionary binaryDictionary, final String word0,
|
||||||
|
@ -971,11 +972,11 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
final String word = CodePointUtils.generateWord(random, codePointSet);
|
final String word = CodePointUtils.generateWord(random, codePointSet);
|
||||||
final int unigramProbability = random.nextInt(0xFF);
|
final int unigramProbability = random.nextInt(0xFF);
|
||||||
final boolean isNotAWord = random.nextBoolean();
|
final boolean isNotAWord = random.nextBoolean();
|
||||||
final boolean isBlacklisted = random.nextBoolean();
|
final boolean isPossiblyOffensive = random.nextBoolean();
|
||||||
// TODO: Add tests for historical info.
|
// TODO: Add tests for historical info.
|
||||||
binaryDictionary.addUnigramEntry(word, unigramProbability,
|
binaryDictionary.addUnigramEntry(word, unigramProbability,
|
||||||
null /* shortcutTarget */, BinaryDictionary.NOT_A_PROBABILITY,
|
null /* shortcutTarget */, BinaryDictionary.NOT_A_PROBABILITY,
|
||||||
false /* isBeginningOfSentence */, isNotAWord, isBlacklisted,
|
false /* isBeginningOfSentence */, isNotAWord, isPossiblyOffensive,
|
||||||
BinaryDictionary.NOT_A_VALID_TIMESTAMP);
|
BinaryDictionary.NOT_A_VALID_TIMESTAMP);
|
||||||
if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
|
if (binaryDictionary.needsToRunGC(false /* mindsBlockByGC */)) {
|
||||||
binaryDictionary.flushWithGC();
|
binaryDictionary.flushWithGC();
|
||||||
|
@ -987,7 +988,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
assertEquals(word, wordProperty.mWord);
|
assertEquals(word, wordProperty.mWord);
|
||||||
assertTrue(wordProperty.isValid());
|
assertTrue(wordProperty.isValid());
|
||||||
assertEquals(isNotAWord, wordProperty.mIsNotAWord);
|
assertEquals(isNotAWord, wordProperty.mIsNotAWord);
|
||||||
assertEquals(isBlacklisted, wordProperty.mIsBlacklistEntry);
|
assertEquals(isPossiblyOffensive, wordProperty.mIsPossiblyOffensive);
|
||||||
assertEquals(false, wordProperty.mHasNgrams);
|
assertEquals(false, wordProperty.mHasNgrams);
|
||||||
assertEquals(false, wordProperty.mHasShortcuts);
|
assertEquals(false, wordProperty.mHasShortcuts);
|
||||||
assertEquals(unigramProbability, wordProperty.mProbabilityInfo.mProbability);
|
assertEquals(unigramProbability, wordProperty.mProbabilityInfo.mProbability);
|
||||||
|
@ -1142,7 +1143,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
final int shortcutProbability = 10;
|
final int shortcutProbability = 10;
|
||||||
binaryDictionary.addUnigramEntry("aaa", unigramProbability, "zzz",
|
binaryDictionary.addUnigramEntry("aaa", unigramProbability, "zzz",
|
||||||
shortcutProbability, false /* isBeginningOfSentence */,
|
shortcutProbability, false /* isBeginningOfSentence */,
|
||||||
false /* isNotAWord */, false /* isBlacklisted */, 0 /* timestamp */);
|
false /* isNotAWord */, false /* isPossiblyOffensive */, 0 /* timestamp */);
|
||||||
WordProperty wordProperty = binaryDictionary.getWordProperty("aaa",
|
WordProperty wordProperty = binaryDictionary.getWordProperty("aaa",
|
||||||
false /* isBeginningOfSentence */);
|
false /* isBeginningOfSentence */);
|
||||||
assertEquals(1, wordProperty.mShortcutTargets.size());
|
assertEquals(1, wordProperty.mShortcutTargets.size());
|
||||||
|
@ -1151,7 +1152,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
final int updatedShortcutProbability = 2;
|
final int updatedShortcutProbability = 2;
|
||||||
binaryDictionary.addUnigramEntry("aaa", unigramProbability, "zzz",
|
binaryDictionary.addUnigramEntry("aaa", unigramProbability, "zzz",
|
||||||
updatedShortcutProbability, false /* isBeginningOfSentence */,
|
updatedShortcutProbability, false /* isBeginningOfSentence */,
|
||||||
false /* isNotAWord */, false /* isBlacklisted */, 0 /* timestamp */);
|
false /* isNotAWord */, false /* isPossiblyOffensive */, 0 /* timestamp */);
|
||||||
wordProperty = binaryDictionary.getWordProperty("aaa",
|
wordProperty = binaryDictionary.getWordProperty("aaa",
|
||||||
false /* isBeginningOfSentence */);
|
false /* isBeginningOfSentence */);
|
||||||
assertEquals(1, wordProperty.mShortcutTargets.size());
|
assertEquals(1, wordProperty.mShortcutTargets.size());
|
||||||
|
@ -1160,7 +1161,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
wordProperty.mShortcutTargets.get(0).getProbability());
|
wordProperty.mShortcutTargets.get(0).getProbability());
|
||||||
binaryDictionary.addUnigramEntry("aaa", unigramProbability, "yyy",
|
binaryDictionary.addUnigramEntry("aaa", unigramProbability, "yyy",
|
||||||
shortcutProbability, false /* isBeginningOfSentence */, false /* isNotAWord */,
|
shortcutProbability, false /* isBeginningOfSentence */, false /* isNotAWord */,
|
||||||
false /* isBlacklisted */, 0 /* timestamp */);
|
false /* isPossiblyOffensive */, 0 /* timestamp */);
|
||||||
final HashMap<String, Integer> shortcutTargets = new HashMap<>();
|
final HashMap<String, Integer> shortcutTargets = new HashMap<>();
|
||||||
shortcutTargets.put("zzz", updatedShortcutProbability);
|
shortcutTargets.put("zzz", updatedShortcutProbability);
|
||||||
shortcutTargets.put("yyy", shortcutProbability);
|
shortcutTargets.put("yyy", shortcutProbability);
|
||||||
|
@ -1223,7 +1224,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
final int unigramProbability = unigramProbabilities.get(word);
|
final int unigramProbability = unigramProbabilities.get(word);
|
||||||
binaryDictionary.addUnigramEntry(word, unigramProbability, shortcutTarget,
|
binaryDictionary.addUnigramEntry(word, unigramProbability, shortcutTarget,
|
||||||
shortcutProbability, false /* isBeginningOfSentence */, false /* isNotAWord */,
|
shortcutProbability, false /* isBeginningOfSentence */, false /* isNotAWord */,
|
||||||
false /* isBlacklisted */, 0 /* timestamp */);
|
false /* isPossiblyOffensive */, 0 /* timestamp */);
|
||||||
if (shortcutTargets.containsKey(word)) {
|
if (shortcutTargets.containsKey(word)) {
|
||||||
final HashMap<String, Integer> shortcutTargetsOfWord = shortcutTargets.get(word);
|
final HashMap<String, Integer> shortcutTargetsOfWord = shortcutTargets.get(word);
|
||||||
shortcutTargetsOfWord.put(shortcutTarget, shortcutProbability);
|
shortcutTargetsOfWord.put(shortcutTarget, shortcutProbability);
|
||||||
|
@ -1255,6 +1256,15 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testPossiblyOffensiveAttributeMaintained() {
|
||||||
|
final BinaryDictionary binaryDictionary =
|
||||||
|
getEmptyBinaryDictionary(FormatSpec.VERSION4_DEV);
|
||||||
|
binaryDictionary.addUnigramEntry("ddd", 100, null, Dictionary.NOT_A_PROBABILITY,
|
||||||
|
false, true, true, 0);
|
||||||
|
WordProperty wordProperty = binaryDictionary.getWordProperty("ddd", false);
|
||||||
|
assertEquals(true, wordProperty.mIsPossiblyOffensive);
|
||||||
|
}
|
||||||
|
|
||||||
public void testDictMigration() {
|
public void testDictMigration() {
|
||||||
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
|
for (final int formatVersion : DICT_FORMAT_VERSIONS) {
|
||||||
testDictMigration(FormatSpec.VERSION4_ONLY_FOR_TESTING, formatVersion);
|
testDictMigration(FormatSpec.VERSION4_ONLY_FOR_TESTING, formatVersion);
|
||||||
|
@ -1271,10 +1281,10 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
final int shortcutProbability = 10;
|
final int shortcutProbability = 10;
|
||||||
binaryDictionary.addUnigramEntry("ccc", unigramProbability, "xxx", shortcutProbability,
|
binaryDictionary.addUnigramEntry("ccc", unigramProbability, "xxx", shortcutProbability,
|
||||||
false /* isBeginningOfSentence */, false /* isNotAWord */,
|
false /* isBeginningOfSentence */, false /* isNotAWord */,
|
||||||
false /* isBlacklisted */, 0 /* timestamp */);
|
false /* isPossiblyOffensive */, 0 /* timestamp */);
|
||||||
binaryDictionary.addUnigramEntry("ddd", unigramProbability, null /* shortcutTarget */,
|
binaryDictionary.addUnigramEntry("ddd", unigramProbability, null /* shortcutTarget */,
|
||||||
Dictionary.NOT_A_PROBABILITY, false /* isBeginningOfSentence */,
|
Dictionary.NOT_A_PROBABILITY, false /* isBeginningOfSentence */,
|
||||||
true /* isNotAWord */, true /* isBlacklisted */, 0 /* timestamp */);
|
true /* isNotAWord */, true /* isPossiblyOffensive */, 0 /* timestamp */);
|
||||||
binaryDictionary.addNgramEntry(NgramContext.BEGINNING_OF_SENTENCE,
|
binaryDictionary.addNgramEntry(NgramContext.BEGINNING_OF_SENTENCE,
|
||||||
"aaa", bigramProbability, 0 /* timestamp */);
|
"aaa", bigramProbability, 0 /* timestamp */);
|
||||||
assertEquals(unigramProbability, binaryDictionary.getFrequency("aaa"));
|
assertEquals(unigramProbability, binaryDictionary.getFrequency("aaa"));
|
||||||
|
@ -1298,7 +1308,7 @@ public class BinaryDictionaryTests extends AndroidTestCase {
|
||||||
assertEquals("xxx", wordProperty.mShortcutTargets.get(0).mWord);
|
assertEquals("xxx", wordProperty.mShortcutTargets.get(0).mWord);
|
||||||
wordProperty = binaryDictionary.getWordProperty("ddd",
|
wordProperty = binaryDictionary.getWordProperty("ddd",
|
||||||
false /* isBeginningOfSentence */);
|
false /* isBeginningOfSentence */);
|
||||||
assertTrue(wordProperty.mIsBlacklistEntry);
|
assertTrue(wordProperty.mIsPossiblyOffensive);
|
||||||
assertTrue(wordProperty.mIsNotAWord);
|
assertTrue(wordProperty.mIsNotAWord);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -35,16 +35,20 @@ public class FusionDictionaryTests extends AndroidTestCase {
|
||||||
FusionDictionary dict = new FusionDictionary(new PtNodeArray(),
|
FusionDictionary dict = new FusionDictionary(new PtNodeArray(),
|
||||||
new DictionaryOptions(new HashMap<String,String>()));
|
new DictionaryOptions(new HashMap<String,String>()));
|
||||||
|
|
||||||
dict.add("abc", new ProbabilityInfo(10), null, false /* isNotAWord */);
|
dict.add("abc", new ProbabilityInfo(10), null, false /* isNotAWord */,
|
||||||
|
false /* isPossiblyOffensive */);
|
||||||
assertNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "aaa"));
|
assertNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "aaa"));
|
||||||
assertNotNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "abc"));
|
assertNotNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "abc"));
|
||||||
|
|
||||||
dict.add("aa", new ProbabilityInfo(10), null, false /* isNotAWord */);
|
dict.add("aa", new ProbabilityInfo(10), null, false /* isNotAWord */,
|
||||||
|
false /* isPossiblyOffensive */);
|
||||||
assertNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "aaa"));
|
assertNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "aaa"));
|
||||||
assertNotNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "aa"));
|
assertNotNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "aa"));
|
||||||
|
|
||||||
dict.add("babcd", new ProbabilityInfo(10), null, false /* isNotAWord */);
|
dict.add("babcd", new ProbabilityInfo(10), null, false /* isNotAWord */,
|
||||||
dict.add("bacde", new ProbabilityInfo(10), null, false /* isNotAWord */);
|
false /* isPossiblyOffensive */);
|
||||||
|
dict.add("bacde", new ProbabilityInfo(10), null, false /* isNotAWord */,
|
||||||
|
false /* isPossiblyOffensive */);
|
||||||
assertNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "ba"));
|
assertNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "ba"));
|
||||||
assertNotNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "babcd"));
|
assertNotNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "babcd"));
|
||||||
assertNotNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "bacde"));
|
assertNotNull(FusionDictionary.findWordInTree(dict.mRootNodeArray, "bacde"));
|
||||||
|
|
|
@ -149,7 +149,8 @@ public class BinaryDictDecoderEncoderTests extends AndroidTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
dict.add(word, new ProbabilityInfo(UNIGRAM_FREQ),
|
dict.add(word, new ProbabilityInfo(UNIGRAM_FREQ),
|
||||||
(shortcutMap == null) ? null : shortcuts, false /* isNotAWord */);
|
(shortcutMap == null) ? null : shortcuts, false /* isNotAWord */,
|
||||||
|
false /* isPossiblyOffensive */);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -572,12 +572,12 @@ public class BinaryDictEncoderUtils {
|
||||||
* @param hasShortcuts whether the PtNode has shortcuts.
|
* @param hasShortcuts whether the PtNode has shortcuts.
|
||||||
* @param hasBigrams whether the PtNode has bigrams.
|
* @param hasBigrams whether the PtNode has bigrams.
|
||||||
* @param isNotAWord whether the PtNode is not a word.
|
* @param isNotAWord whether the PtNode is not a word.
|
||||||
* @param isBlackListEntry whether the PtNode is a blacklist entry.
|
* @param isPossiblyOffensive whether the PtNode is a possibly offensive entry.
|
||||||
* @return the flags
|
* @return the flags
|
||||||
*/
|
*/
|
||||||
static int makePtNodeFlags(final boolean hasMultipleChars, final boolean isTerminal,
|
static int makePtNodeFlags(final boolean hasMultipleChars, final boolean isTerminal,
|
||||||
final int childrenAddressSize, final boolean hasShortcuts, final boolean hasBigrams,
|
final int childrenAddressSize, final boolean hasShortcuts, final boolean hasBigrams,
|
||||||
final boolean isNotAWord, final boolean isBlackListEntry) {
|
final boolean isNotAWord, final boolean isPossiblyOffensive) {
|
||||||
byte flags = 0;
|
byte flags = 0;
|
||||||
if (hasMultipleChars) flags |= FormatSpec.FLAG_HAS_MULTIPLE_CHARS;
|
if (hasMultipleChars) flags |= FormatSpec.FLAG_HAS_MULTIPLE_CHARS;
|
||||||
if (isTerminal) flags |= FormatSpec.FLAG_IS_TERMINAL;
|
if (isTerminal) flags |= FormatSpec.FLAG_IS_TERMINAL;
|
||||||
|
@ -600,7 +600,7 @@ public class BinaryDictEncoderUtils {
|
||||||
if (hasShortcuts) flags |= FormatSpec.FLAG_HAS_SHORTCUT_TARGETS;
|
if (hasShortcuts) flags |= FormatSpec.FLAG_HAS_SHORTCUT_TARGETS;
|
||||||
if (hasBigrams) flags |= FormatSpec.FLAG_HAS_BIGRAMS;
|
if (hasBigrams) flags |= FormatSpec.FLAG_HAS_BIGRAMS;
|
||||||
if (isNotAWord) flags |= FormatSpec.FLAG_IS_NOT_A_WORD;
|
if (isNotAWord) flags |= FormatSpec.FLAG_IS_NOT_A_WORD;
|
||||||
if (isBlackListEntry) flags |= FormatSpec.FLAG_IS_BLACKLISTED;
|
if (isPossiblyOffensive) flags |= FormatSpec.FLAG_IS_POSSIBLY_OFFENSIVE;
|
||||||
return flags;
|
return flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -609,7 +609,7 @@ public class BinaryDictEncoderUtils {
|
||||||
getByteSize(childrenOffset),
|
getByteSize(childrenOffset),
|
||||||
node.mShortcutTargets != null && !node.mShortcutTargets.isEmpty(),
|
node.mShortcutTargets != null && !node.mShortcutTargets.isEmpty(),
|
||||||
node.mBigrams != null && !node.mBigrams.isEmpty(),
|
node.mBigrams != null && !node.mBigrams.isEmpty(),
|
||||||
node.mIsNotAWord, node.mIsBlacklistEntry);
|
node.mIsNotAWord, node.mIsPossiblyOffensive);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -89,7 +89,7 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
int mTerminalId; // NOT_A_TERMINAL == mTerminalId indicates this is not a terminal.
|
int mTerminalId; // NOT_A_TERMINAL == mTerminalId indicates this is not a terminal.
|
||||||
PtNodeArray mChildren;
|
PtNodeArray mChildren;
|
||||||
boolean mIsNotAWord; // Only a shortcut
|
boolean mIsNotAWord; // Only a shortcut
|
||||||
boolean mIsBlacklistEntry;
|
boolean mIsPossiblyOffensive;
|
||||||
// mCachedSize and mCachedAddressBefore/AfterUpdate are helpers for binary dictionary
|
// mCachedSize and mCachedAddressBefore/AfterUpdate are helpers for binary dictionary
|
||||||
// generation. Before and After always hold the same value except during dictionary
|
// generation. Before and After always hold the same value except during dictionary
|
||||||
// address compression, where the update process needs to know about both values at the
|
// address compression, where the update process needs to know about both values at the
|
||||||
|
@ -102,7 +102,7 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
|
|
||||||
public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
|
public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
|
||||||
final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo,
|
final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo,
|
||||||
final boolean isNotAWord, final boolean isBlacklistEntry) {
|
final boolean isNotAWord, final boolean isPossiblyOffensive) {
|
||||||
mChars = chars;
|
mChars = chars;
|
||||||
mProbabilityInfo = probabilityInfo;
|
mProbabilityInfo = probabilityInfo;
|
||||||
mTerminalId = probabilityInfo == null ? NOT_A_TERMINAL : probabilityInfo.mProbability;
|
mTerminalId = probabilityInfo == null ? NOT_A_TERMINAL : probabilityInfo.mProbability;
|
||||||
|
@ -110,12 +110,12 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
mBigrams = bigrams;
|
mBigrams = bigrams;
|
||||||
mChildren = null;
|
mChildren = null;
|
||||||
mIsNotAWord = isNotAWord;
|
mIsNotAWord = isNotAWord;
|
||||||
mIsBlacklistEntry = isBlacklistEntry;
|
mIsPossiblyOffensive = isPossiblyOffensive;
|
||||||
}
|
}
|
||||||
|
|
||||||
public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
|
public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
|
||||||
final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo,
|
final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo,
|
||||||
final boolean isNotAWord, final boolean isBlacklistEntry,
|
final boolean isNotAWord, final boolean isPossiblyOffensive,
|
||||||
final PtNodeArray children) {
|
final PtNodeArray children) {
|
||||||
mChars = chars;
|
mChars = chars;
|
||||||
mProbabilityInfo = probabilityInfo;
|
mProbabilityInfo = probabilityInfo;
|
||||||
|
@ -123,7 +123,7 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
mBigrams = bigrams;
|
mBigrams = bigrams;
|
||||||
mChildren = children;
|
mChildren = children;
|
||||||
mIsNotAWord = isNotAWord;
|
mIsNotAWord = isNotAWord;
|
||||||
mIsBlacklistEntry = isBlacklistEntry;
|
mIsPossiblyOffensive = isPossiblyOffensive;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void addChild(PtNode n) {
|
public void addChild(PtNode n) {
|
||||||
|
@ -153,8 +153,8 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
return mIsNotAWord;
|
return mIsNotAWord;
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean getIsBlacklistEntry() {
|
public boolean getIsPossiblyOffensive() {
|
||||||
return mIsBlacklistEntry;
|
return mIsPossiblyOffensive;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ArrayList<WeightedString> getShortcutTargets() {
|
public ArrayList<WeightedString> getShortcutTargets() {
|
||||||
|
@ -238,7 +238,7 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
private void update(final ProbabilityInfo probabilityInfo,
|
private void update(final ProbabilityInfo probabilityInfo,
|
||||||
final ArrayList<WeightedString> shortcutTargets,
|
final ArrayList<WeightedString> shortcutTargets,
|
||||||
final ArrayList<WeightedString> bigrams,
|
final ArrayList<WeightedString> bigrams,
|
||||||
final boolean isNotAWord, final boolean isBlacklistEntry) {
|
final boolean isNotAWord, final boolean isPossiblyOffensive) {
|
||||||
mProbabilityInfo = ProbabilityInfo.max(mProbabilityInfo, probabilityInfo);
|
mProbabilityInfo = ProbabilityInfo.max(mProbabilityInfo, probabilityInfo);
|
||||||
if (shortcutTargets != null) {
|
if (shortcutTargets != null) {
|
||||||
if (mShortcutTargets == null) {
|
if (mShortcutTargets == null) {
|
||||||
|
@ -275,7 +275,7 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
mIsNotAWord = isNotAWord;
|
mIsNotAWord = isNotAWord;
|
||||||
mIsBlacklistEntry = isBlacklistEntry;
|
mIsPossiblyOffensive = isPossiblyOffensive;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -323,24 +323,12 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
* @param probabilityInfo probability information of the word.
|
* @param probabilityInfo probability information of the word.
|
||||||
* @param shortcutTargets a list of shortcut targets for this word, or null.
|
* @param shortcutTargets a list of shortcut targets for this word, or null.
|
||||||
* @param isNotAWord true if this should not be considered a word (e.g. shortcut only)
|
* @param isNotAWord true if this should not be considered a word (e.g. shortcut only)
|
||||||
|
* @param isPossiblyOffensive true if this word is possibly offensive
|
||||||
*/
|
*/
|
||||||
public void add(final String word, final ProbabilityInfo probabilityInfo,
|
public void add(final String word, final ProbabilityInfo probabilityInfo,
|
||||||
final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) {
|
final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord,
|
||||||
add(getCodePoints(word), probabilityInfo, shortcutTargets, isNotAWord,
|
final boolean isPossiblyOffensive) {
|
||||||
false /* isBlacklistEntry */);
|
add(getCodePoints(word), probabilityInfo, shortcutTargets, isNotAWord, isPossiblyOffensive);
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Helper method to add a blacklist entry as a string.
|
|
||||||
*
|
|
||||||
* @param word the word to add as a blacklist entry.
|
|
||||||
* @param shortcutTargets a list of shortcut targets for this word, or null.
|
|
||||||
* @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so)
|
|
||||||
*/
|
|
||||||
public void addBlacklistEntry(final String word,
|
|
||||||
final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) {
|
|
||||||
add(getCodePoints(word), new ProbabilityInfo(0), shortcutTargets, isNotAWord,
|
|
||||||
true /* isBlacklistEntry */);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -375,7 +363,7 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
final PtNode ptNode1 = findWordInTree(mRootNodeArray, word1);
|
final PtNode ptNode1 = findWordInTree(mRootNodeArray, word1);
|
||||||
if (ptNode1 == null) {
|
if (ptNode1 == null) {
|
||||||
add(getCodePoints(word1), new ProbabilityInfo(0), null, false /* isNotAWord */,
|
add(getCodePoints(word1), new ProbabilityInfo(0), null, false /* isNotAWord */,
|
||||||
false /* isBlacklistEntry */);
|
false /* isPossiblyOffensive */);
|
||||||
// The PtNode for the first word may have moved by the above insertion,
|
// The PtNode for the first word may have moved by the above insertion,
|
||||||
// if word1 and word2 share a common stem that happens not to have been
|
// if word1 and word2 share a common stem that happens not to have been
|
||||||
// a cutting point until now. In this case, we need to refresh ptNode.
|
// a cutting point until now. In this case, we need to refresh ptNode.
|
||||||
|
@ -397,11 +385,11 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
* @param probabilityInfo the probability information of the word.
|
* @param probabilityInfo the probability information of the word.
|
||||||
* @param shortcutTargets an optional list of shortcut targets for this word (null if none).
|
* @param shortcutTargets an optional list of shortcut targets for this word (null if none).
|
||||||
* @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so)
|
* @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so)
|
||||||
* @param isBlacklistEntry true if this is a blacklisted word, false otherwise
|
* @param isPossiblyOffensive true if this word is possibly offensive
|
||||||
*/
|
*/
|
||||||
private void add(final int[] word, final ProbabilityInfo probabilityInfo,
|
private void add(final int[] word, final ProbabilityInfo probabilityInfo,
|
||||||
final ArrayList<WeightedString> shortcutTargets,
|
final ArrayList<WeightedString> shortcutTargets,
|
||||||
final boolean isNotAWord, final boolean isBlacklistEntry) {
|
final boolean isNotAWord, final boolean isPossiblyOffensive) {
|
||||||
assert(probabilityInfo.mProbability <= FormatSpec.MAX_TERMINAL_FREQUENCY);
|
assert(probabilityInfo.mProbability <= FormatSpec.MAX_TERMINAL_FREQUENCY);
|
||||||
if (word.length >= Constants.DICTIONARY_MAX_WORD_LENGTH) {
|
if (word.length >= Constants.DICTIONARY_MAX_WORD_LENGTH) {
|
||||||
MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length);
|
MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length);
|
||||||
|
@ -431,7 +419,7 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
final int insertionIndex = findInsertionIndex(currentNodeArray, word[charIndex]);
|
final int insertionIndex = findInsertionIndex(currentNodeArray, word[charIndex]);
|
||||||
final PtNode newPtNode = new PtNode(Arrays.copyOfRange(word, charIndex, word.length),
|
final PtNode newPtNode = new PtNode(Arrays.copyOfRange(word, charIndex, word.length),
|
||||||
shortcutTargets, null /* bigrams */, probabilityInfo, isNotAWord,
|
shortcutTargets, null /* bigrams */, probabilityInfo, isNotAWord,
|
||||||
isBlacklistEntry);
|
isPossiblyOffensive);
|
||||||
currentNodeArray.mData.add(insertionIndex, newPtNode);
|
currentNodeArray.mData.add(insertionIndex, newPtNode);
|
||||||
if (DBG) checkStack(currentNodeArray);
|
if (DBG) checkStack(currentNodeArray);
|
||||||
} else {
|
} else {
|
||||||
|
@ -442,14 +430,14 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
// should end already exists as is. Since the old PtNode was not a terminal,
|
// should end already exists as is. Since the old PtNode was not a terminal,
|
||||||
// make it one by filling in its frequency and other attributes
|
// make it one by filling in its frequency and other attributes
|
||||||
currentPtNode.update(probabilityInfo, shortcutTargets, null, isNotAWord,
|
currentPtNode.update(probabilityInfo, shortcutTargets, null, isNotAWord,
|
||||||
isBlacklistEntry);
|
isPossiblyOffensive);
|
||||||
} else {
|
} else {
|
||||||
// The new word matches the full old word and extends past it.
|
// The new word matches the full old word and extends past it.
|
||||||
// We only have to create a new node and add it to the end of this.
|
// We only have to create a new node and add it to the end of this.
|
||||||
final PtNode newNode = new PtNode(
|
final PtNode newNode = new PtNode(
|
||||||
Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
|
Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
|
||||||
shortcutTargets, null /* bigrams */, probabilityInfo,
|
shortcutTargets, null /* bigrams */, probabilityInfo,
|
||||||
isNotAWord, isBlacklistEntry);
|
isNotAWord, isPossiblyOffensive);
|
||||||
currentPtNode.mChildren = new PtNodeArray();
|
currentPtNode.mChildren = new PtNodeArray();
|
||||||
currentPtNode.mChildren.mData.add(newNode);
|
currentPtNode.mChildren.mData.add(newNode);
|
||||||
}
|
}
|
||||||
|
@ -459,7 +447,7 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
// new shortcuts to the existing shortcut list if it already exists.
|
// new shortcuts to the existing shortcut list if it already exists.
|
||||||
currentPtNode.update(probabilityInfo, shortcutTargets, null,
|
currentPtNode.update(probabilityInfo, shortcutTargets, null,
|
||||||
currentPtNode.mIsNotAWord && isNotAWord,
|
currentPtNode.mIsNotAWord && isNotAWord,
|
||||||
currentPtNode.mIsBlacklistEntry || isBlacklistEntry);
|
currentPtNode.mIsPossiblyOffensive || isPossiblyOffensive);
|
||||||
} else {
|
} else {
|
||||||
// Partial prefix match only. We have to replace the current node with a node
|
// Partial prefix match only. We have to replace the current node with a node
|
||||||
// containing the current prefix and create two new ones for the tails.
|
// containing the current prefix and create two new ones for the tails.
|
||||||
|
@ -468,7 +456,7 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
Arrays.copyOfRange(currentPtNode.mChars, differentCharIndex,
|
Arrays.copyOfRange(currentPtNode.mChars, differentCharIndex,
|
||||||
currentPtNode.mChars.length), currentPtNode.mShortcutTargets,
|
currentPtNode.mChars.length), currentPtNode.mShortcutTargets,
|
||||||
currentPtNode.mBigrams, currentPtNode.mProbabilityInfo,
|
currentPtNode.mBigrams, currentPtNode.mProbabilityInfo,
|
||||||
currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry,
|
currentPtNode.mIsNotAWord, currentPtNode.mIsPossiblyOffensive,
|
||||||
currentPtNode.mChildren);
|
currentPtNode.mChildren);
|
||||||
newChildren.mData.add(newOldWord);
|
newChildren.mData.add(newOldWord);
|
||||||
|
|
||||||
|
@ -477,17 +465,17 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
newParent = new PtNode(
|
newParent = new PtNode(
|
||||||
Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex),
|
Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex),
|
||||||
shortcutTargets, null /* bigrams */, probabilityInfo,
|
shortcutTargets, null /* bigrams */, probabilityInfo,
|
||||||
isNotAWord, isBlacklistEntry, newChildren);
|
isNotAWord, isPossiblyOffensive, newChildren);
|
||||||
} else {
|
} else {
|
||||||
newParent = new PtNode(
|
newParent = new PtNode(
|
||||||
Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex),
|
Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex),
|
||||||
null /* shortcutTargets */, null /* bigrams */,
|
null /* shortcutTargets */, null /* bigrams */,
|
||||||
null /* probabilityInfo */, false /* isNotAWord */,
|
null /* probabilityInfo */, false /* isNotAWord */,
|
||||||
false /* isBlacklistEntry */, newChildren);
|
false /* isPossiblyOffensive */, newChildren);
|
||||||
final PtNode newWord = new PtNode(Arrays.copyOfRange(word,
|
final PtNode newWord = new PtNode(Arrays.copyOfRange(word,
|
||||||
charIndex + differentCharIndex, word.length),
|
charIndex + differentCharIndex, word.length),
|
||||||
shortcutTargets, null /* bigrams */, probabilityInfo,
|
shortcutTargets, null /* bigrams */, probabilityInfo,
|
||||||
isNotAWord, isBlacklistEntry);
|
isNotAWord, isPossiblyOffensive);
|
||||||
final int addIndex = word[charIndex + differentCharIndex]
|
final int addIndex = word[charIndex + differentCharIndex]
|
||||||
> currentPtNode.mChars[differentCharIndex] ? 1 : 0;
|
> currentPtNode.mChars[differentCharIndex] ? 1 : 0;
|
||||||
newChildren.mData.add(addIndex, newWord);
|
newChildren.mData.add(addIndex, newWord);
|
||||||
|
@ -549,7 +537,7 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
final ArrayList<PtNode> data = nodeArray.mData;
|
final ArrayList<PtNode> data = nodeArray.mData;
|
||||||
final PtNode reference = new PtNode(new int[] { character },
|
final PtNode reference = new PtNode(new int[] { character },
|
||||||
null /* shortcutTargets */, null /* bigrams */, null /* probabilityInfo */,
|
null /* shortcutTargets */, null /* bigrams */, null /* probabilityInfo */,
|
||||||
false /* isNotAWord */, false /* isBlacklistEntry */);
|
false /* isNotAWord */, false /* isPossiblyOffensive */);
|
||||||
int result = Collections.binarySearch(data, reference, PTNODE_COMPARATOR);
|
int result = Collections.binarySearch(data, reference, PTNODE_COMPARATOR);
|
||||||
return result >= 0 ? result : -result - 1;
|
return result >= 0 ? result : -result - 1;
|
||||||
}
|
}
|
||||||
|
@ -686,7 +674,7 @@ public final class FusionDictionary implements Iterable<WordProperty> {
|
||||||
return new WordProperty(mCurrentString.toString(),
|
return new WordProperty(mCurrentString.toString(),
|
||||||
currentPtNode.mProbabilityInfo,
|
currentPtNode.mProbabilityInfo,
|
||||||
currentPtNode.mShortcutTargets, currentPtNode.mBigrams,
|
currentPtNode.mShortcutTargets, currentPtNode.mBigrams,
|
||||||
currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry);
|
currentPtNode.mIsNotAWord, currentPtNode.mIsPossiblyOffensive);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
mPositions.removeLast();
|
mPositions.removeLast();
|
||||||
|
|
|
@ -283,13 +283,9 @@ public class Ver2DictDecoder extends AbstractDictDecoder {
|
||||||
|
|
||||||
// Insert unigrams into the fusion dictionary.
|
// Insert unigrams into the fusion dictionary.
|
||||||
for (final WordProperty wordProperty : wordProperties) {
|
for (final WordProperty wordProperty : wordProperties) {
|
||||||
if (wordProperty.mIsBlacklistEntry) {
|
|
||||||
fusionDict.addBlacklistEntry(wordProperty.mWord, wordProperty.mShortcutTargets,
|
|
||||||
wordProperty.mIsNotAWord);
|
|
||||||
} else {
|
|
||||||
fusionDict.add(wordProperty.mWord, wordProperty.mProbabilityInfo,
|
fusionDict.add(wordProperty.mWord, wordProperty.mProbabilityInfo,
|
||||||
wordProperty.mShortcutTargets, wordProperty.mIsNotAWord);
|
wordProperty.mShortcutTargets, wordProperty.mIsNotAWord,
|
||||||
}
|
wordProperty.mIsPossiblyOffensive);
|
||||||
}
|
}
|
||||||
// Insert bigrams into the fusion dictionary.
|
// Insert bigrams into the fusion dictionary.
|
||||||
for (final WordProperty wordProperty : wordProperties) {
|
for (final WordProperty wordProperty : wordProperties) {
|
||||||
|
|
|
@ -85,7 +85,8 @@ public class Ver2DictEncoderTests extends AndroidTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
dict.add(word, new ProbabilityInfo(UNIGRAM_FREQ),
|
dict.add(word, new ProbabilityInfo(UNIGRAM_FREQ),
|
||||||
(shortcutMap == null) ? null : shortcuts, false /* isNotAWord */);
|
(shortcutMap == null) ? null : shortcuts, false /* isNotAWord */,
|
||||||
|
false /* isPossiblyOffensive */);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -88,13 +88,9 @@ public class Ver4DictDecoder extends AbstractDictDecoder {
|
||||||
|
|
||||||
// Insert unigrams into the fusion dictionary.
|
// Insert unigrams into the fusion dictionary.
|
||||||
for (final WordProperty wordProperty : wordProperties) {
|
for (final WordProperty wordProperty : wordProperties) {
|
||||||
if (wordProperty.mIsBlacklistEntry) {
|
|
||||||
fusionDict.addBlacklistEntry(wordProperty.mWord, wordProperty.mShortcutTargets,
|
|
||||||
wordProperty.mIsNotAWord);
|
|
||||||
} else {
|
|
||||||
fusionDict.add(wordProperty.mWord, wordProperty.mProbabilityInfo,
|
fusionDict.add(wordProperty.mWord, wordProperty.mProbabilityInfo,
|
||||||
wordProperty.mShortcutTargets, wordProperty.mIsNotAWord);
|
wordProperty.mShortcutTargets, wordProperty.mIsNotAWord,
|
||||||
}
|
wordProperty.mIsPossiblyOffensive);
|
||||||
}
|
}
|
||||||
// Insert bigrams into the fusion dictionary.
|
// Insert bigrams into the fusion dictionary.
|
||||||
// TODO: Support ngrams.
|
// TODO: Support ngrams.
|
||||||
|
|
|
@ -79,7 +79,7 @@ public class Ver4DictEncoder implements DictEncoder {
|
||||||
if (!binaryDict.addUnigramEntry(wordProperty.mWord, wordProperty.getProbability(),
|
if (!binaryDict.addUnigramEntry(wordProperty.mWord, wordProperty.getProbability(),
|
||||||
null /* shortcutTarget */, 0 /* shortcutProbability */,
|
null /* shortcutTarget */, 0 /* shortcutProbability */,
|
||||||
wordProperty.mIsBeginningOfSentence, wordProperty.mIsNotAWord,
|
wordProperty.mIsBeginningOfSentence, wordProperty.mIsNotAWord,
|
||||||
wordProperty.mIsBlacklistEntry, 0 /* timestamp */)) {
|
wordProperty.mIsPossiblyOffensive, 0 /* timestamp */)) {
|
||||||
MakedictLog.e("Cannot add unigram entry for " + wordProperty.mWord);
|
MakedictLog.e("Cannot add unigram entry for " + wordProperty.mWord);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -88,7 +88,7 @@ public class Ver4DictEncoder implements DictEncoder {
|
||||||
wordProperty.getProbability(),
|
wordProperty.getProbability(),
|
||||||
shortcutTarget.mWord, shortcutTarget.getProbability(),
|
shortcutTarget.mWord, shortcutTarget.getProbability(),
|
||||||
wordProperty.mIsBeginningOfSentence, wordProperty.mIsNotAWord,
|
wordProperty.mIsBeginningOfSentence, wordProperty.mIsNotAWord,
|
||||||
wordProperty.mIsBlacklistEntry, 0 /* timestamp */)) {
|
wordProperty.mIsPossiblyOffensive, 0 /* timestamp */)) {
|
||||||
MakedictLog.e("Cannot add unigram entry for " + wordProperty.mWord
|
MakedictLog.e("Cannot add unigram entry for " + wordProperty.mWord
|
||||||
+ ", shortcutTarget: " + shortcutTarget.mWord);
|
+ ", shortcutTarget: " + shortcutTarget.mWord);
|
||||||
return;
|
return;
|
||||||
|
|
|
@ -106,7 +106,7 @@ public class CombinedInputOutput {
|
||||||
if (args[0].matches(CombinedFormatUtils.WORD_TAG + "=.*")) {
|
if (args[0].matches(CombinedFormatUtils.WORD_TAG + "=.*")) {
|
||||||
if (null != word) {
|
if (null != word) {
|
||||||
dict.add(word, probabilityInfo, shortcuts.isEmpty() ? null : shortcuts,
|
dict.add(word, probabilityInfo, shortcuts.isEmpty() ? null : shortcuts,
|
||||||
isNotAWord);
|
isNotAWord, false /* isPossiblyOffensive */);
|
||||||
for (WeightedString s : bigrams) {
|
for (WeightedString s : bigrams) {
|
||||||
dict.setBigram(word, s.mWord, s.mProbabilityInfo);
|
dict.setBigram(word, s.mWord, s.mProbabilityInfo);
|
||||||
}
|
}
|
||||||
|
@ -189,7 +189,8 @@ public class CombinedInputOutput {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (null != word) {
|
if (null != word) {
|
||||||
dict.add(word, probabilityInfo, shortcuts.isEmpty() ? null : shortcuts, isNotAWord);
|
dict.add(word, probabilityInfo, shortcuts.isEmpty() ? null : shortcuts, isNotAWord,
|
||||||
|
false /* isPossiblyOffensive */);
|
||||||
for (WeightedString s : bigrams) {
|
for (WeightedString s : bigrams) {
|
||||||
dict.setBigram(word, s.mWord, s.mProbabilityInfo);
|
dict.setBigram(word, s.mWord, s.mProbabilityInfo);
|
||||||
}
|
}
|
||||||
|
|
|
@ -128,10 +128,10 @@ public class Diff extends Dicttool.Command {
|
||||||
+ word0Property.mIsNotAWord + " -> " + word1PtNode.getIsNotAWord());
|
+ word0Property.mIsNotAWord + " -> " + word1PtNode.getIsNotAWord());
|
||||||
hasDifferences = true;
|
hasDifferences = true;
|
||||||
}
|
}
|
||||||
if (word0Property.mIsBlacklistEntry != word1PtNode.getIsBlacklistEntry()) {
|
if (word0Property.mIsPossiblyOffensive != word1PtNode.getIsPossiblyOffensive()) {
|
||||||
System.out.println("Blacklist: " + word0Property.mWord + " "
|
System.out.println("Possibly-offensive: " + word0Property.mWord + " "
|
||||||
+ word0Property.mIsBlacklistEntry + " -> "
|
+ word0Property.mIsPossiblyOffensive + " -> "
|
||||||
+ word1PtNode.getIsBlacklistEntry());
|
+ word1PtNode.getIsPossiblyOffensive());
|
||||||
hasDifferences = true;
|
hasDifferences = true;
|
||||||
}
|
}
|
||||||
hasDifferences |= hasAttributesDifferencesAndPrintThemIfAny(word0Property.mWord,
|
hasDifferences |= hasAttributesDifferencesAndPrintThemIfAny(word0Property.mWord,
|
||||||
|
|
|
@ -76,8 +76,8 @@ public class Info extends Dicttool.Command {
|
||||||
if (ptNode.getIsNotAWord()) {
|
if (ptNode.getIsNotAWord()) {
|
||||||
System.out.println(" Is not a word");
|
System.out.println(" Is not a word");
|
||||||
}
|
}
|
||||||
if (ptNode.getIsBlacklistEntry()) {
|
if (ptNode.getIsPossiblyOffensive()) {
|
||||||
System.out.println(" Is a blacklist entry");
|
System.out.println(" Is possibly offensive");
|
||||||
}
|
}
|
||||||
final ArrayList<WeightedString> shortcutTargets = ptNode.getShortcutTargets();
|
final ArrayList<WeightedString> shortcutTargets = ptNode.getShortcutTargets();
|
||||||
if (null == shortcutTargets || shortcutTargets.isEmpty()) {
|
if (null == shortcutTargets || shortcutTargets.isEmpty()) {
|
||||||
|
|
|
@ -90,7 +90,8 @@ public class XmlDictInputOutput {
|
||||||
for (final String shortcutOnly : mShortcutsMap.keySet()) {
|
for (final String shortcutOnly : mShortcutsMap.keySet()) {
|
||||||
if (dict.hasWord(shortcutOnly)) continue;
|
if (dict.hasWord(shortcutOnly)) continue;
|
||||||
dict.add(shortcutOnly, new ProbabilityInfo(SHORTCUT_ONLY_WORD_PROBABILITY),
|
dict.add(shortcutOnly, new ProbabilityInfo(SHORTCUT_ONLY_WORD_PROBABILITY),
|
||||||
mShortcutsMap.get(shortcutOnly), true /* isNotAWord */);
|
mShortcutsMap.get(shortcutOnly), true /* isNotAWord */,
|
||||||
|
false /* isPossiblyOffensive */);
|
||||||
}
|
}
|
||||||
mDictionary = null;
|
mDictionary = null;
|
||||||
mShortcutsMap.clear();
|
mShortcutsMap.clear();
|
||||||
|
@ -138,7 +139,7 @@ public class XmlDictInputOutput {
|
||||||
public void endElement(String uri, String localName, String qName) {
|
public void endElement(String uri, String localName, String qName) {
|
||||||
if (WORD == mState) {
|
if (WORD == mState) {
|
||||||
mDictionary.add(mWord, new ProbabilityInfo(mFreq), mShortcutsMap.get(mWord),
|
mDictionary.add(mWord, new ProbabilityInfo(mFreq), mShortcutsMap.get(mWord),
|
||||||
false /* isNotAWord */);
|
false /* isNotAWord */, false /* isPossiblyOffensive */);
|
||||||
mState = START;
|
mState = START;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -54,11 +54,16 @@ public class BinaryDictOffdeviceUtilsTests extends TestCase {
|
||||||
testOptions.mAttributes.put(DictionaryHeader.DICTIONARY_LOCALE_KEY, LOCALE);
|
testOptions.mAttributes.put(DictionaryHeader.DICTIONARY_LOCALE_KEY, LOCALE);
|
||||||
testOptions.mAttributes.put(DictionaryHeader.DICTIONARY_ID_KEY, ID);
|
testOptions.mAttributes.put(DictionaryHeader.DICTIONARY_ID_KEY, ID);
|
||||||
final FusionDictionary dict = new FusionDictionary(new PtNodeArray(), testOptions);
|
final FusionDictionary dict = new FusionDictionary(new PtNodeArray(), testOptions);
|
||||||
dict.add("foo", new ProbabilityInfo(TEST_FREQ), null, false /* isNotAWord */);
|
dict.add("foo", new ProbabilityInfo(TEST_FREQ), null, false /* isNotAWord */,
|
||||||
dict.add("fta", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
false /* isPossiblyOffensive */);
|
||||||
dict.add("ftb", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
dict.add("fta", new ProbabilityInfo(1), null, false /* isNotAWord */,
|
||||||
dict.add("bar", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
false /* isPossiblyOffensive */);
|
||||||
dict.add("fool", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
dict.add("ftb", new ProbabilityInfo(1), null, false /* isNotAWord */,
|
||||||
|
false /* isPossiblyOffensive */);
|
||||||
|
dict.add("bar", new ProbabilityInfo(1), null, false /* isNotAWord */,
|
||||||
|
false /* isPossiblyOffensive */);
|
||||||
|
dict.add("fool", new ProbabilityInfo(1), null, false /* isNotAWord */,
|
||||||
|
false /* isPossiblyOffensive */);
|
||||||
|
|
||||||
final File dst = File.createTempFile("testGetRawDict", ".tmp");
|
final File dst = File.createTempFile("testGetRawDict", ".tmp");
|
||||||
dst.deleteOnExit();
|
dst.deleteOnExit();
|
||||||
|
|
|
@ -33,11 +33,16 @@ public class BinaryDictEncoderFlattenTreeTests extends TestCase {
|
||||||
public void testFlattenNodes() {
|
public void testFlattenNodes() {
|
||||||
final FusionDictionary dict = new FusionDictionary(new PtNodeArray(),
|
final FusionDictionary dict = new FusionDictionary(new PtNodeArray(),
|
||||||
new DictionaryOptions(new HashMap<String, String>()));
|
new DictionaryOptions(new HashMap<String, String>()));
|
||||||
dict.add("foo", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
dict.add("foo", new ProbabilityInfo(1), null, false /* isNotAWord */,
|
||||||
dict.add("fta", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
false /* isPossiblyOffensive */);
|
||||||
dict.add("ftb", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
dict.add("fta", new ProbabilityInfo(1), null, false /* isNotAWord */,
|
||||||
dict.add("bar", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
false /* isPossiblyOffensive */);
|
||||||
dict.add("fool", new ProbabilityInfo(1), null, false /* isNotAWord */);
|
dict.add("ftb", new ProbabilityInfo(1), null, false /* isNotAWord */,
|
||||||
|
false /* isPossiblyOffensive */);
|
||||||
|
dict.add("bar", new ProbabilityInfo(1), null, false /* isNotAWord */,
|
||||||
|
false /* isPossiblyOffensive */);
|
||||||
|
dict.add("fool", new ProbabilityInfo(1), null, false /* isNotAWord */,
|
||||||
|
false /* isPossiblyOffensive */);
|
||||||
final ArrayList<PtNodeArray> result =
|
final ArrayList<PtNodeArray> result =
|
||||||
BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray);
|
BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray);
|
||||||
assertEquals(4, result.size());
|
assertEquals(4, result.size());
|
||||||
|
|
|
@ -101,7 +101,8 @@ public class FusionDictionaryTest extends TestCase {
|
||||||
prepare(time);
|
prepare(time);
|
||||||
for (int i = 0; i < sWords.size(); ++i) {
|
for (int i = 0; i < sWords.size(); ++i) {
|
||||||
System.out.println("Adding in pos " + i + " : " + dumpWord(sWords.get(i)));
|
System.out.println("Adding in pos " + i + " : " + dumpWord(sWords.get(i)));
|
||||||
dict.add(sWords.get(i), new ProbabilityInfo(180), null, false);
|
dict.add(sWords.get(i), new ProbabilityInfo(180), null, false,
|
||||||
|
false /* isPossiblyOffensive */);
|
||||||
dumpDict(dict);
|
dumpDict(dict);
|
||||||
checkDictionary(dict, sWords, i);
|
checkDictionary(dict, sWords, i);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue