am 434c08a7
: Merge "Block offensive words in multi-word suggestions"
* commit '434c08a79ec60347590e26725372834678a8aa84': Block offensive words in multi-word suggestions
This commit is contained in:
commit
a1ba8ae1bc
3 changed files with 65 additions and 13 deletions
|
@ -76,6 +76,52 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
|
|||
weightOfLangModelVsSpatialModelToOutputSuggestions, outSuggestionResults);
|
||||
}
|
||||
|
||||
/* static */ bool SuggestionsOutputUtils::shouldBlockWord(
|
||||
const SuggestOptions *const suggestOptions, const DicNode *const terminalDicNode,
|
||||
const WordAttributes wordAttributes, const bool isLastWord) {
|
||||
const bool currentWordExactMatch =
|
||||
ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes());
|
||||
// When we have to block offensive words, non-exact matched offensive words should not be
|
||||
// output.
|
||||
const bool shouldBlockOffensiveWords = suggestOptions->blockOffensiveWords();
|
||||
|
||||
const bool isBlockedOffensiveWord = shouldBlockOffensiveWords &&
|
||||
wordAttributes.isPossiblyOffensive();
|
||||
|
||||
// This function is called in two situations:
|
||||
//
|
||||
// 1) At the end of a search, in which case terminalDicNode will point to the last DicNode
|
||||
// of the search, and isLastWord will be true.
|
||||
// "fuck"
|
||||
// |
|
||||
// \ terminalDicNode (isLastWord=true, currentWordExactMatch=true)
|
||||
// In this case, if the current word is an exact match, we will always let the word
|
||||
// through, even if the user is blocking offensive words (it's exactly what they typed!)
|
||||
//
|
||||
// 2) In the middle of the search, when we hit a terminal node, to decide whether or not
|
||||
// to start a new search at root, to try to match the rest of the input. In this case,
|
||||
// terminalDicNode will point to the terminal node we just hit, and isLastWord will be
|
||||
// false.
|
||||
// "fuckvthis"
|
||||
// |
|
||||
// \ terminalDicNode (isLastWord=false, currentWordExactMatch=true)
|
||||
//
|
||||
// In this case, we should NOT allow the match through (correcting "fuckthis" to "fuck this"
|
||||
// when offensive words are blocked would be a bad idea).
|
||||
//
|
||||
// In the case of a multi-word correction where the offensive word is typed last (eg.
|
||||
// for the input "allfuck"), this function will be called with isLastWord==true, but
|
||||
// currentWordExactMatch==false. So we are OK in this case as well.
|
||||
// "allfuck"
|
||||
// |
|
||||
// \ terminalDicNode (isLastWord=true, currentWordExactMatch=false)
|
||||
if (isLastWord && currentWordExactMatch) {
|
||||
return false;
|
||||
} else {
|
||||
return isBlockedOffensiveWord;
|
||||
}
|
||||
}
|
||||
|
||||
/* static */ void SuggestionsOutputUtils::outputSuggestionsOfDicNode(
|
||||
const Scoring *const scoringPolicy, DicTraverseSession *traverseSession,
|
||||
const DicNode *const terminalDicNode, const float weightOfLangModelVsSpatialModel,
|
||||
|
@ -98,24 +144,16 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
|
|||
const bool isExactMatchWithIntentionalOmission =
|
||||
ErrorTypeUtils::isExactMatchWithIntentionalOmission(
|
||||
terminalDicNode->getContainedErrorTypes());
|
||||
const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase();
|
||||
// Heuristic: We exclude probability=0 first-char-uppercase words from exact match.
|
||||
// (e.g. "AMD" and "and")
|
||||
const bool isSafeExactMatch = isExactMatch
|
||||
&& !(wordAttributes.isPossiblyOffensive() && isFirstCharUppercase);
|
||||
const int outputTypeFlags =
|
||||
(wordAttributes.isPossiblyOffensive() ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0)
|
||||
| ((isSafeExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0)
|
||||
| ((isExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0)
|
||||
| (isExactMatchWithIntentionalOmission ?
|
||||
Dictionary::KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION : 0);
|
||||
|
||||
// Entries that are blacklisted or do not represent a word should not be output.
|
||||
const bool isValidWord = !(wordAttributes.isBlacklisted() || wordAttributes.isNotAWord());
|
||||
// When we have to block offensive words, non-exact matched offensive words should not be
|
||||
// output.
|
||||
const bool blockOffensiveWords = traverseSession->getSuggestOptions()->blockOffensiveWords();
|
||||
const bool isBlockedOffensiveWord = blockOffensiveWords && wordAttributes.isPossiblyOffensive()
|
||||
&& !isSafeExactMatch;
|
||||
|
||||
const bool shouldBlockThisWord = shouldBlockWord(traverseSession->getSuggestOptions(),
|
||||
terminalDicNode, wordAttributes, true /* isLastWord */);
|
||||
|
||||
// Increase output score of top typing suggestion to ensure autocorrection.
|
||||
// TODO: Better integration with java side autocorrection logic.
|
||||
|
@ -127,7 +165,7 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
|
|||
|
||||
// Don't output invalid or blocked offensive words. However, we still need to submit their
|
||||
// shortcuts if any.
|
||||
if (isValidWord && !isBlockedOffensiveWord) {
|
||||
if (isValidWord && !shouldBlockThisWord) {
|
||||
int codePoints[MAX_WORD_LENGTH];
|
||||
terminalDicNode->outputResult(codePoints);
|
||||
const int indexToPartialCommit = outputSecondWordFirstLetterInputIndex ?
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#define LATINIME_SUGGESTIONS_OUTPUT_UTILS
|
||||
|
||||
#include "defines.h"
|
||||
#include "suggest/core/dictionary/word_attributes.h"
|
||||
|
||||
namespace latinime {
|
||||
|
||||
|
@ -25,10 +26,18 @@ class BinaryDictionaryShortcutIterator;
|
|||
class DicNode;
|
||||
class DicTraverseSession;
|
||||
class Scoring;
|
||||
class SuggestOptions;
|
||||
class SuggestionResults;
|
||||
|
||||
class SuggestionsOutputUtils {
|
||||
public:
|
||||
/**
|
||||
* Returns true if we should block the incoming word, in the context of the user's
|
||||
* preferences to include or not include possibly offensive words
|
||||
*/
|
||||
static bool shouldBlockWord(const SuggestOptions *const suggestOptions,
|
||||
const DicNode *const terminalDicNode, const WordAttributes wordAttributes,
|
||||
const bool isLastWord);
|
||||
/**
|
||||
* Outputs the final list of suggestions (i.e., terminal nodes).
|
||||
*/
|
||||
|
|
|
@ -416,6 +416,11 @@ void Suggest::createNextWordDicNode(DicTraverseSession *traverseSession, DicNode
|
|||
traverseSession->getDictionaryStructurePolicy()->getWordAttributesInContext(
|
||||
dicNode->getPrevWordIds(), dicNode->getWordId(),
|
||||
traverseSession->getMultiBigramMap());
|
||||
if (SuggestionsOutputUtils::shouldBlockWord(traverseSession->getSuggestOptions(),
|
||||
dicNode, wordAttributes, false /* isLastWord */)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (!TRAVERSAL->isGoodToTraverseNextWord(dicNode, wordAttributes.getProbability())) {
|
||||
return;
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue