Block offensive words in multi-word suggestions

If the user has chosen to block offensive words and types
"aaaxbb", where "aaa" is an offensive word and "bb" is not,
we should not suggest "aaa bb".

Bug: 11031090
Change-Id: Ie23b8dd5d347bc26b1c046c3f5e8dfbc259bf528
This commit is contained in:
Adrian Velicu 2014-10-21 22:11:23 +09:00
parent 61d43e5c94
commit 10416241f7
3 changed files with 65 additions and 13 deletions

View file

@ -76,6 +76,52 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
weightOfLangModelVsSpatialModelToOutputSuggestions, outSuggestionResults); weightOfLangModelVsSpatialModelToOutputSuggestions, outSuggestionResults);
} }
/* static */ bool SuggestionsOutputUtils::shouldBlockWord(
        const SuggestOptions *const suggestOptions, const DicNode *const terminalDicNode,
        const WordAttributes wordAttributes, const bool isLastWord) {
    // Decides whether the word ending at terminalDicNode must be withheld because the user
    // asked for offensive words to be blocked.
    //
    // In the examples below "aaa" stands for a possibly offensive word and "bb" for a
    // harmless one.
    //
    // There are two kinds of call sites:
    //
    //  (a) End of a search (isLastWord == true). terminalDicNode is the final node of the
    //      search.
    //          input "aaa"     -> exact match, last word  => never blocked
    //      An exactly typed word is always let through, even when offensive words are
    //      blocked: it is precisely what the user typed.
    //
    //  (b) Middle of a search (isLastWord == false). A terminal node was just reached and the
    //      caller is deciding whether to restart at the root to match the rest of the input.
    //          input "aaaxbb"  -> "aaa" is an exact match, but not the last word
    //      Here the exact-match exemption must NOT apply; otherwise the input would be
    //      corrected to "aaa bb" although offensive words are blocked.
    //
    // Multi-word corrections with the offensive word typed last (e.g. input "bbaaa") arrive
    // with isLastWord == true but are not an exact match, so they fall under the ordinary
    // blocking rule as well.

    // Whether the word at this node was typed exactly, according to its contained error types.
    const bool typedExactly =
            ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes());

    // A word is a blocking candidate only when the option is on AND the word is flagged as
    // possibly offensive.
    const bool offensiveAndFiltered =
            suggestOptions->blockOffensiveWords() && wordAttributes.isPossiblyOffensive();

    // Only an exactly typed final word is exempt from blocking.
    const bool isExemptFinalWord = isLastWord && typedExactly;
    return !isExemptFinalWord && offensiveAndFiltered;
}
/* static */ void SuggestionsOutputUtils::outputSuggestionsOfDicNode( /* static */ void SuggestionsOutputUtils::outputSuggestionsOfDicNode(
const Scoring *const scoringPolicy, DicTraverseSession *traverseSession, const Scoring *const scoringPolicy, DicTraverseSession *traverseSession,
const DicNode *const terminalDicNode, const float weightOfLangModelVsSpatialModel, const DicNode *const terminalDicNode, const float weightOfLangModelVsSpatialModel,
@ -98,24 +144,16 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
const bool isExactMatchWithIntentionalOmission = const bool isExactMatchWithIntentionalOmission =
ErrorTypeUtils::isExactMatchWithIntentionalOmission( ErrorTypeUtils::isExactMatchWithIntentionalOmission(
terminalDicNode->getContainedErrorTypes()); terminalDicNode->getContainedErrorTypes());
const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase();
// Heuristic: We exclude probability=0 first-char-uppercase words from exact match.
// (e.g. "AMD" and "and")
const bool isSafeExactMatch = isExactMatch
&& !(wordAttributes.isPossiblyOffensive() && isFirstCharUppercase);
const int outputTypeFlags = const int outputTypeFlags =
(wordAttributes.isPossiblyOffensive() ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0) (wordAttributes.isPossiblyOffensive() ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0)
| ((isSafeExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0) | ((isExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0)
| (isExactMatchWithIntentionalOmission ? | (isExactMatchWithIntentionalOmission ?
Dictionary::KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION : 0); Dictionary::KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION : 0);
// Entries that are blacklisted or do not represent a word should not be output. // Entries that are blacklisted or do not represent a word should not be output.
const bool isValidWord = !(wordAttributes.isBlacklisted() || wordAttributes.isNotAWord()); const bool isValidWord = !(wordAttributes.isBlacklisted() || wordAttributes.isNotAWord());
// When we have to block offensive words, non-exact matched offensive words should not be
// output. const bool shouldBlockThisWord = shouldBlockWord(traverseSession->getSuggestOptions(),
const bool blockOffensiveWords = traverseSession->getSuggestOptions()->blockOffensiveWords(); terminalDicNode, wordAttributes, true /* isLastWord */);
const bool isBlockedOffensiveWord = blockOffensiveWords && wordAttributes.isPossiblyOffensive()
&& !isSafeExactMatch;
// Increase output score of top typing suggestion to ensure autocorrection. // Increase output score of top typing suggestion to ensure autocorrection.
// TODO: Better integration with java side autocorrection logic. // TODO: Better integration with java side autocorrection logic.
@ -127,7 +165,7 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
// Don't output invalid or blocked offensive words. However, we still need to submit their // Don't output invalid or blocked offensive words. However, we still need to submit their
// shortcuts if any. // shortcuts if any.
if (isValidWord && !isBlockedOffensiveWord) { if (isValidWord && !shouldBlockThisWord) {
int codePoints[MAX_WORD_LENGTH]; int codePoints[MAX_WORD_LENGTH];
terminalDicNode->outputResult(codePoints); terminalDicNode->outputResult(codePoints);
const int indexToPartialCommit = outputSecondWordFirstLetterInputIndex ? const int indexToPartialCommit = outputSecondWordFirstLetterInputIndex ?

View file

@ -18,6 +18,7 @@
#define LATINIME_SUGGESTIONS_OUTPUT_UTILS #define LATINIME_SUGGESTIONS_OUTPUT_UTILS
#include "defines.h" #include "defines.h"
#include "suggest/core/dictionary/word_attributes.h"
namespace latinime { namespace latinime {
@ -25,10 +26,18 @@ class BinaryDictionaryShortcutIterator;
class DicNode; class DicNode;
class DicTraverseSession; class DicTraverseSession;
class Scoring; class Scoring;
class SuggestOptions;
class SuggestionResults; class SuggestionResults;
class SuggestionsOutputUtils { class SuggestionsOutputUtils {
public: public:
/**
 * Returns true if we should block the incoming word, in the context of the user's
 * preferences to include or not include possibly offensive words.
 *
 * A word is blocked when offensive-word blocking is enabled in suggestOptions and
 * wordAttributes marks it as possibly offensive, with one exception: a word that is both
 * the last word and an exact match of the input is never blocked.
 *
 * @param suggestOptions  options queried for the blockOffensiveWords() setting
 * @param terminalDicNode terminal node of the word; its contained error types determine
 *                        whether the word is an exact match
 * @param wordAttributes  attributes of the word, queried for isPossiblyOffensive()
 * @param isLastWord      true when called for the final word at the end of a search; false
 *                        when called for a word in the middle of a multi-word search
 * @return true if the word must not be output / continued from, false otherwise
 */
static bool shouldBlockWord(const SuggestOptions *const suggestOptions,
const DicNode *const terminalDicNode, const WordAttributes wordAttributes,
const bool isLastWord);
/** /**
* Outputs the final list of suggestions (i.e., terminal nodes). * Outputs the final list of suggestions (i.e., terminal nodes).
*/ */

View file

@ -416,6 +416,11 @@ void Suggest::createNextWordDicNode(DicTraverseSession *traverseSession, DicNode
traverseSession->getDictionaryStructurePolicy()->getWordAttributesInContext( traverseSession->getDictionaryStructurePolicy()->getWordAttributesInContext(
dicNode->getPrevWordIds(), dicNode->getWordId(), dicNode->getPrevWordIds(), dicNode->getWordId(),
traverseSession->getMultiBigramMap()); traverseSession->getMultiBigramMap());
if (SuggestionsOutputUtils::shouldBlockWord(traverseSession->getSuggestOptions(),
dicNode, wordAttributes, false /* isLastWord */)) {
return;
}
if (!TRAVERSAL->isGoodToTraverseNextWord(dicNode, wordAttributes.getProbability())) { if (!TRAVERSAL->isGoodToTraverseNextWord(dicNode, wordAttributes.getProbability())) {
return; return;
} }