Further fixes to treat 0-frequency words

Previously, when both legitimate 0-frequency words (such as
distracters) and offensive words were encoded in the same
way, distracters would never show up when the user blocked
offensive words (the default setting, as well as the setting
for regression tests).

When b/11031090 was fixed and a separate encoding was used
for offensive words, 0-frequency words would no longer be
blocked when they were an "exact match" (where case
mismatches and accent mismatches would be considered an
"exact match"). The exact match boosting functionality meant
that, for example, when the user typed "mt" they would be
suggested the word "Mt", although they most probably meant
to type "my".

For this reason, we introduced this change, which does the
following:
* Defines the "perfect match" as a really exact match, with
no room for case or accent mismatches
* When the target word has probability zero (as "Mt" does,
because it is a distracter), ONLY boost its score if it is a
perfect match.

By doing this, when the user types "mt", the word "Mt" will
NOT be boosted, and they will get "my". However, if the user
makes an explicit effort to type "Mt", we do boost the word
"Mt" so that the user's input is not autocorrected to "My".

Bug: 11031090
Change-Id: I92ee1b4e742645d52e2f7f8c4390920481e8fff0
main
Adrian Velicu 2014-10-29 15:46:19 +09:00
parent 10416241f7
commit 009e02ce4a
7 changed files with 47 additions and 12 deletions

View File

@ -31,6 +31,7 @@ const ErrorTypeUtils::ErrorType ErrorTypeUtils::NEW_WORD = 0x100;
const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH =
NOT_AN_ERROR | MATCH_WITH_WRONG_CASE | MATCH_WITH_MISSING_ACCENT | MATCH_WITH_DIGRAPH;
const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_A_PERFECT_MATCH = NOT_AN_ERROR;
const ErrorTypeUtils::ErrorType
ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION =

View File

@ -52,6 +52,10 @@ class ErrorTypeUtils {
return (containedErrorTypes & ~ERRORS_TREATED_AS_AN_EXACT_MATCH) == 0;
}
static bool isPerfectMatch(const ErrorType containedErrorTypes) {
return (containedErrorTypes & ~ERRORS_TREATED_AS_A_PERFECT_MATCH) == 0;
}
static bool isExactMatchWithIntentionalOmission(const ErrorType containedErrorTypes) {
return (containedErrorTypes
& ~ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION) == 0;
@ -73,6 +77,7 @@ class ErrorTypeUtils {
DISALLOW_IMPLICIT_CONSTRUCTORS(ErrorTypeUtils);
static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH;
static const ErrorType ERRORS_TREATED_AS_A_PERFECT_MATCH;
static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION;
};
} // namespace latinime

View File

@ -30,7 +30,7 @@ class Scoring {
public:
virtual int calculateFinalScore(const float compoundDistance, const int inputSize,
const ErrorTypeUtils::ErrorType containedErrorTypes, const bool forceCommit,
const bool boostExactMatches) const = 0;
const bool boostExactMatches, const bool hasProbabilityZero) const = 0;
virtual void getMostProbableString(const DicTraverseSession *const traverseSession,
const float weightOfLangModelVsSpatialModel,
SuggestionResults *const outSuggestionResults) const = 0;

View File

@ -161,7 +161,7 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
compoundDistance, traverseSession->getInputSize(),
terminalDicNode->getContainedErrorTypes(),
(forceCommitMultiWords && terminalDicNode->hasMultipleWords()),
boostExactMatches);
boostExactMatches, wordAttributes.getProbability() == 0);
// Don't output invalid or blocked offensive words. However, we still need to submit their
// shortcuts if any.

View File

@ -24,6 +24,7 @@ const int ScoringParams::THRESHOLD_NEXT_WORD_PROBABILITY_FOR_CAPPED = 120;
const float ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD = 1.0f;
const float ScoringParams::EXACT_MATCH_PROMOTION = 1.1f;
const float ScoringParams::PERFECT_MATCH_PROMOTION = 1.1f;
const float ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH = 0.01f;
const float ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH = 0.02f;
const float ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH = 0.03f;

View File

@ -34,6 +34,7 @@ class ScoringParams {
static const int THRESHOLD_SHORT_WORD_LENGTH;
static const float EXACT_MATCH_PROMOTION;
static const float PERFECT_MATCH_PROMOTION;
static const float CASE_ERROR_PENALTY_FOR_EXACT_MATCH;
static const float ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH;
static const float DIGRAPH_PENALTY_FOR_EXACT_MATCH;

View File

@ -44,23 +44,50 @@ class TypingScoring : public Scoring {
AK_FORCE_INLINE int calculateFinalScore(const float compoundDistance, const int inputSize,
const ErrorTypeUtils::ErrorType containedErrorTypes, const bool forceCommit,
const bool boostExactMatches) const {
const bool boostExactMatches, const bool hasProbabilityZero) const {
const float maxDistance = ScoringParams::DISTANCE_WEIGHT_LANGUAGE
+ static_cast<float>(inputSize) * ScoringParams::TYPING_MAX_OUTPUT_SCORE_PER_INPUT;
float score = ScoringParams::TYPING_BASE_OUTPUT_SCORE - compoundDistance / maxDistance;
if (forceCommit) {
score += ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD;
}
if (boostExactMatches && ErrorTypeUtils::isExactMatch(containedErrorTypes)) {
score += ScoringParams::EXACT_MATCH_PROMOTION;
if ((ErrorTypeUtils::MATCH_WITH_WRONG_CASE & containedErrorTypes) != 0) {
score -= ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH;
if (hasProbabilityZero) {
// Previously, when both legitimate 0-frequency words (such as distracters) and
// offensive words were encoded in the same way, distracters would never show up
// when the user blocked offensive words (the default setting, as well as the
// setting for regression tests).
//
// When b/11031090 was fixed and a separate encoding was used for offensive words,
// 0-frequency words would no longer be blocked when they were an "exact match"
// (where case mismatches and accent mismatches would be considered an "exact
// match"). The exact match boosting functionality meant that, for example, when
// the user typed "mt" they would be suggested the word "Mt", although they most
// probably meant to type "my".
//
// For this reason, we introduced this change, which does the following:
// * Defines the "perfect match" as a really exact match, with no room for case or
// accent mismatches
// * When the target word has probability zero (as "Mt" does, because it is a
// distracter), ONLY boost its score if it is a perfect match.
//
// By doing this, when the user types "mt", the word "Mt" will NOT be boosted, and
// they will get "my". However, if the user makes an explicit effort to type "Mt",
// we do boost the word "Mt" so that the user's input is not autocorrected to "My".
if (boostExactMatches && ErrorTypeUtils::isPerfectMatch(containedErrorTypes)) {
score += ScoringParams::PERFECT_MATCH_PROMOTION;
}
if ((ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT & containedErrorTypes) != 0) {
score -= ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH;
}
if ((ErrorTypeUtils::MATCH_WITH_DIGRAPH & containedErrorTypes) != 0) {
score -= ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH;
} else {
if (boostExactMatches && ErrorTypeUtils::isExactMatch(containedErrorTypes)) {
score += ScoringParams::EXACT_MATCH_PROMOTION;
if ((ErrorTypeUtils::MATCH_WITH_WRONG_CASE & containedErrorTypes) != 0) {
score -= ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH;
}
if ((ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT & containedErrorTypes) != 0) {
score -= ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH;
}
if ((ErrorTypeUtils::MATCH_WITH_DIGRAPH & containedErrorTypes) != 0) {
score -= ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH;
}
}
}
return static_cast<int>(score * SUGGEST_INTERFACE_OUTPUT_SCALE);