Further fixes to treat 0-frequency words
Previously, when both legitimate 0-frequency words (such as distracters) and offensive words were encoded in the same way, distracters would never show up when the user blocked offensive words (the default setting, as well as the setting for regression tests).

When b/11031090 was fixed and a separate encoding was used for offensive words, 0-frequency words would no longer be blocked when they were an "exact match" (where case mismatches and accent mismatches would be considered an "exact match"). The exact match boosting functionality meant that, for example, when the user typed "mt" they would be suggested the word "Mt", although they most probably meant to type "my".

For this reason, we introduced this change, which does the following:
* Defines the "perfect match" as a really exact match, with no room for case or accent mismatches
* When the target word has probability zero (as "Mt" does, because it is a distracter), ONLY boost its score if it is a perfect match.

By doing this, when the user types "mt", the word "Mt" will NOT be boosted, and they will get "my". However, if the user makes an explicit effort to type "Mt", we do boost the word "Mt" so that the user's input is not autocorrected to "My".

Bug: 11031090
Change-Id: I92ee1b4e742645d52e2f7f8c4390920481e8fff0
This commit is contained in:
parent
10416241f7
commit
009e02ce4a
7 changed files with 47 additions and 12 deletions
|
@ -31,6 +31,7 @@ const ErrorTypeUtils::ErrorType ErrorTypeUtils::NEW_WORD = 0x100;
|
||||||
|
|
||||||
const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH =
|
const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH =
|
||||||
NOT_AN_ERROR | MATCH_WITH_WRONG_CASE | MATCH_WITH_MISSING_ACCENT | MATCH_WITH_DIGRAPH;
|
NOT_AN_ERROR | MATCH_WITH_WRONG_CASE | MATCH_WITH_MISSING_ACCENT | MATCH_WITH_DIGRAPH;
|
||||||
|
const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_A_PERFECT_MATCH = NOT_AN_ERROR;
|
||||||
|
|
||||||
const ErrorTypeUtils::ErrorType
|
const ErrorTypeUtils::ErrorType
|
||||||
ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION =
|
ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION =
|
||||||
|
|
|
@ -52,6 +52,10 @@ class ErrorTypeUtils {
|
||||||
return (containedErrorTypes & ~ERRORS_TREATED_AS_AN_EXACT_MATCH) == 0;
|
return (containedErrorTypes & ~ERRORS_TREATED_AS_AN_EXACT_MATCH) == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool isPerfectMatch(const ErrorType containedErrorTypes) {
|
||||||
|
return (containedErrorTypes & ~ERRORS_TREATED_AS_A_PERFECT_MATCH) == 0;
|
||||||
|
}
|
||||||
|
|
||||||
static bool isExactMatchWithIntentionalOmission(const ErrorType containedErrorTypes) {
|
static bool isExactMatchWithIntentionalOmission(const ErrorType containedErrorTypes) {
|
||||||
return (containedErrorTypes
|
return (containedErrorTypes
|
||||||
& ~ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION) == 0;
|
& ~ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION) == 0;
|
||||||
|
@ -73,6 +77,7 @@ class ErrorTypeUtils {
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(ErrorTypeUtils);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(ErrorTypeUtils);
|
||||||
|
|
||||||
static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH;
|
static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH;
|
||||||
|
static const ErrorType ERRORS_TREATED_AS_A_PERFECT_MATCH;
|
||||||
static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION;
|
static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION;
|
||||||
};
|
};
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
|
|
|
@ -30,7 +30,7 @@ class Scoring {
|
||||||
public:
|
public:
|
||||||
virtual int calculateFinalScore(const float compoundDistance, const int inputSize,
|
virtual int calculateFinalScore(const float compoundDistance, const int inputSize,
|
||||||
const ErrorTypeUtils::ErrorType containedErrorTypes, const bool forceCommit,
|
const ErrorTypeUtils::ErrorType containedErrorTypes, const bool forceCommit,
|
||||||
const bool boostExactMatches) const = 0;
|
const bool boostExactMatches, const bool hasProbabilityZero) const = 0;
|
||||||
virtual void getMostProbableString(const DicTraverseSession *const traverseSession,
|
virtual void getMostProbableString(const DicTraverseSession *const traverseSession,
|
||||||
const float weightOfLangModelVsSpatialModel,
|
const float weightOfLangModelVsSpatialModel,
|
||||||
SuggestionResults *const outSuggestionResults) const = 0;
|
SuggestionResults *const outSuggestionResults) const = 0;
|
||||||
|
|
|
@ -161,7 +161,7 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
|
||||||
compoundDistance, traverseSession->getInputSize(),
|
compoundDistance, traverseSession->getInputSize(),
|
||||||
terminalDicNode->getContainedErrorTypes(),
|
terminalDicNode->getContainedErrorTypes(),
|
||||||
(forceCommitMultiWords && terminalDicNode->hasMultipleWords()),
|
(forceCommitMultiWords && terminalDicNode->hasMultipleWords()),
|
||||||
boostExactMatches);
|
boostExactMatches, wordAttributes.getProbability() == 0);
|
||||||
|
|
||||||
// Don't output invalid or blocked offensive words. However, we still need to submit their
|
// Don't output invalid or blocked offensive words. However, we still need to submit their
|
||||||
// shortcuts if any.
|
// shortcuts if any.
|
||||||
|
|
|
@ -24,6 +24,7 @@ const int ScoringParams::THRESHOLD_NEXT_WORD_PROBABILITY_FOR_CAPPED = 120;
|
||||||
const float ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD = 1.0f;
|
const float ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD = 1.0f;
|
||||||
|
|
||||||
const float ScoringParams::EXACT_MATCH_PROMOTION = 1.1f;
|
const float ScoringParams::EXACT_MATCH_PROMOTION = 1.1f;
|
||||||
|
const float ScoringParams::PERFECT_MATCH_PROMOTION = 1.1f;
|
||||||
const float ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH = 0.01f;
|
const float ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH = 0.01f;
|
||||||
const float ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH = 0.02f;
|
const float ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH = 0.02f;
|
||||||
const float ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH = 0.03f;
|
const float ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH = 0.03f;
|
||||||
|
|
|
@ -34,6 +34,7 @@ class ScoringParams {
|
||||||
static const int THRESHOLD_SHORT_WORD_LENGTH;
|
static const int THRESHOLD_SHORT_WORD_LENGTH;
|
||||||
|
|
||||||
static const float EXACT_MATCH_PROMOTION;
|
static const float EXACT_MATCH_PROMOTION;
|
||||||
|
static const float PERFECT_MATCH_PROMOTION;
|
||||||
static const float CASE_ERROR_PENALTY_FOR_EXACT_MATCH;
|
static const float CASE_ERROR_PENALTY_FOR_EXACT_MATCH;
|
||||||
static const float ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH;
|
static const float ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH;
|
||||||
static const float DIGRAPH_PENALTY_FOR_EXACT_MATCH;
|
static const float DIGRAPH_PENALTY_FOR_EXACT_MATCH;
|
||||||
|
|
|
@ -44,23 +44,50 @@ class TypingScoring : public Scoring {
|
||||||
|
|
||||||
AK_FORCE_INLINE int calculateFinalScore(const float compoundDistance, const int inputSize,
|
AK_FORCE_INLINE int calculateFinalScore(const float compoundDistance, const int inputSize,
|
||||||
const ErrorTypeUtils::ErrorType containedErrorTypes, const bool forceCommit,
|
const ErrorTypeUtils::ErrorType containedErrorTypes, const bool forceCommit,
|
||||||
const bool boostExactMatches) const {
|
const bool boostExactMatches, const bool hasProbabilityZero) const {
|
||||||
const float maxDistance = ScoringParams::DISTANCE_WEIGHT_LANGUAGE
|
const float maxDistance = ScoringParams::DISTANCE_WEIGHT_LANGUAGE
|
||||||
+ static_cast<float>(inputSize) * ScoringParams::TYPING_MAX_OUTPUT_SCORE_PER_INPUT;
|
+ static_cast<float>(inputSize) * ScoringParams::TYPING_MAX_OUTPUT_SCORE_PER_INPUT;
|
||||||
float score = ScoringParams::TYPING_BASE_OUTPUT_SCORE - compoundDistance / maxDistance;
|
float score = ScoringParams::TYPING_BASE_OUTPUT_SCORE - compoundDistance / maxDistance;
|
||||||
if (forceCommit) {
|
if (forceCommit) {
|
||||||
score += ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD;
|
score += ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD;
|
||||||
}
|
}
|
||||||
if (boostExactMatches && ErrorTypeUtils::isExactMatch(containedErrorTypes)) {
|
if (hasProbabilityZero) {
|
||||||
score += ScoringParams::EXACT_MATCH_PROMOTION;
|
// Previously, when both legitimate 0-frequency words (such as distracters) and
|
||||||
if ((ErrorTypeUtils::MATCH_WITH_WRONG_CASE & containedErrorTypes) != 0) {
|
// offensive words were encoded in the same way, distracters would never show up
|
||||||
score -= ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH;
|
// when the user blocked offensive words (the default setting, as well as the
|
||||||
|
// setting for regression tests).
|
||||||
|
//
|
||||||
|
// When b/11031090 was fixed and a separate encoding was used for offensive words,
|
||||||
|
// 0-frequency words would no longer be blocked when they were an "exact match"
|
||||||
|
// (where case mismatches and accent mismatches would be considered an "exact
|
||||||
|
// match"). The exact match boosting functionality meant that, for example, when
|
||||||
|
// the user typed "mt" they would be suggested the word "Mt", although they most
|
||||||
|
// probably meant to type "my".
|
||||||
|
//
|
||||||
|
// For this reason, we introduced this change, which does the following:
|
||||||
|
// * Defines the "perfect match" as a really exact match, with no room for case or
|
||||||
|
// accent mismatches
|
||||||
|
// * When the target word has probability zero (as "Mt" does, because it is a
|
||||||
|
// distracter), ONLY boost its score if it is a perfect match.
|
||||||
|
//
|
||||||
|
// By doing this, when the user types "mt", the word "Mt" will NOT be boosted, and
|
||||||
|
// they will get "my". However, if the user makes an explicit effort to type "Mt",
|
||||||
|
// we do boost the word "Mt" so that the user's input is not autocorrected to "My".
|
||||||
|
if (boostExactMatches && ErrorTypeUtils::isPerfectMatch(containedErrorTypes)) {
|
||||||
|
score += ScoringParams::PERFECT_MATCH_PROMOTION;
|
||||||
}
|
}
|
||||||
if ((ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT & containedErrorTypes) != 0) {
|
} else {
|
||||||
score -= ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH;
|
if (boostExactMatches && ErrorTypeUtils::isExactMatch(containedErrorTypes)) {
|
||||||
}
|
score += ScoringParams::EXACT_MATCH_PROMOTION;
|
||||||
if ((ErrorTypeUtils::MATCH_WITH_DIGRAPH & containedErrorTypes) != 0) {
|
if ((ErrorTypeUtils::MATCH_WITH_WRONG_CASE & containedErrorTypes) != 0) {
|
||||||
score -= ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH;
|
score -= ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH;
|
||||||
|
}
|
||||||
|
if ((ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT & containedErrorTypes) != 0) {
|
||||||
|
score -= ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH;
|
||||||
|
}
|
||||||
|
if ((ErrorTypeUtils::MATCH_WITH_DIGRAPH & containedErrorTypes) != 0) {
|
||||||
|
score -= ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return static_cast<int>(score * SUGGEST_INTERFACE_OUTPUT_SCALE);
|
return static_cast<int>(score * SUGGEST_INTERFACE_OUTPUT_SCALE);
|
||||||
|
|
Loading…
Reference in a new issue