am 372ca14d: Merge "Further fixes to treat 0-frequency words"
* commit '372ca14deafbc12ccd34004a8779a9d24ff1dcf8': Further fixes to treat 0-frequency words
main
commit
9367ec5f76
|
@ -31,6 +31,7 @@ const ErrorTypeUtils::ErrorType ErrorTypeUtils::NEW_WORD = 0x100;
|
||||||
|
|
||||||
const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH =
|
const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH =
|
||||||
NOT_AN_ERROR | MATCH_WITH_WRONG_CASE | MATCH_WITH_MISSING_ACCENT | MATCH_WITH_DIGRAPH;
|
NOT_AN_ERROR | MATCH_WITH_WRONG_CASE | MATCH_WITH_MISSING_ACCENT | MATCH_WITH_DIGRAPH;
|
||||||
|
const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_A_PERFECT_MATCH = NOT_AN_ERROR;
|
||||||
|
|
||||||
const ErrorTypeUtils::ErrorType
|
const ErrorTypeUtils::ErrorType
|
||||||
ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION =
|
ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION =
|
||||||
|
|
|
@ -52,6 +52,10 @@ class ErrorTypeUtils {
|
||||||
return (containedErrorTypes & ~ERRORS_TREATED_AS_AN_EXACT_MATCH) == 0;
|
return (containedErrorTypes & ~ERRORS_TREATED_AS_AN_EXACT_MATCH) == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool isPerfectMatch(const ErrorType containedErrorTypes) {
|
||||||
|
return (containedErrorTypes & ~ERRORS_TREATED_AS_A_PERFECT_MATCH) == 0;
|
||||||
|
}
|
||||||
|
|
||||||
static bool isExactMatchWithIntentionalOmission(const ErrorType containedErrorTypes) {
|
static bool isExactMatchWithIntentionalOmission(const ErrorType containedErrorTypes) {
|
||||||
return (containedErrorTypes
|
return (containedErrorTypes
|
||||||
& ~ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION) == 0;
|
& ~ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION) == 0;
|
||||||
|
@ -73,6 +77,7 @@ class ErrorTypeUtils {
|
||||||
DISALLOW_IMPLICIT_CONSTRUCTORS(ErrorTypeUtils);
|
DISALLOW_IMPLICIT_CONSTRUCTORS(ErrorTypeUtils);
|
||||||
|
|
||||||
static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH;
|
static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH;
|
||||||
|
static const ErrorType ERRORS_TREATED_AS_A_PERFECT_MATCH;
|
||||||
static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION;
|
static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION;
|
||||||
};
|
};
|
||||||
} // namespace latinime
|
} // namespace latinime
|
||||||
|
|
|
@ -30,7 +30,7 @@ class Scoring {
|
||||||
public:
|
public:
|
||||||
virtual int calculateFinalScore(const float compoundDistance, const int inputSize,
|
virtual int calculateFinalScore(const float compoundDistance, const int inputSize,
|
||||||
const ErrorTypeUtils::ErrorType containedErrorTypes, const bool forceCommit,
|
const ErrorTypeUtils::ErrorType containedErrorTypes, const bool forceCommit,
|
||||||
const bool boostExactMatches) const = 0;
|
const bool boostExactMatches, const bool hasProbabilityZero) const = 0;
|
||||||
virtual void getMostProbableString(const DicTraverseSession *const traverseSession,
|
virtual void getMostProbableString(const DicTraverseSession *const traverseSession,
|
||||||
const float weightOfLangModelVsSpatialModel,
|
const float weightOfLangModelVsSpatialModel,
|
||||||
SuggestionResults *const outSuggestionResults) const = 0;
|
SuggestionResults *const outSuggestionResults) const = 0;
|
||||||
|
|
|
@ -161,7 +161,7 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
|
||||||
compoundDistance, traverseSession->getInputSize(),
|
compoundDistance, traverseSession->getInputSize(),
|
||||||
terminalDicNode->getContainedErrorTypes(),
|
terminalDicNode->getContainedErrorTypes(),
|
||||||
(forceCommitMultiWords && terminalDicNode->hasMultipleWords()),
|
(forceCommitMultiWords && terminalDicNode->hasMultipleWords()),
|
||||||
boostExactMatches);
|
boostExactMatches, wordAttributes.getProbability() == 0);
|
||||||
|
|
||||||
// Don't output invalid or blocked offensive words. However, we still need to submit their
|
// Don't output invalid or blocked offensive words. However, we still need to submit their
|
||||||
// shortcuts if any.
|
// shortcuts if any.
|
||||||
|
|
|
@ -24,6 +24,7 @@ const int ScoringParams::THRESHOLD_NEXT_WORD_PROBABILITY_FOR_CAPPED = 120;
|
||||||
const float ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD = 1.0f;
|
const float ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD = 1.0f;
|
||||||
|
|
||||||
const float ScoringParams::EXACT_MATCH_PROMOTION = 1.1f;
|
const float ScoringParams::EXACT_MATCH_PROMOTION = 1.1f;
|
||||||
|
const float ScoringParams::PERFECT_MATCH_PROMOTION = 1.1f;
|
||||||
const float ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH = 0.01f;
|
const float ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH = 0.01f;
|
||||||
const float ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH = 0.02f;
|
const float ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH = 0.02f;
|
||||||
const float ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH = 0.03f;
|
const float ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH = 0.03f;
|
||||||
|
|
|
@ -34,6 +34,7 @@ class ScoringParams {
|
||||||
static const int THRESHOLD_SHORT_WORD_LENGTH;
|
static const int THRESHOLD_SHORT_WORD_LENGTH;
|
||||||
|
|
||||||
static const float EXACT_MATCH_PROMOTION;
|
static const float EXACT_MATCH_PROMOTION;
|
||||||
|
static const float PERFECT_MATCH_PROMOTION;
|
||||||
static const float CASE_ERROR_PENALTY_FOR_EXACT_MATCH;
|
static const float CASE_ERROR_PENALTY_FOR_EXACT_MATCH;
|
||||||
static const float ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH;
|
static const float ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH;
|
||||||
static const float DIGRAPH_PENALTY_FOR_EXACT_MATCH;
|
static const float DIGRAPH_PENALTY_FOR_EXACT_MATCH;
|
||||||
|
|
|
@ -44,23 +44,50 @@ class TypingScoring : public Scoring {
|
||||||
|
|
||||||
AK_FORCE_INLINE int calculateFinalScore(const float compoundDistance, const int inputSize,
|
AK_FORCE_INLINE int calculateFinalScore(const float compoundDistance, const int inputSize,
|
||||||
const ErrorTypeUtils::ErrorType containedErrorTypes, const bool forceCommit,
|
const ErrorTypeUtils::ErrorType containedErrorTypes, const bool forceCommit,
|
||||||
const bool boostExactMatches) const {
|
const bool boostExactMatches, const bool hasProbabilityZero) const {
|
||||||
const float maxDistance = ScoringParams::DISTANCE_WEIGHT_LANGUAGE
|
const float maxDistance = ScoringParams::DISTANCE_WEIGHT_LANGUAGE
|
||||||
+ static_cast<float>(inputSize) * ScoringParams::TYPING_MAX_OUTPUT_SCORE_PER_INPUT;
|
+ static_cast<float>(inputSize) * ScoringParams::TYPING_MAX_OUTPUT_SCORE_PER_INPUT;
|
||||||
float score = ScoringParams::TYPING_BASE_OUTPUT_SCORE - compoundDistance / maxDistance;
|
float score = ScoringParams::TYPING_BASE_OUTPUT_SCORE - compoundDistance / maxDistance;
|
||||||
if (forceCommit) {
|
if (forceCommit) {
|
||||||
score += ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD;
|
score += ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD;
|
||||||
}
|
}
|
||||||
if (boostExactMatches && ErrorTypeUtils::isExactMatch(containedErrorTypes)) {
|
if (hasProbabilityZero) {
|
||||||
score += ScoringParams::EXACT_MATCH_PROMOTION;
|
// Previously, when both legitimate 0-frequency words (such as distracters) and
|
||||||
if ((ErrorTypeUtils::MATCH_WITH_WRONG_CASE & containedErrorTypes) != 0) {
|
// offensive words were encoded in the same way, distracters would never show up
|
||||||
score -= ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH;
|
// when the user blocked offensive words (the default setting, as well as the
|
||||||
|
// setting for regression tests).
|
||||||
|
//
|
||||||
|
// When b/11031090 was fixed and a separate encoding was used for offensive words,
|
||||||
|
// 0-frequency words would no longer be blocked when they were an "exact match"
|
||||||
|
// (where case mismatches and accent mismatches would be considered an "exact
|
||||||
|
// match"). The exact match boosting functionality meant that, for example, when
|
||||||
|
// the user typed "mt" they would be suggested the word "Mt", although they most
|
||||||
|
// probably meant to type "my".
|
||||||
|
//
|
||||||
|
// For this reason, we introduced this change, which does the following:
|
||||||
|
// * Defines the "perfect match" as a really exact match, with no room for case or
|
||||||
|
// accent mismatches
|
||||||
|
// * When the target word has probability zero (as "Mt" does, because it is a
|
||||||
|
// distracter), ONLY boost its score if it is a perfect match.
|
||||||
|
//
|
||||||
|
// By doing this, when the user types "mt", the word "Mt" will NOT be boosted, and
|
||||||
|
// they will get "my". However, if the user makes an explicit effort to type "Mt",
|
||||||
|
// we do boost the word "Mt" so that the user's input is not autocorrected to "My".
|
||||||
|
if (boostExactMatches && ErrorTypeUtils::isPerfectMatch(containedErrorTypes)) {
|
||||||
|
score += ScoringParams::PERFECT_MATCH_PROMOTION;
|
||||||
}
|
}
|
||||||
if ((ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT & containedErrorTypes) != 0) {
|
} else {
|
||||||
score -= ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH;
|
if (boostExactMatches && ErrorTypeUtils::isExactMatch(containedErrorTypes)) {
|
||||||
}
|
score += ScoringParams::EXACT_MATCH_PROMOTION;
|
||||||
if ((ErrorTypeUtils::MATCH_WITH_DIGRAPH & containedErrorTypes) != 0) {
|
if ((ErrorTypeUtils::MATCH_WITH_WRONG_CASE & containedErrorTypes) != 0) {
|
||||||
score -= ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH;
|
score -= ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH;
|
||||||
|
}
|
||||||
|
if ((ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT & containedErrorTypes) != 0) {
|
||||||
|
score -= ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH;
|
||||||
|
}
|
||||||
|
if ((ErrorTypeUtils::MATCH_WITH_DIGRAPH & containedErrorTypes) != 0) {
|
||||||
|
score -= ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return static_cast<int>(score * SUGGEST_INTERFACE_OUTPUT_SCALE);
|
return static_cast<int>(score * SUGGEST_INTERFACE_OUTPUT_SCALE);
|
||||||
|
|
Loading…
Reference in New Issue