Merge "Apply distracter filter for in-vocabulary words."

This commit is contained in:
Keisuke Kuroyanagi 2014-05-26 14:25:17 +00:00 committed by Android (Google) Code Review
commit f85b478cd8

View file

@ -130,37 +130,27 @@ public final class LanguageModelParam {
if (locale == null) { if (locale == null) {
return null; return null;
} }
// TODO: Though targetWord is an IV (in-vocabulary) word, we should still apply
// distracterFilter in the following code. If targetWord is a distracter,
// it should be filtered out.
if (dictionaryFacilitator.isValidWord(targetWord, false /* ignoreCase */)) { if (dictionaryFacilitator.isValidWord(targetWord, false /* ignoreCase */)) {
return createAndGetLanguageModelParamOfWord(prevWordsInfo, targetWord, timestamp, return createAndGetLanguageModelParamOfWord(prevWordsInfo, targetWord, timestamp,
true /* isValidWord */, locale); true /* isValidWord */, locale, distracterFilter);
} }
final String lowerCaseTargetWord = targetWord.toLowerCase(locale); final String lowerCaseTargetWord = targetWord.toLowerCase(locale);
if (dictionaryFacilitator.isValidWord(lowerCaseTargetWord, false /* ignoreCase */)) { if (dictionaryFacilitator.isValidWord(lowerCaseTargetWord, false /* ignoreCase */)) {
// Add the lower-cased word. // Add the lower-cased word.
return createAndGetLanguageModelParamOfWord(prevWordsInfo, lowerCaseTargetWord, return createAndGetLanguageModelParamOfWord(prevWordsInfo, lowerCaseTargetWord,
timestamp, true /* isValidWord */, locale); timestamp, true /* isValidWord */, locale, distracterFilter);
} }
// Treat the word as an OOV word. The following statement checks whether this OOV // Treat the word as an OOV word.
// is a distracter to words in dictionaries. Being a distracter means the OOV word is
// too close to a common word in dictionaries (e.g., the OOV "mot" is very close to "not").
// Adding such a word to dictionaries would interfere with entering in-dictionary words. For
// example, adding "mot" to dictionaries might interfere with entering "not".
// This kind of OOV should be filtered out.
if (distracterFilter.isDistracterToWordsInDictionaries(prevWordsInfo, targetWord, locale)) {
return null;
}
return createAndGetLanguageModelParamOfWord(prevWordsInfo, targetWord, timestamp, return createAndGetLanguageModelParamOfWord(prevWordsInfo, targetWord, timestamp,
false /* isValidWord */, locale); false /* isValidWord */, locale, distracterFilter);
} }
private static LanguageModelParam createAndGetLanguageModelParamOfWord( private static LanguageModelParam createAndGetLanguageModelParamOfWord(
final PrevWordsInfo prevWordsInfo, final String targetWord, final int timestamp, final PrevWordsInfo prevWordsInfo, final String targetWord, final int timestamp,
final boolean isValidWord, final Locale locale) { final boolean isValidWord, final Locale locale,
final DistracterFilter distracterFilter) {
final String word; final String word;
if (StringUtils.getCapitalizationType(targetWord) == StringUtils.CAPITALIZE_FIRST if (StringUtils.getCapitalizationType(targetWord) == StringUtils.CAPITALIZE_FIRST
&& prevWordsInfo.mPrevWord == null && !isValidWord) { && prevWordsInfo.mPrevWord == null && !isValidWord) {
@ -168,6 +158,13 @@ public final class LanguageModelParam {
} else { } else {
word = targetWord; word = targetWord;
} }
// Check whether the word is a distracter to words in the dictionaries.
if (distracterFilter.isDistracterToWordsInDictionaries(prevWordsInfo, word, locale)) {
if (DEBUG) {
Log.d(TAG, "The word (" + word + ") is a distracter. Skip this word.");
}
return null;
}
final int unigramProbability = isValidWord ? final int unigramProbability = isValidWord ?
UNIGRAM_PROBABILITY_FOR_VALID_WORD : UNIGRAM_PROBABILITY_FOR_OOV_WORD; UNIGRAM_PROBABILITY_FOR_VALID_WORD : UNIGRAM_PROBABILITY_FOR_OOV_WORD;
if (prevWordsInfo.mPrevWord == null) { if (prevWordsInfo.mPrevWord == null) {