/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.inputmethod.latin.utils;

import android.util.Log;

import com.android.inputmethod.latin.Dictionary;
import com.android.inputmethod.latin.DictionaryFacilitatorForSuggest;
import com.android.inputmethod.latin.PrevWordsInfo;
import com.android.inputmethod.latin.settings.SpacingAndPunctuations;

import java.util.ArrayList;
import java.util.Locale;

// Note: this class is used as a parameter type of a native method. You should be careful when you
// rename this class or field name. See BinaryDictionary#addMultipleDictionaryEntriesNative().
/**
 * Immutable description of one unigram (or unigram + bigram) entry, together with the static
 * helpers that turn a list of word tokens into such entries.
 */
public final class LanguageModelParam {
    private static final String TAG = LanguageModelParam.class.getSimpleName();
    private static final boolean DEBUG = false;
    private static final boolean DEBUG_TOKEN = false;

    // For now, these probability values are being referred to only when we add new entries to
    // decaying dynamic binary dictionaries. When these are referred to, what matters is 0 or
    // non-0. Thus, it's not meaningful to compare 10, 100, and so on.
    // TODO: Revise the logic in ForgettingCurveUtils in native code.
    private static final int UNIGRAM_PROBABILITY_FOR_VALID_WORD = 100;
    private static final int UNIGRAM_PROBABILITY_FOR_OOV_WORD = Dictionary.NOT_A_PROBABILITY;
    private static final int BIGRAM_PROBABILITY_FOR_VALID_WORD = 10;
    private static final int BIGRAM_PROBABILITY_FOR_OOV_WORD = Dictionary.NOT_A_PROBABILITY;

    /** The target word as a string; same text as {@link #mWord1}. */
    public final String mTargetWord;
    /** Code points of the previous word, or {@code null} for a unigram-only entry. */
    public final int[] mWord0;
    /** Code points of the target word. */
    public final int[] mWord1;
    // TODO: this needs to be a list of shortcuts
    public final int[] mShortcutTarget;
    public final int mUnigramProbability;
    public final int mBigramProbability;
    public final int mShortcutProbability;
    public final boolean mIsNotAWord;
    public final boolean mIsBlacklisted;
    // Time stamp in seconds.
    public final int mTimestamp;

    // Constructor for unigram. TODO: support shortcuts
    /**
     * Creates a unigram-only entry: no previous word, and the bigram probability is set to
     * {@link Dictionary#NOT_A_PROBABILITY}.
     *
     * @param word the target word
     * @param unigramProbability the unigram probability to store
     * @param timestamp the time stamp, in seconds
     */
    public LanguageModelParam(final String word, final int unigramProbability,
            final int timestamp) {
        this(null /* word0 */, word, unigramProbability, Dictionary.NOT_A_PROBABILITY, timestamp);
    }

    // Constructor for unigram and bigram.
    /**
     * Creates an entry for {@code word1}, optionally preceded by {@code word0}.
     *
     * @param word0 the previous word, or {@code null} when there is none
     * @param word1 the target word
     * @param unigramProbability the unigram probability to store
     * @param bigramProbability the bigram probability to store
     * @param timestamp the time stamp, in seconds
     */
    public LanguageModelParam(final String word0, final String word1,
            final int unigramProbability, final int bigramProbability,
            final int timestamp) {
        mTargetWord = word1;
        mWord0 = (word0 == null) ? null : StringUtils.toCodePointArray(word0);
        mWord1 = StringUtils.toCodePointArray(word1);
        // Shortcuts are not supported yet, so the shortcut fields get fixed placeholder values.
        mShortcutTarget = null;
        mUnigramProbability = unigramProbability;
        mBigramProbability = bigramProbability;
        mShortcutProbability = Dictionary.NOT_A_PROBABILITY;
        mIsNotAWord = false;
        mIsBlacklisted = false;
        mTimestamp = timestamp;
    }

    // Process a list of words and return a list of {@link LanguageModelParam} objects.
    /**
     * Walks {@code tokens} in order and builds one {@link LanguageModelParam} per accepted word.
     *
     * <p>Empty or whitespace-only tokens are skipped. A token that does not look valid for
     * dictionary insertion is skipped as well and resets the previous-word context, so the next
     * accepted word is not linked to anything before it. A word for which no param could be
     * created is skipped without touching the previous-word context.
     *
     * @param tokens the word tokens to process
     * @param timestamp the time stamp, in seconds, stored in every created param
     * @param dictionaryFacilitator used to obtain the locale and to check word validity
     * @param spacingAndPunctuations used to decide whether a token looks insertable
     * @param distracterFilter used to reject out-of-vocabulary distracter words
     * @return the created params, in token order; never {@code null}
     */
    public static ArrayList<LanguageModelParam> createLanguageModelParamsFrom(
            final ArrayList<String> tokens, final int timestamp,
            final DictionaryFacilitatorForSuggest dictionaryFacilitator,
            final SpacingAndPunctuations spacingAndPunctuations,
            final DistracterFilter distracterFilter) {
        final ArrayList<LanguageModelParam> languageModelParams =
                CollectionUtils.newArrayList();
        final int tokenCount = tokens.size();
        PrevWordsInfo prevWordsInfo = new PrevWordsInfo(null);
        for (int i = 0; i < tokenCount; ++i) {
            final String tempWord = tokens.get(i);
            if (StringUtils.isEmptyStringOrWhiteSpaces(tempWord)) {
                // just skip this token
                if (DEBUG_TOKEN) {
                    Log.d(TAG, "--- isEmptyStringOrWhiteSpaces: \"" + tempWord + "\"");
                }
                continue;
            }
            if (!DictionaryInfoUtils.looksValidForDictionaryInsertion(
                    tempWord, spacingAndPunctuations)) {
                if (DEBUG_TOKEN) {
                    Log.d(TAG, "--- not looksValidForDictionaryInsertion: \""
                            + tempWord + "\"");
                }
                // Sentence terminator found. Split.
                prevWordsInfo = new PrevWordsInfo(null);
                continue;
            }
            if (DEBUG_TOKEN) {
                Log.d(TAG, "--- word: \"" + tempWord + "\"");
            }
            final LanguageModelParam languageModelParam =
                    detectWhetherValidWordOrNotAndGetLanguageModelParam(
                            prevWordsInfo, tempWord, timestamp, dictionaryFacilitator,
                            distracterFilter);
            if (languageModelParam == null) {
                continue;
            }
            languageModelParams.add(languageModelParam);
            // The word actually stored (possibly lower-cased) becomes the next previous word.
            prevWordsInfo = new PrevWordsInfo(languageModelParam.mTargetWord);
        }
        return languageModelParams;
    }

    /**
     * Classifies {@code targetWord} as in-vocabulary or out-of-vocabulary (OOV) and creates the
     * matching param.
     *
     * @param prevWordsInfo the previous-word context
     * @param targetWord the word to classify
     * @param timestamp the time stamp, in seconds
     * @param dictionaryFacilitator used to obtain the locale and to check word validity
     * @param distracterFilter used to reject OOV distracter words
     * @return the created param, or {@code null} when the facilitator has no locale or the word
     *         is an OOV distracter
     */
    private static LanguageModelParam detectWhetherValidWordOrNotAndGetLanguageModelParam(
            final PrevWordsInfo prevWordsInfo, final String targetWord, final int timestamp,
            final DictionaryFacilitatorForSuggest dictionaryFacilitator,
            final DistracterFilter distracterFilter) {
        final Locale locale = dictionaryFacilitator.getLocale();
        if (locale == null) {
            return null;
        }
        // TODO: Though targetWord is an IV (in-vocabulary) word, we should still apply
        // distracterFilter in the following code. If targetWord is a distracter,
        // it should be filtered out.
        if (dictionaryFacilitator.isValidWord(targetWord, false /* ignoreCase */)) {
            return createAndGetLanguageModelParamOfWord(prevWordsInfo, targetWord, timestamp,
                    true /* isValidWord */, locale);
        }

        final String lowerCaseTargetWord = targetWord.toLowerCase(locale);
        if (dictionaryFacilitator.isValidWord(lowerCaseTargetWord, false /* ignoreCase */)) {
            // Add the lower-cased word.
            return createAndGetLanguageModelParamOfWord(prevWordsInfo, lowerCaseTargetWord,
                    timestamp, true /* isValidWord */, locale);
        }

        // Treat the word as an OOV word. The following statement checks whether this OOV
        // is a distracter to words in dictionaries. Being a distracter means the OOV word is
        // too close to a common word in dictionaries (e.g., the OOV "mot" is very close to "not").
        // Adding such a word to dictionaries would interfere with entering in-dictionary words.
        // For example, adding "mot" to dictionaries might interfere with entering "not".
        // This kind of OOV should be filtered out.
        if (distracterFilter.isDistracterToWordsInDictionaries(prevWordsInfo, targetWord,
                locale)) {
            return null;
        }
        return createAndGetLanguageModelParamOfWord(prevWordsInfo, targetWord, timestamp,
                false /* isValidWord */, locale);
    }

    /**
     * Builds the param for {@code targetWord}: a unigram when there is no previous word,
     * otherwise a unigram plus bigram linked to the previous word.
     *
     * @param prevWordsInfo the previous-word context; its {@code mPrevWord} may be {@code null}
     * @param targetWord the word to store
     * @param timestamp the time stamp, in seconds
     * @param isValidWord whether the word was found valid; selects the probabilities used
     * @param locale the locale used when lower-casing
     * @return the created param
     */
    private static LanguageModelParam createAndGetLanguageModelParamOfWord(
            final PrevWordsInfo prevWordsInfo, final String targetWord, final int timestamp,
            final boolean isValidWord, final Locale locale) {
        final String word;
        // An OOV word with only its first letter capitalized and no previous word is stored
        // lower-cased.
        if (StringUtils.getCapitalizationType(targetWord) == StringUtils.CAPITALIZE_FIRST
                && prevWordsInfo.mPrevWord == null && !isValidWord) {
            word = targetWord.toLowerCase(locale);
        } else {
            word = targetWord;
        }
        final int unigramProbability = isValidWord
                ? UNIGRAM_PROBABILITY_FOR_VALID_WORD : UNIGRAM_PROBABILITY_FOR_OOV_WORD;
        if (prevWordsInfo.mPrevWord == null) {
            if (DEBUG) {
                Log.d(TAG, "--- add unigram: current("
                        + (isValidWord ? "Valid" : "OOV") + ") = " + word);
            }
            return new LanguageModelParam(word, unigramProbability, timestamp);
        }
        if (DEBUG) {
            Log.d(TAG, "--- add bigram: prev = " + prevWordsInfo.mPrevWord + ", current("
                    + (isValidWord ? "Valid" : "OOV") + ") = " + word);
        }
        final int bigramProbability = isValidWord
                ? BIGRAM_PROBABILITY_FOR_VALID_WORD : BIGRAM_PROBABILITY_FOR_OOV_WORD;
        return new LanguageModelParam(prevWordsInfo.mPrevWord, word, unigramProbability,
                bigramProbability, timestamp);
    }
}