From 01c1a1a2ba4a75ae3e52196a45e13fee598c5b09 Mon Sep 17 00:00:00 2001 From: "Tadashi G. Takaoka" Date: Wed, 15 Jan 2014 17:44:52 +0900 Subject: [PATCH] Refactor LanguageModelParam class a bit This change must be checked in together with I9ecfd1adc1. Change-Id: I26f8c17d6347b75dba6abb3d73239f4052327dae --- .../latin/utils/LanguageModelParam.java | 113 ++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/java/src/com/android/inputmethod/latin/utils/LanguageModelParam.java b/java/src/com/android/inputmethod/latin/utils/LanguageModelParam.java index 663b6cdbe..3a5af4b05 100644 --- a/java/src/com/android/inputmethod/latin/utils/LanguageModelParam.java +++ b/java/src/com/android/inputmethod/latin/utils/LanguageModelParam.java @@ -16,9 +16,29 @@ package com.android.inputmethod.latin.utils; +import android.util.Log; + import com.android.inputmethod.latin.Dictionary; +import com.android.inputmethod.latin.DictionaryFacilitatorForSuggest; +import com.android.inputmethod.latin.settings.SpacingAndPunctuations; + +import java.util.ArrayList; +import java.util.Locale; public class LanguageModelParam { + private static final String TAG = LanguageModelParam.class.getSimpleName(); + private static final boolean DEBUG = false; + private static final boolean DEBUG_TOKEN = false; + + // For now, these probability values are being referred to only when we add new entries to + // decaying dynamic binary dictionaries. When these are referred to, what matters is 0 or + // non-0. Thus, it's not meaningful to compare 10, 100, and so on. + // TODO: Revise the logic in ForgettingCurveUtils in native code. + private static final int UNIGRAM_PROBABILITY_FOR_VALID_WORD = 100; + private static final int UNIGRAM_PROBABILITY_FOR_OOV_WORD = 10; + private static final int BIGRAM_PROBABILITY_FOR_VALID_WORD = 0; + private static final int BIGRAM_PROBABILITY_FOR_OOV_WORD = 0; + public final String mTargetWord; public final int[] mWord0; public final int[] mWord1; @@ -53,4 +73,97 @@ public class LanguageModelParam { mIsBlacklisted = false; mTimestamp = timestamp; } + + // Process a list of words and return a list of {@link LanguageModelParam} objects. + public static ArrayList createLanguageModelParamsFrom( + final ArrayList tokens, final int timestamp, + final DictionaryFacilitatorForSuggest dictionaryFacilitator, + final SpacingAndPunctuations spacingAndPunctuations) { + final ArrayList languageModelParams = + CollectionUtils.newArrayList(); + final int N = tokens.size(); + String prevWord = null; + for (int i = 0; i < N; ++i) { + final String tempWord = tokens.get(i); + if (StringUtils.isEmptyStringOrWhiteSpaces(tempWord)) { + // just skip this token + if (DEBUG_TOKEN) { + Log.d(TAG, "--- isEmptyStringOrWhiteSpaces: \"" + tempWord + "\""); + } + continue; + } + if (!DictionaryInfoUtils.looksValidForDictionaryInsertion( + tempWord, spacingAndPunctuations)) { + if (DEBUG_TOKEN) { + Log.d(TAG, "--- not looksValidForDictionaryInsertion: \"" + + tempWord + "\""); + } + // Sentence terminator found. Split. + prevWord = null; + continue; + } + if (DEBUG_TOKEN) { + Log.d(TAG, "--- word: \"" + tempWord + "\""); + } + final LanguageModelParam languageModelParam = + detectWhetherVaildWordOrNotAndGetLanguageModelParam( + prevWord, tempWord, timestamp, dictionaryFacilitator); + languageModelParams.add(languageModelParam); + prevWord = languageModelParam.mTargetWord; + } + return languageModelParams; + } + + private static LanguageModelParam detectWhetherVaildWordOrNotAndGetLanguageModelParam( + final String prevWord, final String targetWord, final int timestamp, + final DictionaryFacilitatorForSuggest dictionaryFacilitator) { + final Locale locale = dictionaryFacilitator.mLocale; + if (!dictionaryFacilitator.isValidWord(targetWord, true /* ignoreCase */)) { + // OOV word. + return createAndGetLanguageModelParamOfWord(prevWord, targetWord, timestamp, + false /* isValidWord */, locale); + } + if (dictionaryFacilitator.isValidWord(targetWord, false /* ignoreCase */)) { + return createAndGetLanguageModelParamOfWord(prevWord, targetWord, timestamp, + true /* isValidWord */, locale); + } + final String lowerCaseTargetWord = targetWord.toLowerCase(locale); + if (dictionaryFacilitator.isValidWord(lowerCaseTargetWord, false /* ignoreCase */)) { + // Add the lower-cased word. + return createAndGetLanguageModelParamOfWord(prevWord, lowerCaseTargetWord, + timestamp, true /* isValidWord */, locale); + } + // Treat the word as an OOV word. + return createAndGetLanguageModelParamOfWord(prevWord, targetWord, timestamp, + false /* isValidWord */, locale); + } + + private static LanguageModelParam createAndGetLanguageModelParamOfWord( + final String prevWord, final String targetWord, final int timestamp, + final boolean isValidWord, final Locale locale) { + final String word; + if (StringUtils.getCapitalizationType(targetWord) == StringUtils.CAPITALIZE_FIRST + && prevWord == null && !isValidWord) { + word = targetWord.toLowerCase(locale); + } else { + word = targetWord; + } + final int unigramProbability = isValidWord ? + UNIGRAM_PROBABILITY_FOR_VALID_WORD : UNIGRAM_PROBABILITY_FOR_OOV_WORD; + if (prevWord == null) { + if (DEBUG) { + Log.d(TAG, "--- add unigram: current(" + + (isValidWord ? "Valid" : "OOV") + ") = " + word); + } + return new LanguageModelParam(word, unigramProbability, timestamp); + } + if (DEBUG) { + Log.d(TAG, "--- add bigram: prev = " + prevWord + ", current(" + + (isValidWord ? "Valid" : "OOV") + ") = " + word); + } + final int bigramProbability = isValidWord ? + BIGRAM_PROBABILITY_FOR_VALID_WORD : BIGRAM_PROBABILITY_FOR_OOV_WORD; + return new LanguageModelParam(prevWord, word, unigramProbability, + bigramProbability, timestamp); + } }