From ddfaeff544c77b1d42857ebf34ca2545e8868e13 Mon Sep 17 00:00:00 2001
From: Keisuke Kuroyanagi
Date: Mon, 22 Sep 2014 18:18:50 +0900
Subject: [PATCH] Prepare supporting n-gram for user history dictionary.

Bug:17097992
Change-Id: Ic8bfde3d4cc0e720bf7681e08e16fb2ad94d5670
---
 .../inputmethod/latin/PrevWordsInfo.java      |  7 ++++
 .../UserHistoryDictionary.java                | 35 ++++++++++++-------
 .../v4/ver4_patricia_trie_policy.cpp          |  3 ++
 3 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/java/src/com/android/inputmethod/latin/PrevWordsInfo.java b/java/src/com/android/inputmethod/latin/PrevWordsInfo.java
index 76d4f57da..1b7e8f96d 100644
--- a/java/src/com/android/inputmethod/latin/PrevWordsInfo.java
+++ b/java/src/com/android/inputmethod/latin/PrevWordsInfo.java
@@ -126,6 +126,13 @@ public class PrevWordsInfo {
         }
     }
 
+    public PrevWordsInfo getTrimmedPrevWordsInfo(final int maxPrevWordCount) {
+        final int newSize = Math.min(maxPrevWordCount, mPrevWordsInfo.length);
+        // TODO: Quit creating a new array.
+        final WordInfo[] prevWordsInfo = Arrays.copyOf(mPrevWordsInfo, newSize);
+        return new PrevWordsInfo(prevWordsInfo);
+    }
+
     public int getPrevWordCount() {
         return mPrevWordsInfo.length;
     }
diff --git a/java/src/com/android/inputmethod/latin/personalization/UserHistoryDictionary.java b/java/src/com/android/inputmethod/latin/personalization/UserHistoryDictionary.java
index 34d4d4ed7..d1486f630 100644
--- a/java/src/com/android/inputmethod/latin/personalization/UserHistoryDictionary.java
+++ b/java/src/com/android/inputmethod/latin/personalization/UserHistoryDictionary.java
@@ -35,6 +35,7 @@ import java.util.Locale;
  */
 public class UserHistoryDictionary extends DecayingExpandableBinaryDictionaryBase {
     /* package */ static final String NAME = UserHistoryDictionary.class.getSimpleName();
+    private final static int SUPPORTED_NGRAM = 2; // TODO: 3
 
     // TODO: Make this constructor private
     /* package */ UserHistoryDictionary(final Context context, final Locale locale) {
@@ -61,9 +62,7 @@ public class UserHistoryDictionary extends DecayingExpandableBinaryDictionaryBas
     public static void addToDictionary(final ExpandableBinaryDictionary userHistoryDictionary,
             final PrevWordsInfo prevWordsInfo, final String word, final boolean isValid,
             final int timestamp, final DistracterFilter distracterFilter) {
-        final CharSequence prevWord = prevWordsInfo.mPrevWordsInfo[0].mWord;
-        if (word.length() > Constants.DICTIONARY_MAX_WORD_LENGTH ||
-                (prevWord != null && prevWord.length() > Constants.DICTIONARY_MAX_WORD_LENGTH)) {
+        if (word.length() > Constants.DICTIONARY_MAX_WORD_LENGTH) {
             return;
         }
         final int frequency = isValid ?
@@ -71,17 +70,29 @@ public class UserHistoryDictionary extends DecayingExpandableBinaryDictionaryBas
         userHistoryDictionary.addUnigramEntryWithCheckingDistracter(word, frequency,
                 null /* shortcutTarget */, 0 /* shortcutFreq */, false /* isNotAWord */,
                 false /* isBlacklisted */, timestamp, distracterFilter);
-        // Do not insert a word as a bigram of itself
-        if (TextUtils.equals(word, prevWord)) {
-            return;
-        }
-        if (null != prevWord) {
-            if (prevWordsInfo.mPrevWordsInfo[0].mIsBeginningOfSentence) {
-                // Beginning-of-Sentence n-gram entry is treated as a n-gram entry of invalid word.
-                userHistoryDictionary.addNgramEntry(prevWordsInfo, word,
+
+        final boolean isBeginningOfSentenceContext =
+                prevWordsInfo.mPrevWordsInfo[0].mIsBeginningOfSentence;
+        final PrevWordsInfo prevWordsInfoToBeSaved =
+                prevWordsInfo.getTrimmedPrevWordsInfo(SUPPORTED_NGRAM - 1);
+        for (int i = 0; i < prevWordsInfoToBeSaved.getPrevWordCount(); i++) {
+            final CharSequence prevWord = prevWordsInfoToBeSaved.mPrevWordsInfo[i].mWord;
+            if (prevWord == null || (prevWord.length() > Constants.DICTIONARY_MAX_WORD_LENGTH)) {
+                return;
+            }
+            // Do not insert a word as a bigram of itself
+            if (i == 0 && TextUtils.equals(word, prevWord)) {
+                return;
+            }
+            if (isBeginningOfSentenceContext) {
+                // Beginning-of-Sentence n-gram entry is added as an n-gram entry of an OOV word.
+                userHistoryDictionary.addNgramEntry(
+                        prevWordsInfoToBeSaved.getTrimmedPrevWordsInfo(i + 1), word,
                         FREQUENCY_FOR_WORDS_NOT_IN_DICTS, timestamp);
             } else {
-                userHistoryDictionary.addNgramEntry(prevWordsInfo, word, frequency, timestamp);
+                userHistoryDictionary.addNgramEntry(
+                        prevWordsInfoToBeSaved.getTrimmedPrevWordsInfo(i + 1), word, frequency,
+                        timestamp);
             }
         }
     }
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
index 8d4135679..cca0c2924 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
@@ -166,6 +166,9 @@ void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordI
         for (const auto entry : languageModelDictContent->getProbabilityEntries(
                 prevWordIds.limit(i))) {
             const ProbabilityEntry &probabilityEntry = entry.getProbabilityEntry();
+            if (!probabilityEntry.isValid()) {
+                continue;
+            }
             const int probability = probabilityEntry.hasHistoricalInfo() ?
                     ForgettingCurveUtils::decodeProbability(
                             probabilityEntry.getHistoricalInfo(), mHeaderPolicy)