93445b4821
Change-Id: I7290cd1fb675a1b85b9b6ac2d464c932b5bca1dd
287 lines
13 KiB
Java
287 lines
13 KiB
Java
/*
|
|
* Copyright (C) 2012 The Android Open Source Project
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
package com.android.inputmethod.research;
|
|
|
|
import android.util.Log;
|
|
|
|
import com.android.inputmethod.annotations.UsedForTesting;
|
|
import com.android.inputmethod.latin.Dictionary;
|
|
import com.android.inputmethod.latin.Suggest;
|
|
import com.android.inputmethod.latin.define.ProductionFlag;
|
|
|
|
import java.io.IOException;
|
|
import java.util.ArrayList;
|
|
import java.util.LinkedList;
|
|
|
|
/**
|
|
* MainLogBuffer is a FixedLogBuffer that tracks the state of LogUnits to make privacy guarantees.
|
|
*
|
|
* There are three forms of privacy protection: 1) only words in the main dictionary are allowed to
|
|
* be logged in enough detail to determine their contents, 2) only a subset of words are logged
|
|
* in detail, such as 10%, and 3) no numbers are logged.
|
|
*
|
|
* This class maintains a list of LogUnits, each corresponding to a word. As the user completes
|
|
* words, they are added here. But if the user backs up over their current word to edit a word
|
|
* entered earlier, then it is pulled out of this LogBuffer, changes are then added to the end of
|
|
* the LogUnit, and it is pushed back in here when the user is done. Because words may be pulled
|
|
* back out even after they are pushed in, we must not publish the contents of this LogBuffer too
|
|
* quickly. However, we cannot let the contents pile up either, or it will limit the editing that
|
|
* a user can perform.
|
|
*
|
|
* To balance these requirements (keep history so user can edit, flush history so it does not pile
|
|
* up), the LogBuffer is considered "complete" when the user has entered enough words to form an
|
|
* n-gram, followed by enough additional non-detailed words (that are in the 90%, as per above).
|
|
* Once complete, the n-gram may be published to flash storage (via the ResearchLog class).
|
|
* However, the additional non-detailed words are retained, in case the user backspaces to edit
|
|
* them. The MainLogBuffer then continues to add words, publishing individual non-detailed words
|
|
* as new words arrive. After enough non-detailed words have been pushed out to account for the
|
|
* 90% between words, the words at the front of the LogBuffer can be published as an n-gram again.
|
|
*
|
|
* If the words that would form the valid n-gram are not in the dictionary, then words are pushed
|
|
* through the LogBuffer one at a time until an n-gram is found that is entirely composed of
|
|
* dictionary words.
|
|
*
|
|
* If the user closes a session, then the entire LogBuffer is flushed, publishing any embedded
|
|
* n-gram containing dictionary words.
|
|
*/
|
|
public abstract class MainLogBuffer extends FixedLogBuffer {
|
|
private static final String TAG = MainLogBuffer.class.getSimpleName();
|
|
private static final boolean DEBUG = false
|
|
&& ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS_DEBUG;
|
|
|
|
// Keep consistent with switch statement in Statistics.recordPublishabilityResultCode()
|
|
public static final int PUBLISHABILITY_PUBLISHABLE = 0;
|
|
public static final int PUBLISHABILITY_UNPUBLISHABLE_STOPPING = 1;
|
|
public static final int PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT = 2;
|
|
public static final int PUBLISHABILITY_UNPUBLISHABLE_SAMPLED_TOO_RECENTLY = 3;
|
|
public static final int PUBLISHABILITY_UNPUBLISHABLE_DICTIONARY_UNAVAILABLE = 4;
|
|
public static final int PUBLISHABILITY_UNPUBLISHABLE_MAY_CONTAIN_DIGIT = 5;
|
|
public static final int PUBLISHABILITY_UNPUBLISHABLE_NOT_IN_DICTIONARY = 6;
|
|
|
|
// The size of the n-grams logged. E.g. N_GRAM_SIZE = 2 means to sample bigrams.
|
|
public static final int N_GRAM_SIZE = 2;
|
|
|
|
// TODO: Remove dependence on Suggest, and pass in Dictionary as a parameter to an appropriate
|
|
// method.
|
|
private final Suggest mSuggest;
|
|
@UsedForTesting
|
|
private Dictionary mDictionaryForTesting;
|
|
private boolean mIsStopping = false;
|
|
|
|
/* package for test */ int mNumWordsBetweenNGrams;
|
|
|
|
// Counter for words left to suppress before an n-gram can be sampled. Reset to mMinWordPeriod
|
|
// after a sample is taken.
|
|
/* package for test */ int mNumWordsUntilSafeToSample;
|
|
|
|
/**
 * Creates a buffer that samples n-grams of {@link #N_GRAM_SIZE} words.
 *
 * The value handed to the superclass constructor is {@code N_GRAM_SIZE + wordsBetweenSamples}.
 *
 * @param wordsBetweenSamples number of words to let pass between two sampled n-grams
 * @param numInitialWordsToIgnore initial value of the "words until safe to sample" counter;
 *     ignored (treated as 0) when {@code DEBUG} is on
 * @param suggest source of the main dictionary; {@code getDictionary()} tolerates null
 */
public MainLogBuffer(final int wordsBetweenSamples, final int numInitialWordsToIgnore,
        final Suggest suggest) {
    super(N_GRAM_SIZE + wordsBetweenSamples);
    mSuggest = suggest;
    mNumWordsBetweenNGrams = wordsBetweenSamples;
    if (DEBUG) {
        // When debugging, allow sampling right away.
        mNumWordsUntilSafeToSample = 0;
    } else {
        mNumWordsUntilSafeToSample = numInitialWordsToIgnore;
    }
}
|
|
|
|
@UsedForTesting
|
|
/* package for test */ void setDictionaryForTesting(final Dictionary dictionary) {
|
|
mDictionaryForTesting = dictionary;
|
|
}
|
|
|
|
private Dictionary getDictionary() {
|
|
if (mDictionaryForTesting != null) {
|
|
return mDictionaryForTesting;
|
|
}
|
|
if (mSuggest == null || !mSuggest.hasMainDictionary()) return null;
|
|
return mSuggest.getMainDictionary();
|
|
}
|
|
|
|
public void setIsStopping() {
|
|
mIsStopping = true;
|
|
}
|
|
|
|
/**
|
|
* Determines whether the string determined by a series of LogUnits will not violate user
|
|
* privacy if published.
|
|
*
|
|
* @param logUnits a LogUnit list to check for publishability
|
|
* @param nGramSize the smallest n-gram acceptable to be published. if
|
|
* {@link ResearchLogger#IS_LOGGING_EVERYTHING} is true, then publish if there are more than
|
|
* {@code minNGramSize} words in the logUnits, otherwise wait. if {@link
|
|
* ResearchLogger#IS_LOGGING_EVERYTHING} is false, then ensure that there are exactly nGramSize
|
|
* words in the LogUnits.
|
|
*
|
|
* @return one of the {@code PUBLISHABILITY_*} result codes defined in this class.
|
|
*/
|
|
private int getPublishabilityResultCode(final ArrayList<LogUnit> logUnits,
|
|
final int nGramSize) {
|
|
// Bypass privacy checks when debugging.
|
|
if (ResearchLogger.IS_LOGGING_EVERYTHING) {
|
|
if (mIsStopping) {
|
|
return PUBLISHABILITY_UNPUBLISHABLE_STOPPING;
|
|
}
|
|
// Only check that it is the right length. If not, wait for later words to make
|
|
// complete n-grams.
|
|
int numWordsInLogUnitList = 0;
|
|
final int length = logUnits.size();
|
|
for (int i = 0; i < length; i++) {
|
|
final LogUnit logUnit = logUnits.get(i);
|
|
numWordsInLogUnitList += logUnit.getNumWords();
|
|
}
|
|
if (numWordsInLogUnitList >= nGramSize) {
|
|
return PUBLISHABILITY_PUBLISHABLE;
|
|
} else {
|
|
return PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT;
|
|
}
|
|
}
|
|
|
|
// Check that we are not sampling too frequently. Having sampled recently might disclose
|
|
// too much of the user's intended meaning.
|
|
if (mNumWordsUntilSafeToSample > 0) {
|
|
return PUBLISHABILITY_UNPUBLISHABLE_SAMPLED_TOO_RECENTLY;
|
|
}
|
|
// Reload the dictionary in case it has changed (e.g., because the user has changed
|
|
// languages).
|
|
final Dictionary dictionary = getDictionary();
|
|
if (dictionary == null) {
|
|
// Main dictionary is unavailable. Since we cannot check it, we cannot tell if a
|
|
// word is out-of-vocabulary or not. Therefore, we must judge the entire buffer
|
|
// contents to potentially pose a privacy risk.
|
|
return PUBLISHABILITY_UNPUBLISHABLE_DICTIONARY_UNAVAILABLE;
|
|
}
|
|
|
|
// Check each word in the buffer. If any word poses a privacy threat, we cannot upload
|
|
// the complete buffer contents in detail.
|
|
int numWordsInLogUnitList = 0;
|
|
final int length = logUnits.size();
|
|
for (final LogUnit logUnit : logUnits) {
|
|
if (!logUnit.hasOneOrMoreWords()) {
|
|
// Digits outside words are a privacy threat.
|
|
if (logUnit.mayContainDigit()) {
|
|
return PUBLISHABILITY_UNPUBLISHABLE_MAY_CONTAIN_DIGIT;
|
|
}
|
|
} else {
|
|
numWordsInLogUnitList += logUnit.getNumWords();
|
|
final String[] words = logUnit.getWordsAsStringArray();
|
|
for (final String word : words) {
|
|
// Words not in the dictionary are a privacy threat.
|
|
if (ResearchLogger.hasLetters(word) && !(dictionary.isValidWord(word))) {
|
|
if (DEBUG) {
|
|
Log.d(TAG, "\"" + word + "\" NOT SAFE!: hasLetters: "
|
|
+ ResearchLogger.hasLetters(word)
|
|
+ ", isValid: " + (dictionary.isValidWord(word)));
|
|
}
|
|
return PUBLISHABILITY_UNPUBLISHABLE_NOT_IN_DICTIONARY;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Finally, only return true if the ngram is the right size.
|
|
if (numWordsInLogUnitList == nGramSize) {
|
|
return PUBLISHABILITY_PUBLISHABLE;
|
|
} else {
|
|
return PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT;
|
|
}
|
|
}
|
|
|
|
public void shiftAndPublishAll() throws IOException {
|
|
final LinkedList<LogUnit> logUnits = getLogUnits();
|
|
while (!logUnits.isEmpty()) {
|
|
publishLogUnitsAtFrontOfBuffer();
|
|
}
|
|
}
|
|
|
|
@Override
|
|
protected final void onBufferFull() {
|
|
try {
|
|
publishLogUnitsAtFrontOfBuffer();
|
|
} catch (final IOException e) {
|
|
if (DEBUG) {
|
|
Log.w(TAG, "IOException when publishing front of LogBuffer", e);
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* If there is a safe n-gram at the front of this log buffer, publish it with all details, and
|
|
* remove the LogUnits that constitute it.
|
|
*
|
|
* An n-gram might not be "safe" if it violates privacy controls. E.g., it might contain
|
|
* numbers, an out-of-vocabulary word, or another n-gram may have been published recently. If
|
|
* there is no safe n-gram, then the LogUnits up through the first word-containing LogUnit are
|
|
* published, but without disclosing any privacy-related details, such as the word the LogUnit
|
|
* generated, motion data, etc.
|
|
*
|
|
* Note that a LogUnit can hold more than one word if the user types without explicit spaces.
|
|
* In this case, the words may be grouped together in such a way that pulling an n-gram off the
|
|
* front would require splitting a LogUnit. Splitting a LogUnit is not possible, so this case
|
|
* is treated just as the unsafe n-gram case. This may cause n-grams to be sampled at slightly
|
|
* less than the target frequency.
|
|
*/
|
|
protected final void publishLogUnitsAtFrontOfBuffer() throws IOException {
|
|
// TODO: Refactor this method to require fewer passes through the LogUnits. Should really
|
|
// require only one pass.
|
|
ArrayList<LogUnit> logUnits = peekAtFirstNWords(N_GRAM_SIZE);
|
|
final int publishabilityResultCode = getPublishabilityResultCode(logUnits, N_GRAM_SIZE);
|
|
ResearchLogger.recordPublishabilityResultCode(publishabilityResultCode);
|
|
if (publishabilityResultCode == MainLogBuffer.PUBLISHABILITY_PUBLISHABLE) {
|
|
// Good n-gram at the front of the buffer. Publish it, disclosing details.
|
|
publish(logUnits, true /* canIncludePrivateData */);
|
|
shiftOutWords(N_GRAM_SIZE);
|
|
mNumWordsUntilSafeToSample = mNumWordsBetweenNGrams;
|
|
return;
|
|
}
|
|
// No good n-gram at front, and buffer is full. Shift out up through the first logUnit
|
|
// with associated words (or if there is none, all the existing logUnits).
|
|
logUnits.clear();
|
|
LogUnit logUnit = shiftOut();
|
|
while (logUnit != null) {
|
|
logUnits.add(logUnit);
|
|
final int numWords = logUnit.getNumWords();
|
|
if (numWords > 0) {
|
|
mNumWordsUntilSafeToSample = Math.max(0, mNumWordsUntilSafeToSample - numWords);
|
|
break;
|
|
}
|
|
logUnit = shiftOut();
|
|
}
|
|
publish(logUnits, false /* canIncludePrivateData */);
|
|
}
|
|
|
|
/**
|
|
* Called when a list of logUnits should be published.
|
|
*
|
|
* It is the subclass's responsibility to implement the publication.
|
|
*
|
|
* @param logUnits The list of logUnits to be published.
|
|
* @param canIncludePrivateData Whether the private data in the logUnits can be included in
|
|
* publication.
|
|
*
|
|
* @throws IOException if publication to the log file is not possible
|
|
*/
|
|
protected abstract void publish(final ArrayList<LogUnit> logUnits,
|
|
final boolean canIncludePrivateData) throws IOException;
|
|
|
|
@Override
|
|
protected int shiftOutWords(final int numWords) {
|
|
final int numWordsShiftedOut = super.shiftOutWords(numWords);
|
|
mNumWordsUntilSafeToSample = Math.max(0, mNumWordsUntilSafeToSample - numWordsShiftedOut);
|
|
if (DEBUG) {
|
|
Log.d(TAG, "wordsUntilSafeToSample now at " + mNumWordsUntilSafeToSample);
|
|
}
|
|
return numWordsShiftedOut;
|
|
}
|
|
}
|