2012-08-03 03:22:29 +00:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2012 The Android Open Source Project
|
|
|
|
*
|
|
|
|
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
|
|
|
* use this file except in compliance with the License. You may obtain a copy of
|
|
|
|
* the License at
|
|
|
|
*
|
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
*
|
|
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
|
|
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
|
|
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
|
|
* License for the specific language governing permissions and limitations under
|
|
|
|
* the License.
|
|
|
|
*/
|
|
|
|
|
|
|
|
package com.android.inputmethod.research;
|
|
|
|
|
2012-08-09 19:20:45 +00:00
|
|
|
import android.util.Log;
|
|
|
|
|
2012-08-03 03:22:29 +00:00
|
|
|
import com.android.inputmethod.latin.Dictionary;
|
|
|
|
import com.android.inputmethod.latin.Suggest;
|
2012-12-18 02:19:58 +00:00
|
|
|
import com.android.inputmethod.latin.define.ProductionFlag;
|
2012-08-03 03:22:29 +00:00
|
|
|
|
2012-12-23 18:40:34 +00:00
|
|
|
import java.util.LinkedList;
|
2012-08-03 03:22:29 +00:00
|
|
|
import java.util.Random;
|
|
|
|
|
2012-12-23 18:40:34 +00:00
|
|
|
/**
|
|
|
|
* Provide a log buffer of fixed length that enforces privacy restrictions.
|
|
|
|
*
|
|
|
|
* The privacy restrictions include making sure that no numbers are logged, that all logged words
|
|
|
|
* are in the dictionary, and that words are recorded infrequently enough that the user's meaning
|
|
|
|
* cannot be easily determined.
|
|
|
|
*/
|
|
|
|
public class MainLogBuffer extends FixedLogBuffer {
|
2012-08-09 19:20:45 +00:00
|
|
|
private static final String TAG = MainLogBuffer.class.getSimpleName();
|
2012-12-18 02:19:58 +00:00
|
|
|
private static final boolean DEBUG = false && ProductionFlag.IS_EXPERIMENTAL_DEBUG;
|
2012-08-09 19:20:45 +00:00
|
|
|
|
2012-08-03 03:22:29 +00:00
|
|
|
// The size of the n-grams logged. E.g. N_GRAM_SIZE = 2 means to sample bigrams.
|
|
|
|
private static final int N_GRAM_SIZE = 2;
|
2012-12-23 18:40:34 +00:00
|
|
|
// The number of words between n-grams to omit from the log. If debugging, record 50% of all
|
|
|
|
// words. Otherwise, only record 10%.
|
2012-12-18 02:19:58 +00:00
|
|
|
private static final int DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES =
|
|
|
|
ProductionFlag.IS_EXPERIMENTAL_DEBUG ? 2 : 18;
|
2012-08-03 03:22:29 +00:00
|
|
|
|
|
|
|
private final ResearchLog mResearchLog;
|
|
|
|
private Suggest mSuggest;
|
|
|
|
|
|
|
|
// The minimum periodicity with which n-grams can be sampled. E.g. mWinWordPeriod is 10 if
|
|
|
|
// every 10th bigram is sampled, i.e., words 1-8 are not, but the bigram at words 9 and 10, etc.
|
|
|
|
// for 11-18, and the bigram at words 19 and 20. If an n-gram is not safe (e.g. it contains a
|
|
|
|
// number in the middle or an out-of-vocabulary word), then sampling is delayed until a safe
|
|
|
|
// n-gram does appear.
|
|
|
|
/* package for test */ int mMinWordPeriod;
|
|
|
|
|
|
|
|
// Counter for words left to suppress before an n-gram can be sampled. Reset to mMinWordPeriod
|
|
|
|
// after a sample is taken.
|
|
|
|
/* package for test */ int mWordsUntilSafeToSample;
|
|
|
|
|
|
|
|
public MainLogBuffer(final ResearchLog researchLog) {
|
|
|
|
super(N_GRAM_SIZE);
|
|
|
|
mResearchLog = researchLog;
|
|
|
|
mMinWordPeriod = DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES + N_GRAM_SIZE;
|
|
|
|
final Random random = new Random();
|
|
|
|
mWordsUntilSafeToSample = random.nextInt(mMinWordPeriod);
|
|
|
|
}
|
|
|
|
|
2012-12-23 18:40:34 +00:00
|
|
|
public void setSuggest(final Suggest suggest) {
|
2012-08-03 03:22:29 +00:00
|
|
|
mSuggest = suggest;
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public void shiftIn(final LogUnit newLogUnit) {
|
|
|
|
super.shiftIn(newLogUnit);
|
|
|
|
if (newLogUnit.hasWord()) {
|
|
|
|
if (mWordsUntilSafeToSample > 0) {
|
|
|
|
mWordsUntilSafeToSample--;
|
|
|
|
}
|
|
|
|
}
|
2012-08-09 19:20:45 +00:00
|
|
|
if (DEBUG) {
|
|
|
|
Log.d(TAG, "shiftedIn " + (newLogUnit.hasWord() ? newLogUnit.getWord() : ""));
|
|
|
|
}
|
2012-08-03 03:22:29 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
public void resetWordCounter() {
|
|
|
|
mWordsUntilSafeToSample = mMinWordPeriod;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Determines whether the content of the MainLogBuffer can be safely uploaded in its complete
|
|
|
|
* form and still protect the user's privacy.
|
|
|
|
*
|
|
|
|
* The size of the MainLogBuffer is just enough to hold one n-gram, its corrections, and any
|
|
|
|
* non-character data that is typed between words. The decision about privacy is made based on
|
|
|
|
* the buffer's entire content. If it is decided that the privacy risks are too great to upload
|
|
|
|
* the contents of this buffer, a censored version of the LogItems may still be uploaded. E.g.,
|
|
|
|
* the screen orientation and other characteristics about the device can be uploaded without
|
|
|
|
* revealing much about the user.
|
|
|
|
*/
|
|
|
|
public boolean isSafeToLog() {
|
|
|
|
// Check that we are not sampling too frequently. Having sampled recently might disclose
|
|
|
|
// too much of the user's intended meaning.
|
|
|
|
if (mWordsUntilSafeToSample > 0) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
if (mSuggest == null || !mSuggest.hasMainDictionary()) {
|
|
|
|
// Main dictionary is unavailable. Since we cannot check it, we cannot tell if a word
|
|
|
|
// is out-of-vocabulary or not. Therefore, we must judge the entire buffer contents to
|
|
|
|
// potentially pose a privacy risk.
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
// Reload the dictionary in case it has changed (e.g., because the user has changed
|
|
|
|
// languages).
|
|
|
|
final Dictionary dictionary = mSuggest.getMainDictionary();
|
|
|
|
if (dictionary == null) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
// Check each word in the buffer. If any word poses a privacy threat, we cannot upload the
|
|
|
|
// complete buffer contents in detail.
|
2012-12-23 18:40:34 +00:00
|
|
|
final LinkedList<LogUnit> logUnits = getLogUnits();
|
|
|
|
final int length = logUnits.size();
|
2012-12-23 23:12:51 +00:00
|
|
|
int wordsFound = 0;
|
2012-08-03 03:22:29 +00:00
|
|
|
for (int i = 0; i < length; i++) {
|
2012-12-23 18:40:34 +00:00
|
|
|
final LogUnit logUnit = logUnits.get(i);
|
2012-08-03 03:22:29 +00:00
|
|
|
final String word = logUnit.getWord();
|
|
|
|
if (word == null) {
|
|
|
|
// Digits outside words are a privacy threat.
|
2012-08-09 22:58:25 +00:00
|
|
|
if (logUnit.mayContainDigit()) {
|
2012-08-03 03:22:29 +00:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// Words not in the dictionary are a privacy threat.
|
2012-08-10 08:54:06 +00:00
|
|
|
if (ResearchLogger.hasLetters(word) && !(dictionary.isValidWord(word))) {
|
2012-08-12 20:54:53 +00:00
|
|
|
if (DEBUG) {
|
|
|
|
Log.d(TAG, "NOT SAFE!: hasLetters: " + ResearchLogger.hasLetters(word)
|
|
|
|
+ ", isValid: " + (dictionary.isValidWord(word)));
|
|
|
|
}
|
2012-08-03 03:22:29 +00:00
|
|
|
return false;
|
2012-12-23 23:12:51 +00:00
|
|
|
} else {
|
|
|
|
wordsFound++;
|
2012-08-03 03:22:29 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2012-12-23 23:12:51 +00:00
|
|
|
if (wordsFound < N_GRAM_SIZE) {
|
|
|
|
// Not enough words. Not unsafe, but reject anyway.
|
|
|
|
if (DEBUG) {
|
|
|
|
Log.d(TAG, "not enough words");
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
2012-08-03 03:22:29 +00:00
|
|
|
// All checks have passed; this buffer's content can be safely uploaded.
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
2012-12-23 18:40:34 +00:00
|
|
|
protected void onShiftOut(final LogUnit logUnit) {
|
2012-08-03 03:22:29 +00:00
|
|
|
if (mResearchLog != null) {
|
|
|
|
mResearchLog.publish(logUnit, false /* isIncludingPrivateData */);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|