diff --git a/java/src/com/android/inputmethod/research/FixedLogBuffer.java b/java/src/com/android/inputmethod/research/FixedLogBuffer.java index 9613c2db2..777111947 100644 --- a/java/src/com/android/inputmethod/research/FixedLogBuffer.java +++ b/java/src/com/android/inputmethod/research/FixedLogBuffer.java @@ -81,7 +81,7 @@ public class FixedLogBuffer extends LogBuffer { return logUnit; } - private void shiftOutThroughFirstWord() { + public void shiftOutThroughFirstWord() { final LinkedList logUnits = getLogUnits(); while (!logUnits.isEmpty()) { final LogUnit logUnit = logUnits.removeFirst(); diff --git a/java/src/com/android/inputmethod/research/MainLogBuffer.java b/java/src/com/android/inputmethod/research/MainLogBuffer.java index 898a042d6..a8f255a41 100644 --- a/java/src/com/android/inputmethod/research/MainLogBuffer.java +++ b/java/src/com/android/inputmethod/research/MainLogBuffer.java @@ -26,18 +26,42 @@ import java.util.LinkedList; import java.util.Random; /** - * Provide a log buffer of fixed length that enforces privacy restrictions. + * MainLogBuffer is a FixedLogBuffer that tracks the state of LogUnits to make privacy guarantees. * - * The privacy restrictions include making sure that no numbers are logged, that all logged words - * are in the dictionary, and that words are recorded infrequently enough that the user's meaning - * cannot be easily determined. + * There are three forms of privacy protection: 1) only words in the main dictionary are allowed to + * be logged in enough detail to determine their contents, 2) only a subset of words are logged + * in detail, such as 10%, and 3) no numbers are logged. + * + * This class maintains a list of LogUnits, each corresponding to a word. As the user completes + * words, they are added here. But if the user backs up over their current word to edit a word + * entered earlier, then it is pulled out of this LogBuffer, changes are then added to the end of + * the LogUnit, and it is pushed back in here when the user is done. Because words may be pulled + * back out even after they are pushed in, we must not publish the contents of this LogBuffer too + * quickly. However, we cannot let the contents pile up either, or it will limit the editing that + * a user can perform. + * + * To balance these requirements (keep history so user can edit, flush history so it does not pile + * up), the LogBuffer is considered "complete" when the user has entered enough words to form an + * n-gram, followed by enough additional non-detailed words (that are in the 90%, as per above). + * Once complete, the n-gram may be published to flash storage (via the ResearchLog class). + * However, the additional non-detailed words are retained, in case the user backspaces to edit + * them. The MainLogBuffer then continues to add words, publishing individual non-detailed words + * as new words arrive. After enough non-detailed words have been pushed out to account for the + * 90% between words, the words at the front of the LogBuffer can be published as an n-gram again. + * + * If the words that would form the valid n-gram are not in the dictionary, then words are pushed + * through the LogBuffer one at a time until an n-gram is found that is entirely composed of + * dictionary words. + * + * If the user closes a session, then the entire LogBuffer is flushed, publishing any embedded + * n-gram containing dictionary words. */ public class MainLogBuffer extends FixedLogBuffer { private static final String TAG = MainLogBuffer.class.getSimpleName(); private static final boolean DEBUG = false && ProductionFlag.IS_EXPERIMENTAL_DEBUG; // The size of the n-grams logged. E.g. N_GRAM_SIZE = 2 means to sample bigrams. - private static final int N_GRAM_SIZE = 2; + public static final int N_GRAM_SIZE = 2; // The number of words between n-grams to omit from the log. If debugging, record 50% of all // words. Otherwise, only record 10%. private static final int DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES = @@ -46,49 +70,31 @@ public class MainLogBuffer extends FixedLogBuffer { private final ResearchLog mResearchLog; private Suggest mSuggest; - // The minimum periodicity with which n-grams can be sampled. E.g. mWinWordPeriod is 10 if - // every 10th bigram is sampled, i.e., words 1-8 are not, but the bigram at words 9 and 10, etc. - // for 11-18, and the bigram at words 19 and 20. If an n-gram is not safe (e.g. it contains a - // number in the middle or an out-of-vocabulary word), then sampling is delayed until a safe - // n-gram does appear. - /* package for test */ int mMinWordPeriod; + /* package for test */ int mNumWordsBetweenNGrams; // Counter for words left to suppress before an n-gram can be sampled. Reset to mMinWordPeriod // after a sample is taken. - /* package for test */ int mWordsUntilSafeToSample; + /* package for test */ int mNumWordsUntilSafeToSample; public MainLogBuffer(final ResearchLog researchLog) { - super(N_GRAM_SIZE); + super(N_GRAM_SIZE + DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES); mResearchLog = researchLog; - mMinWordPeriod = DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES + N_GRAM_SIZE; + mNumWordsBetweenNGrams = DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES; final Random random = new Random(); - mWordsUntilSafeToSample = random.nextInt(mMinWordPeriod); + mNumWordsUntilSafeToSample = DEBUG ? 0 : random.nextInt(mNumWordsBetweenNGrams + 1); } public void setSuggest(final Suggest suggest) { mSuggest = suggest; } - @Override - public void shiftIn(final LogUnit newLogUnit) { - super.shiftIn(newLogUnit); - if (newLogUnit.hasWord()) { - if (mWordsUntilSafeToSample > 0) { - mWordsUntilSafeToSample--; - } - } - if (DEBUG) { - Log.d(TAG, "shiftedIn " + (newLogUnit.hasWord() ? newLogUnit.getWord() : "")); - } - } - public void resetWordCounter() { - mWordsUntilSafeToSample = mMinWordPeriod; + mNumWordsUntilSafeToSample = mNumWordsBetweenNGrams; } /** - * Determines whether the content of the MainLogBuffer can be safely uploaded in its complete - * form and still protect the user's privacy. + * Determines whether uploading the n words at the front the MainLogBuffer will not violate + * user privacy. * * The size of the MainLogBuffer is just enough to hold one n-gram, its corrections, and any * non-character data that is typed between words. The decision about privacy is made based on @@ -97,10 +103,10 @@ public class MainLogBuffer extends FixedLogBuffer { * the screen orientation and other characteristics about the device can be uploaded without * revealing much about the user. */ - public boolean isSafeToLog() { + public boolean isNGramSafe() { // Check that we are not sampling too frequently. Having sampled recently might disclose // too much of the user's intended meaning. - if (mWordsUntilSafeToSample > 0) { + if (mNumWordsUntilSafeToSample > 0) { return false; } if (mSuggest == null || !mSuggest.hasMainDictionary()) { @@ -119,8 +125,8 @@ public class MainLogBuffer extends FixedLogBuffer { // complete buffer contents in detail. final LinkedList logUnits = getLogUnits(); final int length = logUnits.size(); - int wordsFound = 0; - for (int i = 0; i < length; i++) { + int wordsNeeded = N_GRAM_SIZE; + for (int i = 0; i < length && wordsNeeded > 0; i++) { final LogUnit logUnit = logUnits.get(i); final String word = logUnit.getWord(); if (word == null) { @@ -136,26 +142,41 @@ public class MainLogBuffer extends FixedLogBuffer { + ", isValid: " + (dictionary.isValidWord(word))); } return false; - } else { - wordsFound++; } } } - if (wordsFound < N_GRAM_SIZE) { - // Not enough words. Not unsafe, but reject anyway. - if (DEBUG) { - Log.d(TAG, "not enough words"); - } - return false; - } // All checks have passed; this buffer's content can be safely uploaded. return true; } + public boolean isNGramComplete() { + final LinkedList logUnits = getLogUnits(); + final int length = logUnits.size(); + int wordsNeeded = N_GRAM_SIZE; + for (int i = 0; i < length && wordsNeeded > 0; i++) { + final LogUnit logUnit = logUnits.get(i); + final String word = logUnit.getWord(); + if (word != null) { + wordsNeeded--; + } + } + return wordsNeeded == 0; + } + @Override protected void onShiftOut(final LogUnit logUnit) { if (mResearchLog != null) { - mResearchLog.publish(logUnit, false /* isIncludingPrivateData */); + mResearchLog.publish(logUnit, + ResearchLogger.IS_LOGGING_EVERYTHING /* isIncludingPrivateData */); + } + if (logUnit.hasWord()) { + if (mNumWordsUntilSafeToSample > 0) { + mNumWordsUntilSafeToSample--; + Log.d(TAG, "wordsUntilSafeToSample now at " + mNumWordsUntilSafeToSample); + } + } + if (DEBUG) { + Log.d(TAG, "shiftedOut " + (logUnit.hasWord() ? logUnit.getWord() : "")); } } } diff --git a/java/src/com/android/inputmethod/research/ResearchLogger.java b/java/src/com/android/inputmethod/research/ResearchLogger.java index b61db272c..f464facf4 100644 --- a/java/src/com/android/inputmethod/research/ResearchLogger.java +++ b/java/src/com/android/inputmethod/research/ResearchLogger.java @@ -85,7 +85,7 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang private static final String TAG = ResearchLogger.class.getSimpleName(); private static final boolean DEBUG = false && ProductionFlag.IS_EXPERIMENTAL_DEBUG; // Whether all n-grams should be logged. true will disclose private info. - private static final boolean IS_LOGGING_EVERYTHING = false + public static final boolean IS_LOGGING_EVERYTHING = false && ProductionFlag.IS_EXPERIMENTAL_DEBUG; // Whether the TextView contents are logged at the end of the session. true will disclose // private info. @@ -394,8 +394,16 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang commitCurrentLogUnit(); if (mMainLogBuffer != null) { - publishLogBuffer(mMainLogBuffer, mMainResearchLog, - IS_LOGGING_EVERYTHING /* isIncludingPrivateData */); + while (!mMainLogBuffer.isEmpty()) { + if ((mMainLogBuffer.isNGramSafe() || IS_LOGGING_EVERYTHING) && + mMainResearchLog != null) { + publishLogBuffer(mMainLogBuffer, mMainResearchLog, + true /* isIncludingPrivateData */); + mMainLogBuffer.resetWordCounter(); + } else { + mMainLogBuffer.shiftOutThroughFirstWord(); + } + } mMainResearchLog.close(null /* callback */); mMainLogBuffer = null; } @@ -702,8 +710,9 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang } if (!mCurrentLogUnit.isEmpty()) { if (mMainLogBuffer != null) { - if ((mMainLogBuffer.isSafeToLog() || IS_LOGGING_EVERYTHING) - && mMainResearchLog != null) { + if ((mMainLogBuffer.isNGramSafe() || IS_LOGGING_EVERYTHING) && + mMainLogBuffer.isNGramComplete() && + mMainResearchLog != null) { publishLogBuffer(mMainLogBuffer, mMainResearchLog, true /* isIncludingPrivateData */); mMainLogBuffer.resetWordCounter(); @@ -714,6 +723,10 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang mFeedbackLogBuffer.shiftIn(mCurrentLogUnit); } mCurrentLogUnit = new LogUnit(); + } else { + if (DEBUG) { + Log.d(TAG, "Warning: tried to commit empty log unit."); + } } } @@ -756,8 +769,8 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang mFeedbackLogBuffer.unshiftIn(); } if (DEBUG) { - Log.d(TAG, "uncommitCurrentLogUnit back to " + (mCurrentLogUnit.hasWord() - ? ": '" + mCurrentLogUnit.getWord() + "'" : "")); + Log.d(TAG, "uncommitCurrentLogUnit (dump=" + dumpCurrentLogUnit + ") back to " + + (mCurrentLogUnit.hasWord() ? ": '" + mCurrentLogUnit.getWord() + "'" : "")); } } @@ -773,12 +786,16 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang isIncludingPrivateData); researchLog.publish(openingLogUnit, true /* isIncludingPrivateData */); LogUnit logUnit; - while ((logUnit = logBuffer.shiftOut()) != null) { + int numWordsToPublish = MainLogBuffer.N_GRAM_SIZE; + while ((logUnit = logBuffer.shiftOut()) != null && numWordsToPublish > 0) { if (DEBUG) { Log.d(TAG, "publishLogBuffer: " + (logUnit.hasWord() ? logUnit.getWord() : "")); } researchLog.publish(logUnit, isIncludingPrivateData); + if (logUnit.getWord() != null) { + numWordsToPublish--; + } } final LogUnit closingLogUnit = new LogUnit(); closingLogUnit.addLogStatement(LOGSTATEMENT_LOG_SEGMENT_CLOSING, @@ -1254,9 +1271,12 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang public static void latinIME_revertCommit(final String committedWord, final String originallyTypedWord, final boolean isBatchMode) { final ResearchLogger researchLogger = getInstance(); - final LogUnit logUnit = researchLogger.mMainLogBuffer.peekLastLogUnit(); + // Assume that mCurrentLogUnit has been restored to contain the reverted word. + final LogUnit logUnit = researchLogger.mCurrentLogUnit; if (originallyTypedWord.length() > 0 && hasLetters(originallyTypedWord)) { if (logUnit != null) { + // Probably not necessary, but setting as a precaution in case the word isn't + // committed later. logUnit.setWord(originallyTypedWord); } }