[Rlog56] Buffer words before pushing out LogUnit

Previously, a logbuffer only held an n-gram.  Data went in and out of it, FIFO, until privacy
conditions were met (i.e. data not collected too frequently), and then an n-gram was saved.
E.g., if n=2, and only 10% of data is collected, then 18 words went through the logbuffer before
it captured the next 2 words.

However, if a user then went back and edited the n-gram, these edits were not captured.

This change enlarges the logbuffer so that it also temporarily holds data about words that are not
recorded, so that if the user backs up over them, the edits to an n-gram that we do eventually
capture are stored.  In the example above, instead of a logbuffer holding 2 words, it holds 20.  The system
waits until all the words not needed for the n-gram have been gathered (i.e. the buffer is full),
so the user has adequate time to edit, before shifting out the n-gram.  The buffer is still flushed
when the user closes the IME.  See the comment for MainLogBuffer for an explanation.

multi-project commit with I45317bc95eeb859adc1b35b24d0478f2df1a67f3

Change-Id: I4ffd95d08c6437dcf650d866ef9e24b6af512334
main
Kurt Partridge 2013-01-08 11:18:43 -08:00
parent 8bda35809b
commit 403c423940
3 changed files with 96 additions and 55 deletions

View File

@ -81,7 +81,7 @@ public class FixedLogBuffer extends LogBuffer {
return logUnit;
}
private void shiftOutThroughFirstWord() {
public void shiftOutThroughFirstWord() {
final LinkedList<LogUnit> logUnits = getLogUnits();
while (!logUnits.isEmpty()) {
final LogUnit logUnit = logUnits.removeFirst();

View File

@ -26,18 +26,42 @@ import java.util.LinkedList;
import java.util.Random;
/**
* Provide a log buffer of fixed length that enforces privacy restrictions.
* MainLogBuffer is a FixedLogBuffer that tracks the state of LogUnits to make privacy guarantees.
*
* The privacy restrictions include making sure that no numbers are logged, that all logged words
* are in the dictionary, and that words are recorded infrequently enough that the user's meaning
* cannot be easily determined.
* There are three forms of privacy protection: 1) only words in the main dictionary are allowed to
* be logged in enough detail to determine their contents, 2) only a subset of words are logged
* in detail, such as 10%, and 3) no numbers are logged.
*
* This class maintains a list of LogUnits, each corresponding to a word. As the user completes
* words, they are added here. But if the user backs up over their current word to edit a word
* entered earlier, then it is pulled out of this LogBuffer, changes are then added to the end of
* the LogUnit, and it is pushed back in here when the user is done. Because words may be pulled
* back out even after they are pushed in, we must not publish the contents of this LogBuffer too
* quickly. However, we cannot let the contents pile up either, or it will limit the editing that
* a user can perform.
*
* To balance these requirements (keep history so user can edit, flush history so it does not pile
* up), the LogBuffer is considered "complete" when the user has entered enough words to form an
* n-gram, followed by enough additional non-detailed words (that are in the 90%, as per above).
* Once complete, the n-gram may be published to flash storage (via the ResearchLog class).
* However, the additional non-detailed words are retained, in case the user backspaces to edit
* them. The MainLogBuffer then continues to add words, publishing individual non-detailed words
* as new words arrive. After enough non-detailed words have been pushed out to account for the
* 90% between words, the words at the front of the LogBuffer can be published as an n-gram again.
*
* If the words that would form the valid n-gram are not in the dictionary, then words are pushed
* through the LogBuffer one at a time until an n-gram is found that is entirely composed of
* dictionary words.
*
* If the user closes a session, then the entire LogBuffer is flushed, publishing any embedded
* n-gram containing dictionary words.
*/
public class MainLogBuffer extends FixedLogBuffer {
private static final String TAG = MainLogBuffer.class.getSimpleName();
private static final boolean DEBUG = false && ProductionFlag.IS_EXPERIMENTAL_DEBUG;
// The size of the n-grams logged. E.g. N_GRAM_SIZE = 2 means to sample bigrams.
private static final int N_GRAM_SIZE = 2;
public static final int N_GRAM_SIZE = 2;
// The number of words between n-grams to omit from the log. If debugging, record 50% of all
// words. Otherwise, only record 10%.
private static final int DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES =
@ -46,49 +70,31 @@ public class MainLogBuffer extends FixedLogBuffer {
private final ResearchLog mResearchLog;
private Suggest mSuggest;
// The minimum periodicity with which n-grams can be sampled. E.g. mMinWordPeriod is 10 if
// every 10th bigram is sampled, i.e., words 1-8 are not, but the bigram at words 9 and 10, etc.
// for 11-18, and the bigram at words 19 and 20. If an n-gram is not safe (e.g. it contains a
// number in the middle or an out-of-vocabulary word), then sampling is delayed until a safe
// n-gram does appear.
/* package for test */ int mMinWordPeriod;
/* package for test */ int mNumWordsBetweenNGrams;
// Counter for words left to suppress before an n-gram can be sampled. Reset to mMinWordPeriod
// after a sample is taken.
/* package for test */ int mWordsUntilSafeToSample;
/* package for test */ int mNumWordsUntilSafeToSample;
public MainLogBuffer(final ResearchLog researchLog) {
super(N_GRAM_SIZE);
super(N_GRAM_SIZE + DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES);
mResearchLog = researchLog;
mMinWordPeriod = DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES + N_GRAM_SIZE;
mNumWordsBetweenNGrams = DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES;
final Random random = new Random();
mWordsUntilSafeToSample = random.nextInt(mMinWordPeriod);
mNumWordsUntilSafeToSample = DEBUG ? 0 : random.nextInt(mNumWordsBetweenNGrams + 1);
}
/**
 * Sets the Suggest instance held in mSuggest, replacing whatever was stored before.
 *
 * @param suggest the Suggest instance to store.
 */
public void setSuggest(final Suggest suggest) {
    this.mSuggest = suggest;
}
@Override
public void shiftIn(final LogUnit newLogUnit) {
super.shiftIn(newLogUnit);
if (newLogUnit.hasWord()) {
if (mWordsUntilSafeToSample > 0) {
mWordsUntilSafeToSample--;
}
}
if (DEBUG) {
Log.d(TAG, "shiftedIn " + (newLogUnit.hasWord() ? newLogUnit.getWord() : ""));
}
}
public void resetWordCounter() {
mWordsUntilSafeToSample = mMinWordPeriod;
mNumWordsUntilSafeToSample = mNumWordsBetweenNGrams;
}
/**
* Determines whether the content of the MainLogBuffer can be safely uploaded in its complete
* form and still protect the user's privacy.
* Determines whether uploading the n words at the front of the MainLogBuffer will not violate
* user privacy.
*
* The size of the MainLogBuffer is just enough to hold one n-gram, its corrections, and any
* non-character data that is typed between words. The decision about privacy is made based on
@ -97,10 +103,10 @@ public class MainLogBuffer extends FixedLogBuffer {
* the screen orientation and other characteristics about the device can be uploaded without
* revealing much about the user.
*/
public boolean isSafeToLog() {
public boolean isNGramSafe() {
// Check that we are not sampling too frequently. Having sampled recently might disclose
// too much of the user's intended meaning.
if (mWordsUntilSafeToSample > 0) {
if (mNumWordsUntilSafeToSample > 0) {
return false;
}
if (mSuggest == null || !mSuggest.hasMainDictionary()) {
@ -119,8 +125,8 @@ public class MainLogBuffer extends FixedLogBuffer {
// complete buffer contents in detail.
final LinkedList<LogUnit> logUnits = getLogUnits();
final int length = logUnits.size();
int wordsFound = 0;
for (int i = 0; i < length; i++) {
int wordsNeeded = N_GRAM_SIZE;
for (int i = 0; i < length && wordsNeeded > 0; i++) {
final LogUnit logUnit = logUnits.get(i);
final String word = logUnit.getWord();
if (word == null) {
@ -136,26 +142,41 @@ public class MainLogBuffer extends FixedLogBuffer {
+ ", isValid: " + (dictionary.isValidWord(word)));
}
return false;
} else {
wordsFound++;
}
}
}
if (wordsFound < N_GRAM_SIZE) {
// Not enough words. Not unsafe, but reject anyway.
if (DEBUG) {
Log.d(TAG, "not enough words");
}
return false;
}
// All checks have passed; this buffer's content can be safely uploaded.
return true;
}
/**
 * Reports whether this buffer already holds enough words to form a full n-gram.
 *
 * Walks the LogUnits from the front and counts those whose getWord() is non-null, stopping as
 * soon as N_GRAM_SIZE of them have been seen. LogUnits without a word are skipped and do not
 * count toward the total.
 *
 * @return true if at least N_GRAM_SIZE LogUnits in the buffer carry a word, false otherwise.
 */
public boolean isNGramComplete() {
    int wordsNeeded = N_GRAM_SIZE;
    // Iterate instead of indexing: getLogUnits() returns a LinkedList, so get(i) would
    // re-traverse the list on every call.
    for (final LogUnit logUnit : getLogUnits()) {
        if (wordsNeeded <= 0) {
            // Enough words found; no need to look at the rest of the buffer.
            break;
        }
        if (logUnit.getWord() != null) {
            wordsNeeded--;
        }
    }
    return wordsNeeded == 0;
}
@Override
protected void onShiftOut(final LogUnit logUnit) {
if (mResearchLog != null) {
mResearchLog.publish(logUnit, false /* isIncludingPrivateData */);
mResearchLog.publish(logUnit,
ResearchLogger.IS_LOGGING_EVERYTHING /* isIncludingPrivateData */);
}
if (logUnit.hasWord()) {
if (mNumWordsUntilSafeToSample > 0) {
mNumWordsUntilSafeToSample--;
Log.d(TAG, "wordsUntilSafeToSample now at " + mNumWordsUntilSafeToSample);
}
}
if (DEBUG) {
Log.d(TAG, "shiftedOut " + (logUnit.hasWord() ? logUnit.getWord() : ""));
}
}
}

View File

@ -85,7 +85,7 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
private static final String TAG = ResearchLogger.class.getSimpleName();
private static final boolean DEBUG = false && ProductionFlag.IS_EXPERIMENTAL_DEBUG;
// Whether all n-grams should be logged. true will disclose private info.
private static final boolean IS_LOGGING_EVERYTHING = false
public static final boolean IS_LOGGING_EVERYTHING = false
&& ProductionFlag.IS_EXPERIMENTAL_DEBUG;
// Whether the TextView contents are logged at the end of the session. true will disclose
// private info.
@ -394,8 +394,16 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
commitCurrentLogUnit();
if (mMainLogBuffer != null) {
publishLogBuffer(mMainLogBuffer, mMainResearchLog,
IS_LOGGING_EVERYTHING /* isIncludingPrivateData */);
while (!mMainLogBuffer.isEmpty()) {
if ((mMainLogBuffer.isNGramSafe() || IS_LOGGING_EVERYTHING) &&
mMainResearchLog != null) {
publishLogBuffer(mMainLogBuffer, mMainResearchLog,
true /* isIncludingPrivateData */);
mMainLogBuffer.resetWordCounter();
} else {
mMainLogBuffer.shiftOutThroughFirstWord();
}
}
mMainResearchLog.close(null /* callback */);
mMainLogBuffer = null;
}
@ -702,8 +710,9 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
}
if (!mCurrentLogUnit.isEmpty()) {
if (mMainLogBuffer != null) {
if ((mMainLogBuffer.isSafeToLog() || IS_LOGGING_EVERYTHING)
&& mMainResearchLog != null) {
if ((mMainLogBuffer.isNGramSafe() || IS_LOGGING_EVERYTHING) &&
mMainLogBuffer.isNGramComplete() &&
mMainResearchLog != null) {
publishLogBuffer(mMainLogBuffer, mMainResearchLog,
true /* isIncludingPrivateData */);
mMainLogBuffer.resetWordCounter();
@ -714,6 +723,10 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
mFeedbackLogBuffer.shiftIn(mCurrentLogUnit);
}
mCurrentLogUnit = new LogUnit();
} else {
if (DEBUG) {
Log.d(TAG, "Warning: tried to commit empty log unit.");
}
}
}
@ -756,8 +769,8 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
mFeedbackLogBuffer.unshiftIn();
}
if (DEBUG) {
Log.d(TAG, "uncommitCurrentLogUnit back to " + (mCurrentLogUnit.hasWord()
? ": '" + mCurrentLogUnit.getWord() + "'" : ""));
Log.d(TAG, "uncommitCurrentLogUnit (dump=" + dumpCurrentLogUnit + ") back to "
+ (mCurrentLogUnit.hasWord() ? ": '" + mCurrentLogUnit.getWord() + "'" : ""));
}
}
@ -773,12 +786,16 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
isIncludingPrivateData);
researchLog.publish(openingLogUnit, true /* isIncludingPrivateData */);
LogUnit logUnit;
while ((logUnit = logBuffer.shiftOut()) != null) {
int numWordsToPublish = MainLogBuffer.N_GRAM_SIZE;
while ((logUnit = logBuffer.shiftOut()) != null && numWordsToPublish > 0) {
if (DEBUG) {
Log.d(TAG, "publishLogBuffer: " + (logUnit.hasWord() ? logUnit.getWord()
: "<wordless>"));
}
researchLog.publish(logUnit, isIncludingPrivateData);
if (logUnit.getWord() != null) {
numWordsToPublish--;
}
}
final LogUnit closingLogUnit = new LogUnit();
closingLogUnit.addLogStatement(LOGSTATEMENT_LOG_SEGMENT_CLOSING,
@ -1254,9 +1271,12 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
public static void latinIME_revertCommit(final String committedWord,
final String originallyTypedWord, final boolean isBatchMode) {
final ResearchLogger researchLogger = getInstance();
final LogUnit logUnit = researchLogger.mMainLogBuffer.peekLastLogUnit();
// Assume that mCurrentLogUnit has been restored to contain the reverted word.
final LogUnit logUnit = researchLogger.mCurrentLogUnit;
if (originallyTypedWord.length() > 0 && hasLetters(originallyTypedWord)) {
if (logUnit != null) {
// Probably not necessary, but setting as a precaution in case the word isn't
// committed later.
logUnit.setWord(originallyTypedWord);
}
}