am 264fb958: Merge "Log reasons for punting n-gram logging"

* commit '264fb9581c0c5142dc3f78c50dab77444ef17b00':
  Log reasons for punting n-gram logging
main
Kurt Partridge 2013-05-23 17:21:52 -07:00 committed by Android Git Automerger
commit a0542472c2
3 changed files with 107 additions and 19 deletions

View File

@ -63,6 +63,15 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
private static final boolean DEBUG = false private static final boolean DEBUG = false
&& ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS_DEBUG; && ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS_DEBUG;
// Result codes returned by getPublishabilityResultCode(): whether an n-gram may be published
// and, if not, the reason it was rejected.
// Keep consistent with switch statement in Statistics.recordPublishabilityResultCode()
// The n-gram passed the checks and may be published.
public static final int PUBLISHABILITY_PUBLISHABLE = 0;
// ResearchLogger.IS_LOGGING_EVERYTHING is set and this buffer is stopping (mIsStopping).
public static final int PUBLISHABILITY_UNPUBLISHABLE_STOPPING = 1;
// The LogUnits do not hold the required number of words.
public static final int PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT = 2;
// A sample was taken too recently (mNumWordsUntilSafeToSample > 0).
public static final int PUBLISHABILITY_UNPUBLISHABLE_SAMPLED_TOO_RECENTLY = 3;
// The main dictionary is unavailable, so words cannot be checked against it.
public static final int PUBLISHABILITY_UNPUBLISHABLE_DICTIONARY_UNAVAILABLE = 4;
// A LogUnit that has no words may contain a digit.
public static final int PUBLISHABILITY_UNPUBLISHABLE_MAY_CONTAIN_DIGIT = 5;
// A word was rejected by the per-word dictionary check (presumably out-of-vocabulary; confirm
// against getPublishabilityResultCode()).
public static final int PUBLISHABILITY_UNPUBLISHABLE_NOT_IN_DICTIONARY = 6;
// The size of the n-grams logged. E.g. N_GRAM_SIZE = 2 means to sample bigrams. // The size of the n-grams logged. E.g. N_GRAM_SIZE = 2 means to sample bigrams.
public static final int N_GRAM_SIZE = 2; public static final int N_GRAM_SIZE = 2;
@ -105,21 +114,24 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
} }
/** /**
* Determines whether uploading the n words at the front the MainLogBuffer will not violate * Determines whether the string determined by a series of LogUnits will not violate user
* user privacy. * privacy if published.
* *
* The size of the MainLogBuffer is just enough to hold one n-gram, its corrections, and any * @param logUnits a LogUnit list to check for publishability
* non-character data that is typed between words. The decision about privacy is made based on * @param nGramSize the smallest n-gram acceptable to be published. If
* the buffer's entire content. If it is decided that the privacy risks are too great to upload * {@link ResearchLogger#IS_LOGGING_EVERYTHING} is true, then publish if there are at least
* the contents of this buffer, a censored version of the LogItems may still be uploaded. E.g., * {@code nGramSize} words in the logUnits, otherwise wait. If {@link
* the screen orientation and other characteristics about the device can be uploaded without * ResearchLogger#IS_LOGGING_EVERYTHING} is false, then ensure that there are exactly nGramSize
* revealing much about the user. * words in the LogUnits.
*
* @return one of the {@code PUBLISHABILITY_*} result codes defined in this class.
*/ */
private boolean isSafeNGram(final ArrayList<LogUnit> logUnits, final int minNGramSize) { private int getPublishabilityResultCode(final ArrayList<LogUnit> logUnits,
final int nGramSize) {
// Bypass privacy checks when debugging. // Bypass privacy checks when debugging.
if (ResearchLogger.IS_LOGGING_EVERYTHING) { if (ResearchLogger.IS_LOGGING_EVERYTHING) {
if (mIsStopping) { if (mIsStopping) {
return true; return PUBLISHABILITY_UNPUBLISHABLE_STOPPING;
} }
// Only check that it is the right length. If not, wait for later words to make // Only check that it is the right length. If not, wait for later words to make
// complete n-grams. // complete n-grams.
@ -129,13 +141,17 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
final LogUnit logUnit = logUnits.get(i); final LogUnit logUnit = logUnits.get(i);
numWordsInLogUnitList += logUnit.getNumWords(); numWordsInLogUnitList += logUnit.getNumWords();
} }
return numWordsInLogUnitList >= minNGramSize; if (numWordsInLogUnitList >= nGramSize) {
return PUBLISHABILITY_PUBLISHABLE;
} else {
return PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT;
}
} }
// Check that we are not sampling too frequently. Having sampled recently might disclose // Check that we are not sampling too frequently. Having sampled recently might disclose
// too much of the user's intended meaning. // too much of the user's intended meaning.
if (mNumWordsUntilSafeToSample > 0) { if (mNumWordsUntilSafeToSample > 0) {
return false; return PUBLISHABILITY_UNPUBLISHABLE_SAMPLED_TOO_RECENTLY;
} }
// Reload the dictionary in case it has changed (e.g., because the user has changed // Reload the dictionary in case it has changed (e.g., because the user has changed
// languages). // languages).
@ -144,7 +160,7 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
// Main dictionary is unavailable. Since we cannot check it, we cannot tell if a // Main dictionary is unavailable. Since we cannot check it, we cannot tell if a
// word is out-of-vocabulary or not. Therefore, we must judge the entire buffer // word is out-of-vocabulary or not. Therefore, we must judge the entire buffer
// contents to potentially pose a privacy risk. // contents to potentially pose a privacy risk.
return false; return PUBLISHABILITY_UNPUBLISHABLE_DICTIONARY_UNAVAILABLE;
} }
// Check each word in the buffer. If any word poses a privacy threat, we cannot upload // Check each word in the buffer. If any word poses a privacy threat, we cannot upload
@ -155,7 +171,7 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
if (!logUnit.hasOneOrMoreWords()) { if (!logUnit.hasOneOrMoreWords()) {
// Digits outside words are a privacy threat. // Digits outside words are a privacy threat.
if (logUnit.mayContainDigit()) { if (logUnit.mayContainDigit()) {
return false; return PUBLISHABILITY_UNPUBLISHABLE_MAY_CONTAIN_DIGIT;
} }
} else { } else {
numWordsInLogUnitList += logUnit.getNumWords(); numWordsInLogUnitList += logUnit.getNumWords();
@ -168,14 +184,18 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
+ ResearchLogger.hasLetters(word) + ResearchLogger.hasLetters(word)
+ ", isValid: " + (dictionary.isValidWord(word))); + ", isValid: " + (dictionary.isValidWord(word)));
} }
return false; return PUBLISHABILITY_UNPUBLISHABLE_NOT_IN_DICTIONARY;
} }
} }
} }
} }
// Finally, only return true if the ngram is the right size. // Finally, only report the ngram as publishable if it is the right size.
return numWordsInLogUnitList == minNGramSize; if (numWordsInLogUnitList == nGramSize) {
return PUBLISHABILITY_PUBLISHABLE;
} else {
return PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT;
}
} }
public void shiftAndPublishAll() throws IOException { public void shiftAndPublishAll() throws IOException {
@ -216,7 +236,9 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
// TODO: Refactor this method to require fewer passes through the LogUnits. Should really // TODO: Refactor this method to require fewer passes through the LogUnits. Should really
// require only one pass. // require only one pass.
ArrayList<LogUnit> logUnits = peekAtFirstNWords(N_GRAM_SIZE); ArrayList<LogUnit> logUnits = peekAtFirstNWords(N_GRAM_SIZE);
if (isSafeNGram(logUnits, N_GRAM_SIZE)) { final int publishabilityResultCode = getPublishabilityResultCode(logUnits, N_GRAM_SIZE);
ResearchLogger.recordPublishabilityResultCode(publishabilityResultCode);
if (publishabilityResultCode == MainLogBuffer.PUBLISHABILITY_PUBLISHABLE) {
// Good n-gram at the front of the buffer. Publish it, disclosing details. // Good n-gram at the front of the buffer. Publish it, disclosing details.
publish(logUnits, true /* canIncludePrivateData */); publish(logUnits, true /* canIncludePrivateData */);
shiftOutWords(N_GRAM_SIZE); shiftOutWords(N_GRAM_SIZE);

View File

@ -1895,6 +1895,20 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
isComposingWord); isComposingWord);
} }
/**
 * Records the outcome of an attempt to publish an n-gram.
 *
 * The result code is handed to the {@code Statistics} object held by the
 * {@code ResearchLogger} instance.
 *
 * @param publishabilityResultCode a result code as defined by
 * {@code MainLogBuffer.PUBLISHABILITY_*}
 */
static void recordPublishabilityResultCode(final int publishabilityResultCode) {
    getInstance().mStatistics.recordPublishabilityResultCode(publishabilityResultCode);
}
/** /**
* Log statistics. * Log statistics.
* *
@ -1907,7 +1921,11 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
"averageTimeDuringRepeatedDelete", "averageTimeAfterDelete", "averageTimeDuringRepeatedDelete", "averageTimeAfterDelete",
"dictionaryWordCount", "splitWordsCount", "gestureInputCount", "dictionaryWordCount", "splitWordsCount", "gestureInputCount",
"gestureCharsCount", "gesturesDeletedCount", "manualSuggestionsCount", "gestureCharsCount", "gesturesDeletedCount", "manualSuggestionsCount",
"revertCommitsCount", "correctedWordsCount", "autoCorrectionsCount"); "revertCommitsCount", "correctedWordsCount", "autoCorrectionsCount",
"publishableCount", "unpublishableStoppingCount",
"unpublishableIncorrectWordCount", "unpublishableSampledTooRecentlyCount",
"unpublishableDictionaryUnavailableCount", "unpublishableMayContainDigitCount",
"unpublishableNotInDictionaryCount");
private static void logStatistics() { private static void logStatistics() {
final ResearchLogger researchLogger = getInstance(); final ResearchLogger researchLogger = getInstance();
final Statistics statistics = researchLogger.mStatistics; final Statistics statistics = researchLogger.mStatistics;
@ -1922,6 +1940,10 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
statistics.mGesturesInputCount, statistics.mGesturesCharsCount, statistics.mGesturesInputCount, statistics.mGesturesCharsCount,
statistics.mGesturesDeletedCount, statistics.mManualSuggestionsCount, statistics.mGesturesDeletedCount, statistics.mManualSuggestionsCount,
statistics.mRevertCommitsCount, statistics.mCorrectedWordsCount, statistics.mRevertCommitsCount, statistics.mCorrectedWordsCount,
statistics.mAutoCorrectionsCount); statistics.mAutoCorrectionsCount, statistics.mPublishableCount,
statistics.mUnpublishableStoppingCount, statistics.mUnpublishableIncorrectWordCount,
statistics.mUnpublishableSampledTooRecently,
statistics.mUnpublishableDictionaryUnavailable,
statistics.mUnpublishableMayContainDigit, statistics.mUnpublishableNotInDictionary);
} }
} }

View File

@ -61,6 +61,16 @@ public class Statistics {
boolean mIsEmptyUponStarting; boolean mIsEmptyUponStarting;
boolean mIsEmptinessStateKnown; boolean mIsEmptinessStateKnown;
// Counts of how often an n-gram is collected or not, and the reasons for the decision.
// Keep consistent with publishability result code list in MainLogBuffer
// Each counter is incremented by recordPublishabilityResultCode() for the matching
// MainLogBuffer.PUBLISHABILITY_* code.
// Incremented for PUBLISHABILITY_PUBLISHABLE.
int mPublishableCount;
// Incremented for PUBLISHABILITY_UNPUBLISHABLE_STOPPING.
int mUnpublishableStoppingCount;
// Incremented for PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT.
int mUnpublishableIncorrectWordCount;
// Incremented for PUBLISHABILITY_UNPUBLISHABLE_SAMPLED_TOO_RECENTLY.
int mUnpublishableSampledTooRecently;
// Incremented for PUBLISHABILITY_UNPUBLISHABLE_DICTIONARY_UNAVAILABLE.
int mUnpublishableDictionaryUnavailable;
// Incremented for PUBLISHABILITY_UNPUBLISHABLE_MAY_CONTAIN_DIGIT.
int mUnpublishableMayContainDigit;
// Incremented for PUBLISHABILITY_UNPUBLISHABLE_NOT_IN_DICTIONARY.
int mUnpublishableNotInDictionary;
// Timers to count average time to enter a key, first press a delete key, // Timers to count average time to enter a key, first press a delete key,
// between delete keys, and then to return typing after a delete key. // between delete keys, and then to return typing after a delete key.
final AverageTimeCounter mKeyCounter = new AverageTimeCounter(); final AverageTimeCounter mKeyCounter = new AverageTimeCounter();
@ -133,6 +143,13 @@ public class Statistics {
mAfterDeleteKeyCounter.reset(); mAfterDeleteKeyCounter.reset();
mGesturesCharsCount = 0; mGesturesCharsCount = 0;
mGesturesDeletedCount = 0; mGesturesDeletedCount = 0;
mPublishableCount = 0;
mUnpublishableStoppingCount = 0;
mUnpublishableIncorrectWordCount = 0;
mUnpublishableSampledTooRecently = 0;
mUnpublishableDictionaryUnavailable = 0;
mUnpublishableMayContainDigit = 0;
mUnpublishableNotInDictionary = 0;
mLastTapTime = 0; mLastTapTime = 0;
mIsLastKeyDeleteKey = false; mIsLastKeyDeleteKey = false;
@ -230,4 +247,31 @@ public class Statistics {
mIsLastKeyDeleteKey = isDeletion; mIsLastKeyDeleteKey = isDeletion;
mLastTapTime = time; mLastTapTime = time;
} }
/**
 * Tallies the outcome of one n-gram publication attempt.
 *
 * The counter that corresponds to the given result code is incremented by one. A code that
 * matches none of the {@code MainLogBuffer.PUBLISHABILITY_*} constants changes no counter.
 *
 * @param publishabilityResultCode one of the {@code MainLogBuffer.PUBLISHABILITY_*} constants
 */
public void recordPublishabilityResultCode(final int publishabilityResultCode) {
    // Keep consistent with publishability result code list in MainLogBuffer
    if (publishabilityResultCode == MainLogBuffer.PUBLISHABILITY_PUBLISHABLE) {
        mPublishableCount++;
    } else if (publishabilityResultCode == MainLogBuffer.PUBLISHABILITY_UNPUBLISHABLE_STOPPING) {
        mUnpublishableStoppingCount++;
    } else if (publishabilityResultCode
            == MainLogBuffer.PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT) {
        mUnpublishableIncorrectWordCount++;
    } else if (publishabilityResultCode
            == MainLogBuffer.PUBLISHABILITY_UNPUBLISHABLE_SAMPLED_TOO_RECENTLY) {
        mUnpublishableSampledTooRecently++;
    } else if (publishabilityResultCode
            == MainLogBuffer.PUBLISHABILITY_UNPUBLISHABLE_DICTIONARY_UNAVAILABLE) {
        mUnpublishableDictionaryUnavailable++;
    } else if (publishabilityResultCode
            == MainLogBuffer.PUBLISHABILITY_UNPUBLISHABLE_MAY_CONTAIN_DIGIT) {
        mUnpublishableMayContainDigit++;
    } else if (publishabilityResultCode
            == MainLogBuffer.PUBLISHABILITY_UNPUBLISHABLE_NOT_IN_DICTIONARY) {
        mUnpublishableNotInDictionary++;
    }
    // Any other value is deliberately ignored, as before.
}
} }