Log reasons for punting n-gram logging
Addresses b/9074239

Change-Id: I91a3bfcbd32b03e4891ff5f65be01383a3fb8975
main
parent
5fdea4775e
commit
aaa23bc12c
|
@ -63,6 +63,15 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
|
||||||
private static final boolean DEBUG = false
|
private static final boolean DEBUG = false
|
||||||
&& ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS_DEBUG;
|
&& ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS_DEBUG;
|
||||||
|
|
||||||
|
// Keep consistent with switch statement in Statistics.recordPublishabilityResultCode()
|
||||||
|
public static final int PUBLISHABILITY_PUBLISHABLE = 0;
|
||||||
|
public static final int PUBLISHABILITY_UNPUBLISHABLE_STOPPING = 1;
|
||||||
|
public static final int PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT = 2;
|
||||||
|
public static final int PUBLISHABILITY_UNPUBLISHABLE_SAMPLED_TOO_RECENTLY = 3;
|
||||||
|
public static final int PUBLISHABILITY_UNPUBLISHABLE_DICTIONARY_UNAVAILABLE = 4;
|
||||||
|
public static final int PUBLISHABILITY_UNPUBLISHABLE_MAY_CONTAIN_DIGIT = 5;
|
||||||
|
public static final int PUBLISHABILITY_UNPUBLISHABLE_NOT_IN_DICTIONARY = 6;
|
||||||
|
|
||||||
// The size of the n-grams logged. E.g. N_GRAM_SIZE = 2 means to sample bigrams.
|
// The size of the n-grams logged. E.g. N_GRAM_SIZE = 2 means to sample bigrams.
|
||||||
public static final int N_GRAM_SIZE = 2;
|
public static final int N_GRAM_SIZE = 2;
|
||||||
|
|
||||||
|
@ -105,21 +114,24 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Determines whether uploading the n words at the front the MainLogBuffer will not violate
|
* Determines whether the string determined by a series of LogUnits will not violate user
|
||||||
* user privacy.
|
* privacy if published.
|
||||||
*
|
*
|
||||||
* The size of the MainLogBuffer is just enough to hold one n-gram, its corrections, and any
|
* @param logUnits a LogUnit list to check for publishability
|
||||||
* non-character data that is typed between words. The decision about privacy is made based on
|
* @param nGramSize the smallest n-gram acceptable to be published. If
|
||||||
* the buffer's entire content. If it is decided that the privacy risks are too great to upload
|
* {@link ResearchLogger#IS_LOGGING_EVERYTHING} is true, then publish if there are at least
|
||||||
* the contents of this buffer, a censored version of the LogItems may still be uploaded. E.g.,
|
* {@code nGramSize} words in the logUnits, otherwise wait. If {@link
|
||||||
* the screen orientation and other characteristics about the device can be uploaded without
|
* ResearchLogger#IS_LOGGING_EVERYTHING} is false, then ensure that there are exactly {@code nGramSize}
|
||||||
* revealing much about the user.
|
* words in the LogUnits.
|
||||||
|
*
|
||||||
|
* @return one of the {@code PUBLISHABILITY_*} result codes defined in this class.
|
||||||
*/
|
*/
|
||||||
private boolean isSafeNGram(final ArrayList<LogUnit> logUnits, final int minNGramSize) {
|
private int getPublishabilityResultCode(final ArrayList<LogUnit> logUnits,
|
||||||
|
final int nGramSize) {
|
||||||
// Bypass privacy checks when debugging.
|
// Bypass privacy checks when debugging.
|
||||||
if (ResearchLogger.IS_LOGGING_EVERYTHING) {
|
if (ResearchLogger.IS_LOGGING_EVERYTHING) {
|
||||||
if (mIsStopping) {
|
if (mIsStopping) {
|
||||||
return true;
|
return PUBLISHABILITY_UNPUBLISHABLE_STOPPING;
|
||||||
}
|
}
|
||||||
// Only check that it is the right length. If not, wait for later words to make
|
// Only check that it is the right length. If not, wait for later words to make
|
||||||
// complete n-grams.
|
// complete n-grams.
|
||||||
|
@ -129,13 +141,17 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
|
||||||
final LogUnit logUnit = logUnits.get(i);
|
final LogUnit logUnit = logUnits.get(i);
|
||||||
numWordsInLogUnitList += logUnit.getNumWords();
|
numWordsInLogUnitList += logUnit.getNumWords();
|
||||||
}
|
}
|
||||||
return numWordsInLogUnitList >= minNGramSize;
|
if (numWordsInLogUnitList >= nGramSize) {
|
||||||
|
return PUBLISHABILITY_PUBLISHABLE;
|
||||||
|
} else {
|
||||||
|
return PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check that we are not sampling too frequently. Having sampled recently might disclose
|
// Check that we are not sampling too frequently. Having sampled recently might disclose
|
||||||
// too much of the user's intended meaning.
|
// too much of the user's intended meaning.
|
||||||
if (mNumWordsUntilSafeToSample > 0) {
|
if (mNumWordsUntilSafeToSample > 0) {
|
||||||
return false;
|
return PUBLISHABILITY_UNPUBLISHABLE_SAMPLED_TOO_RECENTLY;
|
||||||
}
|
}
|
||||||
// Reload the dictionary in case it has changed (e.g., because the user has changed
|
// Reload the dictionary in case it has changed (e.g., because the user has changed
|
||||||
// languages).
|
// languages).
|
||||||
|
@ -144,7 +160,7 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
|
||||||
// Main dictionary is unavailable. Since we cannot check it, we cannot tell if a
|
// Main dictionary is unavailable. Since we cannot check it, we cannot tell if a
|
||||||
// word is out-of-vocabulary or not. Therefore, we must judge the entire buffer
|
// word is out-of-vocabulary or not. Therefore, we must judge the entire buffer
|
||||||
// contents to potentially pose a privacy risk.
|
// contents to potentially pose a privacy risk.
|
||||||
return false;
|
return PUBLISHABILITY_UNPUBLISHABLE_DICTIONARY_UNAVAILABLE;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check each word in the buffer. If any word poses a privacy threat, we cannot upload
|
// Check each word in the buffer. If any word poses a privacy threat, we cannot upload
|
||||||
|
@ -155,7 +171,7 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
|
||||||
if (!logUnit.hasOneOrMoreWords()) {
|
if (!logUnit.hasOneOrMoreWords()) {
|
||||||
// Digits outside words are a privacy threat.
|
// Digits outside words are a privacy threat.
|
||||||
if (logUnit.mayContainDigit()) {
|
if (logUnit.mayContainDigit()) {
|
||||||
return false;
|
return PUBLISHABILITY_UNPUBLISHABLE_MAY_CONTAIN_DIGIT;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
numWordsInLogUnitList += logUnit.getNumWords();
|
numWordsInLogUnitList += logUnit.getNumWords();
|
||||||
|
@ -168,14 +184,18 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
|
||||||
+ ResearchLogger.hasLetters(word)
|
+ ResearchLogger.hasLetters(word)
|
||||||
+ ", isValid: " + (dictionary.isValidWord(word)));
|
+ ", isValid: " + (dictionary.isValidWord(word)));
|
||||||
}
|
}
|
||||||
return false;
|
return PUBLISHABILITY_UNPUBLISHABLE_NOT_IN_DICTIONARY;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Finally, only return true if the ngram is the right size.
|
// Finally, report the n-gram as publishable only if it is exactly the right size.
|
||||||
return numWordsInLogUnitList == minNGramSize;
|
if (numWordsInLogUnitList == nGramSize) {
|
||||||
|
return PUBLISHABILITY_PUBLISHABLE;
|
||||||
|
} else {
|
||||||
|
return PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void shiftAndPublishAll() throws IOException {
|
public void shiftAndPublishAll() throws IOException {
|
||||||
|
@ -216,7 +236,9 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
|
||||||
// TODO: Refactor this method to require fewer passes through the LogUnits. Should really
|
// TODO: Refactor this method to require fewer passes through the LogUnits. Should really
|
||||||
// require only one pass.
|
// require only one pass.
|
||||||
ArrayList<LogUnit> logUnits = peekAtFirstNWords(N_GRAM_SIZE);
|
ArrayList<LogUnit> logUnits = peekAtFirstNWords(N_GRAM_SIZE);
|
||||||
if (isSafeNGram(logUnits, N_GRAM_SIZE)) {
|
final int publishabilityResultCode = getPublishabilityResultCode(logUnits, N_GRAM_SIZE);
|
||||||
|
ResearchLogger.recordPublishabilityResultCode(publishabilityResultCode);
|
||||||
|
if (publishabilityResultCode == MainLogBuffer.PUBLISHABILITY_PUBLISHABLE) {
|
||||||
// Good n-gram at the front of the buffer. Publish it, disclosing details.
|
// Good n-gram at the front of the buffer. Publish it, disclosing details.
|
||||||
publish(logUnits, true /* canIncludePrivateData */);
|
publish(logUnits, true /* canIncludePrivateData */);
|
||||||
shiftOutWords(N_GRAM_SIZE);
|
shiftOutWords(N_GRAM_SIZE);
|
||||||
|
|
|
@ -1895,6 +1895,20 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
|
||||||
isComposingWord);
|
isComposingWord);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Call this method when the logging system has attempted publication of an n-gram.
|
||||||
|
*
|
||||||
|
* Statistics are gathered about the success or failure.
|
||||||
|
*
|
||||||
|
* @param publishabilityResultCode a result code as defined by
|
||||||
|
* {@code MainLogBuffer.PUBLISHABILITY_*}
|
||||||
|
*/
|
||||||
|
static void recordPublishabilityResultCode(final int publishabilityResultCode) {
|
||||||
|
final ResearchLogger researchLogger = getInstance();
|
||||||
|
final Statistics statistics = researchLogger.mStatistics;
|
||||||
|
statistics.recordPublishabilityResultCode(publishabilityResultCode);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Log statistics.
|
* Log statistics.
|
||||||
*
|
*
|
||||||
|
@ -1907,7 +1921,11 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
|
||||||
"averageTimeDuringRepeatedDelete", "averageTimeAfterDelete",
|
"averageTimeDuringRepeatedDelete", "averageTimeAfterDelete",
|
||||||
"dictionaryWordCount", "splitWordsCount", "gestureInputCount",
|
"dictionaryWordCount", "splitWordsCount", "gestureInputCount",
|
||||||
"gestureCharsCount", "gesturesDeletedCount", "manualSuggestionsCount",
|
"gestureCharsCount", "gesturesDeletedCount", "manualSuggestionsCount",
|
||||||
"revertCommitsCount", "correctedWordsCount", "autoCorrectionsCount");
|
"revertCommitsCount", "correctedWordsCount", "autoCorrectionsCount",
|
||||||
|
"publishableCount", "unpublishableStoppingCount",
|
||||||
|
"unpublishableIncorrectWordCount", "unpublishableSampledTooRecentlyCount",
|
||||||
|
"unpublishableDictionaryUnavailableCount", "unpublishableMayContainDigitCount",
|
||||||
|
"unpublishableNotInDictionaryCount");
|
||||||
private static void logStatistics() {
|
private static void logStatistics() {
|
||||||
final ResearchLogger researchLogger = getInstance();
|
final ResearchLogger researchLogger = getInstance();
|
||||||
final Statistics statistics = researchLogger.mStatistics;
|
final Statistics statistics = researchLogger.mStatistics;
|
||||||
|
@ -1922,6 +1940,10 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
|
||||||
statistics.mGesturesInputCount, statistics.mGesturesCharsCount,
|
statistics.mGesturesInputCount, statistics.mGesturesCharsCount,
|
||||||
statistics.mGesturesDeletedCount, statistics.mManualSuggestionsCount,
|
statistics.mGesturesDeletedCount, statistics.mManualSuggestionsCount,
|
||||||
statistics.mRevertCommitsCount, statistics.mCorrectedWordsCount,
|
statistics.mRevertCommitsCount, statistics.mCorrectedWordsCount,
|
||||||
statistics.mAutoCorrectionsCount);
|
statistics.mAutoCorrectionsCount, statistics.mPublishableCount,
|
||||||
|
statistics.mUnpublishableStoppingCount, statistics.mUnpublishableIncorrectWordCount,
|
||||||
|
statistics.mUnpublishableSampledTooRecently,
|
||||||
|
statistics.mUnpublishableDictionaryUnavailable,
|
||||||
|
statistics.mUnpublishableMayContainDigit, statistics.mUnpublishableNotInDictionary);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -61,6 +61,16 @@ public class Statistics {
|
||||||
boolean mIsEmptyUponStarting;
|
boolean mIsEmptyUponStarting;
|
||||||
boolean mIsEmptinessStateKnown;
|
boolean mIsEmptinessStateKnown;
|
||||||
|
|
||||||
|
// Counts of how often an n-gram is collected or not, and the reasons for the decision.
|
||||||
|
// Keep consistent with publishability result code list in MainLogBuffer
|
||||||
|
int mPublishableCount;
|
||||||
|
int mUnpublishableStoppingCount;
|
||||||
|
int mUnpublishableIncorrectWordCount;
|
||||||
|
int mUnpublishableSampledTooRecently;
|
||||||
|
int mUnpublishableDictionaryUnavailable;
|
||||||
|
int mUnpublishableMayContainDigit;
|
||||||
|
int mUnpublishableNotInDictionary;
|
||||||
|
|
||||||
// Timers to count average time to enter a key, first press a delete key,
|
// Timers to count average time to enter a key, first press a delete key,
|
||||||
// between delete keys, and then to return typing after a delete key.
|
// between delete keys, and then to return typing after a delete key.
|
||||||
final AverageTimeCounter mKeyCounter = new AverageTimeCounter();
|
final AverageTimeCounter mKeyCounter = new AverageTimeCounter();
|
||||||
|
@ -133,6 +143,13 @@ public class Statistics {
|
||||||
mAfterDeleteKeyCounter.reset();
|
mAfterDeleteKeyCounter.reset();
|
||||||
mGesturesCharsCount = 0;
|
mGesturesCharsCount = 0;
|
||||||
mGesturesDeletedCount = 0;
|
mGesturesDeletedCount = 0;
|
||||||
|
mPublishableCount = 0;
|
||||||
|
mUnpublishableStoppingCount = 0;
|
||||||
|
mUnpublishableIncorrectWordCount = 0;
|
||||||
|
mUnpublishableSampledTooRecently = 0;
|
||||||
|
mUnpublishableDictionaryUnavailable = 0;
|
||||||
|
mUnpublishableMayContainDigit = 0;
|
||||||
|
mUnpublishableNotInDictionary = 0;
|
||||||
|
|
||||||
mLastTapTime = 0;
|
mLastTapTime = 0;
|
||||||
mIsLastKeyDeleteKey = false;
|
mIsLastKeyDeleteKey = false;
|
||||||
|
@ -230,4 +247,31 @@ public class Statistics {
|
||||||
mIsLastKeyDeleteKey = isDeletion;
|
mIsLastKeyDeleteKey = isDeletion;
|
||||||
mLastTapTime = time;
|
mLastTapTime = time;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void recordPublishabilityResultCode(final int publishabilityResultCode) {
|
||||||
|
// Keep consistent with publishability result code list in MainLogBuffer
|
||||||
|
switch (publishabilityResultCode) {
|
||||||
|
case MainLogBuffer.PUBLISHABILITY_PUBLISHABLE:
|
||||||
|
mPublishableCount++;
|
||||||
|
break;
|
||||||
|
case MainLogBuffer.PUBLISHABILITY_UNPUBLISHABLE_STOPPING:
|
||||||
|
mUnpublishableStoppingCount++;
|
||||||
|
break;
|
||||||
|
case MainLogBuffer.PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT:
|
||||||
|
mUnpublishableIncorrectWordCount++;
|
||||||
|
break;
|
||||||
|
case MainLogBuffer.PUBLISHABILITY_UNPUBLISHABLE_SAMPLED_TOO_RECENTLY:
|
||||||
|
mUnpublishableSampledTooRecently++;
|
||||||
|
break;
|
||||||
|
case MainLogBuffer.PUBLISHABILITY_UNPUBLISHABLE_DICTIONARY_UNAVAILABLE:
|
||||||
|
mUnpublishableDictionaryUnavailable++;
|
||||||
|
break;
|
||||||
|
case MainLogBuffer.PUBLISHABILITY_UNPUBLISHABLE_MAY_CONTAIN_DIGIT:
|
||||||
|
mUnpublishableMayContainDigit++;
|
||||||
|
break;
|
||||||
|
case MainLogBuffer.PUBLISHABILITY_UNPUBLISHABLE_NOT_IN_DICTIONARY:
|
||||||
|
mUnpublishableNotInDictionary++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue