am 264fb958
: Merge "Log reasons for punting n-gram logging"
* commit '264fb9581c0c5142dc3f78c50dab77444ef17b00': Log reasons for punting n-gram logging
This commit is contained in:
commit
a0542472c2
3 changed files with 107 additions and 19 deletions
|
@ -63,6 +63,15 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
|
|||
private static final boolean DEBUG = false
|
||||
&& ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS_DEBUG;
|
||||
|
||||
// Keep consistent with switch statement in Statistics.recordPublishabilityResultCode()
|
||||
public static final int PUBLISHABILITY_PUBLISHABLE = 0;
|
||||
public static final int PUBLISHABILITY_UNPUBLISHABLE_STOPPING = 1;
|
||||
public static final int PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT = 2;
|
||||
public static final int PUBLISHABILITY_UNPUBLISHABLE_SAMPLED_TOO_RECENTLY = 3;
|
||||
public static final int PUBLISHABILITY_UNPUBLISHABLE_DICTIONARY_UNAVAILABLE = 4;
|
||||
public static final int PUBLISHABILITY_UNPUBLISHABLE_MAY_CONTAIN_DIGIT = 5;
|
||||
public static final int PUBLISHABILITY_UNPUBLISHABLE_NOT_IN_DICTIONARY = 6;
|
||||
|
||||
// The size of the n-grams logged. E.g. N_GRAM_SIZE = 2 means to sample bigrams.
|
||||
public static final int N_GRAM_SIZE = 2;
|
||||
|
||||
|
@ -105,21 +114,24 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
|
|||
}
|
||||
|
||||
/**
|
||||
* Determines whether uploading the n words at the front the MainLogBuffer will not violate
|
||||
* user privacy.
|
||||
* Determines whether the string determined by a series of LogUnits will not violate user
|
||||
* privacy if published.
|
||||
*
|
||||
* The size of the MainLogBuffer is just enough to hold one n-gram, its corrections, and any
|
||||
* non-character data that is typed between words. The decision about privacy is made based on
|
||||
* the buffer's entire content. If it is decided that the privacy risks are too great to upload
|
||||
* the contents of this buffer, a censored version of the LogItems may still be uploaded. E.g.,
|
||||
* the screen orientation and other characteristics about the device can be uploaded without
|
||||
* revealing much about the user.
|
||||
* @param logUnits a LogUnit list to check for publishability
|
||||
* @param nGramSize the smallest n-gram acceptable to be published. if
|
||||
* {@link ResearchLogger.IS_LOGGING_EVERYTHING} is true, then publish if there are more than
|
||||
* {@code minNGramSize} words in the logUnits, otherwise wait. if {@link
|
||||
* ResearchLogger.IS_LOGGING_EVERYTHING} is false, then ensure that there are exactly nGramSize
|
||||
* words in the LogUnits.
|
||||
*
|
||||
* @return one of the {@code PUBLISHABILITY_*} result codes defined in this class.
|
||||
*/
|
||||
private boolean isSafeNGram(final ArrayList<LogUnit> logUnits, final int minNGramSize) {
|
||||
private int getPublishabilityResultCode(final ArrayList<LogUnit> logUnits,
|
||||
final int nGramSize) {
|
||||
// Bypass privacy checks when debugging.
|
||||
if (ResearchLogger.IS_LOGGING_EVERYTHING) {
|
||||
if (mIsStopping) {
|
||||
return true;
|
||||
return PUBLISHABILITY_UNPUBLISHABLE_STOPPING;
|
||||
}
|
||||
// Only check that it is the right length. If not, wait for later words to make
|
||||
// complete n-grams.
|
||||
|
@ -129,13 +141,17 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
|
|||
final LogUnit logUnit = logUnits.get(i);
|
||||
numWordsInLogUnitList += logUnit.getNumWords();
|
||||
}
|
||||
return numWordsInLogUnitList >= minNGramSize;
|
||||
if (numWordsInLogUnitList >= nGramSize) {
|
||||
return PUBLISHABILITY_PUBLISHABLE;
|
||||
} else {
|
||||
return PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT;
|
||||
}
|
||||
}
|
||||
|
||||
// Check that we are not sampling too frequently. Having sampled recently might disclose
|
||||
// too much of the user's intended meaning.
|
||||
if (mNumWordsUntilSafeToSample > 0) {
|
||||
return false;
|
||||
return PUBLISHABILITY_UNPUBLISHABLE_SAMPLED_TOO_RECENTLY;
|
||||
}
|
||||
// Reload the dictionary in case it has changed (e.g., because the user has changed
|
||||
// languages).
|
||||
|
@ -144,7 +160,7 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
|
|||
// Main dictionary is unavailable. Since we cannot check it, we cannot tell if a
|
||||
// word is out-of-vocabulary or not. Therefore, we must judge the entire buffer
|
||||
// contents to potentially pose a privacy risk.
|
||||
return false;
|
||||
return PUBLISHABILITY_UNPUBLISHABLE_DICTIONARY_UNAVAILABLE;
|
||||
}
|
||||
|
||||
// Check each word in the buffer. If any word poses a privacy threat, we cannot upload
|
||||
|
@ -155,7 +171,7 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
|
|||
if (!logUnit.hasOneOrMoreWords()) {
|
||||
// Digits outside words are a privacy threat.
|
||||
if (logUnit.mayContainDigit()) {
|
||||
return false;
|
||||
return PUBLISHABILITY_UNPUBLISHABLE_MAY_CONTAIN_DIGIT;
|
||||
}
|
||||
} else {
|
||||
numWordsInLogUnitList += logUnit.getNumWords();
|
||||
|
@ -168,14 +184,18 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
|
|||
+ ResearchLogger.hasLetters(word)
|
||||
+ ", isValid: " + (dictionary.isValidWord(word)));
|
||||
}
|
||||
return false;
|
||||
return PUBLISHABILITY_UNPUBLISHABLE_NOT_IN_DICTIONARY;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Finally, only return true if the ngram is the right size.
|
||||
return numWordsInLogUnitList == minNGramSize;
|
||||
if (numWordsInLogUnitList == nGramSize) {
|
||||
return PUBLISHABILITY_PUBLISHABLE;
|
||||
} else {
|
||||
return PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT;
|
||||
}
|
||||
}
|
||||
|
||||
public void shiftAndPublishAll() throws IOException {
|
||||
|
@ -216,7 +236,9 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
|
|||
// TODO: Refactor this method to require fewer passes through the LogUnits. Should really
|
||||
// require only one pass.
|
||||
ArrayList<LogUnit> logUnits = peekAtFirstNWords(N_GRAM_SIZE);
|
||||
if (isSafeNGram(logUnits, N_GRAM_SIZE)) {
|
||||
final int publishabilityResultCode = getPublishabilityResultCode(logUnits, N_GRAM_SIZE);
|
||||
ResearchLogger.recordPublishabilityResultCode(publishabilityResultCode);
|
||||
if (publishabilityResultCode == MainLogBuffer.PUBLISHABILITY_PUBLISHABLE) {
|
||||
// Good n-gram at the front of the buffer. Publish it, disclosing details.
|
||||
publish(logUnits, true /* canIncludePrivateData */);
|
||||
shiftOutWords(N_GRAM_SIZE);
|
||||
|
|
|
@ -1895,6 +1895,20 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
|
|||
isComposingWord);
|
||||
}
|
||||
|
||||
/**
|
||||
* Call this method when the logging system has attempted publication of an n-gram.
|
||||
*
|
||||
* Statistics are gathered about the success or failure.
|
||||
*
|
||||
* @param publishabilityResultCode a result code as defined by
|
||||
* {@code MainLogBuffer.PUBLISHABILITY_*}
|
||||
*/
|
||||
static void recordPublishabilityResultCode(final int publishabilityResultCode) {
|
||||
final ResearchLogger researchLogger = getInstance();
|
||||
final Statistics statistics = researchLogger.mStatistics;
|
||||
statistics.recordPublishabilityResultCode(publishabilityResultCode);
|
||||
}
|
||||
|
||||
/**
|
||||
* Log statistics.
|
||||
*
|
||||
|
@ -1907,7 +1921,11 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
|
|||
"averageTimeDuringRepeatedDelete", "averageTimeAfterDelete",
|
||||
"dictionaryWordCount", "splitWordsCount", "gestureInputCount",
|
||||
"gestureCharsCount", "gesturesDeletedCount", "manualSuggestionsCount",
|
||||
"revertCommitsCount", "correctedWordsCount", "autoCorrectionsCount");
|
||||
"revertCommitsCount", "correctedWordsCount", "autoCorrectionsCount",
|
||||
"publishableCount", "unpublishableStoppingCount",
|
||||
"unpublishableIncorrectWordCount", "unpublishableSampledTooRecentlyCount",
|
||||
"unpublishableDictionaryUnavailableCount", "unpublishableMayContainDigitCount",
|
||||
"unpublishableNotInDictionaryCount");
|
||||
private static void logStatistics() {
|
||||
final ResearchLogger researchLogger = getInstance();
|
||||
final Statistics statistics = researchLogger.mStatistics;
|
||||
|
@ -1922,6 +1940,10 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
|
|||
statistics.mGesturesInputCount, statistics.mGesturesCharsCount,
|
||||
statistics.mGesturesDeletedCount, statistics.mManualSuggestionsCount,
|
||||
statistics.mRevertCommitsCount, statistics.mCorrectedWordsCount,
|
||||
statistics.mAutoCorrectionsCount);
|
||||
statistics.mAutoCorrectionsCount, statistics.mPublishableCount,
|
||||
statistics.mUnpublishableStoppingCount, statistics.mUnpublishableIncorrectWordCount,
|
||||
statistics.mUnpublishableSampledTooRecently,
|
||||
statistics.mUnpublishableDictionaryUnavailable,
|
||||
statistics.mUnpublishableMayContainDigit, statistics.mUnpublishableNotInDictionary);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -61,6 +61,16 @@ public class Statistics {
|
|||
boolean mIsEmptyUponStarting;
|
||||
boolean mIsEmptinessStateKnown;
|
||||
|
||||
// Counts of how often an n-gram is collected or not, and the reasons for the decision.
|
||||
// Keep consistent with publishability result code list in MainLogBuffer
|
||||
int mPublishableCount;
|
||||
int mUnpublishableStoppingCount;
|
||||
int mUnpublishableIncorrectWordCount;
|
||||
int mUnpublishableSampledTooRecently;
|
||||
int mUnpublishableDictionaryUnavailable;
|
||||
int mUnpublishableMayContainDigit;
|
||||
int mUnpublishableNotInDictionary;
|
||||
|
||||
// Timers to count average time to enter a key, first press a delete key,
|
||||
// between delete keys, and then to return typing after a delete key.
|
||||
final AverageTimeCounter mKeyCounter = new AverageTimeCounter();
|
||||
|
@ -133,6 +143,13 @@ public class Statistics {
|
|||
mAfterDeleteKeyCounter.reset();
|
||||
mGesturesCharsCount = 0;
|
||||
mGesturesDeletedCount = 0;
|
||||
mPublishableCount = 0;
|
||||
mUnpublishableStoppingCount = 0;
|
||||
mUnpublishableIncorrectWordCount = 0;
|
||||
mUnpublishableSampledTooRecently = 0;
|
||||
mUnpublishableDictionaryUnavailable = 0;
|
||||
mUnpublishableMayContainDigit = 0;
|
||||
mUnpublishableNotInDictionary = 0;
|
||||
|
||||
mLastTapTime = 0;
|
||||
mIsLastKeyDeleteKey = false;
|
||||
|
@ -230,4 +247,31 @@ public class Statistics {
|
|||
mIsLastKeyDeleteKey = isDeletion;
|
||||
mLastTapTime = time;
|
||||
}
|
||||
|
||||
public void recordPublishabilityResultCode(final int publishabilityResultCode) {
|
||||
// Keep consistent with publishability result code list in MainLogBuffer
|
||||
switch (publishabilityResultCode) {
|
||||
case MainLogBuffer.PUBLISHABILITY_PUBLISHABLE:
|
||||
mPublishableCount++;
|
||||
break;
|
||||
case MainLogBuffer.PUBLISHABILITY_UNPUBLISHABLE_STOPPING:
|
||||
mUnpublishableStoppingCount++;
|
||||
break;
|
||||
case MainLogBuffer.PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT:
|
||||
mUnpublishableIncorrectWordCount++;
|
||||
break;
|
||||
case MainLogBuffer.PUBLISHABILITY_UNPUBLISHABLE_SAMPLED_TOO_RECENTLY:
|
||||
mUnpublishableSampledTooRecently++;
|
||||
break;
|
||||
case MainLogBuffer.PUBLISHABILITY_UNPUBLISHABLE_DICTIONARY_UNAVAILABLE:
|
||||
mUnpublishableDictionaryUnavailable++;
|
||||
break;
|
||||
case MainLogBuffer.PUBLISHABILITY_UNPUBLISHABLE_MAY_CONTAIN_DIGIT:
|
||||
mUnpublishableMayContainDigit++;
|
||||
break;
|
||||
case MainLogBuffer.PUBLISHABILITY_UNPUBLISHABLE_NOT_IN_DICTIONARY:
|
||||
mUnpublishableNotInDictionary++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue