diff --git a/java/src/com/android/inputmethod/research/MainLogBuffer.java b/java/src/com/android/inputmethod/research/MainLogBuffer.java index 7e8f16697..3482153b4 100644 --- a/java/src/com/android/inputmethod/research/MainLogBuffer.java +++ b/java/src/com/android/inputmethod/research/MainLogBuffer.java @@ -63,6 +63,15 @@ public abstract class MainLogBuffer extends FixedLogBuffer { private static final boolean DEBUG = false && ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS_DEBUG; + // Keep consistent with switch statement in Statistics.recordPublishabilityResultCode() + public static final int PUBLISHABILITY_PUBLISHABLE = 0; + public static final int PUBLISHABILITY_UNPUBLISHABLE_STOPPING = 1; + public static final int PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT = 2; + public static final int PUBLISHABILITY_UNPUBLISHABLE_SAMPLED_TOO_RECENTLY = 3; + public static final int PUBLISHABILITY_UNPUBLISHABLE_DICTIONARY_UNAVAILABLE = 4; + public static final int PUBLISHABILITY_UNPUBLISHABLE_MAY_CONTAIN_DIGIT = 5; + public static final int PUBLISHABILITY_UNPUBLISHABLE_NOT_IN_DICTIONARY = 6; + // The size of the n-grams logged. E.g. N_GRAM_SIZE = 2 means to sample bigrams. public static final int N_GRAM_SIZE = 2; @@ -105,21 +114,24 @@ public abstract class MainLogBuffer extends FixedLogBuffer { } /** - * Determines whether uploading the n words at the front the MainLogBuffer will not violate - * user privacy. + * Determines whether the string determined by a series of LogUnits will not violate user + * privacy if published. * - * The size of the MainLogBuffer is just enough to hold one n-gram, its corrections, and any - * non-character data that is typed between words. The decision about privacy is made based on - * the buffer's entire content. If it is decided that the privacy risks are too great to upload - * the contents of this buffer, a censored version of the LogItems may still be uploaded. E.g., - * the screen orientation and other characteristics about the device can be uploaded without - * revealing much about the user. + * @param logUnits a LogUnit list to check for publishability + * @param nGramSize the smallest n-gram acceptable to be published. if + * {@link ResearchLogger.IS_LOGGING_EVERYTHING} is true, then publish if there are more than + * {@code minNGramSize} words in the logUnits, otherwise wait. if {@link + * ResearchLogger.IS_LOGGING_EVERYTHING} is false, then ensure that there are exactly nGramSize + * words in the LogUnits. + * + * @return one of the {@code PUBLISHABILITY_*} result codes defined in this class. */ - private boolean isSafeNGram(final ArrayList logUnits, final int minNGramSize) { + private int getPublishabilityResultCode(final ArrayList logUnits, + final int nGramSize) { // Bypass privacy checks when debugging. if (ResearchLogger.IS_LOGGING_EVERYTHING) { if (mIsStopping) { - return true; + return PUBLISHABILITY_UNPUBLISHABLE_STOPPING; } // Only check that it is the right length. If not, wait for later words to make // complete n-grams. @@ -129,13 +141,17 @@ public abstract class MainLogBuffer extends FixedLogBuffer { final LogUnit logUnit = logUnits.get(i); numWordsInLogUnitList += logUnit.getNumWords(); } - return numWordsInLogUnitList >= minNGramSize; + if (numWordsInLogUnitList >= nGramSize) { + return PUBLISHABILITY_PUBLISHABLE; + } else { + return PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT; + } } // Check that we are not sampling too frequently. Having sampled recently might disclose // too much of the user's intended meaning. if (mNumWordsUntilSafeToSample > 0) { - return false; + return PUBLISHABILITY_UNPUBLISHABLE_SAMPLED_TOO_RECENTLY; } // Reload the dictionary in case it has changed (e.g., because the user has changed // languages). @@ -144,7 +160,7 @@ public abstract class MainLogBuffer extends FixedLogBuffer { // Main dictionary is unavailable. Since we cannot check it, we cannot tell if a // word is out-of-vocabulary or not. Therefore, we must judge the entire buffer // contents to potentially pose a privacy risk. - return false; + return PUBLISHABILITY_UNPUBLISHABLE_DICTIONARY_UNAVAILABLE; } // Check each word in the buffer. If any word poses a privacy threat, we cannot upload @@ -155,7 +171,7 @@ public abstract class MainLogBuffer extends FixedLogBuffer { if (!logUnit.hasOneOrMoreWords()) { // Digits outside words are a privacy threat. if (logUnit.mayContainDigit()) { - return false; + return PUBLISHABILITY_UNPUBLISHABLE_MAY_CONTAIN_DIGIT; } } else { numWordsInLogUnitList += logUnit.getNumWords(); @@ -168,14 +184,18 @@ public abstract class MainLogBuffer extends FixedLogBuffer { + ResearchLogger.hasLetters(word) + ", isValid: " + (dictionary.isValidWord(word))); } - return false; + return PUBLISHABILITY_UNPUBLISHABLE_NOT_IN_DICTIONARY; } } } } // Finally, only return true if the ngram is the right size. - return numWordsInLogUnitList == minNGramSize; + if (numWordsInLogUnitList == nGramSize) { + return PUBLISHABILITY_PUBLISHABLE; + } else { + return PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT; + } } public void shiftAndPublishAll() throws IOException { @@ -216,7 +236,9 @@ public abstract class MainLogBuffer extends FixedLogBuffer { // TODO: Refactor this method to require fewer passes through the LogUnits. Should really // require only one pass. ArrayList logUnits = peekAtFirstNWords(N_GRAM_SIZE); - if (isSafeNGram(logUnits, N_GRAM_SIZE)) { + final int publishabilityResultCode = getPublishabilityResultCode(logUnits, N_GRAM_SIZE); + ResearchLogger.recordPublishabilityResultCode(publishabilityResultCode); + if (publishabilityResultCode == MainLogBuffer.PUBLISHABILITY_PUBLISHABLE) { // Good n-gram at the front of the buffer. Publish it, disclosing details. publish(logUnits, true /* canIncludePrivateData */); shiftOutWords(N_GRAM_SIZE); diff --git a/java/src/com/android/inputmethod/research/ResearchLogger.java b/java/src/com/android/inputmethod/research/ResearchLogger.java index c4409d5c8..64f0349fc 100644 --- a/java/src/com/android/inputmethod/research/ResearchLogger.java +++ b/java/src/com/android/inputmethod/research/ResearchLogger.java @@ -1895,6 +1895,20 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang isComposingWord); } + /** + * Call this method when the logging system has attempted publication of an n-gram. + * + * Statistics are gathered about the success or failure. + * + * @param publishabilityResultCode a result code as defined by + * {@code MainLogBuffer.PUBLISHABILITY_*} + */ + static void recordPublishabilityResultCode(final int publishabilityResultCode) { + final ResearchLogger researchLogger = getInstance(); + final Statistics statistics = researchLogger.mStatistics; + statistics.recordPublishabilityResultCode(publishabilityResultCode); + } + /** * Log statistics. * @@ -1907,7 +1921,11 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang "averageTimeDuringRepeatedDelete", "averageTimeAfterDelete", "dictionaryWordCount", "splitWordsCount", "gestureInputCount", "gestureCharsCount", "gesturesDeletedCount", "manualSuggestionsCount", - "revertCommitsCount", "correctedWordsCount", "autoCorrectionsCount"); + "revertCommitsCount", "correctedWordsCount", "autoCorrectionsCount", + "publishableCount", "unpublishableStoppingCount", + "unpublishableIncorrectWordCount", "unpublishableSampledTooRecentlyCount", + "unpublishableDictionaryUnavailableCount", "unpublishableMayContainDigitCount", + "unpublishableNotInDictionaryCount"); private static void logStatistics() { final ResearchLogger researchLogger = getInstance(); final Statistics statistics = researchLogger.mStatistics; @@ -1922,6 +1940,10 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang statistics.mGesturesInputCount, statistics.mGesturesCharsCount, statistics.mGesturesDeletedCount, statistics.mManualSuggestionsCount, statistics.mRevertCommitsCount, statistics.mCorrectedWordsCount, - statistics.mAutoCorrectionsCount); + statistics.mAutoCorrectionsCount, statistics.mPublishableCount, + statistics.mUnpublishableStoppingCount, statistics.mUnpublishableIncorrectWordCount, + statistics.mUnpublishableSampledTooRecently, + statistics.mUnpublishableDictionaryUnavailable, + statistics.mUnpublishableMayContainDigit, statistics.mUnpublishableNotInDictionary); } } diff --git a/java/src/com/android/inputmethod/research/Statistics.java b/java/src/com/android/inputmethod/research/Statistics.java index 7f6c851bb..e573ca012 100644 --- a/java/src/com/android/inputmethod/research/Statistics.java +++ b/java/src/com/android/inputmethod/research/Statistics.java @@ -61,6 +61,16 @@ public class Statistics { boolean mIsEmptyUponStarting; boolean mIsEmptinessStateKnown; + // Counts of how often an n-gram is collected or not, and the reasons for the decision. + // Keep consistent with publishability result code list in MainLogBuffer + int mPublishableCount; + int mUnpublishableStoppingCount; + int mUnpublishableIncorrectWordCount; + int mUnpublishableSampledTooRecently; + int mUnpublishableDictionaryUnavailable; + int mUnpublishableMayContainDigit; + int mUnpublishableNotInDictionary; + // Timers to count average time to enter a key, first press a delete key, // between delete keys, and then to return typing after a delete key. final AverageTimeCounter mKeyCounter = new AverageTimeCounter(); @@ -133,6 +143,13 @@ public class Statistics { mAfterDeleteKeyCounter.reset(); mGesturesCharsCount = 0; mGesturesDeletedCount = 0; + mPublishableCount = 0; + mUnpublishableStoppingCount = 0; + mUnpublishableIncorrectWordCount = 0; + mUnpublishableSampledTooRecently = 0; + mUnpublishableDictionaryUnavailable = 0; + mUnpublishableMayContainDigit = 0; + mUnpublishableNotInDictionary = 0; mLastTapTime = 0; mIsLastKeyDeleteKey = false; @@ -230,4 +247,31 @@ public class Statistics { mIsLastKeyDeleteKey = isDeletion; mLastTapTime = time; } + + public void recordPublishabilityResultCode(final int publishabilityResultCode) { + // Keep consistent with publishability result code list in MainLogBuffer + switch (publishabilityResultCode) { + case MainLogBuffer.PUBLISHABILITY_PUBLISHABLE: + mPublishableCount++; + break; + case MainLogBuffer.PUBLISHABILITY_UNPUBLISHABLE_STOPPING: + mUnpublishableStoppingCount++; + break; + case MainLogBuffer.PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT: + mUnpublishableIncorrectWordCount++; + break; + case MainLogBuffer.PUBLISHABILITY_UNPUBLISHABLE_SAMPLED_TOO_RECENTLY: + mUnpublishableSampledTooRecently++; + break; + case MainLogBuffer.PUBLISHABILITY_UNPUBLISHABLE_DICTIONARY_UNAVAILABLE: + mUnpublishableDictionaryUnavailable++; + break; + case MainLogBuffer.PUBLISHABILITY_UNPUBLISHABLE_MAY_CONTAIN_DIGIT: + mUnpublishableMayContainDigit++; + break; + case MainLogBuffer.PUBLISHABILITY_UNPUBLISHABLE_NOT_IN_DICTIONARY: + mUnpublishableNotInDictionary++; + break; + } + } }