LatinIME/java/src/com/android/inputmethod/research/MainLogBuffer.java

/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.android.inputmethod.research;

import android.util.Log;

import com.android.inputmethod.latin.Dictionary;
import com.android.inputmethod.latin.Suggest;
import com.android.inputmethod.latin.define.ProductionFlag;

import java.util.LinkedList;
import java.util.Random;

/**
 * Provide a log buffer of fixed length that enforces privacy restrictions.
 *
 * The privacy restrictions include making sure that no numbers are logged, that all logged words
 * are in the dictionary, and that words are recorded infrequently enough that the user's meaning
 * cannot be easily determined.
 */
public class MainLogBuffer extends FixedLogBuffer {
    private static final String TAG = MainLogBuffer.class.getSimpleName();
    private static final boolean DEBUG = false && ProductionFlag.IS_EXPERIMENTAL_DEBUG;

    // The size of the n-grams logged.  E.g. N_GRAM_SIZE = 2 means to sample bigrams.
    private static final int N_GRAM_SIZE = 2;
    // The number of words between n-grams to omit from the log.  If debugging, record 50% of all
    // words (2 skipped + 2 sampled).  Otherwise, only record 10% (18 skipped + 2 sampled).
    private static final int DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES =
            ProductionFlag.IS_EXPERIMENTAL_DEBUG ? 2 : 18;

    // Destination for LogUnits that are shifted out of this buffer.  May be null, in which case
    // shifted-out units are dropped.
    private final ResearchLog mResearchLog;
    // Source of the main dictionary used to vet words.  Null until setSuggest() is called.
    private Suggest mSuggest;

    // The minimum periodicity with which n-grams can be sampled.  E.g. if mMinWordPeriod is 10
    // and bigrams are being sampled, words 1-8 are not sampled but the bigram at words 9 and 10
    // is; likewise words 11-18 are not sampled but the bigram at words 19 and 20 is, and so on.
    // If an n-gram is not safe (e.g. it contains a number in the middle or an out-of-vocabulary
    // word), then sampling is delayed until a safe n-gram does appear.
    /* package for test */ int mMinWordPeriod;

    // Counter for words left to suppress before an n-gram can be sampled.  Reset to mMinWordPeriod
    // after a sample is taken.
    /* package for test */ int mWordsUntilSafeToSample;

    /**
     * Creates a buffer sized for one n-gram.
     *
     * The initial suppression counter is chosen uniformly at random from
     * [0, mMinWordPeriod), so the first sample does not always fall at the same word offset.
     *
     * @param researchLog the log that receives LogUnits as they are shifted out of the buffer;
     *        a null value is tolerated (see {@link #onShiftOut})
     */
    public MainLogBuffer(final ResearchLog researchLog) {
        super(N_GRAM_SIZE);
        mResearchLog = researchLog;
        mMinWordPeriod = DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES + N_GRAM_SIZE;
        final Random random = new Random();
        mWordsUntilSafeToSample = random.nextInt(mMinWordPeriod);
    }

    /**
     * Sets the Suggest instance whose main dictionary is consulted by {@link #isSafeToLog}.
     *
     * @param suggest the dictionary provider; while it is null, {@link #isSafeToLog} returns false
     */
    public void setSuggest(final Suggest suggest) {
        mSuggest = suggest;
    }

    /**
     * Adds a LogUnit to the buffer and, if it carries a word, counts it against the number of
     * words that must still be suppressed before the next sample.
     *
     * @param newLogUnit the unit to add
     */
    @Override
    public void shiftIn(final LogUnit newLogUnit) {
        super.shiftIn(newLogUnit);
        // Only word-bearing units count towards the sampling period; the counter never goes
        // below zero.
        if (newLogUnit.hasWord() && mWordsUntilSafeToSample > 0) {
            mWordsUntilSafeToSample--;
        }
        if (DEBUG) {
            Log.d(TAG, "shiftedIn " + (newLogUnit.hasWord() ? newLogUnit.getWord() : ""));
        }
    }

    /**
     * Restarts the suppression period: mMinWordPeriod further words must be shifted in before
     * {@link #isSafeToLog} can return true again.
     */
    public void resetWordCounter() {
        mWordsUntilSafeToSample = mMinWordPeriod;
    }

    /**
     * Determines whether the content of the MainLogBuffer can be safely uploaded in its complete
     * form and still protect the user's privacy.
     *
     * The size of the MainLogBuffer is just enough to hold one n-gram, its corrections, and any
     * non-character data that is typed between words.  The decision about privacy is made based on
     * the buffer's entire content.  If it is decided that the privacy risks are too great to upload
     * the contents of this buffer, a censored version of the LogItems may still be uploaded.  E.g.,
     * the screen orientation and other characteristics about the device can be uploaded without
     * revealing much about the user.
     *
     * @return true only if the suppression period has elapsed, a main dictionary is available,
     *         no word-less unit may contain a digit, and every word that has letters is valid in
     *         the main dictionary
     */
    public boolean isSafeToLog() {
        // Check that we are not sampling too frequently.  Having sampled recently might disclose
        // too much of the user's intended meaning.
        if (mWordsUntilSafeToSample > 0) {
            return false;
        }
        if (mSuggest == null || !mSuggest.hasMainDictionary()) {
            // Main dictionary is unavailable.  Since we cannot check it, we cannot tell if a word
            // is out-of-vocabulary or not.  Therefore, we must judge the entire buffer contents to
            // potentially pose a privacy risk.
            return false;
        }
        // Reload the dictionary in case it has changed (e.g., because the user has changed
        // languages).
        final Dictionary dictionary = mSuggest.getMainDictionary();
        if (dictionary == null) {
            return false;
        }
        // Check each word in the buffer.  If any word poses a privacy threat, we cannot upload the
        // complete buffer contents in detail.
        // Iterate rather than index: LinkedList.get(i) walks the list on every call, which would
        // make this scan quadratic in the buffer length.
        final LinkedList<LogUnit> logUnits = getLogUnits();
        for (final LogUnit logUnit : logUnits) {
            final String word = logUnit.getWord();
            if (word == null) {
                // Digits outside words are a privacy threat.
                if (logUnit.mayContainDigit()) {
                    return false;
                }
            } else {
                // Words not in the dictionary are a privacy threat.
                if (ResearchLogger.hasLetters(word) && !(dictionary.isValidWord(word))) {
                    if (DEBUG) {
                        Log.d(TAG, "NOT SAFE!: hasLetters: " + ResearchLogger.hasLetters(word)
                                + ", isValid: " + (dictionary.isValidWord(word)));
                    }
                    return false;
                }
            }
        }
        // All checks have passed; this buffer's content can be safely uploaded.
        return true;
    }

    /**
     * Publishes a LogUnit that has been shifted out of the buffer, without private data.
     * Does nothing if no ResearchLog was supplied at construction.
     *
     * @param logUnit the unit leaving the buffer
     */
    @Override
    protected void onShiftOut(final LogUnit logUnit) {
        if (mResearchLog != null) {
            mResearchLog.publish(logUnit, false /* isIncludingPrivateData */);
        }
    }
}
ResearchLogging capture full n-gram data - Captures complete motion data for all words in an n-gram. - Also filters n-grams properly; if any word in the n-gram is not in the dictionary, it is not included. - Simplify ResearchLog to not require explicit state - Added LogBuffer class MainLogBuffer class to allow n-gram-level decisions about privacy. - Moved LogUnit out from ResearchLogger multi-project change with Ic70ccb6c2e11eb02d887821434b44daa3eb7a3e2 Bug: 6188932 Change-Id: I731d6cff6539e997874f723b68bdb0d9b651b937 2012-08-03 03:22:29 +00:00			`/*`
			`* Copyright (C) 2012 The Android Open Source Project`
			`*`
			`* Licensed under the Apache License, Version 2.0 (the "License"); you may not`
			`* use this file except in compliance with the License. You may obtain a copy of`
			`* the License at`
			`*`
			`* http://www.apache.org/licenses/LICENSE-2.0`
			`*`
			`* Unless required by applicable law or agreed to in writing, software`
			`* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT`
			`* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the`
			`* License for the specific language governing permissions and limitations under`
			`* the License.`
			`*/`

			`package com.android.inputmethod.research;`

ResearchLogger add debugging code Change-Id: I03729506984f259dee63f3a66fd91963e3403d16 2012-08-09 19:20:45 +00:00			`import android.util.Log;`

ResearchLogging capture full n-gram data - Captures complete motion data for all words in an n-gram. - Also filters n-grams properly; if any word in the n-gram is not in the dictionary, it is not included. - Simplify ResearchLog to not require explicit state - Added LogBuffer class MainLogBuffer class to allow n-gram-level decisions about privacy. - Moved LogUnit out from ResearchLogger multi-project change with Ic70ccb6c2e11eb02d887821434b44daa3eb7a3e2 Bug: 6188932 Change-Id: I731d6cff6539e997874f723b68bdb0d9b651b937 2012-08-03 03:22:29 +00:00			`import com.android.inputmethod.latin.Dictionary;`
			`import com.android.inputmethod.latin.Suggest;`
Add ProductionFlag.IS_EXPERIMENTAL_DEBUG The IS_EXPERIMENTAL_DEBUG flag gives a single place to turn off all debugging flags that might be set and to also enforce privacy controls. Currently only used in the research package. multi-project commit with I9275a7c8e40bf56106447a02d3056655329074b3 Change-Id: If769fe3a633f33963ca49e8ddf01ab24a30b6fd2 2012-12-18 02:19:58 +00:00			`import com.android.inputmethod.latin.define.ProductionFlag;`
ResearchLogging capture full n-gram data - Captures complete motion data for all words in an n-gram. - Also filters n-grams properly; if any word in the n-gram is not in the dictionary, it is not included. - Simplify ResearchLog to not require explicit state - Added LogBuffer class MainLogBuffer class to allow n-gram-level decisions about privacy. - Moved LogUnit out from ResearchLogger multi-project change with Ic70ccb6c2e11eb02d887821434b44daa3eb7a3e2 Bug: 6188932 Change-Id: I731d6cff6539e997874f723b68bdb0d9b651b937 2012-08-03 03:22:29 +00:00
[Rlog27] Refactor LogBuffer Cleanup and prepare for replaying Change-Id: Ie09e912c6e9c0d7375168c575ccf1cfd9375dd31 2012-12-23 18:40:34 +00:00			`import java.util.LinkedList;`
ResearchLogging capture full n-gram data - Captures complete motion data for all words in an n-gram. - Also filters n-grams properly; if any word in the n-gram is not in the dictionary, it is not included. - Simplify ResearchLog to not require explicit state - Added LogBuffer class MainLogBuffer class to allow n-gram-level decisions about privacy. - Moved LogUnit out from ResearchLogger multi-project change with Ic70ccb6c2e11eb02d887821434b44daa3eb7a3e2 Bug: 6188932 Change-Id: I731d6cff6539e997874f723b68bdb0d9b651b937 2012-08-03 03:22:29 +00:00			`import java.util.Random;`

[Rlog27] Refactor LogBuffer Cleanup and prepare for replaying Change-Id: Ie09e912c6e9c0d7375168c575ccf1cfd9375dd31 2012-12-23 18:40:34 +00:00			`/**`
			`* Provide a log buffer of fixed length that enforces privacy restrictions.`
			`*`
			`* The privacy restrictions include making sure that no numbers are logged, that all logged words`
			`* are in the dictionary, and that words are recorded infrequently enough that the user's meaning`
			`* cannot be easily determined.`
			`*/`
			`public class MainLogBuffer extends FixedLogBuffer {`
ResearchLogger add debugging code Change-Id: I03729506984f259dee63f3a66fd91963e3403d16 2012-08-09 19:20:45 +00:00			`private static final String TAG = MainLogBuffer.class.getSimpleName();`
Add ProductionFlag.IS_EXPERIMENTAL_DEBUG The IS_EXPERIMENTAL_DEBUG flag gives a single place to turn off all debugging flags that might be set and to also enforce privacy controls. Currently only used in the research package. multi-project commit with I9275a7c8e40bf56106447a02d3056655329074b3 Change-Id: If769fe3a633f33963ca49e8ddf01ab24a30b6fd2 2012-12-18 02:19:58 +00:00			`private static final boolean DEBUG = false && ProductionFlag.IS_EXPERIMENTAL_DEBUG;`
ResearchLogger add debugging code Change-Id: I03729506984f259dee63f3a66fd91963e3403d16 2012-08-09 19:20:45 +00:00
ResearchLogging capture full n-gram data - Captures complete motion data for all words in an n-gram. - Also filters n-grams properly; if any word in the n-gram is not in the dictionary, it is not included. - Simplify ResearchLog to not require explicit state - Added LogBuffer class MainLogBuffer class to allow n-gram-level decisions about privacy. - Moved LogUnit out from ResearchLogger multi-project change with Ic70ccb6c2e11eb02d887821434b44daa3eb7a3e2 Bug: 6188932 Change-Id: I731d6cff6539e997874f723b68bdb0d9b651b937 2012-08-03 03:22:29 +00:00			`// The size of the n-grams logged. E.g. N_GRAM_SIZE = 2 means to sample bigrams.`
			`private static final int N_GRAM_SIZE = 2;`
[Rlog27] Refactor LogBuffer Cleanup and prepare for replaying Change-Id: Ie09e912c6e9c0d7375168c575ccf1cfd9375dd31 2012-12-23 18:40:34 +00:00			`// The number of words between n-grams to omit from the log. If debugging, record 50% of all`
			`// words. Otherwise, only record 10%.`
Add ProductionFlag.IS_EXPERIMENTAL_DEBUG The IS_EXPERIMENTAL_DEBUG flag gives a single place to turn off all debugging flags that might be set and to also enforce privacy controls. Currently only used in the research package. multi-project commit with I9275a7c8e40bf56106447a02d3056655329074b3 Change-Id: If769fe3a633f33963ca49e8ddf01ab24a30b6fd2 2012-12-18 02:19:58 +00:00			`private static final int DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES =`
			`ProductionFlag.IS_EXPERIMENTAL_DEBUG ? 2 : 18;`
ResearchLogging capture full n-gram data - Captures complete motion data for all words in an n-gram. - Also filters n-grams properly; if any word in the n-gram is not in the dictionary, it is not included. - Simplify ResearchLog to not require explicit state - Added LogBuffer class MainLogBuffer class to allow n-gram-level decisions about privacy. - Moved LogUnit out from ResearchLogger multi-project change with Ic70ccb6c2e11eb02d887821434b44daa3eb7a3e2 Bug: 6188932 Change-Id: I731d6cff6539e997874f723b68bdb0d9b651b937 2012-08-03 03:22:29 +00:00
			`private final ResearchLog mResearchLog;`
			`private Suggest mSuggest;`

			`// The minimum periodicity with which n-grams can be sampled. E.g. mWinWordPeriod is 10 if`
			`// every 10th bigram is sampled, i.e., words 1-8 are not, but the bigram at words 9 and 10, etc.`
			`// for 11-18, and the bigram at words 19 and 20. If an n-gram is not safe (e.g. it contains a`
			`// number in the middle or an out-of-vocabulary word), then sampling is delayed until a safe`
			`// n-gram does appear.`
			`/* package for test */ int mMinWordPeriod;`

			`// Counter for words left to suppress before an n-gram can be sampled. Reset to mMinWordPeriod`
			`// after a sample is taken.`
			`/* package for test */ int mWordsUntilSafeToSample;`

			`public MainLogBuffer(final ResearchLog researchLog) {`
			`super(N_GRAM_SIZE);`
			`mResearchLog = researchLog;`
			`mMinWordPeriod = DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES + N_GRAM_SIZE;`
			`final Random random = new Random();`
			`mWordsUntilSafeToSample = random.nextInt(mMinWordPeriod);`
			`}`

[Rlog27] Refactor LogBuffer Cleanup and prepare for replaying Change-Id: Ie09e912c6e9c0d7375168c575ccf1cfd9375dd31 2012-12-23 18:40:34 +00:00			`public void setSuggest(final Suggest suggest) {`
ResearchLogging capture full n-gram data - Captures complete motion data for all words in an n-gram. - Also filters n-grams properly; if any word in the n-gram is not in the dictionary, it is not included. - Simplify ResearchLog to not require explicit state - Added LogBuffer class MainLogBuffer class to allow n-gram-level decisions about privacy. - Moved LogUnit out from ResearchLogger multi-project change with Ic70ccb6c2e11eb02d887821434b44daa3eb7a3e2 Bug: 6188932 Change-Id: I731d6cff6539e997874f723b68bdb0d9b651b937 2012-08-03 03:22:29 +00:00			`mSuggest = suggest;`
			`}`

			`@Override`
			`public void shiftIn(final LogUnit newLogUnit) {`
			`super.shiftIn(newLogUnit);`
			`if (newLogUnit.hasWord()) {`
			`if (mWordsUntilSafeToSample > 0) {`
			`mWordsUntilSafeToSample--;`
			`}`
			`}`
ResearchLogger add debugging code Change-Id: I03729506984f259dee63f3a66fd91963e3403d16 2012-08-09 19:20:45 +00:00			`if (DEBUG) {`
			`Log.d(TAG, "shiftedIn " + (newLogUnit.hasWord() ? newLogUnit.getWord() : ""));`
			`}`
ResearchLogging capture full n-gram data - Captures complete motion data for all words in an n-gram. - Also filters n-grams properly; if any word in the n-gram is not in the dictionary, it is not included. - Simplify ResearchLog to not require explicit state - Added LogBuffer class MainLogBuffer class to allow n-gram-level decisions about privacy. - Moved LogUnit out from ResearchLogger multi-project change with Ic70ccb6c2e11eb02d887821434b44daa3eb7a3e2 Bug: 6188932 Change-Id: I731d6cff6539e997874f723b68bdb0d9b651b937 2012-08-03 03:22:29 +00:00			`}`

			`public void resetWordCounter() {`
			`mWordsUntilSafeToSample = mMinWordPeriod;`
			`}`

			`/**`
			`* Determines whether the content of the MainLogBuffer can be safely uploaded in its complete`
			`* form and still protect the user's privacy.`
			`*`
			`* The size of the MainLogBuffer is just enough to hold one n-gram, its corrections, and any`
			`* non-character data that is typed between words. The decision about privacy is made based on`
			`* the buffer's entire content. If it is decided that the privacy risks are too great to upload`
			`* the contents of this buffer, a censored version of the LogItems may still be uploaded. E.g.,`
			`* the screen orientation and other characteristics about the device can be uploaded without`
			`* revealing much about the user.`
			`*/`
			`public boolean isSafeToLog() {`
			`// Check that we are not sampling too frequently. Having sampled recently might disclose`
			`// too much of the user's intended meaning.`
			`if (mWordsUntilSafeToSample > 0) {`
			`return false;`
			`}`
			`if (mSuggest == null \|\| !mSuggest.hasMainDictionary()) {`
			`// Main dictionary is unavailable. Since we cannot check it, we cannot tell if a word`
			`// is out-of-vocabulary or not. Therefore, we must judge the entire buffer contents to`
			`// potentially pose a privacy risk.`
			`return false;`
			`}`
			`// Reload the dictionary in case it has changed (e.g., because the user has changed`
			`// languages).`
			`final Dictionary dictionary = mSuggest.getMainDictionary();`
			`if (dictionary == null) {`
			`return false;`
			`}`
			`// Check each word in the buffer. If any word poses a privacy threat, we cannot upload the`
			`// complete buffer contents in detail.`
[Rlog27] Refactor LogBuffer Cleanup and prepare for replaying Change-Id: Ie09e912c6e9c0d7375168c575ccf1cfd9375dd31 2012-12-23 18:40:34 +00:00			`final LinkedList<LogUnit> logUnits = getLogUnits();`
			`final int length = logUnits.size();`
ResearchLogging capture full n-gram data - Captures complete motion data for all words in an n-gram. - Also filters n-grams properly; if any word in the n-gram is not in the dictionary, it is not included. - Simplify ResearchLog to not require explicit state - Added LogBuffer class MainLogBuffer class to allow n-gram-level decisions about privacy. - Moved LogUnit out from ResearchLogger multi-project change with Ic70ccb6c2e11eb02d887821434b44daa3eb7a3e2 Bug: 6188932 Change-Id: I731d6cff6539e997874f723b68bdb0d9b651b937 2012-08-03 03:22:29 +00:00			`for (int i = 0; i < length; i++) {`
[Rlog27] Refactor LogBuffer Cleanup and prepare for replaying Change-Id: Ie09e912c6e9c0d7375168c575ccf1cfd9375dd31 2012-12-23 18:40:34 +00:00			`final LogUnit logUnit = logUnits.get(i);`
ResearchLogging capture full n-gram data - Captures complete motion data for all words in an n-gram. - Also filters n-grams properly; if any word in the n-gram is not in the dictionary, it is not included. - Simplify ResearchLog to not require explicit state - Added LogBuffer class MainLogBuffer class to allow n-gram-level decisions about privacy. - Moved LogUnit out from ResearchLogger multi-project change with Ic70ccb6c2e11eb02d887821434b44daa3eb7a3e2 Bug: 6188932 Change-Id: I731d6cff6539e997874f723b68bdb0d9b651b937 2012-08-03 03:22:29 +00:00			`final String word = logUnit.getWord();`
			`if (word == null) {`
			`// Digits outside words are a privacy threat.`
[Rlog1] Track time of log statements Log statements are now recorded with a timestamp. This is important for filtering out statements not part of words that are sampled when spaces are inserted automatically. multi-project commit with Change-Id: I68221a2528045d25632aef4bb716f92a4f4a56a4 Change-Id: I46ac9b3b1dcbb08425160d0109028cb64445139c 2012-08-09 22:58:25 +00:00			`if (logUnit.mayContainDigit()) {`
ResearchLogging capture full n-gram data - Captures complete motion data for all words in an n-gram. - Also filters n-grams properly; if any word in the n-gram is not in the dictionary, it is not included. - Simplify ResearchLog to not require explicit state - Added LogBuffer class MainLogBuffer class to allow n-gram-level decisions about privacy. - Moved LogUnit out from ResearchLogger multi-project change with Ic70ccb6c2e11eb02d887821434b44daa3eb7a3e2 Bug: 6188932 Change-Id: I731d6cff6539e997874f723b68bdb0d9b651b937 2012-08-03 03:22:29 +00:00			`return false;`
			`}`
			`} else {`
			`// Words not in the dictionary are a privacy threat.`
[Rlog2] ResearchLogging fix multi-space logging multi-space logging should look like single-space logging, missing a few minor log statements (SuggestionUpdates, SetComposingText) multi-project commit with I2af842348c2f2b8f7271ac5b63def245e83df24d Change-Id: Icd3187c0d0377255f82787afffea657c14345803 2012-08-10 08:54:06 +00:00			`if (ResearchLogger.hasLetters(word) && !(dictionary.isValidWord(word))) {`
[Rlog4] ResearchLogger add LOG_EVERYTHING flag - broaden OUTPUT_WHOLE_BUFFER to log not just the buffer at the end, but everything along the way. This should only be set when the user is aware that logging is on, e.g. in a user test. Change-Id: I8f9874697524e533586da40d0a3e452f6a04d3e4 2012-08-12 20:54:53 +00:00			`if (DEBUG) {`
			`Log.d(TAG, "NOT SAFE!: hasLetters: " + ResearchLogger.hasLetters(word)`
			`+ ", isValid: " + (dictionary.isValidWord(word)));`
			`}`
ResearchLogging capture full n-gram data - Captures complete motion data for all words in an n-gram. - Also filters n-grams properly; if any word in the n-gram is not in the dictionary, it is not included. - Simplify ResearchLog to not require explicit state - Added LogBuffer class MainLogBuffer class to allow n-gram-level decisions about privacy. - Moved LogUnit out from ResearchLogger multi-project change with Ic70ccb6c2e11eb02d887821434b44daa3eb7a3e2 Bug: 6188932 Change-Id: I731d6cff6539e997874f723b68bdb0d9b651b937 2012-08-03 03:22:29 +00:00			`return false;`
			`}`
			`}`
			`}`
			`// All checks have passed; this buffer's content can be safely uploaded.`
			`return true;`
			`}`

			`@Override`
[Rlog27] Refactor LogBuffer Cleanup and prepare for replaying Change-Id: Ie09e912c6e9c0d7375168c575ccf1cfd9375dd31 2012-12-23 18:40:34 +00:00			`protected void onShiftOut(final LogUnit logUnit) {`
ResearchLogging capture full n-gram data - Captures complete motion data for all words in an n-gram. - Also filters n-grams properly; if any word in the n-gram is not in the dictionary, it is not included. - Simplify ResearchLog to not require explicit state - Added LogBuffer class MainLogBuffer class to allow n-gram-level decisions about privacy. - Moved LogUnit out from ResearchLogger multi-project change with Ic70ccb6c2e11eb02d887821434b44daa3eb7a3e2 Bug: 6188932 Change-Id: I731d6cff6539e997874f723b68bdb0d9b651b937 2012-08-03 03:22:29 +00:00			`if (mResearchLog != null) {`
			`mResearchLog.publish(logUnit, false /* isIncludingPrivateData */);`
			`}`
			`}`
			`}`