Allow LogUnits to hold >1 word

LogUnits have been annotated with the autocorrected words, but until now this was assumed to be a single word without spaces. However, spaceless typing can result in spaces in the LogUnit label. With this change, the LogUnit inspects the autocorrected text to determine how many words were inserted, and counts them accurately.

This change corrects a privacy problem: if the word sampling algorithm chose a LogUnit that actually contained multiple words, then more than two successive words would be included in the log.

Change-Id: I7c01c3dd3ac33d7e96c00836256bae9c14b124ed
2013-04-15 18:41:59 -07:00 · 2013-04-15 18:41:59 -07:00 · e92b5e145f
commit e92b5e145f
parent 56f35a10cd
4 changed files with 147 additions and 86 deletions
--- a/java/src/com/android/inputmethod/research/FixedLogBuffer.java
+++ b/java/src/com/android/inputmethod/research/FixedLogBuffer.java
@ -57,28 +57,29 @@ public class FixedLogBuffer extends LogBuffer {
     */
    @Override
    public void shiftIn(final LogUnit newLogUnit) {
-        if (!newLogUnit.hasWord()) {
-            // This LogUnit isn't a word, so it doesn't count toward the word-limit.
+        if (!newLogUnit.hasOneOrMoreWords()) {
+            // This LogUnit doesn't contain any word, so it doesn't count toward the word-limit.
            super.shiftIn(newLogUnit);
            return;
        }
+        final int numWordsIncoming = newLogUnit.getNumWords();
        if (mNumActualWords >= mWordCapacity) {
            // Give subclass a chance to handle the buffer full condition by shifting out logUnits.
            onBufferFull();
            // If still full, evict.
            if (mNumActualWords >= mWordCapacity) {
-                shiftOutWords(1);
+                shiftOutWords(numWordsIncoming);
            }
        }
        super.shiftIn(newLogUnit);
-        mNumActualWords++; // Must be a word, or we wouldn't be here.
+        mNumActualWords += numWordsIncoming;
    }

    @Override
    public LogUnit unshiftIn() {
        final LogUnit logUnit = super.unshiftIn();
-        if (logUnit != null && logUnit.hasWord()) {
-            mNumActualWords--;
+        if (logUnit != null && logUnit.hasOneOrMoreWords()) {
+            mNumActualWords -= logUnit.getNumWords();
        }
        return logUnit;
    }
@ -109,8 +110,8 @@ public class FixedLogBuffer extends LogBuffer {
    @Override
    public LogUnit shiftOut() {
        final LogUnit logUnit = super.shiftOut();
-        if (logUnit != null && logUnit.hasWord()) {
-            mNumActualWords--;
+        if (logUnit != null && logUnit.hasOneOrMoreWords()) {
+            mNumActualWords -= logUnit.getNumWords();
        }
        return logUnit;
    }
@ -121,15 +122,15 @@ public class FixedLogBuffer extends LogBuffer {
     * If there are less than {@code numWords} word-containing {@link LogUnit}s, shifts out
     * all {@code LogUnit}s in the buffer.
     *
-     * @param numWords the number of word-containing {@link LogUnit}s to shift out
+     * @param numWords the minimum number of word-containing {@link LogUnit}s to shift out
     * @return the number of actual {@code LogUnit}s shifted out
     */
    protected int shiftOutWords(final int numWords) {
        int numWordContainingLogUnitsShiftedOut = 0;
        for (LogUnit logUnit = shiftOut(); logUnit != null
                && numWordContainingLogUnitsShiftedOut < numWords; logUnit = shiftOut()) {
-            if (logUnit.hasWord()) {
-                numWordContainingLogUnitsShiftedOut++;
+            if (logUnit.hasOneOrMoreWords()) {
+                numWordContainingLogUnitsShiftedOut += logUnit.getNumWords();
            }
        }
        return numWordContainingLogUnitsShiftedOut;
@ -144,27 +145,31 @@ public class FixedLogBuffer extends LogBuffer {
    }

    /**
-     * Returns a list of {@link LogUnit}s at the front of the buffer that have associated words.  No
-     * more than {@code n} LogUnits will have words associated with them.  If there are not enough
-     * LogUnits in the buffer to meet the word requirement, returns the all LogUnits.
+     * Returns a list of {@link LogUnit}s at the front of the buffer that have words associated with
+     * them.
+     *
+     * There will be no more than {@code n} words in the returned list.  So if 2 words are
+     * requested, and the first LogUnit has 3 words, it is not returned.  If 2 words are requested,
+     * and the first LogUnit has only 1 word, and the next LogUnit 2 words, only the first LogUnit
+     * is returned.  If the first LogUnit has no words associated with it, and the second LogUnit
+     * has three words, then only the first LogUnit (which has no associated words) is returned.  If
+     * there are not enough LogUnits in the buffer to meet the word requirement, then all LogUnits
+     * will be returned.
     *
     * @param n The maximum number of {@link LogUnit}s with words to return.
     * @return The list of the {@link LogUnit}s containing the first n words
     */
    public ArrayList<LogUnit> peekAtFirstNWords(int n) {
        final LinkedList<LogUnit> logUnits = getLogUnits();
-        final int length = logUnits.size();
        // Allocate space for n*2 logUnits.  There will be at least n, one for each word, and
        // there may be additional for punctuation, between-word commands, etc.  This should be
        // enough that reallocation won't be necessary.
-        final ArrayList<LogUnit> list = new ArrayList<LogUnit>(n * 2);
-        for (int i = 0; i < length && n > 0; i++) {
-            final LogUnit logUnit = logUnits.get(i);
-            list.add(logUnit);
-            if (logUnit.hasWord()) {
-                n--;
-            }
+        final ArrayList<LogUnit> resultList = new ArrayList<LogUnit>(n * 2);
+        for (final LogUnit logUnit : logUnits) {
+            n -= logUnit.getNumWords();
+            if (n < 0) break;
+            resultList.add(logUnit);
        }
-        return list;
+        return resultList;
    }
 }
--- a/java/src/com/android/inputmethod/research/LogUnit.java
+++ b/java/src/com/android/inputmethod/research/LogUnit.java
@ -25,10 +25,10 @@ import com.android.inputmethod.latin.SuggestedWords;
 import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo;
 import com.android.inputmethod.latin.define.ProductionFlag;

-import java.io.IOException;
-import java.io.StringWriter;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
+import java.util.regex.Pattern;

 /**
 * A group of log statements related to each other.
@ -49,27 +49,45 @@ public class LogUnit {
    private static final boolean DEBUG = false
            && ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS_DEBUG;

+    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
+    private static final String[] EMPTY_STRING_ARRAY = new String[0];
+
    private final ArrayList<LogStatement> mLogStatementList;
    private final ArrayList<Object[]> mValuesList;
    // Assume that mTimeList is sorted in increasing order.  Do not insert null values into
    // mTimeList.
    private final ArrayList<Long> mTimeList;
-    // Word that this LogUnit generates.  Should be null if the LogUnit does not generate a genuine
-    // word (i.e. separators alone do not count as a word).  Should never be empty.
-    private String mWord;
+    // Words that this LogUnit generates.  Should be null if the data in the LogUnit does not
+    // generate a genuine word (i.e. separators alone do not count as a word).  Should never be
+    // empty.  Note that if the user types spaces explicitly, then normally mWords should contain
+    // only a single word; it will only contain multiple space-separated words if the user does not
+    // enter a space, and the system enters one automatically.
+    private String mWords;
+    private String[] mWordArray = EMPTY_STRING_ARRAY;
    private boolean mMayContainDigit;
    private boolean mIsPartOfMegaword;
    private boolean mContainsCorrection;

-    // mCorrectionType indicates whether the word was corrected at all, and if so, whether it was
-    // to a different word or just a "typo" correction.  It is considered a "typo" if the final
-    // word was listed in the suggestions available the first time the word was gestured or
-    // tapped.
+    // mCorrectionType indicates whether the word was corrected at all, and if so, the nature of the
+    // correction.
    private int mCorrectionType;
+    // LogUnits start in this state.  If a word is entered without being corrected, it will have
+    // this correction type.
    public static final int CORRECTIONTYPE_NO_CORRECTION = 0;
+    // The LogUnit was corrected manually by the user in an unspecified way.
    public static final int CORRECTIONTYPE_CORRECTION = 1;
+    // The LogUnit was corrected manually by the user to a word not in the list of suggestions of
+    // the first word typed here.  (Note: this is a heuristic value, it may be incorrect, for
+    // example, if the user repositions the cursor).
    public static final int CORRECTIONTYPE_DIFFERENT_WORD = 2;
+    // The LogUnit was corrected manually by the user to a word that was in the list of suggestions
+    // of the first word typed here.  (Again, a heuristic).  It is probably a typo correction.
    public static final int CORRECTIONTYPE_TYPO = 3;
+    // TODO: Rather than just tracking the current state, keep a historical record of the LogUnit's
+    // state and statistics.  This should include how many times it has been corrected, whether
+    // other LogUnit edits were done between edits to this LogUnit, etc.  Also track when a LogUnit
+    // previously contained a word, but was corrected to empty (because it was deleted, and there is
+    // no known replacement).

    private SuggestedWords mSuggestedWords;

@ -166,7 +184,7 @@ public class LogUnit {
        final LogStatement logStatement;
        if (canIncludePrivateData) {
            LOGSTATEMENT_LOG_UNIT_BEGIN_WITH_PRIVATE_DATA.outputToLocked(jsonWriter,
-                    SystemClock.uptimeMillis(), getWord(), getCorrectionType());
+                    SystemClock.uptimeMillis(), getWordsAsString(), getCorrectionType());
        } else {
            LOGSTATEMENT_LOG_UNIT_BEGIN_WITHOUT_PRIVATE_DATA.outputToLocked(jsonWriter,
                    SystemClock.uptimeMillis());
@ -181,22 +199,22 @@ public class LogUnit {
    }

    /**
-     * Mark the current logUnit as containing data to generate {@code word}.
+     * Mark the current logUnit as containing data to generate {@code newWords}.
     *
     * If {@code setWord()} was previously called for this LogUnit, then the method will try to
     * determine what kind of correction it is, and update its internal state of the correctionType
     * accordingly.
     *
-     * @param word The word this LogUnit generates.  Caller should not pass null or the empty
+     * @param newWords The words this LogUnit generates.  Caller should not pass null or the empty
     * string.
     */
-    public void setWord(final String word) {
-        if (hasWord()) {
+    public void setWords(final String newWords) {
+        if (hasOneOrMoreWords()) {
            // The word was already set once, and it is now being changed.  See if the new word
            // is close to the old word.  If so, then the change is probably a typo correction.
            // If not, the user may have decided to enter a different word, so flag it.
            if (mSuggestedWords != null) {
-                if (isInSuggestedWords(word, mSuggestedWords)) {
+                if (isInSuggestedWords(newWords, mSuggestedWords)) {
                    mCorrectionType = CORRECTIONTYPE_TYPO;
                } else {
                    mCorrectionType = CORRECTIONTYPE_DIFFERENT_WORD;
@ -206,38 +224,71 @@ public class LogUnit {
                // Mark it as a generic correction.
                mCorrectionType = CORRECTIONTYPE_CORRECTION;
            }
+        } else {
+            mCorrectionType = CORRECTIONTYPE_NO_CORRECTION;
+        }
+        mWords = newWords;
+
+        // Update mWordArray
+        mWordArray = (TextUtils.isEmpty(mWords)) ? EMPTY_STRING_ARRAY
+                : WHITESPACE_PATTERN.split(mWords);
+        if (mWordArray.length > 0 && TextUtils.isEmpty(mWordArray[0])) {
+            // Empty string at beginning of array.  Must have been whitespace at the start of the
+            // word.  Remove the empty string.
+            mWordArray = Arrays.copyOfRange(mWordArray, 1, mWordArray.length);
        }
-        mWord = word;
    }

-    public String getWord() {
-        return mWord;
+    public String getWordsAsString() {
+        return mWords;
    }

-    public boolean hasWord() {
-        return mWord != null && !TextUtils.isEmpty(mWord.trim());
+    /**
+     * Returns the words generated by the data in this LogUnit.
+     *
+     * The first word may be an empty string, if the data in the LogUnit started by generating
+     * whitespace.
+     *
+     * @return the array of words, or an empty array if none are associated with this LogUnit.
+     */
+    public String[] getWordsAsStringArray() {
+        return mWordArray;
    }

+    public boolean hasOneOrMoreWords() {
+        return mWordArray.length >= 1;
+    }
+
+    public int getNumWords() {
+        return mWordArray.length;
+    }
+
+    // TODO: Refactor to eliminate getter/setters
    public void setMayContainDigit() {
        mMayContainDigit = true;
    }

+    // TODO: Refactor to eliminate getter/setters
    public boolean mayContainDigit() {
        return mMayContainDigit;
    }

+    // TODO: Refactor to eliminate getter/setters
    public void setContainsCorrection() {
        mContainsCorrection = true;
    }

+    // TODO: Refactor to eliminate getter/setters
    public boolean containsCorrection() {
        return mContainsCorrection;
    }

+    // TODO: Refactor to eliminate getter/setters
    public void setCorrectionType(final int correctionType) {
        mCorrectionType = correctionType;
    }

+    // TODO: Refactor to eliminate getter/setters
    public int getCorrectionType() {
        return mCorrectionType;
    }
@ -267,7 +318,7 @@ public class LogUnit {
                        new ArrayList<Object[]>(laterValues),
                        new ArrayList<Long>(laterTimes),
                        true /* isPartOfMegaword */);
-                newLogUnit.mWord = null;
+                newLogUnit.mWords = null;
                newLogUnit.mMayContainDigit = mMayContainDigit;
                newLogUnit.mContainsCorrection = mContainsCorrection;

@ -287,9 +338,9 @@ public class LogUnit {
        mLogStatementList.addAll(logUnit.mLogStatementList);
        mValuesList.addAll(logUnit.mValuesList);
        mTimeList.addAll(logUnit.mTimeList);
-        mWord = null;
-        if (logUnit.mWord != null) {
-            setWord(logUnit.mWord);
+        mWords = null;
+        if (logUnit.mWords != null) {
+            setWords(logUnit.mWords);
        }
        mMayContainDigit = mMayContainDigit || logUnit.mMayContainDigit;
        mContainsCorrection = mContainsCorrection || logUnit.mContainsCorrection;
--- a/java/src/com/android/inputmethod/research/MainLogBuffer.java
+++ b/java/src/com/android/inputmethod/research/MainLogBuffer.java
@ -126,10 +126,7 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
            final int length = logUnits.size();
            for (int i = 0; i < length; i++) {
                final LogUnit logUnit = logUnits.get(i);
-                final String word = logUnit.getWord();
-                if (word != null) {
-                    numWordsInLogUnitList++;
-                }
+                numWordsInLogUnitList += logUnit.getNumWords();
            }
            return numWordsInLogUnitList >= minNGramSize;
        }
@ -153,29 +150,31 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
        // the complete buffer contents in detail.
        int numWordsInLogUnitList = 0;
        final int length = logUnits.size();
-        for (int i = 0; i < length; i++) {
-            final LogUnit logUnit = logUnits.get(i);
-            if (!logUnit.hasWord()) {
+        for (final LogUnit logUnit : logUnits) {
+            if (!logUnit.hasOneOrMoreWords()) {
                // Digits outside words are a privacy threat.
                if (logUnit.mayContainDigit()) {
                    return false;
                }
            } else {
-                numWordsInLogUnitList++;
-                final String word = logUnit.getWord();
-                // Words not in the dictionary are a privacy threat.
-                if (ResearchLogger.hasLetters(word) && !(dictionary.isValidWord(word))) {
-                    if (DEBUG) {
-                        Log.d(TAG, "NOT SAFE!: hasLetters: " + ResearchLogger.hasLetters(word)
-                                + ", isValid: " + (dictionary.isValidWord(word)));
+                numWordsInLogUnitList += logUnit.getNumWords();
+                final String[] words = logUnit.getWordsAsStringArray();
+                for (final String word : words) {
+                    // Words not in the dictionary are a privacy threat.
+                    if (ResearchLogger.hasLetters(word) && !(dictionary.isValidWord(word))) {
+                        if (DEBUG) {
+                            Log.d(TAG, "\"" + word + "\" NOT SAFE!: hasLetters: "
+                                    + ResearchLogger.hasLetters(word)
+                                    + ", isValid: " + (dictionary.isValidWord(word)));
+                        }
+                        return false;
                    }
-                    return false;
                }
            }
        }

-        // Finally, only return true if the minNGramSize is met.
-        return numWordsInLogUnitList >= minNGramSize;
+        // Finally, only return true if the ngram is the right size.
+        return numWordsInLogUnitList == minNGramSize;
    }

    public void shiftAndPublishAll() {
@ -198,11 +197,14 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
            shiftOutWords(N_GRAM_SIZE);
            mNumWordsUntilSafeToSample = mNumWordsBetweenNGrams;
        } else {
-            // No good n-gram at front, and buffer is full.  Shift out the first word (or if there
-            // is none, the existing logUnits).
-            logUnits = peekAtFirstNWords(1);
+            // No good n-gram at front, and buffer is full.  Shift out up through the first logUnit
+            // with associated words (or if there is none, all the existing logUnits).
+            logUnits.clear();
+            for (LogUnit logUnit = shiftOut(); logUnit != null && !logUnit.hasOneOrMoreWords();
+                    logUnit = shiftOut()) {
+                logUnits.add(logUnit);
+            }
            publish(logUnits, false /* canIncludePrivateData */);
-            shiftOutWords(1);
        }
    }

--- a/java/src/com/android/inputmethod/research/ResearchLogger.java
+++ b/java/src/com/android/inputmethod/research/ResearchLogger.java
@ -397,13 +397,14 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
                protected void publish(final ArrayList<LogUnit> logUnits,
                        boolean canIncludePrivateData) {
                    canIncludePrivateData |= IS_LOGGING_EVERYTHING;
-                    final int length = logUnits.size();
-                    for (int i = 0; i < length; i++) {
-                        final LogUnit logUnit = logUnits.get(i);
-                        final String word = logUnit.getWord();
-                        if (word != null && word.length() > 0 && hasLetters(word)) {
-                            Log.d(TAG, "onPublish: " + word + ", hc: "
-                                    + logUnit.containsCorrection());
+                    for (final LogUnit logUnit : logUnits) {
+                        if (DEBUG) {
+                            final String wordsString = logUnit.getWordsAsString();
+                            Log.d(TAG, "onPublish: '" + wordsString
+                                    + "', hc: " + logUnit.containsCorrection()
+                                    + ", cipd: " + canIncludePrivateData);
+                        }
+                        for (final String word : logUnit.getWordsAsStringArray()) {
                            final Dictionary dictionary = getDictionary();
                            mStatistics.recordWordEntered(
                                    dictionary != null && dictionary.isValidWord(word),
@ -852,8 +853,8 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang

    /* package for test */ void commitCurrentLogUnit() {
        if (DEBUG) {
-            Log.d(TAG, "commitCurrentLogUnit" + (mCurrentLogUnit.hasWord() ?
-                    ": " + mCurrentLogUnit.getWord() : ""));
+            Log.d(TAG, "commitCurrentLogUnit" + (mCurrentLogUnit.hasOneOrMoreWords() ?
+                    ": " + mCurrentLogUnit.getWordsAsString() : ""));
        }
        if (!mCurrentLogUnit.isEmpty()) {
            if (mMainLogBuffer != null) {
@ -893,8 +894,8 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang

        // Check that expected word matches.
        if (oldLogUnit != null) {
-            final String oldLogUnitWord = oldLogUnit.getWord();
-            if (oldLogUnitWord != null && !oldLogUnitWord.equals(expectedWord)) {
+            final String oldLogUnitWords = oldLogUnit.getWordsAsString();
+            if (oldLogUnitWords != null && !oldLogUnitWords.equals(expectedWord)) {
                return;
            }
        }
@ -916,7 +917,8 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
        enqueueEvent(LOGSTATEMENT_UNCOMMIT_CURRENT_LOGUNIT);
        if (DEBUG) {
            Log.d(TAG, "uncommitCurrentLogUnit (dump=" + dumpCurrentLogUnit + ") back to "
-                    + (mCurrentLogUnit.hasWord() ? ": '" + mCurrentLogUnit.getWord() + "'" : ""));
+                    + (mCurrentLogUnit.hasOneOrMoreWords() ? ": '"
+                        + mCurrentLogUnit.getWordsAsString() + "'" : ""));
        }
    }

@ -950,8 +952,9 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
        }
        for (LogUnit logUnit : logUnits) {
            if (DEBUG) {
-                Log.d(TAG, "publishLogBuffer: " + (logUnit.hasWord() ? logUnit.getWord()
-                        : "<wordless>") + ", correction?: " + logUnit.containsCorrection());
+                Log.d(TAG, "publishLogBuffer: " + (logUnit.hasOneOrMoreWords()
+                        ? logUnit.getWordsAsString() : "<wordless>")
+                        + ", correction?: " + logUnit.containsCorrection());
            }
            researchLog.publish(logUnit, canIncludePrivateData);
        }
@ -986,7 +989,7 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
            return;
        }
        if (word.length() > 0 && hasLetters(word)) {
-            mCurrentLogUnit.setWord(word);
+            mCurrentLogUnit.setWords(word);
        }
        final LogUnit newLogUnit = mCurrentLogUnit.splitByTime(maxTime);
        enqueueCommitText(word, isBatchMode);
@ -1478,7 +1481,7 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
        }
        if (originallyTypedWord.length() > 0 && hasLetters(originallyTypedWord)) {
            if (logUnit != null) {
-                logUnit.setWord(originallyTypedWord);
+                logUnit.setWords(originallyTypedWord);
            }
        }
        researchLogger.enqueueEvent(logUnit != null ? logUnit : researchLogger.mCurrentLogUnit,
@ -1616,7 +1619,7 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
     * Log a call to LatinIME.commitCurrentAutoCorrection().
     *
     * SystemResponse: The IME has committed an auto-correction.  An auto-correction changes the raw
-     * text input to another word that the user more likely desired to type.
+     * text input to another word (or words) that the user more likely desired to type.
     */
    private static final LogStatement LOGSTATEMENT_LATINIME_COMMITCURRENTAUTOCORRECTION =
            new LogStatement("LatinIMECommitCurrentAutoCorrection", true, true, "typedWord",
@ -1827,7 +1830,7 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang
            final int enteredWordPos, final SuggestedWords suggestedWords) {
        final ResearchLogger researchLogger = getInstance();
        if (!TextUtils.isEmpty(enteredText) && hasLetters(enteredText.toString())) {
-            researchLogger.mCurrentLogUnit.setWord(enteredText.toString());
+            researchLogger.mCurrentLogUnit.setWords(enteredText.toString());
        }
        researchLogger.enqueueEvent(LOGSTATEMENT_LATINIME_ONENDBATCHINPUT, enteredText,
                enteredWordPos);