diff --git a/java/src/com/android/inputmethod/research/FixedLogBuffer.java b/java/src/com/android/inputmethod/research/FixedLogBuffer.java index 641bf7eae..4249af544 100644 --- a/java/src/com/android/inputmethod/research/FixedLogBuffer.java +++ b/java/src/com/android/inputmethod/research/FixedLogBuffer.java @@ -57,28 +57,29 @@ public class FixedLogBuffer extends LogBuffer { */ @Override public void shiftIn(final LogUnit newLogUnit) { - if (!newLogUnit.hasWord()) { - // This LogUnit isn't a word, so it doesn't count toward the word-limit. + if (!newLogUnit.hasOneOrMoreWords()) { + // This LogUnit doesn't contain any word, so it doesn't count toward the word-limit. super.shiftIn(newLogUnit); return; } + final int numWordsIncoming = newLogUnit.getNumWords(); if (mNumActualWords >= mWordCapacity) { // Give subclass a chance to handle the buffer full condition by shifting out logUnits. onBufferFull(); // If still full, evict. if (mNumActualWords >= mWordCapacity) { - shiftOutWords(1); + shiftOutWords(numWordsIncoming); } } super.shiftIn(newLogUnit); - mNumActualWords++; // Must be a word, or we wouldn't be here. + mNumActualWords += numWordsIncoming; } @Override public LogUnit unshiftIn() { final LogUnit logUnit = super.unshiftIn(); - if (logUnit != null && logUnit.hasWord()) { - mNumActualWords--; + if (logUnit != null && logUnit.hasOneOrMoreWords()) { + mNumActualWords -= logUnit.getNumWords(); } return logUnit; } @@ -109,8 +110,8 @@ public class FixedLogBuffer extends LogBuffer { @Override public LogUnit shiftOut() { final LogUnit logUnit = super.shiftOut(); - if (logUnit != null && logUnit.hasWord()) { - mNumActualWords--; + if (logUnit != null && logUnit.hasOneOrMoreWords()) { + mNumActualWords -= logUnit.getNumWords(); } return logUnit; } @@ -121,15 +122,15 @@ public class FixedLogBuffer extends LogBuffer { * If there are less than {@code numWords} word-containing {@link LogUnit}s, shifts out * all {@code LogUnit}s in the buffer. 
* - * @param numWords the number of word-containing {@link LogUnit}s to shift out + * @param numWords the minimum number of words to shift out * @return the number of actual {@code LogUnit}s shifted out */ protected int shiftOutWords(final int numWords) { int numWordContainingLogUnitsShiftedOut = 0; for (LogUnit logUnit = shiftOut(); logUnit != null && numWordContainingLogUnitsShiftedOut < numWords; logUnit = shiftOut()) { - if (logUnit.hasWord()) { - numWordContainingLogUnitsShiftedOut++; + if (logUnit.hasOneOrMoreWords()) { + numWordContainingLogUnitsShiftedOut += logUnit.getNumWords(); } } return numWordContainingLogUnitsShiftedOut; @@ -144,27 +145,31 @@ public class FixedLogBuffer extends LogBuffer { } /** - * Returns a list of {@link LogUnit}s at the front of the buffer that have associated words. No - * more than {@code n} LogUnits will have words associated with them. If there are not enough - * LogUnits in the buffer to meet the word requirement, returns the all LogUnits. + * Returns a list of {@link LogUnit}s at the front of the buffer that have words associated with + * them. + * + * There will be no more than {@code n} words in the returned list. So if 2 words are + * requested, and the first LogUnit has 3 words, it is not returned. If 2 words are requested, + * and the first LogUnit has only 1 word, and the next LogUnit 2 words, only the first LogUnit + * is returned. If the first LogUnit has no words associated with it, and the second LogUnit + * has three words, then only the first LogUnit (which has no associated words) is returned. If + * there are not enough LogUnits in the buffer to meet the word requirement, then all LogUnits + * will be returned. * * @param n The maximum number of {@link LogUnit}s with words to return. 
* @return The list of the {@link LogUnit}s containing the first n words */ public ArrayList peekAtFirstNWords(int n) { final LinkedList logUnits = getLogUnits(); - final int length = logUnits.size(); // Allocate space for n*2 logUnits. There will be at least n, one for each word, and // there may be additional for punctuation, between-word commands, etc. This should be // enough that reallocation won't be necessary. - final ArrayList list = new ArrayList(n * 2); - for (int i = 0; i < length && n > 0; i++) { - final LogUnit logUnit = logUnits.get(i); - list.add(logUnit); - if (logUnit.hasWord()) { - n--; - } + final ArrayList resultList = new ArrayList(n * 2); + for (final LogUnit logUnit : logUnits) { + n -= logUnit.getNumWords(); + if (n < 0) break; + resultList.add(logUnit); } - return list; + return resultList; } } diff --git a/java/src/com/android/inputmethod/research/LogUnit.java b/java/src/com/android/inputmethod/research/LogUnit.java index 1c01675bd..4d60bda53 100644 --- a/java/src/com/android/inputmethod/research/LogUnit.java +++ b/java/src/com/android/inputmethod/research/LogUnit.java @@ -25,10 +25,10 @@ import com.android.inputmethod.latin.SuggestedWords; import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo; import com.android.inputmethod.latin.define.ProductionFlag; -import java.io.IOException; -import java.io.StringWriter; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; +import java.util.regex.Pattern; /** * A group of log statements related to each other. @@ -49,27 +49,45 @@ public class LogUnit { private static final boolean DEBUG = false && ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS_DEBUG; + private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+"); + private static final String[] EMPTY_STRING_ARRAY = new String[0]; + private final ArrayList mLogStatementList; private final ArrayList mValuesList; // Assume that mTimeList is sorted in increasing order. 
Do not insert null values into // mTimeList. private final ArrayList mTimeList; - // Word that this LogUnit generates. Should be null if the LogUnit does not generate a genuine - // word (i.e. separators alone do not count as a word). Should never be empty. - private String mWord; + // Words that this LogUnit generates. Should be null if the data in the LogUnit does not + // generate a genuine word (i.e. separators alone do not count as a word). Should never be + // empty. Note that if the user types spaces explicitly, then normally mWords should contain + // only a single word; it will only contain multiple space-separated words if the user does not + // enter a space, and the system enters one automatically. + private String mWords; + private String[] mWordArray = EMPTY_STRING_ARRAY; private boolean mMayContainDigit; private boolean mIsPartOfMegaword; private boolean mContainsCorrection; - // mCorrectionType indicates whether the word was corrected at all, and if so, whether it was - // to a different word or just a "typo" correction. It is considered a "typo" if the final - // word was listed in the suggestions available the first time the word was gestured or - // tapped. + // mCorrectionType indicates whether the word was corrected at all, and if so, the nature of the + // correction. private int mCorrectionType; + // LogUnits start in this state. If a word is entered without being corrected, it will have + // this correction type. public static final int CORRECTIONTYPE_NO_CORRECTION = 0; + // The LogUnit was corrected manually by the user in an unspecified way. public static final int CORRECTIONTYPE_CORRECTION = 1; + // The LogUnit was corrected manually by the user to a word not in the list of suggestions of + // the first word typed here. (Note: this is a heuristic value, it may be incorrect, for + // example, if the user repositions the cursor). 
public static final int CORRECTIONTYPE_DIFFERENT_WORD = 2; + // The LogUnit was corrected manually by the user to a word that was in the list of suggestions + // of the first word typed here. (Again, a heuristic). It is probably a typo correction. public static final int CORRECTIONTYPE_TYPO = 3; + // TODO: Rather than just tracking the current state, keep a historical record of the LogUnit's + // state and statistics. This should include how many times it has been corrected, whether + // other LogUnit edits were done between edits to this LogUnit, etc. Also track when a LogUnit + // previously contained a word, but was corrected to empty (because it was deleted, and there is + // no known replacement). private SuggestedWords mSuggestedWords; @@ -166,7 +184,7 @@ public class LogUnit { final LogStatement logStatement; if (canIncludePrivateData) { LOGSTATEMENT_LOG_UNIT_BEGIN_WITH_PRIVATE_DATA.outputToLocked(jsonWriter, - SystemClock.uptimeMillis(), getWord(), getCorrectionType()); + SystemClock.uptimeMillis(), getWordsAsString(), getCorrectionType()); } else { LOGSTATEMENT_LOG_UNIT_BEGIN_WITHOUT_PRIVATE_DATA.outputToLocked(jsonWriter, SystemClock.uptimeMillis()); @@ -181,22 +199,22 @@ public class LogUnit { } /** - * Mark the current logUnit as containing data to generate {@code word}. + * Mark the current logUnit as containing data to generate {@code newWords}. * * If {@code setWord()} was previously called for this LogUnit, then the method will try to * determine what kind of correction it is, and update its internal state of the correctionType * accordingly. * - * @param word The word this LogUnit generates. Caller should not pass null or the empty + * @param newWords The words this LogUnit generates. Caller should not pass null or the empty * string. */ - public void setWord(final String word) { - if (hasWord()) { + public void setWords(final String newWords) { + if (hasOneOrMoreWords()) { // The word was already set once, and it is now being changed. 
See if the new word // is close to the old word. If so, then the change is probably a typo correction. // If not, the user may have decided to enter a different word, so flag it. if (mSuggestedWords != null) { - if (isInSuggestedWords(word, mSuggestedWords)) { + if (isInSuggestedWords(newWords, mSuggestedWords)) { mCorrectionType = CORRECTIONTYPE_TYPO; } else { mCorrectionType = CORRECTIONTYPE_DIFFERENT_WORD; @@ -206,38 +224,71 @@ public class LogUnit { // Mark it as a generic correction. mCorrectionType = CORRECTIONTYPE_CORRECTION; } + } else { + mCorrectionType = CORRECTIONTYPE_NO_CORRECTION; + } + mWords = newWords; + + // Update mWordArray + mWordArray = (TextUtils.isEmpty(mWords)) ? EMPTY_STRING_ARRAY + : WHITESPACE_PATTERN.split(mWords); + if (mWordArray.length > 0 && TextUtils.isEmpty(mWordArray[0])) { + // Empty string at beginning of array. Must have been whitespace at the start of the + // word. Remove the empty string. + mWordArray = Arrays.copyOfRange(mWordArray, 1, mWordArray.length); } - mWord = word; } - public String getWord() { - return mWord; + public String getWordsAsString() { + return mWords; } - public boolean hasWord() { - return mWord != null && !TextUtils.isEmpty(mWord.trim()); + /** + * Returns the words generated by the data in this LogUnit. + * + * Leading whitespace in the data does not produce an empty first word; {@code setWords()} + * removes the empty leading element when it splits the words. + * + * @return the array of words. An empty array if there are no words associated with this LogUnit. 
+ */ + public String[] getWordsAsStringArray() { + return mWordArray; } + public boolean hasOneOrMoreWords() { + return mWordArray.length >= 1; + } + + public int getNumWords() { + return mWordArray.length; + } + + // TODO: Refactor to eliminate getter/setters public void setMayContainDigit() { mMayContainDigit = true; } + // TODO: Refactor to eliminate getter/setters public boolean mayContainDigit() { return mMayContainDigit; } + // TODO: Refactor to eliminate getter/setters public void setContainsCorrection() { mContainsCorrection = true; } + // TODO: Refactor to eliminate getter/setters public boolean containsCorrection() { return mContainsCorrection; } + // TODO: Refactor to eliminate getter/setters public void setCorrectionType(final int correctionType) { mCorrectionType = correctionType; } + // TODO: Refactor to eliminate getter/setters public int getCorrectionType() { return mCorrectionType; } @@ -267,7 +318,7 @@ public class LogUnit { new ArrayList(laterValues), new ArrayList(laterTimes), true /* isPartOfMegaword */); - newLogUnit.mWord = null; + newLogUnit.mWords = null; newLogUnit.mMayContainDigit = mMayContainDigit; newLogUnit.mContainsCorrection = mContainsCorrection; @@ -287,9 +338,9 @@ public class LogUnit { mLogStatementList.addAll(logUnit.mLogStatementList); mValuesList.addAll(logUnit.mValuesList); mTimeList.addAll(logUnit.mTimeList); - mWord = null; - if (logUnit.mWord != null) { - setWord(logUnit.mWord); + mWords = null; + if (logUnit.mWords != null) { + setWords(logUnit.mWords); } mMayContainDigit = mMayContainDigit || logUnit.mMayContainDigit; mContainsCorrection = mContainsCorrection || logUnit.mContainsCorrection; diff --git a/java/src/com/android/inputmethod/research/MainLogBuffer.java b/java/src/com/android/inputmethod/research/MainLogBuffer.java index cd4c1db6e..42ef5d3b6 100644 --- a/java/src/com/android/inputmethod/research/MainLogBuffer.java +++ b/java/src/com/android/inputmethod/research/MainLogBuffer.java @@ -126,10 +126,7 @@ public 
abstract class MainLogBuffer extends FixedLogBuffer { final int length = logUnits.size(); for (int i = 0; i < length; i++) { final LogUnit logUnit = logUnits.get(i); - final String word = logUnit.getWord(); - if (word != null) { - numWordsInLogUnitList++; - } + numWordsInLogUnitList += logUnit.getNumWords(); } return numWordsInLogUnitList >= minNGramSize; } @@ -153,29 +150,31 @@ public abstract class MainLogBuffer extends FixedLogBuffer { // the complete buffer contents in detail. int numWordsInLogUnitList = 0; final int length = logUnits.size(); - for (int i = 0; i < length; i++) { - final LogUnit logUnit = logUnits.get(i); - if (!logUnit.hasWord()) { + for (final LogUnit logUnit : logUnits) { + if (!logUnit.hasOneOrMoreWords()) { // Digits outside words are a privacy threat. if (logUnit.mayContainDigit()) { return false; } } else { - numWordsInLogUnitList++; - final String word = logUnit.getWord(); - // Words not in the dictionary are a privacy threat. - if (ResearchLogger.hasLetters(word) && !(dictionary.isValidWord(word))) { - if (DEBUG) { - Log.d(TAG, "NOT SAFE!: hasLetters: " + ResearchLogger.hasLetters(word) - + ", isValid: " + (dictionary.isValidWord(word))); + numWordsInLogUnitList += logUnit.getNumWords(); + final String[] words = logUnit.getWordsAsStringArray(); + for (final String word : words) { + // Words not in the dictionary are a privacy threat. + if (ResearchLogger.hasLetters(word) && !(dictionary.isValidWord(word))) { + if (DEBUG) { + Log.d(TAG, "\"" + word + "\" NOT SAFE!: hasLetters: " + + ResearchLogger.hasLetters(word) + + ", isValid: " + (dictionary.isValidWord(word))); + } + return false; } - return false; } } } - // Finally, only return true if the minNGramSize is met. - return numWordsInLogUnitList >= minNGramSize; + // Finally, only return true if the ngram is the right size. 
+ return numWordsInLogUnitList == minNGramSize; } public void shiftAndPublishAll() { @@ -198,11 +197,14 @@ public abstract class MainLogBuffer extends FixedLogBuffer { shiftOutWords(N_GRAM_SIZE); mNumWordsUntilSafeToSample = mNumWordsBetweenNGrams; } else { - // No good n-gram at front, and buffer is full. Shift out the first word (or if there - // is none, the existing logUnits). - logUnits = peekAtFirstNWords(1); + // No good n-gram at front, and buffer is full. Shift out up through the first logUnit + // with associated words (or if there is none, all the existing logUnits). + logUnits.clear(); + for (LogUnit logUnit = shiftOut(); logUnit != null && !logUnit.hasOneOrMoreWords(); + logUnit = shiftOut()) { + logUnits.add(logUnit); + } publish(logUnits, false /* canIncludePrivateData */); - shiftOutWords(1); } } diff --git a/java/src/com/android/inputmethod/research/ResearchLogger.java b/java/src/com/android/inputmethod/research/ResearchLogger.java index cd18e3de6..1f6845c8b 100644 --- a/java/src/com/android/inputmethod/research/ResearchLogger.java +++ b/java/src/com/android/inputmethod/research/ResearchLogger.java @@ -397,13 +397,14 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang protected void publish(final ArrayList logUnits, boolean canIncludePrivateData) { canIncludePrivateData |= IS_LOGGING_EVERYTHING; - final int length = logUnits.size(); - for (int i = 0; i < length; i++) { - final LogUnit logUnit = logUnits.get(i); - final String word = logUnit.getWord(); - if (word != null && word.length() > 0 && hasLetters(word)) { - Log.d(TAG, "onPublish: " + word + ", hc: " - + logUnit.containsCorrection()); + for (final LogUnit logUnit : logUnits) { + if (DEBUG) { + final String wordsString = logUnit.getWordsAsString(); + Log.d(TAG, "onPublish: '" + wordsString + + "', hc: " + logUnit.containsCorrection() + + ", cipd: " + canIncludePrivateData); + } + for (final String word : logUnit.getWordsAsStringArray()) { final Dictionary 
dictionary = getDictionary(); mStatistics.recordWordEntered( dictionary != null && dictionary.isValidWord(word), @@ -852,8 +853,8 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang /* package for test */ void commitCurrentLogUnit() { if (DEBUG) { - Log.d(TAG, "commitCurrentLogUnit" + (mCurrentLogUnit.hasWord() ? - ": " + mCurrentLogUnit.getWord() : "")); + Log.d(TAG, "commitCurrentLogUnit" + (mCurrentLogUnit.hasOneOrMoreWords() ? + ": " + mCurrentLogUnit.getWordsAsString() : "")); } if (!mCurrentLogUnit.isEmpty()) { if (mMainLogBuffer != null) { @@ -893,8 +894,8 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang // Check that expected word matches. if (oldLogUnit != null) { - final String oldLogUnitWord = oldLogUnit.getWord(); - if (oldLogUnitWord != null && !oldLogUnitWord.equals(expectedWord)) { + final String oldLogUnitWords = oldLogUnit.getWordsAsString(); + if (oldLogUnitWords != null && !oldLogUnitWords.equals(expectedWord)) { return; } } @@ -916,7 +917,8 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang enqueueEvent(LOGSTATEMENT_UNCOMMIT_CURRENT_LOGUNIT); if (DEBUG) { Log.d(TAG, "uncommitCurrentLogUnit (dump=" + dumpCurrentLogUnit + ") back to " - + (mCurrentLogUnit.hasWord() ? ": '" + mCurrentLogUnit.getWord() + "'" : "")); + + (mCurrentLogUnit.hasOneOrMoreWords() ? ": '" + + mCurrentLogUnit.getWordsAsString() + "'" : "")); } } @@ -950,8 +952,9 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang } for (LogUnit logUnit : logUnits) { if (DEBUG) { - Log.d(TAG, "publishLogBuffer: " + (logUnit.hasWord() ? logUnit.getWord() - : "") + ", correction?: " + logUnit.containsCorrection()); + Log.d(TAG, "publishLogBuffer: " + (logUnit.hasOneOrMoreWords() + ? 
logUnit.getWordsAsString() : "") + + ", correction?: " + logUnit.containsCorrection()); } researchLog.publish(logUnit, canIncludePrivateData); } @@ -986,7 +989,7 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang return; } if (word.length() > 0 && hasLetters(word)) { - mCurrentLogUnit.setWord(word); + mCurrentLogUnit.setWords(word); } final LogUnit newLogUnit = mCurrentLogUnit.splitByTime(maxTime); enqueueCommitText(word, isBatchMode); @@ -1478,7 +1481,7 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang } if (originallyTypedWord.length() > 0 && hasLetters(originallyTypedWord)) { if (logUnit != null) { - logUnit.setWord(originallyTypedWord); + logUnit.setWords(originallyTypedWord); } } researchLogger.enqueueEvent(logUnit != null ? logUnit : researchLogger.mCurrentLogUnit, @@ -1616,7 +1619,7 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang * Log a call to LatinIME.commitCurrentAutoCorrection(). * * SystemResponse: The IME has committed an auto-correction. An auto-correction changes the raw - * text input to another word that the user more likely desired to type. + * text input to another word (or words) that the user more likely desired to type. */ private static final LogStatement LOGSTATEMENT_LATINIME_COMMITCURRENTAUTOCORRECTION = new LogStatement("LatinIMECommitCurrentAutoCorrection", true, true, "typedWord", @@ -1827,7 +1830,7 @@ public class ResearchLogger implements SharedPreferences.OnSharedPreferenceChang final int enteredWordPos, final SuggestedWords suggestedWords) { final ResearchLogger researchLogger = getInstance(); if (!TextUtils.isEmpty(enteredText) && hasLetters(enteredText.toString())) { - researchLogger.mCurrentLogUnit.setWord(enteredText.toString()); + researchLogger.mCurrentLogUnit.setWords(enteredText.toString()); } researchLogger.enqueueEvent(LOGSTATEMENT_LATINIME_ONENDBATCHINPUT, enteredText, enteredWordPos);