am 5cb9b70a: Merge "Fix abbreviations processing for English (A7)" into jb-mr1-dev
* commit '5cb9b70a3a84a20d64e3ffb2ba2bd79ea07500de': Fix abbreviations processing for English (A7)
This commit is contained in:
commit
86342fb797
2 changed files with 123 additions and 27 deletions
|
@ -304,34 +304,89 @@ public final class StringUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (j <= 0) return TextUtils.CAP_MODE_CHARACTERS & reqModes;
|
if (j <= 0) return TextUtils.CAP_MODE_CHARACTERS & reqModes;
|
||||||
char c = cs.charAt(j - 1);
|
char c = cs.charAt(--j);
|
||||||
if (c == Keyboard.CODE_PERIOD || c == Keyboard.CODE_QUESTION_MARK
|
|
||||||
|| c == Keyboard.CODE_EXCLAMATION_MARK) {
|
// We found the next interesting chunk of text ; next we need to determine if it's the
|
||||||
// Here we found a marker for sentence end (we consider these to be one of
|
// end of a sentence. If we have a question mark or an exclamation mark, it's the end of
|
||||||
// either . or ? or ! only). So this is probably the end of a sentence, but if we
|
// a sentence. If it's neither, the only remaining case is the period so we get the opposite
|
||||||
// found a period, we still want to check the case where this is an abbreviation
|
// case out of the way.
|
||||||
// period rather than a full stop. To do this, we look for a period within a word
|
if (c == Keyboard.CODE_QUESTION_MARK || c == Keyboard.CODE_EXCLAMATION_MARK) {
|
||||||
// before the period we just found; if any, we take that to mean it was an
|
|
||||||
// abbreviation.
|
|
||||||
// A typical example of the above is "In the U.S. ", where the last period is
|
|
||||||
// not a full stop and we should not capitalize.
|
|
||||||
// TODO: the rule below is broken. In particular it fails for runs of periods,
|
|
||||||
// whatever the reason. In the example "in the U.S..", the last period is a full
|
|
||||||
// stop following the abbreviation period, and we should capitalize but we don't.
|
|
||||||
// Likewise, "I don't know... " should capitalize, but fails to do so.
|
|
||||||
if (c == Keyboard.CODE_PERIOD) {
|
|
||||||
for (int k = j - 2; k >= 0; k--) {
|
|
||||||
c = cs.charAt(k);
|
|
||||||
if (c == Keyboard.CODE_PERIOD) {
|
|
||||||
return TextUtils.CAP_MODE_CHARACTERS & reqModes;
|
|
||||||
}
|
|
||||||
if (!Character.isLetter(c)) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_SENTENCES) & reqModes;
|
return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_SENTENCES) & reqModes;
|
||||||
}
|
}
|
||||||
return TextUtils.CAP_MODE_CHARACTERS & reqModes;
|
if (c != Keyboard.CODE_PERIOD || j <= 0) {
|
||||||
|
return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS) & reqModes;
|
||||||
|
}
|
||||||
|
|
||||||
|
// We found out that we have a period. We need to determine if this is a full stop or
|
||||||
|
// otherwise sentence-ending period, or an abbreviation like "e.g.". An abbreviation
|
||||||
|
// looks like (\w\.){2,}
|
||||||
|
// To find out, we will have a simple state machine with the following states :
|
||||||
|
// START, WORD, PERIOD, LETTER
|
||||||
|
// On START : (just before the first period)
|
||||||
|
// letter => WORD
|
||||||
|
// whitespace => end with no caps (it was a stand-alone period)
|
||||||
|
// otherwise => end with caps (several periods/symbols in a row)
|
||||||
|
// On WORD : (within the word just before the first period)
|
||||||
|
// letter => WORD
|
||||||
|
// period => PERIOD
|
||||||
|
// otherwise => end with caps (it was a word with a full stop at the end)
|
||||||
|
// On PERIOD : (period within a potential abbreviation)
|
||||||
|
// letter => LETTER
|
||||||
|
// otherwise => end with caps (it was not an abbreviation)
|
||||||
|
// On LETTER : (letter within a potential abbreviation)
|
||||||
|
// letter => LETTER
|
||||||
|
// period => PERIOD
|
||||||
|
// otherwise => end with no caps (it was an abbreviation)
|
||||||
|
// "Not an abbreviation" in the above chart essentially covers cases like "...yes.". This
|
||||||
|
// should capitalize.
|
||||||
|
|
||||||
|
final int START = 0;
|
||||||
|
final int WORD = 1;
|
||||||
|
final int PERIOD = 2;
|
||||||
|
final int LETTER = 3;
|
||||||
|
final int caps = (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS
|
||||||
|
| TextUtils.CAP_MODE_SENTENCES) & reqModes;
|
||||||
|
final int noCaps = (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS) & reqModes;
|
||||||
|
int state = START;
|
||||||
|
while (j > 0) {
|
||||||
|
c = cs.charAt(--j);
|
||||||
|
switch (state) {
|
||||||
|
case START:
|
||||||
|
if (Character.isLetter(c)) {
|
||||||
|
state = WORD;
|
||||||
|
} else if (Character.isWhitespace(c)) {
|
||||||
|
return noCaps;
|
||||||
|
} else {
|
||||||
|
return caps;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case WORD:
|
||||||
|
if (Character.isLetter(c)) {
|
||||||
|
state = WORD;
|
||||||
|
} else if (c == Keyboard.CODE_PERIOD) {
|
||||||
|
state = PERIOD;
|
||||||
|
} else {
|
||||||
|
return caps;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case PERIOD:
|
||||||
|
if (Character.isLetter(c)) {
|
||||||
|
state = LETTER;
|
||||||
|
} else {
|
||||||
|
return caps;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case LETTER:
|
||||||
|
if (Character.isLetter(c)) {
|
||||||
|
state = LETTER;
|
||||||
|
} else if (c == Keyboard.CODE_PERIOD) {
|
||||||
|
state = PERIOD;
|
||||||
|
} else {
|
||||||
|
return noCaps;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Here we arrived at the start of the line. This should behave exactly like whitespace.
|
||||||
|
return (START == state || LETTER == state) ? noCaps : caps;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
package com.android.inputmethod.latin;
|
package com.android.inputmethod.latin;
|
||||||
|
|
||||||
import android.test.AndroidTestCase;
|
import android.test.AndroidTestCase;
|
||||||
|
import android.text.TextUtils;
|
||||||
|
|
||||||
public class StringUtilsTests extends AndroidTestCase {
|
public class StringUtilsTests extends AndroidTestCase {
|
||||||
public void testContainsInArray() {
|
public void testContainsInArray() {
|
||||||
|
@ -99,4 +100,44 @@ public class StringUtilsTests extends AndroidTestCase {
|
||||||
assertFalse("lower-case string", StringUtils.hasUpperCase("string"));
|
assertFalse("lower-case string", StringUtils.hasUpperCase("string"));
|
||||||
assertFalse("lower-case string with non-letters", StringUtils.hasUpperCase("he's"));
|
assertFalse("lower-case string with non-letters", StringUtils.hasUpperCase("he's"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Checks a single request mask: asserts that StringUtils.getCapsMode(cs, mask) equals
// expectedResult restricted to the bits present in mask.
private void onePathForCaps(final CharSequence cs, final int expectedResult, final int mask) {
|
||||||
|
// Only the modes that were actually requested can be expected back.
int oneTimeResult = expectedResult & mask;
|
||||||
|
// The failure message quotes the text between > and < so trailing spaces stay visible.
assertEquals("After >" + cs + "<", oneTimeResult, StringUtils.getCapsMode(cs, mask));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Runs onePathForCaps for every non-empty combination of the three caps-mode flags
// (characters, words, sentences), so that expectedResult is verified under each request mask.
private void allPathsForCaps(final CharSequence cs, final int expectedResult) {
|
||||||
|
final int c = TextUtils.CAP_MODE_CHARACTERS;
|
||||||
|
final int w = TextUtils.CAP_MODE_WORDS;
|
||||||
|
final int s = TextUtils.CAP_MODE_SENTENCES;
|
||||||
|
// All three flags requested.
onePathForCaps(cs, expectedResult, c | w | s);
|
||||||
|
// Each pair of flags.
onePathForCaps(cs, expectedResult, w | s);
|
||||||
|
onePathForCaps(cs, expectedResult, c | s);
|
||||||
|
onePathForCaps(cs, expectedResult, c | w);
|
||||||
|
// Each flag on its own.
onePathForCaps(cs, expectedResult, c);
|
||||||
|
onePathForCaps(cs, expectedResult, w);
|
||||||
|
onePathForCaps(cs, expectedResult, s);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Exercises StringUtils.getCapsMode on a series of sample texts. For each sample, the second
// argument is the full set of caps modes expected when every mode is requested;
// allPathsForCaps then masks it for each combination of requested modes.
public void testGetCapsMode() {
|
||||||
|
final int c = TextUtils.CAP_MODE_CHARACTERS;
|
||||||
|
final int w = TextUtils.CAP_MODE_WORDS;
|
||||||
|
final int s = TextUtils.CAP_MODE_SENTENCES;
|
||||||
|
// Empty text: all modes are expected.
allPathsForCaps("", c | w | s);
|
||||||
|
allPathsForCaps("Word", c);
|
||||||
|
allPathsForCaps("Word.", c);
|
||||||
|
allPathsForCaps("Word ", c | w);
|
||||||
|
allPathsForCaps("Word. ", c | w | s);
|
||||||
|
allPathsForCaps("Word..", c);
|
||||||
|
allPathsForCaps("Word.. ", c | w | s);
|
||||||
|
allPathsForCaps("Word... ", c | w | s);
|
||||||
|
allPathsForCaps("Word ... ", c | w | s);
|
||||||
|
// A stand-alone period (preceded by whitespace) is not expected to trigger sentence caps.
allPathsForCaps("Word . ", c | w);
|
||||||
|
allPathsForCaps("In the U.S ", c | w);
|
||||||
|
// Abbreviation periods ("U.S.", "e.g.") are not expected to trigger sentence caps.
allPathsForCaps("In the U.S. ", c | w);
|
||||||
|
allPathsForCaps("Some stuff (e.g. ", c | w);
|
||||||
|
// An extra period after the abbreviation is expected to trigger sentence caps.
allPathsForCaps("In the U.S.. ", c | w | s);
|
||||||
|
// Quoted text: sentence caps are expected only when a period precedes the closing quote.
allPathsForCaps("\"Word.\" ", c | w | s);
|
||||||
|
allPathsForCaps("\"Word\" ", c | w);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue